本人在做图表相关的一个课题,正好要用到大数据量的优秀图件,搜了一下没有相关的代码,于是自己写了一个。本人转码第一年,第一次发~废话不多说,上代码。一开始不用requests库是因为requests.get抓取不出网页的源代码,用selenium就可以了,也不知道为什么。有些用requests库抓取的,我试过,都没用,可能是我设置的问题吧。
补充:如果需要加搜索的关键词,将url换成
url = "https://www.nature.com/search?q=" + "你的关键词" + "&page=" + str(page)
from selenium import webdriver
import selenium
import random
import time
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup
import requests
#库
# Pool of user-agent strings to rotate through; one is picked at import time.
_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
"Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
"Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/78.0.3904.97 Safari/535.11"
]
# Request headers: random UA plus a random spoofed client IP (X-Forwarded-For).
headers = {
"User-Agent": random.choice(_USER_AGENTS),
"X-Forwarded-For": ".".join(str(random.randint(0, 255)) for _ in range(4)),
}
#headers
def _get_with_retry(driver, url, retries=2, backoff=100):
    """Load *url* in *driver*, sleeping *backoff* seconds and retrying on a
    selenium TimeoutException up to *retries* extra times. Re-raises on the
    final failure so the caller's finally-block can still clean up."""
    for attempt in range(retries + 1):
        try:
            driver.get(url=url)
            return
        except selenium.common.exceptions.TimeoutException:
            if attempt == retries:
                raise
            time.sleep(backoff)
def getpages(page, max_figures=49):
    """Scrape one listing page of nature.com research articles.

    Collects the article links on listing page *page*, then walks each
    article's /figures/1 .. /figures/max_figures pages and records the first
    webp image URL found on each, stopping at the first figure page with no
    image. A single headless Chrome instance is reused for the whole run
    (the original code spawned and quit one driver per figure page) and is
    always quit via finally, even if a retry ultimately fails.

    Returns (image_list, namelist): per-article lists of figure image URLs,
    and the matching article hrefs (used later for file naming).
    """
    option = webdriver.ChromeOptions()
    option.add_argument('headless')  # run Chrome in the background
    driver = webdriver.Chrome(options=option)
    image_list = []
    namelist = []
    try:
        url = "https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page=" + str(page)
        _get_with_retry(driver, url)
        time.sleep(1)  # give the page a moment to finish rendering
        bf = BeautifulSoup(driver.page_source, features="lxml")
        pagelist = []
        for link in bf.find_all("a", class_="c-card__link u-link-inherit"):
            href = link.get("href")
            if href is None:  # skip anchors without an href (was: except TypeError)
                continue
            pagelist.append("https://www.nature.com" + href)
            namelist.append(href)
        for article_url in pagelist:
            figures = []
            for num in range(1, max_figures + 1):
                _get_with_retry(driver, article_url + "/figures/" + str(num))
                soup = BeautifulSoup(driver.page_source, features="lxml")
                sources = soup.find_all("source", type="image/webp")
                if not sources:
                    break  # no more figures for this article
                srcset = sources[0].get("srcset")
                if srcset is None:  # malformed <source> tag (was: except TypeError)
                    break
                data = "https:" + srcset
                figures.append(data)
                print(data)
                time.sleep(3)  # be polite between figure-page requests
            image_list.append(figures)
    finally:
        driver.quit()  # always release the browser, even on errors
    return image_list, namelist  # all image URLs plus matching article hrefs
def _fetch_with_retry(url):
    """GET *url* with the module-level headers, retrying once per failure
    class with the same backoffs the original inline code used (60 s for
    connection / urllib3 read-timeout errors, 100 s for requests read
    timeouts). A second failure propagates to the caller."""
    try:
        return requests.get(url, headers=headers, verify=False, timeout=70, allow_redirects=False)
    except (requests.exceptions.ConnectionError, urllib3.exceptions.ReadTimeoutError):
        time.sleep(60)
        return requests.get(url, headers=headers, verify=False, timeout=70, allow_redirects=False)
    except requests.exceptions.ReadTimeout:
        time.sleep(100)
        return requests.get(url, headers=headers, verify=False, timeout=70, allow_redirects=False)
def getimg(List, namelist):
    """Download every image URL in *List* to the working directory.

    List     -- per-article lists of image URLs (as returned by getpages)
    namelist -- matching article hrefs; "/articles/<id>" becomes the file
                name stem, so files are saved as "<id>_<figure index>.jpg"
    Uses and increments the module-global counter ``x`` for progress output.
    Returns None.
    """
    global x
    for i, urls in enumerate(List):
        # File-name stem comes from the article href; compute it once per
        # article (the original recomputed it for every image).
        name = namelist[i].replace("/articles/", "")
        for index, img_url in enumerate(urls, start=1):
            img = _fetch_with_retry(img_url)
            filename = name + "_" + str(index) + '.jpg'
            # Context manager guarantees the handle is closed even if the
            # write fails (the original leaked it on error).
            with open(filename, 'wb') as fp:
                fp.write(img.content)
            print("%d已下载好" % x)
            x += 1
if __name__ == '__main__':
    # Global counter read/incremented by getimg for progress messages.
    x = 1
    # Walk listing pages 1..49; note the page is scraped BEFORE the
    # "page started" message is printed (kept as in the original).
    for page_num in range(1, 50):
        urls, names = getpages(page_num)
        print("第%d页开始" % page_num)
        getimg(urls, names)
        print("第%d页结束" % page_num)



