本人在做图表相关的一个课题,正好要用到大数据量的优秀图件,搜了一下没有相关的代码,于是自己写了一个。本人转码第一年,第一次发~废话不多说,上代码。一开始不用requests库是因为requests.get抓取不出网页的源代码,用selenium就可以了,也不知道为什么。有些用requests库抓取的,我试过,都没用,可能是我设置的问题吧。
补充:如果需要加搜索的关键词,将url换成
url = "https://www.nature.com/search?q=" + "你的关键词" + "&page=" + str(page)
from selenium import webdriver
import selenium
import random
import time
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from bs4 import BeautifulSoup
import requests
#库
# Pool of user-agent strings to rotate through; one is picked at import time.
_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0",
"Mozilla/5.0(Windows;U;WindowsNT6.1;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
"Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/78.0.3904.97 Safari/535.11"
]
# Request headers: random UA plus a random spoofed client IP (X-Forwarded-For).
headers = {
"User-Agent": random.choice(_USER_AGENTS),
"X-Forwarded-For": ".".join(str(random.randint(0, 255)) for _ in range(4)),
}
#headers
def _get_with_retry(driver, url, retries=2, backoff=100):
    """Load *url* in *driver*, sleeping *backoff* seconds and retrying on a
    selenium TimeoutException up to *retries* extra times. Re-raises on the
    final failure so the caller's finally-block can still clean up."""
    for attempt in range(retries + 1):
        try:
            driver.get(url=url)
            return
        except selenium.common.exceptions.TimeoutException:
            if attempt == retries:
                raise
            time.sleep(backoff)
def getpages(page, max_figures=49):
    """Scrape one listing page of nature.com research articles.

    Collects the article links on listing page *page*, then walks each
    article's /figures/1 .. /figures/max_figures pages and records the first
    webp image URL found on each, stopping at the first figure page with no
    image. A single headless Chrome instance is reused for the whole run
    (the original code spawned and quit one driver per figure page) and is
    always quit via finally, even if a retry ultimately fails.

    Returns (image_list, namelist): per-article lists of figure image URLs,
    and the matching article hrefs (used later for file naming).
    """
    option = webdriver.ChromeOptions()
    option.add_argument('headless')  # run Chrome in the background
    driver = webdriver.Chrome(options=option)
    image_list = []
    namelist = []
    try:
        url = "https://www.nature.com/nature/research-articles?searchType=journalSearch&sort=PubDate&page=" + str(page)
        _get_with_retry(driver, url)
        time.sleep(1)  # give the page a moment to finish rendering
        bf = BeautifulSoup(driver.page_source, features="lxml")
        pagelist = []
        for link in bf.find_all("a", class_="c-card__link u-link-inherit"):
            href = link.get("href")
            if href is None:  # skip anchors without an href (was: except TypeError)
                continue
            pagelist.append("https://www.nature.com" + href)
            namelist.append(href)
        for article_url in pagelist:
            figures = []
            for num in range(1, max_figures + 1):
                _get_with_retry(driver, article_url + "/figures/" + str(num))
                soup = BeautifulSoup(driver.page_source, features="lxml")
                sources = soup.find_all("source", type="image/webp")
                if not sources:
                    break  # no more figures for this article
                srcset = sources[0].get("srcset")
                if srcset is None:  # malformed <source> tag (was: except TypeError)
                    break
                data = "https:" + srcset
                figures.append(data)
                print(data)
                time.sleep(3)  # be polite between figure-page requests
            image_list.append(figures)
    finally:
        driver.quit()  # always release the browser, even on errors
    return image_list, namelist  # all image URLs plus matching article hrefs
def _fetch_with_retry(url):
    """GET *url* with the module-level headers, retrying once per failure
    class with the same backoffs the original inline code used (60 s for
    connection / urllib3 read-timeout errors, 100 s for requests read
    timeouts). A second failure propagates to the caller."""
    try:
        return requests.get(url, headers=headers, verify=False, timeout=70, allow_redirects=False)
    except (requests.exceptions.ConnectionError, urllib3.exceptions.ReadTimeoutError):
        time.sleep(60)
        return requests.get(url, headers=headers, verify=False, timeout=70, allow_redirects=False)
    except requests.exceptions.ReadTimeout:
        time.sleep(100)
        return requests.get(url, headers=headers, verify=False, timeout=70, allow_redirects=False)
def getimg(List, namelist):
    """Download every image URL in *List* to the working directory.

    List     -- per-article lists of image URLs (as returned by getpages)
    namelist -- matching article hrefs; "/articles/<id>" becomes the file
                name stem, so files are saved as "<id>_<figure index>.jpg"
    Uses and increments the module-global counter ``x`` for progress output.
    Returns None.
    """
    global x
    for i, urls in enumerate(List):
        # File-name stem comes from the article href; compute it once per
        # article (the original recomputed it for every image).
        name = namelist[i].replace("/articles/", "")
        for index, img_url in enumerate(urls, start=1):
            img = _fetch_with_retry(img_url)
            filename = name + "_" + str(index) + '.jpg'
            # Context manager guarantees the handle is closed even if the
            # write fails (the original leaked it on error).
            with open(filename, 'wb') as fp:
                fp.write(img.content)
            print("%d已下载好" % x)
            x += 1
if __name__ == '__main__':
    # Global counter read/incremented by getimg for progress messages.
    x = 1
    # Walk listing pages 1..49; note the page is scraped BEFORE the
    # "page started" message is printed (kept as in the original).
    for page_num in range(1, 50):
        urls, names = getpages(page_num)
        print("第%d页开始" % page_num)
        getimg(urls, names)
        print("第%d页结束" % page_num)



