数据爬虫（二）：跳页无规律如何分页爬取图片

花了半天时间摸索了一下爬虫，终于写完了整个爬虫代码，只需要静静等待一晚上，就可以看到漂亮的数据集啦！

本文爬取的数据仅用于学习，不应用于学术或商业用途。

首先做准备工作

1. 先把基本的功能准备好。由于个人习惯，代码注释都是 Chinglish，哈哈哈

当分页的 URL 没有明显规律时，应该采用提交表单（POST 请求）的方式完成页面的跳转：把页码等参数放进表单数据里，再发送给同一个列表页地址。

# Form fields for the paginated listing request. Only the two paging fields
# carry values; every remaining field stays None.
data = dict.fromkeys(
    ("pageNo", "pageSize", "zym", "office.id", "cd", "zyglbm.id", "zydlId")
)
data.update(pageNo=1, pageSize=20)

# Base URL that hrefs are appended to when building request URLs.
root = "http://www.nimrf.net.cn/"

# Fetch one page of the site given its href.
def get_img_list_page(href):
    """Request ``root + href`` with the shared headers and return the response.

    Args:
        href: Path appended verbatim to the module-level ``root`` URL.

    Returns:
        The ``requests.Response`` of the GET request.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Without a timeout a stalled connection would block the crawl forever;
    # 10 seconds matches the timeout used for the paging POST requests.
    return requests.get(root + href, headers=header, timeout=10)

2.分页爬取图片

def enter_images_pages(href_list, path, max_counter=300):
    """Download the images referenced by every page of one paginated listing.

    The listing page is first fetched with GET to resolve its final URL, then
    each page is requested by POSTing the paging form to that URL. Every table
    row links to a detail page, and the first link inside the detail page's
    ``tpzlTb`` table is saved as the image.

    Args:
        href_list: Href of the listing page. A plain string is used as-is;
            for any other sequence the first element is used.
        path: Directory the images are written to; created if missing.
        max_counter: Stop after this many images have been saved.

    Returns:
        Always 0, both when all pages are exhausted and when ``max_counter``
        is reached.
    """
    import os

    # Indexing a string with [0] would yield only its first character, so a
    # bare href must be taken whole.
    href = href_list if isinstance(href_list, str) else href_list[0]
    respose = get_img_list_page(href)
    os.makedirs(path, exist_ok=True)

    # Work on a copy so the shared form template is never mutated, and leave
    # "pageSize" untouched: it is the number of rows per page, not the number
    # of pages.
    form = dict(data)
    count = 0
    page_no = 1
    page_count = 1
    while page_no <= page_count:
        form["pageNo"] = page_no
        resp = requests.post(respose.url, data=form, headers=header,
                             verify=False, timeout=10)
        soup = BeautifulSoup(resp.text, "lxml")

        # Pager links look like `...);">N`; the largest N seen so far is the
        # page count. It is read from the POST responses so that it agrees
        # with the page size actually requested.
        # NOTE(review): pattern reconstructed, the original had lost its
        # backslashes -- confirm against the live pager markup.
        linked_pages = [int(n) for n in re.findall(r'\);">(\d+)', str(soup))]
        page_count = max([page_count, *linked_pages])
        page_no += 1

        table = soup.find("table", {"id": "contentTable"})
        tbody = table.find("tbody") if table is not None else None
        if tbody is None:
            # No result table on this page; nothing to download here.
            continue

        for tr in tbody.find_all("tr"):
            link = tr.find("a")
            if link is None or not link.get("href"):
                continue

            # ref_id becomes part of the saved file name.
            # NOTE(review): the original pattern `(w*)` always matched the
            # empty string; the first word-like run of the row text is used
            # instead -- confirm this is the intended id column.
            id_match = re.search(r"\w+", tr.get_text())
            ref_id = id_match.group(0) if id_match else ""

            single_page = get_img_list_page(link.get("href"))
            single_soup = BeautifulSoup(single_page.text, "lxml")
            img_table = single_soup.find("table", {"id": "tpzlTb"})
            img_link = img_table.find("a") if img_table is not None else None
            if img_link is None or not img_link.get("href"):
                # Detail page without an image link: skip the row.
                continue

            img_href = img_link.get("href")
            type_img = img_href.split('.')[-1]
            target = f"{path}/{ref_id}-{count}.{type_img}"
            urllib.request.urlretrieve(root + img_href, target)
            print(f"save image:{root+img_href},{target} successfully!")
            count += 1
            if count >= max_counter:
                return 0
            # Small pause between downloads to avoid hammering the server.
            time.sleep(0.1)
    return 0

3. DFS 遍历所有分页的图片

def get_image_node(html, id=0, path=".", end_id=3):
    """Walk the category tree depth-first and download each leaf's images.

    Nodes are ``<li>`` elements whose id matches ``tree_<number>`` and whose
    class is ``level<id>``. A node's display name is read from the span with
    id ``<node id>_span`` and is appended to ``path``. Nodes above ``end_id``
    are descended into; nodes at ``end_id`` have their ``<node id>_a`` link
    handed to ``enter_images_pages``.

    Args:
        html: Markup containing the tree nodes for this level.
        id: Tree level handled by this call (suffix of the ``level`` class).
            The name shadows the builtin but is kept for existing callers.
        path: Directory prefix built from the names of the ancestor nodes.
        end_id: Level at which nodes are treated as leaves.
    """
    soup = BeautifulSoup(html, "lxml")
    level_nodes = soup.find_all(
        "li", {"id": re.compile(r"tree_\d+"), "class": "level" + str(id)}
    )

    for tree in level_nodes:
        _id = tree.get("id")
        # Exact id match: a regex such as f"{_id}+_span" would let "+"
        # quantify the last digit of the node id.
        label = tree.find("span", {"id": f"{_id}_span"})
        if label is None:
            continue
        node_path = path + "/" + label.text

        if id < end_id:
            # Recurse into this node's own markup only, so children are
            # visited under their real parent, and forward end_id so a
            # custom depth is respected at every level.
            get_image_node(str(tree), id + 1, node_path, end_id)
            continue

        anchor = tree.find("a", {"id": f"{_id}_a"})
        href = anchor.get("href") if anchor is not None else None
        if href:
            # Passed as a one-element list because enter_images_pages reads
            # the listing href from index 0 of its first argument.
            enter_images_pages([href], node_path)

大功告成！不过为了防止被封 IP，要适当加大每次保存之间的延时间隔哦

数据爬虫（二）：跳页无规律如何分页爬取图片

Python相关栏目本月热门文章