花了半天时间摸索了一下爬虫,终于写完了整个爬虫代码,只需要静静等待一晚上,就可以看到漂亮的数据集啦!
本次爬取的数据仅用于学习,不应用于学术与商业用途。
首先做准备工作
1.先实现基本的功能。由于习惯,注释都是Chinglish,哈哈哈。当分页的url没有明显规律的时候,应该采用提交表单(POST请求)的方式完成页面的跳转:
# Form payload posted when requesting a listing page: paging fields first,
# then the remaining form fields, which all start out as None.
data = {"pageNo": 1, "pageSize": 20}
data.update(dict.fromkeys(("zym", "office.id", "cd", "zyglbm.id", "zydlId")))

# Site prefix that scraped hrefs are appended to.
root = "http://www.nimrf.net.cn/"
# get image list page
def get_img_list_page(href):
    """Fetch a page of the site with a GET request.

    Args:
        href: Path that is appended to ``root`` as-is to form the URL.

    Returns:
        The ``requests.Response`` for ``root + href``.

    Raises:
        requests.exceptions.Timeout: If connecting or waiting for data
            exceeds the 10-second timeout.
    """
    # Bound the wait the same way the POST in enter_images_pages does;
    # without a timeout, requests can block indefinitely on a stalled
    # connection.
    return requests.get(root + href, headers=header, timeout=10)
2.分页爬取图片
def enter_images_pages(href_list, path, max_counter=300):
    """Download the images linked from every page of a paginated listing.

    The listing at ``href_list[0]`` is fetched once to discover how many
    pages exist, then each page is requested by POSTing the module-level
    ``data`` form to the same URL. For every table row, the linked detail
    page is fetched and the first link inside its ``tpzlTb`` table is saved
    as ``{path}/{ref_id}-{count}.{ext}``.

    Args:
        href_list: Sequence of hrefs; only the first element is used.
        path: Directory (without trailing slash) the images are written to.
        max_counter: Stop after this many images have been saved.

    Returns:
        ``None`` when ``max_counter`` images were saved, ``0`` when all
        pages were processed before reaching that limit.

    Side effects:
        Mutates ``data["pageNo"]`` and ``data["pageSize"]``, writes image
        files under ``path`` and prints one line per saved image.
    """
    respose = get_img_list_page(href_list[0])
    # search all single image href
    soup = BeautifulSoup(respose.text, "lxml")

    # Pagination links end in `);">N`; the largest N is used as the number
    # of pages. int() replaces eval(): the text comes from a remote page.
    page_numbers = [int(n) for n in re.findall(r'\);">(\d+)', str(soup))]
    if page_numbers:
        page_size = max(page_numbers)
        # NOTE(review): this stores the largest page number in the
        # "pageSize" form field, which looks like it conflates page count
        # with rows-per-page. Kept as in the original -- confirm intent.
        data["pageSize"] = page_size
    else:
        # No pagination links found: treat the listing as a single page and
        # leave the form's pageSize untouched.
        page_size = 1

    # NOTE(review): the backslash was restored (`w*` -> `\w*`). The trailing
    # delimiter may have been an HTML entity lost when the post was
    # published -- verify against the real row markup.
    ref_id_pattern = re.compile(r"(\w*) ")

    count = 0
    # get multi page
    for page_no in range(1, page_size + 1):
        data["pageNo"] = page_no
        resp = requests.post(respose.url, data=data, headers=header,
                             verify=False, timeout=10)
        soup = BeautifulSoup(resp.text, "lxml")

        # 1. get all rows from the listing table
        table = soup.find("table", {"id": "contentTable"})
        tbody = table.find("tbody") if table is not None else None
        if tbody is None:
            print(f"skip page {page_no}: contentTable not found")
            continue

        for tr in tbody.find_all("tr"):
            # ref_id is used as part of the saved image's file name
            id_match = ref_id_pattern.search(str(tr))
            ref_id = id_match.group(1) if id_match else "unknown"

            # get image url
            detail_link = tr.find("a")
            ref_single = detail_link.get("href") if detail_link is not None else None
            if not ref_single:
                # Row has no detail link to follow.
                continue

            single_page = get_img_list_page(ref_single)
            single_soup = BeautifulSoup(single_page.text, "lxml")
            img_table = single_soup.find("table", {"id": "tpzlTb"})
            img_link = img_table.find("a") if img_table is not None else None
            img_href = img_link.get("href") if img_link is not None else None
            if not img_href:
                print(f"skip {ref_id}: no image link found on {root + ref_single}")
                continue

            type_img = img_href.split('.')[-1]
            # count keeps file names unique even when ref_id repeats.
            target = f"{path}/{ref_id}-{count}.{type_img}"
            urllib.request.urlretrieve(root + img_href, target)
            print(f"save image:{root + img_href},{target} successfully!")

            count += 1
            if count >= max_counter:
                return None
            # Short pause between downloads to avoid hammering the server.
            time.sleep(0.1)
    return 0
3.DFS遍历所有分页的图片
def get_image_node(html, id=0, path=".", end_id=3):
soup = BeautifulSoup(html, "lxml")
# get all
大功告成!不过为了防止被封IP,要适当加大保存图片的时间间隔哦。



