最近在跟着网上的视频复习爬虫,在这里记录一下。
# -*-coding:utf-8-*-
# Crawl every image from the "hot images" (imgrank) section of qiushibaike.com
# and save them under ./糗图/.
import requests
import re
import os

if __name__ == '__main__':
    # Pretend to be a regular browser so the site does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    url = "https://www.qiushibaike.com/imgrank/"
    '''
    Basic pattern for downloading a single binary resource:
    url = "https://pic.qiushibaike.com/system/pictures/12482/124821966/medium/TJ4REMVCKAP8RJVA.jpg"
    # .content returns the response body as raw bytes;
    # .text -> str, .content -> bytes, .json() -> parsed object
    img_data = requests.get(url, headers=headers).content
    with open('out.png', 'wb') as f:
        f.write(img_data)
    '''
    # Create the output directory on first run.
    if not os.path.exists('./糗图'):
        os.mkdir("./糗图")
    # Fetch the whole page.  NOTE(fix): the original passed `headers` as the
    # second *positional* argument, which requests.get() interprets as
    # `params`, so the User-Agent was never actually sent — it must be passed
    # as a keyword argument.
    page_data = requests.get(url, headers=headers).text
    # Regex extracting every <img src> inside a <div class="thumb"> block;
    # findall returns one list element per match.
    # (Reconstructed — the markup-sensitive pattern was garbled in the paste;
    # verify against the live page structure.)
    ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    # re.S makes '.' match newlines so the pattern can span multiple lines.
    img_src_list = re.findall(ex, page_data, re.S)
    for src in img_src_list:
        # The page uses protocol-relative URLs ("//pic..."); prepend a scheme
        # to get a complete image URL.
        url_new = "https:" + src
        img_data = requests.get(url_new, headers=headers).content
        # Use the last path component as the local file name.
        img_name = src.split("/")[-1]
        img_path = './糗图/' + img_name
        with open(img_path, 'wb') as f:
            f.write(img_data)
        print(img_name, "ok!")
升级版:在上面的基础上实现分页抓取功能。
# -*-coding:utf-8-*-
# Paged version: crawl pages 1-9 of the imgrank section, saving each page's
# images into its own subdirectory under ./糗图_分页/.
import requests
import re
import os

if __name__ == '__main__':
    # Pretend to be a regular browser so the site does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    # Pagination variant 1: a %d template
    # url = "https://www.qiushibaike.com/imgrank/page/%d/"
    # Pagination variant 2: plain string concatenation
    url = "https://www.qiushibaike.com/imgrank/page/"
    for pageNum in range(1, 10):
        # url_new = format(url % pageNum)  # variant 1
        url_new = url + str(pageNum)  # variant 2
        path = "./糗图_分页/" + str(pageNum) + "/"
        # NOTE(fix): os.mkdir() cannot create intermediate directories and
        # would raise FileNotFoundError on first run because ./糗图_分页 does
        # not exist yet; os.makedirs() creates the whole chain.
        if not os.path.exists(path):
            os.makedirs(path)
        # Regex extracting every <img src> inside a <div class="thumb"> block.
        # (Reconstructed — the pattern was garbled in the paste; verify
        # against the live page structure.)
        ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        # NOTE(fix): headers must be a keyword argument — passed positionally
        # it becomes `params` and the User-Agent is never sent.
        page_list = requests.get(url_new, headers=headers).text
        img_list = re.findall(ex, page_list, re.S)
        for src in img_list:
            # Protocol-relative URL -> full URL.
            url_img = "https:" + src
            img_name = src.split('/')[-1]
            img_path = path + img_name
            # Send the same browser headers for the image request as well
            # (the original omitted them here, inconsistently).
            img_data = requests.get(url_img, headers=headers).content
            with open(img_path, 'wb') as f:
                f.write(img_data)
        print(pageNum, "ok")



