# Batch-crawl PPT templates from sc.chinaz.com, page by page
import os
import requests
from lxml import etree
if __name__ == '__main__':
    # Spoof a desktop-browser User-Agent so the site serves normal pages.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.70 Safari/537.36'
    }
    # URL template for listing pages 2+ (page 1 uses a different URL, handled below).
    url = 'https://sc.chinaz.com/ppt/free_%d.html'

    # Ensure the output directory exists once, up front
    # (hoisted out of the page loop — it was re-checked every iteration).
    if not os.path.exists('./ppt'):
        os.mkdir('./ppt')

    # Crawl listing pages 1..2.
    for page_num in range(1, 3):
        if page_num == 1:
            new_url = 'https://sc.chinaz.com/ppt/free.html'
        else:
            # Dropped the redundant format() wrapper — '%' already yields a str.
            new_url = url % page_num

        response = requests.get(url=new_url, headers=headers)
        # Force UTF-8 so Chinese template names decode correctly.
        response.encoding = 'utf-8'
        page_text = response.text

        # Parse the listing page and select one <div> per template.
        # NOTE(review): the predicate '[@]' looks truncated (it probably lost
        # something like @id="..."); confirm against the live page markup.
        tree = etree.HTML(page_text)
        ppt_list = tree.xpath('//div[@]/div[5]/div')

        for div in ppt_list:
            # Detail-page URL; assumes @href is site-relative — TODO confirm
            # (a leading '/' in @href would produce a double slash here).
            data_url = 'https://sc.chinaz.com/' + div.xpath('.//div[2]/a/@href')[0]
            # Template title, used as the download file name.
            ppt_name = div.xpath('./div[2]/a/text()')[0] + '.rar'
            # Generic fix for mojibake in Chinese names, if it appears:
            # ppt_name = ppt_name.encode('iso-8859-1').decode('gbk')

            # Fetch the detail page and extract the download link.
            detail_text = requests.get(url=data_url, headers=headers).text
            detail_tree = etree.HTML(detail_text)
            # NOTE(review): same truncated '[@]' predicate as above — verify.
            try:
                download_url = detail_tree.xpath('//div[@]/a/@href')[0]
            except IndexError:
                # Some detail pages have no download link; skip them.
                continue

            # Download the archive and save it under ./ppt/.
            rar_data = requests.get(url=download_url, headers=headers).content
            ppt_path = 'ppt/' + ppt_name
            with open(ppt_path, 'wb') as fp:
                fp.write(rar_data)
            print(ppt_name, '--下载成功--')



