#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree
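# Project goal: parse the free resume templates listed on the first three listing pages (starting at https://sc.chinaz.com/jianli/free.html) and download and save each one.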
# First listing page, and the pattern used for page 2 onwards.
LISTING_FIRST_PAGE = "https://sc.chinaz.com/jianli/free.html"
LISTING_PAGE_TEMPLATE = "https://sc.chinaz.com/jianli/free_{}.html"

# Directory the downloaded archives are written to.
DOWNLOAD_DIR = './jianliLibs'

# Seconds to wait on each HTTP request before giving up, so a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
}

# Selects the <div> blocks on a listing page that wrap one resume link each.
RESUME_ITEM_XPATH = '//div[@id="main"]/div/div'

# Selects the <ul> that holds the archive download links on a resume page.
# NOTE(review): the original expression was '//div[@]/ul', which is not valid
# XPath (the attribute predicate is empty). This selector is a reconstruction;
# confirm it against the live page markup.
DOWNLOAD_LIST_XPATH = '//div[contains(@class, "downlist")]/ul'


def listing_page_urls(page_count=3):
    """Return the URLs of the first ``page_count`` listing pages.

    Page 1 is ``free.html``; page ``n`` (n >= 2) is ``free_<n>.html``.
    A ``page_count`` below 1 yields an empty list.
    """
    if page_count < 1:
        return []
    later_pages = [LISTING_PAGE_TEMPLATE.format(n) for n in range(2, page_count + 1)]
    return [LISTING_FIRST_PAGE] + later_pages


def archive_filename(archive_url):
    """Return the last path segment of ``archive_url``.

    The query string and fragment are ignored. The result is an empty string
    when the URL path ends with a slash or is empty.
    """
    return urlsplit(archive_url).path.rsplit('/', 1)[-1]


def _fetch(url):
    """GET ``url`` and return the response; raise on transport or HTTP errors."""
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of treating an error page as real content.
    response.raise_for_status()
    return response


def _first_hrefs(html_text, base_url, node_xpath, href_xpath):
    """Return one absolute URL per node matched by ``node_xpath``.

    For each matched node, the first result of ``href_xpath`` is resolved
    against ``base_url``. Nodes without a matching href are skipped.
    """
    if not html_text.strip():
        return []
    tree = etree.HTML(html_text)
    if tree is None:
        return []
    urls = []
    for node in tree.xpath(node_xpath):
        hrefs = node.xpath(href_xpath)
        if hrefs:
            # urljoin copes with protocol-relative ("//host/x"), relative and
            # already-absolute hrefs alike.
            urls.append(urljoin(base_url, hrefs[0]))
    return urls


def _download_archive(archive_url):
    """Download ``archive_url`` into DOWNLOAD_DIR and report success."""
    name = archive_filename(archive_url)
    if name in ('', '.', '..'):
        print(archive_url, 'skipped: no usable file name')
        return
    data = _fetch(archive_url).content
    with open(os.path.join(DOWNLOAD_DIR, name), 'wb') as fp:
        fp.write(data)
    print(name, '下载成功!!!')


def main():
    """Crawl the first three listing pages and download every resume archive."""
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    for listing_url in listing_page_urls(3):
        try:
            listing_html = _fetch(listing_url).text
        except requests.RequestException as err:
            print(listing_url, 'request failed:', err)
            continue
        resume_urls = _first_hrefs(listing_html, listing_url, RESUME_ITEM_XPATH, './a/@href')
        print('爬取成功!!!', resume_urls)

        for resume_url in resume_urls:
            try:
                resume_html = _fetch(resume_url).text
            except requests.RequestException as err:
                print(resume_url, 'request failed:', err)
                continue
            # One archive per matched <ul>: its first download link.
            for archive_url in _first_hrefs(resume_html, resume_url, DOWNLOAD_LIST_XPATH, './li/a/@href'):
                try:
                    _download_archive(archive_url)
                except (requests.RequestException, OSError) as err:
                    # Keep going: one bad archive should not stop the crawl.
                    print(archive_url, 'download failed:', err)


if __name__ == '__main__':
    main()