完整代码如下:
#前程无忧代码
import requests
import pandas
import time
import random
import json
#用于获取页面信息
def getWebResult(url, cookies, header, timeout=10):
    """Fetch one search-result page and return its list of job postings.

    Args:
        url: Address of the search endpoint to request.
        cookies: Cookies handed to ``requests.get``.
        header: HTTP request headers handed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The value stored under the ``'engine_jds'`` key of the JSON body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the decoded JSON has no ``'engine_jds'`` entry.
    """
    response = requests.get(url=url, cookies=cookies, headers=header,
                            timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it as JSON.
    response.raise_for_status()
    result = json.loads(response.text)
    # The job postings live under 'engine_jds' in the response payload.
    return result['engine_jds']
#将招聘信息按照对应的参数,组装成字典
def getGoalData(data):
    """Reduce raw job postings to the fields that are written to the CSV.

    Every posting in ``data`` is converted, however many there are; the
    input list itself is left untouched.

    Args:
        data: Sequence of posting dicts, each holding the keys read below.

    Returns:
        A new list with one trimmed-down dict per posting, in input order.

    Raises:
        KeyError: If a posting lacks one of the expected keys.
    """
    rows = []
    for posting in data:
        attributes = posting['attribute_text']
        rows.append({
            'job_name': posting['job_name'],                      # job title
            'workarea_text': posting['workarea_text'],            # work location
            'company_name': posting['company_name'],              # company name
            'providesalary_text': posting['providesalary_text'],  # salary
            'issuedate': posting['issuedate'],                    # publication date
            'workyear': posting['workyear'],                      # work experience
            # Second-to-last attribute, which the original author treats as
            # the education requirement; blank when the list is too short.
            'attribute_text': attributes[-2] if len(attributes) >= 2 else '',
            'companyind_text': posting['companyind_text'],        # industry
            'companytype_text': posting['companytype_text'],      # company type
            'companysize_text': posting['companysize_text'],      # company size
            'jobwelf': posting['jobwelf'],                        # job benefits
        })
    return rows
#保存data至本地csv文件
def saveData(data, stage, path='C:/12.3/qianchengwuyou.csv'):
    """Append a batch of job rows to a CSV file.

    Args:
        data: Rows to write, in any form ``pandas.DataFrame`` accepts
            (the crawler passes a list of dicts).
        stage: Whether to write the column header before the rows; pass
            True only for the first batch so the header appears once.
        path: Destination CSV file. It is opened in append mode, and its
            parent directory is not created here.
    """
    # The class is spelled ``DataFrame``; ``pandas.Dataframe`` does not exist.
    table = pandas.DataFrame(data)
    table.to_csv(path, header=stage, index=False, mode='a')
def main(first_page=1, last_page=214, min_delay=1.0, max_delay=3.0):
    """Crawl 51job search-result pages and append the postings to the CSV.

    Args:
        first_page: First result page to request (1-based).
        last_page: Last result page to request, inclusive.
        min_delay: Lower bound, in seconds, of the random pause between pages.
        max_delay: Upper bound, in seconds, of the random pause between pages.
    """
    from urllib.parse import quote

    # Job keyword to search for. The site expects it percent-encoded twice.
    job = '数据分析师'
    keyword = quote(quote(job))

    # The area code is fixed at 000000 in the path below. '&degreefrom' had
    # been mangled into '°reefrom' (HTML entity decoding), which dropped the
    # parameter from the query.
    query = ('lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
             '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=')
    # Placeholders: encoded keyword, page number, query string.
    list_url = ('https://search.51job.com/list/000000,000000,0000,00,9,99,'
                '{},2,{}.html?{}')

    # Assemble the request headers.
    header = {
        'Host': 'search.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        # 'br' is not advertised: requests can only decode Brotli when an
        # optional package is installed, and an undecoded body breaks parsing.
        'Accept-Encoding': 'gzip, deflate',
        'Referer': list_url.format(keyword, 2, query),
    }

    # NOTE(review): this is a hard-coded browser session cookie; it expires
    # and should be supplied from outside the source rather than committed.
    raw_cookie = 'guid=ffb4ef67ea9cef6c2bb600ba6cc1fc50; partner=sem_pc360s5_99484; adv=ad_logid_url%3Dhttps%253A%252F%252Ftrace.51job.com%252Ftrace.php%253Fpartner%253Dsem_pc360s5_99484%2526ajp%253DaHR0cHM6Ly9ta3QuNTFqb2IuY29tL3RnL3NlbS9MUF8yMDIwXzEuaHRtbD9mcm9tPTM2MGFk%2526k%253D7d16490a53bc7f778963fbe04432456c%2526qhclickid%253D7617d9207a42309b%26%7C%26; _ujz=MjAwODQ4MzUzMA%3D%3D; ps=needv%3D0; 51job=cuid%3D200848353%26%7C%26cusername%3Dv4fQoJQ2LUQdMESXP3zLj%252FV4h9PELIjv55Kq37MsS8Y%253D%26%7C%26cpassword%3D%26%7C%26cname%3D%26%7C%26cemail%3D%26%7C%26cemailstatus%3D0%26%7C%26cnickname%3D%26%7C%26ccry%3D.0CBxe0wORHV2%26%7C%26c/confirm/ikey%3D%25241%2524VQijl%252FM1%2524zc.kGnF4FTODhRNO7uFgA%252F%26%7C%26cautologin%3D1%26%7C%26cenglish%3D0%26%7C%26sex%3D%26%7C%26cnamekey%3D%25241%2524k6fk98PF%2524uN6F6SDNOGXwyR7V4wZNV%252F%26%7C%26to%3De20a62709d53a8d943e89206efa0a19f61b01893%26%7C%26; slife=lowbrowser%3Dnot%26%7C%26lastlogindate%3D20211208%26%7C%26securetime%3DVWlcaVYzWDQFblZrDjUKZFpuAT8%253D; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21'
    # requests expects one dict entry per cookie, so split the header-style
    # string into name/value pairs instead of sending it as a single cookie.
    cookies = dict(
        pair.split('=', 1) for pair in raw_cookie.split('; ') if '=' in pair
    )

    for page in range(first_page, last_page + 1):
        # The page number must be substituted into the URL; without a
        # placeholder every iteration requested the same page.
        url = list_url.format(keyword, page, query)
        print('------page %s-------' % page)
        # Write the CSV header only together with the first batch.
        stage = page == first_page
        data = getWebResult(url, cookies, header)
        if not data:
            # No postings returned: the result set is exhausted.
            break
        # Keep only the wanted fields, then append them to the CSV.
        data_goal = getGoalData(data)
        saveData(data_goal, stage)
        # Pause between requests to avoid hammering the server.
        if page < last_page:
            time.sleep(random.uniform(min_delay, max_delay))
# Start the crawl only when this file is run directly, not when imported.
if __name__ == "__main__":
    main()
得到结果如下:
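代码运行时,可能会出现与 sleep 有关的错误:原因是 `time.sleepint` 和 `random.rand` 这两个函数并不存在。可将休眠代码改为 `time.sleep(20 + random.randint(10, 30))`,或者直接删除休眠时间的代码,即可运行成功。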



