'''
51job crawler (*cities: Beijing, Hangzhou, Guangzhou, Shenzhen; *job keywords: spider, python, data analysis)
Uses the requests module and XPath.
Login is required: attach the request headers and cookie when sending requests.
'''
import requests,time
from lxml import etree
from spider_tool import Tool
import json
import re
import pymysql
def get_html(url, timeout=10):
    '''
    Fetch a page and return its decoded body text.

    :param url: page URL to request
    :param timeout: seconds before the request aborts (new parameter;
                    default keeps existing callers working)
    :return: response body as text
    '''
    # Bug fix: no timeout meant a stalled server could hang the crawl forever.
    # Context-manage the session so its connection pool is always released.
    with requests.session() as session:
        session.headers = headers  # module-level headers (UA + cookie)
        res = session.get(url, timeout=timeout)
        # Let requests sniff the real charset instead of assuming ISO-8859-1.
        res.encoding = res.apparent_encoding
        return res.text
def get_data(res):
    '''
    Extract the job summaries embedded in a search-result page.

    The page ships its data as a JSON object assigned to
    window.__SEARCH_RESULT__ inside a <script> tag.

    :param res: HTML text of a 51job search-result page
    :return: list of dicts, one per job; insertion order of the keys matches
             the 51job table's column order (save_to_mysql relies on this)
    '''
    # Bug fix: the old pattern r'window.__SEARCH_RESULT__ = (.*?)' ended with
    # a lazy group, which always matched the EMPTY string, so json.loads('')
    # raised. Anchor the capture on the closing </script> tag instead.
    rule = re.compile(r'window\.__SEARCH_RESULT__\s*=\s*(\{.*?\})\s*</script>', re.S)
    result = json.loads(rule.findall(res)[0])
    job_data = []
    for i in result["engine_jds"]:
        data = {
            "job_href": i["job_href"],                       # detail-page link
            "job_name": i["job_name"],                       # position title
            "company_name": i["company_name"],
            "providesalary_text": i["providesalary_text"],   # salary text
            "workarea_text": i["workarea_text"],             # work location
            "updatedate": i["updatedate"],                   # posting date
            "companytype_text": i["companytype_text"],       # company type
            "jobwelf": i["jobwelf"],                         # welfare tags
            # attribute_text is a list of short requirement tags; flatten it
            # into one string for the single DB column.
            "attribute_text": "".join(i["attribute_text"]),
            "companysize_text": i["companysize_text"],       # company size
            "companyind_text": i["companyind_text"],         # company industry
        }
        job_data.append(data)
    return job_data
def get_job_data(res):
    '''
    Scrape the free-text sections from a single job-detail page.

    :param res: HTML text of a job-detail page
    :return: [job description, work address, company introduction]
    '''
    # Bug fix: removed a leftover debug print(res) that dumped the whole
    # page to stdout on every call.
    html = etree.HTML(res)
    # Some pages lay the description out in <div>s, others in <p>s.
    job_info = html.xpath("//div[@class='bmsg job_msg inbox']/div")
    if not job_info:
        job_info = html.xpath("//div[@class='bmsg job_msg inbox']/p")
    # Drop the last node — presumably a share/tool bar, not description text
    # (NOTE(review): confirm against a live page; empty job_info yields "").
    job_info = "".join(node.xpath('string(.)').strip() for node in job_info[:-1])
    job_area = "".join(html.xpath("//div[@class='bmsg inbox']/p/text()"))
    company_info = "".join(html.xpath("//div[@class='tmsg inbox']/text()"))
    return [job_info, job_area, company_info]
def save_to_mysql(args):
    '''
    Persist one crawled job row via the module-level cursor/connection.

    :param args: tuple of 13 values matching the 51job table's columns
    :return: None; commits on success, rolls back and prints the error otherwise
    '''
    insert_sql = 'insert into 51job values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    try:
        cursor.execute(insert_sql, args)
    except Exception as err:
        # Best-effort: report the failure and undo the partial statement.
        print(err)
        conn.rollback()
    else:
        print("存入数据库")
        conn.commit()
if __name__ == '__main__':
    t = Tool()
    conn = pymysql.Connect(
        host='localhost',
        user='root',
        password='123456',
        database='testdata',
        port=3306,
        charset='utf8'
    )
    cursor = conn.cursor()
    # The cookie in the headers expires after a while — consider selenium to
    # fetch a fresh one, or run the site's JS that generates the cookie.
    headers = {
        # Bug fix: the key was misspelled 'User-Agnet', so the random UA was
        # never actually sent as the User-Agent header.
        'User-Agent': t.generate_user_agent(),
        # 'cookie': '*******'
        # Host and Referer are optional for this site.
        "Host": "jobs.51job.com",
        # Bug fix: header name was 'refer'; the standard header is 'Referer'.
        "Referer": "https://jobs.51job.com/jinshuiqu/130762161.html?s=sou_sou_soulb&t=0_0"
    }
    # Only one result page is crawled here; the city/keyword segments of the
    # URL could be parameterized to cover more locations and positions.
    url = 'https://search.51job.com/list/170200,000000,0000,00,9,99,%25E7%2588%25AC%25E8%2599%25AB%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    try:
        res = get_html(url)
        job_list = get_data(res)
        for job in job_list:
            url = job["job_href"]
            print(url)
            print(job["job_name"])
            job_info_list = get_job_data(get_html(url))
            # Drop job_href (first value); 10 summary fields + 3 detail fields
            # = 13 values, matching the 13 placeholders in the insert.
            job_infos = list(job.values())[1:]
            job_infos.extend(job_info_list)
            print(len(job_infos))
            save_to_mysql(tuple(job_infos))
            time.sleep(1)  # be polite to the server between detail requests
    finally:
        # Bug fix: the cursor and connection were never closed.
        cursor.close()
        conn.close()