# Python爬取拉勾网招聘信息并存储到MySQL数据库中
import requests
import pymysql
import time
import random
import csv
# Request headers copied from a logged-in browser session on lagou.com
# (F12 → Network). The cookie is the critical part: without a fresh,
# logged-in cookie the endpoint rejects the request. NOTE(review): this
# cookie is session-bound and will expire — refresh it before each run.
header = {
'cookie': 'user_trace_token=20211007083056-bec5b35e-724f-49f5-a8b2-be5e58f3b1f8; _ga=GA1.2.2001419745.1633566685; JSESSIonID=ABAAAECABFAACEA0F4F8A1BA3E248FB8C6483A08C472C9B; WEBTJ-ID=20211007083137-17c5829ab2814d-050d60c59c5313-4343363-1440000-17c5829ab2951a; RECOMMEND_TIP=true; privacyPolicyPopup=false; LGUID=20211007083158-6f8f506b-695d-4ec2-b5fa-79922eef422b; _gid=GA1.2.2140722.1633566715; sajssdk_2015_cross_new_user=1; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1633566685,1633566812,1633566855; gate_login_token=04e80a902d4f271df5c5a44c2bc22913f9c55dfd64c49f6e5fa14a04c7daae3b; _putrc=5F92BD70167483B5123F89F2B170EADC; login=true; hasDeliver=0; unick=%E9%83%AD%E6%99%A8; X_HTTP_TOKEN=0c8b4dfa25a774e97797653361b7dc46edd2996eb6; __SAFETY_CLOSE_TIME__22792627=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1633567973; LGRID=20211007085301-ab66ee19-6d10-459e-b1a2-4791f8ae1522; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2222792627%22%2C%22first_id%22%3A%2217c582a073163d-09fc0b72e50a2f-4343363-1440000-17c582a0732c45%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2292.0.4515.131%22%7D%2C%22%24device_id%22%3A%2217c582a073163d-09fc0b72e50a2f-4343363-1440000-17c582a0732c45%22%7D',
'origin': 'https://www.lagou.com',
'referer': 'https://www.lagou.com/wn/jobs?px=new&pn=3&fromSearch=true&kd=%E5%A4%A7%E6%95%B0%E6%8D%AE',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
}
# MySQL connection parameters. NOTE(review): credentials are hard-coded;
# consider environment variables. 'utf8' in MySQL is 3-byte — if job
# descriptions ever contain emoji, 'utf8mb4' would be needed; confirm
# the table's charset before changing.
config = {
'host': 'localhost',
'port': 3306,
'user': 'root',
'password': 'root',
'db': 'lagou',
'charset': 'utf8'
}
# Module-level connection shared by crawl_data(); closed in __main__.
conn = pymysql.connect(**config)
cur = conn.cursor()  # cursor object used for all INSERTs below
# Build the POST payload for one results page; pagination happens here.
def param_data(num):
    """Return the query parameters for page *num* of the Lagou search.

    The keyword ('kd') and city are fixed to the "big data / nationwide"
    search this script targets; only the page number ('pn') varies.
    """
    return {
        'first': 'true',
        # sic: 'needAddtionalResult' is Lagou's own spelling of the parameter.
        'needAddtionalResult': 'false',
        'city': '全国',
        'px': 'new',
        'pn': f'{num}',
        'fromSearch': 'true',
        'kd': '大数据',
    }
# Parse one page of search results and persist every posting to MySQL.
def crawl_data(num, result):
    """Extract job postings from one decoded JSON page and insert them
    into the `lagou` table via the module-level cursor `cur`.

    Args:
        num: 1-based page number (used only in the progress message).
        result: decoded JSON dict from the positionAjax endpoint; the
            postings list lives at result['content']['positionResult']['result'].

    Raises:
        KeyError / ValueError: if a posting is missing a field or its
        salary string is not in the expected "NNk-MMk" form.
    """
    for job in result['content']['positionResult']['result']:
        position_name = job['positionName']
        company_full_name = job['companyFullName']
        salary = job['salary']
        # Salary looks like "15k-30k": split on '-' and drop the trailing
        # unit character from each side to get integer bounds.
        # NOTE(review): dash-less salaries (e.g. "15k以上") would raise
        # ValueError here — same limitation as the original code.
        low, high = salary.split('-', 1)
        minsalary = int(low[:-1])
        maxsalary = int(high[:-1])
        education = job['education']
        city = job['city']
        company_labels = ",".join(job['companyLabelList'])
        # Flatten the detail text: the original source had this literal
        # corrupted across lines; intent was to strip newlines, carriage
        # returns and spaces from the HTML-ish description.
        position_detail = (
            job['positionDetail']
            .replace('\n', '')
            .replace('\r', '')
            .replace(' ', '')
        )
        print("职位名称:", position_name)
        print("公司全称:", company_full_name)
        # Fix: original used "".join(salary), a no-op on a str.
        print("薪资标准:", salary)
        print("最少薪资:", minsalary)
        print("最多薪资:", maxsalary)
        print("学历要求:", education)
        print("所在城市:", city)
        print("公司待遇:", company_labels)
        print('职位要求:', position_detail)
        # Parameterized INSERT — values are never interpolated into SQL.
        cur.execute(
            "INSERT INTO lagou (position_name,company_full_name,salary,minsalary,"
            "maxsalary,education,city,companyLabelList,positionDetail)"
            " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (position_name, company_full_name, salary, minsalary, maxsalary,
             education, city, company_labels, position_detail))
        cur.connection.commit()
    # Random pause between pages to avoid anti-scraping throttling.
    # NOTE(review): original indentation was lost; assumed per-page (not
    # per-posting) sleep — confirm desired pacing.
    time_out = random.randint(3, 8)
    time.sleep(time_out)
    print("休眠时间:%s s" % time_out)
    print(f'------------------完成第{num}页爬取------------------')
if __name__ == "__main__":
    # JSON endpoint for the job list, found via the browser's F12
    # developer tools (Network panel).
    url = 'https://www.lagou.com/jobs/v2/positionAjax.json'
    try:
        for num in range(1, 151):
            data = param_data(num)
            res = requests.post(url=url, headers=header, params=data)
            # Fix: the original wrapped crawl_data in a bare `except:`
            # whose handler looped `while True:` over the SAME stale
            # response — retrying forever on a 200 that keeps failing,
            # or printing the error message in an infinite loop on a
            # non-200. Fail fast on a bad status instead.
            if res.status_code != 200:
                print("---------------------------------程序异常,请立即解决!---------------------------------")
                break
            try:
                crawl_data(num, res.json())
            except (KeyError, ValueError) as exc:
                # One malformed page shouldn't abort the whole run.
                print(f"第{num}页处理失败:{exc}")
    finally:
        # Fix: guarantee the DB handles are released on any exit path.
        cur.close()
        conn.close()



