栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

Python爬取拉勾网招聘信息并存储到MySQL数据库中

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

Python爬取拉勾网招聘信息并存储到MySQL数据库中

Python爬取拉勾网招聘信息并存储到MySQL数据库中

import requests
import pymysql
import time
import random
import csv
# Request headers copied from a logged-in browser session on lagou.com
# (open devtools, copy the request headers; the cookie is the important part,
# since the JSON endpoint rejects anonymous requests).
# NOTE(review): the cookie below is a hardcoded personal session token — it
# expires and should not be committed to source control; move it to an
# environment variable or config file.
header = {
    'cookie': 'user_trace_token=20211007083056-bec5b35e-724f-49f5-a8b2-be5e58f3b1f8; _ga=GA1.2.2001419745.1633566685; JSESSIonID=ABAAAECABFAACEA0F4F8A1BA3E248FB8C6483A08C472C9B; WEBTJ-ID=20211007083137-17c5829ab2814d-050d60c59c5313-4343363-1440000-17c5829ab2951a; RECOMMEND_TIP=true; privacyPolicyPopup=false; LGUID=20211007083158-6f8f506b-695d-4ec2-b5fa-79922eef422b; _gid=GA1.2.2140722.1633566715; sajssdk_2015_cross_new_user=1; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1633566685,1633566812,1633566855; gate_login_token=04e80a902d4f271df5c5a44c2bc22913f9c55dfd64c49f6e5fa14a04c7daae3b; _putrc=5F92BD70167483B5123F89F2B170EADC; login=true; hasDeliver=0; unick=%E9%83%AD%E6%99%A8; X_HTTP_TOKEN=0c8b4dfa25a774e97797653361b7dc46edd2996eb6; __SAFETY_CLOSE_TIME__22792627=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1633567973; LGRID=20211007085301-ab66ee19-6d10-459e-b1a2-4791f8ae1522; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2222792627%22%2C%22first_id%22%3A%2217c582a073163d-09fc0b72e50a2f-4343363-1440000-17c582a0732c45%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2292.0.4515.131%22%7D%2C%22%24device_id%22%3A%2217c582a073163d-09fc0b72e50a2f-4343363-1440000-17c582a0732c45%22%7D',
    'origin': 'https://www.lagou.com',
    'referer': 'https://www.lagou.com/wn/jobs?px=new&pn=3&fromSearch=true&kd=%E5%A4%A7%E6%95%B0%E6%8D%AE',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
}
# MySQL connection parameters for the local `lagou` database.
# NOTE(review): credentials are hardcoded; consider reading them from the
# environment. 'utf8' here is MySQL's 3-byte utf8 — 'utf8mb4' would be safer
# for arbitrary text, but verify against the table's charset before changing.
config = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'root',
    'db': 'lagou',
    'charset': 'utf8'
}
# Connect at import time; `cur` is shared by crawl_data() below.
conn = pymysql.connect(**config)
cur = conn.cursor()  # obtain a cursor from the connection object

# Build the POST form parameters; paging is implemented here via 'pn'.
def param_data(num):
    """Return the request parameters for one page of search results.

    Parameters:
        num: page number to request (becomes the 'pn' field, as a string).

    Returns:
        dict of form parameters for the positionAjax.json endpoint,
        searching the keyword '大数据' nationwide, newest first.
    """
    return {
        'first': 'true',
        'needAddtionalResult': 'false',
        'city': '全国',
        'px': 'new',
        'pn': str(num),
        'fromSearch': 'true',
        'kd': '大数据',
    }

# Scrape one page of postings and persist each row to MySQL.
def crawl_data(num, result):
    """Extract the job postings from one page of Lagou's JSON payload and
    insert them into the `lagou` table via the module-level cursor.

    Parameters:
        num:    1-based page number, used only for the progress message.
        result: parsed JSON response; postings are expected under
                result['content']['positionResult']['result'].

    Raises:
        KeyError/TypeError if the payload does not have the expected shape
        (e.g. the site returned an anti-crawler challenge instead of data).
    """
    for job in result['content']['positionResult']['result']:
        position_name = job['positionName']
        company_full_name = job['companyFullName']
        salary = job['salary']
        # Salary arrives as a range like "10k-20k": split on '-' and drop
        # the trailing unit character from each side before int conversion.
        low, high = salary.split('-')
        minsalary = int(low[:-1])
        maxsalary = int(high[:-1])
        education = job['education']
        city = job['city']
        companyLabelList = ",".join(job['companyLabelList'])
        # Collapse all whitespace (newlines, carriage returns, spaces) in
        # the job description into a single line, as the original intended.
        positionDetail = ''.join(job['positionDetail'].split())
        print("职位名称:", position_name)
        print("公司全称:", company_full_name)
        print("薪资标准:", salary)
        print("最少薪资:", minsalary)
        print("最多薪资:", maxsalary)
        print("学历要求:", education)
        print("所在城市:", city)
        print("公司待遇:", companyLabelList)
        print('职位要求:', positionDetail)
        # Parameterized insert — never build SQL by string concatenation.
        cur.execute(
            "INSERT INTO lagou (position_name,company_full_name,salary,minsalary,maxsalary,"
            "education,city,companyLabelList,positionDetail)"
            " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (position_name, company_full_name, salary, minsalary, maxsalary,
             education, city, companyLabelList, positionDetail))
        conn.commit()
        # Random pause between inserts to throttle the crawl and reduce the
        # chance of triggering anti-crawler measures.
        time_out = random.randint(3, 8)
        time.sleep(time_out)
        print("休眠时间:%s s" % time_out)
    print(f'------------------完成第{num}页爬取------------------')


if __name__ == "__main__":
    # JSON endpoint (found via the browser's F12 devtools) that backs the
    # search page and returns the postings as JSON.
    url = 'https://www.lagou.com/jobs/v2/positionAjax.json'
    try:
        for num in range(1, 151):
            data = param_data(num)
            res = requests.post(url=url, headers=header, params=data)
            result = res.json()
            try:
                crawl_data(num, result)
            except (KeyError, TypeError):
                # Payload did not have the expected shape. Retry once when the
                # HTTP status was OK (transient hiccup); otherwise report and
                # stop instead of looping forever as the original code did.
                if res.status_code == 200:
                    crawl_data(num, result)
                else:
                    print("---------------------------------程序异常,请立即解决!---------------------------------")
                    break
    finally:
        # Always release the DB resources, even if the crawl aborts.
        cur.close()
        conn.close()
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/309040.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号