# Python爬取拉勾网招聘信息并存储到MySQL数据库中
import requests
import pymysql
import time
import random
import csv
# Request headers copied from a logged-in browser session on lagou.com
# (F12 → Network). The cookie is the critical part: without a fresh,
# logged-in cookie the endpoint rejects the request. NOTE(review): this
# cookie is session-bound and will expire — refresh it before each run.
header = {
'cookie': 'user_trace_token=20211007083056-bec5b35e-724f-49f5-a8b2-be5e58f3b1f8; _ga=GA1.2.2001419745.1633566685; JSESSIonID=ABAAAECABFAACEA0F4F8A1BA3E248FB8C6483A08C472C9B; WEBTJ-ID=20211007083137-17c5829ab2814d-050d60c59c5313-4343363-1440000-17c5829ab2951a; RECOMMEND_TIP=true; privacyPolicyPopup=false; LGUID=20211007083158-6f8f506b-695d-4ec2-b5fa-79922eef422b; _gid=GA1.2.2140722.1633566715; sajssdk_2015_cross_new_user=1; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1633566685,1633566812,1633566855; gate_login_token=04e80a902d4f271df5c5a44c2bc22913f9c55dfd64c49f6e5fa14a04c7daae3b; _putrc=5F92BD70167483B5123F89F2B170EADC; login=true; hasDeliver=0; unick=%E9%83%AD%E6%99%A8; X_HTTP_TOKEN=0c8b4dfa25a774e97797653361b7dc46edd2996eb6; __SAFETY_CLOSE_TIME__22792627=1; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1633567973; LGRID=20211007085301-ab66ee19-6d10-459e-b1a2-4791f8ae1522; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2222792627%22%2C%22first_id%22%3A%2217c582a073163d-09fc0b72e50a2f-4343363-1440000-17c582a0732c45%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%2292.0.4515.131%22%7D%2C%22%24device_id%22%3A%2217c582a073163d-09fc0b72e50a2f-4343363-1440000-17c582a0732c45%22%7D',
'origin': 'https://www.lagou.com',
'referer': 'https://www.lagou.com/wn/jobs?px=new&pn=3&fromSearch=true&kd=%E5%A4%A7%E6%95%B0%E6%8D%AE',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
}
# MySQL connection parameters. NOTE(review): credentials are hard-coded;
# consider environment variables. 'utf8' in MySQL is 3-byte — if job
# descriptions ever contain emoji, 'utf8mb4' would be needed; confirm
# the table's charset before changing.
config = {
'host': 'localhost',
'port': 3306,
'user': 'root',
'password': 'root',
'db': 'lagou',
'charset': 'utf8'
}
# Module-level connection shared by crawl_data(); closed in __main__.
conn = pymysql.connect(**config)
cur = conn.cursor()  # cursor object used for all INSERTs below
# Build the POST payload for one results page; pagination happens here.
def param_data(num):
    """Return the query parameters for page *num* of the Lagou search.

    The keyword ('kd') and city are fixed to the "big data / nationwide"
    search this script targets; only the page number ('pn') varies.
    """
    return {
        'first': 'true',
        # sic: 'needAddtionalResult' is Lagou's own spelling of the parameter.
        'needAddtionalResult': 'false',
        'city': '全国',
        'px': 'new',
        'pn': f'{num}',
        'fromSearch': 'true',
        'kd': '大数据',
    }
# Parse one page of search results and persist every posting to MySQL.
def crawl_data(num, result):
    """Extract job postings from one decoded JSON page and insert them
    into the `lagou` table via the module-level cursor `cur`.

    Args:
        num: 1-based page number (used only in the progress message).
        result: decoded JSON dict from the positionAjax endpoint; the
            postings list lives at result['content']['positionResult']['result'].

    Raises:
        KeyError / ValueError: if a posting is missing a field or its
        salary string is not in the expected "NNk-MMk" form.
    """
    for job in result['content']['positionResult']['result']:
        position_name = job['positionName']
        company_full_name = job['companyFullName']
        salary = job['salary']
        # Salary looks like "15k-30k": split on '-' and drop the trailing
        # unit character from each side to get integer bounds.
        # NOTE(review): dash-less salaries (e.g. "15k以上") would raise
        # ValueError here — same limitation as the original code.
        low, high = salary.split('-', 1)
        minsalary = int(low[:-1])
        maxsalary = int(high[:-1])
        education = job['education']
        city = job['city']
        company_labels = ",".join(job['companyLabelList'])
        # Flatten the detail text: the original source had this literal
        # corrupted across lines; intent was to strip newlines, carriage
        # returns and spaces from the HTML-ish description.
        position_detail = (
            job['positionDetail']
            .replace('\n', '')
            .replace('\r', '')
            .replace(' ', '')
        )
        print("职位名称:", position_name)
        print("公司全称:", company_full_name)
        # Fix: original used "".join(salary), a no-op on a str.
        print("薪资标准:", salary)
        print("最少薪资:", minsalary)
        print("最多薪资:", maxsalary)
        print("学历要求:", education)
        print("所在城市:", city)
        print("公司待遇:", company_labels)
        print('职位要求:', position_detail)
        # Parameterized INSERT — values are never interpolated into SQL.
        cur.execute(
            "INSERT INTO lagou (position_name,company_full_name,salary,minsalary,"
            "maxsalary,education,city,companyLabelList,positionDetail)"
            " VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (position_name, company_full_name, salary, minsalary, maxsalary,
             education, city, company_labels, position_detail))
        cur.connection.commit()
    # Random pause between pages to avoid anti-scraping throttling.
    # NOTE(review): original indentation was lost; assumed per-page (not
    # per-posting) sleep — confirm desired pacing.
    time_out = random.randint(3, 8)
    time.sleep(time_out)
    print("休眠时间:%s s" % time_out)
    print(f'------------------完成第{num}页爬取------------------')
if __name__ == "__main__":
    # JSON endpoint for the job list, found via the browser's F12
    # developer tools (Network panel).
    url = 'https://www.lagou.com/jobs/v2/positionAjax.json'
    try:
        for num in range(1, 151):
            data = param_data(num)
            res = requests.post(url=url, headers=header, params=data)
            # Fix: the original wrapped crawl_data in a bare `except:`
            # whose handler looped `while True:` over the SAME stale
            # response — retrying forever on a 200 that keeps failing,
            # or printing the error message in an infinite loop on a
            # non-200. Fail fast on a bad status instead.
            if res.status_code != 200:
                print("---------------------------------程序异常,请立即解决!---------------------------------")
                break
            try:
                crawl_data(num, res.json())
            except (KeyError, ValueError) as exc:
                # One malformed page shouldn't abort the whole run.
                print(f"第{num}页处理失败:{exc}")
    finally:
        # Fix: guarantee the DB handles are released on any exit path.
        cur.close()
        conn.close()



