- 介绍
- 结果展示
- 爬取的网页
- 参数
- 运行环境
- 项目分析
- 简单爬取
- 多线程爬取
- 数据的存储
- csv
- json
- excel
- mysql
- 展示数据
- 运行
- 注意
该项目为课程期末作业,python三个月的学习成果展示,分析各地职位薪水的多少
结果展示可视化结果:
excel结果:
json结果:
mysql:
源代码上传到Gitee:
https://gitee.com/ren-wenqing/PythonLearning.git
https://search.51job.com/
参数例:https://search.51job.com/list/090000,000000,0000,00,9,99,java,2,79.html
090000:城市代码
java:关键词
79:页数
具体参数可以自己在https://search.51job.com/中的搜索框中输入数据,观察浏览器地址的变化
运行环境python:3.9.6
mysql:8.0.18
需要用到的库:
import collections import json import xlsxwriter as xlsxwriter from utils import get_header import requests import queue import threading import os import csv import re import pymysql import matplotlib.pyplot as plt from openpyxl import load_workbook from faker import Faker项目分析 简单爬取
直接向网页发送请求就可以获取到网页的源代码
req=requests.get(url=url, headers=self.header)
获取结果:
这里页面中的加载框一直是加载状态,并没有我们需要的数据,但爬取的网址在浏览器中却可以正常显示:
这是因为requests库无法解析js,页面源代码中可能包含招聘列表
查看页面源代码,可以看到有一行特别长的js
复制到解析工具:
都是我们需要的数据
这里直接用正则匹配最快
def Spider(self):
while not self.pagequeue.empty():
url = self.pagequeue.get()
print('正在爬取:{}'.format(url))
req = requests.get(url, headers=get_header())
req.encoding = 'gbk'
response = req.content.decode('gbk')
for i in range(1, 12):
try:
title = re.findall(r'"job_title":"(.*?)"', response)
if title[0] == None:
break
company_name = re.findall(r'"company_name":"(.*?)"', response)
job_name = re.findall(r'"job_title":"(.*?)"', response)
salary = re.findall(r'"providesalary_text":"(.*?)"', response)
work_area = re.findall(r'"workarea_text":"(.*?)"', response)
time = re.findall(r'"issuedate":"(.*?)"', response)
company_type = re.findall(r'"companytype_text":"(.*?)"', response)
company_hangye = re.findall(r'"companyind_text":"(.*?)"', response)
detail = re.findall(r'"job_href":"(.*?)"', response)
if len(salary[i]) != 0:
salary = salary[i].replace('\/', '/')
salary=formatyue(salary)
salary = formatqian(salary)
salary=fomatpjun(salary)
detail = detail[i].replace('\/', '/')
company_hangye = company_hangye[i].replace('\/', '/')
data = {
"职位名称": job_name[i],
"薪资(万/月)": salary,
"公司名称": company_name[i],
"工作地点": work_area[i],
"发布时间": time[i],
"公司链接": detail,
"公司类型": company_type[i],
"公司行业": company_hangye,
}
self.jobqueue.put(data)
except:
continue
这些是我debug后显示的数据,可以看到里面有很多转义“”符号,需要剔除
我定义了三个方法对薪水解析统一格式,因为有的薪水是(千/月),有的是(万/月),还有的是(万/年)
def formatyue(str):
sa = re.findall(r'(.*?)万/年', str)
if len(sa) ==0:
return str
else:
sa1 = re.findall(r'(.*?)-(.*)', sa[0])
saf = float(sa1[0][0])
sal = float(sa1[0][1])
saf = saf / 12
sal = sal / 12
str = "{}-{}万/月".format(round(saf, 1), round(sal, 1))
return str
def formatqian(str):
sa = re.findall(r'(.*?)千/月', str)
if len(sa) ==0:
return str
else:
sa1 = re.findall(r'(.*?)-(.*)', sa[0])
saf = float(sa1[0][0])
sal = float(sa1[0][1])
saf = saf / 10
sal = sal / 10
str = "{}-{}万/月".format(round(saf, 1), round(sal, 1))
return str
def fomatpjun(str):
sa = re.findall(r'(.*?)万/月', str)
sa1 = re.findall(r'(.*?)-(.*)', sa[0])
saf = float(sa1[0][0])+float(sa1[0][1])
str = "{}".format(round(saf/2, 1))
return str
多线程爬取
加快爬取速度
def run(self): thread_list = [] for i in range(self.thread): t = threading.Thread(target=self.Spider) thread_list.append(t) for t in thread_list: t.setDaemon(True) t.start() for t in thread_list: t.join()
地址字段输入的是文字,但浏览器里是地址的数字代码,需要遍历地址对应的json
链接: https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js.
def _get_city_code(self): url = 'https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js' req = requests.get(url, headers=self.header).text a = req.find(self.city) return req[a - 9:a - 3]数据的存储 csv
读取文件路径后,根据Spider方法里返回的列表,进行读取写入
if os.path.exists(self.path):
data_list = []
self.path = os.path.join(self.path, 'save_list')
while not self.jobqueue.empty():
data_list.append(self.jobqueue.get())
with open(os.path.join(self.path, 'info——job——{}——zone{}.csv'.format(self.keyword, self.city)), 'w',
newline='', encoding='utf-8') as f:
f_csv = csv.DictWriter(f, self.csv_header)
f_csv.writeheader()
f_csv.writerows(data_list)
json
这个是通过数据库排序输出获得的
import pymysql
def check():
con = pymysql.connect(host='localhost', port=3306, user='root', password='0', database='qcwy', charset='utf8')
cursor = con.cursor()
sql = "SELECT * FROM `jobinfo` order by salary desc;"
cursor.execute(sql)
con.commit()
results = cursor.fetchall()
col_names = ['job_name','salary', 'company_name', 'work_area','company_hangye','detail']
strs = {}
for row in results:
strs[row[0]] = dict(zip(col_names, row[2:]))
cursor.close()
con.close()
result = json.dumps(strs)
filename = 'gongsi.json'
with open(filename, 'w') as file_obj:
file_obj.write(result)
excel
import xlsxwriter as xlsxwriter
workbook = xlsxwriter.Workbook(r'C:UsersSixStartPycharmProjectspythonProject1期末python.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write("A1", "职位名称")
worksheet.write("B1", "薪资(万/月)")
worksheet.write("C1", "公司名称")
worksheet.write("D1", "工作地点")
worksheet.write("E1", "发布时间")
worksheet.write("F1", "公司链接")
worksheet.write("G1", "公司类型")
worksheet.write("H1", "公司行业")
worksheet.write("L1", "工资")
worksheet.write("M1", "次数")
salarylist = []
for i in range(1, len(data_list)):
job_name = data_list[i]['职位名称']
salary = data_list[i]['薪资(万/月)']
company_name = data_list[i]['公司名称']
work_area = data_list[i]['工作地点']
time = data_list[i]['发布时间']
detail = data_list[i]['公司链接']
company_type = data_list[i]['公司类型']
company_hangye = data_list[i]['公司行业']
salarylist.append(salary)
try:
worksheet.write("A{}".format(i+1), job_name)
worksheet.write("B{}".format(i+1), salary)
worksheet.write("C{}".format(i+1), company_name)
worksheet.write("D{}".format(i+1), work_area)
worksheet.write("E{}".format(i+1), time)
worksheet.write("F{}".format(i+1), detail)
worksheet.write("G{}".format(i+1), company_type)
worksheet.write("H{}".format(i+1), company_hangye)
except:
pass
workbook.close()
mysql
CREATE TABLE `jobinfo` ( `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '信息编号', `job_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '职位名称', `salary` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '面议' COMMENT '薪资', `company_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司名称', `work_area` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '工作地点', `time` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '发布时间', `detail` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司链接', `company_type` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司类型', `company_hangye` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司行业', PRIMARY KEY (`id`) USING BTREE ) ENGINE = InnoDB AUTO_INCREMENT = 48561 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic; SET FOREIGN_KEY_CHECKS = 1;
import pymysql
con = pymysql.connect(host='localhost', port=3306, user='root', password='0', database='qcwy', charset='utf8')
cursor = con.cursor()
sql = "INSERT INTO `qcwy`.`jobinfo`(`id`, `job_name`, `salary`, `company_name`, `work_area`, `time`, `detail`, `company_type`, `company_hangye`) VALUES"
sqlv = "(null,'{}','{}','{}','{}','{}','{}','{}','{}');".format(job_name,salary,company_name,work_area, time, detail, company_type,company_hangye)
sql2 = sql + sqlv
try:
cursor.execute(sql2)
con.commit()
except:
con.rollback()
cursor.close()
# 关闭连接
con.close()
展示数据
这个是和爬取分开的,所以重新打开了excel表读取(毕竟不能每次运行都一次爬几百个网页,防止我人无了)
import matplotlib.pyplot as plt
from openpyxl import load_workbook
def showpicuure():
salarymoneylist = []
salarycountlist = []
# 读取文件
wb = load_workbook('python.xlsx')
ws = wb.active
# 获取表中L列,L列为薪水从低到高排列
for col in ws['L']:
if isinstance(col.value, str):
salarymoneylist.append(col.value)
salarymoneylist.remove("工资")
# 获取表M列,M列为该薪水的招聘数量
for col in ws['M']:
if isinstance(col.value, int):
salarycountlist.append(col.value)
#绘图
plt.style.use('ggplot')
customers_index = range(len(salarymoneylist))
#设定图表长宽和像素
fig = plt.figure(figsize=(15, 3), dpi=100)
#设定图表所占区域
ax1 = fig.add_subplot(1, 1, 1)
#设定xy列,文字居中,图表颜色
ax1.bar(customers_index, salarycountlist, align='center', color='darkblue')
#设定文字在x轴底部,y轴左侧
ax1.xaxis.set_ticks_position('bottom')
ax1.yaxis.set_ticks_position('left')
#设定x轴的元素
plt.xticks(customers_index, salarymoneylist, rotation=0, fontsize='small')
plt.xlabel('nums')
plt.ylabel('salary')
plt.title('Salary—Statistics—{}')
plt.figure(dpi=80)
plt.savefig('salary.png', bbox_inches='tight')
plt.show()
运行
源代码在本博客开头哦,我博客上的这些当然是运行不了的,只能作为源代码的讲解
if __name__ == '__main__':
key1 = input("请输入关键词")
city1 = input("请输入城市")
#爬虫
zhaopin(keyword=key1, city=city1).run()
#数据库正序输出json
check()
#读取excel生成图像
showpicuure()
注意
不是每次都能运行成功,但是大部分都没什么问题,可能和网络质量有关,如果报错的gbk编码那些,就和网速有关,也可能是我引入的faker 库的问题



