# 直接上代码
'''
人人车 北京二手车 1.名称,2.价格,3.详情,4.图片,5.生产日期,6.公里数,7.首付,8.降价信息,9.车牌所在地,10.排放标准
11.过户记录,12.车主评价,13.车况信息——车辆外观,14.车况信息——车辆内饰,15.车况信息——车辆底盘,16.机构检测结果,17.年检到期时间
18.商业险到期时间,19.有无购车发票,20.是否4S店保养,21.交强险到期时间
'''
import pymysql
import requests
import time
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
# Module-level accumulators. The scraping helpers below append to these lists
# and getcar_main() empties them again after each page has been stored.

# --- fields read from the listing page ---
car_name: list = []          # used-car names
car_price: list = []         # used-car prices
car_url: list = []           # links to the car detail pages
car_picture_url: list = []   # links to the car pictures
car_date: list = []          # production dates
car_km: list = []            # mileage driven
car_pay: list = []           # down-payment prices
car_pi: list = []            # price-reduction information

# --- fields read from each car's detail page ---
car_location: list = []      # where the licence plate is registered
car_es: list = []            # emission standard
car_tf: list = []            # ownership-transfer records
car_usertx: list = []        # owner's comments
car_condit: list = []        # raw condition info (exterior/interior/chassis interleaved)
car_condit_out: list = []    # condition info: exterior
car_condit_in: list = []     # condition info: interior
car_condit_chassis: list = []  # condition info: chassis
car_result: list = []        # inspection-agency result

# --- paperwork fields read from each car's detail page ---
car_procedures_YearlyInspection: list = []  # annual-inspection expiry date
car_Ciet: list = []          # commercial-insurance expiry date
car_invoice: list = []       # whether a purchase invoice exists
car_maintain: list = []      # whether the car was serviced at a 4S dealership
car_compulsory: list = []    # compulsory traffic insurance expiry date

# Page number appended to the listing URL in getcar_main().
page: int = 1
def db_mysql(car_name, car_date, car_km, car_price, car_pay, car_pi, car_url,
             car_picture_url, car_location, car_es, car_tf, car_usertx,
             car_condit_out, car_condit_in, car_condit_chassis, car_result,
             car_procedures_YearlyInspection, car_Ciet, car_invoice,
             car_maintain, car_compulsory):
    """Insert one scraped page of cars into the ``rrc_table`` MySQL table.

    Every argument is a sequence holding one column of data. The sequences
    are combined positionally with ``zip``, so the number of inserted rows
    equals the length of the shortest sequence. Each row is committed on its
    own; a row that fails to insert is rolled back and reported, and the
    remaining rows are still attempted.

    Returns:
        None.
    """
    # Adjacent literals inside parentheses are concatenated by the parser,
    # which keeps the statement on two readable lines without a backslash.
    sql = (
        'insert into rrc_table(car_name,car_date,car_km,car_price,car_pay,car_pi,car_url,car_picture_url,car_location,car_es,car_tf,car_usertx,car_condit_out, car_condit_in, car_condit_chassis,car_result,car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain, car_compulsory) '
        'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    )
    rows = zip(
        car_name, car_date, car_km, car_price, car_pay, car_pi, car_url,
        car_picture_url, car_location, car_es, car_tf, car_usertx,
        car_condit_out, car_condit_in, car_condit_chassis, car_result,
        car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain,
        car_compulsory,
    )

    # NOTE(review): connection credentials are hard-coded here; consider
    # moving them to configuration before sharing or deploying this script.
    db = pymysql.connect(host='localhost', user='root', password='1234', port=3306, database='rrc')
    try:
        cursor = db.cursor()
        for i, row in enumerate(rows, start=1):
            try:
                cursor.execute(sql, row)
                db.commit()
                print("爬取成功")
            except Exception as exc:
                # Best-effort insert: report the failing row (and why it
                # failed), undo it, and carry on with the next one.
                print("第" + str(i) + "条数据出现数据插入异常")
                print(repr(exc))
                db.rollback()
    finally:
        # Always release the connection, even if something unexpected
        # escapes the per-row handler above.
        db.close()
def getcar_main(pages=50):
    """Crawl consecutive listing pages and store every page in MySQL.

    For each page the listing fields are scraped first, then the detail
    page of every car found is visited for the remaining fields, the rows
    are written with ``db_mysql`` and finally all accumulator lists are
    emptied so the next page starts clean.

    Args:
        pages: Number of consecutive listing pages to crawl. The first page
            requested is the module-level ``page`` value.

    Returns:
        None.
    """
    base_url = 'https://www.renrenche.com/bj/ershouche/p'
    for x in range(pages):
        print('开始第'+str(x+1)+'页内容爬取')
        # Advance the page number on every iteration; using the constant
        # ``page`` alone would request the same listing page every time.
        url = base_url + str(page + x)

        # The helpers return the module-level accumulator lists, so the
        # local names below refer to those very same list objects.
        car_name = get_carname(url)
        car_date, car_km = get_producedate(url)
        car_price = get_price(url)
        car_pay = get_dp(url)
        car_pi = get_pi(url)
        car_url = get_carurl(url)
        car_picture_url = get_picture(url)
        print("正在抓取车牌所在地...")
        car_location = get_carLicense(car_url)
        print("正在抓取二手车排放标准...")
        car_es = get_es(car_url)
        print("正在抓取二手车过户记录...")
        car_tf = get_transfer(car_url)
        print("正在抓取车主评价...")
        car_usertx = get_omt(car_url)
        print("正在抓取车况信息...")
        car_condit_out, car_condit_in, car_condit_chassis = get_condit(car_url)
        print("正在抓取机构检测结果...")
        car_result = get_result(car_url)
        print("正在抓取车辆手续信息...")
        car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain, car_compulsory = get_procedures(car_url)

        columns = (
            car_name, car_date, car_km, car_price, car_pay, car_pi, car_url,
            car_picture_url, car_location, car_es, car_tf, car_usertx,
            car_condit_out, car_condit_in, car_condit_chassis, car_result,
            car_procedures_YearlyInspection, car_Ciet, car_invoice,
            car_maintain, car_compulsory,
        )
        print("正在存入数据库...")
        db_mysql(*columns)

        print('开始清空列表列表...')
        for column in columns:
            column.clear()
        # get_condit() accumulates its raw data in the module-level
        # car_condit list; empty it as well so nothing from this page leaks
        # into the next one.
        car_condit.clear()
        print('所有列表已清空')
def get_page(url):
    """Download ``url`` and return it parsed as a PyQuery document.

    Args:
        url: Address of the page to fetch.

    Returns:
        A PyQuery document built from the response body when the server
        answers with status 200. On any failure (network error, timeout,
        non-200 status, unparsable body) a message is printed and an empty
        PyQuery object is returned, so that callers which immediately run a
        selector on the result get no matches instead of crashing on
        ``None``.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        # A timeout keeps one unresponsive server from hanging the crawl.
        rs = requests.get(url=url, headers=headers, timeout=10)
        if rs.status_code == 200:
            return pq(rs.text)
    except Exception as exc:
        # Deliberately broad: both the HTTP request and the HTML parsing
        # can fail, and a single bad page should not abort the whole run.
        # Unlike a bare ``except`` this still lets Ctrl-C through.
        print(repr(exc))
    print("url出错了!")
    return pq([])
def get_in_page(url):
    """Download a car detail page and return it parsed as a PyQuery document.

    Detail pages are fetched in exactly the same way as listing pages, so
    this simply delegates to ``get_page`` instead of keeping a second copy
    of the same request/parse logic.

    Args:
        url: Address of the detail page to fetch.

    Returns:
        Whatever ``get_page(url)`` returns.
    """
    return get_page(url)
# function1: collect the links to the used-car detail pages
def get_carurl(url):
    """Append the absolute detail-page link of every listing to ``car_url``.

    Args:
        url: Address of the listing page.

    Returns:
        The module-level ``car_url`` list.
    """
    site_root = 'https://www.renrenche.com'
    listing = get_page(url)
    thumbnails = listing(".thumbnail ")
    for thumb in thumbnails.items():
        # The page's href is site-relative, so prefix the site root.
        link = site_root + thumb.attr.href
        car_url.append(link)
    return car_url
# function2: collect the used-car names
def get_carname(url):
    """Append the ``data-title`` of every matching element to ``car_name``.

    Args:
        url: Address of the listing page.

    Returns:
        The module-level ``car_name`` list.
    """
    listing = get_page(url)
    buttons = listing(".schedule.btn-base.btn-wireframe")
    car_name.extend(button.attr('data-title') for button in buttons.items())
    return car_name
# function3: collect the production date and the mileage of each used car
def get_producedate(url):
    """Split every ``.mileage`` text on '/' into production date and mileage.

    The part before the first '/' is appended to ``car_date`` and the part
    after it to ``car_km``.

    Args:
        url: Address of the listing page.

    Returns:
        A ``(car_date, car_km)`` tuple of the module-level lists.
    """
    listing = get_page(url)
    for cell in listing(".mileage").items():
        pieces = cell.text().split('/')
        car_date.append(pieces[0])
        car_km.append(pieces[1])
    return car_date, car_km
# function4: collect the used-car prices
def get_price(url):
    """Append the text of every price element to ``car_price``.

    The ``.down-payment`` selector is passed to ``remove`` before the
    remaining price text is read.

    Args:
        url: Address of the listing page.

    Returns:
        The module-level ``car_price`` list.
    """
    listing = get_page(url)
    price_nodes = listing(".tags-box").children('.price')
    price_nodes = price_nodes.remove('.down-payment')
    car_price.extend(node.text() for node in price_nodes.items())
    return car_price
# function5: collect the used-car picture links
def get_picture(url):
    """Append the image URL of every listing to ``car_picture_url``.

    The ``data-src`` attribute is used when it is present; otherwise the
    plain ``src`` attribute is used. Either value is prefixed with
    ``https:``.

    Args:
        url: Address of the listing page.

    Returns:
        The module-level ``car_picture_url`` list.
    """
    listing = get_page(url)
    images = listing(".thumbnail").children('.img-backgound').children('img')
    for img in images.items():
        source = img.attr('data-src')
        if source is None:
            source = img.attr('src')
        car_picture_url.append('https:' + source)
    return car_picture_url
# function6: collect the used-car down payments
def get_dp(url):
    """Append the down payment of every listing to ``car_pay``.

    The text of each ``.tags-box`` element is split into lines and the third
    line is taken as the down payment. When the box has fewer than three
    lines, "不可首付" is recorded instead, so that ``car_pay`` still receives
    exactly one entry per box.

    Args:
        url: Address of the listing page.

    Returns:
        The module-level ``car_pay`` list.
    """
    doc = get_page(url)
    for box in doc(".tags-box").items():
        # Split on line breaks. Splitting on the letter "n" (as the code
        # previously did) never separates the lines of the box text.
        lines = box.text().split("\n")
        # Index 2 is only safe when at least three lines exist.
        if len(lines) > 2:
            car_pay.append(lines[2])
        else:
            car_pay.append("不可首付")
    return car_pay
# function7: collect the used-car price-reduction information
def get_pi(url):
    """Append the price-reduction note of every listing to ``car_pi``.

    The text of each ``.thumbnail`` element is split into lines. When one of
    the lines is exactly "已降", the entry recorded is "已降" followed by the
    second line; otherwise "近期未降价" is recorded. ``car_pi`` receives
    exactly one entry per thumbnail.

    Args:
        url: Address of the listing page.

    Returns:
        The module-level ``car_pi`` list.
    """
    doc = get_page(url)
    for thumb in doc(".thumbnail").items():
        # Split on line breaks rather than on the letter "n".
        lines = thumb.text().split("\n")
        # NOTE(review): the reduced amount is read from the second line, as
        # before — confirm against the live page layout.
        if "已降" in lines and len(lines) > 1:
            car_pi.append("已降" + lines[1])
        else:
            car_pi.append("近期未降价")
    return car_pi
# function8: collect where each used car's licence plate is registered
def get_carLicense(car_url):
    """Append the ``#car-licensed`` text of every detail page to ``car_location``.

    Args:
        car_url: Iterable of detail-page addresses.

    Returns:
        The module-level ``car_location`` list.
    """
    for detail_url in car_url:
        detail = get_in_page(detail_url)
        matches = detail("#car-licensed")
        car_location.extend(node.text() for node in matches.items())
    return car_location
# function9: collect the used-car emission standards
def get_es(car_url):
    """Append the emission-standard text of every detail page to ``car_es``.

    Args:
        car_url: Iterable of detail-page addresses.

    Returns:
        The module-level ``car_es`` list.
    """
    selector = ".span5.car-fluid-standard .detail-version3-right-icon .car-summary"
    for detail_url in car_url:
        detail = get_in_page(detail_url)
        for node in detail(selector).items():
            text = node.text()
            car_es.append(text)
    return car_es
# function10: collect the used-car ownership-transfer records
def get_transfer(car_url):
    """Append the transfer-record text of every detail page to ``car_tf``.

    Args:
        car_url: Iterable of detail-page addresses.

    Returns:
        The module-level ``car_tf`` list.
    """
    selector = "#zhimaicar-detail-header-right .row-fluid-wrapper .car-transfer .car-summary"
    for detail_url in car_url:
        summaries = get_in_page(detail_url)(selector)
        car_tf.extend(summary.text() for summary in summaries.items())
    return car_tf
# function11: collect the owners' comments
def get_omt(car_url):
    """Append the owner-comment text of every detail page to ``car_usertx``.

    Args:
        car_url: Iterable of detail-page addresses.

    Returns:
        The module-level ``car_usertx`` list.
    """
    for detail_url in car_url:
        detail = get_in_page(detail_url)
        comments = detail(".text-about-car-owner .owner-main-text")
        for comment in comments.items():
            car_usertx.append(comment.text())
    return car_usertx
# function12: collect the car condition information
def get_condit(car_url):
    """Collect exterior / interior / chassis condition texts per detail page.

    The matching subtitle texts of every detail page are appended to the
    module-level ``car_condit`` list in page order. The entries added by
    this call are then de-interleaved: every third entry starting at offset
    0 is treated as exterior, offset 1 as interior and offset 2 as chassis.

    Args:
        car_url: Iterable of detail-page addresses.

    Returns:
        A ``(car_condit_out, car_condit_in, car_condit_chassis)`` tuple of
        new lists covering only the pages visited by this call.
    """
    # Remember where this call's data starts. car_condit is shared module
    # state and may still hold entries from earlier calls; slicing from
    # index 0 would hand those stale entries back to the caller again.
    first_new = len(car_condit)
    for url in car_url:
        doc = get_in_page(url)
        for cd in doc("#gallery .detail-car-appearance-title .zhimai-subtitle").items():
            car_condit.append(cd.text())
    fresh = car_condit[first_new:]
    car_condit_out = fresh[0::3]
    car_condit_in = fresh[1::3]
    car_condit_chassis = fresh[2::3]
    return car_condit_out, car_condit_in, car_condit_chassis
# function13: collect the inspection-agency results
def get_result(car_url):
    """Append the inspection-result text of every detail page to ``car_result``.

    Args:
        car_url: Iterable of detail-page addresses.

    Returns:
        The module-level ``car_result`` list.
    """
    selector = ".report-inner-box .report-main .report-result-des"
    for detail_url in car_url:
        report = get_in_page(detail_url)
        car_result.extend(entry.text() for entry in report(selector).items())
    return car_result
# function14: collect the vehicle paperwork information
def get_procedures(car_url):
    """Collect the paperwork fields from every detail page.

    The text of each ``.interval-title-content`` element is split into
    lines; lines 1, 3, 5, 7 and 9 (0-based) are appended to the annual
    inspection, commercial insurance, invoice, maintenance and compulsory
    insurance lists respectively. Each list receives exactly one entry per
    element; a line that does not exist is recorded as an empty string so
    the five lists stay the same length.

    Args:
        car_url: Iterable of detail-page addresses.

    Returns:
        A tuple ``(car_procedures_YearlyInspection, car_Ciet, car_invoice,
        car_maintain, car_compulsory)`` of the module-level lists.
    """
    # Target list paired with the line index its value is read from.
    targets = (
        (car_procedures_YearlyInspection, 1),
        (car_Ciet, 3),
        (car_invoice, 5),
        (car_maintain, 7),
        (car_compulsory, 9),
    )
    for url in car_url:
        doc = get_in_page(url)
        for pd in doc(".interval-title-content").items():
            # Split on line breaks rather than on the letter "n".
            lines = pd.text().split("\n")
            for target, index in targets:
                # One append per list per element: appending the invoice
                # value twice would shift that column against the others.
                target.append(lines[index] if index < len(lines) else "")
    return car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain, car_compulsory
# Run the scraper.
# NOTE: this call is unguarded, so it also executes when the module is imported.
getcar_main()