栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

python简单爬取人人车网站在售北京二手车车况信息并存入mysql数据库

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

python简单爬取人人车网站在售北京二手车车况信息并存入mysql数据库

直接上代码
'''
Renrenche, Beijing used cars. Fields: 1. name, 2. price, 3. details, 4. picture, 5. production date, 6. mileage (km), 7. down payment, 8. price-reduction info, 9. licence-plate location, 10. emission standard,
11. ownership-transfer record, 12. owner's comment, 13. condition - exterior, 14. condition - interior, 15. condition - chassis, 16. inspection agency result, 17. yearly inspection expiry date,
18. commercial insurance expiry date, 19. purchase invoice available or not, 20. maintained at a 4S shop or not, 21. compulsory traffic insurance expiry date
'''
import pymysql
import requests
import time
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
# Module-level result lists, one per scraped field. Every list is a distinct
# object; the scraper functions in this file append to them.

# Car name, price, detail-page link, picture link
car_name, car_price, car_url, car_picture_url = [], [], [], []

# Production date, mileage, down payment, price-reduction info
car_date, car_km, car_pay, car_pi = [], [], [], []

# Licence-plate location, emission standard, transfer record, owner's comment
car_location, car_es, car_tf, car_usertx = [], [], [], []

# Condition info (all entries), then split by exterior / interior / chassis
car_condit = []
car_condit_out, car_condit_in, car_condit_chassis = [], [], []

# Inspection agency result
car_result = []

# Yearly inspection expiry date, commercial insurance expiry date
car_procedures_YearlyInspection, car_Ciet = [], []

# Purchase invoice available or not, maintained at a 4S shop or not
car_invoice, car_maintain = [], []

# Compulsory traffic insurance expiry date
car_compulsory = []

page = 1
def db_mysql(car_name,car_date,car_km,car_price,car_pay,car_pi,car_url,car_picture_url,car_location,car_es,car_tf,car_usertx,car_condit_out, car_condit_in, car_condit_chassis,car_result,car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain, car_compulsory):
    """Insert one row per car into ``rrc_table`` of the local ``rrc`` MySQL database.

    The 21 arguments are parallel sequences: position ``k`` of each one
    describes the same car. They are combined with ``zip``, so the number of
    rows written equals the length of the shortest sequence.

    Every row is committed on its own. A row that fails to insert is reported
    with its 1-based position, rolled back, and the loop moves on to the next
    row. The connection is closed even if something unexpected is raised.
    """
    # The statement is split over two literals; the parentheses replace the
    # line-continuation backslash that is required to keep this one expression.
    sql = ('insert into rrc_table(car_name,car_date,car_km,car_price,car_pay,car_pi,car_url,car_picture_url,car_location,car_es,car_tf,car_usertx,car_condit_out, car_condit_in, car_condit_chassis,car_result,car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain, car_compulsory) '
           'values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
    rows = zip(car_name,car_date,car_km,car_price,car_pay,car_pi,car_url,car_picture_url,
               car_location,car_es,car_tf,car_usertx,car_condit_out, car_condit_in,
               car_condit_chassis,car_result,car_procedures_YearlyInspection, car_Ciet,
               car_invoice, car_maintain, car_compulsory)
    # Open the database.
    # NOTE(review): connection settings and credentials are hard-coded here.
    db = pymysql.connect(host='localhost', user='root', password='1234', port=3306, database='rrc')
    try:
        # Create the cursor
        cursor = db.cursor()
        for i, row in enumerate(rows, start=1):
            try:
                # Values are passed separately from the SQL text (parameterized query).
                cursor.execute(sql, row)
                db.commit()
                print("爬取成功")
            except Exception:
                # Best effort: report the failing row and continue with the rest.
                # ``Exception`` (not a bare except) lets Ctrl-C still stop the run.
                print("第" + str(i) + "条数据出现数据插入异常")
                db.rollback()
    finally:
        db.close()

def getcar_main(pages=50):
    """Crawl consecutive listing pages and store the cars found on each in MySQL.

    Starting at the module-level ``page`` number, ``pages`` listing pages are
    processed one after another. For each page the listing fields are scraped,
    then every detail page linked from it, then the rows are written with
    ``db_mysql`` and the shared result lists are emptied for the next page.

    Args:
        pages: Number of listing pages to crawl (default 50).
    """
    base_url = 'https://www.renrenche.com/bj/ershouche/p'
    for x in range(pages):
        print('开始第'+str(x+1)+'页内容爬取')
        # Advance the page number on every iteration; using the bare ``page``
        # value would request the same listing page each time.
        url = base_url + str(page + x)
        car_name = get_carname(url)
        car_date, car_km = get_producedate(url)
        car_price = get_price(url)
        car_pay = get_dp(url)
        car_pi = get_pi(url)
        car_url = get_carurl(url)
        car_picture_url = get_picture(url)
        print("正在抓取车牌所在地...")
        car_location = get_carLicense(car_url)
        print("正在抓取二手车排放标准...")
        car_es = get_es(car_url)
        print("正在抓取二手车过户记录...")
        car_tf = get_transfer(car_url)
        print("正在抓取车主评价...")
        car_usertx = get_omt(car_url)
        print("正在抓取车况信息...")
        car_condit_out, car_condit_in, car_condit_chassis = get_condit(car_url)
        print("正在抓取机构检测结果...")
        car_result = get_result(car_url)
        print("正在抓取车辆手续信息...")
        car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain, car_compulsory = get_procedures(car_url)
        print("正在存入数据库...")
        db_mysql(car_name,car_date,car_km,car_price,car_pay,car_pi,car_url,car_picture_url,car_location,car_es,car_tf,car_usertx,car_condit_out, car_condit_in, car_condit_chassis,car_result,car_procedures_YearlyInspection, car_Ciet, car_invoice, car_maintain, car_compulsory)
        print('开始清空列表列表...')
        # The getters return the shared module-level lists, so clearing them
        # here resets the state for the next page. ``car_condit`` is the raw
        # list behind the three condition slices and must be emptied as well,
        # otherwise its entries pile up from page to page.
        for bucket in (car_name, car_date, car_km, car_price, car_pay, car_pi,
                       car_url, car_picture_url, car_location, car_es, car_tf,
                       car_usertx, car_condit_out, car_condit_in,
                       car_condit_chassis, car_result,
                       car_procedures_YearlyInspection, car_Ciet, car_invoice,
                       car_maintain, car_compulsory, car_condit):
            bucket.clear()
        print('所有列表已清空')
def get_page(url):
    """Download ``url`` and return it parsed as a PyQuery document.

    Returns:
        The PyQuery document when the response status is 200; ``None`` when
        the status is anything else or when fetching/parsing raised.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        rs = requests.get(url=url, headers=headers)
        if rs.status_code == 200:
            return pq(rs.text)
    except Exception:
        # Catch ``Exception`` rather than everything, so that Ctrl-C and
        # interpreter shutdown are not reported as a URL problem.
        print("url出错了!")
    return None

def get_in_page(url):
    """Download a car's detail page and return it parsed as a PyQuery document.

    Fetching a detail page needs exactly the same request and parsing as a
    listing page, so this simply forwards to ``get_page`` instead of keeping
    a second copy of that code.

    Returns:
        Whatever ``get_page(url)`` returns: the PyQuery document on a 200
        response, otherwise ``None``.
    """
    return get_page(url)

#function1: collect the detail-page link of every listed car
def get_carurl(url):
    """Append the absolute detail-page link of each ``.thumbnail`` element to ``car_url``.

    Each element's ``href`` is prefixed with the site root. Returns the shared
    module-level ``car_url`` list.
    """
    listing = get_page(url)
    site_root = 'https://www.renrenche.com'
    car_url.extend(site_root + thumb.attr.href for thumb in listing(".thumbnail ").items())
    return car_url

#function2: collect the name of every listed car
def get_carname(url):
    """Append the ``data-title`` attribute of each matching button to ``car_name``.

    Returns the shared module-level ``car_name`` list.
    """
    listing = get_page(url)
    buttons = listing(".schedule.btn-base.btn-wireframe")
    car_name.extend(button.attr('data-title') for button in buttons.items())
    return car_name

#function3: collect production date and mileage of every listed car
def get_producedate(url):
    """Split each ``.mileage`` text on ``/`` into production date and mileage.

    The part before the first ``/`` goes to ``car_date`` and the part after it
    to ``car_km``. Returns the two shared lists as ``(car_date, car_km)``.
    """
    listing = get_page(url)
    for entry in listing(".mileage").items():
        # Date is stored before the mileage is looked up, one entry at a time.
        car_date.append(entry.text().split('/')[0])
        pieces = entry.text().split('/')
        car_km.append(pieces[1])
    return car_date, car_km

#function4: collect the price of every listed car
def get_price(url):
    """Append the text of each ``.price`` child of ``.tags-box`` to ``car_price``.

    ``.down-payment`` is passed to ``remove`` on the price selection before
    its items are read. Returns the shared module-level ``car_price`` list.
    """
    listing = get_page(url)
    price_nodes = listing(".tags-box").children('.price')
    without_down_payment = price_nodes.remove('.down-payment')
    car_price.extend(node.text() for node in without_down_payment.items())
    return car_price

#function5: collect the picture link of every listed car
def get_picture(url):
    """Append an ``https:``-prefixed image link for each thumbnail image to ``car_picture_url``.

    The image's ``data-src`` attribute is used when present; otherwise its
    ``src`` attribute is used. Returns the shared ``car_picture_url`` list.
    """
    listing = get_page(url)
    images = listing(".thumbnail").children('.img-backgound').children('img')
    for image in images.items():
        source_attr = 'src' if image.attr('data-src') is None else 'data-src'
        car_picture_url.append('https:' + image.attr(source_attr))
    return car_picture_url

#function6: collect the down payment of every listed car
def get_dp(url):
    """Append each listed car's down payment (or a placeholder) to ``car_pay``.

    The text of every ``.tags-box`` element is split into lines. When it has
    a third line, that line is stored as the down payment; otherwise the
    placeholder ``"不可首付"`` is stored. Returns the shared ``car_pay`` list.
    """
    doc = get_page(url)
    for pice in doc(".tags-box").items():
        # Split on the newline character; splitting on the letter "n" would
        # never separate the lines of the element text.
        lines = pice.text().split("\n")
        # Index 2 is read below, so at least three lines are required.
        if len(lines) > 2:
            car_pay.append(lines[2])
        else:
            car_pay.append("不可首付")
    return car_pay
#function7: collect the price-reduction info of every listed car
def get_pi(url):
    """Append each listed car's price-reduction info (or a placeholder) to ``car_pi``.

    The text of every ``.thumbnail`` element is split into lines. When one of
    the lines is exactly ``"已降"`` and a second line exists, ``"已降"`` plus
    the second line is stored; otherwise ``"近期未降价"`` is stored.
    Returns the shared ``car_pi`` list.
    """
    doc = get_page(url)
    for pi in doc(".thumbnail").items():
        # Split on the newline character, not on the letter "n".
        lines = pi.text().split("\n")
        # The length check protects the lines[1] lookup below.
        if "已降" in lines and len(lines) > 1:
            car_pi.append("已降" + lines[1])
        else:
            car_pi.append("近期未降价")
    return car_pi

#function8: collect the licence-plate location of each car
def get_carLicense(car_url):
    """Visit every detail page in ``car_url`` and append the ``#car-licensed`` text to ``car_location``.

    Returns the shared module-level ``car_location`` list.
    """
    for detail_link in car_url:
        detail = get_in_page(detail_link)
        car_location.extend(node.text() for node in detail("#car-licensed").items())
    return car_location

#function9: collect the emission standard of each car
def get_es(car_url):
    """Visit every detail page in ``car_url`` and append the emission-standard text to ``car_es``.

    Returns the shared module-level ``car_es`` list.
    """
    selector = ".span5.car-fluid-standard .detail-version3-right-icon .car-summary"
    for detail_link in car_url:
        detail = get_in_page(detail_link)
        matches = detail(selector)
        for node in matches.items():
            car_es.append(node.text())
    return car_es

#function10: collect the ownership-transfer record of each car
def get_transfer(car_url):
    """Visit every detail page in ``car_url`` and append the transfer-record text to ``car_tf``.

    Returns the shared module-level ``car_tf`` list.
    """
    selector = "#zhimaicar-detail-header-right .row-fluid-wrapper .car-transfer .car-summary"
    for detail_link in car_url:
        detail = get_in_page(detail_link)
        car_tf.extend(node.text() for node in detail(selector).items())
    return car_tf

#function11: collect the owner's comment for each car
def get_omt(car_url):
    """Visit every detail page in ``car_url`` and append the owner's comment text to ``car_usertx``.

    Returns the shared module-level ``car_usertx`` list.
    """
    for detail_link in car_url:
        detail = get_in_page(detail_link)
        comments = detail(".text-about-car-owner .owner-main-text")
        for comment in comments.items():
            car_usertx.append(comment.text())
    return car_usertx

#function12: collect the condition information of each car
def get_condit(car_url):
    """Collect the condition subtitles of every detail page, split into three lists.

    All matching subtitle texts are gathered into the shared ``car_condit``
    list in page order, and every third entry is then picked out, starting at
    offsets 0, 1 and 2.

    Returns:
        A tuple ``(exterior, interior, chassis)`` of new lists.
        NOTE(review): the split assumes each detail page yields exactly three
        subtitles in that order - confirm against the page markup.
    """
    # Start from an empty list on every call; entries left over from an
    # earlier call would otherwise come first in the slices below and pair
    # this call's cars with another page's condition data.
    car_condit.clear()
    for url in car_url:
        doc = get_in_page(url)
        for cd in doc("#gallery .detail-car-appearance-title .zhimai-subtitle").items():
            car_condit.append(cd.text())
    return car_condit[0::3], car_condit[1::3], car_condit[2::3]

#function13: collect the inspection agency result for each car
def get_result(car_url):
    """Visit every detail page in ``car_url`` and append the inspection-result text to ``car_result``.

    Returns the shared module-level ``car_result`` list.
    """
    selector = ".report-inner-box .report-main .report-result-des"
    for detail_link in car_url:
        detail = get_in_page(detail_link)
        car_result.extend(node.text() for node in detail(selector).items())
    return car_result

#function14: collect the paperwork information of each car
def get_procedures(car_url):
    """Collect five paperwork fields from every detail page in ``car_url``.

    The text of each ``.interval-title-content`` element is split into lines
    and the lines at positions 1, 3, 5, 7 and 9 are stored, in that order, as
    yearly-inspection expiry, commercial-insurance expiry, purchase invoice,
    4S-shop maintenance and compulsory-insurance expiry.
    NOTE(review): presumably the even positions hold the matching labels -
    confirm against the page markup.

    Returns:
        The five shared lists ``(car_procedures_YearlyInspection, car_Ciet,
        car_invoice, car_maintain, car_compulsory)``.

    Raises:
        IndexError: if an element's text has fewer than ten lines.
    """
    for url in car_url:
        doc = get_in_page(url)
        for pd in doc(".interval-title-content").items():
            # Split on the newline character, not on the letter "n".
            fields = pd.text().split("\n")
            # Read all five values before touching any list, so a short block
            # cannot leave the lists with different lengths.
            yearly, commercial, invoice, maintain, compulsory = [
                fields[i] for i in (1, 3, 5, 7, 9)
            ]
            car_procedures_YearlyInspection.append(yearly)
            car_Ciet.append(commercial)
            # Exactly one invoice entry per car keeps this list aligned with the others.
            car_invoice.append(invoice)
            car_maintain.append(maintain)
            car_compulsory.append(compulsory)
    return car_procedures_YearlyInspection,car_Ciet,car_invoice,car_maintain,car_compulsory
# Run the crawler, but only when this file is executed as a script,
# so importing the module does not start a crawl.
if __name__ == "__main__":
    getcar_main()
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/355561.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号