Scraping JD.com product detail information with Python


1. Import modules

import random
import re
import time
import xlsxwriter
from selenium import webdriver
from lxml import etree
import requests
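
Apart from the standard-library modules (random, re, time), everything imported here is a third-party package, so the script assumes they are already installed; roughly:

# Third-party dependencies assumed by this script (install once beforehand):
#   pip install requests lxml selenium xlsxwriter
# selenium additionally needs a chromedriver that matches the locally installed Chrome.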

2. Build a User-Agent pool and a proxy (IP) pool

# User-Agent pool
ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x32) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.39 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.26 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X; zh-CN) AppleWebKit/537.51.1 (KHTML, like Gecko) Mobile/17D50 UCBrowser/12.8.2.1268 Mobile AliApp(TUnionSDK/0.1.20.3)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 8.1.0; OPPO R11t Build/OPM1.171019.011; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/76.0.3809.89 Mobile Safari/537.36 T7/11.19 SP-engine/2.15.0 baiduboxapp/11.19.5.10 (Baidu; P1 8.1.0)',
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 SP-engine/2.14.0 main%2F1.0 baiduboxapp/11.18.0.16 (Baidu; P2 13.3.1) NABar/0.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    ]
ua = random.choice(ua_list)
# Proxy (IP) pool
proxy_list = [
    {"http": "124.71.14.222:10002", "https": "124.71.14.222:10002"},
    {"http": "60.167.133.17:1133", "https": "60.167.133.17:1133"},
    {"http": "183.0.203.167:8118", "https": "183.0.203.167:8118"},
    {"http": "111.231.86.149:7890", "https": "111.231.86.149:7890"},
    {"http": "163.125.222.12:8118", "https": "163.125.222.12:8118"},
    {"http": "111.59.199.58:8118", "https": "111.59.199.58:8118"},
]
proxies = random.choice(proxy_list)
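
Note that ua and proxies are chosen once at import time, so every request in this script reuses the same User-Agent/proxy pair. A small helper like the sketch below (not part of the original code; the name is illustrative) would draw a fresh pair for each request instead:

# Illustrative helper: choose a fresh User-Agent and proxy for every request
def random_session_settings():
    headers = {"user-agent": random.choice(ua_list)}
    return headers, random.choice(proxy_list)

The requests.get() calls further down could then start with headers, proxies = random_session_settings().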

3. Get the product links and ids

word = input('请输入你要获取的商品:')
page = input('请输入商品页数:')
# Get the product links and sku ids
def get_link():
    links = []
    skus_id = []
    for i in range(int(page)):
        url = f'https://search.jd.com/Search?keyword={word}&wq={word}&page={i}'
        headers = {
            "user-agent": ua,
        }
        res = requests.get(url=url, headers=headers, proxies=proxies).text
        time.sleep(0.5)
        # print(res)
        # Extract the product links and prepend the https scheme
        html = etree.HTML(res)
        link = html.xpath('//*[@id="J_goodsList"]/ul/li[*]/div/div[3]/a/@href')
        link = ['https:' + k for k in link]
        for l in link:
            links.append(l)
        # Extract the product (sku) id from each link
        sku_id = [re.findall(r'\d+', i)[0] for i in link]
        for s in sku_id:
            skus_id.append(s)
        print(f'第{i+1}页。')
    print(links)
    goods(links, skus_id)
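
For reference, JD product links generally look like https://item.jd.com/<sku>.html, so the list comprehension above keeps the first run of digits in each link as the sku id. A quick sanity check on a made-up link:

# Sanity check of the sku-id extraction (the sample link is made up)
sample = 'https://item.jd.com/100012043978.html'
print(re.findall(r'\d+', sample)[0])  # -> 100012043978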

4. Get the product detail data

# Get the product details
def goods(links, skus_id):
    goo = []
    pict = 0
    for i in range(len(links)):
        headers = {
            "User-Agent": ua,
            'referer': 'https://search.jd.com/',
        }
        res = requests.get(url=links[i], headers=headers, proxies=proxies).text
        time.sleep(2)
        # print(res)
        html = etree.HTML(res)
        # Shop name
        title = html.xpath('//*[@id="crumb-wrap"]/div/div[2]/div[2]/div[1]/div/a/@title')
        print(title)
        # Brand
        brand = html.xpath('//*[@id="parameter-brand"]/li/@title')
        print(brand)
        # Product number (serial)
        serial = html.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[2]/text()')
        serial = [serial[0].split(':')[-1]]
        print(serial)
        # Official product name
        official = html.xpath('//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[1]/text()')
        official = [official[0].split(':')[-1].strip()]
        print(official)
        # Product name shown on the page
        name = html.xpath('/html/body/div[6]/div/div[2]/div[1]/text()')
        if len(name) == 1:
            name = [name[0].strip()]
        elif len(name) == 2:
            name = [name[1].strip()]
        print(name)

        # First main product image
        picture = ['https:' + html.xpath('//*[@id="spec-img"]/@data-origin')[0]]
        print(picture)
        res2 = requests.get(url=picture[0], headers=headers)
        with open(f'D:pythonprojectpython项目爬虫接单京东商品评价获取(接单考核)商品图片/{pict}.jpg', 'wb') as f:
            f.write(res2.content)
        pict += 1
        # JD price: fetch the price-info JSON for this sku
        p = requests.get('https://p.3.cn/prices/mgets?skuIds=J_' + skus_id[i], headers=headers, proxies=proxies).text
        print(p)
        price = re.findall('"p":"(.*?)","op"', p)
        print(price)
        # Coupons and promotions (rendered by JavaScript, so use a headless browser)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # headless (no UI) mode
        driver = webdriver.Chrome(options=options)
        driver.get(links[i])
        time.sleep(1)
        # Get the rendered page source
        data = driver.page_source
        time.sleep(0.5)
        driver.close()
        driver.quit()
        # Promotions
        html2 = etree.HTML(data)
        promotion1 = html2.xpath('//*[@id="prom"]/div/div[1]/em[2]/text()')
        promotion2 = html2.xpath('//*[@id="prom"]/div/div[2]/em[2]/text()')
        if promotion1 == [] and promotion2 == []:
            promotion = ['暂无促销信息']
        elif promotion1 == [] and promotion2 != []:
            promotion = promotion2
        elif promotion2 == [] and promotion1 != []:
            promotion = promotion1
        else:
            promotion = [promotion1[0], promotion2[0]]
        print(promotion)
        # Coupon info
        coupon = html2.xpath('//*[@id="summary-quan"]/div[2]/dl/dd/a/span/span/text()')
        if coupon == []:
            coupon = ['暂无可领的优惠券']
        print(coupon)
        # Cumulative review count
        comm_url = f'https://club.jd.com/comment/productCommentSummaries.action?referenceIds={skus_id[i]}'
        comment_headers = {
            'user-agent': ua,
        }
        res_js = requests.get(url=comm_url, headers=comment_headers, proxies=proxies).text
        comment = re.findall('"CommentCountStr":"(.*?)","CommentCount":', res_js)
        print(comment)
        for g in zip(title, brand, serial, official, name, price, promotion, coupon, comment, picture):
            goo.append(g)

        print(f'第{i+1}件商品打印完成。')
    print(goo)
    save(goo)
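
Starting and quitting a fresh headless Chrome for every product is the slowest part of goods(). One possible optimisation, shown here only as a sketch and not as the author's code, is to keep a single driver alive for the whole loop and parse each page the same way as above:

# Sketch: reuse one headless browser for every product page
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
try:
    for link in links:
        driver.get(link)
        time.sleep(1)
        html2 = etree.HTML(driver.page_source)
        # ... extract promotions and coupons exactly as above ...
finally:
    driver.quit()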

5. Save the data and embed the images with xlsxwriter

# Save the data
def save(goo):
    # Create the workbook
    workbook = xlsxwriter.Workbook('京东商品详情.xlsx')
    # Create a worksheet named after the search keyword
    worksheet = workbook.add_worksheet(word)
    # The main cell styles are as follows:
    cell_format = {
        # 'font_size': 10,  # font size
        'bold': True,  # bold text
        # 'bg_color': '#101010',  # cell background colour
        # 'fg_color': '#00FF00',
        # 'font_color': '#0000FF',  # font colour
        'align': 'center',  # horizontal centre alignment
        'valign': 'vcenter',  # vertical centre alignment
        # 'num_format': 'yyyy-mm-dd H:M:S',  # date format
        # the values below are line widths
        'border': 1,  # border width
        'top': 1,  # top border
        'left': 1,  # left border
        'right': 1,  # right border
        'bottom': 1  # bottom border
    }
    style = workbook.add_format(cell_format)
    # Insert the downloaded images
    a = 0
    worksheet.set_column(9, 9, 350)  # set the width of the image column
    for i in range(len(goo)):
        worksheet.set_row(i + 1, 350)  # set the row height to 350
        worksheet.insert_image(i + 1, 9, f'D:pythonprojectpython项目爬虫接单京东商品评价获取(接单考核)商品图片/{a}.jpg', {'url': goo[i][-1]})
        a += 1
    # Write the text data
    col = ('店铺名称', '品牌', '商品编号', '正式商品名称', '网页商品名称', '京东价', '促销', '优惠劵', '累计评价', '商品第一张主图片',)
    for i in range(len(col)):
        worksheet.write(0, i, col[i])
    for i in range(len(goo)):
        for c in range(len(col) - 1):  # the last column already holds the inserted image
            worksheet.write(i + 1, c, goo[i][c], style)
    workbook.close()
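
The image path used in goods() and save() is hard-coded to the author's machine. A more portable variant, purely an assumption and not part of the original code, is to build the image folder next to the script and create it on first use:

import os

# Build the image folder relative to the script and create it if missing
IMG_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '商品图片')
os.makedirs(IMG_DIR, exist_ok=True)
# goods() and save() would then reference os.path.join(IMG_DIR, f'{pict}.jpg')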

6. Run the program

if __name__ == '__main__':
    get_link()
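
A run then looks roughly like this (the keyword and page count are only examples):

# Example session (inputs are illustrative):
#   请输入你要获取的商品: 手机
#   请输入商品页数: 2
# One .jpg is saved per product and 京东商品详情.xlsx is written when the crawl finishes.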

There are still plenty of shortcomings here, so please go easy on me.
