栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

Python——各类品牌库爬虫Demo

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

Python——各类品牌库爬虫Demo

解决方案

源代码:https://gitee.com/shentuzhigang/mini-project/tree/master/brand-crawler

唯品会
import json

import openpyxl

import requests
# Crawl brand lists from the VIP.com (唯品会) WeChat mini-program API and dump
# every brand record into vip.xlsx: one row per brand, one column per distinct key.
allBrandList = []
# Fetch the top-level category tabs.
r = requests.get(
    'https://wxapi.appvipshop.com/vips-mobile/rest/shopping/category/index/get_tab/v1?api_key=ce29a51aa5c94a318755b2529dcb8e0b&_xcxid=1648525360552&app_name=shop_weixin_mina&client=wechat_mini_program&source_app=shop_weixin_mina&app_version=4.0&client_type=wap&format=json&mobile_platform=2&ver=2.0&standby_id=native&mobile_channel=nature&mars_cid=1648202533707_3e155b4df020055b73d22e80fb75afa6&warehouse=VIP_SH&fdc_area_id=103103101&province_id=103103&wap_consumer=B&t=1648525360&net=wifi&width=414&height=622&hierarchy_id=107&category_id=&category_filter=&sale_for=&client_from=wxsmall')
json1 = r.json()
data1 = json1['data']['data']['tabs']
for tab in data1:
    print(tab['categoryId'])
    # Fetch the section list for this category tab.
    r2 = requests.get(
        'https://wxapi.appvipshop.com/vips-mobile/rest/shopping/category/index/get_tab_data/v1?api_key=ce29a51aa5c94a318755b2529dcb8e0b&_xcxid=1648525360675&app_name=shop_weixin_mina&client=wechat_mini_program&source_app=shop_weixin_mina&app_version=4.0&client_type=wap&format=json&mobile_platform=2&ver=2.0&standby_id=native&mobile_channel=nature&mars_cid=1648202533707_3e155b4df020055b73d22e80fb75afa6&warehouse=VIP_SH&fdc_area_id=103103101&province_id=103103&wap_consumer=B&t=1648525360&net=WIFI&width=750&height=500&pcmpWidth=510&hierarchy_id=107&category_id=' +
        tab['categoryId'] + '&sale_for=')
    json2 = r2.json()
    data2 = json2['data']['data']
    sectionList = data2['sectionList']
    for section in sectionList:
        # Only the "精选品牌" (featured brands) category section carries brand entries.
        if section['sectionType'] == 'category' and section['category']['name'] == '精选品牌':
            for brand in section['category']['children']:
                # Flatten one level: merge every nested dict value into a copy
                # of the top-level record (the original rebuilt the dict with
                # dict(B, **...) on every key; update() does the same merge).
                record = dict(brand)
                for value in brand.values():
                    if isinstance(value, dict):
                        record.update(value)
                print(record)
                allBrandList.append(record)

f = openpyxl.Workbook()
sheet1 = f.create_sheet('vip')
# Assign each distinct key (first-seen order) a 1-based column index and
# write the header row at row 1.
keys = {}
for record in allBrandList:
    for key in record:
        if key not in keys:
            keys[key] = len(keys) + 1
            sheet1.cell(row=1, column=keys[key]).value = key

# Data rows start at row 2, below the header.
for row_index, record in enumerate(allBrandList, start=2):
    for key, value in record.items():
        sheet1.cell(row=row_index, column=keys[key]).value = str(value)
f.save('vip.xlsx')

得物
import json

import openpyxl

# Convert the brand list exported from the Dewu (得物) app (dewu.json) into
# dewu.xlsx: one row per brand, one column per distinct key.
with open("dewu.json", 'r') as load_f:
    load_dict = json.load(load_f)
series = load_dict['data']['list']
allBrandList = []
for item in series:
    # Flatten one level: every top-level value of an item is itself a dict of
    # fields (no isinstance guard, matching the original's assumption).
    record = {}
    for field in item:
        record = dict(record, **item[field])
    print(record)
    allBrandList.append(record)

f = openpyxl.Workbook()
sheet1 = f.create_sheet('dewu')
# Assign each distinct key (first-seen order) a 1-based column index and write
# the header row.  BUG FIX: the original guarded the header write with
# `if cT == 0`, but cT was incremented *before* the check, so that branch was
# dead — no header row was ever written and data started at row 1.
keys = {}
for record in allBrandList:
    for key in record:
        if key not in keys:
            keys[key] = len(keys) + 1
            sheet1.cell(row=1, column=keys[key]).value = key

# Data rows start at row 2, below the header.
for row_index, record in enumerate(allBrandList, start=2):
    for key, value in record.items():
        sheet1.cell(row=row_index, column=keys[key]).value = str(value)
f.save('dewu.xlsx')

蝉妈妈-抖音

平台限制只能取到前10000
Python2

# coding=utf-8
import json
import urllib2
# Legacy Python 2 version of the chanmama (蝉妈妈) brand crawler: page through
# the brand-search API (pages 1-99, 100 records each — the platform caps
# results at the first 10000) and dump the accumulated list to data.json.
f = open('data.json','w')
listAll = []
for i in range(1,100):
    url = "https://api-service.chanmama.com/v2/home/brand/search?page="+ str(i) +"&category=&keyword=&fans_age=&fans_gender=-1&fans_province=&sort=day_volume&order_by=desc&size=100&has_aweme_sale=0&has_live_sale=0&interaction_inc_range=&amount_range="
    print url
    request = urllib2.Request(url)
    # Mimic a Firefox browser; the cookie value is redacted and must be
    # filled in with a real session cookie before running.
    request.add_header("cookie", "***")
    request.add_header("user-agent", "Mozilla/5.0")
    response = urllib2.urlopen(request)
    code = response.getcode()
    content = response.read()
    s = json.loads(content)
    data = s['data']
    list = data['list']  # NOTE(review): shadows the builtin `list`
    listAll.extend(list)
f.write(json.dumps(listAll))

Python3

# coding=utf-8
import json
import requests

# Python 3 version of the chanmama (蝉妈妈) brand crawler: page through the
# brand-search API (pages 1-99, 100 records each — the platform caps results
# at the first 10000) and dump the accumulated list to data.json.
listAll = []
# `with` guarantees the handle is closed; the original leaked it.
with open('data.json', 'w') as f:
    for i in range(1, 100):
        url = "https://api-service.chanmama.com/v2/home/brand/search?page=" + str(
            i) + "&category=&keyword=&fans_age=&fans_gender=-1&fans_province=&sort=day_volume&order_by=desc&size=100&has_aweme_sale=0&has_live_sale=0&interaction_inc_range=&amount_range="
        print(url)
        response = requests.get(url, headers={
            "cookie": "***",  # placeholder: fill in a real session cookie
            # Mimic a Firefox browser
            "user-agent": "Mozilla/5.0"
        })
        code = response.status_code  # kept for parity with the original; unused
        content = response.json()

        data = content['data']
        page_list = data['list']  # renamed: the original shadowed builtin `list`
        listAll.extend(page_list)
    # BUG FIX: the original left this write commented out, so every fetched
    # record was discarded and data.json stayed empty.  The Python 2 version
    # of this script performs the write; restored for consistency.
    f.write(json.dumps(listAll))

爱库存

版本一

import requests
import json

# aikucun (爱库存) crawler, version 1: enumerate all activity tags of one shop,
# pull the brand list for each (tag, status) pair, and dump everything to
# aikucun.json.
cookie = '***'  # placeholder: fill in a real session cookie
headers = {
    "cookie": cookie,
    # Mimic a Firefox browser
    "user-agent": "Mozilla/5.0"
}
response = requests.get(
    'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/queryActivityTagV2/WWDLOwYqYqr?shopBizCode=WWDLOwYqYqr',
    headers=headers)
tags = response.json()
tagNos = []
allBrandList = []
for tag in tags['data']:
    tagNos.append(tag['activityTagNo'])
    # Query status 1 and 2 — exact status semantics are not documented here.
    for status in range(1, 3):
        print('tag:' + tag['activityTagNo'] + ',status:' + str(status))
        res = requests.get(
            'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/h5/querybrandList?shopBizCode=WWDLOwYqYqr',
            params={
                'tagNo': tag['activityTagNo'],
                'status': status
            },
            headers=headers)
        json1 = res.json()
        if 'data' in json1:
            data = json1['data']
            brandLists = data['brandList']
            for brandList in brandLists:
                blist = brandList['brandList']
                for b in blist:
                    allBrandList.append(b)
                    print(dict(b, **b['brandExtend']))
# BUG FIX: the original leaked the file handle, and its
# .encode('utf-8').decode('utf-8') round trip was a no-op — json.dumps still
# escaped every non-ASCII character (the garbled output this article's
# "version 2" complains about).  Write real UTF-8 with ensure_ascii=False.
with open('aikucun.json', 'w', encoding='utf-8') as out:
    out.write(json.dumps(allBrandList, ensure_ascii=False))

版本二
保存到xlsx
解决乱码问题

import re
import requests
import openpyxl

# aikucun (爱库存) crawler, version 2: same tag/brand enumeration as version 1,
# plus an attempt to reconstruct garbled brand names from the 'pcodelen'
# field, with the result saved to aikucun.xlsx instead of JSON.
cookie = ''  # must be filled in with a real session cookie before running
headers = {
    "cookie": cookie,
    # Mimic a Firefox browser
    "user-agent": "Mozilla/5.0"
}
response = requests.get(
    'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/queryActivityTagV2/WWDLOwYqYqr?shopBizCode=WWDLOwYqYqr',
    headers=headers)
tags = response.json()
tagNos = []
allBrandList = []
for tag in tags['data']:
    tagNos.append(tag['activityTagNo'])
    # Query status 1 and 2 — exact status semantics are not documented here.
    for status in range(1, 3):
        print('tag:' + tag['activityTagNo'] + ',status:' + str(status))
        res = requests.get(
            'https://h5-shop.aikucun.com/api/commodity/subscribe/v2/h5/querybrandList?shopBizCode=WWDLOwYqYqr',
            params={
                'tagNo': tag['activityTagNo'],
                'status': status
            },
            headers=headers)
        json1 = res.json()
        if 'data' in json1:
            data = json1['data']
            brandLists = data['brandList']
            for brandList in brandLists:
                blist = brandList['brandList']
                for b in blist:
                    print(dict(b, **b['brandExtend']))
                    if 'pcodelen' in b and b['pcodelen'] != '':
                        # NOTE(review): this appears to rebuild a "\uXXXX..."
                        # escape string from 4-hex-digit groups of pcodelen and
                        # eval it into real characters, but the backslashes of
                        # the \u escapes look lost in transcription (r'u"u{0}'
                        # was presumably r'u"\u{0}', and r'u'.join presumably
                        # r'\u'.join) — as written str0 is not a valid escape
                        # sequence.  Also: eval() on scraped remote data is a
                        # code-injection risk; confirm and replace before use.
                        str0 = r'u"u{0}'.format(r'u'.join(re.findall(r'.{4}', str(b['pcodelen'])))) + '"'
                        print(str0)
                        str1 = str(eval(str0))
                        # Splice the decoded prefix onto the tail of pinpaiming.
                        b['pinpaiming0'] = str1 + str(b['pinpaiming'])[len(str1):]
                        print(b['pinpaiming0'])

                    allBrandList.append(b)
                    print(sorted(dict(b, **b['brandExtend']).items(), key=lambda d: d[0]))
# f = open('aikucun.json', 'w', encoding='utf-8')
# f.write(json.dumps(allBrandList).encode('utf-8').decode('utf-8'))
# Assign each distinct key (first-seen order) a 1-based column index.
keys = dict()
i = 1
for jkey in range(len(allBrandList)):
    for key, value in allBrandList[jkey].items():
        if key in keys:
            continue
        keys[key] = i
        i += 1
f = openpyxl.Workbook()
sheet1 = f.create_sheet('aikucun')
for jkey in range(len(allBrandList)):
    jk = jkey + 1
    cT = 0
    for key, value in allBrandList[jkey].items():
        # NOTE(review): cT is incremented before the check below, so the
        # cT == 0 header branch is dead — no header row is ever written and
        # data starts at row 1 (same bug as the dewu script above).
        cT += 1
        if cT == 0:
            sheet1.cell(row=jk, column=keys[key]).value = key
        else:
            sheet1.cell(row=jk, column=keys[key]).value = str(value)
f.save('aikucun.xlsx')

好衣库
import requests
import json
headers = {
    # Mimic a Firefox browser
    "user-agent": "Mozilla/5.0"
}

# Probe every (categoryId, exhibitionParkType) combination on the webuy
# (好衣库) brand-detail endpoint and collect all returned brand entries into
# webuy.json.
allBrandList = []
for category_id in range(-300, 600):
    for park_type in [1, 5]:
        response = requests.post('https://www.webuy.ai/sesame/hyk/shopCategory/brand/detail',
                                 headers=headers,
                                 json={
                                     "exhibitionParkType": park_type,
                                     "categoryId": category_id,
                                     "shopId": 3572,
                                     "pageSize": 1000,
                                     "pageNo": 1,
                                     "isPageQuery": False
                                 })
        print(response.json())
        json1 = response.json()
        # NOTE(review): assumes every response carries an iterable 'entry';
        # a missing or null entry would raise here — confirm against the API.
        entry = json1['entry']
        for b in entry:
            print(b)
            allBrandList.append(b)
# BUG FIX: the original leaked the file handle, and its
# .encode('utf-8').decode('utf-8') round trip was a no-op — json.dumps still
# escaped every non-ASCII character.  Write real UTF-8 with ensure_ascii=False.
with open('webuy.json', 'w', encoding='utf-8') as out:
    out.write(json.dumps(allBrandList, ensure_ascii=False))
时尚品牌网
import requests
from bs4 import BeautifulSoup
import openpyxl
from openpyxl.drawing.image import Image
from PIL import Image as PILImage
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
import threading
import time

# Crawl the women's-wear brand directory on chinasspp.com (时尚品牌网) into
# chinasspp.xlsx: one worksheet row per brand, columns per the header names
# below.  Detail pages are fetched concurrently by a thread pool.
f = openpyxl.Workbook()
sheet1 = f.create_sheet('chinasspp')
headers = ['品牌名称', '行业类别', '公司名称', '联系电话', '公司传真', '官方网站', '联系地址', '在线客服']
for index, name in enumerate(headers):
    sheet1.cell(row=1, column=index + 1).value = name

# Row counter; incremented before each submit, so the first detail row is 2
# (row 1 holds the header).
count = 1

def parseDetail(no, link):
    """Fetch one brand detail page and write its fields into worksheet row *no*.

    Phone and fax numbers are rendered as images on the site, so those cells
    get the downloaded image anchored at the matching column.
    """
    response = requests.get(link)
    response.encoding = "gbk"  # detail pages are GBK-encoded
    soup = BeautifulSoup(response.text, 'lxml')
    print('no' + str(no))
    for item in soup.select_one("#brand_info_ctl00_blink").select('li'):
        # BUG FIX: split at most once — values such as the 官方网站 field
        # contain URLs with ':' in them, so the original split(':')[1]
        # truncated "http://example.com" down to "http".
        key, value = item.text.split(':', 1)
        sheet1.cell(row=no, column=headers.index(key) + 1).value = value
        if key in ('联系电话', '公司传真'):
            img_url = 'http://www.chinasspp.com' + item.select_one('img').attrs.get('src')
            img = PILImage.open(BytesIO(requests.get(img_url).content))
            sheet1.add_image(Image(img), chr(ord("A") + headers.index(key)) + str(no))


# NOTE(review): openpyxl worksheets are not documented as thread-safe, yet up
# to 16 workers write cells/images concurrently — consider having parseDetail
# return parsed rows and doing all sheet writes from the main thread.
with ThreadPoolExecutor(max_workers=16) as pool:
    for page in range(1, 516):
        print('Page ' + str(page))
        response = requests.get("http://www.chinasspp.com/brand/%E5%A5%B3%E8%A3%85%E5%93%81%E7%89%8C/" + str(page) + "/")
        soup = BeautifulSoup(response.text, 'lxml')
        for brand in soup.select(".brand"):
            link = brand.select_one('.logo').attrs.get('href')
            count += 1
            pool.submit(parseDetail, count, link)
    # Redundant inside `with`, but kept to make the completion barrier
    # explicit before saving the workbook.
    pool.shutdown(wait=True)
    f.save('chinasspp.xlsx')

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/989616.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号