栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

python爬虫7:完整的一个爬虫小demo

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

python爬虫7:完整的一个爬虫小demo

from bs4 import BeautifulSoup
import re
import urllib.request
import xlwt
import sqlite3
import ssl

# NOTE(review): globally disables HTTPS certificate verification so that
# urllib.request.urlopen() succeeds on hosts with invalid or self-signed
# certificates.  This is insecure (enables MITM) — tolerable only for a
# throwaway scraping demo like this one.
ssl._create_default_https_context = ssl._create_unverified_context
def main():
    """Entry point: scrape the target page and persist the results to SQLite."""
    # The real URL is withheld in the original post to avoid infringement.
    baseUrl = "xxxxxx"
    scraped = getData(baseUrl)
    # saveDataToExcel(scraped)  # Excel export exists but is disabled.
    saveDataToDB(scraped)

def getData(baseUrl):
    """Fetch *baseUrl* and scrape category titles plus per-category items.

    Returns a dict with two keys:
      "titleList":  {index: title} — running integer id -> category title
      "detailList": [[titleId, name, url], ...] — one row per item found

    NOTE(review): the regex literals below lost their HTML tags when this
    code was scraped off a web page; as written they match little or
    nothing ('[sS]' was almost certainly '[\\s\\S]' originally).  The
    original patterns must be recovered before relying on this function.
    """
    # Mobile-browser User-Agent plus a session cookie so the site serves
    # the logged-in version of the page.
    headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1",
                "cookie": "__51uvsct__JUTBIqXUbBvBL3hd=1; __51vcke__JUTBIqXUbBvBL3hd=f89fdbc5-1dbf-531a-8d48-608ad53241e3; __51vuft__JUTBIqXUbBvBL3hd=1639549481152; XSRF-TOKEN=eyJpdiI6ImFIK2VFbDg2NXQzbE9lTUZmNGFWMFE9PSIsInZhbHVlIjoiQ1NYU2dZVTV4NlpSM1FNOUVVdHFoZVB1N1daS2NoXC9hZUhOUThhcVZBbnpoYlBNTlwvZWtKUHNoNUdBdWxueTUwIiwibWFjIjoiMzMyMzE1MjRhMDU2MTIzMDBhM2UzZTQwMzJmMTk5YmM3NjgwMTJhMzM5MzZhZjcxM2I0YmRiZTYwNzBjNzM4ZCJ9; 91btlife_session=eyJpdiI6InEybEZiNUFUc3FwT1VMaGE1V3F4Qnc9PSIsInZhbHVlIjoiQ2M0UUY0ZFRDZGFQK3VEU3NyXC9VV1lsZ2dUZWQ2cHd2dVlMWVU4TktMYVZrN3RhTldPUXg4bmpub0ZSeHBmZkMwNHNweEd1Z09yMGQ1OTNld1dEYUdJY3FJOXZPTmpObUFcLzZxQjQ1MHl4Tlp6YW9SYXFBRzhGOGpLK1B6NnRBayIsIm1hYyI6ImVjYjM3OTFmNWI1NTMzMzEzM2Y0MzVmYzM1ZjNkNTNiMjM3ZmEyOTVlYmY5NjhiZDA5ODM5ZDkwNTkwM2RkMzgifQ%3D%3D; __vtins__JUTBIqXUbBvBL3hd=%7B%22sid%22%3A%20%222761075f-9fcf-5878-a827-2fecd1e51f8d%22%2C%20%22vd%22%3A%202%2C%20%22stt%22%3A%2083220%2C%20%22dr%22%3A%2083220%2C%20%22expires%22%3A%201639551364368%2C%20%22ct%22%3A%201639549564368%7D"
               }
    request = urllib.request.Request(url=baseUrl, headers=headers)
    rsp = urllib.request.urlopen(request)

    soup = BeautifulSoup(rsp.read().decode('utf-8'), "html.parser")
    find_all = soup.find_all(class_="main-content")
    res = {}
    titleList = {}
    detailList = []
    i = 0
    # Top-level categories (pattern garbled by the scrape — see docstring).
    titles = re.findall(r'>(.*?)', str(find_all[0]))
    for title in titles:
        titleList[i] = title
        # Grab the HTML chunk belonging to this category title
        # (pattern garbled by the scrape — see docstring).
        detail_htmls = re.findall('' + title + '[sS]+?\n\n', str(find_all[0]))
        if len(detail_htmls) > 0:
            dts = re.findall('(.*?)', detail_htmls[0])
            urls = re.findall(r'data-original-title="(.*?)"', detail_htmls[0])
            for j in range(0, len(dts)):
                # One row per item: [category id, item name, item url]
                detailList.append([i, dts[j], urls[j]])
        i += 1
    res.update({"titleList": titleList})
    res.update({"detailList": detailList})
    return res

def saveDataToExcel(data):
    """Write the top-level keys of *data* into column 0 of te.xls (needs xlwt)."""
    workbook = xlwt.Workbook(encoding="utf-8")
    sheet = workbook.add_sheet("sheet1")
    for i, title in enumerate(data):
        sheet.write(i, 0, title)
    workbook.save("te.xls")

def saveDataToDB(data):
    """Insert scraped titles and detail rows into test.db.

    Expects data["titleList"] as {id: title} and data["detailList"] as
    [[titleId, name, url], ...].  The `title` and `detail` tables must
    already exist.

    FIX: the original built SQL with %-string formatting, which breaks on
    any quote character in the scraped text and is SQL-injectable; this
    version uses parameterized queries, and closes the connection even on
    error.
    """
    connect = sqlite3.connect("test.db")
    try:
        c = connect.cursor()
        # dict.items() yields (id, title) pairs in insertion order.
        c.executemany("insert into title (id,title) values (?,?);",
                      data['titleList'].items())
        # Reorder each [titleId, name, url] row to the column order
        # (name, titleId, url) used by the original INSERT.
        c.executemany("insert into detail (name,titleId,url) values (?,?,?);",
                      ((val[1], val[0], val[2]) for val in data['detailList']))
        connect.commit()
        c.close()
    finally:
        connect.close()

if __name__ == "__main__":
    main()

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/664519.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号