
Python Crawler Example 2



Python Crawler Example 2: scraping the Douban Movie Top 250 (https://movie.douban.com/top250).

Step 1: Preparation. Goal:

Scrape the movie title, Douban rating, number of ratings, summary quote, and movie link for every entry in the Douban Top 250.

Analysis:

Page 1 URL: https://movie.douban.com/top250, which lists movies ranked 1-25;
Page 2 URL: https://movie.douban.com/top250?start=25&filter=, which lists movies ranked 26-50;

To cover all 250 entries, ten separate requests are needed, with the start parameter set to 0, 25, ..., 225.
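
For example, the ten request URLs can be listed quickly (a small sketch, not part of the original script):

base = "https://movie.douban.com/top250?start="
urls = [base + str(i * 25) for i in range(10)]   # start = 0, 25, ..., 225
for u in urls:
    print(u)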

Step 2: Fetching the data. 1. Fetch the source of the first page
import urllib.request

url = "https://movie.douban.com/top250"

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
}

request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
html = response.read().decode('utf-8')
print(html)
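
The returned html is the full page source; a quick way to confirm the request succeeded (an illustrative addition, not in the original post):

print(response.status)   # 200 on success
print(len(html))         # length of the downloaded source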
A side note:

The request raised the error "unable to get local issuer certificate (_ssl.c:1129)"; adding the code below works around it:

import ssl

ssl._create_default_https_context = ssl._create_unverified_context
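
Disabling verification is only a quick workaround. To keep certificate checking enabled, one option is to pass an explicit SSL context built from the certifi CA bundle (a sketch assuming the third-party certifi package is installed; not part of the original post):

import ssl
import urllib.request

import certifi  # third-party CA bundle, installed with: pip install certifi

url = "https://movie.douban.com/top250"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
}

# Verify the server certificate against certifi's trusted roots instead of skipping verification.
context = ssl.create_default_context(cafile=certifi.where())
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request, context=context)
html = response.read().decode('utf-8')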
2. Extract the information

Each movie corresponds to one li node in the page source. Rendered, the first entry contains:

        
    1  肖申克的救赎
       导演: 弗兰克·德拉邦特 Frank Darabont   主演: 蒂姆·罗宾斯 Tim Robbins /...
       1994 / 美国 / 犯罪 剧情
       9.7  2466166人评价
       希望让人自由。

Use the BeautifulSoup and re libraries to match the information:

    from bs4 import BeautifulSoup
    import re
    
    findlink = re.compile(r'<a href="(.*?)">')  # movie link
    findImg = re.compile(r'<img.*src="(.*?)"', re.S)  # image link
    findTitle = re.compile(r'<span class="title">(.*?)</span>')  # movie title
    findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating
    findJudge = re.compile(r'<span>(\d*)人评价</span>')  # number of ratings
    findInq = re.compile(r'<span class="inq">(.*?)</span>')  # one-line quote
    findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # background info (director / year / genre)

    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', class_="item"):
        data = []
        item = str(item)
        link = re.findall(findlink, item)[0]
        data.append(link)
        image = re.findall(findImg, item)[0]
        data.append(image)
        title = re.findall(findTitle, item)[0]
        data.append(title)
        rating = re.findall(findRating, item)[0]
        data.append(rating)
        judge = re.findall(findJudge, item)[0]
        data.append(judge)
        inq = re.findall(findInq, item)[0]
        data.append(inq)
        bd = re.findall(findBd, item)[0]
        data.append(bd)
        print(data)

This yields the information for the 25 movies on the first page (ranks 1-25).
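
As a side note, the same fields can also be pulled with BeautifulSoup's own search methods instead of regular expressions. A rough sketch (the class names mirror the patterns above and may differ on the live page):

    def parse_first_page(html):
        soup = BeautifulSoup(html, 'html.parser')
        rows = []
        for item in soup.find_all('div', class_='item'):
            link_tag = item.find('a')
            img_tag = item.find('img')
            title_tag = item.find('span', class_='title')
            rating_tag = item.find('span', class_='rating_num')
            inq_tag = item.find('span', class_='inq')
            rows.append([
                link_tag['href'] if link_tag else '',
                img_tag['src'] if img_tag else '',
                title_tag.get_text(strip=True) if title_tag else '',
                rating_tag.get_text(strip=True) if rating_tag else '',
                inq_tag.get_text(strip=True) if inq_tag else '',
            ])
        return rows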

Scraping the remaining pages

Pass start = 0, 25, ..., 225 to the URL:

    def main():
        baseurl = "https://movie.douban.com/top250?start="
        datalist = getData(baseurl)
        print(datalist)
    

The page-fetching code is moved into an askURL function with error handling added:

    def askURL(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
        }
        html = ''
        try:
            request = urllib.request.Request(url=url, headers=headers)
            response = urllib.request.urlopen(request)
            html = response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html
    
    def getData(baseurl):
        datalist = []
    
        for i in range(0, 10):
            url = baseurl+str(i*25)
            html = askURL(url)
    
            soup = BeautifulSoup(html, 'html.parser')
            for item in soup.find_all('div', class_="item"):
                data = []
                item = str(item)
                link = re.findall(findlink, item)[0]
                data.append(link)
                image = re.findall(findImg, item)[0]
                data.append(image)
                title = re.findall(findTitle, item)[0]
                data.append(title)
                rating = re.findall(findRating, item)[0]
                data.append(rating)
                judge = re.findall(findJudge, item)[0]
                data.append(judge)
                inq = re.findall(findInq, item)
                if len(inq) != 0:
                    inq = inq[0].replace("。", "")
                    data.append(inq)
                else:
                    data.append(" ")
                bd = re.findall(findBd, item)[0]
                data.append(bd)
    
                datalist.append(data)
    
        return datalist
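
At this point datalist holds one row of 7 fields per movie, 250 rows in total; a quick sanity check (illustrative only):

    datalist = getData("https://movie.douban.com/top250?start=")
    print(len(datalist))    # expected: 250
    print(datalist[0][2])   # title of the top-ranked movie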
    
Step 3: Saving the data (writing an .xls workbook with the xlwt library)
    import xlwt

    def saveData(datalist, savepath):
        book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = book.add_sheet("豆瓣电影TOP250")
        col = ("电影链接", "图片链接", "电影名称", "电影评分", "评价人数", "评语", "背景")
        for i in range(0, 7):
            sheet.write(0, i, col[i])
        for i in range(0, 250):
            data = datalist[i]
            for j in range(0, 7):
                sheet.write(i+1, j, data[j])
        book.save(savepath)
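
If xlwt is not available, the same table can be written as a CSV file using only the standard library (an alternative sketch, not the original author's approach; savepath would then end in .csv):

    import csv

    def saveDataCsv(datalist, savepath):
        # utf-8-sig adds a BOM so Excel opens the Chinese headers correctly
        with open(savepath, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(["电影链接", "图片链接", "电影名称", "电影评分", "评价人数", "评语", "背景"])
            writer.writerows(datalist)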
    

Result: the scrape succeeded.

Complete code:
    import xlwt
    from bs4 import BeautifulSoup
    import re
    import ssl
    import urllib.request
    import urllib.error
    
    ssl._create_default_https_context = ssl._create_unverified_context
    
    findlink = re.compile(r'<a href="(.*?)">')  # movie link
    findImg = re.compile(r'<img.*src="(.*?)"', re.S)  # image link
    findTitle = re.compile(r'<span class="title">(.*?)</span>')  # movie title
    findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating
    findJudge = re.compile(r'<span>(\d*)人评价</span>')  # number of ratings
    findInq = re.compile(r'<span class="inq">(.*?)</span>')  # one-line quote
    findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # background info


    def askURL(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
        }
        html = ''
        try:
            request = urllib.request.Request(url=url, headers=headers)
            response = urllib.request.urlopen(request)
            html = response.read().decode('utf-8')
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        return html


    def getData(baseurl):
        datalist = []
        for i in range(0, 10):
            url = baseurl + str(i * 25)
            html = askURL(url)
            soup = BeautifulSoup(html, 'html.parser')
            for item in soup.find_all('div', class_="item"):
                data = []
                item = str(item)
                link = re.findall(findlink, item)[0]
                data.append(link)
                image = re.findall(findImg, item)[0]
                data.append(image)
                title = re.findall(findTitle, item)[0]
                data.append(title)
                rating = re.findall(findRating, item)[0]
                data.append(rating)
                judge = re.findall(findJudge, item)[0]
                data.append(judge)
                inq = re.findall(findInq, item)
                if len(inq) != 0:
                    inq = inq[0].replace("。", "")
                    data.append(inq)
                else:
                    data.append(" ")
                bd = re.findall(findBd, item)[0]
                data.append(bd)
                datalist.append(data)
        return datalist


    def saveData(datalist, savepath):
        book = xlwt.Workbook(encoding='utf-8', style_compression=0)
        sheet = book.add_sheet("豆瓣电影TOP250")
        col = ("电影链接", "图片链接", "电影名称", "电影评分", "评价人数", "评语", "背景")
        for i in range(0, 7):
            sheet.write(0, i, col[i])
        for i in range(0, 250):
            data = datalist[i]
            for j in range(0, 7):
                sheet.write(i + 1, j, data[j])
        book.save(savepath)


    def main():
        baseurl = "https://movie.douban.com/top250?start="
        datalist = getData(baseurl)
        savepath = "豆瓣电影TOP250.xls"
        saveData(datalist, savepath)


    if __name__ == '__main__':
        main()