# -*- coding: utf-8 -*-
# @Author : llvyr
# @qq : 994814645
"""Scrape the Douban Movie Top 250 list and save it as an Excel (.xls) file.

Flow: getData() fetches the 10 list pages (25 movies each) via askUrl(),
parses each movie card with BeautifulSoup + regexes, and savaData() writes
the collected rows with xlwt.
"""

import re
import urllib.error
import urllib.parse
import urllib.request

import xlwt
from bs4 import BeautifulSoup

# Base URL; pagination is driven by the "start" query parameter (0, 25, ..., 225).
baseturl = "https://movie.douban.com/top250?start="
savaPath = "豆瓣电影Top250.xls"

# Detail-extraction patterns, applied to str(item) of each movie card <div>.
# NOTE(review): the original patterns were garbled (HTML tags stripped);
# reconstructed from the markup of movie.douban.com/top250 — confirm against
# a live page before relying on them.
findlink = re.compile(r'<a href="(.*?)">')                    # detail-page link
findImg = re.compile(r'<img.*?src="(.*?)"', re.S)             # poster image URL
findTitle = re.compile(r'<span class="title">(.*?)</span>')   # title(s): Chinese, then foreign
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
findCommentNo = re.compile(r'<span>(\d*)人评价</span>')        # number of ratings (was r'(d*)…': missing backslash)
findDescriptors = re.compile(r'<span class="inq">(.*?)</span>')  # one-line blurb (may be absent)
findContent = re.compile(r'<p class="">(.*?)</p>', re.S)      # credits / year / genre block


def askUrl(url):
    """Fetch one page and return its body decoded as UTF-8 ('' on error).

    Network failures are reported to stdout (HTTP code and/or reason) rather
    than raised, so a single bad page does not abort the whole crawl.
    """
    headers = {
        # A desktop UA plus a cookie are required or Douban blocks the request.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
        "cookie": '换成自己的cookie值',
    }
    request = urllib.request.Request(url=url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def getData(base_url):
    """Crawl all 10 list pages and return a list of per-movie field lists.

    Each row: [chinese title, foreign title, link, poster URL, rating,
    rating count, blurb, credits/summary] — keep in sync with savaData's
    column headers.
    """
    datalist = []
    for page in range(10):
        url = base_url + str(page * 25)
        html = askUrl(url)
        # Parse the page; each movie card is a <div class="item">.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)

            titles = re.findall(findTitle, item)
            if len(titles) == 2:
                data.append(titles[0])                    # Chinese title
                data.append(titles[1].replace('/', " "))  # foreign title, drop the "/" separator
            else:
                data.append(titles[0])
                data.append(' ')                          # no foreign title: keep column count stable

            data.append(re.findall(findlink, item)[0])    # detail link
            data.append(re.findall(findImg, item)[0])     # poster URL
            data.append(re.findall(findRating, item)[0])  # rating
            data.append(re.findall(findCommentNo, item)[0])  # rating count

            descriptors = re.findall(findDescriptors, item)
            if descriptors:
                data.append(descriptors[0].replace("。", ""))  # blurb, trailing full stop removed
            else:
                data.append(" ")                          # some movies have no blurb

            content = re.findall(findContent, item)[0]
            content = re.sub(r'<br(\s+)?/>(\s+)?', " ", content)  # drop <br/> line breaks
            content = re.sub("/", "", content)
            data.append(content.strip())

            datalist.append(data)
    return datalist


def savaData(datalist, save_path):
    """Write the scraped rows to an .xls workbook at *save_path*."""
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    worksheet = workbook.add_sheet("豆瓣电影Top250", cell_overwrite_ok=True)
    # Headers realigned to the append order used in getData().
    col = ("中文名称", "其他名称", "链接", "封面图片", "评分", "评价人数", "概述", "相关内容")
    for i, header in enumerate(col):
        worksheet.write(0, i, header)
    # Iterate what was actually scraped instead of a hard-coded 250,
    # so a partial crawl no longer raises IndexError.
    for row, data in enumerate(datalist, start=1):
        print(f"第{row}条")
        for k in range(len(col)):
            worksheet.write(row, k, data[k])
    workbook.save(save_path)


if __name__ == '__main__':
    # Crawl once (the original called getData twice, discarding the first result).
    datalists = getData(baseturl)
    savaData(datalists, savaPath)



