爬取豆瓣电影TOP250的电影名称、豆瓣评分、评价数、电影概况、电影链接等
分析:第一页URL:https://movie.douban.com/top250,展示了排行1-25的电影;
第二页URL:https://movie.douban.com/top250?start=25&filter=,展示了排行26-50的电影;
…
获取TOP250,需要分开请求10次,参数start分别为:0,25…225
import urllib.request

# Fetch the first Top250 page.  A desktop-browser User-Agent is required:
# Douban rejects requests that carry Python's default UA.
url = "https://movie.douban.com/top250"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
}
request = urllib.request.Request(url=url, headers=headers)
# Use a context manager so the HTTP connection is closed deterministically
# instead of leaking until garbage collection.
with urllib.request.urlopen(request) as response:
    html = response.read().decode('utf-8')
print(html)
小插曲:
其中出现了“unable to get local issuer certificate (_ssl.c:1129)”错误,加入下方代码即可
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

2. 提取信息
每部电影对应一个li节点
导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...
1994 / 美国 / 犯罪 剧情
希望让人自由。
使用BeautifulSoup、re库匹配信息
from bs4 import BeautifulSoup
import re

# Compiled patterns for the fields inside one movie's <div class="item"> node.
# NOTE(review): the HTML fragments in these patterns were reconstructed from
# the standard Douban Top250 page structure — the originals were garbled in
# this document.  Confirm against the live markup.
findlink = re.compile(r'<a href="(.*?)">')                # 电影链接
findImg = re.compile(r'<img.*src="(.*?)"', re.S)          # 图片链接
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # 电影名称
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # 电影评分
# r'(\d*)' — the original text showed r'(d*)', which matches the literal
# letter "d", not digits.
findJudge = re.compile(r'<span>(\d*)人评价</span>')        # 评价人数
findInq = re.compile(r'<span class="inq">(.*?)</span>')   # 评语
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)       # 背景

# Parse the first page: each movie is one <div class="item"> node; collect
# [link, image, title, rating, judge, inq, bd] per movie.
soup = BeautifulSoup(html, 'html.parser')
for item in soup.find_all('div', class_="item"):
    data = []
    item = str(item)
    link = re.findall(findlink, item)[0]
    data.append(link)
    image = re.findall(findImg, item)[0]
    data.append(image)
    title = re.findall(findTitle, item)[0]
    data.append(title)
    rating = re.findall(findRating, item)[0]
    data.append(rating)
    judge = re.findall(findJudge, item)[0]
    data.append(judge)
    inq = re.findall(findInq, item)[0]
    data.append(inq)
    bd = re.findall(findBd, item)[0]
    data.append(bd)
    print(data)
得到了第一页排名1-25的电影信息
给URL传入参数start=0,25…225
def main():
    """Crawl every Top250 page via getData and print the collected records."""
    base = "https://movie.douban.com/top250?start="
    records = getData(base)
    print(records)
在askURL函数添加了error
def askURL(url):
    """Fetch *url* with a desktop-browser User-Agent and return the body
    decoded as UTF-8.

    On urllib.error.URLError the HTTP status code and/or reason are printed
    and an empty string is returned instead of raising.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
    }
    html = ''
    try:
        req = urllib.request.Request(url=url, headers=headers)
        resp = urllib.request.urlopen(req)
        html = resp.read().decode('utf-8')
    except urllib.error.URLError as err:
        # HTTPError carries a status .code; a plain URLError only a .reason.
        for attr in ("code", "reason"):
            if hasattr(err, attr):
                print(getattr(err, attr))
    return html
def getData(baseurl):
    """Crawl all 10 Top250 pages (start = 0, 25, ..., 225) and return a list
    of per-movie records: [link, image, title, rating, judge, inq, bd]."""
    datalist = []
    for page in range(10):
        page_html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(page_html, 'html.parser')
        for node in soup.find_all('div', class_="item"):
            text = str(node)
            record = [
                re.findall(findlink, text)[0],
                re.findall(findImg, text)[0],
                re.findall(findTitle, text)[0],
                re.findall(findRating, text)[0],
                re.findall(findJudge, text)[0],
            ]
            # Not every movie has a one-line quote; keep columns aligned by
            # writing a blank placeholder (trailing 。 is stripped when present).
            quotes = re.findall(findInq, text)
            record.append(quotes[0].replace("。", "") if quotes else " ")
            record.append(re.findall(findBd, text)[0])
            datalist.append(record)
    return datalist
3. 保存数据
def saveData(datalist, savepath):
    """Write *datalist* (one 7-field record per movie) to an .xls workbook
    at *savepath*, with a header row of column titles."""
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("豆瓣电影TOP250")
    col = ("电影链接", "图片链接", "电影名称", "电影评分", "评价人数", "评语", "背景")
    for j, title in enumerate(col):
        sheet.write(0, j, title)
    # Iterate the actual data instead of assuming exactly 250 rows, so a
    # partially successful crawl still saves without an IndexError.
    for i, data in enumerate(datalist):
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
结果:
爬取成功!!!
import re
import ssl
import urllib.error
import urllib.request

import xlwt
from bs4 import BeautifulSoup

# Work around "unable to get local issuer certificate (_ssl.c:1129)" by
# disabling certificate verification for HTTPS requests.
ssl._create_default_https_context = ssl._create_unverified_context

# Compiled patterns for the fields inside one movie's <div class="item"> node.
# NOTE(review): the HTML fragments in these patterns were reconstructed from
# the standard Douban Top250 page structure — the originals were garbled in
# this document.  Confirm against the live markup.
findlink = re.compile(r'<a href="(.*?)">')                # 电影链接
findImg = re.compile(r'<img.*src="(.*?)"', re.S)          # 图片链接
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # 电影名称
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # 电影评分
# r'(\d*)' — the garbled original showed r'(d*)', which matches the literal
# letter "d" rather than digits.
findJudge = re.compile(r'<span>(\d*)人评价</span>')        # 评价人数
findInq = re.compile(r'<span class="inq">(.*?)</span>')   # 评语
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)       # 背景


def askURL(url):
    """Fetch *url* with a desktop-browser User-Agent and return the body
    decoded as UTF-8; on URLError print code/reason and return ''."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
    }
    html = ''
    try:
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # HTTPError carries a status .code; a plain URLError only a .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def getData(baseurl):
    """Crawl all 10 Top250 pages (start = 0, 25, ..., 225) and return a list
    of per-movie records: [link, image, title, rating, judge, inq, bd]."""
    datalist = []
    for i in range(10):
        html = askURL(baseurl + str(i * 25))
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            data.append(re.findall(findlink, item)[0])
            data.append(re.findall(findImg, item)[0])
            data.append(re.findall(findTitle, item)[0])
            data.append(re.findall(findRating, item)[0])
            data.append(re.findall(findJudge, item)[0])
            # Not every movie has a one-line quote; keep columns aligned by
            # writing a blank placeholder.
            inq = re.findall(findInq, item)
            data.append(inq[0].replace("。", "") if inq else " ")
            data.append(re.findall(findBd, item)[0])
            datalist.append(data)
    return datalist


def saveData(datalist, savepath):
    """Write *datalist* (one 7-field record per movie) to an .xls workbook
    at *savepath*, with a header row of column titles."""
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("豆瓣电影TOP250")
    col = ("电影链接", "图片链接", "电影名称", "电影评分", "评价人数", "评语", "背景")
    for j, title in enumerate(col):
        sheet.write(0, j, title)
    # Iterate the actual data instead of assuming exactly 250 rows, so a
    # partially successful crawl still saves without an IndexError.
    for i, data in enumerate(datalist):
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)


def main():
    """Crawl every Top250 page and save the records to an .xls file."""
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)
    savepath = "豆瓣电影TOP250.xls"
    saveData(datalist, savepath)


if __name__ == '__main__':
    main()




