# IMDb movie scraping code
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import random
def _tag_text(tag, default=np.nan):
    """Return ``tag.string``, or *default* when the tag was not found."""
    return default if tag is None else tag.string


def bs_dealing(bs):
    """Extract one record per movie from a parsed IMDb search-result page.

    Args:
        bs: Parsed page. Only ``find_all``/``find`` and the ``.string`` of
            the returned tags are used (a ``BeautifulSoup`` object in this
            file).

    Returns:
        A list with one ``[title, year, genres, runtime, rating, votes,
        gross]`` list per ``div.lister-item-content`` element. A field
        whose tag is missing is ``np.nan`` (``0`` for runtime), so a single
        incomplete entry no longer aborts the whole crawl.

    Reads the module-level ``page`` counter only for the progress message.
    """
    movies_lst = []
    for movie in bs.find_all('div', {'class': 'lister-item-content'}):
        title = _tag_text(movie.find('a'))
        year = _tag_text(
            movie.find('span', {'class': 'lister-item-year text-muted unbold'})
        )

        genres = _tag_text(movie.find('span', 'genre'))
        if isinstance(genres, str):
            # Plain strip() removes the surrounding newline and padding.
            # The previous strip('n') removed literal "n" characters, which
            # would turn e.g. "Action" into "Actio".
            genres = genres.strip()

        # A missing runtime is stored as 0 so the row can still be written.
        runtime = _tag_text(movie.find('span', 'runtime'), default=0)

        rating_box = movie.find(
            'div', {'class': 'inline-block ratings-imdb-rating'}
        )
        rating = (
            _tag_text(rating_box.find('strong'))
            if rating_box is not None
            else np.nan
        )

        # The "nv" spans hold the vote count and, only when exactly two are
        # present, the gross as the second one. Look them up once.
        nv_spans = movie.find_all('span', {'name': 'nv'})
        votes = _tag_text(nv_spans[0]) if nv_spans else np.nan
        gross = _tag_text(nv_spans[1]) if len(nv_spans) == 2 else np.nan

        # Collect every field of this movie as one row.
        movies_lst.append(
            [title, year, genres, runtime, rating, votes, gross]
        )

    print("第%s页爬取数据结束" % page)
    return movies_lst
def csv_write(movies_lst, header):
    """Append one page of scraped rows to the results CSV.

    Writing after every page keeps the data already fetched if the crawl
    is interrupted.

    Args:
        movies_lst: Rows to write, one inner list per movie.
        header: List of column names to emit as a header row, or ``False``
            to write the rows without a header.

    Reads the module-level ``page`` counter only for the progress message.
    """
    csv_name = '电影评分降序汇总.csv'
    # The class is spelled ``DataFrame``; ``pd.Dataframe`` raises
    # AttributeError before anything is written.
    frame = pd.DataFrame(movies_lst)
    frame.to_csv(
        csv_name,
        mode='a',  # append, so earlier pages are kept
        header=header,
        index=False,
        encoding='utf-8',
    )
    print("---------第%s页数据已存储" % page)
columns = ['Title', 'Year', 'Genres', 'Runtime', 'Rating', 'Votes', 'Gross']
if __name__ == '__main__':
    # The request settings are identical for every page, so build them once.
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
    }
    url = 'http://www.imdb.com/search/title'
    # 200 pages x 50 movies = 10,000 movies. range(1, 200) stopped after
    # page 199 and therefore fetched only 9,950.
    # ``page`` stays a module-level name because bs_dealing and csv_write
    # read it for their progress messages.
    for page in range(1, 201):
        num = (page - 1) * 50 + 1  # each result page lists 50 movies
        params = {
            "title_type": "feature",
            "start": num,
            "year": "1900,2020",
            "sort": "num_votes,desc"
        }
        # The timeout keeps one stalled connection from hanging the crawl;
        # raise_for_status stops on an HTTP error instead of parsing an
        # error page into an empty result.
        reply = requests.get(url=url, params=params, headers=headers,
                             timeout=30)
        reply.raise_for_status()
        response = reply.text
        bs = BeautifulSoup(response, 'html.parser')
        movies_lst = bs_dealing(bs)
        # Only the very first page writes the header row.
        header = columns if num == 1 else False
        csv_write(movies_lst, header)
        # Pause between pages to stay below the site's anti-scraping limits.
        time.sleep(random.randint(0, 3))
# A relative path is resolved against the working directory by read_csv
# itself, so the call to ``os`` (never imported in this file) is not needed.
moives_df = pd.read_csv('电影评分降序汇总.csv', header=0)
moives_df.head(10)  # preview the ten most-voted movies
# IMDb data-cleaning code
# Load the scraped IMDb movie data into the ``imdb`` DataFrame.
import pandas as pd
# Display option: account for East Asian wide characters when aligning output.
pd.set_option('display.unicode.east_asian_width',True)
# NOTE(review): read_table splits on tabs by default although the file is
# named .csv — confirm the file really is tab-delimited.
imdb=pd.read_table('电影降序汇总.csv',index_col=None)
# Preview the first 10 rows (only shown in an interactive session).
imdb.head(10)
# Runtime: keep only the first whitespace-separated token (dropping the
# "min" unit) so the column can be analysed as integers.
runtime_tokens = imdb['runtime'].str.split(expand=True)
# Rows without a runtime produce NaN tokens; treat them as 0 minutes.
clean_runtime = runtime_tokens.fillna(0)[0]
imdb['runtime'] = clean_runtime.map(int)
imdb.rename(columns={'runtime': 'runtime(minute)'}, inplace=True)
# Rating: list the rows whose rating is missing and confirm the column type.
clean_ra = imdb.loc[imdb.rating.isnull(), :]
# ``info`` must be called; the bare attribute reported nothing. The original
# author noted that this showed no missing ratings in their data.
clean_ra.info()
# Print the dtype so the check is visible when run as a script, matching
# how the other columns in this script are inspected.
print(imdb.rating.dtype)
# Metascore: report the column dtype, then replace missing scores with 0.
print(imdb['metascore'].dtype)
clean_metascore = imdb['metascore'].fillna(0)
imdb['metascore'] = clean_metascore
# Votes: remove the thousands separators and convert the column to int.
raw_votes = imdb['votes']
print(raw_votes.dtype)  # dtype before conversion (strings, per the author)
clean_votes = raw_votes.str.replace(',', '').map(int)
print(clean_votes.dtype)  # dtype after conversion
imdb['votes'] = clean_votes
# Gross: drop the "$" prefix and "M" suffix, then convert to float.
clean_gross = imdb.loc[:, 'gross']
# regex=False states the literal match explicitly: "$" is a regex
# metacharacter and the default of ``regex`` differs between pandas
# versions, so relying on the default is ambiguous.
clean_gross = clean_gross.str.replace('M', '', regex=False)
clean_gross = clean_gross.str.replace('$', '', regex=False)
clean_gross = clean_gross.fillna(0)  # movies without a gross figure count as 0
imdb.gross = clean_gross.map(float)
imdb.rename(columns={'gross': 'gross(million)'}, inplace=True)
# Year: move any extra marker into the title and keep the year as an int.
# Debug aid: print(imdb[imdb.year.str.len() != 6].year) lists the year
# strings that are not exactly six characters long.
clean_year = imdb.loc[:, 'year']
# A six-character value carries nothing extra; otherwise the first four
# characters are appended to the title (presumably a marker such as "(I) "
# that distinguishes same-named movies — verify against the data).
version = clean_year.apply(lambda x: '' if len(x) == 6 else x[0:4])
imdb.title = imdb.title.str.cat(version, sep='')
# The pattern must be the digit class \d+. The previous '(d+)' matched the
# letter "d", produced NaN for every row and made int() raise.
clean_year = clean_year.str.extract(r'(\d+)').iloc[:, 0]
imdb.year = clean_year.map(int)
# Genres: normalise the comma-separated genre string and expand it into
# one boolean column per genre.
imdb['genres'] = imdb['genres'].str.replace(' ', '').str.lower()
# Every distinct genre found in the data, in alphabetical order.
genres_set = sorted(
    {name for entry in imdb['genres'] for name in entry.split(',')}
)
print(len(genres_set))
# Split each movie's genre string once and reuse it for every genre column.
genre_lists = [entry.split(',') for entry in imdb['genres']]
for genre in genres_set:
    # Column "genre_<name>" is True where the movie belongs to that genre.
    imdb['genre_' + genre] = [genre in parts for parts in genre_lists]
imdb.drop(columns='genres', inplace=True)
imdb
# Simple analysis: find the ten most frequent movie genres
import numpy as np
# Turn the genre names into the names of their boolean columns.
genres_set = ['genre_' + i for i in genres_set]
genre_count = imdb[genres_set]
# Summing a boolean column counts its True cells, i.e. movies per genre.
# The class is spelled ``DataFrame``; ``pd.Dataframe`` raises AttributeError.
genre_count = pd.DataFrame(genre_count.sum())
genre_count.columns = ['出现次数']
genre_count = genre_count.sort_values(by='出现次数', ascending=False)
# Show only the ten most frequent genres, as the section heading asks;
# ``genre_count`` itself still holds every genre.
genre_count.head(10)