(由于存在一定的反爬机制,以下代码最多只能爬取 210 条评论)
# 导入工具包
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np
# Request headers: present a desktop Chrome UA so Douban serves the normal page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
# =============================================================================
# Scrape a single page of comments
# =============================================================================
# Target URL (start=20 → the second page, 20 comments per page)
url = 'https://movie.douban.com/subject/25845392/comments?start=20&limit=20&status=P&sort=new_score'
# Fetch the page
html = requests.get(url, headers=headers)
# Parse the response HTML
data = html.text
soup = BeautifulSoup(data, 'html.parser')
# Extract the fields with CSS selectors:
# user names
names = soup.select('#comments > div > div.comment > h3 > span.comment-info > a')
# rating container (its second <span> carries the star-rating class and title)
pingjis = soup.select('#comments > div > div.comment > h3 > span.comment-info')
# comment dates
riqis = soup.select('#comments > div > div.comment > h3 > span.comment-info > span.comment-time')
# comment text
neirongs = soup.select('#comments > div > div.comment > p > span')
# Collect one row per comment
lis = []
for name, pingji, riqi, neirong in zip(names, pingjis, riqis, neirongs):
    pingji_re = pingji.find_all('span')
    lis.append([name.get_text(),
                pingji_re[1]['class'],
                pingji_re[1]['title'],
                riqi.get_text().strip(),
                neirong.get_text()])
# BUG FIX: pd.Dataframe does not exist -- the class is pd.DataFrame.
result1 = pd.DataFrame(lis, columns=['用户', '评级', '等级', '日期', '内容'])
# print(result1)
# =============================================================================
# Scrape multiple pages with a loop
# =============================================================================
# Build the page URLs: start = 0, 20, ..., 480 (25 pages of 20 comments)
url = ['https://movie.douban.com/subject/25845392/comments?start={}&limit=20&status=P&sort=new_score'.format(i) for i in
       range(0, 500, 20)]
lis2 = []
for urli in url:
    # Fetch one page
    html = requests.get(urli, headers=headers)
    # Parse it
    data = html.text
    soup = BeautifulSoup(data, 'html.parser')
    # user names
    names = soup.select('#comments > div > div.comment > h3 > span.comment-info > a')
    # rating container
    pingjis = soup.select('#comments > div > div.comment > h3 > span.comment-info')
    # comment dates
    riqis = soup.select('#comments > div > div.comment > h3 > span.comment-info > span.comment-time')
    # comment text
    neirongs = soup.select('#comments > div > div.comment > p > span')
    for name, pingji, riqi, neirong in zip(names, pingjis, riqis, neirongs):
        pingji_re = pingji.find_all('span')
        lis2.append([name.get_text(),
                     pingji_re[1]['class'],
                     pingji_re[1]['title'],
                     riqi.get_text().strip(),
                     neirong.get_text()])
    print('完成:', urli)
    # Random 5-9 second pause between pages to reduce the chance of being blocked
    time.sleep(np.random.randint(5, 10))
# BUG FIX: pd.Dataframe -> pd.DataFrame (correct capitalization).
result2 = pd.DataFrame(lis2, columns=['用户', '评级', '等级', '日期', '内容'])
# Write to CSV. result2 is already a DataFrame, so the original extra
# pd.Dataframe(result2) wrapper (also misspelled) is unnecessary.
result2.to_csv('movie.csv')
二、文本分析
# -*- coding: utf-8 -*-
import pandas
import jieba
import re
# Load the comments exported in part 1.
# BUG FIX: use a raw string -- "C:\Users" contains \U, which Python 3 treats
# as the start of a unicode escape and rejects with a SyntaxError.
data = pandas.read_excel(
    r"C:\Users\Lenovo\documents\comments.xlsx"
)
# 1. Clean the text: strip punctuation and special symbols with a regex.
# BUG FIX: the original literal contained an unescaped double quote inside a
# double-quoted string (unterminated literal); triple quotes admit both quote
# characters. BUG FIX: the trailing bare "s" would delete every letter s from
# the comments -- \s (whitespace) is what was intended.
pattern = r"""[!"#$%&'()*+,-./:;<=>?@[\]^_^{|}~—!,。?、¥…():【】《》‘’“”\s]+"""
re_obj = re.compile(pattern)
def clear(text):
    # Use the precompiled regex (re_obj was compiled but never used before).
    return re_obj.sub("", text)
data['comment'] = data['comment'].apply(clear)
print(data.head())
def cut_word(text):
    # jieba.cut returns a generator of tokens
    return jieba.cut(text)
# 2. Tokenize with jieba
data['comment'] = data['comment'].apply(cut_word)
# 3. Remove stopwords (Chinese stopword list attached at the end of the post)
def get_stopword():
    # A set gives O(1) membership tests during filtering
    s = set()
    # BUG FIX: raw string for the Windows path (\U escape, as above)
    with open(r'C:\Users\Lenovo\Desktop\cn_stopwords.txt', encoding='UTF-8') as f:
        for line in f:
            s.add(line.strip())
    return s
def remove_stopword(words):
    return [word for word in words if word not in stopword]
stopword = get_stopword()
data['comment'] = data['comment'].apply(remove_stopword)
# 4. Word-frequency statistics
from itertools import chain
from collections import Counter
li_2d = data['comment'].tolist()
# Flatten the list of token lists into a single list
li_1d = list(chain.from_iterable(li_2d))
print(f'总词汇量:{len(li_1d)}')
c = Counter(li_1d)
print(f'不重复词汇量:{len(c)}')
common = c.most_common(50)
# print(common)
import pandas as pd
# BUG FIX: pd.Dataframe -> pd.DataFrame
frame = pd.DataFrame(common)
file = frame.to_csv('common11.csv')
# Number of tokens in each comment
num = [len(li) for li in li_2d]
import matplotlib.pyplot as plt
# Histogram of per-comment vocabulary size (log-scaled y axis)
# n, bins, patches = plt.hist(num, bins=20, alpha=0.5)
# plt.yscale('log')
# plt.show()
# Generate the word cloud
from wordcloud import WordCloud
# Image handling
import PIL.Image as image
# Numeric arrays
import numpy as np
import matplotlib.colors as colors  # color utilities
# Custom orange/gold palette (color reference attached at the end of the post)
colormaps = colors.ListedColormap(['#FF4500', '#FF7F50', '#FFD700'])
# The mask can be any shape drawn in PPT (here: "长津湖" art lettering).
# BUG FIX: raw string for the path -- in a normal string '\U' is a SyntaxError
# and '\a' is the bell character, so the original path could never be opened.
mask1 = np.array(image.open(r'C:\Users\Lenovo\Desktop\aa.png'))
wc = WordCloud(font_path="simsun.ttc", background_color="white",
               mask=mask1, colormap=colormaps)
# c is the Counter of token frequencies built in the previous section
img = wc.generate_from_frequencies(c)
plt.figure(figsize=(15, 10))
plt.imshow(img)
plt.axis('off')
plt.show()
下图为我的代码词云图:
(毕竟计算机不一定能满足我们的需求)
# Load a user dictionary so jieba keeps domain-specific terms intact
jieba.load_userdict('my_dictionary.txt')
# Individual custom words can also be registered directly
jieba.add_word('易烊千玺')
jieba.add_word('长津湖')
四、文本情感分析
snownlp 对每条评论进行情感分析,输出一个 [0, 1] 之间的值;值越高,表示该评论越偏向积极正面。
from snownlp import SnowNLP
# Example:
# a = SnowNLP(u'确实很满意的一次购物。做工很好很精致。内外都很特别。这几天穿着很暖和的。而且轻薄。包装都很好。').sentiments
# print(a)
# Read the comments, one per line.
# BUG FIX: open the file with a context manager so it is always closed
# (the original never called fp.close()).
with open(r"movie.txt", "r", encoding='utf-8') as fp:
    lines = fp.readlines()
k = 0  # running sum of sentiment scores
m = 0  # number of comments successfully scored
for line in lines:
    try:
        s = SnowNLP(line)
        # Accumulate this comment's sentiment score
        k = k + s.sentiments
        # Count the comment
        m = m + 1
    except Exception:
        # BUG FIX: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit and printed a blank line;
        # just skip lines SnowNLP cannot score.
        continue
# BUG FIX: guard against an empty input file (division by zero)
if m:
    print(round(k / m, 3))  # average sentiment over all comments
从输出结果为0.827,可以得出结论:《长津湖》广受好评。
五、附录1.停用词:
2.颜色代码:(很全很好用!)
百度文库链接



