一、界面展示
1.主界面 2.词云图 3.用户地区分布图 4.评论情感得分 5.评论数据浏览 6.评论时间分布图 7.用户信息浏览 二、代码结构展示
1.GUI代码整体 三、核心代码
1.用户爬虫代码 2.评论爬虫代码 3.传送门:[待加载...](http://www.baidu.com/)
一、界面展示 1.主界面 2.词云图 3.用户地区分布图 4.评论情感得分 5.评论数据浏览 6.评论时间分布图 7.用户信息浏览 二、代码结构展示:代码采用模块化处理,每个脚本可以分别运行,也可以使用可视化界面辅助运行。
1.GUI代码整体 三、核心代码 1.用户爬虫代码 重要包版本:
pyecharts 0.5.5
jinja2 3.0.3
# -*- coding:utf8 -*-
from urllib import request
import json
import pymysql
import re
# NOTE: the four values below are redacted ("****") in the published listing
# and must be replaced with real values before this script can run.
ROOT_URL = ****  # URL prefix; the main block appends a user id to it
DATAbase = ****  # database name passed to pymysql.connect(db=...)
TABLE_USERS = ****  # presumably the users table name; only referenced in commented-out code here
TABLE_COMMENTS = ****  # table that getID_user() selects userId from
# Characters scrubbed from free-text fields before they are stored: newline,
# tab, carriage return and "/". The published listing read r'[ntr/]', which
# appears to have lost its backslashes and therefore deleted the plain
# letters n, t and r from every signature.
PATTERN = re.compile(r'[\n\t\r/]')
def getData_user(url):
    """Download one user profile and flatten it into a dict.

    Args:
        url: Full profile URL. A falsy value short-circuits to ``None``.

    Returns:
        A dict with the keys ``userId``, ``userName``, ``avatar``, ``gender``,
        ``age``, ``level``, ``sign``, ``eventCount``, ``followCount``,
        ``fanCount``, ``city`` and ``recordCount`` when the response carries
        ``code == 200``; an empty dict for any other code; ``None`` when
        ``url`` is falsy or the download/parsing fails (the error is printed).
    """
    if not url:
        return None
    print('Crawling>>> ' + url)
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46',
    }
    try:
        req = request.Request(url, headers=headers)
        # Close the response deterministically and never block forever on a
        # stalled connection; a timeout is reported like any other error.
        with request.urlopen(req, timeout=10) as resp:
            content = resp.read().decode("utf-8")
        js = json.loads(content)
        data = {}
        if js['code'] == 200:
            profile = js['profile']
            data['userId'] = profile['userId']
            data['userName'] = profile['nickname']
            data['avatar'] = profile['avatarUrl']
            data['gender'] = profile['gender']
            data['age'] = _age_from_birthday(profile['birthday'])
            data['level'] = js['level']
            # A missing (None) signature used to raise inside re.sub and drop
            # the whole profile; treat it as an empty string instead.
            data['sign'] = PATTERN.sub(' ', profile['signature'] or '')
            data['eventCount'] = profile['eventCount']
            data['followCount'] = profile['follows']
            data['fanCount'] = profile['followeds']
            data['city'] = profile['city']
            data['recordCount'] = js['listenSongs']
        return data
    except Exception as e:
        # Best-effort crawl: report the failure and let the caller skip this user.
        print('Down err>>> ', e)
    return None


def _age_from_birthday(birthday_ms):
    """Turn a birthday (treated as epoch milliseconds) into an approximate age; 0 if negative."""
    from datetime import datetime  # local import: this script does not import datetime at file level

    birthday_ms = int(birthday_ms)
    if birthday_ms < 0:
        return 0
    # Same 365-day-year approximation as before, but measured against the
    # current year instead of a hard-coded 2018.
    birth_year = 1970 + birthday_ms // (1000 * 365 * 24 * 3600)
    return max(datetime.now().year - birth_year, 0)
def saveData_user(data):
    """Insert one crawled user record into the ``users`` table.

    Args:
        data: Dict as produced by ``getData_user``. A falsy value (``None``
            or an empty dict) is ignored.

    Returns:
        Always ``None``. Database errors are printed, not raised.
    """
    if not data:
        return None
    # utf8mb4 is needed so that emoji in user signatures can be stored.
    conn = pymysql.connect(host='localhost', user='root', passwd='qwer', db=DATAbase, charset='utf8mb4')
    cursor = conn.cursor()
    insert_sql = (
        'insert into users'
        '(id,userName,gender,age,level,city,sign,eventCount,followsCount,followedCount,recordCount,avatar,userId)'
        ' VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    )
    # Keys of ``data`` listed in the order of the columns that follow ``id``.
    field_order = (
        'userName', 'gender', 'age', 'level', 'city', 'sign', 'eventCount',
        'followCount', 'fanCount', 'recordCount', 'avatar', 'userId',
    )
    try:
        # The leading 0 fills the ``id`` column (presumably auto-increment — confirm against the schema).
        row = (0,) + tuple(data[key] for key in field_order)
        cursor.execute(insert_sql, row)
        conn.commit()
    except Exception as err:
        print('mysql err>>> ', data['userId'], err)
    finally:
        cursor.close()
        conn.close()
def getID_user():
    """Read the ``userId`` column of the comments table.

    The query has no DISTINCT, so repeated ids are returned as-is.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the query, or ``None`` when
        the query fails (the error is printed).
    """
    conn = pymysql.connect(host='localhost', user='root', passwd='qwer', db=DATAbase, charset='utf8mb4')
    cursor = conn.cursor()
    query = 'SELECt userId FROM ' + TABLE_COMMENTS
    rows = None
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
    except Exception as err:
        print('get err>>> ', err)
    finally:
        # Release the cursor and connection on both the success and failure paths.
        cursor.close()
        conn.close()
    return rows
if __name__ == '__main__':
    # getID_user() returns None when its query fails; fall back to an empty
    # tuple so the crawl is skipped instead of raising TypeError.
    usersID = getID_user() or ()
    # The same user id can occur on many comment rows. Crawl each id once,
    # in first-seen order, and coerce to str so a numeric column works too.
    unique_ids = dict.fromkeys(
        str(row[0]).strip() for row in usersID if row[0] is not None
    )
    for user_id in unique_ids:
        data = getData_user(ROOT_URL + user_id)
        saveData_user(data)
2.评论爬虫代码
# -*- coding:utf8 -*-
from urllib import request
import json
import pymysql
from datetime import datetime
import re
# NOTE: values shown as "****" are redacted in the published listing and must
# be replaced with real values before this script can run.
ROOT_URL = ****  # %-format template; the main block fills it with (songId, LIMIT_NUMS, offset)
LIMIT_NUMS = 50 # maximum number of comments requested per page
DATAbase = **** # database name
TABLE = **** # database table name
# Characters removed from comment text before it is stored: newline, tab,
# carriage return and "/". The published listing read r'[ntr/]', which
# appears to have lost its backslashes and therefore deleted the plain
# letters n, t and r from every comment.
PATTERN = re.compile(r'[\n\t\r/]')
def getData_com(url):
    """Download one page of comments and normalise each entry.

    Args:
        url: Full comments URL. A falsy value short-circuits.

    Returns:
        A ``(total, datas)`` tuple. ``total`` is the integer ``total`` field
        of the response and ``datas`` is a list of dicts with the keys
        ``commentId``, ``content``, ``time`` (a ``datetime``), ``likedCount``
        and ``userId``. ``(None, None)`` is returned when ``url`` is falsy or
        when the download/parsing fails (the error is printed).
    """
    if not url:
        return None, None
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        "Host": "music.163.com",
    }
    print('Crawling>>> ' + url)
    try:
        req = request.Request(url, headers=headers)
        # Close the response deterministically and never block forever on a
        # stalled connection; a timeout is reported like any other error.
        with request.urlopen(req, timeout=10) as resp:
            content = resp.read().decode("utf-8")
        js = json.loads(content)
        total = int(js['total'])
        datas = [
            {
                'commentId': c['commentId'],
                # A None content would raise inside re.sub and discard the whole page.
                'content': PATTERN.sub('', c['content'] or ''),
                # The code treats c['time'] as epoch milliseconds.
                'time': datetime.fromtimestamp(c['time'] // 1000),
                'likedCount': c['likedCount'],
                'userId': c['user']['userId'],
            }
            for c in js['comments']
        ]
        return total, datas
    except Exception as e:
        print('Down err>>> ', e)
        # Keep the two-value shape: callers unpack the result, and a bare
        # None here used to crash them with "cannot unpack NoneType".
        return None, None
def saveData_com(data):
    """Insert a batch of crawled comments, one row and one commit per comment.

    Args:
        data: Iterable of dicts as produced by ``getData_com``. A falsy value
            (``None`` or an empty list) is ignored.

    Returns:
        Always ``None``. A failing row is printed and skipped so the rest of
        the batch is still stored.
    """
    if not data:
        return None
    # utf8mb4 is needed so that emoji in comments can be stored.
    # NOTE(review): the database name is hard-coded here although the file
    # also defines DATAbase — confirm the two are meant to match.
    conn = pymysql.connect(host='localhost', user='root', passwd='qwer', db='wangyiyun',
                           charset='utf8mb4')
    cursor = conn.cursor()
    sql = 'insert into ' + TABLE + ' (id,commentId,content,likedCount,time,userId) VALUES (%s,%s,%s,%s,%s,%s)'
    try:
        for d in data:
            try:
                # The former per-row "SELECt max(id)" query was dropped: its
                # result was never read, so it only cost a round-trip.
                cursor.execute(sql, (0, d['commentId'], d['content'], d['likedCount'], d['time'], d['userId']))
                conn.commit()
            except Exception as e:
                print('mysql err>>> ', d['commentId'], e)
    finally:
        # Always release the cursor and connection, even if an error escapes the loop.
        cursor.close()
        conn.close()
if __name__ == '__main__':
    songId = input('歌曲ID:').strip()
    # `or (None, None)` tolerates a None result so the unpacking cannot fail.
    total, data = getData_com(ROOT_URL % (songId, LIMIT_NUMS, 0)) or (None, None)
    saveData_com(data)
    if total:
        # Number of pages is ceil(total / LIMIT_NUMS); page 0 is already stored.
        # The original divided by an undefined name `num`, raising NameError.
        page_count = -(-total // LIMIT_NUMS)
        for i in range(1, page_count):
            offset = i * LIMIT_NUMS
            _, data = getData_com(ROOT_URL % (songId, LIMIT_NUMS, offset)) or (None, None)
            saveData_com(data)
3.传送门:待加载…
有问题请在评论区告诉我!



