python抓取猫眼电影评论，200多行代码，哈哈

直接上接口api 不犯法吧。大家都知道~~~

http://m.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1

http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1

m接口因太频繁测试访问自个儿机子被抓住了哈哈。api是多年前抓的接口现在还能用。漂亮

抓取这个接口的时候会有美团滑块验证。。刚好之前看到了擦姐的滑块文章就尝试着拿过来了。

当时看的是这篇文章：https://dream.blog.csdn.net/article/details/111327362 。文中的测试地址已经失效了，所以刚好碰到一个需要滑块的接口，就拿来学习。如何配置和使用 Selenium，擦姐也都写好了（《Selenium 与 Python 之间如何才能交融在一起》），简直不要太棒，她啥都有。

文中所需模块没有的自行安装。很简单 pip install xxx 最新版就是了

1. 接着就是开始滑块测试了。

滑块源码无用的注释自行删除。

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
import traceback
import time
import random, sys
# driver webdriver.Firefox()
# # 浏览器最大化
# driver.maximize_window()
# # 打开注册页面
# driver.get( https://reg.taobao.com/member/reg/fill_mobile.htm )
# navigator.appVersion
def selenium_get_html(url="http://m.maoyan.com/mmdb/comments/movie/341516.json?_v_=yes&offset=1"):
    """Open ``url`` in Chrome and try to drag the Meituan slider captcha.

    The function waits up to 5 seconds for the element with id
    ``yodaMoveingBar``, then drags the ``yodaBox`` handle by the width and
    height of ``yodaBoxWrapper``.

    :param url: page that is expected to show the slider verification.
    :return: the browser's URL read just before the browser is closed.
    :raises SystemExit: exit code 1 when the slider never appears.
    """
    option = webdriver.ChromeOptions()
    # Headless mode is intentionally left off so the slider stays visible.
    # option.add_argument("headless")
    # Suppress the automation banner and noisy driver logging.
    option.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    # ``options=`` replaces the removed ``chrome_options=`` keyword.
    driver = webdriver.Chrome(options=option)
    current_url = None
    try:
        # Loading the page inside the try guarantees the browser is closed
        # even when navigation itself fails.
        driver.get(url)
        locator = (By.ID, "yodaMoveingBar")
        WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located(locator))
        time.sleep(1)
        # Slider handle.
        slider = driver.find_element(By.ID, "yodaBox")
        # Slider track; its size is used as the drag offset.
        track = driver.find_element(By.ID, "yodaBoxWrapper")
        ActionChains(driver).drag_and_drop_by_offset(
            slider, track.size["width"], track.size["height"]
        ).perform()
    except TimeoutException:
        print("等待超时...")
        sys.exit(1)
    except Exception as e:
        print("repr(e):\t", repr(e))
        # Both calls below report where the error happened.
        traceback.print_exc()
        print("traceback.format_exc():\n%s" % traceback.format_exc())
    finally:
        # Leave time for the verification request to finish (or for a manual
        # drag) before the window is closed.
        time.sleep(12)
        try:
            # Must be read before quit(); the session is gone afterwards.
            current_url = driver.current_url
        finally:
            driver.quit()
    print(current_url)
    return current_url
if __name__ == "__main__":
    url = "http://m.maoyan.com/mmdb/comments/movie/1263235.json?_v_=yes&offset=1"
    # Run the slider verification once and show what it returned.
    a = selenium_get_html(url)
    print(a)

标注

这里是等待网页渲染完成滑块节点出现后才能操作。

 locator (By.ID, yodaMoveingBar )
 a WebDriverWait(driver, 5, 0.5).until(EC.presence_of_element_located(locator))

获取滑块并滑动至。。那个xy坐标

 # 发现滑块
 yodaBox driver.find_element_by_id( yodaBox )
 # print(yodaBox.size)
 # 滑块区域
 source driver.find_element_by_id( yodaBoxWrapper )
 # print(source.size, source.size[ width ], type(source.size[ width ]))
 ActionChains(driver).drag_and_drop_by_offset(yodaBox, source.size[ width ], source.size[ height ]).perform()

刚开始是有成功过，但后来就再也没有成功过，哈哈哈哈，一直都是“请求异常”，怎么优化解决暂时还不知道。最终为了能成功爬取，我采取了手动验证后再接着做下面的爬取工作；以下爬取代码里也加入了这个验证操作。如若不行，还是把它注释掉，手动去做滑块验证吧。

2. 评论中需要电影的id 所以先通过搜索获取影片id

接口：https://maoyan.com/ajax/suggest?kw={keyword}
这里需要JSON去解析返回的数据 util封装的模块在上篇文章中有再贴一下。还写了个超级简单的exception封装哈哈哈。都贴下面了不要笑我哦。

下面这个方法返回的是list中的一个还没有拿到id的哦。

# Search movies by keyword; only the first hit is returned.
def get_movies(keyword):
    """Return the first movie entry of the suggest API for ``keyword``.

    :param keyword: search text sent as the ``kw`` query parameter.
    :return: the first element of ``movies.list`` in the JSON response.
    :raises SkipException: when the response lists no movie.
    """
    html = util.get_html(f"https://maoyan.com/ajax/suggest?kw={keyword}")
    # NOTE(review): assumes a missing "movies"/"list" key means "no result";
    # confirm against the live API.
    mvs = json.loads(html).get("movies", {}).get("list", [])
    if not mvs:
        raise SkipException(f"找不到{keyword}")
    return mvs[0]

Util 模块加了个函数运行时间

import os, time, requests, random, telnetlib, json, pypinyin
from bs4 import BeautifulSoup
# Absolute directory that contains this module.
# NOTE(review): a module-level ``__dir__`` also overrides what ``dir(module)``
# calls (PEP 562); kept because other code in this file refers to this name.
__dir__ = os.path.dirname(os.path.abspath(__file__))
# print(__dir__)
def get_headers(localhost=True, refer="https://www.baidu.com", host=None):
    """Build the request headers used by the crawler.

    :param localhost: when true a fixed desktop Chrome User-Agent is used;
        otherwise one is picked at random from a list of crawler/browser UAs.
    :param refer: value of the ``Referer`` header.
    :param host: value of the ``Host`` header (stored as given, may be None).
    :return: dict with the keys ``User-Agent``, ``Referer`` and ``Host``.
    """
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    if not localhost:
        uas = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)",
        ]
        ua = random.choice(uas)
    headers = {
        "User-Agent": ua,
        "Referer": refer,
        "Host": host,
    }
    return headers
def get_html(url, ret_type="text", timeout=50, encoding="utf-8"):
    """GET ``url`` with the default crawler headers and return its body.

    :param url: address to request.
    :param ret_type: ``"text"`` for decoded text, ``"image"`` for raw bytes,
        ``"json"`` for the parsed JSON body.
    :param timeout: timeout in seconds handed to ``requests.get``.
    :param encoding: encoding applied to the response before reading text.
    :return: the body in the requested form; None for any other ``ret_type``.
    """
    headers = get_headers()
    res = requests.get(url, headers=headers, timeout=timeout)
    res.encoding = encoding
    if ret_type == "text":
        return res.text
    if ret_type == "image":
        return res.content
    if ret_type == "json":
        return res.json()
    # Unknown ret_type: keep the historical "return nothing" behaviour.
    return None
# Decorator: report how long a function call takes.
def run_time(func):
    """Wrap ``func`` so every call prints its wall-clock duration.

    The wrapper forwards positional and keyword arguments, returns the
    wrapped function's result unchanged and keeps its name and docstring.

    :param func: callable to time.
    :return: the timing wrapper.
    """
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        s_time = time.perf_counter()
        res = func(*args, **kwargs)
        e_time = time.perf_counter()
        print(f"func --- {func.__name__}, runtime --- {e_time-s_time}")
        return res

    return wrapper

# Custom exception type (subclass of Exception).
# Raised to skip the current unit of work.
class SkipException(Exception):
    """Signals a failure the caller may skip over.

    Creating an instance prints a notice that includes ``msg``; the message
    is also available as ``self.msg`` and through ``str()``.
    """

    def __init__(self, msg):
        super().__init__(msg)
        print(f"不可控异常 为不影响原程序运行 抛出此异常提示...{msg}")
        self.msg = msg

3. 拿到id后就可以开始抓取了。接口返回的是JSON。测试发现该接口最多只返回到 offset=1000，所以程序里大于1000的部分直接跳过了，不会有数据。

# Crawl the comments of the movie that matches a keyword.
def scrawl_mv(keyword):
    """Look up ``keyword`` and crawl the comments of the first match.

    :param keyword: movie title (or part of it) to search for.
    :return: None. Nothing is crawled when the search finds no movie.
    """
    try:
        mv = get_movies(keyword)
    except SkipException as e:
        print(e)
        # Without a search hit there is no id to crawl; going on would fail
        # with a NameError on ``mv``.
        return
    movie_id = mv["id"]
    movie_name = mv["nm"]
    print(type(mv), movie_id, movie_name)
    url = f"http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1"
    print(f"正在爬取 --- 【{movie_name}】 第一页 {url}")
    do_scrawl(url, movie_name)

do_scrawl 里的 valid：这里是在第一次请求时直接进行滑块验证。可能会一直失败，所以我只尝试3次，失败就 sys.exit(1) 退出了。

 try:
 html util.get_html(url)
 print( 第一次 进行反复循环 检测是否需要验证 )
 # 第一次 进行反复循环 检测是否需要验证
 html valid(html, url)
 except SkipException as obj:
 print(obj)
 sys.exit(1)

滑块可能会不起作用建议还是手动去验证吧哈哈。然后注掉代码再去跑

好吧直接贴一下所有代码。。。200多行不算多吧。

import sys, os
# Absolute directory that contains this script.
__dir__ = os.path.dirname(os.path.abspath(__file__))
# Make this directory and the sibling "common" directory importable; this
# has to happen before the project-local imports below.
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, "../common")))
import util, maoyan_jieba
from exp import SkipException
import requests, json, time, random, slide_selenium, threading
from bs4 import BeautifulSoup
# Parse one page of comments.
def parse_ono_page(html):
    """Turn one API response into an iterator of comment dicts.

    The JSON is decoded immediately, so a malformed response fails at call
    time rather than on first iteration.

    :param html: JSON text whose ``cmts`` entry holds the comment list.
    :return: iterator of dicts with the keys ``comment``, ``date``, ``rate``,
        ``city`` and ``nickname``.
    :raises SkipException: when the text is not JSON or has no ``cmts``.
    """
    try:
        # Comments are stored as JSON, so they are extracted as JSON.
        data = json.loads(html)["cmts"]
    except Exception as err:
        raise SkipException("json解析错误 获取不到cmts") from err
    # data = json.loads(html)["hcmts"]  # alternative list in the same payload
    return (
        {
            "comment": item["content"],
            # "time" looks like "2021-09-10 11:55"; keep only the date part.
            "date": item["time"].split(" ")[0],
            "rate": item["score"],
            "city": item["cityName"],
            "nickname": item["nickName"],
        }
        for item in data
    )
# Return the total number of comments.
def parse_ono_pages(html):
    """Read the ``total`` field from one API response.

    :param html: JSON text returned by the comments API.
    :return: the value stored under ``total``.
    """
    return json.loads(html)["total"]
# { approve :0, assistAwardInfo :{ avatar : , celebrityId :0, celebrityName : , rank :0, title : }, avatarurl : https://img.meituan.net/maoyanuser/0d20974fe7a2dcb726680f4d94493b8511096.png , cityName : 北京 , content : 刘德华演技在线 画面真美 剧情太烂 , id :1143035845, isMajor :false, juryLevel :0, movieId :341516, nick : zhangsq0812 , nickName : zhangsq0812 , oppose :0, pro :false, reply :0, score :0.5, spoiler :0, startTime : 2021-09-10 11:55:57 , supportComment :true, supportLike :true, sureViewed :1, tagList :{ fixed :[{ id :2, name : 购票差评 },{ id :4, name : 购票 }]}, time : 2021-09-10 11:55 , userId :220211944, userLevel :0, vipType :0}
# Save one page of comments to a text file.
def save_to_txt(url, filepath=None):
    """Download one comment page and append its rows to ``filepath``.

    Each row is written as ``date,nickname,city,rate,comment``.

    :param url: comments API address of the page to save.
    :param filepath: target file; defaults to ``../files/<millis>.txt`` next
        to this module, with the timestamp taken at call time.
    :raises SkipException: when the page cannot be parsed.
    """
    if filepath is None:
        # Computed per call; a default in the signature would freeze the
        # timestamp at import time.
        filepath = os.path.join(__dir__, f"../files/{round(time.time() * 1000)}.txt")
    html = util.get_html(url)
    try:
        # Materialise here so parse errors surface inside this try even when
        # the parser yields lazily.
        cmts = list(parse_ono_page(html))
    except Exception as err:
        raise SkipException("解析JSON异常") from err
    rows = [
        item["date"] + "," + item["nickname"] + "," + item["city"] + ","
        + str(item["rate"]) + "," + item["comment"] + "\n"
        for item in cmts
    ]
    # Open once per page instead of once per comment.
    with open(filepath, "a", encoding="utf-8") as f:
        f.writelines(rows)
# Crawled comments may repeat; drop duplicates so later statistics are honest.
def delete_repeat(old, new):
    """Copy ``old`` to ``new`` keeping only the first copy of each line.

    Line order is preserved and every output line ends with one newline.

    :param old: path of the file to read (UTF-8).
    :param new: path of the file to write (UTF-8, overwritten).
    """
    seen = set()  # lines already written
    with open(old, "r", encoding="UTF-8") as oldfile, \
            open(new, "w", encoding="UTF-8") as newfile:
        for line in oldfile:
            # Compare without the line ending so a last line lacking "\n"
            # still matches an earlier identical line.
            text = line.rstrip("\n")
            if text in seen:
                continue
            seen.add(text)
            newfile.write(text + "\n")
# Fetch one page and save it to a txt file.
def scrawl(url):
    """Download the comment page at ``url`` and append it to a txt file.

    :param url: comments API address to crawl.
    """
    print(f"正在爬取{url}")
    # save_to_txt downloads the page itself and expects the URL, not the
    # response body.
    save_to_txt(url)
# The first request may be answered with a slider verification page.
@util.run_time
def valid(html, url):
    """Run the slider verification until ``url`` stops asking for it.

    A response counts as a verification page when its first ``<title>``
    contains "验证". The slider automation is tried at most three times, and
    the page fetched after each try is checked again.

    :param html: body of the first response for ``url``.
    :param url: address being crawled.
    :return: the first body that is not a verification page.
    :raises SkipException: when verification is still required after three
        attempts.
    """
    max_attempts = 3
    attempts = 0
    while True:
        titles = BeautifulSoup(html, "html.parser").select("title")
        print(titles)
        if not titles or "验证" not in titles[0].text:
            # No verification requested (any more): this body is usable.
            return html
        if attempts >= max_attempts:
            raise SkipException("无法通过滑块验证 error")
        slide_selenium.selenium_get_html(url)
        time.sleep(1)
        html = util.get_html(url)
        attempts += 1
# Look up a movie by id.
@util.run_time
def get_movie_name(movie_id):
    """Fetch the detail record of a movie.

    :param movie_id: Maoyan movie id.
    :return: the ``data.movie`` object of the detail API response (the whole
        record, not only the name).
    """
    html = util.get_html(f"http://api.maoyan.com/mmdb/movie/v5/{movie_id}.json")
    data = json.loads(html)["data"]["movie"]
    return data
# Search movies by keyword; only the first hit is returned.
def get_movies(keyword):
    """Return the first movie entry of the suggest API for ``keyword``.

    :param keyword: search text sent as the ``kw`` query parameter.
    :return: the first element of ``movies.list`` in the JSON response.
    :raises SkipException: when the response lists no movie.
    """
    html = util.get_html(f"https://maoyan.com/ajax/suggest?kw={keyword}")
    # NOTE(review): assumes a missing "movies"/"list" key means "no result";
    # confirm against the live API.
    mvs = json.loads(html).get("movies", {}).get("list", [])
    if not mvs:
        raise SkipException(f"找不到{keyword}")
    return mvs[0]
# Movie id that save_batch used to have hard-coded; kept only as a fallback.
_DEFAULT_MOVIE_ID = 1263235


# Crawl every comment page reachable from a first-page URL.
@util.run_time
def do_scrawl(url, movie_name=None):
    """Crawl all comment pages of one movie with several threads.

    :param url: first-page comments URL; the movie id is read from its
        ``/movie/<id>.json`` part.
    :param movie_name: movie title, used as the txt file name; defaults to
        ``movie<millis>`` computed at call time.
    :raises SystemExit: exit code 1 when slider verification cannot be passed.
    """
    import math
    import re

    if movie_name is None:
        movie_name = f"movie{round(time.time() * 1000)}"
    try:
        html = util.get_html(url)
        print("第一次 进行反复循环 检测是否需要验证")
        # The first request may need the slider verification.
        html = valid(html, url)
    except SkipException as obj:
        print(obj)
        sys.exit(1)
    # Total number of comments; the API serves at most 1000 of them.
    total = min(parse_ono_pages(html), 1000)
    # Comments per API response.
    size = 15
    # Round up so a final partial page is still crawled.
    pages = math.ceil(total / size)
    # Random thread count, then pages per thread, then the number of threads
    # really needed for that page count (never more threads than pages).
    thrs = random.randint(2, 10)
    works = math.ceil(pages / thrs) if pages else 0
    thrs = math.ceil(pages / works) if works else 0
    # Crawl the movie named in the URL instead of a fixed one.
    found = re.search(r"/movie/(\d+)\.json", url)
    movie_id = found.group(1) if found else _DEFAULT_MOVIE_ID
    root_path = util.JarProjectPath.project_root_path("py")
    filepath = root_path + f"files/{movie_name}.txt"
    print(f"共{total}条评论 每页{size}条 可爬{pages}页 随机预设【{thrs}】个线程 每个线程需爬取【{works}】页")
    # Multi-threaded crawl; threads are numbered from 1.
    workers = []
    for i in range(1, thrs + 1):
        first = works * (i - 1)
        # The last thread must not run past the number of available pages.
        last = min(works * i, pages)
        t = threading.Thread(target=save_batch, args=(i, first, last, filepath, movie_id))
        workers.append(t)
        print("线程{} 启动".format(i))
        t.start()
    for t in workers:
        t.join()
    print("多线程执行完成 爬取完毕")
    print(f"共{total}条评论 每页{size}条 可爬{pages}页 随机预设【{thrs}】个线程 每个线程需爬取【{works}】页")


# Crawl the comments of the movie that matches a keyword.
def scrawl_mv(keyword):
    """Look up ``keyword`` and crawl the comments of the first match.

    :param keyword: movie title (or part of it) to search for.
    :return: None. Nothing is crawled when the search finds no movie.
    """
    try:
        mv = get_movies(keyword)
    except SkipException as e:
        print(e)
        # Without a search hit there is no id to crawl.
        return
    movie_id = mv["id"]
    movie_name = mv["nm"]
    print(type(mv), movie_id, movie_name)
    url = f"http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset=1"
    print(f"正在爬取 --- 【{movie_name}】 第一页 {url}")
    do_scrawl(url, movie_name)


# Worker run by each crawl thread.
def save_batch(no, start, end, filepath, movie_id=_DEFAULT_MOVIE_ID):
    """Save the comment pages ``start`` .. ``end - 1`` to ``filepath``.

    :param no: thread number, used only in log output.
    :param start: first page index (0-based, inclusive).
    :param end: page index to stop before.
    :param filepath: txt file the comments are appended to.
    :param movie_id: id of the movie whose comments are fetched.
    """
    # The API's offset counts comments, not pages (offset 1 gives 1-15,
    # offset 2 gives 2-16), so step by the page size to avoid overlap.
    size = 15  # fixed by the API
    for i in range(start, end):
        # Anti-crawling: random pause of 1.05 to 6 seconds between requests.
        time.sleep(1 + float(random.randint(1, 100)) / 20)
        url = f"http://api.maoyan.com/mmdb/comments/movie/{movie_id}.json?_v_=yes&offset={i*size + 1}"
        print(f"Thread.{no} 正在保存 --- {url}")
        try:
            save_to_txt(url, filepath)
        except SkipException:
            # SkipException already printed its message; go to the next page.
            continue
 util.run_time
def thread_test(movie_name):
 # 评论总数
 total 1530
 # 接口 返回的评论条数
 size 15
 # 取整的页数
 pages round(total/size)
 # 每个线程的工作量
 works 50
 # 最大线程数
 r int(pages / works) 1 if pages % works 0 else 0
 print(f 共{total}条评论 每页{size}条 可爬{pages}页 预设每个线程爬取【{works}】页 需要【{r}】个线程 )
 root_path util.JarProjectPath.project_root_path( py )
 filepath root_path f files/{movie_name}.txt 
 # l []
 # for i in range(1, r 1):
 # if i r:
 # # 最后一个线程 不能超出可爬取的页数
 # t ThreadCrawl(str(i), save_batch, works*(i-1), works*(i-1) pages % works, filepath)
 # else: 
 # t ThreadCrawl(str(i), save_batch, works*(i-1), works*i, filepath)
 # l.append(t)
 # t.start()
 # for p in l:
 # p.join()
 print( 多线程执行完成 爬取完毕 )
class ThreadCrawl(threading.Thread):
    """Thread that runs ``func(*args)`` when started.

    :param thread_name: label shown in the log output.
    :param func: callable executed inside the thread.
    :param args: positional arguments passed to ``func``.
    """

    def __init__(self, thread_name, func, *args):
        # Initialise the Thread base class.
        super(ThreadCrawl, self).__init__()
        self.threadName = thread_name
        # Keep the callable and its arguments; calling func here would run
        # the work in the creating thread, before start().
        self.func = func
        self.args = args
        print("线程初始化", *args)

    def run(self):
        print(f"线程{self.threadName} ************启动************")
        self.func(*self.args)
if __name__ == "__main__":
    # print(get_movies("怒火·重案"))
    movie_name = "我的青春有个你"
    scrawl_mv(movie_name)
    # NOTE(review): the crawl writes to <project root>/files/<API title>.txt,
    # while this path is relative to the working directory and uses the
    # search keyword; confirm both resolve to the same file.
    filepath = f"files/{movie_name}.txt"
    if os.path.exists(filepath):
        print(os.path.abspath(filepath))
        maoyan_jieba.analysis(os.path.abspath(filepath))

大部分有注释说明的吧哈哈。。

爬取txt后之后做了云词分析跟geo评论分布大概这样。文中maoyan_jieba 代码还没贴哦。

python抓取猫眼电影评论，200多行代码，哈哈

Python相关栏目本月热门文章