栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

爬取猫眼电影

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

爬取猫眼电影

针对构建的电影购票功能获取相应数据,主要获取:电影信息+影院信息+场次信息,都不难获取,主要是场次信息方面的构造、排布

1. 爬取首页所有电影链接

    def parse(self, response):
        """Extract every movie detail link on the index page and follow it."""
        detail_links = response.xpath('//dd//div[@class="movie-item"]/a/@href').extract()
        for href in detail_links:
            # Links are site-relative; prepend the host before requesting.
            yield scrapy.Request("https://maoyan.com" + href, callback=self.parse_tag)
2. 进入子页面获取电影具体信息

    def parse_tag(self, response):
        """Parse a movie detail page into a MaoyanItem (name, alias, date, vision)."""
        item = MaoyanItem()
        item['movie_name'] = response.xpath('//h3[@class="name"]/text()').extract_first()
        item['ellipsis'] = response.xpath('//div[@class="ename ellipsis"]/text()').extract()
        # Release date is buried in free text; pull it out with the precompiled regex.
        time_str = ''.join(response.xpath('//li[@class="ellipsis"]//text()').extract())
        item['time'] = self.date_reg_exp.findall(time_str)[0]
        # Screening format, detected from the badge icon class on the page.
        has_imax3d = response.xpath('//i[@class="imax3d"]')
        has_3d = response.xpath('//i[@class="m3d"]')
        if has_imax3d:
            item['vision'] = "3DIMAX"
        elif has_3d:
            item['vision'] = "3D"
        else:
            item['vision'] = ""
        yield item
3. 获取影院信息

因为这里需要的数据并不多,就自行选取部分区域爬取即可,想要爬取所有区域或者城市可以根据具体的城市id以及区域唯一标识id进行构造

4. 获取影院详情信息

 def parse_tag(self, response):
     """Parse a movie detail page into a MaoyanItem.

     Extracts name, alias, release date, synopsis, duration (minutes)
     and screening format.

     Bug fix: the duration regex was ``'d+'`` (matches literal runs of
     the letter "d"), so ``findall(...)[0]`` raised IndexError; it must
     be the raw digit pattern ``r'\d+'``.
     """
     item = MaoyanItem()
     item['movie_name'] = response.xpath('//h3[@class="name"]/text()').extract_first()
     item['ellipsis'] = response.xpath('//div[@class="ename ellipsis"]/text()').extract()
     # Release date is embedded in free text; extract with the precompiled regex.
     time_str = ''.join(response.xpath('//li[@class="ellipsis"]//text()').extract())
     item['time'] = self.date_reg_exp.findall(time_str)[0]
     item['introduction'] = response.xpath('normalize-space(//span[@class="dra"]/text())').extract_first()
     # Second "ellipsis" li holds the runtime, e.g. "中国大陆 / 120分钟".
     duration = ''.join(response.xpath('//li[@class="ellipsis"][2]//text()').extract())
     item['duration'] = re.findall(r'\d+', duration)[0]
     # Screening format, detected from the badge icon class on the page.
     if response.xpath('//i[@class="imax3d"]'):
         item['vision'] = "3DIMAX"
     elif response.xpath('//i[@class="m3d"]'):
         item['vision'] = "3D"
     else:
         item['vision'] = ""
     yield item

5. 获取场次信息

场次信息我用的json文件来保存,因为其中涉及到不同日期不同场次,针对不同电影,信息也是不同的,这是一个比较庞大的数据,我们按照如下形式保存

'''
    movie_session = [不同电影的场次数据
 {movie_name: ...
  result:[每一天的场次数据
     {
  date: ...
  session_result:[当天的场次数据
      {},
      {}....
  ] 
     }
  ]
 }
    ]:每部电影的数据
'''

去对应影院的详情页面,对电影、日期进行循环,然后抓取到所有的场次列表,写入到文件当中

# Browser-like request headers so maoyan.com serves normal pages to the
# scraper; Referer mimics arriving from a seat-selection page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Referer": "https://maoyan.com/xseats/201906020127664?movieId=246061&cinemaId=17372",
    "Host": "maoyan.com",
    "Upgrade-Insecure-Requests": "1",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
}
# NOTE(review): movie_result is never referenced in this snippet —
# confirm whether it is used elsewhere before removing.
movie_result = []

def getCinemas(urls):
    """Collect cinema detail-page paths from each cinema-listing page."""
    cinema_paths = []
    for listing_url in urls:
        doc = etree.HTML(getHtml(listing_url))
        cinema_paths += doc.xpath('//div[@class="cinema-info"]/a/@href')
    return cinema_paths


def getUrls(urls):
    """Turn the relative cinema paths from the listing pages into full URLs."""
    return ["http://maoyan.com" + path for path in getCinemas(urls)]


def getHtml(url):
    """Fetch *url* with the module-level browser headers and return the body text."""
    response = requests.get(url, headers=headers)
    return response.text


def getToday(today, time_list, num):
    """Build one day's session data from a day's xpath element.

    Args:
        today: element for one day's session table (assumed lxml node with
            an ``xpath`` method — the plist-container div; TODO confirm markup).
        time_list: date label strings, e.g. "6月2" (month/day digits inside).
        num: index into time_list for the day being parsed.

    Returns:
        A single-element list: [{"date": "M-D", "session_result": [...]}].

    Bug fix: the regex was ``'d+'`` (matches runs of the letter "d"), so
    ``findall`` returned [] on labels like "6月2" and ``[0]`` raised
    IndexError. The raw pattern ``r'\d+'`` extracts the month/day digits.
    """
    # Compute the "month-day" label once instead of re-running findall per field.
    digits = re.findall(r'\d+', time_list[num])
    date_label = digits[0] + '-' + digits[1]

    session_result = []
    # One table row per screening session on this day.
    for session in today.xpath('.//table/tbody/tr'):
        session_result.append({
            "date": date_label,
            "begin_time": session.xpath('.//span[@class="begin-time"]/text()')[0],
            "end_time": session.xpath('.//span[@class="end-time"]/text()')[0],
            "lang": session.xpath('.//span[@class="lang"]/text()')[0],
            "hall": session.xpath('.//span[@class="hall"]/text()')[0],
        })

    return [{
        "date": date_label,
        "session_result": session_result,
    }]


def parse_tag(html, days):
    """Build per-day session entries for the movie in the active show-list.

    Args:
        html: parsed page element (assumed lxml root — TODO confirm caller).
        days: day-container elements, one per date.

    Returns:
        A list with one {"movie_name", "result"} dict per day.

    Bug fix: getToday requires a day index as its third argument; the
    original call ``getToday(day, time_list)`` omitted it and raised
    TypeError. The enumerate index is now passed through.
    """
    movie_name = html.xpath('//h3[@class="movie-name"]/text()')[0]
    # First span is a "today"-style label; the real date labels follow it.
    time_list = html.xpath('//div[@class="show-list active"]//div[@class="show-date"]/span/text()')[1:]
    movie_session = []
    for num, day in enumerate(days):
        movie_session.append({
            "movie_name": movie_name,
            "result": getToday(day, time_list, num),
        })
    return movie_session

def test(text):
    """Parse one cinema detail page and dump every movie's sessions to JSON.

    Writes files/<cinema_name>.json. Removed the unused ``movies_session``
    and ``result`` locals. Bug fix: the file was opened in 'a+' (append)
    mode, so a second run appended a second JSON document and made the
    file unparseable; 'w' now overwrites with the latest snapshot.
    """
    html = etree.HTML(text)
    cinema_name = html.xpath('//h3[@class="name text-ellipsis"]/text()')[0]
    movies_list = html.xpath('//div[contains(@class, "show-list")]')

    # One entry per movie; get_MovieData wraps its dict in a one-element list.
    movie_data = [get_MovieData(movie)[0] for movie in movies_list]
    with open('files/' + cinema_name + '.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(movie_data, ensure_ascii=False, indent=2))


def get_MovieData(movie):
    """Extract one movie's name, rating and per-day session data.

    Args:
        movie: element for one show-list block (assumed lxml node — TODO
            confirm markup against the cinema page).

    Returns:
        A single-element list: [{"movie_name", "star", "result"}], where
        "result" holds one getToday() payload per day.

    Fix: replaced ``days.index(day)`` with enumerate — index() rescans the
    list on every iteration (O(n^2)) and returns the first equal element,
    which is wrong if two day containers compare equal.
    """
    movie_name = movie.xpath('.//h3[@class="movie-name"]/text()')[0]
    score_nodes = movie.xpath('.//span[@class="score sc"]/text()')
    star = score_nodes[0] if score_nodes else '暂无评分'

    # First span is a "today"-style label; the real date labels follow it.
    time_list = movie.xpath('.//div[@class="show-date"]/span/text()')[1:]
    # One container per date with that day's session table.
    days = movie.xpath('.//div[contains(@class, "plist-container")]')

    result = []
    for num, day in enumerate(days):
        result.append(getToday(day, time_list, num))

    return [{
        "movie_name": movie_name,
        "star": star,
        "result": result,
    }]


def parse(text):
    """Debug helper: print each day's session JSON for the first show-list.

    Cleanup: removed dead locals (``movie_session``, ``result``,
    ``session_result``, ``movie_name``, ``star``, and ``list``, which
    shadowed the builtin) and replaced ``days.index(day)`` with enumerate
    (O(n) rescan per iteration, wrong on equal elements).
    """
    html = etree.HTML(text)
    # First span is a "today"-style label; the real date labels follow it.
    time_list = html.xpath('//div[@class="show-list active"]//div[@class="show-date"]/span/text()')[1:]
    days = html.xpath('//div[@data-index="0"]//div[contains(@class, "plist-container")]')
    for num, day in enumerate(days):
        print(json.dumps(getToday(day, time_list, num), ensure_ascii=False))


if __name__ == '__main__':
    # Cinema-listing pages for the hand-picked districts; getUrls resolves
    # them to individual cinema detail-page URLs.
    district_pages = [
        "https://maoyan.com/cinemas?districtId=3799",
        "https://maoyan.com/cinemas?districtId=3798",
        "https://maoyan.com/cinemas?areaId=-1&districtId=3802",
    ]
    for cinema_url in getUrls(district_pages):
        # Fetch each cinema page and dump its session data to a JSON file.
        test(requests.get(cinema_url, headers=headers).text)

  • 具体代码
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/219877.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号