爬虫项目——m3u8后缀的电影

# -*- coding: utf-8 -*-
"""
Created on Wed Oct  6 10:47:47 2021

@author: yingzi

E-mail:guotaomath@163.com
"""
'''
目标：找到目标网页,源代码中已找到m3u8,且通过抓包工具知道，
第一层m3u8嵌套第二层m3u8(真实ts的下载地址)，视频无加密
'''

'''
流程：
    1. 拿到网页源代码
    2. 从网页源代码提取第一层m3u8的url
    3. 解析第一层的m3u8，获取第二层m3u8的url
    4. 解析第二层的m3u8,分别下载相应的ts
    5. 合并ts
'''
import asyncio
import os
import re
from urllib.parse import urljoin

import aiofiles
import aiohttp
import requests

def get_m3u8_url(url):
    """Return the first-level m3u8 address embedded in the page at *url*.

    The page source is searched for a ``now="..."`` attribute; the text
    between the quotes is returned unchanged.

    Args:
        url: Address of the web page that embeds the player.

    Returns:
        The value captured from the first ``now="..."`` occurrence.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page source contains no ``now="..."`` attribute.
    """
    # Without a timeout requests may block forever on a stalled connection.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    # The named group has to be spelled ``(?P<url>...)``: a bare ``(?P...)``
    # is rejected by ``re.compile`` and ``group("url")`` would have nothing
    # to refer to.
    obj = re.compile(r'now="(?P<url>.*?)"', re.S)
    match = obj.search(resp.text)
    if match is None:
        raise ValueError(f'no now="..." attribute found in page: {url}')
    return match.group("url")
    
def down_first_m3u8(url):
    """Download the first-level m3u8 playlist and save it to ``法律之地.txt``.

    The response body is written byte-for-byte into the current working
    directory, replacing any previous file of that name.

    Args:
        url: Address of the first-level m3u8 playlist.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests may block forever on a stalled connection.
    resp = requests.get(url, timeout=30)
    # Fail here rather than saving an HTTP error page as if it were a
    # playlist, which would only surface later as a confusing parse error.
    resp.raise_for_status()
    # The raw bytes are stored, so no text encoding needs to be chosen.
    with open("法律之地.txt", mode="wb") as f:
        f.write(resp.content)
        
def get_second_m3u8_url(url):
    """Return the absolute URL of the second-level m3u8 playlist.

    Reads the first-level playlist previously saved as ``法律之地.txt`` and
    resolves its last URI line against *url*.

    Args:
        url: Address the first-level playlist was downloaded from; it is the
            base against which the playlist entry is resolved.

    Returns:
        The absolute URL built from the last non-comment, non-blank line of
        the saved playlist.

    Raises:
        FileNotFoundError: If ``法律之地.txt`` does not exist.
        ValueError: If the saved playlist contains no URI line.
    """
    m3u8_url = None
    with open("法律之地.txt", mode="r", encoding="utf-8") as f:
        for line in f:
            entry = line.strip()
            # Skip tags/comments, and blank lines, which would otherwise
            # overwrite the result with the bare base address.
            if not entry or entry.startswith("#"):
                continue
            # urljoin handles both "/absolute/path" and "relative/path"
            # entries, so no date-specific path prefix has to be hard-coded.
            m3u8_url = urljoin(url, entry)
    if m3u8_url is None:
        raise ValueError("no playlist URI found in 法律之地.txt")
    return m3u8_url

def down_second_m3u8(url):
    """Download the second-level m3u8 playlist and save it to ``法律之地2.txt``.

    The response body is written byte-for-byte into the current working
    directory, replacing any previous file of that name.

    Args:
        url: Address of the second-level m3u8 playlist (the one that lists
            the ts segments).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests may block forever on a stalled connection.
    resp = requests.get(url, timeout=30)
    # Fail here rather than saving an HTTP error page as if it were a
    # playlist, which would only surface later as a confusing parse error.
    resp.raise_for_status()
    # The raw bytes are stored, so no text encoding needs to be chosen.
    with open("法律之地2.txt", mode="wb") as f:
        f.write(resp.content)

async def download_ts(url, name, session):
    """Download one ts segment and store it under ``video/法律之地/``.

    Args:
        url: Address of the ts segment.
        name: File name to store the segment under.
        session: Open ``aiohttp.ClientSession`` used for the request.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status.
    """
    async with session.get(url) as resp:
        # Do not store an HTTP error body as if it were video data.
        resp.raise_for_status()
        data = await resp.content.read()
    # The file is opened only after the whole body has arrived, so a failed
    # or interrupted download never leaves an empty or truncated segment
    # behind for the merge step to pick up.
    async with aiofiles.open(f"video/法律之地/{name}", mode="wb") as f:
        await f.write(data)
    print(f"{name}下载完毕!!")


async def aio_download(max_concurrency=16):
    """Download every ts segment listed in ``法律之地2.txt`` concurrently.

    Each non-comment, non-blank line of the saved second-level playlist is
    used as a segment URL; the segment is stored under the last path
    component of that URL.

    Args:
        max_concurrency: Maximum number of segments downloaded at the same
            time.

    Raises:
        FileNotFoundError: If ``法律之地2.txt`` does not exist.
    """
    # The segment writer does not create its target directory itself.
    os.makedirs("video/法律之地", exist_ok=True)

    # Bound the number of in-flight requests. NOTE(review): starting every
    # request at once presumably lets later segments run into aiohttp's
    # per-request total timeout on long playlists -- confirm against the
    # incomplete downloads seen in practice.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def fetch(ts_url, name, session):
        async with semaphore:
            await download_ts(ts_url, name, session)

    tasks = []
    results = []
    async with aiohttp.ClientSession() as session:  # one shared session
        async with aiofiles.open("法律之地2.txt", mode="r", encoding="utf-8") as f:
            async for line in f:
                ts_url = line.strip()
                if not ts_url or ts_url.startswith("#"):
                    continue
                # [-1] instead of [1]: an entry without "/" must not raise.
                name = ts_url.rsplit("/", 1)[-1]
                tasks.append(asyncio.create_task(fetch(ts_url, name, session)))
        if tasks:
            # gather() accepts an empty or non-empty list and, with
            # return_exceptions=True, lets one failed segment not abort
            # the others.
            results = await asyncio.gather(*tasks, return_exceptions=True)

    failed = sum(isinstance(result, BaseException) for result in results)
    if failed:
        print(f"{failed}个ts文件下载失败!!")
    
# def merge_ts_1():
#     lst = []
#     with open("法律之地2.txt",mode="r",encoding="utf-8") as f:
#         for line in f:
#             if line.startswith("#"):
#                 continue
#             name = line.strip().rsplit("/",1)[1]
#             lst.append(f"video/法律之地/{name}")
#     s = "+".join(lst)
#     os.system(f"copy /b {s} video.mp4")
#     print("搞定！！")
   
def merge_ts_2():
    """Concatenate the downloaded ts segments into ``video.ts``.

    Segments are appended in the order they are listed in ``法律之地2.txt``.
    A listed segment that is not present under ``video/法律之地/`` is skipped.

    Raises:
        FileNotFoundError: If ``法律之地2.txt`` does not exist.
    """
    with open("法律之地2.txt", mode="r", encoding="utf-8") as f, \
            open("video.ts", "wb") as f1:
        for line in f:
            entry = line.strip()
            # Blank lines carry no segment name; tags/comments start with "#".
            if not entry or entry.startswith("#"):
                continue
            # [-1] instead of [1]: an entry without "/" must not raise.
            name = entry.rsplit("/", 1)[-1]
            ts_video_path = f"video/法律之地/{name}"
            # isfile() rather than exists(): a directory is not a segment.
            if not os.path.isfile(ts_video_path):
                continue
            # Close every segment right after copying it instead of leaving
            # one open handle per segment to the garbage collector.
            with open(ts_video_path, "rb") as segment:
                f1.write(segment.read())
    print("搞定！！")

if __name__ == '__main__':
    # Script entry point: runs the five steps listed at the top of the file.
    url = "https://www.daquan.cc/play/?15855-1-0.html"
    m3u8_first_url = get_m3u8_url(url)   # 2.1 extract the first-level m3u8 address

    down_first_m3u8(m3u8_first_url)      # 3.1 save the first-level m3u8
    m3u8_second_url = get_second_m3u8_url(m3u8_first_url)  # 3.2 get the second-level m3u8 url

    down_second_m3u8(m3u8_second_url)    # 4.1 save the second-level m3u8
    # 4.2 download the ts files concurrently. asyncio.run() starts an event
    # loop and blocks until every download has finished; create_task() needs
    # an already running loop and would not wait before the merge starts.
    asyncio.run(aio_download())

    merge_ts_2()  # 5 merge the ts files

后续改进：

目前只用异步协程下载ts文件，没有使用线程；当文件数量太大时，协程在await等待过程中会被强制退出，导致电影后半部分没有下载完（可能是因为同时发起的请求太多，触发了aiohttp默认的总超时）。等后续学会使用线程了再加上去，应该可以解决这个问题。
merge_ts_1这个函数不知为何用不了，所以用merge_ts_2函数替代了（可能的原因：Windows的copy命令会把路径中的“/”当作命令开关，而且几百个文件名拼接后的命令可能超出命令行长度限制）。

爬虫项目——m3u8后缀的电影

Python相关栏目本月热门文章