栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

爬虫爬取奥特曼url~今日份练习

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

爬虫爬取奥特曼url~今日份练习

# NOTE(review): this code was recovered from a scrape-mangled page — all '=',
# quotes, '@' and '&' characters had been stripped. Reconstructed below;
# runtime strings (URLs, messages, file path) are kept exactly as in the source.
import time

import requests
from lxml import etree


class UltramanSpider:
    """Scrape Ultraman episode-page URLs from tv.ci and save them to a text file."""

    def __init__(self):
        # Search-results page for "奥特曼" (Ultraman); %E5%A5%A5%E7%89%B9%E6%9B%BC
        # is the URL-encoded query. The '&submit=' tail was present in the original.
        self.url = 'https://tv.ci/sb/ke7nhZe3c1-.html?wd=%E5%A5%A5%E7%89%B9%E6%9B%BC&submit='
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/94.0.4606.61 Safari/537.36',
        }

    def get_all_html(self):
        """Fetch the search-results page, retrying forever on any error.

        Returns the decoded HTML text.
        """
        count = 1
        while True:
            try:
                print(f'总url第{count}次请求')
                count += 1
                response = requests.get(self.url, headers=self.headers, timeout=5)
            except Exception:
                # Network/timeout failure: back off briefly, then retry.
                time.sleep(3)
            else:
                return response.content.decode()

    def get_all_url(self, text):
        """Parse the search-results HTML into {series name: absolute URL}.

        text: HTML of the search-results page.
        Returns a dict mapping each result's title to its tv.ci detail-page URL.
        """
        html = etree.HTML(text)
        # li[*] matches every result row; @href/text() pair up positionally via zip.
        href = html.xpath('/html/body/div[1]/div/div/div[1]/div/div/div[2]/ul/li[*]/div[2]/h4/a/@href')
        name = html.xpath('/html/body/div[1]/div/div/div[1]/div/div/div[2]/ul/li[*]/div[2]/h4/a/text()')
        dict_text = dict(zip(name, [f'https://tv.ci/{i}' for i in href]))
        print('所有url已爬取')
        return dict_text

    def get_html(self, name, url):
        """Fetch one series' detail page, retrying forever on any error.

        name: series title (used only in the progress message).
        url:  detail-page URL.
        Returns the decoded HTML text.
        """
        count = 1
        while True:
            try:
                print(f'{name}第{count}次请求')
                count += 1
                response = requests.get(url, headers=self.headers, timeout=5)
            except Exception:
                time.sleep(3)
            else:
                return response.content.decode()

    def get_url(self, url, text):
        """Parse a detail page into {episode name: absolute episode URL}.

        url:  detail-page URL (unused in parsing; kept for interface compatibility).
        text: HTML of the detail page.
        """
        html = etree.HTML(text)
        href = html.xpath('/html/body/div[1]/div/div[1]/div[3]/div/div[2]/div[1]/ul/li[*]/a/@href')
        name = html.xpath('/html/body/div[1]/div/div[1]/div[3]/div/div[2]/div[1]/ul/li[*]/a/text()')
        return dict(zip(name, [f'https://tv.ci/{i}' for i in href]))

    def run(self):
        """Crawl all series, collect their episode URLs, and write one text file."""
        url_text = ''
        # Fetch the main search-results page.
        all_html = self.get_all_html()
        # Map every Ultraman series name to its detail-page URL.
        dict_all_url = self.get_all_url(all_html)
        for name, url in dict_all_url.items():
            html = self.get_html(name, url)
            dict_url = self.get_url(url, html)
            url_text += name + '\n' + str(dict_url) + '\n'
            print(f'{name}已爬取')
        # Original output path preserved verbatim ("00文件夹" = "folder 00").
        with open('./00文件夹/url/奥特曼.txt', 'w', encoding='utf-8') as f:
            f.write(url_text)
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/267812.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号