栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

Python爬虫实战

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

Python爬虫实战

代码均为原创,侵权立删。qq:1392516067

全部代码:

import requests
from lxml import etree
import os
import shutil
from time import sleep

# Default number of chapters to download; overwritten by user input in main()
# and read by downText() via `global count`.
count= 50

def main():
    """Entry point: interactively download a novel from www.cits0871.com.

    Asks for the number of chapters and the catalogue URL, then fetches the
    book name, creates the output folder, and downloads the cover image,
    introduction and chapter texts.
    """
    global count
    count = int(input('请输入下载章节数:'))
    # Example catalogue page: http://www.cits0871.com/booktxt/44781/
    # (the original hard-coded this and then immediately overwrote it with
    # the user's input — the dead assignment is removed).
    url0 = input('请输入下载网址url:')
    headers = {
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'
    }
    html = requests.get(url0, headers=headers).text
    # Parse the catalogue page once; the same tree is shared by all helpers.
    root = etree.HTML(html)
    Name = getName(root, headers)
    creFile(Name)
    # was: print(getName(root, headers)) — re-ran the extraction just to print
    print(Name)
    downImage(root, headers)
    downIntroduction(root)
    Textname = getTextname(root)
    downText(root, headers, Textname)

def creFile(Name):
    """Create a fresh output directory for the book under the project folder.

    An existing directory of the same name is removed first so stale chapter
    files do not linger. Falls back to the generic folder "小说文档" when
    Name is None.
    """
    # NOTE(review): the base path lost its backslash separators when this
    # listing was published on the web (compare the intact paths in the
    # dead block after getTextname); restored here as a raw string.
    os.chdir(r"D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining")
    if Name is not None:
        if os.path.exists(Name):
            shutil.rmtree(Name)
        os.mkdir(Name)
    else:
        os.mkdir("小说文档")

def getName(root, headers):
    bookName_xpath = '//div[@id = "info"]/h1/text()'
    bookNameList = root.xpath(bookName_xpath)
    author_xpath = '//div[@id = "info"]/p/text()'
    authorList = root.xpath(author_xpath)
    authorList[0] = str(authorList[0]).replace("作    者:","")
    global Name
    Name = bookNameList[0] + '__' + authorList[0]+' 著'
    return Name

def downImage(root, headers):
    """Download the book cover image into the book's output directory.

    Relies on the module-level ``Name`` set by getName(); saves the cover
    as "<Name>.jpg". Silently does nothing unless the server answers 200.
    """
    imgList = root.xpath('//div[@id = "fmimg"]/img')
    imgurl = imgList[0].attrib['src']
    print(imgurl)
    # NOTE(review): these chdir paths lost their backslash separators in the
    # published listing; restored via a raw base string + os.path.join.
    base = r"D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining"
    if Name is not None:
        os.chdir(os.path.join(base, Name))
    else:
        os.chdir(os.path.join(base, "小说文档"))
    # Stream the response so a large image is never held in memory at once.
    respStream = requests.get(imgurl, headers=headers, stream=True)
    if respStream.status_code == 200:
        with open(f'{Name}.jpg', 'wb') as fd:
            for chunk in respStream.iter_content(chunk_size=128):
                fd.write(chunk)
        print(f'图片{Name}下载完毕')

def downIntroduction(root):
    """Write the book introduction plus the site's recommended-title list
    to 小说简介.txt inside the book's output directory.

    Relies on the module-level ``Name`` set by getName(). The original code
    wrote the intro, read the file back through text_read(), then rewrote it
    with the recommendations appended; this version builds the same final
    content in one pass and one write.
    """
    base = r"D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining"
    full_path = os.path.join(base, Name, '小说简介.txt')
    stateList = root.xpath('//div[@id = "intro"]/p/text()')
    txtList = root.xpath('//div[@id = "listtj"]/a/text()')
    # First line: all intro paragraphs joined, each followed by two spaces
    # (matches the original first write); then one recommended title per line.
    intro_line = ''.join(state + '  ' for state in stateList)
    with open(full_path, 'w') as file:
        for piece in [intro_line] + txtList:
            # was file.write('n' + x) — the backslash of '\n' was lost when
            # the listing was published; restored to a real newline.
            file.write('\n' + piece)
def text_read():
    """Read 小说简介.txt back and return its lines with newlines stripped.

    Relies on the module-level ``Name`` set by getName().
    """
    base = r"D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining"
    path = os.path.join(base, Name, '小说简介.txt')
    # was w.replace('n', '') — the lost backslash made it strip every letter
    # 'n' from the text; restored to strip newline characters only.
    with open(path, 'r+') as file:
        return [line.replace('\n', '') for line in file]

def getTextname(root):
    """Return the list of chapter titles from the catalogue's #list section.

    The titles are later used by downText() as per-chapter file names.
    """
    # Dead commented-out code that followed the return (an unreachable
    # triple-quoted block, itself containing an invalid f-string) removed.
    textname_xpath = '//div[@class = "box_con"]/div[@id = "list"]/dl/dd/a/text()'
    textnameList = root.xpath(textname_xpath)
    print(textnameList)
    return textnameList

def downText(root, headers, Textname):
    """Download up to ``count`` chapters, saving each as "<title>.txt".

    Chapter URLs come from the catalogue's #list anchors; each chapter
    page's #content paragraphs are written with a leading newline each.
    Relies on the module-level ``Name`` (output folder) and ``count``
    (chapter limit set in main()).
    """
    global ar_count, count
    anchors = root.xpath('//div[@id = "list"]/dl/dd/a')
    urls = ['http://www.cits0871.com' + a.attrib['href'] for a in anchors]
    print(urls)
    # NOTE(review): the original f-string path ended in \" — a syntax error
    # from the web-published listing eating backslashes; rebuilt with a raw
    # base string + os.path.join.
    base = r"D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining"
    ar_count = 0
    # was: for i in range(count) with a late `ar_count > len(url)` check,
    # which indexed past the end when count exceeded the chapter list;
    # clamping the range fixes the off-by-one.
    for i in range(min(count, len(urls))):
        html = requests.get(urls[i], headers=headers).text
        ar_root = etree.HTML(html)
        articleList = ar_root.xpath('//div[@id = "content"]/p/text()')
        file_path = os.path.join(base, Name, Textname[ar_count] + ".txt")
        ar_count += 1
        with open(file_path, 'w') as file:
            for paragraph in articleList:
                # was file.write('n' + x) — restored the lost '\n'.
                file.write('\n' + paragraph)
    print("小说下载完成")


# Run the interactive downloader only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/828862.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号