小白篇—Python从网页抓取小说

这里以笔趣阁中的小说 https://www.xbiquge.la/xiaoshuodaquan/ 为例，将每一章的内容爬取下来保存到本地。
特别声明：该分享仅供参考与学习使用，请勿用于其他商业等非法用途，如有侵权请联系我删除该博文！

1. 导入所需的库

import lxml
from bs4 import BeautifulSoup
import requests
import time
import random
import os

'''
目标：https://www.xbiquge.la/xiaoshuodaquan/
'''

2. 定义一个根据 http 地址获取 BeautifulSoup 对象的方法

def getBS(url, end):
    '''
    Fetch ``url`` and parse the response body into a BeautifulSoup object.

    Each attempt sleeps 0.5 s first to throttle requests. A non-200 response
    triggers a retry (after a random 2-5 s pause) with the remaining budget
    reduced by one. When the budget is exhausted the URL is appended to
    ``log.txt`` (one URL per line) and ``None`` is returned.

    :param url: address of the page to download
    :param end: maximum number of attempts still allowed
    :return: a BeautifulSoup object, or None if the budget ran out or the
             request/parse step raised an exception
    '''
    # Browser-like User-Agent sent with every request.
    headr = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    }
    # No attempts left (``<=`` also stops a negative budget from recursing forever).
    if end <= 0:
        print(url, "尝试连接无效!")
        # Record the URL that could not be fetched, one per line.
        with open("log.txt", "a", encoding="utf-8") as f:
            f.write(url + "\n")
        return None
    # Throttle: pause briefly before every request.
    time.sleep(0.5)
    try:
        # A timeout keeps a stalled server from blocking the crawl forever.
        response = requests.get(url=url, headers=headr, timeout=10)
        if response.status_code == 200:
            print(url + " 连接成功!")
            html = response.content.decode("utf-8")
            return BeautifulSoup(html, "lxml")
        # Any other status code: wait a random 2-5 s, then retry with one
        # fewer attempt left.
        print(url + " 连接失败! 再次尝试连接...")
        time.sleep(random.randint(2, 5))
        return getBS(url, end - 1)
    except Exception as e:
        # Best effort: an exception (connection error, timeout, decode
        # failure, ...) is reported and treated as "page unavailable"
        # without further retries.
        print(e)
        return None

3. 定义一个解析小说大全的方法，返回一个由小说名和对应 http 地址组成的字典对象

def getCatalogue(url, end):
    '''
    Parse the "all novels" page into a ``{title: book URL}`` dictionary.

    The page is downloaded with ``getBS(url, 10)``. Every ``<li>`` inside the
    first ``<div class="novellist">`` that has at least two characters of text
    and an ``<a>`` whose ``href`` is longer than four characters becomes one
    entry. If parsing raises, the whole page is retried after a random
    2-5 s pause with the budget reduced by one.

    :param url: address of the catalogue page
    :param end: maximum number of parse attempts still allowed
    :return: dict mapping the list item's text to its link; empty dict when
             the page cannot be downloaded or the budget is exhausted
    '''
    # No attempts left: log the URL (one per line) and give up.
    if end <= 0:
        print(url, "尝试连接无效!!!")
        with open("log.txt", "a", encoding="utf-8") as f:
            f.write(url + "\n")
        return {}

    bs = getBS(url, 10)
    if bs is None:
        return {}

    try:
        dic = {}
        # Page-structure specific: the novels are <li> items in div.novellist.
        catalogues = bs.find("div", class_="novellist").findAll("li")
        for li in catalogues:
            link = li.find("a")
            if link is None or len(li.get_text()) < 2:
                continue
            href = link.get("href")
            # Skip anchors without a usable href instead of raising on len(None).
            if href is not None and len(href) > 4:
                # Key: book title, value: book URL.
                dic[li.get_text()] = href
        return dic
    except Exception:
        # Typically the expected container is missing (find() returned None);
        # wait and retry the whole page.
        print(url, " 获取失败, 再次尝试中!")
        time.sleep(random.randint(2, 5))
        return getCatalogue(url, end - 1)

4. 定义一个根据http地址解析具体小说的方法，返回一个由章节和章节地址组成的字典对象

def getChapter(url, end):
    '''
    Parse one novel's index page into a ``{chapter title: chapter URL}`` dict.

    The page is downloaded with ``getBS(url, 5)``. For every ``<dd>`` inside
    ``<div id="list">`` that contains a link, the fourth ``/``-separated
    component of the ``href`` is appended to ``url`` to form the chapter
    address. If parsing raises, the page is retried after a random 2-5 s
    pause with the budget reduced by one.

    :param url: address of the novel's index page; chapter file names are
                appended to it directly
    :param end: maximum number of parse attempts still allowed
    :return: dict of chapter title to chapter URL; empty dict when the page
             cannot be downloaded or the budget is exhausted
    '''
    # No attempts left: log the URL (one per line) and give up.
    if end <= 0:
        print(url, "尝试连接无效!")
        with open("log.txt", "a", encoding="utf-8") as f:
            f.write(url + "\n")
        return {}

    bs = getBS(url, 5)
    if bs is None:
        return {}

    try:
        dic = {}
        catalogues = bs.find("div", id="list").findAll("dd")
        for dd in catalogues:
            link = dd.find("a")
            if link is None:
                continue
            textUrl = link.get("href")
            if textUrl is None:
                continue
            parts = textUrl.split("/")
            # Index 3 is read below, so at least four components are required
            # (the previous ">= 3" check allowed an IndexError).
            if len(parts) >= 4:
                dic[dd.get_text()] = url + parts[3]
        return dic
    except Exception:
        # Typically the expected container is missing (find() returned None);
        # wait and retry the whole page.
        print(url, " 获取失败, 再次尝试中!")
        time.sleep(random.randint(2, 5))
        return getChapter(url, end - 1)

5. 定义一个根据具体章节地址返回对应的字符信息的方法

def getText(url, end):
    '''
    Download one chapter page and return the text of ``<div id="content">``.

    The page is downloaded with ``getBS(url, 5)``. A failed download is
    retried after a random 2-5 s pause with the budget reduced by one. Every
    failure mode returns the sentinel string ``'TxtGetError'`` so callers
    never receive ``None``.

    :param url: address of the chapter page
    :param end: maximum number of download attempts still allowed
    :return: the chapter text (longer than two characters), or
             ``'TxtGetError'`` when the budget is exhausted, the content
             container is missing, or the text is too short
    '''
    # No attempts left: log the URL (one per line) and report failure.
    if end <= 0:
        print(url, "尝试连接无效!")
        print('Txt get error!')
        with open("log.txt", "a", encoding="utf-8") as f:
            f.write(url + "\n")
        return 'TxtGetError'

    bs = getBS(url, 5)
    if bs is None:
        # Download failed: wait, then retry with one fewer attempt left.
        print("连接错误!")
        time.sleep(random.randint(2, 5))
        return getText(url, end - 1)

    cur = bs.find("div", id="content")
    if cur is None:
        # The page loaded but has no content container.
        print("无法解析!")
        return 'TxtGetError'

    txt = cur.get_text()
    if len(txt) > 2:
        return txt
    # Content too short to be a real chapter; report failure explicitly
    # instead of falling through with an implicit None.
    return 'TxtGetError'

6. 定义一个将具体字符信息写入对应txt文件的方法

def createText(url):
    '''
    Download a slice of the catalogue's novels into ``books/<title>.txt``.

    The catalogue is fetched with ``getCatalogue(url, 3)``. Because the full
    catalogue is large, only the entries at positions 5-7 (inclusive,
    zero-based, in catalogue order) are downloaded. For each selected novel
    the chapter list is fetched with ``getChapter`` and every chapter's text
    is appended to the novel's file; chapters that cannot be fetched are
    reported and skipped.

    :param url: address of the catalogue page
    :return: None
    '''
    # Title -> book URL for every novel on the catalogue page.
    catas = getCatalogue(url, 3)
    if not catas:
        return

    # The catalogue is too large to download entirely; keep only the
    # entries whose position falls inside [start, end].
    start = 5
    end = 7
    dics = {
        title: book_url
        for n, (title, book_url) in enumerate(catas.items())
        if start <= n <= end
    }

    # Make sure the output directory exists before opening files in it.
    os.makedirs("books", exist_ok=True)

    for title, book_url in dics.items():
        # Chapter title -> chapter URL (up to 5 parse attempts).
        chaps = getChapter(book_url, 5)
        if not chaps:
            continue
        # os.path.join builds the path portably instead of hard-coding a
        # backslash separator.
        path = os.path.join("books", str(title) + ".txt")
        with open(path, "a", encoding="utf-8") as f:
            for c, chap_url in chaps.items():
                text = getText(chap_url, 5)
                # Treat both the error sentinel and an empty/None result
                # as a failed chapter so f.write never receives None.
                if not text or text == "TxtGetError":
                    print(c, "写入失败!")
                else:
                    f.write(text)
                    print(c, "写入成功!")
        print(str(title), ".txt 下载完成!")


# Script entry point: when run directly, pass the catalogue page URL to
# createText().
if __name__ == '__main__':
    createText("https://www.xbiquge.la/xiaoshuodaquan/")

小白篇—Python从网页抓取小说

Python相关栏目本月热门文章