栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

华侨大学实验室安全培训与考试系统题库爬取

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

华侨大学实验室安全培训与考试系统题库爬取

http://labsafety.hqu.edu.cn/
其实思路很简单
因为不用登入,所以简简单单

先把这几个的地址爬了

然后爬取页码
然后每一页爬就好了

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import re

import bs4
import requests

# Regex that pulls the total page count out of the pager text
# (matches "...当前第 <current>/<total> 页" and captures <total>)
patten = re.compile('题  当前第 \d+/(\d+) 页', re.DOTALL)
# Browser-like request headers so the site serves normal pages
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
    # browser identity string
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # content types the client can accept
    'Accept-Language': 'zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7',  # languages the client accepts
    'Connection': 'keep-alive'  # whether to keep the connection alive
}
# Path of the output file the scraped questions are written to
write_path = 'test.txt'


# 问题类
# One quiz question scraped from the site.
class Question:
    def __init__(self):
        self.id = 0  # question number parsed from the question text
        self.question = ''  # question text, e.g. "1、..."
        self.options = []  # answer options (per-instance list)
        self.answer = ''  # correct answer text, whitespace removed

    def set_answer(self, text):
        """Store *text* as the answer with ALL whitespace stripped out."""
        text = "".join(text.split())
        self.answer = text

    def set_question(self, text):
        """Store the question text and parse the leading number as the id.

        The site formats questions as "<id>、<text>", so the id is the part
        before the Chinese enumeration comma.
        """
        text = text.strip()
        self.question = text
        self.id = int(self.question.split('、')[0])

    def set_options(self, text):
        """Append one stripped option line."""
        text = text.strip()
        self.options.append(text)

    def __str__(self):
        # BUG FIX: the original joined with the literal letter 'n' (a lost
        # backslash from web extraction); use real newlines instead.
        return self.question + '\n' + '\n'.join(self.options) + '\n' + self.answer


# Accumulates every Question scraped from all question banks
all_questions = []


def get_content_by_url(url):
    """
    Fetch *url* and return the parsed page.

    :param url: absolute URL of the page to fetch
    :return: BeautifulSoup tree of the GBK-decoded response body
    :raises Exception: if the server does not answer with HTTP 200
    """
    # FIX: the original created a requests.session() per call and never
    # closed it, leaking the connection pool; a plain GET is enough here.
    response = requests.get(url, headers=HEADER)
    if response.status_code == 200:
        # The site serves GBK-encoded pages; decode explicitly before parsing.
        return bs4.BeautifulSoup(response.content.decode("gbk"), "lxml")
    raise Exception(url + ' FAILED')


def get_all_urls():
    """
    Scrape the links to every question-bank page.

    :return: list of absolute question-bank URLs
    """
    base_url = 'http://labsafety.hqu.edu.cn/'
    url = 'http://labsafety.hqu.edu.cn/redir.php?catalog_id=121'
    # The last 'mainLeftContent' container on the index page holds the
    # anchors that point at the individual question banks.
    page = get_content_by_url(url)
    container = page.find_all(attrs={'class': 'mainLeftContent'})[-1]
    return [base_url + anchor['href'] for anchor in container.find_all('a')]


def get_questions(url):
    """
    Scrape every question from one question bank into ``all_questions``.

    :param url: base URL of the question bank; ``&page=N`` is appended
    """
    page = 1
    total_page = 1
    first = True
    while page <= total_page:
        content = get_content_by_url(url + '&page={}'.format(page))
        # On the first page only, read the total page count from the pager.
        if first:
            first = False
            total_page = int(patten.search(content.find(attrs={'class': 'fy'}).text).group(1))
            print('总共{}页'.format(total_page))
        print('目前第{}页'.format(page))
        element = content.find(id='shiti-content')
        question = None
        flag = True  # question block and answer block alternate
        for child in element.children:
            if not isinstance(child, bs4.element.Tag):
                continue
            if flag:
                question = Question()
                question.set_question(child.find(name='h3').text)
                for option in child.find(name='ul').children:
                    # BUG FIX: the original tested `child` here (always a Tag
                    # at this point), so whitespace text nodes between <li>
                    # elements were never skipped and leaked into the options.
                    if not isinstance(option, bs4.element.Tag):
                        continue
                    question.set_options(option.text)
            else:
                question.set_answer(child.text)
                all_questions.append(question)
            flag = not flag
        page += 1


if __name__ == '__main__':
    all_urls = get_all_urls()

    for url in all_urls:
        print('正在爬取' + url)
        get_questions(url)
    # Dump every scraped question to the output file.
    with open(write_path, 'w', encoding='utf-8') as f:
        for question in all_questions:
            # BUG FIX: the original appended the literal letter 'n' (a lost
            # backslash from web extraction); write a real newline.
            f.write(str(question) + '\n')

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/280555.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号