http://labsafety.hqu.edu.cn/
其实思路很简单
因为不用登入,所以简简单单
先把这几个的地址爬了
然后爬取页码
然后每一页爬就好了
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import re
import bs4
import requests
# Regex that extracts the total page count from the pager text
# ("... 当前第 X/Y 页" -> captures Y).
# FIX: use a raw string so '\d' is a regex escape, not a (deprecated)
# string escape. The misspelled name `patten` is kept because other
# code in this file references it.
patten = re.compile(r'题 当前第 \d+/(\d+) 页', re.DOTALL)
# Browser-like request headers so the server treats us as a normal browser.
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
    # Content types the client can accept
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-GB;q=0.8,en;q=0.7',  # acceptable languages
    'Connection': 'keep-alive'  # reuse the TCP connection between requests
}
# Path of the output file the scraped questions are written to.
write_path = 'test.txt'
# 问题类
class Question:
    """One quiz question: numeric id, prompt text, options, and answer."""

    def __init__(self):
        self.id = 0          # question number parsed from the prompt
        self.question = ''   # prompt text
        self.options = []    # answer options, in page order
        self.answer = ''     # answer text with all whitespace removed

    def set_answer(self, text):
        # Remove ALL whitespace (including internal runs), not just the ends.
        self.answer = "".join(text.split())

    def set_question(self, text):
        self.question = text.strip()
        # Prompts look like "<number>、<text>" — parse the number as the id.
        self.id = int(self.question.split('、')[0])

    def set_options(self, text):
        self.options.append(text.strip())

    def __str__(self):
        # BUG FIX: the original concatenated the literal character 'n'
        # instead of the newline escape '\n', producing one mangled line.
        return self.question + '\n' + '\n'.join(self.options) + '\n' + self.answer
# Accumulator for every Question scraped across all banks and pages.
all_questions = []
def get_content_by_url(url):
    """Fetch *url* and return the parsed page.

    :param url: page address to fetch
    :return: BeautifulSoup document (GBK-decoded, lxml parser)
    :raises Exception: when the server does not answer with HTTP 200
    """
    response = requests.session().get(url, headers=HEADER)
    # Guard clause: anything but a plain 200 is treated as a failure.
    if response.status_code != 200:
        raise Exception(url + ' FAILED')
    html = response.content.decode("gbk")
    return bs4.BeautifulSoup(html, "lxml")
def get_all_urls():
    """Scrape the index page and collect the address of every question bank.

    :return: list of absolute question-bank URLs
    """
    base_url = 'http://labsafety.hqu.edu.cn/'
    url = 'http://labsafety.hqu.edu.cn/redir.php?catalog_id=121'
    page = get_content_by_url(url)
    # The last 'mainLeftContent' container holds the links to the banks.
    container = page.find_all(attrs={'class': 'mainLeftContent'})[-1]
    return [base_url + link['href'] for link in container.find_all('a')]
def get_questions(url):
    """Scrape every page of one question bank and append the parsed
    questions to the module-level ``all_questions`` list.

    :param url: question-bank URL; pagination is appended as ``&page=N``
    """
    page = 1
    total_page = 1
    first = True
    while page <= total_page:
        content = get_content_by_url(url + '&page={}'.format(page))
        # On the first page only, read the real page count from the pager.
        if first:
            first = False
            total_page = int(patten.search(content.find(attrs={'class': 'fy'}).text).group(1))
            print('总共{}页'.format(total_page))
        print('目前第{}页'.format(page))
        element = content.find(id='shiti-content')
        question = None
        flag = True  # question block and answer block alternate in the DOM
        for child in element.children:
            if not isinstance(child, bs4.element.Tag):
                continue
            if flag:
                question = Question()
                question.set_question(child.find(name='h3').text)
                for option in child.find(name='ul').children:
                    # BUG FIX: the original tested `child` here, which is
                    # always a Tag at this point, so whitespace text nodes
                    # were appended as bogus options. Test `option` instead.
                    if not isinstance(option, bs4.element.Tag):
                        continue
                    question.set_options(option.text)
            else:
                question.set_answer(child.text)
                all_questions.append(question)
            flag = not flag
        page += 1
if __name__ == '__main__':
    # Crawl every question bank, then dump all questions to the output file.
    all_urls = get_all_urls()
    for url in all_urls:
        print('正在爬取' + url)
        get_questions(url)
    with open(write_path, 'w', encoding='utf-8') as f:
        for question in all_questions:
            # BUG FIX: the original wrote the literal character 'n' instead
            # of the newline escape '\n' after each question.
            f.write(str(question) + '\n')



