import requests
import console # 颜色打印需要的
import re
import os
from bs4 import BeautifulSoup, NavigableString
from concurrent.futures import ThreadPoolExecutor
# Destination directory for converted posts; change as desired.
download_path = os.path.join(os.path.dirname(__file__), '博客园随笔md格式')
# Create it on first run; makedirs(exist_ok=True) avoids the
# exists()+mkdir() race of the original.
os.makedirs(download_path, exist_ok=True)

# Browser-like User-Agent so cnblogs serves the normal page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
}

# Password POSTed for protected posts on the "xiaoyuanqujing" blog;
# harmless for unprotected posts.
blog_data = {
    "tb_password": 'xiaoyuanqujing@666'
}
def remove_div(body):
    """Strip every <div> wrapper, keeping its children in place."""
    div = body.div
    while div is not None:
        div.unwrap()
        div = body.div
def md_toc(body):
    """Swap the table-of-contents element for a Markdown [TOC] marker."""
    toc_node = body.select_one(".toc")
    if toc_node is not None:
        toc_node.replace_with('[TOC]')
def md_h(body):
    """Replace <h1>..<h6> tags with '#'-prefixed Markdown headings.

    Fix: the tag-name regex had lost its backslash (``d{1}`` instead of
    ``\\d``), so it matched no heading tag at all.
    """
    for heading in body.find_all(re.compile(r'^h(\d)')):
        level = int(heading.name[1])  # 'h3' -> 3
        heading.replace_with('#' * level + ' ' + heading.get_text())
def md_table(body):
    """Convert each HTML <table> into a Markdown pipe table.

    Column alignment comes from the header cells' inline ``style``
    (left / right / default center). Fixes vs. original: restored the
    mangled '\\n' separators, stopped re-emitting the header row as a
    data row when the table has no <thead>, and no longer shadows the
    ``body`` parameter.
    """
    for table in body.find_all('table'):
        header_row = table.select_one('thead > tr')
        data_rows = table.select('tbody > tr')
        if not header_row:  # malformed table without a <thead>
            header_row, data_rows = data_rows[0], data_rows[1:]
        head = mid = '|'
        for cell in header_row.find_all(re.compile('th|td')):
            style = cell.get('style') or 'center'
            head += ' ' + cell.get_text() + ' |'
            if 'left' in style:
                mid += ' :--- |'
            elif 'right' in style:
                mid += ' ---: |'
            else:
                mid += ' :--: |'
        md_text = head + '\n' + mid + '\n'
        for tr in data_rows:
            row = "|"
            for td in tr.find_all('td'):
                # cell alignment is set by the separator row, so data
                # cells render identically regardless of their style
                row += ' ' + td.get_text() + ' |'
            md_text += row + '\n'
        table.replace_with(md_text)
def md_pre(body):
    """Convert <pre> / .cnblogs_code blocks into fenced Markdown code blocks.

    The fence language is taken from the element's class (default
    'python'; 'css' for cnblogs_code wrappers). Hidden code blocks get a
    bold-italic title line, since Markdown cannot fold code. Fixes vs.
    original: restored the backslash line-continuation that had been
    lost (the body of ``content`` was a stray unary-``+`` expression)
    and the mangled '\\n' escapes; dropped the no-op
    ``replace('```', '```')``.
    """
    for pre in body.find_all('pre') + body.select('.cnblogs_code'):
        code_type = pre.get('class')  # determines the fence language
        code_type = code_type if code_type else ['python']  # default python
        pre_replace = pre.parent
        hide_sta = ''
        if pre_replace:  # later nodes may already have been replaced (None)
            if not pre_replace.get('class'):  # embedded code block
                pre_replace = pre
            if code_type and 'cnblogs_code' in code_type:  # non-<pre> blocks
                code_type = ['css']  # arbitrary choice, adjust as desired
                pre_replace = pre
            if '```' in pre.text:  # guard against already-fenced nesting
                pre.replace_with(pre.text)
                continue
            # "hidden" code blocks (Markdown cannot actually hide code)
            elif pre_replace.get('class') and pre_replace.get('class')[0] == 'cnblogs_code_hide':
                pre_replace = pre_replace.parent
                # the wrapper's last element holds the fold title
                pre_div = pre_replace.find_all(True)[-1]
                pre_title = pre_div.string.lstrip()
                # keep a bold-italic title line unless it's the default
                hide_sta = '' if pre_title == 'View Code' else f' ***{pre_title}***\n'
            else:
                # replace the cnblogs_code wrapper itself, otherwise just <pre>
                if not pre_replace.get('class') or pre_replace.get('class')[0] != 'cnblogs_code':
                    pre_replace = pre
            # fenced code block body
            content = (f'```{code_type[0] if code_type else ""}\n'
                       + pre.get_text().strip() + '\n```')
            if hide_sta:  # only set for hidden code blocks
                content = hide_sta + content
            if pre_replace in body:  # still attached to the document tree?
                pre_replace.replace_with(content)
def md_coed(body):
    """Wrap standalone <code> elements (not inside <pre>) in backticks."""
    for code in body.find_all('code'):
        if code.parent.name != 'pre':
            code.replace_with(f"`{code.get_text()}`")
def md_img(body):
    """Replace <img> tags with a Markdown placeholder pointing at the source URL."""
    for img in body.find_all('img'):
        # use .get('src') — attribute access (img.src) searches for a
        # child tag named 'src' and returns None
        src = img.get('src')
        img.replace_with(f"[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-bc9XYMra-1636710221157)({src})]")
def md_link(body):
    """Rewrite <a> anchors as Markdown [text](href) links."""
    for anchor in body.find_all('a'):
        # .get('href'), not anchor.href — attribute access returns None
        # (same pitfall as img.src above)
        target = anchor.get('href')
        anchor.replace_with(f"[{anchor.text}]({target})")
def md_b_i(body):
    """Convert <strong>/<em> to **bold** / *italic* markers.

    Tags directly inside a heading are skipped — md_h already folded
    headings to plain text. Fix: the heading regex had lost its
    backslash (``d{1}``), so the parent check never matched.
    """
    heading_re = re.compile(r'^h(\d)')
    for tag in body.find_all(re.compile('strong|em')):
        marker = '**' if tag.name == 'strong' else '*'
        if not heading_re.match(tag.parent.name):
            tag.replace_with(marker + tag.get_text() + marker)
def md_ol_ul(body):
    """Convert <ul>/<ol> lists to '* ' bullets or '1. ' numbered items.

    Fix: the empty-list branch called the nonexistent ``unwarp()`` —
    BeautifulSoup attribute lookup returns None for unknown children, so
    this raised TypeError at runtime; corrected to ``unwrap()``.
    """
    for lst in body.find_all(re.compile('^ul|ol')):
        # string prefix for bullets, running counter for numbered lists
        start = "* " if lst.name == 'ul' else 1
        items = lst.find_all('li')
        if items:
            for li in items:
                if isinstance(start, str):
                    li.replace_with(start + li.get_text().lstrip())
                else:
                    li.replace_with(f"{start}. " + li.get_text().lstrip())
                    start += 1
            lst.replace_with(lst.get_text())
        else:
            if lst.li:
                lst.li.unwrap()
def md_p(body):
    """Unwrap every <span> and <p>, flattening their text into the parent."""
    for tag_name in ('span', 'p'):
        while getattr(body, tag_name) is not None:
            getattr(body, tag_name).unwrap()
def to_md(body):
    """Render one top-level node of the transformed tree to Markdown text.

    Fixes vs. original: restored the mangled '\\n' escapes and the
    HTML-entity arguments of ``replace`` (the source had been
    entity-decoded, turning ``replace('&lt;', '<')`` into a no-op).
    """
    item_name = body.name
    if not isinstance(body, NavigableString):
        # strip leftover attributes; mostly cosmetic for the output
        body.attrs.clear()
    if item_name == 'br':  # hard line break
        return ' \n'
    if item_name == 'hr':
        return '\n----\n'
    else:
        # str() re-escapes text; undo the angle-bracket entities
        return str(body).replace('&lt;', '<').replace('&gt;', '>')
def transform(body):
    """Run every HTML→Markdown pass in sequence.

    The order is deliberate (e.g. tables and code blocks must be
    converted before divs are unwrapped) — do not reorder.
    """
    passes = (md_toc, md_h, md_table, md_pre, remove_div,
              md_img, md_link, md_coed, md_b_i, md_ol_ul, md_p)
    for step in passes:
        step(body)
def get_right_name(name):
    """Sanitize *name* into a legal Windows filename and de-duplicate it.

    All whitespace is removed (spaces break saving on some setups),
    characters Windows forbids in filenames are replaced with '_', and a
    name that already exists gets a ``(n)`` suffix. Fix: the forbidden
    set had lost its backslash during mangling; ``\\`` is restored.
    """
    name = ''.join(name.split())  # drop all whitespace
    for ch in '\\/:*?"<>|':  # characters Windows forbids in filenames
        if ch in name:
            name = name.replace(ch, '_')
    # NOTE(review): existence is checked relative to the CWD, not the
    # download directory, so duplicates there go undetected — TODO confirm
    # whether that matches the intended behavior.
    if os.path.exists(name):
        count = 1
        while os.path.exists('%s(%s)' % (name, count)):
            count += 1
        return '%s(%s)' % (name, count)
    return name
def get_one_md(url='', down_path=download_path):
    """Download one cnblogs post and save it as Markdown into *down_path*.

    Prompts for a URL when none is given; follows the password form of
    doubly protected posts once. Returns False on empty input or a
    non-200 response. Fixes vs. original: restored the string literals
    broken by lost line-continuations and the mangled '\\033'/'\\n'
    escapes.
    """
    if not url:
        url = input("请粘贴要转化的博客地址:\n\t如:https://www.cnblogs.com/Du704/p/11270106.html\n\t>> ").strip()
        if not url:
            print('\033[1;31m输入信息不能为空!\033[0m')
            return False
    print(f"正在读取... {url} ")
    # POST carries the blog password; harmless for unprotected posts
    res = requests.post(url, data=blog_data, headers=headers)
    if res.status_code != 200:
        print(f"{url} 请求失败!")
        return False
    soup = BeautifulSoup(res.text, 'lxml')
    # some posts require a second password validation form
    if '博文阅读密码验证' in soup.select_one('head > title').get_text():
        url = ('https://www.cnblogs.com'
               + soup.select_one('body > form').get('action'))
        return get_one_md(url, down_path)
    # post title; sanitized because it becomes the file name
    blog_name = soup.select_one('#cb_post_title_url').get_text().strip()
    blog_name = get_right_name(blog_name) + '.md'
    # For offline debugging, the raw HTML can be dumped here:
    # with open(os.path.join(download_path, blog_name + '.html'), 'w', encoding='utf-8') as f:
    #     f.write(soup.prettify())
    body = soup.select_one('#cnblogs_post_body')
    try:
        # transform the HTML tree to Markdown in place
        transform(body)
    except Exception as e:
        print(f"\033[31m解析{url}失败,\033[35m该博客可能包含无法解析的内容!\033[0m")
        print("异常信息:", e)
    try:
        with open(os.path.join(down_path, blog_name), 'w', encoding='utf-8') as f:
            for node in body:
                f.write(to_md(node))
        print(f"\n\033[1;36m{blog_name}\033[33m下载完成!\033[0m\n\n")
    except Exception as e:
        print(f"保存时异常终止,可能是文件名出错:\033[31m{blog_name}\033[0m\n\t请查看是否有特殊字符!")
        print("异常信息:", e)
def get_all_url(url=None):
    """Collect all cnblogs article links from an index post.

    Returns ``(all_urls, dir_name)`` where *all_urls* starts with the
    index URL itself and *dir_name* is a freshly created subdirectory of
    the download path; returns False on empty input or a non-200
    response. Fixes vs. original: restored the broken input-prompt
    string and '\\033' escapes, and escaped the regex dots.
    """
    if not url:
        url = input("请粘贴要群爬的博客地址:\n\t如:https://www.cnblogs.com/Du704/p/11270106.html\n\t>> ").strip()
        if not url:
            print('\033[1;31m输入信息不能为空!\033[0m')
            return False
    all_urls = [url]
    res = requests.get(url, headers=headers)
    if res.status_code != 200:
        print(f"{url} 请求失败!")
        return False
    soup = BeautifulSoup(res.text, 'lxml')
    # the index post's title names the output directory
    dir_name = soup.select_one('#cb_post_title_url').get_text().strip()
    dir_name = os.path.join(download_path, get_right_name(dir_name))
    os.makedirs(dir_name, exist_ok=True)
    # some posts nest the body element twice
    body = (soup.select_one("#cnblogs_post_body > #cnblogs_post_body")
            or soup.select_one('#cnblogs_post_body'))
    # cnblogs uses both spellings of the target attribute
    links = (body.select('a[target="-Blank"]')
             or body.select('a[target="_blank"]'))
    href_re = re.compile(r'href="(.*www\.cnblogs\.com.*)" target="_blank')
    for anchor in links:
        found = href_re.findall(str(anchor))
        if found:
            all_urls.extend(found)
    return all_urls, dir_name
def get_result(future):
    """Done-callback for download futures; intentionally a no-op for now."""
    pass
def get_all_of_one():
    """Crawl every article linked from one index page, 4 downloads at a time.

    Fix: ``get_all_url`` returns False on bad input or request failure;
    the original unconditionally tuple-unpacked it and crashed with
    TypeError. Now we bail out early instead.
    """
    result = get_all_url()
    if not result:  # empty input or failed request
        return False
    urls, target_dir = result
    with ThreadPoolExecutor(max_workers=4) as pool:
        for href in urls:
            future = pool.submit(get_one_md, href, target_dir)
            future.add_done_callback(get_result)
# Menu: option key -> [description, handler]
func_dict = {
    '1': ['爬单页面', get_one_md],
    '2': ['爬一个网页链接下的所有文章', get_all_of_one],
    'q': ['退出', ]  # no handler: 'q' is special-cased below
}

if __name__ == '__main__':
    # fix: restored the mangled '\033[' ANSI color escapes
    print("\033[1;31m这是转换博客园文章为md格式的小工具,请勿用于非法用途!!\033[0m")
    while 1:
        for key, info in func_dict.items():
            print(key, info[0])
        choice = input('选择功能:')
        if choice.lower() == 'q':
            exit('Bye~')
        elif choice in func_dict:
            func_dict[choice][1]()
        else:
            print("输入有误")