# 代码均为原创,侵权立删。qq:1392516067
# 全部代码:
import requests
from lxml import etree
import os
import shutil
from time import sleep
count = 50  # default number of chapters to download; overwritten by user input in main()


def main():
    """Interactively download a novel: cover image, introduction and chapters.

    Prompts for the chapter count and the index-page URL, fetches and
    parses the page once, then delegates to the download helpers.
    """
    global count
    count = int(input('请输入下载章节数:'))
    # Example index page: http://www.cits0871.com/booktxt/44781/
    url0 = input('请输入下载网址url:')
    headers = {
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'
    }
    html = requests.get(url0, headers=headers).text
    # Parse the index page once and share the tree with every helper.
    root = etree.HTML(html)
    name = getName(root, headers)
    creFile(name)
    print(name)  # reuse the computed name instead of calling getName twice
    downImage(root, headers)
    downIntroduction(root)
    textname = getTextname(root)
    downText(root, headers, textname)
def creFile(Name, base_dir=r"D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining"):
    """Create a fresh output directory for the novel under *base_dir*.

    Args:
        Name: directory name ('<title>__<author> 著'); when None a generic
            folder 小说文档 is used instead.
        base_dir: parent directory (defaults to the original hard-coded
            Windows path; parameterized so callers/tests can redirect it).

    Side effect: changes the process working directory to *base_dir*.
    Any pre-existing folder of the same name is removed so every run
    starts from a clean download.
    """
    os.chdir(base_dir)
    folder = Name if Name is not None else "小说文档"
    if os.path.exists(folder):
        shutil.rmtree(folder)  # drop stale files from a previous run
    os.mkdir(folder)
def getName(root, headers):
bookName_xpath = '//div[@id = "info"]/h1/text()'
bookNameList = root.xpath(bookName_xpath)
author_xpath = '//div[@id = "info"]/p/text()'
authorList = root.xpath(author_xpath)
authorList[0] = str(authorList[0]).replace("作 者:","")
global Name
Name = bookNameList[0] + '__' + authorList[0]+' 著'
return Name
def downImage(root, headers):
    """Download the novel's cover image into the novel's folder as <Name>.jpg.

    Reads the module-global ``Name`` set by getName(); falls back to the
    generic 小说文档 folder when it is None. Streams the response so the
    image is written in small chunks.
    """
    imgs = root.xpath('//div[@id = "fmimg"]/img')
    img_url = imgs[0].attrib['src']
    print(img_url)
    base = r"D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining"
    folder = Name if Name is not None else "小说文档"
    os.chdir(os.path.join(base, folder))
    # Context manager ensures the connection is released even on error
    # (the original never closed the streamed response).
    with requests.get(img_url, headers=headers, stream=True) as resp:
        if resp.status_code == 200:
            with open(f'{Name}.jpg', 'wb') as fd:
                for chunk in resp.iter_content(chunk_size=128):
                    fd.write(chunk)
            print(f'图片{Name}下载完毕')
def downIntroduction(root):
    """Write 小说简介.txt for the current novel.

    First writes the introduction paragraphs, then re-reads the file via
    text_read(), appends the site's recommended-title list, and rewrites
    the file with one entry per line. Reads the module-global ``Name``.
    """
    # Raw f-string restores the Windows separators the paste mangled.
    full_path = rf'D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining\{Name}\小说简介.txt'
    states = root.xpath('//div[@id = "intro"]/p/text()')
    # NOTE: platform-default encoding on purpose — text_read() reads this
    # file back with the same default.
    with open(full_path, 'w') as f:
        for state in states:
            f.write(state + ' ')
    recommended = root.xpath('//div[@id = "listtj"]/a/text()')
    lines = text_read() + recommended
    with open(full_path, 'w') as f:
        for line in lines:
            # '\n' restores the newline the paste mangled into a bare 'n'.
            f.write('\n' + line)
def text_read(path=None):
    """Return the lines of 小说简介.txt as a list, newlines stripped.

    Args:
        path: file to read; defaults to the introduction file of the
            current novel (module-global ``Name``), preserving the
            original call signature.
    """
    if path is None:
        path = rf'D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining\{Name}\小说简介.txt'
    # 'with' guarantees the handle is closed (the original leaked it);
    # '\n' restores the newline the paste mangled into a bare 'n'.
    with open(path, 'r') as f:
        return [line.replace('\n', '') for line in f]
def getTextname(root):
    """Return the list of chapter titles from the index page's chapter list.

    The titles double as the per-chapter .txt file names used by downText().
    """
    titles = root.xpath('//div[@class = "box_con"]/div[@id = "list"]/dl/dd/a/text()')
    print(titles)
    return titles
    # (A dead commented-out block that pre-created per-chapter files was
    # removed; downText() opens each file itself.)
def downText(root, headers, Textname):
    """Download up to ``count`` chapters, writing one .txt file per chapter.

    Args:
        root: lxml tree of the index page (provides the chapter links).
        headers: HTTP headers forwarded to each chapter request.
        Textname: chapter titles (from getTextname) used as file names,
            aligned index-for-index with the chapter links.

    Reads the module-globals ``count`` (chapter limit) and ``Name``
    (output folder); updates the module-global ``ar_count``.
    """
    global ar_count
    links = root.xpath('//div[@id = "list"]/dl/dd/a')
    urls = ['http://www.cits0871.com' + a.attrib['href'] for a in links]
    print(urls)
    folder = rf"D:\Python\Pycharm\workspace\PycharmProjects\PythonTraining\{Name}"
    ar_count = 0
    # min() prevents indexing past the chapters actually on the page —
    # the original's "> page_count" check fired one iteration too late,
    # so url[i] raised IndexError whenever count exceeded len(urls).
    for i in range(min(count, len(urls))):
        # time.sleep(2)  # optionally throttle requests to the server
        html = requests.get(urls[i], headers=headers).text
        ar_root = etree.HTML(html)
        paragraphs = ar_root.xpath('//div[@id = "content"]/p/text()')
        file_path = os.path.join(folder, Textname[ar_count] + ".txt")
        ar_count += 1
        with open(file_path, 'w') as f:
            for p in paragraphs:
                # '\n' restores the newline the paste mangled into 'n'.
                f.write('\n' + p)
    print("小说下载完成")
# Run the interactive downloader only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()



