一、明确需求
爬取网站内的小说名
小说内容
二、代码讲解。下面根据代码,由浅入深给大家讲解分析一遍。
`# -*- coding: utf-8 -*-`,开头的这个是设置编码为 utf-8,写在开头,防止乱码。
然后下面 import就是导入一些库,做做准备工作
import os
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import SQL
import pinyin
from bs4 import BeautifulSoup
from pymysql import *
首先我们要伪装成一个浏览器,再去访问我们需要爬取的网站
百度百科:
User Agent中文名为用户代理,简称 UA,它是一个特殊字符串头,使得服务器能够识别客户使用的操作系统及版本、CPU 类型、浏览器及版本、浏览器渲染引擎、浏览器语言、浏览器插件等。
以Chrome浏览器为例,在浏览器地址栏输入 chrome://version 并回车
可以看到,浏览器User-Agent为Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36
下面是详细的代码
# Pretend to be a real browser so the site does not reject the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
}
# Chapter-index URL template; the %s placeholder is filled with the
# pinyin transcription of the novel name below.
url = 'https://www.shicimingju.com/book/%s.html'
name_p = pinyin.pinyin(name)
我们先建立起一个数据表用来保存一会我们要爬取的数据
# Create one table per novel: a chapter-title column and a body column.
# NOTE(review): the table name is interpolated straight into the SQL
# string — injection-prone when *name* comes from user input.
sql = "CREATE TABLE `novel`.`{}`( `section` TEXT(100) , `article` TEXT(10000) );".format(
    name
)
SQL.doSql(sql)
现在我们要开始爬取网页上我们需要的数据了,并且把爬取到的数据存入到我们所建立起的数据库里
# Walk the chapter list, fetch each chapter page, and store it in the DB.
for li in li_list:
    title = li.a.string
    detail_url = 'https://www.shicimingju.com'+li.a['href']
    detail_page_text = requests.get(url=detail_url,headers=headers)
    # Force utf-8 so Chinese text is decoded correctly.
    detail_page_text.encoding = 'utf-8'
    html = detail_page_text
    detail_page_text = detail_page_text.text
    detail_soup = BeautifulSoup(detail_page_text,'lxml')
    # The chapter body lives in <div class="chapter_content"> on this site.
    div_tag = detail_soup.find('div',class_='chapter_content')
    content = div_tag.text
    # NOTE(review): title/content interpolated directly into SQL —
    # injection-prone, and a single quote in the text breaks the statement.
    sql = "INSERT INTO `novel`.`{}`(`section`,`article`) VALUES ( '{}','{}');".format(
        name,title,content
    )
    SQL.doSql(sql)
    print(title,'爬取成功!!!! ')
最后附上我们的完整代码
import os
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import SQL
import pinyin
from bs4 import BeautifulSoup
from pymysql import *
# Connect to the local MySQL `novel` database.
# NOTE(review): empty root password — acceptable for a local demo only.
conn = connect(host='localhost',user='root',password='',db='novel',charset='utf8')
cs1 = conn.cursor()
# cur = connect.cursor()
# cur.execute()
# Generate the novel's text files from the database.
def book(name):
    """Export every chapter of the novel *name* from the DB into ./text/<name>/.

    One .txt file is written per chapter, named "<index>_<section>.txt".
    Relies on the module-level cursor ``cs1``.
    """
    novel_path = os.path.join('./text', name)
    # exist_ok=True: the original os.mkdir() crashed when the folder
    # already existed (e.g. when re-exporting a novel).
    os.makedirs(novel_path, exist_ok=True)
    count = cs1.execute('SELECt section,article FROM `{}`;'.format(name))
    for i in range(count):
        section, article = cs1.fetchone()
        title_path = os.path.join(novel_path, '{}_{}.txt'.format(i, section))
        with open(title_path, 'w', encoding='utf-8') as fp:
            fp.write(article)
# Which novels exist under ./text
def path():
    """Return the list of novel folder names directly under ./text.

    Bug fixed: the original bound the result to a variable named ``list``
    (shadowing the builtin); when ./text did not exist the loop body never
    ran, so the *builtin* ``list`` class was returned instead of a list.
    Now a missing directory yields an empty list.
    """
    dir_path = './text'
    for _root, dirs, _files in os.walk(dir_path):
        # First iteration of os.walk gives the top-level directories.
        return dirs
    return []
# Word cloud
def wc(word, name):
    """Draw a word cloud of the section titles whose text contains *word*.

    For every chapter of the novel *name*, the article is segmented with
    jieba; each occurrence of the keyword *word* appends that chapter's
    section title, so titles where the keyword is frequent appear larger.
    Prompts for a new keyword and retries when nothing matches.
    Relies on the module-level cursor ``cs1``.
    """
    count = cs1.execute('SELECt section,article FROM `{}`;'.format(name))
    content_num = []
    for _ in range(count):
        section, article = cs1.fetchone()
        # Bug fixed: the original loop variable shadowed the *word*
        # parameter and never compared against it, so every token of every
        # chapter was counted regardless of the keyword.
        for token in jieba.lcut(article):
            if token == word:
                content_num.append(section)
    cut_text = "".join(content_num)
    # Bug fixed: the original compared against an undefined name ``NULL``
    # (NameError), and the retry rebound *name* instead of *word*.
    if not cut_text:
        word = input('请重新输入关键词:')
        wc(word, name)
        # Without this return, WordCloud below would raise on empty text
        # after the recursive retry finished.
        return
    wordcloud = WordCloud(
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color="white",
        width=1000,
        height=800
    ).generate(cut_text)
    plt.imshow(wordcloud)
    plt.show()
# Open the novel's folder
def sel(name_r):
    """Open ./text/<name_r> in the system file browser.

    Bug fixed: the original literal ``'.\\text\\'`` ended with a backslash
    that escaped the closing quote — a SyntaxError. The path is now built
    with os.path.join. Note os.startfile is Windows-only.
    """
    path_r = os.path.join('.', 'text', name_r)
    os.startfile(path_r)
if __name__ == "__main__":
    name = input('请输入小说名字:')
    # One DB table exists per novel; list them to see what is already crawled.
    count = cs1.execute('SHOW TABLES FROM novel;')
    content_list = []
    # Turn each fetched 1-tuple into a bare table name.
    for i in range(count):
        result = str(cs1.fetchone())
        # fetchone() stringified looks like "('title',)"; slice off the
        # "('" prefix and "',)" suffix to recover the name.
        result = result[2:-3]
        content_list.append(result)
    # If this novel was already crawled, offer the read/word-cloud menu;
    # otherwise crawl it from the site.
    if (name in content_list):
        for i in content_list:
            print(i)
        while True :
            name_r = input("选择您要读的书籍:")
            if (name_r in content_list):
                break;
            else:
                print("输入有误请重新输入")
        word = input("关键词:")
        wc(word,name_r)
        if (name_r in path()):
            sel(name_r)
        else:
            # NOTE(review): this branch uses *name* while the user chose
            # *name_r* above — looks like a bug; confirm intent.
            book(name)
            sel(name)
    else:
        # Pretend to be a real browser so the site accepts the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
        }
        # %s is filled with the pinyin transcription of the novel name.
        url = 'https://www.shicimingju.com/book/%s.html'
        name_p = pinyin.pinyin(name)
        url = format(url%name_p)
        page_text = requests.get(url=url,headers=headers)
        page_text.encoding = 'utf-8'
        html = page_text
        page_text = page_text.text
        soup = BeautifulSoup(page_text, 'lxml')
        # Each <li> under .book-mulu > ul is one chapter link on this site.
        li_list = soup.select('.book-mulu > ul > li')
        # NOTE(review): table name and, below, title/content are interpolated
        # directly into SQL — injection-prone; parameterize if hardened.
        sql = "CREATE TABLE `novel`.`{}`( `section` TEXT(100) , `article` TEXT(10000) );".format(
            name
        )
        SQL.doSql(sql)
        # Fetch every chapter page and store title + body in the new table.
        for li in li_list:
            title = li.a.string
            detail_url = 'https://www.shicimingju.com'+li.a['href']
            detail_page_text = requests.get(url=detail_url,headers=headers)
            detail_page_text.encoding = 'utf-8'
            html = detail_page_text
            detail_page_text = detail_page_text.text
            detail_soup = BeautifulSoup(detail_page_text,'lxml')
            div_tag = detail_soup.find('div',class_='chapter_content')
            content = div_tag.text
            sql = "INSERT INTO `novel`.`{}`(`section`,`article`) VALUES ( '{}','{}');".format(
                name,title,content
            )
            SQL.doSql(sql)
            print(title,'爬取成功!!!! ')
        print("$$$$$$爬取结束!!!$$$$$$")
        word = input("关键词:")
        wc(word, name)
        if (name in path()):
            sel(name)
        else:
            book(name)
            sel(name)
# Release the cursor and the DB connection on exit.
cs1.close()
conn.close()
完整代码里还有一些其他的功能,有感兴趣的小伙伴可以自己研究研究!



