题目:获取虎扑步行街论坛上所有帖子的数据,内容包括帖子名称、帖子链接、作者、作者链接、创建时间、回复数、浏览数、最后回复用户和最后回复时间,网络地址为:https://bbs.hupu.com/bxj
使用MySQL作为数据存储器,完整代码如下:
import requests
from bs4 import BeautifulSoup
import pymysql
import time
# Browser-like User-Agent so the site serves the normal HTML page
# instead of rejecting the request as an obvious bot.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
# Accumulates one dict per scraped post; read later by get_mysql().
data_list = []
def get_info(url):
    """Scrape one Hupu BXJ list page and append one dict per post to data_list.

    Each dict carries the post link/title, author name/link, post time,
    and the reply/view counts parsed from the "replies / views" column.

    Args:
        url: URL of a BXJ board list page, e.g. https://bbs.hupu.com/bxj-2
    """
    # timeout keeps the scraper from hanging forever on a stalled connection
    html = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(html.text, 'lxml')
    names = soup.select('div > div.post-title > a')
    authors = soup.select('div > div.post-auth > a')
    times = soup.select('div > div.post-time')
    replys = soup.select('div > div.post-datum')
    for name, author, posttime, reply in zip(names, authors, times, replys):
        # Cell looks like "12 / 3456" (replies / views); parse it once,
        # and tolerate a cell with no '/' instead of raising IndexError.
        counts = reply.get_text().strip().split('/')
        data = {
            'nameik': 'https://bbs.hupu.com' + name['href'],
            'name': name.get_text().strip(),
            'author': author.get_text().strip(),
            'authorik': author['href'],
            'posttime': posttime.get_text().strip(),
            'reply': counts[0],
            'reading': counts[1] if len(counts) > 1 else '',
        }
        print(data)
        data_list.append(data)
'''
建表语句
CREATE TABLE `hupuss` (
`nameik` varchar(200) DEFAULT NULL,
`name` varchar(200) DEFAULT NULL,
`author` varchar(200) DEFAULT NULL,
`authorik` varchar(200) DEFAULT NULL,
`posttime` varchar(200) DEFAULT NULL,
`reply` varchar(200) DEFAULT NULL,
`reading` varchar(200) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8
'''
def get_sql(data_list):
    """Build a parameterized INSERT statement from the keys of the first row.

    The target table is `hupuss`, matching the CREATE TABLE statement above
    (the original inserted into `hupu`, which the DDL never creates).

    Args:
        data_list: non-empty list of dicts sharing the same keys.

    Returns:
        An INSERT statement with pyformat (%(key)s) placeholders, suitable
        for cursor.executemany(sql, data_list).

    Raises:
        ValueError: if data_list is empty (the original raised IndexError).
    """
    if not data_list:
        raise ValueError('data_list is empty; nothing to insert')
    keys = data_list[0].keys()
    cols = ', '.join('`{}`'.format(k) for k in keys)
    placeholders = ', '.join('%({})s'.format(k) for k in keys)
    return 'INSERT INTO hupuss ({}) VALUES ({})'.format(cols, placeholders)
def get_mysql():
    """Bulk-insert every scraped row in data_list into MySQL.

    Opens a short-lived connection, commits once after the batch insert,
    and always closes cursor and connection (the original leaked both).
    Does nothing when data_list is empty.
    """
    if not data_list:  # avoid building SQL from a nonexistent first row
        return
    conn = pymysql.connect(host='localhost', user='root', passwd='123456',
                           db='mydb', port=3306, charset='utf8')
    try:
        # pymysql cursors are context managers and close themselves here
        with conn.cursor() as cursor:
            cursor.executemany(get_sql(data_list), data_list)
        conn.commit()
    finally:
        conn.close()
if __name__ == '__main__':
    # List pages 0-10 of the BXJ board; sleep 2 s between requests to be
    # polite to the server, then flush everything to MySQL in one batch.
    urls = ['https://bbs.hupu.com/bxj-{}'.format(i) for i in range(0, 11)]
    for url in urls:
        get_info(url)
        time.sleep(2)
    get_mysql()
使用MongoDB存储数据的代码如下:
import requests
from bs4 import BeautifulSoup
import pymongo
import time
# MongoDB handle: database `mydb`, collection `hupu` on the default local port.
client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
hupustreet = mydb['hupu']
# Browser-like User-Agent so the site serves the normal HTML page
# instead of rejecting the request as an obvious bot.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
def get_info(url):
    """Scrape one Hupu BXJ list page and insert one MongoDB document per post.

    Document fields (Chinese keys kept as the storage schema): post link,
    post title, author, author link, post time, reply count, view count.

    Args:
        url: URL of a BXJ board list page, e.g. https://bbs.hupu.com/bxj-2
    """
    # timeout keeps the scraper from hanging forever on a stalled connection
    html = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(html.text, 'lxml')
    names = soup.select('div > div.post-title > a')
    authors = soup.select('div > div.post-auth > a')  # author name + profile link
    times = soup.select('div > div.post-time')
    replys = soup.select('div > div.post-datum')
    for name, author, posttime, reply in zip(names, authors, times, replys):
        # Cell looks like "12 / 3456" (replies / views); parse it once,
        # and tolerate a cell with no '/' instead of raising IndexError.
        counts = reply.get_text().strip().split('/')
        data = {
            '帖子链接': 'https://bbs.hupu.com' + name['href'],
            '帖子名称': name.get_text().strip(),
            '作者': author.get_text().strip(),
            '作者链接': author['href'],
            '创建时间': posttime.get_text().strip(),
            '回复数': counts[0],
            '浏览数': counts[1] if len(counts) > 1 else '',
        }
        print(data)
        hupustreet.insert_one(data)
if __name__ == '__main__':
    # List pages 0-10 of the BXJ board; sleep 2 s between requests to be
    # polite to the server. Rows are inserted into MongoDB as they are parsed.
    urls = ['https://bbs.hupu.com/bxj-{}'.format(i) for i in range(0, 11)]
    for url in urls:
        get_info(url)
        time.sleep(2)



