- 爬虫定义:网络爬虫(又称为网页蜘蛛,网络机器人,在FOAF社区中间,更经常的称为网页追逐者),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁、自动索引、模拟程序或者蠕虫。(来源:百度百科——爬虫)
- 爬取原理:万维网由许多的url构成,犹如一张巨大的网。爬虫通过url发送请求获取数据(html),最后解析并提取所需数据即可。
- 实战演示:获取简书的用户、文章、评论、图片等数据,并存入数据库。
(1)网站分析
1)获取文章url列表
2)获取文章信息
3)获取评论信息
4)获取用户信息
(2)编码实现
1)bean
# 用户:用户ID、用户名、密码、性别、年龄、地址、电话、头像、邮箱、关注总数、粉丝总数、文章总数、总字数、喜欢总数、余额、简介、注册时间。
class User(object):
    """Plain record describing one user.

    The constructor stores every argument on the instance under the same
    name. ``user_id`` and ``create_time`` are not constructor arguments;
    they start as ``None`` and this class never assigns them.
    """

    def __init__(self, name, password, sex, age, address, tel, head_image, email, followers_count, fans_count,
                 articles_count, words_count, likes_count, balance, profile):
        self.user_id = None
        self.name = name
        self.password = password
        self.sex = sex
        self.age = age
        self.address = address
        self.tel = tel
        self.head_image = head_image
        self.email = email
        self.followers_count = followers_count
        self.fans_count = fans_count
        self.articles_count = articles_count
        self.words_count = words_count
        self.likes_count = likes_count
        self.balance = balance
        self.profile = profile
        self.create_time = None

    def __str__(self):
        """Return a multi-line ``label:value`` summary, one field per line.

        The text is returned rather than printed: ``__str__`` must return a
        string, otherwise ``str(user)`` and ``print(user)`` raise TypeError.
        """
        fields = (
            ("id", self.user_id),
            ("sex", self.sex),
            ("age", self.age),
            ("address", self.address),
            ("tel", self.tel),
            ("head_image", self.head_image),
            ("email", self.email),
            ("followers_count", self.followers_count),
            ("fans_count", self.fans_count),
            ("articles_count", self.articles_count),
            ("words_count", self.words_count),
            ("likes_count", self.likes_count),
            ("balance", self.balance),
            ("profile", self.profile),
        )
        return "\n".join(label + ":" + str(value) for label, value in fields)
# 图书分类:图书分类ID、分类名、描述、文章总数、粉丝总数。
class Classify(object):
    """Plain record describing one category.

    Each constructor argument is stored unchanged on the instance under
    the same name.
    """

    def __init__(self, classify_id, name, description, articles_count, fans_count, article_url):
        # Identification and descriptive text.
        self.classify_id, self.name, self.description = classify_id, name, description
        # Counters and the associated URL.
        self.articles_count, self.fans_count, self.article_url = articles_count, fans_count, article_url
# 图书:图书ID、分类ID、作者ID、图书标题、图书简要、图书内容、总字数、浏览数、点赞数、评论数、赞赏数、发布时间。
class Book(object):
    """Plain record describing one article ("book").

    The constructor stores every argument on the instance under the same
    name. ``author_id``, ``description`` and ``create_time`` are not
    constructor arguments; they start as ``None`` and this class never
    assigns them.
    """

    def __init__(self, book_id, classify_id, author_name, title, content, words_count, views_count, likes_count,
                 comments_count, rewards_count):
        self.book_id = book_id
        self.classify_id = classify_id
        self.author_id = None
        self.author_name = author_name
        self.title = title
        self.description = None
        self.content = content
        self.words_count = words_count
        self.views_count = views_count
        self.likes_count = likes_count
        self.comments_count = comments_count
        self.rewards_count = rewards_count
        self.create_time = None

    def __str__(self):
        """Return a multi-line ``label:value`` summary, one field per line.

        The text is returned rather than printed: ``__str__`` must return a
        string, otherwise ``str(book)`` and ``print(book)`` raise TypeError.
        Every value goes through ``str()`` so non-string fields (for example
        a ``None`` author name) cannot break the concatenation.
        """
        fields = (
            ("作者", self.author_name),
            ("标题", self.title),
            ("内容", self.content),
            ("字数", self.words_count),
            ("浏览", self.views_count),
            ("喜欢", self.likes_count),
            ("评论", self.comments_count),
            ("打赏", self.rewards_count),
        )
        return "\n".join(label + ":" + str(value) for label, value in fields)
# 评论:评论ID、用户ID、图书ID、评论内容、点赞数、回复数、评论状态、评论日期。
class Comment(object):
    """Plain record describing one comment.

    The constructor stores every argument on the instance under the same
    name. ``create_time`` is not a constructor argument; it starts as
    ``None`` and this class never assigns it.
    """

    def __init__(self, comment_id, user_id, book_id, content, likes_count, reply_count, status):
        self.comment_id = comment_id
        self.user_id = user_id
        self.book_id = book_id
        self.content = content
        self.likes_count = likes_count
        self.reply_count = reply_count
        self.status = status
        self.create_time = None

    def __str__(self):
        """Return a multi-line ``label:value`` summary, one field per line.

        The text is returned rather than printed: ``__str__`` must return a
        string, otherwise ``str(comment)`` and ``print(comment)`` raise
        TypeError.
        """
        fields = (
            ("id", self.comment_id),
            ("book_id", self.book_id),
            ("content", self.content),
            ("likes_count", self.likes_count),
            ("reply_count", self.reply_count),
            ("status", self.status),
        )
        return "\n".join(label + ":" + str(value) for label, value in fields)
# 回复:回复ID、被回复评论ID、回复评论ID、回复时间。
class Reply(object):
    """Plain record linking a reply to the comment it answers.

    Each constructor argument is stored unchanged on the instance under
    the same name.
    """

    def __init__(self, reply_id, be_reply_comment_id, reply_comment_id, create_time):
        self.reply_id, self.be_reply_comment_id = reply_id, be_reply_comment_id
        self.reply_comment_id, self.create_time = reply_comment_id, create_time
2)spider
import json
import os
import random
import re
import time
from xpinyin import Pinyin
import requests
from bean import bean
# HTTP headers sent with every request made by get_content(); carries a
# browser-like User-Agent string.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/92.0.4515.131 Safari/537.36 '
}
# Directory prefix that write() puts in front of the output file name.
root = "E:/study/JAVA/SmallWhiteBook/Python/SQL/"
# Directory prefix that download_images() puts in front of the image file name.
images_root = 'E:/study/JAVA/SmallWhiteBook/Python/upload/images/'
# Default value of Spider's home_url argument.
home_url = 'https://www.jianshu.com'
# Category display name -> identifier handed to Spider.get_classify_url().
classify = {
'程序员': 'NEt52a',
'摄影': '7b2be866f564',
'故事': 'fcd7a62be697',
'IT': 'V2CqjW',
'读书': 'yD9GAd',
'诗': 'vHz3Uc',
'手绘': '8c92f845cd4d',
'自然科普': 'cc7808b775b4',
'旅行': '5AUzod',
'电影': '1hjajt'
}
# Module-level accumulators. users, books, comments and classify_list are read
# by the write_* functions and download_head_images().
# NOTE(review): the code that fills most of these lists is not visible in this
# section — confirm against the Spider class.
users = []
books = []
comments = []
images = []
classify_list = []
articles_id_list = []
user_urls = []
book_urls = []
comment_urls = []
articles_list = []
# [name, id] pairs appended by write_user(); write_book() and write_comment()
# use its length as the upper bound when picking a random user id.
user_name_and_id = []
# File extension that write() appends to the output file name.
suffix = '.sql'
# Candidate address strings.
# NOTE(review): not referenced anywhere in the visible code — presumably used
# to assign an address to crawled users; confirm.
address_list = [
'广西南宁',
'广西河池',
'广西桂林',
'四川成都',
'贵州六盘水',
'广东深圳',
'北京天安门',
'巴黎埃菲尔铁塔',
"旧金山",
"洛杉矶",
"硅谷",
"黑龙江",
"内蒙古",
"兰州"
]
# get数据并编码
def get_content(url, timeout=10):
    """Fetch ``url`` with the module-level ``headers`` and return the body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The response body decoded with ``bytes.decode()`` (UTF-8 by default).

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content.decode()
# 为防止sql异常用中文的单引号替换英文的单引号
def clear_single_quotation_mark(string):
    """Return ``str(string)`` with every ASCII ``'`` replaced by ``’``.

    Used so that text embedded in single-quoted SQL values cannot end the
    literal early.
    """
    text = str(string)
    return text.replace("'", "’")
# 删除文件
def delete_file(path):
    """Remove the file at ``path``; do nothing when the path does not exist."""
    if not os.path.exists(path):
        return
    os.remove(path)
# 下载图片
def download_images(url, name):
    """Download ``url`` and save it as ``images_root + name``.

    The data is fetched before the target file is opened, so a failed
    download does not leave an empty file behind.

    Args:
        url: Address of the image to download.
        name: File name (relative to ``images_root``) to write to.
    """
    # NOTE(review): get_content_no_decode is defined outside this section;
    # presumably it returns the raw response bytes — confirm.
    data = get_content_no_decode(url)
    path = images_root + name
    with open(path, "wb") as f_obj:
        f_obj.write(data)
# 下载用户头像
def download_head_images():
    """Download the ``head_image`` of every entry in the module-level ``users`` list.

    The file name is the last ``/``-separated segment of the URL, with
    anything from the first ``?`` onwards removed.
    """
    for account in users:
        avatar_url = account.head_image
        without_query = str(avatar_url).partition('?')[0]
        file_name = without_query.rsplit('/', 1)[-1]
        download_images(avatar_url, file_name)
# 将中文转化为拼音
def format_pinyin(name):
    """Convert ``name`` to pinyin and join the capitalised parts.

    The ``Pinyin.get_pinyin`` result is split on ``-``; each part is
    capitalised and the parts are joined without a separator.
    """
    syllables = Pinyin().get_pinyin(name).split('-')
    return "".join(part.capitalize() for part in syllables)
# 写入文件
def write(path, content):
    """Write ``content`` to ``root + path + suffix`` as UTF-8.

    Any existing file at that location is deleted first, so the file ends
    up containing only ``content``.
    """
    target = "".join((root, path, suffix))
    delete_file(target)
    with open(target, "a", encoding='utf-8') as out:
        out.write(content)
# 写入用户信息
def write_user():
    """Write one ``insert`` statement per distinct user to the ``user`` file.

    Users from the module-level ``users`` list are de-duplicated by name,
    keeping the first occurrence, and numbered from 1 in that order. Each
    ``[name, id]`` pair is appended to ``user_name_and_id`` with the same id
    that is written into the statement.

    Every value is passed through ``clear_single_quotation_mark`` so an
    apostrophe in a name, address or profile cannot end the quoted SQL
    literal early. Statements end with ``;`` and a newline.
    """
    table_name = "user"

    # Drop repeated users; the first occurrence of each name wins.
    seen_names = set()
    unique_users = []
    for u in users:
        if u.name not in seen_names:
            seen_names.add(u.name)
            unique_users.append(u)

    # Column order must match the order of `values` below.
    columns = ("(userId, name, password, sex, age, address, tel, headImage, email, balance, "
               "followers, fans, articles, words, likes, profile)")
    statements = []
    for user_id, u in enumerate(unique_users, start=1):
        values = (
            user_id, u.name, u.password, u.sex, u.age, u.address, u.tel, u.head_image, u.email,
            u.balance, u.followers_count, u.fans_count, u.articles_count, u.words_count,
            u.likes_count, u.profile,
        )
        quoted = "', '".join(clear_single_quotation_mark(v) for v in values)
        statements.append("insert " + table_name + columns + " values ('" + quoted + "') ;\n")
        user_name_and_id.append([u.name, user_id])
    write(table_name, "".join(statements))
# 写入分类信息
def write_classify():
    """Write one ``insert`` statement per category to the ``classify`` file.

    Reads the module-level ``classify_list``. Every value is passed through
    ``clear_single_quotation_mark`` so an apostrophe cannot end the quoted
    SQL literal early. Statements end with ``;`` and a newline.
    """
    table_name = "classify"
    # Column order must match the order of `values` below; article_url is
    # stored in the `image` column.
    columns = "(classifyId, name, description, articles, fans, image)"
    statements = []
    for c in classify_list:
        values = (c.classify_id, c.name, c.description, c.articles_count, c.fans_count, c.article_url)
        quoted = "', '".join(clear_single_quotation_mark(v) for v in values)
        statements.append("insert " + table_name + columns + " values ('" + quoted + "') ;\n")
    write(table_name, "".join(statements))
# 写入图书信息
def write_book():
    """Write one ``insert`` statement per distinct book to the ``book`` file.

    Books from the module-level ``books`` list are de-duplicated by
    ``book_id``, keeping the first occurrence. Each book is given a random
    author id between 1 and ``len(user_name_and_id)``, so ``write_user``
    must have filled that list first.

    Every value is passed through ``clear_single_quotation_mark`` so an
    apostrophe cannot end the quoted SQL literal early. Statements end with
    ``;`` and a newline.

    Raises:
        ValueError: from ``random.randint`` if ``user_name_and_id`` is empty
            while there is at least one book to write.
    """
    table_name = "book"

    # Drop repeated books; the first occurrence of each id wins.
    seen_ids = set()
    unique_books = []
    for b in books:
        if b.book_id not in seen_ids:
            seen_ids.add(b.book_id)
            unique_books.append(b)

    # Column order must match the order of `values` below.
    columns = "(bookId, userId, classifyId, title, content, words, views, comments, rewards)"
    statements = []
    for b in unique_books:
        author_id = random.randint(1, len(user_name_and_id))
        values = (
            b.book_id, author_id, b.classify_id, b.title, b.content, b.words_count,
            b.views_count, b.comments_count, b.rewards_count,
        )
        quoted = "', '".join(clear_single_quotation_mark(v) for v in values)
        statements.append("insert " + table_name + columns + " values ('" + quoted + "') ;\n")
    write(table_name, "".join(statements))
# 写入评论信息
def write_comment():
    """Write one ``insert`` statement per distinct comment to the ``comment`` file.

    Comments from the module-level ``comments`` list are de-duplicated by
    ``comment_id``, keeping the first occurrence. Each comment is given a
    random user id between 1 and ``len(user_name_and_id)``, so
    ``write_user`` must have filled that list first.

    Every value, including the comment text, is passed through
    ``clear_single_quotation_mark`` so an apostrophe cannot end the quoted
    SQL literal early. Statements end with ``;`` and a newline.

    Raises:
        ValueError: from ``random.randint`` if ``user_name_and_id`` is empty
            while there is at least one comment to write.
    """
    table_name = "comment"

    # Drop repeated comments; the first occurrence of each id wins.
    seen_ids = set()
    unique_comments = []
    for c in comments:
        if c.comment_id not in seen_ids:
            seen_ids.add(c.comment_id)
            unique_comments.append(c)

    # Column order must match the order of `values` below.
    columns = "(userId, bookId, content, status, likes, replys)"
    statements = []
    for c in unique_comments:
        user_id = random.randint(1, len(user_name_and_id))
        values = (user_id, c.book_id, c.content, c.status, c.likes_count, c.reply_count)
        quoted = "', '".join(clear_single_quotation_mark(v) for v in values)
        statements.append("insert " + table_name + columns + " values ('" + quoted + "') ;\n")
    write(table_name, "".join(statements))
# 简书的爬虫类
class Spider(object):
# 主页、分类、每种分类爬取的数量
def __init__(self, home_url=home_url, classify=classify, per=52, comment_count=20):
self.home_url = home_url
self.classify = classify
self.per = per
self.comment_count = comment_count
# 获取分类详细信息
def get_classify_list(self):
id = 1
for k in classify:
# 获取分类url
url = self.get_classify_url(classify[k])
content = get_content(url)
pattern = re.compile(r'(.*?)- (.*?)
', re.S) article_content = pattern.findall(content)[0] # 获取评论信息 comment_url = self.get_comment_url(article_id) comment_urls.append([article_id, classify_id, comment_url]) pattern = re.compile( r'(.*?)', re.S) info = pattern.findall(main_top)[0] head_image = info[0] name = info[1] password = 123 pattern = re.compile(r'.*?.*?
(.*?)
.*?(3)效果展示



