Python爬虫获取简书的用户、文章、评论、图片等数据，并存入数据库

爬虫定义：网络爬虫（又称为网页蜘蛛、网络机器人，在FOAF社区中间，更经常地称为网页追逐者），是一种按照一定的规则，自动地抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁、自动索引、模拟程序或者蠕虫。（来源：百度百科——爬虫）
爬取原理：万维网由许多的url构成，犹如一张巨大的网。爬虫通过url发送请求获取数据（html），再解析并提取所需数据即可。
实战演示：获取简书的用户、文章、评论、图片等数据，并存入数据库。

（1）网站分析
1）获取文章url列表

2）获取文章信息

3）获取评论信息

4）获取用户信息

（2）编码实现
1）bean

class User(object):
    """A user record.

    Fields: user ID, name, password, sex, age, address, phone number,
    avatar, e-mail, following count, fan count, article count, total word
    count, like count, balance, profile text and registration time.

    ``user_id`` and ``create_time`` are initialised to ``None``; this class
    never assigns them.
    """

    def __init__(self, name, password, sex, age, address, tel, head_image, email, followers_count, fans_count,
                 articles_count, words_count, likes_count, balance, profile):
        self.user_id = None
        self.name = name
        self.password = password
        self.sex = sex
        self.age = age
        self.address = address
        self.tel = tel
        self.head_image = head_image
        self.email = email
        self.followers_count = followers_count
        self.fans_count = fans_count
        self.articles_count = articles_count
        self.words_count = words_count
        self.likes_count = likes_count
        self.balance = balance
        self.profile = profile
        self.create_time = None

    def __str__(self):
        """Return a multi-line ``key:value`` summary of the user.

        ``__str__`` has to return a string: printing the lines and returning
        ``None`` makes ``str(user)`` and ``print(user)`` raise ``TypeError``.
        The lines are the same ones that used to be printed, in the same order.
        """
        fields = (
            ("id", self.user_id),
            ("sex", self.sex),
            ("age", self.age),
            ("address", self.address),
            ("tel", self.tel),
            ("head_image", self.head_image),
            ("email", self.email),
            ("followers_count", self.followers_count),
            ("fans_count", self.fans_count),
            ("articles_count", self.articles_count),
            ("words_count", self.words_count),
            ("likes_count", self.likes_count),
            ("balance", self.balance),
            ("profile", self.profile),
        )
        return "\n".join(label + ":" + str(value) for label, value in fields)


class Classify(object):
    """A book category.

    Fields: category ID, name, description, article count and fan count,
    plus the ``article_url`` value passed to the constructor.
    """

    def __init__(self, classify_id, name, description, articles_count, fans_count, article_url):
        # Identity and descriptive text.
        self.classify_id, self.name = classify_id, name
        self.description = description
        # Counters.
        self.articles_count, self.fans_count = articles_count, fans_count
        self.article_url = article_url


class Book(object):
    """A book (article) record.

    Fields: book ID, category ID, author ID, title, short description,
    content, word count, view count, like count, comment count, reward count
    and publication time.

    ``author_id``, ``description`` and ``create_time`` are initialised to
    ``None``; this class never assigns them.
    """

    def __init__(self, book_id, classify_id, author_name, title, content, words_count, views_count, likes_count, comments_count, rewards_count):
        self.book_id = book_id
        self.classify_id = classify_id
        self.author_id = None
        self.author_name = author_name
        self.title = title
        self.description = None
        self.content = content
        self.words_count = words_count
        self.views_count = views_count
        self.likes_count = likes_count
        self.comments_count = comments_count
        self.rewards_count = rewards_count
        self.create_time = None

    def __str__(self):
        """Return a multi-line summary of the book.

        ``__str__`` has to return a string: printing the lines and returning
        ``None`` makes ``str(book)`` and ``print(book)`` raise ``TypeError``.
        Every value goes through ``str()`` so a non-string author, title or
        content no longer breaks the concatenation.
        """
        fields = (
            ("作者：", self.author_name),
            ("标题：", self.title),
            ("内容：", self.content),
            ("字数：", self.words_count),
            ("浏览：", self.views_count),
            ("喜欢：", self.likes_count),
            ("评论：", self.comments_count),
            ("打赏：", self.rewards_count),
        )
        return "\n".join(label + str(value) for label, value in fields)


class Comment(object):
    """A comment record.

    Fields: comment ID, user ID, book ID, content, like count, reply count,
    status and comment date. ``create_time`` is initialised to ``None``; this
    class never assigns it.
    """

    def __init__(self, comment_id, user_id, book_id, content, likes_count, reply_count, status):
        self.comment_id = comment_id
        self.user_id = user_id
        self.book_id = book_id
        self.content = content
        self.likes_count = likes_count
        self.reply_count = reply_count
        self.status = status
        self.create_time = None

    def __str__(self):
        """Return a multi-line ``key:value`` summary of the comment.

        ``__str__`` has to return a string: printing the lines and returning
        ``None`` makes ``str(comment)`` and ``print(comment)`` raise
        ``TypeError``. The lines are the same ones that used to be printed.
        """
        fields = (
            ("id", self.comment_id),
            ("book_id", self.book_id),
            ("content", self.content),
            ("likes_count", self.likes_count),
            ("reply_count", self.reply_count),
            ("status", self.status),
        )
        return "\n".join(label + ":" + str(value) for label, value in fields)


class Reply(object):
    """A reply record linking two comments.

    Fields: reply ID, ID of the comment being replied to, ID of the replying
    comment, and the reply time.
    """

    def __init__(self, reply_id, be_reply_comment_id, reply_comment_id, create_time):
        self.reply_id = reply_id
        # The replied-to comment first, then the comment that replies to it.
        self.be_reply_comment_id, self.reply_comment_id = be_reply_comment_id, reply_comment_id
        self.create_time = create_time

2）spider

import json
import os
import random
import re
import time
from xpinyin import Pinyin
import requests
from bean import bean

# HTTP headers sent with every request made by get_content(); a desktop
# Chrome User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36 '
}

# Output directory prefix for the generated SQL files (see write()).
root = "E:/study/JAVA/SmallWhiteBook/Python/SQL/"
# Output directory prefix for downloaded images (see download_images()).
images_root = 'E:/study/JAVA/SmallWhiteBook/Python/upload/images/'
# Site root; prepended to relative article paths by the spider.
home_url = 'https://www.jianshu.com'
# Category display name -> identifier passed to Spider.get_classify_url().
# NOTE(review): the values look like Jianshu collection slugs -- confirm.
classify = {
    '程序员': 'NEt52a',
    '摄影': '7b2be866f564',
    '故事': 'fcd7a62be697',
    'IT': 'V2CqjW',
    '读书': 'yD9GAd',
    '诗': 'vHz3Uc',
    '手绘': '8c92f845cd4d',
    '自然科普': 'cc7808b775b4',
    '旅行': '5AUzod',
    '电影': '1hjajt'
}
# Module-level accumulators shared by the spider and the write_* functions.
users = []  # read by write_user() and download_head_images()
books = []  # read by write_book()
comments = []  # read by write_comment()
images = []  # not referenced in the visible part of the file
classify_list = []  # read by write_classify()
articles_id_list = []  # not referenced in the visible part of the file
user_urls = []  # not referenced in the visible part of the file
book_urls = []  # not referenced in the visible part of the file
comment_urls = []  # Spider.get_book() appends [article_id, classify_id, comment_url]
articles_list = []  # Spider.get_classify_list() appends [article_id, classify_id, article_url]
user_name_and_id = []  # write_user() appends [name, user_id]; its length bounds the random ids in write_book()/write_comment()
# File extension appended to every path written by write().
suffix = '.sql'
# Pool of sample addresses.
# NOTE(review): presumably used to give users a random address -- the usage is not visible here.
address_list = [
    '广西南宁',
    '广西河池',
    '广西桂林',
    '四川成都',
    '贵州六盘水',
    '广东深圳',
    '北京天安门',
    '巴黎埃菲尔铁塔',
    "旧金山",
    "洛杉矶",
    "硅谷",
    "黑龙江",
    "内蒙古",
    "兰州"
]


def get_content(url, timeout=10):
    """Fetch *url* with a GET request and return the response body as text.

    The body is decoded with ``bytes.decode()``, i.e. as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content.decode()


def clear_single_quotation_mark(string):
    """Return ``str(string)`` with every ASCII ``'`` replaced by ``’``.

    Used to keep single quotes in scraped text from terminating the quoted
    SQL literals built by the ``write_*`` functions.
    """
    return str(string).replace("'", "’")


def delete_file(path):
    """Delete the file at *path*; do nothing if it does not exist.

    Removal is attempted directly rather than checking ``os.path.exists``
    first, so there is no window in which the file can vanish between the
    check and the removal. Other ``OSError``s (e.g. *path* is a directory or
    permission is denied) still propagate.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def download_images(url, name):
    """Download the image at *url* and save it as ``images_root + name``.

    The bytes are fetched before the target file is opened, so a failed
    request no longer leaves an empty or truncated file behind.

    Args:
        url: Address of the image.
        name: File name (relative to ``images_root``) to write to.
    """
    # NOTE(review): get_content_no_decode is not defined in the visible part
    # of this file; presumably it returns the raw response bytes -- confirm.
    data = get_content_no_decode(url)
    with open(images_root + name, "wb") as f_obj:
        f_obj.write(data)


def download_head_images():
    """Download the avatar of every entry in the module-level ``users`` list.

    The file name is the last path segment of the avatar URL with any query
    string removed.
    """
    for account in users:
        avatar_url = account.head_image
        # Cut off "?..." first, then keep what follows the final "/".
        without_query = str(avatar_url).partition('?')[0]
        file_name = without_query.rsplit('/', 1)[-1]
        download_images(avatar_url, file_name)


def format_pinyin(name):
    """Convert *name* to pinyin and return it with each part capitalised.

    The result of ``Pinyin().get_pinyin(name)`` is split on ``-``; each piece
    is capitalised and the pieces are concatenated without a separator.
    """
    pieces = Pinyin().get_pinyin(name).split('-')
    return "".join(piece.capitalize() for piece in pieces)


def write(path, content):
    """Write *content* to the file ``root + path + suffix``, replacing any old file.

    Args:
        path: File name without directory or extension (e.g. a table name).
        content: Text to store, written as UTF-8.
    """
    target = "".join((root, path, suffix))
    # Remove the previous output first so the append below starts from empty.
    delete_file(target)
    with open(target, "a", encoding='utf-8') as out:
        out.write(content)


def write_user():
    """Write one ``insert`` statement per distinct user to the ``user`` SQL file.

    Users from the module-level ``users`` list are de-duplicated by name
    (first occurrence wins) and numbered from 1. Each ``[name, user_id]`` pair
    is appended to ``user_name_and_id``.

    Fixes relative to the earlier version:
      * statements end with ``";\\n"`` instead of the literal ``";n"``;
      * the id recorded in ``user_name_and_id`` is the id written to the SQL
        (it used to be incremented first, so it was off by one);
      * every value goes through ``clear_single_quotation_mark``, so
        non-string fields no longer break the concatenation and single
        quotes in scraped text no longer terminate the SQL literal.

    NOTE: only single quotes are neutralised; this is not general SQL escaping.
    """
    table_name = "user"

    # Drop duplicates by name, keeping the first user seen (names must be hashable).
    seen_names = set()
    unique_users = []
    for u in users:
        if u.name not in seen_names:
            seen_names.add(u.name)
            unique_users.append(u)

    # Columns: user ID, name, password, sex, age, address, phone, avatar,
    # e-mail, balance, following, fans, articles, words, likes, profile.
    columns = ("userId, name, password, sex, age, address, tel, headImage, email, balance, "
               "followers, fans, articles, words, likes, profile")
    statements = []
    for user_id, u in enumerate(unique_users, start=1):
        values = (
            user_id, u.name, u.password, u.sex, u.age, u.address, u.tel, u.head_image, u.email, u.balance,
            u.followers_count, u.fans_count, u.articles_count, u.words_count, u.likes_count, u.profile,
        )
        quoted = ", ".join("'" + clear_single_quotation_mark(v) + "'" for v in values)
        statements.append("insert " + table_name + "(" + columns + ") values (" + quoted + ") ;\n")
        user_name_and_id.append([u.name, user_id])
    write(table_name, "".join(statements))


def write_classify():
    """Write one ``insert`` statement per category to the ``classify`` SQL file.

    Reads the module-level ``classify_list``. Each category's ``article_url``
    is stored in the ``image`` column.

    Fixes relative to the earlier version: statements end with ``";\\n"``
    instead of the literal ``";n"``, and every value (not only the
    description) goes through ``clear_single_quotation_mark``.

    NOTE: only single quotes are neutralised; this is not general SQL escaping.
    """
    table_name = "classify"
    columns = "classifyId, name, description, articles, fans, image"
    statements = []
    for c in classify_list:
        values = (c.classify_id, c.name, c.description, c.articles_count, c.fans_count, c.article_url)
        quoted = ", ".join("'" + clear_single_quotation_mark(v) + "'" for v in values)
        statements.append("insert " + table_name + "(" + columns + ") values (" + quoted + ") ;\n")
    write(table_name, "".join(statements))


def write_book():
    """Write one ``insert`` statement per distinct book to the ``book`` SQL file.

    Books from the module-level ``books`` list are de-duplicated by
    ``book_id`` (first occurrence wins). The ``userId`` column receives a
    random id in ``1..len(user_name_and_id)``, not the real author.

    Fixes relative to the earlier version: statements end with ``";\\n"``
    instead of the literal ``";n"``; de-duplication uses a set instead of a
    nested scan; every value goes through ``clear_single_quotation_mark``.

    Raises:
        ValueError: From ``random.randint`` when there is at least one book
            but ``user_name_and_id`` is empty.

    NOTE: only single quotes are neutralised; this is not general SQL escaping.
    """
    table_name = "book"

    # Drop duplicates by book_id, keeping the first book seen.
    seen_ids = set()
    unique_books = []
    for b in books:
        if b.book_id not in seen_ids:
            seen_ids.add(b.book_id)
            unique_books.append(b)

    columns = "bookId, userId, classifyId, title, content, words, views, comments, rewards"
    statements = []
    for b in unique_books:
        author_id = random.randint(1, len(user_name_and_id))
        values = (
            b.book_id, author_id, b.classify_id, b.title, b.content,
            b.words_count, b.views_count, b.comments_count, b.rewards_count,
        )
        quoted = ", ".join("'" + clear_single_quotation_mark(v) + "'" for v in values)
        statements.append("insert " + table_name + "(" + columns + ") values (" + quoted + ") ;\n")
    write(table_name, "".join(statements))


def write_comment():
    """Write one ``insert`` statement per distinct comment to the ``comment`` SQL file.

    Comments from the module-level ``comments`` list are de-duplicated by
    ``comment_id`` (first occurrence wins). The ``userId`` column receives a
    random id in ``1..len(user_name_and_id)``.

    Fixes relative to the earlier version: statements end with ``";\\n"``
    instead of the literal ``";n"``; the comment text is now passed through
    ``clear_single_quotation_mark`` like book content is, so a single quote
    in a comment no longer terminates the SQL literal; de-duplication uses a
    set instead of a nested scan.

    Raises:
        ValueError: From ``random.randint`` when there is at least one
            comment but ``user_name_and_id`` is empty.

    NOTE: only single quotes are neutralised; this is not general SQL escaping.
    """
    table_name = "comment"

    # Drop duplicates by comment_id, keeping the first comment seen.
    seen_ids = set()
    unique_comments = []
    for c in comments:
        if c.comment_id not in seen_ids:
            seen_ids.add(c.comment_id)
            unique_comments.append(c)

    # Columns: user ID, book ID, content, status, like count, reply count.
    columns = "userId, bookId, content, status, likes, replys"
    statements = []
    for c in unique_comments:
        user_id = random.randint(1, len(user_name_and_id))
        values = (user_id, c.book_id, c.content, c.status, c.likes_count, c.reply_count)
        quoted = ", ".join("'" + clear_single_quotation_mark(v) + "'" for v in values)
        statements.append("insert " + table_name + "(" + columns + ") values (" + quoted + ") ;\n")
    write(table_name, "".join(statements))


# 简书的爬虫类
class Spider(object):
    # Home page, categories, and the number of items to crawl per category.
    def __init__(self, home_url=home_url, classify=classify, per=52, comment_count=20):
        """Store the crawl configuration on the instance.

        Args:
            home_url: Site root; defaults to the module-level ``home_url``.
            classify: Category mapping; defaults to the module-level ``classify``.
            per: Per-category crawl count, according to the comment above this method.
            comment_count: NOTE(review): presumably the number of comments to
                fetch per article -- its use is not visible here; confirm.
        """
        self.home_url = home_url
        self.classify = classify
        self.per = per
        self.comment_count = comment_count

    # 获取分类详细信息
    def get_classify_list(self):
        id = 1
        for k in classify:
            # 获取分类url
            url = self.get_classify_url(classify[k])
            content = get_content(url)
            pattern = re.compile(r'(.*?)(.*?)', re.S)
        note_list = pattern.findall(content)[0]
        pattern = re.compile(r'.*?,
                             re.S)
        note = pattern.findall(note_list)
        for n in note:
            # 获取文章id和地址
            article_id = n[0]
            article_url = self.home_url + n[1]
            articles_list.append([article_id, classify_id, article_url])

    # 获取图书信息
    def get_book(self, article_id, classify_id, url):
        # 解析图书信息
        content = get_content(url)
        pattern = re.compile(r'', re.S)
        article_content = pattern.findall(content)[0]
        # 获取评论信息
        comment_url = self.get_comment_url(article_id)
        comment_urls.append([article_id, classify_id, comment_url])
        pattern = re.compile(
            r'(.*?)',
            re.S)
        info = pattern.findall(main_top)[0]
        head_image = info[0]
        name = info[1]
        password = 123
        pattern = re.compile(r'.*?.*?(.*?).*? 
（3）效果展示

Python爬虫获取简书的用户、文章、评论、图片等数据，并存入数据库

Python相关栏目本月热门文章