Python 爬虫食品推荐系统

我们常接触的网络爬虫是百度、搜狗、谷歌（Google）等公司的搜索引擎，这些搜索引擎通过互联网上的入口获取网页，实时存储并更新索引。搜索引擎的基础就是网络爬虫，这些网络爬虫通过自动化的方式进行网页浏览并存储相关的信息。

个人总结

本文是个人在开发基于 Hadoop 的食品推荐系统时，把爬取到的信息展示到 Web 端的一些感想总结。纯属个人小打小闹，不过也可作为 Python 爬虫入门项目的参考。

GitHub 源码 https://github.com/yeshenyong/FoodRecommendSys/tree/master/FoodSpider （包含前后端代码）

爬虫步骤

确定爬取域名网站
分析所需爬取域名的HTML 文件、确定爬取字段
修改 HTTP 请求头，“欺骗”服务器，使其认为请求并非来自程序
定义爬取匹配字符正则表达式
爬取数据分析处理
前后端展示（基于flask + jquery + bootstrap）

遇到死链接、其他服务器问题，程序不能一直等待爬取

使用 try/except 来解决：
try:
	timeout;
except:
	solve timeout

爬虫源码剖析

main.py

# -*- coding: utf-8 -*-

# @Time    : 2021/10/26 19:28
# @Author  : yeshenyong
# @File    : main.py

from Excel.excel import save_excel
from MySql.mysql import init_db, save_db
from Analysis.spider import getData

# Listing-page URL prefix; getData appends the 1-based page number to it.
baseUrl = "https://www.meishij.net/china-food/?&page="
# Not referenced anywhere in the visible code.
dbName = ""
# Number of listing pages passed to getData.
page = 10
# Output path handed to save_excel.
Excelpath = "食品爬取结果.xls"


def main():
    """Crawl the listing pages, then store the rows in Excel and in MySQL."""
    records = getData(baseUrl, page)
    # Excel is written first, then the database.
    save_excel(records, Excelpath)
    save_db(records)


if __name__ == '__main__':
    init_db()  # initialise the database (init_db is currently a no-op)
    main()  # run the crawler

MySQL 保存代码

mysql.py

# -*- coding: utf-8 -*-

# @Time    : 2021/10/26 19:35
# @Author  : yeshenyong
# @File    : mysql.py

import pymysql

# Module-level connection and cursor: both are created as a side effect of
# importing this module and are shared by save_db.
# NOTE(review): the credentials are hard-coded; consider loading them from configuration.
conn = pymysql.connect(host='localhost', user='root', passwd='123456', database='foodserver', charset='utf8')
cursor = conn.cursor()
# Parameterised INSERT; save_db fills the six placeholders from one row of dataList.
sql = 'insert into food(fname, fcomment, ffunc, fstep, ftaste, url) values(%s,%s,%s,%s,%s,%s) '

def init_db():
    """Placeholder for database initialisation; currently does nothing."""
    return None


def save_db(dataList):
    """Insert every row of ``dataList`` into the ``food`` table, then close the connection.

    Each row is executed and committed on its own. A failing row is reported
    and rolled back, and the remaining rows are still attempted. The
    module-level cursor and connection are closed before returning, so a
    later call would operate on a closed connection.
    """
    for record in dataList:
        try:
            cursor.execute(sql, record)
            conn.commit()
        except Exception as e:
            print('插入数据失败', e)
            conn.rollback()  # undo the failed insert

    # Close the cursor first, then the connection.
    cursor.close()
    conn.close()

Excel 保存代码

excel.py

# -*- coding: utf-8 -*-

# @Time    : 2021/10/26 20:26
# @Author  : yeshenyong
# @File    : excel.py

import xlwt


def save_excel(datalist, path):
    """Write the crawled rows to an ``.xls`` workbook saved at ``path``.

    The sheet has one header row followed by one row per record. Columns,
    in order: food name, comments and popularity, benefits, steps, taste,
    image URL.
    """
    print("saving...")
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('食品', cell_overwrite_ok=True)

    headers = ("食品名字", "食品评论和人气", "食品功效", "食品步骤", "食品口味", "图片链接")
    for col_idx, title in enumerate(headers):
        worksheet.write(0, col_idx, title)

    # Data rows start at row 1, directly below the header row.
    for row_idx, record in enumerate(datalist, start=1):
        for col_idx, value in enumerate(record):
            worksheet.write(row_idx, col_idx, value)

    workbook.save(path)

爬虫核心代码

spider.py

# -*- coding: utf-8 -*-

# @Time    : 2021/10/26 19:40
# @Author  : yeshenyong
# @File    : spider.py

import urllib.request, urllib.error
import re
from bs4 import BeautifulSoup

# Regular expressions applied to the HTML of a single listing item.
#
# NOTE(review): in the published listing the HTML tags inside these patterns
# had been stripped; every pattern had collapsed to a bare capture group and
# the image pattern was no longer valid Python. The tags below are
# reconstructed from the field each pattern is meant to extract; the capture
# groups and flags are kept as published. Confirm them against the project's
# GitHub repository and the live page markup before relying on them.

# Food name
findName = re.compile(r'<strong>(.*)</strong>')
# Comments and popularity, and the food's benefits (both presumably <span> elements)
findComment = re.compile(r'<span>(.*)</span>')
# Total number of steps
findStep = re.compile(r'<li class="li1">(.*?)</li>')
# Taste
findTaste = re.compile(r'<li class="li2">(.*)</li>')
# Image URL
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # re.S lets "." match newlines too


def getData(baseUrl, page):
    """Crawl ``page`` listing pages and return one row per food item.

    Args:
        baseUrl: URL prefix; the 1-based page number is appended to it.
        page: Number of listing pages to fetch.

    Returns:
        A list of six-element rows, in order: name, comments and popularity,
        benefits, steps, taste, image URL. The comment and benefit fields
        are strings; the name, step, taste and image fields are the lists
        returned by ``re.findall``. A single space stands in for a missing
        comment, benefit or step value.
    """
    dataList = []
    for page_no in range(1, page + 1):
        html = getEachHtml(baseUrl + str(page_no))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="listtyle1"):
            item = str(item)
            data = [re.findall(findName, item)]

            comment = re.findall(findComment, item)
            if len(comment) == 2:
                # Exactly two matches: comments/popularity, then benefits.
                data.extend(comment)
            elif comment:
                # Any other non-empty result: keep the first match only.
                data.extend((comment[0], ' '))
            else:
                # No match at all: indexing comment[0] would raise
                # IndexError, so pad both columns to keep six fields per row.
                data.extend((' ', ' '))

            step = re.findall(findStep, item)
            data.append(step if len(step) == 1 else ' ')

            data.append(re.findall(findTaste, item))
            data.append(re.findall(findImgSrc, item))

            dataList.append(data)
    return dataList

def getEachHtml(baseUrl, timeout=10):
    """Fetch ``baseUrl`` and return the response body decoded as UTF-8.

    Args:
        baseUrl: URL of the page to download.
        timeout: Seconds to wait on blocking network operations, so that a
            dead link cannot stall the crawl indefinitely.

    Returns:
        The decoded page text, or an empty string when the request fails or
        times out.
    """
    import socket  # local import: only needed for the timeout handler below

    # Send a browser User-Agent so the request does not look like a script.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
    }
    request = urllib.request.Request(baseUrl, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    except socket.timeout as e:
        # A timeout while reading the body is raised directly rather than
        # wrapped in URLError.
        print(e)
    return html

爬取结果

Python 爬虫食品推荐系统

Python相关栏目本月热门文章