栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

python爬虫-1.07-MaoyanSpider---持久化存储到MySQL

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

python爬虫-1.07-MaoyanSpider---持久化存储到MySQL

docker-compose的MySQL yaml文件

version: '3'
services:
  mysql:
    image: 'mysql/mysql-server:5.7'
    restart: always
    container_name: mysql
    environment:
      MYSQL_ROOT_PASSWORD: 123456
    # Multi-line plain scalar: the options fold into one command string.
    # Trailing ';' removed from max_allowed_packet — it would have been
    # passed to mysqld as part of the option value ("128M;").
    command:
      --default-authentication-plugin=mysql_native_password
      --character-set-server=utf8mb4
      --collation-server=utf8mb4_general_ci
      --explicit_defaults_for_timestamp=true
      --lower_case_table_names=1
      --max_allowed_packet=128M
    ports:
      - 3306:3306

pymysql模块的使用

"""
p8
pymysql模块与MySQL数据库交互流程
创建数据库连接对象 db=pymysql.connect(xx,xx,xx,xx,xx)
创建游标对象 cursor=db.cursor()
执行SQL命令 cursor.execute(sql语句,[xx,xx])
提交到数据库执行 db.commit()
关闭游标 cursor.close()
断开数据库连接 db.close()
"""
import pymysql

# #1、创建数据库连接对象 + 游标对象
# db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8')
# cursor = db.cursor()
# #2、利用游标对象的execute()方法执行SQL命令
# ins = 'insert into maoyantab values(%s,%s,%s)'
# cursor.execute(ins, ['大话西游之月光宝盒', '周星驰', '1993-01-01'])
# #3、提交到数据库执行:commit()
# db.commit()
# #4、关闭游标 + 断开数据库连接
# cursor.close()
# db.close()

"""
pymysql之executemany()方法
作用:
---减少数据库IO次数,提高效率,一次性插入多条表记录

示例:
ins = 'insert into maoyantab values(%s,%s,%s)'
film_li = [
('大话西游之月光宝盒', '周星驰', '1993-01-01'),
('大话西游之月光宝盒', '周星驰', '1994-01-01')
]
cursor.executemany(ins,film_li)
"""

# Use executemany() to insert multiple rows in a single round-trip,
# reducing database IO and improving throughput.
# 1. Create the database connection object + cursor object
db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8')
cursor = db.cursor()
try:
	# 2. Run the SQL command through the cursor's executemany() method
	ins = 'insert into maoyantab values(%s,%s,%s)'
	film_li = [
		('大话西游之月光宝盒', '周星驰', '1993'),
		('大话西游之月光宝盒', '周星驰', '1994'),
		('大话西游之大圣娶亲', '周星驰', '1995'),
		('大话西游之大圣娶亲', '周星驰', '1996')
	]
	cursor.executemany(ins, film_li)
	# 3. Commit so the inserts take effect
	db.commit()
finally:
	# 4. Always close the cursor + connection, even if an insert raised
	cursor.close()
	db.close()

使用pymysql的execute()方法 单次存储抓取到的数据

"""
猫眼电影TOP100抓取 - execute()
"""
from urllib import request
import random
import time
import re
import pymysql


class MaoyanSpider:
	"""Scrape the Maoyan TOP100 movie board and persist each record
	to MySQL with one execute() call per row."""

	def __init__(self):
		self.url = 'https://maoyan.com/board/4?offset={}'
		self.headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36',
			'cookie': '__mta=214675752.1627866406865.1627875459110.1627875460018.12; uuid_n_v=v1; uuid=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; _csrf=38f9740349f3f3b55a88970a5164681765e4611ccbd2fc8ef5f526914970614d; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1627866407; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17b04661fb4c8-0813968a22224b-d7e1938-e1000-17b04661fb4c8; _lxsdk=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; __mta=214675752.1627866406865.1627866406865.1627866409991.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1627875460; _lxsdk_s=17b04f00a6e-9d7-717-d8%7C%7C9'
		}
		# Counter of records stored so far
		self.i = 0
		# Database connection + cursor, reused for the whole crawl
		self.db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8')
		self.cursor = self.db.cursor()

	def get_html(self, url):
		"""Fetch one board page and hand the HTML to the parser."""
		req = request.Request(url=url, headers=self.headers)
		res = request.urlopen(req)
		html = res.read().decode()
		# Chain straight into parsing
		self.parse_html(html)

	def parse_html(self, html):
		"""Extract (title, stars, release time) tuples from the page HTML."""
		# NOTE(review): the original pattern was mangled when this page was
		# extracted (HTML tags stripped). This is the standard pattern for
		# the Maoyan board markup — confirm against the live page.
		regex = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>'
		pattern = re.compile(regex, re.S)
		r_list = pattern.findall(html)
		# Hand off to the storage function
		self.save_html(r_list)

	def save_html(self, r_list):
		"""Insert each scraped record into MySQL — one execute()/commit per row."""
		ins = 'insert into maoyantab values(%s,%s,%s)'
		for r in r_list:
			film_li = [
				r[0].strip(),
				r[1].strip(),
				r[2].strip()
			]
			self.cursor.execute(ins, film_li)
			self.db.commit()
			print(film_li)
			self.i += 1

	def run(self):
		"""Drive the crawl over all board pages (offset 0..90 step 10)."""
		for page in range(0, 91, 10):
			url = self.url.format(page)
			self.get_html(url)
			# Throttle the request rate
			time.sleep(random.randint(1, 2))
		# Close cursor + connection once every page is done
		self.cursor.close()
		self.db.close()


if __name__ == '__main__':
	spider = MaoyanSpider()
	spider.run()
	print('电影数量:', spider.i)

使用pymysql的executemany()方法 一次性存储抓取到的数据

"""
猫眼电影TOP100抓取 - executemany()
"""
from urllib import request
import random
import time
import re
import pymysql


class MaoyanSpider:
	"""Scrape the Maoyan TOP100 movie board, collect all records in
	memory, and persist them in one executemany() call at the end."""

	def __init__(self):
		self.url = 'https://maoyan.com/board/4?offset={}'
		self.headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36',
			'cookie': '__mta=214675752.1627866406865.1627875459110.1627875460018.12; uuid_n_v=v1; uuid=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; _csrf=38f9740349f3f3b55a88970a5164681765e4611ccbd2fc8ef5f526914970614d; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1627866407; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17b04661fb4c8-0813968a22224b-d7e1938-e1000-17b04661fb4c8; _lxsdk=E85FEA50F32D11EB8C9F5D6CCA53AC9DD7DBAF07A29F40DB93EF3FC782A0F81F; __mta=214675752.1627866406865.1627866406865.1627866409991.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1627875460; _lxsdk_s=17b04f00a6e-9d7-717-d8%7C%7C9'
		}
		# Counter of records collected so far
		self.i = 0
		# Database connection + cursor + accumulator list for every movie
		self.db = pymysql.connect(host='10.0.0.6', user='root', password='123456', database='maoyandb', charset='utf8')
		self.cursor = self.db.cursor()
		self.all_film_list = []

	def get_html(self, url):
		"""Fetch one board page and hand the HTML to the parser."""
		req = request.Request(url=url, headers=self.headers)
		res = request.urlopen(req)
		html = res.read().decode()
		# Chain straight into parsing
		self.parse_html(html)

	def parse_html(self, html):
		"""Extract (title, stars, release time) tuples from the page HTML."""
		# NOTE(review): the original pattern was mangled when this page was
		# extracted (HTML tags stripped). This is the standard pattern for
		# the Maoyan board markup — confirm against the live page.
		regex = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>'
		pattern = re.compile(regex, re.S)
		r_list = pattern.findall(html)
		# Hand off to the accumulation function
		self.save_html(r_list)

	def save_html(self, r_list):
		"""Accumulate each record as a tuple for the final executemany()."""
		for r in r_list:
			film_t = (
				r[0].strip(),
				r[1].strip(),
				r[2].strip()
			)
			# Collect every movie tuple for the bulk insert at the end
			self.all_film_list.append(film_t)
			print(film_t)
			self.i += 1

	def run(self):
		"""Crawl all board pages, then bulk-insert everything into MySQL."""
		for page in range(0, 91, 10):
			url = self.url.format(page)
			self.get_html(url)
			# Throttle the request rate
			time.sleep(random.randint(1, 2))
		# All pages done: store everything in one round-trip
		ins = 'insert into maoyantab values(%s,%s,%s)'
		self.cursor.executemany(ins, self.all_film_list)
		self.db.commit()
		self.cursor.close()
		self.db.close()


if __name__ == '__main__':
	spider = MaoyanSpider()
	spider.run()
	print('电影数量:', spider.i)
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/300359.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号