栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

python爬虫-1.08MaoyanSpiderFilmAddress---持久化存储到MongoDB

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

python爬虫-1.08MaoyanSpiderFilmAddress---持久化存储到MongoDB

docker方式运行MongoDB

docker run -itd --name mongo -p 27017:27017 mongo
docker exec -it mongo mongo

MongoDB数据库基础知识

MongoDB为非关系性数据库,数据以键值对方式存储
MongoDB基于磁盘存储
MongoDB数据类型单一,值为JSON文档,而Redis基于内存
MongoDB:库->集合->文档
Mysql: 库->表->表记录

MongoDB基础语法

#查看所有库
show dbs
#切换到指定库
use 库名
#查看当前库中的所有集合
show collections
#查看当前库中的文档
db.集合名.find().pretty()
#统计集合中文档的数量
db.集合名.count()
#删除集合
db.集合名.drop()
#删除当前库
db.dropDatabase()

pymongo模块使用流程

sudo pip install pymongo
import pymongo

#创建连接对象
conn = pymongo.MongoClient(host='10.0.0.101', port=27017)
#创建库对象
db = conn['maoyandb']
#创建集合对象
myset = db['maoyanset']
#在集合中插入文档
myset.insert_one({'name': '泰坦尼克号', 'star': 'T', 'time': '1990-01-01'})
#在集合中批量插入文档
myset.insert_many({'name': '泰坦尼克号', 'star': 'T', 'time': '1990-01-01'})
注意:MongoDB无需提前建库建表。直接操作即可,会自动建库建表

代码示例

"""
猫眼电影首页抓取
"""
from urllib import request
import random
import time
import re
import pymongo

class MaoyanSpider_Home_Page:
	def __init__(self):
		self.url = 'https://www.maoyan.com/'
		self.headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',
			'cookie': 'uuid_n_v=v1; uuid=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; _csrf=ebdd97cd428809914f8919dcfe0c1031f72c3caf9b01aa03d9c831fae4dffd7f; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1644740619; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1644741203; _lxsdk_cuid=17ef22e2f1fc8-0bb075b44e2f43-4c3e237c-144000-17ef22e2f1f27; _lxsdk_s=17ef22e2f20-58d-04b-5bb%7C%7C5; _lxsdk=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; __mta=150189705.1644740620254.1644740620254.1644741203077.2'
		}
		self.conn = pymongo.MongoClient(host='10.0.0.101', port=27017)
		self.db = self.conn['maoyandb']
		self.myset = self.db['maoyanset']
	
	def get_html(self):
		req = request.Request(url=self.url, headers=self.headers)
		res = request.urlopen(req)
		html = res.read().decode()
		self.parse_html(html)
	
	def parse_html(self, html):
		regex = '(.*?)'
		pattern = re.compile(regex, re.S)
		r_list = pattern.findall(html)
		# print(r_list)
		self.save_html(r_list)
	
	def save_html(self, r_list):
		for film in r_list:
			itme = {}
			itme['name'] = film
			print(itme)
			self.myset.insert_one(itme)
	
	
	def run(self):
		self.get_html()
		time.sleep(random.randint(1, 2))


if __name__ == '__main__':
	spider = MaoyanSpider_Home_Page()
	spider.run()



代码示例2
说明:因为tops100榜单猫眼电影更新了反爬虫机制,即使加入cookie也无法爬取,所以改变了爬取信息,能掌握存储到MongoDB的方式就好

"""
猫眼电影附近影院抓取
"""
from urllib import request
import random
import time
import re
import pymongo


class MaoyanSpiderFilmAddress:
	def __init__(self):
		self.url = 'https://www.maoyan.com/cinemas?offset={}'
		self.headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',
			'cookie': 'uuid_n_v=v1; uuid=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; _csrf=ebdd97cd428809914f8919dcfe0c1031f72c3caf9b01aa03d9c831fae4dffd7f; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1644740619; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1644746835; _lxsdk_cuid=17ef22e2f1fc8-0bb075b44e2f43-4c3e237c-144000-17ef22e2f1f27; _lxsdk=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; __mta=150189705.1644740620254.1644746829263.1644746835412.32; _lxsdk_s=17ef2697afa-901-639-438%7C%7C62'
		}
		self.conn = pymongo.MongoClient(host='10.0.0.101', port=27017)
		self.db = self.conn['maoyandb']
		self.myset = self.db['maoyanset']
		self.i =0
	
	def get_html(self, url):
		req = request.Request(url=url, headers=self.headers)
		res = request.urlopen(req)
		html = res.read().decode()
		self.parse_html(html)
	
	def parse_html(self, html):
		regex = '.*?(.*?).*?

(.*?)

' pattern = re.compile(regex, re.S) r_list = pattern.findall(html) self.save_html(r_list) def save_html(self, r_list): for i in r_list: item = {'name': i[0], 'address': i[1]} print(item) self.myset.insert_one(item) self.i += 1 def run(self): for page in range(0, 19, 1): num = page * 12 url = self.url.format(num) self.get_html(url=url) time.sleep(random.randint(1,5)) if __name__ == '__main__': spider = MaoyanSpiderFilmAddress() spider.run() print('附近影院数量:',spider.i)

结果展示

{
        "_id" : ObjectId("6208df549494e00eef326a61"),
        "name" : "保利国际影城(杭州西溪天堂店)",
        "address" : "地址:西湖区紫金港路21号西溪天堂商业街地下一层(喜来登国际会议中心旁)"
}
{
        "_id" : ObjectId("6208df549494e00eef326a62"),
        "name" : "千红时代影城",
        "address" : "地址:拱墅区丰庆路710号(世纪联华超市4楼)"
}
{
        "_id" : ObjectId("6208df549494e00eef326a63"),
        "name" : "华纳影城",
        "address" : "地址:萧山区临浦镇萧山建材商贸城25幢4楼(新起点ktv)"
}
{
        "_id" : ObjectId("6208df549494e00eef326a64"),
        "name" : "嘉博杜比巨幕影城",
        "address" : "地址:下城区石桥路274号西狗茂南区2楼"
}
{
        "_id" : ObjectId("6208df549494e00eef326a65"),
        "name" : "大地影院(杭州临安宝龙店)",
        "address" : "地址:临安区临安市锦北街道农林大路899号宝龙广场3层M-F3-025室"
}
{
        "_id" : ObjectId("6208df549494e00eef326a66"),
        "name" : "天玖国际影城(浙商国际中心店)",
        "address" : "地址:上城区笕桥街道机场路355号浙商国际中心2幢4楼"
}
{
        "_id" : ObjectId("6208df549494e00eef326a67"),
        "name" : "太平洋影城(杭州下沙店)",
        "address" : "地址:钱塘区下沙街道天城东路955号郡原蓝湖国际4幢3楼"
}
{
        "_id" : ObjectId("6208df549494e00eef326a68"),
        "name" : "太平洋影城(滨江店)",
        "address" : "地址:滨江区江陵路2028号星耀城3幢3楼"
}
Type "it" for more
>

代码示例3

"""
猫眼电影经典影片 按照热门排序
"""

"""

            
              奇迹·笨小孩
                9.5
            
            
              类型:
              剧情
            
            
              主演:
              易烊千玺/田雨/陈哈琳
            
            
              上映时间:
              2022-02-01 08:00
            
          


.*?(.*?).*?(.*?)(.*?).*?类型:(.*?).*?主演:(.*?).*?上映时间:(.*?).*?
"""
from urllib import request
import random
import time
import re
import pymongo


class MaoyanSpider_Classic_Film:
	def __init__(self):
		self.url = 'https://www.maoyan.com/films?showType=3&sortId=1&offset={}'
		self.headers = {
			'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0',
			'cookie': 'uuid_n_v=v1; uuid=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; _csrf=ebdd97cd428809914f8919dcfe0c1031f72c3caf9b01aa03d9c831fae4dffd7f; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1644740619; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1644765969; _lxsdk_cuid=17ef22e2f1fc8-0bb075b44e2f43-4c3e237c-144000-17ef22e2f1f27; _lxsdk=3AE37E108CA611EC8FDC8396A0F7AFFCD945C250A076432C8AFD89FCB0E193D7; __mta=150189705.1644740620254.1644765647349.1644765969306.80; _lxsdk_s=17ef39966be-c29-c61-4a0%7C%7C43'
		}
		self.i = 0
		self.conn = pymongo.MongoClient(host='10.0.0.101', port=27017)
		self.db = self.conn['maoyandb']
		self.myset = self.db['maoyanset']
	
	def get_html(self, url):
		"获取HTML内容"
		req = request.Request(url=url, headers=self.headers)
		res = request.urlopen(req)
		html = res.read().decode()
		# print(html)
		# 直接调用解析函数
		self.parse_html(html)
	
	def parse_html(self, html):
		"提取HTML内容"
		regex = '.*?(.*?).*?(.*?)(.*?).*?类型:(.*?).*?主演:(.*?).*?上映时间:(.*?).*?'
		pattern = re.compile(regex, re.S)
		r_list = pattern.findall(html)
		# 调用数据处理函数
		self.save_html(r_list)
	
	def save_html(self, r_list):
		"数据处理函数"
		for r in r_list:
			item = {}
			item['name'] = r[0].strip()
			item['score'] = r[1].strip() + r[2].strip()
			item['type'] = r[3].strip()
			item['star'] = r[4].strip()
			item['time'] = r[5].strip()
			print(item)
			self.myset.insert_one(item)
			self.i += 1
	
	def run(self):
		"程序运行调配"
		for page in range(0, 91, 10):
			self.get_html(url=self.url.format(page * 30))
			time.sleep(random.randint(1, 2))


if __name__ == '__main__':
	spider = MaoyanSpider_Classic_Film()
	spider.run()
	print('电影数量:', spider.i)


转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/739467.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号