分享点以前做的项目,攒点项目经验
前面讲了怎么做scrapy-redis的配置,这次做个采集的实例
网站是彼岸图网:https://pic.netbian.com/4kmeinv/
采集上面这个链接分类下的所有图片,分析网站是148页,读取下一页的链接前往下一页,读取上面的图片链接进入详情取大图的地址
中途停了以后也可以在命令行重新输入命令继续
再次运行它还是从那里开始接着继续
数据库存储大图的标题和下载url
建表语句为
CREATE TABLE `db1`.`pictable` ( `id` INT NOT NULL AUTO_INCREMENT, `title` VARCHAR(100) NOT NULL COMMENT '图片标题', `picurl` VARCHAR(150) NOT NULL COMMENT '图片地址', PRIMARY KEY (`id`)) COMMENT = '图片地址存储表';items文件
items文件中就放置下面两个字段即可,id字段是自动生成的
class MyItem(scrapy.Item):
    """Item carrying one picture's title and download URL.

    Only these two fields are declared; the ``id`` column is generated
    by MySQL's AUTO_INCREMENT.
    """
    title = scrapy.Field()   # picture title (maps to pictable.title)
    picurl = scrapy.Field()  # full-size image URL (maps to pictable.picurl)
管道处理
管道主要执行启动和关闭数据库,并对数据做简单查重后存入数据库中
import pymysql
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class MyPipeline:
    """Item pipeline: opens/closes the MySQL connection with the spider's
    lifecycle and inserts each item after a simple duplicate check."""

    def open_spider(self, spider):
        # One connection per spider run; paired with close_spider below.
        self.client = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='sa123456',
            database='db1',
            charset='utf8'
        )
        self.cursor = self.client.cursor()

    def close_spider(self, spider):
        self.cursor.close()
        self.client.close()

    def process_item(self, item, spider):
        """Insert the item unless an identical (title, picurl) row exists.

        Raises DropItem for duplicates; returns the item otherwise.
        """
        title = item['title']
        picurl = item['picurl']
        print('保存%s' % title)
        # Parameterized queries instead of f-string interpolation: scraped
        # titles may contain quotes, which would break the hand-built SQL
        # and open the pipeline to SQL injection.
        sql = "select * from pictable where title=%s and picurl=%s"
        rows = self.cursor.execute(sql, (title, picurl))
        if rows == 0:
            sql2 = "insert into pictable(title,picurl) values (%s,%s)"
            self.cursor.execute(sql2, (title, picurl))
            print('新增了一条数据')
            self.client.commit()
        else:
            print(f'数据({title},{picurl})已存在')
            # Give DropItem a reason so Scrapy's log shows why it was dropped.
            raise DropItem(f'duplicate item: {title}')
        return item
基本设置
设置文件中做好如下设置,并注意好管道的开放,UA的调整
# 分布式的配置
SCHEDULER = "scrapy_redis.scheduler.Scheduler"  # 调度
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # 去重
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'  # 任务的优先级别
SCHEDULER_PERSIST = True
# SCHEDULER_FLUSH_ON_START = True
# 若 redis 有密码,格式为 redis://user:password@IP:port/db
REDIS_URL = 'redis://你的redis'
主爬虫程序
parse函数是默认的处理函数,处理类似于每页的链接
imgdownload函数处理每个页面显示图片背后的链接
downloader处理图片文件的下载
import scrapy
import logging
from scrapy.selector import Selector
from scrapy_redis.spiders import RedisSpider
from my.items import MyItem
# NOTE(review): this spider block is garbled in the source. The paste stripped
# all indentation, and from the xpath(...) call onward the code has been
# spliced with an unrelated HTTP request-headers dict ('sec-fetch-*',
# 'accept-encoding', ...). The real parse() body (extracting picture detail
# links and the next-page link, per the surrounding prose) is missing and
# must be recovered from the original post.
class StaSpider(RedisSpider):
# Distributed spider: with RedisSpider the start URL is pushed into Redis
# (key '<name>:start_urls' by scrapy-redis convention) instead of being
# hard-coded, which is what lets a stopped crawl resume where it left off.
name = 'sta'
# allowed_domains = ['*']
baseurl = 'http://pic.netbian.com'
# start_urls = ['https://pic.netbian.com/4kmeinv/']
def parse(self, response):
print('开始一个爬虫')
logging.warning('开始一个爬虫')
# print(response.text)
selector=Selector(text=response.text)
# --- garbled from here: the xpath argument is fused with a headers dict ---
pics=selector.xpath('/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'sec-fetch-dest': 'document',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
'connection':'keep-alive',
}
这是在默认中间件调用的,位置是
请求头和用户代理的中间件的默认排序值可以在 scrapy 的 default_settings 文件中找到
如果通过自定义中间件来设置随机UA,放在headers设置前面的话,可能会有什么问题
目前这里没看出来覆盖的情况
这里就是把默认的UA中间件代码拿过来,自己写一个随机UA的类,然后在request处理函数中设置UA即可
from collections import defaultdict, deque
import logging
import pprint
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.defer import process_parallel, process_chain, process_chain_both
logger = logging.getLogger(__name__)
import random
class MyUA:
    """Factory for randomized Chrome-style User-Agent strings."""

    # NOTE: these class attributes are drawn once at class-definition time.
    # They are kept only for backward compatibility; get_ua() now draws
    # fresh numbers on every call (the original returned the same Chrome
    # version for the whole process, defeating the point of a random UA).
    first_num = random.randint(55, 62)
    third_num = random.randint(0, 3200)
    fourth_num = random.randint(0, 140)
    os_type = [
        '(Windows NT 6.1; WOW64)', '(Windows NT 10.0; WOW64)',
        '(X11;Linux x86_64)', '(Macintosh; Intel Mac OS X 10_12_6)'
    ]
    chrome_version = 'Chrome/{}.0.{}.{}'.format(first_num, third_num,
                                                fourth_num)

    @classmethod
    def get_ua(cls):
        """Return a User-Agent string with a random platform and a random
        Chrome version (major 55-62, build 0-3200, patch 0-140)."""
        version = 'Chrome/{}.0.{}.{}'.format(random.randint(55, 62),
                                             random.randint(0, 3200),
                                             random.randint(0, 140))
        return ' '.join(['Mozilla/5.0', random.choice(cls.os_type),
                         'AppleWebKit/537.36', '(KHTML, like Gecko)',
                         version, 'Safari/537.36'])
class RandomUAMiddleware(object):
    """Downloader middleware that stamps a random User-Agent on each request.

    It ignores the USER_AGENT setting entirely and overwrites whatever UA
    an earlier middleware (or the request itself) already carries.
    """

    def process_request(self, request, spider):
        # Returning None lets the request continue through the chain with
        # the freshly generated UA header in place.
        request.headers[b'User-Agent'] = MyUA.get_ua()
        return None
代理服务器
如果是代理服务器proxies的话,可以在相关网站购买能用的ip,并用中间件处理,如下所示使用的是代理ip之后的请求
ip的使用是有时限的,建议增加一段程序,动态地从IP网站重新提取ip
代码参考:
设置中要有PROXIES和HTTPPROXY_ENABLED两项,前一个是列表,后一个是布尔值
我购买的代理IP有提取接口,即时提取一定数量,取到的是ip:端口的形式,要根据形式调整格式,最后提交给request的是http://ip:port的形式
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
from collections import defaultdict
from scrapy import signals
from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectionRefusedError, TimeoutError
class RandomProxyMiddleware:
    """Downloader middleware that assigns a random proxy from the PROXIES
    setting and evicts proxies that are repeatedly banned or unreachable."""

    def __init__(self, settings):
        # 2. Read configuration and set up per-proxy failure counters.
        self.proxies = settings.getlist('PROXIES')
        self.stats = defaultdict(int)
        # A proxy is evicted after this many 401/403 responses.
        self.max_failed = 3

    @classmethod
    def from_crawler(cls, crawler):
        # 1. Build the middleware; bail out unless HTTPPROXY_ENABLED is set.
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        # 3. Attach a random proxy to every request that does not already
        #    carry one (start URLs are fetched directly).
        #    BUGFIX: the original split this condition across two lines with
        #    no continuation, which was a SyntaxError; the multi-line
        #    condition is now wrapped in parentheses. The per-request debug
        #    prints of the whole proxy list were dropped as log noise.
        if (self.proxies and not request.meta.get('proxy')
                and request.url not in spider.start_urls):
            request.meta['proxy'] = 'http://' + random.choice(self.proxies)

    def process_response(self, request, response, spider):
        # 4. Called on a completed download: count ban-like statuses
        #    (401/403) against the proxy that served the request.
        cur_proxy = request.meta.get('proxy')
        if response.status in (401, 403):
            self.stats[cur_proxy] += 1
            print('%s got wrong code %s times' % (cur_proxy, self.stats[cur_proxy]))
            if self.stats[cur_proxy] >= self.max_failed:
                print('got wrong http code (%s) when use %s'
                      % (response.status, cur_proxy))
                # Treat the proxy as banned: drop it from the pool and
                # reschedule the request so it gets a different proxy.
                self.remove_proxy(cur_proxy)
                del request.meta['proxy']
                return request
        return response

    def process_exception(self, request, exception, spider):
        # Called on a failed download: a refused connection or timeout while
        # using a proxy means the proxy is dead — drop it and retry.
        cur_proxy = request.meta.get('proxy')
        if cur_proxy and isinstance(exception, (ConnectionRefusedError, TimeoutError)):
            print('error (%s) occur when use proxy %s' % (exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            del request.meta['proxy']
            return request

    def remove_proxy(self, proxy):
        # Remove *proxy* from the pool if still present (safe to call twice).
        if proxy in self.proxies:
            self.proxies.remove(proxy)
            print('remove %s from proxy list' % proxy)



