我最终能够正常运行的代码如下:
spider.py:
import scrapy
import re
try:
    import urlparse  # Python 2
except ImportError:
    # Python 3 moved this module; alias it so `urlparse.urljoin` keeps working.
    import urllib.parse as urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem
from production.items import ImageItem


class productionSpider(scrapy.Spider):
    """Follow one link from the start page and emit an item per image found.

    ``parse`` looks up a single anchor on the start page and requests its
    target; ``parseImages`` then yields one ``ImageItem`` for every ``<img>``
    element on that target page.
    """

    name = "production"
    # NOTE(review): these look like placeholder values. Scrapy rejects start
    # URLs without a scheme ("http://..."), and follow-up requests outside
    # `allowed_domains` are filtered as off-site -- confirm the real values.
    allowed_domains = ["url"]
    start_urls = [
        "startingurl.com"
    ]

    def parse(self, response):
        """Request the page linked from the start page.

        Yields:
            scrapy.Request: one request per ``<body>`` whose follow link is
            present, handled by ``parseImages``. A fresh ``ProductionItem``
            travels along in ``meta['item']``.
        """
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            # NOTE(review): `idd` is not a standard HTML attribute -- confirm
            # it is intentional and not a typo for `id`.
            # The XPath starts with `//`, so it searches the whole document,
            # not just `sel`.
            img_url = sel.xpath('//a[@idd="followclaslink"]/@href').extract_first()
            if not img_url:
                # extract_first() returns None when nothing matches; skip
                # instead of crashing with IndexError as `.extract()[0]` did.
                self.logger.warning("No follow link found on %s", response.url)
                continue
            yield scrapy.Request(
                urlparse.urljoin(response.url, img_url),
                callback=self.parseImages,
                meta={'item': item},
            )

    def parseImages(self, response):
        """Yield an ``ImageItem`` for each ``<img>`` that has a ``src``.

        Yields:
            ImageItem: ``image_urls`` holds a single absolute URL.
        """
        for elem in response.xpath("//img"):
            img_url = elem.xpath("@src").extract_first()
            if not img_url:
                # <img> without a usable src: nothing to download.
                continue
            # The image pipeline needs absolute URLs; resolve relative `src`
            # values against the page URL (absolute ones pass through as-is).
            yield ImageItem(image_urls=[urlparse.urljoin(response.url, img_url)])

Settings.py
BOT_NAME = 'production'

# Where Scrapy looks for spiders, and where `genspider` creates new ones.
SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'

# NOTE(review): Scrapy documents this setting as a dotted path to an item
# *class* (default 'scrapy.item.Item'); 'production.items' is a module path.
# Confirm which class was intended.
DEFAULT_ITEM_CLASS = 'production.items'

ROBOTSTXT_OBEY = True

# Directory used by the images pipeline to store downloaded files.
IMAGES_STORE = '/Users/home/images'

DOWNLOAD_DELAY = 2

# Only Scrapy's built-in images pipeline is enabled here.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}

# Disable cookies (enabled by default)

items.py
# -*- coding: utf-8 -*-
import scrapy


class ProductionItem(scrapy.Item):
    """Item carrying a single ``img_url`` field."""

    img_url = scrapy.Field()


# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    """Item with the two fields the images pipeline uses by default."""

    image_urls = scrapy.Field()
    images = scrapy.Field()


class ImageItem(scrapy.Item):
    """Item describing images to download.

    ``image_urls`` and ``images`` are the default input/result fields of
    Scrapy's images pipeline. ``image_paths`` is declared so that a pipeline
    assigning ``item['image_paths']`` does not fail with ``KeyError`` --
    ``scrapy.Item`` rejects assignment to undeclared fields.
    """

    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_paths = scrapy.Field()

pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that records stored file paths on the item.

    Items for which no image download succeeded are dropped.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``."""
        urls = item['image_urls']
        for url in urls:
            yield scrapy.Request(url)

    def item_completed(self, results, item, info):
        """Store the paths of successful downloads in ``item['image_paths']``.

        Args:
            results: iterable of ``(ok, value)`` pairs; for successful entries
                ``value['path']`` is read.
            item: the item being processed; it must accept an
                ``image_paths`` key.
            info: unused here.

        Returns:
            The same item, with ``image_paths`` set.

        Raises:
            DropItem: if no entry in ``results`` succeeded.
        """
        stored = []
        for succeeded, outcome in results:
            if succeeded:
                stored.append(outcome['path'])
        if len(stored) == 0:
            raise DropItem("Item contains no images")
        item['image_paths'] = stored
        return item


