在 pipelines.py 中:
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.http import Request
from PIL import Image
from cStringIO import StringIO
import re


class jellyImagesPipeline(ImagesPipeline):
    """Images pipeline that renames the full-size image using a spider-supplied name.

    The spider puts an ``image_name`` value on each item. That value is
    attached to every image download request through the request ``meta``
    dictionary, and is read back from the response when the image key
    (its relative storage path) is produced.
    """

    # Keys of the form "full/<lowercase hex digits>.jpg".
    # NOTE(review): presumably the default hash-based key the parent pipeline
    # assigns to the full-size image -- confirm against the Scrapy version in
    # use. Keys that do not match (e.g. thumbnails) are left untouched.
    CONVERTED_ORIGINAL = re.compile(r'^full/[0-9a-f]+\.jpg$')

    def get_media_requests(self, item, info):
        """Return one download Request per URL in ``item['image_urls']``.

        Each request carries ``item['image_name']`` in its ``meta`` dict under
        the ``'image_name'`` key. Raises ``KeyError`` if the item has no
        ``image_name`` value. An item without ``image_urls`` yields no requests.
        """
        # Parenthesised single-argument form prints the same text on Python 2.
        print("get_media_requests")
        image_name = item["image_name"]
        return [
            Request(url, meta={'image_name': image_name})
            for url in item.get('image_urls', [])
        ]

    def get_images(self, response, request, info):
        """Yield ``(key, image, buf)`` tuples from the parent pipeline.

        Tuples whose key matches ``CONVERTED_ORIGINAL`` get their key replaced
        by the result of ``change_filename``; all others pass through as-is.
        """
        print("get_images")
        parent_images = super(jellyImagesPipeline, self).get_images(
            response, request, info)
        for key, image, buf in parent_images:
            if self.CONVERTED_ORIGINAL.match(key):
                key = self.change_filename(key, response)
            yield key, image, buf

    def change_filename(self, key, response):
        """Return the replacement key ``"full/<name>.jpg"``.

        ``<name>`` is the first element of ``response.meta['image_name']``, so
        the spider is expected to supply a sequence (e.g. a one-element list).
        The incoming ``key`` is not used.

        NOTE(review): every request built from one item carries the same
        ``image_name``, so an item with several image URLs maps them all to
        the same key -- confirm this is intended.
        """
        return "full/%s.jpg" % response.meta['image_name'][0]


# In settings.py, make sure you have:
# Enable the custom images pipeline (dotted path to the pipeline class).
ITEM_PIPELINES = ['jelly.pipelines.jellyImagesPipeline']

# Placeholder path -- replace with the directory where images should be stored.
IMAGES_STORE = '/path/to/where/you/want/to/store/images'
Spider 示例:从 Python.org 主页获取图像,已保存图像的名称(和路径)将遵循站点结构,即位于名为 www.python.org 的文件夹中。
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import urlparse


class CustomItem(Item):
    """Item carrying image URLs plus the name to store the image under."""

    # Absolute URLs of the images to download.
    image_urls = Field()
    # Name information for the stored image; the spider below assigns a
    # one-element list. Must be declared here, otherwise assigning
    # item['image_name'] raises KeyError.
    image_name = Field()
    # Not set by this spider.
    # NOTE(review): presumably filled in by the images pipeline -- confirm.
    images = Field()


class ImageSpider(BaseSpider):
    """Example spider that emits one CustomItem per <img> on python.org's home page."""

    name = "customimg"
    allowed_domains = ["www.python.org"]
    start_urls = ['http://www.python.org']

    def parse(self, response):
        """Return a list of CustomItem objects, one per ``<img>`` element.

        Each item's ``image_urls`` holds the element's ``src`` value(s)
        resolved against ``response.url``; an ``<img>`` without ``src``
        produces an item with an empty ``image_urls`` list.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        for img in hxs.select('//img'):
            item = CustomItem()
            # Resolve relative src attributes into absolute URLs.
            item['image_urls'] = [
                urlparse.urljoin(response.url, src)
                for src in img.select('@src').extract()
            ]
            # The name information for your image.
            item['image_name'] = ['whatever_you_want']
            items.append(item)
        return items


