Scrapy：自定义图像管道以重命名默认的图像文件名

在 pipelines.py 中：

from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.http import Request
from PIL import Image
from cStringIO import StringIO
import re


class jellyImagesPipeline(ImagesPipeline):
    """Images pipeline that renames the stored full-size image.

    The name is taken from the item's ``image_name`` field, attached to
    each download request through ``Request.meta`` and then used to
    rewrite the storage key yielded by the parent ``get_images``.
    """

    # Matches keys of the form ``full/<hex digits>.jpg``; only those keys
    # are renamed. The dot is escaped and the character class is limited to
    # hexadecimal digits so nothing else can match by accident.
    CONVERTED_ORIGINAL = re.compile(r'^full/[0-9a-f]+\.jpg$')

    def get_media_requests(self, item, info):
        """Return one download ``Request`` per URL in ``item['image_urls']``.

        Name information comes from the spider, in each item; it is added
        to the individual image requests through the ``meta`` dictionary.
        Raises ``KeyError`` if the item has no ``image_name`` value.
        """
        print("get_media_requests")
        # NOTE(review): every URL of one item receives the same image_name,
        # so an item with several URLs would map them all to one file name.
        return [Request(x, meta={'image_name': item["image_name"]})
                for x in item.get('image_urls', [])]

    def get_images(self, response, request, info):
        """Yield ``(key, image, buf)`` tuples, renaming the full-size key.

        This is where the image is extracted from the HTTP response. Keys
        that do not match ``CONVERTED_ORIGINAL`` are passed through as-is.
        """
        print("get_images")
        parent_images = super(jellyImagesPipeline, self).get_images(
            response, request, info)
        for key, image, buf in parent_images:
            if self.CONVERTED_ORIGINAL.match(key):
                key = self.change_filename(key, response)
            yield key, image, buf

    def change_filename(self, key, response):
        """Build the new storage key from the first ``image_name`` entry.

        ``key`` is accepted for interface symmetry but is not used.
        """
        return "full/%s.jpg" % response.meta['image_name'][0]

在 settings.py 中，确保有以下设置：

# Enable the custom images pipeline.
# NOTE(review): the list form matches the legacy Scrapy API used by this
# example; newer Scrapy versions may expect a dict of path -> order — confirm
# against the installed version.
ITEM_PIPELINES = ['jelly.pipelines.jellyImagesPipeline']

# Directory where downloaded images are stored; replace with a real path.
IMAGES_STORE = '/path/to/where/you/want/to/store/images'

Spider 示例：从 Python.org 主页获取图像，已保存图像的名称（和路径）将遵循站点结构，即位于名为 www.python.org 的文件夹中。

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import urlparse


class CustomItem(Item):
    """Item carrying image URLs plus the name to store the image under."""

    image_urls = Field()
    # Declared as ``image_name`` so it matches the key the spider assigns
    # below; assigning to an undeclared field raises KeyError.
    image_name = Field()
    images = Field()


class ImageSpider(BaseSpider):
    """Spider that emits one ``CustomItem`` per ``<img>`` on the start page."""

    name = "customimg"
    allowed_domains = ["www.python.org"]
    start_urls = ['http://www.python.org']

    def parse(self, response):
        """Return a list of items, one for each ``<img>`` element found.

        Each item's ``image_urls`` holds the element's ``src`` values
        resolved against the response URL.
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//img')
        items = []
        for site in sites:
            item = CustomItem()
            item['image_urls'] = [urlparse.urljoin(response.url, u)
                                  for u in site.select('@src').extract()]
            # the name information for your image
            item['image_name'] = ['whatever_you_want']
            items.append(item)
        return items

Scrapy：自定义图像管道以重命名默认的图像文件名

面试问答相关栏目本月热门文章