请参阅以下关于如何将 Selenium 与 Scrapy 一起使用的代码摘录。爬取速度会变慢,因为你不仅要下载 HTML,还要完整地渲染并访问 DOM。
注意:由于先前提供的链接不再起作用,因此我已复制粘贴此代码段。
# Snippet imported from snippets.scrapy.org (which no longer works)
import time

from scrapy.contrib.spiders import CrawlSpider, Rule
# BUG FIX: the class is SgmlLinkExtractor (capital L), not "SgmllinkExtractor".
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
# BUG FIX: Item was used in parse_page but never imported.
from scrapy.item import Item
from selenium import selenium


class SeleniumSpider(CrawlSpider):
    """Crawl ``*.html`` links with Scrapy, then re-open each page through a
    Selenium RC session so that JavaScript-generated content is also available.

    Requires a Selenium RC server listening on localhost:4444.
    """

    name = "SeleniumSpider"
    start_urls = ["http://www.domain.com"]
    rules = (
        Rule(SgmlLinkExtractor(allow=('.html', )), callback='parse_page', follow=True),
    )

    def __init__(self):
        CrawlSpider.__init__(self)
        self.verificationErrors = []
        # Start one browser session for the spider's whole lifetime.
        self.selenium = selenium("localhost", 4444, "*chrome", "http://www.domain.com")
        self.selenium.start()

    def __del__(self):
        # Shut down the Selenium session and report any accumulated errors.
        self.selenium.stop()
        print(self.verificationErrors)
        # BUG FIX: removed CrawlSpider.__del__(self) — CrawlSpider defines no
        # __del__, so that call raised AttributeError during teardown.

    def parse_page(self, response):
        """Scrape a page twice: via Scrapy's downloaded HTML and via Selenium's
        fully rendered DOM. Yields a (placeholder) Item.
        """
        item = Item()
        hxs = HtmlXPathSelector(response)
        # Do some XPath selection with Scrapy (static HTML only)
        hxs.select('//div').extract()

        sel = self.selenium
        sel.open(response.url)
        # Wait for javscript to load in Selenium
        time.sleep(2.5)
        # Do some crawling of javascript created content with Selenium
        sel.get_text("//div")
        yield item


