我认为,仅靠 Splash 无法很好地处理这种特殊情况。
以下是一个可行的思路:
- 使用 selenium 和 PhantomJS 无头浏览器登录到网站
- 将浏览器 cookie 从 PhantomJS 传递到 Scrapy
代码:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class BboSpider(scrapy.Spider):
    """Log in to bridgebase.com with a headless browser, then crawl with Scrapy.

    The login form is submitted through Selenium driving PhantomJS; the
    cookies of that browser session are then attached to the first Scrapy
    request so that it is issued as the logged-in user.
    """

    name = "bbo"
    allowed_domains = ["bridgebase.com"]
    login_page = "http://www.bridgebase.com/myhands/myhands_login.php?t=%2Fmyhands%2Findex.php%3F"

    def start_requests(self):
        """Perform the browser login and yield the first authenticated request.

        Raises:
            selenium.common.exceptions.TimeoutException: if the post-login
                link does not appear within 10 seconds.
        """
        # NOTE(review): PhantomJS support is deprecated in newer Selenium
        # releases -- confirm the installed version still provides it.
        driver = webdriver.PhantomJS()
        try:
            driver.get(self.login_page)

            # Placeholder credentials; replace with real ones.
            driver.find_element(By.ID, "username").send_keys("user")
            driver.find_element(By.ID, "password").send_keys("password")
            driver.find_element(By.NAME, "submit").click()

            # Debugging aid: snapshot of the page right after submitting.
            driver.save_screenshot("test.png")

            # Wait for a link that is looked up by its text; the original
            # answer treats its presence as the sign that login completed.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.LINK_TEXT, "Click here for results of recent tournaments")
                )
            )

            cookies = driver.get_cookies()
        finally:
            # Always shut the browser down, even when the wait times out,
            # so no PhantomJS process is left behind.
            driver.quit()

        yield scrapy.Request(
            "http://www.bridgebase.com/myhands/index.php",
            cookies=cookies,
        )

    def parse(self, response):
        """Log whether the login worked and print the raw page body."""
        # response.body is bytes, so compare against a bytes literal
        # (works on both Python 2 and Python 3).
        if b"recent tournaments" in response.body:
            self.log("Login successful")
        else:
            self.log("Login failed")

        print(response.body)


# Running the spider prints:
Login successful和“手”页面的HTML。



