本文档简要介绍了NEEA托福考位本地爬虫的使用方法。
This document provides a brief introduction to using the NEEA TOEFL Test Seats Selenium Crawler.
Github: https://github.com/jianqiaomo/NEEA-TOEFL-Testseat-Crawler
https://jqmo.top
https://engineering.nyu.edu/jianqiao-mo
动机 Motivation
NEEA 托福考位网站正在提供着不便的服务。在寻找考位时,我们需要按每个日期,每个城市一个个地搜索考位,
这为那些想尽快找到测试座位的人带来了无法忍受的体验。
为什么不直接以表格形式显示所有考位?
The NEEA TOEFL test seat website, run by the Chinese National Education
Examinations Authority (NEEA), provides an inconvenient service. When looking for a test seat,
we have to search one date and one city at a time, which is an intolerable experience for those
who just want to find a test seat ASAP. Why not display all the test seats in a single table?
Firefox mozilla geckodriver v0.26.0
How to install webdriver
Firefox ≥ 60
pip install selenium
安装方式 Install
Firefox mozilla geckodriver: the default geckodriver path is "C:\Program Files\Mozilla Firefox\geckodriver.exe".
If you want to use a different executable path, start the crawler with --webdriver_path='your path'.
默认Firefox mozilla geckodriver是安装在"C:\Program Files\Mozilla Firefox\geckodriver.exe"路径中,如果你希望使用其他路径,
请使用 --webdriver_path='your path' 来启动爬虫。
default start
python crawler_toefl.py --username='NEEA ID number' --password='password'
When the crawler finishes, you will get a .csv table file. 爬虫完成后将得到.csv表格文件。
Todo:
- faster, test time is 25min 爬虫速度太慢了, 爬完全部数据目前需要25分钟
- headless mode 无界面模式怎么绕开反爬虫?
- Anti anti-crawler when clicking the 'search seats' button 怎么绕开反爬虫?
- online crawler (use a server) 在线爬虫(服务器)
- different modes 用户定制化爬虫
The idea originally came from https://www.jianshu.com/p/2541d918869e, thanks!
Github crawler_toefl.py:
# *_*coding:utf-8 *_*
# test on python 3.6
# thanks https://www.jianshu.com/p/2541d918869e
# version 1.0
# author cambridge.mo@foxmail.com
# month Jul 2020
import os
import csv
import time
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import random
import win32gui
import win32api
import argparse
# Module-level placeholders for the city and test-date lists; the
# ``__main__`` block rebinds both names with the result of get_all_DATE().
CITYS = []
DATES = []
def parse_args(argv=None):
    """Parse the crawler's command-line options.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) lets argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``USERNAME_TF``,
        ``PASSWORD_TF``, ``headless``, ``eager`` and ``webdriver_path``.
    """
    parser = argparse.ArgumentParser(description='TOEFL crawler args')
    # Credentials default to None so that no account secret lives in the
    # source; GetToeflTestInfos prompts for any value that is still None.
    parser.add_argument('--username', dest='USERNAME_TF',
                        type=str, default=None)
    parser.add_argument('--password', dest='PASSWORD_TF',
                        type=str, default=None)
    parser.add_argument('--headless', dest='headless',
                        help='(Not suport in this version) start headless, browser will not display',
                        default=False, action='store_true')
    parser.add_argument('--eager', dest='eager',
                        help='eager mode (unstable!) is faster when loading web-page',
                        default=False, action='store_true')
    # Raw string: the path separators must survive as literal backslashes.
    parser.add_argument('--webdriver_path', dest='webdriver_path',
                        help='set Firefox webdriver path',
                        type=str,
                        default=r"C:\Program Files\Mozilla Firefox\geckodriver.exe")
    # TODO: add a --mode option to select different crawl modes.
    return parser.parse_args(argv)
class GetToeflTestInfos():
    """Selenium/Firefox crawler for the NEEA TOEFL seat-query pages.

    Typical call order (as used by the ``__main__`` script): ``input_infos``
    -> ``get_captcha`` -> ``login`` -> ``find_seat`` -> ``get_all_DATE`` ->
    then, for each city/date pair, set ``CITY``/``DATE`` and call
    ``send_query_condition`` followed by ``save_date``.

    The class uses ``win32gui``/``win32api`` to focus the browser window, so
    it only runs on Windows.

    NOTE(review): several XPath strings below contain the predicate ``[@]``,
    which looks truncated (the attribute test is missing). They are kept
    unchanged here — restore them from the upstream repository.
    """

    def __init__(self):
        """Read CLI options, prompt for missing credentials, start Firefox."""
        args = parse_args()
        self.username = args.USERNAME_TF
        self.password = args.PASSWORD_TF
        if self.username is None:
            self.username = input('请输入账户名 Please enter username:')
        if self.password is None:
            self.password = input('请输入密码 Please enter password:')
        self.index_url = "https://toefl.neea.cn/login"
        self.hwnd = None  # Firefox window handle, set by get_captcha()
        self.option = webdriver.FirefoxOptions()  # for anti-crawler, only FireFox can be used
        self.option.add_argument('--user-agent="Firefox/60.0"')
        if args.headless:
            self.option.add_argument('--headless')  # start 'headless', browser will not display
        if args.eager:
            # Mutates the shared DesiredCapabilities.FIREFOX dict in place.
            # NOTE(review): presumably webdriver.Firefox() picks this up as
            # its default capabilities — confirm for the installed Selenium.
            desired_capabilities = DesiredCapabilities.FIREFOX
            desired_capabilities["pageLoadStrategy"] = "eager"  # eager mode (unstable) is faster when loading web-page
        try:
            self.driver = webdriver.Firefox(executable_path=args.webdriver_path, options=self.option)
        except Exception:
            print("Your webdriver executable path is wrong: Cannot start webdriver.")
            print("Please use --webdriver_path to set webdriver executable path")
            print('See https://github.com/jianqiaomo/NEEA-TOEFL-Testseat-Crawler#%E5%AE%89%E8%A3%85%E6%96%B9%E5%BC%8F-install')
            raise
        self.wait = WebDriverWait(self.driver, timeout=50)
        self.CITY = None  # city currently being queried (set by the caller)
        self.DATE = None  # test date currently being queried (set by the caller)

    def input_infos(self):
        """Open the login page and type the username and password."""
        self.driver.get(self.index_url)
        print("自动输入用户名和密码 Automatically enter username and password")
        # username
        time.sleep(2)
        input_name = self.wait.until(
            EC.presence_of_element_located((By.ID, "userName"))
        )
        input_name.clear()
        input_name.send_keys(self.username)
        # password
        input_pwd = self.wait.until(
            EC.presence_of_element_located((By.ID, "textPassword"))
        )
        input_pwd.clear()
        input_pwd.send_keys(self.password)

    def get_captcha(self):
        """Download the login captcha, show it, and ask the user to type it.

        The image is saved to ``code.png`` in the working directory, opened
        with PIL for manual reading, and deleted once the user has answered.

        Returns:
            The captcha text entered by the user.
        """
        print("等待加载验证码 Loading captcha...")
        # Simulate a click on the captcha input so the site loads the image.
        input_code = self.wait.until(
            EC.element_to_be_clickable((By.ID, "verifyCode"))
        )
        self.hwnd = win32gui.FindWindow('MozillaWindowClass', '首页 - 教育部考试中心托福网上报名 - Mozilla Firefox')
        win32api.keybd_event(27, 0, 0, 0)  # virtual-key code 27 (Escape)
        win32gui.SetForegroundWindow(self.hwnd)
        while True:
            input_code.click()
            time.sleep(4)
            # get captcha link
            src = self.wait.until(
                EC.presence_of_element_located((By.ID, "chkImg"))
            )
            time.sleep(2.5)
            src_url = src.get_attribute("src")
            print(src_url)
            # Check for None first: `'loading' in None` would raise TypeError.
            if src_url is not None and 'loading' not in src_url:
                break
        res = requests.get(src_url)
        time.sleep(1.5)
        with open('code.png', 'wb') as f:
            f.write(res.content)
        # Open local captcha, manually identify
        try:
            im = Image.open('code.png')
            im.show()
            im.close()
        except Exception:
            print('到本地目录打开code.png获取验证码 Go local directory, open code.png to see captcha')
        finally:
            # Ask for the captcha even if the viewer failed, then clean up.
            captcha = input('请输入验证码 Please enter the captcha:')
            os.remove('code.png')
        print('尝试登录中 Logging in...')
        return captcha

    def login(self, code):
        """Submit the login form with captcha ``code``.

        If the logged-in page does not show the username before the wait
        times out (or the check fails), the credentials are re-entered, a new
        captcha is requested and the login is retried recursively.
        """
        input_code = self.wait.until(
            EC.presence_of_element_located((By.ID, "verifyCode"))
        )
        input_code.send_keys(code)
        submit_button = self.wait.until(
            EC.element_to_be_clickable((By.ID, "btnLogin"))
        )
        submit_button.click()
        # Check if the login is successful
        try:
            # NOTE(review): XPath predicate "[@]" looks truncated — confirm.
            success = self.wait.until(
                EC.text_to_be_present_in_element((By.XPATH, '//div[@]/span[2]'), self.username)
            )
            if success:
                print("==登录成功页面 Page Login Success==")
        except Exception:
            # `except Exception` (not bare) so Ctrl-C can still stop the retry.
            self.input_infos()
            code_str = self.get_captcha()
            self.login(code_str)

    def find_seat(self):
        """Navigate to the seat-query page, retrying until it has loaded."""
        print('开始考位查询 Turn to Page Find-Seat')
        # The account id is part of the URL; use the logged-in user's id
        # rather than a hard-coded one.
        seat_url = "https://toefl.neea.cn/myHome/{}/index#!/testSeat".format(self.username)
        success = False
        while not success:
            self.driver.get(seat_url)
            time.sleep(1)
            try:
                # NOTE(review): XPath predicate "[@]" looks truncated — confirm.
                success = self.wait.until(
                    EC.text_to_be_present_in_element((By.XPATH, '//div[@]/h4'), "查询条件")
                )
                if success:
                    print("==考位查询页面 Page Find-Seat==")
            except Exception:
                success = False

    def get_all_DATE(self):
        """Collect every selectable city and test date from the query form.

        Returns:
            ``[CITYS, DATES]``: the city dropdown's text lines and the
            ``value`` attributes of the test-day options, each without its
            first entry.
        """
        time.sleep(1)
        # NOTE(review): a city is selected first, presumably so that the
        # test-day dropdown gets populated — confirm against the site.
        Select(self.driver.find_element_by_id("centerProvinceCity")).select_by_visible_text("上海")
        # The <select> element's text is one option per line; drop the first.
        CITYS = self.driver.find_element_by_id("centerProvinceCity").text.split("\n")[1:]
        all_options = self.driver.find_element_by_id("testDays").find_elements_by_tag_name('option')
        DATES = [option.get_attribute("value") for option in all_options][1:]
        print("已获取全部城市、考试日期 get all test DATE/CITYs")
        return [CITYS, DATES]

    def send_query_condition(self, virgin=False):
        """Select ``self.CITY`` / ``self.DATE`` in the form and click query.

        Args:
            virgin: True for the first query of a session. The browser is
                then brought to the foreground and scrolled randomly before
                clicking, and the click is retried until ``alert_or_success``
                reports a result within 5 seconds.
        """
        Select(self.driver.find_element_by_id("centerProvinceCity")).select_by_visible_text(self.CITY)
        Select(self.driver.find_element_by_id("testDays")).select_by_value(self.DATE)
        if virgin:
            click = False
            while not click:
                try:
                    win32api.keybd_event(27, 0, 0, 0)  # virtual-key code 27 (Escape)
                    win32gui.SetForegroundWindow(self.hwnd)
                    print("正在反-反爬虫, 或许需要您点一下火狐浏览器 Anti anti-crawler, you can click the Firefox browser...")
                    # Scroll down by a random amount and back up again.
                    scroll = random.randint(0, 100)
                    self.driver.execute_script('window.scrollBy(0,%d)' % scroll)
                    time.sleep(1)
                    self.driver.execute_script('window.scrollBy(0,%d)' % -scroll)
                    query_button = self.wait.until(
                        EC.element_to_be_clickable((By.ID, "btnQuerySeat"))
                    )
                    time.sleep(1)
                    query_button.click()
                    click = bool(WebDriverWait(self.driver, timeout=5).until(alert_or_success()))
                except Exception:
                    # `except Exception` (not bare) keeps the loop interruptible.
                    click = False
        else:
            time.sleep(0.2)
            query_button = self.wait.until(
                EC.element_to_be_clickable((By.ID, "btnQuerySeat"))
            )
            query_button.click()

    def save_date(self, i=1):
        """Append the current query result to today's CSV file.

        Writes to ``toefl_<YYYY-MM-DD>_check.csv`` (append mode, UTF-8 with
        BOM). When seats are listed, the two header rows and every body row
        of result table ``i`` are written; otherwise one row with the city,
        the date and a "no seats found" marker is written.

        Args:
            i: 1-based index of the result table to read.
        """
        csv_name = "toefl_{}_check.csv".format(time.strftime('%Y-%m-%d'))
        try:
            # Use the city stored on the instance, not a script-level global.
            # NOTE(review): XPath predicate "[@]" looks truncated — confirm.
            is_success = EC.text_to_be_present_in_element((By.XPATH, '//td[@]'), self.CITY)(
                self.driver)
        except Exception:
            is_success = False
        print('save: 是否有考位 Seats Available ', bool(is_success))
        # `with` guarantees the file is closed even if a wait below times out.
        with open(csv_name, "a+", encoding='utf-8-sig', newline='') as csv_fp:
            writer = csv.writer(csv_fp)
            if is_success:
                # head 1: test date
                boxhead1 = self.wait.until(
                    EC.presence_of_all_elements_located(
                        (By.XPATH, '//table[@][{}]/thead/tr[1]/th/span'.format(i))
                    )
                )
                head1_ls = [head1.text for head1 in boxhead1 if head1.text]
                writer.writerow(head1_ls)
                print(head1_ls)
                # head 2
                boxhead2 = self.wait.until(
                    EC.presence_of_all_elements_located(
                        (By.XPATH, '//table[@][{}]/thead/tr[2]/th'.format(i))
                    )
                )
                head2_ls = [head2.text.replace('\n', '') for head2 in boxhead2]
                writer.writerow(head2_ls)
                print(head2_ls)
                # inquiry form
                items = self.wait.until(
                    EC.presence_of_all_elements_located(
                        (By.XPATH, '//table[@][{}]/tbody/tr'.format(i))
                    )
                )
                for item in items:
                    body_dict = {
                        "test_city": item.find_element_by_xpath('./td[1]').text,
                        "test_venues": item.find_element_by_xpath('./td[2]').text,
                        "test_fee": item.find_element_by_xpath('./td[3]').text,
                        "test_seat": item.find_element_by_xpath('./td[4]').text,
                    }
                    writer.writerow(list(body_dict.values()))
                    print(body_dict)
            else:
                null_line = [self.CITY, self.DATE, "未查询到考位信息"]
                print(null_line)
                writer.writerow(null_line)
class alert_or_success:
    """Wait condition that is true once a seat query has produced an outcome.

    Intended for ``WebDriverWait(...).until(alert_or_success())``. After a
    call, ``is_success`` / ``is_alert`` tell which of the two outcomes was
    seen (both are reset to 0 when neither was).
    """

    def __init__(self):
        self.is_success, self.is_alert = 0, 0

    def __call__(self, driver):
        """Check whether '考位查询结果' or the '未查询到考位信息' alert is shown.

        Returns:
            True if either the result heading or the alert icon was found,
            False otherwise.
        """
        try:
            self.is_success = EC.text_to_be_present_in_element(
                (By.XPATH, '//div[@id="qrySeatResult"]/h4'), "考位查询结果")(driver)
        except Exception:
            # The result heading could not be checked; look for the alert
            # icon instead. `except Exception` (not bare) keeps Ctrl-C working.
            # NOTE(review): XPath predicate "[@]" looks truncated — confirm.
            self.is_alert = EC.visibility_of_element_located(
                (By.XPATH, '//i[@]'))(driver)
        if self.is_success:
            self.is_alert = 0
            return True
        if self.is_alert:
            self.is_success = 0
            return True
        self.is_success, self.is_alert = 0, 0
        return False
if __name__ == "__main__":
    # Log in, enumerate every (date, city) pair and append each query result
    # to the CSV file.
    GetToeflCrawler = GetToeflTestInfos()
    try:
        GetToeflCrawler.input_infos()
        captcha = GetToeflCrawler.get_captcha()
        GetToeflCrawler.login(captcha)
        GetToeflCrawler.find_seat()
        [CITYS, DATES] = GetToeflCrawler.get_all_DATE()
        CITYS.reverse()
        for s_date in DATES:
            for s_city in CITYS:
                GetToeflCrawler.CITY, GetToeflCrawler.DATE = s_city, s_date
                # Only the very first query runs the anti-anti-crawler routine.
                virgin = [s_city, s_date] == [CITYS[0], DATES[0]]
                GetToeflCrawler.send_query_condition(virgin)
                # Block until the page shows either results or the alert.
                WebDriverWait(GetToeflCrawler.driver, timeout=50).until(alert_or_success())
                GetToeflCrawler.save_date(i=1)
    finally:
        # Always close the browser, even when a step above raises.
        GetToeflCrawler.driver.quit()



