selenium爬虫自动抓获NEEA网站TOEFL考位

时间：2023-04-26

NEEA自动考位爬虫 Getting Started with NEEA TOEFL Testseat Crawler

本文档简要介绍了NEEA托福考位本地爬虫的使用方法。
This document provides a brief intro of the usage of NEEA TOEFL Test Seats Selenium Crawler.

Github: https://github.com/jianqiaomo/NEEA-TOEFL-Testseat-Crawler

https://jqmo.top

https://engineering.nyu.edu/jianqiao-mo

动机 Motivation

NEEA 托福考位网站正在提供着不便的服务。在寻找考位时，我们需要按每个日期，每个城市一个个地搜索考位，
这为那些想尽快找到测试座位的人带来了无法忍受的体验。

为什么不直接以表格形式显示所有考位？

The NEEA TOEFL test seat website, supported by the Chinese National Education
Examinations Authority (NEEA), provides an inconvenient service. When looking for a test seat,
we have to search one date and one city at a time, which is an intolerable experience for those
who just want to find a test seat ASAP. Why not display all the test seats in a single table?

安装要求 Requirements

Firefox mozilla geckodriver v0.26.0

How to install webdriver

Firefox ≥ 60

pip install selenium

安装方式 Install

Firefox mozilla geckodriver: the default geckodriver path is "C:\Program Files\Mozilla Firefox\geckodriver.exe".
If you want to use a different executable path, start the crawler with --webdriver_path='your path'.

默认Firefox mozilla geckodriver是安装在"C:\Program Files\Mozilla Firefox\geckodriver.exe"路径中，如果你希望使用其他路径，
请使用 --webdriver_path='your path' 来启动爬虫。

Getting started

default start

python crawler_toefl.py --username='NEEA ID number' --password='password'

When finished, you will get a .csv file. 爬虫完成后将得到.csv表格文件。

Todo:

- faster, test time is 25min 爬虫速度太慢了, 爬完全部数据目前需要25分钟
- headless mode 无界面模式怎么绕开反爬虫?
- Anti anti-crawler when clicking the 'search seats' button 怎么绕开反爬虫?
- online crawler (use a server) 在线爬虫(服务器)
- different modes 用户定制化爬虫

Acknowledgement

This idea originally came from https://www.jianshu.com/p/2541d918869e, thanks!

Github crawler_toefl.py:

# -*- coding: utf-8 -*-
# Tested on Python 3.6
# Thanks to https://www.jianshu.com/p/2541d918869e
# version 1.0
# author cambridge.mo@foxmail.com
# month Jul 2020
#
# NOTE(review): several XPath literals below contain an empty "[@]" predicate
# (e.g. '//div[@]/span[2]'). The attribute tests appear to have been stripped
# when this listing was copied; they are kept verbatim here and must be
# restored from the upstream repository before the crawler can work.
import argparse
import csv
import os
import random
import time

import requests
import win32api
import win32gui
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait

CITYS = []
DATES = []


def parse_args():
    """Parse the crawler's command-line arguments.

    Returns:
        argparse.Namespace with ``USERNAME_TF``, ``PASSWORD_TF``, ``headless``,
        ``eager`` and ``webdriver_path``. The credentials default to ``None``
        so that the caller can prompt for them interactively; no account data
        is embedded in the source.
    """
    parser = argparse.ArgumentParser(description='TOEFL crawler args')
    parser.add_argument('--username', dest='USERNAME_TF', type=str, default=None)
    parser.add_argument('--password', dest='PASSWORD_TF', type=str, default=None)
    parser.add_argument('--headless', dest='headless',
                        help='(Not suport in this version) start headless, browser will not display',
                        default=False, action='store_true')
    parser.add_argument('--eager', dest='eager',
                        help='eager mode (unstable!) is faster when loading web-page',
                        default=False, action='store_true')
    parser.add_argument('--webdriver_path', dest='webdriver_path',
                        help='set Firefox webdriver path', type=str,
                        default=r"C:\Program Files\Mozilla Firefox\geckodriver.exe")
    # TODO: add a '--mode' (int) option to select between crawl modes.
    return parser.parse_args()


class GetToeflTestInfos():
    """Drive a Firefox session through the NEEA TOEFL site and dump seat data to CSV.

    Typical call order (see the ``__main__`` section): ``input_infos`` ->
    ``get_captcha`` -> ``login`` -> ``find_seat`` -> ``get_all_DATE`` -> then, for
    each city/date pair, ``send_query_condition`` followed by ``save_date``.
    """

    def __init__(self):
        """Read the CLI arguments, prompt for missing credentials and start Firefox.

        Raises:
            Exception: whatever ``webdriver.Firefox`` raised when the driver
                could not be started (re-raised after printing a hint).
        """
        args = parse_args()
        self.username = args.USERNAME_TF
        self.password = args.PASSWORD_TF
        if self.username is None:
            self.username = input('请输入账户名 Please enter username:')
        if self.password is None:
            self.password = input('请输入密码 Please enter password:')

        self.index_url = "https://toefl.neea.cn/login"
        self.hwnd = None  # Win32 handle of the Firefox window, set in get_captcha()

        # For anti-crawler reasons, only Firefox can be used.
        self.option = webdriver.FirefoxOptions()
        self.option.add_argument('--user-agent="Firefox/60.0"')
        if args.headless:
            # Start 'headless': the browser will not be displayed.
            self.option.add_argument('--headless')
        if args.eager:
            # Eager mode (unstable) is faster when loading a web page.
            # NOTE(review): this mutates the shared DesiredCapabilities.FIREFOX
            # dict and is never passed to webdriver.Firefox explicitly;
            # presumably the driver copies that dict as its default
            # capabilities - confirm for the installed Selenium version.
            desired_capabilities = DesiredCapabilities.FIREFOX
            desired_capabilities["pageLoadStrategy"] = "eager"

        try:
            self.driver = webdriver.Firefox(executable_path=args.webdriver_path,
                                            options=self.option)
        except Exception:
            print("Your webdriver executable path is wrong: Cannot start webdriver.")
            print("Please use --webdriver_path to set webdriver executable path")
            print('See https://github.com/jianqiaomo/NEEA-TOEFL-Testseat-Crawler#%E5%AE%89%E8%A3%85%E6%96%B9%E5%BC%8F-install')
            raise

        self.wait = WebDriverWait(self.driver, timeout=50)
        self.CITY = None  # city currently being queried
        self.DATE = None  # test date currently being queried

    def input_infos(self):
        """Open the login page and type the username and password."""
        self.driver.get(self.index_url)
        print("自动输入用户名和密码 Automatically enter username and password")
        # username
        time.sleep(2)
        input_name = self.wait.until(
            EC.presence_of_element_located((By.ID, "userName"))
        )
        input_name.clear()
        input_name.send_keys(self.username)
        # password
        input_pwd = self.wait.until(
            EC.presence_of_element_located((By.ID, "textPassword"))
        )
        input_pwd.clear()
        input_pwd.send_keys(self.password)

    def get_captcha(self):
        """Fetch the captcha image, show it, and ask the user to type it.

        The image is downloaded to ``code.png`` in the working directory and
        removed again once the user has answered.

        :return: the captcha text entered by the user
        """
        print("等待加载验证码 Loading captcha...")
        # Simulate a click on the captcha input to make the image load.
        input_code = self.wait.until(
            EC.element_to_be_clickable((By.ID, "verifyCode"))
        )
        self.hwnd = win32gui.FindWindow('MozillaWindowClass',
                                        '首页 - 教育部考试中心托福网上报名 - Mozilla Firefox')
        win32api.keybd_event(27, 0, 0, 0)  # virtual-key code 27 = VK_ESCAPE
        win32gui.SetForegroundWindow(self.hwnd)
        while True:
            input_code.click()
            time.sleep(4)
            # Get the captcha link; retry while it is missing or still a
            # 'loading' placeholder.
            src = self.wait.until(
                EC.presence_of_element_located((By.ID, "chkImg"))
            )
            time.sleep(2.5)
            src_url = src.get_attribute("src")
            print(src_url)
            # Test for None first: "'loading' in None" would raise TypeError.
            if src_url is not None and 'loading' not in src_url:
                break
        res = requests.get(src_url)
        time.sleep(1.5)
        with open('code.png', 'wb') as f:
            f.write(res.content)
        # Open the local captcha image so the user can read it manually.
        try:
            im = Image.open('code.png')
            im.show()
            im.close()
        except Exception:
            print('到本地目录打开code.png获取验证码 Go local directory, open code.png to see captcha')
        finally:
            captcha = input('请输入验证码 Please enter the captcha:')
            os.remove('code.png')
        print('尝试登录中 Logging in...')
        return captcha

    def login(self, code):
        """Submit the captcha and log in, retrying the whole login on failure.

        :param code: captcha text to type into the ``verifyCode`` field
        """
        input_code = self.wait.until(
            EC.presence_of_element_located((By.ID, "verifyCode"))
        )
        input_code.send_keys(code)
        submit_button = self.wait.until(
            EC.element_to_be_clickable((By.ID, "btnLogin"))
        )
        submit_button.click()
        # Check if the login is successful: the username must show up on the page.
        try:
            # NOTE(review): XPath attribute predicate is missing - see file header.
            success = self.wait.until(
                EC.text_to_be_present_in_element((By.XPATH, '//div[@]/span[2]'), self.username)
            )
            if success:
                print("==登录成功页面 Page Login Success==")
        except Exception:
            # Login failed (or timed out): start over with a fresh captcha.
            self.input_infos()
            code_str = self.get_captcha()
            self.login(code_str)

    def find_seat(self):
        """Navigate to the seat-query page, reloading until its heading appears."""
        print('开始考位查询 Turn to Page Find-Seat')
        # The account id is part of the URL, so build it from the login name
        # instead of a hard-coded id.
        seat_url = "https://toefl.neea.cn/myHome/{}/index#!/testSeat".format(self.username)
        success = False
        while not success:
            self.driver.get(seat_url)
            time.sleep(1)
            try:
                # NOTE(review): XPath attribute predicate is missing - see file header.
                success = self.wait.until(
                    EC.text_to_be_present_in_element((By.XPATH, '//div[@]/h4'), "查询条件")
                )
                if success:
                    print("==考位查询页面 Page Find-Seat==")
            except Exception:
                success = False

    def get_all_DATE(self):
        """Read every selectable city and test date from the query form.

        :return: ``[CITYS, DATES]`` - two lists of strings; the first entry of
            each drop-down is dropped
        """
        CITY = "上海"
        time.sleep(1)
        # Select a city first; the date list is read afterwards.
        Select(self.driver.find_element_by_id("centerProvinceCity")).select_by_visible_text(CITY)
        # The element text holds one option per line.
        CITYS = self.driver.find_element_by_id("centerProvinceCity").text.split("\n")
        del CITYS[0]
        all_options = self.driver.find_element_by_id("testDays").find_elements_by_tag_name('option')
        DATES = [option.get_attribute("value") for option in all_options]
        del DATES[0]
        print("已获取全部城市、考试日期 get all test DATE/CITYs")
        return [CITYS, DATES]

    def send_query_condition(self, virgin=False):
        """Select ``self.CITY`` / ``self.DATE`` in the form and click the query button.

        :param virgin: True for the first query of a session. In that case the
            method focuses the browser, scrolls by a random amount and keeps
            clicking until a result or an alert shows up.
        """
        Select(self.driver.find_element_by_id("centerProvinceCity")).select_by_visible_text(self.CITY)
        Select(self.driver.find_element_by_id("testDays")).select_by_value(self.DATE)

        if virgin:
            click = False
            while not click:
                # Catch Exception rather than everything so Ctrl-C can still
                # break out of this retry loop.
                try:
                    win32api.keybd_event(27, 0, 0, 0)  # virtual-key code 27 = VK_ESCAPE
                    win32gui.SetForegroundWindow(self.hwnd)
                    print("正在反-反爬虫, 或许需要您点一下火狐浏览器 Anti anti-crawler, you can click the Firefox browser...")
                    scrool = random.randint(0, 100)
                    self.driver.execute_script('window.scrollBy(0,%d)' % scrool)
                    time.sleep(1)
                    self.driver.execute_script('window.scrollBy(0,%d)' % -scrool)
                    query_button = self.wait.until(
                        EC.element_to_be_clickable((By.ID, "btnQuerySeat"))
                    )
                    time.sleep(1)
                    query_button.click()
                    click = bool(WebDriverWait(self.driver, timeout=5).until(alert_or_success()))
                except Exception:
                    click = False
        else:
            time.sleep(0.2)
            query_button = self.wait.until(
                EC.element_to_be_clickable((By.ID, "btnQuerySeat"))
            )
            query_button.click()

    def save_date(self, i=1):
        """Append the current query result to ``toefl_<YYYY-MM-DD>_check.csv``.

        Writes the two header rows and one row per table body line when seats
        are listed for ``self.CITY``; otherwise writes a single
        "no seats found" row.

        :param i: 1-based index of the result table to read
        """
        csv_name = "toefl_{}_check.csv".format(time.strftime('%Y-%m-%d'))
        # 'with' guarantees the file is closed even if a Selenium call fails.
        with open(csv_name, "a+", encoding='utf-8-sig', newline='') as csv_fp:
            writer = csv.writer(csv_fp)
            try:
                # Use the instance's current city rather than a module global.
                # NOTE(review): XPath attribute predicate is missing - see file header.
                is_success = EC.text_to_be_present_in_element(
                    (By.XPATH, '//td[@]'), self.CITY)(self.driver)
            except Exception:
                is_success = 0
            print('save: 是否有考位 Seats Available ', bool(is_success))

            if not is_success:
                null_line = [self.CITY, self.DATE, "未查询到考位信息"]
                print(null_line)
                writer.writerow(null_line)
                return

            # head 1: test date
            # NOTE(review): the three table XPaths below also lost their
            # attribute predicates - see file header.
            boxhead1 = self.wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//table[@][{}]/thead/tr[1]/th/span'.format(i))
                )
            )
            head1_ls = [head1.text for head1 in boxhead1 if head1.text]
            writer.writerow(head1_ls)
            print(head1_ls)

            # head 2
            boxhead2 = self.wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//table[@][{}]/thead/tr[2]/th'.format(i))
                )
            )
            head2_ls = [head2.text.replace('\n', '') for head2 in boxhead2]
            writer.writerow(head2_ls)
            print(head2_ls)

            # inquiry form
            items = self.wait.until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//table[@][{}]/tbody/tr'.format(i))
                )
            )
            for item in items:
                body_dict = {
                    "test_city": item.find_element_by_xpath('./td[1]').text,
                    "test_venues": item.find_element_by_xpath('./td[2]').text,
                    "test_fee": item.find_element_by_xpath('./td[3]').text,
                    "test_seat": item.find_element_by_xpath('./td[4]').text,
                }
                writer.writerow(body_dict.values())
                print(body_dict)


class alert_or_success:
    """Wait condition: true once a query result heading or an alert icon is shown.

    Intended for ``WebDriverWait(...).until(alert_or_success())``. It checks for
    the heading '考位查询结果'; if that lookup raises, it checks for a visible
    ``<i>`` element instead (presumably the '未查询到考位信息' pop-up icon - the
    XPath lost its attribute predicate, see file header).
    """

    def __init__(self):
        self.is_success, self.is_alert = 0, 0

    def __call__(self, driver):
        """Return True when either the result heading or the alert is present."""
        try:
            self.is_success = EC.text_to_be_present_in_element(
                (By.XPATH, '//div[@id="qrySeatResult"]/h4'), "考位查询结果")(driver)
        except Exception:
            self.is_alert = EC.visibility_of_element_located(
                (By.XPATH, '//i[@]'))(driver)

        if bool(self.is_success):
            self.is_alert = 0
            return True
        elif bool(self.is_alert):
            self.is_success = 0
            return True
        else:
            self.is_success, self.is_alert = 0, 0
            return False


if __name__ == "__main__":
    GetToeflCrawler = GetToeflTestInfos()
    try:
        GetToeflCrawler.input_infos()
        captcha = GetToeflCrawler.get_captcha()
        GetToeflCrawler.login(captcha)
        GetToeflCrawler.find_seat()
        [CITYS, DATES] = GetToeflCrawler.get_all_DATE()
        CITYS.reverse()

        for s_date in DATES:
            for s_city in CITYS:
                GetToeflCrawler.CITY, GetToeflCrawler.DATE = s_city, s_date
                # Only the very first query needs the anti-anti-crawler routine.
                virgin = [s_city, s_date] == [CITYS[0], DATES[0]]
                GetToeflCrawler.send_query_condition(virgin)
                # Block until the page shows either a result or an alert.
                WebDriverWait(GetToeflCrawler.driver, timeout=50).until(alert_or_success())
                GetToeflCrawler.save_date(i=1)
    finally:
        # Always close the browser, even when the crawl aborts with an error.
        GetToeflCrawler.driver.quit()

上一篇：python多线程：主线程、子线程、守护线程、join方法（附源码）

下一篇：Python程序设计学习（一）