背景
高考出分后、填志愿前,我以为官方今年的招生计划只有在填报志愿的时候才会公布,就想着把它爬下来。当然,那个时候python还没怎么学呢,怎么会爬?(笑) 最近浅学了python爬虫,突然就想起这件没有完成的事情了。
然后,经过的话,最后是用selenium完成全程。
然后附上破防实录(^ ^;
运行结果预览
输出内容预览:
运行要求
- 本代码编写在python3.10版本(不确定低版本会不会有问题)
- selenium 3.141.0
- chrome浏览器 - chrome headless mode
- 拥有升学e网通可以浏览这些内容权限的账号
代码
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
def share_browser(chrome_path=r'C:\Program Files\Google\Chrome\Application\chrome.exe'):
    """Create a headless Chrome WebDriver.

    Args:
        chrome_path: Filesystem path of the Chrome executable to drive.
            The default is the usual Windows install location; pass your
            own path if Chrome lives somewhere else.

    Returns:
        A ``selenium.webdriver.Chrome`` instance running in headless mode.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.binary_location = chrome_path
    # The keyword argument is `options`; the older `chrome_options`
    # keyword has been replaced.
    browser = webdriver.Chrome(options=chrome_options)
    return browser
def get_data():
    """Scrape every table row on the current page and append it to ``fp``.

    Reads the module-level ``browser`` (Selenium WebDriver) and writes to
    the module-level ``fp`` (open text file).  For each row the output is::

        <title> <major> <level>
        2022 <planned count>
        2021 <rank> <score> <count>
        2020 <rank> <score> <count>
        2019 <rank> <score> <count>

    followed by a blank line.

    The seventh column is expected to hold three history entries per row
    (2021, 2020, 2019 in that order), so the flat span lists are indexed
    with ``row * 3 + offset``.  NOTE(review): a row with fewer than three
    history entries would shift or overrun that indexing -- confirm
    against the live page.

    Raises:
        IndexError: if one of the per-row element lists is shorter than
            the number of rows found.
    """
    row_xpath = '//tr[@class="ant-table-row ant-table-row-level-0"]'

    title = browser.find_elements_by_xpath(row_xpath + '//a')
    major = browser.find_elements_by_xpath(
        row_xpath + '//div[contains(@class,"majorname")]')
    ben_zhuan = browser.find_elements_by_xpath(row_xpath + '/td[4]/span')
    # The subject-requirement column is intentionally not scraped.
    plan_number_2022_cnt = browser.find_elements_by_xpath(
        row_xpath + '/td[6]//span[@class="grayer font14"]')

    # Flat lists: three consecutive entries (2021, 2020, 2019) per row.
    rank__ = browser.find_elements_by_xpath(row_xpath + '/td[7]//div/span[1]')
    score__ = browser.find_elements_by_xpath(row_xpath + '/td[7]//div/span[2]')
    cnt__ = browser.find_elements_by_xpath(row_xpath + '/td[7]//div/span[3]')

    history_years = ('2021', '2020', '2019')
    for i in range(len(title)):
        fields = [
            title[i].text,
            major[i].text,
            ben_zhuan[i].text,
            '\n2022',
            plan_number_2022_cnt[i].text.replace('/', ''),
        ]
        for offset, year in enumerate(history_years):
            k = i * 3 + offset
            fields += [
                '\n' + year,
                # Drop the first four characters of the rank cell
                # (presumably a fixed label prefix -- TODO confirm).
                rank__[k].text[4:],
                score__[k].text,
                cnt__[k].text,
            ]
        fp.write(' '.join(fields))
        fp.write('\n\n')
    time.sleep(2)
def get_next_page():
    """Advance the result table to the next page.

    Clicks the pager's "Next Page" link on the module-level ``browser``,
    waits for the page to load, then scrolls to the bottom using the
    module-level ``js_bottom`` script.
    """
    browser.find_element_by_xpath('//li[@title="Next Page"]/a').click()
    time.sleep(4)
    browser.execute_script(js_bottom)
    time.sleep(2)
# --- Log in and navigate to the enrolment-plan table ------------------------
# Every step is followed by a fixed sleep so the page can finish loading.
# When debugging in headless mode, browser.save_screenshot(<path>) after a
# step shows what the page looked like at that point.
browser = share_browser()
url = 'https://www.ewt360.com/'
browser.get(url)
time.sleep(2)

# Enter the account name and password (replace the placeholders).
browser.find_element_by_id('login__password_userName').send_keys('***********')
browser.find_element_by_id('login__password_password').send_keys('******')
time.sleep(2)

# Submit the login form.
browser.find_element_by_xpath('//button[@type="submit"]').click()
time.sleep(2)

# Dismiss the dialog shown after login (the original author named this the
# "reject bind phone" button -- confirm against the live page).
browser.find_element_by_xpath('//button[@class="ant-btn"]').click()
time.sleep(2)

# Open the "/Apply" section.
browser.find_element_by_xpath('//a[@href="/Apply"][4]').click()
time.sleep(5)

# Tick the acknowledgement checkbox, then confirm the two follow-up dialogs.
browser.find_element_by_class_name('ant-checkbox').click()
time.sleep(5)
browser.find_element_by_xpath('//button[@style]').click()
time.sleep(2)
time.sleep(2)
browser.find_element_by_xpath(
    '//button[@class="ant-btn volunteer-modal-btn ant-btn-primary ant-btn-lg"]'
).click()
time.sleep(2)

# Scroll to the bottom, refresh, and scroll again before scraping starts.
js_bottom = 'document.documentElement.scrollTop=100000'
browser.execute_script(js_bottom)
time.sleep(2)
browser.refresh()
time.sleep(2)
browser.execute_script(js_bottom)
time.sleep(5)
# --- Scrape the table page by page ------------------------------------------
# `fp` is read by get_data() as a module-level name.  The `with` block closes
# the file even if a page fails, and the `finally` shuts down the headless
# Chrome process, which would otherwise keep running after the script ends.
# The output directory must already exist; open() does not create it.
try:
    # Raw string so the backslashes in the Windows path are taken literally.
    with open(r'.\ewt__\data.txt', 'w', encoding='utf-8') as fp:
        for page in range(300):  # the author notes 1004 pages in total
            print("page:", page + 1)
            get_data()
            get_next_page()
finally:
    browser.quit()