【Scraping】Using Selenium to Scrape Recent Years' University Admission Data from 升学e网通

2022-10-26 16:15:18

Background

After the gaokao scores came out, but before applications were filled in, I assumed the official enrollment plan for the year would only be published once it was actually time to apply, so I wanted to scrape it. Of course, back then I had barely learned any Python, so how was I supposed to scrape anything? (laughs) Having recently gotten a shallow taste of Python web scraping, I suddenly remembered this unfinished business.

In the end, after some back and forth, I did the whole thing with Selenium.

Attached is a faithful record of my meltdowns along the way (^ ^;

Preview of the Results

Output preview:

Requirements

  1. The code was written on Python 3.10 (not sure whether lower versions will have problems)
  2. selenium 3.141.0 (see the version check right after this list)
  3. Chrome browser - Chrome headless mode
  4. A 升学e网通 account with permission to view this content
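
The script relies on the old `find_element_by_*` / `find_elements_by_*` API, which was removed in Selenium 4, so the pinned version matters. A minimal check, assuming Selenium was installed with `pip install selenium==3.141.0`:

Code language: python
import selenium

# The scraper below uses the Selenium 3 find_element_by_* methods,
# which were removed in Selenium 4, so confirm the pinned version first.
print(selenium.__version__)  # expected: 3.141.0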

Code

Code language: python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

def share_browser():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Change path to the location of your own Chrome executable
    path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    chrome_options.binary_location = path
    # Note: the keyword argument is now options (chrome_options is deprecated)
    browser = webdriver.Chrome(options=chrome_options)
    return browser

def get_data():
    # content = browser.page_source
    # print(content)
    # tree = etree.parse(content)
    # base_xpath = tree.xpath('//tr[@class="ant-table-row ant-table-row-level-0"]')
    # for tr_data in base_xpath:
    title = browser.find_elements_by_xpath('//tr[@class="ant-table-row ant-table-row-level-0"]//a')
    major = browser.find_elements_by_xpath('//tr[@class="ant-table-row ant-table-row-level-0"]//div[contains(@class,"majorname")]')
    ben_zhuan = browser.find_elements_by_xpath('//tr[@class="ant-table-row ant-table-row-level-0"]/td[4]/span')
    # Too lazy to scrape the subject requirements
    # 2022, 2021, 2020, 2019
    plan_number_2022_cnt = browser.find_elements_by_xpath('//tr[@class="ant-table-row ant-table-row-level-0"]/td[6]//span[@class="grayer font14"]')

    # Pre-allocate one slot per table row (each page shows up to 15 rows)
    plan_number_2021_rank = [1]*15
    plan_number_2020_rank = [1]*15
    plan_number_2019_rank = [1]*15
    plan_number_2021_score = [1]*15
    plan_number_2020_score = [1]*15
    plan_number_2019_score = [1]*15
    plan_number_2021_cnt = [1]*15
    plan_number_2020_cnt = [1]*15
    plan_number_2019_cnt = [1]*15
    # td[7] holds one div per year (2021/2020/2019), each with three spans: rank, score, count
    rank__ = browser.find_elements_by_xpath('//tr[@class="ant-table-row ant-table-row-level-0"]/td[7]//div/span[1]')
    # print(len(title),len(rank__))
    for i in range(len(title)):
        plan_number_2021_rank[i] = rank__[i*3]
        plan_number_2020_rank[i] = rank__[i*3 + 1]
        plan_number_2019_rank[i] = rank__[i*3 + 2]
    score__ = browser.find_elements_by_xpath('//tr[@class="ant-table-row ant-table-row-level-0"]/td[7]//div/span[2]')
    for i in range(len(title)):
        plan_number_2021_score[i] = score__[i*3]
        plan_number_2020_score[i] = score__[i*3 + 1]
        plan_number_2019_score[i] = score__[i*3 + 2]
    cnt__ = browser.find_elements_by_xpath('//tr[@class="ant-table-row ant-table-row-level-0"]/td[7]//div/span[3]')
    for i in range(len(title)):
        plan_number_2021_cnt[i] = cnt__[i*3]
        plan_number_2020_cnt[i] = cnt__[i*3 + 1]
        plan_number_2019_cnt[i] = cnt__[i*3 + 2]
    # rank score cnt

    # print(type(title))
    # print(type(major))
    # print(type(ben_zhuan))
    # print(type(plan_number_2022_cnt))
    # print(type(plan_number_2021_rank))
    # print(type(plan_number_2021_score))
    # print(type(plan_number_2021_cnt))
    # print(type(rank__))
    for i in range(len(title)):
        fp.write(title[i].text + ' ' + major[i].text + ' ' + ben_zhuan[i].text + ' '
                 + '\n2022' + ' ' + plan_number_2022_cnt[i].text.replace('/', '') + ' '
                 + '\n2021' + ' ' + plan_number_2021_rank[i].text[4:] + ' ' + plan_number_2021_score[i].text + ' ' + plan_number_2021_cnt[i].text + ' '
                 + '\n2020' + ' ' + plan_number_2020_rank[i].text[4:] + ' ' + plan_number_2020_score[i].text + ' ' + plan_number_2020_cnt[i].text + ' '
                 + '\n2019' + ' ' + plan_number_2019_rank[i].text[4:] + ' ' + plan_number_2019_score[i].text + ' ' + plan_number_2019_cnt[i].text)
        fp.write('\n\n')
    time.sleep(2)


def get_next_page():
    next_page = browser.find_element_by_xpath('//li[@title="Next Page"]/a')
    next_page.click()
    time.sleep(4)
    browser.execute_script(js_bottom)
    time.sleep(2)
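
# A possible hardening, sketched here as comments only (not part of the
# original run): the fixed time.sleep() waits could be replaced with
# Selenium's explicit waits, which return as soon as the element is ready:
#
#   from selenium.webdriver.common.by import By
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#
#   WebDriverWait(browser, 10).until(
#       EC.element_to_be_clickable((By.XPATH, '//li[@title="Next Page"]/a')))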

browser = share_browser()

url = 'https://www.ewt360.com/'
browser.get(url)
time.sleep(2)
account_input = browser.find_element_by_id('login__password_userName')
# Enter the account name
account_input.send_keys('***********')
password_input = browser.find_element_by_id('login__password_password')
# Enter the password
password_input.send_keys('******')
time.sleep(2)
# browser.save_screenshot('.\ewt__\ewt1.png')
login_button = browser.find_element_by_xpath('//button[@type="submit"]')
login_button.click()
# browser.save_screenshot('.\ewt__\ewt2.png')
time.sleep(2)
# browser.save_screenshot('.\ewt__\ewt3.png')
reject_bond_phone_button = browser.find_element_by_xpath('//button[@class="ant-btn"]')
reject_bond_phone_button.click()
time.sleep(2)
# browser.save_screenshot('.\ewt__\ewt4.png')
enter_apply_button = browser.find_element_by_xpath('//a[@href="/Apply"][4]')
enter_apply_button.click()
# browser.save_screenshot('.\ewt__\ewt5.png')
time.sleep(5)
known_button = browser.find_element_by_class_name('ant-checkbox')
known_button.click()
# browser.save_screenshot('.\ewt__\ewt6.png')
time.sleep(5)
known_button = browser.find_element_by_xpath('//button[@style]')
known_button.click()
time.sleep(2)
# browser.save_screenshot('.\ewt__\ewt7.png')
time.sleep(2)
known_button = browser.find_element_by_xpath('//button[@class="ant-btn volunteer-modal-btn ant-btn-primary ant-btn-lg"]')
known_button.click()
# browser.save_screenshot('.\ewt__\ewt8.png')
time.sleep(2)
# JS snippet that scrolls to the bottom of the page so the whole table renders
js_bottom = 'document.documentElement.scrollTop=100000'
browser.execute_script(js_bottom)
time.sleep(2)
browser.refresh()
time.sleep(2)
browser.execute_script(js_bottom)
time.sleep(5)
# browser.save_screenshot('.\ewt__\ewt9.png')
# Note: the .\ewt__ directory must already exist
fp = open(r'.\ewt__\data.txt', 'w', encoding='utf-8')

for page in range(300):  # out of 1004 pages in total
    print("page:", page + 1)
    get_data()
    get_next_page()

fp.close()
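
For reference, a minimal sketch of loading the scraped file back in. The read_records helper is hypothetical (not part of the original script) and assumes the record format written by get_data(): a header line followed by one line per year, with records separated by blank lines.

Code language: python
# Hypothetical helper: parse data.txt back into per-school records,
# assuming the format written by get_data() above.
def read_records(path=r'.\ewt__\data.txt'):
    with open(path, encoding='utf-8') as f:
        blocks = [b for b in f.read().split('\n\n') if b.strip()]
    records = []
    for block in blocks:
        lines = block.split('\n')
        # lines[0] is "school major type"; lines[1:] hold one year each
        records.append({'header': lines[0], 'years': lines[1:]})
    return records

for rec in read_records()[:3]:
    print(rec['header'])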

Finally, one more picture on scraping versus anti-scraping:
