Selenium: driving a browser to download data automatically. With a fixed sleep interval between actions, the account was kicked offline after downloading only a few files; after switching to randomized sleep intervals the problem largely went away.
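The fix is simply to draw each pause from a range rather than sleeping a fixed number of seconds, so the click cadence looks less mechanical. A minimal sketch of the pattern used throughout the script below (the random_sleep helper name is my own, for illustration only):

import random, time

def random_sleep(low, high):
    # sleep for a random number of seconds drawn uniformly from [low, high)
    time.sleep(random.uniform(low, high))

# e.g. instead of a fixed time.sleep(5) between page actions:
random_sleep(4, 6)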
from selenium import webdriver
import time,os,shutil
import random
import pandas as pd
import numpy as np
def bitalk_log(username, password, date_xl, tmp_path, data_path):
    # Open the login page
    try:
        chrome_options = webdriver.ChromeOptions()
        # Load the browser extension; it has to be packed into a .crx file
        # with Chrome's developer mode first
        extension_path = r"F:\JupyterNotebook\xxx.crx"
        chrome_options.add_extension(extension_path)
        '''
        download.default_directory: sets the download path
        profile.default_content_settings.popups: 0 disables pop-up windows
        '''
        # Set the download path
        prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': tmp_path}
        chrome_options.add_experimental_option('prefs', prefs)
        # Launch the browser
        drive = webdriver.Chrome(chrome_options=chrome_options)
        url = 'http://qbt.ecdataway.com/shop'
        drive.get(url)
        # Random sleep
        tm = random.uniform(2, 4)
        time.sleep(tm)
        # Locate the username field and type in the username
        user = drive.find_element_by_name("LoginForm[username]")
        user.send_keys(username)
        # Locate the password field and type in the password
        pwd = drive.find_element_by_id("LoginForm_password")
        pwd.send_keys(password)
        # Click the login button to log in
        drive.find_element_by_class_name("login_btn").click()
        # After logging in the site redirects to the home page; sleep while it loads
        tm = random.uniform(4, 6)
        time.sleep(tm)
        drive.find_element_by_xpath('//ul/li/a[text()="店铺分析"]').click()
        tm = random.uniform(4, 6)
        time.sleep(tm)
        links = drive.find_element_by_tag_name("tbody").find_elements_by_tag_name("tr")
        shop_num = len(links)
        print("Number of shops: %s" % shop_num)
        start_num = int(input("Start from shop number: "))
        for i in range(start_num, shop_num + 1):
            drive.find_element_by_xpath('//ul/li/a[text()="店铺分析"]').click()
            tm = random.uniform(2, 3)
            time.sleep(tm)
            drive.find_element_by_xpath('//*[@id="tag-flag"]/tr[%s]/td[12]/div/a[6]/img[@title="分时统计"]' % i).click()
            tm = random.uniform(3, 4)
            time.sleep(tm)
            drive.find_element_by_xpath('//*[@id="f1"]/div[1]/div/div[2]/input[2]').click()  # "filter by listing time": select "No"
            tm = random.uniform(2, 3)
            time.sleep(tm)
            total = ""
            shop = drive.find_element_by_xpath('/html/body/div[2]/div[2]/a[2]').text  # shop name
            for date in date_xl["日期"].astype(str):
                start_date = drive.find_element_by_xpath('//*[@id="start_date_fenshi"]')
                start_date.click()
                start_date.clear()  # clear the start date
                start_date.send_keys(date)  # fill in the start date
                end_date = drive.find_element_by_xpath('//*[@id="end_date_fenshi"]')
                end_date.click()
                end_date.clear()
                end_date.send_keys(date)
                tm = random.uniform(1, 2)
                time.sleep(tm)
                drive.find_element_by_xpath('//*[@id="f1"]/div[1]/div/div[6]/input[@value="检索"]').click()
                tm = random.uniform(4, 5)
                time.sleep(tm)
                lk = drive.find_element_by_tag_name("tbody").find_elements_by_tag_name("tr")
                lk_num = len(lk)
                total_sale = drive.find_element_by_xpath('//*[@id="content"]/div[2]/form/table/tbody/tr[%s]/td[6]/center' % lk_num).text  # grand-total amount in the last row
                # if the total equals the previous query's total, the results may not have refreshed yet
                if total_sale != total:
                    drive.find_element_by_xpath('//*[@id="f1"]/p/input[@value="CSV下载"]').click()
                else:
                    print("The page may not have finished loading, waiting a bit longer!")
                    tm = random.uniform(4, 5)
                    time.sleep(tm)
                    drive.find_element_by_xpath('//*[@id="f1"]/p/input[@value="CSV下载"]').click()
                total = total_sale
                tm = random.uniform(4, 5)
                time.sleep(tm)
                try:
                    f = os.listdir(tmp_path)[0]
                    while "crdownload" in f:
                        # print("---download not finished, please wait---")
                        tm = random.uniform(4, 5)
                        time.sleep(tm)
                        f = os.listdir(tmp_path)[0]
                    # path of the freshly downloaded file
                    old_file = os.path.join(tmp_path, f)
                    # destination path for the renamed file
                    new_file = os.path.join(data_path, date + shop + ".xls")
                    os.rename(old_file, new_file)  # move/rename the file
                    print("-----%s downloaded-----" % (date + shop))
                except Exception as e:
                    print(e)
            print("---%s finished, moving on to the next shop------" % shop)
    except Exception as e:
        print("Something went wrong:", e)
username = ""   # login username
password = ""   # login password
tmp_path = ""   # temporary download directory
data_path = ""  # directory where the renamed data files are stored
date_xl = pd.read_excel("需要爬取得时间序列.xlsx")  # Excel file listing the dates to crawl (expects a "日期" column)
bitalk_log(username, password, date_xl, tmp_path, data_path)
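If the hard-coded sleeps ever prove too short for slow page loads, Selenium's explicit waits can poll for an element instead of pausing blindly; the random pauses above are still needed to pace the downloads. A rough sketch against the same "店铺分析" link, assuming a 10-second timeout:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the menu link to become clickable, then click it
wait = WebDriverWait(drive, 10)
link = wait.until(EC.element_to_be_clickable((By.XPATH, '//ul/li/a[text()="店铺分析"]')))
link.click()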