Selenium自动下载qbt数据

2022-05-09 18:53:13 浏览数 (1)

使用 Selenium 模拟浏览器自动下载数据。一开始使用固定休眠时间时,下载几个文件就会被踢下线;改为随机休眠后基本没有问题了。

代码语言:python
from selenium import webdriver
import time,os,shutil
import random
import pandas as pd
import numpy as np

def _random_pause(low, high):
    """Sleep for a random duration between *low* and *high* seconds.

    Every wait between browser actions goes through here so the pauses are
    randomised rather than fixed.
    """
    time.sleep(random.uniform(low, high))


def _store_download(tmp_path, target_path):
    """Move the downloaded file from *tmp_path* to *target_path*.

    Polls while the first directory entry still carries a "crdownload"
    marker (Chrome's in-progress download suffix), then moves that entry.

    Returns:
        True when a file was moved, False when nothing could be moved
        (the reason is printed).
    """
    try:
        # NOTE: only the first entry is inspected, so this assumes tmp_path
        # holds nothing but the download currently in flight.
        names = os.listdir(tmp_path)
        while names and "crdownload" in names[0]:
            _random_pause(4, 5)
            names = os.listdir(tmp_path)
        if not names:
            # Report the empty directory explicitly instead of letting an
            # IndexError ("list index out of range") surface.
            print("下载目录为空,未找到已下载的文件")
            return False
        # shutil.move also works when tmp_path and the target live on
        # different drives/filesystems, where os.rename would fail.
        shutil.move(os.path.join(tmp_path, names[0]), target_path)
    except OSError as e:
        print(e)
        return False
    return True


def _download_shop(drive, row, date_xl, tmp_path, data_path):
    """Download the hourly statistics of one shop for every requested date.

    Args:
        drive: the logged-in Chrome webdriver.
        row: 1-based index of the shop's row in the shop table.
        date_xl: table with a "日期" column; one download is made per value.
        tmp_path: directory the browser downloads into.
        data_path: directory the renamed downloads are moved to.
    """
    # Go back to the shop list and open the hourly-statistics page of this row.
    drive.find_element_by_xpath('//ul/li/a[text()="店铺分析"]').click()
    _random_pause(2, 3)
    drive.find_element_by_xpath('//*[@id="tag-flag"]/tr[%s]/td[12]/div/a[6]/img[@title="分时统计"]'%row).click()
    _random_pause(3, 4)
    # Select "no" for the listing-time option (per the original comment).
    drive.find_element_by_xpath('//*[@id="f1"]/div[1]/div/div[2]/input[2]').click()
    _random_pause(2, 3)

    # Total shown for the previously queried date; an unchanged total is taken
    # as a sign that the result table has not refreshed yet.
    total = ""
    shop = drive.find_element_by_xpath('/html/body/div[2]/div[2]/a[2]').text
    for date in date_xl["日期"].astype(str):
        # Query a single day: start date and end date are both set to `date`.
        start_date = drive.find_element_by_xpath('//*[@id="start_date_fenshi"]')
        start_date.click()
        start_date.clear()
        start_date.send_keys(date)
        end_date = drive.find_element_by_xpath('//*[@id="end_date_fenshi"]')
        end_date.click()
        end_date.clear()
        end_date.send_keys(date)
        _random_pause(1, 2)
        drive.find_element_by_xpath('//*[@id="f1"]/div[1]/div/div[6]/input[@value="检索"]').click()
        _random_pause(4, 5)

        # The grand total is read from column 6 of the last row of the table.
        rows = drive.find_element_by_tag_name("tbody").find_elements_by_tag_name("tr")
        total_sale = drive.find_element_by_xpath('//*[@id="content"]/div[2]/form/table/tbody/tr[%s]/td[6]/center'%len(rows)).text
        if total_sale == total:
            # Same total as the previous date: give the page extra time
            # before downloading anyway.
            print("可能未加载完成,请稍等!")
            _random_pause(4, 5)
        drive.find_element_by_xpath('//*[@id="f1"]/p/input[@value="CSV下载"]').click()
        total = total_sale
        _random_pause(4, 5)

        # The file is stored as <date><shop>.xls, the naming the original used
        # (the download button itself is labelled "CSV下载").
        new_file = os.path.join(data_path, date + shop + ".xls")
        if _store_download(tmp_path, new_file):
            print("-----%s下载已完成-----"%(date + shop))
    print("---%s下载完成,即将开始下载下一个店铺------"%shop)


def bitalk_log(username,password,date_xl,tmp_path,data_path):
    """Log in to qbt.ecdataway.com and download per-day shop statistics.

    Opens Chrome with a packed extension loaded, logs in, asks on stdin which
    shop row to start from, then downloads one file per shop and per date and
    moves each into *data_path* under the name ``<date><shop>.xls``.

    Any exception is caught and printed; the browser is always closed on exit.

    Args:
        username: login name typed into the login form.
        password: password typed into the login form.
        date_xl: table (read with pandas by the caller) with a "日期" column.
        tmp_path: directory the browser downloads into.
        data_path: directory the renamed files are moved to.
    """
    drive = None
    try:
        chrome_options = webdriver.ChromeOptions()
        # Load the browser extension; it has to be packed as a .crx file.
        # NOTE(review): the separators of this path look lost (presumably
        # something like F:\...\xxx.crx) - replace with the real .crx path.
        extension_path = r"F:JupyterNotebookxxx.crx"
        chrome_options.add_extension(extension_path)

        # download.default_directory: where downloads are saved.
        # profile.default_content_settings.popups: 0 disables pop-up windows.
        prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory':tmp_path}
        chrome_options.add_experimental_option('prefs', prefs)

        # Start the browser and open the site.
        drive = webdriver.Chrome(chrome_options=chrome_options)
        url = 'http://qbt.ecdataway.com/shop'
        drive.get(url)
        _random_pause(2, 4)

        # Fill in the login form and submit it.
        drive.find_element_by_name("LoginForm[username]").send_keys(username)
        drive.find_element_by_id("LoginForm_password").send_keys(password)
        drive.find_element_by_class_name("login_btn").click()
        # Give the landing page time to load after login.
        _random_pause(4, 6)

        drive.find_element_by_xpath('//ul/li/a[text()="店铺分析"]').click()
        _random_pause(4, 6)
        links = drive.find_element_by_tag_name("tbody").find_elements_by_tag_name("tr")
        shop_num = len(links)
        print("店铺数量 %s "%shop_num)
        start_num = int(input("从第几家店铺开始:"))
        # Table rows are addressed 1-based in the XPath, hence shop_num + 1.
        for i in range(start_num, shop_num + 1):
            _download_shop(drive, i, date_xl, tmp_path, data_path)
    except Exception as e:
        print("出现问题",e)
    finally:
        # Always release the browser and its driver process.
        if drive is not None:
            drive.quit()
# Script configuration: fill these in before running.
username=""  # login name for the site
password=""  # login password for the site
tmp_path=""  # temporary download directory used by the browser
data_path=""  # directory where the renamed data files are stored
# Table listing the dates to download.
date_xl=pd.read_excel("需要爬取得时间序列.xlsx")
bitalk_log(username,password,date_xl,tmp_path,data_path)

0 人点赞