抓取淘宝美食数据

2019-07-19 15:50:51 浏览数 (1)

1.进入淘宝,主页:https://www.taobao.com/

2.搜索:美食,点击搜索

3.得到当前搜索结果中每个商品的:price(价格),location(销售地),shop(商店名称),image(图片),title(商品名称),product_link(商品链接)

4.逐页点击切换,到最后一页停止

代码:

代码语言:python
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import re
from bs4 import BeautifulSoup

# Extra command-line switches handed to the browser driver service below.
# NOTE(review): "--load-images" / "--disk-cache" look like PhantomJS-style
# switches; confirm that chromedriver actually honours them — TODO verify.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=false']
# Module-level globals shared by every function in this script.

# Run Chrome headless (no visible browser window).
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service_args=SERVICE_ARGS, options=options)

# Optionally set the browser window size.
# driver.set_window_size(width=1360, height=768)
# WebDriverWait takes the driver first and the timeout (in seconds) second.
wait = WebDriverWait(driver, 5)


def next_page(page):
    """Jump to result page *page* through the pager's "go to page" form and scrape it.

    The page number is typed into the pager's input box, the submit button is
    clicked, and the function then blocks until the pager highlights *page* as
    the active item before handing over to ``get_product_info``.

    Args:
        page: Page number to switch to (an int; it is typed as text).

    Raises:
        selenium.common.exceptions.TimeoutException: If the pager controls do
            not show up, or the pager never marks *page* as active, within the
            timeout configured on the module-level ``wait``.
    """
    print("正在切换===", page, "页")

    pager_form = "#mainsrp-pager div div div div.form"

    # Wait for the page-number box instead of looking it up immediately: the
    # pager is rendered with the result list and may not be in the DOM yet.
    page_box = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, pager_form + " input"))
    )
    # Clear whatever page number is currently shown, then type the target page.
    page_box.clear()
    page_box.send_keys(str(page))

    go_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, pager_form + " span.btn.J_Submit"))
    )
    go_button.click()

    # Confirm the switch succeeded: the active pager item must show the page number.
    wait.until(
        EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active"),
            str(page),
        )
    )

    # Parse the data of the page we just switched to.
    get_product_info(page)


def get_product_info(page):
    """Parse the result page currently loaded in the browser and print each product.

    Waits until at least one product card is present, snapshots the page
    source, and prints a separator line followed by one dict per product card.
    Fields that a card does not contain are reported as ``None`` instead of
    aborting the whole crawl.

    Args:
        page: Page number, used only for the progress message.

    Raises:
        selenium.common.exceptions.TimeoutException: If no product card appears
            within the timeout configured on the module-level ``wait``.
    """
    print("当前正在解析========", page, "页")

    item_selector = "#mainsrp-itemlist .items .item"

    # Make sure the result list has been rendered before reading the source.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))

    soup = BeautifulSoup(driver.page_source, "lxml")

    # One card per product on the current page.
    for product in soup.select(item_selector):
        print("---" * 100)
        print(_parse_product(product))


def _first_text(node, selector):
    """Return the text of the first element matching *selector* under *node*, or None."""
    match = node.select_one(selector)
    return match.text if match is not None else None


def _first_attr(node, selector, *names):
    """Return the first non-empty attribute in *names* on the first match, or None."""
    match = node.select_one(selector)
    if match is None:
        return None
    for name in names:
        value = match.get(name)
        if value:
            return value
    return None


def _parse_product(product):
    """Build the dict of scraped fields for a single product card."""
    return {
        # Place the item ships from.
        "location": _first_text(product, ".location"),
        # Shop name.
        "shopname": _first_text(product, ".shopname"),
        # Product title.
        "title": _first_text(product, ".title .J_ClickStat"),
        # Product image URL. The key spelling ("iamge") is kept as-is so the
        # printed output stays compatible. Falls back to "src" when "data-src"
        # is absent — presumably images that were already lazy-loaded; verify
        # against the live markup.
        "iamge": _first_attr(product, "img", "data-src", "src"),
        # Link to the product detail page.
        "data_link": _first_attr(product, ".pic-link.J_ClickStat.J_ItemPicA", "href"),
    }


# 得到总页数

def get_total_page():
    """Search Taobao for "美食", scrape the first result page, and return the page count.

    Opens the Taobao home page, submits the search, reads the pager's summary
    element (class ``total``) to find how many result pages exist, and parses
    page 1 through ``get_product_info`` before returning.

    Returns:
        str: The total number of pages as a string of digits; callers convert
        it with ``int()``.

    Raises:
        selenium.common.exceptions.TimeoutException: If the search box or the
            pager summary does not appear within the timeout configured on the
            module-level ``wait``.
        ValueError: If the pager summary text contains no number.
    """
    driver.get("https://www.taobao.com/")

    # Wait for the search box to appear, then type the query.
    search_box = wait.until(EC.presence_of_element_located((By.ID, "q")))
    search_box.clear()
    search_box.send_keys("美食")

    # Click the search button.
    driver.find_element(By.CSS_SELECTOR, ".btn-search").click()

    # The result page loads after the click, so wait for the pager summary
    # rather than reading it immediately.
    total = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "total"))).text

    # Pull the first run of digits out of the summary text.
    match = re.search(r"\d+", total)
    if match is None:
        raise ValueError("no page count found in pager text: {!r}".format(total))
    total_num = match.group()

    # We are already on page 1, so parse it here.
    get_product_info(1)

    return total_num


if __name__ == "__main__":
    # Script entry point: scrape page 1 (done inside get_total_page), then walk
    # the remaining pages. The try/finally guarantees the headless browser is
    # shut down even if scraping fails part-way through.
    try:
        toto_page = get_total_page()
        print("总页数===", toto_page)

        # Pages 2..total inclusive; page 1 has already been parsed.
        for page in range(2, int(toto_page) + 1):
            print(page)
            next_page(page)
    finally:
        # Quit the browser.
        driver.quit()

运行效果:

0 人点赞