Selenium爬取京东商品案例

2020-02-13 15:06:35 浏览数 (1)

Selenium爬取京东衣服评论

16/10

周三 晴

需要的模块:

代码语言:python
from selenium import webdriver
from lxml import etree
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
import csv

下面是对主程序第六行(即 webdriver.Chrome 指定 chromedriver 驱动路径的那一行)的说明:

安装说明:
ChromeDriver浏览器驱动的安装:(注意浏览器版本:)
首先查看当前谷歌Chrome浏览器的版本V61~V67(对应2.35~2.38),再到下面网址下载
网址:https://chromedriver.storage.googleapis.com/index.html
Windows安装:将解压的文件:chromedriver.exe 放置到Python的Scripts目录下。
Mac/Linux安装:将解压的文件:chromedriver 放置到/usr/local/bin/目录下
代码语言:python
class JD_spider(object):
    """Crawl review text for JD.com clothing search results and save it to CSV.

    Constructing an instance runs the whole crawl: the search page is opened in
    Chrome, every product's comment link is visited, each comment page is
    parsed, and the collected texts are written to ``评论.csv`` in the current
    working directory. The browser is closed when the crawl ends or fails.

    Attributes (set in ``__init__``):
        driver: the Selenium Chrome driver used for all page loads.
        data: list of ``{"commit": <text>}`` dicts, one per collected comment.
        url: the search-result URL the crawl starts from.
        baseurl: scheme prefix prepended to the comment links found on the page.
    """

    def __init__(self):
        options = webdriver.ChromeOptions()  # browser settings
        # options.add_argument('--headless')  # uncomment to run without opening a browser window
        # Do not load images, to speed up page loads.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # NOTE(review): this driver path is machine-specific and looks like it lost its
        # path separators somewhere; point it at your local chromedriver before running.
        self.driver = webdriver.Chrome(r'D:外安装软件selenium1chromedriver_win32chromedriver.exe', options=options)  # browser driver
        self.data = []  # collected comments, one dict per comment
        # NOTE(review): "page=num" is sent literally; it is never replaced by a page number.
        self.url = 'https://search.jd.com/Search?keyword=衣服&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=衣服&psort=4&page=num'  # target site
        self.baseurl = 'https:'  # scheme prefix joined onto the comment links below
        try:
            self.driver.get(self.url)  # open the search page
            source = self.driver.page_source  # rendered page HTML
            self.url_(source)  # hand the HTML to the link extractor
        finally:
            # Always shut the browser down so no Chrome/chromedriver process is left behind.
            self.driver.quit()

    # Extract the comment URL of every product on the search page
    def url_(self, source):
        """Visit the comment page of each product listed in *source*.

        Args:
            source: HTML of a search-result page.
        """
        html = etree.HTML(source)
        for li in html.xpath('//li[@class="gl-item"]'):
            hrefs = li.xpath('.//div[@class="p-commit"]/strong/a/@href')
            if not hrefs:
                # Item without a comment link: skip it instead of raising IndexError.
                continue
            self.parse_commit(self.baseurl + hrefs[0])
            time.sleep(1)  # short pause between products

    # Parse every comment page of one product until the last page
    def parse_commit(self, url):
        """Open *url* and collect comments page by page.

        Paging stops when the pager link carries the ``ui-page-curr`` class or
        when the link does not appear within 20 seconds.

        Args:
            url: address of a product's comment page.
        """
        # Imported locally so the module-level import block does not need to change.
        from selenium.common.exceptions import TimeoutException

        # presumably the "next page" link of the comment pager - TODO confirm against the live page
        next_xpath = '//*[@id="comment-0"]/div[13]/div/div/a[7]'
        self.driver.get(url)
        while True:
            self.parse_index(self.driver.page_source)
            try:
                # until() returns the located element, so no second lookup is needed.
                next_page = WebDriverWait(self.driver, timeout=20).until(
                    ec.presence_of_element_located((By.XPATH, next_xpath)))
            except TimeoutException:
                # No pager link showed up: treat this as "no further pages".
                break
            # get_attribute returns only the attribute value (or None), never 'class="..."'.
            if 'ui-page-curr' in (next_page.get_attribute('class') or ''):
                break
            self.driver.execute_script("arguments[0].click()", next_page)
            time.sleep(7)  # give the next page of comments time to load

    # Collect the comments of one page
    def parse_index(self, source):
        """Append every comment text found in *source* to ``self.data``.

        The CSV file is rewritten once after the page has been processed, and
        only if at least one comment was added.

        Args:
            source: HTML of a comment page.
        """
        html = etree.HTML(source)
        added = False
        for commit_item in html.xpath('//div[@class="comment-item"]'):
            texts = commit_item.xpath('.//p[@class="comment-con"]/text()')
            if not texts:
                # Comment block without text: nothing to record.
                continue
            commit = texts[0]
            print(commit)
            self.data.append({"commit": commit})
            added = True
        if added:
            # Persist once per page rather than rewriting the file for every comment.
            self.csv_()

    # Save as a csv file
    def csv_(self):
        """Write all collected comments to ``评论.csv``, replacing any existing file."""
        headers = ['commit']
        with open('评论.csv', 'w', newline="", encoding='utf-8') as fb:
            writer = csv.DictWriter(fb, headers)
            writer.writeheader()
            writer.writerows(self.data)


# Script entry point: creating the spider is what starts the crawl.
if __name__ == "__main__":
    tb = JD_spider()

END

0 人点赞