Selenium爬取京东衣服评论
16/10
周三 晴
需要的模块:
代码语言:python
from selenium import webdriver
from lxml import etree
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
import csv
下面是对主程序第六行(创建 webdriver.Chrome 浏览器驱动的那一行)的说明:
ChromeDriver浏览器驱动的安装(注意浏览器版本):
首先查看当前谷歌Chrome浏览器的版本:V61~V67 对应 ChromeDriver 2.35~2.38;确认版本后,再到下面的网址下载对应版本的驱动。
网址:https://chromedriver.storage.googleapis.com/index.html
Windows安装:将解压的文件:chromedriver.exe 放置到Python的Scripts目录下。
Mac/Linux安装:将解压的文件:chromedriver 放置到/usr/local/bin/目录下
class JD_spider(object):
    """Scrape buyer comments for JD.com search results and save them to CSV.

    Constructing an instance runs the whole crawl: it opens the search page
    in Chrome, follows the comment link of every listed product, pages
    through the comments and rewrites ``评论.csv`` after each parsed page.
    """

    # XPath of the pager link that is waited for, inspected and clicked
    # while paging through a product's comments.
    NEXT_PAGE_XPATH = '//*[@id="comment-0"]/div[13]/div/div/a[7]'

    def __init__(self):
        """Start Chrome, load the search page and run the crawl."""
        options = webdriver.ChromeOptions()  # browser settings
        # Uncomment to keep the browser window from opening.
        # options.add_argument('--headless')
        # Do not load page images, to speed things up.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # NOTE(review): this path contains no directory separators; they look
        # lost in transcription. Point it at the real chromedriver location.
        self.driver = webdriver.Chrome(r'D:外安装软件selenium1chromedriver_win32chromedriver.exe', options=options)
        self.data = []  # one {"commit": text} dict per scraped comment
        # NOTE(review): "page=num" looks like an unfilled page-number
        # placeholder; kept unchanged - confirm the intended value.
        self.url = 'https://search.jd.com/Search?keyword=衣服&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=衣服&psort=4&page=num'
        self.baseurl = 'https:'  # scheme prepended to scraped comment links
        try:
            self.driver.get(self.url)
            source = self.driver.page_source
            self.url_(source)
        finally:
            # Always shut the browser down so Chrome/chromedriver processes
            # are not left running, even when the crawl fails part-way.
            self.driver.quit()

    def _comment_url(self, href):
        """Return the absolute comment URL for a scraped ``href`` value."""
        return self.baseurl + href

    @staticmethod
    def _is_last_page(class_attr):
        """Return True when the pager link's class marks it as current.

        ``class_attr`` is the value of the link's ``class`` attribute and may
        be ``None`` when the attribute is absent.
        """
        return 'ui-page-curr' in (class_attr or '')

    def url_(self, source):
        """Find each product's comment link in ``source`` and crawl it.

        Args:
            source: HTML text of the search-result page.
        """
        html = etree.HTML(source)
        for li in html.xpath('//li[@class="gl-item"]'):
            hrefs = li.xpath('.//div[@class="p-commit"]/strong/a/@href')
            if not hrefs:
                # Entry without a comment link: nothing to crawl.
                continue
            self.parse_commit(self._comment_url(hrefs[0]))
            time.sleep(1)

    def parse_commit(self, url):
        """Open a product's comment page and parse every page of comments.

        Paging stops when the pager link is marked as the current page or
        when the link does not show up within the wait timeout.

        Args:
            url: Absolute URL of the product's comment page.
        """
        from selenium.common.exceptions import TimeoutException

        self.driver.get(url)
        while True:
            self.parse_index(self.driver.page_source)
            try:
                # until() returns the located element, so no second lookup
                # is needed.
                next_page = WebDriverWait(self.driver, timeout=20).until(
                    ec.presence_of_element_located((By.XPATH, self.NEXT_PAGE_XPATH)))
            except TimeoutException:
                # No pager link appeared: treat this as the last page
                # instead of aborting the whole crawl.
                break
            if self._is_last_page(next_page.get_attribute('class')):
                break
            # Click through JavaScript rather than a native click.
            self.driver.execute_script("arguments[0].click()", next_page)
            time.sleep(7)  # give the next page of comments time to load

    def parse_index(self, source):
        """Collect the comment texts found in ``source`` and save them.

        Args:
            source: HTML text of a product page showing comments.
        """
        html = etree.HTML(source)
        for item in html.xpath('//div[@class="comment-item"]'):
            texts = item.xpath('.//p[@class="comment-con"]/text()')
            if not texts:
                # Comment block without text: skip it.
                continue
            commit = texts[0]
            print(commit)
            self.data.append({"commit": commit})
        self.csv_()

    def csv_(self):
        """Write every collected comment to ``评论.csv``, replacing the file."""
        headers = ['commit']
        with open('评论.csv', 'w', newline="", encoding='utf-8') as fb:
            writer = csv.DictWriter(fb, headers)
            writer.writeheader()
            writer.writerows(self.data)
if __name__ == '__main__':
    # Instantiating the spider runs the whole crawl from its constructor.
    tb = JD_spider()
END