Scraping Reviews and Travel Guides from Ctrip and Qunar
A few days ago a friend asked me to scrape reviews of several scenic spots from Ctrip (携程) and Qunar (去哪儿). After digging through a lot of existing code and rewriting it myself, I finally got it working.
At first I planned to simply send requests and parse the responses with BeautifulSoup, but fetching the HTML directly turned out not to work: the response is sometimes a piece of JavaScript, and the final HTML only exists after that JavaScript has executed.
So I used selenium to simulate real browser clicks and navigation, waited until the page was fully rendered, and only then grabbed the HTML for parsing.
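For the waiting step, a fixed sleep works, but selenium can also wait explicitly until a known element has appeared before the HTML is grabbed. Below is a minimal sketch under that assumption; the URL is one of the Ctrip pages used later, and the class name "commentModule" is taken from the page structure at the time and may have changed.

# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get("https://you.ctrip.com/sight/zhongshan233/5631357.html")
# Wait up to 10 s for the comment module to be rendered by the page's JS;
# "commentModule" is the class name observed at the time and may change.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "commentModule"))
)
html = browser.page_source  # now contains the JS-rendered DOM
browser.quit()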
One small hiccup: in the middle of the night Ctrip suddenly changed its HTML structure, which broke the parsing code I had already written, so I had to go back and revise it.
Approach
Tools: selenium + BeautifulSoup (BS for short) + pandas.
The idea is to open a browser with selenium, navigate to the target page, and parse the comments out of the rendered source with BS. Where pagination is needed, selenium is also used to click the next-page button.
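In skeleton form the loop looks roughly like this. This is a simplified sketch of the pattern, not the full script: the URL and selectors are the ones used in the Ctrip code below, and num_pages is a placeholder that the real script computes from the comment count.

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup as BS
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get("https://you.ctrip.com/sight/zhongshan233/5631357.html")
time.sleep(3)  # let the page's JS render the comments

num_pages = 3  # placeholder; the full script computes ceil(comment_count / 10)
comments = []
for i in range(num_pages):
    # hand the rendered HTML from selenium over to BS
    soup = BS(browser.page_source, "html.parser")
    for div in soup.find_all("div", attrs={"class": "commentDetail"}):
        comments.append(div.get_text())
    if i != num_pages - 1:
        # click the next-page button via injected JS (selector from the Ctrip script below)
        browser.execute_script(
            "document.getElementsByClassName('ant-pagination-item-comment')[1].firstChild.click()")
        time.sleep(1)
browser.quit()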
1. Ctrip
The reviews of a scenic spot are paginated (10 reviews per page), and the next page only appears after the next-page button is clicked. So selenium simulates the click to bring up the next page, BS parses the newly displayed reviews, and the cycle repeats until every page has been collected; the reviews are then written to a CSV file with pandas.
The code for scraping the spot's other details (rating, image URLs, etc.) is left in but commented out.
Four spots are scraped here: 岐澳古道, 五桂山, 唐家湾古镇, 会同村.
Code (Python):
# -*- coding: utf-8 -*-
import requests
import io
from bs4 import BeautifulSoup as BS
import time
from selenium import webdriver
from pyquery import PyQuery as pq
import pandas as pd
from math import ceil

"""Scrape the data from the web"""
# Request headers (defined here but not actually used by the selenium flow)
headers = {
    "Origin": "https://piao.ctrip.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
# places = ["zhuhai27"]
# Spot names, used as the output file names
placenames = ["岐澳古道", "五桂山", "唐家湾古镇", "会同村"]
# Base URL on Ctrip
base = "https://you.ctrip.com/sight/"
# suffixUrl = "zhongshan233/5631357.html"
# URL suffixes, one per spot, in the same order as placenames
suffixUrl = ["zhongshan233/5631357.html", "zhongshan233/23029.html", "zhuhai27/1511281.html", "zhuhai27/122391.html"]

# Use selenium to page through the comments
browser = webdriver.Chrome()  # open the browser
for k in range(len(placenames)):
    browser.get(base + suffixUrl[k])  # open the spot's page
    print(placenames[k])
    print(base + suffixUrl[k])
    time.sleep(3)  # give the page time to render
    res = str(pq(browser.page_source))  # grab the rendered page source
    # print(res)
    # Save the fetched HTML to a file for inspection
    with open("3.html", "w", encoding="utf-8") as f:
        f.write(res)
    # Parse it with BS
    soupi = BS(res, "html.parser")
    # 1. Spot introduction
    # vis = soupi.find_all(name="div", attrs={"class": "text_style"})
    # introduce = []
    # for i in range(len(vis)):
    #     introduce.append(vis[i].get_text())
    # 2. Image URLs
    # imgs = []
    # imglinks = soupi.find_all(name="img", attrs={"width": "350"})
    # print(imglinks)
    # for img in imglinks:
    #     imgs.append(img.attrs["src"])
    # 3. Rating
    # score = soupi.find(name="span", attrs={"class": "score"}).b.get_text()
    # scores = []
    # 3.1 add the overall rating
    # scores.append(score)
    # 3.2 add the sub-ratings: scenery, fun, value for money
    # scorelinks = soupi.find(name="dl", attrs={"class": "comment_show"}).find_all(name="dd")
    # for link in scorelinks:
    #     scores.append(link.find(name="span", attrs={"class": "score"}).string)

    # 4. Comments: parse each page with BS and collect the texts
    # 4.1 Work out the number of pages from the comment count in the module title
    pagediv = soupi.find(name="div", attrs={"class": "commentModule normalModule"})
    pageobj = pagediv.find_all(name="div", attrs={"class": "moduleTitle"})
    commentNum = int(str(pageobj[0]).split("(", 1)[1].split(')', 1)[0])
    page = ceil(commentNum / 10)
    # pageobj = soupi.find_all(name="b", attrs={"class": "numpage"})
    print("page =", page)
    # 4.2 Collect the comments page by page
    comments = []
    for i in range(page):
        # Scroll to the bottom so that the comments and the paging buttons are rendered
        js = "window.scrollTo(0,100000)"
        browser.execute_script(js)
        res = str(pq(browser.page_source))  # grab the current page source
        soupi = BS(res, "html.parser")  # parse it with BS
        print("Scraping comment page", i + 1, "...")
        commentlinks = soupi.find_all(name="div", attrs={"class": "commentDetail"})
        for link in commentlinks:
            comments.append(link.get_text())
        # print("page", i, "comments")
        # print(commentlinks)
        # After collecting this page, click the next-page button; the last page is not clicked
        if i != (page - 1):
            browser.execute_script(
                "document.getElementsByClassName('ant-pagination-item-comment')[1].firstChild.click()")
            time.sleep(1)
        # browser.execute_script(
        #     "console.log(document.getElementsByClassName('ant-pagination-item-comment')[1].firstChild)")
        # browser.find_element_by_class_name("ant-btn jumpButton").click()
        # time.sleep(1)
        # refresh the html data after the sleep
        # comments = []
        # commentlinks = soupi.find_all(name="span", attrs={"class": "heightbox"})
        # for link in commentlinks:
        #     comments.append(link.get_text())

    # 5. Assemble the parsed data
    tmp = {}
    # 5.1 spot id
    # 5.2 spot name
    # tmp["name"] = soupi.find(name="div", attrs={"class": "f_left"}).find(name="h1").find_all(name="a")[0].string
    # tmp["name"] = tmp["name"].replace(" ", "").replace("\n", "")
    # 5.3 introduction
    # tmp["introduce"] = introduce
    # 5.4 overall rating
    # tmp["score"] = scores
    # 5.5 location
    # tmp["position"] = soupi.find_all(name="p", attrs={"class": "s_sight_addr"})[0].string
    # tmp["position"] = tmp["position"].replace(" ", "").replace("\n", "").split(":", 1)[1]
    # 5.6 image URLs
    # tmp["img"] = imgs
    # 5.7 grade
    # tmp["grade"] = soupi.find_all(name="span", attrs={"class": "s_sight_con"})[0].get_text()
    # tmp["grade"] = tmp["grade"].replace(" ", "").replace("\n", "")
    # 5.8 comments
    tmp["comment"] = comments
    print(len(tmp["comment"]))
    print("tmp comments:", tmp["comment"])
    # Write the comments to a CSV file; index=False means no index column is written
    df = pd.DataFrame({"评论": tmp["comment"]})
    print("Writing", placenames[k] + ".csv")
    df.to_csv(placenames[k] + ".csv", encoding='utf_8_sig', index=False)
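One fragile spot in the script above is the next-page click, which is done by injecting JavaScript and fails silently if Ctrip renames the class (exactly the kind of overnight change mentioned earlier). If you would rather let selenium locate and click the button itself, a hedged alternative could look like the sketch below; the selector passed in is an assumption about the page structure and would need to be checked against the live page.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def click_next_page(browser, css_selector, timeout=10):
    """Click a pagination button once it is clickable; return False on timeout.
    css_selector is a hypothetical selector for the next-page button and must be
    adapted to the current page structure."""
    try:
        btn = WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector))
        )
    except TimeoutException:
        return False
    browser.execute_script("arguments[0].scrollIntoView();", btn)  # bring it into view
    btn.click()
    return True

# Possible use inside the comment loop, instead of the execute_script() click:
# if not click_next_page(browser, ".ant-pagination-item-comment a"):
#     break  # no clickable next-page button; stop paging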
2. Qunar
Same approach as for Ctrip, plus the scraping of travel guides: the guides are written to txt files and the comments to a CSV file.
Three spots are scraped here: 五桂山, 唐家湾古镇, 会同村.
Code (Python):
# -*- coding: utf-8 -*-
import requests
import io
from bs4 import BeautifulSoup as BS
import time
from selenium import webdriver
from pyquery import PyQuery as pq
import pandas as pd
from math import ceil

"""Scrape the data from the web"""
# Request headers (defined here but not actually used by the selenium flow)
headers = {
    "Origin": "https://piao.ctrip.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}
placenames = ["五桂山", "唐家湾古镇", "会同村"]
# Search pages listing the travel guides for each spot
placeurls = [
    "https://travel.qunar.com/search/all/%E4%BA%94%E6%A1%82%E5%B1%B1",
    "https://travel.qunar.com/search/all/%E5%94%90%E5%AE%B6%E6%B9%BE%E5%8F%A4%E9%95%87",
    "https://travel.qunar.com/search/all/%E4%BC%9A%E5%90%8C%E6%9D%91"
]
# POI pages holding the comments for each spot
commenturls = [
    "https://travel.qunar.com/p-oi713048-wuguishan",
    "https://travel.qunar.com/p-oi10023434-tangjiawanguzhen",
    "https://travel.qunar.com/p-oi8305797-huitongcun",
]

browser = webdriver.Chrome()  # open the browser
for pp in range(len(placenames)):
    # 1. Open the guide search page
    browser.get(placeurls[pp])
    time.sleep(2)
    # 2. Grab the page source
    res = str(pq(browser.page_source))
    time.sleep(2)
    # 3. Parse it with BS
    soup1 = BS(res, "html.parser")
    # 4. Get the list of guides
    strategyList = soup1.find(name="ul", attrs={"class": "b_strategy_list"}).find_all(name="li", attrs={"class": "list_item"})
    # 5. Take each guide's path from its data-url attribute and build the full guide URL
    Links = []
    base = "https://travel.qunar.com/travelbook/note/"
    for i in range(len(strategyList)):
        print(strategyList[i].attrs["data-url"])
        tmpSuffix = str(strategyList[i].attrs["data-url"]).split('/', 2)[2]
        Links.append(base + tmpSuffix)
    # print(Links)
    # 6. Visit each guide URL and collect its content
    for i in range(len(Links)):
        url = Links[i]
        browser.get(url)
        time.sleep(2)
        # Scroll to the bottom to make sure everything is loaded
        js = "window.scrollTo(0,100000)"
        browser.execute_script(js)
        time.sleep(1)
        # Parse the guide page with BS
        ssoup = BS(str(pq(browser.page_source)), "html.parser")
        # 6.1 Locate the guide body
        strategyText = ssoup.find(name="div", attrs={"class": "e_main"})
        # 6.2 Concatenate the guide paragraphs into a single string
        textList = strategyText.find_all(name="p")
        tstr = ""
        for j in range(len(textList)):
            tstr = tstr + str(textList[j].string)
        # paragraphs without .string stringify to "None", so strip that out
        tstr = tstr.replace("None", "").replace(" 1", "")
        # 6.3 Write the guide to a txt file
        with io.open("E:/PyCharmProjects/test1/pachong/file/" + placenames[pp] + str(i) + ".txt", 'w',
                     encoding="utf-8") as f:
            f.write(str(tstr))
        print(tstr)
        time.sleep(1)
    # 7. Open this spot's comment page
    print("url=" + commenturls[pp])
    browser.get(commenturls[pp])
    time.sleep(2)
    # 8. Parse the comment page with BS
    csoup = BS(str(pq(browser.page_source)), "html.parser")
    comments = []
    # 9. Read the total number of comments and derive the page count
    cnum = int(str(csoup.find(name="span", attrs={"class": "num"}).string)[1:-1])
    print("cnum =", cnum)
    page = ceil(cnum / 10)
    print("page =", page)
    print("Scraping comments for", placenames[pp])
    # 10. Walk through all the comment pages
    for j in range(page):
        print("Scraping page", j + 1, "...")
        # 10.1 Scroll to the bottom, then collect the comments on this page
        js = "window.scrollTo(0,100000)"
        browser.execute_script(js)
        time.sleep(2)
        res = str(pq(browser.page_source))  # grab the page source
        csoup = BS(res, "html.parser")  # parse it with BS
        commentList = csoup.find_all(name="div", attrs={"class": "e_comment_content"})
        for i in range(len(commentList)):
            ctext = commentList[i].find_all(name="p")
            tmpstr = ""
            for ii in range(len(ctext)):
                # print(ctext[ii].get_text())
                tmpstr = tmpstr + ctext[ii].get_text()
            # print(tmpstr)
            comments.append(tmpstr)
        # 10.2 Click the next-page button unless this is the last page
        if j != (page - 1):
            browser.execute_script("document.getElementsByClassName('page next')[0].click()")
            time.sleep(1)
    print(comments)
    # 11. Write the comments to a CSV file
    df = pd.DataFrame({"评论": comments})
    print("Writing", placenames[pp] + ".csv")
    df.to_csv("E:/PyCharmProjects/test1/pachong/file2/" + placenames[pp] + ".csv", encoding='utf_8_sig', index=False)
3. Results
1. Ctrip
2. Qunar
4. Summary
Once you are familiar with selenium + BeautifulSoup + pandas, getting this kind of scraping done is fairly straightforward. The original request also included Mafengwo (马蜂窝) reviews, but Mafengwo's anti-scraping measures are relatively strong and none of the methods I tried worked, so in the end only Qunar and Ctrip were scraped. My knowledge is limited and, in the spirit of traditional kung fu, I'll stop at a light touch: take this as a hobby write-up, and please go easy on me.