续昨天的携程网爬虫笔记:昨天只爬取了当前页酒店列表的基本信息,今天爬取酒店的详细信息。内容和昨天差不多,不过多了一些对字典的操作:把两个字符串组成一个单键值对的字典,以及用 zip 把两个列表合并成一个字典。
生命在于折腾,仅此而已。
代码如下:
代码语言:python
from bs4 import BeautifulSoup
from selenium import webdriver
# Ctrip hotel ID used for the sample run; it is the numeric part of the URL below.
hotelid = '436448'
# Detail page of that hotel on hotels.ctrip.com.
url = 'http://hotels.ctrip.com/hotel/436448.html'
def gethoteldetail(hotelid, url,
                   driverpath=r'D:Python36CodingPycharmProjectstttchromedriver_win32chromedriver.exe'):
    """Load a hotel detail page in Chrome, scrape it and print the result.

    The page is rendered with Selenium, parsed with BeautifulSoup ('lxml'),
    and the following pieces are extracted and printed on one line:
    the "special_info" text, the first paragraph of the room description,
    the value of the first ``data-real`` attribute (used as the phone
    number), the list of special labels, the list of facility
    ``{class: detail}`` dicts and the "detail_extracontent" table as a dict.
    Any piece that is missing from the page falls back to an empty value
    ('' / [] / {}) instead of raising.

    Args:
        hotelid: Identifier of the hotel; only echoed in the printed line.
        url: Address of the hotel detail page to load.
        driverpath: Location of the chromedriver executable handed to
            ``webdriver.Chrome``.
            NOTE(review): the default contains no path separators - it looks
            like the backslashes were lost somewhere; confirm the real path
            or pass it explicitly.

    Returns:
        None. The scraped values are written to stdout.
    """
    driver = webdriver.Chrome(driverpath)
    try:
        driver.get(url)
        htmlhotelinfo = driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails;
        # otherwise every call leaves a Chrome/chromedriver process behind.
        driver.quit()

    xmlhotelinfo = BeautifulSoup(htmlhotelinfo, 'lxml')

    # find() returns None when nothing matches, so a chained call on a
    # missing node raises AttributeError; that is the "not on this page" case.
    try:
        hotelspecialinfo = xmlhotelinfo.find("div", class_='special_info').get_text()
    except AttributeError:
        hotelspecialinfo = ''

    try:
        hotelotherinfo = xmlhotelinfo.find(
            "div", class_='htl_room_txt text_3l').find("p").get_text()
    except AttributeError:
        hotelotherinfo = ''

    # Subscripting None raises TypeError (not AttributeError), so test the
    # lookup result explicitly instead of relying on an except clause.
    teltag = xmlhotelinfo.find(attrs={"data-real": True})
    hoteltelnum = teltag['data-real'] if teltag is not None else ''

    try:
        labelbox = xmlhotelinfo.find("div", class_="special_label")
        speciallabels = [tag.get_text() for tag in labelbox.find_all("i")]
    except AttributeError:
        speciallabels = []

    # One single-entry dict per facility row: {row header: row content}.
    # A row without <th>/<td> discards the whole list, as before.
    try:
        hotelfacilitys = [
            {row.find("th").get_text(): row.find("td").get_text()}
            for row in xmlhotelinfo.find_all("tr", attrs={"data-init": True})
        ]
    except AttributeError:
        hotelfacilitys = []

    # Pair the table's header cells with its data cells, in document order.
    try:
        extratable = xmlhotelinfo.find("table", class_="detail_extracontent")
        extraclasses = [th.get_text() for th in extratable.find_all("th")]
        extradetails = [td.get_text() for td in extratable.find_all("td")]
        hotelextracontentsdic = dict(zip(extraclasses, extradetails))
    except AttributeError:
        # Empty dict (not set()) so the value has the same type either way.
        hotelextracontentsdic = {}

    print('hotel detail=', hotelid, hotelspecialinfo, hotelotherinfo, hoteltelnum,
          speciallabels, hotelfacilitys, hotelextracontentsdic)
# Run the scraper once for the sample hotel defined above.
gethoteldetail(hotelid=hotelid, url=url)