Python爬虫之携程网笔记二

2022-03-11 13:22:54 浏览数 (1)

续昨天的携程网爬虫笔记。昨天只爬取了当前页酒店列表的基本信息,今天爬取单个酒店的详细信息。内容和昨天差不多,不过多了一些对字典的操作:将两个字符串合并成一个字典,以及将两个数组(列表)合并成一个字典。

生命在于折腾,仅此而已。

代码如下:

代码语言:python
from bs4 import BeautifulSoup
from selenium import webdriver

# Sample hotel used by the demo call at the bottom of the script:
# the hotel id and the detail-page URL that embeds the same id.
hotelid = '436448'
url = 'http://hotels.ctrip.com/hotel/436448.html'

def gethoteldetail(hotelid, url):
    """Load a hotel detail page in Chrome, scrape it, and print the result.

    The page at ``url`` is rendered with Selenium, its HTML is parsed with
    BeautifulSoup (``lxml`` parser), and the following pieces are extracted:

    * text of ``div.special_info``
    * text of the first ``<p>`` inside ``div.htl_room_txt.text_3l``
    * the ``data-real`` attribute of the first element that has one
    * text of every ``<i>`` inside ``div.special_label`` (list)
    * one ``{th text: td text}`` dict per ``<tr data-init>`` row (list)
    * ``<th>``/``<td>`` texts of ``table.detail_extracontent`` zipped into a dict

    Any piece whose markup is missing falls back to an empty value
    (``''``, ``[]`` or ``{}``) instead of raising.

    Args:
        hotelid: Identifier echoed in the printed line; not used for lookup.
        url: Address of the hotel detail page to load.

    Returns:
        None. The scraped values are printed on one line prefixed with
        ``hotel detail=``.
    """
    # NOTE(review): this path contains no separators -- the backslashes were
    # presumably lost when the script was published; confirm the real path.
    driver = webdriver.Chrome(r'D:Python36CodingPycharmProjectstttchromedriver_win32chromedriver.exe')
    try:
        driver.get(url)
        htmlhotelinfo = driver.page_source
    finally:
        # Always shut the browser down, otherwise every call leaks a
        # Chrome/chromedriver process.
        driver.quit()
    xmlhotelinfo = BeautifulSoup(htmlhotelinfo, 'lxml')

    # find() returns None when nothing matches, so the chained call raises
    # AttributeError; that is the "element missing" signal used throughout.
    try:
        hotelspecialinfo = xmlhotelinfo.find("div", class_='special_info').get_text()
    except AttributeError:
        hotelspecialinfo = ''

    try:
        hotelotherinfo = xmlhotelinfo.find("div", class_='htl_room_txt text_3l').find("p").get_text()
    except AttributeError:
        hotelotherinfo = ''

    try:
        hoteltelnum = xmlhotelinfo.find(attrs={"data-real": True})['data-real']
    except (AttributeError, TypeError):
        # Subscripting a None result raises TypeError, not AttributeError.
        hoteltelnum = ''

    try:
        speciallabels = [
            labeltag.get_text()
            for labeltag in xmlhotelinfo.find("div", class_="special_label").find_all("i")
        ]
    except AttributeError:
        speciallabels = []

    try:
        # A row lacking <th> or <td> discards the whole list, as before.
        hotelfacilitys = [
            {row.find("th").get_text(): row.find("td").get_text()}
            for row in xmlhotelinfo.find_all("tr", attrs={"data-init": True})
        ]
    except AttributeError:
        hotelfacilitys = []

    try:
        extratable = xmlhotelinfo.find("table", class_="detail_extracontent")
        hotelextracontenclasss = [th.get_text() for th in extratable.find_all("th")]
        hotelextracontentdetails = [td.get_text() for td in extratable.find_all("td")]
        # Pair headers with cells positionally; zip() stops at the shorter list.
        hotelextracontentsdic = dict(zip(hotelextracontenclasss, hotelextracontentdetails))
    except AttributeError:
        # Fall back to an empty dict (not a set) so the type stays consistent.
        hotelextracontentsdic = {}

    print('hotel detail=', hotelid, hotelspecialinfo, hotelotherinfo, hoteltelnum,
          speciallabels, hotelfacilitys, hotelextracontentsdic)

# Run the scraper once for the sample hotel defined at the top of the script.
gethoteldetail(hotelid=hotelid, url=url)

0 人点赞