Scraping Wallpapers

2022-09-06 14:28:26

This crawler targets the beauty wallpapers on the 4K wallpaper site (4kbizhi.com). The site's page structure is fairly simple, so the main goals of this exercise are to learn how to parse pages with BeautifulSoup (bs), and to handle saving the data when what is scraped is binary rather than text.

The final, working code:

import os
import time
import requests
from bs4 import BeautifulSoup
#import _thread
# Output directory for the scraped images
out_dir = './pictures/'
root_url = 'https://www.4kbizhi.com/meinv/'
headers = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37',
    #'Cookie': 'csrfToken=fxB64yKN6YmKp2x6IBImOond; global_cookie=4qsim60u3xw9srizbptt3nh3q1yl5377g9v; engine_source_cookie=bing; sf_source=bing; city=xian; lastscanpage=0; g_sourcepage=esf_fy^xq_pc; unique_cookie=U_4qsim60u3xw9srizbptt3nh3q1yl5377g9v*62',
}
# Record pages whose images failed to be saved
fail_url = []
def get_url(url):
    """
    获取html文件
    """
    response = requests.get(url,   headers=headers, verify=True)
    print(f"开始爬取{url}")
    return response
def parse_page(response):
    """
    提取当页中所有所需数据的存储位置以及下一爬取网页
    """
    from bs4 import BeautifulSoup
    # 解析页面提取页面内的房源url
    html = response.content.decode('gbk')
    img_url_dict={}
    next_url=None
    bs = BeautifulSoup(html, 'html.parser')
    #bs.find_all('a',attrs={'target':'_blank'})
    result = bs.find_all('img')
    for i in result:
        img_name = i.get('alt')
        img_url = 'https://www.4kbizhi.com/' + i.get('src')
        # Skip interfering images (e.g. .gif)
        if img_url.split('.')[-1] in ['jpg', 'png']:
            img_url_dict[img_name]=img_url
    next_url = bs.find_all(attrs={'class': 'next'})
    if next_url:
        return img_url_dict, 'https://www.4kbizhi.com/' + next_url[0].get('href')
    else:
        return img_url_dict, None
def save(response, out_dir, img_name):
    """
    Save a scraped image to disk
    """
    img_url = response.url
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    with open(out_dir + img_name + '.' + img_url.split('.')[-1], 'wb') as fp:
        fp.write(response.content)
def parse_data_page(response, img_name):
    """
    Save the required data (the image itself)
    """
    try:
        save(response, out_dir, img_name)
        print(f"Wallpaper {response.url} saved successfully")
    except Exception:
        fail_url.append(response.url)
        #save([response.url], 'fail_url.csv', ['url'])  # a plain string would be split apart when written this way
        print(f"Failed to save {response.url}, please check it later; {len(fail_url)} pages have failed so far")
def main(url):
    response = get_url(url)
    img_url_dict, next_url = parse_page(response)
    while next_url:
        for img_name, img_url in img_url_dict.items():
            response = get_url(img_url)
            parse_data_page(response, img_name)
        response = get_url(next_url)
        img_url_dict, next_url = parse_page(response)
        if not next_url:  # also parse and save the last page
            for img_name, img_url in img_url_dict.items():
                response = get_url(img_url)
                parse_data_page(response, img_name)
if __name__ == '__main__':
    start = time.time()
    main(root_url)
    with open('fail_url.csv', 'w') as f:
        for i in fail_url:
            f.write(i + '\n')
    end = time.time()
    print("运行程序花费了%s秒" % (end-start))

A few points to note:

  • The site's pages are GBK-encoded. To decode the Chinese text correctly, first take the raw bytes of the HTML (response.content) and then decode them as 'gbk'; the page's own encoding can be seen from its properties (for example the <meta charset> tag), as in the sketch after this list.
  • Saving the downloaded images is essentially saving binary files: what gets written to disk is just response.content, and the file must be opened in binary write mode ('wb') when calling open.
  • Pay attention to BeautifulSoup's parsing rules (find_all by tag name or by attribute); a small self-contained example follows below.
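
A minimal sketch of the first two points, assuming the same listing page as above (the image URL at the end is a placeholder, not one taken from the site):

import requests

url = 'https://www.4kbizhi.com/meinv/'
resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})

# requests guesses the encoding from the response headers; apparent_encoding sniffs the body itself
print(resp.encoding, resp.apparent_encoding)

# Decode the raw bytes ourselves so the Chinese text comes out correctly
html = resp.content.decode('gbk')

# Saving an image is just writing raw bytes: open with 'wb' and write response.content
img = requests.get('https://www.4kbizhi.com/small/example.jpg',  # placeholder URL
                   headers={'User-Agent': 'Mozilla/5.0'})
if img.ok:
    with open('example.jpg', 'wb') as fp:
        fp.write(img.content)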
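
And a small self-contained illustration of the two BeautifulSoup lookups the crawler relies on; the HTML snippet is made up to mirror the structure the code above expects:

from bs4 import BeautifulSoup

# Made-up snippet mirroring the listing page's structure
html = '''
<img src="/small/girl1.jpg" alt="girl1">
<img src="/uploads/loading.gif" alt="loading">
<a class="next" href="/meinv/index_2.html">next page</a>
'''

bs = BeautifulSoup(html, 'html.parser')

# find_all by tag name: every <img> on the page
for img in bs.find_all('img'):
    print(img.get('alt'), img.get('src'))

# find_all by attribute: the "next page" link
nxt = bs.find_all(attrs={'class': 'next'})
if nxt:
    print(nxt[0].get('href'))   # -> /meinv/index_2.html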
