This crawler targets the girls' wallpaper section of the 4K wallpaper site (4kbizhi.com). The page structure is fairly simple; the main goals here are to practice parsing HTML with BeautifulSoup (bs4) and to handle the case where the scraped data is not text, i.e. how to save binary image data.
The finished code:
```python
import requests
import time
# import _thread

# Output directory for the crawled images
out_dir = './pictures/'
root_url = 'https://www.4kbizhi.com/meinv/'
headers = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37',
    # 'Cookie': 'csrfToken=fxB64yKN6YmKp2x6IBImOond; global_cookie=4qsim60u3xw9srizbptt3nh3q1yl5377g9v; engine_source_cookie=bing; sf_source=bing; city=xian; lastscanpage=0; g_sourcepage=esf_fy^xq_pc; unique_cookie=U_4qsim60u3xw9srizbptt3nh3q1yl5377g9v*62',
}
# Record the pages that failed to parse
fail_url = []
def get_url(url):
    """
    Fetch the resource at the given URL.
    """
    response = requests.get(url, headers=headers, verify=True)
    print(f"Start crawling {url}")
    return response
def parse_page(response):
    """
    Extract the locations of all the data needed on the current page, plus the next page to crawl.
    """
    from bs4 import BeautifulSoup
    # Parse the page and extract the image URLs it contains;
    # the site is GBK-encoded, so decode the raw bytes explicitly
    html = response.content.decode('gbk')
    img_url_dict = {}
    next_url = None
    bs = BeautifulSoup(html, 'html.parser')
    # bs.find_all('a', attrs={'target': '_blank'})
    result = bs.find_all('img')
    for i in result:
        img_name = i.get('alt')
        img_url = 'https://www.4kbizhi.com/' + i.get('src')
        # Filter out interfering images (e.g. .gif)
        if img_url.split('.')[-1] in ['jpg', 'png']:
            img_url_dict[img_name] = img_url
    next_url = bs.find_all(attrs={'class': 'next'})
    if next_url:
        return img_url_dict, 'https://www.4kbizhi.com/' + next_url[0].get('href')
    # elif temp_url[-3:-1] % == 0:
    #     return img_url_dict, None
    else:
        return img_url_dict, next_url
def save(response, out_dir, img_name):
    """
    Save the crawled result to disk.
    """
    import os
    img_url = response.url
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    # Binary data must be written with mode 'wb'
    with open(out_dir + img_name + '.' + img_url.split('.')[-1], 'wb') as fp:
        fp.write(response.content)
def parse_data_page(response, img_name):
    """
    Obtain the required data.
    """
    try:
        save(response, out_dir, img_name)
        print(f"Wallpaper {response.url} saved successfully")
    except:
        fail_url.append(response.url)
        # save([response.url], 'fail_url.csv', ['链接地址'])  # a bare string would be split into characters when written this way
        print(f"Failed to parse {response.url}, check it manually later; {len(fail_url)} pages have failed so far")
def main(url):
    response = get_url(url)
    img_url_dict, next_url = parse_page(response)
    while (next_url) and (next_url.split('.')[0][-1] > '3'):
        for img_name, img_url in img_url_dict.items():
            response = get_url(img_url)
            parse_data_page(response, img_name)
        response = get_url(next_url)
        img_url_dict, next_url = parse_page(response)
    if not next_url:  # parse the data from the last page as well
        for img_name, img_url in img_url_dict.items():
            response = get_url(img_url)
            parse_data_page(response, img_name)
if __name__ == '__main__':
    start = time.time()
    main(root_url)
    with open('fail_url.csv', 'w') as f:
        for i in fail_url:
            f.write(i + '\n')
    end = time.time()
    print("The program took %s seconds to run" % (end - start))
```
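One caveat with the code above: building URLs by string concatenation (`'https://www.4kbizhi.com/' + i.get('src')`) can produce a double slash or break if the site ever serves absolute `src` values. A more robust alternative is `urllib.parse.urljoin`; the minimal sketch below only illustrates the idea, and the example paths are made up:

```python
from urllib.parse import urljoin

base = 'https://www.4kbizhi.com/meinv/'
# urljoin resolves a relative reference against the base URL,
# handling leading slashes and already-absolute URLs correctly.
print(urljoin(base, '/uploads/example.jpg'))           # https://www.4kbizhi.com/uploads/example.jpg
print(urljoin(base, 'index_2.html'))                   # https://www.4kbizhi.com/meinv/index_2.html
print(urljoin(base, 'https://cdn.example.com/a.jpg'))  # absolute URLs pass through unchanged
```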
A few points worth noting:

- The site's pages are GBK-encoded. To get readable Chinese text, first take the raw bytes of the HTML (`response.content`) and then decode them as `'gbk'`. The page's own encoding can be checked from its properties (the charset declared in the page source). See the first sketch after this list.
- Saving the fetched images amounts to saving binary files: the data to write is simply `response.content`, and the file must be opened in binary mode, i.e. pass `'wb'` to `open`. See the second sketch after this list.
- Pay attention to the BeautifulSoup parsing rules (which tags and attributes `find_all` matches).
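A minimal sketch of the first point, assuming the listing page really is served as GBK as the note above states:

```python
import requests

resp = requests.get('https://www.4kbizhi.com/meinv/',
                    headers={'User-Agent': 'Mozilla/5.0'})
# resp.text guesses the encoding and may produce mojibake for GBK pages;
# decoding the raw bytes explicitly gives readable Chinese text.
html = resp.content.decode('gbk', errors='replace')
print(html[:200])
# If the declared charset is unclear, resp.apparent_encoding can help guess it.
```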
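And a minimal sketch of the second point; the image URL is a placeholder for illustration, the pattern is simply writing `response.content` to a file opened with `'wb'`:

```python
import os
import requests

img_url = 'https://www.4kbizhi.com/example.jpg'  # placeholder URL, for illustration only
resp = requests.get(img_url, headers={'User-Agent': 'Mozilla/5.0'})

os.makedirs('./pictures/', exist_ok=True)
# The image body is raw bytes, so the file must be opened in binary mode ('wb');
# text mode ('w') expects str and would fail on image data.
with open('./pictures/example.jpg', 'wb') as fp:
    fp.write(resp.content)
```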