Python asset-download crawler: source code for scraping UI design materials

2020-07-22 14:27:52

Uimaker is a professional platform for learning UI design, offering UI tutorials, UI materials, icons, icon design, mobile UI, UI designer job listings, software interface design, back-end interfaces, back-end templates, and more.

Target site:

http://www.uimaker.com/uimakerdown/list_36_1.html


Scraping approach:

Step 1: get the total number of list pages.

Step 2: scrape the material links from each list page.

Step 3: scrape each material's detail page.

Key points of the Python scraper:

1. Parsing the total page count

The trailing page number is pulled out of the last pagination href with the replace and split functions:

Code language: python
pagenum=pagenum_url.replace(".html",'').split('_')[-1]
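
For example, applied to a hypothetical last-page href (the path below is made up for illustration), the chain strips the extension and keeps the trailing number:

Code language: python
pagenum_url = "/uimakerdown/list_36_27.html"   # hypothetical last-page href
pagenum = pagenum_url.replace(".html", '').split('_')[-1]
print(pagenum)   # -> '27' (still a string; cast with int() before use)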

2. Formatting the detail content

Code language: python
article = req.xpath('//div[@class="contentinfo"]/table//text()')
article = ''.join(article)
article = article.strip()
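
Since xpath('...//text()') returns a list of text nodes, the join/strip pair collapses them into one clean string. A minimal illustration with made-up nodes:

Code language: python
article = ['\r\n  ', 'PSD source file, ', 'layered and editable.', '  \r\n']
article = ''.join(article)     # concatenate all text nodes
article = article.strip()      # trim surrounding whitespace
print(article)                 # -> 'PSD source file, layered and editable.'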

3. Checking whether the download requires U-coins

The U-coin cost is read from the download block; the script only downloads when the cost is 0, i.e. the material is free:

Code language: python
b_num = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0]
if int(b_num) == 0:
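
Indexing with [0] raises an IndexError when a page has no download block at all; a slightly more defensive variant (my own adjustment, not in the original script) checks for an empty result first:

Code language: python
b_nodes = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')
if b_nodes and int(b_nodes[0]) == 0:   # 0 U-coins means the material is free
    ...                                # proceed to fetch the download link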

4. Handling the special case where a page has no download link

Code language: python
try:
    down_url = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0]
    down_name = f'{h2}/{h2}.rar'
    print(down_url, down_name)
    self.down(down_url, down_name)
except Exception as e:
    print("No download link found!")
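
The bare except Exception works, but it also hides unrelated failures such as network errors. An equivalent, more explicit sketch tests the xpath result before indexing:

Code language: python
links = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')
if links:
    down_name = f'{h2}/{h2}.rar'
    self.down(links[0], down_name)   # first matched href is the download link
else:
    print("No download link found!")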

Runtime output:

(screenshot: console output while the crawler runs)

Result:

(screenshot: the downloaded materials)

Full source code:

Code language: python
# -*- coding: UTF-8 -*-
# uimaker material scraper
# 2020-03-10 by WeChat: huguo00289

import requests, time, os
from lxml import etree
from fake_useragent import UserAgent

class Uimaker():
    # initialize the request headers with a random User-Agent
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    # step 1: get the total number of list pages
    def get_pagenum(self):
        url = "http://www.uimaker.com/uimakerdown/"
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        req = etree.HTML(response)
        pagenum_url = req.xpath('//div[@class="page"]/ul[@class="pagelist"]/li')[-1].xpath('.//a/@href')[0]
        pagenum = pagenum_url.replace(".html", '').split('_')[-1]
        pagenum = int(pagenum)
        print(pagenum)
        return pagenum

    # step 2: get the detail-page links from one list page
    def get_urllist(self, i):
        url = f"http://www.uimaker.com/uimakerdown/list_36_{i}.html"
        print(url)
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        req = etree.HTML(response)
        urllist = req.xpath('//dl[@class="imglist"]/dt/ul[@class="listimg"]/li/span[@class="listpic"]/a/@href')
        print(len(urllist))
        print(urllist)
        return urllist

    # step 3: scrape one material's detail page
    def get_dowm(self, url):
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        req = etree.HTML(response)
        h2 = req.xpath('//div[@class="arcinfo"]/h2/text()')[0]
        print(h2)
        os.makedirs(f'{h2}/', exist_ok=True)
        article = req.xpath('//div[@class="contentinfo"]/table//text()')
        article = ''.join(article)
        article = article.strip()
        print(article)
        texts = f'{h2}\n{article}'
        self.get_text(h2, texts)

        imgs = req.xpath('//div[@class="contentinfo"]/table//@src')
        if imgs:
            i = 1
            for img in imgs:
                img_url = f'http://www.uimaker.com{img}'
                suffix = os.path.splitext(img)[1]
                img_name = f'{i}{suffix}'
                print(img_url, img_name)
                self.get_downimg(h2, img_url, img_name)
                i = i + 1

        b_num = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0]
        if int(b_num) == 0:  # 0 U-coins: the material is free to download
            try:
                down_url = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0]
                down_name = f'{h2}/{h2}.rar'
                print(down_url, down_name)
                self.down(down_url, down_name)
            except Exception as e:
                print("No download link found!")

        print('>>>Material scraped successfully!')

    # save the text content
    def get_text(self, h2, texts):
        print("Saving text content...")
        with open(f'{h2}/{h2}.txt', 'w', encoding="utf-8") as f:
            f.write(texts)
        print(">>>Text content saved!")

    # download one image
    def get_downimg(self, h2, img_url, img_name):
        print("Downloading image...")
        r = requests.get(img_url, headers=self.headers, timeout=6)
        with open(f'{h2}/{img_name}', 'wb') as f:
            f.write(r.content)
        print(">>>Image downloaded!")

    # download the material archive
    def down(self, down_url, down_name):
        print("Downloading material...")
        r = requests.get(down_url, headers=self.headers, timeout=6)
        with open(down_name, 'wb') as f:
            f.write(r.content)
        print(">>>Material downloaded!")

    # run the crawler: iterate over every page, then every detail link
    def run(self):
        pagenum = self.get_pagenum()
        for i in range(1, pagenum + 1):
            urllist = self.get_urllist(i)
            for url in urllist:
                self.get_dowm(url)


if __name__ == '__main__':
    spider = Uimaker()
    spider.run()
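
One closing note: the script imports time but never uses it, and run() fires requests back to back. If the site starts refusing connections, a short pause between detail pages helps; a minimal sketch of an adjusted run() (the one-second delay is my own choice, not from the original):

Code language: python
    # run the crawler, pausing briefly between detail pages
    def run(self):
        pagenum = self.get_pagenum()
        for i in range(1, pagenum + 1):
            for url in self.get_urllist(i):
                self.get_dowm(url)
                time.sleep(1)   # be polite: one-second pause between requests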
