Uimaker是为UI设计师提供学UI设计的专业UI平台,拥有UI教程、UI素材、ICON、图标设计UI、手机UI、ui设计师招聘、软件界面设计、后台界面、后台模版等相关内容,快来uimaker学UI设计。
目标网站:
http://www.uimaker.com/uimakerdown/list_36_1.html
uimaker
爬取思路:
第一步:获取素材页码总页数
第二步:爬取素材列表链接
第三步:爬取素材详情
python爬取采集关键点:
1.页码总数格式化处理
replace函数和split函数
代码语言:python
pagenum=pagenum_url.replace(".html",'').split('_')[-1]
2.内容详情格式化处理
代码语言:python
article=req.xpath('//div[@class="contentinfo"]/table//text()')
article =''.join(article)
article =article.strip()
3.是否需要u币下载,这里作了判断
代码语言:python
b_num=req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0]
if int(b_num)==0:
4.特殊情况,无此链接信息的处理
代码语言:python
try:
down_url=req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0]
down_name=f'{h2}/{h2}.rar'
print(down_url,down_name)
self.down(down_url,down_name)
except Exception as e:
print("无此链接信息!")
运行情况:
运行情况
运行效果:
运行效果
完整源码:
代码语言:python
#uimaker素材获取
#20200310 by 微信:huguo00289
# -*- coding: UTF-8 -*-
import requests,time,os
from lxml import etree
from fake_useragent import UserAgent
class Uimaker():
    """Scraper that downloads UI design materials (text, images, archives)
    from http://www.uimaker.com/uimakerdown/.

    Fixes applied versus the extracted original:
    - ``f'{h2}n{article}'`` had lost its backslash (wrote a literal "n"
      instead of a newline) — restored to ``\\n``.
    - ``i=i 1`` and ``range(1,pagenum 1)`` had lost their ``+`` signs
      (syntax errors) — restored.
    - Indentation, stripped by the blog extraction, reconstructed.
    """

    def __init__(self):
        # Random User-Agent per session to reduce the chance of being blocked.
        self.ua = UserAgent()
        self.headers = {"User-Agent": self.ua.random}

    def get_pagenum(self):
        """Fetch the listing index page and return the total page count (int)."""
        url = "http://www.uimaker.com/uimakerdown/"
        # The site serves GBK-encoded HTML.
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        req = etree.HTML(response)
        # The last pagination <li> links to the final page, e.g. "list_36_12.html".
        pagenum_url = req.xpath('//div[@class="page"]/ul[@class="pagelist"]/li')[-1].xpath('.//a/@href')[0]
        # Strip ".html" and take the trailing number: "list_36_12" -> 12.
        pagenum = int(pagenum_url.replace(".html", '').split('_')[-1])
        print(pagenum)
        return pagenum

    def get_urllist(self, i):
        """Return the list of material detail-page URLs found on listing page *i*."""
        url = f"http://www.uimaker.com/uimakerdown/list_36_{i}.html"
        print(url)
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        req = etree.HTML(response)
        urllist = req.xpath('//dl[@class="imglist"]/dt/ul[@class="listimg"]/li/span[@class="listpic"]/a/@href')
        print(len(urllist))
        print(urllist)
        return urllist

    def get_dowm(self, url):
        """Scrape one detail page: save its text, its images and, when the
        material is free (0 U-coins), the downloadable archive."""
        response = requests.get(url, headers=self.headers, timeout=6).content.decode("gbk")
        req = etree.HTML(response)
        # The <h2> title doubles as the output directory name.
        # NOTE(review): titles containing characters invalid in file names
        # would break makedirs/open — confirm against real data.
        h2 = req.xpath('//div[@class="arcinfo"]/h2/text()')[0]
        print(h2)
        os.makedirs(f'{h2}/', exist_ok=True)
        article = req.xpath('//div[@class="contentinfo"]/table//text()')
        article = ''.join(article).strip()
        print(article)
        # BUGFIX: original had f'{h2}n{article}' — a literal "n", not a newline.
        texts = f'{h2}\n{article}'
        self.get_text(h2, texts)
        imgs = req.xpath('//div[@class="contentinfo"]/table//@src')
        if imgs:
            # Save images as 1.ext, 2.ext, ... keeping each original extension.
            for i, img in enumerate(imgs, start=1):
                img_url = f'http://www.uimaker.com{img}'
                suffix = os.path.splitext(img)[1]
                img_name = f'{i}{suffix}'
                print(img_url, img_name)
                self.get_downimg(h2, img_url, img_name)
        # The first <dd><b> holds the U-coin price; only free (0) items download.
        b_num = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dd[1]/b/text()')[0]
        if int(b_num) == 0:
            try:
                down_url = req.xpath('//div[@class="download"]/dl[@class="downlink"]/dt/li/a/@href')[0]
                down_name = f'{h2}/{h2}.rar'
                print(down_url, down_name)
                self.down(down_url, down_name)
            except Exception:
                # Some free items have no download link at all — skip quietly.
                print("无此链接信息!")
        print(f'>>>获取素材成功!')

    def get_text(self, h2, texts):
        """Write *texts* to <h2>/<h2>.txt; the directory must already exist."""
        print("开始保存文本内容...")
        with open(f'{h2}/{h2}.txt', 'w', encoding="utf-8") as f:
            f.write(texts)
        print(">>>保存文本内容完成!")

    def get_downimg(self, h2, img_url, img_name):
        """Download one image into the <h2>/ directory as *img_name*."""
        print("开始下载图片...")
        r = requests.get(img_url, headers=self.headers, timeout=6)
        with open(f'{h2}/{img_name}', 'wb') as f:
            f.write(r.content)
        print(">>>下载素材完成!")

    def down(self, down_url, down_name):
        """Download the material archive at *down_url* to path *down_name*."""
        print("开始下载素材...")
        r = requests.get(down_url, headers=self.headers, timeout=6)
        with open(down_name, 'wb') as f:
            f.write(r.content)
        print(">>>下载素材完成!")

    def run(self):
        """Crawl every listing page and scrape each material found."""
        pagenum = self.get_pagenum()
        # BUGFIX: original read range(1, pagenum 1) — the "+" was lost.
        for i in range(1, pagenum + 1):
            for url in self.get_urllist(i):
                self.get_dowm(url)
if __name__ == '__main__':
    # Entry point: build the spider and crawl the whole site.
    Uimaker().run()