The "Good Design Award", established in 1957, is Japan's only comprehensive design evaluation and recommendation programme. It is commonly known as G-mark, or in Chinese as 日本优良设计大奖.
This post scrapes the award data from the Japanese design site. The award data there is very complete, although access from within China sometimes fails.
Scraping requirements:
Get the name, description, and images of the winning entries for each year.
Python classes are used to build a simple crawler system, which consists of:
Scraping module: gmspider.py
Storage module: store.py
Download module: down.py
Plus a runner script: main.py
Wiring the data between these modules and handling exceptions took quite a bit of time and effort; after trying it by hand you really come to appreciate how powerful the Scrapy framework is.
A few key points:
1. Storing the scraped data
First: write to a local txt file;
Second: write to a local MySQL database (a minimal sketch of both follows below).
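A minimal sketch of both targets, using a made-up record shaped like the tuples the spider below produces (the real logic lives in store.py):

import pymysql

record = ('unit', 'Sample Chair', ['https://example.com/1.jpg'], '2018', 'A sample description.')
category, title, imgs, year, detail = record

# 1) local txt file: one file per award, title plus description
with open(f'{title}.txt', 'w', encoding='utf-8') as f:
    f.write(f'{title}\n{detail}')

# 2) local MySQL: the same columns store.py inserts into
connect = pymysql.connect(host='localhost', user='root', password='123456',
                          db='xiaoshuo', port=3306)
with connect.cursor() as cursor:
    cursor.execute(
        "INSERT INTO g_mark(category, title, imgs, cyear, detail) VALUES (%s, %s, %s, %s, %s)",
        (category, title, ','.join(imgs), year, detail))
connect.commit()
connect.close()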
2. Passing data between the modules
A Queue is used here to hand the scraped data from one module to the next;
with Scrapy an item pipeline would make this trivial, but without it you have to wire it up by hand (see the sketch below).
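A minimal sketch of that hand-off (the record contents here are made up):

from queue import Queue

datas = Queue(1000)  # shared buffer between the scraping step and the storage/download steps

# producer side (what gmspider.py does): one tuple per award page
datas.put(('category', 'Sample Chair', ['https://example.com/1.jpg'], '2018', 'description'))

# consumer side (what main.py does): drain the queue and hand each record on
while not datas.empty():
    data = datas.get()
    print(data)  # in the real run: insert into MySQL, save the txt, download the images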
3. Extracting and downloading the data
Two approaches were considered:
Option 1: insert the data into the database while scraping and download the images at the same time.
Option 2: insert the data into the database while scraping, then query the database afterwards and download the images from the stored URLs; to some extent this works around periods of poor network conditions (a rough sketch follows below).
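A rough sketch of Option 2, assuming the g_mark table used later and a hypothetical download_image(url, path) helper:

import pymysql

# phase 1 (not shown): the spider has already inserted every record into g_mark
connect = pymysql.connect(host='localhost', user='root', password='123456',
                          db='xiaoshuo', port=3306)
cursor = connect.cursor()
cursor.execute("SELECT title, imgs, cyear FROM g_mark")  # phase 2: read everything back
rows = cursor.fetchall()
cursor.close()
connect.close()

for title, imgs, cyear in rows:
    for img in imgs.split(','):                       # imgs is stored as a comma-joined string
        img_url = img.split('?')[0]                   # strip the query parameters
        download_image(img_url, f'{cyear}/{title}')   # hypothetical helper, see down.py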
4. Retrying downloads that time out, and exception handling
Handling network timeouts:
import requests
from requests.adapters import HTTPAdapter  # import HTTPAdapter

s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=3))   # retry up to 3 times
s.mount('https://', HTTPAdapter(max_retries=3))  # retry up to 3 times
try:
    r = s.get(img_url, timeout=20, headers=self.headers)
    with open(f'{path}/{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">>> 下载 {img_name} 图片成功!")
except requests.exceptions.RequestException as e:
    print(f">>> 下载 {img_name} 图片失败,错误代码:{e}!")
Three retries on top of the initial request makes four attempts in total (a variant with exponential back-off between attempts is sketched below the run output).
Run output:
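As a possible refinement (not what the code here does), urllib3's Retry object can be passed to HTTPAdapter so that the retries also back off between attempts:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry = Retry(total=3,                        # same budget: three retries after the first attempt
              backoff_factor=1,               # exponential back-off between retries
              status_forcelist=[500, 502, 503, 504])
s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=retry))
s.mount('https://', HTTPAdapter(max_retries=retry))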
Exception handling
try/except everywhere, with the failure details written to a local txt log.
It still feels like a lot of code, and it is hard to find a tidy way to cover every failure case; a sketch using the standard logging module follows below.
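For reference, a sketch of the same idea with the standard logging module instead of hand-rolled txt writes (an alternative, not what the scripts below do):

import logging

logging.basicConfig(filename='spider.log', level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

try:
    raise TimeoutError('demo failure')  # stand-in for a failed request
except Exception:
    logging.exception('fetch failed')   # writes the message plus the full traceback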
5. Tips for parsing pages with etree
When extracting node data, printing the node's HTML source is a handy way to see what you are working with.
How to convert an element located by XPath back into HTML source:
# Method 1: use the tostring function from lxml.html
from lxml.html import tostring
from lxml import etree
html_get = etree.HTML(resp_text)  # resp_text is the raw HTML string of the page
div_ok = html_get.xpath('//div[@id="mw-content-text"]')[0]
div_content = tostring(div_ok).decode('utf-8')
# Method 2 (recommended: in my timing tests, taking the element located via xpath on an etree-parsed document and converting it back to HTML source with etree.tostring is faster):
from lxml import etree
html_get = etree.HTML(resp_text)
div_ok = html_get.xpath('//div[@id="mw-content-text"]')[0]
print(div_ok,type(div_ok))
div_content = etree.tostring(div_ok, pretty_print=True, method='html').decode('utf-8')  # convert back to an HTML string
# Source: CSDN blogger 「奋斗吧-皮卡丘」
Getting the data you want with XPath takes several attempts; build the expression up step by step and print the result each time to check it.
In the end it comes down to how practised you are, and I am still not fluent!
Run output:
Project:
Scraping results:
The full source code is attached below.
Scraping module gmspider.py
import requests
from fake_useragent import UserAgent
from lxml import etree
import re,time
from queue import Queue
from requests.adapters import HTTPAdapter  # import HTTPAdapter
class Gspider(object):
def __init__(self,year):
self.ua=UserAgent()
self.headers={'User-Agent':self.ua.random}
self.url="https://www.g-mark.org"
self.year=str(year)
self.datas=Queue(1000)
    # fetch the list page of winners for the given year
def parse_list(self):
url="%s/award/search?"%self.url
params={
'from': self.year,
'to': '',
'prizeCode': '',
'keyword': '',
'locale': 'en',
}
response=requests.get(url,params=params,headers=self.headers,timeout=10).content.decode('utf-8')
time.sleep(1)
html=etree.HTML(response)
sections=html.xpath('//section[@class="prizeArea section"]')
print(len(sections))
for section in sections:
category=section.xpath('*//img[1]/@alt')[0]
            category = re.sub(r'[|/<>:*?\"]', '_', category)  # sanitise the prize category for folder names
category=str(category)
print(category)
#print(etree.tostring(section).decode('utf-8'))
page_urls=section.xpath('*//li/a[1]/@href')
print(category,page_urls)
self.parse_pages(category, page_urls)
datas_num = self.datas.qsize()
print(f'>>> 获取{self.year}年获奖作品数据完成!')
print(f'>>> 共{datas_num}条数据!')
return self.datas
    # fetch every detail page under a prize category
def parse_pages(self,category,page_list):
for page_url in page_list:
page_url = '%s%s%s' % (self.url, page_url, '&locale=en')
print(page_url)
try:
                data = self.parse_page(category, page_url)
                if data:  # only queue pages that were actually parsed
                    self.datas.put(data)
except Exception as e:
print(f'获取数据失败,错误代码:{e}')
                with open('page_spider.txt', 'a', encoding='utf-8') as f:
                    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    f.write(f'{category}-{page_url}-{self.year}-获取数据失败,错误代码:{e}-{now}\n')
    # fetch a single detail page and return its data tuple
    def parse_page(self,category,page_url):
        data = None  # stays None if both the first request and the retry fail
        try:
response = requests.get(page_url, headers=self.headers, timeout=10).content.decode('utf-8')
html = etree.HTML(response)
title = html.xpath('//dl[@class="basicinfo"]/dd[@class="item"]/text()')[0]
            title = re.sub(r'[|/<>:*?\'".]', '_', title)  # sanitise the title for file names
            print(title)
            detail = html.xpath('//dl[@class="detail"]/dd[1]/p/text()')[0]  # description
# print(detail)
imgs = html.xpath('//figure[@id="mainphoto"]/ul[@class="photo"]/li/a/img/@src')
# print(imgs)
data = (category, title, imgs, self.year, detail)
print(data)
except Exception as e:
if "Read timed out" in str(e):
print(f'>>> 获取{page_url}数据超时,正在重试...')
time.sleep(2)
s = requests.Session()
                s.mount('http://', HTTPAdapter(max_retries=3))   # retry up to 3 times
                s.mount('https://', HTTPAdapter(max_retries=3))  # retry up to 3 times
try:
response= s.get(page_url,timeout=20,headers=self.headers).content.decode('utf-8')
html = etree.HTML(response)
title = html.xpath('//dl[@class="basicinfo"]/dd[@class="item"]/text()')[0]
                    title = re.sub(r'[|/<>:*?\'".]', '_', title)  # sanitise the title for file names
                    print(title)
                    detail = html.xpath('//dl[@class="detail"]/dd[1]/p/text()')[0]  # description
# print(detail)
imgs = html.xpath('//figure[@id="mainphoto"]/ul[@class="photo"]/li/a/img/@src')
# print(imgs)
data = (category, title, imgs, self.year, detail)
print(data)
except requests.exceptions.RequestException as e:
print(f'获取数据失败,错误代码:{e}')
                    with open('pageurl_spider.txt', 'a', encoding='utf-8') as f:
                        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        f.write(f'{category}-{page_url}-{self.year}-获取{page_url}失败,错误代码:{e}-{now}\n')
else:
print(f'获取数据失败,错误代码:{e}')
                with open('pageurl_spider.txt', 'a', encoding='utf-8') as f:
                    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    f.write(f'{category}-{page_url}-{self.year}-获取{page_url}失败,错误代码:{e}-{now}\n')
return data
The first version I wrote, which scrapes the data year by year:
# -*- coding: UTF-8 -*-
import requests
from fake_useragent import UserAgent
from lxml import etree
import re,time
from requests.adapters import HTTPAdapter  # import HTTPAdapter
from queue import Queue
class Gspider(object):
def __init__(self,year):
self.ua=UserAgent()
self.headers={'User-Agent':self.ua.random}
self.url="https://www.g-mark.org"
self.year=str(year)
self.datas=Queue(300)
self.page_urls=[]
def timeout_get(self,url):
s = requests.Session()
s.mount('http://', HTTPAdapter(max_retries=3))
s.mount('https://', HTTPAdapter(max_retries=3))
try:
response = s.get(url, headers=self.headers,timeout=20)
        except requests.exceptions.RequestException as e:
            print(f'访问失败,错误代码:{e}')
            with open('spider.txt', 'a', encoding='utf-8') as f:
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                f.write(f'{url}-{e}-{now}\n')
            return None  # nothing to return if every retry failed
        return response
def parse_list(self):
url="%s/award/search?"%self.url
params={
'from': self.year,
'to': '',
'prizeCode': '',
'keyword': '',
}
response=requests.get(url,params=params,headers=self.headers,timeout=10).content.decode('utf-8')
time.sleep(1)
html=etree.HTML(response)
hrefs=html.xpath('//section/ul/li/a[1]/@href')
hrefs_num=(len(hrefs))
for href in hrefs:
page_url='%s%s'%(self.url,href)
self.page_urls.append(page_url)
print(f'>>> 获取{self.year}年获奖作品链接完成!')
print(f'>>> 共{hrefs_num}条链接!')
def parse_pages(self):
self.parse_list()
for page_url in self.page_urls:
print(page_url)
try:
data=self.parse_page(page_url)
self.datas.put(data)
except Exception as e:
print(f'获取数据失败,错误代码:{e}')
                with open('spider.txt', 'a', encoding='utf-8') as f:
                    now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    f.write(f'{page_url}-{self.year}-获取数据失败,错误代码:{e}-{now}\n')
break
datas_num = self.datas.qsize()
print(f'>>> 获取{self.year}年获奖作品数据完成!')
print(f'>>> 共{datas_num}条数据!')
return self.datas
def parse_page(self,page_url):
response = requests.get(page_url, headers=self.headers, timeout=10).content.decode('utf-8')
time.sleep(2)
html = etree.HTML(response)
title=html.xpath('//dl[@class="basicinfo"]/dd[@class="item"]/text()')[0]
        title = re.sub(r'[|/<>:*?\"]', '_', title)  # sanitise the title
        #print(title)
        detail = html.xpath('//dl[@class="detail"]/dd[1]/p/text()')[0]  # description
#print(detail)
imgs=html.xpath('//figure[@id="mainphoto"]/ul[@class="photo"]/li/a/img/@src')
#print(imgs)
data=(title,imgs,self.year,detail)
print(data)
return data
if __name__=='__main__':
spider=Gspider(2018)
spider.parse_pages()
Storage module store.py
import pymysql
import os
import time
class Save(object):
def __init__(self,data):
self.host="localhost"
self.user="root"
self.password="123456"
self.db="xiaoshuo"
self.port=3306
self.connect = pymysql.connect(
host=self.host,
user=self.user,
password=self.password,
db=self.db,
port=self.port,
)
        self.cursor = self.connect.cursor()  # create a cursor
self.data=data
def insert(self):
category,title,imgs,year,detail=self.data
imgs=','.join(imgs)
print(imgs)
try:
sql="INSERT INTO g_mark(category,title,imgs,cyear,detail)VALUES(%s,%s,%s,%s,%s)"
val = (category,title,imgs,year,detail)
self.cursor.execute(sql, val)
self.connect.commit()
print(f'>>> 插入 {title} 数据成功!')
except Exception as e:
print(f'>>> 插入 {title} 数据失败!')
            with open('spider.txt', 'a', encoding='utf-8') as f:
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                f.write(f'{category},{title},{imgs},{year},{detail}-插入数据失败,错误代码:{e}-{now}\n')
pass
def cs(self):
        # close the database connection
self.cursor.close()
self.connect.close()
def save_text(self):
category,title,imgs,year,detail = self.data
        os.makedirs(f'{year}/{category}', exist_ok=True)  # create the target directory
        texts = '%s%s%s' % (title, '\n', detail)
try:
with open(f'{year}/{category}/{title}.txt','w',encoding='utf-8') as f:
f.write(texts)
print(f'>>> 保存 {title}.txt 文本内容成功!')
except Exception as e:
            print(f'>>> 保存 {title}.txt 文本内容失败!')
            with open('spider.txt', 'a', encoding='utf-8') as f:
                now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                f.write(f'{category},{title},{imgs},{year},{detail}-保存数据失败,错误代码:{e}-{now}\n')
pass
def sav(self):
self.insert()
self.save_text()
self.cs()
Database table:
Table structure:
A UNIQUE index on the title field for de-duplication:
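The schema itself only appears here as a screenshot, so the following is a possible reconstruction from the INSERT statement in store.py and the unique-index note above (the column types are assumptions):

import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS g_mark (
    id       INT AUTO_INCREMENT PRIMARY KEY,
    category VARCHAR(255),
    title    VARCHAR(255),
    imgs     TEXT,
    cyear    VARCHAR(8),
    detail   TEXT,
    UNIQUE KEY uniq_title (title)
) DEFAULT CHARSET = utf8mb4
"""

connect = pymysql.connect(host='localhost', user='root', password='123456',
                          db='xiaoshuo', port=3306)
with connect.cursor() as cursor:
    cursor.execute(ddl)  # duplicate titles will now be rejected on insert
connect.commit()
connect.close()

With the unique key in place, re-running a year simply makes the duplicate INSERTs fail, which the except branch in store.py already catches and logs.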
Download module down.py
import requests
import os
from fake_useragent import UserAgent
import time
import pymysql
from requests.adapters import HTTPAdapter  # import HTTPAdapter
class Gdown(object):
def __init__(self,data):
self.data=data
self.ua=UserAgent()
self.headers={'User-Agent':self.ua.random}
    # download the images for one data tuple passed in
def date_downs(self):
category, title, imgs, year, detail = self.data
path=f'{year}/{category}'
os.makedirs(path,exist_ok=True)
i = 1
for img in imgs:
img_url=img.split('?')[0]
print(img_url)
            suffix = os.path.splitext(img_url)[1]  # get the image file extension
            img_name = '%s%s%s%s' % (title, '_', i, suffix)
            print(img_name)
            self.get_img(img_url, img_name, path)
            i = i + 1
    # download a single image, retrying once the request times out
def get_img(self,img_url,img_name,path):
try:
r=requests.get(img_url,headers=self.headers,timeout=15)
time.sleep(2)
with open(f'{path}/{img_name}','wb') as f:
f.write(r.content)
print(f">>> 下载 {img_name} 图片成功!")
except Exception as e:
if "Read timed out" in str(e):
print(f'>>> 下载{img_name}图片超时,正在重试...')
time.sleep(2)
s = requests.Session()
                s.mount('http://', HTTPAdapter(max_retries=3))   # retry up to 3 times
                s.mount('https://', HTTPAdapter(max_retries=3))  # retry up to 3 times
try:
r= s.get(img_url,timeout=20,headers=self.headers)
with open(f'{path}/{img_name}', 'wb') as f:
f.write(r.content)
print(f">>> 下载 {img_name} 图片成功!")
except requests.exceptions.RequestException as e:
print(f">>> 下载 {img_name} 图片失败,错误代码:{e}!")
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    with open(f'{path}/img_spider.txt', 'a', encoding='utf-8') as f:
                        f.write(f'{img_url},{img_name},{path}-下载图片失败,错误代码:{e}-{now}\n')
else:
print(f">>> 下载 {img_name} 图片失败,错误代码:{e}!")
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                with open(f'{path}/img_spider.txt', 'a', encoding='utf-8') as f:
                    f.write(f'{img_url},{img_name},{path}-下载图片失败,错误代码:{e}-{now}\n')
    # query the database and download the image data for every stored record
def sql_downs(self):
        # connect to the database
connect = pymysql.connect(
host="localhost",
user="root",
password="123456",
db="xiaoshuo",
port=3306,
)
        cursor = connect.cursor()  # create a cursor
        # query the stored records
cursor.execute("select category, title, imgs, cyear from g_mark")
datas = cursor.fetchall()
        # close the database connection
cursor.close()
connect.close()
print(len(datas))
for category, title, imgs, cyear in datas:
path = f'{cyear}/{category}'
os.makedirs(path, exist_ok=True)
i = 1
img_urls = imgs.split(',')
for img in img_urls:
img_url = img.split('?')[0]
print(img_url)
                suffix = os.path.splitext(img_url)[1]  # get the image file extension
                img_name = '%s%s%s%s' % (title, '_', i, suffix)
                print(img_name)
                self.get_img(img_url, img_name, path)
                i = i + 1
Runner script main.py
from gmspider import Gspider  # scraping module
from store import Save        # storage module
from down import Gdown        # download module
import time
if __name__=='__main__':
for year in range(2010, 2020):
print(f'>>> 正在爬取{year}年数据..')
spider = Gspider(year)
datas=spider.parse_list()
while True:
if datas.empty():
break
data=datas.get()
print(data)
try:
save = Save(data)
save.sav()
except Exception as e:
print(f'插入/保存数据失败,错误代码:{e}')
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                with open('run_sql_spider.txt', 'a', encoding='utf-8') as f:
                    f.write(f'{data}-插入/保存数据失败,错误代码:{e}-{now}\n')
try:
downs=Gdown(data)
downs.date_downs()
except Exception as e:
print(f'下载图片数据失败,错误代码:{e}')
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                with open('run_img_spider.txt', 'a', encoding='utf-8') as f:
                    f.write(f'{data}-下载图片数据失败,错误代码:{e}-{now}\n')
That is the complete record and write-up of the process.
Packaged project files:
Link:
https://pan.baidu.com/s/1X_Ib4eAngSTTDUBZSeRuJA
Extraction code:
s84i