想要寻找图片的小伙伴们应该不会错过这个网站,对,没错,就是花瓣网,各种图片应有尽有,而花瓣网的推荐画板里面的词还是非常不错的,可惜被和谐了不少,想要采集花瓣画板的词,python爬虫当然是没问题的,花瓣的数据比较有意思!
查询源码,有点类似数据接口
app.page["explores"] = [{"keyword_id":1541, "name":"创意灯具", "urlname":"chuangyidengju", "cover":{"farm":"farm1", "bucket":"hbimg", "key":"f77b1c1df184ce91ff529a4d0b5211aa883872c91345f-tdQn2g", "type":"image/jpeg", "width":468, "height":702, "frames":1, "file_id":15723730}, "
想了下还是用正则获取比较简单方便!
正则
代码语言:javascript复制explores=re.findall(r'app.page["explores"] = [(. ?)];. ?app.page["followers"]',html,re.S)[0]
这里需要注意转义符
源码:
代码语言:javascript复制#花瓣推荐画报词采集
#20200314 by 微信:huguo00289
# -*- coding: UTF-8 -*-
from fake_useragent import UserAgent
import requests,re,time
from csql import Save
key_informations=[]
def search(key,keyurl):
print(f"正在查询: {key}")
ua = UserAgent()
headers = {"User-Agent": ua.random}
url=f"https://huaban.com/explore/{keyurl}/"
html=requests.get(url,headers=headers).content.decode("utf-8")
time.sleep(2)
if 'app.page["category"]' in html:
#print(html)
explores=re.findall(r'app.page["explores"] = [(. ?)];. ?app.page["followers"]',html,re.S)[0]
#print(explores)
keyfins=re.findall(r', "name":"(. ?)", "urlname":"(. ?)",',explores,re.S)
print(keyfins)
sa=Save(keyfins)
sa.sav()
for keyfin in keyfins:
if keyfin not in key_informations:
key_informations.append(keyfin)
search(keyfin[0], keyfin[1])
print(len(key_informations))
else:
print(f"查询关键词{key}不是工业设计分类,放弃查询!")
pass
print(len(key_informations))
print(key_informations)
search('3D打印', '3dp')
函数调用本身,不断循环访问网页获取数据!
花瓣网画板词采集
数据是下拉加载,ajax数据加载
同时有一个规律,那就是下一个下拉max是最后一个花瓣seq!
源码:
代码语言:javascript复制#花瓣画报词采集
#20200320 by 微信:huguo00289
# -*- coding: UTF-8 -*-
from csql import Save
import requests,json,time
def get_board(id):
headers={
'Cookie': 'UM_distinctid=170c29e8d8f84f-0b44fc835bc8e3-43450521-1fa400-170c29e8d903de; CNZZDATA1256914954=1367860536-1583810242-null|1583837292; _uab_collina=158415646085953266966037; __auc=30586f3f170d7154a5593583b24; __gads=ID=28115786a916a7a1:T=1584156505:S=ALNI_MbtohAUwMbbd5Yoa5OBBaSO0tSJkw; _hmt=1; sid=s:kwSz9iaMxZf-XtcJX9rrY4ltNDbqkeYs.bc8fvfAq6DLGxsRQ6LF9/mHcjOGIhRSZC0RkuKyHd7w; referer=https://www.baidu.com/link?url=f1FbGruB8SzQQxEDyaJ_mefz-bVnJFZJaAcQYJGXTZq&wd=&eqid=da22ff4e0005f208000000065e74adf2; uid=29417717; _f=iVBORw0KGgoAAAANSUhEUgAAADIAAAAUCAYAAADPym6aAAABJ0lEQVRYR+1VuxHCMAyVFqKjomEjVgkb0VDRMQgrmJMdBcUn2VbAXDiSJpb9/Hl6+iCEEAAAAiL9AJP5sgHSQuMXAOIB6NxXO354DOlhxodMhB8vicQxjgxrN4l1IrMRMRzmVkSeQ4pMIUdRp4RNaU4LsRzPNt9rKekmooWWDJVvjqVTuxKJeTWqJL1vkV2CZzJdifRWZ5EitfJrxbI2r6nEj8rxs5w08pAwLkXUgrGg/DoqdTN0IzK5ylAkXG6pgx/3sfPntuZqxsh9JUkk/ry7FtWbdXZvaNFFkgiPLRJyXe5txZfIbEQ4nMjLNe9K7FS9hJqrUeTnibQm+eoV0R5olZZctZqKGr5bsnuISPXy8muRssrv6X6AnNRbVau5LX8A+Ded/QkRsJAorSTxBAAAAABJRU5ErkJggg==,Win32.1920.1080.24; Hm_lvt_d4a0e7c3cd16eb58a65472f40e7ee543=1584330161,1584348316,1584516528,1584705015; __asc=c7dc256a170f7c78b1b2b6abc60; CNZZDATA1256903590=1599552095-1584151635-https%3A%2F%2Fwww.baidu.com%2F|1584704759; _cnzz_CV1256903590=is-logon|logged-in|1584705067566&urlname|xpmvxxfddh|1584705067566; Hm_lpvt_d4a0e7c3cd16eb58a65472f40e7ee543=1584705067',
'Referer': 'https://huaban.com/discovery/industrial_design/boards/',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
'X-Request': 'JSON',
'X-Requested-With': 'XMLHttpRequest',
}
url="https://huaban.com/discovery/industrial_design/boards/?k804hb1m&max=%s&limit=20&wfl=1" % id
html=requests.get(url,headers=headers,timeout=8).content.decode('utf-8')
time.sleep(1)
if html:
req=json.loads(html)
print(req)
boards=req['boards']
print(len(boards))
for board in boards:
print(board['title'])
sa = Save(board['title'])
sa.sav2()
#print(board['seq'])
next_id=boards[-1]['seq']
get_board(next_id)
if __name__ == '__main__':
id="1584416341304281760"
while True:
get_board(id)
使用 while 循环 以及自身循环
最后保存到数据库
源码
代码语言:javascript复制import pymysql
class Save(object):
def __init__(self,key):
self.host="localhost"
self.user="root"
self.password="123456"
self.db="xiaoshuo"
self.port=3306
self.connect = pymysql.connect(
host=self.host,
user=self.user,
password=self.password,
db=self.db,
port=self.port,
)
self.cursor = self.connect.cursor() # 设置游标
self.key=key
def insert(self):
for keyword in self.key:
try:
sql="INSERT INTO huaban(keyword)VALUES(%s)"
val = (keyword[0])
self.cursor.execute(sql, val)
self.connect.commit()
print(f'>>> 插入 {keyword[0]} 数据成功!')
except Exception as e:
print(e)
print(f'>>> 插入 {keyword[0]} 数据失败!')
def insert2(self):
keyword=self.key
try:
sql="INSERT INTO huaban2(keyword)VALUES(%s)"
val = keyword
self.cursor.execute(sql, val)
self.connect.commit()
print(f'>>> 插入 {keyword} 数据成功!')
except Exception as e:
print(e)
print(f'>>> 插入 {keyword} 数据失败!')
def cs(self):
# 关闭数据库
self.cursor.close()
self.connect.close()
def sav(self):
self.insert()
self.cs()
def sav2(self):
self.insert2()
self.cs()