运行效果:
搜狗微信搜索下拉框词采集核心代码参考:
代码语言:python复制#微信搜索下拉词采集
#首发:微信公众号:二爷记
#author:微信:huguo00289
# -*- coding: UTF-8 -*-
import requests
from fake_useragent import UserAgent
from urllib import parse
#采集微信搜索下拉词
def get_words(keyword):
    """Fetch Sogou Weixin search-suggestion (dropdown) words for *keyword*.

    Args:
        keyword: Query string; may contain non-ASCII (Chinese) characters.

    Returns:
        list[str]: The suggestion phrases returned by the endpoint.

    Raises:
        requests.RequestException: On network failure or timeout.
        IndexError: If the response does not have the expected
            ``window.sogou.sug(...)`` shape.
    """
    quoted = parse.quote(keyword)
    headers = {
        "User-Agent": UserAgent().random,
    }
    url = "https://weixin.sogou.com/sugg/ajaj_json.jsp?key={}".format(quoted)
    resp = requests.get(url=url, headers=headers, timeout=5)
    # The endpoint replies in GBK (see the async variant of this scraper,
    # which decodes 'gbk'); decode explicitly instead of relying on
    # resp.text's header-based guess, which garbles the Chinese phrases.
    html = resp.content.decode("gbk", errors="replace")
    # Response shape: window.sogou.sug(["kw",["w1","w2",...]],...)
    # Strip the JS wrapper, keep the inner list, split into words.
    content = html.replace("window.sogou.sug(", '').replace(')', '').split(',[')[1]
    words = content.replace("]", '').replace('"', '').split(',')
    print("采集 {} 微信搜索下拉词成功!".format(keyword))
    print(words)
    print("-" * 100)
    return words
写的比较渣,见谅,尤其是那段字符格式化处理,啰里吧嗦的!!
多线程采集源码参考:
代码语言:python复制#微信搜索下拉词采集
#首发:微信公众号:二爷记
#author:微信:huguo00289
# -*- coding: UTF-8 -*-
import requests
from fake_useragent import UserAgent
from urllib import parse
import threading
#采集微信搜索下拉词
def get_words(keyword):
    """Fetch Sogou Weixin search-suggestion (dropdown) words for *keyword*.

    Args:
        keyword: Query string; may contain non-ASCII (Chinese) characters.

    Returns:
        list[str]: The suggestion phrases returned by the endpoint.

    Raises:
        requests.RequestException: On network failure or timeout.
        IndexError: If the response does not have the expected
            ``window.sogou.sug(...)`` shape.
    """
    quoted = parse.quote(keyword)
    headers = {
        "User-Agent": UserAgent().random,
    }
    url = "https://weixin.sogou.com/sugg/ajaj_json.jsp?key={}".format(quoted)
    resp = requests.get(url=url, headers=headers, timeout=5)
    # The endpoint replies in GBK (the async variant of this scraper decodes
    # 'gbk'); decode explicitly rather than trusting resp.text's
    # header-based encoding guess, which garbles the Chinese phrases.
    html = resp.content.decode("gbk", errors="replace")
    # Response shape: window.sogou.sug(["kw",["w1","w2",...]],...)
    # Strip the JS wrapper, keep the inner list, split into words.
    content = html.replace("window.sogou.sug(", '').replace(')', '').split(',[')[1]
    words = content.replace("]", '').replace('"', '').split(',')
    print("采集 {} 微信搜索下拉词成功!".format(keyword))
    print(words)
    print("-" * 100)
    return words
#多线程采集微信搜索下拉词
def smt_words(words):
    """Collect suggestions for every keyword in *words* concurrently.

    One thread per keyword is spawned (each runs ``get_words``); the call
    blocks until every thread has finished.
    """
    workers = [threading.Thread(target=get_words, args=(kw,)) for kw in words]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    print("多线程采集微信搜索下拉词完成!")
if __name__ == '__main__':
    # Seed with one keyword, then fan out over its suggestions in threads.
    seed_words = get_words("21考研")
    smt_words(seed_words)
异步采集源码参考
代码语言:python复制#微信搜索下拉词异步采集
#首发:微信公众号:二爷记
#author:微信:huguo00289
#https://www.cnblogs.com/shenh/p/9090586.html
#https://blog.csdn.net/a__int__/article/details/104600972
# -*- coding: UTF-8 -*-
from fake_useragent import UserAgent
from urllib import parse
import aiohttp
import asyncio
# Module-level event loop shared by Async_get_words.run().
# NOTE(review): calling asyncio.get_event_loop() at import time is deprecated
# since Python 3.10 — asyncio.run() would be the modern entry point.
loop = asyncio.get_event_loop()
class Async_get_words:
    """Asynchronously collect Sogou Weixin search-suggestion words.

    Uses one aiohttp session shared by all tasks, with a semaphore capping
    the number of in-flight requests.
    """

    def __init__(self):
        # (Removed a pointless super().__init__() — this class has no base.)
        self.CONCURRENCY = 10  # max simultaneous requests
        self.session = None    # aiohttp.ClientSession, created lazily in main()
        self.semaphore = asyncio.Semaphore(self.CONCURRENCY)  # bounds concurrency
        self.headers = {
            "User-Agent": UserAgent().random,
        }

    async def request(self, url):
        """GET *url* under the concurrency semaphore and return raw bytes."""
        async with self.semaphore:
            print('getting', url)
            # ssl=False works around SSLCertVerificationError on this host.
            async with self.session.get(url, ssl=False, headers=self.headers) as response:
                await asyncio.sleep(1)  # crude politeness delay per request
                return await response.read()

    async def get_words(self, keyword):
        """Fetch and parse the suggestion list for *keyword*.

        Returns:
            list[str]: Suggestion phrases for *keyword*.
        """
        quoted = parse.quote(keyword)
        url = "https://weixin.sogou.com/sugg/ajaj_json.jsp?key={}".format(quoted)
        raw = await self.request(url)
        # The endpoint replies in GBK, not UTF-8 (the original comment here
        # wrongly said UTF-8 while the code decoded GBK).
        html = raw.decode('gbk')
        print(html)
        # Response shape: window.sogou.sug(["kw",["w1","w2",...]],...)
        content = html.replace("window.sogou.sug(", '').replace(')', '').split(',[')[1]
        words = content.replace("]", '').replace('"', '').split(',')
        print("采集 {} 微信搜索下拉词成功!".format(keyword))
        print(words)
        print("-" * 100)
        return words

    async def main(self, keywords):
        """Fan out one task per keyword, await them all, close the session."""
        self.session = aiohttp.ClientSession()
        tasks = [asyncio.ensure_future(self.get_words(keyword)) for keyword in keywords]
        await asyncio.gather(*tasks)
        await self.session.close()

    def run(self):
        """Synchronous entry point: scrape a fixed batch of keywords."""
        keywords = ['21考研时间', '21考研政治答案', '21考研数学答案', '21考研英语答案', '21考研英语', '21考研政治', '21考研人数', '21考研成绩什么时候公布', '21考研准考证打印时间', '21考研答案']
        loop.run_until_complete(self.main(keywords))
if __name__ == '__main__':
    # Build the scraper and run the whole async batch to completion.
    Async_get_words().run()