上代码:
代码语言:python
#!/usr/bin/python3
import queue
import threading
import requests,csv,time,random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd
# Shared shutdown flag: worker threads poll this in their loop and exit
# once the main thread sets it to 1 (after the work queue drains).
exitFlag = 0
#利用pandas读取csv文件
def getNames(csvfile):
    """Read the 'EnName' column from a '|'-delimited CSV file.

    Args:
        csvfile: path to a CSV file containing an 'EnName' column.

    Returns:
        pandas.Series of English names.
    """
    # NOTE(review): the original left a TODO about file encoding; pandas'
    # default (utf-8) is assumed here — confirm against the source files.
    data = pd.read_csv(csvfile, delimiter='|')
    return data['EnName']
#获取ip列表
def get_ip_list(path='ip.txt'):
    """Return proxy entries read from *path*, one per line.

    Args:
        path: proxy list file; defaults to the original hard-coded 'ip.txt'
              (backward-compatible generalization).

    Returns:
        list of lines, each still carrying its trailing newline — callers
        are expected to strip it (see get_random_ip).
    """
    # Context manager guarantees the file handle is closed even on error.
    with open(path, 'r') as f:
        return f.readlines()
#从IP列表中获取随机IP
def get_random_ip(ip_list):
    """Pick a random proxy line from *ip_list* and wrap it for requests.

    Bug fixed: the original called strip('n'), which removes the literal
    character 'n' from the ends of the string instead of the trailing
    newline, so the proxy URL kept its '\n' and was invalid.

    Args:
        ip_list: list of proxy strings, typically 'host:port\n' lines.

    Returns:
        dict in the requests `proxies` format, e.g. {'https': 'host:port'}.
    """
    proxy_ip = random.choice(ip_list).strip()
    return {'https': proxy_ip}
#功能:将信息写入文件
def write_file(filePath,row):
with open(filePath,'a ',encoding='utf-8',newline='') as csvfile:
spanreader = csv.writer(csvfile,delimiter='|',quoting=csv.QUOTE_MINIMAL)
spanreader.writerow(row)
def get_content(url, ip_list):
    """Fetch *url* through a random proxy, retrying up to two more times.

    Each attempt sleeps first (rate limiting / backoff: 1s, 10s, 15s),
    picks a fresh random proxy and a fresh random User-Agent. The third
    attempt has no timeout and is allowed to raise to the caller.

    Args:
        url: page to fetch.
        ip_list: proxy lines as returned by get_ip_list().

    Returns:
        (status_code, list of <div class="mbox"> elements parsed with lxml).
    """
    try:
        try:
            time.sleep(1)  # polite crawl delay before the first attempt
            proxies = get_random_ip(ip_list)
            headers = {'User-Agent': str(UserAgent().random)}
            req = requests.get(url=url, proxies=proxies, headers=headers, timeout=20)
        except Exception:
            # First attempt failed (bad proxy / timeout): back off and retry.
            print("重新运行")
            time.sleep(10)
            proxies = get_random_ip(ip_list)
            headers = {'User-Agent': str(UserAgent().random)}
            req = requests.get(url=url, proxies=proxies, headers=headers, timeout=40)
    except Exception:
        # Second failure: longer backoff, last attempt without a timeout.
        print("第二次重新运行")
        time.sleep(15)
        proxies = get_random_ip(ip_list)
        headers = {'User-Agent': str(UserAgent().random)}
        req = requests.get(url=url, proxies=proxies, headers=headers)
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')
    content = soup.find_all('div', class_='mbox')
    return req.status_code, content
#获取准确的英文名、中文名、名字含义、来源、性别等信息
def get_infor_header(content):
    """Extract name metadata from the header <div> of a dict.cn name page.

    The div's <span> children are read positionally: [0] English name,
    [1] Chinese name, [2] meaning, [3] source, [4] an <em> whose 'title'
    attribute is the gender. A page with a single span yields empty
    strings for the missing fields.

    Bug fixed: the scraped original lost the '+' operators in the final
    list concatenation (`EnName + CnName + Gender + Source + Meaning`).

    Args:
        content: BeautifulSoup element exposing find_all('span').

    Returns:
        5-element list in output column order:
        [EnName, CnName, Gender, Source, Meaning].
    """
    spans = content.find_all('span')
    en_name = spans[0].get_text()
    if len(spans) != 1:
        cn_name = spans[1].get_text()
        meaning = spans[2].get_text()
        source = spans[3].get_text()
        gender = spans[4].em.get('title')
    else:
        cn_name = meaning = source = gender = ''
    # Column order matches the output file: EnName|CnName|Gender|Source|Meaning
    return [en_name, cn_name, gender, source, meaning]
#获取英文名对应的名人
def get_infor_celebrity(content):
    """Collect the celebrity entries (<li> items) for an English name.

    Bug fixed: the scraped original lost its '+='/'+' operators; the
    surviving structure accumulates all <li> texts into one string
    separated by '@'. NOTE(review): indentation was also lost, so the
    single trailing append (one joined string, not one per item) is the
    reconstruction consistent with `row = list_header + list_celebrity`
    in process_data — confirm against the original post if available.

    Args:
        content: BeautifulSoup element exposing find_all('li').

    Returns:
        One-element list containing the '@'-joined celebrity string
        ([''] when there are no <li> items).
    """
    items = content.find_all('li')
    joined = '@'.join(each.get_text() for each in items)
    return [joined]
class myThread(threading.Thread):
    """Worker thread: pulls names off the shared queue via process_data.

    Attributes:
        threadID: numeric id assigned by the main script.
        name: thread name (also used in log output).
        q: shared queue.Queue of names to scrape.
        ip_list: proxy list shared by all workers.
    """

    def __init__(self, threadID, name, q, ip_list):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.q = q
        self.ip_list = ip_list

    def run(self):
        print("开启线程:" + self.name)
        # Bug fixed: the original passed the module-level `ip_list` global
        # instead of the instance attribute stored in __init__.
        process_data(self.name, self.q, self.ip_list)
        print("退出线程:" + self.name)
def process_data(threadName, q, ip_list):
    """Worker loop: pop names from *q*, scrape each page, append rows to CSV.

    Runs until the module-level `exitFlag` becomes truthy. The module-level
    `queueLock` serializes both queue access and file writes; it is released
    on every branch before the loop continues.

    Bug fixed: the scraped original lost the '+' in
    `row = list_header + list_celebrity`.

    Args:
        threadName: name used in progress logging.
        q: shared queue.Queue of English names.
        ip_list: proxy lines for get_content.
    """
    while not exitFlag:
        queueLock.acquire()
        if not workQueue.empty():
            data = q.get()
            queueLock.release()
            print("%s processing %s" % (threadName, data))
            url = 'http://ename.dict.cn/{}'.format(data)
            status_code, content = get_content(url, ip_list)
            if status_code == 200:
                # Header fields: EnName/CnName/gender/source/meaning.
                list_header = get_infor_header(content[0])
                # Celebrities sharing this English name.
                list_celebrity = get_infor_celebrity(content[1])
                row = list_header + list_celebrity
                # Re-acquire: file appends from all workers are serialized.
                queueLock.acquire()
                write_file('haici_infor.csv', row)
                queueLock.release()
        else:
            queueLock.release()
        time.sleep(1)
# ---- Main script: spin up 10 workers, fill the queue, wait for drain ----
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5", "Thread-6", "Thread-7", "Thread-8", "Thread-9", "Thread-10"]
nameList = getNames('A-Z.csv')
queueLock = threading.Lock()
workQueue = queue.Queue(100000)
threads = []
threadID = 1
# 创建新线程
ip_list = get_ip_list()
for tName in threadList:
    thread = myThread(threadID, tName, workQueue, ip_list)
    thread.start()
    threads.append(thread)
    # Bug fixed: the original re-assigned `threadID = 1` every iteration
    # (lost '+='), giving every worker the same id.
    threadID += 1
# 填充队列 — hold the lock so workers don't start draining a half-filled queue.
queueLock.acquire()
for word in nameList:
    workQueue.put(word)
queueLock.release()
# 等待队列清空 (busy-wait; workers pop items concurrently)
while not workQueue.empty():
    pass
# 通知线程是时候退出
exitFlag = 1
# 等待所有线程完成
for t in threads:
    t.join()
print("退出主线程")