Python3--爬取海词信息

2019-01-22 14:59:40 浏览数 (1)

上代码:

代码语言:python
#!/usr/bin/python3

import queue
import threading
import requests,csv,time,random  
from bs4 import BeautifulSoup  
from fake_useragent import UserAgent 
import pandas as pd 

exitFlag = 0

# Read the list of English names from a CSV file with pandas.
def getNames(csvfile):
    """Return the 'EnName' column of *csvfile* (fields separated by '|')."""
    # NOTE(review): the file's text encoding is left to pandas' default —
    # confirm it matches the actual data file.
    frame = pd.read_csv(csvfile, delimiter='|')
    return frame['EnName']

# Load the proxy-IP list, one address per line.
def get_ip_list(path='ip.txt'):
    """Return the raw lines of *path* (trailing newlines included).

    The path is parameterized (default keeps the original 'ip.txt'
    behavior) and the file is opened with a context manager so the
    handle is closed even if readlines() raises.
    """
    with open(path, 'r') as f:
        return f.readlines()
      
# Pick a random proxy from the list and wrap it for requests.
def get_random_ip(ip_list):
    """Return {'https': ip} for a randomly chosen entry of *ip_list*.

    BUG FIX: the original called .strip('n'), which removes the letter
    'n' and leaves the '\\n' that readlines() appends; .strip() removes
    the surrounding whitespace (including the newline) as intended.
    """
    proxy_ip = random.choice(ip_list).strip()
    return {'https': proxy_ip}
  
#功能:将信息写入文件      
def write_file(filePath,row):        
    with open(filePath,'a ',encoding='utf-8',newline='') as csvfile:        
        spanreader = csv.writer(csvfile,delimiter='|',quoting=csv.QUOTE_MINIMAL)        
        spanreader.writerow(row)  

def get_content(url, ip_list):
    """Fetch *url* through a random proxy and return (status_code, mbox divs).

    Makes up to three attempts, each retry waiting longer and relaxing
    the timeout (20s, 40s, then none) — the same escalation as the
    original's nested try/except ladder. An exception raised on the
    final attempt propagates to the caller, as before.
    """
    # (pre-request delay, request timeout, message printed on failure)
    attempts = [
        (1, 20, "重新运行"),
        (10, 40, "第二次重新运行"),
        (15, None, None),
    ]
    req = None
    for delay, timeout, fail_msg in attempts:
        try:
            time.sleep(delay)
            proxies = get_random_ip(ip_list)
            headers = {'User-Agent': str(UserAgent().random)}
            req = requests.get(url=url, proxies=proxies, headers=headers,
                               timeout=timeout)
            break
        except Exception:  # deliberately broad: proxy, DNS and UA errors alike
            if fail_msg is None:
                raise  # last attempt — propagate, matching the original
            print(fail_msg)

    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')

    # All 'mbox' divs: presumably the first holds the name header and the
    # second the celebrity list (see process_data) — confirm on the live site.
    content = soup.find_all('div', class_='mbox')
    return req.status_code, content


# Extract the English name, Chinese name, meaning, source and gender.
def get_infor_header(content):
    """Return [EnName, CnName, Gender, Source, Meaning] from an 'mbox' div.

    *content* is a BeautifulSoup tag; its <span> children hold the fields
    in page order (name, Chinese name, meaning, source, gender-in-<em>).
    When only one span exists, the remaining fields are left empty.

    BUG FIX: the scraped original read
    'list_header = EnName   CnName   Gender   Source   Meaning' — the '+'
    operators were lost, making it a syntax error; the concatenation is
    restored by building the list directly.
    """
    spans = content.find_all('span')

    en_name = spans[0].get_text()
    if len(spans) != 1:
        cn_name = spans[1].get_text()
        meaning = spans[2].get_text()
        source = spans[3].get_text()
        gender = spans[4].em.get('title')
    else:
        cn_name = meaning = source = gender = ''

    # Field order matches the CSV layout: EnName|CnName|Gender|Source|Meaning
    return [en_name, cn_name, gender, source, meaning]

# Collect the celebrities listed for an English name.
def get_infor_celebrity(content):
    """Return a one-element list: all <li> texts under *content* joined by '@'.

    An empty <li> list yields [''].

    BUG FIX: the scraped original lost its '+' operators
    ("str_celebrity  =each.get_text()" / "'@'   each.get_text()"), a
    syntax error; the intended '+=' accumulation is equivalent to a
    '@'.join over the item texts.
    """
    texts = [each.get_text() for each in content.find_all('li')]
    return ['@'.join(texts)]

class myThread(threading.Thread):
    """Worker thread that drains the shared name queue via process_data."""

    def __init__(self, threadID, name, q, ip_list):
        threading.Thread.__init__(self)
        self.threadID = threadID  # numeric id (informational only)
        self.name = name          # thread name used in log output
        self.q = q                # shared queue.Queue of names to crawl
        self.ip_list = ip_list    # proxy list shared by all workers

    def run(self):
        # BUG FIX: the original passed the module-global ip_list instead of
        # the instance attribute, and the scraped '+' concatenations were
        # lost; both are restored here.
        print("开启线程:" + self.name)
        process_data(self.name, self.q, self.ip_list)
        print("退出线程:" + self.name)

def process_data(threadName, q, ip_list):
    """Worker loop: pop names from *q*, crawl each, append results to CSV.

    Runs until the module-global exitFlag becomes truthy. The shared
    queueLock guards both the queue check/get and the CSV append.

    BUG FIXES vs. the original:
    - the final 'else' was indented onto the 'while' (a while/else), so
      the lock was released at loop exit while not held — it belongs to
      the emptiness check;
    - the emptiness check read the global workQueue instead of the *q*
      parameter;
    - 'row = list_header   list_celebrity' had lost its '+' operator.
    """
    while not exitFlag:
        queueLock.acquire()
        if q.empty():
            queueLock.release()
        else:
            data = q.get()
            queueLock.release()
            print("%s processing %s" % (threadName, data))
            url = 'http://ename.dict.cn/{}'.format(data)
            status_code, content = get_content(url, ip_list)
            if status_code == 200:
                # content[0]: name header div; content[1]: celebrity div
                list_header = get_infor_header(content[0])
                list_celebrity = get_infor_celebrity(content[1])
                row = list_header + list_celebrity
                queueLock.acquire()
                write_file('haici_infor.csv', row)
                queueLock.release()
        time.sleep(1)

# ---- Main script: spin up workers, fill the queue, wait, shut down ----
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5", "Thread-6", "Thread-7", "Thread-8", "Thread-9", "Thread-10"]
nameList = getNames('A-Z.csv')
queueLock = threading.Lock()
workQueue = queue.Queue(100000)
threads = []
threadID = 1

# Create the worker threads
ip_list = get_ip_list()
for tName in threadList:
    thread = myThread(threadID, tName, workQueue, ip_list)
    thread.start()
    threads.append(thread)
    # BUG FIX: the scraped 'threadID  = 1' lost its '+', leaving every
    # thread with id 1; restore the increment.
    threadID += 1

# Fill the queue
queueLock.acquire()
for word in nameList:
    workQueue.put(word)
queueLock.release()

# Wait for the queue to drain (sleep so this isn't a hot busy-wait)
while not workQueue.empty():
    time.sleep(0.5)

# Tell the workers it is time to exit
exitFlag = 1

# Wait for every worker to finish
for t in threads:
    t.join()
print("退出主线程")

0 人点赞