糗事百科_多进程_demo(3)

2019-07-31 14:55:22 浏览数 (1)

版权声明:Copyright © https://cloud.tencent.com/developer/article/1477128

代码语言: python
import requests
from lxml import etree
from multiprocessing import Process
from multiprocessing import JoinableQueue as Queue


class QiubaiSpider:
    """Multi-process crawler for qiushibaike.com list pages.

    Pipeline of daemon worker processes connected by JoinableQueues:

        get_url -> url_q -> parse_url (x3) -> html_q -> get_html
                -> content_q -> save_html

    ``run()`` joins the three queues instead of the processes: each
    ``join()`` returns once every queued item has been ``task_done()``-ed,
    after which the main process exits and the daemon workers die with it.
    """

    def __init__(self, max_page=13):
        """Set up the URL template, request headers and the three queues.

        :param max_page: number of list pages to crawl (pages 1..max_page);
            defaults to 13, matching the original hard-coded range.
        """
        self.max_page = max_page
        self.temp_url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 QQBrowser/4.4.108.400'
        }
        self.url_q = Queue()      # URLs waiting to be fetched
        self.html_q = Queue()     # raw HTML pages waiting to be parsed
        self.content_q = Queue()  # parsed item lists waiting to be saved

    def get_url(self):
        """Enqueue the list-page URLs for pages 1..max_page."""
        for page in range(1, self.max_page + 1):
            self.url_q.put(self.temp_url.format(page))

    def parse_url(self):
        """Worker loop: fetch each queued URL and enqueue the decoded HTML."""
        while True:
            url = self.url_q.get()
            # timeout so one hung request cannot stall this worker forever
            response = requests.get(url, headers=self.headers, timeout=10)
            self.html_q.put(response.content.decode())
            self.url_q.task_done()

    def get_html(self):
        """Worker loop: parse one HTML page into a list of item dicts.

        Each item has 'author' and 'text' keys holding the raw xpath
        result lists (lists of strings).
        """
        while True:
            html_str = self.html_q.get()
            html = etree.HTML(html_str)
            div_list = html.xpath('//div[@id="content-left"]/div')
            content_list = []
            for div in div_list:
                item = {}
                item['author'] = div.xpath('.//h2/text()')
                item['text'] = div.xpath('.//div[@class="content"]/span/text()')
                content_list.append(item)
            self.content_q.put(content_list)
            self.html_q.task_done()

    def save_html(self):
        """Worker loop: print every parsed item with a running item count."""
        current = 0
        while True:
            content_list = self.content_q.get()
            for content in content_list:
                # BUGFIX: was `current = 1`, so the counter never advanced
                current += 1
                print(content)
                print(current)
            self.content_q.task_done()

    def run(self):
        """Fill the URL queue, start the daemon workers, wait for drain."""
        self.get_url()
        process_list = []
        # three fetchers: network I/O dominates, so overlap the downloads
        for _ in range(3):
            process_list.append(Process(target=self.parse_url))
        process_list.append(Process(target=self.get_html))
        process_list.append(Process(target=self.save_html))

        for p in process_list:
            # daemon: workers are killed automatically when main exits
            p.daemon = True
            p.start()

        # joining the queues (not the processes) waits until every item
        # put on each queue has been marked task_done()
        for q in [self.url_q, self.html_q, self.content_q]:
            q.join()
        print('主进程结束')
if __name__ == '__main__':
    # Script entry point: build the spider and run the full pipeline.
    QiubaiSpider().run()

0 人点赞