# Copyright notice: Copyright © https://cloud.tencent.com/developer/article/1477128
import requests
from lxml import etree
from multiprocessing import Process
from multiprocessing import JoinableQueue as Queue
class QiubaiSpider:
    """Multi-process crawler for the qiushibaike.com "8hr" listing pages.

    Work flows through three joinable queues:
    ``url_q`` (page URLs) -> ``html_q`` (decoded page HTML) ->
    ``content_q`` (lists of extracted items).  Each worker acknowledges
    every item it takes with ``task_done()``, which lets ``run`` detect
    completion by joining the queues in pipeline order.
    """

    # Seconds to wait on a single HTTP request before giving up, so one
    # stalled connection cannot block the pipeline forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the URL template, request headers and the three queues."""
        self.temp_url = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 QQBrowser/4.4.108.400'
        }
        self.url_q = Queue()
        self.html_q = Queue()
        self.content_q = Queue()

    def get_url(self):
        """Enqueue the URLs of listing pages 1 through 13 on ``url_q``."""
        for page in range(1, 14):
            self.url_q.put(self.temp_url.format(page))

    def parse_url(self):
        """Worker loop: download each URL from ``url_q`` into ``html_q``.

        Never returns; intended to run in a daemon process.  A failed
        download is reported and skipped rather than killing the worker.
        """
        while True:
            url = self.url_q.get()
            try:
                response = requests.get(
                    url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                # Put the page downstream *before* acknowledging the URL so
                # html_q already counts it when url_q.join() returns.
                self.html_q.put(response.content.decode())
            except (requests.RequestException, UnicodeDecodeError) as exc:
                print('failed to fetch {}: {}'.format(url, exc))
            finally:
                # Always acknowledge, otherwise url_q.join() never returns.
                self.url_q.task_done()

    @staticmethod
    def _extract_items(html_str):
        """Return a list of ``{'author': ..., 'text': ...}`` dicts for a page.

        Both values are the raw lists of text nodes returned by XPath.
        """
        html = etree.HTML(html_str)
        if html is None:
            # Nothing parseable in the document: no items.
            return []
        content_list = []
        for div in html.xpath('//div[@id="content-left"]/div'):
            content_list.append({
                'author': div.xpath('.//h2/text()'),
                'text': div.xpath('.//div[@class="content"]/span/text()'),
            })
        return content_list

    def get_html(self):
        """Worker loop: parse each page from ``html_q`` into ``content_q``.

        Never returns; intended to run in a daemon process.  A page that
        cannot be parsed is reported and skipped.
        """
        while True:
            html_str = self.html_q.get()
            try:
                self.content_q.put(self._extract_items(html_str))
            except (etree.LxmlError, ValueError) as exc:
                print('failed to parse page: {}'.format(exc))
            finally:
                # Always acknowledge, otherwise html_q.join() never returns.
                self.html_q.task_done()

    def save_html(self):
        """Worker loop: print every extracted item with a running count.

        Never returns; intended to run in a daemon process.
        """
        current = 0
        while True:
            content_list = self.content_q.get()
            try:
                for content in content_list:
                    # Running total of items seen across all pages.
                    current += 1
                    print(content)
                    print(current)
            finally:
                self.content_q.task_done()

    def run(self):
        """Seed the URL queue, start the workers and wait for the queues to drain."""
        self.get_url()

        # Three downloaders, one parser, one printer.
        workers = [Process(target=self.parse_url) for _ in range(3)]
        workers.append(Process(target=self.get_html))
        workers.append(Process(target=self.save_html))
        for worker in workers:
            # The worker loops never return; daemon processes are terminated
            # automatically when the main process exits.
            worker.daemon = True
            worker.start()

        # Join upstream queues first: each stage enqueues its output before
        # acknowledging its input, so a drained upstream queue means all of
        # its results are already counted downstream.
        for work_queue in (self.url_q, self.html_q, self.content_q):
            work_queue.join()
        print('主进程结束')
# Script entry point.  The guard keeps the crawl from starting when this
# module is imported rather than executed directly.
if __name__ == '__main__':
    qiubai = QiubaiSpider()
    qiubai.run()