weibo_spider
微博爬虫:
- 爬取热搜榜:不需要登录,但是需要处理反爬措施
- 爬取热门话题:需要登录之后获得cookies和user_id
爬取热搜榜
微博热搜榜地址:https://s.weibo.com/top/summary?cate=realtimehot
其中最重要的部分在于对热搜标题中的反爬处理,具体可参见简书中的《爬取新浪微博热搜》。具体爬取代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 4/3/18 2:08 PM
@author: Chen Liang
@function: 爬取weibo实时热搜榜(不需要登录)
"""
from spider.base_spider import BaseSpider
import re
from urllib import parse
import json
import sys
class RealTimeHotSpider(BaseSpider):
    """Scrape the Weibo real-time hot-search board (no login required)."""

    def get_hot_list(self, url):
        """Fetch the hot-search page at ``url`` and return its entries as JSON.

        Returns:
            A JSON array string. Each element is an object with the keys
            ``number`` (0-based position on the board), ``title`` (decoded
            keyword), ``url`` (search link for the keyword) and ``hot``
            (heat value as scraped, or ``null`` when the page did not
            provide one for that row). An empty page yields ``"[]"``.
        """
        html = self.get_html_content(url)
        if not html:
            # Nothing was fetched; re.findall would raise on a non-string.
            return json.dumps([])

        # The alternation has to be a group: the character class "[top|new]"
        # matches any single one of those characters, not the two words.
        encoded_keys = re.findall(
            r'class=\"star_name\">.*?<a href=\"\/weibo\/(.*?)&Refer=(?:top|new)', html)
        hot_degree_lst = re.findall(r'class=\"star_num\"><span>(.*?)<\/span>', html)
        # Shift the heat values down by one row: presumably the first (pinned)
        # row carries no star_num, so it gets the largest possible value.
        hot_degree_lst.insert(0, sys.maxsize)

        link_prefix = 'https://s.weibo.com/weibo/'
        hot_info = []
        for number, encoded in enumerate(encoded_keys):
            # The keyword in the href appears to be percent-encoded twice
            # ("%25E5..."). Undo only the escaped-percent layer; deleting every
            # "25" would also eat literal digits such as the "25" in "2025".
            single_encoded = encoded.replace('%25', '%')
            hot_info.append({
                "number": number,
                "title": parse.unquote(single_encoded),
                "url": link_prefix + single_encoded,
                # Tolerate a page with fewer heat values than keywords.
                "hot": hot_degree_lst[number] if number < len(hot_degree_lst) else None,
            })
        return json.dumps(hot_info)
if __name__ == '__main__':
    # Demo entry point: scrape the real-time hot-search board once and print
    # the resulting JSON string to stdout.
    target_real_time_hot = 'https://s.weibo.com/top/summary?cate=realtimehot'
    hot_list_json = RealTimeHotSpider().get_hot_list(target_real_time_hot)
    print(hot_list_json)
爬取热门话题
热门话题在登录之后会出现在home页面中,示例地址:https://weibo.com/u/3655576503/home
可以采用selenium模拟登录,模拟登录的关键是通过selenium提供的选择器找到对应的input标签和对应的submit按钮。使用过程中遇到了以下问题:
微博selenium模拟登录bug
验证版本: chromedriver=2.37 and os = ubuntu 16.04 LTS
如果不增加chrome_options会出现chromedriver加载失败的报错,报错如下:
Traceback (most recent call last):
File "/root/PycharmProjects/smart_login/sina_login/sina_login_by_selenium.py", line 54, in <module>
cookies = login(name_input, passwd_input, url)
File "/root/PycharmProjects/smart_login/sina_login/sina_login_by_selenium.py", line 14, in login
driver = webdriver.Chrome('/root/qk_python/python/data/collect/weibo_spider/priv/chromedriver')
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/chrome/webdriver.py", line 75, in __init__
desired_capabilities=desired_capabilities)
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/remote/webdriver.py", line 154, in __init__
self.start_session(desired_capabilities, browser_profile)
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/remote/webdriver.py", line 243, in start_session
response = self.execute(Command.NEW_SESSION, parameters)
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/remote/webdriver.py", line 312, in execute
self.error_handler.check_response(response)
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally
(Driver info: chromedriver=2.37.544315 (730aa6a5fdba159ac9f4c1e8cbc59bf1b5ce12b7),platform=Linux 4.13.0-38-generic x86_64)
同时需要在get操作之前增加driver.set_window_size(1124, 850)
进行窗口大小的预置(合适的值即可),否则得到的WebElement的状态is_displayed为False,即不可见,导致进行clear操作和send_keys操作时出现异常。异常如下:
Traceback (most recent call last):
File "/root/PycharmProjects/smart_login/sina_login/sina_login_by_selenium.py", line 61, in <module>
cookies = login(name_input, passwd_input, url)
File "/root/PycharmProjects/smart_login/sina_login/sina_login_by_selenium.py", line 27, in login
name_field.clear()
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/remote/webelement.py", line 95, in clear
self._execute(Command.CLEAR_ELEMENT)
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/remote/webelement.py", line 628, in _execute
return self._parent.execute(command, params)
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/remote/webdriver.py", line 312, in execute
self.error_handler.check_response(response)
File "/usr/local/lib/python3.5/dist-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.InvalidElementStateException: Message: invalid element state: Element is not currently interactable and may not be manipulated
(Session info: headless chrome=65.0.3325.162)
(Driver info: chromedriver=2.37.544315 (730aa6a5fdba159ac9f4c1e8cbc59bf1b5ce12b7),platform=Linux 4.13.0-38-generic x86_64)
处理方法参考PhantomJs 的一个issue:https://github.com/ariya/phantomjs/issues/11637
bug解决之后完整的登录代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 4/8/18 10:41 AM
@author: Chen Liang
@function: selenium模拟登录
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ChromeOptions
import time
import re
# def singleton(cls):
# instances = {}
#
# def wrap(*args, **kwargs):
# if cls not in instances:
# instances[cls] = cls(*args, **kwargs)
# return instances[cls]
#
# return wrap
#
#
# @singleton
class WeiboSeleniumLogin(object):
    """Log in to Weibo through headless Chrome and expose the session data.

    After a successful ``login`` call the browser cookies are available via
    ``cookies`` and the numeric account id via ``user_id``. The browser is
    shut down at the end of ``login``, so an instance supports one login
    attempt.
    """

    def __init__(self, driver_path):
        """Start a headless Chrome session using the chromedriver at ``driver_path``."""
        chrome_options = ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        self.__driver = webdriver.Chrome(driver_path, chrome_options=chrome_options)
        self.__driver.maximize_window()
        self.__driver.set_page_load_timeout(30)
        # An explicit window size must be set before the page is loaded;
        # otherwise the form inputs are reported as not displayed and
        # clear()/send_keys() fail with InvalidElementStateException.
        self.__driver.set_window_size(1124, 850)
        self.__cookies = None
        self._source = None

    def login(self, username, password, login_url):
        """Submit the login form at ``login_url`` and record the result.

        Returns:
            True when the loaded page reports a logged-in session (cookies
            are stored in that case), False otherwise.

        Raises:
            Whatever selenium raises when an element is missing or the
            post-login page does not appear in time. The browser is closed
            in every case, including when an exception propagates.
        """
        try:
            self.__driver.get(login_url)
            name_field = self.__driver.find_element_by_id('loginname')
            name_field.clear()
            name_field.send_keys(username)
            password_field = self.__driver.find_element_by_class_name(
                'password').find_element_by_name('password')
            password_field.clear()
            password_field.send_keys(password)
            submit = self.__driver.find_element_by_xpath(
                '//*[@id="pl_login_form"]/div/div[3]/div[6]/a/span')
            ActionChains(self.__driver).double_click(submit).perform()
            time.sleep(5)
            WebDriverWait(self.__driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'WB_miniblog')))
            self._source = self.__driver.page_source
            if self.is_login():
                # Cookies must be read before the driver is shut down.
                self.__cookies = self.__driver.get_cookies()
                return True
            return False
        finally:
            # Always release the browser process, even if a step above raised.
            self.__driver.quit()

    def is_login(self):
        """Return True when the captured page source has ``CONFIG['islogin']='1'``."""
        if not self._source:
            # No page captured yet (login not attempted or it failed early).
            return False
        # Brackets are escaped so they match literally instead of forming a
        # character class; \d matches the single status digit.
        rs = re.search(r"CONFIG\['islogin'\]='(\d)'", self._source)
        if rs:
            return int(rs.group(1)) == 1
        return False

    @property
    def cookies(self):
        """Cookies returned by the driver after a successful login, else None."""
        return self.__cookies

    @property
    def user_id(self):
        """Numeric ``CONFIG['uid']`` value from the captured page, or -1 if absent."""
        if not self._source:
            return -1
        rs = re.search(r"CONFIG\['uid'\]='(\d+)'", self._source)
        if rs:
            return int(rs.group(1))
        return -1
if __name__ == '__main__':
    # Demo entry point: ask for credentials, attempt one login and print the
    # session data on success.
    chrome_driver_path1 = '/root/qk_python/python/data/collect/weibo_spider/priv/chromedriver'
    # The prompts end with a real newline so the answer is typed on its own
    # line (the escape had been lost, leaving a stray trailing "n").
    username1 = input('请输入你的账号\n')
    password1 = input('请输入你的密码\n')
    login_url1 = 'https://weibo.com/login.php'
    sl = WeiboSeleniumLogin(chrome_driver_path1)
    if sl.login(username1, password1, login_url1):
        print('login success')
        print(sl.cookies)
        print(sl.user_id)
    else:
        print('login failed')
登录成功之后可以获取到cookies数据和后续抓取所需要的user_id。selenium返回的cookies是由字典组成的列表,需要转换成requests库所需要的 {name: value} 字典格式。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on 4/3/18 2:08 PM
@author: Chen Liang
@function: 爬取weibo热门话题(需要处理登录)
"""
from spider.base_spider import BaseSpider
from login.simulate_login import WeiboSeleniumLogin
import re
class HotNewTopicSpider(BaseSpider):
    """Scrape Weibo hot topics from the logged-in home page.

    Login is required; the goal is to collect title, url, read count,
    discussion count and follower count for each topic.
    """

    def __init__(self, cookies, user_id):
        """Convert the login cookies and download the user's home page.

        Args:
            cookies: list of cookie dicts; entries that have both ``name``
                and ``value`` keys are folded into a name -> value mapping.
            user_id: integer account id used to build the home page URL.

        Raises:
            AssertionError: when either argument has the wrong type (note
                that asserts are skipped when Python runs with ``-O``).
        """
        self._cookie_dict = {}
        assert isinstance(cookies, list), "argument type error"
        assert isinstance(user_id, int), "argument type error"
        for cookie in cookies:
            if 'name' in cookie and 'value' in cookie:
                self._cookie_dict[cookie['name']] = cookie['value']
        self._user_id = user_id
        self._html = self.get_html_content(
            'https://weibo.com/u/{}/home'.format(self._user_id), self._cookie_dict)

    def get_topic_list(self):
        """Return the hot topics found on the downloaded home page.

        Returns:
            A list of dicts with the keys ``read_cnt`` (int), ``url`` and
            ``title`` (surrounding ``#`` removed); empty when no page was
            downloaded or nothing matched.
        """
        if not self._html:
            # Nothing was downloaded; re.findall would raise on a non-string.
            return []
        # Parenthesised so both literals are concatenated into one pattern;
        # without a continuation the second half is silently dropped.
        pattern = (
            r'<span class=\"total S_txt2\" title=\"阅读量\">(.*?)<.*?href=\"\/\/weibo.com\/p\/(.*?from='
            r'trendtop_api&refer=index_hot_new).*?title=\"(.*?)\">'
        )
        return [
            {
                'read_cnt': self.read_cnt_transfer(read_cnt),
                'url': 'https://weibo.com/p/{}'.format(path),
                'title': title.strip('#'),
            }
            for read_cnt, path, title in re.findall(pattern, self._html)
        ]

    @staticmethod
    def read_cnt_transfer(read_cnt_str):
        """Convert a read-count label such as ``'1.2亿'`` or ``'35万'`` to an int.

        A trailing ``亿`` multiplies by 100000000, a trailing ``万`` by 10000;
        a string with neither suffix is parsed directly with ``int``.
        """
        for suffix, multiplier in (('亿', 100000000), ('万', 10000)):
            if read_cnt_str.endswith(suffix):
                # round() rather than bare int(): a binary float product can
                # land a hair below the intended integer, which int() would
                # truncate to one less.
                return int(round(float(read_cnt_str.rstrip(suffix)) * multiplier))
        return int(read_cnt_str)

    def get_numbers(self, url):
        """Fetch read, discussion and follower counts for a topic url.

        Intended to parse the detail url of each ``get_topic_list`` entry.
        Not implemented yet; currently returns None.
        """
        pass
if __name__ == '__main__':
    # Demo entry point: log in, then scrape and print the hot topics as JSON.
    import json

    chrome_driver_path1 = '/root/qk_python/python/data/collect/weibo_spider/priv/chromedriver'
    # The prompts end with a real newline so the answer is typed on its own
    # line (the escape had been lost, leaving a stray trailing "n").
    username1 = input('请输入你的账号\n')
    password1 = input('请输入你的密码\n')
    login_url1 = 'https://weibo.com/login.php'
    sl = WeiboSeleniumLogin(chrome_driver_path1)
    if sl.login(username1, password1, login_url1):
        print('login success')
        # Only scrape with a valid session: after a failed login sl.cookies
        # is None and the spider constructor rejects it.
        spd = HotNewTopicSpider(sl.cookies, sl.user_id)
        topic_list1 = spd.get_topic_list()
        print(json.dumps(topic_list1))
    else:
        print('login failed')
最后能成功获取到热门话题以及对应的链接和阅读量。
总结
整个爬取任务中的关键在于反爬,模拟登录,以及抓取正则表达式。本次涉及到了以下爬虫技巧:
- 登录:cookie
- 登录:表单
- 伪装成浏览器访问
- 反“反盗链”
要做一个完整的大爬虫,还需要涉及以下几个部分:
- 验证码:想尽一切办法避免验证码(目前验证码的难度比较大,一定有验证码时,可以做成服务)
- gzip/deflate:流式解压用于提高效率
- 多线程(线程类和线程池)
- http代理池:squid和haipproxy构成高可用的ip代理池
这几个部分本次用不上,后续补充。
参考:
- weibo_spider:https://github.com/Flowsnow/weibo_spider
- smart_login:https://github.com/Flowsnow/smart_login
- 爬取新浪微博热搜:https://www.jianshu.com/p/e6c24feeb806
- python爬虫技巧总结:http://www.pythonclub.org/python-network-application/observer-spider