[TOC]
Contents:
(1) Simple crawling of a given site with urllib
(2) The Scrapy crawler framework
(3) Parsing crawled pages with BeautifulSoup
0x00 Simple Crawling with urllib
1. A first crawler
Case 1: using the urllib package that ships with Python
#!/usr/bin/python
#Purpose: crawler lesson 1
import urllib.request   # import the request module from the urllib package
import urllib.parse     # used for URL quoting/parsing

#Case 1:
response = urllib.request.urlopen("http://www.weiyigeek.github.io/" + urllib.parse.quote("网络安全"))  # percent-encode the Chinese part of the URL
html = response.read()          # read() returns a bytes string
html = html.decode('utf-8')     # decode it to text
print("正在写入文件之中.....")
f = open('weiyigeek.txt','w+',encoding='utf-8')  # open the output file
f.writelines(html)
f.close()                       # close it
print("网站请求的结果:\n",html)

#Case 2:
url = "http://placekitten.com/g/600/600"
response = urllib.request.urlopen(url)  # accepts a URL string or a Request() object and returns a response object
img = response.read()
filename = url[-3:] + '.jpg'
with open(filename,'wb+') as f:         # note: write in binary mode
    f.write(img)
2. Crawler implementation / improvements
Case 1: a spider that calls the Youdao translation API for Chinese-English translation
#!/usr/bin/python
#Purpose: crawler lesson 2 - JSON / proxy
import urllib.request
import urllib.parse
import json
import time

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
while True:
    i = input("请输入翻译的英文(输入Q退出):")
    if i == 'Q' or i == 'q':
        break
    data = {}
    data['i'] = i
    data['from'] = 'AUTO'
    data['to'] = 'AUTO'
    data['doctype'] = 'json'
    data['smartresult'] = 'dict'
    data['client'] = 'fanyideskweb'
    data['version'] = '2.1'
    data['keyfrom'] = 'fanyi.web'
    data['salt'] = '15550362545153'
    data['sign'] = 'a28b8eb61693e30842ebbb4e0b36d406'
    data['action'] = 'FY_BY_CLICKBUTTION'
    data['typoResult'] = 'false'
    data = urllib.parse.urlencode(data).encode('utf-8')

    # Build a Request object with the POST data and add request headers
    req = urllib.request.Request(url, data)   # a headers dict can also be passed in directly
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0')
    req.add_header('Cookie',' YOUDAO_MOBILE_ACCESS_TYPE=1; OUTFOX_SEARCH_USER_ID=597755369@10.169.0.84; OUTFOX_SEARCH_USER_ID_NCOO=1911553850.7151666; YOUDAO_FANYI_SELECTOR=ON; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abc8N5HySla85aD-6kpOw; ___rl__test__cookies=1555036254514; UM_distinctid=16a0f2c1b0b146-0612adf0fe3fd6-4c312c7c-1fa400-16a0f2c1b0c659; SESSION_FROM_COOKIE=fanyiweb')
    req.add_header('Referer','http://fanyi.youdao.com/')

    # Send the request and read the response
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    jtarget = json.loads(html)   # parse the JSON response
    print("翻译后的结果 :",jtarget['translateResult'][0][0]['tgt'])
    time.sleep(1)                # sleep 1s so requests are not sent too frequently

    print("请求头信息:",req.headers)
    print("请求URL:",res.geturl())
    print("状态码:",res.getcode())
    print("返回头消息:\n",res.info())
# 请输入翻译的英文(输入Q退出):whoami
# 翻译后的结果 : 显示本用户信息
# 请求头信息: {'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 'Cookie': ' YOUDAO_MOBILE_ACCESS_TYPE=1; OUTFOX_SEARCH_USER_ID=597755369@10.169.0.84; OUTFOX_SEARCH_USER_ID_NCOO=1911553850.7151666; YOUDAO_FANYI_SELECTOR=ON; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abc8N5HySla85aD-6kpOw; ___rl__test__cookies=1555036254514; UM_distinctid=16a0f2c1b0b146-0612adf0fe3fd6-4c312c7c-1fa400-16a0f2c1b0c659; SESSION_FROM_COOKIE=fanyiweb', 'Referer': 'http://fanyi.youdao.com/'}
# 请求URL: http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule
# 状态码: 200
# 返回头消息:
# Server: Tengine
# Date: Fri, 12 Apr 2019 03:23:02 GMT
# Content-Type: application/json;charset=utf-8
# Transfer-Encoding: chunked
# Connection: close
# Vary: Accept-Encoding
# Vary: Accept-Encoding
# Content-Language: en-US
3. Crawler request settings
Case 3: requesting a site through a proxy
#!/usr/bin/python3
#Crawler lesson 3: proxies. The usual steps for using a proxy IP with urllib are:
#  1. set the proxy address
#  2. create a ProxyHandler
#  3. build an opener
#  4. install the opener
import urllib.request
import random

url1 = 'http://myip.kkcha.com/'
url2 = 'http://freeapi.ipip.net/'
proxylist = ['116.209.52.49:9999','218.60.8.83:3129']
ualist = ['Mozilla/5.0 (compatible; MSIE 12.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
          'Mozilla/5.0 (Windows NT 6.7; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
          'Mozilla/5.0 (Windows NT 6.7; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
         ]
proxyip = random.choice(proxylist)

# Proxy setup: the argument is a dict of {'scheme':'proxy-ip:port'}
proxy = urllib.request.ProxyHandler({'http':proxyip})
# Build a customised opener
pro_opener = urllib.request.build_opener(proxy)
pro_opener.addheaders = [('User-Agent',random.choice(ualist))]  # random User-Agent header
# Install the opener so urlopen() uses it
urllib.request.install_opener(pro_opener)

## Requests now go through the proxy (opener.open(url) would also work)
url2 = url2 + proxyip.split(":")[0]
with urllib.request.urlopen(url1) as u:
    print(u.headers)
    res = u.read().decode('utf-8')
    print(res)

with urllib.request.urlopen(url2) as u:
    res = u.read().decode('utf-8')
    print(res)
3. Exception handling with the urllib library
#!/usr/bin/python3
#Purpose: urllib exception handling
from urllib.request import Request,urlopen
from urllib.error import HTTPError,URLError

urlerror = 'http://www.weiyigeek.com'
urlcode = 'http://www.weiyigeek.github.io/demo.html'

def url_open(url):
    req = Request(url)
    req.add_header('APIKEY','This is a password!')
    try:
        res = urlopen(req)
    except (HTTPError,URLError) as e:
        if hasattr(e,'code'):        # must be checked before the reason attribute
            print('HTTP请求错误代码:', e.code)
            print(e.read().decode('utf-8'))   # [note] e.read() returns the error page body
        elif hasattr(e,'reason'):
            print('服务器链接失败',e.reason)
    else:
        print("Successful!")

if __name__ == '__main__':
    url_open(urlerror)
    url_open(urlcode)
################## 执行结果 #####################
# 服务器链接失败 [Errno 11001] getaddrinfo failed
# HTTP请求错误代码: 404
# <html>
# <head><title>404 Not Found</title></head>
# <body>
# <center><h1>404 Not Found</h1></center>
# <hr><center>nginx/1.15.9</center>
# </body>
# </html>
4. Regular expressions for crawlers
Case 4: combining regex with a crawler
#!/usr/bin/python3
#Purpose: regular expressions + crawler
from urllib.request import Request,urlopen,urlretrieve
from urllib.error import HTTPError,URLError
import re
import os

def url_open(url):
    req = Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0')
    try:
        res = urlopen(req)
        html = res.read()
    except HTTPError as e:
        print("服务器请求错误:",e.code)
        return 0
    except URLError as e:
        print("链接服务器Fail:",e.reason)
        return 0
    else:
        return html

def save_img(url,dir):
    i = 0
    os.mkdir(dir)
    os.chdir(os.curdir + '/' + dir)
    for each in url:
        # urlretrieve is slated for deprecation, but it really is convenient
        urlretrieve(each,str(i) + '.jpg',None)
        i += 1
    else:
        print("下载完成!")

def get_img(url):
    html = url_open(url)
    if html == 0:
        exit("请求错误退出")
    res = html.decode('utf-8')
    p = r'<img src="([^"]+\.jpg)"'
    imglist = re.findall(p,res)
    save_img(imglist,'test')
    print(imglist)

if __name__ == '__main__':
    url = 'http://tieba.baidu.com/f?kw=重庆第二师范学院&ie=utf-8&tab=album'
    get_img(url)
WeiyiGeek.正则与爬虫利用
5. More regex for crawlers
Case 5: scraping ip:port pairs from a proxy-list site
#!/usr/bin/python3
#Final urllib lesson: scraping proxy IPs
import urllib.request
from urllib.error import HTTPError,URLError
import re
import os

def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0')
    try:
        res = urllib.request.urlopen(req)
    except (HTTPError,URLError) as e:
        print("出现错误:",e.code,'错误的网页:',e.read())
        return 0
    else:
        return res.read().decode('utf-8')

def main1(url,filename):
    html = url_open(url)
    if html == 0:
        exit("请求错误,程序退出!")
    # Match an IP in one <td> followed by a port in the next <td> -- this regex is the tricky part
    exp = r'<td>((?:(?:[01]{0,1}\d{0,1}\d|2[0-4]\d|25[0-5])\.){0,3}(?:[01]{0,1}\d{0,1}\d|2[0-4]\d|25[0-5]))</td>\n(?:\s*?)<td>(?P<port>\d{0,4})</td>'
    regres = re.findall(exp,html,re.M)
    iplist = []
    for each in regres:
        ipport = each[0] + ':' + each[1]
        iplist.append(ipport)
    with open(filename,'w+',encoding='utf-8') as f:
        for i in range(len(iplist)):
            f.write(iplist[i] + '\n')

if __name__ == '__main__':
    url = 'https://www.xicidaili.com/nn/'
    main1(url,'proxyip.txt')
######### Scraped proxy results ################
# 119.102.186.99:9999
# 111.177.175.234:9999
# 222.182.121.10:8118
# 110.52.235.219:9999
# 112.85.131.64:9999
0x02 The Scrapy Crawler Framework
(1) Scrapy installation and setup
1.1 Installing via Anaconda
This is one of the simplest ways to install Scrapy (especially on Windows); you can use it, or pick the platform-specific method described below. Anaconda is a Python distribution that bundles the common data-science libraries; if it is not installed yet, download the package for your platform from https://www.continuum.io/downloads. Once Anaconda is in place, Scrapy can be installed easily with conda: conda install Scrapy
1.2 Installing on Windows
On Windows the smoothest route is to install from wheel files; since my environment here is Windows 10, I still use pip3.
#Current environment: Win10 + Python 3.7
pip3 install wheel
pip3 install lxml             # install lxml -- pick the wheel matching your Python version
pip3 install zope.interface   # install zope.interface
pip3 install Twisted
pip3 install pywin32
pip3 install Scrapy           # finally install Scrapy itself

#Installing pyOpenSSL
#Download the wheel file from the official site: https://pypi.python.org/pypi/pyOpenSSL#downloads
pip3 install pyOpenSSL-16.2.0-py2.py3-none-any.whl

#Python 3.7: upgrade all installed packages in one go (relies on pip internals)
from subprocess import call
from pip._internal.utils.misc import get_installed_distributions
for dist in get_installed_distributions():
    call("pip install --upgrade " + dist.project_name, shell=True)
1.3 CentOS, RedHat, Fedora
Make sure the required libraries are installed first, then run:
sudo yum groupinstall development tools
sudo yum install python34-devel epel-release libxslt-devel libxml2-devel openssl-devel
pip3 install Scrapy
1.4 Ubuntu, Debian, Deepin
First make sure the required dependencies are installed, then run:
sudo apt-get install build-essential python3-dev libssl-dev libffi-dev libxml2 libxml2-dev libxslt1-dev zlib1g-dev
pip3 install Scrapy
1.5 Mac OS
Building Scrapy's dependencies on a Mac needs a C compiler and development headers, which are normally provided by Xcode; install them and then Scrapy with:
xcode-select --install
pip3 install Scrapy
After installation, verify it from the command line; if you see output similar to the result below, Scrapy was installed successfully.
WeiyiGeek.scrapy
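For reference, a quick way to check is shown below (a minimal sketch; the exact versions printed will of course differ on your machine):
scrapy version -v   # should print the Scrapy, lxml, Twisted and Python versions on a working install
scrapy -h           # lists the available scrapy sub-commands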
(2) Introduction to Scrapy and Basic Usage
Scrapy is a Python-based crawling framework: an application framework written for crawling websites and extracting structured data, usable for data mining, information processing, storing historical data, and similar needs.
Crawling a site with Scrapy takes four steps:
- Create a Scrapy project
- Define the Item container: a container for the scraped data, similar to a dict but with an extra protection mechanism so that a misspelled, undeclared field raises an error (see the short sketch after this list)
- Write the spider
- Store the content
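As a quick illustration of that protection (a minimal sketch; the item class and field names here are made up and are not part of the project built later):
import scrapy

class MovieItem(scrapy.Item):
    title = scrapy.Field()
    score = scrapy.Field()

item = MovieItem()
item['title'] = 'example'   # fine: 'title' is a declared field
item['titel'] = 'oops'      # raises KeyError: undeclared (misspelled) field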
Framework diagram:
WeiyiGeek.Scrapy
2.1 Common scrapy commands
scrapy startproject douban                        # create and initialise a project named douban
scrapy genspider douban_spider movie.douban.com   # generate a spider file; the last argument is the domain to crawl
scrapy crawl douban_spider                        # run the crawl; douban_spider is the spider (entry point) name
scrapy shell <url>                                # interactive shell, used inside a project to test data-extraction code
scrapy shell "http://scrapy.org" --nolog          # suppress log output; note the double quotes
scrapy crawl douban_spider -o movielist.json      # store the scraped data in a given format
scrapy crawl douban_spider -o movielist.csv
2.2 Anatomy of a scrapy project
weiyigeek
│ items.py        # data model file; defines the container objects (number, name, description, rating)
│ middlewares.py  # middleware settings (e.g. spoofing the crawler's IP address)
│ pipelines.py    # writes the data through pipelines to a database / disk
│ settings.py     # project settings (USER-AGENT, crawl delay, ...)
│ __init__.py
├─spiders
│ │ douban_spider.py  # the spider entry point
│ │ __init__.py
scrapy.cfg        # project configuration file
2.3 Scrapy selectors
Scrapy extracts data with a selector mechanism (selectors) based on XPath and CSS expressions; a selector has four basic methods:
xpath(): takes an XPath expression and returns a SelectorList of all nodes matching it
#XPath syntax example:
response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
#selects every li under the ol with class grid_view, inside the div with class article
WeiyiGeek.xpath语法属性
WeiyiGeek.示例
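The screenshots above only illustrate XPath syntax, so a few commonly used expressions are listed here for reference (illustrative only; the element and class names are hypothetical):
response.xpath('//a/@href').extract()                        # href attribute of every <a> element
response.xpath('//div[@class="info"]/h2/text()').extract()   # text of <h2> directly under <div class="info">
response.xpath('//li[1]//img/@src').extract_first()          # src of the first <li>'s image, or None if absent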
css(): takes a CSS expression and returns a SelectorList of all nodes matching it
response.css('.classname tag::text').extract()   # e.g. extract the text of tags under a given class
extract(): serialises the node(s) to unicode strings and returns a list
re(): extracts data with the given regular expression and returns a list of unicode strings
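A minimal sketch of how these chain together (the class name and rating format are assumptions, not taken from a specific site):
titles = response.css('.title::text').extract()          # SelectorList -> list of unicode strings
first  = response.css('.title::text').extract_first()    # only the first match, or None
scores = response.xpath('//span[@class="rating_num"]/text()').re(r'\d+\.\d+')   # regex applied on top of the selection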
2.4 Interactive debugging with scrapy shell
Description: the Scrapy shell is an interactive console that lets you try out and debug your scraping code without starting a spider;
- shelp() - print a help list of the available objects and shortcuts
- fetch(request_or_url) - fetch a new response for the given Request object or URL and update the related objects
- view(response) - open the given response in the local browser and save the downloaded html. A base tag is added to the response body so external resources (images, css) display correctly. Note that this creates a temporary local file which is not deleted automatically.
- crawler - the current Crawler object.
- spider - the Spider handling the URL, or a plain Spider object if no spider handles the current URL.
- request - the Request object of the most recently fetched page; it can be modified with replace(), or a new request fetched with the fetch shortcut.
- response - the Response object of the most recently fetched page.
- sel - a Selector built from the most recently fetched response.
- settings - the current Scrapy settings
Example:
> scrapy shell "http://movie.douban.com/chart"
>>> help(obj)            # help(<object>) prints help for any of the shell objects above
>>> request
<GET https://movie.douban.com/chart>
>>> response.url
'https://movie.douban.com/chart'
>>> response
<200 https://movie.douban.com/chart>
>>> response.headers     # response headers
>>> response.body        # page source
>>> response.text
>>> response.xpath('//title')            # returns an xpath SelectorList
>>> response.xpath('//title').extract()  # extract the content matched by the xpath expression
['<title>\n豆瓣电影排行榜\n</title>']
>>> response.xpath('//title/text()').extract()   # extract only the text
['\n豆瓣电影排行榜\n']
>>> response.xpath("//div[@class='pl2']//a").extract_first().strip()  # extract_first returns the first match
'<a href="https://movie.douban.com/subject/25986662/" class="">\n疯狂的外星人\n / <span style="font-size:13px;">Crazy Alien</span>\n </a>'

# Extracting with CSS
>>> sel.css('.pl2 a::text').extract_first().strip()
'疯狂的外星人\n /'

# Some sites require request headers to be set; this is how to do it in the shell:
>>> from scrapy import Request   # import the Request class
>>> data = Request("https://www.taobao.com",headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"})
>>> fetch(data)                  # fetch the site with that request
2017-11-30 22:24:14 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.taobao.com> (referer: None)
>>> sel.xpath('/html/body/div[4]/div[1]/div[1]/div[1]/div/ul/li[1]/a[1]')
[<Selector xpath='/html/body/div[4]/div[1]/div[1]/div[1]/div/ul/li[1]/a[1]' data='<a href="https://www.taobao.com/markets/'>]
>>> data.headers                 # inspect the headers that were set
{b'User-Agent': [b'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'], b'Accept-Encoding': [b'gzip,deflate']}

# Matching several strings and looping over them
>>> fetch('http://weiyigeek.github.io')
>>> title = response.css('.article-header a::text').extract()
>>> for each in title:
...     print(each)
...
安全设备策略绕过技术总结.md
Win平台安全配置.md
Python3 正则表达式特殊符号及用法.md
Python3爬虫学习.md
磁盘高可用解决方案(DBA).md
Nodejs入门学习1.md
Node.js简介与安装.md
域控安全基础.md
Win内网渗透信息搜寻.md
高可用服务解决方案(DBA).md
WeiyiGeek.scrapyshell
2.5 A simple scrapy example
scrapy startproject weiyigeek
scrapy genspider blog_spider www.weiyigeek.github.io

'''
items.py -- data model file describing the objects to scrape
'''
import scrapy

class WeiyigeekItem(scrapy.Item):
    # items.py defines the data model for the objects to scrape (title, link, creation time)
    title = scrapy.Field()   # post title
    href = scrapy.Field()    # post URL
    time = scrapy.Field()    # creation time

'''
blog_spider.py -- the main spider file
'''
# -*- coding: utf-8 -*-
import scrapy
from weiyigeek.items import WeiyigeekItem   # import the item class from the data container (i.e. this project's items.py)

class BlogSpiderSpider(scrapy.Spider):
    name = 'blog_spider'                           # spider name
    allowed_domains = ['www.weiyigeek.github.io']  # domains the spider is allowed to crawl
    start_urls = ['http://www.weiyigeek.github.io/','http://weiyigeek.github.io/page/2/']  # start URLs handed to the scheduler

    # Parse the response returned for each request
    def parse(self, response):
        sel = scrapy.selector.Selector(response)   # scrapy selector
        sites = sel.css('.article-header')         # filter with a CSS selector
        items = []
        for each in sites:
            item = WeiyigeekItem()                 # the data container class
            item['title'] = each.xpath('a/text()').extract()
            item['href'] = each.xpath('a/@href').extract()
            item['time'] = each.xpath('div[@class="article-meta"]/time/text()').extract()  # note the relative XPath used here
            items.append(item)
            # Print to the screen
            print(">>>",item['title'],item['href'],item['time'])
        return items
WeiyiGeek.执行结果
(3) A Scrapy Example Project
Description: scraping the Douban Movie Top 250 list;
#Step 1. Create the spider project and generate the spider
scrapy startproject douban
scrapy genspider douban_spider movie.douban.com

'''
Step 2. Edit the items model file
'''
import scrapy

class DoubanItem(scrapy.Item):
    serial_number = scrapy.Field()  # ranking number
    movie_name = scrapy.Field()     # movie name
    introduce = scrapy.Field()      # introduction
    star = scrapy.Field()           # star rating
    evaluate = scrapy.Field()       # number of ratings
    describle = scrapy.Field()      # description

'''
Step 3. Edit the spider file
'''
# -*- coding: utf-8 -*-
import scrapy
from douban.items import DoubanItem   # import the item container (the project's items.py)

class DoubanSpiderSpider(scrapy.Spider):
    name = 'douban_spider'                             # spider name
    allowed_domains = ['movie.douban.com']              # domains the spider is allowed to crawl
    start_urls = ['https://movie.douban.com/top250']    # start URL handed to the scheduler

    def parse(self, response):
        movie_list = response.xpath("//div[@class='article']//ol[@class='grid_view']/li")
        for i_item in movie_list:
            douban_item = DoubanItem()   # initialise the item model
            # ending an expression with text() fetches the node text; extract_first() takes the first match
            douban_item['serial_number'] = i_item.xpath(".//div[@class='item']//em/text()").extract_first()  # ranking
            douban_item['movie_name'] = i_item.xpath(".//div[@class='info']/div[@class='hd']/a/span[1]/text()").extract_first()  # name
            descs = i_item.xpath(".//div[@class='info']//div[@class='bd']/p[1]/text()").extract_first()  # raw introduction
            # strip the whitespace
            desc_str = ''
            for i_desc in descs:
                i_desc_str = "".join(i_desc.split())
                desc_str += i_desc_str
            douban_item['introduce'] = desc_str   # introduction
            douban_item['star'] = i_item.xpath(".//span[@class='rating_num']/text()").extract_first()        # star rating
            douban_item['evaluate'] = i_item.xpath(".//div[@class='star']//span[4]/text()").extract_first()  # number of ratings
            douban_item['describle'] = i_item.xpath(".//p[@class='quote']/span/text()").extract_first()      # description
            yield douban_item   # push the result into the item pipeline for processing (important)
        # Handle the next page
        next_link = response.xpath("//div[@class='article']//span[@class='next']/link/@href").extract()
        if next_link:
            next_link = next_link[0]
            yield scrapy.Request("https://movie.douban.com/top250" + next_link,callback=self.parse)   # (important)
# Notes:
# 1 After each for loop finishes, fetch the link to the next page: next_link
# 2 On the last page there is no next page, so the check is needed
# 3 Building the next-page URL: clicking page 2 gives https://movie.douban.com/top250?start=25&filter=
# 4 callback=self.parse : the request callback
'''
Step 4. Edit the settings file
'''
$ grep -E -v "^#" settings.py
BOT_NAME = 'douban'   # project name
SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.5
# Item pipeline settings
ITEM_PIPELINES = {
   'douban.pipelines.DoubanPipeline': 300,
}
# Downloader middleware settings
DOWNLOADER_MIDDLEWARES = {
   'douban.middlewares.my_proxy': 543,
   'douban.middlewares.my_useragent': 544,
}
# MongoDB connection settings
mongo_host = '172.16.0.0'
mongo_port = 27017
mongo_db_name = 'douban'
mongo_db_collection = 'douban_movie'
'''
Step 5. Edit pipelines.py
'''
# -*- coding: utf-8 -*-
import pymongo
from douban.settings import mongo_host,mongo_port,mongo_db_name,mongo_db_collection

class DoubanPipeline(object):
    def __init__(self):
        host = mongo_host
        port = mongo_port
        dbname = mongo_db_name
        sheetname = mongo_db_collection
        client = pymongo.MongoClient(host=host,port=port)
        mydb = client[dbname]
        self.post = mydb[sheetname]
    def process_item(self, item, spider):
        data = dict(item)
        self.post.insert(data)
        return item

'''
Step 6. The middleware file: middlewares.py
'''
# IP proxy middleware (spoof the crawler's IP address) / random User-Agent header spoofing
import base64
import random

# Append these classes at the end of the file:
class my_proxy(object):   # proxy middleware
    def process_request(self,request,spider):
        request.meta['proxy'] = 'http-cla.abuyun.com:9030'
        proxy_name_pass = b'H622272STYB666BW:F78990HJSS7'
        enconde_pass_name = base64.b64encode(proxy_name_pass)
        request.headers['Proxy-Authorization'] = 'Basic ' + enconde_pass_name.decode()
# Notes: based on the HTTP tunnel details of a purchased Abuyun account
# request.meta['proxy'] : 'server-address:port'
# proxy_name_pass: b'licence-id:secret' ; the b prefix makes it a bytes string for base64
# base64.b64encode() : base64-encode the variable
# 'Basic ' : there must be a space after "Basic"
class my_useragent(object):   # random User-Agent middleware
    def process_request(self, request, spider):
        UserAgentList = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527 (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]
        agent = random.choice(UserAgentList)
        request.headers['User-Agent'] = agent   # use the standard header name
## Running scrapy crawl will now show the middleware classes configured above taking effect
The data can also be saved to a json or csv file:
- scrapy crawl douban_spider -o movielist.csv
- scrapy crawl douban_spider -o movielist.json
Scrapy Pitfalls
Q: dependency problems when installing Twisted?
WeiyiGeek.问题1
Fix: download the Twisted wheel from the official site and install it, e.g.
Twisted-19.2.0-cp37-cp37m-win_amd64.whl
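Installing the downloaded wheel and then retrying Scrapy looks roughly like this (the wheel file name depends on your Python version and platform):
pip3 install Twisted-19.2.0-cp37-cp37m-win_amd64.whl
pip3 install Scrapy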
Everyday Work Notes
Purpose: look up the province a school belongs to from its name
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#Purpose: resolve a school's name to its region
import urllib.request
import urllib.parse
from lxml import etree

number = []
name = []
file1 = open('2.txt','r',encoding='utf-8')
for i in file1:
    keyvalue = i.split(" ")
    number.append(str(keyvalue[0]))
    name.append(str(keyvalue[1]))
file1.close()

def test1(html,parm=""):
    dom_tree = etree.HTML(html)
    links = dom_tree.xpath("//div[@class='basic-info cmn-clearfix']/dl/dd/text()")
    for i in links:
        for v in range(0,len(number)):
            if (i.find(name[v][:2]) != -1):
                return number[v] + name[v] + parm + "\n"
    return "未找到(或者是海外)"

file = open('1.txt','r+',encoding='utf-8')
file.seek(0,0)
for eachline in file:
    url = "https://baike.baidu.com/item/" + urllib.parse.quote(eachline[6:])
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8')               # decode the response
    f = open('c:\\weiyigeek.txt','a+',encoding='utf-8')   # open the output file for appending
    res = test1(html,str(eachline[6:]))
    f.writelines(res)
    f.close()                                             # close it
file.close()
WeiyiGeek.执行后效果