python scrapy 爬虫实例_scrapy爬虫完整实例

2022-09-13 15:25:29 浏览数 (1)

大家好,又见面了,我是你们的朋友全栈君。

本文主要通过实例介绍了scrapy框架的使用,分享了两个例子,爬豆瓣文本例程 douban 和图片例程 douban_imgs ,具体如下。

例程1: douban

目录树

douban
--douban
  --spiders
    --__init__.py
    --bookspider.py
    --douban_comment_spider.py
    --doumailspider.py
  --__init__.py
  --items.py
  --pipelines.py
  --settings.py
--scrapy.cfg

spiders/__init__.py

# This package will contain the spiders of your Scrapy project

#

# Please refer to the documentation for information on how to create and manage

# your spiders.

bookspider.py

# -*- coding:utf-8 -*-

'''by sudo rm -rf http://imchenkun.com'''

import scrapy

from douban.items import DoubanBookItem

class BookSpider(scrapy.Spider):
    """Crawl the Douban Books Top 250 list and yield one item per book.

    ``parse`` schedules one request per result page; ``parse_next`` reads
    the title, the raw description line and the rating of every book row.
    """

    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250'
    ]

    def parse(self, response):
        """Schedule ``parse_next`` for the first page and each paginator link."""
        # Request the first page
        yield scrapy.Request(response.url, callback=self.parse_next)

        # Request the other pages; urljoin keeps relative hrefs working too
        for link in response.xpath('//div[@class="paginator"]/a/@href').extract():
            yield scrapy.Request(response.urljoin(link), callback=self.parse_next)

    def parse_next(self, response):
        """Yield a ``DoubanBookItem`` for every ``<tr class="item">`` row."""
        for item in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
            # Raw "author / publisher / date / price" line
            book['content'] = item.xpath('td[2]/p/text()').extract()[0]
            book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
            yield book

douban_comment_spider.py

# -*- coding:utf-8 -*-

import scrapy

from faker import Factory

from douban.items import DoubanMovieCommentItem

import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    """Log in to Douban and scrape the reviews of one movie.

    Flow: fetch the login page -> submit the login form (asking the operator
    to type the captcha if one is shown) -> walk the paginated review list ->
    open every review and yield a ``DoubanMovieCommentItem`` for it.

    NOTE: written for Python 2 (``urlparse`` module, ``raw_input``).
    """

    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    # First page of the review list that is crawled after logging in.
    reviews_url = 'https://movie.douban.com/subject/22266320/reviews'

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    # Replace the two placeholders with a real e-mail address and password.
    formdata = {
        'form_email': '你的邮箱',
        'form_password': '你的密码',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        """Start the crawl at the login page instead of ``start_urls``."""
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        """Submit the login form, asking the operator for the captcha if any."""
        # Work on a copy so the class-level dict shared by all instances
        # is not modified.
        formdata = dict(self.formdata)

        # A captcha has to be solved by hand
        if 'captcha_image' in response.body:
            print('Copy the link:')
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print(link)
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            formdata['captcha-solution'] = captcha_solution
            formdata['captcha-id'] = captcha_id

        return [scrapy.FormRequest.from_response(
            response,
            formdata=formdata,
            headers=self.headers,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.after_login
        )]

    def after_login(self, response):
        """Request the first review-list page twice: once per callback."""
        print(response.status)
        # Per-instance copy with the new Host; the class-level dict stays intact.
        # NOTE(review): the requests below go to movie.douban.com while the
        # Host header says www.douban.com - confirm this is intended.
        self.headers = dict(self.headers, Host='www.douban.com')

        yield scrapy.Request(url=self.reviews_url,
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url)
        yield scrapy.Request(url=self.reviews_url,
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)  # do not de-duplicate

    def parse_next_page(self, response):
        """Follow the "next" link, once for its reviews and once for paging."""
        print(response.status)
        next_links = response.xpath('//span[@class="next"]/a/@href').extract()
        if not next_links:
            # No "next" link on this page: nothing more to schedule
            print("Next page Error")
            return

        next_url = response.urljoin(next_links[0])
        print("下一页")
        print(next_url)
        yield scrapy.Request(url=next_url,
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url,
                             dont_filter=True)
        yield scrapy.Request(url=next_url,
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)

    def parse_comment_url(self, response):
        """Schedule ``parse_comment`` for every review linked on a list page."""
        print(response.status)
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract()[0]
            print(comment_title)
            print(comment_url)
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    def parse_comment(self, response):
        """Yield a ``DoubanMovieCommentItem`` built from a review page."""
        print(response.status)
        for item in response.xpath('//div[@id="content"]'):
            comment = DoubanMovieCommentItem()
            comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
            comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
            comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
            comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
            comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]

            data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
            print("data_type: " + data_type)
            # The review text sits in <p> children for type '0' and directly
            # in the first <div> for type '1'; other types leave 'comment' unset.
            # NOTE(review): the separator may originally have been
            # "\t#####\t" - confirm against the upstream source.
            if data_type == '0':
                comment['comment'] = "t#####t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div/p/text()').extract()))
            elif data_type == '1':
                comment['comment'] = "t#####t".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div[1]/text()').extract()))

            comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
            comment['comment_page_url'] = response.url
            # print(comment)
            yield comment

doumailspider.py

# -*- coding:utf-8 -*-

'''by sudo rm -rf http://imchenkun.com'''

import scrapy

from faker import Factory

from douban.items import DoubanMailItem

import urlparse

f = Factory.create()


class MailSpider(scrapy.Spider):
    """Log in to Douban and scrape the doumail (private message) list.

    Flow: fetch the login page -> submit the login form (asking the operator
    to type the captcha if one is shown) -> open the doumail page and yield
    a ``DoubanMailItem`` per listed message.

    NOTE: written for Python 2 (``urlparse`` module, ``raw_input``).
    """

    name = 'douban-mail'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    # Replace the two placeholders with a real e-mail address and password.
    formdata = {
        'form_email': '你的邮箱',
        'form_password': '你的密码',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        """Start the crawl at the login page instead of ``start_urls``."""
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        """Submit the login form, asking the operator for the captcha if any."""
        # Work on a copy so the class-level dict shared by all instances
        # is not modified.
        formdata = dict(self.formdata)

        # A captcha has to be solved by hand
        if 'captcha_image' in response.body:
            print('Copy the link:')
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print(link)
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            formdata['captcha-solution'] = captcha_solution
            formdata['captcha-id'] = captcha_id

        return [scrapy.FormRequest.from_response(
            response,
            formdata=formdata,
            headers=self.headers,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.after_login
        )]

    def after_login(self, response):
        """Request the doumail page with the logged-in cookie jar."""
        print(response.status)
        # Per-instance copy with the new Host; the class-level dict stays intact.
        self.headers = dict(self.headers, Host='www.douban.com')
        return scrapy.Request(url='https://www.douban.com/doumail/',
                              meta={'cookiejar': response.meta['cookiejar']},
                              headers=self.headers,
                              callback=self.parse_mail)

    def parse_mail(self, response):
        """Yield a ``DoubanMailItem`` for every entry of the doumail list."""
        print(response.status)
        for item in response.xpath('//div[@class="doumail-list"]/ul/li'):
            mail = DoubanMailItem()
            mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0]
            mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0]
            mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0]
            mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0]
            print(mail)
            yield mail

__init__.py

(此文件内无代码)

items.py

# -*- coding: utf-8 -*-

import scrapy

class DoubanBookItem(scrapy.Item):
    """One book scraped from the Douban book list."""

    name = scrapy.Field()  # book title
    price = scrapy.Field()  # price
    edition_year = scrapy.Field()  # year of publication
    publisher = scrapy.Field()  # publisher
    ratings = scrapy.Field()  # rating
    author = scrapy.Field()  # author
    content = scrapy.Field()  # raw description line the pipeline splits

class DoubanMailItem(scrapy.Item):
    """One entry of the doumail (private message) list."""

    sender_time = scrapy.Field()  # time the message was sent
    sender_from = scrapy.Field()  # sender
    url = scrapy.Field()  # URL of the message detail page
    title = scrapy.Field()  # message title

class DoubanMovieCommentItem(scrapy.Item):
    """One movie review scraped from a review page."""

    useful_num = scrapy.Field()  # how many people found the review useful
    no_help_num = scrapy.Field()  # how many people found the review useless
    people = scrapy.Field()  # reviewer
    people_url = scrapy.Field()  # reviewer's page
    star = scrapy.Field()  # rating
    comment = scrapy.Field()  # review text
    title = scrapy.Field()  # review title
    comment_page_url = scrapy.Field()  # URL of the review page

pipelines.py

# -*- coding: utf-8 -*-

class DoubanBookPipeline(object):
    """Split a book's raw description line into separate item fields."""

    def process_item(self, item, spider):
        """Fill ``price``, ``edition_year`` and ``publisher`` from ``content``.

        ``content`` is a ' / '-separated line such as
        "[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元";
        the last three parts are publisher, publication date and price.

        :param item: mapping with a ``content`` string
        :param spider: the running spider (unused)
        :returns: the same ``item``; left untouched when ``content`` has
            fewer than three parts, so short lines no longer raise IndexError
        """
        info = item['content'].split(' / ')
        if len(info) < 3:
            return item

        item['price'] = info[-1]
        item['edition_year'] = info[-2]
        item['publisher'] = info[-3]
        return item

class DoubanMailPipeline(object):
    """Clean up the title of a doumail item."""

    def process_item(self, item, spider):
        """Remove every space and newline from ``item['title']``.

        :param item: mapping with a ``title`` string
        :param spider: the running spider (unused)
        :returns: the same ``item`` with the cleaned title
        """
        item['title'] = item['title'].replace(' ', '').replace('\n', '')
        return item

class DoubanMovieCommentPipeline(object):
    """Pass-through pipeline for movie review items."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged."""
        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for douban project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

# http://doc.scrapy.org/en/latest/topics/settings.html

# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html

# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban'

# Package searched for spiders, and where new spiders are created
SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

from faker import Factory

# One faker generator is created when this settings module is imported.
f = Factory.create()

# The User-Agent is a faker-generated string, chosen once per import.
USER_AGENT = f.user_agent()

# Obey robots.txt rules

ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)

# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

#DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

DEFAULT_REQUEST_HEADERS = {
    'Host': 'book.douban.com',
    # "application/xhtml+xml": the '+' had been lost from the media type
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}

#DEFAULT_REQUEST_HEADERS = {

# ‘Accept’: ‘text/html,application/xhtml xml,application/xml;q=0.9,*/*;q=0.8’,

# ‘Accept-Language’: ‘en’,

#}

# Enable or disable spider middlewares

# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

# ‘douban.middlewares.MyCustomSpiderMiddleware’: 543,

#}

# Enable or disable downloader middlewares

# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

# ‘douban.middlewares.MyCustomDownloaderMiddleware’: 543,

#}

# Enable or disable extensions

# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html

#EXTENSIONS = {

# ‘scrapy.extensions.telnet.TelnetConsole’: None,

#}

# Configure item pipelines

# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html

# Only the movie-comment pipeline is enabled; uncomment the entry that
# matches the spider being run.
ITEM_PIPELINES = {
    # 'douban.pipelines.DoubanBookPipeline': 300,
    # 'douban.pipelines.DoubanMailPipeline': 600,
    'douban.pipelines.DoubanMovieCommentPipeline': 900,
}

# Enable and configure the AutoThrottle extension (disabled by default)

# See http://doc.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = ‘httpcache’

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = ‘scrapy.extensions.httpcache.FilesystemCacheStorage’

scrapy.cfg

# Automatically created by: scrapy startproject

#

# For more information about the [deploy] section see:

# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]

default = douban.settings

[deploy]

#url = http://localhost:6800/

project = douban

例程2: douban_imgs

目录树

douban_imgs
--douban_imgs
  --spiders
    --__init__.py
    --download_douban.py
  --__init__.py
  --items.py
  --pipelines.py
  --run_spider.py
  --settings.py
--scrapy.cfg

spiders/__init__.py

# This package will contain the spiders of your Scrapy project

#

# Please refer to the documentation for information on how to create and manage

# your spiders.

download_douban.py

# coding=utf-8

from scrapy.spiders import Spider

import re

from scrapy import Request

from douban_imgs.items import DoubanImgsItem

class download_douban(Spider):
    """Collect the image URLs shown on one Douban photo album page.

    The album is chosen with the ``url`` spider argument, which is
    interpolated into ``http://www.douban.com/photos/album/<url>/``.
    """

    name = 'download_douban'

    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        """Store the album id and derive ``start_urls`` from it.

        :param url: album id placed into the album URL
        """
        # Run the base-class initialisation first so the remaining spider
        # arguments are handled, then set this spider's own attributes.
        super(download_douban, self).__init__(*args, **kwargs)
        self.allowed_domains = ['douban.com']
        self.start_urls = [
            'http://www.douban.com/photos/album/%s/' % (url)]
        self.url = url

    def start_requests(self):
        """Request every start URL with the browser-like default headers."""
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        """Yield one ``DoubanImgsItem`` holding all image URLs of the page."""
        list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item

__init__.py

(此文件内无代码)

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items

#

# See documentation in:

# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

from scrapy import Item, Field

class DoubanImgsItem(scrapy.Item):
    """Image URLs found on an album page plus the download results."""

    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = Field()  # URLs collected by the spider
    images = Field()  # presumably filled by Scrapy's images pipeline - confirm
    image_paths = Field()  # storage paths of the successfully downloaded images

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don’t forget to add your pipeline to the ITEM_PIPELINES setting

# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.images import ImagesPipeline

from scrapy.exceptions import DropItem

from scrapy import Request

from scrapy import log

class DoubanImgsPipeline(object):
    """Pass-through pipeline for image items."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged."""
        return item

class DoubanImgDownloadPipeline(ImagesPipeline):
    """Download every URL in ``item['image_urls']`` with browser-like headers."""

    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        """Yield one download request per image URL, with the URL as referer."""
        for image_url in item['image_urls']:
            # Build the headers per request instead of rewriting the shared
            # class-level dict on every iteration.
            headers = dict(self.default_headers, referer=image_url)
            yield Request(image_url, headers=headers)

    def item_completed(self, results, item, info):
        """Record the paths of the successful downloads on the item.

        :raises DropItem: when no image of the item was downloaded
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

run_spider.py

from scrapy import cmdline

# Run the spider as if "scrapy crawl download_douban" had been typed in a shell
cmd_str = 'scrapy crawl download_douban'
cmdline.execute(cmd_str.split(' '))

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for douban_imgs project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

# http://doc.scrapy.org/en/latest/topics/settings.html

# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html

# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban_imgs'

# Package searched for spiders, and where new spiders are created
SPIDER_MODULES = ['douban_imgs.spiders']
NEWSPIDER_MODULE = 'douban_imgs.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# USER_AGENT = ‘douban_imgs ( http://www.yourdomain.com)’

# Configure maximum concurrent requests performed by Scrapy (default: 16)

# CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)

# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

# DOWNLOAD_DELAY=3

# The download delay setting will honor only one of:

# CONCURRENT_REQUESTS_PER_DOMAIN=16

# CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)

# COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)

# TELNETCONSOLE_ENABLED=False

# Override the default request headers:

# DEFAULT_REQUEST_HEADERS = {

# ‘Accept’: ‘text/html,application/xhtml xml,application/xml;q=0.9,*/*;q=0.8’,

# ‘Accept-Language’: ‘en’,

# }

# Enable or disable spider middlewares

# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# SPIDER_MIDDLEWARES = {

# ‘douban_imgs.middlewares.MyCustomSpiderMiddleware’: 543,

# }

# Enable or disable downloader middlewares

# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html

# DOWNLOADER_MIDDLEWARES = {

# ‘douban_imgs.middlewares.MyCustomDownloaderMiddleware’: 543,

# }

# Enable or disable extensions

# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html

# EXTENSIONS = {

# ‘scrapy.telnet.TelnetConsole’: None,

# }

# Configure item pipelines

# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html

ITEM_PIPELINES = {
    'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,
}

# Directory the downloaded images are stored in. Raw string: the backslash
# must stay literal ("\d" is not a valid escape sequence).
IMAGES_STORE = r'D:\doubanimgs'
# IMAGES_STORE = '/tmp'

# NOTE(review): presumably a number of days - confirm in the Scrapy docs
IMAGES_EXPIRES = 90

# Enable and configure the AutoThrottle extension (disabled by default)

# See http://doc.scrapy.org/en/latest/topics/autothrottle.html

# NOTE: AutoThrottle will honour the standard settings for concurrency and delay

# AUTOTHROTTLE_ENABLED=True

# The initial download delay

# AUTOTHROTTLE_START_DELAY=5

# The maximum download delay to be set in case of high latencies

# AUTOTHROTTLE_MAX_DELAY=60

# Enable showing throttling stats for every response received:

# AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)

# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

# HTTPCACHE_ENABLED=True

# HTTPCACHE_EXPIRATION_SECS=0

# HTTPCACHE_DIR=’httpcache’

# HTTPCACHE_IGNORE_HTTP_CODES=[]

# HTTPCACHE_STORAGE=’scrapy.extensions.httpcache.FilesystemCacheStorage’

scrapy.cfg

# Automatically created by: scrapy startproject

#

# For more information about the [deploy] section see:

# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]

default = douban_imgs.settings

[deploy]

#url = http://localhost:6800/

project = douban_imgs

总结

以上就是本文关于scrapy爬虫完整实例的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题,如有不足之处,欢迎留言指出。感谢朋友们对本站的支持!

发布者:全栈程序员栈长,转载请注明出处:https://javaforall.cn/153122.html 原文链接:https://javaforall.cn

0 人点赞