Requirement: scrape the movie data from Douban Movie Top 250 (https://movie.douban.com/top250): the title (title), rating (score), movie details (content), and one-line synopsis (info).
I. Analyze the page and work out the XPath for each field
Title XPath: //div[@class="info"]//span[@class="title"][1]/text()
Movie details XPath: //div[@class="info"]//div[@class="bd"]/p[1]
Rating XPath: //div[@class="info"]//div[@class="star"]/span[2]/text()
Synopsis XPath: //div[@class="info"]//span[@class="inq"]/text()
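Before writing any code, these expressions can be sanity-checked in scrapy shell. A minimal check, assuming Douban rejects Scrapy's default User-Agent (hence the override):

scrapy shell -s USER_AGENT="Mozilla/5.0" "https://movie.douban.com/top250"

# then, inside the shell, each expression should return the value for the first movie:
>>> response.xpath('//div[@class="info"]//span[@class="title"][1]/text()').extract_first()
>>> response.xpath('//div[@class="info"]//div[@class="star"]/span[2]/text()').extract_first()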
II. Create the project and complete items.py
Create the project: scrapy startproject douban
Enter the project directory: cd douban
Generate the spider: scrapy genspider movetop250 douban.com
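After these three commands the generated project should look roughly like this (Scrapy's default template; main.py is added by hand in the next step, next to scrapy.cfg):

douban/
├── scrapy.cfg
├── main.py          # added by hand in step 1 below
└── douban/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── movetop250.py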
1. The launcher script --- main.py

from scrapy import cmdline

cmdline.execute("scrapy crawl movetop250".split())
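An equivalent launcher, if you prefer not to shell out to the scrapy command, is to run the crawl in-process; a sketch using Scrapy's CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load settings.py and run the spider registered as "movetop250"
process = CrawlerProcess(get_project_settings())
process.crawl("movetop250")
process.start()  # blocks until the crawl finishes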
2. items.py

import scrapy


class DoubanItem(scrapy.Item):
    # movie title
    title = scrapy.Field()
    # rating
    score = scrapy.Field()
    # movie details (director, cast, year, and so on)
    content = scrapy.Field()
    # one-line synopsis
    info = scrapy.Field()
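A scrapy.Item behaves like a dict with a fixed set of declared fields, so a typo in a field name fails loudly instead of silently creating a new key. A quick illustration (sample values are made up):

item = DoubanItem(title="肖申克的救赎", score="9.7")
item["info"] = "希望让人自由。"
print(dict(item))    # {'title': '肖申克的救赎', 'score': '9.7', 'info': '希望让人自由。'}
item["year"] = 1994  # raises KeyError: DoubanItem does not support field: year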
3. movetop250.py

import scrapy

from douban.items import DoubanItem


class Movetop250Spider(scrapy.Spider):
    name = 'movetop250'
    allowed_domains = ['douban.com']
    offset = 0
    url = "https://movie.douban.com/top250?start="
    start_urls = [url + str(offset)]

    def parse(self, response):
        moves = response.xpath('//div[@class="info"]')
        for move in moves:
            item = DoubanItem()
            # movie title
            title = move.xpath('.//span[@class="title"][1]/text()').extract()[0]
            # movie details, e.g. director and cast; the <p> holds several
            # text nodes, so extract them all and join
            content = move.xpath('.//div[@class="bd"]/p[1]/text()').extract()
            content = "".join(content).strip()
            # rating
            score = move.xpath('.//div[@class="star"]/span[2]/text()').extract()[0]
            # one-line synopsis (missing for a few movies)
            info = move.xpath('.//span[@class="inq"]/text()').extract()
            info = info[0] if info else ""
            item["title"] = title
            item["content"] = content
            item["score"] = score
            item["info"] = info
            yield item
        # request the next page (10 pages of 25 movies each)
        if self.offset < 225:
            self.offset += 25
            url = self.url + str(self.offset)
            yield scrapy.Request(url, callback=self.parse)
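A note on the pagination above: instead of counting offsets up to 225, the spider could follow the page's own "next" link. A sketch, assuming the paginator is marked up as span[@class="next"] (check this against the page source):

# drop the offset bookkeeping and end parse() with:
next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
if next_href:
    # response.follow resolves the relative ?start=... href against the current URL
    yield response.follow(next_href, callback=self.parse)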
4. pipelines.py

import json

import pymongo
# scrapy.conf has been removed from Scrapy; read the project settings this way instead
from scrapy.utils.project import get_project_settings

settings = get_project_settings()


class DoubanMongodbPipeline(object):
    def __init__(self):
        print("=====start=====")
        host = settings["MONGO_HOST"]
        port = settings["MONGO_PORT"]
        dbname = settings["MONGO_DBNAME"]
        sheetname = settings["MONGO_SHEETNAME"]
        print("host==", host)
        print("port==", port)
        print("dbname==", dbname)
        print("sheetname==", sheetname)
        # create the client
        self.client = pymongo.MongoClient(host=host, port=port)
        # get or create the database
        mydb = self.client[dbname]
        # get or create the collection
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        dict_item = dict(item)
        self.post.insert_one(dict_item)
        return item

    def close_spider(self, spider):
        print("======end======")
        self.client.close()


class DoubanPipeline(object):
    def __init__(self):
        print("=====start=====")
        self.file = open("movetop250.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        dict_json = dict(item)
        json_str = json.dumps(dict_json, ensure_ascii=False) + "\n"
        self.file.write(json_str)
        return item

    def close_spider(self, spider):
        print("======close_spider======")
        self.file.close()
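Once a crawl has run, the MongoDB side can be verified with a few lines of pymongo, matching the connection settings in the next step:

import pymongo

client = pymongo.MongoClient("127.0.0.1", 27017)
collection = client["douban"]["movetop250"]

print(collection.count_documents({}))  # expect 250 after a full crawl
for doc in collection.find().limit(3):
    print(doc["title"], doc["score"])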
5. settings.py

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

ROBOTSTXT_OBEY = False  # crawlers like this one usually don't obey robots.txt

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/67.0.3396.99 Safari/537.36',
}

# lower number = higher priority, so the MongoDB pipeline runs before the JSON one
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 301,
    'douban.pipelines.DoubanMongodbPipeline': 300,
}

# logging
LOG_FILE = "douban.log"
LOG_LEVEL = "DEBUG"

# user agent
USER_AGENT = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
              " Chrome/67.0.3396.99 Safari/537.36")

# MongoDB connection info
# mongo host
MONGO_HOST = "127.0.0.1"
# mongo port
MONGO_PORT = 27017
# database the data is stored in
MONGO_DBNAME = "douban"
# collection the data is stored in
MONGO_SHEETNAME = "movetop250"

# Disable cookies (enabled by default)
COOKIES_ENABLED = False
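As an aside, the JSON file written by DoubanPipeline can also be produced by Scrapy's built-in feed exports, with no custom pipeline at all:

scrapy crawl movetop250 -o movetop250.json -s FEED_EXPORT_ENCODING=utf-8

Note that -o with a .json extension writes a single JSON array; use a .jl extension instead to get one JSON object per line, which is the format the pipeline above produces.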