需求:爬取投诉帖子的名称、帖子的url、帖子的标题,和帖子里的内容。
1.规则爬虫--scrapy genspider -t crawl Question wz.sun0769.com
**Question .py
代码语言:javascript复制import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from Dongguan.items import DongguanItem
class QuestionSpider(CrawlSpider):
name = 'Question'
allowed_domains = ['wz.sun0769.com']
start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']
rules = (
# Rule规则里面如果没有写Rule,默认是深度爬取
# 所以帖子的页面的数据
Rule(LinkExtractor(allow=r'type=4'), follow=True), # 下一页的匹配
Rule(LinkExtractor(allow=r'question/d /d .shtml'), process_links="handle_links", callback='parse_item',
follow=True),
)
# 把有错误的链接,可以修改过来,再去请求
def handle_links(self, links):
for link in links:
print("link====", link)
return links
# 帖子的详细信息
def parse_item(self, response):
item = DongguanItem()
# 帖子链接
url = response.url
title_number = response.xpath('//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
if len(title_number) > 0:
title_number = title_number[0]
# 编号:191166
# 帖子的编号
number = title_number.split("xa0xa0")[1]
number = number.split(":")[1]
# 帖子标题
title = title_number.split("xa0xa0")[0]
title = title.split(":")[1]
item["title"] = title
item["number"] = number
content = response.xpath('//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
# 把列表使用“”链接变成字符串
content = "".join(content).strip()
item["url"] = url
item["content"] = content
yield item
2.Spider版爬虫---scrapy genspider Question2 wz.sun0769.com
**Question2 .py
代码语言:javascript复制import scrapy
from Dongguan.items import DongguanItem
class Question2Spider(scrapy.Spider):
name = 'Question2'
allowed_domains = ['wz.sun0769.com']
# 偏移
offset = 0
url = "http://wz.sun0769.com/index.php/question/questionType?type=4&page="
start_urls = [url str(offset)]
# 就是帖子具体的内容了
def process_item(self, response):
item = DongguanItem()
# 帖子链接
url = response.url
title_number = response.xpath('//div[@class="pagecenter p3"]/div/div/div/strong/text()').extract()
if len(title_number) > 0:
title_number = title_number[0]
# 编号:191166
# 帖子的编号
number = title_number.split("xa0xa0")[1]
number = number.split(":")[1]
# 帖子标题
title = title_number.split("xa0xa0")[0]
title = title.split(":")[1]
item["title"] = title
item["number"] = number
content = response.xpath('//div[@class="c1 text14_2"]/text()|//div[@class="contentext"]/text()').extract()
# 把列表使用“”链接变成字符串
content = "".join(content).strip()
item["url"] = url
item["content"] = content
yield item
def parse(self, response):
# 得到某一页的所以的帖子的链接
current_page_link = response.xpath('//a[@class="news14"]/@href').extract()
print(current_page_link)
for link in current_page_link:
# 添加具体的帖子链接,让其帮我请求
yield scrapy.Request(link, callback=self.process_item)
# 拼接下一页
if self.offset < 93630:
self.offset = 30
# 下一页的链接
new_url = self.url str(self.offset)
yield scrapy.Request(new_url, callback=self.parse)
3.CrawlSpider(规则爬虫)和Spider版爬虫通用的----pipelines.py
代码语言:javascript复制import json
class DongguanPipeline(object):
def open_spider(self, spider):
# 创建文件
self.file = open(spider.name ".json", "w", encoding="utf-8")
def process_item(self, item, spider):
# python字典
python_dict = dict(item)
# python的str
python_str = json.dumps(python_dict, ensure_ascii=False) "n"
self.file.write(python_str)
return item
def close_spider(self, spider):
self.file.close()
4.CrawlSpider(规则爬虫)和Spider版爬虫通用的----item.py
代码语言:javascript复制import scrapy
class DongguanItem(scrapy.Item):
# define the fields for your item here like:
# 每个帖子的标题
title = scrapy.Field()
# 每个帖子的编号
number = scrapy.Field()
# 每个帖子的内容
content = scrapy.Field()
# 每个帖子的链接
url = scrapy.Field()
5.CrawlSpider(规则爬虫)和Spider版爬虫通用的----settings.py
代码语言:javascript复制# 爬虫的协议
ROBOTSTXT_OBEY = False
代码语言:javascript复制ITEM_PIPELINES = {
'Dongguan.pipelines.DongguanPipeline': 300,
}
# 设置日志
LOG_FILE = "dongguan.log"
LOG_LEVEL = "DEBUG"
# 设置用户代理
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"