Python 3 Web Crawler: Example 1

2020-01-03 10:53:27

Install Scrapy and its dependencies

pip install scrapy
pip install pyOpenSSL
pip install cryptography
pip install CFFI
pip install lxml
pip install cssselect
pip install Twisted

Create the crawler project

scrapy startproject ZhipinSpider

Generate the spider

scrapy genspider job_position "zhipin.com"


Directory structure:
items.py: defines the fields of the items to be scraped
pipelines.py: processes the scraped items
settings.py: project configuration
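For reference, the layout produced by scrapy startproject ZhipinSpider plus the genspider command looks roughly like this (file names may differ slightly between Scrapy versions):

ZhipinSpider/
    scrapy.cfg               # deployment configuration
    ZhipinSpider/
        __init__.py
        items.py             # item definitions
        middlewares.py       # spider / downloader middlewares
        pipelines.py         # item pipelines
        settings.py          # project settings
        spiders/
            __init__.py
            job_position.py  # generated by scrapy genspider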

First, debug the data extraction in the Scrapy shell

scrapy shell -s USER_AGENT="xx" https://www.zhipin.com/c101280100/h101280100/

The USER_AGENT setting makes Scrapy masquerade as a browser.

XPath syntax

/   matches from the root node
//  matches any descendant node
.   the current node
..  the parent node
@   selects an attribute
Example: //div[@title="xxx"]/div

extract() returns the content of the matched nodes.
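As a quick illustration inside the scrapy shell (the class names are taken from the spider code later in this article; the live page structure may have changed):

# extract() returns a list of strings, extract_first() the first match (or None)
response.xpath('//div[@class="job-primary"]//div[@class="job-title"]/text()').extract()
response.xpath('//div[@class="job-primary"]//div[@class="job-title"]/text()').extract_first()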


CSS selector matching

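A minimal sketch of the equivalent CSS-selector calls, assuming the same class names as in the XPath examples above:

# response.css() accepts CSS selectors; ::text and ::attr() extract content
response.css('div.job-primary div.job-title::text').extract_first()
response.css('div.info-primary h3 a::attr(href)').extract_first()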

items.py

import scrapy

class ZhipinspiderItem(scrapy.Item):
    # job title
    title = scrapy.Field()
    # salary
    salary = scrapy.Field()
    # hiring company
    company = scrapy.Field()
    # link to the job detail page
    url = scrapy.Field()
    # work location
    work_addr = scrapy.Field()
    # industry
    industry = scrapy.Field()
    # company size
    company_size = scrapy.Field()
    # recruiter
    recruiter = scrapy.Field()
    # publish date
    publish_date = scrapy.Field()

job_spider.py

import scrapy
from ZhipinSpider.items import ZhipinspiderItem

class JobPositionSpider(scrapy.Spider):
    # the name of this spider
    name = 'job_position'
    # the domains this spider is allowed to crawl
    allowed_domains = ['zhipin.com']
    # the list of start URLs this spider crawls first
    start_urls = ['https://www.zhipin.com/c101280100/h_101280100/']

    # this method extracts the information contained in the response
    # response is the downloader's response for each URL in start_urls
    def parse(self, response):
        # iterate over every //div[@class="job-primary"] node on the page
        for job_primary in response.xpath('//div[@class="job-primary"]'):
            item = ZhipinspiderItem()
            # match the ./div[@class="info-primary"] node under //div[@class="job-primary"],
            # i.e. the <div.../> element that holds the job information
            info_primary = job_primary.xpath('./div[@class="info-primary"]')
            item['title'] = info_primary.xpath('./h3/a/div[@class="job-title"]/text()').extract_first()
            item['salary'] = info_primary.xpath('./h3/a/span[@class="red"]/text()').extract_first()
            item['work_addr'] = info_primary.xpath('./p/text()').extract_first()
            item['url'] = info_primary.xpath('./h3/a/@href').extract_first()
            # match the ./div[@class="company-text"] node under ./div[@class="info-company"],
            # i.e. the <div.../> element that holds the company information
            company_text = job_primary.xpath('./div[@class="info-company"]' +
                '/div[@class="company-text"]')
            item['company'] = company_text.xpath('./h3/a/text()').extract_first()
            company_info = company_text.xpath('./p/text()').extract()
            if company_info and len(company_info) > 0:
                item['industry'] = company_info[0]
            if company_info and len(company_info) > 2:
                item['company_size'] = company_info[2]
            # match the ./div[@class="info-publis"] node under //div[@class="job-primary"],
            # i.e. the <div.../> element that holds the publisher information
            info_publis = job_primary.xpath('./div[@class="info-publis"]')
            item['recruiter'] = info_publis.xpath('./h3/text()').extract_first()
            item['publish_date'] = info_publis.xpath('./p/text()').extract_first()
            yield item

        # extract the link to the next page
        new_links = response.xpath('//div[@class="page"]/a[@class="next"]/@href').extract()
        if new_links and len(new_links) > 0:
            # get the next-page link
            new_link = new_links[0]
            # issue another request to fetch the next page
            yield scrapy.Request("https://www.zhipin.com" + new_link, callback=self.parse)

pipelines.py

class ZhipinspiderPipeline(object):
    def process_item(self, item, spider):
        print("Job:", item['title'])
        print("Salary:", item['salary'])
        print("Location:", item['work_addr'])
        print("Detail link:", item['url'])

        print("Company:", item['company'])
        print("Industry:", item['industry'])
        print("Company size:", item['company_size'])

        print("Recruiter:", item['recruiter'])
        print("Publish date:", item['publish_date'])
        # pass the item on to any later pipelines
        return item

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for ZhipinSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ZhipinSpider'

SPIDER_MODULES = ['ZhipinSpider.spiders']
NEWSPIDER_MODULE = 'ZhipinSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ZhipinSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# configure the default request headers
DEFAULT_REQUEST_HEADERS = {
    "User-Agent" : "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'ZhipinSpider.middlewares.ZhipinspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'ZhipinSpider.middlewares.ZhipinspiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# register the pipeline to use
ITEM_PIPELINES = {
    'ZhipinSpider.pipelines.ZhipinspiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Run the crawler

scrapy crawl job_position
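If you just want to inspect the results without writing a pipeline, Scrapy's built-in feed export can dump the items to a file, e.g.:

scrapy crawl job_position -o job_positions.json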


Store the data in a database: pipelines.py

# import the module used to access MySQL
import mysql.connector

class ZhipinspiderPipeline(object):
    # constructor: initialize the database connection
    def __init__(self):
        self.conn = mysql.connector.connect(user='root', password='32147',
            host='localhost', port='3306',
            database='python', use_unicode=True)
        self.cur = self.conn.cursor()
    # override the close_spider callback to release the database resources
    def close_spider(self, spider):
        print('---------- closing database resources -----------')
        # close the cursor
        self.cur.close()
        # close the connection
        self.conn.close()
    def process_item(self, item, spider):
        self.cur.execute("INSERT INTO job_inf VALUES(null, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (item['title'], item['salary'], item['company'], item['url'],
             item['work_addr'], item['industry'], item.get('company_size'),
             item['recruiter'], item['publish_date']))
        self.conn.commit()
        return item
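The pipeline above assumes a job_inf table already exists in the python database. A one-off helper like the sketch below can create it; the column names, order and types are guessed from the INSERT statement and should be adjusted to your needs:

# create the assumed job_inf table (schema guessed from the INSERT above)
import mysql.connector

conn = mysql.connector.connect(user='root', password='32147',
    host='localhost', port='3306', database='python', use_unicode=True)
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS job_inf (
        id INT PRIMARY KEY AUTO_INCREMENT,
        title VARCHAR(255),
        salary VARCHAR(255),
        company VARCHAR(255),
        url VARCHAR(500),
        work_addr VARCHAR(255),
        industry VARCHAR(255),
        company_size VARCHAR(255),
        recruiter VARCHAR(255),
        publish_date VARCHAR(255))
""")
conn.commit()
cur.close()
conn.close()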


Dealing with anti-crawler measures

Change the IP address: middlewares.py

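A minimal sketch of a downloader middleware that routes requests through a proxy; the proxy address is a placeholder, and the class name is illustrative:

# middlewares.py (sketch)
class RandomProxyMiddleware(object):
    # Scrapy calls this hook for every outgoing request;
    # setting request.meta['proxy'] makes the downloader use that proxy
    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://127.0.0.1:8888'  # placeholder proxy address

Remember to register it in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'ZhipinSpider.middlewares.RandomProxyMiddleware': 543}.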

Disable cookies: settings.py

COOKIES_ENABLED = False

Do not obey the robots.txt rules

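In settings.py this corresponds to:

ROBOTSTXT_OBEY = False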

Limit the request rate


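Rate limiting is normally done through the settings below; the values shown are the ones suggested in the default Scrapy template, not necessarily what the original post used:

DOWNLOAD_DELAY = 3            # wait 3 seconds between requests to the same site
AUTOTHROTTLE_ENABLED = True   # let Scrapy adapt the delay automatically
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60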

Logging in with Selenium
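A minimal sketch of driving a login form with Selenium; the URL, element ids and credentials are placeholders, and ChromeDriver must be installed:

from selenium import webdriver

driver = webdriver.Chrome()                 # requires ChromeDriver on PATH
driver.get('https://login.zhipin.com/')     # placeholder login page URL
# the element ids below are placeholders; inspect the real page to find them
driver.find_element_by_id('username').send_keys('your-account')
driver.find_element_by_id('password').send_keys('your-password')
driver.find_element_by_id('login-btn').click()
# the cookies obtained here can then be attached to subsequent Scrapy requests
cookies = driver.get_cookies()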
