Crawlab是一个功能强大的网络爬虫管理平台(WCMP),可以运行以各种编程语

2023-12-02 11:23:44 浏览数 (1)

Crawlab是一个功能强大的网络爬虫管理平台(WCMP),可以运行以各种编程语言开发的网络爬虫和爬虫,包括Python,Go,Node.js,Java,C#以及包括Scrapy,Colly,Selenium,Puppeteer在内的框架。它用于运行、管理和监控网络爬虫,特别是在可追溯性、可扩展性和稳定性是需要关注的主要因素的生产环境中。

代码语言:javascript复制
version: '3.3'
services:
  master:
    image: crawlabteam/crawlab
    container_name: crawlab_master
    restart: always
    environment:
      CRAWLAB_NODE_MASTER: "Y"  # Y: 主节点
      CRAWLAB_MONGO_HOST: "mongo"  # mongo host address. 在 Docker-Compose 网络中,直接引用 service 名称
      CRAWLAB_MONGO_PORT: "27017"  # mongo port 
      CRAWLAB_MONGO_DB: "crawlab"  # mongo database 
      CRAWLAB_MONGO_USERNAME: "username"  # mongo username
      CRAWLAB_MONGO_PASSWORD: "password"  # mongo password 
      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # mongo auth source 
    volumes:
      - "/opt/.crawlab/master:/root/.crawlab"  # 持久化 crawlab 元数据
      - "/opt/crawlab/master:/data"  # 持久化 crawlab 数据
      - "/var/crawlab/log:/var/logs/crawlab" # 持久化 crawlab 任务日志
    ports:
      - "8080:8080"  # 开放 api 端口
    depends_on:
      - mongo

  mongo:
    image: mongo:4.2
    restart: always
    environment:
      MONGO_INITDB_ROOT_USERNAME: "username"  # mongo username
      MONGO_INITDB_ROOT_PASSWORD: "password"  # mongo password
    volumes:
      - "/opt/crawlab/mongo/data/db:/data/db"  # 持久化 mongo 数据
    ports:
      - "27017:27017"  # 开放 mongo 端口到宿主机

执行 docker-compose up -d 并在浏览器中导航至 http://<your_ip>:8080,然后开始使用 Crawlab。

代码语言:javascript复制
version: '3.3'
services:
  master:
    image: crawlabteam/crawlab
    container_name: crawlab_master
    restart: always
    environment:
      CRAWLAB_NODE_MASTER: "Y"  # Y: 主节点
      CRAWLAB_MONGO_HOST: "mongo"  # mongo host address. 在 Docker-Compose 网络中,直接引用 service 名称
      CRAWLAB_MONGO_PORT: "27017"  # mongo port 
      CRAWLAB_MONGO_DB: "crawlab"  # mongo database 
      CRAWLAB_MONGO_USERNAME: "username"  # mongo username
      CRAWLAB_MONGO_PASSWORD: "password"  # mongo password 
      CRAWLAB_MONGO_AUTHSOURCE: "admin"  # mongo auth source 
    volumes:
      - "/opt/.crawlab/master:/root/.crawlab"  # 持久化 crawlab 元数据
      - "/opt/crawlab/master:/data"  # 持久化 crawlab 数据
      - "/var/crawlab/log:/var/logs/crawlab" # 持久化 crawlab 任务日志
    ports:
      - "8080:8080"  # 开放 api 端口
      - "9666:9666"  # 开放 grpc 端口
    depends_on:
      - mongo

  mongo:
    image: mongo:4.2
    restart: always
    environment:
      MONGO_INITDB_ROOT_USERNAME: "username"  # mongo username
      MONGO_INITDB_ROOT_PASSWORD: "password"  # mongo password
    volumes:
      - "/opt/crawlab/mongo/data/db:/data/db"  # 持久化 mongo 数据
    ports:
      - "27017:27017"  # 开放 mongo 端口到宿主机

搭建工作节点

在每个 工作节点 中创建 docker-compose.yml,并输入如下内容。 然后执行 docker-compose up -d 以启动容器。

代码语言:javascript复制
version: '3.3'
services:
  worker:
    image: crawlabteam/crawlab
    container_name: crawlab_worker
    restart: always
    environment:
      CRAWLAB_NODE_MASTER: "N"  # N: 工作节点
      CRAWLAB_GRPC_ADDRESS: "<master_node_ip>:9666"  # grpc address
      CRAWLAB_FS_FILER_URL: "http://<master_node_ip>:8080/api/filer"  # seaweedfs api
    volumes:
      - "/opt/.crawlab/worker:/root/.crawlab"  # 持久化 crawlab 元数据
      - "/opt/crawlab/worker:/data"  # 持久化 crawlab 数据

请注意您需要将 <master_node_ip> 替换为主节点 IP 地址,并保证其能被工作节点访问。

主节点和工作节点都启动之后,您可以导航至 http://<master_node_ip>:8080 并开始使用 Crawlab.

代码语言:javascript复制
version: '3.3'
services:
  master:
    image: crawlabteam/crawlab
    container_name: crawlab_master
    restart: always
    environment:
      CRAWLAB_NODE_MASTER: "Y"  # Y: 主节点
      CRAWLAB_MONGO_URI: "<mongo_uri>"  # mongo uri (单独设置)
      CRAWLAB_MONGO_HOST: "<mongo_host>"  # mongo host address
      CRAWLAB_MONGO_PORT: "<mongo_port>"  # mongo port 
      CRAWLAB_MONGO_DB: "<mongo_db>"  # mongo database 
      CRAWLAB_MONGO_USERNAME: "<mongo_username>"  # mongo username
      CRAWLAB_MONGO_PASSWORD: "<mongo_password>"  # mongo password 
      CRAWLAB_MONGO_AUTHSOURCE: "<mongo_auth_source>"  # mongo auth source 
      CRAWLAB_MONGO_AUTHMECHANISM: "<mongo_auth_mechanism>"  # mongo auth mechanism 
      CRAWLAB_MONGO_AUTHMECHANISMPROPERTIES: "<mongo_auth_mechanism_properties>"  # mongo auth mechanism properties
    volumes:
      - "/opt/.crawlab/master:/root/.crawlab"  # 持久化 crawlab 元数据
      - "/opt/crawlab/master:/data"  # 持久化 crawlab 数据
      - "/var/crawlab/log:/var/logs/crawlab" # 持久化 crawlab 任务日志
    ports:
      - "8080:8080"  # 开放 api 端口
      - "9666:9666"  # 开放 grpc 端口

可以看到,服务 mongo 被移除了,MongoDB 连接相关的环境变量 (例如 CRAWLAB_MONGO_HOST, CRAWLAB_MONGO_PORT) 指向了外部 MongoDB。您可以将其中一些不需要设置的环境变量留空。

本文来自:https://docs.crawlab.cn/zh/guide/installation/docker.html#外部-mongodb

#

0 人点赞