基于 nodejs 爬取大学城用户信息

2022-04-25 16:47:52 浏览数 (1)

基于 nodejs 爬取大学城用户信息

技术栈：Node.js、cheerio、request-promise、mongoose

安装依赖

代码语言:javascript复制
yarn add cheerio
# request-promise declares request as a peer dependency, so it has to be installed explicitly
yarn add request
yarn add request-promise
yarn add mongoose

配置数据库

config.js
代码语言:javascript复制
// config.js
module.exports = {
  mongoUrl: 'mongodb://admin:admin@mongodb.com:27017/db',
}
connection.js
代码语言:javascript复制
// connection.js
const mongoose = require('mongoose');

const DB_URL = require("./config").mongoUrl;

// Options handed to mongoose.connect() unchanged.
const connectOptions = {
  useNewUrlParser: true,
  useUnifiedTopology: true,
  useFindAndModify: false,
};

// Requiring this module opens the connection as a side effect; the outcome
// is only logged, nothing is exported.
mongoose
  .connect(DB_URL, connectOptions)
  .then(function onConnected() {
    console.log("we are connected");
  })
  .catch(function onConnectFailed(error) {
    console.log(error);
  });

创建schemas

代码语言:javascript复制
const mongoose = require('mongoose');

/**
 * Field definitions for one stored item.
 *
 * id is a Number; name, head, city, org and email are Strings;
 * info and hobby are arrays of plain objects.
 */
const itemFields = {
  id: Number,
  name: String,
  head: String,
  city: String,
  org: String,
  email: String,
  info: [Object],
  hobby: [Object],
};

const ItemSchema = new mongoose.Schema(itemFields);

// Register the schema under the model name 'Item' and export the model.
module.exports = mongoose.model('Item', ItemSchema);

写入控制器

代码语言:javascript复制
require('../utils/connection');
const Item = require('../schemas/item');

/**
 * Writes records to the database through the `Item` model.
 */
class ItemController {
  /**
   * Save one record and log the outcome.
   *
   * A failed `save()` is logged (with the reason) instead of being thrown,
   * so one bad record does not stop the caller.
   *
   * @param {Object} item - Fields to store; `item.id` is echoed in the log line.
   * @returns {Promise<void>}
   */
  async saveItem(item) {
    const doc = new Item(item);
    try {
      await doc.save();
      console.log(`√ 写入成功: ${item.id}`);
    } catch (err) {
      // Report why the write failed rather than discarding the error.
      console.log(`× 写入失败: ${item.id}`, err && err.message ? err.message : err);
    }
  }
}

// Export a single ItemController instance (not the class), shared by every module that requires this file.
module.exports = new ItemController();

引入依赖

代码语言:javascript复制
var request = require('request');
var rp = require('request-promise');
var cheerio = require('cheerio');
var ctrl = require('./controllers/item');

抓取回调

代码语言:javascript复制
/**
 * Collect the `label:value` list items matched by `selector`.
 *
 * @param {Function} $ - The function returned by `cheerio.load`.
 * @param {string} selector - CSS selector for the list items to read.
 * @returns {Object[]} One single-key object per matched item, `{ [label]: value }`.
 *   `value` is `undefined` when the item's text contains no colon.
 */
function parseLabelledItems($, selector) {
  const pairs = [];
  $(selector).each(function () {
    const text = $(this).text();
    // Split on the first colon only, so values that themselves contain ':'
    // (URLs, times) are kept whole instead of being cut at the second colon.
    const sep = text.indexOf(':');
    const label = sep === -1 ? text : text.slice(0, sep);
    const value = sep === -1 ? undefined : text.slice(sep + 1);
    pairs.push({ [label]: value });
  });
  return pairs;
}

/**
 * Parse one profile page and pass the extracted record to `ctrl.saveItem`.
 *
 * The name is misspelled ("foramt") but kept as is because `sp` calls it.
 *
 * @param {string} body - HTML of the page. A falsy value is reported as
 *   'net error' and nothing is parsed.
 * @returns {void} Parse failures are logged as 'body error' and not thrown.
 */
function foramt(body) {
  if (!body) {
    console.log('net error');
    return;
  }

  try {
    const $ = cheerio.load(body);
    const item = {
      // If the link is missing, .attr() yields undefined and .replace throws;
      // that lands in the catch below, so a page without an id is never saved.
      id: $('div#wrapper_left > div.tc.f16.fb > a').attr('href').replace(/\D/g, ''),
      name: $('div#wrapper_left .tc.f16.fb').text().trim(),
      head: $('div#wrapper_left .tc.mt5.p10 a img').attr('src'),
      city: $('div#wrapper_left ul:nth-child(4) li:nth-child(2) span:nth-child(2)').text(),
      org: $('div#wrapper_left ul:nth-child(4) li:nth-child(1) a').text(),
      email: $('div#wrapper_left ul:nth-child(4) li:nth-child(3) span:nth-child(2)').text(),
      info: parseLabelledItems($, 'div#ctl00_ContentPlaceHolderMain_UserInfoDiv ul li'),
      hobby: parseLabelledItems($, 'div#ctl00_ContentPlaceHolderMain_HobbyInfo ul li'),
    };

    // Store the record.
    ctrl.saveItem(item);
  } catch (error) {
    // Include the reason so a selector mismatch can be told apart from other failures.
    console.log('body error', error && error.message ? error.message : error);
  }
}

抓取主体

代码语言:javascript复制
/**
 * Request one user's info page and hand the response body to `foramt`.
 *
 * @param {number|string} id - Value substituted for `uid` in the page URL.
 * @returns {Promise<void>} Settles once the body has been passed to `foramt`
 *   or the request error has been logged. Request errors are logged, not
 *   rethrown. Callers that ignore the return value behave as before.
 */
function sp(id) {
  const options = {
    uri: `http://worlduc.com/SpaceShow/UserInfo.aspx?uid=${id}`,
    method: "GET",
    headers: {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
      "Content-Type": "application/x-www-form-urlencoded",
      // NOTE(review): hard-coded session cookie; presumably it has to be replaced with a current one — confirm before running.
      "Cookie": "ASP.NET_SessionId=a3k5sfxv0avi5t1oqd2s34ss; BIGipServerweb_80=386076844.0.0000; WorldUC_ClientIdentity=e30f608aadb24ec39aafe0c127169f38; SnsUserToken=token=ZlGRKai3W1An1qZR0CfLrwGfLyF35Ku7J7iV00e250hHwMM8GZom B LSPx7V12Hxt34BJq0UMY=&headpic=201879105748rCNFB.jpg"
    }
  };

  // Return the chain so a caller can await it to sequence or throttle requests.
  return rp(options)
    .then((html) => {
      foramt(html);
    })
    .catch((err) => {
      console.log(err);
    });
}

执行主体

代码语言:javascript复制
// Request every id from 3391800 to 3391874 inclusive; the loop does not wait
// for one request to finish before starting the next.
for (let i = 3391800; i <= 3391874; i++) {
  sp(i);
}

执行

代码语言:javascript复制
node index.js

丢到服务器去跑

因为数据量很大（最多约 3424260 条），所以放到服务器上运行。

安装nodejs
代码语言:javascript复制
# Option A: build Node.js from source

# Install wget
yum install wget -y

# Download the Node.js source tarball
# (the linux-x64 archive is prebuilt and contains no ./configure script)
wget https://nodejs.org/dist/v13.12.0/node-v13.12.0.tar.gz

# Unpack it
tar -xzvf node-v13.12.0.tar.gz

# Install the compilers
sudo yum install gcc gcc-c++

# Configure, compile and install
cd node-v13.12.0
./configure
make
sudo make install

# Check the installed versions
node -v
npm -v

# ---------------------------------------------------------------------

# Option B: install the prebuilt binaries instead
wget https://nodejs.org/dist/v13.12.0/node-v13.12.0-linux-x64.tar.gz

# Unpack straight into /usr/local
sudo tar --strip-components 1 -xzvf node-v13.12.0-linux-x64.tar.gz -C /usr/local

node --version   # => v13.12.0
通过 FTP 上传代码

运行
代码语言:javascript复制
yarn
node index.js

DONE !

0 人点赞