基于 nodejs 爬取大学城用户信息
nodejs cheerio request-promise mongoose
安装依赖
yarn add cheerio
yarn add request-promise
yarn add mongoose
配置数据库
config.js
// config.js
module.exports = {
mongoUrl: 'mongodb://admin:admin@mongodb.com:27017/db',
}
connection.js
// connection.js
const mongoose = require('mongoose');
const DB_URL = require("./config").mongoUrl;
// Options passed to mongoose.connect for this connection.
const connectOptions = {
  useNewUrlParser: true,
  useUnifiedTopology: true,
  useFindAndModify: false
};

// Logs a confirmation once the connection promise resolves.
function onConnected() {
  console.log("we are connected");
}

// Logs whatever error the connection attempt rejected with.
function onConnectError(error) {
  console.log(error);
}

// Connect as a side effect of requiring this module.
mongoose
  .connect(DB_URL, connectOptions)
  .then(onConnected)
  .catch(onConnectError);
创建schemas
const mongoose = require('mongoose');
// Field definitions for one scraped user profile, written with Mongoose's
// shorthand form (`field: Type` is equivalent to `field: { type: Type }`).
const itemFields = {
  id: Number,
  name: String,
  head: String,
  city: String,
  org: String,
  email: String,
  // Arrays of free-form objects.
  info: [Object],
  hobby: [Object]
};

const ItemSchema = new mongoose.Schema(itemFields);

// Export the compiled model under the name 'Item'.
module.exports = mongoose.model('Item', ItemSchema);
写入控制器
require('../utils/connection');
const Item = require('../schemas/item');
/**
 * Persists scraped items through the `Item` model.
 */
class ItemController {
  /**
   * Save one item and log the outcome.
   *
   * Failures are logged (together with the underlying error) rather than
   * thrown, so the returned promise always fulfils.
   *
   * @param {object} item - plain object with the scraped fields; `item.id` is used in the log line.
   * @returns {Promise<void>}
   */
  async saveItem(item) {
    try {
      // Constructing the document inside the try keeps any construction
      // error on the same logged path as a failed save.
      const doc = new Item(item);
      await doc.save();
      console.log(`√ 写入成功: ${item.id}`);
    } catch (err) {
      console.log(`× 写入失败: ${item.id}`, err);
    }
  }
}
module.exports = new ItemController();
引入依赖
var request = require('request');
var rp = require('request-promise');
var cheerio = require('cheerio');
var ctrl = require('./controllers/item');
抓取回调
/**
 * Split a "label:value" string at its FIRST colon into a single-key object.
 * Everything after the first colon is kept, so values that themselves contain
 * colons (URLs, times) are not truncated. Text without a colon becomes a key
 * whose value is undefined.
 *
 * @param {string} text
 * @returns {object} `{ [label]: value }`
 */
function splitLabelValue(text) {
  const cut = text.indexOf(':');
  const pair = {};
  if (cut === -1) {
    pair[text] = undefined;
  } else {
    pair[text.slice(0, cut)] = text.slice(cut + 1);
  }
  return pair;
}

/**
 * Build one label/value object per element matched by `selector`.
 *
 * @param {Function} $ - loaded cheerio instance.
 * @param {string} selector
 * @returns {object[]}
 */
function collectPairs($, selector) {
  const pairs = [];
  $(selector).each(function () {
    pairs.push(splitLabelValue($(this).text()));
  });
  return pairs;
}

/**
 * Parse a user-info page and hand the extracted item to `ctrl.saveItem`.
 *
 * Logs 'net error' when `body` is empty and 'body error' (with the error
 * message) when extraction throws, e.g. because the profile link is missing.
 *
 * @param {string} body - HTML of the user-info page.
 */
function foramt(body) {
  if (!body) {
    console.log('net error');
    return;
  }
  try {
    const $ = cheerio.load(body);
    const item = { info: [], hobby: [] };
    // The id is whatever digits appear in the profile link's href.
    item.id = $('div#wrapper_left > div.tc.f16.fb > a').attr('href').replace(/[^0-9]/g, '');
    item.name = $('div#wrapper_left .tc.f16.fb').text().trim();
    item.head = $('div#wrapper_left .tc.mt5.p10 a img').attr('src');
    item.city = $('div#wrapper_left ul:nth-child(4) li:nth-child(2) span:nth-child(2)').text();
    item.org = $('div#wrapper_left ul:nth-child(4) li:nth-child(1) a').text();
    item.email = $('div#wrapper_left ul:nth-child(4) li:nth-child(3) span:nth-child(2)').text();
    item.info = collectPairs($, 'div#ctl00_ContentPlaceHolderMain_UserInfoDiv ul li');
    item.hobby = collectPairs($, 'div#ctl00_ContentPlaceHolderMain_HobbyInfo ul li');
    // Persist the item; the call is not awaited here.
    ctrl.saveItem(item);
  } catch (error) {
    console.log('body error', error && error.message);
  }
}
抓取主体
/**
 * Fetch one user's info page and pass the response body to `foramt`.
 *
 * The request promise is returned so callers can wait for completion or
 * limit concurrency. Request errors are logged, so the returned promise
 * does not reject on a failed request.
 *
 * @param {number|string} id - user id placed in the `uid` query parameter.
 * @returns {Promise<void>}
 */
function sp(id) {
  const options = {
    uri: `http://worlduc.com/SpaceShow/UserInfo.aspx?uid=${id}`,
    method: "GET",
    headers: {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
      "Content-Type": "application/x-www-form-urlencoded",
      // NOTE(review): hard-coded session cookie; presumably copied from a
      // logged-in browser session and must be replaced with a valid one.
      // The token contains spaces where '+' characters may have been lost. TODO confirm.
      "Cookie": "ASP.NET_SessionId=a3k5sfxv0avi5t1oqd2s34ss; BIGipServerweb_80=386076844.0.0000; WorldUC_ClientIdentity=e30f608aadb24ec39aafe0c127169f38; SnsUserToken=token=ZlGRKai3W1An1qZR0CfLrwGfLyF35Ku7J7iV00e250hHwMM8GZom B LSPx7V12Hxt34BJq0UMY=&headpic=201879105748rCNFB.jpg"
    }
  };
  return rp(options)
    .then(function (res) {
      foramt(res);
    })
    .catch(function (err) {
      console.log(err);
    });
}
执行主体
// Crawl every uid in the inclusive range 3391800..3391874.
// Each sp(i) call is started without waiting for the previous one.
for (let i = 3391800; i <= 3391874; i++) {
  sp(i);
}
执行
node index.js
丢到服务器去跑
因为数据很多,最大约 3424260 条,所以丢到服务器去跑。
安装nodejs
# Install wget
yum install wget -y

# Option A: build from source.
# Download the SOURCE tarball (the linux-x64 tarballs are prebuilt binaries
# and contain no ./configure script).
wget https://nodejs.org/dist/v13.12.0/node-v13.12.0.tar.gz
# Extract
tar -xzvf node-v13.12.0.tar.gz
# Install build dependencies
sudo yum install -y gcc gcc-c++ make
# Configure, compile and install
cd node-v13.12.0
./configure
make
sudo make install
# Check versions
node -v
npm -v

# ---------------------------------------------------------------------
# Option B: install the prebuilt binaries instead.
wget https://nodejs.org/dist/v13.12.0/node-v13.12.0-linux-x64.tar.gz
# Extract directly into /usr/local
sudo tar --strip-components 1 -xzvf node-v13.12.0-linux-x64.tar.gz -C /usr/local
node --version   # => v13.12.0
ftp上传代码
运行
yarn
node index.js
DONE !