神箭手js爬取房天下

2019-12-20 11:34:45 浏览数 (1)

/*
 * Shenjianshou cloud crawler development; native JavaScript is supported.
 * Development tutorial: http://docs.shenjian.io/develop/crawler/doc/concept/crawler.html
 */
var configs = {
  domains: ["fang.com"],
  // Example seed URL, kept for reference:
  // scanUrls: ["https://xinshikongguojigongyu.fang.com/office/xiangqing/"],
  scanUrls: [],
  contentUrlRegexes: [/https:\/\/.*/], // content page URL regex
  helperUrlRegexes: [/https:\/\/.*/], // list page URL regex; may be left empty
  autoFindUrls: false,
  enableJS: true,

  // Selectors below are XPath (the default selector type).
  fields: [
    {
      // Building name
      name: "name",
      selector: "//span[@class='biaoti']"
    },
    {
      // District the building belongs to
      name: "area",
      selector: "//dl[@class='xiangqing']/dd[1]"
    },
    {
      // Building address
      name: "address",
      selector: "//dl[@class='xiangqing']/dd[2]/span"
    },
    {
      // Property type
      name: "property_type",
      selector: "//dl[@class='xiangqing']/dd[4]"
    },
    {
      // Office building grade
      name: "level",
      selector: "//dl[@class='xiangqing']/dd[5]"
    },
    {
      // Completion date
      name: "mtime",
      selector: "//dl[@class='xiangqing']/dd[9]"
    },
    {
      // Site (land) area
      name: "floor_area",
      selector: "//dl[@class='xiangqing']/dd[13]"
    },
    {
      // Built (floor) area
      name: "covered_area",
      selector: "//dl[@class='xiangqing']/dd[14]"
    },
    {
      // Baidu longitude (no selector; assigned in afterExtractPage)
      name: "longitude"
    },
    {
      // Baidu latitude (no selector; assigned in afterExtractPage)
      name: "latitude"
    },
    {
      // Gaode longitude (no selector; assigned in afterExtractPage)
      name: "gaode_lon"
    },
    {
      // Gaode latitude (no selector; assigned in afterExtractPage)
      name: "gaode_lat"
    }
  ]
};

/*
 * initCrawl callback: reads records from a data source in batches of 100 and
 * registers one scan URL per record, built from the record's d_url field with
 * "xiangqing/" appended.
 *
 * @param site  crawler site object; its async() and addScanUrl() are used here
 */
configs.initCrawl = function(site) {
  var sourceId = 11164939; // Replace this with the ID of your own data source
  var query = 'source{}';
  var src = shenjian.readSource(sourceId, query);

  site.async(function(src) {
    var infos = src.nextBatch(100);
    // Keep pulling batches until nextBatch returns a falsy value.
    while (infos) {
      for (var i = 0; i < infos.length; i++) {
        // d_url is concatenated as-is; presumably it already ends with "/" — verify against the data source.
        var url = infos[i].d_url + "xiangqing/";
        site.addScanUrl(url);
      }
      infos = src.nextBatch(100);
    }
  }, src);
};

/*
 * afterDownloadPage callback: finds the map iframe on the downloaded page,
 * fetches the iframe document, extracts its first <script> under <body>, and
 * stores that (JSON-encoded) on page.contextData for afterExtractPage to read.
 *
 * @param page  downloaded page; page.raw is read and page.contextData is written
 * @param site  crawler site object; its requestUrl() is used here
 * @returns the same page object
 */
configs.afterDownloadPage = function(page, site) {
  var mapSrc = extract(page.raw, "//div[@class='blmapbox']/iframe/@src");

  // No map iframe on this page: store a JSON null instead of requesting "https:null".
  if (!mapSrc) {
    page.contextData = JSON.stringify(null);
    return page;
  }

  // NOTE(review): this object is passed directly as the second argument, exactly
  // as in the original call. Confirm whether requestUrl expects it wrapped as
  // { headers: {...} } instead.
  var requestHeaders = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3941.4 Safari/537.36"
  };

  // The iframe src is prefixed with the scheme, i.e. it is assumed to be protocol-relative ("//host/path").
  var pageData = site.requestUrl("https:" + mapSrc, requestHeaders);

  var pos = extract(pageData, "/html/body/script[1]");

  page.contextData = JSON.stringify(pos);
  return page;
};

/*
 * afterExtractField callback: post-processes each extracted field value.
 *
 * For the labelled fields below, the first occurrence of the field's label
 * prefix is removed from the extracted text. Every other field, and any value
 * that is not a string (e.g. when nothing was extracted), is returned unchanged.
 *
 * @param fieldName  name of the field being processed
 * @param data       extracted value for that field
 * @param page       current page (unused)
 * @param site       crawler site object (unused)
 * @returns the cleaned value
 */
configs.afterExtractField = function(fieldName, data, page, site) {
  // Label text to strip, keyed by field name.
  var labelByField = {
    area: "所属区域:",
    property_type: "物业类别:",
    level: "写字楼等级:",
    mtime: "竣工时间:",
    floor_area: "占地面积:",
    covered_area: "建筑面积:"
  };

  // Guard: calling .replace on a non-string value would throw.
  if (typeof data !== "string") {
    return data;
  }

  if (!Object.prototype.hasOwnProperty.call(labelByField, fieldName)) {
    return data;
  }

  return data.replace(labelByField[fieldName], "");
};

/*
 * afterExtractPage callback: fills the coordinate fields of the extracted record.
 *
 * Reads the map script text stored on page.contextData, pulls two coordinate
 * tokens out of it, stores them as the Baidu longitude/latitude, and converts
 * them to Gaode coordinates (the usual BD-09 -> GCJ-02 approximation).
 * When either coordinate is missing or not numeric, the Gaode fields are set to 0.
 *
 * @param page  current page; page.contextData is read
 * @param data  extracted record; longitude, latitude, gaode_lon, gaode_lat are written
 * @param site  crawler site object (unused)
 * @returns the same data object
 */
configs.afterExtractPage = function(page, data, site) {
  // True when a token is actually present (not undefined, null, or "").
  function hasValue(value) {
    return typeof value !== "undefined" && value !== null && value !== "";
  }

  // Converts a Baidu coordinate pair to a Gaode coordinate pair.
  function baiduToGaode(lon, lat) {
    var x_pi = 3.14159265358979324 * 3000.0 / 180.0;
    var x = lon - 0.0065;
    var y = lat - 0.006;
    var z = Math.sqrt(x * x + y * y) - 0.00002 * Math.sin(y * x_pi);
    var theta = Math.atan2(y, x) - 0.000003 * Math.cos(x * x_pi);
    return { lon: z * Math.cos(theta), lat: z * Math.sin(theta) };
  }

  var jw = page.contextData ? JSON.parse(page.contextData) : null;

  // NOTE(review): the pattern in the published listing read /[px:]d(.) /g and
  // appears to have lost characters; /[px:]\d(.)+/g is a best-guess restoration.
  // Verify it against the actual script text of the map page.
  var matches = typeof jw === "string" ? jw.match(/[px:]\d(.)+/g) : null;

  // The match list is serialised and split on double quotes; the pieces at
  // index 1 and 3 are taken as longitude and latitude.
  var s = JSON.stringify(matches).replace('[', '').split('\"');

  data.longitude = s[1];
  data.latitude = s[3];

  var baiduLon = hasValue(s[1]) ? Number(s[1]) : NaN;
  var baiduLat = hasValue(s[3]) ? Number(s[3]) : NaN;

  // Missing or non-numeric coordinates: fall back to 0 rather than emitting NaN.
  if (!isFinite(baiduLon) || !isFinite(baiduLat)) {
    data.gaode_lon = 0;
    data.gaode_lat = 0;
    return data;
  }

  var gaode = baiduToGaode(baiduLon, baiduLat);
  data.gaode_lon = gaode.lon;
  data.gaode_lat = gaode.lat;
  return data;
};

// Build the crawler from the configuration above and launch it.
var crawler = new Crawler(configs);
crawler.start();

0 人点赞