今天使用 R 爬取数据时发现一个奇怪的问题:我先将每个属性的数据保存在 vector 中,再合并到 data.frame 中。此时打印 names 向量可以正常显示中文,但打印 data.frame 或将其写入 csv 文件时,中文却始终显示为 UTF-8 编码的形式。代码如下:
代码语言:R
library('rvest')
# Site root; relative hrefs scraped from the listing pages are appended to it.
base_url <- "http://www.yanglao.com.cn"
# Address of the first Shanghai listing page.
url <- "http://www.yanglao.com.cn/shanghai"
#' Read two fields from an institution's detail page.
#'
#' Downloads `url`, takes the second `div.cont` block on the page and returns
#' the text of its second and third `li` items (named orgType and orgProp by
#' the caller).
#'
#' @param url URL of a single detail page.
#' @return An unnamed list of two length-one character vectors, in the order
#'   (orgType, orgProp). A field that cannot be found on the page is returned
#'   as `NA_character_` instead of raising "subscript out of bounds", so one
#'   malformed page does not abort a long crawl.
getInfo <- function(url) {
  webpage <- read_html(url)
  blocks <- html_nodes(webpage, "div.cont")
  # The fields live in the second "div.cont" block; without it nothing can
  # be extracted.
  if (length(blocks) < 2) {
    return(list(NA_character_, NA_character_))
  }
  items <- html_nodes(blocks[[2]], "li")
  # Text of the i-th <li>, or NA when the list is shorter than expected.
  item_text <- function(i) {
    if (length(items) >= i) html_text(items[[i]]) else NA_character_
  }
  list(item_text(2), item_text(3))
}
# Crawl every listing page, follow each entry to its detail page, and collect
# the scraped fields into `df`.
#
# Relies on `base_url` and `getInfo()` defined earlier in this file.

# Number of listing pages to walk (hard-coded upper bound of the crawl).
n_pages <- 61L
listing_url <- "http://www.yanglao.com.cn/shanghai"

# Extract the fields of one "li.rest-item" node as a named character vector.
scrape_item <- function(item) {
  lis <- html_nodes(item, "li")
  # The second <a> in the entry supplies both the title text and the href
  # of the detail page.
  target <- html_nodes(item, "a")[[2]]
  info <- getInfo(paste0(base_url, html_attr(target, "href")))
  c(
    names = html_text(target),
    addresses = html_text(lis[[1]]),
    prices = html_text(lis[[3]]),
    beds = html_text(lis[[2]]),
    orgTypes = info[[1]],
    orgProps = info[[2]]
  )
}

# Preallocate per-page storage instead of growing six vectors with c() on
# every iteration (which copies the whole vector each time).
pages <- vector("list", n_pages)
for (i in seq_len(n_pages)) {
  # Page 1 has no suffix; later pages are "<listing>_<i>".
  page_url <- if (i == 1L) listing_url else paste0(listing_url, "_", i)
  webpage <- read_html(page_url)
  uls <- html_nodes(webpage, "ul.rest-items")
  links <- html_nodes(uls[[1]], "li.rest-item")
  rows <- vector("list", length(links))
  for (j in seq_along(links)) {
    rows[[j]] <- scrape_item(links[[j]])
  }
  pages[[i]] <- rows
}

# Flatten the per-page lists into one list of records, in crawl order.
records <- unlist(pages, recursive = FALSE)

# Pull one field out of every record as a plain character vector.
field <- function(key) {
  vapply(records, function(r) r[[key]], character(1), USE.NAMES = FALSE)
}

# The variable names are kept because data.frame() derives its column names
# from them.
names <- field("names")
addresses <- field("addresses")
prices <- field("prices")
beds <- field("beds")
orgTypes <- field("orgTypes")
orgProps <- field("orgProps")

# stringsAsFactors = FALSE keeps the scraped text as character columns on
# every R version (the default was TRUE before R 4.0).
df <- data.frame(
  names, addresses, prices, beds, orgTypes, orgProps,
  stringsAsFactors = FALSE
)
解决办法:
代码语言:R
Sys.setlocale(locale='Chinese')