原文地址
Land Temperature Change in the Continental US 1850-2013
读入数据
- 这一步学到一个新函数
tidyr::separate()
作用是根据指定分隔符拆分列 通过函数help(package="tidyr")
查看separate()
函数的帮助文档里面的实例
> df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
> df %>% separate(x, c("A", "B"))
A B
1 <NA> <NA>
2 a b
3 a d
4 b c
原文的用法是把时间数据拆分成年月日,比如 1999-11-11 拆分成三列,分别是年月日
- 代码
library(dplyr)
library(tidyr)
library(ggplot2)
# Load the by-state land temperature table.
# NOTE(review): the path is relative to the original author's working
# directory; point it at wherever the Kaggle CSV is stored.
df <- read.csv("../../../Desktop/Data_analysis_practice/Kaggle/Climate_Change/GlobalLandTemperaturesByState.csv")

# Keep the United States rows and split the `dt` date column into three
# columns (Year, Month, Day); convert = TRUE turns the pieces into numbers.
df1 <- df %>%
  filter(Country == "United States") %>%
  separate(col = dt, into = c("Year", "Month", "Day"), convert = TRUE)

# Drop every row that contains a missing value.
df1 <- na.omit(df1)

# Remove Hawaii and Alaska. df1 is already free of NAs, so no second
# na.omit() pass is needed after this filter.
df2 <- df1 %>%
  filter(State != "Hawaii" & State != "Alaska")

# Mean temperature per year across the remaining states.
# NOTE(review): `Year > 1850` is strict, so 1850 itself is excluded even
# though the plot title reads "1850-2013" -- confirm this is intended.
df3 <- df2 %>%
  filter(Year > 1850) %>%
  group_by(Year) %>%
  summarise(Temp = mean(AverageTemperature))
得到下面分析用到的数据集df3
数据可视化
- 美国1850到2013年间的年平均温度
# Scatter plot of the yearly mean temperature, coloured from blue (low) to
# red (high), with a smoothed trend line on top.
# The layers have to be joined with `+`; written as separate statements they
# are evaluated on their own and never added to the plot.
ggplot(df3, aes(x = Year, y = Temp)) +
  geom_point(aes(color = Temp)) +
  geom_smooth() +
  scale_color_gradient(low = "blue", high = "red") +
  theme_bw() +
  labs(title = "US Average Temperature 1850-2013")
image.png
- 箱线图展示美国各个州年平均温度(时间间隔为40年)
# Mean temperature per state for five selected years.
df4 <- df2 %>%
  filter(Year %in% c(1850, 1890, 1930, 1970, 2013)) %>%
  group_by(State, Year) %>%
  summarise(Temp = mean(AverageTemperature)) %>%
  ungroup() # summarise() only peels off the last grouping level (Year)
head(df4)

# Treat Year as a categorical variable so each year gets its own box
# (the ANOVA further down also relies on Year being a factor).
df4$Year <- factor(df4$Year)

# One box per selected year showing the spread of the state means.
# The layers have to be joined with `+`; as separate statements they would
# never be added to the plot.
ggplot(df4, aes(x = Year, y = Temp, group = Year)) +
  geom_boxplot(aes(fill = Year)) +
  theme_bw() +
  ggtitle("Average Temperature for 40 Year Intervals") +
  labs(y = "Average Temperature")
image.png
- 方差分析探究这5个年份(1850、1890、1930、1970、2013)之间的年平均温度是否有差异
# One-way ANOVA: does the state-level mean temperature differ between the
# selected years? The fitted model is stored as `c` because the Tukey
# plotting code further down reads it under that name.
c <- aov(formula = Temp ~ Year, data = df4)

# Overall F test.
summary(c)

# Pairwise comparisons between years (Tukey honest significant differences).
TukeyHSD(c)
###
> summary(c)
Df Sum Sq Mean Sq F value Pr(>F)
Year 4 218 54.52 2.807 0.0264 *
Residuals 240 4662 19.43
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> TukeyHSD(c)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Temp ~ Year, data = df4)
$`Year`
diff lwr upr p adj
1890-1850 0.415243197 -2.0323195 2.862806 0.9902477
1930-1850 0.607991497 -1.8395712 3.055554 0.9600409
1970-1850 0.420858844 -2.0267039 2.868422 0.9897377
2013-1850 2.666743197 0.2191805 5.114306 0.0250385
1930-1890 0.192748299 -2.2548144 2.640311 0.9995095
1970-1890 0.005615646 -2.4419471 2.453178 1.0000000
2013-1890 2.251500000 -0.1960627 4.699063 0.0877744
1970-1930 -0.187132653 -2.6346954 2.260430 0.9995637
2013-1930 2.058751701 -0.3888110 4.506314 0.1446209
2013-1970 2.245884354 -0.2016784 4.693447 0.0891290
方差分析结果P值小于0.05(P = 0.0264),说明至少有两个年份之间的平均温度存在显著差异。接下来通过 Tukey HSD(图基)多重比较检验确定具体是哪些组之间存在差异:校正后P值小于0.05的只有2013年与1850年这一组。
- ggplot2作图展示Tukey HSD检验结果
# Pull the pairwise comparison table for the Year factor out of the Tukey
# result and turn it into a data frame that ggplot2 can use.
Tuk <- TukeyHSD(c)$Year
dfTuk <- as.data.frame(Tuk)

# Give the four statistic columns syntactic names ("p adj" -> "padj").
colnames(dfTuk) <- c("diff", "lwr", "upr", "padj")

# Comparison label (e.g. "1890-1850") as a factor whose level order follows
# the table rows, so the plot keeps that order.
dfTuk$Year <- factor(rownames(Tuk), levels = rownames(Tuk))
dfTuk

# Confidence interval plot: one segment per comparison from lwr to upr, a
# dashed reference line at 0, and a "*" on comparisons with padj < 0.05.
# The layers have to be joined with `+`; as separate statements they would
# never be added to the plot.
ggplot() +
  geom_point(data = dfTuk, aes(x = Year, y = lwr)) +
  geom_point(data = dfTuk, aes(x = Year, y = upr)) +
  geom_segment(
    data = dfTuk,
    aes(x = Year, xend = Year, y = lwr, yend = upr)
  ) +
  # Put the marker at the midpoint of each row's own interval; mean() would
  # collapse the whole column to a single position shared by every row.
  geom_text(
    data = dfTuk,
    aes(x = Year, y = (lwr + upr) / 2, label = ifelse(padj < 0.05, "*", "")),
    vjust = 0.1, size = 10
  ) +
  geom_hline(yintercept = 0, lty = "dashed") +
  coord_flip() +
  theme_bw() +
  labs(x = "")