配对样本检验及绘图

2021-12-06 18:15:19 浏览数 (2)

1. 下载GEO数据

代码语言:javascript复制
#=======================================================

#set the working files and load the packages
#=======================================================

# install some packages if neccessary
# if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
# BiocManager::install("limma")

setwd("D:\SCIwork\F41\train")
library(GEOquery)
rm(list=ls())
library(dplyr)
library(tidyr)
library(Biobase)
library(limma)


#=======================================================


#=======================================================

Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 10)
gsename = "GSE70768"
# 下载基因芯片数据,destdir参数指定下载到本地的地址
gse<- getGEO(gsename, destdir = ".")
##根据GSE号来下载数据,下载_series_matrix.txt.gz
gpl<- getGEO('GPL10558', destdir = ".")
##根据GPL号下载的是芯片设计的信息, soft文件
gse <- getGEO(filename = 'GSE70768_series_matrix.txt.gz')
gpl <- getGEO(filename = 'GPL10558.soft')
gpl <- gpl@dataTable@table
colnames(gpl)
gpl <- gpl %>%
  dplyr::select(ID, "Symbol")
write.csv(gpl,"GPL.csv", row.names = F)
# gse中的行名ID与gene name的对应关系
genename = read.csv("GPL.csv")

2. 注释及得到基因表达矩阵

代码语言:javascript复制
# 构建表达矩阵
exprSet <- as.data.frame(exprs(gse)) # 得到表达矩阵,行名为ID,需要转换
# 转换ID为gene name
exprSet$ID = rownames(exprSet)
express = merge( x=genename, y=exprSet, by="ID")
express$ID = NULL
express[1:5,1:5]
express[which(is.na(express),arr.ind = T)]<-0 #结合which进行缺失替代
exprSet <- aggregate(x = express[,2:ncol(express)],
                     by = list(express$Symbol),
                     FUN = median)
express[1:5,1:5]
exprSet <- as.data.frame(exprSet)
exprSet <-exprSet[-1,]
names(exprSet)[1] <- 'ID'
write.csv(exprSet, file="exprSet.csv")

##########################################################################################
##
###########################################################################################

colnames(exprSet)
exprSet[1:5,1:5]
exprSet <- exprSet[-1,]
gene1 <- c("ERBB2")
exprSet1 <- exprSet[which(exprSet$ID %in% gene1),]
rownames(exprSet1) <- exprSet1$ID
exprSet1$ID <- NULL
exprSet1 <- as.data.frame(t(exprSet1))
exprSet1$sample <- rownames(exprSet1)
> head(exprSet1)
              ERBB2     sample
GSM1817707 6.374360 GSM1817707
GSM1817708 6.434586 GSM1817708
GSM1817709 6.304334 GSM1817709
GSM1817710 6.286188 GSM1817710
GSM1817711 6.554135 GSM1817711
GSM1817712 6.451002 GSM1817712

3. 提取配对样本数据

代码语言:javascript复制
pd <- pData(gse)

dt <- subset(pd, select=c("geo_accession", "characteristics_ch1", 'title'))

dt$num <- 'num'

for (i in 1:dim(dt)[1]) {
  number <- as.numeric(nchar(dt$title[i]))-8
  dt$num[i] <- substr(x=dt$title[i], start = number, stop = number 8)
}

dt <- dt[-(1:13),]

df <- as.data.frame(table(dt$num))

df <- subset(df, df$Freq == 2)

dt <- dt[which(dt$num %in% df$Var1),]

dt$title <- NULL

names(dt)[2] <- 'group'
names(dt)[1] <- 'sample'

table(dt$group)

dt$group <- ifelse(dt$group == 'sample type: Tumour', 'Tumor', 'Normal')
table(dt$group)

dt <- merge(dt, exprSet1, by='sample')
> head(dt)
      sample group       num    ERBB2
1 GSM1817723 Tumor TB08.0341 6.572179
2 GSM1817724 Tumor TB08.0489 6.194203
3 GSM1817729 Tumor TB08.0598 6.188180
4 GSM1817730 Tumor TB08.0618 6.300513
5 GSM1817731 Tumor TB08.0667 6.279266
6 GSM1817737 Tumor TB08.0872 6.435920

5. 计算配对样本T检验及wilcox检验的P值

代码语言:javascript复制
dt_N <- subset(dt, group == "Normal")
dt_N <- dt_N$ERBB2

dt_T <- subset(dt, group == "Tumor")
dt_T <- dt_T$ERBB2

library(PairedData)
pd <- paired(dt_N, dt_T)

# 计算之前前后的差异
d <- with(dt, ERBB2[group == "Tumor"] - ERBB2[group == "Normal"])
#Shapiro-Wilk正态性检验差值是否符合正态分布
shapiro.test(d) # p-value = 0.11

# 配对样本t检验
res <- t.test(dt_N,dt_T, paired = TRUE)
# 显示结果
res

# 配对样本wilcox检验
res <- wilcox.test(dt_N,dt_T, paired = TRUE)
res

6. 绘图

代码语言:javascript复制
mean(dt_N)
#median(dt_N)
mean(dt_T)
#median(dt_T)

library(dplyr)
library(ggplot2)
library(ggpubr)
theme_set(theme_pubclean())


plot <- ggplot(data = dt, aes(x = group, y = ERBB2))  
  geom_boxplot(fatten = NULL,aes(colour = group ))  
  scale_color_manual(values=c("#137F5F", "#ED553B")) 
  aes(colour = group) 
  stat_summary(fun = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
               width = 0.75, size = 1, linetype = "solid") 
  geom_point(aes(colour = factor(group)), size=1, alpha=0.5)  
  geom_line(aes(group=num), colour="gray50", linetype="11")  
  theme_classic()

print(plot)

pdf(file = 'pair.pdf', height = 4, width = 4)
print(plot)
dev.off()

0 人点赞