1. 下载GEO数据
代码语言:javascript
复制#=======================================================
#set the working files and load the packages
#=======================================================
# install some packages if neccessary
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("limma")
setwd("D:\SCIwork\F41\train")
library(GEOquery)
rm(list=ls())
library(dplyr)
library(tidyr)
library(Biobase)
library(limma)
#=======================================================
#=======================================================
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 10)
gsename = "GSE70768"
# 下载基因芯片数据,destdir参数指定下载到本地的地址
gse<- getGEO(gsename, destdir = ".")
##根据GSE号来下载数据,下载_series_matrix.txt.gz
gpl<- getGEO('GPL10558', destdir = ".")
##根据GPL号下载的是芯片设计的信息, soft文件
gse <- getGEO(filename = 'GSE70768_series_matrix.txt.gz')
gpl <- getGEO(filename = 'GPL10558.soft')
gpl <- gpl@dataTable@table
colnames(gpl)
gpl <- gpl %>%
dplyr::select(ID, "Symbol")
write.csv(gpl,"GPL.csv", row.names = F)
# gse中的行名ID与gene name的对应关系
genename = read.csv("GPL.csv")
2. 注释及得到基因表达矩阵
代码语言:javascript
复制# 构建表达矩阵
exprSet <- as.data.frame(exprs(gse)) # 得到表达矩阵,行名为ID,需要转换
# 转换ID为gene name
exprSet$ID = rownames(exprSet)
express = merge( x=genename, y=exprSet, by="ID")
express$ID = NULL
express[1:5,1:5]
express[which(is.na(express),arr.ind = T)]<-0 #结合which进行缺失替代
exprSet <- aggregate(x = express[,2:ncol(express)],
by = list(express$Symbol),
FUN = median)
express[1:5,1:5]
exprSet <- as.data.frame(exprSet)
exprSet <-exprSet[-1,]
names(exprSet)[1] <- 'ID'
write.csv(exprSet, file="exprSet.csv")
##########################################################################################
##
###########################################################################################
colnames(exprSet)
exprSet[1:5,1:5]
exprSet <- exprSet[-1,]
gene1 <- c("ERBB2")
exprSet1 <- exprSet[which(exprSet$ID %in% gene1),]
rownames(exprSet1) <- exprSet1$ID
exprSet1$ID <- NULL
exprSet1 <- as.data.frame(t(exprSet1))
exprSet1$sample <- rownames(exprSet1)
> head(exprSet1)
ERBB2 sample
GSM1817707 6.374360 GSM1817707
GSM1817708 6.434586 GSM1817708
GSM1817709 6.304334 GSM1817709
GSM1817710 6.286188 GSM1817710
GSM1817711 6.554135 GSM1817711
GSM1817712 6.451002 GSM1817712
3. 提取配对样本数据
代码语言:javascript
复制pd <- pData(gse)
dt <- subset(pd, select=c("geo_accession", "characteristics_ch1", 'title'))
dt$num <- 'num'
for (i in 1:dim(dt)[1]) {
number <- as.numeric(nchar(dt$title[i]))-8
dt$num[i] <- substr(x=dt$title[i], start = number, stop = number 8)
}
dt <- dt[-(1:13),]
df <- as.data.frame(table(dt$num))
df <- subset(df, df$Freq == 2)
dt <- dt[which(dt$num %in% df$Var1),]
dt$title <- NULL
names(dt)[2] <- 'group'
names(dt)[1] <- 'sample'
table(dt$group)
dt$group <- ifelse(dt$group == 'sample type: Tumour', 'Tumor', 'Normal')
table(dt$group)
dt <- merge(dt, exprSet1, by='sample')
> head(dt)
sample group num ERBB2
1 GSM1817723 Tumor TB08.0341 6.572179
2 GSM1817724 Tumor TB08.0489 6.194203
3 GSM1817729 Tumor TB08.0598 6.188180
4 GSM1817730 Tumor TB08.0618 6.300513
5 GSM1817731 Tumor TB08.0667 6.279266
6 GSM1817737 Tumor TB08.0872 6.435920
5. 计算配对样本T检验及wilcox检验的P值
代码语言:javascript
复制dt_N <- subset(dt, group == "Normal")
dt_N <- dt_N$ERBB2
dt_T <- subset(dt, group == "Tumor")
dt_T <- dt_T$ERBB2
library(PairedData)
pd <- paired(dt_N, dt_T)
# 计算之前前后的差异
d <- with(dt, ERBB2[group == "Tumor"] - ERBB2[group == "Normal"])
#Shapiro-Wilk正态性检验差值是否符合正态分布
shapiro.test(d) # p-value = 0.11
# 配对样本t检验
res <- t.test(dt_N,dt_T, paired = TRUE)
# 显示结果
res
# 配对样本wilcox检验
res <- wilcox.test(dt_N,dt_T, paired = TRUE)
res
6. 绘图
代码语言:javascript
复制mean(dt_N)
#median(dt_N)
mean(dt_T)
#median(dt_T)
library(dplyr)
library(ggplot2)
library(ggpubr)
theme_set(theme_pubclean())
plot <- ggplot(data = dt, aes(x = group, y = ERBB2))
geom_boxplot(fatten = NULL,aes(colour = group ))
scale_color_manual(values=c("#137F5F", "#ED553B"))
aes(colour = group)
stat_summary(fun = mean, geom = "errorbar", aes(ymax = ..y.., ymin = ..y..),
width = 0.75, size = 1, linetype = "solid")
geom_point(aes(colour = factor(group)), size=1, alpha=0.5)
geom_line(aes(group=num), colour="gray50", linetype="11")
theme_classic()
print(plot)
pdf(file = 'pair.pdf', height = 4, width = 4)
print(plot)
dev.off()