玮瑜课程

2024-01-10 16:45:47 浏览数 (1)

第一节

打开RStudio先运行以下代码

代码语言:r复制
library(GEOquery)
library(dplyr)
library(tidyverse)
library(data.table)

1.R包的安装

代码语言:r复制
1.install.packages("bao")

2.install.packages("BiocManager") 
BiocManager::install("bao")


3.devtools::install_github("bao")         

2.在GEO数据库中下载GSE信息

代码语言:r复制
install.packages("tidyverse")
library(GEOquery)
gset <- getGEO(GEO = "GSE12417",destdir = ".",getGPL = F)#destdir="."表示下载到当前路径
e2 <- gset[[2]] 
phe <- e2@phenoData@data  #提取临床信息
exp <- e2@assayData[["exprs"]]  #提取表达矩阵

3.在GEO数据库中下载对应平台信息:(平台信息含探针对应的基因名)

代码语言:r复制
library(data.table)
?fread
getwd
anno=fread("GPL96-57554.txt",sep="t",header=T,data.table = F)#能在excel中打开的文件都可以用fread函数读取
ID_symbol <- anno[,c("ID","Gene Symbol")]  

#部分探针对应的基因名有两个(如"DDR1 /// MIR4640" ),将排在后面的去掉
symbol <- ID_symbol$`Gene Symbol`
symbol1 <- strsplit(symbol,split=" /// ",fixed=T)#fixed=T表示精确查找
gene_symbol <- sapply(symbol1,function(x){x[1]})#提取每一个子列表中的第一个元素
ID_gene_symbol <- data.frame(ID_symbol$ID,gene_symbol)
rm(ID_symbol,symbol,symbol1,gene_symbol)

4.将表达矩阵与基因名合并

代码语言:r复制
exp <- as.data.frame(exp)#merge函数只对两个data.frame进行合并
exp1 <- merge(ID_gene_symbol,exp,by.x = 1,by.y = 0)
exp2 <- distinct(exp1,gene_symbol,.keep_all=T)#去除重复的基因名
exp3 <- na.omit(exp2)#将缺失的基因名去除
rownames(exp3) <- exp3$gene_symbol
exp4 <- exp3[,-c(1,2)]
rm(ID_gene_symbol,exp,exp1,exp2,exp3)

至此,我们获得了想要的横坐标为基因名,纵坐标为样本名的表达矩阵exp4

5.第五节:绘制生存曲线

代码语言:r复制
s=phe$characteristics_ch1#将含生存时间和生存状态的列取出来
s[1:5] #[1] "AML, normal karyotype (training set) FAB M4; age =62 years; OS = 4 days; status (0=alive/1=dead): 1"
s1 <- strsplit(s,split="; ",fixed = T)
os.time <- sapply(s1,function(x){x[3]})#此时提取出来的生存时间为字符型向量(如OS = 4 days)
os.time1 <- strsplit(os.time,split=" ",fixed = T)
os.time2 <- sapply(os.time1,function(x){x[4]})
os <- as.numeric(os.time2)

status <- sapply(s1,function(x){x[4]})#此时提取出来的生存状态为" status (0=alive/1=dead): 0"
status1 <- strsplit(status,split=": ",fixed = T)
status2 <- sapply(status1,function(x){x[2]})
final_status <- as.numeric(status2)

rm(s,s1,os.time,os.time1,os.time2,status,status1,status2)

OS_status <- data.frame(os,final_status)
rownames(OS_status) <- row.names(phe)

提取某个基因作为分组

代码语言:r复制
#中位数分组
OS_status_exp$EZH2 <- ifelse(OS_status_exp$EZH2>median(OS_status_exp$EZH2),"high","low")

library(survival)
library(survminer)
Surv(os,final_status)#创建生存曲线的对象
x1 <- survfit(Surv(os,final_status)~EZH2,data = OS_status_exp)#对生存曲线进行拟合
ggsurvplot(x1,conf.int =T,pval=T)#conf.int=T表示加上95%置信区间

#四分位数分组
OS_status_exp <- merge(OS_status,exp4.e,by.x=0,by.y = 0)
q <- quantile(OS_status_exp$EZH2)
OS_status_exp$fenzu <- ifelse(OS_status_exp$EZH2>q[4],"high",
                              ifelse(OS_status_exp$EZH2>q[3]& OS_status_exp$EZH2<q[4],"h.stable",
                                     ifelse(OS_status_exp$EZH2>q[2]&OS_status_exp$EZH2<q[3],"l.stable","down")))

x1 <- survfit(Surv(os,final_status)~fenzu,data = OS_status_exp)#对生存曲线进行拟合
ggsurvplot(x1,conf.int =T,pval=T,risk.table = T,#conf.int=T表示加上95%置信区间,添加风险因子表
           surv.median.line = "hv")#添加中位生存时间

#使用ggsurvplot_facet()函数绘制分面生存曲线
OS_status_exp$sex <- c(rep("m",100),rep("f",63))
x1 <- survfit(Surv(os,final_status)~EZH2,data = OS_status_exp)#对生存曲线进行拟合
ggsurvplot_facet(x1,OS_status_exp,
                 facet.by = "sex",#设置分面变量
                 palette="npg",#设置颜色画板
                 pval = T)

6.第五六节:差异分析

部分平台注释文件不能下载,解决办法:

1. gset <- getGEO("GSE149507",destdir = ".",getGPL = T)→gset[["GSE149507_series_matrix.txt.gz"]]@featureData@data

2.下载soft文件:soft <- getGEO(filename="GSE149507_family.soft")→soft@gpls[["GPL23270"]]@dataTable@table

#getGEO()可下载网页文件,也可读取soft文件 。

代码语言:r复制
gset <- getGEO("GSE149507",destdir = ".",getGPL = T)
gset <- gset[[1]]
pdata <- gset@phenoData@data
exprset <- gset@assayData[["exprs"]]
gpl <- gset@featureData@data

#将ENTREZ_GENE_ID转换为gene symbol(构建含基因名称的平台表格)
library(org.Hs.eg.db)
keytypes(org.Hs.eg.db)#查看一下有哪些基因ID
x1 <- gpl$ENTREZ_GENE_ID
x1 <- as.character(x1)
x2 <- AnnotationDbi::select(org.Hs.eg.db,keys=x1,columns=c("ENTREZID","SYMBOL",
                                            keytype = "ENTREZID")#keys和keytype需对应
gpl$gene <- x2$SYMBOL#在gpl表中加上探针对应的基因名称

#将表达矩阵和平台信息合并
exp <- as.data.frame(exprset)
exp.pl <- merge(gpl,exp,by.x = 1,by.y = 0)#将平台信息和含探针信息的表达矩阵按照探针合并
exp.pl1 <- distinct(exp.pl,gene,.keep_all=T)#去除重复的基因名
exp.pl2 <- na.omit(exp.pl1)#将缺失的基因名去除
rownames(exp.pl2) <- exp.pl2$gene
exp.pl3 <- exp.pl2[,-c(1:5)]
rm(x1,x2,exp,exp.pl,exp.pl1,exp.pl2)

差异分析

代码语言:r复制
tumor <- c(seq(1,35,2))
control <- tumor 1
exp4 <- exp.pl3[,c(control,tumor)]
group_list <- c(rep("control",18),rep("tumor",18))
group_list <- factor(group_list)
group_list <- relevel(group_list,ref="control")#强制限定顺序,control在前

library(limma )
exp5 <- normal izeBetweenArrays(exp4)#除去批次效应
boxplot(exp4,outline=F,notch=T,col=group_list,las=2)
boxplot(exp5,outline=F,notch=T,col=group_list,las=2)#绘制出的图形相同,说明样本已经进行过归一化处理

#构建差异分析的矩阵
design <- model.matrix(~group_list)
colnames(design) <- levels(group_list)
rownames(design) <- colnames(exp5)


fit <- lmFit(exp5,design)#构建线性拟合模型
fit <- eBayes(fit)#eBayes()使用trend=T对标准误差进行贝叶斯平滑,计算每个对比中每个基因的moderated t-statistic和log-odds
allDiff <- topTable(fit,coef=2,adjust="fdr",number = Inf)#topTable()给出一个最有可能在给定对比条件下差异表达的基因列表
#coef=2中的”2“代表design中的第2列

火山图

代码语言:javascript复制
#画个火山图
library(ggplot2)
library(ggrepel)
data <- allDiff
data$significant <- "stable"
data$significant[data$logFC>=0.585 & data$P.Value<0.05]="up"
data$significant[data$logFC<=-0.585 & data$P.Value<0.05]="down"

ggplot(data,aes(logFC,-1*log10(P.Value))) xlim(-2,2) ylim(0,8) 
         geom_point(aes(color=significant),size=0.8) theme_classic() 
  scale_color_manual(values = c("#2a9d8f","#EE7AE9","#f8961e")) 
  geom_hline(yintercept = 1.3,linetype=4,size=0.3) 
  geom_vline(xintercept = c(-0.585,0.585),linetype=4,size=0.3) 
  theme(title = element_text(size=18),text = element_text(size=18)) 
  labs(x="log2(foldchange)",y="-log10(P_Value)")

#筛选差异基因
select.FPKM <- data$AveExpr>5  #表达量>5
select.log2FC <- abs(data$logFC)>1
select.Pvalue <- data$P.Value<0.05
select.vec <- select.FPKM & select.log2FC & select.Pvalue
table(select.vec)

degs.list <- as.character(rownames(data))[select.vec]
label.deg <- sample(degs.list,20)#若想单独标注某个基因,则label.deg <- c("PID1","MT3")
p=ggplot(data,aes(logFC,-1*log10(P.Value))) xlim(-2,2) ylim(0,8) 
  geom_point(aes(color=significant),size=0.8) theme_classic() 
  scale_color_manual(values = c("#2a9d8f","#EE7AE9","#f8961e")) 
  geom_hline(yintercept = 1.3,linetype=4,size=0.3) 
  geom_vline(xintercept = c(-0.585,0.585),linetype=4,size=0.3) 
  theme(title = element_text(size=18),text = element_text(size=18)) 
  labs(x="log2(foldchange)",y="-log10(P_Value)")
data_selected <- data[label.deg,]
p geom_label_repel(data = data_selected,
                   aes(label=rownames(data_selected)))

热图

代码语言:javascript复制
#热图
annotation_col1 <- data.frame(
  database <- c(rep("GEO",36)),
   CellType <- c(rep("control",18),rep("treatment",18))
  )
rownames(annotation_col1) <- colnames(exp5)
class(exp5)
exp6 <- as.data.frame(exp5)
exprset.map <- exp6[label.deg,]
install.packages("pheatmap")
library(pheatmap)
pheatmap::pheatmap(exprset.map,#热图的数据
                   cluster_rows = F,#行聚类
                   cluster_cols=F,#列聚类,可以看出样本之间的区分度
                   annotation_col = annotation_col1,
                   show_colnames = F,
                   scale="row",#以行标准化
                   color=colorRampPalette(c("blue","white","red"))(100))

0 人点赞