表达芯片数据分析3——基因差异分析绘制火山图及差异基因热图

2023-10-02 23:27:54 浏览数 (2)

差异分析

芯片差异分析所需要的输入数据

代码语言:text复制
fviz_pca_ind(iris.pca,
             geom.ind = "point", # show points only (nbut not "text")
             col.ind = iris$Species, # color by groups
             palette = c("#00AFBB", "#E7B800", "#FC4E07"),
             addEllipses = TRUE, # Concentration ellipses
             legend.title = "Groups"
             )
             

输入数据准备:

代码语言:text复制
rm(list = ls())  
load(file = "step2output.Rdata")
#输入数据:exp和Group
#Principal Component Analysis
#http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/112-pca-principal-component-analysis-essentials
# fviz_pca_ind(iris.pca,
#              geom.ind = "point", # show points only (nbut not "text")
#              col.ind = iris$Species, # color by groups
#              palette = c("#00AFBB", "#E7B800", "#FC4E07"),
#              addEllipses = TRUE, # Concentration ellipses
#              legend.title = "Groups"
# )
##ctrl shift C批量注释

PCA 图

代码语言:text复制
dat=as.data.frame(t(exp))
library(FactoMineR)
library(factoextra) 
dat.pca <- PCA(dat, graph = FALSE)
fviz_pca_ind(dat.pca,
             geom.ind = "point", # show points only (nbut not "text")
             col.ind = Group, # color by groups
             palette = c("#00AFBB", "#E7B800"),
             addEllipses = TRUE, # Concentration ellipses
             legend.title = "Groups"
)
# 2.top 1000 sd 热图---- 
###看一下数据,差异基因或者组内差异较大的基因;
g = names(tail(sort(apply(exp,1,sd)),1000)) #day7-apply的思考题
n = exp[g,]
library(pheatmap)
annotation_col = data.frame(row.names = colnames(n),
                            Group = Group)
##分组信息;
ann_colors = list(Group = c(Disease="#1B9E77", Normal="firebrick"))
##添加颜色;
pheatmap(n,
         show_colnames =F,
         show_rownames = F,
         annotation_col=annotation_col,
         annotation_colors = ann_colors,
         color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
         scale = "row", #按行标准化,只保留行内差别,不保留行间差别,会把数据范围缩放到大概-5~5之间
         breaks = seq(-3,3,length.out = 100) #设置色带分布范围为-3~3之间,超出此范围的数字显示极限颜色
         ) 
?pheatmap
# 关于scale的进一步学习:zz.scale.R

芯片分析后的数据整理:

二分组差异分析

代码语言:text复制
rm(list = ls()) 
load(file = "step2output.Rdata")
#差异分析
library(limma)
design = model.matrix(~Group)#模型矩阵
fit = lmFit(exp,design)#线性拟合
fit = eBayes(fit)#贝叶斯检验
deg = topTable(fit,coef = 2,number = Inf)#提取结果

##函数表示
analy=function(exp,Group){
  design = model.matrix(~Group)#模型矩阵
  fit = lmFit(exp,design)#线性拟合
  fit = eBayes(fit)#贝叶斯检验
  deg = topTable(fit,coef = 2,number = Inf)#提取结果
}
deg2=analy(exp,Group)
identical(deg,deg2)

多个探针对应一个基因

随机去重、保留平均值最大的探针或者取多个探针的平均值。

代码语言:text复制
#为deg数据框添加几列
#1.加probe_id列,把行名变成一列
library(dplyr)
deg = mutate(deg,probe_id = rownames(deg))
?mutate
#2.加上探针注释
ids = distinct(ids,symbol,.keep_all = T)##随机去除重复值;
deg = inner_join(deg,ids,by="probe_id")
nrow(deg)
#3.加change列,标记上下调基因
logFC_t = 1
p_t = 0.05
k1 = (deg$P.Value < p_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < p_t)&(deg$logFC > logFC_t)
deg = mutate(deg,change = ifelse(k1,"down",ifelse(k2,"up","stable")))
table(deg$change)
#火山图
library(ggplot2)
ggplot(data = deg, 
            aes(x = logFC, 
                y = -log10(P.Value)))  
  geom_point(alpha=0.4, size=3.5, 
             aes(color=change))  
  scale_color_manual(values=c("blue", "grey","red")) 
  geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",linewidth=0.8)  
  geom_hline(yintercept = -log10(p_t),lty=4,col="black",linewidth=0.8)  
  theme_bw()##改灰色背景
代码语言:text复制
# 差异基因热图----
# 表达矩阵行名替换为基因名
exp = exp[deg$probe_id,]
rownames(exp) = deg$symbol
diff_gene = deg$symbol[deg$change !="stable"]
n = exp[diff_gene,]
library(pheatmap)
annotation_col = data.frame(group = Group)
rownames(annotation_col) = colnames(n) 
pheatmap(n,show_colnames =F,
         show_rownames = F,
         scale = "row",
         #cluster_cols = F, 
         annotation_col=annotation_col,
         breaks = seq(-3,3,length.out = 100)
) 

0 人点赞