R语言学习笔记-Day09

2024-07-17 18:14:19 浏览数 (1)

pheatmap Run Examples

#Create test matrix

test = matrix(rnorm(200), 20, 10) test1:10, seq(1, 10, 2) = test1:10, seq(1, 10, 2) 3 test11:20, seq(2, 10, 2) = test11:20, seq(2, 10, 2) 2 test15:20, seq(2, 10, 2) = test15:20, seq(2, 10, 2) 4 colnames(test) = paste("Test", 1:10, sep = "") rownames(test) = paste("Gene", 1:20, sep = "")

#Draw heatmaps

pheatmap(test) pheatmap(test, kmeans_k = 2) pheatmap(test, scale = "row", clustering_distance_rows = "correlation") pheatmap(test, color = colorRampPalette(c("navy", "white", "firebrick3"))(50)) pheatmap(test, cluster_row = FALSE) pheatmap(test, legend = FALSE)

#Show text within cells

pheatmap(test, display_numbers = TRUE) pheatmap(test, display_numbers = TRUE, number_format = "%.1e")#以指数形式显示数字,若设置为“%.2f”则显示小数点后两位 pheatmap(test, display_numbers = matrix(ifelse(test > 5, "*", ""), nrow(test))) pheatmap(test, cluster_row = FALSE, legend_breaks = -1:4, legend_labels = c("0", "1e-4", "1e-3", "1e-2", "1e-1", "1"))

#Fix cell sizes and save to file with correct size

pheatmap(test, cellwidth = 15, cellheight = 12, main = "Example heatmap") pheatmap(test, cellwidth = 15, cellheight = 12, fontsize = 8, filename = "test.pdf")

#Generate annotations for rows and columns

annotation_col = data.frame( CellType = factor(rep(c("CT1", "CT2"), 5)), Time = 1:5 ) rownames(annotation_col) = paste("Test", 1:10, sep = "")

annotation_row = data.frame( GeneClass = factor(rep(c("Path1", "Path2", "Path3"), c(10, 4, 6))) ) rownames(annotation_row) = paste("Gene", 1:20, sep = "")

#Display row and color annotations

pheatmap(test, annotation_col = annotation_col) pheatmap(test, annotation_col = annotation_col, annotation_legend = FALSE) pheatmap(test, annotation_col = annotation_col, annotation_row = annotation_row)

#Change angle of text in the columns

pheatmap(test, annotation_col = annotation_col, annotation_row = annotation_row, angle_col = "45") pheatmap(test, annotation_col = annotation_col, angle_col = "0")

#Specify colors

ann_colors = list(

Time = c("white", "firebrick"), CellType = c(CT1 = "#1B9E77", CT2 = "#D95F02"), GeneClass = c(Path1 = "#7570B3", Path2 = "#E7298A", Path3 = "#66A61E") )

pheatmap(test, annotation_col = annotation_col, annotation_colors = ann_colors, main = "Title") pheatmap(test, annotation_col = annotation_col, annotation_row = annotation_row, annotation_colors = ann_colors) pheatmap(test, annotation_col = annotation_col, annotation_colors = ann_colors2)

#Gaps in heatmaps

pheatmap(test, annotation_col = annotation_col, cluster_rows = FALSE, gaps_row = c(10, 14)) pheatmap(test, annotation_col = annotation_col, cluster_rows = FALSE, gaps_row = c(10, 14), cutree_col = 2)

#Show custom strings as row/col names

labels_row = c("", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "Il10", "Il15", "Il1b")

pheatmap(test, annotation_col = annotation_col, labels_row = labels_row)

#Specifying clustering from distance matrix

drows = dist(test, method = "minkowski") dcols = dist(t(test), method = "minkowski") pheatmap(test, clustering_distance_rows = drows, clustering_distance_cols = dcols)

#Modify ordering of the clusters using clustering callback option

callback = function(hc, mat){ sv = svd(t(mat))$v,1 dend = reorder(as.dendrogram(hc), wts = sv) as.hclust(dend) }

pheatmap(test, clustering_callback = callback)

差异分析#适用于二分组

library(limma) design = model.matrix(~Group) fit = lmFit(exp,design)#线性拟合 fit = eBayes(fit)#贝叶斯检验 deg = topTable(fit,coef = 2,number = Inf)#coef=2指对design的第二列进行分析 #topTable()默认参数为10,可输入参数number= Inf显示全部结果

判断两个数据框是否完全一致

identical(a,b)

差异分析后的数据整理

为避免行名丢失,建议将行名作为一列加入数据框

tibble::column_to_rownames() tibble::rownames_to_column()#将行名变为一列

将差异分析结果和探针注释合在一起

merge()#顺序会发生变更 inner_join()#仍然保留按照P.Value排序的表格顺序

最终呈现结果:

limma输出结果--探针名(行名)--基因名(ids)--上下调(ifelse)--ENTREZID(bitr)

在火山图上标记基因

for_lable <- test %>% filter(abs(logFC) > 4& -log10(P.value)> -log10(0.000001)) #通过自定义阈值设置显示基因的数量 p geom_point(size = 3,shape = 1,data = for_lable) ggrepel::geom_lable_repel( aes(label = symbol), data = for_label, color = "black" )

R包——tinyarray

library(tinyarray) draw_heatmap(n,Group) draw_pca(exp,Group,Style = "3D") draw_volcano(deg,pkg = 4,style = "ggplot2")#pkg = 4指单纯用limma作图 draw_boxplot(nsample(1:nrow(n),10),,Group)

富集分析

输入数据:差异基因的ENTREZID

并非和SYMBOL一一对应,存在损失均为正常现象

KEGG数据库

Kyoto Encyclopedia of Genes and Genomes

是系统分析基因功能、基因组信息数据库,有助于研究者把基因及表达信息作为一个整体网络进行研究,以理解生物系统的高级功能和实用程序资源库著称

GO数据库

Gene Ontology

是一个在生物信息学领域中广泛使用的本体,提供了一个可具代表性的规范化的基因和基因产物特性的属于描绘或词义解释的工作平台

  • 细胞组分(cellular component):细胞的每个部分和细胞外环境
  • 分子功能(molecular function):可以描述分子水平的活性,如催化或结合活性
  • 生物过程(biological process):生物过程系指由一个或多个分子功能有序组合而产生的系列事件,一般规律是,一个过程是由多个不同的步骤组成的 通过将差异基因作GO富集分析,可以把基因按照不同的功能进行归类,达到对基因进行注释和分类的目的

实战代码

rm(list = ls())undefinedload(file = 'step4output.Rdata') library(clusterProfiler) library(ggthemes) library(org.Hs.eg.db) library(enrichplot)

(1)输入数据

gene_diff = deg$ENTREZIDdeg$change != "stable"

(2)富集

#⭐下面的三句都要注意物种

ekk <- enrichKEGG(gene = gene_diff,organism = 'hsa') #其他物种https://www.genome.jp/kegg/catalog/org_list.html ekk <- setReadable(ekk,OrgDb = org.Hs.eg.db,keyType = "ENTREZID") #如果ekk是空的,这句就会报错,因为没富集到任何通路,即无pvalue<0.05。#将ENTREZID更换为对应的基因。 ego <- enrichGO(gene = gene_diff,OrgDb= org.Hs.eg.db, ont = "ALL",readable = TRUE) #setReadable和readable = TRUE都是把富集结果表格里的基因名称转为symbol class(ekk)

(3)可视化

#barplot可以换成dotplot

barplot(ego, split = "ONTOLOGY") facet_grid(ONTOLOGY ~ ., space = "free_y",scales = "free_y") #将结果按照CC, MF,BP进行分类 barplot(ekk) #如果ekk中没有padj<0.05的通路,就会报错,因为只画padj<0.05,没有参数

富集结果

#pvalue/qvalue:表征富集是否显著的参数,代表通路而非某一特定基因

##衡量每个通路中的基因在差异基因里是否足够多

#geneID:差异基因中哪些基因属于该通路

#GeneRatio:差异基因中属于该通路的数量/差异基因中被数据库收录的数量

#BgRatio:该通路中总计基因数/数据库中全部通路总计基因数

若富集不到

1* 调整logFC, pvalue和阈值,以改动差异基因数量

2* 不适用默认的padj(富集的),而是使用原始p值,在文章中写清即可

3* 更换富集方法,GSEA也可以作KEGG富集

4* 调整参数maxGSSize = 500,为默认参数,表示500个基因以上的通路不考虑,可以调高该值

#若基因symbol中多了空格,尝试使用trimws(deg$symbol)对空格进行去除

引用自生信技能树

0 人点赞