美化你的单细胞各个亚群特异性高表达基因小提琴图

单细胞数据分析里面最基础的就是降维聚类分群，参考前面的例子：人人都能学会的单细胞聚类分群注释 ，这个大家基本上问题不大了，使用seurat标准流程即可，不过它默认出图并不好看，详见以前我们做的投票：可视化单细胞亚群的标记基因的5个方法，下面的5个基础函数相信大家都是已经烂熟于心了：

VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
FeaturePlot(pbmc, features = c("MS4A1", "CD79A"))
RidgePlot(pbmc, features = c("MS4A1", "CD79A"), ncol = 1)
DotPlot(pbmc, features = unique(features)) RotatedAxis()
DoHeatmap(subset(pbmc, downsample = 100), features = features, size = 3)

最近，郑州大学第一附属医院的史阳同学无私的分享了他对这些基础函数的改造，颜值说不上巅峰，但打败基础函数是没有问题的，同时也算是抛砖引玉吧，希望广大生信技能树粉丝们都投稿分享自己的创作，投稿请发邮件到 jmzeng1314@163.com

仍然是以大家熟知的pbmc3k数据集为例

大家先安装这个数据集对应的包，并且对它进行降维聚类分群，，而且每个亚群找高表达量基因，都存储为Rdata文件。

代码语言：javascript复制

# 0.安装R包 ----
# devtools::install_github('caleblareau/BuenColors')
# utils::install.packages(pkgs = "ggstatsplot")
# InstallData("pbmc3k") 

# 1.加载R包和测试数据 ----
rm(list = ls())
library(SeuratData) #加载seurat数据集  
getOption('timeout')
options(timeout = 10000)
data("pbmc3k")  
sce <- pbmc3k.final  
library(Seurat)

DimPlot(sce,reduction = "umap",label = TRUE) 
unique(Idents(sce))
sce$celltype = Idents(sce)

# 2.单细胞分析基本流程 ----
# 主要是拿到 tSNE和Umap的坐标，因为默认的 pbmc3k.final 是没有的
sce <- NormalizeData(sce, normalization.method =  "LogNormalize",  
                     scale.factor = 1e4) 
sce <- FindVariableFeatures(sce, 
                            selection.method = "vst", nfeatures = 2000)  
sce <- ScaleData(sce) 
sce <- RunPCA(object = sce, pc.genes = VariableFeatures(sce))  
sce <- FindNeighbors(sce, dims = 1:15)
sce <- FindClusters(sce, resolution = 0.8) 
set.seed(123)
sce <- RunTSNE(object = sce, dims = 1:15, do.fast = TRUE) 
sce <- RunUMAP(object = sce, dims = 1:15, do.fast = TRUE)
DimPlot(object = sce, reduction = "umap",label = TRUE)   
# 3.定义分组信息 ---- 
Idents(sce) = sce$celltype

# 4.寻找各个单细胞亚群的特征性高表达量基因----
sce.markers <- FindAllMarkers(object = sce, 
                              only.pos = TRUE,
                              min.pct = 0.25, 
                              thresh.use = 0.25) 

save(sce, sce.markers, file = 'tmp.Rdata')

接下来改造小提琴图的函数

先放一下改造效果图：

小提琴图和热图一样的，需要指定基因，不同的是前面的热图可以自己随心所欲指定基因，但是这个时候的小提琴图没有做这样的自适应，仅仅是top基因：

代码语言：javascript复制

marker_selected_1 <- sce.markers  %>% 
  dplyr::filter(p_val_adj < 0.05) %>% 
  dplyr::filter(pct.1 >= 0.5 & pct.2 <= 0.6) %>% 
  dplyr::group_by(cluster) %>% 
  dplyr::slice_max(order_by = avg_log2FC, n = 2)

marker_selected_2 <- sce.markers  %>% 
  dplyr::filter(p_val_adj < 0.05) %>% 
  dplyr::filter(pct.1 >= 0.5 & pct.2 <= 0.6) %>% 
  dplyr::group_by(cluster) %>% 
  dplyr::slice_max(order_by = avg_log2FC, n = 3)

绘图代码很简单；

代码语言：javascript复制

source('ViolinPlot.R')
library(patchwork)
ViolinPlot(object = sce, groupBy = 'celltype', MarkerSelected = marker_selected_1)
ViolinPlot(object = sce, groupBy = 'celltype', MarkerSelected = marker_selected_2)

所有的细节都在 SeuratPlot.R 文件里面的 SeuratPlot函数，是自定义的，内容如下所示：

代码语言：javascript复制

ViolinPlot <- function(object, groupBy, MarkerSelected) {
  # (1)获取绘图数据1
  plot_data = FetchData(object = object, 
                        vars = c(MarkerSelected$gene, groupBy), 
                        slot = 'data') %>% 
    dplyr::rename(group = as.name(groupBy)) %>% 
    tidyr::pivot_longer(cols = -group, names_to = 'Feat', values_to = 'Expr')

  # (2)获取绘图数据2
  ident_plot = MarkerSelected %>% 
    dplyr::select(cluster, gene)
  
  # (3)绘图
  figure_1 = ggplot(data = plot_data, mapping = aes(x = Expr,
                                                    y = fct_rev(factor(x = Feat, 
                                                                       levels = MarkerSelected$gene)), 
                                                    fill = group, 
                                                    label = group))  
              geom_violin(scale = 'width', adjust = 1, trim = TRUE)  
              scale_x_continuous(expand = c(0, 0), labels = function(x)
                c(rep(x = '', times = length(x) - 2), x[length(x) - 1], ''))  
              facet_grid(cols = vars(group), scales = 'free')  
              cowplot::theme_cowplot(font_family = 'Arial')  
              scale_fill_manual(values = paletteer::paletteer_d('ggsci::category20c_d3'))  
              xlab('Expression Level')   
              ylab('')  
              theme(legend.position = 'none', 
                    panel.spacing = unit(x = 0, units = 'lines'),
                    axis.line = element_blank(), #去除x和y轴坐标线(不包括axis tick)；
                    panel.background = element_rect(fill = NA, color = 'black'),
                    strip.background = element_blank(), #去除分页题头背景；
                    strip.text = element_text(color = 'black', size = 10, family = 'Arial', face = 'bold'),
                    axis.text.x = element_text(color = 'black', family = 'Arial', size = 11),
                    axis.text.y = element_blank(),
                    axis.title.x = element_text(color = 'black', family = 'Arial', size = 15),
                    axis.ticks.x = element_line(color = 'black', lineend = 'round'),
                    axis.ticks.y = element_blank(),
                    axis.ticks.length = unit(x = 0.1, units = 'cm'))
  
  figure_2 = ggplot(data = ident_plot, aes(x = 1,
                                           y = fct_rev(factor(x = gene, levels = MarkerSelected$gene)),
                                           fill = cluster))  
    geom_tile()  
    theme_bw(base_size = 12)  
    scale_fill_manual(values = paletteer::paletteer_d('ggsci::category20c_d3'))  
    scale_x_continuous(expand = c(0, 0))  
    scale_y_discrete(expand = c(0, 0))  
    guides(fill = guide_legend(direction = 'vertical',
                               label.position = 'right',
                               title.theme = element_blank(),
                               keyheight = 0.5,
                               nrow = 2))  
    xlab('Feature')  
    theme(legend.text = element_text(family = 'Arial', color = 'black', size = 11),
          legend.position = 'bottom',
          legend.justification = 'left',
          legend.margin = margin(0,0,0,0),
          legend.box.margin = margin(-10,05,0,0),
          panel.spacing = unit(0, 'lines'),
          panel.background = element_blank(),
          panel.border = element_blank(),
          plot.background = element_blank(),
          plot.margin = unit(x = c(0,0,0,0), units = 'cm'),
          axis.title.y = element_blank(),
          axis.text.y = element_text(angle = 0, hjust = 1, vjust = 0.5, color = 'black', family = 'Arial'),
          axis.title.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.x = element_blank())

  figure_2   figure_1   patchwork::plot_layout(nrow = 1, widths = c(0.03, 0.97))
}

小提琴图展示各个单细胞亚群的特异性高表达量基因的一个好处是，它这里并不会受到各个细胞亚群的细胞总数的影响。

如果你确实觉得我的教程对你的科研课题有帮助，让你茅塞顿开，或者说你的课题大量使用我的技能，烦请日后在发表自己的成果的时候，加上一个简短的致谢，如下所示：

代码语言：javascript复制

We thank Dr.Jianming Zeng(University of Macau), and all the members of his bioinformatics team, biotrainee, for generously sharing their experience and codes.

十年后我环游世界各地的高校以及科研院所（当然包括中国大陆）的时候，如果有这样的情谊，我会优先见你。

com data size subset unique

0 人点赞

美化你的单细胞各个亚群特异性高表达基因小提琴图

仍然是以大家熟知的pbmc3k数据集为例

接下来改造 小提琴图的 函数

接下来改造小提琴图的函数