单细胞测序—GSE218208(流程简化)

2024-07-30 18:23:22 浏览数 (2)

单细胞测序—GSE218208

上一篇帖子学习记录了Seurat官方给出的分析流程单细胞测序—基础分析流程,本篇文章以GSE218208为例,记录下实际的单细胞分析流程,更为简便,其与官方给出的流程略有不同。

1 读取数据与创建Seurat对象

从GEO上下载GSE218208的数据,存放在工作目录下,其数据是以tar包给出,这也是常见的组织形式。

代码语言:r复制
untar("GSE218208_RAW.tar")
rm(list = ls())
a = data.table::fread("GSM6736629_10x-PBMC-1_ds0.1974_CountMatrix.tsv.gz",data.table = F)
#fread无法直接设置行名
a[1:4,1:4]
##                 alias:gene AAACCCAAGTAGGGTC AAACCCACACCATTCC AAACCCATCTACACTT
## 1   TSPAN6:ENSG00000000003                0                0                0
## 2     DPM1:ENSG00000000419                0                1                0
## 3    SCYL3:ENSG00000000457                0                0                0
## 4 C1orf112:ENSG00000000460                0                0                0
library(tidyverse)
#只保留基因名称,并去重
a$`alias:gene` = str_split(a$`alias:gene`,":",simplify = T)[,1]
#str_split_i(a$`alias:gene`,":",i = 1)
a = distinct(a,`alias:gene`,.keep_all = T)
#把基因的名字设置为行名
a = column_to_rownames(a,var = "alias:gene")
a[1:4,1:4]
##          AAACCCAAGTAGGGTC AAACCCACACCATTCC AAACCCATCTACACTT AAACGAAAGCACGTCC
## TSPAN6                  0                0                0                0
## DPM1                    0                1                0                0
## SCYL3                   0                0                0                0
## C1orf112                0                0                0                0
library(Seurat)
#CreateSeuratObject允许直接接受一个矩阵
pbmc <- CreateSeuratObject(counts = a, 
                           project = "a", 
                           min.cells = 3, 
                           min.features = 200)

2 质控

对细胞进行的质控,指标是:

  • 线粒体基因含量不能过高
  • nFeature_RNA 不能过高或过低
代码语言:r复制
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
head(pbmc@meta.data, 3)
##                  orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCCAAGTAGGGTC          a      10768         3213   7.030089
## AAACCCACACCATTCC          a       4102         1676   5.046319
## AAACCCATCTACACTT          a       4694         1740   6.305922
VlnPlot(pbmc, 
        features = c("nFeature_RNA",
                     "nCount_RNA", 
                     "percent.mt"), 
        ncol = 3,pt.size = 0.5)

根据小提琴图估算阈值

代码语言:r复制
pbmc = subset(pbmc,nFeature_RNA < 4200 &
                nCount_RNA < 18000 &
                percent.mt < 18)

3 高变基因 降维聚类分群

包括标准化、寻找高变基因、缩放数据、PCA、UMAP/t-SNE、聚类分群

这里PCA默认的维度是15,适用绝大多数情况,可以用ElbowPlot(pbmc),看拐点验证。

代码语言:r复制
f = "obj.Rdata"
if(!file.exists(f)){
  pbmc = pbmc %>% 
  NormalizeData() %>%  
  FindVariableFeatures() %>%  
  ScaleData(features = rownames(.)) %>%  
  RunPCA(features = VariableFeatures(.))  %>%
  FindNeighbors(dims = 1:15) %>% 
  FindClusters(resolution = 0.5) %>% 
  RunUMAP(dims = 1:15) %>% 
  RunTSNE(dims = 1:15)
  save(pbmc,file = f)
}

可以验证下PCA维度,从图中可以看出,维度取前15是比较合适的

代码语言:r复制
load(f)
ElbowPlot(pbmc)
代码语言:r复制
p1 <- DimPlot(pbmc, reduction = "umap",label = T) NoLegend();p1

4 Marker基因

代码语言:r复制
library(dplyr)
f = "markers.Rdata"
if(!file.exists(f)){
  pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE,min.pct = 0.25)
  save(pbmc.markers,file = f)
}
load(f)
mks = pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
#如果一个基因同时是两簇的marker基因的话下面的代码会报错,因此需要去重
g = unique(mks$gene)
#可视化
DoHeatmap(pbmc, features = g)   NoLegend() 
  scale_fill_gradientn(colors = c("#2fa1dd", "white", "#f87669"))
DotPlot(pbmc, features = g,cols = "RdYlBu")  
  RotatedAxis()
#查看感兴趣的基因
VlnPlot(pbmc, features = g[1:3])
FeaturePlot(pbmc, features = g[1:4])

5 细胞注释

5.1 手动注释

代码语言:r复制
a = read.delim("../supp/markers.txt",header = F)
gt = split(a[,2],a[,1])

DotPlot(pbmc, features = gt,cols = "RdYlBu")  
  RotatedAxis()

将聚类得到的细胞群体重新命名的小技巧

代码语言:r复制
#打出以下内容,复制到新建的anno.tx文档里
#照着DotPlot图用unique(a$V1)的输出结果补全细胞类型
writeLines(paste0(0:11,","))
## 0,
## 1,
## 2,
## 3,
## 4,
## 5,
## 6,
## 7,
## 8,
## 9,
## 10,
## 11,
celltype = read.table("anno.txt",sep = ",") #自己照着DotPlot图填的
celltype
##    V1           V2
## 1   0  Naive CD4 T
## 2   1   CD14  Mono
## 3   2            B
## 4   3        CD8 T
## 5   4           NK
## 6   5  Naive CD4 T
## 7   6  Naive CD4 T
## 8   7 FCGR3A  Mono
## 9   8   CD14  Mono
## 10  9     Platelet
## 11 10           DC
## 12 11           DC
new.cluster.ids <- celltype$V2
names(new.cluster.ids) <- levels(pbmc)
seu.obj <- RenameIdents(pbmc, new.cluster.ids)
save(seu.obj,file = "seu.obj.Rdata")
p2 <- DimPlot(seu.obj, 
        reduction = "umap", 
        label = TRUE, 
        pt.size = 0.5)   NoLegend()
p2

5.2 自动注释

手动注释需要较多的背景知识,图方便的可以用自动注释,其较为粗糙。

自动注释的singleR

代码语言:r复制
library(celldex)
library(SingleR)
ls("package:celldex")
## [1] "BlueprintEncodeData"              "DatabaseImmuneCellExpressionData"
## [3] "HumanPrimaryCellAtlasData"        "ImmGenData"                      
## [5] "MonacoImmuneData"                 "MouseRNAseqData"                 
## [7] "NovershternHematopoieticData"
f = "../supp/single_ref/ref_BlueprintEncode.RData"
if(!file.exists(f)){
  ref <- celldex::BlueprintEncodeData()
  save(ref,file = f)
}
#ref兼容不同的变量名称
ref <- get(load(f))
library(BiocParallel)
scRNA = pbmc
test = scRNA@assays$RNA@layers$data
rownames(test) = Features(scRNA)
colnames(test) = Cells(scRNA)
pred.scRNA <- SingleR(test = test, 
                      ref = ref,
                      labels = ref$label.main, 
                      clusters = scRNA@active.ident)
# 查看注释成的哪些类型
pred.scRNA$pruned.labels
##  [1] "CD4  T-cells" "Monocytes"    "B-cells"      "CD8  T-cells" "NK cells"    
##  [6] "CD4  T-cells" "CD4  T-cells" "Monocytes"    "Monocytes"    "Monocytes"   
## [11] "DC"           "Monocytes"
#查看注释的准确性,每一列最黄的是注释结果
plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
levels(scRNA)
##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11"
scRNA <- RenameIdents(scRNA,new.cluster.ids)
levels(scRNA)
## [1] "CD4  T-cells" "Monocytes"    "B-cells"      "CD8  T-cells" "NK cells"    
## [6] "DC"
p3 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5)   NoLegend()
#对比下手动注释与自动注释
p2 p3

0 人点赞