单细胞测序—GSE218208
上一篇帖子学习记录了Seurat官方给出的分析流程单细胞测序—基础分析流程,本篇文章以GSE218208为例,记录下实际的单细胞分析流程,更为简便,其与官方给出的流程略有不同。
1 读取数据与创建Seurat对象
从GEO上下载GSE218208的数据,存放在工作目录下,其数据是以tar包给出,这也是常见的组织形式。
代码语言:r复制untar("GSE218208_RAW.tar")
rm(list = ls())
a = data.table::fread("GSM6736629_10x-PBMC-1_ds0.1974_CountMatrix.tsv.gz",data.table = F)
#fread无法直接设置行名
a[1:4,1:4]
## alias:gene AAACCCAAGTAGGGTC AAACCCACACCATTCC AAACCCATCTACACTT
## 1 TSPAN6:ENSG00000000003 0 0 0
## 2 DPM1:ENSG00000000419 0 1 0
## 3 SCYL3:ENSG00000000457 0 0 0
## 4 C1orf112:ENSG00000000460 0 0 0
library(tidyverse)
#只保留基因名称,并去重
a$`alias:gene` = str_split(a$`alias:gene`,":",simplify = T)[,1]
#str_split_i(a$`alias:gene`,":",i = 1)
a = distinct(a,`alias:gene`,.keep_all = T)
#把基因的名字设置为行名
a = column_to_rownames(a,var = "alias:gene")
a[1:4,1:4]
## AAACCCAAGTAGGGTC AAACCCACACCATTCC AAACCCATCTACACTT AAACGAAAGCACGTCC
## TSPAN6 0 0 0 0
## DPM1 0 1 0 0
## SCYL3 0 0 0 0
## C1orf112 0 0 0 0
library(Seurat)
#CreateSeuratObject允许直接接受一个矩阵
pbmc <- CreateSeuratObject(counts = a,
project = "a",
min.cells = 3,
min.features = 200)
2 质控
对细胞进行的质控,指标是:
- 线粒体基因含量不能过高
- nFeature_RNA 不能过高或过低
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
head(pbmc@meta.data, 3)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCCAAGTAGGGTC a 10768 3213 7.030089
## AAACCCACACCATTCC a 4102 1676 5.046319
## AAACCCATCTACACTT a 4694 1740 6.305922
VlnPlot(pbmc,
features = c("nFeature_RNA",
"nCount_RNA",
"percent.mt"),
ncol = 3,pt.size = 0.5)
根据小提琴图估算阈值
代码语言:r复制pbmc = subset(pbmc,nFeature_RNA < 4200 &
nCount_RNA < 18000 &
percent.mt < 18)
3 高变基因 降维聚类分群
包括标准化、寻找高变基因、缩放数据、PCA、UMAP/t-SNE、聚类分群。
这里PCA默认的维度是15,适用绝大多数情况,可以用ElbowPlot(pbmc)
,看拐点验证。
f = "obj.Rdata"
if(!file.exists(f)){
pbmc = pbmc %>%
NormalizeData() %>%
FindVariableFeatures() %>%
ScaleData(features = rownames(.)) %>%
RunPCA(features = VariableFeatures(.)) %>%
FindNeighbors(dims = 1:15) %>%
FindClusters(resolution = 0.5) %>%
RunUMAP(dims = 1:15) %>%
RunTSNE(dims = 1:15)
save(pbmc,file = f)
}
可以验证下PCA维度,从图中可以看出,维度取前15是比较合适的
代码语言:r复制load(f)
ElbowPlot(pbmc)
代码语言:r复制p1 <- DimPlot(pbmc, reduction = "umap",label = T) NoLegend();p1
4 Marker基因
代码语言:r复制library(dplyr)
f = "markers.Rdata"
if(!file.exists(f)){
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE,min.pct = 0.25)
save(pbmc.markers,file = f)
}
load(f)
mks = pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)
#如果一个基因同时是两簇的marker基因的话下面的代码会报错,因此需要去重
g = unique(mks$gene)
#可视化
DoHeatmap(pbmc, features = g) NoLegend()
scale_fill_gradientn(colors = c("#2fa1dd", "white", "#f87669"))
DotPlot(pbmc, features = g,cols = "RdYlBu")
RotatedAxis()
#查看感兴趣的基因
VlnPlot(pbmc, features = g[1:3])
FeaturePlot(pbmc, features = g[1:4])
5 细胞注释
5.1 手动注释
代码语言:r复制a = read.delim("../supp/markers.txt",header = F)
gt = split(a[,2],a[,1])
DotPlot(pbmc, features = gt,cols = "RdYlBu")
RotatedAxis()
将聚类得到的细胞群体重新命名的小技巧
代码语言:r复制#打出以下内容,复制到新建的anno.tx文档里
#照着DotPlot图用unique(a$V1)的输出结果补全细胞类型
writeLines(paste0(0:11,","))
## 0,
## 1,
## 2,
## 3,
## 4,
## 5,
## 6,
## 7,
## 8,
## 9,
## 10,
## 11,
celltype = read.table("anno.txt",sep = ",") #自己照着DotPlot图填的
celltype
## V1 V2
## 1 0 Naive CD4 T
## 2 1 CD14 Mono
## 3 2 B
## 4 3 CD8 T
## 5 4 NK
## 6 5 Naive CD4 T
## 7 6 Naive CD4 T
## 8 7 FCGR3A Mono
## 9 8 CD14 Mono
## 10 9 Platelet
## 11 10 DC
## 12 11 DC
new.cluster.ids <- celltype$V2
names(new.cluster.ids) <- levels(pbmc)
seu.obj <- RenameIdents(pbmc, new.cluster.ids)
save(seu.obj,file = "seu.obj.Rdata")
p2 <- DimPlot(seu.obj,
reduction = "umap",
label = TRUE,
pt.size = 0.5) NoLegend()
p2
5.2 自动注释
手动注释需要较多的背景知识,图方便的可以用自动注释,其较为粗糙。
自动注释的singleR
代码语言:r复制library(celldex)
library(SingleR)
ls("package:celldex")
## [1] "BlueprintEncodeData" "DatabaseImmuneCellExpressionData"
## [3] "HumanPrimaryCellAtlasData" "ImmGenData"
## [5] "MonacoImmuneData" "MouseRNAseqData"
## [7] "NovershternHematopoieticData"
f = "../supp/single_ref/ref_BlueprintEncode.RData"
if(!file.exists(f)){
ref <- celldex::BlueprintEncodeData()
save(ref,file = f)
}
#ref兼容不同的变量名称
ref <- get(load(f))
library(BiocParallel)
scRNA = pbmc
test = scRNA@assays$RNA@layers$data
rownames(test) = Features(scRNA)
colnames(test) = Cells(scRNA)
pred.scRNA <- SingleR(test = test,
ref = ref,
labels = ref$label.main,
clusters = scRNA@active.ident)
# 查看注释成的哪些类型
pred.scRNA$pruned.labels
## [1] "CD4 T-cells" "Monocytes" "B-cells" "CD8 T-cells" "NK cells"
## [6] "CD4 T-cells" "CD4 T-cells" "Monocytes" "Monocytes" "Monocytes"
## [11] "DC" "Monocytes"
#查看注释的准确性,每一列最黄的是注释结果
plotScoreHeatmap(pred.scRNA, clusters=pred.scRNA@rownames, fontsize.row = 9,show_colnames = T)
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
levels(scRNA)
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11"
scRNA <- RenameIdents(scRNA,new.cluster.ids)
levels(scRNA)
## [1] "CD4 T-cells" "Monocytes" "B-cells" "CD8 T-cells" "NK cells"
## [6] "DC"
p3 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) NoLegend()
#对比下手动注释与自动注释
p2 p3