过滤线粒体基因表达过高的细胞

2021-03-10 14:49:20 浏览数 (3)

因为损坏细胞和死细胞常表现出过大量的线粒体污染,因此需要过滤高线粒体基因表达的细胞。过滤原则为,移去线粒体基因表达比例过高的细胞,但是不能大量丢失样本细胞信息。

代码语言:javascript复制
library(Seurat)
library(tidyverse)
library(Matrix)

#加载所需R包,如果没有自行安装下

读入Rdata文件并查看Rdata文件保存的变量

代码语言:javascript复制
load("H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata")
attach("H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata")
# 注:路径改为自己文夹存放的路径
代码语言:javascript复制
## The following object is masked _by_ .GlobalEnv:
## 
##     object
代码语言:javascript复制
ls("file:H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata") 
代码语言:javascript复制
## [1] "object"

查看保存变量名后,读入Rdata文件

代码语言:javascript复制
load('H:/Scripts_scRNA_seq_2020/seurat_object.small.Rdata')
table(object@active.ident)
代码语言:javascript复制
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
## 100 100 100 100 100 100 100 100 100 100 100 100 100 100  33  92
代码语言:javascript复制
head(object@meta.data)
代码语言:javascript复制
##                    nCount_RNA nFeature_RNA percent.mito percent.ribo cluster
## B-AAACGAAGTGAGTAGC       4000         1284    10.525000    18.700000      12
## B-AAACGCTGTAGTCACT       7268         2502     9.506122    11.019397      10
## B-AAACGCTGTATTTCCT      10096         3035    21.018225     5.645800       3
## B-AAAGTGAAGTAACCTC       1741         1022     4.075775     6.142365       6
## B-AAAGTGACAGCACAAG       3720         1276     2.607527    15.967742      12
## B-AAAGTGAGTCGAGTGA      27745         5055     4.080014     9.511624      16
##                    sample
## B-AAACGAAGTGAGTAGC      B
## B-AAACGCTGTAGTCACT      B
## B-AAACGCTGTATTTCCT      B
## B-AAAGTGAAGTAACCTC      B
## B-AAAGTGACAGCACAAG      B
## B-AAAGTGAGTCGAGTGA      B

第一种方法:设置固定值过滤

代码语言:javascript复制
# Set threshold
# mitochondrial gene expression ratio must least than 20%
clean_data1 <- subset(object, percent.mito <= 20)
print(ncol(object))
代码语言:javascript复制
## [1] 1525
代码语言:javascript复制
print(ncol(clean_data1))
代码语言:javascript复制
## [1] 1328
代码语言:javascript复制
# Decreasing cell ratio is:
print(1 - ncol(clean_data1)/ncol(object))
代码语言:javascript复制
## [1] 0.1291803

第二种方法:设置固定值过滤并考虑线粒体基因表达分布情况

代码语言:javascript复制
Filter_high_mito_genes <- 
  function(mito_quantile, max_ratio){
    for (i in 1:length(mito_quantile)){
      while (mito_quantile[i] >= max_ratio){
        i = i - 1
        print(mito_quantile[i])
        return(m_raw[i])
        break
      }
    }
  }

# Set threshold
m_raw <- quantile(object$percent.mito, probs=seq(0,1,0.01)) 
m_raw_max <- Filter_high_mito_genes(m_raw, 20)
代码语言:javascript复制
##      87% 
## 19.87992
代码语言:javascript复制
clean_data2 <- subset(object, percent.mito <= m_raw_max)
print(ncol(object))
代码语言:javascript复制
## [1] 1525
代码语言:javascript复制
print(ncol(clean_data2))
代码语言:javascript复制
## [1] 1326
代码语言:javascript复制
# Decreasing cell ratio is:
print(1 - ncol(clean_data2)/ncol(object))
代码语言:javascript复制
## [1] 0.1304918

第三种方法:找寻离群值并删除

  • 使用绝对中位差(median absolute deviation, MAD)
代码语言:javascript复制
# MAD: https://eurekastatistics.com/using-the-median-absolute-deviation-to-find-outliers/
# Median absolute deviation
seurat_clean <- object
median(seurat_clean$percent.mito)
代码语言:javascript复制
## [1] 4.252895
代码语言:javascript复制
median(abs(seurat_clean$percent.mito-median(seurat_clean$percent.mito)))
代码语言:javascript复制
## [1] 3.02637
代码语言:javascript复制
mad(seurat_clean$percent.mito, constant=1)
代码语言:javascript复制
## [1] 3.02637
代码语言:javascript复制
mad_value <- mad(seurat_clean$percent.mito, constant=1)

第一种寻找离群值的方法

Hampel filter: https://towardsdatascience.com/outliers-detection-in-r-6c835f14e554Hampel filter认为在区间[median - 3*MAD, median 3*MAD]之外的值为离群值。考虑到能接受线粒体基因表达比例过低的细胞,因此,这里线粒体基因表达比例高于median 3*MAD的值认为是离群值。

代码语言:javascript复制
# I = [median - 3*MAD; median   3*MAD]
lower_bound <- median(seurat_clean$percent.mito - 3*mad_value)
upper_bound <- median(seurat_clean$percent.mito   3*mad_value)
# outlier_ind <- which(seurat_clean$percent.mito < lower_bound | seurat_clean$percent.mito > upper_bound)

# Set threshold
clean_data3_1 <- subset(object, percent.mito <= upper_bound)
print(ncol(object))
代码语言:javascript复制
## [1] 1525
代码语言:javascript复制
print(ncol(clean_data3_1))
代码语言:javascript复制
## [1] 1212
代码语言:javascript复制
# Decreasing cell ratio is:
print(1 - ncol(clean_data3_1)/ncol(object))
代码语言:javascript复制
## [1] 0.2052459

另一种寻找离群值的方法

  • 1 对线粒体基因表达分布计算MAD值
  • 2 根据MAD值计算MAD 偏差
  • 3 计算MAD偏差的中位数
  • 4 以MAD偏差的中位数中心化数据,拟合正态分布,过滤对p值BH检验后,p<0.01的细胞。
代码语言:javascript复制

# using a median-centered median absolute deviation (MAD)-variance normal distribution, 
# and then removed the cells with significantly higher expression levels than expected (determined by Benjamini-Hochberg corrected p < 0.01,
                                                                                                                                                                
# step1: calculate mad_value
# step2: calculate median mad variance
# step3: calculate median-centered mad variance

library(MASS)
代码语言:javascript复制
## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select
代码语言:javascript复制
mad_value <- mad(seurat_clean$percent.mito, constant=1)
median_mad_value <- median(seurat_clean$percent.mito-mad_value)
mad_data <- seurat_clean$percent.mito-median_mad_value
quantile(mad_data, probs=seq(0,1,0.01))
代码语言:javascript复制
##          0%          1%          2%          3%          4%          5% 
## -1.22652582 -1.12677189 -1.04505412 -0.96783664 -0.88353556 -0.81879921 
##          6%          7%          8%          9%         10%         11% 
## -0.73403847 -0.66443843 -0.56898173 -0.50345527 -0.40002066 -0.31144517 
##         12%         13%         14%         15%         16%         17% 
## -0.24071698 -0.16165378 -0.10054114 -0.02779893  0.04874940  0.10832668 
##         18%         19%         20%         21%         22%         23% 
##  0.16474711  0.25546482  0.32277906  0.37170908  0.45207025  0.53549713 
##         24%         25%         26%         27%         28%         29% 
##  0.65528358  0.70880294  0.77405797  0.84589355  0.92257368  0.97663224 
##         30%         31%         32%         33%         34%         35% 
##  1.11049555  1.17586328  1.27531613  1.34020968  1.40574423  1.46427697 
##         36%         37%         38%         39%         40%         41% 
##  1.52484548  1.60613395  1.68158378  1.77465127  1.86783711  1.94859642 
##         42%         43%         44%         45%         46%         47% 
##  2.10190300  2.17907731  2.32396704  2.46017357  2.56372110  2.70605979 
##         48%         49%         50%         51%         52%         53% 
##  2.77613932  2.87061865  3.02636956  3.17350690  3.33353077  3.45084425 
##         54%         55%         56%         57%         58%         59% 
##  3.65666815  3.77864615  3.94223341  4.15836488  4.37500052  4.51630880 
##         60%         61%         62%         63%         64%         65% 
##  4.76932730  4.99745931  5.29929727  5.60190488  5.79174774  6.01009750 
##         66%         67%         68%         69%         70%         71% 
##  6.25622972  6.52175657  6.86668520  7.20998333  7.53006017  7.82706009 
##         72%         73%         74%         75%         76%         77% 
##  8.16781243  8.45596040  8.93308566  9.59124010 10.08937106 10.79636937 
##         78%         79%         80%         81%         82%         83% 
## 11.41592714 11.76810435 12.36376280 13.17405483 14.26994801 15.17521921 
##         84%         85%         86%         87%         88%         89% 
## 15.90015886 16.84398507 17.66219670 18.65338941 19.79228322 21.11463974 
##         90%         91%         92%         93%         94%         95% 
## 22.60018875 24.04406053 26.04609465 29.19720742 33.30219617 37.55305544 
##         96%         97%         98%         99%        100% 
## 40.53573859 43.55046754 48.29564234 54.91861069 66.97004349
代码语言:javascript复制
fit <- fitdistr(mad_data, densfun='normal')
hist(mad_data, prob=T, main='')
curve(dnorm(x, fit$estimate[1], fit$estimate[2]), col='red', add=T)
代码语言:javascript复制
p_data <- pnorm(mad_data, mean=fit$estimate[1], sd=fit$estimate[2], lower.tail=F)
# quantile(p_data, probs=seq(0,1,0.01))
a_data <- p.adjust(p_data, method='BH')
# quantile(a_data, probs=seq(0,1,0.01))

# Set threshold
clean_data3_2 <- subset(object, cells=which(a_data > 0.01))
print(ncol(object))
代码语言:javascript复制
## [1] 1525
代码语言:javascript复制
print(ncol(clean_data3_2))
代码语言:javascript复制
## [1] 1502
代码语言:javascript复制
# Decreasing cell ratio is:
print(1 - ncol(clean_data3_2)/ncol(object))
代码语言:javascript复制
## [1] 0.01508197

总之,过滤细胞比例为1.5%到20.5%,具体线粒体基因表达比例的选择因样本而异,言之有理即可。

比较以上几种方法过滤后细胞线粒体基因分布情况

代码语言:javascript复制
mito_distribution <- function(seurat_object){
  metadata <- seurat_object@meta.data
  metadata %>% 
    ggplot(aes(color=sample, x=percent.mito, fill=sample))   
    geom_density(alpha = 0.2)   
    scale_x_continuous(breaks = seq(0, max(object$percent.mito), by=5))   
    theme_classic() 
}

p1 <- mito_distribution(object)
p2 <- mito_distribution(clean_data1)
p3 <- mito_distribution(clean_data2)
p4 <- mito_distribution(clean_data3_1)
p5 <- mito_distribution(clean_data3_2)
library(cowplot)
plot_grid(p1, p2, p3, p4, p5, labels=c('original', '1', '2', '3-1', '3-2'))

注:文中所用示例数据,可在公众号回复“线粒体”获得

0 人点赞