课后补充----关于单细胞空间基础分析的代码部分

2024-08-02 10:58:51 浏览数 (2)

作者,Evil Genius

我们本次2024年度的系列课程基础分析是没有放进来的,但是不是说明这部分不重要,相反,这部分是一切个性化分析的基础,我们来分享一下基础分析的代码部分

首先是scRNA/snRNA的基础分析,包括常见的基础质控 排污 双细胞 降维聚类差异 生成h5ad文件,当然了,代码是官网 经验 文献搜集而来。封装类的代码一般是公司在用,大家又能力可以自己封装一下,下面是代码示例,注意,是示例,需要大家根据自己需求稍作修改,有些地方需要注意。

代码语言:javascript复制
library (Seurat)
library (dplyr)
library (ggplot2)
library (harmony)
library (patchwork)
library (SoupX)
library (DoubletFinder)
library(reticulate)
library(sceasy)
library(anndata)
library(patchwork)
library(SeuratDisk)

#================================Ambient RNA Correction===============================#
Control1 = load10X('path/to/your/cellranger/outs/folder')
Control1 = autoEstCont(Control1)
Control1 = adjustCounts(Control1)
#Do it for all samples
#================================Doublet Removal===============================#
Control1.S <- CreateSeuratObject(counts = Control1, project = "Control1", min.cells = 3, min.features = 300) # create Seurat object
Control1.S <- NormalizeData(Control1.S)
Control1.S <- FindVariableFeatures(Control1.S, selection.method = "vst", nfeatures = 2000)
Control1.S <- ScaleData(Control1.S)
Control1.S <- RunPCA(Control1.S)
Control1.S <- RunUMAP(Control1.S, dims = 1:50)
sweep.res.list_kidney <- paramSweep_v3(Control1.S, PCs = 1:30, sct = FALSE)
sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE)
bcmvn_kidney <- find.pK(sweep.stats_kidney) #determine the pk
nExp_poi <- round(0.075*nrow(Control1.S@meta.data)) 
Control1.S <- doubletFinder_v3(seu_kidney, PCs = 1:30, pN = 0.25, pK = #depends on the previous sterp, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
Control1.S.Filtered <- subset(Control1.S, subset = #Doublet_related_column == "singlet")                                 
#Do it for all samples
#============================Merge Datssets======================================================#
HK_SC_RNA = merge (Control1.S.Filtered, y = c (#All other sc filtered objects))

#============================Add percent.mt to dataset======================================================#
HK_SC_RNA[["percent.mt"]] <- PercentageFeatureSet(HK_SC_RNA, pattern = "^MT-")

#============================Pre Processing======================================================#
HK_SC_RNA <- subset(HK_SC_RNA, subset = nFeature_RNA > 200 & nFeature_RNA < 3000 & percent.mt < 50 & percent.rpl < 15 & percent.rps < 15)

HK_SC_RNA <- FindVariableFeatures(object = HK_SC_RNA,selection.method = "vst")
HK_SC_RNA <- NormalizeData(HK_SC_RNA)
HK_SC_RNA <- ScaleData(HK_SC_RNA, vars.to.regress = c ("nCount_RNA", "percent.mt", "percent.rpl", "percent.rps"))
HK_SC_RNA <- RunPCA(HK_SC_RNA, assay = 'RNA', npcs = 30, features = VariableFeatures(object = HK_SC_RNA), 
                              verbose = TRUE, ndims.print = 1:5, nfeatures.print = 10)

#============= Run Harmony =============
HK_SC_RNA <- RunHarmony(HK_SC_RNA, group.by.vars = "orig.ident")
HK_SC_RNA <- RunUMAP(HK_SC_RNA, reduction = "harmony", dims = 1:30)
HK_SC_RNA <- FindNeighbors(HK_SC_RNA, reduction = "harmony", dims = 1:30) %>% FindClusters()                   
DimPlot(HK_SC_RNA, reduction = "umap", label = TRUE, pt.size = 0.5)   NoLegend()

#================================== DEG List and cell type annotation==============================#
HK_SC_RNA.Markers <- FindAllMarkers(HK_SC_RNA, only.pos = TRUE, logfc.threshold = 0.25)

#================================== Annotation and store data as Idents==============================#
HK_SC_RNA <- RenameIdents(HK_SC_RNA, `0` = "PT")

#==================================Convert to adata using sceasy ==============================#
HK_SC_RNA_anndata <- convertFormat(HK_SC_RNA, from="seurat", to="anndata", main_layer="counts", drop_single_values=FALSE)
write_h5ad(HK_SC_RNA_anndata, filename = "HK_SC_RNA_anndata.h5ad")

ATAC部分的基础分析

代码语言:javascript复制
library (Signac)
library(Seurat)
library (dplyr)
library(ggplot2)
library (harmony)
library (patchwork)
library (EnsDb.Hsapiens.v86)
library(rtracklayer)
library(reticulate)
library(sceasy)
library(anndata)
library(patchwork)
library(SeuratDisk)
#--------------------------Prepare Object---------------------------------#
annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)
genome(annotation) <- "hg38"
seqlevelsStyle(annotation) <- "UCSC"
counts <- Read10X_h5(filename = "/Your_Path/filtered_peak_bc_matrix.h5")
metadata <- read.csv(file = "/Your_Path/singlecell.csv", header = TRUE,row.names = 1)
fragpath <- "/Your_Path/outs/fragments.tsv.gz"
chromatin.assay <- CreateChromatinAssay(counts = counts, sep = c(":", "-"),fragments = fragpath,annotation = annotation)
Control1.S <- CreateSeuratObject(counts = chromatin.assay,assay = "peaks",meta.data = metadata)

#Annotation
hg38 <- import("/Your_Path_To_GTF_File/genes.gtf.gz")
genome(hg38) <- "hg38"
seqlevelsStyle(hg38) <- "UCSC"
hg38$gene_biotype <- hg38$gene_type
Annotation(Control1.S) <- hg38
#QC Filtering
Control1.S <- NucleosomeSignal(object = Control1.S)
Control1.S <- TSSEnrichment(Control1.S, fast = FALSE)
Control1.S$pct_reads_in_peaks <- Control1.S$peak_region_fragments / Control1.S$passed_filters * 100
Control1.S$blacklist_ratio <- Control1.S$blacklist_region_fragments / Control1.S$peak_region_fragments
Control1.S$high.tss <- ifelse(Control1.S$TSS.enrichment > 2, 'High', 'Low')
pdf("Viloin.HK.Fetal.Mito.pdf", width=6, height=4)
TSSPlot(Control1.S, group.by = 'high.tss')   NoLegend()
Control1.S$nucleosome_group <- ifelse(Control1.S$nucleosome_signal > 4, 'NS > 4', 'NS < 4')
FragmentHistogram(object = Control1.S, group.by = 'nucleosome_group')
pdf("HK2481_1.ATAC.QC.pdf", width=18, height=4)
VlnPlot(object = Control1.S, features = c('pct_reads_in_peaks', 'peak_region_fragments', 'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'), pt.size = 0.1, ncol = 5)
Control1.S <- subset(x = Control1.S, subset = peak_region_fragments > 3000 & peak_region_fragments < 20000 & pct_reads_in_peaks > 15 & blacklist_ratio < 0.05 & nucleosome_signal < 4 & TSS.enrichment > 2)
#Do this step for all 20 samples
#--------------------------Merge Datasets---------------------------------#
HK.SN.ATAC = merge (x = Control1.S, y = list(#list of all other objects))
#--------------------------Pre-processing---------------------------------#  
HK.SN.ATAC <- RunTFIDF(HK.SN.ATAC)
HK.SN.ATAC <- FindTopFeatures(HK.SN.ATAC, min.cutoff = 'q0')
HK.SN.ATAC <- RunSVD(HK.SN.ATAC)
HK.SN.ATAC <- RunHarmony(
  object = HK.SN.ATAC,
  group.by.vars = 'dataset',
  reduction = 'lsi',
  assay.use = 'peaks',
  project.dim = FALSE
)
HK.SN.ATAC <- RunUMAP(HK.SN.ATAC, dims = 2:50, reduction = 'harmony')
HK.SN.ATAC <- RunUMAP(HK.SN.ATAC, reduction = "harmony", dims = 2:50)
HK.SN.ATAC <- FindNeighbors(HK.SN.ATAC, reduction = "harmony", dims = 2:50) %>% FindClusters()
#--------------------------DAR---------------------------------#
DARs <- FindAllMarkers(object = HK.SN.ATAC, min.pct = 0.05, test.use = 'LR', latent.vars = 'peak_region_fragments')

HK_SN_ATAC_anndata <- convertFormat(HK_SN_ATAC_Filtered, from="seurat", to="anndata", main_layer="counts", drop_single_values=FALSE)
write_h5ad(HK_SN_ATAC_anndata, filename = "HK_SN_ATAC_anndata.h5ad")

空间基础分析部分 ,包括NMF寻找微环境的部分

代码语言:javascript复制
library (dplyr)
library (ggplot2)
library(SeuratData)
library (harmony)
library (patchwork)
library (Seurat)
options(stringsAsFactors = F)
library(CellTrek)
library(Seurat)
library(viridis)
library(ConsensusClusterPlus)
library(SpotClean)
library(S4Vectors)
library (STUtility)
set.seed(123)
# Load 10x Visium data
Control1.ST_raw <- read10xRaw("/path/to/matrix/folder")
Control1.ST_slide_info <- read10xSlide("/path/to/tissue/csv", 
                                  "/path/to/tissue/image", 
                                  "/path/to/scale/factor")

# Visualize raw data
Control1.ST_obj <- createSlide(count_mat = Control1.ST_raw, 
                          slide_info = Control1.ST_slide_info)
visualizeSlide(slide_obj = Control1.ST_obj)
visualizeHeatmap(Control1.ST_obj,rownames(Control1.ST_raw)[1])

# Decontaminate raw data
Control1.ST.decont_obj <- spotclean(Control1.ST_obj)

# Repeat this step for all 14 ST samples

#-----------------------------Preprocessing------------------------------#
Control1.ST <- SCTransform(Control1.ST_obj, assay = "Spatial", verbose = FALSE)
#Do it for all samples
#-----------------------------Merge All SCT spatial samples------------------------------#
HK.ST = merge (Control1.ST, y =c (#All the ST objects))
  
#-----------------------------Merge All SCT spatial samples------------------------------# 
HK.ST <- RunPCA(HK.ST, assay = "SCT", verbose = FALSE)
HK.ST <- RunHarmony(HK.ST, group.by.vars = "orig.ident")
HK.ST <- RunUMAP(HK.ST, reduction = "harmony", dims = 1:20)
HK.ST <- FindNeighbors(HK.ST, reduction = "harmony", dims = 1:20) %>% FindClusters()

#-----------------------------Find DEGs between Clusters------------------------------# 
HK.ST <- FindAllMarkers(HK.ST, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

#-----------------------------Run NMF for microenvironments------------------------------# 
RunNMF(
 HK.ST,
 features = NULL,
  nfactors = 20,
  rescale = TRUE,
  reduction.name = "NMF")
HK.ST <- RunHarmony(HK.ST, group.by.vars = "orig.ident", reduction = "NMF")  
HK.ST <- RunUMAP(HK.ST, reduction = "harmony", dims = 1:20)
HK.ST <- FindNeighbors(HK.ST, reduction = "harmony", dims = 1:20) %>% FindClusters()
#-----------------------------Find DEGs between MEs------------------------------# 
HK.ST <- FindAllMarkers(HK.ST, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

#-----------------------------------Mapping Back snRNA-seq, scRNA-seq, and snATAC-seq to Spatial dataset----------------------#
HK.SC.SN.ATAC <- HK.SC.SN.ATAC[, sample(colnames(HK.SN), size =20000, replace=F)] #Downsample the annotated snRNA-seq matrix
Control1.ST = Load10X_Spatial("/Your_10X-Folder/outs", filename = "filtered_feature_bc_matrix.h5", assay = "Spatial", slice = "slice1")
Control1.ST <- RenameCells(Control1.ST, new.names=make.names(Cells(Control1.ST)))
HK.SC.SN.ATAC <- RenameCells(HK.SC.SN.ATAC, new.names=make.names(Cells(HK.SC.SN.ATAC)))
Coltrol1.ST.Samples_traint <- CellTrek::traint(st_data=Control1.ST, sc_data=HK.SC.SN.ATAC, sc_assay='RNA', cell_names='seurat_clusters')
Control1.ST.HK.SN_celltrek <- CellTrek::celltrek(st_sc_int=Coltrol1.ST.Samples_traint, int_assay='traint', sc_data=HK.SC.SN.ATAC, sc_assay = 'RNA', reduction='pca', intp=T, intp_pnt=5000, intp_lin=F, nPCs=30, ntree=1000, dist_thresh=0.55, top_spot=5, spot_n=5, repel_r=20, repel_iter=20, keep_model=T)$celltrek
Control1.ST.HK.SN_celltrek$seurat_clusters <- factor(Control1.ST.HK.SN_celltrek$seurat_clusters, levels=sort(Control1.ST.HK.SN_celltrek$seurat_clusters)))
Idents = Control1.ST.HK.SN_celltrek[["Idents2"]]
Idents (Control1.ST.HK.SN_celltrek) = Idents

基础的解卷积部分,cell2location 共定位

代码语言:javascript复制
import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc

import cell2location

import matplotlib as mpl
from matplotlib import rcParams
import matplotlib.pyplot as plt
import seaborn as sns

# silence scanpy that prints a lot of warnings
import warnings
warnings.filterwarnings('ignore')


from google.colab import drive
drive.mount('/content/drive')


import cell2location

results_folder = '/content/drive/MyDrive/Spatial/HK2529_ST'
ref_run_name = f'{results_folder}/reference_signatures'
run_name = f'{results_folder}/cell2location_map'


adata = sc.read_visium(path ='/content/drive/MyDrive/HK2529_ST/', count_file= "filtered_feature_bc_matrix.h5", load_images=True)

adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)


sns.distplot(adata.obs["total_counts"], kde=False)


#sc.pp.filter_cells(adata, min_counts=5000)
#sc.pp.filter_cells(adata, max_counts=35000) 
#adata = adata[adata.obs["pct_counts_mt"] < 50]
print(f"#cells after MT filter: {adata.n_obs}")
sc.pp.filter_genes(adata, min_cells=10)


sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added="clusters")


sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added="clusters")


sc.pl.spatial(adata,color=["COL1A1", "NPHS1"], img_key=None, size=1,vmin=0, cmap='magma', vmax='p99.0', gene_symbols='SYMBOL')


adata_ref = sc.read ("/content/drive/MyDrive/adata_ref_withmodel_4.3.2023.h5ad")

# export estimated expression in each cluster
if 'means_per_cluster_mu_fg' in adata_ref.varm.keys():
    inf_aver = adata_ref.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}' 
                                    for i in adata_ref.uns['mod']['factor_names']]].copy()
else:
    inf_aver = adata_ref.var[[f'means_per_cluster_mu_fg_{i}' 
                                    for i in adata_ref.uns['mod']['factor_names']]].copy()
inf_aver.columns = adata_ref.uns['mod']['factor_names']
inf_aver.iloc[0:5, 0:5]


intersect = np.intersect1d(adata.var_names, inf_aver.index)
adata = adata[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()

# prepare anndata for cell2location model
cell2location.models.Cell2location.setup_anndata(adata=adata)
# can add additional slides to adata and add in a batch_key


mod = cell2location.models.Cell2location(
    adata, cell_state_df=inf_aver, 
    # the expected average cell abundance: tissue-dependent 
    # hyper-prior which can be estimated from paired histology:
    N_cells_per_location=30,
    # hyperparameter controlling normalisation of
    # within-experiment variation in RNA detection:
    detection_alpha=20
) 
mod.view_anndata_setup()


mod.train(max_epochs=30000, 
          # train using full data (batch_size=None)
          batch_size=None, 
          # use all data points in training because 
          # we need to estimate cell abundance at all locations
          train_size=1,
          use_gpu=True)

# plot ELBO loss history during training, removing first 100 epochs from the plot
mod.plot_history(1000)
plt.legend(labels=['full data training']);



# In this section, we export the estimated cell abundance (summary of the posterior distribution).
adata = mod.export_posterior(
    adata, sample_kwargs={'num_samples': 1000, 'batch_size': mod.adata.n_obs, 'use_gpu': True}
)

adata.obs[adata.uns['mod']['factor_names']] = adata.obsm['q05_cell_abundance_w_sf']


# Save model
mod.save(f"{run_name}", overwrite=True)

# mod = cell2location.models.Cell2location.load(f"{run_name}", adata)

# Save anndata object with results
adata_file = f"{run_name}/sp.h5ad"
adata.write(adata_file)
adata_file



# select one slide
from cell2location.utils import select_slide
sc.pl.spatial(adata, cmap='magma',
                  color=['Podo',"Endo_G","PEC", "Mes", "iPT", "GS_Stromal", "PT_S1", "PT_S2", "PT_S3", "Fibroblast_1", "Fibroblast_2", "DCT1" , "M_TAL", "DTL",
                         "CD4T", "CD8T", "Mac", "Dedifferentiated_Tubule"], 
                  ncols=4, size=1.3, 
                  img_key='hires',
                  # limit color scale at 99.2% quantile of cell abundance
                  vmin=0, vmax='p99.2' 
                 )
                 
                 
                 sc.pp.neighbors(adata, use_rep='q05_cell_abundance_w_sf',
                n_neighbors = 15)
sc.tl.leiden(adata, resolution=1.1)
adata.obs["region_cluster"] = adata.obs["leiden"].astype("category")
sc.tl.umap(adata, min_dist = 0.3, spread = 1)


sc.pl.spatial(adata, color=['region_cluster'],
                  size=1.3, img_key='hires', alpha=0.5)
                  
                  
                  # Set the value of the "sample" column for all rows to "HK3513_ST"
adata.obs['sample'] = "HK2529_ST"

# Print the updated "obs" attribute
print(adata.obs)


# Now we use cell2location plotter that allows showing multiple cell types in one panel
from cell2location.plt import plot_spatial

# select up to 6 clusters
clust_labels = [ 'Podo', 'PEC', "GS_Stromal", "Fibroblast_1"]
clust_col = [''   str(i) for i in clust_labels] # in case column names differ from labels

slide = select_slide(adata, 'HK2529_ST')

with mpl.rc_context({'figure.figsize': (15, 15)}):
    fig = plot_spatial(
        adata=slide,
        # labels to show on a plot
        color=clust_col, labels=clust_labels,
        show_img=True,
        # 'fast' (white background) or 'dark_background'
        style='fast',
        # limit color scale at 99.2% quantile of cell abundance
        max_color_quantile=0.992,
        # size of locations (adjust depending on figure size)
        circle_diameter=12,
        colorbar_position='right'
    )
    
    from cell2location import run_colocation
res_dict, adata = run_colocation(
    adata,
    model_name='CoLocatedGroupsSklearnNMF',
    train_args={'n_fact': np.arange(5, 25), 'n_restarts': 3 })

生活很好,有你更好

0 人点赞