import scanpy as sc
import anndata
import pandas as pd
import numpy as np
import os
import gc
import cell2location
import matplotlib as mpl
from matplotlib import rcParams
import matplotlib.pyplot as plt
import seaborn as sns
# silence scanpy that prints a lot of warnings
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
drive.mount('/content/drive')
import cell2location
results_folder = '/content/drive/MyDrive/Spatial/HK2529_ST'
ref_run_name = f'{results_folder}/reference_signatures'
run_name = f'{results_folder}/cell2location_map'
adata = sc.read_visium(path ='/content/drive/MyDrive/HK2529_ST/', count_file= "filtered_feature_bc_matrix.h5", load_images=True)
adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
sns.distplot(adata.obs["total_counts"], kde=False)
#sc.pp.filter_cells(adata, min_counts=5000)
#sc.pp.filter_cells(adata, max_counts=35000)
#adata = adata[adata.obs["pct_counts_mt"] < 50]
print(f"#cells after MT filter: {adata.n_obs}")
sc.pp.filter_genes(adata, min_cells=10)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added="clusters")
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added="clusters")
sc.pl.spatial(adata,color=["COL1A1", "NPHS1"], img_key=None, size=1,vmin=0, cmap='magma', vmax='p99.0', gene_symbols='SYMBOL')
adata_ref = sc.read ("/content/drive/MyDrive/adata_ref_withmodel_4.3.2023.h5ad")
# export estimated expression in each cluster
if 'means_per_cluster_mu_fg' in adata_ref.varm.keys():
inf_aver = adata_ref.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}'
for i in adata_ref.uns['mod']['factor_names']]].copy()
else:
inf_aver = adata_ref.var[[f'means_per_cluster_mu_fg_{i}'
for i in adata_ref.uns['mod']['factor_names']]].copy()
inf_aver.columns = adata_ref.uns['mod']['factor_names']
inf_aver.iloc[0:5, 0:5]
intersect = np.intersect1d(adata.var_names, inf_aver.index)
adata = adata[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()
# prepare anndata for cell2location model
cell2location.models.Cell2location.setup_anndata(adata=adata)
# can add additional slides to adata and add in a batch_key
mod = cell2location.models.Cell2location(
adata, cell_state_df=inf_aver,
# the expected average cell abundance: tissue-dependent
# hyper-prior which can be estimated from paired histology:
N_cells_per_location=30,
# hyperparameter controlling normalisation of
# within-experiment variation in RNA detection:
detection_alpha=20
)
mod.view_anndata_setup()
mod.train(max_epochs=30000,
# train using full data (batch_size=None)
batch_size=None,
# use all data points in training because
# we need to estimate cell abundance at all locations
train_size=1,
use_gpu=True)
# plot ELBO loss history during training, removing first 100 epochs from the plot
mod.plot_history(1000)
plt.legend(labels=['full data training']);
# In this section, we export the estimated cell abundance (summary of the posterior distribution).
adata = mod.export_posterior(
adata, sample_kwargs={'num_samples': 1000, 'batch_size': mod.adata.n_obs, 'use_gpu': True}
)
adata.obs[adata.uns['mod']['factor_names']] = adata.obsm['q05_cell_abundance_w_sf']
# Save model
mod.save(f"{run_name}", overwrite=True)
# mod = cell2location.models.Cell2location.load(f"{run_name}", adata)
# Save anndata object with results
adata_file = f"{run_name}/sp.h5ad"
adata.write(adata_file)
adata_file
# select one slide
from cell2location.utils import select_slide
sc.pl.spatial(adata, cmap='magma',
color=['Podo',"Endo_G","PEC", "Mes", "iPT", "GS_Stromal", "PT_S1", "PT_S2", "PT_S3", "Fibroblast_1", "Fibroblast_2", "DCT1" , "M_TAL", "DTL",
"CD4T", "CD8T", "Mac", "Dedifferentiated_Tubule"],
ncols=4, size=1.3,
img_key='hires',
# limit color scale at 99.2% quantile of cell abundance
vmin=0, vmax='p99.2'
)
sc.pp.neighbors(adata, use_rep='q05_cell_abundance_w_sf',
n_neighbors = 15)
sc.tl.leiden(adata, resolution=1.1)
adata.obs["region_cluster"] = adata.obs["leiden"].astype("category")
sc.tl.umap(adata, min_dist = 0.3, spread = 1)
sc.pl.spatial(adata, color=['region_cluster'],
size=1.3, img_key='hires', alpha=0.5)
# Set the value of the "sample" column for all rows to "HK3513_ST"
adata.obs['sample'] = "HK2529_ST"
# Print the updated "obs" attribute
print(adata.obs)
# Now we use cell2location plotter that allows showing multiple cell types in one panel
from cell2location.plt import plot_spatial
# select up to 6 clusters
clust_labels = [ 'Podo', 'PEC', "GS_Stromal", "Fibroblast_1"]
clust_col = ['' str(i) for i in clust_labels] # in case column names differ from labels
slide = select_slide(adata, 'HK2529_ST')
with mpl.rc_context({'figure.figsize': (15, 15)}):
fig = plot_spatial(
adata=slide,
# labels to show on a plot
color=clust_col, labels=clust_labels,
show_img=True,
# 'fast' (white background) or 'dark_background'
style='fast',
# limit color scale at 99.2% quantile of cell abundance
max_color_quantile=0.992,
# size of locations (adjust depending on figure size)
circle_diameter=12,
colorbar_position='right'
)
from cell2location import run_colocation
res_dict, adata = run_colocation(
adata,
model_name='CoLocatedGroupsSklearnNMF',
train_args={'n_fact': np.arange(5, 25), 'n_restarts': 3 })