作者,Evil Genius
速率
- Probabilistic modeling of RNA velocity
- Direct modeling of raw spliced and unspliced read count
- Multiple uncertainty diagnostics analysis and visualizations
- Synchronized cell time estimation across genes
- Multivariate denoised gene expression and velocity prediction
image
image
image
image
实现目标
image
整体框架
1、前处理
代码语言:javascript复制import scvelo as scv
adata = scv.read("local_file.h5ad")
adata.layers['raw_spliced'] = adata.layers['spliced']
adata.layers['raw_unspliced'] = adata.layers['unspliced']
adata.obs['u_lib_size_raw'] = adata.layers['raw_unspliced'].toarray().sum(-1)
adata.obs['s_lib_size_raw'] = adata.layers['raw_spliced'].toarray().sum(-1)
scv.pp.filter_and_normalize(adata, min_shared_counts=30, n_top_genes=2000)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
2、训练模型
代码语言:javascript复制from pyrovelocity.api import train_model
from pyrovelocity.plot import vector_field_uncertainty
num_epochs = 1000 # large data
# num_epochs = 4000 # small data
# Model 1
adata_model_pos = train_model(adata,
max_epochs=num_epochs, svi_train=True, log_every=100,
patient_init=45,
batch_size=4000, use_gpu=0, cell_state='state_info',
include_prior=True,
offset=False,
library_size=True,
patient_improve=1e-3,
model_type='auto',
guide_type='auto_t0_constraint',
train_size=1.0)
# Or Model 2
adata_model_pos = train_model(adata,
max_epochs=num_epochs, svi_train=True, log_every=100,
patient_init=45,
batch_size=4000, use_gpu=0, cell_state='state_info',
include_prior=True,
offset=True,
library_size=True,
patient_improve=1e-3,
model_type='auto',
guide_type='auto',
train_size=1.0)
# adata_model_pos is a returned list in which 0th element is the trained model,
# the 1st element is the posterior samples of all random variables
trained_model = adata_model_pos[0]
posterior_samples = adata_model_pos[1]
v_map_all, embeds_radian, fdri = vector_field_uncertainty(
adata,
posterior_samples=posterior_samples,
basis="umap"
)
save_res = True
if save_res:
trained_model.save('saved_model', overwrite=True)
result_dict = {"adata_model_pos": posterior_samples,
"v_map_all": v_map_all,
"embeds_radian": embeds_radian, "fdri": fdri} #, "embed_mean": embed_mean}
import pickle
with open("posterior_samples.pkl", "wb") as f:
pickle.dump(result_dict, f)
3、速率估计
代码语言:javascript复制from pyrovelocity.plot import plot_state_uncertainty
from pyrovelocity.plot import plot_posterior_time, plot_gene_ranking,
vector_field_uncertainty, plot_vector_field_uncertain,
plot_mean_vector_field, project_grid_points,rainbowplot,denoised_umap,
us_rainbowplot, plot_arrow_examples, set_colorbar
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
embedding = 'umap' # change to umap or tsne based on your embedding method
# This generates the posterior samples of all vector fields
# and statistical testing results from Rayleigh test
v_map_all, embeds_radian, fdri = vector_field_uncertainty(adata, posterior_samples,
basis=embedding, denoised=False, n_jobs=30)
fig, ax = plt.subplots()
# This returns the posterior mean of the vector field
embed_mean = plot_mean_vector_field(posterior_samples, adata, ax=ax, n_jobs=30, basis=embedding)
# This plot single-cell level vector field uncertainty
# and averaged cell vector field uncertainty on the grid points
# based on angular standard deviation
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(11.5, 5)
plot_vector_field_uncertain(adata, embed_mean, embeds_radian,
ax=ax,
fig=fig, cbar=False, basis=embedding, scale=None)
# This generates shared time uncertainty plot with contour lines
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(12, 2.8)
adata.obs['shared_time_uncertain'] = posterior_samples['cell_time'].std(0).flatten()
ax_cb = scv.pl.scatter(adata, c='shared_time_uncertain', ax=ax[0], show=False, cmap='inferno', fontsize=7, s=20, colorbar=True, basis=embedding)
select = adata.obs['shared_time_uncertain'] > np.quantile(adata.obs['shared_time_uncertain'], 0.9)
sns.kdeplot(adata.obsm[f'X_{embedding}'][:, 0][select],
adata.obsm[f'X_{embedding}'][:, 1][select],
ax=ax[0], levels=3, fill=False)
# This generates vector field uncertainty based on Rayleigh test.
adata.obs.loc[:, 'vector_field_rayleigh_test'] = fdri
im = ax[1].scatter(adata.obsm[f'X_{embedding}'][:, 0],
adata.obsm[f'X_{embedding}'][:, 1], s=3, alpha=0.9,
c=adata.obs['vector_field_rayleigh_test'], cmap='inferno_r',
linewidth=0)
set_colorbar(im, ax[1], labelsize=5, fig=fig, position='right')
select = adata.obs['vector_field_rayleigh_test'] > np.quantile(adata.obs['vector_field_rayleigh_test'], 0.95)
sns.kdeplot(adata.obsm[f'X_{embedding}'][:, 0][select],
adata.obsm[f'X_{embedding}'][:, 1][select], ax=ax[1], levels=3, fill=False)
ax[1].axis('off')
ax[1].set_title("vector fieldnrayleigh testnfdr<0.05: %s%%" % (round((fdri < 0.05).sum()/fdri.shape[0], 2)*100), fontsize=7)
4、可视化
代码语言:javascript复制fig = plt.figure(figsize=(7.07, 4.5))
subfig = fig.subfigures(1, 2, wspace=0.0, hspace=0, width_ratios=[1.6, 4])
ax = fig.subplots(1)
# This generates the selected cell fate markers and output in DataFrame
volcano_data, _ = plot_gene_ranking([posterior_samples], [adata], ax=ax,
time_correlation_with='st', assemble=True)
# This generates the rainbow plots for the selected markers.
_ = rainbowplot(volcano_data, adata, posterior_samples,
subfig[1], data=['st', 'ut'], num_genes=4)
方法示例
代码语言:javascript复制%load_ext autoreload
%autoreload 2
import scvelo as scv
import numpy as np
import torch
import matplotlib.pyplot as plt
from pyrovelocity.data import load_data
from scipy.stats import spearmanr, pearsonr
from pyrovelocity.api import train_model
import seaborn as sns
import pandas as pd
from pyrovelocity.plot import plot_posterior_time, plot_gene_ranking,
vector_field_uncertainty, plot_vector_field_uncertain,
plot_mean_vector_field, project_grid_points,rainbowplot,denoised_umap,
us_rainbowplot, plot_arrow_examples
from pyrovelocity.utils import mae, mae_evaluate
adata = load_data(top_n=2000, min_shared_counts=30)
模型训练
代码语言:javascript复制adata_model_pos_split = train_model(adata, max_epochs=4000, svi_train=False, log_every=100,
patient_init=45, batch_size=-1, use_gpu=1,
include_prior=True, offset=False, library_size=True,
patient_improve=1e-4, guide_type='auto_t0_constraint', train_size=0.67)
mae_evaluate(adata_model_pos_split, adata)
adata_model_pos = train_model(adata, max_epochs=4000, svi_train=False, log_every=100,
patient_init=45, batch_size=-1, use_gpu=0,
include_prior=True, offset=False, library_size=True,
patient_improve=1e-4, guide_type='auto_t0_constraint', train_size=1.0)
mae_evaluate(adata_model_pos[1], adata)
v_map_all, embeds_radian, fdri = vector_field_uncertainty(adata, adata_model_pos[1], basis='umap', n_jobs=10)
可视化
代码语言:javascript复制fig, ax = plt.subplots()
embed_mean = plot_mean_vector_field(adata_model_pos[1], adata, ax=ax, n_jobs=10)
图片排列
代码语言:javascript复制fig = plt.figure(figsize=(7.07, 4.5))
subfig_A = fig.subfigures(2, 1, wspace=0.0, hspace=0, height_ratios=[1.2, 3])
dot_size = 3
font_size = 7
ress = pd.DataFrame({"cell_type": adata.obs['clusters'].values,
"X1": adata.obsm['X_umap'][:,0],
"X2": adata.obsm['X_umap'][:,1]})
subfig_A0 = subfig_A[0].subfigures(1, 2, wspace=0.0, hspace=0, width_ratios=[4, 2])
ax = subfig_A0[0].subplots(1, 4)
sns.scatterplot(x='X1', y='X2', data=ress, alpha=0.9, s=dot_size,
linewidth=0, edgecolor="none", hue='cell_type',
ax=ax[0], legend='brief')
ax[0].axis('off')
ax[0].set_title("Cell typesn", fontsize=font_size)
ax[0].legend(bbox_to_anchor=[2.9, -0.01], ncol=4, prop={'size': font_size}, fontsize=font_size, frameon=False)
kwargs = dict(color='gray', density=.8, add_margin=.1, s=dot_size,
show=False, alpha=.2, min_mass=3.5, frameon=False)
scv.pl.velocity_embedding_stream(adata, fontsize=font_size, ax=ax[1], title='', **kwargs)
ax[1].set_title("Scvelon", fontsize=7)
scv.pl.velocity_embedding_stream(adata, fontsize=font_size, basis='umap',
title='', ax=ax[2], vkey='velocity_pyro', **kwargs)
ax[2].set_title("Pyro-Velocityn", fontsize=7)
# plot_arrow_examples(adata_train, np.transpose(v_map_all, (1, 2, 0)), embeds_radian, ax=ax[3],
# n_sample=30, fig=fig, basis='umap', scale=0.004, alpha=0.18)
plot_arrow_examples(adata, np.transpose(v_map_all, (1, 2, 0)), embeds_radian, ax=ax[3],
n_sample=30, fig=fig, basis='umap', scale=0.008, alpha=0.2, index=5,
scale2=0.015, num_certain=6)
ax[3].set_title("Single cellnvector field", fontsize=7)
plot_vector_field_uncertain(adata, embed_mean, embeds_radian,
fig=subfig_A0[1], cbar=True, basis='umap', scale=0.05)
subfig_A0[0].subplots_adjust(hspace=0.2, wspace=0.1, left=0.01, right=0.99, top=0.99, bottom=0.35)
subfig_A0[1].subplots_adjust(hspace=0.2, wspace=0.1, left=0.01, right=0.99, top=0.99, bottom=0.35)
subfig_A[0].text(-0.06, 0.58, "Pancreas", size=7, rotation="vertical", va="center")
subfig_B = subfig_A[1].subfigures(1, 2, wspace=0.0, hspace=0, width_ratios=[1.6, 4])
##subfig_B[0].subplots_adjust(hspace=0.2, wspace=0.1, left=0.01, right=0.99, top=0.99, bottom=0.1)
ax = subfig_B[0].subplots(2, 1)
plot_posterior_time(adata_model_pos[1], adata, ax=ax[0], fig=subfig_B[0], addition=False)
subfig_B[0].subplots_adjust(hspace=0.3, wspace=0.1, left=0.01, right=0.8, top=0.92, bottom=0.17)
volcano_data2, _ = plot_gene_ranking([adata_model_pos[1]], [adata], ax=ax[1],
time_correlation_with='st', assemble=True)
_ = rainbowplot(volcano_data2, adata, adata_model_pos[1], subfig_B[1], data=['st', 'ut'], num_genes=4)
image
代码语言:javascript复制fig.savefig("pancreas_output_figure1.tif", facecolor=fig.get_facecolor(), bbox_inches='tight', edgecolor='none', dpi=300)
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(11.5, 5)
plot_vector_field_uncertain(adata, embed_mean, embeds_radian,
ax=ax,
fig=fig, cbar=False, basis='umap', scale=0.05)
image
More uncertainty evaluation metrics
代码语言:javascript复制from pyrovelocity.plot import plot_state_uncertainty
fig, ax = plt.subplots(2, 3)
fig.set_size_inches(7.07, 3.5)
pos = adata_model_pos[1]
bin = 30
adata.obs['cell_time_uncertain'] = adata_model_pos[1]['cell_time'].std(0).flatten()
scv.pl.scatter(adata, c='cell_time_uncertain', ax=ax[0][0], show=False, cmap='inferno', fontsize=7)
_ = ax[1][0].hist(adata.obs.cell_time_uncertain, bins=bin, color='black', alpha=0.9)
ax[1][0].set_xlabel("shared timenstandard deviation", fontsize=7)
select = adata.obs['cell_time_uncertain'] > np.quantile(adata.obs['cell_time_uncertain'], 0.9)
sns.kdeplot(adata.obsm['X_umap'][:, 0][select],
adata.obsm['X_umap'][:, 1][select], ax=ax[0][0], levels=3, fill=False)
adata.obs['vector_field_rayleigh_test'] = fdri
scv.pl.scatter(adata, c='vector_field_rayleigh_test', ax=ax[0][1], show=False, cmap='inferno_r', fontsize=7)
select = adata.obs['vector_field_rayleigh_test'] > np.quantile(adata.obs['vector_field_rayleigh_test'], 0.9)
sns.kdeplot(adata.obsm['X_umap'][:, 0][select],
adata.obsm['X_umap'][:, 1][select], ax=ax[0][1], levels=3, fill=False)
_ = ax[1][1].hist(adata.obs.vector_field_rayleigh_test, bins=bin, color='black', alpha=0.9)
ax[1][1].set_xlabel("vector fieldnrayleigh test fdr", fontsize=7)
ax[1][1].text(0.08, 2000, "<1%% FDR:%s%%" % int((fdri < 0.01).sum()/fdri.shape[0]*100), fontsize=7)
ax[1][1].text(0.08, 1500, "<5%% FDR:%s%%" % int((fdri < 0.05).sum()/fdri.shape[0]*100), fontsize=7)
fig.subplots_adjust(hspace=0.3, wspace=0.7, left=0.01, right=0.8, top=0.92, bottom=0.17)
plot_state_uncertainty(pos, adata, kde=False, data='denoised', top_percentile=0.9, ax=ax[0][2])
_ = ax[1][2].hist(adata.obs['state_uncertain'], bins=bin, color='black', alpha=0.9)
ax[1][2].set_xlabel("averaged state uncertainty", fontsize=7)
select = adata.obs['state_uncertain'] > np.quantile(adata.obs['state_uncertain'], 0.9)
sns.kdeplot(adata.obsm['X_umap'][:, 0][select],
adata.obsm['X_umap'][:, 1][select], ax=ax[0][2], levels=3, fill=False)
ax[1][0].set_ylabel("Frequency", fontsize=7)
fig.tight_layout()