diff --git a/.gitignore b/.gitignore index a2dcf092c..2a0b463d2 100755 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,10 @@ docs/build docs/source/dynamo* docs/source/_autosummary +docs/generated/ +docs/_build/ +docs/api/reference/*rst + # always-ignore directories /build/ /dist/ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..31ac15d16 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "docs/tutorials/notebooks"] + path = docs/tutorials/notebooks + url = https://github.com/aristoteleo/dynamo-tutorials.git diff --git a/.readthedocs.yml b/.readthedocs.yml index 4a85e4e60..0cb602e76 100755 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -9,7 +9,7 @@ build: # Build documentation in the docs/ directory with Sphinx sphinx: - configuration: docs/source/conf.py + configuration: docs/conf.py # python: # version: 3.7 @@ -23,4 +23,9 @@ python: extra_requirements: - docs # - method: setuptools - # path: package \ No newline at end of file + # path: package + +submodules: + include: + - "docs/tutorials/notebooks" + recursive: true \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 345ead81c..fda61cce8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,15 +1,340 @@ -Released -========== -v1.0.0 Release: +# Release Notes -Added for new features. +## Dynamo Ver 1.4.1 -Changed for changes in existing functionality. +### DEBUG -Deprecated for soon-to-be removed features. +- Debug and refactor scPotential ([PR 624](https://github.com/aristoteleo/dynamo-release/pull/624)). +- Replace deprecated `np.asscalar()` with `np.ndarray.item()` ([PR 643](https://github.com/aristoteleo/dynamo-release/pull/643)). +- Create chunk option for normalization and gene selection ([PR 598](https://github.com/aristoteleo/dynamo-release/pull/598)). +- Debug `pd.state_graph()` ([PR 630](https://github.com/aristoteleo/dynamo-release/pull/630)). +- Debug `pl.jacobian_heatmap()` ([PR 653](https://github.com/aristoteleo/dynamo-release/pull/653)). 
+- Debug `pl.nneighbors()` ([PR 644](https://github.com/aristoteleo/dynamo-release/pull/644)). +- Retry codecov upload ([PR 656](https://github.com/aristoteleo/dynamo-release/pull/656)). +- Debug vectorfield given layer input ([PR 619](https://github.com/aristoteleo/dynamo-release/pull/619)). +- Debug simulation module ([PR 658](https://github.com/aristoteleo/dynamo-release/pull/658)). +- Extra filter after pearson residuals normalization ([PR 665](https://github.com/aristoteleo/dynamo-release/pull/665)). +- Add missing return value to deprecated functions ([PR 663](https://github.com/aristoteleo/dynamo-release/pull/663)). +- Debug networks plot ([PR 657](https://github.com/aristoteleo/dynamo-release/pull/657)). +- Implement `pl.plot_connectivity()` ([PR 652](https://github.com/aristoteleo/dynamo-release/pull/652)). +- Debug the preprocessing of integer matrix input ([PR 664](https://github.com/aristoteleo/dynamo-release/pull/664)). +- Missing return value in `pl.lap_min_time()` ([PR 668](https://github.com/aristoteleo/dynamo-release/pull/668)). +- Update matplotlib `Colorbar.draw_all()` to `Colorbar._draw_all()` ([PR 669](https://github.com/aristoteleo/dynamo-release/pull/669)). +- Optimize code coverage tests ([PR 605](https://github.com/aristoteleo/dynamo-release/pull/605)). +- Debug `test_gradop()` ([PR 677](https://github.com/aristoteleo/dynamo-release/pull/677)). +- Constraint on matplotlib version ([PR 679](https://github.com/aristoteleo/dynamo-release/pull/679)). +- Upgrade code coverage to v4 ([PR 684](https://github.com/aristoteleo/dynamo-release/pull/684)). +- Init a branch for updating dependency ([PR 690](https://github.com/aristoteleo/dynamo-release/pull/690)). +- Replace `louvain` with `leiden` ([PR 692](https://github.com/aristoteleo/dynamo-release/pull/692)). +- Debug `pl.highest_frac_genes()` ([PR 681](https://github.com/aristoteleo/dynamo-release/pull/681)). 
+- Deprecate more sparse matrix `.A` attributes ([PR 695](https://github.com/aristoteleo/dynamo-release/pull/695)). +- Fix matplotlib version issues and a circular import issue ([PR 686](https://github.com/aristoteleo/dynamo-release/pull/686)). +- Debug `set_figure_params()` ([PR 698](https://github.com/aristoteleo/dynamo-release/pull/698)). +- Debug: shape and name mismatch in cell-wise alpha saving ([PR 697](https://github.com/aristoteleo/dynamo-release/pull/697)). +- Debug: The sizes of the scatter plots are not set correctly ([PR 696](https://github.com/aristoteleo/dynamo-release/pull/696)). -Removed for now removed features. +### Others -Fixed for any bug fixes. +- Refactor `pd.fate()` with Trajectory class ([PR 645](https://github.com/aristoteleo/dynamo-release/pull/645)). +- Reorganize estimation module ([PR 662](https://github.com/aristoteleo/dynamo-release/pull/662)). +- Refactor `pl.scatters()` and `pl.scatters_interactive()` ([PR 654](https://github.com/aristoteleo/dynamo-release/pull/654)). +- Refactor `vf.VectorField()` function ([PR 620](https://github.com/aristoteleo/dynamo-release/pull/620)). +- Docstring and type hints for the prediction module ([PR 666](https://github.com/aristoteleo/dynamo-release/pull/666)). +- Update docstring and type hints for External module ([PR 661](https://github.com/aristoteleo/dynamo-release/pull/661)). +- Add docstring and type hints for simulation module ([PR 660](https://github.com/aristoteleo/dynamo-release/pull/660)). +- Docstring and type hints for root folder python files ([PR 667](https://github.com/aristoteleo/dynamo-release/pull/667)). -Security in case of vulnerabilities. +## Dynamo Ver 1.4.0 + +### Feature Changes + +- Shiny web application for in silico perturbation and least square action path analyses + ([PR 582](https://github.com/aristoteleo/dynamo-release/pull/582)). 
+ +- More 3D plots ([PR 597](https://github.com/aristoteleo/dynamo-release/pull/597)): + + - 3D scatters with Plotly and Pyvista `dyn.pl.scatters_interactive()`. + - 3D vectors with Plotly and Pyvista `dyn.pl.cell_wise_vectors_3d()`. + - 3D topography with Plotly and Pyvista `dyn.pl.topography_3d()`. + - 3D animation with Pyvista `dyn.mv.PyvistaAnim()`. + +- Saved the velocity parameters in `adata.varm` instead of `adata.var` + ([PR 579](https://github.com/aristoteleo/dynamo-release/pull/579)). + +- DDRtree based pseudotime and graph learning ([PR 564](https://github.com/aristoteleo/dynamo-release/pull/564)): + `dyn.tl.order_cells()`, `dyn.tl.construct_velocity_tree()`. + +- Integrated `hnswlib` fast nearest neighbors method ([PR 552](https://github.com/aristoteleo/dynamo-release/pull/552)). + +- A helper function to convert the AnnData object from Dynamo to Scvelo, or vice versa + ([PR 551](https://github.com/aristoteleo/dynamo-release/pull/551)). + +- The tools module has been reorganized ([PR 625](https://github.com/aristoteleo/dynamo-release/pull/625)): + + - Deprecate files `dynamo_fitting.py`, `dynamo_bk.py`, `dynamics_deprecated.py`, `utils_moments_deprecated.py`. + - Deprecate legacy functions in `construct_velocity_tree.py`, `pseudotime.py`, `moments.py`, `clustering.py`. + - Merge `utils_markers.py` and `markers.py`. + - Merge `time_series.py` (learns a direct principal graph by integrating the transition matrix between and DDRTree) + and `construct_velocity_tree.py` (Integrate pseudotime ordering with velocity to automatically assign the direction + of the learned trajectory.) to `DDRTree_graph.py`. + - Reorganize some functions to utils in the following files: `time_series.py`, `multiomics.py`. + - Rename: `DDRTree_py.py` to `DDRTree.py`, `psl_py.py` to `psl.py`. + +- Deprecate infomap clustering ([PR 555](https://github.com/aristoteleo/dynamo-release/pull/555)). 
+ +### DEBUG + +- Fixed the bug that the `dyn.pl.kinetic_heatmap()` couldn't be transposed caused by wrong initialization + ([PR 558](https://github.com/aristoteleo/dynamo-release/pull/558)) + ([PR 636](https://github.com/aristoteleo/dynamo-release/pull/636)). +- Fixed the bug that `dyn.pl.cell_wise_vectors()` only output one color + ([PR 559](https://github.com/aristoteleo/dynamo-release/pull/559)). +- Debugged the sampling method in tools modules + ([PR 565](https://github.com/aristoteleo/dynamo-release/pull/565)). +- Fixed the panda error in `dyn.tl.gene_wise_confidence()` + ([PR 567](https://github.com/aristoteleo/dynamo-release/pull/567)). +- Fixed the bug that `pysal` submodules were not imported explicitly + ([PR 568](https://github.com/aristoteleo/dynamo-release/pull/568)). +- Debugged `dyn.tl.score_cells()` ([PR 569](https://github.com/aristoteleo/dynamo-release/pull/569)). +- Debugged the ambiguous if statement in `dyn.tl.psl()` + ([PR 573](https://github.com/aristoteleo/dynamo-release/pull/573)). +- Updated all the expired links of sample dataset ([PR 577](https://github.com/aristoteleo/dynamo-release/pull/577)). +- Fixed the bug that processed AnnData object couldn't be saved under some cases + ([PR 580](https://github.com/aristoteleo/dynamo-release/pull/580)). +- Debugged `pp/transform.py` ([PR 581](https://github.com/aristoteleo/dynamo-release/pull/581)). +- Debugged `dyn.tl.cell_velocities()` ([PR 585](https://github.com/aristoteleo/dynamo-release/pull/585)). +- Debugged `dyn.pl.kinetic_curves()` ([PR 587](https://github.com/aristoteleo/dynamo-release/pull/587)). +- Fixed the error caused by wrong type hints in `dyn.tl.BaseVectorField.find_fixed_points()` + ([PR 597](https://github.com/aristoteleo/dynamo-release/pull/597)). +- Fixed the error caused by excessive memory usage in tests + ([PR 602](https://github.com/aristoteleo/dynamo-release/pull/602)). 
+- Fixed the KeyError in `dyn.pp.convert2symbol()` when all genes are found + ([PR 603](https://github.com/aristoteleo/dynamo-release/pull/603)). +- Fixed the issue that `dyn.pp.highest_frac_genes()` didn't support sparse input + ([PR 604](https://github.com/aristoteleo/dynamo-release/pull/604)). +- Debugged `dyn.tl.cell_growth_rate()` ([PR 606](https://github.com/aristoteleo/dynamo-release/pull/606)). +- Debugged the arclength sampling method in `dyn.pd.fate()` + ([PR 592](https://github.com/aristoteleo/dynamo-release/pull/592)) + ([PR 610](https://github.com/aristoteleo/dynamo-release/pull/610)). +- Removed unnecessary import of pandas ([PR 614](https://github.com/aristoteleo/dynamo-release/pull/614)). +- Debugged the `dyn.pl.topography()` when the color is not provided + ([PR 617](https://github.com/aristoteleo/dynamo-release/pull/617)). +- Fixed the error that list object doesn't have to_list() method in `dyn.vf.hessian()` + ([PR 623](https://github.com/aristoteleo/dynamo-release/pull/623)). +- Fixed the ambiguous if statement in the `dyn.tl.MarkovChain.is_normalized()` + ([PR 626](https://github.com/aristoteleo/dynamo-release/pull/626)). +- Debugged the `dyn.pd.classify_clone_cell_type()` ([PR 627](https://github.com/aristoteleo/dynamo-release/pull/627)). +- Fixed the input of `minimize()` in `dyn.pd.lap_T()` + ([PR 628](https://github.com/aristoteleo/dynamo-release/pull/628)). +- Fixed the bug that average parameter didn't work in `dyn.pd.fate()` + ([PR 629](https://github.com/aristoteleo/dynamo-release/pull/629)). +- Debugged the `dyn.pl.line_integral_conv()` ([PR 639](https://github.com/aristoteleo/dynamo-release/pull/639)). + +### Others + +- Now available on [conda forge](https://anaconda.org/conda-forge/dynamo-release). +- Removed `cdlib` dependency ([PR 532](https://github.com/aristoteleo/dynamo-release/pull/532)). +- Removed `KDEpy` dependency ([PR 533](https://github.com/aristoteleo/dynamo-release/pull/533)). 
+- Added code coverage report ([PR 555](https://github.com/aristoteleo/dynamo-release/pull/555)). +- Optimized the structure of the umap dimension reduction + ([PR 556](https://github.com/aristoteleo/dynamo-release/pull/556)). +- Optimized the structure and supported sparse input in `tools/graph_calculus.py` + ([PR 557](https://github.com/aristoteleo/dynamo-release/pull/557)). +- Updated `networkx` API ([PR 560](https://github.com/aristoteleo/dynamo-release/pull/560)). +- Replaced `python-igraph` dependency with `igraph` ([PR 563](https://github.com/aristoteleo/dynamo-release/pull/563)). +- Added docstrings for tools module ([PR 570](https://github.com/aristoteleo/dynamo-release/pull/570)). +- Removed duplicate size factor calculation ([PR 596](https://github.com/aristoteleo/dynamo-release/pull/596)). +- Implemented a helper function for saving the plots + ([PR 609](https://github.com/aristoteleo/dynamo-release/pull/609)) + ([PR 635](https://github.com/aristoteleo/dynamo-release/pull/635)). +- Added docstrings for estimation module ([PR 611](https://github.com/aristoteleo/dynamo-release/pull/611)). +- Merged `dyn.pd.rank_cells()` and `dyn.pd.rank_cell_groups()` + ([PR 613](https://github.com/aristoteleo/dynamo-release/pull/613)). +- Added the conda badge ([PR 618](https://github.com/aristoteleo/dynamo-release/pull/618)). +- Handled the duplicate files when downloading sample data + ([PR 621](https://github.com/aristoteleo/dynamo-release/pull/621)). +- Debugged the ROC curve in Shiny app ([PR 637](https://github.com/aristoteleo/dynamo-release/pull/637)). + +## Dynamo Ver 1.3.0 + +### Feature Changes + +- The preprocessing module has been refactored: + + - Class *Preprocessor* is recommended for most preprocessing methods and recipes. `pp.recipe_monocle,` + `pp.recipe_velocyto` has been deprecated ([PR 497](https://github.com/aristoteleo/dynamo-release/pull/497)) + ([PR 500](https://github.com/aristoteleo/dynamo-release/pull/500)). 
+ Check the tutorials [here](Preprocessor_tutorial.rst) for more instructions. + - Normalization has been refactored ([PR 474](https://github.com/aristoteleo/dynamo-release/pull/474)) + ([PR 475](https://github.com/aristoteleo/dynamo-release/pull/475)): `pp.normalize_cell_expr_by_size_factors` + has been deprecated, and new APIs are: + + - `pp.normalize_cell_expr_by_size_factors` -> `pp.calc_sz_factor, pp.normalize`. + + - Gene selection has been refactored ([PR 474](https://github.com/aristoteleo/dynamo-release/pull/474)). Now support + genes selected by fano factors. APIs are `pp.select_genes_monocle` and `pp.select_genes_by_seurat_recipe`. + - PCA has been refactored ([PR 469](https://github.com/aristoteleo/dynamo-release/pull/469)). `dyn.pp.pca_monocle` + has been deprecated. The new API is: + + - `pp.pca_monocle` -> `pp.pca`. + + - sctransform and pearson residuals recipe has been refactored + ([PR 510](https://github.com/aristoteleo/dynamo-release/pull/510)) + ([PR 512](https://github.com/aristoteleo/dynamo-release/pull/512)). Now those advanced methods will only be + performed on X layer. Other layers will get normalized by size factors. + - Calculation of `ntr` rate and `pp.cell_cycle_scores` has been added to the Preprocessor + ([PR 513](https://github.com/aristoteleo/dynamo-release/pull/513)). To enable cell cycle scores, set parameter + `cell_cycle_score_enable` to `True` when initializing the `pp.Preprocessor`. + - Now the size factors normalization will normalize all layers with its own size factors by default + ([PR 521](https://github.com/aristoteleo/dynamo-release/pull/521)). To normalize the labeled data with total size + factors, we need to set the `total_szfactor` to `total_Size_Factor` explicitly. 
+ - Multiple new features added, includes genes selection by fano factors + ([PR 474](https://github.com/aristoteleo/dynamo-release/pull/474)), external data integration methods + ([PR 473](https://github.com/aristoteleo/dynamo-release/pull/473)) and `pp.regress_out` + ([PR 470](https://github.com/aristoteleo/dynamo-release/pull/470)) + ([PR 483](https://github.com/aristoteleo/dynamo-release/pull/483)) + ([PR 484](https://github.com/aristoteleo/dynamo-release/pull/484)). + - Created more tests for preprocessing module ([PR 485](https://github.com/aristoteleo/dynamo-release/pull/485)). + - Replaced `adata.obsm["X"]` with `adata.obsm["X_pca"]` + ([PR 514](https://github.com/aristoteleo/dynamo-release/pull/514)). + - Removed some console output. They can still be displayed with `DEBUG` logging mode. + - Other deprecated APIs include: `pp.calc_sz_factor_legacy, pp.filter_cells_legacy`, + `pp.filter_genes_by_outliers_legacy, pp.select_genes_monocle_legacy, pp.select_genes_by_dispersion_general`, + `pp.cook_dist, pp.normalize_cell_expr_by_size_factors`. More information can be found on our + [preprocessing tutorials](Preprocessor_tutorial.rst). + +### DEBUG + +- Fixed the bug that save_show_or_return flags not working + ([PR 414](https://github.com/aristoteleo/dynamo-release/pull/414)). +- Enabled the leiden algorithm to accept the resolution parameters + ([PR 441](https://github.com/aristoteleo/dynamo-release/pull/441)). +- Fixed the wrong attribute name of anndata object in `utils_dimensionReduction.py` + ([PR 458](https://github.com/aristoteleo/dynamo-release/pull/458)). +- Fixed the dimensionality issue in `moments.py` + ([PR 461](https://github.com/aristoteleo/dynamo-release/pull/461)). +- Fixed part of the bug that h5ad file cannot be saved correctly + ([PR 467](https://github.com/aristoteleo/dynamo-release/pull/467)). +- Fixed the bug that `pca_mean` will be `None` under some circumstances + ([PR 482](https://github.com/aristoteleo/dynamo-release/pull/482)). 
+- Removing warning message for nxviz + ([PR 489](https://github.com/aristoteleo/dynamo-release/pull/489)). +- Corrected the norm log-likelihood function + ([PR 495](https://github.com/aristoteleo/dynamo-release/pull/495)). +- Removed deprecated parameters in gseapy functions + ([PR 496](https://github.com/aristoteleo/dynamo-release/pull/496)). +- Fixed the bugs that functions will raise error when no fixed points are found in vector field by sampling + ([PR 501](https://github.com/aristoteleo/dynamo-release/pull/501)). +- Removed unwanted operations in dimension reduction + ([PR 502](https://github.com/aristoteleo/dynamo-release/pull/502)). + +### Tutorial Updates on Readthedocs + +- Documentation, Tutorials, and readthedocs update: + + - Update requirements for readthedocs ([PR 466](https://github.com/aristoteleo/dynamo-release/pull/466)). + - Update readme ([PR 479](https://github.com/aristoteleo/dynamo-release/pull/479)). + - Fixed documentation error caused by importing Literal + ([PR 486](https://github.com/aristoteleo/dynamo-release/pull/486)). + - Fixed readthedocs error caused by the new version of urllib3 + ([PR 488](https://github.com/aristoteleo/dynamo-release/pull/488)). + +### Other Changes + +- Docstring and type hints update: + + - Updated docstring and type hints for tools module + ([PR 419](https://github.com/aristoteleo/dynamo-release/pull/419)). + - Updated docstring and type hints for vector field module + ([PR 434](https://github.com/aristoteleo/dynamo-release/pull/434)). + - Updated the docstring and type hints for simulation and predicting module + ([PR 457](https://github.com/aristoteleo/dynamo-release/pull/457)). + - Update the docstring and type hints for hzplot + ([PR 456](https://github.com/aristoteleo/dynamo-release/pull/456)). 
+ +## Dynamo Ver 1.1.0 + +### Feature Changes + +- Following new function are added, exported or documented in API / class page: + + - *Preprocessing*: `pp.convert2symbol, pp.filter_cells, pp.filter_gene,` + `pp.filter_genes_by_pattern, pp.normalize_cells, pp.scale, pp.log1p, pp.pca` + - *Kinetic parameters and RNA/protein velocity*: `tl.recipe_deg_data, tl.recipe_kin_data,` + `tl.recipe_mix_kin_deg_data, tl.recipe_one_shot_data, tl.velocity_N` + - *Labeling Velocity recipes*: `tl.infomap, tl.leiden, tl.louvain, tl.scc` + - *Clustering*: `tl.run_scvelo, tl.run_velocyto, tl.vlm_to_adata` + - *Converter and helper*: `vf.graphize_vecfld, vf.vector_field_function` + - *Vector field reconstruction*: `vf.FixedPoints, vf.VectorField2D, vf.assign_fixedpoints` + - *Beyond RNA velocity*: `vf.jacobian, vf.sensitivity` + - *Vector field ranking*: `vf.rank_cells, vf.rank_genes, vf.rank_expression_genes,` + `vf.rank_jacobian_genes, vf.rank_s_divergence_genes, vf.rank_sensitivity_genes` + - *Vector field clustering and graph*: `vf.cluster_field, vf.streamline_clusters` + - *Prediction* `pd.andecestor, pd.get_init_path, pd.least_action, pd.perturbation,` + `pd.rank_perturbation_cell_clusters, pd.rank_perturbation_cells, pd.rank_perturbation_genes,` + `pd.state_graph, pd.tree_model` + - *Preprocessing plot*: `pl.biplot, pl.loading, pl.highest_frac_genes, pl.bubble` + - *Space plot*: `pl.space` + - *Kinetics plot*: `pl.sensitivity_kinetics` + - *Vector field plots*: `pl.cell_wise_vectors_3d, pl.plot_fixed_points_2d` + - *differential geometry plots*: `pl.acceleration` + - *Regulatory network plots* `pl.arcPlot, pl.circosPlot, pl.circosPlotDeprecated, pl.hivePlot` + - *fate plots* `pl.fate` + - *heatmap plots* `pl.causality, pl.comb_logic, pl.plot_hill_function, pl.response` + - *Predictions plots* `pl.lap_min_time` + - *External functionality* `ext.normalize_layers_pearson_residuals,` + `ext.select_genes_by_pearson_residuals, ext.sctransform` + +- More differential geometry analyses 
+ + - include the `switch` mode in rank_jacobian_genes + - added calculation of `sensitivity` matrix and relevant ranking + +- most probable path and *in silico* perturbation prediction + + - implemented least action path optimization (can be done in high dimensional space) with analytical Jacobian + - include genetic perturbation prediction by either changing the vector field function or simulating genetic perturbation via analytical Jacobian + +- preprocessor class implementation + + - extensible modular preprocess steps + - support following recipes: monocle (dynamo), seurat (seurat V3 flavor), sctransform (seurat), pearson residuals and pearson residuals for feature selection, combined with monocle recipe (ensure no negative values) + - following recipes tested on zebrafish dataset to make implementation results consistent: + - monocle, seurat, pearson residuals +- CDlib integration + + - leiden, louvain, infomap community detection for cell clustering + - wrappers in `dyn.tl.*` for computing clusters + - wrappers in `dyn.pl.*` for plotting + +### Tutorial Updates on Readthedocs + +- human HSC hematopoiesis RNA velocity analysis tutorials +- *in silico* perturbation and least action path (LAP) predictions tutorials on HSC dataset +- differential geometry analysis on HSC dataset + + - Molecular mechanism of megakaryocytes + - Minimal network for basophil lineage commitment + - Cell-wise analyses: dominant interactions +- gallery: Pancreatic endocrinogenesis differential geometry + +Sample Dataset Updates + +### CI/CD Updates + +- update dynamo testing and pytest structure +- test building workflow on 3.7, 3.8, 3.9 (3.6 no longer tested on github building CI) + +Performance Improvements + +### API Changes + +- preprocess + + - `pp.pca` -> `pca.pca_monocle` +- Native implementation of various graphical calculus using Numpy without using igraph. 
+ +### Other Changes + +- **general code refactor and bug fixing** +- **pl.scatters** refactor \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile old mode 100755 new mode 100644 index 69fe55ecf..d4bb2cbb9 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,11 +1,12 @@ # Minimal makefile for Sphinx documentation # -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SOURCEDIR = source -BUILDDIR = build +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @@ -16,4 +17,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_static/Preprocessor_tutorial_files/output_20_1.png b/docs/_static/Preprocessor_tutorial_files/output_20_1.png new file mode 100644 index 000000000..dd6196a29 Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_20_1.png differ diff --git a/docs/_static/Preprocessor_tutorial_files/output_23_1.png b/docs/_static/Preprocessor_tutorial_files/output_23_1.png new file mode 100644 index 000000000..c1bfe965d Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_23_1.png differ diff --git a/docs/_static/Preprocessor_tutorial_files/output_26_1.png b/docs/_static/Preprocessor_tutorial_files/output_26_1.png new file mode 100644 index 000000000..020c18ba1 Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_26_1.png differ diff --git a/docs/_static/Preprocessor_tutorial_files/output_29_1.png 
b/docs/_static/Preprocessor_tutorial_files/output_29_1.png new file mode 100644 index 000000000..4fd30ce98 Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_29_1.png differ diff --git a/docs/_static/Preprocessor_tutorial_files/output_38_0.png b/docs/_static/Preprocessor_tutorial_files/output_38_0.png new file mode 100644 index 000000000..0b59b5934 Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_38_0.png differ diff --git a/docs/_static/Preprocessor_tutorial_files/output_40_1.png b/docs/_static/Preprocessor_tutorial_files/output_40_1.png new file mode 100644 index 000000000..4dbdf16fb Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_40_1.png differ diff --git a/docs/_static/Preprocessor_tutorial_files/output_42_0.png b/docs/_static/Preprocessor_tutorial_files/output_42_0.png new file mode 100644 index 000000000..f23e9f3c2 Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_42_0.png differ diff --git a/docs/_static/Preprocessor_tutorial_files/output_49_1.png b/docs/_static/Preprocessor_tutorial_files/output_49_1.png new file mode 100644 index 000000000..871b502d5 Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_49_1.png differ diff --git a/docs/_static/Preprocessor_tutorial_files/output_54_1.png b/docs/_static/Preprocessor_tutorial_files/output_54_1.png new file mode 100644 index 000000000..daa408ed3 Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_54_1.png differ diff --git a/docs/_static/Preprocessor_tutorial_files/output_67_1.png b/docs/_static/Preprocessor_tutorial_files/output_67_1.png new file mode 100644 index 000000000..ec81c25c0 Binary files /dev/null and b/docs/_static/Preprocessor_tutorial_files/output_67_1.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/1_1.png b/docs/_static/Shiny_tutorial_files/lap/1_1.png new file mode 100644 index 000000000..e3acb6ca4 Binary files /dev/null 
and b/docs/_static/Shiny_tutorial_files/lap/1_1.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/1_2.png b/docs/_static/Shiny_tutorial_files/lap/1_2.png new file mode 100644 index 000000000..918d40eab Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/1_2.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/2_1.jpg b/docs/_static/Shiny_tutorial_files/lap/2_1.jpg new file mode 100644 index 000000000..977f8a16a Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/2_1.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/2_2.jpg b/docs/_static/Shiny_tutorial_files/lap/2_2.jpg new file mode 100644 index 000000000..899207d74 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/2_2.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/2_3.jpg b/docs/_static/Shiny_tutorial_files/lap/2_3.jpg new file mode 100644 index 000000000..929cfec90 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/2_3.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/2_4.png b/docs/_static/Shiny_tutorial_files/lap/2_4.png new file mode 100644 index 000000000..89cf83228 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/2_4.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/2_5.jpg b/docs/_static/Shiny_tutorial_files/lap/2_5.jpg new file mode 100644 index 000000000..5a56c31cf Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/2_5.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/2_6.jpg b/docs/_static/Shiny_tutorial_files/lap/2_6.jpg new file mode 100644 index 000000000..5314d6287 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/2_6.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/3_1.jpg b/docs/_static/Shiny_tutorial_files/lap/3_1.jpg new file mode 100644 index 000000000..84a85b077 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/3_1.jpg differ diff --git 
a/docs/_static/Shiny_tutorial_files/lap/3_2.png b/docs/_static/Shiny_tutorial_files/lap/3_2.png new file mode 100644 index 000000000..6c3890070 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/3_2.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/4_1.jpg b/docs/_static/Shiny_tutorial_files/lap/4_1.jpg new file mode 100644 index 000000000..23f8de23b Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/4_1.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/4_2.png b/docs/_static/Shiny_tutorial_files/lap/4_2.png new file mode 100644 index 000000000..4008ab6a2 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/4_2.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/5_1.jpg b/docs/_static/Shiny_tutorial_files/lap/5_1.jpg new file mode 100644 index 000000000..a53af1cdb Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/5_1.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/5_2.jpg b/docs/_static/Shiny_tutorial_files/lap/5_2.jpg new file mode 100644 index 000000000..538822ee5 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/5_2.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/6.png b/docs/_static/Shiny_tutorial_files/lap/6.png new file mode 100644 index 000000000..58d587940 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/6.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/7_1.png b/docs/_static/Shiny_tutorial_files/lap/7_1.png new file mode 100644 index 000000000..7f0e470dd Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/7_1.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/7_2.jpg b/docs/_static/Shiny_tutorial_files/lap/7_2.jpg new file mode 100644 index 000000000..041f24852 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/7_2.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/8_1.jpg b/docs/_static/Shiny_tutorial_files/lap/8_1.jpg new file mode 
100644 index 000000000..b3bf53ab8 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/8_1.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/8_2.png b/docs/_static/Shiny_tutorial_files/lap/8_2.png new file mode 100644 index 000000000..726712825 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/8_2.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/9_1.png b/docs/_static/Shiny_tutorial_files/lap/9_1.png new file mode 100644 index 000000000..7f5ad84f2 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/9_1.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/9_2.jpg b/docs/_static/Shiny_tutorial_files/lap/9_2.jpg new file mode 100644 index 000000000..28744399f Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/9_2.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/9_3.jpg b/docs/_static/Shiny_tutorial_files/lap/9_3.jpg new file mode 100644 index 000000000..269e7496b Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/9_3.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/9_4.jpg b/docs/_static/Shiny_tutorial_files/lap/9_4.jpg new file mode 100644 index 000000000..d550812f0 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/9_4.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/9_5.png b/docs/_static/Shiny_tutorial_files/lap/9_5.png new file mode 100644 index 000000000..4e6e617db Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/9_5.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/9_6.jpg b/docs/_static/Shiny_tutorial_files/lap/9_6.jpg new file mode 100644 index 000000000..26e0b54f8 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/9_6.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/lap/9_7.png b/docs/_static/Shiny_tutorial_files/lap/9_7.png new file mode 100644 index 000000000..7d8ce95ca Binary files /dev/null and 
b/docs/_static/Shiny_tutorial_files/lap/9_7.png differ diff --git a/docs/_static/Shiny_tutorial_files/lap/9_8.png b/docs/_static/Shiny_tutorial_files/lap/9_8.png new file mode 100644 index 000000000..78e8420c8 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/lap/9_8.png differ diff --git a/docs/_static/Shiny_tutorial_files/perturbation/1.png b/docs/_static/Shiny_tutorial_files/perturbation/1.png new file mode 100644 index 000000000..2017945e8 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/perturbation/1.png differ diff --git a/docs/_static/Shiny_tutorial_files/perturbation/1_1.jpg b/docs/_static/Shiny_tutorial_files/perturbation/1_1.jpg new file mode 100644 index 000000000..cdc76fb69 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/perturbation/1_1.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/perturbation/2_1.jpg b/docs/_static/Shiny_tutorial_files/perturbation/2_1.jpg new file mode 100644 index 000000000..6d7dafa0d Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/perturbation/2_1.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/perturbation/2_2.jpg b/docs/_static/Shiny_tutorial_files/perturbation/2_2.jpg new file mode 100644 index 000000000..4e1b64f9c Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/perturbation/2_2.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/perturbation/2_3.jpg b/docs/_static/Shiny_tutorial_files/perturbation/2_3.jpg new file mode 100644 index 000000000..f9279ee95 Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/perturbation/2_3.jpg differ diff --git a/docs/_static/Shiny_tutorial_files/perturbation/3_1.jpg b/docs/_static/Shiny_tutorial_files/perturbation/3_1.jpg new file mode 100644 index 000000000..80251bf7f Binary files /dev/null and b/docs/_static/Shiny_tutorial_files/perturbation/3_1.jpg differ diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css new file mode 100644 index 000000000..d6d520dc6 --- 
/dev/null +++ b/docs/_static/css/custom.css @@ -0,0 +1,37 @@ +.caption-text{ + padding: 0px; +} + +.rst-content .output_area img { + max-width: unset; + width: 100% !important; + height: auto !important; +} + + +/* sidebar */ +.rst-content .sidebar { + /* margin: 0px 0px 0px 12px; */ + padding-bottom: 0px; +} +.rst-content .sidebar p { + margin-bottom: 12px; +} +.rst-content .sidebar p, +.rst-content .sidebar ul, +.rst-content .sidebar dl { + font-size: 13px; +} + +.scrollit { + overflow-x:auto; +} + +/* Sidebar header (and topbar for mobile) */ +.wy-side-nav-search, .wy-nav-top { + background: #1b1b1b; +} +/* Sidebar */ +.wy-nav-side { +background: #373737; +} \ No newline at end of file diff --git a/docs/_static/css/override.css b/docs/_static/css/override.css new file mode 100644 index 000000000..e69de29bb diff --git a/docs/_static/dynamo-horizontal.svg b/docs/_static/dynamo-horizontal.svg new file mode 100644 index 000000000..a4f72f3d1 --- /dev/null +++ b/docs/_static/dynamo-horizontal.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/docs/_static/img/anndata_manager_schematic.svg b/docs/_static/img/anndata_manager_schematic.svg new file mode 100644 index 000000000..59c29a7fd --- /dev/null +++ b/docs/_static/img/anndata_manager_schematic.svg @@ -0,0 +1 @@ +LegendData ObjectInstance VariableClass InstanceInstance FunctionAnnDataManagerOther AnnDataAnnDataManagerself.fieldsLayerField(”X”, “raw_counts”)CategoricalObsField(”batch”, “batch”)CategoricalObsField(”labels”, “cell_types”)register_fieldsAnnDatastate_registrydata_registrysummary_statsself.registrytransfer_fieldsOther AnnDataself.adataBaseAnnDataFieldAnnDataOther AnnDataregister_fieldtransfer_fieldsstate_registrystate_registryimplementsinitializes diff --git a/docs/_static/img/setup_anndata_before_after.svg b/docs/_static/img/setup_anndata_before_after.svg new file mode 100644 index 000000000..5c29af94b --- /dev/null +++ b/docs/_static/img/setup_anndata_before_after.svg @@ -0,0 +1 @@ + diff --git 
a/docs/_static/logo.png b/docs/_static/logo.png new file mode 100644 index 000000000..44fdbd294 Binary files /dev/null and b/docs/_static/logo.png differ diff --git a/docs/_static/logo.svg b/docs/_static/logo.svg new file mode 100644 index 000000000..ac037f1c1 --- /dev/null +++ b/docs/_static/logo.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst new file mode 100644 index 000000000..e4665dfc7 --- /dev/null +++ b/docs/_templates/autosummary/class.rst @@ -0,0 +1,61 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. add toctree option to make autodoc generate the pages + +.. autoclass:: {{ objname }} + +{% block attributes %} +{% if attributes %} +Attributes table +~~~~~~~~~~~~~~~~~~ + +.. autosummary:: +{% for item in attributes %} + ~{{ fullname }}.{{ item }} +{%- endfor %} +{% endif %} +{% endblock %} + +{% block methods %} +{% if methods %} +Methods table +~~~~~~~~~~~~~ + +.. autosummary:: +{% for item in methods %} + {%- if item != '__init__' %} + ~{{ fullname }}.{{ item }} + {%- endif -%} +{%- endfor %} +{% endif %} +{% endblock %} + +{% block attributes_documentation %} +{% if attributes %} +Attributes +~~~~~~~~~~~ + +{% for item in attributes %} + +.. autoattribute:: {{ [objname, item] | join(".") }} +{%- endfor %} + +{% endif %} +{% endblock %} + +{% block methods_documentation %} +{% if methods %} +Methods +~~~~~~~ + +{% for item in methods %} +{%- if item != '__init__' %} + +.. automethod:: {{ [objname, item] | join(".") }} +{%- endif -%} +{%- endfor %} + +{% endif %} +{% endblock %} diff --git a/docs/_templates/class_no_inherited.rst b/docs/_templates/class_no_inherited.rst new file mode 100644 index 000000000..837d4e5a5 --- /dev/null +++ b/docs/_templates/class_no_inherited.rst @@ -0,0 +1,68 @@ +{{ fullname | escape | underline}} + +.. currentmodule:: {{ module }} + +.. add toctree option to make autodoc generate the pages + +.. 
autoclass:: {{ objname }} + :show-inheritance: + +{% block attributes %} +{% if attributes %} +Attributes table +~~~~~~~~~~~~~~~~ + +.. autosummary:: +{% for item in attributes %} + {%- if item not in inherited_members%} + ~{{ fullname }}.{{ item }} + {%- endif -%} +{%- endfor %} +{% endif %} +{% endblock %} + + +{% block methods %} +{% if methods %} +Methods table +~~~~~~~~~~~~~~ + +.. autosummary:: +{% for item in methods %} + {%- if item != '__init__' and item not in inherited_members%} + ~{{ fullname }}.{{ item }} + {%- endif -%} + +{%- endfor %} +{% endif %} +{% endblock %} + +{% block attributes_documentation %} +{% if attributes %} +Attributes +~~~~~~~~~~ + +{% for item in attributes %} +{%- if item not in inherited_members%} + +.. autoattribute:: {{ [objname, item] | join(".") }} +{%- endif -%} +{%- endfor %} + +{% endif %} +{% endblock %} + +{% block methods_documentation %} +{% if methods %} +Methods +~~~~~~~ + +{% for item in methods %} +{%- if item != '__init__' and item not in inherited_members%} + +.. automethod:: {{ [objname, item] | join(".") }} +{%- endif -%} +{%- endfor %} + +{% endif %} +{% endblock %} diff --git a/docs/api/datasets.md b/docs/api/datasets.md new file mode 100644 index 000000000..4c9857f1a --- /dev/null +++ b/docs/api/datasets.md @@ -0,0 +1,38 @@ +# Datasets + +Import dynamo as: + +``` +import dynamo as dyn +``` + +```{eval-rst} +.. currentmodule:: dynamo + +``` + +## Built in data + +Here we host some published datasets that are useful for benchmarking and testing models. + +```{eval-rst} +.. 
autosummary:: + :toctree: reference/ + :nosignatures: + + sample_data.scNT_seq_neuron_splicing + sample_data.scNT_seq_neuron_labeling + sample_data.zebrafish + sample_data.DentateGyrus + sample_data.Haber + sample_data.hgForebrainGlutamatergic + sample_data.chromaffin + sample_data.BM + sample_data.pancreatic_endocrinogenesis + sample_data.DentateGyrus_scvelo + sample_data.scEU_seq_rpe1 + sample_data.scEU_seq_organoid + sample_data.hematopoiesis + sample_data.hematopoiesis_raw + +``` diff --git a/docs/api/index.md b/docs/api/index.md new file mode 100644 index 000000000..a3899ebcc --- /dev/null +++ b/docs/api/index.md @@ -0,0 +1,14 @@ +# API + +Import dynamo as: + +``` +import dynamo as dyn +``` + +```{toctree} +:maxdepth: 2 + +user +datasets +``` diff --git a/docs/api/user.md b/docs/api/user.md new file mode 100644 index 000000000..111ec0e49 --- /dev/null +++ b/docs/api/user.md @@ -0,0 +1,686 @@ +# User + +Import dynamo as: + +``` +import dynamo as dyn +``` + +```{eval-rst} +.. currentmodule:: dynamo + +``` + +## Data IO + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + read + read_h5ad + read_loom + +``` + +## Tools (tl) + +_kNN and moments of expressions_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.neighbors + tl.mnn + tl.moments + +``` + +_Kinetic parameters and RNA/protein velocity_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.dynamics + +``` + +_Labeling Velocity recipes_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.recipe_deg_data + tl.recipe_kin_data + tl.recipe_mix_kin_deg_data + tl.recipe_one_shot_data + tl.velocity_N + + +``` + +_Dimension reduction_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.reduceDimension + tl.DDRTree + tl.psl + +``` + +_Clustering_ + +```{eval-rst} +..
autosummary:: + :toctree: reference/ + :nosignatures: + + tl.hdbscan + tl.leiden + tl.louvain + tl.scc + +``` + +_Velocity projection_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.cell_velocities + tl.confident_cell_velocities + +``` + +_Velocity metrics_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.cell_wise_confidence + tl.gene_wise_confidence + +``` + +_Markov chain_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.generalized_diffusion_map + tl.stationary_distribution + tl.diffusion + tl.expected_return_time + +``` + +_Markers and differential expressions_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.moran_i + tl.find_group_markers + tl.two_groups_degs + tl.top_n_markers + tl.glm_degs + +``` + +_Cell proliferation and apoptosis_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.score_cells + tl.cell_growth_rate + +``` + +_Converter and helper_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + tl.converter + tl.run_scvelo + tl.run_velocyto + tl.vlm_to_adata + +``` + +## Vector field (vf) + +_Vector field reconstruction_ + +:::{note} + Vector field class is internally to vf.VectorField. See our vector field classes here: [vector field](https://dynamo-release.readthedocs.io/en/latest/Class.html#vector-field) +::: + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + vf.VectorField + vf.SparseVFC + vf.BaseVectorField + vf.SvcVectorField + vf.graphize_vecfld + vf.vector_field_function + +``` + +_Vector field topology_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + vf.cluster_field + vf.topography + vf.FixedPoints + vf.assign_fixedpoints + +``` + +_Beyond RNA velocity_ + +```{eval-rst} +.. 
autosummary:: + :toctree: reference/ + :nosignatures: + + vf.velocities + vf.speed + vf.jacobian + vf.divergence + vf.curl + vf.acceleration + vf.curvature + vf.torsion + vf.sensitivity + +``` + +_Beyond velocity vector field_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + vf.cell_accelerations + vf.cell_curvatures + +``` + +_Vector field ranking_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + vf.rank_genes + vf.rank_expression_genes + vf.rank_velocity_genes + vf.rank_divergence_genes + vf.rank_acceleration_genes + vf.rank_curvature_genes + vf.rank_jacobian_genes + vf.rank_s_divergence_genes + vf.rank_sensitivity_genes + +``` + +_Single cell potential: three approaches_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + vf.gen_fixed_points + vf.gen_gradient + vf.IntGrad + vf.DiffusionMatrix + vf.action + vf.Potential + vf.path_integral + vf.alignment + vf.Wang_action + vf.Wang_LAP + vf.transition_rate + vf.MFPT + vf.Ao_pot_map + vf.solveQ + +``` + +_Stochastic processes_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + + vf.diffusionMatrix + +``` + +_Vector field clustering and graph_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + vf.cluster_field + vf.streamline_clusters + vf.vfGraph + +``` + +## Prediction (pd) + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pd.andecestor + pd.fate + pd.fate_bias + pd.get_init_path + pd.least_action + pd.perturbation + pd.state_graph + pd.KO + pd.rank_perturbation_cell_clusters + pd.rank_perturbation_cells + pd.rank_perturbation_genes + pd.tree_model + +``` + +## Plotting (pl) + +_Preprocessing_ + +```{eval-rst} +.. 
autosummary:: + :toctree: reference/ + :nosignatures: + + pl.basic_stats + pl.show_fraction + pl.feature_genes + pl.biplot + pl.loading + pl.variance_explained + pl.highest_frac_genes + pl.exp_by_groups + pl.bubble + +``` + +_Cell cycle staging_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.cell_cycle_scores + +``` + +_Scatter base_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.scatters + +``` + +_Space plot_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.space + +``` + +_Phase diagram: conventional scRNA-seq_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.phase_portraits + +``` + +_Kinetic models: labeling based scRNA-seq_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.dynamics + +``` + +_Kinetics_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.kinetic_curves + pl.kinetic_heatmap + pl.jacobian_kinetics + pl.sensitivity_kinetics + +``` + +_Dimension reduction_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.pca + pl.tsne + pl.umap + pl.trimap + +``` + +_Clustering_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.leiden + pl.louvain + pl.infomap + pl.streamline_clusters + +``` + +_Neighbor graph_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.nneighbors + pl.state_graph + +``` + +_Vector field plots: velocities and accelerations_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.cell_wise_vectors + pl.cell_wise_vectors_3d + pl.grid_vectors + pl.streamline_plot + pl.line_integral_conv + pl.plot_energy + pl.plot_3d_streamtube + +``` + +_Vector field topology_ + +```{eval-rst} +.. 
autosummary:: + :toctree: reference/ + :nosignatures: + + pl.plot_flow_field + pl.plot_fixed_points + pl.plot_fixed_points_2d + pl.plot_nullclines + pl.plot_separatrix + pl.plot_traj + pl.topography + pl.response + +``` + +_Beyond RNA velocity_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.speed + pl.divergence + pl.acceleration + pl.curl + pl.curvature + pl.jacobian + pl.jacobian_heatmap + pl.sensitivity + pl.sensitivity_heatmap + +``` + +_Regulatory network_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.arcPlot + pl.circosPlot + pl.circosPlotDeprecated + pl.hivePlot + +``` + +_Potential landscape_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.show_landscape + +``` + +_Cell fate_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.fate + pl.fate_bias + +``` + +_Heatmaps_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.causality + pl.comb_logic + pl.plot_hill_function + pl.response + +``` + +_Predictions_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.lap_min_time + +``` + +_Save figures_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + pl.save_fig + +``` + +## Movie (mv) + +:::{note} + animation class is internally to mv.animate_fates. See our animation classes here: [animation](https://dynamo-release.readthedocs.io/en/latest/Class.html#movie) +::: + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + mv.animate_fates + +``` + +## Simulation (sim) + +_Simple ODE vector field simulation_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + sim.toggle + sim.Ying_model + +``` + +_Gillespie simulation_ + +```{eval-rst} +.. 
autosummary:: + :toctree: reference/ + :nosignatures: + + sim.Gillespie + sim.Simulator + sim.state_space_sampler + sim.evaluate + +``` + +## External (ext) + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + ext.ddhodge + ext.enrichr + ext.scribe + ext.coexp_measure + ext.scifate_glmnet + +``` + +## Utilities + +_Package versions_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + get_all_dependencies_version + +``` + +_Clean up adata_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + cleanup + +``` + +_Figures configuration_ + +```{eval-rst} +.. autosummary:: + :toctree: reference/ + :nosignatures: + + configuration.set_figure_params + configuration.set_pub_style + +``` + +[anndata]: https://anndata.readthedocs.io/en/stable/ +[scanpy]: https://scanpy.readthedocs.io/en/stable/index.html +[utilities]: https://scanpy.readthedocs.io/en/stable/api/index.html#reading +[ray tune]: https://docs.ray.io/en/latest/tune/index.html diff --git a/docs/changelog.md b/docs/changelog.md new file mode 100644 index 000000000..d9e79ba64 --- /dev/null +++ b/docs/changelog.md @@ -0,0 +1,3 @@ +```{include} ../CHANGELOG.md + +``` diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..51cb2bdb0 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,250 @@ +import importlib.util +import inspect +import os +import re +import subprocess +import sys +from pathlib import Path +from importlib.metadata import metadata +from datetime import datetime +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from typing import Any + +HERE = Path(__file__).parent +sys.path[:0] = [str(HERE.parent), str(HERE / "extensions")] + +# -- Project information ----------------------------------------------------- + +info = metadata("dynamo-release") +project_name = info["Name"] +author = info["Author"] +copyright = f"{datetime.now():%Y}, {author}." 
+version = info["Version"] +repository_url = f"https://github.com/aristoteleo/{project_name}" + +# The full version, including alpha/beta/rc tags +release = info["Version"] + +bibtex_bibfiles = ["references.bib"] +templates_path = ["_templates"] +nitpicky = True # Warn about broken links +needs_sphinx = "4.0" + +html_context = { + "display_github": True, # Integrate GitHub + "github_user": "aristoteleo", # Username + "github_repo": project_name, # Repo name + "github_version": "main", # Version + "conf_py_path": "/docs/", # Path in the checkout to the docs root +} + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [ + "myst_nb", + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.linkcode", + "sphinx.ext.mathjax", + "sphinx.ext.napoleon", + "sphinx_autodoc_typehints", # needs to be after napoleon + "sphinx.ext.extlinks", + "sphinx.ext.autosummary", + "sphinxcontrib.bibtex", + *[p.stem for p in (HERE / "extensions").glob("*.py")], + "sphinx_copybutton", + "sphinx_design", + "sphinxext.opengraph", + "hoverxref.extension", +] + + +# for sharing urls with nice info +#ogp_site_url = "https://docs.scvi-tools.org/" +#ogp_image = "https://docs.scvi-tools.org/en/stable/_static/logo.png" + + +# Generate the API documentation when building +autosummary_generate = True +autodoc_member_order = "bysource" +bibtex_reference_style = "author_year" +napoleon_google_docstring = True # for pytorch lightning +napoleon_numpy_docstring = True # use numpydoc style +napoleon_include_init_with_doc = False +napoleon_use_rtype = True # having a separate entry generally helps readability +napoleon_use_param = True +napoleon_custom_sections = [("Params", "Parameters")] +todo_include_todos = False +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "html_image", + "html_admonition", +] +myst_url_schemes = ("http", "https", "mailto") 
+nb_output_stderr = "remove" +nb_execution_mode = "off" +nb_merge_streams = True +typehints_defaults = "braces" + +source_suffix = { + ".rst": "restructuredtext", + ".ipynb": "myst-nb", + ".myst": "myst-nb", +} + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] + +# extlinks config +extlinks = { + "issue": (f"{repository_url}/issues/%s", "#%s"), + "pr": (f"{repository_url}/pull/%s", "#%s"), + "ghuser": ("https://github.com/%s", "@%s"), +} + +intersphinx_mapping = { + "anndata": ("https://anndata.readthedocs.io/en/stable/", None), + "ipython": ("https://ipython.readthedocs.io/en/stable/", None), + "matplotlib": ("https://matplotlib.org/", None), + "numpy": ("https://numpy.org/doc/stable/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), + "python": ("https://docs.python.org/3", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), + "sklearn": ("https://scikit-learn.org/stable/", None), + "torch": ("https://pytorch.org/docs/master/", None), + "scanpy": ("https://scanpy.readthedocs.io/en/stable/", None), + "lightning": ("https://lightning.ai/docs/pytorch/stable/", None), + "pyro": ("http://docs.pyro.ai/en/stable/", None), + "pymde": ("https://pymde.org/", None), + "flax": ("https://flax.readthedocs.io/en/latest/", None), + "jax": ("https://jax.readthedocs.io/en/latest/", None), + "ml_collections": ("https://ml-collections.readthedocs.io/en/latest/", None), + "mudata": ("https://mudata.readthedocs.io/en/latest/", None), + "ray": ("https://docs.ray.io/en/latest/", None), + "huggingface_hub": ("https://huggingface.co/docs/huggingface_hub/main/en", None), + "sparse": ("https://sparse.pydata.org/en/stable/", None), +} + +# -- Options for HTML output ------------------------------------------- + +# 
# html_show_sourcelink = True
html_theme = "sphinx_book_theme"
html_title = project_name

html_logo = "_static/logo.png"

html_theme_options = {
    "repository_url": repository_url,
    "use_repository_button": True,
    "logo_only": True,
    "show_toc_level": 1,
    "launch_buttons": {"colab_url": "https://colab.research.google.com"},
    "path_to_docs": "docs/",
    "repository_branch": version,
}

pygments_style = "default"

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
html_css_files = ["css/override.css"]
html_show_sphinx = False


def setup(app):
    """Register the ``recommonmark_config`` value on the Sphinx application.

    Args:
        app: The Sphinx application object handed to ``conf.py`` by Sphinx.
    """
    recommonmark_defaults = {
        "auto_toc_tree_section": "Contents",
        "enable_auto_toc_tree": True,
        "enable_math": True,
        "enable_inline_math": False,
        "enable_eval_rst": True,
    }
    # Third positional argument is Sphinx's ``rebuild`` flag.
    app.add_config_value("recommonmark_config", recommonmark_defaults, True)


# -- Config for linkcode -------------------------------------------


def git(*args):
    """Run ``git`` with *args* and return its stdout, stripped and decoded.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
        OSError: If the git executable cannot be launched.
    """
    command = ["git", *args]
    raw_output = subprocess.check_output(command)
    return raw_output.strip().decode()


# https://github.com/DisnakeDev/disnake/blob/7853da70b13fcd2978c39c0b7efa59b34d298186/docs/conf.py#L192
# Current git reference.
# Uses branch/tag name if found, otherwise uses commit hash
git_ref = None
try:
    git_ref = git("name-rev", "--name-only", "--no-undefined", "HEAD")
    git_ref = re.sub(r"^(remotes/[^/]+|tags)/", "", git_ref)
except Exception:
    # Best effort: the docs must still build outside a git checkout or
    # without a git executable; the fallbacks below pick a usable ref.
    pass

# (if no name found or relative ref, use commit hash instead)
if not git_ref or re.search(r"[\^~]", git_ref):
    try:
        git_ref = git("rev-parse", "HEAD")
    except Exception:
        git_ref = "main"

# https://github.com/DisnakeDev/disnake/blob/7853da70b13fcd2978c39c0b7efa59b34d298186/docs/conf.py#L192
_dynamo_module_path = os.path.dirname(importlib.util.find_spec("dynamo").origin)  # type: ignore


def linkcode_resolve(domain, info):
    """Determine the GitHub URL corresponding to a documented Python object.

    Hook for ``sphinx.ext.linkcode``.

    Args:
        domain: Sphinx domain of the object; only ``"py"`` is handled.
        info: Mapping with the ``"module"`` and ``"fullname"`` of the object.

    Returns:
        URL of the object's source lines at ``git_ref`` in the repository,
        or ``None`` when the domain is not Python or the source cannot be
        located.
    """
    if domain != "py":
        return None

    try:
        obj: Any = sys.modules[info["module"]]
        for part in info["fullname"].split("."):
            obj = getattr(obj, part)
        obj = inspect.unwrap(obj)

        if isinstance(obj, property):
            obj = inspect.unwrap(obj.fget)  # type: ignore

        path = os.path.relpath(inspect.getsourcefile(obj), start=_dynamo_module_path)  # type: ignore
        src, lineno = inspect.getsourcelines(obj)
    except Exception:
        # Builtins, C extensions and dynamically created objects have no
        # retrievable source; emit no link for them.
        return None

    # URLs always use forward slashes, whatever the build platform uses.
    path = path.replace(os.sep, "/")
    path = f"{path}#L{lineno}-L{lineno + len(src) - 1}"
    # ``path`` is relative to the installed ``dynamo`` package directory, so
    # the package directory name must be re-inserted in the repository URL.
    return f"{repository_url}/blob/{git_ref}/dynamo/{path}"


# -- Config for hoverxref -------------------------------------------

hoverxref_default_type = "tooltip"
hoverxref_domains = ["py"]
hoverxref_role_types = dict.fromkeys(
    ["ref", "class", "func", "meth", "attr", "exc", "data", "mod"],
    "tooltip",
)
# Entries must match keys of ``intersphinx_mapping``.
hoverxref_intersphinx = [
    "python",
    "numpy",
    "scanpy",
    "anndata",
    "lightning",
    "scipy",
    "pandas",
    "ml_collections",
    "ray",
]
# use proxied API endpoint on rtd to avoid CORS issues
if os.environ.get("READTHEDOCS"):
    hoverxref_api_host = "/_"
../CONTRIBUTING.md + +``` diff --git a/docs/extensions/edit_colab_url.py b/docs/extensions/edit_colab_url.py new file mode 100644 index 000000000..1762e5689 --- /dev/null +++ b/docs/extensions/edit_colab_url.py @@ -0,0 +1,39 @@ +from sphinx.application import Sphinx + + +def edit_colab_url( + app: Sphinx, + pagename: str, + templatename: str, + context: dict, + doctree: str, +): + """Edit the colab url to point to the correct repo. + + This assumes that the tutorials repo makes the same tag releases as the main repo, + in addition to only using colab urls (no binder or jupyterhub) + + If this code needs updating, see how the sphinx book theme handles launch buttons. + """ + try: + header_buttons = context["header_buttons"] + except KeyError: + return + for button in header_buttons: + # get launch buttons + if button["label"] == "launch-buttons": + # only one items in the launch buttons list as we only use colab + # remove "tutorials/notebooks" from url + button["buttons"][0]["url"] = button["buttons"][0]["url"].replace( + "/docs/tutorials/notebooks", "" + ) + button["buttons"][0]["url"] = button["buttons"][0]["url"].replace( + "scvi-tools", "scvi-tutorials" + ) + + +def setup(app: Sphinx): + """Setup the extension.""" + # Priority is set to 502 to ensure that this runs after the sphinx-book-theme + # The launch buttons are added in the sphinx-book-theme with priority 501 + app.connect("html-page-context", edit_colab_url, priority=502) diff --git a/docs/extensions/typed_returns.py b/docs/extensions/typed_returns.py new file mode 100644 index 000000000..47292453a --- /dev/null +++ b/docs/extensions/typed_returns.py @@ -0,0 +1,35 @@ +# code from https://github.com/theislab/scanpy/blob/master/docs/extensions/typed_returns.py +# with some minor adjustment +from __future__ import annotations + +import re + +from sphinx.ext.napoleon import NumpyDocstring +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from sphinx.application import Sphinx + from collections.abc 
import Generator, Iterable + + +def _process_return(lines: Iterable[str]) -> Generator[str, None, None]: + for line in lines: + if m := re.fullmatch(r"(?P\w+)\s+:\s+(?P[\w.]+)", line): + yield f'-{m["param"]} (:class:`~{m["type"]}`)' + else: + yield line + + +def _parse_returns_section(self: NumpyDocstring, section: str) -> list[str]: + lines_raw = self._dedent(self._consume_to_next_section()) + if lines_raw[0] == ":": + del lines_raw[0] + lines = self._format_block(":returns: ", list(_process_return(lines_raw))) + if lines and lines[-1]: + lines.append("") + return lines + + +def setup(app: Sphinx): + """Set app.""" + NumpyDocstring._parse_returns_section = _parse_returns_section diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 000000000..40a3ab9db --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,2 @@ +# Frequently asked questions + diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..70534dee8 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,74 @@ +# Documentation + +## Dynamo: Mapping Vector Field of Single Cells + +Inclusive model of expression dynamics with metabolic labeling based scRNA-seq / multiomics, vector field reconstruction, potential landscape mapping, differential geometry analyses, and most probably paths / in silico perturbation predictions. + +![dyname-abstract](https://user-images.githubusercontent.com/7456281/152110270-7ee1b0ed-1205-495d-9d65-59c7984d2fa2.png) + +If you find dynamo to be useful for your research, please consider citing the [dynamo manuscript](https://www.sciencedirect.com/science/article/pii/S0092867421015774) + +::::{grid} 1 2 3 3 +:gutter: 2 + +:::{grid-item-card} Installation {octicon}`plug;1em;` +:link: installation +:link-type: doc + +New to _dynamo_? Check out the installation guide. 
+::: + +:::{grid-item-card} User guide {octicon}`info;1em;` +:link: user_guide/index +:link-type: doc + +The user guide provides distilled mathematical descriptions of +the models implemented in dynamo and connects the math +with the code. +::: + +:::{grid-item-card} API reference {octicon}`book;1em;` +:link: api/index +:link-type: doc + +The API reference contains a detailed description of +the dynamo API. +::: + +:::{grid-item-card} Tutorials {octicon}`play;1em;` +:link: tutorials/index +:link-type: doc + +The tutorials walk you through real-world applications of dynamo. +::: + +:::{grid-item-card} Discussion {octicon}`megaphone;1em;` +:link: https://github.com/aristoteleo/dynamo-release/discussions + +Need help? Reach out on our forum to get your questions answered! +::: + +:::{grid-item-card} GitHub {octicon}`mark-github;1em;` +:link: https://github.com/aristoteleo/dynamo-release/ + +Find a bug? Interested in improving dynamo? Checkout our GitHub for the latest developments. +::: +:::: + +```{toctree} +:hidden: true +:maxdepth: 3 +:titlesonly: true + +introduction/index +installation +tutorials/index +faq +user_guide/index +api/index +developer +changelog.md +references +Discussion +GitHub +``` diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 000000000..7e1ee48c2 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,53 @@ +# Installation + +## Quick install + +dynamo can be installed via `conda` or `pip`. We recommend installing into a virtual +environment to avoid conflicts with other packages. + +```bash +conda install -c conda-forge dynamo-release +``` + +or + +```bash +pip install dynamo-release +``` + +To install the newest version of dynamo, you can git clone our repo and then pip install: + +```bash +git clone https://github.com/aristoteleo/dynamo-release.git +pip install dynamo-release/ --user +``` + +Don't know how to get started with virtual environments or `conda`/`pip`? 
Check out the +[prerequisites](#prerequisites) section. + +## Prerequisites + +### Virtual environment + +A virtual environment can be created with either `conda` or `venv`. We recommend using `conda`. We +currently support Python 3.10 - 3.12. + +For `conda`, we recommend using the [Miniforge](https://github.com/conda-forge/miniforge) +distribution, which is generally faster than the official distribution and comes with conda-forge +as the default channel (where dynamo is hosted). + +```bash +conda create -n dynamo-env python=3.10 # any python 3.10 to 3.12 +conda activate dynamo-env +``` + +For `venv`, we recommend using [uv](https://github.com/astral-sh/uv). + +```bash +pip install -U uv +uv venv .dynamo-env +source .dynamo-env/bin/activate # for macOS and Linux +.dynamo-env\Scripts\activate # for Windows +``` + + diff --git a/docs/introduction/dynamo_paper_figures/fig6_a.png b/docs/introduction/dynamo_paper_figures/fig6_a.png new file mode 100644 index 000000000..dd7cd9d18 Binary files /dev/null and b/docs/introduction/dynamo_paper_figures/fig6_a.png differ diff --git a/docs/introduction/dynamo_paper_figures/fig6_b.png b/docs/introduction/dynamo_paper_figures/fig6_b.png new file mode 100644 index 000000000..324b61324 Binary files /dev/null and b/docs/introduction/dynamo_paper_figures/fig6_b.png differ diff --git a/docs/introduction/dynamo_paper_figures/fig6_c.png b/docs/introduction/dynamo_paper_figures/fig6_c.png new file mode 100644 index 000000000..8b6fd7bcf Binary files /dev/null and b/docs/introduction/dynamo_paper_figures/fig6_c.png differ diff --git a/docs/introduction/dynamo_paper_figures/fig7_a.png b/docs/introduction/dynamo_paper_figures/fig7_a.png new file mode 100644 index 000000000..365adc08b Binary files /dev/null and b/docs/introduction/dynamo_paper_figures/fig7_a.png differ diff --git a/docs/introduction/index.md b/docs/introduction/index.md new file mode 100644 index 000000000..31745a6fa --- /dev/null +++ b/docs/introduction/index.md @@ -0,0 +1,16
@@ +# Introduction + +Single-cell (sc)RNA-seq, together with RNA velocity and metabolic labeling, reveals cellular states and transitions at unprecedented resolution. Fully exploiting these data, however, requires kinetic models capable of unveiling governing regulatory functions. Here, we introduce an analytical framework dynamo, which infers absolute RNA velocity, reconstructs continuous vector fields that predict cell fates, employs differential geometry to extract underlying regulations, and ultimately predicts optimal reprogramming paths and perturbation outcomes. We highlight dynamo’s power to overcome fundamental limitations of conventional splicing-based RNA velocity analyses to enable accurate velocity estimations on a metabolically labeled human hematopoiesis scRNA-seq dataset. Furthermore, differential geometry analyses reveal mechanisms driving early megakaryocyte appearance and elucidate asymmetrical regulation within the PU.1-GATA1 circuit. Leveraging the least-action-path method, dynamo accurately predicts drivers of numerous hematopoietic transitions. Finally, in silico perturbations predict cell-fate diversions induced by gene perturbations. Dynamo, thus, represents an important step in advancing quantitative and predictive theories of cell-state transitions. + + +```{toctree} +:maxdepth: 2 + +index_time +index_geo +index_cellfate +index_silico + +``` + + diff --git a/docs/introduction/index_cellfate.md b/docs/introduction/index_cellfate.md new file mode 100644 index 000000000..d33c9af2e --- /dev/null +++ b/docs/introduction/index_cellfate.md @@ -0,0 +1,116 @@ + +# Optimal cell fate transitions via most probable path + +The ability to drive conversion between different cell states has garnered a great deal of attention as a promising avenue for disease modeling. A fundamental challenge in the field of stem cell biology is, thus, to assess the feasibility and identify optimal paths and key transcription factors (TFs) of such inter-conversions. 
We summarize this grand problem of predicting optimal cell fate conversions (OPCs) in [the figure below](#lap_theory_dynamo_paper_fig6_a). + +![The grand problem of predicting OPtimal cell-fate Conversions(OPCs).](dynamo_paper_figures/fig6_a.png){:align="center" width="400"} + +The least action path (LAP) principle, first proposed as early as 1744 by Maupertuis ([Terrall](#)) and famously advocated by Feynman with his reformulation of quantum mechanics via the path integral of the classical Hamilton action [Feynman, 1965](#), has previously been used in predicting the optimal transition path of cell fate transition for simplistic and designed systems [Qiu et al., 2012; Wang et al., 2014; Wang et al., 2017](#). We reason that with the reconstructed continuous and differentiable vector field, we can extend the LAP approach to real datasets in transcriptomic space to computationally explore optimal paths for differentiation and reprogramming (dedifferentiation and transdifferentiation), which then helps us identify key transcription factors whose expression levels vary most strongly along these paths. + +The hematopoietic scNT-seq dataset we generated in this study is well suited for testing LAP. Among the cell types from our tscRNA-seq data [developmental tree](#lap_theory_dynamo_paper_fig6_b), there are five developmental events (from HSC to each of the terminal cell types), one reported dedifferentiation event (from Meg to HSC), and a total of eight reported transdifferentiation events. Considering all-against-all conversions, we are left with 18 unreported transitions between different mature cell types [transition matrix](#lap_theory_dynamo_paper_fig6_b). + +![Predicting OPCs for hematopoietic cell types.](dynamo_paper_figures/fig6_b.png) + +Here we first briefly introduce the intuition of the LAP and what we can do with it.
Intuitively, the optimal path between any two cell states (e.g., the fixed point of HSCs and that of megakaryocytes) is searched by varying the continuous path connecting the source state to the target while minimizing its action and updating the associated transition time [LAP](#lap_theory_dynamo_paper_fig6_c). The resultant LAP has the highest transition probability and is associated with a particular transition time. In order to identify the associated key regulators, we focus only on TFs and rank them by the path integral of the mean square displacement (MSD) of gene expression with respect to the initial expression. + +![The optimal paths for hematopoietic transitions can be found by identifying the LAPs between the fixed points that correspond to each stable cell type.](dynamo_paper_figures/fig6_c.png){:align="center" width="400"} + +Given the vector field function, $\boldsymbol{f}$, optimal pathways of cell fate conversion can be mathematically analyzed by least action paths (LAPs) [Freidlin & Wentzell, 2012; Onsager, 1953; Maier & Stein, 1997](#). The action is defined as: + +```math +S_T(\boldsymbol{x}) = \frac{1}{2} \int_{0}^{T}\mathrm{d}t \left(\boldsymbol{v}(t) - \boldsymbol{f}(\boldsymbol{x}(t))\right)^\mathsf{T} \boldsymbol{D}^{-1} \left(\boldsymbol{v}(t) - \boldsymbol{f}(\boldsymbol{x}(t))\right), +``` + +where $\boldsymbol{x}$ is a path and $\boldsymbol{v}$ is $\boldsymbol{x}$'s tangential velocity (the path is parametrized by time $t$, so $\boldsymbol{v}(t) = \dot{\boldsymbol{x}}(t)$). $\boldsymbol{D}$ is the diffusion coefficient matrix accounting for the stochasticity of gene expression, and for simplicity here we assume it to be a constant. $T$ is the time needed for a cell to traverse the path. By this definition, a path that strictly follows a streamline of a vector field whose tangential velocity also equals the evaluated velocity of the vector field has zero action, whereas any deviation increases action. 
In other words, developmental processes are (mostly) spontaneous and driven by intrinsic cell states, whereas dedifferentiation requires external forces such as ectopic expression of exogenous TFs or specific chemical inductions. + +Computationally, given the starting and end cell states $\boldsymbol{x}_0$ and $\boldsymbol{x}_{n}$, such as HSCs and megakaryocytes, and a specific traversal time $T$, the LAP can be found by discretizing the path as a sequence of points $P=\{\boldsymbol{x}_0, \boldsymbol{x}_1, \dots, \boldsymbol{x}_n\}$, which forms $n$ line segments. For each line segment, the discrete tangential velocity can be calculated as $\boldsymbol{v}_k = (\boldsymbol{x}_k-\boldsymbol{x}_{k-1})/\Delta t$, where $\Delta t = T/n$. The action along the discrete path is defined as [Perez-Carrasco et al., 2016; Tang et al., 2017](#): + +```math +S_T(P) = \frac{1}{2D}\sum_{k=1}^{n} \left(\boldsymbol{v}_k - \boldsymbol{f}(\boldsymbol{y}_k)\right)^2\Delta t, +``` + +where $\boldsymbol{y}_k$ are the middle points of the line segments, i.e., $\boldsymbol{y}_k = (\boldsymbol{x}_{k-1} + \boldsymbol{x}_k)/2$. Given a traversal time $T$, the LAP is a path such that: + +```math +P^* = \underset{P}{\operatorname{argmin}}\ S_T(P) = \underset{P}{\operatorname{argmin}}\ \frac{1}{2D}\sum_{k=1}^{n} \left(\boldsymbol{v}_k - \boldsymbol{f}(\boldsymbol{y}_k)\right)^2\Delta t. +``` + +To obtain the global LAP, the optimal traversal time $T^*$ is determined as: + +```math +T^* = \underset{T}{\operatorname{argmin}}\ S_T(P). +``` + +The algorithm discretizes the path as a sequence of points, $P=\{\boldsymbol{x}_0, \boldsymbol{x}_1, \dots, \boldsymbol{x}_n\}$, which forms $n$ line segments. For each line segment, the discrete tangential velocity can be calculated as $\boldsymbol{v}_k=(\boldsymbol{x}_k - \boldsymbol{x}_{k-1})/\Delta t$, where $\Delta t$ is the time step for the cell to move from $\boldsymbol{x}_{k-1}$ to $\boldsymbol{x}_k$.
In addition to the deterministic vector field, we also assume a certain degree of stochasticity in the system: + +```math +\dot{\boldsymbol{x}} = \boldsymbol{f}(\boldsymbol{x}) + \sigma \boldsymbol{\eta}(t), +``` + +where $\boldsymbol{\eta}(t)$ is a stochastic white noise and $\boldsymbol{\sigma}$ the size of it. The action $S$ along the discrete path is defined as (Perez-Carrasco et al., 2016): + +```math +S(P, \Delta t) = \frac{1}{2D}\sum_{k=1}^{n}\left(\boldsymbol{v}_k - \boldsymbol{f}(\boldsymbol{y}_k)\right)^2\Delta t, +``` + +where $\boldsymbol{y}_k$ are the middle points of the line segments, i.e., $\boldsymbol{y}_k = (\boldsymbol{x}_{k-1} + \boldsymbol{x}_k)/2$. We have also assumed the diffusion matrix to be a constant $D$, such that $D=\sigma^2/2$. It is intuitive that a path whose tangential velocities $\boldsymbol{v}$ align with the vector field has smaller action than paths that do not. The LAP is a path such that: + +```math +P^* = \underset{P, \Delta t}{\operatorname{argmin}} S(P, \Delta t) = \underset{P, \Delta t}{\operatorname{argmin}}\frac{1}{2D}\sum_{k=1}^{n}\left(\boldsymbol{v}_k - \boldsymbol{f}(\boldsymbol{y}_k)\right)^2\Delta t, +``` + +The algorithm for finding the LAP therefore consists of two steps: + +- Minimization of the action by varying the time step. The optimal time step given a fixed path is a simple univariate least square minimization, i.e.: + +```math +\Delta t^* = \underset{\Delta t}{\operatorname{argmin}}\frac{1}{2D}\sum_{k=1}^{n}\left(\frac{\boldsymbol{x}_k - \boldsymbol{x}_{k-1}}{\Delta t} - \boldsymbol{f}(\boldsymbol{y}_k)\right)^2\Delta t, +``` + +- Minimization of the action by varying the path without moving the starting and end points. 
The optimal path given a fixed time step is found by: + +```math +P^* = \underset{\{\boldsymbol{x}_1, \boldsymbol{x}_2, \dots, \boldsymbol{x}_{n-1}\}}{\operatorname{argmin}}\frac{1}{2D}\sum_{k=1}^{n}\left(\frac{\boldsymbol{x}_k - \boldsymbol{x}_{k-1}}{\Delta t} - \boldsymbol{f}\left(\frac{\boldsymbol{x}_{k-1} + \boldsymbol{x}_k}{2}\right)\right)^2\Delta t, +``` + +For a $d$-dimensional vector field, the number of variables in the above optimization problem is $d\times n$. To mitigate the computational cost, the Jacobian of the action w.r.t. the path (more specifically, the a-th component of the $k$-th point) is analytically computed: + +```math +\frac{\partial{S}}{\partial{x_k^a}} = \frac{1}{D}\left(v_k^a - v_{k+1}^a + f^a(\boldsymbol{y}_{k+1}) - f^a(\boldsymbol{y}_k)\right) -\frac{1}{2D}\left(\left(\boldsymbol{v}_{k+1} - \boldsymbol{f}(\boldsymbol{x}_{k+1})\right) \cdot \frac{\partial{f}}{\partial{x^a}}\Big|_{\boldsymbol{x}_{k+1}} + \left(\boldsymbol{v}_k - \boldsymbol{f}(\boldsymbol{x}_k)\right)\cdot\frac{\partial f}{\partial{x^a}}\Big|_{\boldsymbol{x}_k}\right) +``` + +Note that the partial derivative of the vector field is the $a$-th row of the Jacobian of the vector field. With the analytical Jacobian, the computation efficiency of the LAP optimization improves tremendously, making the LAP calculation feasible to operate in high-dimensional space, such as the top 30 PCs. + +The LAP is found by iterating between the two steps, and empirically we found that the path converges in two or three iterations. By default, the LAP optimization is initialized with the interpolated shortest path on the kNN graph of cells. + +Notably, when LAPs are calculated in the PCA space, we can transform them back to the original gene expression space to predict the full transcriptomic kinetics along the optimal path, inspect waves of those kinetics along the path, and do so in absolute time units when the vector field used is based on tscRNA-seq. 
+ +For rare transitions with $S_{T^*} \gg 0$ (e.g., dedifferentiation and transdifferentiation), the transition rate (number of transitions per unit time) is proportional to the exponential of actions of all paths. The Freidlin–Wentzell theorem dictates that the LAP with the minimal traversal time (which will be referred to as the optimal path below) contributes the most to this transition rate [Freidlin & Wentzell, 2012; Onsager, 1953; Maier & Stein, 1997; Aurell et al., 2002](#): + +```math +R(A\rightarrow B) \approx C\exp(-S_{T^*}), +``` + +where $A$ and $B$ are two cell types, $S_{T^*}$ the action of the optimal path, and $C$ a proportional factor. Furthermore, the transition time, or more specifically the mean first passage time (MFPT), is related to the transition rate: + +```math +\mathrm{MFPT} = \frac{1}{R(A\rightarrow B)} +``` + +Therefore, the action of the optimal path predicts both the likelihood and transition time for such rare transitions. Again, most reprogramming experiments take a few weeks or months, depending on the exact initial and terminal cell states [Takahashi & Yamanaka, 2006](#). + +For natural transitions between points that are connected by the vector field streamlines (e.g., from a repulsor to an adjacent attractor), the actions of LAPs, within a certain range of $T$, are all zero, because a path following the streamline downstream is a LAP with zero action. The above approximation that the LAP contributes the most to the transition rate no longer applies. Differentiation processes are often close to such natural transitions, and the action of a differentiation LAP cannot tell us any information on the transition rate. However, LAPs are still the most probable paths for cells to take, as they are optimized to follow the streamline of the vector field. The waiting time for the cell to initiate the transition is negligible in this case, so the transition time can be approximated by the traversal time of the LAP. 
+ +In addition to the computation of transition time and traversal time (see below), analyzing gene expression variations along LAPs provides essential information on regulatory genes, and their dynamics, during cell fate transitions. + +- Transition time: the expected waiting time for a cell to initiate and finish the transition between two states, regardless of the path it takes. This corresponds to the experimentally measured time for one cell type to commit into another. + +- Traversal time: the time the cell spends traveling along a specific path. Theoretically, this is the time for a single cell to complete the cell type conversion once the cell has decided on the commitment. + +We calculate the mean squared displacement (MSD) for every gene $i$ along the optimal path: + +```math +\mathrm{MSD}_i = \sum_{t=0}^{T} \big(y_i(t) - y_i(0)\big)^2 +``` + +Genes with large MSD are potentially genes that regulate the corresponding transitions. \ No newline at end of file diff --git a/docs/introduction/index_geo.md b/docs/introduction/index_geo.md new file mode 100644 index 000000000..87464e1f3 --- /dev/null +++ b/docs/introduction/index_geo.md @@ -0,0 +1,75 @@ + +
+ + Open In Colab + + Open In nbviewer +
+ +# Primer on differential geometry + +In this work, we introduced **dynamical systems theory** and **differential geometry** analysis to single-cell genomics. A dynamical system describes the time dependence of points in a geometrical space, e.g., planetary motion or cell fate transitions, whereas differential geometry uses the techniques of differential/integral calculus and linear/multilinear algebra to study problems in geometry, e.g., the topology or geometric features along a streamline in the vector field of the gene expression space. + +A vector field function $\mathbf{f}$, a fundamental topic of dynamical systems theory, takes spatial coordinates $\mathbf{x}$ (e.g., single-cell expression in gene state space) in a high-dimensional space (each gene corresponds to a dimension) as input and outputs a vector $\mathbf{v}$ (e.g., the gene expression velocity vector of a single cell) in the same space, i.e. $\mathbf{v} = \mathbf{f}(\mathbf{x})$. In this study, we specifically discuss velocity vector fields that can be used to derive acceleration and curvature vector fields (see **below**). With analytical velocity vector field functions, including the ones that we learned directly from data, we can move beyond velocity to high-order quantities, including the Jacobian, divergence, acceleration, curvature, curl, etc., using theories developed in differential geometry. The discussion of the velocity vector field in this study focuses on transcriptomic space; vector fields, however, are generally applicable to other spaces, such as morphological, proteomic, or metabolic space.
+ +Because $\mathbf{f}$ is a vector-valued multivariate function, a $d\times d$ matrix encoding its derivatives, called the *Jacobian*, plays a fundamental role in differential geometry analysis of vector fields: + +\[ +\mathbf{J} = \begin{bmatrix} +\dfrac{\partial f_1}{\partial x_1} & \dfrac{\partial f_1}{\partial x_2} & \cdots & \dfrac{\partial f_1}{\partial x_d} \\ +\dfrac{\partial f_2}{\partial x_1} & \dfrac{\partial f_2}{\partial x_2} & \cdots & \dfrac{\partial f_2}{\partial x_d} \\ +\vdots & \vdots & \ddots & \vdots \\ +\dfrac{\partial f_d}{\partial x_1} & \dfrac{\partial f_d}{\partial x_2} & \cdots & \dfrac{\partial f_d}{\partial x_d} +\end{bmatrix} +\] + +A Jacobian element $\partial f_i/\partial x_j$ reflects how the velocity of $x_i$ is impacted by changes in $x_j$. + +## Box Fig. 1. Divergence, curl, acceleration and curvature of vector field. + +![Box Fig. 1](https://raw.githubusercontent.com/Xiaojieqiu/jungle/master/Box1.png) + +The trace of the Jacobian is divergence: +\[ +\nabla \cdot \mathbf{f} = \sum_{i=1}^{d}\dfrac{\partial f_i}{\partial x_i} = \mathrm{tr} \mathbf{J} +\] + +Divergence measures the degree of “outgoingness” at any point, summarized in **Box Fig. 1A**. + +By definition, an attractor (repulsor) converges (diverges) in any direction. Note that it is possible to have a point where the vectors converge in one direction but diverge in another, a case that is not depicted in the diagram above. This means that although an attractor (repulsor) always has negative (positive) divergence, the opposite does not necessarily hold. + +*Curl* is a quantity measuring the degree of rotation at a given point in the vector field. It is well-defined only in two or three dimensions (e.g. 
two or three reduced principal components or UMAP components): + +\[ +\nabla \times \mathbf{f} = \begin{bmatrix} +\dfrac{\partial f_z}{\partial y} - \dfrac{\partial f_y}{\partial z} \\ +\dfrac{\partial f_x}{\partial z} - \dfrac{\partial f_z}{\partial x} \\ +\dfrac{\partial f_y}{\partial x} - \dfrac{\partial f_x}{\partial y} +\end{bmatrix} +\] + +The behavior of curl is summarized in **Box Fig. 1B**. + +Many differential geometry quantities are defined on *streamlines*, which are curves everywhere tangent to the vector field. The streamlines can be parametrized with time $t$, denoted $\mathbf{x}(t)$, as they are essentially trajectories of cells moving in the vector field. In practice, they are often calculated using numerical integration methods, e.g., the Runge–Kutta algorithm. The *acceleration* is the time derivative of the velocity, as shown in **Box Fig. 1C** (orange shade), and can be defined as: + +\[ +\mathbf{a} = \dfrac{\mathrm{d} \mathbf{v}}{\mathrm{d} t} = \dfrac{\mathrm{d}}{\mathrm{d} t}\mathbf{f}\Big(\mathbf{x}(t)\Big) = \sum_{i=1}^{d} \dfrac{\partial \mathbf{f}}{\partial x_i}\dfrac{\partial x_i}{\partial t} = \mathbf{J} \mathbf{v} +\] + +The curvature vector (**Box Fig. 1C**, green shade) of a curve is defined as the derivative of the unit tangent vector $\left(\frac{\mathrm{d}}{\mathrm{dt}}\frac{\mathbf{v}}{|\mathbf{v}|}\right)$, divided by the length of the tangent ($|\mathbf{v}|$): + +\[ +\kappa = \dfrac{1}{|\mathbf{v}|}\dfrac{\mathrm{d}}{\mathrm{d} t}\dfrac{\mathbf{v}}{|\mathbf{v}|} = \dfrac{\mathbf{J}\mathbf{v}(\mathbf{v}\cdot \mathbf{v}) - \mathbf{v}(\mathbf{v}\cdot \mathbf{J}\mathbf{v})}{|\mathbf{v}|^4} +\] + +In the context of velocity vector fields and streamlines, the unit tangent vector is the normalized velocity. + +By definition, acceleration measures the rate of change of velocity in terms of both its magnitude and direction. Curvature, on the other hand, measures only the change in direction, as the velocity vector is normalized. 
**Box Fig. 1C** (green shade) illustrates how the acceleration can be decomposed into a tangential and a radial component, and the latter is connected to the curvature: + +\[ +\mathbf{a} = \mathbf{a}_t + |\mathbf{v}|^2\mathbf{\kappa} +\] + +Although acceleration and curvature are mathematically defined on streamlines, the actual calculation, as shown above, can be done pointwise using only the velocity and the Jacobian evaluated at the point of interest. Because the acceleration or the curvature can be calculated for any point in the state space, one obtains the acceleration or curvature vector field. + +Other relevant differential geometric analyses, including torsion (applicable to three-dimensional vector field), vector Laplacian, etc., can also be computed using vector field functions, although they were not extensively studied in this work. diff --git a/docs/introduction/index_silico.md b/docs/introduction/index_silico.md new file mode 100644 index 000000000..995329fb7 --- /dev/null +++ b/docs/introduction/index_silico.md @@ -0,0 +1,44 @@ + +# *In silico* perturbation + +\[ +\newcommand{\pdv}[2]{\dfrac{\partial #1}{\partial #2}} \newcommand{\trp}{\mathsf{T}} +\] + +We leverage the analytical Jacobian of the reconstructed vector field function to make *in silico* genetic perturbations (left panel in [this figure](#dynamo_fig7_a)) and predict cell-fate outcomes after the perturbation (right panel in [this figure](#dynamo_fig7_a)). + +![*In silico* genetic perturbation of the velocity vector field.](dynamo_paper_figures/fig7_a.png){:align="center"} + +Intuitively, to simulate the genetic perturbation effects, we will introduce genetic perturbations to the system (encoded by the perturbation vector) and then let the perturbations propagate in the gene regulatory network (encoded by the Jacobian matrix) to execute downstream responses. 
Mathematically, for gene \(i\) in any cell, the genetic perturbation effects or changes in its velocity (or more accurately, the vector field) w.r.t. small perturbations in the expression of all genes in the network (encoded by the Jacobian matrix \(\boldsymbol J\)), \(\mathrm dx_1\), \(\mathrm dx_2\),…, \(\mathrm dx_n\), can be calculated with the *exact differential*: + +```math +\mathrm{d} f_i = \pdv{f_i}{x_1}\mathrm{d}x_1 + \pdv{f_i}{x_2}\mathrm{d}x_2 + \ldots + \pdv{f_i}{x_n}\mathrm{d}x_n. +``` + +In vectorized form: + +```math +\begin{bmatrix} \mathrm{d}f_1 \\[1.5ex] \mathrm{d}f_2 \\[1.5ex] \dots \\[1.5ex] \mathrm{d}f_n \end{bmatrix} = \begin{bmatrix} \pdv{f_1}{x_1} & \pdv{f_1}{x_2} & \dots & \pdv{f_1}{x_n} \\[2ex] \pdv{f_2}{x_1} & \pdv{f_2}{x_2} & \dots & \pdv{f_2}{x_n} \\[2ex] \dots & \dots & \dots & \dots \\[2ex] \pdv{f_n}{x_1} & \pdv{f_n}{x_2} & \dots & \pdv{f_n}{x_n} \end{bmatrix} \begin{bmatrix} \mathrm{d}x_1 \\[1.5ex] \mathrm{d}x_2 \\[1.5ex] \dots \\[1.5ex] \mathrm{d}x_n \end{bmatrix}. +``` + +The matrix on the right hand side is the Jacobian of the vector field. Replacing infinitesimal changes with finite perturbations, the above equation becomes: + +```math +\Delta \boldsymbol{f} = \boldsymbol{J} \Delta \boldsymbol{x}. +``` + +In practice, a proportionality constant \(c\) (e.g., setting a perturbation to be 100 or -100) is often applied to the perturbation \(\Delta \boldsymbol{x}\) to amplify the response \(\Delta \boldsymbol{f}\). Furthermore, because vector fields are often learned in the PCA space, the perturbations in the \(d\)-dimensional gene space are first transformed to the \(k\)-dimensional PCA space by: + +```math +\Delta \boldsymbol{x} = \boldsymbol{Q}^\trp (\Delta \boldsymbol{y} - \boldsymbol{\mu}), +``` + +where \(\boldsymbol{Q}\) is the \(d\)-by-\(k\) PCA loading matrix, and \(\boldsymbol{\mu}\) is the mean of the PCA-transformed data.
The response \(\Delta \boldsymbol{f}\) can be transformed back to the gene space: + +```math +\Delta \boldsymbol{g} = \boldsymbol{Q} \Delta \boldsymbol{f} + \boldsymbol{\mu}. +``` + +One can then use \(\Delta \boldsymbol{g}\), a gene by cell matrix, to identify the strongest positive or negative responders of the genetic perturbation across cells. + +Importantly, because \(\Delta \boldsymbol{f}\) implies how each cell state will be affected after genetic perturbations, we can predict the cell fate trajectory under genetic perturbations by integrating the perturbation effects across cells over gene expression space. To visualize the cell fate trajectory, pairs of \(\boldsymbol{x}\) and \(\Delta \boldsymbol{g}\) are used in the same vein as the gene expression and RNA velocity vector to be further projected onto the UMAP or other low dimensional embeddings using the transition matrix [Bergen et al., 2020; La Manno et al., 2018](#) and then plotted with streamlines. diff --git a/docs/introduction/index_time.md b/docs/introduction/index_time.md new file mode 100644 index 000000000..2a287aa7c --- /dev/null +++ b/docs/introduction/index_time.md @@ -0,0 +1,63 @@ + +
+ + Open In Colab + + Open In nbviewer +
+ +# Time-resolved scRNA-seq + +## Seminal RNA velocity limitations + +Although the seminal RNA velocity work is exciting, it has the following limitations: + +1. It can only predict short-term direction and magnitude of RNA dynamics. +2. It is mostly a descriptive instead of a predictive tool. +3. It relies on the `mis-priming` of intron reads for current single-cell platforms and thus the intron measures are biased and inaccurate. +4. RNA velocity was estimated as \(U - (\gamma / \beta) S\) (\(U\): unspliced RNA, \(S\): spliced RNA, \(\gamma\): degradation rate, \(\beta\): splicing rate, \(\gamma / \beta\): the slope of the fit to steady-state cells); it is thus scaled by the splicing rate and lacks real physical units (e.g., molecules / hour). + +We reason that metabolic labeling based methods, which measure both the historical (old) and the nascent (new) RNA of cells in a controllable way, will provide better measurements for RNA velocity and transcriptomic dynamics. When extending metabolic labeling to single cell RNA-seq, labeling based scRNA-seq essentially measures two modalities or timepoints for the same cell. + +## How does metabolic labeling work + +How can we quantify nascent RNA via metabolic labeling? Overall, there are two different methods: the biotin purification based approach and the chemical conversion based approach. Both approaches are quite similar in that we first need to apply different labeling strategies to label the cells. For biotin purification, we need to use thiol-specific biotinylation to tag labeled mRNA. Then the streptavidin beads can be used to pull down and separate the pre-existing RNA and newly transcribed RNA. We then prepare two separate libraries, old and new RNAs, for sequencing. There are a few very well-known issues regarding this method: + +1. It often introduces 20-30% cross-contamination between old and new RNAs. +2. It also leads to some normalization issues between different libraries.
+ +On the other hand, the chemical conversion based approaches avoid the laborious and error-prone procedure of separating old/new RNA and preparing two different libraries, and have emerged as the favored strategy recently. The key idea of chemical conversion based methods is that, by some chemical reaction, we can artificially introduce T-to-C mutations which can then be used to distinguish labelled (and thus new) RNA from old RNA. About three different chemistries have been developed: IAA alkylation, or hydrogen bond reconfiguration via the TimeLapse-seq or TUC-seq chemistry. + +In fact, metabolic labeling has been widely adopted for the past few decades. We can use various nucleotides to label RNA, for example, BrU, EU and Biotin-NTP. For 4sU based labeling, there are about three different strategies, namely, SLAM-seq, TUC-seq, and TimeLapse-seq. + +![Metabolic labeling](https://user-images.githubusercontent.com/7456281/93838316-346af300-fc57-11ea-9cf9-79d37d8ff927.png) + +## Metabolic labeling based scRNA-seq + +Recently, a few groups adapted the bulk method to plate-based scRNA-seq with the SMART-seq2 method, for example, [scSLAM-seq](https://www.nature.com/articles/s41586-019-1369-y) or [NASC-seq](https://www.nature.com/articles/s41467-019-11028-9). [scEU-seq](https://science.sciencemag.org/content/367/6482/1151.full) is based on [CEL-Seq2](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0938-8) and is also plate-based but uses UMI in contrast to scSLAM-seq or NASC-seq. The scEU-seq method is based on EU and requires purification, and thus may involve cross-contamination or normalization issues.
+ +[Cao et al.](https://www.nature.com/articles/s41587-020-0480-9) recently developed sci-fate, which integrates 4sU labeling and combinatorial indexing based scRNA-seq so it can potentially enable measuring hundreds of thousands of single cells. + +For the first time, [Wu lab](https://www.wulabupenn.org/) from UPenn developed a Drop-seq based metabolic labeling scRNA-seq method, scNT-seq. + +![scNT-seq](https://user-images.githubusercontent.com/7456281/93839221-4601ca00-fc5a-11ea-8e56-d39ec1725a6b.png) + +## Comparison between different labeling based scRNA-seq methods + +In [Qiu, Hu, et al.](https://www.nature.com/articles/s41592-020-0935-4), we performed a detailed comparison (Supplementary table 7) of scNT-seq with other available methods. In particular, with the improved second-strand synthesis based strategy, we are able to obtain a substantially higher number of genes and UMIs per cell with a relatively small number of reads. Thus scNT-seq is arguably one of the best metabolic labeling based scRNA-seq strategies. + +In our study, we show that dynamo can be used to leverage scNT-seq datasets for time-resolved RNA-velocity analysis. Those results demonstrate the power of dynamo and scNT-seq in revealing the fine-grained transcriptomic dynamics. + +![Comparison](https://user-images.githubusercontent.com/7456281/93838287-1b624200-fc57-11ea-9674-76006ba07950.png) + +## Labeling strategies + +We can be very creative and smart in designing the metabolic labeling experiments. For example, you can design an experiment where you take different days and perform a kinetic experiment on each day. This can help you obtain the transcription, splicing and degradation rates over time. But this is often time-consuming, so we may just choose a typical day for a single kinetic experiment.
In addition, we may also perform a degradation experiment where we label the cells with 4sU for an extended time period to saturate the 4sU labeling in cells. Then we can wash out the 4sU and replace it with excess U, followed by chasing at different time points. This can help us to estimate the splicing and degradation rates (and half-life) of RNA. We can also just design a one-shot labeling experiment to label cells at different time points. Since the splicing and degradation rates of mRNA are often constant, by combining one-shot experiments with degradation experiments we are able to get even more accurate estimates of the transcription rate at each time point. We also want to note that we can combine different labeling strategies, for example, combining pulse and chase in a single experiment or integrating metabolic labeling with drug treatment or genetic perturbations. + +![Labeling strategies](https://user-images.githubusercontent.com/7456281/93838322-392fa700-fc57-11ea-9019-e76358160f57.png) + +## Dynamo’s comprehensive model framework for analyzing labeling datasets + +In order to fully take advantage of the scSLAM-seq data, we recently developed a sophisticated framework, dynamo, that provides an inclusive model of expression dynamics with scSLAM-seq and multiomics, vector field reconstruction and potential landscape mapping. In dynamo, we abstract every step from RNA transcription, splicing, metabolic labeling, translation and RNA or protein degradation. We can model the mean and variance of RNA species via a set of moment equations; we then transform them into a matrix format and solve them efficiently. In dynamo, we also implemented the traditional RNA velocity method based on the steady state assumptions to support analyzing regular 10x data. Similarly, dynamo supports studying CITE-seq data to estimate protein velocity.
+ +![Dynamo framework](https://user-images.githubusercontent.com/7456281/93838322-392fa700-fc57-11ea-9019-e76358160f57.png) diff --git a/docs/introduction/lap_tutorial/bas-hsc-kinetic.png b/docs/introduction/lap_tutorial/bas-hsc-kinetic.png new file mode 100644 index 000000000..1efbae01d Binary files /dev/null and b/docs/introduction/lap_tutorial/bas-hsc-kinetic.png differ diff --git a/docs/introduction/lap_tutorial/lap_tutorial.rst b/docs/introduction/lap_tutorial/lap_tutorial.rst new file mode 100644 index 000000000..648817c09 --- /dev/null +++ b/docs/introduction/lap_tutorial/lap_tutorial.rst @@ -0,0 +1,1221 @@ +.. raw:: html + +
+ + Open In Colab + + Open In nbviewer +
+ +Most probable path predictions +============================== + +Introduction +~~~~~~~~~~~~ + +| The ability to drive conversion between different cell states has + garnered a great deal of attention as a promising avenue for disease + modeling. A fundamental challenge in the field + of stem cell biology is to identify and assess the feasibility of + optimal paths and key TFs (transcription factors) of such + interconversions (Figure 6A of :cite:p:`QIU2022`). The least action path (LAP) is a principled method that has previously been used in + theoretical efforts to predict the most probable path a cell will + follow during fate transition. Specifically, the optimal path between any two cell states + (e.g. the fixed point of HSCs and that of megakaryocytes) is searched + by variating the continuous path connecting the source state to the + target while minimizing its action and updating the associated + transition time. The resultant least action path has the highest + transition probability and is associated with a particular transition + time. Once the LAP is identified, we can focus only on TFs and rank them by the path integral of the mean square displacement (MSD) of gene expression with respect to the initial expression to identify key transcription factors of the associated cell fate transitions. + +| In this tutorial, we will demonstrate how to +- perform LAP analyses; +- visualize transition paths found by the LAP approach on the vector field; +- plot heatmaps of actions and transition times matrix between all hematopoietic cell types; +- prioritize transcription factors of each predicted optimal path; +- ROC analyses of the LAP predictions. + +Import relevant packages + +.. code:: ipython3 + + import numpy as np + import pandas as pd + import seaborn as sns + import matplotlib.pyplot + import matplotlib.pyplot as plt + + import sys + import os + + import dynamo as dyn + + dyn.dynamo_logger.main_silence() + + + +.. 
parsed-literal:: + + |-----> setting visualization default mode in dynamo. Your customized matplotlib settings might be overritten. + + +Let us first load the human hematopoiesis scNT-seq dataset, which has stored as a sample dataset within dynamo that can be download directly using the above function, produced in this study (:cite:p:`QIU2022`). In this tutorial we will focus on analyzing this scNT-seq dataset because decades of researches in hematopoiesis make it a well suited system for testing LAP predictions. + +.. code:: ipython3 + + adata_labeling = dyn.sample_data.hematopoiesis() + +Let us take a glance at what is in ``adata`` object. Preprocessing, normalization, umap dimension reduction, total RNA velocity, as well as the continous RNA velocity vector field are computed (notebooks on these operations will be released shortly. Please also check other existing notebooks for these operations). + +.. code:: ipython3 + + adata_labeling + + +.. parsed-literal:: + + AnnData object with n_obs × n_vars = 1947 × 1956 + obs: 'batch', 'time', 'cell_type', 'nGenes', 'nCounts', 'pMito', 'pass_basic_filter', 'new_Size_Factor', 'initial_new_cell_size', 'total_Size_Factor', 'initial_total_cell_size', 'spliced_Size_Factor', 'initial_spliced_cell_size', 'unspliced_Size_Factor', 'initial_unspliced_cell_size', 'Size_Factor', 'initial_cell_size', 'ntr', 'cell_cycle_phase', 'leiden', 'umap_leiden', 'umap_louvain', 'control_point_pca', 'inlier_prob_pca', 'obs_vf_angle_pca', 'pca_ddhodge_div', 'pca_ddhodge_potential', 'umap_ddhodge_div', 'umap_ddhodge_potential', 'curl_umap', 'divergence_umap', 'control_point_umap', 'inlier_prob_umap', 'obs_vf_angle_umap', 'acceleration_pca', 'curvature_pca', 'n_counts', 'mt_frac', 'jacobian_det_pca', 'manual_selection', 'divergence_pca', 'curvature_umap', 'acceleration_umap', 'control_point_umap', 'inlier_prob_umap', 'obs_vf_angle_umap', 'curvature_umap', 'curv_leiden', 'curv_louvain', 'SPI1->GATA1_jacobian', 'jacobian' + var: 'gene_name', 
'gene_id', 'nCells', 'nCounts', 'pass_basic_filter', 'use_for_pca', 'frac', 'ntr', 'time_3_alpha', 'time_3_beta', 'time_3_gamma', 'time_3_half_life', 'time_3_alpha_b', 'time_3_alpha_r2', 'time_3_gamma_b', 'time_3_gamma_r2', 'time_3_gamma_logLL', 'time_3_delta_b', 'time_3_delta_r2', 'time_3_bs', 'time_3_bf', 'time_3_uu0', 'time_3_ul0', 'time_3_su0', 'time_3_sl0', 'time_3_U0', 'time_3_S0', 'time_3_total0', 'time_3_beta_k', 'time_3_gamma_k', 'time_5_alpha', 'time_5_beta', 'time_5_gamma', 'time_5_half_life', 'time_5_alpha_b', 'time_5_alpha_r2', 'time_5_gamma_b', 'time_5_gamma_r2', 'time_5_gamma_logLL', 'time_5_bs', 'time_5_bf', 'time_5_uu0', 'time_5_ul0', 'time_5_su0', 'time_5_sl0', 'time_5_U0', 'time_5_S0', 'time_5_total0', 'time_5_beta_k', 'time_5_gamma_k', 'use_for_dynamics', 'gamma', 'gamma_r2', 'use_for_transition', 'gamma_k', 'gamma_b' + uns: 'PCs', 'VecFld_pca', 'VecFld_umap', 'VecFld_umap', 'X_umap_neighbors', 'cell_phase_genes', 'cell_type_colors', 'dynamics', 'explained_variance_ratio_', 'feature_selection', 'grid_velocity_pca', 'grid_velocity_umap', 'grid_velocity_umap', 'grid_velocity_umap_perturbation', 'grid_velocity_umap_test', 'grid_velocity_umap_perturbation', 'jacobian_pca', 'leiden', 'neighbors', 'pca_mean', 'pp', 'response' + obsm: 'X', 'X_pca', 'X_pca_SparseVFC', 'X_umap', 'X_umap_SparseVFC', 'X_umap', 'X_umap_SparseVFC', 'X_umap_perturbation', 'X_umap_test', 'X_umap_perturbation', 'acceleration_pca', 'acceleration_umap', 'cell_cycle_scores', 'curvature_pca', 'curvature_umap', 'curvature_umap', 'j_delta_x_perturbation', 'velocity_pca', 'velocity_pca_SparseVFC', 'velocity_umap', 'velocity_umap_SparseVFC', 'velocity_umap', 'velocity_umap_SparseVFC', 'velocity_umap_perturbation', 'velocity_umap_test', 'velocity_umap_perturbation' + layers: 'M_n', 'M_nn', 'M_t', 'M_tn', 'M_tt', 'X_new', 'X_total', 'velocity_alpha_minus_gamma_s' + obsp: 'X_umap_connectivities', 'X_umap_distances', 'connectivities', 'cosine_transition_matrix', 'distances', 
'fp_transition_rate', 'moments_con', 'pca_ddhodge', 'perturbation_transition_matrix', 'umap_ddhodge' + + +We will first show the streamline plot of this dataset in the UMAP space. From which, we can see that we have six major cell types, namely hematopoietic stem cells (HSC), neutrophil (Neu), monocyte (Mon), basophil (Bas), megakaryocyte (Meg) and erythrocytes (Ery). From the streamline plot, we can see that HSC will first become GMP (granulocyte monocyte progenitor)-like or MEP (megakaryocyte and erythrocyte progenitor)-like cells and then bifurcate into Neu and Mon or Ery, Bas and Meg, respectively. Here we will select a few characteristic cells for each specific cell type via ``dyn.tl.select_cell``. + +Among the cell types from our tscRNA-seq data, there are five developmental events (from HSC to each of the terminal cell type), one reported dedifferentiation event (from Meg to HSC), and a total of eight reported transdifferentiation events. Considering all-against-all conversions, we are left with 18 unreported transitions between different mature cell types. Thus, this system provides a broad range of known transitions and associated transcription factors to confirm our predictions while also allows us to make non-trivial predictions for the remaining 18 unreported transitions. + +.. code:: ipython3 + + dyn.pl.streamline_plot(adata_labeling, basis="umap", color="cell_type") + + HSC_cells = dyn.tl.select_cell(adata_labeling, "cell_type", "HSC") + Meg_cells = dyn.tl.select_cell(adata_labeling, "cell_type", "Meg") + Ery_cells = dyn.tl.select_cell(adata_labeling, "cell_type", "Ery") + Bas_cells = dyn.tl.select_cell(adata_labeling, "cell_type", "Bas") + Mon_cells = dyn.tl.select_cell(adata_labeling, "cell_type", "Mon") + Neu_cells = dyn.tl.select_cell(adata_labeling, "cell_type", "Neu") + + +.. 
image:: output_6_0.png + :width: 487px + + +| We select the five closest cells of the identified fixed_points that correspond to each of the six cell types to represent the typical cell state of these cells (note that fixed_points often don't correspond to any particular cell). + +| Then the nearest cells of these ``fixed_points`` are saved to + ``*_cells_indices`` variables, which point to their cell indices in + the adata object. Note that we could just take the fixed_points for LAP analyses, but using the actual cells offers us the benefit of taking advantage of the nearest neighbor graph of cells to initialize the search for the LAP (see below). + +.. code:: ipython3 + + from dynamo.tools.utils import nearest_neighbors + + fixed_points = np.array( + [ + [8.45201833, 9.37697661], + [14.00630381, 2.53853712], + [17.30550636, 6.81561775], + [18.06891717, 11.9840678], + [14.13613403, 15.22244713], + [9.72644402, 14.83745969], + ] + ) + + HSC_cells_indices = nearest_neighbors(fixed_points[0], adata_labeling.obsm["X_umap"]) + Meg_cells_indices = nearest_neighbors(fixed_points[1], adata_labeling.obsm["X_umap"]) + Ery_cells_indices = nearest_neighbors(fixed_points[2], adata_labeling.obsm["X_umap"]) + Bas_cells_indices = nearest_neighbors(fixed_points[3], adata_labeling.obsm["X_umap"]) + Mon_cells_indices = nearest_neighbors(fixed_points[4], adata_labeling.obsm["X_umap"]) + Neu_cells_indices = nearest_neighbors(fixed_points[5], adata_labeling.obsm["X_umap"]) + + +.. code:: ipython3 + + import matplotlib.pyplot as plt + + # plt.figure(figsize=(4, 4)) + + plt.scatter(*adata_labeling.obsm["X_umap"].T) + for indices in [ + HSC_cells_indices, + Meg_cells_indices, + Ery_cells_indices, + Bas_cells_indices, + Mon_cells_indices, + Neu_cells_indices, + ]: + plt.scatter(*adata_labeling[indices[0]].obsm["X_umap"].T) + + +.. 
image:: output_9_1.png + :width: 543px + + +We can see, for example, the cell indices ``1587, 1557, 1725, 1091, 1070`` are the nearest cells to the identified HSC attractor. + +.. code:: ipython3 + + HSC_cells_indices + + +.. parsed-literal:: + + array([[1587, 1557, 1725, 1091, 1070]]) + + + +.. + Development path for Meg, Ery, Bas, Mon and Neu cells + ----------------------------------------------------- + +Now we are ready to perform the LAP analyses. We will start with computing the neighbor graph of cells in the umap space (pca space works too) and use the shortest paths between any two representative cells as the initial guess of the LAP. We will next run the LAP analyses between all pair-wise combinations of cells. We can perform the LAP analyses either in the UMAP space or in the PCA space, using the vector field reconstructed in UMAP or PCA space, respectively. With the vector field learned in the PCA space, we can further project the optimized LAP back to the original gene expression space to reveal the transcriptomic kinetics along the LAP. + +Compute neighbor graph based on ``umap`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: ipython3 + + dyn.tl.neighbors(adata_labeling, basis="umap", result_prefix="umap") + + + +.. parsed-literal:: + + |-----> Start computing neighbor graph... + |-----------> X_data is None, fetching or recomputing... + |-----> fetching X data from layer:None, basis:umap + |-----> method arg is None, choosing methods automatically... + |-----------> method kd_tree selected + |-----> umap_connectivities to obsp in AnnData Object. + |-----> umap_distances to obsp in AnnData Object. + |-----> umap_neighbors to uns in AnnData Object. + |-----> umap_neighbors.indices to uns in AnnData Object. + |-----> umap_neighbors.params to uns in AnnData Object. 
+ + + +Run pairwise least action path analyses among six distinct hematopoietic cell types +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This section will demonstrate how to compute LAPs for all possible cell type transition pairs in our scNT-seq dataset. The corresponding function in +*dynamo* is ``dyn.pd.least_action``. This function takes an ``adata`` object that has a continuous vector field reconstructed, a start +cell and a target cell to compute the least action path or most probable path between them. As shown +above, either the UMAP or the PCA basis can be used. Here we use the UMAP basis to visualize the LAP and the PCA basis +for downstream transcription factor prioritization and other analyses. + +Note that the following block also demonstrates using the `GeneTrajectory` function to reverse project the optimized LAP in PCA space back to the original gene expression space to reveal the transcriptomic kinetics along the LAP. We then calculate the accumulative MSD (mean square displacement) with respect to the initial state of each gene along the LAP in the original gene expression space (with `calc_msd` function) and use this score to prioritize the importance of each gene (with `rank_genes` function). Genes with top MSD have higher variances with respect to the initial state and will be ranked higher, which may also indicate key roles in making the cell fate conversions. + +Please refer to the API documentation of each of these functions for +a detailed explanation of their input parameters, output, etc. Please also check our primers on the optimal path and the Cell paper for a more in-depth understanding. + +.. 
code:: ipython3 + + %%capture + + dyn.dynamo_logger.main_silence() + transition_graph = {} + cell_type = ["HSC", "Meg", "Ery", "Bas", "Mon", "Neu"] + start_cell_indices = [ + HSC_cells_indices, + Meg_cells_indices, + Ery_cells_indices, + Bas_cells_indices, + Mon_cells_indices, + Neu_cells_indices, + ] + end_cell_indices = start_cell_indices + for i, start in enumerate(start_cell_indices): + for j, end in enumerate(end_cell_indices): + if start is not end: + min_lap_t = True if i == 0 else False + dyn.pd.least_action( + adata_labeling, + [adata_labeling.obs_names[start[0]][0]], + [adata_labeling.obs_names[end[0]][0]], + basis="umap", + adj_key="X_umap_distances", + min_lap_t= min_lap_t, + EM_steps=2, + ) + dyn.pl.least_action(adata_labeling, basis="umap") + lap = dyn.pd.least_action( + adata_labeling, + [adata_labeling.obs_names[start[0]][0]], + [adata_labeling.obs_names[end[0]][0]], + basis="pca", + adj_key="cosine_transition_matrix", + min_lap_t=min_lap_t, + EM_steps=2, + ) + dyn.pl.kinetic_heatmap( + adata_labeling, + basis="pca", + mode="lap", + genes=adata_labeling.var_names[adata_labeling.var.use_for_transition], + project_back_to_high_dim=True, + ) + # The `GeneTrajectory` class can be used to output trajectories for any set of genes of interest + gtraj = dyn.pd.GeneTrajectory(adata_labeling) + gtraj.from_pca(lap.X, t=lap.t) + gtraj.calc_msd() + ranking = dyn.vf.rank_genes(adata_labeling, "traj_msd") + + print(start, "->", end) + genes = ranking[:5]["all"].to_list() + arr = gtraj.select_gene(genes) + + dyn.pl.multiplot(lambda k: [plt.plot(arr[k, :]), plt.title(genes[k])], np.arange(len(genes))) + + transition_graph[cell_type[i] + "->" + cell_type[j]] = { + "lap": lap, + "LAP_umap": adata_labeling.uns["LAP_umap"], + "LAP_pca": adata_labeling.uns["LAP_pca"], + "ranking": ranking, + "gtraj": gtraj, + } + + + +.. 
parsed-literal:: + + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [4.8274s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [87.3331s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [6.1928s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [87.8599s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [6.2292s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [81.9887s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [5.9224s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [82.8575s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [4.7673s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [120.9742s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.3654s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [17.5986s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.3836s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [17.3726s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.4288s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [20.0245s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating 
through 1 pairs] finished [0.5213s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [17.8978s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.4509s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [27.8622s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.4895s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [16.6043s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.3779s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [12.5543s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.4918s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [21.3984s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.6045s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [18.5405s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.6157s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [21.0733s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.6774s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [28.5954s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.4858s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + 
|-----> [iterating through 1 pairs] finished [22.6107s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.3991s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [23.0945s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.5327s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [24.0878s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.6087s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [24.2374s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.5198s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [18.8253s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.6873s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [14.9045s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.5671s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [10.4933s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.4429s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [13.5975s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.3534s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [19.1570s] + |-----> [iterating through 1 pairs] in 
progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.5459s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [30.7210s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.6741s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [17.5307s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.5748s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [22.9212s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.5324s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [18.0897s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [0.4742s] + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [29.8258s] + + +The LAPs between all pairs of cell types are stored in the `transition_graph` object. Here we will use the LAP results to visualize the developmental, reprogram and transdifferentiation least action paths. Interestingly, we show that the LAP is not simply the shortest paths between two cell states but instead follow the curved vector field flow. + +Visualize developmental LAPs +---------------------------- + +.. 
code:: ipython3 + + develope_keys = ["HSC->Meg", "HSC->Ery", "HSC->Bas", "HSC->Mon", "HSC->Neu"] + reprogram_keys = ["Meg->HSC", "Ery->HSC", "Bas->HSC", "Mon->HSC", "Neu->HSC"] + transdifferentiation = [ + "Ery->Meg", + "Neu->Bas", + "Mon->Ery", + "Bas->Meg", + "Neu->Meg", + "Meg->Bas", + "Mon->Bas", + "Neu->Mon", + "Meg->Ery", + "Ery->Bas", + "Bas->Mon", + "Mon->Neu", + "Neu->Ery", + "Mon->Meg", + "Bas->Neu", + "Meg->Neu", + "Ery->Mon", + "Meg->Mon", + "Ery->Neu", + "Bas->Ery", + ] + +We define a helper function ``plot_lap`` to visualize different set of paths. Here we visualize developmental LAPs. + +.. code:: ipython3 + + from dynamo.plot.utils import map2color + def plot_lap(paths): + fig, ax = plt.subplots(figsize=(5, 4)) + ax = dyn.pl.streamline_plot( + adata_labeling, basis="umap", save_show_or_return="return", ax=ax, color="cell_type", frontier=True + ) + ax = ax[0] + x, y = 0, 1 + + # plot paths + for path in paths: + lap_dict = transition_graph[path]["LAP_umap"] + for prediction, action in zip(lap_dict["prediction"], lap_dict["action"]): + ax.scatter(*prediction[:, [x, y]].T, c=map2color(action)) + ax.plot(*prediction[:, [x, y]].T, c="k") + plot_lap(develope_keys) + + + + +.. image:: output_19_0.png + :width: 407px + +**Reprogram LAPs** + +.. code:: ipython3 + + plot_lap(reprogram_keys) + +.. image:: reprogram-lap.png + :width: 407px + +**Transdifferentiation LAPs** + +.. code:: ipython3 + + plot_lap(transdifferentiation) + +.. image:: transdifferentiation-lap.png + :width: 407px + + +Next, we will focus on transcription factors (TFs) and rank them based on their MSD along the LAP path to prioritize the importance of each TF. Meanwhile, we will also keep the action (an functional of the LAP) and the least action path time, with `action_df` and `t_df`, respectively, of each of these conversions. + +.. code:: ipython3 + + human_tfs = dyn.sample_data.human_tfs() + human_tfs_names = list(human_tfs["Symbol"]) + + +.. 
code:: ipython3 + + action_df = pd.DataFrame(index=cell_type, columns=cell_type) + t_df = pd.DataFrame(index=cell_type, columns=cell_type) + for i, start in enumerate( + [ + HSC_cells_indices, + Meg_cells_indices, + Ery_cells_indices, + Bas_cells_indices, + Mon_cells_indices, + Neu_cells_indices, + ] + ): + for j, end in enumerate( + [ + HSC_cells_indices, + Meg_cells_indices, + Ery_cells_indices, + Bas_cells_indices, + Mon_cells_indices, + Neu_cells_indices, + ] + ): + if start is not end: + print(cell_type[i] + "->" + cell_type[j], end=",") + lap = transition_graph[cell_type[i] + "->" + cell_type[j]]["lap"] # lap + gtraj = transition_graph[cell_type[i] + "->" + cell_type[j]]["gtraj"] + ranking = transition_graph[cell_type[i] + "->" + cell_type[j]]["ranking"].copy() + ranking["TF"] = [i in human_tfs_names for i in list(ranking["all"])] + genes = ranking.query("TF == True").head(10)["all"].to_list() + arr = gtraj.select_gene(genes) + action_df.loc[cell_type[i], cell_type[j]] = lap.action()[-1] + t_df.loc[cell_type[i], cell_type[j]] = lap.t[-1] + + + +.. parsed-literal:: + + HSC->Meg,HSC->Ery,HSC->Bas,HSC->Mon,HSC->Neu,Meg->HSC,Meg->Ery,Meg->Bas,Meg->Mon,Meg->Neu,Ery->HSC,Ery->Meg,Ery->Bas,Ery->Mon,Ery->Neu,Bas->HSC,Bas->Meg,Bas->Ery,Bas->Mon,Bas->Neu,Mon->HSC,Mon->Meg,Mon->Ery,Mon->Bas,Mon->Neu,Neu->HSC,Neu->Meg,Neu->Ery,Neu->Bas,Neu->Mon, + +We now visualize the LAP time of all developmental LAPs. Interestingly, we show that the LAP time from HSC to Meg lineage LAP (28 hour) is the shortest among all +developmental LAPs, consistent with the fact that megakaryocyte is the earliest cell type to appear. The predicted 28 hours is also on the time-scale of what has been reported for the single HSC transplantation experiments. We want to note that because we used the metabolic labeling based scRNA-seq, we obtained absolute RNA velocity and thus we can predict the actual time (with units of hour) of the LAP, a rather remarkable feature of the labeling data. + +.. 
code:: ipython3 + + dyn.configuration.set_pub_style(scaler=1.5) + develop_time_df = pd.DataFrame({"integration time": t_df.iloc[0, :].T}) + develop_time_df["lineage"] = ["HSC", "Meg", "Ery", "Bas", "Mon", "Neu"] + print(develop_time_df) + ig, ax = plt.subplots(figsize=(4, 3)) + dynamo_color_dict = { + "Mon": "#b88c7a", + "Meg": "#5b7d80", + "MEP-like": "#6c05e8", + "Ery": "#5d373b", + "Bas": "#d70000", + "GMP-like": "#ff4600", + "HSC": "#c35dbb", + "Neu": "#2f3ea8", + } + + sns.barplot( + y="lineage", + x="integration time", + hue="lineage", + data=develop_time_df.iloc[1:, :], + dodge=False, + palette=dynamo_color_dict, + ax=ax, + ) + ax.set_ylabel("") + plt.tight_layout() + plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left") + +.. parsed-literal:: + + integration time lineage + HSC NaN HSC + Meg 28.335868 Meg + Ery 46.227644 Ery + Bas 45.575254 Bas + Mon 41.797433 Mon + Neu 76.469544 Neu + + + + +.. parsed-literal:: + + + + + + +.. image:: output_22_2.png + :width: 449px + +Here we are going to visualize the transition matrices of actions and LAP time between all pair-wise cell type conversions with heatmaps. Overall, we find that the developmental LAP time is much larger than that of the dedifferentiation LAP while the action has the opposite pattern. + +Heatmap of LAP actions and LAP time matrices of pairwise cell fate conversions +------------------------------------------------------------------------------- + +.. code:: ipython3 + + action_df = action_df.fillna(0) + f, ax = plt.subplots(figsize=(5, 5)) + dyn.configuration.set_pub_style(scaler=3) + ax = sns.heatmap(action_df, annot=True, ax=ax, fmt=".2g") + + + + +.. image:: output_24_0.png + :width: 448px + + +.. code:: ipython3 + + t_df = t_df.fillna(0) + dyn.configuration.set_pub_style(scaler=3) + ax = sns.heatmap(t_df, annot=True, fmt=".3g") + + + + +.. 
image:: output_25_0.png + :width: 515px + + +Kinetics heatmap of gene expression dynamics along the LAP +----------------------------------------------------------- + +As mentioned above, we are able to obtain the gene-wise kinetics when we reverse project the LAP learned in PCA space back to gene-wise space. In this section we will show how to do so and we will create a kinetics heatmap of the transcriptomic dynamics along the LAP from HSC to the basophil lineage. We will rely mainly on two functions, ``dyn.pd.least_action`` and ``dyn.pl.kinetic_heatmap``. ``dyn.pd.least_action`` can be used to compute the optimal paths between any two cell states, as mentioned above, while ``dyn.pl.kinetic_heatmap`` can be used to plot the kinetics +heatmap. + +Here we will identify the LAP from the HSC to the basophil lineage, and thus one typical HSC and one typical basophil cell are chosen as the initial and target cell, respectively. + +.. code:: ipython3 + + init_cells = [adata_labeling.obs_names[HSC_cells_indices[0][0]]] + target_cells = [adata_labeling.obs_names[Bas_cells_indices[0][0]]] + print("init cells:", init_cells) + print("end cells:", target_cells) + + +.. parsed-literal:: + + init cells: ['GGGGGGCGGCCT-JL_10'] + end cells: ['GCAGCGAAGGCA-JL12_0'] + + +Now let us find the optimal path from HSC to the basophil lineage via the ``least_action`` function. + +.. code:: ipython3 + + dyn.configuration.set_pub_style(scaler=0.6) + + lap = dyn.pd.least_action( + adata_labeling, + init_cells=init_cells, + target_cells=target_cells, + basis="pca", + adj_key="cosine_transition_matrix", + ) + + + +.. parsed-literal:: + + |-----> [iterating through 1 pairs] in progress: 100.0000% + |-----> [iterating through 1 pairs] finished [9.2680s] + + +Now let us plot the kinetic heatmap of the gene expression kinetics of all transcription factors (restricted only to those that are used for calculating the velocity transition matrix) along the LAP from HSC to the basophil lineage. + +.. 
code:: ipython3 + + + is_human_tfs = [gene in human_tfs_names for gene in adata_labeling.var_names[adata_labeling.var.use_for_transition]] + human_genes = adata_labeling.var_names[adata_labeling.var.use_for_transition][is_human_tfs] + dyn.configuration.set_pub_style(scaler=0.6) + sns.set(font_scale=0.8) + sns_heatmap = dyn.pl.kinetic_heatmap( + adata_labeling, + basis="pca", + mode="lap", + figsize=(10, 5), + genes=human_genes, + project_back_to_high_dim=True, + save_show_or_return="return", + color_map="bwr", + transpose=True, + xticklabels=True, + yticklabels=False + ) + + plt.setp(sns_heatmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) + plt.tight_layout() + + + + +.. image:: output_31_0.png + :width: 818px + + +Now let us swap ``target_cells`` and ``init_cells``, when passing parameters into ``dyn.pd.least_action``, to draw the kinetic heatmap of the gene expression kinetics of all transcription factors along the LAP from basophil to HSC. + +.. code:: ipython3 + + from matplotlib import pyplot, transforms + + is_human_tfs = [gene in human_tfs_names for gene in adata_labeling.var_names[adata_labeling.var.use_for_transition]] + human_genes = adata_labeling.var_names[adata_labeling.var.use_for_transition][is_human_tfs] + lap = dyn.pd.least_action( + adata_labeling, + init_cells=target_cells, + target_cells=init_cells, + basis="pca", + adj_key="cosine_transition_matrix", + ) + sns.set(font_scale=0.8) + sns_heatmap = dyn.pl.kinetic_heatmap( + adata_labeling, + basis="pca", + mode="lap", + figsize=(16, 8), + genes=human_genes, + project_back_to_high_dim=True, + save_show_or_return="return", + color_map="bwr", + transpose=True, + xticklabels=True, + yticklabels=False, + ) + plt.setp(sns_heatmap.ax_heatmap.xaxis.get_majorticklabels(), rotation=90) + plt.tight_layout() + +.. 
image:: bas-hsc-kinetic.png + :width: 818px + + +Evaluate TF rankings based on LAP analyses +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +As mentioned above, we can rank TFs based on the mean square displacement (MSD) along the LAP. In this section, we are going to evaluate rankings from LAP analyses by comparing them with known transcription factors that enable the successful cell fate conversion, as reported in the literature. More details can be found in the dynamo paper :cite:p:`QIU2022`. + +We first prepare TF ranking dataframes that will be used to create ranking statistics in this +section. We first identify the TFs from all genes (``["TF"]`` key) and tag TFs that are known transcription factors for the corresponding cell fate conversion (``["known_TF"]`` key). To the best of our ability, we manually compiled all known factors for all known hematopoietic cell fate transitions (including developmental processes). Please see supplementary table 2 from the dynamo paper :cite:p:`QIU2022` for more details. + +This part is specific to our scNT-seq dataset but should be easily changed to meet your needs as well. + +..
code:: ipython3 + + %%capture + HSC_Meg_ranking = transition_graph["HSC->Meg"]["ranking"] + HSC_Meg_ranking["TF"] = [i in human_tfs_names for i in list(HSC_Meg_ranking["all"])] + + HSC_Meg_ranking = HSC_Meg_ranking.query("TF == True") + HSC_Meg_ranking["known_TF"] = [ + i in ["GATA1", "GATA2", "ZFPM1", "GFI1B", "FLI1", "NFE2"] for i in list(HSC_Meg_ranking["all"]) + ] + + HSC_Ery_ranking = transition_graph["HSC->Ery"]["ranking"] + HSC_Ery_ranking["TF"] = [i in human_tfs_names for i in list(HSC_Ery_ranking["all"])] + + HSC_Ery_ranking = HSC_Ery_ranking.query("TF == True") + HSC_Ery_ranking["known_TF"] = [ + i in ["GATA1", "ZFPM1", "GFI1B", "KLF1", "SPI1", "GATA2", "LDB1", "TAL1", "ZFPM1"] + for i in list(HSC_Ery_ranking["all"]) + ] + + HSC_Bas_ranking = transition_graph["HSC->Bas"]["ranking"] + HSC_Bas_ranking["TF"] = [i in human_tfs_names for i in list(ranking["all"])] + + HSC_Bas_ranking = HSC_Bas_ranking.query("TF == True") + HSC_Bas_ranking["known_TF"] = [i in ["CEBPA", "GATA2", "GATA1", "RUNX1"] for i in list(HSC_Bas_ranking["all"])] + HSC_Bas_ranking + + HSC_Mon_ranking = transition_graph["HSC->Mon"]["ranking"] + HSC_Mon_ranking["TF"] = [i in human_tfs_names for i in list(ranking["all"])] + + HSC_Mon_ranking = HSC_Mon_ranking.query("TF == True") + HSC_Mon_ranking["known_TF"] = [i in ["SPI1", "IRF8", "IRF5", "ZEB2", "KLF4"] for i in list(HSC_Mon_ranking["all"])] + HSC_Mon_ranking + + HSC_Neu_ranking = transition_graph["HSC->Neu"]["ranking"] + HSC_Neu_ranking["TF"] = [i in human_tfs_names for i in list(HSC_Neu_ranking["all"])] + + HSC_Neu_ranking = HSC_Neu_ranking.query("TF == True") + HSC_Neu_ranking["known_TF"] = [i in ["GFI1", "PER3", "GATA1", "ETS3"] for i in list(HSC_Neu_ranking["all"])] + HSC_Neu_ranking + + # + Meg_HSC_ranking = transition_graph["Meg->HSC"]["ranking"] + Meg_HSC_ranking["TF"] = [i in human_tfs_names for i in list(Meg_HSC_ranking["all"])] + + Meg_HSC_ranking = Meg_HSC_ranking.query("TF == True") + Meg_HSC_ranking["known_TF"] = [ + i in 
["RUN1T1", "HLF", "LMO2", "PRDM5", "PBX1", "ZFP37", "MYCN", "MEIS1"] for i in list(Meg_HSC_ranking["all"]) + ] + + + Ery_Mon_ranking = transition_graph["Ery->Mon"]["ranking"] + Ery_Mon_ranking["TF"] = [i in human_tfs_names for i in list(Ery_Mon_ranking["all"])] + + Ery_Mon_ranking = Ery_Mon_ranking.query("TF == True") + Ery_Mon_ranking["known_TF"] = [i in ["LSD1", "RUNX1"] for i in list(Ery_Mon_ranking["all"])] + + Ery_Neu_ranking = transition_graph["Ery->Neu"]["ranking"] + Ery_Neu_ranking["TF"] = [i in human_tfs_names for i in list(Ery_Neu_ranking["all"])] + + Ery_Neu_ranking = Ery_Neu_ranking.query("TF == True") + Ery_Neu_ranking["known_TF"] = [i in ["LSD1", "RUNX1"] for i in list(Ery_Neu_ranking["all"])] + + # http://genesdev.cshlp.org/content/20/21/3010.long + + Mon_Bas_ranking = transition_graph["Mon->Bas"]["ranking"] + Mon_Bas_ranking["TF"] = [i in human_tfs_names for i in list(Mon_Bas_ranking["all"])] + + Mon_Bas_ranking = Mon_Bas_ranking.query("TF == True") + Mon_Bas_ranking["known_TF"] = [i in ["GATA2", "CEBPA"] for i in list(Mon_Bas_ranking["all"])] + + Neu_Bas_ranking = transition_graph["Neu->Bas"]["ranking"] + Neu_Bas_ranking["TF"] = [i in human_tfs_names for i in list(Neu_Bas_ranking["all"])] + + Neu_Bas_ranking = Neu_Bas_ranking.query("TF == True") + Neu_Bas_ranking["known_TF"] = [i in ["GATA2", "CEBPA"] for i in list(Mon_Bas_ranking["all"])] + + + # GATA-1 Converts Lymphoid and Myelomonocytic Progenitors into the Megakaryocyte/Erythrocyte Lineages + + Mon_Meg_ranking = transition_graph["Mon->Meg"]["ranking"] + Mon_Meg_ranking["TF"] = [i in human_tfs_names for i in list(Mon_Meg_ranking["all"])] + + Mon_Meg_ranking = Mon_Meg_ranking.query("TF == True") + Mon_Meg_ranking["known_TF"] = [i in ["GATA1", "ZFPM1", "GATA2"] for i in list(Mon_Meg_ranking["all"])] + + Mon_Ery_ranking = transition_graph["Mon->Ery"]["ranking"] + Mon_Ery_ranking["TF"] = [i in human_tfs_names for i in list(Mon_Ery_ranking["all"])] + + Mon_Ery_ranking = Mon_Ery_ranking.query("TF == 
True") + Mon_Ery_ranking["known_TF"] = [i in ["GATA1", "ZFPM1", "GATA2"] for i in list(Mon_Ery_ranking["all"])] + + + # Tom's paper + Meg_Neu_ranking = transition_graph["Meg->Neu"]["ranking"] + Meg_Neu_ranking["TF"] = [i in human_tfs_names for i in list(Meg_Neu_ranking["all"])] + + Meg_Neu_ranking = Meg_Neu_ranking.query("TF == True") + Meg_Neu_ranking["known_TF"] = [i in ["CEBPA", "CEBPB", "CEBPE", "SPI1"] for i in list(Meg_Neu_ranking["all"])] + + Ery_Neu_ranking = transition_graph["Ery->Neu"]["ranking"] + Ery_Neu_ranking["TF"] = [i in human_tfs_names for i in list(Ery_Neu_ranking["all"])] + + Ery_Neu_ranking = Ery_Neu_ranking.query("TF == True") + Ery_Neu_ranking["known_TF"] = [i in ["CEBPA", "CEBPB", "CEBPE", "SPI1"] for i in list(Ery_Neu_ranking["all"])] + + +.. code:: ipython3 + + lap_dict = transition_graph[cell_type[0] + "->" + cell_type[3]]["LAP_pca"] + lap_dict["t"] *= 3 + adata_labeling.uns["LAP_pca"] = lap_dict + + +Ranking TF's importance for each LAP +------------------------------------ + +Let's re-rank each known TF from each known hematopoietic fate conversion based on their MSD rankings among all TFs. We will use the helper function +``assign_tf_ranks`` to achieve this purpose. All the known TFs are collected from literature as mentioned above. + +.. code:: ipython3 + + def assign_tf_ranks(transition_graph: dict, transition: str, tfs: list, tfs_key="TFs", tfs_rank_key="TFs_rank"): + ranking = transition_graph[transition]["ranking"] + ranking["TF"] = [i in human_tfs_names for i in list(ranking["all"])] + true_tf_list = list(ranking.query("TF == True")["all"]) + all_tfs = list(ranking.query("TF == True")["all"]) + transition_graph[transition][tfs_key] = tfs + + transition_graph[transition][tfs_rank_key] = [ + all_tfs.index(key) if key in true_tf_list else -1 for key in transition_graph[transition][tfs_key] + ] + + + assign_tf_ranks(transition_graph, "HSC->Meg", ["GATA1", "GATA2", "ZFPM1", "GFI1B", "FLI1", "NFE2"]) + + +.. 
code:: ipython3 + + transition_graph["HSC->Meg"]["TFs"] + + + + + +.. parsed-literal:: + + ['GATA1', 'GATA2', 'ZFPM1', 'GFI1B', 'FLI1', 'NFE2'] + + + +.. code:: ipython3 + + assign_tf_ranks( + transition_graph, "HSC->Ery", ["GATA1", "ZFPM1", "GFI1B", "KLF1", "SPI1", "GATA2", "LDB1", "TAL1", "ZFPM1"] + ) + + +.. code:: ipython3 + + assign_tf_ranks(transition_graph, "HSC->Bas", ["STAT5", "GATA2", "CEBPA", "MITF"]) + assign_tf_ranks(transition_graph, "HSC->Bas", ["CEBPA", "GATA2", "GATA1", "RUNX1"]) + + +.. code:: ipython3 + + assign_tf_ranks(transition_graph, "HSC->Mon", ["SPI1", "IRF8", "IRF5", "ZEB2", "KLF4"]) + + +.. code:: ipython3 + + assign_tf_ranks(transition_graph, "HSC->Neu", ["GFI1", "PER3", "GATA1", "ETS3"]) + + +.. code:: ipython3 + + assign_tf_ranks(transition_graph, "Meg->HSC", ["RUN1T1", "HLF", "LMO2", "PRDM5", "PBX1", "ZFP37", "MYCN", "MEIS1"]) + + +.. code:: ipython3 + + assign_tf_ranks(transition_graph, "Mon->Meg", ["GATA1", "ZFPM1", "GATA2"]) + + +.. code:: ipython3 + + assign_tf_ranks(transition_graph, "Mon->Ery", ["GATA1", "ZFPM1", "GATA2"]) + + +.. code:: ipython3 + + assign_tf_ranks(transition_graph, "Meg->Neu", ["CEBPA", "CEBPB", "CEBPE", "SPI1"]) + + +.. code:: ipython3 + + # Tom's paper + assign_tf_ranks( + transition_graph, "Ery->Neu", ["CEBPA", "CEBPB", "CEBPE", "SPI1"], tfs_rank_key="TFs_rank2", tfs_key="TFs2" + ) + + +An erythroid to myeloid cell fate conversion is elicited by LSD1 +inactivation + +.. code:: ipython3 + + assign_tf_ranks(transition_graph, "Ery->Mon", ["LSD1", "RUNX1"]) + assign_tf_ranks(transition_graph, "Ery->Neu", ["LSD1", "RUNX1"], tfs_rank_key="TFs_rank1", tfs_key="TFs1") + + +.. code:: ipython3 + + # http://genesdev.cshlp.org/content/20/21/3010.long + assign_tf_ranks(transition_graph, "Mon->Bas", ["GATA2", "CEBPA"]) + + assign_tf_ranks(transition_graph, "Neu->Bas", ["GATA2", "CEBPA"]) + + +.. code:: ipython3 + + transition_graph["Ery->Neu"]["TFs2"], transition_graph["Ery->Neu"]["TFs_rank2"] + + + + + +.. 
parsed-literal:: + + (['CEBPA', 'CEBPB', 'CEBPE', 'SPI1'], [0, -1, -1, 17]) + + +Here we will convert the rankings of known TFs to a priority score, simply defined as :math:`1 - \frac{\# rank}{\# TF}`. + +.. code:: ipython3 + + from functools import reduce + + reprogramming_mat_dict = { + "HSC->Meg": { + "genes": transition_graph["HSC->Meg"]["TFs"], + "rank": transition_graph["HSC->Meg"]["TFs_rank"], + "PMID": 18295580, + }, + "HSC->Ery": { + "genes": transition_graph["HSC->Ery"]["TFs"], + "rank": transition_graph["HSC->Ery"]["TFs_rank"], + "PMID": 18295580, + }, + "HSC->Bas": { + "genes": transition_graph["HSC->Ery"]["TFs"], + "rank": transition_graph["HSC->Ery"]["TFs_rank"], + "PMID": 18295580, + }, + "HSC->Mon": { + "genes": transition_graph["HSC->Mon"]["TFs"], + "rank": transition_graph["HSC->Mon"]["TFs_rank"], + "PMID": 18295580, + }, + "HSC->Neu": { + "genes": transition_graph["HSC->Neu"]["TFs"], + "rank": transition_graph["HSC->Neu"]["TFs_rank"], + "PMID": 18295580, + }, + "Meg->HSC": { + "genes": transition_graph["Meg->HSC"]["TFs"], + "rank": transition_graph["Meg->HSC"]["TFs_rank"], + "PMID": 24766805, + }, + "Meg->Neu": { + "genes": transition_graph["Meg->Neu"]["TFs"], + "rank": transition_graph["Meg->Neu"]["TFs_rank"], + "PMID": 31395745, + }, + "Ery->Mon": { + "genes": transition_graph["Ery->Mon"]["TFs"], + "rank": transition_graph["Ery->Mon"]["TFs_rank"], + "PMID": 34324630, + }, + "Ery->Neu1": { + "genes": transition_graph["Ery->Neu"]["TFs1"], + "rank": transition_graph["Ery->Neu"]["TFs_rank1"], + "PMID": 31395745, + }, + "Ery->Neu2": { + "genes": transition_graph["Ery->Neu"]["TFs2"], + "rank": transition_graph["Ery->Neu"]["TFs_rank2"], + "PMID": 34324630, + }, + "Mon->Meg": { + "genes": transition_graph["Mon->Meg"]["TFs"], + "rank": transition_graph["Mon->Meg"]["TFs_rank"], + "PMID": 14499119, + }, + "Mon->Ery": { + "genes": transition_graph["Mon->Ery"]["TFs"], + "rank": transition_graph["Mon->Ery"]["TFs_rank"], + "PMID": 14499119, + }, + "Mon->Bas": 
{ + "genes": transition_graph["Mon->Bas"]["TFs"], + "rank": transition_graph["Mon->Bas"]["TFs_rank"], + "PMID": 17079688, + }, + "Neu->Bas": { + "genes": transition_graph["Neu->Bas"]["TFs"], + "rank": transition_graph["Neu->Bas"]["TFs_rank"], + "PMID": 17079688, + }, + } + + + reprogramming_mat_df = pd.DataFrame(reprogramming_mat_dict) + + for key in reprogramming_mat_df: + assert len(reprogramming_mat_df[key]["genes"]) == len(reprogramming_mat_df[key]["rank"]), str(key) + + + all_genes = reduce(lambda a, b: a + b, reprogramming_mat_df.loc["genes", :]) + all_rank = reduce(lambda a, b: a + b, reprogramming_mat_df.loc["rank", :]) + all_keys = np.repeat( + np.array(list(reprogramming_mat_dict.keys())), [len(i) for i in reprogramming_mat_df.loc["genes", :]] + ) + + reprogramming_mat_df_p = pd.DataFrame({"genes": all_genes, "rank": all_rank, "transition": all_keys}) + reprogramming_mat_df_p = reprogramming_mat_df_p.query("rank > -1") + reprogramming_mat_df_p["type"] = reprogramming_mat_df_p["transition"].map( + { + "HSC->Meg": "development", + "HSC->Ery": "development", + "HSC->Bas": "development", + "HSC->Mon": "development", + "HSC->Neu": "development", + "Meg->HSC": "reprogramming", + "Meg->Neu": "transdifferentiation", + "Ery->Mon": "transdifferentiation", + "Ery->Neu1": "transdifferentiation", + "Ery->Neu2": "transdifferentiation", + "Mon->Meg": "transdifferentiation", + "Mon->Ery": "transdifferentiation", + "Mon->Bas": "transdifferentiation", + "Neu->Bas": "transdifferentiation", + } + ) + + reprogramming_mat_df_p["rank"] /= 133 + reprogramming_mat_df_p["rank"] = 1 - reprogramming_mat_df_p["rank"] + + +Plotting priority scores of known TFs for specific hematopoietic trandifferentiations +------------------------------------------------------------------------------------- + +The y-axis is the hematopoietic trandifferentiation and the x-axis the TF priority scores for a specific transition. + +.. 
code:: ipython3 + + dyn.configuration.set_pub_style() + transition_color_dict = {"development": "#2E3192", "reprogramming": "#EC2227", "transdifferentiation": "#B9519E"} + + reprogramming_mat_df_p_subset = reprogramming_mat_df_p.query("type == 'transdifferentiation'") + rank = reprogramming_mat_df_p_subset["rank"].values + transition = reprogramming_mat_df_p_subset["transition"].values + genes = reprogramming_mat_df_p_subset["genes"].values + + fig, ax = plt.subplots(1, 1, figsize=(6, 4)) + sns.scatterplot( + y="transition", + x="rank", + data=reprogramming_mat_df_p_subset, + ec=None, + hue="type", + alpha=0.8, + ax=ax, + s=50, + palette=transition_color_dict, + clip_on=False, + ) + + for i in range(reprogramming_mat_df_p_subset.shape[0]): + annote_text = genes[i] # STK_ID + ax.annotate( + annote_text, xy=(rank[i], transition[i]), xytext=(0, 3), textcoords="offset points", ha="center", va="bottom" + ) + + plt.axvline(0.8, linestyle="--", lw=0.5) + ax.set_xlim(0.6, 1.01) + ax.set_xlabel("") + ax.set_xlabel("Score") + ax.set_yticklabels(list(reprogramming_mat_dict.keys())[6:], rotation=0) + ax.legend().set_visible(False) + ax.spines.top.set_position(("outward", 10)) + ax.spines.bottom.set_position(("outward", 10)) + + ax.spines.right.set_visible(False) + ax.spines.top.set_visible(False) + ax.yaxis.set_ticks_position("left") + ax.xaxis.set_ticks_position("bottom") + plt.show() + + +.. image:: output_55_1.png + :width: 650px + +From the above plot, you can appreciate that our prediction works very well. The majority of the known TFs of the known transitions are prioritized as > 0.8, while some of them achieve perfect prioritization (score ~= 1). + + +ROC curve analyses of TF prioritization of the LAP predictions +-------------------------------------------------------------- + +Last but not least, let us evaluate our TF ranking via receiver operating characteristic (ROC) curve analyses. 
ROC of LAP TF prioritization predictions when using all known genes of all known transitions as the gold standard (see STAR Methods of :cite:p:`QIU2022`) reveals an AUC (area under curve) of ``0.83``, again indicating our LAP predictions and TF prioritization work quite well. + +These analyses reveal the potential of the LAP approach to predict the optimal paths and TF cocktails of cell-fate transitions with high accuracy, paving the road for à la carte reprogramming between any cell types of interest for applications in regenerative medicine (Graf and Enver, 2009). + +.. code:: ipython3 + + all_ranks_list = [ + HSC_Meg_ranking, + HSC_Ery_ranking, + HSC_Bas_ranking, + HSC_Mon_ranking, + HSC_Neu_ranking, + Meg_HSC_ranking, + Ery_Mon_ranking, + Ery_Neu_ranking, + Mon_Bas_ranking, + Neu_Bas_ranking, + Mon_Meg_ranking, + Mon_Ery_ranking, + Meg_Neu_ranking, + Ery_Neu_ranking, + ] + + all_ranks_df = pd.concat(all_ranks_list) + + all_ranks_df["priority_score"] = ( + 1 - np.tile(np.arange(HSC_Bas_ranking.shape[0]), len(all_ranks_list)) / HSC_Bas_ranking.shape[0] + ) + # all_ranks_df['priority_score'].hist() + TFs = ranking["all"][ranking["TF"]].values + valid_TFs = np.unique(reprogramming_mat_df_p["genes"].values) + + +.. code:: ipython3 + + from sklearn.metrics import roc_curve, auc + + use_abs = False + top_genes = len(TFs) + + cls = all_ranks_df["known_TF"].astype(int) + pred = all_ranks_df["priority_score"] + + fpr, tpr, _ = roc_curve(cls, pred) + roc_auc = auc(fpr, tpr) + + + dyn.configuration.set_pub_style_mpltex() + plt.figure(figsize=(1.3, 1)) + + lw = 0.5 + plt.figure(figsize=(5, 5)) + plt.plot(fpr, tpr, color="darkorange", lw=lw, label="ROC curve (area = %0.2f)" % roc_auc) + plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--") + plt.xlim([0.0, 1.0]) + plt.ylim([0.0, 1.05]) + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + # plt.title(cur_guide) + plt.legend(loc="lower right") + plt.show() + + plt.tight_layout() + + + +..
image:: output_58_2.png + :width: 500px + diff --git a/docs/introduction/lap_tutorial/output_19_0.png b/docs/introduction/lap_tutorial/output_19_0.png new file mode 100644 index 000000000..ab31c52d4 Binary files /dev/null and b/docs/introduction/lap_tutorial/output_19_0.png differ diff --git a/docs/introduction/lap_tutorial/output_22_2.png b/docs/introduction/lap_tutorial/output_22_2.png new file mode 100644 index 000000000..7909db6ba Binary files /dev/null and b/docs/introduction/lap_tutorial/output_22_2.png differ diff --git a/docs/introduction/lap_tutorial/output_24_0.png b/docs/introduction/lap_tutorial/output_24_0.png new file mode 100644 index 000000000..d7bdb1116 Binary files /dev/null and b/docs/introduction/lap_tutorial/output_24_0.png differ diff --git a/docs/introduction/lap_tutorial/output_25_0.png b/docs/introduction/lap_tutorial/output_25_0.png new file mode 100644 index 000000000..599a31ba9 Binary files /dev/null and b/docs/introduction/lap_tutorial/output_25_0.png differ diff --git a/docs/introduction/lap_tutorial/output_31_0.png b/docs/introduction/lap_tutorial/output_31_0.png new file mode 100644 index 000000000..57b2f9659 Binary files /dev/null and b/docs/introduction/lap_tutorial/output_31_0.png differ diff --git a/docs/introduction/lap_tutorial/output_55_1.png b/docs/introduction/lap_tutorial/output_55_1.png new file mode 100644 index 000000000..00a224450 Binary files /dev/null and b/docs/introduction/lap_tutorial/output_55_1.png differ diff --git a/docs/introduction/lap_tutorial/output_58_2.png b/docs/introduction/lap_tutorial/output_58_2.png new file mode 100644 index 000000000..060bc3167 Binary files /dev/null and b/docs/introduction/lap_tutorial/output_58_2.png differ diff --git a/docs/introduction/lap_tutorial/output_6_0.png b/docs/introduction/lap_tutorial/output_6_0.png new file mode 100644 index 000000000..31c4702ad Binary files /dev/null and b/docs/introduction/lap_tutorial/output_6_0.png differ diff --git 
a/docs/introduction/lap_tutorial/output_9_1.png b/docs/introduction/lap_tutorial/output_9_1.png new file mode 100644 index 000000000..b77bb6cfc Binary files /dev/null and b/docs/introduction/lap_tutorial/output_9_1.png differ diff --git a/docs/introduction/lap_tutorial/reprogram-lap.png b/docs/introduction/lap_tutorial/reprogram-lap.png new file mode 100644 index 000000000..f78aca1a8 Binary files /dev/null and b/docs/introduction/lap_tutorial/reprogram-lap.png differ diff --git a/docs/introduction/lap_tutorial/transdifferentiation-lap.png b/docs/introduction/lap_tutorial/transdifferentiation-lap.png new file mode 100644 index 000000000..389cfb7e4 Binary files /dev/null and b/docs/introduction/lap_tutorial/transdifferentiation-lap.png differ diff --git a/docs/introduction/perturbation_tutorial/output_14_1.png b/docs/introduction/perturbation_tutorial/output_14_1.png new file mode 100644 index 000000000..56526b5d5 Binary files /dev/null and b/docs/introduction/perturbation_tutorial/output_14_1.png differ diff --git a/docs/introduction/perturbation_tutorial/output_16_1.png b/docs/introduction/perturbation_tutorial/output_16_1.png new file mode 100644 index 000000000..ef4a5f413 Binary files /dev/null and b/docs/introduction/perturbation_tutorial/output_16_1.png differ diff --git a/docs/introduction/perturbation_tutorial/output_18_1.png b/docs/introduction/perturbation_tutorial/output_18_1.png new file mode 100644 index 000000000..ad335bc98 Binary files /dev/null and b/docs/introduction/perturbation_tutorial/output_18_1.png differ diff --git a/docs/introduction/perturbation_tutorial/output_20_1.png b/docs/introduction/perturbation_tutorial/output_20_1.png new file mode 100644 index 000000000..de76ab4a2 Binary files /dev/null and b/docs/introduction/perturbation_tutorial/output_20_1.png differ diff --git a/docs/introduction/perturbation_tutorial/output_22_1.png b/docs/introduction/perturbation_tutorial/output_22_1.png new file mode 100644 index 000000000..96fc05e56 
Binary files /dev/null and b/docs/introduction/perturbation_tutorial/output_22_1.png differ diff --git a/docs/introduction/perturbation_tutorial/perturbation_tutorial.rst b/docs/introduction/perturbation_tutorial/perturbation_tutorial.rst new file mode 100644 index 000000000..0783ce22b --- /dev/null +++ b/docs/introduction/perturbation_tutorial/perturbation_tutorial.rst @@ -0,0 +1,204 @@ +.. raw:: html + +
+ + Open In Colab + + Open In nbviewer +
+ + +*in silico* perturbation +======================== + +In the dynamo Cell paper :cite:p:`QIU2022`, we introduced the analytical form of a +vector field. This permits *in silico* perturbation predictions of expression +for each gene in each cell and the cell fate diversions after +genetic perturbations. In particular, we demonstrated the predictive +power of hematopoietic fate trajectory predictions after genetic +perturbations. + +| In this tutorial, we will cover the following topics: +- Perturbation functionality and API in dynamo +- How to perform single or combinatorial perturbations (either repression or activation) in the hematopoietic scNT-seq dataset +- Visualize gene perturbation effects +- Reproduce results in dynamo paper Fig.7 :cite:p:`QIU2022` + +| - :ref:`You can read more about theory part here`. +Perturbation method introduction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +Import relevant packages + +.. code:: ipython3 + + import numpy as np + import pandas as pd + import matplotlib.pyplot as plt + + import sys + import os + + import dynamo as dyn + dyn.dynamo_logger.main_silence() + + +.. parsed-literal:: + + |-----> setting visualization default mode in dynamo. Your customized matplotlib settings might be overritten. + + +.. code:: ipython3 + + adata_labeling = dyn.sample_data.hematopoiesis() + + +Let us take a glance at what is in the ``adata_labeling`` object. Preprocessing, normalization, umap dimension reduction, total RNA velocity, as well as the continuous RNA velocity vector field are computed (notebooks on these operations will be released shortly. Please also check other existing notebooks for these operations). + +.. code:: ipython3 + + adata_labeling + + + + +..
parsed-literal:: + + AnnData object with n_obs × n_vars = 1947 × 1956 + obs: 'batch', 'time', 'cell_type', 'nGenes', 'nCounts', 'pMito', 'pass_basic_filter', 'new_Size_Factor', 'initial_new_cell_size', 'total_Size_Factor', 'initial_total_cell_size', 'spliced_Size_Factor', 'initial_spliced_cell_size', 'unspliced_Size_Factor', 'initial_unspliced_cell_size', 'Size_Factor', 'initial_cell_size', 'ntr', 'cell_cycle_phase', 'leiden', 'umap_leiden', 'umap_louvain', 'control_point_pca', 'inlier_prob_pca', 'obs_vf_angle_pca', 'pca_ddhodge_div', 'pca_ddhodge_potential', 'umap_ori_ddhodge_div', 'umap_ori_ddhodge_potential', 'curl_umap_ori', 'divergence_umap_ori', 'control_point_umap_ori', 'inlier_prob_umap_ori', 'obs_vf_angle_umap_ori', 'acceleration_pca', 'curvature_pca', 'n_counts', 'mt_frac', 'jacobian_det_pca', 'manual_selection', 'divergence_pca', 'curvature_umap_ori', 'acceleration_umap_ori', 'control_point_umap', 'inlier_prob_umap', 'obs_vf_angle_umap', 'curvature_umap', 'curv_leiden', 'curv_louvain', 'SPI1->GATA1_jacobian', 'jacobian' + var: 'gene_name', 'gene_id', 'nCells', 'nCounts', 'pass_basic_filter', 'use_for_pca', 'frac', 'ntr', 'time_3_alpha', 'time_3_beta', 'time_3_gamma', 'time_3_half_life', 'time_3_alpha_b', 'time_3_alpha_r2', 'time_3_gamma_b', 'time_3_gamma_r2', 'time_3_gamma_logLL', 'time_3_delta_b', 'time_3_delta_r2', 'time_3_bs', 'time_3_bf', 'time_3_uu0', 'time_3_ul0', 'time_3_su0', 'time_3_sl0', 'time_3_U0', 'time_3_S0', 'time_3_total0', 'time_3_beta_k', 'time_3_gamma_k', 'time_5_alpha', 'time_5_beta', 'time_5_gamma', 'time_5_half_life', 'time_5_alpha_b', 'time_5_alpha_r2', 'time_5_gamma_b', 'time_5_gamma_r2', 'time_5_gamma_logLL', 'time_5_bs', 'time_5_bf', 'time_5_uu0', 'time_5_ul0', 'time_5_su0', 'time_5_sl0', 'time_5_U0', 'time_5_S0', 'time_5_total0', 'time_5_beta_k', 'time_5_gamma_k', 'use_for_dynamics', 'gamma', 'gamma_r2', 'use_for_transition', 'gamma_k', 'gamma_b' + uns: 'PCs', 'VecFld_pca', 'VecFld_umap', 'VecFld_umap_ori', 
'X_umap_ori_neighbors', 'cell_phase_genes', 'cell_type_colors', 'dynamics', 'explained_variance_ratio_', 'feature_selection', 'grid_velocity_pca', 'grid_velocity_umap', 'grid_velocity_umap_ori', 'grid_velocity_umap_ori_perturbation', 'grid_velocity_umap_ori_test', 'grid_velocity_umap_perturbation', 'jacobian_pca', 'leiden', 'neighbors', 'pca_mean', 'pp', 'response' + obsm: 'X', 'X_pca', 'X_pca_SparseVFC', 'X_umap', 'X_umap_SparseVFC', 'X_umap_ori', 'X_umap_ori_SparseVFC', 'X_umap_ori_perturbation', 'X_umap_ori_test', 'X_umap_perturbation', 'acceleration_pca', 'acceleration_umap_ori', 'cell_cycle_scores', 'curvature_pca', 'curvature_umap', 'curvature_umap_ori', 'j_delta_x_perturbation', 'velocity_pca', 'velocity_pca_SparseVFC', 'velocity_umap', 'velocity_umap_SparseVFC', 'velocity_umap_ori', 'velocity_umap_ori_SparseVFC', 'velocity_umap_ori_perturbation', 'velocity_umap_ori_test', 'velocity_umap_perturbation' + layers: 'M_n', 'M_nn', 'M_t', 'M_tn', 'M_tt', 'X_new', 'X_total', 'velocity_alpha_minus_gamma_s' + obsp: 'X_umap_ori_connectivities', 'X_umap_ori_distances', 'connectivities', 'cosine_transition_matrix', 'distances', 'fp_transition_rate', 'moments_con', 'pca_ddhodge', 'perturbation_transition_matrix', 'umap_ori_ddhodge' + + +*In silico* perturbation with ``dyn.pd.perturbation`` +---------------------------------------------------- + +The ``dyn.pd.perturbation`` function from *dynamo* can be used to either upregulating or suppressing a single or multiple genes in a particular cell or across all cells to perform *in silico* genetic perturbation. +When integrating the perturbation vectors across cells we can then also predict cell-fate outcomes after the perturbation which can be visualized as the perturbation streamlines. + +In the following, we will first delve into the *in silico* perturbations of the canonical PU.1/SPI1-GATA1 network motif that specifies the GMP or MEP lineage during hematopoiesis, respectively. 
+ +Mutual exclusive effects after perturbing either GATA1 or SPI1 gene +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As we all know, SPI1 (PU.1) is the master regulator of the GMP lineage while GATA1 is the master regulator of the MEP lineage, and GATA1 and SPI1 form a mutual inhibition and self-activation network motif. + +We first suppress the expression of GATA1, which diverts cells from MEP-related lineages to GMP-related lineages. + +.. code:: ipython3 + + gene = "GATA1" + dyn.pd.perturbation(adata_labeling, gene, [-100], emb_basis="umap") + dyn.pl.streamline_plot(adata_labeling, color=["cell_type", gene], basis="umap_perturbation") + + +.. parsed-literal:: + + |-----> [projecting velocity vector to low dimensional embedding] in progress: 100.0000% + |-----> [projecting velocity vector to low dimensional embedding] finished [0.3502s] + + + +.. image:: output_14_1.png + :width: 955px + + +When suppressing the expression of SPI1, we find that cells from GMP-related lineages are diverted to MEP-related lineages. + +.. code:: ipython3 + + gene = "SPI1" + dyn.pd.perturbation(adata_labeling, gene, [-100], emb_basis="umap") + dyn.pl.streamline_plot(adata_labeling, color=["cell_type", gene], basis="umap_perturbation") + + +.. parsed-literal:: + + |-----> [projecting velocity vector to low dimensional embedding] in progress: 100.0000% + |-----> [projecting velocity vector to low dimensional embedding] finished [0.3635s] + + + +.. image:: output_16_1.png + :width: 962px + + +Double suppression of SPI1/GATA trap cell in the middle +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppression of both SPI1 and GATA1 traps cells in the progenitor states. +These predictions align well with those reported in (Rekhtman et al., 1999) and reveal a seesaw-effect regulation between SPI1 and GATA1 in driving the GMP and the MEP lineages. + +..
code:: ipython3 + + selected_genes = [ "SPI1", "GATA1"] + # expr_vals = [-100, -100] + expr_vals = [-100, -15] + dyn.pd.perturbation(adata_labeling, selected_genes, expr_vals, emb_basis="umap") + dyn.pl.streamline_plot(adata_labeling, color=["cell_type", gene], basis="umap_perturbation") + + + +.. parsed-literal:: + + |-----> [projecting velocity vector to low dimensional embedding] in progress: 100.0000% + |-----> [projecting velocity vector to low dimensional embedding] finished [0.4156s] + + + +.. image:: output_18_1.png + :width: 954px + + +Activate KLF1 +~~~~~~~~~~~~~ + +Dynamo *in silico* perturbation can correctly predicts other cellular transitions, showcased in :cite:p:`QIU2022`. Here we show that activation of KLF1 leads other cells convert into erythroid cells, consistent with :cite:p:`Orkin2008-vp`. + +.. code:: ipython3 + + gene = "KLF1" + dyn.pd.perturbation(adata_labeling, gene, [100], emb_basis="umap") + dyn.pl.streamline_plot(adata_labeling, color=["cell_type", gene], basis="umap_perturbation") + + +.. parsed-literal:: + + |-----> [projecting velocity vector to low dimensional embedding] in progress: 100.0000% + |-----> [projecting velocity vector to low dimensional embedding] finished [0.3362s] + + + +.. image:: output_20_1.png + + +Triple activation of "GATA1", "KLF1", "TAL1" +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Triple activation of GATA1, KLF1, and TAL1, known erythrocyte factors, +and TFs used for reprogramming fibroblasts into erythrocytes, diverts +most other cells into the Ery lineage :cite:p:`Capellera-Garcia2016-qp`. + +.. code:: ipython3 + + selected_genes = ["GATA1", "KLF1", "TAL1"] + expr_vals = [100, 100, 100] + dyn.pd.perturbation(adata_labeling, selected_genes, expr_vals, emb_basis="umap") + dyn.pl.streamline_plot(adata_labeling, color=["cell_type", gene], basis="umap_perturbation") + + + +.. 
parsed-literal:: + + |-----> [projecting velocity vector to low dimensional embedding] in progress: 100.0000% + |-----> [projecting velocity vector to low dimensional embedding] finished [0.3842s] + + + +.. image:: output_22_1.png + :width: 954px + diff --git a/docs/make.bat b/docs/make.bat old mode 100755 new mode 100644 index 543c6b13b..32bb24529 --- a/docs/make.bat +++ b/docs/make.bat @@ -7,10 +7,8 @@ REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) -set SOURCEDIR=source -set BUILDDIR=build - -if "%1" == "" goto help +set SOURCEDIR=. +set BUILDDIR=_build %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( @@ -21,15 +19,17 @@ if errorlevel 9009 ( echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ + echo.https://www.sphinx-doc.org/ exit /b 1 ) -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd diff --git a/docs/references.bib b/docs/references.bib new file mode 100644 index 000000000..05cc78bd3 --- /dev/null +++ b/docs/references.bib @@ -0,0 +1,3284 @@ +@Book{terrall, + address = {Chicago}, + author = {Terrall, Mary}, + publisher = {University of Chicago Press}, + title = {The Man Who Flattened the Earth: Maupertuis and the Sciences in the Enlightenment}, + year = 2006 +} + +@ARTICLE{Cao2020-lv, + title = "Sci-fate characterizes the dynamics of gene expression in single + cells", + author = "Cao, Junyue and Zhou, Wei and Steemers, Frank and Trapnell, Cole + and Shendure, Jay", + abstract = "Gene expression programs change over time, differentiation and + development, and in response to stimuli. 
However, nearly all + techniques for profiling gene expression in single cells do not + directly capture transcriptional dynamics. In the present study, + we present a method for combined single-cell combinatorial + indexing and messenger RNA labeling (sci-fate), which uses + combinatorial cell indexing and 4-thiouridine labeling of newly + synthesized mRNA to concurrently profile the whole and newly + synthesized transcriptome in each of many single cells. We used + sci-fate to study the cortisol response in >6,000 single cultured + cells. From these data, we quantified the dynamics of the cell + cycle and glucocorticoid receptor activation, and explored their + intersection. Finally, we developed software to infer and analyze + cell-state transitions. We anticipate that sci-fate will be + broadly applicable to quantitatively characterize transcriptional + dynamics in diverse systems.", + journal = "Nat. Biotechnol.", + volume = 38, + number = 8, + pages = "980--988", + month = aug, + year = 2020, + language = "en" +} + +@ARTICLE{Rayon2020-af, + title = "Species-specific pace of development is associated with + differences in protein stability", + author = "Rayon, Teresa and Stamataki, Despina and Perez-Carrasco, Ruben + and Garcia-Perez, Lorena and Barrington, Christopher and + Melchionda, Manuela and Exelby, Katherine and Lazaro, Jorge and + Tybulewicz, Victor L J and Fisher, Elizabeth M C and Briscoe, + James", + abstract = "Although many molecular mechanisms controlling developmental + processes are evolutionarily conserved, the speed at which the + embryo develops can vary substantially between species. For + example, the same genetic program, comprising sequential changes + in transcriptional states, governs the differentiation of motor + neurons in mouse and human, but the tempo at which it operates + differs between species. 
Using in vitro directed differentiation + of embryonic stem cells to motor neurons, we show that the + program runs more than twice as fast in mouse as in human. This + is not due to differences in signaling, nor the genomic sequence + of genes or their regulatory elements. Instead, there is an + approximately two-fold increase in protein stability and cell + cycle duration in human cells compared with mouse cells. This can + account for the slower pace of human development and suggests + that differences in protein turnover play a role in interspecies + differences in developmental tempo.", + journal = "Science", + volume = 369, + number = 6510, + month = sep, + year = 2020, + language = "en" +} + +@ARTICLE{Ma2013-sp, + title = "Regularized vector field learning with sparse approximation for + mismatch removal", + author = "Ma, Jiayi and Zhao, Ji and Tian, Jinwen and Bai, Xiang and Tu, + Zhuowen", + abstract = "In vector field learning, regularized kernel methods such as + regularized least-squares require the number of basis functions + to be equivalent to the training sample size, N. The learning + process thus has $O(N^3)$ and $O(N^2)$ in the time and space + complexity, respectively. This poses significant burden on the + vector learning problem for large datasets. In this paper, we + propose a sparse approximation to a robust vector field learning + method, sparse vector field consensus (SparseVFC), and derive a + statistical learning bound on the speed of the convergence. We + apply SparseVFC to the mismatch removal problem. The quantitative + results on benchmark datasets demonstrate the significant speed + advantage of SparseVFC over the original VFC algorithm (two + orders of magnitude faster) without much performance degradation; + we also demonstrate the large improvement by SparseVFC over + traditional methods like RANSAC. 
Moreover, the proposed method is + general and it can be applied to other applications in vector + field learning.", + journal = "Pattern Recognit.", + volume = 46, + number = 12, + pages = "3519--3532", + month = dec, + year = 2013, + keywords = "Vector field learning; Sparse approximation; Regularization; + Reproducing kernel Hilbert space; Outlier; Mismatch removal" +} + +@ARTICLE{Smedley2015-su, + title = "The {BioMart} community portal: an innovative alternative to + large, centralized data repositories", + author = "Smedley, Damian and Haider, Syed and Durinck, Steffen and + Pandini, Luca and Provero, Paolo and Allen, James and Arnaiz, + Olivier and Awedh, Mohammad Hamza and Baldock, Richard and + Barbiera, Giulia and Bardou, Philippe and Beck, Tim and Blake, + Andrew and Bonierbale, Merideth and Brookes, Anthony J and Bucci, + Gabriele and Buetti, Iwan and Burge, Sarah and Cabau, C{\'e}dric + and Carlson, Joseph W and Chelala, Claude and Chrysostomou, + Charalambos and Cittaro, Davide and Collin, Olivier and Cordova, + Raul and Cutts, Rosalind J and Dassi, Erik and Di Genova, Alex + and Djari, Anis and Esposito, Anthony and Estrella, Heather and + Eyras, Eduardo and Fernandez-Banet, Julio and Forbes, Simon and + Free, Robert C and Fujisawa, Takatomo and Gadaleta, Emanuela and + Garcia-Manteiga, Jose M and Goodstein, David and Gray, Kristian + and Guerra-Assun{\c c}{\~a}o, Jos{\'e} Afonso and Haggarty, + Bernard and Han, Dong-Jin and Han, Byung Woo and Harris, Todd and + Harshbarger, Jayson and Hastings, Robert K and Hayes, Richard D + and Hoede, Claire and Hu, Shen and Hu, Zhi-Liang and Hutchins, + Lucie and Kan, Zhengyan and Kawaji, Hideya and Keliet, Aminah and + Kerhornou, Arnaud and Kim, Sunghoon and Kinsella, Rhoda and + Klopp, Christophe and Kong, Lei and Lawson, Daniel and Lazarevic, + Dejan and Lee, Ji-Hyun and Letellier, Thomas and Li, Chuan-Yun + and Lio, Pietro and Liu, Chu-Jun and Luo, Jie and Maass, + Alejandro and Mariette, Jerome and Maurel, 
Thomas and Merella, + Stefania and Mohamed, Azza Mostafa and Moreews, Francois and + Nabihoudine, Ibounyamine and Ndegwa, Nelson and Noirot, + C{\'e}line and Perez-Llamas, Cristian and Primig, Michael and + Quattrone, Alessandro and Quesneville, Hadi and Rambaldi, Davide + and Reecy, James and Riba, Michela and Rosanoff, Steven and + Saddiq, Amna Ali and Salas, Elisa and Sallou, Olivier and + Shepherd, Rebecca and Simon, Reinhard and Sperling, Linda and + Spooner, William and Staines, Daniel M and Steinbach, Delphine + and Stone, Kevin and Stupka, Elia and Teague, Jon W and Dayem + Ullah, Abu Z and Wang, Jun and Ware, Doreen and Wong-Erasmus, + Marie and Youens-Clark, Ken and Zadissa, Amonida and Zhang, + Shi-Jian and Kasprzyk, Arek", + abstract = "The BioMart Community Portal (www.biomart.org) is a + community-driven effort to provide a unified interface to + biomedical databases that are distributed worldwide. The portal + provides access to numerous database projects supported by 30 + scientific organizations. It includes over 800 different + biological datasets spanning genomics, proteomics, model + organisms, cancer data, ontology information and more. All + resources available through the portal are independently + administered and funded by their host organizations. The BioMart + data federation technology provides a unified interface to all + the available data. The latest version of the portal comes with + many new databases that have been created by our ever-growing + community. It also comes with better support and extensibility + for data analysis and visualization tools. A new addition to our + toolbox, the enrichment analysis tool is now accessible through + graphical and web service interface. The BioMart community portal + averages over one million requests per day. 
Building on this + level of service and the wealth of information that has become + available, the BioMart Community Portal has introduced a new, + more scalable and cheaper alternative to the large data stores + maintained by specialized organizations.", + journal = "Nucleic Acids Res.", + volume = 43, + number = "W1", + pages = "W589--98", + month = jul, + year = 2015, + language = "en" +} + +@MISC{Rodriques2020-xc, + title = "{RNA} timestamps identify the age of single molecules in {RNA} + sequencing", + author = "Rodriques, Samuel G and Chen, Linlin M and Liu, Sophia and Zhong, + Ellen D and Scherrer, Joseph R and Boyden, Edward S and Chen, Fei", + journal = "Nature Biotechnology", + year = 2020 +} + +@ARTICLE{Saunders2019-ey, + title = "Thyroid hormone regulates distinct paths to maturation in pigment cell lineages", + author = "Saunders, Lauren M and Mishra, Abhishek K and Aman, Andrew J and Lewis, Victor M and Toomey, Matthew B and Packer, Jonathan S and + Qiu, Xiaojie and McFaline-Figueroa, Jose L and Corbo, Joseph C + and Trapnell, Cole and Parichy, David M", + abstract = "Thyroid hormone (TH) regulates diverse developmental events and can drive disparate cellular outcomes. In zebrafish, TH has + opposite effects on neural crest derived pigment cells of the adult stripe pattern, limiting melanophore population expansion, yet increasing yellow/orange xanthophore numbers. To learn how TH + elicits seemingly opposite responses in cells having a common + embryological origin, we analyzed individual transcriptomes from + thousands of neural crest-derived cells, reconstructed + developmental trajectories, identified pigment cell-lineage + specific responses to TH, and assessed roles for TH receptors. We + show that TH promotes maturation of both cell types but in + distinct ways. In melanophores, TH drives terminal differentiation, limiting final cell numbers. In xanthophores, TH promotes accumulation of orange carotenoids, making the cells visible. 
TH receptors act primarily to repress these programs when TH is limiting. Our findings show how a single endocrine factor integrates very different cellular activities during the generation of adult form.", + journal = "Elife", + volume = 8, + month = may, + year = 2019, + keywords = "developmental biology; genetics; genomics; melanophore; neural + crest; pigmentation; post-embryonic development; thyroid hormone; + xanthophore; zebrafish", + language = "en" +} + +% The entry below contains non-ASCII chars that could not be converted +% to a LaTeX equivalent. +@ARTICLE{Zhu2020-lx, + title = "Developmental trajectory of prehematopoietic stem cell formation + from endothelium", + author = "Zhu, Qin and Gao, Peng and Tober, Joanna and Bennett, Laura and + Chen, Changya and Uzun, Yasin and Li, Yan and Howell, Elizabeth D + and Mumau, Melanie and Yu, Wenbao and He, Bing and Speck, Nancy A + and Tan, Kai", + abstract = "Hematopoietic stem and progenitor cells (HSPCs) in the bone + marrow are derived from a small population of hemogenic + endothelial (HE) cells located in the major arteries of the + mammalian embryo. HE cells undergo an endothelial to + hematopoietic cell transition, giving rise to HSPCs that + accumulate in intra-arterial clusters (IAC) before colonizing the + fetal liver. To examine the cell and molecular transitions + between endothelial (E), HE, and IAC cells, and the heterogeneity + of HSPCs within IACs, we profiled ∼40 000 cells from the caudal + arteries (dorsal aorta, umbilical, vitelline) of 9.5 days post + coitus (dpc) to 11.5 dpc mouse embryos by single-cell RNA + sequencing and single-cell assay for transposase-accessible + chromatin sequencing. We identified a continuous developmental + trajectory from E to HE to IAC cells, with identifiable + intermediate stages. The intermediate stage most proximal to HE, + which we term pre-HE, is characterized by increased accessibility + of chromatin enriched for SOX, FOX, GATA, and SMAD motifs. 
A + developmental bottleneck separates pre-HE from HE, with RUNX1 + dosage regulating the efficiency of the pre-HE to HE transition. + A distal candidate Runx1 enhancer exhibits high chromatin + accessibility specifically in pre-HE cells at the bottleneck, but + loses accessibility thereafter. Distinct developmental + trajectories within IAC cells result in 2 populations of CD45+ + HSPCs; an initial wave of lymphomyeloid-biased progenitors, + followed by precursors of hematopoietic stem cells (pre-HSCs). + This multiomics single-cell atlas significantly expands our + understanding of pre-HSC ontogeny.", + journal = "Blood", + volume = 136, + number = 7, + pages = "845--856", + month = aug, + year = 2020, + language = "en" +} + +@UNPUBLISHED{Maehara2019-tn, + title = "Modeling latent flows on single-cell data using the Hodge + decomposition", + author = "Maehara, Kazumitsu and Ohkawa, Yasuyuki", + abstract = "Abstract Single-cell analysis is a powerful technique used to + identify a specific cell population of interest during + differentiation, aging, or oncogenesis. Individual cells occupy a + particular transient state in the cell cycle, circadian rhythm, + or during cell death. An appealing concept of pseudo-time + trajectory analysis of single-cell RNA sequencing data was + proposed in the software Monocle, and several methods of + trajectory analysis have since been published to date. These aim + to infer the ordering of cells and enable the tracing of gene + expression profile trajectories in cell differentiation and + reprogramming. However, the methods are restricted in terms of + time structure because of the pre-specified structure of + trajectories (linear, branched, tree or cyclic) which contrasts + with the mixed state of single cells. Here, we propose a technique + to extract underlying flows in single-cell data based on the + Hodge decomposition (HD). 
HD is a theorem of vector fields on a + manifold which guarantees that any given flow can decompose into + three types of orthogonal component: gradient-flow (acyclic), + curl-, and harmonic-flow (cyclic). HD is generalized on a + simplicial complex (graph) and the discretized HD has only a weak + assumption that the graph is directed. Therefore, in principle, + HD can extract flows from any mixture of tree and cyclic time + flows of observed cells. The decomposed flows provide intuitive + interpretations about complex flow because of their linearity and + orthogonality. Thus, each extracted flow can be focused on + separately with no need to consider crosstalk. We developed + ddhodge software, which aims to model the underlying flow + structure that implies unobserved time or causal relations in the + hodge-podge collection of data points. We demonstrated that the + mathematical framework of HD is suitable to reconstruct a sparse + graph representation of diffusion process as a candidate model of + differentiation while preserving the divergence of the original + fully-connected graph. The preserved divergence can be used as an + indicator of the source and sink cells in the observed + population. A sparse graph representation of the diffusion + process transforms data analysis of the non-linear structure + embedded in the high-dimensional space of single-cell data into + inspection of the visible flow using graph algorithms. 
Hence, + ddhodge is a suitable toolkit to visualize, inspect, and + subsequently interpret large data sets including, but not limited + to, high-throughput measurements of biological data. The beta + version of ddhodge R package is available at: + https://github.com/kazumits/ddhodge", + journal = "Cold Spring Harbor Laboratory", + pages = "592089", + month = mar, + year = 2019, + language = "en" +} + +@ARTICLE{Weinreb2020-dz, + title = "Lineage tracing on transcriptional landscapes links state to fate + during differentiation", + author = "Weinreb, Caleb and Rodriguez-Fraticelli, Alejo and Camargo, + Fernando D and Klein, Allon M", + abstract = "A challenge in biology is to associate molecular differences + among progenitor cells with their capacity to generate mature + cell types. Here, we used expressed DNA barcodes to clonally + trace transcriptomes over time and applied this to study fate + determination in hematopoiesis. We identified states of primed + fate potential and located them on a continuous transcriptional + landscape. We identified two routes of monocyte differentiation + that leave an imprint on mature cells. Analysis of sister cells + also revealed cells to have intrinsic fate biases not detectable + by single-cell RNA sequencing. Finally, we benchmarked + computational methods of dynamic inference from single-cell + snapshots, showing that fate choice occurs earlier than is + detected by state-of-the-art algorithms and that cells progress + steadily through pseudotime with precise and consistent dynamics.", + journal = "Science", + volume = 367, + number = 6479, + month = feb, + year = 2020, + language = "en" +} + +% The entry below contains non-ASCII chars that could not be converted +% to a LaTeX equivalent. 
+@ARTICLE{Cusanovich2018-wz, + title = "A {Single-Cell} Atlas of In Vivo Mammalian Chromatin + Accessibility", + author = "Cusanovich, Darren A and Hill, Andrew J and Aghamirzaie, Delasa + and Daza, Riza M and Pliner, Hannah A and Berletch, Joel B and + Filippova, Galina N and Huang, Xingfan and Christiansen, Lena and + DeWitt, William S and Lee, Choli and Regalado, Samuel G and Read, + David F and Steemers, Frank J and Disteche, Christine M and + Trapnell, Cole and Shendure, Jay", + abstract = "We applied a combinatorial indexing assay, sci-ATAC-seq, to + profile genome-wide chromatin accessibility in ∼100,000 single + cells from 13 adult mouse tissues. We identify 85 distinct + patterns of chromatin accessibility, most of which can be + assigned to cell types, and ∼400,000 differentially accessible + elements. We use these data to link regulatory elements to their + target genes, to define the transcription factor grammar + specifying each cell type, and to discover in vivo correlates of + heterogeneity in accessibility within cell types. We develop a + technique for mapping single cell gene expression data to + single-cell chromatin accessibility data, facilitating the + comparison of atlases. By intersecting mouse chromatin + accessibility with human genome-wide association summary + statistics, we identify cell-type-specific enrichments of the + heritability signal for hundreds of complex traits. 
These data + define the in vivo landscape of the regulatory genome for common + mammalian cell types at single-cell resolution.", + journal = "Cell", + volume = 174, + number = 5, + pages = "1309--1324.e18", + month = aug, + year = 2018, + keywords = "ATAC-seq; GWAS; chromatin; chromatin accessibility; epigenetics; + epigenomics; regulatory; single cell", + language = "en" +} + +@UNPUBLISHED{Cho2019-bh, + title = "Mathematical modeling with single-cell sequencing data", + author = "Cho, Heyrim and Rockne, Russell C", + abstract = "Abstract Single-cell sequencing technologies have revolutionized + molecular and cellular biology and stimulated the development of + computational tools to analyze the data generated from these + technology platforms. However, despite the recent explosion of + computational analysis tools, relatively few mathematical models + have been developed to utilize these data. Here we compare and + contrast two approaches for building mathematical models of cell + state-transitions with single-cell RNA-sequencing data with + hematopoeisis as a model system; by solving partial differential + equations on a graph representing discrete cell state + relationships, and by solving the equations on a continuous cell + state-space. We demonstrate how to calibrate model parameters + from single or multiple time-point single-cell sequencing data, + and examine the effects of data processing algorithms on the + model calibration and predictions. As an application of our + approach, we demonstrate how the calibrated models may be used to + mathematically perturb normal hematopoeisis to simulate, predict, + and study the emergence of novel cell types during the + pathogenesis of acute myeloid leukemia. 
The mathematical modeling + framework we present is general and can be applied to study cell + state-transitions in any single-cell genome sequencing + dataset. Author summary: Here we compare and contrast graph- and + continuum-based approaches for constructing mathematical models + of cell state-transitions using single-cell RNA-sequencing data. + Using two publicly available datasets, we demonstrate how to + calibrate mathematical models of hematopoeisis and how to use the + models to predict dynamics of acute myeloid leukemia pathogenesis + by mathematically perturbing the process of cellular + proliferation and differentiation. We apply these modeling + approaches to study the effects of perturbing individual or sets + of genes in subsets of cells, or by modeling the dynamics of cell + state-transitions directly in a reduced dimensional space. We + examine the effects of different graph abstraction and trajectory + inference algorithms on calibrating the models and the subsequent + model predictions. We conclude that both the graph- and + continuum-based modeling approaches can be equally well + calibrated to data and discuss situations in which one method may + be preferable over the other. This work presents a general + mathematical modeling framework, applicable to any single-cell + sequencing dataset where cell state-transitions are of interest.", + journal = "Cold Spring Harbor Laboratory", + pages = "710640", + month = jul, + year = 2019, + language = "en" +} + +@ARTICLE{Lubeck2014-pa, + title = "Single-cell in situ {RNA} profiling by sequential hybridization", + author = "Lubeck, Eric and Coskun, Ahmet F and Zhiyentayev, Timur and + Ahmad, Mubhij and Cai, Long", + journal = "Nat. 
Methods", + volume = 11, + number = 4, + pages = "360--361", + month = apr, + year = 2014, + language = "en" +} + +@ARTICLE{Karr2012-ns, + title = "A whole-cell computational model predicts phenotype from genotype", + author = "Karr, Jonathan R and Sanghvi, Jayodita C and Macklin, Derek N and + Gutschow, Miriam V and Jacobs, Jared M and Bolival, Jr, Benjamin + and Assad-Garcia, Nacyra and Glass, John I and Covert, Markus W", + abstract = "Understanding how complex phenotypes arise from individual + molecules and their interactions is a primary challenge in + biology that computational approaches are poised to tackle. We + report a whole-cell computational model of the life cycle of the + human pathogen Mycoplasma genitalium that includes all of its + molecular components and their interactions. An integrative + approach to modeling that combines diverse mathematics enabled + the simultaneous inclusion of fundamentally different cellular + processes and experimental measurements. Our whole-cell model + accounts for all annotated gene functions and was validated + against a broad range of data. The model provides insights into + many previously unobserved cellular behaviors, including in vivo + rates of protein-DNA association and an inverse relationship + between the durations of DNA replication initiation and + replication. In addition, experimental analysis directed by model + predictions identified previously undetected kinetic parameters + and biological functions. 
We conclude that comprehensive + whole-cell models can be used to facilitate biological discovery.", + journal = "Cell", + volume = 150, + number = 2, + pages = "389--401", + month = jul, + year = 2012, + language = "en" +} + +@UNPUBLISHED{Emanuel2020-aw, + title = "Bulk and single-cell gene expression profiling of {SARS-CoV-2} + infected human cell lines identifies molecular targets for + therapeutic intervention", + author = "Emanuel, Wyler and Kirstin, M{\"o}sbauer and Vedran, Franke and + Asija, Diag and Theresa, Gottula Lina and Roberto, Arsie and + Filippos, Klironomos and David, Koppstein and Salah, Ayoub and + Christopher, Buccitelli and Anja, Richter and Ivano, Legnini and + Andranik, Ivanov and Tommaso, Mari and Simone, Del Giudice and + Patrick, Papies Jan and Alexander, M{\"u}ller Marcel and Daniela, + Niemeyer and Matthias, Selbach and Altuna, Akalin and Nikolaus, + Rajewsky and Christian, Drosten and Markus, Landthaler", + abstract = "The coronavirus disease 2019 (COVID-19) pandemic, caused by the + novel severe acute respiratory syndrome coronavirus 2 + (SARS-CoV-2), is an ongoing global health threat with more than + two million infected people since its emergence in late 2019. + Detailed knowledge of the molecular biology of the infection is + indispensable for understanding of the viral replication, host + responses, and disease progression. We provide gene expression + profiles of SARS-CoV and SARS-CoV-2 infections in three human + cell lines (H1299, Caco-2 and Calu-3 cells), using bulk and + single-cell transcriptomics. Small RNA profiling showed strong + expression of the immunity and inflammation-associated microRNA + miRNA-155 upon infection with both viruses. SARS-CoV-2 elicited + approximately two-fold higher stimulation of the interferon + response compared to SARS-CoV in the permissive human epithelial + cell line Calu-3, and induction of cytokines such as CXCL10 or + IL6. 
Single cell RNA sequencing data showed that canonical + interferon stimulated genes such as IFIT2 or OAS2 were broadly + induced, whereas interferon beta (IFNB1) and lambda (IFNL1-4) + were expressed only in a subset of infected cells. In addition, + temporal resolution of transcriptional responses suggested + interferon regulatory factors (IRFs) activities precede that of + nuclear factor-$\kappa$B (NF-$\kappa$B). Lastly, we identified + heat shock protein 90 (HSP90) as a protein relevant for the + infection. Inhibition of the HSP90 chaperone activity by + Tanespimycin/17-N-allylamino-17-demethoxygeldanamycin (17-AAG) + resulted in a reduction of viral replication, and of TNF and IL1B + mRNA levels. In summary, our study established in vitro cell + culture models to study SARS-CoV-2 infection and identified HSP90 + protein as a potential drug target for therapeutic intervention of + SARS-CoV-2 infection. \#\#\# Competing Interest Statement The + authors have declared no competing interest.", + journal = "Cold Spring Harbor Laboratory", + pages = "2020.05.05.079194", + month = may, + year = 2020, + language = "en" +} + +@ARTICLE{Webb2020-ox, + title = "Sequence homology between human {PARP14} and the {SARS-CoV-2} + {ADP} ribose 1'-phosphatase", + author = "Webb, Thomas E and Saad, Ramy", + journal = "Immunol. Lett.", + volume = 224, + pages = "38--39", + month = aug, + year = 2020, + language = "en" +} + +@ARTICLE{Lim2009-cm, + title = "Silencing the {Metallothionein-2A} gene inhibits cell cycle + progression from {G1-} to S-phase involving {ATM} and cdc25A + signaling in breast cancer cells", + author = "Lim, Daina and Jocelyn, Koh Mei-Xin and Yip, George Wai-Cheong + and Bay, Boon-Huat", + abstract = "Metallothioneins (MTs) are a group of metal-binding proteins + involved in cell proliferation, differentiation and apoptosis. + The MT-2A isoform is generally the most abundant isoform among + the 10 known functional MT genes. 
In the present study, we + observed that down-regulation of the MT-2A gene in MCF-7 cells + via siRNA-mediated silencing inhibited cell growth by inducing + cell cycle arrest in G1-phase (G1-arrest) and a marginal increase + in cells in sub-G1-phase. Scanning electron microscopic + examination of the cells with silenced expression of MT-2A + (siMT-2A cells) revealed essentially normal cell morphology with + presence of scattered apoptotic cells. To elucidate the + underlying molecular mechanism, we examined the expression of + cell cycle related genes in MT-2A-silenced cells and found a + higher expression of the ataxia telangiectasia mutated (ATM) gene + concomitant with a lower expression of the cdc25A gene. These + data suggest that MT-2A could plausibly modulate cell cycle + progression from G1- to S-phase via the ATM/Chk2/cdc25A pathway.", + journal = "Cancer Lett.", + volume = 276, + number = 1, + pages = "109--117", + month = apr, + year = 2009, + language = "en" +} + +@ARTICLE{Wang2014-zc, + title = "Epigenetic state network approach for describing cell phenotypic + transitions", + author = "Wang, Ping and Song, Chaoming and Zhang, Hang and Wu, Zhanghan + and Tian, Xiao-Jun and Xing, Jianhua", + abstract = "Recent breakthroughs of cell phenotype reprogramming impose + theoretical challenges on unravelling the complexity of large + circuits maintaining cell phenotypes coupled at many different + epigenetic and gene regulation levels, and quantitatively + describing the phenotypic transition dynamics. A popular picture + proposed by Waddington views cell differentiation as a ball + sliding down a landscape with valleys corresponding to different + cell types separated by ridges. 
Based on theories of dynamical + systems, we establish a novel 'epigenetic state network' + framework that captures the global architecture of cell + phenotypes, which allows us to translate the metaphorical + low-dimensional Waddington epigenetic landscape concept into a + simple-yet-predictive rigorous mathematical framework of cell + phenotypic transitions. Specifically, we simplify a + high-dimensional epigenetic landscape into a collection of + discrete states corresponding to stable cell phenotypes connected + by optimal transition pathways among them. We then apply the + approach to the phenotypic transition processes among fibroblasts + (FBs), pluripotent stem cells (PSCs) and cardiomyocytes (CMs). + The epigenetic state network for this case predicts three major + transition pathways connecting FBs and CMs. One goes by way of + PSCs. The other two pathways involve transdifferentiation either + indirectly through cardiac progenitor cells or directly from FB + to CM. The predicted pathways and multiple intermediate states + are supported by existing microarray data and other experiments. + Our approach provides a theoretical framework for studying cell + phenotypic transitions. Future studies at single-cell levels can + directly test the model predictions.", + journal = "Interface Focus", + volume = 4, + number = 3, + pages = "20130068", + month = jun, + year = 2014, + keywords = "gene regulatory network; non-equilibrium steady state; nonlinear + dynamics", + language = "en" +} + +@UNPUBLISHED{Chapman2020-oj, + title = "Correlated Gene Modules Uncovered by {Single-Cell} + Transcriptomics with High Detectability and Accuracy", + author = "Chapman, Alec R and Lee, David F and Cai, Wenting and Ma, Wenping + and Li, Xiang and Sun, Wenjie and Sunney Xie, X", + abstract = "Abstract Single cell transcriptome sequencing has become + extremely useful for cell typing. 
However, such differential + expression data has shed little light on regulatory relationships + among genes. Here, by examining pairwise correlations between + mRNA levels of any two genes under steady-state conditions, we + uncovered correlated gene modules (CGMs), clusters of + intercorrelated genes that carry out certain biological functions + together. We report a novel single-cell RNA-seq method called + MALBAC-DT with higher detectability and accuracy, allowing + determination of the covariance matrix of the expressed mRNAs for + a homogenous cell population. We observed a prevalence of + positive correlations between pairs of genes, with higher + correlations corresponding to higher likelihoods of + protein-protein interactions. Some CGMs, such as the p53 module + in a cancer cell line, are cell type specific, while others, such + as the protein synthesis CGM, are shared by different cell types. + CGMs distinguished direct targets of p53 and exposed different + modes of regulation of these genes in different cell types. Our + covariance analyses of steady-state fluctuations provides a + powerful way to advance our functional understanding of + gene-to-gene interactions.", + journal = "Cold Spring Harbor Laboratory", + pages = "2019.12.31.892190", + month = jan, + year = 2020, + language = "en" +} + +@ARTICLE{Frieda2017-mz, + title = "Synthetic recording and in situ readout of lineage information in + single cells", + author = "Frieda, Kirsten L and Linton, James M and Hormoz, Sahand and + Choi, Joonhyuk and Chow, Ke-Huan K and Singer, Zakary S and + Budde, Mark W and Elowitz, Michael B and Cai, Long", + abstract = "Reconstructing the lineage relationships and dynamic event + histories of individual cells within their native spatial context + is a long-standing challenge in biology. Many biological + processes of interest occur in optically opaque or physically + inaccessible contexts, necessitating approaches other than direct + imaging. 
Here we describe a synthetic system that enables cells + to record lineage information and event histories in the genome + in a format that can be subsequently read out of single cells in + situ. This system, termed memory by engineered mutagenesis with + optical in situ readout (MEMOIR), is based on a set of barcoded + recording elements termed scratchpads. The state of a given + scratchpad can be irreversibly altered by CRISPR/Cas9-based + targeted mutagenesis, and later read out in single cells through + multiplexed single-molecule RNA fluorescence hybridization + (smFISH). Using MEMOIR as a proof of principle, we engineered + mouse embryonic stem cells to contain multiple scratchpads and + other recording components. In these cells, scratchpads were + altered in a progressive and stochastic fashion as the cells + proliferated. Analysis of the final states of scratchpads in + single cells in situ enabled reconstruction of lineage + information from cell colonies. Combining analysis of endogenous + gene expression with lineage reconstruction in the same cells + further allowed inference of the dynamic rates at which embryonic + stem cells switch between two gene expression states. Finally, + using simulations, we show how parallel MEMOIR systems operating + in the same cell could enable recording and readout of dynamic + cellular event histories. 
MEMOIR thus provides a versatile + platform for information recording and in situ, single-cell + readout across diverse biological systems.", + journal = "Nature", + volume = 541, + number = 7635, + pages = "107--111", + month = jan, + year = 2017, + language = "en" +} + +@ARTICLE{Matsuda2020-vf, + title = "Species-specific segmentation clock periods are due to + differential biochemical reaction speeds", + author = "Matsuda, Mitsuhiro and Hayashi, Hanako and Garcia-Ojalvo, Jordi + and Yoshioka-Kobayashi, Kumiko and Kageyama, Ryoichiro and + Yamanaka, Yoshihiro and Ikeya, Makoto and Toguchida, Junya and + Alev, Cantas and Ebisuya, Miki", + abstract = "Although mechanisms of embryonic development are similar between + mice and humans, the time scale is generally slower in humans. To + investigate these interspecies differences in development, we + recapitulate murine and human segmentation clocks that display 2- + to 3-hour and 5- to 6-hour oscillation periods, respectively. Our + interspecies genome-swapping analyses indicate that the period + difference is not due to sequence differences in the HES7 locus, + the core gene of the segmentation clock. Instead, we demonstrate + that multiple biochemical reactions of HES7, including the + degradation and expression delays, are slower in human cells than + they are in mouse cells. With the measured biochemical + parameters, our mathematical model accounts for the two- to + threefold period difference between the species. 
We propose that + cell-autonomous differences in biochemical reaction speeds + underlie temporal differences in development between species.", + journal = "Science", + volume = 369, + number = 6510, + pages = "1450--1455", + month = sep, + year = 2020, + language = "en" +} + +@ARTICLE{Kim2020-bo, + title = "The Architecture of {SARS-CoV-2} Transcriptome", + author = "Kim, Dongwan and Lee, Joo-Yeon and Yang, Jeong-Sun and Kim, Jun + Won and Kim, V Narry and Chang, Hyeshik", + abstract = "SARS-CoV-2 is a betacoronavirus responsible for the COVID-19 + pandemic. Although the SARS-CoV-2 genome was reported recently, + its transcriptomic architecture is unknown. Utilizing two + complementary sequencing techniques, we present a high-resolution + map of the SARS-CoV-2 transcriptome and epitranscriptome. DNA + nanoball sequencing shows that the transcriptome is highly + complex owing to numerous discontinuous transcription events. In + addition to the canonical genomic and 9 subgenomic RNAs, + SARS-CoV-2 produces transcripts encoding unknown ORFs with + fusion, deletion, and/or frameshift. Using nanopore direct RNA + sequencing, we further find at least 41 RNA modification sites on + viral transcripts, with the most frequent motif, AAGAA. Modified + RNAs have shorter poly(A) tails than unmodified RNAs, suggesting + a link between the modification and the 3' tail. 
Functional + investigation of the unknown transcripts and RNA modifications + discovered in this study will open new directions to our + understanding of the life cycle and pathogenicity of SARS-CoV-2.", + journal = "Cell", + volume = 181, + number = 4, + pages = "914--921.e10", + month = may, + year = 2020, + keywords = "COVID-19; RNA modification; SARS-CoV-2; coronavirus; direct RNA + sequencing; discontinuous transcription; epitranscriptome; + nanopore; poly(A) tail; transcriptome", + language = "en" +} + +@MISC{Creighton1958-ff, + title = "The Strategy of the Genes", + author = "Creighton, Harriet and Waddington, C H", + journal = "AIBS Bulletin", + volume = 8, + number = 2, + pages = "49", + year = 1958 +} + +@MISC{Alon2006-ea, + title = "An Introduction to Systems Biology", + author = "Alon, Uri", + year = 2006 +} + +@ARTICLE{McKenna2016-ma, + title = "Whole-organism lineage tracing by combinatorial and cumulative + genome editing", + author = "McKenna, Aaron and Findlay, Gregory M and Gagnon, James A and + Horwitz, Marshall S and Schier, Alexander F and Shendure, Jay", + abstract = "Multicellular systems develop from single cells through distinct + lineages. However, current lineage-tracing approaches scale + poorly to whole, complex organisms. Here, we use genome editing + to progressively introduce and accumulate diverse mutations in a + DNA barcode over multiple rounds of cell division. The barcode, + an array of clustered regularly interspaced short palindromic + repeats (CRISPR)/Cas9 target sites, marks cells and enables the + elucidation of lineage relationships via the patterns of + mutations shared between cells. In cell culture and zebrafish, we + show that rates and patterns of editing are tunable and that + thousands of lineage-informative barcode alleles can be + generated. By sampling hundreds of thousands of cells from + individual zebrafish, we find that most cells in adult organs + derive from relatively few embryonic progenitors. 
In future + analyses, genome editing of synthetic target arrays for lineage + tracing (GESTALT) can be used to generate large-scale maps of + cell lineage in multicellular systems for normal development and + disease.", + journal = "Science", + volume = 353, + number = 6298, + pages = "aaf7907", + month = jul, + year = 2016, + language = "en" +} + +@ARTICLE{Cusanovich2015-vn, + title = "Multiplex single cell profiling of chromatin accessibility by + combinatorial cellular indexing", + author = "Cusanovich, Darren A and Daza, Riza and Adey, Andrew and Pliner, + Hannah A and Christiansen, Lena and Gunderson, Kevin L and + Steemers, Frank J and Trapnell, Cole and Shendure, Jay", + abstract = "Technical advances have enabled the collection of genome and + transcriptome data sets with single-cell resolution. However, + single-cell characterization of the epigenome has remained + challenging. Furthermore, because cells must be physically + separated before biochemical processing, conventional single-cell + preparatory methods scale linearly. We applied combinatorial + cellular indexing to measure chromatin accessibility in thousands + of single cells per assay, circumventing the need for + compartmentalization of individual cells. We report chromatin + accessibility profiles from more than 15,000 single cells and use + these data to cluster cells on the basis of chromatin + accessibility landscapes. 
We identify modules of coordinately + regulated chromatin accessibility at the level of single cells + both between and within cell types, with a scalable method that + may accelerate progress toward a human cell atlas.", + journal = "Science", + volume = 348, + number = 6237, + pages = "910--914", + month = may, + year = 2015, + language = "en" +} + +@ARTICLE{Hochgerner2018-wk, + title = "Conserved properties of dentate gyrus neurogenesis across + postnatal development revealed by single-cell {RNA} sequencing", + author = "Hochgerner, Hannah and Zeisel, Amit and L{\"o}nnerberg, Peter and + Linnarsson, Sten", + abstract = "The dentate gyrus of the hippocampus is a brain region in which + neurogenesis persists into adulthood; however, the relationship + between developmental and adult dentate gyrus neurogenesis has + not been examined in detail. Here we used single-cell RNA + sequencing to reveal the molecular dynamics and diversity of + dentate gyrus cell types in perinatal, juvenile, and adult mice. + We found distinct quiescent and proliferating progenitor cell + types, linked by transient intermediate states to neuroblast + stages and fully mature granule cells. We observed shifts in the + molecular identity of quiescent and proliferating radial glia and + granule cells during the postnatal period that were then + maintained through adult stages. In contrast, intermediate + progenitor cells, neuroblasts, and immature granule cells were + nearly indistinguishable at all ages. These findings demonstrate + the fundamental similarity of postnatal and adult neurogenesis in + the hippocampus and pinpoint the early postnatal transformation + of radial glia from embryonic progenitors to adult quiescent stem + cells.", + journal = "Nat. 
Neurosci.", + volume = 21, + number = 2, + pages = "290--299", + month = feb, + year = 2018, + language = "en" +} + +@ARTICLE{Bastidas-Ponce2019-ma, + title = "Comprehensive single cell {mRNA} profiling reveals a detailed + roadmap for pancreatic endocrinogenesis", + author = "Bastidas-Ponce, Aim{\'e}e and Tritschler, Sophie and Dony, + Leander and Scheibner, Katharina and Tarquis-Medina, Marta and + Salinno, Ciro and Schirge, Silvia and Burtscher, Ingo and + B{\"o}ttcher, Anika and Theis, Fabian J and Lickert, Heiko and + Bakhti, Mostafa", + abstract = "Deciphering mechanisms of endocrine cell induction, specification + and lineage allocation in vivo will provide valuable insights + into how the islets of Langerhans are generated. Currently, it is + ill defined how endocrine progenitors segregate into different + endocrine subtypes during development. Here, we generated a novel + neurogenin 3 (Ngn3)-Venus fusion (NVF) reporter mouse line, that + closely mirrors the transient endogenous Ngn3 protein expression. + To define an in vivo roadmap of endocrinogenesis, we performed + single cell RNA sequencing of 36,351 pancreatic epithelial and + NVF+ cells during secondary transition. This allowed Ngn3low + endocrine progenitors, Ngn3high endocrine precursors, Fev+ + endocrine lineage and hormone+ endocrine subtypes to be + distinguished and time-resolved, and molecular programs during + the step-wise lineage restriction steps to be delineated. + Strikingly, we identified 58 novel signature genes that show the + same transient expression dynamics as Ngn3 in the 7260 profiled + Ngn3-expressing cells. The differential expression of these genes + in endocrine precursors associated with their cell-fate + allocation towards distinct endocrine cell types. 
Thus, the + generation of an accurately regulated NVF reporter allowed us to + temporally resolve endocrine lineage development to provide a + fine-grained single cell molecular profile of endocrinogenesis in + vivo.", + journal = "Development", + volume = 146, + number = 12, + month = jun, + year = 2019, + keywords = "Endocrine cell allocation; Endocrine progenitor-precursor; + Endocrinogenesis; Mouse; Neurog3; Single cell RNA sequencing", + language = "en" +} + +@ARTICLE{Liu2020-mw, + title = "{High-Spatial-Resolution} {Multi-Omics} Sequencing via + Deterministic Barcoding in Tissue", + author = "Liu, Yang and Yang, Mingyu and Deng, Yanxiang and Su, Graham and + Enninful, Archibald and Guo, Cindy C and Tebaldi, Toma and Zhang, + Di and Kim, Dongjoo and Bai, Zhiliang and Norris, Eileen and Pan, + Alisia and Li, Jiatong and Xiao, Yang and Halene, Stephanie and + Fan, Rong", + abstract = "We present deterministic barcoding in tissue for spatial omics + sequencing (DBiT-seq) for co-mapping of mRNAs and proteins in a + formaldehyde-fixed tissue slide via next-generation sequencing + (NGS). Parallel microfluidic channels were used to deliver DNA + barcodes to the surface of a tissue slide, and crossflow of two + sets of barcodes, A1-50 and B1-50, followed by ligation in situ, + yielded a 2D mosaic of tissue pixels, each containing a unique + full barcode AB. Application to mouse embryos revealed major + tissue types in early organogenesis as well as fine features like + microvasculature in a brain and pigmented epithelium in an eye + field. Gene expression profiles in 10-$\mu$m pixels conformed + into the clusters of single-cell transcriptomes, allowing for + rapid identification of cell types and spatial distributions. 
+ DBiT-seq can be adopted by researchers with no experience in + microfluidics and may find applications in a range of fields + including developmental biology, cancer biology, neuroscience, + and clinical pathology.", + journal = "Cell", + volume = 183, + number = 6, + pages = "1665--1681.e18", + month = dec, + year = 2020, + keywords = "high spatial resolution; in situ barcoding; mouse embryo; + next-generation sequencing; spatial multi-omics", + language = "en" +} + +@ARTICLE{Chen2015-nk, + title = "Spatially resolved, highly multiplexed {RNA} profiling in single + cells", + author = "Chen, Kok Hao and Boettiger, Alistair N and Moffitt, Jeffrey R + and Wang, Siyuan and Zhuang, Xiaowei", + abstract = "The basis of cellular function is where and when proteins are + expressed and in what quantities. Single-molecule fluorescence + in situ hybridization (smFISH) experiments quantify the copy + number and location of mRNA molecules; however, the numbers of + RNA species that can be simultaneously measured by smFISH has + been limited. Using combinatorial labeling with error-robust + encoding schemes, Chen et al. simultaneously imaged 100 to 1000 + RNA species in a single cell. Such large-scale detection allows + regulatory interactions to be analyzed at the transcriptome + scale. Science , this issue p. [10.1126/science.aaa6090][1] + \#\#\# INTRODUCTION The copy number and intracellular + localization of RNA are important regulators of gene expression. + Measurement of these properties at the transcriptome scale in + single cells will give answers to many questions related to gene + expression and regulation. Single-molecule RNA imaging + approaches, such as single-molecule fluorescence in situ + hybridization (smFISH), are powerful tools for counting and + mapping RNA; however, the number of RNA species that can be + simultaneously imaged in individual cells has been limited. 
This + makes it challenging to perform transcriptomic analysis of + single cells in a spatially resolved manner. Here, we report + multiplexed error-robust FISH (MERFISH), a single-molecule + imaging method that allows thousands of RNA species to be imaged + in single cells by using combinatorial FISH labeling with + encoding schemes capable of detecting and/or correcting errors. + \#\#\# RATIONALE We labeled each cellular RNA with a set of + encoding probes, which contain targeting sequences that bind the + RNA and readout sequences that bind fluorescently labeled + readout probes. Each RNA species is encoded with a particular + combination of readout sequences. We used successive rounds of + hybridization and imaging, each with a different readout probe, + to identify the readout sequences bound to each RNA and to + decode the RNA. In principle, combinatorial labeling allows the + number of detectable RNA species to grow exponentially with the + number of imaging rounds, but the detection errors also increase + exponentially. To combat such accumulating errors, we exploited + error-robust encoding schemes used in digital electronics, such + as the extended Hamming code, in the design of our encoding + probes but modified these schemes in order to account for the + error properties in FISH measurements. We assigned each RNA a + binary word in our modified Hamming code and encoded the RNA + with a combination of readout sequences according to this binary + word. \#\#\# RESULTS We first imaged 140 RNA species in human + fibroblast cells using MERFISH with 16 rounds of hybridization + and a modified Hamming code capable of both error detection and + correction. We obtained ~80\% detection efficiency and observed + excellent correlation of RNA copy numbers determined with + MERFISH with both bulk RNA sequencing data and conventional + smFISH measurements of individual genes. 
Next, we used an + alternative MERFISH encoding scheme, which is capable of + detecting but not correcting errors, to image 1001 RNA species + in individual cells using only 14 rounds of hybridization. The + observed RNA copy numbers again correlate well with bulk + sequencing data. However, the detection efficiency is only + one-third that of the error-correcting encoding scheme. We + performed correlation analysis of the $10^4$ to $10^6$ pairs of + measured genes and identified many covarying gene groups that + share common regulatory elements. Such grouping allowed us to + hypothesize potential functions of ~100 unannotated or partially + annotated genes of unknown functions. We further analyzed + correlations in the spatial distributions of different RNA + species and identified groups of RNAs with different + distribution patterns in the cell. \#\#\# DISCUSSION This highly + multiplexed imaging approach enables analyses based on the + variation and correlation of copy numbers and spatial + distributions of a large number of RNA species within single + cells. Such analyses should facilitate the delineation of + regulatory networks and in situ identification of cell types. We + envision that this approach will allow spatially resolved + transcriptomes to be determined for single cells. ![Figure][2] + MERFISH for transcriptome imaging. Numerous RNA species can be + identified, counted, and localized in a single cell by using + MERFISH, a single-molecule imaging approach that uses + combinatorial labeling and sequential imaging with encoding + schemes capable of detection and/or correction of errors. This + highly multiplexed measurement of individual RNAs can be used to + compute the gene expression profile and noise, covariation in + expression among different genes, and spatial distribution of + RNAs within single cells.
Knowledge of the expression profile + and spatial landscape of the transcriptome in individual cells + is essential for understanding the rich repertoire of cellular + behaviors. Here, we report multiplexed error-robust fluorescence + in situ hybridization (MERFISH), a single-molecule imaging + approach that allows the copy numbers and spatial localizations + of thousands of RNA species to be determined in single cells. + Using error-robust encoding schemes to combat single-molecule + labeling and detection errors, we demonstrated the imaging of + 100 to 1000 distinct RNA species in hundreds of individual + cells. Correlation analysis of the ~$10^4$ to $10^6$ pairs of genes + allowed us to constrain gene regulatory networks, predict novel + functions for many unannotated genes, and identify distinct + spatial distribution patterns of RNAs that correlate with + properties of the encoded proteins. [1]: + /lookup/doi/10.1126/science.aaa6090 [2]: pending:yes", + journal = "Science", + publisher = "American Association for the Advancement of Science", + volume = 348, + number = 6233, + month = apr, + year = 2015, + language = "en" +} + +@MISC{Iman1981-uu, + title = "An Approach to Sensitivity Analysis of Computer Models: Part + {I---Introduction}, Input Variable Selection and Preliminary + Variable Assessment", + author = "Iman, Ronald L and Helton, Jon C and Campbell, James E", + journal = "Journal of Quality Technology", + volume = 13, + number = 3, + pages = "174--183", + year = 1981 +} + +@ARTICLE{Bizzotto2020-nx, + title = "{SARS-CoV-2} Infection Boosts {MX1} Antiviral Effector in + {COVID-19} Patients", + author = "Bizzotto, Juan and Sanchis, Pablo and Abbate, Mercedes and + Lage-Vickers, Sof{\'\i}a and Lavignolle, Rosario and Toro, + Ayel{\'e}n and Olszevicki, Santiago and Sabater, Agustina and + Cascardo, Florencia and Vazquez, Elba and Cotignola, Javier and + Gueron, Geraldine", + abstract = "In a published case-control study (GSE152075) from + SARS-CoV-2-positive (n
= 403) and -negative patients (n = 50), we + analyzed the response to infection assessing gene expression of + host cell receptors and antiviral proteins. The expression + analysis associated with reported risk factors for COVID-19 was + also assessed. SARS-CoV-2 cases had higher ACE2, but lower + TMPRSS2, BSG/CD147, and CTSB expression compared with negative + cases. COVID-19 patients' age negatively affected ACE2 + expression. MX1 and MX2 were higher in COVID-19 patients. A + negative trend for MX1 and MX2 was observed as patients' age + increased. Principal-component analysis determined that ACE2, + MX1, MX2, and BSG/CD147 expression was able to cluster + non-COVID-19 and COVID-19 individuals. Multivariable regression + showed that MX1 expression significantly increased for each unit + of viral load increment. Altogether, these findings support + differences in ACE2, MX1, MX2, and BSG/CD147 expression between + COVID-19 and non-COVID-19 patients and point out to MX1 as a + critical responder in SARS-CoV-2 infection.", + journal = "iScience", + volume = 23, + number = 10, + pages = "101585", + month = oct, + year = 2020, + keywords = "Health Informatics; Virology", + language = "en" +} + +@ARTICLE{Qiu2020-kj, + title = "Inferring Causal Gene Regulatory Networks from Coupled + {Single-Cell} Expression Dynamics Using Scribe", + author = "Qiu, Xiaojie and Rahimzamani, Arman and Wang, Li and Ren, + Bingcheng and Mao, Qi and Durham, Timothy and McFaline-Figueroa, + Jos{\'e} L and Saunders, Lauren and Trapnell, Cole and Kannan, + Sreeram", + abstract = "Here, we present Scribe + (https://github.com/aristoteleo/Scribe-py), a toolkit for + detecting and visualizing causal regulatory interactions between + genes and explore the potential for single-cell experiments to + power network reconstruction. 
Scribe employs restricted directed + information to determine causality by estimating the strength of + information transferred from a potential regulator to its + downstream target. We apply Scribe and other leading approaches + for causal network reconstruction to several types of single-cell + measurements and show that there is a dramatic drop in + performance for ``pseudotime''-ordered single-cell data compared + with true time-series data. We demonstrate that performing causal + inference requires temporal coupling between measurements. We + show that methods such as ``RNA velocity'' restore some degree of + coupling through an analysis of chromaffin cell fate commitment. + These analyses highlight a shortcoming in experimental and + computational methods for analyzing gene regulation at + single-cell resolution and suggest ways of overcoming it.", + journal = "Cell Syst", + volume = 10, + number = 3, + pages = "265--274.e11", + month = mar, + year = 2020, + keywords = "RNA velocity; Scribe; causal network inference; coupled dynamics; + gene regulatory network inference; pseudotime; real time; + single-cell RNA-seq; single-cell trajectories; slam-seq", + language = "en" +} + +@ARTICLE{Baker2010-yk, + title = "Taking a long, hard look", + author = "Baker, Monya", + abstract = "Long-term, live-cell imaging helps to settle long-running + debates. Monya Baker investigates how the huge investment and + time commitment is finally paying off.", + journal = "Nature", + publisher = "Nature Publishing Group", + volume = 466, + number = 7310, + pages = "1137--1138", + month = aug, + year = 2010, + language = "en" +} + +@BOOK{Alon2019-yd, + title = "An Introduction to Systems Biology: Design Principles of + Biological Circuits", + author = "Alon, Uri", + abstract = "Praise for the first edition: ... superb, beautifully written + and organized work that takes an engineering approach to systems + biology. 
Alon provides nicely written appendices to explain the + basic mathematical and biological concepts clearly and + succinctly without interfering with the main text. He starts + with a mathematical description of transcriptional activation + and then describes some basic transcription-network motifs + (patterns) that can be combined to form larger networks. -- + Nature [This text deserves] serious attention from any + quantitative scientist who hopes to learn about modern biology + ... It assumes no prior knowledge of or even interest in biology + ... One final aspect that must be mentioned is the wonderful set + of exercises that accompany each chapter. ... Alon's book should + become a standard part of the training of graduate students. -- + Physics Today Written for students and researchers, the second + edition of this best-selling textbook continues to offer a clear + presentation of design principles that govern the structure and + behavior of biological systems. It highlights simple, recurring + circuit elements that make up the regulation of cells and + tissues. Rigorously classroom-tested, this edition includes new + chapters on exciting advances made in the last decade. 
Features: + Includes seven new chapters The new edition has 189 exercises, + the previous edition had 66 Offers new examples relevant to + human physiology and disease The book website including course + videos can be found here: + https://www.weizmann.ac.il/mcb/UriAlon/introduction-systems-biology-design-principles-biological-circuits.", + publisher = "CRC Press", + month = jul, + year = 2019, + language = "en" +} + +@ARTICLE{Petukhov2018-ck, + title = "dropEst: pipeline for accurate estimation of molecular counts in + droplet-based single-cell {RNA-seq} experiments", + author = "Petukhov, Viktor and Guo, Jimin and Baryawno, Ninib and Severe, + Nicolas and Scadden, David T and Samsonova, Maria G and + Kharchenko, Peter V", + abstract = "Recent single-cell RNA-seq protocols based on droplet + microfluidics use massively multiplexed barcoding to enable + simultaneous measurements of transcriptomes for thousands of + individual cells. The increasing complexity of such data creates + challenges for subsequent computational processing and + troubleshooting of these experiments, with few software options + currently available. Here, we describe a flexible pipeline for + processing droplet-based transcriptome data that implements + barcode corrections, classification of cell quality, and + diagnostic information about the droplet libraries. We introduce + advanced methods for correcting composition bias and sequencing + errors affecting cellular and molecular barcodes to provide more + accurate estimates of molecular counts in individual cells.", + journal = "Genome Biol.", + volume = 19, + number = 1, + pages = "78", + month = jun, + year = 2018, + language = "en" +} + +@ARTICLE{Macklin2020-vj, + title = "Simultaneous cross-evaluation of heterogeneous E. 
coli datasets + via mechanistic simulation", + author = "Macklin, Derek N and Ahn-Horst, Travis A and Choi, Heejo and + Ruggero, Nicholas A and Carrera, Javier and Mason, John C and + Sun, Gwanggyu and Agmon, Eran and DeFelice, Mialy M and Maayan, + Inbal and Lane, Keara and Spangler, Ryan K and Gillies, Taryn E + and Paull, Morgan L and Akhter, Sajia and Bray, Samuel R and + Weaver, Daniel S and Keseler, Ingrid M and Karp, Peter D and + Morrison, Jerry H and Covert, Markus W", + abstract = "The extensive heterogeneity of biological data poses challenges + to analysis and interpretation. Construction of a large-scale + mechanistic model of Escherichia coli enabled us to integrate and + cross-evaluate a massive, heterogeneous dataset based on + measurements reported by various groups over decades. We + identified inconsistencies with functional consequences across + the data, including that the total output of the ribosomes and + RNA polymerases described by data are not sufficient for a cell + to reproduce measured doubling times, that measured metabolic + parameters are neither fully compatible with each other nor with + overall growth, and that essential proteins are absent during the + cell cycle-and the cell is robust to this absence. 
Finally, + considering these data as a whole leads to successful predictions + of new experimental outcomes, in this case protein half-lives.", + journal = "Science", + volume = 369, + number = 6502, + month = jul, + year = 2020, + language = "en" +} + +@ARTICLE{Chan2019-cc, + title = "Molecular recording of mammalian embryogenesis", + author = "Chan, Michelle M and Smith, Zachary D and Grosswendt, Stefanie + and Kretzmer, Helene and Norman, Thomas M and Adamson, Britt and + Jost, Marco and Quinn, Jeffrey J and Yang, Dian and Jones, + Matthew G and Khodaverdian, Alex and Yosef, Nir and Meissner, + Alexander and Weissman, Jonathan S", + abstract = "Ontogeny describes the emergence of complex multicellular + organisms from single totipotent cells. This field is + particularly challenging in mammals, owing to the indeterminate + relationship between self-renewal and differentiation, variation + in progenitor field sizes, and internal gestation in these + animals. Here we present a flexible, high-information, + multi-channel molecular recorder with a single-cell readout and + apply it as an evolving lineage tracer to assemble mouse + cell-fate maps from fertilization through gastrulation. By + combining lineage information with single-cell RNA sequencing + profiles, we recapitulate canonical developmental relationships + between different tissue types and reveal the nearly complete + transcriptional convergence of endodermal cells of + extra-embryonic and embryonic origins. Finally, we apply our + cell-fate maps to estimate the number of embryonic progenitor + cells and their degree of asymmetric partitioning during + specification. 
Our approach enables massively parallel, + high-resolution recording of lineage and other information in + mammalian systems, which will facilitate the construction of a + quantitative framework for understanding developmental processes.", + journal = "Nature", + volume = 570, + number = 7759, + pages = "77--82", + month = jun, + year = 2019, + language = "en" +} + +@ARTICLE{Qiu2012-yt, + title = "From understanding the development landscape of the canonical + fate-switch pair to constructing a dynamic landscape for two-step + neural differentiation", + author = "Qiu, Xiaojie and Ding, Shanshan and Shi, Tieliu", + abstract = "Recent progress in stem cell biology, notably cell fate + conversion, calls for novel theoretical understanding for cell + differentiation. The existing qualitative concept of Waddington's + ``epigenetic landscape'' has attracted particular attention + because it captures subsequent fate decision points, thus + manifesting the hierarchical (``tree-like'') nature of cell fate + diversification. Here, we generalized a recent work and explored + such a developmental landscape for a two-gene fate decision + circuit by integrating the underlying probability landscapes with + different parameters (corresponding to distinct developmental + stages). The change of entropy production rate along the + parameter changes indicates which parameter changes can represent + a normal developmental process while other parameters' change can + not. The transdifferentiation paths over the landscape under + certain conditions reveal the possibility of a direct and + reversible phenotypic conversion. As the intensity of noise + increases, we found that the landscape becomes flatter and the + dominant paths more straight, implying the importance of + biological noise processing mechanism in development and + reprogramming. 
We further extended the landscape of the one-step + fate decision to that for two-step decisions in central nervous + system (CNS) differentiation. A minimal network and dynamic model + for CNS differentiation was firstly constructed where two + three-gene motifs are coupled. We then implemented the SDEs + (Stochastic Differentiation Equations) simulation for the + validity of the network and model. By integrating the two + landscapes for the two switch gene pairs, we constructed the + two-step development landscape for CNS differentiation. Our work + provides new insights into cellular differentiation and important + clues for better reprogramming strategies.", + journal = "PLoS One", + volume = 7, + number = 12, + pages = "e49271", + month = dec, + year = 2012, + language = "en" +} + +@ARTICLE{Jurges2018-aj, + title = "Dissecting newly transcribed and old {RNA} using {GRAND-SLAM}", + author = "J{\"u}rges, Christopher and D{\"o}lken, Lars and Erhard, Florian", + abstract = "Summary: Global quantification of total RNA is used to + investigate steady state levels of gene expression. However, + being able to differentiate pre-existing RNA (that has been + synthesized prior to a defined point in time) and newly + transcribed RNA can provide invaluable information e.g. to + estimate RNA half-lives or identify fast and complex regulatory + processes. Recently, new techniques based on metabolic labeling + and RNA-seq have emerged that allow to quantify new and old RNA: + Nucleoside analogs are incorporated into newly transcribed RNA + and are made detectable as point mutations in mapped reads. + However, relatively infrequent incorporation events and + significant sequencing error rates make the differentiation + between old and new RNA a highly challenging task. We developed a + statistical approach termed GRAND-SLAM that, for the first time, + allows to estimate the proportion of old and new RNA in such an + experiment. 
Uncertainty in the estimates is quantified in a + Bayesian framework. Simulation experiments show our approach to + be unbiased and highly accurate. Furthermore, we analyze how + uncertainty in the proportion translates into uncertainty in + estimating RNA half-lives and give guidelines for planning + experiments. Finally, we demonstrate that our estimates of RNA + half-lives compare favorably to other experimental approaches and + that biological processes affecting RNA half-lives can be + investigated with greater power than offered by any other method. + GRAND-SLAM is freely available for non-commercial use at + http://software.erhard-lab.de; R scripts to generate all figures + are available at zenodo (doi: 10.5281/zenodo.1162340).", + journal = "Bioinformatics", + volume = 34, + number = 13, + pages = "i218--i226", + month = jul, + year = 2018, + language = "en" +} + +@ARTICLE{Huang2007-ns, + title = "Bifurcation dynamics in lineage-commitment in bipotent progenitor + cells", + author = "Huang, Sui and Guo, Yan-Ping and May, Gillian and Enver, Tariq", + abstract = "Lineage specification of multipotent progenitor cells is governed + by a balance of lineage-affiliated transcription factors, such as + GATA1 and PU.1, which regulate the choice between erythroid and + myelomonocytic fates. But how ratios of lineage-determining + transcription factors stabilize progenitor cells and resolve + their indeterminacy to commit them to discrete, mutually + exclusive fates remains unexplained. We used a simple model and + experimental measurements to analyze the dynamics of a binary + fate decision governed by a gene-circuit containing + auto-stimulation and cross-inhibition, as embodied by the + GATA1-PU.1 paradigm. This circuit generates stable attractors + corresponding to erythroid and myelomonocytic fates, as well as + an uncommitted metastable state characterized by coexpression of + both regulators, explaining the phenomenon of ``multilineage + priming''. 
GATA1 and PU.1 mRNA and transcriptome dynamics of + differentiating progenitor cells confirm that commitment occurs + in two stages, as suggested by the model: first, the progenitor + state is destabilized in an almost symmetrical bifurcation event, + resulting in a poised state at the boundary between the two + lineage-specific attractors; second, the cell is driven to the + respective, now accessible attractors. This minimal model + captures fundamental features of binary cell fate decisions, + uniting the concepts of stochastic (selective) and deterministic + (instructive) regulation, and hence, may apply to a wider range + of binary fate decision points.", + journal = "Dev. Biol.", + volume = 305, + number = 2, + pages = "695--713", + month = may, + year = 2007, + language = "en" +} + +@ARTICLE{Trapnell2014-kk, + title = "The dynamics and regulators of cell fate decisions are revealed + by pseudotemporal ordering of single cells", + author = "Trapnell, Cole and Cacchiarelli, Davide and Grimsby, Jonna and + Pokharel, Prapti and Li, Shuqiang and Morse, Michael and Lennon, + Niall J and Livak, Kenneth J and Mikkelsen, Tarjei S and Rinn, + John L", + abstract = "Defining the transcriptional dynamics of a temporal process such + as cell differentiation is challenging owing to the high + variability in gene expression between individual cells. + Time-series gene expression analyses of bulk cells have + difficulty distinguishing early and late phases of a + transcriptional cascade or identifying rare subpopulations of + cells, and single-cell proteomic methods rely on a priori + knowledge of key distinguishing markers. Here we describe + Monocle, an unsupervised algorithm that increases the temporal + resolution of transcriptome dynamics using single-cell RNA-Seq + data collected at multiple time points. 
Applied to the + differentiation of primary human myoblasts, Monocle revealed + switch-like changes in expression of key regulatory factors, + sequential waves of gene regulation, and expression of regulators + that were not known to act in differentiation. We validated some + of these predicted regulators in a loss-of-function screen. + Monocle can in principle be used to recover single-cell gene + expression kinetics from a wide array of cellular processes, + including differentiation, proliferation and oncogenic + transformation.", + journal = "Nat. Biotechnol.", + volume = 32, + number = 4, + pages = "381--386", + month = apr, + year = 2014, + language = "en" +} + +@ARTICLE{Saelens2019-ts, + title = "A comparison of single-cell trajectory inference methods", + author = "Saelens, Wouter and Cannoodt, Robrecht and Todorov, Helena and + Saeys, Yvan", + abstract = "Trajectory inference approaches analyze genome-wide omics data + from thousands of single cells and computationally infer the + order of these cells along developmental trajectories. Although + more than 70 trajectory inference tools have already been + developed, it is challenging to compare their performance because + the input they require and output models they produce vary + substantially. Here, we benchmark 45 of these methods on 110 real + and 229 synthetic datasets for cellular ordering, topology, + scalability and usability. Our results highlight the + complementarity of existing tools, and that the choice of method + should depend mostly on the dataset dimensions and trajectory + topology. Based on these results, we develop a set of guidelines + to help users select the best method for their dataset. Our + freely available data and evaluation pipeline + (https://benchmark.dynverse.org) will aid in the development of + improved tools designed to analyze increasingly large and complex + single-cell datasets.", + journal = "Nat.
Biotechnol.", + volume = 37, + number = 5, + pages = "547--554", + month = may, + year = 2019, + language = "en" +} + +@ARTICLE{Battich2020-gj, + title = "Sequencing metabolically labeled transcripts in single cells + reveals {mRNA} turnover strategies", + author = "Battich, Nico and Beumer, Joep and de Barbanson, Buys and + Krenning, Lenno and Baron, Chlo{\'e} S and Tanenbaum, Marvin E + and Clevers, Hans and van Oudenaarden, Alexander", + abstract = "The regulation of messenger RNA levels in mammalian cells can be + achieved by the modulation of synthesis and degradation rates. + Metabolic RNA-labeling experiments in bulk have quantified these + rates using relatively homogeneous cell populations. However, to + determine these rates during complex dynamical processes, for + instance during cellular differentiation, single-cell resolution + is required. Therefore, we developed a method that simultaneously + quantifies metabolically labeled and preexisting unlabeled + transcripts in thousands of individual cells. We determined + synthesis and degradation rates during the cell cycle and during + differentiation of intestinal stem cells, revealing major + regulatory strategies. These strategies have distinct + consequences for controlling the dynamic range and precision of + gene expression. 
These findings advance our understanding of how + individual cells in heterogeneous populations shape their gene + expression dynamics.", + journal = "Science", + volume = 367, + number = 6482, + pages = "1151--1156", + month = mar, + year = 2020, + language = "en" +} + +@ARTICLE{Furlan2017-aj, + title = "Multipotent peripheral glial cells generate neuroendocrine cells + of the adrenal medulla", + author = "Furlan, Alessandro and Dyachuk, Vyacheslav and Kastriti, Maria + Eleni and Calvo-Enrique, Laura and Abdo, Hind and Hadjab, Saida + and Chontorotzea, Tatiana and Akkuratova, Natalia and Usoskin, + Dmitry and Kamenev, Dmitry and Petersen, Julian and Sunadome, + Kazunori and Memic, Fatima and Marklund, Ulrika and Fried, Kaj + and Topilko, Piotr and Lallemend, Francois and Kharchenko, Peter + V and Ernfors, Patrik and Adameyko, Igor", + abstract = "Adrenaline is a fundamental circulating hormone for bodily + responses to internal and external stressors. Chromaffin cells of + the adrenal medulla (AM) represent the main neuroendocrine + adrenergic component and are believed to differentiate from + neural crest cells. We demonstrate that large numbers of + chromaffin cells arise from peripheral glial stem cells, termed + Schwann cell precursors (SCPs). SCPs migrate along the visceral + motor nerve to the vicinity of the forming adrenal gland, where + they detach from the nerve and form postsynaptic neuroendocrine + chromaffin cells. An intricate molecular logic drives two + sequential phases of gene expression, one unique for a distinct + transient cellular state and another for cell type specification. + Subsequently, these programs down-regulate SCP-gene and + up-regulate chromaffin cell-gene networks. The AM forms through + limited cell expansion and requires the recruitment of numerous + SCPs. 
Thus, peripheral nerves serve as a stem cell niche for + neuroendocrine system development.", + journal = "Science", + volume = 357, + number = 6346, + month = jul, + year = 2017, + language = "en" +} + +@ARTICLE{Cao2020-ik, + title = "Tracking development at the cellular level", + author = "Cao, Junyue", + journal = "Science", + volume = 370, + number = 6519, + pages = "924--925", + month = nov, + year = 2020, + language = "en" +} + +@ARTICLE{Hu2020-an, + title = "{ZipSeq}: barcoding for real-time mapping of single cell + transcriptomes", + author = "Hu, Kenneth H and Eichorst, John P and McGinnis, Chris S and + Patterson, David M and Chow, Eric D and Kersten, Kelly and + Jameson, Stephen C and Gartner, Zev J and Rao, Arjun A and + Krummel, Matthew F", + abstract = "Spatial transcriptomics seeks to integrate single cell + transcriptomic data within the three-dimensional space of + multicellular biology. Current methods to correlate a cell's + position with its transcriptome in living tissues have various + limitations. We developed an approach, called 'ZipSeq', that uses + patterned illumination and photocaged oligonucleotides to + serially print barcodes ('zipcodes') onto live cells in intact + tissues, in real time and with an on-the-fly selection of + patterns. Using ZipSeq, we mapped gene expression in three + settings: in vitro wound healing, live lymph node sections and a + live tumor microenvironment. In all cases, we discovered new gene + expression patterns associated with histological structures. In + the tumor microenvironment, this demonstrated a trajectory of + myeloid and T cell differentiation from the periphery inward. A + combinatorial variation of ZipSeq efficiently scales in the + number of regions defined, providing a pathway for complete + mapping of live tissues, subsequent to real-time imaging or + perturbation.", + journal = "Nat. 
Methods", + volume = 17, + number = 8, + pages = "833--843", + month = aug, + year = 2020, + language = "en" +} + +@ARTICLE{Ma2020-kv, + title = "Chromatin Potential Identified by Shared {Single-Cell} Profiling + of {RNA} and Chromatin", + author = "Ma, Sai and Zhang, Bing and LaFave, Lindsay M and Earl, Andrew S + and Chiang, Zachary and Hu, Yan and Ding, Jiarui and Brack, + Alison and Kartha, Vinay K and Tay, Tristan and Law, Travis and + Lareau, Caleb and Hsu, Ya-Chieh and Regev, Aviv and Buenrostro, + Jason D", + abstract = "Cell differentiation and function are regulated across multiple + layers of gene regulation, including modulation of gene + expression by changes in chromatin accessibility. However, + differentiation is an asynchronous process precluding a temporal + understanding of regulatory events leading to cell fate + commitment. Here we developed simultaneous high-throughput ATAC + and RNA expression with sequencing (SHARE-seq), a highly scalable + approach for measurement of chromatin accessibility and gene + expression in the same single cell, applicable to different + tissues. Using 34,774 joint profiles from mouse skin, we develop + a computational strategy to identify cis-regulatory interactions + and define domains of regulatory chromatin (DORCs) that + significantly overlap with super-enhancers. During lineage + commitment, chromatin accessibility at DORCs precedes gene + expression, suggesting that changes in chromatin accessibility + may prime cells for lineage commitment. We computationally infer + chromatin potential as a quantitative measure of chromatin + lineage-priming and use it to predict cell fate outcomes. 
+ SHARE-seq is an extensible platform to study regulatory circuitry + across diverse cells in tissues.", + journal = "Cell", + volume = 183, + number = 4, + pages = "1103--1116.e20", + month = nov, + year = 2020, + keywords = "epigenomics; gene regulation; single cell; skin; stem cell", + language = "en" +} + +@ARTICLE{Rodriques2019-hk, + title = "Slide-seq: A scalable technology for measuring genome-wide + expression at high spatial resolution", + author = "Rodriques, Samuel G and Stickels, Robert R and Goeva, + Aleksandrina and Martin, Carly A and Murray, Evan and Vanderburg, + Charles R and Welch, Joshua and Chen, Linlin M and Chen, Fei and + Macosko, Evan Z", + abstract = "Spatial positions of cells in tissues strongly influence + function, yet a high-throughput, genome-wide readout of gene + expression with cellular resolution is lacking. We developed + Slide-seq, a method for transferring RNA from tissue sections + onto a surface covered in DNA-barcoded beads with known + positions, allowing the locations of the RNA to be inferred by + sequencing. Using Slide-seq, we localized cell types identified + by single-cell RNA sequencing datasets within the cerebellum and + hippocampus, characterized spatial gene expression patterns in + the Purkinje layer of mouse cerebellum, and defined the temporal + evolution of cell type-specific responses in a mouse model of + traumatic brain injury. 
These studies highlight how Slide-seq + provides a scalable method for obtaining spatially resolved gene + expression data at resolutions comparable to the sizes of + individual cells.", + journal = "Science", + volume = 363, + number = 6434, + pages = "1463--1467", + month = mar, + year = 2019, + language = "en" +} + +@ARTICLE{Erhard2019-oc, + title = "{scSLAM-seq} reveals core features of transcription dynamics in + single cells", + author = "Erhard, Florian and Baptista, Marisa A P and Krammer, Tobias and + Hennig, Thomas and Lange, Marius and Arampatzi, Panagiota and + J{\"u}rges, Christopher S and Theis, Fabian J and Saliba, + Antoine-Emmanuel and D{\"o}lken, Lars", + abstract = "Single-cell RNA sequencing (scRNA-seq) has highlighted the + important role of intercellular heterogeneity in phenotype + variability in both health and disease. However, current + scRNA-seq approaches provide only a snapshot of gene expression + and convey little information on the true temporal dynamics and + stochastic nature of transcription. A further key limitation of + scRNA-seq analysis is that the RNA profile of each individual + cell can be analysed only once. Here we introduce single-cell, + thiol-(SH)-linked alkylation of RNA for metabolic labelling + sequencing (scSLAM-seq), which integrates metabolic RNA + labelling, biochemical nucleoside conversion and scRNA-seq to + record transcriptional activity directly by differentiating + between new and old RNA for thousands of genes per single cell. + We use scSLAM-seq to study the onset of infection with lytic + cytomegalovirus in single mouse fibroblasts. The cell-cycle state + and dose of infection deduced from old RNA enable dose-response + analysis based on new RNA. scSLAM-seq thereby both visualizes and + explains differences in transcriptional activity at the + single-cell level.
Furthermore, it depicts 'on-off' switches and + transcriptional burst kinetics in host gene expression with + extensive gene-specific differences that correlate with + promoter-intrinsic features (TBP-TATA-box interactions and DNA + methylation). Thus, gene-specific, and not cell-specific, + features explain the heterogeneity in transcriptomes between + individual cells and the transcriptional response to + perturbations.", + journal = "Nature", + volume = 571, + number = 7765, + pages = "419--423", + month = jul, + year = 2019, + language = "en" +} + +@ARTICLE{Klein2015-nw, + title = "Droplet barcoding for single-cell transcriptomics applied to + embryonic stem cells", + author = "Klein, Allon M and Mazutis, Linas and Akartuna, Ilke and + Tallapragada, Naren and Veres, Adrian and Li, Victor and Peshkin, + Leonid and Weitz, David A and Kirschner, Marc W", + abstract = "It has long been the dream of biologists to map gene expression + at the single-cell level. With such data one might track + heterogeneous cell sub-populations, and infer regulatory + relationships between genes and pathways. Recently, RNA + sequencing has achieved single-cell resolution. What is limiting + is an effective way to routinely isolate and process large + numbers of individual cells for quantitative in-depth sequencing. + We have developed a high-throughput droplet-microfluidic approach + for barcoding the RNA from thousands of individual cells for + subsequent analysis by next-generation sequencing. The method + shows a surprisingly low noise profile and is readily adaptable + to other sequencing-based assays. We analyzed mouse embryonic + stem cells, revealing in detail the population structure and the + heterogeneous onset of differentiation after leukemia inhibitory + factor (LIF) withdrawal. The reproducibility of these + high-throughput single-cell data allowed us to deconstruct cell + populations and infer gene expression relationships. 
VIDEO + ABSTRACT.", + journal = "Cell", + volume = 161, + number = 5, + pages = "1187--1201", + month = may, + year = 2015, + language = "en" +} + +@ARTICLE{Weinreb2018-fo, + title = "Fundamental limits on dynamic inference from single-cell + snapshots", + author = "Weinreb, Caleb and Wolock, Samuel and Tusi, Betsabeh K and + Socolovsky, Merav and Klein, Allon M", + abstract = "Single-cell expression profiling reveals the molecular states of + individual cells with unprecedented detail. Because these methods + destroy cells in the process of analysis, they cannot measure how + gene expression changes over time. However, some information on + dynamics is present in the data: the continuum of molecular + states in the population can reflect the trajectory of a typical + cell. Many methods for extracting single-cell dynamics from + population data have been proposed. However, all such attempts + face a common limitation: for any measured distribution of cell + states, there are multiple dynamics that could give rise to it, + and by extension, multiple possibilities for underlying + mechanisms of gene regulation. Here, we describe the aspects of + gene expression dynamics that cannot be inferred from a static + snapshot alone and identify assumptions necessary to constrain a + unique solution for cell dynamics from static snapshots. We + translate these constraints into a practical algorithmic + approach, population balance analysis (PBA), which makes use of a + method from spectral graph theory to solve a class of + high-dimensional differential equations. We use simulations to + show the strengths and limitations of PBA, and then apply it to + single-cell profiles of hematopoietic progenitor cells (HPCs). + Cell state predictions from this analysis agree with HPC fate + assays reported in several papers over the past two decades. 
By + highlighting the fundamental limits on dynamic inference faced by + any method, our framework provides a rigorous basis for dynamic + interpretation of a gene expression continuum and clarifies best + experimental designs for trajectory reconstruction from static + snapshot measurements.", + journal = "Proc. Natl. Acad. Sci. U. S. A.", + volume = 115, + number = 10, + pages = "E2467--E2476", + month = mar, + year = 2018, + keywords = "dynamic inference; hematopoiesis; pseudotime; single cell; + spectral graph theory", + language = "en" +} + +@Article{Li2020-my, +author = {Li , Tiejun and Shi , Jifan and Wu , Yichong and Zhou , Peijie}, +title = {On the Mathematics of RNA Velocity I: Theoretical Analysis}, +journal = {CSIAM Transactions on Applied Mathematics}, +year = {2021}, +volume = {2}, +number = {1}, +pages = {1--55}, +abstract = { + The RNA velocity provides a new avenue to study the stemness and lineage of cells in the development in scRNA-seq data analysis. Some promising extensions of it are proposed and the community is experiencing a fast developing period. + However, in this stage, it is of prime importance to revisit the whole process of RNA + velocity analysis from the mathematical point of view, which will help to understand + the rationale and drawbacks of different proposals. The current paper is devoted to + this purpose. We present a thorough mathematical study on the RNA velocity model + from dynamics to downstream data analysis. We derived the analytical solution of + the RNA velocity model from both deterministic and stochastic point of view. We + presented the parameter inference framework based on the maximum likelihood estimate. We also derived the continuum limit of different downstream analysis methods, + which provides insights on the construction of transition probability matrix, root and + ending-cells identification, and the development routes finding. 
The overall analysis + aims at providing a mathematical basis for more advanced design and development + of RNA velocity type methods in the future. +}, +issn = {2708-0579}, +doi = {10.4208/csiam-am.SO-2020-0001}, +url = {http://global-sci.org/intro/article_detail/csiam-am/18653.html} +} + +@ARTICLE{Petratou2021-hj, + title = "The {MITF} paralog tfec is required in neural crest development + for fate specification of the iridophore lineage from a + multipotent pigment cell progenitor", + author = "Petratou, Kleio and Spencer, Samantha A and Kelsh, Robert N and + Lister, James A", + abstract = "Understanding how fate specification of distinct cell-types from + multipotent progenitors occurs is a fundamental question in + embryology. Neural crest stem cells (NCSCs) generate + extraordinarily diverse derivatives, including multiple neural, + skeletogenic and pigment cell fates. Key transcription factors + and extracellular signals specifying NCSC lineages remain to be + identified, and we have only a little idea of how and when they + function together to control fate. Zebrafish have three neural + crest-derived pigment cell types, black melanocytes, + light-reflecting iridophores and yellow xanthophores, which offer + a powerful model for studying the molecular and cellular + mechanisms of fate segregation. Mitfa has been identified as the + master regulator of melanocyte fate. Here, we show that an + Mitf-related transcription factor, Tfec, functions as master + regulator of the iridophore fate. Surprisingly, our phenotypic + analysis of tfec mutants demonstrates that Tfec also functions in + the initial specification of all three pigment cell-types, + although the melanocyte and xanthophore lineages recover later. + We show that Mitfa represses tfec expression, revealing a likely + mechanism contributing to the decision between melanocyte and + iridophore fate.
Our data are consistent with the long-standing + proposal of a tripotent progenitor restricted to pigment cell + fates. Moreover, we investigate activation, maintenance and + function of tfec in multipotent NCSCs, demonstrating for the + first time its role in the gene regulatory network forming and + maintaining early neural crest cells. In summary, we build on our + previous work to characterise the gene regulatory network + governing iridophore development, establishing Tfec as the master + regulator driving iridophore specification from multipotent + progenitors, while shedding light on possible cellular mechanisms + of progressive fate restriction.", + journal = "PLoS One", + volume = 16, + number = 1, + pages = "e0244794", + month = jan, + year = 2021, + language = "en" +} + +@ARTICLE{Kester2018-dv, + title = "{Single-Cell} Transcriptomics Meets Lineage Tracing", + author = "Kester, Lennart and van Oudenaarden, Alexander", + abstract = "Reconstructing lineage relationships between cells within a + tissue or organism is a long-standing aim in biology. + Traditionally, lineage tracing has been achieved through the + (genetic) labeling of a cell followed by the tracking of its + offspring. Currently, lineage trajectories can also be predicted + using single-cell transcriptomics. Although single-cell + transcriptomics provides detailed phenotypic information, the + predicted lineage trajectories do not necessarily reflect genetic + relationships. Recently, techniques have been developed that + unite these strategies. 
In this Review, we discuss + transcriptome-based lineage trajectory prediction algorithms, + single-cell genetic lineage tracing, and the promising + combination of these techniques for stem cell and cancer + research.", + journal = "Cell Stem Cell", + volume = 23, + number = 2, + pages = "166--179", + month = aug, + year = 2018, + keywords = "lineage trajectory reconstruction; single-cell lineage tracing; + single-cell mRNA sequencing", + language = "en" +} + +@BOOK{Waddington1957-ct, + title = "The Strategy of the Genes, a Discussion of Some Aspects of + Theoretical Biology, by {C.H}. Waddington, ... With an Appendix + [Some Physico-chemical Aspects of Biological Organisation] by H. + Kacser,", + author = "Waddington, Conrad Hall", + publisher = "G. Allen and Unwin", + year = 1957, + language = "en" +} + +@ARTICLE{Cao2020-ng, + title = "A human cell atlas of fetal gene expression", + author = "Cao, Junyue and O'Day, Diana R and Pliner, Hannah A and Kingsley, + Paul D and Deng, Mei and Daza, Riza M and Zager, Michael A and + Aldinger, Kimberly A and Blecher-Gonen, Ronnie and Zhang, Fan and + Spielmann, Malte and Palis, James and Doherty, Dan and Steemers, + Frank J and Glass, Ian A and Trapnell, Cole and Shendure, Jay", + abstract = "The gene expression program underlying the specification of human + cell types is of fundamental interest. We generated human cell + atlases of gene expression and chromatin accessibility in fetal + tissues. For gene expression, we applied three-level + combinatorial indexing to >110 samples representing 15 organs, + ultimately profiling ~4 million single cells. We leveraged the + literature and other atlases to identify and annotate hundreds of + cell types and subtypes, both within and across tissues. 
Our + analyses focused on organ-specific specializations of broadly + distributed cell types (such as blood, endothelial, and + epithelial), sites of fetal erythropoiesis (which notably + included the adrenal gland), and integration with mouse + developmental atlases (such as conserved specification of blood + cells). These data represent a rich resource for the exploration + of in vivo human gene expression in diverse tissues and cell + types.", + journal = "Science", + volume = 370, + number = 6518, + month = nov, + year = 2020, + language = "en" +} + +@ARTICLE{Moffitt2018-fd, + title = "Molecular, spatial, and functional single-cell profiling of the + hypothalamic preoptic region", + author = "Moffitt, Jeffrey R and Bambah-Mukku, Dhananjay and Eichhorn, + Stephen W and Vaughn, Eric and Shekhar, Karthik and Perez, Julio + D and Rubinstein, Nimrod D and Hao, Junjie and Regev, Aviv and + Dulac, Catherine and Zhuang, Xiaowei", + abstract = "The hypothalamus controls essential social behaviors and + homeostatic functions. However, the cellular architecture of + hypothalamic nuclei-including the molecular identity, spatial + organization, and function of distinct cell types-is poorly + understood. Here, we developed an imaging-based in situ cell-type + identification and mapping method and combined it with + single-cell RNA-sequencing to create a molecularly annotated and + spatially resolved cell atlas of the mouse hypothalamic preoptic + region. We profiled ~1 million cells, identified ~70 neuronal + populations characterized by distinct neuromodulatory signatures + and spatial organizations, and defined specific neuronal + populations activated during social behaviors in male and female + mice, providing a high-resolution framework for mechanistic + investigation of behavior circuits. 
The approach described opens + a new avenue for the construction of cell atlases in diverse + tissues and organisms.", + journal = "Science", + volume = 362, + number = 6416, + month = nov, + year = 2018, + language = "en" +} + +@ARTICLE{Sundqvist2018-xi, + title = "{JUNB} governs a feed-forward network of {TGF$\beta$} signaling + that aggravates breast cancer invasion", + author = "Sundqvist, Anders and Morikawa, Masato and Ren, Jiang and + Vasilaki, Eleftheria and Kawasaki, Natsumi and Kobayashi, Mai and + Koinuma, Daizo and Aburatani, Hiroyuki and Miyazono, Kohei and + Heldin, Carl-Henrik and van Dam, Hans and Ten Dijke, Peter", + abstract = "It is well established that transforming growth factor-$\beta$ + (TGF$\beta$) switches its function from being a tumor suppressor + to a tumor promoter during the course of tumorigenesis, which + involves both cell-intrinsic and environment-mediated mechanisms. + We are interested in breast cancer cells, in which SMAD mutations + are rare and interactions between SMAD and other transcription + factors define pro-oncogenic events. Here, we have performed + chromatin immunoprecipitation (ChIP)-sequencing analyses which + indicate that the genome-wide landscape of SMAD2/3 binding is + altered after prolonged TGF$\beta$ stimulation. De novo motif + analyses of the SMAD2/3 binding regions predict enrichment of + binding motifs for activator protein (AP)1 in addition to SMAD + motifs. TGF$\beta$-induced expression of the AP1 component JUNB + was required for expression of many late invasion-mediating + genes, creating a feed-forward regulatory network. Moreover, we + found that several components in the WNT pathway were enriched + among the late TGF$\beta$-target genes, including the + invasion-inducing WNT7 proteins. Consistently, overexpression of + WNT7A or WNT7B enhanced and potentiated TGF$\beta$-induced breast + cancer cell invasion, while inhibition of the WNT pathway reduced + this process. 
Our study thereby helps to explain how accumulation + of pro-oncogenic stimuli switches and stabilizes + TGF$\beta$-induced cellular phenotypes of epithelial cells.", + journal = "Nucleic Acids Res.", + volume = 46, + number = 3, + pages = "1180--1195", + month = feb, + year = 2018, + language = "en" +} + +@ARTICLE{Huang2012-ft, + title = "The molecular and mathematical basis of Waddington's epigenetic + landscape: a framework for post-Darwinian biology?", + author = "Huang, Sui", + abstract = "The Neo-Darwinian concept of natural selection is plausible when + one assumes a straightforward causation of phenotype by + genotype. However, such simple 1:1 mapping must now give place + to the modern concepts of gene regulatory networks and gene + expression noise. Both can, in the absence of genetic mutations, + jointly generate a diversity of inheritable randomly occupied + phenotypic states that could also serve as a substrate for + natural selection. This form of epigenetic dynamics challenges + Neo-Darwinism. It needs to incorporate the non-linear, + stochastic dynamics of gene networks. A first step is to + consider the mathematical correspondence between gene regulatory + networks and Waddington's metaphoric 'epigenetic landscape', + which actually represents the quasi-potential function of global + network dynamics. It explains the coexistence of multiple stable + phenotypes within one genotype. 
The landscape's topography with + its attractors is shaped by evolution through mutational + re-wiring of regulatory interactions - offering a link between + genetic mutation and sudden, broad evolutionary changes.", + journal = "Bioessays", + publisher = "Wiley", + volume = 34, + number = 2, + pages = "149--157", + month = feb, + year = 2012, + language = "en" +} + +@ARTICLE{Golding2005-ia, + title = "Real-time kinetics of gene activity in individual bacteria", + author = "Golding, Ido and Paulsson, Johan and Zawilski, Scott M and Cox, + Edward C", + abstract = "Protein levels have been shown to vary substantially between + individual cells in clonal populations. In prokaryotes, the + contribution to such fluctuations from the inherent randomness of + gene expression has largely been attributed to having just a few + transcripts of the corresponding mRNAs. By contrast, eukaryotic + studies tend to emphasize chromatin remodeling and burst-like + transcription. Here, we study single-cell transcription in + Escherichia coli by measuring mRNA levels in individual living + cells. The results directly demonstrate transcriptional bursting, + similar to that indirectly inferred for eukaryotes. We also + measure mRNA partitioning at cell division and correlate mRNA and + protein levels in single cells. Partitioning is approximately + binomial, and mRNA-protein correlations are weaker earlier in the + cell cycle, where cell division has recently randomized the + relative concentrations. Our methods further extend protein-based + approaches by counting the integer-valued number of transcript + with single-molecule resolution. 
This greatly facilitates kinetic + interpretations in terms of the integer-valued random processes + that produce the fluctuations.", + journal = "Cell", + volume = 123, + number = 6, + pages = "1025--1036", + month = dec, + year = 2005, + language = "en" +} + +@ARTICLE{Bradley2012-la, + title = "Regulation of embryonic stem cell pluripotency by heat shock + protein 90", + author = "Bradley, Eric and Bieberich, Erhard and Mivechi, Nahid F and + Tangpisuthipongsa, Dantera and Wang, Guanghu", + abstract = "Deciphering the molecular basis of stem cell pluripotency is + fundamental to the understanding of stem cell biology, early + embryonic development, and to the clinical application of + regenerative medicine. We report here that the molecular + chaperone heat shock protein 90 (Hsp90) is essential for mouse + embryonic stem cell (ESC) pluripotency through regulating + multiple pluripotency factors, including Oct4, Nanog, and signal + transducer and activator of transcription 3. Inhibition of Hsp90 + by either 17-N-Allylamino-17-demethoxygeldanamycin or miRNA led + to ESC differentiation. Overexpression of Hsp90$\beta$ partially + rescued the phenotype; in particular, the levels of Oct4 and + Nanog were restored. Notably, Hsp90 associated with Oct4 and + Nanog in the same cellular complex and protected them from + degradation by the ubiquitin proteasome pathway, suggesting that + Oct4 and Nanog are potential novel Hsp90 client proteins. In + addition, Hsp90 inhibition reduced the mRNA level of Oct4, but + not that of Nanog, indicating that Hsp90 participates in Oct4 + mRNA processing or maturation. Hsp90 inhibition also increased + expression of some protein markers for mesodermal lineages, + implying that Hsp90 suppresses mesodermal differentiation from + ESCs. 
These findings support a new role for Hsp90 in maintaining + ESC pluripotency by sustaining the level of multiple pluripotency + factors, particularly Oct4 and Nanog.", + journal = "Stem Cells", + volume = 30, + number = 8, + pages = "1624--1633", + month = aug, + year = 2012, + language = "en" +} + +% The entry below contains non-ASCII chars that could not be converted +% to a LaTeX equivalent. +@ARTICLE{Adamson2016-qj, + title = "A Multiplexed {Single-Cell} {CRISPR} Screening Platform Enables + Systematic Dissection of the Unfolded Protein Response", + author = "Adamson, Britt and Norman, Thomas M and Jost, Marco and Cho, Min + Y and Nu{\~n}ez, James K and Chen, Yuwen and Villalta, Jacqueline + E and Gilbert, Luke A and Horlbeck, Max A and Hein, Marco Y and + Pak, Ryan A and Gray, Andrew N and Gross, Carol A and Dixit, + Atray and Parnas, Oren and Regev, Aviv and Weissman, Jonathan S", + abstract = "Functional genomics efforts face tradeoffs between number of + perturbations examined and complexity of phenotypes measured. We + bridge this gap with Perturb-seq, which combines droplet-based + single-cell RNA-seq with a strategy for barcoding CRISPR-mediated + perturbations, allowing many perturbations to be profiled in + pooled format. We applied Perturb-seq to dissect the mammalian + unfolded protein response (UPR) using single and combinatorial + CRISPR perturbations. Two genome-scale CRISPR interference + (CRISPRi) screens identified genes whose repression perturbs ER + homeostasis. Subjecting ∼100 hits to Perturb-seq enabled + high-precision functional clustering of genes. Single-cell + analyses decoupled the three UPR branches, revealed bifurcated + UPR branch activation among cells subject to the same + perturbation, and uncovered differential activation of the + branches across hits, including an isolated feedback loop between + the translocon and IRE1$\alpha$. 
These studies provide insight + into how the three sensors of ER homeostasis monitor distinct + types of stress and highlight the ability of Perturb-seq to + dissect complex cellular responses.", + journal = "Cell", + volume = 167, + number = 7, + pages = "1867--1882.e21", + month = dec, + year = 2016, + keywords = "CRISPRi; CRISPR; Single-cell RNA-seq; cell-to-cell heterogeneity; + genome-scale screening; single-cell genomics; unfolded protein + response", + language = "en" +} + +@ARTICLE{Dixit2016-br, + title = "{Perturb-Seq}: Dissecting Molecular Circuits with Scalable + {Single-Cell} {RNA} Profiling of Pooled Genetic Screens", + author = "Dixit, Atray and Parnas, Oren and Li, Biyu and Chen, Jenny and + Fulco, Charles P and Jerby-Arnon, Livnat and Marjanovic, Nemanja + D and Dionne, Danielle and Burks, Tyler and Raychowdhury, Raktima + and Adamson, Britt and Norman, Thomas M and Lander, Eric S and + Weissman, Jonathan S and Friedman, Nir and Regev, Aviv", + abstract = "Genetic screens help infer gene function in mammalian cells, but + it has remained difficult to assay complex phenotypes---such as + transcriptional profiles---at scale. Here, we develop Perturb-seq, + combining single-cell RNA sequencing (RNA-seq) and clustered + regularly interspaced short palindromic repeats (CRISPR)-based + perturbations to perform many such assays in a pool. We + demonstrate Perturb-seq by analyzing 200,000 cells in immune + cells and cell lines, focusing on transcription factors + regulating the response of dendritic cells to lipopolysaccharide + (LPS). Perturb-seq accurately identifies individual gene targets, + gene signatures, and cell states affected by individual + perturbations and their genetic interactions. We posit new + functions for regulators of differentiation, the anti-viral + response, and mitochondrial function during immune activation. 
By + decomposing many high content measurements into the effects of + perturbations, their interactions, and diverse cell metadata, + Perturb-seq dramatically increases the scope of pooled genomic + assays.", + journal = "Cell", + volume = 167, + number = 7, + pages = "1853--1866.e17", + month = dec, + year = 2016, + keywords = "CRISPR; epistasis; genetic interactions; pooled screen; + single-cell RNA-seq", + language = "en" +} + +@ARTICLE{Wang2020-zb, + title = "Live-cell imaging and analysis reveal cell phenotypic transition + dynamics inherently missing in snapshot data", + author = "Wang, Weikang and Douglas, Diana and Zhang, Jingyu and Kumari, + Sangeeta and Enuameh, Metewo Selase and Dai, Yan and Wallace, + Callen T and Watkins, Simon C and Shu, Weiguo and Xing, Jianhua", + abstract = "Recent advances in single-cell techniques catalyze an emerging + field of studying how cells convert from one phenotype to + another, in a step-by-step process. Two grand technical + challenges, however, impede further development of the field. + Fixed cell-based approaches can provide snapshots of + high-dimensional expression profiles but have fundamental limits + on revealing temporal information, and fluorescence-based + live-cell imaging approaches provide temporal information but are + technically challenging for multiplex long-term imaging. We first + developed a live-cell imaging platform that tracks cellular + status change through combining endogenous fluorescent labeling + that minimizes perturbation to cell physiology and/or live-cell + imaging of high-dimensional cell morphological and texture + features. With our platform and an A549 VIM-RFP + epithelial-to-mesenchymal transition (EMT) reporter cell line, + live-cell trajectories reveal parallel paths of EMT missing from + snapshot data due to cell-cell dynamic heterogeneity. 
Our results + emphasize the necessity of extracting dynamical information of + phenotypic transitions from multiplex live-cell imaging.", + journal = "Sci Adv", + volume = 6, + number = 36, + month = sep, + year = 2020, + language = "en" +} + +@ARTICLE{Qiu2020-uf, + title = "Massively parallel and time-resolved {RNA} sequencing in single + cells with {scNT-seq}", + author = "Qiu, Qi and Hu, Peng and Qiu, Xiaojie and Govek, Kiya W and + C{\'a}mara, Pablo G and Wu, Hao", + abstract = "Single-cell RNA sequencing offers snapshots of whole + transcriptomes but obscures the temporal RNA dynamics. Here we + present single-cell metabolically labeled new RNA tagging + sequencing (scNT-seq), a method for massively parallel analysis + of newly transcribed and pre-existing mRNAs from the same cell. + This droplet microfluidics-based method enables high-throughput + chemical conversion on barcoded beads, efficiently marking newly + transcribed mRNAs with T-to-C substitutions. Using scNT-seq, we + jointly profiled new and old transcriptomes in ~55,000 single + cells. These data revealed time-resolved transcription factor + activities and cell-state trajectories at the single-cell level + in response to neuronal activation. We further determined rates + of RNA biogenesis and decay to uncover RNA regulatory strategies + during stepwise conversion between pluripotent and rare + totipotent two-cell embryo (2C)-like stem cell states. Finally, + integrating scNT-seq with genetic perturbation identifies DNA + methylcytosine dioxygenase as an epigenetic barrier into the + 2C-like cell state. Time-resolved single-cell transcriptomic + analysis thus opens new lines of inquiry regarding + cell-type-specific RNA regulatory mechanisms.", + journal = "Nat. 
Methods", + volume = 17, + number = 10, + pages = "991--1001", + month = oct, + year = 2020, + language = "en" +} + +@ARTICLE{Bergen2020-kx, + title = "Generalizing {RNA} velocity to transient cell states through + dynamical modeling", + author = "Bergen, Volker and Lange, Marius and Peidli, Stefan and Wolf, F + Alexander and Theis, Fabian J", + abstract = "RNA velocity has opened up new ways of studying cellular + differentiation in single-cell RNA-sequencing data. It describes + the rate of gene expression change for an individual gene at a + given time point based on the ratio of its spliced and unspliced + messenger RNA (mRNA). However, errors in velocity estimates arise + if the central assumptions of a common splicing rate and the + observation of the full splicing dynamics with steady-state mRNA + levels are violated. Here we present scVelo, a method that + overcomes these limitations by solving the full transcriptional + dynamics of splicing kinetics using a likelihood-based dynamical + model. This generalizes RNA velocity to systems with transient + cell states, which are common in development and in response to + perturbations. We apply scVelo to disentangling subpopulation + kinetics in neurogenesis and pancreatic endocrinogenesis. We + infer gene-specific rates of transcription, splicing and + degradation, recover each cell's position in the underlying + differentiation processes and detect putative driver genes. + scVelo will facilitate the study of lineage decisions and gene + regulation.", + journal = "Nat. 
Biotechnol.", + volume = 38, + number = 12, + pages = "1408--1414", + month = dec, + year = 2020, + language = "en" +} + +@BOOK{Seydel1988-ub, + title = "From equilibrium to chaos: practical bifurcation and stability + analysis", + author = "Seydel, R{\"u}diger", + publisher = "North-Holland", + year = 1988 +} + +@ARTICLE{Alemany2018-se, + title = "Whole-organism clone tracing using single-cell sequencing", + author = "Alemany, Anna and Florescu, Maria and Baron, Chlo{\'e} S and + Peterson-Maduro, Josi and van Oudenaarden, Alexander", + abstract = "Embryonic development is a crucial period in the life of a + multicellular organism, during which limited sets of embryonic + progenitors produce all cells in the adult body. Determining + which fate these progenitors acquire in adult tissues requires + the simultaneous measurement of clonal history and cell identity + at single-cell resolution, which has been a major challenge. + Clonal history has traditionally been investigated by + microscopically tracking cells during development, monitoring the + heritable expression of genetically encoded fluorescent proteins + and, more recently, using next-generation sequencing technologies + that exploit somatic mutations, microsatellite instability, + transposon tagging, viral barcoding, CRISPR-Cas9 genome editing + and Cre-loxP recombination. Single-cell transcriptomics provides + a powerful platform for unbiased cell-type classification. Here + we present ScarTrace, a single-cell sequencing strategy that + enables the simultaneous quantification of clonal history and + cell type for thousands of cells obtained from different organs + of the adult zebrafish. Using ScarTrace, we show that a small set + of multipotent embryonic progenitors generate all haematopoietic + cells in the kidney marrow, and that many progenitors produce + specific cell types in the eyes and brain. In addition, we study + when embryonic progenitors commit to the left or right eye. 
+ ScarTrace reveals that epidermal and mesenchymal cells in the + caudal fin arise from the same progenitors, and that + osteoblast-restricted precursors can produce mesenchymal cells + during regeneration. Furthermore, we identify resident immune + cells in the fin with a distinct clonal origin from other blood + cell types. We envision that similar approaches will have major + applications in other experimental systems, in which the matching + of embryonic clonal origin to adult cell type will ultimately + allow reconstruction of how the adult body is built from a single + cell.", + journal = "Nature", + volume = 556, + number = 7699, + pages = "108--112", + month = apr, + year = 2018, + language = "en" +} + +@ARTICLE{Buenrostro2015-oz, + title = "Single-cell chromatin accessibility reveals principles of + regulatory variation", + author = "Buenrostro, Jason D and Wu, Beijing and Litzenburger, Ulrike M + and Ruff, Dave and Gonzales, Michael L and Snyder, Michael P and + Chang, Howard Y and Greenleaf, William J", + abstract = "Cell-to-cell variation is a universal feature of life that + affects a wide range of biological phenomena, from developmental + plasticity to tumour heterogeneity. Although recent advances have + improved our ability to document cellular phenotypic variation, + the fundamental mechanisms that generate variability from + identical DNA sequences remain elusive. Here we reveal the + landscape and principles of mammalian DNA regulatory variation by + developing a robust method for mapping the accessible genome of + individual cells by assay for transposase-accessible chromatin + using sequencing (ATAC-seq) integrated into a programmable + microfluidics platform. Single-cell ATAC-seq (scATAC-seq) maps + from hundreds of single cells in aggregate closely resemble + accessibility profiles from tens of millions of cells and provide + insights into cell-to-cell variation. 
Accessibility variance is + systematically associated with specific trans-factors and + cis-elements, and we discover combinations of trans-factors + associated with either induction or suppression of cell-to-cell + variability. We further identify sets of trans-factors associated + with cell-type-specific accessibility variance across eight cell + types. Targeted perturbations of cell cycle or transcription + factor signalling evoke stimulus-specific changes in this + observed variability. The pattern of accessibility variation in + cis across the genome recapitulates chromosome compartments de + novo, linking single-cell accessibility variation to + three-dimensional genome organization. Single-cell analysis of + DNA accessibility provides new insight into cellular variation of + the 'regulome'.", + journal = "Nature", + volume = 523, + number = 7561, + pages = "486--490", + month = jul, + year = 2015, + language = "en" +} + +@ARTICLE{Grun2014-nb, + title = "Validation of noise models for single-cell transcriptomics", + author = "Gr{\"u}n, Dominic and Kester, Lennart and van Oudenaarden, + Alexander", + abstract = "Single-cell transcriptomics has recently emerged as a powerful + technology to explore gene expression heterogeneity among single + cells. Here we identify two major sources of technical + variability: sampling noise and global cell-to-cell variation in + sequencing efficiency. We propose noise models to correct for + this, which we validate using single-molecule FISH. We + demonstrate that gene expression variability in mouse embryonic + stem cells depends on the culture condition.", + journal = "Nat. 
Methods", + volume = 11, + number = 6, + pages = "637--640", + month = jun, + year = 2014, + language = "en" +} + +@ARTICLE{Cahan2014-qm, + title = "{CellNet}: network biology applied to stem cell engineering", + author = "Cahan, Patrick and Li, Hu and Morris, Samantha A and Lummertz da + Rocha, Edroaldo and Daley, George Q and Collins, James J", + abstract = "Somatic cell reprogramming, directed differentiation of + pluripotent stem cells, and direct conversions between + differentiated cell lineages represent powerful approaches to + engineer cells for research and regenerative medicine. We have + developed CellNet, a network biology platform that more + accurately assesses the fidelity of cellular engineering than + existing methodologies and generates hypotheses for improving + cell derivations. Analyzing expression data from 56 published + reports, we found that cells derived via directed differentiation + more closely resemble their in vivo counterparts than products of + direct conversion, as reflected by the establishment of target + cell-type gene regulatory networks (GRNs). Furthermore, we + discovered that directly converted cells fail to adequately + silence expression programs of the starting population and that + the establishment of unintended GRNs is common to virtually every + cellular engineering paradigm. CellNet provides a platform for + quantifying how closely engineered cell populations resemble + their target cell type and a rational strategy to guide enhanced + cellular engineering.", + journal = "Cell", + volume = 158, + number = 4, + pages = "903--915", + month = aug, + year = 2014, + language = "en" +} + +@ARTICLE{Sheth2018-mz, + title = "{DNA-based} memory devices for recording cellular events", + author = "Sheth, Ravi U and Wang, Harris H", + abstract = "Measuring biological data across time and space is critical for + understanding complex biological processes and for various + biosurveillance applications. 
However, such data are often + inaccessible or difficult to directly obtain. Less invasive, more + robust and higher-throughput biological recording tools are + needed to profile cells and their environments. DNA-based + cellular recording is an emerging and powerful framework for + tracking intracellular and extracellular biological events over + time across living cells and populations. Here, we review and + assess DNA recorders that utilize CRISPR nucleases, integrases + and base-editing strategies, as well as recombinase and + polymerase-based methods. Quantitative characterization, + modelling and evaluation of these DNA-recording modalities can + guide their design and implementation for specific application + areas.", + journal = "Nat. Rev. Genet.", + volume = 19, + number = 11, + pages = "718--732", + month = nov, + year = 2018, + language = "en" +} + +@ARTICLE{Fisher2007-gg, + title = "Requirement for {ErbB2/ErbB} signaling in developing cartilage + and bone", + author = "Fisher, Melanie C and Clinton, Gail M and Maihle, Nita J and + Dealy, Caroline N", + abstract = "During endochondral ossification, the skeletal elements of + vertebrate limbs form and elongate via coordinated control of + chondrocyte and osteoblast differentiation and proliferation. The + role of signaling by the ErbB family of receptor tyrosine + kinases, which consists of ErbB1 (epidermal growth factor + receptor or EGFR), ErbB2, ErbB3 and ErbB4, has been little + studied during cartilage and bone development. Signaling by the + ErbB network generates a diverse array of cellular responses via + formation of ErbB dimers activated by distinct ligands that + produce distinct signal outputs. Herstatin is a soluble ErbB2 + receptor that acts in a dominant negative fashion to inhibit ErbB + signaling by binding to endogenous ErbB receptors, preventing + functional dimer formation. 
Here, we examine the effects of + Herstatin on limb skeletal element development in transgenic + mice, achieved via Prx1 promoter-driven expression in limb + cartilage and bone. The limb skeletal elements of Prx1-Herstatin + embryos are shortened, and chondrocyte maturation and osteoblast + differentiation are delayed. In addition, proliferation by + chondrocytes and periosteal cells of Prx1-Herstatin limb skeletal + elements is markedly reduced. Our study identifies requirements + for ErbB signaling in the maintenance of chondrocyte and + osteoblast proliferation involved in the timely progression of + chondrocyte maturation and periosteal osteoblast differentiation.", + journal = "Dev. Growth Differ.", + volume = 49, + number = 6, + pages = "503--513", + month = aug, + year = 2007, + language = "en" +} + +@ARTICLE{Hendriks2019-ap, + title = "{NASC-seq} monitors {RNA} synthesis in single cells", + author = "Hendriks, Gert-Jan and Jung, Lisa A and Larsson, Anton J M and + Lidschreiber, Michael and Andersson Forsman, Oscar and + Lidschreiber, Katja and Cramer, Patrick and Sandberg, Rickard", + abstract = "Sequencing of newly synthesised RNA can monitor transcriptional + dynamics with great sensitivity and high temporal resolution, but + is currently restricted to populations of cells. Here, we develop + new transcriptome alkylation-dependent single-cell RNA sequencing + (NASC-seq), to monitor newly synthesised and pre-existing RNA + simultaneously in single cells. We validate the method on + pre-labelled RNA, and by demonstrating that more newly + synthesised RNA was detected for genes with known high mRNA + turnover. Monitoring RNA synthesis during Jurkat T-cell + activation with NASC-seq reveals both rapidly up- and + down-regulated genes, and that induced genes are almost + exclusively detected as newly transcribed. 
Moreover, the newly + synthesised and pre-existing transcriptomes after T-cell + activation are distinct, confirming that NASC-seq simultaneously + measures gene expression corresponding to two time points in + single cells. Altogether, NASC-seq enables precise temporal + monitoring of RNA synthesis at single-cell resolution during + homoeostasis, perturbation responses and cellular + differentiation.", + journal = "Nat. Commun.", + volume = 10, + number = 1, + pages = "3138", + month = jul, + year = 2019, + language = "en" +} + +@ARTICLE{Macosko2015-mn, + title = "Highly Parallel Genome-wide Expression Profiling of Individual + Cells Using Nanoliter Droplets", + author = "Macosko, Evan Z and Basu, Anindita and Satija, Rahul and Nemesh, + James and Shekhar, Karthik and Goldman, Melissa and Tirosh, Itay + and Bialas, Allison R and Kamitaki, Nolan and Martersteck, Emily + M and Trombetta, John J and Weitz, David A and Sanes, Joshua R + and Shalek, Alex K and Regev, Aviv and McCarroll, Steven A", + abstract = "Cells, the basic units of biological structure and function, vary + broadly in type and state. Single-cell genomics can characterize + cell identity and function, but limitations of ease and scale + have prevented its broad application. Here we describe Drop-seq, + a strategy for quickly profiling thousands of individual cells by + separating them into nanoliter-sized aqueous droplets, + associating a different barcode with each cell's RNAs, and + sequencing them all together. Drop-seq analyzes mRNA transcripts + from thousands of individual cells simultaneously while + remembering transcripts' cell of origin. We analyzed + transcriptomes from 44,808 mouse retinal cells and identified 39 + transcriptionally distinct cell populations, creating a molecular + atlas of gene expression for known retinal cell classes and novel + candidate cell subtypes. 
Drop-seq will accelerate biological + discovery by enabling routine transcriptional profiling at + single-cell resolution. VIDEO ABSTRACT.", + journal = "Cell", + volume = 161, + number = 5, + pages = "1202--1214", + month = may, + year = 2015, + language = "en" +} + +@ARTICLE{Ao2009-li, + title = "Global view of bionetwork dynamics: adaptive landscape", + author = "Ao, Ping", + abstract = "Based on recent work, I will give a nontechnical brief review of + a powerful quantitative concept in biology, adaptive landscape, + initially proposed by S. Wright over 70 years ago, reintroduced + by one of the founders of molecular biology and by others in + different biological contexts, but apparently forgotten by modern + biologists for many years. Nevertheless, this concept finds an + increasingly important role in the development of systems biology + and bionetwork dynamics modeling, from phage lambda genetic + switch to endogenous network for cancer genesis and progression. + It is an ideal quantification to describe the robustness and + stability of bionetworks. Here, I will first introduce five + landmark proposals in biology on this concept, to demonstrate an + important common thread in theoretical biology. Then I will + discuss a few recent results, focusing on the studies showing + theoretical consistency of adaptive landscape. From the + perspective of a working scientist and of what is needed + logically for a dynamical theory when confronting empirical data, + the adaptive landscape is useful both metaphorically and + quantitatively, and has captured an essential aspect of + biological dynamical processes. Though at the theoretical level + the adaptive landscape must exist and it can be used across + hierarchical boundaries in biology, many associated issues are + indeed vague in their initial formulations and their quantitative + realizations are not easy, and are good research topics for + quantitative biologists. 
I will discuss three types of open + problems associated with the adaptive landscape in a broader + perspective.", + journal = "J. Genet. Genomics", + volume = 36, + number = 2, + pages = "63--73", + month = feb, + year = 2009, + language = "en" +} + +@ARTICLE{Arda2013-pa, + title = "Gene regulatory networks governing pancreas development", + author = "Arda, H Efsun and Benitez, Cecil M and Kim, Seung K", + abstract = "Elucidation of cellular and gene regulatory networks (GRNs) + governing organ development will accelerate progress toward + tissue replacement. Here, we have compiled reference GRNs + underlying pancreas development from data mining that integrates + multiple approaches, including mutant analysis, lineage tracing, + cell purification, gene expression and enhancer analysis, and + biochemical studies of gene regulation. Using established + computational tools, we integrated and represented these networks + in frameworks that should enhance understanding of the surging + output of genomic-scale genetic and epigenetic studies of + pancreas development and diseases such as diabetes and pancreatic + cancer. We envision similar approaches would be useful for + understanding the development of other organs.", + journal = "Dev. 
Cell", + volume = 25, + number = 1, + pages = "5--13", + month = apr, + year = 2013, + language = "en" +} + +@ARTICLE{La_Manno2018-vp, + title = "{RNA} velocity of single cells", + author = "La Manno, Gioele and Soldatov, Ruslan and Zeisel, Amit and Braun, + Emelie and Hochgerner, Hannah and Petukhov, Viktor and + Lidschreiber, Katja and Kastriti, Maria E and L{\"o}nnerberg, + Peter and Furlan, Alessandro and Fan, Jean and Borm, Lars E and + Liu, Zehua and van Bruggen, David and Guo, Jimin and He, Xiaoling + and Barker, Roger and Sundstr{\"o}m, Erik and Castelo-Branco, + Gon{\c c}alo and Cramer, Patrick and Adameyko, Igor and + Linnarsson, Sten and Kharchenko, Peter V", + abstract = "RNA abundance is a powerful indicator of the state of individual + cells. Single-cell RNA sequencing can reveal RNA abundance with + high quantitative accuracy, sensitivity and throughput. However, + this approach captures only a static snapshot at a point in time, + posing a challenge for the analysis of time-resolved phenomena + such as embryogenesis or tissue regeneration. Here we show that + RNA velocity---the time derivative of the gene expression state---can + be directly estimated by distinguishing between unspliced and + spliced mRNAs in common single-cell RNA sequencing protocols. RNA + velocity is a high-dimensional vector that predicts the future + state of individual cells on a timescale of hours. We validate + its accuracy in the neural crest lineage, demonstrate its use on + multiple published datasets and technical platforms, reveal the + branching lineage tree of the developing mouse hippocampus, and + examine the kinetics of transcription in human embryonic brain. 
+ We expect RNA velocity to greatly aid the analysis of + developmental lineages and cellular dynamics, particularly in + humans.", + journal = "Nature", + volume = 560, + number = 7719, + pages = "494--498", + month = aug, + year = 2018, + language = "en" +} + +@ARTICLE{Clevers2017-tl, + title = "Lgr5 Stem Cell-based organoids in human disease", + author = "Clevers, Hans", + abstract = "The intestinal epithelium is the most rapidly self-renewing + tissue in adult mammals. We originally defined Lgr5 as a Wnt + target gene, transcribed in colon cancer cells. Two knock-in + alleles revealed exclusive expression of Lgr5 in cycling, + columnar cells at the crypt base. Using lineage tracing + experiments in adult mice, we found that these Lgr5+ve crypt + base columnar cells (CBC) generated all epithelial lineages + throughout life, implying that they represent the stem cell of + the small intestine and colon. Lgr5 was subsequently found to + represent an exquisitely specific and almost 'generic' marker + for stem cells, including in hair follicles, kidney, liver, + mammary gland, inner ear, tongue and stomach epithelium. Single + sorted Lgr5+ve stem cells can initiate ever-expanding + crypt-villus organoids, or so called 'mini-guts' in 3D culture. + The technology is based on the observation that Lgr5 is the + receptor for a potent stem cell growth factor, R-spondin. + Similar 3D cultures systems have been developed for the Lgr5+ve + stem cells of human stomach, liver, pancreas, prostate and + kidney. 
Using CRISPR/Cas9 technology, genes can be efficiently + modified in organoids of various origins.", + journal = "The FASEB Journal", + publisher = "John Wiley \& Sons, Ltd", + volume = 31, + number = "S1", + pages = "85.1--85.1", + month = apr, + year = 2017 +} + +@ARTICLE{Elowitz2000-tc, + title = "A synthetic oscillatory network of transcriptional regulators", + author = "Elowitz, M B and Leibler, S", + abstract = "Networks of interacting biomolecules carry out many essential + functions in living cells, but the 'design principles' underlying + the functioning of such intracellular networks remain poorly + understood, despite intensive efforts including quantitative + analysis of relatively simple systems. Here we present a + complementary approach to this problem: the design and + construction of a synthetic network to implement a particular + function. We used three transcriptional repressor systems that + are not part of any natural biological clock to build an + oscillating network, termed the repressilator, in Escherichia + coli. The network periodically induces the synthesis of green + fluorescent protein as a readout of its state in individual + cells. The resulting oscillations, with typical periods of hours, + are slower than the cell-division cycle, so the state of the + oscillator has to be transmitted from generation to generation. + This artificial clock displays noisy behaviour, possibly because + of stochastic fluctuations of its components. 
Such 'rational + network design' may lead both to the engineering of new cellular + behaviours and to an improved understanding of naturally + occurring networks.", + journal = "Nature", + volume = 403, + number = 6767, + pages = "335--338", + month = jan, + year = 2000, + language = "en" +} + +@BOOK{Brauer2015-po, + title = "Dynamical Systems for Biological Modeling: An Introduction", + author = "Brauer, Fred and Kribs, Christopher", + abstract = "Dynamical Systems for Biological Modeling: An Introduction + prepares both biology and mathematics students with the + understanding and techniques necessary to undertake basic + modeling of biological systems. It achieves this through the + development and analysis of dynamical systems. The approach + emphasizes qualitative ideas rather than explicit computations.", + publisher = "CRC Press", + month = dec, + year = 2015, + language = "en" +} + +@ARTICLE{Kiefer2018-oy, + title = "Expanding the Nucleoside Recoding Toolkit: Revealing {RNA} + Population Dynamics with 6-Thioguanosine", + author = "Kiefer, Lea and Schofield, Jeremy A and Simon, Matthew D", + abstract = "RNA-sequencing (RNA-seq) measures RNA abundance in a biological + sample but does not provide temporal information about the + sequenced RNAs. Metabolic labeling can be used to distinguish + newly made RNAs from pre-existing RNAs. Mutations induced from + chemical recoding of the hydrogen bonding pattern of the + metabolic label can reveal which RNAs are new in the context of a + sequencing experiment. These nucleotide recoding strategies have + been developed for a single uridine analogue, 4-thiouridine + (s4U), limiting the scope of these experiments. Here we report + the first use of nucleoside recoding with a guanosine analogue, + 6-thioguanosine (s6G). Using TimeLapse sequencing + (TimeLapse-seq), s6G can be recoded under RNA-friendly oxidative + nucleophilic-aromatic substitution conditions to produce adenine + analogues (substituted 2-aminoadenosines). 
We demonstrate the + first use of s6G recoding experiments to reveal + transcriptome-wide RNA population dynamics.", + journal = "J. Am. Chem. Soc.", + volume = 140, + number = 44, + pages = "14567--14570", + month = nov, + year = 2018, + language = "en" +} + +@ARTICLE{Kimmerling2016-mk, + title = "A microfluidic platform enabling single-cell {RNA-seq} of + multigenerational lineages", + author = "Kimmerling, Robert J and Lee Szeto, Gregory and Li, Jennifer W + and Genshaft, Alex S and Kazer, Samuel W and Payer, Kristofor R + and de Riba Borrajo, Jacob and Blainey, Paul C and Irvine, + Darrell J and Shalek, Alex K and Manalis, Scott R", + abstract = "We introduce a microfluidic platform that enables off-chip + single-cell RNA-seq after multi-generational lineage tracking + under controlled culture conditions. We use this platform to + generate whole-transcriptome profiles of primary, activated + murine CD8+ T-cell and lymphocytic leukemia cell line lineages. + Here we report that both cell types have greater intra- than + inter-lineage transcriptional similarity. For CD8+ T-cells, genes + with functional annotation relating to lymphocyte differentiation + and function--including Granzyme B--are enriched among the genes + that demonstrate greater intra-lineage expression level + similarity. Analysis of gene expression covariance with matched + measurements of time since division reveals cell type-specific + transcriptional signatures that correspond with cell cycle + progression. We believe that the ability to directly measure the + effects of lineage and cell cycle-dependent transcriptional + profiles of single cells will be broadly useful to fields where + heterogeneous populations of cells display distinct clonal + trajectories, including immunology, cancer, and developmental + biology.", + journal = "Nat. 
Commun.", + volume = 7, + pages = "10220", + month = jan, + year = 2016, + language = "en" +} + +@ARTICLE{Tabula_Muris_Consortium2020-cf, + title = "A single-cell transcriptomic atlas characterizes ageing tissues + in the mouse", + author = "{Tabula Muris Consortium}", + abstract = "Ageing is characterized by a progressive loss of physiological + integrity, leading to impaired function and increased + vulnerability to death1. Despite rapid advances over recent + years, many of the molecular and cellular processes that underlie + the progressive loss of healthy physiology are poorly + understood2. To gain a better insight into these processes, here + we generate a single-cell transcriptomic atlas across the + lifespan of Mus musculus that includes data from 23 tissues and + organs. We found cell-specific changes occurring across multiple + cell types and organs, as well as age-related changes in the + cellular composition of different organs. Using single-cell + transcriptomic data, we assessed cell-type-specific + manifestations of different hallmarks of ageing-such as + senescence3, genomic instability4 and changes in the immune + system2. 
This transcriptomic atlas-which we denote Tabula Muris + Senis, or 'Mouse Ageing Cell Atlas'-provides molecular + information about how the most important hallmarks of ageing are + reflected in a broad range of tissues and cell types.", + journal = "Nature", + volume = 583, + number = 7817, + pages = "590--595", + month = jul, + year = 2020, + language = "en" +} + +@ARTICLE{Westendorp2012-wi, + title = "{E2F7} represses a network of oscillating cell cycle genes to + control S-phase progression", + author = "Westendorp, Bart and Mokry, Michal and Groot Koerkamp, Marian J A + and Holstege, Frank C P and Cuppen, Edwin and de Bruin, Alain", + abstract = "E2F transcription factors are known to be important for timely + activation of G(1)/S and G(2)/M genes required for cell cycle + progression, but transcriptional mechanisms for deactivation of + cell cycle-regulated genes are unknown. Here, we show that E2F7 + is highly expressed during mid to late S-phase, occupies + promoters of G(1)/S-regulated genes and represses their + transcription. ChIP-seq analysis revealed that E2F7 binds + preferentially to genomic sites containing the TTCCCGCC motif, + which closely resembles the E2F consensus site. We identified 89 + target genes that carry E2F7 binding sites close to the + transcriptional start site and that are directly repressed by + short-term induction of E2F7. Most of these target genes are + known to be activated by E2Fs and are involved in DNA + replication, metabolism and DNA repair. Importantly, induction of + E2F7 during G(0)-G(1)/S resulted in S-phase arrest and DNA + damage, whereas expression of E2F7 during G(2)/M failed to + disturb cell cycle progression. 
These findings provide strong + evidence that E2F7 directly controls the downswing of oscillating + G(1)/S genes during S-phase progression.", + journal = "Nucleic Acids Res.", + volume = 40, + number = 8, + pages = "3511--3523", + month = apr, + year = 2012, + language = "en" +} + + +@ARTICLE{Zhou2017-du, + author = {Jiayi Ma and Ji Zhao and Hanqi Guo and Junjun Jiang and Huabing Zhou and Yuan Gao}, + title = {Locality Preserving Matching}, + journal = {Proceedings of the Twenty-Sixth International Joint Conference on + Artificial Intelligence, {IJCAI-17}}, + pages = {4492--4498}, + year = {2017}, +} + + +@ARTICLE{Gordon2020-pt, + title = "A {SARS-CoV-2} protein interaction map reveals targets for drug repurposing", + author = "Gordon, David E and Jang, Gwendolyn M and Bouhaddou, Mehdi and Xu, Jiewei and Obernier, Kirsten and White, Kris M and O'Meara, Matthew J and Rezelj, Veronica V and Guo, Jeffrey Z and Swaney, Danielle L and Tummino, Tia A and H{\"u}ttenhain, Ruth and Kaake, Robyn M and Richards, Alicia L and Tutuncuoglu, Beril and + Foussard, Helene and Batra, Jyoti and Haas, Kelsey and Modak, + Maya and Kim, Minkyu and Haas, Paige and Polacco, Benjamin J and + Braberg, Hannes and Fabius, Jacqueline M and Eckhardt, Manon and + Soucheray, Margaret and Bennett, Melanie J and Cakir, Merve and + McGregor, Michael J and Li, Qiongyu and Meyer, Bjoern and Roesch, + Ferdinand and Vallet, Thomas and Mac Kain, Alice and Miorin, Lisa + and Moreno, Elena and Naing, Zun Zar Chi and Zhou, Yuan and Peng, + Shiming and Shi, Ying and Zhang, Ziyang and Shen, Wenqi and + Kirby, Ilsa T and Melnyk, James E and Chorba, John S and Lou, + Kevin and Dai, Shizhong A and Barrio-Hernandez, Inigo and Memon, + Danish and Hernandez-Armenta, Claudia and Lyu, Jiankun and Mathy, + Christopher J P and Perica, Tina and Pilla, Kala Bharath and + Ganesan, Sai J and Saltzberg, Daniel J and Rakesh, Ramachandran + and Liu, Xi and Rosenthal, Sara B and Calviello, Lorenzo and + Venkataramanan, 
Srivats and Liboy-Lugo, Jose and Lin, Yizhu and + Huang, Xi-Ping and Liu, Yongfeng and Wankowicz, Stephanie A and + Bohn, Markus and Safari, Maliheh and Ugur, Fatima S and Koh, + Cassandra and Savar, Nastaran Sadat and Tran, Quang Dinh and + Shengjuler, Djoshkun and Fletcher, Sabrina J and O'Neal, Michael + C and Cai, Yiming and Chang, Jason C J and Broadhurst, David J + and Klippsten, Saker and Sharp, Phillip P and Wenzell, Nicole A + and Kuzuoglu-Ozturk, Duygu and Wang, Hao-Yuan and Trenker, + Raphael and Young, Janet M and Cavero, Devin A and Hiatt, Joseph + and Roth, Theodore L and Rathore, Ujjwal and Subramanian, Advait + and Noack, Julia and Hubert, Mathieu and Stroud, Robert M and + Frankel, Alan D and Rosenberg, Oren S and Verba, Kliment A and + Agard, David A and Ott, Melanie and Emerman, Michael and Jura, + Natalia and von Zastrow, Mark and Verdin, Eric and Ashworth, Alan + and Schwartz, Olivier and d'Enfert, Christophe and Mukherjee, + Shaeri and Jacobson, Matt and Malik, Harmit S and Fujimori, + Danica G and Ideker, Trey and Craik, Charles S and Floor, Stephen + N and Fraser, James S and Gross, John D and Sali, Andrej and + Roth, Bryan L and Ruggero, Davide and Taunton, Jack and Kortemme, + Tanja and Beltrao, Pedro and Vignuzzi, Marco and + Garc{\'\i}a-Sastre, Adolfo and Shokat, Kevan M and Shoichet, + Brian K and Krogan, Nevan J", + abstract = "A newly described coronavirus named severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), which is the causative agent of coronavirus disease 2019 (COVID-19), has infected over 2.3 million people, led to the death of more than 160,000 individuals and caused worldwide social and economic disruption1,2. There are no antiviral drugs with proven clinical efficacy for the treatment of COVID-19, nor are there any vaccines that prevent infection with SARS-CoV-2, and efforts to develop drugs and vaccines are hampered by the limited knowledge of the molecular details of how SARS-CoV-2 infects cells. 
Here we cloned, tagged and expressed 26 of the 29 SARS-CoV-2 proteins in human cells and identified the human proteins that physically associated with each of the SARS-CoV-2 proteins using affinity-purification mass spectrometry, identifying 332 high-confidence protein-protein interactions between SARS-CoV-2 and human proteins. Among these, we identify 66 druggable human proteins or host factors targeted by 69 compounds (of which, 29 drugs are approved by the US Food and Drug Administration, 12 are in clinical trials and 28 are preclinical compounds). We screened a subset of these in multiple viral assays and found two sets of pharmacological agents that displayed antiviral activity: inhibitors of mRNA translation and predicted regulators of the sigma-1 and sigma-2 receptors. Further studies of these host-factor-targeting agents, including their combination with drugs that directly target viral enzymes, could lead to a therapeutic regimen to treat COVID-19.", + journal = "Nature", + volume = 583, + number = 7816, + pages = "459--468", + month = jul, + year = 2020, + language = "en" +} + +@UNPUBLISHED{Hein2021-mj, + title = "Functional single-cell genomics of human cytomegalovirus + infection", + author = "Hein, Marco Y and Weissman, Jonathan S", + abstract = "The complex life cycle of herpesviruses is orchestrated by the + interplay of host factors and hundreds of viral genes. + Understanding how they work together and how perturbations of + viral and host factors impact infection represents both a + fundamental problem in virology and the basis for designing + antiviral interventions. Here, we use CRISPR screening to + comprehensively define the functional contribution of each viral + and host factor to human cytomegalovirus (HCMV) infection in + primary cells. We then record the transcriptomes of tens of + thousands of single cells, and monitor how genetic perturbation + of critical host and viral factors alters the timing, course, and + progression of infection. 
We find that normally, the large + majority of cells follow a stereotypical transcriptional + trajectory. Perturbing critical host factors does not change this + trajectory per se, but can either stall, delay or accelerate + progression along the trajectory, allowing us to pinpoint + systematically the stage of infection at which each host factor + acts. Conversely, perturbation of viral factors can create + distinct, abortive trajectories. Our results reveal a dichotomy + between the roles of host and viral factors and more generally + provide a road map for functional dissection of host-pathogen + interactions. \#\#\# Competing Interest Statement The authors + have declared no competing interest.", + journal = "Cold Spring Harbor Laboratory", + pages = "775080", + month = jan, + year = 2021, + language = "en" +} + +@ARTICLE{Stoeckius2017-vw, + title = "Simultaneous epitope and transcriptome measurement in single cells", + author = "Stoeckius, Marlon and Hafemeister, Christoph and Stephenson, William and Houck-Loomis, Brian and Chattopadhyay, Pratip K and Swerdlow, Harold and Satija, Rahul and Smibert, Peter", + abstract = "High-throughput single-cell RNA sequencing has transformed our + understanding of complex cell populations, but it does not + provide phenotypic information such as cell-surface protein + levels. Here, we describe cellular indexing of transcriptomes and + epitopes by sequencing (CITE-seq), a method in which + oligonucleotide-labeled antibodies are used to integrate cellular + protein and transcriptome measurements into an efficient, + single-cell readout. CITE-seq is compatible with existing + single-cell sequencing approaches and scales readily with + throughput increases.", + journal = "Nat. 
Methods", + volume = 14, + number = 9, + pages = "865--868", + month = sep, + year = 2017, + language = "en" +} + +@ARTICLE{Kim2000-lb, + title = "Multiconfiguration molecular mechanics algorithm for potential + energy surfaces of chemical reactions", + author = "Kim, Yongho and Corchado, Jos{\'e} C and Vill{\`a}, Jordi and + Xing, Jianhua and Truhlar, Donald G", + abstract = "We present an efficient algorithm for generating semiglobal + potential energy surfaces of reactive systems. The method takes + as input molecular mechanics force fields for reactants and + products and a quadratic expansion of the potential energy + surface around a small number of geometries whose locations are + determined by an iterative process. These Hessian expansions + might come, for example, from ab initio electronic structure + calculations, density functional theory, or semiempirical + molecular orbital theory. A 2$\times$2 electronic diabatic Hamiltonian + matrix is constructed from these data such that, by + construction, the lowest eigenvalue of this matrix provides a + semiglobal approximation to the lowest electronically adiabatic + potential energy surface. The theory is illustrated and tested + by applications to rate constant calculations for three + gas-phase test reactions, namely, the isomerization of + 1,3-cis-pentadiene, OH+CH4$\rightarrow$H2O+CH3, and CH2Cl+CH3F$\rightarrow$CH3Cl+CH2F.", + journal = "J. Chem. Phys.", + publisher = "American Institute of Physics", + volume = 112, + number = 6, + pages = "2718--2735", + month = feb, + year = 2000 +} + +@ARTICLE{Gorin2020-yh, + title = "Protein velocity and acceleration from single-cell multiomics + experiments", + author = "Gorin, Gennady and Svensson, Valentine and Pachter, Lior", + abstract = "The simultaneous quantification of protein and RNA makes possible + the inference of past, present, and future cell states from + single experimental snapshots. 
To enable such temporal analysis + from multimodal single-cell experiments, we introduce an + extension of the RNA velocity method that leverages estimates of + unprocessed transcript and protein abundances to extrapolate cell + states. We apply the model to six datasets and demonstrate + consistency among cell landscapes and phase portraits. The + analysis software is available as the protaccel Python package.", + journal = "Genome Biol.", + volume = 21, + number = 1, + pages = "39", + month = feb, + year = 2020, + keywords = "Bioinformatics; Computational biology; Multiomics; Protein + acceleration; Protein velocity; RNA velocity; Transcriptomics", + language = "en" +} + +@ARTICLE{Stoeckius2018-cb, + title = "Cell Hashing with barcoded antibodies enables multiplexing and + doublet detection for single cell genomics", + author = "Stoeckius, Marlon and Zheng, Shiwei and Houck-Loomis, Brian and + Hao, Stephanie and Yeung, Bertrand Z and Mauck, 3rd, William M + and Smibert, Peter and Satija, Rahul", + abstract = "Despite rapid developments in single cell sequencing, + sample-specific batch effects, detection of cell multiplets, and + experimental costs remain outstanding challenges. Here, we + introduce Cell Hashing, where oligo-tagged antibodies against + ubiquitously expressed surface proteins uniquely label cells from + distinct samples, which can be subsequently pooled. By sequencing + these tags alongside the cellular transcriptome, we can assign + each cell to its original sample, robustly identify cross-sample + multiplets, and ``super-load'' commercial droplet-based systems + for significant cost reduction. 
We validate our approach using a + complementary genetic approach and demonstrate how hashing can + generalize the benefits of single cell multiplexing to diverse + samples and experimental designs.", + journal = "Genome Biol.", + volume = 19, + number = 1, + pages = "224", + month = dec, + year = 2018, + language = "en" +} + +@ARTICLE{Love2014-na, + title = "Moderated estimation of fold change and dispersion for {RNA-seq} + data with {DESeq2}", + author = "Love, Michael I and Huber, Wolfgang and Anders, Simon", + abstract = "In comparative high-throughput sequencing assays, a fundamental + task is the analysis of count data, such as read counts per gene + in RNA-seq, for evidence of systematic changes across + experimental conditions. Small replicate numbers, discreteness, + large dynamic range and the presence of outliers require a + suitable statistical approach. We present DESeq2, a method for + differential analysis of count data, using shrinkage estimation + for dispersions and fold changes to improve stability and + interpretability of estimates. This enables a more quantitative + analysis focused on the strength rather than the mere presence of + differential expression. The DESeq2 package is available at + http://www.bioconductor.org/packages/release/bioc/html/DESeq2.html + webcite.", + journal = "Genome Biol.", + volume = 15, + number = 12, + pages = "550", + year = 2014, + language = "en" +} + +@BOOK{Marsden2012-zj, + title = "Vector Calculus", + author = "Marsden, Jerrold E and Tromba, Anthony", + abstract = "", + publisher = "W. H. 
Freeman and Company", + year = 2012, + language = "en" +} + +@ARTICLE{Perez-Carrasco2016, + doi = {10.1371/journal.pcbi.1005154}, + author = {Perez-Carrasco, Ruben and Guerrero, Pilar and Briscoe, James and Page, Karen M.}, + journal = {PLOS Computational Biology}, + publisher = {Public Library of Science}, + title = {Intrinsic Noise Profoundly Alters the Dynamics and Steady State of Morphogen-Controlled Bistable Genetic Switches}, + year = {2016}, + month = {10}, + volume = {12}, + url = {https://doi.org/10.1371/journal.pcbi.1005154}, + pages = {1-23}, + abstract = {During tissue development, patterns of gene expression determine the spatial arrangement of cell types. In many cases, gradients of secreted signalling molecules—morphogens—guide this process by controlling downstream transcriptional networks. A mechanism commonly used in these networks to convert the continuous information provided by the gradient into discrete transitions between adjacent cell types is the genetic toggle switch, composed of cross-repressing transcriptional determinants. Previous analyses have emphasised the steady state output of these mechanisms. Here, we explore the dynamics of the toggle switch and use exact numerical simulations of the kinetic reactions, the corresponding Chemical Langevin Equation, and Minimum Action Path theory to establish a framework for studying the effect of gene expression noise on patterning time and boundary position. This provides insight into the time scale, gene expression trajectories and directionality of stochastic switching events between cell states. Taking gene expression noise into account predicts that the final boundary position of a morphogen-induced toggle switch, although robust to changes in the details of the noise, is distinct from that of the deterministic system. Moreover, the dramatic increase in patterning time close to the boundary predicted from the deterministic case is substantially reduced. 
The resulting stochastic switching introduces differences in patterning time along the morphogen gradient that result in a patterning wave propagating away from the morphogen source with a velocity determined by the intrinsic noise. The wave sharpens and slows as it advances and may never reach steady state in a biologically relevant time. This could explain experimentally observed dynamics of pattern formation. Together the analysis reveals the importance of dynamical transients for understanding morphogen-driven transcriptional networks and indicates that gene expression noise can qualitatively alter developmental patterning.}, + number = {10}, +} + +@Article{Tang2017, + author={Tang, Ying + and Yuan, Ruoshi + and Wang, Gaowei + and Zhu, Xiaomei + and Ao, Ping}, + title={Potential landscape of high dimensional nonlinear stochastic dynamics with large noise}, + journal={Scientific Reports}, + year={2017}, + month={Nov}, + day={17}, + volume={7}, + number={1}, + pages={15762}, + abstract={Quantifying stochastic processes is essential to understand many natural phenomena, particularly in biology, including the cell-fate decision in developmental processes as well as the genesis and progression of cancers. While various attempts have been made to construct potential landscape in high dimensional systems and to estimate transition rates, they are practically limited to the cases where either noise is small or detailed balance condition holds. A general and practical approach to investigate real-world nonequilibrium systems, which are typically high-dimensional and subject to large multiplicative noise and the breakdown of detailed balance, remains elusive. Here, we formulate a computational framework that can directly compute the relative probabilities between locally stable states of such systems based on a least action method, without the necessity of simulating the steady-state distribution. 
The method can be applied to systems with arbitrary noise intensities through A-type stochastic integration, which preserves the dynamical structure of the deterministic counterpart dynamics. We demonstrate our approach in a numerically accurate manner through solvable examples. We further apply the method to investigate the role of noise on tumor heterogeneity in a 38-dimensional network model for prostate cancer, and provide a new strategy on controlling cell populations by manipulating noise strength.}, + issn={2045-2322}, + doi={10.1038/s41598-017-15889-2}, + url={https://doi.org/10.1038/s41598-017-15889-2} +} + +@book{freidlin2012random, + added-at = {2014-11-14T03:12:52.000+0100}, + author = {Freidlin, Mark I. and Wentzell, Alexander D.}, + biburl = {https://www.bibsonomy.org/bibtex/26df179288a704c367421a6f9801fe949/peter.ralph}, + interhash = {0926f80cc9f0d77d5bce51922d3ff9ea}, + intrahash = {6df179288a704c367421a6f9801fe949}, + isbn = {9783642258473}, + keywords = {Freidlin-Wentzell_theory dynamical_systems large_deviations stochastic_perturbation}, + publisher = {Springer}, + series = {Grundlehren der mathematischen Wissenschaften}, + timestamp = {2014-11-14T03:15:07.000+0100}, + title = {Random Perturbations of Dynamical Systems}, + url = {http://books.google.de/books?id=p8LFMILAiMEC}, + year = 2012 +} + +@article{onsager1953, + title = {Fluctuations and Irreversible Processes}, + author = {Onsager, Lars and Machlup, Stefan}, + journal = {Phys. Rev.}, + volume = {91}, + issue = {6}, + pages = {1505--1512}, + numpages = {0}, + year = {1953}, + month = {Sep}, + publisher = {American Physical Society}, + doi = {10.1103/PhysRev.91.1505}, + url = {https://link.aps.org/doi/10.1103/PhysRev.91.1505} +} + + +@article{Maier1997, + ISSN = {00361399}, + URL = {http://www.jstor.org/stable/2951902}, + abstract = {Consider a two-dimensional continuous-time dynamical system, with an attracting fixed point S. 
If the deterministic dynamics are perturbed by white noise (random perturbations) of strength ε, the system state will eventually leave the domain of attraction Ω of S. We analyze the case when, as ε → 0, the exit location on the boundary ∂Ω is increasingly concentrated near a saddle point H of the deterministic dynamics. We show using formal methods that the asymptotic form of the exit location distribution on ∂Ω is generically non-Gaussian and asymmetric, and classify the possible limiting distributions. A key role is played by a parameter μ, equal to the ratio |λs(H)|λu(H) of the stable and unstable eigenvalues of the linearized deterministic flow at H. If $\mu < 1$, then the exit location distribution is generically asymptotic as ε → 0 to a Weibull distribution with shape parameter 2/μ, on the O(εμ/2) lengthscale near H. If $\mu > 1$, it is generically asymptotic to a distribution on the O(ε1/2) lengthscale, whose moments we compute. Our treatment employs both matched asymptotic expansions and stochastic analysis. As a byproduct of our treatment, we clarify the limitations of the traditional Eyring formula for the weak-noise exit time asymptotics.}, + author = {Robert S. Maier and Daniel L. Stein}, + journal = {SIAM Journal on Applied Mathematics}, + number = {3}, + pages = {752--790}, + publisher = {Society for Industrial and Applied Mathematics}, + title = {Limiting Exit Location Distributions in the Stochastic Exit Problem}, + volume = {57}, + year = {1997} +} + +@article{Aurell2002, + title = {Epigenetics as a First Exit Problem}, + author = {Aurell, Erik and Sneppen, Kim}, + journal = {Phys. Rev. 
Lett.}, + volume = {88}, + issue = {4}, + pages = {048101}, + numpages = {4}, + year = {2002}, + month = {Jan}, + publisher = {American Physical Society}, + doi = {10.1103/PhysRevLett.88.048101}, + url = {https://link.aps.org/doi/10.1103/PhysRevLett.88.048101} +} + +@incollection{VANKAMPEN2007193, + title = {Chapter VIII - THE FOKKER–PLANCK EQUATION}, + editor = {N.G. Van Kampen}, + booktitle = {Stochastic Processes in Physics and Chemistry (Third Edition)}, + publisher = {Elsevier}, + edition = {Third Edition}, + address = {Amsterdam}, + pages = {193-218}, + year = {2007}, + series = {North-Holland Personal Library}, + issn = {09255818}, + doi = {https://doi.org/10.1016/B978-044452965-7/50011-8}, + url = {https://www.sciencedirect.com/science/article/pii/B9780444529657500118}, + author = {N.G. Van Kampen} +} + +@article{Merkl2013, + author = {Merkl, Claudia and Saalfrank, Anja and Riesen, Nathalie and Kühn, Ralf and Pertek, Anna and Eser, Stefan and Hardt, Markus and Kind, Alexander and Saur, Dieter and Wurst, Wolfgang and Iglesias, Antonio and Schnieke, Angelika}, + year = {2013}, + month = {01}, + pages = {e55170}, + title = {Efficient Generation of Rat Induced Pluripotent Stem Cells Using a Non-Viral Inducible Vector}, + volume = {8}, + journal = {PloS one}, + doi = {10.1371/journal.pone.0055170} +} + +@book{fey65, + added-at = {2008-06-25T19:30:29.000+0200}, + address = {New York}, + annote = {polarones and Path Integrals}, + author = {Feynman, Richard P. 
and Hibbs, Albert R.}, + biburl = {https://www.bibsonomy.org/bibtex/20ce4ceae88fcbe31e3dafdb3aa8f8a66/jgl}, + citeulike-article-id = {2484177}, + comment = {polarones and Path Integrals}, + interhash = {23ae1411bb88c2017745e1b468ac27c3}, + intrahash = {0ce4ceae88fcbe31e3dafdb3aa8f8a66}, + keywords = {high-tc, htsct, theory}, + posted-at = {2008-03-07 13:36:20}, + priority = {2}, + publisher = {McGraw-Hill}, + timestamp = {2008-06-25T19:31:36.000+0200}, + title = {Quantum Mechanics and Path Integrals}, + year = 1965 +} +@article{takahashi2006induction, + title={Induction of pluripotent stem cells from mouse embryonic and adult fibroblast cultures by defined factors}, + author={Takahashi, Kazutoshi and Yamanaka, Shinya}, + journal={cell}, + volume={126}, + number={4}, + pages={663--676}, + year={2006}, + publisher={Elsevier} +} +@ARTICLE{2020SciPy-NMeth, + author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and Haberland, Matt and Reddy, Tyler and Cournapeau, David and + Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and Bright, Jonathan and {van der Walt}, St{\'e}fan J. and + Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and + Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and Kern, Robert and Larson, Eric and Carey, C J and Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and + Harris, Charles R. and Archibald, Anne M. and + Ribeiro, Ant{\^o}nio H. 
and Pedregosa, Fabian and + {van Mulbregt}, Paul and {SciPy 1.0 Contributors}}, + title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific + Computing in Python}}, + journal = {Nature Methods}, + year = {2020}, + volume = {17}, + pages = {261--272}, + adsurl = {https://rdcu.be/b08Wh}, + doi = {10.1038/s41592-019-0686-2}, +} + +@article{larsson2019genomic, + title={Genomic encoding of transcriptional burst kinetics}, + author={Larsson, Anton JM and Johnsson, Per and Hagemann-Jensen, Michael and Hartmanis, Leonard and Faridani, Omid R and Reinius, Bj{\"o}rn and Segerstolpe, {\AA}sa and Rivera, Chloe M and Ren, Bing and Sandberg, Rickard}, + journal={Nature}, + volume={565}, + number={7738}, + pages={251--254}, + year={2019}, + publisher={Nature Publishing Group} +} + +@article{grun2014validation, + title={Validation of noise models for single-cell transcriptomics}, + author={Gr{\"u}n, Dominic and Kester, Lennart and Van Oudenaarden, Alexander}, + journal={Nature methods}, + volume={11}, + number={6}, + pages={637--640}, + year={2014}, + publisher={Nature Publishing Group} +} + +@article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, Fabian and Varoquaux, Gaël and Gramfort, Alexandre and Michel, Vincent. 
+ and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter + and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and + Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, Édouard}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} +} + +@article {Wang8257, + author = {Wang, Jin and Zhang, Kun and Xu, Li and Wang, Erkang}, + title = {Quantifying the Waddington landscape and biological paths for development and differentiation}, + volume = {108}, + number = {20}, + pages = {8257--8262}, + year = {2011}, + doi = {10.1073/pnas.1017017108}, + publisher = {National Academy of Sciences}, + abstract = {We developed a theoretical framework to prove the existence and quantify the Waddington landscape as well as chreode-biological paths for development and differentiation. The cells can have states with the higher probability ones giving the different cell types. Different cell types correspond to different basins of attractions of the probability landscape. We study how the cells develop from undifferentiated cells to differentiated cells from landscape perspectives. We quantified the Waddington landscape through construction of underlying probability landscape for cell development. We show the developmental process proceeds as moving from undifferentiated to the differentiated basins of attractions. The barrier height of the basins of attractions correlates with the escape time that determines the stability of cell types. We show that the developmental process can be quantitatively described and uncovered by the biological paths on the quantified Waddington landscape from undifferentiated to the differentiated cells. We found the dynamics of the developmental process is controlled by a combination of the gradient and curl force on the landscape. The biological paths often do not follow the steepest descent path on the landscape. 
The landscape framework also quantifies the possibility of reverse differentiation process such as cell reprogramming from differentiated cells back to the original stem cell. We show that the biological path of reverse differentiation is irreversible and different from the one for differentiation process. We found that the developmental process described by the underlying landscape and the associated biological paths is relatively stable and robust against the influences of environmental perturbations.}, + issn = {0027-8424}, + URL = {https://www.pnas.org/content/108/20/8257}, + eprint = {https://www.pnas.org/content/108/20/8257.full.pdf}, + journal = {Proceedings of the National Academy of Sciences} +} \ No newline at end of file diff --git a/docs/references.md b/docs/references.md new file mode 100644 index 000000000..00ad6a6ea --- /dev/null +++ b/docs/references.md @@ -0,0 +1,5 @@ +# References + +```{bibliography} +:cited: +``` diff --git a/docs/requirements.txt b/docs/requirements.txt old mode 100755 new mode 100644 index f69d56a93..7af07f223 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -34,6 +34,17 @@ docutils mock pandocfilters readthedocs-sphinx-ext -sphinxcontrib-bibtex>=2.3 sphinx-gallery typing-extensions + +docutils>=0.8,!=0.18.*,!=0.19.* +ipython +sphinx-book-theme>=1.0.1 +sphinx_copybutton +sphinx-design +sphinxext-opengraph +sphinx-hoverxref +sphinxcontrib-bibtex>=1.0.0 +myst-parser +myst-nb +sphinx-autodoc-typehints \ No newline at end of file diff --git a/docs/source/_ext/pdfembed.py b/docs/source/_ext/pdfembed.py deleted file mode 100644 index 4b0c12da6..000000000 --- a/docs/source/_ext/pdfembed.py +++ /dev/null @@ -1,51 +0,0 @@ -# We would like to acknowledge the contribution of SuperKogito for their valuable code of sphinxcontrib-pdfembed. -# The original code can be found at https://github.com/SuperKogito/sphinxcontrib-pdfembed/blob/master/sphinxcontrib/pdfembed.py. 
- -from docutils import nodes - - -def pdfembed_html(pdfembed_specs): - """ - Build the iframe code for the pdf file, - """ - html_base_code = """ - - """ - return html_base_code % ( - pdfembed_specs["src"], - pdfembed_specs["height"], - pdfembed_specs["width"], - pdfembed_specs["align"], - ) - - -def pdfembed_role(typ, rawtext, text, lineno, inliner, options={}, content=[]): - """ - Get iframe specifications and generate the associate HTML code for the pdf iframe. - """ - # parse and init variables - text = text.replace(" ", "") - pdfembed_specs = {} - # read specs - for component in text.split(","): - pdfembed_specs[component.split(":")[0]] = component.split(":")[1] - # build node from pdf iframe html code - node = nodes.raw("", pdfembed_html(pdfembed_specs), format="html") - return [node], [] - - -def setup(app): - """ - Set up the app with the extension function - """ - app.add_role("pdfembed", pdfembed_role) diff --git a/docs/source/_templates/custom-class-template.rst b/docs/source/_templates/custom-class-template.rst deleted file mode 100644 index b29757c52..000000000 --- a/docs/source/_templates/custom-class-template.rst +++ /dev/null @@ -1,32 +0,0 @@ -{{ fullname | escape | underline}} - -.. currentmodule:: {{ module }} - -.. autoclass:: {{ objname }} - :members: - :show-inheritance: - :inherited-members: - - {% block methods %} - .. automethod:: __init__ - - {% if methods %} - .. rubric:: {{ _('Methods') }} - - .. autosummary:: - {% for item in methods %} - ~{{ name }}.{{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block attributes %} - {% if attributes %} - .. rubric:: {{ _('Attributes') }} - - .. 
autosummary:: - {% for item in attributes %} - ~{{ name }}.{{ item }} - {%- endfor %} - {% endif %} - {% endblock %} diff --git a/docs/source/_templates/custom-module-template.rst b/docs/source/_templates/custom-module-template.rst deleted file mode 100644 index a23004f93..000000000 --- a/docs/source/_templates/custom-module-template.rst +++ /dev/null @@ -1,66 +0,0 @@ -{{ fullname | escape | underline}} - -.. automodule:: {{ fullname }} - - {% block attributes %} - {% if attributes %} - .. rubric:: Module Attributes - - .. autosummary:: - :toctree: - {% for item in attributes %} - {{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block functions %} - {% if functions %} - .. rubric:: {{ _('Functions') }} - - .. autosummary:: - :toctree: - {% for item in functions %} - {{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block classes %} - {% if classes %} - .. rubric:: {{ _('Classes') }} - - .. autosummary:: - :toctree: - :template: custom-class-template.rst - {% for item in classes %} - {{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block exceptions %} - {% if exceptions %} - .. rubric:: {{ _('Exceptions') }} - - .. autosummary:: - :toctree: - {% for item in exceptions %} - {{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - -{% block modules %} -{% if modules %} -.. rubric:: Modules - -.. autosummary:: - :toctree: - :template: custom-class-template.rst - :recursive: -{% for item in modules %} - {{ item }} -{%- endfor %} -{% endif %} -{% endblock %} diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100755 index 4af1d4583..000000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,193 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. 
For a full -# list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -import os -import sys -from pathlib import Path - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -from urllib.request import urlretrieve - -module_path = os.path.join(os.path.dirname(__file__), "../..") -sys.path.insert(0, os.path.abspath(module_path)) -sys.path.insert(0, os.path.abspath("../")) -sys.path.insert(0, os.path.abspath("../../")) -sys.path.append(os.path.abspath("./_ext")) - -import dynamo -from docs.source.utils import _download_docs_dirs - -# HERE = Path(__file__).parent -# sys.path[:0] = [str(HERE.parent)] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] -source_suffix = [".rst"] -bibtex_bibfiles = ["./notebooks/lap.bib", "./notebooks/dynamo_ref.bib"] -bibtex_reference_style = "author_year" - -master_doc = "index" - - -# the following are borrowed from scvelo -# -- Retrieve notebooks ------------------------------------------------ -# notebooks_url = "https://github.com/aristoteleo/dynamo-tutorials/raw/master/" -# notebooks = [ -# "Introduction.ipynb", -# "Primer.ipynb", -# "Differential_geometry.ipynb", -# "zebrafish.ipynb", -# # "dentategyrus_subset_scvelo.ipynb", -# # "pancreatic_endocrinogenesis.ipynb", -# "scNT_seq_readthedocs.ipynb", -# "scEU_seq_rpe1_analysis_kinetic.ipynb", -# "scEU_seq_organoid_analysis_kinetic.ipynb", -# ] -# for nb in notebooks: -# try: -# urlretrieve(notebooks_url + nb, nb) -# except: -# pass - -github_org = "aristoteleo" -github_code_repo = "dynamo-release" -github_ref = "master" -github_nb_repo = "dynamo_readthedocs" -_download_docs_dirs(repo_url=f"https://github.com/{github_org}/{github_nb_repo}") 
- -# Add notebooks prolog to Google Colab and nbviewer -nbsphinx_prolog = r""" -{% set docname = 'github/aristoteleo/dynamo-tutorials/blob/master/' + env.doc2path(env.docname, base=None) %} -.. raw:: html - -
- - Open In Colab - - Open In nbviewer -
-""" -nbsphinx_execute = "never" # never execute notebooks - -# -- Project information ----------------------------------------------------- - -project = "dynamo" -copyright = "2020, Xiaojie Qiu, Yan Zhang, Ke Ni" -author = "Xiaojie Qiu, Yan Zhang, Ke Ni" - -# The full version, including alpha/beta/rc tags -release = "1.4.1" - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. - -# specify sphinx version -needs_sphinx = "4" - -extensions = [ - "nbsphinx", - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.doctest", - "sphinx.ext.coverage", - "sphinx.ext.mathjax", - "sphinx.ext.napoleon", - # Link to other project's documentation (see mapping below) - "sphinx.ext.intersphinx", - # Add a link to the Python source code for classes, functions etc. - "sphinx.ext.viewcode", - "sphinx.ext.githubpages", - "sphinx.ext.autosectionlabel", - # Automatically document param types (less noise in class signature) - "sphinx_autodoc_typehints", - "sphinxcontrib.bibtex", - "sphinx_gallery.load_style", - # pdf embed - "pdfembed", -] - -# Mappings for sphinx.ext.intersphinx. Projects have to have Sphinx-generated doc! 
(.inv file) -intersphinx_mapping = { - "anndata": ("https://anndata.readthedocs.io/en/stable/", None), - "cycler": ("https://matplotlib.org/cycler/", None), - "h5py": ("http://docs.h5py.org/en/stable/", None), - "ipython": ("https://ipython.readthedocs.io/en/stable/", None), - "louvain": ("https://louvain-igraph.readthedocs.io/en/latest/", None), - "matplotlib": ("https://matplotlib.org/", None), - "networkx": ( - "https://networkx.github.io/documentation/networkx-1.10/", - None, - ), - "numpy": ("https://docs.scipy.org/doc/numpy/", None), - "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), - "pytest": ("https://docs.pytest.org/en/latest/", None), - "python": ("https://docs.python.org/3", None), - "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), - "seaborn": ("https://seaborn.pydata.org/", None), - "sklearn": ("https://scikit-learn.org/stable/", None), -} - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] - -# Generate the API documentation when building -autosummary_generate = True -autodoc_member_order = "bysource" -autoclass_content = "both" # Add __init__ doc (ie. params) to class summaries -# Remove 'view source code' from top of page (for html, not python) -html_show_sourcelink = True -# If no class summary, inherit base class summary -autodoc_inherit_docstrings = True - -autodoc_default_flags = [ - # Make sure that any autodoc declarations show the right members - "members", - "inherited-members", - "private-members", - "show-inheritance", -] -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. 
-# -html_theme = "sphinx_rtd_theme" -html_theme_options = dict( - navigation_depth=4, - logo_only=True, -) -html_context = dict( - display_github=True, # Integrate GitHub - github_user="aristoteleo", # organization - github_repo="dynamo", # Repo name - github_version="master", # Version - conf_py_path="/docs/source/", -) -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] -html_css_files = ["css/custom.css"] -# html_logo = "_static/logo.png" -html_logo = "_static/logo_with_word.png" - - -def setup(app): - app.add_css_file("css/custom.css") - - -sphinx_enable_epub_build = False -sphinx_enable_pdf_build = False diff --git a/docs/source/utils.py b/docs/source/utils.py deleted file mode 100644 index b83947d72..000000000 --- a/docs/source/utils.py +++ /dev/null @@ -1,59 +0,0 @@ -import glob -import os -import re -import subprocess -from logging import info, warning -from pathlib import Path -from shutil import copy, copytree, rmtree -from tempfile import TemporaryDirectory -from typing import Dict, ForwardRef, List, Union - -from git import Repo - -CUR_DIR = Path(__file__).parent - -NOTEBOOK_BRANCH = "main" -DYNAMO_NOTEBOOK_PATH_ENV_VAR = "DYNAMO_DOWNLOAD_NOTEBOOKS" - - -def _download_docs_dirs(repo_url: str) -> None: - def copy_docs_dirs(repo_path: Union[str, Path]) -> None: - repo_path = Path(repo_path) - print("repo path:", repo_path) - for dirname in ["notebooks", "gallery", "_static"]: - rmtree(dirname, ignore_errors=True) # locally re-cloning - copytree(repo_path / "docs" / "source" / dirname, dirname) - - # copy all rsts in docs/source/*.rst - for file_path in glob.glob(str(repo_path / "docs" / "source" / "*.rst")): - print("%s copied to source" % file_path) - copy(file_path, "./") # dest: source - - def fetch_remote(repo_url: str) -> None: - 
info(f"Fetching notebooks from repo `{repo_url}`") - with TemporaryDirectory() as repo_dir: - branch = NOTEBOOK_BRANCH - repo = Repo.clone_from(repo_url, repo_dir, depth=1, branch=branch) - repo.git.checkout(branch, force=True) - copy_docs_dirs(repo_dir) - - def fetch_local(repo_path: Union[str, Path]) -> None: - info(f"Fetching notebooks from local path `{repo_path}`") - repo_path = Path(repo_path) - if not repo_path.is_dir(): - raise OSError(f"`{repo_path}` is not a directory.") - copy_docs_dirs(repo_path) - - notebooks_local_path = Path( - os.environ.get(DYNAMO_NOTEBOOK_PATH_ENV_VAR, CUR_DIR.absolute().parent.parent.parent / "notebooks") - ) - try: - fetch_local(notebooks_local_path) - except Exception as e: - warning(f"read`{notebooks_local_path}` failed, error message: `{e}`. Trying remote") - require_download = int(os.environ.get(DYNAMO_NOTEBOOK_PATH_ENV_VAR, 1)) - if not require_download: - info(f"Used downloaded files as set in ENV") - return - - fetch_remote(repo_url) diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md new file mode 100644 index 000000000..96741eaf6 --- /dev/null +++ b/docs/tutorials/index.md @@ -0,0 +1,22 @@ +# Tutorials + +The easiest way to get familiar with dynamo is to follow along with our tutorials. +Many are also designed to work seamlessly in Google Colab, a free cloud computing platform. +Tutorials by default work with the latest installable version of dynamo. To view older tutorials, +change the documentation version using the tab at the bottom of the left sidebar. 
+ + +```{toctree} +:maxdepth: 2 + +index_preprocessing +index_conventional +index_labeling +index_differential_geometry +index_vector_pre +index_multivelo +index_gallery + +``` + + diff --git a/docs/tutorials/index_conventional.md b/docs/tutorials/index_conventional.md new file mode 100644 index 000000000..a4e4bf914 --- /dev/null +++ b/docs/tutorials/index_conventional.md @@ -0,0 +1,10 @@ +# Conventional scRNA-seq + +```{toctree} +:maxdepth: 1 + +notebooks/zebrafish +notebooks/dynamo_beyondvelo +notebooks/zebrafish_topography_analysis_tutorial + +``` diff --git a/docs/tutorials/index_differential_geometry.md b/docs/tutorials/index_differential_geometry.md new file mode 100644 index 000000000..cc156907b --- /dev/null +++ b/docs/tutorials/index_differential_geometry.md @@ -0,0 +1,13 @@ +# Differential geometry + + +```{toctree} +:maxdepth: 1 + +notebooks/tutorial_hsc_dynamo_megakaryocytes_appearance +notebooks/tutorial_hsc_dynamo_cellwise_analysis +notebooks/tutorial_hsc_dynamo_basophil_lineage +notebooks/Differential_geometry + + +``` diff --git a/docs/tutorials/index_gallery.md b/docs/tutorials/index_gallery.md new file mode 100644 index 000000000..f704898d9 --- /dev/null +++ b/docs/tutorials/index_gallery.md @@ -0,0 +1,15 @@ +# Gallery + +Tutorial gallery +================ + +Welcome to the dynamo tutorial gallery. If you have any new analysis results worth sharing by applying dynamo to your dataset, welcome to let us know on GitHub or via email and we can put them in this gallery. 
+ +![Pancreatic endocrinogenesis analysis](https://raw.githubusercontent.com/aristoteleo/dynamo_readthedocs/refs/heads/main/docs/source/gallery/images/pancreas_cover0.png) + +```{toctree} +:maxdepth: 1 + +notebooks/pancreatic_endocrinogenesis + +``` diff --git a/docs/tutorials/index_labeling.md b/docs/tutorials/index_labeling.md new file mode 100644 index 000000000..76fd8a2ad --- /dev/null +++ b/docs/tutorials/index_labeling.md @@ -0,0 +1,11 @@ +# Labeling scRNA-seq + +```{toctree} +:maxdepth: 1 + +notebooks/scNT_seq_readthedocs +notebooks/tutorial_hsc_velocity +notebooks/scEU_seq_organoid_analysis_kinetic +notebooks/scEU_seq_rpe1_analysis_kinetic + +``` diff --git a/docs/tutorials/index_multivelo.md b/docs/tutorials/index_multivelo.md new file mode 100644 index 000000000..c4971d935 --- /dev/null +++ b/docs/tutorials/index_multivelo.md @@ -0,0 +1,8 @@ +# Multi-Velocity + +```{toctree} +:maxdepth: 1 + +notebooks/dynamo_multivelo + +``` diff --git a/docs/tutorials/index_preprocessing.md b/docs/tutorials/index_preprocessing.md new file mode 100644 index 000000000..6f8d04a56 --- /dev/null +++ b/docs/tutorials/index_preprocessing.md @@ -0,0 +1,8 @@ +# Preprocessing + +```{toctree} +:maxdepth: 1 + +notebooks/tutorial_preprocess + +``` diff --git a/docs/tutorials/index_vector_pre.md b/docs/tutorials/index_vector_pre.md new file mode 100644 index 000000000..629fb6207 --- /dev/null +++ b/docs/tutorials/index_vector_pre.md @@ -0,0 +1,11 @@ +# Vector field predictions + + +```{toctree} +:maxdepth: 1 + +notebooks/lap_tutorial +notebooks/perturbation_tutorial +shiny + +``` \ No newline at end of file diff --git a/docs/tutorials/notebooks b/docs/tutorials/notebooks new file mode 160000 index 000000000..d587d9ea8 --- /dev/null +++ b/docs/tutorials/notebooks @@ -0,0 +1 @@ +Subproject commit d587d9ea8a55faf0a2d1ef22995331181f63164c diff --git a/docs/tutorials/shiny.md b/docs/tutorials/shiny.md new file mode 100644 index 000000000..0e2d0e159 --- /dev/null +++ 
b/docs/tutorials/shiny.md @@ -0,0 +1,194 @@ +# Shiny App + +## Introduction + +[Shiny](https://shiny.posit.co/py/) is a web application framework originally from the R programming language. It allows developers to create interactive web applications directly from code. We have developed the pipeline in LAP and perturbation tutorials into Shiny applications that allow users to interactively explore the results of the analyses. In this tutorial, we will walk through the basic steps to perform those two types of analyses in Shiny. Check out the original notebook for more details on the theory and results analyses. ([LAP](notebooks/lap_tutorial/lap_tutorial) and [perturbation](notebooks/perturbation_tutorial/perturbation_tutorial)) + +## Prerequisites + +To start the Shiny app, ensure that you have the Python version of Shiny installed. You can install it from PyPI: + +```bash +pip install shiny +``` + +or conda forge: + +```bash +conda install -c conda-forge shiny +``` + +Detailed instructions for installing Shiny can be found [here](https://shiny.posit.co/py/docs/install.html). To successfully perform the analyses, you will also need to have a processed dataset just like what we showed in the previous tutorials. Here we use the `dyn.sample_data.hematopoiesis()` as an example. + +## In silico perturbation experiments + +To run the in silico perturbation app in Shiny, you can run the following code: + +```python +import dynamo as dyn + +adata = dyn.sample_data.hematopoiesis() +dyn.shiny.perturbation_web_app(adata) +``` + +Then you can find the address of your shiny app from the output. By default, it is http://127.0.0.1:8000. Open the website you will see: + +![Perturbation App Screenshot 1](../_static/Shiny_tutorial_files/perturbation/1.png) + +On the left side is the control panel where you can select the perturbation experiment you want to perform. On the right side is the streamline plot before and after perturbation. 
+ +![Perturbation App Screenshot 2](../_static/Shiny_tutorial_files/perturbation/1_1.jpg) + +To start a perturbation experiment, manipulate the parameters in the first panel. The slider on the top controls the number of genes to perturb. Each gene will have two parameters, gene name and the expression value of perturbation. For each gene, specify its name and the corresponding perturbation expression value. Once you have completed the selection or input of genes and values, click the `Run Perturbation` button to start the experiment. The result will be on the right side of the screen under the title “Streamline Plot After Perturbation”. + +![Perturbation App Screenshot 3](../_static/Shiny_tutorial_files/perturbation/2_1.jpg) + +Here we suppress the expression of GATA1 by 100. The streamline plot shows that the suppression can divert cells from MEP-related lineages to GMP-related lineages. This aligns with the fact that GATA1 is the master regulator of the MEP lineage. + +![Perturbation App Screenshot 4](../_static/Shiny_tutorial_files/perturbation/2_2.jpg) + +Next, let’s add one more gene using the slider to perform a double suppression experiment. + +![Perturbation App Screenshot 5](../_static/Shiny_tutorial_files/perturbation/2_3.jpg) + +Here we suppress both SPI1 and GATA1. Click the `Run Perturbation` button to perform the experiment again. The result reveals a seesaw-effect regulation between SPI1 and GATA1 in driving the GMP and the MEP lineages. This is consistent with the fact that SPI1 and GATA1 are two master regulators of the GMP and the MEP lineages, respectively. + +![Perturbation App Screenshot 6](../_static/Shiny_tutorial_files/perturbation/3_1.jpg) + +The control panel below can be used to change the parameters of the streamline plot. For example, you can add GATA1 as one more `color` entry for the plot. Results will be displayed immediately on both views.
+ +## Most probable path predictions + +Similar to the perturbation app, you can run the most probable path app in Shiny by running the following code: + +```python +import dynamo as dyn + +adata = dyn.sample_data.hematopoiesis() +dyn.shiny.lap_web_app(adata) +``` + +Please be aware that the second part of the app, “Evaluate TFs ranking based on LAP analyses”, requires transcription factor (TF) information. You can specify the information in the second argument of the function. Here we use the example data `dyn.sample_data.human_tfs()`. + +```python +human_tfs = dyn.sample_data.human_tfs() +dyn.shiny.lap_web_app(adata, human_tfs) +``` + +If you don’t have the TF information, you can still proceed with running the first part of the app. + +### Part 1: Run pairwise least action path analyses + +On the top of the app, you will see the streamline plot illustrating the velocities of the given dataset. + +![LAP App Screenshot 1](../_static/Shiny_tutorial_files/lap/1_1.png) + +You can modify the group key and basis to explore different perspectives. Note that these two parameters are also used in the LAP analyses. + +![LAP App Screenshot 2](../_static/Shiny_tutorial_files/lap/1_2.png) + +Scroll down to the scatter plot. You can manually select cells to initialize the LAP analyses. + +![LAP App Screenshot 3](../_static/Shiny_tutorial_files/lap/2_1.jpg) + +Click any cell on the scatter plot; the detailed information of the selected cell will be displayed in the table “Points near cursor”. + +![LAP App Screenshot 4](../_static/Shiny_tutorial_files/lap/2_2.jpg) + +Click the add button if you are satisfied with the selection. The selected cells will be displayed in the table on the right. At the same time, the scatter plot will be updated with the selected cells and their nearest neighbors. + +![LAP App Screenshot 5](../_static/Shiny_tutorial_files/lap/2_3.jpg) + +Alternatively, you can draw a rectangle on the plot to select cells.
The selected cells will be displayed in the table “Points in brush”. + +![LAP App Screenshot 6](../_static/Shiny_tutorial_files/lap/2_4.png) + +Then add them to the table on the right. + +![LAP App Screenshot 7](../_static/Shiny_tutorial_files/lap/2_5.jpg) + +If you are not satisfied with the selection of cells in the table “Identified Cells to initialize the path”, you can click the reset button. + +![LAP App Screenshot 8](../_static/Shiny_tutorial_files/lap/2_6.jpg) + +All points will be removed from the table. You can start over again. Considering the running time, here we select three cells for cell type HSC, Meg, and Mon for illustration. LAP analyses on all cell types can be found in the tutorial “Most probable path predictions”. Click the “Run LAP analyses with identified cells” button to start the analyses. You will see a progress bar in the bottom-right corner of the screen. After the analyses are done, the results will be displayed in the following sections. + +![LAP App Screenshot 9](../_static/Shiny_tutorial_files/lap/3_1.jpg) + +The first section is the ranking of genes for each transition. The slider on the left sets the number of top genes to display. The text box on the right selects the transition. + +![LAP App Screenshot 10](../_static/Shiny_tutorial_files/lap/3_2.png) + +Here we select the transition `HSC->Mon` and the top 9 genes. The plot will be updated immediately. + +![LAP App Screenshot 11](../_static/Shiny_tutorial_files/lap/4_1.jpg) + +The next section is the visualization of the path. The control panel specifies the number and names of the transitions. + +![LAP App Screenshot 12](../_static/Shiny_tutorial_files/lap/4_2.png) + +Here we select both the development and reprogramming transitions. The corresponding least action paths will be updated in the plot. + +![LAP App Screenshot 13](../_static/Shiny_tutorial_files/lap/5_1.jpg) + +This section displays the LAP time barplot for the path originating from the specified cell type.
Since we used metabolic labeling based scRNA-seq, we are able to obtain absolute RNA velocity. Consequently, we can predict the actual time (in units of hours) of the LAP, which is a remarkable feature derived from the labeling data. + +![LAP App Screenshot 14](../_static/Shiny_tutorial_files/lap/5_2.jpg) + +If we enable the global LAP time, we can see the barplot of all transitions. + +![LAP App Screenshot 15](../_static/Shiny_tutorial_files/lap/6.png) + +The following heatmaps visualize the transition matrices of actions and LAP times between all pairwise cell type conversions. + +![LAP App Screenshot 16](../_static/Shiny_tutorial_files/lap/7_1.png) + +The last section is the kinetic heatmap of the given transition. You also need to specify the key of the transition matrix in the AnnData object. More explanation can be found in the API page of `dynamo.pl.kinetic_heatmap()`. + +![LAP App Screenshot 17](../_static/Shiny_tutorial_files/lap/7_2.jpg) + +Since the space is limited, it is difficult to identify the gene names on the right. Thus, we reduce the number of genes to visualize. + +### Part 2: Evaluate TFs ranking based on LAP analyses + +The second part of the app is to evaluate the ranking of transcription factors based on LAP analyses. Remember that you need to specify the transcription factor information when initializing the app. + +![LAP App Screenshot 18](../_static/Shiny_tutorial_files/lap/8_1.jpg) + +First, navigate to the top of the page and select the second tab to switch to the second part of the app. + +![LAP App Screenshot 19](../_static/Shiny_tutorial_files/lap/8_2.png) + +The structure is similar. Begin with an initialization page to input known transcription factors, and the subsequent sections will visualize the results. + +![LAP App Screenshot 20](../_static/Shiny_tutorial_files/lap/9_1.png) + +In the initialization page, you need to type in the transcription factors manually.
You also need to specify the type of transition (development, reprogramming, or transdifferentiation). All those pieces of information will be saved in a dictionary, just as in the tutorial “Most probable path predictions”. There is no need to modify the default value of “Key to save TFs”, “Keys to save TFs rank” and “main key” unless you want to specify multiple groups of transcription factors for one transition. + +![LAP App Screenshot 21](../_static/Shiny_tutorial_files/lap/9_2.jpg) + +Here we add known transcription factors to HSC->Meg and specify it as a development transition. + +![LAP App Screenshot 22](../_static/Shiny_tutorial_files/lap/9_3.jpg) + +Click the “Add transition info” button. + +![LAP App Screenshot 23](../_static/Shiny_tutorial_files/lap/9_4.jpg) + +The transition information will be displayed on the right. + +![LAP App Screenshot 24](../_static/Shiny_tutorial_files/lap/9_5.png) + +Then we keep adding transition information for HSC->Mon, Meg->HSC, and Mon->Meg. + +![LAP App Screenshot 25](../_static/Shiny_tutorial_files/lap/9_6.jpg) + +Once all transition information has been added, click the “Analyze with current TFs” button. + +![LAP App Screenshot 26](../_static/Shiny_tutorial_files/lap/9_7.png) + +The first plot is the visualization of priority scores. Here we will convert the rankings of known TFs to a priority score, simply defined as `1 - rank / number of TFs`. From the above plot, you can observe that our prediction works very well. The majority of the known TFs of the known transitions are prioritized as > 0.8. + +![LAP App Screenshot 27](../_static/Shiny_tutorial_files/lap/9_8.png) + +The last visualization is the receiver operating characteristic (ROC) curve analysis of LAP. The ROC curve evaluates the TF prediction when using all known genes of all known transitions as the gold standard. The result illustrates that LAP predictions and TF prioritization work well.
diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md new file mode 100644 index 000000000..f15b310fc --- /dev/null +++ b/docs/user_guide/index.md @@ -0,0 +1,295 @@ +# User guide + +## 10 minutes to dynamo + +Welcome to dynamo! + +Dynamo is a computational framework that includes an inclusive model of expression dynamics with scSLAM-seq / multiomics, vector field reconstruction, and potential landscape mapping. + +## Why dynamo + +Dynamo currently provides a complete solution (see below) to analyze expression dynamics of conventional scRNA-seq or time-resolved metabolic labeling based scRNA-seq. It aspires to become the leading tool in continuously integrating the most exciting developments in machine learning, systems biology, information theory, stochastic physics, etc. to model, understand, and interpret datasets generated from various cutting-edge single-cell genomics techniques (developments of dynamo 2/3 are underway). We hope those models, understandings, and interpretations not only facilitate your research but may also eventually lead to new biological discovery. Dynamo has a strong community so you will feel supported no matter whether you are a newcomer to computational biology or a veteran researcher who wants to contribute to dynamo's development. + +## How to install + +Dynamo requires Python 3.6 or later. + +Dynamo now has been released to PyPi, you can install the PyPi version via: + +```bash +pip install dynamo-release +``` + +To install the newest version of dynamo, you can git clone our repo and then pip install: + +```bash +git clone https://github.com/aristoteleo/dynamo-release.git +pip install dynamo-release/ --user +``` + +Conda version is also available: + +```bash +conda install -c conda-forge dynamo-release +``` + +Note that `--user` flag is used to install the package to your home directory, in case you don't have root privilege. 
+ +Alternatively, you can install dynamo when you are in the dynamo-release folder by directly using python's setup install: + +```bash +git clone https://github.com/aristoteleo/dynamo-release.git +cd dynamo-release/ +python setup.py install --user +``` + +You can also install dynamo directly from source, using the following command: + +```bash +pip install git+https://github.com/aristoteleo/dynamo-release +``` + +In order to ensure dynamo runs properly, your python environment needs to satisfy dynamo's [dependencies](https://github.com/aristoteleo/dynamo-release/blob/master/setup.py). We provide a helper function for you to check the versions of all of dynamo's dependencies: + +```python +import dynamo as dyn +dyn.session_info() +``` + +## Architecture of dynamo + +![Dynamo Architecture](https://user-images.githubusercontent.com/7456281/93838294-2026f600-fc57-11ea-971b-c3ececba0d85.png) + +Dynamo has a few standard modules like most other single cell analysis toolkits (Scanpy, Monocle, or Seurat), for example, data loading (`dyn.read*`), preprocessing (`dyn.pp.*`), tool analysis (`dyn.tl.*`), and plotting (`dyn.pl.*`). Modules specific to dynamo include: + +- a comprehensive estimation framework (`dyn.est.*`) of expression dynamics that includes: + - conventional single cell RNA-seq (scRNA-seq) modeling (`dyn.est.csc.*`) for **standard RNA velocity estimation** and more; + - time-resolved metabolic labeling based single cell RNA-seq (scRNA-seq) modeling (`dyn.est.tsc.*`) for **labeling based RNA velocity estimation** and more; +- vector field reconstruction and vector calculus (`dyn.vf.*`); +- cell fate prediction (`dyn.pd.*`); +- movie creation for cell fate predictions (`dyn.mv.*`); +- stochastic simulation of various metabolic labeling experiments (`dyn.sim.*`); +- integration with external tools built by us or others (`dyn.ext.*`); +- and more. 
+ +## Typical workflow + +![Typical Workflow](https://user-images.githubusercontent.com/7456281/93838305-2b7a2180-fc57-11ea-8ec8-552b75446e32.png) + +A typical workflow in dynamo is similar to most of other single cell analysis toolkits (Scanpy, Monocle, or Seurat), including steps like importing dynamo (`import dynamo as dyn`), loading data (`dyn.read*`), preprocessing (`dyn.pp.*`), tool analysis (`dyn.tl.*`) and plotting (`dyn.pl.*`). To get the best of dynamo though, you need to use the `dyn.vf.*`, `dyn.pd.*` and `dyn.mv.*` modules. + +### Import dynamo + +```python +import dynamo as dyn +``` + +We provide a few nice visualization defaults for different purposes: + +```python +dyn.configuration.set_figure_params('dynamo', background='white') # jupyter notebooks +dyn.configuration.set_figure_params('dynamo', background='black') # presentation +dyn.configuration.set_pub_style() # manuscript +``` + +### Load data + +Dynamo relies on [anndata](https://anndata.readthedocs.io/en/latest/index.html) for data IO. You can read your own data via `read`, `read_loom`, `read_h5ad`, `read_h5` or `load_NASC_seq`, etc: + +```python +adata = dyn.read(filename) +``` + +Dynamo also comes with a few built-in sample datasets so you can familiarize with dynamo before analyzing your own dataset. For example, you can load the Dentate Gyrus example dataset: + +```python +adata = dyn.sample_data.DentateGyrus() +``` + +There are many sample datasets available. You can check other available datasets via `dyn.sample_data.*`. + +To process the scSLAM-seq data, please refer to the [NASC-seq analysis pipeline](https://github.com/sandberg-lab/NASC-seq). We are also working on a command line tool for this and will release it in due time. For processing splicing data, you can either use the [velocyto command line interface](http://velocyto.org/velocyto.py/tutorial/cli.html) or the [bustool from Pachter lab](http://pachterlab.github.io/kallistobus). 
+ +### Preprocess data + +After loading data, you are ready to perform some preprocessing. You can use the `Preprocessor` class, which applies a similar but generalized strategy to that of [Monocle 3](https://cole-trapnell-lab.github.io/monocle3/) to normalize all datasets in different layers (the spliced and unspliced or new, i.e., metabolically labeled, and total mRNAs or others), followed by feature selection and PCA dimension reduction: + +```python +from dynamo.preprocessing import Preprocessor +preprocessor = Preprocessor() +preprocessor.preprocess_adata(adata, recipe="monocle") +``` + +### Learn dynamics + +Next, you will want to estimate the kinetic parameters of expression dynamics and then learn the velocity values for all genes that pass some filters (selected feature genes, by default) across cells. The `dyn.tl.dynamics` function does all the hard work for you: + +```python +dyn.tl.dynamics(adata) +``` + +It implicitly calls `dyn.tl.moments` first: + +```python +dyn.tl.moments(adata) +``` + +which calculates the first and second moments (and sometimes covariance between different layers) of the expression data. First / second moments are basically the mean and uncentered variance of gene expression, which are calculated based on local smoothing via a nearest neighbors graph, constructed in the reduced PCA space from the spliced or total mRNA expression of single cells. + +It then performs the following steps: + +- checks the data you have and determines the experimental type automatically, either the conventional scRNA-seq, `kinetics`, `degradation` or `one-shot` single-cell metabolic labeling experiment or the `CITE-seq` or `REAP-seq` co-assay, etc. 
+- learns the velocity for each feature gene using either the original deterministic model based on a steady-state assumption from the seminal RNA velocity work or a few new methods, including the `stochastic` (default) or `negative binomial method` for conventional scRNA-seq or `kinetic`, `degradation` or `one-shot` models for metabolic labeling based scRNA-seq. + +Those later methods are based on moment equations. All those methods use all or part of the output from `dyn.tl.moments(adata)`. + +Kinetic estimation of the conventional scRNA-seq and metabolic labeling based scRNA-seq is often tricky and has a lot of pitfalls. Sometimes you may even observe undesired backward vector flow. You can evaluate the confidence of gene-wise velocity via: + +```python +dyn.tl.gene_wise_confidence(adata, group='group', lineage_dict={'Progenitor': ['terminal_cell_state']}) +``` + +and filter those low-confidence genes for downstream Velocity vectors analysis, etc (See more details in FAQ). + +### Dimension reduction + +By default, we use the `umap` algorithm for dimension reduction. + +```python +dyn.tl.reduceDimension(adata) +``` + +If the requested reduced dimension already exists, dynamo won't touch it unless you set `enforce=True`. + +```python +dyn.tl.reduceDimension(adata, basis='umap', enforce=True) +``` + +### Velocity vectors + +We need to project the velocity vector onto low-dimensional embedding for later visualization. To get there, we can either use the default `correlation/cosine kernel` or the novel Itô kernel from us. 
+ +```python +dyn.tl.cell_velocities(adata) +``` + +The above function projects and evaluates velocity vectors on `umap` space, but you can also operate them on other bases, for example, `pca` space: + +```python +dyn.tl.cell_velocities(adata, basis='pca') +``` + +You can check the confidence of cell-wise velocity to understand how reliable the recovered velocity is across cells via: + +```python +dyn.tl.cell_wise_confidence(adata) +``` + +Obviously, dynamo doesn't stop here. The really exciting part of dynamo lies in the fact that it learns a `functional form of vector field` in the full transcriptomic space, which can be then used to predict cell fate and map single cell potential landscape. + +### Vector field reconstruction + +In classical physics, including fluidics and aerodynamics, velocity and acceleration vector fields are used as fundamental tools to describe motion or external force of objects, respectively. In analogy, RNA velocity or protein accelerations estimated from single cells can be regarded as sparse samples in the velocity (La Manno et al. 2018) or acceleration vector field (Gorin, Svensson, and Pachter 2019) that defined on the gene expression space. + +In general, a vector field can be defined as a vector-valued function f that maps any points (or cells’ expression state) x in a domain Ω with D dimension (or the gene expression system with D transcripts / proteins) to a vector y (for example, the velocity or acceleration for different genes or proteins), that is f(x) = y. + +To formally define the problem of velocity vector field learning, we consider a set of measured cells with pairs of current and estimated future expression states. The difference between the predicted future state and current state for each cell corresponds to the velocity vector. We note that the measured single-cell velocity (conventional RNA velocity) is sampled from a smooth, differentiable vector field f that maps from xi to yi on the entire domain. 
Normally, single-cell velocity measurements are results of biased, noisy, and sparse sampling of the entire state space, thus the goal of velocity vector field reconstruction is to robustly learn a mapping function f that outputs yj given any point xj on the domain based on the observed data with certain smoothness constraints (Jiayi Ma et al. 2013). Under ideal scenario, the mapping function f should recover the true velocity vector field on the entire domain and predict the true dynamics in regions of expression space that are not sampled. To reconstruct vector field function in dynamo, you can simply use the following function to do all the heavy-lifting: + +```python +dyn.vf.VectorField(adata) +``` + +By default, it learns the vector field in the `pca` space, but you can of course learn it in any space or even the original gene expression space. + +### Characterize vector field topology + +Since we learn the vector field function of the data, we can then characterize the topology of the full vector field space. For example, we are able to identify + +- the fixed points (attractor/saddles, etc.) which may correspond to terminal cell types or progenitors; +- nullcline, separatrices of a recovered dynamic system, which may formally define the dynamical behavior or the boundary of cell types in gene expression space. + +Again, you only need to simply run the following function to get all that information. + +```python +dyn.vf.topography(adata, basis='umap') +``` + +### Predict cell fate + +Cell fate prediction is a crucial problem in single cell analysis. With the continuous vector field function learned, Dynamo is able to calculate the historical and future cell states over arbitrary time scales. You can use the following function to predict the cell fate on given initial cells. 
+ +```python +dyn.pd.fate(adata, init_cells) +``` + +### Map potential landscape + +The concept of potential landscape is widely appreciated across various biological disciplines, for example, the adaptive landscape in population genetics, the protein-folding funnel landscape in biochemistry, and the epigenetic landscape in developmental biology. In the context of cell fate transition, for example, differentiation, carcinogenesis, etc., a potential landscape will not only offer an intuitive description of the global dynamics of the biological process but also provide key insights to understand the multi-stability and transition rate between different cell types as well as to quantify the optimal path of cell fate transition. + +The classical definition of potential function in physics requires gradient systems (no `curl` or cycling dynamics), which is often not applicable to open biological systems. In dynamo, we therefore provide several ways to quantify the potential of single cells by decomposing the vector field into gradient, curl parts, etc. The recommended method is built on the Hodge decomposition on simplicial complexes (a sparse directional graph) constructed based on the learned vector field function that provides a fruitful analogy of gradient, curl, and harmonic (cyclic) flows on the manifold: + +```python +dyn.ext.ddhodge(adata) +``` + +In addition, we and others proposed various strategies to decompose the `stochastic differential equations` into either the gradient or the curl component from first principles. We can then use the gradient part to define the potential. + +Although an analytical decomposition on the reconstructed vector field is challenging, we are able to use a numerical algorithm we recently developed for our purpose. This approach uses a least action method under the A-type stochastic integration (Shi et al. 2012) to globally map the potential landscape Ψ(x) (Tang et al. 2017) by taking the vector field function f(x) as input. 
+ +```python +dyn.vf.Potential(adata) +``` + +### Visualization + +In two or three dimensions, a streamline plot can be used to visualize the paths that cells will follow if released in different regions of the gene expression state space under a steady flow field. Although we currently do not support this, for vector fields that change over time, similar methods, for example, streakline, pathline, timeline, etc., can also be used to visualize the evolution of single cells or cell populations. + +In dynamo, we have three standard visual representations of vector fields, including the `cell wise` and `grid` quiver plots, and the `streamline plot`. Another intuitive way to visualize the structure of a vector field is the so-called line integral convolution method or LIC (Cabral and Leedom 1993), which works by adding random black-and-white paint sources on the vector field and letting the particles flowing along the vector field pick up some texture, so that points on the same streamline have similar intensity. We rely on yt's `annotate_line_integral_convolution` function to visualize the LIC vector field reconstructed from dynamo. + +```python +dyn.pl.cell_wise_vectors(adata, color=colors, ncols=3) +dyn.pl.grid_vectors(adata, color=colors, ncols=3) +dyn.pl.streamline_plot(adata, color=colors, ncols=3) +dyn.pl.line_integral_conv(adata) +``` + +Note that `colors` here is a list or str that can be either the column name in `.obs` or `gene names`. + +To visualize the topological structure of the reconstructed 2D vector fields, we provide the `dyn.pl.topography` function in dynamo. + +```python +dyn.vf.VectorField(adata, basis='umap') +dyn.pl.topography(adata) +``` + +Plotting functions in dynamo are designed to be extremely flexible. For example, you can combine different types of dynamo plots together (when you visualize only one item for each plot function). 
+ +```python +import matplotlib.pyplot as plt +fig1, f1_axes = plt.subplots(ncols=2, nrows=2, constrained_layout=True, figsize=(12, 10)) +f1_axes[0, 0] = dyn.pl.cell_wise_vectors(adata, color='umap_ddhodge_potential', pointsize=0.1, alpha=0.7, ax=f1_axes[0, 0], quiver_length=6, quiver_size=6, save_show_or_return='return') +f1_axes[0, 1] = dyn.pl.grid_vectors(adata, color='speed_umap', ax=f1_axes[0, 1], quiver_length=12, quiver_size=12, save_show_or_return='return') +f1_axes[1, 0] = dyn.pl.streamline_plot(adata, color='divergence_pca', ax=f1_axes[1, 0], save_show_or_return='return') +f1_axes[1, 1] = dyn.pl.topography(adata, color='acceleration_umap', ax=f1_axes[1, 1], save_show_or_return='return') +plt.show() +``` + +The above creates a 2x2 plot that puts `cell_wise_vectors`, `grid_vectors`, `streamline_plot`, and `topography` plots together. + +Last but not least, Dynamo also provides functions to create a movie of cell fate predictions. The animation needs predicted cell fate information in your dataset and requires a topography plot as the foundation. + +```python +from matplotlib import animation + +fig, ax = plt.subplots() +ax = dyn.pl.topography(adata, ax=ax) +instance = dyn.mv.StreamFuncAnim(adata=adata, ax=ax) +anim = animation.FuncAnimation(instance.fig, instance.update, init_func=instance.init_background) +``` + +## Compatibility + +Dynamo is fully compatible with velocyto, scanpy, and scvelo, so you can use your loom or anndata object as input for dynamo. The velocity vector samples estimated from either velocyto or scvelo can also be directly used to reconstruct the functional form of the vector field and to map the potential landscape in the entire expression space. 
\ No newline at end of file diff --git a/dynamo/__init__.py b/dynamo/__init__.py index 870f8c30e..1a4a5fcf0 100755 --- a/dynamo/__init__.py +++ b/dynamo/__init__.py @@ -22,6 +22,7 @@ from . import sample_data from . import configuration from . import ext +from . import multi from .data_io import * from .dynamo_logger import ( diff --git a/dynamo/multi.py b/dynamo/multi.py new file mode 100644 index 000000000..9b876f67e --- /dev/null +++ b/dynamo/multi.py @@ -0,0 +1 @@ +from .multivelo import * \ No newline at end of file diff --git a/dynamo/multivelo/ATACseqTools.py b/dynamo/multivelo/ATACseqTools.py new file mode 100644 index 000000000..dc5b9ce37 --- /dev/null +++ b/dynamo/multivelo/ATACseqTools.py @@ -0,0 +1,407 @@ +import anndata as ad +from anndata import AnnData + +from concurrent.futures import as_completed, ThreadPoolExecutor + +from mudata import MuData +import numpy as np +from os import PathLike +import pandas as pd + + +from scipy.sparse import coo_matrix, csr_matrix, diags, hstack +from tqdm import tqdm +from typing import ( + Literal, + Union +) + +# Imports from dynamo +from ..dynamo_logger import ( + LoggerManager, + main_info, +) + + +# Imports from MultiDynamo +from .MultiConfiguration import MDKM + + +def extend_gene_coordinates( + bedtool, + upstream: int = 2000, + downstream: int = 0 +): + from pybedtools import BedTool + extended_genes = [] + for feature in bedtool: + if feature[2] == 'gene': + start = max(0, int(feature.start) - upstream) # NOTE(review): strand is never consulted, so 'upstream' always means lower coordinates - confirm this is intended for '-' strand genes + end = int(feature.end) + downstream + extended_genes.append((feature.chrom, start, end, feature.name)) + return BedTool(extended_genes) + + +def annotate_integrated_mdata(mdata: MuData, + celltypist_model: str = 'Immune_All_Low.pkl' + ) -> MuData: + import celltypist + from celltypist import models + import scanpy as sc + # Extract the RNA data + rna_adata = mdata.mod['rna'].copy() + + # ... revert to counts + rna_adata.X = rna_adata.layers['counts'].copy() + + # ... 
normalize counts so total number per cell is 10,000 (required by celltypist) + sc.pp.normalize_total(rna_adata, + target_sum=1e4) + + # ... pseudo-log transform (x -> log(1 + x)) for better dynamical range (and required by celltypist) + sc.pp.log1p(rna_adata) + + # ... rerun PCA - CellTypist can need larger number than we already computed + sc.pp.pca(rna_adata, n_comps=50) + + # ... recompute the neighborhood graph for majority voting + sc.pp.neighbors(rna_adata, + n_neighbors=50, + n_pcs=50) + + # Download celltypist models for annotation + models.download_models(force_update=True) + + # Select the low resolution immune cell model + model = models.Model.load(model=celltypist_model) + + # Compute cell type labels + predictions = celltypist.annotate(rna_adata, + model=model, # reuse the model loaded above instead of resolving it again by name + majority_voting=True) + + # Transfer the predictions back to the RNA AnnData object + rna_adata = predictions.to_adata() + + # Create dictionary from cell indices to cell types + cellindex_to_celltype_dict = rna_adata.obs['majority_voting'].to_dict() + + # Apply the index map to both RNA and ATAC AnnData objects + atac_adata, rna_adata = mdata.mod['atac'].copy(), mdata.mod['rna'].copy() + atac_adata.obs['cell_type'] = atac_adata.obs.index.map( + lambda cell_idx: cellindex_to_celltype_dict.get(cell_idx, 'Undefined')) + rna_adata.obs['cell_type'] = rna_adata.obs.index.map( + lambda cell_idx: cellindex_to_celltype_dict.get(cell_idx, 'Undefined')) + + return MuData({'atac': atac_adata.copy(), 'rna': rna_adata.copy()}) + + +def gene_activity( + atac_adata: AnnData, + gtf_path: PathLike, + upstream: int = 2000, + downstream: int = 0 +) -> Union[AnnData, None]: + from pybedtools import BedTool + # Drop UCSC convention for naming of chromosomes - This assumes we are using ENSEMBL-format of GTF + atac_adata.var.index = [c[3:] if c.startswith('chr') else c for c in atac_adata.var.index] # remove only the literal 'chr' prefix; str.lstrip('chr') strips a character set and mangles names such as 'hs37d5' + + # Read GTF to annotate genes + main_info('reading GTF', indent_level=3) + gene_annotations = BedTool(gtf_path) + + # Extend 
gene coordinates + main_info('extending genes to estimate regulatory regions', indent_level=3) + peak_extension_logger = LoggerManager.gen_logger('extend_genes') + peak_extension_logger.log_time() + + extended_genes = extend_gene_coordinates(gene_annotations, + upstream=upstream, + downstream=downstream) + + peak_extension_logger.finish_progress(progress_name='extend_genes', indent_level=3) + + # Extract ATAC-seq peak coordinates + chrom_list = atac_adata.var_names.str.split(':').str[0] # .astype(int) + start_list = atac_adata.var_names.str.split(':').str[1].str.split('-').str[0] # .astype(int).astype(int) + end_list = atac_adata.var_names.str.split(':').str[1].str.split('-').str[1] # .astype(int).astype(int) + + # Convert ATAC-seq peak data to BedTool format + atac_peaks = BedTool.from_dataframe(pd.DataFrame({ + 'chrom': chrom_list, + 'start': start_list, + 'end': end_list + })) + + # Find overlaps between peaks and extended genes + main_info('overlapping peaks and extended genes', indent_level=3) + linked_peaks = atac_peaks.intersect(extended_genes, wa=True, wb=True) + + # Create a DataFrame from the linked peaks + linked_peaks_df = linked_peaks.to_dataframe( + names=['chrom', 'peak_start', 'peak_end', 'chrom_gene', 'gene_start', 'gene_end', 'gene_name']) + + # Create a dictionary to map peak indices to gene names + main_info('building dictionaries', indent_level=3) + peak_to_gene = linked_peaks_df.set_index(['chrom', 'peak_start', 'peak_end'])['gene_name'].to_dict() + peak_to_gene = {f'{chrom}:{start}-{end}': gene_name for (chrom, start, end), gene_name in peak_to_gene.items()} + + # Get the list of peaks from the ATAC-seq data + peaks = atac_adata.var.index + gene_names = np.array([peak_to_gene.get(peak, '') for peak in peaks]) + + # Get the unique genes, leaving out the '' placeholder used for peaks that overlap no gene + unique_genes = np.unique([gene for gene in gene_names if gene]) + + # Initialize a sparse matrix for gene activity scores + n_cells, n_genes = atac_adata.n_obs, len(unique_genes) + + # Create a mapping from gene names to column indices 
in the sparse matrix + gene_to_idx = {gene: idx for idx, gene in enumerate(unique_genes)} + + def process_peak(i): + gene = gene_names[i] + return gene_to_idx.get(gene, -1), atac_adata[:, i].X + + # Fill the sparse matrix with aggregated counts in parallel + results = [] + with ThreadPoolExecutor() as executor: + futures = [executor.submit(process_peak, i) for i in range(len(peaks))] + with tqdm(total=len(peaks), desc="Processing peaks") as pbar: + for future in as_completed(futures): + result = future.result() + if result[0] >= 0: # column -1 marks a peak without a linked gene; it must not reach the COO matrix + results.append(result) + pbar.update(1) + + # Aggregate results in batches to minimize overhead + main_info('aggregating results', indent_level=3) + aggregation_logger = LoggerManager.gen_logger('aggregating_results') + aggregation_logger.log_time() + + data = [] + rows = [] + cols = [] + + # Loop through the results to gather data for COO matrix + for col_idx, sparse_col_vector in results: + # Extract row indices and data from the sparse column vector + coo = sparse_col_vector.tocoo() + data.extend(coo.data) + rows.extend(coo.row) + cols.extend([col_idx] * len(coo.row)) + + # Create a COO matrix from collected data (duplicate (row, col) entries are summed on conversion to CSR) + coo_matrix_all = coo_matrix((data, (rows, cols)), shape=(n_cells, n_genes)) + + # Convert COO matrix to CSR format + gene_activity_matrix = coo_matrix_all.tocsr() + + aggregation_logger.finish_progress(progress_name='aggregating_results', indent_level=3) + + # Add the sparse gene activity matrix as a new .obsm + atac_adata.obsm[MDKM.ATAC_GENE_ACTIVITY_KEY] = gene_activity_matrix + atac_adata.uns[MDKM.ATAC_GENE_ACTIVITY_GENES_KEY] = pd.Index(unique_genes) + + return atac_adata + + +def integrate(mdata: MuData, + integration_method: Literal['moscot', 'multivi'] = 'multivi', + alpha: float = 0.5, + entropic_regularization: float = 0.01, + gtf_path: Union[PathLike, str] = None, + max_epochs: int = 500, + lr: float = 0.0001, + ) -> MuData: + # Split into scATAC-seq and scRNA-seq AnnData objects + atac_adata, rna_adata = 
mdata.mod['atac'].copy(), mdata.mod['rna'].copy() + atac_adata.obs['modality'], rna_adata.obs['modality'] = 'atac', 'rna' + + if atac_adata.uns[MDKM.MATCHED_ATAC_RNA_DATA_KEY]: + main_info('Integration: matched multiome, so just filtering cells') + + # Restrict to cells common to both AnnData objects + shared_cells = pd.Index(np.intersect1d(rna_adata.obs_names, atac_adata.obs_names)) + atac_adata_filtered = atac_adata[shared_cells, :].copy() + rna_adata_filtered = rna_adata[shared_cells, :].copy() + + return MuData({'atac': atac_adata_filtered, 'rna': rna_adata_filtered}) + elif integration_method == 'moscot': + return integrate_via_moscot(mdata=mdata, + alpha=alpha, + entropic_regularization=entropic_regularization) + elif integration_method == 'multivi': + return integrate_via_multivi(mdata=mdata, + gtf_path=gtf_path, + lr=lr, + max_epochs=max_epochs) + else: + raise ValueError(f'Unknown integration method {integration_method} requested.') + + +def integrate_via_moscot(mdata: MuData, + alpha: float = 0.7, + entropic_regularization: float = 0.01, + gtf_path: Union[PathLike, str] = None, + ) -> MuData: + raise NotImplementedError("Integration via moscot is not implemented yet; use integration_method='multivi'.") # fail loudly instead of silently returning None where a MuData is promised + + +def integrate_via_multivi(mdata: MuData, + gtf_path: Union[PathLike, str] = None, + lr: float = 0.0001, + max_epochs: int = 500, + ) -> MuData: + import scvi + main_info('Integration via MULTIVI ...') + integration_logger = LoggerManager.gen_logger('integration_via_multivi') + integration_logger.log_time() + + # Split into scATAC-seq and scRNA-seq AnnData objects + atac_adata, rna_adata = mdata.mod['atac'].copy(), mdata.mod['rna'].copy() + atac_adata.obs['modality'], rna_adata.obs['modality'] = 'atac', 'rna' + + # Check whether cell indices need to be prepended by 'atac' and 'rna' + if ':' not in atac_adata.obs_names[0]: + atac_adata.obs_names = atac_adata.obs_names.map(lambda x: f'atac:{x}') + num_atac_cells, num_atac_peaks = atac_adata.n_obs, atac_adata.n_vars + + if ':' not in rna_adata.obs_names[0]: + rna_adata.obs_names = 
rna_adata.obs_names.map(lambda x: f'rna:{x}') + num_rna_cells = rna_adata.n_obs + + # Check whether gene activity was pre-computed + if MDKM.ATAC_GENE_ACTIVITY_KEY not in atac_adata.obsm.keys(): + main_info('Computing gene activities', indent_level=2) + atac_adata = gene_activity(atac_adata=atac_adata, + gtf_path=gtf_path) + gene_activity_matrix = atac_adata.obsm[MDKM.ATAC_GENE_ACTIVITY_KEY] + + # Restrict to gene names common to gene activity matrix from atac-seq data and + # counts matrix from rna-seq data + gene_names_atac = atac_adata.uns[MDKM.ATAC_GENE_ACTIVITY_GENES_KEY] + gene_names_rna = rna_adata.var_names + common_genes = gene_names_rna.intersection(gene_names_atac) + num_genes = len(common_genes) + + # Filter gene activity and scATAC-seq data into a single AnnData object, with a + # batch label indicating the origin + main_info('Preparing ATAC-seq data for MULTIVI', indent_level=2) + gene_activity_filtered = gene_activity_matrix[:, [gene_names_atac.get_loc(gene) for gene in common_genes]] + + # Assemble multi-ome for the ATAC-seq data + # ... X + atac_multiome_X = hstack([gene_activity_filtered, atac_adata.X]) + + # ... obs + atac_multiome_obs = atac_adata.obs[['modality']].copy() + + # ... var + multiome_var = pd.concat((rna_adata.var.loc[common_genes].copy(), atac_adata.var.copy()), axis=1) + + atac_multiome = AnnData(X=csr_matrix(atac_multiome_X), + obs=atac_multiome_obs, + var=multiome_var) + + # Assemble multi-ome for RNA-seq data + main_info('Preparing RNA-seq data for MULTIVI', indent_level=2) + rna_adata_filtered = rna_adata[:, common_genes].copy() + + # ... X + rna_multiome_X = hstack([rna_adata_filtered.X.copy(), csr_matrix((num_rna_cells, num_atac_peaks))]) + + # ... obs + rna_multiome_obs = rna_adata_filtered.obs[['modality']].copy() + + # ... 
var - NTD + + rna_multiome = AnnData(X=csr_matrix(rna_multiome_X), + obs=rna_multiome_obs, + var=multiome_var) + + # Concatenate the data + combined_adata = ad.concat([atac_multiome, rna_multiome], axis=0) + + # Setup AnnData object for scvi-tools + main_info('Setting up combined data for MULTIVI', indent_level=2) + scvi.model.MULTIVI.setup_anndata(combined_adata, batch_key='modality') + + # Instantiate the SCVI model + main_info('Instantiating MULTIVI model', indent_level=2) + multivi_model = scvi.model.MULTIVI(adata=combined_adata, n_genes=num_genes, n_regions=num_atac_peaks) + + # Train the model + main_info('Training MULTIVI model', indent_level=2) + multivi_model.train(max_epochs=max_epochs, lr=lr) + + # Extract the latent representation + combined_adata.obsm['latent'] = multivi_model.get_latent_representation() + + # Impute counts from latent space + # ... X + main_info('Imputing RNA expression', indent_level=2) + imputed_rna_X = multivi_model.get_normalized_expression() + + # ... obs + multiome_obs = pd.concat((atac_multiome_obs, rna_multiome_obs)) + + # ... var + rna_multiome_var = rna_adata.var.loc[common_genes].copy() + + imputed_rna_adata = AnnData(X=imputed_rna_X, + obs=multiome_obs, + var=rna_multiome_var, + ) + + # ... X + main_info('Imputing accessibility', indent_level=2) + imputed_atac_X = multivi_model.get_accessibility_estimates() + + # ... obs - NTD + + # ... 
# ChromatinVelocity class - patterned after MultiVelo, but retains accessibility at individual CRE
class ChromatinVelocity:
    """Steady-state velocity fit for one gene using chromatin, unspliced and spliced moments.

    Unlike MultiVelo, chromatin accessibility ``c`` is kept as a
    ``(n_cells, n_peaks)`` array (one column per peak) rather than a vector.

    Args:
        c: chromatin accessibility, dense or SciPy-sparse, ``(n_cells, n_peaks)``.
            A 1-D input is treated as a single peak.
        u: unspliced first moment, one value per cell (dense or sparse).
        s: spliced first moment, one value per cell (dense or sparse).
        ss: second moment ``ss`` or ``None`` (needed only by ``compute_stochastic``).
        us: second moment ``us`` or ``None`` (needed only by ``compute_stochastic``).
        uu: second moment ``uu`` or ``None``.
        fit_args: optional dict; only the key ``'outlier'`` (a percentile) is
            read here and it is clipped to ``[80, 100]``.
        gene: identifier stored on the instance, not otherwise used here.
        r2_adjusted: if True, ``compute_deterministic`` refits the slope on all
            filtered cells before computing R^2.
    """

    # Percentile used for outlier removal when ``fit_args`` does not supply one.
    # NOTE(review): 99 mirrors MultiVelo's usual default - confirm for this package.
    DEFAULT_OUTLIER = 99

    def __init__(self,
                 c,
                 u,
                 s,
                 ss,
                 us,
                 uu,
                 fit_args=None,
                 gene=None,
                 r2_adjusted=False):
        # ``fit_args`` defaults to None, so it must not be subscripted directly
        fit_args = {} if fit_args is None else fit_args

        self.gene = gene
        self.outlier = np.clip(fit_args.get('outlier', self.DEFAULT_OUTLIER), a_min=80, a_max=100)
        self.r2_adjusted = r2_adjusted

        # Convert all sparse inputs to dense ones (``None`` passes through)
        c, u, s, ss, us, uu = (self._densify(x) for x in (c, u, s, ss, us, uu))

        # In distinction to MultiVelo c will be (total_n, n_peak) array
        c = np.asarray(c)
        if c.ndim == 1:
            c = c.reshape(-1, 1)

        # Sweep the minimum value in each column from the array
        self.offset_c = np.min(c, axis=0)
        self.c_all = c - self.offset_c

        # The other moments are (total_n, ) arrays
        self.s_all, self.u_all = self._as_vector(s), self._as_vector(u)

        # Count cells only after densifying: len() is not defined for sparse input
        self.total_n = self.u_all.size

        self.offset_s, self.offset_u = np.min(self.s_all), np.min(self.u_all)
        self.s_all -= self.offset_s
        self.u_all -= self.offset_u

        # For 'stochastic' method also need second moments
        self.ss_all = None if ss is None else self._as_vector(ss)
        self.us_all = None if us is None else self._as_vector(us)
        self.uu_all = None if uu is None else self._as_vector(uu)

        # Ensure at least one element in each cell is positive
        any_c_positive = np.any(self.c_all > 0, axis=1)
        self.non_zero = np.ravel(any_c_positive) | np.ravel(self.u_all > 0) | np.ravel(self.s_all > 0)

        # remove outliers
        # ... for chromatin, we'll be more stringent - if *any* peak count for a cell
        #     is an outlier, we'll remove that cell
        self.non_outlier = np.all(self.c_all <= np.percentile(self.c_all, self.outlier, axis=0), axis=1)
        self.non_outlier &= np.ravel(self.u_all <= np.percentile(self.u_all, self.outlier))
        self.non_outlier &= np.ravel(self.s_all <= np.percentile(self.s_all, self.outlier))

        keep = self.non_zero & self.non_outlier
        self.c = self.c_all[keep]
        self.u = self.u_all[keep]
        self.s = self.s_all[keep]
        self.ss = None if self.ss_all is None else self.ss_all[keep]
        self.us = None if self.us_all is None else self.us_all[keep]
        self.uu = None if self.uu_all is None else self.uu_all[keep]
        self.low_quality = len(self.u) < 10

        # 4 rate parameters
        self.alpha_c = 0.1
        self.alpha = 0.0
        self.beta = 0.0
        self.gamma_det = 0.0
        self.gamma_stoch = 0.0

        # other parameters or results
        self.loss_det = np.inf
        self.loss_stoch = np.inf
        self.r2_det = 0
        self.r2_stoch = 0
        self.residual_det = None
        self.residual_stoch = None
        self.residual2_stoch = None

        self.steady_state_func = None

        # Select the cells for regression
        self.w_sub = self._select_regression_cells()
        if np.sum(self.w_sub) < 10:
            self.low_quality = True

    @staticmethod
    def _densify(x):
        """Return a dense array for SciPy-sparse input; pass anything else through."""
        # ``toarray()`` works for both sparse matrices and sparse arrays,
        # whereas ``.A`` is not available on sparse arrays in recent SciPy.
        return x.toarray() if issparse(x) else x

    @staticmethod
    def _as_vector(x):
        """Return a fresh flat float64 copy of ``x``."""
        return np.ravel(np.array(x, dtype=np.float64))

    def _select_regression_cells(self):
        """Return the per-cell boolean mask (over filtered cells) used for regression."""
        n_cells = self.u.shape[0]
        if n_cells == 0:
            # Nothing survived filtering; np.max would raise on empty input
            return np.zeros(0, dtype=bool)

        u_s_high = (self.u >= 0.1 * np.max(self.u)) & (self.s >= 0.1 * np.max(self.s))

        # First pass: any peak at >= 10% of that peak's maximum
        first_pass = np.any(self.c >= 0.1 * np.max(self.c, axis=0), axis=1) & u_s_high
        c_sub = self.c[first_pass]
        if c_sub.shape[0] == 0:
            return np.zeros(n_cells, dtype=bool)

        # Second pass: any peak above mean + std of the first-pass cells.
        # The reduction must be per cell (axis=1); without the axis the mask
        # collapses to a single scalar shared by every cell.
        threshold = np.mean(c_sub, axis=0) + np.std(c_sub, axis=0)
        return np.any(self.c >= threshold, axis=1) & u_s_high

    # This method originated from MultiVelo - Corrected R^2
    def compute_deterministic(self):
        """Fit the zero-intercept steady-state slope and store loss, residual and R^2."""
        # Steady state slope - no different than usual transcriptomic version
        u_high = self.u[self.w_sub]
        s_high = self.s[self.w_sub]
        wu_high = u_high >= np.percentile(u_high, 95)
        ws_high = s_high >= np.percentile(s_high, 95)
        extreme = wu_high | ws_high
        ss_u = u_high[extreme]
        ss_s = s_high[extreme]

        gamma_det = np.dot(ss_u, ss_s) / np.dot(ss_s, ss_s)
        residual_det = self.u_all - gamma_det * self.s_all

        # The loss always refers to the extreme-quantile fit
        loss_det = np.dot(residual_det, residual_det) / len(self.u_all)

        if self.r2_adjusted:
            gamma_det = np.dot(self.u, self.s) / np.dot(self.s, self.s)
            residual_det = self.u_all - gamma_det * self.s_all

        total_det = self.u_all - np.mean(self.u_all)
        # total_det = self.u_all  # Since fitting only slope with zero intercept, should not include mean

        # The stored function uses the final slope (the adjusted one when r2_adjusted)
        self.steady_state_func = lambda x: gamma_det * x
        self.gamma_det = gamma_det
        self.loss_det = loss_det
        self.residual_det = residual_det

        self.r2_det = 1 - np.dot(residual_det, residual_det) / np.dot(total_det, total_det)

    # This method originated from MultiVelo
    def compute_stochastic(self):
        """Fit the slope jointly on first and second moments.

        Raises:
            ValueError: if the ``ss`` or ``us`` second moments were not supplied.
        """
        if self.ss is None or self.us is None:
            raise ValueError("compute_stochastic requires the 'ss' and 'us' second moments")

        self.compute_deterministic()

        var_ss = 2 * self.ss - self.s
        cov_us = 2 * self.us + self.u
        s_all_ = 2 * self.s_all ** 2 - (2 * self.ss_all - self.s_all)
        u_all_ = (2 * self.us_all + self.u_all) - 2 * self.u_all * self.s_all
        gamma2 = np.dot(cov_us, var_ss) / np.dot(var_ss, var_ss)
        residual2 = cov_us - gamma2 * var_ss
        std_first = np.std(self.residual_det)
        std_second = np.std(residual2)

        # chromatin adjusted steady-state slope
        u_high = self.u[self.w_sub]
        s_high = self.s[self.w_sub]
        wu_high = u_high >= np.percentile(u_high, 95)
        ws_high = s_high >= np.percentile(s_high, 95)
        # Non-extreme cells are zeroed (not dropped) so lengths match the second moments
        ss_u = u_high * (wu_high | ws_high)
        ss_s = s_high * (wu_high | ws_high)
        a = np.hstack((ss_s / std_first, var_ss[self.w_sub] / std_second))
        b = np.hstack((ss_u / std_first, cov_us[self.w_sub] / std_second))

        gamma_stoch = np.dot(b, a) / np.dot(a, a)
        self.steady_state_func = lambda x: gamma_stoch * x
        self.residual_stoch = self.u_all - self.steady_state_func(self.s_all)
        self.residual2_stoch = u_all_ - self.steady_state_func(s_all_)
        loss_stoch = np.dot(self.residual_stoch, self.residual_stoch) / len(self.u_all)

        self.gamma_stoch = gamma_stoch
        self.loss_stoch = loss_stoch
        self.r2_stoch = 1 - np.dot(self.residual_stoch, self.residual_stoch) / np.dot(self.u_all, self.u_all)

    def get_gamma(self,
                  mode: Literal['deterministic', 'stochastic'] = 'stochastic'):
        """Return the fitted slope for ``mode``."""
        if mode == 'deterministic':
            return self.gamma_det
        elif mode == 'stochastic':
            return self.gamma_stoch
        else:
            main_exception(f"Unknown mode {mode} - must be one of 'deterministic' or 'stochastic'")

    def get_loss(self,
                 mode: Literal['deterministic', 'stochastic'] = 'stochastic'):
        """Return the mean squared residual for ``mode``."""
        if mode == 'deterministic':
            return self.loss_det
        elif mode == 'stochastic':
            return self.loss_stoch
        else:
            main_exception(f"Unknown mode {mode} - must be one of 'deterministic' or 'stochastic'")

    def get_r2(self,
               mode: Literal['deterministic', 'stochastic'] = 'stochastic'):
        """Return the R^2 stored for ``mode``."""
        if mode == 'deterministic':
            return self.r2_det
        elif mode == 'stochastic':
            return self.r2_stoch
        else:
            main_exception(f"Unknown mode {mode} - must be one of 'deterministic' or 'stochastic'")

    def get_variance_velocity(self,
                              mode: Literal['deterministic', 'stochastic'] = 'stochastic'):
        """Return the second-moment residual; only defined for the stochastic mode."""
        if mode == 'stochastic':
            return self.residual2_stoch
        else:
            main_exception("Should not call get_variance_velocity for mode other than 'stochastic'")

    def get_velocity(self,
                     mode: Literal['deterministic', 'stochastic'] = 'stochastic'):
        """Return the first-moment residual (over all cells) for ``mode``."""
        vel = None  # Make the lint checker happy
        if mode == 'deterministic':
            vel = self.residual_det
        elif mode == 'stochastic':
            vel = self.residual_stoch
        else:
            main_exception(f"Unknown mode {mode} - must be one of 'deterministic' or 'stochastic'")

        return vel
class MultiDynamoMdataKeyManager(DynamoAdataKeyManager):
    """Names of the slots MultiDynamo reads and writes in a MuData object.

    Keys are grouped by modality ('atac', 'cite', 'hic', 'rna') and, within a
    modality, by AnnData slot. For 'rna' most keys come from
    DynamoAdataKeyManager; the ones here are in addition to those.
    """

    # ----- universal keys (independent of modality) -----
    INFERRED_BATCH_KEY = 'inferred_batch'

    # ----- .mod['atac'] -----
    # layers
    ATAC_COUNTS_LAYER = 'counts'
    ATAC_FIRST_MOMENT_CHROM_LAYER = 'M_c'
    ATAC_TFIDF_LAYER = 'X_tfidf'  # Also X?
    ATAC_CHROMATIN_VELOCITY_LAYER = 'lifted_velo_c'
    # .obsm
    ATAC_GENE_ACTIVITY_KEY = 'gene_activity'  # Computed gene activity matrix - for unmatched data only
    ATAC_OBSM_LSI_KEY = 'X_lsi'
    ATAC_OBSM_PC_KEY = 'X_pca'
    # .uns
    ATAC_GENE_ACTIVITY_GENES_KEY = 'gene_activity_genes'  # Genes for gene activity matrix
    MATCHED_ATAC_RNA_DATA_KEY = 'matched_atac_rna_data'  # Indicates whether ATAC- and RNA-seq data are matched
    # .varm
    ATAC_VARM_LSI_KEY = 'LSI'

    # ----- .mod['cite'] -----
    # .uns
    MATCHED_CITE_RNA_DATA_KEY = 'matched_cite_rna_data'  # Indicates whether CITE- and RNA-seq data are matched

    # ----- .mod['hic'] -----
    # .uns
    MATCHED_HIC_RNA_DATA_KEY = 'matched_hic_rna_data'  # Indicates whether HiC- and RNA-seq data are matched

    # ----- .mod['rna'] -----
    # layers
    RNA_COUNTS_LAYER = 'counts'
    RNA_COUNTS_LAYER_FROM_LOOM = 'matrix'
    RNA_FIRST_MOMENT_CHROM_LAYER = 'M_c'
    RNA_FIRST_MOMENT_SPLICED_LAYER = 'M_s'
    RNA_FIRST_MOMENT_UNSPLICED_LAYER = 'M_u'
    RNA_SECOND_MOMENT_SS_LAYER = 'M_ss'
    RNA_SECOND_MOMENT_US_LAYER = 'M_us'
    RNA_SECOND_MOMENT_UU_LAYER = 'M_uu'
    RNA_SPLICED_LAYER = 'spliced'
    RNA_SPLICED_VELOCITY_LAYER = 'velocity_S'
    RNA_UNSPLICED_LAYER = 'unspliced'
    # .obsm
    RNA_OBSM_PC_KEY = 'X_pca'

    def bogus_function(self):
        """Placeholder method; does nothing."""
        return None


# Short alias used throughout the multivelo modules
MDKM = MultiDynamoMdataKeyManager
# These are convenience functions pattern after (but not identical to) ones in scvelo
def clean_obs_names(
        adata: AnnData,
        alphabet: Literal['[AGTCBDHKMNRSVWY]'] = '[AGTCBDHKMNRSVWY]',
        batch_key: str = MDKM.INFERRED_BATCH_KEY,
        id_length: int = 16
) -> AnnData:
    """Reduce each cell index to its barcode and record what was stripped.

    The barcode is the first run of ``id_length`` characters matching the
    character class ``alphabet``. ``adata`` is modified in place: ``obs_names``
    are replaced by the barcodes (made unique), and, when the stripped
    remainders are non-trivial, they are stored in ``adata.obs[batch_key]``.

    Args:
        adata: object whose ``obs_names`` are cleaned in place.
        alphabet: regex character class describing one valid barcode character.
        batch_key: ``.obs`` column receiving the stripped remainder of each name.
        id_length: number of consecutive ``alphabet`` characters forming a barcode.

    Returns:
        The same ``adata`` object.

    Raises:
        ValueError: if a cell index that must be searched contains no barcode.
    """
    # Compile once; the same pattern is applied to every cell index
    barcode_pattern = re.compile(alphabet * id_length)

    def _barcode_span(obs_name):
        match = barcode_pattern.search(obs_name)
        if match is None:
            raise ValueError(
                f"Cell index '{obs_name}' contains no run of {id_length} characters from {alphabet}"
            )
        return match.span()

    if adata.obs_names.map(len).unique().size == 1:
        # Here if all cell indices have the same numbers of characters
        # ... find (first) instance of id_length valid nucleotides in the first cell index
        #     and reuse its position for every other index
        start, end = _barcode_span(adata.obs_names[0])

        # ... truncate the cell indices to the valid nucleotides
        new_obs_names = [obs_name[start:end] for obs_name in adata.obs_names]

        # ... whatever remains after removing the barcode might specify the batch
        prefixes = [
            obs_name.replace(new_obs_name, "")
            for obs_name, new_obs_name in zip(adata.obs_names, new_obs_names)
        ]
    else:
        # Here if cell indices have different lengths (or there are no cells)
        prefixes, new_obs_names = [], []
        for obs_name in adata.obs_names:
            # ... the barcode position can differ between indices, so search
            #     each index itself rather than only the first one
            start, end = _barcode_span(obs_name)
            barcode = obs_name[start:end]

            # ... truncate the cell index to the valid nucleotides
            new_obs_names.append(barcode)

            # ... whatever remains after removing the barcode might specify the batch
            prefixes.append(obs_name.replace(barcode, ""))

    adata.obs_names = new_obs_names
    adata.obs_names_make_unique()

    if prefixes and len(prefixes[0]) > 0 and len(np.unique(prefixes)) > 1:
        # If non-trivial list of prefixes (non-trivial length and more than one different),
        # then add batch_key to cell metadata
        adata.obs[batch_key] = (
            pd.Categorical(prefixes)
            if len(np.unique(prefixes)) < adata.n_obs
            else prefixes
        )

    return adata
# ... from unmatched scRNA-seq and scATAC-seq data
def read_10x_atac_rna_h5_old(
        atac_path: Union[PathLike, str],
        rna_path: Union[PathLike, str],
        atac_counts_matrix: Union[PathLike, str] = 'filtered_peak_bc_matrix',
        rna_h5_fn: Union[PathLike, str] = 'filtered_feature_bc_matrix.h5',
        rna_splicing_loom: Union[PathLike, str] = 'multiome.loom',
        alphabet: Literal['[AGTCBDHKMNRSVWY]'] = '[AGTCBDHKMNRSVWY]',
        batch_key: str = MDKM.INFERRED_BATCH_KEY,
        cellranger_path_structure: bool = True,
        id_length: int = 16
) -> MuData:
    """Load separately measured scATAC-seq and scRNA-seq 10x outputs into one MuData.

    Reads the ATAC count matrix and the RNA h5 file, combines them, flags the
    ATAC modality as unmatched, records source paths in ``.uns``, adds the peak
    annotation, homogenizes cell indices and merges the splicing loom.

    Args:
        atac_path: base directory of the scATAC-seq run.
        rna_path: base directory of the scRNA-seq run.
        atac_counts_matrix: name of the ATAC count matrix under the output directory.
        rna_h5_fn: name of the RNA h5 file under the output directory.
        rna_splicing_loom: loom file name passed on to ``add_splicing_data``.
        alphabet: passed on to ``homogenize_mudata_obs_names``.
        batch_key: passed on to ``homogenize_mudata_obs_names``.
        cellranger_path_structure: if True, files are looked up in an ``outs``
            sub-directory of each base path.
        id_length: passed on to ``homogenize_mudata_obs_names``.

    Returns:
        The assembled MuData object with 'atac' and 'rna' modalities.
    """
    from muon import atac as ac
    import muon as mu
    import scvi

    main_info('Deserializing UNMATCHED scATAC-seq and scRNA-seq data ...')
    timer = LoggerManager.gen_logger('read_10x_atac_rna_h5')
    timer.log_time()

    # Sub-directory holding the pipeline output ('' keeps files directly under the base)
    outs_subdir = 'outs' if cellranger_path_structure else ''

    # scATAC-seq counts
    main_info('reading scATAC-seq data', indent_level=2)
    atac_adata = scvi.data.read_10x_atac(os.path.join(atac_path, outs_subdir, atac_counts_matrix))

    # scRNA-seq counts
    main_info('reading scRNA-seq data', indent_level=2)
    rna_h5_file = Path(os.path.join(rna_path, outs_subdir, rna_h5_fn))
    rna_adata = mu.read_10x_h5(filename=rna_h5_file).mod['rna']

    # Combine both modalities
    main_info('combining scATAC-seq data and scRNA-seq data into MuData object ...', indent_level=2)
    mdata = MuData({'atac': atac_adata, 'rna': rna_adata})

    # The two assays come from different cells
    main_info(f' .uns[{MDKM.MATCHED_ATAC_RNA_DATA_KEY}] = False', indent_level=3)
    mdata.mod['atac'].uns[MDKM.MATCHED_ATAC_RNA_DATA_KEY] = False

    # Remember where the fragments file lives
    main_info(" path to fragments file in .uns['files']", indent_level=3)
    fragments_file = os.path.join(atac_path, outs_subdir, 'fragments.tsv.gz')
    mdata.mod['atac'].uns['files'] = {'fragments': fragments_file}

    # Remember the base directory of each modality
    for modality, base_path in (('atac', atac_path), ('rna', rna_path)):
        mdata.mod[modality].uns['base_data_path'] = base_path

    # Peak annotation
    main_info('adding peak annotation ...', indent_level=2)
    annotation_file = os.path.join(atac_path, outs_subdir, 'peak_annotation.tsv')
    ac.tl.add_peak_annotation(data=mdata, annotation=annotation_file)

    # Same cell-index format in every modality
    main_info('homogenizing cell indices ...', indent_level=2)
    mdata = homogenize_mudata_obs_names(mdata=mdata,
                                        alphabet=alphabet,
                                        batch_key=batch_key,
                                        id_length=id_length)

    # Spliced / unspliced layers
    main_info('adding splicing data ...', indent_level=2)
    mdata = add_splicing_data(mdata=mdata,
                              multiome_base_path=rna_path,
                              rna_splicing_loom=rna_splicing_loom,
                              cellranger_path_structure=cellranger_path_structure)

    timer.finish_progress(progress_name='read_10x_atac_rna_h5')

    return mdata
def read_10x_multiome_h5(
        multiome_base_path: Union[PathLike, str],
        multiome_h5_fn: Union[PathLike, str] = 'filtered_feature_bc_matrix.h5',
        rna_splicing_loom: Union[PathLike, str] = 'multiome.loom',
        alphabet: Literal['[AGTCBDHKMNRSVWY]'] = '[AGTCBDHKMNRSVWY]',
        batch_key: str = MDKM.INFERRED_BATCH_KEY,
        cellranger_path_structure: bool = True,
        id_length: int = 16,
        gtf_path: Union[PathLike, str, None] = None,
) -> MuData:
    """Load a matched 10x multiome (scATAC-seq + scRNA-seq) run into a MuData object.

    Optional inputs (fragments file, peak annotation, splicing loom) are used
    when present and otherwise only reported in the log. Peak annotation is read
    from ``peak_annotation.tsv`` if it exists, else derived from ``gtf_path``
    when one is given. Aggregated peaks are stored as modality ``'aggr'``.

    Args:
        multiome_base_path: base directory of the multiome run.
        multiome_h5_fn: name of the multiome h5 file under the output directory.
        rna_splicing_loom: name of the splicing loom file.
        alphabet: passed on to ``homogenize_mudata_obs_names``.
        batch_key: passed on to ``homogenize_mudata_obs_names``.
        cellranger_path_structure: if True, files are looked up in ``outs``
            (loom file in ``velocyto``) below the base path.
        id_length: passed on to ``homogenize_mudata_obs_names``.
        gtf_path: GTF annotation used only when no peak annotation file exists.

    Returns:
        MuData with the 'atac' and 'rna' modalities plus 'aggr'.
    """
    import muon as mu
    from muon import atac as ac

    main_info('Deserializing MATCHED scATAC-seq and scRNA-seq data ...')
    temp_logger = LoggerManager.gen_logger('read_10x_multiome_h5')
    temp_logger.log_time()

    outs_subdir = 'outs' if cellranger_path_structure else ''

    # Assemble absolute path to multiomic data
    full_multiome_path = os.path.join(multiome_base_path, outs_subdir, multiome_h5_fn)

    # Read the multiome h5 file
    main_info(f'reading the multiome h5 file ...', indent_level=2)
    mdata = mu.read_10x_h5(Path(full_multiome_path), extended=True)

    # Flag the scATAC-seq data as matched to the scRNA-seq data
    main_info(f' .uns[{MDKM.MATCHED_ATAC_RNA_DATA_KEY}] = True', indent_level=3)
    mdata.mod['atac'].uns[MDKM.MATCHED_ATAC_RNA_DATA_KEY] = True

    # Add 'outs' paths - Note: for multiome they are identical
    mdata.mod['atac'].uns['base_data_path'] = multiome_base_path
    mdata.mod['rna'].uns['base_data_path'] = multiome_base_path

    # Add path of fragments file if it exists
    fragments_path = os.path.join(multiome_base_path, outs_subdir, 'fragments.tsv.gz')
    if os.path.exists(fragments_path):
        main_info(f" path to fragments file in .uns['files'] ...", indent_level=3)
        mdata.mod['atac'].uns['files'] = {'fragments': fragments_path}
    else:
        main_info(f"fragments file not found in {fragments_path}", indent_level=3)

    # Add peak annotation: from the annotation file if it exists, else from the GTF
    peak_annotation_path = os.path.join(multiome_base_path, outs_subdir, 'peak_annotation.tsv')
    if os.path.exists(peak_annotation_path):
        main_info(f'adding peak annotation ...', indent_level=2)
        ac.tl.add_peak_annotation(data=mdata, annotation=peak_annotation_path)

    elif gtf_path is not None:
        main_info(f'adding peak annotation from gtf file ...', indent_level=2)
        import Epiverse as ev
        atac_anno = ev.utils.Annotation(gtf_path)
        atac_anno.tss_init(upstream=1000,
                           downstream=100)
        atac_anno.distal_init(upstream=[1000, 200000],
                              downstream=[1000, 200000])
        atac_anno.body_init()

        # Query the peaks chromosome by chromosome, then concatenate once
        atac_var = mdata['atac'].var
        per_chrom_results = [
            atac_anno.query_multi(query_list=atac_var.loc[atac_var['seqnames'] == chrom_name].index.tolist(),
                                  chrom=chrom_name, batch=4, ncpus=8)
            for chrom_name in atac_var['seqnames'].unique()
        ]
        merge_pd = per_chrom_results[0] if len(per_chrom_results) == 1 else pd.concat(per_chrom_results)
        merge_pd = atac_anno.merge_info(merge_pd)
        atac_anno.add_gene_info(mdata['atac'], merge_pd,
                                columns=['peaktype', 'neargene', 'neargene_tss'])
    else:
        main_info(f"peak annotation file not found in {peak_annotation_path} and gtf file not provided", indent_level=3)

    # Homogenize cell indices across modalities
    main_info(f'homogenizing cell indices ...', indent_level=2)
    mdata = homogenize_mudata_obs_names(mdata=mdata,
                                        alphabet=alphabet,
                                        batch_key=batch_key,
                                        id_length=id_length)

    # Add transcriptomic splicing data if it exists
    rna_splicing_loom_path = os.path.join(multiome_base_path,
                                          'velocyto' if cellranger_path_structure else '',
                                          rna_splicing_loom)
    if os.path.exists(rna_splicing_loom_path):
        main_info(f'adding splicing data ...', indent_level=2)
        mdata = add_splicing_data(mdata=mdata,
                                  multiome_base_path=multiome_base_path,
                                  rna_splicing_loom=rna_splicing_loom,
                                  cellranger_path_structure=cellranger_path_structure)
    else:
        main_info(f"splicing data file not found in {rna_splicing_loom_path}", indent_level=3)

    # Aggregate peaks
    # NOTE(review): both paths are passed on even if the files are absent (the
    # annotation may have come from the GTF instead) - confirm that
    # aggregate_peaks_10x tolerates missing files.
    main_info(f'aggregating peaks ...', indent_level=2)
    feature_linkage_path = os.path.join(multiome_base_path, outs_subdir,
                                        'analysis/feature_linkage/feature_linkage.bedpe')
    adata_aggr = aggregate_peaks_10x(mdata['atac'],
                                     peak_annotation_path,
                                     feature_linkage_path)

    mdata.mod['aggr'] = adata_aggr

    temp_logger.finish_progress(progress_name='read_10x_multiome_h5')

    return mdata
b/dynamo/multivelo/MultiPreprocessor.py new file mode 100644 index 000000000..d752af0b9 --- /dev/null +++ b/dynamo/multivelo/MultiPreprocessor.py @@ -0,0 +1,719 @@ +# Imports from external modules +from anndata import AnnData +from .MultiConfiguration import MDKM +from mudata import MuData + +import numpy as np +import os +import pandas as pd + +from tqdm import tqdm +from scipy.sparse import coo_matrix, csr_matrix, diags + +from typing import Any, Callable, Dict, List, Literal, Optional, TypedDict + +# Imports from dynamo +from ..dynamo_logger import ( + LoggerManager, + main_debug, + main_exception, + main_info, + main_info_insert_adata, + main_warning, +) +from ..preprocessing.gene_selection import ( + select_genes_monocle +) +from ..preprocessing.normalization import ( + calc_sz_factor, + normalize +) +from ..preprocessing.pca import ( + pca +) +from ..preprocessing.Preprocessor import ( + Preprocessor +) +from ..preprocessing.QC import ( + filter_cells_by_highly_variable_genes, + filter_cells_by_outliers as monocle_filter_cells_by_outliers, + filter_genes_by_outliers as monocle_filter_genes_by_outliers +) +from ..preprocessing.transform import ( + log1p +) +from ..preprocessing.utils import ( + collapse_species_adata, + convert2symbol +) + +# Imports from MultiDynamo +from .ATACseqTools import ( + tfidf_normalize +) +from .MultiQC import ( + modality_basic_stats, + modality_filter_cells_by_outliers, + modality_filter_features_by_outliers +) + +# Define a custom type for the recipe dictionary using TypedDict +ATACType = Literal['archR', 'cicero', 'muon', 'signac'] +CITEType = Literal['seurat'] +HiCType = Literal['periwal'] +ModalityType = Literal['atac', 'cite', 'hic', 'rna'] +RNAType = Literal['monocle', 'seurat', 'sctransform', 'pearson_residuals', 'monocle_pearson_residuals'] + +class RecipeDataType(TypedDict, total=False): # total=False allows partial dictionary to be valid + atac: ATACType + cite: CITEType + hic: HiCType + rna: RNAType + + +# The Multiomic 
class MultiPreprocessor(Preprocessor):
    """Preprocessor for multi-modal data held in a ``MuData`` container.

    The constructor forwards every option unchanged to ``Preprocessor``. The extra methods dispatch
    preprocessing per modality ('atac', 'cite', 'hic', 'rna'). Only the 'muon' scATAC-seq recipe is
    implemented in this class; the remaining modality recipes are placeholders that do nothing.
    """

    def __init__(
            self,
            cell_cycle_score_enable: bool = False,
            cell_cycle_score_kwargs: Optional[Dict[str, Any]] = None,
            collapse_species_adata_function: Callable = collapse_species_adata,
            convert_gene_name_function: Callable = convert2symbol,
            filter_cells_by_highly_variable_genes_function: Callable = filter_cells_by_highly_variable_genes,
            filter_cells_by_highly_variable_genes_kwargs: Optional[Dict[str, Any]] = None,
            filter_cells_by_outliers_function: Callable = monocle_filter_cells_by_outliers,
            filter_cells_by_outliers_kwargs: Optional[Dict[str, Any]] = None,
            filter_genes_by_outliers_function: Callable = monocle_filter_genes_by_outliers,
            filter_genes_by_outliers_kwargs: Optional[Dict[str, Any]] = None,
            force_gene_list: Optional[List[str]] = None,
            gene_append_list: Optional[List[str]] = None,
            gene_exclude_list: Optional[List[str]] = None,
            norm_method: Callable = log1p,
            norm_method_kwargs: Optional[Dict[str, Any]] = None,
            normalize_by_cells_function: Callable = normalize,
            normalize_by_cells_function_kwargs: Optional[Dict[str, Any]] = None,
            normalize_selected_genes_function: Optional[Callable] = None,
            normalize_selected_genes_kwargs: Optional[Dict[str, Any]] = None,
            pca_function: Callable = pca,
            pca_kwargs: Optional[Dict[str, Any]] = None,
            regress_out_kwargs: Optional[Dict[List[str], Any]] = None,
            sctransform_kwargs: Optional[Dict[str, Any]] = None,
            select_genes_function: Callable = select_genes_monocle,
            select_genes_kwargs: Optional[Dict[str, Any]] = None,
            size_factor_function: Callable = calc_sz_factor,
            size_factor_kwargs: Optional[Dict[str, Any]] = None) -> None:
        """Forward all options to ``Preprocessor.__init__``.

        Container-valued options default to ``None`` and are replaced by a fresh empty dict/list per
        instance, so instances never share (and cannot corrupt) a default container. A container
        passed by the caller is forwarded as-is.
        """

        def _dict(value):
            # Fresh dict for an omitted option; the caller's own object otherwise.
            return {} if value is None else value

        def _list(value):
            # Fresh list for an omitted option; the caller's own object otherwise.
            return [] if value is None else value

        super().__init__(
            collapse_species_adata_function=collapse_species_adata_function,
            convert_gene_name_function=convert_gene_name_function,
            filter_cells_by_outliers_function=filter_cells_by_outliers_function,
            filter_cells_by_outliers_kwargs=_dict(filter_cells_by_outliers_kwargs),
            filter_genes_by_outliers_function=filter_genes_by_outliers_function,
            filter_genes_by_outliers_kwargs=_dict(filter_genes_by_outliers_kwargs),
            filter_cells_by_highly_variable_genes_function=filter_cells_by_highly_variable_genes_function,
            filter_cells_by_highly_variable_genes_kwargs=_dict(filter_cells_by_highly_variable_genes_kwargs),
            normalize_by_cells_function=normalize_by_cells_function,
            normalize_by_cells_function_kwargs=_dict(normalize_by_cells_function_kwargs),
            size_factor_function=size_factor_function,
            size_factor_kwargs=_dict(size_factor_kwargs),
            select_genes_function=select_genes_function,
            select_genes_kwargs=_dict(select_genes_kwargs),
            normalize_selected_genes_function=normalize_selected_genes_function,
            normalize_selected_genes_kwargs=_dict(normalize_selected_genes_kwargs),
            norm_method=norm_method,
            norm_method_kwargs=_dict(norm_method_kwargs),
            pca_function=pca_function,
            pca_kwargs=_dict(pca_kwargs),
            gene_append_list=_list(gene_append_list),
            # The original default here was `{}` (a dict) although a list is expected.
            gene_exclude_list=_list(gene_exclude_list),
            force_gene_list=force_gene_list,
            sctransform_kwargs=_dict(sctransform_kwargs),
            regress_out_kwargs=_dict(regress_out_kwargs),
            cell_cycle_score_enable=cell_cycle_score_enable,
            cell_cycle_score_kwargs=_dict(cell_cycle_score_kwargs),
        )

    def preprocess_atac(
            self,
            mdata: MuData,
            recipe: ATACType = 'muon',
            tkey: Optional[str] = None,
            experiment_type: Optional[str] = None
    ) -> None:
        """Dispatch scATAC-seq preprocessing to the method implementing ``recipe``.

        Args:
            mdata: the MuData object to preprocess.
            recipe: one of 'archR', 'cicero', 'muon' or 'signac'.
            tkey: the key for time information in .obs, forwarded to the recipe.
            experiment_type: the experiment type, forwarded to the recipe.

        Raises:
            NotImplementedError: ``recipe`` is not one of the names above.
        """
        if recipe == 'archR':
            self.preprocess_atac_archr(mdata, tkey=tkey, experiment_type=experiment_type)
        elif recipe == 'cicero':
            self.preprocess_atac_cicero(mdata, tkey=tkey, experiment_type=experiment_type)
        elif recipe == 'muon':
            self.preprocess_atac_muon(mdata, tkey=tkey, experiment_type=experiment_type)
        elif recipe == 'signac':
            self.preprocess_atac_signac(mdata, tkey=tkey, experiment_type=experiment_type)
        else:
            raise NotImplementedError("preprocess recipe chosen not implemented: %s" % recipe)

    def preprocess_atac_archr(
            self,
            mdata: MuData,
            tkey: Optional[str] = None,
            experiment_type: Optional[str] = None
    ) -> None:
        """Placeholder for the archR recipe; currently does nothing."""
        pass

    def preprocess_atac_cicero(
            self,
            mdata: MuData,
            tkey: Optional[str] = None,
            experiment_type: Optional[str] = None
    ) -> None:
        """Placeholder for the cicero recipe; currently does nothing."""
        pass

    def preprocess_atac_muon(
            self,
            mdata: MuData,
            tkey: Optional[str] = None,
            experiment_type: Optional[str] = None
    ) -> None:
        """Run the muon-based scATAC-seq recipe on ``mdata`` in place.

        Steps: standardize the MuData object, filter outlier peaks and cells by quantile, store raw
        counts, TF-IDF normalize, total-count normalize and log-transform, select highly variable
        features, compute LSI and drop its first component. The processed AnnData replaces
        ``mdata.mod['atac']``. Both the 'atac' and the 'rna' modality must be present.

        Args:
            mdata: the MuData object to preprocess.
            tkey: the key for time information in .obs.
            experiment_type: the experiment type of the data.
        """
        from muon import atac as ac
        import scanpy as sc
        main_info('Running muon preprocessing pipeline for scATAC-seq data ...')
        preprocess_logger = LoggerManager.gen_logger('preprocess_atac_muon')
        preprocess_logger.log_time()

        # Standardize MuData object
        self.standardize_mdata(mdata, tkey, experiment_type)

        # Filter peaks
        modality_filter_features_by_outliers(mdata,
                                             modality='atac',
                                             quantiles=[0.01, 0.99],
                                             var_key='n_cells_by_counts')

        # Filter cells
        modality_filter_cells_by_outliers(mdata,
                                          modality='atac',
                                          quantiles=[0.01, 0.99],
                                          obs_key='n_genes_by_counts')

        modality_filter_cells_by_outliers(mdata,
                                          modality='atac',
                                          quantiles=[0.01, 0.99],
                                          obs_key='total_counts')

        # Extract chromatin accessibility and transcriptome
        atac_adata, rna_adata = mdata.mod['atac'], mdata.mod['rna']

        # ... store counts layer used for SCVI's variational autoencoders. Copy the matrices: .X is
        # transformed by the steps below, and a layer that merely aliases .X would stop holding raw
        # counts if any of those steps works in place.
        atac_adata.layers[MDKM.ATAC_COUNTS_LAYER] = atac_adata.X.copy()
        rna_adata.layers[MDKM.RNA_COUNTS_LAYER] = rna_adata.X.copy()

        # ... compute TF-IDF
        main_info('computing TF-IDF', indent_level=1)
        atac_adata = tfidf_normalize(atac_adata=atac_adata, mv_algorithm=False)

        # Normalize
        main_info('normalizing', indent_level=1)
        sc.pp.normalize_total(atac_adata, target_sum=1e4)
        sc.pp.log1p(atac_adata)

        # Feature selection
        main_info('feature selection', indent_level=1)
        sc.pp.highly_variable_genes(atac_adata, min_mean=0.05, max_mean=1.5, min_disp=0.5)
        main_info(f'identified {np.sum(atac_adata.var.highly_variable)} highly variable features', indent_level=2)

        # Store current AnnData object in raw
        atac_adata.raw = atac_adata

        # Latent semantic indexing
        main_info('computing latent sematic indexing', indent_level=1)
        ac.tl.lsi(atac_adata)

        # ... drop first component (size related)
        main_info(' X_lsi key in .obsm', indent_level=2)
        atac_adata.obsm[MDKM.ATAC_OBSM_LSI_KEY] = atac_adata.obsm[MDKM.ATAC_OBSM_LSI_KEY][:, 1:]
        main_info(' LSI key in .varm', indent_level=2)
        atac_adata.varm[MDKM.ATAC_VARM_LSI_KEY] = atac_adata.varm[MDKM.ATAC_VARM_LSI_KEY][:, 1:]
        main_info(' [lsi][stdev] key in .uns', indent_level=2)
        atac_adata.uns['lsi']['stdev'] = atac_adata.uns['lsi']['stdev'][1:]

        # ... perhaps gratuitous deep copy
        mdata.mod['atac'] = atac_adata.copy()

        preprocess_logger.finish_progress(progress_name='preprocess_atac_muon')

    def preprocess_atac_signac(
            self,
            mdata: MuData,
            recipe: ATACType = 'muon',
            tkey: Optional[str] = None,
            experiment_type: Optional[str] = None
    ) -> None:
        """Placeholder for the signac recipe; currently does nothing (``recipe`` is unused)."""
        pass

    def preprocess_cite(
            self,
            mdata: MuData,
            recipe: CITEType,
            tkey: Optional[str] = None,
            experiment_type: Optional[str] = None
    ) -> None:
        """Placeholder for CITE-seq preprocessing; currently does nothing.

        ``tkey`` and ``experiment_type`` are accepted because ``preprocess_mdata`` passes them to
        every modality handler.
        """
        pass

    def preprocess_hic(
            self,
            mdata: MuData,
            recipe: HiCType,
            tkey: Optional[str] = None,
            experiment_type: Optional[str] = None
    ) -> None:
        """Placeholder for scHi-C preprocessing; currently does nothing.

        ``tkey`` and ``experiment_type`` are accepted because ``preprocess_mdata`` passes them to
        every modality handler.
        """
        pass

    def preprocess_mdata(
            self,
            mdata: MuData,
            recipe_dict: RecipeDataType = None,
            tkey: Optional[str] = None,
            experiment_type: Optional[str] = None,
    ) -> None:
        """Preprocess each modality of the MuData object with the recipe specified for it.

        Args:
            mdata: a MuData object.
            recipe_dict: mapping from modality name ('atac', 'cite', 'hic' or 'rna') to the recipe used
                for that modality. Defaults to ``{'atac': 'signac', 'rna': 'seurat'}``.
            tkey: the key for time information (labeling time period for the cells) in .obs. Defaults to None.
            experiment_type: the experiment type of the data. If not provided, would be inferred from the data.

        Raises:
            NotImplementedError: a modality in ``recipe_dict`` is not supported, or its recipe is invalid.
        """

        if recipe_dict is None:
            # Default recipe
            # NOTE(review): the 'signac' ATAC recipe is still a no-op placeholder in this class, so the
            # default leaves the ATAC modality unprocessed - confirm this is intended.
            recipe_dict = {'atac': 'signac', 'rna': 'seurat'}

        for mod, recipe in recipe_dict.items():
            if mod not in mdata.mod:
                # NOTE(review): processing continues after this call unless main_exception raises -
                # confirm against the logger implementation.
                main_exception((f'Modality {mod} not found in MuData object'))

            if mod == 'atac':
                self.preprocess_atac(mdata=mdata,
                                     recipe=recipe,
                                     tkey=tkey,
                                     experiment_type=experiment_type)
            elif mod == 'cite':
                self.preprocess_cite(mdata=mdata,
                                     recipe=recipe,
                                     tkey=tkey,
                                     experiment_type=experiment_type)
            elif mod == 'hic':
                self.preprocess_hic(mdata=mdata,
                                    recipe=recipe,
                                    tkey=tkey,
                                    experiment_type=experiment_type)
            elif mod == 'rna':
                rna_adata = mdata.mod.get('rna', None)

                self.preprocess_adata(adata=rna_adata,
                                      recipe=recipe,
                                      tkey=tkey,
                                      experiment_type=experiment_type)
            else:
                raise NotImplementedError(f'Preprocess recipe not implemented for modality: {mod}')

        # Integrate modalities - at this point have filtered out poor quality cells for individual
        # modalities. Next we need to

    def standardize_mdata(
            self,
            mdata: MuData,
            tkey: Optional[str],
            experiment_type: Optional[str]
    ) -> None:
        """Standardize every non-RNA modality within MuData.

        For each modality other than 'rna' (which is skipped here), basic QC metrics are computed,
        experiment information is recorded through ``add_experiment_info`` and the layers are passed
        to ``convert_layers2csr``.

        Args:
            mdata: a MuData object.
            tkey: the key for time information (labeling time period for the cells) in .obs.
            experiment_type: the experiment type.
        """

        for modality, modality_adata in mdata.mod.items():
            if modality == 'rna':
                # Handled by dynamo
                continue

            # Compute basic QC metrics
            modality_basic_stats(mdata=mdata, modality=modality)

            self.add_experiment_info(modality_adata, tkey, experiment_type)
            main_info_insert_adata("tkey=%s" % tkey, "uns['pp']", indent_level=2)
            main_info_insert_adata("experiment_type=%s" % modality_adata.uns["pp"]["experiment_type"],
                                   "uns['pp']",
                                   indent_level=2)

            self.convert_layers2csr(modality_adata)
+ """ + promoter_dict = {} + distal_dict = {} + gene_body_dict = {} + corr_dict = {} + + # read annotations + with open(peak_annot_file) as f: + header = next(f) + tmp = header.split('\t') + if len(tmp) == 4: + cellranger_version = 1 + elif len(tmp) == 6: + cellranger_version = 2 + else: + raise ValueError('Peak annotation file should contain 4 columns ' + '(CellRanger ARC 1.0.0) or 6 columns (CellRanger ' + 'ARC 2.0.0)') + + main_info(f'CellRanger ARC identified as {cellranger_version}.0.0', + indent_level=1) + + if cellranger_version == 1: + for line in f: + tmp = line.rstrip().split('\t') + tmp1 = tmp[0].split('_') + peak = f'{tmp1[0]}:{tmp1[1]}-{tmp1[2]}' + if tmp[1] != '': + genes = tmp[1].split(';') + dists = tmp[2].split(';') + types = tmp[3].split(';') + for i, gene in enumerate(genes): + dist = dists[i] + annot = types[i] + if annot == 'promoter': + if gene not in promoter_dict: + promoter_dict[gene] = [peak] + else: + promoter_dict[gene].append(peak) + elif annot == 'distal': + if dist == '0': + if gene not in gene_body_dict: + gene_body_dict[gene] = [peak] + else: + gene_body_dict[gene].append(peak) + else: + if gene not in distal_dict: + distal_dict[gene] = [peak] + else: + distal_dict[gene].append(peak) + else: + for line in f: + tmp = line.rstrip().split('\t') + peak = f'{tmp[0]}:{tmp[1]}-{tmp[2]}' + gene = tmp[3] + dist = tmp[4] + annot = tmp[5] + if annot == 'promoter': + if gene not in promoter_dict: + promoter_dict[gene] = [peak] + else: + promoter_dict[gene].append(peak) + elif annot == 'distal': + if dist == '0': + if gene not in gene_body_dict: + gene_body_dict[gene] = [peak] + else: + gene_body_dict[gene].append(peak) + else: + if gene not in distal_dict: + distal_dict[gene] = [peak] + else: + distal_dict[gene].append(peak) + + # read linkages + with open(linkage_file) as f: + for line in f: + tmp = line.rstrip().split('\t') + if tmp[12] == "peak-peak": + peak1 = f'{tmp[0]}:{tmp[1]}-{tmp[2]}' + peak2 = f'{tmp[3]}:{tmp[4]}-{tmp[5]}' + tmp2 = 
tmp[6].split('><')[0][1:].split(';') + tmp3 = tmp[6].split('><')[1][:-1].split(';') + corr = float(tmp[7]) + for t2 in tmp2: + gene1 = t2.split('_') + for t3 in tmp3: + gene2 = t3.split('_') + # one of the peaks is in promoter, peaks belong to the + # same gene or are close in distance + if (((gene1[1] == "promoter") != + (gene2[1] == "promoter")) and + ((gene1[0] == gene2[0]) or + (float(tmp[11]) < peak_dist))): + + if gene1[1] == "promoter": + gene = gene1[0] + else: + gene = gene2[0] + if gene in corr_dict: + # peak 1 is in promoter, peak 2 is not in gene + # body -> peak 2 is added to gene 1 + if (peak2 not in corr_dict[gene] and + gene1[1] == "promoter" and + (gene2[0] not in gene_body_dict or + peak2 not in gene_body_dict[gene2[0]])): + + corr_dict[gene][0].append(peak2) + corr_dict[gene][1].append(corr) + # peak 2 is in promoter, peak 1 is not in gene + # body -> peak 1 is added to gene 2 + if (peak1 not in corr_dict[gene] and + gene2[1] == "promoter" and + (gene1[0] not in gene_body_dict or + peak1 not in gene_body_dict[gene1[0]])): + + corr_dict[gene][0].append(peak1) + corr_dict[gene][1].append(corr) + else: + # peak 1 is in promoter, peak 2 is not in gene + # body -> peak 2 is added to gene 1 + if (gene1[1] == "promoter" and + (gene2[0] not in + gene_body_dict + or peak2 not in + gene_body_dict[gene2[0]])): + + corr_dict[gene] = [[peak2], [corr]] + # peak 2 is in promoter, peak 1 is not in gene + # body -> peak 1 is added to gene 2 + if (gene2[1] == "promoter" and + (gene1[0] not in + gene_body_dict + or peak1 not in + gene_body_dict[gene1[0]])): + + corr_dict[gene] = [[peak1], [corr]] + elif tmp[12] == "peak-gene": + peak1 = f'{tmp[0]}:{tmp[1]}-{tmp[2]}' + tmp2 = tmp[6].split('><')[0][1:].split(';') + gene2 = tmp[6].split('><')[1][:-1] + corr = float(tmp[7]) + for t2 in tmp2: + gene1 = t2.split('_') + # peak 1 belongs to gene 2 or are close in distance + # -> peak 1 is added to gene 2 + if ((gene1[0] == gene2) or (float(tmp[11]) < peak_dist)): + gene = 
gene1[0] + if gene in corr_dict: + if (peak1 not in corr_dict[gene] and + gene1[1] != "promoter" and + (gene1[0] not in gene_body_dict or + peak1 not in gene_body_dict[gene1[0]])): + + corr_dict[gene][0].append(peak1) + corr_dict[gene][1].append(corr) + else: + if (gene1[1] != "promoter" and + (gene1[0] not in gene_body_dict or + peak1 not in gene_body_dict[gene1[0]])): + corr_dict[gene] = [[peak1], [corr]] + elif tmp[12] == "gene-peak": + peak2 = f'{tmp[3]}:{tmp[4]}-{tmp[5]}' + gene1 = tmp[6].split('><')[0][1:] + tmp3 = tmp[6].split('><')[1][:-1].split(';') + corr = float(tmp[7]) + for t3 in tmp3: + gene2 = t3.split('_') + # peak 2 belongs to gene 1 or are close in distance + # -> peak 2 is added to gene 1 + if ((gene1 == gene2[0]) or (float(tmp[11]) < peak_dist)): + gene = gene1 + if gene in corr_dict: + if (peak2 not in corr_dict[gene] and + gene2[1] != "promoter" and + (gene2[0] not in gene_body_dict or + peak2 not in gene_body_dict[gene2[0]])): + + corr_dict[gene][0].append(peak2) + corr_dict[gene][1].append(corr) + else: + if (gene2[1] != "promoter" and + (gene2[0] not in gene_body_dict or + peak2 not in gene_body_dict[gene2[0]])): + + corr_dict[gene] = [[peak2], [corr]] + + gene_dict = promoter_dict + enhancer_dict = {} + promoter_genes = list(promoter_dict.keys()) + main_info(f'Found {len(promoter_genes)} genes with promoter peaks', indent_level=1) + for gene in promoter_genes: + if gene_body: # add gene-body peaks + if gene in gene_body_dict: + for peak in gene_body_dict[gene]: + if peak not in gene_dict[gene]: + gene_dict[gene].append(peak) + enhancer_dict[gene] = [] + if gene in corr_dict: # add enhancer peaks + for j, peak in enumerate(corr_dict[gene][0]): + corr = corr_dict[gene][1][j] + if corr > min_corr: + if peak not in gene_dict[gene]: + gene_dict[gene].append(peak) + enhancer_dict[gene].append(peak) + + # aggregate to genes + adata_atac_X_copy = adata_atac.X.A + gene_mat = np.zeros((adata_atac.shape[0], len(promoter_genes))) + var_names = 
def prepare_gene_mat(var_dict, peaks, gene_mat, adata_atac_X_copy, i):
    """Add the counts of a gene's peaks into column ``i`` of ``gene_mat``, in place.

    Parameters
    ----------
    var_dict: `dict`
        Maps a peak name to its column index in ``adata_atac_X_copy``.
    peaks: iterable of `str`
        Peak names assigned to the gene; names missing from ``var_dict`` are skipped.
    gene_mat: `np.ndarray`
        Output matrix; only column ``i`` is modified.
    adata_atac_X_copy: `np.ndarray`
        Peak count matrix whose columns are indexed by ``var_dict``.
    i: `int`
        Column of ``gene_mat`` that receives the sum.

    Returns
    -------
    None. ``gene_mat`` is updated in place.
    """
    # Resolve the known peaks to column indices, keeping the order (and any repeats) of `peaks`.
    matched_columns = (var_dict[name] for name in peaks if name in var_dict)
    for column in matched_columns:
        gene_mat[:, i] += adata_atac_X_copy[:, column]
+ nn_dist: `np.darray` (default: `None`) + KNN distance matrix of size (cells, k). + conn: `csr_matrix` (default: `None`) + Pre-computed connectivities matrix. + n_neighbors: `int` (default: `None`) + Top N neighbors to extract for each cell in the connectivities matrix. + + Returns + ------- + `.layers['Mc']` stores imputed values. + """ + if nn_idx is not None and nn_dist is not None: + if nn_idx.shape[0] != adata_atac.shape[0]: + raise ValueError('Number of rows of KNN indices does not equal to ' + 'number of observations.') + if nn_dist.shape[0] != adata_atac.shape[0]: + raise ValueError('Number of rows of KNN distances does not equal ' + 'to number of observations.') + X = coo_matrix(([], ([], [])), shape=(nn_idx.shape[0], 1)) + from umap.umap_ import fuzzy_simplicial_set + conn, sigma, rho, dists = fuzzy_simplicial_set(X, nn_idx.shape[1], + None, None, + knn_indices=nn_idx-1, + knn_dists=nn_dist, + return_dists=True) + elif conn is not None: + pass + else: + raise ValueError('Please input nearest neighbor indices and distances,' + ' or a connectivities matrix of size n x n, with ' + 'columns being neighbors.' 
def modality_basic_stats(
        mdata: MuData,
        modality: Optional[ModalityType] = None
) -> None:
    """Compute basic QC metrics for one modality of a MuData object, in place.

    Args:
        mdata: the MuData object holding the modality.
        modality: name of the modality in ``mdata.mod`` to compute metrics for.

    Returns:
        None. scanpy's ``calculate_qc_metrics`` is run on the modality's AnnData with
        ``inplace=True``, so the metrics are stored on that object (presumably including the
        'n_cells_by_counts', 'n_genes_by_counts' and 'total_counts' columns the outlier filters read -
        confirm against the scanpy documentation). For the 'atac' modality the muon nucleosome signal
        is computed as well. (Note: since most modalities do not have direct information about related
        genes, fractions of mitochondrial genes cannot be computed.)

    Raises:
        ValueError: ``modality`` is not present in ``mdata.mod`` (this includes the default ``None``).
    """
    from muon import atac as ac
    modality_adata = mdata.mod.get(modality, None)
    if modality_adata is None:
        raise ValueError(f'Modality {modality} not found in MuData object.')

    # Compute QC metrics via functionality in scanpy
    import scanpy as sc
    sc.pp.calculate_qc_metrics(modality_adata, percent_top=None, log1p=False, inplace=True)

    # Compute modality specific QC metrics
    if modality == 'atac':
        # Note: the whole MuData object (not the modality's AnnData) is passed here.
        ac.tl.nucleosome_signal(mdata, n=1e6)
def modality_filter_features_by_outliers(
        mdata: MuData,
        modality: ModalityType = 'atac',
        quantiles: Optional[Union[List[float], float]] = [0.01, 0.99],
        thresholds: Optional[Union[List[float], float]] = None,
        var_key: VarKeyType = 'n_cells_by_counts'
) -> None:
    """Remove outlier features (variables) of one modality, in place.

    A feature is kept when its ``var_key`` QC metric lies within ``[lower, upper]``. The bounds come
    from ``quantiles`` of that metric or, only when ``quantiles`` is None, from absolute
    ``thresholds``. A single number gives the lower bound only; the upper bound is then unbounded.

    Args:
        mdata: the MuData object holding the modality.
        modality: name of the modality in ``mdata.mod`` to filter.
        quantiles: ``[lower, upper]`` quantiles, or a single lower quantile. Takes precedence over
            ``thresholds``; pass ``quantiles=None`` to filter by ``thresholds``. The default list is
            never mutated.
        thresholds: ``[lower, upper]`` absolute bounds, or a single lower bound.
        var_key: column of ``.var`` holding the QC metric to filter on.

    Raises:
        ValueError: the modality is missing, a bound list does not hold exactly two values, or neither
            ``quantiles`` nor ``thresholds`` is given.
    """
    import muon as mu
    modality_adata = mdata.mod.get(modality, None)
    if modality_adata is None:
        raise ValueError(f'Modality {modality} not found in MuData object.')

    if quantiles is not None:
        # Thresholds were specified as quantiles
        qc_parameter_series = modality_adata.var[var_key]

        if isinstance(quantiles, (list, tuple)):
            if len(quantiles) > 2:
                raise ValueError(f'More than 2 quantiles were specified {len(quantiles)}.')
            if len(quantiles) < 2:
                raise ValueError(f'Fewer than 2 quantiles were specified {len(quantiles)}.')

            min_feature_thresh, max_feature_thresh = qc_parameter_series.quantile(list(quantiles)).tolist()
        else:
            min_feature_thresh, max_feature_thresh = qc_parameter_series.quantile(quantiles), np.inf
    elif thresholds is not None:
        # Thresholds were specified as absolute thresholds
        if isinstance(thresholds, (list, tuple)):
            if len(thresholds) > 2:
                raise ValueError(f'More than 2 thresholds were specified {len(thresholds)}.')
            if len(thresholds) < 2:
                raise ValueError(f'Fewer than 2 thresholds were specified {len(thresholds)}.')

            min_feature_thresh, max_feature_thresh = thresholds
        else:
            min_feature_thresh, max_feature_thresh = thresholds, np.inf
    else:
        # Without this guard the comparison inside the filter fails with an obscure TypeError.
        raise ValueError('Either quantiles or thresholds must be specified.')

    # Carry out the actual filtering. Features are variables, so the number removed is the change in
    # n_vars (n_obs is unaffected by filter_var and would always report 0).
    pre_filter_n_features = modality_adata.n_vars
    mu.pp.filter_var(modality_adata, var_key, lambda x: (x >= min_feature_thresh) & (x <= max_feature_thresh))
    post_filter_n_features = modality_adata.n_vars
    main_info(f'filtered out {pre_filter_n_features - post_filter_n_features} outlier features', indent_level=2)
def multi_velocities(
        mdata: MuData,
        model: str = 'stochastic',
        method: str = 'pearson',
        other_kernels_dict: Dict = None,
        core: int = 3,
        device: str = 'cpu',
        extra_color_key: str = None,
        max_iter: int = 5,
        velo_arg: Dict = None,
        vkey: str = 'velo_s',
        **kwargs
) -> AnnData:
    """Calculate the velocities using the scRNA-seq and scATAC-seq data.

    The RNA modality ('rna') is processed with dynamo first; its connectivities are used to smooth
    the gene-aggregated ATAC modality ('aggr'); both are then restricted to shared cells and genes and
    passed to ``recover_dynamics_chrom``. Finally cell velocities are computed on the result using
    the genes flagged in ``<vkey>_norm_genes``.

    Args:
        mdata: MuData object containing the 'rna' and 'aggr' modalities.
        model: The model used to calculate the dynamics. Default is 'stochastic'.
        method: The method used to calculate the velocity. Default is 'pearson'.
        other_kernels_dict: Parameters for the other kernels. ``None`` (the default) means
            ``{'transform': 'sqrt'}``.
        core: The number of cores used for the calculation. Default is 3.
        device: The device used for the calculation. Default is 'cpu'.
        extra_color_key: Forwarded to ``recover_dynamics_chrom``. Default is None.
        max_iter: The maximum number of iterations used for the calculation. Default is 5.
        velo_arg: Extra keyword arguments for both ``cell_velocities`` calls. ``None`` (the default)
            means no extra arguments.
        vkey: The layer key of the velocity matrix produced by ``recover_dynamics_chrom``; also the
            prefix of the ``_norm``, ``_params`` and ``_genes`` entries. Default is 'velo_s'.
        **kwargs: Other parameters forwarded to ``recover_dynamics_chrom``.

    Returns:
        A new AnnData object (the output of ``recover_dynamics_chrom``) with the velocities calculated.

    Raises:
        ValueError: ``vkey`` is not a layer of the recovered result.
    """
    from .dynamical_chrom_func import recover_dynamics_chrom

    # Build fresh containers per call instead of sharing mutable default arguments.
    if other_kernels_dict is None:
        other_kernels_dict = {'transform': 'sqrt'}
    if velo_arg is None:
        velo_arg = {}

    rna = mdata['rna']
    aggr = mdata['aggr']

    # We need to calculate the dynamics of the RNA data first and reduce the dimensionality
    dynamics(rna, model=model, cores=core)
    reduceDimension(rna)
    cell_velocities(rna, method=method,
                    other_kernels_dict=other_kernels_dict,
                    **velo_arg
                    )

    # And we use the connectivity matrix from the RNA data to smooth the ATAC data and calculate the Mc
    knn_smooth_chrom(aggr, conn=rna.obsp['connectivities'])

    # We then select the cells and genes that are present in both datasets
    shared_cells = pd.Index(np.intersect1d(rna.obs_names, aggr.obs_names))
    shared_genes = pd.Index(np.intersect1d(
        [i.split('rna:')[-1] for i in rna[:, rna.var['use_for_dynamics']].var_names],
        [i.split('aggr:')[-1] for i in aggr.var_names]
    ))

    # We then create the AnnData objects for the RNA and ATAC data
    adata_rna = rna[shared_cells, [f'rna:{i}' for i in shared_genes]].copy()
    adata_atac = aggr[shared_cells, [f'aggr:{i}' for i in shared_genes]].copy()
    adata_rna.var.index = [i.split('rna:')[-1] for i in adata_rna.var.index]
    adata_atac.var.index = [i.split('aggr:')[-1] for i in adata_atac.var.index]

    # recover_dynamics_chrom reads the moments under MultiVelo's layer names
    adata_rna.layers['Ms'] = adata_rna.layers['M_s']
    adata_rna.layers['Mu'] = adata_rna.layers['M_u']

    # Now we use MultiVelo's recover_dynamics_chrom function to calculate the dynamics of the RNA and ATAC data
    adata_result = recover_dynamics_chrom(adata_rna,
                                          adata_atac,
                                          max_iter=max_iter,
                                          init_mode="invert",
                                          parallel=True,
                                          n_jobs=core,
                                          save_plot=False,
                                          rna_only=False,
                                          fit=True,
                                          n_anchors=500,
                                          extra_color_key=extra_color_key,
                                          device=device,
                                          **kwargs
                                          )

    # We need to add some information of new RNA velocity to the ATAC data
    norm_key = vkey + '_norm'
    norm_genes_key = vkey + '_norm_genes'
    if vkey not in adata_result.layers.keys():
        raise ValueError('Velocity matrix is not found. Please run multivelo'
                         '.recover_dynamics_chrom function first.')
    if norm_key not in adata_result.layers.keys():
        adata_result.layers[norm_key] = adata_result.layers[vkey] / np.sum(
            np.abs(adata_result.layers[vkey]), 0)
        adata_result.layers[norm_key] /= np.mean(adata_result.layers[norm_key])
        adata_result.uns[vkey + '_norm_params'] = adata_result.uns[vkey + '_params']
    if norm_genes_key not in adata_result.var.columns:
        adata_result.var[norm_genes_key] = adata_result.var[vkey + '_genes']

    # Transition genes identification and velocity calculation. The requested vkey is used
    # throughout (these lookups used to be hard-coded to 'velo_s', ignoring the argument).
    transition_genes = adata_result.var.loc[adata_result.var[norm_genes_key] == True].index.tolist()  # noqa: E712
    if 'pearson_transition_matrix' in adata_result.obsp.keys():
        del adata_result.obsp['pearson_transition_matrix']
    if 'velocity_umap' in adata_result.obsm.keys():
        del adata_result.obsm['velocity_umap']
    transition_view = adata_result[:, transition_genes]
    cell_velocities(adata_result, vkey=vkey,
                    X=transition_view.layers['Ms'],
                    V=transition_view.layers[vkey],
                    transition_genes=transition_genes,
                    method=method,
                    other_kernels_dict=other_kernels_dict,
                    **velo_arg
                    )
    return adata_result
import settings + +import os +import sys +import numpy as np +from numpy.linalg import norm +import matplotlib.pyplot as plt +from scipy import sparse +from scipy.sparse import coo_matrix +from scipy.optimize import minimize +from scipy.spatial import KDTree +from sklearn.metrics import pairwise_distances +from sklearn.mixture import GaussianMixture + + +import pandas as pd +import seaborn as sns +from numba import njit +import numba +from numba.typed import List +from tqdm.auto import tqdm + +import math +import torch +from torch import nn + +current_path = os.path.dirname(__file__) +src_path = os.path.join(current_path, "..") +sys.path.append(src_path) + +from ..dynamo_logger import ( + LoggerManager, + main_exception, + main_info, +) + + + +# a funciton to check for invalid values of different parameters +def check_params(alpha_c, + alpha, + beta, + gamma, + c0=None, + u0=None, + s0=None): + + new_alpha_c = alpha_c + new_alpha = alpha + new_beta = beta + new_gamma = gamma + + new_c0 = c0 + new_u0 = u0 + new_s0 = s0 + + inf_fix = 1e10 + zero_fix = 1e-10 + + # check if any of our parameters are infinite + if c0 is not None and math.isinf(c0): + main_info("c0 is infinite.", indent_level=1) + new_c0 = inf_fix + if u0 is not None and math.isinf(u0): + main_info("u0 is infinite.", indent_level=1) + new_u0 = inf_fix + if s0 is not None and math.isinf(s0): + main_info("s0 is infinite.", indent_level=1) + new_s0 = inf_fix + if math.isinf(alpha_c): + new_alpha_c = inf_fix + main_info("alpha_c is infinite.", indent_level=1) + if math.isinf(alpha): + new_alpha = inf_fix + main_info("alpha is infinite.", indent_level=1) + if math.isinf(beta): + new_beta = inf_fix + main_info("beta is infinite.", indent_level=1) + if math.isinf(gamma): + new_gamma = inf_fix + main_info("gamma is infinite.", indent_level=1) + + # check if any of our parameters are nan + if c0 is not None and math.isnan(c0): + main_info("c0 is Nan.", indent_level=1) + new_c0 = zero_fix + if u0 is not None and 
math.isnan(u0): + main_info("u0 is Nan.", indent_level=1) + new_u0 = zero_fix + if s0 is not None and math.isnan(s0): + main_info("s0 is Nan.", indent_level=1) + new_s0 = zero_fix + if math.isnan(alpha_c): + new_alpha_c = zero_fix + main_info("alpha_c is Nan.", indent_level=1) + if math.isnan(alpha): + new_alpha = zero_fix + main_info("alpha is Nan.", indent_level=1) + if math.isnan(beta): + new_beta = zero_fix + main_info("beta is Nan.", indent_level=1) + if math.isnan(gamma): + new_gamma = zero_fix + main_info("gamma is Nan.", indent_level=1) + + # check if any of our rate parameters are 0 + if alpha_c < 1e-7: + new_alpha_c = zero_fix + main_info("alpha_c is zero.", indent_level=1) + if alpha < 1e-7: + new_alpha = zero_fix + main_info("alpha is zero.", indent_level=1) + if beta < 1e-7: + new_beta = zero_fix + main_info("beta is zero.", indent_level=1) + if gamma < 1e-7: + new_gamma = zero_fix + main_info("gamma is zero.", indent_level=1) + + if beta == alpha_c: + new_beta += zero_fix + main_info("alpha_c and beta are equal, leading to divide by zero", + indent_level=1) + if beta == gamma: + new_gamma += zero_fix + main_info("gamma and beta are equal, leading to divide by zero", + indent_level=1) + if alpha_c == gamma: + new_gamma += zero_fix + main_info("gamma and alpha_c are equal, leading to divide by zero", + indent_level=1) + + if c0 is not None and u0 is not None and s0 is not None: + return new_alpha_c, new_alpha, new_beta, new_gamma, new_c0, new_u0, \ + new_s0 + + return new_alpha_c, new_alpha, new_beta, new_gamma + + +@njit( + locals={ + "res": numba.types.float64[:, ::1], + "eat": numba.types.float64[::1], + "ebt": numba.types.float64[::1], + "egt": numba.types.float64[::1], + }, + fastmath=True) +def predict_exp(tau, + c0, + u0, + s0, + alpha_c, + alpha, + beta, + gamma, + scale_cc=1, + pred_r=True, + chrom_open=True, + backward=False, + rna_only=False): + + if len(tau) == 0: + return np.empty((0, 3)) + if backward: + tau = -tau + res = 
np.empty((len(tau), 3)) + eat = np.exp(-alpha_c * tau) + ebt = np.exp(-beta * tau) + egt = np.exp(-gamma * tau) + if rna_only: + kc = 1 + c0 = 1 + else: + if chrom_open: + kc = 1 + else: + kc = 0 + alpha_c *= scale_cc + + const = (kc - c0) * alpha / (beta - alpha_c) + + res[:, 0] = kc - (kc - c0) * eat + + if pred_r: + + res[:, 1] = u0 * ebt + (alpha * kc / beta) * (1 - ebt) + res[:, 1] += const * (ebt - eat) + + res[:, 2] = s0 * egt + (alpha * kc / gamma) * (1 - egt) + res[:, 2] += ((beta / (gamma - beta)) * + ((alpha * kc / beta) - u0 - const) * (egt - ebt)) + res[:, 2] += (beta / (gamma - alpha_c)) * const * (egt - eat) + + else: + res[:, 1] = np.zeros(len(tau)) + res[:, 2] = np.zeros(len(tau)) + return res + + +@njit(locals={ + "exp_sw1": numba.types.float64[:, ::1], + "exp_sw2": numba.types.float64[:, ::1], + "exp_sw3": numba.types.float64[:, ::1], + "exp1": numba.types.float64[:, ::1], + "exp2": numba.types.float64[:, ::1], + "exp3": numba.types.float64[:, ::1], + "exp4": numba.types.float64[:, ::1], + "tau_sw1": numba.types.float64[::1], + "tau_sw2": numba.types.float64[::1], + "tau_sw3": numba.types.float64[::1], + "tau1": numba.types.float64[::1], + "tau2": numba.types.float64[::1], + "tau3": numba.types.float64[::1], + "tau4": numba.types.float64[::1] + }, + fastmath=True) +def generate_exp(tau_list, + t_sw_array, + alpha_c, + alpha, + beta, + gamma, + scale_cc=1, + model=1, + rna_only=False): + + if beta == alpha_c: + beta += 1e-3 + if gamma == beta or gamma == alpha_c: + gamma += 1e-3 + switch = len(t_sw_array) + if switch >= 1: + tau_sw1 = np.array([t_sw_array[0]]) + if switch >= 2: + tau_sw2 = np.array([t_sw_array[1] - t_sw_array[0]]) + if switch == 3: + tau_sw3 = np.array([t_sw_array[2] - t_sw_array[1]]) + exp_sw1, exp_sw2, exp_sw3 = (np.empty((0, 3)), + np.empty((0, 3)), + np.empty((0, 3))) + if tau_list is None: + if model == 0: + if switch >= 1: + exp_sw1 = predict_exp(tau_sw1, 0, 0, 0, alpha_c, alpha, beta, + gamma, pred_r=False, 
scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 2: + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], + exp_sw1[0, 1], exp_sw1[0, 2], + alpha_c, alpha, beta, gamma, + pred_r=False, chrom_open=False, + scale_cc=scale_cc, rna_only=rna_only) + if switch >= 3: + exp_sw3 = predict_exp(tau_sw3, exp_sw2[0, 0], + exp_sw2[0, 1], exp_sw2[0, 2], + alpha_c, alpha, beta, gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + elif model == 1: + if switch >= 1: + exp_sw1 = predict_exp(tau_sw1, 0, 0, 0, alpha_c, alpha, beta, + gamma, pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 2: + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], + exp_sw1[0, 1], exp_sw1[0, 2], + alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, rna_only=rna_only) + if switch >= 3: + exp_sw3 = predict_exp(tau_sw3, exp_sw2[0, 0], + exp_sw2[0, 1], exp_sw2[0, 2], + alpha_c, alpha, beta, gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + elif model == 2: + if switch >= 1: + exp_sw1 = predict_exp(tau_sw1, 0, 0, 0, alpha_c, alpha, beta, + gamma, pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 2: + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], + exp_sw1[0, 1], exp_sw1[0, 2], + alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, rna_only=rna_only) + if switch >= 3: + exp_sw3 = predict_exp(tau_sw3, exp_sw2[0, 0], + exp_sw2[0, 1], exp_sw2[0, 2], + alpha_c, 0, beta, gamma, + scale_cc=scale_cc, + rna_only=rna_only) + + return (np.empty((0, 3)), np.empty((0, 3)), np.empty((0, 3)), + np.empty((0, 3))), (exp_sw1, exp_sw2, exp_sw3) + + tau1 = tau_list[0] + if switch >= 1: + tau2 = tau_list[1] + if switch >= 2: + tau3 = tau_list[2] + if switch == 3: + tau4 = tau_list[3] + exp1, exp2, exp3, exp4 = (np.empty((0, 3)), np.empty((0, 3)), + np.empty((0, 3)), np.empty((0, 3))) + if model == 0: + exp1 = predict_exp(tau1, 0, 0, 0, alpha_c, alpha, beta, gamma, + pred_r=False, scale_cc=scale_cc, rna_only=rna_only) + if switch >= 1: + exp_sw1 = 
predict_exp(tau_sw1, 0, 0, 0, alpha_c, alpha, beta, + gamma, pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + exp2 = predict_exp(tau2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, gamma, + pred_r=False, chrom_open=False, + scale_cc=scale_cc, rna_only=rna_only) + if switch >= 2: + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, + gamma, pred_r=False, chrom_open=False, + scale_cc=scale_cc, rna_only=rna_only) + exp3 = predict_exp(tau3, exp_sw2[0, 0], exp_sw2[0, 1], + exp_sw2[0, 2], alpha_c, alpha, beta, gamma, + chrom_open=False, scale_cc=scale_cc, + rna_only=rna_only) + if switch == 3: + exp_sw3 = predict_exp(tau_sw3, exp_sw2[0, 0], + exp_sw2[0, 1], exp_sw2[0, 2], + alpha_c, alpha, beta, gamma, + chrom_open=False, scale_cc=scale_cc, + rna_only=rna_only) + exp4 = predict_exp(tau4, exp_sw3[0, 0], exp_sw3[0, 1], + exp_sw3[0, 2], alpha_c, 0, beta, gamma, + chrom_open=False, scale_cc=scale_cc, + rna_only=rna_only) + elif model == 1: + exp1 = predict_exp(tau1, 0, 0, 0, alpha_c, alpha, beta, gamma, + pred_r=False, scale_cc=scale_cc, rna_only=rna_only) + if switch >= 1: + exp_sw1 = predict_exp(tau_sw1, 0, 0, 0, alpha_c, alpha, beta, + gamma, pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + exp2 = predict_exp(tau2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, rna_only=rna_only) + if switch >= 2: + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, + gamma, scale_cc=scale_cc, + rna_only=rna_only) + exp3 = predict_exp(tau3, exp_sw2[0, 0], exp_sw2[0, 1], + exp_sw2[0, 2], alpha_c, alpha, beta, gamma, + chrom_open=False, scale_cc=scale_cc, + rna_only=rna_only) + if switch == 3: + exp_sw3 = predict_exp(tau_sw3, exp_sw2[0, 0], + exp_sw2[0, 1], exp_sw2[0, 2], + alpha_c, alpha, beta, gamma, + chrom_open=False, scale_cc=scale_cc, + rna_only=rna_only) + exp4 = predict_exp(tau4, exp_sw3[0, 0], 
exp_sw3[0, 1], + exp_sw3[0, 2], alpha_c, 0, beta, gamma, + chrom_open=False, scale_cc=scale_cc, + rna_only=rna_only) + elif model == 2: + exp1 = predict_exp(tau1, 0, 0, 0, alpha_c, alpha, beta, gamma, + pred_r=False, scale_cc=scale_cc, rna_only=rna_only) + if switch >= 1: + exp_sw1 = predict_exp(tau_sw1, 0, 0, 0, alpha_c, alpha, beta, + gamma, pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + exp2 = predict_exp(tau2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, rna_only=rna_only) + if switch >= 2: + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, + gamma, scale_cc=scale_cc, + rna_only=rna_only) + exp3 = predict_exp(tau3, exp_sw2[0, 0], exp_sw2[0, 1], + exp_sw2[0, 2], alpha_c, 0, beta, gamma, + scale_cc=scale_cc, rna_only=rna_only) + if switch == 3: + exp_sw3 = predict_exp(tau_sw3, exp_sw2[0, 0], + exp_sw2[0, 1], exp_sw2[0, 2], + alpha_c, 0, beta, gamma, + scale_cc=scale_cc, rna_only=rna_only) + exp4 = predict_exp(tau4, exp_sw3[0, 0], exp_sw3[0, 1], + exp_sw3[0, 2], alpha_c, 0, beta, gamma, + chrom_open=False, scale_cc=scale_cc, + rna_only=rna_only) + return (exp1, exp2, exp3, exp4), (exp_sw1, exp_sw2, exp_sw3) + + +@njit(locals={ + "exp_sw1": numba.types.float64[:, ::1], + "exp_sw2": numba.types.float64[:, ::1], + "exp_sw3": numba.types.float64[:, ::1], + "exp1": numba.types.float64[:, ::1], + "exp2": numba.types.float64[:, ::1], + "exp3": numba.types.float64[:, ::1], + "exp4": numba.types.float64[:, ::1], + "tau_sw1": numba.types.float64[::1], + "tau_sw2": numba.types.float64[::1], + "tau_sw3": numba.types.float64[::1], + "tau1": numba.types.float64[::1], + "tau2": numba.types.float64[::1], + "tau3": numba.types.float64[::1], + "tau4": numba.types.float64[::1] + }, + fastmath=True) +def generate_exp_backward(tau_list, t_sw_array, alpha_c, alpha, beta, gamma, + scale_cc=1, model=1, t=None): + if beta == alpha_c: + beta += 1e-3 + if gamma == beta or gamma == 
alpha_c: + gamma += 1e-3 + switch = len(t_sw_array) + if switch >= 1: + tau_sw1 = np.array([t_sw_array[0]]) + if switch >= 2: + tau_sw2 = np.array([t_sw_array[1] - t_sw_array[0]]) + else: + tau_sw1 = np.array([t_sw_array[0]]) + tau_sw2 = np.array([t_sw_array[1] - t_sw_array[0]]) + + if t is None: + if model == 0: + exp_sw1 = predict_exp(tau_sw1, 1e-3, 1e-3, 1e-3, alpha_c, 0, beta, + gamma, scale_cc=scale_cc, chrom_open=False, + backward=True) + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, chrom_open=False, + backward=True) + elif model == 1: + exp_sw1 = predict_exp(tau_sw1, 1e-3, 1e-3, 1e-3, alpha_c, 0, beta, + gamma, scale_cc=scale_cc, chrom_open=False, + backward=True) + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, chrom_open=False, + backward=True) + elif model == 2: + exp_sw1 = predict_exp(tau_sw1, 1e-3, 1e-3, 1e-3, alpha_c, 0, beta, + gamma, scale_cc=scale_cc, chrom_open=False, + backward=True) + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, 0, beta, gamma, + scale_cc=scale_cc, backward=True) + return (np.empty((0, 0)), + np.empty((0, 0)), + np.empty((0, 0))), (exp_sw1, exp_sw2) + + tau1 = tau_list[0] + if switch >= 1: + tau2 = tau_list[1] + if switch >= 2: + tau3 = tau_list[2] + + exp1, exp2, exp3 = np.empty((0, 3)), np.empty((0, 3)), np.empty((0, 3)) + if model == 0: + exp1 = predict_exp(tau1, 1e-3, 1e-3, 1e-3, alpha_c, 0, beta, gamma, + scale_cc=scale_cc, chrom_open=False, backward=True) + if switch >= 1: + exp_sw1 = predict_exp(tau_sw1, 1e-3, 1e-3, 1e-3, alpha_c, 0, beta, + gamma, scale_cc=scale_cc, chrom_open=False, + backward=True) + exp2 = predict_exp(tau2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, chrom_open=False, + backward=True) + if switch >= 2: + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 
0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, + gamma, scale_cc=scale_cc, + chrom_open=False, backward=True) + exp3 = predict_exp(tau_sw2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, chrom_open=False, + backward=True) + elif model == 1: + exp1 = predict_exp(tau1, 1e-3, 1e-3, 1e-3, alpha_c, 0, beta, gamma, + scale_cc=scale_cc, chrom_open=False, backward=True) + if switch >= 1: + exp_sw1 = predict_exp(tau_sw1, 1e-3, 1e-3, 1e-3, alpha_c, 0, beta, + gamma, scale_cc=scale_cc, chrom_open=False, + backward=True) + exp2 = predict_exp(tau2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, chrom_open=False, + backward=True) + if switch >= 2: + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, beta, + gamma, scale_cc=scale_cc, + chrom_open=False, backward=True) + exp3 = predict_exp(tau3, exp_sw2[0, 0], exp_sw2[0, 1], + exp_sw2[0, 2], alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, backward=True) + elif model == 2: + exp1 = predict_exp(tau1, 1e-3, 1e-3, 1e-3, alpha_c, 0, beta, gamma, + scale_cc=scale_cc, chrom_open=False, backward=True) + if switch >= 1: + exp_sw1 = predict_exp(tau_sw1, 1e-3, 1e-3, 1e-3, alpha_c, alpha, + beta, gamma, scale_cc=scale_cc, + chrom_open=False, backward=True) + exp2 = predict_exp(tau2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, 0, beta, gamma, + scale_cc=scale_cc, backward=True) + if switch >= 2: + exp_sw2 = predict_exp(tau_sw2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, 0, beta, gamma, + scale_cc=scale_cc, backward=True) + exp3 = predict_exp(tau3, exp_sw2[0, 0], exp_sw2[0, 1], + exp_sw2[0, 2], alpha_c, alpha, beta, gamma, + scale_cc=scale_cc, backward=True) + return (exp1, exp2, exp3), (exp_sw1, exp_sw2) + + +@njit(locals={ + "res": numba.types.float64[:, ::1], + }, + fastmath=True) +def ss_exp(alpha_c, alpha, beta, gamma, pred_r=True, chrom_open=True): + res = 
np.empty((1, 3)) + if not chrom_open: + res[0, 0] = 0 + res[0, 1] = 0 + res[0, 2] = 0 + else: + res[0, 0] = 1 + if pred_r: + res[0, 1] = alpha / beta + res[0, 2] = alpha / gamma + else: + res[0, 1] = 0 + res[0, 2] = 0 + return res + + +@njit(locals={ + "ss1": numba.types.float64[:, ::1], + "ss2": numba.types.float64[:, ::1], + "ss3": numba.types.float64[:, ::1], + "ss4": numba.types.float64[:, ::1] + }, + fastmath=True) +def compute_ss_exp(alpha_c, alpha, beta, gamma, model=0): + if model == 0: + ss1 = ss_exp(alpha_c, alpha, beta, gamma, pred_r=False) + ss2 = ss_exp(alpha_c, alpha, beta, gamma, pred_r=False, + chrom_open=False) + ss3 = ss_exp(alpha_c, alpha, beta, gamma, chrom_open=False) + ss4 = ss_exp(alpha_c, 0, beta, gamma, chrom_open=False) + elif model == 1: + ss1 = ss_exp(alpha_c, alpha, beta, gamma, pred_r=False) + ss2 = ss_exp(alpha_c, alpha, beta, gamma) + ss3 = ss_exp(alpha_c, alpha, beta, gamma, chrom_open=False) + ss4 = ss_exp(alpha_c, 0, beta, gamma, chrom_open=False) + elif model == 2: + ss1 = ss_exp(alpha_c, alpha, beta, gamma, pred_r=False) + ss2 = ss_exp(alpha_c, alpha, beta, gamma) + ss3 = ss_exp(alpha_c, 0, beta, gamma) + ss4 = ss_exp(alpha_c, 0, beta, gamma, chrom_open=False) + return np.vstack((ss1, ss2, ss3, ss4)) + + +@njit(fastmath=True) +def velocity_equations(c, u, s, alpha_c, alpha, beta, gamma, scale_cc=1, + pred_r=True, chrom_open=True, rna_only=False): + if rna_only: + c = np.full(len(u), 1.0) + if not chrom_open: + alpha_c *= scale_cc + if pred_r: + return -alpha_c * c, alpha * c - beta * u, beta * u - gamma * s + else: + return -alpha_c * c, np.zeros(len(u)), np.zeros(len(u)) + else: + if pred_r: + return (alpha_c - alpha_c * c), (alpha * c - beta * u), (beta * u + - gamma + * s) + else: + return alpha_c - alpha_c * c, np.zeros(len(u)), np.zeros(len(u)) + + +@njit(locals={ + "state0": numba.types.boolean[::1], + "state1": numba.types.boolean[::1], + "state2": numba.types.boolean[::1], + "state3": numba.types.boolean[::1], + "tau1": 
numba.types.float64[::1], + "tau2": numba.types.float64[::1], + "tau3": numba.types.float64[::1], + "tau4": numba.types.float64[::1], + "exp_list": numba.types.Tuple((numba.types.float64[:, ::1], + numba.types.float64[:, ::1], + numba.types.float64[:, ::1], + numba.types.float64[:, ::1])), + "exp_sw_list": numba.types.Tuple((numba.types.float64[:, ::1], + numba.types.float64[:, ::1], + numba.types.float64[:, ::1])), + "c": numba.types.float64[::1], + "u": numba.types.float64[::1], + "s": numba.types.float64[::1], + "vc_vec": numba.types.float64[::1], + "vu_vec": numba.types.float64[::1], + "vs_vec": numba.types.float64[::1] + }, + fastmath=True) +def compute_velocity(t, + t_sw_array, + state, + alpha_c, + alpha, + beta, + gamma, + rescale_c, + rescale_u, + scale_cc=1, + model=1, + total_h=20, + rna_only=False): + + if state is None: + state0 = t <= t_sw_array[0] + state1 = (t_sw_array[0] < t) & (t <= t_sw_array[1]) + state2 = (t_sw_array[1] < t) & (t <= t_sw_array[2]) + state3 = t_sw_array[2] < t + else: + state0 = np.equal(state, 0) + state1 = np.equal(state, 1) + state2 = np.equal(state, 2) + state3 = np.equal(state, 3) + + tau1 = t[state0] + tau2 = t[state1] - t_sw_array[0] + tau3 = t[state2] - t_sw_array[1] + tau4 = t[state3] - t_sw_array[2] + tau_list = [tau1, tau2, tau3, tau4] + switch = np.sum(t_sw_array < total_h) + typed_tau_list = List() + [typed_tau_list.append(x) for x in tau_list] + exp_list, exp_sw_list = generate_exp(typed_tau_list, + t_sw_array[:switch], + alpha_c, + alpha, + beta, + gamma, + model=model, + scale_cc=scale_cc, + rna_only=rna_only) + + c = np.empty(len(t)) + u = np.empty(len(t)) + s = np.empty(len(t)) + for i, ii in enumerate([state0, state1, state2, state3]): + if np.any(ii): + c[ii] = exp_list[i][:, 0] + u[ii] = exp_list[i][:, 1] + s[ii] = exp_list[i][:, 2] + + vc_vec = np.zeros(len(u)) + vu_vec = np.zeros(len(u)) + vs_vec = np.zeros(len(u)) + + if model == 0: + if np.any(state0): + vc_vec[state0], vu_vec[state0], vs_vec[state0] = \ 
+ velocity_equations(c[state0], u[state0], s[state0], alpha_c, + alpha, beta, gamma, pred_r=False, + scale_cc=scale_cc, rna_only=rna_only) + if np.any(state1): + vc_vec[state1], vu_vec[state1], vs_vec[state1] = \ + velocity_equations(c[state1], u[state1], s[state1], alpha_c, + alpha, beta, gamma, pred_r=False, + chrom_open=False, scale_cc=scale_cc, + rna_only=rna_only) + if np.any(state2): + vc_vec[state2], vu_vec[state2], vs_vec[state2] = \ + velocity_equations(c[state2], u[state2], s[state2], alpha_c, + alpha, beta, gamma, chrom_open=False, + scale_cc=scale_cc, rna_only=rna_only) + if np.any(state3): + vc_vec[state3], vu_vec[state3], vs_vec[state3] = \ + velocity_equations(c[state3], u[state3], s[state3], alpha_c, 0, + beta, gamma, chrom_open=False, + scale_cc=scale_cc, rna_only=rna_only) + elif model == 1: + if np.any(state0): + vc_vec[state0], vu_vec[state0], vs_vec[state0] = \ + velocity_equations(c[state0], u[state0], s[state0], alpha_c, + alpha, beta, gamma, pred_r=False, + scale_cc=scale_cc, rna_only=rna_only) + if np.any(state1): + vc_vec[state1], vu_vec[state1], vs_vec[state1] = \ + velocity_equations(c[state1], u[state1], s[state1], alpha_c, + alpha, beta, gamma, scale_cc=scale_cc, + rna_only=rna_only) + if np.any(state2): + vc_vec[state2], vu_vec[state2], vs_vec[state2] = \ + velocity_equations(c[state2], u[state2], s[state2], alpha_c, + alpha, beta, gamma, chrom_open=False, + scale_cc=scale_cc, rna_only=rna_only) + if np.any(state3): + vc_vec[state3], vu_vec[state3], vs_vec[state3] = \ + velocity_equations(c[state3], u[state3], s[state3], alpha_c, 0, + beta, gamma, chrom_open=False, + scale_cc=scale_cc, rna_only=rna_only) + elif model == 2: + if np.any(state0): + vc_vec[state0], vu_vec[state0], vs_vec[state0] = \ + velocity_equations(c[state0], u[state0], s[state0], alpha_c, + alpha, beta, gamma, pred_r=False, + scale_cc=scale_cc, rna_only=rna_only) + if np.any(state1): + vc_vec[state1], vu_vec[state1], vs_vec[state1] = \ + 
velocity_equations(c[state1], u[state1], s[state1], alpha_c, + alpha, beta, gamma, scale_cc=scale_cc, + rna_only=rna_only) + if np.any(state2): + vc_vec[state2], vu_vec[state2], vs_vec[state2] = \ + velocity_equations(c[state2], u[state2], s[state2], alpha_c, + 0, beta, gamma, scale_cc=scale_cc, + rna_only=rna_only) + if np.any(state3): + vc_vec[state3], vu_vec[state3], vs_vec[state3] = \ + velocity_equations(c[state3], u[state3], s[state3], alpha_c, 0, + beta, gamma, chrom_open=False, + scale_cc=scale_cc, rna_only=rna_only) + return vc_vec * rescale_c, vu_vec * rescale_u, vs_vec + + +def log_valid(x): + return np.log(np.clip(x, 1e-3, 1 - 1e-3)) + + +def approx_tau(u, s, u0, s0, alpha, beta, gamma): + if gamma == beta: + gamma -= 1e-3 + u_inf = alpha / beta + if beta > gamma: + b_new = beta / (gamma - beta) + s_inf = alpha / gamma + s_inf_new = s_inf - b_new * u_inf + s_new = s - b_new * u + s0_new = s0 - b_new * u0 + tau = -1.0 / gamma * log_valid((s_new - s_inf_new) / + (s0_new - s_inf_new)) + else: + tau = -1.0 / beta * log_valid((u - u_inf) / (u0 - u_inf)) + return tau + + +def anchor_points(t_sw_array, total_h=20, t=1000, mode='uniform', + return_time=False): + t_ = np.linspace(0, total_h, t) + tau1 = t_[t_ <= t_sw_array[0]] + tau2 = t_[(t_sw_array[0] < t_) & (t_ <= t_sw_array[1])] - t_sw_array[0] + tau3 = t_[(t_sw_array[1] < t_) & (t_ <= t_sw_array[2])] - t_sw_array[1] + tau4 = t_[t_sw_array[2] < t_] - t_sw_array[2] + + if mode == 'log': + if len(tau1) > 0: + tau1 = np.expm1(tau1) + tau1 = tau1 / np.max(tau1) * (t_sw_array[0]) + if len(tau2) > 0: + tau2 = np.expm1(tau2) + tau2 = tau2 / np.max(tau2) * (t_sw_array[1] - t_sw_array[0]) + if len(tau3) > 0: + tau3 = np.expm1(tau3) + tau3 = tau3 / np.max(tau3) * (t_sw_array[2] - t_sw_array[1]) + if len(tau4) > 0: + tau4 = np.expm1(tau4) + tau4 = tau4 / np.max(tau4) * (total_h - t_sw_array[2]) + + tau_list = [tau1, tau2, tau3, tau4] + if return_time: + return t_, tau_list + else: + return tau_list + + +# 
@jit(nopython=True, fastmath=True, debug=True) +def pairwise_distance_square(X, Y): + res = np.empty((X.shape[0], Y.shape[0]), dtype=X.dtype) + for a in range(X.shape[0]): + for b in range(Y.shape[0]): + val = 0.0 + for i in range(X.shape[1]): + tmp = X[a, i] - Y[b, i] + val += tmp**2 + res[a, b] = val + return res + + +def calculate_dist_and_time(c, u, s, + t_sw_array, + alpha_c, alpha, beta, gamma, + rescale_c, rescale_u, + scale_cc=1, + scale_factor=None, + model=1, + conn=None, + t=1000, k=1, + direction='complete', + total_h=20, + rna_only=False, + penalize_gap=True, + all_cells=True): + + n = len(u) + if scale_factor is None: + scale_factor = np.array([np.std(c), np.std(u), np.std(s)]) + tau_list = anchor_points(t_sw_array, total_h, t) + switch = np.sum(t_sw_array < total_h) + typed_tau_list = List() + [typed_tau_list.append(x) for x in tau_list] + alpha_c, alpha, beta, gamma = check_params(alpha_c, alpha, beta, gamma) + exp_list, exp_sw_list = generate_exp(typed_tau_list, + t_sw_array[:switch], + alpha_c, + alpha, + beta, + gamma, + model=model, + scale_cc=scale_cc, + rna_only=rna_only) + rescale_factor = np.array([rescale_c, rescale_u, 1.0]) + exp_list = [x*rescale_factor for x in exp_list] + exp_sw_list = [x*rescale_factor for x in exp_sw_list] + max_c = 0 + max_u = 0 + max_s = 0 + if rna_only: + exp_mat = (np.hstack((np.reshape(u, (-1, 1)), np.reshape(s, (-1, 1)))) + / scale_factor[1:]) + else: + exp_mat = np.hstack((np.reshape(c, (-1, 1)), np.reshape(u, (-1, 1)), + np.reshape(s, (-1, 1)))) / scale_factor + + dists = np.full((n, 4), np.inf) + taus = np.zeros((n, 4), dtype=u.dtype) + ts = np.zeros((n, 4), dtype=u.dtype) + anchor_exp, anchor_t = None, None + + for i in range(switch+1): + if not all_cells: + max_ci = (np.max(exp_list[i][:, 0]) if exp_list[i].shape[0] > 0 + else 0) + max_c = max_ci if max_ci > max_c else max_c + max_ui = np.max(exp_list[i][:, 1]) if exp_list[i].shape[0] > 0 else 0 + max_u = max_ui if max_ui > max_u else max_u + max_si = 
np.max(exp_list[i][:, 2]) if exp_list[i].shape[0] > 0 else 0 + max_s = max_si if max_si > max_s else max_s + + skip_phase = False + if direction == 'off': + if (model in [1, 2]) and (i < 2): + skip_phase = True + elif direction == 'on': + if (model in [1, 2]) and (i >= 2): + skip_phase = True + if rna_only and i == 0: + skip_phase = True + + if not skip_phase: + if rna_only: + tmp = exp_list[i][:, 1:] / scale_factor[1:] + else: + tmp = exp_list[i] / scale_factor + if anchor_exp is None: + anchor_exp = exp_list[i] + anchor_t = (tau_list[i] + t_sw_array[i-1] if i >= 1 + else tau_list[i]) + else: + anchor_exp = np.vstack((anchor_exp, exp_list[i])) + anchor_t = np.hstack((anchor_t, tau_list[i] + t_sw_array[i-1] + if i >= 1 else tau_list[i])) + + if not all_cells: + anchor_dist = np.diff(tmp, axis=0, prepend=np.zeros((1, 2)) + if rna_only else np.zeros((1, 3))) + anchor_dist = np.sqrt((anchor_dist**2).sum(axis=1)) + remove_cand = anchor_dist < (0.01*np.max(exp_mat[1]) + if rna_only + else 0.01*np.max(exp_mat[2])) + step_idx = np.arange(0, len(anchor_dist), 1) % 3 > 0 + remove_cand &= step_idx + keep_idx = np.where(~remove_cand)[0] + tmp = tmp[keep_idx, :] + + tree = KDTree(tmp) + dd, ii = tree.query(exp_mat, k=k) + dd = dd**2 + if k > 1: + dd = np.mean(dd, axis=1) + if conn is not None: + dd = conn.dot(dd) + dists[:, i] = dd + + if not all_cells: + ii = keep_idx[ii] + if k == 1: + taus[:, i] = tau_list[i][ii] + else: + for j in range(n): + taus[j, i] = tau_list[i][ii[j, :]] + ts[:, i] = taus[:, i] + t_sw_array[i-1] if i >= 1 else taus[:, i] + + min_dist = np.min(dists, axis=1) + state_pred = np.argmin(dists, axis=1) + t_pred = ts[np.arange(n), state_pred] + + anchor_t1_list = [] + anchor_t2_list = [] + t_sw_adjust = np.zeros(3, dtype=u.dtype) + + if direction == 'complete': + t_sorted = np.sort(t_pred) + dt = np.diff(t_sorted, prepend=0) + gap_thresh = 3*np.percentile(dt, 99) + idx = np.where(dt > gap_thresh)[0] + for i in idx: + t1 = t_sorted[i-1] if i > 0 else 0 + t2 
= t_sorted[i] + anchor_t1 = anchor_exp[np.argmin(np.abs(anchor_t - t1)), :] + anchor_t2 = anchor_exp[np.argmin(np.abs(anchor_t - t2)), :] + if all_cells: + anchor_t1_list.append(np.ravel(anchor_t1)) + anchor_t2_list.append(np.ravel(anchor_t2)) + if not all_cells: + for j in range(1, switch): + crit1 = ((t1 > t_sw_array[j-1]) and (t2 > t_sw_array[j-1]) + and (t1 <= t_sw_array[j]) + and (t2 <= t_sw_array[j])) + crit2 = ((np.abs(anchor_t1[2] - exp_sw_list[j][0, 2]) + < 0.02 * max_s) and + (np.abs(anchor_t2[2] - exp_sw_list[j][0, 2]) + < 0.01 * max_s)) + crit3 = ((np.abs(anchor_t1[1] - exp_sw_list[j][0, 1]) + < 0.02 * max_u) and + (np.abs(anchor_t2[1] - exp_sw_list[j][0, 1]) + < 0.01 * max_u)) + crit4 = ((np.abs(anchor_t1[0] - exp_sw_list[j][0, 0]) + < 0.02 * max_c) and + (np.abs(anchor_t2[0] - exp_sw_list[j][0, 0]) + < 0.01 * max_c)) + if crit1 and crit2 and crit3 and crit4: + t_sw_adjust[j] += t2 - t1 + if penalize_gap: + dist_gap = np.sum(((anchor_t1[1:] - anchor_t2[1:]) / + scale_factor[1:])**2) + idx_to_adjust = t_pred >= t2 + t_sw_array_ = np.append(t_sw_array, total_h) + state_to_adjust = np.where(t_sw_array_ > t2)[0] + dists[np.ix_(idx_to_adjust, state_to_adjust)] += dist_gap + min_dist = np.min(dists, axis=1) + state_pred = np.argmin(dists, axis=1) + if all_cells: + t_pred = ts[np.arange(n), state_pred] + + if all_cells: + exp_ss_mat = compute_ss_exp(alpha_c, alpha, beta, gamma, model=model) + if rna_only: + exp_ss_mat[:, 0] = 1 + dists_ss = pairwise_distance_square(exp_mat, exp_ss_mat * + rescale_factor / scale_factor) + + reach_ss = np.full((n, 4), False) + for i in range(n): + for j in range(4): + if min_dist[i] > dists_ss[i, j]: + reach_ss[i, j] = True + late_phase = np.full(n, -1) + for i in range(3): + late_phase[np.abs(t_pred - t_sw_array[i]) < 0.1] = i + return min_dist, t_pred, state_pred, reach_ss, late_phase, max_u, \ + max_s, anchor_t1_list, anchor_t2_list + else: + return min_dist, state_pred, max_u, max_s, t_sw_adjust + + +def t_of_c(alpha_c, 
k_c, c_o, c, rescale_factor, sw_t): + + coef = -float(1)/alpha_c + + c_val = np.clip(c / rescale_factor, a_min=0, a_max=1) + + in_log = (float(k_c) - c_val) / float((k_c) - (c_o)) + + epsilon = 1e-9 + + return_val = coef * np.log(in_log + epsilon) + + if k_c == 0: + return_val += sw_t + + return return_val + + +def make_X(c, u, s, + max_u, + max_s, + alpha_c, alpha, beta, gamma, + gene_sw_t, + c0, c_sw1, c_sw2, c_sw3, + u0, u_sw1, u_sw2, u_sw3, + s0, s_sw1, s_sw2, s_sw3, + model, direction, state): + + if direction == "complete": + dire = 0 + elif direction == "on": + dire = 1 + elif direction == "off": + dire = 2 + + n = c.shape[0] + + epsilon = 1e-5 + + if dire == 0: + x = np.concatenate((np.array([c, + np.log(u + epsilon), + np.log(s + epsilon)]), + np.full((n, 17), [np.log(alpha_c + epsilon), + np.log(alpha + epsilon), + np.log(beta + epsilon), + np.log(gamma + epsilon), + c_sw1, c_sw2, c_sw3, + np.log(u_sw2 + epsilon), + np.log(u_sw3 + epsilon), + np.log(s_sw2 + epsilon), + np.log(s_sw3 + epsilon), + np.log(max_u), + np.log(max_s), + gene_sw_t[0], + gene_sw_t[1], + gene_sw_t[2], + model]).T, + np.full((n, 1), state).T + )).T.astype(np.float32) + + elif dire == 1: + x = np.concatenate((np.array([c, + np.log(u + epsilon), + np.log(s + epsilon)]), + np.full((n, 12), [np.log(alpha_c + epsilon), + np.log(alpha + epsilon), + np.log(beta + epsilon), + np.log(gamma + epsilon), + c_sw1, c_sw2, + np.log(u_sw1 + epsilon), + np.log(u_sw2 + epsilon), + np.log(s_sw1 + epsilon), + np.log(s_sw2 + epsilon), + gene_sw_t[0], + model]).T, + np.full((n, 1), state).T + )).T.astype(np.float32) + + elif dire == 2: + if model == 1: + + max_u_t = -(float(1)/alpha_c)*np.log((max_u*beta) + / (alpha*c0[2])) + + x = np.concatenate((np.array([np.log(c + epsilon), + np.log(u + epsilon), + np.log(s + epsilon)]), + np.full((n, 14), [np.log(alpha_c + epsilon), + np.log(alpha + epsilon), + np.log(beta + epsilon), + np.log(gamma + epsilon), + c_sw2, c_sw3, + np.log(u_sw2 + epsilon), + 
np.log(u_sw3 + epsilon), + np.log(s_sw2 + epsilon), + np.log(s_sw3 + epsilon), + max_u_t, + np.log(max_u), + np.log(max_s), + gene_sw_t[2]]).T, + np.full((n, 1), state).T + )).T.astype(np.float32) + elif model == 2: + x = np.concatenate((np.array([c, + np.log(u + epsilon), + np.log(s + epsilon)]), + np.full((n, 12), [np.log(alpha_c + epsilon), + np.log(alpha + epsilon), + np.log(beta + epsilon), + np.log(gamma + epsilon), + c_sw2, c_sw3, + np.log(u_sw2 + epsilon), + np.log(u_sw3 + epsilon), + np.log(s_sw2 + epsilon), + np.log(s_sw3 + epsilon), + np.log(max_u), + gene_sw_t[2]]).T, + np.full((n, 1), state).T + )).T.astype(np.float32) + + return x + + +def calculate_dist_and_time_nn(c, u, s, + max_u, max_s, + t_sw_array, + alpha_c, alpha, beta, gamma, + rescale_c, rescale_u, + ode_model_0, ode_model_1, + ode_model_2_m1, ode_model_2_m2, + device, + scale_cc=1, + scale_factor=None, + model=1, + conn=None, + t=1000, k=1, + direction='complete', + total_h=20, + rna_only=False, + penalize_gap=True, + all_cells=True): + + rescale_factor = np.array([rescale_c, rescale_u, 1.0]) + + exp_list_net, exp_sw_list_net = generate_exp(None, + t_sw_array, + alpha_c, + alpha, + beta, + gamma, + model=model, + scale_cc=scale_cc, + rna_only=rna_only) + + N = len(c) + N_list = np.arange(N) + + if scale_factor is None: + cur_scale_factor = np.array([np.std(c), + np.std(u), + np.std(s)]) + else: + cur_scale_factor = scale_factor + + t_pred_per_state = [] + dists_per_state = [] + + dire = 0 + + if direction == "on": + states = [0, 1] + dire = 1 + + elif direction == "off": + states = [2, 3] + dire = 2 + + else: + states = [0, 1, 2, 3] + dire = 0 + + dists_per_state = np.zeros((N, len(states))) + t_pred_per_state = np.zeros((N, len(states))) + u_pred_per_state = np.zeros((N, len(states))) + s_pred_per_state = np.zeros((N, len(states))) + + increment = 0 + + # determine when we can consider u and s close to zero + zero_us = np.logical_and((u < 0.1 * max_u), (s < 0.1 * max_s)) + + t_pred = 
np.zeros(N) + dists = None + + # pass all the data through the neural net as each valid state + for state in states: + + # when u and s = 0, it's better to use the inverse c equation + # instead of the neural network, which happens for part of + # state 3 and all of state 0 + inverse_c = np.logical_or(state == 0, + np.logical_and(state == 3, zero_us)) + + not_inverse_c = np.logical_not(inverse_c) + + # if we want to use the inverse c equation... + if np.any(inverse_c): + + # find out at what switch time chromatin closes + c_sw_t = t_sw_array[int(model)] + + # figure out whether chromatin is opening/closing and what + # the initial c value is + if state <= model: + k_c = 1 + c_0_for_t_guess = 0 + elif state > model: + k_c = 0 + c_0_for_t_guess = exp_sw_list_net[int(model)][0, 0] + + # calculate predicted time from the inverse c equation + t_pred[inverse_c] = t_of_c(alpha_c, + k_c, c_0_for_t_guess, + c[inverse_c], + rescale_factor[0], + c_sw_t) + + # if there are points where we want to use the neural network... 
+ if np.any(not_inverse_c): + + # create an input matrix from the data + x = make_X(c[not_inverse_c] / rescale_factor[0], + u[not_inverse_c] / rescale_factor[1], + s[not_inverse_c] / rescale_factor[2], + max_u, + max_s, + alpha_c*(scale_cc if state > model else 1), + alpha, beta, gamma, + t_sw_array, + 0, + exp_sw_list_net[0][0, 0], + exp_sw_list_net[1][0, 0], + exp_sw_list_net[2][0, 0], + 0, + exp_sw_list_net[0][0, 1], + exp_sw_list_net[1][0, 1], + exp_sw_list_net[2][0, 1], + 0, + exp_sw_list_net[0][0, 2], + exp_sw_list_net[1][0, 2], + exp_sw_list_net[2][0, 2], + model, direction, state) + + # do a forward pass + if dire == 0: + t_pred_ten = ode_model_0(torch.tensor(x, + dtype=torch.float, + device=device) + .reshape(-1, x.shape[1])) + + elif dire == 1: + t_pred_ten = ode_model_1(torch.tensor(x, + dtype=torch.float, + device=device) + .reshape(-1, x.shape[1])) + + elif dire == 2: + if model == 1: + t_pred_ten = ode_model_2_m1(torch.tensor(x, + dtype=torch.float, + device=device) + .reshape(-1, x.shape[1])) + elif model == 2: + t_pred_ten = ode_model_2_m2(torch.tensor(x, + dtype=torch.float, + device=device) + .reshape(-1, x.shape[1])) + + # make a numpy array out of our tensor of predicted time points + t_pred[not_inverse_c] = (t_pred_ten.cpu().detach().numpy() + .flatten()*21) - 1 + + # calculate tau values from our predicted time points + if state == 0: + t_pred = np.clip(t_pred, a_min=0, a_max=t_sw_array[0]) + tau1 = t_pred + tau2 = [] + tau3 = [] + tau4 = [] + elif state == 1: + tau1 = [] + t_pred = np.clip(t_pred, a_min=t_sw_array[0], a_max=t_sw_array[1]) + tau2 = t_pred - t_sw_array[0] + tau3 = [] + tau4 = [] + elif state == 2: + tau1 = [] + tau2 = [] + t_pred = np.clip(t_pred, a_min=t_sw_array[1], a_max=t_sw_array[2]) + tau3 = t_pred - t_sw_array[1] + tau4 = [] + elif state == 3: + tau1 = [] + tau2 = [] + tau3 = [] + t_pred = np.clip(t_pred, a_min=t_sw_array[2], a_max=20) + tau4 = t_pred - t_sw_array[2] + + tau_list = [tau1, tau2, tau3, tau4] + + valid_vals 
= [] + + for i in range(len(tau_list)): + if len(tau_list[i]) == 0: + tau_list[i] = np.array([0.0]) + else: + valid_vals.append(i) + + # take the time points and get predicted c/u/s values from them + exp_list, exp_sw_list_2 = generate_exp(tau_list, + t_sw_array, + alpha_c, + alpha, + beta, + gamma, + model=model, + scale_cc=scale_cc, + rna_only=rna_only) + + pred_c = np.concatenate([exp_list[x][:, 0] * rescale_factor[0] + for x in valid_vals]) + pred_u = np.concatenate([exp_list[x][:, 1] * rescale_factor[1] + for x in valid_vals]) + pred_s = np.concatenate([exp_list[x][:, 2] * rescale_factor[2] + for x in valid_vals]) + + # calculate distance between predicted and real values + c_diff = (c - pred_c) / cur_scale_factor[0] + u_diff = (u - pred_u) / cur_scale_factor[1] + s_diff = (s - pred_s) / cur_scale_factor[2] + + dists = (c_diff*c_diff) + (u_diff*u_diff) + (s_diff*s_diff) + + if conn is not None: + dists = conn.dot(dists) + + # store the distances, times, and predicted u and s values for + # each state + dists_per_state[:, increment] = dists + t_pred_per_state[:, increment] = t_pred + u_pred_per_state[:, increment] = pred_u + s_pred_per_state[:, increment] = pred_s + + increment += 1 + + # whichever state has the smallest distance for a given data point + # is our predicted state + state_pred = np.argmin(dists_per_state, axis=1) + + # slice dists and predicted time over the correct state + dists = dists_per_state[N_list, state_pred] + t_pred = t_pred_per_state[N_list, state_pred] + + max_t = t_pred.max() + min_t = t_pred.min() + + penalty = 0 + + # for induction and complete genes, add a penalty to ensure that not + # all points are in state 0 + if direction == "on" or direction == "complete": + + if t_sw_array[0] >= max_t: + penalty += (t_sw_array[0] - max_t) + 10 + + # for induction genes, add a penalty to ensure that predicted time + # points are not "out of bounds" by being greater than the + # second switch time + if direction == "on": + + if min_t > 
# @jit(nopython=True, fastmath=True)
def compute_likelihood(c, u, s,
                       t_sw_array,
                       alpha_c, alpha, beta, gamma,
                       rescale_c, rescale_u,
                       t_pred,
                       state_pred,
                       scale_cc=1,
                       scale_factor=None,
                       model=1,
                       weight=None,
                       total_h=20,
                       rna_only=False):
    """Score how well predicted times/states explain the observed values.

    The cells selected by ``weight`` are compared with the values returned
    by ``generate_exp`` for their predicted time and state; the residuals
    are turned into a Gaussian negative log-likelihood per modality and
    exponentiated.

    Parameters
    ----------
    c, u, s : per-cell observations (indexed with ``weight``).
    t_sw_array : switch times; only those below ``total_h`` are passed on.
    alpha_c, alpha, beta, gamma : rate parameters, first passed through
        ``check_params``.
    rescale_c, rescale_u : multiplicative factors applied to the predicted
        c and u columns.
    t_pred, state_pred : per-cell predicted time and state (0-3).
    scale_cc, model, rna_only : forwarded to ``generate_exp``.
    scale_factor : per-modality scale applied to both observation and
        prediction; defaults to ones.
    weight : index/mask selecting the cells to score; defaults to all.
    total_h : time horizon used to count the active switches.

    Returns
    -------
    tuple
        ``(likelihood, likelihood_c, ssd_c, var_c, likelihood_u,
        likelihood_s)``. In ``rna_only`` mode every entry but the first
        keeps its initial value of 0.
    """
    mask = np.full(c.shape, True) if weight is None else weight
    obs_c, obs_u, obs_s = c[mask], u[mask], s[mask]
    times = t_pred[mask]
    states = state_pred[mask]

    n = len(obs_u)
    if scale_factor is None:
        scale_factor = np.ones(3)

    # elapsed time inside each state: state 0 starts at t=0, every later
    # state starts at the preceding switch time
    typed_tau_list = List()
    typed_tau_list.append(times[states == 0])
    for later_state in (1, 2, 3):
        typed_tau_list.append(times[states == later_state]
                              - t_sw_array[later_state - 1])

    switch = np.sum(t_sw_array < total_h)
    alpha_c, alpha, beta, gamma = check_params(alpha_c, alpha, beta, gamma)
    exp_list, _ = generate_exp(typed_tau_list,
                               t_sw_array[:switch],
                               alpha_c,
                               alpha,
                               beta,
                               gamma,
                               model=model,
                               scale_cc=scale_cc,
                               rna_only=rna_only)

    rescale_factor = np.array([rescale_c, rescale_u, 1.0])
    predicted = [state_exp*rescale_factor*scale_factor
                 for state_exp in exp_list]
    observed = np.hstack([np.reshape(values, (-1, 1))
                          for values in (obs_c, obs_u, obs_s)]) * scale_factor

    # residuals are filled state by state; columns follow (c, u, s)
    diffs = np.empty((n, 3), dtype=u.dtype)
    for i in range(switch+1):
        in_state = states == i
        if np.sum(in_state) > 0:
            diffs[in_state, :] = observed[in_state, :] - predicted[i]

    def gaussian_nll(variance, sq_dist):
        # mean negative log-likelihood of a zero-mean Gaussian
        return (0.5 * np.log(2 * np.pi * variance) + 0.5 / n /
                variance * np.sum(sq_dist))

    likelihood_c = 0
    likelihood_u = 0
    likelihood_s = 0
    ssd_c, var_c = 0, 0

    if rna_only:
        # NOTE(review): columns 0 and 1 hold the c and u residuals as
        # assembled above, while the names suggest u and s (columns 1 and 2)
        # may have been intended -- confirm before changing.
        diff_u = np.ravel(diffs[:, 0])
        diff_s = np.ravel(diffs[:, 1])
        dist_us = diff_u ** 2 + diff_s ** 2
        var_us = np.var(np.sign(diff_s) * np.sqrt(dist_us))
        nll = gaussian_nll(var_us, dist_us)
    else:
        diff_c = np.ravel(diffs[:, 0])
        diff_u = np.ravel(diffs[:, 1])
        diff_s = np.ravel(diffs[:, 2])
        dist_c = diff_c ** 2
        dist_u = diff_u ** 2
        dist_s = diff_s ** 2
        var_c = np.var(diff_c)
        var_u = np.var(diff_u)
        var_s = np.var(diff_s)
        ssd_c = np.sum(dist_c)
        nll_c = gaussian_nll(var_c, dist_c)
        nll_u = gaussian_nll(var_u, dist_u)
        nll_s = gaussian_nll(var_s, dist_s)
        nll = nll_c + nll_u + nll_s
        likelihood_c = np.exp(-nll_c)
        likelihood_u = np.exp(-nll_u)
        likelihood_s = np.exp(-nll_s)

    likelihood = np.exp(-nll)
    return likelihood, likelihood_c, ssd_c, var_c, likelihood_u, likelihood_s
self.device = device + self.gene = gene + self.local_std = local_std + self.conn = connectivities + + self.neural_net = neural_net + self.adam = adam + self.adam_lr = adam_lr + self.adam_beta1 = adam_beta1 + self.adam_beta2 = adam_beta2 + self.batch_size = batch_size + + self.torch_type = type(u[0].item()) + + # fitting arguments + self.init_mode = init_mode + self.rna_only = rna_only + self.fit_decoupling = fit_decoupling + self.max_iter = max_iter + self.n_anchors = np.clip(int(fit_args['t']), 201, 2000) + self.k_dist = np.clip(int(fit_args['k']), 1, 20) + self.tm = np.clip(fit_args['thresh_multiplier'], 0.4, 2) + self.weight_c = np.clip(fit_args['weight_c'], 0.1, 5) + self.outlier = np.clip(fit_args['outlier'], 80, 100) + self.model = int(model) if isinstance(model, float) else model + self.model_ = None + if self.model == 0 and self.init_mode == 'invert': + self.init_mode = 'grid' + + # plot parameters + self.plot = plot + self.save_plot = save_plot + self.extra_color = extra_color + self.fig_size = fit_args['fig_size'] + self.point_size = fit_args['point_size'] + if plot_dir is None: + self.plot_path = 'rna_plots' if self.rna_only else 'plots' + else: + self.plot_path = plot_dir + self.color = ['tab:red', 'tab:orange', 'tab:green', 'tab:blue'] + self.fig = None + self.ax = None + + # input + self.total_n = len(u) + if sparse.issparse(c): + c = c.A + if sparse.issparse(u): + u = u.A + if sparse.issparse(s): + s = s.A + self.c_all = np.ravel(np.array(c, dtype=np.float64)) + self.u_all = np.ravel(np.array(u, dtype=np.float64)) + self.s_all = np.ravel(np.array(s, dtype=np.float64)) + + # adjust offset + self.offset_c, self.offset_u, self.offset_s = np.min(self.c_all), \ + np.min(self.u_all), np.min(self.s_all) + self.offset_c = 0 if self.rna_only else self.offset_c + self.c_all -= self.offset_c + self.u_all -= self.offset_u + self.s_all -= self.offset_s + # remove zero counts + self.non_zero = (np.ravel(self.c_all > 0) | np.ravel(self.u_all > 0) | + 
np.ravel(self.s_all > 0)) + # remove outliers + self.non_outlier = np.ravel(self.c_all <= np.percentile(self.c_all, + self.outlier)) + self.non_outlier &= np.ravel(self.u_all <= np.percentile(self.u_all, + self.outlier)) + self.non_outlier &= np.ravel(self.s_all <= np.percentile(self.s_all, + self.outlier)) + self.c = self.c_all[self.non_zero & self.non_outlier] + self.u = self.u_all[self.non_zero & self.non_outlier] + self.s = self.s_all[self.non_zero & self.non_outlier] + self.low_quality = len(self.u) < 10 + # scale modalities + self.std_c, self.std_u, self.std_s = (np.std(self.c_all) + if not self.rna_only + else 1.0, np.std(self.u_all), + np.std(self.s_all)) + if self.std_u == 0 or self.std_s == 0: + self.low_quality = True + self.scale_c, self.scale_u, self.scale_s = np.max(self.c_all) \ + if not self.rna_only else 1.0, self.std_u/self.std_s, 1.0 + + # if we're on neural net mode, check to see if c is way bigger than + # u or s, which would be very hard for the neural net to fit + if not self.low_quality and neural_net: + max_c_orig = np.max(self.c) + if max_c_orig / np.max(self.u) > 500: + self.low_quality = True + + if not self.low_quality: + if max_c_orig / np.max(self.s) > 500: + self.low_quality = True + + self.c_all /= self.scale_c + self.u_all /= self.scale_u + self.s_all /= self.scale_s + self.c /= self.scale_c + self.u /= self.scale_u + self.s /= self.scale_s + self.scale_factor = np.array([np.std(self.c_all) / self.std_s / + self.weight_c, 1.0, 1.0]) + self.scale_factor[0] = 1 if self.rna_only else self.scale_factor[0] + self.max_u, self.max_s = np.max(self.u), np.max(self.s) + self.max_u_all, self.max_s_all = np.max(self.u_all), np.max(self.s_all) + if self.conn is not None: + self.conn_sub = self.conn[np.ix_(self.non_zero & self.non_outlier, + self.non_zero & self.non_outlier)] + else: + self.conn_sub = None + + main_info(f'{len(self.u)} cells passed filter and will be used to ' + 'compute trajectories.', indent_level=2) + self.known_pars = (True 
+ if None not in [rescale_u, alpha, beta, gamma, t_] + else False) + if self.known_pars: + main_info(f'known parameters for gene {self.gene} are ' + f'scaling={rescale_u}, alpha={alpha}, beta={beta},' + f' gamma={gamma}, t_={t_}.', indent_level=1) + + # define neural networks + self.ode_model_0 = nn.Sequential( + nn.Linear(21, 150), + nn.ReLU(), + nn.Linear(150, 112), + nn.ReLU(), + nn.Linear(112, 75), + nn.ReLU(), + nn.Linear(75, 1), + nn.Sigmoid() + ) + + self.ode_model_1 = nn.Sequential( + nn.Linear(16, 64), + nn.ReLU(), + nn.Linear(64, 48), + nn.ReLU(), + nn.Linear(48, 32), + nn.ReLU(), + nn.Linear(32, 1), + nn.Sigmoid() + ) + + self.ode_model_2_m1 = nn.Sequential( + nn.Linear(18, 220), + nn.ReLU(), + nn.Linear(220, 165), + nn.ReLU(), + nn.Linear(165, 110), + nn.ReLU(), + nn.Linear(110, 1), + nn.Sigmoid() + ) + + self.ode_model_2_m2 = nn.Sequential( + nn.Linear(16, 150), + nn.ReLU(), + nn.Linear(150, 112), + nn.ReLU(), + nn.Linear(112, 75), + nn.ReLU(), + nn.Linear(75, 1), + nn.Sigmoid() + ) + + self.ode_model_0.to(torch.device(self.device)) + self.ode_model_1.to(torch.device(self.device)) + self.ode_model_2_m1.to(torch.device(self.device)) + self.ode_model_2_m2.to(torch.device(self.device)) + + # load in neural network + net_path = os.path.dirname(os.path.abspath(__file__)) + \ + "/neural_nets/" + + self.ode_model_0.load_state_dict(torch.load(net_path+"dir0.pt")) + self.ode_model_1.load_state_dict(torch.load(net_path+"dir1.pt")) + self.ode_model_2_m1.load_state_dict(torch.load(net_path+"dir2_m1.pt")) + self.ode_model_2_m2.load_state_dict(torch.load(net_path+"dir2_m2.pt")) + + # 4 rate parameters + self.alpha_c = 0.1 + self.alpha = alpha if alpha is not None else 0.0 + self.beta = beta if beta is not None else 0.0 + self.gamma = gamma if gamma is not None else 0.0 + # 3 possible switch time points + self.t_sw_1 = 0.1 if t_ is not None else 0.0 + self.t_sw_2 = t_+0.1 if t_ is not None else 0.0 + self.t_sw_3 = 20.0 if t_ is not None else 0.0 + # 2 rescale factors 
+ self.rescale_c = 1.0 + self.rescale_u = rescale_u if rescale_u is not None else 1.0 + self.rates = None + self.t_sw_array = None + self.fit_rescale = True if rescale_u is None else False + self.params = None + + # other parameters or results + self.t = None + self.state = None + self.loss = [np.inf] + self.likelihood = -1.0 + self.l_c = 0 + self.ssd_c, self.var_c = 0, 0 + self.scale_cc = 1.0 + self.fitting_flag_ = 0 + self.velocity = None + self.anchor_t1_list, self.anchor_t2_list = None, None + self.anchor_exp = None + self.anchor_exp_sw = None + self.anchor_min_idx, self.anchor_max_idx, self.anchor_velo_min_idx, \ + self.anchor_velo_max_idx = None, None, None, None + self.anchor_velo = None + self.c0 = self.u0 = self.s0 = 0.0 + self.realign_ratio = 1.0 + self.partial = False + self.direction = 'complete' + self.steady_state_func = None + + # for fit and update + self.cur_iter = 0 + self.cur_loss = None + self.cur_state_pred = None + self.cur_t_sw_adjust = None + + # partial checking and model examination + determine_model = model is None + if partial is None and direction is None: + if embed_coord is not None: + self.embed_coord = embed_coord[self.non_zero & + self.non_outlier] + else: + self.embed_coord = None + self.check_partial_trajectory(determine_model=determine_model) + elif direction is not None: + self.direction = direction + if direction in ['on', 'off']: + self.partial = True + else: + self.partial = False + self.check_partial_trajectory(fit_gmm=False, fit_slope=False, + determine_model=determine_model) + elif partial is not None: + self.partial = partial + self.check_partial_trajectory(fit_gmm=False, + determine_model=determine_model) + else: + self.check_partial_trajectory(fit_gmm=False, fit_slope=False, + determine_model=determine_model) + + # intialize steady state parameters + if not self.known_pars and not self.low_quality: + self.initialize_steady_state_params(model_mismatch=self.model + != self.model_) + if self.known_pars: + self.params = 
def anchor_points_ten(self, t_sw_array, total_h=20, t=1000, mode='uniform',
                      return_time=False):
    """Torch version of the anchor-point generator.

    Builds ``t`` evenly spaced time points on ``[0, total_h]`` and splits
    them into four segments at the three switch times. Each segment is
    expressed as time elapsed since the segment started.

    Parameters
    ----------
    t_sw_array : indexable with the three switch times.
    total_h : end of the time axis.
    t : number of anchor points.
    mode : ``'uniform'`` keeps the even spacing. ``'log'`` replaces every
        non-empty segment by ``expm1`` of itself, rescaled so that its
        largest value equals the segment duration.
    return_time : when true, also return the full time vector.

    Returns
    -------
    list of four tensors, or ``(t_, list)`` when ``return_time`` is true.
    Tensors live on ``self.device`` with dtype ``self.torch_type``.
    """
    t_ = torch.linspace(0, total_h, t, device=self.device,
                        dtype=self.torch_type)

    sw1, sw2, sw3 = t_sw_array[0], t_sw_array[1], t_sw_array[2]
    tau_list = [t_[t_ <= sw1],
                t_[(sw1 < t_) & (t_ <= sw2)] - sw1,
                t_[(sw2 < t_) & (t_ <= sw3)] - sw2,
                t_[sw3 < t_] - sw3]

    if mode == 'log':
        durations = [sw1, sw2 - sw1, sw3 - sw2, total_h - sw3]
        for idx, (tau, duration) in enumerate(zip(tau_list, durations)):
            if len(tau) == 0:
                continue
            warped = torch.expm1(tau)
            peak = torch.max(warped)
            # A segment that only contains t=0 has peak == 0; dividing by
            # it would turn the whole segment into NaN, so leave the zeros.
            if peak > 0:
                warped = warped / peak * duration
            tau_list[idx] = warped

    if return_time:
        return t_, tau_list
    return tau_list
(kc - c0) * eat + + if pred_r: + + res1 = u0 * ebt + (alpha * kc / beta) * (1 - ebt) + res1 += const * (ebt - eat) + + res2 = s0 * egt + (alpha * kc / gamma) * (1 - egt) + res2 += ((beta / (gamma - beta)) * + ((alpha * kc / beta) - u0 - const) * (egt - ebt)) + res2 += (beta / (gamma - alpha_c)) * const * (egt - eat) + + else: + res1 = torch.zeros(len(tau), device=self.device, + requires_grad=True, + dtype=self.torch_type) + res2 = torch.zeros(len(tau), device=self.device, + requires_grad=True, + dtype=self.torch_type) + + res = torch.stack((res0, res1, res2), 1) + + return res + + # the torch tensor version of the generate_exp function + def generate_exp_tens(self, + tau_list, + t_sw_array, + alpha_c, + alpha, + beta, + gamma, + scale_cc=None, + model=1, + rna_only=False): + + if scale_cc is None: + scale_cc = torch.tensor(1.0, requires_grad=True, + device=self.device, + dtype=self.torch_type) + + if beta == alpha_c: + beta += 1e-3 + if gamma == beta or gamma == alpha_c: + gamma += 1e-3 + switch = int(t_sw_array.size(dim=0)) + if switch >= 1: + tau_sw1 = torch.tensor([t_sw_array[0]], requires_grad=True, + device=self.device, + dtype=self.torch_type) + if switch >= 2: + tau_sw2 = torch.tensor([t_sw_array[1] - t_sw_array[0]], + requires_grad=True, + device=self.device, + dtype=self.torch_type) + if switch == 3: + tau_sw3 = torch.tensor([t_sw_array[2] - t_sw_array[1]], + requires_grad=True, + device=self.device, + dtype=self.torch_type) + exp_sw1, exp_sw2, exp_sw3 = (torch.empty((0, 3), + requires_grad=True, + device=self.device, + dtype=self.torch_type), + torch.empty((0, 3), + requires_grad=True, + device=self.device, + dtype=self.torch_type), + torch.empty((0, 3), + requires_grad=True, + device=self.device, + dtype=self.torch_type)) + if tau_list is None: + if model == 0: + if switch >= 1: + exp_sw1 = self.predict_exp_ten(tau_sw1, 0, 0, 0, alpha_c, + alpha, beta, gamma, + pred_r=False, + scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 2: + exp_sw2 = 
self.predict_exp_ten(tau_sw2, exp_sw1[0, 0], + exp_sw1[0, 1], + exp_sw1[0, 2], + alpha_c, alpha, beta, + gamma, pred_r=False, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 3: + exp_sw3 = self.predict_exp_ten(tau_sw3, + exp_sw2[0, 0], + exp_sw2[0, 1], + exp_sw2[0, 2], + alpha_c, alpha, + beta, gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + elif model == 1: + if switch >= 1: + exp_sw1 = self.predict_exp_ten(tau_sw1, 0, 0, 0, alpha_c, + alpha, beta, gamma, + pred_r=False, + scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 2: + exp_sw2 = self.predict_exp_ten(tau_sw2, exp_sw1[0, 0], + exp_sw1[0, 1], + exp_sw1[0, 2], + alpha_c, alpha, + beta, gamma, + scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 3: + exp_sw3 = self.predict_exp_ten(tau_sw3, + exp_sw2[0, 0], + exp_sw2[0, 1], + exp_sw2[0, 2], + alpha_c, alpha, + beta, gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + elif model == 2: + if switch >= 1: + exp_sw1 = self.predict_exp_ten(tau_sw1, 0, 0, 0, alpha_c, + alpha, beta, gamma, + pred_r=False, + scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 2: + exp_sw2 = self.predict_exp_ten(tau_sw2, exp_sw1[0, 0], + exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, + alpha, beta, gamma, + scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 3: + exp_sw3 = self.predict_exp_ten(tau_sw3, + exp_sw2[0, 0], + exp_sw2[0, 1], + exp_sw2[0, 2], + alpha_c, 0, beta, + gamma, + scale_cc=scale_cc, + rna_only=rna_only) + + return [torch.empty((0, 3), requires_grad=True, + device=self.device, + dtype=self.torch_type), + torch.empty((0, 3), requires_grad=True, + device=self.device, + dtype=self.torch_type), + torch.empty((0, 3), requires_grad=True, + device=self.device, + dtype=self.torch_type), + torch.empty((0, 3), requires_grad=True, + device=self.device, + dtype=self.torch_type)], \ + [exp_sw1, exp_sw2, exp_sw3] + + tau1 = tau_list[0] + if switch >= 1: + tau2 = tau_list[1] + if switch >= 2: + tau3 = 
tau_list[2] + if switch == 3: + tau4 = tau_list[3] + exp1, exp2, exp3, exp4 = (torch.empty((0, 3), requires_grad=True, + device=self.device, + dtype=self.torch_type), + torch.empty((0, 3), requires_grad=True, + device=self.device, + dtype=self.torch_type), + torch.empty((0, 3), requires_grad=True, + device=self.device, + dtype=self.torch_type), + torch.empty((0, 3), requires_grad=True, + device=self.device, + dtype=self.torch_type)) + if model == 0: + exp1 = self.predict_exp_ten(tau1, 0, 0, 0, alpha_c, alpha, beta, + gamma, pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 1: + exp_sw1 = self.predict_exp_ten(tau_sw1, 0, 0, 0, alpha_c, + alpha, beta, gamma, + pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + exp2 = self.predict_exp_ten(tau2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, + beta, gamma, pred_r=False, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 2: + exp_sw2 = self.predict_exp_ten(tau_sw2, exp_sw1[0, 0], + exp_sw1[0, 1], + exp_sw1[0, 2], + alpha_c, alpha, beta, gamma, + pred_r=False, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + exp3 = self.predict_exp_ten(tau3, exp_sw2[0, 0], + exp_sw2[0, 1], exp_sw2[0, 2], + alpha_c, alpha, beta, gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + if switch == 3: + exp_sw3 = self.predict_exp_ten(tau_sw3, exp_sw2[0, 0], + exp_sw2[0, 1], + exp_sw2[0, 2], + alpha_c, alpha, beta, + gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + exp4 = self.predict_exp_ten(tau4, exp_sw3[0, 0], + exp_sw3[0, 1], + exp_sw3[0, 2], + alpha_c, 0, beta, gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + elif model == 1: + exp1 = self.predict_exp_ten(tau1, 0, 0, 0, alpha_c, alpha, beta, + gamma, pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 1: + exp_sw1 = self.predict_exp_ten(tau_sw1, 0, 0, 0, alpha_c, + alpha, beta, gamma, + pred_r=False, scale_cc=scale_cc, + 
rna_only=rna_only) + exp2 = self.predict_exp_ten(tau2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, + beta, gamma, scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 2: + exp_sw2 = self.predict_exp_ten(tau_sw2, exp_sw1[0, 0], + exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, + alpha, beta, gamma, + scale_cc=scale_cc, + rna_only=rna_only) + exp3 = self.predict_exp_ten(tau3, exp_sw2[0, 0], + exp_sw2[0, 1], exp_sw2[0, 2], + alpha_c, alpha, beta, gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + if switch == 3: + exp_sw3 = self.predict_exp_ten(tau_sw3, exp_sw2[0, 0], + exp_sw2[0, 1], + exp_sw2[0, 2], + alpha_c, alpha, beta, + gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + exp4 = self.predict_exp_ten(tau4, exp_sw3[0, 0], + exp_sw3[0, 1], + exp_sw3[0, 2], alpha_c, 0, + beta, gamma, + chrom_open=False, + scale_cc=scale_cc, + rna_only=rna_only) + elif model == 2: + exp1 = self.predict_exp_ten(tau1, 0, 0, 0, alpha_c, alpha, beta, + gamma, pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 1: + exp_sw1 = self.predict_exp_ten(tau_sw1, 0, 0, 0, alpha_c, + alpha, beta, gamma, + pred_r=False, scale_cc=scale_cc, + rna_only=rna_only) + exp2 = self.predict_exp_ten(tau2, exp_sw1[0, 0], exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, alpha, + beta, gamma, scale_cc=scale_cc, + rna_only=rna_only) + if switch >= 2: + exp_sw2 = self.predict_exp_ten(tau_sw2, exp_sw1[0, 0], + exp_sw1[0, 1], + exp_sw1[0, 2], alpha_c, + alpha, beta, gamma, + scale_cc=scale_cc, + rna_only=rna_only) + exp3 = self.predict_exp_ten(tau3, exp_sw2[0, 0], + exp_sw2[0, 1], + exp_sw2[0, 2], alpha_c, 0, + beta, gamma, scale_cc=scale_cc, + rna_only=rna_only) + if switch == 3: + exp_sw3 = self.predict_exp_ten(tau_sw3, exp_sw2[0, 0], + exp_sw2[0, 1], + exp_sw2[0, 2], + alpha_c, 0, beta, gamma, + scale_cc=scale_cc, + rna_only=rna_only) + exp4 = self.predict_exp_ten(tau4, exp_sw3[0, 0], + exp_sw3[0, 1], + exp_sw3[0, 2], + alpha_c, 0, beta, gamma, + 
def check_partial_trajectory(self, fit_gmm=True, fit_slope=True,
                             determine_model=True):
    """Decide whether the gene shows a partial trajectory and which model fits.

    Works on the filtered ``self.c`` / ``self.u`` / ``self.s`` arrays and
    updates, depending on the path taken, ``self.low_quality``,
    ``self.partial``, ``self.direction``, ``self.steady_state_func``,
    ``self.thickness``, ``self.model_`` and ``self.model``.

    Steps:

    1. Keep cells whose c, u and s are all at least 10% of their maxima;
       fewer than 10 such cells marks the gene low quality.
    2. Optionally fit a two-component Gaussian mixture to pairwise
       embedding distances of cells with spliced counts between the 30th
       and 40th percentile, and use it to set ``self.partial``.
    3. Estimate the steady-state slope from the top-5% u or s cells.
    4. Measure the thickness of the normalised (u, s) cloud as its second
       singular value.
    5. Optionally compare squared residuals above and below the
       steady-state line to choose ``self.direction``.
    6. Derive ``self.model_`` from the direction, or from where the
       high-chromatin cells sit relative to the steady-state line.

    Parameters
    ----------
    fit_gmm : run step 2 (also needs ``self.local_std`` and embedding
        coordinates).
    fit_slope : run step 5.
    determine_model : copy ``self.model_`` into ``self.model``.
    """
    # __init__ only assigns embed_coord on one of its branches, so read it
    # defensively instead of raising AttributeError on the other ones.
    embed_coord = getattr(self, 'embed_coord', None)
    # remember what the caller asked for; fit_gmm itself may be switched
    # off below
    gmm_requested = fit_gmm

    w_non_zero = ((self.c >= 0.1 * np.max(self.c)) &
                  (self.u >= 0.1 * np.max(self.u)) &
                  (self.s >= 0.1 * np.max(self.s)))
    u_non_zero = self.u[w_non_zero]
    s_non_zero = self.s[w_non_zero]
    if len(u_non_zero) < 10:
        self.low_quality = True
        return

    # GMM
    w_low = ((np.percentile(s_non_zero, 30) <= s_non_zero) &
             (s_non_zero <= np.percentile(s_non_zero, 40)))
    if np.sum(w_low) < 10:
        fit_gmm = False
        self.partial = True
    # "Skipping GMM" is only worth reporting when the GMM was asked for
    if gmm_requested:
        if self.local_std is None:
            main_info('local standard deviation not provided. '
                      'Skipping GMM..', indent_level=2)
        if embed_coord is None:
            main_info('Warning: embedded coordinates not provided. '
                      'Skipping GMM..')
    if (fit_gmm and self.local_std is not None
            and embed_coord is not None):

        pdist = pairwise_distances(
            embed_coord[w_non_zero, :][w_low, :])
        dists = (np.ravel(pdist[np.triu_indices_from(pdist, k=1)])
                 .reshape(-1, 1))
        model = GaussianMixture(n_components=2, covariance_type='tied',
                                random_state=2021).fit(dists)
        mean_diff = np.abs(model.means_[1][0] - model.means_[0][0])
        criterion1 = mean_diff > self.local_std / self.tm
        main_info(f'GMM: difference between means = {mean_diff}, '
                  f'threshold = {self.local_std / self.tm}.', indent_level=2)
        criterion2 = np.all(model.weights_[1] > 0.2 / self.tm)
        main_info('GMM: weight of the second Gaussian ='
                  f' {model.weights_[1]}.', indent_level=2)
        self.partial = not (criterion1 and criterion2)
        main_info(f'GMM decides {"" if self.partial else "not "}'
                  'partial.', indent_level=2)

    # steady-state slope
    wu = self.u >= np.percentile(u_non_zero, 95)
    ws = self.s >= np.percentile(s_non_zero, 95)
    ss_u = self.u[wu | ws]
    ss_s = self.s[wu | ws]
    if np.all(ss_u == 0) or np.all(ss_s == 0):
        self.low_quality = True
        return
    gamma = np.dot(ss_u, ss_s) / np.dot(ss_s, ss_s)
    self.steady_state_func = lambda x: gamma*x

    # thickness of phase portrait
    u_norm = u_non_zero / np.max(self.u)
    s_norm = s_non_zero / np.max(self.s)
    exp = np.hstack((np.reshape(u_norm, (-1, 1)),
                     np.reshape(s_norm, (-1, 1))))
    # only the singular values are needed; skipping U avoids building an
    # n_cells x n_cells matrix
    singular_values = np.linalg.svd(exp, compute_uv=False)
    self.thickness = singular_values[1]

    # slope-based direction decision
    with np.errstate(divide='ignore', invalid='ignore'):
        slope = self.u / self.s
    non_nan = ~np.isnan(slope)
    slope = slope[non_nan]
    on = slope >= gamma
    off = slope < gamma
    if len(ss_u) < 10 or len(u_non_zero) < 10:
        fit_slope = False
        self.direction = 'complete'
    if fit_slope:
        slope_ = u_non_zero / s_non_zero
        on_ = slope_ >= gamma
        off_ = slope_ < gamma
        on_dist = np.sum((u_non_zero[on_] - gamma * s_non_zero[on_])**2)
        off_dist = np.sum((gamma * s_non_zero[off_] - u_non_zero[off_])**2)
        main_info(f'Slope: SSE on induction phase = {on_dist},'
                  f' SSE on repression phase = {off_dist}.', indent_level=2)
        narrow = bool(self.thickness < 1.5 / np.sqrt(self.tm))
        main_info(f'Thickness of trajectory = {self.thickness}. '
                  f'Trajectory is {"narrow" if narrow else "normal"}.',
                  indent_level=2)
        if on_dist > 10 * self.tm**2 * off_dist:
            self.direction = 'on'
            self.partial = True
        elif off_dist > 10 * self.tm**2 * on_dist:
            self.direction = 'off'
            self.partial = True
        elif self.partial is True:
            if on_dist > 3 * self.tm * off_dist:
                self.direction = 'on'
            elif off_dist > 3 * self.tm * on_dist:
                self.direction = 'off'
            elif narrow:
                self.direction = 'on'
            else:
                self.direction = 'complete'
                self.partial = False
        elif narrow:
            self.direction = ('off'
                              if off_dist > 2 * self.tm * on_dist
                              else 'on')
            self.partial = True
        else:
            self.direction = 'complete'

    # model pre-determination
    if self.direction == 'on':
        self.model_ = 1
    elif self.direction == 'off':
        self.model_ = 2
    else:
        # progressively relax the definition of "high chromatin" until at
        # least 10 cells qualify (or the last fallback is reached)
        c_high = self.c >= np.mean(self.c) + 2 * np.std(self.c)
        c_high = c_high[non_nan]
        if np.sum(c_high) < 10:
            c_high = self.c >= np.mean(self.c) + np.std(self.c)
            c_high = c_high[non_nan]
        if np.sum(c_high) < 10:
            c_high = self.c >= np.percentile(self.c, 90)
            c_high = c_high[non_nan]
        if np.sum(self.c[non_nan][c_high] == 0) > 0.5*np.sum(c_high):
            self.low_quality = True
            return
        c_high_on = np.sum(c_high & on)
        c_high_off = np.sum(c_high & off)
        if c_high_on > c_high_off:
            self.model_ = 1
        else:
            self.model_ = 2
    if determine_model:
        self.model = self.model_

    if not self.known_pars:
        if fit_gmm or fit_slope:
            main_info(f'predicted partial trajectory: {self.partial}',
                      indent_level=1)
            main_info('predicted trajectory direction:'
                      f'{self.direction}', indent_level=1)
        if determine_model:
            main_info(f'predicted model: {self.model}', indent_level=1)
self.thickness / 5 + else: + s_low, s_high = np.percentile(s_norm[u_mid], [2, 98]) + s_dist = s_high - s_low + self.rescale_u = s_dist + if self.rescale_u == 0: + self.low_quality = True + return + + c = self.c / self.rescale_c + u = self.u / self.rescale_u + s = self.s + + # some extreme values + wu = u >= np.percentile(u, 97) + ws = s >= np.percentile(s, 97) + ss_u = u[wu | ws] + ss_s = s[wu | ws] + c_upper = np.mean(c[wu | ws]) + + c_high = c >= np.mean(c) + # _r stands for repressed state + c0_r = np.mean(c[c_high]) + u0_r = np.mean(ss_u) + s0_r = np.mean(ss_s) + if c0_r < c_upper: + c0_r = c_upper + 0.1 + + # adjust chromatin level for reasonable initialization + if model_mismatch or not self.fit_decoupling: + c_indu = np.mean(c[self.u > self.steady_state_func(self.s)]) + c_repr = np.mean(c[self.u < self.steady_state_func(self.s)]) + if c_indu == np.nan or c_repr == np.nan: + self.low_quality = True + return + c0_r = np.mean(c[c >= np.min([c_indu, c_repr])]) + + # initialize rates + self.alpha_c = 0.1 + self.beta = 1.0 + self.gamma = np.dot(ss_u, ss_s) / np.dot(ss_s, ss_s) + alpha = u0_r + self.alpha = u0_r + self.rates = np.array([self.alpha_c, self.alpha, self.beta, + self.gamma]) + + # RNA-only + if self.rna_only: + t_sw_1 = 0.1 + t_sw_3 = 20.0 + if self.init_mode == 'grid': + + # arange returns sequence [2,6,10,14,18] + for t_sw_2 in np.arange(2, 20, 4, dtype=np.float64): + self.update(self.params, initialize=True, adjust_time=False, + plot=False) + + elif self.init_mode == 'simple': + t_sw_2 = 10 + self.params = np.array([t_sw_1, + t_sw_2-t_sw_1, + t_sw_3-t_sw_2, + self.alpha_c, + self.alpha, + self.beta, + self.gamma, + self.scale_cc, + self.rescale_c, + self.rescale_u]) + + elif self.init_mode == 'invert': + t_sw_2 = approx_tau(u0_r, s0_r, 0, 0, alpha, self.beta, + self.gamma) + if t_sw_2 <= 0.2: + t_sw_2 = 1.0 + elif t_sw_2 >= 19.9: + t_sw_2 = 19.0 + self.params = np.array([t_sw_1, + t_sw_2-t_sw_1, + t_sw_3-t_sw_2, + self.alpha_c, + self.alpha, + 
self.beta, + self.gamma, + self.scale_cc, + self.rescale_c, + self.rescale_u]) + + # chromatin-RNA + else: + if self.init_mode == 'grid': + # arange returns sequence [1,5,9,13,17] + for t_sw_1 in np.arange(1, 18, 4, dtype=np.float64): + # arange returns sequence 2,6,10,14,18 + for t_sw_2 in np.arange(t_sw_1+1, 19, 4, dtype=np.float64): + # arange returns sequence [3,7,11,15,19] + for t_sw_3 in np.arange(t_sw_2+1, 20, 4, + dtype=np.float64): + if not self.fit_decoupling: + t_sw_3 = t_sw_2 + 30 / self.n_anchors + params = np.array([t_sw_1, + t_sw_2-t_sw_1, + t_sw_3-t_sw_2, + self.alpha_c, + self.alpha, + self.beta, + self.gamma, + self.scale_cc, + self.rescale_c, + self.rescale_u]) + self.update(params, initialize=True, + adjust_time=False, plot=False) + if not self.fit_decoupling: + break + + elif self.init_mode == 'simple': + t_sw_1, t_sw_2, t_sw_3 = 5, 10, 15 \ + if not self.fit_decoupling \ + else 10.1 + self.params = np.array([t_sw_1, + t_sw_2-t_sw_1, + t_sw_3-t_sw_2, + self.alpha_c, + self.alpha, + self.beta, + self.gamma, + self.scale_cc, + self.rescale_c, + self.rescale_u]) + + elif self.init_mode == 'invert': + self.alpha = u0_r / c_upper + if model_mismatch or not self.fit_decoupling: + self.alpha = u0_r / c0_r + rna_interval = approx_tau(u0_r, s0_r, 0, 0, alpha, self.beta, + self.gamma) + rna_interval = np.clip(rna_interval, 3, 12) + if self.model == 1: + for t_sw_1 in np.arange(1, rna_interval-1, 2, + dtype=np.float64): + t_sw_3 = rna_interval + t_sw_1 + for t_sw_2 in np.arange(t_sw_1+1, rna_interval, 2, + dtype=np.float64): + if not self.fit_decoupling: + t_sw_2 = t_sw_3 - 30 / self.n_anchors + + alpha_c = -np.log(1 - c0_r) / t_sw_2 + params = np.array([t_sw_1, + t_sw_2-t_sw_1, + t_sw_3-t_sw_2, + alpha_c, + self.alpha, + self.beta, + self.gamma, + self.scale_cc, + self.rescale_c, + self.rescale_u]) + self.update(params, initialize=True, + adjust_time=False, plot=False) + if not self.fit_decoupling: + break + + elif self.model == 2: + for t_sw_1 in 
np.arange(1, rna_interval, 2, + dtype=np.float64): + t_sw_2 = rna_interval + t_sw_1 + for t_sw_3 in np.arange(t_sw_2+1, t_sw_2+6, 2, + dtype=np.float64): + if not self.fit_decoupling: + t_sw_3 = t_sw_2 + 30 / self.n_anchors + + alpha_c = -np.log(1 - c0_r) / t_sw_3 + params = np.array([t_sw_1, + t_sw_2-t_sw_1, + t_sw_3-t_sw_2, + alpha_c, + self.alpha, + self.beta, + self.gamma, + self.scale_cc, + self.rescale_c, + self.rescale_u]) + self.update(params, initialize=True, + adjust_time=False, plot=False) + if not self.fit_decoupling: + break + + self.loss = [self.mse(self.params)] + self.t_sw_array = np.array([self.params[0], + self.params[0]+self.params[1], + self.params[0]+self.params[1] + + self.params[2]]) + self.t_sw_1, self.t_sw_2, self.t_sw_3 = self.t_sw_array + + main_info(f'initial params:\nswitch time array = {self.t_sw_array},' + '\n' + f'rates = {self.rates},\ncc scale = {self.scale_cc},\n' + f'c rescale factor = {self.rescale_c},\n' + f'u rescale factor = {self.rescale_u}', indent_level=1) + main_info(f'initial loss: {self.loss[-1]}', indent_level=1) + + def fit(self): + if self.low_quality: + return self.loss + + if self.plot: + plt.ion() + self.fig = plt.figure(figsize=self.fig_size) + if self.rna_only: + self.ax = self.fig.add_subplot(111) + else: + self.ax = self.fig.add_subplot(111, projection='3d') + + if not self.known_pars: + self.fit_dyn() + + self.update(self.params, perform_update=True, fit_outlier=True, + plot=True) + + # remove long gaps in the last observed state + t_sorted = np.sort(self.t) + dt = np.diff(t_sorted, prepend=0) + mean_dt = np.mean(dt) + std_dt = np.std(dt) + gap_thresh = np.clip(mean_dt+3*std_dt, 3*20/self.n_anchors, None) + if gap_thresh > 0: + idx = np.where(dt > gap_thresh)[0] + gap_sum = 0 + last_t_sw = np.max(self.t_sw_array[self.t_sw_array < 20]) + for i in idx: + t1 = t_sorted[i-1] if i > 0 else 0 + t2 = t_sorted[i] + if t1 > last_t_sw and t2 <= 20: + gap_sum += np.clip(t2 - t1 - mean_dt, 0, None) + if last_t_sw > 
np.max(self.t): + gap_sum += 20 - last_t_sw + realign_ratio = np.clip(20/(20 - gap_sum), None, 20/last_t_sw) + main_info(f'removing gaps and realigning by {realign_ratio}..', + indent_level=1) + self.rates /= realign_ratio + self.alpha_c, self.alpha, self.beta, self.gamma = self.rates + self.params[:3] *= realign_ratio + self.params[3:7] = self.rates + self.t_sw_array = np.array([self.params[0], + self.params[0]+self.params[1], + self.params[0]+self.params[1] + + self.params[2]]) + self.t_sw_1, self.t_sw_2, self.t_sw_3 = self.t_sw_array + self.update(self.params, perform_update=True, fit_outlier=True, + plot=True) + + if self.plot: + plt.ioff() + plt.show(block=True) + + # likelihood + main_info('computing likelihood..', indent_level=1) + keep = self.non_zero & self.non_outlier & \ + (self.u_all > 0.2 * np.percentile(self.u_all, 99.5)) & \ + (self.s_all > 0.2 * np.percentile(self.s_all, 99.5)) + scale_factor = np.array([self.scale_c / self.std_c, + self.scale_u / self.std_u, + self.scale_s / self.std_s]) + if np.sum(keep) >= 10: + self.likelihood, self.l_c, self.ssd_c, self.var_c, l_u, l_s = \ + compute_likelihood(self.c_all, + self.u_all, + self.s_all, + self.t_sw_array, + self.alpha_c, + self.alpha, + self.beta, + self.gamma, + self.rescale_c, + self.rescale_u, + self.t, + self.state, + scale_cc=self.scale_cc, + scale_factor=scale_factor, + model=self.model, + weight=keep, + rna_only=self.rna_only) + else: + self.likelihood, self.l_c, self.ssd_c, self.var_c, l_u = \ + 0, 0, 0, 0, 0 + # TODO: Keep? Remove?? 
+ l_s = 0 + + if not self.rna_only: + main_info(f'likelihood of c: {self.l_c}, likelihood of u: {l_u},' + f' likelihood of s: {l_s}', indent_level=1) + + # velocity + main_info('computing velocities..', indent_level=1) + self.velocity = np.empty((len(self.u_all), 3)) + if self.conn is not None: + new_time = self.conn.dot(self.t) + new_time[new_time > 20] = 20 + new_state = self.state.copy() + new_state[new_time <= self.t_sw_1] = 0 + new_state[(self.t_sw_1 < new_time) & (new_time <= self.t_sw_2)] = 1 + new_state[(self.t_sw_2 < new_time) & (new_time <= self.t_sw_3)] = 2 + new_state[self.t_sw_3 < new_time] = 3 + + else: + new_time = self.t + new_state = self.state + + self.alpha_c, self.alpha, self.beta, self.gamma = \ + check_params(self.alpha_c, self.alpha, self.beta, self.gamma) + vc, vu, vs = compute_velocity(new_time, + self.t_sw_array, + new_state, + self.alpha_c, + self.alpha, + self.beta, + self.gamma, + self.rescale_c, + self.rescale_u, + scale_cc=self.scale_cc, + model=self.model, + rna_only=self.rna_only) + + self.velocity[:, 0] = vc * self.scale_c + self.velocity[:, 1] = vu * self.scale_u + self.velocity[:, 2] = vs * self.scale_s + + # anchor expression and velocity + anchor_time, tau_list = anchor_points(self.t_sw_array, 20, + self.n_anchors, return_time=True) + switch = np.sum(self.t_sw_array < 20) + typed_tau_list = List() + [typed_tau_list.append(x) for x in tau_list] + self.alpha_c, self.alpha, self.beta, self.gamma, \ + self.c0, self.u0, self.s0 = \ + check_params(self.alpha_c, self.alpha, self.beta, self.gamma, + c0=self.c0, u0=self.u0, s0=self.s0) + exp_list, exp_sw_list = generate_exp(typed_tau_list, + self.t_sw_array[:switch], + self.alpha_c, + self.alpha, + self.beta, + self.gamma, + scale_cc=self.scale_cc, + model=self.model, + rna_only=self.rna_only) + rescale_factor = np.array([self.rescale_c, self.rescale_u, 1.0]) + exp_list = [x*rescale_factor for x in exp_list] + exp_sw_list = [x*rescale_factor for x in exp_sw_list] + c = 
np.ravel(np.concatenate([exp_list[x][:, 0] + for x in range(switch+1)])) + u = np.ravel(np.concatenate([exp_list[x][:, 1] + for x in range(switch+1)])) + s = np.ravel(np.concatenate([exp_list[x][:, 2] + for x in range(switch+1)])) + c_sw = np.ravel(np.concatenate([exp_sw_list[x][:, 0] + for x in range(switch)])) + u_sw = np.ravel(np.concatenate([exp_sw_list[x][:, 1] + for x in range(switch)])) + s_sw = np.ravel(np.concatenate([exp_sw_list[x][:, 2] + for x in range(switch)])) + self.alpha_c, self.alpha, self.beta, self.gamma = \ + check_params(self.alpha_c, self.alpha, self.beta, self.gamma) + vc, vu, vs = compute_velocity(anchor_time, + self.t_sw_array, + None, + self.alpha_c, + self.alpha, + self.beta, + self.gamma, + self.rescale_c, + self.rescale_u, + scale_cc=self.scale_cc, + model=self.model, + rna_only=self.rna_only) + + # scale and shift back to original scale + c_ = c * self.scale_c + self.offset_c + u_ = u * self.scale_u + self.offset_u + s_ = s * self.scale_s + self.offset_s + c_sw_ = c_sw * self.scale_c + self.offset_c + u_sw_ = u_sw * self.scale_u + self.offset_u + s_sw_ = s_sw * self.scale_s + self.offset_s + vc = vc * self.scale_c + vu = vu * self.scale_u + vs = vs * self.scale_s + + self.anchor_exp = np.empty((len(u_), 3)) + self.anchor_exp[:, 0], self.anchor_exp[:, 1], self.anchor_exp[:, 2] = \ + c_, u_, s_ + self.anchor_exp_sw = np.empty((len(u_sw_), 3)) + self.anchor_exp_sw[:, 0], self.anchor_exp_sw[:, 1], \ + self.anchor_exp_sw[:, 2] = c_sw_, u_sw_, s_sw_ + self.anchor_velo = np.empty((len(u_), 3)) + self.anchor_velo[:, 0] = vc + self.anchor_velo[:, 1] = vu + self.anchor_velo[:, 2] = vs + self.anchor_velo_min_idx = np.sum(anchor_time < np.min(new_time)) + self.anchor_velo_max_idx = np.sum(anchor_time < np.max(new_time)) - 1 + + if self.save_plot: + main_info('saving plots..', indent_level=1) + self.save_dyn_plot(c_, u_, s_, c_sw_, u_sw_, s_sw_, tau_list) + + self.realign_time_and_velocity(c, u, s, anchor_time) + + main_info(f'final 
params:\nswitch time array = {self.t_sw_array},\n' + f'rates = {self.rates},\ncc scale = {self.scale_cc},\n' + f'c rescale factor = {self.rescale_c},\n' + f'u rescale factor = {self.rescale_u}', + indent_level=1) + main_info(f'final loss: {self.loss[-1]}', indent_level=1) + main_info(f'final likelihood: {self.likelihood}', indent_level=1) + + return self.loss + + # the adam algorithm + # NOTE: The starting point for this function was an example on the + # GeeksForGeeks website. The particular article is linked below: + # www.geeksforgeeks.org/how-to-implement-adam-gradient-descent-from-scratch-using-python/ + def AdamMin(self, x, n_iter, tol, eps=1e-8): + + n = len(x) + + x_ten = torch.tensor(x, requires_grad=True, device=self.device, + dtype=self.torch_type) + + # record lowest loss as a benchmark + # (right now the lowest loss is the current loss) + lowest_loss = torch.tensor(np.array(self.loss[-1], dtype=self.u.dtype), + device=self.device, + dtype=self.torch_type) + + # record the tensor of the parameters that cause the lowest loss + lowest_x_ten = x_ten + + # the m and v variables used in the adam calculations + m = torch.zeros(n, device=self.device, requires_grad=True, + dtype=self.torch_type) + v = torch.zeros(n, device=self.device, requires_grad=True, + dtype=self.torch_type) + + # the update amount to add to the x tensor after the appropriate + # calculations are made + u = torch.ones(n, device=self.device, requires_grad=True, + dtype=self.torch_type) * float("inf") + + # how many times the new loss is lower than the lowest loss + update_count = 0 + + iterations = 0 + + # run the gradient descent updates + for t in range(n_iter): + + iterations += 1 + + # calculate the loss + loss = self.mse_ten(x_ten) + + # if the loss is lower than the lowest loss... 
+ if loss < lowest_loss: + + # record the new best tensor + lowest_x_ten = x_ten + update_count += 1 + + # if the percentage difference in x tensors and loss values + # is less than the tolerance parameter and we've update the + # loss 3 times by now... + if torch.all((torch.abs(u) / lowest_x_ten) < tol) and \ + (torch.abs(loss - lowest_loss) / lowest_loss) < tol and \ + update_count >= 3: + + # ...we've updated enough. Break! + break + + # record the new lowest loss + lowest_loss = loss + + # take the gradient of mse w/r/t our current parameter values + loss.backward(inputs=x_ten) + g = x_ten.grad + + # calculate the new update value using the Adam formula + m = (self.adam_beta1 * m) + ((1.0 - self.adam_beta1) * g) + v = (self.adam_beta2 * v) + ((1.0 - self.adam_beta2) * g * g) + + mhat = m / (1.0 - (self.adam_beta1**(t+1))) + vhat = v / (1.0 - (self.adam_beta2**(t+1))) + + u = -(self.adam_lr * mhat) / (torch.sqrt(vhat) + eps) + + # update the x tensor + x_ten = x_ten + u + + # as long as we've found at least one better x tensor... 
+ if update_count > 1: + + # record the final lowest loss + if loss < lowest_loss: + lowest_loss = loss + + # set the new loss for the gene to the new lowest loss + self.cur_loss = lowest_loss.item() + + # use the update() function so the gene's parameters + # are the new best one we found + updated = self.update(lowest_x_ten.cpu().detach().numpy()) + + # if we never found a better x tensor, then the return value should + # state that we did not update it + else: + updated = False + + # return whether we updated the x tensor or not + return updated + + def fit_dyn(self): + + while self.cur_iter < self.max_iter: + self.cur_iter += 1 + + # RNA-only + if self.rna_only: + main_info('Nelder Mead on t_sw_2 and alpha..', indent_level=2) + self.fitting_flag_ = 0 + if self.cur_iter == 1: + var_test = (self.alpha + + np.array([-2, -1, -0.5, 0.5, 1, 2]) * 0.1 + * self.alpha) + new_params = self.params.copy() + for var in var_test: + new_params[4] = var + self.update(new_params, adjust_time=False, + penalize_gap=False) + res = minimize(self.mse, x0=[self.params[1], self.params[4]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, options={'maxiter': 3}) + + if self.fit_rescale: + main_info('Nelder Mead on t_sw_2, beta, and rescale u..', + indent_level=2) + res = minimize(self.mse, x0=[self.params[1], + self.params[5], + self.params[9]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 5}) + + main_info('Nelder Mead on alpha and gamma..', indent_level=2) + self.fitting_flag_ = 1 + res = minimize(self.mse, x0=[self.params[4], self.params[6]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, options={'maxiter': 3}) + + main_info('Nelder Mead on t_sw_2..', indent_level=2) + res = minimize(self.mse, x0=[self.params[1]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, options={'maxiter': 2}) + + main_info('Full Nelder Mead..', indent_level=2) + res = minimize(self.mse, x0=[self.params[1], self.params[4], + self.params[5], 
self.params[6]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, options={'maxiter': 5}) + + # chromatin-RNA + else: + + if not self.adam: + main_info('Nelder Mead on t_sw_1, chromatin switch time,' + 'and alpha_c..', indent_level=2) + self.fitting_flag_ = 1 + if self.cur_iter == 1: + var_test = (self.gamma + np.array([-1, -0.5, 0.5, 1]) + * 0.1 * self.gamma) + new_params = self.params.copy() + for var in var_test: + new_params[6] = var + self.update(new_params, adjust_time=False) + if self.model == 0 or self.model == 1: + res = minimize(self.mse, x0=[self.params[0], + self.params[1], + self.params[3]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 20}) + elif self.model == 2: + res = minimize(self.mse, x0=[self.params[0], + self.params[2], + self.params[3]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 20}) + + main_info('Nelder Mead on chromatin switch time,' + 'chromatin closing rate scaling, and rescale' + 'c..', indent_level=2) + self.fitting_flag_ = 2 + if self.model == 0 or self.model == 1: + res = minimize(self.mse, x0=[self.params[1], + self.params[7], + self.params[8]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 20}) + elif self.model == 2: + res = minimize(self.mse, x0=[self.params[2], + self.params[7], + self.params[8]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 20}) + + main_info('Nelder Mead on rna switch time and alpha..', + indent_level=2) + self.fitting_flag_ = 1 + if self.model == 0 or self.model == 1: + res = minimize(self.mse, x0=[self.params[2], + self.params[4]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 10}) + elif self.model == 2: + res = minimize(self.mse, x0=[self.params[1], + self.params[4]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 10}) + + main_info('Nelder Mead on rna switch time, beta, and ' + 'rescale u..', 
indent_level=2) + self.fitting_flag_ = 3 + if self.model == 0 or self.model == 1: + res = minimize(self.mse, x0=[self.params[2], + self.params[5], + self.params[9]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 20}) + elif self.model == 2: + res = minimize(self.mse, x0=[self.params[1], + self.params[5], + self.params[9]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 20}) + + main_info('Nelder Mead on alpha and gamma..', indent_level=2) + self.fitting_flag_ = 2 + res = minimize(self.mse, x0=[self.params[4], + self.params[6]], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 10}) + + main_info('Nelder Mead on t_sw..', indent_level=2) + self.fitting_flag_ = 4 + res = minimize(self.mse, x0=self.params[:3], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 20}) + + else: + + main_info('Adam on all parameters', indent_level=2) + self.AdamMin(np.array(self.params, dtype=self.u.dtype), 20, + tol=1e-2) + + main_info('Nelder Mead on t_sw..', indent_level=2) + self.fitting_flag_ = 4 + res = minimize(self.mse, x0=self.params[:3], + method='Nelder-Mead', tol=1e-2, + callback=self.update, + options={'maxiter': 15}) + + main_info(f'iteration {self.cur_iter} finished', indent_level=2) + + def _variables(self, x): + scale_cc = self.scale_cc + rescale_c = self.rescale_c + rescale_u = self.rescale_u + + # RNA-only + if self.rna_only: + if len(x) == 1: # fit t_sw_2 + t3 = np.array([self.t_sw_1, x[0], + self.t_sw_3 - self.t_sw_1 - x[0]]) + r4 = self.rates + + elif len(x) == 2: + if self.fitting_flag_: # fit alpha and gamma + t3 = self.params[:3] + r4 = np.array([self.alpha_c, x[0], self.beta, x[1]]) + else: # fit t_sw_2 and alpha + t3 = np.array([self.t_sw_1, x[0], + self.t_sw_3 - self.t_sw_1 - x[0]]) + r4 = np.array([self.alpha_c, x[1], self.beta, self.gamma]) + + elif len(x) == 3: # fit t_sw_2, beta, and rescale u + t3 = np.array([self.t_sw_1, + x[0], 
self.t_sw_3 - self.t_sw_1 - x[0]]) + r4 = np.array([self.alpha_c, self.alpha, x[1], self.gamma]) + rescale_u = x[2] + + elif len(x) == 4: # fit all + t3 = np.array([self.t_sw_1, x[0], self.t_sw_3 - self.t_sw_1 + - x[0]]) + r4 = np.array([self.alpha_c, x[1], x[2], x[3]]) + + elif len(x) == 10: # all available + t3 = x[:3] + r4 = x[3:7] + scale_cc = x[7] + rescale_c = x[8] + rescale_u = x[9] + + else: + return + + # chromatin-RNA + else: + + if len(x) == 2: + if self.fitting_flag_ == 1: # fit rna switch time and alpha + if self.model == 0 or self.model == 1: + t3 = np.array([self.t_sw_1, self.params[1], x[0]]) + elif self.model == 2: + t3 = np.array([self.t_sw_1, x[0], + self.t_sw_3 - self.t_sw_1 - x[0]]) + r4 = np.array([self.alpha_c, x[1], self.beta, self.gamma]) + elif self.fitting_flag_ == 2: # fit alpha and gamma + t3 = self.params[:3] + r4 = np.array([self.alpha_c, x[0], self.beta, x[1]]) + + elif len(x) == 3: + # fit t_sw_1, chromatin switch time, and alpha_c + if self.fitting_flag_ == 1: + if self.model == 0 or self.model == 1: + t3 = np.array([x[0], x[1], self.t_sw_3 - x[0] - x[1]]) + elif self.model == 2: + t3 = np.array([x[0], self.t_sw_2 - x[0], x[1]]) + r4 = np.array([x[2], self.alpha, self.beta, self.gamma]) + # fit chromatin switch time, chromatin closing rate scaling, + # and rescale c + elif self.fitting_flag_ == 2: + if self.model == 0 or self.model == 1: + t3 = np.array([self.t_sw_1, x[0], + self.t_sw_3 - self.t_sw_1 - x[0]]) + elif self.model == 2: + t3 = np.array([self.t_sw_1, self.params[1], x[0]]) + r4 = self.rates + scale_cc = x[1] + rescale_c = x[2] + # fit rna switch time, beta, and rescale u + elif self.fitting_flag_ == 3: + if self.model == 0 or self.model == 1: + t3 = np.array([self.t_sw_1, self.params[1], x[0]]) + elif self.model == 2: + t3 = np.array([self.t_sw_1, x[0], + self.t_sw_3 - self.t_sw_1 - x[0]]) + r4 = np.array([self.alpha_c, self.alpha, x[1], self.gamma]) + rescale_u = x[2] + # fit three switch times + elif 
self.fitting_flag_ == 4: + t3 = x + r4 = self.rates + + elif len(x) == 7: + t3 = x[:3] + r4 = x[3:] + + elif len(x) == 10: + t3 = x[:3] + r4 = x[3:7] + scale_cc = x[7] + rescale_c = x[8] + rescale_u = x[9] + + else: + return + + # clip to meaningful values + if self.fitting_flag_ and not self.adam: + scale_cc = np.clip(scale_cc, + np.max([0.5*self.scale_cc, 0.25]), + np.min([2*self.scale_cc, 4])) + + if not self.known_pars: + if self.fit_decoupling: + t3 = np.clip(t3, 0.1, None) + else: + t3[2] = 30 / self.n_anchors + t3[:2] = np.clip(t3[:2], 0.1, None) + r4 = np.clip(r4, 0.001, 1000) + rescale_c = np.clip(rescale_c, 0.75, 1.5) + rescale_u = np.clip(rescale_u, 0.2, 3) + + return t3, r4, scale_cc, rescale_c, rescale_u + + # the tensor version of the calculate_dist_and_time function + def calculate_dist_and_time_ten(self, + c, u, s, + t_sw_array, + alpha_c, alpha, beta, gamma, + rescale_c, rescale_u, + scale_cc=1, + scale_factor=None, + model=1, + conn=None, + t=1000, k=1, + direction='complete', + total_h=20, + rna_only=False, + penalize_gap=True, + all_cells=True): + + conn = torch.tensor(conn.todense(), + device=self.device, + dtype=self.torch_type) + + c_ten = torch.tensor(c, device=self.device, dtype=self.torch_type) + u_ten = torch.tensor(u, device=self.device, dtype=self.torch_type) + s_ten = torch.tensor(s, device=self.device, dtype=self.torch_type) + + n = len(u) + if scale_factor is None: + scale_factor_ten = torch.stack((torch.std(c_ten), torch.std(u_ten), + torch.std(s_ten))) + else: + scale_factor_ten = torch.tensor(scale_factor, device=self.device, + dtype=self.torch_type) + + tau_list = self.anchor_points_ten(t_sw_array, total_h, t) + + switch = torch.sum(t_sw_array < total_h) + + exp_list, exp_sw_list = self.generate_exp_tens(tau_list, + t_sw_array[:switch], + alpha_c, + alpha, + beta, + gamma, + model=model, + scale_cc=scale_cc, + rna_only=rna_only) + + rescale_factor = torch.stack((rescale_c, rescale_u, + torch.tensor(1.0, device=self.device, + 
requires_grad=True, + dtype=self.torch_type))) + + for i in range(len(exp_list)): + exp_list[i] = exp_list[i]*rescale_factor + + if i < len(exp_list)-1: + exp_sw_list[i] = exp_sw_list[i]*rescale_factor + + max_c = 0 + max_u = 0 + max_s = 0 + + if rna_only: + exp_mat = (torch.hstack((torch.reshape(u_ten, (-1, 1)), + torch.reshape(s_ten, (-1, 1)))) + / scale_factor_ten[1:]) + else: + exp_mat = torch.hstack((torch.reshape(c_ten, (-1, 1)), + torch.reshape(u_ten, (-1, 1)), + torch.reshape(s_ten, (-1, 1))))\ + / scale_factor_ten + + taus = torch.zeros((1, n), device=self.device, + requires_grad=True, + dtype=self.torch_type) + anchor_exp, anchor_t = None, None + + dists0 = torch.full((1, n), 0.0 if direction == "on" + or direction == "complete" else np.inf, + device=self.device, + requires_grad=True, + dtype=self.torch_type) + dists1 = torch.full((1, n), 0.0 if direction == "on" + or direction == "complete" else np.inf, + device=self.device, + requires_grad=True, + dtype=self.torch_type) + dists2 = torch.full((1, n), 0.0 if direction == "off" + or direction == "complete" else np.inf, + device=self.device, + requires_grad=True, + dtype=self.torch_type) + dists3 = torch.full((1, n), 0.0 if direction == "off" + or direction == "complete" else np.inf, + device=self.device, + requires_grad=True, + dtype=self.torch_type) + + ts0 = torch.zeros((1, n), device=self.device, + requires_grad=True, + dtype=self.torch_type) + ts1 = torch.zeros((1, n), device=self.device, + requires_grad=True, + dtype=self.torch_type) + ts2 = torch.zeros((1, n), device=self.device, + requires_grad=True, + dtype=self.torch_type) + ts3 = torch.zeros((1, n), device=self.device, + requires_grad=True, + dtype=self.torch_type) + + for i in range(switch+1): + + if not all_cells: + max_ci = (torch.max(exp_list[i][:, 0]) + if exp_list[i].shape[0] > 0 + else 0) + max_c = max_ci if max_ci > max_c else max_c + max_ui = torch.max(exp_list[i][:, 1]) if exp_list[i].shape[0] > 0 \ + else 0 + max_u = max_ui if max_ui > 
max_u else max_u + max_si = torch.max(exp_list[i][:, 2]) if exp_list[i].shape[0] > 0 \ + else 0 + max_s = max_si if max_si > max_s else max_s + + skip_phase = False + if direction == 'off': + if (model in [1, 2]) and (i < 2): + skip_phase = True + elif direction == 'on': + if (model in [1, 2]) and (i >= 2): + skip_phase = True + if rna_only and i == 0: + skip_phase = True + + if not skip_phase: + if rna_only: + tmp = exp_list[i][:, 1:] / scale_factor_ten[1:] + else: + tmp = exp_list[i] / scale_factor_ten + if anchor_exp is None: + anchor_exp = exp_list[i] + anchor_t = (tau_list[i] + t_sw_array[i-1] if i >= 1 + else tau_list[i]) + else: + anchor_exp = torch.vstack((anchor_exp, exp_list[i])) + anchor_t = torch.hstack((anchor_t, + tau_list[i] + t_sw_array[i-1] + if i >= 1 else tau_list[i])) + + if not all_cells: + anchor_prepend_rna = torch.zeros((1, 2), + device=self.device, + dtype=self.torch_type) + anchor_prepend_chrom = torch.zeros((1, 3), + device=self.device, + dtype=self.torch_type) + anchor_dist = torch.diff(tmp, dim=0, + prepend=anchor_prepend_rna + if rna_only + else anchor_prepend_chrom) + + anchor_dist = torch.sqrt((anchor_dist*anchor_dist) + .sum(axis=1)) + remove_cand = anchor_dist < (0.01*torch.max(exp_mat[1]) + if rna_only + else + 0.01*torch.max(exp_mat[2])) + step_idx = torch.arange(0, anchor_dist.size()[0], 1, + device=self.device, + dtype=self.torch_type) % 3 > 0 + remove_cand &= step_idx + keep_idx = torch.where(~remove_cand)[0] + + tmp = tmp[keep_idx, :] + from sklearn.neighbors import NearestNeighbors + model = NearestNeighbors(n_neighbors=k, output_type="numpy") + model.fit(tmp.detach()) + dd, ii = model.kneighbors(exp_mat.detach()) + ii = ii.T[0] + + new_dd = ((exp_mat[:, 0] - tmp[ii, 0]) + * (exp_mat[:, 0] - tmp[ii, 0]) + + (exp_mat[:, 1] - tmp[ii, 1]) + * (exp_mat[:, 1] - tmp[ii, 1]) + + (exp_mat[:, 2] - tmp[ii, 2]) + * (exp_mat[:, 2] - tmp[ii, 2])) + + if k > 1: + new_dd = torch.mean(new_dd, dim=1) + if conn is not None: + new_dd = 
torch.matmul(conn, new_dd) + + if i == 0: + dists0 = dists0 + new_dd + elif i == 1: + dists1 = dists1 + new_dd + elif i == 2: + dists2 = dists2 + new_dd + elif i == 3: + dists3 = dists3 + new_dd + + if not all_cells: + ii = keep_idx[ii] + if k == 1: + taus = tau_list[i][ii] + else: + for j in range(n): + taus[j] = tau_list[i][ii[j, :]] + + if i == 0: + ts0 = ts0 + taus + elif i == 1: + ts1 = ts1 + taus + t_sw_array[0] + elif i == 2: + ts2 = ts2 + taus + t_sw_array[1] + elif i == 3: + ts3 = ts3 + taus + t_sw_array[2] + + dists = torch.cat((dists0, dists1, dists2, dists3), 0) + + ts = torch.cat((ts0, ts1, ts2, ts3), 0) + + state_pred = torch.argmin(dists, axis=0) + + t_pred = ts[state_pred, torch.arange(n, device=self.device)] + + anchor_t1_list = [] + anchor_t2_list = [] + + t_sw_adjust = torch.zeros(3, device=self.device, dtype=self.torch_type) + + if direction == 'complete': + + dist_gap_add = torch.zeros((1, n), device=self.device, + dtype=self.torch_type) + + t_sorted = torch.clone(t_pred) + t_sorted, t_sorted_indices = torch.sort(t_sorted) + + dt = torch.diff(t_sorted, dim=0, + prepend=torch.zeros(1, device=self.device, + dtype=self.torch_type)) + + gap_thresh = 3*torch.quantile(dt, 0.99) + + idx = torch.where(dt > gap_thresh)[0] + + if len(idx) > 0 and penalize_gap: + h_tens = torch.tensor([total_h], device=self.device, + dtype=self.torch_type) + + for i in idx: + + t1 = t_sorted[i-1] if i > 0 else 0 + t2 = t_sorted[i] + anchor_t1 = anchor_exp[torch.argmin(torch.abs(anchor_t - t1)), + :] + anchor_t2 = anchor_exp[torch.argmin(torch.abs(anchor_t - t2)), + :] + if all_cells: + anchor_t1_list.append(torch.ravel(anchor_t1)) + anchor_t2_list.append(torch.ravel(anchor_t2)) + if not all_cells: + for j in range(1, switch): + crit1 = ((t1 > t_sw_array[j-1]) + and (t2 > t_sw_array[j-1]) + and (t1 <= t_sw_array[j]) + and (t2 <= t_sw_array[j])) + crit2 = ((torch.abs(anchor_t1[2] + - exp_sw_list[j][0, 2]) + < 0.02 * max_s) and + (torch.abs(anchor_t2[2] + - exp_sw_list[j][0, 
2]) + < 0.01 * max_s)) + crit3 = ((torch.abs(anchor_t1[1] + - exp_sw_list[j][0, 1]) + < 0.02 * max_u) and + (torch.abs(anchor_t2[1] + - exp_sw_list[j][0, 1]) + < 0.01 * max_u)) + crit4 = ((torch.abs(anchor_t1[0] + - exp_sw_list[j][0, 0]) + < 0.02 * max_c) and + (torch.abs(anchor_t2[0] + - exp_sw_list[j][0, 0]) + < 0.01 * max_c)) + if crit1 and crit2 and crit3 and crit4: + t_sw_adjust[j] += t2 - t1 + if penalize_gap: + dist_gap = torch.sum(((anchor_t1[1:] - anchor_t2[1:]) / + scale_factor_ten[1:])**2) + + idx_to_adjust = torch.tensor(t_pred >= t2, + device=self.device) + + idx_to_adjust = torch.reshape(idx_to_adjust, + (1, idx_to_adjust.size()[0])) + + true_tensor = torch.tensor([True], device=self.device) + false_tensor = torch.tensor([False], device=self.device) + + t_sw_array_ = torch.cat((t_sw_array, h_tens), dim=0) + state_to_adjust = torch.where(t_sw_array_ > t2, + true_tensor, false_tensor) + + dist_gap_add[idx_to_adjust] += dist_gap + + if state_to_adjust[0].item(): + dists0 += dist_gap_add + if state_to_adjust[1].item(): + dists1 += dist_gap_add + if state_to_adjust[2].item(): + dists2 += dist_gap_add + if state_to_adjust[3].item(): + dists3 += dist_gap_add + + dist_gap_add[idx_to_adjust] -= dist_gap + + dists = torch.cat((dists0, dists1, dists2, dists3), 0) + + state_pred = torch.argmin(dists, dim=0) + + if all_cells: + t_pred = ts[torch.arange(n, device=self.device), state_pred] + + min_dist = torch.min(dists, dim=0).values + + if all_cells: + exp_ss_mat = compute_ss_exp(alpha_c, alpha, beta, gamma, + model=model) + if rna_only: + exp_ss_mat[:, 0] = 1 + dists_ss = pairwise_distance_square(exp_mat, exp_ss_mat * + rescale_factor / scale_factor) + + reach_ss = np.full((n, 4), False) + for i in range(n): + for j in range(4): + if min_dist[i] > dists_ss[i, j]: + reach_ss[i, j] = True + late_phase = np.full(n, -1) + for i in range(3): + late_phase[torch.abs(t_pred - t_sw_array[i]) < 0.1] = i + + return min_dist, t_pred, state_pred.cpu().detach().numpy(), \ + 
reach_ss, late_phase, max_u, max_s, anchor_t1_list, \ + anchor_t2_list + + else: + return min_dist, state_pred.cpu().detach().numpy(), max_u, max_s, \ + t_sw_adjust.cpu().detach().numpy() + + # the torch tensor version of the mse function + def mse_ten(self, x, fit_outlier=False, + penalize_gap=True): + + t3 = x[:3] + r4 = x[3:7] + scale_cc = x[7] + rescale_c = x[8] + rescale_u = x[9] + + if not self.known_pars: + if self.fit_decoupling: + t3 = torch.clip(t3, 0.1, None) + else: + t3[2] = 30 / self.n_anchors + t3[:2] = torch.clip(t3[:2], 0.1, None) + r4 = torch.clip(r4, 0.001, 1000) + rescale_c = torch.clip(rescale_c, 0.75, 1.5) + rescale_u = torch.clip(rescale_u, 0.2, 3) + + t_sw_array = torch.cumsum(t3, dim=0) + + if self.rna_only: + t_sw_array[2] = 20 + + # conditions for minimum switch time and rate params + penalty = 0 + if any(t3 < 0.2) or any(r4 < 0.005): + penalty = (torch.sum(0.2 - t3[t3 < 0.2]) if self.fit_decoupling + else torch.sum(0.2 - t3[:2][t3[:2] < 0.2])) + penalty += torch.sum(0.005 - r4[r4 < 0.005]) * 1e2 + + # condition for all params + if any(x > 500): + penalty = torch.sum(x[x > 500] - 500) * 1e-2 + + c_array = self.c_all if fit_outlier else self.c + u_array = self.u_all if fit_outlier else self.u + s_array = self.s_all if fit_outlier else self.s + + if self.batch_size is not None and self.batch_size < len(c_array): + + subset_choice = np.random.choice(len(c_array), self.batch_size, + replace=False) + + c_array = c_array[subset_choice] + u_array = u_array[subset_choice] + s_array = s_array[subset_choice] + + if fit_outlier: + conn_for_calc = self.conn[subset_choice] + if not fit_outlier: + conn_for_calc = self.conn_sub[subset_choice] + + conn_for_calc = ((conn_for_calc.T)[subset_choice]).T + + else: + + if fit_outlier: + conn_for_calc = self.conn + if not fit_outlier: + conn_for_calc = self.conn_sub + + scale_factor_func = np.array(self.scale_factor, dtype=self.u.dtype) + + # distances and time assignments + res = 
self.calculate_dist_and_time_ten(c_array, + u_array, + s_array, + t_sw_array, + r4[0], + r4[1], + r4[2], + r4[3], + rescale_c, + rescale_u, + scale_cc=scale_cc, + scale_factor=scale_factor_func, + model=self.model, + direction=self.direction, + conn=conn_for_calc, + k=self.k_dist, + t=self.n_anchors, + rna_only=self.rna_only, + penalize_gap=penalize_gap, + all_cells=fit_outlier) + + if fit_outlier: + min_dist, t_pred, state_pred, reach_ss, late_phase, max_u, max_s, \ + self.anchor_t1_list, self.anchor_t2_list = res + else: + min_dist, state_pred, max_u, max_s, t_sw_adjust = res + + loss = torch.mean(min_dist) + + # avoid exceeding maximum expressions + reg = torch.max(torch.tensor([0, max_s - torch.tensor(self.max_s)], + requires_grad=True, + dtype=self.torch_type))\ + + torch.max(torch.tensor([0, max_u - torch.tensor(self.max_u)], + requires_grad=True, + dtype=self.torch_type)) + + loss += reg + + loss += 1e-1 * penalty + + self.cur_loss = loss.item() + self.cur_state_pred = state_pred + + if fit_outlier: + return loss, t_pred + else: + self.cur_t_sw_adjust = t_sw_adjust + + return loss + + def mse(self, x, fit_outlier=False, penalize_gap=True): + x = np.array(x) + + t3, r4, scale_cc, rescale_c, rescale_u = self._variables(x) + + t_sw_array = np.array([t3[0], t3[0]+t3[1], t3[0]+t3[1]+t3[2]]) + if self.rna_only: + t_sw_array[2] = 20 + + # conditions for minimum switch time and rate params + penalty = 0 + if any(t3 < 0.2) or any(r4 < 0.005): + penalty = (np.sum(0.2 - t3[t3 < 0.2]) if self.fit_decoupling + else np.sum(0.2 - t3[:2][t3[:2] < 0.2])) + penalty += np.sum(0.005 - r4[r4 < 0.005]) * 1e2 + + # condition for all params + if any(x > 500): + penalty = np.sum(x[x > 500] - 500) * 1e-2 + + c_array = self.c_all if fit_outlier else self.c + u_array = self.u_all if fit_outlier else self.u + s_array = self.s_all if fit_outlier else self.s + + if self.neural_net: + + res = calculate_dist_and_time_nn(c_array, + u_array, + s_array, + self.max_u_all if fit_outlier + else 
self.max_u, + self.max_s_all if fit_outlier + else self.max_s, + t_sw_array, + r4[0], + r4[1], + r4[2], + r4[3], + rescale_c, + rescale_u, + self.ode_model_0, + self.ode_model_1, + self.ode_model_2_m1, + self.ode_model_2_m2, + self.device, + scale_cc=scale_cc, + scale_factor=self.scale_factor, + model=self.model, + direction=self.direction, + conn=self.conn if fit_outlier + else self.conn_sub, + k=self.k_dist, + t=self.n_anchors, + rna_only=self.rna_only, + penalize_gap=penalize_gap, + all_cells=fit_outlier) + + if fit_outlier: + min_dist, t_pred, state_pred, max_u, max_s, nn_penalty = res + else: + min_dist, state_pred, max_u, max_s, nn_penalty = res + + penalty += nn_penalty + + t_sw_adjust = [0, 0, 0] + + else: + + # distances and time assignments + res = calculate_dist_and_time(c_array, + u_array, + s_array, + t_sw_array, + r4[0], + r4[1], + r4[2], + r4[3], + rescale_c, + rescale_u, + scale_cc=scale_cc, + scale_factor=self.scale_factor, + model=self.model, + direction=self.direction, + conn=self.conn if fit_outlier + else self.conn_sub, + k=self.k_dist, + t=self.n_anchors, + rna_only=self.rna_only, + penalize_gap=penalize_gap, + all_cells=fit_outlier) + + if fit_outlier: + min_dist, t_pred, state_pred, reach_ss, late_phase, max_u, \ + max_s, self.anchor_t1_list, self.anchor_t2_list = res + else: + min_dist, state_pred, max_u, max_s, t_sw_adjust = res + + loss = np.mean(min_dist) + + # avoid exceeding maximum expressions + reg = np.max([0, max_s - self.max_s]) + np.max([0, max_u - self.max_u]) + loss += reg + + loss += 1e-1 * penalty + self.cur_loss = loss + self.cur_state_pred = state_pred + + if fit_outlier: + return loss, t_pred + else: + self.cur_t_sw_adjust = t_sw_adjust + + return loss + + def update(self, x, perform_update=False, initialize=False, + fit_outlier=False, adjust_time=True, penalize_gap=True, + plot=True): + t3, r4, scale_cc, rescale_c, rescale_u = self._variables(x) + t_sw_array = np.array([t3[0], t3[0]+t3[1], t3[0]+t3[1]+t3[2]]) + + # read 
results + if initialize: + new_loss = self.mse(x, penalize_gap=penalize_gap) + elif fit_outlier: + new_loss, t_pred = self.mse(x, fit_outlier=True, + penalize_gap=penalize_gap) + else: + new_loss = self.cur_loss + t_sw_adjust = self.cur_t_sw_adjust + state_pred = self.cur_state_pred + + if new_loss < self.loss[-1] or perform_update: + perform_update = True + + self.loss.append(new_loss) + self.alpha_c, self.alpha, self.beta, self.gamma = r4 + self.rates = r4 + self.scale_cc = scale_cc + self.rescale_c = rescale_c + self.rescale_u = rescale_u + + # adjust overcrowded anchors + if not fit_outlier and adjust_time: + t_sw_array -= np.cumsum(t_sw_adjust) + if self.rna_only: + t_sw_array[2] = 20 + + self.t_sw_1, self.t_sw_2, self.t_sw_3 = t_sw_array + self.t_sw_array = t_sw_array + self.params = np.array([self.t_sw_1, + self.t_sw_2-self.t_sw_1, + self.t_sw_3-self.t_sw_2, + self.alpha_c, + self.alpha, + self.beta, + self.gamma, + self.scale_cc, + self.rescale_c, + self.rescale_u]) + if not initialize: + self.state = state_pred + if fit_outlier: + self.t = t_pred + + main_info(f'params updated as: {self.t_sw_array} {self.rates} ' + f'{self.scale_cc} {self.rescale_c} {self.rescale_u}', + indent_level=2) + + # interactive plot + if self.plot and plot: + tau_list = anchor_points(self.t_sw_array, 20, self.n_anchors) + switch = np.sum(self.t_sw_array < 20) + typed_tau_list = List() + [typed_tau_list.append(x) for x in tau_list] + self.alpha_c, self.alpha, self.beta, self.gamma, \ + self.c0, self.u0, self.s0 = \ + check_params(self.alpha_c, self.alpha, self.beta, + self.gamma, c0=self.c0, u0=self.u0, + s0=self.s0) + exp_list, exp_sw_list = generate_exp(typed_tau_list, + self.t_sw_array[:switch], + self.alpha_c, + self.alpha, + self.beta, + self.gamma, + scale_cc=self.scale_cc, + model=self.model, + rna_only=self.rna_only) + rescale_factor = np.array([self.rescale_c, + self.rescale_u, + 1.0]) + exp_list = [x*rescale_factor for x in exp_list] + exp_sw_list = [x*rescale_factor for 
x in exp_sw_list] + c = np.ravel(np.concatenate([exp_list[x][:, 0] for x in + range(switch+1)])) + u = np.ravel(np.concatenate([exp_list[x][:, 1] for x in + range(switch+1)])) + s = np.ravel(np.concatenate([exp_list[x][:, 2] for x in + range(switch+1)])) + c_ = self.c_all if fit_outlier else self.c + u_ = self.u_all if fit_outlier else self.u + s_ = self.s_all if fit_outlier else self.s + self.ax.clear() + plt.pause(0.1) + if self.rna_only: + self.ax.scatter(s, u, s=self.point_size*1.5, c='black', + alpha=0.6, zorder=2) + if switch >= 1: + c_sw1, u_sw1, s_sw1 = exp_sw_list[0][0] + self.ax.plot([s_sw1], [u_sw1], "om", + markersize=self.point_size, zorder=5) + if switch >= 2: + c_sw2, u_sw2, s_sw2 = exp_sw_list[1][0] + self.ax.plot([s_sw2], [u_sw2], "Xm", + markersize=self.point_size, zorder=5) + if switch == 3: + c_sw3, u_sw3, s_sw3 = exp_sw_list[2][0] + self.ax.plot([s_sw3], [u_sw3], "Dm", + markersize=self.point_size, zorder=5) + if np.max(self.t) == 20: + self.ax.plot([s[-1]], [u[-1]], "*m", + markersize=self.point_size, zorder=5) + for i in range(4): + if any(self.state == i): + self.ax.scatter(s_[(self.state == i)], + u_[(self.state == i)], + s=self.point_size, c=self.color[i]) + self.ax.set_xlabel('s') + self.ax.set_ylabel('u') + + else: + self.ax.scatter(s, u, c, s=self.point_size*1.5, + c='black', alpha=0.6, zorder=2) + if switch >= 1: + c_sw1, u_sw1, s_sw1 = exp_sw_list[0][0] + self.ax.plot([s_sw1], [u_sw1], [c_sw1], "om", + markersize=self.point_size, zorder=5) + if switch >= 2: + c_sw2, u_sw2, s_sw2 = exp_sw_list[1][0] + self.ax.plot([s_sw2], [u_sw2], [c_sw2], "Xm", + markersize=self.point_size, zorder=5) + if switch == 3: + c_sw3, u_sw3, s_sw3 = exp_sw_list[2][0] + self.ax.plot([s_sw3], [u_sw3], [c_sw3], "Dm", + markersize=self.point_size, zorder=5) + if np.max(self.t) == 20: + self.ax.plot([s[-1]], [u[-1]], [c[-1]], "*m", + markersize=self.point_size, zorder=5) + for i in range(4): + if any(self.state == i): + self.ax.scatter(s_[(self.state == i)], + 
u_[(self.state == i)], + c_[(self.state == i)], + s=self.point_size, c=self.color[i]) + self.ax.set_xlabel('s') + self.ax.set_ylabel('u') + self.ax.set_zlabel('c') + self.fig.canvas.draw() + plt.pause(0.1) + return perform_update + + def save_dyn_plot(self, c, u, s, c_sw, u_sw, s_sw, tau_list, + show_all=False): + if not os.path.exists(self.plot_path): + os.makedirs(self.plot_path) + main_info(f'{self.plot_path} directory created.', indent_level=2) + + switch = np.sum(self.t_sw_array < 20) + scale_back = np.array([self.scale_c, self.scale_u, self.scale_s]) + shift_back = np.array([self.offset_c, self.offset_u, self.offset_s]) + if switch >= 1: + c_sw1, u_sw1, s_sw1 = c_sw[0], u_sw[0], s_sw[0] + if switch >= 2: + c_sw2, u_sw2, s_sw2 = c_sw[1], u_sw[1], s_sw[1] + if switch == 3: + c_sw3, u_sw3, s_sw3 = c_sw[2], u_sw[2], s_sw[2] + + if not show_all: + n_anchors = len(u) + t_lower = np.min(self.t) + t_upper = np.max(self.t) + t_ = np.concatenate((tau_list[0], tau_list[1] + self.t_sw_array[0], + tau_list[2] + self.t_sw_array[1], + tau_list[3] + self.t_sw_array[2])) + c_pre = c[t_[:n_anchors] <= t_lower] + u_pre = u[t_[:n_anchors] <= t_lower] + s_pre = s[t_[:n_anchors] <= t_lower] + c = c[(t_lower < t_[:n_anchors]) & (t_[:n_anchors] < t_upper)] + u = u[(t_lower < t_[:n_anchors]) & (t_[:n_anchors] < t_upper)] + s = s[(t_lower < t_[:n_anchors]) & (t_[:n_anchors] < t_upper)] + + c_all = self.c_all * self.scale_c + self.offset_c + u_all = self.u_all * self.scale_u + self.offset_u + s_all = self.s_all * self.scale_s + self.offset_s + + fig = plt.figure(figsize=self.fig_size) + fig.patch.set_facecolor('white') + ax = fig.add_subplot(111, facecolor='white') + if not show_all and len(u_pre) > 0: + ax.scatter(s_pre, u_pre, s=self.point_size/2, c='black', + alpha=0.4, zorder=2) + ax.scatter(s, u, s=self.point_size*1.5, c='black', alpha=0.6, zorder=2) + for i in range(4): + if any(self.state == i): + ax.scatter(s_all[(self.state == i) & (self.non_outlier)], + u_all[(self.state == 
i) & (self.non_outlier)], + s=self.point_size, c=self.color[i]) + ax.scatter(s_all[~self.non_outlier], u_all[~self.non_outlier], + s=self.point_size/2, c='grey') + if show_all or t_lower <= self.t_sw_array[0]: + ax.plot([s_sw1], [u_sw1], "om", markersize=self.point_size, + zorder=5) + if switch >= 2 and (show_all or (t_lower <= self.t_sw_array[1] and + t_upper >= self.t_sw_array[1])): + ax.plot([s_sw2], [u_sw2], "Xm", markersize=self.point_size, + zorder=5) + if switch >= 3 and (show_all or (t_lower <= self.t_sw_array[2] and + t_upper >= self.t_sw_array[2])): + ax.plot([s_sw3], [u_sw3], "Dm", markersize=self.point_size, + zorder=5) + if np.max(self.t) == 20: + ax.plot([s[-1]], [u[-1]], "*m", markersize=self.point_size, + zorder=5) + if (self.anchor_t1_list is not None and len(self.anchor_t1_list) > 0 + and show_all): + for i in range(len(self.anchor_t1_list)): + exp_t1 = self.anchor_t1_list[i] * scale_back + shift_back + exp_t2 = self.anchor_t2_list[i] * scale_back + shift_back + ax.plot([exp_t1[2]], [exp_t1[1]], "|y", + markersize=self.point_size*1.5) + ax.plot([exp_t2[2]], [exp_t2[1]], "|c", + markersize=self.point_size*1.5) + ax.plot(s_all, + self.steady_state_func(self.s_all) * self.scale_u + + self.offset_u, c='grey', ls=':', lw=self.point_size/4, + alpha=0.7) + ax.set_xlabel('s') + ax.set_ylabel('u') + ax.set_title(f'{self.gene}-{self.model}') + plt.tight_layout() + fig.savefig(f'{self.plot_path}/{self.gene}-{self.model}-us.png', + dpi=fig.dpi, facecolor=fig.get_facecolor(), + transparent=False, edgecolor='none') + plt.close(fig) + plt.pause(0.2) + + if self.extra_color is not None: + fig = plt.figure(figsize=self.fig_size) + fig.patch.set_facecolor('white') + ax = fig.add_subplot(111, facecolor='white') + if not show_all and len(u_pre) > 0: + ax.scatter(s_pre, u_pre, s=self.point_size/2, c='black', + alpha=0.4, zorder=2) + ax.scatter(s, u, s=self.point_size*1.5, c='black', alpha=0.6, + zorder=2) + ax.scatter(s_all, u_all, s=self.point_size, 
c=self.extra_color) + if show_all or t_lower <= self.t_sw_array[0]: + ax.plot([s_sw1], [u_sw1], "om", markersize=self.point_size, + zorder=5) + if switch >= 2 and (show_all or (t_lower <= self.t_sw_array[1] and + t_upper >= self.t_sw_array[1])): + ax.plot([s_sw2], [u_sw2], "Xm", markersize=self.point_size, + zorder=5) + if switch >= 3 and (show_all or (t_lower <= self.t_sw_array[2] and + t_upper >= self.t_sw_array[2])): + ax.plot([s_sw3], [u_sw3], "Dm", markersize=self.point_size, + zorder=5) + if np.max(self.t) == 20: + ax.plot([s[-1]], [u[-1]], "*m", markersize=self.point_size, + zorder=5) + if (self.anchor_t1_list is not None and + len(self.anchor_t1_list) > 0 and show_all): + for i in range(len(self.anchor_t1_list)): + exp_t1 = self.anchor_t1_list[i] * scale_back + shift_back + exp_t2 = self.anchor_t2_list[i] * scale_back + shift_back + ax.plot([exp_t1[2]], [exp_t1[1]], "|y", + markersize=self.point_size*1.5) + ax.plot([exp_t2[2]], [exp_t2[1]], "|c", + markersize=self.point_size*1.5) + ax.plot(s_all, self.steady_state_func(self.s_all) * self.scale_u + + self.offset_u, c='grey', ls=':', lw=self.point_size/4, + alpha=0.7) + ax.set_xlabel('s') + ax.set_ylabel('u') + ax.set_title(f'{self.gene}-{self.model}') + plt.tight_layout() + fig.savefig(f'{self.plot_path}/{self.gene}-{self.model}-' + 'us_colorby_extra.png', dpi=fig.dpi, + facecolor=fig.get_facecolor(), transparent=False, + edgecolor='none') + plt.close(fig) + plt.pause(0.2) + + if not self.rna_only: + fig = plt.figure(figsize=self.fig_size) + fig.patch.set_facecolor('white') + ax = fig.add_subplot(111, facecolor='white') + if not show_all and len(u_pre) > 0: + ax.scatter(u_pre, c_pre, s=self.point_size/2, c='black', + alpha=0.4, zorder=2) + ax.scatter(u, c, s=self.point_size*1.5, c='black', alpha=0.6, + zorder=2) + ax.scatter(u_all, c_all, s=self.point_size, c=self.extra_color) + if show_all or t_lower <= self.t_sw_array[0]: + ax.plot([u_sw1], [c_sw1], "om", markersize=self.point_size, + zorder=5) + if switch 
>= 2 and (show_all or (t_lower <= self.t_sw_array[1] + and t_upper >= + self.t_sw_array[1])): + ax.plot([u_sw2], [c_sw2], "Xm", markersize=self.point_size, + zorder=5) + if switch >= 3 and (show_all or (t_lower <= self.t_sw_array[2] + and t_upper >= + self.t_sw_array[2])): + ax.plot([u_sw3], [c_sw3], "Dm", markersize=self.point_size, + zorder=5) + if np.max(self.t) == 20: + ax.plot([u[-1]], [c[-1]], "*m", markersize=self.point_size, + zorder=5) + ax.set_xlabel('u') + ax.set_ylabel('c') + ax.set_title(f'{self.gene}-{self.model}') + plt.tight_layout() + fig.savefig(f'{self.plot_path}/{self.gene}-{self.model}-' + 'cu_colorby_extra.png', dpi=fig.dpi, + facecolor=fig.get_facecolor(), transparent=False, + edgecolor='none') + plt.close(fig) + plt.pause(0.2) + + if not self.rna_only: + fig = plt.figure(figsize=self.fig_size) + fig.patch.set_facecolor('white') + ax = fig.add_subplot(111, projection='3d', facecolor='white') + if not show_all and len(u_pre) > 0: + ax.scatter(s_pre, u_pre, c_pre, s=self.point_size/2, c='black', + alpha=0.4, zorder=2) + ax.scatter(s, u, c, s=self.point_size*1.5, c='black', alpha=0.6, + zorder=2) + for i in range(4): + if any(self.state == i): + ax.scatter(s_all[(self.state == i) & (self.non_outlier)], + u_all[(self.state == i) & (self.non_outlier)], + c_all[(self.state == i) & (self.non_outlier)], + s=self.point_size, c=self.color[i]) + ax.scatter(s_all[~self.non_outlier], u_all[~self.non_outlier], + c_all[~self.non_outlier], s=self.point_size/2, c='grey') + if show_all or t_lower <= self.t_sw_array[0]: + ax.plot([s_sw1], [u_sw1], [c_sw1], "om", + markersize=self.point_size, zorder=5) + if switch >= 2 and (show_all or (t_lower <= self.t_sw_array[1] and + t_upper >= self.t_sw_array[1])): + ax.plot([s_sw2], [u_sw2], [c_sw2], "Xm", + markersize=self.point_size, zorder=5) + if switch >= 3 and (show_all or (t_lower <= self.t_sw_array[2] and + t_upper >= self.t_sw_array[2])): + ax.plot([s_sw3], [u_sw3], [c_sw3], "Dm", + markersize=self.point_size, 
zorder=5) + if np.max(self.t) == 20: + ax.plot([s[-1]], [u[-1]], [c[-1]], "*m", + markersize=self.point_size, zorder=5) + ax.set_xlabel('s') + ax.set_ylabel('u') + ax.set_zlabel('c') + ax.set_title(f'{self.gene}-{self.model}') + plt.tight_layout() + fig.savefig(f'{self.plot_path}/{self.gene}-{self.model}-cus.png', + dpi=fig.dpi, facecolor=fig.get_facecolor(), + transparent=False, edgecolor='none') + plt.close(fig) + plt.pause(0.2) + + fig = plt.figure(figsize=self.fig_size) + fig.patch.set_facecolor('white') + ax = fig.add_subplot(111, facecolor='white') + if not show_all and len(u_pre) > 0: + ax.scatter(s_pre, u_pre, s=self.point_size/2, c='black', + alpha=0.4, zorder=2) + ax.scatter(s, u, s=self.point_size*1.5, c='black', alpha=0.6, + zorder=2) + ax.scatter(s_all, u_all, s=self.point_size, c=np.log1p(self.c_all), + cmap='coolwarm') + if show_all or t_lower <= self.t_sw_array[0]: + ax.plot([s_sw1], [u_sw1], "om", markersize=self.point_size, + zorder=5) + if switch >= 2 and (show_all or (t_lower <= self.t_sw_array[1] and + t_upper >= self.t_sw_array[1])): + ax.plot([s_sw2], [u_sw2], "Xm", markersize=self.point_size, + zorder=5) + if switch >= 3 and (show_all or (t_lower <= self.t_sw_array[2] and + t_upper >= self.t_sw_array[2])): + ax.plot([s_sw3], [u_sw3], "Dm", markersize=self.point_size, + zorder=5) + if np.max(self.t) == 20: + ax.plot([s[-1]], [u[-1]], "*m", markersize=self.point_size, + zorder=5) + ax.plot(s_all, self.steady_state_func(self.s_all) * self.scale_u + + self.offset_u, c='grey', ls=':', lw=self.point_size/4, + alpha=0.7) + ax.set_xlabel('s') + ax.set_ylabel('u') + ax.set_title(f'{self.gene}-{self.model}') + plt.tight_layout() + fig.savefig(f'{self.plot_path}/{self.gene}-{self.model}-' + 'us_colorby_c.png', dpi=fig.dpi, + facecolor=fig.get_facecolor(), transparent=False, + edgecolor='none') + plt.close(fig) + plt.pause(0.2) + + fig = plt.figure(figsize=self.fig_size) + fig.patch.set_facecolor('white') + ax = fig.add_subplot(111, facecolor='white') + 
if not show_all and len(u_pre) > 0: + ax.scatter(u_pre, c_pre, s=self.point_size/2, c='black', + alpha=0.4, zorder=2) + ax.scatter(u, c, s=self.point_size*1.5, c='black', alpha=0.6, + zorder=2) + for i in range(4): + if any(self.state == i): + ax.scatter(u_all[(self.state == i) & (self.non_outlier)], + c_all[(self.state == i) & (self.non_outlier)], + s=self.point_size, c=self.color[i]) + ax.scatter(u_all[~self.non_outlier], c_all[~self.non_outlier], + s=self.point_size/2, c='grey') + if show_all or t_lower <= self.t_sw_array[0]: + ax.plot([u_sw1], [c_sw1], "om", markersize=self.point_size, + zorder=5) + if switch >= 2 and (show_all or (t_lower <= self.t_sw_array[1] and + t_upper >= self.t_sw_array[1])): + ax.plot([u_sw2], [c_sw2], "Xm", markersize=self.point_size, + zorder=5) + if switch >= 3 and (show_all or (t_lower <= self.t_sw_array[2] and + t_upper >= self.t_sw_array[2])): + ax.plot([u_sw3], [c_sw3], "Dm", markersize=self.point_size, + zorder=5) + if np.max(self.t) == 20: + ax.plot([u[-1]], [c[-1]], "*m", markersize=self.point_size, + zorder=5) + ax.set_xlabel('u') + ax.set_ylabel('c') + ax.set_title(f'{self.gene}-{self.model}') + plt.tight_layout() + fig.savefig(f'{self.plot_path}/{self.gene}-{self.model}-cu.png', + dpi=fig.dpi, facecolor=fig.get_facecolor(), + transparent=False, edgecolor='none') + plt.close(fig) + plt.pause(0.2) + + def get_loss(self): + return self.loss + + def get_model(self): + return self.model + + def get_params(self): + return self.t_sw_array, self.rates, self.scale_cc, self.rescale_c, \ + self.rescale_u, self.realign_ratio + + def is_partial(self): + return self.partial + + def get_direction(self): + return self.direction + + def realign_time_and_velocity(self, c, u, s, anchor_time): + # realign time to range (0,20) + self.anchor_min_idx = np.sum(anchor_time < (np.min(self.t)-1e-5)) + self.anchor_max_idx = np.sum(anchor_time < (np.max(self.t)-1e-5)) + self.c0 = c[self.anchor_min_idx] + self.u0 = u[self.anchor_min_idx] + self.s0 = 
s[self.anchor_min_idx] + self.realign_ratio = 20 / (np.max(self.t) - np.min(self.t)) + main_info(f'fitted params:\nswitch time array = {self.t_sw_array},\n' + f'rates = {self.rates},\ncc scale = {self.scale_cc},\n' + f'c rescale factor = {self.rescale_c},\n' + f'u rescale factor = {self.rescale_u}', + indent_level=1) + main_info(f'aligning to range (0,20) by {self.realign_ratio}..', + indent_level=1) + self.rates /= self.realign_ratio + self.alpha_c, self.alpha, self.beta, self.gamma = self.rates + self.params[3:7] = self.rates + self.t_sw_array = ((self.t_sw_array - np.min(self.t)) + * self.realign_ratio) + self.t_sw_1, self.t_sw_2, self.t_sw_3 = self.t_sw_array + self.params[:3] = np.array([self.t_sw_1, self.t_sw_2 - self.t_sw_1, + self.t_sw_3 - self.t_sw_2]) + self.t -= np.min(self.t) + self.t = self.t * 20 / np.max(self.t) + self.velocity /= self.realign_ratio + self.velocity[:, 0] = np.clip(self.velocity[:, 0], -self.c_all + * self.scale_c, None) + self.velocity[:, 1] = np.clip(self.velocity[:, 1], -self.u_all + * self.scale_u, None) + self.velocity[:, 2] = np.clip(self.velocity[:, 2], -self.s_all + * self.scale_s, None) + self.anchor_velo /= self.realign_ratio + self.anchor_velo[:, 0] = np.clip(self.anchor_velo[:, 0], + -np.max(self.c_all * self.scale_c), + None) + self.anchor_velo[:, 1] = np.clip(self.anchor_velo[:, 1], + -np.max(self.u_all * self.scale_u), + None) + self.anchor_velo[:, 2] = np.clip(self.anchor_velo[:, 2], + -np.max(self.s_all * self.scale_s), + None) + + def get_initial_exp(self): + return np.array([self.c0, self.u0, self.s0]) + + def get_time_assignment(self): + if self.low_quality: + return np.zeros(len(self.u_all)) + return self.t + + def get_state_assignment(self): + if self.low_quality: + return np.zeros(len(self.u_all)) + return self.state + + def get_velocity(self): + if self.low_quality: + return np.zeros((len(self.u_all), 3)) + return self.velocity + + def get_likelihood(self): + return self.likelihood, self.l_c, self.ssd_c, 
self.var_c + + def get_anchors(self): + if self.low_quality: + return (np.zeros((1, 3)), np.zeros((1, 3)), np.zeros((1, 3)), + 0, 0, 0, 0) + return self.anchor_exp, self.anchor_exp_sw, self.anchor_velo, \ + self.anchor_min_idx, self.anchor_max_idx, \ + self.anchor_velo_min_idx, self.anchor_velo_max_idx + + +def regress_func(c, u, s, m, mi, im, dev, nn, ad, lr, b1, b2, bs, gpdist, + embed, conn, pl, sp, pdir, fa, gene, pa, di, ro, fit, fd, + extra, ru, alpha, beta, gamma, t_, verbosity, log_folder, + log_filename): + + settings.VERBOSITY = verbosity + settings.LOG_FOLDER = log_folder + settings.LOG_FILENAME = log_filename + settings.GENE = gene + + if m is not None: + main_info('#########################################################' + '######################################', indent_level=1) + main_info(f'testing model {m}', indent_level=1) + + c_90 = np.percentile(c, 90) + u_90 = np.percentile(u, 90) + s_90 = np.percentile(s, 90) + low_quality = (u_90 == 0 or s_90 == 0) if ro else (c_90 == 0 or u_90 == 0 + or s_90 == 0) + if low_quality: + main_info(f'low quality gene {gene}, skipping', indent_level=1) + return (np.inf, np.nan, '', (np.zeros(3), np.zeros(4), 0, 0, 0, 0), + np.zeros(3), np.zeros(len(u)), np.zeros(len(u)), + np.zeros((len(u), 3)), (-1.0, 0, 0, 0), + (np.zeros((1, 3)), np.zeros((1, 3)), np.zeros((1, 3)), 0, 0, + 0, 0)) + + if gpdist is not None: + subset_cells = s > 0.1 * np.percentile(s, 99) + subset_cells = np.where(subset_cells)[0] + if len(subset_cells) > 3000: + rng = np.random.default_rng(2021) + subset_cells = rng.choice(subset_cells, 3000, replace=False) + local_pdist = gpdist[np.ix_(subset_cells, subset_cells)] + dists = (np.ravel(local_pdist[np.triu_indices_from(local_pdist, k=1)]) + .reshape(-1, 1)) + local_std = np.std(dists) + else: + local_std = None + + cdc = ChromatinDynamical(c, + u, + s, + model=m, + max_iter=mi, + init_mode=im, + device=dev, + neural_net=nn, + adam=ad, + adam_lr=lr, + adam_beta1=b1, + adam_beta2=b2, + 
batch_size=bs, + local_std=local_std, + embed_coord=embed, + connectivities=conn, + plot=pl, + save_plot=sp, + plot_dir=pdir, + fit_args=fa, + gene=gene, + partial=pa, + direction=di, + rna_only=ro, + fit_decoupling=fd, + extra_color=extra, + rescale_u=ru, + alpha=alpha, + beta=beta, + gamma=gamma, + t_=t_) + if fit: + loss = cdc.fit() + if loss[-1] == np.inf: + main_info(f'low quality gene {gene}, skipping..', indent_level=1) + loss = cdc.get_loss() + model = cdc.get_model() + direction = cdc.get_direction() + parameters = cdc.get_params() + initial_exp = cdc.get_initial_exp() + velocity = cdc.get_velocity() + likelihood = cdc.get_likelihood() + time = cdc.get_time_assignment() + state = cdc.get_state_assignment() + anchors = cdc.get_anchors() + return loss[-1], model, direction, parameters, initial_exp, time, state, \ + velocity, likelihood, anchors + + +def multimodel_helper(c, u, s, + model_to_run, + max_iter, + init_mode, + device, + neural_net, + adam, + adam_lr, + adam_beta1, + adam_beta2, + batch_size, + global_pdist, + embed_coord, + conn, + plot, + save_plot, + plot_dir, + fit_args, + gene, + partial, + direction, + rna_only, + fit, + fit_decoupling, + extra_color, + rescale_u, + alpha, + beta, + gamma, + t_, + verbosity, log_folder, log_filename): + """Fit every model in ``model_to_run`` for one gene and pick the best. + + Each candidate is fitted with ``regress_func``; the candidate with the + smallest loss supplies the returned parameters, initial expression, time, + state, velocity, likelihood and anchors. + """ + + loss, param_cand, initial_cand, time_cand = [], [], [], [] + state_cand, velo_cand, likelihood_cand, anch_cand = [], [], [], [] + + for model in model_to_run: + # regress_func declares verbosity, log_folder and log_filename as its + # last three positional parameters, so they must be forwarded here + (loss_m, _, direction_, parameters, initial_exp, + time, state, velocity, likelihood, anchors) = \ + regress_func(c, u, s, model, max_iter, init_mode, device, neural_net, + adam, adam_lr, adam_beta1, adam_beta2, batch_size, + global_pdist, embed_coord, conn, plot, save_plot, + plot_dir, fit_args, gene, partial, direction, rna_only, + fit, fit_decoupling, extra_color, rescale_u, alpha, beta, + gamma, t_, verbosity, log_folder, log_filename) + loss.append(loss_m) + param_cand.append(parameters) + initial_cand.append(initial_exp) + time_cand.append(time) + state_cand.append(state) +
velo_cand.append(velocity) + likelihood_cand.append(likelihood) + anch_cand.append(anchors) + + best_model = np.argmin(loss) + model = np.nan if rna_only else model_to_run[best_model] + parameters = param_cand[best_model] + initial_exp = initial_cand[best_model] + time = time_cand[best_model] + state = state_cand[best_model] + velocity = velo_cand[best_model] + likelihood = likelihood_cand[best_model] + anchors = anch_cand[best_model] + return loss, model, direction_, parameters, initial_exp, time, state, \ + velocity, likelihood, anchors + + +def recover_dynamics_chrom(adata_rna, + adata_atac=None, + gene_list=None, + max_iter=5, + init_mode='invert', + device="cpu", + neural_net=False, + adam=False, + adam_lr=None, + adam_beta1=None, + adam_beta2=None, + batch_size=None, + model_to_run=None, + plot=False, + parallel=True, + n_jobs=None, + save_plot=False, + plot_dir=None, + rna_only=False, + fit=True, + fit_decoupling=True, + extra_color_key=None, + embedding='X_umap', + n_anchors=500, + k_dist=1, + thresh_multiplier=1.0, + weight_c=0.6, + outlier=99.8, + n_pcs=30, + n_neighbors=30, + fig_size=(8, 6), + point_size=7, + partial=None, + direction=None, + rescale_u=None, + alpha=None, + beta=None, + gamma=None, + t_sw=None + ): + + """Multi-omic dynamics recovery. + + This function optimizes the joint chromatin and RNA model parameters in + ODE solutions. + + Parameters + ---------- + adata_rna: :class:`~anndata.AnnData` + RNA anndata object. Required fields: `Mu`, `Ms`, and `connectivities`. + adata_atac: :class:`~anndata.AnnData` (default: `None`) + ATAC anndata object. Required fields: `Mc`. + gene_list: `str`, list of `str` (default: highly variable genes) + Genes to use for model fitting. + max_iter: `int` (default: `5`) + Iterations to run for parameter optimization. + init_mode: `str` (default: `'invert'`) + Initialization method for switch times. + `'invert'`: initial RNA switch time will be computed with scVelo time + inversion method. 
+ `'grid'`: grid search the best set of switch times. + `'simple'`: simply initialize switch times to be 5, 10, and 15. + device: `str` (default: `'cpu'`) + The CUDA device that pytorch tensor calculations will be run on. Only + to be used with Adam or Neural Network mode. + neural_net: `bool` (default: `False`) + Whether to run time predictions with a neural network or not. Shortens + runtime at the expense of accuracy. If False, uses the usual method of + assigning each data point to an anchor time point as outlined in the + Multivelo paper. + adam: `bool` (default: `False`) + Whether MSE minimization is handled by the Adam algorithm or not. When + set to the default of False, function uses Nelder-Mead instead. + adam_lr: `float` (default: `None`) + The learning rate to use with the Adam algorithm. If adam is False, this + value is ignored. + adam_beta1: `float` (default: `None`) + The beta1 parameter for the Adam algorithm. If adam is False, this + value is ignored. + adam_beta2: `float` (default: `None`) + The beta2 parameter for the Adam algorithm. If adam is False, this + value is ignored. + batch_size: `int` (default: `None`) + Speeds up performance using minibatch training. Specifies number of + cells to use per run of MSE when running the Adam algorithm. Ignored + if Adam is set to False. + model_to_run: `int` or list of `int` (default: `None`) + User specified models for each gene. Possible values are 1 and 2. If + `None`, the model + for each gene will be inferred based on expression patterns. If more + than one value is given, + the best model will be decided based on loss of fit. + plot: `bool` or `None` (default: `False`) + Whether to interactively plot the 3D gene portraits. Ignored if + parallel is True. + parallel: `bool` (default: `True`) + Whether to fit genes in a parallel fashion (recommended). + n_jobs: `int` (default: available threads) + Number of parallel jobs.
+ save_plot: `bool` (default: `False`) + Whether to save the fitted gene portrait figures as files. This will + take some disk space. + plot_dir: `str` (default: `plots` for multiome and `rna_plots` for + RNA-only) + Directory to save the plots. + rna_only: `bool` (default: `False`) + Whether to only use RNA for fitting (RNA velocity). + fit: `bool` (default: `True`) + Whether to fit the models. If False, only pre-determination and + initialization will be run. + fit_decoupling: `bool` (default: `True`) + Whether to fit decoupling phase (Model 1 vs Model 2 distinction). + n_anchors: `int` (default: 500) + Number of anchor time-points to generate as a representation of the + trajectory. + k_dist: `int` (default: 1) + Number of anchors to use to determine a cell's gene time. If more than + 1, time will be averaged. + thresh_multiplier: `float` (default: 1.0) + Multiplier for the heuristic threshold of partial versus complete + trajectory pre-determination. + weight_c: `float` (default: 0.6) + Weighting of scaled chromatin distances when performing 3D residual + calculation. + outlier: `float` (default: 99.8) + The percentile to mark as outlier that will be excluded when fitting + the model. + n_pcs: `int` (default: 30) + Number of principal components to compute distance smoothing neighbors. + This can be different from the one used for expression smoothing. + n_neighbors: `int` (default: 30) + Number of nearest neighbors for distance smoothing. + This can be different from the one used for expression smoothing. + fig_size: `tuple` (default: (8,6)) + Size of each figure when saved. + point_size: `float` (default: 7) + Marker point size for plotting. + extra_color_key: `str` (default: `None`) + Extra color key used for plotting. Common choices are `leiden`, + `celltype`, etc. + The colors for each category must be present in one of anndatas, which + can be pre-computed + with `scanpy.pl.scatter` function. 
+ embedding: `str` (default: `X_umap`) + 2D coordinates of the low-dimensional embedding of cells. + partial: `bool` or list of `bool` (default: `None`) + User specified trajectory completeness for each gene. + direction: `str` or list of `str` (default: `None`) + User specified trajectory directionality for each gene. + rescale_u: `float` or list of `float` (default: `None`) + Known scaling factors for unspliced. Can be computed from scVelo + `fit_scaling` values + as `rescale_u = fit_scaling / std(u) * std(s)`. + alpha: `float` or list of `float` (default: `None`) + Known trascription rates. Can be computed from scVelo `fit_alpha` + values + as `alpha = fit_alpha * fit_alignment_scaling`. + beta: `float` or list of `float` (default: `None`) + Known splicing rates. Can be computed from scVelo `fit_alpha` values + as `beta = fit_beta * fit_alignment_scaling`. + gamma: `float` or list of `float` (default: `None`) + Known degradation rates. Can be computed from scVelo `fit_gamma` values + as `gamma = fit_gamma * fit_alignment_scaling`. + t_sw: `float` or list of `float` (default: `None`) + Known RNA switch time. Can be computed from scVelo `fit_t_` values + as `t_sw = fit_t_ / fit_alignment_scaling`. 
+ + Returns + ------- + fit_alpha_c, fit_alpha, fit_beta, fit_gamma: `.var` + inferred chromatin opening, transcription, splicing, and degradation + (nuclear export) rates + fit_t_sw1, fit_t_sw2, fit_t_sw3: `.var` + inferred switching time points + fit_rescale_c, fit_rescale_u: `.var` + inferred scaling factor for chromatin and unspliced counts + fit_scale_cc: `.var` + inferred scaling value for chromatin closing rate compared to opening + rate + fit_alignment_scaling: `.var` + ratio used to realign observed time range to 0-20 + fit_c0, fit_u0, fit_s0: `.var` + initial expression values at earliest observed time + fit_model: `.var` + inferred gene model + fit_direction: `.var` + inferred gene direction + fit_loss: `.var` + loss of model fit + fit_likelihood: `.var` + likelihood of model fit + fit_likelihood_c: `.var` + likelihood of chromatin fit + fit_anchor_c, fit_anchor_u, fit_anchor_s: `.varm` + anchor expressions + fit_anchor_c_sw, fit_anchor_u_sw, fit_anchor_s_sw: `.varm` + switch time-point expressions + fit_anchor_c_velo, fit_anchor_u_velo, fit_anchor_s_velo: `.varm` + velocities of anchors + fit_anchor_min_idx: `.var` + first anchor mapped to observations + fit_anchor_max_idx: `.var` + last anchor mapped to observations + fit_anchor_velo_min_idx: `.var` + first velocity anchor mapped to observations + fit_anchor_velo_max_idx: `.var` + last velocity anchor mapped to observations + fit_t: `.layers` + inferred gene time + fit_state: `.layers` + inferred state assignments + velo_s, velo_u, velo_chrom: `.layers` + velocities in spliced, unspliced, and chromatin space + velo_s_genes, velo_u_genes, velo_chrom_genes: `.var` + velocity genes + velo_s_params, velo_u_params, velo_chrom_params: `.var` + fitting arguments used + ATAC: `.layers` + KNN smoothed chromatin accessibilities copied from adata_atac + """ + + fit_args = {} + fit_args['max_iter'] = max_iter + fit_args['init_mode'] = init_mode + fit_args['fit_decoupling'] = fit_decoupling + n_anchors = 
np.clip(int(n_anchors), 201, 2000) + fit_args['t'] = n_anchors + fit_args['k'] = k_dist + fit_args['thresh_multiplier'] = thresh_multiplier + fit_args['weight_c'] = weight_c + fit_args['outlier'] = outlier + fit_args['n_pcs'] = n_pcs + fit_args['n_neighbors'] = n_neighbors + fit_args['fig_size'] = list(fig_size) + fit_args['point_size'] = point_size + + if adam and neural_net: + raise Exception("ADAM and Neural Net mode can not be run concurently." + " Please choose one to run on.") + + if not adam and not neural_net and not device == "cpu": + raise Exception("Multivelo only uses non-CPU devices for Adam or" + " Neural Network mode. Please use one of those or" + "set the device to \"cpu\"") + + if adam and not device[0:5] == "cuda:": + raise Exception("ADAM and Neural Net mode are only possible on a cuda " + "device. Please try again.") + if not adam and batch_size is not None: + raise Exception("Batch training is for ADAM only, please set " + "batch_size to None") + + if adam: + from cuml.neighbors import NearestNeighbors + + all_genes = adata_rna.var_names + if adata_atac is None: + import anndata as ad + rna_only = True + adata_atac = ad.AnnData(X=np.ones(adata_rna.shape), obs=adata_rna.obs, + var=adata_rna.var) + adata_atac.layers['Mc'] = np.ones(adata_rna.shape) + if adata_rna.shape != adata_atac.shape: + raise ValueError('Shape of RNA and ATAC adata objects do not match: ' + f'{adata_rna.shape} {adata_atac.shape}') + if not np.all(adata_rna.obs_names == adata_atac.obs_names): + raise ValueError('obs_names of RNA and ATAC adata objects do not ' + 'match, please check if they are consistent') + if not np.all(all_genes == adata_atac.var_names): + raise ValueError('var_names of RNA and ATAC adata objects do not ' + 'match, please check if they are consistent') + if 'connectivities' not in adata_rna.obsp.keys(): + raise ValueError('Missing connectivities entry in RNA adata object') + if extra_color_key is None: + extra_color = None + elif 
(isinstance(extra_color_key, str) and extra_color_key in adata_rna.obs + and adata_rna.obs[extra_color_key].dtype.name == 'category'): + ngroups = len(adata_rna.obs[extra_color_key].cat.categories) + extra_color = adata_rna.obs[extra_color_key].cat.rename_categories( + adata_rna.uns[extra_color_key+'_colors'][:ngroups]).to_numpy() + elif (isinstance(extra_color_key, str) and extra_color_key in + adata_atac.obs and + adata_rna.obs[extra_color_key].dtype.name == 'category'): + ngroups = len(adata_atac.obs[extra_color_key].cat.categories) + extra_color = adata_atac.obs[extra_color_key].cat.rename_categories( + adata_atac.uns[extra_color_key+'_colors'][:ngroups]).to_numpy() + else: + raise ValueError('Currently, extra_color_key must be a single string ' + 'of categories and available in adata obs, and its ' + 'colors can be found in adata uns') + if ('connectivities' not in adata_rna.obsp.keys() or + (adata_rna.obsp['connectivities'] > 0).sum(1).min() + > (n_neighbors-1)): + from scanpy import Neighbors + neighbors = Neighbors(adata_rna) + neighbors.compute_neighbors(n_neighbors=n_neighbors, knn=True, + n_pcs=n_pcs) + rna_conn = neighbors.connectivities + else: + rna_conn = adata_rna.obsp['connectivities'].copy() + rna_conn.setdiag(1) + rna_conn = rna_conn.multiply(1.0 / rna_conn.sum(1)).tocsr() + if not rna_only: + if 'connectivities' not in adata_atac.obsp.keys(): + main_info('Missing connectivities in ATAC adata object, using ' + 'RNA connectivities instead', indent_level=1) + atac_conn = rna_conn + else: + atac_conn = adata_atac.obsp['connectivities'].copy() + atac_conn.setdiag(1) + atac_conn = atac_conn.multiply(1.0 / atac_conn.sum(1)).tocsr() + if gene_list is None: + if 'highly_variable' in adata_rna.var: + gene_list = adata_rna.var_names[adata_rna.var['highly_variable']]\ + .values + else: + gene_list = adata_rna.var_names.values[ + (~np.isnan(np.asarray(adata_rna.layers['Mu'].sum(0)) + .reshape(-1) + if sparse.issparse(adata_rna.layers['Mu']) + else 
np.sum(adata_rna.layers['Mu'], axis=0))) + & (~np.isnan(np.asarray(adata_rna.layers['Ms'].sum(0)) + .reshape(-1) + if sparse.issparse(adata_rna.layers['Ms']) + else np.sum(adata_rna.layers['Ms'], axis=0))) + & (~np.isnan(np.asarray(adata_atac.layers['Mc'].sum(0)) + .reshape(-1) + if sparse.issparse(adata_atac.layers['Mc']) + else np.sum(adata_atac.layers['Mc'], axis=0)))] + elif isinstance(gene_list, (list, np.ndarray, pd.Index, pd.Series)): + gene_list = np.array([x for x in gene_list if x in all_genes]) + elif isinstance(gene_list, str): + gene_list = np.array([gene_list]) if gene_list in all_genes else [] + else: + raise ValueError('Invalid gene list, must be one of (str, np.ndarray,' + 'pd.Index, pd.Series)') + gn = len(gene_list) + if gn == 0: + raise ValueError('None of the genes specified are in the adata object') + main_info(f'{gn} genes will be fitted', indent_level=1) + + models = np.zeros(gn) + t_sws = np.zeros((gn, 3)) + rates = np.zeros((gn, 4)) + scale_ccs = np.zeros(gn) + rescale_cs = np.zeros(gn) + rescale_us = np.zeros(gn) + realign_ratios = np.zeros(gn) + initial_exps = np.zeros((gn, 3)) + times = np.zeros((adata_rna.n_obs, gn)) + states = np.zeros((adata_rna.n_obs, gn)) + if not rna_only: + velo_c = np.zeros((adata_rna.n_obs, gn)) + velo_u = np.zeros((adata_rna.n_obs, gn)) + velo_s = np.zeros((adata_rna.n_obs, gn)) + likelihoods = np.zeros(gn) + l_cs = np.zeros(gn) + ssd_cs = np.zeros(gn) + var_cs = np.zeros(gn) + directions = [] + anchor_c = np.zeros((n_anchors, gn)) + anchor_u = np.zeros((n_anchors, gn)) + anchor_s = np.zeros((n_anchors, gn)) + anchor_c_sw = np.zeros((3, gn)) + anchor_u_sw = np.zeros((3, gn)) + anchor_s_sw = np.zeros((3, gn)) + anchor_vc = np.zeros((n_anchors, gn)) + anchor_vu = np.zeros((n_anchors, gn)) + anchor_vs = np.zeros((n_anchors, gn)) + anchor_min_idx = np.zeros(gn) + anchor_max_idx = np.zeros(gn) + anchor_velo_min_idx = np.zeros(gn) + anchor_velo_max_idx = np.zeros(gn) + + if rna_only: + model_to_run = [2] + 
main_info('Skipping model checking for RNA-only, running model 2', + indent_level=1) + + m_per_g = False + if model_to_run is not None: + if isinstance(model_to_run, (list, np.ndarray, pd.Index, pd.Series)): + model_to_run = [int(x) for x in model_to_run] + if np.any(~np.isin(model_to_run, [0, 1, 2])): + raise ValueError('Invalid model number (must be values in' + ' [0,1,2])') + if len(model_to_run) == gn: + losses = np.zeros((gn, 1)) + m_per_g = True + func_to_call = regress_func + else: + losses = np.zeros((gn, len(model_to_run))) + func_to_call = multimodel_helper + elif isinstance(model_to_run, (int, float)): + model_to_run = int(model_to_run) + if not np.isin(model_to_run, [0, 1, 2]): + raise ValueError('Invalid model number (must be values in ' + '[0,1,2])') + model_to_run = [model_to_run] + losses = np.zeros((gn, 1)) + func_to_call = multimodel_helper + else: + raise ValueError('Invalid model number (must be values in ' + '[0,1,2])') + else: + losses = np.zeros((gn, 1)) + func_to_call = regress_func + + p_per_g = False + if partial is not None: + if isinstance(partial, (list, np.ndarray, pd.Index, pd.Series)): + if np.any(~np.isin(partial, [True, False])): + raise ValueError('Invalid partial argument (must be values in' + ' [True,False])') + if len(partial) == gn: + p_per_g = True + else: + raise ValueError('Incorrect partial argument length') + elif isinstance(partial, bool): + if not np.isin(partial, [True, False]): + raise ValueError('Invalid partial argument (must be values in' + ' [True,False])') + else: + raise ValueError('Invalid partial argument (must be values in' + ' [True,False])') + + d_per_g = False + if direction is not None: + if isinstance(direction, (list, np.ndarray, pd.Index, pd.Series)): + if np.any(~np.isin(direction, ['on', 'off', 'complete'])): + raise ValueError('Invalid direction argument (must be values' + ' in ["on","off","complete"])') + if len(direction) == gn: + d_per_g = True + else: + raise ValueError('Incorrect direction 
argument length') + elif isinstance(direction, str): + if not np.isin(direction, ['on', 'off', 'complete']): + raise ValueError('Invalid direction argument (must be values' + ' in ["on","off","complete"])') + else: + raise ValueError('Invalid direction argument (must be values in' + ' ["on","off","complete"])') + + known_pars = [rescale_u, alpha, beta, gamma, t_sw] + for x in known_pars: + if x is not None: + if isinstance(x, (list, np.ndarray)): + if np.sum(np.isnan(x)) + np.sum(np.isinf(x)) > 0: + raise ValueError('Known parameters cannot contain NaN or' + ' Inf') + elif isinstance(x, (int, float)): + if x == np.nan or x == np.inf: + raise ValueError('Known parameters cannot contain NaN or' + ' Inf') + else: + raise ValueError('Invalid known parameters type') + + if ((embedding not in adata_rna.obsm) and + (embedding not in adata_atac.obsm)): + raise ValueError(f'{embedding} is not found in obsm') + embed_coord = adata_rna.obsm[embedding] if embedding in adata_rna.obsm \ + else adata_atac.obsm[embedding] + global_pdist = pairwise_distances(embed_coord) + + u_mat = adata_rna[:, gene_list].layers['Mu'].A \ + if sparse.issparse(adata_rna.layers['Mu']) \ + else adata_rna[:, gene_list].layers['Mu'] + s_mat = adata_rna[:, gene_list].layers['Ms'].A \ + if sparse.issparse(adata_rna.layers['Ms']) \ + else adata_rna[:, gene_list].layers['Ms'] + c_mat = adata_atac[:, gene_list].layers['Mc'].A \ + if sparse.issparse(adata_atac.layers['Mc']) \ + else adata_atac[:, gene_list].layers['Mc'] + + ru = rescale_u if rescale_u is not None else None + + if parallel: + if (n_jobs is None or not isinstance(n_jobs, int) or n_jobs < 0 or + n_jobs > os.cpu_count()): + n_jobs = os.cpu_count() + if n_jobs > gn: + n_jobs = gn + batches = -(-gn // n_jobs) + if n_jobs > 1: + main_info(f'running {n_jobs} jobs in parallel', indent_level=1) + else: + n_jobs = 1 + batches = gn + if n_jobs == 1: + parallel = False + + pbar = tqdm(total=gn) + for group in range(batches): + gene_indices = range(group 
* n_jobs, np.min([gn, (group+1) * n_jobs])) + if parallel: + from joblib import Parallel, delayed + verb = 51 if settings.VERBOSITY >= 2 else 0 + plot = False + + # clear the settings file if it exists + open("settings.txt", "w").close() + + # write our current settings to the file + with open("settings.txt", "a") as sfile: + sfile.write(str(settings.VERBOSITY) + "\n") + sfile.write(str(settings.CWD) + "\n") + sfile.write(str(settings.LOG_FOLDER) + "\n") + sfile.write(str(settings.LOG_FILENAME) + "\n") + + res = Parallel(n_jobs=n_jobs, backend='loky', verbose=verb)( + delayed(func_to_call)( + c_mat[:, i], + u_mat[:, i], + s_mat[:, i], + model_to_run[i] if m_per_g else model_to_run, + max_iter, + init_mode, + device, + neural_net, + adam, + adam_lr, + adam_beta1, + adam_beta2, + batch_size, + global_pdist, + embed_coord, + rna_conn, + plot, + save_plot, + plot_dir, + fit_args, + gene_list[i], + partial[i] if p_per_g else partial, + direction[i] if d_per_g else direction, + rna_only, + fit, + fit_decoupling, + extra_color, + ru[i] if isinstance(ru, (list, np.ndarray)) else ru, + alpha[i] if isinstance(alpha, (list, np.ndarray)) + else alpha, + beta[i] if isinstance(beta, (list, np.ndarray)) + else beta, + gamma[i] if isinstance(gamma, (list, np.ndarray)) + else gamma, + t_sw[i] if isinstance(t_sw, (list, np.ndarray)) else t_sw, + settings.VERBOSITY, + settings.LOG_FOLDER, + settings.LOG_FILENAME) + for i in gene_indices) + + for i, r in zip(gene_indices, res): + (loss, model, direct_out, parameters, initial_exp, + time, state, velocity, likelihood, anchors) = r + switch, rate, scale_cc, rescale_c, rescale_u, realign_ratio = \ + parameters + likelihood, l_c, ssd_c, var_c = likelihood + losses[i, :] = loss + models[i] = model + directions.append(direct_out) + t_sws[i, :] = switch + rates[i, :] = rate + scale_ccs[i] = scale_cc + rescale_cs[i] = rescale_c + rescale_us[i] = rescale_u + realign_ratios[i] = realign_ratio + likelihoods[i] = likelihood + l_cs[i] = l_c + 
ssd_cs[i] = ssd_c + var_cs[i] = var_c + if fit: + initial_exps[i, :] = initial_exp + times[:, i] = time + states[:, i] = state + n_anchors_ = anchors[0].shape[0] + n_switch = anchors[1].shape[0] + if not rna_only: + velo_c[:, i] = smooth_scale(atac_conn, velocity[:, 0]) + anchor_c[:n_anchors_, i] = anchors[0][:, 0] + anchor_c_sw[:n_switch, i] = anchors[1][:, 0] + anchor_vc[:n_anchors_, i] = anchors[2][:, 0] + velo_u[:, i] = smooth_scale(rna_conn, velocity[:, 1]) + velo_s[:, i] = smooth_scale(rna_conn, velocity[:, 2]) + anchor_u[:n_anchors_, i] = anchors[0][:, 1] + anchor_s[:n_anchors_, i] = anchors[0][:, 2] + anchor_u_sw[:n_switch, i] = anchors[1][:, 1] + anchor_s_sw[:n_switch, i] = anchors[1][:, 2] + anchor_vu[:n_anchors_, i] = anchors[2][:, 1] + anchor_vs[:n_anchors_, i] = anchors[2][:, 2] + anchor_min_idx[i] = anchors[3] + anchor_max_idx[i] = anchors[4] + anchor_velo_min_idx[i] = anchors[5] + anchor_velo_max_idx[i] = anchors[6] + else: + i = group + gene = gene_list[i] + main_info(f'@@@@@fitting {gene}', indent_level=1) + (loss, model, direct_out, + parameters, initial_exp, + time, state, velocity, + likelihood, anchors) = \ + func_to_call(c_mat[:, i], u_mat[:, i], s_mat[:, i], + model_to_run[i] if m_per_g else model_to_run, + max_iter, init_mode, + device, + neural_net, + adam, + adam_lr, + adam_beta1, + adam_beta2, + batch_size, + global_pdist, embed_coord, + rna_conn, plot, save_plot, plot_dir, + fit_args, gene, + partial[i] if p_per_g else partial, + direction[i] if d_per_g else direction, + rna_only, fit, fit_decoupling, extra_color, + ru[i] if isinstance(ru, (list, np.ndarray)) + else ru, + alpha[i] if isinstance(alpha, (list, np.ndarray)) + else alpha, + beta[i] if isinstance(beta, (list, np.ndarray)) + else beta, + gamma[i] if isinstance(gamma, (list, np.ndarray)) + else gamma, + t_sw[i] if isinstance(t_sw, (list, np.ndarray)) + else t_sw, + settings.VERBOSITY, + settings.LOG_FOLDER, + settings.LOG_FILENAME) + switch, rate, scale_cc, rescale_c, 
rescale_u, realign_ratio = \ + parameters + likelihood, l_c, ssd_c, var_c = likelihood + losses[i, :] = loss + models[i] = model + directions.append(direct_out) + t_sws[i, :] = switch + rates[i, :] = rate + scale_ccs[i] = scale_cc + rescale_cs[i] = rescale_c + rescale_us[i] = rescale_u + realign_ratios[i] = realign_ratio + likelihoods[i] = likelihood + l_cs[i] = l_c + ssd_cs[i] = ssd_c + var_cs[i] = var_c + if fit: + initial_exps[i, :] = initial_exp + times[:, i] = time + states[:, i] = state + n_anchors_ = anchors[0].shape[0] + n_switch = anchors[1].shape[0] + if not rna_only: + velo_c[:, i] = smooth_scale(atac_conn, velocity[:, 0]) + anchor_c[:n_anchors_, i] = anchors[0][:, 0] + anchor_c_sw[:n_switch, i] = anchors[1][:, 0] + anchor_vc[:n_anchors_, i] = anchors[2][:, 0] + velo_u[:, i] = smooth_scale(rna_conn, velocity[:, 1]) + velo_s[:, i] = smooth_scale(rna_conn, velocity[:, 2]) + anchor_u[:n_anchors_, i] = anchors[0][:, 1] + anchor_s[:n_anchors_, i] = anchors[0][:, 2] + anchor_u_sw[:n_switch, i] = anchors[1][:, 1] + anchor_s_sw[:n_switch, i] = anchors[1][:, 2] + anchor_vu[:n_anchors_, i] = anchors[2][:, 1] + anchor_vs[:n_anchors_, i] = anchors[2][:, 2] + anchor_min_idx[i] = anchors[3] + anchor_max_idx[i] = anchors[4] + anchor_velo_min_idx[i] = anchors[5] + anchor_velo_max_idx[i] = anchors[6] + pbar.update(len(gene_indices)) + pbar.close() + directions = np.array(directions) + + filt = np.sum(losses != np.inf, 1) >= 1 + if np.sum(filt) == 0: + raise ValueError('None of the genes were fitted due to low quality,' + ' not returning') + adata_copy = adata_rna[:, gene_list[filt]].copy() + adata_copy.layers['ATAC'] = c_mat[:, filt] + adata_copy.var['fit_alpha_c'] = rates[filt, 0] + adata_copy.var['fit_alpha'] = rates[filt, 1] + adata_copy.var['fit_beta'] = rates[filt, 2] + adata_copy.var['fit_gamma'] = rates[filt, 3] + adata_copy.var['fit_t_sw1'] = t_sws[filt, 0] + adata_copy.var['fit_t_sw2'] = t_sws[filt, 1] + adata_copy.var['fit_t_sw3'] = t_sws[filt, 2] + 
adata_copy.var['fit_scale_cc'] = scale_ccs[filt] + adata_copy.var['fit_rescale_c'] = rescale_cs[filt] + adata_copy.var['fit_rescale_u'] = rescale_us[filt] + adata_copy.var['fit_alignment_scaling'] = realign_ratios[filt] + adata_copy.var['fit_model'] = models[filt] + adata_copy.var['fit_direction'] = directions[filt] + if model_to_run is not None and not m_per_g and not rna_only: + for i, m in enumerate(model_to_run): + adata_copy.var[f'fit_loss_M{m}'] = losses[filt, i] + else: + adata_copy.var['fit_loss'] = losses[filt, 0] + adata_copy.var['fit_likelihood'] = likelihoods[filt] + adata_copy.var['fit_likelihood_c'] = l_cs[filt] + adata_copy.var['fit_ssd_c'] = ssd_cs[filt] + adata_copy.var['fit_var_c'] = var_cs[filt] + if fit: + adata_copy.layers['fit_t'] = times[:, filt] + adata_copy.layers['fit_state'] = states[:, filt] + adata_copy.layers['velo_s'] = velo_s[:, filt] + adata_copy.layers['velo_u'] = velo_u[:, filt] + if not rna_only: + adata_copy.layers['velo_chrom'] = velo_c[:, filt] + adata_copy.var['fit_c0'] = initial_exps[filt, 0] + adata_copy.var['fit_u0'] = initial_exps[filt, 1] + adata_copy.var['fit_s0'] = initial_exps[filt, 2] + adata_copy.var['fit_anchor_min_idx'] = anchor_min_idx[filt] + adata_copy.var['fit_anchor_max_idx'] = anchor_max_idx[filt] + adata_copy.var['fit_anchor_velo_min_idx'] = anchor_velo_min_idx[filt] + adata_copy.var['fit_anchor_velo_max_idx'] = anchor_velo_max_idx[filt] + adata_copy.varm['fit_anchor_c'] = np.transpose(anchor_c[:, filt]) + adata_copy.varm['fit_anchor_u'] = np.transpose(anchor_u[:, filt]) + adata_copy.varm['fit_anchor_s'] = np.transpose(anchor_s[:, filt]) + adata_copy.varm['fit_anchor_c_sw'] = np.transpose(anchor_c_sw[:, filt]) + adata_copy.varm['fit_anchor_u_sw'] = np.transpose(anchor_u_sw[:, filt]) + adata_copy.varm['fit_anchor_s_sw'] = np.transpose(anchor_s_sw[:, filt]) + adata_copy.varm['fit_anchor_c_velo'] = np.transpose(anchor_vc[:, filt]) + adata_copy.varm['fit_anchor_u_velo'] = np.transpose(anchor_vu[:, filt]) + 
def smooth_scale(conn, vector):
    """Smooth a vector over a graph and restore its original value range.

    The vector is multiplied by ``conn`` and the product is mapped linearly
    so that its minimum and maximum equal those of the input ``vector``.

    Parameters
    ----------
    conn: sparse matrix or `numpy.ndarray`
        Smoothing weights; ``conn.dot(vector)`` must be defined.
    vector: `numpy.ndarray`
        Values to smooth.

    Returns
    -------
    `numpy.ndarray`
        Smoothed values rescaled to ``[min(vector), max(vector)]``. When the
        smoothed values are all identical there is no spread to stretch, so
        every entry is set to ``min(vector)`` instead of dividing by zero.
    """
    max_to = np.max(vector)
    min_to = np.min(vector)
    v = conn.dot(vector.T).T
    max_from = np.max(v)
    min_from = np.min(v)
    span_from = max_from - min_from
    if span_from == 0:
        # A flat smoothed vector would give 0/0 = NaN in the rescaling below.
        return np.full(np.shape(v), min_to, dtype=float)
    return ((v - min_from) * (max_to - min_to) / span_from) + min_to


def top_n_sparse(conn, n):
    """Keep the ``n`` largest entries of every row and discretize them.

    Parameters
    ----------
    conn: sparse matrix
        Matrix whose rows are pruned. It is never modified; the pruning is
        done on a copy.
    n: `int`
        Number of entries to keep per row. Values ``<= 0`` keep nothing.

    Returns
    -------
    CSR sparse matrix
        Kept entries are mapped to three levels: values in ``(0, 0.25]``
        become 0.25, values in ``(0.25, 0.5]`` become 0.5 and values above
        0.5 become 1. Non-positive stored values are left untouched and
        explicit zeros are removed.
    """
    # copy=True: ``tolil()`` on a LIL input returns the input itself, which
    # the row surgery below would otherwise modify in place.
    conn_ll = conn.tolil(copy=True)
    for i in range(conn_ll.shape[0]):
        row_data = np.asarray(conn_ll.data[i])
        row_idx = np.asarray(conn_ll.rows[i])
        order = row_data.argsort()
        # ``order[-0:]`` would select everything, so handle n <= 0 explicitly.
        # Sorting the kept positions keeps the column indices ascending.
        keep = np.sort(order[-n:]) if n > 0 else order[:0]
        conn_ll.data[i] = row_data[keep].tolist()
        conn_ll.rows[i] = row_idx[keep].tolist()
    conn = conn_ll.tocsr()
    stored = conn.data
    levels = stored.copy()
    # Thresholds are evaluated on the original values, lowest level first.
    levels[stored > 0] = 0.25
    levels[stored > 0.25] = 0.5
    levels[stored > 0.5] = 1
    conn.data = levels
    conn.eliminate_zeros()
    return conn
def _restrict_to_bounds(mask, values, lower, upper):
    """Return ``mask`` limited to ``lower <= values <= upper``.

    A bound that is `None` is not applied.
    """
    if upper is not None:
        mask = mask & (values <= upper)
    if lower is not None:
        mask = mask & (values >= lower)
    return mask


def set_velocity_genes(adata,
                       likelihood_lower=0.05,
                       rescale_u_upper=None,
                       rescale_u_lower=None,
                       rescale_c_upper=None,
                       rescale_c_lower=None,
                       primed_upper=None,
                       primed_lower=None,
                       decoupled_upper=None,
                       decoupled_lower=None,
                       alpha_c_upper=None,
                       alpha_c_lower=None,
                       alpha_upper=None,
                       alpha_lower=None,
                       beta_upper=None,
                       beta_lower=None,
                       gamma_upper=None,
                       gamma_lower=None,
                       scale_cc_upper=None,
                       scale_cc_lower=None
                       ):
    """Reset velocity genes.

    This function resets velocity genes based on criteria of variables.
    A column of ``adata.var`` (or entry of ``adata.uns``) is only read when a
    bound that needs it is given, so results that lack anchor information
    (for example from ``fit=False``) can still be filtered by likelihood or
    rate parameters.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Anndata result from dynamics recovery.
    likelihood_lower: `float` (default: 0.05)
        Minimum likelihood.
    rescale_u_upper: `float` (default: `None`)
        Maximum rescale_u.
    rescale_u_lower: `float` (default: `None`)
        Minimum rescale_u.
    rescale_c_upper: `float` (default: `None`)
        Maximum rescale_c.
    rescale_c_lower: `float` (default: `None`)
        Minimum rescale_c.
    primed_upper: `float` (default: `None`)
        Maximum primed interval.
    primed_lower: `float` (default: `None`)
        Minimum primed interval.
    decoupled_upper: `float` (default: `None`)
        Maximum decoupled interval.
    decoupled_lower: `float` (default: `None`)
        Minimum decoupled interval.
    alpha_c_upper: `float` (default: `None`)
        Maximum alpha_c.
    alpha_c_lower: `float` (default: `None`)
        Minimum alpha_c.
    alpha_upper: `float` (default: `None`)
        Maximum alpha.
    alpha_lower: `float` (default: `None`)
        Minimum alpha.
    beta_upper: `float` (default: `None`)
        Maximum beta.
    beta_lower: `float` (default: `None`)
        Minimum beta.
    gamma_upper: `float` (default: `None`)
        Maximum gamma.
    gamma_lower: `float` (default: `None`)
        Minimum gamma.
    scale_cc_upper: `float` (default: `None`)
        Maximum scale_cc.
    scale_cc_lower: `float` (default: `None`)
        Minimum scale_cc.

    Returns
    -------
    velo_s_genes, velo_u_genes, velo_chrom_genes: `.var`
        new velocity genes for each modalities.
    """
    var = adata.var
    v_genes = (var['fit_likelihood'] >= likelihood_lower)

    scaling_bounds = (
        ('fit_rescale_u', rescale_u_lower, rescale_u_upper),
        ('fit_rescale_c', rescale_c_lower, rescale_c_upper),
    )
    rate_bounds = (
        ('fit_alpha_c', alpha_c_lower, alpha_c_upper),
        ('fit_alpha', alpha_lower, alpha_upper),
        ('fit_beta', beta_lower, beta_upper),
        ('fit_gamma', gamma_lower, gamma_upper),
        ('fit_scale_cc', scale_cc_lower, scale_cc_upper),
    )

    for column, lower, upper in scaling_bounds:
        if lower is not None or upper is not None:
            v_genes = _restrict_to_bounds(v_genes, var[column], lower, upper)

    # The primed interval needs anchor indices and the anchor count stored in
    # ``uns``; only touch them when the caller asked for this filter.
    if primed_lower is not None or primed_upper is not None:
        t_sw1 = var['fit_t_sw1'] + 20 / adata.uns['velo_s_params']['t'] * \
            var['fit_anchor_min_idx'] * var['fit_alignment_scaling']
        v_genes = _restrict_to_bounds(v_genes, t_sw1,
                                      primed_lower, primed_upper)

    if decoupled_lower is not None or decoupled_upper is not None:
        # Switch times are capped at 20 before taking their difference.
        t_sw2 = np.clip(var['fit_t_sw2'], None, 20)
        t_sw3 = np.clip(var['fit_t_sw3'], None, 20)
        v_genes = _restrict_to_bounds(v_genes, t_sw3 - t_sw2,
                                      decoupled_lower, decoupled_upper)

    for column, lower, upper in rate_bounds:
        if lower is not None or upper is not None:
            v_genes = _restrict_to_bounds(v_genes, var[column], lower, upper)

    main_info(f'{np.sum(v_genes)} velocity genes were selected',
              indent_level=1)
    adata.var['velo_s_genes'] = adata.var['velo_u_genes'] = \
        adata.var['velo_chrom_genes'] = v_genes
def _ensure_normalized_velocity(adata, vkey):
    """Create ``<vkey>_norm`` layer, params and gene mask when missing.

    Every gene column is divided by its summed absolute velocity, then the
    whole matrix is divided by its mean absolute value. Non-finite entries
    (genes whose velocities are all zero or NaN) are ignored when computing
    that mean so they cannot turn the entire layer into NaN, and using the
    absolute value keeps the scale positive so velocity directions are never
    flipped.
    """
    norm_key = vkey + '_norm'
    if norm_key not in adata.layers.keys():
        velocity = adata.layers[vkey]
        with np.errstate(divide='ignore', invalid='ignore'):
            normed = velocity / np.sum(np.abs(velocity), 0)
        magnitudes = np.abs(np.asarray(normed))
        finite = magnitudes[np.isfinite(magnitudes)]
        if finite.size and finite.mean() > 0:
            normed = normed / finite.mean()
        adata.layers[norm_key] = normed
        adata.uns[norm_key + '_params'] = adata.uns[vkey + '_params']
    if norm_key + '_genes' not in adata.var.columns:
        adata.var[norm_key + '_genes'] = adata.var[vkey + '_genes']


def velocity_graph(adata, vkey='velo_s', xkey='Ms', **kwargs):
    """Computes velocity graph.

    This function normalizes the velocity matrix and computes velocity graph
    with `scvelo.tl.velocity_graph`.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Anndata result from dynamics recovery.
    vkey: `str` (default: `velo_s`)
        Default to use spliced velocities.
    xkey: `str` (default: `Ms`)
        Default to use smoothed spliced counts.
    Additional parameters passed to `scvelo.tl.velocity_graph`.

    Returns
    -------
    Normalized velocity matrix and associated velocity genes and params.
    Outputs of `scvelo.tl.velocity_graph`.

    Raises
    ------
    ValueError
        If ``vkey`` is not a layer of ``adata``.
    """
    if vkey not in adata.layers.keys():
        raise ValueError('Velocity matrix is not found. Please run multivelo'
                         '.recover_dynamics_chrom function first.')
    _ensure_normalized_velocity(adata, vkey)
    import scvelo as scv
    scv.tl.velocity_graph(adata, vkey=vkey+'_norm', xkey=xkey, **kwargs)


def velocity_embedding_stream(adata, vkey='velo_s', show=True, **kwargs):
    """Plots velocity stream.

    This function plots velocity streamplot with
    `scvelo.pl.velocity_embedding_stream`.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Anndata result from dynamics recovery.
    vkey: `str` (default: `velo_s`)
        Default to use spliced velocities. The normalized matrix will be used.
    show: `bool` (default: `True`)
        Whether to show the plot.
    Additional parameters passed to `scvelo.pl.velocity_embedding_stream`.
    If the velocity graph has to be computed first, only the parameters that
    `scvelo.tl.velocity_graph` accepts are also passed to it.

    Returns
    -------
    If `show==False`, a matplotlib axis object.

    Raises
    ------
    ValueError
        If ``vkey`` is not a layer of ``adata``.
    """
    if vkey not in adata.layers:
        raise ValueError('Velocity matrix is not found. Please run multivelo.'
                         'recover_dynamics_chrom function first.')
    import inspect
    import scvelo as scv
    # Same normalization as velocity_graph, so the result does not depend on
    # which of the two functions is called first.
    _ensure_normalized_velocity(adata, vkey)
    if vkey+'_norm_graph' not in adata.uns.keys():
        # Plot-only options (e.g. ``color``) are not valid for the graph
        # computation; forward just what it can accept.
        graph_params = inspect.signature(scv.tl.velocity_graph).parameters
        accepts_any = any(p.kind is inspect.Parameter.VAR_KEYWORD
                          for p in graph_params.values())
        graph_kwargs = {key: value for key, value in kwargs.items()
                        if accepts_any or key in graph_params}
        velocity_graph(adata, vkey=vkey, **graph_kwargs)
    out = scv.pl.velocity_embedding_stream(adata, vkey=vkey+'_norm', show=show,
                                           **kwargs)
    if not show:
        return out
    return None
def latent_time(adata, vkey='velo_s', **kwargs):
    """Computes latent time.

    This function computes latent time with `scvelo.tl.latent_time`.

    Parameters
    ----------
    adata: :class:`~anndata.AnnData`
        Anndata result from dynamics recovery.
    vkey: `str` (default: `velo_s`)
        Default to use spliced velocities. The normalized matrix will be used.
    Additional parameters passed to `scvelo.tl.latent_time`. If the velocity
    graph has to be computed first, only the parameters that
    `scvelo.tl.velocity_graph` accepts are also passed to it.

    Returns
    -------
    Outputs of `scvelo.tl.latent_time`.

    Raises
    ------
    ValueError
        If the velocity layer, the ``fit_t`` layer or the normalized velocity
        layer is missing from ``adata``.
    """
    if vkey not in adata.layers.keys() or 'fit_t' not in adata.layers.keys():
        raise ValueError('Velocity or time matrix is not found. Please run '
                         'multivelo.recover_dynamics_chrom function first.')
    if vkey+'_norm' not in adata.layers.keys():
        raise ValueError('Normalized velocity matrix is not found. Please '
                         'run multivelo.velocity_graph function first.')
    import inspect
    import scvelo as scv
    if vkey+'_norm_graph' not in adata.uns.keys():
        # Latent-time options (e.g. ``min_likelihood``) are not valid for the
        # graph computation; forward just what it can accept.
        graph_params = inspect.signature(scv.tl.velocity_graph).parameters
        accepts_any = any(p.kind is inspect.Parameter.VAR_KEYWORD
                          for p in graph_params.values())
        graph_kwargs = {key: value for key, value in kwargs.items()
                        if accepts_any or key in graph_params}
        velocity_graph(adata, vkey=vkey, **graph_kwargs)
    scv.tl.latent_time(adata, vkey=vkey+'_norm', **kwargs)
def LRT_decoupling(adata_rna, adata_atac, **kwargs):
    """Computes likelihood ratio test for decoupling state.

    The dynamics are fitted twice, with and without the decoupling state,
    and the two fits are compared gene by gene with a likelihood ratio test
    (chi-square survival function with one degree of freedom).

    Parameters
    ----------
    adata_rna: :class:`~anndata.AnnData`
        RNA anndata object
    adata_atac: :class:`~anndata.AnnData`
        ATAC anndata object.
    Additional parameters passed to `recover_dynamics_chrom`.

    Returns
    -------
    adata_result_w_decoupled: class:`~anndata.AnnData`
        fit result with decoupling state
    adata_result_wo_decoupled: class:`~anndata.AnnData`
        fit result without decoupling state
    res: `pandas.DataFrame`
        LRT statistics, indexed by the genes present in both fit results
    """
    from scipy.stats.distributions import chi2

    main_info('fitting models with decoupling intervals', v=0)
    fit_with = recover_dynamics_chrom(adata_rna, adata_atac,
                                      fit_decoupling=True,
                                      **kwargs)
    main_info('fitting models without decoupling intervals', v=0)
    fit_without = recover_dynamics_chrom(adata_rna, adata_atac,
                                         fit_decoupling=False,
                                         **kwargs)
    main_info('testing likelihood ratio', v=0)

    # Only genes that survived both fits can be compared.
    shared_genes = pd.Index(np.intersect1d(fit_with.var_names,
                                           fit_without.var_names))
    var_with = fit_with[:, shared_genes].var
    var_without = fit_without[:, shared_genes].var
    n_obs = adata_rna.n_obs

    def ratio_test(column):
        # Returns both likelihoods, the LRT statistic and its p-value.
        lik_with = var_with[column].values
        lik_without = var_without[column].values
        statistic = -2 * n_obs * (np.log(lik_without) - np.log(lik_with))
        return lik_with, lik_without, statistic, chi2.sf(statistic, 1)

    l_c_with, l_c_without, lrt_c, pval_c = ratio_test('fit_likelihood_c')
    l_with, l_without, lrt, pval = ratio_test('fit_likelihood')

    res = pd.DataFrame({'likelihood_c_w_decoupled': l_c_with,
                        'likelihood_c_wo_decoupled': l_c_without,
                        'LRT_c': lrt_c,
                        'pval_c': pval_c,
                        'likelihood_w_decoupled': l_with,
                        'likelihood_wo_decoupled': l_without,
                        'LRT': lrt,
                        'pval': pval,
                        }, index=shared_genes)
    return fit_with, fit_without, res
transition_matrix_s(s_mat, velo_s, knn): + knn = knn.astype(int) + tm_val, tm_col, tm_row = [], [], [] + for i in range(knn.shape[0]): + two_step_knn = knn[i, :] + for j in knn[i, :]: + two_step_knn = np.append(two_step_knn, knn[j, :]) + two_step_knn = np.unique(two_step_knn) + for j in two_step_knn: + s = s_mat[i, :] + sn = s_mat[j, :] + ds = s - sn + dx = np.ravel(ds.A) + velo = velo_s[i, :] + cos_sim = np.dot(dx, velo)/(norm(dx)*norm(velo)) + tm_val.append(cos_sim) + tm_col.append(j) + tm_row.append(i) + tm = coo_matrix((tm_val, (tm_row, tm_col)), shape=(s_mat.shape[0], + s_mat.shape[0])).tocsr() + tm.setdiag(0) + tm_neg = tm.copy() + tm.data = np.clip(tm.data, 0, 1) + tm_neg.data = np.clip(tm_neg.data, -1, 0) + tm.eliminate_zeros() + tm_neg.eliminate_zeros() + return tm, tm_neg + + +def transition_matrix_chrom(c_mat, u_mat, s_mat, velo_c, velo_u, velo_s, knn): + knn = knn.astype(int) + tm_val, tm_col, tm_row = [], [], [] + for i in range(knn.shape[0]): + two_step_knn = knn[i, :] + for j in knn[i, :]: + two_step_knn = np.append(two_step_knn, knn[j, :]) + two_step_knn = np.unique(two_step_knn) + for j in two_step_knn: + u = u_mat[i, :].A + s = s_mat[i, :].A + c = c_mat[i, :].A + un = u_mat[j, :] + sn = s_mat[j, :] + cn = c_mat[j, :] + dc = (c - cn) / np.std(c) + du = (u - un) / np.std(u) + ds = (s - sn) / np.std(s) + dx = np.ravel(np.hstack((dc.A, du.A, ds.A))) + velo = np.hstack((velo_c[i, :], velo_u[i, :], velo_s[i, :])) + cos_sim = np.dot(dx, velo)/(norm(dx)*norm(velo)) + tm_val.append(cos_sim) + tm_col.append(j) + tm_row.append(i) + tm = coo_matrix((tm_val, (tm_row, tm_col)), shape=(c_mat.shape[0], + c_mat.shape[0])).tocsr() + tm.setdiag(0) + tm_neg = tm.copy() + tm.data = np.clip(tm.data, 0, 1) + tm_neg.data = np.clip(tm_neg.data, -1, 0) + tm.eliminate_zeros() + tm_neg.eliminate_zeros() + return tm, tm_neg + + +def likelihood_plot(adata, + genes=None, + figsize=(14, 10), + bins=50, + pointsize=4 + ): + """Likelihood plots. 
+ + This function plots likelihood and variable distributions. + + Parameters + ---------- + adata: :class:`~anndata.AnnData` + Anndata result from dynamics recovery. + genes: `str`, list of `str` (default: `None`) + If `None`, will use all fitted genes. + figsize: `tuple` (default: (14,10)) + Figure size. + bins: `int` (default: 50) + Number of bins for histograms. + pointsize: `float` (default: 4) + Point size for scatter plots. + """ + if genes is None: + var = adata.var + else: + genes = np.array(genes) + var = adata[:, genes].var + likelihood = var[['fit_likelihood']].values + rescale_u = var[['fit_rescale_u']].values + rescale_c = var[['fit_rescale_c']].values + t_interval1 = var['fit_t_sw1'] + 20 / adata.uns['velo_s_params']['t'] \ + * var['fit_anchor_min_idx'] * var['fit_alignment_scaling'] + t_sw2 = np.clip(var['fit_t_sw2'], None, 20) + t_sw3 = np.clip(var['fit_t_sw3'], None, 20) + t_interval3 = t_sw3 - t_sw2 + log_s = np.log1p(np.sum(adata.layers['Ms'], axis=0)) + alpha_c = var[['fit_alpha_c']].values + alpha = var[['fit_alpha']].values + beta = var[['fit_beta']].values + gamma = var[['fit_gamma']].values + scale_cc = var[['fit_scale_cc']].values + + fig, axes = plt.subplots(4, 5, figsize=figsize) + axes[0, 0].hist(likelihood, bins=bins) + axes[0, 0].set_title('likelihood') + axes[0, 1].hist(rescale_u, bins=bins) + axes[0, 1].set_title('rescale u') + axes[0, 2].hist(rescale_c, bins=bins) + axes[0, 2].set_title('rescale c') + axes[0, 3].hist(t_interval1.values, bins=bins) + axes[0, 3].set_title('primed interval') + axes[0, 4].hist(t_interval3, bins=bins) + axes[0, 4].set_title('decoupled interval') + + axes[1, 0].scatter(log_s, likelihood, s=pointsize) + axes[1, 0].set_xlabel('log spliced') + axes[1, 0].set_ylabel('likelihood') + axes[1, 1].scatter(rescale_u, likelihood, s=pointsize) + axes[1, 1].set_xlabel('rescale u') + axes[1, 2].scatter(rescale_c, likelihood, s=pointsize) + axes[1, 2].set_xlabel('rescale c') + axes[1, 3].scatter(t_interval1.values, 
likelihood, s=pointsize) + axes[1, 3].set_xlabel('primed interval') + axes[1, 4].scatter(t_interval3, likelihood, s=pointsize) + axes[1, 4].set_xlabel('decoupled interval') + + axes[2, 0].hist(alpha_c, bins=bins) + axes[2, 0].set_title('alpha c') + axes[2, 1].hist(alpha, bins=bins) + axes[2, 1].set_title('alpha') + axes[2, 2].hist(beta, bins=bins) + axes[2, 2].set_title('beta') + axes[2, 3].hist(gamma, bins=bins) + axes[2, 3].set_title('gamma') + axes[2, 4].hist(scale_cc, bins=bins) + axes[2, 4].set_title('scale cc') + + axes[3, 0].scatter(alpha_c, likelihood, s=pointsize) + axes[3, 0].set_xlabel('alpha c') + axes[3, 0].set_ylabel('likelihood') + axes[3, 1].scatter(alpha, likelihood, s=pointsize) + axes[3, 1].set_xlabel('alpha') + axes[3, 2].scatter(beta, likelihood, s=pointsize) + axes[3, 2].set_xlabel('beta') + axes[3, 3].scatter(gamma, likelihood, s=pointsize) + axes[3, 3].set_xlabel('gamma') + axes[3, 4].scatter(scale_cc, likelihood, s=pointsize) + axes[3, 4].set_xlabel('scale cc') + fig.tight_layout() + + +def pie_summary(adata, genes=None): + """Summary of directions and models. + + This function plots a pie chart for (pre-determined or specified) + directions and models. + `induction`: induction-only genes. + `repression`: repression-only genes. + `Model 1`: model 1 complete genes. + `Model 2`: model 2 complete genes. + + Parameters + ---------- + adata: :class:`~anndata.AnnData` + Anndata result from dynamics recovery. + genes: `str`, list of `str` (default: `None`) + If `None`, will use all fitted genes. 
+ """ + if genes is None: + genes = adata.var_names + fit_model = adata[:, (adata.var['fit_direction'] == 'complete') & + np.isin(adata.var_names, genes)].var['fit_model'].values + fit_direction = adata[:, genes].var['fit_direction'].values + data = [np.sum(fit_direction == 'on'), np.sum(fit_direction == 'off'), + np.sum(fit_model == 1), np.sum(fit_model == 2)] + index = ['induction', 'repression', 'Model 1', 'Model 2'] + index = [x for i, x in enumerate(index) if data[i] > 0] + data = [x for x in data if x > 0] + df = pd.DataFrame({'data': data}, index=index) + df.plot.pie(y='data', autopct='%1.1f%%', legend=False, startangle=30, + ylabel='') + circle = plt.Circle((0, 0), 0.8, fc='white') + fig = plt.gcf() + fig.gca().add_artist(circle) + + +def switch_time_summary(adata, genes=None): + """Summary of switch times. + + This function plots a box plot for observed switch times. + `primed`: primed intervals. + `coupled-on`: coupled induction intervals. + `decoupled`: decoupled intervals. + `coupled-off`: coupled repression intervals. + + Parameters + ---------- + adata: :class:`~anndata.AnnData` + Anndata result from dynamics recovery. + genes: `str`, list of `str` (default: `None`) + If `None`, will use velocity genes. 
+ """ + t_sw = adata[:, adata.var['velo_s_genes'] + if genes is None + else genes] \ + .var[['fit_t_sw1', 'fit_t_sw2', 'fit_t_sw3']].copy() + t_sw = t_sw.mask(t_sw > 20, 20) + t_sw = t_sw.mask(t_sw < 0) + t_sw['interval 1'] = t_sw['fit_t_sw1'] + t_sw['t_sw2 - t_sw1'] = t_sw['fit_t_sw2'] - t_sw['fit_t_sw1'] + t_sw['t_sw3 - t_sw2'] = t_sw['fit_t_sw3'] - t_sw['fit_t_sw2'] + t_sw['20 - t_sw3'] = 20 - t_sw['fit_t_sw3'] + t_sw = t_sw.mask(t_sw <= 0) + t_sw = t_sw.mask(t_sw > 20) + t_sw.columns = pd.Index(['time 1', 'time 2', 'time 3', 'primed', + 'coupled-on', 'decoupled', 'coupled-off']) + t_sw = t_sw[['primed', 'coupled-on', 'decoupled', 'coupled-off']] + t_sw = t_sw / 20 + fig, ax = plt.subplots(figsize=(4, 5)) + ax = sns.boxplot(data=t_sw, width=0.5, palette='Set2', ax=ax) + ax.set_yticks(np.linspace(0, 1, 5)) + ax.set_title('Switch Intervals') + + +def dynamic_plot(adata, + genes, + by='expression', + color_by='state', + gene_time=True, + axis_on=True, + frame_on=True, + show_anchors=True, + show_switches=True, + downsample=1, + full_range=False, + figsize=None, + pointsize=2, + linewidth=1.5, + cmap='coolwarm' + ): + """Gene dynamics plot. + + This function plots accessibility, expression, or velocity by time. + + Parameters + ---------- + adata: :class:`~anndata.AnnData` + Anndata result from dynamics recovery. + genes: `str`, list of `str` + List of genes to plot. + by: `str` (default: `expression`) + Plot accessibilities and expressions if `expression`. Plot velocities + if `velocity`. + color_by: `str` (default: `state`) + Color by the four potential states if `state`. Other common values are + leiden, louvain, celltype, etc. + If not `state`, the color field must be present in `.uns`, which can + be pre-computed with `scanpy.pl.scatter`. + For `state`, red, orange, green, and blue represent state 1, 2, 3, and + 4, respectively. + gene_time: `bool` (default: `True`) + Whether to use individual gene fitted time, or shared global latent + time. 
+ Mean values of 20 equal sized windows will be connected and shown if + `gene_time==False`. + axis_on: `bool` (default: `True`) + Whether to show axis labels. + frame_on: `bool` (default: `True`) + Whether to show plot frames. + show_anchors: `bool` (default: `True`) + Whether to display anchors. + show_switches: `bool` (default: `True`) + Whether to show switch times. The switch times are indicated by + vertical dotted line. + downsample: `int` (default: 1) + How much to downsample the cells. The remaining number will be + `1/downsample` of original. + full_range: `bool` (default: `False`) + Whether to show the full time range of velocities before smoothing or + subset to only smoothed range. + figsize: `tuple` (default: `None`) + Total figure size. + pointsize: `float` (default: 2) + Point size for scatter plots. + linewidth: `float` (default: 1.5) + Line width for anchor line or mean line. + cmap: `str` (default: `coolwarm`) + Color map for continuous color key. + """ + from pandas.api.types import is_numeric_dtype, is_categorical_dtype + if by not in ['expression', 'velocity']: + raise ValueError('"by" must be either "expression" or "velocity".') + if by == 'velocity': + show_switches = False + if color_by == 'state': + types = [0, 1, 2, 3] + colors = ['tab:red', 'tab:orange', 'tab:green', 'tab:blue'] + elif color_by in adata.obs and is_numeric_dtype(adata.obs[color_by]): + types = None + colors = adata.obs[color_by].values + elif color_by in adata.obs and is_categorical_dtype(adata.obs[color_by]) \ + and color_by+'_colors' in adata.uns.keys(): + types = adata.obs[color_by].cat.categories + colors = adata.uns[f'{color_by}_colors'] + else: + raise ValueError('Currently, color key must be a single string of ' + 'either numerical or categorical available in adata ' + 'obs, and the colors of categories can be found in ' + 'adata uns.') + + downsample = np.clip(int(downsample), 1, 10) + genes = np.array(genes) + missing_genes = genes[~np.isin(genes, 
adata.var_names)] + if len(missing_genes) > 0: + main_info(f'{missing_genes} not found', v=0) + genes = genes[np.isin(genes, adata.var_names)] + gn = len(genes) + if gn == 0: + return + if not gene_time: + show_anchors = False + latent_time = np.array(adata.obs['latent_time']) + time_window = latent_time // 0.05 + time_window = time_window.astype(int) + time_window[time_window == 20] = 19 + if 'velo_s_params' in adata.uns.keys() and 'outlier' \ + in adata.uns['velo_s_params']: + outlier = adata.uns['velo_s_params']['outlier'] + else: + outlier = 99 + + fig, axs = plt.subplots(gn, 3, squeeze=False, figsize=(10, 2.3*gn) + if figsize is None else figsize) + fig.patch.set_facecolor('white') + for row, gene in enumerate(genes): + u = adata[:, gene].layers['Mu' if by == 'expression' else 'velo_u'] + s = adata[:, gene].layers['Ms' if by == 'expression' else 'velo_s'] + c = adata[:, gene].layers['ATAC' if by == 'expression' + else 'velo_chrom'] + c = c.A if sparse.issparse(c) else c + u = u.A if sparse.issparse(u) else u + s = s.A if sparse.issparse(s) else s + c, u, s = np.ravel(c), np.ravel(u), np.ravel(s) + non_outlier = c <= np.percentile(c, outlier) + non_outlier &= u <= np.percentile(u, outlier) + non_outlier &= s <= np.percentile(s, outlier) + c, u, s = c[non_outlier], u[non_outlier], s[non_outlier] + time = np.array(adata[:, gene].layers['fit_t'] if gene_time + else latent_time) + if by == 'velocity': + time = np.reshape(time, (-1, 1)) + time = np.ravel(adata.obsp['_RNA_conn'].dot(time)) + time = time[non_outlier] + if types is not None: + for i in range(len(types)): + if color_by == 'state': + filt = adata[non_outlier, gene].layers['fit_state'] \ + == types[i] + else: + filt = adata[non_outlier, :].obs[color_by] == types[i] + filt = np.ravel(filt) + if np.sum(filt) > 0: + axs[row, 0].scatter(time[filt][::downsample], + c[filt][::downsample], s=pointsize, + c=colors[i], alpha=0.6) + axs[row, 1].scatter(time[filt][::downsample], + u[filt][::downsample], + 
s=pointsize, c=colors[i], alpha=0.6) + axs[row, 2].scatter(time[filt][::downsample], + s[filt][::downsample], s=pointsize, + c=colors[i], alpha=0.6) + else: + axs[row, 0].scatter(time[::downsample], c[::downsample], + s=pointsize, + c=colors[non_outlier][::downsample], + alpha=0.6, cmap=cmap) + axs[row, 1].scatter(time[::downsample], u[::downsample], + s=pointsize, + c=colors[non_outlier][::downsample], + alpha=0.6, cmap=cmap) + axs[row, 2].scatter(time[::downsample], s[::downsample], + s=pointsize, + c=colors[non_outlier][::downsample], + alpha=0.6, cmap=cmap) + + if not gene_time: + window_count = np.zeros(20) + window_mean_c = np.zeros(20) + window_mean_u = np.zeros(20) + window_mean_s = np.zeros(20) + for i in np.unique(time_window[non_outlier]): + idx = time_window[non_outlier] == i + window_count[i] = np.sum(idx) + window_mean_c[i] = np.mean(c[idx]) + window_mean_u[i] = np.mean(u[idx]) + window_mean_s[i] = np.mean(s[idx]) + window_idx = np.where(window_count > 20)[0] + axs[row, 0].plot(window_idx*0.05+0.025, window_mean_c[window_idx], + linewidth=linewidth, color='black', alpha=0.5) + axs[row, 1].plot(window_idx*0.05+0.025, window_mean_u[window_idx], + linewidth=linewidth, color='black', alpha=0.5) + axs[row, 2].plot(window_idx*0.05+0.025, window_mean_s[window_idx], + linewidth=linewidth, color='black', alpha=0.5) + + if show_anchors: + n_anchors = adata.uns['velo_s_params']['t'] + t_sw_array = np.array([adata[:, gene].var['fit_t_sw1'], + adata[:, gene].var['fit_t_sw2'], + adata[:, gene].var['fit_t_sw3']]) + t_sw_array = t_sw_array[t_sw_array < 20] + min_idx = int(adata[:, gene].var['fit_anchor_min_idx']) + max_idx = int(adata[:, gene].var['fit_anchor_max_idx']) + old_t = np.linspace(0, 20, n_anchors)[min_idx:max_idx+1] + new_t = old_t - np.min(old_t) + new_t = new_t * 20 / np.max(new_t) + if by == 'velocity' and not full_range: + anchor_interval = 20 / (max_idx + 1 - min_idx) + min_idx = int(adata[:, gene].var['fit_anchor_velo_min_idx']) + max_idx = 
int(adata[:, gene].var['fit_anchor_velo_max_idx']) + start = 0 + (min_idx - + adata[:, gene].var['fit_anchor_min_idx']) \ + * anchor_interval + end = 20 + (max_idx - + adata[:, gene].var['fit_anchor_max_idx']) \ + * anchor_interval + new_t = np.linspace(start, end, max_idx + 1 - min_idx) + ax = axs[row, 0] + a_c = adata[:, gene].varm['fit_anchor_c' if by == 'expression' + else 'fit_anchor_c_velo']\ + .ravel()[min_idx:max_idx+1] + if show_switches: + for t_sw in t_sw_array: + if t_sw > 0: + ax.vlines(t_sw, np.min(c), np.max(c), colors='black', + linestyles='dashed', alpha=0.5) + ax.plot(new_t[0:new_t.shape[0]], a_c, linewidth=linewidth, + color='black', alpha=0.5) + ax = axs[row, 1] + a_u = adata[:, gene].varm['fit_anchor_u' if by == 'expression' + else 'fit_anchor_u_velo']\ + .ravel()[min_idx:max_idx+1] + if show_switches: + for t_sw in t_sw_array: + if t_sw > 0: + ax.vlines(t_sw, np.min(u), np.max(u), colors='black', + linestyles='dashed', alpha=0.5) + ax.plot(new_t[0:new_t.shape[0]], a_u, linewidth=linewidth, + color='black', alpha=0.5) + ax = axs[row, 2] + a_s = adata[:, gene].varm['fit_anchor_s' if by == 'expression' + else 'fit_anchor_s_velo']\ + .ravel()[min_idx:max_idx+1] + if show_switches: + for t_sw in t_sw_array: + if t_sw > 0: + ax.vlines(t_sw, np.min(s), np.max(s), colors='black', + linestyles='dashed', alpha=0.5) + ax.plot(new_t[0:new_t.shape[0]], a_s, linewidth=linewidth, + color='black', alpha=0.5) + + axs[row, 0].set_title(f'{gene} ATAC' if by == 'expression' + else f'{gene} chromatin velocity') + axs[row, 0].set_xlabel('t' if by == 'expression' else '~t') + axs[row, 0].set_ylabel('c' if by == 'expression' else 'dc/dt') + axs[row, 1].set_title(f'{gene} unspliced' + ('' if by == 'expression' + else ' velocity')) + axs[row, 1].set_xlabel('t' if by == 'expression' else '~t') + axs[row, 1].set_ylabel('u' if by == 'expression' else 'du/dt') + axs[row, 2].set_title(f'{gene} spliced' + ('' if by == 'expression' + else ' velocity')) + axs[row, 
2].set_xlabel('t' if by == 'expression' else '~t') + axs[row, 2].set_ylabel('s' if by == 'expression' else 'ds/dt') + + for j in range(3): + ax = axs[row, j] + if not axis_on: + ax.xaxis.set_ticks_position('none') + ax.yaxis.set_ticks_position('none') + ax.get_xaxis().set_visible(False) + ax.get_yaxis().set_visible(False) + if not frame_on: + ax.xaxis.set_ticks_position('none') + ax.yaxis.set_ticks_position('none') + ax.set_frame_on(False) + fig.tight_layout() + + +def scatter_plot(adata, + genes, + by='us', + color_by='state', + n_cols=5, + axis_on=True, + frame_on=True, + show_anchors=True, + show_switches=True, + show_all_anchors=False, + title_more_info=False, + velocity_arrows=False, + downsample=1, + figsize=None, + pointsize=2, + markersize=5, + linewidth=2, + cmap='coolwarm', + view_3d_elev=None, + view_3d_azim=None, + full_name=False + ): + """Gene scatter plot. + + This function plots phase portraits of the specified plane. + + Parameters + ---------- + adata: :class:`~anndata.AnnData` + Anndata result from dynamics recovery. + genes: `str`, list of `str` + List of genes to plot. + by: `str` (default: `us`) + Plot unspliced-spliced plane if `us`. Plot chromatin-unspliced plane + if `cu`. + Plot 3D phase portraits if `cus`. + color_by: `str` (default: `state`) + Color by the four potential states if `state`. Other common values are + leiden, louvain, celltype, etc. + If not `state`, the color field must be present in `.uns`, which can be + pre-computed with `scanpy.pl.scatter`. + For `state`, red, orange, green, and blue represent state 1, 2, 3, and + 4, respectively. + When `by=='us'`, `color_by` can also be `c`, which displays the log + accessibility on U-S phase portraits. + n_cols: `int` (default: 5) + Number of columns to plot on each row. + axis_on: `bool` (default: `True`) + Whether to show axis labels. + frame_on: `bool` (default: `True`) + Whether to show plot frames. + show_anchors: `bool` (default: `True`) + Whether to display anchors. 
+ show_switches: `bool` (default: `True`) + Whether to show switch times. The three switch times and the end of + trajectory are indicated by + circle, cross, dismond, and star, respectively. + show_all_anchors: `bool` (default: `False`) + Whether to display full range of (predicted) anchors even for + repression-only genes. + title_more_info: `bool` (default: `False`) + Whether to display model, direction, and likelihood information for + the gene in title. + velocity_arrows: `bool` (default: `False`) + Whether to show velocity arrows of cells on the phase portraits. + downsample: `int` (default: 1) + How much to downsample the cells. The remaining number will be + `1/downsample` of original. + figsize: `tuple` (default: `None`) + Total figure size. + pointsize: `float` (default: 2) + Point size for scatter plots. + markersize: `float` (default: 5) + Point size for switch time points. + linewidth: `float` (default: 2) + Line width for connected anchors. + cmap: `str` (default: `coolwarm`) + Color map for log accessibilities or other continuous color keys when + plotting on U-S plane. + view_3d_elev: `float` (default: `None`) + Matplotlib 3D plot `elev` argument. `elev=90` is the same as U-S plane, + and `elev=0` is the same as C-U plane. + view_3d_azim: `float` (default: `None`) + Matplotlib 3D plot `azim` argument. `azim=270` is the same as U-S + plane, and `azim=0` is the same as C-U plane. + full_name: `bool` (default: `False`) + Show full names for chromatin, unspliced, and spliced rather than + using abbreviated terms c, u, and s. 
+ """ + from pandas.api.types import is_numeric_dtype, is_categorical_dtype + if by not in ['us', 'cu', 'cus']: + raise ValueError("'by' argument must be one of ['us', 'cu', 'cus']") + if color_by == 'state': + types = [0, 1, 2, 3] + colors = ['tab:red', 'tab:orange', 'tab:green', 'tab:blue'] + elif by == 'us' and color_by == 'c': + types = None + elif color_by in adata.obs and is_numeric_dtype(adata.obs[color_by]): + types = None + colors = adata.obs[color_by].values + elif color_by in adata.obs and is_categorical_dtype(adata.obs[color_by]) \ + and color_by+'_colors' in adata.uns.keys(): + types = adata.obs[color_by].cat.categories + colors = adata.uns[f'{color_by}_colors'] + else: + raise ValueError('Currently, color key must be a single string of ' + 'either numerical or categorical available in adata' + ' obs, and the colors of categories can be found in' + ' adata uns.') + + if 'velo_s_params' not in adata.uns.keys() \ + or 'fit_anchor_s' not in adata.varm.keys(): + show_anchors = False + if color_by == 'state' and 'fit_state' not in adata.layers.keys(): + raise ValueError('fit_state is not found. 
Please run ' + 'recover_dynamics_chrom function first or provide a ' + 'valid color key.') + + downsample = np.clip(int(downsample), 1, 10) + genes = np.array(genes) + missing_genes = genes[~np.isin(genes, adata.var_names)] + if len(missing_genes) > 0: + main_info(f'{missing_genes} not found', v=0) + genes = genes[np.isin(genes, adata.var_names)] + gn = len(genes) + if gn == 0: + return + if gn < n_cols: + n_cols = gn + if by == 'cus': + fig, axs = plt.subplots(-(-gn // n_cols), n_cols, squeeze=False, + figsize=(3.2*n_cols, 2.7*(-(-gn // n_cols))) + if figsize is None else figsize, + subplot_kw={'projection': '3d'}) + else: + fig, axs = plt.subplots(-(-gn // n_cols), n_cols, squeeze=False, + figsize=(2.7*n_cols, 2.4*(-(-gn // n_cols))) + if figsize is None else figsize) + fig.patch.set_facecolor('white') + count = 0 + for gene in genes: + u = adata[:, gene].layers['Mu'].copy() if 'Mu' in adata.layers \ + else adata[:, gene].layers['unspliced'].copy() + s = adata[:, gene].layers['Ms'].copy() if 'Ms' in adata.layers \ + else adata[:, gene].layers['spliced'].copy() + u = u.A if sparse.issparse(u) else u + s = s.A if sparse.issparse(s) else s + u, s = np.ravel(u), np.ravel(s) + if 'ATAC' not in adata.layers.keys() and \ + 'Mc' not in adata.layers.keys(): + show_anchors = False + elif 'ATAC' in adata.layers.keys(): + c = adata[:, gene].layers['ATAC'].copy() + c = c.A if sparse.issparse(c) else c + c = np.ravel(c) + elif 'Mc' in adata.layers.keys(): + c = adata[:, gene].layers['Mc'].copy() + c = c.A if sparse.issparse(c) else c + c = np.ravel(c) + + if velocity_arrows: + if 'velo_u' in adata.layers.keys(): + vu = adata[:, gene].layers['velo_u'].copy() + elif 'velocity_u' in adata.layers.keys(): + vu = adata[:, gene].layers['velocity_u'].copy() + else: + vu = np.zeros(adata.n_obs) + max_u = np.max([np.max(u), 1e-6]) + u /= max_u + vu = np.ravel(vu) + vu /= np.max([np.max(np.abs(vu)), 1e-6]) + if 'velo_s' in adata.layers.keys(): + vs = adata[:, 
gene].layers['velo_s'].copy() + elif 'velocity' in adata.layers.keys(): + vs = adata[:, gene].layers['velocity'].copy() + max_s = np.max([np.max(s), 1e-6]) + s /= max_s + vs = np.ravel(vs) + vs /= np.max([np.max(np.abs(vs)), 1e-6]) + if 'velo_chrom' in adata.layers.keys(): + vc = adata[:, gene].layers['velo_chrom'].copy() + max_c = np.max([np.max(c), 1e-6]) + c /= max_c + vc = np.ravel(vc) + vc /= np.max([np.max(np.abs(vc)), 1e-6]) + + row = count // n_cols + col = count % n_cols + ax = axs[row, col] + if types is not None: + for i in range(len(types)): + if color_by == 'state': + filt = adata[:, gene].layers['fit_state'] == types[i] + else: + filt = adata.obs[color_by] == types[i] + filt = np.ravel(filt) + if by == 'us': + if velocity_arrows: + ax.quiver(s[filt][::downsample], u[filt][::downsample], + vs[filt][::downsample], + vu[filt][::downsample], color=colors[i], + alpha=0.5, scale_units='xy', scale=10, + width=0.005, headwidth=4, headaxislength=5.5) + else: + ax.scatter(s[filt][::downsample], + u[filt][::downsample], s=pointsize, + c=colors[i], alpha=0.7) + elif by == 'cu': + if velocity_arrows: + ax.quiver(u[filt][::downsample], + c[filt][::downsample], + vu[filt][::downsample], + vc[filt][::downsample], color=colors[i], + alpha=0.5, scale_units='xy', scale=10, + width=0.005, headwidth=4, headaxislength=5.5) + else: + ax.scatter(u[filt][::downsample], + c[filt][::downsample], s=pointsize, + c=colors[i], alpha=0.7) + else: + if velocity_arrows: + ax.quiver(s[filt][::downsample], + u[filt][::downsample], c[filt][::downsample], + vs[filt][::downsample], + vu[filt][::downsample], + vc[filt][::downsample], + color=colors[i], alpha=0.4, length=0.1, + arrow_length_ratio=0.5, normalize=True) + else: + ax.scatter(s[filt][::downsample], + u[filt][::downsample], + c[filt][::downsample], s=pointsize, + c=colors[i], alpha=0.7) + elif color_by == 'c': + if 'velo_s_params' in adata.uns.keys() and \ + 'outlier' in adata.uns['velo_s_params']: + outlier = 
adata.uns['velo_s_params']['outlier'] + else: + outlier = 99.8 + non_zero = (u > 0) & (s > 0) & (c > 0) + non_outlier = u < np.percentile(u, outlier) + non_outlier &= s < np.percentile(s, outlier) + non_outlier &= c < np.percentile(c, outlier) + c -= np.min(c) + c /= np.max(c) + if velocity_arrows: + ax.quiver(s[non_zero & non_outlier][::downsample], + u[non_zero & non_outlier][::downsample], + vs[non_zero & non_outlier][::downsample], + vu[non_zero & non_outlier][::downsample], + np.log1p(c[non_zero & non_outlier][::downsample]), + alpha=0.5, + scale_units='xy', scale=10, width=0.005, + headwidth=4, headaxislength=5.5, cmap=cmap) + else: + ax.scatter(s[non_zero & non_outlier][::downsample], + u[non_zero & non_outlier][::downsample], + s=pointsize, + c=np.log1p(c[non_zero & non_outlier][::downsample]), + alpha=0.8, cmap=cmap) + else: + if by == 'us': + if velocity_arrows: + ax.quiver(s[::downsample], u[::downsample], + vs[::downsample], vu[::downsample], + colors[::downsample], alpha=0.5, + scale_units='xy', scale=10, width=0.005, + headwidth=4, headaxislength=5.5, cmap=cmap) + else: + ax.scatter(s[::downsample], u[::downsample], s=pointsize, + c=colors[::downsample], alpha=0.7, cmap=cmap) + elif by == 'cu': + if velocity_arrows: + ax.quiver(u[::downsample], c[::downsample], + vu[::downsample], vc[::downsample], + colors[::downsample], alpha=0.5, + scale_units='xy', scale=10, width=0.005, + headwidth=4, headaxislength=5.5, cmap=cmap) + else: + ax.scatter(u[::downsample], c[::downsample], s=pointsize, + c=colors[::downsample], alpha=0.7, cmap=cmap) + else: + if velocity_arrows: + ax.quiver(s[::downsample], u[::downsample], + c[::downsample], vs[::downsample], + vu[::downsample], vc[::downsample], + colors[::downsample], alpha=0.4, length=0.1, + arrow_length_ratio=0.5, normalize=True, + cmap=cmap) + else: + ax.scatter(s[::downsample], u[::downsample], + c[::downsample], s=pointsize, + c=colors[::downsample], alpha=0.7, cmap=cmap) + + if show_anchors: + min_idx = 
int(adata[:, gene].var['fit_anchor_min_idx']) + max_idx = int(adata[:, gene].var['fit_anchor_max_idx']) + a_c = adata[:, gene].varm['fit_anchor_c']\ + .ravel()[min_idx:max_idx+1].copy() + a_u = adata[:, gene].varm['fit_anchor_u']\ + .ravel()[min_idx:max_idx+1].copy() + a_s = adata[:, gene].varm['fit_anchor_s']\ + .ravel()[min_idx:max_idx+1].copy() + if velocity_arrows: + a_c /= max_c + a_u /= max_u + a_s /= max_s + if by == 'us': + ax.plot(a_s, a_u, linewidth=linewidth, color='black', + alpha=0.7, zorder=1000) + elif by == 'cu': + ax.plot(a_u, a_c, linewidth=linewidth, color='black', + alpha=0.7, zorder=1000) + else: + ax.plot(a_s, a_u, a_c, linewidth=linewidth, color='black', + alpha=0.7, zorder=1000) + if show_all_anchors: + a_c_pre = adata[:, gene].varm['fit_anchor_c']\ + .ravel()[:min_idx].copy() + a_u_pre = adata[:, gene].varm['fit_anchor_u']\ + .ravel()[:min_idx].copy() + a_s_pre = adata[:, gene].varm['fit_anchor_s']\ + .ravel()[:min_idx].copy() + if velocity_arrows: + a_c_pre /= max_c + a_u_pre /= max_u + a_s_pre /= max_s + if len(a_c_pre) > 0: + if by == 'us': + ax.plot(a_s_pre, a_u_pre, linewidth=linewidth/1.3, + color='black', alpha=0.6, zorder=1000) + elif by == 'cu': + ax.plot(a_u_pre, a_c_pre, linewidth=linewidth/1.3, + color='black', alpha=0.6, zorder=1000) + else: + ax.plot(a_s_pre, a_u_pre, a_c_pre, + linewidth=linewidth/1.3, color='black', + alpha=0.6, zorder=1000) + if show_switches: + t_sw_array = np.array([adata[:, gene].var['fit_t_sw1'] + .values[0], + adata[:, gene].var['fit_t_sw2'] + .values[0], + adata[:, gene].var['fit_t_sw3'] + .values[0]]) + in_range = (t_sw_array > 0) & (t_sw_array < 20) + a_c_sw = adata[:, gene].varm['fit_anchor_c_sw'].ravel().copy() + a_u_sw = adata[:, gene].varm['fit_anchor_u_sw'].ravel().copy() + a_s_sw = adata[:, gene].varm['fit_anchor_s_sw'].ravel().copy() + if velocity_arrows: + a_c_sw /= max_c + a_u_sw /= max_u + a_s_sw /= max_s + if in_range[0]: + c_sw1, u_sw1, s_sw1 = a_c_sw[0], a_u_sw[0], a_s_sw[0] + if by == 
'us': + ax.plot([s_sw1], [u_sw1], "om", markersize=markersize, + zorder=2000) + elif by == 'cu': + ax.plot([u_sw1], [c_sw1], "om", markersize=markersize, + zorder=2000) + else: + ax.plot([s_sw1], [u_sw1], [c_sw1], "om", + markersize=markersize, zorder=2000) + if in_range[1]: + c_sw2, u_sw2, s_sw2 = a_c_sw[1], a_u_sw[1], a_s_sw[1] + if by == 'us': + ax.plot([s_sw2], [u_sw2], "Xm", markersize=markersize, + zorder=2000) + elif by == 'cu': + ax.plot([u_sw2], [c_sw2], "Xm", markersize=markersize, + zorder=2000) + else: + ax.plot([s_sw2], [u_sw2], [c_sw2], "Xm", + markersize=markersize, zorder=2000) + if in_range[2]: + c_sw3, u_sw3, s_sw3 = a_c_sw[2], a_u_sw[2], a_s_sw[2] + if by == 'us': + ax.plot([s_sw3], [u_sw3], "Dm", markersize=markersize, + zorder=2000) + elif by == 'cu': + ax.plot([u_sw3], [c_sw3], "Dm", markersize=markersize, + zorder=2000) + else: + ax.plot([s_sw3], [u_sw3], [c_sw3], "Dm", + markersize=markersize, zorder=2000) + if max_idx > adata.uns['velo_s_params']['t'] - 4: + if by == 'us': + ax.plot([a_s[-1]], [a_u[-1]], "*m", + markersize=markersize, zorder=2000) + elif by == 'cu': + ax.plot([a_u[-1]], [a_c[-1]], "*m", + markersize=markersize, zorder=2000) + else: + ax.plot([a_s[-1]], [a_u[-1]], [a_c[-1]], "*m", + markersize=markersize, zorder=2000) + + if by == 'cus' and \ + (view_3d_elev is not None or view_3d_azim is not None): + # US: elev=90, azim=270. CU: elev=0, azim=0. 
+ ax.view_init(elev=view_3d_elev, azim=view_3d_azim) + title = gene + if title_more_info: + if 'fit_model' in adata.var: + title += f" M{int(adata[:,gene].var['fit_model'].values[0])}" + if 'fit_direction' in adata.var: + title += f" {adata[:,gene].var['fit_direction'].values[0]}" + if 'fit_likelihood' in adata.var \ + and not np.all(adata.var['fit_likelihood'].values == -1): + title += " " + f"{adata[:,gene].var['fit_likelihood'].values[0]:.3g}" + ax.set_title(f'{title}', fontsize=11) + if by == 'us': + ax.set_xlabel('spliced' if full_name else 's') + ax.set_ylabel('unspliced' if full_name else 'u') + elif by == 'cu': + ax.set_xlabel('unspliced' if full_name else 'u') + ax.set_ylabel('chromatin' if full_name else 'c') + elif by == 'cus': + ax.set_xlabel('spliced' if full_name else 's') + ax.set_ylabel('unspliced' if full_name else 'u') + ax.set_zlabel('chromatin' if full_name else 'c') + if by in ['us', 'cu']: + if not axis_on: + ax.xaxis.set_ticks_position('none') + ax.yaxis.set_ticks_position('none') + ax.get_xaxis().set_visible(False) + ax.get_yaxis().set_visible(False) + if not frame_on: + ax.xaxis.set_ticks_position('none') + ax.yaxis.set_ticks_position('none') + ax.set_frame_on(False) + elif by == 'cus': + if not axis_on: + ax.set_xlabel('') + ax.set_ylabel('') + ax.set_zlabel('') + ax.xaxis.set_ticklabels([]) + ax.yaxis.set_ticklabels([]) + ax.zaxis.set_ticklabels([]) + if not frame_on: + ax.xaxis._axinfo['grid']['color'] = (1, 1, 1, 0) + ax.yaxis._axinfo['grid']['color'] = (1, 1, 1, 0) + ax.zaxis._axinfo['grid']['color'] = (1, 1, 1, 0) + ax.xaxis._axinfo['tick']['inward_factor'] = 0 + ax.xaxis._axinfo['tick']['outward_factor'] = 0 + ax.yaxis._axinfo['tick']['inward_factor'] = 0 + ax.yaxis._axinfo['tick']['outward_factor'] = 0 + ax.zaxis._axinfo['tick']['inward_factor'] = 0 + ax.zaxis._axinfo['tick']['outward_factor'] = 0 + count += 1 + for i in range(col+1, n_cols): + fig.delaxes(axs[row, i]) + fig.tight_layout() \ No newline at end of file diff --git 
"""Filesystem layout used by the multivelo analysis scripts.

Importing this module computes the path constants below and creates the
cache, data, reference-data and cellranger ``outs`` directories if they do
not exist yet.

The root directory defaults to a platform-specific location; set the
``MULTIDYNAMO_ROOT_PATH`` environment variable to override it without
editing this file.
"""
import os
import platform

# Determine platform on which analysis is running
running_on = platform.system()

# Optional override of the root directory (empty/unset -> platform default)
_root_override = os.environ.get('MULTIDYNAMO_ROOT_PATH')

# Set up locale configuration here
if running_on == 'Darwin':
    # ... OSX system
    # ... ... root path
    ROOT_PATH = _root_override or '/Users/cordessf/OneDrive'  # <============= CHANGE THIS !!!

    # ... ... repo path
    REPO_PATH = os.path.join(ROOT_PATH, 'ACI', 'Repositories')
elif running_on == 'Linux':
    # ... Linux system
    # ... ... root path
    ROOT_PATH = _root_override or '/data/LIRGE'  # <============= CHANGE THIS !!!

    # ... ... repo path
    REPO_PATH = os.path.join(ROOT_PATH, 'Repositories')
else:
    # ... any other system (e.g. Windows): previously both paths stayed None
    # and the os.path.join calls below raised TypeError at import time.
    ROOT_PATH = _root_override or os.path.expanduser('~')
    REPO_PATH = os.path.join(ROOT_PATH, 'Repositories')

# ... Path to base directory (where code and results are kept)
BASE_PATH = os.path.join(REPO_PATH, 'MultiDynamo')

# ... Path to cache intermediate results
CACHE_PATH = os.path.join(ROOT_PATH, 'cache')

# ... Path to data
DATA_PATH = os.path.join(ROOT_PATH, 'external_data', 'multiome')

# ... Path to reference data
REFERENCE_DATA_PATH = os.path.join(ROOT_PATH, 'reference_data')

# Structure the data as it would come out of a cellranger run
# ... cellranger outs directory
OUTS_PATH = os.path.join(DATA_PATH, 'outs')

# Create the working directories; exist_ok avoids the check-then-create race
for _path in (CACHE_PATH, DATA_PATH, REFERENCE_DATA_PATH, OUTS_PATH):
    os.makedirs(_path, exist_ok=True)

# Path to ATAC-seq data
ATAC_PATH = os.path.join(ROOT_PATH, 'external_data', '10k_human_PBMC_ATAC')

# Path to genome annotation
GTF_PATH = os.path.join(REFERENCE_DATA_PATH, 'annotation', 'Homo_sapiens.GRCh38.112.gtf.gz')

# Path to multiomic data
MULTIOME_PATH = DATA_PATH

# Path to RNA-seq data
RNA_PATH = os.path.join(ROOT_PATH, 'external_data', '10k_human_PBMC_RNA')
# Static function
# direction_cosine
def direction_cosine(args):
    """Cosine between cell ``i``'s velocity and the displacement to cell ``j``.

    Takes a single tuple so it can be used directly with ``Pool.map``.

    Args:
        args: ``(i, j, expression_mtx, velocity_mtx)``. Both matrices are
            indexed by row; each may be a scipy sparse matrix or a
            ``numpy.ndarray``.

    Returns:
        ``(i, j, cosine)``. ``cosine`` is ``-1`` when ``i == j`` or when
        either the velocity of ``i`` or the displacement ``j - i`` has zero
        norm.

    Raises:
        TypeError: if a matrix is neither sparse nor a ``numpy.ndarray``.
    """
    i, j, expression_mtx, velocity_mtx = args

    if i == j:
        return i, j, -1

    def _dense_row(mtx, k, label):
        # Return row k of mtx as a flat dense vector.
        if issparse(mtx):
            return mtx.getrow(k).toarray().ravel()
        if isinstance(mtx, np.ndarray):
            return np.asarray(mtx[k, :]).ravel()
        main_exception(f'{label} matrix is instance of class {type(mtx)}')
        # main_exception is defined outside this block; raising here makes
        # sure an unsupported type never reaches the arithmetic below.
        raise TypeError(f'{label} matrix is instance of class {type(mtx)}')

    delta_ij = _dense_row(expression_mtx, j, 'Expression') - _dense_row(expression_mtx, i, 'Expression')

    # The velocity matrix is accepted in dense form too (it was sparse-only).
    vel_i = _dense_row(velocity_mtx, i, 'Velocity')

    dot_product = np.dot(delta_ij, vel_i)
    magnitude_vel_i = np.linalg.norm(vel_i)
    magnitude_delta_ij = np.linalg.norm(delta_ij)

    if magnitude_vel_i != 0 and magnitude_delta_ij != 0:
        cosine_similarity = dot_product / (magnitude_vel_i * magnitude_delta_ij)
    else:
        # One of velocity or delta_ij is zero, so can't compute a cosine, we'll just set to
        # lowest possible value (-1)
        cosine_similarity = -1

    return i, j, cosine_similarity
def get_neighbors(adata: AnnData,
                  mode: str = 'distances'):
    """Fetch a neighbor-graph matrix stored on ``adata``.

    Looks for ``mode`` first in ``adata.obsp`` and then in
    ``adata.uns['neighbors']``.

    Args:
        adata: annotated data object to read from.
        mode: key of the matrix to fetch (default ``'distances'``).

    Returns:
        The object stored under ``mode``.

    Raises:
        ValueError: if ``mode`` is found in neither location.
    """
    if hasattr(adata, 'obsp') and mode in adata.obsp:
        return adata.obsp[mode]
    if 'neighbors' in adata.uns.keys() and mode in adata.uns['neighbors']:
        return adata.uns['neighbors'][mode]

    main_exception(f'The selected mode {mode} is not valid.')
    # main_exception is defined outside this block; the explicit raise keeps
    # the function from silently returning None if that helper only logs.
    raise ValueError(f'The selected mode {mode} is not valid.')
def select_connectivities(connectivities,
                          n_neighbors=None):
    """Keep only the strongest ``n_neighbors`` connectivities in each row.

    Args:
        connectivities: cell-by-cell connectivity matrix (scipy sparse or
            anything ``csr_matrix`` accepts). It is not modified.
        n_neighbors: maximum number of positive entries to keep per row. It
            is capped at the smallest per-row count of positive entries;
            ``None`` means "use that smallest count".

    Returns:
        A new CSR matrix in which every row that had more than
        ``n_neighbors`` positive entries keeps only its ``n_neighbors``
        largest stored values.
    """
    # Work on a CSR copy and drop explicit zeros up front, so stored entries
    # and row pointers describe the same thing.
    C = csr_matrix(connectivities, copy=True)
    C.eliminate_zeros()

    n_counts = np.asarray((C > 0).sum(1)).ravel()
    n_neighbors = (
        n_counts.min() if n_neighbors is None else min(n_counts.min(),
                                                       n_neighbors)
    )

    # Row slices come from indptr. Deriving them from the cumulative count of
    # positive entries is only right when every stored value is positive.
    indptr = C.indptr
    dat = C.data

    for row in np.flatnonzero(n_counts > n_neighbors):
        n0, n1 = indptr[row], indptr[row + 1]
        # Positions (within the row) of everything past the top n_neighbors
        rm_idx = n0 + dat[n0:n1].argsort()[::-1][n_neighbors:]
        dat[rm_idx] = 0

    C.eliminate_zeros()
    return C
# smooth_scale - lifted from MultiVelo
def smooth_scale(conn,
                 vector):
    """Smooth ``vector`` with ``conn`` and rescale it to the original range.

    Args:
        conn: matrix exposing ``.dot`` (dense or sparse) used to smooth.
        vector: values to smooth.

    Returns:
        ``conn.dot(vector.T).T`` mapped linearly so that its minimum and
        maximum equal the minimum and maximum of ``vector``. If the smoothed
        values are all equal, the rescaling is undefined and every entry is
        set to ``min(vector)`` rather than NaN.
    """
    max_to = np.max(vector)
    min_to = np.min(vector)
    v = conn.dot(vector.T).T
    max_from = np.max(v)
    min_from = np.min(v)
    span_from = max_from - min_from

    if span_from == 0:
        # Constant smoothed signal: avoid 0/0 and collapse onto the lower
        # bound of the target range (same convention as min-max scalers).
        return (v - min_from) + min_to

    res = ((v - min_from) * (max_to - min_to) / span_from) + min_to
    return res
integral_neighbors.copy() if integral_neighbors is not None else None + + self.linkage_fn = linkage_fn + + self.linkage_method = linkage_method + + self.max_peak_dist = max_peak_dist + + self.min_corr = min_corr + + self.neighbor_method = neighbor_method + + self.nn_dist = nn_dist.copy() if nn_dist is not None else None + + self.nn_idx = nn_idx.copy() if nn_idx is not None else None + + self.peak_annot_fn = peak_annot_fn + + self._promoter_dict = promoter_dict.copy() if promoter_dict is not None else None + + def atac_elements(self): + return self.mdata['atac'].var_names.tolist() + + def compute_linkages(self) -> None: + if self.linkage_method == 'cellranger': + self.compute_linkages_via_cellranger() + elif self.linkage_method == 'cicero': + self.compute_linkages_via_cicero() + elif self.linkage_method == 'scenic+': + self.compute_linkages_via_scenicplus() + else: + main_exception(f'Unrecognized method to compute linkages ({self.linkage_method}) requested.') + + def compute_linkages_via_cellranger(self) -> None: + # This reads the cellranger-arc 'feature_linkage.bedpe' and 'peak_annotation.tsv' files + # to extract dictionaries attributing cis-regulatory elements with specific genes + main_info('Computing linkages via cellranger ...') + linkage_logger = LoggerManager.gen_logger('compute_linkages_via_cellranger') + linkage_logger.log_time() + + # Confirm that this is matched ATAC- and RNA-seq data + if not self.mdata.mod['atac'].uns[MDKM.MATCHED_ATAC_RNA_DATA_KEY]: + main_exception('Cannot use cellranger to compute CRE linkages for UNMATCHED data') + + outs_data_path = os.path.join(self.mdata.mod['atac'].uns['base_data_path'], 'outs') + # Confirm that the base path to the 'outs' directory exists + if not os.path.exists(outs_data_path): + main_exception(f'The path to the 10X outs directory ({outs_data_path}) does not exist.') + + # Read annotations + peak_annot_path = os.path.join(outs_data_path, self.peak_annot_fn) + if not os.path.exists(peak_annot_path): + 
main_exception(f'The path to the peak annotation file ({peak_annot_path}) does not exist.') + + corr_dict, distal_dict, gene_body_dict, promoter_dict = {}, {}, {}, {} + with open(peak_annot_path) as f: + # Scan the header to determine version of CellRanger used in making the peak annotation file + header = next(f) + fields = header.split('\t') + + # Peak annotation should contain 4 columns for version 1.X of CellRanger and 6 columns for + # version 2.X + if len(fields) not in [4, 6]: + main_exception('Peak annotation file should contain 4 columns (CellRanger ARC 1.0.0) ' + + 'or 6 columns (CellRanger ARC 2.0.0)') + else: + offset = 0 if len(fields) == 4 else 2 + + for line in f: + fields = line.rstrip().split('\t') + + peak = f'{fields[0]}:{fields[1]}-{fields[2]}' if offset else \ + f"{fields[0].split('_')[0]}:{fields[0].split('_')[1]}-{fields[0].split('_')[2]}" + + if fields[1 + offset] == '': + continue + + genes, dists, types = \ + fields[1 + offset].split(';'), fields[2 + offset].split(';'), fields[3 + offset].split(';') + + for gene, dist, annot in zip(genes, dists, types): + if annot == 'promoter': + promoter_dict.setdefault(gene, []).append(peak) + elif annot == 'distal': + if dist == '0': + gene_body_dict.setdefault(gene, []).append(peak) + else: + distal_dict.setdefault(gene, []).append(peak) + + # Read linkages + linkage_path = os.path.join(outs_data_path, 'analysis', 'feature_linkage', self.linkage_fn) + if not os.path.exists(linkage_path): + main_exception(f'The path to the linkage file ({linkage_path}) does not exist.') + with open(linkage_path) as f: + for line in f: + fields = line.rstrip().split('\t') + + # Form proper peak coordinates + peak_1, peak_2 = f'{fields[0]}:{fields[1]}-{fields[2]}', f'{fields[3]}:{fields[4]}-{fields[5]}' + + # Split the gene pairs + genes_annots_1, genes_annots_2 = \ + fields[6].split('><')[0][1:].split(';'), fields[6].split('><')[1][:-1].split(';') + + # Extract correlation + correlation = float(fields[7]) + + # Extract 
distance between peaks + dist = float(fields[11]) + + if fields[12] == 'peak-peak': + for gene_annot_1 in genes_annots_1: + gene_1, annot_1 = gene_annot_1.split('_') + for gene_annot_2 in genes_annots_2: + gene_2, annot_2 = gene_annot_2.split('_') + + if (((annot_1 == 'promoter') != (annot_2 == 'promoter')) and + ((gene_1 == gene_2) or (dist < self.max_peak_dist))): + gene = gene_1 if annot_1 == 'promoter' else gene_2 + + if (peak_2 not in corr_dict.get(gene, []) and annot_1 == 'promoter' and + (gene_2 not in gene_body_dict or peak_2 not in gene_body_dict.get(gene_2, []))): + corr_dict.setdefault(gene, [[], []])[0].append(peak_2) + corr_dict[gene][1].append(correlation) + + if (peak_1 not in corr_dict.get(gene, []) and annot_2 == 'promoter' and + (gene_1 not in gene_body_dict or peak_1 not in gene_body_dict.get(gene_1, []))): + corr_dict.setdefault(gene, [[], []])[0].append(peak_1) + corr_dict[gene][1].append(correlation) + + elif fields[12] == 'peak-gene': + gene_2 = genes_annots_2[0] + for gene_annot_1 in genes_annots_1: + gene_1, annot_1 = gene_annot_1.split('_') + + if (gene_1 == gene_2) or (dist < self.max_peak_dist): + gene = gene_1 + + if (peak_1 not in corr_dict.get(gene, []) and annot_1 != 'promoter' and + (gene_1 not in gene_body_dict or peak_1 not in gene_body_dict.get(gene_1, []))): + corr_dict.setdefault(gene, [[], []])[0].append(peak_1) + corr_dict[gene][1].append(correlation) + + elif fields[12] == 'gene-peak': + gene_1 = genes_annots_1[0] + for gene_annot_2 in genes_annots_2: + gene_2, annot_2 = gene_annot_2.split('_') + + if (gene_1 == gene_2) or (dist < self.max_peak_dist): + gene = gene_1 + + if (peak_2 not in corr_dict.get(gene, []) and annot_2 != 'promoter' and + (gene_2 not in gene_body_dict or peak_2 not in gene_body_dict.get(gene_2, []))): + corr_dict.setdefault(gene, [[], []])[0].append(peak_2) + corr_dict[gene][1].append(correlation) + + cre_dict = {} + gene_dict = promoter_dict + promoter_genes = list(promoter_dict.keys()) + + for gene in 
def compute_neighbors(self,
                      atac_lsi_key: str = MDKM.ATAC_OBSM_LSI_KEY,
                      lr: float = 0.0001,
                      max_epochs: int = 10,  # 10 for debug mode 500 for release,
                      mv_algorithm: bool = True,
                      n_comps_atac: int = 20,
                      n_comps_rna: int = 20,
                      n_neighbors: int = 20,
                      pc_key: str = MDKM.ATAC_OBSM_PC_KEY,
                      random_state: int = 42,
                      rna_pca_key: str = MDKM.RNA_OBSM_PC_KEY,
                      scale_factor: float = 1e4,
                      use_highly_variable: bool = False
                      ) -> None:
    """Compute cell neighbors with the method named by ``self.neighbor_method``.

    ``'multivi'`` dispatches to ``compute_neighbors_via_multivi`` and
    ``'wnn'`` to ``weighted_nearest_neighbors``; any other value is reported
    through ``main_exception``.

    Args:
        atac_lsi_key: forwarded to the WNN method.
        lr: learning rate, forwarded to the MULTIVI method.
        max_epochs: training epochs, forwarded to the MULTIVI method.
        mv_algorithm: accepted for interface compatibility; not used here.
        n_comps_atac: forwarded to the WNN method as ``n_components_atac``.
        n_comps_rna: forwarded to the WNN method as ``n_components_rna``.
        n_neighbors: number of neighbors, forwarded to both methods.
        pc_key: accepted for interface compatibility; not used here.
        random_state: forwarded to the WNN method.
        rna_pca_key: forwarded to the WNN method.
        scale_factor: accepted for interface compatibility; not used here.
        use_highly_variable: forwarded to the WNN method.
    """
    if self.neighbor_method == 'multivi':
        # n_neighbors is forwarded so a caller's choice is honoured; it used
        # to be dropped, leaving MULTIVI on its own default.
        self.compute_neighbors_via_multivi(
            lr=lr,
            max_epochs=max_epochs,
            n_neighbors=n_neighbors)
    elif self.neighbor_method == 'wnn':
        self.weighted_nearest_neighbors(
            atac_lsi_key=atac_lsi_key,
            n_components_atac=n_comps_atac,
            n_components_rna=n_comps_rna,
            nn=n_neighbors,
            random_state=random_state,
            rna_pca_key=rna_pca_key,
            use_highly_variable=use_highly_variable)
    else:
        main_exception(f'Unrecognized method to compute neighbors ({self.neighbor_method}) requested.')
n_neighbors: int = 20, + ) -> None: + import scvi + import scanpy as sc + main_info('Computing nearest neighbors in latent representation generated by MULTIVI ...', indent_level=1) + nn_logger = LoggerManager.gen_logger('compute_nn_via_mvi') + nn_logger.log_time() + + # Extract the ATAC-seq and RNA-seq portions + atac_adata, rna_adata = self.mdata.mod['atac'], self.mdata.mod['rna'] + n_peaks, n_genes = atac_adata.n_vars, rna_adata.n_vars + + # Ensure that the ATAC- and RNA-seq portions have same number of cells + assert (atac_adata.n_obs == rna_adata.n_obs) + + # Restructure the data into MULTIVI format - we do not perform TF-IDF transformation + # ... X - counts or normalized counts??? + tmp_adata_X = hstack([rna_adata.layers[MDKM.RNA_COUNTS_LAYER], atac_adata.layers[MDKM.ATAC_COUNTS_LAYER]]) + + # ... obs + tmp_adata_obs = rna_adata.obs.copy() + + # ... var + tmp_adata_var = pd.concat([rna_adata.var.copy(), atac_adata.var.copy()], join='inner', axis=0) + + tmp_adata = AnnData(X=tmp_adata_X.copy(), obs=tmp_adata_obs, var=tmp_adata_var) + tmp_adata.layers['counts'] = tmp_adata.X.copy() + + # Get the number of cells + num_cells = tmp_adata.n_obs + + # Generate a random permutation of cell indices + cell_indices = np.random.permutation(num_cells) + + # Determine the split point + split_point = num_cells // 2 + + # Split indices into two groups + cell_indices_1 = cell_indices[:split_point] + cell_indices_2 = cell_indices[split_point:] + + # Subset the AnnData object into two disjoint AnnData objects + tmp_adata_1 = tmp_adata[cell_indices_1].copy() + tmp_adata_1.obs['modality'] = 'first_set' + tmp_adata_2 = tmp_adata[cell_indices_2].copy() + tmp_adata_2.obs['modality'] = 'second_set' + + tmp_adata = scvi.data.organize_multiome_anndatas(tmp_adata_1, tmp_adata_2) + + # Run MULTIVI + # ... setup AnnData object for scvi-tools + main_info('Setting up combined data for MULTIVI', indent_level=2) + scvi.model.MULTIVI.setup_anndata(tmp_adata, batch_key='modality') + + # ... 
instantiate the SCVI model + main_info('Instantiating MULTIVI model', indent_level=2) + multivi_model = scvi.model.MULTIVI(adata=tmp_adata, n_genes=n_genes, n_regions=n_peaks, n_latent=n_comps) + multivi_model.view_anndata_setup() + + # ... train the model + main_info('Training MULTIVI model', indent_level=2) + multivi_model.train(max_epochs=max_epochs, lr=lr) + + # Extract latent representation + main_info('extracting latent representation for ATAC-seq', indent_level=3) + atac_adata.obsm['X_mvi_latent'] = multivi_model.get_latent_representation().copy() + rna_adata.obsm['X_mvi_latent'] = multivi_model.get_latent_representation().copy() + + # Compute nearest neighbors + main_info('Computing nearest neighbors in MVI latent representation', indent_level=2) + sc.pp.neighbors(rna_adata, n_neighbors=n_neighbors, n_pcs=n_comps, use_rep='X_mvi_latent') + + # Redundantly copy over to atac-seq modality + atac_adata.obsp['distances'] = rna_adata.obsp['distances'].copy() + atac_adata.obsp['connectivities'] = rna_adata.obsp['connectivities'].copy() + atac_adata.uns['neighbors'] = rna_adata.uns['neighbors'].copy() + + # Extract the matrix storing the distances between each cell and its neighbors + cx = coo_matrix(rna_adata.obsp['distances'].copy()) + + # the number of cells + cells = rna_adata.obsp['distances'].shape[0] + + # define the shape of our final results + # and make the arrays that will hold the results + new_shape = (cells, n_neighbors) + nn_dist = np.zeros(shape=new_shape) + nn_idx = np.zeros(shape=new_shape) + + # new_col defines what column we store data in our result arrays + new_col = 0 + + # loop through the distance matrices + for i, j, v in zip(cx.row, cx.col, cx.data): + # store the distances between neighbor cells + nn_dist[i][new_col % n_neighbors] = v + + # for each cell's row, store the row numbers of its neighbor cells + # (1-indexing instead of 0- is a holdover from R multimodalneighbors()) + nn_idx[i][new_col % n_neighbors] = int(j) + 1 + + new_col += 
def compute_velocities(self,
                       linkage_method: Optional[Literal['cellranger', 'cicero', 'scenic+']] = 'cellranger',
                       mode: Literal['deterministic', 'stochastic'] = 'deterministic',
                       neighbor_method: Literal['multivi', 'wnn'] = 'wnn',
                       num_processes: int = 6) -> None:
    """Run the full velocity pipeline.

    Steps, in order: compute linkages, compute neighbors, smooth chromatin
    accessibility, compute transcriptomic velocity, and lift it to
    chromatin velocity.

    Args:
        linkage_method: if not ``None``, replaces ``self.linkage_method``.
        mode: forwarded to ``transcriptomic_velocity``.
        neighbor_method: if not ``None``, replaces ``self.neighbor_method``.
        num_processes: worker count forwarded to the velocity steps.

    Raises:
        ValueError: if no linkage method or no neighbor method is set after
            applying the arguments.
    """
    if linkage_method is not None:
        self.linkage_method = linkage_method

    if neighbor_method is not None:
        self.neighbor_method = neighbor_method

    if (self.linkage_method is None) or (self.neighbor_method is None):
        main_exception('linkage_method and neighbor_method must be specified.')
        # main_exception is defined outside this block; raise explicitly so
        # the pipeline never starts with an unset method.
        raise ValueError('linkage_method and neighbor_method must be specified.')

    # Compute linkages
    self.compute_linkages()

    # Compute neighbors
    self.compute_neighbors()

    # Compute smoother accessibility
    self.knn_smoothed_chrom()

    # Compute transcriptomic velocity
    self.transcriptomic_velocity(mode=mode, num_processes=num_processes)

    # Compute lift of transcriptomic velocity
    self.lift_transcriptomic_velocity(num_processes=num_processes)
@classmethod
def from_mdata(cls,
               mdata: MuData):
    """Rebuild an instance from the state stored in a ``MuData`` object.

    Reads the CRE/promoter dictionaries and keyword settings from the
    ``'atac'`` modality, and the neighbor matrices, cosine similarities and
    integral neighbors from the ``'rna'`` modality.

    Args:
        mdata: object with ``'atac'`` and ``'rna'`` modalities carrying the
            entries read below.

    Returns:
        A new instance of ``cls`` initialised from those entries.
    """
    # Deep copy MuData object for export
    atac_adata, rna_adata = mdata.mod['atac'].copy(), mdata.mod['rna'].copy()

    # ... from atac
    # ... bit of kludge: dictionaries appear to require type casting after deserialization
    cre_dict = {str(gene): [str(cre) for cre in cre_list]
                for gene, cre_list in atac_adata.uns['cre_dict'].items()}
    promoter_dict = {str(gene): [str(promoter) for promoter in promoter_list]
                     for gene, promoter_list in atac_adata.uns['promoter_dict'].items()}

    multi_dynamo_kwargs = atac_adata.uns['multi_dynamo_kwargs']
    include_gene_body = multi_dynamo_kwargs.get('include_gene_body', False)
    linkage_fn = multi_dynamo_kwargs.get('linkage_fn', 'feature_linkage.bedpe')
    linkage_method = multi_dynamo_kwargs.get('linkage_method', 'cellranger')
    max_peak_dist = multi_dynamo_kwargs.get('max_peak_dist', 10000)
    min_corr = multi_dynamo_kwargs.get('min_corr', 0.5)
    # Was read from 'min_corr', which put the correlation threshold where a
    # file name is expected. NOTE(review): key assumed to mirror the
    # constructor argument name, like the other entries - confirm against
    # the code that writes 'multi_dynamo_kwargs'.
    peak_annot_fn = multi_dynamo_kwargs.get('peak_annot_fn', 'peak_annotation.tsv')

    # ... from rna
    nn_dist = rna_adata.obsm['multi_dynamo_nn_dist']
    nn_idx = rna_adata.obsm['multi_dynamo_nn_idx']

    cosine_similarities = rna_adata.obsp['cosine_similarities']
    # ... bit of kludge: dictionaries appear to require type casting after deserialization
    integral_neighbors = {int(k): int(v) for k, v in rna_adata.uns['integral_neighbors'].items()}

    multi_dynamo_kwargs = rna_adata.uns['multi_dynamo_kwargs']
    neighbor_method = multi_dynamo_kwargs.get('neighbor_method', 'multivi')

    multi_velocity = cls(mdata=mdata,
                         cre_dict=cre_dict,
                         cosine_similarities=cosine_similarities,
                         include_gene_body=include_gene_body,
                         integral_neighbors=integral_neighbors,
                         linkage_fn=linkage_fn,
                         linkage_method=linkage_method,
                         max_peak_dist=max_peak_dist,
                         min_corr=min_corr,
                         nn_dist=nn_dist,
                         nn_idx=nn_idx,
                         neighbor_method=neighbor_method,
                         peak_annot_fn=peak_annot_fn,
                         promoter_dict=promoter_dict)

    return multi_velocity
Try calling compute_neighbors first.') + else: + nn_dist = self.nn_dist + + atac_adata, rna_adata = self.mdata.mod['atac'], self.mdata.mod['rna'] + n_cells = atac_adata.n_obs + + if (nn_idx.shape[0] != n_cells) or (nn_dist.shape[0] != n_cells): + main_exception('Number of rows of KNN indices does not equal to number of cells.') + + X = coo_matrix(([], ([], [])), shape=(n_cells, 1)) + from umap.umap_ import fuzzy_simplicial_set + conn, sigma, rho, dists = fuzzy_simplicial_set(X=X, + n_neighbors=nn, + random_state=None, + metric=None, + knn_indices=nn_idx-1, + knn_dists=nn_dist, + return_dists=True) + + conn = conn.tocsr().copy() + n_counts = (conn > 0).sum(1).A1 + if nn is not None and nn < n_counts.min(): + conn = top_n_sparse(conn, nn) + conn.setdiag(1) + conn_norm = conn.multiply(1.0 / conn.sum(1)).tocsr() + + # Compute first moment of chromatin accessibility + atac_adata.layers[MDKM.RNA_FIRST_MOMENT_CHROM_LAYER] = \ + csr_matrix.dot(conn_norm, atac_adata.layers['counts']).copy() + + # Overwrite ATAC- and RNA-seq connectivities + atac_adata.obsp['connectivities'] = conn.copy() + rna_adata.obsp['connectivities'] = conn.copy() + + self.mdata.mod['atac'] = atac_adata.copy() + self.mdata.mod['rna'] = rna_adata.copy() + + def lift_transcriptomic_velocity(self, + num_processes: int = 6): + # Compute integral neighbors + main_info('Starting computation of integral neighbors ...') + self.find_cell_along_integral_curve(num_processes=num_processes) + + # Extract the ATAC- and RNA-seq data + atac_adata, rna_adata = self.mdata.mod['atac'], self.mdata.mod['rna'] + + # Retrieve specified layer for chromatin state + chromatin_state = atac_adata.layers[MDKM.ATAC_TFIDF_LAYER] + + cosine_similarities = None + if self.cosine_similarities is None: + main_exception('Please compute integral neighbors before calling lift_transcriptomic_velocity.') + else: + cosine_similarities = self.cosine_similarities + + # Retrieve specified layer for expression matrix + expression_mtx = 
def _restrict_dicts_to_gene_list(self,
                                 gene_list: List[str],
                                 cre_dict: Dict[str, List[str]] = None,
                                 promoter_dict: Dict[str, List[str]] = None
                                 ) -> Tuple[List[str], List[str], Dict[str, List[str]], Dict[str, List[str]]]:
    """Restrict CRE/promoter dictionaries to given genes and present elements.

    Args:
        gene_list: Genes to retain.
        cre_dict: Mapping from gene to its cis-regulatory elements.  ``None``
            is treated as an empty mapping.
        promoter_dict: Mapping from gene to its promoter elements.  ``None``
            is treated as an empty mapping.

    Returns:
        A tuple ``(shared_elements, shared_genes, shared_cre_dict,
        shared_promoter_dict)``.  ``shared_elements`` is the de-duplicated
        list of elements that belong to a retained gene and are returned by
        ``self.atac_elements()``; ``shared_genes`` are the genes of
        ``gene_list`` with at least one such element; the two dictionaries
        hold the surviving, de-duplicated elements per gene, and genes
        without any are dropped.  List orderings are unspecified because the
        de-duplication goes through sets.
    """
    # Sets make every membership test below O(1) instead of a list scan.
    present_elements = set(self.atac_elements())

    # The signature defaults to None; normalize so len()/items() are safe.
    cre_dict = {} if cre_dict is None else cre_dict
    promoter_dict = {} if promoter_dict is None else promoter_dict

    if len(gene_list) == 0:
        main_exception('Require non-trivial gene_list for _restrict_to_gene_list.')

    if len(cre_dict) == 0 or len(promoter_dict) == 0:
        main_exception('Require non-trivial enhancer and promoter dicts for _restrict_to_gene_list.')

    wanted_genes = set(gene_list)

    def _restrict(element_dict):
        # Keep only wanted genes and, per gene, the unique present elements.
        return {gene: list({element for element in element_list if element in present_elements})
                for gene, element_list in element_dict.items()
                if gene in wanted_genes}

    shared_cre_dict = _restrict(cre_dict)
    shared_promoter_dict = _restrict(promoter_dict)

    # Unique elements associated with any retained gene
    shared_elements = list({element
                            for restricted in (shared_cre_dict, shared_promoter_dict)
                            for element_list in restricted.values()
                            for element in element_list})

    # Clean up trivial (empty) entries in both dicts
    shared_cre_dict = {gene: elements for gene, elements in shared_cre_dict.items() if elements}
    shared_promoter_dict = {gene: elements for gene, elements in shared_promoter_dict.items() if elements}

    # Genes that actually have elements present in the scATAC-seq data
    shared_genes = list(set(shared_cre_dict) | set(shared_promoter_dict))

    return shared_elements, shared_genes, shared_cre_dict, shared_promoter_dict
def to_mdata(self) -> MuData:
    """Export the object's state as a fresh ``MuData`` object.

    Both modalities are copied; the CRE/promoter dictionaries and the
    construction keyword arguments are stored in ``.uns`` of the ATAC
    modality, while the neighbor matrices, cosine similarities and integral
    neighbors are stored on the RNA modality.

    Returns:
        A ``MuData`` with the embellished ``'atac'`` and ``'rna'`` modalities.
    """
    modalities = self.mdata.mod
    atac_export = modalities['atac'].copy()
    rna_export = modalities['rna'].copy()

    # ... embellish atac
    atac_uns = atac_export.uns
    atac_uns['cre_dict'] = self._cre_dict.copy()
    atac_uns['promoter_dict'] = self._promoter_dict.copy()
    atac_uns['multi_dynamo_kwargs'] = dict(include_gene_body=self.include_gene_body,
                                           linkage_fn=self.linkage_fn,
                                           linkage_method=self.linkage_method,
                                           max_peak_dist=self.max_peak_dist,
                                           min_corr=self.min_corr,
                                           peak_annot_fn=self.peak_annot_fn)

    # ... embellish rna
    rna_export.obsm['multi_dynamo_nn_dist'] = self.nn_dist.copy()
    rna_export.obsm['multi_dynamo_nn_idx'] = self.nn_idx.copy()
    rna_export.obsp['cosine_similarities'] = self.cosine_similarities.copy()

    # Keys and values are stored as strings; the comprehension already
    # yields a new dict.
    serialized_neighbors = {}
    for cell, neighbor in self.integral_neighbors.items():
        serialized_neighbors[str(cell)] = str(neighbor)
    rna_export.uns['integral_neighbors'] = serialized_neighbors
    rna_export.uns['multi_dynamo_kwargs'] = dict(neighbor_method=self.neighbor_method)

    return MuData({'atac': atac_export, 'rna': rna_export})
def transcriptomic_velocity(self,
                            adjusted: bool = False,
                            min_r2: float = 1e-2,
                            mode: Literal['deterministic', 'stochastic'] = 'deterministic',
                            n_neighbors: int = 20,
                            n_pcs: int = 20,
                            num_processes: int = 6,
                            outlier: float = 99.8):
    """Fit per-gene RNA velocities using the linked chromatin elements.

    Computes first (and, in stochastic mode, second) moments of the RNA
    layers over ``obsp['connectivities']``, restricts both modalities to
    genes with linked elements, runs ``regression`` per gene in a process
    pool, and stores velocities, fit statistics and dynamo-style metadata on
    a copy of the RNA modality restricted to successfully fitted genes.

    Args:
        adjusted: Forwarded to ``self.compute_second_moments``.
        min_r2: Genes with ``fit_r2 >= min_r2`` are flagged
            ``use_for_dynamics``; also forwarded to ``regression``.
        mode: ``'deterministic'`` or ``'stochastic'``.
        n_neighbors: Forwarded to ``regression`` via ``fit_args``.
        n_pcs: Forwarded to ``regression`` via ``fit_args``.
        num_processes: Size of the multiprocessing pool.
        outlier: Forwarded via ``fit_args`` and recorded as ``'perc'``.
    """

    def _as_dense(matrix):
        # Layers may be stored sparse or dense; downstream slicing wants dense.
        return matrix.toarray() if issparse(matrix) else matrix

    def _non_nan_columns(matrix):
        # Boolean mask of columns whose sum over cells is not NaN.
        sums = np.asarray(matrix.sum(0)).reshape(-1) if issparse(matrix) else np.sum(matrix, axis=0)
        return ~np.isnan(sums)

    # Extract transcriptome and chromatin accessibility.  NOTE: these locals
    # keep pointing at the *unsubsetted* objects even after
    # restrict_to_gene_list(subset=True) replaces the entries in self.mdata.
    atac_adata, rna_adata = self.mdata.mod['atac'], self.mdata.mod['rna']

    # Assemble dictionary of arguments for fits
    fit_args = {'min_r2': min_r2,
                'mode': mode,
                'n_pcs': n_pcs,
                'n_neighbors': n_neighbors,
                'outlier': outlier}

    # Obtain connectivities from the scRNA-seq object
    rna_conn = rna_adata.obsp['connectivities']

    # Compute moments for transcriptome data
    main_info('computing moments for transcriptomic data ...')
    for raw_layer, moment_layer in ((MDKM.RNA_SPLICED_LAYER, MDKM.RNA_FIRST_MOMENT_SPLICED_LAYER),
                                    (MDKM.RNA_UNSPLICED_LAYER, MDKM.RNA_FIRST_MOMENT_UNSPLICED_LAYER)):
        rna_adata.layers[moment_layer] = (
            csr_matrix.dot(rna_conn, csr_matrix(rna_adata.layers[raw_layer]))
            .astype(np.float32)
            .toarray()
        )

    # Initialize select second moments for the transcriptomic data
    Mss, Mus, Muu = None, None, None
    if mode == 'stochastic':
        main_info('computing second moments', indent_level=2)
        Mss, Mus, Muu = self.compute_second_moments(adjusted=adjusted)

        rna_adata.layers[MDKM.RNA_SECOND_MOMENT_SS_LAYER] = Mss.copy()
        rna_adata.layers[MDKM.RNA_SECOND_MOMENT_US_LAYER] = Mus.copy()
        rna_adata.layers[MDKM.RNA_SECOND_MOMENT_UU_LAYER] = Muu.copy()

    if 'highly_variable' in rna_adata.var:
        main_info('using highly variable genes', indent_level=2)
        rna_gene_list = rna_adata.var_names[rna_adata.var['highly_variable']].values
    else:
        usable = (_non_nan_columns(rna_adata.layers[MDKM.RNA_FIRST_MOMENT_UNSPLICED_LAYER])
                  & _non_nan_columns(rna_adata.layers[MDKM.RNA_FIRST_MOMENT_SPLICED_LAYER]))
        rna_gene_list = rna_adata.var_names.values[usable]

    # Restrict to genes with corresponding peaks in scATAC-seq data
    shared_elements, shared_genes = self.restrict_to_gene_list(gene_list=rna_gene_list,
                                                               subset=True)

    n_fitted_genes = len(shared_genes)
    if n_fitted_genes:
        main_info(f'{n_fitted_genes} genes will be fitted')
    else:
        main_exception('None of the genes specified are in the adata object')

    velo_s = np.zeros((rna_adata.n_obs, n_fitted_genes))
    variance_velo_s = np.zeros((rna_adata.n_obs, n_fitted_genes))
    gammas = np.zeros(n_fitted_genes)
    r2s = np.zeros(n_fitted_genes)
    losses = np.zeros(n_fitted_genes)

    rna_shared = rna_adata[:, shared_genes]
    u_mat = _as_dense(rna_shared.layers[MDKM.RNA_FIRST_MOMENT_UNSPLICED_LAYER])
    s_mat = _as_dense(rna_shared.layers[MDKM.RNA_FIRST_MOMENT_SPLICED_LAYER])
    c_mat = _as_dense(atac_adata[:, shared_elements].layers[MDKM.RNA_FIRST_MOMENT_CHROM_LAYER])

    # Create dictionaries from gene / peak to their column index
    gene_to_idx_dict = {gene: idx for idx, gene in enumerate(shared_genes)}
    peak_to_idx_dict = {element: idx for idx, element in enumerate(shared_elements)}

    # Unified gene -> unique elements mapping (CREs and promoters).  Genes
    # outside the fitted set are skipped: the stored dictionaries may cover
    # more genes than shared_genes, and looking those up raised KeyError.
    elements_for_gene = {}
    for element_dict in (self._cre_dict, self._promoter_dict):
        for gene, element_list in element_dict.items():
            if gene in gene_to_idx_dict:
                elements_for_gene.setdefault(gene, set()).update(element_list)

    # Second moments are stored per column of the full RNA matrix, whereas
    # u_mat/s_mat are ordered like shared_genes; map one onto the other.
    moment_columns = rna_adata.var_names.get_indexer(shared_genes)

    # Define batch arguments
    stochastic = mode != 'deterministic'
    batches_of_arguments = []
    for i, gene in enumerate(shared_genes):
        peak_idx = sorted(peak_to_idx_dict[peak]
                          for peak in elements_for_gene.get(gene, ())
                          if peak in peak_to_idx_dict)
        col = moment_columns[i]

        batches_of_arguments.append(
            (c_mat[:, peak_idx],
             u_mat[:, i],
             s_mat[:, i],
             Mss[:, col] if stochastic else None,
             Mus[:, col] if stochastic else None,
             Muu[:, col] if stochastic else None,
             fit_args,
             mode,
             gene))

    # Carry out fits in parallel
    with Pool(processes=num_processes) as pool:
        results = pool.starmap(regression, batches_of_arguments)

    # Reformat the results
    for idx, (velocity, velocity_variance, gamma, r2, loss) in enumerate(results):
        gammas[idx] = gamma
        r2s[idx] = r2
        losses[idx] = loss
        velo_s[:, idx] = smooth_scale(rna_conn, velocity)

        if mode == 'stochastic':
            variance_velo_s[:, idx] = smooth_scale(rna_conn, velocity_variance)

    # Determine which fits failed (an infinite loss marks a failed fit)
    keep = losses != np.inf
    kept_genes = [gene for gene, kept in zip(shared_genes, keep) if kept]
    if len(kept_genes) == 0:
        main_exception('None of the genes were fit due to low quality.')

    # Subset the transcriptome to the genes for which the fits were successful
    rna_copy = rna_adata[:, kept_genes].copy()

    # ... layers
    rna_copy.layers[MDKM.RNA_SPLICED_VELOCITY_LAYER] = csr_matrix(velo_s[:, keep])
    if mode == 'stochastic':
        rna_copy.layers['variance_velo_s'] = csr_matrix(variance_velo_s[:, keep])

    # ... .obsp
    rna_copy.obsp['_RNA_conn'] = rna_conn

    # ... .uns
    # ... ... augment the dynamical and normalization information
    dyn_and_norm_info = rna_copy.uns['pp'].copy()
    dyn_and_norm_info['experiment_total_layers'] = None
    dyn_and_norm_info['layers_norm_method'] = None
    dyn_and_norm_info['tkey'] = None
    rna_copy.uns['pp'] = dyn_and_norm_info.copy()

    dynamics = {'filter_gene_mode': 'final',
                't': None,
                'group': None,
                'X_data': None,
                'X_fit_data': None,
                'asspt_mRNA': 'ss',
                'experiment_type': dyn_and_norm_info.get('experiment_type', 'conventional'),
                'normalized': True,
                'model': mode,
                'est_method': 'gmm',  # Consider altering
                'has_splicing': dyn_and_norm_info.get('has_splicing', True),
                'has_labeling': dyn_and_norm_info.get('has_labeling', False),
                'splicing_labeling': dyn_and_norm_info.get('splicing_labeling', False),
                'has_protein': dyn_and_norm_info.get('has_protein', False),
                'use_smoothed': True,
                'NTR_vel': False,
                'log_unnormalized': True,
                # Ensure X is indeed log normalized (compute exp1m, sum and check rowsums)
                'fraction_for_deg': False}
    rna_copy.uns['dynamics'] = dynamics.copy()

    rna_copy.uns['velo_s_params'] = {'mode': mode,
                                     'fit_offset': False,
                                     'perc': outlier}
    rna_copy.uns['velo_s_params'].update(fit_args)

    # ... ... These are the column names for the array in .varm['vel_params']
    vel_params_names = ['beta', 'gamma', 'half_life', 'alpha_b', 'alpha_r2', 'gamma_b',
                        'gamma_r2', 'gamma_logLL', 'delta_b', 'delta_r2', 'bs', 'bf',
                        'uu0', 'ul0', 'su0', 'sl0', 'U0', 'S0', 'total0']
    rna_copy.uns['vel_params_names'] = vel_params_names

    # ... .var
    rna_copy.var['fit_gamma'] = gammas[keep]
    rna_copy.var['fit_loss'] = losses[keep]
    rna_copy.var['fit_r2'] = r2s[keep]

    # Introduce var['use_for_dynamics'] for dynamo
    v_gene_ind = rna_copy.var['fit_r2'] >= min_r2
    rna_copy.var['use_for_dynamics'] = v_gene_ind
    rna_copy.var['velo_s_genes'] = v_gene_ind

    # ... .varm: only 'gamma' and 'gamma_r2' are populated, the rest is NaN
    vel_params_array = np.full((rna_copy.shape[1], len(vel_params_names)), np.nan)
    vel_params_array[:, vel_params_names.index('gamma')] = rna_copy.var['fit_gamma']
    vel_params_array[:, vel_params_names.index('gamma_r2')] = rna_copy.var['fit_r2']
    rna_copy.varm['vel_params'] = vel_params_array

    # Copy the subset AnnData scRNA-seq object back into the MultiomeVelocity object
    self.mdata.mod['rna'] = rna_copy.copy()

    # Filter the scATAC-seq peaks to retain only those corresponding to fit genes
    self.restrict_to_gene_list(gene_list=kept_genes, subset=True)

    # Re-read the ATAC modality: the local ``atac_adata`` is the object from
    # before any subsetting, and writing it back would undo the filter above.
    atac_adata = self.mdata.mod['atac']

    # Introduce var['use_for_dynamics'] for dynamo
    # TODO: Need to special case when no genes rise to significance
    # TODO: Restricting to elements of 'use_for_dynamics' genes does NOT
    #       appear to work properly yet - so left permissive
    atac_adata.var['use_for_dynamics'] = [True] * atac_adata.n_vars

    self.mdata.mod['atac'] = atac_adata.copy()
def weighted_nearest_neighbors(
        self,
        atac_lsi_key: str = MDKM.ATAC_OBSM_LSI_KEY,
        n_components_atac: int = 20,
        n_components_rna: int = 20,
        nn: int = 20,
        random_state: int = 42,
        rna_pca_key: str = MDKM.RNA_OBSM_PC_KEY,
        use_highly_variable: bool = False):
    """Compute weighted nearest neighbors across the RNA and ATAC modalities.

    Restricts both modalities to shared genes/elements, computes a PCA of the
    RNA data and an SVD-based LSI of the ATAC data when the corresponding
    ``.obsm`` keys are absent, runs ``pyWNN`` on the two representations and
    stores the result as ``self.nn_idx`` / ``self.nn_dist`` as well as the
    canonical ``neighbors`` / ``connectivities`` / ``distances`` entries of
    both modalities.

    Args:
        atac_lsi_key: ``.obsm`` key of the ATAC LSI representation.
        n_components_atac: Number of LSI components.
        n_components_rna: Number of principal components.
        nn: Number of neighbors per cell.
        random_state: Seed used for the PCA and for the WNN computation.
        rna_pca_key: ``.obsm`` key of the RNA PCA representation.
        use_highly_variable: Forwarded to ``scanpy.tl.pca``.
    """
    import scanpy as sc
    main_info('Starting computation of weighted nearest neighbors ...', indent_level=1)
    nn_logger = LoggerManager.gen_logger('weighted_nearest_neighbors')
    nn_logger.log_time()

    # Restrict to shared genes and their elements - as tied together by the attribution of CRE to genes
    shared_elements, shared_genes = self.restrict_to_gene_list(subset=True)

    # Extract scATAC-seq and scRNA-seq data
    atac_adata = self.mdata.mod['atac'][:, shared_elements].copy()
    rna_adata = self.mdata.mod['rna'][:, shared_genes].copy()

    if rna_pca_key not in rna_adata.obsm:
        # TODO: Consider normalizing counts here, if needed

        # Carry out PCA on scRNA-seq data
        main_info('computing PCA on normalized and scaled scRNA-seq data', indent_level=2)
        sc.tl.pca(rna_adata,
                  n_comps=n_components_rna,
                  random_state=random_state,
                  use_highly_variable=use_highly_variable)

    if atac_lsi_key not in atac_adata.obsm:
        # Carry out singular value decomposition on the scATAC-seq data
        main_info('computing latent semantic indexing of scATAC-seq data ...')
        lsi = svds(atac_adata.X, k=n_components_atac)

        # First factor returned by svds (one row per cell) is the LSI embedding
        atac_adata.obsm[atac_lsi_key] = lsi[0]

    # Cross copy the LSI decomposition
    rna_adata.obsm[atac_lsi_key] = atac_adata.obsm[atac_lsi_key]

    # Use Dylan Kotliar's python implementation of weighted nearest neighbors
    # TODO: As alternative to PCA could use the latent space from variational autoencoder.
    # The seed follows ``random_state`` (previously fixed to 42, which is
    # still the default) so the whole computation honors the caller's seed.
    WNNobj = pyWNN(rna_adata,
                   reps=[rna_pca_key, atac_lsi_key],
                   npcs=[n_components_rna, n_components_atac],
                   n_neighbors=nn,
                   seed=random_state)

    adata_seurat = WNNobj.compute_wnn(rna_adata)

    # Extract the matrix storing the distances between each cell and its neighbors
    distances = coo_matrix(adata_seurat.obsp['WNN_distance'])
    n_cells = distances.shape[0]

    # Result arrays: one row per cell, one column per neighbor
    nn_dist = np.zeros(shape=(n_cells, nn))
    nn_idx = np.zeros(shape=(n_cells, nn))

    # Next free column per cell.  A per-row counter keeps the columns aligned
    # even if some cell has fewer (or more) than ``nn`` stored neighbors; a
    # single running counter modulo ``nn`` would then shift every later row.
    next_slot = np.zeros(n_cells, dtype=int)

    for i, j, v in zip(distances.row, distances.col, distances.data):
        slot = next_slot[i]
        if slot >= nn:
            # More stored neighbors than requested: ignore the surplus
            continue

        # Store the distance between neighbor cells
        nn_dist[i, slot] = v

        # Store the row number of the neighbor cell
        # (1-indexing instead of 0- is a holdover from R multimodalneighbors())
        nn_idx[i, slot] = int(j) + 1

        next_slot[i] = slot + 1

    # Add index and distance to the MultiomeVelocity object
    self.nn_idx = nn_idx
    self.nn_dist = nn_dist

    # Revert to canonical naming of connectivities and distances
    # ... .uns['neighbors']
    atac_adata.uns['neighbors'] = adata_seurat.uns['WNN'].copy()
    rna_adata.uns['neighbors'] = adata_seurat.uns['WNN'].copy()
    del adata_seurat.uns['WNN']

    # ... .obsp['connectivities']
    atac_adata.obsp['connectivities'] = adata_seurat.obsp['WNN'].copy()
    rna_adata.obsp['connectivities'] = adata_seurat.obsp['WNN'].copy()
    del adata_seurat.obsp['WNN']

    # ... .obsp['distances']
    atac_adata.obsp['distances'] = adata_seurat.obsp['WNN_distance'].copy()
    rna_adata.obsp['distances'] = adata_seurat.obsp['WNN_distance'].copy()
    del adata_seurat.obsp['WNN_distance']

    # Copy the subset AnnData scRNA-seq and scATAC-seq objects back into the MultiomeVelocity object
    self.mdata.mod['atac'] = atac_adata.copy()
    self.mdata.mod['rna'] = rna_adata.copy()
# Helper functions
def compute_animations(adata,
                       cell_type_key: str,
                       cores: int = 6,
                       delta_epsilon: float = 0.25,
                       epsilon: float = 1.0,
                       max_tries: int = 10,
                       n_cells: int = 100,
                       n_earliest: int = 30,
                       prefix: str = None,
                       skip_cell_types: Optional[List] = None
                       ) -> None:
    """Animate predicted fates of per-cell-type progenitors after perturbation.

    For each cell type, a reference location in UMAP space is computed (mean
    of the ``n_cells`` smallest ``umap_1`` values and mean of the ``n_cells``
    largest ``umap_2`` values).  Cells of that type inside a square window
    around the reference are collected, the window being widened by
    ``delta_epsilon`` per attempt.  Up to ``n_earliest`` such cells per type
    are used as initial cells for ``fate`` in the ``'umap_perturbation'``
    basis, and the result is saved via ``animate_fates``.

    Args:
        adata: Annotated data with ``obsm['X_umap']`` and a categorical
            ``obs[cell_type_key]`` column.
        cell_type_key: Column of ``adata.obs`` holding the cell types.
        cores: Number of cores passed to ``fate``.
        delta_epsilon: Increment of the window half-width per attempt.
        epsilon: Base half-width of the search window.
        max_tries: Maximum number of widening attempts per cell type.
        n_cells: Number of extreme cells defining the reference location and
            the target count of progenitors per search.
        n_earliest: Minimum number of progenitors required for a cell type to
            contribute, and the number of cells taken from it.
        prefix: Prefix of the output file name; ``None`` means no prefix.
        skip_cell_types: Cell types to leave out; ``None`` skips nothing.
    """
    # Extract cell metadata
    cell_metadata = adata.obs.copy()

    # Add UMAP
    cell_metadata['umap_1'] = adata.obsm['X_umap'][:, 0]
    cell_metadata['umap_2'] = adata.obsm['X_umap'][:, 1]

    # Reference location per cell type: mean of the extreme UMAP coordinates
    reference_1, reference_2 = {}, {}
    for cell_type, celltype_data in cell_metadata.groupby(cell_type_key):
        reference_1[cell_type] = celltype_data.nsmallest(n_cells, 'umap_1')['umap_1'].mean()
        reference_2[cell_type] = celltype_data.nlargest(n_cells, 'umap_2')['umap_2'].mean()

    skipped = set() if skip_cell_types is None else set(skip_cell_types)
    cell_types = cell_metadata[cell_type_key].cat.categories.tolist()
    progenitor_list = []

    for cell_type in cell_types:
        if cell_type in skipped:
            continue

        print(f'Computing animation for cell type {cell_type}')

        # Use the caller-supplied column rather than a hard-coded 'celltype'
        is_type = cell_metadata[cell_type_key].isin([cell_type])
        offset_1 = abs(cell_metadata['umap_1'] - reference_1[cell_type])
        offset_2 = abs(cell_metadata['umap_2'] - reference_2[cell_type])

        # Find the progenitors, widening the window until enough are found
        n_tries, progenitors = 1, []
        while len(progenitors) < n_cells and n_tries < max_tries + 1:
            half_width = epsilon + n_tries * delta_epsilon
            in_window = is_type & (offset_1 < half_width) & (offset_2 < half_width)
            progenitors = adata.obs_names[in_window.to_numpy()]
            n_tries += 1

        if len(progenitors) >= n_earliest:
            # Progenitors for all subset simulation
            print(f'Adding {n_earliest} cells of type {cell_type}.')
            progenitor_list.extend(progenitors[:n_earliest])

    # Determine fate of progenitor_list
    fate(adata, basis='umap_perturbation', init_cells=progenitor_list, interpolation_num=100,
         direction='forward', inverse_transform=False, average=False, cores=cores)

    # Compute the animation; ':' and '/' are not safe in file names
    file_name = ('' if prefix is None else prefix) + '_perturbation.mpeg'
    file_name = file_name.replace(':', '-').replace('/', '-')
    animate_fates(adata, basis='umap_perturbation', color=cell_type_key, n_steps=100,
                  interval=100, save_show_or_return='save',
                  save_kwargs={'filename': file_name,
                               'writer': 'ffmpeg'})
cre_for_dynamics for element in atac_adata.var_names] + + return use_for_dynamics + + +class MultiomicVectorField: + def __init__(self, + multi_velocity: Union[MultiVelocity, MuData], + min_gamma: float = None, + min_r2: float = 0.01, + rescale_velo_c: float = 1.0): + # This is basically an adapter from multiomic data to format where we can borrow tools previously developed + # in dynamo. + if isinstance(multi_velocity, MuData): + multi_velocity = MultiVelocity.from_mdata(multi_velocity) + + # ... mdata + mdata = multi_velocity.get_mdata() + atac_adata, rna_adata = mdata.mod['atac'], mdata.mod['rna'] + + # ... CRE dictionary + cre_dict = multi_velocity.get_cre_dict() + + # ... promoter dictionary + promoter_dict = multi_velocity.get_promoter_dict() + + # To estimate the multi-omic velocity field, we assemble a single AnnData object from the following components + # NOTE: In our descriptions below *+* signifies the directo sum of two vector spaces + # ... .layers + # ... ... counts: counts => rna counts *+* atac counts + rna_counts = rna_adata.layers[MDKM.RNA_COUNTS_LAYER].toarray().copy() + atac_counts = atac_adata.layers[MDKM.ATAC_COUNTS_LAYER].toarray().copy() + counts = np.concatenate((rna_counts, atac_counts), axis=1) + + # ... ... raw: spliced, unspliced ==> spliced *+* chromatin, unspliced *+* 0 + chromatin_state = atac_adata.layers[MDKM.ATAC_COUNTS_LAYER].toarray().copy() + spliced = rna_adata.layers[MDKM.RNA_SPLICED_LAYER].toarray().copy() + unspliced = rna_adata.layers[MDKM.RNA_UNSPLICED_LAYER].toarray().copy() + + spliced = np.concatenate((spliced, chromatin_state), axis=1) + unspliced = np.concatenate((unspliced, np.zeros(chromatin_state.shape)), axis=1) + del chromatin_state + + # ... ... 
first moments: M_s, M_u => M_s *+* Mc, M_u *+* 0 + Mc = atac_adata.layers[MDKM.RNA_FIRST_MOMENT_CHROM_LAYER].toarray().copy() + Ms = rna_adata.layers[MDKM.RNA_FIRST_MOMENT_SPLICED_LAYER].copy() + Mu = rna_adata.layers[MDKM.RNA_FIRST_MOMENT_UNSPLICED_LAYER].copy() + + Ms = np.concatenate((Ms, Mc), axis=1) + Mu = np.concatenate((Mu, np.zeros(Mc.shape)), axis=1) + del Mc + + # ... ... velocity_S ==> velocity_S + lifted_velo_c + velocity_C = atac_adata.layers[MDKM.ATAC_CHROMATIN_VELOCITY_LAYER].copy() + velocity_S = rna_adata.layers[MDKM.RNA_SPLICED_VELOCITY_LAYER].toarray().copy() + + velocity_S = np.concatenate((velocity_S, rescale_velo_c * velocity_C), axis=1) + del velocity_C + + # ... .obs + # ... ... carry over entire obs for now + obs_df = rna_adata.obs.copy() + + # ... .obsp + # ... ... connectivities ==> connectivities + connectivities = rna_adata.obsp['connectivities'].copy() + + # ... ... distances ==> distances + distances = rna_adata.obsp['distances'].copy() + + # ... .uns + # ... ... dynamics ==> dynamics + dynamics = rna_adata.uns['dynamics'].copy() + + # ... ... neighbors ==> neighbors + neighbors = rna_adata.uns['neighbors'].copy() + + # ... ... pp ==> pp + pp = rna_adata.uns['pp'].copy() + + # ... ... vel_params_names ==> vel_params_names + vel_params_names = rna_adata.uns['vel_params_names'].copy() + + # ... .var + # ... ... var_names ==> (rna) var_names + (atac) var_names + var_names = rna_adata.var_names.tolist() + atac_adata.var_names.tolist() + + # ... ... feature_type ==> n_genes * 'gene', n_elements * 'CRE' + feature_type = rna_adata.n_vars * ['gene'] + atac_adata.n_vars * ['CRE'] + + # ... ... use_for_pca + use_for_dynamics = genes_and_elements_for_dynamics(atac_adata=atac_adata, + rna_adata=rna_adata, + cre_dict=cre_dict, + promoter_dict=promoter_dict, + min_r2=min_r2) + + # ... ... 
use_for_pca + use_for_pca = genes_and_elements_for_dynamics(atac_adata=atac_adata, + rna_adata=rna_adata, + cre_dict=cre_dict, + promoter_dict=promoter_dict, + min_r2=min_r2) + + var_df = pd.DataFrame(data={'feature_type': feature_type, + 'use_for_dynamics': use_for_dynamics, + 'use_for_pca': use_for_pca}, + index=var_names) + + # ... .varm + # ... ... vel_params => vel_params + (1,1) + vel_params_array = rna_adata.varm['vel_params'] + + chrom_vel_params_array = np.full((atac_adata.n_vars, len(vel_params_names)), np.nan) + + # ... ... create vacuous 'gamma' for chromatin data + gamma_index = np.where(np.array(vel_params_names) == 'gamma')[0][0] + chrom_vel_params_array[:, gamma_index] = np.ones(atac_adata.n_vars) + + # ... ... create vacuous 'gamma_r2' for chromatin data + gamma_r2_index = np.where(np.array(vel_params_names) == 'gamma_r2')[0][0] + chrom_vel_params_array[:, gamma_r2_index] = np.ones(atac_adata.n_vars) + + # ... ... concatenate the arrays + vel_params_array = np.concatenate((vel_params_array, chrom_vel_params_array), axis=0) + + # X ==> X + X + X = np.concatenate((rna_adata.X.toarray().copy(), atac_adata.X.toarray().copy()), axis=1) + + # Instantiate the multiomic AnnData object + adata_multi = AnnData(obs=obs_df, + var=var_df, + X=X) + # ... add .layers + # ... ... counts + adata_multi.layers[MDKM.RNA_COUNTS_LAYER] = counts + + # ... ... raw + adata_multi.layers[MDKM.RNA_SPLICED_LAYER] = spliced + adata_multi.layers[MDKM.RNA_UNSPLICED_LAYER] = unspliced + + # ... ... first moments + adata_multi.layers[MDKM.RNA_FIRST_MOMENT_SPLICED_LAYER] = Ms + adata_multi.layers[MDKM.RNA_FIRST_MOMENT_UNSPLICED_LAYER] = Mu + + # ... ... rna velocity + adata_multi.layers[MDKM.RNA_SPLICED_VELOCITY_LAYER] = velocity_S + + # ... add .obsp + adata_multi.obsp['connectivities'] = connectivities + adata_multi.obsp['distances'] = distances + + # ... 
add .uns + adata_multi.uns['dynamics'] = dynamics + adata_multi.uns['neighbors'] = neighbors + adata_multi.uns['pp'] = pp + adata_multi.uns['vel_params_names'] = vel_params_names + + # ... add varm + adata_multi.varm['vel_params'] = vel_params_array + + # Set instance variables + + self.multi_adata = adata_multi.copy() + + def cell_velocities(self, + cores: int = 6, + min_r2: float = 0.5, + n_neighbors: int = 30, + n_pcs: int = 30, + random_seed: int = 42, + trans_matrix_method: Literal["kmc", "fp", "cosine", "pearson", "transform"] = "pearson", + ) -> AnnData: + # We'll save ourselves some grief and just compute both the PCA and UMAP representations + # of the vector field up front + # ... extract the multiomic AnnData object + adata_multi = self.multi_adata.copy() + + # ... compute PCA + adata_multi = pca(adata=adata_multi, + n_pca_components=n_pcs, + random_state=random_seed) + + # ... compute the appropriate dimensional reduction + reduceDimension(adata_multi, + basis='pca', + cores=cores, + n_pca_components=n_pcs, + n_components=2, + n_neighbors=n_neighbors, + reduction_method='umap') + + # ... project high dimensional velocities onto PCA embeddings and compute cell transitions + cell_velocities(adata_multi, + basis='pca', + method=trans_matrix_method, + min_r2=min_r2, + other_kernels_dict={'transform': 'sqrt'}) + + # ... 
def plot_streamline_plot(self,
                         color: str = 'cell_type',
                         figsize: Tuple[float, float] = (9, 6),
                         **save_kwargs
                         ) -> None:
    """Draw a streamline plot of ``self.multi_adata`` on the UMAP basis.

    Args:
        color: Name of the annotation used to colour the cells; it is handed
            to ``streamline_plot`` as a one-element list.
        figsize: ``(width, height)`` of the matplotlib figure that is created.
        **save_kwargs: Collected into a dict and forwarded to
            ``streamline_plot`` as ``save_kwargs`` (same convention as
            ``plot_cell_wise_vectors``). Previously these were accepted and
            silently dropped.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=figsize)
    streamline_plot(self.multi_adata,
                    basis='umap',
                    color=[color],
                    save_kwargs=save_kwargs,
                    show_arrowed_spines=True,
                    # The plotting backend compares against the literal "on data"
                    # (with a space); 'on_data' would not select on-data labels.
                    show_legend='on data',
                    ax=ax)
    plt.show()
def compute_bw(knn_adj, embedding, n_neighbors=20):
    """Compute one bandwidth value per cell from a KNN adjacency matrix.

    For every row ``i`` of ``knn_adj.dot(knn_adj.T)`` the stored entries are
    rescaled to ``x / (2 * n_neighbors - x)`` (referred to as a Jaccard
    similarity in the error message), sorted in ascending order, and the first
    ``n_neighbors`` candidates -- extended by any further entries tied with the
    value at position ``n_neighbors`` -- are selected. The bandwidth of cell
    ``i`` is the mean of the ``n_neighbors`` largest Euclidean distances in
    ``embedding`` between cell ``i`` and the selected cells.

    Args:
        knn_adj: Sparse adjacency matrix (cells x cells); only ``.dot`` and
            ``.T`` are used, and the product must expose CSR-style
            ``indices`` / ``indptr`` / ``data``.
        embedding: 2-D array indexed as ``embedding[cell, :]``.
        n_neighbors: Number of neighbours used for the selection.

    Returns:
        ``np.ndarray`` with one bandwidth per row of ``knn_adj``.

    Raises:
        SystemExit: if a row has fewer than ``n_neighbors`` stored entries.
    """
    intersect = knn_adj.dot(knn_adj.T)
    indices = intersect.indices
    indptr = intersect.indptr
    data = intersect.data
    data = data / ((n_neighbors * 2) - data)
    bandwidth = []
    for i in range(intersect.shape[0]):
        cols = indices[indptr[i]:indptr[i + 1]]
        rowvals = data[indptr[i]:indptr[i + 1]]
        idx = np.argsort(rowvals)
        valssort = rowvals[idx]
        numinset = len(cols)
        if numinset < n_neighbors:
            # Report the actual threshold instead of a hard-coded "20".
            sys.exit('Fewer than %d cells with Jacard sim > 0' % n_neighbors)

        # Default: every candidate is kept. This also covers the case of
        # exactly ``n_neighbors`` candidates, where reading
        # ``valssort[n_neighbors]`` used to raise IndexError.
        num = numinset
        if numinset > n_neighbors:
            curval = valssort[n_neighbors]
            # Extend the selection over the run of values tied with ``curval``.
            for pos in range(n_neighbors, numinset):
                if valssort[pos] != curval:
                    num = pos
                    break

        # ``num`` is always >= n_neighbors here, so the former
        # "num < n_neighbors" failure branch was unreachable and is dropped.
        minjacinset = cols[idx][:num]
        euc_dist = ((embedding[minjacinset, :] - embedding[i, :]) ** 2).sum(axis=1) ** .5
        euc_dist_sorted = np.sort(euc_dist)[::-1]
        bandwidth.append(np.mean(euc_dist_sorted[:n_neighbors]))
    return np.array(bandwidth)
def select_topK(dist, n_neighbors=20):
    """Keep, for each row of a CSR matrix, its ``n_neighbors`` largest stored values.

    Args:
        dist: Sparse matrix in CSR layout; ``indices``, ``indptr`` and ``data``
            are read directly.
        n_neighbors: Maximum number of stored entries retained per row.

    Returns:
        A ``csr_matrix`` with the same shape as ``dist`` containing only the
        retained entries. Rows holding fewer than ``n_neighbors`` stored
        entries keep all of them (previously such rows made the row/column
        index arrays differ in length and the constructor fail).
    """
    indices = dist.indices
    indptr = dist.indptr
    data = dist.data
    nrows = dist.shape[0]

    if nrows == 0:
        # np.concatenate rejects an empty list; nothing to select anyway.
        return csr_matrix(dist.shape)

    kept_data = []
    kept_cols = []
    kept_counts = np.zeros(nrows, dtype=int)

    for i in range(nrows):
        cols = indices[indptr[i]:indptr[i + 1]]
        rowvals = data[indptr[i]:indptr[i + 1]]
        # argsort is ascending, so the tail holds the largest values.
        top = np.argsort(rowvals)[(-1 * n_neighbors):]
        kept_data.append(rowvals[top])
        kept_cols.append(cols[top])
        kept_counts[i] = len(top)

    final_data = np.concatenate(kept_data)
    final_col_ind = np.concatenate(kept_cols)
    # Repeat each row index by the number of entries actually kept for it.
    final_row_ind = np.repeat(np.arange(nrows), kept_counts)

    return csr_matrix((final_data, (final_row_ind, final_col_ind)), shape=(nrows, dist.shape[1]))
def compute_weights(self) -> None:
    """Compute the per-cell weights of the two modalities.

    For each representation ``r`` in ``self.reps`` the embedding of every cell
    is predicted twice: from the adjacency of the same modality ("within") and
    from the adjacency of the other modality ("cross"). Both predictions are
    the adjacency-weighted sum of embeddings divided by ``n_neighbors - 1``.
    The Euclidean distance between the actual and predicted embedding is
    converted to an affinity with ``compute_affinity``, and the ratio
    ``within / (cross + 0.0001)`` is stored per modality.

    Results are written to:
        self.within:  list of within-modality prediction distances.
        self.cross:   list of cross-modality prediction distances.
        self.weights: ``[w0, 1 - w0]`` where
            ``w0 = 1 / (1 + exp(ratio_1 - ratio_0))``.

    All three lists are rebuilt on every call; previously ``self.weights``
    was only appended to, so it kept growing across repeated calls while
    ``self.within`` / ``self.cross`` were reset.
    """
    other = {0: 1, 1: 0}  # maps a modality index to the opposite modality
    affinity_ratios = []
    self.within = []
    self.cross = []
    self.weights = []
    for (i, r) in enumerate(self.reps):
        embedding = self.adata.obsm[r]
        within_predict = self.NNadjacency[i].dot(embedding) / (self.n_neighbors - 1)
        cross_predict = self.NNadjacency[other[i]].dot(embedding) / (self.n_neighbors - 1)

        within_predict_dist = ((embedding - within_predict) ** 2).sum(axis=1) ** .5
        cross_predict_dist = ((embedding - cross_predict) ** 2).sum(axis=1) ** .5
        within_affinity = compute_affinity(within_predict_dist, self.NNdist[i], self.BWs[i])
        cross_affinity = compute_affinity(cross_predict_dist, self.NNdist[i], self.BWs[i])
        # The small constant keeps the ratio finite when cross_affinity is 0.
        affinity_ratios.append(within_affinity / (cross_affinity + 0.0001))
        self.within.append(within_predict_dist)
        self.cross.append(cross_predict_dist)

    self.weights.append(1 / (1 + np.exp(affinity_ratios[1] - affinity_ratios[0])))
    self.weights.append(1 - self.weights[0])
"""Settings for the multivelo module."""
import os

# NOTE: the former module-level ``global`` statements were removed; ``global``
# has no effect at module scope, the assignments below already create
# module-level names.

# the desired verbosity
VERBOSITY = 1

# cwd: The current working directory (captured once, at import time)
CWD = os.path.abspath(os.getcwd())

# the folder to which we're writing the log files
LOG_FOLDER = os.path.join(CWD, "../logs")

# the name of the file to which we're writing the logs
# (If left to the default value of None, we don't write to a file)
LOG_FILENAME = None

# the name of the gene the code is processing
GENE = None
def top_n_sparse(X, n, batch_size=500):
    """Return indices, values of the top n values in each row of a sparse matrix.

    The rows of ``X`` are processed in consecutive slices of ``batch_size``
    rows; each slice is passed to ``find_top_k_values_sparse_matrix`` and the
    per-slice results are concatenated with ``cp.concatenate``.

    Args:
        X: The sparse matrix from which to get the
            top n indices and values per row
        n: The number of highest values to extract from each row
        batch_size: Number of rows handled per call to
            ``find_top_k_values_sparse_matrix``. Defaults to 500, the value
            that used to be hard-coded.
    Returns:
        indices: The top n indices per row
        values: The top n values per row
    """
    value_ls, idx_ls = [], []
    n_rows = X.shape[0]
    for start in range(0, n_rows, batch_size):
        stop = min(start + batch_size, n_rows)
        idx_ar, value_ar = find_top_k_values_sparse_matrix(X[start:stop], n)
        value_ls.append(value_ar)
        idx_ls.append(idx_ar)

    indices = cp.concatenate(idx_ls)
    values = cp.concatenate(value_ls)

    return indices, values
def bone_marrow(
    url: str = "https://figshare.com/ndownloader/files/35826944",
    filename: str = "bone_marrow.h5ad",
) -> AnnData:
    """Load the bone marrow example dataset.

    This data consists of 27,876 genes across 5,780 cells.

    Args:
        url: Location the dataset is fetched from; passed to ``get_adata``.
        filename: File name passed to ``get_adata`` alongside ``url``.

    Returns:
        The object returned by ``get_adata(url, filename)``.
    """
    return get_adata(url, filename)
+[options.package_data] +* = multivelo/neural_nets/* diff --git a/setup.py b/setup.py index f21527a67..17ef9a0e2 100755 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ def read_requirements(path): if __name__ == "__main__": setup( name="dynamo-release", - version="v1.4.1", + version="v1.4.2-rc1", python_requires=">=3.7", install_requires=read_requirements("requirements.txt"), extras_require={ @@ -31,6 +31,7 @@ def read_requirements(path): ], # include_dirs=[np.get_include()], author="Xiaojie Qiu, Yan Zhang, Ke Ni", + author_team="dynamo team", author_email="xqiu.sc@gmail.com", description="Mapping Vector Field of Single Cells", long_description=long_description, @@ -48,4 +49,4 @@ def read_requirements(path): "scSLAMseq", "potential", ], - ) + ) \ No newline at end of file