Release candidate (#15)

* rename file * rename * code updates * documentation updates * simplify pseudobulk code * formatting * update to release notes * add docs * Update news --------- Co-authored-by: Jason Vander Heiden <jason.vanderheiden@yale.edu>
Genentech · Sep 11, 2024 · 7823fa8 · 7823fa8
1 parent cb27cd1
commit 7823fa8
Show file tree

Hide file tree

Showing 18 changed files with 2,124 additions and 684 deletions.
diff --git a/NEWS.rst b/NEWS.rst
@@ -1,8 +1,36 @@
 Release Notes
-========================================================================
+===============================================================================
+
+Version 0.3.0 pre-release notes:  September 11, 2024
+-------------------------------------------------------------------------------
+
+Exhaustive queries: 
+  + Functionality to perform exhaustive queries has been added as the new
+    method `cell_query.search_exhaustive`.
+  + The kNN query method `cell_query.search` has been renamed to
+    `cell_query.search_nearest`.
+
+Query result filtering and interpretation:
+  + Results from exhaustive queries can be constrained to specific
+    metadata criteria (e.g., tissue, disease, in vitro vs. in vivo)
+    using the `metadata_filter` argument to `cell_query.search_exhaustive`.
+  + Results from exhaustive queries can be constrained by distance-to-query
+    using the `max_dist` argument to `cell_query.search_exhaustive`.
+  + The `cell_query.compile_sample_metadata` method has been expanded to
+    allow grouping by tissue and disease (in addition to study and sample).
+  + The methods `utils.subset_by_unique_values`, `utils.subset_by_frequency`,
+    and `utils.categorize_and_sort_by_score` have been added to provide
+    tools for filtering, sorting and summarizing query results.
+
+Optimizations to training:
+  + The ASW and NMSE training evaluation metrics were added to multiple
+    methods.
+  + The `triplet_selector.get_asw` method was added to calculate ASW.
+  + Optimized sampling weights of study and cell type.
+
 
 Version 0.2.0:  March 22, 2024
-------------------------------------------------------------------------
+-------------------------------------------------------------------------------
 
 + Updated version requirements for multiple dependencies and removed
   the ``pegasuspy`` dependency.
@@ -14,6 +42,6 @@ Version 0.2.0:  March 22, 2024
 
 
 Version 0.1.0:  August 13, 2023
-------------------------------------------------------------------------
+-------------------------------------------------------------------------------
 
 + Initial public release.
diff --git a/docs/api.rst b/docs/api.rst
@@ -21,19 +21,19 @@ API Reference
     modules/cell_annotation
     modules/cell_embedding
     modules/cell_query
+    modules/cell_search_knn
     modules/interpreter
 
 .. toctree::
     :maxdepth: 2
     :caption: Model Training
     :hidden:
 
-    modules/data_models
+    modules/anndata_data_models
     modules/nn_models
     modules/training_models
     modules/triplet_selector
     modules/zarr_data_models
-    modules/zarr_dataset
 
 .. toctree::
     :maxdepth: 2
@@ -43,6 +43,7 @@ API Reference
     modules/ontologies
     modules/utils
     modules/visualizations
+    modules/zarr_dataset
 
 Core Functionality
 --------------------------------------------------------------------------------
@@ -54,6 +55,7 @@ attribution scoring.
 * :mod:`scimilarity.cell_annotation`
 * :mod:`scimilarity.cell_embedding`
 * :mod:`scimilarity.cell_query`
+* :mod:`scimilarity.cell_search_knn`
 * :mod:`scimilarity.interpreter`
 
 Model Training
@@ -65,7 +67,7 @@ across datasets, specialized variations of metric learning loss functions, and
 procedures for cell ontology aware triplet mining. The following modules include
 support for these training tasks.
 
-* :mod:`scimilarity.data_models`
+* :mod:`scimilarity.anndata_data_models`
 * :mod:`scimilarity.nn_models`
 * :mod:`scimilarity.training_models`
 * :mod:`scimilarity.triplet_selector`

diff --git a/docs/modules/anndata_data_models.rst b/docs/modules/anndata_data_models.rst
@@ -0,0 +1,6 @@
+scimilarity.anndata_data_models
+--------------------------------------------------------------------------------
+
+.. automodule:: scimilarity.anndata_data_models
+    :members:
+    :show-inheritance:
diff --git a/docs/modules/data_models.rst → docs/modules/cell_search_knn.rst b/docs/modules/data_models.rst → docs/modules/cell_search_knn.rst
@@ -1,6 +1,6 @@
-scimilarity.data_models
+scimilarity.cell_search_knn
 --------------------------------------------------------------------------------
 
-.. automodule:: scimilarity.data_models
+.. automodule:: scimilarity.cell_search_knn
     :members:
-    :show-inheritance:
+    :show-inheritance:
diff --git a/docs/notebooks/advanced_tutorial.ipynb b/docs/notebooks/advanced_tutorial.ipynb
@@ -19,7 +19,7 @@
     "\n",
     " 1. SCimilarity trained model. [Download SCimilarity models](https://zenodo.org/record/8240464). Note, this is a large tarball - downloading and uncompressing can take several minutes.\n",
     "\n",
-    " 2. Query data. We will use [Adams et al., 2020](https://www.science.org/doi/10.1126/sciadv.aba1983?url_ver=Z39.88-2003&rfr_id=ori:rid:crossref.org&rfr_dat=cr_pub%20%200pubmed) healthy and IPF lung scRNA-seq data. [Download tutorial data](https://zenodo.org/record/8242083).\n",
+    " 2. Query data. We will use [Adams et al., 2020](https://www.science.org/doi/10.1126/sciadv.aba1983?url_ver=Z39.88-2003&rfr_id=ori:rid:crossref.org&rfr_dat=cr_pub%20%200pubmed) healthy and IPF lung scRNA-seq data. [Download tutorial data](https://zenodo.org/records/13685881).\n",
     "\n",
     "For instructions on how to download the demo data or SCimilarity models please check the cell search tutorial."
    ]
@@ -1012,7 +1012,6 @@
      "shell.execute_reply": "2023-08-08T18:49:40.565153Z",
      "shell.execute_reply.started": "2023-08-08T18:49:31.140190Z"
     },
-    "scrolled": false,
     "tags": []
    },
    "outputs": [
@@ -1146,7 +1145,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.14"
   }
  },
  "nbformat": 4,

diff --git a/docs/notebooks/cell_annotation_tutorial.ipynb b/docs/notebooks/cell_annotation_tutorial.ipynb
diff --git a/docs/notebooks/cell_search_tutorial_1.ipynb b/docs/notebooks/cell_search_tutorial_1.ipynb
diff --git a/docs/notebooks/cell_search_tutorial_2.ipynb b/docs/notebooks/cell_search_tutorial_2.ipynb
diff --git a/docs/notebooks/cell_search_tutorial_3.ipynb b/docs/notebooks/cell_search_tutorial_3.ipynb
diff --git a/src/scimilarity/data_models.py → src/scimilarity/anndata_data_models.py b/src/scimilarity/data_models.py → src/scimilarity/anndata_data_models.py
@@ -8,8 +8,8 @@
 from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
 from typing import Optional
 
-from scimilarity.utils import align_dataset
-from scimilarity.ontologies import import_cell_ontology, get_id_mapper
+from .utils import align_dataset
+from .ontologies import import_cell_ontology, get_id_mapper
 
 
 class scDataset(Dataset):
@@ -166,11 +166,15 @@ def get_sampler_weights(
         else:
             class_sample_count = Counter(labels)
             study_sample_count = Counter(studies)
+            class_sample_count = {
+                x: np.log1p(class_sample_count[x] / 1e4) for x in class_sample_count
+            }
+            study_sample_count = {
+                x: np.log1p(study_sample_count[x] / 1e5) for x in study_sample_count
+            }
             sample_weights = torch.Tensor(
                 [
-                    1.0
-                    / class_sample_count[labels[i]]
-                    / np.log(study_sample_count[studies[i]])
+                    1.0 / class_sample_count[labels[i]] / study_sample_count[studies[i]]
                     for i in range(len(labels))
                 ]
             )

diff --git a/src/scimilarity/cell_annotation.py b/src/scimilarity/cell_annotation.py
@@ -1,6 +1,6 @@
 from typing import Optional, Union, List, Set, Tuple
 
-from scimilarity.cell_search_knn import CellSearchKNN
+from .cell_search_knn import CellSearchKNN
 
 
 class CellAnnotation(CellSearchKNN):
@@ -48,6 +48,8 @@ def __init__(
             filenames = {}
 
         self.annotation_path = os.path.join(model_path, "annotation")
+        os.makedirs(self.annotation_path, exist_ok=True)
+
         self.filenames["knn"] = os.path.join(
             self.annotation_path, filenames.get("knn", "labelled_kNN.bin")
         )
@@ -60,10 +62,13 @@ def __init__(
         self.load_knn_index(self.filenames["knn"])
 
         # get int2label
-        with open(self.filenames["celltype_labels"], "r") as fh:
-            self.idx2label = {i: line.strip() for i, line in enumerate(fh)}
+        self.idx2label = None
+        self.classes = None
+        if self.knn is not None:
+            with open(self.filenames["celltype_labels"], "r") as fh:
+                self.idx2label = {i: line.strip() for i, line in enumerate(fh)}
+            self.classes = set(self.label2int.keys())
 
-        self.classes = set(self.label2int.keys())
         self.safelist = None
         self.blocklist = None
 
@@ -111,8 +116,8 @@ def build_kNN(
         import numpy as np
         import os
         import pandas as pd
-        from scimilarity.utils import align_dataset
-        from scimilarity.zarr_dataset import ZarrDataset
+        from .utils import align_dataset
+        from .zarr_dataset import ZarrDataset
         from tqdm import tqdm
 
         if isinstance(input_data, list):
@@ -259,6 +264,7 @@ def get_predictions_kNN(
         k: int = 50,
         ef: int = 100,
         weighting: bool = False,
+        disable_progress: bool = False,
     ) -> Tuple["numpy.ndarray", "numpy.ndarray", "numpy.ndarray", "pandas.DataFrame"]:
         """Get predictions from kNN search results.
 
@@ -273,6 +279,8 @@ def get_predictions_kNN(
             See https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
         weighting: bool, default: False
             Use distance weighting when getting the consensus prediction.
+        disable_progress: bool, default: False
+            Disable tqdm progress bar
 
         Returns
         -------
@@ -330,7 +338,9 @@ def get_predictions_kNN(
             predictions = pd.Series(nn_idxs.flatten()).map(self.idx2label)
         else:
             predictions = []
-            for nns, d_nns in tqdm(zip(nn_idxs, nn_dists), total=nn_idxs.shape[0]):
+            for nns, d_nns in tqdm(
+                zip(nn_idxs, nn_dists), total=nn_idxs.shape[0], disable=disable_progress
+            ):
                 # count celltype in nearest neighbors (optionally with distance weights)
                 celltype = defaultdict(float)
                 celltype_weighted = defaultdict(float)
@@ -403,7 +413,7 @@ def annotate_dataset(
         >>> data = annotate_dataset(data)
         """
 
-        from scimilarity.utils import align_dataset
+        from .utils import align_dataset
 
         embeddings = self.get_embeddings(align_dataset(data, self.gene_order).X)
         data.obsm["X_scimilarity"] = embeddings

diff --git a/src/scimilarity/cell_embedding.py b/src/scimilarity/cell_embedding.py
@@ -35,7 +35,7 @@ def __init__(
         import json
         import os
         import pandas as pd
-        from scimilarity.nn_models import Encoder
+        from .nn_models import Encoder
 
         self.model_path = model_path
         self.use_gpu = use_gpu