sample right after download

openproblems-bio · Sep 25, 2024 · cebecbd · cebecbd
1 parent f9cb326
commit cebecbd
Show file tree

Hide file tree

Showing 6 changed files with 105 additions and 84 deletions.
diff --git a/scripts/create_resources/process_allen_brain_cell_atlas_brain.sh b/scripts/create_resources/process_allen_brain_cell_atlas_brain.sh
@@ -32,6 +32,11 @@ param_list:
     dataset_description: See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset.
     dataset_organism: mus_musculus
 
+sample_n_obs: 500000
+sample_obs_weight: subclass
+sample_transform: log
+sample_seed: 42
+
 output_dataset: "\$id/dataset.h5ad"
 output_meta: "\$id/dataset_meta.yaml"
 output_state: "\$id/state.yaml"

diff --git a/scripts/create_resources/process_vizgen_merscope.sh b/scripts/create_resources/process_vizgen_merscope.sh
@@ -16,7 +16,7 @@ param_list:
   - id: "vizgen_merscope/2022_vizgen_human_breast_cancer_merfish/rep1"
     gcloud_bucket: "vz-ffpe-showcase"
     dataset_bucket_name: "HumanBreastCancerPatient1"
-    dataset_name: "2022 Vizgen Human Breast Cancer MERFISH Patient1"
+    dataset_name: "Vizgen Human Breast Cancer MERFISH Patient1"
     dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
     dataset_summary: "Human Breast Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
     dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."
@@ -26,7 +26,7 @@ param_list:
   - id: "vizgen_merscope/2022_vizgen_human_liver_cancer_merfish/rep1"
     gcloud_bucket: "vz-ffpe-showcase"
     dataset_bucket_name: "HumanLiverCancerPatient1"
-    dataset_name: "2022 Vizgen Human Liver Cancer MERFISH Patient1"
+    dataset_name: "Vizgen Human Liver Cancer MERFISH Patient1"
     dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
     dataset_summary: "Human Liver Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
     dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."
@@ -36,7 +36,7 @@ param_list:
   - id: "vizgen_merscope/2022_vizgen_human_liver_cancer_merfish/rep2"
     gcloud_bucket: "vz-ffpe-showcase"
     dataset_bucket_name: "HumanLiverCancerPatient2"
-    dataset_name: "2022 Vizgen Human Liver Cancer MERFISH Patient2"
+    dataset_name: "Vizgen Human Liver Cancer MERFISH Patient2"
     dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
     dataset_summary: "Human Liver Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
     dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."
@@ -46,7 +46,7 @@ param_list:
   - id: "vizgen_merscope/2022_vizgen_human_lung_cancer_merfish/rep1"
     gcloud_bucket: "vz-ffpe-showcase"
     dataset_bucket_name: "HumanLungCancerPatient1"
-    dataset_name: "2022 Vizgen Human Lung Cancer MERFISH Patient1"
+    dataset_name: "Vizgen Human Lung Cancer MERFISH Patient1"
     dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
     dataset_summary: "Human Lung Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
     dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."
@@ -56,7 +56,7 @@ param_list:
   - id: "vizgen_merscope/2022_vizgen_human_lung_cancer_merfish/rep2"
     gcloud_bucket: "vz-ffpe-showcase"
     dataset_bucket_name: "HumanLungCancerPatient2"
-    dataset_name: "2022 Vizgen Human Lung Cancer MERFISH Patient2"
+    dataset_name: "Vizgen Human Lung Cancer MERFISH Patient2"
     dataset_url: "https://info.vizgen.com/ffpe-showcase?submissionGuid=a93dbab5-c128-4269-afe3-82ea2bf9cdaf"
     dataset_summary: "Human Lung Cancer data from the MERSCOPE FFPE Human Immuno-Oncology Data Release."
     dataset_description: "The MERSCOPE FFPE Human Immuno-Oncology Data Release was generated using the MERSCOPE FFPE Sample Prep Solution and the MERSCOPE Immuno-Oncology Predesigned Panel. This data release includes 16 MERFISH datasets generated by the MERSCOPE Platform from 8 different human tumor types, each measuring 500 genes representing approximately 4 billion transcripts and 9 million cells cumulatively."

diff --git a/src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml b/src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml
@@ -25,16 +25,32 @@ argument_groups:
           - MB
           - OLF
           - TH
-  - name: Caching
+  - name: Sampling
     arguments:
+      - type: integer
+        name: --sample_n_obs
+        required: false
+        description: The number of cells to sample.
       - type: string
-        name: --cache_dir
+        name: --sample_obs_weight
         required: false
-        description: Directory to cache the downloaded data.
+        description: The column to use for weighting the sampling of cells.
+        choices:
+          - donor_label
+          - anatomical_division_label
+          - class
+          - subclass
       - type: string
-        name: --output_cache_dir
+        name: --sample_transform
         required: false
-        description: Output directory of the cached data.
+        description: The transformation to apply to the column used for weighting the sampling of cells.
+        choices:
+          - log
+          - sqrt
+      - type: integer
+        name: --sample_seed
+        required: false
+        description: A seed for the subsampling.
   - name: Metadata
     arguments:
       - type: string

diff --git a/src/datasets/loaders/allen_brain_cell_atlas/script.py b/src/datasets/loaders/allen_brain_cell_atlas/script.py
@@ -2,23 +2,30 @@
 
 from pathlib import Path
 import pandas as pd
+import numpy as np
 from collections import defaultdict
 import scipy as sp
-import scanpy as sc
 import anndata as ad
 from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache
 
 ## VIASH START
 par = {
     "abca_version": "20230630",
     "regions": ["CTXsp", "HPF", "HY", "Isocortex-1", "Isocortex-2", "Isocortex-3", "Isocortex-4", "MB", "OLF", "TF"],
+    "sample_n_obs": 500000,
+    "sample_obs_weight": "subclass",
+    "sample_transform": "sqrt",
+    "sample_seed": None,
     "output": "tmp_dataset.h5ad",
 }
 meta = {
     "temp_dir": "/tmp/allen_brain_cell_atlas",
 }
 ## VIASH END
 
+if par["sample_seed"] is not None:  # 0 is a valid seed, so test against None
+    np.random.seed(par["sample_seed"])
+
 # helper variables
 VERSION = par["abca_version"]
 REGIONS = par["regions"]
@@ -35,8 +42,8 @@
 metadata_files = [
     "cell_metadata_with_cluster_annotation",
 ]
-for file in metadata_files:
-    abc_cache.get_metadata_path(directory="WMB-10X", file_name=file)
+for file_name in metadata_files:
+    abc_cache.get_metadata_path(directory="WMB-10X", file_name=file_name)
 
 print("Reading obs", flush=True)
 obs = pd.read_csv(
@@ -51,35 +58,71 @@
     )
 )
 
-print("Downloading expression matrices", flush=True)
-# From abc_cache.list_data_files("WMB-10Xv2") # TODO: potentially also load other chemistries (currently only 10Xv2)
-for region in REGIONS:
-    print(f"Downloading h5ad file for region {region}", flush=True)
-    file = f"WMB-10Xv2-{region}/raw"
-    abc_cache.get_data_path(directory="WMB-10Xv2", file_name=file)
+print("Filtering obs based on regions", flush=True)
+# NOTE(review): REGIONS also names the expression-matrix chunks read below
+# (e.g. "Isocortex-1"); confirm these values match anatomical_division_label.
+obs = obs[obs["anatomical_division_label"].isin(REGIONS)]
+
+if par["sample_n_obs"]:
+    print("Filtering obs based on n_obs", flush=True)
+    col = par["sample_obs_weight"]
+    # DataFrame.sample raises when n exceeds the population (no replacement),
+    # so never request more cells than remain after the region filter.
+    n_sample = min(par["sample_n_obs"], len(obs))
+
+    if col:
+        # per-cell weight = (transformed) size of the group the cell belongs to
+        # NOTE(review): this favours cells from large groups, and "log" gives
+        # single-cell groups a weight of 0 -- confirm this is intended.
+        weights = obs.groupby(col).size()
+
+        if par["sample_transform"] == "sqrt":
+            weights = np.sqrt(weights)
+        elif par["sample_transform"] == "log":
+            weights = np.log(weights)
 
-print("Reading expression matrices", flush=True)
+        obs = obs.sample(n=n_sample, weights=obs[col].map(weights))
+    else:
+        obs = obs.sample(n=n_sample)
+
+
+# From abc_cache.list_data_files("WMB-10Xv2")
+# TODO: potentially also load other chemistries (currently only 10Xv2)
+
+print("Downloading and reading expression matrices", flush=True)
 adatas = []
 for region in REGIONS:
-    print(f"Reading h5ad for region {region}", flush=True)
-    adata = ad.read_h5ad(
-        TMP_DIR / f"expression_matrices/WMB-10Xv2/{VERSION}/WMB-10Xv2-{region}-raw.h5ad"
-    )
-    sc.pp.filter_cells(adata, min_genes=5)
-    sc.pp.filter_cells(adata, min_counts=50)
-
-    adata = adata[adata.obs_names.isin(obs.index)]
-    adata.obs["region"] = region
-    counts = adata.X
-    del adata.X
-
-    # make sure counts is sparse
-    if not isinstance(counts, sp.sparse.csr_matrix):
-        counts = sp.sparse.csr_matrix(counts)
-    adata.layers["counts"] = counts
-
-    # add anndata to list
-    adatas.append(adata)
+    try:
+        print(f"Reading h5ad for region {region}", flush=True)
+
+        file_name = f"WMB-10Xv2-{region}/raw"
+        adata_path = abc_cache.get_data_path(directory="WMB-10Xv2", file_name=file_name)
+
+        # open on disk so that only the sampled cells are loaded into memory
+        backed = ad.read_h5ad(str(adata_path), backed=True)
+        try:
+            # keep only the cells retained in obs and materialise that subset
+            adata = backed[backed.obs_names.isin(obs.index)].to_memory()
+        finally:
+            backed.file.close()
+
+        adata.obs["region"] = region
+        counts = adata.X
+        del adata.X
+
+        # make sure counts is sparse
+        if not isinstance(counts, sp.sparse.csr_matrix):
+            counts = sp.sparse.csr_matrix(counts)
+        adata.layers["counts"] = counts
+
+        # add anndata to list
+        adatas.append(adata)
+    except Exception as e:
+        # best effort: a region that cannot be read is reported and skipped
+        print(f"Error reading {region}: {e}", flush=True)
+
+if not adatas:
+    raise RuntimeError("No expression matrices could be read for the requested regions")
 
 print("Concatenating data", flush=True)
 adata = ad.concat(adatas, merge="first")

diff --git a/src/datasets/workflows/process_allen_brain_cell_atlas/config.vsh.yaml b/src/datasets/workflows/process_allen_brain_cell_atlas/config.vsh.yaml
@@ -57,36 +57,27 @@ argument_groups:
         required: false
   - name: Sampling options
     arguments:
-      - name: "--do_subsample"
-        type: boolean
-        default: false
-        description: "Whether or not to subsample the dataset"
-      - name: "--n_obs"
-        type: integer
-        description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed.
-        default: 500
-      - name: "--n_vars"
-        type: integer
-        description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed.
-        default: 500
-      - name: "--keep_features"
-        type: string
-        multiple: true
-        description: A list of genes to keep.
-      - name: "--keep_cell_type_categories"
-        type: "string"
-        multiple: true
-        description: "Categories indexes to be selected"
+      - type: integer
+        name: --sample_n_obs
         required: false
-      - name: "--keep_batch_categories"
-        type: "string"
-        multiple: true
-        description: "Categories indexes to be selected"
+        description: The number of cells to sample.
+      - type: string
+        name: --sample_obs_weight
+        required: false
+        description: The column to use for weighting the sampling of cells.
+        choices:
+          - donor_label
+          - anatomical_division_label
+          - class
+          - subclass
+      - type: string
+        name: --sample_transform
         required: false
-      - name: "--even"
-        type: "boolean_true"
-        description: Subsample evenly from different batches
-      - name: "--seed"
+        description: The transformation to apply to the column used for weighting the sampling of cells.
+        choices:
+          - log
+          - sqrt
+      - name: "--sample_seed"
         type: "integer"
         description: "A seed for the subsampling."
         example: 123

diff --git a/src/datasets/workflows/process_allen_brain_cell_atlas/main.nf b/src/datasets/workflows/process_allen_brain_cell_atlas/main.nf
@@ -31,29 +31,16 @@ workflow run_wf {
         "dataset_summary",
         "dataset_description",
         "dataset_organism",
+        "sample_n_obs",
+        "sample_obs_weight",
+        "sample_transform",
+        "sample_seed"
       ],
       toState: [
         "output_raw": "output"
       ]
     )
 
-    // subsample if so desired
-    | subsample.run(
-      runIf: { id, state -> state.do_subsample },
-      fromState: [
-        "input": "output_raw",
-        "n_obs": "n_obs",
-        "n_vars": "n_vars",
-        "keep_features": "keep_features",
-        "keep_cell_type_categories": "keep_cell_type_categories",
-        "keep_batch_categories": "keep_batch_categories",
-        "even": "even",
-        "seed": "seed"
-      ],
-      args: [output_mod2: null],
-      toState: ["output_raw": "output"]
-    )
-
     | log_cp.run(
       key: "log_cp10k",
       fromState: [