Skip to content

Commit

Permalink
minor changes to sc reference component
Browse files Browse the repository at this point in the history
Co-authored-by: LouisK92 <LouisK92@users.noreply.github.com>
  • Loading branch information
rcannood and LouisK92 committed Aug 28, 2024
1 parent 0ee9657 commit dcd0623
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 27 deletions.
7 changes: 4 additions & 3 deletions scripts/create_test_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,10 @@ viash run src/data_processors/crop_region/config.vsh.yaml -- \
--min_y 10000 \
--max_y 12000

# # generate sc reference
# viash run src/data_loaders/download_allen_brain_atlas/config.vsh.yaml -- \
# --output "$OUT_DIR/dataset_sc.h5ad"
# generate sc reference
# NOTE(review): diff rendering — the three commented lines above are the
# removed (old) version of this step; the command below is the new one.
# VIASH_TEMP pins the atlas download cache to a fixed temp dir, and
# --regions "OLF;TH" limits the fetch to two regions so the test resource
# stays small (the component config defaults to ten regions — confirm).
VIASH_TEMP=/tmp/allen_brain_cell_atlas \
viash run src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml -- \
--output "$OUT_DIR/sc_reference.h5ad" --regions "OLF;TH"

aws s3 sync --profile op \
"resources_test/common/2023_10x_mouse_brain_xenium" \
Expand Down
38 changes: 29 additions & 9 deletions src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,34 @@
# NOTE(review): this span is a unified-diff rendering without +/- markers —
# both the removed line (the live __merge__ below) and the added line (the
# commented-out __merge__) appear; only one is present in the committed file.
# Indentation was stripped by the page scrape; the real YAML is indented.
__merge__: ../../api/comp_data_loader.yaml # TODO: loader for scrnaseq h5ad instead of spatialdata zarr
# __merge__: ../../api/comp_data_loader.yaml # TODO: loader for scrnaseq h5ad instead of spatialdata zarr
name: download_allen_brain_cell_atlas
namespace: data_loaders

argument_groups:
# NOTE(review): two argument groups below are both named "Inputs" — likely
# one should be renamed (e.g. "Arguments"); confirm with the authors.
- name: Inputs
arguments:
# Optional scratch directory for the downloaded atlas cache.
- type: file
name: --temp_dir
required: false
description: Path to a directory where intermediate files are saved.

- name: Inputs
arguments:
# Brain regions to download; semicolon-separated on the CLI (multiple: true).
# The default list mirrors the region list hard-coded in script.py.
- type: string
name: --regions
required: false
description: A list of brain regions to download data for. If not provided, data for all regions will be downloaded.
multiple: true
default:
- CTXsp
- HPF
- HY
- Isocortex-1
- Isocortex-2
- Isocortex-3
- Isocortex-4
- MB
- OLF
- TH
- name: Outputs
arguments:
- type: file
name: --output
required: true
direction: output
example: output.h5ad
description: Path to the output h5ad file.
resources:
- type: python_script
Expand All @@ -22,9 +41,10 @@ engines:
# Python engine with the Allen Institute atlas-access package installed from git.
- type: python
packages:
- git+https://github.com/alleninstitute/abc_atlas_access
# NOTE(review): `- type: native` is an added line in this diff (new engine entry).
- type: native

runners:
- type: executable
- type: nextflow
directives:
# NOTE(review): the duplicated `label:` line below is a diff artifact — the
# old file's last line had no trailing newline, so old and new both render.
label: [highmem, midcpu, midtime]
label: [highmem, midcpu, midtime]
41 changes: 26 additions & 15 deletions src/data_loaders/download_allen_brain_cell_atlas/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,32 @@
# NOTE(review): this span is a unified-diff rendering of script.py without
# +/- markers — removed and added statements are interleaved, so some
# values appear in both their old form (MANIFEST_JSON / version / OUTPUT_DIR,
# the old par entries, the par["temp_dir"]-based TMP_DIR) and their new form
# (VERSION, the regions/output par entries, the meta["temp_dir"]-based
# TMP_DIR). As shown, the par dict is not valid Python (missing comma between
# the two "output" lines) because both diff sides are present.
# env setup:
# pip install -U git+https://github.com/alleninstitute/abc_atlas_access


MANIFEST_JSON = 'releases/20230630/manifest.json' # Defines data version (Allen Brain manifest file)
version = MANIFEST_JSON.split('/')[1]
OUTPUT_DIR = Path(f'resources/datasets/abc_atlas_{version}')

regions = ["CTXsp", "HPF", "HY", "Isocortex-1", "Isocortex-2", "Isocortex-3", "Isocortex-4", "MB", "OLF", "TH"]
# Pinned Allen Brain Cell atlas release; used in the manifest path and in the
# cache directory layout below.
VERSION = "20230630"

## VIASH START
# Fallback par/meta values for running the script outside viash; viash
# replaces everything between VIASH START and VIASH END at build time.
par = {
"temp_dir": str(OUTPUT_DIR / 'tmp'),
"output": str(OUTPUT_DIR / f"abc_atlas_{version}.h5ad")
"regions": ["CTXsp", "HPF", "HY", "Isocortex-1", "Isocortex-2", "Isocortex-3", "Isocortex-4", "MB", "OLF", "TH"],
"output": f"abc_atlas_{VERSION}.h5ad"
}
meta = {
"name": "...",
"config": "...",
"temp_dir": "...",
"cpus": None,
"memory_b": None,
"memory_mb": None,
"memory_gb": None
}

## VIASH END

TMP_DIR = (OUTPUT_DIR / 'tmp') if par["temp_dir"] is None else Path(par["temp_dir"])
regions = par["regions"]

# Cache/download root: viash-provided temp dir when set, /tmp otherwise.
TMP_DIR = Path("/tmp") if meta["temp_dir"] is None else Path(meta["temp_dir"])

abc_cache = AbcProjectCache.from_cache_dir(TMP_DIR)
abc_cache.load_manifest(MANIFEST_JSON)
abc_cache.load_manifest(
f"releases/{VERSION}/manifest.json"
) # saved to TMPDIR / releases/{VERSION}/manifest.json

# From abc_cache.list_data_files('WMB-10Xv2') # TODO: potentially also load other chemistries (currently only 10Xv2)
count_matrix_files = [f'WMB-10Xv2-{region}/raw' for region in regions]
Expand All @@ -41,18 +48,22 @@
# Download data
# Fetch the raw 10Xv2 count matrix for each requested region into the cache.
for file in count_matrix_files:
abc_cache.get_data_path(directory='WMB-10Xv2', file_name=file)

# metadata_files is defined in a part of the diff not shown here — confirm.
for file in metadata_files:
abc_cache.get_metadata_path(directory='WMB-10X', file_name=file)

# Read and concatenate the data
# NOTE(review): diff rendering — both the old single-line pd.read_csv
# argument (using `version`) and the new multi-line form (using `VERSION`)
# appear below; only the new form is live in the committed file. Same for
# the two ad.read_h5ad calls further down.
obs = pd.read_csv(
TMP_DIR / f"metadata/WMB-10X/{version}/views/cell_metadata_with_cluster_annotation.csv", index_col=0
TMP_DIR
/ f"metadata/WMB-10X/{VERSION}/views/cell_metadata_with_cluster_annotation.csv",
index_col=0,
)

adatas = []
for region in regions:
adata = ad.read_h5ad(TMP_DIR / f"expression_matrices/WMB-10Xv2/{version}/WMB-10Xv2-{region}-raw.h5ad")
adata = ad.read_h5ad(
TMP_DIR / f"expression_matrices/WMB-10Xv2/{VERSION}/WMB-10Xv2-{region}-raw.h5ad"
)
# Keep only cells present in the annotated metadata, tag with region, collect.
adata = adata[adata.obs_names.isin(obs.index)]
adata.obs["region"] = region
adatas.append(adata)
Expand Down

0 comments on commit dcd0623

Please sign in to comment.