Skip to content

Commit

Permalink
remove cache during processing
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Sep 26, 2024
1 parent 28d7183 commit 2b09255
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ sample_n_obs: 500000
sample_obs_weight: subclass
sample_transform: log
sample_seed: 42
keep_files: false # disk isn't large enough
output_dataset: "\$id/dataset.h5ad"
output_meta: "\$id/dataset_meta.yaml"
Expand Down
6 changes: 6 additions & 0 deletions src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ argument_groups:
type: "integer"
description: "A seed for the subsampling."
example: 123
- name: Caching settings
arguments:
- type: boolean
name: --keep_files
required: true
description: Whether to keep the downloaded files after processing. Set to `false` to delete them as they are consumed (useful when disk space is limited).
- name: Metadata
arguments:
- type: string
Expand Down
9 changes: 8 additions & 1 deletion src/datasets/loaders/allen_brain_cell_atlas/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"sample_obs_weight": "subclass",
"sample_transform": "sqrt",
"sample_seed": None,
"keep_files": True,
"output": "tmp_dataset.h5ad",
}
meta = {
Expand All @@ -39,8 +40,9 @@
)

print("Reading obs", flush=True)
obs_path = abc_cache.get_metadata_path(directory="WMB-10X", file_name="cell_metadata_with_cluster_annotation")
obs = pd.read_csv(
abc_cache.get_metadata_path(directory="WMB-10X", file_name="cell_metadata_with_cluster_annotation"),
obs_path,
index_col=0,
dtype=defaultdict(
lambda: "category",
Expand All @@ -50,6 +52,8 @@
region_of_interest_order="int"
)
)
if not par["keep_files"]:
obs_path.unlink()

print("Filtering obs based on regions", flush=True)
obs = obs[obs["anatomical_division_label"].isin(REGIONS)]
Expand Down Expand Up @@ -84,6 +88,9 @@
print(f"Reading h5ad for region {region}", flush=True)
adata = ad.read_h5ad(str(adata_path))

if not par["keep_files"]:
adata_path.unlink()

# filter cells
adata = adata[adata.obs_names.isin(obs.index)].copy()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ argument_groups:
- MB
- OLF
- TH
- name: Caching settings
arguments:
- type: boolean
name: --keep_files
required: true
description: Whether to keep the downloaded files after processing. Set to `false` to delete them as they are consumed (useful when disk space is limited).
- name: Metadata
arguments:
- type: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ workflow run_wf {
fromState: [
"abca_version",
"regions",
"keep_files",
"dataset_id",
"dataset_name",
"dataset_url",
Expand Down

0 comments on commit 2b09255

Please sign in to comment.