Skip to content

Commit

Permalink
remove cache during processing
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Sep 26, 2024
1 parent 28d7183 commit 2b09255
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ sample_n_obs: 500000
sample_obs_weight: subclass
sample_transform: log
sample_seed: 42
keep_files: false # disk isn't large enough
output_dataset: "\$id/dataset.h5ad"
output_meta: "\$id/dataset_meta.yaml"
Expand Down
6 changes: 6 additions & 0 deletions src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ argument_groups:
type: "integer"
description: "A seed for the subsampling."
example: 123
- name: Caching settings
arguments:
- type: boolean
name: --keep_files
required: true
description: Whether to keep the downloaded files after processing. Set to `false` to delete them as they are consumed (useful when disk space is limited).
- name: Metadata
arguments:
- type: string
Expand Down
9 changes: 8 additions & 1 deletion src/datasets/loaders/allen_brain_cell_atlas/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"sample_obs_weight": "subclass",
"sample_transform": "sqrt",
"sample_seed": None,
"keep_files": True,
"output": "tmp_dataset.h5ad",
}
meta = {
Expand All @@ -39,8 +40,9 @@
)

print("Reading obs", flush=True)
obs_path = abc_cache.get_metadata_path(directory="WMB-10X", file_name="cell_metadata_with_cluster_annotation")
obs = pd.read_csv(
abc_cache.get_metadata_path(directory="WMB-10X", file_name="cell_metadata_with_cluster_annotation"),
obs_path,
index_col=0,
dtype=defaultdict(
lambda: "category",
Expand All @@ -50,6 +52,8 @@
region_of_interest_order="int"
)
)
if not par["keep_files"]:
obs_path.unlink()

print("Filtering obs based on regions", flush=True)
obs = obs[obs["anatomical_division_label"].isin(REGIONS)]
Expand Down Expand Up @@ -84,6 +88,9 @@
print(f"Reading h5ad for region {region}", flush=True)
adata = ad.read_h5ad(str(adata_path))

if not par["keep_files"]:
adata_path.unlink()

# filter cells
adata = adata[adata.obs_names.isin(obs.index)].copy()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ argument_groups:
- MB
- OLF
- TH
- name: Caching settings
arguments:
- type: boolean
name: --keep_files
required: true
description: Whether to keep the downloaded files after processing. Set to `false` to delete them as they are consumed (useful when disk space is limited).
- name: Metadata
arguments:
- type: string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ workflow run_wf {
fromState: [
"abca_version",
"regions",
"keep_files",
"dataset_id",
"dataset_name",
"dataset_url",
Expand Down

0 comments on commit 2b09255

Please sign in to comment.