Skip to content

Commit

Permalink
minor changes to sc reference component
Browse files Browse the repository at this point in the history
Co-authored-by: LouisK92 <LouisK92@users.noreply.github.com>
  • Loading branch information
rcannood and LouisK92 committed Aug 28, 2024
1 parent 0ee9657 commit dcd0623
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 27 deletions.
7 changes: 4 additions & 3 deletions scripts/create_test_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,10 @@ viash run src/data_processors/crop_region/config.vsh.yaml -- \
--min_y 10000 \
--max_y 12000

# # generate sc reference
# viash run src/data_loaders/download_allen_brain_atlas/config.vsh.yaml -- \
# --output "$OUT_DIR/dataset_sc.h5ad"
# generate sc reference
# NOTE(review): diff rendering — the three commented lines above are the
# removed (old) version of this step; the command below is the new one.
# VIASH_TEMP pins the atlas download cache to a fixed temp dir, and
# --regions "OLF;TH" limits the fetch to two regions so the test resource
# stays small (the component config defaults to ten regions — confirm).
VIASH_TEMP=/tmp/allen_brain_cell_atlas \
viash run src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml -- \
--output "$OUT_DIR/sc_reference.h5ad" --regions "OLF;TH"

aws s3 sync --profile op \
"resources_test/common/2023_10x_mouse_brain_xenium" \
Expand Down
38 changes: 29 additions & 9 deletions src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,34 @@
# NOTE(review): this span is a unified-diff rendering without +/- markers —
# both the removed line (the live __merge__ below) and the added line (the
# commented-out __merge__) appear; only one is present in the committed file.
# Indentation was stripped by the page scrape; the real YAML is indented.
__merge__: ../../api/comp_data_loader.yaml # TODO: loader for scrnaseq h5ad instead of spatialdata zarr
# __merge__: ../../api/comp_data_loader.yaml # TODO: loader for scrnaseq h5ad instead of spatialdata zarr
name: download_allen_brain_cell_atlas
namespace: data_loaders

argument_groups:
# NOTE(review): two argument groups below are both named "Inputs" — likely
# one should be renamed (e.g. "Arguments"); confirm with the authors.
- name: Inputs
arguments:
# Optional scratch directory for the downloaded atlas cache.
- type: file
name: --temp_dir
required: false
description: Path to a directory where intermediate files are saved.

- name: Inputs
arguments:
# Brain regions to download; semicolon-separated on the CLI (multiple: true).
# The default list mirrors the region list hard-coded in script.py.
- type: string
name: --regions
required: false
description: A list of brain regions to download data for. If not provided, data for all regions will be downloaded.
multiple: true
default:
- CTXsp
- HPF
- HY
- Isocortex-1
- Isocortex-2
- Isocortex-3
- Isocortex-4
- MB
- OLF
- TH
- name: Outputs
arguments:
- type: file
name: --output
required: true
direction: output
example: output.h5ad
description: Path to the output h5ad file.
resources:
- type: python_script
Expand All @@ -22,9 +41,10 @@ engines:
# Python engine with the Allen Institute atlas-access package installed from git.
- type: python
packages:
- git+https://github.com/alleninstitute/abc_atlas_access
# NOTE(review): `- type: native` is an added line in this diff (new engine entry).
- type: native

runners:
- type: executable
- type: nextflow
directives:
# NOTE(review): the duplicated `label:` line below is a diff artifact — the
# old file's last line had no trailing newline, so old and new both render.
label: [highmem, midcpu, midtime]
label: [highmem, midcpu, midtime]
41 changes: 26 additions & 15 deletions src/data_loaders/download_allen_brain_cell_atlas/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,32 @@
# NOTE(review): this span is a unified-diff rendering of script.py without
# +/- markers — removed and added statements are interleaved, so some
# values appear in both their old form (MANIFEST_JSON / version / OUTPUT_DIR,
# the old par entries, the par["temp_dir"]-based TMP_DIR) and their new form
# (VERSION, the regions/output par entries, the meta["temp_dir"]-based
# TMP_DIR). As shown, the par dict is not valid Python (missing comma between
# the two "output" lines) because both diff sides are present.
# env setup:
# pip install -U git+https://github.com/alleninstitute/abc_atlas_access


MANIFEST_JSON = 'releases/20230630/manifest.json' # Defines data version (Allen Brain manifest file)
version = MANIFEST_JSON.split('/')[1]
OUTPUT_DIR = Path(f'resources/datasets/abc_atlas_{version}')

regions = ["CTXsp", "HPF", "HY", "Isocortex-1", "Isocortex-2", "Isocortex-3", "Isocortex-4", "MB", "OLF", "TH"]
# Pinned Allen Brain Cell atlas release; used in the manifest path and in the
# cache directory layout below.
VERSION = "20230630"

## VIASH START
# Fallback par/meta values for running the script outside viash; viash
# replaces everything between VIASH START and VIASH END at build time.
par = {
"temp_dir": str(OUTPUT_DIR / 'tmp'),
"output": str(OUTPUT_DIR / f"abc_atlas_{version}.h5ad")
"regions": ["CTXsp", "HPF", "HY", "Isocortex-1", "Isocortex-2", "Isocortex-3", "Isocortex-4", "MB", "OLF", "TH"],
"output": f"abc_atlas_{VERSION}.h5ad"
}
meta = {
"name": "...",
"config": "...",
"temp_dir": "...",
"cpus": None,
"memory_b": None,
"memory_mb": None,
"memory_gb": None
}

## VIASH END

TMP_DIR = (OUTPUT_DIR / 'tmp') if par["temp_dir"] is None else Path(par["temp_dir"])
regions = par["regions"]

# Cache/download root: viash-provided temp dir when set, /tmp otherwise.
TMP_DIR = Path("/tmp") if meta["temp_dir"] is None else Path(meta["temp_dir"])

abc_cache = AbcProjectCache.from_cache_dir(TMP_DIR)
abc_cache.load_manifest(MANIFEST_JSON)
abc_cache.load_manifest(
f"releases/{VERSION}/manifest.json"
) # saved to TMPDIR / releases/{VERSION}/manifest.json

# From abc_cache.list_data_files('WMB-10Xv2') # TODO: potentially also load other chemistries (currently only 10Xv2)
count_matrix_files = [f'WMB-10Xv2-{region}/raw' for region in regions]
Expand All @@ -41,18 +48,22 @@
# Download data
# Fetch the raw 10Xv2 count matrix for each requested region into the cache.
for file in count_matrix_files:
abc_cache.get_data_path(directory='WMB-10Xv2', file_name=file)

# metadata_files is defined in a part of the diff not shown here — confirm.
for file in metadata_files:
abc_cache.get_metadata_path(directory='WMB-10X', file_name=file)

# Read and concatenate the data
# NOTE(review): diff rendering — both the old single-line pd.read_csv
# argument (using `version`) and the new multi-line form (using `VERSION`)
# appear below; only the new form is live in the committed file. Same for
# the two ad.read_h5ad calls further down.
obs = pd.read_csv(
TMP_DIR / f"metadata/WMB-10X/{version}/views/cell_metadata_with_cluster_annotation.csv", index_col=0
TMP_DIR
/ f"metadata/WMB-10X/{VERSION}/views/cell_metadata_with_cluster_annotation.csv",
index_col=0,
)

adatas = []
for region in regions:
adata = ad.read_h5ad(TMP_DIR / f"expression_matrices/WMB-10Xv2/{version}/WMB-10Xv2-{region}-raw.h5ad")
adata = ad.read_h5ad(
TMP_DIR / f"expression_matrices/WMB-10Xv2/{VERSION}/WMB-10Xv2-{region}-raw.h5ad"
)
# Keep only cells present in the annotated metadata, tag with region, collect.
adata = adata[adata.obs_names.isin(obs.index)]
adata.obs["region"] = region
adatas.append(adata)
Expand Down

0 comments on commit dcd0623

Please sign in to comment.