Skip to content

Commit cb472a6

Browse files
committed
2 parents a7d7abd + 55efb39 commit cb472a6

File tree

6 files changed

+279
-24
lines changed

6 files changed

+279
-24
lines changed

_viash.yaml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,12 @@ config_mods: |
8181
.runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
8282
8383
repositories:
84-
- name: openproblems-v2
84+
- name: openproblems
8585
type: github
86-
repo: openproblems-bio/openproblems-v2
87-
tag: main_build
86+
repo: openproblems-bio/openproblems
87+
tag: build/main
88+
- name: core
89+
type: github
90+
repo: openproblems-bio/core
91+
tag: build/main
92+
path: viash/core

scripts/test_resources/2023_yao_mouse_brain_scrnaseq_10xv2.sh

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,30 @@ cd "$REPO_ROOT"
88

99
set -e
1010

11-
DATASET_ID="2023_yao_mouse_brain_scrnaseq_10xv2"
12-
TMP_DIR="temp/datasets/$DATASET_ID"
13-
OUT_DIR="resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2"
14-
15-
# generate sc reference
16-
VIASH_TEMP=/tmp/allen_brain_cell_atlas \
17-
viash run src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml \
18-
--keep true -- \
19-
--regions "OLF;TH" \
20-
--output "$TMP_DIR/tmp_dataset.h5ad" \
21-
--dataset_id "$DATASET_ID" \
22-
--dataset_name "ABCA Mouse Brain scRNAseq" \
23-
--dataset_url "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717" \
24-
--dataset_reference "10.1038/s41586-023-06812-z" \
25-
--dataset_summary "A high-resolution scRNAseq atlas of cell types in the whole mouse brain" \
26-
--dataset_description "See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset." \
27-
--dataset_organism "mus_musculus"
28-
29-
viash run src/data_processors/subset_reference/config.vsh.yaml -- \
30-
--input "$TMP_DIR/tmp_dataset.h5ad" \
31-
--output "$OUT_DIR/dataset.h5ad"
11+
cat > /tmp/params.yaml << HERE
12+
param_list:
13+
- id: 2023_yao_mouse_brain_scrnaseq_10xv2
14+
regions:
15+
- OLF
16+
- TH
17+
dataset_name: ABCA Mouse Brain scRNAseq
18+
dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717
19+
dataset_reference: 10.1038/s41586-023-06812-z
20+
dataset_summary: A high-resolution scRNAseq atlas of cell types in the whole mouse brain
21+
dataset_description: See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset.
22+
dataset_organism: mus_musculus
23+
24+
output: "\$id/dataset.h5ad"
25+
output_state: "\$id/state.yaml"
26+
27+
publish_dir: resources_test/common
28+
HERE
29+
30+
nextflow run . \
31+
-main-script target/nextflow/datasets/workflows/process_allen_brain_cell_atlas/main.nf \
32+
-profile docker \
33+
-resume \
34+
-params-file /tmp/params.yaml
3235

3336
aws s3 sync --profile op \
3437
"resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2" \

src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,16 @@ argument_groups:
2525
- MB
2626
- OLF
2727
- TH
28+
- name: Caching
29+
arguments:
30+
- type: string
31+
name: --cache_dir
32+
required: false
33+
description: Directory to cache the downloaded data.
34+
- type: string
35+
name: --output_cache_dir
36+
required: false
37+
description: Output directory of the cached data.
2838
- name: Metadata
2939
arguments:
3040
- type: string
src/datasets/workflows/process_allen_brain_cell_atlas/config.vsh.yaml

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# Viash config for the workflow that downloads an Allen Brain Cell Atlas
# dataset and runs the common single-cell processing steps on it.
# The workflow itself lives in main.nf (entrypoint `run_wf`).
name: process_allen_brain_cell_atlas
namespace: datasets/workflows

argument_groups:
  # Selection of what to download.
  - name: Inputs
    arguments:
      - name: --abca_version
        type: string
        description: The version of the Allen Brain Cell Atlas to download data from.
        required: false
        default: "20230630"
      - name: --regions
        type: string
        description: A list of brain regions to download data for. If not provided, data for all regions will be downloaded.
        required: false
        multiple: true
        default:
          - CTXsp
          - HPF
          - HY
          - Isocortex-1
          - Isocortex-2
          - Isocortex-3
          - Isocortex-4
          - MB
          - OLF
          - TH

  # Descriptive metadata about the dataset.
  - name: Metadata
    arguments:
      - name: --id
        type: string
        description: A unique identifier for the dataset
        required: true
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: true
      - name: --dataset_url
        type: string
        description: Link to the original source of the dataset.
        required: false
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false

  # Optional subsampling; disabled unless --do_subsample is set.
  - name: Sampling options
    arguments:
      - name: --do_subsample
        type: boolean
        description: Whether or not to subsample the dataset
        default: false
      - name: --n_obs
        type: integer
        description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed.
        default: 500
      - name: --n_vars
        type: integer
        description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed.
        default: 500
      - name: --keep_features
        type: string
        description: A list of genes to keep.
        multiple: true
      - name: --keep_cell_type_categories
        type: string
        description: Categories indexes to be selected
        required: false
        multiple: true
      - name: --keep_batch_categories
        type: string
        description: Categories indexes to be selected
        required: false
        multiple: true
      - name: --even
        type: boolean_true
        description: Subsample evenly from different batches
      - name: --seed
        type: integer
        description: A seed for the subsampling.
        example: 123

  # The single output of the workflow; its file schema is merged in from the API definition.
  - name: Outputs
    arguments:
      - name: --output
        __merge__: /src/api/file_common_singlecell.yaml
        direction: output
        required: true

resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf
  - path: /common/nextflow_helpers/helper.nf

# Components called from main.nf. Entries without `repository` are local to this repo.
dependencies:
  - name: datasets/loaders/allen_brain_cell_atlas
  - name: datasets/processors/subsample
    repository: openproblems
  - name: datasets/normalization/log_cp
    repository: openproblems
  - name: datasets/processors/pca
    repository: openproblems
  - name: datasets/processors/hvg
    repository: openproblems
  - name: datasets/processors/knn
    repository: openproblems
  - name: h5ad/extract_uns_metadata
    repository: core

runners:
  - type: nextflow
src/datasets/workflows/process_allen_brain_cell_atlas/main.nf

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
include { findArgumentSchema } from "${meta.resources_dir}/helper.nf"
2+
3+
// Alternative entry point: pipes the result of findStates(params, meta.config)
// into this component's own workflow (meta.workflow), run with the `auto`
// option `publish` set to "state".
// NOTE(review): findStates is not among the names included from helper.nf in
// this file (only findArgumentSchema is) -- presumably it is supplied by the
// generated Viash wrapper; confirm.
workflow auto {
4+
findStates(params, meta.config)
5+
| meta.workflow.run(
6+
auto: [publish: "state"]
7+
)
8+
}
9+
10+
// Processing pipeline for one Allen Brain Cell Atlas dataset.
//
// Takes a channel of [id, state] tuples and, for each tuple:
//   1. copies the id into the state as `dataset_id`;
//   2. runs the allen_brain_cell_atlas loader          -> state.output_raw;
//   3. runs subsample when state.do_subsample is set   -> overwrites state.output_raw;
//   4. runs log_cp (key "log_cp10k"), hvg, pca and knn in sequence, each
//      reading the previous step's output;
//   5. runs extract_uns_metadata on the knn output     -> state.output_meta;
//   6. reduces the state to a single `output` key that points at the
//      normalized file.
//
// Emits the resulting [id, state] channel.
workflow run_wf {
  take:
  input_ch

  main:
  output_ch = input_ch

    // The loader takes the dataset identifier as `dataset_id`, while the
    // workflow receives it as the channel id, so copy it into the state.
    | map { id, state ->
      def new_state = state + [dataset_id: id]
      [id, new_state]
    }

    | allen_brain_cell_atlas.run(
      fromState: [
        "abca_version",
        "regions",
        "dataset_id",
        "dataset_name",
        "dataset_url",
        "dataset_reference",
        "dataset_summary",
        "dataset_description",
        "dataset_organism",
      ],
      toState: [
        "output_raw": "output"
      ]
    )

    // Subsample if so desired; the subsampled file replaces output_raw so the
    // downstream steps do not need to know whether subsampling happened.
    | subsample.run(
      runIf: { id, state -> state.do_subsample },
      fromState: [
        "input": "output_raw",
        "n_obs": "n_obs",
        "n_vars": "n_vars",
        "keep_features": "keep_features",
        "keep_cell_type_categories": "keep_cell_type_categories",
        "keep_batch_categories": "keep_batch_categories",
        "even": "even",
        "seed": "seed"
      ],
      // NOTE(review): presumably disables an optional second-modality output
      // of the subsample component -- confirm against that component's config.
      args: [output_mod2: null],
      toState: ["output_raw": "output"]
    )

    | log_cp.run(
      key: "log_cp10k",
      fromState: [
        "input": "output_raw"
      ],
      args: [
        "normalization_id": "log_cp10k",
        "n_cp": 10000
      ],
      toState: [
        "output_normalized": "output"
      ]
    )

    | hvg.run(
      fromState: ["input": "output_normalized"],
      toState: ["output_hvg": "output"]
    )

    | pca.run(
      fromState: ["input": "output_hvg"],
      toState: ["output_pca": "output"]
    )

    | knn.run(
      fromState: ["input": "output_pca"],
      toState: ["output_knn": "output"]
    )

    // Add a synonym: the fully processed (knn) file is the dataset from which
    // the metadata is extracted below.
    | map { id, state ->
      [id, state + [output_dataset: state.output_knn]]
    }

    | extract_uns_metadata.run(
      fromState: { id, state ->
        // Look up the schema of the `--output` argument. The workflow config
        // declares `--output` as its only output argument; there is no
        // `--output_dataset` argument, so looking up "output_dataset" would
        // not find a schema.
        def schema = findArgumentSchema(meta.config, "output")
        // workaround: convert GString to String
        schema = iterateMap(schema, { it instanceof GString ? it.toString() : it })
        def schemaYaml = tempFile("schema.yaml")
        writeYaml(schema, schemaYaml)
        [
          "input": state.output_dataset,
          "schema": schemaYaml
        ]
      },
      toState: ["output_meta": "output"]
    )

    // Keep only `output` in the final state, mapped from the normalized file.
    // NOTE(review): the hvg/pca/knn and metadata outputs are dropped here --
    // confirm that publishing the normalized file alone is intended.
    | setState([
      "output": "output_normalized"
    ])

  emit:
  output_ch
}
111+

src/datasets/workflows/process_tenx_xenium/config.vsh.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ argument_groups:
1919
multiple: true
2020
- name: Metadata
2121
arguments:
22+
- type: string
23+
name: --id
24+
description: "A unique identifier for the dataset"
25+
required: true
2226
- name: --dataset_name
2327
type: string
2428
description: Nicely formatted name.

0 commit comments

Comments
 (0)