Merge branch 'main' of https://github.com/openproblems-bio/task_preprocessing_imagingbased_st into main

LouisK92 · LouisK92 · commit cb472a6ebfdc · 2024-09-18T22:41:45.000+02:00
diff --git a/_viash.yaml b/_viash.yaml
@@ -81,7 +81,12 @@ config_mods: |
   .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
 
 repositories:
-  - name: openproblems-v2
+  - name: openproblems  # alias that component configs reference via `repository: openproblems`
     type: github
-    repo: openproblems-bio/openproblems-v2
-    tag: main_build
+    repo: openproblems-bio/openproblems
+    tag: build/main
+  - name: core  # alias that component configs reference via `repository: core`
+    type: github
+    repo: openproblems-bio/core
+    tag: build/main
+    path: viash/core  # presumably the subfolder of the repo that holds the components — confirm against the viash docs
diff --git a/scripts/test_resources/2023_yao_mouse_brain_scrnaseq_10xv2.sh b/scripts/test_resources/2023_yao_mouse_brain_scrnaseq_10xv2.sh
@@ -8,27 +8,30 @@ cd "$REPO_ROOT"
 
 set -e
 
-DATASET_ID="2023_yao_mouse_brain_scrnaseq_10xv2"
-TMP_DIR="temp/datasets/$DATASET_ID"
-OUT_DIR="resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2"
-
-# generate sc reference
-VIASH_TEMP=/tmp/allen_brain_cell_atlas \
-  viash run src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml \
-  --keep true -- \
-  --regions "OLF;TH" \
-  --output "$TMP_DIR/tmp_dataset.h5ad" \
-  --dataset_id "$DATASET_ID" \
-  --dataset_name "ABCA Mouse Brain scRNAseq" \
-  --dataset_url "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717" \
-  --dataset_reference "10.1038/s41586-023-06812-z" \
-  --dataset_summary "A high-resolution scRNAseq atlas of cell types in the whole mouse brain" \
-  --dataset_description "See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset." \
-  --dataset_organism "mus_musculus"
-
-viash run src/data_processors/subset_reference/config.vsh.yaml -- \
-  --input "$TMP_DIR/tmp_dataset.h5ad" \
-  --output "$OUT_DIR/dataset.h5ad"
+cat > /tmp/params.yaml << 'HERE'  # quoted delimiter: the body is written verbatim, so $id lands literally in the params file
+param_list:
+  - id: 2023_yao_mouse_brain_scrnaseq_10xv2
+    regions:
+      - OLF
+      - TH
+    dataset_name: ABCA Mouse Brain scRNAseq
+    dataset_url: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717
+    dataset_reference: 10.1038/s41586-023-06812-z
+    dataset_summary: A high-resolution scRNAseq atlas of cell types in the whole mouse brain
+    dataset_description: See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset.
+    dataset_organism: mus_musculus
+
+output: "$id/dataset.h5ad"
+output_state: "$id/state.yaml"
+
+publish_dir: resources_test/common
+HERE
+# run the built dataset workflow with the parameter file written above
+nextflow run . \
+  -main-script target/nextflow/datasets/workflows/process_allen_brain_cell_atlas/main.nf \
+  -profile docker \
+  -resume \
+  -params-file /tmp/params.yaml
 
 aws s3 sync --profile op \
   "resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2" \
diff --git a/src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml b/src/datasets/loaders/allen_brain_cell_atlas/config.vsh.yaml
@@ -25,6 +25,16 @@ argument_groups:
           - MB
           - OLF
           - TH
+  - name: Caching  # optional arguments; both are declared as plain strings rather than file arguments
+    arguments:
+      - type: string
+        name: --cache_dir
+        required: false
+        description: Directory to cache the downloaded data.
+      - type: string
+        name: --output_cache_dir  # NOTE(review): how this differs from --cache_dir is not visible here — confirm in the loader script
+        required: false
+        description: Output directory of the cached data.
   - name: Metadata
     arguments:
       - type: string
diff --git a/src/datasets/workflows/process_allen_brain_cell_atlas/config.vsh.yaml b/src/datasets/workflows/process_allen_brain_cell_atlas/config.vsh.yaml
@@ -0,0 +1,122 @@
+name: process_allen_brain_cell_atlas  # component name
+namespace: datasets/workflows  # namespace the component is grouped under
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --abca_version
+        type: string
+        description: The version of the Allen Brain Cell Atlas to download data from.
+        required: false
+        default: "20230630"  # quoted so the version stays a string instead of being read as an integer
+      - name: --regions
+        type: string
+        multiple: true
+        description: A list of brain regions to download data for. If not provided, data for all regions will be downloaded.
+        required: false
+        default:
+          - CTXsp
+          - HPF
+          - HY
+          - Isocortex-1
+          - Isocortex-2
+          - Isocortex-3
+          - Isocortex-4
+          - MB
+          - OLF
+          - TH
+  - name: Metadata
+    arguments:
+      - name: --id
+        type: string
+        description: A unique identifier for the dataset
+        required: true
+      - name: --dataset_name
+        type: string
+        description: Nicely formatted name.
+        required: true
+      - name: --dataset_url
+        type: string
+        description: Link to the original source of the dataset.
+        required: false
+      - name: --dataset_reference
+        type: string
+        description: Bibtex reference of the paper in which the dataset was published.
+        required: false
+      - name: --dataset_summary
+        type: string
+        description: Short description of the dataset.
+        required: true
+      - name: --dataset_description
+        type: string
+        description: Long description of the dataset.
+        required: true
+      - name: --dataset_organism
+        type: string
+        description: The organism of the sample in the dataset.
+        required: false
+  - name: Sampling options
+    arguments:
+      - name: --do_subsample
+        type: boolean
+        default: false
+        description: Whether or not to subsample the dataset
+      - name: --n_obs
+        type: integer
+        description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed.
+        default: 500
+      - name: --n_vars
+        type: integer
+        description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed.
+        default: 500
+      - name: --keep_features
+        type: string
+        multiple: true
+        description: A list of genes to keep.
+      - name: --keep_cell_type_categories
+        type: string
+        multiple: true
+        description: Categories indexes to be selected
+        required: false
+      - name: --keep_batch_categories
+        type: string
+        multiple: true
+        description: Categories indexes to be selected
+        required: false
+      - name: --even
+        type: boolean_true
+        description: Subsample evenly from different batches
+      - name: --seed
+        type: integer
+        description: A seed for the subsampling.
+        example: 123
+  - name: Outputs
+    arguments:
+      - name: --output  # the only output argument this workflow declares
+        __merge__: /src/api/file_common_singlecell.yaml  # remaining fields are merged in from this API file
+        direction: output
+        required: true
+
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf  # name of the workflow in main.nf that is used as the entrypoint
+  - path: /common/nextflow_helpers/helper.nf  # main.nf includes this file as "${meta.resources_dir}/helper.nf"
+
+dependencies:
+  - name: datasets/loaders/allen_brain_cell_atlas  # no `repository` key: presumably resolved within this project — confirm
+  - name: datasets/processors/subsample
+    repository: openproblems  # alias of an entry under `repositories` in _viash.yaml
+  - name: datasets/normalization/log_cp
+    repository: openproblems
+  - name: datasets/processors/pca
+    repository: openproblems
+  - name: datasets/processors/hvg
+    repository: openproblems
+  - name: datasets/processors/knn
+    repository: openproblems
+  - name: h5ad/extract_uns_metadata
+    repository: core  # alias of the `core` entry under `repositories` in _viash.yaml
+
+runners:  # nextflow is the only runner declared for this workflow component
+  - type: nextflow
diff --git a/src/datasets/workflows/process_allen_brain_cell_atlas/main.nf b/src/datasets/workflows/process_allen_brain_cell_atlas/main.nf
@@ -0,0 +1,111 @@
+include { findArgumentSchema } from "${meta.resources_dir}/helper.nf"
+
+workflow auto {  // builds the input from `params` and the component config, then runs this component's workflow
+  findStates(params, meta.config)  // NOTE(review): findStates is not included in the visible part of this file — presumably provided by the viash runtime
+    | meta.workflow.run(
+      auto: [publish: "state"]
+    )
+}
+
+workflow run_wf {  // load ABCA data, optionally subsample, then normalize -> HVG -> PCA -> KNN and extract metadata
+  take:
+  input_ch
+
+  main:
+  output_ch = input_ch
+
+    // copy id to the state
+    | map{ id, state ->
+      def new_state = state + [dataset_id: id]
+      [id, new_state]
+    }
+
+    | allen_brain_cell_atlas.run(
+      fromState: [
+        "abca_version",
+        "regions",
+        "dataset_id",
+        "dataset_name",
+        "dataset_url",
+        "dataset_reference",
+        "dataset_summary",
+        "dataset_description",
+        "dataset_organism",
+      ],
+      toState: [
+        "output_raw": "output"
+      ]
+    )
+
+    // subsample if so desired
+    | subsample.run(
+      runIf: { id, state -> state.do_subsample },
+      fromState: [
+        "input": "output_raw",
+        "n_obs": "n_obs",
+        "n_vars": "n_vars",
+        "keep_features": "keep_features",
+        "keep_cell_type_categories": "keep_cell_type_categories",
+        "keep_batch_categories": "keep_batch_categories",
+        "even": "even",
+        "seed": "seed"
+      ],
+      args: [output_mod2: null],
+      toState: ["output_raw": "output"]  // the subsampled file replaces the loader output under the same state key
+    )
+
+    | log_cp.run(
+      key: "log_cp10k",
+      fromState: [
+        "input": "output_raw"
+      ],
+      args: [
+        "normalization_id": "log_cp10k",
+        "n_cp": 10000
+      ],
+      toState: [
+        "output_normalized": "output"
+      ]
+    )
+    | hvg.run(
+      fromState: ["input": "output_normalized"],
+      toState: ["output_hvg": "output"]
+    )
+
+    | pca.run(
+      fromState: ["input": "output_hvg"],
+      toState: ["output_pca": "output" ]
+    )
+
+    | knn.run(
+      fromState: ["input": "output_pca"],
+      toState: ["output_knn": "output"]
+    )
+    // add synonym
+    | map{ id, state ->
+      [id, state + [output_dataset: state.output_knn]]
+    }
+
+    | extract_uns_metadata.run(
+      fromState: { id, state ->
+        def schema = findArgumentSchema(meta.config, "output")  // the config declares the dataset file as --output; there is no --output_dataset argument
+        // workaround: convert GString to String
+        schema = iterateMap(schema, { it instanceof GString ? it.toString() : it })
+        def schemaYaml = tempFile("schema.yaml")
+        writeYaml(schema, schemaYaml)
+        [
+          "input": state.output_dataset,
+          "schema": schemaYaml
+        ]
+      },
+      toState: ["output_meta": "output"]
+    )
+
+    | setState([
+      "output": "output_normalized"  // NOTE(review): only the normalized file is mapped to `output`; the HVG/PCA/KNN results are not — confirm this is intended
+    ])
+
+  emit:
+  output_ch
+}
+
diff --git a/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml b/src/datasets/workflows/process_tenx_xenium/config.vsh.yaml
@@ -19,6 +19,10 @@ argument_groups:
         multiple: true
   - name: Metadata
     arguments:
+      - name: --id
+        type: string
+        description: A unique identifier for the dataset
+        required: true
       - name: --dataset_name
         type: string
         description: Nicely formatted name.