From 38f6a76339c308dcd1481b719aa702ad2950ffd2 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Tue, 24 Sep 2024 15:14:56 +0200
Subject: [PATCH] Update resource and benchmark scripts and workflow

---
 nextflow.config                               |  2 +-
 .../create_resources/process_10x_xenium.sh    |  7 +++---
 .../process_allen_brain_cell_atlas_brain.sh   |  3 +--
 scripts/create_resources/process_datasets.sh  | 11 ++++++---
 scripts/run_benchmark/run_full_local.sh       | 13 ++--------
 scripts/run_benchmark/run_full_seqeracloud.sh | 24 +++++++++----------
 scripts/run_benchmark/run_test_seqeracloud.sh |  6 +++++
 src/workflows/run_benchmark/main.nf           |  3 +++
 src/workflows/run_benchmark/test.sh           |  2 +-
 9 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index 8fc6c4e3..6402ebf2 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1 +1 @@
-process.container = 'nextflow/bash:latest'
\ No newline at end of file
+process.container = 'nextflow/bash:latest'
diff --git a/scripts/create_resources/process_10x_xenium.sh b/scripts/create_resources/process_10x_xenium.sh
index 6eae8e4a..d17a4ade 100755
--- a/scripts/create_resources/process_10x_xenium.sh
+++ b/scripts/create_resources/process_10x_xenium.sh
@@ -23,7 +23,7 @@ param_list:
     segmentation_id: [cell, nucleus]
 
   - id: "10x_xenium/2023_10x_mouse_brain_xenium/rep2"
-    input: https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip
+    input: https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_2/Xenium_V1_FF_Mouse_Brain_MultiSection_2_outs.zip
     dataset_name: "Xenium V1 Fresh Frozen Mouse Brain replicate 2"
     dataset_url: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard"
     dataset_summary: "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1)."
@@ -32,7 +32,7 @@ param_list:
     segmentation_id: [cell, nucleus]
 
   - id: "10x_xenium/2023_10x_mouse_brain_xenium/rep3"
-    input: https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip
+    input: https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_3/Xenium_V1_FF_Mouse_Brain_MultiSection_3_outs.zip
     dataset_name: "Xenium V1 Fresh Frozen Mouse Brain replicate 3"
     dataset_url: "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard"
     dataset_summary: "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1)."
@@ -45,13 +45,12 @@ output_state: "\$id/state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-tw launch openproblems-bio/task_ist_preprocessing \
+tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
   --revision build/main \
   --pull-latest \
   --main-script target/nextflow/datasets/workflows/process_tenx_xenium/main.nf \
   --workspace 53907369739130 \
   --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
   --params-file /tmp/params.yaml \
-  --entry-name auto \
   --config common/nextflow_helpers/labels_tw.config \
   --labels datasets,10x_xenium
diff --git a/scripts/create_resources/process_allen_brain_cell_atlas_brain.sh b/scripts/create_resources/process_allen_brain_cell_atlas_brain.sh
index 8ebdf3dc..706d79e1 100755
--- a/scripts/create_resources/process_allen_brain_cell_atlas_brain.sh
+++ b/scripts/create_resources/process_allen_brain_cell_atlas_brain.sh
@@ -38,13 +38,12 @@ output_state: "\$id/state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-tw launch openproblems-bio/task_ist_preprocessing \
+tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
   --revision build/main \
   --pull-latest \
   --main-script target/nextflow/datasets/workflows/process_allen_brain_cell_atlas/main.nf \
   --workspace 53907369739130 \
   --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
   --params-file /tmp/params.yaml \
-  --entry-name auto \
   --config common/nextflow_helpers/labels_tw.config \
   --labels datasets,allen_brain_cell_atlas
diff --git a/scripts/create_resources/process_datasets.sh b/scripts/create_resources/process_datasets.sh
index a7117493..ee7ae3f9 100755
--- a/scripts/create_resources/process_datasets.sh
+++ b/scripts/create_resources/process_datasets.sh
@@ -11,13 +11,18 @@ set -e
 input_dir="s3://openproblems-data/resources/datasets"
 publish_dir="s3://openproblems-data/resources/task_ist_preprocessing/datasets"
 
-
 cat > /tmp/params.yaml << HERE
 param_list:
 
   - id: "mouse_brain_combined/rep1"
     input_sp: "$input_dir/10x_xenium/2023_10x_mouse_brain_xenium/rep1/dataset.zarr"
     input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad"
+  - id: "mouse_brain_combined/rep2"
+    input_sp: "$input_dir/10x_xenium/2023_10x_mouse_brain_xenium/rep2/dataset.zarr"
+    input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad"
+  - id: "mouse_brain_combined/rep3"
+    input_sp: "$input_dir/10x_xenium/2023_10x_mouse_brain_xenium/rep3/dataset.zarr"
+    input_sc: "$input_dir/allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad"
 
 output_sc: "\$id/output_sc.h5ad"
 output_sp: "\$id/output_sp.zarr"
@@ -25,7 +30,7 @@ output_state: "\$id/state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-tw launch openproblems-bio/task_ist_preprocessing \
+tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
   --revision build/main \
   --pull-latest \
   --main-script target/nextflow/workflows/process_datasets/main.nf \
@@ -34,4 +39,4 @@ tw launch openproblems-bio/task_ist_preprocessing \
   --params-file /tmp/params.yaml \
   --entry-name auto \
   --config common/nextflow_helpers/labels_tw.config \
-  --labels datasets,10x_xenium
+  --labels task_ist_preprocessing,process_datasets
diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh
index 8c63393b..5496a205 100755
--- a/scripts/run_benchmark/run_full_local.sh
+++ b/scripts/run_benchmark/run_full_local.sh
@@ -11,14 +11,6 @@ cd "$REPO_ROOT"
 # please refer to the nextflow information for more details:
 # https://www.nextflow.io/docs/latest/
 
-# remove this when you have implemented the script
-echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it."
-echo "  Step 1: replace 'task_template' with the name of the task in the following command."
-echo "  Step 2: replace the rename keys parameters to fit your run_benchmark inputs"
-echo "  Step 3: replace the settings parameter to fit your run_benchmark outputs"
-echo "  Step 4: remove this message"
-exit 1
-
 set -e
 
 echo "Running benchmark on test data"
@@ -31,14 +23,13 @@ publish_dir="resources/results/${RUN_ID}"
 # write the parameters to file
 cat > /tmp/params.yaml << HERE
 input_states: resources/datasets/**/state.yaml
-rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution'
+rename_keys: 'input_sc:output_sc;input_sp:output_sp'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
 # run the benchmark
-nextflow run openproblems-bio/task_template \
-  --revision build/main \
+nextflow run . \
   -main-script target/nextflow/workflows/run_benchmark/main.nf \
   -profile docker \
   -resume \
diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh
index 87d133c4..8466540d 100755
--- a/scripts/run_benchmark/run_full_seqeracloud.sh
+++ b/scripts/run_benchmark/run_full_seqeracloud.sh
@@ -6,29 +6,21 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 
-# remove this when you have implemented the script
-echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it."
-echo "  Step 1: replace 'task_template' with the name of the task in the following command."
-echo "  Step 2: replace the rename keys parameters to fit your run_benchmark inputs"
-echo "  Step 3: replace the settings parameter to fit your run_benchmark outputs"
-echo "  Step 4: remove this message"
-exit 1
-
 set -e
 
 # generate a unique id
 RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}"
+publish_dir="s3://openproblems-data/resources/task_ist_preprocessing/results/${RUN_ID}"
 
 # write the parameters to file
 cat > /tmp/params.yaml << HERE
-input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml
-rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution'
+input_states: s3://openproblems-data/resources/task_ist_preprocessing/datasets/**/state.yaml
+rename_keys: 'input_sc:output_sc;input_sp:output_sp'
 output_state: "state.yaml"
 publish_dir: "$publish_dir"
 HERE
 
-tw launch https://github.com/openproblems-bio/task_template.git \
+tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
   --revision build/main \
   --pull-latest \
   --main-script target/nextflow/workflows/run_benchmark/main.nf \
@@ -37,4 +29,10 @@ tw launch https://github.com/openproblems-bio/task_template.git \
   --params-file /tmp/params.yaml \
   --entry-name auto \
   --config common/nextflow_helpers/labels_tw.config \
-  --labels task_template,full
\ No newline at end of file
+  --labels task_ist_preprocessing,full
+
+aws s3 sync \
+  s3://openproblems-data/resources/task_ist_preprocessing/results \
+  resources/task_ist_preprocessing/results \
+  --profile op \
+  --dryrun
diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh
index 4f556898..72446616 100755
--- a/scripts/run_benchmark/run_test_seqeracloud.sh
+++ b/scripts/run_benchmark/run_test_seqeracloud.sh
@@ -29,3 +29,9 @@ tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
   --params-file /tmp/params.yaml \
   --config common/nextflow_helpers/labels_tw.config \
   --labels task_template,test
+
+aws s3 sync \
+  s3://openproblems-nextflow/temp/results \
+  temp_results \
+  --profile op \
+  --dryrun
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index 58116229..934a6cbb 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -361,6 +361,9 @@ workflow run_wf {
     | extract_uns_metadata.run(
       key: "extract_uns_scores",
       fromState: [input: "output_metric"],
+      args: [
+        uns_length_cutoff: 100
+      ],
       toState: { id, output, state ->
         state + [
           score_uns: readYaml(output.output).uns
diff --git a/src/workflows/run_benchmark/test.sh b/src/workflows/run_benchmark/test.sh
index b0dbc249..3ac912ac 100755
--- a/src/workflows/run_benchmark/test.sh
+++ b/src/workflows/run_benchmark/test.sh
@@ -10,7 +10,7 @@ set -e
 
 # export TOWER_WORKSPACE_ID=53907369739130
 
-DATASETS_DIR="resources_test/task_template"
+DATASETS_DIR="resources_test/task_ist_preprocessing"
 OUTPUT_DIR="output/temp"
 
 if [ ! -d "$OUTPUT_DIR" ]; then