Commit
Add 100k open builds (subsamples)
jameshadfield committed Jun 29, 2023
1 parent 831bfe4 commit 74a68b1
Showing 4 changed files with 83 additions and 14 deletions.
51 changes: 42 additions & 9 deletions .github/workflows/rebuild-100k.yml
@@ -17,7 +17,7 @@ jobs:
with:
python-version: "3.10"

- name: Launch build
- name: Launch GISAID build
run: |
set -x
@@ -31,27 +31,60 @@ jobs:
--memory 31GiB \
. \
upload \
--configfile nextstrain_profiles/100k/config.yaml \
--configfile nextstrain_profiles/100k/config-gisaid.yaml \
--config "${config[@]}" \
--set-threads tree=8 \
|& tee build-launch.log
|& tee build-launch-gisaid.log
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}

- name: Launch open build
run: |
set -x
declare -a config
config+=(slack_token=$SLACK_TOKEN)
nextstrain build \
--aws-batch \
--detach \
--cpus 16 \
--memory 31GiB \
. \
upload \
--configfile nextstrain_profiles/100k/config-open.yaml \
--config "${config[@]}" \
--set-threads tree=8 \
|& tee build-launch-open.log
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}


- name: Build info
run: |
echo "--> 100k sample rebuilding on AWS"
echo "--> 100k sample rebuilding via 2 AWS jobs"
echo
echo "--> When completed, the following 2 files will be updated:"
echo "--> When completed, the following files will be updated:"
echo "s3://nextstrain-data/files/ncov/open/100k/metadata.tsv.xz"
echo "s3://nextstrain-data/files/ncov/open/100k/sequences.fasta.xz"
echo "s3://nextstrain-ncov-private/100k/metadata.tsv.xz"
echo "s3://nextstrain-ncov-private/100k/sequences.fasta.xz"
echo
echo "--> You can attach to this AWS job via:"
tail -n1 build-launch.log
echo "--> You can attach to the GISAID AWS job via:"
tail -n1 build-launch-gisaid.log
echo
echo "--> You can attach to the OPEN AWS job via:"
tail -n1 build-launch-open.log
echo
JOBID=$( tail -n1 build-launch-gisaid.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )
echo "--> View the GISAID job in the AWS console via"
echo " https://console.aws.amazon.com/batch/home?region=us-east-1#jobs/detail/${JOBID}"
echo
JOBID=$( tail -n1 build-launch.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )
echo "--> View this job in the AWS console via"
JOBID=$( tail -n1 build-launch-open.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )
echo "--> View the OPEN job in the AWS console via"
echo " https://console.aws.amazon.com/batch/home?region=us-east-1#jobs/detail/${JOBID}"
echo
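The `sed` one-liner in the step above turns the final launch-log line into a bare AWS Batch job ID by capturing the hyphenated hex token after the word `attach` and replacing the whole line with it. A small Python sketch of the same extended regex (the sample log line here is hypothetical; the real one is whatever `nextstrain build --detach` prints last):

```python
import re

# Same extended regex as the workflow's
# `sed -E 's/.+attach ([-a-f0-9]+).+/\1/'`:
# greedy prefix, the literal word "attach ", then the job ID.
PATTERN = re.compile(r".+attach ([-a-f0-9]+).+")

def job_id(log_line):
    """Return the AWS Batch job ID embedded in a launch-log line, or None."""
    match = PATTERN.fullmatch(log_line.rstrip("\n"))
    return match.group(1) if match else None

# Hypothetical final log line, for illustration only.
line = "nextstrain build --aws-batch --attach 3126c034-f712-4f18-9b45-6cd26e4d5a73 ."
print(job_id(line))  # → 3126c034-f712-4f18-9b45-6cd26e4d5a73
```

Note that, unlike this `fullmatch` version, the `sed` form passes non-matching lines through unchanged, so an unexpected last log line would leave `JOBID` set to that whole line rather than failing loudly.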
16 changes: 11 additions & 5 deletions nextstrain_profiles/100k/README.md
@@ -1,21 +1,27 @@
## Aim

To build a representative 100k dataset which is available for testing / developing builds locally.
This is intended to run weekly via a GitHub action (which triggers a job to be run on AWS).
It will make two files available:
This is intended to run weekly via a GitHub action (which triggers jobs to be run on AWS).
It will upload these files:

* `s3://nextstrain-data/files/ncov/open/100k/metadata.tsv.xz`
* `s3://nextstrain-data/files/ncov/open/100k/sequences.fasta.xz`
* `s3://nextstrain-ncov-private/100k/metadata.tsv.xz`
* `s3://nextstrain-ncov-private/100k/sequences.fasta.xz`

While this profile is not intended to be run locally, you can see which rules would run via:

```
snakemake --cores 1 --configfile nextstrain_profiles/100k/config.yaml -npf upload --dag | dot -Tpdf > dag.pdf
snakemake --cores 1 --configfile nextstrain_profiles/100k/config-gisaid.yaml -npf upload --dag | dot -Tpdf > dag-100k-gisaid.pdf
snakemake --cores 1 --configfile nextstrain_profiles/100k/config-open.yaml -npf upload --dag | dot -Tpdf > dag-100k-open.pdf
```

To run manually you can trigger the GitHub action or run the job locally via:
To run manually you can trigger the GitHub action (recommended) or run the jobs locally via:
```
nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \
--configfile nextstrain_profiles/100k/config.yaml \
--configfile nextstrain_profiles/100k/config-gisaid.yaml \
-f upload
nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \
--configfile nextstrain_profiles/100k/config-open.yaml \
-f upload
```
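Once a run finishes, the uploaded metadata can be sanity-checked without decompressing it fully. A minimal sketch using Python's stdlib `lzma`, assuming the file is a plain tab-separated table inside the `.xz` container (as the filenames suggest):

```python
import csv
import lzma

def peek_tsv_xz(path, n_rows=5):
    """Stream the header and first n_rows of an .xz-compressed TSV,
    without decompressing the whole file to disk."""
    with lzma.open(path, "rt", encoding="utf-8", newline="") as handle:
        reader = csv.reader(handle, delimiter="\t")
        header = next(reader)
        rows = [row for _, row in zip(range(n_rows), reader)]
    return header, rows
```

For example, `peek_tsv_xz("metadata.tsv.xz")` after fetching `s3://nextstrain-data/files/ncov/open/100k/metadata.tsv.xz`.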
File renamed without changes.
30 changes: 30 additions & 0 deletions nextstrain_profiles/100k/config-open.yaml
@@ -0,0 +1,30 @@
# This file is largely duplicated from `config-gisaid.yaml` - please
# see that file for comments
S3_DST_BUCKET: "nextstrain-data/files/ncov/open/100k" # TODO XXX
S3_DST_ORIGINS: [needed-for-workflow-but-unused]
deploy_url: needed_for_workflow_but_unused
custom_rules:
- workflow/snakemake_rules/export_for_nextstrain.smk
inputs:
- name: open
metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.zst"
aligned: "s3://nextstrain-data/files/ncov/open/sequences.fasta.zst"
skip_sanitize_metadata: true
builds:
100k:
subsampling_scheme: 100k_scheme
upload:
metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz
sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz
filter:
exclude_where: "division='USA'"
subsampling:
100k_scheme:
50k_early:
group_by: "year month country"
max_sequences: 50000
max_date: "--max-date 1Y"
50k_late:
group_by: "year month country"
max_sequences: 50000
min_date: "--min-date 1Y"
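The `100k_scheme` above splits the 100k budget into two date tiers, each capped at 50,000 sequences and spread across `year month country` groups. A toy Python sketch of that two-tier, group-capped draw (an illustration of the idea only, not augur's actual implementation):

```python
import random
from collections import defaultdict

def group_cap_sample(records, group_key, max_sequences, rng):
    """Draw up to max_sequences records, round-robin across groups so
    no single (year, month, country) group dominates the sample."""
    groups = defaultdict(list)
    for record in records:
        groups[group_key(record)].append(record)
    for members in groups.values():
        rng.shuffle(members)
    picked, depth = [], 0
    while len(picked) < max_sequences:
        progressed = False
        for members in groups.values():
            if depth < len(members) and len(picked) < max_sequences:
                picked.append(members[depth])
                progressed = True
        if not progressed:
            break  # every group exhausted before the cap was reached
        depth += 1
    return picked

def two_tier_subsample(records, cutoff, per_tier, seed=0):
    """Sketch of the 100k scheme: half the budget before `cutoff`
    (the "early" tier), half on or after it (the "late" tier)."""
    rng = random.Random(seed)
    key = lambda r: (r["date"].year, r["date"].month, r["country"])
    early = [r for r in records if r["date"] < cutoff]
    late = [r for r in records if r["date"] >= cutoff]
    return (group_cap_sample(early, key, per_tier, rng)
            + group_cap_sample(late, key, per_tier, rng))
```

In the real workflow the cutoff is relative (`1Y`, one year before the run date), and the grouped, capped draws roughly correspond to `augur filter`'s `--group-by` / `--subsample-max-sequences` machinery.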
