run-nextclade-full: keep input FASTA compressed

Download the input FASTA as-is (still xz-compressed) and read it directly with Python's `lzma` module, so that we no longer store a separate uncompressed copy of the input FASTA on disk.
nextstrain · Oct 26, 2021 · 7e4ff36 · 7e4ff36
1 parent fcba89c
commit 7e4ff36
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 3 deletions.
diff --git a/bin/run-nextclade-full b/bin/run-nextclade-full
@@ -91,7 +91,7 @@ main() {
   DATE_UTC=$(date -u "+%Y-%m-%d--%H-%M-%S--%Z")
   S3_DST="$S3_SRC/nextclade-full-run-${DATE_UTC}"
 
-  INPUT_FASTA="data/${DATABASE}/sequences.fasta"
+  INPUT_FASTA="data/${DATABASE}/sequences.fasta.xz"
   OUTPUT_TSV="data/${DATABASE}/nextclade.tsv"
   TMP_DIR_FASTA="tmp/${DATABASE}/fasta"
   TMP_DIR_TSV="tmp/${DATABASE}/clades"
@@ -135,7 +135,7 @@ main() {
   fi
 
   echo "[ INFO] ${0}:${LINENO}: Downloading '${S3_SRC}/sequences.fasta.xz' to '${INPUT_FASTA}'"
-  aws s3 cp --no-progress "${S3_SRC}/sequences.fasta.xz" - | xz -T0 -cdfq >"${INPUT_FASTA}"
+  aws s3 cp --no-progress "${S3_SRC}/sequences.fasta.xz" "${INPUT_FASTA}"
 
   echo "[ INFO] ${0}:${LINENO}: Splitting '${INPUT_FASTA}' into batches of size ${BATCH_SIZE} sequences and storing them in '${INPUT_WILDCARD}'"
   # Split fasta file to multiple batches

diff --git a/bin/split-fasta b/bin/split-fasta
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import argparse
 import os
+import lzma
 
 from Bio import SeqIO
 
@@ -56,7 +57,8 @@ def main():
     input_filename = os.path.basename(args.input_file)
     batch_size = int(args.batch_size)
 
-    with open(args.input_file) as f_input:
+    # lzma.open defaults to binary mode; pass "rt" because SeqIO requires FASTA files to be opened in text mode
+    with lzma.open(args.input_file, "rt") as f_input:
         record_iter = SeqIO.parse(f_input, file_format)
         for i, batch in enumerate(batch_iterator(record_iter, batch_size)):
             filename = os.path.join(args.output_dir, f"{input_filename}.batch-{i:05}.fasta")