Skip to content

Commit

Permalink
transform-gisaid: use lzma to read .xz file
Browse files Browse the repository at this point in the history
Use the `lzma` module to read the compressed `gisaid.ndjson.xz` file so
that we do not have to keep the uncompressed file on disk.
  • Loading branch information
joverlee521 committed Oct 26, 2021
1 parent 9415711 commit 3f3e1d4
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions bin/transform-gisaid
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,6 +6,7 @@ import os
import argparse
import csv
import sys
+ import lzma
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent / "lib"))
Expand Down Expand Up @@ -53,7 +54,7 @@ if __name__ == '__main__':
formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument("gisaid_data",
- default="s3://nextstrain-ncov-private/gisaid.ndjson.gz",
+ default="s3://nextstrain-ncov-private/gisaid.ndjson.xz",
help="Newline-delimited GISAID JSON data")
parser.add_argument("--annotations",
default=str( base / "source-data/gisaid_annotations.tsv" ),
Expand Down Expand Up @@ -142,7 +143,7 @@ if __name__ == '__main__':
RAW_METADATA_FILENAME = args.output_metadata + '.raw'


- with open(args.gisaid_data, "r") as gisaid_fh :
+ with lzma.open(args.gisaid_data, "r") as gisaid_fh :

pipeline = (
LineToJsonDataSource(gisaid_fh)
Expand Down Expand Up @@ -245,7 +246,7 @@ if __name__ == '__main__':
updated_strain_names_by_line_no[entry[LINE_NUMBER_KEY]] = entry['strain']

if not args.sorted_fasta:
- with open(args.gisaid_data, "r") as gisaid_fh:
+ with lzma.open(args.gisaid_data, "r") as gisaid_fh:
for entry in (
LineToJsonDataSource(gisaid_fh)
| RenameAndAddColumns()
Expand Down

0 comments on commit 3f3e1d4

Please sign in to comment.