From ec0833ed105b47392dcf79bd1d8970ac05a9dc00 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Fri, 30 Aug 2024 12:36:48 +0200 Subject: [PATCH] Chromosomes must be named and unaligned sequences cannot include '-' characters. --- ena-submission/config/config.yaml | 8 ++-- ena-submission/scripts/create_assembly.py | 4 +- .../scripts/ena_submission_helper.py | 6 ++- ena-submission/scripts/test_ena_submission.py | 39 +++++++++++++++++-- 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/ena-submission/config/config.yaml b/ena-submission/config/config.yaml index be844a509..d98f0d21f 100644 --- a/ena-submission/config/config.yaml +++ b/ena-submission/config/config.yaml @@ -76,11 +76,11 @@ organisms: - externalMetadataUpdater: ena name: sraRunAccession type: string - ebola-sudan: + ebola-zaire: ingest: - taxon_id: 3052460 - scientific_name: "Orthoebolavirus sudanense" - organismName: "Ebola Sudan" + taxon_id: 186538 + scientific_name: "Orthoebolavirus zairense" + organismName: "Ebola Zaire" externalMetadata: - externalMetadataUpdater: ena name: ncbiReleaseDate diff --git a/ena-submission/scripts/create_assembly.py b/ena-submission/scripts/create_assembly.py index 6a4ec113c..9119b6cda 100644 --- a/ena-submission/scripts/create_assembly.py +++ b/ena-submission/scripts/create_assembly.py @@ -89,7 +89,7 @@ def create_chromosome_list_object( else: entry = AssemblyChromosomeListFileObject( object_name=f"{seq_key["accession"]}.{seq_key["version"]}", - chromosome_name="", + chromosome_name="main", chromosome_type=chromosome_type, ) entries.append(entry) @@ -464,7 +464,6 @@ def assembly_table_handle_errors( f"{config.backend_url}: ENA Submission pipeline found {len(entries_with_errors)} entries" f" in assembly_table in status HAS_ERRORS or SUBMITTING for over {time_threshold}m" ) - logger.warning(error_msg) send_slack_notification( error_msg, slack_config, @@ -482,7 +481,6 @@ def assembly_table_handle_errors( f"ENA Submission pipeline found {len(entries_waiting)} entries in assembly_table in" f" status WAITING for over {time_threshold_waiting}h" ) - logger.warning(error_msg) send_slack_notification( config, error_msg, time=datetime.now(tz=pytz.utc), time_threshold=slack_time_threshold ) diff --git a/ena-submission/scripts/ena_submission_helper.py b/ena-submission/scripts/ena_submission_helper.py index b25adb6cd..5a5a885f9 100644 --- a/ena-submission/scripts/ena_submission_helper.py +++ b/ena-submission/scripts/ena_submission_helper.py @@ -258,11 +258,13 @@ def create_fasta( if len(unaligned_sequences.keys()) == 1: entry = chromosome_list.chromosomes[0] gz.write(f">{entry.object_name}\n".encode()) - gz.write(f"{unaligned_sequences["main"]}\n".encode()) + gz.write(f"{unaligned_sequences["main"].replace('-', 'N')}\n".encode()) else: for entry in chromosome_list.chromosomes: gz.write(f">{entry.object_name}\n".encode()) - gz.write(f"{unaligned_sequences[entry.chromosome_name]}\n".encode()) + gz.write( + f"{unaligned_sequences[entry.chromosome_name].replace('-', 'N')}\n".encode() + ) return filename diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py index b6313a805..8588787bc 100644 --- a/ena-submission/scripts/test_ena_submission.py +++ b/ena-submission/scripts/test_ena_submission.py @@ -164,9 +164,28 @@ def test_sample_set_construction(self): class AssemblyCreationTests(unittest.TestCase): def setUp(self): - self.unaligned_sequences = sample_data_in_submission_table["unaligned_nucleotide_sequences"] + self.unaligned_sequences_multi = sample_data_in_submission_table[ + "unaligned_nucleotide_sequences" + ] + self.unaligned_sequences = { + "main": "CTTAACTTTGAGAGAGTGAATT-", + } self.seq_key = {"accession": "test_accession", "version": "test_version"} + def test_create_chromosome_list_multi_segment(self): + chromosome_list = create_chromosome_list_object( + self.unaligned_sequences_multi, self.seq_key + ) + file_name_chromosome_list = create_chromosome_list(chromosome_list) + + with gzip.GzipFile(file_name_chromosome_list, "rb") as gz: + content = gz.read() + + self.assertEqual( + content, + b"test_accession.test_version_seg2\tseg2\tlinear-segmented\ntest_accession.test_version_seg3\tseg3\tlinear-segmented\n", + ) + def test_create_chromosome_list(self): chromosome_list = create_chromosome_list_object(self.unaligned_sequences, self.seq_key) file_name_chromosome_list = create_chromosome_list(chromosome_list) @@ -176,10 +195,24 @@ def test_create_chromosome_list(self): self.assertEqual( content, - b"test_accession.test_version_seg2\tseg2\tlinear-segmented\ntest_accession.test_version_seg3\tseg3\tlinear-segmented\n", + b"test_accession.test_version\tmain\tlinear-segmented\n", + ) + + def test_create_fasta_multi(self): + chromosome_list = create_chromosome_list_object( + self.unaligned_sequences_multi, self.seq_key + ) + fasta_file_name = create_fasta(self.unaligned_sequences_multi, chromosome_list) + + with gzip.GzipFile(fasta_file_name, "rb") as gz: + content = gz.read() + self.assertEqual( + content, + b">test_accession.test_version_seg2\nGCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT\n>test_accession.test_version_seg3\nCTTAACTTTGAGAGAGTGAATT\n", ) def test_create_fasta(self): + # Also check that - is converted to N chromosome_list = create_chromosome_list_object(self.unaligned_sequences, self.seq_key) fasta_file_name = create_fasta(self.unaligned_sequences, chromosome_list) @@ -187,7 +220,7 @@ def test_create_fasta(self): content = gz.read() self.assertEqual( content, - b">test_accession.test_version_seg2\nGCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT\n>test_accession.test_version_seg3\nCTTAACTTTGAGAGAGTGAATT\n", + b">test_accession.test_version\nCTTAACTTTGAGAGAGTGAATTN\n", ) def test_create_manifest(self):