Skip to content

Commit

Permalink
Chromosomes must be named and unaligned sequences cannot include '-' …
Browse files Browse the repository at this point in the history
…characters.
  • Loading branch information
anna-parker committed Aug 30, 2024
1 parent 03dc892 commit ec0833e
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 12 deletions.
8 changes: 4 additions & 4 deletions ena-submission/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,11 @@ organisms:
- externalMetadataUpdater: ena
name: sraRunAccession
type: string
ebola-sudan:
ebola-zaire:
ingest:
taxon_id: 3052460
scientific_name: "Orthoebolavirus sudanense"
organismName: "Ebola Sudan"
taxon_id: 186538
scientific_name: "Orthoebolavirus zairense"
organismName: "Ebola Zaire"
externalMetadata:
- externalMetadataUpdater: ena
name: ncbiReleaseDate
Expand Down
4 changes: 1 addition & 3 deletions ena-submission/scripts/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def create_chromosome_list_object(
else:
entry = AssemblyChromosomeListFileObject(
object_name=f"{seq_key["accession"]}.{seq_key["version"]}",
chromosome_name="",
chromosome_name="main",
chromosome_type=chromosome_type,
)
entries.append(entry)
Expand Down Expand Up @@ -464,7 +464,6 @@ def assembly_table_handle_errors(
f"{config.backend_url}: ENA Submission pipeline found {len(entries_with_errors)} entries"
f" in assembly_table in status HAS_ERRORS or SUBMITTING for over {time_threshold}m"
)
logger.warning(error_msg)
send_slack_notification(
error_msg,
slack_config,
Expand All @@ -482,7 +481,6 @@ def assembly_table_handle_errors(
f"ENA Submission pipeline found {len(entries_waiting)} entries in assembly_table in"
f" status WAITING for over {time_threshold_waiting}h"
)
logger.warning(error_msg)
send_slack_notification(
config, error_msg, time=datetime.now(tz=pytz.utc), time_threshold=slack_time_threshold
)
Expand Down
6 changes: 4 additions & 2 deletions ena-submission/scripts/ena_submission_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,11 +258,13 @@ def create_fasta(
if len(unaligned_sequences.keys()) == 1:
entry = chromosome_list.chromosomes[0]
gz.write(f">{entry.object_name}\n".encode())
gz.write(f"{unaligned_sequences["main"]}\n".encode())
gz.write(f"{unaligned_sequences["main"].replace('-', 'N')}\n".encode())
else:
for entry in chromosome_list.chromosomes:
gz.write(f">{entry.object_name}\n".encode())
gz.write(f"{unaligned_sequences[entry.chromosome_name]}\n".encode())
gz.write(
f"{unaligned_sequences[entry.chromosome_name].replace('-', 'N')}\n".encode()
)

return filename

Expand Down
39 changes: 36 additions & 3 deletions ena-submission/scripts/test_ena_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,28 @@ def test_sample_set_construction(self):

class AssemblyCreationTests(unittest.TestCase):
def setUp(self):
self.unaligned_sequences = sample_data_in_submission_table["unaligned_nucleotide_sequences"]
self.unaligned_sequences_multi = sample_data_in_submission_table[
"unaligned_nucleotide_sequences"
]
self.unaligned_sequences = {
"main": "CTTAACTTTGAGAGAGTGAATT-",
}
self.seq_key = {"accession": "test_accession", "version": "test_version"}

def test_create_chromosome_list_multi_segment(self):
    """Check the gzipped chromosome list written for the multi-segment fixture."""
    # Expected decompressed content: tab-separated fields, one newline-terminated
    # row for each of the two segments (seg2, seg3).
    expected = b"test_accession.test_version_seg2\tseg2\tlinear-segmented\ntest_accession.test_version_seg3\tseg3\tlinear-segmented\n"

    segments = create_chromosome_list_object(self.unaligned_sequences_multi, self.seq_key)
    gz_path = create_chromosome_list(segments)

    # The file is read back through gzip, so compare the decompressed bytes.
    with gzip.GzipFile(gz_path, "rb") as handle:
        written = handle.read()

    self.assertEqual(written, expected)

def test_create_chromosome_list(self):
chromosome_list = create_chromosome_list_object(self.unaligned_sequences, self.seq_key)
file_name_chromosome_list = create_chromosome_list(chromosome_list)
Expand All @@ -176,18 +195,32 @@ def test_create_chromosome_list(self):

self.assertEqual(
content,
b"test_accession.test_version_seg2\tseg2\tlinear-segmented\ntest_accession.test_version_seg3\tseg3\tlinear-segmented\n",
b"test_accession.test_version\tmain\tlinear-segmented\n",
)

def test_create_fasta_multi(self):
    """Check the gzipped FASTA written for the multi-segment fixture."""
    # Expected decompressed content: a ">"-prefixed header line followed by the
    # sequence line, for seg2 and then seg3.
    expected = b">test_accession.test_version_seg2\nGCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT\n>test_accession.test_version_seg3\nCTTAACTTTGAGAGAGTGAATT\n"

    sequences = self.unaligned_sequences_multi
    segments = create_chromosome_list_object(sequences, self.seq_key)
    fasta_path = create_fasta(sequences, segments)

    # The file is read back through gzip, so compare the decompressed bytes.
    with gzip.GzipFile(fasta_path, "rb") as handle:
        written = handle.read()

    self.assertEqual(written, expected)

def test_create_fasta(self):
# Also check that '-' characters in the sequence are converted to 'N'
chromosome_list = create_chromosome_list_object(self.unaligned_sequences, self.seq_key)
fasta_file_name = create_fasta(self.unaligned_sequences, chromosome_list)

with gzip.GzipFile(fasta_file_name, "rb") as gz:
content = gz.read()
self.assertEqual(
content,
b">test_accession.test_version_seg2\nGCGGCACGTCAGTACGTAAGTGTATCTCAAAGAAATACTTAACTTTGAGAGAGTGAATT\n>test_accession.test_version_seg3\nCTTAACTTTGAGAGAGTGAATT\n",
b">test_accession.test_version\nCTTAACTTTGAGAGAGTGAATTN\n",
)

def test_create_manifest(self):
Expand Down

0 comments on commit ec0833e

Please sign in to comment.