From b16aebccc9802c441f13fe3ef3e9a5dd41487493 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 9 May 2024 16:37:34 +0100 Subject: [PATCH 1/3] Rename so pytest finds files; remove duplicate test file --- .../cmds_tests/{amr_tests.py => amr_test.py} | 0 tests/metagenomics_tests/MTBC_tests.py | 104 ------------------ 2 files changed, 104 deletions(-) rename tests/cmds_tests/{amr_tests.py => amr_test.py} (100%) delete mode 100644 tests/metagenomics_tests/MTBC_tests.py diff --git a/tests/cmds_tests/amr_tests.py b/tests/cmds_tests/amr_test.py similarity index 100% rename from tests/cmds_tests/amr_tests.py rename to tests/cmds_tests/amr_test.py diff --git a/tests/metagenomics_tests/MTBC_tests.py b/tests/metagenomics_tests/MTBC_tests.py deleted file mode 100644 index 48fb8265..00000000 --- a/tests/metagenomics_tests/MTBC_tests.py +++ /dev/null @@ -1,104 +0,0 @@ -from unittest import TestCase -import os -import sys -sys.path.append(".") -from mykrobe.metagenomics import AMRSpeciesPredictor -DATA_DIR = os.path.join("tests", "ref_data") - -class MTBCSpeciesTests(TestCase): - - def setUp(self): - self.hierarchy_json_file = f"{DATA_DIR}/mtbc_hierarchy.json" - - def teardown(self): - pass - - def test_mixed_MTBC_NTM(self): - species_predictor = AMRSpeciesPredictor( - phylo_group_covgs={}, - sub_complex_covgs={}, - species_covgs={}, - lineage_covgs={}, - hierarchy_json_file=self.hierarchy_json_file) - species_predictor.out_json["phylogenetics"] = {} - species_predictor.out_json["phylogenetics"]["phylo_group"] = { - "Non_tuberculosis_mycobacterium_complex": { - "percent_coverage": 58.71542975006994, - "median_depth": 36 - }, - "Mycobacterium_tuberculosis_complex": { - "percent_coverage": 62.81850563578579, - "median_depth": 2 - } - } - assert species_predictor.is_mtbc_present() - assert species_predictor.is_ntm_present() - - assert len(species_predictor._get_present_phylo_groups( - species_predictor.out_json["phylogenetics"]["phylo_group"])) == 2 - - def test_get_best_coverage_dict(self): - species_predictor = AMRSpeciesPredictor( - phylo_group_covgs={}, - sub_complex_covgs={}, - species_covgs={}, - lineage_covgs={}, - hierarchy_json_file=self.hierarchy_json_file) - - best_species = species_predictor._get_best_coverage_dict({ - "Mycobacterium_chimaera": { - "percent_coverage": 99.162, - "median_depth": 39 - }, - "Mycobacterium_intracellulare": { - "percent_coverage": 98.662, - "median_depth": 45 - }, - "Mycobacterium_bovis": { - "percent_coverage": 9.894, - "median_depth": 12.0 - }}).keys() - assert list(best_species) == ["Mycobacterium_chimaera"] - - def test_mixed_chimera(self): - species_predictor = AMRSpeciesPredictor( - phylo_group_covgs={}, - sub_complex_covgs={}, - species_covgs={}, - lineage_covgs={}, - hierarchy_json_file=self.hierarchy_json_file) - species_predictor.out_json["phylogenetics"] = { - "sub_complex": { - "Mycobacterium_avium_complex": { - "percent_coverage": 98.346, - "median_depth": 54.0 - } - }, - "phylo_group": { - "Non_tuberculosis_mycobacterium_complex": { - "percent_coverage": 82.846, - "median_depth": 49 - } - }, - "species": { - "Mycobacterium_chimaera": { - "percent_coverage": 99.162, - "median_depth": 39 - }, - "Mycobacterium_intracellulare": { - "percent_coverage": 98.662, - "median_depth": 45 - }, - "Mycobacterium_bovis": { - "percent_coverage": 9.894, - "median_depth": 12.0 - } - } - } - - out_dict = species_predictor.choose_best( - species_predictor.out_json["phylogenetics"]) - - assert "Mycobacterium_chimaera" in out_dict["species"] - assert "Mycobacterium_intracellulare" in out_dict["species"] - assert "Mycobacterium_bovis" not in out_dict["species"] From e4195c00d6a025c9792cf2d73bb63eb135b29176 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 9 May 2024 16:41:50 +0100 Subject: [PATCH 2/3] Remove classes/setup/teardown, use pytest.fixture; all tests discoverable by pytest --- src/mykrobe/cmds/dump.py | 2 +- src/mykrobe/cmds/makeprobes.py | 2 +- src/mykrobe/cmds/variants/add.py | 4 +- tests/annotation_tests/test_annotations.py | 813 ++++++++--------- tests/cmds_tests/amr_test.py | 1 - tests/end_to_end_test.py | 1 - tests/metagenomics_tests/basic_test.py | 79 +- tests/metagenomics_tests/lineages_test.py | 134 ++- tests/metagenomics_tests/test_MTBC.py | 161 ++-- tests/predict_tests/test_amr.py | 69 +- tests/predict_tests/test_model.py | 71 +- tests/probe_tests/base.py | 6 +- tests/probe_tests/conftest.py | 44 + tests/probe_tests/test_indel_only.py | 484 +++++----- tests/probe_tests/test_large_indel.py | 163 ++-- tests/probe_tests/test_models.py | 129 +-- tests/probe_tests/test_probe_generation.py | 17 +- .../probe_tests/test_snp_indel_interaction.py | 824 ++++++++---------- tests/probe_tests/test_snp_only.py | 604 +++++++------ tests/species_data_tests/test_data_dir.py | 13 +- tests/species_data_tests/test_species_dir.py | 3 +- tests/stats_tests/test_coverage_lik.py | 59 +- .../test_presence_typer.py | 307 +++---- .../test_type_simple_vars.py | 539 +++++------- tests/utils_test.py | 2 +- tests/variant_tests/conftest.py | 35 + tests/variant_tests/test_calls.py | 197 ++--- tests/variant_tests/test_variants.py | 223 +++-- tests/vcf_tests/test_adding_vcf_to_db.py | 216 ++--- 29 files changed, 2496 insertions(+), 2706 deletions(-) create mode 100644 tests/probe_tests/conftest.py create mode 100644 tests/variant_tests/conftest.py diff --git a/src/mykrobe/cmds/dump.py b/src/mykrobe/cmds/dump.py index b0705254..b09d0c00 100644 --- a/src/mykrobe/cmds/dump.py +++ b/src/mykrobe/cmds/dump.py @@ -32,7 +32,7 @@ def get_non_singelton_variants(db_name): def run(parser, args): db_name = '%s-%s' % (DB_PREFIX, args.db_name) - DB = connect(db_name, host=args.db_uri) + DB = connect(db_name, host=args.db_uri, uuidRepresentation="pythonLegacy") if args.verbose: logger.setLevel(level=logging.DEBUG) else: diff --git a/src/mykrobe/cmds/makeprobes.py b/src/mykrobe/cmds/makeprobes.py index 8023411b..d4e9019f 100644 --- a/src/mykrobe/cmds/makeprobes.py +++ b/src/mykrobe/cmds/makeprobes.py @@ -39,7 +39,7 @@ def run(parser, args): logger.info("Not connecting to database, because --no-backgrounds option used") DB = None else: - DB = connect("%s-%s" % (DB_PREFIX, args.db_name), host=args.db_uri) + DB = connect("%s-%s" % (DB_PREFIX, args.db_name), host=args.db_uri, uuidRepresentation="pythonLegacy") if DB is not None: try: diff --git a/src/mykrobe/cmds/variants/add.py b/src/mykrobe/cmds/variants/add.py index 58f808a3..9c473063 100644 --- a/src/mykrobe/cmds/variants/add.py +++ b/src/mykrobe/cmds/variants/add.py @@ -48,7 +48,7 @@ def run(parser, args): else: logger.setLevel(logging.INFO) DBNAME = "%s-%s" % (DB_PREFIX, args.db_name) - connect(DBNAME, host=args.db_uri) + connect(DBNAME, host=args.db_uri, uuidRepresentation="pythonLegacy") logger.debug("Using DB %s" % DBNAME) with open(args.reference_set) as fp: @@ -58,7 +58,7 @@ def run(parser, args): reference_set = ReferenceSet.objects.get(name=reference_set_name) except DoesNotExist: reference_set = ReferenceSet.create_and_save(name=reference_set_name) - + try: reference = Reference.create_and_save( name=reference_set_name, reference_sets=[reference_set], md5checksum="NA" diff --git a/tests/annotation_tests/test_annotations.py b/tests/annotation_tests/test_annotations.py index 6d1bd152..0fd954a1 100644 --- a/tests/annotation_tests/test_annotations.py +++ b/tests/annotation_tests/test_annotations.py @@ -15,434 +15,449 @@ from mykrobe.variants.schema.models import Variant from mykrobe.variants.schema.models import VariantSet -DB = connect("mykrobe-test") - DATA_DIR = os.path.join("tests", "ref_data") -class TestRegions: - def teardown(self): - DB.drop_database("mykrobe-test") +@pytest.fixture(autouse=True) +def db_setup_teardown(): + DB = connect("mykrobe-test", uuidRepresentation="pythonLegacy") + DB.drop_database("mykrobe-test") + yield + DB.drop_database("mykrobe-test") - def setup(self): - DB.drop_database("mykrobe-test") - with open(f"{DATA_DIR}/NC_000962.3.fasta", "r") as infile: - self.reference_seq = list(SeqIO.parse(infile, "fasta"))[0].seq - self.gm = GeneAminoAcidChangeToDNAVariants( - reference=f"{DATA_DIR}/NC_000962.3.fasta", - genbank=f"{DATA_DIR}/NC_000962.3.gb", - ) - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.variant_set = VariantSet.create_and_save( - name="this_vcf_file", reference_set=self.reference_set - ) - self.variant_sets = [self.variant_set] - self.reference_id = Reference().create_and_save( - name="ref", md5checksum="sre", reference_sets=[self.reference_set] - ) +@pytest.fixture() +def reference_seq(): + with open(f"{DATA_DIR}/NC_000962.3.fasta", "r") as infile: + reference_seq = list(SeqIO.parse(infile, "fasta"))[0].seq + return reference_seq - def test_simple_gene(self): - g = Gene(name="rpoB", reference=self.reference_seq, start=759807, end=763325) - assert g.name == "rpoB" - assert g.forward - assert g.strand == "forward" - assert ( - g.seq - == "TTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA" - ) - assert ( - g.prot - == "LADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA" - ) - DB.drop_database("mykrobe-test") - - def test_reverse_gene(self): - g = Gene( - name="gidB", - reference=self.reference_seq, - start=4407528, - end=4408202, - forward=False, - ) - assert g.name == "gidB" - assert g.forward is False - assert g.strand == "reverse" - assert ( - g.seq - == "ATGTCTCCGATCGAGCCCGCGGCGTCTGCGATCTTCGGACCGCGGCTTGGCCTTGCTCGGCGGTACGCCGAAGCGTTGGCGGGACCCGGTGTGGAGCGGGGGCTGGTGGGACCCCGCGAAGTCGGTAGGCTATGGGACCGGCATCTACTGAACTGCGCCGTGATCGGTGAGCTCCTCGAACGCGGTGACCGGGTCGTGGATATCGGTAGCGGAGCCGGGTTGCCGGGCGTGCCATTGGCGATAGCGCGGCCGGACCTCCAGGTAGTTCTCCTAGAACCGCTACTGCGCCGCACCGAGTTTCTTCGAGAGATGGTGACAGATCTGGGCGTGGCCGTTGAGATCGTGCGGGGGCGCGCCGAGGAGTCCTGGGTGCAGGACCAATTGGGCGGCAGCGACGCTGCGGTGTCACGGGCGGTGGCCGCGTTGGACAAGTTGACGAAATGGAGCATGCCGTTGATACGGCCGAACGGGCGAATGCTCGCCATCAAAGGCGAGCGGGCTCACGACGAAGTACGGGAGCACCGGCGTGTGATGATCGCATCGGGCGCGGTTGATGTCAGGGTGGTGACATGTGGCGCGAACTATTTGCGTCCGCCCGCGACCGTGGTGTTCGCACGACGTGGAAAGCAGATCGCCCGAGGGTCGGCACGGATGGCGAGTGGAGGGACGGCGTGA" - ) - assert ( - g.prot - == "MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLAIARPDLQVVLLEPLLRRTEFLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRMLAIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA" - ) - DB.drop_database("mykrobe-test") - - def test_reverse_gene2(self): - g = Gene( - name="katG", - reference=self.reference_seq, - start=2153889, - end=2156111, - forward=False, - ) - assert g.name == "katG" - assert g.forward is False - assert g.strand == "reverse" - assert ( - g.seq - == "GTGCCCGAGCAACACCCACCCATTACAGAAACCACCACCGGAGCCGCTAGCAACGGCTGTCCCGTCGTGGGTCATATGAAATACCCCGTCGAGGGCGGCGGAAACCAGGACTGGTGGCCCAACCGGCTCAATCTGAAGGTACTGCACCAAAACCCGGCCGTCGCTGACCCGATGGGTGCGGCGTTCGACTATGCCGCGGAGGTCGCGACCATCGACGTTGACGCCCTGACGCGGGACATCGAGGAAGTGATGACCACCTCGCAGCCGTGGTGGCCCGCCGACTACGGCCACTACGGGCCGCTGTTTATCCGGATGGCGTGGCACGCTGCCGGCACCTACCGCATCCACGACGGCCGCGGCGGCGCCGGGGGCGGCATGCAGCGGTTCGCGCCGCTTAACAGCTGGCCCGACAACGCCAGCTTGGACAAGGCGCGCCGGCTGCTGTGGCCGGTCAAGAAGAAGTACGGCAAGAAGCTCTCATGGGCGGACCTGATTGTTTTCGCCGGCAACTGCGCGCTGGAATCGATGGGCTTCAAGACGTTCGGGTTCGGCTTCGGCCGGGTCGACCAGTGGGAGCCCGATGAGGTCTATTGGGGCAAGGAAGCCACCTGGCTCGGCGATGAGCGTTACAGCGGTAAGCGGGATCTGGAGAACCCGCTGGCCGCGGTGCAGATGGGGCTGATCTACGTGAACCCGGAGGGGCCGAACGGCAACCCGGACCCCATGGCCGCGGCGGTCGACATTCGCGAGACGTTTCGGCGCATGGCCATGAACGACGTCGAAACAGCGGCGCTGATCGTCGGCGGTCACACTTTCGGTAAGACCCATGGCGCCGGCCCGGCCGATCTGGTCGGCCCCGAACCCGAGGCTGCTCCGCTGGAGCAGATGGGCTTGGGCTGGAAGAGCTCGTATGGCACCGGAACCGGTAAGGACGCGATCACCAGCGGCATCGAGGTCGTATGGACGAACACCCCGACGAAATGGGACAACAGTTTCCTCGAGATCCTGTACGGCTACGAGTGGGAGCTGACGAAGAGCCCTGCTGGCGCTTGGCAATACACCGCCAAGGACGGCGCCGGTGCCGGCACCATCCCGGACCCGTTCGGCGGGCCAGGGCGCTCCCCGACGATGCTGGCCACTGACCTCTCGCTGCGGGTGGATCCGATCTATGAGCGGATCACGCGTCGCTGGCTGGAACACCCCGAGGAATTGGCCGACGAGTTCGCCAAGGCCTGGTACAAGCTGATCCACCGAGACATGGGTCCCGTTGCGAGATACCTTGGGCCGCTGGTCCCCAAGCAGACCCTGCTGTGGCAGGATCCGGTCCCTGCGGTCAGCCACGACCTCGTCGGCGAAGCCGAGATTGCCAGCCTTAAGAGCCAGATCCGGGCATCGGGATTGACTGTCTCACAGCTAGTTTCGACCGCATGGGCGGCGGCGTCGTCGTTCCGTGGTAGCGACAAGCGCGGCGGCGCCAACGGTGGTCGCATCCGCCTGCAGCCACAAGTCGGGTGGGAGGTCAACGACCCCGACGGGGATCTGCGCAAGGTCATTCGCACCCTGGAAGAGATCCAGGAGTCATTCAACTCCGCGGCGCCGGGGAACATCAAAGTGTCCTTCGCCGACCTCGTCGTGCTCGGTGGCTGTGCCGCCATAGAGAAAGCAGCAAAGGCGGCTGGCCACAACATCACGGTGCCCTTCACCCCGGGCCGCACGGATGCGTCGCAGGAACAAACCGACGTGGAATCCTTTGCCGTGCTGGAGCCCAAGGCAGATGGCTTCCGAAACTACCTCGGAAAGGGCAACCCGTTGCCGGCCGAGTACATGCTGCTCGACAAGGCGAACCTGCTTACGCTCAGTGCCCCTGAGATGACGGTGCTGGTAGGTGGCCTGCGCGTCCTCGGCGCAAACTACAAGCGCTTACCGCTGGGCGTGTTCACCGAGGCCTCCGAGTCACTGACCAACGACTTCTTCGTGAACCTGCTCGACATGGGTATCACCTGGGAGCCCTCGCCAGCAGATGACGGGACCTACCAGGGCAAGGATGGCAGTGGCAAGGTGAAGTGGACCGGCAGCCGCGTGGACCTGGTCTTCGGGTCCAACTCGGAGTTGCGGGCGCTTGTCGAGGTCTATGGCGCCGATGACGCGCAGCCGAAGTTCGTGCAGGACTTCGTCGCTGCCTGGGACAAGGTGATGAACCTCGACAGGTTCGACGTGCGCTGA" - ) - assert ( - g.prot - == "VPEQHPPITETTTGAASNGCPVVGHMKYPVEGGGNQDWWPNRLNLKVLHQNPAVADPMGAAFDYAAEVATIDVDALTRDIEEVMTTSQPWWPADYGHYGPLFIRMAWHAAGTYRIHDGRGGAGGGMQRFAPLNSWPDNASLDKARRLLWPVKKKYGKKLSWADLIVFAGNCALESMGFKTFGFGFGRVDQWEPDEVYWGKEATWLGDERYSGKRDLENPLAAVQMGLIYVNPEGPNGNPDPMAAAVDIRETFRRMAMNDVETAALIVGGHTFGKTHGAGPADLVGPEPEAAPLEQMGLGWKSSYGTGTGKDAITSGIEVVWTNTPTKWDNSFLEILYGYEWELTKSPAGAWQYTAKDGAGAGTIPDPFGGPGRSPTMLATDLSLRVDPIYERITRRWLEHPEELADEFAKAWYKLIHRDMGPVARYLGPLVPKQTLLWQDPVPAVSHDLVGEAEIASLKSQIRASGLTVSQLVSTAWAAASSFRGSDKRGGANGGRIRLQPQVGWEVNDPDGDLRKVIRTLEEIQESFNSAAPGNIKVSFADLVVLGGCAAIEKAAKAAGHNITVPFTPGRTDASQEQTDVESFAVLEPKADGFRNYLGKGNPLPAEYMLLDKANLLTLSAPEMTVLVGGLRVLGANYKRLPLGVFTEASESLTNDFFVNLLDMGITWEPSPADDGTYQGKDGSGKVKWTGSRVDLVFGSNSELRALVEVYGADDAQPKFVQDFVAAWDKVMNLDRFDVR" - ) - DB.drop_database("mykrobe-test") - - def test_get_codon(self): - g = Gene(name="rpoB", reference=self.reference_seq, start=759807, end=763325) - with pytest.raises(ValueError): - g.get_codon(1173) - assert g.get_codon(2) == "GCA" - assert g.get_codon(3) == "GAT" - assert g.get_reference_position(1) == 759807 - assert g.seq[0] == self.reference_seq[759806] - assert g.get_reference_position(-1) == 759806 - DB.drop_database("mykrobe-test") - - def test_get_codon_reverse(self): - g = Gene( - name="gidB", - reference=self.reference_seq, - start=4407528, - end=4408202, - forward=False, - ) - with pytest.raises(ValueError): - g.get_codon(225) - assert g.get_codon(2) == "TCT" - assert g.get_codon(3) == "CCG" - assert g.get_reference_position(1) == 4408202 - assert g.get_reference_position(2) == 4408201 - assert g.get_reference_position(-1) == 4408203 - assert g.get_reference_position(-2) == 4408204 - DB.drop_database("mykrobe-test") - - def test_gene_muts(self): - self.gm = GeneAminoAcidChangeToDNAVariants( - reference=f"{DATA_DIR}/NC_000962.3.fasta", - genbank=f"{DATA_DIR}/NC_000962.3.gb", - ) - assert self.gm.get_alts("K") == ["AAA", "AAG"] - # GAT -> ['GCA', 'GCT', 'GCC', 'GCG'], positions 759813,14,15 - assert sorted(self.gm.get_variant_names("rpoB", "D3A")) == sorted( - ["GAT759813GCA", "GAT759813GCT", "GAT759813GCC", "GAT759813GCG"] - ) - # GAT -> ['GCA', 'GCT', 'GCC', 'GCG'], positions 759813,14,15 - assert sorted(self.gm.get_variant_names("rpoB", "D3X")) == sorted( - [ - "GAT759813GCA", - "GAT759813GCT", - "GAT759813GCC", - "GAT759813GCG", - "GAT759813TGT", - "GAT759813TGC", - "GAT759813GAA", - "GAT759813GAG", - "GAT759813GGA", - "GAT759813GGT", - "GAT759813GGC", - "GAT759813GGG", - "GAT759813TTT", - "GAT759813TTC", - "GAT759813ATA", - "GAT759813ATT", - "GAT759813ATC", - "GAT759813CAT", - "GAT759813CAC", - "GAT759813AAA", - "GAT759813AAG", - "GAT759813ATG", - "GAT759813TTA", - "GAT759813TTG", - "GAT759813CTA", - "GAT759813CTT", - "GAT759813CTC", - "GAT759813CTG", - "GAT759813AAT", - "GAT759813AAC", - "GAT759813CAA", - "GAT759813CAG", - "GAT759813CCA", - "GAT759813CCT", - "GAT759813CCC", - "GAT759813CCG", - "GAT759813AGT", - "GAT759813AGC", - "GAT759813TCA", - "GAT759813TCT", - "GAT759813TCC", - "GAT759813TCG", - "GAT759813AGA", - "GAT759813AGG", - "GAT759813CGA", - "GAT759813CGT", - "GAT759813CGC", - "GAT759813CGG", - "GAT759813ACA", - "GAT759813ACT", - "GAT759813ACC", - "GAT759813ACG", - "GAT759813TGG", - "GAT759813GTA", - "GAT759813GTT", - "GAT759813GTC", - "GAT759813GTG", - "GAT759813TAT", - "GAT759813TAC", - ] - ) - DB.drop_database("mykrobe-test") - def test_gene_muts2(self): - self.gm = GeneAminoAcidChangeToDNAVariants( - reference=f"{DATA_DIR}/NC_000962.3.fasta", - genbank=f"{DATA_DIR}/NC_000962.3.gb", +@pytest.fixture() +def reference_id_and_variant_sets(reference_seq): + reference_set = ReferenceSet().create_and_save(name="ref_set") + variant_set = VariantSet.create_and_save( + name="this_vcf_file", reference_set=reference_set + ) + variant_sets = [variant_set] + reference_id = Reference().create_and_save( + name="ref", md5checksum="sre", reference_sets=[reference_set] + ) + return reference_id, variant_sets + + +@pytest.fixture() +def gm(): + return GeneAminoAcidChangeToDNAVariants( + reference=f"{DATA_DIR}/NC_000962.3.fasta", + genbank=f"{DATA_DIR}/NC_000962.3.gb", + ) + + +def test_simple_gene(reference_seq): + g = Gene(name="rpoB", reference=reference_seq, start=759807, end=763325) + assert g.name == "rpoB" + assert g.forward + assert g.strand == "forward" + assert ( + g.seq + == "TTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA" + ) + assert ( + g.prot + == "LADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA" + ) + + +def test_reverse_gene(reference_seq): + g = Gene( + name="gidB", + reference=reference_seq, + start=4407528, + end=4408202, + forward=False, + ) + assert g.name == "gidB" + assert g.forward is False + assert g.strand == "reverse" + assert ( + g.seq + == "ATGTCTCCGATCGAGCCCGCGGCGTCTGCGATCTTCGGACCGCGGCTTGGCCTTGCTCGGCGGTACGCCGAAGCGTTGGCGGGACCCGGTGTGGAGCGGGGGCTGGTGGGACCCCGCGAAGTCGGTAGGCTATGGGACCGGCATCTACTGAACTGCGCCGTGATCGGTGAGCTCCTCGAACGCGGTGACCGGGTCGTGGATATCGGTAGCGGAGCCGGGTTGCCGGGCGTGCCATTGGCGATAGCGCGGCCGGACCTCCAGGTAGTTCTCCTAGAACCGCTACTGCGCCGCACCGAGTTTCTTCGAGAGATGGTGACAGATCTGGGCGTGGCCGTTGAGATCGTGCGGGGGCGCGCCGAGGAGTCCTGGGTGCAGGACCAATTGGGCGGCAGCGACGCTGCGGTGTCACGGGCGGTGGCCGCGTTGGACAAGTTGACGAAATGGAGCATGCCGTTGATACGGCCGAACGGGCGAATGCTCGCCATCAAAGGCGAGCGGGCTCACGACGAAGTACGGGAGCACCGGCGTGTGATGATCGCATCGGGCGCGGTTGATGTCAGGGTGGTGACATGTGGCGCGAACTATTTGCGTCCGCCCGCGACCGTGGTGTTCGCACGACGTGGAAAGCAGATCGCCCGAGGGTCGGCACGGATGGCGAGTGGAGGGACGGCGTGA" + ) + assert ( + g.prot + == "MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLAIARPDLQVVLLEPLLRRTEFLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRMLAIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA" + ) + + +def test_reverse_gene2(reference_seq): + g = Gene( + name="katG", + reference=reference_seq, + start=2153889, + end=2156111, + forward=False, + ) + assert g.name == "katG" + assert g.forward is False + assert g.strand == "reverse" + assert ( + g.seq + == "GTGCCCGAGCAACACCCACCCATTACAGAAACCACCACCGGAGCCGCTAGCAACGGCTGTCCCGTCGTGGGTCATATGAAATACCCCGTCGAGGGCGGCGGAAACCAGGACTGGTGGCCCAACCGGCTCAATCTGAAGGTACTGCACCAAAACCCGGCCGTCGCTGACCCGATGGGTGCGGCGTTCGACTATGCCGCGGAGGTCGCGACCATCGACGTTGACGCCCTGACGCGGGACATCGAGGAAGTGATGACCACCTCGCAGCCGTGGTGGCCCGCCGACTACGGCCACTACGGGCCGCTGTTTATCCGGATGGCGTGGCACGCTGCCGGCACCTACCGCATCCACGACGGCCGCGGCGGCGCCGGGGGCGGCATGCAGCGGTTCGCGCCGCTTAACAGCTGGCCCGACAACGCCAGCTTGGACAAGGCGCGCCGGCTGCTGTGGCCGGTCAAGAAGAAGTACGGCAAGAAGCTCTCATGGGCGGACCTGATTGTTTTCGCCGGCAACTGCGCGCTGGAATCGATGGGCTTCAAGACGTTCGGGTTCGGCTTCGGCCGGGTCGACCAGTGGGAGCCCGATGAGGTCTATTGGGGCAAGGAAGCCACCTGGCTCGGCGATGAGCGTTACAGCGGTAAGCGGGATCTGGAGAACCCGCTGGCCGCGGTGCAGATGGGGCTGATCTACGTGAACCCGGAGGGGCCGAACGGCAACCCGGACCCCATGGCCGCGGCGGTCGACATTCGCGAGACGTTTCGGCGCATGGCCATGAACGACGTCGAAACAGCGGCGCTGATCGTCGGCGGTCACACTTTCGGTAAGACCCATGGCGCCGGCCCGGCCGATCTGGTCGGCCCCGAACCCGAGGCTGCTCCGCTGGAGCAGATGGGCTTGGGCTGGAAGAGCTCGTATGGCACCGGAACCGGTAAGGACGCGATCACCAGCGGCATCGAGGTCGTATGGACGAACACCCCGACGAAATGGGACAACAGTTTCCTCGAGATCCTGTACGGCTACGAGTGGGAGCTGACGAAGAGCCCTGCTGGCGCTTGGCAATACACCGCCAAGGACGGCGCCGGTGCCGGCACCATCCCGGACCCGTTCGGCGGGCCAGGGCGCTCCCCGACGATGCTGGCCACTGACCTCTCGCTGCGGGTGGATCCGATCTATGAGCGGATCACGCGTCGCTGGCTGGAACACCCCGAGGAATTGGCCGACGAGTTCGCCAAGGCCTGGTACAAGCTGATCCACCGAGACATGGGTCCCGTTGCGAGATACCTTGGGCCGCTGGTCCCCAAGCAGACCCTGCTGTGGCAGGATCCGGTCCCTGCGGTCAGCCACGACCTCGTCGGCGAAGCCGAGATTGCCAGCCTTAAGAGCCAGATCCGGGCATCGGGATTGACTGTCTCACAGCTAGTTTCGACCGCATGGGCGGCGGCGTCGTCGTTCCGTGGTAGCGACAAGCGCGGCGGCGCCAACGGTGGTCGCATCCGCCTGCAGCCACAAGTCGGGTGGGAGGTCAACGACCCCGACGGGGATCTGCGCAAGGTCATTCGCACCCTGGAAGAGATCCAGGAGTCATTCAACTCCGCGGCGCCGGGGAACATCAAAGTGTCCTTCGCCGACCTCGTCGTGCTCGGTGGCTGTGCCGCCATAGAGAAAGCAGCAAAGGCGGCTGGCCACAACATCACGGTGCCCTTCACCCCGGGCCGCACGGATGCGTCGCAGGAACAAACCGACGTGGAATCCTTTGCCGTGCTGGAGCCCAAGGCAGATGGCTTCCGAAACTACCTCGGAAAGGGCAACCCGTTGCCGGCCGAGTACATGCTGCTCGACAAGGCGAACCTGCTTACGCTCAGTGCCCCTGAGATGACGGTGCTGGTAGGTGGCCTGCGCGTCCTCGGCGCAAACTACAAGCGCTTACCGCTGGGCGTGTTCACCGAGGCCTCCGAGTCACTGACCAACGACTTCTTCGTGAACCTGCTCGACATGGGTATCACCTGGGAGCCCTCGCCAGCAGATGACGGGACCTACCAGGGCAAGGATGGCAGTGGCAAGGTGAAGTGGACCGGCAGCCGCGTGGACCTGGTCTTCGGGTCCAACTCGGAGTTGCGGGCGCTTGTCGAGGTCTATGGCGCCGATGACGCGCAGCCGAAGTTCGTGCAGGACTTCGTCGCTGCCTGGGACAAGGTGATGAACCTCGACAGGTTCGACGTGCGCTGA" + ) + assert ( + g.prot + == "VPEQHPPITETTTGAASNGCPVVGHMKYPVEGGGNQDWWPNRLNLKVLHQNPAVADPMGAAFDYAAEVATIDVDALTRDIEEVMTTSQPWWPADYGHYGPLFIRMAWHAAGTYRIHDGRGGAGGGMQRFAPLNSWPDNASLDKARRLLWPVKKKYGKKLSWADLIVFAGNCALESMGFKTFGFGFGRVDQWEPDEVYWGKEATWLGDERYSGKRDLENPLAAVQMGLIYVNPEGPNGNPDPMAAAVDIRETFRRMAMNDVETAALIVGGHTFGKTHGAGPADLVGPEPEAAPLEQMGLGWKSSYGTGTGKDAITSGIEVVWTNTPTKWDNSFLEILYGYEWELTKSPAGAWQYTAKDGAGAGTIPDPFGGPGRSPTMLATDLSLRVDPIYERITRRWLEHPEELADEFAKAWYKLIHRDMGPVARYLGPLVPKQTLLWQDPVPAVSHDLVGEAEIASLKSQIRASGLTVSQLVSTAWAAASSFRGSDKRGGANGGRIRLQPQVGWEVNDPDGDLRKVIRTLEEIQESFNSAAPGNIKVSFADLVVLGGCAAIEKAAKAAGHNITVPFTPGRTDASQEQTDVESFAVLEPKADGFRNYLGKGNPLPAEYMLLDKANLLTLSAPEMTVLVGGLRVLGANYKRLPLGVFTEASESLTNDFFVNLLDMGITWEPSPADDGTYQGKDGSGKVKWTGSRVDLVFGSNSELRALVEVYGADDAQPKFVQDFVAAWDKVMNLDRFDVR" + ) + + +def test_get_codon(reference_seq): + g = Gene(name="rpoB", reference=reference_seq, start=759807, end=763325) + with pytest.raises(ValueError): + g.get_codon(1173) + assert g.get_codon(2) == "GCA" + assert g.get_codon(3) == "GAT" + assert g.get_reference_position(1) == 759807 + assert g.seq[0] == reference_seq[759806] + assert g.get_reference_position(-1) == 759806 + + +def test_get_codon_reverse(reference_seq): + g = Gene( + name="gidB", + reference=reference_seq, + start=4407528, + end=4408202, + forward=False, + ) + with pytest.raises(ValueError): + g.get_codon(225) + assert g.get_codon(2) == "TCT" + assert g.get_codon(3) == "CCG" + assert g.get_reference_position(1) == 4408202 + assert g.get_reference_position(2) == 4408201 + assert g.get_reference_position(-1) == 4408203 + assert g.get_reference_position(-2) == 4408204 + + +def test_gene_muts(gm): + gm = GeneAminoAcidChangeToDNAVariants( + reference=f"{DATA_DIR}/NC_000962.3.fasta", + genbank=f"{DATA_DIR}/NC_000962.3.gb", + ) + assert gm.get_alts("K") == ["AAA", "AAG"] + # GAT -> ['GCA', 'GCT', 'GCC', 'GCG'], positions 759813,14,15 + assert sorted(gm.get_variant_names("rpoB", "D3A")) == sorted( + ["GAT759813GCA", "GAT759813GCT", "GAT759813GCC", "GAT759813GCG"] + ) + # GAT -> ['GCA', 'GCT', 'GCC', 'GCG'], positions 759813,14,15 + assert sorted(gm.get_variant_names("rpoB", "D3X")) == sorted( + [ + "GAT759813GCA", + "GAT759813GCT", + "GAT759813GCC", + "GAT759813GCG", + "GAT759813TGT", + "GAT759813TGC", + "GAT759813GAA", + "GAT759813GAG", + "GAT759813GGA", + "GAT759813GGT", + "GAT759813GGC", + "GAT759813GGG", + "GAT759813TTT", + "GAT759813TTC", + "GAT759813ATA", + "GAT759813ATT", + "GAT759813ATC", + "GAT759813CAT", + "GAT759813CAC", + "GAT759813AAA", + "GAT759813AAG", + "GAT759813ATG", + "GAT759813TTA", + "GAT759813TTG", + "GAT759813CTA", + "GAT759813CTT", + "GAT759813CTC", + "GAT759813CTG", + "GAT759813AAT", + "GAT759813AAC", + "GAT759813CAA", + "GAT759813CAG", + "GAT759813CCA", + "GAT759813CCT", + "GAT759813CCC", + "GAT759813CCG", + "GAT759813AGT", + "GAT759813AGC", + "GAT759813TCA", + "GAT759813TCT", + "GAT759813TCC", + "GAT759813TCG", + "GAT759813AGA", + "GAT759813AGG", + "GAT759813CGA", + "GAT759813CGT", + "GAT759813CGC", + "GAT759813CGG", + "GAT759813ACA", + "GAT759813ACT", + "GAT759813ACC", + "GAT759813ACG", + "GAT759813TGG", + "GAT759813GTA", + "GAT759813GTT", + "GAT759813GTC", + "GAT759813GTG", + "GAT759813TAT", + "GAT759813TAC", + ] + ) + + +def test_gene_muts2(gm): + gm = GeneAminoAcidChangeToDNAVariants( + reference=f"{DATA_DIR}/NC_000962.3.fasta", + genbank=f"{DATA_DIR}/NC_000962.3.gb", + ) + assert gm.get_alts("K") == ["AAA", "AAG"] + # AGC -> ['CTT', 'CTC', 'CTA', 'CTG'] + # # GAG -> ['GCA', 'GCT', 'GCC', 'GCG'] + # RC : CTC -> ['TGC',...] position2156103 + assert sorted(gm.get_variant_names("katG", "E3A")) == sorted( + ["CTC2156103TGC", "CTC2156103AGC", "CTC2156103GGC", "CTC2156103CGC"] + ) + + +def test_make_variant_panel1(reference_id_and_variant_sets, gm): + reference_id, variant_sets = reference_id_and_variant_sets + ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) + gene = gm.get_gene("rpoB") + for var in gm.get_variant_names("rpoB", "D3A"): + ref, start, alt = split_var_name(var) + v = Variant.create( + variant_sets=variant_sets, + reference=reference_id, + reference_bases=ref, + start=start, + alternate_bases=[alt], ) - assert self.gm.get_alts("K") == ["AAA", "AAG"] - # AGC -> ['CTT', 'CTC', 'CTA', 'CTG'] - # # GAG -> ['GCA', 'GCT', 'GCC', 'GCG'] - # RC : CTC -> ['TGC',...] position2156103 - assert sorted(self.gm.get_variant_names("katG", "E3A")) == sorted( - ["CTC2156103TGC", "CTC2156103AGC", "CTC2156103GGC", "CTC2156103CGC"] + panel = ag.create(v) + for alt in panel.alts: + seq = copy.copy(str(gene.seq)) + assert Seq(seq).translate()[2] == "D" + seq = seq.replace(panel.refs[0][25:], alt[24:]) + assert seq != str(gene.seq) + assert Seq(seq).translate()[2] == "A" + + +def test_make_variant_panel2(reference_id_and_variant_sets, gm): + reference_id, variant_sets = reference_id_and_variant_sets + ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) + gene = gm.get_gene("katG") + for var in gm.get_variant_names("katG", "E3A"): + ref, start, alt = split_var_name(var) + v = Variant.create( + variant_sets=variant_sets, + reference=reference_id, + reference_bases=ref, + start=start, + alternate_bases=[alt], ) - DB.drop_database("mykrobe-test") - - def test_make_variant_panel1(self): - ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) - gene = self.gm.get_gene("rpoB") - for var in self.gm.get_variant_names("rpoB", "D3A"): - ref, start, alt = split_var_name(var) - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference_id, - reference_bases=ref, - start=start, - alternate_bases=[alt], - ) - panel = ag.create(v) - for alt in panel.alts: - seq = copy.copy(str(gene.seq)) - assert Seq(seq).translate()[2] == "D" - seq = seq.replace(panel.refs[0][25:], alt[24:]) - assert seq != str(gene.seq) - assert Seq(seq).translate()[2] == "A" - DB.drop_database("mykrobe-test") - - def test_make_variant_panel2(self): - ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) - gene = self.gm.get_gene("katG") - for var in self.gm.get_variant_names("katG", "E3A"): - ref, start, alt = split_var_name(var) - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference_id, - reference_bases=ref, - start=start, - alternate_bases=[alt], - ) - panel = ag.create(v) - for alt in panel.alts: - seq = copy.copy(str(gene.seq.reverse_complement())) - seq = seq.replace( - panel.refs[0][:39], alt[: 39 + len(alt) - len(panel.refs[0])] - ) - assert seq != str(gene.seq) - assert Seq(seq).reverse_complement().translate()[2] == "A" - DB.drop_database("mykrobe-test") - - def test_make_variant_panel3(self): - ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta") - gene = self.gm.get_gene("katG") - for var in self.gm.get_variant_names("katG", "S315L"): - ref, start, alt = split_var_name(var) - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference_id, - reference_bases=ref, - start=start, - alternate_bases=[alt], - ) - panel = ag.create(v) - for alt in panel.alts: - seq = copy.copy(str(gene.seq.reverse_complement())) - seq = seq.replace(panel.refs[0], alt) - assert seq != str(gene.seq) - assert Seq(seq).reverse_complement().translate()[314] == "L" - DB.drop_database("mykrobe-test") - - def test_make_variant_panel4(self): - ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta") - gene = self.gm.get_gene("katG") - for var in self.gm.get_variant_names("katG", "W90R"): - ref, start, alt = split_var_name(var) - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference_id, - reference_bases=ref, - start=start, - alternate_bases=[alt], + panel = ag.create(v) + for alt in panel.alts: + seq = copy.copy(str(gene.seq.reverse_complement())) + seq = seq.replace( + panel.refs[0][:39], alt[: 39 + len(alt) - len(panel.refs[0])] ) - panel = ag.create(v) - for alt in panel.alts: - seq = copy.copy(str(gene.seq.reverse_complement())) - seq = seq.replace(panel.refs[0], alt) - assert seq != str(gene.seq) - assert Seq(seq).reverse_complement().translate()[89] == "R" - DB.drop_database("mykrobe-test") - - def test_make_variant_panel5(self): - ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta") - gene = self.gm.get_gene("gyrA") - for var in self.gm.get_variant_names("gyrA", "D94X"): - ref, start, alt = split_var_name(var) - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference_id, - reference_bases=ref, - start=start, - alternate_bases=[alt], + assert seq != str(gene.seq) + # start with first codon in seq, so is multiple of 3, otherwise biopython warning + assert ( + Seq(seq[-(len(gene.seq) - 3) :]).reverse_complement().translate()[2] + == "A" ) - panel = ag.create(v) - for alt in panel.alts: - seq = copy.copy(str(gene.seq)) - seq = seq.replace(panel.refs[0], alt) - assert Seq(seq).translate()[93] != "D" - DB.drop_database("mykrobe-test") - - def test_make_variant_panel6(self): - ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) - gene = self.gm.get_gene("pncA") - variants = list( - self.gm.get_variant_names("pncA", "CAG28TAA", protein_coding_var=False) - ) - assert len(variants) == 1 - var = variants[0] + + +def test_make_variant_panel3(reference_id_and_variant_sets, gm): + reference_id, variant_sets = reference_id_and_variant_sets + ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta") + gene = gm.get_gene("katG") + for var in gm.get_variant_names("katG", "S315L"): ref, start, alt = split_var_name(var) - assert ref == "CTG" - assert start == 2289212 - assert alt == "TTA" v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference_id, + variant_sets=variant_sets, + reference=reference_id, reference_bases=ref, start=start, alternate_bases=[alt], ) panel = ag.create(v) - assert len(panel.alts) == 1 - alt = panel.alts[0] - # the panel ref/alt seqs go past the end of the gene, - # so can't comparie against gene sequence. Need to get - # subseq from the reference seq - panel_ref_start = self.reference_seq.find(panel.refs[0]) - assert panel_ref_start < start < panel_ref_start + len(panel.refs[0]) - seq = str( - self.reference_seq[panel_ref_start : panel_ref_start + len(panel.refs[0])] - ) - assert seq == panel.refs[0] - assert alt == seq[:30] + "TTA" + seq[33:] - DB.drop_database("mykrobe-test") - - def test_make_variant_panel7(self): - # Test DNA change upstream of a gene on the reverse - # strand. The variant G-10A is in "gene space", ie - # 10 bases upstream of eis is the nucleotide G on the - # reverse strand. That position is 2715342 in the genome, - # and is C on the forwards strand. - # Here's a diagram: - # | <- This C is at -10 in "gene space", so variant G-10A has ref=G - # | ref coord is 2715342, and variant in "ref space" is C2715342T - # CACAGAATCCGACTGTGGCATATGCCGC - # | - # | <- C = last nucleotide of gene, at 2715332 - ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) - gene = self.gm.get_gene("eis") - variants = list( - self.gm.get_variant_names("eis", "G-10A", protein_coding_var=False) - ) - assert len(variants) == 1 - var = variants[0] + for alt in panel.alts: + seq = copy.copy(str(gene.seq.reverse_complement())) + seq = seq.replace(panel.refs[0], alt) + assert seq != str(gene.seq) + # start with first codon in seq, so is multiple of 3, otherwise biopython warning + assert ( + Seq(seq[-(len(gene.seq) - 3) :]).reverse_complement().translate()[314] + == "L" + ) + + +def test_make_variant_panel4(reference_id_and_variant_sets, gm): + reference_id, variant_sets = reference_id_and_variant_sets + ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta") + gene = gm.get_gene("katG") + for var in gm.get_variant_names("katG", "W90R"): ref, start, alt = split_var_name(var) - assert ref == "C" - assert start == 2715342 - assert alt == "T" v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference_id, + variant_sets=variant_sets, + reference=reference_id, reference_bases=ref, start=start, alternate_bases=[alt], ) panel = ag.create(v) - assert len(panel.alts) == 1 - alt = panel.alts[0] - # the panel ref/alt seqs go past the end of the gene, - # so can't comparie against gene sequence. Need to get - # subseq from the reference seq - panel_ref_start = self.reference_seq.find(panel.refs[0]) - assert panel_ref_start < start < panel_ref_start + len(panel.refs[0]) - seq = str( - self.reference_seq[panel_ref_start : panel_ref_start + len(panel.refs[0])] - ) - assert seq == panel.refs[0] - assert alt == seq[:30] + "T" + seq[31:] - DB.drop_database("mykrobe-test") - - def test_make_variant_panel8(self): - ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) - gene = self.gm.get_gene("eis") - variants = list( - self.gm.get_variant_names("eis", "TG-1T", protein_coding_var=False) - ) - assert len(variants) == 1 - var = variants[0] + for alt in panel.alts: + seq = copy.copy(str(gene.seq.reverse_complement())) + seq = seq.replace(panel.refs[0], alt) + assert seq != str(gene.seq) + # start with first codon in seq, so is multiple of 3, otherwise biopython warning + assert ( + Seq(seq[-(len(gene.seq) - 3) :]).reverse_complement().translate()[89] + == "R" + ) + + +def test_make_variant_panel5(reference_id_and_variant_sets, gm): + reference_id, variant_sets = reference_id_and_variant_sets + ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta") + gene = gm.get_gene("gyrA") + for var in gm.get_variant_names("gyrA", "D94X"): ref, start, alt = split_var_name(var) - assert ref == "CA" - assert start == 2715332 - assert alt == "A" v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference_id, + variant_sets=variant_sets, + reference=reference_id, reference_bases=ref, start=start, alternate_bases=[alt], ) panel = ag.create(v) - assert len(panel.alts) == 1 - alt = panel.alts[0] - # the panel ref/alt seqs go past the end of the gene, - # so can't comparie against gene sequence. Need to get - # subseq from the reference seq - panel_ref_start = self.reference_seq.find(panel.refs[0]) - assert panel_ref_start < start < panel_ref_start + len(panel.refs[0]) - seq = str( - self.reference_seq[panel_ref_start : panel_ref_start + len(panel.refs[0])] - ) - assert seq == panel.refs[0] - print(alt, seq[:31] + seq[31:]) - assert alt == seq[:30] + seq[31:] - DB.drop_database("mykrobe-test") - - def test_make_variant_panel_stop_codon(self): - variants = list( - self.gm.get_variant_names("katG", "W90*", protein_coding_var=True) - ) - assert len(variants) == 3 - refs = sorted([split_var_name(v)[0] for v in variants]) - alts = sorted([split_var_name(v)[-1] for v in variants]) - var = variants[0] - ref, start, alt = split_var_name(var) - assert start == 2155842 - assert refs == ["CCA"] * 3 - assert alts == sorted(["TTA", "CTA", "TCA"]) + for alt in panel.alts: + seq = copy.copy(str(gene.seq)) + seq = seq.replace(panel.refs[0], alt) + # start with first codon in seq, so is multiple of 3, otherwise biopython warning + assert Seq(seq[-(len(gene.seq) - 3) :]).translate()[93] != "D" + + +def test_make_variant_panel6(reference_id_and_variant_sets, gm, reference_seq): + reference_id, variant_sets = reference_id_and_variant_sets + ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) + variants = list(gm.get_variant_names("pncA", "CAG28TAA", protein_coding_var=False)) + assert len(variants) == 1 + var = variants[0] + ref, start, alt = split_var_name(var) + assert ref == "CTG" + assert start == 2289212 + assert alt == "TTA" + v = Variant.create( + variant_sets=variant_sets, + reference=reference_id, + reference_bases=ref, + start=start, + alternate_bases=[alt], + ) + panel = ag.create(v) + assert len(panel.alts) == 1 + alt = panel.alts[0] + # the panel ref/alt seqs go past the end of the gene, + # so can't comparie against gene sequence. Need to get + # subseq from the reference seq + panel_ref_start = reference_seq.find(panel.refs[0]) + assert panel_ref_start < start < panel_ref_start + len(panel.refs[0]) + seq = str(reference_seq[panel_ref_start : panel_ref_start + len(panel.refs[0])]) + assert seq == panel.refs[0] + assert alt == seq[:30] + "TTA" + seq[33:] + + +def test_make_variant_panel7(reference_id_and_variant_sets, gm, reference_seq): + reference_id, variant_sets = reference_id_and_variant_sets + # Test DNA change upstream of a gene on the reverse + # strand. The variant G-10A is in "gene space", ie + # 10 bases upstream of eis is the nucleotide G on the + # reverse strand. That position is 2715342 in the genome, + # and is C on the forwards strand. + # Here's a diagram: + # | <- This C is at -10 in "gene space", so variant G-10A has ref=G + # | ref coord is 2715342, and variant in "ref space" is C2715342T + # CACAGAATCCGACTGTGGCATATGCCGC + # | + # | <- C = last nucleotide of gene, at 2715332 + ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) + variants = list(gm.get_variant_names("eis", "G-10A", protein_coding_var=False)) + assert len(variants) == 1 + var = variants[0] + ref, start, alt = split_var_name(var) + assert ref == "C" + assert start == 2715342 + assert alt == "T" + v = Variant.create( + variant_sets=variant_sets, + reference=reference_id, + reference_bases=ref, + start=start, + alternate_bases=[alt], + ) + panel = ag.create(v) + assert len(panel.alts) == 1 + alt = panel.alts[0] + # the panel ref/alt seqs go past the end of the gene, + # so can't comparie against gene sequence. Need to get + # subseq from the reference seq + panel_ref_start = reference_seq.find(panel.refs[0]) + assert panel_ref_start < start < panel_ref_start + len(panel.refs[0]) + seq = str(reference_seq[panel_ref_start : panel_ref_start + len(panel.refs[0])]) + assert seq == panel.refs[0] + assert alt == seq[:30] + "T" + seq[31:] + + +def test_make_variant_panel8(reference_id_and_variant_sets, gm, reference_seq): + reference_id, variant_sets = reference_id_and_variant_sets + ag = AlleleGenerator(f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) + variants = list(gm.get_variant_names("eis", "TG-1T", protein_coding_var=False)) + assert len(variants) == 1 + var = variants[0] + ref, start, alt = split_var_name(var) + assert ref == "CA" + assert start == 2715332 + assert alt == "A" + v = Variant.create( + variant_sets=variant_sets, + reference=reference_id, + reference_bases=ref, + start=start, + alternate_bases=[alt], + ) + panel = ag.create(v) + assert len(panel.alts) == 1 + alt = panel.alts[0] + # the panel ref/alt seqs go past the end of the gene, + # so can't comparie against gene sequence. Need to get + # subseq from the reference seq + panel_ref_start = reference_seq.find(panel.refs[0]) + assert panel_ref_start < start < panel_ref_start + len(panel.refs[0]) + seq = str(reference_seq[panel_ref_start : panel_ref_start + len(panel.refs[0])]) + assert seq == panel.refs[0] + assert alt == seq[:30] + seq[31:] + + +def test_make_variant_panel_stop_codon(gm): + variants = list(gm.get_variant_names("katG", "W90*", protein_coding_var=True)) + assert len(variants) == 3 + refs = sorted([split_var_name(v)[0] for v in variants]) + alts = sorted([split_var_name(v)[-1] for v in variants]) + var = variants[0] + ref, start, alt = split_var_name(var) + assert start == 2155842 + assert refs == ["CCA"] * 3 + assert alts == sorted(["TTA", "CTA", "TCA"]) diff --git a/tests/cmds_tests/amr_test.py b/tests/cmds_tests/amr_test.py index cfe68ffe..9c95ec6c 100644 --- a/tests/cmds_tests/amr_test.py +++ b/tests/cmds_tests/amr_test.py @@ -1,5 +1,4 @@ import copy -import pytest from mykrobe import amr diff --git a/tests/end_to_end_test.py b/tests/end_to_end_test.py index 9c9c014b..71221c00 100644 --- a/tests/end_to_end_test.py +++ b/tests/end_to_end_test.py @@ -1,6 +1,5 @@ import json import os -import pytest import subprocess from mykrobe.parser import parser diff --git a/tests/metagenomics_tests/basic_test.py b/tests/metagenomics_tests/basic_test.py index 5bee4221..c8fdab35 100644 --- a/tests/metagenomics_tests/basic_test.py +++ b/tests/metagenomics_tests/basic_test.py @@ -1,54 +1,37 @@ -from unittest import TestCase from mykrobe.metagenomics import AMRSpeciesPredictor -class MTBCSpeciesTests(TestCase): - - def setUp(self): - pass - - def teardown(self): - pass - - def test_mixed_chimera(self): - species_predictor = AMRSpeciesPredictor( - phylo_group_covgs={}, - sub_complex_covgs={}, - species_covgs={}, - lineage_covgs={} - ) - species_predictor.out_json["phylogenetics"] = { - "sub_complex": { - "Mycobacterium_avium_complex": { - "percent_coverage": 98.346, - "median_depth": 54.0 - } - }, - "phylo_group": { - "Non_tuberculosis_mycobacterium_complex": { - "percent_coverage": 82.846, - "median_depth": 49 - } - }, - "species": { - "Mycobacterium_chimaera": { - "percent_coverage": 99.162, - "median_depth": 39 - }, - "Mycobacterium_intracellulare": { - "percent_coverage": 98.662, - "median_depth": 45 - }, - "Mycobacterium_bovis": { - "percent_coverage": 9.894, - "median_depth": 12.0 - } +def test_mixed_chimera(): + species_predictor = AMRSpeciesPredictor( + phylo_group_covgs={}, sub_complex_covgs={}, species_covgs={}, lineage_covgs={} + ) + species_predictor.out_json["phylogenetics"] = { + "sub_complex": { + "Mycobacterium_avium_complex": { + "percent_coverage": 98.346, + "median_depth": 54.0, + } + }, + "phylo_group": { + "Non_tuberculosis_mycobacterium_complex": { + "percent_coverage": 82.846, + "median_depth": 49, } - } + }, + "species": { + "Mycobacterium_chimaera": {"percent_coverage": 99.162, "median_depth": 39}, + "Mycobacterium_intracellulare": { + "percent_coverage": 98.662, + "median_depth": 45, + }, + "Mycobacterium_bovis": {"percent_coverage": 9.894, "median_depth": 12.0}, + }, + } - out_dict = species_predictor.choose_best( - species_predictor.out_json["phylogenetics"]) + out_dict = species_predictor.choose_best( + species_predictor.out_json["phylogenetics"] + ) - assert "Mycobacterium_chimaera" in out_dict["species"] - assert "Mycobacterium_intracellulare" in out_dict["species"] - assert "Mycobacterium_bovis" not in out_dict["species"] + assert "Mycobacterium_chimaera" in out_dict["species"] + assert "Mycobacterium_intracellulare" in out_dict["species"] + assert "Mycobacterium_bovis" not in out_dict["species"] diff --git a/tests/metagenomics_tests/lineages_test.py b/tests/metagenomics_tests/lineages_test.py index f59a3043..45717d0e 100644 --- a/tests/metagenomics_tests/lineages_test.py +++ b/tests/metagenomics_tests/lineages_test.py @@ -1,8 +1,6 @@ import copy import json -import pytest -import anytree from anytree.exporter import JsonExporter from mykrobe.metagenomics import LineagePredictor @@ -14,6 +12,7 @@ def test_lineage_to_itself_plus_parents(): assert f("a.1") == ["a", "a.1"] assert f("a.1.2") == ["a", "a.1", "a.1.2"] + def test_constructor_makes_tree(): # Note: deliberately miss out lineage1.1 to check that node # still gets made. We're allowing the user to not specify every single @@ -36,10 +35,16 @@ def test_constructor_makes_tree(): "name": "lineage1", "children": [ {"name": "lineage1.2"}, - {"name": "lineage1.1", "children": [{"name": "lineage1.1.1"}],}, + { + "name": "lineage1.1", + "children": [{"name": "lineage1.1.1"}], + }, ], }, - {"name": "lineage2", "children": [{"name": "lineage2.1"}],}, + { + "name": "lineage2", + "children": [{"name": "lineage2.1"}], + }, ], } assert got_tree == expect_tree @@ -48,11 +53,20 @@ def test_constructor_makes_tree(): def test_score_each_lineage_node(): lineage_calls = { "lineage1": { - "var1": {"genotype": [0, 0], "info": {"filter": [], "conf": 500},}, - "var1a": {"genotype": [1, 1], "info": {"filter": [], "conf": 1000},}, + "var1": { + "genotype": [0, 0], + "info": {"filter": [], "conf": 500}, + }, + "var1a": { + "genotype": [1, 1], + "info": {"filter": [], "conf": 1000}, + }, }, "lineage1.1": { - "var1.1": {"genotype": [1, 1], "info": {"filter": [], "conf": 20},}, + "var1.1": { + "genotype": [1, 1], + "info": {"filter": [], "conf": 20}, + }, }, } variant_to_lineage = { @@ -79,18 +93,38 @@ def test_get_paths_and_scores(): lineage_calls = { "lineage1": { - "var1": {"genotype": [0, 0], "info": {"filter": [], "conf": 500},}, - "var1a": {"genotype": [1, 1], "info": {"filter": [], "conf": 1000},}, + "var1": { + "genotype": [0, 0], + "info": {"filter": [], "conf": 500}, + }, + "var1a": { + "genotype": [1, 1], + "info": {"filter": [], "conf": 1000}, + }, }, "lineage1.1": { - "var1.1": {"genotype": [1, 1], "info": {"filter": [], "conf": 1000},}, + "var1.1": { + "genotype": [1, 1], + "info": {"filter": [], "conf": 1000}, + }, }, "lineage1.1.1": { - "var1.1.1": {"genotype": [1, 1], "info": {"filter": [], "conf": 100},}, + "var1.1.1": { + "genotype": [1, 1], + "info": {"filter": [], "conf": 100}, + }, + }, + "lineage2": { + "var2": { + "genotype": [1, 1], + "info": {"filter": [], "conf": 1}, + }, }, - "lineage2": {"var2": {"genotype": [1, 1], "info": {"filter": [], "conf": 1},},}, "lineage2.1": { - "var2": {"genotype": [0, 0], "info": {"filter": [], "conf": 100},}, + "var2": { + "genotype": [0, 0], + "info": {"filter": [], "conf": 100}, + }, }, } @@ -128,9 +162,16 @@ def test_call_lineage_using_conf_scores(): var2_1_call = {"genotype": [1, 1], "info": {"filter": [], "conf": 500}} lineage_calls = { - "lineage1": {"var1": var1_call, "var1a": var1a_call,}, - "lineage1.1": {"var1.1": var1_1_call,}, - "lineage1.1.1": {"var1.1.1": var1_1_1_call,}, + "lineage1": { + "var1": var1_call, + "var1a": var1a_call, + }, + "lineage1.1": { + "var1.1": var1_1_call, + }, + "lineage1.1.1": { + "var1.1.1": var1_1_1_call, + }, } expect = { @@ -151,8 +192,13 @@ def test_call_lineage_using_conf_scores(): def test_genotype_each_lineage_node(): lineage_calls = { - "lineage1": {"var1": {"genotype": [0, 0]}, "var1a": {"genotype": [1, 1]},}, - "lineage1.1": {"var1.1": {"genotype": [0, 1]},}, + "lineage1": { + "var1": {"genotype": [0, 0]}, + "var1a": {"genotype": [1, 1]}, + }, + "lineage1.1": { + "var1.1": {"genotype": [0, 1]}, + }, } variant_to_lineage = { "var1": {"name": "lineage1", "use_ref_allele": True}, @@ -177,24 +223,35 @@ def test_get_good_paths_using_genotype_calls(): } lineage_calls = { - "lineage1": {"var1": {"genotype": [0, 0],}, "var1a": {"genotype": [1, 1]},}, - "lineage1.1": {"var1.1": {"genotype": [1, 1]},}, - "lineage1.1.1": {"var1.1.1": {"genotype": [1, 1]},}, - "lineage2": {"var2": {"genotype": [1, 1]},}, - "lineage2.1": {"var2": {"genotype": [0, 0]},}, + "lineage1": { + "var1": { + "genotype": [0, 0], + }, + "var1a": {"genotype": [1, 1]}, + }, + "lineage1.1": { + "var1.1": {"genotype": [1, 1]}, + }, + "lineage1.1.1": { + "var1.1.1": {"genotype": [1, 1]}, + }, + "lineage2": { + "var2": {"genotype": [1, 1]}, + }, + "lineage2.1": { + "var2": {"genotype": [0, 0]}, + }, } lin_pred = LineagePredictor(variant_to_lineage) got = lin_pred._get_good_paths_using_genotype_calls(lineage_calls) expect = { - "lineage1.1.1": - { + "lineage1.1.1": { "genotypes": {"lineage1": 1, "lineage1.1": 1, "lineage1.1.1": 1}, "good_nodes": 3, "tree_depth": 3, }, - "lineage2": - { + "lineage2": { "genotypes": {"lineage2": 1}, "good_nodes": 1, "tree_depth": 1, @@ -208,7 +265,11 @@ def test_call_lineage(): "var1": {"name": "lineage1", "use_ref_allele": False}, "var1a": {"name": "lineage1", "use_ref_allele": False}, "var1.1": {"name": "lineage1.1", "use_ref_allele": False}, - "var1.1.1": {"name": "lineage1.1.1", "use_ref_allele": False, "report_name": "l1-1-1"}, + "var1.1.1": { + "name": "lineage1.1.1", + "use_ref_allele": False, + "report_name": "l1-1-1", + }, "var2": {"name": "lineage2", "use_ref_allele": False}, "var2.1": {"name": "lineage2.1", "use_ref_allele": False}, } @@ -223,9 +284,16 @@ def test_call_lineage(): var2_1_call = {"genotype": [1, 1]} lineage_calls = { - "lineage1": {"var1": var1_call, "var1a": var1a_call,}, - "lineage1.1": {"var1.1": var1_1_call,}, - "lineage1.1.1": {"var1.1.1": var1_1_1_call,}, + "lineage1": { + "var1": var1_call, + "var1a": var1a_call, + }, + "lineage1.1": { + "var1.1": var1_1_call, + }, + "lineage1.1.1": { + "var1.1.1": var1_1_1_call, + }, } expect_lineage_calls = copy.deepcopy(lineage_calls) @@ -241,7 +309,7 @@ def test_call_lineage(): "good_nodes": 3, "tree_depth": 3, }, - } + }, } assert lin_pred.call_lineage(lineage_calls) == expect @@ -253,7 +321,7 @@ def test_call_lineage(): "lineage2.1": {"var2.1": var2_1_call}, } expect["calls_summary"]["lineage2.1"] = { - "genotypes": {'lineage2': 1, 'lineage2.1': 1}, + "genotypes": {"lineage2": 1, "lineage2.1": 1}, "good_nodes": 2, "tree_depth": 2, } diff --git a/tests/metagenomics_tests/test_MTBC.py b/tests/metagenomics_tests/test_MTBC.py index 7450a204..9ce3bbd8 100644 --- a/tests/metagenomics_tests/test_MTBC.py +++ b/tests/metagenomics_tests/test_MTBC.py @@ -1,105 +1,88 @@ import os -from unittest import TestCase -import sys -sys.path.append(".") +import pytest + from mykrobe.metagenomics import AMRSpeciesPredictor -DATA_DIR = os.path.join("tests", "ref_data") -class MTBCSpeciesTests(TestCase): +@pytest.fixture() +def species_predictor(): + hierarchy_json_file = os.path.join("tests", "ref_data", "mtbc_hierarchy.json") + return AMRSpeciesPredictor( + phylo_group_covgs={}, + sub_complex_covgs={}, + species_covgs={}, + lineage_covgs={}, + hierarchy_json_file=hierarchy_json_file, + ) - def setUp(self): - self.hierarchy_json_file = f"{DATA_DIR}/mtbc_hierarchy.json" - def teardown(self): - pass +def test_mixed_MTBC_NTM(species_predictor): + species_predictor.out_json["phylogenetics"] = {} + species_predictor.out_json["phylogenetics"]["phylo_group"] = { + "Non_tuberculosis_mycobacterium_complex": { + "percent_coverage": 58.71542975006994, + "median_depth": 36, + }, + "Mycobacterium_tuberculosis_complex": { + "percent_coverage": 62.81850563578579, + "median_depth": 2, + }, + } + assert species_predictor.is_mtbc_present() + assert species_predictor.is_ntm_present() - def test_mixed_MTBC_NTM(self): - species_predictor = AMRSpeciesPredictor( - phylo_group_covgs={}, - sub_complex_covgs={}, - species_covgs={}, - lineage_covgs={}, - hierarchy_json_file=self.hierarchy_json_file) - species_predictor.out_json["phylogenetics"] = {} - species_predictor.out_json["phylogenetics"]["phylo_group"] = { - "Non_tuberculosis_mycobacterium_complex": { - "percent_coverage": 58.71542975006994, - "median_depth": 36 - }, - "Mycobacterium_tuberculosis_complex": { - "percent_coverage": 62.81850563578579, - "median_depth": 2 - } - } - assert species_predictor.is_mtbc_present() - assert species_predictor.is_ntm_present() + assert ( + len( + species_predictor._get_present_phylo_groups( + species_predictor.out_json["phylogenetics"]["phylo_group"] + ) + ) + == 2 + ) - assert len(species_predictor._get_present_phylo_groups( - species_predictor.out_json["phylogenetics"]["phylo_group"])) == 2 - def test_get_best_coverage_dict(self): - species_predictor = AMRSpeciesPredictor( - phylo_group_covgs={}, - sub_complex_covgs={}, - species_covgs={}, - lineage_covgs={}, - hierarchy_json_file=self.hierarchy_json_file) - - best_species = species_predictor._get_best_coverage_dict({ - "Mycobacterium_chimaera": { - "percent_coverage": 99.162, - "median_depth": 39 - }, +def test_get_best_coverage_dict(species_predictor): + best_species = species_predictor._get_best_coverage_dict( + { + "Mycobacterium_chimaera": {"percent_coverage": 99.162, "median_depth": 39}, "Mycobacterium_intracellulare": { "percent_coverage": 98.662, - "median_depth": 45 + "median_depth": 45, }, - "Mycobacterium_bovis": { - "percent_coverage": 9.894, - "median_depth": 12.0 - }}).keys() - assert list(best_species) == ["Mycobacterium_chimaera"] + "Mycobacterium_bovis": {"percent_coverage": 9.894, "median_depth": 12.0}, + } + ).keys() + assert list(best_species) == ["Mycobacterium_chimaera"] - def test_mixed_chimera(self): - species_predictor = AMRSpeciesPredictor( - phylo_group_covgs={}, - sub_complex_covgs={}, - species_covgs={}, - lineage_covgs={}, - hierarchy_json_file=self.hierarchy_json_file) - species_predictor.out_json["phylogenetics"] = { - "sub_complex": { - "Mycobacterium_avium_complex": { - "percent_coverage": 98.346, - "median_depth": 54.0 - } - }, - "phylo_group": { - "Non_tuberculosis_mycobacterium_complex": { - "percent_coverage": 82.846, - "median_depth": 49 - } - }, - "species": { - "Mycobacterium_chimaera": { - "percent_coverage": 99.162, - "median_depth": 39 - }, - "Mycobacterium_intracellulare": { - "percent_coverage": 98.662, - "median_depth": 45 - }, - "Mycobacterium_bovis": { - "percent_coverage": 9.894, - "median_depth": 12.0 - } + +def test_mixed_chimera(species_predictor): + species_predictor.out_json["phylogenetics"] = { + "sub_complex": { + "Mycobacterium_avium_complex": { + "percent_coverage": 98.346, + "median_depth": 54.0, } - } + }, + "phylo_group": { + "Non_tuberculosis_mycobacterium_complex": { + "percent_coverage": 82.846, + "median_depth": 49, + } + }, + "species": { + "Mycobacterium_chimaera": {"percent_coverage": 99.162, "median_depth": 39}, + "Mycobacterium_intracellulare": { + "percent_coverage": 98.662, + "median_depth": 45, + }, + "Mycobacterium_bovis": {"percent_coverage": 9.894, "median_depth": 12.0}, + }, + } - out_dict = species_predictor.choose_best( - species_predictor.out_json["phylogenetics"]) + out_dict = species_predictor.choose_best( + species_predictor.out_json["phylogenetics"] + ) - assert "Mycobacterium_chimaera" in out_dict["species"] - assert "Mycobacterium_intracellulare" in out_dict["species"] - assert "Mycobacterium_bovis" not in out_dict["species"] + assert "Mycobacterium_chimaera" in out_dict["species"] + assert "Mycobacterium_intracellulare" in out_dict["species"] + assert "Mycobacterium_bovis" not in out_dict["species"] diff --git a/tests/predict_tests/test_amr.py b/tests/predict_tests/test_amr.py index cec7be6b..5fb29959 100644 --- a/tests/predict_tests/test_amr.py +++ b/tests/predict_tests/test_amr.py @@ -1,45 +1,42 @@ import os -from unittest import TestCase -import pytest from mykrobe.predict import TBPredictor from mykrobe.variants.schema.models import Variant -DATA_DIR = os.path.join("tests", "ref_data") -class AMRPredictTest(TestCase): - def setUp(self): - self.variant_snp = Variant.create( - start=0, end=1, reference_bases="A", alternate_bases=["T"] - ) - - self.predictor = TBPredictor(variant_calls={}, called_genes={}, variant_to_resistance_json_fp=os.path.join(DATA_DIR, "tb_variant_to_resistance_drug.json")) - - def teardown(self): - pass - - def test_wt_vars(self): - call = { - "variant": None, - "genotype": [0, 1], - "genotype_likelihoods": [0.1, 0.9, 0.12], - "info": { - "contamination_depths": [], - "coverage": { - "alternate": { - "percent_coverage": 100.0, - "median_depth": 15, - "min_depth": 2, - }, - "reference": { - "percent_coverage": 100.0, - "median_depth": 139, - "min_depth": 128, - }, +def test_wr_vars(): + variant_snp = Variant.create( + start=0, end=1, reference_bases="A", alternate_bases=["T"] + ) + + predictor = TBPredictor( + variant_calls={}, + called_genes={}, + variant_to_resistance_json_fp=os.path.join( + "tests", "ref_data", "tb_variant_to_resistance_drug.json" + ), + ) + + call = { + "variant": None, + "genotype": [0, 1], + "genotype_likelihoods": [0.1, 0.9, 0.12], + "info": { + "contamination_depths": [], + "coverage": { + "alternate": { + "percent_coverage": 100.0, + "median_depth": 15, + "min_depth": 2, + }, + "reference": { + "percent_coverage": 100.0, + "median_depth": 139, + "min_depth": 128, }, - "expected_depths": [152], }, - } - - assert self.predictor._coverage_greater_than_threshold(call, [""]) is False + "expected_depths": [152], + }, + } + assert predictor._coverage_greater_than_threshold(call, [""]) is False diff --git a/tests/predict_tests/test_model.py b/tests/predict_tests/test_model.py index aaa86ab5..a7ffdad5 100644 --- a/tests/predict_tests/test_model.py +++ b/tests/predict_tests/test_model.py @@ -1,50 +1,31 @@ -import sys import json -from unittest import TestCase - -sys.path.append(".") - from mykrobe.predict import MykrobePredictorSusceptibilityResult -class MykrobePredictResultsTest(TestCase): - - def setUp(self): - pass - - def teardown(self): - pass - - def test_document_from_json(self): - temp_json = {'susceptibility':{'Rifampicin':{"predict" : 'R'}}} - result = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json)) - assert result.susceptibility["Rifampicin"] == {"predict" : 'R'} - - result2 = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json)) - assert result is not result2 - assert result == result2 - - -class MykrobePredictResultsTest(TestCase): - - def setUp(self): - temp_json = {'susceptibility':{'Rifampicin':{"predict" : 'R'}}} - self.result = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json)) - temp_json2 = {'susceptibility':{'Rifampicin':{"predict" : 'R'}}} - self.result2 = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json2)) - temp_json3 = {'susceptibility':{'Rifampicin':{"predict" : 'S'}}} - self.result3 = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json3)) - temp_json4 = {'susceptibility':{'Quin':{"predict" : 'S'}}} - self.result4 = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json4)) - - def teardown(self): - pass - - def test_document_diff(self): - assert self.result.diff(self.result2) == {} - assert self.result.diff(self.result3) == {"Rifampicin" : {"predict" :("R", "S")}} - - def test_document_diff2(self): - assert self.result.diff(self.result4) == {'Rifampicin': {'predict': ('R', 'NA')}, 'Quin': {'predict': ('NA', 'S')}} - +def test_document_from_json(): + temp_json = {"susceptibility": {"Rifampicin": {"predict": "R"}}} + result = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json)) + assert result.susceptibility["Rifampicin"] == {"predict": "R"} + + result2 = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json)) + assert result is not result2 + assert result == result2 + + +def test_mykrobe_predict_results(): + temp_json = {"susceptibility": {"Rifampicin": {"predict": "R"}}} + result = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json)) + temp_json2 = {"susceptibility": {"Rifampicin": {"predict": "R"}}} + result2 = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json2)) + temp_json3 = {"susceptibility": {"Rifampicin": {"predict": "S"}}} + result3 = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json3)) + temp_json4 = {"susceptibility": {"Quin": {"predict": "S"}}} + result4 = MykrobePredictorSusceptibilityResult.from_json(json.dumps(temp_json4)) + + assert result.diff(result2) == {} + assert result.diff(result3) == {"Rifampicin": {"predict": ("R", "S")}} + assert result.diff(result4) == { + "Rifampicin": {"predict": ("R", "NA")}, + "Quin": {"predict": ("NA", "S")}, + } diff --git a/tests/probe_tests/base.py b/tests/probe_tests/base.py index 1e74d6b1..ad0e21e5 100644 --- a/tests/probe_tests/base.py +++ b/tests/probe_tests/base.py @@ -1,7 +1,11 @@ from mykrobe.utils import seq_to_kmers + + def assert_no_overlapping_kmers(panel): for i in range(len(panel.refs)): - kmer_intersection=set(seq_to_kmers(panel.refs[i], 31)).intersection(set(seq_to_kmers(panel.alts[i], 31))) + kmer_intersection = set(seq_to_kmers(panel.refs[i], 31)).intersection( + set(seq_to_kmers(panel.alts[i], 31)) + ) if kmer_intersection: print(panel.refs) print(panel.alts) diff --git a/tests/probe_tests/conftest.py b/tests/probe_tests/conftest.py new file mode 100644 index 00000000..f4a75505 --- /dev/null +++ b/tests/probe_tests/conftest.py @@ -0,0 +1,44 @@ +import os + +from mongoengine import connect +import pytest + +from mykrobe.probes import AlleleGenerator +from mykrobe.variants.schema.models import Reference +from mykrobe.variants.schema.models import ReferenceSet +from mykrobe.variants.schema.models import VariantSet + +DATA_DIR = os.path.join("tests", "ref_data") + + +@pytest.fixture(autouse=True) +def db_setup_teardown(): + DB = connect("mykrobe-test") + DB.drop_database("mykrobe-test") + yield + DB.drop_database("mykrobe-test") + + +@pytest.fixture() +def variant_sets_and_reference(): + reference_set = ReferenceSet().create_and_save(name="ref_set") + variant_set = VariantSet.create_and_save( + name="this_vcf_file", reference_set=reference_set + ) + variant_sets = [variant_set] + reference = Reference().create_and_save( + name="ref", md5checksum="sre", reference_sets=[reference_set] + ) + return variant_sets, reference + + +@pytest.fixture() +def pg(): + pg = AlleleGenerator(reference_filepath=f"{DATA_DIR}/BX571856.1.fasta", kmer=31) + return pg + + +@pytest.fixture() +def pg2(): + pg2 = AlleleGenerator(reference_filepath=f"{DATA_DIR}/NC_000962.3.fasta", kmer=31) + return pg2 diff --git a/tests/probe_tests/test_indel_only.py b/tests/probe_tests/test_indel_only.py index df35dbac..6f4cf931 100644 --- a/tests/probe_tests/test_indel_only.py +++ b/tests/probe_tests/test_indel_only.py @@ -1,270 +1,222 @@ -import os - -from mongoengine import connect - from base import assert_no_overlapping_kmers -from mykrobe.probes import AlleleGenerator -from mykrobe.variants.schema.models import Reference -from mykrobe.variants.schema.models import ReferenceSet from mykrobe.variants.schema.models import Variant -from mykrobe.variants.schema.models import VariantSet - -DB = connect("mykrobe-test") - -DATA_DIR = os.path.join("tests", "ref_data") - - -class TestINDELAlleleGenerator: - def setup(self): - DB.drop_database("mykrobe-test") - - self.pg = AlleleGenerator( - reference_filepath=f"{DATA_DIR}/BX571856.1.fasta", kmer=31 - ) - self.pg2 = AlleleGenerator( - reference_filepath=f"{DATA_DIR}/NC_000962.3.fasta", kmer=31 - ) - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.variant_set = VariantSet.create_and_save( - name="this_vcf_file", reference_set=self.reference_set - ) - self.variant_sets = [self.variant_set] - self.reference = Reference().create_and_save( - name="ref", md5checksum="sre", reference_sets=[self.reference_set] - ) - - def test_simple_deletion1(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="AA", - start=31, - alternate_bases=["A"], - ) - assert v.is_indel - assert v.is_deletion - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - in panel.refs - ) - assert self.pg._calculate_length_delta_from_indels(v, []) == 1 - assert panel.alts == [ - "GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG" - ] - - def test_simple_deletion2(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="AT", - start=32, - alternate_bases=["A"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert panel.alts == [ - "ATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGAT" - ] - - def test_simple_deletion3(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="AT", - start=2902618, - alternate_bases=["T"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" - in panel.refs - ) - assert panel.alts == [ - "TTTATACTACTGCTCAATTTTTTTACTTTTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" - ] - - def test_simple_deletion4(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="ATC", - start=32, - alternate_bases=["A"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert panel.alts == [ - "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT" - ] - - def test_simple_insertion1(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=1, - alternate_bases=["TTTC"], - ) - panel = self.pg.create(v) - # assert_no_overlapping_kmers(panel)### Skip this test for vars in first k bases of ref - assert v.is_indel - assert v.is_insertion - assert ( - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - in panel.refs - ) - assert panel.alts == ["TTTCGATTAAAGATAGAAATACACGATGCGAGC"] - - def test_simple_insertion2(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=1, - alternate_bases=["CTTT"], - ) - panel = self.pg.create(v) - # assert_no_overlapping_kmers(panel)### Skip this test for vars in first k bases of ref - assert ( - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - in panel.refs - ) - assert panel.alts == ["CTTTGATTAAAGATAGAAATACACGATGCGAGCA"] - - def test_simple_insertion3(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=31, - alternate_bases=["ATTT"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - in panel.refs - ) - assert panel.alts == [ - "GATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTG" - ] - - def test_simple_insertion4(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=32, - alternate_bases=["AGGGG"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert panel.alts == [ - "ATTAAAGATAGAAATACACGATGCGAGCAAGGGGTCAAATTTCATAACATCACCATGAGTTTGA" - ] - - def test_simple_insertion5(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=2902618, - alternate_bases=["ATGC"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" - in panel.refs - ) - assert panel.alts == [ - "TATACTACTGCTCAATTTTTTTACTTTTATGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" - ] - - def test_double_insertion(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=4021408, - alternate_bases=["ACGCTGGCGGGCG"], - ) - v1 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="AGA", - start=4021406, - alternate_bases=["CGG"], - ) - context = [v1] - assert self.pg2._remove_overlapping_contexts(v, [v1]) == [] - panel = self.pg2.create(v, context=context) - assert_no_overlapping_kmers(panel) - assert ( - "ATCTAGCCGCAAGGGCGCGAGCAGACGCAGAATCGCATGATTTGAGCTCAAATCATGCGAT" - in panel.refs - ) - assert panel.alts == [ - "TCTAGCCGCAAGGGCGCGAGCAGACGCAGACGCTGGCGGGCGATCGCATGATTTGAGCTCAAATCATGCGAT" - ] - def test_double_indel_fail(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="CCA", - start=2288851, - alternate_bases=["A"], - ) - v1 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=2288850, - alternate_bases=["ACC"], - ) - context = [v1] - panel = self.pg2.create(v, context=context) - assert ( - "GGCGCACACAATGATCGGTGGCAATACCGACCACATCGACCTCATCGACGCCGCGTTGCCG" - in panel.refs - ) - assert ( - "GGCGCACACAATGATCGGTGGCAATACCGACCACATCGACCTCATCGACGCCGCGTTGCCG" - not in panel.alts - ) - def test_large_insertion(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="CCGCCGGCCCCGCCGTTT", - start=1636155, - alternate_bases=[ - "CTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCG" - ], - ) - panel = self.pg2.create(v, context=[]) - assert_no_overlapping_kmers(panel) - assert ( - "AGACCTAGCAGGGTGCCGGCGCCGCCCTTGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCAT" - in panel.refs - ) - assert panel.alts == [ - "GACCTAGCAGGGTGCCGGCGCCGCCCTTGCTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCGCCATCGCCGATGATGTTTTCC" - ] +def test_simple_deletion1(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="AA", + start=31, + alternate_bases=["A"], + ) + assert v.is_indel + assert v.is_deletion + panel = pg.create(v) + assert_no_overlapping_kmers(panel) + assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs + assert pg._calculate_length_delta_from_indels(v, []) == 1 + assert panel.alts == ["GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG"] + + +def test_simple_deletion2(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="AT", + start=32, + alternate_bases=["A"], + ) + panel = pg.create(v) + assert_no_overlapping_kmers(panel) + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert panel.alts == [ + "ATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGAT" + ] + + +def test_simple_deletion3(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="AT", + start=2902618, + alternate_bases=["T"], + ) + panel = pg.create(v) + assert_no_overlapping_kmers(panel) + assert "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" in panel.refs + assert panel.alts == [ + "TTTATACTACTGCTCAATTTTTTTACTTTTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" + ] + + +def test_simple_deletion4(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="ATC", + start=32, + alternate_bases=["A"], + ) + panel = pg.create(v) + assert_no_overlapping_kmers(panel) + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert panel.alts == ["ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT"] + + +def test_simple_insertion1(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=1, + alternate_bases=["TTTC"], + ) + panel = pg.create(v) + # assert_no_overlapping_kmers(panel)### Skip this test for vars in first k bases of ref + assert v.is_indel + assert v.is_insertion + assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs + assert panel.alts == ["TTTCGATTAAAGATAGAAATACACGATGCGAGC"] + + +def test_simple_insertion2(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=1, + alternate_bases=["CTTT"], + ) + panel = pg.create(v) + # assert_no_overlapping_kmers(panel)### Skip this test for vars in first k bases of ref + assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs + assert panel.alts == ["CTTTGATTAAAGATAGAAATACACGATGCGAGCA"] + + +def test_simple_insertion3(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=31, + alternate_bases=["ATTT"], + ) + panel = pg.create(v) + assert_no_overlapping_kmers(panel) + assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs + assert panel.alts == [ + "GATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTG" + ] + + +def test_simple_insertion4(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=32, + alternate_bases=["AGGGG"], + ) + panel = pg.create(v) + assert_no_overlapping_kmers(panel) + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert panel.alts == [ + "ATTAAAGATAGAAATACACGATGCGAGCAAGGGGTCAAATTTCATAACATCACCATGAGTTTGA" + ] + + +def test_simple_insertion5(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=2902618, + alternate_bases=["ATGC"], + ) + panel = pg.create(v) + assert_no_overlapping_kmers(panel) + assert "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" in panel.refs + assert panel.alts == [ + "TATACTACTGCTCAATTTTTTTACTTTTATGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" + ] + + +def test_double_insertion(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=4021408, + alternate_bases=["ACGCTGGCGGGCG"], + ) + v1 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="AGA", + start=4021406, + alternate_bases=["CGG"], + ) + context = [v1] + assert pg2._remove_overlapping_contexts(v, [v1]) == [] + panel = pg2.create(v, context=context) + assert_no_overlapping_kmers(panel) + assert "ATCTAGCCGCAAGGGCGCGAGCAGACGCAGAATCGCATGATTTGAGCTCAAATCATGCGAT" in panel.refs + assert panel.alts == [ + "TCTAGCCGCAAGGGCGCGAGCAGACGCAGACGCTGGCGGGCGATCGCATGATTTGAGCTCAAATCATGCGAT" + ] + + +def test_double_indel_fail(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="CCA", + start=2288851, + alternate_bases=["A"], + ) + v1 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=2288850, + alternate_bases=["ACC"], + ) + context = [v1] + panel = pg2.create(v, context=context) + assert "GGCGCACACAATGATCGGTGGCAATACCGACCACATCGACCTCATCGACGCCGCGTTGCCG" in panel.refs + assert ( + "GGCGCACACAATGATCGGTGGCAATACCGACCACATCGACCTCATCGACGCCGCGTTGCCG" + not in panel.alts + ) + + +def test_large_insertion(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="CCGCCGGCCCCGCCGTTT", + start=1636155, + alternate_bases=[ + "CTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCG" + ], + ) + panel = pg2.create(v, context=[]) + assert_no_overlapping_kmers(panel) + assert "AGACCTAGCAGGGTGCCGGCGCCGCCCTTGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCAT" in panel.refs + assert panel.alts == [ + "GACCTAGCAGGGTGCCGGCGCCGCCCTTGCTGCCGGCCCCGCCGGCGCCGCCCAATCCACCGAAGCCCCTCCCTTCGGTGGGGTCGCTGCCGCCGTCGCCGCCGTCACCGCCCTTGCCGCCGGCCCCGCCGTCGCCGCCGGCTCCGGCGGTGCCGTCGCCGCCCTGGCCGCCGGCCCCGCCGTTTCCGCCGCCGCCGCCATCGCCGATGATGTTTTCC" + ] diff --git a/tests/probe_tests/test_large_indel.py b/tests/probe_tests/test_large_indel.py index 3c5f8b9d..e937cfb7 100644 --- a/tests/probe_tests/test_large_indel.py +++ b/tests/probe_tests/test_large_indel.py @@ -1,106 +1,77 @@ -import os - -from mongoengine import connect - from base import assert_no_overlapping_kmers -from mykrobe.probes import AlleleGenerator -from mykrobe.variants.schema.models import Reference -from mykrobe.variants.schema.models import ReferenceSet from mykrobe.variants.schema.models import Variant -from mykrobe.variants.schema.models import VariantSet -DB = connect("mykrobe-test") -DATA_DIR = os.path.join("tests", "ref_data") +def test_large_variant1(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="AACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACT", + start=1355983, + alternate_bases=["ACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCG"], + ) + panel = pg2.create(v) + assert_no_overlapping_kmers(panel) + assert "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGA" in panel.refs + assert panel.alts == [ + "TCGTCACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCGGGA" + ] -class TestLargeINDELAlleleGenerator: - def setup(self): - DB.drop_database("mykrobe-test") - self.pg = AlleleGenerator( - reference_filepath=f"{DATA_DIR}/NC_000962.3.fasta", kmer=31 - ) - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.variant_set = VariantSet.create_and_save( - name="this_vcf_file", reference_set=self.reference_set - ) - self.variant_sets = [self.variant_set] - self.reference = Reference().create_and_save( - name="ref", md5checksum="sre", reference_sets=[self.reference_set] - ) +def test_large_variant2(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="AACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCAC", + start=1355983, + alternate_bases=["ACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGC"], + ) + panel = pg2.create(v) + assert_no_overlapping_kmers(panel) + assert "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGA" in panel.refs + assert panel.alts == [ + "TCGTCACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCTGGA" + ] - def test_large_variant1(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="AACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACT", - start=1355983, - alternate_bases=["ACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCG"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGA" - in panel.refs - ) - assert panel.alts == [ - "TCGTCACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCGGGA" - ] - def test_large_variant2(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="AACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCAC", - start=1355983, - alternate_bases=["ACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGC"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "TCGTCAACGCCCGGTATCTGAGGATCTGTGTTCTCACCCAATACAAGTCGCATTCACTGGA" - in panel.refs - ) - assert panel.alts == [ - "TCGTCACCGCCCGGTATCTGAGGATTGGTTTTCCACCCAAATACAAGTCGCATTCGCTGGA" - ] +def test_large_insertion(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=2352065, + alternate_bases=[ + "CCTCGCCTGGGCTGGCGAGCAGACGCAAAATCCCCCGCACGCCCGGCGTGTCGGGGGATTTTGCGTCTG" + ], + ) + panel = pg2.create(v) + assert_no_overlapping_kmers(panel) + assert "AGCTCGGCCAGCTCAGTCACGTCGCCGCCGCCTCGCCAGTTGACCGCGCCCGCTCGCGGCT" in panel.refs + assert panel.alts == [ + "CCAGCTCAGTCACGTCGCCGCCGCCTCGCCTGGGCTGGCGAGCAGACGCAAAATCCCCCGCACGCCCGGCGTGTCGGGGGATTTTGCGTCTGCTCGCCAGTTGACCGCGCCCGCTCGCGGCT" + ] - def test_large_insertion(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=2352065, - alternate_bases=[ - "CCTCGCCTGGGCTGGCGAGCAGACGCAAAATCCCCCGCACGCCCGGCGTGTCGGGGGATTTTGCGTCTG" - ], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "AGCTCGGCCAGCTCAGTCACGTCGCCGCCGCCTCGCCAGTTGACCGCGCCCGCTCGCGGCT" - in panel.refs - ) - assert panel.alts == [ - "CCAGCTCAGTCACGTCGCCGCCGCCTCGCCTGGGCTGGCGAGCAGACGCAAAATCCCCCGCACGCCCGGCGTGTCGGGGGATTTTGCGTCTGCTCGCCAGTTGACCGCGCCCGCTCGCGGCT" - ] - def test_large_var1(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="CGCGGGAGTAGAACGATCGCCAAGTGGTCGGTCTTGGCTGCCCACTTCATCCCCGGCGCCACCGGCAGGTCTCGCGGTCATCTCGACCAACGGAGGGCCGTCGGTGGTTCGTATCCGGCCAAGAACGGCGAGAACGGTTTGTGCCTCTATGCCAGGGTGAATGTCTCATCTCCCAGGCGGACGGTGATATCCAGTTCTCCGCCAAGAGCGGACACGTATTTGCGCAGTGTGTTGACCTGTGCGGAGCCGATGTCGCCGTTCTCGATGCTGGATACCCGGCTCTGCCGGATGTGCGCCAGCGCAGCCACCTGGACCTGGGTGAGTGACTGAGCCGCGCGCAGCTCCCGGAGCCGGAATGCCCGCACTTCATCGCGCATTCGTGCCTTGTGCCGGTCCACCGCCTCCCGGTTAACGGGACGTACGGCGTCCATGTCCCGTAGTGTCATCGCCATCGTGCCACTTACCCTTTCTTGCGCTTGCGCCTCTTTGGCTTCGTGTCCTCGAACTGTGCGAGATGTTCGGCAAACATCTCATCGGCCGCTTTGATCTTCTCGTCGTACCACTGGGTCCACCGCCCGGCCTTGTTACCGGCGGCCAGCATGATCGCCTGCCGCGCCGGGTCGAAGGCGAACAGAATGCGGACCTCGGACCGCCCTTGTGATCCTGGACGCAGCTCCTTCATGTTCTTGTGGCGCGACCCACGCACCGTGTCCACCAGAGGACAGCCAAGTGCGGGGCCCTCTTCCTCGAGAACCTCGATAGCTGCGAACACCAATTCGTAGGTCTCTCGGTCCAAGCCGTTGAGCCAGGCGGAGATGCGCTCCACATCCGCCGTCCACCCCACAGAGTCGCAGAGTAGCGCGATACGCGATATCACACAAGGGTGATATTCCTCCGGGTAAGAGCAGCGGGCGACGGGGCTACCGTCGAGGAAATGCCGGCAGGCGAGGACGGACTCTGCGCACCCGGGCCGTTGAAACAGTAGCCTGTGCCAGGCCGAGAATTCATCCCCACGTATGAGGCAGTACAGTGCGCCGCCGTGCGCGTTCTCCCATGGAACGTTCACGGGCTCCCGTGGATGACAGGCGTTTCATGAACGCCAGCGCCGCCGCAACCCGACCGAAAGCGGTTGACCCCAAGGAGAGCTGGAAGTCGAGGCCACCACCTTCGCCGCGGAGTTGCTCATGCCCGAGAGCGAGACTCGTCCCGAAATACGCCGGCTCGATTTCGGCAAGTTGCTCGAACTGAAGCGGGAATGGGCGTCGACCCGCTCGACCAGCCCCAGCCGGGTGACCAGCCCCAGCCGGGTGACCAGCCGATGCACCGCGGCGATCCCACCGAAGCCGGTGGCATCGATGTTGGCGCCGACCTCGTAGCGCACCGCGCCCGAACCCAGCATCGGCCTGGGCTGCGCCGCCCAGCGTCCAGCCCGCGCGTGCCGCGCCGCCACCCTGCGCCCTCGGCGTGTGATGTTTCGCCGACTCTGTTCATGGGTTATCTTCTTCACCACAAAGGCCTTTCCTGCTGGGCTGTGTTGAGGTCGCAAACCCAGCCAGGGTAAGGCCTTTGGCCTCTCCTACCCGGCCGACACGCTTACTGAAGGCCTAGTCTAGGCAGGCCATTCAATCTGCGGAATCGAAAAATTCGGTTCCAGCCTGCTCGTTTCCTTTCCGACAGCGATCTGACGTTGCGTAACGTCATTTGTACGGACTCTTTTAGCGGCATTGATTTCAGATGCCAACGCCGTCTGTGCTGTAGCGCCGATTGGCCGAAACTGTAAATTTGTATGATTATTTAAATCTTTGACGAACACGCGCCACAAACGTACTATCTCTTTGGCAAAGTCCACCGGCATCTCATTCAACGGTTTTGTTTGCGCGTGGTCGTCATATGTTGGTAACTGTGTAACCGGCCGCCTATCTTGCGCGTGCATCATATGACTATGAATCGGCCTTCTCCAGTGAAATTGATACAAGATCGATCCGATAAGCGGTACCTTGTACACAGTGCAATTGTAGTAATTCGCGTTTTGTCCTACGCTTGTATTCTGCGTGAAGAATTCA", - start=2266659, - alternate_bases=[ - "CACGCGAGTTGTAGATGATCGTTGAGTGGTCTTGCTTGGACTTCCATTTCATCTTTTCGACGCGCCAGGTCTCGCGGTCCTCCGGATCTGCGCCCGGTTTGAGTTGCACATCAAGGGGATACGGCTTGACCGACTCGTAGCCGACATGTAAGTCGGCTAGTTTCCGGCCGGCGCTGGCGAGCTGGTCGAAGCGTTCGCGGGTCTCCGGTGTTGGGATGTGCGGGAGCATCTTCTTGAGGTCAGCGGCGTATTTTGTGCGGTAGGCGGGGTCATGCAGCAGGCCGTAGACGTAGTAGAAGATGTCGTCTTTGGTGACTTGGTCGCCGATCGTGTCGCGGTAGAGCTTGAGGATGACGCCGGTGATGTTGTCGACGCGGCGGTAGCCGTGGTCGTCTACTTCGGCGTTGGTGGTGGACTCGAAATCGAGTTCGCCGTCACGTGGTTCGGTCTTCTCGTAGGTCCAGCGCGGGAAGAATTGACCGTTGCTTGAGCCCCAGAATGCGAGATCGGGGATAGCGTTTAGCATCAGACACGAGAAGGGCTTGTCTGAGCCCATGCCAACCACGTAGTAACCGACATTCCCGTGCTCCGGCGTCGGAAACATCGACGGAAGCTGGTAGGTACAGTTGTTGAGCTGCTGGTTGGGGTCGAGGTAGGCGTGCTCTTTCGTAAATGGTCGGTACGTGCCGAGCCGCATTCCCGCGGGAGCGAATTCGATGCGAATGCCTTGTGCCACTTGCCGCTTGTTGATGCGGTCCCAGCTGAACTTGGCCGAGTCCACGGTAATGAGGGCGTCAACCGGCGGGGTCTTGGCGTCCCTTCCGCGGATCTCGTTGATCCGGTCGACCTCCGAGTTGTAGAAGTCGATCGTGCGTCCGATGTTGGCCTCGAGCGCACCACGTGAAAAGTTGTAACACCACGCATCCCGGCTGGTCTTCAAGCCCGCGGAATAGTTCGCGAAGACACGTGTCACGTCAAGAGCAGCCTTCTTGTCGCCGATAACCGGCCACGCGCTGAACGCGTCGTCGCGTTGGTTGACCCAGTCACCGTGCAAGTTGGGTGTGACTGTCTGCCATTCCACCGTGTCGAGGTAGCCGTCGCCGACGATCCGCAACTTCTCCTCGCGACTCAGGTAATCGCCGATGTCGCGGTAAAGGACATCGCATGGCCCGCTGTGCTTCGGATCCTTGATGCCAAGGAAGATCGCCACCGTGTTGCGACTCCCCCCGCCAAAGACCTTGCCGCCTTCCTGGCGTGAGAGTTCCCCAGCTGTGCGCTGGTTCCCCCGCAGGTTGTACACATATACCGCCGCGTAGTCGTCGGCGAGCGACAACCGCATGCCGTCTGCCGTGTTGCCGTCTATGTACCCACCATTGGAGACGAATCCGACAACACCGTTGTCACCAATGCGGTCGGTCGCCCACCGGAACGCGCGAATATACGAGTCGTACAGGCTGTTCTTCAGCTGCGCCGTCGACCGCTTCGCGTACGTCTGCTCAATCCGCCCGTCCAACGTCGGATACTTCACGTTGGCGTTCAGGTCGTTCGCGCTGCTCTGCCCCACCGAGTACGGCGGATTCCCGATGATCACGCTGATCGGCGTCGCCAGCTGTCGCAAGATCCGAGCGTTGTTGTACGGGAACATGATCGCGTCCATCGAGTCCCCGGCTTCGGAAATCTGGAACGTGTCGGCCAGCGCCATCCCGGGGAACGGCTCATAGGCGTCGGCGTCGGCGGTCTTGCCCGCCAAAGCATGGTAGGTCGACTCGATGTTCACCGCGGCGATGTAGTACGCCAGCAGCATGATCTCGTTGGCGTGCAGCTCTTGCGAGTACTTTCGGGTGAGGTCGGCGGCCGTGATCAGGTCGGACTGCAGCAGCCGGGTAATGAATGTGCCCGTCCCGGCGAAGCCGTCCAGAATATGCACGCCCTCGTCGGTCAGCCCGCGCCCGAAATGCTTGCGCGACACGAAATCAGCCGCCCGCACAATGAAGTCCACGACCTCGACCGGCGTGTACACGATCCCCAGCGCCTCGGCCTGCTTCTTGAAGCCGATGCGGAAGAACTTCTCGTACAGCTCGGCGATCACCTGCTGCTTGCCCTCGGCGCTGGTGACCTCGCCGGCGCGCCGTCGCACCGATTCGTAAAAGCCTTCCAACCGAGCGGTTTCGGCCTCCAGGCCGGCACCCCCGACGGTGTCGACCATCTTCTGCATGGCCCGCGACACCGGGTTGTGCGACGCGAAGTCATGCCCGGCGAACAGCGCGTCGAACACCGGCTTGGTGATCAGGTGCTGCGAGAGCATGCTGATCGCGTCATCGGGGGTGATCGAGTCATTGAGGTTATCGCGCAGCCCGGCCAGGAACTGCTCGAACGCCGCCGCCGCCGTAGCGTCGGCGCCGCCGAGCAGGGCGTGGATACGGGTGGTCAGCGTCGCGGCGATGTCGGCGACATCGGCGGCCCACTGCTCCCAATAGGTCCGGGTGCCAACCTTGTCGACGATGCGCGCGTAGATCGCTTCCTGCCACTGCGACAACGAGAACATCGCCAACTGCTCCGCGACGGCGGGTCCCGCCTCGTCGGAGGTCGGCCCGATGTGACCGCCCAACAGCTTGTCGCTGCCTTCACCGGTCTTCGTCGGCTTCACGTTCAGCGCAATGCTGTTCACCATCGCGTCGAAGCGCTCGTCGTGCGACCGCAACGCGTTGAGGACCTGCCACACCACCTTGAACCGTTTGTTGTCGGCCAACGCGGCAGACGGCTCGACACCCTCGGGCACCGCCACCGGCAAGATGACGTACCCGTAGTCCTTGCCGGGCGACTTGCGCATCACCCGACCGACCGACTGCACCACGTCGACGATGGAATTGCGCGGATTCAGGAACAGCACCGCGTCCAGCGCGGGCACGTCGACCCCTTCGGAGAGGCAGCGGGCGTTGGACAGGATGCGGCATTCATCCTCGGCGACCACGCCTTTGAGCCAGGCCAGCTGTTCGTTGCGGACCAGCGCGTTGAACGTCCCGTCCACGTGGCGCACCG" - ], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert ( - "TGGTGACGCGGGAGTAGAACGATCGCCAAGTGGTCGGTCTTGGCTGCCCACTTCATCCCCGGCGCCACCGGCAGGTCTCGCGGTCATCTCGACCAACGGAGGGCCGTCGGTGGTTCGTATCCGGCCAAGAACGGCGAGAACGGTTTGTGCCTCTATGCCAGGGTGAATGTCTCATCTCCCAGGCGGACGGTGATATCCAGTTCTCCGCCAAGAGCGGACACGTATTTGCGCAGTGTGTTGACCTGTGCGGAGCCGATGTCGCCGTTCTCGATGCTGGATACCCGGCTCTGCCGGATGTGCGCCAGCGCAGCCACCTGGACCTGGGTGAGTGACTGAGCCGCGCGCAGCTCCCGGAGCCGGAATGCCCGCACTTCATCGCGCATTCGTGCCTTGTGCCGGTCCACCGCCTCCCGGTTAACGGGACGTACGGCGTCCATGTCCCGTAGTGTCATCGCCATCGTGCCACTTACCCTTTCTTGCGCTTGCGCCTCTTTGGCTTCGTGTCCTCGAACTGTGCGAGATGTTCGGCAAACATCTCATCGGCCGCTTTGATCTTCTCGTCGTACCACTGGGTCCACCGCCCGGCCTTGTTACCGGCGGCCAGCATGATCGCCTGCCGCGCCGGGTCGAAGGCGAACAGAATGCGGACCTCGGACCGCCCTTGTGATCCTGGACGCAGCTCCTTCATGTTCTTGTGGCGCGACCCACGCACCGTGTCCACCAGAGGACAGCCAAGTGCGGGGCCCTCTTCCTCGAGAACCTCGATAGCTGCGAACACCAATTCGTAGGTCTCTCGGTCCAAGCCGTTGAGCCAGGCGGAGATGCGCTCCACATCCGCCGTCCACCCCACAGAGTCGCAGAGTAGCGCGATACGCGATATCACACAAGGGTGATATTCCTCCGGGTAAGAGCAGCGGGCGACGGGGCTACCGTCGAGGAAATGCCGGCAGGCGAGGACGGACTCTGCGCACCCGGGCCGTTGAAACAGTAGCCTGTGCCAGGCCGAGAATTCATCCCCACGTATGAGGCAGTACAGTGCGCCGCCGTGCGCGTTCTCCCATGGAACGTTCACGGGCTCCCGTGGATGACAGGCGTTTCATGAACGCCAGCGCCGCCGCAACCCGACCGAAAGCGGTTGACCCCAAGGAGAGCTGGAAGTCGAGGCCACCACCTTCGCCGCGGAGTTGCTCATGCCCGAGAGCGAGACTCGTCCCGAAATACGCCGGCTCGATTTCGGCAAGTTGCTCGAACTGAAGCGGGAATGGGCGTCGACCCGCTCGACCAGCCCCAGCCGGGTGACCAGCCCCAGCCGGGTGACCAGCCGATGCACCGCGGCGATCCCACCGAAGCCGGTGGCATCGATGTTGGCGCCGACCTCGTAGCGCACCGCGCCCGAACCCAGCATCGGCCTGGGCTGCGCCGCCCAGCGTCCAGCCCGCGCGTGCCGCGCCGCCACCCTGCGCCCTCGGCGTGTGATGTTTCGCCGACTCTGTTCATGGGTTATCTTCTTCACCACAAAGGCCTTTCCTGCTGGGCTGTGTTGAGGTCGCAAACCCAGCCAGGGTAAGGCCTTTGGCCTCTCCTACCCGGCCGACACGCTTACTGAAGGCCTAGTCTAGGCAGGCCATTCAATCTGCGGAATCGAAAAATTCGGTTCCAGCCTGCTCGTTTCCTTTCCGACAGCGATCTGACGTTGCGTAACGTCATTTGTACGGACTCTTTTAGCGGCATTGATTTCAGATGCCAACGCCGTCTGTGCTGTAGCGCCGATTGGCCGAAACTGTAAATTTGTATGATTATTTAAATCTTTGACGAACACGCGCCACAAACGTACTATCTCTTTGGCAAAGTCCACCGGCATCTCATTCAACGGTTTTGTTTGCGCGTGGTCGTCATATGTTGGTAACTGTGTAACCGGCCGCCTATCTTGCGCGTGCATCATATGACTATGAATCGGCCTTCTCCAGTGAAATTGATACAAGATCGATCCGATAAGCGGTACCTTGTACACAGTGCAATTGTAGTAATTCGCGTTTTGTCCTACGCTTGTATTCTGCGTGAAGAATTCAAACA" - in panel.refs - ) - assert panel.alts == [ - "GGCCTCGTCGGGAATGCCGGCGATGGTGACACGCGAGTTGTAGATGATCGTTGAGTGGTCTTGCTTGGACTTCCATTTCATCTTTTCGACGCGCCAGGTCTCGCGGTCCTCCGGATCTGCGCCCGGTTTGAGTTGCACATCAAGGGGATACGGCTTGACCGACTCGTAGCCGACATGTAAGTCGGCTAGTTTCCGGCCGGCGCTGGCGAGCTGGTCGAAGCGTTCGCGGGTCTCCGGTGTTGGGATGTGCGGGAGCATCTTCTTGAGGTCAGCGGCGTATTTTGTGCGGTAGGCGGGGTCATGCAGCAGGCCGTAGACGTAGTAGAAGATGTCGTCTTTGGTGACTTGGTCGCCGATCGTGTCGCGGTAGAGCTTGAGGATGACGCCGGTGATGTTGTCGACGCGGCGGTAGCCGTGGTCGTCTACTTCGGCGTTGGTGGTGGACTCGAAATCGAGTTCGCCGTCACGTGGTTCGGTCTTCTCGTAGGTCCAGCGCGGGAAGAATTGACCGTTGCTTGAGCCCCAGAATGCGAGATCGGGGATAGCGTTTAGCATCAGACACGAGAAGGGCTTGTCTGAGCCCATGCCAACCACGTAGTAACCGACATTCCCGTGCTCCGGCGTCGGAAACATCGACGGAAGCTGGTAGGTACAGTTGTTGAGCTGCTGGTTGGGGTCGAGGTAGGCGTGCTCTTTCGTAAATGGTCGGTACGTGCCGAGCCGCATTCCCGCGGGAGCGAATTCGATGCGAATGCCTTGTGCCACTTGCCGCTTGTTGATGCGGTCCCAGCTGAACTTGGCCGAGTCCACGGTAATGAGGGCGTCAACCGGCGGGGTCTTGGCGTCCCTTCCGCGGATCTCGTTGATCCGGTCGACCTCCGAGTTGTAGAAGTCGATCGTGCGTCCGATGTTGGCCTCGAGCGCACCACGTGAAAAGTTGTAACACCACGCATCCCGGCTGGTCTTCAAGCCCGCGGAATAGTTCGCGAAGACACGTGTCACGTCAAGAGCAGCCTTCTTGTCGCCGATAACCGGCCACGCGCTGAACGCGTCGTCGCGTTGGTTGACCCAGTCACCGTGCAAGTTGGGTGTGACTGTCTGCCATTCCACCGTGTCGAGGTAGCCGTCGCCGACGATCCGCAACTTCTCCTCGCGACTCAGGTAATCGCCGATGTCGCGGTAAAGGACATCGCATGGCCCGCTGTGCTTCGGATCCTTGATGCCAAGGAAGATCGCCACCGTGTTGCGACTCCCCCCGCCAAAGACCTTGCCGCCTTCCTGGCGTGAGAGTTCCCCAGCTGTGCGCTGGTTCCCCCGCAGGTTGTACACATATACCGCCGCGTAGTCGTCGGCGAGCGACAACCGCATGCCGTCTGCCGTGTTGCCGTCTATGTACCCACCATTGGAGACGAATCCGACAACACCGTTGTCACCAATGCGGTCGGTCGCCCACCGGAACGCGCGAATATACGAGTCGTACAGGCTGTTCTTCAGCTGCGCCGTCGACCGCTTCGCGTACGTCTGCTCAATCCGCCCGTCCAACGTCGGATACTTCACGTTGGCGTTCAGGTCGTTCGCGCTGCTCTGCCCCACCGAGTACGGCGGATTCCCGATGATCACGCTGATCGGCGTCGCCAGCTGTCGCAAGATCCGAGCGTTGTTGTACGGGAACATGATCGCGTCCATCGAGTCCCCGGCTTCGGAAATCTGGAACGTGTCGGCCAGCGCCATCCCGGGGAACGGCTCATAGGCGTCGGCGTCGGCGGTCTTGCCCGCCAAAGCATGGTAGGTCGACTCGATGTTCACCGCGGCGATGTAGTACGCCAGCAGCATGATCTCGTTGGCGTGCAGCTCTTGCGAGTACTTTCGGGTGAGGTCGGCGGCCGTGATCAGGTCGGACTGCAGCAGCCGGGTAATGAATGTGCCCGTCCCGGCGAAGCCGTCCAGAATATGCACGCCCTCGTCGGTCAGCCCGCGCCCGAAATGCTTGCGCGACACGAAATCAGCCGCCCGCACAATGAAGTCCACGACCTCGACCGGCGTGTACACGATCCCCAGCGCCTCGGCCTGCTTCTTGAAGCCGATGCGGAAGAACTTCTCGTACAGCTCGGCGATCACCTGCTGCTTGCCCTCGGCGCTGGTGACCTCGCCGGCGCGCCGTCGCACCGATTCGTAAAAGCCTTCCAACCGAGCGGTTTCGGCCTCCAGGCCGGCACCCCCGACGGTGTCGACCATCTTCTGCATGGCCCGCGACACCGGGTTGTGCGACGCGAAGTCATGCCCGGCGAACAGCGCGTCGAACACCGGCTTGGTGATCAGGTGCTGCGAGAGCATGCTGATCGCGTCATCGGGGGTGATCGAGTCATTGAGGTTATCGCGCAGCCCGGCCAGGAACTGCTCGAACGCCGCCGCCGCCGTAGCGTCGGCGCCGCCGAGCAGGGCGTGGATACGGGTGGTCAGCGTCGCGGCGATGTCGGCGACATCGGCGGCCCACTGCTCCCAATAGGTCCGGGTGCCAACCTTGTCGACGATGCGCGCGTAGATCGCTTCCTGCCACTGCGACAACGAGAACATCGCCAACTGCTCCGCGACGGCGGGTCCCGCCTCGTCGGAGGTCGGCCCGATGTGACCGCCCAACAGCTTGTCGCTGCCTTCACCGGTCTTCGTCGGCTTCACGTTCAGCGCAATGCTGTTCACCATCGCGTCGAAGCGCTCGTCGTGCGACCGCAACGCGTTGAGGACCTGCCACACCACCTTGAACCGTTTGTTGTCGGCCAACGCGGCAGACGGCTCGACACCCTCGGGCACCGCCACCGGCAAGATGACGTACCCGTAGTCCTTGCCGGGCGACTTGCGCATCACCCGACCGACCGACTGCACCACGTCGACGATGGAATTGCGCGGATTCAGGAACAGCACCGCGTCCAGCGCGGGCACGTCGACCCCTTCGGAGAGGCAGCGGGCGTTGGACAGGATGCGGCATTCATCCTCGGCGACCACGCCTTTGAGCCAGGCCAGCTGTTCGTTGCGGACCAGCGCGTTGAACGTCCCGTCCACGTGGCGCACCGAACACGCCAGGCCCGGGCCGTCGTCAACCA" - ] +def test_large_var1(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="CGCGGGAGTAGAACGATCGCCAAGTGGTCGGTCTTGGCTGCCCACTTCATCCCCGGCGCCACCGGCAGGTCTCGCGGTCATCTCGACCAACGGAGGGCCGTCGGTGGTTCGTATCCGGCCAAGAACGGCGAGAACGGTTTGTGCCTCTATGCCAGGGTGAATGTCTCATCTCCCAGGCGGACGGTGATATCCAGTTCTCCGCCAAGAGCGGACACGTATTTGCGCAGTGTGTTGACCTGTGCGGAGCCGATGTCGCCGTTCTCGATGCTGGATACCCGGCTCTGCCGGATGTGCGCCAGCGCAGCCACCTGGACCTGGGTGAGTGACTGAGCCGCGCGCAGCTCCCGGAGCCGGAATGCCCGCACTTCATCGCGCATTCGTGCCTTGTGCCGGTCCACCGCCTCCCGGTTAACGGGACGTACGGCGTCCATGTCCCGTAGTGTCATCGCCATCGTGCCACTTACCCTTTCTTGCGCTTGCGCCTCTTTGGCTTCGTGTCCTCGAACTGTGCGAGATGTTCGGCAAACATCTCATCGGCCGCTTTGATCTTCTCGTCGTACCACTGGGTCCACCGCCCGGCCTTGTTACCGGCGGCCAGCATGATCGCCTGCCGCGCCGGGTCGAAGGCGAACAGAATGCGGACCTCGGACCGCCCTTGTGATCCTGGACGCAGCTCCTTCATGTTCTTGTGGCGCGACCCACGCACCGTGTCCACCAGAGGACAGCCAAGTGCGGGGCCCTCTTCCTCGAGAACCTCGATAGCTGCGAACACCAATTCGTAGGTCTCTCGGTCCAAGCCGTTGAGCCAGGCGGAGATGCGCTCCACATCCGCCGTCCACCCCACAGAGTCGCAGAGTAGCGCGATACGCGATATCACACAAGGGTGATATTCCTCCGGGTAAGAGCAGCGGGCGACGGGGCTACCGTCGAGGAAATGCCGGCAGGCGAGGACGGACTCTGCGCACCCGGGCCGTTGAAACAGTAGCCTGTGCCAGGCCGAGAATTCATCCCCACGTATGAGGCAGTACAGTGCGCCGCCGTGCGCGTTCTCCCATGGAACGTTCACGGGCTCCCGTGGATGACAGGCGTTTCATGAACGCCAGCGCCGCCGCAACCCGACCGAAAGCGGTTGACCCCAAGGAGAGCTGGAAGTCGAGGCCACCACCTTCGCCGCGGAGTTGCTCATGCCCGAGAGCGAGACTCGTCCCGAAATACGCCGGCTCGATTTCGGCAAGTTGCTCGAACTGAAGCGGGAATGGGCGTCGACCCGCTCGACCAGCCCCAGCCGGGTGACCAGCCCCAGCCGGGTGACCAGCCGATGCACCGCGGCGATCCCACCGAAGCCGGTGGCATCGATGTTGGCGCCGACCTCGTAGCGCACCGCGCCCGAACCCAGCATCGGCCTGGGCTGCGCCGCCCAGCGTCCAGCCCGCGCGTGCCGCGCCGCCACCCTGCGCCCTCGGCGTGTGATGTTTCGCCGACTCTGTTCATGGGTTATCTTCTTCACCACAAAGGCCTTTCCTGCTGGGCTGTGTTGAGGTCGCAAACCCAGCCAGGGTAAGGCCTTTGGCCTCTCCTACCCGGCCGACACGCTTACTGAAGGCCTAGTCTAGGCAGGCCATTCAATCTGCGGAATCGAAAAATTCGGTTCCAGCCTGCTCGTTTCCTTTCCGACAGCGATCTGACGTTGCGTAACGTCATTTGTACGGACTCTTTTAGCGGCATTGATTTCAGATGCCAACGCCGTCTGTGCTGTAGCGCCGATTGGCCGAAACTGTAAATTTGTATGATTATTTAAATCTTTGACGAACACGCGCCACAAACGTACTATCTCTTTGGCAAAGTCCACCGGCATCTCATTCAACGGTTTTGTTTGCGCGTGGTCGTCATATGTTGGTAACTGTGTAACCGGCCGCCTATCTTGCGCGTGCATCATATGACTATGAATCGGCCTTCTCCAGTGAAATTGATACAAGATCGATCCGATAAGCGGTACCTTGTACACAGTGCAATTGTAGTAATTCGCGTTTTGTCCTACGCTTGTATTCTGCGTGAAGAATTCA", + start=2266659, + alternate_bases=[ + "CACGCGAGTTGTAGATGATCGTTGAGTGGTCTTGCTTGGACTTCCATTTCATCTTTTCGACGCGCCAGGTCTCGCGGTCCTCCGGATCTGCGCCCGGTTTGAGTTGCACATCAAGGGGATACGGCTTGACCGACTCGTAGCCGACATGTAAGTCGGCTAGTTTCCGGCCGGCGCTGGCGAGCTGGTCGAAGCGTTCGCGGGTCTCCGGTGTTGGGATGTGCGGGAGCATCTTCTTGAGGTCAGCGGCGTATTTTGTGCGGTAGGCGGGGTCATGCAGCAGGCCGTAGACGTAGTAGAAGATGTCGTCTTTGGTGACTTGGTCGCCGATCGTGTCGCGGTAGAGCTTGAGGATGACGCCGGTGATGTTGTCGACGCGGCGGTAGCCGTGGTCGTCTACTTCGGCGTTGGTGGTGGACTCGAAATCGAGTTCGCCGTCACGTGGTTCGGTCTTCTCGTAGGTCCAGCGCGGGAAGAATTGACCGTTGCTTGAGCCCCAGAATGCGAGATCGGGGATAGCGTTTAGCATCAGACACGAGAAGGGCTTGTCTGAGCCCATGCCAACCACGTAGTAACCGACATTCCCGTGCTCCGGCGTCGGAAACATCGACGGAAGCTGGTAGGTACAGTTGTTGAGCTGCTGGTTGGGGTCGAGGTAGGCGTGCTCTTTCGTAAATGGTCGGTACGTGCCGAGCCGCATTCCCGCGGGAGCGAATTCGATGCGAATGCCTTGTGCCACTTGCCGCTTGTTGATGCGGTCCCAGCTGAACTTGGCCGAGTCCACGGTAATGAGGGCGTCAACCGGCGGGGTCTTGGCGTCCCTTCCGCGGATCTCGTTGATCCGGTCGACCTCCGAGTTGTAGAAGTCGATCGTGCGTCCGATGTTGGCCTCGAGCGCACCACGTGAAAAGTTGTAACACCACGCATCCCGGCTGGTCTTCAAGCCCGCGGAATAGTTCGCGAAGACACGTGTCACGTCAAGAGCAGCCTTCTTGTCGCCGATAACCGGCCACGCGCTGAACGCGTCGTCGCGTTGGTTGACCCAGTCACCGTGCAAGTTGGGTGTGACTGTCTGCCATTCCACCGTGTCGAGGTAGCCGTCGCCGACGATCCGCAACTTCTCCTCGCGACTCAGGTAATCGCCGATGTCGCGGTAAAGGACATCGCATGGCCCGCTGTGCTTCGGATCCTTGATGCCAAGGAAGATCGCCACCGTGTTGCGACTCCCCCCGCCAAAGACCTTGCCGCCTTCCTGGCGTGAGAGTTCCCCAGCTGTGCGCTGGTTCCCCCGCAGGTTGTACACATATACCGCCGCGTAGTCGTCGGCGAGCGACAACCGCATGCCGTCTGCCGTGTTGCCGTCTATGTACCCACCATTGGAGACGAATCCGACAACACCGTTGTCACCAATGCGGTCGGTCGCCCACCGGAACGCGCGAATATACGAGTCGTACAGGCTGTTCTTCAGCTGCGCCGTCGACCGCTTCGCGTACGTCTGCTCAATCCGCCCGTCCAACGTCGGATACTTCACGTTGGCGTTCAGGTCGTTCGCGCTGCTCTGCCCCACCGAGTACGGCGGATTCCCGATGATCACGCTGATCGGCGTCGCCAGCTGTCGCAAGATCCGAGCGTTGTTGTACGGGAACATGATCGCGTCCATCGAGTCCCCGGCTTCGGAAATCTGGAACGTGTCGGCCAGCGCCATCCCGGGGAACGGCTCATAGGCGTCGGCGTCGGCGGTCTTGCCCGCCAAAGCATGGTAGGTCGACTCGATGTTCACCGCGGCGATGTAGTACGCCAGCAGCATGATCTCGTTGGCGTGCAGCTCTTGCGAGTACTTTCGGGTGAGGTCGGCGGCCGTGATCAGGTCGGACTGCAGCAGCCGGGTAATGAATGTGCCCGTCCCGGCGAAGCCGTCCAGAATATGCACGCCCTCGTCGGTCAGCCCGCGCCCGAAATGCTTGCGCGACACGAAATCAGCCGCCCGCACAATGAAGTCCACGACCTCGACCGGCGTGTACACGATCCCCAGCGCCTCGGCCTGCTTCTTGAAGCCGATGCGGAAGAACTTCTCGTACAGCTCGGCGATCACCTGCTGCTTGCCCTCGGCGCTGGTGACCTCGCCGGCGCGCCGTCGCACCGATTCGTAAAAGCCTTCCAACCGAGCGGTTTCGGCCTCCAGGCCGGCACCCCCGACGGTGTCGACCATCTTCTGCATGGCCCGCGACACCGGGTTGTGCGACGCGAAGTCATGCCCGGCGAACAGCGCGTCGAACACCGGCTTGGTGATCAGGTGCTGCGAGAGCATGCTGATCGCGTCATCGGGGGTGATCGAGTCATTGAGGTTATCGCGCAGCCCGGCCAGGAACTGCTCGAACGCCGCCGCCGCCGTAGCGTCGGCGCCGCCGAGCAGGGCGTGGATACGGGTGGTCAGCGTCGCGGCGATGTCGGCGACATCGGCGGCCCACTGCTCCCAATAGGTCCGGGTGCCAACCTTGTCGACGATGCGCGCGTAGATCGCTTCCTGCCACTGCGACAACGAGAACATCGCCAACTGCTCCGCGACGGCGGGTCCCGCCTCGTCGGAGGTCGGCCCGATGTGACCGCCCAACAGCTTGTCGCTGCCTTCACCGGTCTTCGTCGGCTTCACGTTCAGCGCAATGCTGTTCACCATCGCGTCGAAGCGCTCGTCGTGCGACCGCAACGCGTTGAGGACCTGCCACACCACCTTGAACCGTTTGTTGTCGGCCAACGCGGCAGACGGCTCGACACCCTCGGGCACCGCCACCGGCAAGATGACGTACCCGTAGTCCTTGCCGGGCGACTTGCGCATCACCCGACCGACCGACTGCACCACGTCGACGATGGAATTGCGCGGATTCAGGAACAGCACCGCGTCCAGCGCGGGCACGTCGACCCCTTCGGAGAGGCAGCGGGCGTTGGACAGGATGCGGCATTCATCCTCGGCGACCACGCCTTTGAGCCAGGCCAGCTGTTCGTTGCGGACCAGCGCGTTGAACGTCCCGTCCACGTGGCGCACCG" + ], + ) + panel = pg2.create(v) + assert_no_overlapping_kmers(panel) + assert ( + "TGGTGACGCGGGAGTAGAACGATCGCCAAGTGGTCGGTCTTGGCTGCCCACTTCATCCCCGGCGCCACCGGCAGGTCTCGCGGTCATCTCGACCAACGGAGGGCCGTCGGTGGTTCGTATCCGGCCAAGAACGGCGAGAACGGTTTGTGCCTCTATGCCAGGGTGAATGTCTCATCTCCCAGGCGGACGGTGATATCCAGTTCTCCGCCAAGAGCGGACACGTATTTGCGCAGTGTGTTGACCTGTGCGGAGCCGATGTCGCCGTTCTCGATGCTGGATACCCGGCTCTGCCGGATGTGCGCCAGCGCAGCCACCTGGACCTGGGTGAGTGACTGAGCCGCGCGCAGCTCCCGGAGCCGGAATGCCCGCACTTCATCGCGCATTCGTGCCTTGTGCCGGTCCACCGCCTCCCGGTTAACGGGACGTACGGCGTCCATGTCCCGTAGTGTCATCGCCATCGTGCCACTTACCCTTTCTTGCGCTTGCGCCTCTTTGGCTTCGTGTCCTCGAACTGTGCGAGATGTTCGGCAAACATCTCATCGGCCGCTTTGATCTTCTCGTCGTACCACTGGGTCCACCGCCCGGCCTTGTTACCGGCGGCCAGCATGATCGCCTGCCGCGCCGGGTCGAAGGCGAACAGAATGCGGACCTCGGACCGCCCTTGTGATCCTGGACGCAGCTCCTTCATGTTCTTGTGGCGCGACCCACGCACCGTGTCCACCAGAGGACAGCCAAGTGCGGGGCCCTCTTCCTCGAGAACCTCGATAGCTGCGAACACCAATTCGTAGGTCTCTCGGTCCAAGCCGTTGAGCCAGGCGGAGATGCGCTCCACATCCGCCGTCCACCCCACAGAGTCGCAGAGTAGCGCGATACGCGATATCACACAAGGGTGATATTCCTCCGGGTAAGAGCAGCGGGCGACGGGGCTACCGTCGAGGAAATGCCGGCAGGCGAGGACGGACTCTGCGCACCCGGGCCGTTGAAACAGTAGCCTGTGCCAGGCCGAGAATTCATCCCCACGTATGAGGCAGTACAGTGCGCCGCCGTGCGCGTTCTCCCATGGAACGTTCACGGGCTCCCGTGGATGACAGGCGTTTCATGAACGCCAGCGCCGCCGCAACCCGACCGAAAGCGGTTGACCCCAAGGAGAGCTGGAAGTCGAGGCCACCACCTTCGCCGCGGAGTTGCTCATGCCCGAGAGCGAGACTCGTCCCGAAATACGCCGGCTCGATTTCGGCAAGTTGCTCGAACTGAAGCGGGAATGGGCGTCGACCCGCTCGACCAGCCCCAGCCGGGTGACCAGCCCCAGCCGGGTGACCAGCCGATGCACCGCGGCGATCCCACCGAAGCCGGTGGCATCGATGTTGGCGCCGACCTCGTAGCGCACCGCGCCCGAACCCAGCATCGGCCTGGGCTGCGCCGCCCAGCGTCCAGCCCGCGCGTGCCGCGCCGCCACCCTGCGCCCTCGGCGTGTGATGTTTCGCCGACTCTGTTCATGGGTTATCTTCTTCACCACAAAGGCCTTTCCTGCTGGGCTGTGTTGAGGTCGCAAACCCAGCCAGGGTAAGGCCTTTGGCCTCTCCTACCCGGCCGACACGCTTACTGAAGGCCTAGTCTAGGCAGGCCATTCAATCTGCGGAATCGAAAAATTCGGTTCCAGCCTGCTCGTTTCCTTTCCGACAGCGATCTGACGTTGCGTAACGTCATTTGTACGGACTCTTTTAGCGGCATTGATTTCAGATGCCAACGCCGTCTGTGCTGTAGCGCCGATTGGCCGAAACTGTAAATTTGTATGATTATTTAAATCTTTGACGAACACGCGCCACAAACGTACTATCTCTTTGGCAAAGTCCACCGGCATCTCATTCAACGGTTTTGTTTGCGCGTGGTCGTCATATGTTGGTAACTGTGTAACCGGCCGCCTATCTTGCGCGTGCATCATATGACTATGAATCGGCCTTCTCCAGTGAAATTGATACAAGATCGATCCGATAAGCGGTACCTTGTACACAGTGCAATTGTAGTAATTCGCGTTTTGTCCTACGCTTGTATTCTGCGTGAAGAATTCAAACA" + in panel.refs + ) + assert panel.alts == [ + "GGCCTCGTCGGGAATGCCGGCGATGGTGACACGCGAGTTGTAGATGATCGTTGAGTGGTCTTGCTTGGACTTCCATTTCATCTTTTCGACGCGCCAGGTCTCGCGGTCCTCCGGATCTGCGCCCGGTTTGAGTTGCACATCAAGGGGATACGGCTTGACCGACTCGTAGCCGACATGTAAGTCGGCTAGTTTCCGGCCGGCGCTGGCGAGCTGGTCGAAGCGTTCGCGGGTCTCCGGTGTTGGGATGTGCGGGAGCATCTTCTTGAGGTCAGCGGCGTATTTTGTGCGGTAGGCGGGGTCATGCAGCAGGCCGTAGACGTAGTAGAAGATGTCGTCTTTGGTGACTTGGTCGCCGATCGTGTCGCGGTAGAGCTTGAGGATGACGCCGGTGATGTTGTCGACGCGGCGGTAGCCGTGGTCGTCTACTTCGGCGTTGGTGGTGGACTCGAAATCGAGTTCGCCGTCACGTGGTTCGGTCTTCTCGTAGGTCCAGCGCGGGAAGAATTGACCGTTGCTTGAGCCCCAGAATGCGAGATCGGGGATAGCGTTTAGCATCAGACACGAGAAGGGCTTGTCTGAGCCCATGCCAACCACGTAGTAACCGACATTCCCGTGCTCCGGCGTCGGAAACATCGACGGAAGCTGGTAGGTACAGTTGTTGAGCTGCTGGTTGGGGTCGAGGTAGGCGTGCTCTTTCGTAAATGGTCGGTACGTGCCGAGCCGCATTCCCGCGGGAGCGAATTCGATGCGAATGCCTTGTGCCACTTGCCGCTTGTTGATGCGGTCCCAGCTGAACTTGGCCGAGTCCACGGTAATGAGGGCGTCAACCGGCGGGGTCTTGGCGTCCCTTCCGCGGATCTCGTTGATCCGGTCGACCTCCGAGTTGTAGAAGTCGATCGTGCGTCCGATGTTGGCCTCGAGCGCACCACGTGAAAAGTTGTAACACCACGCATCCCGGCTGGTCTTCAAGCCCGCGGAATAGTTCGCGAAGACACGTGTCACGTCAAGAGCAGCCTTCTTGTCGCCGATAACCGGCCACGCGCTGAACGCGTCGTCGCGTTGGTTGACCCAGTCACCGTGCAAGTTGGGTGTGACTGTCTGCCATTCCACCGTGTCGAGGTAGCCGTCGCCGACGATCCGCAACTTCTCCTCGCGACTCAGGTAATCGCCGATGTCGCGGTAAAGGACATCGCATGGCCCGCTGTGCTTCGGATCCTTGATGCCAAGGAAGATCGCCACCGTGTTGCGACTCCCCCCGCCAAAGACCTTGCCGCCTTCCTGGCGTGAGAGTTCCCCAGCTGTGCGCTGGTTCCCCCGCAGGTTGTACACATATACCGCCGCGTAGTCGTCGGCGAGCGACAACCGCATGCCGTCTGCCGTGTTGCCGTCTATGTACCCACCATTGGAGACGAATCCGACAACACCGTTGTCACCAATGCGGTCGGTCGCCCACCGGAACGCGCGAATATACGAGTCGTACAGGCTGTTCTTCAGCTGCGCCGTCGACCGCTTCGCGTACGTCTGCTCAATCCGCCCGTCCAACGTCGGATACTTCACGTTGGCGTTCAGGTCGTTCGCGCTGCTCTGCCCCACCGAGTACGGCGGATTCCCGATGATCACGCTGATCGGCGTCGCCAGCTGTCGCAAGATCCGAGCGTTGTTGTACGGGAACATGATCGCGTCCATCGAGTCCCCGGCTTCGGAAATCTGGAACGTGTCGGCCAGCGCCATCCCGGGGAACGGCTCATAGGCGTCGGCGTCGGCGGTCTTGCCCGCCAAAGCATGGTAGGTCGACTCGATGTTCACCGCGGCGATGTAGTACGCCAGCAGCATGATCTCGTTGGCGTGCAGCTCTTGCGAGTACTTTCGGGTGAGGTCGGCGGCCGTGATCAGGTCGGACTGCAGCAGCCGGGTAATGAATGTGCCCGTCCCGGCGAAGCCGTCCAGAATATGCACGCCCTCGTCGGTCAGCCCGCGCCCGAAATGCTTGCGCGACACGAAATCAGCCGCCCGCACAATGAAGTCCACGACCTCGACCGGCGTGTACACGATCCCCAGCGCCTCGGCCTGCTTCTTGAAGCCGATGCGGAAGAACTTCTCGTACAGCTCGGCGATCACCTGCTGCTTGCCCTCGGCGCTGGTGACCTCGCCGGCGCGCCGTCGCACCGATTCGTAAAAGCCTTCCAACCGAGCGGTTTCGGCCTCCAGGCCGGCACCCCCGACGGTGTCGACCATCTTCTGCATGGCCCGCGACACCGGGTTGTGCGACGCGAAGTCATGCCCGGCGAACAGCGCGTCGAACACCGGCTTGGTGATCAGGTGCTGCGAGAGCATGCTGATCGCGTCATCGGGGGTGATCGAGTCATTGAGGTTATCGCGCAGCCCGGCCAGGAACTGCTCGAACGCCGCCGCCGCCGTAGCGTCGGCGCCGCCGAGCAGGGCGTGGATACGGGTGGTCAGCGTCGCGGCGATGTCGGCGACATCGGCGGCCCACTGCTCCCAATAGGTCCGGGTGCCAACCTTGTCGACGATGCGCGCGTAGATCGCTTCCTGCCACTGCGACAACGAGAACATCGCCAACTGCTCCGCGACGGCGGGTCCCGCCTCGTCGGAGGTCGGCCCGATGTGACCGCCCAACAGCTTGTCGCTGCCTTCACCGGTCTTCGTCGGCTTCACGTTCAGCGCAATGCTGTTCACCATCGCGTCGAAGCGCTCGTCGTGCGACCGCAACGCGTTGAGGACCTGCCACACCACCTTGAACCGTTTGTTGTCGGCCAACGCGGCAGACGGCTCGACACCCTCGGGCACCGCCACCGGCAAGATGACGTACCCGTAGTCCTTGCCGGGCGACTTGCGCATCACCCGACCGACCGACTGCACCACGTCGACGATGGAATTGCGCGGATTCAGGAACAGCACCGCGTCCAGCGCGGGCACGTCGACCCCTTCGGAGAGGCAGCGGGCGTTGGACAGGATGCGGCATTCATCCTCGGCGACCACGCCTTTGAGCCAGGCCAGCTGTTCGTTGCGGACCAGCGCGTTGAACGTCCCGTCCACGTGGCGCACCGAACACGCCAGGCCCGGGCCGTCGTCAACCA" + ] diff --git a/tests/probe_tests/test_models.py b/tests/probe_tests/test_models.py index cb2c9198..377970f0 100644 --- a/tests/probe_tests/test_models.py +++ b/tests/probe_tests/test_models.py @@ -1,59 +1,78 @@ -import sys -import json import os +import pytest -sys.path.append(".") from mykrobe.probes.models import Mutation from mykrobe.annotation.genes import GeneAminoAcidChangeToDNAVariants -DATA_DIR = os.path.join("tests", "ref_data") - - -class TestMutation(): - - def setup(self): - self.reference_filepath = f"{DATA_DIR}/NC_000962.3.fasta" - self.reference = os.path.basename( - self.reference_filepath).split('.fa')[0] - self.aa2dna = GeneAminoAcidChangeToDNAVariants( - f"{DATA_DIR}/NC_000962.3.fasta", - f"{DATA_DIR}/NC_000962.3.gb") - - def teardown(self): - pass - - def test_mutation_name_forward_strand(self): - gene = "rpoB" - mutation_string = "S450L" - is_protein_coding_var = True - assert set(self.aa2dna.get_variant_names( - gene, mutation_string, is_protein_coding_var)) == set(["TCG761154TTA", "TCG761154TTG", "TCG761154CTA", "TCG761154CTT", "TCG761154CTC", "TCG761154CTG"]) - mutation = Mutation(reference=self.reference, - var_name="TCG761154TTA", - gene=self.aa2dna.get_gene("rpoB"), - mut="S450L") - assert mutation.mutation_output_name == "S450L" - - def test_mutation_name_reverse_strand(self): - gene = "gid" - mutation_string = "I11N" - is_protein_coding_var = True - assert set(self.aa2dna.get_variant_names( - gene, mutation_string, is_protein_coding_var)) == set(["GAT4408170ATT", "GAT4408170GTT"]) - mutation = Mutation(reference=self.reference, - var_name="GAT4408170ATT", - gene=self.aa2dna.get_gene("gid"), - mut="I11N") - assert mutation.mutation_output_name == "I11N" - - def test_mutation_name_dna_space(self): - gene = "pncA" - mutation_string = "C18CCA" - is_protein_coding_var = False - assert set(self.aa2dna.get_variant_names( - gene, mutation_string, is_protein_coding_var)) == set(["G2289224TGG"]) - mutation = Mutation(reference=self.reference, - var_name=self.aa2dna.get_variant_names( - gene, mutation_string, is_protein_coding_var)[0], - gene=self.aa2dna.get_gene(gene), - mut=mutation_string) - assert mutation.mutation_output_name == "C18CCA" + + +@pytest.fixture(scope="module") +def ref_data(): + data_dir = os.path.join("tests", "ref_data") + reference_filepath = f"{data_dir}/NC_000962.3.fasta" + reference = os.path.basename(reference_filepath).split(".fa")[0] + aa2dna = GeneAminoAcidChangeToDNAVariants( + f"{data_dir}/NC_000962.3.fasta", f"{data_dir}/NC_000962.3.gb" + ) + return reference, aa2dna + + +def test_mutation_name_forward_strand(ref_data): + reference, aa2dna = ref_data + gene = "rpoB" + mutation_string = "S450L" + is_protein_coding_var = True + assert set( + aa2dna.get_variant_names(gene, mutation_string, is_protein_coding_var) + ) == set( + [ + "TCG761154TTA", + "TCG761154TTG", + "TCG761154CTA", + "TCG761154CTT", + "TCG761154CTC", + "TCG761154CTG", + ] + ) + mutation = Mutation( + reference=reference, + var_name="TCG761154TTA", + gene=aa2dna.get_gene("rpoB"), + mut="S450L", + ) + assert mutation.mutation_output_name == "S450L" + + +def test_mutation_name_reverse_strand(ref_data): + reference, aa2dna = ref_data + gene = "gid" + mutation_string = "I11N" + is_protein_coding_var = True + assert set( + aa2dna.get_variant_names(gene, mutation_string, is_protein_coding_var) + ) == set(["GAT4408170ATT", "GAT4408170GTT"]) + mutation = Mutation( + reference=reference, + var_name="GAT4408170ATT", + gene=aa2dna.get_gene("gid"), + mut="I11N", + ) + assert mutation.mutation_output_name == "I11N" + + +def test_mutation_name_dna_space(ref_data): + reference, aa2dna = ref_data + gene = "pncA" + mutation_string = "C18CCA" + is_protein_coding_var = False + assert set( + aa2dna.get_variant_names(gene, mutation_string, is_protein_coding_var) + ) == set(["G2289224TGG"]) + mutation = Mutation( + reference=reference, + var_name=aa2dna.get_variant_names(gene, mutation_string, is_protein_coding_var)[ + 0 + ], + gene=aa2dna.get_gene(gene), + mut=mutation_string, + ) + assert mutation.mutation_output_name == "C18CCA" diff --git a/tests/probe_tests/test_probe_generation.py b/tests/probe_tests/test_probe_generation.py index e9cd28f9..7f5519fc 100644 --- a/tests/probe_tests/test_probe_generation.py +++ b/tests/probe_tests/test_probe_generation.py @@ -1,10 +1,12 @@ import os -import pytest from mykrobe.probes.models import Mutation from mykrobe.probes import load_dna_vars_txt_file + def test_load_dna_vars_txt_file(): - infile = os.path.join("tests", "probe_tests", "test_probe_generation.load_dna_vars_txt_file.tsv") + infile = os.path.join( + "tests", "probe_tests", "test_probe_generation.load_dna_vars_txt_file.tsv" + ) got_mutations, got_lineage = load_dna_vars_txt_file(infile, "ref") expect_mutations = [ Mutation(reference="ref", var_name="G42A"), @@ -13,10 +15,13 @@ def test_load_dna_vars_txt_file(): Mutation(reference="ref", var_name="A72T"), ] expect_lineage = { - "G42A": {"name": "lineage1", "use_ref_allele": False}, - "C52G": {"name": "lineage1.2", "use_ref_allele": True}, - "A72T": {"name": "lineage1.2.3", "use_ref_allele": False, "report_name": "lineage1.2.3_report_name"}, + "G42A": {"name": "lineage1", "use_ref_allele": False}, + "C52G": {"name": "lineage1.2", "use_ref_allele": True}, + "A72T": { + "name": "lineage1.2.3", + "use_ref_allele": False, + "report_name": "lineage1.2.3_report_name", + }, } assert got_mutations == expect_mutations assert got_lineage == expect_lineage - diff --git a/tests/probe_tests/test_snp_indel_interaction.py b/tests/probe_tests/test_snp_indel_interaction.py index a7e5dc9b..22b26b03 100644 --- a/tests/probe_tests/test_snp_indel_interaction.py +++ b/tests/probe_tests/test_snp_indel_interaction.py @@ -1,460 +1,410 @@ -import os - -from mongoengine import connect - from base import assert_no_overlapping_kmers -from mykrobe.probes import AlleleGenerator -from mykrobe.variants.schema.models import Reference -from mykrobe.variants.schema.models import ReferenceSet from mykrobe.variants.schema.models import Variant -from mykrobe.variants.schema.models import VariantSet -DB = connect("mykrobe-test") -DATA_DIR = os.path.join("tests", "ref_data") +def test_ins_with_SNP_context(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=31, + alternate_bases=["ATTT"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=32, + alternate_bases=["T"], + ) + panel = pg.create(v, context=[v2]) + # assert_no_overlapping_kmers(panel) ### This test seems to fail sometimes, and pass othertimes... + assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs + assert sorted(panel.alts) == sorted( + [ + "GATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTG", + "TTAAAGATAGAAATACACGATGCGAGCATTTTTCAAATTTCATAACATCACCATGAGTTTG", + ] + ) + + +def test_del_with_SNP_context1(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="AA", + start=31, + alternate_bases=["A"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="T", + start=33, + alternate_bases=["A"], + ) + panel = pg.create(v, context=[v2]) + assert_no_overlapping_kmers(panel) + assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs + assert sorted(panel.alts) == sorted( + [ + "ATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGA", + "GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG", + ] + ) + + +def test_del_with_SNP_context2(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="AA", + start=31, + alternate_bases=["A"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=32, + alternate_bases=["T"], + ) + panel = pg.create(v, context=[v2]) + assert_no_overlapping_kmers(panel) + assert pg._remove_overlapping_contexts(v, [v2]) == [] + assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs + assert sorted(panel.alts) == sorted( + ["GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG"] + ) + + +def test_del_with_ins_context1(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="AAT", + start=31, + alternate_bases=["A"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="T", + start=4, + alternate_bases=["TTTT"], + ) + panel = pg.create(v, context=[v2]) + assert_no_overlapping_kmers(panel) + assert pg._remove_overlapping_contexts(v, [v2]) == [v2] + assert "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" in panel.refs + assert sorted(panel.alts) == sorted( + [ + "GATTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTGAT", + "TTTTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTG", + ] + ) + + +def test_del_with_ins_context2(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="ATC", + start=32, + alternate_bases=["A"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=1, + alternate_bases=["CTTT"], + ) + panel = pg.create(v, context=[v2]) + assert_no_overlapping_kmers(panel) + assert pg._remove_overlapping_contexts(v, [v2]) == [v2] + assert pg._remove_contexts_not_within_k(v, [v2]) == [] + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert sorted(panel.alts) == sorted( + ["ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT"] + ) -class TestINDELandSNPSAlleleGenerator: - def setup(self): - DB.drop_database("mykrobe-test") - self.pg = AlleleGenerator( - reference_filepath=f"{DATA_DIR}/BX571856.1.fasta", kmer=31 - ) - self.pg2 = AlleleGenerator( - reference_filepath=f"{DATA_DIR}/NC_000962.3.fasta", kmer=31 - ) - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.variant_set = VariantSet.create_and_save( - name="this_vcf_file", reference_set=self.reference_set - ) - self.variant_sets = [self.variant_set] - self.reference = Reference().create_and_save( - name="ref", md5checksum="sre", reference_sets=[self.reference_set] - ) - def teardown(self): - DB.drop_database("mykrobe-test") +def test_del_with_ins_context3(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="ATC", + start=32, + alternate_bases=["A"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="T", + start=5, + alternate_bases=["TT"], + ) + panel = pg.create(v, context=[v2]) + assert_no_overlapping_kmers(panel) + assert pg._remove_overlapping_contexts(v, [v2]) == [v2] + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert sorted(panel.alts) == sorted( + [ + "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + ] + ) - def test_ins_with_SNP_context(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=31, - alternate_bases=["ATTT"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=32, - alternate_bases=["T"], - ) - panel = self.pg.create(v, context=[v2]) - # assert_no_overlapping_kmers(panel) ### This test seems to fail sometimes, and pass othertimes... - assert ( - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - [ - "GATTAAAGATAGAAATACACGATGCGAGCATTTATCAAATTTCATAACATCACCATGAGTTTG", - "TTAAAGATAGAAATACACGATGCGAGCATTTTTCAAATTTCATAACATCACCATGAGTTTG", - ] - ) +def test_del_with_ins_context4(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="ATC", + start=32, + alternate_bases=["A"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="T", + start=5, + alternate_bases=["TT"], + ) + v3 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="T", + start=5, + alternate_bases=["TG"], + ) + panel = pg.create(v, context=[v2, v3]) + assert_no_overlapping_kmers(panel) + assert pg._remove_overlapping_contexts(v, [v2, v3]) == [v2, v3] + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert sorted(panel.alts) == sorted( + [ + "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + "TTGAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + ] + ) - def test_del_with_SNP_context1(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="AA", - start=31, - alternate_bases=["A"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=33, - alternate_bases=["A"], - ) - panel = self.pg.create(v, context=[v2]) - assert_no_overlapping_kmers(panel) - assert ( - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - [ - "ATTAAAGATAGAAATACACGATGCGAGCAACAAATTTCATAACATCACCATGAGTTTGA", - "GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG", - ] - ) - def test_del_with_SNP_context2(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="AA", - start=31, - alternate_bases=["A"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=32, - alternate_bases=["T"], - ) - panel = self.pg.create(v, context=[v2]) - assert_no_overlapping_kmers(panel) - assert self.pg._remove_overlapping_contexts(v, [v2]) == [] - assert ( - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - ["GATTAAAGATAGAAATACACGATGCGAGCATCAAATTTCATAACATCACCATGAGTTTG"] - ) +def test_del_with_ins_context5(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="ATC", + start=32, + alternate_bases=["A"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="T", + start=5, + alternate_bases=["TT"], + ) + v3 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=6, + alternate_bases=["AG"], + ) + panel = pg.create(v, context=[v2, v3]) + assert_no_overlapping_kmers(panel) + assert pg._remove_overlapping_contexts(v, [v2, v3]) == [v2, v3] + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert sorted(panel.alts) == sorted( + [ + "TTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGA", + "TTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + ] + ) - def test_del_with_ins_context1(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="AAT", - start=31, - alternate_bases=["A"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=4, - alternate_bases=["TTTT"], - ) - panel = self.pg.create(v, context=[v2]) - assert_no_overlapping_kmers(panel) - assert self.pg._remove_overlapping_contexts(v, [v2]) == [v2] - assert ( - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - [ - "GATTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTGAT", - "TTTTAAAGATAGAAATACACGATGCGAGCACAAATTTCATAACATCACCATGAGTTTG", - ] - ) - def test_del_with_ins_context2(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="ATC", - start=32, - alternate_bases=["A"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=1, - alternate_bases=["CTTT"], - ) - panel = self.pg.create(v, context=[v2]) - assert_no_overlapping_kmers(panel) - assert self.pg._remove_overlapping_contexts(v, [v2]) == [v2] - assert self.pg._remove_contexts_not_within_k(v, [v2]) == [] - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - ["ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT"] - ) +def test_del_with_ins_context_where_base_is_deleted1(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="ATC", + start=32, + alternate_bases=["A"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="T", + start=33, + alternate_bases=["C"], + ) + panel = pg.create(v, context=[v2]) + assert_no_overlapping_kmers(panel) + assert pg._remove_overlapping_contexts(v, [v2]) == [] + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert sorted(panel.alts) == sorted( + ["ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT"] + ) - def test_del_with_ins_context3(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="ATC", - start=32, - alternate_bases=["A"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=5, - alternate_bases=["TT"], - ) - panel = self.pg.create(v, context=[v2]) - assert_no_overlapping_kmers(panel) - assert self.pg._remove_overlapping_contexts(v, [v2]) == [v2] - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - [ - "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - ] - ) - def test_del_with_ins_context4(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="ATC", - start=32, - alternate_bases=["A"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=5, - alternate_bases=["TT"], - ) - v3 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=5, - alternate_bases=["TG"], - ) - panel = self.pg.create(v, context=[v2, v3]) - assert_no_overlapping_kmers(panel) - assert self.pg._remove_overlapping_contexts(v, [v2, v3]) == [v2, v3] - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - [ - "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - "TTGAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - ] - ) +def test_del_with_ins_context_where_base_is_deleted2(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="ATC", + start=32, + alternate_bases=["A"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="TAAA", + start=5, + alternate_bases=["T"], + ) + v3 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=7, + alternate_bases=["AG"], + ) + panel = pg.create(v, context=[v2, v3]) + assert_no_overlapping_kmers(panel) + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert sorted(panel.alts) == sorted( + [ + "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + "CGATTGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATC", + "TTAAGAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + ] + ) - def test_del_with_ins_context5(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="ATC", - start=32, - alternate_bases=["A"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=5, - alternate_bases=["TT"], - ) - v3 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=6, - alternate_bases=["AG"], - ) - panel = self.pg.create(v, context=[v2, v3]) - assert_no_overlapping_kmers(panel) - assert self.pg._remove_overlapping_contexts(v, [v2, v3]) == [v2, v3] - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - [ - "TTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGA", - "TTAGAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - "TTTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - ] - ) + panel = pg.create(v, context=[v3, v2]) + assert_no_overlapping_kmers(panel) + assert "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" in panel.refs + assert sorted(panel.alts) == sorted( + [ + "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + "CGATTGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATC", + "TTAAGAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", + ] + ) - def test_del_with_ins_context_where_base_is_deleted1(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="ATC", - start=32, - alternate_bases=["A"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=33, - alternate_bases=["C"], - ) - panel = self.pg.create(v, context=[v2]) - assert_no_overlapping_kmers(panel) - assert self.pg._remove_overlapping_contexts(v, [v2]) == [] - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - ["ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT"] - ) - def test_del_with_ins_context_where_base_is_deleted2(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="ATC", - start=32, - alternate_bases=["A"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="TAAA", - start=5, - alternate_bases=["T"], - ) - v3 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=7, - alternate_bases=["AG"], - ) - panel = self.pg.create(v, context=[v2, v3]) - assert_no_overlapping_kmers(panel) - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - [ - "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - "CGATTGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATC", - "TTAAGAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - ] - ) +def test_snp_with_replace_context(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="G", + start=2338961, + alternate_bases=["A"], + ) + v1 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="GGATG", + start=2338990, + alternate_bases=["CGATA"], + ) + panel = pg2.create(v, context=[v1]) + assert_no_overlapping_kmers(panel) + assert "CGACTAGCCACCATCGCGCATCAGTGCGAGGTCAAAAGCGACCAAAGCGAGCAAGTCGCGG" in panel.refs - panel = self.pg.create(v, context=[v3, v2]) - assert_no_overlapping_kmers(panel) - assert ( - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - in panel.refs - ) - assert sorted(panel.alts) == sorted( - [ - "ATTAAAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - "CGATTGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGATC", - "TTAAGAGATAGAAATACACGATGCGAGCAAAAATTTCATAACATCACCATGAGTTTGAT", - ] - ) + assert set(panel.alts) == set( + [ + "CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCCG", + "CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCGG", + ] + ) - def test_snp_with_replace_context(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="G", - start=2338961, - alternate_bases=["A"], - ) - v1 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="GGATG", - start=2338990, - alternate_bases=["CGATA"], - ) - panel = self.pg2.create(v, context=[v1]) - assert_no_overlapping_kmers(panel) - assert ( - "CGACTAGCCACCATCGCGCATCAGTGCGAGGTCAAAAGCGACCAAAGCGAGCAAGTCGCGG" - in panel.refs - ) - assert set(panel.alts) == set( - [ - "CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCCG", - "CGACTAGCCACCATCGCGCATCAGTGCGAGATCAAAAGCGACCAAAGCGAGCAAGTCGCGG", - ] - ) +def test_indel_snp_indel_context(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="TCGCGTGGC", + start=4021459, + alternate_bases=["GCGAGCAGA"], + ) + v1 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=4021455, + alternate_bases=["ATCTAGCCGCAAG"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="T", + start=4021489, + alternate_bases=["G"], + ) + panel = pg2.create(v) # , context = [v1, v2]) + assert_no_overlapping_kmers(panel) + assert "ATCATGCGATTCTGCGTCTGCTCGCGAGGCTCGCGTGGCCGCCGGCGCTGGCGGGCGATCT" in panel.refs - def test_indel_snp_indel_context(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="TCGCGTGGC", - start=4021459, - alternate_bases=["GCGAGCAGA"], - ) - v1 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=4021455, - alternate_bases=["ATCTAGCCGCAAG"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=4021489, - alternate_bases=["G"], - ) - panel = self.pg2.create(v) # , context = [v1, v2]) - assert_no_overlapping_kmers(panel) - assert ( - "ATCATGCGATTCTGCGTCTGCTCGCGAGGCTCGCGTGGCCGCCGGCGCTGGCGGGCGATCT" - in panel.refs - ) + panel = pg2.create(v, context=[v1, v2]) + assert_no_overlapping_kmers(panel) + assert sorted(panel.alts) == sorted( + [ + "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCG", + "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCT", + "TGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCG", + "TGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCT", + ] + ) - panel = self.pg2.create(v, context=[v1, v2]) - assert_no_overlapping_kmers(panel) - assert sorted(panel.alts) == sorted( - [ - "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCG", - "ATCATGCGATTCTGCGTCTGCTCGCGAGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCT", - "TGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCG", - "TGCGTCTGCTCGCGATCTAGCCGCAAGGGCGCGAGCAGACGCCGGCGCTGGCGGGCGATCT", - ] - ) - def test_complex_context(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="ATTT", - start=1503643, - alternate_bases=["A"], - ) - v1 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="CCT", - start=1503615, - alternate_bases=["C"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=1503655, - alternate_bases=["ATGCCGCCGCC"], - ) - panel = self.pg2.create(v, context=[v1, v2]) - assert_no_overlapping_kmers(panel) - assert ( - "ATCCTGGAGCCCACCAGCGGAAACACCGGCATTTCGCTGGCGATGGCGGCCCGGTTGAAGG" - in panel.refs - ) - assert set(panel.alts) == set( - [ - "CCATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGGT", - "TCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGG", - "ATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGG", - "TCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGG", - ] - ) +def test_complex_context(variant_sets_and_reference, pg2): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="ATTT", + start=1503643, + alternate_bases=["A"], + ) + v1 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="CCT", + start=1503615, + alternate_bases=["C"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=1503655, + alternate_bases=["ATGCCGCCGCC"], + ) + panel = pg2.create(v, context=[v1, v2]) + assert_no_overlapping_kmers(panel) + assert "ATCCTGGAGCCCACCAGCGGAAACACCGGCATTTCGCTGGCGATGGCGGCCCGGTTGAAGG" in panel.refs + assert set(panel.alts) == set( + [ + "CCATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGGT", + "TCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGGCGGCCCGGTTGAAGGGG", + "ATCGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGG", + "TCCTGGAGCCCACCAGCGGAAACACCGGCACGCTGGCGATGCCGCCGCCTGGCGGCCCGG", + ] + ) diff --git a/tests/probe_tests/test_snp_only.py b/tests/probe_tests/test_snp_only.py index c43938ea..306a1e77 100644 --- a/tests/probe_tests/test_snp_only.py +++ b/tests/probe_tests/test_snp_only.py @@ -1,339 +1,333 @@ -import os - import pytest -from mongoengine import connect from base import assert_no_overlapping_kmers from mykrobe.probes import AlleleGenerator -from mykrobe.variants.schema.models import Reference -from mykrobe.variants.schema.models import ReferenceSet from mykrobe.variants.schema.models import Variant -from mykrobe.variants.schema.models import VariantSet -DB = connect("mykrobe-test") -DATA_DIR = os.path.join("tests", "ref_data") +def test_panel_generator(pg): + pg = AlleleGenerator(reference_filepath="tests/ref_data/BX571856.1.fasta", kmer=31) + assert pg.ref is not None -class TestSNPAlleleGenerator: - def setup(self): - DB.drop_database("mykrobe-test") - self.pg = AlleleGenerator( - reference_filepath=f"{DATA_DIR}/BX571856.1.fasta", kmer=31 - ) - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.variant_set = VariantSet.create_and_save( - name="this_vcf_file", reference_set=self.reference_set - ) - self.variant_sets = [self.variant_set] - self.reference = Reference().create_and_save( - name="ref", md5checksum="sre", reference_sets=[self.reference_set] - ) - def test_panel_generator(self): - pg = AlleleGenerator(reference_filepath=f"{DATA_DIR}/BX571856.1.fasta", kmer=31) - assert pg.ref is not None +def test_simple_snp_variant(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=31, + alternate_bases=["T"], + ) + panel = pg.create(v) + assert panel.refs[0][:31] != panel.alts[0][:31] + assert panel.refs[0][-32:] != panel.alts[0][-32:] + assert panel.refs[0][-31:] != panel.alts[0][-31:] + + assert_no_overlapping_kmers(panel) + + assert panel.refs == [ + "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" + ] + assert panel.alts == [ + "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG" + ] + assert pg._calculate_length_delta_from_indels(v, []) == 0 + assert v.is_indel is False + + +def test_simple_variant2(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=32, + alternate_bases=["T"], + ) + panel = pg.create(v) + assert_no_overlapping_kmers(panel) + + assert panel.refs == [ + "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" + ] + assert panel.alts == [ + "GATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGA" + ] + - def test_simple_snp_variant(self): +def test_simple_variant_invalid(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + with pytest.raises(ValueError): v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", + variant_sets=variant_sets, + reference=reference, + reference_bases="T", start=31, alternate_bases=["T"], ) - panel = self.pg.create(v) - assert panel.refs[0][:31] != panel.alts[0][:31] - assert panel.refs[0][-32:] != panel.alts[0][-32:] - assert panel.refs[0][-31:] != panel.alts[0][-31:] + pg.create(v) - assert_no_overlapping_kmers(panel) - assert panel.refs == [ - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - ] - assert panel.alts == [ - "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG" - ] - assert self.pg._calculate_length_delta_from_indels(v, []) == 0 - assert v.is_indel is False +def test_simple_variant_start(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=1, + alternate_bases=["T"], + ) + panel = pg.create(v) + # assert_no_overlapping_kmers(panel) ## Will have overlapping kmers only if the SNP is in the i + assert panel.refs == [ + "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" + ] + assert panel.alts == [ + "TGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" + ] - def test_simple_variant2(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=32, - alternate_bases=["T"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert panel.refs == [ - "GATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGA" - ] - assert panel.alts == [ - "GATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGA" - ] +def test_simple_variant_end(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=2902618, + alternate_bases=["T"], + ) + panel = pg.create(v) + assert_no_overlapping_kmers(panel) - def test_simple_variant_invalid(self): - with pytest.raises(ValueError) as cm: - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=31, - alternate_bases=["T"], - ) - panel = self.pg.create(v) + assert panel.refs == [ + "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" + ] + assert panel.alts == [ + "TTTATACTACTGCTCAATTTTTTTACTTTTTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" + ] - def test_simple_variant_start(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=1, - alternate_bases=["T"], - ) - panel = self.pg.create(v) - # assert_no_overlapping_kmers(panel) ## Will have overlapping kmers only if the SNP is in the i - assert panel.refs == [ - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - ] - assert panel.alts == [ - "TGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG" - ] + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="T", + start=2902616, + alternate_bases=["C"], + ) + panel = pg.create(v) + assert panel.refs == [ + "ATTTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNN" + ] + assert panel.alts == [ + "ATTTTATACTACTGCTCAATTTTTTTACTTCTATNNNNNNNNNNNNNNNNNNNNNNNNNNN" + ] - def test_simple_variant_end(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=2902618, - alternate_bases=["T"], - ) - panel = self.pg.create(v) - assert_no_overlapping_kmers(panel) - assert panel.refs == [ - "TTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" - ] - assert panel.alts == [ - "TTTATACTACTGCTCAATTTTTTTACTTTTTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNN" - ] +def test_simple_variant_with_nearby_snp(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=31, + alternate_bases=["T"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=32, + alternate_bases=["T"], + ) + panel = pg.create(v, context=[v2]) + assert_no_overlapping_kmers(panel) - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="T", - start=2902616, - alternate_bases=["C"], - ) - panel = self.pg.create(v) - assert panel.refs == [ - "ATTTTATACTACTGCTCAATTTTTTTACTTTTATNNNNNNNNNNNNNNNNNNNNNNNNNNN" + assert set(panel.refs) == set( + [ + "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG", ] - assert panel.alts == [ - "ATTTTATACTACTGCTCAATTTTTTTACTTCTATNNNNNNNNNNNNNNNNNNNNNNNNNNN" + ) + assert set(panel.alts) == set( + [ + "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG", ] + ) - def test_simple_variant_with_nearby_snp(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=31, - alternate_bases=["T"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=32, - alternate_bases=["T"], - ) - panel = self.pg.create(v, context=[v2]) - assert_no_overlapping_kmers(panel) - assert set(panel.refs) == set( - [ - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG", - ] - ) - assert set(panel.alts) == set( - [ - "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG", - ] - ) +def test_simple_variant_with_multiple_nearby_snps(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=31, + alternate_bases=["T"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=32, + alternate_bases=["T"], + ) + v3 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=30, + alternate_bases=["G"], + ) + + panel = pg.create(v, context=[v2, v3]) + assert_no_overlapping_kmers(panel) + + assert panel.refs == [ + "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTG", + ] + assert panel.alts == [ + "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTG", + ] - def test_simple_variant_with_multiple_nearby_snps(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=31, - alternate_bases=["T"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=32, - alternate_bases=["T"], - ) - v3 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=30, - alternate_bases=["G"], - ) - panel = self.pg.create(v, context=[v2, v3]) - assert_no_overlapping_kmers(panel) +def test_simple_variant_with_multiple_nearby_snps2(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=31, + alternate_bases=["T"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=32, + alternate_bases=["T"], + ) + v3 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=30, + alternate_bases=["G"], + ) + v4 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=30, + alternate_bases=["T"], + ) + v5 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=30, + alternate_bases=["A"], + ) - assert panel.refs == [ - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTGAT", - "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTGAT", - "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTGAT", - "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTGAT", + assert sorted(pg._split_context([v, v3, v4])) == sorted([[v, v4], [v, v3]]) + assert (pg._split_context([v3, v4])) == [[v4], [v3]] + assert (pg._split_context([v, v3, v4, v5])) == [[v, v4, v5], [v, v3, v5]] + panel = pg.create(v, context=[v2, v3, v4, v5]) + assert_no_overlapping_kmers(panel) + assert sorted(panel.refs) == sorted( + [ + "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGAAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGAATTCAAATTTCATAACATCACCATGAGTTTG", ] - assert panel.alts == [ - "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTGAT", - "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTGAT", - "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTGAT", - "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTGAT", + ) + assert sorted(panel.alts) == sorted( + [ + "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGATATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGATTTCAAATTTCATAACATCACCATGAGTTTG", ] + ) - def test_simple_variant_with_multiple_nearby_snps2(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=31, - alternate_bases=["T"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=32, - alternate_bases=["T"], - ) - v3 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=30, - alternate_bases=["G"], - ) - v4 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=30, - alternate_bases=["T"], - ) - v5 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=30, - alternate_bases=["A"], - ) - assert sorted(self.pg._split_context([v, v3, v4])) == sorted([[v, v4], [v, v3]]) - assert (self.pg._split_context([v3, v4])) == [[v4], [v3]] - assert (self.pg._split_context([v, v3, v4, v5])) == [[v, v4, v5], [v, v3, v5]] - panel = self.pg.create(v, context=[v2, v3, v4, v5]) - assert_no_overlapping_kmers(panel) - assert sorted(panel.refs) == sorted( - [ - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGAAATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGAATTCAAATTTCATAACATCACCATGAGTTTG", - ] - ) - assert sorted(panel.alts) == sorted( - [ - "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGATATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGATTTCAAATTTCATAACATCACCATGAGTTTG", - ] - ) - - def test_simple_variant_with_multiple_nearby_snps(self): - v = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=31, - alternate_bases=["T"], - ) - v2 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=32, - alternate_bases=["T"], - ) - v5 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="A", - start=32, - alternate_bases=["G"], - ) - v3 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=30, - alternate_bases=["G"], - ) - v4 = Variant.create( - variant_sets=self.variant_sets, - reference=self.reference, - reference_bases="C", - start=30, - alternate_bases=["T"], - ) - panel = self.pg.create(v, context=[v2, v3, v4, v5]) - assert_no_overlapping_kmers(panel) - assert sorted(panel.refs) == sorted( - [ - "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGCAGTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGAGTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTAGTCAAATTTCATAACATCACCATGAGTTTG", - ] - ) - assert sorted(panel.alts) == sorted( - [ - "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGCTGTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGGTGTCAAATTTCATAACATCACCATGAGTTTG", - "CGATTAAAGATAGAAATACACGATGCGAGTTGTCAAATTTCATAACATCACCATGAGTTTG", - ] - ) +def test_simple_variant_with_multiple_nearby_snps3(variant_sets_and_reference, pg): + variant_sets, reference = variant_sets_and_reference + v = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=31, + alternate_bases=["T"], + ) + v2 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=32, + alternate_bases=["T"], + ) + v5 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="A", + start=32, + alternate_bases=["G"], + ) + v3 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=30, + alternate_bases=["G"], + ) + v4 = Variant.create( + variant_sets=variant_sets, + reference=reference, + reference_bases="C", + start=30, + alternate_bases=["T"], + ) + panel = pg.create(v, context=[v2, v3, v4, v5]) + assert_no_overlapping_kmers(panel) + assert sorted(panel.refs) == sorted( + [ + "CGATTAAAGATAGAAATACACGATGCGAGCAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCATTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGATTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTAATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTATTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCAGTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGAGTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTAGTCAAATTTCATAACATCACCATGAGTTTG", + ] + ) + assert sorted(panel.alts) == sorted( + [ + "CGATTAAAGATAGAAATACACGATGCGAGCTATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCTTTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGTATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGTTTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTTATCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTTTTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGCTGTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGGTGTCAAATTTCATAACATCACCATGAGTTTG", + "CGATTAAAGATAGAAATACACGATGCGAGTTGTCAAATTTCATAACATCACCATGAGTTTG", + ] + ) diff --git a/tests/species_data_tests/test_data_dir.py b/tests/species_data_tests/test_data_dir.py index ee472f8c..fa266fd9 100644 --- a/tests/species_data_tests/test_data_dir.py +++ b/tests/species_data_tests/test_data_dir.py @@ -9,6 +9,7 @@ data_dir = os.path.dirname(os.path.abspath(__file__)) + def test_data_dir(): # This is a long test that runs through the entire reference data process # that a user might do: update metadata, install a species, update a @@ -62,7 +63,9 @@ def test_data_dir(): assert ddir.species_is_installed("species1") assert ddir.installed_species() == ["species1"] expect_manifest_with_species1 = copy.deepcopy(expect_manifest) - expect_manifest_with_species1["species1"]["installed"] = copy.copy(manifest_data["species1"]) + expect_manifest_with_species1["species1"]["installed"] = copy.copy( + manifest_data["species1"] + ) assert ddir.manifest == expect_manifest_with_species1 # Want to test we can get species1 from the data dir. @@ -104,8 +107,12 @@ def test_data_dir(): # Now update species1 with the force option and check it worked. ddir.add_or_replace_species_data(species1_tarball, force=True) - expect_manifest_with_species1["species1"]["latest"] = copy.copy(manifest_data["species1"]) - expect_manifest_with_species1["species1"]["installed"] = copy.copy(manifest_data["species1"]) + expect_manifest_with_species1["species1"]["latest"] = copy.copy( + manifest_data["species1"] + ) + expect_manifest_with_species1["species1"]["installed"] = copy.copy( + manifest_data["species1"] + ) assert ddir.manifest == expect_manifest_with_species1 assert ddir.species_is_installed("species1") assert ddir.installed_species() == ["species1"] diff --git a/tests/species_data_tests/test_species_dir.py b/tests/species_data_tests/test_species_dir.py index 38eb036e..88d8b632 100644 --- a/tests/species_data_tests/test_species_dir.py +++ b/tests/species_data_tests/test_species_dir.py @@ -33,7 +33,7 @@ def test_species_dir(): "hierarchy": None, }, }, - } + }, } with open(os.path.join(temp_dir, "manifest.json"), "w") as f: json.dump(manifest_data, f, indent=2, sort_keys=True) @@ -85,7 +85,6 @@ def test_species_dir(): pass assert sdir.sanity_check() - manifest_data["panels"]["panel2"] = { "description": "description of panel2", "reference_genome": "NC42", diff --git a/tests/stats_tests/test_coverage_lik.py b/tests/stats_tests/test_coverage_lik.py index 68288e63..4b7cd5e2 100644 --- a/tests/stats_tests/test_coverage_lik.py +++ b/tests/stats_tests/test_coverage_lik.py @@ -9,7 +9,8 @@ def test_percentage_coverage(): assert percent_coverage_from_expected_coverage( - 100) > percent_coverage_from_expected_coverage(10) + 100 + ) > percent_coverage_from_expected_coverage(10) assert percent_coverage_from_expected_coverage(100) == 1 assert percent_coverage_from_expected_coverage(1) < 1 @@ -21,46 +22,28 @@ def test_log_factorial(): def test_log_lik_depth(): - assert exp( - log_lik_depth( - expected_depth=10, - depth=10)) > exp( - log_lik_depth( - expected_depth=10, - depth=1)) - assert exp( - log_lik_depth( - expected_depth=10, - depth=10)) > exp( - log_lik_depth( - expected_depth=10, - depth=8)) - assert exp( - log_lik_depth( - expected_depth=10, - depth=10)) == exp( - log_lik_depth( - expected_depth=10, - depth=9)) - assert exp( - log_lik_depth( - expected_depth=10, - depth=10)) > exp( - log_lik_depth( - expected_depth=10, - depth=11)) - assert log_lik_depth( - expected_depth=100, - depth=50) < log_lik_depth( - expected_depth=10, - depth=9) - with pytest.raises(ValueError) as cm: + assert exp(log_lik_depth(expected_depth=10, depth=10)) > exp( + log_lik_depth(expected_depth=10, depth=1) + ) + assert exp(log_lik_depth(expected_depth=10, depth=10)) > exp( + log_lik_depth(expected_depth=10, depth=8) + ) + assert exp(log_lik_depth(expected_depth=10, depth=10)) == exp( + log_lik_depth(expected_depth=10, depth=9) + ) + assert exp(log_lik_depth(expected_depth=10, depth=10)) > exp( + log_lik_depth(expected_depth=10, depth=11) + ) + assert log_lik_depth(expected_depth=100, depth=50) < log_lik_depth( + expected_depth=10, depth=9 + ) + with pytest.raises(ValueError): log_lik_depth(expected_depth=0, depth=0) - with pytest.raises(ValueError) as cm: + with pytest.raises(ValueError): log_lik_depth(expected_depth=-1, depth=9) - with pytest.raises(ValueError) as cm: + with pytest.raises(ValueError): log_lik_depth(expected_depth=12, depth=-1) - with pytest.raises(ValueError) as cm: + with pytest.raises(ValueError): log_lik_depth(expected_depth=0, depth=1) assert log_lik_depth(expected_depth=1, depth=0) == -1 diff --git a/tests/typer_tests/presence_typer_tests/test_presence_typer.py b/tests/typer_tests/presence_typer_tests/test_presence_typer.py index 15870afd..c2ab5a7a 100644 --- a/tests/typer_tests/presence_typer_tests/test_presence_typer.py +++ b/tests/typer_tests/presence_typer_tests/test_presence_typer.py @@ -1,182 +1,133 @@ -from unittest import TestCase +import pytest +from mykrobe.typing import PresenceTyper from mykrobe.typing import ProbeCoverage from mykrobe.typing import SequenceProbeCoverage -from mykrobe.typing import PresenceTyper -class PresenceTyperTest(TestCase): - - def setUp(self): - self.pt = PresenceTyper(expected_depths=[100]) - self.pt_10 = PresenceTyper(expected_depths=[10]) - - def teardown(self): - pass - - def test_base_case_no_coverage(self): - pc = ProbeCoverage(min_depth=0, - percent_coverage=0, - median_depth=0, - k_count=0, - klen=31) - s1 = SequenceProbeCoverage(name="A123T", - probe_coverage=pc - ) - call = self.pt.type(s1) - assert call.genotype == [0, 0] - - def test_genotyping_gene_11(self): - - pc = ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=100, - k_count=100, - klen=31) - s = SequenceProbeCoverage(name="A123T", - probe_coverage=pc, - percent_coverage_threshold=80 - - ) - call = self.pt.type(s) - assert call.genotype == [1, 1] - - def test_genotyping_gene_01(self): - - pc = ProbeCoverage(min_depth=100, - percent_coverage=82, - median_depth=2, - k_count=82, - klen=31) - s = SequenceProbeCoverage(name="A123T", - probe_coverage=pc, - percent_coverage_threshold=80 - - ) - call = self.pt.type(s) - assert call.genotype == [0, 1] - - def test_resistotype_gene_at_high_CN(self): - - pc = ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=1000, - k_count=100, - klen=31) - s = SequenceProbeCoverage(name="A123T", - probe_coverage=pc, - percent_coverage_threshold=80 - - ) - call = self.pt.type(s) - assert call.genotype == [1, 1] - - def test_low_coverage(self): - - pc = ProbeCoverage(min_depth=100, - percent_coverage=16, - median_depth=16, - k_count=16, - klen=31) - s = SequenceProbeCoverage(name="A123T", - probe_coverage=pc, - percent_coverage_threshold=80 - - ) - call = self.pt_10.type(s) - assert call.genotype == [0, 0] - - pc = ProbeCoverage(min_depth=100, - percent_coverage=80, - median_depth=16, - k_count=16, - klen=31) - s = SequenceProbeCoverage(name="A123T", - probe_coverage=pc, - percent_coverage_threshold=80 - - ) - call = self.pt_10.type(s) - assert call.genotype == [1, 1] - - -class PresenceTyperTestWithContaim(TestCase): - - def setUp(self): - self.pt_no_contaim = PresenceTyper(expected_depths=[100]) - self.pt_contaim = PresenceTyper( - expected_depths=[100], - contamination_depths=[10]) - - def teardown(self): - pass - - def test_genotyping_gene_01(self): - - pc = ProbeCoverage(min_depth=10, - percent_coverage=100, - median_depth=10, - k_count=10, - klen=31) - s = SequenceProbeCoverage(name="A123T", - probe_coverage=pc, - percent_coverage_threshold=80 - - ) - call = self.pt_no_contaim.type(s) - assert call.genotype == [0, 1] - call = self.pt_contaim.type(s) - assert call.genotype == [0, 0] - - def test_genotyping_gene_11(self): - pt_no_contaim = PresenceTyper(expected_depths=[20]) - pt_contaim = PresenceTyper( - expected_depths=[20], - contamination_depths=[10]) - - pc = ProbeCoverage(min_depth=10, - percent_coverage=100, - median_depth=10, - k_count=100, - klen=31) - s = SequenceProbeCoverage(name="A123T", - probe_coverage=pc, - percent_coverage_threshold=80 - - ) - call = pt_no_contaim.type(s) - assert call.genotype == [1, 1] - - call = pt_contaim.type(s) - assert call.genotype == [0, 0] - - pc = ProbeCoverage(min_depth=10, - percent_coverage=100, - median_depth=30, - k_count=30, - klen=31) - s = SequenceProbeCoverage(name="A123T", - probe_coverage=pc, - percent_coverage_threshold=80 - - ) - call = pt_no_contaim.type(s) - assert call.genotype == [1, 1] - - call = pt_contaim.type(s) - assert call.genotype == [1, 1] - - pc = ProbeCoverage(min_depth=10, - percent_coverage=100, - median_depth=20, - k_count=20, - klen=31) - s = SequenceProbeCoverage(name="A123T", - probe_coverage=pc, - percent_coverage_threshold=80 - - ) - call = pt_no_contaim.type(s) - assert call.genotype == [1, 1] - - call = pt_contaim.type(s) - assert call.genotype == [1, 1] +@pytest.fixture() +def pt(): + return PresenceTyper(expected_depths=[100]) + + +@pytest.fixture() +def pt_10(): + return PresenceTyper(expected_depths=[10]) + + +def test_base_case_no_coverage(pt): + pc = ProbeCoverage( + min_depth=0, percent_coverage=0, median_depth=0, k_count=0, klen=31 + ) + s1 = SequenceProbeCoverage(name="A123T", probe_coverage=pc) + call = pt.type(s1) + assert call.genotype == [0, 0] + + +def test_genotyping_gene_11(pt): + pc = ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=100, k_count=100, klen=31 + ) + s = SequenceProbeCoverage( + name="A123T", probe_coverage=pc, percent_coverage_threshold=80 + ) + call = pt.type(s) + assert call.genotype == [1, 1] + + +def test_genotyping_gene_01(pt): + pc = ProbeCoverage( + min_depth=100, percent_coverage=82, median_depth=2, k_count=82, klen=31 + ) + s = SequenceProbeCoverage( + name="A123T", probe_coverage=pc, percent_coverage_threshold=80 + ) + call = pt.type(s) + assert call.genotype == [0, 1] + + +def test_resistotype_gene_at_high_CN(pt): + pc = ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=1000, k_count=100, klen=31 + ) + s = SequenceProbeCoverage( + name="A123T", probe_coverage=pc, percent_coverage_threshold=80 + ) + call = pt.type(s) + assert call.genotype == [1, 1] + + +def test_low_coverage(pt, pt_10): + pc = ProbeCoverage( + min_depth=100, percent_coverage=16, median_depth=16, k_count=16, klen=31 + ) + s = SequenceProbeCoverage( + name="A123T", probe_coverage=pc, percent_coverage_threshold=80 + ) + call = pt_10.type(s) + assert call.genotype == [0, 0] + + pc = ProbeCoverage( + min_depth=100, percent_coverage=80, median_depth=16, k_count=16, klen=31 + ) + s = SequenceProbeCoverage( + name="A123T", probe_coverage=pc, percent_coverage_threshold=80 + ) + call = pt_10.type(s) + assert call.genotype == [1, 1] + + +def test_with_contaim_genotyping_gene_01(): + pt_no_contaim = PresenceTyper(expected_depths=[100]) + pt_contaim = PresenceTyper(expected_depths=[100], contamination_depths=[10]) + + pc = ProbeCoverage( + min_depth=10, percent_coverage=100, median_depth=10, k_count=10, klen=31 + ) + s = SequenceProbeCoverage( + name="A123T", probe_coverage=pc, percent_coverage_threshold=80 + ) + call = pt_no_contaim.type(s) + assert call.genotype == [0, 1] + call = pt_contaim.type(s) + assert call.genotype == [0, 0] + + +def test_with_contaim_genotyping_gene_11(): + pt_no_contaim = PresenceTyper(expected_depths=[20]) + pt_contaim = PresenceTyper(expected_depths=[20], contamination_depths=[10]) + + pc = ProbeCoverage( + min_depth=10, percent_coverage=100, median_depth=10, k_count=100, klen=31 + ) + s = SequenceProbeCoverage( + name="A123T", probe_coverage=pc, percent_coverage_threshold=80 + ) + call = pt_no_contaim.type(s) + assert call.genotype == [1, 1] + + call = pt_contaim.type(s) + assert call.genotype == [0, 0] + + pc = ProbeCoverage( + min_depth=10, percent_coverage=100, median_depth=30, k_count=30, klen=31 + ) + s = SequenceProbeCoverage( + name="A123T", probe_coverage=pc, percent_coverage_threshold=80 + ) + call = pt_no_contaim.type(s) + assert call.genotype == [1, 1] + + call = pt_contaim.type(s) + assert call.genotype == [1, 1] + + pc = ProbeCoverage( + min_depth=10, percent_coverage=100, median_depth=20, k_count=20, klen=31 + ) + s = SequenceProbeCoverage( + name="A123T", probe_coverage=pc, percent_coverage_threshold=80 + ) + call = pt_no_contaim.type(s) + assert call.genotype == [1, 1] + + call = pt_contaim.type(s) + assert call.genotype == [1, 1] diff --git a/tests/typer_tests/variant_typer_tests/test_type_simple_vars.py b/tests/typer_tests/variant_typer_tests/test_type_simple_vars.py index 541c976f..8676e2e6 100644 --- a/tests/typer_tests/variant_typer_tests/test_type_simple_vars.py +++ b/tests/typer_tests/variant_typer_tests/test_type_simple_vars.py @@ -1,306 +1,241 @@ -from unittest import TestCase -from mykrobe.variants.schema.models import Variant -from mykrobe.variants.schema.models import VariantCall -from mykrobe.typing import VariantTyper +import pytest from mykrobe.typing import ProbeCoverage -from mykrobe.typing import SequenceProbeCoverage from mykrobe.typing import VariantProbeCoverage +from mykrobe.typing import VariantTyper -class VariantTyperTest(TestCase): - - def setUp(self): - self.vt = VariantTyper(expected_depths=[100]) - - def teardown(self): - pass - - def test_wt_vars(self): - reference_coverage = ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=100, - k_count=100, - klen=31) - alternate_coverages = [ProbeCoverage(min_depth=100, - percent_coverage=3, - median_depth=100, - k_count=3, - klen=31)] - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - call = self.vt.type([v1]) - assert call['genotype'] == [0, 0] - assert call["info"].get('expected_depths') == [100] - - def test_alt_vars(self): - reference_coverage = ProbeCoverage(min_depth=100, - percent_coverage=3, - median_depth=100, - k_count=3, - klen=31) - alternate_coverages = [ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=100, - k_count=100, - klen=31)] - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - call = self.vt.type([v1]) - assert call['genotype'] == [1, 1] - - def test_mixed_vars(self): - reference_coverage = ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=50, - k_count=50, - klen=31) - alternate_coverages = [ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=50, - k_count=50, - klen=31)] - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - call = self.vt.type(v1) - assert call['genotype'] == [0, 1] - - def test_mixed_vars2(self): - reference_coverage = ProbeCoverage(min_depth=11, - percent_coverage=100, - median_depth=42, - k_count=42, - klen=31) - alternate_coverages = [ProbeCoverage(min_depth=94, - percent_coverage=100, - median_depth=102, - k_count=94, - klen=31)] - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - call = self.vt.type(v1) - assert call['genotype'] == [0, 1] - - -class VariantTyperWithContamination(TestCase): - - def setUp(self): - self.vt_no_contaim = VariantTyper( - expected_depths=[100], - contamination_depths=[]) - # To do add contamination type - # self.vt_contaim = VariantTyper( - # expected_depths=[80], - # contamination_depths=[20]) - - def teardown(self): - pass - - def test_simple_case(self): - reference_coverage = ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=80, - k_count=80, - klen=31) - alternate_coverages = [ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=20, - k_count=40, - klen=31)] - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - - call = self.vt_no_contaim.type(v1) - assert call['genotype'] == [0, 1] - - # call = self.vt_contaim.type(v1) - # assert call['genotype'] == [0, 0] - - -class TestVariantTyperWithMultipleAlternateCoverages(TestCase): - - def setUp(self): - # to do, test should pass on kc model also - self.vt_no_contaim = VariantTyper( - expected_depths=[100], - contamination_depths=[], - model="median_depth") - - def teardown(self): - pass - - def test_simple_case(self): - reference_coverage = ProbeCoverage(min_depth=100, - percent_coverage=70, - median_depth=80, - k_count=80, - klen=31) - alt1 = ProbeCoverage(min_depth=100, - percent_coverage=70, - median_depth=20, - k_count=20, - klen=31) - alt2 = ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=80, - k_count=80, - klen=31) - alternate_coverages = [alt1, alt2] - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - assert v1._choose_best_alternate_coverage() == alt2 - - call = self.vt_no_contaim.type(v1) - assert call['genotype'] == [1, 1] - - -class TestVariantTyperWithMultipleProbeCoverages(TestCase): - - def setUp(self): - self.vt_no_contaim = VariantTyper( - expected_depths=[100], - contamination_depths=[]) - - def teardown(self): - pass - - def test_simple_case(self): - reference_coverage = ProbeCoverage(min_depth=100, - percent_coverage=80, - median_depth=80, - k_count=80, - klen=31) - alt1 = ProbeCoverage(min_depth=100, - percent_coverage=50, - median_depth=20, - k_count=20, - klen=31) - alt2 = ProbeCoverage(min_depth=100, - percent_coverage=40, - median_depth=80, - k_count=30, - klen=31) - alternate_coverages = [alt1, alt2] - - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - - reference_coverage = ProbeCoverage(min_depth=100, - percent_coverage=80, - median_depth=80, - k_count=20, - klen=31) - alt1 = ProbeCoverage(min_depth=100, - percent_coverage=50, - median_depth=20, - k_count=20, - klen=31) - alt2 = ProbeCoverage(min_depth=100, - percent_coverage=100, - median_depth=80, - k_count=100, - klen=31) - - alternate_coverages = [alt1, alt2] - - v2 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - - call = self.vt_no_contaim.type([v1, v2]) - assert call['genotype'] == [1, 1] - - -class TestVariantTyperWithLowMinimum(TestCase): - - def setUp(self): - self.vt_no_contaim = VariantTyper( - expected_depths=[100], - contamination_depths=[]) - self.vt2_no_contaim = VariantTyper( - expected_depths=[1], - contamination_depths=[]) - - def teardown(self): - pass - - def test_2(self): - reference_coverage = ProbeCoverage(min_depth=131, - percent_coverage=95.2381, - median_depth=155, - k_count=131, - klen=31) - alt1 = ProbeCoverage(min_depth=1, - percent_coverage=100, - median_depth=1, - k_count=1, - klen=31) - alternate_coverages = [alt1] - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - - call = self.vt_no_contaim.type(v1) - assert call['genotype'] == [0, 0] - - def test_3(self): - reference_coverage = ProbeCoverage(min_depth=2, - percent_coverage=59.52, - median_depth=2, - k_count=60, - klen=31) - alt1 = ProbeCoverage(min_depth=1, - percent_coverage=83.33, - median_depth=1, - k_count=83, - klen=31) - alternate_coverages = [alt1] - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - - call = self.vt2_no_contaim.type(v1) - assert call['genotype'] == [1, 1] - assert call["info"]["conf"] < 150 - - def test_4(self): - vt = VariantTyper( - expected_depths=[6], - contamination_depths=[], - confidence_threshold=3) - reference_coverage = ProbeCoverage(min_depth=1, - percent_coverage=100, - median_depth=2, - k_count=2, - klen=31) - alt1 = ProbeCoverage(min_depth=1, - percent_coverage=100, - median_depth=1, - k_count=1, - klen=31) - alternate_coverages = [alt1] - v1 = VariantProbeCoverage(var_name="A123T", - reference_coverages=[reference_coverage], - alternate_coverages=alternate_coverages - ) - - call = vt.type(v1) - assert call['genotype'] == [0, 1] - print(call["info"]) - assert call["info"]["conf"] < 100 +@pytest.fixture() +def vt(): + return VariantTyper(expected_depths=[100]) + + +@pytest.fixture() +def vt_no_contaim(): + return VariantTyper(expected_depths=[100], contamination_depths=[]) + + +def test_wt_vars(vt): + reference_coverage = ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=100, k_count=100, klen=31 + ) + alternate_coverages = [ + ProbeCoverage( + min_depth=100, percent_coverage=3, median_depth=100, k_count=3, klen=31 + ) + ] + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + call = vt.type([v1]) + assert call["genotype"] == [0, 0] + assert call["info"].get("expected_depths") == [100] + + +def test_alt_vars(vt): + reference_coverage = ProbeCoverage( + min_depth=100, percent_coverage=3, median_depth=100, k_count=3, klen=31 + ) + alternate_coverages = [ + ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=100, k_count=100, klen=31 + ) + ] + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + call = vt.type([v1]) + assert call["genotype"] == [1, 1] + + +def test_mixed_vars(vt): + reference_coverage = ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=50, k_count=50, klen=31 + ) + alternate_coverages = [ + ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=50, k_count=50, klen=31 + ) + ] + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + call = vt.type(v1) + assert call["genotype"] == [0, 1] + + +def test_mixed_vars2(vt): + reference_coverage = ProbeCoverage( + min_depth=11, percent_coverage=100, median_depth=42, k_count=42, klen=31 + ) + alternate_coverages = [ + ProbeCoverage( + min_depth=94, percent_coverage=100, median_depth=102, k_count=94, klen=31 + ) + ] + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + call = vt.type(v1) + assert call["genotype"] == [0, 1] + + +def test_typer_with_contm_simple_case(vt_no_contaim): + # To do add contamination type + # self.vt_contaim = VariantTyper( + # expected_depths=[80], + # contamination_depths=[20]) + + reference_coverage = ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=80, k_count=80, klen=31 + ) + alternate_coverages = [ + ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=20, k_count=40, klen=31 + ) + ] + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + + call = vt_no_contaim.type(v1) + assert call["genotype"] == [0, 1] + + # call = self.vt_contaim.type(v1) + # assert call['genotype'] == [0, 0] + + +def test_typer_with_mult_alt_coverages_simple_case(): + # to do, test should pass on kc model also + vt_no_contaim = VariantTyper( + expected_depths=[100], contamination_depths=[], model="median_depth" + ) + + reference_coverage = ProbeCoverage( + min_depth=100, percent_coverage=70, median_depth=80, k_count=80, klen=31 + ) + alt1 = ProbeCoverage( + min_depth=100, percent_coverage=70, median_depth=20, k_count=20, klen=31 + ) + alt2 = ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=80, k_count=80, klen=31 + ) + alternate_coverages = [alt1, alt2] + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + assert v1._choose_best_alternate_coverage() == alt2 + + call = vt_no_contaim.type(v1) + assert call["genotype"] == [1, 1] + + +def test_typer_with_mult_probe_coverages_simple_case(vt_no_contaim): + reference_coverage = ProbeCoverage( + min_depth=100, percent_coverage=80, median_depth=80, k_count=80, klen=31 + ) + alt1 = ProbeCoverage( + min_depth=100, percent_coverage=50, median_depth=20, k_count=20, klen=31 + ) + alt2 = ProbeCoverage( + min_depth=100, percent_coverage=40, median_depth=80, k_count=30, klen=31 + ) + alternate_coverages = [alt1, alt2] + + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + + reference_coverage = ProbeCoverage( + min_depth=100, percent_coverage=80, median_depth=80, k_count=20, klen=31 + ) + alt1 = ProbeCoverage( + min_depth=100, percent_coverage=50, median_depth=20, k_count=20, klen=31 + ) + alt2 = ProbeCoverage( + min_depth=100, percent_coverage=100, median_depth=80, k_count=100, klen=31 + ) + + alternate_coverages = [alt1, alt2] + + v2 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + + call = vt_no_contaim.type([v1, v2]) + assert call["genotype"] == [1, 1] + + +def test_type_with_low_minimum_1(vt_no_contaim): + reference_coverage = ProbeCoverage( + min_depth=131, percent_coverage=95.2381, median_depth=155, k_count=131, klen=31 + ) + alt1 = ProbeCoverage( + min_depth=1, percent_coverage=100, median_depth=1, k_count=1, klen=31 + ) + alternate_coverages = [alt1] + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + + call = vt_no_contaim.type(v1) + assert call["genotype"] == [0, 0] + + +def test_type_with_low_minimum_2(): + vt2_no_contaim = VariantTyper(expected_depths=[1], contamination_depths=[]) + reference_coverage = ProbeCoverage( + min_depth=2, percent_coverage=59.52, median_depth=2, k_count=60, klen=31 + ) + alt1 = ProbeCoverage( + min_depth=1, percent_coverage=83.33, median_depth=1, k_count=83, klen=31 + ) + alternate_coverages = [alt1] + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + + call = vt2_no_contaim.type(v1) + assert call["genotype"] == [1, 1] + assert call["info"]["conf"] < 150 + + +def test_type_with_low_minimum_3(): + vt = VariantTyper( + expected_depths=[6], contamination_depths=[], confidence_threshold=3 + ) + reference_coverage = ProbeCoverage( + min_depth=1, percent_coverage=100, median_depth=2, k_count=2, klen=31 + ) + alt1 = ProbeCoverage( + min_depth=1, percent_coverage=100, median_depth=1, k_count=1, klen=31 + ) + alternate_coverages = [alt1] + v1 = VariantProbeCoverage( + var_name="A123T", + reference_coverages=[reference_coverage], + alternate_coverages=alternate_coverages, + ) + + call = vt.type(v1) + assert call["genotype"] == [0, 1] + assert call["info"]["conf"] < 100 diff --git a/tests/utils_test.py b/tests/utils_test.py index 989b9754..6c604e59 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -55,7 +55,7 @@ def test_fix_amino_acid_X_variants_keys(): "katG_S315C-GCT2155167TTA": "baz", "katG_S315X-GCT2155167CTA": "baz", # katG is on reverse strand so stop TAG is CTA "embB_M306I-ATG4247429ATA": "value", - "embB_M306X-ATG4247429ATA": "value", # duplicate of previous, should remove + "embB_M306X-ATG4247429ATA": "value", # duplicate of previous, should remove } utils.fix_amino_acid_X_variants_keys(test_dict) diff --git a/tests/variant_tests/conftest.py b/tests/variant_tests/conftest.py new file mode 100644 index 00000000..a450bdc3 --- /dev/null +++ b/tests/variant_tests/conftest.py @@ -0,0 +1,35 @@ +from mongoengine import connect +import pytest + +from mykrobe.variants.schema.models import Reference +from mykrobe.variants.schema.models import ReferenceSet +from mykrobe.variants.schema.models import VariantSet + + +@pytest.fixture(autouse=True) +def db_setup_teardown(): + DB = connect("mykrobe-test") + DB.drop_database("mykrobe-test") + yield + DB.drop_database("mykrobe-test") + + +@pytest.fixture() +def reference_set(): + reference_set = ReferenceSet().create_and_save(name="ref_set") + return reference_set + + +@pytest.fixture() +def reference(reference_set): + return Reference().create_and_save( + name="ref", md5checksum="sre", reference_sets=[reference_set] + ) + + +@pytest.fixture() +def variant_sets(reference_set, reference): + variant_set = VariantSet.create_and_save( + name="this_vcf_file", reference_set=reference_set + ) + return [variant_set] diff --git a/tests/variant_tests/test_calls.py b/tests/variant_tests/test_calls.py index 39e07d93..d966f7f5 100644 --- a/tests/variant_tests/test_calls.py +++ b/tests/variant_tests/test_calls.py @@ -1,120 +1,89 @@ +import pytest + from mykrobe.variants.schema.models import Variant from mykrobe.variants.schema.models import VariantSet from mykrobe.variants.schema.models import VariantCall from mykrobe.variants.schema.models import VariantCallSet -from mykrobe.variants.schema.models import Reference -from mykrobe.variants.schema.models import ReferenceSet - -from mykrobe.utils import split_var_name -from mongoengine import connect -DB = connect('mykrobe-test') - - -class BaseTest(): - - def setup(self): - DB.drop_database('mykrobe-test') - - def teardown(self): - DB.drop_database('mykrobe-test') - - -class TestCallSet(BaseTest): - - def setup(self): - DB.drop_database('mykrobe-test') - - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.variant_set = VariantSet.create_and_save( - name="this_vcf_file", - reference_set=self.reference_set) - self.variant_sets = [self.variant_set] - - def test_create_call_set(self): - call_set = VariantCallSet.create_and_save( - name="call_set", - sample_id="C00123", - variant_sets=self.variant_sets) - cs = VariantCallSet.objects.get(name="call_set") - assert call_set == cs - assert cs.name == "call_set" - assert cs.variant_sets[0].reference_set.name == "ref_set" - - -class TestCall(BaseTest): - - def teardown(self): - DB.drop_database('mykrobe-test') - - def setup(self): - DB.drop_database('mykrobe-test') - - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.variant_set = VariantSet.create_and_save( - name="this_vcf_file2", - reference_set=self.reference_set) - self.variant_sets = [self.variant_set] - self.reference = Reference().create_and_save( - name="ref", - md5checksum="sre", - reference_sets=[ - self.reference_set]) - self.call_set = VariantCallSet.create( - sample_id="C00123", - name="C00123", - variant_sets=self.variant_sets) - self.variant_snp = Variant.create(variant_sets=self.variant_sets, - start=0, end=1, reference_bases="A", - alternate_bases=["T"], - reference=self.reference) - - self.variant_snp_mult_alts = Variant.create( - variant_sets=self.variant_sets, - start=0, - end=1, - reference_bases="T", - alternate_bases=[ - "A", - "C"], - reference=self.reference) - - def test_create_SNP_het_call(self): - c1 = VariantCall.create(variant=self.variant_snp, - call_set=self.call_set, - genotype=[0, 1], - genotype_likelihoods=[0.1, 0.9, 0.12]) - assert c1.call_set_name == "C00123" - assert c1.genotype == [0, 1] - self.variant_snp.save() - c1.save() - assert c1 in self.variant_snp.calls - - c2 = VariantCall.create(variant=self.variant_snp, - call_set=self.call_set, - genotype="1/1", - genotype_likelihoods=[0.01, 0.1, 0.9]) - assert c2.call_set_name == "C00123" - assert c2.genotype == [1, 1] - c2.save() - assert c2 in self.variant_snp.calls - def test_create_complex_call(self): - c1 = VariantCall.create( - variant=self.variant_snp_mult_alts, - call_set=self.call_set, - genotype="2/1", - genotype_likelihoods=[ - 0.01, - 0.1, - 0.9, - 0.1, - 0.2, - 0.6]) - self.variant_snp.save() - self.variant_snp_mult_alts.save() - c1.save() - assert c1.call_set_name == "C00123" - assert c1.genotype == [2, 1] - assert c1 not in self.variant_snp.calls - assert c1 in self.variant_snp_mult_alts.calls +@pytest.fixture() +def call_set(reference_set, reference): + variant_set = VariantSet.create_and_save( + name="this_vcf_file2", reference_set=reference_set + ) + variant_sets = [variant_set] + return VariantCallSet.create( + sample_id="C00123", name="C00123", variant_sets=variant_sets + ) + + +@pytest.fixture() +def variant_snp(variant_sets, reference): + return Variant.create( + variant_sets=variant_sets, + start=0, + end=1, + reference_bases="A", + alternate_bases=["T"], + reference=reference, + ) + + +def test_create_call_set(variant_sets): + call_set = VariantCallSet.create_and_save( + name="call_set", sample_id="C00123", variant_sets=variant_sets + ) + cs = VariantCallSet.objects.get(name="call_set") + assert call_set == cs + assert cs.name == "call_set" + assert cs.variant_sets[0].reference_set.name == "ref_set" + + +def test_create_SNP_het_call(variant_snp, call_set): + c1 = VariantCall.create( + variant=variant_snp, + call_set=call_set, + genotype=[0, 1], + genotype_likelihoods=[0.1, 0.9, 0.12], + ) + assert c1.call_set_name == "C00123" + assert c1.genotype == [0, 1] + variant_snp.save() + c1.save() + assert c1 in variant_snp.calls + + c2 = VariantCall.create( + variant=variant_snp, + call_set=call_set, + genotype="1/1", + genotype_likelihoods=[0.01, 0.1, 0.9], + ) + assert c2.call_set_name == "C00123" + assert c2.genotype == [1, 1] + + c2.save() + assert c2 in variant_snp.calls + + +def test_create_complex_call(reference, variant_snp, variant_sets, call_set): + variant_snp_mult_alts = Variant.create( + variant_sets=variant_sets, + start=0, + end=1, + reference_bases="T", + alternate_bases=["A", "C"], + reference=reference, + ) + c1 = VariantCall.create( + variant=variant_snp_mult_alts, + call_set=call_set, + genotype="2/1", + genotype_likelihoods=[0.01, 0.1, 0.9, 0.1, 0.2, 0.6], + ) + variant_snp.save() + variant_snp_mult_alts.save() + c1.save() + assert c1.call_set_name == "C00123" + assert c1.genotype == [2, 1] + assert c1 not in variant_snp.calls + assert c1 in variant_snp_mult_alts.calls diff --git a/tests/variant_tests/test_variants.py b/tests/variant_tests/test_variants.py index d904439c..5d57ec3c 100644 --- a/tests/variant_tests/test_variants.py +++ b/tests/variant_tests/test_variants.py @@ -1,125 +1,104 @@ from mykrobe.variants.schema.models import Variant from mykrobe.variants.schema.models import VariantSet -from mykrobe.variants.schema.models import VariantCall -from mykrobe.variants.schema.models import VariantCallSet -from mykrobe.variants.schema.models import Reference -from mykrobe.variants.schema.models import ReferenceSet - from mykrobe.utils import split_var_name -from mongoengine import connect -DB = connect('mykrobe-test') - - -class BaseTest(): - - def setup(self): - DB.drop_database('mykrobe-test') - - def teardown(self): - DB.drop_database('mykrobe-test') - - -class TestVariantSets(BaseTest): - - def setup(self): - DB.drop_database('mykrobe-test') - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - - def test_create_new_variant(self): - variant_set = VariantSet.create_and_save( - name="this_vcf_file", - reference_set=self.reference_set) - vs = VariantSet.objects.get(name="this_vcf_file") - assert variant_set == vs - assert vs.reference_set.name == "ref_set" - - -class TestVariants(BaseTest): - - def setup(self): - DB.drop_database('mykrobe-test') - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.variant_set = VariantSet.create_and_save( - name="this_vcf_file2", - reference_set=self.reference_set) - self.variant_sets = [self.variant_set] - self.reference = Reference().create_and_save( - name="ref", - md5checksum="sre", - reference_sets=[ - self.reference_set]) - - def teardown(self): - DB.drop_database('mykrobe-test') - - def test_create_SNP(self): - v1 = Variant.create(variant_sets=self.variant_sets, start=0, - end=1, reference_bases="A", - alternate_bases=["T"], - reference=self.reference) - assert v1.start == 0 - assert v1.end == 1 - assert v1.alternate_bases == ["T"] - assert v1.length == 0 - - def test_create_insertion(self): - v1 = Variant.create(variant_sets=self.variant_sets, - start=0, end=1, reference_bases="T", - alternate_bases=["TA"], - reference=self.reference) - assert v1.start == 0 - assert v1.end == 1 - assert v1.alternate_bases == ["TA"] - assert v1.is_insertion - assert v1.is_deletion is False - assert v1.is_indel - assert v1.length == 1 - - def test_create_deletion(self): - v1 = Variant.create(variant_sets=self.variant_sets, - start=0, end=1, reference_bases="AA", - alternate_bases=["A"], - reference=self.reference) - assert v1.start == 0 - assert v1.end == 1 - assert v1.alternate_bases == ["A"] - assert v1.reference_bases == "AA" - assert v1.is_insertion is False - assert v1.is_deletion - assert v1.is_indel - assert v1.length == 1 - - def test_split_name(self): - name = "A12T" - r, pos, a = split_var_name(name) - assert r == "A" - assert pos == 12 - assert a == "T" - - def test_split_name_del(self): - name = "AA12T" - r, pos, a = split_var_name(name) - assert r == "AA" - assert pos == 12 - assert a == "T" - - def test_split_name_ins(self): - name = "A12TT" - r, pos, a = split_var_name(name) - assert r == "A" - assert pos == 12 - assert a == "TT" - - def test_split_name2(self): - name = "A12T/A" - r, pos, a = split_var_name(name) - assert r == "A" - assert pos == 12 - assert a == "T/A" - - def test_split_name3(self): - name = "C-54T" - r, pos, a = split_var_name(name) - assert r == "C" - assert pos == -54 - assert a == "T" + + +def test_create_new_variant(reference_set): + variant_set = VariantSet.create_and_save( + name="this_vcf_file", reference_set=reference_set + ) + vs = VariantSet.objects.get(name="this_vcf_file") + assert variant_set == vs + assert vs.reference_set.name == "ref_set" + + +def test_create_SNP(reference, variant_sets): + v1 = Variant.create( + variant_sets=variant_sets, + start=0, + end=1, + reference_bases="A", + alternate_bases=["T"], + reference=reference, + ) + assert v1.start == 0 + assert v1.end == 1 + assert v1.alternate_bases == ["T"] + assert v1.length == 0 + + +def test_create_insertion(reference, variant_sets): + v1 = Variant.create( + variant_sets=variant_sets, + start=0, + end=1, + reference_bases="T", + alternate_bases=["TA"], + reference=reference, + ) + assert v1.start == 0 + assert v1.end == 1 + assert v1.alternate_bases == ["TA"] + assert v1.is_insertion + assert v1.is_deletion is False + assert v1.is_indel + assert v1.length == 1 + + +def test_create_deletion(reference, variant_sets): + v1 = Variant.create( + variant_sets=variant_sets, + start=0, + end=1, + reference_bases="AA", + alternate_bases=["A"], + reference=reference, + ) + assert v1.start == 0 + assert v1.end == 1 + assert v1.alternate_bases == ["A"] + assert v1.reference_bases == "AA" + assert v1.is_insertion is False + assert v1.is_deletion + assert v1.is_indel + assert v1.length == 1 + + +def test_split_name(): + name = "A12T" + r, pos, a = split_var_name(name) + assert r == "A" + assert pos == 12 + assert a == "T" + + +def test_split_name_del(): + name = "AA12T" + r, pos, a = split_var_name(name) + assert r == "AA" + assert pos == 12 + assert a == "T" + + +def test_split_name_ins(): + name = "A12TT" + r, pos, a = split_var_name(name) + assert r == "A" + assert pos == 12 + assert a == "TT" + + +def test_split_name2(): + name = "A12T/A" + r, pos, a = split_var_name(name) + assert r == "A" + assert pos == 12 + assert a == "T/A" + + +def test_split_name3(): + name = "C-54T" + r, pos, a = split_var_name(name) + assert r == "C" + assert pos == -54 + assert a == "T" diff --git a/tests/vcf_tests/test_adding_vcf_to_db.py b/tests/vcf_tests/test_adding_vcf_to_db.py index 0c3003bc..0aecf109 100644 --- a/tests/vcf_tests/test_adding_vcf_to_db.py +++ b/tests/vcf_tests/test_adding_vcf_to_db.py @@ -1,4 +1,5 @@ import datetime +import pytest from mykrobe._vcf import VCF from mykrobe.variants.schema.models import VariantSet from mykrobe.variants.schema.models import VariantSetMetadata @@ -9,127 +10,94 @@ from mykrobe.variants.schema.models import ReferenceSet from mykrobe.variants.schema.models import Reference -DB = connect('mykrobe-test') - - -class BaseTest(): - - def setup(self): - DB.drop_database('mykrobe-test') - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.reference = Reference().create_and_save( - name="NC_000962.3", - md5checksum="sre", - reference_sets=[ - self.reference_set]) - - def teardown(self): - DB.drop_database('mykrobe-test') - - -class TestAddNewVariantSet(BaseTest): - - def test_add_new_vcf_variant_set(self): - vcf = VCF( - f="tests/vcf_tests/test.vcf", - reference_set_id=self.reference_set.id, - method="CORTEX") - vcf.add_to_database() - # We create a global variant set as well as one for the individual VCF - assert VariantSet.objects().count() == 2 - vs = VariantSet.objects()[0] - assert len(Variant.objects()[0].variant_sets) == 2 - assert vs.name == "test.vcf" - - -class TestAddNewVariantSetMetaData(BaseTest): - - def test_add_new_vcf_variant_set(self): - vcf = VCF( - f="tests/vcf_tests/test.vcf", - reference_set_id=self.reference_set.id, - method="CORTEX") - vcf.add_to_database() - assert VariantSetMetadata.objects().count() >= 2 - assert VariantSetMetadata.objects(key="KMER").count() == 2 - - -class TestAddNewCallSet(BaseTest): - - def test_add_new_call_set(self): - vcf = VCF( - f="tests/vcf_tests/test.vcf", - reference_set_id=self.reference_set.id, - method="CORTEX") - vcf.add_to_database() - # Only one callset but the callset should belong to multiple variant - # sets - assert VariantCallSet.objects().count() == 1 - assert VariantCallSet.objects()[ - 0].created_at <= datetime.datetime.now() - assert len(VariantCallSet.objects()[0].variant_sets) == 2 - - -class TestVariantsAndCalls(BaseTest): - - def test_add_add_variants_and_calls(self): - vcf = VCF( - f="tests/vcf_tests/test.vcf", - reference_set_id=self.reference_set.id, - method="CORTEX") - vcf.add_to_database() - assert VariantCall.objects().count() == 21 - assert Variant.objects().count() == 21 - - -class TestAddSecondVCF(BaseTest): - - def setup(self): - DB.drop_database('mykrobe-test') - self.reference_set = ReferenceSet().create_and_save(name="ref_set") - self.reference = Reference().create_and_save( - name="NC_000962.3", - md5checksum="sre", - reference_sets=[ - self.reference_set]) - vcf = VCF( - f="tests/vcf_tests/test.vcf", - reference_set_id=self.reference_set.id, - method="CORTEX") - vcf.add_to_database() - - def test_add_second_vcf_variant_set(self): - # This VCF only has one Variant which is not in the first VCF - vcf = VCF( - f="tests/vcf_tests/test2.vcf", - reference_set_id=self.reference_set.id, - method="CORTEX") - vcf.add_to_database() - assert VariantSet.objects().count() == 3 - assert VariantCallSet.objects().count() == 2 - assert VariantCall.objects().count() == 42 - assert Variant.objects().count() == 22 - assert len(Variant.objects()[0].variant_sets) == 3 - assert len( - Variant.objects.get( - names="UNION_BC_k31_var_147").variant_sets) == 3 - - -class TestAddVCFwithIndels(BaseTest): - - def test_add_second_vcf_variant_set(self): - # This VCF only has one Variant which is not in the first VCF - vcf = VCF( - f="tests/vcf_tests/test3.vcf", - reference_set_id=self.reference_set.id, - method="CORTEX") - vcf.add_to_database() - assert VariantSet.objects().count() == 2 - assert VariantCallSet.objects().count() == 1 - assert VariantCall.objects().count() == 106 - assert Variant.objects().count() == 106 - assert Variant.snps().count() == 89 - assert Variant.indels().count() == 17 - assert Variant.insertions().count() == 8 - assert Variant.deletions().count() == 8 - assert Variant.ph_snps.count() == 1 + +@pytest.fixture(autouse=True) +def db_setup_teardown(): + DB = connect("mykrobe-test", uuidRepresentation="pythonLegacy") + DB.drop_database("mykrobe-test") + yield + DB.drop_database("mykrobe-test") + + +@pytest.fixture() +def reference_set(): + reference_set = ReferenceSet().create_and_save(name="ref_set") + Reference().create_and_save( + name="NC_000962.3", md5checksum="sre", reference_sets=[reference_set] + ) + vcf = VCF( + f="tests/vcf_tests/test.vcf", reference_set_id=reference_set.id, method="CORTEX" + ) + vcf.add_to_database() + return reference_set + + +def test_add_new_vcf_variant_set(reference_set): + # We create a global variant set as well as one for the individual VCF + assert VariantSet.objects().count() == 2 + vs = VariantSet.objects()[0] + assert len(Variant.objects()[0].variant_sets) == 2 + assert vs.name == "test.vcf" + + +def test_add_new_vcf_variant_set_meta_data(reference_set): + assert VariantSetMetadata.objects().count() >= 2 + assert VariantSetMetadata.objects(key="KMER").count() == 2 + + +def test_add_new_call_set(reference_set): + # Only one callset but the callset should belong to multiple variant + # sets + assert VariantCallSet.objects().count() == 1 + assert VariantCallSet.objects()[0].created_at <= datetime.datetime.now() + assert len(VariantCallSet.objects()[0].variant_sets) == 2 + + +def test_add_add_variants_and_calls(reference_set): + assert VariantCall.objects().count() == 21 + assert Variant.objects().count() == 21 + + +def test_add_second_vcf_variant_set(reference_set): + # This VCF only has one Variant which is not in the first VCF + vcf = VCF( + f="tests/vcf_tests/test2.vcf", + reference_set_id=reference_set.id, + method="CORTEX", + ) + vcf.add_to_database() + assert VariantSet.objects().count() == 3 + assert VariantCallSet.objects().count() == 2 + assert VariantCall.objects().count() == 42 + assert Variant.objects().count() == 22 + assert len(Variant.objects()[0].variant_sets) == 3 + assert len(Variant.objects.get(names="UNION_BC_k31_var_147").variant_sets) == 3 + + +def test_add_vcf_with_indels(): + # Refactoring 09/05/2024. Original comment said: + # This VCF only has one Variant which is not in the first VCF + # ... I think the intention was to add a second VCF, like + # the previous test does. But it was never adding in the first VCF and + # just using the new one. Do the same here, otherwise all the asserts + # fail. Hence not using the same reference_set etc as all the other + # tests above. + reference_set = ReferenceSet().create_and_save(name="ref_set") + Reference().create_and_save( + name="NC_000962.3", md5checksum="sre", reference_sets=[reference_set] + ) + vcf = VCF( + f="tests/vcf_tests/test3.vcf", + reference_set_id=reference_set.id, + method="CORTEX", + ) + vcf.add_to_database() + assert VariantSet.objects().count() == 2 + assert VariantCallSet.objects().count() == 1 + assert VariantCall.objects().count() == 106 + assert Variant.objects().count() == 106 + assert Variant.snps().count() == 89 + assert Variant.indels().count() == 17 + assert Variant.insertions().count() == 8 + assert Variant.deletions().count() == 8 + assert Variant.ph_snps.count() == 1 From 7a21fa3cc8fd7658fdddcb18c5846175b7babe19 Mon Sep 17 00:00:00 2001 From: Martin Hunt Date: Thu, 9 May 2024 16:42:09 +0100 Subject: [PATCH 3/3] Bug fix where alt probes could be in wrong order --- src/mykrobe/probes/models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mykrobe/probes/models.py b/src/mykrobe/probes/models.py index b692befb..60169a64 100644 --- a/src/mykrobe/probes/models.py +++ b/src/mykrobe/probes/models.py @@ -204,6 +204,7 @@ def _generate_alternates_on_all_backgrounds(self, v, context): for alt in v.alternate_bases: alternate[i: i + len(v.reference_bases)] = alt alternates.append(alternate) + return alternates def _get_all_context_combinations(self, context): @@ -379,7 +380,8 @@ def __init__(self, variant, refs, start, alts): self.refs = unique(["".join(ref) for ref in refs]) self.start = start self.alts = unique(["".join(alt) for alt in alts]) - self.alts=list(set(self.alts)-set(self.refs)) + ref_set = set(self.refs) + self.alts = [x for x in self.alts if x not in ref_set] class Mutation(object):