From f7ca5915784fd10229659f1ca2833658328bc572 Mon Sep 17 00:00:00 2001 From: rfm-targa Date: Fri, 28 Jun 2024 18:13:45 +0100 Subject: [PATCH] Added support for genetic codes 2-3, 5-6, 9-10, 12-16, 21-25. The CreateSchema and AlleleCall modules will use the translation table used to create the training file (values passed to --t, --translation-table are ignored if a training file is used). --- CHEWBBACA/SchemaEvaluator/evaluate_schema.py | 2 +- CHEWBBACA/chewBBACA.py | 63 +++++++++++++++----- CHEWBBACA/utils/constants.py | 24 ++++++-- CHEWBBACA/utils/parameters_validation.py | 47 +++++++++------ 4 files changed, 97 insertions(+), 39 deletions(-) diff --git a/CHEWBBACA/SchemaEvaluator/evaluate_schema.py b/CHEWBBACA/SchemaEvaluator/evaluate_schema.py index d97a0706..469944b4 100644 --- a/CHEWBBACA/SchemaEvaluator/evaluate_schema.py +++ b/CHEWBBACA/SchemaEvaluator/evaluate_schema.py @@ -577,7 +577,7 @@ def main(schema_directory, output_directory, genes_list, annotations, annotations : str Path to a TSV file created by the UniprotFinder module. translation_table : int - Genetic code used to translate the alleles. + Genetic code used to translate alleles. size_threshold : float Allele size variation threshold. Used to determine if the size of an allele is within the interval of the locus size mode +/- diff --git a/CHEWBBACA/chewBBACA.py b/CHEWBBACA/chewBBACA.py index eede62ce..034b634f 100755 --- a/CHEWBBACA/chewBBACA.py +++ b/CHEWBBACA/chewBBACA.py @@ -27,6 +27,7 @@ from ExtractCgMLST import determine_cgmlst from utils import (join_profiles, remove_genes, + gene_prediction as gp, # profiles_sqlitedb as ps, process_datetime as pdt, constants as ct, @@ -48,6 +49,7 @@ from CHEWBBACA.ExtractCgMLST import determine_cgmlst from CHEWBBACA.utils import (join_profiles, remove_genes, + gene_prediction as gp, # profiles_sqlitedb as ps, process_datetime as pdt, constants as ct, @@ -102,7 +104,10 @@ def msg(name=None): parser.add_argument('--ptf', '--training-file', type=str, required=False, dest='ptf_path', help='Path to the Prodigal training file used by Pyrodigal ' - 'to predict genes and added to the schema folder.') + 'to predict genes. The translation table used to create ' + 'this file overrides any value passed to `--t`, ' + '`--translation-table`. This file is copied ' + 'to the schema folder to be used for allele calling.') parser.add_argument('--bsr', '--blast-score-ratio', type=pv.bsr_type, required=False, default=ct.DEFAULT_BSR, @@ -118,9 +123,11 @@ def msg(name=None): 'sequences (CDSs) shorter than this value are excluded.') parser.add_argument('--t', '--translation-table', type=pv.translation_table_type, - required=False, default=11, dest='translation_table', + required=False, dest='translation_table', help='Genetic code used to predict genes and' - ' to translate coding sequences (CDSs).') + ' to translate coding DNA sequences (CDSs). ' + 'This value is ignored if a valid training file ' + 'is passed to `--ptf`, `--training-file`.') parser.add_argument('--st', '--size-threshold', type=pv.size_threshold_type, required=False, default=0.2, dest='size_threshold', @@ -167,10 +174,21 @@ def msg(name=None): args = parser.parse_args() del args.CreateSchema - # Check if PTF exists - if args.ptf_path is not None: - if os.path.isfile(args.ptf_path) is False: + # Check if user passed PTF + if args.ptf_path: + # Check if PTF exists + if not os.path.isfile(args.ptf_path): sys.exit(ct.INVALID_PTF_PATH) + else: + # Get translation table used to create training file + ptf_table = gp.read_training_file(args.ptf_path).translation_table + args.translation_table = ptf_table + else: + if not args.translation_table: + args.translation_table = ct.GENETIC_CODES_DEFAULT + + # Check if translation table is supported + pv.translation_table_type([args.translation_table]) # Create output directory created = fo.create_directory(args.output_directory) @@ -261,7 +279,9 @@ def msg(name=None): required=False, dest='ptf_path', help='Path to the Prodigal training file used by Pyrodigal ' 'to predict genes. Default is to use the training file ' - 'included in the schema\'s directory.') + 'included in the schema\'s directory. The translation ' + 'table used to create this file overrides any value ' + 'passed to `--t`, `--translation-table`.') parser.add_argument('--gl', '--genes-list', type=str, required=False, default=False, dest='genes_list', @@ -287,9 +307,8 @@ def msg(name=None): parser.add_argument('--t', '--translation-table', type=pv.translation_table_type, required=False, dest='translation_table', help='Genetic code used to predict genes and' - ' to translate coding sequences. Must match ' - 'the genetic code used to create the training ' - 'file.') + ' to translate coding DNA sequences (CDSs). ' + 'This value will be ignored if a training file is used.') parser.add_argument('--st', '--size-threshold', type=pv.size_threshold_type, required=False, dest='size_threshold', @@ -886,7 +905,9 @@ def msg(name=None): required=False, dest='ptf_path', help='Path to the Prodigal training file that ' 'will be included in the directory of the ' - 'adapted schema.') + 'adapted schema. The translation table used to create ' + 'this file overrides any value passed to `--t`, ' + '`--translation-table`.') parser.add_argument('--bsr', '--blast-score-ratio', type=pv.bsr_type, required=False, default=ct.DEFAULT_BSR, @@ -909,8 +930,10 @@ def msg(name=None): parser.add_argument('--t', '--translation-table', type=pv.translation_table_type, required=False, - default=11, dest='translation_table', - help='Genetic code used for allele translation.') + dest='translation_table', + help='Genetic code used for allele translation. This ' + 'value is ignored if a valid training file ' + 'is passed to `--ptf`, `--training-file`.') parser.add_argument('--st', '--size-threshold', type=pv.size_threshold_type, required=False, default=ct.SIZE_THRESHOLD_DEFAULT, @@ -944,10 +967,18 @@ def msg(name=None): args = parser.parse_args() del args.PrepExternalSchema - # Check if PTF exists - if args.ptf_path is not None: - if os.path.isfile(args.ptf_path) is False: + # Check if user passed PTF + if args.ptf_path: + # Check if PTF exists + if not os.path.isfile(args.ptf_path): sys.exit(ct.INVALID_PTF_PATH) + else: + # Get translation table used to create training file + ptf_table = gp.read_training_file(args.ptf_path).translation_table + args.translation_table = ptf_table + else: + if not args.translation_table: + args.translation_table = ct.GENETIC_CODES_DEFAULT # Define output paths schema_path = os.path.abspath(args.output_directory) diff --git a/CHEWBBACA/utils/constants.py b/CHEWBBACA/utils/constants.py index e6994431..b5568305 100755 --- a/CHEWBBACA/utils/constants.py +++ b/CHEWBBACA/utils/constants.py @@ -77,11 +77,25 @@ INTRA_CLUSTER_DEFAULT = 0.9 # Genetic codes/translation tables -GENETIC_CODES = {1: 'Standard', - 4: 'The mold, protozoan, and coelenterate mitochondrial ' - 'code and the mycoplasma/spiroplasma code', - 11: 'The Bacterial, Archaeal and Plant Plastid code', - 25: 'Candidate division SR1 and gracilibacteria code'} +GENETIC_CODES = {1: 'The Standard Code', + 2: 'The Vertebrate Mitochondrial Code', + 3: 'The Yeast Mitochondrial Code', + 4: 'The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code', + 5: 'The Invertebrate Mitochondrial Code', + 6: 'The Ciliate, Dasycladacean and Hexamita Nuclear Code', + 9: 'The Echinoderm and Flatworm Mitochondrial Code', + 10: 'The Euplotid Nuclear Code', + 11: 'The Bacterial, Archaeal and Plant Plastid Code', + 12: 'The Alternative Yeast Nuclear Code', + 13: 'The Ascidian Mitochondrial Code', + 14: 'The Alternative Flatworm Mitochondrial Code', + 15: 'Blepharisma Nuclear Code', + 16: 'Chlorophycean Mitochondrial Code', + 21: 'Trematode Mitochondrial Code', + 22: 'Scenedesmus obliquus Mitochondrial Code', + 23: 'Thraustochytrium Mitochondrial Code', + 24: 'Rhabdopleuridae Mitochondrial Code', + 25: 'Candidate Division SR1 and Gracilibacteria Code'} GENETIC_CODES_DEFAULT = 11 diff --git a/CHEWBBACA/utils/parameters_validation.py b/CHEWBBACA/utils/parameters_validation.py index c11ea683..a7adfcdd 100644 --- a/CHEWBBACA/utils/parameters_validation.py +++ b/CHEWBBACA/utils/parameters_validation.py @@ -24,12 +24,14 @@ try: from utils import (constants as ct, + gene_prediction as gp, file_operations as fo, chewiens_requests as cr, fasta_operations as fao, iterables_manipulation as im) except ModuleNotFoundError: from CHEWBBACA.utils import (constants as ct, + gene_prediction as gp, file_operations as fo, chewiens_requests as cr, fasta_operations as fao, @@ -242,7 +244,7 @@ def translation_table_type(arg, genetic_codes=ct.GENETIC_CODES): valid = False if valid is False: - # format available genetic codes into list + # Format available genetic codes into list lines = ['\t{0}: {1}'.format(k, v) for k, v in genetic_codes.items()] gc_table = '\n{0}\n'.format('\n'.join(lines)) @@ -798,7 +800,6 @@ def validate_ptf(ptf_path, schema_directory, schema_ptfs, force_continue): True if the training file does not match any of the training files previously used with the schema. """ - ptf_path = validate_ptf_path(ptf_path, schema_directory) # Determine PTF checksum @@ -855,26 +856,21 @@ def solve_conflicting_arguments(schema_params, ptf_path, blast_score_ratio, Dictionary with the arguments validated values that will be used for allele calling. """ - - # run parameters values + # Parameter values for current run run_params = {'bsr': blast_score_ratio, - 'translation_table': translation_table, + #'translation_table': translation_table, 'minimum_locus_length': minimum_length, 'size_threshold': size_threshold} - # determine user provided arguments values that differ from default + # Determine user provided values that differ from default unmatch_params = {k: v for k, v in run_params.items() if v not in schema_params[k] and v is not None} - # determine arguments values not provided by user - default_params = {k: schema_params[k][0] - for k, v in run_params.items() - if v is None} - # update arguments for current run - for k in run_params: - if k in default_params: - run_params[k] = default_params[k] + # Update run values equal to None + for k, v in run_params.items(): + if v is None: + run_params[k] = schema_params[k][0] if len(unmatch_params) > 0: print(ct.ARGS_DIFFER) @@ -892,11 +888,11 @@ def solve_conflicting_arguments(schema_params, ptf_path, blast_score_ratio, if params_answer.lower() not in ['y', 'yes']: sys.exit('Exited.') else: - # Append new argument values to configs values + # Append new argument values to config values for p in unmatch_params: schema_params[p].append(unmatch_params[p]) - # default is to get the training file in schema directory + # Default is to get the training file in schema directory schema_ptfs = schema_params['prodigal_training_file'] ptf_path, ptf_hash, unmatch = validate_ptf(ptf_path, schema_directory, schema_ptfs, force_continue) @@ -906,7 +902,24 @@ def solve_conflicting_arguments(schema_params, ptf_path, blast_score_ratio, schema_params['prodigal_training_file'].append(ptf_hash) unmatch_params['prodigal_training_file'] = ptf_hash - # save updated schema config file + # Update translation table + if ptf_path: + # Get translation table used to create training file + ptf_table = gp.read_training_file(ptf_path).translation_table + run_params['translation_table'] = ptf_table + if ptf_table not in schema_params['translation_table']: + schema_params['translation_table'].append(ptf_table) + unmatch_params['translation_table'] = ptf_table + else: + if not translation_table: + run_params['translation_table'] = schema_params['translation_table'][0] + else: + run_params['translation_table'] = translation_table + if translation_table not in schema_params['translation_table']: + schema_params['translation_table'].append(ptf_table) + unmatch_params['translation_table'] = ptf_table + + # Update schema config file if len(unmatch_params) > 0: fo.pickle_dumper(schema_params, config_file)