diff --git a/CONFIG/RUNTIME_PARAMETERS.py b/CONFIG/RUNTIME_PARAMETERS.py index 3732dc6..44b038f 100644 --- a/CONFIG/RUNTIME_PARAMETERS.py +++ b/CONFIG/RUNTIME_PARAMETERS.py @@ -14,6 +14,7 @@ # parameters used to filter mmseqs2 search results before aligning MMSEQS_MIN_BIT_SCORE = -99999 MMSEQS_MAX_EVAL = 99999 +MMSEQS_MIN_IDENTITY = 0.5 # parameters used to calculate alignment score PAIRWISE_ALIGNMENT_MATCH = 2 diff --git a/CONFIG/get_config_dict.py b/CONFIG/get_config_dict.py index 61f9fe8..41511fa 100644 --- a/CONFIG/get_config_dict.py +++ b/CONFIG/get_config_dict.py @@ -18,14 +18,15 @@ def runtime_config(): "ANGSTROM_CONTACT_THRESHOLD": ANGSTROM_CONTACT_THRESHOLD, "GENERATE_CONTACTS": GENERATE_CONTACTS, + "MMSEQS_MIN_BIT_SCORE": MMSEQS_MIN_BIT_SCORE, + "MMSEQS_MAX_EVAL": MMSEQS_MAX_EVAL, + "MMSEQS_MIN_IDENTITY": MMSEQS_MIN_IDENTITY, + "PAIRWISE_ALIGNMENT_MATCH": PAIRWISE_ALIGNMENT_MATCH, "PAIRWISE_ALIGNMENT_MISSMATCH": PAIRWISE_ALIGNMENT_MISSMATCH, "PAIRWISE_ALIGNMENT_GAP_OPEN": PAIRWISE_ALIGNMENT_GAP_OPEN, "PAIRWISE_ALIGNMENT_GAP_CONTINUATION": PAIRWISE_ALIGNMENT_GAP_CONTINUATION, - "MMSEQS_MIN_BIT_SCORE": MMSEQS_MIN_BIT_SCORE, - "MMSEQS_MAX_EVAL": MMSEQS_MAX_EVAL, - "ALIGNMENT_MIN_SEQUENCE_IDENTITY": ALIGNMENT_MIN_SEQUENCE_IDENTITY, } return config diff --git a/utils/search_alignments.py b/utils/search_alignments.py index 270387b..ca94dd0 100644 --- a/utils/search_alignments.py +++ b/utils/search_alignments.py @@ -39,10 +39,14 @@ def search_alignments(query_seqs: dict, mmseqs_search_output: pd.DataFrame, targ if alignments_json_file.exists(): return json.load(open(alignments_json_file, "r")) + print(f"MMseqs search output is {len(mmseqs_search_output)} long.") query_seqs_keys = list(query_seqs.keys()) filtered_mmseqs_search = mmseqs_search_output[mmseqs_search_output['bit_score'] > job_config["MMSEQS_MIN_BIT_SCORE"]] filtered_mmseqs_search = filtered_mmseqs_search[filtered_mmseqs_search['e_value'] < job_config["MMSEQS_MAX_EVAL"]] + filtered_mmseqs_search = filtered_mmseqs_search[filtered_mmseqs_search['identity'] > job_config["MMSEQS_MIN_IDENTITY"]] filtered_mmseqs_search = filtered_mmseqs_search[filtered_mmseqs_search['query'].isin(query_seqs_keys)] + print(f"Filtered {len(mmseqs_search_output) - len(filtered_mmseqs_search)} mmseqs matches. " + f"Total alignments to check {len(filtered_mmseqs_search)}") queries = list(map(lambda x: query_seqs[x], filtered_mmseqs_search["query"])) targets = list(map(lambda x: target_seqs[x], filtered_mmseqs_search["target"])) @@ -61,13 +65,14 @@ def search_alignments(query_seqs: dict, mmseqs_search_output: pd.DataFrame, targ alignments = dict() for i in range(len(all_alignments)): alignment = all_alignments[i] - query_id = filtered_mmseqs_search["query"].iloc[i] - target_id = filtered_mmseqs_search["target"].iloc[i] + # filter out bad alignments based on alignment sequences identity sequence_identity = alignment_sequences_identity(alignment) if sequence_identity > job_config["ALIGNMENT_MIN_SEQUENCE_IDENTITY"]: + query_id = filtered_mmseqs_search["query"].iloc[i] + target_id = filtered_mmseqs_search["target"].iloc[i] if query_id not in alignments.keys(): alignments[query_id] = {"target_id": target_id, "alignment": alignment, "sequence_identity": sequence_identity} - # select best alignment based on pairwise2.align.globalms.score, not sequence_identity? + # select best alignment based on pairwise2.align.globalms.score elif alignment.score > alignments[query_id]["alignment"].score: alignments[query_id] = {"target_id": target_id, "alignment": alignment, "sequence_identity": sequence_identity}