Skip to content

Commit

Permalink
Additional parameter to filter mmseqs search, release of v1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
SoliareofAstora committed Jan 26, 2022
1 parent 20c5432 commit fdb0578
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
1 change: 1 addition & 0 deletions CONFIG/RUNTIME_PARAMETERS.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# parameters used to filter mmseqs2 search results before aligning
MMSEQS_MIN_BIT_SCORE = -99999
MMSEQS_MAX_EVAL = 99999
MMSEQS_MIN_IDENTITY = 0.5

# parameters used to calculate alignment score
PAIRWISE_ALIGNMENT_MATCH = 2
Expand Down
7 changes: 4 additions & 3 deletions CONFIG/get_config_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,15 @@ def runtime_config():
"ANGSTROM_CONTACT_THRESHOLD": ANGSTROM_CONTACT_THRESHOLD,
"GENERATE_CONTACTS": GENERATE_CONTACTS,

"MMSEQS_MIN_BIT_SCORE": MMSEQS_MIN_BIT_SCORE,
"MMSEQS_MAX_EVAL": MMSEQS_MAX_EVAL,
"MMSEQS_MIN_IDENTITY": MMSEQS_MIN_IDENTITY,

"PAIRWISE_ALIGNMENT_MATCH": PAIRWISE_ALIGNMENT_MATCH,
"PAIRWISE_ALIGNMENT_MISSMATCH": PAIRWISE_ALIGNMENT_MISSMATCH,
"PAIRWISE_ALIGNMENT_GAP_OPEN": PAIRWISE_ALIGNMENT_GAP_OPEN,
"PAIRWISE_ALIGNMENT_GAP_CONTINUATION": PAIRWISE_ALIGNMENT_GAP_CONTINUATION,

"MMSEQS_MIN_BIT_SCORE": MMSEQS_MIN_BIT_SCORE,
"MMSEQS_MAX_EVAL": MMSEQS_MAX_EVAL,

"ALIGNMENT_MIN_SEQUENCE_IDENTITY": ALIGNMENT_MIN_SEQUENCE_IDENTITY,
}
return config
11 changes: 8 additions & 3 deletions utils/search_alignments.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,14 @@ def search_alignments(query_seqs: dict, mmseqs_search_output: pd.DataFrame, targ
if alignments_json_file.exists():
return json.load(open(alignments_json_file, "r"))

print(f"MMseqs search output is {len(mmseqs_search_output)} long.")
query_seqs_keys = list(query_seqs.keys())
filtered_mmseqs_search = mmseqs_search_output[mmseqs_search_output['bit_score'] > job_config["MMSEQS_MIN_BIT_SCORE"]]
filtered_mmseqs_search = filtered_mmseqs_search[filtered_mmseqs_search['e_value'] < job_config["MMSEQS_MAX_EVAL"]]
filtered_mmseqs_search = filtered_mmseqs_search[filtered_mmseqs_search['identity'] > job_config["MMSEQS_MIN_IDENTITY"]]
filtered_mmseqs_search = filtered_mmseqs_search[filtered_mmseqs_search['query'].isin(query_seqs_keys)]
print(f"Filtered {len(mmseqs_search_output) - len(filtered_mmseqs_search)} mmseqs matches. "
f"Total alignments to check {len(filtered_mmseqs_search)}")

queries = list(map(lambda x: query_seqs[x], filtered_mmseqs_search["query"]))
targets = list(map(lambda x: target_seqs[x], filtered_mmseqs_search["target"]))
Expand All @@ -61,13 +65,14 @@ def search_alignments(query_seqs: dict, mmseqs_search_output: pd.DataFrame, targ
alignments = dict()
for i in range(len(all_alignments)):
alignment = all_alignments[i]
query_id = filtered_mmseqs_search["query"].iloc[i]
target_id = filtered_mmseqs_search["target"].iloc[i]
# filter out bad alignments based on alignment sequences identity
sequence_identity = alignment_sequences_identity(alignment)
if sequence_identity > job_config["ALIGNMENT_MIN_SEQUENCE_IDENTITY"]:
query_id = filtered_mmseqs_search["query"].iloc[i]
target_id = filtered_mmseqs_search["target"].iloc[i]
if query_id not in alignments.keys():
alignments[query_id] = {"target_id": target_id, "alignment": alignment, "sequence_identity": sequence_identity}
# select best alignment based on pairwise2.align.globalms.score, not sequence_identity?
# select best alignment based on pairwise2.align.globalms.score
elif alignment.score > alignments[query_id]["alignment"].score:
alignments[query_id] = {"target_id": target_id, "alignment": alignment, "sequence_identity": sequence_identity}

Expand Down

0 comments on commit fdb0578

Please sign in to comment.