haddock3-traceback #668

Merged: 10 commits, Aug 23, 2023
Changes from 6 commits
setup.py (1 change: 1 addition & 0 deletions)

@@ -81,6 +81,7 @@ def read(*names, **kwargs):
         'haddock3-score = haddock.clis.cli_score:maincli',
         'haddock3-unpack = haddock.clis.cli_unpack:maincli',
         'haddock3-analyse = haddock.clis.cli_analyse:maincli',
+        'haddock3-traceback = haddock.clis.cli_traceback:maincli',
         ]
     },
     # cmdclass={'build_ext': optional_build_ext},
src/haddock/clis/cli_traceback.py (243 changes: 243 additions & 0 deletions)

@@ -0,0 +1,243 @@
"""
Traces back PDB files from a HADDOCK run directory.

Given an input run directory, haddock3-traceback traces back each model to the
initial input molecules used, providing the rank of each intermediate model.

USAGE::

    haddock3-traceback -r <run_dir>

"""
import argparse
import sys
from pathlib import Path

import numpy as np
import pandas as pd

from haddock import log
from haddock.libs.libontology import ModuleIO, PDBFile
from haddock.modules import get_module_steps_folders


TRACK_FOLDER = "traceback"  # name of the traceback folder

ANA_MODULES = ["caprieval",
               "seletop",
               "topoaa",
               "rmsdmatrix",
               "clustrmsd",
               "clustfcc"]


def get_ori_names(n: int, pdbfile: PDBFile, max_topo_len: int):
    """
    Get the original name(s) of the PDB file.

    Parameters
    ----------
    n : int
        Step number.
    pdbfile : PDBFile
        PDBFile object.
    max_topo_len : int
        Maximum length of the topologies found so far.

    Returns
    -------
    ori_names : list
        List of original names.
    max_topo_len : int
        Maximum length of the topologies found so far.
    """
    if n != 0:  # not the first step, ori_name should be defined
        ori_names = [pdbfile.ori_name]
    else:  # first step, we get topology files instead of ori_name
        # topology can either be a list of topologies or a single topology
        if isinstance(pdbfile.topology, list):
            ori_names = [el.file_name for el in pdbfile.topology]
            if len(pdbfile.topology) > max_topo_len:
                max_topo_len = len(pdbfile.topology)
        else:
            ori_names = [pdbfile.topology.file_name]
            max_topo_len = 1
    return ori_names, max_topo_len
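
Aside: to make the two branches concrete, here is a minimal sketch using hypothetical stand-ins for PDBFile and its topology objects (the real classes live in haddock.libs.libontology; all file names are invented):

    from types import SimpleNamespace

    # hypothetical stand-ins for PDBFile/topology, for illustration only
    topo_a = SimpleNamespace(file_name="mol1.psf")
    topo_b = SimpleNamespace(file_name="mol2.psf")

    # first step (n == 0) with a list of topologies: names come from the
    # topology files and max_topo_len grows to the list length
    first = SimpleNamespace(topology=[topo_a, topo_b])
    print(get_ori_names(0, first, max_topo_len=0))  # (['mol1.psf', 'mol2.psf'], 2)

    # any later step (n != 0): the name is simply ori_name
    later = SimpleNamespace(ori_name="rigidbody_42.pdb")
    print(get_ori_names(3, later, max_topo_len=2))  # (['rigidbody_42.pdb'], 2)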


def traceback_dataframe(data_dict: dict,
                        rank_dict: dict,
                        sel_step: list,
                        max_topo_len: int):
    """
    Create the traceback dataframe by combining ranks and data.

    Parameters
    ----------
    data_dict : dict
        Dictionary containing the data to be traced back.
    rank_dict : dict
        Dictionary containing the ranks of the data to be traced back.
    sel_step : list
        List of selected steps.
    max_topo_len : int
        Maximum length of the topologies.

    Returns
    -------
    df_ord : pandas.DataFrame
        Dataframe containing the traceback data.
    """
    # get the last step of the workflow
    last_step = sel_step[-1]
    # data dict to dataframe
    df_data = pd.DataFrame.from_dict(data_dict, orient="index")
    df_data.reset_index(inplace=True)
    # assign columns
    data_cols = [el for el in reversed(sel_step)]
    data_cols.extend([f"00_topo{i+1}" for i in range(max_topo_len)])
    df_data.columns = data_cols

    # same for the rank_dict
    df_ranks = pd.DataFrame.from_dict(rank_dict, orient="index")
    df_ranks.reset_index(inplace=True)
    ranks_col = [last_step]  # the key to merge the dataframes
    ranks_col.extend([f"{el}_rank" for el in reversed(sel_step)])
    df_ranks.columns = ranks_col

    # merge the data and ranks dataframes
    df_merged = pd.merge(df_data, df_ranks, on=last_step)
    ordered_cols = sorted(df_merged.columns)
    # copy to avoid pandas' SettingWithCopyWarning on the assignment below
    df_ord = df_merged[ordered_cols].copy()
    # last thing: substitute "unk" records with "-" in the last step
    unk_records = df_ord[last_step].str.startswith('unk')
    df_ord.loc[unk_records, last_step] = "-"
    return df_ord
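
Aside: a toy invocation under an assumed two-step workflow (all step and file names invented), showing the resulting column layout:

    data_dict = {
        "flexref_1.pdb": ["rigidbody_3.pdb", "mol1.psf"],
        "flexref_2.pdb": ["rigidbody_1.pdb", "mol1.psf"],
    }
    rank_dict = {
        "flexref_1.pdb": [1, 3],
        "flexref_2.pdb": [2, 1],
    }
    df = traceback_dataframe(data_dict, rank_dict,
                             sel_step=["1_rigidbody", "2_flexref"],
                             max_topo_len=1)
    print(list(df.columns))
    # ['00_topo1', '1_rigidbody', '1_rigidbody_rank', '2_flexref', '2_flexref_rank']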


# Command line interface parser
ap = argparse.ArgumentParser(
    prog="haddock3-traceback",
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
)

ap.add_argument(
    "-r",
    "--run-dir",
    help="The input run directory.",
    required=True,
)


def _ap():
    return ap


def load_args(ap):
    """Load argument parser args."""
    return ap.parse_args()


def cli(ap, main):
    """Command-line interface entry point."""
    cmd = vars(load_args(ap))
    main(**cmd)


def maincli():
    """Execute main client."""
    cli(ap, main)


def main(run_dir):
    """
    Run the traceback CLI.

    Parameters
    ----------
    run_dir : str or Path
        Path to the original run directory.
    """
    log.level = 20
    log.info(f"Running haddock3-traceback on {run_dir}")

    outdir = Path(run_dir, TRACK_FOLDER)
    try:
        outdir.mkdir(exist_ok=False)
        log.info(f"Created directory: {str(outdir.resolve())}")
    except FileExistsError:
        log.warning(f"Directory {str(outdir.resolve())} already exists.")

    # read the steps
    log.info("Reading input run directory")
    # get the module folders from the run_dir input
    all_steps = get_module_steps_folders(Path(run_dir))
    log.info(f"All steps: {', '.join(all_steps)}")
    sel_step = [st for st in all_steps if st.split("_")[1] not in ANA_MODULES]
Review comment by @VGPReys (Contributor), Jun 26, 2023, on the line above:

    '_'.join(st.split("_")[1:])

    Just to make sure, in case any module name has an underscore character one day?

Reply by the author:

    I would not worry about it.
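
For reference, a quick sketch of the edge case being raised (the folder name is invented):

    st = "3_clust_fcc"  # hypothetical step folder for a module name with an underscore
    print(st.split("_")[1])             # 'clust'     -> would slip past ANA_MODULES
    print("_".join(st.split("_")[1:]))  # 'clust_fcc' -> robust to underscores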

log.info(f"Steps to trace back: {', '.join(sel_step)}")

data_dict, rank_dict = {}, {}
unk_idx, max_topo_len = 0, 0
# this cycle goes through the steps in reverse order
for n in range(len(sel_step) - 1, -1, -1):
delta = len(sel_step) - n - 1 # how many steps have we gone back?
log.info(f"Tracing back step {sel_step[n]}")
# loading the .json file
json_path = Path(run_dir, sel_step[n], "io.json")
io = ModuleIO()
io.load(json_path)
# list all the values in the data_dict
ls_values = [x for val in data_dict.values() for x in val]
# getting and sorting the ranks for the current step folder
ranks = []
        for pdbfile in io.output:
            ranks.append(pdbfile.score)
        ranks_argsort = np.argsort(ranks)

        # iterate through the pdbfiles to fill data_dict and rank_dict
        for i, pdbfile in enumerate(io.output):
            rank = np.where(ranks_argsort == i)[0][0] + 1
            # get the original names
            ori_names, max_topo_len = get_ori_names(n, pdbfile, max_topo_len)
            if n != len(sel_step) - 1:
                if pdbfile.file_name not in ls_values:
                    # this is the first step in which the pdbfile appears,
                    # meaning it was discarded in the subsequent steps.
                    # It still needs to be added to the data_dict.
                    key = f"unk{unk_idx}"
                    data_dict[key] = ["-" for el in range(delta - 1)]
                    data_dict[key].append(pdbfile.file_name)
                    rank_dict[key] = ["-" for el in range(delta)]
                    unk_idx += 1
                else:
                    # we have already seen this pdb before
                    idx = ls_values.index(pdbfile.file_name)
                    key = list(data_dict.keys())[idx // delta]

                # assignment
                for el in ori_names:
                    data_dict[key].append(el)
                rank_dict[key].append(rank)
            else:  # last step of the workflow
                data_dict[pdbfile.file_name] = [oname for oname in ori_names]
                rank_dict[pdbfile.file_name] = [rank]

    # print(f"rank_dict {rank_dict}")
    # print(f"data_dict {data_dict}, maxtopo {max_topo_len}")
    # dump the data into a dataframe
    df_output = traceback_dataframe(data_dict,
                                    rank_dict,
                                    sel_step,
                                    max_topo_len)
    # write the dataframe to disk
    track_filename = Path(run_dir, TRACK_FOLDER, "traceback.tsv")
    log.info(f"Output dataframe {track_filename} "
             f"created with shape {df_output.shape}")
    df_output.to_csv(track_filename, sep="\t", index=False)
    return


if __name__ == "__main__":
    sys.exit(maincli())
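
As a usage sketch, the resulting table can be inspected with pandas; the run directory and step name below are invented, and the column pattern follows traceback_dataframe above:

    import pandas as pd

    # hypothetical run directory; the CLI writes <run_dir>/traceback/traceback.tsv
    df = pd.read_csv("run1/traceback/traceback.tsv", sep="\t")

    # the "-" placeholders make rank columns strings, so coerce before filtering
    df["4_emref_rank"] = pd.to_numeric(df["4_emref_rank"], errors="coerce")
    best = df[df["4_emref_rank"] == 1]  # top-ranked model of a hypothetical final step
    print(best.filter(like="00_topo"))  # the input molecule(s) it came from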
src/haddock/modules/analysis/seletopclusts/__init__.py (1 change: 1 addition & 0 deletions)

@@ -87,6 +87,7 @@ def _run(self):
             # changing attributes
             name_path = Path(name)
             name_path.write_text(model.rel_path.read_text())
+            model.ori_name = model.file_name
             model.file_name = name
             model.full_name = name
             model.rel_path = Path('..', Path(self.path).name, name)
src/haddock/modules/refinement/emref/__init__.py (1 change: 1 addition & 0 deletions)

@@ -85,6 +85,7 @@ def _run(self):
                 model, idx, ".", "emref"
             )
             expected_pdb.restr_fname = ambig_fname
+            expected_pdb.ori_name = model.file_name
             self.output_models.append(expected_pdb)

             job = CNSJob(inp_file, out_file, envvars=self.envvars)
src/haddock/modules/refinement/flexref/__init__.py (1 change: 1 addition & 0 deletions)

@@ -87,6 +87,7 @@ def _run(self):
                 model, idx, ".", "flexref"
             )
             expected_pdb.restr_fname = ambig_fname
+            expected_pdb.ori_name = model.file_name
             self.output_models.append(expected_pdb)

             job = CNSJob(inp_file, out_file, envvars=self.envvars)
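
All three module changes follow the same pattern: the incoming file name is stored on ori_name before the model is renamed or a refined copy is written, which is exactly the attribute get_ori_names reads for every non-initial step. A minimal sketch of the provenance chain this creates (the class is an illustrative stand-in, not the real PDBFile):

    class Model:
        """Illustrative stand-in for haddock's PDBFile."""

        def __init__(self, file_name, ori_name=None):
            self.file_name = file_name
            self.ori_name = ori_name

    # each step links its output back to the file it started from
    rigid = Model("rigidbody_3.pdb")
    refined = Model("emref_1.pdb", ori_name=rigid.file_name)

    # haddock3-traceback walks these links backwards, step by step
    print(refined.ori_name)  # rigidbody_3.pdb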