From 71e8263d73b6e13cc38dce83043410943bc11f91 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Fri, 23 Jun 2023 14:16:20 +0200 Subject: [PATCH 01/10] added ori_name field to exported models --- src/haddock/modules/analysis/seletopclusts/__init__.py | 1 + src/haddock/modules/refinement/emref/__init__.py | 1 + src/haddock/modules/refinement/flexref/__init__.py | 1 + 3 files changed, 3 insertions(+) diff --git a/src/haddock/modules/analysis/seletopclusts/__init__.py b/src/haddock/modules/analysis/seletopclusts/__init__.py index e605759b1..c552ad6c6 100644 --- a/src/haddock/modules/analysis/seletopclusts/__init__.py +++ b/src/haddock/modules/analysis/seletopclusts/__init__.py @@ -87,6 +87,7 @@ def _run(self): # changing attributes name_path = Path(name) name_path.write_text(model.rel_path.read_text()) + model.ori_name = model.file_name model.file_name = name model.full_name = name model.rel_path = Path('..', Path(self.path).name, name) diff --git a/src/haddock/modules/refinement/emref/__init__.py b/src/haddock/modules/refinement/emref/__init__.py index 8a22eec59..71066f9ad 100644 --- a/src/haddock/modules/refinement/emref/__init__.py +++ b/src/haddock/modules/refinement/emref/__init__.py @@ -85,6 +85,7 @@ def _run(self): model, idx, ".", "emref" ) expected_pdb.restr_fname = ambig_fname + expected_pdb.ori_name = model.file_name self.output_models.append(expected_pdb) job = CNSJob(inp_file, out_file, envvars=self.envvars) diff --git a/src/haddock/modules/refinement/flexref/__init__.py b/src/haddock/modules/refinement/flexref/__init__.py index cab2602d3..de1ce76b7 100644 --- a/src/haddock/modules/refinement/flexref/__init__.py +++ b/src/haddock/modules/refinement/flexref/__init__.py @@ -87,6 +87,7 @@ def _run(self): model, idx, ".", "flexref" ) expected_pdb.restr_fname = ambig_fname + expected_pdb.ori_name = model.file_name self.output_models.append(expected_pdb) job = CNSJob(inp_file, out_file, envvars=self.envvars) From e05cbd03880f35ee8e8c8b94f465000f7068c4b8 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Mon, 26 Jun 2023 10:54:35 +0200 Subject: [PATCH 02/10] added traceback cli --- setup.py | 1 + src/haddock/clis/cli_traceback.py | 179 ++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 src/haddock/clis/cli_traceback.py diff --git a/setup.py b/setup.py index 62f26dd5c..eee785d2e 100644 --- a/setup.py +++ b/setup.py @@ -81,6 +81,7 @@ def read(*names, **kwargs): 'haddock3-score = haddock.clis.cli_score:maincli', 'haddock3-unpack = haddock.clis.cli_unpack:maincli', 'haddock3-analyse = haddock.clis.cli_analyse:maincli', + 'haddock3-traceback = haddock.clis.cli_traceback:maincli', ] }, # cmdclass={'build_ext': optional_build_ext}, diff --git a/src/haddock/clis/cli_traceback.py b/src/haddock/clis/cli_traceback.py new file mode 100644 index 000000000..b662464bc --- /dev/null +++ b/src/haddock/clis/cli_traceback.py @@ -0,0 +1,179 @@ +""" +Traceback CLI. Given an input run directory, haddock3-traceback traces back +each model to the initial input molecules used, providing the rank of each +intermediate structure. 
+ +USAGE:: + + haddock3-traceback -r + +""" +import argparse +import sys +from pathlib import Path + +from haddock import log +from haddock.libs.libontology import ModuleIO +from haddock.modules import get_module_steps_folders +import numpy as np +import pandas as pd + +TRACK_FOLDER = "traceback" # name of the traceback folder + + +def traceback_dataframe(data_dict: dict, rank_dict: dict, sel_step: list, max_topo_len: int): + """ + Creates traceback dataframe by combining together ranks and data. + """ + + # get last step of the workflow + last_step = sel_step[-1] + # data dict to dataframe + df_data = pd.DataFrame.from_dict(data_dict, orient="index") + df_data.reset_index(inplace=True) + # assign columns + data_cols = [el for el in reversed(sel_step)] + data_cols.extend([f"00_topo{i+1}" for i in range(max_topo_len)]) + df_data.columns = data_cols + + # same for the rank_dict + df_ranks = pd.DataFrame.from_dict(rank_dict, orient="index") + df_ranks.reset_index(inplace=True) + ranks_col = [last_step] # the key to merge the dataframes + ranks_col.extend([f"{el}_rank" for el in reversed(sel_step)]) + df_ranks.columns=ranks_col + + # merging the data and ranks dataframes + df_merged = pd.merge(df_data, df_ranks, on=last_step) + ordered_cols = sorted(df_merged.columns) + df_ord = df_merged[ordered_cols] + # last thing: substituting unk records with - in the last step + unk_records = df_ord[f'{last_step}'].str.startswith('unk') + df_ord.loc[unk_records, last_step] = "-" + return df_ord + +# Command line interface parser +ap = argparse.ArgumentParser( + prog="haddock3-traceback", + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + +ap.add_argument( + "-r", + "--run-dir", + help="The input run directory.", + required=True, + ) + + +def _ap(): + return ap + + +def load_args(ap): + """Load argument parser args.""" + return ap.parse_args() + + +def cli(ap, main): + """Command-line interface entry point.""" + cmd = vars(load_args(ap)) + main(**cmd) + + +def maincli(): + """Execute main client.""" + cli(ap, main) + + +def main(run_dir): + """ + Analyse CLI. + + Parameters + ---------- + run_dir : str or Path + Path to the original run directory. + """ + log.level = 20 + log.info(f"Running haddock3-traceback on {run_dir}") + + outdir = Path(run_dir, TRACK_FOLDER) + try: + outdir.mkdir(exist_ok=False) + log.info(f"Created directory: {str(outdir.resolve())}") + except FileExistsError: + log.warning(f"Directory {str(outdir.resolve())} already exists.") + + # Reading steps + log.info("Reading input run directory") + # get the module folders from the run_dir input + all_steps = get_module_steps_folders(Path(run_dir)) + log.info(f"All_steps: {', '.join(all_steps)}") + analysis_modules = ["caprieval", "seletop", "topoaa", "rmsdmatrix", "clustrmsd", "clustfcc"] + sel_step = [st for st in all_steps if st.split("_")[1] not in analysis_modules] + log.info(f"Steps to trace back: {', '.join(sel_step)}") + + data_dict, rank_dict = {}, {} + unk_idx, max_topo_len = 0, 0 + # this cycle goes through the steps in reverse order + for n in range(len(sel_step)-1,-1,-1): + delta = len(sel_step) - n - 1 # how many steps have we gone back? 
+ log.info(f"Tracing back step {sel_step[n]}") + # loading the .json file + json_path = Path(run_dir, sel_step[n], "io.json") + io = ModuleIO() + io.load(json_path) + # list all the values in the data_dict + ls_values = [x for l in data_dict.values() for x in l] + # getting and sorting the ranks for the current step folder + ranks = [] + for pdbfile in io.output: + ranks.append(pdbfile.score) + ranks_argsort = np.argsort(ranks) + + # iterating through the pdbfiles to fill data_dict and rank_dict + for i, pdbfile in enumerate(io.output): + rank = np.where(ranks_argsort == i)[0][0] + 1 + if n != len(sel_step) - 1: + if pdbfile.file_name not in ls_values: + # this is the first step in which the pdbfile appears. + # This means that it was discarded for the subsequent steps + # We need to add the pdbfile to the data_dict + key = f"unk{unk_idx}" + data_dict[key] = ["-" for el in range(delta-1)] + data_dict[key].append(pdbfile.file_name) + rank_dict[key] = ["-" for el in range(delta)] + unk_idx += 1 + else: + # we've already seen this pdb before. + idx = ls_values.index(pdbfile.file_name) + key = list(data_dict.keys())[idx//delta] + # at which step are we? + if n != 0: # not the first step, ori_name should be defined + ori_names = [pdbfile.ori_name] + else: # first step, we get topology files instead of ori_name + ori_names = [el.file_name for el in pdbfile.topology] + if len(pdbfile.topology) > max_topo_len: + max_topo_len = len(pdbfile.topology) + # assignment + for el in ori_names: + data_dict[key].append(el) + rank_dict[key].append(rank) + else: + data_dict[pdbfile.file_name] = [pdbfile.ori_name] + rank_dict[pdbfile.file_name] = [rank] + #print(f"rank_dict {rank_dict}") + #print(f"data_dict {data_dict}") + # dumping the data into a dataframe + df_output = traceback_dataframe(data_dict, rank_dict, sel_step, max_topo_len) + # dumping the dataframe + track_filename = Path(run_dir, TRACK_FOLDER, "traceback.tsv") + log.info(f"Output dataframe {track_filename} created with shape {df_output.shape}") + df_output.to_csv(track_filename, sep="\t") + return + + +if __name__ == "__main__": + sys.exit(maincli()) From 2a5c56653fc34e049f418ef201562fc14705517c Mon Sep 17 00:00:00 2001 From: mgiulini Date: Mon, 26 Jun 2023 11:42:08 +0200 Subject: [PATCH 03/10] removed index --- src/haddock/clis/cli_traceback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/haddock/clis/cli_traceback.py b/src/haddock/clis/cli_traceback.py index b662464bc..682cf2457 100644 --- a/src/haddock/clis/cli_traceback.py +++ b/src/haddock/clis/cli_traceback.py @@ -171,7 +171,7 @@ def main(run_dir): # dumping the dataframe track_filename = Path(run_dir, TRACK_FOLDER, "traceback.tsv") log.info(f"Output dataframe {track_filename} created with shape {df_output.shape}") - df_output.to_csv(track_filename, sep="\t") + df_output.to_csv(track_filename, sep="\t", index=False) return From 3c5b3cff3e7a816bc3a9d2ed067ed47e680c6e02 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Mon, 26 Jun 2023 11:42:31 +0200 Subject: [PATCH 04/10] added test for traceback --- tests/golden_data/io_flexref.json | 203 +++++++++++++++++++ tests/golden_data/io_rigid.json | 325 ++++++++++++++++++++++++++++++ tests/test_cli_traceback.py | 62 ++++++ 3 files changed, 590 insertions(+) create mode 100644 tests/golden_data/io_flexref.json create mode 100644 tests/golden_data/io_rigid.json create mode 100644 tests/test_cli_traceback.py diff --git a/tests/golden_data/io_flexref.json b/tests/golden_data/io_flexref.json new file mode 100644 index 
000000000..d5f6ed570 --- /dev/null +++ b/tests/golden_data/io_flexref.json @@ -0,0 +1,203 @@ +{ + "input": [], + "output": [ + { + "clt_id": null, + "clt_model_rank": null, + "clt_rank": null, + "created": "2023-06-26 11:15:03", + "file_name": "flexref_1.pdb", + "file_type": { + "py/reduce": [ + { + "py/type": "haddock.libs.libontology.Format" + }, + { + "py/tuple": [ + "pdb" + ] + } + ] + }, + "full_name": "flexref_1.pdb", + "len": NaN, + "md5": null, + "ori_name": "rigidbody_2.pdb", + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/5_flexref", + "py/object": "haddock.libs.libontology.PDBFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "5_flexref", + "flexref_1.pdb" + ] + } + ] + }, + "restr_fname": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "data", + "5_flexref", + "ambig-CDR-NMR-CSP.tbl" + ] + } + ] + }, + "score": -242.31202000000002, + "topology": [ + { + "created": "2023-06-26 11:14:32", + "file_name": "4G6K_fv_haddock.psf", + "file_type": { + "py/reduce": [ + { + "py/type": "haddock.libs.libontology.Format" + }, + { + "py/tuple": [ + "psf" + ] + } + ] + }, + "full_name": "4G6K_fv_haddock.psf", + "md5": null, + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/0_topoaa", + "py/object": "haddock.libs.libontology.TopologyFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "0_topoaa", + "4G6K_fv_haddock.psf" + ] + } + ] + }, + "restr_fname": null + }, + { + "created": "2023-06-26 11:14:32", + "file_name": "4I1B-matched_haddock.psf", + "file_type": { + "py/id": 9 + }, + "full_name": "4I1B-matched_haddock.psf", + "md5": null, + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/0_topoaa", + "py/object": "haddock.libs.libontology.TopologyFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "0_topoaa", + "4I1B-matched_haddock.psf" + ] + } + ] + }, + "restr_fname": null + } + ], + "unw_energies": { + "air": 117.512, + "angles": 0.0, + "bonds": 0.0, + "bsa": 1363.37, + "cdih": 0.0, + "coup": 0.0, + "dani": 0.0, + "desolv": 11.779, + "dihe": 0.0, + "elec": -244.859, + "improper": 0.0, + "rdcs": 0.0, + "rg": 0.0, + "total": -134.696, + "vdw": -7.34952, + "vean": 0.0, + "xpcs": 0.0 + } + }, + { + "clt_id": null, + "clt_model_rank": null, + "clt_rank": null, + "created": "2023-06-26 11:15:03", + "file_name": "flexref_2.pdb", + "file_type": { + "py/id": 4 + }, + "full_name": "flexref_2.pdb", + "len": NaN, + "md5": null, + "ori_name": "rigidbody_4.pdb", + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/5_flexref", + "py/object": "haddock.libs.libontology.PDBFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "5_flexref", + "flexref_2.pdb" + ] + } + ] + }, + "restr_fname": { + "py/id": 6 + }, + "score": -191.67547, + "topology": [ + { + "py/id": 8 + }, + { + "py/id": 11 + } + ], + "unw_energies": { + "air": 180.926, + "angles": 0.0, + "bonds": 0.0, + "bsa": 1475.06, + "cdih": 0.0, + "coup": 0.0, + "dani": 0.0, + "desolv": 2.32073, + "dihe": 0.0, + "elec": -175.102, + "improper": 0.0, + "rdcs": 0.0, + "rg": 0.0, + "total": -16.4121, + "vdw": -22.2362, + "vean": 0.0, + "xpcs": 0.0 + } + } + ] +} \ No newline 
at end of file diff --git a/tests/golden_data/io_rigid.json b/tests/golden_data/io_rigid.json new file mode 100644 index 000000000..57452ab39 --- /dev/null +++ b/tests/golden_data/io_rigid.json @@ -0,0 +1,325 @@ +{ + "input": [], + "output": [ + { + "clt_id": null, + "clt_model_rank": null, + "clt_rank": null, + "created": "2023-06-26 11:14:34", + "file_name": "rigidbody_1.pdb", + "file_type": { + "py/reduce": [ + { + "py/type": "haddock.libs.libontology.Format" + }, + { + "py/tuple": [ + "pdb" + ] + } + ] + }, + "full_name": "1_rigidbody/rigidbody_1.pdb", + "len": NaN, + "md5": null, + "ori_name": null, + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/1_rigidbody/1_rigidbody", + "py/object": "haddock.libs.libontology.PDBFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "1_rigidbody", + "rigidbody_1.pdb" + ] + } + ] + }, + "restr_fname": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "data", + "1_rigidbody", + "ambig-CDR-NMR-CSP.tbl" + ] + } + ] + }, + "score": 9.95317401, + "topology": [ + { + "created": "2023-06-26 11:14:32", + "file_name": "4G6K_fv_haddock.psf", + "file_type": { + "py/reduce": [ + { + "py/type": "haddock.libs.libontology.Format" + }, + { + "py/tuple": [ + "psf" + ] + } + ] + }, + "full_name": "4G6K_fv_haddock.psf", + "md5": null, + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/0_topoaa", + "py/object": "haddock.libs.libontology.TopologyFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "0_topoaa", + "4G6K_fv_haddock.psf" + ] + } + ] + }, + "restr_fname": null + }, + { + "created": "2023-06-26 11:14:32", + "file_name": "4I1B-matched_haddock.psf", + "file_type": { + "py/id": 9 + }, + "full_name": "4I1B-matched_haddock.psf", + "md5": null, + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/0_topoaa", + "py/object": "haddock.libs.libontology.TopologyFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "0_topoaa", + "4I1B-matched_haddock.psf" + ] + } + ] + }, + "restr_fname": null + } + ], + "unw_energies": { + "air": 1443.5, + "angles": 0.0, + "bonds": 0.0, + "bsa": 1413.94, + "cdih": 0.0, + "coup": 0.0, + "dani": 0.0, + "desolv": 14.0989, + "dihe": 0.0, + "elec": -4.44414, + "improper": 0.0, + "rdcs": 0.0, + "rg": 0.0, + "total": 1439.34, + "vdw": 0.281401, + "vean": 0.0, + "xpcs": 0.0 + } + }, + { + "clt_id": null, + "clt_model_rank": null, + "clt_rank": null, + "created": "2023-06-26 11:14:34", + "file_name": "rigidbody_2.pdb", + "file_type": { + "py/id": 4 + }, + "full_name": "1_rigidbody/rigidbody_2.pdb", + "len": NaN, + "md5": null, + "ori_name": null, + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/1_rigidbody/1_rigidbody", + "py/object": "haddock.libs.libontology.PDBFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "1_rigidbody", + "rigidbody_2.pdb" + ] + } + ] + }, + "restr_fname": { + "py/id": 6 + }, + "score": 0.7690929999999998, + "topology": [ + { + "py/id": 8 + }, + { + "py/id": 11 + } + ], + "unw_energies": { + "air": 853.966, + "angles": 0.0, + "bonds": 0.0, + "bsa": 1193.51, + "cdih": 0.0, + "coup": 0.0, + "dani": 0.0, + "desolv": 11.3613, + "dihe": 0.0, + 
"elec": -7.52969, + "improper": 0.0, + "rdcs": 0.0, + "rg": 0.0, + "total": 879.729, + "vdw": 33.2923, + "vean": 0.0, + "xpcs": 0.0 + } + }, + { + "clt_id": null, + "clt_model_rank": null, + "clt_rank": null, + "created": "2023-06-26 11:14:34", + "file_name": "rigidbody_3.pdb", + "file_type": { + "py/id": 4 + }, + "full_name": "1_rigidbody/rigidbody_3.pdb", + "len": NaN, + "md5": null, + "ori_name": null, + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/1_rigidbody/1_rigidbody", + "py/object": "haddock.libs.libontology.PDBFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "1_rigidbody", + "rigidbody_3.pdb" + ] + } + ] + }, + "restr_fname": { + "py/id": 6 + }, + "score": 19.477193999999997, + "topology": [ + { + "py/id": 8 + }, + { + "py/id": 11 + } + ], + "unw_energies": { + "air": 1968.47, + "angles": 0.0, + "bonds": 0.0, + "bsa": 1062.13, + "cdih": 0.0, + "coup": 0.0, + "dani": 0.0, + "desolv": 12.9516, + "dihe": 0.0, + "elec": -2.87223, + "improper": 0.0, + "rdcs": 0.0, + "rg": 0.0, + "total": 1999.04, + "vdw": 33.4424, + "vean": 0.0, + "xpcs": 0.0 + } + }, + { + "clt_id": null, + "clt_model_rank": null, + "clt_rank": null, + "created": "2023-06-26 11:14:34", + "file_name": "rigidbody_4.pdb", + "file_type": { + "py/id": 4 + }, + "full_name": "1_rigidbody/rigidbody_4.pdb", + "len": NaN, + "md5": null, + "ori_name": null, + "path": "/trinity/login/mgiulini/haddock3/examples/docking-antibody-antigen/run1-CDR-NMR-CSP-test-new/1_rigidbody/1_rigidbody", + "py/object": "haddock.libs.libontology.PDBFile", + "rel_path": { + "py/reduce": [ + { + "py/type": "pathlib.PosixPath" + }, + { + "py/tuple": [ + "..", + "1_rigidbody", + "rigidbody_4.pdb" + ] + } + ] + }, + "restr_fname": { + "py/id": 6 + }, + "score": 1.1810980000000004, + "topology": [ + { + "py/id": 8 + }, + { + "py/id": 11 + } + ], + "unw_energies": { + "air": 886.669, + "angles": 0.0, + "bonds": 0.0, + "bsa": 1241.54, + "cdih": 0.0, + "coup": 0.0, + "dani": 0.0, + "desolv": 12.0165, + "dihe": 0.0, + "elec": -7.6465, + "improper": 0.0, + "rdcs": 0.0, + "rg": 0.0, + "total": 915.003, + "vdw": 35.9808, + "vean": 0.0, + "xpcs": 0.0 + } + } + ] +} \ No newline at end of file diff --git a/tests/test_cli_traceback.py b/tests/test_cli_traceback.py new file mode 100644 index 000000000..04e1d7eb9 --- /dev/null +++ b/tests/test_cli_traceback.py @@ -0,0 +1,62 @@ +"""Test haddock3-traceback client.""" + +import os +import shutil +import pandas as pd +import pytest +from pathlib import Path + +from haddock.clis.cli_traceback import ( + main, + ) + +from . 
import golden_data + + +@pytest.fixture +def rigid_json(): + """Provide example rigidbody io.json file.""" + return Path(golden_data, "io_rigid.json") + +@pytest.fixture +def flexref_json(): + """Provide example flexref io.json file.""" + return Path(golden_data, "io_flexref.json") + +def test_main(rigid_json, flexref_json): + """Test haddock3-traceback client.""" + # build fake run_dir + run_dir = "example_dir" + step_dirs = [os.path.join(run_dir, "1_rigidbody"), os.path.join(run_dir, "4_flexref")] + + if os.path.isdir(run_dir): + shutil.rmtree(run_dir) + os.mkdir(run_dir) + os.mkdir(step_dirs[0]) + os.mkdir(step_dirs[1]) + shutil.copy(rigid_json, os.path.join(step_dirs[0], "io.json")) + shutil.copy(flexref_json, os.path.join(step_dirs[1], "io.json")) + + # run haddock3-traceback + main(run_dir) + + # check traceback folder exists + assert os.path.isdir(os.path.join(run_dir, "traceback")) + + # check traceback files exist + tr_file = os.path.join(run_dir, "traceback", "traceback.tsv") + assert os.path.isfile(tr_file) + + obs_tr = pd.read_csv(tr_file, sep="\t", dtype=str) + exp_tr = [["00_topo1", "00_topo2", "1_rigidbody", "1_rigidbody_rank", "4_flexref", "4_flexref_rank"], + ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_2.pdb", "1", "flexref_1.pdb", "1"], + ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_4.pdb", "2", "flexref_2.pdb", "2"], + ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_1.pdb", "3", "-", "-"], + ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_3.pdb", "4", "-", "-"]] + exp_tr_df = pd.DataFrame(exp_tr[1:], columns=exp_tr[0]) + + assert obs_tr.columns.tolist() == exp_tr_df.columns.tolist() + assert obs_tr.equals(exp_tr_df) + + # clean up + shutil.rmtree(run_dir) From 61773b6b3cbbbe77dc6dc58a8749f7822a8a2d43 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Mon, 26 Jun 2023 11:55:04 +0200 Subject: [PATCH 05/10] fixed lint --- src/haddock/clis/cli_traceback.py | 94 +++++++++++++++++++++---------- tests/test_cli_traceback.py | 22 ++++---- 2 files changed, 75 insertions(+), 41 deletions(-) diff --git a/src/haddock/clis/cli_traceback.py b/src/haddock/clis/cli_traceback.py index 682cf2457..9c538c2a6 100644 --- a/src/haddock/clis/cli_traceback.py +++ b/src/haddock/clis/cli_traceback.py @@ -1,7 +1,8 @@ """ -Traceback CLI. Given an input run directory, haddock3-traceback traces back -each model to the initial input molecules used, providing the rank of each -intermediate structure. +Traces back PDB files from a HADDOCK run directory. + +Given an input run directory, haddock3-traceback traces back each model to the +initial input molecules used, providing the rank of each intermediate model. USAGE:: @@ -12,36 +13,63 @@ import sys from pathlib import Path +import numpy as np +import pandas as pd + from haddock import log from haddock.libs.libontology import ModuleIO from haddock.modules import get_module_steps_folders -import numpy as np -import pandas as pd + TRACK_FOLDER = "traceback" # name of the traceback folder +ANA_MODULES = ["caprieval", + "seletop", + "topoaa", + "rmsdmatrix", + "clustrmsd", + "clustfcc"] -def traceback_dataframe(data_dict: dict, rank_dict: dict, sel_step: list, max_topo_len: int): - """ - Creates traceback dataframe by combining together ranks and data. + +def traceback_dataframe(data_dict: dict, + rank_dict: dict, + sel_step: list, + max_topo_len: int): """ + Create traceback dataframe by combining together ranks and data. 
+ Parameters + ---------- + data_dict : dict + Dictionary containing the data to be traced back. + rank_dict : dict + Dictionary containing the ranks of the data to be traced back. + sel_step : list + List of selected steps. + max_topo_len : int + Maximum length of the topologies. + + Returns + ------- + df_ord : pandas.DataFrame + Dataframe containing the traceback data. + """ # get last step of the workflow last_step = sel_step[-1] # data dict to dataframe df_data = pd.DataFrame.from_dict(data_dict, orient="index") df_data.reset_index(inplace=True) - # assign columns + # assign columns data_cols = [el for el in reversed(sel_step)] data_cols.extend([f"00_topo{i+1}" for i in range(max_topo_len)]) df_data.columns = data_cols - # same for the rank_dict + # same for the rank_dict df_ranks = pd.DataFrame.from_dict(rank_dict, orient="index") df_ranks.reset_index(inplace=True) - ranks_col = [last_step] # the key to merge the dataframes + ranks_col = [last_step] # the key to merge the dataframes ranks_col.extend([f"{el}_rank" for el in reversed(sel_step)]) - df_ranks.columns=ranks_col + df_ranks.columns = ranks_col # merging the data and ranks dataframes df_merged = pd.merge(df_data, df_ranks, on=last_step) @@ -52,6 +80,7 @@ def traceback_dataframe(data_dict: dict, rank_dict: dict, sel_step: list, max_to df_ord.loc[unk_records, last_step] = "-" return df_ord + # Command line interface parser ap = argparse.ArgumentParser( prog="haddock3-traceback", @@ -111,22 +140,21 @@ def main(run_dir): # get the module folders from the run_dir input all_steps = get_module_steps_folders(Path(run_dir)) log.info(f"All_steps: {', '.join(all_steps)}") - analysis_modules = ["caprieval", "seletop", "topoaa", "rmsdmatrix", "clustrmsd", "clustfcc"] - sel_step = [st for st in all_steps if st.split("_")[1] not in analysis_modules] + sel_step = [st for st in all_steps if st.split("_")[1] not in ANA_MODULES] log.info(f"Steps to trace back: {', '.join(sel_step)}") data_dict, rank_dict = {}, {} unk_idx, max_topo_len = 0, 0 - # this cycle goes through the steps in reverse order - for n in range(len(sel_step)-1,-1,-1): - delta = len(sel_step) - n - 1 # how many steps have we gone back? + # this cycle goes through the steps in reverse order + for n in range(len(sel_step) - 1, -1, -1): + delta = len(sel_step) - n - 1 # how many steps have we gone back? log.info(f"Tracing back step {sel_step[n]}") # loading the .json file json_path = Path(run_dir, sel_step[n], "io.json") io = ModuleIO() io.load(json_path) - # list all the values in the data_dict - ls_values = [x for l in data_dict.values() for x in l] + # list all the values in the data_dict + ls_values = [x for val in data_dict.values() for x in val] # getting and sorting the ranks for the current step folder ranks = [] for pdbfile in io.output: @@ -142,35 +170,39 @@ def main(run_dir): # This means that it was discarded for the subsequent steps # We need to add the pdbfile to the data_dict key = f"unk{unk_idx}" - data_dict[key] = ["-" for el in range(delta-1)] + data_dict[key] = ["-" for el in range(delta - 1)] data_dict[key].append(pdbfile.file_name) rank_dict[key] = ["-" for el in range(delta)] unk_idx += 1 else: # we've already seen this pdb before. idx = ls_values.index(pdbfile.file_name) - key = list(data_dict.keys())[idx//delta] - # at which step are we? - if n != 0: # not the first step, ori_name should be defined + key = list(data_dict.keys())[idx // delta] + # at which step are we? 
+ if n != 0: # not the first step, ori_name should be defined ori_names = [pdbfile.ori_name] - else: # first step, we get topology files instead of ori_name + else: # first step, we get topology files instead of ori_name ori_names = [el.file_name for el in pdbfile.topology] if len(pdbfile.topology) > max_topo_len: max_topo_len = len(pdbfile.topology) - # assignment + # assignment for el in ori_names: data_dict[key].append(el) rank_dict[key].append(rank) else: data_dict[pdbfile.file_name] = [pdbfile.ori_name] rank_dict[pdbfile.file_name] = [rank] - #print(f"rank_dict {rank_dict}") - #print(f"data_dict {data_dict}") - # dumping the data into a dataframe - df_output = traceback_dataframe(data_dict, rank_dict, sel_step, max_topo_len) - # dumping the dataframe + # print(f"rank_dict {rank_dict}") + # print(f"data_dict {data_dict}") + # dumping the data into a dataframe + df_output = traceback_dataframe(data_dict, + rank_dict, + sel_step, + max_topo_len) + # dumping the dataframe track_filename = Path(run_dir, TRACK_FOLDER, "traceback.tsv") - log.info(f"Output dataframe {track_filename} created with shape {df_output.shape}") + log.info(f"Output dataframe {track_filename} " + f"created with shape {df_output.shape}") df_output.to_csv(track_filename, sep="\t", index=False) return diff --git a/tests/test_cli_traceback.py b/tests/test_cli_traceback.py index 04e1d7eb9..2a8782332 100644 --- a/tests/test_cli_traceback.py +++ b/tests/test_cli_traceback.py @@ -2,13 +2,12 @@ import os import shutil +from pathlib import Path + import pandas as pd import pytest -from pathlib import Path -from haddock.clis.cli_traceback import ( - main, - ) +from haddock.clis.cli_traceback import main from . import golden_data @@ -18,16 +17,19 @@ def rigid_json(): """Provide example rigidbody io.json file.""" return Path(golden_data, "io_rigid.json") + @pytest.fixture def flexref_json(): """Provide example flexref io.json file.""" return Path(golden_data, "io_flexref.json") + def test_main(rigid_json, flexref_json): """Test haddock3-traceback client.""" # build fake run_dir run_dir = "example_dir" - step_dirs = [os.path.join(run_dir, "1_rigidbody"), os.path.join(run_dir, "4_flexref")] + step_dirs = [os.path.join(run_dir, "1_rigidbody"), + os.path.join(run_dir, "4_flexref")] if os.path.isdir(run_dir): shutil.rmtree(run_dir) @@ -48,11 +50,11 @@ def test_main(rigid_json, flexref_json): assert os.path.isfile(tr_file) obs_tr = pd.read_csv(tr_file, sep="\t", dtype=str) - exp_tr = [["00_topo1", "00_topo2", "1_rigidbody", "1_rigidbody_rank", "4_flexref", "4_flexref_rank"], - ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_2.pdb", "1", "flexref_1.pdb", "1"], - ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_4.pdb", "2", "flexref_2.pdb", "2"], - ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_1.pdb", "3", "-", "-"], - ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_3.pdb", "4", "-", "-"]] + exp_tr = [["00_topo1", "00_topo2", "1_rigidbody", "1_rigidbody_rank", "4_flexref", "4_flexref_rank"], # noqa: E501 + ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_2.pdb", "1", "flexref_1.pdb", "1"], # noqa: E501 + ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_4.pdb", "2", "flexref_2.pdb", "2"], # noqa: E501 + ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_1.pdb", "3", "-", "-"], # noqa: E501 + ["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_3.pdb", "4", "-", "-"]] # noqa: E501 exp_tr_df = pd.DataFrame(exp_tr[1:], 
columns=exp_tr[0]) assert obs_tr.columns.tolist() == exp_tr_df.columns.tolist() From 514e41a5810a496dfe37721ad45821cd548343b1 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Mon, 26 Jun 2023 14:36:06 +0200 Subject: [PATCH 06/10] handled scoring and refining workflows --- src/haddock/clis/cli_traceback.py | 54 ++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/src/haddock/clis/cli_traceback.py b/src/haddock/clis/cli_traceback.py index 9c538c2a6..37d9c781f 100644 --- a/src/haddock/clis/cli_traceback.py +++ b/src/haddock/clis/cli_traceback.py @@ -17,7 +17,7 @@ import pandas as pd from haddock import log -from haddock.libs.libontology import ModuleIO +from haddock.libs.libontology import ModuleIO, PDBFile from haddock.modules import get_module_steps_folders @@ -31,6 +31,41 @@ "clustfcc"] +def get_ori_names(n: int, pdbfile: PDBFile, max_topo_len: int): + """ + Get the original name(s) of the PDB file. + + Parameters + ---------- + n : int + Step number. + pdbfile : PDBFile + PDBFile object. + max_topo_len : int + Maximum length of the topologies found so far. + + Returns + ------- + ori_names : list + List of original names. + max_topo_len : int + Maximum length of the topologies found so far. + """ + if n != 0: # not the first step, ori_name should be defined + ori_names = [pdbfile.ori_name] + else: # first step, we get topology files instead of ori_name + # topology can either be a list of topologies or a single + # topology + if isinstance(pdbfile.topology, list): + ori_names = [el.file_name for el in pdbfile.topology] + if len(pdbfile.topology) > max_topo_len: + max_topo_len = len(pdbfile.topology) + else: + ori_names = [pdbfile.topology.file_name] + max_topo_len = 1 + return ori_names, max_topo_len + + def traceback_dataframe(data_dict: dict, rank_dict: dict, sel_step: list, @@ -164,6 +199,8 @@ def main(run_dir): # iterating through the pdbfiles to fill data_dict and rank_dict for i, pdbfile in enumerate(io.output): rank = np.where(ranks_argsort == i)[0][0] + 1 + # getting the original names + ori_names, max_topo_len = get_ori_names(n, pdbfile, max_topo_len) if n != len(sel_step) - 1: if pdbfile.file_name not in ls_values: # this is the first step in which the pdbfile appears. @@ -178,22 +215,17 @@ def main(run_dir): # we've already seen this pdb before. idx = ls_values.index(pdbfile.file_name) key = list(data_dict.keys())[idx // delta] - # at which step are we? 
- if n != 0: # not the first step, ori_name should be defined - ori_names = [pdbfile.ori_name] - else: # first step, we get topology files instead of ori_name - ori_names = [el.file_name for el in pdbfile.topology] - if len(pdbfile.topology) > max_topo_len: - max_topo_len = len(pdbfile.topology) + # assignment for el in ori_names: data_dict[key].append(el) rank_dict[key].append(rank) - else: - data_dict[pdbfile.file_name] = [pdbfile.ori_name] + else: # last step of the workflow + data_dict[pdbfile.file_name] = [oname for oname in ori_names] rank_dict[pdbfile.file_name] = [rank] + # print(f"rank_dict {rank_dict}") - # print(f"data_dict {data_dict}") + # print(f"data_dict {data_dict}, maxtopo {max_topo_len}") # dumping the data into a dataframe df_output = traceback_dataframe(data_dict, rank_dict, From 65fd11adcb1963c8a619383b1d711d23c220e6a9 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Mon, 26 Jun 2023 16:20:27 +0200 Subject: [PATCH 07/10] minor improvements --- src/haddock/clis/cli_traceback.py | 4 +--- tests/test_cli_traceback.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/haddock/clis/cli_traceback.py b/src/haddock/clis/cli_traceback.py index 37d9c781f..44e890bf0 100644 --- a/src/haddock/clis/cli_traceback.py +++ b/src/haddock/clis/cli_traceback.py @@ -191,9 +191,7 @@ def main(run_dir): # list all the values in the data_dict ls_values = [x for val in data_dict.values() for x in val] # getting and sorting the ranks for the current step folder - ranks = [] - for pdbfile in io.output: - ranks.append(pdbfile.score) + ranks = [pdbfile.score for pdbfile in io.output] ranks_argsort = np.argsort(ranks) # iterating through the pdbfiles to fill data_dict and rank_dict diff --git a/tests/test_cli_traceback.py b/tests/test_cli_traceback.py index 2a8782332..2908f865c 100644 --- a/tests/test_cli_traceback.py +++ b/tests/test_cli_traceback.py @@ -33,9 +33,9 @@ def test_main(rigid_json, flexref_json): if os.path.isdir(run_dir): shutil.rmtree(run_dir) - os.mkdir(run_dir) - os.mkdir(step_dirs[0]) - os.mkdir(step_dirs[1]) + # Loop over directories to be created + for d in [run_dir, *step_dirs]: + os.mkdir(d) shutil.copy(rigid_json, os.path.join(step_dirs[0], "io.json")) shutil.copy(flexref_json, os.path.join(step_dirs[1], "io.json")) From ef52da9e9307a704319cac33eac7f47cc0be5738 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Tue, 27 Jun 2023 16:11:28 +0200 Subject: [PATCH 08/10] handled AttributeError in refinement --- src/haddock/modules/refinement/emref/__init__.py | 5 ++++- src/haddock/modules/refinement/flexref/__init__.py | 5 ++++- src/haddock/modules/refinement/mdref/__init__.py | 4 ++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/haddock/modules/refinement/emref/__init__.py b/src/haddock/modules/refinement/emref/__init__.py index 71066f9ad..36b4f0e4a 100644 --- a/src/haddock/modules/refinement/emref/__init__.py +++ b/src/haddock/modules/refinement/emref/__init__.py @@ -85,7 +85,10 @@ def _run(self): model, idx, ".", "emref" ) expected_pdb.restr_fname = ambig_fname - expected_pdb.ori_name = model.file_name + try: + expected_pdb.ori_name = model.file_name + except AttributeError: + expected_pdb.ori_name = None self.output_models.append(expected_pdb) job = CNSJob(inp_file, out_file, envvars=self.envvars) diff --git a/src/haddock/modules/refinement/flexref/__init__.py b/src/haddock/modules/refinement/flexref/__init__.py index de1ce76b7..2a4627fd8 100644 --- a/src/haddock/modules/refinement/flexref/__init__.py +++ 
b/src/haddock/modules/refinement/flexref/__init__.py @@ -87,7 +87,10 @@ def _run(self): model, idx, ".", "flexref" ) expected_pdb.restr_fname = ambig_fname - expected_pdb.ori_name = model.file_name + try: + expected_pdb.ori_name = model.file_name + except AttributeError: + expected_pdb.ori_name = None self.output_models.append(expected_pdb) job = CNSJob(inp_file, out_file, envvars=self.envvars) diff --git a/src/haddock/modules/refinement/mdref/__init__.py b/src/haddock/modules/refinement/mdref/__init__.py index fa8473124..9bb07bc2c 100644 --- a/src/haddock/modules/refinement/mdref/__init__.py +++ b/src/haddock/modules/refinement/mdref/__init__.py @@ -85,6 +85,10 @@ def _run(self): model, idx, ".", "mdref" ) expected_pdb.restr_fname = ambig_fname + try: + expected_pdb.ori_name = model.file_name + except AttributeError: + expected_pdb.ori_name = None self.output_models.append(expected_pdb) job = CNSJob(inp_file, out_file, envvars=self.envvars) From 17e5d2e1a83594fd0c53bdfec9eedd52bfa49328 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Thu, 29 Jun 2023 09:38:13 +0200 Subject: [PATCH 09/10] used rel paths and addressed comments --- src/haddock/clis/cli_traceback.py | 44 ++++++++++++++++++------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/haddock/clis/cli_traceback.py b/src/haddock/clis/cli_traceback.py index 44e890bf0..c6038450e 100644 --- a/src/haddock/clis/cli_traceback.py +++ b/src/haddock/clis/cli_traceback.py @@ -17,6 +17,7 @@ import pandas as pd from haddock import log +from haddock.libs import libcli from haddock.libs.libontology import ModuleIO, PDBFile from haddock.modules import get_module_steps_folders @@ -123,16 +124,7 @@ def traceback_dataframe(data_dict: dict, formatter_class=argparse.RawDescriptionHelpFormatter, ) -ap.add_argument( - "-r", - "--run-dir", - help="The input run directory.", - required=True, - ) - - -def _ap(): - return ap +libcli.add_rundir_arg(ap) def load_args(ap): @@ -182,8 +174,14 @@ def main(run_dir): unk_idx, max_topo_len = 0, 0 # this cycle goes through the steps in reverse order for n in range(len(sel_step) - 1, -1, -1): - delta = len(sel_step) - n - 1 # how many steps have we gone back? log.info(f"Tracing back step {sel_step[n]}") + # correcting names in the dictionary. The ori_name must be complemented + # with the step folder name + for key in data_dict.keys(): + if data_dict[key][-1] != "-": + data_dict[key][-1] = f"../{sel_step[n]}/{data_dict[key][-1]}" + + delta = len(sel_step) - n - 1 # how many steps have we gone back? # loading the .json file json_path = Path(run_dir, sel_step[n], "io.json") io = ModuleIO() @@ -200,18 +198,18 @@ def main(run_dir): # getting the original names ori_names, max_topo_len = get_ori_names(n, pdbfile, max_topo_len) if n != len(sel_step) - 1: - if pdbfile.file_name not in ls_values: + if str(pdbfile.rel_path) not in ls_values: # this is the first step in which the pdbfile appears. # This means that it was discarded for the subsequent steps # We need to add the pdbfile to the data_dict key = f"unk{unk_idx}" data_dict[key] = ["-" for el in range(delta - 1)] - data_dict[key].append(pdbfile.file_name) + data_dict[key].append(str(pdbfile.rel_path)) rank_dict[key] = ["-" for el in range(delta)] unk_idx += 1 else: # we've already seen this pdb before. 
- idx = ls_values.index(pdbfile.file_name) + idx = ls_values.index(str(pdbfile.rel_path)) key = list(data_dict.keys())[idx // delta] # assignment @@ -219,14 +217,24 @@ def main(run_dir): data_dict[key].append(el) rank_dict[key].append(rank) else: # last step of the workflow - data_dict[pdbfile.file_name] = [oname for oname in ori_names] - rank_dict[pdbfile.file_name] = [rank] + data_dict[str(pdbfile.rel_path)] = [on for on in ori_names] + rank_dict[str(pdbfile.rel_path)] = [rank] # print(f"rank_dict {rank_dict}") # print(f"data_dict {data_dict}, maxtopo {max_topo_len}") + + # stripping away relative paths + final_data_dict = {} + for key in data_dict.keys(): + new_key = key.split("/")[-1] + final_data_dict[new_key] = [el.split("/")[-1] for el in data_dict[key]] + final_rank_dict = {} + for key in rank_dict.keys(): + new_key = key.split("/")[-1] + final_rank_dict[new_key] = rank_dict[key] # dumping the data into a dataframe - df_output = traceback_dataframe(data_dict, - rank_dict, + df_output = traceback_dataframe(final_data_dict, + final_rank_dict, sel_step, max_topo_len) # dumping the dataframe From aec97bb9d9d2c0bea7f2fac7f7fd0b24815e99a4 Mon Sep 17 00:00:00 2001 From: mgiulini Date: Mon, 7 Aug 2023 11:51:41 +0200 Subject: [PATCH 10/10] added traceback to postprocess option --- src/haddock/libs/libworkflow.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/haddock/libs/libworkflow.py b/src/haddock/libs/libworkflow.py index 67a1471e6..77de8ad23 100644 --- a/src/haddock/libs/libworkflow.py +++ b/src/haddock/libs/libworkflow.py @@ -6,6 +6,7 @@ from haddock import log from haddock.clis.cli_analyse import main as cli_analyse +from haddock.clis.cli_traceback import main as cli_traceback from haddock.core.exceptions import HaddockError, HaddockTermination, StepError from haddock.gear.clean_steps import clean_output from haddock.gear.config import get_module_name @@ -62,6 +63,8 @@ def postprocess(self): capri_steps.append(step.order) # call cli_analyse (no need for capri_dicts, it's all precalculated) cli_analyse("./", capri_steps, top_cluster=10, format=None, scale=None) + # call cli_traceback + cli_traceback("./") class Workflow: