Skip to content

Commit

Permalink
fixed lint
Browse files Browse the repository at this point in the history
  • Loading branch information
mgiulini committed Jun 26, 2023
1 parent 3c5b3cf commit 61773b6
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 41 deletions.
94 changes: 63 additions & 31 deletions src/haddock/clis/cli_traceback.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""
Traceback CLI. Given an input run directory, haddock3-traceback traces back
each model to the initial input molecules used, providing the rank of each
intermediate structure.
Traces back PDB files from a HADDOCK run directory.
Given an input run directory, haddock3-traceback traces back each model to the
initial input molecules used, providing the rank of each intermediate model.
USAGE::
Expand All @@ -12,36 +13,63 @@
import sys
from pathlib import Path

import numpy as np
import pandas as pd

from haddock import log
from haddock.libs.libontology import ModuleIO
from haddock.modules import get_module_steps_folders
import numpy as np
import pandas as pd


TRACK_FOLDER = "traceback"  # name of the output folder holding traceback.tsv

# Module names filtered out when selecting the workflow steps to trace back:
# in main(), any step whose module part (after the "N_" prefix) appears in
# this list is skipped. Presumably these are analysis/clustering modules that
# do not produce new docking models -- TODO confirm against the module docs.
ANA_MODULES = ["caprieval",
               "seletop",
               "topoaa",
               "rmsdmatrix",
               "clustrmsd",
               "clustfcc"]

def traceback_dataframe(data_dict: dict, rank_dict: dict, sel_step: list, max_topo_len: int):
"""
Creates traceback dataframe by combining together ranks and data.

def traceback_dataframe(data_dict: dict,
rank_dict: dict,
sel_step: list,
max_topo_len: int):
"""
Create traceback dataframe by combining together ranks and data.
Parameters
----------
data_dict : dict
Dictionary containing the data to be traced back.
rank_dict : dict
Dictionary containing the ranks of the data to be traced back.
sel_step : list
List of selected steps.
max_topo_len : int
Maximum length of the topologies.
Returns
-------
df_ord : pandas.DataFrame
Dataframe containing the traceback data.
"""
# get last step of the workflow
last_step = sel_step[-1]
# data dict to dataframe
df_data = pd.DataFrame.from_dict(data_dict, orient="index")
df_data.reset_index(inplace=True)
# assign columns
# assign columns
data_cols = [el for el in reversed(sel_step)]
data_cols.extend([f"00_topo{i+1}" for i in range(max_topo_len)])
df_data.columns = data_cols

# same for the rank_dict
# same for the rank_dict
df_ranks = pd.DataFrame.from_dict(rank_dict, orient="index")
df_ranks.reset_index(inplace=True)
ranks_col = [last_step] # the key to merge the dataframes
ranks_col = [last_step] # the key to merge the dataframes
ranks_col.extend([f"{el}_rank" for el in reversed(sel_step)])
df_ranks.columns=ranks_col
df_ranks.columns = ranks_col

# merging the data and ranks dataframes
df_merged = pd.merge(df_data, df_ranks, on=last_step)
Expand All @@ -52,6 +80,7 @@ def traceback_dataframe(data_dict: dict, rank_dict: dict, sel_step: list, max_to
df_ord.loc[unk_records, last_step] = "-"
return df_ord


# Command line interface parser
ap = argparse.ArgumentParser(
prog="haddock3-traceback",
Expand Down Expand Up @@ -111,22 +140,21 @@ def main(run_dir):
# get the module folders from the run_dir input
all_steps = get_module_steps_folders(Path(run_dir))
log.info(f"All_steps: {', '.join(all_steps)}")
analysis_modules = ["caprieval", "seletop", "topoaa", "rmsdmatrix", "clustrmsd", "clustfcc"]
sel_step = [st for st in all_steps if st.split("_")[1] not in analysis_modules]
sel_step = [st for st in all_steps if st.split("_")[1] not in ANA_MODULES]
log.info(f"Steps to trace back: {', '.join(sel_step)}")

data_dict, rank_dict = {}, {}
unk_idx, max_topo_len = 0, 0
# this cycle goes through the steps in reverse order
for n in range(len(sel_step)-1,-1,-1):
delta = len(sel_step) - n - 1 # how many steps have we gone back?
# this cycle goes through the steps in reverse order
for n in range(len(sel_step) - 1, -1, -1):
delta = len(sel_step) - n - 1 # how many steps have we gone back?
log.info(f"Tracing back step {sel_step[n]}")
# loading the .json file
json_path = Path(run_dir, sel_step[n], "io.json")
io = ModuleIO()
io.load(json_path)
# list all the values in the data_dict
ls_values = [x for l in data_dict.values() for x in l]
# list all the values in the data_dict
ls_values = [x for val in data_dict.values() for x in val]
# getting and sorting the ranks for the current step folder
ranks = []
for pdbfile in io.output:
Expand All @@ -142,35 +170,39 @@ def main(run_dir):
# This means that it was discarded for the subsequent steps
# We need to add the pdbfile to the data_dict
key = f"unk{unk_idx}"
data_dict[key] = ["-" for el in range(delta-1)]
data_dict[key] = ["-" for el in range(delta - 1)]
data_dict[key].append(pdbfile.file_name)
rank_dict[key] = ["-" for el in range(delta)]
unk_idx += 1
else:
# we've already seen this pdb before.
idx = ls_values.index(pdbfile.file_name)
key = list(data_dict.keys())[idx//delta]
# at which step are we?
if n != 0: # not the first step, ori_name should be defined
key = list(data_dict.keys())[idx // delta]
# at which step are we?
if n != 0: # not the first step, ori_name should be defined
ori_names = [pdbfile.ori_name]
else: # first step, we get topology files instead of ori_name
else: # first step, we get topology files instead of ori_name
ori_names = [el.file_name for el in pdbfile.topology]
if len(pdbfile.topology) > max_topo_len:
max_topo_len = len(pdbfile.topology)
# assignment
# assignment
for el in ori_names:
data_dict[key].append(el)
rank_dict[key].append(rank)
else:
data_dict[pdbfile.file_name] = [pdbfile.ori_name]
rank_dict[pdbfile.file_name] = [rank]
#print(f"rank_dict {rank_dict}")
#print(f"data_dict {data_dict}")
# dumping the data into a dataframe
df_output = traceback_dataframe(data_dict, rank_dict, sel_step, max_topo_len)
# dumping the dataframe
# print(f"rank_dict {rank_dict}")
# print(f"data_dict {data_dict}")
# dumping the data into a dataframe
df_output = traceback_dataframe(data_dict,
rank_dict,
sel_step,
max_topo_len)
# dumping the dataframe
track_filename = Path(run_dir, TRACK_FOLDER, "traceback.tsv")
log.info(f"Output dataframe {track_filename} created with shape {df_output.shape}")
log.info(f"Output dataframe {track_filename} "
f"created with shape {df_output.shape}")
df_output.to_csv(track_filename, sep="\t", index=False)
return

Expand Down
22 changes: 12 additions & 10 deletions tests/test_cli_traceback.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@

import os
import shutil
from pathlib import Path

import pandas as pd
import pytest
from pathlib import Path

from haddock.clis.cli_traceback import (
main,
)
from haddock.clis.cli_traceback import main

from . import golden_data

Expand All @@ -18,16 +17,19 @@ def rigid_json():
"""Provide example rigidbody io.json file."""
return Path(golden_data, "io_rigid.json")


@pytest.fixture
def flexref_json():
    """Path to the example flexref io.json file in the golden data."""
    return Path(golden_data) / "io_flexref.json"


def test_main(rigid_json, flexref_json):
"""Test haddock3-traceback client."""
# build fake run_dir
run_dir = "example_dir"
step_dirs = [os.path.join(run_dir, "1_rigidbody"), os.path.join(run_dir, "4_flexref")]
step_dirs = [os.path.join(run_dir, "1_rigidbody"),
os.path.join(run_dir, "4_flexref")]

if os.path.isdir(run_dir):
shutil.rmtree(run_dir)
Expand All @@ -48,11 +50,11 @@ def test_main(rigid_json, flexref_json):
assert os.path.isfile(tr_file)

obs_tr = pd.read_csv(tr_file, sep="\t", dtype=str)
exp_tr = [["00_topo1", "00_topo2", "1_rigidbody", "1_rigidbody_rank", "4_flexref", "4_flexref_rank"],
["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_2.pdb", "1", "flexref_1.pdb", "1"],
["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_4.pdb", "2", "flexref_2.pdb", "2"],
["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_1.pdb", "3", "-", "-"],
["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_3.pdb", "4", "-", "-"]]
exp_tr = [["00_topo1", "00_topo2", "1_rigidbody", "1_rigidbody_rank", "4_flexref", "4_flexref_rank"], # noqa: E501
["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_2.pdb", "1", "flexref_1.pdb", "1"], # noqa: E501
["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_4.pdb", "2", "flexref_2.pdb", "2"], # noqa: E501
["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_1.pdb", "3", "-", "-"], # noqa: E501
["4G6K_fv_haddock.psf", "4I1B-matched_haddock.psf", "rigidbody_3.pdb", "4", "-", "-"]] # noqa: E501
exp_tr_df = pd.DataFrame(exp_tr[1:], columns=exp_tr[0])

assert obs_tr.columns.tolist() == exp_tr_df.columns.tolist()
Expand Down

0 comments on commit 61773b6

Please sign in to comment.