-
Notifications
You must be signed in to change notification settings - Fork 0
/
recluster.py
146 lines (120 loc) · 5.42 KB
/
recluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from pathlib import Path
from typing import List, Dict, Any
import logging
import json
import shutil
import pandas as pd
from lib import flip, pdb, clustering, align
import config
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def process_pca_clustering(pca_file: Path, n_dim: int, n_clus: int) -> tuple[pd.DataFrame, List[List[Any]]]:
"""Process PCA and clustering for molecule."""
pca_df = pd.read_csv(pca_file)
selected_columns = [str(i) for i in range(1, n_dim + 1)]
clustering_labels, _ = clustering.best_clustering(n_clus, pca_df[selected_columns])
pca_df.insert(1, "cluster", clustering_labels, False)
pca_df["cluster"] = pca_df["cluster"].astype(str)
popp = clustering.kde_c(n_clus, pca_df, selected_columns)
return pca_df, popp
def save_cluster_info(output_path: Path, data: Dict[str, Any]) -> None:
"""Save cluster information to JSON."""
with open(output_path, "w") as json_file:
json.dump(data, json_file, indent=4)
def process_torsions(input_file: Path, clustering_labels: List[int], output_file: Path) -> None:
"""Process and save torsion data."""
df = pd.read_csv(input_file)
df.insert(1, "cluster", clustering_labels, False)
cols_to_drop = [col for col in df.columns if "internal" in col]
df = df.drop(columns=cols_to_drop)
df.to_csv(output_file, index=False)
def align_cluster_files(temp_dir: Path, final_dir: Path) -> None:
"""Align PDB files and save to final directory."""
filenames = [f for f in temp_dir.glob("*.pdb")]
sorted_files = sorted(filenames, key=lambda x: float(x.stem.split("_")[1]), reverse=True)
if not sorted_files:
raise ValueError("No PDB files found for alignment")
reference_file = sorted_files[0]
align.align_pdb_files(str(reference_file), [str(f) for f in sorted_files],
[str(final_dir / f.name) for f in sorted_files])
def process_molecule(directory: Path) -> None:
"""Process a single molecule directory."""
clusters_dir = directory / "clusters"
if clusters_dir.exists():
logger.info(f"Skipping existing directory: {directory.name}")
return
logger.info(f"Processing directory: {directory.name}")
try:
# Create directory structure
for subdir in ["pack", "alpha", "beta"]:
(clusters_dir / subdir).mkdir(parents=True, exist_ok=True)
# Read configuration
with open(directory / "output/info.txt") as f:
config_vars = {}
for line in f:
exec(line, None, config_vars)
# Copy files
for file in ["torparts.npz", "PCA_variance.png", "Silhouette_Score.png"]:
shutil.copy(directory / f"output/{file}", clusters_dir / f"pack/{file}")
# Determine molecule type and set paths
is_alpha = flip.is_alpha(directory.name)
temp_dir = clusters_dir / ("alpha_temp" if is_alpha else "beta_temp")
final_dir = clusters_dir / ("alpha" if is_alpha else "beta")
flip_dir = clusters_dir / ("beta" if is_alpha else "alpha")
temp_dir.mkdir(exist_ok=True)
# Process PCA and clustering
pca_df, popp = process_pca_clustering(
directory / "output/pca.csv",
config_vars["n_dim"],
config_vars["n_clus"]
)
# Export cluster representatives
pdb.exportframeidPDB(str(directory / f"{directory.name}.pdb"), popp, str(temp_dir)+"/")
# Save cluster info
sorted_popp = sorted(popp, key=lambda x: int(x[1]))
data = {
"n_clus": config_vars["n_clus"],
"n_dim": config_vars["n_dim"],
"popp": [int(item[0]) for item in sorted_popp],
"s_scores": [float(s) for s in config_vars["s_scores"]],
"flexibility": int(config_vars["flexibility"])
}
save_cluster_info(clusters_dir / "pack/info.json", data)
# Process torsions
process_torsions(
directory / "output/torsions.csv",
pca_df["cluster"],
clusters_dir / "pack/torsions.csv"
)
# Save simplified PCA data
pca_df[["0", "1", "2", "i", "cluster"]].to_csv(
clusters_dir / "pack/pca.csv", index=False
)
# Align cluster files
align_cluster_files(temp_dir, final_dir)
shutil.rmtree(temp_dir)
# Flip structures
try:
for pdb_file in final_dir.glob("*.pdb"):
flip.flip_alpha_beta(str(pdb_file), str(flip_dir / pdb_file.name))
logger.info("Structure flipping completed successfully")
except Exception as e:
logger.error(f"Failed to flip structures: {e}")
# try:
# for pdb_file in final_dir.glob("*.pdb"):
# flip.flip_c2_connected_atoms(str(pdb_file), str(flip_dir / pdb_file.name))
# logger.info("Structure flipping completed successfully using c2")
# except Exception as e:
# logger.error(f"Failed to flip structures with c2: {e}")
except Exception as e:
logger.error(f"Failed to process {directory.name}: {e}")
pass
def main() -> None:
"""Main execution function."""
data_dir = Path(config.data_dir)
logger.info(f"Processing directories in: {data_dir}")
for directory in data_dir.iterdir():
if directory.is_dir():
process_molecule(directory)
if __name__ == "__main__":
main()