Skip to content
This repository was archived by the owner on Mar 21, 2025. It is now read-only.

Commit 8abe152

Browse files
authored
Maintain (#15)
* update * test mod compatible * add graph visualizer * fix lint * add copyright for FGUtils * prepare release * add partial map expansion * add test case for partial expansion ver1 * update new features * prepare release * prepare release * fix dependency * fix bug * enhance graph descriptors * fix bug and update new features * add new expanding method * fix strip context, add graph clustering with batch * enhance multistep with aam
1 parent a8a6d67 commit 8abe152

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+2588
-391
lines changed

.coverage

0 Bytes
Binary file not shown.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ test_mod.py
88
test_format.py
99
*dev_zone
1010
test_format.py
11+
ITS_graphs.pkl.gz
File renamed without changes.

Data/Testcase/graph.pkl.gz

1.13 MB
Binary file not shown.

Data/Testcase/mech.json.gz

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{"R-id": "Mech-1", "reaction": "[CH3:1][CH:2]=[O:3].[CH:4]([H:7])([H:8])[CH:5]=[O:6]>>[CH3:1][CH:2]=[CH:4][CH:5]=[O:6].[O:3]([H:7])([H:8])", "mechanisms": [{"mech_name": "Aldol reaction (base cat)", "steps": [{"description": "Base abstracts H from substrate", "smart_string": "[CH:4]([H:7])([H:8])[CH:5]=[O:6].[*-:9]>>[CH-:4]([H:8])[CH:5]=[O:6].[*:9][H:7]", "step": 1, "step_gml": "rule [\n ruleID \"2\"\n left [\n ]\n context [\n ]\n right [\n ]\n]", "step_dfs": null, "step_smart": ">>", "step_desc": "Base abstracts H from substrate"}, {"description": "Nucleophilic addition fro substrate", "smart_string": "[CH3:1][CH:2]=[O:3].[CH-:4]([H:8])[CH:5]=[O:6]>>[CH3:1][CH:2]([O-:3])[CH:4]([H:8])[CH:5]=[O:6]", "step": 2, "step_gml": "rule [\n ruleID \"2\"\n left [\n edge [ source 1 target 2 label \"=\" ]\n node [ id 2 label \"O\" ]\n node [ id 3 label \"C-\" ]\n ]\n context [\n node [ id 1 label \"C\" ]\n ]\n right [\n edge [ source 1 target 2 label \"-\" ]\n edge [ source 1 target 3 label \"-\" ]\n node [ id 2 label \"O-\" ]\n node [ id 3 label \"C\" ]\n ]\n]", "step_dfs": null, "step_smart": "[CH2:1]=[O:2].[CH3-:3]>>[CH2:1]([O-:2])[CH3:3]", "step_desc": "Nucleophilic addition fro substrate"}, {"description": "Neutralize substrate", "smart_string": "[CH3:1][CH:2]([O-:3])[CH:4]([H:8])[CH:5]=[O:6].[*:9][H:7]>>[CH3:1][CH:2]([O:3][H:7])[CH:4]([H:8])[CH:5]=[O:6].[*-:9]", "step": 3, "step_gml": "rule [\n ruleID \"2\"\n left [\n ]\n context [\n ]\n right [\n ]\n]", "step_dfs": null, "step_smart": ">>", "step_desc": "Neutralize substrate"}, {"description": "Base abstracts H from substrate", "smart_string": "[CH3:1][CH:2]([O:3][H:7])[CH:4]([H:8])[CH:5]=[O:6].[*-:9]>>[CH3:1][CH:2]([O:3][H:7])[CH-:4][CH:5]=[O:6].[*:9][H:8]", "step": 4, "step_gml": "rule [\n ruleID \"2\"\n left [\n ]\n context [\n ]\n right [\n ]\n]", "step_dfs": null, "step_smart": ">>", "step_desc": "Base abstracts H from substrate"}, {"description": "Elimination Unimolecular Conjugate Base", "smart_string": 
"[CH3:1][CH:2]([O:3][H:7])[CH-:4][CH:5]=[O:6]>>[CH3:1][CH:2]=[CH:4][CH:5]=[O:6].[O-:3][H:7]", "step": 5, "step_gml": "rule [\n ruleID \"2\"\n left [\n edge [ source 1 target 2 label \"-\" ]\n edge [ source 1 target 3 label \"-\" ]\n node [ id 2 label \"O\" ]\n node [ id 3 label \"C-\" ]\n ]\n context [\n node [ id 1 label \"C\" ]\n ]\n right [\n edge [ source 1 target 3 label \"=\" ]\n node [ id 2 label \"O-\" ]\n node [ id 3 label \"C\" ]\n ]\n]", "step_dfs": null, "step_smart": "[CH2:1]([OH:2])[CH2-:3]>>[CH2:1]=[CH2:3].[OH-:2]", "step_desc": "Elimination Unimolecular Conjugate Base"}, {"description": "Neutralize Hydroxide", "smart_string": "[O-:3][H:7].[*:9][H:8]>>[O:3]([H:7])([H:8]).[*-:9]", "step": 6, "step_gml": "rule [\n ruleID \"2\"\n left [\n ]\n context [\n ]\n right [\n ]\n]", "step_dfs": null, "step_smart": ">>", "step_desc": "Neutralize Hydroxide"}], "cat": "[*-]"}, {"mech_name": "Aldol reaction (neutral cat)", "steps": [{"description": "Tautomerization of substrate", "smart_string": "[CH:4]([H:7])([H:8])[CH:5]=[O:6]>>[CH:4]([H:8])=[CH:5][O:6]([H:7])", "step": 1, "step_gml": "rule [\n ruleID \"2\"\n left [\n edge [ source 1 target 2 label \"-\" ]\n edge [ source 2 target 3 label \"=\" ]\n ]\n context [\n node [ id 1 label \"C\" ]\n node [ id 2 label \"C\" ]\n node [ id 3 label \"O\" ]\n ]\n right [\n edge [ source 1 target 2 label \"=\" ]\n edge [ source 2 target 3 label \"-\" ]\n ]\n]", "step_dfs": null, "step_smart": "[CH3:1][CH:2]=[O:3]>>[CH2:1]=[CH:2][OH:3]", "step_desc": "Tautomerization of substrate"}, {"description": "Nucleophilic addition from substrate", "smart_string": "[CH3:1][CH:2]=[O:3].[CH:4]([H:8])=[CH:5][O:6]([H:7])>>[CH3:1][CH:2]([O:3][H:7])[CH:4]([H:8])[CH:5]=[O:6]", "step": 2, "step_gml": "rule [\n ruleID \"2\"\n left [\n edge [ source 1 target 2 label \"=\" ]\n edge [ source 3 target 4 label \"=\" ]\n edge [ source 4 target 5 label \"-\" ]\n ]\n context [\n node [ id 1 label \"C\" ]\n node [ id 2 label \"O\" ]\n node [ id 3 label 
\"C\" ]\n node [ id 4 label \"C\" ]\n node [ id 5 label \"O\" ]\n ]\n right [\n edge [ source 1 target 2 label \"-\" ]\n edge [ source 1 target 3 label \"-\" ]\n edge [ source 3 target 4 label \"-\" ]\n edge [ source 4 target 5 label \"=\" ]\n ]\n]", "step_dfs": null, "step_smart": "[CH2:1]=[O:2].[CH2:3]=[CH:4][OH:5]>>[CH2:1]([OH:2])[CH2:3][CH:4]=[O:5]", "step_desc": "Nucleophilic addition from substrate"}, {"description": "Tautomerization of substrate", "smart_string": "[CH3:1][CH:2]([O:3][H:7])[CH:4]([H:8])[CH:5]=[O:6]>>[CH3:1][CH:2]([O:3][H:7])[CH:4]=[CH:5][O:6]([H:8])", "step": 3, "step_gml": "rule [\n ruleID \"2\"\n left [\n edge [ source 1 target 2 label \"-\" ]\n edge [ source 2 target 3 label \"=\" ]\n ]\n context [\n node [ id 1 label \"C\" ]\n node [ id 2 label \"C\" ]\n node [ id 3 label \"O\" ]\n ]\n right [\n edge [ source 1 target 2 label \"=\" ]\n edge [ source 2 target 3 label \"-\" ]\n ]\n]", "step_dfs": null, "step_smart": "[CH3:1][CH:2]=[O:3]>>[CH2:1]=[CH:2][OH:3]", "step_desc": "Tautomerization of substrate"}, {"description": "Elimination", "smart_string": "[CH3:1][CH:2]([O:3][H:7])[CH:4]=[CH:5][O:6]([H:8])>>[CH3:1][CH:2]=[CH:4][CH:5]=[O:6].[O:3]([H:7])([H:8])", "step": 4, "step_gml": "rule [\n ruleID \"2\"\n left [\n edge [ source 1 target 2 label \"-\" ]\n edge [ source 1 target 3 label \"-\" ]\n edge [ source 3 target 4 label \"=\" ]\n edge [ source 4 target 5 label \"-\" ]\n ]\n context [\n node [ id 1 label \"C\" ]\n node [ id 2 label \"O\" ]\n node [ id 3 label \"C\" ]\n node [ id 4 label \"C\" ]\n node [ id 5 label \"O\" ]\n ]\n right [\n edge [ source 1 target 3 label \"=\" ]\n edge [ source 3 target 4 label \"-\" ]\n edge [ source 4 target 5 label \"=\" ]\n ]\n]", "step_dfs": null, "step_smart": "[CH2:1]([OH:2])[CH:3]=[CH:4][OH:5]>>[CH2:1]=[CH:3][CH:4]=[O:5].[OH2:2]", "step_desc": "Elimination"}], "cat": ""}, {"mech_name": "Aldol reaction (acid cat)", "steps": [{"description": "Tautomerization of substrate with acid cat", 
"smart_string": "[CH:4]([H:7])([H:8])[CH:5]=[O:6].[H+:9]>>[CH:4]([H:8])=[CH:5][O:6]([H:9]).[H+:7]", "step": 1}, {"description": "Nucleophilic addition from substrate", "smart_string": "[CH3:1][CH:2]=[O:3].[CH:4]([H:8])=[CH:5][O:6]([H:9]).[H+:7]>>[CH3:1][CH:2]([O:3][H:7])[CH:4]([H:8])[CH:5]=[O:6].[H+:9]", "step": 2}, {"description": "Tautomerization of substrate with acid cat", "smart_string": "[CH3:1][CH:2]([O:3][H:7])[CH:4]([H:8])[CH:5]=[O:6].[H+:9]>>[CH3:1][CH:2]([O:3][H:7])[CH:4]=[CH:5][O:6]([H:9]).[H+:8]", "step": 3}, {"description": "Elimination", "smart_string": "[CH3:1][CH:2]([O:3][H:7])[CH:4]=[CH:5][O:6]([H:9]).[H+:8]>>[CH3:1][CH:2]=[CH:4][CH:5]=[O:6].[H+:9].[O:3]([H:7])([H:8])", "step": 4}], "cat": "[H+]"}]}]

Data/smart.json.gz

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Test/SynAAM/test_partial_expand.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ def test_expand(self):
1818
"[CH2:1]=[CH:2][CH3:3].[H:4][H:5]>>[CH2:1]([CH:2]([CH3:3])[H:5])[H:4]"
1919
)
2020
# Perform the expansion
21-
output_rsmi = PartialExpand.expand(input_rsmi)
21+
output_rsmi = PartialExpand.expand_aam_with_transform(input_rsmi)
2222
# Assert the result matches the expected output
2323
self.assertTrue(AAMValidator.smiles_check(output_rsmi, expected_rsmi, "ITS"))
2424

2525
def test_expand_2(self):
2626
input_rsmi = "CC[CH2:3][Cl:1].[NH2:2][H:4]>>CC[CH2:3][NH2:2].[Cl:1][H:4]"
27-
output_rsmi = PartialExpand.expand(input_rsmi)
27+
output_rsmi = PartialExpand.expand_aam_with_transform(input_rsmi)
2828
expected_rsmi = (
2929
"[CH3:1][CH2:2][CH2:3][Cl:4].[NH2:5][H:6]"
3030
+ ">>[CH3:1][CH2:2][CH2:3][NH2:5].[Cl:4][H:6]"

Test/SynGraph/Cluster/__init__.py

Whitespace-only changes.
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import time
2+
import unittest
3+
from synutility.SynIO.data_type import load_from_pickle
4+
from synutility.SynGraph.Descriptor.graph_signature import GraphSignature
5+
from synutility.SynGraph.Cluster.batch_cluster import BatchCluster
6+
7+
8+
class TestBatchCluster(unittest.TestCase):
    """Tests for ``BatchCluster`` run against the pickled graph fixture."""

    # Number of templates every clustering test expects for the fixture.
    EXPECTED_TEMPLATE_COUNT = 30

    @classmethod
    def setUpClass(cls):
        """Load the fixture once and attach RC/ITS signatures to each entry."""
        cls.graphs = load_from_pickle("Data/Testcase/graph.pkl.gz")
        cls.templates = None
        for value in cls.graphs:
            value["rc_sig"] = GraphSignature(value["RC"]).create_graph_signature()
            value["its_sig"] = GraphSignature(value["ITS"]).create_graph_signature()

    def _assert_fit_template_count(self, batch_sizes=(None, 10)):
        """Fit the fixture once per batch size and check the template count.

        Each batch size runs inside its own ``subTest`` so that a failure for
        one size does not prevent the remaining sizes from being reported.
        """
        cluster = BatchCluster()
        expected_template_count = self.EXPECTED_TEMPLATE_COUNT

        for batch_size in batch_sizes:
            with self.subTest(batch_size=batch_size):
                # perf_counter is monotonic, so the interval cannot be skewed
                # by system clock adjustments the way time.time() can.
                start_time = time.perf_counter()
                _, updated_templates = cluster.fit(
                    self.graphs,
                    self.templates,
                    "RC",
                    "rc_sig",
                    batch_size=batch_size,
                )
                elapsed_time = time.perf_counter() - start_time

                self.assertEqual(
                    len(updated_templates),
                    expected_template_count,
                    f"Failed for batch_size={batch_size}: expected "
                    + f"{expected_template_count} templates, "
                    + f"got {len(updated_templates)}",
                )
                print(
                    f"Test for batch_size={batch_size} completed in "
                    f"{elapsed_time:.2f} seconds."
                )

    def test_initialization(self):
        """Test initialization and verify if the attributes are set correctly."""
        cluster = BatchCluster(["element", "charge"], ["*", 0], "bond_order")
        self.assertEqual(cluster.nodeLabelNames, ["element", "charge"])
        self.assertEqual(cluster.nodeLabelDefault, ["*", 0])
        self.assertEqual(cluster.edgeAttribute, "bond_order")

    def test_initialization_failure(self):
        """Test that mismatched label/default lengths raise ``ValueError``."""
        with self.assertRaises(ValueError):
            BatchCluster(["element"], ["*", 0, 1], "bond_order")

    def test_batch_dicts(self):
        """Test the batching function to split data correctly."""
        batch_cluster = BatchCluster(["element", "charge"], ["*", 0], "bond_order")
        input_list = [{"id": i} for i in range(10)]
        batches = batch_cluster.batch_dicts(input_list, 3)
        # 10 items in batches of 3 -> 3 + 3 + 3 + 1.
        self.assertEqual(len(batches), 4)
        self.assertEqual(len(batches[0]), 3)
        self.assertEqual(len(batches[-1]), 1)

    def test_lib_check_functionality(self):
        """Test the lib_check method using directly comparable results."""
        cluster = BatchCluster()
        batch_1 = self.graphs[:50]
        batch_2 = self.graphs[50:]
        # Seed the template library from the first 50 entries, then feed the
        # remaining entries through lib_check one at a time.
        _, templates = cluster.fit(batch_1, None, "RC", "rc_sig")
        for entry in batch_2:
            _, templates = cluster.lib_check(entry, templates, "RC", "rc_sig")
        self.assertEqual(len(templates), self.EXPECTED_TEMPLATE_COUNT)

    def test_cluster_integration(self):
        """Test the cluster method to ensure it processes data entries correctly."""
        cluster = BatchCluster()
        expected_template_count = self.EXPECTED_TEMPLATE_COUNT
        _, updated_templates = cluster.cluster(self.graphs, [], "RC", "rc_sig")

        self.assertEqual(
            len(updated_templates),
            expected_template_count,
            f"Failed: expected {expected_template_count} templates, "
            f"got {len(updated_templates)}",
        )

    def test_fit(self):
        """Test ``fit`` without batching and with a batch size of 10."""
        self._assert_fit_template_count()

    def test_fit_gml(self):
        """Test ``fit`` without batching and with a batch size of 10.

        NOTE(review): this test uses the same "RC"/"rc_sig" keys as
        ``test_fit``; if a GML-specific key was intended, confirm and adjust.
        """
        self._assert_fit_template_count()
105+
106+
107+
# Entry point: run this module's test cases when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
import time
2+
import unittest
3+
from synutility.SynIO.data_type import load_from_pickle
4+
from synutility.SynGraph.Cluster.graph_cluster import GraphCluster
5+
from synutility.SynGraph.Descriptor.graph_descriptors import GraphDescriptor
6+
7+
8+
class TestRCCluster(unittest.TestCase):
    """Tests for ``GraphCluster`` run against the pickled graph fixture."""

    # 30 classes are expected and class indices start at 0, so the max is 29.
    EXPECTED_MAX_CLASS = 29

    @classmethod
    def setUpClass(cls):
        """Load the fixture once, add descriptors, and build the clusterer."""
        graphs = load_from_pickle("Data/Testcase/graph.pkl.gz")
        # Store what get_descriptors returns; rebinding a loop variable would
        # discard it and only work if the call also mutated the entry in place.
        cls.graphs = [GraphDescriptor.get_descriptors(value) for value in graphs]
        cls.clusterer = GraphCluster()

    def _column(self, key):
        """Return the value stored under ``key`` for every fixture entry."""
        return [value[key] for value in self.graphs]

    def _assert_max_class(self, clustered_data):
        """Check every item has a "class" and the largest one is as expected."""
        # Verify presence first: indexing item["class"] before this check
        # would raise KeyError and make the assertion unreachable.
        for item in clustered_data:
            self.assertIn("class", item)
        max_class = max((item["class"] for item in clustered_data), default=-1)
        self.assertEqual(max_class, self.EXPECTED_MAX_CLASS)

    def test_initialization(self):
        """Test the initialization and configuration of the RCCluster."""
        self.assertIsInstance(self.clusterer.nodeLabelNames, list)
        self.assertEqual(self.clusterer.edgeAttribute, "order")
        self.assertEqual(
            len(self.clusterer.nodeLabelNames), len(self.clusterer.nodeLabelDefault)
        )

    def test_auto_cluster(self):
        """Test iterative_cluster with the clusterer's node and edge matchers."""
        rc = self._column("RC")
        attributes = {
            "none": None,
            "cycle": self._column("cycle"),
            "signature_rc": self._column("signature_rc"),
            "atom_count": self._column("atom_count"),
        }
        for name, att in attributes.items():
            with self.subTest(attribute=name):
                clusters, graph_to_cluster = self.clusterer.iterative_cluster(
                    rc,
                    att,
                    nodeMatch=self.clusterer.nodeMatch,
                    edgeMatch=self.clusterer.edgeMatch,
                )
                self.assertIsInstance(clusters, list)
                self.assertIsInstance(graph_to_cluster, dict)
                self.assertEqual(len(clusters), 30)

    def test_auto_cluster_wrong_isomorphism(self):
        """Pin the cluster counts obtained when node/edge matching is disabled.

        The counts are the values recorded by the original test. Its comments
        flag 8 and 27 as wrong results (the matcher-based test expects 30) and
        note that the signature case reaching 30 is an empirical observation
        with no proof behind it.
        """
        rc = self._column("RC")
        cases = [
            ("none", None, 8),  # wrong value
            ("cycle", self._column("cycle"), 8),  # wrong value
            ("atom_count", self._column("atom_count"), 27),  # wrong, but close
            ("signature_rc", self._column("signature_rc"), 30),  # correct, unproven
        ]
        for name, att, expected_count in cases:
            with self.subTest(attribute=name):
                clusters, _ = self.clusterer.iterative_cluster(
                    rc, att, nodeMatch=None, edgeMatch=None
                )
                self.assertEqual(len(clusters), expected_count)

    def test_fit(self):
        """Test that fit assigns a "class" index to every data entry."""
        clustered_data = self.clusterer.fit(
            self.graphs, rule_key="RC", attribute_key="atom_count"
        )
        self._assert_max_class(clustered_data)

    def test_fit_gml(self):
        """Test that fit assigns a "class" index when called with rule_key="rc".

        NOTE(review): the other tests read the "RC" key; confirm that the
        lowercase "rc" key is intended and present in the fixture entries.
        """
        clustered_data = self.clusterer.fit(
            self.graphs, rule_key="rc", attribute_key="atom_count"
        )
        self._assert_max_class(clustered_data)

    def test_fit_time_compare(self):
        """Compare fit run time across the available attribute keys.

        NOTE(review): the final two assertions depend on wall-clock ordering
        and may be flaky on a loaded machine; confirm they are still wanted.
        """
        attributes = {
            "None": None,
            "Cycles": "cycle",
            "Signature": "signature_rc",
            "Atom_count": "atom_count",
        }

        results = {}
        for name, attr in attributes.items():
            # perf_counter is monotonic and intended for measuring intervals.
            start_time = time.perf_counter()
            clustered_data = self.clusterer.fit(
                self.graphs, rule_key="RC", attribute_key=attr
            )
            results[name] = time.perf_counter() - start_time

            # Basic verification that 'class' is assigned and max is as expected
            with self.subTest(attribute=name):
                self._assert_max_class(clustered_data)

        # Compare results to check which attribute took the least/most time
        min_time_attr = min(results, key=results.get)
        max_time_attr = max(results, key=results.get)
        self.assertIn(min_time_attr, ["Atom_count", "Signature"])
        self.assertIn(max_time_attr, ["None", "Cycles"])
133+
134+
135+
# Entry point: run this module's test cases when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

Test/SynGraph/GML/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)