-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_prep_3.py
84 lines (72 loc) · 2.57 KB
/
data_prep_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#! /home/paddy/anaconda3/envs/mol_transformer/bin/python
"""
Takes a canonicalized SMILES reaction, with reactants and reagents separated,
and tokenizes it.
"""
import re
from typing import List
import click
from data_prep_1 import (
Reaction,
split_into_rxn_class,
sep_reactants_and_reagents,
)
from data_prep_2 import strip_and_canonicalize_rxn
tokenize_pattern: str = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
tokenize_regex: re.Pattern = re.compile(tokenize_pattern)
def tokenize_reaction(rxn: Reaction) -> str:
"""
Tokenizes the reaction. Reaction tokenizer from Philippe Schwaller et al.
"""
reactants: str = ".".join(rxn.reactants) + ">"
reagents: str = ".".join(rxn.reagents)
products: str = ">" + ".".join(rxn.products)
react_tokenized: str = " ".join(
[token for token in tokenize_regex.findall(reactants)]
)
reag_tokenized: str = " " + " . ".join(reagents.split(".")) + " "
prod_tokenized: str = " ".join(
[token for token in tokenize_regex.findall(products)]
)
return react_tokenized + reag_tokenized + prod_tokenized
@click.command()
@click.argument(
"smiles_file",
type=click.Path(exists=True, readable=True, resolve_path=True),
)
@click.option(
"--step1",
"step_1",
is_flag=True,
help="Activate this to also run Step 1 (separate reactants and reagents).",
)
@click.option(
"--step2",
"step_2",
is_flag=True,
help="Activate this to also run Step 2 (strip atom-mapping and canonicalize reaction).",
)
def main(smiles_file: str, step_1: bool, step_2: bool):
"""
This script takes a canonicalized SMILES reaction, with reactants and
reagents separated, and tokenizes it. Requires an input SMILES file. The
SMILES file must be atom-mapped in order for reactant/reagent separation to
work (if run with the --step2 flag).
"""
with open(smiles_file) as sf_file:
for line in sf_file:
if line == "":
continue
if step_1:
just_smi: str = line.split()[0]
prelim_rxn: Reaction = split_into_rxn_class(just_smi)
rr_sep: Reaction = sep_reactants_and_reagents(prelim_rxn)
else:
rr_sep: Reaction = split_into_rxn_class(line.split()[0])
can_rxn: Reaction = strip_and_canonicalize_rxn(
rr_sep
) if step_2 else rr_sep
tokenized_rxn: str = tokenize_reaction(can_rxn)
click.echo(tokenized_rxn)
if __name__ == "__main__":
main()