Skip to content

Commit

Permalink
Merge pull request #32 from nlesc-nano/filter
Browse files Browse the repository at this point in the history
make pipeline more robust
  • Loading branch information
felipeZ authored Aug 21, 2020
2 parents e80a0b6 + 192d37b commit 7741238
Show file tree
Hide file tree
Showing 7 changed files with 951 additions and 39 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Change Log

# 0.3.0 [date]
# 0.3.0 [21/08/2020]

## New
* Introduce Pipeline to filter ligands (#26)
* Introduce Pipeline to filter ligands (#13, #26)
* Use [SCScore](https://pubs.acs.org/doi/10.1021/acs.jcim.7b00622)
* Use [Horovod](https://github.com/horovod/horovod) to distribute the training
* Add [mypy](http://mypy-lang.org/) test
Expand Down
727 changes: 704 additions & 23 deletions notebooks/Filter_visualization.ipynb

Large diffs are not rendered by default.

192 changes: 192 additions & 0 deletions notebooks/candidates.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
,smiles
2,O=C/C(Cl)=C(/Cl)C(=O)O
3,O=C(O)C/C=C/CC(=O)O
4,O=C(O)C1CC(Br)(Br)C1
6,O=C(O)C1CCOC1=O
7,C#CCCC#CC(=O)O
8,N#CNCC(=O)O
9,C=CC(=C)C(=O)O
25,C=CNCC(=O)O
26,O=C(O)C1CS1
28,O=C(O)CCCOCCOC1CCCCO1
29,C=CCOCCOCC(=O)O
31,O=C(O)c1nccnc1O
34,N=C(S)C(=O)O
37,O=C(O)CCCCCCCCCCCCCCBr
38,O=C(O)COCCCl
39,O=C(O)c1nc(O)ccc1F
40,O=C(O)CC(=O)CCl
42,O=C(O)c1nc(Br)ccc1F
44,O=C(O)C1CSCN1
45,O=C(O)C(O)(O)C(=O)O
47,O=C(O)c1ncccc1F
48,O=C(O)C=Cc1ccccc1F
49,O=C(O)C=Cc1ccc(F)cc1
51,O=C(O)C=Cc1ccc2c(c1)OCO2
52,O=C(O)C(Cl)(Cl)Cl
56,O=C(O)C1CNCCN1
58,O=C(O)C(F)(F)C(F)F
71,O=C(O)c1cnc(Cl)nc1
77,O=C(O)/C(Cl)=C/Cl
83,O=C(O)C(F)(F)I
87,O=C(O)COCCBr
88,O=C(O)C1CS(=O)(=O)C1
89,O=C(O)CC(=O)C(F)(F)F
122,N#CCCSCC(=O)O
123,O=C(O)C=CI
124,O=C(O)C(F)(Cl)Cl
125,O=C(O)C(=O)CS
126,O=C(O)C=Cc1ccc(Cl)cc1
130,O=C(O)C1CO1
131,O=C(O)C1C=CCC1
132,O=C1CCC(F)(C(=O)O)CC1
133,O=C(O)CCCCCCCCCCCCCBr
137,O=C(O)/C(Cl)=C\c1ccccc1
143,O=C(O)C(Br)C(=O)O
146,O=C(O)COc1ccc(Br)cc1
147,O=C(O)COc1ccc(Cl)cc1
150,O=C(O)CI
151,O=C(O)C#CC(=O)O
153,C=CC(=O)O
155,O=C(O)C=CC(=O)O
156,O=C(O)CBr
157,C=CCCC(=O)O
159,O=C(O)C(F)(F)F
160,O=C(O)CCBr
163,O=Cc1ccc(C(=O)O)cc1
164,O=C(O)c1ccc(F)cc1
165,O=C(O)c1ccc(O)nc1
166,O=C(O)C=Cc1cccs1
168,O=C(O)CCCl
169,C=CCC(=O)O
176,O=C(O)C(Br)CBr
177,O=C(O)C(F)F
181,O=C(O)CCCCCBr
182,O=C(O)c1ncccc1O
183,O=C(O)c1cc(F)ccc1O
185,O=C(O)c1ccc(Cl)nc1
187,O=C(O)CCCCBr
188,O=C(O)COc1ccccc1
189,O=C(O)c1ccc(CBr)cc1
191,O=C(O)C1CCCN1
193,O=C(O)c1ccoc1
194,O=C(O)C(F)(F)C(F)(F)C(F)(F)F
196,O=C(O)C(=O)CBr
198,O=C(O)CCCBr
199,O=C(O)CCCCCCCCCCBr
201,O=C(O)CCCCCCCCCCCBr
202,O=C(O)CC(=O)C(=O)O
205,C=C(CBr)C(=O)O
209,O=C(O)CCCCCCCBr
221,C#CCCC(=O)O
224,O=C(O)CCCCl
225,O=C(O)c1ccc(O)cc1
226,N#CCC(=O)O
228,O=C(O)C(F)(F)Cl
230,O=C(O)c1ccc(Cl)cc1F
231,O=C(O)c1ccsc1
232,O=C(O)c1cc(F)ccc1F
238,O=CC(=O)O
240,O=C(O)c1ccc(F)cc1F
242,O=C(O)c1ccc(CCl)cc1
243,O=C(O)CCCCCl
244,O=C(O)C(Cl)Cl
246,O=C(O)CS
247,O=C(O)CC(=O)O
248,O=C(O)C=Cc1ccc(O)cc1
249,O=C(O)C1CCC1
253,O=C(O)C1CCCC1
254,O=C(O)C1CC1
261,O=C(O)C(Cl)CCl
265,O=C(O)c1ccc(O)cc1O
267,C=C(Br)C(=O)O
270,O=C(O)C(F)(F)C(F)(F)C(=O)O
271,O=C(O)C1CNC1
272,O=C(O)c1ccc(Br)cc1F
273,O=C(O)c1ccccn1
275,O=C(O)COCCOCC(=O)O
276,O=C(O)c1ccccc1F
278,O=C(O)c1c[nH]cn1
284,C#CCCCCC(=O)O
285,O=C(O)CCCCCCCCCCCCCCCBr
289,O=C(O)C1CC=CC1
292,O=C(O)CSCCSCC(=O)O
298,O=C(O)c1ccco1
302,O=C(O)C(F)(F)S(=O)(=O)F
304,O=C(O)c1cc(Cl)ccc1F
306,O=C(O)C1CCC(C(=O)O)CC1
307,O=C(O)CCCCCCCCCBr
310,O=C(O)c1cc(I)ccc1F
317,O=C(O)c1cc(Br)ccc1F
318,O=C(O)CCS
324,O=C(O)C1=CCSC1
325,O=C(O)C1(O)CCOCC1
327,O=C(O)CCOC1CCCCO1
328,O=C(O)c1cc(F)c(F)cc1O
330,O=C(O)C1CCC=CO1
333,N#CC=CC(=O)O
337,O=C(O)c1nc(Cl)ccc1F
341,O=C(O)C(=O)n1ccnc1
342,O=C(O)COCC(=O)n1ccnc1
350,O=C(O)c1cc(Br)c[nH]1
355,C=C=CCC(=O)O
363,O=C(O)C=Cc1ccc(Br)cc1
366,O=C(O)c1cc(Br)c(F)cc1F
368,O=C(O)CSc1cc[n+](CC(=O)c2ccccc2)cc1
371,O=C(O)C=CCBr
373,C=C1CC(C(=O)O)C1
376,O=C(O)c1cc[n+](Cc2ccc(F)cc2)cc1
377,O=C(O)c1cc(Br)cnc1F
379,O=C(O)CONC(=O)OCc1ccccc1
390,O=C(O)C(S)C(=O)O
391,C=CC(=O)C(=O)O
392,O=C(O)C(=O)C(Cl)(Cl)Cl
396,O=C(O)C=Cc1c(F)cc(F)cc1F
397,O=C(O)C1CSC1
398,O=C(O)C1SCCCS1
403,O=C(O)CCCCCCCCCCCCCCCCBr
408,O=C(O)CCCS
412,O=C(O)C1CCC=CCC1
416,O=C(O)C1CC(C(=O)O)S1
417,O=C(O)CCCCCCCCC[P+](c1ccccc1)(c1ccccc1)c1ccccc1
418,O=C(O)CC1(O)CC1
419,O=C(O)CS(=O)(=O)C(F)(F)F
430,C=CCOCC(=O)O
432,C=CCCOCC(=O)O
434,O=C(O)COC(F)(F)F
439,O=C(O)C=C(Cl)Cl
443,C#CC=CC(=O)O
444,O=C(O)C1C=CC=CC=C1
454,O=C(O)c1cc(F)c(Br)cc1F
459,C=C(Cl)C(=O)O
461,O=C(O)CSCSCC(=O)O
470,O=C(O)C(F)(Br)Br
471,O=C(O)C(F)(F)Br
477,O=C(O)c1cc(O)ccc1F
480,O=C(O)CCCCCCCCCCCCCCCCCCCCBr
481,O=C(O)CCCCCCBr
483,O=C(O)CCCCCCCCCCCCCCCCCCBr
486,O=C(O)c1ccc(Br)cc1O
488,C#CCC(=O)O
489,O=C(O)CF
491,O=C(O)c1ccc(Br)nc1
494,O=C(O)C(=O)c1ccco1
497,O=C(O)C(=O)c1ccccc1
498,O=C(O)C#Cc1ccccc1
499,O=C(O)C1CCNCC1
500,C#CC(=O)O
501,O=C(O)c1cnccn1
502,O=C(O)c1ccc[nH]1
504,O=C(O)c1cccs1
510,O=C(O)CCI
518,O=C/C(Cl)=C(\Cl)C(=O)O
525,O=C(O)C(=O)c1cc(F)ccc1F
529,O=C(O)C(=O)c1ccc(Br)cc1
546,O=C(O)C1(O)CSC1
548,C=CC(Cl)C(=O)O
552,O=C(O)c1cc(F)ccn1
559,O=C(O)C1CCOCC1
562,O=C(O)c1cc(Cl)ccn1
565,O=C(O)c1cc(Br)ccn1
573,N#CCSCC(=O)O
574,O=C(O)CSc1cccs1
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
'nano-CAT@git+https://github.com/nlesc-nano/nano-CAT@master',
'data-CAT@git+https://github.com/nlesc-nano/data-CAT@master',
'horovod', 'mendeleev', 'more_itertools', 'numpy', 'pandas',
'pyyaml>=5.1.1', 'seaborn', 'schema', 'sqlalchemy',
'pyyaml>=5.1.1', 'retry', 'seaborn', 'schema', 'sqlalchemy',
'torch-geometric', 'typing-extensions'],

extras_require={
Expand Down
51 changes: 44 additions & 7 deletions swan/cosmo/cat_interface.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,15 @@
"""Interface with CAT/PLAMS Packages."""
"""Interface with CAT/PLAMS Packages.
Index
-----
.. currentmodule:: swan.cosmo.cat_interface
API
---
.. autofunction:: call_mopac
.. autofunction:: call_cat_in_parallel
"""
import logging
import os
import shutil
Expand All @@ -16,6 +27,7 @@
import yaml
from more_itertools import chunked
from scm.plams import CRSJob, Settings
from retry import retry

import CAT
from CAT.base import prep
Expand All @@ -25,6 +37,9 @@
from ..utils import Options
from .functions import run_command

__all__ = ["call_cat_in_parallel", "call_mopac"]


T = TypeVar('T')

# Starting logger
Expand All @@ -35,6 +50,7 @@
logger.addHandler(handler)


@retry(FileExistsError, tries=3, delay=1)
def call_cat(smiles: pd.Series, opts: Mapping[str, T], chunk_name: str = "0") -> Path:
"""Call cat with a given `config` and return a dataframe with the results.
Expand All @@ -58,7 +74,7 @@ def call_cat(smiles: pd.Series, opts: Mapping[str, T], chunk_name: str = "0") ->
"""
# create workdir for cat
path_workdir_cat = Path(opts["workdir"]) / "cat_workdir" / chunk_name
path_workdir_cat.mkdir(parents=True)
path_workdir_cat.mkdir(parents=True, exist_ok=True)

path_smiles = (path_workdir_cat / "smiles.txt").absolute().as_posix()

Expand Down Expand Up @@ -129,20 +145,41 @@ def compute_bulkiness(smiles: pd.Series, opts: Mapping[str, T], indices: pd.Inde
chunk_name = str(indices[0])
try:
values = compute_bulkiness_using_cat(chunk, opts, chunk_name)
except RuntimeError:
values = np.repeat(np.nan, len(smiles))
except (RuntimeError):
logger.error(f"There was an error processing:\n{chunk.values}")
values = np.repeat(np.nan, len(indices))

return values


def call_cat_in_parallel(smiles: pd.Series, opts: Options) -> np.ndarray:
"""Compute a ligand/quantum dot property using CAT."""
"""Compute a ligand/quantum dot property using CAT.
It creates several instances of CAT using multiprocessing.
Parameters
----------
smiles
Pandas.Series with the smiles to compute
opts
Options to call CAT
Returns
-------
Numpy array with the computed properties
"""
worker = partial(compute_bulkiness, smiles, opts.to_dict())

with Pool() as p:
results = p.map(worker, chunked(smiles.index, 10))

return np.concatenate(results)
results = np.concatenate(results)

if len(smiles.index) != results.size:
msg = "WWW There is an incongruence in the bulkiness computed by CAT!"
raise RuntimeError(msg)

return results


def call_mopac(smile: str, solvents=["Toluene.coskf"]) -> Tuple[float, float]:
Expand All @@ -161,7 +198,7 @@ def call_mopac(smile: str, solvents=["Toluene.coskf"]) -> Tuple[float, float]:
return np.nan, np.nan
return call_cat_mopac(Path(tmp), smile, solvents)
except ValueError:
print(f"Error reading smile: {smile}")
logger.error(f"Error reading smile: {smile}")
return np.nan, np.nan

finally:
Expand Down
8 changes: 6 additions & 2 deletions swan/filter/screen.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
API
---
.. autofunction:: split_filter_in_batches
.. autofunction:: apply_filters
{autodata}
"""
Expand Down Expand Up @@ -112,8 +115,8 @@ def split_filter_in_batches(opts: Options) -> None:
try:
apply_filters(batch, opts, output_file)
except:
error = next(iter(sys.exc_info()))
logger.error(error)
error, msg, _ = sys.exc_info()
logger.error(f"Error processing batch: {k}\n{error} {msg}")


def apply_filters(molecules: pd.DataFrame, opts: Options, output_file: Path) -> None:
Expand Down Expand Up @@ -195,6 +198,7 @@ def filter_by_bulkiness(molecules: pd.DataFrame, opts: Options) -> pd.DataFrame:
raise RuntimeError("A core molecular geometry is needed to compute bulkiness")

molecules["bulkiness"] = call_cat_in_parallel(molecules.smiles, opts)
logger.debug("CAT has been called!")

return apply_predicate(molecules, "bulkiness", opts)

Expand Down
6 changes: 2 additions & 4 deletions tests/test_files/input_test_filter.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@ filters:
- "C(=O)O"
scscore:
lower_than:
2.5
2.0
bulkiness:
lower_than:
20


20

0 comments on commit 7741238

Please sign in to comment.