diff --git a/.github/workflows/ci_process.yml b/.github/workflows/ci_process.yml index 12fe7898..2068f43e 100644 --- a/.github/workflows/ci_process.yml +++ b/.github/workflows/ci_process.yml @@ -15,11 +15,11 @@ jobs: run: make build_docker - name: Run the project run: | - docker-compose up -d + docker compose up -d sleep 30 - name: Run unit tests run: | set -o pipefail make test_lillymol | tee ./unit_test_results.txt - name: Stop running - run: docker-compose down + run: docker compose down diff --git a/.vilerc b/.vilerc new file mode 100755 index 00000000..b22bcb7e --- /dev/null +++ b/.vilerc @@ -0,0 +1,35 @@ +set nowrapscan +set ignorecase +set autoindent +set nobackspacelimit +set check-modtime +set notabinsert +set undolimit 100 +set view-on-readonly +set c-shiftwidth 2 +set shiftwidth 2 +set cmode +set showmatch +set shell=/bin/bash +set popup-choices immediate +bind-key next-page \s +set c-suffixes "\\.\\(\\([Cchis]\\)\\|CC\\|cpp\\|cxx\\|hxx\\|scm\\|cc\\)$" + +store-procedure buffer_action + set title $cfilname + set iconname $cbufname +~endm +~if &seq $progname "xvile" + set buffer-hook buffer_action +~endif + +; word completion in insert mode (Komplete, Next, Previous) +; allows backspacing over inserted word +; ^K map complete word +; ^N map try next word for completion +; ^P map try previous word for completion + +map! . hbmmi?\<2h"zdt.@z ywmx`mPea dwbis"zdt.x@z +map! . hbmmi/\<2h"zdt.@z ywmx`mPea dwbis"zdt.x@z +map!  . hbdwmm`xnywmx`mPea dwbis"zdt.x@z +map!  . hbdwmm`xNywmx`mPea dwbis"zdt.x@z diff --git a/README.md b/README.md index 4d7d3a45..d4d9ba2a 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ LillyMol has some novel approaches to substructure searching, reaction enumerati chemical similarity. These have been developed over many years, driven by the needs of Computational and Medicinal Chemists at Lilly and elsewhere. 
+Recent work has focussed on making *de-novo* molecule construction and there are +several tools desiged to either support or complement A/I driven molecule +generation. + LillyMol is fast and scalable, with modest memory requirements. This release includes a number of C++ unit tests. All @@ -96,9 +100,6 @@ sudo apt install python-dev libblas-dev ``` Things seem to work seamlessly in virtualenv. -Installation within virtualenv works well. - - # TLDR If you have bazelisk and gcc installed, there is a reasonable possibility that issuing `make` in the top level directory will work (but see note below diff --git a/contrib/bin/Lilly_Medchem_Rules.rb b/contrib/bin/Lilly_Medchem_Rules.rb new file mode 100755 index 00000000..5585bfa6 --- /dev/null +++ b/contrib/bin/Lilly_Medchem_Rules.rb @@ -0,0 +1,341 @@ +#!/usr/bin/env ruby + +# Implementation of Lilly Medchem rules using latest LillyMol executables. + +ianhome = File.dirname($0) # location for supporting files + +ianhome = ENV['LILLYMOL_HOME'] +unless ianhome + ianhome = File.dirname(File.dirname(File.dirname __FILE__)) +end + +require "#{ianhome}/contrib/bin/lib/iwcmdline.rb" + +$expert = false + +$default_lower_atom_count_cutoff = 7 +$default_soft_upper_atom_count_cutoff = 25 +$default_hard_upper_atom_count_cutoff = 40 + +def usage (rc) + $stderr.print "Runs the Lilly medchem rules\n" + $stderr.print " -c lower atom count cutoff (default #{$default_lower_atom_count_cutoff})\n" if $expert + $stderr.print " -Cs soft upper atom count cuttof (default #{$default_soft_upper_atom_count_cutoff})\n" if $expert + $stderr.print " -Ch hard upper atom count cuttof (default #{$default_hard_upper_atom_count_cutoff})\n" if $expert + $stderr.print " -smarts optional smarts to reject\n" if $expert + $stderr.print " -rej optional query file to reject\n" if $expert + $stderr.print " -relaxed relaxed rules: 7-50 heavy atoms, 160 demerit cutoff\n" + $stderr.print " -nodemerit hard rejections only, do not apply any demerits\n" if $expert + 
$stderr.print " -S write output to rather than stdout\n" if $expert + $stderr.print " -B output name stem for rejected molecules\n" if $expert + $stderr.print " -log name stem for log files\n" if $expert + $stderr.print " -tp...-tp options passed directly to mc_first_pass\n" if $expert + $stderr.print " -iwd...-iwd options passed directly to iwdemerit\n" if $expert + $stderr.print " -odm omit demerit with file name \n" if $expert + $stderr.print " -edm extra demerits to be applied - query file format only\n" if $expert + $stderr.print " -dmrt extra demerits to be applied\n" if $expert + $stderr.print " -dcf demerit control file (the -C option to iwdemerit)\n" if $expert + $stderr.print " -q directory for queries\n" if $expert + $stderr.print " -okiso allow isotopic atoms to pass through\n"; + $stderr.print " -symm discard symmetric molecules where two symmetric atoms > apart\n" if $expert + $stderr.print " -noapdm do not append demerit reasons\n" + $stderr.print " -i input type\n" if $expert + $stderr.print " -expert more options\n" unless $expert; + $stderr.print " -v verbose output\n" + exit(rc) +end + +cl = IWCmdline.new("-v-noapdm-i=s-expert-b=fraction-B=s-q=dir-log=s-tp=close-iwd=close-smarts=s-rej=s-c=ipos-Cs=ipos-Ch=ipos-okiso-odm=s-edm=sfile-relaxed-nodemerit-S=s-dcf=sfile-nobadfiles-symm=ipos") + +if cl.unrecognised_options_encountered() + $stderr.print "Unrecognised options encountered\n" + usage(1) +end + +verbose = cl.option_present('v') + +if cl.option_present('expert') + $expert = true +end + +if ARGV.empty? + $stderr.print "Insufficient arguments\n" + usage(2) +end + +append_demerit_reason = ! 
cl.option_present('noapdm') # convoluted inverse logic + +stop_afer_completing_step = 3 # no longer optional + +input_type = "" +if cl.option_present('i') + input_type = '-i ' << cl.value('i') +end + +# The ring bond ratio used by mc_first_pass + +ring_bond_ratio = -1 +if cl.option_present('b') + ring_bond_ratio = cl.value('b') +end + +mc_first_pass_options = "" +if cl.option_present('tp') + mc_first_pass_options = cl.value('tp') +end + +unless cl.option_present('okiso') + mc_first_pass_options << ' -I 0' +end + +mc_first_pass_options << ' -A I -A ipp' + +iwdemerit = "#{ianhome}/bin/Linux/iwdemerit" +mc_first_pass = "#{ianhome}/bin/Linux/tp_first_pass" +tsubstructure = "#{ianhome}/bin/Linux/tsubstructure" + +query_dir = "#{ianhome}/data/LillyMedchemRules" + +query_dir = cl.value('q') if cl.option_present('q') + +# The default query files + +query_file = Array.new + +query_file[1] = 'reject1' +query_file[2] = 'reject2' +query_file[3] = 'demerits' + +# The default stem of the rejected files + +bad_stem = false + +if cl.option_present('B') + bad_stem = cl.value('B') +elsif cl.option_present('nobadfiles') + true +else + bad_stem = 'bad' +end + +# The default stem for log files + +logfilestem = 'ok'; + +if cl.option_present('log') + logfilestem = cl.value('log') +end + +optional_queries = "" + +cl.values('rej').each do |q| + optional_queries << " -q #{q}" +end + +cl.values('smarts').each do |s| + optional_queries << " -s '#{s}'" +end + +extra_iwdemerit_options = "" + +if cl.option_present('relaxed') + $default_soft_upper_atom_count_cutoff = 26 + $default_hard_upper_atom_count_cutoff = 50 + extra_iwdemerit_options << " -f 160" +end + +if cl.option_present('nodemerit') + extra_iwdemerit_options << " -r" + $default_soft_upper_atom_count_cutoff = $default_hard_upper_atom_count_cutoff - 1 +end + +if cl.option_present('iwd') + extra_iwdemerit_options << " " << cl.value('iwd') +end + +if cl.option_present('symm') + extra_iwdemerit_options << " -s " << cl.value('symm').to_s 
+end + +charge_assigner = "#{ianhome}/data/queries/charges/queries" + +unless FileTest.size?(charge_assigner) + $stderr << "Charge assigner not available, skipping\n" +else + extra_iwdemerit_options << " -N F:#{charge_assigner}" +end + +lower_atom_count_cutoff = $default_lower_atom_count_cutoff +if cl.option_present('c') + lower_atom_count_cutoff = cl.value('c') +end + +soft_upper_atom_count_cutoff = $default_soft_upper_atom_count_cutoff; +if cl.option_present('Cs') + soft_upper_atom_count_cutoff = cl.value('Cs') +end + +hard_upper_atom_count_cutoff = $default_hard_upper_atom_count_cutoff; +if cl.option_present('Ch') + hard_upper_atom_count_cutoff = cl.value('Ch') + unless cl.option_present('Cs') + soft_upper_atom_count_cutoff = hard_upper_atom_count_cutoff - 1 + end +end + +if hard_upper_atom_count_cutoff < soft_upper_atom_count_cutoff + hard_upper_atom_count_cutoff = soft_upper_atom_count_cutoff + 1 +end + +unless FileTest.directory?(query_dir) + $stderr.print "Cannot continue, query dir '#{query_dir}' invalid\n" + exit(3) +end + +raise "Query file '#{query_dir}/#{query_file[1]}' missing or inaccessible" unless (FileTest.size?("#{query_dir}/#{query_file[1]}") && FileTest.readable?("#{query_dir}/#{query_file[1]}")) +raise "Query file '#{query_dir}/#{query_file[2]}' missing or inaccessible" unless (FileTest.size?("#{query_dir}/#{query_file[2]}") && FileTest.readable?("#{query_dir}/#{query_file[2]}")) +raise "Query file '#{query_dir}/#{query_file[3]}' missing or inaccessible" unless (FileTest.size?("#{query_dir}/#{query_file[3]}") && FileTest.readable?("#{query_dir}/#{query_file[3]}")) + +$stderr.print "Queries from '#{query_dir}'\n" if verbose + +files_to_be_deleted = Array.new + +query_file3 = "#{query_dir}/#{query_file[3]}" + +# I want to have the odm option behave as either a regular expression +# or as exact matches. 
+# Note that I don't check against multiple RX= directives + +class Odm + def initialize(o) + @rx = false + @hash = Hash.new + + o.each do |d| + if 'RX=' == d[0,3] + @rx = Regexp.new(d[3, d.size - 3], Regexp::IGNORECASE) + else + @hash[d] = true + end + end + end + + def match(d) + if @hash.has_key?(d) + return true + end + + if @rx + return @rx.match(d) + end + + return false + end +end + +if cl.option_present('odm') + + tmpdir = "." + if cl.option_present('tmpdir') + tmpdir = cl.value('tmpdir') + end + + old_demerits = File.open("#{query_dir}/#{query_file[3]}", mode='r') + raise "Cannot open original demerit file '#{query_dir}/#{query_file[3]}'" unless (old_demerits) + + temporary_demerit_file = "#{tmpdir}/demerits" << Process.pid.to_s + new_demerits = File.open(temporary_demerit_file, mode='w') + raise "Cannot open temporary demerit file '#{temporary_demerit_file}'" unless (new_demerits) + + odm = Odm.new(cl.values('odm')) + + demerit_rx = Regexp.new("^(\\S+)\.qry") + + items_discarded = 0 + + old_demerits.each do |line| + m = demerit_rx.match(line) + + next unless (m) + + stem = m[1] + + if odm.match(stem) + items_discarded += 1 + else + new_demerits << "#{query_dir}/#{stem}.qry\n" + end + end + + if 0 == items_discarded + $stderr.print "Warning, no demerits discarded\n" + elsif verbose + $stderr.print "Discarded #{items_discarded} demerits\n" + end + + new_demerits.close + + query_file3 = "#{temporary_demerit_file}" + + files_to_be_deleted.push(query_file3) +end + +iwdemerit_optional_control_file = false + +if cl.option_present('dcf') + iwdemerit_optional_control_file = cl.value('dcf') +end + +if cl.option_present('edm') + additional_demerits = cl.value('edm') +end + +cmd = "#{mc_first_pass} "; + +cmd << " -b #{ring_bond_ratio}" if ring_bond_ratio >= 0.0 + +cmd << " #{mc_first_pass_options}" if mc_first_pass_options.length > 0 + +cmd << " #{input_type} " if input_type.length > 0 + +cmd << " -c #{lower_atom_count_cutoff} -C #{hard_upper_atom_count_cutoff} -E 
autocreate -o smi -V -g all -g ltltr -i ICTE " +cmd << "-L #{bad_stem}0 -K TP1 " if bad_stem +cmd << "-a -S - #{ARGV.join(' ')} 2> #{logfilestem}0.log " + +if stop_afer_completing_step >= 1 + cmd << "| #{tsubstructure} -E autocreate -b -u -i smi -o smi -A D " + cmd << "-m #{bad_stem}1 -m QDT " if bad_stem + cmd << "-n - -q F:#{query_dir}/#{query_file[1]} " + + cmd << optional_queries if optional_queries.length > 0 + + cmd << " - 2> #{logfilestem}1.log "; + + if stop_afer_completing_step >= 2 + cmd << "| #{tsubstructure} -A D -E autocreate -b -u -i smi -o smi " + cmd << "-m #{bad_stem}2 -m QDT " if bad_stem + cmd << "-n - -q F:#{query_dir}/#{query_file[2]} - 2> #{logfilestem}2.log "; + if stop_afer_completing_step >= 3 + cmd << " | #{iwdemerit} -x #{extra_iwdemerit_options} -E autocreate -A D -i smi -o smi -q F:#{query_file3} " + cmd << "-R #{bad_stem}3 " if bad_stem + cmd << "-G - -c smax=#{soft_upper_atom_count_cutoff} -c hmax=#{hard_upper_atom_count_cutoff} " + cmd << "-q F:#{additional_demerits} " if additional_demerits + cmd << "-C #{iwdemerit_optional_control_file} " if iwdemerit_optional_control_file + cmd << "-t " if append_demerit_reason + cmd << "- 2> #{logfilestem}3.log " + end + end +end + +if cl.option_present('S') + s = cl.value('S') + cmd << " > #{s}" +end + +$stderr.print "Command is '#{cmd}'\n" if verbose + +system(cmd) + +files_to_be_deleted.each do |f| + File.unlink(f) if FileTest.exists?(f) +end diff --git a/contrib/bin/Lilly_Medchem_Rules.sh b/contrib/bin/Lilly_Medchem_Rules.sh new file mode 100755 index 00000000..eb4ba16b --- /dev/null +++ b/contrib/bin/Lilly_Medchem_Rules.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +if [[ ! 
-v LILLYMOL_HOME ]] ; then + here=$(readlink -f $0) + echo ${here} + export LILLYMOL_HOME=$(dirname $(dirname $(dirname ${here}))) +fi + +exec ruby $(dirname ${here})/Lilly_Medchem_Rules.rb "$@" diff --git a/contrib/bin/dopattern.rb b/contrib/bin/dopattern.rb index 1810d4b7..70cb1b1d 100755 --- a/contrib/bin/dopattern.rb +++ b/contrib/bin/dopattern.rb @@ -4,7 +4,7 @@ #tool_home = ENV['C3TK_BIN'] tool_home = ".." -require_relative "#{tool_home}/ruby/lib/iwcmdline" +require_relative "lib/iwcmdline" #GH Original #~ cl = IWCmdline.new("-v-a=i-o=i-e=i-w=ipos-k-ks-dry-start=i-stop=i-do=s-stem=s-rx=s-suffix=s-dsc=sfile-qsub-cluster-cluster.seq-sync-qsubopt=close-file=sfile-col=s-echo-echon-f=s-rxdir=dir-subdir-s-array=s-sleep=ipos-sortrx-expert-parallel=ipos-l=s-V-j-q-submit=xfile-noeval") diff --git a/contrib/bin/dopattern.sh b/contrib/bin/dopattern.sh index fa0e1fe6..121b567e 100755 --- a/contrib/bin/dopattern.sh +++ b/contrib/bin/dopattern.sh @@ -1,21 +1,11 @@ #!/bin/bash -# $Id$ - -if [[ -d "../ruby" ]] -then # place here all your code - # define executable - program="../ruby/dopattern.rb" - # check that executable exists - if [ ! 
-s "$program" ] - then - echo "Cannot access executable '$program'" >&2 - exit 1 - fi - - # this will execute and exit, returning the exit status of the command - exec $program "$@" +here=$(dirname $(readlink -e $0)) +if [[ -v LILLYMOL_HOME ]] ; then + true else - echo $(basename $0)": Required libraries are not found under ruby folder" >&2 && exit 1 + export LILLYMOL_HOME=$(dirname $(dirname $(readlink -e $0))) fi + +exec ruby ${here}/dopattern.rb "$@" diff --git a/contrib/bin/gfp_erg.sh b/contrib/bin/gfp_erg.sh index 1b9807a6..39cf7b49 100755 --- a/contrib/bin/gfp_erg.sh +++ b/contrib/bin/gfp_erg.sh @@ -1,10 +1,12 @@ #!/bin/bash # Set up queries for gfp_erg -if [ -v LILLYMOL_HOME ] ; then +here=$(dirname $(readlink -e $0)) + +if [[ -v LILLYMOL_HOME ]] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname ${here})) fi ERG_QUERIES=${LILLYMOL_HOME}/data/ErG diff --git a/contrib/bin/gfp_make.pl b/contrib/bin/gfp_make.pl index 6bebc9ee..09cd42ee 100755 --- a/contrib/bin/gfp_make.pl +++ b/contrib/bin/gfp_make.pl @@ -28,7 +28,7 @@ my @bindir; unshift (@bindir, "$ianhome/bin/Linux"); -unshift (@bindir, "$ianhome/contrib/script/sh"); +unshift (@bindir, "$ianhome/contrib/bin"); # Flags for each kind of fingerprint possible @@ -1435,7 +1435,7 @@ sub ust_needed_for_atype } elsif ($opt eq "-D2F") { - my $descriptors_to_fingerprint = find_executable('descriptors_to_fingerprint.sh'); + my $descriptors_to_fingerprint = find_executable('descriptors_to_fingerprint'); $d2f_string .= "|${descriptors_to_fingerprint} -f "; my $tmp = $ARGV[$argptr++]; diff --git a/contrib/bin/gfp_make.sh b/contrib/bin/gfp_make.sh new file mode 100755 index 00000000..1983919c --- /dev/null +++ b/contrib/bin/gfp_make.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +here=$(dirname $(readlink -e $0)) + +if [ -v LILLYMOL_HOME ] ; then + true +else + export LILLYMOL_HOME=$(dirname $(dirname ${here})) +fi + +exec perl ${here}/gfp_make.pl "$@" + diff --git 
a/contrib/bin/graph_edit_changes.sh b/contrib/bin/graph_edit_changes.sh index 2b6a3e40..41ecb0fe 100755 --- a/contrib/bin/graph_edit_changes.sh +++ b/contrib/bin/graph_edit_changes.sh @@ -5,10 +5,11 @@ # TODO: random_molecular_transformations emits too many warning # messages, code needs cleanup.. +set -x if [[ -v LILLYMOL_HOME ]] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi lib="${LILLYMOL_HOME}/data/random_molecular_permutations.d" diff --git a/contrib/bin/hydrophobic_sections.sh b/contrib/bin/hydrophobic_sections.sh index 5d6e655b..2725bdeb 100755 --- a/contrib/bin/hydrophobic_sections.sh +++ b/contrib/bin/hydrophobic_sections.sh @@ -3,7 +3,7 @@ if [ -v LILLYMOL_HOME ] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi exec ${LILLYMOL_HOME}/bin/Linux/hydrophobic_sections -E autocreate -G def -L def -i smi "$@" diff --git a/contrib/bin/iwdescr.sh b/contrib/bin/iwdescr.sh index 43a6f407..c0023ef1 100755 --- a/contrib/bin/iwdescr.sh +++ b/contrib/bin/iwdescr.sh @@ -3,7 +3,7 @@ if [ -v LILLYMOL_HOME ] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi charges="${LILLYMOL_HOME}/data/queries/charges/queries" diff --git a/contrib/bin/jwcats.sh b/contrib/bin/jwcats.sh index 45d92950..8fdd51ae 100755 --- a/contrib/bin/jwcats.sh +++ b/contrib/bin/jwcats.sh @@ -5,7 +5,7 @@ if [ -v LILLYMOL_HOME ] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi charges="${LILLYMOL_HOME}/data/queries/charges/queries" diff --git a/contrib/bin/jwdist.sh b/contrib/bin/jwdist.sh index 218067e6..ef498148 100755 --- a/contrib/bin/jwdist.sh +++ b/contrib/bin/jwdist.sh @@ -5,7 +5,7 @@ if [[ 
-v LILLYMOL_HOME ]] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi charges="${LILLYMOL_HOME}/data/queries/charges/" diff --git a/contrib/bin/medchem_wizard.sh b/contrib/bin/medchem_wizard.sh index 508b0b35..a11cd2e7 100755 --- a/contrib/bin/medchem_wizard.sh +++ b/contrib/bin/medchem_wizard.sh @@ -3,7 +3,7 @@ if [ -v LILLYMOL_HOME ] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi rxn_dir=${LILLYMOL_HOME}/data/MedchemWizard diff --git a/contrib/bin/minor_changes.sh b/contrib/bin/minor_changes.sh index 0a4de940..655fc162 100755 --- a/contrib/bin/minor_changes.sh +++ b/contrib/bin/minor_changes.sh @@ -1,10 +1,9 @@ #!/usr/bin/env bash if [[ ! -v LILLYMOL_HOME ]] ; then - me=$0 - export LILLYMOL_HOME=$(dirname $(dirname $(dirname ${me}))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi -config=${LILLYMOL_HOME}/data/minor_changes/minor_changes.textproto +config=${LILLYMOL_HOME}/data/minor_changes.textproto $LILLYMOL_HOME/bin/$(uname)/minor_changes -C ${config} "$@" diff --git a/contrib/bin/pubchem_fingerprints.sh b/contrib/bin/pubchem_fingerprints.sh index 13bab858..08ec6596 100755 --- a/contrib/bin/pubchem_fingerprints.sh +++ b/contrib/bin/pubchem_fingerprints.sh @@ -3,7 +3,7 @@ if [ -v LILLYMOL_HOME ] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi # The directory in which the queries are found diff --git a/contrib/bin/random_molecular_permutations.sh b/contrib/bin/random_molecular_permutations.sh index 082d4f0d..0fd7ab27 100755 --- a/contrib/bin/random_molecular_permutations.sh +++ b/contrib/bin/random_molecular_permutations.sh @@ -3,11 +3,7 @@ if [[ -v LILLYMOL_HOME ]] ; then true else - here=$(dirname $0) - up=$(dirname 
${here}) - up=$(dirname ${up}) - export LILLYMOL_HOME=${up} - echo "LILLYMOL_HOME at ${LILLYMOL_HOME}" + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi lib="${LILLYMOL_HOME}/data/random_molecular_permutations.d" diff --git a/contrib/bin/reduced_graph.sh b/contrib/bin/reduced_graph.sh index 7e00e56f..f272b090 100755 --- a/contrib/bin/reduced_graph.sh +++ b/contrib/bin/reduced_graph.sh @@ -3,7 +3,7 @@ if [ -v LILLYMOL_HOME ] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi charges="${LILLYMOL_HOME}/data/queries/charges/queries" diff --git a/contrib/bin/smiles_in_text.py b/contrib/bin/smiles_in_text.py index a0b4c17f..37bbed40 100644 --- a/contrib/bin/smiles_in_text.py +++ b/contrib/bin/smiles_in_text.py @@ -19,6 +19,7 @@ import string from absl import app +from absl import logging from lillymol import * @@ -30,7 +31,7 @@ def main(argv): ';', '<', '>', '?', '^', '_', '`', '{', '|', '}', '~', '“', '”'] p = re.compile("[" + re.escape("".join(my_punct)) + "]") - do_not_process = re.compile(".*\.(png|pdf)$") + do_not_process = re.compile(".*\.(png|pdf|pyc)$") # initially implemented this to avoid finding the same molecule # multiple times, but we actually want to find all instances of @@ -42,6 +43,7 @@ def main(argv): fname = os.path.join(root, name) if do_not_process.match(fname): continue + logging.info("Processing %s", fname) with open(fname, "r") as reader: for line in reader: no_punctuation = p.sub(" ", line.rstrip()) diff --git a/contrib/bin/xgbd/random_forest_model_pb2.py b/contrib/bin/xgbd/random_forest_model_pb2.py new file mode 100755 index 00000000..2de73140 --- /dev/null +++ b/contrib/bin/xgbd/random_forest_model_pb2.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: Utilities/General/random_forest_model.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n+Utilities/General/random_forest_model.proto\x12\x11RandomForestModel\"\xed\x04\n\x16RandomForestParameters\x12\x19\n\x0cn_estimators\x18\x01 \x01(\rH\x01\x88\x01\x01\x12\x16\n\tmax_depth\x18\x02 \x01(\rH\x02\x88\x01\x01\x12\x1e\n\x11min_samples_split\x18\x03 \x01(\rH\x03\x88\x01\x01\x12\x1d\n\x10min_samples_leaf\x18\x04 \x01(\rH\x04\x88\x01\x01\x12 \n\x13min_weight_fraction\x18\x05 \x01(\x02H\x05\x88\x01\x01\x12\x0b\n\x01s\x18\x10 \x01(\tH\x00\x12\x0b\n\x01i\x18\x11 \x01(\rH\x00\x12\x0b\n\x01\x66\x18\x12 \x01(\x02H\x00\x12\x1b\n\x0emax_leaf_nodes\x18\x06 \x01(\rH\x06\x88\x01\x01\x12\"\n\x15min_impurity_decrease\x18\x07 \x01(\x02H\x07\x88\x01\x01\x12\x16\n\tbootstrap\x18\x08 \x01(\x08H\x08\x88\x01\x01\x12\x16\n\toob_score\x18\t \x01(\x08H\t\x88\x01\x01\x12\x13\n\x06n_jobs\x18\n \x01(\rH\n\x88\x01\x01\x12\x19\n\x0crandom_state\x18\x0b \x01(\x05H\x0b\x88\x01\x01\x12\x14\n\x07verbose\x18\x0c \x01(\x05H\x0c\x88\x01\x01\x42\x0e\n\x0cmax_featuresB\x0f\n\r_n_estimatorsB\x0c\n\n_max_depthB\x14\n\x12_min_samples_splitB\x13\n\x11_min_samples_leafB\x16\n\x14_min_weight_fractionB\x11\n\x0f_max_leaf_nodesB\x18\n\x16_min_impurity_decreaseB\x0c\n\n_bootstrapB\x0c\n\n_oob_scoreB\t\n\x07_n_jobsB\x0f\n\r_random_stateB\n\n\x08_verbose\"\xde\x02\n\x11RandomForestModel\x12\x17\n\nmodel_type\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x15\n\x08response\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x42\n\nparameters\x18\x03 
\x01(\x0b\x32).RandomForestModel.RandomForestParametersH\x02\x88\x01\x01\x12\x1b\n\x0e\x63lassification\x18\x04 \x01(\x08H\x03\x88\x01\x01\x12H\n\x0bname_to_col\x18\x05 \x03(\x0b\x32\x33.RandomForestModel.RandomForestModel.NameToColEntry\x1a\x30\n\x0eNameToColEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\r:\x02\x38\x01\x42\r\n\x0b_model_typeB\x0b\n\t_responseB\r\n\x0b_parametersB\x11\n\x0f_classificationb\x06proto3') + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'Utilities.General.random_forest_model_pb2', _globals) +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + _RANDOMFORESTMODEL_NAMETOCOLENTRY._options = None + _RANDOMFORESTMODEL_NAMETOCOLENTRY._serialized_options = b'8\001' + _globals['_RANDOMFORESTPARAMETERS']._serialized_start=67 + _globals['_RANDOMFORESTPARAMETERS']._serialized_end=688 + _globals['_RANDOMFORESTMODEL']._serialized_start=691 + _globals['_RANDOMFORESTMODEL']._serialized_end=1041 + _globals['_RANDOMFORESTMODEL_NAMETOCOLENTRY']._serialized_start=931 + _globals['_RANDOMFORESTMODEL_NAMETOCOLENTRY']._serialized_end=979 +# @@protoc_insertion_point(module_scope) diff --git a/contrib/bin/xgbd/rf_evaluate.py b/contrib/bin/xgbd/rf_evaluate.py new file mode 100644 index 00000000..aad68b95 --- /dev/null +++ b/contrib/bin/xgbd/rf_evaluate.py @@ -0,0 +1,90 @@ +# Evaluate an xgboost descriptor model built with xgboost_make + +import os +import re + +import joblib +import pandas as pd + +from absl import app +from absl import flags +from absl import logging +from google.protobuf import text_format + +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestRegressor + +import random_forest_model_pb2 + +FLAGS = flags.FLAGS + +flags.DEFINE_string("mdir", "", "Model directory") + +def get_model(mdir: str)->tuple: + """Look for 'what_kind_of_model` in `mdir` and make sure it is OK + 
Return a model instantiated from mdir/random_forest.joblib and the + name of the response + """ + fname = os.path.join(mdir, "model_metadata.txt") + if not os.path.exists(fname): + logging.error("%s not found", fname) + return None, None + + with open(fname, "r") as reader: + text = reader.read() + + proto = text_format.Parse(text, random_forest_model_pb2.RandomForestModel()) + if not proto: + logging.error("Cannot interpret as proto %s", text) + return None, None + + if not proto.response: + logging.error("No response in %s", fname) + return None, None + + model_file = os.path.join(mdir, "random_forest.joblib") + if not os.path.exists(model_file): + logging.error("%s not found", model_file) + return None + + model = joblib.load(model_file) + + return model, proto.response + +def random_forest_evaluate(mdir: str, fname: str)->bool: + """Read `fname` as descriptors for a model in `mdir` + """ + if not os.path.isdir(mdir): + logging.error("Model directory %s not found", mdir) + return False + + model, response = get_model(mdir) + if not model: + logging.error("Invalid mode in %s", mdir) + return False + + data = pd.read_csv(fname, sep=' ', header=0) + + logging.info("Evaluating %d rows", len(data)) + results = model.predict(data.iloc[:,1:]) + print(f"Id RF_{response}") + for i in range(len(results)): + print(f"{data.iloc[i,0]} {results[i]:.4f}") + + return True + +def main(argv): + """Evaluate a random forest descriptor model. 
+ """ + if len(argv) == 1: + logging.error("Must specify descriptor file as argument") + return 1 + + if not FLAGS.mdir: + logging.error("must specify model directory via the --mdir option") + return 1 + + return random_forest_evaluate(FLAGS.mdir, argv[1]) + +if __name__ == '__main__': + app.run(main) diff --git a/contrib/bin/xgbd/rf_evaluate.rb b/contrib/bin/xgbd/rf_evaluate.rb new file mode 100644 index 00000000..0ca68b43 --- /dev/null +++ b/contrib/bin/xgbd/rf_evaluate.rb @@ -0,0 +1,116 @@ +#!/usr/bin/env ruby + +# Evaluate an RF model from either smiles or a descriptor file. + +require 'set' +require 'google/protobuf' + +c3tk_home = ENV['C3TK_HOME'] +raise 'C3TK_HOME not defined' unless c3tk_home + +require "#{c3tk_home}/bin/ruby/lib/iwcmdline" +require "#{c3tk_home}/bin/py/pytk/xgbd/random_forest_model_pb" + +def usage +msg = <<-END +Scores a random_forest descriptor model built with rf_make. +Takes either a smiles file or a descriptor file, +If a smiles file is given, the descriptors required for the model will be computed by make_descriptors + -mdir model directory created by rf_make + -smi input is a smiles file - model descriptors will be computed. + -pipe use descriptor_pipeline for descriptor compuation, + only some descriptors are supported, but maybe faster. 
+ -j send the -j option to make_descriptors.sh + -v verbose output +END + $stderr << msg + + exit(0) +end + +# Return the proto mdir/model_metadata.dat +def get_model_metadata(mdir) + fname = File.join(mdir, 'model_metadata.dat') + + unless File.size?(fname) + $stderr << "Empty or missing model metadata #{fname}\n" + return 1 + end + + return RandomForestModel::RandomForestModel.decode(File.read(fname)) +end + +def rf_evaluate_smiles(fname, mdir, proto, cl) + descriptors = Set.new() + proto.name_to_col.each do |name, col| + descriptors.add(name.gsub(/_.*/, "")) + end + + if cl.option_present('pipe') + cmd = 'descriptor_pipeline.sh' + else + cmd = 'make_descriptors.sh' + end + + if cl.option_present('j') + j = cl.value('j') + cmd << " -j #{j}" + end + + tmpfile = File.join(ENV['TMPDIR'], "rf_evaluate_smiles_#{Process.uid}.#{Process.pid}.dat") + + descriptors.each do |d| + cmd << " -#{d}" + end + cmd << " #{fname} > #{tmpfile}" + $stderr << "Executing #{cmd}\n" if cl.option_present('v') + + system(cmd) + unless File.size?(tmpfile) + $stderr << "#{cmd} failed\n" + return + end + + rf_evaluate_descriptors(tmpfile, mdir, proto, cl) + + File.unlink(tmpfile) +end + +def rf_evaluate_descriptors(fname, mdir, proto, cl) + cmd = "rf_evaluate.sh -mdir #{mdir} #{fname}" + + $stderr << "Executing #{cmd}\n" if cl.option_present('v') + system(cmd) +end + +def main + cl = IWCmdline.new("-v-mdir=dir-smi-pipe-j=ipos") + + if cl.unrecognised_options_encountered + $stderr << "unrecognised_options_encountered\n" + usage + end + + unless cl.option_present('mdir') + $stderr << "Must specify random forest model directory via the -mdir option\n" + usage + end + + if ARGV.empty? 
+ $stderr << "Insufficient arguments\n" + usage + end + + mdir = cl.value('mdir') + + proto = get_model_metadata(mdir) + + if cl.option_present('smi') || ARGV[0].match(/\.smi$/) + rf_evaluate_smiles(ARGV[0], mdir, proto, cl) + else + rf_evaluate_descriptors(ARGV[0], mdir, proto, cl) + end + +end + +main diff --git a/contrib/bin/xgbd/rf_make.py b/contrib/bin/xgbd/rf_make.py new file mode 100644 index 00000000..323b936f --- /dev/null +++ b/contrib/bin/xgbd/rf_make.py @@ -0,0 +1,240 @@ +# Build and commit an RF model +# Deliberately simplistic in approach + +import os + +import joblib +import pandas as pd +import sklearn + +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestRegressor + +from absl import app +from absl import flags +from absl import logging +from google.protobuf import text_format + +import random_forest_model_pb2 + +FLAGS = flags.FLAGS + +flags.DEFINE_string("activity", "", "Name of training set activity file") +flags.DEFINE_boolean("classification", False, "True if this is a classification task") +flags.DEFINE_string("mdir", "", "Directory into which the model is placed") +flags.DEFINE_integer("max_num_features", 0, "Maximum number of features to plot in variable importance") +flags.DEFINE_boolean("feature_importance", False, "Create mdir/feature_importance.txt or not") +flags.DEFINE_integer("rfverbosity", 0, "RF verbosity") +flags.DEFINE_string("proto", "", "A file containing an RandomForestModel proto") +flags.DEFINE_integer("max_depth", 12, "max depth") +flags.DEFINE_integer("n_estimators", 100, "number of estimators") +flags.DEFINE_integer("min_samples_split", 4, "minum number of samples required to split an internal node") +flags.DEFINE_integer("min_samples_leaf", 2, "minum number of samples required to be in a leaf node") +flags.DEFINE_integer("n_jobs", -1, "parallelism, -1 means use all available processors") + +class Options: + def __init__(self): + self.classification = False + self.mdir: str = "" + 
self.max_num_features: int = 10 + self.feature_importance: bool = False + self.proto = random_forest_model_pb2.RandomForestParameters() + + def read_proto(self, fname)->bool: + """Read self.proto from `fname` + """ + with open(fname, "r") as reader: + text = reader.read() + + self.proto = text_format.Parse(text, random_forest_model_pb2.RandomForestParameters()) + if not self.proto: + logging.error("Cannot intpret %s", text) + return False + + return True + +# Note that we are doing just impurity based feature importance which is known +# to have a bias against features with just a few values. So integer values +# will likely show up as less important than floating point descriptors. +def do_feature_importance(booster, feature_names, options: Options): + logging.info("Doing feature importance") + + importances = booster.feature_importances_ + + forest_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False) + +# plotting never worked as I wanted, tabular output seems good enough. +# If ever anyone wants to implement this, need to sort at the end not above... +# std = np.std([tree.feature_importances_ for tree in booster.estimators_], axis=0) +# fig, ax = plt.subplots() +# forest_importances.plot.bar(yerr=std, ax=ax) +# ax.set_title("Feature importances using MDI") +# ax.set_ylabel("Mean decrease in impurity") +# fig.tight_layout() +# fig.savefig(os.path.join(options.mdir, "feature_importance.png")) + + forest_importances = pd.DataFrame(forest_importances).reset_index() + forest_importances.columns = ["Feature", "Importance"] + forest_importances.to_csv(os.path.join(options.mdir, "feature_importance.txt"), sep=' ', index=False) + +def keywords_from(proto): + """Transfer keyword arguments from `proto` to a dictionary. 
+ """ + result = {} + if proto.n_estimators: + result['n_estimators'] = proto.n_estimators + if proto.max_depth: + result['max_depth'] = proto.max_depth + if proto.min_samples_leaf: + result['min_samples_leaf'] = proto.min_samples_leaf + if proto.min_samples_split: + result['min_samples_split'] = proto.min_samples_split + if proto.n_jobs > 0: + result['n_jobs'] = proto.n_jobs + if proto.verbose: + result['verbose'] = True + + return result + +def classification(x, y, options: Options): + """build a classification model + """ + args = keywords_from(options.proto) + + logging.info("Build model with %s", args) + booster = RandomForestClassifier(**args) + booster.fit(x, y) + + logging.info("Saving model to %s", os.path.join(options.mdir, "random_forest.joblib")) + joblib.dump(booster, os.path.join(options.mdir, "random_forest.joblib")) + + if len(options.feature_importance): + do_feature_importance(booster, x.columns, options) + + return True + +def regression(x, y, options: Options): + """build a regression model. + """ + args = keywords_from(options.proto) + + logging.info("Build model with %s", args) + booster = RandomForestRegressor(**args) + booster.fit(x, y) + + logging.info("Saving model to %s", os.path.join(options.mdir, "random_forest.joblib")) + joblib.dump(booster, os.path.join(options.mdir, "random_forest.joblib")) + + if options.feature_importance: + do_feature_importance(booster, x.columns, options) + + return True + +def build_random_forest_model(descriptor_fname: str, + activity_fname: str, + options: Options)->bool: + """Build a random forest model on the data in `descriptor_fname` and + `activity_fname`. + This function does data preprocessing. 
+ """ + + descriptors = pd.read_csv(descriptor_fname, sep=' ', header=0, low_memory=False) + logging.info("Read %d rows and %d columns from %s", len(descriptors), + descriptors.shape[1], descriptor_fname) + activity = pd.read_csv(activity_fname, sep=' ', header=0) + logging.info("Read %d rows from %s", activity.shape[0], activity_fname) + + + descriptors.rename(columns={descriptors.columns[0]: "Id"}, inplace=True) + activity.rename(columns={activity.columns[0]: "Id"}, inplace=True) + combined = pd.concat([activity.set_index("Id"), + descriptors.set_index("Id")], axis=1, join='inner').reset_index() + if len(combined) != len(descriptors): + logging.error("Combined set has %d rows, need %d", len(combined), len(descriptors)) + return 1 + + if not os.path.isdir(options.mdir): + os.mkdir(options.mdir) + + y = combined.iloc[:,1].to_numpy() + + x = combined.iloc[:,2:] + features = x.columns + x.apply(pd.to_numeric).to_numpy() + + rc = False + if options.classification: + rc = classification(x, y, options) + else: + rc = regression(x, y, options) + + if not rc: + return False + + response = activity.columns[1] + + proto = random_forest_model_pb2.RandomForestModel(); + proto.model_type = "RF" + proto.classification = options.classification + proto.response = response + proto.parameters.CopyFrom(options.proto) + + for (column, feature) in enumerate(features): + proto.name_to_col[feature] = column + + with open(os.path.join(options.mdir, "model_metadata.txt"), "w") as f: + f.write(text_format.MessageToString(proto)) + with open(os.path.join(options.mdir, "model_metadata.dat"), "wb") as f: + f.write(proto.SerializeToString()) + + return True + +def main(argv): + """Build random_forest models from activity file and descriptor file. 
+ """ + if not FLAGS.activity: + logging.error("Must specifythe name of the activity file with the --activity option") + return False + if not FLAGS.mdir: + logging.error("Must specify the model directory via the --mdir option") + return False + # Descriptor file must be the only command line argument + if len(argv) == 1: + logging.error("Must specify the descriptor as a command line argument") + return False + + options = Options() + options.classification = FLAGS.classification + options.feature_importance = FLAGS.feature_importance + options.mdir = FLAGS.mdir + options.verbosity = FLAGS.rfverbosity + + # Build the proto first, and then anything that might overwrite it. + if FLAGS.proto: + if not options.read_proto(FLAGS.proto): + logging.error("Cannot read textproto parameters %s", FLAGS.proto) + return 1 + + if FLAGS.n_estimators: + options.proto.n_estimators = FLAGS.n_estimators + if FLAGS.max_depth: + options.proto.max_depth = FLAGS.max_depth + if FLAGS.min_samples_split: + options.proto.min_samples_split = FLAGS.min_samples_split + if FLAGS.min_samples_leaf: + options.proto.min_samples_leaf = FLAGS.min_samples_leaf + if FLAGS.n_jobs > 0: + options.proto.n_jobs = FLAGS.n_jobs + options.proto.verbose = FLAGS.rfverbosity + + # print(options.proto) + + if not build_random_forest_model(argv[1], FLAGS.activity, options): + logging.error("Model %s not build", options.mdir) + return 1 + + # zero return code for success. 
+ return 0 + +if __name__ == '__main__': + app.run(main) diff --git a/contrib/bin/xgbd/xgbd_evaluate.py b/contrib/bin/xgbd/xgbd_evaluate.py index 78a1653c..1cdc16ff 100644 --- a/contrib/bin/xgbd/xgbd_evaluate.py +++ b/contrib/bin/xgbd/xgbd_evaluate.py @@ -67,14 +67,14 @@ def xgboost_evaluate(mdir: str, fname: str)->bool: logging.info("Evaluating %d rows", len(data)) results = model.predict(data.iloc[:,1:]) - print(f"Id {response}") + print(f"Id XGBD_{response}") for i in range(len(results)): print(f"{data.iloc[i,0]} {results[i]:.4f}") return True def main(argv): - """Temporary tool to fix broken multi-fragment unique smiles problem + """Evaluate an xgboost descriptor model. """ if len(argv) == 1: logging.error("Must specify descriptor file as argument") @@ -88,4 +88,5 @@ def main(argv): return xgboost_evaluate(FLAGS.mdir, argv[1]) if __name__ == '__main__': + absl::InitializeLog() app.run(main) diff --git a/contrib/bin/xgbd/xgbd_evaluate.rb b/contrib/bin/xgbd/xgbd_evaluate.rb index 78ed1624..cb20c7c5 100644 --- a/contrib/bin/xgbd/xgbd_evaluate.rb +++ b/contrib/bin/xgbd/xgbd_evaluate.rb @@ -1,61 +1,108 @@ -# Front end for xgbd_evaluate.py +#!/usr/bin/env ruby -require_relative("../lib/iwcmdline.rb") +# Evaluate an xgboost model -def usage(rc) - exit(rc) -end +require 'set' +require 'google/protobuf' -def main - cl = IWCmdline.new("-v-mdir=dir-tmpdir=dir") +c3tk_home = ENV['C3TK_HOME'] +raise 'C3TK_HOME not defined' unless c3tk_home - verbose = cl.option_present("v") +require "#{c3tk_home}/bin/ruby/lib/iwcmdline" +require "#{c3tk_home}/bin/py/pytk/xgbd/xgboost_model_pb" - if cl.unrecognised_options_encountered() - $stderr << "unrecongised_options_encountered\n" - usage(1) - end +def usage +msg = <<-END +Scores an xgboost descriptor model built with xgbd_make. 
+Takes either a smiles file or a descriptor file, +If a smiles file is given, the descriptors required for the model will be computed by make_descriptors + -mdir model directory created by xgbd_make + -smi input is a smiles file - model descriptors will be computed. + -pipe use descriptor_pipeline for descriptor compuation, + only some descriptors are supported, but maybe faster. + -j send the -j option to make_descriptors.sh + -v verbose output +END + $stderr << msg - unless cl.option_present("mdir") - $stderr << "Must specify model directory via the -mdir option\n" - usage(1) - end + exit(0) +end + +# Return the proto mdir/model_metadata.dat +def get_model_metadata(mdir) + fname = File.join(mdir, 'model_metadata.dat') - mdir = cl.value("mdir") - $stderr << "mdir #{mdir}\n" - traindat = File.join(mdir, "train.dat") - unless File.size?(traindat) - $stderr << "Missing or empty training data file #{traindat}\n" + unless File.size?(fname) + $stderr << "Empty or missing model metadata #{fname}\n" return 1 end - tmpdir = if cl.option_present("tmpdir") - cl.value("tmpdir") - else - "/tmp" - end + return XgboostModel::XGBoostModel.decode(File.read(fname)) +end - if ARGV.size == 0 - $stderr << "Insufficient arguments\n" - usage(1) +def xgbd_evaluate_smiles(fname, mdir, proto, cl) + descriptors = Set.new() + proto.name_to_col.each do |name, col| + descriptors.add(name.gsub(/_.*/, "")) end - tmpfile = File.join(tmpdir, "xgbd_eval#{Process.pid}.dat") - cmd = "iwcut -D #{traindat} -v #{ARGV[0]} > #{tmpfile}" - $stderr << "Executing #{cmd}\n" if verbose - rc = system(cmd) - if rc && File.size?(tmpfile) + if cl.option_present('pipe') + cmd = 'descriptor_pipeline.sh' else - $stderr << "#{cmd} failed, did not generate #{tmpfile}\n" + cmd = 'make_descriptors.sh' end - python_script = __FILE__.gsub("\.rb", ".py") - cmd = "python #{python_script} -mdir #{mdir} #{tmpfile}" - $stderr << "Executing #{cmd}\n" if verbose + if cl.option_present('j') + j = cl.value('j') + cmd << " -j #{j}" 
+ end + + descriptors.each do |d| + cmd << " -#{d}" + end + cmd << " #{fname}" + cmd << "| xgboost_model_evaluate.sh -mdir #{mdir} -" + + $stderr << "Executing #{cmd}\n" if cl.option_present('v') + system(cmd) +end +def xgbd_evaluate_descriptors(fname, mdir, proto, cl) + cmd = "xgboost_model_evaluate.sh -mdir #{mdir} #{fname}" + + $stderr << "Executing #{cmd}\n" if cl.option_present('v') + system(cmd) +end + +def main + cl = IWCmdline.new("-v-mdir=dir-smi-pipe-j=ipos") + + if cl.unrecognised_options_encountered + $stderr << "unrecognised_options_encountered\n" + usage + end + + unless cl.option_present('mdir') + $stderr << "Must specify xgboost model directory via the -mdir option\n" + usage + end + + if ARGV.empty? + $stderr << "Insufficient arguments\n" + usage + end + + mdir = cl.value('mdir') + + proto = get_model_metadata(mdir) + + if cl.option_present('smi') || ARGV[0].match(/\.smi$/) + xgbd_evaluate_smiles(ARGV[0], mdir, proto, cl) + else + xgbd_evaluate_descriptors(ARGV[0], mdir, proto, cl) + end - File.unlink(tmpfile) end -main() +main diff --git a/contrib/bin/xgbd/xgbd_make.py b/contrib/bin/xgbd/xgbd_make.py index 9ad7247a..4fb8d180 100644 --- a/contrib/bin/xgbd/xgbd_make.py +++ b/contrib/bin/xgbd/xgbd_make.py @@ -1,7 +1,7 @@ # Build and commit an xgboost model # Deliberately simplistic in approach -import os +import os, sys import pandas as pd import sklearn @@ -21,16 +21,21 @@ FLAGS = flags.FLAGS flags.DEFINE_string("activity", "", "Name of training set activity file") -flags.DEFINE_string("desc", "", "Name of training set descriptor file") flags.DEFINE_boolean("classification", False, "True if this is a classification task") flags.DEFINE_string("mdir", "", "Directory into which the model is placed") flags.DEFINE_integer("max_num_features", 0, "Maximum number of features to plot in variable importance") flags.DEFINE_string("feature_importance", "", "File containing feature importance values") flags.DEFINE_integer("xgverbosity", 0, "xgboost verbosity") 
flags.DEFINE_string("proto", "", "A file containing an XGBoostParameters proto") -flags.DEFINE_float("eta", 0.3, "xgboost learning rate parameter eta") -flags.DEFINE_integer("max_depth", 6, "xgboost max depth") -flags.DEFINE_integer("n_estimators", 100, "xboost number of estimators") +flags.DEFINE_float("eta", 0.4, "xgboost learning rate parameter eta") +flags.DEFINE_integer("max_depth", 5, "xgboost max depth") +flags.DEFINE_integer("n_estimators", 500, "xboost number of estimators") +flags.DEFINE_float("subsample", 1.0, "subsample ratio for training instances") +flags.DEFINE_float("colsample_bytree", 1.0, "subsampling occurs once for every tree constructed") +flags.DEFINE_float("colsample_bylevel", 1.0, "subsampling occurs once for every new depth level reached") +flags.DEFINE_float("colsample_bynode", 1.0, "subsampling occurs once for every time a new split is evaluated") +flags.DEFINE_enum("tree_method", "auto", ["auto", "exact", "approx", "hist"], "tree construction method: auto exact approx hist") + class Options: def __init__(self): @@ -52,22 +57,47 @@ def read_proto(self, fname)->bool: return False return True + def classification(x, y, options: Options)->bool: """build a classification model + Args: + x: feature matrix + y: response - must be translated to 0,1. Not implemented... """ booster = XGBClassifier(verbosity=options.verbosity) booster.fit(x, y) + booster.save_model(os.path.join(options.mdir, "xgboost.json")) + def regression(x, y, options: Options): """build a regression model. 
""" + match options.proto.tree_method: + case xgboost_model_pb2.AUTO: + tree_method = 'auto' + case xgboost_model_pb2.EXACT: + tree_method = 'exact' + case xgboost_model_pb2.APPROX: + tree_method = 'approx' + case xgboost_model_pb2.HIST: + tree_method = 'hist' + case _: + tree_method = 'hist' + booster = XGBRegressor(verbosity=options.verbosity, eta=options.proto.eta, max_depth=options.proto.max_depth, - n_estimators = options.proto.n_estimators) + n_estimators=options.proto.n_estimators, + colsample_bytree=options.proto.colsample_bytree, + colsample_bylevel=options.proto.colsample_bylevel, + colsample_bynode=options.proto.colsample_bynode, + subsample=options.proto.subsample, + tree_method=tree_method + ) booster.fit(x, y) booster.save_model(os.path.join(options.mdir, "xgboost.json")) + logging.info("Saved model to %s", os.path.join(options.mdir, "xgboost.json")) if options.max_num_features: plot_importance(booster, max_num_features=options.max_num_features) pyplot.show() @@ -76,11 +106,9 @@ def regression(x, y, options: Options): feature_importance = sorted(feature_importance.items(), key=lambda x:x[1]) if options.feature_importance: with open(os.path.join(options.mdir, options.feature_importance), "w") as writer: - # Write a markdown table, easy to undo if needed. - print("| Feature | Weight |", file=writer) - print("| ------- | ------ |", file=writer) + print("Feature Weight", file=writer) for f, i in feature_importance: - print(f"| {f} | {i} |", file=writer) + print(f"{f} {i}", file=writer) # config = booster.save_config() @@ -94,27 +122,30 @@ def build_xgboost_model(descriptor_fname: str, This function does data preprocessing. 
""" - descriptors = pd.read_csv(descriptor_fname, sep=' ', header=0, low_memory=False) + descriptors = pd.read_csv(descriptor_fname, sep=' ', header=0, low_memory=False, na_values=['.']) logging.info("Read %d rows and %d columns from %s", len(descriptors), descriptors.shape[1], descriptor_fname) activity = pd.read_csv(activity_fname, sep=' ', header=0) logging.info("Read %d rows from %s", activity.shape[0], activity_fname) - descriptors.rename(columns={descriptors.columns[0]: "Id"}, inplace=True) - activity.rename(columns={activity.columns[0]: "Id"}, inplace=True) - combined = pd.concat([activity.set_index("Id"), - descriptors.set_index("Id")], axis=1, join='inner').reset_index() + descriptors.rename(columns={descriptors.columns[0]: "Name"}, inplace=True) + activity.rename(columns={activity.columns[0]: "Name"}, inplace=True) + combined = pd.concat([activity.set_index("Name"), + descriptors.set_index("Name")], axis=1, join='inner').reset_index() if len(combined) != len(descriptors): - logging.error("Combined set has %d rows", len(combined)) + logging.error("Combined set has %d rows, need %d", len(combined), len(descriptors)) return 1 if not os.path.isdir(options.mdir): os.mkdir(options.mdir) + combined.to_csv(os.path.join(options.mdir, "train.xy"), sep= ' ', index=False) + y = combined.iloc[:,1].to_numpy() x = combined.iloc[:,2:] + features = x.columns x.apply(pd.to_numeric).to_numpy() rc = False @@ -130,25 +161,44 @@ def build_xgboost_model(descriptor_fname: str, proto = xgboost_model_pb2.XGBoostModel(); proto.model_type = "XGBD" - proto.classification = False + proto.classification = options.classification proto.response = response proto.parameters.CopyFrom(options.proto) + + for (column, feature) in enumerate(features): + proto.name_to_col[feature] = column + with open(os.path.join(options.mdir, "model_metadata.txt"), "w") as f: f.write(text_format.MessageToString(proto)) + with open(os.path.join(options.mdir, "model_metadata.dat"), "wb") as f: + 
f.write(proto.SerializeToString()) return True +def option_present(flag)->bool: + """Return true if the option `flag` is in sys.argv + Args: + argv: usually the command line + flag: a command line option. We look for -flag and --flag in argv. + """ + if '-'+flag in sys.argv: + return True + if '--'+flag in sys.argv: + return True + return False + + def main(argv): """Build xgboost models from activity file and descriptor file. """ if not FLAGS.activity: logging.error("Must specifythe name of the activity file with the --activity option") return False - if not FLAGS.desc: - logging.error("Must specifythe name of the descriptor file with the --desc option") + if len(argv) == 1: + logging.error("Must specifythe name of the descriptor file as argument") return False if not FLAGS.mdir: - logging.error("Must specifyi the model directory via the --mdir option") + logging.error("Must specify the model directory via the --mdir option") return False options = Options() @@ -158,21 +208,59 @@ def main(argv): options.feature_importance = FLAGS.feature_importance options.verbosity = FLAGS.xgverbosity - # Build the proto first, and then anything that might overwrite it. + # Build the proto first. + # After that is done, we check for command line arguments that would + # over-ride what has come in from the proto. if FLAGS.proto: if not options.read_proto(FLAGS.proto): logging.error("Cannot read textproto parameters %s", FLAGS.proto) return False - else: + + # Overrides from the command line. + # If the proto does not have a value, just use the default from FLAGS. 
+ if not options.proto.HasField("eta"): options.proto.eta = FLAGS.eta + + if not options.proto.HasField("max_depth"): options.proto.max_depth = FLAGS.max_depth + + if not options.proto.HasField("n_estimators"): options.proto.n_estimators = FLAGS.n_estimators - if not build_xgboost_model(FLAGS.desc, FLAGS.activity, options): + if not options.proto.HasField("subsample"): + options.proto.subsample = FLAGS.subsample + + if not options.proto.HasField("colsample_bytree"): + options.proto.colsample_bytree = FLAGS.colsample_bytree + + if not options.proto.HasField("colsample_bylevel"): + options.proto.colsample_bylevel = FLAGS.colsample_bylevel + + if not options.proto.HasField("colsample_bynode"): + options.proto.colsample_bynode = FLAGS.colsample_bynode + + if option_present("tree_method"): + match FLAGS.tree_method: + case "auto": + options.proto.tree_method = xgboost_model_pb2.AUTO + case "exact": + options.proto.tree_method = xgboost_model_pb2.EXACT + case "approx": + options.proto.tree_method = xgboost_model_pb2.APPROX + case "hist": + options.proto.tree_method = xgboost_model_pb2.HIST + case _: # Cannot happen because this is DEFINE_enum + print(f"Unrecognised tree method {FLAGS.tree_method}", file=sys.stderr) + return False + elif not options.proto.HasField("tree_method"): + options.proto.tree_method = xgboost_model_pb2.AUTO + + if not build_xgboost_model(argv[1], FLAGS.activity, options): logging.error("Model %s not build", options.mdir) return False - return True + # zero return code for success. 
+ return 0 if __name__ == '__main__': app.run(main) diff --git a/contrib/bin/zof.sh b/contrib/bin/zof.sh index 803be142..2f8f6e2f 100755 --- a/contrib/bin/zof.sh +++ b/contrib/bin/zof.sh @@ -5,7 +5,7 @@ if [ -v LILLYMOL_HOME ] ; then true else - export LILLYMOL_HOME=$(dirname $(dirname $(dirname $0))) + export LILLYMOL_HOME=$(dirname $(dirname $(dirname $(readlink -e $0)))) fi ZARTLER=${LILLYMOL_HOME}/data/queries/zof diff --git a/contrib/script/py/__init__.py b/contrib/script/py/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/contrib/script/py/mmp/README.md b/contrib/script/py/mmp/README.md deleted file mode 100644 index e0582a88..00000000 --- a/contrib/script/py/mmp/README.md +++ /dev/null @@ -1,74 +0,0 @@ -# The Eli Lilly (LillyMol) Matched Molecular Pairs Toolkit - -## Contact -[JamesALumley](https://github.com/JamesALumley) - -## About -A toolkit of scripts and methods for Matched Molecular Pairs (MMP) analysis based on an in-memory implementation of the Hussain and Rea fragment indexing algorithm (JCIM 2010). - -An MMP is defined as a pair of molecules that differ in only a minor single point of chemical change. MMP Analysis is a method that compares the properties of these two molecules to derive a change or delta value associated with the chemical (fragment) change. - -A Transform can be defined as a specified chemical change and its statistical significance across many pairs of molecules (many MMPs). An example transformation is the replacement of one functional group by another (e.g.: Cl => Br) or the addition of a phenyl ring (e.g.: H => Phenyl). A useful Molecular transformation in a specified context is termed a significant transformation if it is seen to systematically decrease or increase a desired property (i.e.: The specified fragment change or Transform is similar in magnitude and direction for many example MMPs. Different metrics can be used to define significant. 
Transforms or design rules can be used as an aid to decision-making in the multiple parameter optimisation problem of small molecule drug design. - -## Getting Started - -### Prerequisites - -This python code depends on multiple binary executables from the core LillyMol code base such as dicer for fragmentation. Please follow the instructions to install and compile LillyMol first as per the root folder README file. Once LillyMol is installed and working the file pymo.py can be edited to correctly reference all binaries, typically using the environment variable LILLYMOL_HOME to point to the top directory containing the _ _contrib_ _ subdir. The pymo.py file acts as a file IO based python interface to the main LillyMol executables. The python path should then be set as below: - -``` -export PYTHONPATH=${LILLYMOL_HOME}/contrib/script/py -cd $LILLYMOL_HOME -cd bin -ln -s Linux-gcc-7.2.1 Linux -``` - -### Simple MMP / MMS Generation -The following scripts are included in this project: -* **getMMPfromSMI.py** - For an input SMILES file (Molecule SMILES and numeric ID) generate a CSV of Matched Molecular Pairs -* **getMMPStatsfromCSV.py** - For an input CSV file (Molecule SMILES, numeric ID and numerical data column) generate a CSV of Matched Molecular Pairs with associated delta for the given data column(s). Summarised (aggregated) transforms can additionally be requested. A second CSV file will be generated containing these transforms. Options are available for different statistical aggregation methods. - * MEAN_DIFF: The simplest form of aggregation would ensure the input bioactivity data is in log form, then calculate the mean of the difference (-A MEAN_DIFF). The [scipy students t-test](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.t.html) is used to calculate statistical significance. 
- * FOLD_CHANGE: For data that is already in the log scale, such as solubility (LogS = log10(Solubility mg/ml)), this method can be used to return the reverse log of the mean (10^x). This represents the average solubility ratio of substituted over the parent molecules for a given matched-pair. Lower values indicate the transformation is likely to reduce solubility, positive values vice-versa and values close to 1 indicate minimal change expected. - * DIFFXX: The diffxx metric is a scaled Index with upper and lower boundaries determined by the t.interval. It is for use with data like % Microsomal Metabolic Turnover (DIFF60) or Est IC50 for CYP Inhibition (DIFF50) and will be described elsewhere. - * CATEGORICAL: For categorical data, the method assumes two class categorical data with values 0 and 1. A textual value is output describing the number of pairs that have or have not seen a change in class. - * Atom Context: The aggregation type switch (-g) allows you to include the atom context in the aggregation and will most likely produce a greater number of summarised pairs. In some cases transforms will have improved predictive power using atom attachment points. - * Property Data: can be added via the -p switch (default is no property differences). This is based on the output from the LillyMol iwdescr binary and calculates difference (FRAG_R – FRAG_L) regardless of -A. - * Quality Metric: The quality estimate was designed for use with the commercial Biobyte clogP algorithm. As an alternative, the opensource LillyMol Abrahams AlogP can be used. It is used to estimate the quality of a transfrom from the perspective of chemical diversity and uses logP as a surrogate. Where L.clogp is the clogp of the parent (Left) compound and d.clogp is the delta clogp (Right – Left), we can defined the following: A good transform has >=15 contributing MMPs, a stdev of L.clogp >= 0.5 and when |d.clogp|>0, stdev of d.clogp>0. 
A medium quality transform has >=6 contributing MMPs, a stdev of L.clogp >= 0.3 and when |d.clogp>|0, stdev of d.clogp>0. Any other transform is classed as poor quality. -* **getMMPbasedMCSSfromSMI.py** - For an input SMILES file (Molecule SMILES and numeric ID) generate a CSV of the MMP that has the smallest change between the two molecules (by atom count). This is roughly equivalent to the Maximum Common Substructure (MCSS) between two molecules but is based on SMILES and is not as expressive as more flexible SMARTS based definition. -* **getMMPEnumeratedNewMols.py** - For an input SMI file (Molecule SMILES and numeric ID) mutate the molecule to create new idea molecules based on a prebuilt or custom set of Transforms. This code is useful in evolving molecules in an automated design scenario by applying a file of suggested/selected transforms. -* **getMMPSeriesfromCSV.py** - For an input CSV file (Molecule SMILES, numeric ID and numerical data column) generate a CSV of Matched Molecular Series. The file will print each series as a ordered series with one molecule per line with associated data. This is an automated approach to deriving new idea compounds with improved activity data. It is also an approach for SAR transfer. -* **getMMPSeriesSuggestfromCSV.py** - For an input SMI file and a second .series file (or a directory of .series files) search the input SMI for matched series that can be extended by the series in the second file(s). This is an automated approach to deriving new idea compounds with improved activity data and for SAR transfer. - -### Implementation Details -The python MMP code is based on the LillyMol dicer executable, a c-code based executable for molecule fragmentation. As molecule smiles are parsed by dicer, the resulting canonicalised fragments are processed and stored in a dictionary. The larger fragment is termed the context smiles and the smaller part the fragment. 
The dicer switch MAXFF=0.50001 is used and means that the ‘context’ or the bit of the matched pair that does not change, is defined as the larger part. Larger is >50.001% of the whole molecule (fraction of atoms). The resulting dictionary is keyed by the context smiles and an associated iterator can return all MMP's from the fully populated dictionary. Atoms involved in bond split/fragmentation are tagged with an isotopic label e.g.: the smiles [1CH3] which represents a terminal methyl group that has been cut/fragmented from a molecule with the C atom labelled with an isotopic label 1. Support for atom attachment points is included. The concept of denormalisation is used on the stored fragment/context smiles strings in order to conserve memory. A lookup dictionary is used to assign an arbitary id to all smiles strings and the id stored in the mmp dictionary to avoid repetitively storing larger smiles strings. The result is a fast mmp implementation that is not database coupled and therefore lighterweight than some other opensource implementations. The obvious downside is greater memory usage. 
- -### Examples - -``` -getMMPfromSMI.py --help - -getMMPfromSMI.py –i input.smi –o output.pairs - -getMMPStatsfromCSV.py --help - -getMMPStatsfromCSV.py –i input.csv –o output.pairs -s SMI_COLUMN -n ID_COLUMN -a DATA_COLUMN -A AGGREGATION_METHOD - -``` - - -## Running the tests - -Each class has it's own set of unit tests that can be triggered independently e.g.: -``` -python mmp_objects.py -python mmp_stats_functions.py -``` -Alternatively, integration tests that demonstrate the usage of the code can be run from the LillyMol test suite: -``` -cd $LILLYMOL_HOME/tests/getMMPStatsfromCSV/ -./run_test -``` - -## License - -Please see the LICENSE file in the root of the repository diff --git a/contrib/script/py/mmp/__init__.py b/contrib/script/py/mmp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/contrib/script/py/mmp/getMMPEnumeratedNewMols.py b/contrib/script/py/mmp/getMMPEnumeratedNewMols.py deleted file mode 100755 index 745857ff..00000000 --- a/contrib/script/py/mmp/getMMPEnumeratedNewMols.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python -############################################################## -""" -Summary: Input SMI to script. SMI will be fragmented using DICER -and the resulting R-groups/fragments will be used to query the MMPs -for MMP based improvements/transformations for command line specified -ADME end points (JAL) - -""" -############################################################# -import logging -import argparse -import sys -import os - -from mmp.mmp_enum_mols_from_pairs import MMPEnumerateNewMols - - -def main(): - - def restricted_dicer_float(x): - """check maxff switch on dicer is valid, where 0.3 is smaller - and 0.5 is default and 0.7 is huge/avoided""" - x = float(x) - if x < 0.01 or x > 0.99: - raise argparse.ArgumentTypeError("%r not in range [0.01, 0.99]" % (x,)) - return x - - parser = argparse.ArgumentParser(description='Enumerate new molecules from input SMI and a matched pairs file. 
') - - parser.add_argument("-i", nargs=1, required=True, - help="Specify input smiles file") - parser.add_argument("-p", nargs=1, required=True, - help="Specify input pairs file") - parser.add_argument("--frag_left_col", nargs=1, required=True, - help="Specify name of frag left column in input pairs file") - parser.add_argument("--frag_right_col", nargs=1, required=True, - help="Specify name of frag right column in input pairs file") - parser.add_argument("-o", nargs=1, required=True, - help="Specify output CSV filename") - parser.add_argument('-H', action='store_false', default=True, - help='Invoke flag to include all H transformations (usually lots!)') - parser.add_argument('-b', type=restricted_dicer_float, default=0.3, - help='Dicer Threshold for Fragmentation, default=0.3 for small side chain replace or set to ' - '0.50001 for larger fragment removal including core replacements') - parser.add_argument('-l', nargs=1, default=False, help='optional log filename') - parser.add_argument('-L', nargs=1, default='INFO', help='debug log level CRITICAL|ERROR|WARNING|INFO|DEBUG') - - args = parser.parse_args() - - # create logger - mmplogger = logging.getLogger('lillymol_file_logger') - - log_level = args.L[0] - - if args.l: - log_file = args.l[0] - - else: - log_file = None - log_level = None - - if log_file is not None: - - fh = logging.FileHandler(log_file) - - if log_level is None: - mmplogger.setLevel(logging.INFO) - - else: - numeric_level = getattr(logging, log_level.upper(), None) - if not isinstance(numeric_level, int): - raise ValueError('Invalid log level: %s' % log_level) - mmplogger.setLevel(numeric_level) - - formatter = logging.Formatter('%(asctime)s %(levelname)s %(module)s: %(message)s', - datefmt='%Y/%m/%d %I:%M:%S %p') - - fh.setFormatter(formatter) - - mmplogger.addHandler(fh) - - else: - logging.disable(logging.CRITICAL) - - ####################### - # main stuff - ####################### - - # print('Instantiating MMP ADME DB Objects' - 
mmplogger.info('Instantiating Enumeration Object') - mmp_newmol_object = MMPEnumerateNewMols(mmplogger) - - # read in the input smiles then fragment - mmp_newmol_object.scan_input_smiles(args.i[0], injest=True) - # fragment - mmp_newmol_object.fragment_reference_smi('BOTH', args.b, exclude_h_subst=args.H) - - # get transformations for enumeration - temp_dict = mmp_newmol_object.pairs_file_to_dict(args.p[0], - frag_l_col=args.frag_left_col[0], - frag_r_col=args.frag_right_col[0], - exclude_h_subst=args.H) - - mmp_newmol_object.add_transformation_group('pairs_from_file', temp_dict) - - # enumerate - mmp_newmol_object.write_rxn_files() - mmp_newmol_object.write_reactants_mol_frag_dict() - mmp_newmol_object.do_reactions() - mmp_newmol_object.write_products_to_csv(args.o[0]) - - -if __name__ == '__main__': - main() diff --git a/contrib/script/py/mmp/getMMPSeriesSuggestfromCSV.py b/contrib/script/py/mmp/getMMPSeriesSuggestfromCSV.py deleted file mode 100755 index 7c6cc586..00000000 --- a/contrib/script/py/mmp/getMMPSeriesSuggestfromCSV.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python -############################################################################## -# -# Summary: -# Generate all Matched Series from input CSV file. 
Take the query molecules -# and search for all possible series extensions (within set SAR transfer) -# -# Usage: -# getMMPSeriesSuggestfromCSV.py -i test.csv -o test.ideas -s SMILES -n ID -a PIC50 -m 4 -# -############################################################################## -import argparse -import sys -import logging - -from mmp.mmp_series_object import MMPSeriesObjectClass - - -def main(): - - def restricted_float(x): - x = float(x) - if x < 0.1 or x > 0.99: - raise argparse.ArgumentTypeError("%r not in range [0.1, 0.99]" % (x,)) - return x - - def restricted_int(x): - x = int(x) - if x < 1 or x > 99: - raise argparse.ArgumentTypeError("%r not in range [1, 99]" % (x,)) - return x - - parser = argparse.ArgumentParser(description='Generate new ideas using Matched Series data for 1 or more input SMI ' - 'and a set of compounds to derive MMS from (within set MMS Analysis).' - 'For removal of poor quality matches use: -p 5 -f which is consistent ' - 'with Keefer and Chang MedChemComm 2017 where candidate (match) ' - 'series must have >=5 compounds, activity spread >= 0.5 and skew <= 3') - # This is essential - parser.add_argument('-i', required=True, help='CSV: Input CSV file of smiles and activity data') - parser.add_argument('-s', required=True, help='Column name for Molecule SMILES') - parser.add_argument('-n', required=True, help='Column name for Molecule ID') - parser.add_argument('-a', required=True, help='Column name for Activity Data') - parser.add_argument('-o', required=True, help='Output file of enumerated new ideas with scored MMS data') - # options - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('-x', type=str, default=None, - help='Quoted, Comma delimited list of 3 or more Mol IDs with no gaps or spaces e.g.: ' - '\"12345,12346,12347\". These Mol IDs must be present in the CSV. 
An alternative is to ' - 'use auto_search (-y) which scans series created from input CSV and uses them to derive ' - 'query series.') - group.add_argument('-y', action='store_true', default=False, - help='Invoke this flag instead of -x if you want to run an auto_search') - group.add_argument('-w', action='store_true', default=False, - help='Invoke this flag instead of -x or -y if you want to run an auto_search_fullscan') - # additional option - parser.add_argument('-z', type=str, default=None, - help='Fully qualified path containing one or more series files to use as reference set for ' - 'SAR Transfer - external set transfer. If this is defined, will run external set transfer ' - 'not within set transfer.') - # limits on series length and other matching options - parser.add_argument('-p', type=restricted_int, default=5, - help='Invoke to set a min limit on the reported series length (default >= 5). The minimum ' - 'allowed value is 3 as a value of 2 will simple find matched pairs. For the auto search ' - 'method, this switch will be used as the series length for the auto search query series ' - 'and -q will be used as the max length.') - parser.add_argument('-q', type=restricted_int, default=10, - help='Invoke to set a max limit on the search series length (default <= 10). This affects ' - 'auto search only and combined with -p defines the max and mix extent of all the ' - 'series that will be autogenerated from existing series and used as search queries ' - 'to try and identify further functional groups that can be made with greater activity.') - parser.add_argument('-m', action='store_true', default=False, - help='Invoke flag to force strict, ordered matching of fragments in series (default = False). ' - 'If False, series Me,F,Cl,Br will be allowed to match Me,Cl,F,Br i.e.: number and type' - 'of fragments must be the same by order does not have to match. 
Output will include ' - 'Spearmans rho to allow assessment of rank correlation for matched series pairs') - # - parser.add_argument('-f', action='store_true', default=False, - help='Invoke flag to apply range and skew filters to initial query MMS') - # - parser.add_argument('-t', action='store_true', default=False, - help='Invoke this flag (no value needed) if you want a second header line with datatypes') - # - parser.add_argument('-d', type=restricted_float, default=0.3, - help='Dicer maxff cutoff with default value of 0.3 (remove a maximum of 30 pct of each mol as' - ' the fragment).') - # - parser.add_argument('-c', type=str, default='BOTH', choices=['SINGLE', 'DOUBLE', 'BOTH'], - help='Specify cut type, SINGLE, DOUBLE or default = BOTH') - # - parser.add_argument('-l', help='optional log filename', default=False) - parser.add_argument('-L', help='debug log level CRITICAL|ERROR|WARNING|INFO|DEBUG', default='INFO') - - args = parser.parse_args() - - # create logger - mmplogger = logging.getLogger('lillymol_file_logger') - - log_level = args.L - - if args.l: - log_file = args.l - else: - log_file = None - log_level = None - - if log_file is not None: - - fh = logging.FileHandler(log_file) - - if log_level is None: - mmplogger.setLevel(logging.INFO) - - else: - numeric_level = getattr(logging, log_level.upper(), None) - if not isinstance(numeric_level, int): - raise ValueError('Invalid log level: %s' % log_level) - mmplogger.setLevel(numeric_level) - - formatter = logging.Formatter('%(asctime)s %(levelname)s %(module)s: %(message)s', - datefmt='%m/%d/%Y %I:%M:%S %p') - fh.setFormatter(formatter) - mmplogger.addHandler(fh) - - else: - logging.disable(logging.CRITICAL) - - ####################### - # main stuff # - ####################### - mmplogger.info('Instantiating MMP Suggest Object') - mmp_suggest_object = MMPSeriesObjectClass(mmplogger) - - # parse series from directory of files or generate from CSV - - use_comparison_df = False - - 
mmp_suggest_object.setup_mmp_data_for_mms(args.i, args.s, args.n, args.a, args.p, args.d, cut_type=args.c) - mmp_suggest_object.generate_store_mmp_series(apply_pre_filter=args.f) - - if args.z is not None: - mmp_suggest_object.setup_pregenerated_series_data_for_mms(args.z) - use_comparison_df = True - - # check the input list of Mol IDs is valid - if args.x is not None: - - mol_id_list = args.x - mol_id_list = mol_id_list.split(",") - if len(mol_id_list) < 3: - sys.exit("Please specify 3 or more Mol ID's as a (quoted) comma delimited list e.g.: /'12345,12346,12347/'") - mol_id_list = [int(x) for x in mol_id_list] - - mmplogger.info('Initiating search for MMS in reference dir set, using molid list input') - result_df = mmp_suggest_object.search_for_mms_to_extend_molids(mol_id_list, - strict_order=args.m, - use_comparison_df=use_comparison_df) - if result_df.empty: - print("No results found: query series had no matches against the target set") - else: - result_scored = None - result_scored = mmp_suggest_object.return_scored_series_dataframe(mol_id_list, result_df, result_scored) - mmp_suggest_object.write_series_dataframe(result_scored, args.o) - - # args.y must be true - elif args.y: - result_df = mmp_suggest_object.auto_search(args.q, args.p, - strict_ordering=args.m, - use_comparison_df=use_comparison_df) - mmp_suggest_object.auto_search_write(result_df, args.o) - - elif args.w: - # print "Running full auto search" - result_df = mmp_suggest_object.auto_search_fullscan(strict_ordering=args.m, - use_comparison_df=use_comparison_df) - mmp_suggest_object.auto_search_write(result_df, args.o) - - else: - sys.exit('Error, invalid option, please try -x, -y or -w') - - -if __name__ == "__main__": - main() diff --git a/contrib/script/py/mmp/getMMPSeriesfromCSV.py b/contrib/script/py/mmp/getMMPSeriesfromCSV.py deleted file mode 100755 index bec93519..00000000 --- a/contrib/script/py/mmp/getMMPSeriesfromCSV.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python 
-############################################################################## -# -# Summary: -# Generate all Matched Series from input SMI file and prints to output -# -# Example usage: -# python compare_mmp_sum_files.py -# -i input1.pairs.sum input2.pairs.sum -# -o output_diffs.csv -# -############################################################################## -import argparse -import sys -import logging - -from mmp.mmp_series_object import MMPSeriesObjectClass - - -def main(): - - def restricted_float(x): - x = float(x) - if x < 0.1 or x > 0.99: - raise argparse.ArgumentTypeError("%r not in range [0.1, 0.99]" % (x,)) - return x - - parser = argparse.ArgumentParser(description='Generate Matched Series data from an input CSV. ' - 'Script prints all valid series but does not exhaustively enumerate ' - 'all sub-series of a series as this is too combinatorially explosive. ' - 'Use output with script getMMPSeriesSuggestfromCSV to utilise data. ' - '-m / -f flags sets as per Keefer and Chang MedChemComm 2017 where a ' - 'series must have >=5 compounds, activity spread >= 0.5 and skew <= 3') - parser.add_argument('-i', nargs=1, required=True, help='Input CSV file of smiles and activity data') - parser.add_argument('-o', nargs=1, required=True, help='Output filename to write series to') - parser.add_argument('-s', nargs=1, required=True, help='Column name for Molecule SMILES') - parser.add_argument('-n', nargs=1, required=True, help='Column name for Molecule ID') - parser.add_argument('-a', nargs=1, required=True, help='Column name for Activity Data') - parser.add_argument('-m', type=int, default=5, - help="Invoke to set a min limit on the reported series length (default >= 5). " - "The minimum allowed value is 3 as a value of 2 will simple find matched pairs") - parser.add_argument('-t', type=restricted_float, default=0.50001, - help="Set the percentage of the molecule to be retained as context. 
Use smaller values (0.3) " - "for small side chain replacements only, default 0.50001") - parser.add_argument('-c', type=str, default='BOTH', choices=['SINGLE', 'DOUBLE', 'BOTH']) - parser.add_argument('-l', nargs=1, help='optional log filename', default=False) - parser.add_argument('-L', nargs=1, choices=['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'], default='INFO') - - args = parser.parse_args() - - input_file = args.i[0] - smi_col = args.s[0] - id_col = args.n[0] - act_col = args.a[0] - - if args.m <= 2: - parser.print_help() - sys.exit('Please ensure your min series length (-m) value is greater than 2') - - mmplogger = logging.getLogger('lillymol_file_logger') - - log_level = args.L[0] - - if args.l: - log_file = args.l[0] - - else: - log_file = None - log_level = None - - if log_file is not None: - - fh = logging.FileHandler(log_file) - - if log_level is None: - mmplogger.setLevel(logging.INFO) - - else: - numeric_level = getattr(logging, log_level.upper(), None) - if not isinstance(numeric_level, int): - raise ValueError('Invalid log level: %s' % log_level) - mmplogger.setLevel(numeric_level) - - formatter = logging.Formatter('%(asctime)s %(levelname)s %(module)s: %(message)s', - datefmt='%m/%d/%Y %I:%M:%S %p') - - fh.setFormatter(formatter) - - mmplogger.addHandler(fh) - - else: - logging.disable(logging.CRITICAL) - - ####################### - # main stuff - ####################### - - mmplogger.info('Instantiating MMP Series Objects') - mmp_series_object = MMPSeriesObjectClass(mmplogger) - - # setup - mmp_series_object.setup_mmp_data_for_mms(input_file, smi_col, id_col, act_col, args.m, args.t, cut_type=args.c) - - # write raw series - mmp_series_object.write_raw_series_to_file(csv_out=args.o[0]) - - -if __name__ == "__main__": - - main() diff --git a/contrib/script/py/mmp/getMMPStatsfromCSV.py b/contrib/script/py/mmp/getMMPStatsfromCSV.py deleted file mode 100755 index 40e5e550..00000000 --- a/contrib/script/py/mmp/getMMPStatsfromCSV.py +++ /dev/null 
@@ -1,268 +0,0 @@ -#!/usr/bin/env python -################################################################### -# Summary: Script to generate matched moleular pairs from input smiles -# -############################################################# -import argparse -import os -import sys -import logging - -from mmp.mmp_pairs_objects import MMPPairsObjectClass, validate_agg_method - - -def create_parser(): - - parser = argparse.ArgumentParser(description='Generate matched molecular pairs and summarised Transforms from CSV') - parser.add_argument("-i", - required=True, - help="Input CSV filename") - parser.add_argument("-o", - help="Output filename to write pairs and stats data") - parser.add_argument("-s", "--smiles", - required=True, - help="Column name for Molecule SMILES") - parser.add_argument("-n", "--molid", - required=True, - help="Column name for Molecule ID") - parser.add_argument("-d", "--diff_calc", - default='DIFF', - choices=['DIFF', 'RATIO'], - help="Difference calc DIFF|RATIO (DIFF = FRAG_R - FRAG_L / RATIO = FRAG_R / FRAG_L), default = DIFF") - parser.add_argument("-c", "--cut_type", - default="BOTH", - choices=['SINGLE', 'DOUBLE', 'BOTH'], - help="SINGLE|DOUBLE|BOTH (default = BOTH)") - parser.add_argument("-f", "--filter", - default="NONE", - choices=['REMOVE_NONRINGS', 'REMOVE_RINGS', 'NONE'], - help="REMOVE_NONRINGS|REMOVE_RINGS|NONE, default = NONE",) - parser.add_argument("-l", "--log_file", - help="Name of file to place log info in") - parser.add_argument("-L", "--log_level", - help="CRITICAL|ERROR|WARNING|INFO|DEBUG") - parser.add_argument("-k", "--skip_col_check", - help="Skip auto detect of numeric columns. 
Code will attempt to get diff data on all/any column.", - action="store_true", - default=False) - parser.add_argument("-t", "--types_header", - help="Invoke this flag (no value needed) if you want a second header line with datatypes", - action="store_true", - default=False) - parser.add_argument("-a", "--act_data_col", - help="Optional flag to specify column to calculate differences on, else will do all columns") - parser.add_argument("-A", "--agg_method", - default=False, - help="Invoke flag with value MEAN_DIFF|MEAN_DIFF_INVLOG|CATEGORICAL|DIFF50|DIFFXX to get summarised" - " frag pairs, also needs -a. MEAN simply calculates the mean of the input data (after " - " aggregation). MEAN_DIFF calculates MEAN, confidence limits are added, input should already " - " be in the Log scale (e.g. LogD). MEAN_DIFF_INVLOG also inverts the resulting mean out of log" - " scale to get FOLD_CHANGE (e.g.: Solubility, Clearance). CATEGORICAL expects binary 0/1 two " - " category data (e.g.: Permeation). DIFFXX where 0 < X < 100 implements an index function, for" - " use with CYP Pc Inhibition & metabolism data.") - parser.add_argument("-g", "--agg_type", - default="FRAG", - choices=['FRAG', 'ATTACH'], - help="Invoke flag with FRAG|ATTACH to exclude|include atom attachment points in aggregation, " - "default = FRAG") - parser.add_argument("-p", "--prop_data", - default='NONE', - choices=['BASIC', 'ALL'], - help="Invoke flag with BASIC|ALL to get MWT/CLP (BASIC) or full iwdecr property data (ALL) diffs") - parser.add_argument("-q", "--add_quality", - dest="add_quality", - help="Invoke flag if you want to add Quality metric to summarised pairs", - action="store_true", default=False) - parser.add_argument("-x", "--del_id_dupes", - help="Invoke flag if you want to keep and count duplicate positional isomer fragL->fragR changes" - "for same ID", - action="store_false", - default=True) - parser.add_argument("-y", "--get_low_n_pairs", - help="Invoke flag if you want to get a summarised data 
point for pairs with N<3 with DIFFxx and " - "MEAN_DIFF Pc calcs. Other method always return values for low N pairs. This flag impacts speed, " - "particularly for DIFFxx due to calculation of confidence intervals. At a rough estimate the " - "execution time of the script will increase by <=x Pc where x is equal to the number of summarised " - "pairs with n<3. When 50 Pc of summarised pairs have n<3 script is <=50 Pc slower.", - action="store_true", - default=False) - - args = parser.parse_args() - - # rely on module level function to validate this option - if args.agg_method: - args.agg_method = args.agg_method.upper() # validate this below using module level function - validate_agg_method(args.agg_method) - - # check -A and -a flags - # check this behaviour early on rather than half way through the code as the pairs - # aggregations code on the imported object will replicate this behaviour - if args.agg_method and args.act_data_col is None: - parser.print_help() - parser.error('Can only invoke the -A when you specify the activity column using -a (because it can be slow!)') - sys.exit() - - if (args.add_quality is True and args.prop_data.upper() == 'NONE') or \ - (args.add_quality is True and args.agg_type is False): - parser.print_help() - parser.error( - 'Can only generate Quality metric when using -A (aggregation) and -p (properties), consult --help\n') - sys.exit() - - if args.agg_type: - args.agg_type = args.agg_type.upper() - - return parser - -# -# Main -# -def main(): - - parser = create_parser() - args = parser.parse_args() - - if str(args.prop_data).upper() == 'BASIC': - add_prop_diff = 1 - elif str(args.prop_data).upper() == 'ALL': - add_prop_diff = 2 - else: - add_prop_diff = 0 - - if args.diff_calc.upper() == 'RATIO': - fold_diff = True - else: - fold_diff = False - - mmplogger = logging.getLogger('lillymol_file_logger') - - if args.log_file is not None: - - fh = logging.FileHandler(args.log_file) - - if args.log_level is None: - 
mmplogger.setLevel(logging.INFO) - else: - numeric_level = getattr(logging, args.log_level.upper(), None) - if not isinstance(numeric_level, int): - raise ValueError('Invalid log level: %s' % args.log_level) - mmplogger.setLevel(numeric_level) - - formatter = logging.Formatter('%(asctime)s %(levelname)s %(module)s: %(message)s', - datefmt='%m/%d/%Y %I:%M:%S %p') - - fh.setFormatter(formatter) - mmplogger.addHandler(fh) - - else: - logging.disable(logging.CRITICAL) - # instantiate objects - mmplogger.info('Instantiating the class') - my_mmp_data_object = MMPPairsObjectClass(mmplogger) - - # parse CSV - mmplogger.info('Will now parse CSV into the object') - - # my_mmp_data_object.csv_to_pair_and_data_objects(args.i, args.smiles, args.molid, args.cut_type.upper(), args.filter) - if args.skip_col_check: - my_mmp_data_object.csv_sniffer(args.i, args.smiles, args.molid, skip_num_check=True) - else: - my_mmp_data_object.csv_sniffer(args.i, args.smiles, args.molid) - - if args.act_data_col is None and add_prop_diff == 0: - my_mmp_data_object.csv_to_data_objects(args.i, args.smiles, args.molid) - - elif args.act_data_col is None and add_prop_diff > 0: - my_mmp_data_object.csv_to_data_objects(args.i, args.smiles, args.molid, add_prop_diff=add_prop_diff) - - elif args.act_data_col is not None and add_prop_diff == 0: - my_mmp_data_object.csv_to_data_objects(args.i, args.smiles, args.molid, act_data_col=args.act_data_col) - - else: - my_mmp_data_object.csv_to_data_objects(args.i, args.smiles, args.molid, act_data_col=args.act_data_col, - add_prop_diff=add_prop_diff) - - tmp_dicer_file = my_mmp_data_object.write_mol_smi_dict_tofile() - my_mmp_data_object.build_from_dicer(tmp_dicer_file, args.cut_type.upper(), args.filter) - - # get pairs - mmplogger.info('Now getting pairs and writing to file: %s' % args.o) - if args.types_header is True: - my_mmp_data_object.get_pairs_and_diffs(args.o, args.cut_type.upper(), - fold_diff=fold_diff, - inc_types_header=args.types_header, - 
add_qmetric=args.add_quality) - - else: - my_mmp_data_object.get_pairs_and_diffs(args.o, args.cut_type.upper(), - fold_diff=fold_diff, - add_qmetric=args.add_quality) - - mmplogger.info('Completed generation of raw pairs.') - - # Now do pair aggregation if requested, consider - # Essential: agg_type / agg_method - # Optional and interdependent: prop_data (default False) / args.act_data_col (default False) - # - if args.agg_method: - - ######################################################### - # free up some memory - # - Running on HPC environment, with 66,514 smi -> 14,514,719 pairs I get various MemoryError returns - # from python/pandas specifically as we do the aggregation so need to free up some space - ######################################################### - # This should empty the object out, we'll then read back from CSV - mmplogger.info('Cleaning out all objects from memory') - my_mmp_data_object.clean_out_data() - - mmplogger.info('Now getting summarised/aggregated pairs') - - mmplogger.debug('Generating pandas dataframe from the CSV file we just wrote') - # seems like this is very slow - # my_mmp_data_object.pairsdataobj_to_pd(cut_type, args.act_data_col) - # try this instead - my_mmp_data_object.pd_read_csv(args.o) - - mmplogger.debug('Done reading CSV, now aggregating') - - if args.act_data_col is None: - - if add_prop_diff > 0: - my_mmp_data_object.pd_aggregate_pairs_to_csv(args.o+'.sum', args.agg_type, - prop_data=True, - agg_method=args.agg_method, - remove_id_dupes=args.del_id_dupes, - inc_low_n_vals=args.get_low_n_pairs, - add_qmetric=args.add_quality) - else: - my_mmp_data_object.pd_aggregate_pairs_to_csv(args.o+'.sum', args.agg_type, - agg_method=args.agg_method, - remove_id_dupes=args.del_id_dupes, - inc_low_n_vals=args.get_low_n_pairs, - add_qmetric=args.add_quality) - else: - if add_prop_diff > 0: - my_mmp_data_object.pd_aggregate_pairs_to_csv(args.o+'.sum', args.agg_type, - prop_data=True, - agg_method=args.agg_method, - 
act_col=args.act_data_col, - remove_id_dupes=args.del_id_dupes, - inc_low_n_vals=args.get_low_n_pairs, - add_qmetric=args.add_quality) - else: - my_mmp_data_object.pd_aggregate_pairs_to_csv(args.o+'.sum', args.agg_type, - agg_method=args.agg_method, - act_col=args.act_data_col, - remove_id_dupes=args.del_id_dupes, - inc_low_n_vals=args.get_low_n_pairs, - add_qmetric=args.add_quality) - - mmplogger.info('Completed generation of summarised pairs.') - - -# -# -if __name__ == '__main__': - - main() - sys.exit() diff --git a/contrib/script/py/mmp/getMMPbasedMCSSfromSMI.py b/contrib/script/py/mmp/getMMPbasedMCSSfromSMI.py deleted file mode 100755 index 76977d75..00000000 --- a/contrib/script/py/mmp/getMMPbasedMCSSfromSMI.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python - -################################################################### -# Summary: Script to generate largest MCSS based MMP for each pair -# using input smiles -# -# Example Usage: -# python getMCSSbasedMMPfromSMI.py -i input.smi -o output.arff -# -c double -f remove_nonrings -# -############################################################# -import argparse -import os -import sys -import logging - -from mmp.mmp_mcss_objects import MMPbasedMCSSObjectClass - - -# -# Main -# -def main(): - # - parser = argparse.ArgumentParser(description='Generate MMP based MCSS from SMI input') - parser.add_argument("-i", "--smi_file", - required=True, - help="Input smiles file") - parser.add_argument("-o", "--out_file", - required=True, - help="Output pairs file CSV") - parser.add_argument("-x", "--mdc_atm_hard", - type=int, - help="Max Double Cut Atom cutoff (Hard)" - "This implements a filter that will remove *any* double cuts (and therefore MCSS) where one " - "half has num_atoms <= mdc_atm_hard. This is a hard cutoff and may therefore mean you will " - "fail to find any MCSS between two different compounds if the only valid MCSS is a double " - "cut with num_atoms <= mdc_atm_hard. 
This is not a recommended option, use mdc_atm_soft ", - default=None) - parser.add_argument("-s", "--mdc_atm_soft", - type=int, - help="Max Double Cut Atom cutoff (Soft)" - "* must be used with mdc_atm_soft_threshold *" - "When a double cut has a greater number of atoms than a single cut, the double cut will be " - "discarded in preference to the smaller number of atom single cut only when (a) and (b): " - "(a) either part of the double cut context has num_atoms <= mdc_atm_soft " - "(b) total double cut atom <= (single cut atoms + mdc_atm_soft_threshold) ", - default=None) - parser.add_argument("-t", "--mdc_atm_soft_threshold", - type=int, - help="Threshold value for mdc_atm_soft" - "* must be used with mdc_atm_soft *" - "See help text for mdc_atm_soft", - default=None) - parser.add_argument("-c", "--cut_type", - default="BOTH", - choices=['SINGLE', 'DOUBLE', 'BOTH'], - help="SINGLE|DOUBLE|BOTH (default = BOTH)") - parser.add_argument("-f", "--filter", - default="NONE", - choices=['REMOVE_NONRINGS', 'REMOVE_RINGS', 'NONE'], - help="REMOVE_NONRINGS|REMOVE_RINGS|NONE, default = NONE") - parser.add_argument("-l", "--log_file", dest="log_file", help="Name of file to place log info in") - parser.add_argument("-d", "--log_level", dest="log_level", help="CRITICAL|ERROR|WARNING|INFO|DEBUG") - - opts = parser.parse_args() - smi_fi = opts.smi_file - out_fi = opts.out_file - # minimum double cut (mdc) atoms == The minimum number of atoms allowed in any double cut fragment - madc_hard = opts.mdc_atm_hard - madc_soft = opts.mdc_atm_soft - madc_soft_threshold = opts.mdc_atm_soft_threshold - cut_type = opts.cut_type.upper() - filter_type = opts.filter - - # should be a group? 
- if (madc_soft is None) and (madc_soft_threshold is not None) or\ - (madc_soft is not None) and (madc_soft_threshold is None): - parser.print_help() - parser.error('Please specify both -s and -t together, you cannot use one or the other, consult --help\n') - sys.exit() - - # get log details if they are defined or set to None - log_file = opts.log_file - log_level = opts.log_level - - mmplogger = logging.getLogger('lillymol_file_logger') - - if log_file is not None: - - fh = logging.FileHandler(log_file) - - if log_level is None: - mmplogger.setLevel(logging.INFO) - else: - numeric_level = getattr(logging, log_level.upper(), None) - if not isinstance(numeric_level, int): - raise ValueError('Invalid log level: %s' % log_level) - mmplogger.setLevel(numeric_level) - - formatter = logging.Formatter('%(asctime)s %(levelname)s %(module)s: %(message)s', - datefmt='%m/%d/%Y %I:%M:%S %p') - - fh.setFormatter(formatter) - mmplogger.addHandler(fh) - - else: - logging.disable(logging.CRITICAL) - - # - mmplogger.info('Instantiating MMP Objects') - my_mmp_mcss_object = MMPbasedMCSSObjectClass(mmplogger) - - # pre-check the smiles for for odd stuff - # if prescan_smi: - my_mmp_mcss_object.scan_input_smiles(smi_fi) - - mmplogger.info('Parse Dicer Fragments to Pairs') - my_mmp_mcss_object.build_from_dicer(smi_fi, cut_type, filter_type) - mmplogger.info('Get NAtoms for all frags') - my_mmp_mcss_object.enumerate_fragment_properties() - mmplogger.info('Write out final pairs') - - if madc_soft is not None: - my_mmp_mcss_object.get_largest_mcs_pairs(out_fi, cut_type, mdc_atm_soft=madc_soft, - mdc_atm_soft_threshold=madc_soft_threshold) - - elif madc_hard is not None: - my_mmp_mcss_object.get_largest_mcs_pairs(out_fi, cut_type, mdc_atm_hard=madc_hard) - - else: - my_mmp_mcss_object.get_largest_mcs_pairs(out_fi, cut_type) - - mmplogger.info('Complete.') - - -if __name__ == '__main__': - main() - sys.exit() diff --git a/contrib/script/py/mmp/getMMPfromSMI.py 
b/contrib/script/py/mmp/getMMPfromSMI.py deleted file mode 100755 index a7741c9c..00000000 --- a/contrib/script/py/mmp/getMMPfromSMI.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python - -################################################################### -# Summary: Script to generate matched moleular pairs from input smiles -# -# Example Usage: -# python getMMPfromSMI.py --smi_file /home/my_username/input.smi -# python getMMPfromSMI.py -i input.smi -o output.csv -# python getMMPfromSMI.py -i input.smi -o output.csv -c double -f remove_nonrings -r rdkit -# -# -############################################################# - -import argparse -import sys -import logging - -from mmp.mmp_objects import MMPObjectClass - -# -# Main -# -def main(): - - parser = argparse.ArgumentParser(description='Generate matched molecular pairs from input smiles') - parser.add_argument("-i", required=True, help="Input smiles file") - parser.add_argument("-o", required=True, help="Output pairs file CSV") - parser.add_argument("-c", default='BOTH', - choices=['SINGLE', 'DOUBLE', 'BOTH', 'single', 'double', 'both'], - help="SINGLE|DOUBLE|BOTH (default = BOTH)") - parser.add_argument("-f", default='NONE', - help="REMOVE_NONRINGS|REMOVE_RINGS|NONE") -# parser.add_option("-e", "--prescan_smi", dest="prescan_smi", action="store_true", \ -# help="Invoke if you want to avoid error check of smiles file via pre-scan", default=True) - parser.add_argument('-b', type=float, default=0.50001, - help='Dicer Threshold for Fragmentation, default = 0.50001') - parser.add_argument("-l", default=None, - help="Name of file to place log info in") - parser.add_argument("-d", default=None, - help="CRITICAL|ERROR|WARNING|INFO|DEBUG") - parser.add_argument("-t", action="store_true", default=False, - help="Invoke this flag (no value needed) if you want a second header line with datatypes") - - args = parser.parse_args() - types_header = args.t - - smi_fi = args.i - out_fi = args.o - - # check cut type is 
valid - cut_type = args.c - - cut_type_options = ['SINGLE', 'DOUBLE', 'BOTH'] - if not any(opt in cut_type.upper() for opt in cut_type_options): - parser.print_help() - parser.error('Need to specify a valid cut type consult --help\n') - sys.exit() - - # check filter type is valid - filter_type = args.f - filter_type_options = ['REMOVE_NONRINGS', 'REMOVE_RINGS', 'NONE'] - if not any(opt in filter_type.upper() for opt in filter_type_options): - parser.print_help() - parser.error('Need to specify a valid filter type consult --help\n') - sys.exit() - - try: - dicer_threshold = float(args.b) - if dicer_threshold < 0.1 or dicer_threshold > 0.9: - sys.exit('Please specify a value for the dicer threshold between 0.1 and 0.9') - except: - sys.exit('Please specify a value for the dicer threshold between 0.1 and 0.9') - - # logging - - log_file = args.l - log_level = args.d - - mmplogger = logging.getLogger('lillymol_file_logger') - - if log_file is not None: - - fh = logging.FileHandler(log_file) - - if log_level is None: - mmplogger.setLevel(logging.INFO) - else: - numeric_level = getattr(logging, log_level.upper(), None) - if not isinstance(numeric_level, int): - raise ValueError('Invalid log level: %s' % log_level) - mmplogger.setLevel(numeric_level) - - formatter = logging.Formatter('%(asctime)s %(levelname)s %(module)s: %(message)s', - datefmt='%m/%d/%Y %I:%M:%S %p') - - fh.setFormatter(formatter) - mmplogger.addHandler(fh) - - else: - logging.disable(logging.CRITICAL) - - # - mmplogger.info('Instantiating MMP Objects') - my_mmp_object = MMPObjectClass(mmplogger) - - # pre-check the smiles for odd stuff - my_mmp_object.scan_input_smiles(smi_fi) - - # execute dicer cmd and parse results to mmp_object - mmplogger.info('Parse Dicer Fragments to Pairs') - my_mmp_object.build_from_dicer(smi_fi, cut_type, filter_type, threshold=dicer_threshold) - - mmplogger.info('Write out final pairs') - if types_header: - my_mmp_object.print_to_file(out_fi, cut_type, 
inc_types_header=types_header) - else: - my_mmp_object.print_to_file(out_fi, cut_type) - mmplogger.info('Complete.') - - -if __name__ == '__main__': - - main() - sys.exit() diff --git a/contrib/script/py/mmp/get_coverage.sh b/contrib/script/py/mmp/get_coverage.sh deleted file mode 100755 index 84da3fd0..00000000 --- a/contrib/script/py/mmp/get_coverage.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -coverage erase -coverage run mmp_data_objects.py -coverage run -a mmp_dicer_functions.py -coverage run -a mmp_enum_mols_from_pairs.py -coverage run -a mmp_math_functions.py -coverage run -a mmp_mcss_objects.py -coverage run -a mmp_objects.py -coverage run -a mmp_pairs_objects.py -coverage run -a mmp_series_object.py -coverage run -a mmp_stats_functions.py -coverage run -a mmp_stats_functions_timer.py -coverage report -m mmp_data_objects.py mmp_dicer_functions.py mmp_enum_mols_from_pairs.py mmp_math_functions.py mmp_mcss_objects.py mmp_objects.py mmp_pairs_objects.py mmp_series_object.py mmp_stats_functions.py mmp_stats_functions_timer.py diff --git a/contrib/script/py/mmp/mmp_data_objects.py b/contrib/script/py/mmp/mmp_data_objects.py deleted file mode 100755 index 9132e795..00000000 --- a/contrib/script/py/mmp/mmp_data_objects.py +++ /dev/null @@ -1,1723 +0,0 @@ -################################################################### -""" Summary: Class and Methods to get MMP differences from input CSV data - -About: The class is an extension of the class mmp_objects. The base -class only deals with molecule data and pair generation. 
This class -adds objects to handle associated molecule data and generate data -differences or deltas between molecules as well as the raw molecule pairs - -Example usage: - - my_mmp_data_object = MMPDataObjectClass(mmplogger) - - # parse CSV - my_mmp_data_object.csv_sniffer(in_file, smi_col, molid_col) - my_mmp_data_object.csv_to_data_objects(in_file, smi_col, molid_col) - - # add dicer fragments to create pair dicts - tmp_dicer_file = my_mmp_data_object.write_mol_smi_dict_tofile() - my_mmp_data_object.build_from_dicer(tmp_dicer_file, cut_type, filter_type) - - # get pairs - my_mmp_data_object.get_pairs_and_diffs(out_fi, cut_type) - -Warnings: This class assumes the input smiles are already salt stripped -and that their identifiers and convertable to a valid integer value. Input -smiles of the type "c1ccccc1[N+].[Cl-] CHEMBL9876" will fail for both reasons -""" -################################################################### -import logging -import csv -import os -import sys -# things needed for unit testing only -import unittest -import tempfile -import copy - -from mmp.mmp_objects import MMPObjectClass -# later clogp method can be licensed biobyte clogp -# or opensourced abraham's alogp depending on environment var -if 'LILLYMOL_HOME' in os.environ: - sys.path.insert(0, os.getenv('LILLYMOL_HOME') + '/contrib/script/py/pybase'); - import pyopmo as pymo -else: - import pybase.pymo as pymo - -class MMPDataObjectClass(MMPObjectClass): - - def __init__(self, logger_object): - """ - Example usage: - mmplogger = logging.getLogger('lillymol_file_logger') - logging.disable(logging.CRITICAL) - my_mmp_data_object = MMPDataObjectClass(mmplogger) - - Due to the large number of repeat smiles generated by dicer fragmentation, - normalisation is applied to the final single and double cut matched pairs - objects. The actual MMP data objects single_pairs_dict and double_pairs_dict - simply contain numerical id's for every smiles. 
The smiles are stored in - refsmi_dict and data for each smiles stored in real_data_dict - """ - - MMPObjectClass.__init__(self, logger_object) - - self.logger = logger_object - if len(logging.Logger.manager.loggerDict) < 1: - # exit with system status 1 and custom error - sys.exit("Invalid or no logger object passed to MMPObjectClass. Please create \ - and pass a logger and set to use logging.disable if you don't want logging") - - # setup temp smiles file for dicer to use - self.suffix = '.smi' # iwdescr needs this .smi suffix to work - self.temp_dicer_smifi = tempfile.NamedTemporaryFile(delete=False, suffix=self.suffix, - encoding='utf-8', mode='wt') - - # object level flag to determine if we will add MWT / CLP data to the output - self.add_prop_data = 0 - - # becomes dict of smi => [array, of, real, data] - # currently code will drop or ignore any data given - # on input that is not of type real (or smiles/mol_id) - self.mol_data_dict = {} - - self.mol_props_dict = {} - self.mol_props_headers = [] - self.mol_props_headers_numeric_position = {} - - # for comparison SMI sets - self.mol_smi_comparison_dict = {} - - self.mol_props_comparison_dict = {} - self.mol_props_comparison_headers = [] - self.mol_props_comparison_headers_numeric_position = {} - - # other vars to help us process the CSV file - self.csv_dialect = '' - self.headers = [] - self.headers_nosmi = [] - self.headers_numeric_position = {} - self.csv_items_per_line = 0 - self.smi_col_num = '' - self.molid_col_num = '' - - # Bug Fix: this var is needed to keep track of -t option in wrapper script where we write out a second header - # line in CSV containing the column types. This flag and header can screw up aggregation - # in the method pd_read_csv so we drop this line on read of CSV. 
It gets set in the method get_pairs_and_diffs - # but is never really used unless inherit this from mmp_pairs_object where it's essential - self.types_header = False - - def clean_out_data(self): - """Method to clean out all objects in class""" - - self.logger.debug('mem_trace refsmi_dict entries entries: %s, mem usage: %s kB' % ( - len(self.refsmi_dict), sys.getsizeof(self.refsmi_dict)/1000)) - self.logger.debug('mem_trace single_pairs_dict entries: %s, mem usage: %s kB' % ( - len(self.single_pairs_dict), sys.getsizeof(self.single_pairs_dict)/1000)) - self.logger.debug('mem_trace double_pairs_dict entries: %s, mem usage: %s kB' % ( - len(self.double_pairs_dict), sys.getsizeof(self.double_pairs_dict)/1000)) - self.logger.debug('mem_trace mol_data_dict entries: %s, mem usage: %s kB' % ( - len(self.mol_data_dict), sys.getsizeof(self.mol_data_dict)/1000)) - self.logger.debug('mem_trace mol_smi_dict entries: %s, mem usage: %s kB' % ( - len(self.mol_smi_dict), sys.getsizeof(self.mol_smi_dict)/1000)) - self.logger.info("Cleaning out all objects") - - # just clean out all the dicts - self.refsmi_dict.clear() - self.single_pairs_dict.clear() - self.double_pairs_dict.clear() - self.add_prop_data = 0 - self.mol_data_dict.clear() - self.mol_smi_dict.clear() - self.mol_smi_comparison_dict.clear() - self.mol_props_dict.clear() - self.mol_props_headers = [] - self.mol_props_headers_numeric_position.clear() - - # and reset csv stuff - self.csv_dialect = '' - self.headers = [] - self.headers_nosmi = [] - self.headers_numeric_position = {} - self.csv_items_per_line = 0 - self.smi_col_num = '' - self.molid_col_num = '' - - self.logger.debug('mem_trace refsmi_dict entries entries: %s, mem usage: %s kB' % ( - len(self.refsmi_dict), sys.getsizeof(self.refsmi_dict)/1000)) - self.logger.debug('mem_trace single_pairs_dict entries: %s, mem usage: %s kB' % ( - len(self.single_pairs_dict), sys.getsizeof(self.single_pairs_dict)/1000)) - self.logger.debug('mem_trace double_pairs_dict entries: 
%s, mem usage: %s kB' % ( - len(self.double_pairs_dict), sys.getsizeof(self.double_pairs_dict)/1000)) - self.logger.debug('mem_trace mol_data_dict entries: %s, mem usage: %s kB' % ( - len(self.mol_data_dict), sys.getsizeof(self.mol_data_dict)/1000)) - self.logger.debug('mem_trace mol_smi_dict entries: %s, mem usage: %s kB' % ( - len(self.mol_smi_dict), sys.getsizeof(self.mol_smi_dict)/1000)) - - def write_mol_smi_dict_tofile(self, use_comparison_dict = False): - """Method to write out the mol_smi_dict to a flatfile for use by dicer - fragmentation. Could later extend this to any fragment dict if useful! - Returns the name of the temp file that has been used. - """ - - if len(self.mol_smi_dict) < 1: - sys.exit("You are trying to write a zero sized mol_smi_dict to file") - - self.logger.info("writing smiles to temp file for dicer fragmentation: %s" % self.temp_dicer_smifi.name) - - if use_comparison_dict is False: - query_dict = self.mol_smi_dict - - else: - query_dict = self.mol_smi_comparison_dict - - # write smi/id to temp file in std smiles format for dicer - with open(self.temp_dicer_smifi.name, "w") as f: - for id_, smi in query_dict.items(): - if type(id_) == int: - f.write(smi+" "+str(id_)+"\n") - - return self.temp_dicer_smifi.name - - def build_comparison_smidict(self, comparison_smi_fi, add_prop_data=0): - """Utility method for other programs to use such as mmp_predict that need smiles of a comparison SMI set""" - - # ensure we're consistent - if add_prop_data == 0 and self.add_prop_data != 0: - add_prop_data = self.add_prop_data - - with open(comparison_smi_fi, "r+t") as csmifile: - - for line in csmifile: - line = line.rstrip('\r') - line = line.rstrip('\n') - line_list = line.split() - - if len(line_list) != 2: - sys.exit("This comparison SMI file has >2 columns, expecting only smiles and id, please check") - try: - csmi = str(line_list[0]) - cid = int(line_list[1]) - except: - sys.exit("This SMI file has non numeric or non integer ID's which I can't 
handle, please change") - - self.mol_smi_comparison_dict[cid] = csmi - - self.logger.info("Completed build of Comparison SMI Dict with %d smiles" % ( - len(self.mol_smi_comparison_dict))) - - # now add the property data - if add_prop_data == 1: - - self.generate_clogp(input_smi=comparison_smi_fi, comparison_dict=True) - self.generate_mwt(input_smi=comparison_smi_fi, comparison_dict=True) - - if add_prop_data == 2: - - self.generate_clogp(input_smi=comparison_smi_fi, comparison_dict=True) - self.generate_all_props(input_smi=comparison_smi_fi, comparison_dict=True) - - def csv_sniffer(self, csv_file, smi_col, molid_col, skip_num_check=False): - """Method to take input CSV and populate smi data in mol_smi_dict, and all other - data in self.real_data_dict, expects csv file name as input as well as column header - for SMI and MOL_ID columns. - """ - - # check the csv file has headers, exit if it does not - sniffer = csv.Sniffer() - if sniffer.has_header(csv_file) == 0: - sys.exit("Sniffer module did not detect a header in your CSV file, please check!") - - # now open csv file and start to use it - with open(csv_file, "rt", newline='') as csvfile: - - # use sniffer to detect dialet - self.csv_dialect = sniffer.sniff(csvfile.read(65536)) - csvfile.seek(0) - # create reader object - reader = csv.reader(csvfile, self.csv_dialect, skipinitialspace=True) - # get headers - self.headers = next(reader) - self.headers_nosmi = copy.deepcopy(self.headers) - self.logger.info("Got headers from CSV: %s" % self.headers) - # get expected line length - self.csv_items_per_line = len(self.headers) - self.logger.info("Expected number of items per line: %s" % self.csv_items_per_line) - - # get the index or column id for the smi and id columns and assign to vars - for i in range(len(self.headers)): - if self.headers[i].upper() == smi_col.upper(): - self.smi_col_num = i - if self.headers[i].upper() == molid_col.upper(): - self.molid_col_num = i - - # check they are assigned or fail completely 
as we will not be able to write smiles file - if self.smi_col_num == '': - sys.exit('Could not detect SMI column %s in CSV file headers %s' % (smi_col, self.headers)) - elif self.molid_col_num == '': - sys.exit('Could not detect MOL_ID column %s in CSV file headers %s' % (molid_col, self.headers)) - self.logger.info("The MOL_ID and SMI columns are at indexs: %d, %d" % ( - self.molid_col_num, self.smi_col_num)) - - # clean up headers_nosmi - if self.molid_col_num > self.smi_col_num: - self.headers_nosmi.pop(self.molid_col_num) - self.headers_nosmi.pop(self.smi_col_num) - else: - self.headers_nosmi.pop(self.smi_col_num) - self.headers_nosmi.pop(self.molid_col_num) - - # scan first 30 or less lines of file to determine data types for each column - # if you find a real or int value then add column position/index to headers_numeric_position - row_iterator = 0 - self.logger.info("Initialising prescan of CSV") - - if skip_num_check: - self.logger.debug("Skipping pre-scan of CSV and will operate on all columns") - row = next(reader) - for idx, val in enumerate(row): - self.headers_numeric_position[idx] = self.headers[idx] - del self.headers_numeric_position[self.smi_col_num] - - else: - for row in reader: - - row_iterator += 1 - self.logger.debug("Prescan of CSV row: %d" % row_iterator) - - # if first data row, capture position of all numeric columns - if row_iterator == 1: - self.logger.debug("Evaluating all columns for numeric values") - for idx, val in enumerate(row): - try: - float(val) - self.headers_numeric_position[idx] = self.headers[idx] - self.logger.debug("This value %s with index %d is numeric: " % (val, idx)) - except: - self.logger.debug("This value %s with index %d is not numeric: " % (val, idx)) - - # for all other rows, remove entry from headers_numeric_position if not numeric - # but only do this for first 30 rows - if row_iterator < 30: - for idx, val in enumerate(row): - if idx in self.headers_numeric_position: - try: - float(val) - except: - del 
self.headers_numeric_position[idx] - self.logger.debug("I had to remove this value %s with index %s: " % (idx, val)) - - # more than 30 rows so quit loop - else: - break - - # now remove the id column from self.headers_numeric_position as we - # don't want to do math on this column when calculating pair data diffs - del self.headers_numeric_position[self.molid_col_num] - - self.logger.info("These are the columns I will do maths on for pair diffs: %s" % self.headers_numeric_position) - - # close csv file handle - csvfile.close() - - def csv_to_data_objects(self, csv_file, smi_col, molid_col, act_data_col=None, add_prop_diff=0, std_smi=False): - """ - Method to take input CSV and populate smi data in mol_smi_dict, and all other - data in self.real_data_dict, expects a number of variables set at the class level - to be populated or it will fail and exit - :param csv_file: CSV file name to read / parse - :param smi_col: column name for smiles data column - :param molid_col: column name for mol_id data column - :param act_data_col: column name for activity data column - :param add_prop_diff: set to 1 if you want to add clogp, mwt and other property data - :param std_smi: Standardise smiles stored in self.mol_smi_dict - :return: - """ - - if (self.csv_dialect == '' or - self.headers == [] or - self.headers_nosmi == [] or - self.headers_numeric_position == {} or - self.csv_items_per_line == 0): - sys.exit("Can't proceed with CSV parsing as the CSV file did not contain anything " - "other than SMI and ID columns. 
Script needs further data columns to " - "calculate differences on") - - self.logger.info("Size of mol_data_dict, mol_smi_dict, refsmi_dict, single_pairs_dict, double_pairs_dict: \ - %d, %d, %d, %d, %d" % (len(self.mol_data_dict), len(self.mol_smi_dict), len(self.refsmi_dict), - len(self.single_pairs_dict), len(self.double_pairs_dict))) - - if add_prop_diff == 1: - self.add_prop_data = 1 - - elif add_prop_diff == 2: - self.add_prop_data = 2 - - if act_data_col is not None: - # injected this code to force a given column name to be the only column we do diffs on. - # This iterates over self.headers_numeric_position and removes all items if not 'act_data_col' - # then adjust self.headers_numeric_position accordingly - temp_dict = {} - - for key, value in self.headers_numeric_position.items(): - if value == act_data_col: - temp_dict[key] = value - - if len(temp_dict) == 0: - sys.exit("Please specify a valid column name, %s not found in CSV columns" % act_data_col) - - self.headers_numeric_position = temp_dict - - with open(csv_file, "rt") as csvfile: - - # iterate over filelines and add to data dict - # first reset the file iterator then skip the header - # then do fresh iteration to read data - self.logger.info("Now populating mol_data_dict and mol_smi_dict") - reader = csv.reader(csvfile, self.csv_dialect, skipinitialspace=True) - next(reader) - for row in reader: - data_row = [] - for key in sorted(self.headers_numeric_position.keys()): - # try to grab the numerical data and add to array - try: - data_row.append(float(row[key])) - except: - data_row.append('null') - - # add to mol_data_dict as mol_id => [the,numerical,data,as,array,with,mol_id,smi,removed] - if int(row[self.molid_col_num]) in self.mol_data_dict: - self.logger.debug("Duplicate data found for mol_id: %s" % (row[self.molid_col_num])) - else: - self.mol_data_dict[int(row[self.molid_col_num])] = data_row - self.logger.debug("Added real data array for mol_id %s: %s" % (row[self.molid_col_num], data_row)) 
- - # add to the mol_smi_dict as mol_id => smiles - self.mol_smi_dict[int(row[self.molid_col_num])] = row[self.smi_col_num] - - self.logger.debug("Done populating mol_data_dict to %s rows and mol_smi_dict to %s rows" % ( - len(self.mol_data_dict), len(self.mol_smi_dict))) - - # now write to file and get back the temp file object - smifi_for_dicer = self.write_mol_smi_dict_tofile() - - # standardise smiles - if std_smi: - self.mol_smi_dict = copy.deepcopy(self.generate_std_smiles(smifi_for_dicer, smi_id_map='both')) - - # now validate smiles in smi file - self.scan_input_smiles(smifi_for_dicer) - - # now add the property data, after act_data_col dealt with - if self.add_prop_data == 1: - - self.generate_clogp() - self.generate_mwt() - - if self.add_prop_data == 2: - - self.generate_clogp() - self.generate_all_props() - - self.logger.info('Done parsing CSV to data objects') - self.logger.info('mem_trace refsmi_dict entries: %s, mem usage: %s kB' % ( - len(self.refsmi_dict), sys.getsizeof(self.refsmi_dict)/1000)) - self.logger.info('mem_trace single_pairs_dict entries: %s, mem usage: %s kB' % ( - len(self.single_pairs_dict), sys.getsizeof(self.single_pairs_dict)/1000)) - self.logger.info('mem_trace double_pairs_dict entries: %s, mem usage: %s kB' % ( - len(self.double_pairs_dict), sys.getsizeof(self.double_pairs_dict)/1000)) - self.logger.info('mem_trace mol_data_dict entries: %s, mem usage: %s kB' % ( - len(self.mol_data_dict), sys.getsizeof(self.mol_data_dict)/1000)) - self.logger.info('mem_trace mol_smi_dict entries: %s, mem usage: %s kB' % ( - len(self.mol_smi_dict), sys.getsizeof(self.mol_smi_dict)/1000)) - - # close csv file handle - csvfile.close() - - def generate_std_smiles(self, smiles_file, smi_id_map='both'): - """ - Method to standardise the input smiles stored in given dictionary. 
This was added for the matched - series work where enumerated product molecules need to be checked to ensure they are novel and this - is done via string comparison on standardised smiles. - :param smiles_dict: the dictionary structure containing the smiles and ids - :param smi_id_map: the direction to add smi and id to the dict, uni or bidirectional - :param existing_smifi: name of the file containing smiles to standardise, or None if it does not exist - """ - # are we creating/storing dict as smi => id or id +. smi or both ways? - smi_id_map_opt = ['both', 'id_smi', 'smi_id'] - if smi_id_map not in smi_id_map_opt: - raise Exception('Error, invalid param for smi_id_map in generate_std_smiles') - - return_dict = {} - return_out_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", encoding='utf-8', mode='wt') - return_out = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", encoding='utf-8', mode='wt') - - # run standardise smiles - self.logger.debug("Running standardise smiles with in: %s, out: %s" % (smiles_file, return_out.name)) - exit_status_a = pymo.fileconv(smiles_file, - outfile=return_out_tmp.name, - params_dict={'-i': 'rmhknown', - ' -i': 'mdlD', - ' -i': 'mdlT', - ' -i': 'mdlquiet', - ' -i': 'ignore_bad_chiral', - ' -i': 'ignore_bad_m', - ' -i': 'firstsdftag', - '-A': 'D', - ' -A': 'I', - '-E': 'autocreate', - '-V': '', - '-S': '-', - '-g': 'all', - '-B': '9999', - '-c': '5', - '-C': '200', - '-s': 'rmbad', - '-Y': 'ftn', - '-I': 'change', - '-O': 'none', - '-f': 'lod'}, - loggero=self.logger) - self.logger.debug("standardise smiles return code was: %s" % exit_status_a) - - # run standardise smiles - self.logger.debug("Running preferred smiles with in: %s, out: %s" % (return_out_tmp.name, return_out.name)) - exit_status = pymo.preferred_smiles(return_out_tmp.name, - outfile=return_out.name, - params_dict={}, - loggero=self.logger) - self.logger.debug("preferred smiles return code was: %s" % exit_status) - - # load the results - with 
open(return_out.name, "rt") as csv_file: - - reader = csv.reader(csv_file, delimiter=' ') - for row in reader: - try: - id_ = int(row[1]) - smi_ = row[0] - except: - self.logger.debug('Unable to parse line from standardise smiles:\n %s' % str(row)) - continue - - if smi_id_map == 'both' or smi_id_map == 'id_smi': - return_dict[id_] = smi_ - - if smi_id_map == 'both' or smi_id_map == 'smi_id': - return_dict[smi_] = id_ - - return return_dict - - def generate_clogp(self, input_smi=False, comparison_dict=False): - """Use pymo.cmi to generate CLP data for all mols - Original implementation uses Biobyte clogP - Opensource implmentation uses Abrahams """ - - if input_smi is False: - input_smi = self.temp_dicer_smifi.name - - temp_clp_out = tempfile.NamedTemporaryFile(encoding='utf-8', mode='wt', delete=False) - - # run cmi - self.logger.info("Running cmi with in: %s, out: %s" % (input_smi, temp_clp_out.name)) - - # - use_alogp = False - - if 'LILLYMOL_HOME' in os.environ: - - use_alogp = True - exit_status = pymo.alogp(input_smi, - outfile=temp_clp_out.name, - loggero=self.logger) - self.logger.debug("alogp return code was: %s" % exit_status) - - else: - - exit_status = pymo.cmi(input_smi, outfile=temp_clp_out.name, - params_dict={}, loggero=self.logger) - self.logger.debug("cmi return code was: %s" % exit_status) - - - # append header - if comparison_dict: - self.mol_props_comparison_headers.append('MOL_CLP') - self.mol_props_comparison_headers_numeric_position[self.mol_props_comparison_headers.index('MOL_CLP')] = \ - 'MOL_CLP' - mol_props_dict_touse = self.mol_props_comparison_dict - self.logger.debug("Using comparison dict for property data") - - else: - self.mol_props_headers.append('MOL_CLP') - self.mol_props_headers_numeric_position[self.mol_props_headers.index('MOL_CLP')] = 'MOL_CLP' - mol_props_dict_touse = self.mol_props_dict - - clp_col = None - id_col = None - - # load the csv file - with open(temp_clp_out.name, "rt") as csv_file: - - reader = 
csv.reader(csv_file, delimiter=' ') - i = -1 - for row in reader: - - # ignore the header row, but capture position of logp - i += 1 - if i == 0: - if use_alogp: - clp_col = row.index('abr_Logp') - id_col = row.index('Name') - else: - clp_col = 1 - id_col = 0 - continue - - my_id = int(row[id_col]) - val = float(row[clp_col]) - - if my_id in mol_props_dict_touse: - self.logger.warn("Duplicate ID seen, overwrite of property data for id: %s" % my_id) - - if my_id in mol_props_dict_touse: - mol_props_dict_touse[my_id].append(val) - - else: - mol_props_dict_touse[my_id] = [] - mol_props_dict_touse[my_id].append(val) - - temp_clp_out.close() - - self.logger.info("Loaded %d CLP values from file" % i) - - def generate_mwt(self, input_smi=False, comparison_dict=False): - """Use pymo.iwdescr to generate MWT data""" - - if input_smi is False: - input_smi = self.temp_dicer_smifi.name - - temp_mwt_out = tempfile.NamedTemporaryFile(encoding='utf-8', mode='wt') - - # run pymo.iwdescr - self.logger.info("Running pymo.iwdescr for MWT with in:%s, out:%s" % (input_smi, temp_mwt_out.name)) - exit_status = pymo.iwdescr(input_smi, temp_mwt_out.name, - params_dict={'-l': '', '-v': ''}, loggero=self.logger) - self.logger.debug("Ran iwdescr with exit status %s" % exit_status) - - # append header - if comparison_dict: - self.mol_props_comparison_headers.append('MOL_MWT') - self.mol_props_comparison_headers_numeric_position[self.mol_props_comparison_headers.index('MOL_MWT')] = \ - 'MOL_MWT' - mol_props_dict_touse = self.mol_props_comparison_dict - self.logger.debug("Using comparison dict for property data") - - else: - self.mol_props_headers.append('MOL_MWT') - self.mol_props_headers_numeric_position[self.mol_props_headers.index('MOL_MWT')] = 'MOL_MWT' - mol_props_dict_touse = self.mol_props_dict - - # set the mwt - with open(temp_mwt_out.name, "rt") as csv_file: - - reader = csv.reader(csv_file, delimiter=' ') - i = -1 - for row in reader: - - # ignore header row - i += 1 - - if i == 0: - 
# TODO: pick the column location dynamically, don't hard code the loc - if row[4] != 'w_amw': - self.logger.warn("When this was written, MWT was in array position 4 (zero indexed) with " - "column title w_amw. Now it's not, it's: %s" % row[4]) - sys.exit("When this was written, MWT was in array position 4 (zero indexed) with column " - "title w_amw. Now it's not, it's: %s" % row[4]) - continue - - if int(row[0]) in mol_props_dict_touse: - mol_props_dict_touse[int(row[0])].append(float(row[4])) - - else: - mol_props_dict_touse[int(row[0])] = [] - mol_props_dict_touse[int(row[0])].append(float(row[4])) - - csv_file.close() - - self.logger.info("Loaded %d MWT values from file" % i) - - def generate_all_props(self, input_smi=False, comparison_dict=False): - """Use pymo.iwdescr to generate excessive amounts of property data""" - - if input_smi is False: - input_smi = self.temp_dicer_smifi.name - - temp_mwt_out = tempfile.NamedTemporaryFile(encoding='utf-8', mode='wt') - - # run pymo.iwdescr - self.logger.info("Running pymo.iwdescr with in:%s, out:%s" % (input_smi, temp_mwt_out.name)) - exit_status = pymo.iwdescr(input_smi, temp_mwt_out.name, - params_dict={'-l': '', '-v': ''}, loggero = self.logger) - self.logger.debug("Ran iwdescr with exit status %s" % exit_status) - - if comparison_dict: - mol_props_dict_touse = self.mol_props_comparison_dict - self.logger.debug("Using comparison dict for property data") - - else: - mol_props_dict_touse = self.mol_props_dict - - # parse output - with open(temp_mwt_out.name, "rt") as csv_file: - - reader = csv.reader(csv_file, delimiter=' ') - i = -1 - for row in reader: - - i += 1 - - # if header row, append headers - if i == 0: - - for header_raw in row: - header_str = str(header_raw).upper() - - if comparison_dict: - self.mol_props_comparison_headers.append(header_str) - self.mol_props_comparison_headers_numeric_position[ - self.mol_props_comparison_headers.index(header_str)] = header_str - - else: - 
self.mol_props_headers.append(header_str) - self.mol_props_headers_numeric_position[self.mol_props_headers.index(header_str)] =\ - header_str - - continue - - # or add data - tmp_row = [] - for val_ in row: - try: - tmp_row.append(float(val_)) - except: - tmp_row.append(val_) - row = tmp_row - mol_props_dict_touse[int(row[0])] = row - - temp_mwt_out.close() - - self.logger.info("Parsed property data for %d molecules from file" % i) - - def get_data_diffs(self, molid_L, molid_R): - """Method to generate an array of differences between - two entries or arrays in the mol_data_dict - """ - - difference_array = [] - - for idx, val in enumerate(self.mol_data_dict[molid_L]): - - try: - temp_diff = '{:0.3e}'.format(self.mol_data_dict[molid_R][idx] - val) - except: - temp_diff = 'NaN' - difference_array.append(temp_diff) - - return difference_array - - def get_prop_diffs(self, molid_L, molid_R): - """Method to generate an array of differences between - two entries or arrays in the mol_props_dict. 
Could consolidate - this with get_data_diffs() but called so many times it's kept - separate for now - """ - difference_array = [] - - for idx, val in enumerate(self.mol_props_dict[molid_L]): - - try: - temp_diff = '{:0.3e}'.format(self.mol_props_dict[molid_R][idx] - val) - except: - temp_diff = 'NaN' - difference_array.append(temp_diff) - - return difference_array - - def get_folddata_diffs(self, molid_L, molid_R): - """Method to generate an array of differences between - two entries or arrays in the mol_data_dict, we do not - implement this for property data - """ - - difference_array = [] - - for idx, val in enumerate(self.mol_data_dict[molid_L]): - try: - # fold diff - temp_diff = self.mol_data_dict[molid_R][idx] / val - except: - temp_diff = 'n/a' - difference_array.append(temp_diff) - - return difference_array - - def get_pairs_and_diffs(self, out_file, cut_type, fold_diff=False, - inc_types_header=False, numeric_ids=False, - add_qmetric=False): - """Method to print out the pairs from the input CSV with data differences - - out_file: - The user specified output file - - cut_type: - Specifies the type of fragmentation required. Allowed values are SINGLE, - DOUBLE or BOTH. 
Currently this class does not support anything greater than - double cut fragmentation - - Example usage: - - # give me a CSV named my_output.pairs of all the pairs: - my_mmp_object.get_pairs_and_diffs('myoutput.csv', 'BOTH') - - # give me a CSV of only the DOUBLE cut pairs: - my_mmp_object.get_pairs_and_diffs('myoutput.csv', 'DOUBLE') - """ - - if add_qmetric is True and self.add_prop_data == 0: - raise Exception('Must specify prop_data=True when requesting add_qmetric as Quality metric needs CLP data') - - # check file write possible before start - self.logger.info('Opening output file for write: %s' % out_file) - - # check cut_type, convert to int - if cut_type.upper() == 'DOUBLE': - # confusing but faster later - cut_type_id = 3 - elif cut_type.upper() == 'BOTH': - # confusing but faster later - cut_type_id = 2 - elif cut_type.upper() == 'SINGLE': - cut_type_id = 1 - else: - self.logger.warn('cut_type specification is incorrect, using single cut: %s' % cut_type.upper()) - cut_type_id = 1 - - # Quality metric needs clp data - if add_qmetric: - # remind ourselves where the clp data will be in a molecule property list - clp_col_idx = self.mol_props_headers.index('MOL_CLP') - - # fail if both single_pairs_dict and double_pairs_dict are empty - if (len(self.single_pairs_dict) == 0) and (len(self.double_pairs_dict) == 0): - self.logger.debug('No data found in single_pairs_dict and/or double_pairs_dict, expect no results') - # sys.exit("Error: no data found in single_pairs_dict and/or double_pairs_dict, nothing to find and write") - - # Now start processing the data structures to write the pairs - with open(out_file, "w") as f: - - # write headers to CSV - f.write("CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_CTX_L,ATTCHPT_FRAG_L,ATTCHPT_CTX_R,ATTCHPT_FRAG_R,") - - num_items = len(self.headers_numeric_position) - idx = 0 - - for item in self.headers_nosmi: - if item in list(self.headers_numeric_position.values()): - idx += 1 - if idx == num_items: - item += 
'_DIFF' - f.write(item) - else: - item += '_DIFF,' - f.write(item) - - num_items = len(self.mol_props_headers) - idx = 0 - - if self.add_prop_data != 0: - for item in self.mol_props_headers: - idx += 1 - if idx == 1: - item = "," + item + '_DIFF,' - f.write(item) - elif idx == num_items: - item += '_DIFF' - f.write(item) - else: - item += '_DIFF,' - f.write(item) - - if add_qmetric: - f.write(',MOL_L_CLP,MOL_R_CLP') - - f.write('\n') - - num_items = len(self.headers_numeric_position) - idx = 0 - - if inc_types_header is True: - - # This is a bug fix for later mmp_pairs_object data aggregation, see comments in pd_read_cdv method - self.types_header = True - - f.write("STRING,STRING,STRING,SMILES,SMILES,SMILES,STRING,STRING,STRING,STRING,") - - for item in self.headers_nosmi: - if item in list(self.headers_numeric_position.values()): - idx += 1 - if idx == num_items: - f.write("REAL") - else: - f.write("REAL,") - - idx = 0 - if self.add_prop_data != 0: - for item in self.mol_props_headers: - idx += 1 - if idx == 1: - f.write(",REAL,") - elif idx == num_items: - f.write("REAL") - else: - f.write("REAL,") - - f.write('\n') - - # print pairs for single - if cut_type_id <= 2: - # first get pairs via iterator_single_pairs_dict... 
- if numeric_ids: - for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R in \ - self.iterator_single_pairs_dict_numeric(inc_attachpt=True): - if fold_diff: - diff_array = self.get_folddata_diffs(molid_L, molid_R) - if self.add_prop_data: - diff_array = diff_array + self.get_folddata_diffs(molid_L, molid_R) - # Quality metric needs clp data - if add_qmetric: - diff_array.extend([self.mol_props_dict[molid_L][clp_col_idx], - self.mol_props_dict[molid_R][clp_col_idx]]) - else: - diff_array = self.get_data_diffs(molid_L, molid_R) - if self.add_prop_data: - diff_array = diff_array + self.get_prop_diffs(molid_L, molid_R) - # Quality metric needs clp data - if add_qmetric: - diff_array.extend([self.mol_props_dict[molid_L][clp_col_idx], - self.mol_props_dict[molid_R][clp_col_idx]]) - - num_items = len(diff_array) - 1 - f.write('single,%d,%d,%s,%s,%s,%s,%s,%s,%s,' % (molid_L, molid_R, ctx, frag_L, - frag_R, fa_L, ca_L, fa_R, ca_R)) - for idx, val in enumerate(diff_array): - if idx == num_items: - f.write('%s\n' % val) - else: - f.write('%s,' % val) - - else: - for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R in \ - self.iterator_single_pairs_dict(): - # ...now add the data differences - if fold_diff: - diff_array = self.get_folddata_diffs(molid_L, molid_R) - if self.add_prop_data: - diff_array = diff_array + self.get_prop_diffs(molid_L, molid_R) - # Quality metric needs clp data - if add_qmetric: - diff_array.extend([self.mol_props_dict[molid_L][clp_col_idx], - self.mol_props_dict[molid_R][clp_col_idx]]) - else: - diff_array = self.get_data_diffs(molid_L, molid_R) - if self.add_prop_data: - diff_array = diff_array + self.get_prop_diffs(molid_L, molid_R) - # Quality metric needs clp data - if add_qmetric: - diff_array.extend([self.mol_props_dict[molid_L][clp_col_idx], - self.mol_props_dict[molid_R][clp_col_idx]]) - - num_items = len(diff_array) - 1 - f.write('single,%d,%d,%s,%s,%s,%s,%s,%s,%s,' % (molid_L, molid_R, ctx, frag_L, - frag_R, fa_L, 
ca_L, fa_R, ca_R)) - for idx, val in enumerate(diff_array): - if idx == num_items: - f.write('%s\n' % val) - else: - f.write('%s,' % val) - - # print pairs for double - if cut_type_id >= 2: - # first get pairs via iterator_double_pairs_dict... - if numeric_ids: - self.logger.warn('The use of numeric_ids with cut_type 2 (double cuts) is not yet supported') - - else: - for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R in \ - self.iterator_double_pairs_dict(): - # ...now add the data differences - if fold_diff: - diff_array = self.get_folddata_diffs(molid_L, molid_R) - if self.add_prop_data: - diff_array = diff_array + self.get_prop_diffs(molid_L, molid_R) - # Quality metric needs clp data - if add_qmetric: - diff_array.extend([self.mol_props_dict[molid_L][clp_col_idx], - self.mol_props_dict[molid_R][clp_col_idx]]) - else: - diff_array = self.get_data_diffs(molid_L, molid_R) - if self.add_prop_data: - diff_array = diff_array + self.get_prop_diffs(molid_L, molid_R) - # Quality metric needs clp data - if add_qmetric: - diff_array.extend([self.mol_props_dict[molid_L][clp_col_idx], - self.mol_props_dict[molid_R][clp_col_idx]]) - - num_items = len(diff_array) - 1 - f.write('double,%d,%d,%s,%s,%s,%s,%s,%s,%s,' % (molid_L, molid_R, ctx, frag_L, - frag_R, fa_L, ca_L, fa_R, ca_R)) - for idx, val in enumerate(diff_array): - if idx == num_items: - f.write('%s\n' % val) - else: - f.write('%s,' % val) - - # close the file handle - f.close() - - -class _TestMMPDataObjectClass(unittest.TestCase): - """Test class for MMPDataObjectClass(object) written to use pythons unittest - - Example usage: - - python mmp_data_objects.py - - coverage run mmp_data_objects.py - coverage report mmp_data_objects.py - - """ - - def setUp(self): - """Instantiate""" - - self.maxDiff = None - - self.temp_file_input_smi_01 = tempfile.NamedTemporaryFile(delete=False, - suffix=".smi", - encoding='utf-8', - mode='wt') - self.temp_file_input_smi_01b = tempfile.NamedTemporaryFile(delete=False, - 
suffix=".smi", - encoding='utf-8', - mode='wt') - self.temp_file_input_smi_02 = tempfile.NamedTemporaryFile(delete=False, - suffix=".smi", - encoding='utf-8', - mode='wt') - self.temp_file_input_csv = tempfile.NamedTemporaryFile(delete=False, - encoding='utf-8', - mode='wt') - self.temp_file_output_pairs = tempfile.NamedTemporaryFile(delete=False, - encoding='utf-8', - mode='wt') - - self.mmplogger = logging.getLogger('mmpobjectclass_testlogger') - # logging.disable(logging.CRITICAL) - - self.test_mmp_data_object = MMPDataObjectClass(self.mmplogger) - - # The following represent synthetic data, analogues of CHEMBL1382609 - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1382609/ - # 1. substituents are added to the pyrazole ring to generate side chain MMPs - # H on CHEMBL1382609 between two methyls is changed to Br, F, C, I to - # visually see the change in the smiles string (avoiding Cl as already present) - # e.g.: N1C(=C(Br)C(=N1)C)C - # 2. core ring system is modified (phenyl to pyridine) to see ring switch MMP's - # Presence/Absence of Pyridine-N and N-positional isomerism in Cl-Ph ring - # e.g.: C2=NC(=CS2)C2=CC=C(Cl)C=C2 + addition of N -> - # C2=NC(=CS2)C2=CN=C(Cl)C=C2 + move N around ring -> - # C2=NC(=CS2)C2=NC=C(Cl)C=C2 - - self.test_dataset_goldeninput_smi_01 = { - '001': 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(Br)C(=N1)C)C', - '002': 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(F)C(=N1)C)C', - '003': 'N1(C2=NC(=CS2)C2=CN=C(Cl)C=C2)C(=C(Br)C(=N1)C)C', - '004': 'N1(C2=NC(=CS2)C2=NC=C(Cl)C=C2)C(=C(Br)C(=N1)C)C', - '005': 'N1(C2=NC(=CS2)C2=CN=C(Cl)C=C2)C(=C(F)C(=N1)C)C' - } - - self.test_dataset_goldeninput_smi_01b = { - '006': 'N1(C2=NC(=CS2)C2=NC=C(Cl)C=C2)C(=C(Br)C(=N1)C)C', - '007': 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(I)C(=N1)C)C', - '008': 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(C)C(=N1)C)C' - } - - self.test_dataset_goldeninput_smi_02 = { - 80: 'N1(C2=NC(=CS2)C2=CN=C(Cl)C=C2)C(=C(I)C(=N1)C)C', - 81: 'N1(C2=NC(=CS2)C2=NC=C(Cl)C=C2)C(=C(I)C(=N1)C)C', - 82: 
'N1(C2=NC(=CS2)C2=CN=C(Cl)C=C2)C(=C(C)C(=N1)C)C', - 83: 'N1(C2=NC(=CS2)C2=NC=C(Cl)C=C2)C(=C(C)C(=N1)C)C' - } - - self.test_dataset_goldeninput_csv_headers = \ - ['ID,SMILES,ACT_A,ACT_B,OTHER_INFO,CATEGORICAL_DATA_1,CATEGORICAL_DATA_2'] - - self.test_dataset_goldeninput_csv_data = { - '001,N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(Br)C(=N1)C)C,30.0,25.0,25,0,0': None, - '002,N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(F)C(=N1)C)C,20.0,25.0,string,1,0': None, - '003,N1(C2=NC(=CS2)C2=CN=C(Cl)C=C2)C(=C(Br)C(=N1)C)C,10.0,15.0,25,1,1': None, - '005,N1(C2=NC(=CS2)C2=CN=C(Cl)C=C2)C(=C(F)C(=N1)C)C,20.0,30.0,text,1,1': None, - '004,N1(C2=NC(=CS2)C2=NC=C(Cl)C=C2)C(=C(Br)C(=N1)C)C,50.0,60.0,blah,0,1': None - } - - self.test_dataset_list_of_context_smi = ['Brc1c([n][n](c2sc[1cH][n]2)c1C)C', - '[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1'] - - # output results data for test comparison - actualy MMP's between above CHEML compounds - self.test_dataset_goldenoutput_pairs_and_diffs = { - 'CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_CTX_L,ATTCHPT_FRAG_L,ATTCHPT_CTX_R,ATTCHPT_FRAG_R,ACT_A_DIFF,ACT_B_DIFF,CATEGORICAL_DATA_1_DIFF,CATEGORICAL_DATA_2_DIFF': None, - 'single,1,2,Clc1ccc(c2[n][1cH]sc2)cc1,Brc1c([1nH][n]c1C)C,Fc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 'single,2,1,Clc1ccc(c2[n][1cH]sc2)cc1,Fc1c([1nH][n]c1C)C,Brc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 'single,1,2,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH],[1:C2],[1:BR],[1:C2],[1:F],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 'single,2,1,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH],[1:C2],[1:F],[1:C2],[1:BR],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 'single,1,3,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1cc[1cH]cc1,Clc1[n]c[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],-2.000e+01,-1.000e+01,1.000e+00,1.000e+00': None, - 
'single,1,4,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1cc[1cH]cc1,Clc1c[n][1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],2.000e+01,3.500e+01,0.000e+00,1.000e+00': None, - 'single,3,1,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1[n]c[1cH]cc1,Clc1cc[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],2.000e+01,1.000e+01,-1.000e+00,-1.000e+00': None, - 'single,3,4,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1[n]c[1cH]cc1,Clc1c[n][1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],4.000e+01,4.500e+01,-1.000e+00,0.000e+00': None, - 'single,4,1,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1c[n][1cH]cc1,Clc1cc[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],-2.000e+01,-3.500e+01,0.000e+00,-1.000e+00': None, - 'single,4,3,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1c[n][1cH]cc1,Clc1[n]c[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],-4.000e+01,-4.500e+01,1.000e+00,0.000e+00': None, - 'single,2,5,Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,Clc1cc[1cH]cc1,Clc1[n]c[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],0.000e+00,5.000e+00,0.000e+00,1.000e+00': None, - 'single,5,2,Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,Clc1[n]c[1cH]cc1,Clc1cc[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],0.000e+00,-5.000e+00,0.000e+00,-1.000e+00': None, - 'single,3,5,Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1c([1nH][n]c1C)C,Fc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'single,5,3,Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1c([1nH][n]c1C)C,Brc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None, - 'single,3,5,Clc1[n]cc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH],[1:C2],[1:BR],[1:C2],[1:F],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'single,5,3,Clc1[n]cc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH],[1:C2],[1:F],[1:C2],[1:BR],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 
'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01,0.000e+00,1.000e+00,0.000e+00': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 
'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01,0.000e+00,-1.000e+00,0.000e+00': None, - 'double,1,3,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[1cH]1cc[2cH]cc1,[n]1[1cH]cc[2cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],-2.000e+01,-1.000e+01,1.000e+00,1.000e+00': None, - 'double,1,3,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[2cH]1cc[1cH]cc1,[n]1[2cH]cc[1cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],-2.000e+01,-1.000e+01,1.000e+00,1.000e+00': None, - 'double,1,4,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[1cH]1cc[2cH]cc1,[n]1[2cH]cc[1cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[2:C2|1:CL],[1:CAR|2:CAR],2.000e+01,3.500e+01,0.000e+00,1.000e+00': None, - 'double,1,4,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[2cH]1cc[1cH]cc1,[n]1[1cH]cc[2cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[1:C2|2:CL],[2:CAR|1:CAR],2.000e+01,3.500e+01,0.000e+00,1.000e+00': None, - 'double,3,1,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[1cH]1cc[2cH]cc1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],2.000e+01,1.000e+01,-1.000e+00,-1.000e+00': None, - 
'double,3,1,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[2cH]1cc[1cH]cc1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],2.000e+01,1.000e+01,-1.000e+00,-1.000e+00': None, - 'double,3,4,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[n]1[2cH]cc[1cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[2:C2|1:CL],[1:CAR|2:CAR],4.000e+01,4.500e+01,-1.000e+00,0.000e+00': None, - 'double,3,4,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[n]1[1cH]cc[2cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[1:C2|2:CL],[2:CAR|1:CAR],4.000e+01,4.500e+01,-1.000e+00,0.000e+00': None, - 'double,4,1,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[1cH]1cc[2cH]cc1,[2:C2|1:CL],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],-2.000e+01,-3.500e+01,0.000e+00,-1.000e+00': None, - 'double,4,1,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[2cH]1cc[1cH]cc1,[1:C2|2:CL],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],-2.000e+01,-3.500e+01,0.000e+00,-1.000e+00': None, - 'double,4,3,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[n]1[1cH]cc[2cH]c1,[2:C2|1:CL],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],-4.000e+01,-4.500e+01,1.000e+00,0.000e+00': None, - 'double,4,3,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[n]1[2cH]cc[1cH]c1,[1:C2|2:CL],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],-4.000e+01,-4.500e+01,1.000e+00,0.000e+00': None, - 'double,2,5,[1ClH].Fc1c([n](c2sc[2cH][n]2)[n]c1C)C,[1cH]1cc[2cH]cc1,[n]1[1cH]cc[2cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],0.000e+00,5.000e+00,0.000e+00,1.000e+00': None, - 'double,2,5,[2ClH].Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,[2cH]1cc[1cH]cc1,[n]1[2cH]cc[1cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],0.000e+00,5.000e+00,0.000e+00,1.000e+00': None, - 'double,5,2,[1ClH].Fc1c([n](c2sc[2cH][n]2)[n]c1C)C,[n]1[1cH]cc[2cH]c1,[1cH]1cc[2cH]cc1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],0.000e+00,-5.000e+00,0.000e+00,-1.000e+00': None, - 
'double,5,2,[2ClH].Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,[n]1[2cH]cc[1cH]c1,[2cH]1cc[1cH]cc1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],0.000e+00,-5.000e+00,0.000e+00,-1.000e+00': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01,1.500e+01,0.000e+00,0.000e+00': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None, - 
'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01,-1.500e+01,0.000e+00,0.000e+00': None} - - self.test_dataset_goldenoutput_pairs_and_diffs2 = copy.deepcopy(self.test_dataset_goldenoutput_pairs_and_diffs) - self.test_dataset_goldenoutput_pairs_and_diffs2[ - 'STRING,STRING,STRING,SMILES,SMILES,SMILES,STRING,STRING,STRING,STRING,REAL,REAL,REAL,REAL'] = None - - self.test_dataset_goldenoutput_pairs_and_diffs3 = { - 'CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_CTX_L,ATTCHPT_FRAG_L,ATTCHPT_CTX_R,ATTCHPT_FRAG_R,ACT_A_DIFF': None, - 'single,1,2,Clc1ccc(c2[n][1cH]sc2)cc1,Brc1c([1nH][n]c1C)C,Fc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],-1.000e+01': None, - 
'single,2,1,Clc1ccc(c2[n][1cH]sc2)cc1,Fc1c([1nH][n]c1C)C,Brc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],1.000e+01': None, - 'single,1,2,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH],[1:C2],[1:BR],[1:C2],[1:F],-1.000e+01': None, - 'single,2,1,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH],[1:C2],[1:F],[1:C2],[1:BR],1.000e+01': None, - 'single,1,3,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1cc[1cH]cc1,Clc1[n]c[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],-2.000e+01': None, - 'single,1,4,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1cc[1cH]cc1,Clc1c[n][1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],2.000e+01': None, - 'single,3,1,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1[n]c[1cH]cc1,Clc1cc[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],2.000e+01': None, - 'single,3,4,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1[n]c[1cH]cc1,Clc1c[n][1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],4.000e+01': None, - 'single,4,1,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1c[n][1cH]cc1,Clc1cc[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],-2.000e+01': None, - 'single,4,3,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1c[n][1cH]cc1,Clc1[n]c[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],-4.000e+01': None, - 'single,2,5,Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,Clc1cc[1cH]cc1,Clc1[n]c[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],0.000e+00': None, - 'single,5,2,Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,Clc1[n]c[1cH]cc1,Clc1cc[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],0.000e+00': None, - 'single,3,5,Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1c([1nH][n]c1C)C,Fc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],1.000e+01': None, - 'single,5,3,Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1c([1nH][n]c1C)C,Brc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],-1.000e+01': None, - 'single,3,5,Clc1[n]cc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH],[1:C2],[1:BR],[1:C2],[1:F],1.000e+01': None, - 'single,5,3,Clc1[n]cc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH],[1:C2],[1:F],[1:C2],[1:BR],-1.000e+01': None, - 
'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01': None, - 
'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01': None, - 'double,1,3,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[1cH]1cc[2cH]cc1,[n]1[1cH]cc[2cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],-2.000e+01': None, - 'double,1,3,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[2cH]1cc[1cH]cc1,[n]1[2cH]cc[1cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],-2.000e+01': None, - 'double,1,4,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[1cH]1cc[2cH]cc1,[n]1[2cH]cc[1cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[2:C2|1:CL],[1:CAR|2:CAR],2.000e+01': None, - 'double,1,4,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[2cH]1cc[1cH]cc1,[n]1[1cH]cc[2cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[1:C2|2:CL],[2:CAR|1:CAR],2.000e+01': None, - 'double,3,1,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[1cH]1cc[2cH]cc1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],2.000e+01': None, - 'double,3,1,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[2cH]1cc[1cH]cc1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],2.000e+01': None, - 'double,3,4,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[n]1[2cH]cc[1cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[2:C2|1:CL],[1:CAR|2:CAR],4.000e+01': None, - 'double,3,4,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[n]1[1cH]cc[2cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[1:C2|2:CL],[2:CAR|1:CAR],4.000e+01': None, - 
'double,4,1,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[1cH]1cc[2cH]cc1,[2:C2|1:CL],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],-2.000e+01': None, - 'double,4,1,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[2cH]1cc[1cH]cc1,[1:C2|2:CL],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],-2.000e+01': None, - 'double,4,3,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[n]1[1cH]cc[2cH]c1,[2:C2|1:CL],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],-4.000e+01': None, - 'double,4,3,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[n]1[2cH]cc[1cH]c1,[1:C2|2:CL],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],-4.000e+01': None, - 'double,2,5,[1ClH].Fc1c([n](c2sc[2cH][n]2)[n]c1C)C,[1cH]1cc[2cH]cc1,[n]1[1cH]cc[2cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],0.000e+00': None, - 'double,2,5,[2ClH].Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,[2cH]1cc[1cH]cc1,[n]1[2cH]cc[1cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],0.000e+00': None, - 'double,5,2,[1ClH].Fc1c([n](c2sc[2cH][n]2)[n]c1C)C,[n]1[1cH]cc[2cH]c1,[1cH]1cc[2cH]cc1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],0.000e+00': None, - 'double,5,2,[2ClH].Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,[n]1[2cH]cc[1cH]c1,[2cH]1cc[1cH]cc1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],0.000e+00': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01': None, - 
'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.000e+01': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.000e+01': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],-1.000e+01': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],-1.000e+01': None} - - self.test_dataset_goldenoutput_pairs_and_diffs4 = { - 
'CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_CTX_L,ATTCHPT_FRAG_L,ATTCHPT_CTX_R,ATTCHPT_FRAG_R,ACT_A_DIFF,MOL_CLP_DIFF,MOL_MWT_DIFF': None, - 'single,1,2,Clc1ccc(c2[n][1cH]sc2)cc1,Brc1c([1nH][n]c1C)C,Fc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 'single,2,1,Clc1ccc(c2[n][1cH]sc2)cc1,Fc1c([1nH][n]c1C)C,Brc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],1.5,6.697e-01,6.091e+01': None, - 'single,1,2,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH],[1:C2],[1:BR],[1:C2],[1:F],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 'single,2,1,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH],[1:C2],[1:F],[1:C2],[1:BR],1.5,6.697e-01,6.091e+01': None, - 'single,1,3,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1cc[1cH]cc1,Clc1[n]c[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],0.3333333333333333,-1.348e+00,9.900e-01': None, - 'single,1,4,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1cc[1cH]cc1,Clc1c[n][1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],1.6666666666666667,-1.348e+00,9.900e-01': None, - 'single,3,1,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1[n]c[1cH]cc1,Clc1cc[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],3.0,1.348e+00,-9.900e-01': None, - 'single,3,4,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1[n]c[1cH]cc1,Clc1c[n][1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],5.0,1.000e-06,0.000e+00': None, - 'single,4,1,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1c[n][1cH]cc1,Clc1cc[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],0.6,1.348e+00,-9.900e-01': None, - 'single,4,3,Brc1c([n][n](c2sc[1cH][n]2)c1C)C,Clc1c[n][1cH]cc1,Clc1[n]c[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],0.2,-1.000e-06,0.000e+00': None, - 'single,2,5,Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,Clc1cc[1cH]cc1,Clc1[n]c[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],1.0,-1.348e+00,9.900e-01': None, - 'single,5,2,Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,Clc1[n]c[1cH]cc1,Clc1cc[1cH]cc1,[1:C2],[1:CAR],[1:C2],[1:CAR],1.0,1.348e+00,-9.900e-01': None, - 
'single,3,5,Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1c([1nH][n]c1C)C,Fc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],2.0,-6.697e-01,-6.091e+01': None, - 'single,5,3,Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1c([1nH][n]c1C)C,Brc1c([1nH][n]c1C)C,[1:C2],[1:NPL3],[1:C2],[1:NPL3],0.5,6.697e-01,6.091e+01': None, - 'single,3,5,Clc1[n]cc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH],[1:C2],[1:BR],[1:C2],[1:F],2.0,-6.697e-01,-6.091e+01': None, - 'single,5,3,Clc1[n]cc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH],[1:C2],[1:F],[1:C2],[1:BR],0.5,6.697e-01,6.091e+01': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 'double,1,2,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 
'double,1,2,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],0.6666666666666666,-6.697e-01,-6.091e+01': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.5,6.697e-01,6.091e+01': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.5,6.697e-01,6.091e+01': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.5,6.697e-01,6.091e+01': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.5,6.697e-01,6.091e+01': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.5,6.697e-01,6.091e+01': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.5,6.697e-01,6.091e+01': None, - 'double,2,1,[1CH4].Clc1ccc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],1.5,6.697e-01,6.091e+01': None, - 'double,2,1,[2CH4].Clc1ccc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],1.5,6.697e-01,6.091e+01': None, - 'double,1,3,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[1cH]1cc[2cH]cc1,[n]1[1cH]cc[2cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],0.3333333333333333,-1.348e+00,9.900e-01': None, - 'double,1,3,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[2cH]1cc[1cH]cc1,[n]1[2cH]cc[1cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],0.3333333333333333,-1.348e+00,9.900e-01': None, - 
'double,1,4,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[1cH]1cc[2cH]cc1,[n]1[2cH]cc[1cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[2:C2|1:CL],[1:CAR|2:CAR],1.6666666666666667,-1.348e+00,9.900e-01': None, - 'double,1,4,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[2cH]1cc[1cH]cc1,[n]1[1cH]cc[2cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[1:C2|2:CL],[2:CAR|1:CAR],1.6666666666666667,-1.348e+00,9.900e-01': None, - 'double,3,1,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[1cH]1cc[2cH]cc1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],3.0,1.348e+00,-9.900e-01': None, - 'double,3,1,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[2cH]1cc[1cH]cc1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],3.0,1.348e+00,-9.900e-01': None, - 'double,3,4,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[n]1[2cH]cc[1cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[2:C2|1:CL],[1:CAR|2:CAR],5.0,1.000e-06,0.000e+00': None, - 'double,3,4,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[n]1[1cH]cc[2cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[1:C2|2:CL],[2:CAR|1:CAR],5.0,1.000e-06,0.000e+00': None, - 'double,4,1,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[1cH]1cc[2cH]cc1,[2:C2|1:CL],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],0.6,1.348e+00,-9.900e-01': None, - 'double,4,1,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[2cH]1cc[1cH]cc1,[1:C2|2:CL],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],0.6,1.348e+00,-9.900e-01': None, - 'double,4,3,[1ClH].Brc1c([n][n](c2sc[2cH][n]2)c1C)C,[n]1[2cH]cc[1cH]c1,[n]1[1cH]cc[2cH]c1,[2:C2|1:CL],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],0.2,-1.000e-06,0.000e+00': None, - 'double,4,3,[2ClH].Brc1c([n][n](c2sc[1cH][n]2)c1C)C,[n]1[1cH]cc[2cH]c1,[n]1[2cH]cc[1cH]c1,[1:C2|2:CL],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],0.2,-1.000e-06,0.000e+00': None, - 'double,2,5,[1ClH].Fc1c([n](c2sc[2cH][n]2)[n]c1C)C,[1cH]1cc[2cH]cc1,[n]1[1cH]cc[2cH]c1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],1.0,-1.348e+00,9.900e-01': None, - 
'double,2,5,[2ClH].Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,[2cH]1cc[1cH]cc1,[n]1[2cH]cc[1cH]c1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],1.0,-1.348e+00,9.900e-01': None, - 'double,5,2,[1ClH].Fc1c([n](c2sc[2cH][n]2)[n]c1C)C,[n]1[1cH]cc[2cH]c1,[1cH]1cc[2cH]cc1,[1:CL|2:C2],[1:CAR|2:CAR],[1:CL|2:C2],[1:CAR|2:CAR],1.0,1.348e+00,-9.900e-01': None, - 'double,5,2,[2ClH].Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,[n]1[2cH]cc[1cH]c1,[2cH]1cc[1cH]cc1,[2:CL|1:C2],[2:CAR|1:CAR],[2:CL|1:C2],[2:CAR|1:CAR],1.0,1.348e+00,-9.900e-01': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],2.0,-6.697e-01,-6.091e+01': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],2.0,-6.697e-01,-6.091e+01': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][n][2nH]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],2.0,-6.697e-01,-6.091e+01': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][n][1nH]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],2.0,-6.697e-01,-6.091e+01': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],2.0,-6.697e-01,-6.091e+01': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],2.0,-6.697e-01,-6.091e+01': None, - 'double,3,5,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Brc1[1cH][2nH][n]c1C,Fc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],2.0,-6.697e-01,-6.091e+01': None, - 'double,3,5,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Brc1[2cH][1nH][n]c1C,Fc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],2.0,-6.697e-01,-6.091e+01': None, - 
'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],0.5,6.697e-01,6.091e+01': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],0.5,6.697e-01,6.091e+01': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][n][2nH]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],0.5,6.697e-01,6.091e+01': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][n][1nH]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],0.5,6.697e-01,6.091e+01': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][n][2nH]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],0.5,6.697e-01,6.091e+01': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][n][1nH]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],0.5,6.697e-01,6.091e+01': None, - 'double,5,3,[1CH4].Clc1[n]cc(c2[n][2cH]sc2)cc1,Fc1[1cH][2nH][n]c1C,Brc1[1cH][2nH][n]c1C,[1:C3|2:C2],[1:C2|2:NPL3],[1:C3|2:C2],[1:C2|2:NPL3],0.5,6.697e-01,6.091e+01': None, - 'double,5,3,[2CH4].Clc1[n]cc(c2[n][1cH]sc2)cc1,Fc1[2cH][1nH][n]c1C,Brc1[2cH][1nH][n]c1C,[2:C3|1:C2],[2:C2|1:NPL3],[2:C3|1:C2],[2:C2|1:NPL3],0.5,6.697e-01,6.091e+01': None} - - for key, value in self.test_dataset_goldenoutput_pairs_and_diffs.items(): - tmp_key_list = key.split(',') - tmp_key_list.pop() - tmp_key_list.pop() - tmp_key_list.pop() - tmp_key = ",".join(tmp_key_list) - tmp_key = tmp_key.rstrip(', ').rstrip(',') - self.test_dataset_goldenoutput_pairs_and_diffs3[tmp_key] = None - - # write test data to temp file (smi) - for smi_id, smi in list(self.test_dataset_goldeninput_smi_01.items()): - self.temp_file_input_smi_01.write(smi + " " + smi_id + "\n") - self.temp_file_input_smi_01.close() - - for smi_id, smi in list(self.test_dataset_goldeninput_smi_01b.items()): 
- self.temp_file_input_smi_01b.write(smi + " " + smi_id + "\n") - self.temp_file_input_smi_01b.close() - - for smi_id, smi in list(self.test_dataset_goldeninput_smi_02.items()): - self.temp_file_input_smi_02.write(smi + " " + str(smi_id) + "\n") - self.temp_file_input_smi_02.close() - - # write test data to temp file (csv) - self.temp_file_input_csv.write(', '.join(self.test_dataset_goldeninput_csv_headers) + "\n") - for data in list(self.test_dataset_goldeninput_csv_data.keys()): - self.temp_file_input_csv.write(data + "\n") - self.temp_file_input_csv.close() - - # container for results data - self.test_dataset_testresults = {} - - def tearDown(self): - """Tear down object for clean reuse in further tests""" - # clean out the object - self.test_mmp_data_object.clean_out_data() - # clean out the temp data store - self.test_dataset_testresults.clear() - - os.remove(self.temp_file_input_smi_01.name) - os.remove(self.temp_file_input_csv.name) - os.remove(self.temp_file_output_pairs.name) - - def test_build_comparison_smidict(self): - """ """ - self.test_mmp_data_object.build_comparison_smidict(self.temp_file_input_smi_02.name, add_prop_data=2) - # not the best test but simple method simple test - self.assertEqual(self.test_mmp_data_object.mol_smi_comparison_dict, self.test_dataset_goldeninput_smi_02) - - self.assertEqual(len(self.test_mmp_data_object.mol_props_dict), 0) - self.assertEqual(len(self.test_mmp_data_object.mol_props_comparison_dict), 4) - - def test_csv_sniffer(self): - """Test method to sniff/prescan CSV file to determine format""" - - # 1. 
scan CSV - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - - # - # Equality test on base objects - # - # print(self.test_mmp_data_object.headers_nosmi) - self.assertEqual(self.test_mmp_data_object.headers_nosmi, - ['ACT_A', 'ACT_B', 'OTHER_INFO', 'CATEGORICAL_DATA_1', 'CATEGORICAL_DATA_2'] - ) - # print(self.test_mmp_data_object.headers_numeric_position) - self.assertEqual(self.test_mmp_data_object.headers_numeric_position, - {2: 'ACT_A', 3: 'ACT_B', 5: 'CATEGORICAL_DATA_1', 6: 'CATEGORICAL_DATA_2'} - ) - # print(self.test_mmp_data_object.csv_items_per_line) - self.assertEqual(self.test_mmp_data_object.csv_items_per_line, 7) - # print(self.test_mmp_data_object.smi_col_num) - self.assertEqual(self.test_mmp_data_object.smi_col_num, 1) - # print(self.test_mmp_data_object.molid_col_num) - self.assertEqual(self.test_mmp_data_object.molid_col_num, 0) - - # print(self.test_mmp_data_object.csv_dialect.__name__) - self.assertEqual(self.test_mmp_data_object.csv_dialect.__name__, 'dialect') - - def test_csv_sniffer_skip_num_check(self): - """Test method to sniff/prescan CSV file to determine format""" - - # 1. 
scan CSV - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID', skip_num_check=True) - - # - # Equality test on base objects - # - # print(self.test_mmp_data_object.headers_nosmi) - self.assertEqual(self.test_mmp_data_object.headers_nosmi, - ['ACT_A', 'ACT_B', 'OTHER_INFO', 'CATEGORICAL_DATA_1', 'CATEGORICAL_DATA_2'] - ) - # print(self.test_mmp_data_object.headers_numeric_position) - self.assertEqual(self.test_mmp_data_object.headers_numeric_position, - # without skip_num_check set as True we got this: - # {2: 'ACT_A', 3: 'ACT_B', 5: 'CATEGORICAL_DATA_1', 6: 'CATEGORICAL_DATA_2'} - {2: 'ACT_A', 3: 'ACT_B', 4: 'OTHER_INFO', 5: 'CATEGORICAL_DATA_1', 6: 'CATEGORICAL_DATA_2'} - ) - # - # Tested again as per test_csv_sniffer, not really needed here but inc anyway to reassert - # - # print(self.test_mmp_data_object.csv_items_per_line) - self.assertEqual(self.test_mmp_data_object.csv_items_per_line, 7) - # print(self.test_mmp_data_object.smi_col_num) - self.assertEqual(self.test_mmp_data_object.smi_col_num, 1) - # print(self.test_mmp_data_object.molid_col_num) - self.assertEqual(self.test_mmp_data_object.molid_col_num, 0) - - # print(self.test_mmp_data_object.csv_dialect.__name__) - self.assertEqual(self.test_mmp_data_object.csv_dialect.__name__, 'dialect') - - def test_csv_to_data_objects(self): - """Test method to load CSV file into data objects assuming it's already been prescanned""" - - # 2. 
scan and parse CSV - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID') - - # equality test - # print(self.test_mmp_data_object.mol_smi_dict) - self.assertEqual(self.test_mmp_data_object.mol_smi_dict, - {1: 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(Br)C(=N1)C)C', - 2: 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(F)C(=N1)C)C', - 3: 'N1(C2=NC(=CS2)C2=CN=C(Cl)C=C2)C(=C(Br)C(=N1)C)C', - 4: 'N1(C2=NC(=CS2)C2=NC=C(Cl)C=C2)C(=C(Br)C(=N1)C)C', - 5: 'N1(C2=NC(=CS2)C2=CN=C(Cl)C=C2)C(=C(F)C(=N1)C)C'} - ) - - # print(self.test_mmp_data_object.mol_data_dict) - self.assertEqual(self.test_mmp_data_object.mol_data_dict, - {1: [30.0, 25.0, 0.0, 0.0], - 2: [20.0, 25.0, 1.0, 0.0], - 3: [10.0, 15.0, 1.0, 1.0], - 4: [50.0, 60.0, 0.0, 1.0], - 5: [20.0, 30.0, 1.0, 1.0]} - ) - - def test_write_mol_smi_dict_tofile(self): - """Test method to write out the SMI data into a SMI file""" - - # 3. scan and parse CSV then write SMILES file back out - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID') - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - - # now read it back into temp object and check it's what we wrote out! - test_results_filehandle = open(tmp_dicer_file, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[int(line.split()[1])] = line.split()[0] - test_results_filehandle.close() - - self.assertEqual(self.test_dataset_testresults, self.test_mmp_data_object.mol_smi_dict) - - def test_build_from_dicer(self): - """Test method to read DICER output into pairs dicts""" - - # 4. 
build pairs dicts from dicer output - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID') - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - self.test_mmp_data_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - - # Below refsmi_dict gets too big to print so we'll simply check num entries - # print(self.test_mmp_data_object.refsmi_dict) - # print(len(self.test_mmp_data_object.refsmi_dict)) - self.assertEqual(len(self.test_mmp_data_object.refsmi_dict), 160) - - # Below single_pairs_dict gets too big to print so we'll simply check num entries - # print(self.test_mmp_data_object.single_pairs_dict) - # print(len(self.test_mmp_data_object.single_pairs_dict)) - self.assertEqual(len(self.test_mmp_data_object.single_pairs_dict), 51) - - #print(self.test_mmp_data_object.double_pairs_dict) - self.assertEqual(self.test_mmp_data_object.double_pairs_dict, - {50: {13: (3, 4)}, 99: {43: (6, 4), 76: (6, 4), 493: (6, 4), 558: (6, 4)}, 118: {103: (8, 9)}, - 649: {208: (12, 13), 1481: (12, 13), 3155: (18, 13)}, 555: {18: (15, 4)}, - 1161: {627: (17, 9)}, 1414: {228: (12, 13), 1590: (12, 13)}, 1171: {24: (3, 4)}, - 1369: {62: (6, 4), 101: (6, 4), 589: (6, 4), 660: (6, 4)}, 1223: {132: (8, 9)}, - 2528: {39: (15, 4)}, 3369: {735: (17, 9)}, 2623: {31: (3, 4)}, - 2917: {73: (6, 4), 115: (6, 4)}, 2699: {148: (8, 9)}} - ) - - def test_generate_std_smiles(self): - """Test method to standardise smiles""" - - # scan and parse CSV then write SMILES file back out - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID') - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - - self.test_mmp_data_object.mol_smi_dict = self.test_mmp_data_object.generate_std_smiles(tmp_dicer_file, - smi_id_map='both') - 
# print self.test_mmp_data_object.mol_smi_dict - self.assertEqual( - {1: 'Brc1c([n][n](c2scc([n]2)c2ccc(Cl)cc2)c1C)C', - 2: 'Clc1ccc(c2csc([n]3[n]c(c(F)c3C)C)[n]2)cc1', - 3: 'Brc1c([n][n](c2scc([n]2)c2c[n]c(Cl)cc2)c1C)C', - 4: 'Brc1c([n][n](c2scc(c3[n]cc(Cl)cc3)[n]2)c1C)C', - 5: 'Clc1[n]cc(c2csc([n]3[n]c(c(F)c3C)C)[n]2)cc1', - 'Brc1c([n][n](c2scc([n]2)c2ccc(Cl)cc2)c1C)C': 1, - 'Brc1c([n][n](c2scc(c3[n]cc(Cl)cc3)[n]2)c1C)C': 4, - 'Clc1ccc(c2csc([n]3[n]c(c(F)c3C)C)[n]2)cc1': 2, - 'Brc1c([n][n](c2scc([n]2)c2c[n]c(Cl)cc2)c1C)C': 3, - 'Clc1[n]cc(c2csc([n]3[n]c(c(F)c3C)C)[n]2)cc1': 5}, - self.test_mmp_data_object.mol_smi_dict - ) - - def test_get_pairs_and_diffs(self): - """Test method to get pairs and pair diffs""" - - # 5. full build then write of pairs to file - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID') - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - self.test_mmp_data_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - self.test_mmp_data_object.get_pairs_and_diffs(self.temp_file_output_pairs.name, 'BOTH') - - # now read it back into temp object and check it's what we wrote out! - test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - test_results_filehandle.close() - # print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_goldenoutput_pairs_and_diffs, self.test_dataset_testresults) - - def test_get_pairs_and_diffs_withnamedcol(self): - """Test method to get pairs and pair diffs""" - - # 5b. 
full build then write of pairs to file, but only for a single named column - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID', - act_data_col='ACT_A') - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - self.test_mmp_data_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - self.test_mmp_data_object.get_pairs_and_diffs(self.temp_file_output_pairs.name, 'BOTH') - - # now read it back into temp object and check it's what we wrote out! - test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - test_results_filehandle.close() - - # print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_goldenoutput_pairs_and_diffs3, self.test_dataset_testresults) - - def test_get_pairs_and_diffs_withpropdiffs(self): - """Test method to get pairs and pair diffs with added property difference""" - - # 5b. full build then write of pairs to file, but only for a single named column - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID', - act_data_col='ACT_A', add_prop_diff=1) - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - self.test_mmp_data_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - self.test_mmp_data_object.get_pairs_and_diffs(self.temp_file_output_pairs.name, 'BOTH', - fold_diff=True) - - # now read it back into temp object and check it's what we wrote out! 
- test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - test_results_filehandle.close() - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_goldenoutput_pairs_and_diffs4, self.test_dataset_testresults) - - def test_get_pairs_and_diffs_withpropdiffs_and_qmetric(self): - """Test method to get pairs and pair diffs with added property difference""" - - # 5b. full build then write of pairs to file, but only for a single named column - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID', - act_data_col='ACT_A', add_prop_diff=1) - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - self.test_mmp_data_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - self.test_mmp_data_object.get_pairs_and_diffs(self.temp_file_output_pairs.name, 'BOTH', - fold_diff=True, add_qmetric=True) - - # now read it back into temp object and check it's what we wrote out! 
- idx_ = 0 - with open(self.temp_file_output_pairs.name, mode='r') as results_file: - reader = csv.reader(results_file) - for line in reader: - idx_ += 1 - self.test_dataset_testresults[idx_] = line - - headers_ = ['CUT', 'MOLID_L', 'MOLID_R', 'CONTEXT', 'FRAG_L', 'FRAG_R', - 'ATTCHPT_CTX_L', 'ATTCHPT_FRAG_L', 'ATTCHPT_CTX_R', 'ATTCHPT_FRAG_R', - 'ACT_A_DIFF', 'MOL_CLP_DIFF', 'MOL_MWT_DIFF', 'MOL_L_CLP', 'MOL_R_CLP'] - - # too much data so just validate the data shape and headers - self.assertEqual(len(self.test_dataset_testresults), 65) - self.assertEqual(self.test_dataset_testresults[1], headers_) - self.assertEqual(len(self.test_dataset_testresults[10]), 15) - self.assertEqual(len(self.test_dataset_testresults[10]), 15) - self.assertIsInstance(float(self.test_dataset_testresults[10][14]), float) - - def test_get_pairs_and_diffs_withpropdiffs_v2(self): - """Test method to get pairs and pair diffs with added property difference""" - - # 5b. full build then write of pairs to file, but only for a single named column - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID', - act_data_col='ACT_A', add_prop_diff=1) - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - self.test_mmp_data_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - self.test_mmp_data_object.get_pairs_and_diffs(self.temp_file_output_pairs.name, 'BOTH', - fold_diff=True) - - # now read it back into temp object and check it's what we wrote out! 
- idx_ = 0 - #print(self.temp_file_output_pairs.name) - with open(self.temp_file_output_pairs.name, mode='r') as results_file: - reader = csv.reader(results_file) - for line in reader: - idx_ += 1 - self.test_dataset_testresults[idx_] = line - - headers_ = ['CUT', 'MOLID_L', 'MOLID_R', 'CONTEXT', 'FRAG_L', 'FRAG_R', 'ATTCHPT_CTX_L', - 'ATTCHPT_FRAG_L', 'ATTCHPT_CTX_R', 'ATTCHPT_FRAG_R', 'ACT_A_DIFF', 'MOL_CLP_DIFF', 'MOL_MWT_DIFF'] - - # too much data so just validate the data shape and headers - self.assertEqual(len(self.test_dataset_testresults), 65) - self.assertEqual(self.test_dataset_testresults[1], headers_) - self.assertGreater(len(self.test_dataset_testresults[10]), 10) - self.assertGreater(len(self.test_dataset_testresults[10]), 10) - # print float(self.test_dataset_testresults[10][14]) - self.assertIsInstance(float(self.test_dataset_testresults[10][11]), float) - # not sure this next one will work as we cannot guarantee the order of the rows in the csv - # self.assertEquals(float(self.test_dataset_testresults[10][14]), 4.222) - # print(self.test_dataset_testresults) - # self.assertEqual(self.test_dataset_goldenoutput_pairs_and_diffs4, self.test_dataset_testresults) - - def test_get_pairs_and_diffs_manypropdiffs(self): - """Test method to get pairs and pair diffs with a huge number of added property difference""" - - # 5c. full build then write of pairs to file, but only for a single named column - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID', - act_data_col='ACT_A', add_prop_diff=2) - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - self.test_mmp_data_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - self.test_mmp_data_object.get_pairs_and_diffs(self.temp_file_output_pairs.name, 'BOTH', fold_diff=False) - - # now read it back into temp object and check it's what we wrote out! 
- idx_ = 0 - with open(self.temp_file_output_pairs.name, mode='r') as results_file: - reader = csv.reader(results_file) - for line in reader: - idx_ += 1 - self.test_dataset_testresults[idx_] = line - - # check the shape as too much data to store - self.assertEqual(len(self.test_dataset_testresults), 65) - self.assertGreater(len(self.test_dataset_testresults[1]), 200) - - def test_use_of_comparison_dicts(self): - """Functional test where we load smiles and data into main data objects and check they are - populated with no data leaks into comparison data objects. We then clean up and do the - to ensure when loading comparison smiles we don't contaminate the main data stores""" - - # 5c. full build - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID', - act_data_col='ACT_A', add_prop_diff=2) - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - self.test_mmp_data_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - - # ensure we only have data in main data storage containers not comparison ones - self.assertEqual(len(self.test_mmp_data_object.mol_data_dict), 5) - self.assertEqual(len(self.test_mmp_data_object.mol_props_dict), 5) - self.assertGreater(len(self.test_mmp_data_object.mol_props_headers), 200) - self.assertGreater(len(self.test_mmp_data_object.mol_props_headers_numeric_position), 200) - self.assertEqual(len(self.test_mmp_data_object.mol_smi_dict), 5) - self.assertEqual(len(self.test_mmp_data_object.single_pairs_dict), 51) - self.assertEqual(len(self.test_mmp_data_object.double_pairs_dict), 15) - # comparison dicts - self.assertEqual(len(self.test_mmp_data_object.mol_props_comparison_dict), 0) - self.assertEqual(len(self.test_mmp_data_object.mol_props_comparison_headers), 0) - self.assertEqual(len(self.test_mmp_data_object.mol_props_comparison_headers_numeric_position), 0) - 
self.assertEqual(len(self.test_mmp_data_object.mol_smi_comparison_dict), 0) - self.assertEqual(len(self.test_mmp_data_object.single_pairs_comparison_dict), 0) - self.assertEqual(len(self.test_mmp_data_object.double_pairs_comparison_dict), 0) - - # stat again but now use comparison dict - self.test_mmp_data_object.clean_out_data() - # - self.test_mmp_data_object.build_from_dicer(self.temp_file_input_smi_01.name, 'BOTH', 'NONE', - use_comparison_dict=True) - self.test_mmp_data_object.build_comparison_smidict(self.temp_file_input_smi_01.name, - add_prop_data=2) - - # ensure we only have data in comparison data storage containers not main ones - # ensure we only have data in main data storage containers not comparison ones - self.assertEqual(len(self.test_mmp_data_object.mol_data_dict), 0) - self.assertEqual(len(self.test_mmp_data_object.mol_props_dict), 0) - self.assertEqual(len(self.test_mmp_data_object.mol_props_headers), 0) - self.assertEqual(len(self.test_mmp_data_object.mol_props_headers_numeric_position), 0) - self.assertEqual(len(self.test_mmp_data_object.mol_smi_dict), 0) - self.assertEqual(len(self.test_mmp_data_object.single_pairs_dict), 0) - self.assertEqual(len(self.test_mmp_data_object.double_pairs_dict), 0) - # comparison dicts - self.assertEqual(len(self.test_mmp_data_object.mol_props_comparison_dict), 5) - self.assertGreater(len(self.test_mmp_data_object.mol_props_comparison_headers), 200) - self.assertGreater(len(self.test_mmp_data_object.mol_props_comparison_headers_numeric_position), 200) - self.assertEqual(len(self.test_mmp_data_object.mol_smi_comparison_dict), 5) - self.assertEqual(len(self.test_mmp_data_object.single_pairs_comparison_dict), 51) - self.assertEqual(len(self.test_mmp_data_object.double_pairs_comparison_dict), 15) - - def test_get_pairs_and_diffs_with_header_types(self): - """Test method to get pairs and pair diffs with additional header line""" - - # 6. 
full build then write of pairs to file - self.test_mmp_data_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'ID') - self.test_mmp_data_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'ID') - tmp_dicer_file = self.test_mmp_data_object.write_mol_smi_dict_tofile() - self.test_mmp_data_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - self.test_mmp_data_object.get_pairs_and_diffs(self.temp_file_output_pairs.name, 'BOTH', inc_types_header=True) - - # now read it back into temp object and check it's what we wrote out! - test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - test_results_filehandle.close() - - # print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_goldenoutput_pairs_and_diffs2, self.test_dataset_testresults) - - -if __name__ == '__main__': - unittest.main() - diff --git a/contrib/script/py/mmp/mmp_dicer_functions.py b/contrib/script/py/mmp/mmp_dicer_functions.py deleted file mode 100755 index cb238214..00000000 --- a/contrib/script/py/mmp/mmp_dicer_functions.py +++ /dev/null @@ -1,384 +0,0 @@ -################################################################### -""" Summary: Builds dicer commend line for MMP generation JAL - -Notes: - -G iso01 (force deprioritisation of isomeric numbering in canonicalisation) - -B nosmi (removes need for grep -v "B=" | grep -v "FRAGID") - -c (discard chirality) - -B lostchiral (check each fragment for lost chirality) - -m 0 (include H cuts) - -k 2 (single and double cuts, 1 = single only) - -i ICTE will skip connection table errors rather than fail silently half way through with exit code 0 - - -B atype=sb (write smiles and complementary smiles with atom type labels on break points i.e.: attachment points) - -B MAXFF= (discard fragments that comprise more than fraction of atoms in parent) - -C auto (write 
smiles and complementary smiles with auto label on break points) - -s "ClC" -s "BrC" -s "FC" (smarts for cut points - breaks bond between 1st and 2nd matched atom) - -B addq (run the -q queries in addition to the default rules) - -B bscb (allow Carbon-Carbon single bonds to break) - -M 15 (discard fragments with more than 15 atoms) - -X 5000 (maximum number of fragments per molecule to produce) - -A I (enable input of aromatic structures) - -A D (use Daylight aromaticity definitions) - - Generates fragment and complimentary frag - can still get the same cansmi with different isomeric numbering - [1xx]xxxxx[2xx] or [2xx]xxxxx[1xx] - For complimentary part (context) - Guarantee first fragment is labelled 1 always, second fragment is 2: - [1xx].xxxxx[2xx]xxxx - -""" -########################################################################## -import sys -import os -import logging -from subprocess import Popen, PIPE - -import unittest -import tempfile - - -def build_mmpdicer_cmd(smi_fi, cut_type, threshold, return_threshold=False): - - try: - home_dir = os.environ['C3TK_HOME'] - - except KeyError: - home_dir = os.environ['LILLYMOL_HOME'] - - except KeyError: - sys.exit("Failed to identiy home dir, please set C3TK_HOME or LILLYMOL_HOME") - - build_dir = 'Linux' - - try: - build_dir = os.environ['BUILD_DIR'] - except KeyError: - pass - - root_dir = home_dir + '/bin/' + build_dir - - dicer_binary = root_dir + '/dicer' - - try: - # default was 0.50001 - # smaller value like 0.3 will result in less fragmentation (smaller fragments produced) - maxff = float(threshold) - except: - sys.exit('Cannot cast threshold value to float: %s' % threshold) - - if threshold < 0.01 or threshold > 0.999: - sys.exit('Invalid input to dicer command line (build_mmpdicer_cmd) please try >= 0.1 and <= 0.9') - - # full command line, see comment above - dicer_cmd = '-B atype=sb -B MAXFF=' + str(maxff) + \ - ' -C auto -s "ClC" -s "BrC" -s "FC" -B addq -B bscb -m 0 -M 15 -X 5000 -i smi -A I -A D' \ - 
' -B nosmi -G iso01 -c -i ICTE' - - if cut_type.upper() == 'DOUBLE': - dicer_cut_opt = '-k 2' - - elif cut_type.upper() == 'BOTH': - dicer_cut_opt = '-k 2' - - elif cut_type.upper() == 'SINGLE': - dicer_cut_opt = '-k 1' - - else: - dicer_cut_opt = '-k 1' - - if return_threshold: - dicer_cmd += " -B appnatoms" - - dicer_full_cmd = dicer_binary + " " + dicer_cmd + " " + dicer_cut_opt + " " + smi_fi - - return dicer_full_cmd - - -def invoke_dicer_subprocess(dicer_cmd, bufsize): - - return Popen(dicer_cmd, stdout=PIPE, stderr=PIPE, shell=True, bufsize=-1) - - -def subprocess_dicer_communicate(dicer_cmd, bufsize): - - proc = invoke_dicer_subprocess(dicer_cmd, bufsize) - - while proc.returncode is None: - - (stdout, stderr) = proc.communicate() - for line in stdout.decode('utf-8').splitlines(): - - yield line - - if proc.returncode != 0: - - sys.exit("Please check your input SMI, the dicer code failed with exit code %s" % str(proc.returncode)) - - -def execute_dicer(smi_fi, cut_type, threshold, logger_object, return_threshold=False): - - logger = logger_object - - if len(logging.Logger.manager.loggerDict) < 1: - # exit with system status 1 and custom error - sys.exit("Invalid or no logger object passed to MMPObjectClass. 
Please create \ - and pass a logger and set to use logging.disable if you don't want logging") - - dicer_cmd = build_mmpdicer_cmd(smi_fi, cut_type, threshold, return_threshold=return_threshold) - - logger.info('Attempting execution of dicer:\n %s' % dicer_cmd) - - for line in subprocess_dicer_communicate(dicer_cmd, 1): - - # logger.debug('Parsing Dicer output...') - - line = line.strip('\r') - line = line.strip('\n') - line_list = line.split() - - # - # expect to see all lines in following format: - # [1CH4] --some id-- AT=[1:C2] --some smiles-- COMP --some id-- AT=[1:C3] - if return_threshold: - try: - mol_id = int(line_list[1]) - ctx_orig = str(line_list[5].lstrip('0')) - frag = str(line_list[0]) - # these next two items are the fragment and context attachment points - fattach = str(line_list[4]).replace("AT=", "") - cattach = str(line_list[8]).replace("AT=", "") - cut_threshold_frag = float(line_list[3]) - cut_num_atoms_frag = float(line_list[2]) - cut_threshold_ctx = float(line_list[10]) - cut_num_atoms_ctx = float(line_list[9]) - except: - logger.warn('Error parsing Dicer output (with threshold), unexpected line format: %s' % line) - try: - mol_id = int(line_list[1]) - except: - logger.warn('Looks like your Input ID is not an integer') - continue - else: - try: - mol_id = int(line_list[1]) - ctx_orig = str(line_list[3].lstrip('0')) - frag = str(line_list[0]) - # these next two items are the fragment and context attachment points - fattach = str(line_list[2]).replace("AT=", "") - cattach = str(line_list[6]).replace("AT=", "") - except: - logger.warn('Error parsing Dicer output, unexpected line format: %s' % line) - try: - mol_id = int(line_list[1]) - except: - logger.warn('Looks like your Input ID is not an integer') - continue - - # now separate out single from double cuts - # we can do this by counting the number of dot disconnected smiles in the context smi string - num_cuts = ctx_orig.count('.') + 1 - - if return_threshold: - yield num_cuts, mol_id, 
ctx_orig, frag, fattach, cattach, \ - cut_num_atoms_frag, cut_threshold_frag, cut_num_atoms_ctx, cut_threshold_ctx - else: - yield num_cuts, mol_id, ctx_orig, frag, fattach, cattach - - -class _TestMMPDicerFunctions(unittest.TestCase): - - """Test class for Dicer execution - """ - - def setUp(self): - - self.maxDiff = None - - self.temp_file_input_smi = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - self.temp_file_output_data = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - - self.mmplogger = logging.getLogger('mmpobjectclass_testlogger') - logging.disable(logging.CRITICAL) - - self.test_dataset_input_smi_01 = { - # All the following test data is from CHEMBL - # CHEMBL2105127 https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL2105127/ - '2105127': 'CC1(CC(=O)N(CN2CCOCC2)C1=O)c3ccccc3', - # CHEMBL3989502 https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL3989502/ - '3989502': 'CN1[C@@H]2CC[C@H]1C[C@@H](C2)OC3c4ccccc4N(C)S(=O)(=O)c5ccccc35', - } - - self.test_dataset_goldenoutput_fragmentedsmols = { - (1, 2105127, 'O=C1N(CN2CCOCC2)C(=O)C[1CH]1c1ccccc1', '[1CH4]', '[1:C3]', '[1:C3]'): None, - (1, 2105127, 'CC1(c2ccccc2)CC(=O)[1NH]C1=O', '[1CH3]N1CCOCC1', '[1:NAM]', '[1:C3]'): None, - (1, 2105127, '[1CH3]N1C(=O)C(c2ccccc2)(CC1=O)C', 'O1CC[1NH]CC1', '[1:C3]', '[1:N3]'): None, - (1, 2105127, 'C[1CH]1CC(=O)N(CN2CCOCC2)C1=O', '[1cH]1ccccc1', '[1:C3]', '[1:CAR]'): None, - (2, 2105127, '[1CH3]N1CCOCC1.[2cH]1ccccc1', 'C[2CH]1CC(=O)[1NH]C1=O', '[2:CAR|1:C3]', '[1:NAM|2:C3]'): None, - (2, 2105127, 'CC1(c2ccccc2)CC(=O)[1NH]C1=O.O1CC[2NH]CC1', '[12CH4]', '[2:N3|1:NAM]', '[1:C3|2:C3]'): None, - (2, 2105127, 'O1CC[1NH]CC1.[2cH]1ccccc1', '[1CH3]N1C(=O)[2CH](CC1=O)C', '[1:N3|2:CAR]', '[1:C3|2:C3]'): None, - (1, 2105127, '[1CH3]C1(c2ccccc2)CC(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 2105127, 'CC1(c2ccccc2)[1CH2]C(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 2105127, 
'CC1(c2ccccc2)CC(=O)N([1CH2]N2CCOCC2)C1=O', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 2105127, 'CC1(c2ccccc2)CC(=O)N(CN2[1CH2]COCC2)C1=O', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 2105127, 'CC1(c2ccccc2)CC(=O)N(CN2CCO[1CH2]C2)C1=O', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 2105127, 'CC1(c2[1cH]cccc2)CC(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 2105127, 'CC1(c2c[1cH]ccc2)CC(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 2105127, 'CC1(c2cc[1cH]cc2)CC(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 3989502, 'CN1S(=O)(=O)c2c(C(OC3CC4[1NH]C(C3)CC4)c3c1cccc3)cccc2', '[1CH4]', '[1:N3]', '[1:C3]'): None, - (2, 3989502, '[1CH4].[2OH]C1c2c(N(S(=O)(=O)c3c1cccc3)C)cccc2', '[1NH]1C2C[2CH2]CC1CC2', '[1:C3|2:O3]', '[1:N3|2:C3]'): None, - (2, 3989502, '[1CH4].CN1S(=O)(=O)c2c([2CH2]c3c1cccc3)cccc2', '[2OH]C1CC2[1NH]C(C1)CC2', '[2:C3|1:C3]', '[1:N3|2:O3]'): None, - (1, 3989502, 'CN1C2CC(OC3c4c([1NH]S(=O)(=O)c5c3cccc5)cccc4)CC1CC2', '[1CH4]', '[1:N3]', '[1:C3]'): None, - (1, 3989502, '[1OH]C1c2c(N(S(=O)(=O)c3c1cccc3)C)cccc2', 'CN1C2C[1CH2]CC1CC2', '[1:O3]', '[1:C3]'): None, - (2, 3989502, 'CN1C2C[1CH2]CC1CC2.CN1S(=O)(=O)c2c([2CH2]c3c1cccc3)cccc2', '[12OH2]', '[2:C3|1:C3]', '[1:O3|2:O3]'): None, - (1, 3989502, 'CN1S(=O)(=O)c2c([1CH2]c3c1cccc3)cccc2', '[1OH]C1CC2N(C)C(C1)CC2', '[1:C3]', '[1:O3]'): None, - (1, 3989502, '[1CH3]N1C2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)CC1CC2', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 3989502, 'CN1[1CH]2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)CC1CC2', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 3989502, 'CN1C2C[1CH2]C1CC(OC1c3c(N(S(=O)(=O)c4c1cccc4)C)cccc3)C2', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)[1CH2]C1CC2', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 3989502, 'CN1C2C[1CH](OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)CC1CC2', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 3989502, 'CN1C2CC(O[1CH]3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)CC1CC2', '[1H]', '[1:C3]', '[1:H]'): None, - 
(1, 3989502, 'CN1C2CC(OC3c4[1cH]cccc4N(S(=O)(=O)c4c3cccc4)C)CC1CC2', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cc[1cH]c4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)c[1cH]cc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 3989502, 'CN1C2CC(OC3c4c(S(=O)(=O)N(C)c5[1cH]cccc35)cccc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 3989502, '[1CH3]N1S(=O)(=O)c2c(C(OC3CC4N(C)C(C3)CC4)c3c1cccc3)cccc2', '[1H]', '[1:C3]', '[1:H]'): None, - (1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5[1cH]cccc35)C)cccc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3cc[1cH]c5)C)cccc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3c[1cH]cc5)C)cccc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]'): None, - (1, 3989502, 'CN1C2CC(OC3c4[1cH]cccc4S(=O)(=O)N(c4c3cccc4)C)CC1CC2', '[1H]', '[1:CAR]', '[1:H]'): None} - - self.test_dataset_goldenoutput_fragmentedsmols_withthreshold = { - (1, 2105127, 'O=C1N(CN2CCOCC2)C(=O)C[1CH]1c1ccccc1', '[1CH4]', '[1:C3]', '[1:C3]', 1.0, 0.04761905, 20.0, - 0.952381): None, ( - 1, 2105127, 'CC1(c2ccccc2)CC(=O)[1NH]C1=O', '[1CH3]N1CCOCC1', '[1:NAM]', '[1:C3]', 7.0, 0.3333333, 14.0, - 0.6666667): None, ( - 1, 2105127, '[1CH3]N1C(=O)C(c2ccccc2)(CC1=O)C', 'O1CC[1NH]CC1', '[1:C3]', '[1:N3]', 6.0, 0.2857143, 15.0, - 0.7142857): None, ( - 1, 2105127, 'C[1CH]1CC(=O)N(CN2CCOCC2)C1=O', '[1cH]1ccccc1', '[1:C3]', '[1:CAR]', 6.0, 0.2857143, 15.0, - 0.7142857): None, ( - 2, 2105127, '[1CH3]N1CCOCC1.[2cH]1ccccc1', 'C[2CH]1CC(=O)[1NH]C1=O', '[2:CAR|1:C3]', '[1:NAM|2:C3]', 8.0, - 0.3809524, 13.0, 0.6190476): None, ( - 2, 2105127, 'CC1(c2ccccc2)CC(=O)[1NH]C1=O.O1CC[2NH]CC1', '[12CH4]', '[2:N3|1:NAM]', '[1:C3|2:C3]', 1.0, - 0.04761905, 20.0, 0.952381): None, ( - 2, 2105127, 'O1CC[1NH]CC1.[2cH]1ccccc1', '[1CH3]N1C(=O)[2CH](CC1=O)C', '[1:N3|2:CAR]', '[1:C3|2:C3]', 9.0, - 0.4285714, 12.0, 0.5714286): None, ( - 1, 
2105127, '[1CH3]C1(c2ccccc2)CC(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:C3]', '[1:H]', 1.0, 0.04761905, 21.0, - 1.0): None, ( - 1, 2105127, 'CC1(c2ccccc2)[1CH2]C(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:C3]', '[1:H]', 1.0, 0.04761905, 21.0, - 1.0): None, ( - 1, 2105127, 'CC1(c2ccccc2)CC(=O)N([1CH2]N2CCOCC2)C1=O', '[1H]', '[1:C3]', '[1:H]', 1.0, 0.04761905, 21.0, - 1.0): None, ( - 1, 2105127, 'CC1(c2ccccc2)CC(=O)N(CN2[1CH2]COCC2)C1=O', '[1H]', '[1:C3]', '[1:H]', 1.0, 0.04761905, 21.0, - 1.0): None, ( - 1, 2105127, 'CC1(c2ccccc2)CC(=O)N(CN2CCO[1CH2]C2)C1=O', '[1H]', '[1:C3]', '[1:H]', 1.0, 0.04761905, 21.0, - 1.0): None, ( - 1, 2105127, 'CC1(c2[1cH]cccc2)CC(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:CAR]', '[1:H]', 1.0, 0.04761905, 21.0, - 1.0): None, ( - 1, 2105127, 'CC1(c2c[1cH]ccc2)CC(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:CAR]', '[1:H]', 1.0, 0.04761905, 21.0, - 1.0): None, ( - 1, 2105127, 'CC1(c2cc[1cH]cc2)CC(=O)N(CN2CCOCC2)C1=O', '[1H]', '[1:CAR]', '[1:H]', 1.0, 0.04761905, 21.0, - 1.0): None, ( - 1, 3989502, 'CN1S(=O)(=O)c2c(C(OC3CC4[1NH]C(C3)CC4)c3c1cccc3)cccc2', '[1CH4]', '[1:N3]', '[1:C3]', 1.0, - 0.03571429, 27.0, 0.9642857): None, ( - 2, 3989502, '[1CH4].[2OH]C1c2c(N(S(=O)(=O)c3c1cccc3)C)cccc2', '[1NH]1C2C[2CH2]CC1CC2', '[1:C3|2:O3]', - '[1:N3|2:C3]', 8.0, 0.2857143, 20.0, 0.7142857): None, ( - 2, 3989502, '[1CH4].CN1S(=O)(=O)c2c([2CH2]c3c1cccc3)cccc2', '[2OH]C1CC2[1NH]C(C1)CC2', '[2:C3|1:C3]', - '[1:N3|2:O3]', 9.0, 0.3214286, 19.0, 0.6785714): None, ( - 1, 3989502, 'CN1C2CC(OC3c4c([1NH]S(=O)(=O)c5c3cccc5)cccc4)CC1CC2', '[1CH4]', '[1:N3]', '[1:C3]', 1.0, - 0.03571429, 27.0, 0.9642857): None, ( - 1, 3989502, '[1OH]C1c2c(N(S(=O)(=O)c3c1cccc3)C)cccc2', 'CN1C2C[1CH2]CC1CC2', '[1:O3]', '[1:C3]', 9.0, - 0.3214286, 19.0, 0.6785714): None, ( - 2, 3989502, 'CN1C2C[1CH2]CC1CC2.CN1S(=O)(=O)c2c([2CH2]c3c1cccc3)cccc2', '[12OH2]', '[2:C3|1:C3]', - '[1:O3|2:O3]', 1.0, 0.03571429, 27.0, 0.9642857): None, ( - 1, 3989502, 'CN1S(=O)(=O)c2c([1CH2]c3c1cccc3)cccc2', '[1OH]C1CC2N(C)C(C1)CC2', '[1:C3]', 
'[1:O3]', 10.0, - 0.3571429, 18.0, 0.6428571): None, ( - 1, 3989502, '[1CH3]N1C2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)CC1CC2', '[1H]', '[1:C3]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1[1CH]2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)CC1CC2', '[1H]', '[1:C3]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2C[1CH2]C1CC(OC1c3c(N(S(=O)(=O)c4c1cccc4)C)cccc3)C2', '[1H]', '[1:C3]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)[1CH2]C1CC2', '[1H]', '[1:C3]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2C[1CH](OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)CC1CC2', '[1H]', '[1:C3]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2CC(O[1CH]3c4c(N(S(=O)(=O)c5c3cccc5)C)cccc4)CC1CC2', '[1H]', '[1:C3]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2CC(OC3c4[1cH]cccc4N(S(=O)(=O)c4c3cccc4)C)CC1CC2', '[1H]', '[1:CAR]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)cc[1cH]c4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3cccc5)C)c[1cH]cc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2CC(OC3c4c(S(=O)(=O)N(C)c5[1cH]cccc35)cccc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, '[1CH3]N1S(=O)(=O)c2c(C(OC3CC4N(C)C(C3)CC4)c3c1cccc3)cccc2', '[1H]', '[1:C3]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5[1cH]cccc35)C)cccc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3cc[1cH]c5)C)cccc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 1, 3989502, 'CN1C2CC(OC3c4c(N(S(=O)(=O)c5c3c[1cH]cc5)C)cccc4)CC1CC2', '[1H]', '[1:CAR]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None, ( - 
1, 3989502, 'CN1C2CC(OC3c4[1cH]cccc4S(=O)(=O)N(c4c3cccc4)C)CC1CC2', '[1H]', '[1:CAR]', '[1:H]', 1.0, - 0.03571429, 28.0, 1.0): None - } - - for smi_id, smi in list(self.test_dataset_input_smi_01.items()): - self.temp_file_input_smi.write(smi+" "+smi_id+"\n") - self.temp_file_input_smi.close() - - # container for results data - self.test_dataset_testresults = {} - - def test_execute_dicer(self): - """Test the dicer execution""" - - for cut_type, mol_id, ctx_orig, frag, fattach, cattach in execute_dicer(self.temp_file_input_smi.name, - 'BOTH', - 0.50001, - self.mmplogger): - self.test_dataset_testresults[cut_type, mol_id, ctx_orig, frag, fattach, cattach] = None - - # print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_fragmentedsmols) - - def test_execute_dicer_with_threshold(self): - """Test dicer execution with return threshold""" - - for cut_type, mol_id, ctx_orig, frag, fattach, cattach, na_frag, thr_frag, na_ctx, thr_ctx in \ - execute_dicer(self.temp_file_input_smi.name, - 'BOTH', - 0.50001, - self.mmplogger, - return_threshold=True): - self.test_dataset_testresults[cut_type, mol_id, ctx_orig, frag, fattach, cattach, - na_frag, thr_frag, na_ctx, thr_ctx] = None - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_fragmentedsmols_withthreshold) - - def test_execute_dicer_with_threshold2(self): - """Test dicer execution with return threshold""" - - for cut_type, mol_id, ctx_orig, frag, fattach, cattach, na_frag, thr_frag, na_ctx, thr_ctx in \ - execute_dicer(self.temp_file_input_smi.name, 'BOTH', 0.7, self.mmplogger, return_threshold=True): - self.test_dataset_testresults[cut_type, mol_id, ctx_orig, frag, fattach, cattach, - na_frag, thr_frag, na_ctx, thr_ctx] = None - - # more as we allow larger dicer fragments, default 37 - self.assertEqual(len(self.test_dataset_testresults), 41) - - def 
test_execute_dicer_with_threshold3(self): - """Test dicer execution with return threshold""" - - for cut_type, mol_id, ctx_orig, frag, fattach, cattach, na_frag, thr_frag, na_ctx, thr_ctx in \ - execute_dicer(self.temp_file_input_smi.name, 'BOTH', 0.3, self.mmplogger, return_threshold=True): - self.test_dataset_testresults[cut_type, mol_id, ctx_orig, frag, fattach, cattach, - na_frag, thr_frag, na_ctx, thr_ctx] = None - - # less as we restrict dicer to smaller fragments, default 37 - self.assertEqual(len(self.test_dataset_testresults), 31) - -if __name__ == '__main__': - unittest.main() diff --git a/contrib/script/py/mmp/mmp_enum_mols_from_pairs.py b/contrib/script/py/mmp/mmp_enum_mols_from_pairs.py deleted file mode 100755 index 95bee8f8..00000000 --- a/contrib/script/py/mmp/mmp_enum_mols_from_pairs.py +++ /dev/null @@ -1,1154 +0,0 @@ -################################################################### -""" -Summary: Enumerate new molecules from matched pairs - -About: Expects a smiles file of input that it will fragment using dicer into the -separate parts: context + frag_l. It also expects a matched pairs summary file with -columns frag_l and frag_r as minimum. Script will iterate over all possible -fragmentation patterns of every input SMI (there is usually more than one per smi) and -by searching the matched pairs summary file for matching frag_l it will then use trxn -to switch frag_l on original molecule with a matched pair frag_r, fully -enumerated new molecules from the matched pairs. 
(JAL) - -""" ######################################################## - -import csv -import logging -import os -import sys -# import re -import tempfile -import unittest - -from mmp.mmp_dicer_functions import execute_dicer - -if "LILLYMOL_HOME" in os.environ: - sys.path.insert(0, os.getenv("LILLYMOL_HOME") + "/contrib/script/py/pybase") - import pyopmo as pymo -else: - import pybase.pymo as pymo - - -class MMPEnumerateNewMols(object): - """Object and methods for querying a set of MMPs matching input mol for transfomation - Instantiation of the object requires a valid python logger object to be - passed in as a parameter, even if the logger is switched off. - - Example usage: - mmplogger = logging.getLogger('lillymol_file_logger') - logging.disable(logging.CRITICAL) - mmp_admedb_object = MMPQueryADMEDB(mmplogger) - - """ - - def __init__(self, logger_object): - self.logger = logger_object - if len(logging.Logger.manager.loggerDict) < 1: - # exit with system status 1 and custom error - sys.exit( - "Invalid or no logger object passed to MMPObjectClass. 
Please create \ - and pass a logger and set to use logging.disable if you don't want logging" - ) - - # files for single - self.rxn_file_ctx_sgl = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", encoding="utf-8", mode="wt") - self.rxn_file_frags_sgl = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", encoding="utf-8", mode="wt") - self.rxn_file_rxn_map_sgl = tempfile.NamedTemporaryFile( - delete=False, suffix=".rxn", encoding="utf-8", mode="wt" - ) - self.rxn_file_makethese_sgl = tempfile.NamedTemporaryFile( - delete=False, suffix=".txt", encoding="utf-8", mode="wt" - ) - self.rxn_file_output_sgl = tempfile.NamedTemporaryFile(delete=False, encoding="utf-8", mode="wt") - # files for double - self.rxn_file_ctx_dbl1 = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", encoding="utf-8", mode="wt") - self.rxn_file_ctx_dbl2 = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", encoding="utf-8", mode="wt") - self.rxn_file_frags1_dbl = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", encoding="utf-8", mode="wt") - self.rxn_file_frags2_dbl = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", encoding="utf-8", mode="wt") - self.rxn_file_rxn_map1_dbl = tempfile.NamedTemporaryFile( - delete=False, suffix=".rxn", encoding="utf-8", mode="wt" - ) - self.rxn_file_rxn_map2_dbl = tempfile.NamedTemporaryFile( - delete=False, suffix=".rxn", encoding="utf-8", mode="wt" - ) - self.rxn_file_rxn_map3_dbl = tempfile.NamedTemporaryFile( - delete=False, suffix=".rxn", encoding="utf-8", mode="wt" - ) - self.rxn_file_rxn_map4_dbl = tempfile.NamedTemporaryFile( - delete=False, suffix=".rxn", encoding="utf-8", mode="wt" - ) - self.rxn_file_makethese_dbl1 = tempfile.NamedTemporaryFile( - delete=False, suffix=".txt", encoding="utf-8", mode="wt" - ) - self.rxn_file_makethese_dbl2 = tempfile.NamedTemporaryFile( - delete=False, suffix=".txt", encoding="utf-8", mode="wt" - ) - self.rxn_file_output_dbl1 = tempfile.NamedTemporaryFile(delete=False, 
encoding="utf-8", mode="wt") - self.rxn_file_output_dbl2 = tempfile.NamedTemporaryFile(delete=False, encoding="utf-8", mode="wt") - # - self.do_single = False - self.do_double = False - # - self.single_exit_code1 = None - self.double_exit_code1 = None - self.double_exit_code2 = None - # - self.smi_file = None - self.csv_file = None - # storage - self.mol_smi_dict = {} - self.mol_fragments_dict_single = {} - self.mol_fragments_dict_double = {} - self.unique_frags = set() - self.transformation_groups = {} - self.new_mols = {} - - # this is the reaction specification that trxn needs to combine isotopically labelled mmp fragmentation points - self.single_rxn = '(0 Reaction\n (0 Scaffold\n (A C smarts "[!0*]")\n (A I isotope (0 0))\n )\n' - self.single_rxn += ' (1 Sidechain\n (A C smarts "[!0*]")\n (A I isotope (0 0))\n (A I join (0 0))\n )\n)' - - # double cut needs two types of reaction to deal with the special case [12 isotopic label - self.double_rxn_1 = '(0 Reaction\n (0 Scaffold\n (A C smarts "[1*].[2*]")\n )\n (1 Sidechain\n' - self.double_rxn_1 += ' (A C smarts "[!0*]")\n (A I isotope (0 0))\n (A I join (0 0))\n )\n)' - - self.double_rxn_2 = '(0 Reaction\n (0 Scaffold\n (A C smarts "[1*].[2*]")\n (A I isotope (0 0))\n' - self.double_rxn_2 += ' (A I isotope (1 0))\n )\n (2 Sidechain\n (A C smarts "[!0*]")\n' - self.double_rxn_2 += " (A I isotope (0 0))\n (A I join (1 0))\n )\n)" - - self.double_rxn_3 = '(0 Reaction\n (0 Scaffold\n (A C smarts "[12*]")\n )\n (1 Sidechain\n' - self.double_rxn_3 += ' (A C smarts "[!0*]")\n (A I isotope (0 0))\n (A I join (0 0))\n )\n)' - - self.double_rxn_4 = '(0 Reaction\n (0 Scaffold\n (A C smarts "[12*]")\n (A I isotope (0 0))\n' - self.double_rxn_4 += ' )\n (2 Sidechain\n (A C smarts "[!0*]")\n' - self.double_rxn_4 += " (A I isotope (0 0))\n (A I join (0 0))\n )\n)" - - def clean_out_data_mmpenumerate(self): - """Method to clean out all objects in class""" - - # - self.smi_file = None - self.csv_file = None - # storage - 
self.mol_smi_dict = {} - self.mol_fragments_dict_single = {} - self.mol_fragments_dict_double = {} - self.unique_frags = set() - self.transformation_groups = {} - self.new_mols = {} - - def file_exists_check(self, file_to_check): - """ """ - try: - if os.path.isfile(file_to_check) > 0: - return True - else: - return False - except OSError: - return False - - def set_do_single(self): - """Method to set self.do_single. Used when enumerating from Pairs file or methods that don't directly - confirm single/double fragmentation""" - self.do_single = True - - def set_do_double(self): - """Method to set self.do_single. Used when enumerating from Pairs file or methods that don't directly - confirm single/double fragmentation""" - self.do_double = True - - def scan_input_smiles(self, smi_fi, injest=False): - """ - TODO : Method ripped from mmp_object, only ingest line differs, could extract to utils function - Dicer currently finishes with an exit code of zero, even if it fails and bombs half way through. This means the - MMP code can complete with results but has missed many pairs due to failed fragmentation, this injects a - pre-check into the process to remove/drop smiles likely to fail dicer BEFORE they get parsed to dicer or just - crash out with an error code. - """ - - self.smi_file = smi_fi - - with open(smi_fi, "rt") as input_smifi: - self.logger.info("Beginning pre-scan of input smiles file") - - line_num = 0 - for line in input_smifi: - line_num += 1 - line_list = line.split() - - # check it's valid smiles file format which is "smiles\sid" - if len(line_list) != 2: - # random exit code of 4 - sys.exit("Failed to parse smiles from line %s of file %s" % (line_num, smi_fi)) - - # Is the smiles_id numeric? 
- try: - smiles_id = int(line_list[1]) - except: - sys.exit("Failed smiles parser as non-numeric id on line %s of file %s" % (line_num, smi_fi)) - - smiles = line_list[0] - - if len(smiles) < 2: - sys.exit("Failed smiles parser as tiny smi size on line %s of file %s" % (line_num, smi_fi)) - - if "." in smiles: - sys.exit( - "Failed smiles parser as salt on line %s of file %s (pls clean using fileconv)" - % (line_num, smi_fi) - ) - - if injest: - # add to the mol_smi_dict as mol_id => smiles - self.mol_smi_dict[smiles_id] = line_list[0] - - self.logger.info("Completed pre-scan of input smiles file with no errors") - - input_smifi.close() - - def fragment_reference_smi(self, cut_type, dicer_threshold, exclude_h_subst=True): - """Method to fragment using dicer, all smiles we've read into self.original_mols - :dicer_threshold = The dicer threshold value to use during fragmentation (maxff) - """ - - # using key word "DOUBLE" will execute single and double cut in dicer so need to weed out - # the double cuts or we'll get the same as "BOTH" - parse_single = False - parse_double = False - if cut_type == "SINGLE" or cut_type == "BOTH": - parse_single = True - if cut_type == "DOUBLE" or cut_type == "BOTH": - parse_double = True - - if self.smi_file is None: - sys.exit("Please use method scan_input_smiles to set input smiles file") - - # execute dicer to get fragments of key_smi - for num_cuts, mol_id, ctx_orig, frag, fattach, cattach in execute_dicer( - self.smi_file, cut_type, dicer_threshold, self.logger - ): - if num_cuts == 1: - if parse_single: - if mol_id in self.mol_fragments_dict_single: - self.mol_fragments_dict_single[mol_id][ctx_orig] = frag - else: - self.mol_fragments_dict_single[mol_id] = {} - # it's possible we'll get duplicate fragmentation patterns e.g.: CF3 but just overwrite - self.mol_fragments_dict_single[mol_id][ctx_orig] = frag - - elif num_cuts == 2: - if parse_double: - if mol_id in self.mol_fragments_dict_double: - 
self.mol_fragments_dict_double[mol_id][ctx_orig] = frag - else: - self.mol_fragments_dict_double[mol_id] = {} - # it's possible we'll get duplicate fragmentation patterns e.g.: CF3 but just overwrite - self.mol_fragments_dict_double[mol_id][ctx_orig] = frag - - else: - sys.exit("Error: Dicer gave a cut type other than 1 and 2!! (method call fragment_reference_smi)") - - # now get a unique set of frags for querying the DB - for mol_id in self.mol_fragments_dict_single: - for ctx, frag in self.mol_fragments_dict_single[mol_id].items(): - self.unique_frags.add(frag) - - for mol_id in self.mol_fragments_dict_double: - for ctx, frag in self.mol_fragments_dict_double[mol_id].items(): - self.unique_frags.add(frag) - - ################################################################### - # This single line prevents all H -> substitutions from occurring # - # if it gets removed, then edit the unit tests to include [1H] # - ################################################################### - if exclude_h_subst is True: - if "[1H]" in self.unique_frags: - self.unique_frags.remove("[1H]") - - self.logger.debug("Found %s unique fragments for search" % len(self.unique_frags)) - - def pairs_file_to_dict( - self, pairs_file, frag_l_col="FRAG_L", frag_r_col="FRAG_R", exclude_h_subst=True, num_pairs_limit=999999 - ): - """ - :param pairs_file file to read - :param frag_l_col='FRAG_L' column name of left hand fragment column - :param frag_r_col='FRAG_R' column name of right hand fragment column - :param exclude_h_subst Default False means we will not enumerate H -> R changes - :param num_pairs_limit Parses only the first n lines of the file, use case is get Top 100 from - most common Lilly transforms - """ - # I will return this as frag_L: {frag_r: [data], frag_r: [data]} - temp_dict = {} - - sniffer = csv.Sniffer() - if sniffer.has_header(pairs_file) == 0: - sys.exit("Sniffer module did not detect a header in your CSV file, please check!") - - pairs_count = 0 - with 
open(pairs_file, "rt") as csvfile: - csv_dialect = sniffer.sniff(csvfile.read(65536)) - csvfile.seek(0) - reader = csv.reader(csvfile, csv_dialect, skipinitialspace=True) - - headers = next(reader) - self.logger.info("Got headers from CSV: %s" % headers) - - csv_items_per_line = len(headers) - self.logger.info("Expected number of items per line: %s" % csv_items_per_line) - - if frag_l_col not in headers: - self.logger.warn("Cannot find column FRAG_L in CSV file") - sys.exit("Cannot find column FRAG_L in CSV file") - else: - header_fragl_num = headers.index(frag_l_col) - - if frag_r_col not in headers: - self.logger.warn("Cannot find column FRAG_R in CSV file") - sys.exit("Cannot find column FRAG_R in CSV file") - else: - header_fragr_num = headers.index(frag_r_col) - - self.logger.info("Column headers look great, proceed with parse") - - for row in reader: - pairs_count += 1 - if pairs_count <= num_pairs_limit: - frag_l = row[header_fragl_num] - frag_r = row[header_fragr_num] - - if frag_l in temp_dict: - temp_dict[frag_l][frag_r] = row - else: - temp_dict[frag_l] = {} - temp_dict[frag_l][frag_r] = row - - else: - self.logger.info("Early Termination parsing %d pairs from file as n limit set" % pairs_count) - break - - self.logger.info("Done reading from csv") - - if exclude_h_subst is True: - if "[1H]" in temp_dict: - del temp_dict["[1H]"] - - return temp_dict - - def add_transformation_group(self, name, pairs_dict): - """ """ - - if name in self.transformation_groups: - sys.exit("Error, you tried to add the same transformation group twice") - else: - self.transformation_groups[name] = pairs_dict - - # write the rxn files we need to disk - def write_rxn_files(self, rxn_type="both"): - """pre-defined rxn is written to file ready for trxn""" - - if rxn_type in ["single", "both"]: - self.rxn_file_rxn_map_sgl.write(self.single_rxn) - self.rxn_file_rxn_map_sgl.close() - - if rxn_type in ["double", "both"]: - self.rxn_file_rxn_map1_dbl.write(self.double_rxn_1) - 
self.rxn_file_rxn_map1_dbl.close() - self.rxn_file_rxn_map2_dbl.write(self.double_rxn_2) - self.rxn_file_rxn_map2_dbl.close() - - self.rxn_file_rxn_map3_dbl.write(self.double_rxn_3) - self.rxn_file_rxn_map3_dbl.close() - self.rxn_file_rxn_map4_dbl.write(self.double_rxn_4) - self.rxn_file_rxn_map4_dbl.close() - - def write_reactants_mol_frag_dict(self): - """Take the various context we got from dicer for each molecule - (found in self.mol_fragments_dict_single[mol_id]) then see if the - matching frag_l has alternates for a given transformation group (which - will be found in self.transformation_groups[name]). If so, write - out for enumeration - """ - - random_id = 0 - confirm_write = False - - for mol_id in self.mol_fragments_dict_single: - for ctx, frag_l in self.mol_fragments_dict_single[mol_id].items(): - for name in self.transformation_groups: - if frag_l in self.transformation_groups[name]: - random_id += 1 - # delimit the id & frag_l using '___' as it's not valid smiles - # we will split this back again on results read to get original id and frag_l - mol_seq_id = str(mol_id) + "___" + frag_l + "___" + str(random_id) - - # write 1 - context to add new frag to - self.rxn_file_ctx_sgl.write(ctx + " " + mol_seq_id + "\n") - - for frag_r in self.transformation_groups[name][frag_l]: - # at this point only are we completely sure we have a rxn for a given single or double - # cut, therefore set the vars so we know we should read back / expect results - confirm_write = True - # write 2 - new fragment to be added to ctx - self.rxn_file_frags_sgl.write(frag_r + " " + frag_r + "\n") - # write 3 - add to the full 'make' list - self.rxn_file_makethese_sgl.write(mol_seq_id + " " + frag_r + "\n") - - self.do_single = confirm_write - self.rxn_file_ctx_sgl.close() - self.rxn_file_frags_sgl.close() - self.rxn_file_makethese_sgl.close() - - # now repeat for double but this time we write to three files not two - confirm_write = False - - # need this to help find and convert 
smiles labelled with [12 to duplicate but singly labelled smi - # regex = r"(.*)\[12(.{1,4})\](.*)" - - for mol_id in self.mol_fragments_dict_double: - for ctx, frag_l in self.mol_fragments_dict_double[mol_id].items(): - ctx1 = ctx.split(".")[0] - ctx2 = ctx.split(".")[1] - - for name in self.transformation_groups: - if frag_l in self.transformation_groups[name]: - # write 1 - ctx has to be split, easier to make these the frags - self.rxn_file_frags1_dbl.write(ctx1 + " " + ctx1 + "\n") - self.rxn_file_frags2_dbl.write(ctx2 + " " + ctx2 + "\n") - - for frag_r in self.transformation_groups[name][frag_l]: - # delimit the id & frag_l using '___' as it's not valid smiles - # we will split this back again on results read to get original id and frag_l - mol_seq_id = str(mol_id) + "___" + frag_l + "___" + frag_r - - # print frag_r - # at this point only are we completely sure we have a rxn for a given single or double - # cut, therefore set the vars so we know we should read back / expect results - confirm_write = True - - if "[12" in frag_r: - # match = re.search(regex, frag_r) - # print match.group(1) - # frag_r_before = frag_r - # frag_r = match.group(1) + "[1" + match.group(2) + "].[2" + match.group(2) + "]" + - # match.group(3) - # print ("Edited frag from %s to %s" % (frag_r_before, frag_r)) - self.rxn_file_ctx_dbl2.write(frag_r + " " + mol_seq_id + "\n") - self.rxn_file_makethese_dbl2.write(mol_seq_id + " " + ctx1 + " " + ctx2 + "\n") - - else: - # print ("Edited frag to %s" % (frag_r)) - self.rxn_file_ctx_dbl1.write(frag_r + " " + mol_seq_id + "\n") - self.rxn_file_makethese_dbl1.write(mol_seq_id + " " + ctx1 + " " + ctx2 + "\n") - - self.do_double = confirm_write - # Yes I know I used 'with' but I still had to close these files to avoid odd unittest bug - self.rxn_file_ctx_dbl1.close() - self.rxn_file_ctx_dbl2.close() - self.rxn_file_frags1_dbl.close() - self.rxn_file_frags2_dbl.close() - self.rxn_file_makethese_dbl1.close() - self.rxn_file_makethese_dbl2.close() 
- - def write_reactants_simple_dict(self, simple_dict): - """Take the various context + fragment pairs we have in a pandas dataframe and write out to file - for enumeration. Writes to same source as write_reactants_mol_frag_dict so do not use this - method at the same time as any other method as files will get overwritten.The assumption is that - all ctx / frag pairs in the dataframe already have attachment points within the smiles - """ - random_id = 0 - - if len(simple_dict) < 1: - raise Exception("Error, no items in dict so cant enumerate anything") - - for ctx_smi, frag_smi in list(simple_dict.keys()): - # delimit the id & frag_l using '___' as it's not valid smiles - # we will split this back again on results read to get original id and frag_l - random_id += 1 - mol_seq_id1 = str(random_id) + "___" + ctx_smi - random_id += 1 - mol_seq_id2 = str(random_id) + "___" + frag_smi - - # write 1 - context to add new frag to - self.rxn_file_ctx_sgl.write(ctx_smi + " " + mol_seq_id1 + "\n") - # write 2 - new fragment to be added to ctx - self.rxn_file_frags_sgl.write(frag_smi + " " + mol_seq_id2 + "\n") - # write 3 - add to the full 'make' list - self.rxn_file_makethese_sgl.write(mol_seq_id1 + " " + mol_seq_id2 + "\n") - - self.do_single = True - self.rxn_file_ctx_sgl.close() - self.rxn_file_frags_sgl.close() - self.rxn_file_makethese_sgl.close() - - def do_reactions(self): - """ - Added function to pymo pymo.make_these_mols as Alternative to trxn, need this for the MMP code for generating - new mols from MMP's, trxn version would be this (two components, single cut fragments): - trxn.sh -S - -r oneConnection.rxn partOne.smi partTwo.smi - For connecting two bonds (three component, two cut fragments) it would be this: - trxn.sh -S -rxn -S - -r twoConnection.rxn partThree.smi partOne.smi partTwo.smi - BUT, if have a long list of different contexts (partOne) and don't want exhaustive enumeration, specify rxn's: - make_these_molecules.sh -R oneConnection.rxn -M m2Make.txt 
-S - partOne.smi partTwo.smi - In this case, you can put all context fragments SMILES (context1a, context 1b, ...) in one reagent file, and - all fragments SMILES (frag1, frag2, ...) in the second reagent file. If have something like (context1a frag1\n - context1a frag2\ncontext1b frag3\n...) in your m2Make.txt file, you will create the molecules you wanted - """ - # pymo will strip file extension from output filename, trxn requires an stem not full name/suffix but will then - # write to an output file {stem}.smi which makes temp file handling more complicated. This is exhaustive - # exit_status = pymo.trxn([self.rxn_file_ctx_sgl.name, self.rxn_file_frags_sgl.name], - # self.rxn_file_rxn_map_sgl.name, outfile=self.rxn_file_output.name) - # this method / code is not exhaustive and work on instructions in self.rxn_file_makethese.name to make only - # the listed combinations of reagents - - if self.do_single: - self.single_exit_code1 = pymo.make_these_molecules( - [self.rxn_file_ctx_sgl.name, self.rxn_file_frags_sgl.name], - self.rxn_file_makethese_sgl.name, - [self.rxn_file_rxn_map_sgl.name], - outfile=self.rxn_file_output_sgl.name, - debug=False, - ) - - # print ("Ran make_these_molecules with exit code %s and output %s.smi" % ( - # self.single_exit_code1, self.rxn_file_output_sgl.name)) - self.logger.info( - "Ran make_these_molecules with exit code %s and output %s.smi" - % (self.single_exit_code1, self.rxn_file_output_sgl.name) - ) - - if self.do_double: - self.double_exit_code1 = pymo.make_these_molecules( - [self.rxn_file_ctx_dbl1.name, self.rxn_file_frags1_dbl.name, self.rxn_file_frags2_dbl.name], - self.rxn_file_makethese_dbl1.name, - [self.rxn_file_rxn_map1_dbl.name, self.rxn_file_rxn_map2_dbl.name], - outfile=self.rxn_file_output_dbl1.name, - debug=False, - ) - - # print ("Ran make_these_molecules with exit code %s and output %s.smi" % (self.double_exit_code1, - # self.rxn_file_output_dbl1.name)) - self.logger.info( - "Ran make_these_molecules with exit code 
%s and output %s.smi" - % (self.double_exit_code1, self.rxn_file_output_dbl1.name) - ) - - self.double_exit_code2 = pymo.make_these_molecules( - [self.rxn_file_ctx_dbl2.name, self.rxn_file_frags1_dbl.name, self.rxn_file_frags2_dbl.name], - self.rxn_file_makethese_dbl2.name, - [self.rxn_file_rxn_map3_dbl.name, self.rxn_file_rxn_map4_dbl.name], - outfile=self.rxn_file_output_dbl2.name, - debug=False, - ) - - # print ("Ran make_these_molecules with exit code %s and output %s.smi" % ( - # self.double_exit_code2, self.rxn_file_output_dbl2.name)) - self.logger.info( - "Ran make_these_molecules with exit code %s and output %s.smi" - % (self.double_exit_code2, self.rxn_file_output_dbl2.name) - ) - - def yield_products_complex_id(self): - """Reads output from do_reactions function, output file self.rxn_file_output.name plus .smi extension""" - - self.logger.info("Reading enumerated mols from output files") - - def yield_products_submethod(filename, cut_type): - with open(filename + ".smi", "rt") as outfile: - for line in outfile: - # Return will be new_mol_smi, id_including_the_new_frag, context (1 or two smi) - # do not know what fragment we have added - # - line = line.rstrip("\n") - line_list = line.split(" ") - # example output: - # [{smiles}, '1234___{frag_smi1}}', '+', '{frag_smi2}'] - try: - new_mol = line_list[0] - orig_mol_id = int(line_list[1].split("___")[0]) - frag_removed = line_list[1].split("___")[1] - if cut_type == 1: - frag_added = line_list[3] - elif cut_type == 2: - frag_added = line_list[1].split("___")[2] - except: - self.logger.debug("Skipped line as cannot parse id properly") - continue - - self.logger.debug("I now have %s %s %s" % (new_mol, orig_mol_id, frag_added)) - yield new_mol, orig_mol_id, frag_removed, frag_added - - if self.do_single: - if self.single_exit_code1 == 0 and self.file_exists_check(self.rxn_file_output_sgl.name): - for new_mol, orig_mol_id, frag_removed, frag_added in yield_products_submethod( - self.rxn_file_output_sgl.name, 1 - 
): - yield new_mol, orig_mol_id, "single", frag_removed, frag_added - else: - self.logger.debug("No results for Single Cut Enumeration") - - # - if self.do_double: - if self.double_exit_code1 == 0 and self.file_exists_check(self.rxn_file_output_dbl1.name): - for new_mol, orig_mol_id, frag_removed, frag_added in yield_products_submethod( - self.rxn_file_output_dbl1.name, 2 - ): - yield new_mol, orig_mol_id, "double", frag_removed, frag_added - else: - self.logger.debug("No results for Double Cut Enumeration (1 of 2)") - - if self.double_exit_code2 == 0 and self.file_exists_check(self.rxn_file_output_dbl2.name): - for new_mol, orig_mol_id, frag_removed, frag_added in yield_products_submethod( - self.rxn_file_output_dbl2.name, 2 - ): - yield new_mol, orig_mol_id, "double", frag_removed, frag_added - else: - self.logger.debug("No results for Double Cut Enumeration (2 of 2)") - - def yield_products_simple_dict_input(self): - """Reads output from do_reactions function, output file self.rxn_file_output.name plus .smi extension""" - - self.logger.info("Reading enumerated mols from output files") - - def yield_products_simple_submethod(filename): - with open(filename + ".smi", "rt") as outfile: - for line in outfile: - line = line.rstrip("\n") - line_list = line.split(" ") - - try: - new_mol = line_list[0] - context = line_list[1].split("___")[1] - frag = line_list[3].split("___")[1] - - except: - self.logger.debug("Skipped line as cannot parse it") - continue - - self.logger.debug("I now have %s %s which made %s" % (context, frag, new_mol)) - yield context, frag, new_mol - - if self.do_single: - if self.single_exit_code1 == 0 and self.file_exists_check(self.rxn_file_output_sgl.name): - for context, frag, new_mol in yield_products_simple_submethod(self.rxn_file_output_sgl.name): - # print context, frag, new_mol - yield "single", context, frag, new_mol - - else: - self.logger.debug("No results for Single Cut Enumeration") - - if self.do_double: - pass - - def 
write_products_to_csv(self, csv_filename): - """ - Method to write out enumerated products to a CSV file - :csv_filename name of CSV file to write output to - """ - - with open(csv_filename, "w") as csv_out: - headers = "NEW_MOL_SMI,ORIG_MOL_ID,CUT_TYPE,FRAG_REMOVED,FRAG_ADDED" - csv_out.write(headers + "\n") - - for new_mol, orig_mol_id, cut_type, frag_removed, frag_added in self.yield_products_complex_id(): - write_string = ",".join([new_mol, str(orig_mol_id), cut_type, frag_removed, frag_added]) - csv_out.write(write_string + "\n") - - -# -# unittest everything -# -class _TestMMPEnumerateNewMols(unittest.TestCase): - """Test class to test the object and methods""" - - @classmethod - def setUpClass(self): - # - self.maxDiff = None - - # setup test data locations - self.temp_file_input_csv = tempfile.NamedTemporaryFile(delete=False, encoding="utf-8", mode="wt") - self.temp_file_input_smi = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", encoding="utf-8", mode="wt") - - # setup a logger object - self.mmplogger = logging.getLogger("mmpobjectclass_testlogger") - logging.disable(logging.CRITICAL) - - ################################# - # - # Now create test data - # - ################################# - - self.test_input_smi_data = { - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL3552650/ - "CCN1CC2(CN(Cc3ccncc3)CCN(C2)C(=O)C(C)C)CC1=O": 3552650, - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL2426642/ - "CC(C)(C(c1ccncc1)c2ccc3c(cnn3c4ccc(F)cc4)c2)C(=O)Nc5nccs5": 2426642, - } - - self.test_input_single_csv_header = "FRAG_L,FRAG_R,MOL_CLP_DIFF,MOL_CLP_STDEV,NUM_VALS,PERCENT_THAT_INCREASE,PERCENT_THAT_DECREASE,PERCENT_WITH_NOCHANGE,MOL_MWT_DIFF" - self.test_input_single_csv_data = { - "Fc1c[n][1cH][n]c1,[1nH]1[n]ccc1,-0.202,,1,0.000,0.000,100.000,-30.000": None, - "[1nH]1[n]ccc1,Fc1c[n][1cH][n]c1,0.202,,1,0.000,0.000,100.000,30.000": None, - "O=[1CH]C(C)C,O=[1CH]CC,-0.309,0.000,10,10.000,10.000,80.000,-14.100": None, - 
"O=[1CH]C(C)C,[1CH3]C(C)C,1.599,,1,0.000,0.000,100.000,-14.000": None, - "O=[1CH]C(C)C,[1H],-0.134,0.335,7,0.000,42.860,57.140,-70.100": None, - "O=[1CH]C(C)C,O=[1CH]c1ccc(C#N)cc1,0.355,,1,0.000,0.000,100.000,59.000": None, - "O=[1CH]CC,O=[1CH]C(C)C,0.309,0.000,10,10.000,10.000,80.000,14.100": None, - "O=[1CH]CC,[1CH3]C(C)C,1.908,,1,0.000,0.000,100.000,0.100": None, - "O=[1CH]CC,[1H],-0.118,0.497,4,0.000,25.000,75.000,-56.000": None, - "O=[1CH]CC,O=[1CH]c1ccc(C#N)cc1,0.664,,1,0.000,0.000,100.000,73.100": None, - "[1CH3]C(C)C,O=[1CH]C(C)C,-1.599,,1,0.000,0.000,100.000,14.000": None, - "[1CH3]C(C)C,O=[1CH]CC,-1.908,,1,0.000,0.000,100.000,-0.100": None, - "[1CH3]c1cc[n]cc1,[1CH3]c1c[n]ccc1,0.000,0.000,12,0.000,0.000,100.000,0.000": None, - "[n]1c[1cH]ccc1,[1H],-0.388,0.434,17,11.760,0.000,88.240,-77.100": None, - "[n]1c[1cH]ccc1,CCOc1[1cH]cccc1,1.945,,1,0.000,0.000,100.000,43.100": None, - "[1nH]1[n]ccc1,FC(c1[n][1nH]c(C2COC2)c1)(F)F,0.068,,1,0.000,0.000,100.000,124.100": None, - "[1FH],[1BrH],0.630,0.186,9,0.000,0.000,100.000,60.900": None, - # junk data but a good double cut example for testing - "[12CH4],[12NH3],0.630,0.186,9,0.000,0.000,100.000,60.900": None, - "[2CH3][1CH3],O=[2CH][1CH2]C,0.630,0.186,9,0.000,0.000,100.000,60.900": None, - # test methyl replacement so change to something significant - "[1CH4],[1IH],0.630,0.186,9,0.000,0.000,100.000,60.900": None, - # included to allow H subst - "[1H],O=[1CH]CC,-0.118,0.497,4,0.000,25.000,75.000,-56.000": None, - } - - self.test_output_products_single = { - "C(I)N1C(=O)CC2(C1)CN(C(=O)C(C)C)CCN(CC1=CC=NC=C1)C2": [3552650, "[1CH4]", "[1IH]"], - "O=C1N(CC)CC2(CN(C(=O)C(C)C)CCN(C2)CC2=CN=CC=C2)C1": [3552650, "[1CH3]c1cc[n]cc1", "[1CH3]c1c[n]ccc1"], - "O=C1N(CC2(CN(CC3=CC=NC=C3)CCN(C2)C(=O)CC)C1)CC": [3552650, "O=[1CH]C(C)C", "O=[1CH]CC"], - "O=C1N(CC2(CN(CC3=CC=NC=C3)CCN(C2)CC(C)C)C1)CC": [3552650, "O=[1CH]C(C)C", "[1CH3]C(C)C"], - "O=C1N(CC2(CN(CC3=CC=NC=C3)CCN([H])C2)C1)CC": [3552650, "O=[1CH]C(C)C", "[1H]"], - 
"O=C1N(CC2(CN(CC3=CC=NC=C3)CCN(C2)C(=O)C2=CC=C(C#N)C=C2)C1)CC": [ - 3552650, - "O=[1CH]C(C)C", - "O=[1CH]c1ccc(C#N)cc1", - ], - "O=C(C(C)I)N1CC2(CN(C(=O)C2)CC)CN(CC2=CC=NC=C2)CC1": [3552650, "[1CH4]", "[1IH]"], - "FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(I)C(=O)NC2=NC=CS2)C2=CC=NC=C2)C=C1": [2426642, "[1CH4]", "[1IH]"], - "O=C(NC1=NC=CS1)C(C)(C)C(C1=CC2=C(N(N=C2)C2=CC=C(Br)C=C2)C=C1)C1=CC=NC=C1": [2426642, "[1FH]", "[1BrH]"], - } - - self.test_output_products_single_incH = self.test_output_products_single.copy() - self.test_output_products_single_incH.update( - { - "C(CN1C(=O)CC2(C1)CN(C(=O)C(C)C)CCN(CC1=CC=NC=C1)C2)C(=O)CC": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1CC2(CN(C(=O)C2)C(C)C(=O)CC)CN(CC2=CC=NC=C2)CC1": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1CC2(C(N(C(=O)C2)CC)C(=O)CC)CN(CC2=CC=NC=C2)CC1": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1CC2(C(N(CC3=CC=NC=C3)CC1)C(=O)CC)CN(C(=O)C2)CC": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1CC2(CN(C(=O)C2)CC)CN(C(C2=CC=NC=C2)C(=O)CC)CC1": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1CC2(CN(C(=O)C2)CC)CN(CC2=C(C=NC=C2)C(=O)CC)CC1": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1CC2(CN(C(=O)C2)CC)CN(CC2=CC=NC(=C2)C(=O)CC)CC1": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1CC(N(CC2=CC=NC=C2)CC2(CN(C(=O)C2)CC)C1)C(=O)CC": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1C(CN(CC2=CC=NC=C2)CC2(CN(C(=O)C2)CC)C1)C(=O)CC": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1C(C2(CN(C(=O)C2)CC)CN(CC2=CC=NC=C2)CC1)C(=O)CC": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)(C)C(=O)CC)N1CC2(CN(C(=O)C2)CC)CN(CC2=CC=NC=C2)CC1": [3552650, "[1H]", "O=[1CH]CC"], - "C(C(C)C(=O)N1CC2(CN(C(=O)C2)CC)CN(CC2=CC=NC=C2)CC1)C(=O)CC": [3552650, "[1H]", "O=[1CH]CC"], - "O=C(C(C)C)N1CC2(C(C(=O)N(C2)CC)C(=O)CC)CN(CC2=CC=NC=C2)CC1": [3552650, "[1H]", "O=[1CH]CC"], - "FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(C(=O)NC2=NC=CS2)CC(=O)CC)C2=CC=NC=C2)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - 
"FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)NC2=NC=CS2)(C2=CC=NC=C2)C(=O)CC)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)NC2=NC=CS2)C2=C(C=NC=C2)C(=O)CC)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)NC2=NC=CS2)C2=CC(=NC=C2)C(=O)CC)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC=C(N2N=CC3=CC(=C(C=C23)C(=O)CC)C(C(C)(C)C(=O)NC2=NC=CS2)C2=CC=NC=C2)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC=C(N2N=CC3=CC(=CC(=C23)C(=O)CC)C(C(C)(C)C(=O)NC2=NC=CS2)C2=CC=NC=C2)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC=C(N2C3=CC=C(C=C3C(=N2)C(=O)CC)C(C(C)(C)C(=O)NC2=NC=CS2)C2=CC=NC=C2)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC(=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)NC2=NC=CS2)C2=CC=NC=C2)C=C1)C(=O)CC": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=C(C=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)NC2=NC=CS2)C2=CC=NC=C2)C=C1)C(=O)CC": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC=C(N2N=CC3=C2C=CC(=C3C(=O)CC)C(C(C)(C)C(=O)NC2=NC=CS2)C2=CC=NC=C2)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)N(C2=NC=CS2)C(=O)CC)C2=CC=NC=C2)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)NC2=NC(=CS2)C(=O)CC)C2=CC=NC=C2)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - "FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)NC2=NC=C(S2)C(=O)CC)C2=CC=NC=C2)C=C1": [ - 2426642, - "[1H]", - "O=[1CH]CC", - ], - } - ) - - self.test_output_products_double = { - "N(C)N1CC2(CN(C(=O)C(C)C)CCN(C2)CC2=CC=NC=C2)CC1=O": [3552650, "[12CH4]", "[12NH3]"], - "N(N1CCN(CC2(CN(C(=O)C2)CC)C1)C(=O)C(C)C)C1=CC=NC=C1": [3552650, "[12CH4]", "[12NH3]"], - } - - # smi file - basic test - for smi, smi_id in self.test_input_smi_data.items(): - self.temp_file_input_smi.write(smi + " " + str(smi_id) + "\n") - self.temp_file_input_smi.close() - - # csv file - basic test - 
self.temp_file_input_csv.write(self.test_input_single_csv_header + "\n") - for data in list(self.test_input_single_csv_data.keys()): - self.temp_file_input_csv.write(data + "\n") - self.temp_file_input_csv.close() - - # container for results data - self.test_dataset_testresults = {} - - @classmethod - def tearDownClass(self): - """Cleanup for end of all tests""" - - # remove(self.temp_file_input_smi.name) - os.remove(self.temp_file_input_csv.name) - - def setUp(self): - """Setup object for clean reuse in further tests""" - # create empty mmp object - self.test_mmp_pairs_object = MMPEnumerateNewMols(self.mmplogger) - - def tearDown(self): - """Tear down object for clean reuse in further tests""" - # - self.test_mmp_pairs_object.clean_out_data_mmpenumerate() - self.test_dataset_testresults.clear() - - def test_scan_input_smiles(self): - """Test build_graph_from_pairs""" - # - self.test_mmp_pairs_object.scan_input_smiles(self.temp_file_input_smi.name, injest=True) - - # returned chembl compounds - self.assertEqual( - self.test_mmp_pairs_object.mol_smi_dict, - { - 2426642: "CC(C)(C(c1ccncc1)c2ccc3c(cnn3c4ccc(F)cc4)c2)C(=O)Nc5nccs5", - 3552650: "CCN1CC2(CN(Cc3ccncc3)CCN(C2)C(=O)C(C)C)CC1=O", - }, - ) - - def test_fragment_reference_smi(self): - """ """ - # - self.test_mmp_pairs_object.scan_input_smiles(self.temp_file_input_smi.name, injest=True) - # new stuff - self.test_mmp_pairs_object.fragment_reference_smi("SINGLE", 0.3) - - # print(self.test_mmp_pairs_object.unique_frags) - # removed '[1H]' as we don't allow H substitution - self.assertEqual( - self.test_mmp_pairs_object.unique_frags, - { - "s1[1cH][n]cc1", - "[n]1cc[1cH]cc1", - "[1CH4]", - "O=[1CH]Nc1scc[n]1", - "[1CH3]C", - "[1NH2]c1scc[n]1", - "[1FH]", - "[1CH3]c1cc[n]cc1", - "C[1CH2]C", - "Fc1cc[1cH]cc1", - "O=[1CH]C(C)C", - }, - ) - - def test_fragment_reference_smi_incH(self): - """ """ - # - self.test_mmp_pairs_object.scan_input_smiles(self.temp_file_input_smi.name, injest=True) - # new stuff - 
self.test_mmp_pairs_object.fragment_reference_smi("SINGLE", 0.3, exclude_h_subst=False) - - # print(self.test_mmp_pairs_object.unique_frags) - # removed '[1H]' as we don't allow H substitution - self.assertEqual( - self.test_mmp_pairs_object.unique_frags, - { - "[1H]", - "O=[1CH]Nc1scc[n]1", - "s1[1cH][n]cc1", - "[1NH2]c1scc[n]1", - "C[1CH2]C", - "[n]1cc[1cH]cc1", - "O=[1CH]C(C)C", - "[1CH3]C", - "[1CH4]", - "Fc1cc[1cH]cc1", - "[1FH]", - "[1CH3]c1cc[n]cc1", - }, - ) - - def test_pairs_file_to_dict(self): - """ """ - # - result = self.test_mmp_pairs_object.pairs_file_to_dict(self.temp_file_input_csv.name) - - self.assertEqual(len(result), 11) - - def test_pairs_file_to_dict_numpairslimit(self): - """ """ - # - result = self.test_mmp_pairs_object.pairs_file_to_dict(self.temp_file_input_csv.name, num_pairs_limit=10) - - # result is not a 1:1 mapping of pairs to items but should be less than previous test - self.assertEqual(len(result), 4) - - def test_add_transformation_group(self): - """ """ - # - result = self.test_mmp_pairs_object.pairs_file_to_dict(self.temp_file_input_csv.name) - self.test_mmp_pairs_object.add_transformation_group("test", result) - - self.assertEqual(len(self.test_mmp_pairs_object.transformation_groups), 1) - - def test_everything_at_once_single(self): - """ """ - # input smi - self.test_mmp_pairs_object.scan_input_smiles(self.temp_file_input_smi.name, injest=True) - self.test_mmp_pairs_object.fragment_reference_smi("SINGLE", 0.3) - # get pairs - result = self.test_mmp_pairs_object.pairs_file_to_dict(self.temp_file_input_csv.name) - self.test_mmp_pairs_object.add_transformation_group("test", result) - # write rxn files - self.test_mmp_pairs_object.write_rxn_files() - self.test_mmp_pairs_object.write_reactants_mol_frag_dict() - self.test_mmp_pairs_object.do_reactions() - # new for test - for ( - new_mol, - orig_mol_id, - cut_type, - frag_removed, - frag_added, - ) in self.test_mmp_pairs_object.yield_products_complex_id(): - 
self.test_dataset_testresults[new_mol] = [orig_mol_id, frag_removed, frag_added] - - # print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_output_products_single) - - def test_everything_at_once_single_simple_dict(self): - """ """ - - # fragmented forms of original test data - chembl molecules - test_data = { - ("[1CH3]N1C(=O)CC2(C1)CN(C(=O)C(C)C)CCN(Cc1cc[n]cc1)C2", "[1CH4]"): None, - ("O=C1N(CC)CC2(CN(C(=O)C(C)C)CC[1NH]C2)C1", "[1CH3]c1cc[n]cc1"): None, - ("[1CH3]N1CC2(CN(C(=O)C2)CC)CN(C(=O)C(C)C)CC1", "[n]1cc[1cH]cc1"): None, - ("O=C1N(CC2(CN(Cc3cc[n]cc3)CC[1NH]C2)C1)CC", "O=[1CH]C(C)C"): None, - ("CCN1C(=O)CC2(C1)CN(Cc1cc[n]cc1)CCN([1CH]=O)C2", "C[1CH2]C"): None, - ("O=C([1CH2]C)N1CC2(CN(C(=O)C2)CC)CN(Cc2cc[n]cc2)CC1", "[1CH4]"): None, - ("O=C(C(C)C)N1CC2(C[1NH]C(=O)C2)CN(Cc2cc[n]cc2)CC1", "[1CH3]C"): None, - ("[1CH3]CN1C(=O)CC2(C1)CN(C(=O)C(C)C)CCN(Cc1cc[n]cc1)C2", "[1H]"): None, - ("O=C(C(C)C)N1CC2(CN(C(=O)C2)CC)CN([1CH2]c2cc[n]cc2)CC1", "[1H]"): None, - ("Fc1ccc([n]2[n]cc3c2ccc(C([1CH](C(=O)Nc2scc[n]2)C)c2cc[n]cc2)c3)cc1", "[1CH4]"): None, - ("Fc1ccc([n]2[n]cc3c2ccc([1CH2]C(C(=O)Nc2scc[n]2)(C)C)c3)cc1", "[n]1cc[1cH]cc1"): None, - ("O=C(Nc1scc[n]1)C(C(c1cc2c([1nH][n]c2)cc1)c1cc[n]cc1)(C)C", "Fc1cc[1cH]cc1"): None, - } - - # write rxn files - self.test_mmp_pairs_object.write_rxn_files() - self.test_mmp_pairs_object.write_reactants_simple_dict(test_data) - self.test_mmp_pairs_object.do_reactions() - # - for cut_type, context, frag, new_mol in self.test_mmp_pairs_object.yield_products_simple_dict_input(): - self.test_dataset_testresults[context, frag, new_mol] = None - - # print(self.test_dataset_testresults) - self.assertEqual( - { - ( - "[1CH3]N1C(=O)CC2(C1)CN(C(=O)C(C)C)CCN(Cc1cc[n]cc1)C2", - "[1CH4]", - "C(C)N1C(=O)CC2(C1)CN(C(=O)C(C)C)CCN(CC1=CC=NC=C1)C2", - ): None, - ( - "O=C1N(CC)CC2(CN(C(=O)C(C)C)CC[1NH]C2)C1", - "[1CH3]c1cc[n]cc1", - "O=C1N(CC)CC2(CN(C(=O)C(C)C)CCN(C2)CC2=CC=NC=C2)C1", - ): None, - ( - 
"[1CH3]N1CC2(CN(C(=O)C2)CC)CN(C(=O)C(C)C)CC1", - "[n]1cc[1cH]cc1", - "C(N1CC2(CN(C(=O)C2)CC)CN(C(=O)C(C)C)CC1)C1=CC=NC=C1", - ): None, - ( - "O=C1N(CC2(CN(Cc3cc[n]cc3)CC[1NH]C2)C1)CC", - "O=[1CH]C(C)C", - "O=C1N(CC2(CN(CC3=CC=NC=C3)CCN(C2)C(=O)C(C)C)C1)CC", - ): None, - ( - "CCN1C(=O)CC2(C1)CN(Cc1cc[n]cc1)CCN([1CH]=O)C2", - "C[1CH2]C", - "CCN1C(=O)CC2(C1)CN(CC1=CC=NC=C1)CCN(C(=O)C(C)C)C2", - ): None, - ( - "O=C([1CH2]C)N1CC2(CN(C(=O)C2)CC)CN(Cc2cc[n]cc2)CC1", - "[1CH4]", - "O=C(C(C)C)N1CC2(CN(C(=O)C2)CC)CN(CC2=CC=NC=C2)CC1", - ): None, - ( - "O=C(C(C)C)N1CC2(C[1NH]C(=O)C2)CN(Cc2cc[n]cc2)CC1", - "[1CH3]C", - "O=C(C(C)C)N1CC2(CN(C(=O)C2)CC)CN(CC2=CC=NC=C2)CC1", - ): None, - ( - "[1CH3]CN1C(=O)CC2(C1)CN(C(=O)C(C)C)CCN(Cc1cc[n]cc1)C2", - "[1H]", - "C([H])CN1C(=O)CC2(C1)CN(C(=O)C(C)C)CCN(CC1=CC=NC=C1)C2", - ): None, - ( - "O=C(C(C)C)N1CC2(CN(C(=O)C2)CC)CN([1CH2]c2cc[n]cc2)CC1", - "[1H]", - "O=C(C(C)C)N1CC2(CN(C(=O)C2)CC)CN(C([H])C2=CC=NC=C2)CC1", - ): None, - ( - "Fc1ccc([n]2[n]cc3c2ccc(C([1CH](C(=O)Nc2scc[n]2)C)c2cc[n]cc2)c3)cc1", - "[1CH4]", - "FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)NC2=NC=CS2)C2=CC=NC=C2)C=C1", - ): None, - ( - "Fc1ccc([n]2[n]cc3c2ccc([1CH2]C(C(=O)Nc2scc[n]2)(C)C)c3)cc1", - "[n]1cc[1cH]cc1", - "FC1=CC=C(N2N=CC3=CC(=CC=C23)C(C(C)(C)C(=O)NC2=NC=CS2)C2=CC=NC=C2)C=C1", - ): None, - ( - "O=C(Nc1scc[n]1)C(C(c1cc2c([1nH][n]c2)cc1)c1cc[n]cc1)(C)C", - "Fc1cc[1cH]cc1", - "O=C(NC1=NC=CS1)C(C)(C)C(C1=CC2=C(N(N=C2)C2=CC=C(F)C=C2)C=C1)C1=CC=NC=C1", - ): None, - }, - self.test_dataset_testresults, - ) - - def test_everything_at_once_single_incH(self): - """ """ - # input smi - self.test_mmp_pairs_object.scan_input_smiles(self.temp_file_input_smi.name, injest=True) - self.test_mmp_pairs_object.fragment_reference_smi("SINGLE", 0.3, exclude_h_subst=False) - # get pairs - result = self.test_mmp_pairs_object.pairs_file_to_dict(self.temp_file_input_csv.name, exclude_h_subst=False) - self.test_mmp_pairs_object.add_transformation_group("test", result) - # write rxn 
files - self.test_mmp_pairs_object.write_rxn_files() - self.test_mmp_pairs_object.write_reactants_mol_frag_dict() - - self.test_mmp_pairs_object.do_reactions() - # new for test - for ( - new_mol, - orig_mol_id, - cut_type, - frag_removed, - frag_added, - ) in self.test_mmp_pairs_object.yield_products_complex_id(): - self.test_dataset_testresults[new_mol] = [orig_mol_id, frag_removed, frag_added] - - # print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_output_products_single_incH) - - def test_everything_at_once_double(self): - """ """ - # input smi - self.test_mmp_pairs_object.scan_input_smiles(self.temp_file_input_smi.name, injest=True) - self.test_mmp_pairs_object.fragment_reference_smi("DOUBLE", 0.3) - # get pairs - result = self.test_mmp_pairs_object.pairs_file_to_dict(self.temp_file_input_csv.name) - self.test_mmp_pairs_object.add_transformation_group("test", result) - # write rxn files - self.test_mmp_pairs_object.write_rxn_files() - self.test_mmp_pairs_object.write_reactants_mol_frag_dict() - self.test_mmp_pairs_object.do_reactions() - - # pp.pprint(self.test_mmp_pairs_object.mol_fragments_dict_double) - # new for test - for ( - new_mol, - orig_mol_id, - cut_type, - frag_removed, - frag_added, - ) in self.test_mmp_pairs_object.yield_products_complex_id(): - self.test_dataset_testresults[new_mol] = [orig_mol_id, frag_removed, frag_added] - - # print(self.test_dataset_testresults) - # - self.assertEqual(self.test_dataset_testresults, self.test_output_products_double) - - def test_everything_at_once_both(self): - """ """ - # input smi - self.test_mmp_pairs_object.scan_input_smiles(self.temp_file_input_smi.name, injest=True) - self.test_mmp_pairs_object.fragment_reference_smi("BOTH", 0.3) - # get pairs - result = self.test_mmp_pairs_object.pairs_file_to_dict(self.temp_file_input_csv.name) - self.test_mmp_pairs_object.add_transformation_group("test", result) - # write rxn files - 
self.test_mmp_pairs_object.write_rxn_files() - self.test_mmp_pairs_object.write_reactants_mol_frag_dict() - self.test_mmp_pairs_object.do_reactions() - - # introspect - # print(self.test_mmp_pairs_object.mol_fragments_dict_single) - # print(self.test_mmp_pairs_object.mol_fragments_dict_double) - - # pp.pprint(self.test_mmp_pairs_object.mol_fragments_dict_double) - # new for test - for ( - new_mol, - orig_mol_id, - cut_type, - frag_removed, - frag_added, - ) in self.test_mmp_pairs_object.yield_products_complex_id(): - self.test_dataset_testresults[new_mol] = [orig_mol_id, frag_removed, frag_added] - - # get the results by merging single and double - results = self.test_output_products_single.copy() - results.update(self.test_output_products_double) - # print(results) - # - self.assertEqual(self.test_dataset_testresults, results) - - -if __name__ == "__main__": - unittest.main() diff --git a/contrib/script/py/mmp/mmp_math_functions.py b/contrib/script/py/mmp/mmp_math_functions.py deleted file mode 100755 index 79e5317a..00000000 --- a/contrib/script/py/mmp/mmp_math_functions.py +++ /dev/null @@ -1,85 +0,0 @@ -################################################################### -# Summary: Cantor Pairing function and Reverse Cantor used in MMP code -# -# Notes: -# - Allows us to fold a vector of integer values onto a single integer value reversibly -# http://stackoverflow.com/questions/919612/mapping-two-integers-to-one-in-a-unique-and-deterministic-way -# http://mathworld.wolfram.com/PairingFunction.html -# - This allows us to create a dict keyed by a single int value, saving memory over a tuple based key -# https://guillaume.segu.in/blog/code/487/optimizing-memory-usage-in-python-a-case-study/ -# The alternative is to store a tuple of ints (x, y) which is 8x? more expensive than an int? 
-# - The function is order specific so (int1, int2) -> int3 and (int2, int1) -> int4 (not int3) -# and should only be applied to positive integer values -# - 2016 changes give 25% reduction in speed over 1K input SMI: -# from 0.78s -> 0.71s -> 0.59s removing isinstance then sub-functions -# -############################################################# -import math -import unittest - - -def cantor(x, y): - - # 2016 uncommented this out to speed things up, int input checked elsewhere - # catch anything other than positive integer - # if not isinstance(x, int) or x < 0: - # raise Exception("Need positive integers as input") - # if not isinstance(y, int) or y < 0: - # raise Exception("Need positive integers as input") - - return int((x + y) * (x + y + 1) / 2 + y) - - -# def _w(z): -# n = math.sqrt(8 * z + 1) -# return int(math.floor((n - 1) / 2)) - - -# def _t(z): -# w = _w(z) -# return (w * w + w) / 2 - - -def inv_cantor(z): - - # 2016 uncommented this out to speed things up, int input checked elsewhere - # catch anything other than positive integer - # if not isinstance(z, int) or z < 0: - # raise Exception("Need positive integer as input") - - # t = _t(z) - # w = _w(z) - # 2016 faster without the use of subfunctions - n = math.sqrt(8 * z + 1) - w = int(math.floor((n - 1) / 2)) - t = (w * w + w) / 2 - y = z - t - x = w - y - - return (int(x), int(y)) - - -class _TestAllFunctions(unittest.TestCase): - - def setUp(self): - self.test_val_1 = 12345 - self.test_val_2 = 98765 - self.test_val_12 = 6172870370 - - def test_cantor(self): - # input (test_val_1, test_val_2) should become test_val_12 - self.assertEqual(cantor(self.test_val_1, self.test_val_2), self.test_val_12) -# self.assertRaises(Exception, cantor, (-1,1)) -# self.assertRaises(Exception, cantor, ('string',1)) -# self.assertRaises(Exception, cantor, (10.99,1)) - - def test_invcantor(self): - # input value test_val_12 should return a tuple of test_val_1, test_val_2 - 
self.assertEqual(inv_cantor(self.test_val_12), (self.test_val_1, self.test_val_2)) -# self.assertRaises(Exception, inv_cantor, -1) -# self.assertRaises(Exception, inv_cantor, 'string') -# self.assertRaises(Exception, inv_cantor, 10.99) - - -if __name__ == '__main__': - unittest.main() diff --git a/contrib/script/py/mmp/mmp_mcss_objects.py b/contrib/script/py/mmp/mmp_mcss_objects.py deleted file mode 100755 index 9cca156a..00000000 --- a/contrib/script/py/mmp/mmp_mcss_objects.py +++ /dev/null @@ -1,615 +0,0 @@ -################################################################### -""" Summary: Class and Methods for deriving MCSS based MMP's - -About: Derive a matched pair based MCSS from a pair molecules - -To do: - extend the method enumerate_fragment_properties to also - enumerate self.mol_smi_dict as this would allow the addition - of a flag '-p' that prints out whole molecule props alongside - MCSS and therefore compute %molecule that the MCSS covers - - could use other descriptors from IW code to get MCSS via - bond count not #Atoms or - - Should move iterators in process_mcss_list_to_string to be numeric - and store numeric ID's in self.largest_mcs_mmp_double/single - - could allow further switched to change behaviour of tie break - where single/double or double alone give tie break MCSS - [connected substructures versus disconnected or both/either] - - - Extension to triple cut would allow improved search/match e.g.: - N1(C(c2c(cc3c(c2)OCO3)CC1)c4cc(c(c(c4)OC)O)OC)C(=O)OC CHEMBL311765 - N1(C(c2c(cc(cc2)O)CC1)c3ccc(cc3)OCCN4CCCC4)C(=O)OCC CHEMBL94080 -""" -################################################################### -import logging -import csv -import os -import sys -import unittest -import tempfile - -from builtins import range - -from mmp.mmp_data_objects import MMPDataObjectClass - -if 'LILLYMOL_HOME' in os.environ: - sys.path.insert(0, os.getenv('LILLYMOL_HOME') + '/contrib/script/py/pybase'); - import pyopmo as pymo -else: - import pybase.pymo as 
pymo - - -class MMPbasedMCSSObjectClass(MMPDataObjectClass): - - def __init__(self, logger_object): - """ - Example usage: - mmplogger = logging.getLogger('lillymol_file_logger') - logging.disable(logging.CRITICAL) - my_mmp_mcss_object = MMPbasedMCSSObjectClass(mmplogger) - - """ - - MMPDataObjectClass.__init__(self, logger_object) - - self.logger = logger_object - if len(logging.Logger.manager.loggerDict) < 1: - # exit with system status 1 and custom error - sys.exit("Invalid or no logger object passed to MMPObjectClass. Please create \ - and pass a logger and set to use logging.disable if you don't want logging") - - # this is used for storing the largest MCS MMP for given pair - self.largest_mcs_mmp_result = {} - self.ref_smi_props = {} - - def clean_out_data_mcss_obj(self): - """Method to clean out all objects in class""" - - self.clean_out_data() - self.mcs_mmp.clear() - - def enumerate_fragment_properties(self): - """Writes out the ref_smi_dict to disk, calculates natoms, returns data to self.ref_smi_props - Some complexities in method such as double cut fragments (iw_descr only calcs largest frag)""" - - frag_smi_file = tempfile.NamedTemporaryFile(delete=False, suffix='.smi') - frag_smi_props_out = tempfile.NamedTemporaryFile(delete=False) - - with open(frag_smi_file.name, "w") as f: - for item in self.refsmi_dict: - if isinstance(item, int): - # can't see an easy way to do this except string compare, [1H] causes iw_descr to crash out - if self.refsmi_dict[item] != '[1H]': - f.write(self.refsmi_dict[item]+" "+str(item)+"\n") - - # run pymo.iwdescr - self.logger.info("Running pymo.iwdescr on %s smi with in:%s, out:%s" % - (len(self.refsmi_dict), frag_smi_file.name, frag_smi_props_out.name)) - exit_status = pymo.iwdescr(frag_smi_file.name, frag_smi_props_out.name, params_dict={'-l': '', '-v': ''}, - loggero=self.logger) - self.logger.debug("Ran iwdescr with exit status %s" % exit_status) - - with open(frag_smi_props_out.name, "r") as csv_file: - - reader = 
csv.reader(csv_file, delimiter=' ') - i = -1 - for row in reader: - - i += 1 - # if header row, append headers - if i == 0: - if row[1] != 'w_natoms': - self.logger.warn("When this was written, NATOMs was in array position 1 (zero indexed) with " - "column title w_natoms. Now it's not, it's: %s" % row[1]) - sys.exit("When this was written, NATOMs was in array position 1 (zero indexed) with column " - "title w_natom. Now it's not, it's: %s" % row[1]) - continue - - # we trust there is only one entry per id - # print row[0], row[1] - self.ref_smi_props[int(row[0])] = int(row[1]) - - frag_smi_props_out.close() - - self.logger.debug("Completed load of %s mol props from dict of %s from file %s" % - (len(self.ref_smi_props), len(self.refsmi_dict)/2, frag_smi_props_out.name)) - - def get_largest_mcs_pairs(self, out_file, cut_type, mdc_atm_soft=None, mdc_atm_soft_threshold=None, - mdc_atm_hard=None): - """Method to print out a single smi - smi pair from the input CSV with data differences. Selection of the - exact matched pair for a given smi - smi combination is based on the largest Maximum Common Substructure - which equates to the MMP with the smallest MWT/#Atoms difference across all MMP's for that smi/smi combo - - out_file: - The user specified output file - - cut_type: - Specifies the type of fragmentation required. Allowed values are SINGLE, - DOUBLE or BOTH. 
Currently this class does not support anything greater than - double cut fragmentation - - mdc_atm_hard: - max double cut atom cutoff (hard) - Never consider double cut context fragments where one half has num_atoms <= mdc_atm_hard - i.e.: this is a hard cutoff filter implemented during dicer parsing - - mdc_atm_soft: - max double cut atom cutoff (soft) - * must be used with mdc_atm_soft_threshold - When double cut is greater than single, if one part of double context has num_atoms <= mdc_atm_soft and - total double cut atom <= single cut atoms + mdc_atm_soft_threshold then discard - - mdc_atm_soft_threshold: - max double cut atom cutoff threshold (soft) - * must be used with mdc_atm_soft - This gets added to single cut num atoms each comparison that's done, if and when mdc_atm_soft is set - see details of mdc_atm_soft - - Example usage: - - # give me a CSV named my_output.pairs of all MCS based pairs: - my_mmp_object.get_largest_mcs_pairs('myoutput.csv', 'BOTH', 'DICER') - - # give me a CSV of only the DOUBLE cut MCS based pairs with RDKit attachment points: - my_mmp_object.get_largest_mcs_pairs('myoutput.csv', 'DOUBLE', 'RDKIT') - """ - - if (mdc_atm_soft is not None and mdc_atm_soft_threshold is None) or\ - (mdc_atm_soft is None and mdc_atm_soft_threshold is not None): - sys.exit("Error, mdc_atm_soft and mdc_atm_soft_threshold must be specified together.") - - def process_mcss_list_to_string(prefix, input_list): - """sub method to build a printable string from input list of specific structure""" - - out_string = '' - num_of_entries = len(input_list) - - if num_of_entries > 4: - - for i_ in range(0, num_of_entries, 4): - out_string = out_string + prefix + "_" + str((i_/4)+1) + "," + str(molid_L) + "," + str(molid_R) - out_string = out_string + "," + str(sum(input_list[0 + i_])) + "," + str(input_list[1 + i_]) + "," - out_string = out_string + str(input_list[2 + i_]) + "," + str(input_list[3 + i_]) - out_string += "\n" - - else: - if len(input_list[1]) > 1: - 
ctx_smi = self.refsmi_dict[input_list[1][0]] + "." + self.refsmi_dict[input_list[1][1]] - else: - ctx_smi = self.refsmi_dict[input_list[1][0]] - out_string = prefix + "," + str(molid_L) + "," + str(molid_R) + "," - out_string = out_string + str(sum(input_list[0])) + "," + ctx_smi + "," - out_string = out_string + str(self.refsmi_dict[input_list[2]]) + "," \ - + str(self.refsmi_dict[input_list[3]]) - out_string += "\n" - - return out_string - - def disambiguate_double_list(input_list): - """sub method to untangle double cut tie break cases""" - - num_of_entries = len(input_list) - - filtered_list = [] - - # The tie code should have only saved the example with the largest 'smallest fragment' size - # so now we just take the first example where atom numbering [1 before [2 - # Theoretically, if two different examples of a double cut fragmentation pattern exist with the same number - # of atoms *in both parts* of the context, then there is another tie break here. e.g.: - # num_atoms in context = (2,10) should always appear not (1,11) but can't disentangle many (1,11) - # Decided not to handle this and instead just take the first one with the ordered numbering - for i_ in range(0, num_of_entries, 4): - - # only use if the isomeric label is the right way round, [1 before [2 - if '[1' in self.refsmi_dict[input_list[1 + i_][0]]: - filtered_list = input_list[(0 + i_): (4 + i_)] - else: - continue - - return filtered_list - - def remove_atom_num_dupes(input_list): - """sub method to get only 1 example of simple isomeric numbering flip""" - - # only use if the isomeric label is the right way round, [1 before [2 - if '[1' in self.refsmi_dict[input_list[1][0]]: - # take the first 4 items - output_list = input_list[:4] - - else: - # just take the last 4 items - output_list = input_list[-4:] - - return output_list - - self.logger.info('Opening output file for write: %s' % out_file) - - # check cut_type, convert to int - if cut_type.upper() == 'DOUBLE': - # confusing but faster 
later - cut_type_id = 3 - elif cut_type.upper() == 'BOTH': - # confusing but faster later - cut_type_id = 2 - elif cut_type.upper() == 'SINGLE': - cut_type_id = 1 - else: - self.logger.warn('cut_type specification is incorrect, using single cut: %s' % cut_type.upper()) - cut_type_id = 1 - - # fail if both single_pairs_dict and double_pairs_dict are empty - if (len(self.single_pairs_dict) == 0) and (len(self.double_pairs_dict) == 0): - self.logger.debug('No data found in single_pairs_dict and/or double_pairs_dict, expect no results') - # sys.exit("Error: no data found in single_pairs_dict and/or double_pairs_dict, nothing to find and write") - - # - # Here we build data structures of type: - # self.largest_mcs_mmp_result[(molid_L, molid_R)] = [(#atoms, #atoms or None), - # (context_id, context_id or None), frag_Left_id, frag_Right_id] - # - - # single - this is easy as we only keep/store the one with the greatest number of atoms - if cut_type_id <= 2: - - for molid_L, molid_R, ctx_id, frag_L_id, frag_R_id in \ - self.iterator_single_pairs_dict_numeric(inc_attachpt=False): - - if (molid_L, molid_R) in self.largest_mcs_mmp_result: - - if self.largest_mcs_mmp_result[(molid_L, molid_R)][0][0] <= self.ref_smi_props[ctx_id]: - - if self.largest_mcs_mmp_result[(molid_L, molid_R)][0][0] == self.ref_smi_props[ctx_id]: - - self.largest_mcs_mmp_result[(molid_L, molid_R)].extend( - [(self.ref_smi_props[ctx_id], ), (ctx_id, ), frag_L_id, frag_R_id]) - - else: - self.largest_mcs_mmp_result[(molid_L, molid_R)] = [ - (self.ref_smi_props[ctx_id], ), (ctx_id, ), frag_L_id, frag_R_id] - - else: - self.largest_mcs_mmp_result[(molid_L, molid_R)] = [ - (self.ref_smi_props[ctx_id], ), (ctx_id, ), frag_L_id, frag_R_id] - - # now build the final results on the fly - # double - for each one we compare against what we already have in self.largest_mcs_mmp_result - - ctx_natoms = None - - if cut_type_id >= 2: - - for molid_L, molid_R, ctx1_id, ctx2_id, frag_L_id, frag_R_id in \ - 
self.iterator_double_pairs_dict_numeric(inc_attachpt=False): - - # - if ctx1_id in self.ref_smi_props: - ctx_natoms = (self.ref_smi_props[ctx1_id], ) - else: - ctx1_smi = self.refsmi_dict[ctx1_id] - ctx1_smi = ctx1_smi.replace("[1", "[9") - ctx1_smi = ctx1_smi.replace("[2", "[1") - ctx1_smi = ctx1_smi.replace("[9", "[2") - try: - ctx_natoms = (self.ref_smi_props[self.refsmi_dict[ctx1_smi]], ) - except: - print("ERR >>>") - print(("{} {} {} {} {} {}".format(molid_L, molid_R, ctx1_id, ctx2_id, frag_L_id, frag_R_id))) - print(("{} {} {}".format(ctx1_id, ctx1_smi, self.refsmi_dict[ctx1_smi]))) - print("") - - if ctx2_id in self.ref_smi_props: - ctx_natoms = ctx_natoms + (self.ref_smi_props[ctx2_id], ) - else: - ctx2_smi = self.refsmi_dict[ctx2_id] - ctx2_smi = ctx2_smi.replace("[1", "[9") - ctx2_smi = ctx2_smi.replace("[2", "[1") - ctx2_smi = ctx2_smi.replace("[9", "[2") - ctx_natoms = ctx_natoms + (self.ref_smi_props[self.refsmi_dict[ctx2_smi]], ) - - # If the indicator flag check_all_context is set to true we need to pre-filter all ctx fragments - # to ensure they are greater than or equal to the specified limit for mdc_atm_hard (maximum double - # cut atoms hard limit). This is a crude filter and could remove valid double cut MCSS. - if mdc_atm_hard is not None: - if ctx_natoms[0] <= mdc_atm_hard: - continue - - elif ctx_natoms[1] <= mdc_atm_hard: - continue - - # - # Main - # have we seen this smi - smi pair before? 
- if (molid_L, molid_R) in self.largest_mcs_mmp_result: - - # get the number of atoms in the context - num_atoms_existing = self.largest_mcs_mmp_result[(molid_L, molid_R)][0] - if len(num_atoms_existing) > 1: - total_num_atoms_existing = sum(num_atoms_existing) - else: - total_num_atoms_existing = num_atoms_existing[0] - total_num_atoms_new = sum(ctx_natoms) - - if total_num_atoms_new > total_num_atoms_existing: - - # if it is a double and we have a min fragment setting - if mdc_atm_soft is not None: - - # if it falls below the threshold at which we apply this min frag setting - if total_num_atoms_new <= (total_num_atoms_existing + mdc_atm_soft_threshold): - # only keep if both frag sizes are legal - if '[1' in self.refsmi_dict[ctx1_id]: - if (ctx_natoms[0] > mdc_atm_soft) and (ctx_natoms[1] > mdc_atm_soft): - self.largest_mcs_mmp_result[(molid_L, molid_R)] = \ - [ctx_natoms, (ctx1_id, ctx2_id), frag_L_id, frag_R_id] - - # above threshold so keep anyway - else: - if '[1' in self.refsmi_dict[ctx1_id]: - self.largest_mcs_mmp_result[(molid_L, molid_R)] = \ - [ctx_natoms, (ctx1_id, ctx2_id), frag_L_id, frag_R_id] - - else: - if '[1' in self.refsmi_dict[ctx1_id]: - self.largest_mcs_mmp_result[(molid_L, molid_R)] = \ - [ctx_natoms, (ctx1_id, ctx2_id), frag_L_id, frag_R_id] - - # tie-break - elif total_num_atoms_new == total_num_atoms_existing: - - # single always wins over double, so only consider this if existing is double - # double cut tie breaks get disambiguated later using custom function - if len(num_atoms_existing) == 1: - continue - - else: - # consider the size of the 'smallest fragment' and add if same, replace if bigger, - # drop if smaller - if min(ctx_natoms) > min(num_atoms_existing): - if '[1' in self.refsmi_dict[ctx1_id]: - self.largest_mcs_mmp_result[(molid_L, molid_R)] = \ - [ctx_natoms, (ctx1_id, ctx2_id), frag_L_id, frag_R_id] - - elif min(ctx_natoms) == min(num_atoms_existing): - self.largest_mcs_mmp_result[(molid_L, molid_R)].extend( - [ctx_natoms, 
(ctx1_id, ctx2_id), frag_L_id, frag_R_id]) - - else: - # don't store as we have a better context with a larger 'smallest fragment' - continue - - # double cut context must be smaller than what we already have so discard this new one - else: - continue - - else: - # new result, case where we only have a double cut MCSS so add it! - if '[1' in self.refsmi_dict[ctx1_id]: - self.largest_mcs_mmp_result[(molid_L, molid_R)] = [ctx_natoms, (ctx1_id, ctx2_id), - frag_L_id, frag_R_id] - - with open(out_file, "w") as final_out: - - final_out.write('CUT_TYPE,MOL_ID_L,MOL_ID_R,NATOMS,MCSS,FRAG_L,FRAG_R\n') - - # do single cut first as these take precedence above a double - for (molid_L, molid_R) in self.largest_mcs_mmp_result: - - list_length = len(self.largest_mcs_mmp_result[(molid_L, molid_R)]) - # the list self.largest_mcs_mmp_result[(molid_L, molid_R)] contains an ordered list of items - # the first 4 are (1) a tuple of the num_atoms (2) fragment (3&4) context in two parts - # Therefore if the list is greater than 8 items it means we have more than one double - # cut that we need to consider, possibly as a double cut tie break. 
We do not consider the - # case where there are 8 items as we know this will be two identical fragmentation patterns - # with differing isomeric numbering on the atom attachment points therefore we use >8 not >=8 - if list_length > 8: - if len(self.largest_mcs_mmp_result[(molid_L, molid_R)][0]) == 1: - # disambiguate single cut list - final_out.write(process_mcss_list_to_string('SINGLE', self.largest_mcs_mmp_result[ - (molid_L, molid_R)][0:4])) - - else: - # print("Double won (a): ", molid_L, molid_R, self.largest_mcs_mmp_result[(molid_L, molid_R)]) - new_list = disambiguate_double_list(self.largest_mcs_mmp_result[(molid_L, molid_R)]) - final_out.write(process_mcss_list_to_string('DOUBLE', new_list)) - - elif list_length == 4: - # print("Single won (a): ", molid_L, molid_R, self.largest_mcs_mmp_result[(molid_L, molid_R)]) - final_out.write(process_mcss_list_to_string('SINGLE', self.largest_mcs_mmp_result[ - (molid_L, molid_R)])) - - else: - # print("Double wins (b): ", molid_L, molid_R, self.largest_mcs_mmp_result[(molid_L, molid_R)]) - # need to remove atom numbering dupes then print - new_list = remove_atom_num_dupes(self.largest_mcs_mmp_result[(molid_L, molid_R)]) - final_out.write(process_mcss_list_to_string('DOUBLE', new_list)) - - -class _TestMMPbasedMCSSObjectClass(unittest.TestCase): - """Test class for MMPDataObjectClass(object) written to use pythons unittest - - Example usage: - - python mmp_mcss_objects.py - - coverage run mmp_mcss_objects.py - coverage report mmp_mcss_objects.py - - """ - - def setUp(self): - - """Instantiate temp file names, test data objects that get written to temp files - a silent logger object (needed to instantiate class) and the mmp object we'll test""" - - self.maxDiff = None - - # setup test data location use tempfile.NamedTemporaryFile(delete=False) to persist data on disk - self.temp_file_input_smi_01 = tempfile.NamedTemporaryFile(delete=False, suffix=".smi", - encoding='utf-8', mode='wt') - self.temp_file_input_smi_03 = 
tempfile.NamedTemporaryFile(delete=False, suffix=".smi", - encoding='utf-8', mode='wt') - self.temp_file_output_pairs = tempfile.NamedTemporaryFile(delete=False) - - # setup a logger object - self.mmplogger = logging.getLogger('mmpobjectclass_testlogger') - # logging.disable(logging.CRITICAL) - - # create empty mmp object - self.test_mmp_mcss_object = MMPbasedMCSSObjectClass(self.mmplogger) - - # data set for use in testing input - self.test_dataset_goldeninput_smi_01 = { - # The following represent synthetic data, analogues of CHEMBL1382609 - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1382609/ - # 1. substituents are added to the pyrazole ring to generate side chain MMPs - # H on CHEMBL1382609 between two methyls is changed to Br, F, C, I to - # visually see the change in the smiles string (avoiding Cl as already present) - # e.g.: N1C(=C(Br)C(=N1)C)C - # 2. core ring system is modified (phenyl to pyridine) to see ring switch MMP's - # Presence/Absence of Pyridine-N and N-positional isomerism in Cl-Ph ring - # e.g.: C2=NC(=CS2)C2=CC=C(Cl)C=C2 + addition of N -> - # C2=NC(=CS2)C2=CN=C(Cl)C=C2 + move N around ring -> - # C2=NC(=CS2)C2=NC=C(Cl)C=C2 - # for 1,2 single wins - '001': 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(Br)C(=N1)C)C', - '002': 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(F)C(=N1)C)C', - # for 2,5 double wins tie - '003': 'N1(C2=NC(=CS2)C2=CN=C(Cl)C=C2)C(=C(F)C(=N1)C)C', - # The following represent synthetic data, analogues of CHEMBL1341352 - # for 1341352 and it's synthetic unsubstituted analogue there is no double - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1341352/ - '1341352': 'Cc1cc(nn1CC(=O)NCc2ccccc2)C(F)(F)F', - '004': 'c1cc(nn1CC(=O)NCc2ccccc2)', - # more double cut only - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL6211 - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL6232 - '6211': 'O=C(OCC1N(C(=O)c2cc(c(OC)c(c2)OC)OC)CCN(C1)C(=O)c1cc(c(OC)c(OC)c1)OC)CCCCCCC', - '6232': 
'O=C(N1C(CN(C(=O)c2cc(c(OC)c(c2)OC)OC)CC1)COC(=O)CC(C)(C)C)c1cc(c(OC)c(OC)c1)OC' - } - - self.test_dataset_goldeninput_smi_03 = { - # repeat of above - '001': 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(Br)C(=N1)C)C', - '002': 'N1(C2=NC(=CS2)C2=CC=C(Cl)C=C2)C(=C(F)C(=N1)C)C', - } - - # all smiles are output from above input as either a repeat smiles or a fragment of them - self.test_dataset_golden_output_01 = {'CUT_TYPE,MOL_ID_L,MOL_ID_R,NATOMS,MCSS,FRAG_L,FRAG_R': None, - 'SINGLE,1,2,19,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH]': None, - 'SINGLE,2,1,19,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH]': None, - 'DOUBLE,2,3,14,[1ClH].Fc1c([n](c2sc[2cH][n]2)[n]c1C)C,[1cH]1cc[2cH]cc1,[n]1[1cH]cc[2cH]c1': None, - 'DOUBLE,3,2,14,[1ClH].Fc1c([n](c2sc[2cH][n]2)[n]c1C)C,[n]1[1cH]cc[2cH]c1,[1cH]1cc[2cH]cc1': None, - 'SINGLE,1341352,4,11,O=C(NCc1ccccc1)[1CH3],Cc1[1nH][n]c(C(F)(F)F)c1,[1nH]1[n]ccc1': None, - 'SINGLE,4,1341352,11,O=C(NCc1ccccc1)[1CH3],[1nH]1[n]ccc1,Cc1[1nH][n]c(C(F)(F)F)c1': None, - 'DOUBLE,6211,6232,40,[1CH4].[2CH3]C(=O)OCC1N(C(=O)c2cc(c(OC)c(c2)OC)OC)CCN(C1)C(=O)c1cc(c(OC)c(OC)c1)OC,[1CH3]CCC[2CH3],C[12CH2]C': None, - 'DOUBLE,6232,6211,40,[1CH4].[2CH3]C(=O)OCC1N(C(=O)c2cc(c(OC)c(c2)OC)OC)CCN(C1)C(=O)c1cc(c(OC)c(OC)c1)OC,C[12CH2]C,[1CH3]CCC[2CH3]': None} - - self.test_dataset_golden_output_02 = {'CUT_TYPE,MOL_ID_L,MOL_ID_R,NATOMS,MCSS,FRAG_L,FRAG_R': None, - 'SINGLE,1,2,19,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH]': None, - 'SINGLE,2,1,19,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH]': None, - 'SINGLE,2,3,13,Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,Clc1cc[1cH]cc1,Clc1[n]c[1cH]cc1': None, - 'SINGLE,3,2,13,Fc1c([n](c2sc[1cH][n]2)[n]c1C)C,Clc1[n]c[1cH]cc1,Clc1cc[1cH]cc1': None, - 'SINGLE,1341352,4,11,O=C(NCc1ccccc1)[1CH3],Cc1[1nH][n]c(C(F)(F)F)c1,[1nH]1[n]ccc1': None, - 'SINGLE,4,1341352,11,O=C(NCc1ccccc1)[1CH3],[1nH]1[n]ccc1,Cc1[1nH][n]c(C(F)(F)F)c1': None, - 
'SINGLE,6211,6232,39,[1CH3]C(=O)OCC1N(C(=O)c2cc(c(OC)c(c2)OC)OC)CCN(C1)C(=O)c1cc(c(OC)c(OC)c1)OC,[1CH3]CCCCC,C[1CH](C)C': None, - 'SINGLE,6232,6211,39,[1CH3]C(=O)OCC1N(C(=O)c2cc(c(OC)c(c2)OC)OC)CCN(C1)C(=O)c1cc(c(OC)c(OC)c1)OC,C[1CH](C)C,[1CH3]CCCCC': None} - - self.test_dataset_golden_output_03 = {'CUT_TYPE,MOL_ID_L,MOL_ID_R,NATOMS,MCSS,FRAG_L,FRAG_R': None, - 'SINGLE,1,2,19,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH]': None, - 'SINGLE,2,1,19,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH]': None, - 'DOUBLE,1,2,19,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1BrH],[1FH]': None, - 'DOUBLE,2,1,19,Clc1ccc(c2csc([n]3[n]c([1cH]c3C)C)[n]2)cc1,[1FH],[1BrH]': None} - - # write test data to temp file (smi) - for smi_id, smi in list(self.test_dataset_goldeninput_smi_01.items()): - self.temp_file_input_smi_01.write(smi + " " + smi_id + "\n") - self.temp_file_input_smi_01.close() - - # write test data to temp file (smi) - for smi_id, smi in list(self.test_dataset_goldeninput_smi_03.items()): - self.temp_file_input_smi_03.write(smi + " " + smi_id + "\n") - self.temp_file_input_smi_03.close() - - # container for results data - self.test_dataset_testresults = {} - - def tearDown(self): - - """Tear down object for clean reuse in further tests""" - # clean out the object - self.test_mmp_mcss_object.clean_out_data() - # clean out the temp data store - self.test_dataset_testresults.clear() - - os.remove(self.temp_file_input_smi_01.name) - - def test_get_largest_mcs_pairs_with_diff(self): - """Test method to get largest MCS MMP for given smi - smi pair""" - - # 6. full build then write of pairs to file, but only for a single named column - self.test_mmp_mcss_object.build_from_dicer(self.temp_file_input_smi_01.name, 'BOTH', 'NONE') - self.test_mmp_mcss_object.enumerate_fragment_properties() - self.test_mmp_mcss_object.get_largest_mcs_pairs(self.temp_file_output_pairs.name, 'BOTH') - - # now read it back into temp object and check it's what we wrote out! 
- test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - test_results_filehandle.close() - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_golden_output_01, self.test_dataset_testresults) - - def test_get_largest_mcs_pairs_mdc_atm_hard(self): - """Test method to get largest MCS MMP for given smi - smi pair""" - - # 6. full build then write of pairs to file, but only for a single named column - self.test_mmp_mcss_object.build_from_dicer(self.temp_file_input_smi_01.name, 'BOTH', 'NONE') - self.test_mmp_mcss_object.enumerate_fragment_properties() - self.test_mmp_mcss_object.get_largest_mcs_pairs(self.temp_file_output_pairs.name, 'BOTH', mdc_atm_hard=4) - - # now read it back into temp object and check it's what we wrote out! - test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - test_results_filehandle.close() - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_golden_output_02, self.test_dataset_testresults) - - def test_get_largest_mcs_pairs_mdc_atm_soft(self): - """Test method to get largest MCS MMP for given smi - smi pair""" - - # 6. full build then write of pairs to file, but only for a single named column - self.test_mmp_mcss_object.build_from_dicer(self.temp_file_input_smi_03.name, 'BOTH', 'NONE') - self.test_mmp_mcss_object.enumerate_fragment_properties() - - # - self.test_mmp_mcss_object.get_largest_mcs_pairs(self.temp_file_output_pairs.name, 'BOTH') - # now read it back into temp object and check it's what we wrote out! 
- test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - test_results_filehandle.close() - - self.test_mmp_mcss_object.get_largest_mcs_pairs(self.temp_file_output_pairs.name, 'BOTH', mdc_atm_soft=3, - mdc_atm_soft_threshold=4) - # now read it back into temp object and check it's what we wrote out! - test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - test_results_filehandle.close() - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_golden_output_03, self.test_dataset_testresults) - - -if __name__ == '__main__': - unittest.main() diff --git a/contrib/script/py/mmp/mmp_objects.py b/contrib/script/py/mmp/mmp_objects.py deleted file mode 100755 index 5e2479d7..00000000 --- a/contrib/script/py/mmp/mmp_objects.py +++ /dev/null @@ -1,1399 +0,0 @@ -################################################################### -""" Summary: Class and Methods to get matched moleular pairs from smiles - -About: The class is an implementation of Hussain & Rea's approach to generating -Matched Molecular Pairs (J Chem Inf Modea 2013, 50, 339-348) adapted -for the Lilly Dicer fragmentation code. Input smiles are stored, -all dicer fragments are stored and a new Dictonary object created to -store the matched pairs. The data object should not be read directly -for double cuts due to the way the cut/attachment points are labelled -in dicer. 
(JAL) - -Example usage: - - my_mmp_object = MMPObjectClass(my_logger) - my_mmp_object.build_from_dicer(smi_fi, cut_type, filter_type) - my_mmp_object.print_to_file(out_fi, out_fmt, cut_type) - -Warnings: This class assumes the input smiles are already salt stripped -and that their identifiers and convertible to a valid integer value. Input -smiles of the type "c1ccccc1[N+].[Cl-] CHEMBL9876" will fail for both reasons -""" -################################################################### - -import re -import logging -import sys - -import unittest -import tempfile -import copy - -from mmp.mmp_math_functions import cantor, inv_cantor -import mmp.mmp_dicer_functions as dicer - - -class MMPObjectClass(object): - - """Class implements objects nad methods for MMP generation using Dicer fragmentation output - Instantiation of the object requires a valid python logger object to be - passed in as a parameter, even if the logger is switched off. - - Example usage: - mmplogger = logging.getLogger('lillymolfile_logger') - logging.disable(logging.CRITICAL) - my_mmp_object = MMPObjectClass(mmplogger) - - Due to the large number of repeat smiles generated by dicer fragmentation, - normalisation is applied to the final single and double cut matched pairs - objects. The actual MMP data objects single_pairs_dict and double_pairs_dict - simply contain numerical id's for ever smiles. The smiles are stored in - refsmi_dict - """ - def __init__(self, logger_object): - - self.logger = logger_object - if len(logging.Logger.manager.loggerDict) < 1: - # exit with system status 1 and custom error - sys.exit("Invalid or no logger object passed to MMPObjectClass. 
Please create \ - and pass a logger and set to use logging.disable if you don't want logging") - - # remains as dict smi => id - # but also do reverse insert id => smi - # keep refsmi_id at object level as may use methods involving this id multiple times - self.refsmi_id = 0 - self.refsmi_dict = {} - - # Don't want to store input SMI in refsmi_dict in case we get a - # key clash with the numbers used in the refsmi_dict index - # generated by the cantor pairing, therefore use separate dict - self.mol_smi_dict = {} - - # dict to store the attachment points - self.refattach_id = 0 - self.refattach_dict = {} - - # become dict of dicts - self.single_pairs_dict = {} - self.double_pairs_dict = {} - - # some of the iterators attached to this method work on the entire object or on a second custom dict - # but in order to accept a second custom dict it needs to look the same as the base objects i.e.: - # self.single_pairs_dict and self.double_pairs_dict - # Example useage include mmp_predict where we take mols/ctx from single_pairs_comparison_dict and - # get a predicted delta from the pair in self.single_pairs_dict - self.single_pairs_comparison_dict = {} - self.double_pairs_comparison_dict = {} - - # 02/16 Added this for MMS work to avoid exhaustive enumeration of MMS - # Becomes a dict of frag1 => { ctx_1 = None, - # ctx_2 = None } - # frag2 => { ctx_3 = None } - # so we can quickly find which ctx base MMS we need to enumerate via this lookup table - # and can then just enumerate a subset of MMS from self.single_pairs_dict not all - self.frag_ctx_lookup_sgl = {} - self.frag_ctx_lookup_dbl = {} - - def clean_out_data(self): - """Method to clean out all objects in class""" - - # just clean out all the dicts - self.refsmi_dict.clear() - self.single_pairs_dict.clear() - self.double_pairs_dict.clear() - # - self.single_pairs_comparison_dict.clear() - self.double_pairs_comparison_dict.clear() - - def scan_input_smiles(self, smi_fi, injest=False, fail_chirals=False): - """ - 
Dicer currently finishes with an exit code of zero, even if it fails and bombs half way through. This means the - MMP code can complete with results but has missed many pairs due to failed fragmentation, this injects a - pre-check into the process to remove/drop smiles likely to fail dicer BEFORE they get parsed to dicer or just - crash out with an error code. - """ - with open(smi_fi, "r") as input_smifi: - - self.logger.info('Beginning pre-scan of input smiles file') - - line_num = 0 - for line in input_smifi: - - line_num += 1 - line_list = line.split() - - # check it's valid smiles file format which is "smiles\sid" - if len(line_list) != 2: - # random exit code of 4 - sys.exit("Failed to parse smiles from line %s of file %s" % - (line_num, smi_fi)) - - # Is the smiles_id numeric? - try: - smiles_id = int(line_list[1]) - except: - sys.exit("Failed smiles parser as non-numeric id on line %s, id %s of file %s" % - (line_num, smiles_id, smi_fi)) - - smiles = line_list[0] - - if len(smiles) < 2: - sys.exit("Failed smiles parser as tiny smi size on line %s, id %s of file %s" % - (line_num, smiles_id, smi_fi)) - - if "." 
in smiles: - sys.exit("Failed smiles parser as salt on line %s, id %s\nfilename: %s" % - (line_num, smiles_id, smi_fi)) - - if "[1" in smiles or "[2" in smiles: - sys.exit("Failed smiles parser as smiles on line %s has isomeric label [1 or [2 %s" % - (line_num, smi_fi)) - - if fail_chirals: - if "@" in smiles: - sys.exit("Failed smiles parser as smiles on line %s contains chiral flags @: %s" % - (line_num, smi_fi)) - - if injest: - # add to the mol_smi_dict as mol_id => smiles - self.mol_smi_dict[smiles_id] = line_list[0] - - self.logger.info('Completed pre-scan of input smiles file with no errors') - - def build_from_dicer(self, smi_fi, cut_type, filter_type, threshold=0.50001, use_comparison_dict=False, - add_frag_ctx_map=False): - """Method to take an input smiles file, fragment each entry with dicer, then read - the dicer output back into the MMP data structure. The data structures produced - by this method should not be accessed directly. The following parameters are - mandatory: - - smi_fi: - The user specified smiles file - - cut_type: - Specifies the type of fragmentation required. Allowed values are SINGLE, - DOUBLE or BOTH. Currently this class does not support anything greater than - double cut fragmentation - - filter_type: - Optional filtering can be applied to remove rings or non-rings from any of - the matched pairs generated. - REMOVE_NONRINGS, REMOVE_RINGS where NONE will results in all dicer output - being used/stored. 
Filtering is achieved using the regular expression '[^H\[]\d' - - Example usage, all matched pairs single and double cut fragmentation: - my_mmp_object.build_from_dicer('mysmiles.smi', 'BOTH', 'NONE') - - Example usage, Terminal ring replacements: - my_mmp_object.build_from_dicer('mysmiles.smi', 'SINGLE', 'REMOVE_NONRINGS') - - Example usage, non-ring linker matched pairs / replacements: - my_mmp_object.build_from_dicer('mysmiles.smi', 'DOUBLE', 'REMOVE_RINGS') - - 180515 - Added parameter use_comparison_dict in order to hijack method and allow - us to reuse code in building an additional MMP data structure for use in later algorithms - such as mmp_predict, this writes data to an alternate 'comparison' dict of same structure - - 260516 - Surfaced maxff setting as threshold param. - Use of threshold default 0.50001 ensures >50.001% of mol remains as context. Larger values - will remove smaller fragments and a value of 0.7 will retain 70% of the mol as context - """ - # smi_fi and cut_type are passed through to build_mmpdicer_cmd() - self.logger.debug("Dicer threshold: %s" % threshold) - - # see comments on below iterator_single_pairs_dict - if use_comparison_dict is False: - query_dict_single = self.single_pairs_dict - query_dict_double = self.double_pairs_dict - - elif use_comparison_dict is True: - query_dict_single = self.single_pairs_comparison_dict - query_dict_double = self.double_pairs_comparison_dict - - else: - self.logger.debug("Invalid parameter for method, use_ctx_lookup_dict must be True or False") - sys.exit("Invalid parameter for method, use_ctx_lookup_dict must be True or False") - - # * Check filter_type, convert to int - # * Regex works by finding all examples of xN where x is an atom or ')' and N is a number/int - # For example c1 / n2 / )1 which are all ring close/open smiles. We need to exclude things like - # ]1 and 12 which give false matches against isomerically labelled groups. 
- # Full details: [^H\d\[]\d pattern group [] plus digit \d, inside pattern group ^H^\d\[ means - # ^ means NOT.. H, digit \d or escaped [ - # - if filter_type == 'REMOVE_NONRINGS': - filter_id = 2 - regex_rings = re.compile(r'[^H\d\[]\d') - elif filter_type == 'REMOVE_RINGS': - filter_id = 3 - regex_rings = re.compile(r'[^H\d\[]\d') - else: - filter_id = 1 - - # check cut_type, convert to int - if cut_type.upper() == 'DOUBLE': - # confusing but faster later - cut_type_id = 3 - elif cut_type.upper() == 'BOTH': - # confusing but faster later - cut_type_id = 2 - elif cut_type.upper() == 'SINGLE': - cut_type_id = 1 - else: - self.logger.warn('cut_type specification is incorrect, using single cut: %s' % cut_type.upper()) - cut_type_id = 1 - - # - ctx1_id = 0 - ctx2_id = 0 - frag_id = 0 - - dupe_frags_s = 0 - dupe_frags_d = 0 - - # regex to find contexts that have a fragmentation point label of '1' - regex_context1 = re.compile(r'\[1\w{1,3}\]') - - for num_cuts, mol_id, ctx_orig, frag, fattach, cattach in dicer.execute_dicer(smi_fi, cut_type, threshold, - self.logger): - - # use continue for filter conditions, some say continue is not nice - if filter_id > 1: - if regex_rings.search(frag) is None: - rings = 0 - else: - rings = 1 - if filter_id == 3 and rings > 0: - continue - elif filter_id == 2 and rings == 0: - continue - - # add frag to smi dict or get id if exists - # self.refsmi_dict [a_smiles_string] => an_arbitrary_uniqueid - if frag in self.refsmi_dict: - frag_id = self.refsmi_dict[frag] - else: - self.refsmi_id += 1 - self.refsmi_dict[frag] = self.refsmi_id - self.refsmi_dict[self.refsmi_id] = frag - frag_id = self.refsmi_id - self.logger.debug('Got new fragment: %s with id %s' % (frag, self.refsmi_id)) - - # add attachment points to self.refattach_dict both ways to allow for - # reverse lookup: [attachment point string] = random_id and reverse - if fattach in self.refattach_dict: - fattach_id = self.refattach_dict[fattach] - else: - self.refattach_id += 1 - 
self.refattach_dict[fattach] = self.refattach_id - self.refattach_dict[self.refattach_id] = fattach - fattach_id = self.refattach_id - self.logger.debug('Got new Attachment point label: %s with id %s' % (fattach, fattach_id)) - - if cattach in self.refattach_dict: - cattach_id = self.refattach_dict[cattach] - else: - self.refattach_id += 1 - self.refattach_dict[cattach] = self.refattach_id - self.refattach_dict[self.refattach_id] = cattach - cattach_id = self.refattach_id - self.logger.debug('Got new Attachment point label: %s with id %s' % (cattach, cattach_id)) - - if num_cuts == 1 and cut_type_id <= 2: - - # does the context already have an unique id? if not get one and add to self.refsmi_dict - if ctx_orig in self.refsmi_dict: - ctx1_id = self.refsmi_dict[ctx_orig] - else: - self.refsmi_id += 1 - self.refsmi_dict[ctx_orig] = self.refsmi_id - self.refsmi_dict[self.refsmi_id] = ctx_orig - ctx1_id = self.refsmi_id - self.logger.debug('Got new single cut ctx smi: %s with id %s' % (ctx_orig, self.refsmi_id)) - - # now create molid_ctx1id_id which is a unique combination of the mol_id paired with a valid frag_id - # for that given mol_id. 
- # we need to store both, as they are different fragmentations, keying a dict by mol_id would overwrite - # values so we create unique molid + fragid combination (** see matching comments below) - molid_fragid_uid = cantor(mol_id, frag_id) - - # - # add to the MMP data structure - if ctx1_id in query_dict_single: - - # check to remove dupes occuring from groups like CF3 that give 3 x C-F fragmentation - if molid_fragid_uid in query_dict_single[ctx1_id]: - dupe_frags_s = dupe_frags_s + 1 - else: - self.logger.debug('Add to query dict (ctx_id, mol_id (as molid_fragid_uid), ' - 'frag_id): %s, %s, %s, %s' % (ctx1_id, mol_id, molid_fragid_uid, frag_id)) - query_dict_single[ctx1_id][molid_fragid_uid] = (fattach_id, cattach_id) - - else: - # create new entry in single cut MMP dict - # builds data structure of: - # dict_single[ctx1] => { mol1_frag1_id => frag1 } - # dict_single[ctx2] => { mol1_frag2_id => frag2 } - # dict_single[ctx3] => { mol1_frag3_id => frag3 } - # As you add new mols you get something like: - # dict_single[ctx1] => { mol1_frag1_id => frag1, mol2_frag4_id => frag4, } - # dict_single[ctx2] => { mol1_frag2_id => frag2, mol2_frag5_id => frag5, } - # dict_single[ctx3] => { mol1_frag3_id => frag3 } - # dict_single[ctx4] => { mol2_frag6_id => frag6 } - query_dict_single[ctx1_id] = {} - self.logger.debug('Add to query dict (ctx_id, mol_id (as molid_fragid_uid), ' - 'frag_id): %s, %s, %s, %s' % (ctx1_id, mol_id, molid_fragid_uid, frag_id)) - query_dict_single[ctx1_id][molid_fragid_uid] = (fattach_id, cattach_id) - - # 02/16 - # added in for Matched Series on-the-fly generation - if add_frag_ctx_map: - - if frag_id in self.frag_ctx_lookup_sgl: - - # just add it, don't care if we overwrite - self.frag_ctx_lookup_sgl[frag_id].add(ctx1_id) - - else: - self.frag_ctx_lookup_sgl[frag_id] = set() - self.frag_ctx_lookup_sgl[frag_id].add(ctx1_id) - - elif num_cuts == 2 and cut_type_id >= 2: - - # split up the fragments and fragment attachment points - ctx_list = 
ctx_orig.split('.') - ctx1_tmp = ctx_list[0] - ctx2_tmp = ctx_list[1] - - # sort context, should not need to do this with latest dicer as enforces order [1xxxx . [2xxxx - if regex_context1.search(ctx1_tmp) is not None: - ctx1 = ctx1_tmp - ctx2 = ctx2_tmp - else: - ctx1 = ctx2_tmp - ctx2 = ctx1_tmp - - # normalise - if ctx1 in self.refsmi_dict: - ctx1_id = self.refsmi_dict[ctx1] - else: - self.refsmi_id += 1 - self.refsmi_dict[ctx1] = self.refsmi_id - self.refsmi_dict[self.refsmi_id] = ctx1 - ctx1_id = self.refsmi_id - self.logger.debug('Got new double cut ctx smi: %s with id %s' % (ctx1, self.refsmi_id)) - - if ctx2 in self.refsmi_dict: - ctx2_id = self.refsmi_dict[ctx2] - else: - self.refsmi_id += 1 - self.refsmi_dict[ctx2] = self.refsmi_id - self.refsmi_dict[self.refsmi_id] = ctx2 - ctx2_id = self.refsmi_id - self.logger.debug('Got new double cut ctx smi: %s with id %s' % (ctx2, self.refsmi_id)) - - # combine to get unique id, this is simply for speed - # cantor pairing function allows us to combine two unique id's to get a new single id - # however, what's neat is that we can do a reverse cantor to get both original id's back again - # so we get very fast hash lookup and smaller storage (single id instead of tuple of 2 id's) - # Need to store this in numeric order to avoid misparsing dicer output - if ctx1_id <= ctx2_id: - ctx12_id = cantor(ctx1_id, ctx2_id) - else: - ctx12_id = cantor(ctx2_id, ctx1_id) - - molid_fragid_uid = cantor(mol_id, frag_id) - - if ctx12_id in query_dict_double: - # check to remove dupes occurring from groups like CF3 - if molid_fragid_uid in query_dict_double[ctx12_id]: - dupe_frags_s += 1 - else: - self.logger.debug('Add to Query dict Double cut dict (ctx12_id (ctx1_id, ctx2_id), ' - 'molid_fragid_uid (mol_id, frag_id): %s, %s, %s, %s, %s, %s' % - (ctx12_id, ctx1_id, ctx2_id, molid_fragid_uid, mol_id, frag_id)) - query_dict_double[ctx12_id][molid_fragid_uid] = (fattach_id, cattach_id) - else: - query_dict_double[ctx12_id] = {} - 
self.logger.debug('Add to Query dict Double cut dict (ctx12_id, ctx1_id, ctx2_id, ' - 'molid_fragid_uid, mol_id, frag_id): %s, %s, %s, %s, %s, %s' % - (ctx12_id, ctx1_id, ctx2_id, molid_fragid_uid, mol_id, frag_id)) - query_dict_double[ctx12_id][molid_fragid_uid] = (fattach_id, cattach_id) - - # 02/16 - # added in for Matched Series on-the-fly generation - if add_frag_ctx_map: - - if frag_id in self.frag_ctx_lookup_dbl: - - # just add it, don't care if we overwrite - self.frag_ctx_lookup_dbl[frag_id].add(ctx12_id) - - else: - self.frag_ctx_lookup_dbl[frag_id] = set() - self.frag_ctx_lookup_dbl[frag_id].add(ctx12_id) - - # catch severe fails - else: - self.logger.debug('Filtered out line with %d cuts' % num_cuts) - - self.logger.debug('mem_trace refsmi_dict entries entries: %s, mem usage: %s' % - (len(self.refsmi_dict), sys.getsizeof(self.refsmi_dict)/1000)) - self.logger.debug('mem_trace query_dict entries: %s, mem usage: %s' % - (len(query_dict_single), sys.getsizeof(query_dict_single)/1000)) - self.logger.debug('mem_trace double_pairs_dict entries: %s, mem usage: %s' % - (len(query_dict_double), sys.getsizeof(query_dict_double)/1000)) - - # wrap up - self.logger.info('Done reading dicer output into single and double pair dicts') - - self.logger.info('mem_trace refsmi_dict entries: %s, mem usage: %s' % - (len(self.refsmi_dict), sys.getsizeof(self.refsmi_dict)/1000)) - - self.logger.info('mem_trace query_dict single entries: %s, mem usage: %s' % - (len(query_dict_single), sys.getsizeof(query_dict_single)/1000)) - self.logger.info('mem_trace query_dict double entries: %s, mem usage: %s' % - (len(query_dict_double), sys.getsizeof(query_dict_double)/1000)) - self.logger.info('Removed %d and %d duplicate fragmentation patterns from single_pairs_dict' - ' & double_pairs_dict' % (dupe_frags_s, dupe_frags_d)) - - def iterator_single_pairs_dict_numeric(self, inc_attachpt=False, use_comparison_dict=False): - """ Method to iterate over Single Cut Dictionary structure and 
yield - numeric values of pairs to be used programmatically in other new data - structure such as with H enrichment algorithm or graph building""" - - # see comments on below iterator_single_pairs_dict - if use_comparison_dict is False: - query_dict = self.single_pairs_dict - - elif use_comparison_dict is True: - query_dict = self.single_pairs_comparison_dict - - else: - self.logger.debug("Invalid parameter for method, use_ctx_lookup_dict must be True or False") - sys.exit("Invalid parameter for method, use_ctx_lookup_dict must be True or False") - - # structure of this iterator follows iterator_single_pairs_dict - for ctx_id in query_dict: - - for molid_fragid_uid_L in query_dict[ctx_id]: - molid_L, frag_id_L = inv_cantor(molid_fragid_uid_L) - - if ctx_id in self.single_pairs_dict: - for molid_fragid_uid_R in self.single_pairs_dict[ctx_id]: - molid_R, frag_id_R = inv_cantor(molid_fragid_uid_R) - - if molid_L != molid_R: - - if frag_id_L != frag_id_R: - - if inc_attachpt: - - # get attachment points: - fattach_R = self.single_pairs_dict[ctx_id][molid_fragid_uid_R][0] - cattach_R = self.single_pairs_dict[ctx_id][molid_fragid_uid_R][1] - fattach_L = query_dict[ctx_id][molid_fragid_uid_L][0] - cattach_L = query_dict[ctx_id][molid_fragid_uid_L][1] - - yield molid_L, molid_R, ctx_id, frag_id_L, frag_id_R, fattach_L, cattach_L, \ - fattach_R, cattach_R - - else: - yield molid_L, molid_R, ctx_id, frag_id_L, frag_id_R - - def iterator_single_pairs_dict(self, use_comparison_dict=False): - """ Method to iterate over Single Cut Dictionary structure and yield pairs """ - - # hijack method with an alternate query dict and therefore search between SMI sets not just withing SMI sets - if use_comparison_dict is False: - query_dict = self.single_pairs_dict - - elif use_comparison_dict is True: - # will use a restricted set of context's for pair lookup - query_dict = self.single_pairs_comparison_dict - - else: - # error - self.logger.debug("Invalid parameter for method, 
use_ctx_lookup_dict must be True or False") - sys.exit("Invalid parameter for method, use_ctx_lookup_dict must be True or False") - - self.logger.info('Iterating over single cut pairs dictionary') - - for ctx_id in query_dict: - - # pairs are already determined by the data structure, for example: - # dict_single[ctx1] => { mol1_frag1_id => frag1, mol2_frag4_id => frag4, } - # dict_single[ctx2] => { mol1_frag2_id => frag2, mol2_frag5_id => frag5, } - # dict_single[ctx3] => { mol1_frag3_id => frag3 } - # dict_single[ctx4] => { mol2_frag6_id => frag6 } - # pairs exist where the context is the same, but the fragment differs - # so for ctx2, we find a matched pair between mol1 and mol2 of 'frag1 -> frag5' - ctx = self.refsmi_dict[ctx_id] - - for molid_fragid_uid_L in query_dict[ctx_id]: - - molid_L, frag_id_L = inv_cantor(molid_fragid_uid_L) - frag_L = self.refsmi_dict[frag_id_L] - - if ctx_id in self.single_pairs_dict: - for molid_fragid_uid_R in self.single_pairs_dict[ctx_id]: - molid_R, frag_id_R = inv_cantor(molid_fragid_uid_R) - - # filter out self matches - if molid_L != molid_R: - - # filter out (non) pair change where frag is identical in each mol - if frag_id_L != frag_id_R: - frag_R = self.refsmi_dict[frag_id_R] - - # finally get the attachment point info back from the numeric id's - fattach_str_R = self.refattach_dict[self.single_pairs_dict[ctx_id][molid_fragid_uid_R][0]] - cattach_str_R = self.refattach_dict[self.single_pairs_dict[ctx_id][molid_fragid_uid_R][1]] - fattach_str_L = self.refattach_dict[query_dict[ctx_id][molid_fragid_uid_L][0]] - cattach_str_L = self.refattach_dict[query_dict[ctx_id][molid_fragid_uid_L][1]] - - yield molid_L, molid_R, ctx, frag_L, frag_R, fattach_str_L, cattach_str_L, \ - fattach_str_R, cattach_str_R - - def iterator_double_pairs_dict_numeric(self, inc_attachpt=False, use_comparison_dict=False): - """Method to iterate over Double Cut Dictionary structure and yield pairs as numeric ID's. 
This method uses more - memory than the non-numeric iterator as it inserts new smi into the smi_dict representing flipped double cut - smi, and is consequently slower if big dict resize. It is not needed for simple MMP generation but was written - for more complex methods that sit above the MMP code such as the MCSS generation code that re-stores MMPs, hence - the need for numeric ID's for mem efficiency. Also, the iterator will return an additional numeric value to the - single cut one as there are two different numeric id's returned for a context that cannot be stored / returned - together (whereas a smiles can as dot delimited). - """ - - # as per single cuts comment - if use_comparison_dict is False: - # memory pointer! - query_dict = self.double_pairs_dict - - elif use_comparison_dict is True: - # will use a restricted set of context's for pair lookup - query_dict = self.double_pairs_comparison_dict - - else: - # error - self.logger.debug("Invalid parameter for method, use_ctx_lookup_dict must be True or False") - sys.exit("Invalid parameter for method, use_ctx_lookup_dict must be True or False") - - self.logger.info('Inverting refattach_dict') - - # unfortunately need to invert self.refattach_dict - for key, val in self.refattach_dict.items(): - if val not in self.refattach_dict: - self.refattach_dict[val] = key - self.logger.info('done inverting refattach_dict') - - self.logger.info('Iterating over double cut pairs dictionary') - - # regex to find contexts that have a fragmentation point label of '1' - regex_context1 = re.compile(r'\[1\w{1,3}\]') - - def convert_smi_to_id(smiles): - - if smiles in self.refsmi_dict: - smiles_id = self.refsmi_dict[smiles] - else: - self.refsmi_id += 1 - self.refsmi_dict[smiles] = self.refsmi_id - self.refsmi_dict[self.refsmi_id] = smiles - smiles_id = self.refsmi_id - - return smiles_id - - def convert_attachsmi_to_id(attach): - - if attach in self.refattach_dict: - attach_id = self.refattach_dict[attach] - else: - 
self.refattach_id += 1 - attach_id = self.refattach_id - self.refattach_dict[attach] = self.refattach_id - self.refattach_dict[self.refattach_id] = attach - - return attach_id - - for ctxm_id in query_dict: - - # pairs are found in exactly the same way as single cuts - - # first deconvolute the context ids so we can get back the smiles - ctx1_id_tmp, ctx2_id_tmp = inv_cantor(ctxm_id) - ctx1_tmp = self.refsmi_dict[ctx1_id_tmp] - ctx2_tmp = self.refsmi_dict[ctx2_id_tmp] - - # not sure if I need the regex anymore with the new dicer canonicalisation - if regex_context1.search(ctx1_tmp) is not None: - # got [1 so keep - ctx1_id = ctx1_id_tmp - ctx1 = ctx1_tmp - ctx2_id = ctx2_id_tmp - ctx2 = ctx2_tmp - else: - ctx1_id = ctx2_id_tmp - ctx1 = ctx2_tmp - ctx2_id = ctx1_id_tmp - ctx2 = ctx1_tmp - - ctx1_f = ctx1 - ctx2_f = ctx2 - - ctx1_f = ctx1_f.replace("[1", "[9").replace("[2", "[1").replace("[9", "[2") - ctx2_f = ctx2_f.replace("[1", "[9").replace("[2", "[1").replace("[9", "[2") - - ctx1_f_id = convert_smi_to_id(ctx1_f) - ctx2_f_id = convert_smi_to_id(ctx2_f) - - self.logger.debug('Got: %s, %s from %s' % (ctx1_id, ctx2_id, ctxm_id)) - self.logger.debug('which converts to context: %s, %s' % (ctx1, ctx2)) - - # now find pairs - for molid_fragid_uid_L in query_dict[ctxm_id]: - molid_L, frag_id_L = inv_cantor(molid_fragid_uid_L) - frag_L = self.refsmi_dict[frag_id_L] - - # check it exists in double_pairs_dict, because it might not if - # we are comparing data from double_pairs_comparison_dict - if ctxm_id in self.double_pairs_dict: - - # flip it, as we need to reverse numbering - frag_L_f = frag_L - if '[12' not in frag_L: - frag_L_f = frag_L_f.replace("[1", "[9") - frag_L_f = frag_L_f.replace("[2", "[1") - frag_L_f = frag_L_f.replace("[9", "[2") - - frag_L_f_id = convert_smi_to_id(frag_L_f) - - else: - frag_L_f_id = frag_id_L - - if inc_attachpt: - fattach_str_L_id = query_dict[ctxm_id][molid_fragid_uid_L][0] - cattach_str_L_id = query_dict[ctxm_id][molid_fragid_uid_L][1] 
- fattach_str_L = self.refattach_dict[fattach_str_L_id] - cattach_str_L = self.refattach_dict[cattach_str_L_id] - - fattach_str_L_f = re.sub("\[(\d)(.*)\|(\d)(.*)", r"[\3\2|\1\4", fattach_str_L) - cattach_str_L_f = re.sub("\[(\d)(.*)\|(\d)(.*)", r"[\3\2|\1\4", cattach_str_L) - - fattach_str_L_id_f = convert_attachsmi_to_id(fattach_str_L_f) - cattach_str_L_id_f = convert_attachsmi_to_id(cattach_str_L_f) - - # now iterate over pairs - for molid_fragid_uid_R in self.double_pairs_dict[ctxm_id]: - - molid_R, frag_id_R = inv_cantor(molid_fragid_uid_R) - # print "Working with molid_l=%s, molid_r=%s, ctx=%s, frag_L=%s, frag_r=%s" % - # (molid_L, molid_R, ctx, frag_L, self.refsmi_dict[frag_id_R] ) - - if molid_L != molid_R: - - frag_R = self.refsmi_dict[frag_id_R] - # need to remove pairs where the original smiles was identical for both molid - # which results in a pair with frag_l == frag_r and is not of interest - # this mostly occurs because we do not handle chirality - # this simple if string comparison statement is expensive and slows code down! - if frag_L != frag_R: - - # At this point we have a pair! 
- - # now flip the R - frag_R_f = frag_R - if '[12' not in frag_R: - frag_R_f = frag_R_f.replace("[1", "[9") - frag_R_f = frag_R_f.replace("[2", "[1") - frag_R_f = frag_R_f.replace("[9", "[2") - - frag_R_f_id = convert_smi_to_id(frag_R_f) - - else: - frag_R_f_id = frag_id_R - - # and flip attachment points - if inc_attachpt: - - # finally get the attachment point info back from the numeric id's - fattach_str_R_id = self.double_pairs_dict[ctxm_id][molid_fragid_uid_R][0] - cattach_str_R_id = self.double_pairs_dict[ctxm_id][molid_fragid_uid_R][1] - fattach_str_R = self.refattach_dict[fattach_str_R_id] - cattach_str_R = self.refattach_dict[cattach_str_R_id] - - # regex to flip the numbers in the attachment points - fattach_str_R_f = re.sub("\[(\d)(.*)\|(\d)(.*)", r"[\3\2|\1\4", fattach_str_R) - cattach_str_R_f = re.sub("\[(\d)(.*)\|(\d)(.*)", r"[\3\2|\1\4", cattach_str_R) - - # - fattach_str_R_id_f = convert_attachsmi_to_id(fattach_str_R_f) - cattach_str_R_id_f = convert_attachsmi_to_id(cattach_str_R_f) - - for count_ in range(2): - if count_ == 0: - # original with attachment point numbers - yield molid_L, molid_R, ctx1_id, ctx2_id, frag_id_L, frag_id_R,\ - fattach_str_L_id, cattach_str_L_id, fattach_str_R_id, cattach_str_R_id - else: - # reversed format - yield molid_L, molid_R, ctx1_f_id, ctx2_f_id, frag_L_f_id, frag_R_f_id, \ - fattach_str_L_id_f, cattach_str_L_id_f, fattach_str_R_id_f, \ - cattach_str_R_id_f - - else: - for count_ in range(2): - if count_ == 0: - # original - yield molid_L, molid_R, ctx1_id, ctx2_id, frag_id_L, frag_id_R - else: - # reversed - yield molid_L, molid_R, ctx1_f_id, ctx2_f_id, frag_L_f_id, frag_R_f_id - - def __subiterator_double_pairs_dict(self, use_comparison_dict): - """A hidden sub method used by iterator_double_pairs_dict() method - to iterate over Double Cut Dictionary structure and yield pairs""" - - # as per single cuts comment - if use_comparison_dict is False: - # memory pointer! 
- query_dict = self.double_pairs_dict - - elif use_comparison_dict is True: - # will use a restricted set of context's for pair lookup - query_dict = self.double_pairs_comparison_dict - - else: - # error - self.logger.debug("Invalid parameter for method, use_ctx_lookup_dict must be True or False") - sys.exit("Invalid parameter for method, use_ctx_lookup_dict must be True or False") - - self.logger.info('Iterating over double cut pairs dictionary') - - # regex to find contexts that have a fragmentation point label of '1' - regex_context1 = re.compile(r'\[1\w{1,3}\]') - - for ctxm_id in query_dict: - - # pairs are found in exactly the same way as single cuts - - # first deconvolute the context ids so we can get back the smiles - ctx1_id_tmp, ctx2_id_tmp = inv_cantor(ctxm_id) - ctx1_tmp = self.refsmi_dict[ctx1_id_tmp] - ctx2_tmp = self.refsmi_dict[ctx2_id_tmp] - - if regex_context1.search(ctx1_tmp) is not None: - # got [1 so keep - ctx1_id = ctx1_id_tmp - ctx1 = ctx1_tmp - ctx2_id = ctx2_id_tmp - ctx2 = ctx2_tmp - else: - ctx1_id = ctx2_id_tmp - ctx1 = ctx2_tmp - ctx2_id = ctx1_id_tmp - ctx2 = ctx1_tmp - - ctx = ctx1 + "." 
+ ctx2 - - self.logger.debug('Got: %s, %s from %s' % (ctx1_id, ctx2_id, ctxm_id)) - self.logger.debug('which converts to context: %s, %s' % (ctx1, ctx2)) - - # - # now find pairs - for molid_fragid_uid_L in query_dict[ctxm_id]: - molid_L, frag_id_L = inv_cantor(molid_fragid_uid_L) - frag_L = self.refsmi_dict[frag_id_L] - - # check it exists in double_pairs_dict, because it might not if - # we are comparing data from double_pairs_comparison_dict - if ctxm_id in self.double_pairs_dict: - for molid_fragid_uid_R in self.double_pairs_dict[ctxm_id]: - molid_R, frag_id_R = inv_cantor(molid_fragid_uid_R) - # print "Working with molid_l=%s, molid_r=%s, ctx=%s, frag_L=%s, frag_r=%s" % - # (molid_L, molid_R, ctx, frag_L, self.refsmi_dict[frag_id_R] ) - if molid_L != molid_R: - frag_R = self.refsmi_dict[frag_id_R] - # need to remove pairs where the original smiles was identical for both molid - # which results in a pair with frag_l == frag_r and is not of interest - # this mostly occurs because we do not handle chirality - # this simple if string comparison statement is expensive and slows code down! - if frag_L != frag_R: - - # At this point we have a pair! - # Some cases exist where multiple fragmentations for the same input smiles give the - # same context but different frag. This is due to isomeric substitution e.g.: two - # different Me groups off a core. 
These are still valid matches but could be removed - # by ensuring that the original smiles is also different - - # finally get the attachment point info back from the numeric id's - fattach_str_R = self.refattach_dict[ - self.double_pairs_dict[ctxm_id][molid_fragid_uid_R][0]] - cattach_str_R = self.refattach_dict[ - self.double_pairs_dict[ctxm_id][molid_fragid_uid_R][1]] - fattach_str_L = self.refattach_dict[query_dict[ctxm_id][molid_fragid_uid_L][0]] - cattach_str_L = self.refattach_dict[query_dict[ctxm_id][molid_fragid_uid_L][1]] - - yield molid_L, molid_R, ctx, frag_L, frag_R, fattach_str_L, cattach_str_L, \ - fattach_str_R, cattach_str_R - - def iterator_double_pairs_dict(self, use_comparison_dict=False): - - """Method to iterate over Double Cut Dictionary structure and yield pairs""" - - for molid_L, molid_R, ctx, frag_L, frag_R, fattach_str_L, cattach_str_L, fattach_str_R, cattach_str_R \ - in self.__subiterator_double_pairs_dict(use_comparison_dict): - - for count_ in range(2): - - # yield the pair twice, first raw data, second with flipped numbering - if count_ == 0: - - # print molid_L, molid_R, ctx, frag_L, frag_R, fattach_str_L, cattach_str_L, fattach_str_R, - # cattach_str_R - yield molid_L, molid_R, ctx, frag_L, frag_R, fattach_str_L, cattach_str_L, fattach_str_R, \ - cattach_str_R - - else: - - ctx = ctx.replace("[1", "[9") - ctx = ctx.replace("[2", "[1") - ctx = ctx.replace("[9", "[2") - - if '[12' not in frag_L: - frag_L = frag_L.replace("[1", "[9") - frag_L = frag_L.replace("[2", "[1") - frag_L = frag_L.replace("[9", "[2") - - if '[12' not in frag_R: - frag_R = frag_R.replace("[1", "[9") - frag_R = frag_R.replace("[2", "[1") - frag_R = frag_R.replace("[9", "[2") - - # regex to flip the numbers in the attachment points - fattach_str_L = re.sub("\[(\d)(.*)\|(\d)(.*)", r"[\3\2|\1\4", fattach_str_L) - cattach_str_L = re.sub("\[(\d)(.*)\|(\d)(.*)", r"[\3\2|\1\4", cattach_str_L) - fattach_str_R = re.sub("\[(\d)(.*)\|(\d)(.*)", r"[\3\2|\1\4", 
fattach_str_R) - cattach_str_R = re.sub("\[(\d)(.*)\|(\d)(.*)", r"[\3\2|\1\4", cattach_str_R) - - # print "-F-> ", molid_L, molid_R, ctx, frag_L, frag_R, fattach_str_L, cattach_str_L, - # fattach_str_R, cattach_str_R - - yield molid_L, molid_R, ctx, frag_L, frag_R, fattach_str_L, cattach_str_L, fattach_str_R, \ - cattach_str_R - - def _inspector_double_pairs_dict(self): - """Method is simply used in validation work, it's not algorithmically useful. The code will confirm that - dicer is canonicalising the double cut context fragments correctly. Incorrect canonicalisation will result - in a double cut being represented in two different ways and stored independently, with different fragments - against each case. Pairs will be missed as you only find mmp's for each of the different representations and - not between the two representations. Correct canonicalisation will ensure this does not happen. The code - below is searching for cases where two different canonicalised forms are stored with different fragments. The - code should not return any results, thus proving that canonicalisation is done correctly. 
Example validation - test code would be: - - def test_inspector_double_cut_pairs(self): - self.test_mmp_object.build_from_dicer("twomillionsmi.smi", 'DOUBLE', 'NONE') - self.test_mmp_object.inspector_double_pairs_dict() - - Ran on with 2.1M smi and no results were returned (no print to cmd line) - unit test switched back to smiple input file - """ - - # regex to find contexts that have a fragmentation point label of '1' - regex_context1 = re.compile(r'\[1\w{1,3}\]') - - counter = 0 - - for ctxm_id in self.double_pairs_dict: - - counter += 1 - # if (counter % 1000) == 0: - # print "Iteration: ", counter - - # first deconvolute the context ids so we can get back the smiles - ctx1_id, ctx2_id = inv_cantor(ctxm_id) - - ctx1_smi = self.refsmi_dict[ctx1_id] - ctx2_smi = self.refsmi_dict[ctx2_id] - - if regex_context1.search(ctx1_smi) is not None: - # got [1 so flip - ctx1_smi_flipped = ctx1_smi.replace("[1", "[2") - ctx2_smi_flipped = ctx2_smi.replace("[2", "[1") - - else: - ctx1_smi_flipped = ctx1_smi.replace("[2", "[1") - ctx2_smi_flipped = ctx2_smi.replace("[1", "[2") - - if ctx1_smi_flipped in self.refsmi_dict: - ctx1_id_flipped = self.refsmi_dict[ctx1_smi_flipped] - else: - ctx1_id_flipped = None - - if ctx2_smi_flipped in self.refsmi_dict: - ctx2_id_flipped = self.refsmi_dict[ctx2_smi_flipped] - else: - ctx2_id_flipped = None - - if ctx1_id_flipped is not None and ctx2_id_flipped is not None: - - ctxm_id_flipped = cantor(ctx1_id_flipped, ctx2_id_flipped) - - if ctxm_id_flipped in self.double_pairs_dict: - - # print "Iteration: ", counter, " had a duplicate reversed entry ctx smi" - if self.double_pairs_dict[ctxm_id] != self.double_pairs_dict[ctxm_id_flipped]: - # # OK, this is bad, it should not happen: - print("Original:") - print(("{} {}".format(ctxm_id, self.double_pairs_dict[ctxm_id]))) - print("Double") - print(("{} {}".format(ctxm_id_flipped, self.double_pairs_dict[ctxm_id_flipped]))) - - def print_to_file(self, out_fi, cut_type, inc_types_header=False): - 
"""Method to get pairs from the base data object of the class - - out_fi: - The user specified output file - - cut_type: - Specifies the type of fragmentation required. Allowed values are SINGLE, - DOUBLE or BOTH. Currently this class does not support anything greater than - double cut fragmentation - - Example usage: - - # give me a CSV named my_output.pairs of all the pairs: - my_mmp_object.print_to_file('my_output.pairs', 'CSV', 'BOTH') - - # give me a CSV of only the DOUBLE cut pairs: - my_mmp_object.print_to_file('my_output.pairs', 'CSV', 'DOUBLE') - """ - - # check file write possible before start - self.logger.info('Opening output file for write: %s' % out_fi) - - # check cut_type, convert to int - if cut_type.upper() == 'DOUBLE': - # confusing but faster later - cut_type_id = 3 - elif cut_type.upper() == 'BOTH': - # confusing but faster later - cut_type_id = 2 - elif cut_type.upper() == 'SINGLE': - cut_type_id = 1 - else: - self.logger.warn('cut_type specification is incorrect, using single cut: %s' % cut_type.upper()) - cut_type_id = 1 - - # Now start processing the data structures to write the pairs - with open(out_fi, "w") as f: - - # - # write single header line - # - - f.write("CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_FRAG_L," - "ATTCHPT_CTX_L,ATTCHPT_FRAG_R,ATTCHPT_CTX_R\n") - if inc_types_header: - f.write("STRING,STRING,STRING,SMILES,SMILES,SMILES,STRING,STRING,STRING,STRING\n") - - # - # print pairs for single - if cut_type_id <= 2: - # for molid_L, molid_R, ctx, frag_L, frag_R in self.iterator_single_pairs_dict(): - for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R \ - in self.iterator_single_pairs_dict(): - f.write('single,%d,%d,%s,%s,%s,%s,%s,%s,%s\n' % (molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, - fa_R, ca_R)) - - # - # print pairs for double - if cut_type_id >= 2: - # for molid_L, molid_R, ctx, frag_L, frag_R in self.iterator_double_pairs_dict(): - for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, 
ca_R \ - in self.iterator_double_pairs_dict(): - f.write('double,%d,%d,%s,%s,%s,%s,%s,%s,%s\n' % (molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, - fa_R, ca_R)) - - # close the file handle - f.close() - - self.logger.info('All done!') - - -class _TestMMPObjectClass(unittest.TestCase): - - """Test class for MMPObjectClass(object) written to use pythons unittest - - Example usage: - - python mmp_objects.py - - coverage run mmp_objects.py - coverage report mmp_objects.py - - """ - - def setUp(self): - - """Instantiate temp file names, test data objects that get written to temp files - a silent logger object (needed to instantiate class) and the mmp object we'll test""" - - self.maxDiff = None - - self.temp_file_input_smi = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - self.temp_file_input_smi_1b = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - self.temp_file_input_smi_2 = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - self.temp_file_output_pairs = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - - self.mmplogger = logging.getLogger('mmpobjectclass_testlogger') - logging.disable(logging.CRITICAL) - - self.test_mmp_object = MMPObjectClass(self.mmplogger) - - # input is a set of smiles to generate pairs from - # golden output data sets are the expected output from the code we'll run during the test - self.test_dataset_input_smi_01 = { - - # basic test set - # CHEMBL3105327 https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL3105327/ - '3105327': 'Cc1ccc2c(ccn2c3nc(cs3)c4cc(ccc4F)C(F)(F)F)c1', - # CHEMBL1526778 https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1526778/ - '1526778': 'CC(=O)c1c(C)n(c(C)c1C(=O)C)c2nc(c(C)s2)c3ccc(C)c(C)c3', - # CHEMBL1494678 https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1494678/ - '1494678': 'CC(=O)c1c(C)n(c(C)c1C(=O)C)c2nc(c(C)s2)c3ccccc3', - - # test a bug converting double cut label [12 to [21 - # CHEMBL472166 - 
'472166': 'OC(CCn1ccnc1)(c2ccccc2)c3ccccc3', - # CHEMBL69798 - '69798': 'Cc1nccn1CCC(O)(c2ccccc2)c3ccccc3', - - } - - self.test_dataset_input_smi_01b = { - # no pairs from these extras - # CHEMBL367346 https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL367346/ - '367346': 'Cc1sc(N)nc1c2cccc(Cl)c2', - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL366881/ - '366881': 'Cc1sc(N)nc1c2ccc(Cl)c(Cl)c2' - } - - self.test_dataset_input_smi_02 = { - # CHEMBL1477460 https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1477460/ - '1477460': 'COc1ccc(cc1)c2nc(sc2C)n3c(C)c(C(=O)C)c(C(=O)C)c3C', - # CHEMBL1441050 https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1441050/ - '1441050': 'COc1ccc(cc1OC)c2nc(sc2C)n3c(C)c(C(=O)C)c(C(=O)C)c3C' - } - - self.test_dataset_goldenoutput_pairs_01 = { - 'CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_FRAG_L,ATTCHPT_CTX_L,ATTCHPT_FRAG_R,ATTCHPT_CTX_R': None, - 'double,1526778,1494678,[1CH4].Cc1[2nH]c(c(c1C(=O)C)C(=O)C)C,Cc1ccc(c2[n][2cH]s[1cH]2)cc1C,s1[2cH][n]c(c2ccccc2)[1cH]1,[2:NPL3|1:C3],[1:C2|2:C2],[2:NPL3|1:C3],[1:C2|2:C2]': None, - 'double,1526778,1494678,[2CH4].Cc1[1nH]c(c(c1C(=O)C)C(=O)C)C,Cc1ccc(c2[n][1cH]s[2cH]2)cc1C,s1[1cH][n]c(c2ccccc2)[2cH]1,[1:NPL3|2:C3],[2:C2|1:C2],[1:NPL3|2:C3],[2:C2|1:C2]': None, - 'double,1526778,1494678,[1CH4].Cc1[2nH]c(c(c1C(=O)C)C(=O)C)C,Cc1s[2cH][n]c1c1cc([1cH]cc1)C,s1[2cH][n]c(c2ccccc2)[1cH]1,[2:NPL3|1:C3],[1:CAR|2:C2],[2:NPL3|1:C3],[1:C2|2:C2]': None, - 'double,1526778,1494678,[2CH4].Cc1[1nH]c(c(c1C(=O)C)C(=O)C)C,Cc1s[1cH][n]c1c1cc([2cH]cc1)C,s1[1cH][n]c(c2ccccc2)[2cH]1,[1:NPL3|2:C3],[2:CAR|1:C2],[1:NPL3|2:C3],[2:C2|1:C2]': None, - 'double,1526778,1494678,[1CH4].Cc1[2nH]c(c(c1C(=O)C)C(=O)C)C,Cc1s[2cH][n]c1c1c[1cH]c(cc1)C,s1[2cH][n]c(c2ccccc2)[1cH]1,[2:NPL3|1:C3],[1:CAR|2:C2],[2:NPL3|1:C3],[1:C2|2:C2]': None, - 
'double,1526778,1494678,[2CH4].Cc1[1nH]c(c(c1C(=O)C)C(=O)C)C,Cc1s[1cH][n]c1c1c[2cH]c(cc1)C,s1[1cH][n]c(c2ccccc2)[2cH]1,[1:NPL3|2:C3],[2:CAR|1:C2],[1:NPL3|2:C3],[2:C2|1:C2]': None, - 'double,1494678,1526778,[1CH4].Cc1[2nH]c(c(c1C(=O)C)C(=O)C)C,s1[2cH][n]c(c2ccccc2)[1cH]1,Cc1ccc(c2[n][2cH]s[1cH]2)cc1C,[2:NPL3|1:C3],[1:C2|2:C2],[2:NPL3|1:C3],[1:C2|2:C2]': None, - 'double,1494678,1526778,[2CH4].Cc1[1nH]c(c(c1C(=O)C)C(=O)C)C,s1[1cH][n]c(c2ccccc2)[2cH]1,Cc1ccc(c2[n][1cH]s[2cH]2)cc1C,[1:NPL3|2:C3],[2:C2|1:C2],[1:NPL3|2:C3],[2:C2|1:C2]': None, - 'double,1494678,1526778,[1CH4].Cc1[2nH]c(c(c1C(=O)C)C(=O)C)C,s1[2cH][n]c(c2ccccc2)[1cH]1,Cc1s[2cH][n]c1c1cc([1cH]cc1)C,[2:NPL3|1:C3],[1:C2|2:C2],[2:NPL3|1:C3],[1:CAR|2:C2]': None, - 'double,1494678,1526778,[2CH4].Cc1[1nH]c(c(c1C(=O)C)C(=O)C)C,s1[1cH][n]c(c2ccccc2)[2cH]1,Cc1s[1cH][n]c1c1cc([2cH]cc1)C,[1:NPL3|2:C3],[2:C2|1:C2],[1:NPL3|2:C3],[2:CAR|1:C2]': None, - 'double,1494678,1526778,[1CH4].Cc1[2nH]c(c(c1C(=O)C)C(=O)C)C,s1[2cH][n]c(c2ccccc2)[1cH]1,Cc1s[2cH][n]c1c1c[1cH]c(cc1)C,[2:NPL3|1:C3],[1:C2|2:C2],[2:NPL3|1:C3],[1:CAR|2:C2]': None, - 'double,1494678,1526778,[2CH4].Cc1[1nH]c(c(c1C(=O)C)C(=O)C)C,s1[1cH][n]c(c2ccccc2)[2cH]1,Cc1s[1cH][n]c1c1c[2cH]c(cc1)C,[1:NPL3|2:C3],[2:C2|1:C2],[1:NPL3|2:C3],[2:CAR|1:C2]': None, - 'double,472166,69798,[1cH]1ccccc1.[2cH]1ccccc1,O[12CH2]CC[n]1c[n]cc1,O[12CH2]CC[n]1c([n]cc1)C,[2:CAR|1:CAR],[1:C3|2:C3],[2:CAR|1:CAR],[1:C3|2:C3]': None, - 'double,472166,69798,[2cH]1ccccc1.[1cH]1ccccc1,O[12CH2]CC[n]1c[n]cc1,O[12CH2]CC[n]1c([n]cc1)C,[1:CAR|2:CAR],[2:C3|1:C3],[1:CAR|2:CAR],[2:C3|1:C3]': None, - 'double,69798,472166,[1cH]1ccccc1.[2cH]1ccccc1,O[12CH2]CC[n]1c([n]cc1)C,O[12CH2]CC[n]1c[n]cc1,[2:CAR|1:CAR],[1:C3|2:C3],[2:CAR|1:CAR],[1:C3|2:C3]': None, - 'double,69798,472166,[2cH]1ccccc1.[1cH]1ccccc1,O[12CH2]CC[n]1c([n]cc1)C,O[12CH2]CC[n]1c[n]cc1,[1:CAR|2:CAR],[2:C3|1:C3],[1:CAR|2:CAR],[2:C3|1:C3]': None} - - self.test_dataset_goldenoutput_pairs_02 = { - 
'CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_FRAG_L,ATTCHPT_CTX_L,ATTCHPT_FRAG_R,ATTCHPT_CTX_R': None, - 'STRING,STRING,STRING,SMILES,SMILES,SMILES,STRING,STRING,STRING,STRING': None, - 'single,1526778,1494678,Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1,Cc1cc[1cH]cc1C,[1cH]1ccccc1,[1:C2],[1:CAR],[1:C2],[1:CAR]': None, - 'single,1494678,1526778,Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1,[1cH]1ccccc1,Cc1cc[1cH]cc1C,[1:C2],[1:CAR],[1:C2],[1:CAR]': None, - 'single,472166,69798,O[1CH](c1ccccc1)c1ccccc1,[1CH3]C[n]1c[n]cc1,[1CH3]C[n]1c([n]cc1)C,[1:C3],[1:C3],[1:C3],[1:C3]': None, - 'single,69798,472166,O[1CH](c1ccccc1)c1ccccc1,[1CH3]C[n]1c([n]cc1)C,[1CH3]C[n]1c[n]cc1,[1:C3],[1:C3],[1:C3],[1:C3]': None, - 'single,472166,69798,OC([1CH3])(c1ccccc1)c1ccccc1,[1CH3][n]1c[n]cc1,[1CH3][n]1c([n]cc1)C,[1:C3],[1:C3],[1:C3],[1:C3]': None, - 'single,69798,472166,OC([1CH3])(c1ccccc1)c1ccccc1,[1CH3][n]1c([n]cc1)C,[1CH3][n]1c[n]cc1,[1:C3],[1:C3],[1:C3],[1:C3]': None, - 'single,472166,69798,OC(C[1CH3])(c1ccccc1)c1ccccc1,[1nH]1c[n]cc1,Cc1[1nH]cc[n]1,[1:C3],[1:NPL3],[1:C3],[1:NPL3]': None, - 'single,69798,472166,OC(C[1CH3])(c1ccccc1)c1ccccc1,Cc1[1nH]cc[n]1,[1nH]1c[n]cc1,[1:C3],[1:NPL3],[1:C3],[1:NPL3]': None, - 'single,472166,69798,OC(CC[n]1[1cH][n]cc1)(c1ccccc1)c1ccccc1,[1H],[1CH4],[1:C2],[1:H],[1:C2],[1:C3]': None, - 'single,69798,472166,OC(CC[n]1[1cH][n]cc1)(c1ccccc1)c1ccccc1,[1CH4],[1H],[1:C2],[1:C3],[1:C2],[1:H]': None} - - self.test_dataset_goldenoutput_pairs_03 = { - 'CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_FRAG_L,ATTCHPT_CTX_L,ATTCHPT_FRAG_R,ATTCHPT_CTX_R': None, - 'STRING,STRING,STRING,SMILES,SMILES,SMILES,STRING,STRING,STRING,STRING': None, - 'single,1526778,1494678,Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1,Cc1cc[1cH]cc1C,[1cH]1ccccc1,[1:C2],[1:CAR],[1:C2],[1:CAR]': None, - 'single,1494678,1526778,Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1,[1cH]1ccccc1,Cc1cc[1cH]cc1C,[1:C2],[1:CAR],[1:C2],[1:CAR]': None, - 
'single,472166,69798,O[1CH](c1ccccc1)c1ccccc1,[1CH3]C[n]1c[n]cc1,[1CH3]C[n]1c([n]cc1)C,[1:C3],[1:C3],[1:C3],[1:C3]': None, - 'single,69798,472166,O[1CH](c1ccccc1)c1ccccc1,[1CH3]C[n]1c([n]cc1)C,[1CH3]C[n]1c[n]cc1,[1:C3],[1:C3],[1:C3],[1:C3]': None, - 'single,472166,69798,OC([1CH3])(c1ccccc1)c1ccccc1,[1CH3][n]1c[n]cc1,[1CH3][n]1c([n]cc1)C,[1:C3],[1:C3],[1:C3],[1:C3]': None, - 'single,69798,472166,OC([1CH3])(c1ccccc1)c1ccccc1,[1CH3][n]1c([n]cc1)C,[1CH3][n]1c[n]cc1,[1:C3],[1:C3],[1:C3],[1:C3]': None, - 'single,472166,69798,OC(C[1CH3])(c1ccccc1)c1ccccc1,[1nH]1c[n]cc1,Cc1[1nH]cc[n]1,[1:C3],[1:NPL3],[1:C3],[1:NPL3]': None, - 'single,69798,472166,OC(C[1CH3])(c1ccccc1)c1ccccc1,Cc1[1nH]cc[n]1,[1nH]1c[n]cc1,[1:C3],[1:NPL3],[1:C3],[1:NPL3]': None} - - self.test_dataset_goldenoutput_pairs_04 = copy.deepcopy(self.test_dataset_goldenoutput_pairs_03) - self.test_dataset_goldenoutput_pairs_04['STRING,STRING,STRING,SMILES,SMILES,SMILES,STRING,STRING,STRING,STRING'] = None - - self.test_dataset_goldenoutput_pairs_05 = { - (1477460, 1494678, 'Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n]c1c1cc[1cH]cc1'): ('[1OH]C', '[1H]'), - (1477460, 1526778, 'Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1'): ('COc1cc[1cH]cc1', 'Cc1cc[1cH]cc1C'), - (1477460, 1494678, 'Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1'): ('COc1cc[1cH]cc1', '[1cH]1ccccc1'), - (1441050, 1526778, 'Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1'): ('COc1cc[1cH]cc1OC', 'Cc1cc[1cH]cc1C'), - (1441050, 1494678, 'Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1'): ('COc1cc[1cH]cc1OC', '[1cH]1ccccc1')} - - self.test_dataset_goldenoutput_pairs_06 = { - (1477460, 1526778, '[1CH4].Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][2cH]1'): ( - '[1OH]c1cc[2cH]cc1', 'Cc1[1cH]c[2cH]cc1'), - (1477460, 1526778, '[2CH4].Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1'): ( - '[2OH]c1cc[1cH]cc1', 'Cc1[2cH]c[1cH]cc1'), - (1441050, 1526778, '[1CH4].Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][2cH]1'): ( - '[1OH]c1c(OC)cc[2cH]c1', 'Cc1[1cH]c[2cH]cc1'), - (1441050, 1526778, 
'[2CH4].Cc1sc([n]2c(c(c(c2C)C(=O)C)C(=O)C)C)[n][1cH]1'): ( - '[2OH]c1c(OC)cc[1cH]c1', 'Cc1[2cH]c[1cH]cc1'), (1477460, 1526778, '[1CH4].Cc1[2nH]c(c(c1C(=O)C)C(=O)C)C'): ( - 'COc1ccc(c2[n][2cH]s[1cH]2)cc1', 'Cc1s[2cH][n]c1c1c[1cH]c(cc1)C'), - (1477460, 1526778, '[2CH4].Cc1[1nH]c(c(c1C(=O)C)C(=O)C)C'): ( - 'COc1ccc(c2[n][1cH]s[2cH]2)cc1', 'Cc1s[1cH][n]c1c1c[2cH]c(cc1)C'), - (1477460, 1494678, '[1CH4].Cc1[2nH]c(c(c1C(=O)C)C(=O)C)C'): ( - 'COc1ccc(c2[n][2cH]s[1cH]2)cc1', 's1[2cH][n]c(c2ccccc2)[1cH]1'), - (1477460, 1494678, '[2CH4].Cc1[1nH]c(c(c1C(=O)C)C(=O)C)C'): ( - 'COc1ccc(c2[n][1cH]s[2cH]2)cc1', 's1[1cH][n]c(c2ccccc2)[2cH]1')} - - self.test_dataset_goldenoutput_pairs_numeric = {(1526778, 1494678, 31, 30, 47): 1, - (1494678, 1526778, 31, 47, 30): 1, - (472166, 69798, 57, 56, 74): 1, (69798, 472166, 57, 74, 56): 1, - (472166, 69798, 59, 58, 73): 1, (69798, 472166, 59, 73, 58): 1, - (472166, 69798, 61, 60, 72): 1, (69798, 472166, 61, 72, 60): 1, - (472166, 69798, 68, 12, 1): 1, (69798, 472166, 68, 1, 12): 1} - - self.test_dataset_goldenoutput_pairs_numeric_threshold03 = {(1526778, 1494678, 25, 24, 40): 1, - (1494678, 1526778, 25, 40, 24): 1, - (472166, 69798, 52, 51, 63): 1, - (69798, 472166, 52, 63, 51): 1, - (472166, 69798, 59, 8, 1): 1, - (69798, 472166, 59, 1, 8): 1} - - self.test_dataset_goldenoutput_pairs_numeric_ctx = {(1526778, 1494678, 31, 30, 47, 3, 1, 3, 1): 1, - (1494678, 1526778, 31, 47, 30, 3, 1, 3, 1): 1, - (472166, 69798, 57, 56, 74, 2, 2, 2, 2): 1, - (69798, 472166, 57, 74, 56, 2, 2, 2, 2): 1, - (472166, 69798, 59, 58, 73, 2, 2, 2, 2): 1, - (69798, 472166, 59, 73, 58, 2, 2, 2, 2): 1, - (472166, 69798, 61, 60, 72, 2, 4, 2, 4): 1, - (69798, 472166, 61, 72, 60, 2, 4, 2, 4): 1, - (472166, 69798, 68, 12, 1, 3, 6, 3, 2): 1, - (69798, 472166, 68, 1, 12, 3, 2, 3, 6): 1} - - # write test data to temp file 01 - for smi_id, smi in list(self.test_dataset_input_smi_01.items()): - self.temp_file_input_smi.write(smi+" "+smi_id+"\n") - self.temp_file_input_smi.close() 
- - # write test data to temp file 01b - for smi_id, smi in list(self.test_dataset_input_smi_01.items()): - self.temp_file_input_smi_1b.write(smi+" "+smi_id+"\n") - for smi_id, smi in list(self.test_dataset_input_smi_01b.items()): - self.temp_file_input_smi_1b.write(smi+" "+smi_id+"\n") - self.temp_file_input_smi_1b.close() - - # write test data to temp file 02 - for smi_id, smi in list(self.test_dataset_input_smi_02.items()): - self.temp_file_input_smi_2.write(smi+" "+smi_id+"\n") - self.temp_file_input_smi_2.close() - - # container for results data - self.test_dataset_testresults = {} - - def tearDown(self): - - """Tear down object for clean reuse in further tests""" - self.test_mmp_object.clean_out_data() - self.test_dataset_testresults.clear() - - def test_singlecutpairs_yieldnumeric(self): - - """Test the generation of single cut pairs with output""" - - # test the MMPObjectClass using above temp file, then write to new output - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'SINGLE', 'NONE') - - for molid_L, molid_R, ctx_id, frag_id_L, frag_id_R in self.test_mmp_object.iterator_single_pairs_dict_numeric(): - self.test_dataset_testresults[molid_L, molid_R, ctx_id, frag_id_L, frag_id_R] = 1 - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_pairs_numeric) - - def test_singlecutpairs_yieldnumeric_ctx(self): - """Test the generation of single cut pairs with output""" - - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'SINGLE', 'NONE') - - for molid_L, molid_R, ctx_id, frag_id_L, frag_id_R, fattach_L, cattach_L, fattach_R, cattach_R in \ - self.test_mmp_object.iterator_single_pairs_dict_numeric(inc_attachpt = True): - self.test_dataset_testresults[molid_L, molid_R, ctx_id, frag_id_L, frag_id_R, fattach_L, cattach_L, - fattach_R, cattach_R] = 1 - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, 
self.test_dataset_goldenoutput_pairs_numeric_ctx) - - def test_doublecutpairs_yieldnumeric(self): - """Test as per test_singlecutpairs_numeric but utilises double cut methods""" - - # generate base objects as in test_build_from_dicer_comparison_dict_true - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'DOUBLE', 'NONE') - - # test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for molid_L, molid_R, ctx1_id, ctx2_id, frag_L_id, frag_R_id, fa_L_id, ca_L_id, fa_R_id, ca_R_id \ - in self.test_mmp_object.iterator_double_pairs_dict_numeric(inc_attachpt=True): - - ctx_str = self.test_mmp_object.refsmi_dict[ctx1_id] + "." + self.test_mmp_object.refsmi_dict[ctx2_id] - - # print molid_L, molid_R, ctx_str - # print self.test_mmp_object.refsmi_dict[frag_L_id], self.test_mmp_object.refsmi_dict[frag_R_id], - # print self.test_mmp_object.refattach_dict[fa_L_id], self.test_mmp_object.refattach_dict[ca_L_id], - # print self.test_mmp_object.refattach_dict[fa_R_id], self.test_mmp_object.refattach_dict[ca_R_id] - - key = (molid_L, molid_R, ctx_str, self.test_mmp_object.refsmi_dict[frag_L_id], - self.test_mmp_object.refsmi_dict[frag_R_id]) - val = (self.test_mmp_object.refattach_dict[fa_L_id], self.test_mmp_object.refattach_dict[ca_L_id], - self.test_mmp_object.refattach_dict[fa_R_id], self.test_mmp_object.refattach_dict[ca_R_id]) - self.test_dataset_testresults[key] = val - - # repeat non numeric - comparison_test_dataset_testresults = {} - for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R \ - in self.test_mmp_object.iterator_double_pairs_dict(): - - # print molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R - comparison_test_dataset_testresults[(molid_L, molid_R, ctx, frag_L, frag_R)] = (fa_L, ca_L, fa_R, ca_R) - - #print(self.test_dataset_testresults) # use this pprint statement to regenerate the golden data - self.assertEqual(comparison_test_dataset_testresults, self.test_dataset_testresults) - - def 
test_build_from_dicer_comparison_dict_true(self): - - """Test the generation of single cut pairs but also create comparison dict without altering pairs in - original object. All this test does is ensure we don't cross contaminate the dicts""" - - # This test ensures that the flag "use_comparison_dict = True" will work correctly by populating - # a second independent comparison pairs dict object without altering the base pairs dict - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'SINGLE', 'NONE') - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi_2.name, 'SINGLE', 'NONE', - use_comparison_dict=True) - - for molid_L, molid_R, ctx_id, frag_id_L, frag_id_R in self.test_mmp_object.iterator_single_pairs_dict_numeric(): - self.test_dataset_testresults[molid_L, molid_R, ctx_id, frag_id_L, frag_id_R] = 1 - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_pairs_numeric) - - def test_build_from_dicer_threshold(self): - - """Test the generation of single cut pairs but also create comparison dict without altering pairs in - original object. 
All this test does is ensure we don't cross contaminate the dicts""" - - # This test ensures that the flag "use_comparison_dict = True" will work correctly by populating - # a second independant comparison pairs dict object without altering the base pairs dict - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'SINGLE', 'NONE', threshold=0.30001) - - for molid_L, molid_R, ctx_id, frag_id_L, frag_id_R in self.test_mmp_object.iterator_single_pairs_dict_numeric(): - self.test_dataset_testresults[molid_L, molid_R, ctx_id, frag_id_L, frag_id_R] = 1 - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_pairs_numeric_threshold03) - - def test_singlecutpairs(self): - """Test the generation of single cut pairs with output""" - - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'SINGLE', 'NONE') - self.test_mmp_object.print_to_file(self.temp_file_output_pairs.name, 'SINGLE', 'DICER') - - test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - #print(self.test_dataset_testresults) # use this pprint statement to regenerate the golden data - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_pairs_02) - - def test_doublecutpairs(self): - - """Test the generation of double cut pairs with CSV output""" - - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'DOUBLE', 'NONE') - self.test_mmp_object.print_to_file(self.temp_file_output_pairs.name, 'DOUBLE') - - test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - #print(">>> ", self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, 
self.test_dataset_goldenoutput_pairs_01) - - def test_singlecutpairs_comparison_dict_true(self): - - """This test compares the different contexts found in single_pairs_comparison_dict to those in - single_pairs_dict. When we find a match then we have found a pair between the two different - sets of smiles. This method is used in approaches like mmp_prediction when we need a delta value - from an existing pair to predict data for a new smiles""" - - # generate base objects as in test_build_from_dicer_comparison_dict_true - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'SINGLE', 'NONE') - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi_2.name, 'SINGLE', 'NONE', - use_comparison_dict=True) - - # test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R \ - in self.test_mmp_object.iterator_single_pairs_dict(use_comparison_dict=True): - self.test_dataset_testresults[(molid_L, molid_R, ctx)] = (frag_L, frag_R) - - #print(self.test_dataset_testresults) # use this pprint statement to regenerate the golden data - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_pairs_05) - - def test_doublecutpairs_comparison_dict_true(self): - - """Test as per test_singlecutpairs_comparison_dict_true but utilises double cut methods""" - - # generate base objects as in test_build_from_dicer_comparison_dict_true - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'DOUBLE', 'NONE') - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi_2.name, 'DOUBLE', 'NONE', - use_comparison_dict=True) - - # test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R \ - in self.test_mmp_object.iterator_double_pairs_dict(use_comparison_dict=True): - self.test_dataset_testresults[(molid_L, molid_R, ctx)] = (frag_L, frag_R) - - 
#print(self.test_dataset_testresults) # use this pprint statement to regenerate the golden data - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_pairs_06) - - def test_singlecutpairs_typesheaders(self): - - """Test the generation of single cut pairs with CSV output and types header""" - - # test the MMPObjectClass using above temp file, then write to new output - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'SINGLE', 'REMOVE_NONRINGS') - self.test_mmp_object.print_to_file(self.temp_file_output_pairs.name, 'SINGLE', inc_types_header=True) - - test_results_filehandle = open(self.temp_file_output_pairs.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - #print(self.test_dataset_testresults) # use this pprint statement to regenerate the golden data - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_pairs_04) - - def test_inspector_double_cut_pairs(self): - - """Test the generation of single cut pairs with CSV output and types header""" - - self.test_mmp_object.build_from_dicer(self.temp_file_input_smi.name, 'DOUBLE', 'NONE') - - self.test_mmp_object._inspector_double_pairs_dict() - -if __name__ == '__main__': - unittest.main() diff --git a/contrib/script/py/mmp/mmp_pairs_objects.py b/contrib/script/py/mmp/mmp_pairs_objects.py deleted file mode 100755 index 5abd2e17..00000000 --- a/contrib/script/py/mmp/mmp_pairs_objects.py +++ /dev/null @@ -1,1337 +0,0 @@ -################################################################### -""" Summary: Class and Methods to manipulate MMP pairs - -About: An object that reads a pairs file into memory for manipulation -by the associated methods. 
Includes statistical aggregation functions -to summarise MMP data - -""" -################################################################### -import logging -import sys -import os -import unittest -import tempfile -import pandas as pd -import numpy as np - -import mmp.mmp_stats_functions as mmps -from mmp.mmp_data_objects import MMPDataObjectClass - -def validate_agg_method(agg_method): - """Validates that the keyword parameter passed to this module for the aggregation type - is valid. This can be imported and used by other scripts such as wrapper scripts. Because - this might be called before the object is instantiated it is not an object level function.""" - - agg_method = agg_method.upper() - - if agg_method == 'MEAN': - return True - - # catch MEAN_DIFF before DIFFxx - elif agg_method == 'MEAN_DIFF': - return True - - elif agg_method == 'MEAN_DIFF_INVLOG': - return True - - elif agg_method == 'CATEGORICAL': - return True - - # order specific so should not get MEAN_DIFF - elif 'DIFF' in agg_method: - - try: - agg_method_int = int(agg_method.replace('DIFF', '')) - if 100 > agg_method_int > 0: - return True - - else: - sys.exit('Invalid number for diffxx, try diffxx where 0 < xx < 100') - - except: - sys.exit('Invalid parameter used for method diff, try diffxx where 0 < xx < 100') - - else: - sys.exit('Invalid parameter used for method, try MEAN|MEAN_DIFF|MEAN_DIFF_INVLOG|CATEGORICAL|DIFFXX ' - 'where 0 agg_method_int > 0: - return column_name + '_diff' + str(agg_method_int) - - else: - sys.exit('Invalid number for diffxx, try diffxx where 0 < xx < 100') - - except: - sys.exit('Invalid parameter used for method diff, try diffxx where 0 < xx < 100') - - else: - sys.exit('Invalid agg_method (%s) specified' % agg_method) - - -class MMPPairsObjectClass(MMPDataObjectClass): - """Class implements objects nad methods for MMP pair manipulation""" - - def __init__(self, logger_object): - """setup base object""" - # core objects - - # core objects from base class - 
MMPDataObjectClass.__init__(self, logger_object) - - # setup logger and check it's ok - self.logger = logger_object - if len(logging.Logger.manager.loggerDict) < 1: - # exit with system status 1 and custom error - sys.exit("Invalid or no logger object passed to MMPObjectClass. Please create \ - and pass a logger and set to use logging.disable if you don't want logging") - - # keep tabs on property data so we can aggregate MWT - # and CLP using different stats than result data DIFFxx - self.inc_props = False - self.add_qmetric = False - # - self.grouped = {} - - def clear_out_data_pairobj(self): - """Implements method from MMPDataObjectClass to clear out all data structures plus - clears out a few more related to this extended class""" - - self.clean_out_data() - self.grouped.clear() - - def pairsdataobj_to_pd(self, cut_type, diff_col_name): - """Iterate over the Pairs object and convert to a pandas dataframe for aggregation - This is the only method in this class that uses methods and objects from the inherited - class MMPDataObjectsClass (mmp_data_objects)""" - - # fail if both single_pairs_dict and double_pairs_dict are empty - if (len(self.single_pairs_dict) == 0) and (len(self.double_pairs_dict) == 0): - self.logger.debug('No data found in single_pairs_dict and/or double_pairs_dict, expect no results') - sys.exit("Error: no data found in single_pairs_dict and/or double_pairs_dict, nothing to find and write") - - # check cut_type, convert to int - # TODO: code is repeated loads so should pull it out from everywhere and convert to function - if cut_type.upper() == 'DOUBLE': - # confusing but faster later - cut_type_id = 3 - elif cut_type.upper() == 'BOTH': - # confusing but faster later - cut_type_id = 2 - elif cut_type.upper() == 'SINGLE': - cut_type_id = 1 - else: - self.logger.warn('cut_type specification is incorrect, using single cut: %s' % cut_type.upper()) - cut_type_id = 1 - - # fail if diff_col_name has invalid value - # print diff_col_name, 
self.headers_nosmi - if diff_col_name not in self.headers_nosmi: - sys.exit("Error: specified column name containing diff data does not exist") - - # find location of diff_col_name in the data table self.mol_data_dict - #diff_col_name_idx = self.headers_nosmi.index(diff_col_name) - - headers = ['CUT', 'MOLID_L', 'MOLID_R', 'CONTEXT', 'FRAG_L', 'FRAG_R', 'ATTCHPT_CTX_L', 'ATTCHPT_FRAG_L', - 'ATTCHPT_CTX_R', 'ATTCHPT_FRAG_R'] - - for col in self.headers_nosmi: - if col in list(self.headers_numeric_position.values()): - headers.append(col) - - self.logger.info('Creating pandas dataframe from base mmp object via iterators') - - # Should be faster to pre-compute the dataframe size: - # num_rows = 0 - # print "Final size: ", num_rows - # - # for el_ in self.single_pairs_dict.keys(): - # size_ = len(self.single_pairs_dict[el_]) - # num_rows = num_rows + size_ - # print "Final size: ", num_rows - # - # self.pairs_table = pd.DataFrame(index=np.arange(0, num_rows), columns=headers) - # but hard due to pairs being dependent on factorial product of all pairs for each context, - # less pairs generated via various conditions that get filtered out - self.pairs_table = pd.DataFrame(columns=headers) - - row_position = 0 - - # print pairs for single - if cut_type_id <= 2: - - self.logger.debug("Adding single cuts to pandas dataframe") - - # first get pairs via iterator_single_pairs_dict... 
- for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R in self.iterator_single_pairs_dict(): - - data_row = ['single', molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R] - - # ...now add the data differences - # data_diff = self.mol_data_dict[molid_R][diff_col_name_idx] - - # self.mol_data_dict[molid_L][diff_col_name_idx] - diff_array = self.get_data_diffs(molid_L, molid_R) - data_row.extend(diff_array) - - # add row to table - self.pairs_table.loc[row_position] = data_row - row_position += 1 - - # print pairs for double - if cut_type_id >= 2: - - self.logger.debug("Adding double cuts to pandas dataframe") - - # first get pairs via iterator_double_pairs_dict... - for molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R in self.iterator_double_pairs_dict(): - - data_row = ['single', molid_L, molid_R, ctx, frag_L, frag_R, fa_L, ca_L, fa_R, ca_R] - - # ...now add the data differences - # data_diff = self.mol_data_dict[molid_R][diff_col_name_idx] - - # self.mol_data_dict[molid_L][diff_col_name_idx] - diff_array = self.get_data_diffs(molid_L, molid_R) - # data_row.append(data_diff) - data_row.extend(diff_array) - - # add row to table - self.pairs_table.loc[row_position] = data_row - row_position += 1 - - self.logger.debug("Built dataframe of shape: %d, %d" % self.pairs_table.shape) - - def pd_read_csv(self, pairs_file, column_rename_dict = None): - """Read in the CSV to a pandas dataframe: - This method expects certain column names and if they do not exist in the input data - you need to rename them by providing a column_rename_dict on input parameter, see - unit test for example by need columns: - 'FRAG_L', 'FRAG_R', 'ATTCHPT_FRAG_L', 'ATTCHPT_FRAG_R', 'DIFF' - """ - - # get and check CSV file - self.csv_file = pairs_file - if not os.path.isfile(self.csv_file): - self.logger.warn('Cant instantiate object without a valid csv file input') - sys.exit('Cant instantiate object without a valid csv file input') - - self.logger.info('Creating 
pandas dataframe from csv') - if self.types_header: - # Bug Fix: - # In some cases we need to skip the second row in the CSV because we have a types header, this types header - # is written only when -t flag invoked. Read file back in with this types header - # and the column types become object not float then aggregation fails.. use these lines before aggregation - # to highlight the issue with an without -t flag: print self.pairs_table.dtypes && print self.grouped.dtypes - # The self.types_header var had to be added to the class mmp_data_object - self.pairs_table = pd.read_csv(self.csv_file, header=0, skiprows=[1], skipinitialspace=True, - index_col=False) - else: - self.pairs_table = pd.read_csv(self.csv_file, header=0, skipinitialspace=True, index_col=False) - - # check to see if we need a column rename - if column_rename_dict is not None: - self.logger.info('Found a rename dict to use so will try to rename CSV columns') - self.pairs_table.rename(columns = column_rename_dict, inplace=True) - - # check we have the columns we need later, these are hard coded later on so - # must be dynamically renamed above or else everything will fail - if 'MOLID_L' not in self.pairs_table.columns: - self.logger.warn('Cannot find column MOLID_L in CSV file') - sys.exit('Cannot find column MOLID_L in CSV file') - - elif 'MOLID_R' not in self.pairs_table.columns: - self.logger.warn('Cannot find column MOLID_R in CSV file') - sys.exit('Cannot find column MOLID_R in CSV file') - - elif 'FRAG_L' not in self.pairs_table.columns: - self.logger.warn('Cannot find column FRAG_L in CSV file') - sys.exit('Cannot find column FRAG_L in CSV file') - - elif 'FRAG_R' not in self.pairs_table.columns: - self.logger.warn('Cannot find column FRAG_R in CSV file') - sys.exit('Cannot find column FRAG_R in CSV file') - - elif 'ATTCHPT_FRAG_L' not in self.pairs_table.columns: - self.logger.warn('Cannot find column ATTCHPT_FRAG_L in CSV file') - sys.exit('Cannot find column ATTCHPT_FRAG_L in CSV file') - 
- elif 'ATTCHPT_FRAG_R' not in self.pairs_table.columns: - self.logger.warn('Cannot find column ATTCHPT_FRAG_R in CSV file') - sys.exit('Cannot find column ATTCHPT_FRAG_R in CSV file') - - else: - self.logger.info('Column headers look great, proceed with aggregation') - - self.logger.info('Done reading from csv') - - def pd_aggregate_pairs_to_csv(self, filename, agg_type, agg_method='mean', prop_data=False, - act_col=False, remove_id_dupes=True, inc_low_n_vals=True, - add_qmetric=False): - """Method to create grouped or aggregated pair data with associated stats and write to file. - Can group data in different ways such as: - frag = 'FRAG_L, FRAG_R' (to get the average change for a given fragment combination) - attach = 'FRAG_L, FRAG_R, ATTCHPT_FRAG_L, ATTCHPT_FRAG_R' (to get the average change - for for a given fragment and attachment point combination) - - Merged in a method (pairs_table.drop_duplicates) to remove ID/ID duplicates from a data frame. - It's useful as a pre filter step before aggregation. - - The separate object level method pd_save_aggregate_pairs(self, out_file) got added back - into this method due to memory issues (see inline comment below). This plays off speed against - memory. For small df's we can hold the aggregated pairs and .agg() data in memory then use .to_csv - on the whole df but for larger df's the script bombs with mem errors. Thus for large df's we only - generate the grouped object then iterate over it to calc the .agg() stats on an individual group at - a time, writing to file not mem. Small df = 100K rows and 15 columns, Large = 15,000K rows and 10 - columns in testing so cutoff is an arbitary 100K * 15 = 1,500,000 (num_cells). test_mode = True as - input param will let you run the method designed for large df's over any small input df (testing mode) - """ - - # this is not fool proof. 
If the incomming table does not have the columns ['MOL_L_CLP' 'MOL_R_CLP'] - # then code will pass here but fail later, so add second catch for missing columns - if add_qmetric is True and prop_data is False: - raise Exception('Must specify prop_data=True when requesting add_qmetric as Quality metric needs CLP data') - # second catch - see above - if add_qmetric is True: - if 'MOL_L_CLP' not in self.pairs_table.columns or 'MOL_R_CLP' not in self.pairs_table.columns: - raise Exception('Must have columns MOL_L_CLP and MOL_R_CLP present to add qmetric') - - def get_quality(row, clp_diff_idx, clp_diff_sd_idx, clp_left_sd_idx, num_pairs_idx): - """ Function that takes list input (row) and returns an estimate of quality based on - Good, Number of pairs >= 15 & stdev of L.clogp >= 0.5 & when |d.clogp|>0, stdev of d.clogp>0 - Medium, Number of pairs >= 6 and stdev of L.clogp >= 0.3 and when |d.clogp|>0, stdev of d.clogp>0 - Poor, The rest of the matched-pairs that do not satisfied the above criteria - Columns to work with are - 'MOL_CLP_DIFF, 'MOL_CLP_DIFF_std', 'MOL_L_CLP_std', act_col + '_count' - """ - # first write of function with simplest slower logic - # if row[num_pairs_idx] >= 15 and row[clp_left_sd_idx] >= 0.5 and \ - # abs(row[clp_diff_idx]) > 0 and row[clp_diff_sd_idx] > 0: - # return 2 - # elif row[num_pairs_idx] >= 6 and row[clp_left_sd_idx] >= 0.3 and \ - # abs(row[clp_diff_sd_idx]) > 0 and row[clp_diff_sd_idx] > 0: - # return 1 - # else: - # return 0 - if abs(row[clp_diff_idx]) > 0 and row[clp_diff_sd_idx] > 0: - - num_pairs = row[num_pairs_idx] - clp_left_sd = row[clp_left_sd_idx] - - if num_pairs >= 15 and clp_left_sd >= 0.5: - return 2 - - elif num_pairs >= 6 and clp_left_sd >= 0.3: - return 1 - - else: - return 0 - else: - return 0 - - def write_grouped_table_to_csv(filename): - # This method is a work around a bug seen with multiline headers in pd 0.16.1 with py 2.7 I cannot write a - # multi line header: https://github.com/pydata/pandas/issues/5539 
because I get an additional empty header - # line in the csv. Instead we write the original header to csv, replace it then append the whole df to csv - - # 201701-26 Adding Quality metric - # TODO: Putting this aggregate function here in a submethod is messy but simplest approach for now - # execution suffers badly and it needs a refactor - if add_qmetric: - self.logger.info('Calculating Quality metric') - header_cols = list(self.grouped.columns) - clp_diff_idx = header_cols.index('MOL_CLP_DIFF_mean') - clp_diff_sd_idx = header_cols.index('MOL_CLP_DIFF_std') - clp_left_sd_idx = header_cols.index('MOL_L_CLP_std') - num_pairs_idx = header_cols.index(act_col + '_count') - self.grouped['QUALITY'] = self.grouped.apply(lambda row: get_quality(row, clp_diff_idx, - clp_diff_sd_idx, clp_left_sd_idx, - num_pairs_idx), axis=1) - - if self.types_header: - self.logger.info('Writing Types Header') - # write the header to file - index_count = len(self.grouped.index.names) - for idx in range(0, index_count): - self.grouped.reset_index(level=0, inplace=True) - pd.DataFrame(data=[self.grouped.columns]).to_csv(filename, - float_format='%.3f', - header=False, - index=False, - na_rep="NaN") - - # now replace header with column type labels - types_header_for_insert = list(self.grouped.columns.values) - for idx, val in enumerate(self.grouped.columns.values): - # print idx, val, self.grouped[val].dtype - if self.grouped[val].dtype == 'float64': - types_header_for_insert[idx] = 'REAL' - - elif self.grouped[val].dtype == 'int64': - types_header_for_insert[idx] = 'INTEGER' - - else: - types_header_for_insert[idx] = 'STRING' - - self.grouped.columns = types_header_for_insert - - # and append the whole df with new header to original file - self.logger.info('Writing to CSV') - self.grouped.to_csv(filename, mode="a", float_format='%.3f', index=False, na_rep="NaN") - - else: - # simple write - self.logger.info('Writing to CSV') - self.grouped.to_csv(filename, float_format='%.3f', na_rep="NaN") - 
- # As the DIFF and other functions can be expensive, we will only allow - agg_method = agg_method.upper() - agg_type = agg_type.upper() - out_file = filename - tmp_out_file = out_file + "_tmp" - - validate_agg_method(agg_method) - - # will be 0 if 'DIFFXX' or >0 if 'MEAN_DIFF' and -1 when string does not contain DIFF - diff_type = str.find(agg_method, 'DIFF') - - if act_col: - act_col += '_DIFF' - - # As the DIFF and other functions can be expensive, we will only allow it to be used with act_col - # test times gave 22 sec for mean versus 202 sec for diffxx over 1.4M keys, single column - if act_col is False and 'DIFF' in agg_method: - sys.exit('Please specify the activity difference column name when using the DIFFXX function as it is ' - 'expensive to run') - - # drop columns we don't need: - self.logger.info('Drop columns we dont need') - - if prop_data: - self.pairs_table.rename(columns={'W_AMW_DIFF': 'MOL_MWT_DIFF'}, inplace=True) - - if act_col: - if prop_data: - column_list = ['MOLID_L', 'MOLID_R', 'FRAG_L', 'FRAG_R', 'ATTCHPT_CTX_L', - 'ATTCHPT_CTX_R', 'MOL_CLP_DIFF', 'MOL_MWT_DIFF', act_col] - if add_qmetric: - column_list.extend(['MOL_L_CLP', 'MOL_R_CLP']) - self.pairs_table = self.pairs_table[column_list] - - else: - column_list = ['MOLID_L', 'MOLID_R', 'FRAG_L', 'FRAG_R', 'ATTCHPT_CTX_L', 'ATTCHPT_CTX_R', act_col] - self.pairs_table = self.pairs_table[column_list] - - # could sort table before we remove duplicates to get consistent smiles ouptut - # tested on 75K input SMI but cost was high - # self.logger.info('sort data table') - # self.pairs_table.sort(['MOLID_L', 'MOLID_R', 'FRAG_L', 'FRAG_R', 'ATTCHPT_CTX_L', 'ATTCHPT_CTX_R'], - # inplace=True) - - # drop duplicates - self.logger.info('Drop duplicate rows') - - if agg_type == 'FRAG': - if remove_id_dupes: - self.pairs_table.drop_duplicates(['MOLID_L', 'MOLID_R', 'FRAG_L', 'FRAG_R'], inplace=True) - self.pairs_table.drop(['MOLID_L', 'MOLID_R', 'ATTCHPT_CTX_L', 'ATTCHPT_CTX_R'], axis=1, inplace=True) 
- - elif agg_type == 'ATTACH': - # - # This creates a situation where a different N (count) can occur for non attachment point aggregated data - # versus attachment point aggregated. - if remove_id_dupes: - self.pairs_table.drop_duplicates(['MOLID_L', 'MOLID_R', 'FRAG_L', 'FRAG_R', - 'ATTCHPT_CTX_L', 'ATTCHPT_CTX_R'], - inplace=True) - # as opposed to this, which will select an arbitary one of the many attachment points for the given pairs - # self.pairs_table.drop_duplicates(['MOLID_L', 'MOLID_R', 'FRAG_L', 'FRAG_R'], inplace = True) - self.pairs_table.drop(['MOLID_L', 'MOLID_R'], axis=1, inplace=True) - - else: - sys.exit('Invalid value for parameter agg_type: %s (should be FRAG or ATTACH' % agg_type) - - # do grouping and aggregation - self.logger.info('Group') - - # python3 needs numeric not object for aggregate - self.pairs_table = self.pairs_table.apply(pd.to_numeric, errors='ignore') - - if agg_type == 'ATTACH': - self.grouped = self.pairs_table.groupby(['FRAG_L', 'FRAG_R', 'ATTCHPT_CTX_L', 'ATTCHPT_CTX_R'], sort=False) - - # assume we have caught all miss-assignments of agg_type so must have 'FRAG' - else: - self.grouped = self.pairs_table.groupby(['FRAG_L', 'FRAG_R'], sort=False) - - ########################## - # - # See method level comments regarding df size. pd.df's of 15,000K rows failed (~80K input SMI) - # Therefore two method implemented to balance speed versus memory - # TODO: must be a better way to deal with below multi switch possibilities... 
- # - ############################# - # (height, width) = self.pairs_table.shape - # num_cells = height * width - - self.logger.info('Aggregating in memory') - if agg_method == 'MEAN': - - self.logger.info('Aggregating using MEAN') - self.grouped = self.grouped.aggregate(['mean', 'std', 'count']) - self.grouped.columns = ['_'.join(col).strip() for col in self.grouped.columns.values] - - # try: - write_grouped_table_to_csv(filename) - # except: - # self.logger.warn('You have not done any aggregation, use aggregate_pairs() first') - # sys.exit('You have not done any aggregation, use aggregate_pairs() first') - - elif agg_method == 'CATEGORICAL': - - self.logger.info('Aggregating CATEGORICAL data') - - if prop_data: - f = {act_col: ['count', mmps.category_moved_up_one_pct, mmps.category_moved_down_one_pct, - mmps.category_no_change_pct, mmps.category_changed_class_pct], - 'MOL_CLP_DIFF': ['mean', 'std'], - 'MOL_MWT_DIFF': ['first']} - if add_qmetric: - f['MOL_L_CLP'] = ['std'] - - else: - f = {act_col: ['count', mmps.category_moved_up_one_pct, mmps.category_moved_down_one_pct, - mmps.category_no_change_pct, mmps.category_changed_class_pct]} - - self.grouped = self.grouped.aggregate(f) - self.grouped.columns = ['_'.join(col).strip() for col in self.grouped.columns.values] - - write_grouped_table_to_csv(filename) - - # case where we have MEAN_DIFF or DIFFxx - elif diff_type >= 0: - - if diff_type == 0: - try: - agg_method_int = int(agg_method.replace('DIFF', '')) - - except: - sys.exit('Invalid parameter used for method diff, try diffxx where 0 < xx < 100') - - if prop_data: - - self.logger.info('Aggregating using DIFF or MEAN_DIFF, with property data') - # specify what to use on what column - if diff_type == 0: - f = {act_col: [mmps.diffn_list_rtn(agg_method_int, inc_low_n_vals=inc_low_n_vals), 'count', - mmps.n_pos_diff, mmps.n_neg_diff], 'MOL_CLP_DIFF': ['mean', 'std'], - 'MOL_MWT_DIFF': ['first']} - if add_qmetric: - f['MOL_L_CLP'] = ['std'] - else: - if 'INVLOG' in 
agg_method: - f = {act_col: [mmps.mean_diff_invlog(inc_low_n_vals=inc_low_n_vals), 'count', mmps.n_pos_diff, - mmps.n_neg_diff], 'MOL_CLP_DIFF': ['mean', 'std'], - 'MOL_MWT_DIFF': ['first']} - if add_qmetric: - f['MOL_L_CLP'] = ['std'] - else: - f = {act_col: [mmps.mean_diff(inc_low_n_vals=inc_low_n_vals), 'count', mmps.n_pos_diff, - mmps.n_neg_diff], 'MOL_CLP_DIFF': ['mean', 'std'], - 'MOL_MWT_DIFF': ['first']} - if add_qmetric: - f['MOL_L_CLP'] = ['std'] - - self.grouped = self.grouped.aggregate(f) - self.grouped.columns = ['_'.join(col).strip() for col in self.grouped.columns.values] - # self.grouped[[act_col+'_'+agg_method+'_low', act_col+'_'+agg_method, - # act_col+'_'+agg_method+'_upp']] = self.grouped[act_col+'_'+agg_method.lower()+'_all'].apply(pd.Series) - # self.grouped.drop(act_col+'_'+agg_method.lower()+'_all', axis=1, inplace=True) - - # self.grouped.dropna(how='any', inplace=True) - write_grouped_table_to_csv(tmp_out_file) - - else: - - self.logger.info('Aggregating using DIFF, with no property data') - - if diff_type == 0: - f = {act_col: ['count', mmps.n_pos_diff, mmps.n_neg_diff, - mmps.diffn_list_rtn(agg_method_int, inc_low_n_vals=inc_low_n_vals)]} - else: - if 'INVLOG' in agg_method: - f = {act_col: ['count', mmps.n_pos_diff, mmps.n_neg_diff, - mmps.mean_diff_invlog(inc_low_n_vals=inc_low_n_vals)]} - else: - f = {act_col: ['count', mmps.n_pos_diff, mmps.n_neg_diff, - mmps.mean_diff(inc_low_n_vals=inc_low_n_vals)]} - - self.grouped = self.grouped.aggregate(f) - - self.grouped.columns = ['_'.join(col).strip() for col in self.grouped.columns.values] - - write_grouped_table_to_csv(tmp_out_file) - - # could not get efficient way to break columns out in memory or via group iterator so doing it the ugly way - # need to replace ,ACT_A_DIFF_diff60_all, with ,ACT_A_DIFF_DIFF60_low,ACT_A_DIFF_DIFF60,ACT_A_DIFF_DIFF60_upp - # and sort out ,"(39.999999751010833, 39.99999997345952, 39.999999997170974)", - # TODO: Get rid of this file based cleanup and 
implement method for DIFFXX calc to return separate columns - # - self.logger.info('Messy cleanup of function output') - - if diff_type == 0: - header_string_orig = act_col + '_' + agg_method.lower() + '_all' - else: - header_string_orig = act_col + '_' + agg_method.lower() - - header_string_new = act_col + '_' + agg_method.lower() + '_low' + ', ' + act_col + '_' + \ - agg_method.lower() + ', ' + act_col + '_' + agg_method.lower() + '_upp' - - with open(tmp_out_file, 'rt') as f_read: - with open(out_file, 'wt') as f_write: - line_num = 0 - for x in f_read: - line_num += 1 - y = x - if line_num == 1: - y = y.replace(header_string_orig, header_string_new) - y = y.replace(" ", '') - elif line_num == 2 and self.types_header: - # strip eol - y = y.rstrip("\n") - # to list then find all 'STRING', check the underlying data, if (nan, nan, nan), change to - # REAL - x = y.split(",") - for idx, item in enumerate(x): - if item == 'STRING': - # print "->", idx, item, self.grouped.iloc[2, idx] - # does this column position correspond to the (nan, nan, nan) position in dataframe - if str(self.grouped.iloc[2, idx]).find(",") > 0: - # now replace with REAL,REAL,REAL - # print "--->", idx, item, self.grouped.iloc[2, idx] - x[idx] = 'REAL,REAL,REAL' - # convert back to string - y = ','.join(x) - y += "\n" - else: - y = x - y = y.replace(",\"[", ',') - y = y.replace("]\",", ',') - y = y.replace("NaN]\"", 'NaN') - y = y.replace(",\"(", ',') - y = y.replace(")\",", ',') - y = y.replace(")\"", '') - y = y.replace(", ", ',') - # y = y.replace(",,",',NaN,') - # y = y.replace(", ,",',NaN,') - # Java needs NaN not nan - not sure what we'll do if we remove this loop for native to_csv - y = y.replace(",nan", ',NaN') - f_write.write(y) - - os.remove(tmp_out_file) - - else: - sys.exit("Invalid input for agg_method") - - self.logger.info('Completed aggregation') - - -# -# unittest everything -# -class _Test_MMPPairsObjectClass(unittest.TestCase): - """Test class to test the object and methods""" 
- - def setUp(self): - # - self.maxDiff = None - - # setup test data location use tempfile.NamedTemporaryFile(delete=False) to persist data on disk - self.temp_file_input_pairs = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - self.temp_file_input_pairdupes = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - self.temp_file_input_diffdata = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - self.temp_file_input_diffdata_02 = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - self.temp_file_output = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - self.temp_file_output_nodupes = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - self.temp_file_output_diffdata = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - self.temp_file_input_csv = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='w+t') - - self.mmplogger = logging.getLogger('mmpobjectclass_testlogger') - logging.disable(logging.CRITICAL) - - self.test_mmp_pairs_object = MMPPairsObjectClass(self.mmplogger) - # TODO: move files write blocks into each unittest - - self.test_dataset_goldeninput_header = 'MOLID_L, MOLID_R, CONTEXT, FRAG_L, FRAG_R, ATTCHPT_CTX_L, ' - self.test_dataset_goldeninput_header += 'ATTCHPT_FRAG_L, ATTCHPT_CTX_R, ATTCHPT_FRAG_R, pIC50_DIFF' - - # I need to remap my input data columns to something consistent - self.test_columns_dict = {'FRAG_L' : 'FRAG_L', - 'FRAG_R' : 'FRAG_R', - 'ATTCHPT_FRAG_L' : 'ATTCHPT_FRAG_L', - 'ATTCHPT_FRAG_R' : 'ATTCHPT_FRAG_R', - 'pIC50_DIFF' : 'DIFF' - } - - self.test_dataframe = pd.DataFrame({ - 'A' : ['aa', 'ab', 'ac', 'ad', 'aa', 'ab', 'ac', 'ad', 'aa', 'aa', 'ab'], - 'B' : [-0.542, -3.043, 0.264, 0.094, -0.262, 0.344, 0.094, -0.262, -0.555, -0.54, -0.27], - 'C' : [-3.043, 0.264, 0.094, -0.262, 0.344, 0.769, 0.094, -0.262, -3.001, -3.10, 0.35], - 'D' : [0.264, 0.094, -0.262, 0.344, 
0.769, 0.811, 0.094, -0.262, -0.260, -3.001, 0.100], - 'E' : [-0.262, 0.344, 0.769, 0.811, -1.350, -1.475, 0.094, -0.262, -0.254, -0.254, 0.901], - 'F' : [-0.262, 0.344, 0.769, 0.811, -1.350, -1.475, 0.094, -0.262, -0.256, -0.206, 0.344] - }) - - self.test_dataframe_result = pd.DataFrame({ - 'A': ['aa', 'ab', 'ac', 'ad'], - 'diff60': [-59.06244, 21.25944, np.nan, np.nan], - 'len': [4.0, 3.0, 2.0, 2.0], - 'mean': [-2.200, 0.461, 0.094, -0.262] - }) - - self.test_dataset_goldeninput_csv_headers = \ - ['SMILES','PIC50','CHEMBL_ID','CHEMBL_TARGET_ID','CHEMBL_TARGET'] - - self.test_dataset_goldeninput_csv_data = { - # Below data is taken from DOI: 10.12688/f1000research.3-36.v2 - # -> https://f1000research.com/articles/3-36 - # Bajorath et. al. 2014 "Matched molecular pair-based data sets for computer-aided medicinal chemistry." - # All ID's are CHEMBL Compound or Target IDs - 'O(CC)c1ccc(N2C(=Nc3c(cccc3)C2=O)[C@H](N(C(=O)Cc2ccc(cc2)-c2ccccc2)CCOCC)CC)cc1,8.045757491,230126,4441,C-X-C chemokine receptor type 3': None, - 'O(CC)c1ccc(N2C(=Nc3c(cccc3)C2=O)[C@H](N(C(=O)Cc2ccc(cc2)-c2ccccc2)CCOCC)C)cc1,7.124938737,266663,4441,C-X-C chemokine receptor type 3': None, - 'O(CC)c1ccc(N2C(=Nc3c(cccc3)C2=O)CN(C(=O)Cc2ccc(cc2)-c2ccccc2)CCOCC)cc1,5.638272164,231487,4441,C-X-C chemokine receptor type 3': None, - 'O(CC)c1ccc(N2C(=Nc3c(cccc3)C2=O)[C@H](N(C(=O)Cc2ccc(cc2)-c2ccccc2)CCOCC)c2ccccc2)cc1,5.397940009,230127,4441,C-X-C chemokine receptor type 3': None, - '[nH]1nc(c(c1)-c1cc(ncc1)-c1ccc(cc1)C)-c1ncccc1,7.552841969,205156,4439,TGF-beta receptor type I': None, - '[nH]1nc(c(c1)-c1cc(ncc1)-c1cc(ccc1)C)-c1ncccc1,6.718966633,205260,4439,TGF-beta receptor type I': None, - '[nH]1nc(c(c1)-c1cc(ncc1)-c1ccccc1C)-c1ncccc1,5.400007822,383433,4439,TGF-beta receptor type I': None, - 'Clc1cc(cnc1N1C[C@@H]([NH+](CC1)C1CC[NH+](CC1)Cc1ccc(Cl)cc1)CC)C(=O)NC,8.522878745,1681874,4441,C-X-C chemokine receptor type 3': None, - 
'Clc1cc(cnc1N1C[C@@H]([NH+](CC1)C1CC[NH+](CC1)Cc1ccc(Cl)cc1)C)C(=O)NC,7.494850022,1681841,4441,C-X-C chemokine receptor type 3': None, - 'Clc1cc(cnc1N1CC[NH+](CC1)C1CC[NH+](CC1)Cc1ccc(Cl)cc1)C(=O)NC,6.251811973,565761,4441,C-X-C chemokine receptor type 3': None, - 'Clc1cc(cnc1N1C[C@@H]([NH+](CC1)C1CC[NH+](CC1)Cc1ccc(Cl)cc1)c1ccccc1)C(=O)NC,6.080921908,1681876,4441,C-X-C chemokine receptor type 3': None, - 'FC(F)(F)c1ccc(cc1)-c1nccc(c1)-c1c[nH]nc1-c1ncccc1,7.619788758,383523,4439,TGF-beta receptor type I': None, - 'FC(F)(F)c1cc(ccc1)-c1nccc(c1)-c1c[nH]nc1-c1ncccc1,6.619788758,382466,4439,TGF-beta receptor type I': None, - 'FC(F)(F)c1ccccc1-c1nccc(c1)-c1c[nH]nc1-c1ncccc1,5.029978734,426852,4439,TGF-beta receptor type I': None - } - - # example pairs file (CHEMBL data): - # below output is derived from command line given, with testdata.csv file contents from above TGF-beta lines: - # getMMPStatsfromCSV.sh -i testdata.csv -o testdata.pairs -s SMILES -n CHEMBL_ID -a PIC50 -c SINGLE - self.test_dataset_goldeninput_pairs = { -'single,205156,205260,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-8.339e-01': None, -'single,205156,383433,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,Cc1[1cH]cccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-2.153e+00': None, -'single,205156,383523,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],6.695e-02': None, -'single,205156,382466,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-9.331e-01': None, -'single,205156,426852,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-2.523e+00': None, -'single,205260,205156,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1c[1cH]ccc1,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],8.339e-01': None, 
-'single,205260,383433,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1c[1cH]ccc1,Cc1[1cH]cccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-1.319e+00': None, -'single,205260,383523,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1c[1cH]ccc1,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],9.008e-01': None, -'single,205260,382466,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1c[1cH]ccc1,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-9.918e-02': None, -'single,205260,426852,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1c[1cH]ccc1,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-1.689e+00': None, -'single,383433,205156,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1[1cH]cccc1,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],2.153e+00': None, -'single,383433,205260,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1[1cH]cccc1,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],1.319e+00': None, -'single,383433,383523,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1[1cH]cccc1,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],2.220e+00': None, -'single,383433,382466,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1[1cH]cccc1,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],1.220e+00': None, -'single,383523,205156,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1cc[1cH]cc1)(F)F,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-6.695e-02': None, -'single,383523,205260,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1cc[1cH]cc1)(F)F,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-9.008e-01': None, -'single,383523,383433,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1cc[1cH]cc1)(F)F,Cc1[1cH]cccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-2.220e+00': None, -'single,383523,382466,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1cc[1cH]cc1)(F)F,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-1.000e+00': None, -'single,383523,426852,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1cc[1cH]cc1)(F)F,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-2.590e+00': None, 
-'single,382466,205156,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1c[1cH]ccc1)(F)F,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],9.331e-01': None, -'single,382466,205260,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1c[1cH]ccc1)(F)F,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],9.918e-02': None, -'single,382466,383523,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1c[1cH]ccc1)(F)F,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],1.000e+00': None, -'single,382466,426852,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1c[1cH]ccc1)(F)F,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-1.590e+00': None, -'single,426852,205156,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1[1cH]cccc1)(F)F,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],2.523e+00': None, -'single,426852,205260,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1[1cH]cccc1)(F)F,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],1.689e+00': None, -'single,426852,383523,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1[1cH]cccc1)(F)F,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],2.590e+00': None, -'single,426852,382466,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1[1cH]cccc1)(F)F,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],1.590e+00': None, -'single,205156,383523,[nH]1[n]c(c(c1)c1cc([n]cc1)c1cc[1cH]cc1)c1[n]cccc1,[1CH4],F[1CH](F)F,[1:CAR],[1:C3],[1:CAR],[1:C3],6.695e-02': None, -'single,383523,205156,[nH]1[n]c(c(c1)c1cc([n]cc1)c1cc[1cH]cc1)c1[n]cccc1,F[1CH](F)F,[1CH4],[1:CAR],[1:C3],[1:CAR],[1:C3],-6.695e-02': None, -'single,205260,382466,[nH]1[n]c(c(c1)c1cc([n]cc1)c1c[1cH]ccc1)c1[n]cccc1,[1CH4],F[1CH](F)F,[1:CAR],[1:C3],[1:CAR],[1:C3],-9.918e-02': None, -'single,382466,205260,[nH]1[n]c(c(c1)c1cc([n]cc1)c1c[1cH]ccc1)c1[n]cccc1,F[1CH](F)F,[1CH4],[1:CAR],[1:C3],[1:CAR],[1:C3],9.918e-02': None, -'single,382466,383433,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1c[1cH]ccc1)(F)F,Cc1[1cH]cccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-1.220e+00': None, 
-'single,383433,426852,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1[1cH]cccc1,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-3.700e-01': None, -'single,426852,383433,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1[1cH]cccc1)(F)F,Cc1[1cH]cccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],3.700e-01': None, -'single,383433,426852,[nH]1[n]c(c(c1)c1cc(c2[1cH]cccc2)[n]cc1)c1[n]cccc1,[1CH4],F[1CH](F)F,[1:CAR],[1:C3],[1:CAR],[1:C3],-3.700e-01': None, -'single,426852,383433,[nH]1[n]c(c(c1)c1cc(c2[1cH]cccc2)[n]cc1)c1[n]cccc1,F[1CH](F)F,[1CH4],[1:CAR],[1:C3],[1:CAR],[1:C3],3.700e-01': None - } - - # These two CHEMBL mols will produce very repetitive pairs. On aggregation we should only count - # the id-id pair once if filtering is on, see test - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL268389 - # CC\C(=C(/c1ccc(I)cc1)\c2ccc(OCCCCCN3CCCC3)cc2)\c4ccccc4,6.1,268389,2095150,Phosphodiesterase 1 - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL267035/ - # CC\C(=C(/c1ccc(I)cc1)\c2ccc(OCCCCN3CCCC3)cc2)\c4ccccc4,6.0,267035,2095150,Phosphodiesterase 1 - # ['SMILES','PIC50','CHEMBL_ID','CHEMBL_TARGET_ID','CHEMBL_TARGET'] - # a lot of data is removed as unnecessary to trigger test condition - - self.test_dataset_goldeninput_pairdupes_header = 'CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_CTX_L,ATTCHPT_FRAG_L,ATTCHPT_CTX_R,ATTCHPT_FRAG_R,PIC50_DIFF' - - self.test_dataset_goldeninput_pairdupes = { - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc([1OH])cc2)cc1.[2CH3]CCN1CCCC1,[2CH3][1CH3],[12CH4],[2:C3|1:O3],[1:C3|2:C3],[2:C3|1:O3],[1:C3|2:C3],-1.000e-01': None, - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc([2OH])cc2)cc1.[1CH3]CCN1CCCC1,[1CH3][2CH3],[12CH4],[1:C3|2:O3],[2:C3|1:C3],[1:C3|2:O3],[2:C3|1:C3],-1.000e-01': None, - 'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc([1OH])cc2)cc1.[2CH3]CCN1CCCC1,[12CH4],[2CH3][1CH3],[2:C3|1:O3],[1:C3|2:C3],[2:C3|1:O3],[1:C3|2:C3],1.000e-01': None, - 
'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc([2OH])cc2)cc1.[1CH3]CCN1CCCC1,[12CH4],[1CH3][2CH3],[1:C3|2:O3],[2:C3|1:C3],[1:C3|2:O3],[2:C3|1:C3],1.000e-01': None, - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(O[1CH3])cc2)cc1.[2CH3]CN1CCCC1,[2CH3][1CH3],[12CH4],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],-1.000e-01': None, - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(O[2CH3])cc2)cc1.[1CH3]CN1CCCC1,[1CH3][2CH3],[12CH4],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],-1.000e-01': None, - 'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(O[1CH3])cc2)cc1.[2CH3]CN1CCCC1,[12CH4],[2CH3][1CH3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],1.000e-01': None, - 'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(O[2CH3])cc2)cc1.[1CH3]CN1CCCC1,[12CH4],[1CH3][2CH3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],1.000e-01': None, - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(O[1CH3])cc2)cc1.[2CH3]N1CCCC1,[2CH3]C[1CH3],[2CH3][1CH3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],-1.000e-01': None, - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(O[2CH3])cc2)cc1.[1CH3]N1CCCC1,[1CH3]C[2CH3],[1CH3][2CH3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],-1.000e-01': None, - 'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(O[1CH3])cc2)cc1.[2CH3]N1CCCC1,[2CH3][1CH3],[2CH3]C[1CH3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],1.000e-01': None, - 'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(O[2CH3])cc2)cc1.[1CH3]N1CCCC1,[1CH3][2CH3],[1CH3]C[2CH3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],1.000e-01': None, - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(OC[1CH3])cc2)cc1.[2CH3]N1CCCC1,[2CH3][1CH3],[12CH4],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],-1.000e-01': None, - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(OC[2CH3])cc2)cc1.[1CH3]N1CCCC1,[1CH3][2CH3],[12CH4],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],-1.000e-01': None, - 
'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(OC[1CH3])cc2)cc1.[2CH3]N1CCCC1,[12CH4],[2CH3][1CH3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],1.000e-01': None, - 'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(OC[2CH3])cc2)cc1.[1CH3]N1CCCC1,[12CH4],[1CH3][2CH3],[1:C3|2:C3],[2:C3|1:C3],[1:C3|2:C3],[2:C3|1:C3],1.000e-01': None, - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(OCC[1CH3])cc2)cc1.[2NH]1CCCC1,[2CH3][1CH3],[12CH4],[2:N3|1:C3],[1:C3|2:C3],[2:N3|1:C3],[1:C3|2:C3],-1.000e-01': None, - 'double,268389,267035,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(OCC[2CH3])cc2)cc1.[1NH]1CCCC1,[1CH3][2CH3],[12CH4],[1:N3|2:C3],[2:C3|1:C3],[1:N3|2:C3],[2:C3|1:C3],-1.000e-01': None, - 'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(OCC[1CH3])cc2)cc1.[2NH]1CCCC1,[12CH4],[2CH3][1CH3],[2:N3|1:C3],[1:C3|2:C3],[2:N3|1:C3],[1:C3|2:C3],1.000e-01': None, - 'double,267035,268389,Ic1ccc(C(=C(CC)c2ccccc2)c2ccc(OCC[2CH3])cc2)cc1.[1NH]1CCCC1,[12CH4],[1CH3][2CH3],[1:N3|2:C3],[2:C3|1:C3],[1:N3|2:C3],[2:C3|1:C3],1.000e-01': None - } - - # - # fake test data, numeric id's for compounds/fragments have not relation to real data - self.test_dataset_goldeninput_diffheader = 'MOLID_L, MOLID_R, FRAG_L, FRAG_R, ATTCHPT_CTX_L, ATTCHPT_FRAG_L, ATTCHPT_CTX_R, ATTCHPT_FRAG_R, DIFF_DIFF' - - self.test_dataset_goldeninput_diffdata = { -'19733, 19733, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -0.542,': None, -'88154, 94170, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -3.043,': None, -'22633, 20295, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], 0.264,': None, -'07788, 08310, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], 0.094,': None, -'00035, 03079, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -0.262,': None, -'82657, 87490, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], 0.344,': None, -'82472, 87490, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], 0.769,': None, -'82657, 87490, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], 0.811,': None, -'04116, 04139, [1CH4], [1OH], [1:C3], [1:O3], 
[1:C3], [1:O3], -1.35,': None, -'05358, 05056, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.475,': None, -'32144, 32058, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -0.027,': None, -'93314, 05098, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -3.2,': None, -'91240, 05098, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -3.345,': None, -'93399, 05098, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.95,': None, -'16563, 16207, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -0.107,': None, -'16278, 16060, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.298,': None, -'88346, 94030, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -3.273,': None, -'18283, 20381, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.301,': None, -'21549, 21181, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -5.52,': None, -'21181, 21181, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -5.999,': None, -'21550, 21181, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -5.52,': None, -'82132, 83661, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], 0.097,': None, -'88347, 93216, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.069,': None, -'79281, 58182, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.257,': None, -'21157, 19552, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -0.65,': None, -'88346, 94142, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.117,': None, -'57884, 59931, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.473,': None, -'01494, 00530, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.042,': None, -'99785, 92755, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.57,': None, -'17063, 24247, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.751,': None, -'16948, 17281, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.046,': None, -'92124, 94061, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.644,': None, -'19023, 18922, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.105,': None, -'13801, 13590, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2,': None, 
-'29390, 29447, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], 0.671,': None, -'60826, 92119, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -3.942,': None, -'09859, 10707, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.909,': None, -'79325, 81431, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.06,': None, -'79325, 81446, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -0.89,': None, -'93410, 04047, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.372,': None, -'17065, 16908, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.469,': None, -'04967, 04965, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.736,': None, -'83651, 93159, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], 2.342,': None, -'18850, 22119, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.276,': None, -'80423, 82835, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -0.974,': None, -'66590, 8639, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], 0.39,': None, -'82240, 83794, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.385,': None, -'80424, 82835, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -1.87,': None, -'94159, 97578, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -0.522,': None, -'94159, 99904, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -0.395,': None, -'00042, 01800, [1CH4], [1OH], [1:C3], [1:O3], [1:C3], [1:O3], -2.647,': None, -'00042, 01899, [1CH4], [1SH], [1:C3], [1:S3], [1:C3], [1:S3], -2.647,': None - } - - # A few lines of CHEMBL data taken from test_dataset_goldeninput_pairs - ID's are chembl compound id's - # the first twos lines represent the same pair but in both directions - # a rest of the lines are a set of further unique CHEMBL ids taken from test_dataset_goldeninput_pairs but - # fragments changed to same as the above to make more of the same pair change, should get 5 the same 1 different - - self.test_dataset_goldeninput_diffheader_02 = \ - 
'CUT,MOLID_L,MOLID_R,CONTEXT,FRAG_L,FRAG_R,ATTCHPT_CTX_L,ATTCHPT_FRAG_L,ATTCHPT_CTX_R,ATTCHPT_FRAG_R,ACT_A_DIFF,MOL_CLP_DIFF,MOL_MWT_DIFF' - - self.test_dataset_goldeninput_diffdata_02 = { -'single,205156,205260,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],10.0,0.72,60.9': None, -'single,205260,205156,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1c[1cH]ccc1,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],-10.0,-0.72,-60.9': None, -'single,383523,205156,[nH]1[n]c(c(c1)c1cc([n]cc1)c1cc[1cH]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],11.0,0.72,60.9': None, -'single,382466,205260,[nH]1[n]c(c(c1)c1cc([n]cc1)c1c[1cH]ccc1)c1[n]cccc1,Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],8.0,0.72,60.9': None, -'single,382466,383433,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],9.0,0.72,60.9': None, -'single,383433,426852,[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],[1:CAR],[1:CAR],9.0,0.72,60.9': None - } - - # - # A matched pair taken from above data 205156, 205260 = CHEMBL205156 and CHEMBL205260) - # Further lines have fake meaningless identifiers - # self.test_dataset_goldeninput_diffheader_06 - # - self.test_dataset_goldeninput_diffheader_03 = \ - 'MOLID_L, MOLID_R, CONTEXT, FRAG_L, FRAG_R, ATTCHPT_CTX_L, ATTCHPT_FRAG_L, ATTCHPT_CTX_R, ATTCHPT_FRAG_R, DIFF_DIFF, MOL_CLP_DIFF, MOL_MWT_DIFF, MOL_L_CLP, MOL_R_CLP' - - self.test_dataset_goldeninput_diffdata_03 = { - '205156, 205260, [nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1, Cc1cc[1cH]cc1, Cc1c[1cH]ccc1, [1:CAR], [1:CAR], [1:CAR], [1:CAR], -0.0628419184837, -0.441, -55.1, 0.000, 0.000': None, - '9000001, 9000002, [nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1, Cc1cc[1cH]cc1, Cc1c[1cH]ccc1, [1:CAR], [1:CAR], [1:CAR], [1:CAR], -0.485163651843, -0.441, -55.1, 0.000, 0.000': None, - '9000003, 9000004, [nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1, 
Cc1cc[1cH]cc1, Cc1c[1cH]ccc1, [1:CAR], [1:CAR], [1:CAR], [1:CAR], -0.183073670609, -0.441, -55.1, 0.000, 0.000': None, - '9000005, 9000006, [nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1, Cc1cc[1cH]cc1, Cc1c[1cH]ccc1, [1:CAR], [1:CAR], [1:CAR], [1:CAR], -0.0826401755214, -0.68, -55.1, 0.000, 0.000': None, - '9000007, 9000008, [nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1, Cc1cc[1cH]cc1, Cc1c[1cH]ccc1, [1:CAR], [1:CAR], [1:CAR], [1:CAR], -0.721352311615, -0.412, -55.1, 0.000, 0.000': None - } - - ################################# - # - # Golden results data - # - ################################# - - # example aggregated results file - self.test_dataset_goldenoutput_01 = {'FRAG_L,FRAG_R,CONTEXT_mean,CONTEXT_std,CONTEXT_count': None, - '[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1cc[1cH]cc1,205260,NaN,1': None, - '[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1c[1cH]ccc1,205156,NaN,1': None, - '[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,Cc1[1cH]cccc1,205156,NaN,1': None, - '[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1cc[1cH]cc1)(F)F,205156,NaN,1': None, - '[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1c[1cH]ccc1)(F)F,205156,NaN,1': None, - '[nH]1[n]c(c(c1)c1c[1cH][n]cc1)c1[n]cccc1,FC(c1[1cH]cccc1)(F)F,205156,NaN,1': None, - '[nH]1[n]c(c(c1)c1cc([n]cc1)c1cc[1cH]cc1)c1[n]cccc1,[1CH4],383523,NaN,1': None, - '[nH]1[n]c(c(c1)c1cc([n]cc1)c1cc[1cH]cc1)c1[n]cccc1,F[1CH](F)F,205156,NaN,1': None, - '[nH]1[n]c(c(c1)c1cc([n]cc1)c1c[1cH]ccc1)c1[n]cccc1,[1CH4],382466,NaN,1': None, - '[nH]1[n]c(c(c1)c1cc([n]cc1)c1c[1cH]ccc1)c1[n]cccc1,F[1CH](F)F,205260,NaN,1': None, - '[nH]1[n]c(c(c1)c1cc(c2[1cH]cccc2)[n]cc1)c1[n]cccc1,[1CH4],426852,NaN,1': None, - '[nH]1[n]c(c(c1)c1cc(c2[1cH]cccc2)[n]cc1)c1[n]cccc1,F[1CH](F)F,383433,NaN,1': None} - - self.test_dataset_goldenoutput_02 = { - 'FRAG_L,FRAG_R,ATTCHPT_CTX_L,ATTCHPT_CTX_R,PIC50_mean,PIC50_std,PIC50_count,CHEMBL_TARGET_ID_mean,CHEMBL_TARGET_ID_std,CHEMBL_TARGET_ID_count': None, - '[1CH3]C,[1CH4],[1:C3],[1:C3],-0.974,0.076,2,0.000,0.000,2': None, - 
'[1CH3]C,[1H],[1:C3],[1:C3],-2.339,0.096,2,0.000,0.000,2': None, - '[1CH3]C,[1cH]1ccccc1,[1:C3],[1:C3],-2.545,0.146,2,0.000,0.000,2': None, - '[1CH4],[1CH3]C,[1:C3],[1:C3],0.974,0.076,2,0.000,0.000,2': None, - '[1CH4],[1H],[1:C3],[1:C3],-1.170,0.250,4,0.000,0.000,4': None, - '[1CH4],[1cH]1ccccc1,[1:C3],[1:C3],-1.571,0.221,2,0.000,0.000,2': None, - '[1H],[1CH3]C,[1:C3],[1:C3],2.339,0.096,2,0.000,0.000,2': None, - '[1H],[1CH4],[1:C3],[1:C3],1.170,0.250,4,0.000,0.000,4': None, - '[1H],[1cH]1ccccc1,[1:C3],[1:C3],-0.206,0.049,2,0.000,0.000,2': None, - '[1cH]1ccccc1,[1CH3]C,[1:C3],[1:C3],2.545,0.146,2,0.000,0.000,2': None, - '[1cH]1ccccc1,[1CH4],[1:C3],[1:C3],1.571,0.221,2,0.000,0.000,2': None, - '[1cH]1ccccc1,[1H],[1:C3],[1:C3],0.206,0.049,2,0.000,0.000,2': None, - 'Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],-0.834,NaN,1,0.000,NaN,1': None, - 'Cc1cc[1cH]cc1,Cc1[1cH]cccc1,[1:CAR],[1:CAR],-2.153,NaN,1,0.000,NaN,1': None, - 'Cc1cc[1cH]cc1,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],0.067,NaN,1,0.000,NaN,1': None, - 'Cc1cc[1cH]cc1,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],-0.933,NaN,1,0.000,NaN,1': None, - 'Cc1cc[1cH]cc1,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],-2.523,NaN,1,0.000,NaN,1': None, - 'Cc1c[1cH]ccc1,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],0.834,NaN,1,0.000,NaN,1': None, - 'Cc1c[1cH]ccc1,Cc1[1cH]cccc1,[1:CAR],[1:CAR],-1.319,NaN,1,0.000,NaN,1': None, - 'Cc1c[1cH]ccc1,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],0.901,NaN,1,0.000,NaN,1': None, - 'Cc1c[1cH]ccc1,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],-0.099,NaN,1,0.000,NaN,1': None, - 'Cc1c[1cH]ccc1,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],-1.689,NaN,1,0.000,NaN,1': None, - 'Cc1[1cH]cccc1,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],2.153,NaN,1,0.000,NaN,1': None, - 'Cc1[1cH]cccc1,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],1.319,NaN,1,0.000,NaN,1': None, - 'Cc1[1cH]cccc1,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],2.220,NaN,1,0.000,NaN,1': None, - 'Cc1[1cH]cccc1,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],1.220,NaN,1,0.000,NaN,1': None, - 
'Cc1[1cH]cccc1,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],-0.370,NaN,1,0.000,NaN,1': None, - 'FC(c1cc[1cH]cc1)(F)F,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],-0.067,NaN,1,0.000,NaN,1': None, - 'FC(c1cc[1cH]cc1)(F)F,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],-0.901,NaN,1,0.000,NaN,1': None, - 'FC(c1cc[1cH]cc1)(F)F,Cc1[1cH]cccc1,[1:CAR],[1:CAR],-2.220,NaN,1,0.000,NaN,1': None, - 'FC(c1cc[1cH]cc1)(F)F,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],-1.000,NaN,1,0.000,NaN,1': None, - 'FC(c1cc[1cH]cc1)(F)F,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],-2.590,NaN,1,0.000,NaN,1': None, - 'FC(c1c[1cH]ccc1)(F)F,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],0.933,NaN,1,0.000,NaN,1': None, - 'FC(c1c[1cH]ccc1)(F)F,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],0.099,NaN,1,0.000,NaN,1': None, - 'FC(c1c[1cH]ccc1)(F)F,Cc1[1cH]cccc1,[1:CAR],[1:CAR],-1.220,NaN,1,0.000,NaN,1': None, - 'FC(c1c[1cH]ccc1)(F)F,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],1.000,NaN,1,0.000,NaN,1': None, - 'FC(c1c[1cH]ccc1)(F)F,FC(c1[1cH]cccc1)(F)F,[1:CAR],[1:CAR],-1.590,NaN,1,0.000,NaN,1': None, - 'FC(c1[1cH]cccc1)(F)F,Cc1cc[1cH]cc1,[1:CAR],[1:CAR],2.523,NaN,1,0.000,NaN,1': None, - 'FC(c1[1cH]cccc1)(F)F,Cc1c[1cH]ccc1,[1:CAR],[1:CAR],1.689,NaN,1,0.000,NaN,1': None, - 'FC(c1[1cH]cccc1)(F)F,Cc1[1cH]cccc1,[1:CAR],[1:CAR],0.370,NaN,1,0.000,NaN,1': None, - 'FC(c1[1cH]cccc1)(F)F,FC(c1cc[1cH]cc1)(F)F,[1:CAR],[1:CAR],2.590,NaN,1,0.000,NaN,1': None, - 'FC(c1[1cH]cccc1)(F)F,FC(c1c[1cH]ccc1)(F)F,[1:CAR],[1:CAR],1.590,NaN,1,0.000,NaN,1': None, - '[1CH4],F[1CH](F)F,[1:CAR],[1:CAR],-0.134,0.221,3,0.000,0.000,3': None, - 'F[1CH](F)F,[1CH4],[1:CAR],[1:CAR],0.134,0.221,3,0.000,0.000,3': None} - - # Note that for this output set the N value is 1 (not 4) due to the duplicate removal we applied - # so for fragment change [2CH3][1CH3] --> [12CH4] we only count it once! 
not 4 times for the same - # id/id pair - self.test_dataset_goldenoutput_03 = {'FRAG_L,FRAG_R,PIC50_DIFF_mean,PIC50_DIFF_std,PIC50_DIFF_count': None, - '[2CH3][1CH3],[12CH4],-0.100,NaN,1': None, - '[1CH3][2CH3],[12CH4],-0.100,NaN,1': None, - '[12CH4],[2CH3][1CH3],0.100,NaN,1': None, - '[12CH4],[1CH3][2CH3],0.100,NaN,1': None, - '[2CH3]C[1CH3],[2CH3][1CH3],-0.100,NaN,1': None, - '[1CH3]C[2CH3],[1CH3][2CH3],-0.100,NaN,1': None, - '[2CH3][1CH3],[2CH3]C[1CH3],0.100,NaN,1': None, - '[1CH3][2CH3],[1CH3]C[2CH3],0.100,NaN,1': None} - - # same as self.test_dataset_goldenoutput_03 but the counts - # are higher due to id dupes for given fragL, fragR change (no filtering) - self.test_dataset_goldenoutput_03b = {'FRAG_L,FRAG_R,PIC50_DIFF_mean,PIC50_DIFF_std,PIC50_DIFF_count': None, - '[2CH3][1CH3],[12CH4],-0.100,0.000,4': None, - '[1CH3][2CH3],[12CH4],-0.100,0.000,4': None, - '[12CH4],[2CH3][1CH3],0.100,0.000,4': None, - '[12CH4],[1CH3][2CH3],0.100,0.000,4': None, - '[2CH3]C[1CH3],[2CH3][1CH3],-0.100,NaN,1': None, - '[1CH3]C[2CH3],[1CH3][2CH3],-0.100,NaN,1': None, - '[2CH3][1CH3],[2CH3]C[1CH3],0.100,NaN,1': None, - '[1CH3][2CH3],[1CH3]C[2CH3],0.100,NaN,1': None} - - # diff function test output - self.test_dataset_goldenoutput_04 = { - 'Mean, SD, DIFF60, upp90, low90': None, - '51.0, -1.60, -56.4, -51.5, -58.5': None - } - - self.test_dataframe_goldenoutput_04 = { - 'FRAG_L,FRAG_R,DIFF_DIFF_count,DIFF_DIFF_n_pos_diff,DIFF_DIFF_n_neg_diff,DIFF_DIFF_diff60_low,DIFF_DIFF_diff60,DIFF_DIFF_diff60_upp': None, - '[1CH4],[1SH],1,0.000,1.000,NaN,-59.663,NaN': None, - '[1CH4],[1OH],50,8.000,42.000,-58.6488,-56.7383,-52.336': None - } - - self.test_dataframe_goldenoutput_05 = { - 'FRAG_L,FRAG_R,ACT_A_DIFF_diff60_low,ACT_A_DIFF_diff60,ACT_A_DIFF_diff60_upp,ACT_A_DIFF_count,ACT_A_DIFF_n_pos_diff,ACT_A_DIFF_n_neg_diff,MOL_CLP_DIFF_mean,MOL_CLP_DIFF_std,MOL_MWT_DIFF_first': None, - 'Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,40.0,40.0,40.0,5,5.000,0.000,0.720,0.000,60.900': None, - 
'Cc1c[1cH]ccc1,Cc1cc[1cH]cc1,NaN,-60.0,NaN,1,0.000,1.000,-0.720,NaN,-60.900': None - } - - self.test_dataframe_goldenoutput_06 = { - 'FRAG_L,FRAG_R,DIFF_DIFF_mean_diff_invlog_low,DIFF_DIFF_mean_diff_invlog,DIFF_DIFF_mean_diff_invlog_upp,DIFF_DIFF_count,DIFF_DIFF_n_pos_diff,DIFF_DIFF_n_neg_diff,MOL_CLP_DIFF_mean,MOL_CLP_DIFF_std,MOL_MWT_DIFF_first': None, - 'Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,0.2628,0.4932,0.9253,5,0.000,5.000,-0.483,0.111,-55.100': None - } - - self.test_dataframe_goldenoutput_07 = { - 'FRAG_L,FRAG_R,DIFF_DIFF_mean_diff_low,DIFF_DIFF_mean_diff,DIFF_DIFF_mean_diff_upp,DIFF_DIFF_count,DIFF_DIFF_n_pos_diff,DIFF_DIFF_n_neg_diff,MOL_CLP_DIFF_mean,MOL_CLP_DIFF_std,MOL_MWT_DIFF_first': None, - 'Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,-0.5803,-0.307,-0.0337,5,0.000,5.000,-0.483,0.111,-55.100': None - } - - self.test_dataframe_goldenoutput_08 = { - 'FRAG_L,FRAG_R,DIFF_DIFF_mean_diff_low,DIFF_DIFF_mean_diff,DIFF_DIFF_mean_diff_upp,DIFF_DIFF_count,DIFF_DIFF_n_pos_diff,DIFF_DIFF_n_neg_diff,MOL_CLP_DIFF_mean,MOL_CLP_DIFF_std,MOL_MWT_DIFF_first,MOL_L_CLP_std,QUALITY': None, - 'Cc1cc[1cH]cc1,Cc1c[1cH]ccc1,-0.5803,-0.307,-0.0337,5,0.000,5.000,-0.483,0.111,-55.100,0.000,0': None - } - - ################################## - # - # write test data to temp files - # - ################################## - - # csv file - self.temp_file_input_csv.write(', '.join(self.test_dataset_goldeninput_csv_headers)+"\n") - for data in list(self.test_dataset_goldeninput_csv_data.keys()): - self.temp_file_input_csv.write(data+"\n") - self.temp_file_input_csv.close() - - # std pairs - self.temp_file_input_pairs.write(self.test_dataset_goldeninput_header+"\n") - - for line in self.test_dataset_goldeninput_pairs.items(): - #print(line) - self.temp_file_input_pairs.write(line[0]+"\n") - self.temp_file_input_pairs.close() - - # pairs after de-dupe - self.temp_file_input_pairdupes.write(self.test_dataset_goldeninput_pairdupes_header+"\n") - for line in self.test_dataset_goldeninput_pairdupes.items(): - 
self.temp_file_input_pairdupes.write(line[0]+"\n") - - self.temp_file_input_pairdupes.close() - - # diff function test - self.temp_file_input_diffdata.write(self.test_dataset_goldeninput_diffheader+"\n") - for line in self.test_dataset_goldeninput_diffdata.items(): - self.temp_file_input_diffdata.write(line[0]+"\n") - - self.temp_file_input_diffdata.close() - - # diff function test with props - self.temp_file_input_diffdata_02.write(self.test_dataset_goldeninput_diffheader_02+"\n") - for line in self.test_dataset_goldeninput_diffdata_02.items(): - self.temp_file_input_diffdata_02.write(line[0]+"\n") - - self.temp_file_input_diffdata_02.close() - - # container for results data - self.test_dataset_testresults = {} - - def tearDown(self): - """Tear down object for clean reuse in further tests""" - - self.test_dataset_testresults.clear() - - os.remove(self.temp_file_input_pairs.name) - os.remove(self.temp_file_input_diffdata.name) - os.remove(self.temp_file_input_pairdupes.name) - - def test_diff60_df(self): - """Functional test to confirm the diff function is working on a dataframe""" - - # group, aggregate, convert object to df, sort index - grouped = self.test_dataframe.groupby(['A']) - grouped = grouped['C'].agg([np.mean, len, mmps.diffn_agg(60)]) - - # this is important for unit testing as order or columns can change and needs to be made consistent - grouped = pd.DataFrame(grouped).reset_index() - grouped.sort_index(axis=1, inplace=True) - - #print(self.test_dataframe_result) - pd.util.testing.assert_frame_equal(grouped, self.test_dataframe_result) - - def test_pairsdataobj_to_pd(self): - """Test method to create a dataframe from the base object iterator - This test and its associated method are the only ones that use the methods and - other objects associated with the class we inherit from (MMPDataObjectsClass) - """ - - # full build of pairs and data objects from csv - self.test_mmp_pairs_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'CHEMBL_ID') - 
self.test_mmp_pairs_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'CHEMBL_ID') - tmp_dicer_file = self.test_mmp_pairs_object.write_mol_smi_dict_tofile() - self.test_mmp_pairs_object.build_from_dicer(tmp_dicer_file, 'BOTH', 'NONE') - # then convert final pairs to pandas dataframe - self.test_mmp_pairs_object.pairsdataobj_to_pd('BOTH', 'PIC50') - - columns = list(self.test_mmp_pairs_object.pairs_table.columns.values) - shape = self.test_mmp_pairs_object.pairs_table.shape - num_nulls = self.test_mmp_pairs_object.pairs_table.isnull().values.sum() - # don't want whole table in unittest so just check structure - self.assertEqual(columns, - ['CUT', 'MOLID_L', 'MOLID_R', 'CONTEXT', 'FRAG_L', 'FRAG_R', 'ATTCHPT_CTX_L', 'ATTCHPT_FRAG_L', - 'ATTCHPT_CTX_R', 'ATTCHPT_FRAG_R', 'PIC50', 'CHEMBL_TARGET_ID']) - self.assertEqual(shape[0], 280) - self.assertEqual(shape[1], 12) - self.assertEqual(num_nulls, 0) - - def test_pd_aggregation_frag_filter_dupes(self): - """Test method to filter certain dupes from a dataframe""" - - self.test_mmp_pairs_object.pd_read_csv(self.temp_file_input_pairdupes.name, self.test_columns_dict) - self.test_mmp_pairs_object.pd_aggregate_pairs_to_csv(self.temp_file_output_nodupes.name, 'frag') - - test_results_filehandle = open(self.temp_file_output_nodupes.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_03) - - def test_pd_aggregation_frag_filter_dupes_false(self): - """Test method to filter certain dupes from a dataframe""" - - self.test_mmp_pairs_object.pd_read_csv(self.temp_file_input_pairdupes.name, self.test_columns_dict) - self.test_mmp_pairs_object.pd_aggregate_pairs_to_csv(self.temp_file_output_nodupes.name, 'frag', - remove_id_dupes=False) - - test_results_filehandle = 
open(self.temp_file_output_nodupes.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_03b) - - def test_pd_aggregation_frag(self): - """Build pandas df from csv, aggregate then save pairs""" - - # do everything, read, group by frag (not frag attach), write - self.test_mmp_pairs_object.pd_read_csv(self.temp_file_input_pairs.name, self.test_columns_dict) - self.test_mmp_pairs_object.pd_aggregate_pairs_to_csv(self.temp_file_output.name, 'frag') - - test_results_filehandle = open(self.temp_file_output.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_01) - - def test_pd_aggregation_attach(self): - """Build pandas df from mmp data object iterator, aggregate then save pairs""" - - # self.test_dataset_goldeninput_csv_data - # full build of pairs and data objects from csv - self.test_mmp_pairs_object.csv_sniffer(self.temp_file_input_csv.name, 'SMILES', 'CHEMBL_ID') - self.test_mmp_pairs_object.csv_to_data_objects(self.temp_file_input_csv.name, 'SMILES', 'CHEMBL_ID') - tmp_dicer_file = self.test_mmp_pairs_object.write_mol_smi_dict_tofile() - self.test_mmp_pairs_object.build_from_dicer(tmp_dicer_file, 'SINGLE', 'NONE') - # then convert final pairs to pandas dataframe - self.test_mmp_pairs_object.pairsdataobj_to_pd('SINGLE', 'PIC50') - - self.test_mmp_pairs_object.pd_aggregate_pairs_to_csv(self.temp_file_output.name, 'attach') - - test_results_filehandle = open(self.temp_file_output.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - 
#print(self.test_dataset_testresults) # use this pprint statement to regenerate the golden data - self.assertEqual(self.test_dataset_testresults, self.test_dataset_goldenoutput_02) - - def test_pd_aggregation_diff(self): - """Test the diff function""" - - # do everything, read, group by frag (not frag attach), write - self.test_mmp_pairs_object.pd_read_csv(self.temp_file_input_diffdata.name) - self.test_mmp_pairs_object.pd_aggregate_pairs_to_csv(self.temp_file_output.name, - 'frag', - agg_method='diff60', - act_col='DIFF') - # self.test_mmp_pairs_object.grouped = pd.DataFrame(self.test_mmp_pairs_object.grouped).reset_index() - # self.test_mmp_pairs_object.grouped.sort_index(axis=1, inplace=True) - # self.test_mmp_pairs_object.pd_save_aggregate_pairs(self.temp_file_output.name) - - test_results_filehandle = open(self.temp_file_output.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - #print(self.test_dataset_testresults) - self.assertEqual(self.test_dataframe_goldenoutput_04, self.test_dataset_testresults) - - def test_pd_aggregation_diff_withprops(self): - """Test the diff function with prop data in same dataframe""" - - # do everything, read, group by frag (not frag attach), write - self.test_mmp_pairs_object.pd_read_csv(self.temp_file_input_diffdata_02.name) - self.test_mmp_pairs_object.pd_aggregate_pairs_to_csv(self.temp_file_output.name, - 'frag', - agg_method='diff60', - prop_data=True, - act_col='ACT_A') - # now read results back in for unittest compare - test_results_filehandle = open(self.temp_file_output.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - print(">>05>>", self.test_dataset_testresults) - self.assertEqual(self.test_dataframe_goldenoutput_05, self.test_dataset_testresults) - - def test_pd_aggregation_meandiff_invlog_withprops(self): - - 
self.temp_file_input_meandiff_withprops = tempfile.NamedTemporaryFile(delete=False, - encoding='utf-8', - mode='wt') - - # diff function test with props - self.temp_file_input_meandiff_withprops.write(self.test_dataset_goldeninput_diffheader_03+"\n") - for line in self.test_dataset_goldeninput_diffdata_03.items(): - self.temp_file_input_meandiff_withprops.write(line[0]+"\n") - self.temp_file_input_meandiff_withprops.close() - - self.test_mmp_pairs_object.pd_read_csv(self.temp_file_input_meandiff_withprops.name) - self.test_mmp_pairs_object.pd_aggregate_pairs_to_csv(self.temp_file_output.name, - 'frag', - agg_method='mean_diff_invlog', - prop_data=True, - act_col='DIFF') - - test_results_filehandle = open(self.temp_file_output.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - os.remove(self.temp_file_input_meandiff_withprops.name) - - #print(">>06>>", self.test_dataset_testresults) - self.assertEqual(self.test_dataframe_goldenoutput_06, self.test_dataset_testresults) - - def test_pd_aggregation_meandiff_withprops(self): - """ """ - self.temp_file_input_meandiff_withprops = tempfile.NamedTemporaryFile(delete=False, - encoding='utf-8', - mode='wt') - - # diff function test with props - self.temp_file_input_meandiff_withprops.write(self.test_dataset_goldeninput_diffheader_03+"\n") - for line in self.test_dataset_goldeninput_diffdata_03.items(): - self.temp_file_input_meandiff_withprops.write(line[0]+"\n") - self.temp_file_input_meandiff_withprops.close() - - self.test_mmp_pairs_object.pd_read_csv(self.temp_file_input_meandiff_withprops.name) - self.test_mmp_pairs_object.pd_aggregate_pairs_to_csv(self.temp_file_output.name, - 'frag', - agg_method='mean_diff', - prop_data=True, - act_col='DIFF') - - test_results_filehandle = open(self.temp_file_output.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - 
self.test_dataset_testresults[line] = None - - os.remove(self.temp_file_input_meandiff_withprops.name) - - #print(">>07>>", self.test_dataset_testresults) - self.assertEqual(self.test_dataframe_goldenoutput_07, self.test_dataset_testresults) - - def test_pd_aggregation_meandiff_withprops_withqmetric(self): - """Test of the Quality metric, output data last column should be 0,1 or 2 representing quality metric""" - self.temp_file_input_meandiff_withprops = tempfile.NamedTemporaryFile(delete=False, - encoding='utf-8', - mode='wt') - - # diff function test with props - self.temp_file_input_meandiff_withprops.write(self.test_dataset_goldeninput_diffheader_03+"\n") - for line in self.test_dataset_goldeninput_diffdata_03.items(): - self.temp_file_input_meandiff_withprops.write(line[0]+"\n") - self.temp_file_input_meandiff_withprops.close() - - - self.test_mmp_pairs_object.pd_read_csv(self.temp_file_input_meandiff_withprops.name) - self.test_mmp_pairs_object.pd_aggregate_pairs_to_csv(self.temp_file_output.name, - 'frag', - agg_method='mean_diff', - prop_data=True, - act_col='DIFF', - add_qmetric=True) - - test_results_filehandle = open(self.temp_file_output.name, 'r') - for line in test_results_filehandle: - line = line.rstrip('\r') - line = line.rstrip('\n') - self.test_dataset_testresults[line] = None - - os.remove(self.temp_file_input_meandiff_withprops.name) - - print(self.test_dataset_testresults) - self.assertEqual(self.test_dataframe_goldenoutput_08, self.test_dataset_testresults) - - -if __name__ == '__main__': - unittest.main() diff --git a/contrib/script/py/mmp/mmp_series_object.py b/contrib/script/py/mmp/mmp_series_object.py deleted file mode 100755 index e644691c..00000000 --- a/contrib/script/py/mmp/mmp_series_object.py +++ /dev/null @@ -1,2256 +0,0 @@ -################################################################### -""" Summary: Class and Methods to get and interogate matched series - -About: An object that reads a smiles file into memory and generates 
-matched series (OBoyle, Bostron, Sayle, Gill JMC 2014) of a defined -length via an in-house extension of the fragment indexing method -(Hussain and Rea JCIM 2010). Includes various matched series scoring -methods as annotated. (JAL) - -TODO: print Molecular Context from result series. Currently the context of the query series is printed. -TODO: print Series source as filename of query series or blank for internal SARA transfer - -""" -################################################################### -import logging -import sys -import os -import glob -from copy import deepcopy - -# specific -import pandas as pd -from operator import itemgetter -# from pandas.util.testing import assert_frame_equal - -# -from itertools import groupby, combinations # product, permutations # -from math import factorial -from scipy.stats import binom_test, pearsonr, spearmanr, linregress, skew -from scipy.spatial.distance import cityblock -import numpy as np -# from numpy import NaN - -# local imports -from mmp.mmp_data_objects import MMPDataObjectClass -from mmp.mmp_math_functions import inv_cantor -import mmp.mmp_enum_mols_from_pairs as enum_mols - -# things needed for unit testing only -import unittest -import tempfile -#from tempfile import NamedTemporaryFile, mkdtemp - -# https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas -pd.options.mode.chained_assignment = None - - -class MMPSeriesObjectClass(MMPDataObjectClass): - - """Class implements objects and methods for MMP series manipulation""" - - def __init__(self, logger_object): - """setup base object""" - # core objects - - # core objects from base class - MMPDataObjectClass.__init__(self, logger_object) - - # setup logger and check it's ok - self.logger = logger_object - if len(logging.Logger.manager.loggerDict) < 1: - # exit with system status 1 and custom error - sys.exit("Invalid or no logger object passed to MMPObjectClass. 
Please create \ - and pass a logger and set to use logging.disable if you don't want logging") - - # variable use to track method used to create series - self.data_column = None - self.data_column_position = None - self.min_series_length = 3 - self.return_series_max_len = None - self.threshold = 0.50001 - # series number incremented by _iterators - self.series_id = 0 - # filtering out series: - self.series_filtered_out = 0 - self.max_skew = 3 - self.min_pAct = 0.5 - # - self.running_fullscan = False - - # base pandas df - self.series_df = None - self.ref_series_df = None - - # temp stuff - self.key_smi_tempfile = None - self.series_comparison_df = None - self.enumerated_products_smi = None - # this becomes series context => max activity - self.max_series_potency = {} - - # set of series files to inject - self.series_from_file = False - self.series_base_dir = None - self.series_file_list = [] - - def clean_out_data_seriesobj(self): - """Implements method from MMPDataObjectClass to clear out all data structures plus - clears out a few more related to this extended class""" - - self.clean_out_data() - - self.data_column = None - self.data_column_position = None - self.min_series_length = 3 - self.return_series_max_len = None - self.threshold = 0.50001 - self.series_id = 0 - # - self.series_filtered_out = 0 - self.max_skew = 3 - self.min_pAct = 0.5 - - self.series_df = None - self.ref_series_df = None - - self.key_smi_tempfile = None - self.series_comparison_df = None - self.enumerated_products_smi = None - self.max_series_potency = {} - - self.series_from_file = False - self.series_base_dir = None - self.series_file_list = [] - - ############################################################################################ - # - # Setup and series retrieval methods - # - ############################################################################################ - - def setup_mmp_data_for_mms(self, - input_file, - smi_col, - id_col, - data_column, - min_series_len, - 
threshold, - cut_type='BOTH'): - """ - Method sets object level vars for MMS generation as well as running a number setup methods from mmp - object to generate and store pairs ready for MMS generation. Can take some time to do this. - :param input_file: CSV of smiles, id and activity data - :param smi_col: column name of smiles column - :param id_col: column name of id column - :param data_column: column name of activity data column - :param min_series_len: minimum length of a series must be >3 - :param threshold: maxff dicer setting (% molecule to be retained as context) - :param cut_type: single double or both cuts? - :return: None - """ - cut_type_options = ['SINGLE', 'DOUBLE', 'BOTH'] - if cut_type not in cut_type_options: - raise Exception('Errro, invalid cut type: %s' % cut_type) - self.logger.info('Setting up MMS generation via MMP data creation') - # parse csv - self.csv_sniffer(input_file, smi_col, id_col) - self.csv_to_data_objects(input_file, smi_col, id_col, std_smi=True) - # get pairs - tmp_dicer_file = self.write_mol_smi_dict_tofile() - ############################################################################## - # This line check for odd stuff in smiles like chirality and fails if present - ############################################################################## - self.scan_input_smiles(tmp_dicer_file, fail_chirals=True) - # build series - self.build_from_dicer(tmp_dicer_file, cut_type, 'NONE', threshold=threshold) - # - self.min_series_length = min_series_len - self.data_column = data_column - self.threshold = threshold - # - # Find the column position for the data_column in self.mol_data_dict - try: - self.data_column_position = self.headers_nosmi.index(data_column) - except: - raise Exception("Invalid data column specification, %s cannot be found in CSV headers" % data_column) - - self.logger.info('completed Setting up MMP data: %d sgl & %d dbl ctx stored for series gen' % - (len(self.single_pairs_dict), len(self.double_pairs_dict))) - - 
def _iterator_mmp_series_numeric(self, - use_comparison_df=False, - store_series_max_act=True, - sgl_or_dbl='single', - apply_pre_filter=False): - """ Method to iterate over Single Cut Dictionary structure and yield seed matched series - This does not exhaustively return all matched series - """ - - if self.min_series_length < 3: - sys.exit("Invalid value for param min_series_length in method iterator_singlecut_mmp_series, must be > 3") - - # - # can later change this to check which dict we are comparing, allows search between SMI sets not just - # within SMI sets via use of comparison dict - if sgl_or_dbl == 'single': - if use_comparison_df: - query_dict = self.single_pairs_comparison_dict - else: - query_dict = self.single_pairs_dict - - self.logger.info('Iterating over single cut pairs dictionary to get matched series of length N: %d' - % self.min_series_length) - - elif sgl_or_dbl == 'double': - if use_comparison_df: - query_dict = self.double_pairs_comparison_dict - else: - query_dict = self.double_pairs_dict - - self.logger.info('Iterating over double cut pairs dictionary to get matched series of length N: %d' - % self.min_series_length) - - else: - self.logger.warn('invalid cut type: %s (try single or double)' % sgl_or_dbl) - sys.exit('invalid cut type: %s (try single or double)' % sgl_or_dbl) - - # - for ctx_id in query_dict: - - # series are already determined by the data structure, for example: - # dict_single[ctx1] => { mol1_frag1_id => frag1, mol2_frag4_id => frag4, ... } is a series - # dict_single[ctx2] => { mol1_frag2_id => frag2, mol2_frag5_id => frag5, ... } is a series - # dict_single[ctx3] => { mol1_frag3_id => frag3, mol7_frag9_id => frag7} is only a pair - # dict_single[ctx4] => { mol2_frag6_id => frag6 } is nothing! 
- # series exist where the context is the same, but the fragments differ and there are more - # than two of them i.e.: param series_len > 2 - # ...but complexity occurs where we have two different mol_id's with the same frag_id's but diff - # activity. It also arises when two different mol_id's and frag_id's have identical activities - - series_length = len(query_dict[ctx_id]) - series_unsorted = [] - - if series_length >= self.min_series_length: - - self.logger.debug('Found a series of valid length (%s)' % series_length) - - # iterate and store all fragments, create series - for molid_fragid_uid_L in query_dict[ctx_id]: - - molid_L, frag_id_L = inv_cantor(molid_fragid_uid_L) - # frag_L = self.refsmi_dict[frag_id_L] - - # TODO: Code does not support situation where there are 1 or more text based columns! - # print self.mol_data_dict[molid_L] - # print self.mol_data_dict - # print self.headers - # print self.headers_nosmi - # print self.headers_numeric_position - try: - mol_act = self.mol_data_dict[molid_L][self.data_column_position] - except: - mol_act = None - - series_unsorted.append([frag_id_L, molid_L, mol_act]) - - ################################################################################# - # filter out series if: - # (a) the set of unique activity values is less than self.min_series_length - # or they have poor characteristics, Keefer & Chang MedChemComm 2017 - 10.1039/C7MD00465F - # (b) range in activity <= 0.5 - # (c) skew <= 3 - # quit the loop, discarding the series - ################################################################################# - act_data_set = set() - act_data_arr = [] - for item in series_unsorted: - act_data_set.add(item[2]) - act_data_arr.append(item[2]) - # print act_data_set - # print len(act_data_set), self.min_series_length - # (a) - if len(act_data_set) < self.min_series_length: - continue - - # now sort it - self.logger.debug('Ordering series') - series_sorted = sorted(series_unsorted, key=lambda xx: xx[2]) - - # 
TODO: check this as it seems too discriminatory! - act_data_nparr = np.array(act_data_arr) - # print ">>> range and skew" - # print "data: ", act_data_nparr - # print "range: ", (abs(act_data_nparr.max() - act_data_nparr.min())) - # print "skew: ", (3 * (act_data_nparr.mean() - np.median(act_data_nparr)))/act_data_nparr.std() - # print skew(act_data_nparr) - if apply_pre_filter: - - act_data_nparr = np.array(act_data_arr) - - # remove if below pAct range - if (abs(act_data_nparr.max() - act_data_nparr.min())) <= self.min_pAct: - self.series_filtered_out += 1 - continue - - # skew, via pearsons second coefficient (alternate method avoiding mode) - # remove if series is skewed - # http://www.statisticshowto.com/pearson-mode-skewness/ - # if (3 * (act_data_nparr.mean() - np.median(act_data_nparr)))/act_data_nparr.std() <= 3: - # easier using scipy: - # https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.stats.skew.html - if skew(act_data_nparr) >= self.max_skew: - self.series_filtered_out += 1 - continue - - self.series_id += 1 - series_seq_id = 0 - previous_act = None - - if store_series_max_act: - self.max_series_potency[ctx_id] = series_sorted[-1][2] - - ##################################################################### - # A sorted series gets a series_id - # Each fragment in the sorted series gets a series_seq_id - # The series_seq_id will only be incremented if the fragment has a greater activity than the previous - # activity. 
Identical series_seq_id within a given series indicate identical activity fragments - ##################################################################### - for [frag_id_L, molid_L, mol_act] in series_sorted: - - if previous_act is None: - previous_act = mol_act - series_seq_id += 1 - - else: - if mol_act > previous_act: - series_seq_id += 1 - previous_act = mol_act - - # (ctx1_id, ctx2_id) = inv_cantor(ctx_id) - # print self.series_id, series_seq_id, self.refsmi_dict[ctx1_id], self.refsmi_dict[ctx2_id], - # self.refsmi_dict[frag_id_L], molid_L, mol_act - yield self.series_id, series_seq_id, ctx_id, frag_id_L, molid_L, mol_act - - self.logger.info('Removed %s series due to skew and poor pAct range' % self.series_filtered_out) - # print ('Removed %s series due to skew and poor pAct range' % self.series_filtered_out) - - def generate_store_mmp_series(self, sgl_or_dbl_or_both='single', apply_pre_filter=False): - """ - Utilises MMS iterator to get single cut MMS's and store them - :param sgl_or_dbl_or_both: single cut, double cut or both - :param apply_pre_filter - :return: nothing, updates self.series_df in place - """ - - series_data = [] - - if sgl_or_dbl_or_both == 'single' or sgl_or_dbl_or_both == 'both': - series_data_sgl = [(a, b, c, d, e, f) for a, b, c, d, e, f in - self._iterator_mmp_series_numeric(apply_pre_filter=apply_pre_filter)] - series_data.extend(series_data_sgl) - # print series_data - - if sgl_or_dbl_or_both == 'double' or sgl_or_dbl_or_both == 'both': - series_data_dbl = [(a, b, c, d, e, f) for a, b, c, d, e, f in - self._iterator_mmp_series_numeric(sgl_or_dbl='double', - apply_pre_filter=apply_pre_filter)] - series_data.extend(series_data_dbl) - # print series_data - - self.series_df = pd.DataFrame(series_data, - columns=['SERIES_ID', 'SERIES_SEQ_ID', - 'CONTEXT_ID', 'FRAG_ID', - 'MOL_ID', 'ACTIVITY'] - ) - # self.series_df.set_index(['SERIES_ID', 'FRAG_ID'], inplace=True) - - # print('Parsed series CSV to dataframe of size %d, %d' % 
(self.series_df.shape[0], self.series_df.shape[1])) - self.logger.info('Parsed series CSV to dataframe of size %d, %d' % - (self.series_df.shape[0], self.series_df.shape[1])) - - ############################################################################################ - # - # Search Methods - # - ############################################################################################ - - def return_series_matching_idlist(self, id_list, column_name, use_comparison_df=False, strict_ordering=False): - """ - Search a df for all series that individually match every one of the frag ids in the query fragid list - :param id_list: a list of fragment ID's to use as query - :param column_name: the name of a column in self.series_df that will be searched for the ids listed - :param use_comparison_df compare two dfs - :param strict_ordering - :return: A pandas df of series that match the query - """ - if use_comparison_df: - self.logger.info('Searching comparison_df for series matching id list %s of type %s' % - (str(id_list), column_name)) - else: - self.logger.info('Searching series_df for series matching id list %s of type %s' % - (str(id_list), column_name)) - - if len(id_list) < 2: - raise Exception('Error, need at least 2 ids to search for matches in our list of series') - - if use_comparison_df: - search_df = self.series_comparison_df - else: - search_df = self.series_df - - if column_name not in search_df.columns: - raise Exception('Error, specified column must be present in df') - - # get the set of series_ids that contain the first id - # self.logger.debug("Searching for: %s" % str(id_list[0])) - series_ids = search_df[search_df[column_name] == id_list[0]]['SERIES_ID'].tolist() - - if len(series_ids) == 0: - return pd.DataFrame({}) - - selected_series_df = search_df[search_df['SERIES_ID'].isin(series_ids)] - - # iterate over the remaining fragments and keep series_id only if every fragment present - for idx, item_id in enumerate(id_list): - - if idx > 0: - - # 
self.logger.debug("Searching for: %s" % str(id_list[idx])) - series_ids = search_df[search_df[column_name] == id_list[idx]]['SERIES_ID'].tolist() - - if len(series_ids) == 0: - return pd.DataFrame({}) - - selected_series_df = selected_series_df[selected_series_df['SERIES_ID'].isin(series_ids)] - - # these are specifically not matched to the input fragment series order as - # some methods use the info about series of alternate ordering to derive observation stats - if strict_ordering: - - series_ids = selected_series_df.SERIES_ID.unique() - for series_id in series_ids: - - # get array of fragment ids - series_df = search_df[search_df['SERIES_ID'] == series_id] - # print series_df - # print "My ID list: ", id_list - # print "Searching column: ", column_name - items_idx_loc = [series_df[series_df[column_name] == x].index[0] for x in id_list] - - last_item_idx = 0 - for item_idx in items_idx_loc: - if item_idx < last_item_idx: - # this series is not in requested order so remove from df - self.logger.debug("Removing series %d as it is not ordered correctly" % series_id) - selected_series_df = selected_series_df[selected_series_df['SERIES_ID'] != series_id] - break - - last_item_idx = item_idx - - # print selected_series_df - return selected_series_df - - def search_for_mms_to_extend_molids(self, molid_list, strict_order=True, use_comparison_df=False): - """ - Main method used to search a pre-generated pandas data table of matched series data for any - series that can be applied as extensions to any/all series found for the molid_list. 
This method - mainly chains together a set of other methods - :param molid_list: - :return: - """ - self.logger.info('Searching for series to extend molids: %s' % str(molid_list)) - - list_len = len(molid_list) - if self.return_series_max_len is None: - self.return_series_max_len = deepcopy(list_len) + 1 - - elif list_len >= self.return_series_max_len: - self.return_series_max_len = deepcopy(list_len) + 1 - - if use_comparison_df is False: - if self.series_df.shape[0] < 4: - raise Exception("Error, no series found in series_df. Please generate series before trying to " - "search them") - else: - if self.series_comparison_df.shape[0] < 4: - raise Exception("Error, no series found in comparison_series_df. Please check input series files " - "before trying to search them") - - # TODO: This will only work if the query id's trigger the creation of a series (a matched pair will not work) - search_series_df = self.return_series_matching_idlist(molid_list, 'MOL_ID', - strict_ordering=strict_order, - # False because we start by extracting series for - # input mols by querying self.series_df not ref set - use_comparison_df=False) - - if search_series_df.empty or search_series_df.shape[0] < 3: - raise Exception("Error, no series were found for query molids in target search series_df.") - - # return dataframe - if use_comparison_df: - results_df = pd.DataFrame(dtype='int', - columns=['RESULT_ID', 'RESULT_SERIES_SEQ_ID', 'RESULT_MOL_ID', - 'RESULT_CONTEXT_ID', 'RESULT_FRAG_ID', 'QUERY_FRAG_ID', - 'QUERY_MOL_ID', 'RESULT_MOL_ACTIVITY', 'SOURCE_FILE']) - else: - results_df = pd.DataFrame(dtype='int', - columns=['RESULT_ID', 'RESULT_SERIES_SEQ_ID', 'RESULT_MOL_ID', - 'RESULT_CONTEXT_ID', 'RESULT_FRAG_ID', 'QUERY_FRAG_ID', - 'QUERY_MOL_ID', 'RESULT_MOL_ACTIVITY']) - - result_id = 0 - - # now extract the ordered fragment lists for these series and use each one as a query to find matching series - search_series_lst = search_series_df.SERIES_ID.unique() - self.logger.info("Found these 
series ids to use as search seed: %s" % str(search_series_lst)) - for search_series_id in search_series_lst: - - ############################### - # Convert each series id into a list of fragments we can use to query for similar series - # 1. get all data for the series as df - # 2. get fragid for each of the molids - temp_df = search_series_df[search_series_df.SERIES_ID == search_series_id] - - search_series_context_id = temp_df['CONTEXT_ID'].max() - self.logger.debug("Query series has context: %s" % search_series_context_id) - - search_frag_ids = [] - for molid in molid_list: - search_frag_ids.extend(temp_df[temp_df.MOL_ID == molid]['FRAG_ID'].tolist()) - - self.logger.info("For input mols found series %s" % str([self.refsmi_dict[x] for x in search_frag_ids])) - self.logger.info("Will now use this to search for longer series from other mols") - - ############################### - # Get a list of the series that match our query fragids - matching_series_df = self.return_series_matching_idlist(search_frag_ids, 'FRAG_ID', - strict_ordering=strict_order, - use_comparison_df=use_comparison_df) - - # print search_frag_ids, str([self.refsmi_dict[x] for x in search_frag_ids]) - - if matching_series_df.empty is True: - continue - - ################################################ - # debug to help generate test data - ################################################ - # result_df_print = copy.deepcopy(matching_series_df) - # print result_df_print.to_csv() - # result_df_print['CONTEXT_ID'] = result_df_print['CONTEXT_ID'].apply(lambda x: self.refsmi_dict[x]) - # result_df_print['FRAG_ID'] = result_df_print['FRAG_ID'].apply(lambda x: self.refsmi_dict[x]) - # result_df_print.rename(columns={'CONTEXT_ID': 'CONTEXT', - # 'FRAG_ID': 'FRAG'}, - # inplace=True) - # print search_frag_ids - # print result_df_print.to_csv() - ################################################# - # Done debug - ################################################# - - matching_series_lst = 
matching_series_df.SERIES_ID.unique() - self.logger.debug("Found %d series that match query series" % len(matching_series_lst)) - - ############################### - # For each series that has matched our query fragid list: - # 1. Drop it if it's the same series id as our original query - # 2. Check it has a length longer than our query and last element is new i.e.: extends query - # 3. Merge results into results dataframe with additional columns for query and context ids - - for matching_series_id in matching_series_lst: - - # make sure our matching series is not the same as one of our query ones - # only in the case where we are working on the same df that the query came from - if use_comparison_df is False and matching_series_id in search_series_lst: - self.logger.debug("Dropped this series as it is part of the query series set") - - else: - self.logger.debug("For molid list %s I have query series fragids %s" % - (str(molid_list), str(search_frag_ids))) - - matching_series_fragids = matching_series_df[matching_series_df.SERIES_ID == - matching_series_id]['FRAG_ID'].tolist() - - # check this matched series can be used to extend our existing series - if (len(matching_series_fragids) > len(molid_list)) \ - and (matching_series_fragids[-1] not in search_frag_ids): - # this line can be used if the series are always ordered - # and (matching_series_fragids[-1] != search_frag_ids[-1]): - # this line in an alternate version for non-ordered series - # and (matching_series_fragids[-1] is not in search_frag_ids): - - ################################ - # these are the results we will pivot into a return df - matching_series_molids = matching_series_df[matching_series_df.SERIES_ID == - matching_series_id]['MOL_ID'].tolist() - matching_series_seqids = matching_series_df[matching_series_df.SERIES_ID == - matching_series_id]['SERIES_SEQ_ID'].tolist() - matching_series_act = matching_series_df[matching_series_df.SERIES_ID == - matching_series_id]['ACTIVITY'].tolist() - 
matching_series_frag_smi = [self.refsmi_dict[x] for x in matching_series_fragids] - context_id = matching_series_df[matching_series_df.SERIES_ID == - matching_series_id]['CONTEXT_ID'].unique()[0] - num_items = len(matching_series_molids) - self.logger.debug("The common context SMI is: %s" % self.refsmi_dict[context_id]) - - result_id += 1 - - ################################ - # these are also added to the pandas table and flag which items in the matching series - # were part of the query and which were not (NaN). It's not that intelligent and might - # flag an item twice - matching_series_query_molids = [] - matching_series_query_fragids = [] - - for idx, item in enumerate(matching_series_fragids): - try: - locn = search_frag_ids.index(item) - matching_series_query_fragids.append(item) - matching_series_query_molids.append(molid_list[locn]) - except: - matching_series_query_fragids.append(0) - matching_series_query_molids.append(0) - - ################################ - # finally form a return dataframe from all the lists - a_result = pd.DataFrame({'RESULT_ID': [result_id] * num_items, - 'RESULT_SERIES_SEQ_ID': matching_series_seqids, - 'RESULT_MOL_ID': matching_series_molids, - 'RESULT_CONTEXT_ID': [context_id] * num_items, - 'QUERY_CONTEXT_ID': [search_series_context_id] * num_items, - 'RESULT_FRAG_ID': matching_series_fragids, - 'QUERY_FRAG_ID': matching_series_query_fragids, - 'QUERY_MOL_ID': matching_series_query_molids, - 'RESULT_MOL_ACTIVITY': matching_series_act - }, - columns=['RESULT_ID', 'RESULT_SERIES_SEQ_ID', 'RESULT_MOL_ID', - 'RESULT_CONTEXT_ID', 'RESULT_FRAG_ID', 'QUERY_FRAG_ID', - 'QUERY_MOL_ID', 'QUERY_CONTEXT_ID', 'RESULT_MOL_ACTIVITY'], - # does not seem to work - # dtype='int' - ) - - if use_comparison_df: - a_result['SOURCE_FILE'] = matching_series_df[matching_series_df.SERIES_ID == - matching_series_id]['SOURCE_FILE'].unique()[0] - - # py2>3 explicit sort=False https://github.com/pandas-dev/pandas/issues/4588#issue-18183895 - # use False 
because I control column order and should be aligned - results_df = pd.concat([results_df, a_result], sort=False) - - # sanity check - # for idx, item in enumerate(matching_series_fragids): - # print result_id, idx+1, matching_series_molids[idx], context_id, item, - # if item in search_frag_ids: - # print item, matching_series_molids[idx] - # else: - # print NaN, NaN - # different format sanity check - self.logger.debug("Found a series we can extend your query series with:") - self.logger.debug("Query (Mols): %s" % str(molid_list)) - self.logger.debug("Query (Frags): %s" % str([self.refsmi_dict[x] for x in search_frag_ids])) - self.logger.debug("Match (Frags): %s" % matching_series_frag_smi) - self.logger.debug("Match (Mols): %s" % matching_series_molids) - # print ("Found a series we can extend your query series with:") - # print ("Query (Mols): %s" % str(molid_list)) - # print ("Query (Frags): %s" % str([self.refsmi_dict[x] for x in search_frag_ids])) - # print ("Match (Frags): %s" % matching_series_frag_smi) - # print ("Match (Mols): %s" % matching_series_molids) - - else: - self.logger.debug("Dropped this series as it is the same length as our query") - - if results_df.empty is False: - results_df = results_df.astype({'RESULT_ID': int, - 'RESULT_SERIES_SEQ_ID': int, - 'RESULT_MOL_ID': int, - 'QUERY_MOL_ID': int, - 'RESULT_CONTEXT_ID': int, - 'QUERY_CONTEXT_ID': int, - 'RESULT_FRAG_ID': int, - 'QUERY_FRAG_ID': int - }) - - # this is not essential but if converting from df to dict you lose data - # due to non-unique index, which in turn caused bugs in the unittest - # results_df['QUERY_MOL_ID'] = results_df[['QUERY_MOL_ID']].astype(int, ) - results_df.reset_index(drop=True, inplace=True) - # force NaN to zero's. Many later methods use filtering of column values >0 to determine the existence of - # a result value (e.g.: find all rows with a query frag/mol). Also, NaN is not valid for float columns in pd - # use zero as surrogate NaN. 
- results_df.fillna(value=0, inplace=True) - - return results_df - - ############################################################################################ - # - # Scoring Methods - # - ############################################################################################ - - def _iterator_series_pstats(self, frag_list, series_dataframe, strict_order=False): - """ - Score all examples of frag_list (a series) found in series_dataframe using the p-value - method of OBoyle, Bostron, Sayle, Gill JMC 2014 - :param frag_list: list of fragments in - :param series_dataframe: series_dataframe containing all series involving frag_list - :param strict_order: Typically False as you want to get back the series stats for all - possible orderings of the frag_list. If set to true the stats will not change, all - possible orderings of the frag_list will still be explored but only the specific order - requested in the frag_list will be returned (i.e.: less return data only) - :return: a generator containing each series and it's stats - """ - self.logger.info("searching for: %s and generating stats" % str(frag_list)) - - ########################################################################################### - # For the input series there are many possible alternate series of a different ordering - # but only some of these are observed. All alternate permutations can be see here: - # print [x for x in permutations(frag_list)] - # but what we really want is observed so we have to transform the series_dataframe into a - # dict of an_observed_series => {series_id: series_data, series_id: series_data} - # From this we can determine the series stats as per OBoyle, Bostron, Sayle, Gill JMC 2014 - ########################################################################################### - observed_series = {} - - # bug: for some reason the .values method returns float values from an int columns - # which means later id matches are comparing int and float and fail to match!! 
- # added [int(y) for y in x] as we get back list of float not int - - # TODO: This is a hack due to a bug in way we score series, remove it and code will fail - # if series_dataframe.empty: - # this series has never been seen! - # typically because we took a fragment from source set and transferred onto target set - # series, observations, total_series_obs, observed_prob, >expected_prob, enrichment, p_value, - # p_value_corrected, significant - - # self.logger.debug('Series has never been observed!') - # yield frag_list, 0, 0, 0, 1.00/float(factorial(len(frag_list))), 0, 0, 0, 0 - - mylist = [tuple([int(y) for y in x]) for x in - series_dataframe[series_dataframe['FRAG_ID'].isin(frag_list)] - [['SERIES_ID', 'FRAG_ID', 'SERIES_SEQ_ID']].values] - - # transpose to dict keyed by series with values eq to list of tuples (frag_id, series_seq_id) - all_series = {} - for k, g in groupby(mylist, itemgetter(0)): - all_series[k] = [el[1:] for el in g] - - # print all_series - - # sort each series by series_seq_id, transpose to dict keyed by sorted (observed!) 
series - for series_id in all_series: - # sort by series_seq_id, return only fragid - # http://stackoverflow.com/questions/10695139/sort-a-list-of-tuples-by-2nd-item-integer-value - series_sorted = sorted(all_series[series_id], key=lambda lst: lst[1]) - series_sorted = tuple(map((lambda tup: tup[0]), series_sorted)) - - self.logger.debug("Series id %s is sorted to %s" % (series_id, str(series_sorted))) - - # store - if series_sorted not in observed_series: - observed_series[series_sorted] = {} - - observed_series[series_sorted][series_id] = None - - ################################################################### - # Generate the statistics on how significant this series is - ################################################################### - - total_series_obs = len(all_series) - n_factorial = factorial(len(frag_list)) - # get p-value for specific observations - for series in observed_series: - - # enrichment - observations = len(observed_series[series]) - observed_prob = observations/float(total_series_obs) - - # added try/except due to long series total_series_obs >= 270 causing Long Int to float conversion fail - # https://stackoverflow.com/questions/13978556/ - # python-long-int-too-large-to-convert-to-float-when-calculating-pi - #series_too_large = False - try: - expected_prob = 1.00/n_factorial - except Exception: - yield series, observations, total_series_obs, observed_prob, \ - np.nan, np.nan, np.nan, np.nan, np.nan - # yield does not imply continue but we're done so skip to next iteration - continue - - enrichment = observed_prob / expected_prob - - # p-value (corrected) - # use binom_test(count, nTrials, pEvent) where pEvent is the expected prob (1/n_factorial) - p_value = binom_test(observations, total_series_obs, expected_prob) - # Bonferroni correction, multiplying the original p-value by the degrees of freedom - # i.e.: p_value * (N! 
- 1) - p_value_corrected = p_value * (n_factorial - 1) - - if p_value_corrected <= 0.05: - significant = True - else: - significant = False - - if strict_order: - if series == tuple(frag_list): - yield series, observations, total_series_obs, observed_prob, expected_prob, enrichment, \ - p_value, p_value_corrected, significant - else: - self.logger.debug("Removed series as non-matching order %s versus %s" % (str(series), - str(tuple(frag_list)))) - else: - yield series, observations, total_series_obs, observed_prob, expected_prob, enrichment, \ - p_value, p_value_corrected, significant - - def _iterator_series_scoringmetrics(self, mol_id_list, selected_series_df, apply_filter=False): - """ - Method will iterate over the df and take each pair of series, convert MolID's to activity values - then use the ordered series to calculate an cRMSD (centered RMSD). Originally designed to take - the output from the method search_for_mms_to_extend_molids and process to get cRMSD values. - Scoring (JAL): - cRMSD score is from Matched Molecular Series: Measuring SAR Transferability - Christian Kramer, - Emanuel Ehmki, (Roche), Seventh Joint Sheffield Conference on Chemoinformatics 4th July 2016 and - J Chem Inf Model. 2017 May 1. doi: 10.1021/acs.jcim.6b00709 - http://cisrg.shef.ac.uk/shef2016/conference-programme/poster-session/... 
- matched-molecular-series-measuring-sar-transferability/ - http://pubs.acs.org/doi/abs/10.1021/acs.jcim.6b00709 - Filtering - apply_filter approach was taken from Keefer and Chang MedChemComm 2017 to remove "candidate series" - with poor characteristics, pActivity < 5 or skew < 3 - :param: mol_id_list: list of the molecules used in query, order specific, will be sanity checked - :param selected_series_df: series_dataframe containing all series involving frag_list - :param apply_filter: boolean, remove candidate series with poor characteristics - :return: a generator containing each series and it's cRMSD - """ - # sanity check the data table for the columns we need - required_col = ['RESULT_ID', 'QUERY_MOL_ID', 'RESULT_MOL_ID', 'QUERY_FRAG_ID', 'RESULT_FRAG_ID'] - for col in required_col: - if col not in selected_series_df.columns: - raise Exception("Input data table does not have required columns: %s" % col) - - # sanity check, the series we are scoring must contain all mols in the query series - # want the RESULT_FRAG_ID where the QUERY_FRAG_ID is not null - first_series_df = selected_series_df[selected_series_df['RESULT_ID'] == selected_series_df['RESULT_ID'].min()] - df_mol_ids = first_series_df[first_series_df['QUERY_MOL_ID'] != 0]['QUERY_MOL_ID'].tolist() - - if sorted(df_mol_ids) != sorted(mol_id_list): - raise Exception("The matching result mol ids in the data frame do not match the mol ids in mol_id_list") - - ########################## - # Calculation of delta p value - # (query) - do this here to avoid repetition inside loop - query_mols_data = [] - - # if self.series_from_file == True: - # print self.series_comparison_df.columns - # print "Using Comparison >>>" - # print [self.series_comparison_df[self.series_comparison_df['MOL_ID'] == x]['ACTIVITY'].max() for x in - # mol_id_list] - # print "--" - # print [self.series_comparison_df[self.series_comparison_df['MOL_ID'] == x]['ACTIVITY'].to_csv() for x in - # mol_id_list] - # print "--" - # print 
[self.series_comparison_df[self.series_comparison_df['MOL_ID'] == x].to_csv() for x in mol_id_list] - # print "--" - # self.series_comparison_df[self.series_comparison_df['MOL_ID'] == x]['ACTIVITY'] - # for x_ in mol_id_list: - # query_mols_data.append(self.series_comparison_df[self.series_comparison_df['MOL_ID'] == - # int(x_)]['ACTIVITY'].max()) - # else: - for x_ in mol_id_list: - query_mols_data.append(self.mol_data_dict[int(x_)][self.data_column_position]) - - query_avg = sum([x for x in query_mols_data]) / float(len(query_mols_data)) - - sort_order = {} - for idx, id_ in enumerate(mol_id_list): - sort_order[id_] = idx - - # iterate over each of the pairs of query:result matched series - for result_id in selected_series_df['RESULT_ID'].unique(): - - just_this_series_df = selected_series_df[selected_series_df['RESULT_ID'] == result_id] - - new_rgroups_df = just_this_series_df[just_this_series_df['QUERY_MOL_ID'] == 0] - just_this_series_df = just_this_series_df[just_this_series_df['QUERY_MOL_ID'] > 0] - - # sanity check - #print("Try this one") - #print(sorted(set(just_this_series_df['QUERY_MOL_ID'].tolist())), sorted(set(mol_id_list))) - # converted to set() because of situations where we have chiral enantiomers of mol with same fragment - # creating a series with the same fragment twice, with matching query mol then repeated twice, - # - just_query_mol_ids = just_this_series_df['QUERY_MOL_ID'].tolist() - if sorted(set(just_query_mol_ids)) != sorted(set(mol_id_list)): - raise Exception("Data error: input dataframe query mols do not match your input mol_id_list") - - # sort according to the input mol_id_list order - # TODO: bug - ValueError: Categorical categories must be unique, add filter to remove for now - # case occurs where multiple identical Mol_IDS exist in column 'QUERY_MOL_ID - if len(set(just_query_mol_ids)) != len(just_query_mol_ids): - #print("Ditch this set as multiple identical QUERY_MOL_IDs found") - 
#print(sorted(set(just_this_series_df['QUERY_MOL_ID'].tolist())), sorted(set(mol_id_list))) - continue - - just_this_series_df['QUERY_MOL_ID_CAT'] = pd.Categorical(just_this_series_df['QUERY_MOL_ID'], - categories=mol_id_list, - ordered=True) - - just_this_series_df = just_this_series_df.sort_values('QUERY_MOL_ID_CAT') - # would this work instead? - # just_this_series_df['QUERY_MOL_ID_CAT'] = just_this_series_df.reindex(mol_id_list) - - result_mol_ids = just_this_series_df['RESULT_MOL_ID'].tolist() - result_frag_ids = just_this_series_df['RESULT_FRAG_ID'].tolist() - result_mols_data = just_this_series_df['RESULT_MOL_ACTIVITY'].tolist() - - if apply_filter: - - act_data_nparr = np.array(result_mols_data) - - # range in activity - # print "- range = ", (abs(act_data_nparr.max() - act_data_nparr.min())) - if (abs(act_data_nparr.max() - act_data_nparr.min())) < 0.5: - continue - - # skew, via pearsons second coefficient (alternate method avoiding mode) - # print "- skew = ", (3 * (act_data_nparr.mean() - np.median(act_data_nparr)))/act_data_nparr.std() - if (3 * (act_data_nparr.mean() - np.median(act_data_nparr)))/act_data_nparr.std() > 3: - continue - - ########################## - # Calculation of cRMSD / RMSD - # - # print query_mols_data, result_mols_data - # convert arrays to np.arrays - query_mols_data = np.array(query_mols_data) - result_mols_data = np.array(result_mols_data) - - result_avg = sum([x for x in result_mols_data]) / float(len(result_mols_data)) - - # center the data to compensate for the different core, assay or interaction - # of the compound with the target (see ref) - query_mols_data_centered = np.array([(x - query_avg) for x in query_mols_data]) - result_mols_data_centered = np.array([(x - result_avg) for x in result_mols_data]) - # - rmsd = np.sqrt((query_mols_data - result_mols_data) ** 2).mean() - rmsd_c = np.sqrt((query_mols_data_centered - result_mols_data_centered) ** 2).mean() - - ########################## - # Predicted activity: - # 
Calculation of p values - # (difference to the mean or diffMean) - # Calculation via Linear Regression - # - new_fragids = new_rgroups_df['RESULT_FRAG_ID'].tolist() - # diffMean model method - new_predict_act_dM = [] - # linear model method - lm_x = linregress(query_mols_data, result_mols_data) - new_predict_act_lm = [] - new_predict_lm_rsqd = lm_x.rvalue - # print lm_x.slope, lm_x.intercept - - for molid in new_rgroups_df['RESULT_MOL_ID'].tolist(): - - if self.series_from_file: - molid_activity = self.series_comparison_df[self.series_comparison_df['MOL_ID'] == - molid]['ACTIVITY'].max() - - else: - molid_activity = self.mol_data_dict[molid][self.data_column_position] - - new_predict_act_dM.append(query_avg + (molid_activity - result_avg)) - new_predict_act_lm.append((lm_x.slope * molid_activity) + lm_x.intercept) - - ########################## - # Distance metrics - # https://docs.scipy.org/doc/scipy/reference/spatial.distance.html - # from scipy.spatial.distance import cityblock - # - manhattan_dist = cityblock(query_mols_data, result_mols_data) - manhattan_dist_c = cityblock(query_mols_data_centered, result_mols_data_centered) - - ########################## - # Calculation of r2 - # gets rank order coeff and p-val as list - # from scipy.stats import pearsonr, spearmanr - # - pearson_r = pearsonr(query_mols_data, result_mols_data) - spearman_r = spearmanr(query_mols_data, result_mols_data) - - yield result_id, result_mol_ids, result_frag_ids, new_fragids, \ - new_predict_act_dM, new_predict_act_lm, new_predict_lm_rsqd, rmsd, rmsd_c, \ - manhattan_dist, manhattan_dist_c, pearson_r[0], pearson_r[1], spearman_r[0], spearman_r[1] - - ############################################################################################ - # - # Write results methods - # - ############################################################################################ - - def write_raw_series_to_file(self, csv_out, apply_pre_filter=False): - """ This method writes out raw matched 
series to file - """ - - self.logger.info('Writing raw series to file: %s' % csv_out) - - with open(csv_out, "w") as out_file: - - out_file.write('SERIES_ID,SERIES_SEQ_ID,MOL_ID,CONTEXT,FRAG,ACTIVITY\n') - - for series_id, series_seq_id, ctx_id, \ - frag_id_L, molid_L, mol_act in self._iterator_mmp_series_numeric(apply_pre_filter=apply_pre_filter): - - print_string = str(series_id) + "," + str(series_seq_id) + "," + str(molid_L) + "," - print_string += self.refsmi_dict[ctx_id] + "," - print_string += self.refsmi_dict[frag_id_L] + "," - print_string += str(mol_act) + "\n" - - out_file.write(print_string) - - self.logger.info('Done writing file: %s' % csv_out) - - def return_scored_series_dataframe(self, - mol_id_list, - results_dataframe, - return_dataframe, - append=False, - apply_filter=False): - """ - Take a dataframe as input and validate as it expects the output from search_for_mms_to_extend_molids. Now - iterate across the result series and score then return.... - :param mol_id_list the mol id list used to generate the results_dataframe - :param results_dataframe name of input dataframe to reformat/score - :param return_dataframe name of dataframe to return, this var must exist but can be set to None - :param append should I append the reformatted results_dataframe to return_dataframe or is it a new df - :param apply_filter (Filter) - :return reformatted, scored dataframe - """ - # self.logger.info('Writing scored series to file %s' % csv_out) - - # validation of df structure - required_col = ['RESULT_ID', 'RESULT_SERIES_SEQ_ID', 'QUERY_MOL_ID', 'RESULT_MOL_ID', - 'QUERY_FRAG_ID', 'QUERY_CONTEXT_ID', 'RESULT_CONTEXT_ID', 'RESULT_FRAG_ID'] - for col in required_col: - if col not in results_dataframe.columns: - raise Exception("Input data table does not have required columns: %s" % col) - - inc_source_sol = False - if 'SOURCE_FILE' in results_dataframe.columns: - inc_source_sol = True - - # catch for empty table - if results_dataframe.shape[0] == 0: - print("No 
results found") - return False - - # get RMSE scoring (work on unexpanded series) - results_container = {} - unique_series = set() - - for result_id, result_mol_ids, result_frag_ids, new_fragids, \ - new_predict_act_dM, new_predict_act_lm, new_predict_lm_rsqd,\ - rmsd, rmsd_c, manhattan_dist, manhattan_dist_c, \ - pearson_r2, pearson_p, spearman_r2, spearman_p \ - in self._iterator_series_scoringmetrics(mol_id_list, results_dataframe, apply_filter=apply_filter): - - results_container[result_id] = [result_id, result_mol_ids, result_frag_ids, new_fragids, - new_predict_act_dM, new_predict_act_lm, new_predict_lm_rsqd, - rmsd, rmsd_c, manhattan_dist, manhattan_dist_c, - pearson_r2, pearson_p, spearman_r2, spearman_p] - unique_series.add(result_id) - - # debugging - need to see what I have - # with open('rmse_score.csv', 'wb') as outrmse: - # for key, value in results_container.items(): - # outrmse.write(str(key)+","+str(value)+"\n") - # print key, value - - # get all new expanded series - basically the ideas of things we should make - # ...and get the p-value based scoring for them - # removed this as if we do filtering on the result series by range and skew, we might lose result_id's - # instead build this in the above loop - # unique_series = results_dataframe['RESULT_ID'].unique() - - # - # setup return dataframe inc headers - # - if append is False: - return_dataframe = None - - # num_items = len(mol_id_list) - header_row = ['LAST_QUERY_SERIES_MOLID', 'LAST_QUERY_SERIES_MOLSMI', 'QUERY_MOL_CONTEXT', - 'RESULT_MOL_CONTEXT', 'QUERY_MOL_FRAG_L', 'NEW_FRAG_R'] - # series_frag_smi list - # (a) here we only have the query frags - for idx in range(self.return_series_max_len): - header_row.append('QUERY_SERIES_FRAG_' + str(idx + 1)) - header_row.extend(['SERIES_N_OBS', 'SERIES_N_TOTALOBS', 'OBS_PROB', 'EXPECTED_OBS', 'ENRICHMENT', - 'P_VALUE', 'P_VALUE_CORRECTED', 'SIGNIFCANCE']) - # results_container[result_id] - # (b) here we have the query frag + 1 additional matched 
frag that extends the series hence +1 - for idx in range(self.return_series_max_len + 1): - header_row.append('RESULT_SERIES_MOLID_' + str(idx + 1)) - for idx in range(self.return_series_max_len + 1): - header_row.append('RESULT_SERIES_FRAGSMI_' + str(idx + 1)) - header_row.extend(['PRED_ACT_DDP', 'PRED_ACT_LR', 'PRED_ACT_LR_R2', 'RMSE', 'cRMSE', 'MD', 'cMD', - 'PEARSON_R2', 'PEARSON_PVAL', 'SPEARMAN_R2', 'SPEARMAN_PVAL']) - if inc_source_sol: - header_row.append('SOURCE_FILE') - - # - results_rows = [] - - # iterate over the results, foreach series get the last molecule from query and the new fragment - # to make (can be more than one per series), return single new fragment idea per line with score - for result_id in unique_series: - - result_subdf = results_dataframe[results_dataframe['RESULT_ID'] == result_id] - query_context_id = result_subdf['QUERY_CONTEXT_ID'].max() - result_context_id = result_subdf['RESULT_CONTEXT_ID'].max() - if inc_source_sol: - source_file = result_subdf.SOURCE_FILE.unique()[0] - # print source_file - - # get rowid for last element in query - last_query_frag_seqid = result_subdf[result_subdf['QUERY_FRAG_ID'] != 0]['RESULT_SERIES_SEQ_ID'].max() - query_fragids = result_subdf[result_subdf['QUERY_FRAG_ID'] != 0]['QUERY_FRAG_ID'].tolist() - query_fragids = [int(x) for x in query_fragids] - last_query_fragid = query_fragids[-1] - last_query_molid = result_subdf[result_subdf['QUERY_FRAG_ID'] != 0]['QUERY_MOL_ID'].tolist()[-1] - # get all result frag_id, mol_id that come after the last element in query - new_fragments = [tuple(x) for x in result_subdf[result_subdf['RESULT_SERIES_SEQ_ID'] > - last_query_frag_seqid][['RESULT_FRAG_ID', - 'RESULT_MOL_ID']].values] - - # I need to do p-scoring for the frag list, which needs all example series matching frag list. - # These already exist in the results_dataframe but other series might also exist that were found - # from a different set of query frags. 
Therefore we need to filter the results_dataframe to get only - # all series matching the frag list. - for (new_fragid, new_molid) in new_fragments: - result_fragids = deepcopy(query_fragids) - result_fragids.append(int(new_fragid)) - - self.logger.debug('New frag %s searching comparison dict %s' % (new_fragid, self.series_from_file)) - - # For the new extended series that we have found (query plus new frag_id), score each one - # TODO: Look at why I have to use a dataframe level property, inconsistent with use_comparison_df param - # in other words I should be able to just pass use_comparison_df, maybe make use_comparison_df an object - # level property - if self.series_from_file is False: - temp_results = self.return_series_matching_idlist(result_fragids, 'FRAG_ID') - else: - temp_results = self.return_series_matching_idlist(result_fragids, 'FRAG_ID', use_comparison_df=True) - # TODO: you should not be looking in comparison dict as data came from source - if self.running_fullscan and temp_results.empty: - temp_results = self.return_series_matching_idlist(result_fragids, 'FRAG_ID') - - # TODO: if using non strict matching, this makes no sense, return nulls instead - for series, n_obs, tot_series_obs, obs_prob, exptd_prob, enrichment, p_val, \ - p_val_corrected, signif in self._iterator_series_pstats(result_fragids, temp_results, - strict_order=True): - # - single_row = [] - frag_l = self.refsmi_dict[last_query_fragid] - frag_r = self.refsmi_dict[new_fragid] - query_context_smi = self.refsmi_dict[query_context_id] - result_context_smi = self.refsmi_dict[result_context_id] - series_frag_smi = [self.refsmi_dict[x] for x in series] - series_frag_smi = series_frag_smi[:-1] - this_series_len = len(series_frag_smi) - series_frag_smi.extend([None for _ in range(this_series_len, self.return_series_max_len)]) - - # TODO: This should be the smiles not the context - if self.series_from_file: - last_query_mol_smi = self.series_comparison_df[self.series_comparison_df['MOL_ID'] 
== - last_query_molid]['CONTEXT_ID'].max() - else: - last_query_mol_smi = self.mol_smi_dict[last_query_molid] - - # print last_query_molid, last_query_mol_smi, frag_l, "-->", frag_r, series_frag_smi, n_obs, \ - # tot_series_obs, obs_prob, exptd_prob, enrichment, p_val, \ - # p_val_corrected, signif, results_container[result_id] - single_row.extend([last_query_molid, last_query_mol_smi, query_context_smi, - result_context_smi, frag_l, frag_r]) - single_row.extend(series_frag_smi) - # - single_row.extend([n_obs, tot_series_obs, obs_prob, exptd_prob, enrichment, - p_val, p_val_corrected, signif]) - # get the result molids that match the query molids via fragment series match - # and add the new mol that adds new information - single_row.extend(results_container[result_id][1]) - single_row.append(new_molid) - single_row.extend([None for _ in range(this_series_len, self.return_series_max_len)]) - single_row.extend([self.refsmi_dict[x] for x in results_container[result_id][2]]) - single_row.append(frag_r) - single_row.extend([None for _ in range(this_series_len, self.return_series_max_len)]) - # get the predicted activity for this fragment - idx_loc = results_container[result_id][3].index(new_fragid) - single_row.append(results_container[result_id][4][idx_loc]) - single_row.append(results_container[result_id][5][idx_loc]) - # same again for the fragments of this series - self.logger.debug("frag id %s has activity %s" % (new_fragid, - results_container[result_id][4][idx_loc])) - # rest of the info - single_row.extend(results_container[result_id][6:]) - if inc_source_sol: - single_row.append(source_file) - results_rows.append(single_row) - # print result_id, last_query_molid, last_query_fragid, "-->", new_fragid, series, n_obs, \ - # tot_series_obs, obs_prob, exptd_prob, enrichment, p_val, \ - # p_val_corrected, signif, results_container[result_id] - - if append is False: - return_dataframe = pd.DataFrame(results_rows, columns=header_row) - - else: - return_dataframe = 
pd.concat([return_dataframe, pd.DataFrame(results_rows, columns=header_row)], sort=False) - # return_dataframe = return_dataframe.rename_axis(None) - - # add enumerated product - #print(return_dataframe.shape) - if return_dataframe.shape[0] < 1: - return pd.DataFrame(columns=header_row) - else: - return_dataframe = self.enumerate_products(return_dataframe, 'QUERY_MOL_CONTEXT', 'NEW_FRAG_R') - - # add max activity for given context - def get_series_max_act(row): - return self.max_series_potency[self.refsmi_dict[row['QUERY_MOL_CONTEXT']]] - - return_dataframe['MAX_SERIES_ACT'] = return_dataframe.apply(lambda row: get_series_max_act(row), axis=1) - - # reorder headers - header_row.extend(['ENUMERATED_PRODUCT', 'NOVEL', 'MAX_SERIES_ACT']) - # IL replaced deprecated .reindex_axis with .reindex - return_dataframe = return_dataframe.reindex(header_row, axis=1) - # - return return_dataframe - - def write_series_dataframe(self, dataframe, csv_out): - """ - :param dataframe: input dataframe to write to file - :param csv_out: name of file to write df to - Only real use for this right now is for object level access, debugging, - avoiding pandas import at script wrapper level - """ - dataframe.to_csv(csv_out, index=False, float_format='%.3f') - - ############################################################################################ - # - # Methods to parse a set of series files from disk - # - ############################################################################################ - - def set_base_dir(self, base_dir): - """Set the base directory for this script to run from, all files stored here""" - - if os.path.isdir(base_dir): - - self.series_base_dir = base_dir - - if self.series_base_dir[-1] != "/": - self.series_base_dir += "/" - - self.logger.info('Base Directory set as %s' % self.series_base_dir) - # print ('Base Directory set as %s' % self.series_base_dir) - - else: - self.logger.warn('Fatal: Base Directory does not exist %s' % self.series_base_dir) - raise 
Exception('Error Base Directory does not exist') - - def get_series_filelist_from_dir(self): - """get list of *.series files from base_dir, - if present store in crc_raw_file_list & return True""" - # - if self.series_base_dir is None: - self.logger.warn('Fatal: Base Directory not set %s') - raise Exception('Error Base Directory not set') - - self.series_file_list = glob.glob(self.series_base_dir + '*.series') - self.logger.info('Found %d files' % len(self.series_file_list)) - - if len(self.series_file_list) >= 1: - return True - - else: - return False - - def parse_directory_of_series_files(self): - """ - Read all files in directory with .series extension and merges into one huge table - :param: dir_name: - :return: None - """ - if self.series_base_dir is None or len(self.series_file_list) < 1: - self.logger.warn('Fatal: Base Directory not set %s') - raise Exception('Error Base Directory not set') - - self.logger.info('Parsing dir of files from %s' % self.series_base_dir) - - self.ref_series_df = pd.DataFrame([], columns=['SERIES_ID', 'SERIES_SEQ_ID', 'CONTEXT', - 'FRAG', 'MOL_ID', 'ACTIVITY']) - - required_col = ['SERIES_ID', 'SERIES_SEQ_ID', 'CONTEXT', 'FRAG', 'MOL_ID', 'ACTIVITY'] - max_series_id = 0 - - for series_file in self.series_file_list: - - # print series_file - temp_df = pd.read_csv(series_file) # , index_col=False) - # print temp_df.columns - - # sanity check the data table for the columns we need - for col in required_col: - if col not in temp_df.columns: - raise Exception("Input CSV %s does not have required columns: %s" % (series_file, col)) - - # re-sequence the series ID's - if max_series_id == 0: - max_series_id = temp_df['SERIES_ID'].max() - else: - max_series_id = self.ref_series_df['SERIES_ID'].max() - # print max_series_id - - temp_df['SERIES_ID'] = temp_df['SERIES_ID'] + max_series_id - temp_df['SOURCE_FILE'] = os.path.basename(series_file) - - # py2>3 explicit sort=False added - self.ref_series_df = pd.concat([self.ref_series_df, 
temp_df], sort=False) - self.logger.info('Appended dataframe shape %s to master dataframe %s' % - (str(temp_df.shape), str(self.ref_series_df.shape))) - # print ('Appended dataframe shape %s to master dataframe %s' % (str(temp_df.shape), - # str(self.ref_series_df.shape))) - # print self.ref_series_df['SERIES_ID'].max() - - self.series_comparison_df = self.ref_series_df - - def setup_pregenerated_series_data_for_mms(self, dir_name): - """ - Runs a set of methods to read in a dir of series files ready for searching - :param dir_name: a directory of series files, full path - :return: - """ - self.logger.info('Setting up searching using pre-generated series data') - - self.set_base_dir(dir_name) - self.get_series_filelist_from_dir() - self.parse_directory_of_series_files() - self.series_from_file = True - - self.logger.info('Parsed data to series data structure of shape %s' % str(self.series_comparison_df.shape)) - - def convert_smi_to_id(smi_str): - """convert a fragment or context smiles to a reference id consistent with existing mmp object ids""" - - if smi_str in self.refsmi_dict: - frag_id = self.refsmi_dict[smi_str] - - else: - self.refsmi_id += 1 - self.refsmi_dict[smi_str] = self.refsmi_id - self.refsmi_dict[self.refsmi_id] = smi_str - frag_id = self.refsmi_id - - return frag_id - - # flip 'CONTEXT' and 'FRAG' columns to reference ID values - # print self.series_comparison_df.shape - # print self.series_comparison_df.head() - self.series_comparison_df['CONTEXT'] = self.series_comparison_df['CONTEXT'].apply( - lambda x: convert_smi_to_id(x)) - self.series_comparison_df['FRAG'] = self.series_comparison_df['FRAG'].apply(lambda x: convert_smi_to_id(x)) - - # rename them as ID columns as no longer raw smi strings - self.series_comparison_df.rename(columns={'CONTEXT': 'CONTEXT_ID', - 'FRAG': 'FRAG_ID'}, - inplace=True) - - # print self.series_comparison_df.to_csv() - - def enumerate_products(self, dataframe, context_col, fragment_col): - """ - Uses 
MMPEnumerateNewMols object to enumerate new molecules from ctx + frag pair - which are expected to already be labelled with isotopic labelling to highlight - the connection points - :param dataframe: - :param context_col: - :param fragment_col: - :return: - """ - # create dict - self.enumerated_products_smi = {} - for (ctx, frag) in zip(dataframe[context_col].tolist(), dataframe[fragment_col].tolist()): - self.enumerated_products_smi[(ctx, frag)] = None - - # function to return value from this dict: - def get_product(row): - return self.enumerated_products_smi[(row[context_col], row[fragment_col])] - - # function to return value for novelty - def get_novelty(row): - if row['ENUMERATED_PRODUCT'] in self.mol_smi_dict: - return self.mol_smi_dict[row['ENUMERATED_PRODUCT']] - else: - return True - - # write rxn files - enum_object = enum_mols.MMPEnumerateNewMols(self.logger) - enum_object.write_rxn_files() - enum_object.write_reactants_simple_dict(self.enumerated_products_smi) - enum_object.do_reactions() - - for cut_type, rtn_ctx, rtn_frag, new_mol in enum_object.yield_products_simple_dict_input(): - if (rtn_ctx, rtn_frag) in self.enumerated_products_smi: - self.enumerated_products_smi[(rtn_ctx, rtn_frag)] = new_mol - else: - self.logger.debug("got a return I was not expecting: %s, %s -> %s" % (rtn_ctx, rtn_frag, new_mol)) - - # standardise - temp_smifi = tempfile.NamedTemporaryFile(suffix=".smi", delete=False, encoding='utf-8', mode='wt') - std_smi_lookup = {} - # take the product smi and store as product_smi => None - arbitary_id = 0 - for smi_no_std in list(self.enumerated_products_smi.values()): - arbitary_id += 1 - std_smi_lookup[smi_no_std] = arbitary_id - std_smi_lookup[arbitary_id] = smi_no_std - temp_smifi.write(smi_no_std + " " + str(arbitary_id) + "\n") - temp_smifi.close() - - # - # send in a dict of old_smi => old_smi - # should turn this into old_smi => new_smi - self.logger.debug("Requested standardisation on %s" % temp_smifi.name) - new_smi_dict = 
self.generate_std_smiles(temp_smifi.name, smi_id_map='id_smi') - std_smi_lookup.update(new_smi_dict) - - # - for key, value in list(self.enumerated_products_smi.items()): - # print value, " >> ", std_smi_lookup[std_smi_lookup[value]] - self.enumerated_products_smi[key] = std_smi_lookup[std_smi_lookup[value]] - - # add the return enumerated mols to the df - dataframe['ENUMERATED_PRODUCT'] = dataframe.apply(lambda row: get_product(row), axis=1) - dataframe['NOVEL'] = dataframe.apply(lambda row: get_novelty(row), axis=1) - # - del enum_object - # - return dataframe - - def auto_search(self, m, n, strict_ordering=False, use_comparison_df=False): - """ - Method to automatically search a series dataframe for series that can be extended - to create molecules of activity the same as or greater than the best molecule in - the set. The method will remove m items from every series in the set of molecules. It - will then decompose this set of m items into every possible subset down to length m but - maintaining the sequence order. 
- :param m: the number of items removed from a given series to create the longest seed - :param n: the smallest number of items to be used in a query series, decomposed from m - :return: new idea compounds as iterator - """ - self.logger.info('Initialised auto_search with parameters m:%s, n:%s' % (m, n)) - - if n > m: - raise Exception("Error, m must be greater or equal to n") - else: - self.return_series_max_len = m - - series_ids = self.series_df['SERIES_ID'].unique() - all_results = pd.DataFrame(dtype='int', - columns=['RESULT_ID', 'SERIES_ID', 'RESULT_SERIES_SEQ_ID', 'RESULT_MOL_ID', - 'RESULT_CONTEXT_ID', 'RESULT_FRAG_ID', 'QUERY_CONTEXT_ID', 'QUERY_FRAG_ID', - 'QUERY_MOL_ID', 'QUERY_ORDER', 'RESULT_MOL_ACTIVITY']) - iteration = 0 - result_id = 0 - - # iterate over every matched series, use last m values as search query - for series_id in series_ids: - - series_fragids = self.series_df[self.series_df['SERIES_ID'] == series_id]['FRAG_ID'].tolist() - series_molids = self.series_df[self.series_df['SERIES_ID'] == series_id]['MOL_ID'].tolist() - query_context_id = self.series_df[self.series_df['SERIES_ID'] == series_id]['CONTEXT_ID'].max() - # self.logger.debug("Query context: %s" % - # self.refsmi_dict[self.series_df[self.series_df['SERIES_ID'] == series_id] - # ['CONTEXT_ID'].max()]) - - if len(series_fragids) > m: - series_fragids = series_fragids[-m:] - series_molids = series_molids[-m:] - - series_frag_n_mols = [tuple([val, series_molids[idx]]) for idx, val in enumerate(series_fragids)] - - # get every length of series we will generate between m and n then generate - # the actual ordered sub-series from a series using combinations, start with - # the longest series first as later we will drop/never process a series if it's - # series_id has been seen before, so take longest series first - for size in range(m, n - 1, -1): - - for query_tuples in combinations(series_frag_n_mols, size): - - iteration += 1 - query_fragids = [x[0] for x in query_tuples] - 
query_molids = [x[1] for x in query_tuples] - - # run search, remove the current series_id from result list - return_df = self.return_series_matching_idlist(query_fragids, 'FRAG_ID', - strict_ordering=strict_ordering, - use_comparison_df=use_comparison_df) - # - # search might return nothing, or might only return the series the query came from - if not return_df.empty: - return_df = return_df[return_df['SERIES_ID'] != series_id] - else: - # this should never happen for internal SAR transfer but can for external set - continue - # - if not return_df.empty: - - for sgl_rtn_series_id in return_df['SERIES_ID'].unique(): - - sgl_rtn_return_df = return_df[return_df['SERIES_ID'] == sgl_rtn_series_id] - - # only add if we've not seen it before - # We're processing series in reverse length order so we'll capture the longest series - # first and drop anything shorter with the same series_id. - # - if sgl_rtn_return_df['SERIES_ID'].max() not in all_results['SERIES_ID'].unique(): - - matching_series_fragids = sgl_rtn_return_df['FRAG_ID'].tolist() - - # check this matched series can be used to extend our existing series - if (len(matching_series_fragids) > len(query_fragids)) \ - and (matching_series_fragids[-1] not in query_fragids): - - result_id += 1 - # debugging - # print size, len(query_fragids), len(matching_series_fragids) - - # reformat results - matching_series_molids = sgl_rtn_return_df['MOL_ID'].tolist() - matching_series_seqids = sgl_rtn_return_df['SERIES_SEQ_ID'].tolist() - matching_series_act = sgl_rtn_return_df['ACTIVITY'].tolist() - # matching_series_frag_smi = [self.refsmi_dict[x] for x in matching_series_fragids] - matching_series_context_id = sgl_rtn_return_df['CONTEXT_ID'].unique()[0] - matching_series_id = sgl_rtn_return_df['SERIES_ID'].max() - if use_comparison_df: - matching_series_source = sgl_rtn_return_df['SOURCE_FILE'].unique()[0] - num_items = len(matching_series_molids) - - # don't know if single or double so skip this - # (ms_ctx1_id, ms_ctx1_id) 
= inv_cantor(matching_series_context_id) - # self.logger.debug("Result context: %s" % - # self.refsmi_dict[matching_series_context_id]) - - ################################ - # these are also added to the pandas table and flag which items in the matching - # series were part of the query and which were not (NaN). It's not that intelligent - # and might flag an item twice - matching_series_query_molids = [] - matching_series_query_fragids = [] - for idx, item in enumerate(matching_series_fragids): - try: - locn = query_fragids.index(item) - matching_series_query_fragids.append(item) - matching_series_query_molids.append(query_molids[locn]) - except: - matching_series_query_fragids.append(0) - matching_series_query_molids.append(0) - - # Need the query order added to the dataframe as we did non-ordered matching - matching_series_query_molids_order = [] - for idx, query_id in enumerate(matching_series_query_molids): - if query_id in query_molids: - matching_series_query_molids_order.append(query_molids.index(query_id) + 1) - else: - matching_series_query_molids_order.append(0) - - # print [result_id] * num_items - # print [matching_series_id] * num_items - # print matching_series_seqids - # print matching_series_molids - # print [matching_series_context_id] * num_items - # print 'matching_series_fragids ', matching_series_fragids - # print 'matching_series_query_fragids', matching_series_query_fragids - # print 'matching_series_query_molids', matching_series_query_molids - # print matching_series_act - - ################################ - # finally form a return dataframe from all the lists - reformatted_result_df = pd.DataFrame({ - 'RESULT_ID': [result_id] * num_items, - 'SERIES_ID': [matching_series_id] * num_items, - 'RESULT_SERIES_SEQ_ID': matching_series_seqids, - 'RESULT_MOL_ID': matching_series_molids, - 'RESULT_CONTEXT_ID': [matching_series_context_id] * num_items, - 'RESULT_FRAG_ID': matching_series_fragids, - 'QUERY_CONTEXT_ID': [query_context_id] * 
num_items, - 'QUERY_FRAG_ID': matching_series_query_fragids, - 'QUERY_MOL_ID': matching_series_query_molids, - 'QUERY_ORDER': matching_series_query_molids_order, - 'RESULT_MOL_ACTIVITY': matching_series_act - }, - columns=['RESULT_ID', 'SERIES_ID', 'RESULT_SERIES_SEQ_ID', - 'RESULT_MOL_ID', 'RESULT_CONTEXT_ID', - 'RESULT_FRAG_ID', 'QUERY_CONTEXT_ID', - 'QUERY_FRAG_ID', 'QUERY_MOL_ID', - 'QUERY_ORDER', 'RESULT_MOL_ACTIVITY'] - ) - if use_comparison_df: - reformatted_result_df['SOURCE_FILE'] = matching_series_source - # [matching_series_source] * num_items - - all_results = pd.concat([all_results, reformatted_result_df]) - - self.logger.debug("series %s, search iteration %s using frag_id list %s" % - (series_fragids, series_id, query_fragids)) - self.logger.debug("merging result_df (%s) from query %s to all_results (now %s)" % - (str(return_df.shape), series_fragids, str(all_results.shape))) - - else: - self.logger.debug("Skipped as already have this series in results set") - - else: - self.logger.debug("No results") - - # all_results = all_results.drop_duplicates().reset_index(drop=True) - # print all_results.columns - all_results = all_results.astype({'RESULT_ID': int, - 'SERIES_ID': int, - 'RESULT_SERIES_SEQ_ID': int, - 'RESULT_MOL_ID': int, - 'RESULT_CONTEXT_ID': int, - 'RESULT_FRAG_ID': int, - 'QUERY_CONTEXT_ID': int, - 'QUERY_FRAG_ID': int, - 'QUERY_MOL_ID': int, - 'QUERY_ORDER': int - }) - - self.logger.info('Auto search done') - return all_results - - def auto_search_fullscan(self, strict_ordering=False, use_comparison_df=False): - # TODO: describe this versus auto_search - """ """ - - self.running_fullscan = True - self.logger.info('Initialised auto_search_fullscan') - - result_id = 0 - all_results = pd.DataFrame(dtype='int', - columns=['RESULT_ID', 'SERIES_ID', 'RESULT_SERIES_SEQ_ID', 'RESULT_MOL_ID', - 'RESULT_CONTEXT_ID', 'RESULT_FRAG_ID', 'QUERY_CONTEXT_ID', 'QUERY_FRAG_ID', - 'QUERY_MOL_ID', 'QUERY_ORDER', 'RESULT_MOL_ACTIVITY']) - - 
############################ - # - # - self.logger.info("Getting frag lists for each series") - series_ids = self.series_df['SERIES_ID'].unique() - self.logger.info("Got %s series to work on" % len(series_ids)) - series_frags = {} - target_frags = {} - for series_id in series_ids: - # TODO: storing list as it is ordered, but set here would be faster to avoid conversion later? - series_frags[series_id] = self.series_df[self.series_df['SERIES_ID'] == series_id]['FRAG_ID'].tolist() - - ############################ - # - # - - # useful counters and things needed in loop: - total_num_series = len(series_ids) - trunc = total_num_series - self.return_series_max_len = self.min_series_length - - self.logger.info("Working on %s series, %s comparisons" % (total_num_series, - (total_num_series * total_num_series / 2))) - - if use_comparison_df: - target_series_ids = self.series_comparison_df['SERIES_ID'].unique() - target_series_df = self.series_comparison_df - for series_id in target_series_ids: - # TODO: storing list as it is ordered, but set here would be faster to avoid conversion later? 
- target_frags[series_id] = target_series_df[target_series_df['SERIES_ID'] == - series_id]['FRAG_ID'].tolist() - else: - target_series_ids = series_ids[-1 * trunc:] - target_series_df = self.series_df - target_frags = series_frags - - for series_id_a in series_ids: - trunc -= 1 - intersect_frags = [] - - # TODO: last comparison is last item versus self - for series_id_b in target_series_ids: - - # series_frags[series_id_a] - # series_frags[series_id_b] - interset_frags = set(series_frags[series_id_a]).intersection(set(target_frags[series_id_b])) - num_interset_frags = len(interset_frags) - - # - # This is an MMS pair, so now we need to process it - # - if num_interset_frags > self.min_series_length: - - if num_interset_frags > self.return_series_max_len: - self.return_series_max_len = deepcopy(num_interset_frags) - - # TODO: still have cases here where two series are identical and intersection same, so no extension - # print ">>>" - # print "series: ", series_id_a, " has items ", series_frags[series_id_a] - # print "series: ", series_id_b, " has items ", series_frags[series_id_b] - # print "with intersect size:", len(interset_frags), "and items ", interset_frags - - # ordered list of fragments for the series - series_a_df = self.series_df[self.series_df['SERIES_ID'] == series_id_a] - series_b_df = target_series_df[target_series_df['SERIES_ID'] == series_id_b] - - series_a_fragids = series_a_df['FRAG_ID'].tolist() - series_b_fragids = series_b_df['FRAG_ID'].tolist() - series_a_molids = series_a_df['MOL_ID'].tolist() - series_b_molids = series_b_df['MOL_ID'].tolist() - - # the intersection fragments, but ordered by the original series order - series_a_query = [x for x in series_a_fragids if x in interset_frags] - series_b_query = [x for x in series_b_fragids if x in interset_frags] - - series_a_df.loc[:, 'QUERY_FRAG_ID'] = [x if x in interset_frags else 0 for x in series_a_fragids] - series_b_df.loc[:, 'QUERY_FRAG_ID'] = [x if x in interset_frags else 0 for x in 
series_b_fragids] - - # indexes are zero indexed and we use them as a pseudo for sequence order so increment by 1 - series_a_df.loc[:, 'QUERY_SERIES_SEQID'] = [series_b_query.index(x) + 1 if x in interset_frags else - 0 for x in series_a_fragids] - series_b_df.loc[:, 'QUERY_SERIES_SEQID'] = [series_a_query.index(x) + 1 if x in interset_frags else - 0 for x in series_b_fragids] - - series_a_df['QUERY_MOL_ID'] = [series_b_molids[series_b_query.index(x)] if x in interset_frags else - 0 for x in series_a_fragids] - series_b_df['QUERY_MOL_ID'] = [series_a_molids[series_a_query.index(x)] if x in interset_frags else - 0 for x in series_b_fragids] - - # ensure we can extend the query series - # if so then reformat the df - if series_a_df['QUERY_SERIES_SEQID'].tolist()[-1] == 0: - - if use_comparison_df: - continue - else: - result_id += 1 - series_a_df['RESULT_ID'] = [result_id] * len(series_a_fragids) - series_a_df['QUERY_CONTEXT_ID'] = [series_b_df.loc[:, 'CONTEXT_ID'].max()] * \ - len(series_a_fragids) - # named new_df because an in place rename will affect second below if statement, - # would no longer find CONTEXT_ID column in series_a_df - new_df = series_a_df.rename(columns={'SERIES_SEQ_ID': 'RESULT_SERIES_SEQ_ID', - 'CONTEXT_ID': 'RESULT_CONTEXT_ID', - 'FRAG_ID': 'RESULT_FRAG_ID', - 'MOL_ID': 'RESULT_MOL_ID', - 'ACTIVITY': 'RESULT_MOL_ACTIVITY', - 'QUERY_SERIES_SEQID': 'QUERY_ORDER'}) - - # TODO: If this has use_comparison_df = True then we will not have any SOURCE_FILE column - # TODO value we could remove the above lines if use_comparison_df: continue the get and - # TODO print reverse transfer events but later code makes too many assumptions and fails - # ...but a result here indicates a series in target, can be extended by source, tag on - # extra data - # if use_comparison_df: - # new_df['SOURCE_FILE'] = ["TRANSFER_TO_" + series_b_df['SOURCE_FILE'].max()] * - # len(series_a_fragids) - - all_results = all_results.append(new_df, sort=False) - # print 
series_a_df.to_csv() - - if series_b_df['QUERY_SERIES_SEQID'].tolist()[-1] == 0: - - result_id += 1 - series_b_df['RESULT_ID'] = [result_id] * len(series_b_fragids) - series_b_df['QUERY_CONTEXT_ID'] = [series_a_df.loc[:, 'CONTEXT_ID'].max()] * \ - len(series_b_fragids) - series_b_df = series_b_df.rename(columns={'SERIES_SEQ_ID': 'RESULT_SERIES_SEQ_ID', - 'CONTEXT_ID': 'RESULT_CONTEXT_ID', - 'FRAG_ID': 'RESULT_FRAG_ID', - 'MOL_ID': 'RESULT_MOL_ID', - 'ACTIVITY': 'RESULT_MOL_ACTIVITY', - 'QUERY_SERIES_SEQID': 'QUERY_ORDER'}) - all_results = all_results.append(series_b_df, sort=False) - # print series_b_df.to_csv() - - # print "Wrapping up with table of shape: ", all_results.shape - self.logger.info('Auto search FULL done with final table shape %s' % str(all_results.shape)) - return all_results - - def auto_search_write(self, auto_search_result_df, out_csv): - """ - Writes the result of an auto_search run to chosen filename - :param auto_search_result_df: pandas df, should come from aut_search method return - :param out_csv: the file to write results to - :return: none, writes file to disk - """ - self.logger.info('Starting auto search and write') - all_result_ids = auto_search_result_df['RESULT_ID'].unique() - - # validation of df structure - required_col = ['RESULT_ID', 'SERIES_ID', 'RESULT_SERIES_SEQ_ID', 'QUERY_MOL_ID', 'RESULT_MOL_ID', - 'RESULT_CONTEXT_ID', 'QUERY_FRAG_ID', 'QUERY_MOL_ID', 'QUERY_CONTEXT_ID', 'RESULT_FRAG_ID', - 'QUERY_ORDER', 'RESULT_MOL_ACTIVITY'] - - for col in required_col: - if col not in auto_search_result_df.columns: - raise Exception("Input data table does not have required columns: %s" % col) - - # catch for empty table - if auto_search_result_df.shape[0] == 0: - print ("No results found") - return False - - iteration = 1 - return_df = None - - for result_id in all_result_ids: - - self.logger.info("Result, series ID %s from table size %s: " % (result_id, auto_search_result_df.shape[0])) - - sub_series_df = 
auto_search_result_df[auto_search_result_df['RESULT_ID'] == result_id] - - # get the original query mol_id_list in it's original query order - # it can be mis-ordered due to strict_order=False param on the search method - mol_id_list = list(zip(sub_series_df['QUERY_MOL_ID'].tolist(), sub_series_df['QUERY_ORDER'].tolist())) - mol_id_list = sorted(mol_id_list, key=lambda xx: xx[1]) - mol_id_list = [x[0] for x in mol_id_list if x[1] > 0] - - self.logger.debug('Merging results to CSV frame for iteration %s and dataframe %s' % - (iteration, str(sub_series_df.shape))) - - if iteration == 1: - return_df = self.return_scored_series_dataframe(mol_id_list, sub_series_df, return_df, append=False) - self.logger.debug('First iteration, sized at %s' % str(return_df.shape)) - iteration += 1 - else: - # as above but append=True - return_df = self.return_scored_series_dataframe(mol_id_list, sub_series_df, return_df, append=True) - self.logger.debug('Merge operation, sized at %s' % str(return_df.shape)) - iteration += 1 - - # return_df = self.enumerate_products(return_df, 'QUERY_MOL_CONTEXT', 'NEW_FRAG_R') - - return_df.to_csv(out_csv, index=False, float_format='%.3f') # , header=True) - self.logger.info('Completed write of auto_search results') - - -# -# unittest everything -# -class _TestMMPSeriesObjectClass(unittest.TestCase): - """Test class to test the object and methods""" - - @classmethod - def setUpClass(cls): - # - cls.maxDiff = None - - # setup test data - cls.temp_file_input_csv = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - cls.temp_file_input_csv_larger = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - cls.temp_file_input_csv_confusion = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - cls.temp_file_input_csv_anotherone = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - cls.temp_file_input_csv_double = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', 
mode='wt') - # output - cls.temp_file_output_series = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - cls.temp_file_output_seriessuggest = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - cls.temp_file_output_seriessuggest2 = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - cls.temp_file_output_autosearch = tempfile.NamedTemporaryFile(delete=False, encoding='utf-8', mode='wt') - # yet more for multi series file parse and search - cls.temp_dir_series_files = tempfile.mkdtemp(suffix='series') - - # setup a logger object - cls.mmplogger = logging.getLogger('mmpobjectclass_testlogger') - logging.disable(logging.CRITICAL) - - cls.test_dataset_goldeninput_csv_headers = \ - ['SMILES, ID, PIC50'] - - # first 8 smiles are taken from matsy paper JMC 2014 - # https://www.ncbi.nlm.nih.gov/pubmed/24601597 - # https://pubs.acs.org/doi/10.1021/jm500022q - cls.test_dataset_goldeninput_csv_data = { - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL89089 - 'CS(=O)(=O)c1ccc(C2=C(c3ccc(OC)cc3)CC3(C2)CC3)cc1, 001, 7.00': None, - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL88550 - 'Fc1c(OC)ccc(c1)C1=C(c2ccc(S(=O)(=O)C)cc2)CC2(C1)CC2, 002, 7.68': None, - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL88899 - 'Clc1c(OC)ccc(c1)C1=C(c2ccc(S(=O)(=O)C)cc2)CC2(C1)CC2, 003, 8.51': None, - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL86363 - 'Brc1c(OC)ccc(c1)C1=C(c2ccc(S(=O)(=O)C)cc2)CC2(C1)CC2, 004, 8.77': None, - # non chiral analogues of https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL390742 - 'O=C(OC)C1C(c2ccccc2)CC2N(C1CC2)C, 010, 8.30': None, - # non chiral analogues of https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL222705 - 'Fc1ccc(C2CC3N(C(C2C(=O)OC)CC3)C)cc1, 011, 8.00': None, - # non chiral analogues of https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL375888 - 'Clc1ccc(C2CC3N(C(C2C(=O)OC)CC3)C)cc1, 012, 7.77': None, - # non chiral analogues of 
https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL97887 - 'Brc1ccc(C2CC3N(C(C2C(=O)OC)CC3)C)cc1, 013, 8.89': None - } - - # a few more I made up to create some identical repeat series from - # different molecule ID's and therefore trigger frequency data output - cls.test_dataset_goldeninput_csv_data_extras = { - # halogen switch analogues of - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL1080278 - 'Clc1cc2c([n]c[n]c2Nc2ccccc2)cc1, 081, 6.80': None, - 'Clc1cc2c([n]c[n]c2Nc2ccc(F)cc2)cc1, 082, 7.20': None, - 'Clc1ccc(Nc2[n]c[n]c3c2cc(cc3)Cl)cc1, 083, 7.81': None, - 'Brc1ccc(Nc2[n]c[n]c3c2cc(Cl)cc3)cc1, 084, 8.42': None - } - - # even more to add confusion: - # (a) repeat R groups with different activity and different core - # (b) repeat identical activity mols with different R groups - # both require more than a simple sort on the series - cls.test_dataset_goldeninput_csv_data_extra_confusion = { - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL300021/ H analogue, append 00 to id - 'c1ccc(cc1)C2=C(CN(C2=O)c3ccccc3)c4ccc(cc4)S(=O)(=O)C, 30002100, 6.50': None, - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL300021/ - 'COc1ccc(cc1)C2=C(CN(C2=O)c3ccccc3)c4ccc(cc4)S(=O)(=O)C, 300021, 6.50': None, - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL298356 - 'CS(=O)(=O)c1ccc(cc1)C2=C(C(=O)N(C2)c3ccccc3)c4ccc(F)cc4, 298356, 6.90': None, - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL298356 Cl analogue, append 01 to id - 'CS(=O)(=O)c1ccc(cc1)C2=C(C(=O)N(C2)c3ccccc3)c4ccc(Cl)cc4, 29835601, 7.30': None, - # https://www.ebi.ac.uk/chembl/compound_report_card/CHEMBL298356 Br analogue, append 02 to id - 'CS(=O)(=O)c1ccc(cc1)C2=C(C(=O)N(C2)c3ccccc3)c4ccc(Br)cc4, 29835602, 7.91': None, - # made up analogue of CHEMBL86363 (004 above) to that should get - # transferred to any matched series - 'CCCOc1c(OC)ccc(c1)C1=C(c2ccc(S(=O)(=O)C)cc2)CC2(C1)CC2, 005, 8.99': None, - } - - # this set will only produce a double cut - # cox 
inhibitors again but taken from - # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3813081/ - # activity pIC50 data is synthetic - cls.test_dataset_goldeninput_csv_data_extras_doublecut = { - # compounds from the paper - 'CS(=O)(=O)c1ccc(cc1)c2cc(Br)sc2c3ccc(F)cc3, 697, 8.6': None, - 'CS(=O)(=O)c1ccc(cc1)c2nc([nH]c2c3ccc(F)cc3)C(F)(F)F, 12, 8.7': None, - 'CCSc1nnc(c2ccc(F)cc2)n1c3ccc(cc3)S(=O)(=O)C, 13, 8.8': None, - 'CS(=O)(=O)c1ccc(cc1)C2SCC(=O)N2c3ccc(F)cc3, 14, 8.9': None, - 'Cc1cnc2c(c3ccc(F)cc3)c(nn2c1C)c4ccc(cc4)S(=O)(=O)C, 33, 8.7': None, - 'CS(=O)(=O)c1ccc(cc1)c2nc3CCCCc3n2c4ccc(F)cc4, 36, 8.8': None, - 'CS(=O)(=O)c1ccc(cc1)C2Sc3ccccc3C(=O)N2c4ccc(F)cc4, 35, 8.9': None, - 'CCOc1ccc(cc1)c2c(nn3ncccc23)c4ccc(cc4)S(=O)(=O)C, 32, 9.0': None, - 'COc1ccc(cc1)c2nc3ccccc3nc2c4ccc(cc4)S(=O)(=O)C, 30, 9.1': None, - # to make a matched series with this we create synthetic data - # below are copies of above first 4 compounds but id's now prefixed with 90xxx - # and the activity data incremented by 1.0 - # and the p-Phenyl F is changed to a p-Phenyl Br - 'CS(=O)(=O)c1ccc(cc1)c2cc(Br)sc2c3ccc(Br)cc3, 90697, 9.6': None, - 'CS(=O)(=O)c1ccc(cc1)c2nc([nH]c2c3ccc(Br)cc3)C(F)(F)F, 9012, 9.7': None, - 'CCSc1nnc(c2ccc(Br)cc2)n1c3ccc(cc3)S(=O)(=O)C, 9013, 9.8': None, - 'CS(=O)(=O)c1ccc(cc1)C2SCC(=O)N2c3ccc(Br)cc3, 9014, 9.9': None - } - - ################################## - # - # write test data to temp files - # - ################################## - - # csv file - basic test - cls.temp_file_input_csv.write(', '.join(cls.test_dataset_goldeninput_csv_headers)+"\n") - for data in list(cls.test_dataset_goldeninput_csv_data.keys()): - cls.temp_file_input_csv.write(data+"\n") - cls.temp_file_input_csv.close() - - # extend the basic test data - cls.temp_file_input_csv_larger.write(', '.join(cls.test_dataset_goldeninput_csv_headers)+"\n") - for data in list(cls.test_dataset_goldeninput_csv_data.keys()): - cls.temp_file_input_csv_larger.write(data+"\n") - for data in 
list(cls.test_dataset_goldeninput_csv_data_extras.keys()): - cls.temp_file_input_csv_larger.write(data+"\n") - cls.temp_file_input_csv_larger.close() - - # full test with corner cases - cls.temp_file_input_csv_confusion.write(', '.join(cls.test_dataset_goldeninput_csv_headers)+"\n") - for data in list(cls.test_dataset_goldeninput_csv_data.keys()): - cls.temp_file_input_csv_confusion.write(data+"\n") - for data in list(cls.test_dataset_goldeninput_csv_data_extra_confusion.keys()): - cls.temp_file_input_csv_confusion.write(data+"\n") - cls.temp_file_input_csv_confusion.close() - - # double - cls.temp_file_input_csv_double.write(', '.join(cls.test_dataset_goldeninput_csv_headers) + "\n") - for data in list(cls.test_dataset_goldeninput_csv_data_extras_doublecut.keys()): - cls.temp_file_input_csv_double.write(data+"\n") - cls.temp_file_input_csv_double.close() - - # container for results data - cls.test_dataset_testresults = {} - - - @classmethod - def tearDownClass(cls): - """Cleanup for end of all tests""" - - os.remove(cls.temp_file_input_csv.name) - os.remove(cls.temp_file_input_csv_larger.name) - os.remove(cls.temp_file_input_csv_confusion.name) - os.remove(cls.temp_file_output_series.name) - os.remove(cls.temp_file_output_seriessuggest.name) - os.remove(cls.temp_file_output_seriessuggest2.name) - os.remove(cls.temp_file_output_autosearch.name) - - def setUp(cls): - """Setup object for clean reuse in further tests""" - - cls.temp_file_output_series = tempfile.NamedTemporaryFile(delete=True, encoding='utf-8', mode='wt') - # create empty mmp object each time - cls.test_mmp_series_object = MMPSeriesObjectClass(cls.mmplogger) - - def tearDown(cls): - """Tear down object for clean reuse in further tests""" - - # cls.test_mmp_series_object.clean_out_data_seriesobj() - # reusable data struct - cls.test_mmp_series_object.clean_out_data_seriesobj() - cls.test_dataset_testresults.clear() - # reusable results file - # os.remove(cls.temp_file_output_series.name) - - def 
test_iterator_mmp_series_numeric(cls): - """Test the generation of basic matched series""" - - cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.50001) - - for series_id, series_seq_id, ctx_id, frag_id_L, molid_L, molid_L_act in \ - cls.test_mmp_series_object._iterator_mmp_series_numeric(): - - cls.test_dataset_testresults[(series_id, series_seq_id, ctx_id, frag_id_L, molid_L, molid_L_act)] = None - - #print(cls.test_dataset_testresults) - cls.assertEqual({(1, 1, 8, 7, 1, 7.0): None, (1, 2, 8, 44, 2, 7.68): None, (1, 3, 8, 69, 3, 8.51): None, - (1, 4, 8, 91, 4, 8.77): None, (2, 1, 29, 24, 1, 7.0): None, (2, 2, 29, 34, 2, 7.68): None, - (2, 3, 29, 60, 3, 8.51): None, (2, 4, 29, 82, 4, 8.77): None, (3, 1, 107, 156, 12, 7.77): None, - (3, 2, 107, 134, 11, 8.0): None, (3, 3, 107, 106, 10, 8.3): None, - (3, 4, 107, 175, 13, 8.89): None, (4, 1, 122, 60, 12, 7.77): None, - (4, 2, 122, 34, 11, 8.0): None, (4, 3, 122, 24, 10, 8.3): None, - (4, 4, 122, 82, 13, 8.89): None}, - cls.test_dataset_testresults) - - def test_iterator_mmp_series_numeric_confusion(cls): - """Test the generation of matched series with confused input data""" - - cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv_confusion.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.50001) - for series_id, series_seq_id, ctx_id, frag_id_L, molid_L, molid_L_act in \ - cls.test_mmp_series_object._iterator_mmp_series_numeric(): - - cls.test_dataset_testresults[(series_id, series_seq_id, ctx_id, frag_id_L, molid_L, molid_L_act)] = None - - #print(cls.test_dataset_testresults) - cls.assertEqual({(1, 1, 8, 7, 1, 7.0): None, (1, 2, 8, 44, 2, 7.68): None, (1, 3, 8, 69, 3, 8.51): None, - (1, 4, 8, 91, 4, 8.77): None, (1, 5, 8, 285, 5, 8.99): None, (2, 1, 29, 24, 1, 7.0): None, - (2, 2, 29, 34, 2, 7.68): None, (2, 3, 29, 60, 3, 8.51): None, (2, 4, 29, 82, 4, 8.77): None, - (2, 5, 29, 286, 5, 8.99): None, (3, 1, 107, 156, 12, 7.77): None, - (3, 2, 
107, 134, 11, 8.0): None, (3, 3, 107, 106, 10, 8.3): None, - (3, 4, 107, 175, 13, 8.89): None, (4, 1, 122, 60, 12, 7.77): None, - (4, 2, 122, 34, 11, 8.0): None, (4, 3, 122, 24, 10, 8.3): None, - (4, 4, 122, 82, 13, 8.89): None, (5, 1, 194, 106, 30002100, 6.5): None, - (5, 1, 194, 7, 300021, 6.5): None, (5, 2, 194, 134, 298356, 6.9): None, - (5, 3, 194, 156, 29835601, 7.3): None, (5, 4, 194, 175, 29835602, 7.91): None, - (6, 1, 208, 24, 30002100, 6.5): None, (6, 1, 208, 9, 300021, 6.5): None, - (6, 2, 208, 34, 298356, 6.9): None, (6, 3, 208, 60, 29835601, 7.3): None, - (6, 4, 208, 82, 29835602, 7.91): None}, - cls.test_dataset_testresults) - - def test_iterator_mmp_series_numeric_double(cls): - """Test the generation of basic matched series""" - - cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv_double.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.45) - - for series_id, series_seq_id, ctx_id, frag_id_L, molid_L, molid_L_act in \ - cls.test_mmp_series_object._iterator_mmp_series_numeric(sgl_or_dbl='double'): - - cls.test_dataset_testresults[(series_id, series_seq_id, ctx_id, frag_id_L, molid_L, molid_L_act)] = None - - #print(cls.test_dataset_testresults) - cls.assertEqual( - {(1, 1, 290, 13, 697, 8.6): None, (1, 2, 290, 40, 12, 8.7): None, (1, 2, 290, 101, 33, 8.7): None, - (1, 3, 290, 61, 13, 8.8): None, (1, 3, 290, 118, 36, 8.8): None, (1, 4, 290, 83, 14, 8.9): None, - (1, 4, 290, 136, 35, 8.9): None, (2, 1, 24516, 13, 90697, 9.6): None, (2, 2, 24516, 224, 9012, 9.7): None, - (2, 3, 24516, 61, 9013, 9.8): None, (2, 4, 24516, 83, 9014, 9.9): None}, - cls.test_dataset_testresults) - - def test_generate_store_mmp_series(cls): - """Test the generation of matched series with confused input data""" - - cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv_confusion.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.50001) - - cls.test_mmp_series_object.generate_store_mmp_series() - # print cls.test_mmp_series_object.series_df.to_dict() - 
cls.assertEqual(len(cls.test_mmp_series_object.series_df.to_dict()['MOL_ID']), 28) - - cls.test_mmp_series_object.generate_store_mmp_series(sgl_or_dbl_or_both='double') - # print cls.test_mmp_series_object.series_df.to_dict() - cls.assertEqual(len(cls.test_mmp_series_object.series_df.to_dict()['MOL_ID']), 17) - - cls.test_mmp_series_object.generate_store_mmp_series(sgl_or_dbl_or_both='both') - # print cls.test_mmp_series_object.series_df.to_dict() - cls.assertEqual(len(cls.test_mmp_series_object.series_df.to_dict()['MOL_ID']), 45) - - def test_return_series_matching_idlist_frags(cls): - """ test query by frag id """ - - cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv_confusion.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.50001) - - cls.test_mmp_series_object.generate_store_mmp_series() - - result_ = cls.test_mmp_series_object.return_series_matching_idlist([24, 34, 60], - 'FRAG_ID') - #print(result_.to_dict()) - cls.assertDictEqual( - {'SERIES_ID': {5: 2, 6: 2, 7: 2, 8: 2, 9: 2, 14: 4, 15: 4, 16: 4, 17: 4, 23: 6, 24: 6, 25: 6, 26: 6, 27: 6}, - 'SERIES_SEQ_ID': {5: 1, 6: 2, 7: 3, 8: 4, 9: 5, 14: 1, 15: 2, 16: 3, 17: 4, 23: 1, 24: 1, 25: 2, 26: 3, - 27: 4}, - 'CONTEXT_ID': {5: 29, 6: 29, 7: 29, 8: 29, 9: 29, 14: 122, 15: 122, 16: 122, 17: 122, 23: 208, 24: 208, - 25: 208, 26: 208, 27: 208}, - 'FRAG_ID': {5: 24, 6: 34, 7: 60, 8: 82, 9: 286, 14: 60, 15: 34, 16: 24, 17: 82, 23: 24, 24: 9, 25: 34, - 26: 60, 27: 82}, - 'MOL_ID': {5: 1, 6: 2, 7: 3, 8: 4, 9: 5, 14: 12, 15: 11, 16: 10, 17: 13, 23: 30002100, 24: 300021, - 25: 298356, 26: 29835601, 27: 29835602}, - 'ACTIVITY': {5: 7.0, 6: 7.68, 7: 8.51, 8: 8.77, 9: 8.99, 14: 7.77, 15: 8.0, 16: 8.3, 17: 8.89, 23: 6.5, - 24: 6.5, 25: 6.9, 26: 7.3, 27: 7.91}}, - result_.to_dict()) - - def test_return_series_matching_idlist_frags_strictorder(cls): - """ test query by frag id - Repeat of above but strict ordering requested """ - 
cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv_confusion.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.50001) - - cls.test_mmp_series_object.generate_store_mmp_series() - - result_ = cls.test_mmp_series_object.return_series_matching_idlist([24, 34, 60], 'FRAG_ID', - strict_ordering=True) - #print(result_.to_dict()) - cls.assertDictEqual({'SERIES_ID': {5: 2, 6: 2, 7: 2, 8: 2, 9: 2, 23: 6, 24: 6, 25: 6, 26: 6, 27: 6}, - 'SERIES_SEQ_ID': {5: 1, 6: 2, 7: 3, 8: 4, 9: 5, 23: 1, 24: 1, 25: 2, 26: 3, 27: 4}, - 'CONTEXT_ID': {5: 29, 6: 29, 7: 29, 8: 29, 9: 29, 23: 208, 24: 208, 25: 208, 26: 208, - 27: 208}, - 'FRAG_ID': {5: 24, 6: 34, 7: 60, 8: 82, 9: 286, 23: 24, 24: 9, 25: 34, 26: 60, 27: 82}, - 'MOL_ID': {5: 1, 6: 2, 7: 3, 8: 4, 9: 5, 23: 30002100, 24: 300021, 25: 298356, - 26: 29835601, 27: 29835602}, - 'ACTIVITY': {5: 7.0, 6: 7.68, 7: 8.51, 8: 8.77, 9: 8.99, 23: 6.5, 24: 6.5, 25: 6.9, - 26: 7.3, 27: 7.91}}, - result_.to_dict()) - - def test_return_series_matching_idlist_mols(cls): - """ test search for series by mol id """ - - cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv_confusion.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.50001) - - cls.test_mmp_series_object.generate_store_mmp_series() - - result_ = cls.test_mmp_series_object.return_series_matching_idlist([2, 3, 4], 'MOL_ID') - - #print(result_.to_dict()) - cls.assertDictEqual({'SERIES_ID': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 2, 6: 2, 7: 2, 8: 2, 9: 2}, - 'SERIES_SEQ_ID': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 1, 6: 2, 7: 3, 8: 4, 9: 5}, - 'CONTEXT_ID': {0: 8, 1: 8, 2: 8, 3: 8, 4: 8, 5: 29, 6: 29, 7: 29, 8: 29, 9: 29}, - 'FRAG_ID': {0: 7, 1: 44, 2: 69, 3: 91, 4: 285, 5: 24, 6: 34, 7: 60, 8: 82, 9: 286}, - 'MOL_ID': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 1, 6: 2, 7: 3, 8: 4, 9: 5}, - 'ACTIVITY': {0: 7.0, 1: 7.68, 2: 8.51, 3: 8.77, 4: 8.99, 5: 7.0, 6: 7.68, 7: 8.51, 8: 8.77, - 9: 8.99}}, - result_.to_dict()) - - def test_search_for_mms_to_extend_molids(cls): - """""" - 
cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv_confusion.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.50001) - - cls.test_mmp_series_object.generate_store_mmp_series() - - # for this dataset, strict_ordering = False will get the same results due to the simplicity of the dataset - result_ = cls.test_mmp_series_object.search_for_mms_to_extend_molids([30002100, 298356, 29835601, 29835602]) - # NaN becomes nan on return/dict convert so simpler to reduce to zero for comparison - # result_.fillna(value=0, inplace=True) - #print(result_.to_dict()) - cls.assertDictEqual( - {'RESULT_ID': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1}, 'RESULT_SERIES_SEQ_ID': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, - 'RESULT_MOL_ID': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5}, 'RESULT_CONTEXT_ID': {0: 29, 1: 29, 2: 29, 3: 29, 4: 29}, - 'RESULT_FRAG_ID': {0: 24, 1: 34, 2: 60, 3: 82, 4: 286}, - 'QUERY_FRAG_ID': {0: 24, 1: 34, 2: 60, 3: 82, 4: 0}, - 'QUERY_MOL_ID': {0: 30002100, 1: 298356, 2: 29835601, 3: 29835602, 4: 0}, - 'RESULT_MOL_ACTIVITY': {0: 7.0, 1: 7.68, 2: 8.51, 3: 8.77, 4: 8.99}, - 'QUERY_CONTEXT_ID': {0: 208, 1: 208, 2: 208, 3: 208, 4: 208}}, result_.to_dict()) - - def test_iterator_series_pstats(cls): - """test it""" - cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv_confusion.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.50001) - - cls.test_mmp_series_object.generate_store_mmp_series() - - result_df = cls.test_mmp_series_object.return_series_matching_idlist([24, 34, 82], 'FRAG_ID') - - results = {} - for series, observations, total_series_obs, observed_prob, \ - expected_prob, enrichment, p_value, p_value_corrected, significant \ - in cls.test_mmp_series_object._iterator_series_pstats([24, 34, 82], result_df): - - results[series] = [observations, total_series_obs, observed_prob, - expected_prob, enrichment, p_value, p_value_corrected, significant] - # print to regenerate/print the series data line by line - # print series, results[series] - - #print(results) - 
cls.assertDictEqual({(24, 34, 82): [2, 3, 0.6666666666666666, 0.16666666666666666, 4.0, 0.07407407407407407, - 0.37037037037037035, False], - (34, 24, 82): [1, 3, 0.3333333333333333, 0.16666666666666666, 2.0, 0.42129629629629617, - 2.106481481481481, False]}, - results) - - def test_auto_search_double(cls): - """Testing the auto search method""" - cls.test_mmp_series_object.setup_mmp_data_for_mms(cls.temp_file_input_csv_double.name, - 'SMILES', 'ID', 'PIC50', - 3, 0.50001) - - cls.test_mmp_series_object.generate_store_mmp_series(sgl_or_dbl_or_both='double') - - # print cls.test_mmp_series_object.series_df.to_csv() - result_df = cls.test_mmp_series_object.auto_search(5, 3, strict_ordering=True) - - #print(result_df.to_dict()) - cls.assertEqual({'RESULT_ID': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1}, - 'SERIES_ID': {0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1}, - 'RESULT_SERIES_SEQ_ID': {0: 1, 1: 2, 2: 2, 3: 3, 4: 3, 5: 4, 6: 4}, - 'RESULT_MOL_ID': {0: 697, 1: 12, 2: 33, 3: 13, 4: 36, 5: 14, 6: 35}, - 'RESULT_CONTEXT_ID': {0: 290, 1: 290, 2: 290, 3: 290, 4: 290, 5: 290, 6: 290}, - 'RESULT_FRAG_ID': {0: 13, 1: 40, 2: 104, 3: 63, 4: 121, 5: 86, 6: 139}, - 'QUERY_CONTEXT_ID': {0: 25185, 1: 25185, 2: 25185, 3: 25185, 4: 25185, 5: 25185, 6: 25185}, - 'QUERY_FRAG_ID': {0: 13, 1: 0, 2: 0, 3: 63, 4: 0, 5: 86, 6: 0}, - 'QUERY_MOL_ID': {0: 90697, 1: 0, 2: 0, 3: 9013, 4: 0, 5: 9014, 6: 0}, - 'QUERY_ORDER': {0: 1, 1: 0, 2: 0, 3: 2, 4: 0, 5: 3, 6: 0}, - 'RESULT_MOL_ACTIVITY': {0: 8.6, 1: 8.7, 2: 8.7, 3: 8.8, 4: 8.8, 5: 8.9, 6: 8.9}}, - result_df.to_dict()) - - -if __name__ == '__main__': - unittest.main() diff --git a/contrib/script/py/mmp/mmp_stats_functions.py b/contrib/script/py/mmp/mmp_stats_functions.py deleted file mode 100755 index cfadcaf2..00000000 --- a/contrib/script/py/mmp/mmp_stats_functions.py +++ /dev/null @@ -1,665 +0,0 @@ -################################################################### -""" Summary: Useful stats functions for working with MMP data - -""" 
-################################################################### -import numpy as np -import pandas as pd - -from math import sqrt, log10 -from scipy.stats import t - -# things needed for unit testing only -import unittest -import tempfile - - -############################################################ -# -# Custom aggregation functions -# -# See unit tests for usage as well as mmp_pairs_objects for usage -# -############################################################ - -def diffn(x, level, inc_low_n_vals=False): - """This function was developed for use in - aggregating certain MMP data such as Microsomal Metabolic Turnover (diff60) - or Single Point CYPxxx Inhibition (Diff50 of Est pIC50). Note that it will - return Nan for array with <3 values""" - - n_ = len(x) - - # timing data inc_low_n_vals=False - # ncalls tottime percall cumtime percall - # 20000 0.169 0.000 7.672 0.000 - # 20000 0.170 0.000 7.569 0.000 - # 20000 0.171 0.000 7.733 0.000 - # 20000 0.169 0.000 7.752 0.000 - # - # - if n_ < 3 and inc_low_n_vals is False: - return np.nan, np.nan, np.nan - - # Baseline with full diffn calc on n<3 - # ncalls tottime percall cumtime percall - # 20000 0.339 0.000 13.810 0.001 - # 20000 0.344 0.000 13.591 0.001 - # 20000 0.333 0.000 13.691 0.001 - # 20000 0.320 0.000 13.551 0.001 - # - # adding below catch for n==1 and inc_low_n_vals is True - # ncalls tottime percall cumtime percall - # 20000 0.257 0.000 10.829 0.001 - # 20000 0.257 0.000 10.948 0.001 - # 20000 0.268 0.000 10.984 0.001 - # 20000 0.261 0.000 11.072 0.001 - # - # ---------------------------------------- - # 2019-04-05 Baseline (hardware changes) - # ncalls tottime percall cumtime percall - # 20000 0.165 0.000 6.151 0.000 - # 20000 0.159 0.000 5.890 0.000 - # 20000 0.159 0.000 6.053 0.000 - # 20000 0.174 0.000 6.673 0.000 - # - # edits to add round(x, 4) to all returns - # - # ncalls tottime percall cumtime percall - # 20000 0.176 0.000 6.191 0.000 - # 20000 0.179 0.000 6.470 0.000 - # 20000 
0.176 0.000 6.100 0.000 - # 20000 0.179 0.000 6.243 0.000 - # - elif n_ == 1 and inc_low_n_vals is True: - # - avg_ = np.average(x) - lx_ = log10(level / (100.0 - level)) - a_ = (10.0 ** (avg_ + lx_)) - diffn = round(100.0 * a_ / (1.0 + a_) - level, 4) - - return np.nan, diffn, np.nan - - avg_ = np.average(x) - sd_ = np.std(x, ddof=1) - interval_ = t.interval(0.9, df=(n_ - 1))[1] * sd_ / sqrt(n_) - low90_ = avg_ - interval_ - upp90_ = avg_ + interval_ - lx_ = log10(level / (100.0 - level)) - - try: - a_ = (10.0 ** (avg_ + lx_)) - diffn = round(100.0 * a_ / (1.0 + a_) - level, 4) - except: - diffn = np.nan - - try: - b_ = (10.0 ** (low90_ + lx_)) - diffn_low = round(100.0 * b_ / (1.0 + b_) - level, 4) - except: - diffn_low = np.nan - - try: - c_ = (10.0 ** (upp90_ + lx_)) - diffn_upp = round(100.0 * c_ / (1.0 + c_) - level, 4) - except: - diffn_upp = np.nan - - # convert to pandas Series to allow return of column labels (index) - # worked once! but not twice :-) - # s_ = pd.Series([diffn, diffn_upp, diffn_low], index=['diffn', 'diffn_upp', 'diffn_low']) - - return diffn_low, diffn, diffn_upp - - -def diffn_list_rtn(n, inc_low_n_vals): - """returns a list containing: - [diffn, diffn_upp, diffn_low]""" - def diff_(x): - # note below returns full list: [diffn, diffn_upp, diffn_low] - return diffn(x, n, inc_low_n_vals) - diff_.__name__ = 'diff%s_all' % n - return diff_ - - -def diffn_agg(n): - """returns only diffn""" - def diff_(x): - return diffn(x, n)[1] - diff_.__name__ = 'diff%s' % n - return diff_ - - -def diffn_agg_upp(n): - """Returns only diffn_upp""" - def diff_(x): - return diffn(x, n)[2] - diff_.__name__ = 'diff%s_upp' % n - return diff_ - - -def diffn_agg_low(n): - """Returns only diffn_upp""" - def diff_(x): - return diffn(x, n)[0] - diff_.__name__ = 'diff%s_low' % n - return diff_ - - -##################################################### -# -# An alternate aggregation function for use with data such as -# Unbound Microsomal Intrinsic Clearance (rClint-u) 
where -# we want to calculate a 'Fold Change' which is the mean diff -# in log scale for Pair_L/Pair_R => Index. Additionally the -# result becomes -10 if negative 0.1 change) -# -##################################################### - -def _mean_diff_invlog(x, inc_low_n_vals=False): - """Aggregate function for use with deltas in log scale such as - Unbound Microsomal Intrinsic Clearance (rClint-u) where we want - to calculate a 'Fold Change' which is the mean diff in log - scale for Pair_L/Pair_R => Index. Additionally the result - becomes -10 if negative 0.1 change. It is expected that the - input is log transformed but the output will be transformed back - into original units via inverse log transform. Equivalent to the - Mean ratio or Fold Change or Mean difference - """ - - n_ = len(x) - - # These two lines have been speed tested with external code found in mmp_stats_functions_timer.py - # This function was run 20,000 times on lists of randomly generated but fixed seed numbers. Of these - # 20K lists, 10K are of length <3 and rest are of length between 3 to 100. Execution time is measured - # by cProfile and the cumtime (cumulative total) is reported below. Run 4 times to smooth out cpu peaks. - # The data is now identical in each run so we can compare times effectively/reproducibly. 
- # - # (1) Without 'CRITICAL TWO LINES': - # ncalls tottime percall cumtime percall - # 20000 0.269 0 13.744 0.001 - # 20000 0.262 0 13.668 0.001 - # 20000 0.265 0 13.546 0.001 - # 20000 0.273 0 13.662 0.001 - # - # (2) With line "if n_ < 3: return np.nan, np.nan, np.nan": - # ncalls tottime percall cumtime percall - # 20000 0.139 0 7.604 0 - # 20000 0.137 0 7.598 0 - # 20000 0.137 0 7.549 0 - # 20000 0.14 0 7.599 0 - # - # (3)(a) With line "if n_ < 3 and inc_low_n_vals is False: return np.nan, np.nan, np.nan": - # and ensuring call with inc_low_n_vals=False - # ncalls tottime percall cumtime percall - # 20000 0.148 0.000 8.060 0.000 - # 20000 0.141 0.000 7.561 0.000 - # 20000 0.141 0.000 7.654 0.000 - # 20000 0.152 0.000 7.717 0.000 - # - # (3)(b) With line "if n_ < 3 and inc_low_n_vals is False: return np.nan, np.nan, np.nan": - # and ensuring call with inc_low_n_vals=True <----- *NB* - # ncalls tottime percall cumtime percall - # 20000 0.270 0.000 13.642 0.001 - # 20000 0.273 0.000 13.872 0.001 - # 20000 0.275 0.000 13.919 0.001 - # 20000 0.266 0.000 13.572 0.001 - # - # CRITICAL 2 LINES - # if n_ < 3 and inc_low_n_vals is False: - # return np.nan, np.nan, np.nan - - # having optimised the above and chosen option (3) we now add this line to catch n_=1 cases - # and test call with inc_low_n_vals=True --> The results are good wso we'll leave the line in - # ncalls tottime percall cumtime percall - # 20000 0.195 0.000 10.806 0.001 - # 20000 0.195 0.000 10.739 0.001 - # 20000 0.198 0.000 10.871 0.001 - # 20000 0.216 0.000 10.850 0.001 - # - # and test call with inc_low_n_vals=False - # 20000 0.142 0.000 7.757 0.000 - # 20000 0.143 0.000 7.756 0.000 - # 20000 0.150 0.000 8.116 0.000 - # 20000 0.145 0.000 7.787 0.000 - # - # elif n_ == 1 and inc_low_n_vals is True: - # return np.nan, x[0], np.nan - - # and finally, having fixed the above two lines and confirmed that they do improve performance (speed) - # test that the below change also helps - # test call with 
inc_low_n_vals=True --> The results are good wso we'll leave the line in - # ncalls tottime percall cumtime percall - # 20000 0.156 0.000 7.839 0.000 - # 20000 0.150 0.000 7.858 0.000 - # 20000 0.150 0.000 7.689 0.000 - # 20000 0.148 0.000 7.864 0.000 - # - # test call with inc_low_n_vals=False - # ncalls tottime percall cumtime percall - # 20000 0.147 0.000 7.756 0.000 - # 20000 0.143 0.000 7.573 0.000 - # 20000 0.142 0.000 7.638 0.000 - # 20000 0.143 0.000 7.621 0.000 - # - # elif n_ == 2 and inc_low_n_vals is True: - # return np.nan, np.average(x), np.nan - - # Here's a version that actually works. Previous one fails - # as can't address x as array so x[0] fails - # - # ncalls tottime percall cumtime percall - # 20000 0.151 0.000 7.798 0.000 - # 20000 0.158 0.000 8.155 0.000 - # 20000 0.153 0.000 7.887 0.000 - # 20000 0.149 0.000 7.726 0.000 - # - # ncalls tottime percall cumtime percall - # 20000 0.144 0.000 7.745 0.000 - # 20000 0.141 0.000 7.537 0.000 - # 20000 0.145 0.000 7.822 0.000 - # 20000 0.139 0.000 7.618 0.000 - # - if n_ < 3: - if inc_low_n_vals is False: - return np.nan, np.nan, np.nan - else: - return np.nan, (10 ** np.average(x)), np.nan - - # Mean of these deltas in log scale - avg_ = np.average(x) - # sd of these deltas in log scale - sd_ = np.std(x, ddof=1) - int_ = t.interval(0.9, df=(n_ - 1))[1] - - low90_ = avg_ - int_ * (sd_ / sqrt(n_)) - upp90_ = avg_ + int_ * (sd_ / sqrt(n_)) - - # need to reverse log: - fold_change = round((10 ** avg_), 4) - fold_change_upper90 = round((10 ** upp90_), 4) - fold_change_lower90 = round((10 ** low90_), 4) - - return fold_change_lower90, fold_change, fold_change_upper90 - - -def _mean_diff(x, inc_low_n_vals=False): - """Same as fold_change but no inverse log function - after aggregation. 
For use with data like LogP/D - """ - - n_ = len(x) - - # see timing data for _mean_diff_inv_log version - if n_ < 3: - if inc_low_n_vals is False: - return np.nan, np.nan, np.nan - else: - return np.nan, np.average(x), np.nan - - # Mean of these deltas in log scale - avg_ = round(np.average(x), 4) - # sd of these deltas in log scale - sd_ = np.std(x, ddof=1) - int_ = t.interval(0.9, df=(n_ - 1))[1] - - low90_ = round(avg_ - int_ * (sd_ / sqrt(n_)), 4) - upp90_ = round(avg_ + int_ * (sd_ / sqrt(n_)), 4) - - return low90_, avg_, upp90_ - - -# Added these to allow the previous mean_diff and mean_diff_invlog functions to be used -# with new keyword args, specifically to print out all low n value results. Returning the -# low n value results slows the function down and they are not useful in some MMP analysis -# but in other cases such as MMP prediction work we still need these data points. -def mean_diff(inc_low_n_vals): - def mean_diff_(x): - # note below returns full list: [diffn, diffn_upp, diffn_low] - return _mean_diff(x, inc_low_n_vals) - mean_diff_.__name__ = 'mean_diff' - return mean_diff_ - - -def mean_diff_invlog(inc_low_n_vals): - def mean_diff_invlog_(x): - # note below returns full list: [diffn, diffn_upp, diffn_low] - return _mean_diff_invlog(x, inc_low_n_vals) - mean_diff_invlog_.__name__ = 'mean_diff_invlog' - return mean_diff_invlog_ - - -##################################################### -# -# More useful stuff for counting number of positive -# or negative values in df -# -##################################################### - -def n_neg_diff(x): - return sum(n < 0 for n in x) - - -def n_pos_diff(x): - return sum(n > 0 for n in x) - - -##################################################### -# -# Functions to use with categorical data -# -##################################################### -def category_moved_up_one(x): - return sum(n == 1 for n in x) - - -def category_moved_up_one_pct(x): - return round(float(sum(n == 1 for n in 
x))/float(len(x))*100, 4) - - -def category_moved_down_one(x): - return sum(n == -1 for n in x) - - -def category_moved_down_one_pct(x): - return round(float(sum(n == -1 for n in x))/float(len(x))*100, 4) - - -def category_no_change(x): - return sum(n == 0 for n in x) - - -def category_no_change_pct(x): - return round(float(sum(n == 0 for n in x))/float(len(x))*100, 4) - - -def category_changed_class(x): - return sum(n != 0 for n in x) - - -def category_changed_class_pct(x): - return round(float(sum(n != 0 for n in x))/float(len(x))*100, 4) - - -##################################################### -# -# unittest everything -# -##################################################### - -class _TestMMPStatsFunctions(unittest.TestCase): - """Test class to test the object and methods""" - - def setUp(self): - # - self.maxDiff = None - - self.test_data_diff = [-0.542, -3.043, 0.264, 0.094, -0.262, 0.344, 0.769, 0.811, -1.350, -1.475, -0.027, - -3.200, -3.345, -2.950] - - self.test_dataframe = pd.DataFrame({ - 'A': ['aa', 'ab', 'ac', 'ad', 'aa', 'ab', 'ac', 'ad', 'aa', 'aa', 'ab'], - 'B': [-0.542, -3.043, 0.264, 0.094, -0.262, 0.344, 0.094, -0.262, -0.555, -0.54, -0.27], - 'C': [-3.043, 0.264, 0.094, -0.262, 0.344, 0.769, 0.094, -0.262, -3.001, -3.10, 0.35], - 'D': [0.264, 0.094, -0.262, 0.344, 0.769, 0.811, 0.094, -0.262, -0.260, -3.001, 0.100], - 'E': [-0.262, 0.344, 0.769, 0.811, -1.350, -1.475, 0.094, -0.262, -0.254, -0.254, 0.901], - 'F': [-0.262, 0.344, 0.769, 0.811, -1.350, -1.475, 0.094, -0.262, -0.256, -0.206, 0.344] - }) - - self.test_data_diff_goldresult_diffn = -46.791482 - self.test_data_diff_goldresult_diffn_upp = -14.72599757 - self.test_data_diff_goldresult_diffn_low = -57.27662627 - - self.test_dataframe_goldenresult = pd.DataFrame({ - 'A': ['aa', 'ab', 'ac', 'ad'], - 'C_count': [4, 3, 2, 2], - 'C_mean': [-2.200, 0.461, 0.094, -0.262], - 'C_diff60': [-59.0624, 21.2594, np.nan, np.nan], - 'C_diff60_low': [-59.9905, 0.3045, np.nan, np.nan], - 'C_diff60_upp': 
[-11.5932, 32.5238, np.nan, np.nan] - }) - self.test_dataframe_goldenresult.sort_index(axis=1, inplace=True) - - self.test_diff_2_inputfile = tempfile.NamedTemporaryFile(delete=False) - - self.test_diff_2_input = pd.DataFrame({ - 'FRAG_R': ['O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O=[1CH]C', - 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', '[1CH3]N1CCO[2CH2]C1'], - 'FRAG_L': ['s1[1cH][n]cc1', 's1[1cH][n]cc1', 's1[1cH][n]cc1', 's1[1cH][n]cc1', 's1[1cH][n]cc1', - 's1[1cH][n]cc1', 's1[1cH][n]cc1', 's1[1cH][n]cc1', 's1[1cH][n]cc1', 's1[1cH][n]cc1', - 's1[1cH][n]cc1', 's1[1cH][n]cc1', 'O=[1CH]N1C[2CH2]CCC1'], - 'logitM_DIFF': [-3.380, -2.895, -3.380, -1.282, -2.170, -0.226, -1.291, -1.715, -0.798, -1.378, -0.738, - -2.426, 7] - }) - - self.test_diff_2_goldenout = pd.DataFrame({ - 'FRAG_L': ['s1[1cH][n]cc1', 's1[1cH][n]cc1', 'O=[1CH]N1C[2CH2]CCC1'], - 'FRAG_R': ['O[1CH]=O', 'O=[1CH]C', '[1CH3]N1CCO[2CH2]C1'], - 'diff60': [-59.108675, -54.253683, np.nan], - 'diff60_low90': [-59.918216, -58.174201, np.nan], - 'diff60_upper90': [-51.007039, -43.343025, np.nan], - 'num_vals': [6, 6, 1] - }) - - self.test_diff_2_goldenout_low_n_val = pd.DataFrame({ - 'FRAG_L': ['s1[1cH][n]cc1', 's1[1cH][n]cc1', 'O=[1CH]N1C[2CH2]CCC1'], - 'FRAG_R': ['O[1CH]=O', 'O=[1CH]C', '[1CH3]N1CCO[2CH2]C1'], - 'diff60': [-59.108675, -54.253683, 7.00000], - 'diff60_low90': [-59.918216, -58.174201, np.nan], - 'diff60_upper90': [-51.007039, -43.343025, np.nan], - 'num_vals': [6, 6, 1] - }) - - self.test_foldchange_input = pd.DataFrame({ - 'FRAG_L': ['O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', - 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', - 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', - 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', '[1CH3]N1CCO[2CH2]C1', 'c1ccccc1', 'c1ccccc1'], - 'FRAG_R': ['O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 
'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', - 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', - 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', - 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]N1C[2CH2]CCC1', 'c1ccccc1', 'c1ccccc1'], - 'pLog_rClintu_DIFF': [-1.99704061, -1.806422468, -1.827428534, -1.90152502, -1.249553199, -0.664484291, - -1.857100559, -1.5845214, -2.667472973, -1.479999458, -1.635125098, -2.415917968, - -1.823722494, -2.52674842, -1.418294469, -1.19056738, -1.955927983, -1.285681124, - -1.27009994, -1.395897893, -1.641798956, -2.063255775, -1.529434468, -2.43015166, - -1.620558471, -1.2, 0.2, 0.3] - }) - - # data from R: 77.4282849546501, 53.6474674395413,37.1705348292594 - self.test_foldchange_goldenout = pd.DataFrame({ - 'FRAG_L': ['O[1CH]=O', '[1CH3]N1CCO[2CH2]C1', 'c1ccccc1'], - 'FRAG_R': ['O=[1CH]C', 'O=[1CH]N1C[2CH2]CCC1', 'c1ccccc1'], - 'pLog_rClintu_DIFF_count': [25, 1, 2], - 'pLog_rClintu_DIFF_mean_diff_invlog': [(0.0129, 0.0186, 0.0269), - (np.nan, np.nan, np.nan), - (np.nan, np.nan, np.nan)] - }) - - self.test_mean_diff_goldenout = pd.DataFrame({ - 'FRAG_L': ['[1CH3]N1CCO[2CH2]C1'], - 'FRAG_R': ['O=[1CH]N1C[2CH2]CCC1'], - 'Solubility_pH2_mMolar_DIFF_count': [15], - 'Solubility_pH2_mMolar_DIFF_mean_diff': [(-2.4688, -1.9984, -1.528)] - }) - - self.test_foldchange_goldenout_inc_low_n_vals = pd.DataFrame({ - 'FRAG_L': ['O[1CH]=O', '[1CH3]N1CCO[2CH2]C1'], - 'FRAG_R': ['O=[1CH]C', 'O=[1CH]N1C[2CH2]CCC1'], - 'pLog_rClintu_DIFF_count': [25, 1], - 'pLog_rClintu_DIFF_mean_diff_invlog': [(0.012915177, 0.018640209, 0.026903030), - (np.nan, -1.20000, np.nan)] - }) - - self.test_foldchange_input_2 = pd.DataFrame({ - 'FRAG_L': ['[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1', - '[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1', - '[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1', 
'[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1', - '[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1'], - 'FRAG_R': ['O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', - 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', - 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', - 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1'], - 'Solubility_pH2_mMolar_DIFF': [-0.02380751, -2.648036977, -0.855180236, -2.612320185, -2.732343491, - -0.052453718, -2.240123595, -1.344902599, -2.532140934, -2.819401653, - -2.545212373, -2.832473093, -2.920275198, -1.088678392, -2.728023456] - # 'Solubility_pH2_mMolar_DIFF': [0.016217488, -2.030728258, -1.747948595, -1.598866354, -1.936057396, \ - # 0.045988318, -1.764970198, -1.948350161, -2.027001232, -2.029993245, \ - # -2.020599475, -2.023591488, -2.013101327, -1.599220664, -2.070137256] - }) - - # data from R: - self.test_foldchange_goldenout_2 = pd.DataFrame({ - 'FRAG_L': ['[1CH3]N1CCO[2CH2]C1'], - 'FRAG_R': ['O=[1CH]N1C[2CH2]CCC1'], - 'Solubility_pH2_mMolar_DIFF_count': [15], - 'Solubility_pH2_mMolar_DIFF_mean_diff_invlog': [(0.0034, 0.01, 0.0297)] - }) - - # categorical data - self.test_categorical_dataframe = pd.DataFrame({ - 'FRAG_L': ['O[1CH]=O', 'O[1CH]=O', 'O[1CH]=O', '[1CH3]N1CCO[2CH2]C1', '[1CH3]N1CCO[2CH2]C1'], - 'FRAG_R': ['O=[1CH]C', 'O=[1CH]C', 'O=[1CH]C', 'O=[1CH]N1C[2CH2]CCC1', 'O=[1CH]N1C[2CH2]CCC1'], - 'CAT_DATA_DIFF': [0, 0, 1, 0, -1] - }) - - # categorical data golden - self.test_categorical_dataframe_golden = pd.DataFrame({ - 'FRAG_L': ['O[1CH]=O', '[1CH3]N1CCO[2CH2]C1'], - 'FRAG_R': ['O=[1CH]C', 'O=[1CH]N1C[2CH2]CCC1'], - 'CAT_DATA_DIFF_category_changed_class': [1, 1], - 'CAT_DATA_DIFF_category_moved_down_one': [0, 1], - 'CAT_DATA_DIFF_category_moved_up_one': [1, 0], - 'CAT_DATA_DIFF_category_no_change': [2, 1], - 'CAT_DATA_DIFF_count': [3, 2] - 
}) - - self.test_categorical_dataframe_pct_golden = pd.DataFrame({ - 'CAT_DATA_DIFF_category_changed_class_pct': [33.3333, 50.0], - 'CAT_DATA_DIFF_category_moved_down_one_pct': [0, 50], - 'CAT_DATA_DIFF_category_moved_up_one_pct': [33.3333, 0], - 'CAT_DATA_DIFF_category_no_change_pct': [66.6667, 50.0], - 'CAT_DATA_DIFF_count': [3, 2], - 'FRAG_L': ['O[1CH]=O', '[1CH3]N1CCO[2CH2]C1'], - 'FRAG_R': ['O=[1CH]C', 'O=[1CH]N1C[2CH2]CCC1'] - }) - - def test_diff_array(self): - - (result_diffn_low, result_diffn, result_diffn_upp) = diffn(self.test_data_diff, 60) - - # pp.pprint(result_diffn) - # pp.pprint(result_diffn_upp) - # pp.pprint(result_diffn_low) - - self.assertAlmostEqual(result_diffn, self.test_data_diff_goldresult_diffn, 4) - self.assertAlmostEqual(result_diffn_upp, self.test_data_diff_goldresult_diffn_upp, 4) - self.assertAlmostEqual(result_diffn_low, self.test_data_diff_goldresult_diffn_low, 4) - - def test_diff_df_individual(self): - - # group, affregate, convert object to df, sort index - f = {'C': ['mean', 'count', diffn_agg_low(60), diffn_agg(60), diffn_agg_upp(60)]} - grouped_stats = self.test_dataframe.groupby(['A']).agg(f) - grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns.values] - # this is important for unittesting as order or columns can change and needs to be made consistent - grouped_stats = pd.DataFrame(grouped_stats).reset_index() - grouped_stats.sort_index(axis=1, inplace=True) - - # Use pandas assert on data frames - # pp.pprint(self.test_dataframe_goldenresult) - # pp.pprint(grouped_stats) - pd.util.testing.assert_frame_equal(self.test_dataframe_goldenresult, grouped_stats, check_exact=False) - - def test_diff_df_series1(self): - - # group, affregate, convert object to df, sort index - # print self.test_dataframe.head(10) - f = {'C': ['mean', 'count', diffn_list_rtn(60, inc_low_n_vals=False)]} - grouped_stats = self.test_dataframe.groupby(['A'], sort=False).agg(f) # [diffn_list_rtn(60), len]) - 
grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns.values] - grouped_stats[['C_diff60_low', 'C_diff60', 'C_diff60_upp']] = grouped_stats['C_diff60_all'].apply(pd.Series) - grouped_stats.drop('C_diff60_all', axis=1, inplace=True) - # this is important for unittesting as order or columns can change and needs to be made consistent - grouped_stats = pd.DataFrame(grouped_stats).reset_index() - grouped_stats.sort_index(axis=1, inplace=True) - # pp.pprint(self.test_dataframe_goldenresult) - # pp.pprint(grouped_stats) - pd.util.testing.assert_frame_equal(self.test_dataframe_goldenresult, grouped_stats) - - def test_diff_df_series2(self): - - f = {'logitM_DIFF': ['count', diffn_list_rtn(60, inc_low_n_vals=False)]} - grouped_stats = self.test_diff_2_input.groupby(['FRAG_L', 'FRAG_R'], sort = False).agg(f) - grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns.values] - grouped_stats[['diff60_low90', 'diff60', 'diff60_upper90']] = grouped_stats['logitM_DIFF_diff60_all'].apply(pd.Series) - grouped_stats.drop('logitM_DIFF_diff60_all', axis=1, inplace=True) - grouped_stats.rename(columns={'logitM_DIFF_count': 'num_vals'}, inplace=True) - - grouped_stats = pd.DataFrame(grouped_stats).reset_index() - grouped_stats.sort_index(axis=1, inplace=True) - - pd.util.testing.assert_frame_equal(grouped_stats, self.test_diff_2_goldenout) - - def test_mean_diff_invlog_df(self): - - f = {'pLog_rClintu_DIFF': ['count', mean_diff_invlog(inc_low_n_vals=False)]} - grouped_stats = self.test_foldchange_input.groupby(['FRAG_L', 'FRAG_R'], sort=False).agg(f) - grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns.values] - - grouped_stats = pd.DataFrame(grouped_stats).reset_index() - grouped_stats.sort_index(axis=1, inplace=True) - - pd.util.testing.assert_frame_equal(grouped_stats, self.test_foldchange_goldenout) - - def test_mean_diff_invlog_df_2(self): - - f = {'Solubility_pH2_mMolar_DIFF': ['count', 
mean_diff_invlog(inc_low_n_vals=False)]} - grouped_stats = self.test_foldchange_input_2.groupby(['FRAG_L', 'FRAG_R'], sort=False).agg(f) - grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns.values] - - grouped_stats = pd.DataFrame(grouped_stats).reset_index() - grouped_stats.sort_index(axis=1, inplace=True) - - pd.util.testing.assert_frame_equal(grouped_stats, self.test_foldchange_goldenout_2) - - def test_mean_diff(self): - - f = {'Solubility_pH2_mMolar_DIFF': ['count', mean_diff(inc_low_n_vals=False)]} - grouped_stats = self.test_foldchange_input_2.groupby(['FRAG_L', 'FRAG_R'], sort=False).agg(f) - grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns.values] - - grouped_stats = pd.DataFrame(grouped_stats).reset_index() - grouped_stats.sort_index(axis=1, inplace=True) - - pd.util.testing.assert_frame_equal(grouped_stats, self.test_mean_diff_goldenout) - - def test_categorical_data(self): - - f = {'CAT_DATA_DIFF': [category_moved_up_one, category_moved_down_one, category_no_change, - category_changed_class, 'count']} - grouped_stats = self.test_categorical_dataframe.groupby(['FRAG_L', 'FRAG_R'], sort=False).agg(f) - grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns.values] - - grouped_stats = pd.DataFrame(grouped_stats).reset_index() - grouped_stats.sort_index(axis=1, inplace=True) - - # pp.pprint(grouped_stats) - pd.util.testing.assert_frame_equal(grouped_stats, self.test_categorical_dataframe_golden) - - def test_categorical_data(self): - - f = {'CAT_DATA_DIFF': [category_moved_up_one_pct, category_moved_down_one_pct, category_no_change_pct, - category_changed_class_pct, 'count']} - grouped_stats = self.test_categorical_dataframe.groupby(['FRAG_L', 'FRAG_R'], sort=False).agg(f) - grouped_stats.columns = ['_'.join(col).strip() for col in grouped_stats.columns.values] - - grouped_stats = pd.DataFrame(grouped_stats).reset_index() - grouped_stats.sort_index(axis=1, inplace=True) - - 
#pp.pprint(grouped_stats) - pd.util.testing.assert_frame_equal(grouped_stats, self.test_categorical_dataframe_pct_golden) - - -if __name__ == '__main__': - unittest.main() diff --git a/contrib/script/py/mmp/mmp_stats_functions_timer.py b/contrib/script/py/mmp/mmp_stats_functions_timer.py deleted file mode 100755 index 903341be..00000000 --- a/contrib/script/py/mmp/mmp_stats_functions_timer.py +++ /dev/null @@ -1,88 +0,0 @@ -from mmp.mmp_stats_functions import diffn # , _mean_diff_invlog -import random -import cProfile -import pstats - -# 100 random seeds generated on command line using random.randint() in loop -seeds = [ - 246, 507, 469, 491, 60, 863, 500, 541, 23, 383, - 673, 377, 874, 606, 11, 872, 858, 391, 102, 68, - 82, 880, 891, 588, 293, 201, 343, 67, 40, 142, - 477, 967, 590, 620, 798, 513, 497, 817, 88, 789, - 550, 605, 313, 546, 809, 546, 65, 686, 22, 761, - 481, 211, 907, 37, 87, 140, 259, 59, 7, 68, 395, - 922, 514, 468, 234, 941, 621, 154, 368, 817, 567, - 250, 946, 363, 632, 217, 86, 976, 93, 113, 175, - 737, 428, 75, 479, 40, 139, 848, 881, 859, 969, - 367, 739, 119, 409, 885, 102, 100, 584, 654 -] - -# now generate 100 lists of random length containing random floats -# this will always give the same results because we fix the seed -random.seed(10) - -# generate a list of lists to be used in testing: -mega_list = [] - - -def makelist(random_int): - """Creates a list of random float values of total list length - random_int (specified as function param)""" - num_list = [] - for count in range(random_int): - num_list.append(random.random()) - return num_list - - -def get_random_lists(minlen, maxlen, list_of_ints): - """for every int in the list_of_ints code will return a lists of random floats. - The method is reproducible, so if the input minlen=1 and maxlen=2 for list_of_ints=[123] - the result will always be the same. 
Used for testing the timing of functions that require - a lot of lists as input.""" - for seed_ in seeds: - random.seed(seed_) - random_int = random.randint(minlen, maxlen) - # print (random_int) - elements = makelist(random_int) - yield elements - - -def create_mega_dataset(): - """Using a set os seeds (a list of int values) a huge list of random length lists of - random floats is generated. The lists of random length with have a 50:50 distribution - of list length <3 versus >=3""" - # generate some long lists from seeds - get_random_lists(3, 100, seeds) - # now generate some short length lists - get_random_lists(1, 2, seeds) - - list_of_lists = [] - for new_list in get_random_lists(3, 100, seeds): - list_of_lists.append(new_list) - - for new_list in get_random_lists(1, 2, seeds): - list_of_lists.append(new_list) - - global mega_list - for _ in range(100): - mega_list = mega_list + list_of_lists - - -def run_this_test(): - """This will iterate across all data in mega_list using each list as input to - the function of choice""" - for list_to_test in mega_list: - # Need to switch this betweem True/False - # _mean_diff_invlog(list_to_test, inc_low_n_vals=False) - diffn(list_to_test, 50, inc_low_n_vals=True) - -# generate some data -create_mega_dataset() - -# profile whatever is specified in run_this_test() -cProfile.run('run_this_test()', '/tmp/restats') - -# collect results from file and print the function of interest (hard coded) -p = pstats.Stats('/tmp/restats') -p.print_stats('_mean_diff_invlog') -p.print_stats('diffn') diff --git a/contrib/script/py/pybase/__init__.py b/contrib/script/py/pybase/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/contrib/script/py/pybase/pyopmo.py b/contrib/script/py/pybase/pyopmo.py deleted file mode 100755 index d0f19a0f..00000000 --- a/contrib/script/py/pybase/pyopmo.py +++ /dev/null @@ -1,411 +0,0 @@ -""" -Contains Python "bindings" to molecule object and Python functions -utilizing molecule object key 
functionalities -The intent is to provide a Pythonic interface to mo utilities and, thus, -enable easy/consistent use of mo from within python programs. - -Testing: -$ python /pymo.py -""" - -import os -import subprocess as sp -import shlex -import shutil -import sys -import argparse -import logging -import unittest -from tempfile import NamedTemporaryFile, mkdtemp - -log = logging.getLogger('lilly.' + __name__) -log.addHandler(logging.NullHandler()) - -try: - home_dir = os.environ['C3TK_HOME'] -except KeyError: - home_dir = os.environ['LILLYMOL_HOME'] -except KeyError: - sys.exit("Failed to identify home dir, please set C3TK_HOME or LILLYMOL_HOME") - -build_dir = 'Linux' - -try: - build_dir = os.environ['BUILD_DIR'] -except KeyError: - pass - -root_dir = home_dir + '/bin/' + build_dir - -# dictionary of commands that will be turned into functions -# function_name: [script_name, debug_message, default_params_dict] -mo_tool_map = { - 'dicer': [root_dir + '/dicer', - 'Recursively cuts molecules based on queries', - {}], - 'fileconv': [root_dir + '/fileconv', - 'file/molecule conversion utilities', - {}], - 'iwdescr': [root_dir + '/iwdescr', - 'compute physicochemical descriptors using iwdescr', - {'-l': ''}], - 'make_these_molecules': [root_dir + '/make_these_molecules', - 'Makes molecules from isotopically labelled ' - 'reagents according to make file, not ' - 'combinatorial join', - {}], - 'preferred_smiles': [root_dir + '/preferred_smiles', - '', - {}], - 'alogp': [root_dir + '/abraham', - '', - {'-l': '', - '-F': home_dir + '/data/queries/abraham/Abraham', - '-P': home_dir + '/data/queries/abraham/Alpha2H', - '-g': 'all' }] -} - -def __function_generator(fct_name, - script_name, - debug_msg, - default_params_dict, - expect_zero=True): - """ - A generator for functions which runs one of LillyMol scripts with a - predefined set of parameters. 
- - :param str fct_name: the function name of the newly generated function - :param script_name: your LillyMol script path (from mo_tool above) - :param debug_msg: quick message about what the script does (from mo_tool above) - :param default_params_dict: default parameters - :param expect_zero: whether to expect zero as a return value from the script - :return: a function which you can call to run the script - :rtype: callable - """ - - def funct(infile, outfile=None, params_dict=default_params_dict, - params_input_string=None, loggero=None, pretend=False, - altret=False, inpipe=False): - # Use param_input_string when a dictionary can not be used - # e.g. tsubstructure.sh -A M -A D - - log.debug('funct = %s: ' % script_name + debug_msg) - - params_string = ' ' - - if params_input_string: - params_string += params_input_string + ' ' - - for k, v in list(params_dict.items()): - if type(v) is list: - for vv in v: - params_string += k + ' ' + vv + ' ' - else: - params_string += k + ' ' + str(v) + ' ' - params_string = params_string[:-1] - - if type(infile) is list: - infile_s = ' '.join(infile) - elif infile is None: - infile_s = '' - else: - infile_s = infile - - cmd_2_execute = script_name + params_string + ' ' - - if not inpipe: - cmd_2_execute += infile_s - - out_fh = None - - if altret: - out_fh = sp.PIPE - elif outfile: - out_fh = open(outfile, 'w') - - if pretend: - log.warning('Just pretending') - exit_status = 0 - else: - cmd = shlex.split(cmd_2_execute) - log.info('Executing: {}'.format(cmd_2_execute)) - #print('Executing: {}'.format(cmd_2_execute)) - - # FIXME: this gets really ugly now... 
- if inpipe: - my_proc = sp.Popen(cmd, stdin=sp.PIPE, stdout=out_fh, - stderr=sp.PIPE, shell=False) - - # FIXME: Python2 - out, err = my_proc.communicate(infile_s.encode('utf-8')) - else: - my_proc = sp.Popen(cmd, stdout=out_fh, stderr=sp.PIPE, - shell=False) - out, err = my_proc.communicate() - - exit_status = my_proc.returncode - - # NOTE: Python2 returns strings so need to check - # FIXME: this needs to be simplified as soon as everything is - # Python3 - if type(err) == bytes: - err = err.decode('utf-8') - if type(out) == bytes: - out = out.decode('utf-8') - - if outfile: - out_fh.close() - - if exit_status and expect_zero: - log.error("%s failed:\n%s" % (script_name, cmd_2_execute)) - - if err: - log.error(err) - else: - log.debug("Done: " + debug_msg) - - # very ugly workaround for this mis-designed function - if not altret: - return exit_status - else: - return exit_status, out, err - - funct.__name__ = fct_name - funct.__doc__ = debug_msg - - return funct - - -for name, params in list(mo_tool_map.items()): - nparams = len(params) - - if not (3 <= nparams <= 5): - raise IndexError('mo_tool_map: "{}" has {:d} parameter(s) but should ' - 'have 3-5'.format(name, nparams)) - - locals()[name] = __function_generator(name, *params) - - -def make_these_molecules(rgnt_list, make_these_file, reaction_file_list, outfile=None, params_dict={}, debug=True, - loggero=None): - """ - Alternative to trxn, used in MMP code for generating new mols from MMP's, trxn version would be alternative: - For connecting one bond (two components, single cut fragments): - trxn.sh -S - -r oneConnection.rxn partOne.smi partTwo.smi - For connecting two bonds (three component, two cut fragments): - trxn.sh -S -rxn -S - -r twoConnection.rxn partThree.smi partOne.smi partTwo.smi - BUT, if we have a long list of different contexts (partOne) and don't want exhaustive enumeration, specify rxn's: - make_these_molecules.sh -R oneConnection.rxn -M m2Make.txt -S - partOne.smi partTwo.smi - In this 
case, you can put all context fragments SMILES (context1a, context 1b, ...) in one reagent file, and - all fragments SMILES (frag1, frag2, ...) in the second reagent file. If you have something like (context1a frag1\n - context1a frag2\ncontext1b frag3\n...) in your m2Make.txt file, you will create the molecules you wanted - """ - - log.debug("Generating virtual compounds using rxn and reagents supplied plus specified combinations file") - - # prep reagents file string - rgnt_string = " ".join(rgnt_list) - - log.debug("These are the reagent files...." + str(rgnt_string)) - - # prep params string - params_string = " " - for k, v in list(params_dict.items()): - params_string += k + " " + v + " " - params_string = params_string[:-1] - - # set outfile - # improved a bit to handle files with '.' in main name, other than in the extension - if outfile: - if outfile[-4:] == ".smi" or outfile[-4:] == ".txt": - params_string += " -S " + os.path.splitext(outfile)[0] - else: - params_string += " -S " + outfile - - reaction_file = "" - for rxn_file in reaction_file_list: # todo: if single string, this is split in characters - reaction_file += ' -R ' + rxn_file - - cmd_line = (mo_tool_map['make_these_molecules'][0] + reaction_file + - ' -M ' + make_these_file + " " + params_string + " " + - rgnt_string) - - log.debug("Executing: %s" % cmd_line) - #if debug: - #print(cmd_line) - - my_proc = sp.Popen(shlex.split(cmd_line), stdout=None, stderr=sp.PIPE, - shell=False) - - for line in my_proc.stderr.readlines(): - log.debug(line.rstrip()) - - exit_status = my_proc.wait() - - log.debug("Done generating compounds") - - return exit_status - - -##################################################### -class _TestPymo(unittest.TestCase): - """Test class for pymo module - - Example usage: - - python pymo.py (to execute all tests) - - python pymo.py -c (for verbose console logging) - python pymo.py -f mylog.log (for logging to file mylog.log) - python pymo.py -c -f mylog.log (for both) - - 
python pymo.py _Test_pymo.test_fetch_smiles # (to execute only the specified test) - - coverage run pymo.py (to run test code coverage analysis) - coverage report pymo.py (to view the result of the test code coverage analysis) - """ - - def setUp(self): - """setup test data location, unittest config and logger""" - - # location of test data - self.test_data_location = root_dir + '/contrib/script/py/mmp/testdata/' - - # temp output file and dir - self.temp_inp_file = NamedTemporaryFile(encoding='utf-8', mode='wt', suffix='.smi', delete=False) - self.temp_out_file = NamedTemporaryFile(encoding='utf-8', mode='wt', delete=False) - self.temp_out_dir = mkdtemp() - - test_smiles = { - # basic test set - all the below id's and structures are CHEMBL - '3105327': 'Cc1ccc2c(ccn2c3nc(cs3)c4cc(ccc4F)C(F)(F)F)c1', - '1526778': 'CC(=O)c1c(C)n(c(C)c1C(=O)C)c2nc(c(C)s2)c3ccc(C)c(C)c3', - '1494678': 'CC(=O)c1c(C)n(c(C)c1C(=O)C)c2nc(c(C)s2)c3ccccc3', - '472166': 'OC(CCn1ccnc1)(c2ccccc2)c3ccccc3', - '69798': 'Cc1nccn1CCC(O)(c2ccccc2)c3ccccc3', - '367346': 'Cc1sc(N)nc1c2cccc(Cl)c2', - '366881': 'Cc1sc(N)nc1c2ccc(Cl)c(Cl)c2', - '1477460': 'COc1ccc(cc1)c2nc(sc2C)n3c(C)c(C(=O)C)c(C(=O)C)c3C', - '1441050': 'COc1ccc(cc1OC)c2nc(sc2C)n3c(C)c(C(=O)C)c(C(=O)C)c3C' - } - - # write test data to temp file 01 - for smi_id, smi in test_smiles.items(): - string = smi+' '+smi_id+'\n' - self.temp_inp_file.write(string) - self.temp_inp_file.close() - - def tearDown(self): - """cleanup test data and settings""" - - # Clean up the directory - # os.removedirs(self.temp_out_dir) - shutil.rmtree(self.temp_out_dir) - - def test_fileconv(self): - log.debug("Testing fileconv") - exit_status = fileconv(os.path.join(self.test_data_location, self.temp_inp_file.name), - params_dict={'-v': '', - '-c': '10', - '-C': '20', - '-S': os.path.join(self.temp_out_dir, 'atomcountfilter'), - '-o': 'smi'} - ) - log.debug("fileconv return code was: %s" % exit_status) - self.assertEqual(exit_status, 0) - - def 
test_make_these_molecules(self): - log.debug("Testing _make_these_molecules") - - # test data from mmp_enum_mols_from_pairs.py - context = NamedTemporaryFile(encoding='utf-8', mode='wt', suffix='.smi', delete=False) - examp_context = "[1CH3]CCCCCC partOne_1\n[1CH3]CCCCCCC partOne_2\n[1CH3]CCCCCCCC partOne_3\n[1CH3]CCCCCCCCC partOne_4\n[1CH3]CCCCCCCCCC partOne_5" - context.write(examp_context) - context.close() - - frags = NamedTemporaryFile(encoding='utf-8', mode='wt', suffix='.smi', delete=False) - examp_frags = "[1OH]CCCCCC partTwo_1\n[1OH]CCCCCCC partTwo_2\n[1OH]CCCCCCCC partTwo_3\n[1OH]CCCCCCCCC partTwo_4\n[1OH]CCCCCCCCCC partTwo_5\n[1OH]CCCCCCCCCCC partTwo_6\n" - frags.write(examp_frags) - frags.close() - - make_instr = NamedTemporaryFile(encoding='utf-8', mode='wt', delete=False) - examp_make_instr = "partOne_2 partTwo_3\npartOne_4 partTwo_5\n" - make_instr.write(examp_make_instr) - make_instr.close() - - rxn = NamedTemporaryFile(encoding='utf-8', mode='wt', suffix='.rxn', delete=False) - # this is the reaction specification that trxn needs to combine isotopically labelled mmp fragmentation points - single_rxn = "(0 Reaction\n (0 Scaffold\n (A C smarts \"[!0*]\")\n (A I isotope (0 0))\n )\n" - single_rxn += " (1 Sidechain\n (A C smarts \"[!0*]\")\n (A I isotope (0 0))\n (A I join (0 0))\n )\n)" - rxn.write(single_rxn) - rxn.close() - - exit_status = make_these_molecules([context.name, frags.name], - make_instr.name, - [rxn.name]) - - log.debug("make_these_molecules return code was: %s" % exit_status) - self.assertEqual(exit_status, 0) - - def test_dicer(self): - log.debug("Testing dicer") - log.debug("Testing %s" % dicer.__name__) - exit_status = dicer(os.path.join(self.test_data_location, self.temp_inp_file.name), - os.path.join(self.temp_out_dir, 'dicer.out'), - params_dict={}, ) - log.debug("dicer return code was: %s" % exit_status) - self.assertEqual(exit_status, 0) - - def test_alogp(self): - log.debug("Testing CMI") - exit_status = 
alogp(os.path.join(self.test_data_location, self.temp_inp_file.name), - os.path.join(self.temp_out_dir, 'alogp.out') - ) - log.debug("CMI return code was: %s" % exit_status) - self.assertEqual(exit_status, 0) - -if __name__ == "__main__": - - # optional command line flags - parser = argparse.ArgumentParser() - parser.add_argument('-l', '--log_file', - help='Name of file to place debug log info in') - parser.add_argument('-c', '--console', - help='Switch on console logging', - default=False, - action='store_true') - - args = parser.parse_args() - #print(args) - logger_file = args.log_file - console_on = args.console - - pymotest_logger = logging.getLogger("pymo.testlogger") - pymotest_logger.setLevel(logging.DEBUG) - - log_formatter = logging.Formatter("%(asctime)s [%(funcName)-12.12s] " - "[%(levelname)-5.5s] %(message)s", - datefmt="%Y-%m-%d %H:%M:%S") - - if console_on: - print("Switched on console") - h1 = logging.StreamHandler(stream=sys.stdout) - h1.setLevel(logging.DEBUG) - h1.setFormatter(log_formatter) - pymotest_logger.addHandler(h1) - else: - print("console off") - - if logger_file is not None: - print(("Switched on logging to file: {}".format(logger_file))) - fileHandler = logging.FileHandler(filename=logger_file) - fileHandler.setFormatter(log_formatter) - fileHandler.setLevel(logging.DEBUG) - pymotest_logger.addHandler(fileHandler) - else: - print("file logging off") - - if console_on is False and logger_file is None: - pymotest_logger.setLevel(logging.CRITICAL) - - unittest.main() diff --git a/data/minor_changes.textproto b/data/minor_changes.textproto new file mode 100644 index 00000000..bd236198 --- /dev/null +++ b/data/minor_changes.textproto @@ -0,0 +1,36 @@ +# Allow removal of fragments containing as many a 3 atoms +remove_fragment: [1, 2, 3] +# Avoid excessive numbers of products for any starting molecule. 
+max_variants: 10000
+# Running with atom typing is generally recommended
+# This type contains the element and whether it is aromatic or not.
+atype: "UST:AY"
+# Do not use rare fragments. Helps cut numbers too.
+# in this example, prefer setting library size
+# fragment_support: 10
+
+# When removing fragments, how large a fragment can we lose.
+max_atoms_lost: 3
+
+# Turn on all transformations
+
+add_fragments: true
+replace_terminal_fragments: true
+single_to_double_bond: true
+double_to_single_bond: true
+unspiro: true
+make_three_membered_rings: true
+change_carbon_to_nitrogen: true
+change_carbon_to_oxygen: true
+change_nitrogen_to_carbon: true
+insert_ch2: true
+remove_ch2: true
+destroy_aromatic_rings: true
+destroy_aromatic_ring_systems: true
+swap_adjacent_atoms: true
+swap_adjacent_aromatic_atoms: true
+insert_fragments: true
+replace_inner_fragments: true
+
+max_fragment_lib_size: 100
+max_bivalent_fragment_lib_size: 200
diff --git a/docs/GeneExpression/README.md b/docs/GeneExpression/README.md
new file mode 100644
index 00000000..e6dfb912
--- /dev/null
+++ b/docs/GeneExpression/README.md
@@ -0,0 +1,96 @@
+# Gene Expression Similarity
+
+This is an implementation of the concepts outlined in
+```
+ A simple and robust method for connecting small-molecule drugs using gene-expression signatures
+ Shu-Dong Zhang* and Timothy W Gant* BMC Bioinformatics 2008, 9:258 doi:10.1186/1471-2105-9-258
+```
+
+## The Task
+The task is to compare one or more query structures (needles) against a large
+collection of gene expression data (haystack), identifying those members of the
+haystack that are closest to the needle(s), as measured by the Zhang and Gant
+similarity measure.
+
+## Architecture
+The data needs to be transformed into serialized protocol buffer forms. The
+tool `gene_expression_to_proto` can be used to perform this task. The input
+file(s) must be of the form
+```
+gene_id,Z_score_robust
+29082,-7.43514680862427
+10528,5.61989164352417
+5111,-5.17184543609619
+60528,-4.9140830039978
+26520,-4.69979190826416
+4582,4.6722207069397
+```
+where the first column is the gene id, and the second column a sorted list of
+gene expression values. Sorting must have been by absolute value. Note that
+the tool does not check that the input has been sorted, although if needed
+that functionality could be added. Any extra columns are ignored.
+
+For the gene expression data we used for the initial implementation, the haystack
+consisted of 473k individual .csv files as above. That took about 272GB on disk,
+with each file containing about 12k records.
+
+Conversion of the full dataset to serialized protocol buffer form took over an
+hour, but resulted in a file of 58GB.
+
+As an aside, the output file generated by `gene_expression_to_proto` is actually
+a TFDataRecord file
+[TensorFlow](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset)
+holding a simple serialized protocol buffer.
+
+This 58GB haystack file can be searched in about 30 minutes.
+
+## Optimization
+The algorithm described by Zhang and Gant includes a parameter that limits
+the number of expression values considered. If we are only going to compare
+the top 200 genes in the expression data, it does not make sense to store
+and process all 12k of them. Therefore `gene_expression_to_proto` has a
+`-maxrank` parameter that allows specification of the maximum rank to be
+stored in the haystack file. If that is restricted to 500, the resulting
+file is only 2.4GB, and if only the 200 highest scoring expression values
+are stored, the file is 960MB. These files can be searched in under a
+minute.
+
+## HowTo
+To convert a directory of gene expression data files to a haystack file try
+```
+gene_expression_to_proto -d .csv -maxrank 200 -rpt 10000 -fname -S haystack200.dat -v /
+  path/to/dir
+```
+This says to process all files that end in .csv that are found in /path/to/dir.
+Only store the 200 highest ranked expression data. If you are curious
+about how fast the job is progressing, add the -rpt option, and in this
+case, it will emit a status message every 10,000 items processed.
+Adding `-fname` means use the file name (rather than the full path name)
+as the name associated with each file.
+
+This might take 10 minutes or more.
+
+Alternatively, you can provide the tool with a list of files to process and then
+pass that to gene_expression_to_proto as `F:files`. It knows that if its argument
+starts with "F:" that means it is a file containing a list of files to process.
+
+The needle(s) must be converted in a similar way. This time omit the maxrank
+parameter so that all the genes in the needle are stored. The algorithm works
+by looking up all the genes in the needle in the (possibly truncated) list
+stored with each member of the haystack.
+
+Once both the haystack and the needles have been converted to serialized
+proto form, near neighbours can be found with `gene_expression_near_neighbours`.
+
+```
+gene_expression_nearneighbours -maxrank 200 -n 10 -p needles.dat -rpt 100000 -v haystack.dat
+```
+
+Finds, for every item in `needles.dat`, the 10 nearest neighbours in
+`haystack.dat` using the top ranked 200 genes in the comparison. Again,
+if you are curious about progress, the -rpt option can be used.
+
+Output will be three columns, the id of the needle in column 1, the
+id of a neighbour from the haystack and the last column is the score.
+This will be ordered by decreasing score.
+ diff --git a/docs/LillyMol.md b/docs/LillyMol.md index 12ae87d6..9952cd71 100644 --- a/docs/LillyMol.md +++ b/docs/LillyMol.md @@ -20,7 +20,7 @@ since many large scale Cheminformatics problems are quite sharded form. It also works well with pipelined command processing, where the output from one command is fed to the next command, thereby taking advantage of the multi-core -environments that today are uniquitous. +environments that today are ubiquitous. Since all LillyMol tools are built on a common code base, tools tend to have common arguments and behaviours. diff --git a/docs/Molecule_Lib/Images/CHEMBL4515300.png b/docs/Molecule_Lib/Images/CHEMBL4515300.png new file mode 100755 index 00000000..a3e9759e Binary files /dev/null and b/docs/Molecule_Lib/Images/CHEMBL4515300.png differ diff --git a/docs/Molecule_Lib/Images/CHEMBL59123.png b/docs/Molecule_Lib/Images/CHEMBL59123.png new file mode 100755 index 00000000..5047c5d5 Binary files /dev/null and b/docs/Molecule_Lib/Images/CHEMBL59123.png differ diff --git a/docs/Molecule_Lib/Images/CHEMBL88846.png b/docs/Molecule_Lib/Images/CHEMBL88846.png new file mode 100755 index 00000000..9e4d7534 Binary files /dev/null and b/docs/Molecule_Lib/Images/CHEMBL88846.png differ diff --git a/docs/Molecule_Lib/chemical_standardisation.md b/docs/Molecule_Lib/chemical_standardisation.md index 3f935fa5..7f776c56 100644 --- a/docs/Molecule_Lib/chemical_standardisation.md +++ b/docs/Molecule_Lib/chemical_standardisation.md @@ -76,6 +76,7 @@ Entering `-g help` to most LillyMol tools yields the usage message -g keto_enol convert enol to keto forms (no adjacent heteroatoms) -g all ALL the above standardistions -g rvnitro convert O=N=O nitro groups to charge separated + -g isotope convert all isotopic atoms to non isotopic forms -g rvnv5 convert all 5 valent N atoms to charge separated -g APP= append 'xxx' to the name of changed molecules -g APP=EACH append the reason for each change diff --git a/docs/Molecule_Lib/io.md 
b/docs/Molecule_Lib/io.md index 80f0ef8e..a7410c63 100644 --- a/docs/Molecule_Lib/io.md +++ b/docs/Molecule_Lib/io.md @@ -393,7 +393,13 @@ to V2000 files which have the isotope in the atom record. ### -i mdlsep=\<..\> Separator between tags when reading mdl files. If a name is built up from the concatentation of multiple tags, this is the separator between -those tags. Default is space. +those tags. Default is space. Character names are recognised so +`-i mdlsep=tab` is valid. To convert a .sdf file with tags to a tab +separated file, we recently used this invocation. +``` +fileconv -i mdlquiet -i 'SDFID:(idnumber|LogP|Collection)' -o smi -o smisep=tab + -i mdlsep=tab -S newfile /path/to/input.sdf +``` ### -i mdlnwce Discerning chirality from wedge bonds and atom positions can often diff --git a/docs/Molecule_Lib/substructure.md b/docs/Molecule_Lib/substructure.md index 0acb4c80..bfdd7a90 100644 --- a/docs/Molecule_Lib/substructure.md +++ b/docs/Molecule_Lib/substructure.md @@ -122,15 +122,61 @@ smarts could be extended to `O=[$([CD3](-N)-[!N])]` which is starting to become complex. Differentiating an amide from a urea can also be done with an -`environment_no_match` directive in a query file. +environment_no_match` directive in a query file. ``` +query { + smarts: "O=[CD3]-[NG0]" + environment_no_match { + attachment { + attachment_point: 1 + btype: SS_SINGLE_BOND + } + smarts: "N" + } +} +``` +But note that this query will match molecules like +![CHEMBL4515300](Images/CHEMBL4515300.png) +which contain both a urea and an amide - that share a Nitrogen atom. +We could also exclude matches where there is another C=O atom attached +to the matched Nitrogen atom. 
``` +query { + smarts: "O=[CD3]-[NG0]" + environment_no_match { + attachment { + attachment_point: 1 + btype: SS_SINGLE_BOND + } + smarts: "N" + } + environment_no_match { + attachment { + attachment_point: 2 + btype: SS_SINGLE_BOND + } + smarts: "C=O" + } +} +``` +where one environment_no_match checks what is attached to matched atom 1, +the Carbon atom, and the other checks what is attached to the Nitrogen atom. +Again, these could also be accomplished by recursive smarts, but at the +cost of considerable complexity if primary, secondary and tertiary amides are +to be handled. +``` +O=C([$([!N])])-[$([ND1]),$([ND2]-[!$(C=O)]),$([ND3](-[!$(C=O)])-[!$(C=O)])] +``` +yields the same results. -This particular problem points to the fact that within LillyMol +This case points to the fact that within LillyMol substructure searching there are frequently many different ways of doing the same thing. These will differ in their complexity, and efficiency. +In the case above, the query file processes 20k molecules in 0.65 seconds +whereas the smarts based query takes 10% longer. + ### Diversion Smarts has been referred to as a write-only language, with extraodinarily complex smarts being written by experts. These create maintenance @@ -174,7 +220,7 @@ The best location of documentation for this feature is in the source file [substructure.h](/src/Molecule_Lib/substructure.h), where several use cases are outlined. For example ``` -// {>3;0[c]} More than 3 bonds, and no aromatic Carbons. + {>3;0[c]} More than 3 bonds, and no aromatic Carbons. ``` Refer to that file and also the file containing C++ [unit tests](/src/Molecule_Lib/substructure_nmab_test.cc) @@ -184,28 +230,6 @@ specification. While complex, it can be very useful. Whether it is better to express a complex linker relationship in smiles or via a query file is an open question. -## Down The Bond -There were several times when we needed to place a limit on -the number of atoms in a substituent. 
This is not as simple -as looking at the furthest distance of an atom in the -substituent, because of branching. - -For example to find various methoxy-like -substituents on a benzene, one might limit the size of -the group to a max of 5 atoms. -``` -a-[OD2]-{a{1-5}}C -``` -The directive is inserted between the two atoms that define the bond, and -atoms down that bond are considered. If the bond is in a ring, the match -will fail. - -After this was implemented, we came up with the more general idea -of a substituent, that is only available in query files. A substituent -is a much more complex concept, but for simple cases, a down-the-bond -directive can work well. The DownTheBond message is also available -in a query file. - ## Atomic Smarts Extensions LillyMol's atomic smarts also contain some useful extensions. @@ -471,3 +495,323 @@ underscore `[/IWNv{-3_14}C]` which is interpreted as `[/IWNv{-3.14}C]`. As use cases emerge further `/IW` directives may be added. Some can be evalated during atom matching, others are applied after an embedding has been found. + +## Other Messages + +Proto specifications of substructure match conditions allow specification of +several useful concepts related to more general descriptions of regions of a +molecule. + +### Separated Atoms +The SeparatedAtoms message describes bond separations between matched atoms, +as well as rotatable bonds on the path between those matched atoms. + +For example the query +``` +query { + smarts: "[OH].[OH]" + separated_atoms { + a1: 0 + a2: 1 + min_bonds_between: 3 + min_rotbond: 1 + } +} +``` +matches `OCCCCO` but not `Oc1ccc(O)cc1`. This construct can be useful +creating pharmacophore type matches. + +### Nearby Atoms +This construct is also useful for pharmacophore type queries, but has +some added flexibility. 
+ +For example, to identify an aromatic Nitrogen acceptor with a nearby +amide +``` +query { + smarts: "[nD2H0]:c" + unique_embeddings_only: true + nearby_atoms { + smarts: "[CD3T2](=O)-[NT0G0]" + hits_needed: 1 + matched_atom: 0 + min_bonds_between: 3 + max_bonds_between: 5 + } +} +``` +would do that. But note this imposes no 'directionality' on the amide. If +that were needed, then something like +``` +c:[nD2H0]...{3-5}[CD3T2](=O)-[NT0G0] +``` +would be needed. But note that in this case, the atoms matched by the +amide are returned as matched atoms, whereas with the nearby_atoms +construct, only the `nc` matched atoms are returned. + +A query_file can also be used instead of a smarts in a nearby_atoms +directive, which allows considerable flexibility and re-use of complex +queries. + +Beware with the max_hits_needed directive. If specified, this also +includes zero hits. So if you wish to specify both that there must +be at least one match and a max number, you will need to specify +``` + min_hits_needed: 1 + max_hits_needed: 2 +``` +to specify either 1 or 2 matched. This would be the same as +``` + hits_needed: [1, 2] +``` + +### Separated Rings +A pharmacaphore type query might involve something like: + +``` +"A molecule with a a five membered aromatic ring containing +an [nH] group, and a 6 membered aromatic containing an [nH0] group, +separated by at least 6 atoms" +``` +This query can be accomplished with the query file +``` +query { + ring_specifier { + aromatic: true + ring_size: 5 + fused: 0 + base { + set_global_id: 1 + environment: "[nr5H]" + } + } + ring_specifier { + aromatic: true + ring_size: 6 + fused: 0 + base { + set_global_id: 2 + environment: "[nr6H0]" + } + } + smarts: "[/IWgid1c]!@-[/IWscaf1]...{>5}[/IWscaf1]-!@[/IWgid2c]" +} +``` +First each of the two rings are defined. Both are aromatic, and +they have different sizes. Neither are fused. + +In the first ring, we specify that there must be an aromatic +Nitrogen atom with a Hydrogen atom. 
The 'r5' specification is +redundant, but harmless. Same in the other ring where we look +for a pyridine like Nitrogen atom, again with a redundant +specification of the ring size. + +The smarts is complex. We are looking for an aromatic Carbon +atom that is in the first ring, then a non-ring bond to an +atom that is in the scaffold. That can only be an atom that +is part of a linker, joining rings. Five atoms away we look +for a non ring atom that is also in the scaffold and is then bonded +to an atom that is in the second ring. + +This will identify molecules such as + +![CHEMBL59123](Images/CHEMBL59123.png) + +To identify two separate rings without the distance +constraint could be done with +``` +[/IWfss1/IWrid1nr5H]...[/IWfss1/IWrid2nr6H0] +``` +Many more matches are found, but there is no separation +constraint on the matches. Note that just using a `,,,` directive +to specify atoms between the matched atoms would not work since +that path may include other atoms in the ring. + + +## Down The Bond +There were several times when we needed to place a limit on +the number of atoms in a substituent. This is not as simple +as looking at the furthest distance of an atom in the +substituent, because of branching. + +For example to find various methoxy-like +substituents on a benzene, one might limit the size of +the group to a max of 5 atoms. +``` +a-[OD2]-{a{1-5}}C +``` +The directive is inserted between the two atoms that define the bond, and +atoms down that bond are considered. Note that the 'C' atom in the query +above is *not* considered to be part of what is matched by the down the bond +directive - because it can be fully specified here. If the bond is in a ring, the match will fail. + +After this was implemented, we came up with the more general idea +of a substituent, that is only available in query files. A substituent +is a slightly different concept, but for simple cases, a down-the-bond +directive can work well. 
The DownTheBond message is also available +in a query file. + +The down the bond directive now supports further atomic properties. +Counts of atoms that are + +* heteroatom +* aromatic +* unsaturated +* in a ring +* atomic smarts + +Currently all specifications are treated with an implicit `and` operator, +so all specifications must be satisfied. In addition, multiple specifications +must be separated by an ';' operator - leaving the door open for a future +use case that might implement other operators. + +So a complex down the bond smarts might look like + +``` +[Nx0]-C(=O)-{a{2-5};h0;r>1;u>0}C +``` +which is an amide group, then down the bond with between 2 and 5 atoms, no +heteroatoms, at least one ring atom (actually there would be at least +three ring atoms) and at least one unsaturated atom. This matches molecules +like +![CHEMBL88846](Images/CHEMBL88846.png) + +Note that unsaturation and aromaticity are separate concepts, so unsaturation +only applies to aliphatic atoms. Note that the minimum unsaturation value +will be 2 since an unsaturated bond consists of two atoms that are each +unsaturated. + +Looking for a C1-5 secondary alkyl amide might look like +``` +O=[CD3T2]-{a<6;h0;u0;m0}[ND2] +``` +where we want fewer than 6 atoms down the bond, no heteratoms, no unsaturated +atoms, and no aromatic atoms. + +If we wanted tertiary amides, that would be +``` +O=[CD3T2]-[ND3](-{a<5;h0;u0;m0}C)-{a<5;h0;u0;m0}C +``` +where the same directive is repeated for each substituent. This of +course raises the question whether it is possible to come up with a single +query that will handle primary, secondary and tertiary amides. That is +not possible with a down the bond smiles construct, but is possible with +a query file. 
+
+```
+query {
+  smarts: "[CD3T2R0](=O)N"
+  down_the_bond {
+    a1: 0
+    a2: 2
+    max_natoms: 5
+    heteroatom_count: 0
+    unsaturation_count: 0
+    aromatic_count: 0
+    match_individual_substituent: true
+    no_other_substituents_allowed: true
+  }
+}
+```
+This query will *not* match `OCCC(=O)N(CCC)CO`. Nor will it match a primary amide,
+since a down the bond specification has been made, but no match would be made with
+a primary amide. Clearly we could add a directive to say that no substituent atoms
+would comprise a positive match.
+
+By default, the down the bond directives aggregate all atoms that appear
+down the bond, and atomic properties are summed across all substituents
+attached to `a2`. If `match_individual_substituent` is set, then
+rather than aggregating across all substituents, each substituent is
+computed separately and must match the constraints. If a single substituent
+is found to satisfy the conditions, a match is reported.
+
+If `no_other_substituents_allowed` is also specified, then the match fails
+if there is a substituent that does *not* satisfy the constraints.
+
+Both within smarts and via a query file, an atomic smarts can be specified. Since
+that, like all directives, will be followed by a numeric qualifier, it can be
+either a positive or negative requirement.
+
+The quite complex smarts
+```
+[OHD1]-[CD3R0](=O)-[CD3R0](-{[CD2]1;[$(O=c1nsnc1)]1;d3;m5;r5;u1;a7}*)-[ND1H2]
+```
+will match
+![CHEMBL1094324](Images/CHEMBL1094324.png)
+
+where we are looking for a `[CD2]` as well as an atom that is in the aromatic
+ring. Note that these could have been combined, although maybe you want
+flexibility. All queries are completely independent. In this case we
+also require that the maximum separation from `a2` be 3 bonds, there
+be 5 aromatic atoms, 5 ring atoms, 1 unsaturated atom and 7 atoms.
+
+Note that in all cases only atomic smarts can be specified, but they can
+be recursive smarts, and can match any parts of the molecule - including
+atoms that are NOT down the bond.
+
+### Substituent
+The Substituent message was initially implemented to handle the
+environments specified with ring and ring system specifications. These
+are similar to DownTheBond directives, but have some other properties
+reflecting their origin.
+
+They are somewhat similar to an environment or environment_no_match
+messages, but describe less specific features. They also share a lot
+of attributes with the DownTheBond message, but are not necessarily
+tied to a specific bond. Generally prefer using a DownTheBond message
+although if the substituent can be attached via multiple sites,
+this is more convenient.
+
+The other use case for a Substituent message is restricting matches
+to certain functional groups. For example, going back to the previous example
+of trying to identify an alkyl amide, the following query might be
+an approach to that.
+```
+query {
+  smarts: "[CD3T2R0](=O)N"
+  substituent {
+    max_natoms: 5
+    heteroatom_count: 0
+    unsaturation_count: 0
+    no_other_substituents_allowed: true
+    disqualifying_smarts: "a"
+  }
+}
+```
+But because this looks at all matched atoms, it is also applying its
+matching restrictions to atoms attached to the Carbonyl Carbon atom.
+This may not be what is needed.
+
+Note the significant differences between environments and
+substituents. Environments are very specific atom matches that
+are directly bonded to specific matched atoms. Substituents are more general
+concepts attached to any matched atom. Down the bond messages are
+specific to the matched atoms forming a bond.
+
+### Region
+Two matched atoms can define a region in a molecule. In order for this to
+work, the two matched atoms must *not* be in the same ring system.
The +'atom' directive for defining a region is a repeated field, leaving the +door open for regions that might be defined by more than 2 matched query atoms. +But currently only regions defined by two matched atoms are supported. + +The query +``` +query { + smarts: "[N].[N]" + unique_embeddings_only: true + region { + atom: [0, 1] + natoms: 6 + atoms_not_on_shortest_path: 0 + heteroatom_count: [2, 3] + } +} +``` +will match "NCOCOCCN". + +Currently only a small number of attributes are implemented for Region messages +but clearly more could be added as the need arises. + +The C++ unit tests in the [src/Molecule_Lib](src/Molecule_Lib) directory contain +more examples of proto specifications of complex queries that test these concepts. diff --git a/docs/Molecule_Lib/substructure_proto.md b/docs/Molecule_Lib/substructure_proto.md index e02c7408..d80dbb82 100644 --- a/docs/Molecule_Lib/substructure_proto.md +++ b/docs/Molecule_Lib/substructure_proto.md @@ -661,6 +661,11 @@ must be distinct from the matched atoms, but that can be changed if If there are no constraints on distance, then a simple '&&' smarts can be used, which describes two (possibly overlapping) separate smarts matches. +### Separated Atoms +This message started as redundant with the Linker message. It describes bond +separations between pairs of matched atoms. In addition, the number of rotatable +bonds found on the shortest path between the two matched atoms can be specified. + ## Smarts I have heard smarts referred to as a 'write only' language. Several of these constructs in the proto representation are specifically designed to lessen the diff --git a/docs/Molecule_Tools/iwdescr.md b/docs/Molecule_Tools/iwdescr.md index 62bc61dc..4d91d0a3 100644 --- a/docs/Molecule_Tools/iwdescr.md +++ b/docs/Molecule_Tools/iwdescr.md @@ -9,6 +9,10 @@ iwdescr file.smi > file.w which will generate a tabular (space separated) file containing a couple of hundred molecular descriptors. 
+There is also a shell wrapper in the contrib/bin directory that sets up some useful +defaults. It is strongly recommended that the wrapper be used always. Otherwise you will +see various missing columns. + ## Descriptors. The following descriptors are computed. @@ -219,12 +223,15 @@ The following descriptors are computed. | brnsdual | fred bruns: donor and acceptor | | brunspos | fred bruns: likely positive charge | | brunsneg | fred bruns: likely negative charge | +| formal_charge| sum of brunspos + brunsneg. Net formal charge | | brunshbdsum | brunsacc + brunsdon - brnsdual | | cd4ring | carbon atoms with four connections in a ring | | cd4chain | carbon atoms with four connections not in a ring | | csp3_chain | sp3 carbon atoms not in a ring | | frsub | fraction of ring atoms that are subsituted outside the ring | | frssub | fraction of ring atoms that have a single atom subsituent | +| alorthoring | number of ortho substituents on an aliphatic ring | +| arorthoring | number of ortho substituents on an aromatic ring | | fsatspcha | fraction of spinach atoms that are saturated | | satspcha | number of spinach atoms that are saturated | | unsatspcha | number of unsaturated spinach atoms | diff --git a/docs/Molecule_Tools/molecule_filter.md b/docs/Molecule_Tools/molecule_filter.md new file mode 100644 index 00000000..4fca65ad --- /dev/null +++ b/docs/Molecule_Tools/molecule_filter.md @@ -0,0 +1,125 @@ +# Molecule Filter + +## Motivation +This tool is designed to quickly filter large collections of molecules as +efficiently as possible. + +Internally tests are performed roughly in order of expense of computation. +For example, if reading a smiles, the number of atoms in the smiles can +be very quickly determined by examing the smiles string, without performing a much +more expensive Molecule interpretation. Note however that this will only +be beneficial to the extent that molecules get rejected for size. 
If hardly
+any molecules are discarded for size, using this approach may be worse.
+Some tests are very expensive, xlogp for example, and those will be
+performed after all cheaper tests.
+
+Note that such a task is 'pleasingly parallel' and generally this task is
+used in combination with batch processing large collections in sharded parallel
+form.
+
+
+## HOWTO
+Build a textproto configuration file based on the proto
+[proto](/src/Molecule_Tools/molecule_filter.proto).
+```
+molecule_filter -F config.textproto
+```
+
+## Implementation
+
+The tool is driven by a [proto](/src/Molecule_Tools/molecule_filter.proto) which
+provides the configuration information for how the filtering is to be done. An
+example that sets all available properties might be
+```
+min_natoms: 10
+max_natoms: 40
+min_heteroatom_count: 1
+min_heteroatom_fraction: 0.05
+max_heteroatom_fraction: 0.90
+min_nrings: 1
+max_nrings: 6
+min_aromatic_ring_count: 1
+min_aliphatic_ring_count: 0
+max_aliphatic_ring_count: 4
+min_rotatable_bonds: 2
+max_rotatable_bonds: 10
+max_ring_system_size: 3
+max_aromatic_rings_in_system: 3
+min_tpsa: 20
+max_tpsa: 180
+min_alogp: -2
+max_alogp: 6
+min_xlogp: -2
+max_xlogp: 6
+min_hba: 2
+max_hba: 10
+min_hbd: 0
+max_hbd: 5
+largest_ring_size: 7
+exclude_non_organic: true
+exclude_isotopes: true
+max_halogen_count: 7
+max_distance: 30
+min_sp3_carbon: 1
+max_aromatic_density: 0.8
+max_chiral: 2
+max_number_fragments: 2
+```
+Note that as written this makes no sense. It would be silly to filter
+on both alogp and xlogp. Choose one. That said, the xlogp implementation
+is more expensive than the alogp implementation, so if a logP filter is
+required, and speed is of the utmost concern, alogp would be preferred.
+Which one might provide better predictions is beyond the scope of this
+document!
+ +A typical invocation might be +``` +molecule_filter -F config.textproto -v large.smi > passed.smi +``` +Using the -v option results in a summary of failure modes being written +to stderr at the end of processing. If you care about the molecules that +have been discarded, add the `-B rejected` option, to specify a file where +rejected molecules are written. + +The tool takes 62 seconds to process 2.4M molecules in a recent Chembl, +reporting (because of the -v option) +``` +Read 2409270 molecules, passed 1488785 0.61794 +7905 non organic +1901 isotope +6582 too few atoms 10 +253993 too many atoms 40 +18088 too few rings 1 +10255 too many rings 6 +282 too few heteroatoms 1 +2170 min heteroatom fraction 0.05 +13 max heteroatom fraction 0.9 +41203 too few aromatic rings 1 +64474 too many aromatic rings 4 +0 too few aliphatic rings 0 +901 too many aliphatic rings 4 +22132 ring systems too large 3 +0 too many aromatic rings in system 3 +7964 ring too large 7 +84062 too few rotatable bonds 2 +60752 too many rotatable bonds 10 +14284 low TPSA 20 +3559 high TPSA 180 +294 low ALOGP -2 +56821 high ALOGP 6 +13937 too few HBA 2 +34722 too many HBA 10 +0 too few HBD 0 +13070 too many HBD 5 +1761 too many halogens 7 +0 molecules too long 30 +625 too few CSP3 1 +47640 aromatic density too high 0.8 +151095 too many chiral centres 2 +18800 too many fragments 2 +``` +Note that some rules may show zero matches because they might overlap +with previously run rules. Note that the order above is the order of +the properties in the proto definition. The actual order of computation +is as in the source file, and is generally from cheapest to most expensive. +As we learn more, the order of execution may change. 
diff --git a/docs/Molecule_Tools/pharmacophore2d.md b/docs/Molecule_Tools/pharmacophore2d.md new file mode 100644 index 00000000..7d5a2649 --- /dev/null +++ b/docs/Molecule_Tools/pharmacophore2d.md @@ -0,0 +1,244 @@ +# Pharmacophore_2d + +## Objective +Given a set of active molecules, generate a set of substructure queries that describe the +pharmacophoric features in that set of molecules. + +Generally the tool will be driven by a set of queries which define the pharmacophore. As those +pharmacophoric atoms are identified in the molecules, the query is constructed to preserve +the relative separation of those atoms. The atomic characteristics of the matched atoms can +also be specified, so there is considerable flexibility on how precisely the atom matching is +done. + +For example, if 3-aminopropanol, `NCCCO`, were an active molecule, we might define the pharmacophoric +elements to be a primary amine and an alcohol. In the starting molecules the pharmacophores are +separated by 3 bonds. + +An exact match to this query might look like +``` +name: "1aminopropanol" +query { + respect_initial_atom_numbering: true + query_atom { + id: 0 + atom_properties { + atomic_number: 7 + } + } + query_atom { + id: 4 + atom_properties { + atomic_number: 8 + } + } + separated_atoms { + a1: 4 + a2: 0 + bonds_between: 4 + } +} +``` +This query will match any N...O grouping. But note that there are no constraints on +what kind of Nitrogen or Oxygen atom (alpiphatic only). This can be controlled by the textproto config +file that controls execution. 
If we use +``` +functional_group: "SMARTS:[ND1H2]" +functional_group: "SMARTS:[OD1H]" +all_functional_groups_must_match: true + +atomic_property: ATOMIC_NUMBER +atomic_property: HCOUNT +``` +as the configuration file, the resulting query file will be +``` +name: "1aminopropanol" +query { + respect_initial_atom_numbering: true + query_atom { + id: 0 + atom_properties { + atomic_number: 7 + hcount: 2 + } + } + query_atom { + id: 4 + atom_properties { + atomic_number: 8 + hcount: 1 + } + } + separated_atoms { + a1: 4 + a2: 0 + bonds_between: 4 + } +} +``` +where we see that the Hydrogen count has been added to the atomic properties +that must be matched. Note that this information did *not* derive from the smarts. +The smarts matched the atoms, and the hydrogen count is derived from the matched +atoms. If would probably not make sense to specify a highly specific smarts, and then +only use `atomic_number` as the query property. + +Atomic properties from the enum in the proto definition can be used. +``` +enum AtomicProperty { + UNSPECIFIED = 0; + ATOMIC_NUMBER = 1; + NCON = 2; + AP_AROMATIC = 3; + RING_BOND_COUNT = 4; + HAS_PI_ELECTRON = 5; // Not implemented, do not use + PI_ELECTRON_COUNT = 6; // Not implemented, do not use + UNSATURATION = 7; + ISOTOPE = 8; + RING_SIZE = 9; + SPINACH = 10; + FUSED_SYSTEM_SIZE = 11; + HCOUNT = 12; +} +``` +Any number of these properties can be combined. All properties will be transferred +from the matched atom to the query file, and so all properties must be matched. + +## Fuzzy matching. +In the above example, the match and the `separated_atoms` directives are exact. We can +obtain less precise matches by turning on directives in the configuraiton file. 
+ +``` +functional_group: "SMARTS:[ND1H2]" +functional_group: "SMARTS:[OD1H]" +all_functional_groups_must_match: true + +min_separation: 3 +max_separation: 8 +delta_longer: 4 +ncon_becomes_min_ncon: true +hcount_becomes_min: true + +atomic_property: ATOMIC_NUMBER +atomic_property: HCOUNT +atomic_property: NCON +``` +generates the query file +``` +name: "1aminopropanol" +query { + respect_initial_atom_numbering: true + query_atom { + id: 0 + atom_properties { + atomic_number: 7 + min_ncon: 1 + min_hcount: 2 + } + } + query_atom { + id: 4 + atom_properties { + atomic_number: 8 + min_ncon: 1 + min_hcount: 1 + } + } + separated_atoms { + a1: 4 + a2: 0 + max_bonds_between: 8 + } +} +``` +where now some of the atomic properties are specified as minimum rather than specific +values. And bonds_between is now max_bonds_between, because of the `delta_longer` setting. + +The proto definition enables setting of several attributes that enable conversion of +specific values in the molecules to min/max values in the query proto. + +As a check, the query generated from a molecule should match the starting molecule. + +A more complex example might involve a pharmacophore defined by an aromatic nitrogen +atom and an amide group. In addition, this shows how the number of rotatable bonds +between pharmacophoric atoms can be specified. 
+``` +functional_group: "SMARTS:[/IWfss2nr5]" +functional_group: "SMARTS:O=C-[NR0]" +all_functional_groups_must_match: true + +min_separation: 5 +max_separation: 10 + +atomic_property: ATOMIC_NUMBER +atomic_property: AP_AROMATIC +atomic_property: SPINACH +atomic_property: FUSED_SYSTEM_SIZE + +extra_rotbond: 3 +``` +which might generate a query file such as +``` +name: "CHEMBL1471636" +query { + respect_initial_atom_numbering: true + query_atom { + id: 7 + atom_properties { + atomic_number: 7 + aromatic: true + fused_system_size: 2 + match_spinach_only: 0 + } + } + query_atom { + id: 10 + atom_properties { + atomic_number: 7 + match_spinach_only: 1 + } + } + query_atom { + id: 11 + atom_properties { + atomic_number: 6 + match_spinach_only: 1 + } + query_bond { + btype: SS_SINGLE_BOND + other_end: 10 + } + } + query_atom { + id: 12 + atom_properties { + atomic_number: 8 + match_spinach_only: 1 + } + query_bond { + btype: SS_DOUBLE_BOND + other_end: 11 + } + } + separated_atoms { + a1: 12 + a2: 7 + bonds_between: 5 + max_rotbond: 5 + } +} +``` +### Example +If we were to look for pharmacaphore equivalents for Dasatinib, that input file +might look like +``` +functional_group: "SMARTS:[OD1]CCN1CCNCC1" +functional_group: "SMARTS:O=C-Nc1c(Cl)cccc1C" +delta_shorter: 2 +delta_longer: 7 +atomic_property: ATOMIC_NUMBER +atomic_property: NCON +atomic_property: AP_AROMATIC +atomic_property: RING_BOND_COUNT +atomic_property: UNSATURATION +atomic_property: RING_SIZE +``` + diff --git a/docs/Utilities/GFP_Tools/AAReadme.md b/docs/Utilities/GFP_Tools/AAReadme.md deleted file mode 100644 index 68b894f8..00000000 --- a/docs/Utilities/GFP_Tools/AAReadme.md +++ /dev/null @@ -1,286 +0,0 @@ -# GFP Tools - -## Background -The `gfp` toolset is a set of tools for dealing with molecular similarity. - -An early realisation was that there are an umlimited number of different ways -in which molecular similarity can be measured. 
Additionally, composite fingerprints, -which might consist of multiple different fingerprints, can be extremely useful in -model building, and in concordance with human perception. - -The `gfp` toolset is file based set of similarity tools, founded on the principle of -one or more fingerprints defining the similarity measure to be used. Performance -is major design goal. - -## gfp_make -The most convenient way to generate a fingerprint file is via `gfp_make`. This takes -one or more structure files and passes those molecules to one or more -executables that add a fingerprint to the output. Output is a Daylight Thor -DataTree form - an obsolete, but useful format for multiple fingerprints. The -fingerprints themselves are encoded via Daylight's du_bin2ascii function. - -For example, if one were to use `gfp_make` to generate a fingerprint file containing the -MACCS keys (a variant thereof) and a linear path fingerprint, that could be done via -``` -gfp_make -MK -IW file.smi > file.gfp -``` -and the resulting .gfp file might look like -``` -$SMI -FPIW -PCN -FPMK<10EU.4A2.+X.Lp00kn03nE9CuYUh.k6d3;192;61;192;61;1> -| -``` -This record consists of the smiles of the starting molecule in the `$SMI<` tag. Then follows -an encoded linear fingerprint `FPIW`. Then follows the name of the molecule `PCN`. The last -component is the MACCS keys. The delimiter separating records is the `|` character. - -Any number of fingerprints can be in a `.gfp` file. The order can be important, but if all -fingerprint files are made by `gfp_make`, fingerprints will be inserted in a consistent -order. - -Both fingerprints above have the number of bits and the number of bits set at the -end - the 2048 bit linear path fingerprint is truncated above. - -## Fingerprint Types -There are two, orthogonal, attributes of a molecular fingerprint. - -1. The molecular subset it describes -2. How it is represented. 
- -In terms of molecular subset, the most popular kinds of fingerprints are - -* Linear path fingerprints -* EC type shell fingerprints -* Atom Pair fingerprints -* Topological Torsion (short path) fingerprints -* Dictionary based fingerprints. - -Each of these can be instantiated as either fixed, constant width, binary form or as a -sparse, counted form. Each of these has advantages and disadvantages. - -By convention, all fixed width fingerprints will have a tag that starts with `FP`, and -all sparse, counted fingerprints will have a tag starting with `NC`, for Non Colliding. - -## Collisions -Many fingerprint types can generate very large numbers of features, many more than the -length of a typical fixed length fingerprint - frequently 1k or 2k. - -If a fingerprint that might assume a very large number of possible values is converted -to a fixed form, collisions are inevitable. - -## Performance -Fixed width binary fingerprints can be processed extremely quickly using special hardware -instructions. But they may suffer from collisions, and they are insensitive to repeated -features in the molecule - once the bit is set, it does not matter if the molecule has -one or twenty one instances of that bit. - -Sparse, counted, non colliding fingerprints on the other hand do not suffer from collisions, -all original bit numbers are preserved. But they cannot be processed nearly as quickly -as fixed width binary fingerprints. But they do retain count information, and so -accurately differentiate molecules with differing instances of a feature. - -Generally fixed binary fingerprints work well in practice, and comprise the default -set of fingerprints generated by `gfp_make`. When build svm fingerprint models, -sparse, counted fingerprints generally work better. - -## The standard set. -Much of the use of `gfp_make` has been for finding molecules similar to other -molecules, that are concordant with Chemist expectations of molecular similarity. 
For -interactive uses, fixed binary fingerprints are preferred. But these fingerprints -suffer from the repeated features flaw mentioned previously. - -### Molecular Properties -One component of the standard set is a set of 8 molecular properties - -* Number of atoms -* Number of rings -* Largest ring size -* Ring atoms -* Number aromatic atoms -* Number fused ring atoms -* Heteroatom count -* Unsaturation count - -These are combined as a `min/max` ratio of each, generating a similarity in the -range `[0,1]`. Making these molecular properties part of the overall similarity -measure, helps mitigate the repeated feature problem. This will show up in -the fingerprint file with the tag `MPR`. - -Another component of the standard set is a linear path fingerprint, which -traces all paths up to length 7 and hashes them to a fixed width, 2048 bit -fingerprint. `FPIW`. - -The other two components are based on a modification of the MACCS keys. This variant -has been heavily modified from the original implementation and now contains 192 -bits. You will see two tags in the fingerprint, `FPMK` and `FPMK2`. The first is -a dictionary based fixed width fingerprint, where a bit is set if the query matches. - -`FPMK2` is somewhat unusual. In order to help with the repeated feature issue, bits in -this fingerprint are only set if the feature occurs more frequently than the average -number of times that bit is hit in a large collection of molecules. So, if the -average molecule (that has that feature) has an average of 2.3 occurrences, `FPMK2` will -only be set for those moleules containing 3 or more instances. - -These fingerprint components are combined with equal weights to yield a composite -fingerprint. Over many years of use, this fingerprint combination seems to do very -well in terms of agreeing with Chemists' notions of molecular similarity. - -## Dictionary based fingerprints. -There are very few dictionary based fingerprint types built into `gfp_make`. 
The most -commonly used will be - -* maccskeys -* pubchem -* abraham -* tsubstructure. - -### tsubstructure -You can add your own dictionary based fingerprint by using `tsubstructure`. The simplest -way of doing this is to create a file of smarts (or a file of queries). Provide that -to gfp_make via -``` -gfp_make -TSUB S:/path/to/smarts_file file.smi > file.gfp -``` -If you have a file containing the names of query files, change `S:` to `Q:`. This -is the same syntax that the -q option to `tsubstructure` uses. The fingerprint will -show up as `FPTSUB`. There is no provision for multiple -TSUB options - this is -probably a bug that should be fixed. - -By using the `-tsubnc` option, `tsubstructure` will generate a non colliding -counted fingerprint, with tag `NCTS`. - -Early studies demonstrated that generic dictionary based fingerprints seldom do well in -models, and so they are typically not explored extensively during model building. -Custom dictionary based fingerprints, introduced via `-TSUB` can be very useful in -models. - -A common use might be to take some isotopically labelled fragments (possibly -from dicer) and form a fingerprint which is the count of the number of instances -of that fragment. - -``` -gfp_make -TSUB M:/path/to_file.smi%onlysubiso ... -``` -will do that. - -## Fingerprint Weights - -In any system in which multiple fingerprints are combined, the question of weighting -arises. By default, most `gfp` tools use all fingerprints in the input file, and -assign them equal weights in the overall similarity measure. - -This can be overridden on the command line, at the cost of some complexity. - -If you need to do this, all components of the fingerprint will need to be specified. -For example if a fingerprint was generated via -``` -gfp_make -IW:APT -MAP8:Y -TSUB:file.smi -``` -the tags in the fingerprint will be `FPIWAPT` and `NCEC3APT`. In order to use that -fingerprint file with different weights the `gfp` tool should accept - -``` -gfp_tool... 
-F FPIWAPT,w=0.3 -F NCEC3APT,nc,w=0.7 -P none ... -``` -Once we start specifying things on the command line, we need to specify everything. -While the default fingerprint type is fixed binary, the `NCEC3APT` fingerprint -needs the `nc` qualifier to tell the tool that this is a non colliding fingerprint. -In addition, we need to tell it that there are no molecular properties in the file. - -Full command line initialisation of the `gfp` environment is very complex and is -detailed below. - -## Arbitrary GFP file -While most fingerprints can be constructed within `gfp_make` there may be circumstances -where a complex fingerprint is constructed externally, and it needs to be combined with -other fingerprints. - -`gfp_make` supports a `-JOIN` option which allows joining an external file into the -fingerprint being generated. - -``` -gfp_make -IW -EC3:APT -JOIN file.special.gfp file.smi > file.gfp -``` -will have `FPIW`, `NCEC3:APT` and whatever tag is in `file.special.gfp` in the -output. Alternatively, you can generate fingerprints with `gfp_make` and use -`tdt_join` to merge in an external file - that is what `gfp_make` is doing -internally. - -## Tools -The distribution comes with several tools for dealing with these `tdt` fingerprint files, look -for tools containing `*tdt*`. -. fetch_tdt -. fetch_tdt_quick -. tdt_join -. tdt_sort -. tdt_stats - -## Command Line Initialization -Most `gfp` tools support `-F` and `-P` options. The `-F` option controls fingerprints, and -the `-P` option controls use of molecular properties in a calculation. - -The `-P` option is easy. It can be one of two values - -* MPR -* none - -using -``` -gfp_make -MPR ... file.smi > file.gfp -``` -generates a fingerprint that contains encoded properties in the `MPR` tag. Use the first -form if molecular properties are in the gfp file, and you want to use them in the -similarity calculation. Use `-P none` to not consider molecular properties - even if -they might be present in the input file. 
- -``` -gfp_make -IW -MPR file.smi > file.gfp -``` -has both `MPR` and `FPIW` tags. But then -``` -gfp_nearneighbours_single_file -n 1 -F FPIW -P none file.gfp > file.nn -``` -will only use the `FPIW` fingerprint. If used without `-F` or `-P` options, the -similarity measure would be an equally weighted combination of `FPIW` and `MPR`. - -This principle applies to all fingerprints. Whatever is specified on the -command line governs what is used - regardless of what might be in the file. - -The `-F` option will generally be of the form `TAG,...` where the information -following `TAG` indicates how it is used. - -The following qualifiers to a tag are recognised - -* w= -* fold= -* hex -* HEX -* ascii -* sparse -* fc or counted -* cosine -* fvb -* rr -* forbes -* sm -* manh -* nc -* mc01 -* mcfsc -* mcsp -* scale = -* soergel -* soergelv -* ctan -* dice -* overlap - -## Atom Count Window -On many gfp tools, the `-W` option can be used to specify an atom count window. -If molecular properties are present in the fingerprint, then the number of atoms in -the molecule is known. If two molecules differ in atom count, by more than the window, -their similarity is not computed. This can result in significant speedups, at the risk -of not finding molecules that might have quite different atom counts, yet still have -a short distance. diff --git a/docs/Utilities/GFP_Tools/evidence.md b/docs/Utilities/GFP_Tools/evidence.md deleted file mode 100644 index 8c73361a..00000000 --- a/docs/Utilities/GFP_Tools/evidence.md +++ /dev/null @@ -1,87 +0,0 @@ -# Evidence - -When we collect a biological result, we usually end up with a numerical value -associated with each molecule. This data is much more than an individual number -associated with each molecule, there is a story in there. There is evidence. 
- -If we see a particular result for a molecule, and we also see very similar -results for its close neighbours, that is very strong evidence that the data -associated with our starting molecule is good. It is backed up by supporting -evidence. - -If on the other hand, we see a particular result for a molecule, but -we see very different results for its close neighbours, that raises -questions about the validity of the molecule under consideration. Is it -an activity cliff? Is it an experimental artifact or a data problem? Is -the structure correct? There is no supporting evidence for the activity -associated with that molecule. - -We also frequently encounter the case where a molecule may not have close -neighbours, but moderately close neighbours, and where the neighbours -cannot really provide strong evidence one way or another, for the activity -of the original molecules. - -## Evidence -The `evidence` tool reads an activity file and a nearest neighbours file -and, for each molecule, gathers statistics about the molecule, its activity -and the activity of its neighbours. The tool is designed for use in an -interactive structure examination tool like Spotfire, DataWarrior or similar. - -In addition to reporting minimal statistics about the nearest neighbours, -the tool calculates a number of pseudo knn 'models' for each target molecule, -and reports those. 
- -## Usage -The usage message is -``` -Computes measures of internal consistency for a series of measured values based on how -consistent the values are across the neighbours -Takes a single argument, a TFDataRecord serialized nnbr::NearNeighbours protos, such as -might be produced by the -T option to nn2proto -The following arguments are recognised - -config an EvidenceData::Options textproto file with options - -A descriptor file containing activity values for each molecule - -smiles smiles for each molecule, will be included in the output - -C data is classlfication type (not implemented) - -diff for each column generated, insert an extra column with difference from actual - -v verbose output -``` - -Unlike other tools, the options for this tool are largely driven by the contents -of the textproto configuration file provided by the `-config` option. See the -proto definition to see what options you might like. For testing I have found -this configuration useful. -``` -knn: [1, 2, 5, 10] -closest_value: [1, 2, 5, 10] -piecewise_linear { - min: 0.15 - max: 0.4 -} -``` - -This builds 4 different KNN models, reports the closest activity to the -target within the 1, 2, 5, 10 neighbours, and explores a piecewise linear -weighting function. - -The `-A` option is mandatory and is a descriptor file containing the activity data. - -If you would like smiles in the output, add the `-smiles` option to provide -a smiles file with the smiles for every identifier. - -The tool will ultimately support classification data, but that is not there yet. - -If the `-diff` option is specified, for every calculation that the tool does, it will -add a column to the output which contains the (signed) difference between the computed -value and the actual value. This should be useful when sorting in order to identify -outliers. - -## Typical Workflow - -During testing and development, I used this sequence of commands. 
-``` -gfp_make -STD data.smi > data.gfp -gfp_nearneighbours_single_file -n 10 data.gfp > data.nn -nn2proto -T Tfile -v data.nn -evidence -smiles data.smi -v -A data.activity -config evidence.textproto Tfile -``` diff --git a/docs/Utilities/GFP_Tools/fingerpring_representations.md b/docs/Utilities/GFP_Tools/fingerpring_representations.md deleted file mode 100644 index d8ea5702..00000000 --- a/docs/Utilities/GFP_Tools/fingerpring_representations.md +++ /dev/null @@ -1,7 +0,0 @@ -# Fingerprint Representations - -## Background -Similarity calculations in LillyMol are generally done with composite fingerprints. -Generally we find that all fingerprints suffer from various deficiencies in how -the capture molecular similarity, either from a human perception perspective, or -from performance in a model. diff --git a/docs/Utilities/GFP_Tools/fingerprints_as_descriptors.md b/docs/Utilities/GFP_Tools/fingerprints_as_descriptors.md deleted file mode 100644 index dc8890c7..00000000 --- a/docs/Utilities/GFP_Tools/fingerprints_as_descriptors.md +++ /dev/null @@ -1,207 +0,0 @@ -# Fingerprints as Descriptors - -## Background - -A common need is to explore use of gfp fingerprints in other contexts, for -example with similarity metrics other than Tanimoto, or to use the features -in a model that requires tabular data input. - -There are two approaches to this. - -## gfp_to_descriptors gfp_to_descriptors_multiple - -Both of these tools read a fingerprint file and convert the fingerprint -data contained into tabular form. The first was developed first and can -only handle a single fingerprint. The second one was built later and can -handle multiple fingerprints in the input file. Generally the second one -should be used. - -### TLDR -``` -gfp_make.sh -ECC3 rand.smi > rand.gfp -gfp_to_descriptors_multiple rand.gfp > rand.dat -``` -For 2000 random Chembl molecules, this generates an output fle with 2001 rows -(includes a header) and 30.8k columns. 
This means that across those 2000 -molecules there wee 30.8k different features generated. - -### Usage -The usage message of `gfp_to_descriptors_multiple` is -``` -Converts a fingerprint set from a gfp file to a descriptor file - -F specify tag of the fingerprint to be processed, standard .gfp syntax - -x discard bits that hit less than of the time - -n discard bits hit in less than of the fingerprints - -X discard bits that hit more than of the time - -N discard bits hit in more than of the fingerprints - use either -x or -n, and either -X or -N - -y discard records unless they have at least non zero values - -s sort columns by frequency (within fingeprint) - -I prefix for descriptors produced - -u gsub space to _ in multi-token names - -d fold sparse fingerprints to constant width - -q if a bit is absent write as -1 rather than 0 - -o output separator (def ' ') - -v verbose output -``` -Just like any gfp tool, you don't need to specify the fingerprint (-F) -unless you with to. - -### Support -Many of the features generated will appear in only one, or a small number of -molecules, and may not be useful. You can specify the support that a feature -must have for inclusion via one of two different means. - -The `-n -N` combination allows specifying lower and upper bounds on the number -of molecules that contain a feature. Just as a very rare feature may not be -informative, neither will an extremely common feature. Alterntatively the -support requirements can be expressed in fractional form via the `-x -X` options. 
- -In the example above running -``` -gfp_to_descriptors_multiple -n 10 -N 1990 rand.gfp -``` -reports -``` -Will discard bits with fewer than 20 molecules hit -Will discard bits with more than 1980 molecules hit -Auto sized for - 1 sparse fingerprints -Setting 1 sparse fingerprint weights to 1 -Non colliding fingerprint 0 'NCEC3C' weight 1 -Read 2000 fingerprints from '/tmp/rand.gfp' -Found 30874 bits set in NCEC3C fingerprint -30177 bits below threshold 20 -0 bits above threshold 1980 -Will produce 697 descriptors -``` -So a requirement that a feature appear in 1% of the molecules, results in -the number of features generated from 20k to 697. This dramatic reduction is -because this is a randomly chosen set of molecules. Most sets of molecules -being studied will have considerable internal similarity, and imposing -a support requirement will not have such a dramatic impact. For example -a 2750 member SAR dataset reports -``` -Will discard bits set less than 0.01 of the time -Will discard bits set more than 0.99 of the time -Auto sized for - 1 sparse fingerprints - Setting 1 sparse fingerprint weights to 1 - Non colliding fingerprint 0 'NCEC3C' weight 1 - Read 2746 fingerprints from '/tmp/data.gfp' - Will discard bits that occur in fewer than 0.01 molecules - Will discard bits that occur in more than 2719 molecules - Found 15412 bits set in NCEC3C fingerprint - 14525 bits below threshold 27 - 2 bits above threshold 2719 - Will produce 885 descriptors -``` -so 2750 related molecules generate more features than 2000 random molecules. -And there were only 15k bits found, rather than the 30k found in the random set. - -### Fixed Width Fingerprints -`gfp_to_descriptors_multiple` can also generate a fixed width tabular output. -In the case of sparse fingerprints, this will necessarily come at the cost of -possible collisions, where arbitrary feature numbers get hashed to the same -value. 
For example on a recent SAR dataset running -``` -gfp_to_descriptors_multiple.sh -d 256 -v -x 0.01 -X 0.99 train.gfp -``` -reports -``` -Will discard bits set less than 0.01 of the time -Will discard bits set more than 0.99 of the time -Auto sized for - 1 sparse fingerprints - Setting 1 sparse fingerprint weights to 1 - Non colliding fingerprint 0 'NCEC3C' weight 1 - Read 2746 fingerprints from '/tmp/data.gfp' - Will discard bits that occur in fewer than 0.01 molecules - Will discard bits that occur in more than 2719 molecules - Will fold fingerprints to a constant width of 256 bits - Found 15412 bits set in NCEC3C fingerprint - After folding to 256 bits, how many of the initial bits hit each fixed width bit - 126 of 256 fixed width bits never set - Of bits set, set between 1 and 483 ave 6.80769 - 14525 bits below threshold 27 - 2 bits above threshold 2719 - Will produce 256 descriptors - 77 of 256 bits with collisions (fraction 0.300781) max collision count 482 -``` -Indeed, folding those 15k bits found, or the survivors of the support requirements -to just 256 bits, does indeed result in significant bit collisions. - -Note that bits are scanned before folding in order to impose support requirements. - -## Generate Fixed Width Fingerprints Directly -While `gfp_to_descriptors_multiple` is a suitable means of converting fingerprints -to tabular form, some fingerprint generators can generate tabular output -directly. - -### iwfp -This tool generates linear fingerprints - like the traditional Daylight fingerprint. -This is a very old tool, and internally, fingerprints are generated as fixed width - -most more recent tools use a Sparse_Fingerprint_Creator object to hold a hashed -set of bits and counts. So `iwfp` can directly generate tabular output of fingerprints. -``` -iwfp -P UST:AQY -a -a -g all -l file.smi > file.dat -``` -will generate a 2001*2049 file (file.smi contains 2000 molecules). 
The number -of columns can be controlled via the -c option - it defaults to 2048. Note that -two `-a` options are needed in order to get counted features, just one instance -results in binary output. - -### iwecfp -This is LillyMol's extended connectivity (Morgan) fingerprint generator. You can -generate descriptor file output via the `-Y desc=` option combination, generating -\ features. - -``` -iwecfp -R 3 -v -P UST:AHY -Y desc=1024 file.smi > file.dat -``` -generates a descriptor with 1024 columns. Note that because we used the `-v` option -we get a report on the bits set per molecule -``` -Fingerprints had between 16 and 138 ave 75.996 bits set -``` -which is typical of EC type fingerprints. Note that there is not a strong need -for many descriptors, 1024, or even narrower, will see few collisions. - -Note too that if we use a simpler atom typing, `-P UST:Y` we get -``` -Fingerprints had between 14 and 116 ave 63.9895 bits set -``` -We can see the influence of using the more complex fingerprint. A very -complex atom type `-P UST:ACFHLOPSZ` results in -``` -Fingerprints had between 16 and 160 ave 81.679 bits set -``` -Adding one round of Morgan type expansion to this `-P UST:ACFHLOPSZ1` yields -``` -Fingerprints had between 16 and 186 ave 93.869 bits set -``` -Contrast this with linear fingerprints where -``` -iwfp -B -P UST:ACFHLOPSZ1 -a -a -``` -yields -``` -Bits hit between 32 and 1741 average 756.212 -``` -a dramatically different result, typical of the difference in bits generated -by EC and linear type fingerprints. But generally EC type fingerprints do better -in models. - - -### extended_atom_pairs -This is LillyMol's atom pair fingerprint generator. You can generate descriptor -file output via the `-G desc=` option combination, generating \ features. 
- -``` -extended_atom_pairs -G desc=1024 -v -P UST:ACY -``` -reports -``` -Fingerprint 'NCYAC<' between 10 and 528 ave 179.722 bits set -``` - diff --git a/docs/Utilities/GFP_Tools/gfp.md b/docs/Utilities/GFP_Tools/gfp.md deleted file mode 100644 index bdc45039..00000000 --- a/docs/Utilities/GFP_Tools/gfp.md +++ /dev/null @@ -1,479 +0,0 @@ -# GFP -Within LillyMol the GFP framework is used for similarity computations. - -One of the unusual features of GFP is the idea that similarity is -a composite measure, derived from one or more fingerprints. Also -unusual is the use of molecular properties as a component of -similarity. - -All fingerprints have advantages and disadvantages, and what we have -found is that combining fingerprint types can offer significant -advantages. - -## Theory -There are an unlimited number of ways in which molecular similarity -can be measured. Much of drug discovery can be boiled down to a quest -to replicate similarity as perceived by a biological target. If we have -a similarity measure that is closely aligned with how a target perceives -similarity, we may be able to reliably identify active molecules. - -Possible similarity measures might include things like - -* 3D shape similarity -* 3D charge distribution. -* 3D pharmacaphore score -* Docking score. -* Molecular dynamics trajectory -* Custom 3D scoring function -* Distance in a molecular descriptor space -* Distance in a learned embedding space -* 2D molecular fingerprints. -* Innumerable other things not mentioned above. - -In general, it will be hard to predict which of these methods will be -most useful - if you have a 3D enabled target, you should consider using -that information. But in general, starting with a new problem, it is -hard to predict which similarity based methodology will work best. - -Here we focus on one item, 2D molecular fingerprints. - -### Fingerprints -Molecular fingerprints have a well known history in Cheminformatics. 
They can -work very well, and are a generally accepted methodology. They can be fast to -compute and fast to use. There are many examples of 2D fingerprint based methods -out-performing more expensive 3D methods, although there are also plenty of -examples where targeted 3D methods are demonstrably superior. - -The most common kinds of fingerprints are - -* Linear path based fingerprints -* Extended connectivity shell based fingerprints -* Atom pair fingerprints -* Reduced graph fingerprints -* Dictionary based fingerprints - -Many other forms are possible, as will be seen below. - -Generally fingerprints are based on subsets of the atoms in a molecule. These may -be contiguous sets of atoms, or disconnected forms - atom pairs. - -#### Binary vs Counted -Completely separate from the shape of the atomic subsets that define the features -of a fingerprint is whether to compute presence or absence of these features or to -keep track of the number of instances in each molecule. Certainly binary fingerprints -have a significant advantage in terms of speed, since they can be evaluated using -popcount hardware instructions. Several publications have studied differences in -performance between counted and binary fingerprints, and most have reported in -favor of binary. That is different from the experience at Lilly, where for model -building, counted fingerprints generally perform better. But there is a major -run-time performance penalty for using counted fingerprints - depending on the -application. - -We will see that the GFP framework can deal equally with counted and binary fingerprints. - -### gfp_make -This tool is used for generating fingerprints. See [gfp_make](gfp_make.md). All -gfp tools require fingerprint input, and usually those are generated by gfp_make. 
-By convention, fingerprint files have the suffix `.gfp`, so a typical invocation -will usually look like -``` -gfp_make file.smi > file.gfp -``` -and the resulting file can then be used by any of the `gfp_*` tools. Mostly the -`gfp_*` tools have no idea they are dealing with molecules, they just process -fingerprints. - -### Fingerprint Files -The default format for a fingerprint file is a Thor DataTree (TDT) form as -defined by Daylight. Today this is an archaic file format, but works well -in this use case. It is an ASCII format, which enables use of various text -based tools. Parsing the fingerprint file is seldom a rate limiting step, -although there are exceptions to that.... - -If `gfp_make` is invoked with no arguments, the resulting fingerprint file might look like -``` -$SMI -FPIW -FPMK -FPMK2 -PCN -MPR<1EI++EI..k..2;64;12;64;12;1> -| -``` -The `$SMI<` token is the smiles of the molecule, and `PCN<` is the name. These -are only ever treated as text by GFP tools, the smiles is included just as a -convenience for subsequent processing. - -Generally smiles files must include a name, in order to populate `PCN`. Many -gfp tools will run if the name is empty, but generally this will cause problems. -All molecules should have names! - -There are several tools that can help process TDT type files, these include -``` -tdt_join -tdt_merge -tdt_sort -tdt_stats -``` -those will be covered elsewhere. - -The default fingerprint, above, consists of 4 components. - -#### FPIW -A linear path based fingerprint. Conceptually similar to the Daylight fingerprint, -or RDKit linear fingerprint. A default atom typing is used, and paths up to length -7 are perceived. The fingerprint is 2048 bits. - -This is a binary, fixed width fingerprint. By convention, all fixed width, binary -fingerprints start with 'FP'. 
- -#### FPMK FPMK2 -LillyMol contains an implementation of the MACCS keys, although the current -implementation has diverged considerably from the original definition, currently generating -192 features. While FPMK is a traditional dictionary based fingerprint, -recording the presence or absence of each of the 192 features in the dictionary, FPMK2 -is slightly different. - -A well known shortcoming of binary fingerprints is their insensitivity to repeated -features. This can lead to what to us look like significantly different molecules -being classified as similar - because the measure of similarity does not -recognised differing numbers of a feature. In order to help mitigate this, the MK2 -fingerprint is a binary fingerprint that is only set if the number of features -is above some threshold. That threshold is determined by profiling a large -collection of molecules, and figuring out, for those molecules when the feature -is present, how many times is average. Once that is known, the feature will only -be set if the number of instances is above that threshold. Median might be a better -measure than average... - -#### MPR -In a further attempt to deal with repeated features, the MPR fingerprint component -encodes 8 molecular properties that are incorporated into the overall measure -of similarity. This is NOT a binary fingerprint, but contains encoded values for -the following features - -1. Number heavy atoms -2. Largest ring size -3. Number of rings -4. Number of ring atoms -5. Number of aromatic atoms -6. Number of fused ring atoms -7. Number of heteroatoms -8. Number of unsaturated atoms. - -These are combined as ratios. For example if molecule 1 contains - -* natoms = 20 -* largest_ring = 6 - -and molecule 2 contains - -* natoms = 10 -* largest_ring = 7 - -That component of the similarity calculation is `10/20 + 6/7`. Across all -properties, the min is divided by the max, generating a number that is always -in the range [0,1]. 
- -There are of course an unlimited number of molecular properties that could be -used for this purpose. Other properties that would seem like very good candidates -include - -* longest path -* atoms in scaffold -* size of largest ring system -* number linker atoms -* rotatable bonds - -Although clearly some of these would be more expensive to compute than the current -set of features. - -These have just been added as an optional usage in gfp_make. -``` -gfp_make -MPR=none,natoms,nrings,rotbond ... -``` - -See the section below for an expanded discussion of molecular properties in gfp. - -## Other Fingerprints -gfp_make can generate a very large number of fingerprints. There may be one -or more versions of -* Linear Fingerprints -* EC fingerprints -* Atom Pair fingerprints -* Topological Torsion fingerprints -* Molecular Property fingerprints -* MACCS keys fingerprints -* PUBCHEM fingerprints -* clogp, BioByte or Marvin -* psa -* ErG -* Other reduced graph forms. -* CATS -* iwdescr properties -* Structural abstraction fingerprints -* Ghose Crippen fingerprints -* Ring fingerprints -* Ring substitution fingerprints -* A fingerprint defined by tsubstructure matches -* Fingerprints derived from 3d structures (needs corina). - -Many of these fingerprints can be further modified by atom type and whether they are -fixed binary forms, or sparse, non-colliding forms. Just taking all pairwise -combinations of the 19 types mentioned above yields 171 different fingerprints. See -[atom typing](/docs/Molecule_Lib/atom_typing.md) for information on atom typing in LillyMol -and how use of multiple atomic properties could explode the number of possibilities. - -gfp_make is separately documented [gfp_make](gfp_make.md). - -### Multiple Fingerprints -One of the hallmark features of gfp is the use of multiple fingerprints. All -gfp tools are designed to examine their input file(s) and determine which fingerprints -are present, and to use those fingerprints for the calculation. 
This can be any -number of fingerprints. If molecular properties are present, they will be used -as just another 'fingerprint'. - -When there are multiple fingerprints in a file, the default similarity measure is -the average of the components. You can usually override that on the command line. For -example if you had a file with two fingerprints -``` -gfp_make -FPAB -FPXY file.smi > file.gfp -``` -and wanted to find the nearest neighbours within that file, but you wanted -to provide a higher weight to the AB fingerprint. -``` -gfp_nearneighbours_single_file -F FPAB,w=0.8 -F FPXY,w=0.2 file.gfp > file.nn -``` -would do that. Note that the 'FPAB' and 'FPXY' are *not* the same as the options -given on the command line, they must be the same as the tags placed in the file. -So, specifying -FPAB on the command line might result in a fingerprint tag 'FPAB2' -which might be the result of some default condition having been applied. The -only way to know for sure is to look in the fingerprint file. - -This mechanism gets more complicated if sparse, non colliding fingerprints are -involved. Let's imagine in the case above that FPXY produces a non colliding -fingerprint. By convention, these start with 'NC'. But if you specify weights -on the command line, all conventions are discarded, and you are in complete -control. Imagine that specifying -FPXY to gfp_make results in a fingerprint -with tag 'NCXY' in the fingerprint file. -``` -gfp_nearneighbours_single_file -F FPAB,w=0.8 -F NCXY,w=0.2,nc file.gfp > file.nn -``` -you need to tell it that NCXY is a non colliding fingerprint. - -If molecular properties are present, a weight can be specified via the -P -option, `-P 0.1` or `-P none` to not use the molecular properties that might -be in the fingerprint file. - -Many tools consume multiple fingerprint files. It is essential that a -common set of fingerprints be in both files. 
Generally the order of the -fingerprints for each molecule does not matter, so something like -``` -$SMI -PCN -FP1<....> -NC2<.............> -MPR<...> -| -``` -and -``` -$SMI -PCN -MPR<...> -NC2<.............> -FP1<....> -| -``` -can usually be compared. When multiple files are being processed, the -first file read will define what fingerprints must be present in all -subsequent files. There can be extra fingerprints in those other files, -but those found in the first one must be present. Across tools it is -unpredictable as to which file is opened first. - -Bottom line - make things easy for yourself and use fingerprint files -both generated by the same options to gfp_make. - -### Other Fingerprint Forms -The default encoding used for fingerprints is something defined by Daylight. -GFP tools can consume fingerprints encoded in several different forms. For example -if you have hex encoded bits, `-F FPHS9,hex` means that the fingerprint tag -'FPHS9` is a fixed width fingerprint, lowercase hex encoded. Adding 'HEX' means -it is uppercase hex encoded. Adding 'ascii' means it is the most verbose of -all forms '00011101'. - -Dense fingerprints can be read from a sparse representation by adding '-F FPAB,sparse' -``` -FPAB<3,4,7-10,15:64> -``` -where the ':64' specifies that the number of bits in this fingerprint is -64. - -Libsvm fingerprints can be read as sparse fingerprints via -``` --F NCFP,sparse,nc -``` -where the input file might look like -``` -NCFP<3:1 10:7 20:1 31:4> -``` -for a fingerprint with 4 features. The features, 3, 10, 20, 31 must be in order. - -The gfp framework also has a fixed width, counted fingerprint form, with the -counts encoded as bytes. This is not often used, but processing these fingerprints -can be a lot faster than processing sparse fingerprints. - -## Distance measures. -For all fingerprints, the default similarity measure is Tanimoto, but it is -possible to apply different similarity metrics to individual fingerprints. 
- -For example if you wished to apply a cosine similarity measure to fingerprint -`FPXYZ` and a regular Tanimoto to `NCAB3`, that would be specified as -``` --F FPXYZ,cosine -F NCAB3 -``` -and of course all previously mentioned tokens can be appended as needed. The -following tokens at the end of a fingerprint tag specification are recognised - -* w=weight -* fold=nfold fold a binary fingerprint `nfold` times -* nc fingerprint is sparse, non colliding form -* hex lowercase hex form -* HEX uppercase hex form -* ascii '01' form for binary fingerprints -* sparse libsvm form for sparse fingerprints -* counted fixed size counted fingerprints -* cosine use cosine similarity -* fvb Flinger, Verducci and Blower modified Tversky -* rr Russel Rao similarity -* forbes Forbes similarity -* sm Simple Matching -* manh Manhattan distance -* soergel, soergelv Soergel - and variant -* ctan continuous Tanimoto -* dice Dice similarity -* overlap overlap distance - -Most of these have been added for a specific project and are seldom used. They -are not fully described here. Most are well known. - -There are also other fingerprints related to a failed multi-valued fingerprint idea, -initially intended to deal with conformers. Do not even think about -trying to use those. Other features in the code, do not use, they should be -removed. - -It is possible that, for any given project, one or more of these other -distance measure might be better than Tanimoto. We seldom take the time to -explore that. - -### Molecular Properties. -The most commonly used molecular propties, those activated with the -defalt -MPR option were discussed above. There is however a great deal more -flexibility available with molecular features in the GFP framework. - -Molecular properties can be of either integer, or floating point forms. The -default set is integer form, and those are typically used via the ratio -scheme previously outlined. 
- -Adding '-P dice' will result in the Dice similarity measure being used -for the integer molecular properties. - -Continuous molecular properties can be specified via '-P desc=...'. These must be -encoded as -``` -DSC<0.1 3.14 8 0 4.8> -``` -where the tag can be anything, although DSC is used by convention - but not -implicitly recognised by GFP. - -Several -P possibilities are recognised. The first component of the flag should -start with the tag of the descriptors, 'DSC' above. Somewhere within the tag -there must be a 'desc=' directive. - -* -P tag,desc= specify the TDT tag containing molecular descriptors (DSC above). All -the features in the input are used. - -* -P tag,desc=all Same as the previous, all features used. - -* -P tag,desc=n Only use the first 'n' descriptors in the input - -* -P tag,desc=5,w=0.1,cartesian tag, use first 5 descriptors, use Cartesian distance. -But beware, unless the values are unit vectors, this will result in invalid distances. - -* -P tag,desc=5,w=0.1,dice tag, use first 5 descriptors, use Dice distance. - -* -P tag,desc=,w=0.1,scale=0.1 tag, all descriptors, weight, scale all values on input. - -* -P tag,desc=all,w=0.2,exp1= - -* -P tag,desc=10,w=0.1,exp2= - -The usual way in which molecular descriptors are added to a gfp file is via -`gfp_add_descriptors` which takes as arguments a descriptor file and a gfp -file and will match up the files via the identifiers, and insert an extra -dataitem in each fingerprint. - -Note that computationsl involving molecular properties tend to be quite -slow compared with bit comparison operations. - -### Tversky -Many tools support a '-V' option, which allows specification of Tversky -parameters. It is always super confusion which way the parameters go, and -it is often easiest to try ot both ways and see. - -For example, start with benzene and do a similarity search into a larger -set with the opposite Tversky settings. 
-``` -gfp_make -NCIW benzene.smi > benzene.gfp -gfp_make -NCIW haystack.smi > haystack.gfp -gfp_lnearneighbours -n 10 -p benzene.smi -V a=2 -V b=0 haystack > benzene.nn -``` -we find that any molecule that contains a benzene atom matches with -zero distance. These might include -``` -COC1=C2C(=CC(=C1)N)SC(N)=N2 CHEMBL587076 -C1=C(O)C=CC(=C1)[C@H]1C[C@@H]1CN CHEMBL492304 -C(=N)(N)NN=CC1=CC=CC=C1 CHEMBL1183425 -N(C)(C1=CC=CC=C1)CCCBr CHEMBL2380319 -C1CC2=CC(=CC=C2C(=O)C1)O CHEMBL3288306 -C1(=CC=CC(=C1C(=O)CC)O)O CHEMBL3274338 -C1=CC=C2C(=C1)[C@@H]1C(C2)[C@H]1N CHEMBL4302026 -N(=O)(=O)C1=C(N)C=CC=C1N CHEMBL167321 -N(O)(C1=CC=CC=C1)CCC CHEMBL358605 -C1=CC=C2C(=C1)NC(N)S2 CHEMBL568765 -``` -Essentially we are doing a substructure search for benzene. - -If on the other hand we reverse the parameters, we might find as matches - - -## Tools -All programmes that deal with gfp fingerprint files are prefixed with 'gfp_', -so tab completion is your friend. - -The tool families include: - -* gfp_make makes fingerprint files -* gfp_distance_matrix compute a distance matrix -* gfp_distance_filter filter fingerprints by similarity to another set -* gfp_flatten_counted turn counted bits to a max of 1 -* gfp_leader sphere exclusion clustering -* gfp_incremental_diversity how does a collection evolve -* gfp_lnearneighbours finds nearest neighbours between two fingerprint files. -* gfp_naive_bayesian Naive Bayesian models - sometimes pretty good! -* gfp_pairwise_distances Pairwise distances from a larger set. -* gfp_profile_activity_by_bits Compare bit presence against activity -* gfp_single_linkage Single linkage clustering -* gfp_spread* maximum diversity selection -* gfp_standalone operates on a smiles file, computing fp's on the fly -* gfp_to_descriptors* convert gfp files to descriptor form. -* gfp_svmfp_score* scores an svmfp model. -* gfp_to_svm_lite converts gfp files to svm_learn input -* evidence builds local knn models to examine activity consisteny. 
- -These tools are described separately. Many have multiple variants. Where there -is a '_standard' variant, that is a special purpose tool that only operates -on fingerprints produced by `gfp_make -STD` and most are multi-threaded. -By dropping the complexity of dealing with arbitrary fingerprint forms, -efficiency is gained. Over time, the more flexible forms have closed that -performance gap. - diff --git a/docs/Utilities/GFP_Tools/gfp_leader.md b/docs/Utilities/GFP_Tools/gfp_leader.md deleted file mode 100644 index 96334b64..00000000 --- a/docs/Utilities/GFP_Tools/gfp_leader.md +++ /dev/null @@ -1,194 +0,0 @@ -# gfp_leader - -A clustering tool also known as sphere exclusion. - -## Use Case -The most common use case for `gfp_leader` is when you have a set of molecules -which can be ordered by some desirability function, and when you need to select -a limited number of desirable, yet diverse candidates. - -The desirability measure might be things like - -. A docking score -. One or model model scores -. Medchem demerits -. Heavy atom count -. Other... - -There is almost always some relative desirability score that can be assigned to -a set of molecules, even if just heavy atom count in order to favor smaller molecules. - -## Algorithm -A radius must be specified. The guarantee is that no two selected molecules will -be closer than this distance. What distance to use will be a function of what -fingerprints are being used. For the standard `gfp` set, typical distances might -be in the range of 0.1 to 0.2, although there is no 'correct' value. It depends on -how many molecules you have, how many you must select, and how confident you are in -the scoring function. - -If the scoring function is very good, you might be comfortable with a larger radius, -confident that the model is giving accurate scores to non leader molecules in -each cluster. 
If on the other hand you had a very weak desirability, heavy atom -count, you might want a shorter radius, since the 'model' is not very predictive -of the other molecules in each cluster. There are many factors to consider, and -choice of threshold cannot be strictly prescribed. - -Some have criticized leader for the need to have an arbitrary distance. More -expensive clustering algorithms may not properly emphasise the most desirabile -molecules, and/or have their own tuning parameters. - -It is also possible that any given choice of the threshold will yield too many, -or too few clusters. You may need to run multiple times with different thresholds -in order to get a desirable outcome. - -Leader scales well. Worst case would be if every molecule ended up in its own -cluster, a very small radius and no duplicates. That would take `n * (n - 1) / 2` -calculations. The other extreme would be if every molecule ended up in the same -cluster, which would take `n - 1` calculations. So it is always better than `n^2`. - -Leader starts with the order set of candidates. The first item is marked as selected -and then all subsequent items that are within `threshold` of that first item are -placed in the first cluster. It then returns to the beginning of the list and finds -the first unselected item. Subsequent items that are within `threshold` of that -leader are placed in its cluster. This continues until all items have been selected, -or until the number of items needed have been selected. - -## HowTo -starting with a smiles file, a likely sequence of commands might be -``` -gfp_make file.smi > file.gfp -gfp_leader -t 0.15 -v file.gfp > file.ldr -``` - -If you have a large file, you may want to run `gfp_leader_tbb` in order to -run multi-threaded. If you are using the standard fingerprints `gfp_make -STD` then -you can use `gfp_leader_std` which is also multi-threaded. - -The following options are supported by the base, serial, version. 
-``` -Performs leader clustering on a set of fingerprints -Usage - -C maximum number of clusters to find - -t specify distance threshold - -t col= threshold is column of the name field - -t tag= threshold for each molecule in dataitem - -H threshold for each molecule in dataitem - -m maximum cluster size - -M max cluster size for each molecule in - -M col=nn max cluster size is column of the name field - -S score tag - -S col=nn score is column in the name field - -I specify identifier tag - -r sort clusters by distance from leader - -E specify pool object dataitems to be echo'd (default $SMI and PCN) - -E ALL echo all dataitems from the pool file - -A file(s) of previously selected molecules - discard all within threshold - -a use as the threshold when comparing against the -A file - -L write fingerprints discarded by -A file(s) - -D ... miscellaneous options, enter '-D help' for info - -s specify max pool size (not needed) - -F ... gfp options, enter '-F help' for details - -V ... Tversky specification, enter '-V help' for details - -v verbose output -``` - -### -C ncluster -The number of clusters to form. By default, clustering will continue until -all fingerprints have been placed into a cluster. - -### -t distance -The threshold used for cluster formation. Note that a constant threshold is -used for all selections. This may, or may not be what you want. - -### -t tag=TAG -Specify a per-molecule threshold. For the most desirable molecules you may want -to use a small threshold, so only the very close neighbours get clustered together -with the most desirable molecules. Then use a larger threshold with less desirable -molecules. -``` -$SMI -PCN -THRESHOLD<0.11> -| -``` -This is the same as `-H THRESHOLD`. It is more common to place the threshold -in a column of the name, see below. 
- -### -t col=col -Instead of a TDT tag, per-molecule thresholds can be specified as part of the name -field -``` -$SMI -PCN -| -``` -In the example above, `-t col=2` means that the threshold for each molecule if -found in column 2 of the name. - -### -m max -Maximum cluster size. Once a cluster has \ members, stop adding -items to that cluster. - -### -M TAG -Specify a per molecule maximum cluster size via a TDT tag. -``` -$SMI -PCN -MAX_CLUSTER_SIZE<50> -... -``` -In this case, 'methane', if it is selected as a leader, can only form a -cluster with as many as 50 items. More commonly specifying a maximum -cluster size is done via a column in the name. -``` -$SMI -PCN -... -``` -and then specifying `-M col=2`. - -### -S tag -S col=nn -The relative score for each molecule is in a TDT tag. Usually it is easier -to sort the input file by score ahead of time. - -### -A fname -Specify a file of fingerprints containing molecules that are to be treated as -having been previously selected. For example if running two different selection -methods, a docking score and a QSAR model, you can either - -* Combine the scores into a composite measure and sort by that. -* Run leader with one method, then again with the other method. - -A workflow that needs to select 1000 moleules might look something like. -``` -sort_by_docking_score file.smi > file.sorted1.smi -gfp_make file.sorted1.smi > file.sorted1.gfp -gfp_leader -C 500 -t 0.2 -v file.sorted1.gfp > file.sorted1.ldr -nplotnn -n 0 file.sorted1.ldr > file.sorted1.sel - -# The selections from the first method are now available. Fetch those -# fingerprints from the previously generated file, or regenerate. 
-fetch_tdt_quick -c 2 file.sorted1.sel file.sorted1.gfp > file.sorted1.sel.gfp -sort_by_qsar_score file.smi > file.sorted2.smi -gfp_make file.sorted2.smi > file.sorted2.gfp -gfp_leader -A file.sorted1.sel.gfp -C 500 -t 0.2 -v file.sorted2.ldr -nplotnn -n 0 file.sorted2.ldr file.sorted2.ldr.smi -``` -The two files, 'file.sorted1.sel' and 'file.sorted2.sel' contain the -selections. - -### -a threshold -By default, unselected items are discarded if they are within the -global threshold (-t) of any previously selected item (-A). That -can be changed via the `-a` option. - -### -r -By default, when a cluster is written, it will be in the order of the -molecules in the input file - which was ordered by desirability. In some -cases it may be desirable to have cluster members sorted by distance -from the leader. - -### -s \ -The number of fingerprints in the input. Seldom needed. By default the tool counts -the fingerprints, and then allocates internal arrays. Once upon a time this -may have been slow. diff --git a/docs/Utilities/GFP_Tools/gfp_make.md b/docs/Utilities/GFP_Tools/gfp_make.md deleted file mode 100644 index 4c83b03a..00000000 --- a/docs/Utilities/GFP_Tools/gfp_make.md +++ /dev/null @@ -1,55 +0,0 @@ -# gfp_make - -gfp_make is the primary means by which structures are converted into fingerprint form. - -A typical usage might be -``` -gfp_make file1.smi > file1.gfp -gfp_make file2.smi > file2.gfp -gfp_* file1.gfp file2.gfp > result -``` -where the result is some kind of comparison between 'file1' and 'file2'. - -## Details -gfp_make is a fundamentally simple too, that converts a set of fingerprint specifications -given on the command line to a (possibly pipelined) invocation of one or more -tools that add fingerprints to an input stream. - -For example if you invode with the default arguments, and add the -v (verbose) option, -it will show what commands are executed. 
That might look like -``` -temperature -J MPR -E autocreate -g all -l file.smi | - maccskeys -E autocreate -n -J FPMK -J LEVEL2=FPMK2 -f - | - iwfp -E autocreate -J FPIW -f - -``` -where the default fingerprint, -MPR -IW -MK -MK2, have been turned into three -program invocations. The tool [temperature](/docs/Molecule_Tools/temperature.md) -initiates processing, standardising the smiles, stripping to the largest -fragment, and then writing the fingerprint, with molecular properties, to the output. -The next stage of the pipeline is an invocation of [maccskeys](/docs/Molecule_Tools/maccskeys.md) -which is instructed to generate two fingerprints, the normal, and the level 2 fingerprint -which accounts for number of times set. The resulting stream is then passed to -[iwfp](/docs/Molecule_Tools/iwfp) which adds the linear fingerprint to the -stream. gfp_make is responsible for constructing these command pipelines, which -can be of arbitrary complexity. - -### Implementation -The current gfp_make is a perl script, that was first initiated in the 1990's and -has been in continuous use ever since. It has grown considerably and is now too -complex. Work is underway on a ruby alternative that promises to be more -flexible and maintainable. The design has however proven to be remarkably -adaptable and useful, with gfp_make now also serving as the core of svmfp models. - -## Fingerprints -Numerous tools within LillyMol have been -implemented so they are compatible with the expectations of gfp_make. - -Primarily this involves two functionalities. - -1. Read a molecule and generate a fingerprint. -2. Read an already formed fingerprint stream froms stdin and insert one or more extra fingerprints. - -We see that in the above example, temperature is satisfying the first requirement, -while maccskeys and iwfp the second. The order in which various tools are added -to the pipeline is hard coded within the logic of gfp_make, and is not meaningful. 
- diff --git a/docs/Utilities/GFP_Tools/iwstats.md b/docs/Utilities/GFP_Tools/iwstats.md deleted file mode 100644 index e1a8e3df..00000000 --- a/docs/Utilities/GFP_Tools/iwstats.md +++ /dev/null @@ -1,73 +0,0 @@ -# iwstats - -`iwstats` is a command line tool that computes a variety of statistical -measures comparing an experimental set of continuous values and a -predicted set of such values. - -The observed and predicted values can both be in the same file, or can come -from different files. - -Note that this could obviously be done using R or Julia or Python but in -an environment where pre-requisites have often been difficult to achieve -this works well and is fast. And some measures have arisen at Lilly, and -this is the reference implementation. - -## Minimal Invocation. -If you have a tabular file with the experimental values in column 2 -and the predicted values in column 3 -``` -iwstats -e 2 -p 3 file.dat -``` -will yield various statistical association measures between those two columns. If -the input file has a header record, add the `-j` option, or `-s 1`. - -If the experimental results are in a different tabular file, `id expt` form -``` -iwstats -E expt.dat -e 2 -p 2 -j file.dat -``` - -## Options -The tool is complex and recognises the following options -``` -Computes Bsquared and other statistics - allows missing values - -e column for experimental (measured) values - -E activities are in a different file - -z strip leading 0's from identifiers when using -E - -p column for predicted values - -s skip records at the top of each file - -n process only the first/best records - -t compute Bsquared for sections of the data. For example, - -t 10 would report values for the first/best 10, 20, 30 values - -P sample the first/best percent of the data - -j treat as a descriptor file - -M missing value string - -q quietly ignore missing values - -R . 
randomise the sort when duplicate predicted values present - -c number of valid pairs needed for producing correlations (default 20) - -w when duplicate values present, suppress computation of best - and worst BSquared values - -r max relative error allowed computations - -T truncate predicted values to the experimental range - -h use traditional Q2 formula - -b compute distribution functions across buckets - -F calculate number of predictions outside fold of experimental (multiplicative) - -D calculate number of predictions differing by from experimental (additive) - -k just skip predicted values that have no experimental value (repeat for quiet) - -d compute Dave's cMSD rather than cMSR - -o cutoff for active/inactive (for enrichment metrics BEDROC EF ...) - -a BEDROC alpha value (default 20) - -f Enrichment Factor default fraction (default 0.5) - -u discard experimental values below - useful for studying actives only - -U keep only the most active experimental values - -m do analysis by data in column or descriptor name - -L write residuals to - -v verbose output -``` - -Some combination of `-E`, `-e` and `-p` must be used in order to specify where the -column of experimental data, and the column of predicted values are located. - -### Missing Values -Within LillyMol many tools use `.` as the missing value string. That can be changed -with the `-M` option. When missing values are present, various warnings will be -issued, but if the `-q` option is used, most of those warnings will not be issued. diff --git a/docs/Utilities/General/dicer_fragments_collate.md b/docs/Utilities/General/dicer_fragments_collate.md deleted file mode 100644 index 7185af1e..00000000 --- a/docs/Utilities/General/dicer_fragments_collate.md +++ /dev/null @@ -1,59 +0,0 @@ -# dicer_fragments_collate - -This tool aggregates multiple `dicer_data::DicerFragment` protos -into an aggregated form. 
This problem might arise in a situation -where dicer has been run across multiple sets of molecules, each -of which generated its own set of protos, which need to be -aggregated across those multiple results. - -For example if one file contained -``` -iso: ATT smi: "O[1CH]=O" par: "CHEMBL1213530" nat: 3 n: 100 -``` -and another contained -``` -iso: ATT smi: "O[1CH]=O" par: "SIGMA28760326" nat: 3 n: 50 - -``` -the final result would be -``` -iso: ATT smi: "O[1CH]=O" par: "CHEMBL1213530" nat: 3 n: 150 -``` -where the `n` attribute is the sum of the individual vales, -and the `par` value is that of the first one encountered as -the files are scanned. - -## Scalability -When run on large collections, with few restrictions on what fragments -get formed, this tool can consume large amounts of RAM, since the -data is loaded into internal hashes. - -## Arguments -The arguments are -``` -Aggregates multiple dicer_data::DicerFragment text proto files - -p minimum support level (n: value) for inclusion - -nosmi each record does not contain a leading non-unique smiles - -r report progress every items read - -minimal extract only the essential information from the protos - -tfdata data is TFDataRecord serialized protos - -v verbose output -``` - -### -p -Impose a support requirement for output. If the sum of all values is less -than the result will not be written. - -### -nosmi -The input is textproto form, and does **not** contain a leading smiles string. - -### -r -Report progress every items processed. This can be a long running task and -knowing how it is progressing can be helpful. - -### -minimal -The default mode is to store the protos in memory. With this option, only the minimal -amount of data is extracted from the protos. This can help with memory consumption. - -### _tfdata -The input data is TFDataRecord serialized protos. The output is always textproto form. 
diff --git a/docs/Utilities/General/fetch_smiles_quick.md b/docs/Utilities/General/fetch_smiles_quick.md deleted file mode 100644 index 2632128c..00000000 --- a/docs/Utilities/General/fetch_smiles_quick.md +++ /dev/null @@ -1,111 +0,0 @@ -# fetch_smiles_quick - -Extract selected rows from a file. Performs an inner join between columns -in two different files. The most common usage is to start with two files - -1. File containing identifiers of interest -2. File containing a superset of the identifiers of interest. - -``` -fetch_smiles_quick idfile file.smi > idfile.smi -``` -where the identifiers in `idfile` are assumed to be in column 1 and -the identifiers in the smiles file, `file.smi` are assumed to be -in column 2. - -Note that the order in which the output is written is according -to position in `file.smi`. If it is important to preserve the -order specified in `idfile`, use `fetch_smiles` instead. - -# HOWTO -The following options are recognised. -``` -Fetches records from one file based on identifiers in one or more other file(s) - -c identifier column in identifier file - -C identifier column in smiles file - -C RX= identifier is whichever column(s) match - -d ignore duplicate identifiers in identifier file - -q quietly ignore duplicate identifiers in identifier file - -a write all instances of identifiers in smiles file - by default, only the first is written - -X write smiles records not in to - -Y write identifiers not in to - -w write the -Y file as a smiles file (swap columns) - -k suppress addition of info from identifier file - -n string to insert between record and info from identifier file - -x invert behaviour, selection becomes deselection - -z strip leading zero's from identifiers - -j identifier file is descriptor file, skip header record - -b stop processing identifier file on error, but continue processing (dangerous) - -i column separator in identifier file - -I column separator in smiles file - -S first files are identifier files, last is 
haystack. Create many subsets - -u suffix for -S files created - -g start number for files created (-g 1 for dopattern) -``` - -# Options - -## -c_\ -Identifier column in identifier file. Default is column 1. - -## -C_\ -Identifier column in smiles file. Default is column 2. - -## -C RX=\ -Identifier is whichever column(s) match \. - -## -d -Ignore duplicate identifiers in identifier file. By default duplicate -identifiers in the identifier file is a fatal error. - -## -q -Quietly ignore duplicate identifiers in identifier file. - -## -a -Write all instances of identifiers in smiles file - by default, -only the first is written. - -## -X \ -Write smiles records not in \ to \. - -## -Y \ -Write identifiers not in < to \. - -## -w -Write the -Y file as a smiles file (swap columns). - -## -k -Suppress addition of info from identifier file. - -## -n \ -String to insert between record and info from identifier file. - -## -x -Invert behaviour, selection becomes deselection. - -## -z -Strip leading zero's from identifiers. - -## -j -Identifier file is descriptor file, skip header record. - -## -b -Stop processing identifier file on error, but continue processing (dangerous). - -## -i \ -Column separator in identifier file. Note that things like `-i tab` are -recognised, and converted to their non printing form. Enter `-i help` for -a list of the directives that are recognised. - -## -I \ -Column separator in smiles file. - -## -S \ -First files are identifier files, last is haystack. Create many subsets. - -## -u \ -Suffix for -S files created. - -## -g \ -Start number for files created (-g 1 for dopattern). diff --git a/docs/Utilities/General/iwcut.md b/docs/Utilities/General/iwcut.md deleted file mode 100644 index 1cb6e319..00000000 --- a/docs/Utilities/General/iwcut.md +++ /dev/null @@ -1,19 +0,0 @@ -# iwcut - -`iwcut` is very much like the standard Linux cut command. It was built -before `cut` adopted some of the features built into `iwcut`. 
The most -notable remaining difference is that `cut -f 2,1` will write out columns -1 and 2, whereas `iwcut -f 2,1` will write out column 2 followed by -column 1. This makes it very useful for doing things like extracting -a smiles file from something like -``` -methane,C,... -``` - -It also understands column headings, so if you have a tabular file with a -header, it can extract columns by name, while also preserving the column -order requested on the command line. In that case, it also knows that the -first column is 'special'. - -There are a variety of other options specific to the kinds of files we -handle. diff --git a/docs/Utilities/General/nn2csv.md b/docs/Utilities/General/nn2csv.md deleted file mode 100644 index bb043da0..00000000 --- a/docs/Utilities/General/nn2csv.md +++ /dev/null @@ -1,71 +0,0 @@ -# Near Neighbour Files to CSV - -## Background - -Most of the gfp_* tools generate output files in a TDT form, -which can then be passed to nplotnn for conversion to smiles or -other forms. - -For conversion to smiles forms, it works well, and can easily -perform rudimentary filtering on the neighbour list. - -Over time, more and more people have wanted to be able to convert -to tabular form, and I have tried to make nplotnn generate tabular -output. That was a mistake, it really cannot do that and I will -try to remove those attempts. - -## nn2csv -This is a very simple tool that is designed to do one thing, and to -do it very well. It converts a nearest neighbour file to csv form. -It hardly does any filtering, and has only a couple of options. -I thought hard about even adding the options I did, perhaps it should -have none. - -Like nplotnn, it can process multiple files. Clearly in order to generate -a tabular file, it must scan them all in order to find what is the -maximum number of neighbours, which then controls how many columns -are needed. Molecules that do not have enough neighbours are padded with -'*' characters. 
- -Here is the usage message -``` -Converts a .nn file to csv form - -o set token separator (default ,) - -n only write neighbours - -s ... sort the targets, enter '-s help' for info - -z do not write molecules with no neighbours - -v verbose output -``` -which is refreshingly brief. - -with no options, it write all data from the input file. I anticipate -this being the most common use. - -Since it was easy, I added the ability to trim the number of columns -generated with the `-n` option. And also since it was easy, I added -sorting - even though such sorting will be trivial once the csv -data is read into `Julia`, `Python`, `R`, `Excel` or other data -analysis tool. - -Note that if you ask for more columns than are in the input, it -will dutifully generate the number of columns of output you -requested! The extra columns are empty. - -The -z option might save you a subsequent step in filtering, and -it was very easy here. - -Same with the sorting option, it was easy to do here, and might -save a step in a slower language. - -Most of the other functionality in nplotnn simply does not apply, -the tools are different. - -## Pipelines -Note that while this tool does scan all its inputs before generating -any output, it accumulates all the data it needs, and does not -rescan the input. So if needed, it can be used as a pipelined -command. Therefore this command combination is ok -``` -gfp_nearneighbours_single_file.sh -v -T 0.5 -n 20 ~/rand.chembl.gfp | nn2csv.sh - -``` - diff --git a/docs/Vendor/AAREADME.md b/docs/Vendor/AAREADME.md new file mode 100644 index 00000000..7618a4ce --- /dev/null +++ b/docs/Vendor/AAREADME.md @@ -0,0 +1,2 @@ +# Vendor +This directory holds tools that are dependent on tools from other entities. 
diff --git a/docs/Vendor/marvin_pka.md b/docs/Vendor/marvin_pka.md new file mode 100644 index 00000000..bb79b72c --- /dev/null +++ b/docs/Vendor/marvin_pka.md @@ -0,0 +1,36 @@ +# marvin_pka +This tool parses the output from cxcalc (Chemaxon/Marvin) and generates a molecule with +assigned formal charges. + +The output from +``` +cxcalc pka file.smi > file.pka +``` + +might look like +``` +id apKa1 apKa2 bpKa1 bpKa2 atoms +1 16.76 0.07 3,2 +2 3.55 5.69 2,7 +3 3.36 -1.17 -9.49 7,1,2 +4 10.45 4.50 7,1 +5 -0.22 9 +6 11.47 15.28 0.69 0.07 6,3,1,8 +7 4.56 8 +8 13.50 14.14 5.89 2.07 8,7,9,7 +9 2.77 13.78 9.78 -3.80 8,6,3,6 +``` +where each record contains varying numbers of entries for most basic +and most acidic pK values. + +Note that the last column is a list of the corresponding atom numbers +associated with the sites in the earlier columns. A complete workflow +might look like + +``` +cxcalc pka file.smi > file.pka +marvin_pka -M file.pka -P 7.4 -d 1.0 file.smi +``` +where the resulting output will have formal charges applied if the +computed pK is more than 1 log unit away from the assumed pH of 7.4 +in this case. diff --git a/docs/python/LillyMolPython.md b/docs/python/LillyMolPython.md index c48a837f..0664c0dc 100644 --- a/docs/python/LillyMolPython.md +++ b/docs/python/LillyMolPython.md @@ -30,12 +30,12 @@ but for now, there is no claim that this is as fast as things could be. ## Building Your python environment *must* include pybind11. Normally ``` -pip install pybond11 +pip install pybind11 ``` will accomplish this. 
Normally the python bindings are built as part of the default build, -the script [build_from_source.sh](/src/build_from_source.sh), but if +the script [build_linux.sh](/src/build_linux.sh), but if you wish to compile separately that can be done via ``` bazelisk --output_user_root=/local/disk/ian build --cxxopt=-DTODAY=\"$(date +%Y-%b-%d)\" --cxxopt=-DGIT_HASH=\"$(git rev-parse --short --verify HEAD)\" --local_cpu_resources=10 -c opt pybind:all @@ -47,7 +47,7 @@ the needed files out of bazel-bin and into lib. See `WORKSPACE` for how we configured the local python and pybind11 installs. This was quite difficult to get right. Normally these will be auto -configured for you by the [build_third_party](/src/build_third_party.sh) script, +configured for you by the build script, which in turn calls [update_python_in_workspace](/src/update_python_in_workspace.py) which interrogates the python installation. @@ -126,6 +126,13 @@ mols = MolFromSmiles(["C", "CC", "C1CC1"]) which returns a list of molecules. This may offer speed advantages depending on the structure of the program. +And for clarity +``` +mol = LillyMolFromSmiles("C methane") +``` +also works. + + There are other means by which molecules can enter the system. 
``` @@ -219,7 +226,7 @@ The most common methods for a Molecule currently implemented are | GetNumAtoms() | Number of atoms (explicit atoms only) | | nedges() | Number of bonds | | bonds() | Iterable collection of Bonds | -| nrings() | Number of rings | +| nrings() | Number of SSSR rings | | nrings(atom) | Ring membership of 'atom' | | is_ring_atom(atom) | True if 'atom' is in a ring | | IsInRing(atom) | True if 'atom' is in a ring | @@ -260,9 +267,10 @@ The most common methods for a Molecule currently implemented are | remove_atom(atom) | Remove an atom | | remove_atoms(list, flag) | Remove all atoms where list[i] == flag | | remove_atoms(Set_of_Atoms) | Remove the atoms in the set | +| remove_atoms(numpy_array, flag) | Remove atoms where numpy_array[i] == flag | | remove_non_periodic_table_elements() | Remove any non-natural atoms | | remove_all(atomic_number) | Remove all atoms with atomic_number | -| move_to_end_of_connection_table(z) | Move all atoms with atomic number to end of connection table | +| move_to_end_of_connection_table(z) | Move all atoms with atomic number z to end of connection table | | chop(n) | Remove the last 'n' atoms in the molecule | | organic_only() | True if only C, N, O, F, P, S, Cl, Br, I | | remove_explicit_hydrogens() | Remove explicit Hydrogens | @@ -323,13 +331,16 @@ The most common methods for a Molecule currently implemented are | isotope(atom) | Isotope on 'atom' | | set_isotope(atom, iso) | Set isotope | | set_isotopes(Set_of_Atoms, iso) | Set isotope for atoms in the set | +| set_isotopes(numpy_array) | Set each isotope | | remove_isotopes() | Remove all isotopes | | number_isotopic_atoms() | Number of atoms with non zero isotopes | | bonds_between(a1, a2) | Bonds between atoms | | longest_path() | Longest through bond path | +| atoms_on_shortest_path(a1, a2) | Set_of_Atoms holding atoms on shortest path between a1 and a2 | +| down_the_bond(a1, a2) | Return all atoms found by looking down the a1->a2 bond. 
May return None | | atom_map_number(atom) | Atom map number on 'atom' | | set_atom_map_number(atom, nbr) | Set atom map number | -| reset_all_atom_map_numbers() | Remove all atom map numbers | +| reset_atom_map_numbers() | Remove all atom map numbers | | atom_with_atom_map_number(number) | Atom with atom map number | | bond_length(a1, a2) | Bond distance | | bond_angle(a1, a2, a3) | Bond angle | diff --git a/docs/python/fingerprints.md b/docs/python/fingerprints.md new file mode 100644 index 00000000..02136167 --- /dev/null +++ b/docs/python/fingerprints.md @@ -0,0 +1,122 @@ +# Fingerprints +Long term, the LillyMol GFP (Generalised FingerPrints) class will be +made available from python. In the meantime there are some capabilities +implemented today that may be useful for computing small numbers of similarities. + +## Similarity +There is no best similarity measure. The best similarity measure would +likely be one that accurately mapped to changes in Biological activity. +Much of Cheminformatics, and other, research is designed to find +means of accurately predicting similarity in Biological activity. +It is hard. + +A good fingerprint might be the one that does the best at tracking +changes in Biological activity. That will likely be target dependent +and will need extensive study to identify. + +A good fingerprint might be one that generally corresponds with human +perceptions of chemical similarity. + +Our experience is that fingerprints like EC (Extended Connectivity, or Morgan) +fingerprints work best for things like SVM fingerprint models, linear +path fingerprints tend to work best for corresponding to human +perception. It all depends. + +## HowTo + +The following toy application shows a simple N*N near neighbour +computation in LillyMol python. 
+``` +from absl import app +from absl import logging + +from lillymol import * +from lillymol_io import * +from lillymol_fingerprint import * + +def main(argv): + if len(argv) == 1: + logging.error("Must specify input file") + + mols = slurp(argv[1]) + logging.info("Read %d molecules from %s", len(mols), argv[1]) + + fps = [linear_fingerprint(mol) for mol in mols] + logging.info("Fingerprints generated") + + nfp = len(fps); + for i in range(nfp): + max_similarity = 0.0 + idmax = -1 + for j in range(nfp): + if i == j: + continue + t = tanimoto(fps[i], fps[j]) + if t > max_similarity: + max_similarity = t + idmax = j + + print(f"{mols[i].smiles()} {mols[i].name()} {mols[idmax].smiles()} {mols[idmax].name()} {1.0 - max_similarity}") + +if __name__ == "__main__": + app.run(main) +``` +This does a very dumb N*N nearest neighbour computation. With some book-keeping +only half the calculations need to be performed. This example is just designed +to demonstrate outlines of how fingerprints in LillyMol python work. + +Unfortunately this is quite slow. Running 2000 random Chembl molecules takes 21 seconds. +Running gfp_nearneighbours_single_file on the same input file, but with standard +gfp fingerprints, takes less than 1 second. But that is dealing with bit vector +fingerprints and can use popc instructions for computing similarity. This calculation +is using counted fingerprints, which will necessarily be a much more expensive +computation. + +When GFP fingerprints are ported into the LillyMol Python environment, faster +computation of binary fingerprints will become available. That said, there +are advantages to counted fingerprints, since they do not suffer from the +repeated feature problem of binary fingerprints. If a molecule contains +four instances of a feature, that will still just set the bit once in a binary +fingerprint, whereas in a counted fingerprint, the number of instances will +be recorded and will count in the similarity computation. 
+ +Only linear fingerprints have a method for constructing fingerprints such as the above. +All fingerprints can work via a fingerprint generator, that can be configured with +things like atom typing. +``` + fpgen = LinearFingerprintCreator(2048) + fpgen.set_max_length(7) + fpgen.set_atom_type("UST:AY") + + fps = [fpgen.fingerprint(mol) for mol in mols] + logging.info("Fingerprints generated") +``` + +For example to generate 1024 bit EC fingerprints, of radius 3, using an atom type of 'UST:AY' that +could be done by +``` + fpgen = ECFingerprintCreator(1024) + fpgen.set_max_radius(3) + fpgen.set_atom_type("UST:AY") + + fps = [fpgen.fingerprint(mol) for mol in mols] +``` + +Atom pair fingerprints are similar, this time restricting separations to 10 bonds. +``` + fpgen = AtomPairFingerprintCreator(1024) + fpgen.set_max_separation(10) + fpgen.set_atom_type("UST:AY1") + + fps = [fpgen.fingerprint(mol) for mol in mols] +``` +All generate numpy byte arrays, containing counted fingerprints. + +Again, speed is not good. This can be improved by several approaches + +* bitvector based fingerprints +* GFP fingerprints +* A multiple fingerprint container that can be queried with just 1 call from python. + +And all constructors could be altered to allow pythonic **kwargs to deal with +settable parameters. 
diff --git a/src/Foundational/data_source/tfdatarecord.cc b/src/Foundational/data_source/tfdatarecord.cc index 940a64fc..49de6d80 100644 --- a/src/Foundational/data_source/tfdatarecord.cc +++ b/src/Foundational/data_source/tfdatarecord.cc @@ -247,6 +247,27 @@ TFDataReader::FillReadBuffer(uint64_t bytes_needed) { return true; } +int +TFDataReader::seek_zero() { + if (_fd < 0) { + return 0; + } + + off_t rc = IW_FD_LSEEK(_fd, 0, SEEK_SET); + + if (rc < 0) { + cerr << "TFDataReader::seek_zero:cannot seek back to start of file\n"; + _good = false; + return 0; + } + + _good = true; + _eof = false; + _next = 0; + return 1; +} + + TFDataWriter::TFDataWriter() { } diff --git a/src/Foundational/data_source/tfdatarecord.h b/src/Foundational/data_source/tfdatarecord.h index cd25d743..8deb18f5 100644 --- a/src/Foundational/data_source/tfdatarecord.h +++ b/src/Foundational/data_source/tfdatarecord.h @@ -55,6 +55,8 @@ class TFDataReader { bool good() const { return _good;} bool eof() const { return _eof;} + int seek_zero(); + int Close(); // Will not reduce the size of _read_buffer, but may diff --git a/src/Foundational/iwmisc/iwdigits.cc b/src/Foundational/iwmisc/iwdigits.cc index 4b9c8001..cfa02300 100644 --- a/src/Foundational/iwmisc/iwdigits.cc +++ b/src/Foundational/iwmisc/iwdigits.cc @@ -97,12 +97,10 @@ IWDigits::_fill_in_the_digits() int IWDigits::append_number(T & buffer, int zdigit) const { - if (zdigit < 0) - { + if (zdigit < 0) { zdigit = - zdigit; // convert to something >= 0 - if (_leading_space.length() > 0) - { + if (_leading_space.length() > 0) { buffer << _leading_space; buffer << '-'; buffer.append_number( zdigit); diff --git a/src/Foundational/iwmisc/proto_support.h b/src/Foundational/iwmisc/proto_support.h index 92213beb..ab68e33f 100644 --- a/src/Foundational/iwmisc/proto_support.h +++ b/src/Foundational/iwmisc/proto_support.h @@ -15,6 +15,7 @@ #include "google/protobuf/io/zero_copy_stream_impl.h" #include "Foundational/data_source/iwstring_data_source.h" 
+#include "Foundational/iwmisc/proto_support.h" #include "Foundational/iwstring/iwstring.h" namespace iwmisc { diff --git a/src/Foundational/iwstring/IWString_class.cc b/src/Foundational/iwstring/IWString_class.cc index 66b1e47a..dd0393d1 100644 --- a/src/Foundational/iwstring/IWString_class.cc +++ b/src/Foundational/iwstring/IWString_class.cc @@ -5037,11 +5037,13 @@ IWString::operator =(const std::string & rhs) void IWString::operator +=(const std::string & rhs) { - const_IWSubstring tmp(rhs); - - operator += (tmp); + resizable_array::add(rhs.data(), rhs.length()); +} - return; +void +IWString::operator +=(const std::string_view & rhs) +{ + resizable_array::add(rhs.data(), rhs.length()); } #endif diff --git a/src/Foundational/iwstring/iwstring.h b/src/Foundational/iwstring/iwstring.h index 26e3bcff..1349dc1e 100644 --- a/src/Foundational/iwstring/iwstring.h +++ b/src/Foundational/iwstring/iwstring.h @@ -23,6 +23,7 @@ #endif #include #include +#include #define IW_STD_STRING_DEFINED 1 @@ -412,8 +413,11 @@ class IWString : public resizable_array int operator == (const IWString &) const; int operator != (const IWString &) const; + // These are not instantiated. Did not work inside Google. 
bool operator== (const std::string& rhs) const; bool operator!= (const std::string& rhs) const; + bool operator== (const std::string_view& rhs) const; + bool operator!= (const std::string_view& rhs) const; // The relational operators are implemented using strncmp @@ -600,6 +604,7 @@ class IWString : public resizable_array #if defined (IW_STD_STRING_DEFINED) void operator += (const std::string &); + void operator += (const std::string_view &); #endif void append_number (int); @@ -696,6 +701,7 @@ class IWString : public resizable_array IWString & operator << (double); #ifdef IW_STD_STRING_DEFINED IWString & operator << (const std::string & s) { this->operator+=(s); return *this;} + IWString & operator << (const std::string_view & s) { this->operator+=(s); return *this;} #endif int operator < (int) const; @@ -1002,6 +1008,20 @@ Equals(const const_IWSubstring& lhs, const std::string& rhs) { } return 0 == ::strncmp(lhs.data(), rhs.data(), lhs.length()); } +inline bool +Equals(const IWString& lhs, const std::string_view& rhs) { + if (static_cast(lhs.length()) != rhs.size()) { + return false; + } + return 0 == ::strncmp(lhs.data(), rhs.data(), lhs.length()); +} +inline bool +Equals(const const_IWSubstring& lhs, const std::string_view& rhs) { + if (static_cast(lhs.length()) != rhs.size()) { + return false; + } + return 0 == ::strncmp(lhs.data(), rhs.data(), lhs.length()); +} } // namespace iwstring diff --git a/src/Foundational/iwstring/iwstring_test.cc b/src/Foundational/iwstring/iwstring_test.cc index f9b4e416..3ad602d0 100644 --- a/src/Foundational/iwstring/iwstring_test.cc +++ b/src/Foundational/iwstring/iwstring_test.cc @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -23,6 +24,19 @@ TEST(TestConstIWSubstring, TestAsString) { const std::string as_string = s.AsString(); EXPECT_EQ(as_string, "hello"); } +TEST(TestAppendOperator, TestStd) { + IWString s("hello"); + std::string_view space(" "); + std::string world("world"); + s << space << world; + 
EXPECT_EQ(s, "hello world"); +} +TEST(TestEqualsOperator, TestStd) { + IWString hello = "hello"; + std::string world; + EXPECT_FALSE(iwstring::Equals(hello, world)); +} + TEST(TestIWString, TestExpandEnvironmentVariablesNothing) { const IWString hello("hello world"); EXPECT_EQ(hello.ExpandEnvironmentVariables(), hello); diff --git a/src/MODULE.bazel b/src/MODULE.bazel index 3907027f..9b18bfcf 100644 --- a/src/MODULE.bazel +++ b/src/MODULE.bazel @@ -17,20 +17,27 @@ http_archive = use_repo_rule( # There are newer versions of absl, but there seem to be incompatibilities with # other Google tools, proto and re2. Hopefully will resolve eventually... bazel_dep(name = "abseil-cpp", version = "20240116.2", repo_name="com_google_absl") -bazel_dep(name = "bazel_skylib", version = "1.5.0") +bazel_dep(name = "bazel_skylib", version = "1.7.1") bazel_dep(name = "caseyduquettesc_rules_python_pytest", version = "1.1.0", repo_name = "rules_python_pytest") bazel_dep(name = "eigen", version = "3.4.0", repo_name="com_gitlab_libeigen_eigen") -bazel_dep(name = "google_benchmark", version = "1.8.3", repo_name="benchmark") -bazel_dep(name = "googletest", version = "1.14.0") -bazel_dep(name = "onetbb", version = "2021.10.0", repo_name="onetbb") -bazel_dep(name = "pybind11_bazel", version = "2.11.1.bzl.3") +# Does not work inside Lilly, link error half way through the build. 
+#bazel_dep(name = "libzmq", version = "4.3.5") + +bazel_dep(name = "google_benchmark", version = "1.8.5", repo_name="benchmark") +bazel_dep(name = "googletest", version = "1.15.2") +bazel_dep(name = "onetbb", version = "2021.13.0", repo_name="onetbb") +# Version 27.0 works for c++, but does not work with python 3.11 +bazel_dep(name = "protobuf", version = "26.0", repo_name="com_google_protobuf") +bazel_dep(name = "pybind11_bazel", version = "2.12.0") # bazel_dep(name = "pybind11_protobuf", version = "2.11.1.bzl.3") -bazel_dep(name = "re2", version = "2024-06-01") +bazel_dep(name = "re2", version = "2024-07-02") bazel_dep(name = "rules_cc", version = "0.0.9") -bazel_dep(name = "rules_go", version = "0.48.0") +bazel_dep(name = "rules_go", version = "0.49.0") bazel_dep(name = "rules_pkg", version = "0.10.1") -bazel_dep(name = "rules_proto", version = "6.0.0") -# bazel_dep(name = "rules_ruby", version = "0.9.1") -bazel_dep(name = "protobuf", version = "27.0", repo_name="com_google_protobuf") -bazel_dep(name = "zlib", version = "1.3") +bazel_dep(name = "rules_proto", version = "6.0.2") +# bazel_dep(name = "rules_ruby", version = "0.12.0") +# Seem to be two toml handlers. Figure out which to use... 
+# bazel_dep(name = "tomlplusplus", version = "3.4.0") +# bazel_dep(name = "cpptoml", version = "0.1.1") +bazel_dep(name = "zlib", version = "1.3.1") diff --git a/src/MODULE.bazel.lock b/src/MODULE.bazel.lock deleted file mode 100644 index bfe4fa25..00000000 --- a/src/MODULE.bazel.lock +++ /dev/null @@ -1,1311 +0,0 @@ -{ - "lockFileVersion": 1, - "moduleFileHash": "623e5c7ff7948d45516fa3218218a74b64dbf80ad5a145ac3ffc8159c683ad60", - "flags": { - "cmdRegistries": [ - "https://bcr.bazel.build/" - ], - "cmdModuleOverrides": {}, - "allowedYankedVersions": [], - "envVarAllowedYankedVersions": "", - "ignoreDevDependency": false, - "directDependenciesMode": "WARNING", - "compatibilityMode": "ERROR" - }, - "localOverrideHashes": { - "bazel_tools": "11c49407fdc54b48d69dcd4478440118124b9cd51b2dca5947a6414a585964a1" - }, - "moduleDepGraph": { - "": { - "name": "lillymol", - "version": "1.2.0", - "key": "", - "repoName": "lillymol", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "com_google_absl": "abseil-cpp@20230125.1", - "bazel_skylib": "bazel_skylib@1.4.2", - "rules_cc": "rules_cc@0.0.6", - "rules_go": "rules_go@0.39.1", - "rules_pkg": "rules_pkg@0.7.0", - "rules_proto": "rules_proto@5.3.0-21.7", - "rules_python": "rules_python@0.23.1", - "googletest": "googletest@1.12.1", - "com_google_protobuf": "protobuf@21.7", - "re2": "re2@2023-06-02", - "zlib": "zlib@1.2.13" - } - }, - "bazel_tools@_": { - "name": "bazel_tools", - "version": "", - "key": "bazel_tools@_", - "repoName": "bazel_tools", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [ - "@local_config_cc_toolchains//:all", - "@local_config_sh//:local_sh_toolchain" - ], - "extensionUsages": [ - { - "extensionBzlFile": "@bazel_tools//tools/cpp:cc_configure.bzl", - "extensionName": "cc_configure_extension", - "usingModule": "bazel_tools@_", - "location": { - 
"file": "@@bazel_tools//:MODULE.bazel", - "line": 13, - "column": 29 - }, - "imports": { - "local_config_cc": "local_config_cc", - "local_config_cc_toolchains": "local_config_cc_toolchains" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@bazel_tools//tools/osx:xcode_configure.bzl", - "extensionName": "xcode_configure_extension", - "usingModule": "bazel_tools@_", - "location": { - "file": "@@bazel_tools//:MODULE.bazel", - "line": 17, - "column": 32 - }, - "imports": { - "local_config_xcode": "local_config_xcode" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@rules_java//java:extensions.bzl", - "extensionName": "toolchains", - "usingModule": "bazel_tools@_", - "location": { - "file": "@@bazel_tools//:MODULE.bazel", - "line": 20, - "column": 32 - }, - "imports": { - "local_jdk": "local_jdk", - "remote_java_tools": "remote_java_tools", - "remote_java_tools_linux": "remote_java_tools_linux", - "remote_java_tools_windows": "remote_java_tools_windows", - "remote_java_tools_darwin_x86_64": "remote_java_tools_darwin_x86_64", - "remote_java_tools_darwin_arm64": "remote_java_tools_darwin_arm64" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@bazel_tools//tools/sh:sh_configure.bzl", - "extensionName": "sh_configure_extension", - "usingModule": "bazel_tools@_", - "location": { - "file": "@@bazel_tools//:MODULE.bazel", - "line": 31, - "column": 39 - }, - "imports": { - "local_config_sh": "local_config_sh" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@bazel_tools//tools/test:extensions.bzl", - "extensionName": "remote_coverage_tools_extension", - "usingModule": "bazel_tools@_", - "location": { - "file": "@@bazel_tools//:MODULE.bazel", 
- "line": 35, - "column": 48 - }, - "imports": { - "remote_coverage_tools": "remote_coverage_tools" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@bazel_tools//tools/android:android_extensions.bzl", - "extensionName": "remote_android_tools_extensions", - "usingModule": "bazel_tools@_", - "location": { - "file": "@@bazel_tools//:MODULE.bazel", - "line": 38, - "column": 42 - }, - "imports": { - "android_gmaven_r8": "android_gmaven_r8", - "android_tools": "android_tools" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - } - ], - "deps": { - "local_config_platform": "local_config_platform@_", - "rules_cc": "rules_cc@0.0.6", - "rules_java": "rules_java@5.5.0", - "rules_license": "rules_license@0.0.3", - "rules_proto": "rules_proto@5.3.0-21.7", - "rules_python": "rules_python@0.23.1", - "platforms": "platforms@0.0.6", - "com_google_protobuf": "protobuf@21.7", - "zlib": "zlib@1.2.13" - } - }, - "local_config_platform@_": { - "name": "local_config_platform", - "version": "", - "key": "local_config_platform@_", - "repoName": "local_config_platform", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "platforms": "platforms@0.0.6" - } - }, - "abseil-cpp@20230125.1": { - "name": "abseil-cpp", - "version": "20230125.1", - "key": "abseil-cpp@20230125.1", - "repoName": "abseil-cpp", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "rules_cc": "rules_cc@0.0.6", - "platforms": "platforms@0.0.6", - "bazel_skylib": "bazel_skylib@1.4.2" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--abseil-cpp~20230125.1","urls":["--https://github.com/abseil/abseil-cpp/archive/refs/tags/20230125.1.tar.gz"],"integrity":"--sha256-gTEcF1mbNxIGne0gzKCaYqsL8qid+haZN4bIeCt+0UU=","strip_prefix":"--abseil-cpp-20230125.1","remote_patches":{"--https://bcr.bazel.build/modules/abseil-cpp/20230125.1/patches/module_dot_bazel.patch":"--sha256-L1wChhBmDOnRbPbD4MENVXHjOBT2KFrDxT6D+aoThxk="},"remote_patch_strip":0} - } - }, - "bazel_skylib@1.4.2": { - "name": "bazel_skylib", - "version": "1.4.2", - "key": "bazel_skylib@1.4.2", - "repoName": "bazel_skylib", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [ - "//toolchains/unittest:cmd_toolchain", - "//toolchains/unittest:bash_toolchain" - ], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "platforms": "platforms@0.0.6" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--bazel_skylib~1.4.2","urls":["--https://github.com/bazelbuild/bazel-skylib/releases/download/1.4.2/bazel-skylib-1.4.2.tar.gz"],"integrity":"--sha256-Zv/ZMVZlv6r8lrUiePV8fi3Qn17eJ56m05sr5HHn46o=","strip_prefix":"--","remote_patches":{},"remote_patch_strip":0} - } - }, - "rules_cc@0.0.6": { - "name": "rules_cc", - "version": "0.0.6", - "key": "rules_cc@0.0.6", - "repoName": "rules_cc", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [ - "@local_config_cc_toolchains//:all" - ], - "extensionUsages": [ - { - "extensionBzlFile": "@rules_cc//cc:extensions.bzl", - "extensionName": "cc_configure", - "usingModule": "rules_cc@0.0.6", - "location": { - "file": "https://bcr.bazel.build/modules/rules_cc/0.0.6/MODULE.bazel", - "line": 9, - "column": 29 - }, - "imports": { - "local_config_cc_toolchains": "local_config_cc_toolchains" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - } - ], - "deps": { - "bazel_tools": 
"bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "platforms": "platforms@0.0.6" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_cc~0.0.6","urls":["--https://github.com/bazelbuild/rules_cc/releases/download/0.0.6/rules_cc-0.0.6.tar.gz"],"integrity":"--sha256-PZ4nHih2ukLhFMm5vFFFTjecvw7J751A4q5M7GGjG0A=","strip_prefix":"--rules_cc-0.0.6","remote_patches":{"--https://bcr.bazel.build/modules/rules_cc/0.0.6/patches/module_dot_bazel.patch":"--sha256-aYtJVwlC1sQA+C+afK5ySZMSPhIheLqTDq84TQAAxb0="},"remote_patch_strip":0} - } - }, - "rules_go@0.39.1": { - "name": "rules_go", - "version": "0.39.1", - "key": "rules_go@0.39.1", - "repoName": "io_bazel_rules_go", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [ - "@go_toolchains//:all" - ], - "extensionUsages": [ - { - "extensionBzlFile": "@io_bazel_rules_go//go/private:extensions.bzl", - "extensionName": "non_module_dependencies", - "usingModule": "rules_go@0.39.1", - "location": { - "file": "https://bcr.bazel.build/modules/rules_go/0.39.1/MODULE.bazel", - "line": 13, - "column": 40 - }, - "imports": { - "go_googleapis": "go_googleapis", - "io_bazel_rules_nogo": "io_bazel_rules_nogo" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@io_bazel_rules_go//go:extensions.bzl", - "extensionName": "go_sdk", - "usingModule": "rules_go@0.39.1", - "location": { - "file": "https://bcr.bazel.build/modules/rules_go/0.39.1/MODULE.bazel", - "line": 20, - "column": 23 - }, - "imports": { - "go_toolchains": "go_toolchains" - }, - "devImports": [], - "tags": [ - { - "tagName": "download", - "attributeValues": {"name":"--go_default_sdk","version":"--1.19.8"}, - "devDependency": false, - "location": { - "file": "https://bcr.bazel.build/modules/rules_go/0.39.1/MODULE.bazel", - "line": 21, - "column": 16 - } - } - ], - 
"hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@gazelle//:extensions.bzl", - "extensionName": "go_deps", - "usingModule": "rules_go@0.39.1", - "location": { - "file": "https://bcr.bazel.build/modules/rules_go/0.39.1/MODULE.bazel", - "line": 31, - "column": 24 - }, - "imports": { - "com_github_gogo_protobuf": "com_github_gogo_protobuf", - "com_github_golang_mock": "com_github_golang_mock", - "com_github_golang_protobuf": "com_github_golang_protobuf", - "org_golang_google_genproto": "org_golang_google_genproto", - "org_golang_google_grpc": "org_golang_google_grpc", - "org_golang_google_protobuf": "org_golang_google_protobuf", - "org_golang_x_net": "org_golang_x_net" - }, - "devImports": [], - "tags": [ - { - "tagName": "from_file", - "attributeValues": {"go_mod":"--//:go.mod"}, - "devDependency": false, - "location": { - "file": "https://bcr.bazel.build/modules/rules_go/0.39.1/MODULE.bazel", - "line": 32, - "column": 18 - } - }, - { - "tagName": "module", - "attributeValues": {"path":"--github.com/gogo/protobuf","sum":"--h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=","version":"--v1.3.2"}, - "devDependency": false, - "location": { - "file": "https://bcr.bazel.build/modules/rules_go/0.39.1/MODULE.bazel", - "line": 33, - "column": 15 - } - } - ], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - } - ], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "bazel_skylib": "bazel_skylib@1.4.2", - "platforms": "platforms@0.0.6", - "rules_proto": "rules_proto@5.3.0-21.7", - "com_google_protobuf": "protobuf@21.7", - "gazelle": "gazelle@0.30.0" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--rules_go~0.39.1","urls":["--https://github.com/bazelbuild/rules_go/releases/download/v0.39.1/rules_go-v0.39.1.zip"],"integrity":"--sha256-bcLaerTPXXv8fJSXdrG3xzPwXlbtxLzZAiuySdLiqZY=","strip_prefix":"--","remote_patches":{},"remote_patch_strip":0} - } - }, - "rules_pkg@0.7.0": { - "name": "rules_pkg", - "version": "0.7.0", - "key": "rules_pkg@0.7.0", - "repoName": "rules_pkg", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "rules_python": "rules_python@0.23.1", - "bazel_skylib": "bazel_skylib@1.4.2", - "rules_license": "rules_license@0.0.3" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_pkg~0.7.0","urls":["--https://github.com/bazelbuild/rules_pkg/releases/download/0.7.0/rules_pkg-0.7.0.tar.gz"],"integrity":"--sha256-iimOgydi7aGDBZfWT+fbWBeKqEzVkm121bdE1lWJQcI=","strip_prefix":"--","remote_patches":{"--https://bcr.bazel.build/modules/rules_pkg/0.7.0/patches/module_dot_bazel.patch":"--sha256-4OaEPZwYF6iC71ZTDg6MJ7LLqX7ZA0/kK4mT+4xKqiE="},"remote_patch_strip":0} - } - }, - "rules_proto@5.3.0-21.7": { - "name": "rules_proto", - "version": "5.3.0-21.7", - "key": "rules_proto@5.3.0-21.7", - "repoName": "rules_proto", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "bazel_skylib": "bazel_skylib@1.4.2", - "com_google_protobuf": "protobuf@21.7", - "rules_cc": "rules_cc@0.0.6" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--rules_proto~5.3.0-21.7","urls":["--https://github.com/bazelbuild/rules_proto/archive/refs/tags/5.3.0-21.7.tar.gz"],"integrity":"--sha256-3D+yBqLLNEG0heseQjFlsjEjWh6psDG0Qzz3vB+kYN0=","strip_prefix":"--rules_proto-5.3.0-21.7","remote_patches":{},"remote_patch_strip":0} - } - }, - "rules_python@0.23.1": { - "name": "rules_python", - "version": "0.23.1", - "key": "rules_python@0.23.1", - "repoName": "rules_python", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [ - "@pythons_hub//:all" - ], - "extensionUsages": [ - { - "extensionBzlFile": "@rules_python//python/extensions/private:internal_deps.bzl", - "extensionName": "internal_deps", - "usingModule": "rules_python@0.23.1", - "location": { - "file": "https://bcr.bazel.build/modules/rules_python/0.23.1/MODULE.bazel", - "line": 14, - "column": 30 - }, - "imports": { - "pypi__build": "pypi__build", - "pypi__click": "pypi__click", - "pypi__colorama": "pypi__colorama", - "pypi__importlib_metadata": "pypi__importlib_metadata", - "pypi__installer": "pypi__installer", - "pypi__more_itertools": "pypi__more_itertools", - "pypi__packaging": "pypi__packaging", - "pypi__pep517": "pypi__pep517", - "pypi__pip": "pypi__pip", - "pypi__pip_tools": "pypi__pip_tools", - "pypi__setuptools": "pypi__setuptools", - "pypi__tomli": "pypi__tomli", - "pypi__wheel": "pypi__wheel", - "pypi__zipp": "pypi__zipp", - "pypi__coverage_cp310_aarch64-apple-darwin": "pypi__coverage_cp310_aarch64-apple-darwin", - "pypi__coverage_cp310_aarch64-unknown-linux-gnu": "pypi__coverage_cp310_aarch64-unknown-linux-gnu", - "pypi__coverage_cp310_x86_64-apple-darwin": "pypi__coverage_cp310_x86_64-apple-darwin", - "pypi__coverage_cp310_x86_64-unknown-linux-gnu": "pypi__coverage_cp310_x86_64-unknown-linux-gnu", - "pypi__coverage_cp311_aarch64-apple-darwin": "pypi__coverage_cp311_aarch64-apple-darwin", - "pypi__coverage_cp311_aarch64-unknown-linux-gnu": "pypi__coverage_cp311_aarch64-unknown-linux-gnu", - 
"pypi__coverage_cp311_x86_64-apple-darwin": "pypi__coverage_cp311_x86_64-apple-darwin", - "pypi__coverage_cp311_x86_64-unknown-linux-gnu": "pypi__coverage_cp311_x86_64-unknown-linux-gnu", - "pypi__coverage_cp38_aarch64-apple-darwin": "pypi__coverage_cp38_aarch64-apple-darwin", - "pypi__coverage_cp38_aarch64-unknown-linux-gnu": "pypi__coverage_cp38_aarch64-unknown-linux-gnu", - "pypi__coverage_cp38_x86_64-apple-darwin": "pypi__coverage_cp38_x86_64-apple-darwin", - "pypi__coverage_cp38_x86_64-unknown-linux-gnu": "pypi__coverage_cp38_x86_64-unknown-linux-gnu", - "pypi__coverage_cp39_aarch64-apple-darwin": "pypi__coverage_cp39_aarch64-apple-darwin", - "pypi__coverage_cp39_aarch64-unknown-linux-gnu": "pypi__coverage_cp39_aarch64-unknown-linux-gnu", - "pypi__coverage_cp39_x86_64-apple-darwin": "pypi__coverage_cp39_x86_64-apple-darwin", - "pypi__coverage_cp39_x86_64-unknown-linux-gnu": "pypi__coverage_cp39_x86_64-unknown-linux-gnu" - }, - "devImports": [], - "tags": [ - { - "tagName": "install", - "attributeValues": {}, - "devDependency": false, - "location": { - "file": "https://bcr.bazel.build/modules/rules_python/0.23.1/MODULE.bazel", - "line": 15, - "column": 22 - } - } - ], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@rules_python//python/extensions:python.bzl", - "extensionName": "python", - "usingModule": "rules_python@0.23.1", - "location": { - "file": "https://bcr.bazel.build/modules/rules_python/0.23.1/MODULE.bazel", - "line": 53, - "column": 23 - }, - "imports": { - "pythons_hub": "pythons_hub" - }, - "devImports": [], - "tags": [ - { - "tagName": "toolchain", - "attributeValues": {"is_default":true,"python_version":"--3.11"}, - "devDependency": false, - "location": { - "file": "https://bcr.bazel.build/modules/rules_python/0.23.1/MODULE.bazel", - "line": 59, - "column": 17 - } - } - ], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - } - ], - "deps": { - "bazel_tools": "bazel_tools@_", - 
"local_config_platform": "local_config_platform@_", - "platforms": "platforms@0.0.6", - "bazel_skylib": "bazel_skylib@1.4.2", - "rules_proto": "rules_proto@5.3.0-21.7", - "com_google_protobuf": "protobuf@21.7" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_python~0.23.1","urls":["--https://github.com/bazelbuild/rules_python/releases/download/0.23.1/rules_python-0.23.1.tar.gz"],"integrity":"--sha256-hK7J4hzFb7x/EzUDWnHIUNG5tcxv9Jcwb4TM7Zp2mEE=","strip_prefix":"--rules_python-0.23.1","remote_patches":{"--https://bcr.bazel.build/modules/rules_python/0.23.1/patches/module_dot_bazel_version.patch":"--sha256-Fb/omGfKlthLHMy1276rtIDI9k5sZQQhAeNsleX4y2k="},"remote_patch_strip":0} - } - }, - "googletest@1.12.1": { - "name": "googletest", - "version": "1.12.1", - "key": "googletest@1.12.1", - "repoName": "googletest", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "com_google_absl": "abseil-cpp@20230125.1", - "platforms": "platforms@0.0.6", - "rules_cc": "rules_cc@0.0.6" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--googletest~1.12.1","urls":["--https://github.com/google/googletest/archive/refs/tags/release-1.12.1.tar.gz"],"integrity":"--sha256-gZZP5XjpvXyU39sJyOTW5nWeGZZ+OX2+pI0cEORdDfI=","strip_prefix":"--googletest-release-1.12.1","remote_patches":{"--https://bcr.bazel.build/modules/googletest/1.12.1/patches/module_dot_bazel.patch":"--sha256-8/a2a7y0JeRh9gsDCSsFhJjXqsauwyFRK8ccm2wvVaE="},"remote_patch_strip":0} - } - }, - "protobuf@21.7": { - "name": "protobuf", - "version": "21.7", - "key": "protobuf@21.7", - "repoName": "protobuf", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [ - { - 
"extensionBzlFile": "@rules_jvm_external//:extensions.bzl", - "extensionName": "maven", - "usingModule": "protobuf@21.7", - "location": { - "file": "https://bcr.bazel.build/modules/protobuf/21.7/MODULE.bazel", - "line": 22, - "column": 22 - }, - "imports": { - "maven": "maven" - }, - "devImports": [], - "tags": [ - { - "tagName": "install", - "attributeValues": {"name":"--maven","artifacts":["--com.google.code.findbugs:jsr305:3.0.2","--com.google.code.gson:gson:2.8.9","--com.google.errorprone:error_prone_annotations:2.3.2","--com.google.j2objc:j2objc-annotations:1.3","--com.google.guava:guava:31.1-jre","--com.google.guava:guava-testlib:31.1-jre","--com.google.truth:truth:1.1.2","--junit:junit:4.13.2","--org.mockito:mockito-core:4.3.1"]}, - "devDependency": false, - "location": { - "file": "https://bcr.bazel.build/modules/protobuf/21.7/MODULE.bazel", - "line": 24, - "column": 14 - } - } - ], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - } - ], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "bazel_skylib": "bazel_skylib@1.4.2", - "rules_python": "rules_python@0.23.1", - "rules_cc": "rules_cc@0.0.6", - "rules_proto": "rules_proto@5.3.0-21.7", - "rules_java": "rules_java@5.5.0", - "rules_pkg": "rules_pkg@0.7.0", - "com_google_abseil": "abseil-cpp@20230125.1", - "zlib": "zlib@1.2.13", - "upb": "upb@0.0.0-20220923-a547704", - "rules_jvm_external": "rules_jvm_external@4.4.2", - "com_google_googletest": "googletest@1.12.1" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--protobuf~21.7","urls":["--https://github.com/protocolbuffers/protobuf/releases/download/v21.7/protobuf-all-21.7.zip"],"integrity":"--sha256-VJOiH17T/FAuZv7GuUScBqVRztYwAvpIkDxA36jeeko=","strip_prefix":"--protobuf-21.7","remote_patches":{"--https://bcr.bazel.build/modules/protobuf/21.7/patches/add_module_dot_bazel.patch":"--sha256-q3V2+eq0v2XF0z8z+V+QF4cynD6JvHI1y3kI/+rzl5s=","--https://bcr.bazel.build/modules/protobuf/21.7/patches/add_module_dot_bazel_for_examples.patch":"--sha256-O7YP6s3lo/1opUiO0jqXYORNHdZ/2q3hjz1QGy8QdIU=","--https://bcr.bazel.build/modules/protobuf/21.7/patches/relative_repo_names.patch":"--sha256-RK9RjW8T5UJNG7flIrnFiNE9vKwWB+8uWWtJqXYT0w4=","--https://bcr.bazel.build/modules/protobuf/21.7/patches/add_missing_files.patch":"--sha256-Hyne4DG2u5bXcWHNxNMirA2QFAe/2Cl8oMm1XJdkQIY="},"remote_patch_strip":1} - } - }, - "re2@2023-06-02": { - "name": "re2", - "version": "2023-06-02", - "key": "re2@2023-06-02", - "repoName": "re2", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "rules_cc": "rules_cc@0.0.6", - "platforms": "platforms@0.0.6", - "com_google_absl": "abseil-cpp@20230125.1", - "com_google_googletest": "googletest@1.12.1" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--re2~2023-06-02","urls":["--https://github.com/google/re2/releases/download/2023-06-02/re2-2023-06-02.zip"],"integrity":"--sha256-HWHq3eRzAacQfKP6j66hqO3fH5uQIRMNniM3aSLoyGc=","strip_prefix":"--re2-2023-06-02","remote_patches":{"--https://bcr.bazel.build/modules/re2/2023-06-02/patches/module_dot_bazel.patch":"--sha256-yVczUziJokEqTp19tz2JMt/KCQhpd58IAaPEqH+ZNZg="},"remote_patch_strip":0} - } - }, - "zlib@1.2.13": { - "name": "zlib", - "version": "1.2.13", - "key": "zlib@1.2.13", - "repoName": "zlib", - 
"executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--zlib~1.2.13","urls":["--https://github.com/madler/zlib/archive/refs/tags/v1.2.13.zip"],"integrity":"--sha256-woVpUbvzDjCGGs43ZVldhroT8s8BJ52QH2xiJYxX9P8=","strip_prefix":"--zlib-1.2.13","remote_patches":{"--https://bcr.bazel.build/modules/zlib/1.2.13/patches/add_build_file.patch":"--sha256-Z2ig1F01/dfdG63H+GwYRMcGbW/zAGIUWnKKrwKSEaQ=","--https://bcr.bazel.build/modules/zlib/1.2.13/patches/module_dot_bazel.patch":"--sha256-Nc7xP02Dl6yHQvkiZWSQnlnw1T277yS4cJxxONWJ/Ic="},"remote_patch_strip":0} - } - }, - "rules_java@5.5.0": { - "name": "rules_java", - "version": "5.5.0", - "key": "rules_java@5.5.0", - "repoName": "rules_java", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [ - "//toolchains:all", - "@local_jdk//:runtime_toolchain_definition", - "@remotejdk11_linux_toolchain_config_repo//:toolchain", - "@remotejdk11_macos_toolchain_config_repo//:toolchain", - "@remotejdk11_macos_aarch64_toolchain_config_repo//:toolchain", - "@remotejdk11_win_toolchain_config_repo//:toolchain", - "@remotejdk17_linux_toolchain_config_repo//:toolchain", - "@remotejdk17_macos_toolchain_config_repo//:toolchain", - "@remotejdk17_macos_aarch64_toolchain_config_repo//:toolchain", - "@remotejdk17_win_toolchain_config_repo//:toolchain", - "@remotejdk19_linux_toolchain_config_repo//:toolchain", - "@remotejdk19_macos_toolchain_config_repo//:toolchain", - "@remotejdk19_macos_aarch64_toolchain_config_repo//:toolchain", - "@remotejdk19_win_toolchain_config_repo//:toolchain", - "@remotejdk11_linux_aarch64_toolchain_config_repo//:toolchain", - "@remotejdk11_linux_ppc64le_toolchain_config_repo//:toolchain", - 
"@remotejdk11_linux_s390x_toolchain_config_repo//:toolchain" - ], - "extensionUsages": [ - { - "extensionBzlFile": "@rules_java//java:extensions.bzl", - "extensionName": "toolchains", - "usingModule": "rules_java@5.5.0", - "location": { - "file": "https://bcr.bazel.build/modules/rules_java/5.5.0/MODULE.bazel", - "line": 16, - "column": 27 - }, - "imports": { - "remote_java_tools": "remote_java_tools", - "remote_java_tools_linux": "remote_java_tools_linux", - "remote_java_tools_windows": "remote_java_tools_windows", - "remote_java_tools_darwin_x86_64": "remote_java_tools_darwin_x86_64", - "remote_java_tools_darwin_arm64": "remote_java_tools_darwin_arm64", - "local_jdk": "local_jdk", - "remotejdk11_linux_toolchain_config_repo": "remotejdk11_linux_toolchain_config_repo", - "remotejdk11_macos_toolchain_config_repo": "remotejdk11_macos_toolchain_config_repo", - "remotejdk11_macos_aarch64_toolchain_config_repo": "remotejdk11_macos_aarch64_toolchain_config_repo", - "remotejdk11_win_toolchain_config_repo": "remotejdk11_win_toolchain_config_repo", - "remotejdk17_linux_toolchain_config_repo": "remotejdk17_linux_toolchain_config_repo", - "remotejdk17_macos_toolchain_config_repo": "remotejdk17_macos_toolchain_config_repo", - "remotejdk17_macos_aarch64_toolchain_config_repo": "remotejdk17_macos_aarch64_toolchain_config_repo", - "remotejdk17_win_toolchain_config_repo": "remotejdk17_win_toolchain_config_repo", - "remotejdk19_linux_toolchain_config_repo": "remotejdk19_linux_toolchain_config_repo", - "remotejdk19_macos_toolchain_config_repo": "remotejdk19_macos_toolchain_config_repo", - "remotejdk19_macos_aarch64_toolchain_config_repo": "remotejdk19_macos_aarch64_toolchain_config_repo", - "remotejdk19_win_toolchain_config_repo": "remotejdk19_win_toolchain_config_repo", - "remotejdk11_linux_aarch64_toolchain_config_repo": "remotejdk11_linux_aarch64_toolchain_config_repo", - "remotejdk11_linux_ppc64le_toolchain_config_repo": "remotejdk11_linux_ppc64le_toolchain_config_repo", - 
"remotejdk11_linux_s390x_toolchain_config_repo": "remotejdk11_linux_s390x_toolchain_config_repo" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - } - ], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "platforms": "platforms@0.0.6", - "rules_cc": "rules_cc@0.0.6", - "bazel_skylib": "bazel_skylib@1.4.2", - "rules_proto": "rules_proto@5.3.0-21.7" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0","urls":["--https://github.com/bazelbuild/rules_java/releases/download/5.5.0/rules_java-5.5.0.tar.gz"],"integrity":"--sha256-vPq/tAfLDIggFBMQ+qEC9/uSzIBrDw4mpiUZYQGwtX4=","strip_prefix":"--","remote_patches":{},"remote_patch_strip":0} - } - }, - "rules_license@0.0.3": { - "name": "rules_license", - "version": "0.0.3", - "key": "rules_license@0.0.3", - "repoName": "rules_license", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_license~0.0.3","urls":["--https://github.com/bazelbuild/rules_license/releases/download/0.0.3/rules_license-0.0.3.tar.gz"],"integrity":"--sha256-AMzA3yExLBJ6xLEogKsPmibBz/mUQtxsWjMXUDYN48M=","strip_prefix":"--","remote_patches":{"--https://bcr.bazel.build/modules/rules_license/0.0.3/patches/module_dot_bazel.patch":"--sha256-yim5cwFqlS1F1UomygmIEM/UQhrkQZyYrwo48WFt4gE="},"remote_patch_strip":0} - } - }, - "platforms@0.0.6": { - "name": "platforms", - "version": "0.0.6", - "key": "platforms@0.0.6", - "repoName": "platforms", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": 
"bazel_tools@_", - "local_config_platform": "local_config_platform@_" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--platforms","urls":["--https://github.com/bazelbuild/platforms/releases/download/0.0.6/platforms-0.0.6.tar.gz"],"integrity":"--sha256-Uwj8HYhlQGpJQnuiSpq1MIfxf1Jmp6q7/CiCPzkW4co=","strip_prefix":"--","remote_patches":{"--https://bcr.bazel.build/modules/platforms/0.0.6/patches/module_dot_bazel.patch":"--sha256-/ITFV+xsibQyr8dnuvUILvdoOigF6XuiyZHN+Gf7UM8="},"remote_patch_strip":0} - } - }, - "gazelle@0.30.0": { - "name": "gazelle", - "version": "0.30.0", - "key": "gazelle@0.30.0", - "repoName": "bazel_gazelle", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [ - { - "extensionBzlFile": "@io_bazel_rules_go//go:extensions.bzl", - "extensionName": "go_sdk", - "usingModule": "gazelle@0.30.0", - "location": { - "file": "https://bcr.bazel.build/modules/gazelle/0.30.0/MODULE.bazel", - "line": 12, - "column": 23 - }, - "imports": { - "go_default_sdk": "go_default_sdk" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@bazel_gazelle//internal/bzlmod:non_module_deps.bzl", - "extensionName": "non_module_deps", - "usingModule": "gazelle@0.30.0", - "location": { - "file": "https://bcr.bazel.build/modules/gazelle/0.30.0/MODULE.bazel", - "line": 17, - "column": 32 - }, - "imports": { - "bazel_gazelle_go_repository_cache": "bazel_gazelle_go_repository_cache", - "bazel_gazelle_go_repository_tools": "bazel_gazelle_go_repository_tools" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@io_bazel_rules_go//go/private:extensions.bzl", - "extensionName": "non_module_dependencies", - "usingModule": "gazelle@0.30.0", - "location": { - "file": 
"https://bcr.bazel.build/modules/gazelle/0.30.0/MODULE.bazel", - "line": 24, - "column": 41 - }, - "imports": { - "go_googleapis": "go_googleapis" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": "@bazel_gazelle//:extensions.bzl", - "extensionName": "go_deps", - "usingModule": "gazelle@0.30.0", - "location": { - "file": "https://bcr.bazel.build/modules/gazelle/0.30.0/MODULE.bazel", - "line": 30, - "column": 24 - }, - "imports": { - "com_github_bazelbuild_buildtools": "com_github_bazelbuild_buildtools", - "com_github_bmatcuk_doublestar_v4": "com_github_bmatcuk_doublestar_v4", - "com_github_fsnotify_fsnotify": "com_github_fsnotify_fsnotify", - "com_github_google_go_cmp": "com_github_google_go_cmp", - "com_github_pelletier_go_toml": "com_github_pelletier_go_toml", - "com_github_pmezard_go_difflib": "com_github_pmezard_go_difflib", - "org_golang_x_mod": "org_golang_x_mod", - "org_golang_x_sync": "org_golang_x_sync", - "org_golang_x_tools": "org_golang_x_tools", - "bazel_gazelle_go_repository_config": "bazel_gazelle_go_repository_config" - }, - "devImports": [], - "tags": [ - { - "tagName": "from_file", - "attributeValues": {"go_mod":"--//:go.mod"}, - "devDependency": false, - "location": { - "file": "https://bcr.bazel.build/modules/gazelle/0.30.0/MODULE.bazel", - "line": 31, - "column": 18 - } - } - ], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - } - ], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "bazel_skylib": "bazel_skylib@1.4.2", - "com_google_protobuf": "protobuf@21.7", - "io_bazel_rules_go": "rules_go@0.39.1", - "rules_proto": "rules_proto@5.3.0-21.7" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--gazelle~0.30.0","urls":["--https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.30.0/bazel-gazelle-v0.30.0.tar.gz"],"integrity":"--sha256-cn8+Tt2W6iDCnowsqejSr3JNjHd455I6hUssgJUrxAU=","strip_prefix":"--","remote_patches":{},"remote_patch_strip":0} - } - }, - "upb@0.0.0-20220923-a547704": { - "name": "upb", - "version": "0.0.0-20220923-a547704", - "key": "upb@0.0.0-20220923-a547704", - "repoName": "upb", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "bazel_skylib": "bazel_skylib@1.4.2", - "rules_proto": "rules_proto@5.3.0-21.7", - "com_google_protobuf": "protobuf@21.7", - "com_google_absl": "abseil-cpp@20230125.1", - "platforms": "platforms@0.0.6" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--upb~0.0.0-20220923-a547704","urls":["--https://github.com/protocolbuffers/upb/archive/a5477045acaa34586420942098f5fecd3570f577.tar.gz"],"integrity":"--sha256-z39x6v+QskwaKLSWRan/A6mmwecTQpHOcJActj5zZLU=","strip_prefix":"--upb-a5477045acaa34586420942098f5fecd3570f577","remote_patches":{"--https://bcr.bazel.build/modules/upb/0.0.0-20220923-a547704/patches/module_dot_bazel.patch":"--sha256-wH4mNS6ZYy+8uC0HoAft/c7SDsq2Kxf+J8dUakXhaB0="},"remote_patch_strip":0} - } - }, - "rules_jvm_external@4.4.2": { - "name": "rules_jvm_external", - "version": "4.4.2", - "key": "rules_jvm_external@4.4.2", - "repoName": "rules_jvm_external", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [ - { - "extensionBzlFile": "@rules_jvm_external//:non-module-deps.bzl", - "extensionName": "non_module_deps", - "usingModule": "rules_jvm_external@4.4.2", - "location": { - "file": "https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/MODULE.bazel", - "line": 9, - "column": 32 - }, - "imports": { - 
"io_bazel_rules_kotlin": "io_bazel_rules_kotlin" - }, - "devImports": [], - "tags": [], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - }, - { - "extensionBzlFile": ":extensions.bzl", - "extensionName": "maven", - "usingModule": "rules_jvm_external@4.4.2", - "location": { - "file": "https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/MODULE.bazel", - "line": 16, - "column": 22 - }, - "imports": { - "rules_jvm_external_deps": "rules_jvm_external_deps" - }, - "devImports": [], - "tags": [ - { - "tagName": "install", - "attributeValues": {"name":"--rules_jvm_external_deps","artifacts":["--com.google.cloud:google-cloud-core:1.93.10","--com.google.cloud:google-cloud-storage:1.113.4","--com.google.code.gson:gson:2.9.0","--org.apache.maven:maven-artifact:3.8.6","--software.amazon.awssdk:s3:2.17.183"],"lock_file":"--@rules_jvm_external//:rules_jvm_external_deps_install.json"}, - "devDependency": false, - "location": { - "file": "https://bcr.bazel.build/modules/rules_jvm_external/4.4.2/MODULE.bazel", - "line": 18, - "column": 14 - } - } - ], - "hasDevUseExtension": false, - "hasNonDevUseExtension": true - } - ], - "deps": { - "bazel_tools": "bazel_tools@_", - "local_config_platform": "local_config_platform@_", - "bazel_skylib": "bazel_skylib@1.4.2", - "io_bazel_stardoc": "stardoc@0.5.1" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_jvm_external~4.4.2","urls":["--https://github.com/bazelbuild/rules_jvm_external/archive/refs/tags/4.4.2.zip"],"integrity":"--sha256-c1YC9QgT6y6pPKP15DsZWb2AshO4NqB6YqKddXZwt3s=","strip_prefix":"--rules_jvm_external-4.4.2","remote_patches":{},"remote_patch_strip":0} - } - }, - "stardoc@0.5.1": { - "name": "stardoc", - "version": "0.5.1", - "key": "stardoc@0.5.1", - "repoName": "stardoc", - "executionPlatformsToRegister": [], - "toolchainsToRegister": [], - "extensionUsages": [], - "deps": { - "bazel_tools": "bazel_tools@_", 
- "local_config_platform": "local_config_platform@_", - "bazel_skylib": "bazel_skylib@1.4.2", - "rules_java": "rules_java@5.5.0" - }, - "repoSpec": { - "bzlFile": "@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--stardoc~0.5.1","urls":["--https://github.com/bazelbuild/stardoc/releases/download/0.5.1/stardoc-0.5.1.tar.gz"],"integrity":"--sha256-qoFNrgrEALurLoiB+ZFcb0fElmS/CHxAmhX5BDjSwj4=","strip_prefix":"--","remote_patches":{"--https://bcr.bazel.build/modules/stardoc/0.5.1/patches/module_dot_bazel.patch":"--sha256-UAULCuTpJE7SG0YrR9XLjMfxMRmbP+za3uW9ONZ5rjI="},"remote_patch_strip":0} - } - } - }, - "moduleExtensions": { - "@bazel_tools//tools/cpp:cc_configure.bzl%cc_configure_extension": { - "bzlTransitiveDigest": "fX+NTqVY9jebrhWZSjm+R2r4sMbV1U3pvP90DKmouSg=", - "generatedRepoSpecs": { - "local_config_cc": { - "bzlFile": "@@bazel_tools//tools/cpp:cc_configure.bzl", - "ruleClassName": "cc_autoconf", - "attributes": {"name":"--bazel_tools~cc_configure_extension~local_config_cc"} - }, - "local_config_cc_toolchains": { - "bzlFile": "@@bazel_tools//tools/cpp:cc_configure.bzl", - "ruleClassName": "cc_autoconf_toolchains", - "attributes": {"name":"--bazel_tools~cc_configure_extension~local_config_cc_toolchains"} - } - } - }, - "@rules_go~0.39.1//go:extensions.bzl%go_sdk": { - "bzlTransitiveDigest": "baCc5Mc6nJAIoj3TovuW1bOINXCqP/9lOv0UCbAkhsk=", - "generatedRepoSpecs": { - "go_default_sdk": { - "bzlFile": "@@rules_go~0.39.1//go/private:sdk.bzl", - "ruleClassName": "go_download_sdk_rule", - "attributes": {"name":"--rules_go~0.39.1~go_sdk~go_default_sdk","goos":"--","goarch":"--","sdks":{},"urls":["--https://dl.google.com/go/{}"],"version":"--1.19.8"} - }, - "go_toolchains": { - "bzlFile": "@@rules_go~0.39.1//go/private:sdk.bzl", - "ruleClassName": "go_multiple_toolchains", - "attributes": 
{"name":"--rules_go~0.39.1~go_sdk~go_toolchains","prefixes":["--_0000_go_default_sdk_"],"geese":["--"],"goarchs":["--"],"sdk_repos":["--go_default_sdk"],"sdk_types":["--remote"],"sdk_versions":["--1.19.8"]} - } - } - }, - "@rules_cc~0.0.6//cc:extensions.bzl%cc_configure": { - "bzlTransitiveDigest": "VKMuk3c8UzzPCfNMffIZahPRypRs3RxaBtETOTI5fJg=", - "generatedRepoSpecs": { - "local_config_cc": { - "bzlFile": "@@rules_cc~0.0.6//cc/private/toolchain:cc_configure.bzl", - "ruleClassName": "cc_autoconf", - "attributes": {"name":"--rules_cc~0.0.6~cc_configure~local_config_cc"} - }, - "local_config_cc_toolchains": { - "bzlFile": "@@rules_cc~0.0.6//cc/private/toolchain:cc_configure.bzl", - "ruleClassName": "cc_autoconf_toolchains", - "attributes": {"name":"--rules_cc~0.0.6~cc_configure~local_config_cc_toolchains"} - }, - "local_config_xcode": { - "bzlFile": "@@bazel_tools//tools/osx:xcode_configure.bzl", - "ruleClassName": "xcode_autoconf", - "attributes": {"name":"--rules_cc~0.0.6~cc_configure~local_config_xcode","xcode_locator":"--@bazel_tools//tools/osx:xcode_locator.m","remote_xcode":"--"} - } - } - }, - "@bazel_tools//tools/sh:sh_configure.bzl%sh_configure_extension": { - "bzlTransitiveDigest": "hp4NgmNjEg5+xgvzfh6L83bt9/aiiWETuNpwNuF1MSU=", - "generatedRepoSpecs": { - "local_config_sh": { - "bzlFile": "@@bazel_tools//tools/sh:sh_configure.bzl", - "ruleClassName": "sh_config", - "attributes": {"name":"--bazel_tools~sh_configure_extension~local_config_sh"} - } - } - }, - "@rules_java~5.5.0//java:extensions.bzl%toolchains": { - "bzlTransitiveDigest": "IVTttRaqn26iAvJN4qehdM+OxbrjZDF3SRPyI2lokXk=", - "generatedRepoSpecs": { - "remotejdk19_macos_aarch64_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk19_macos_aarch64_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = 
{\"java_runtime_version\": \"remotejdk_19\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"19\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:macos\", \"@platforms//cpu:aarch64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk19_macos_aarch64//:jdk\",\n)\n"} - }, - "remotejdk17_macos_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk17_macos_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_17\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"17\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:macos\", \"@platforms//cpu:x86_64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk17_macos//:jdk\",\n)\n"} - }, - "remotejdk17_linux_toolchain_config_repo": { - "bzlFile": 
"@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk17_linux_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_17\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"17\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:linux\", \"@platforms//cpu:x86_64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk17_linux//:jdk\",\n)\n"} - }, - "remote_java_tools_darwin": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remote_java_tools_darwin","sha256":"--abc434be713ee9e1fd6525d7a7bd9d7cdff6e27ae3ca9d96420490e7ff6e28a3","urls":["--https://mirror.bazel.build/bazel_java_tools/releases/java/v12.0/java_tools_darwin_x86_64-v12.0.zip","--https://github.com/bazelbuild/java_tools/releases/download/java_v12.0/java_tools_darwin_x86_64-v12.0.zip"]} - }, - "remotejdk17_macos_aarch64": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--rules_java~5.5.0~toolchains~remotejdk17_macos_aarch64","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--54247dde248ffbcd3c048675504b1c503b81daf2dc0d64a79e353c48d383c977","strip_prefix":"--zulu17.32.13-ca-jdk17.0.2-macosx_aarch64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu17.32.13-ca-jdk17.0.2-macosx_aarch64.tar.gz","--https://cdn.azul.com/zulu/bin/zulu17.32.13-ca-jdk17.0.2-macosx_aarch64.tar.gz"]} - }, - "remote_java_tools_windows": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remote_java_tools_windows","sha256":"--7b938f0c67d9d390f10489b1b9a4dabb51e39ecc94532c3acdf8c4c16900457f","urls":["--https://mirror.bazel.build/bazel_java_tools/releases/java/v12.0/java_tools_windows-v12.0.zip","--https://github.com/bazelbuild/java_tools/releases/download/java_v12.0/java_tools_windows-v12.0.zip"]} - }, - "remotejdk11_win": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_win","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--a106c77389a63b6bd963a087d5f01171bd32aa3ee7377ecef87531390dcb9050","strip_prefix":"--zulu11.56.19-ca-jdk11.0.15-win_x64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu11.56.19-ca-jdk11.0.15-win_x64.zip","--https://cdn.azul.com/zulu/bin/zulu11.56.19-ca-jdk11.0.15-win_x64.zip"]} - }, - "remotejdk11_win_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_win_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_11\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n 
values = {\"java_runtime_version\": \"11\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:windows\", \"@platforms//cpu:x86_64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk11_win//:jdk\",\n)\n"} - }, - "remotejdk11_linux_aarch64": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_linux_aarch64","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--fc7c41a0005180d4ca471c90d01e049469e0614cf774566d4cf383caa29d1a97","strip_prefix":"--zulu11.56.19-ca-jdk11.0.15-linux_aarch64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu-embedded/bin/zulu11.56.19-ca-jdk11.0.15-linux_aarch64.tar.gz","--https://cdn.azul.com/zulu-embedded/bin/zulu11.56.19-ca-jdk11.0.15-linux_aarch64.tar.gz"]} - }, - "remotejdk17_linux": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk17_linux","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--73d5c4bae20325ca41b606f7eae64669db3aac638c5b3ead4a975055846ad6de","strip_prefix":"--zulu17.32.13-ca-jdk17.0.2-linux_x64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu17.32.13-ca-jdk17.0.2-linux_x64.tar.gz","--https://cdn.azul.com/zulu/bin/zulu17.32.13-ca-jdk17.0.2-linux_x64.tar.gz"]} - }, - "remotejdk11_linux_s390x_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - 
"attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_linux_s390x_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_11\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"11\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:linux\", \"@platforms//cpu:s390x\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk11_linux_s390x//:jdk\",\n)\n"} - }, - "remotejdk11_linux_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_linux_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_11\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"11\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:linux\", \"@platforms//cpu:x86_64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = 
\"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk11_linux//:jdk\",\n)\n"} - }, - "remotejdk11_macos": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_macos","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--2614e5c5de8e989d4d81759de4c333aa5b867b17ab9ee78754309ba65c7f6f55","strip_prefix":"--zulu11.56.19-ca-jdk11.0.15-macosx_x64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu11.56.19-ca-jdk11.0.15-macosx_x64.tar.gz","--https://cdn.azul.com/zulu/bin/zulu11.56.19-ca-jdk11.0.15-macosx_x64.tar.gz"]} - }, - "remotejdk11_win_arm64": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_win_arm64","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--b8a28e6e767d90acf793ea6f5bed0bb595ba0ba5ebdf8b99f395266161e53ec2","strip_prefix":"--jdk-11.0.13+8","urls":["--https://mirror.bazel.build/aka.ms/download-jdk/microsoft-jdk-11.0.13.8.1-windows-aarch64.zip"]} - }, - "remotejdk17_macos": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk17_macos","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--89d04b2d99b05dcb25114178e65f6a1c5ca742e125cab0a63d87e7e42f3fcb80","strip_prefix":"--zulu17.32.13-ca-jdk17.0.2-macosx_x64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu17.32.13-ca-jdk17.0.2-macosx_x64.tar.gz","--https://cdn.azul.com/zulu/bin/zulu17.32.13-ca-jdk17.0.2-macosx_x64.tar.gz"]} - }, - "remotejdk17_macos_aarch64_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": 
{"name":"--rules_java~5.5.0~toolchains~remotejdk17_macos_aarch64_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_17\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"17\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:macos\", \"@platforms//cpu:aarch64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk17_macos_aarch64//:jdk\",\n)\n"} - }, - "remotejdk17_win": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk17_win","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--e965aa0ea7a0661a3446cf8f10ee00684b851f883b803315289f26b4aa907fdb","strip_prefix":"--zulu17.32.13-ca-jdk17.0.2-win_x64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu17.32.13-ca-jdk17.0.2-win_x64.zip","--https://cdn.azul.com/zulu/bin/zulu17.32.13-ca-jdk17.0.2-win_x64.zip"]} - }, - "remotejdk11_macos_aarch64_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_macos_aarch64_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_11\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = 
\"version_setting\",\n values = {\"java_runtime_version\": \"11\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:macos\", \"@platforms//cpu:aarch64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk11_macos_aarch64//:jdk\",\n)\n"} - }, - "remotejdk11_linux_ppc64le_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_linux_ppc64le_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_11\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"11\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:linux\", \"@platforms//cpu:ppc\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk11_linux_ppc64le//:jdk\",\n)\n"} - }, - "remote_java_tools_linux": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--rules_java~5.5.0~toolchains~remote_java_tools_linux","sha256":"--4b8366b780387fc5ce69527ed287f2b444ee429d3325305ad062c92ac43c7fb6","urls":["--https://mirror.bazel.build/bazel_java_tools/releases/java/v12.0/java_tools_linux-v12.0.zip","--https://github.com/bazelbuild/java_tools/releases/download/java_v12.0/java_tools_linux-v12.0.zip"]} - }, - "remotejdk19_macos_aarch64": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk19_macos_aarch64","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--177d058d968b2fbe7a5ff5eceb18cdc16f6376ce291004f1a3139e78b2fb6391","strip_prefix":"--zulu19.32.13-ca-jdk19.0.2-macosx_aarch64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu19.32.13-ca-jdk19.0.2-macosx_aarch64.tar.gz","--https://cdn.azul.com/zulu/bin/zulu19.32.13-ca-jdk19.0.2-macosx_aarch64.tar.gz"]} - }, - "remotejdk19_win_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk19_win_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_19\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"19\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:windows\", \"@platforms//cpu:x86_64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = 
\"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk19_win//:jdk\",\n)\n"} - }, - "remotejdk19_macos_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk19_macos_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_19\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"19\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:macos\", \"@platforms//cpu:x86_64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk19_macos//:jdk\",\n)\n"} - }, - "remotejdk19_linux": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk19_linux","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--4a994aded1d9b35258d543a59d4963d2687a1094a818b79a21f00273fbbc5bca","strip_prefix":"--zulu19.32.13-ca-jdk19.0.2-linux_x64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu19.32.13-ca-jdk19.0.2-linux_x64.tar.gz","--https://cdn.azul.com/zulu/bin/zulu19.32.13-ca-jdk19.0.2-linux_x64.tar.gz"]} - }, - "remotejdk11_linux_aarch64_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": 
{"name":"--rules_java~5.5.0~toolchains~remotejdk11_linux_aarch64_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_11\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"11\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:linux\", \"@platforms//cpu:aarch64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk11_linux_aarch64//:jdk\",\n)\n"} - }, - "remotejdk11_linux_s390x": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_linux_s390x","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--a58fc0361966af0a5d5a31a2d8a208e3c9bb0f54f345596fd80b99ea9a39788b","strip_prefix":"--jdk-11.0.15+10","urls":["--https://mirror.bazel.build/github.com/adoptium/temurin11-binaries/releases/download/jdk-11.0.15+10/OpenJDK11U-jdk_s390x_linux_hotspot_11.0.15_10.tar.gz","--https://github.com/adoptium/temurin11-binaries/releases/download/jdk-11.0.15+10/OpenJDK11U-jdk_s390x_linux_hotspot_11.0.15_10.tar.gz"]} - }, - "remotejdk17_win_arm64_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk17_win_arm64_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = 
{\"java_runtime_version\": \"remotejdk_17\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"17\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:windows\", \"@platforms//cpu:arm64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk17_win_arm64//:jdk\",\n)\n"} - }, - "remotejdk11_linux": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_linux","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--e064b61d93304012351242bf0823c6a2e41d9e28add7ea7f05378b7243d34247","strip_prefix":"--zulu11.56.19-ca-jdk11.0.15-linux_x64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu11.56.19-ca-jdk11.0.15-linux_x64.tar.gz","--https://cdn.azul.com/zulu/bin/zulu11.56.19-ca-jdk11.0.15-linux_x64.tar.gz"]} - }, - "remotejdk11_macos_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_macos_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_11\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"11\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = 
select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:macos\", \"@platforms//cpu:x86_64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk11_macos//:jdk\",\n)\n"} - }, - "remotejdk19_win": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk19_win","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--d6c768c5ec3252f936bd0562c25458f7c753c62835ca3e91166f975f7a5fe9f1","strip_prefix":"--zulu19.32.13-ca-jdk19.0.2-win_x64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu19.32.13-ca-jdk19.0.2-win_x64.zip","--https://cdn.azul.com/zulu/bin/zulu19.32.13-ca-jdk19.0.2-win_x64.zip"]} - }, - "remotejdk17_win_arm64": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk17_win_arm64","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--811d7e7591bac4f081dfb00ba6bd15b6fc5969e1f89f0f327ef75147027c3877","strip_prefix":"--zulu17.30.15-ca-jdk17.0.1-win_aarch64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu17.30.15-ca-jdk17.0.1-win_aarch64.zip","--https://cdn.azul.com/zulu/bin/zulu17.30.15-ca-jdk17.0.1-win_aarch64.zip"]} - }, - "remotejdk19_macos": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--rules_java~5.5.0~toolchains~remotejdk19_macos","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--2804575ae9ac63e39caa910e57610bf52b0f9e2d671928a98d18e2fcc9f62ac1","strip_prefix":"--zulu19.32.13-ca-jdk19.0.2-macosx_x64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu19.32.13-ca-jdk19.0.2-macosx_x64.tar.gz","--https://cdn.azul.com/zulu/bin/zulu19.32.13-ca-jdk19.0.2-macosx_x64.tar.gz"]} - }, - "remotejdk19_linux_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk19_linux_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_19\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"19\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:linux\", \"@platforms//cpu:x86_64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk19_linux//:jdk\",\n)\n"} - }, - "remote_java_tools_darwin_arm64": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--rules_java~5.5.0~toolchains~remote_java_tools_darwin_arm64","sha256":"--24a47a5557ee2ccdacd10a54fe4c15d627c6aeaf7596a5dccf2e11a866a5a32a","urls":["--https://mirror.bazel.build/bazel_java_tools/releases/java/v12.0/java_tools_darwin_arm64-v12.0.zip","--https://github.com/bazelbuild/java_tools/releases/download/java_v12.0/java_tools_darwin_arm64-v12.0.zip"]} - }, - "remotejdk11_win_arm64_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_win_arm64_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_11\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"11\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:windows\", \"@platforms//cpu:arm64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk11_win_arm64//:jdk\",\n)\n"} - }, - "local_jdk": { - "bzlFile": "@@rules_java~5.5.0//toolchains:local_java_repository.bzl", - "ruleClassName": "_local_java_repository_rule", - "attributes": {"name":"--rules_java~5.5.0~toolchains~local_jdk","target_name":"--local_jdk","java_home":"--","version":"--","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD"} - }, - "remote_java_tools_darwin_x86_64": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": 
{"name":"--rules_java~5.5.0~toolchains~remote_java_tools_darwin_x86_64","sha256":"--abc434be713ee9e1fd6525d7a7bd9d7cdff6e27ae3ca9d96420490e7ff6e28a3","urls":["--https://mirror.bazel.build/bazel_java_tools/releases/java/v12.0/java_tools_darwin_x86_64-v12.0.zip","--https://github.com/bazelbuild/java_tools/releases/download/java_v12.0/java_tools_darwin_x86_64-v12.0.zip"]} - }, - "remote_java_tools": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remote_java_tools","sha256":"--6efab6ca6e16e02c90e62bbd08ca65f61527984ab78564ea7ad7a2692b2ffdbb","urls":["--https://mirror.bazel.build/bazel_java_tools/releases/java/v12.0/java_tools-v12.0.zip","--https://github.com/bazelbuild/java_tools/releases/download/java_v12.0/java_tools-v12.0.zip"]} - }, - "remotejdk17_win_toolchain_config_repo": { - "bzlFile": "@@rules_java~5.5.0//toolchains:remote_java_repository.bzl", - "ruleClassName": "_toolchain_config", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk17_win_toolchain_config_repo","build_file":"--\nconfig_setting(\n name = \"prefix_version_setting\",\n values = {\"java_runtime_version\": \"remotejdk_17\"},\n visibility = [\"//visibility:private\"],\n)\nconfig_setting(\n name = \"version_setting\",\n values = {\"java_runtime_version\": \"17\"},\n visibility = [\"//visibility:private\"],\n)\nalias(\n name = \"version_or_prefix_version_setting\",\n actual = select({\n \":version_setting\": \":version_setting\",\n \"//conditions:default\": \":prefix_version_setting\",\n }),\n visibility = [\"//visibility:private\"],\n)\ntoolchain(\n name = \"toolchain\",\n target_compatible_with = [\"@platforms//os:windows\", \"@platforms//cpu:x86_64\"],\n target_settings = [\":version_or_prefix_version_setting\"],\n toolchain_type = \"@bazel_tools//tools/jdk:runtime_toolchain_type\",\n toolchain = \"@remotejdk17_win//:jdk\",\n)\n"} - }, - "remotejdk11_linux_ppc64le": { - 
"bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_linux_ppc64le","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--a8fba686f6eb8ae1d1a9566821dbd5a85a1108b96ad857fdbac5c1e4649fc56f","strip_prefix":"--jdk-11.0.15+10","urls":["--https://mirror.bazel.build/github.com/adoptium/temurin11-binaries/releases/download/jdk-11.0.15+10/OpenJDK11U-jdk_ppc64le_linux_hotspot_11.0.15_10.tar.gz","--https://github.com/adoptium/temurin11-binaries/releases/download/jdk-11.0.15+10/OpenJDK11U-jdk_ppc64le_linux_hotspot_11.0.15_10.tar.gz"]} - }, - "remotejdk11_macos_aarch64": { - "bzlFile": "@@bazel_tools//tools/build_defs/repo:http.bzl", - "ruleClassName": "http_archive", - "attributes": {"name":"--rules_java~5.5.0~toolchains~remotejdk11_macos_aarch64","build_file":"@@rules_java~5.5.0//toolchains:jdk.BUILD","sha256":"--6bb0d2c6e8a29dcd9c577bbb2986352ba12481a9549ac2c0bcfd00ed60e538d2","strip_prefix":"--zulu11.56.19-ca-jdk11.0.15-macosx_aarch64","urls":["--https://mirror.bazel.build/cdn.azul.com/zulu/bin/zulu11.56.19-ca-jdk11.0.15-macosx_aarch64.tar.gz","--https://cdn.azul.com/zulu/bin/zulu11.56.19-ca-jdk11.0.15-macosx_aarch64.tar.gz"]} - } - } - }, - "@rules_python~0.23.1//python/extensions:python.bzl%python": { - "bzlTransitiveDigest": "2Q3QMwDMkksIS2rBF6FfvitzVzCRMwBNZ+dikL9yolM=", - "generatedRepoSpecs": { - "python_aliases": { - "bzlFile": "@@rules_python~0.23.1//python/private:toolchains_repo.bzl", - "ruleClassName": "multi_toolchain_aliases", - "attributes": {"name":"--rules_python~0.23.1~python~python_aliases","python_versions":{"--3.11":"--python_3_11"}} - }, - "python_3_11": { - "bzlFile": "@@rules_python~0.23.1//python/private:toolchains_repo.bzl", - "ruleClassName": "toolchain_aliases", - "attributes": {"name":"--rules_python~0.23.1~python~python_3_11","python_version":"--3.11.1","user_repository_name":"--python_3_11"} - }, - 
"python_3_11_aarch64-unknown-linux-gnu": { - "bzlFile": "@@rules_python~0.23.1//python:repositories.bzl", - "ruleClassName": "python_repository", - "attributes": {"name":"--rules_python~0.23.1~python~python_3_11_aarch64-unknown-linux-gnu","sha256":"--debf15783bdcb5530504f533d33fda75a7b905cec5361ae8f33da5ba6599f8b4","patches":[],"platform":"--aarch64-unknown-linux-gnu","python_version":"--3.11.1","release_filename":"--20230116/cpython-3.11.1+20230116-aarch64-unknown-linux-gnu-install_only.tar.gz","urls":["--https://github.com/indygreg/python-build-standalone/releases/download/20230116/cpython-3.11.1+20230116-aarch64-unknown-linux-gnu-install_only.tar.gz"],"distutils":null,"distutils_content":"--","strip_prefix":"--python","coverage_tool":null,"ignore_root_user_error":false} - }, - "python_3_11_aarch64-apple-darwin": { - "bzlFile": "@@rules_python~0.23.1//python:repositories.bzl", - "ruleClassName": "python_repository", - "attributes": {"name":"--rules_python~0.23.1~python~python_3_11_aarch64-apple-darwin","sha256":"--4918cdf1cab742a90f85318f88b8122aeaa2d04705803c7b6e78e81a3dd40f80","patches":[],"platform":"--aarch64-apple-darwin","python_version":"--3.11.1","release_filename":"--20230116/cpython-3.11.1+20230116-aarch64-apple-darwin-install_only.tar.gz","urls":["--https://github.com/indygreg/python-build-standalone/releases/download/20230116/cpython-3.11.1+20230116-aarch64-apple-darwin-install_only.tar.gz"],"distutils":null,"distutils_content":"--","strip_prefix":"--python","coverage_tool":null,"ignore_root_user_error":false} - }, - "python_3_11_x86_64-apple-darwin": { - "bzlFile": "@@rules_python~0.23.1//python:repositories.bzl", - "ruleClassName": "python_repository", - "attributes": 
{"name":"--rules_python~0.23.1~python~python_3_11_x86_64-apple-darwin","sha256":"--20a4203d069dc9b710f70b09e7da2ce6f473d6b1110f9535fb6f4c469ed54733","patches":[],"platform":"--x86_64-apple-darwin","python_version":"--3.11.1","release_filename":"--20230116/cpython-3.11.1+20230116-x86_64-apple-darwin-install_only.tar.gz","urls":["--https://github.com/indygreg/python-build-standalone/releases/download/20230116/cpython-3.11.1+20230116-x86_64-apple-darwin-install_only.tar.gz"],"distutils":null,"distutils_content":"--","strip_prefix":"--python","coverage_tool":null,"ignore_root_user_error":false} - }, - "pythons_hub": { - "bzlFile": "@@rules_python~0.23.1//python/extensions/private:pythons_hub.bzl", - "ruleClassName": "hub_repo", - "attributes": {"name":"--rules_python~0.23.1~python~pythons_hub","toolchain_prefixes":["--_0000_python_3_11_"],"toolchain_python_versions":["--3.11"],"toolchain_set_python_version_constraints":["--False"],"toolchain_user_repository_names":["--python_3_11"]} - }, - "python_3_11_x86_64-pc-windows-msvc": { - "bzlFile": "@@rules_python~0.23.1//python:repositories.bzl", - "ruleClassName": "python_repository", - "attributes": {"name":"--rules_python~0.23.1~python~python_3_11_x86_64-pc-windows-msvc","sha256":"--edc08979cb0666a597466176511529c049a6f0bba8adf70df441708f766de5bf","patches":[],"platform":"--x86_64-pc-windows-msvc","python_version":"--3.11.1","release_filename":"--20230116/cpython-3.11.1+20230116-x86_64-pc-windows-msvc-shared-install_only.tar.gz","urls":["--https://github.com/indygreg/python-build-standalone/releases/download/20230116/cpython-3.11.1+20230116-x86_64-pc-windows-msvc-shared-install_only.tar.gz"],"distutils":null,"distutils_content":"--","strip_prefix":"--python","coverage_tool":null,"ignore_root_user_error":false} - }, - "python_3_11_x86_64-unknown-linux-gnu": { - "bzlFile": "@@rules_python~0.23.1//python:repositories.bzl", - "ruleClassName": "python_repository", - "attributes": 
{"name":"--rules_python~0.23.1~python~python_3_11_x86_64-unknown-linux-gnu","sha256":"--02a551fefab3750effd0e156c25446547c238688a32fabde2995c941c03a6423","patches":[],"platform":"--x86_64-unknown-linux-gnu","python_version":"--3.11.1","release_filename":"--20230116/cpython-3.11.1+20230116-x86_64-unknown-linux-gnu-install_only.tar.gz","urls":["--https://github.com/indygreg/python-build-standalone/releases/download/20230116/cpython-3.11.1+20230116-x86_64-unknown-linux-gnu-install_only.tar.gz"],"distutils":null,"distutils_content":"--","strip_prefix":"--python","coverage_tool":null,"ignore_root_user_error":false} - } - } - } - } -} \ No newline at end of file diff --git a/src/Molecule_Lib/_iwaray_coords.cc b/src/Molecule_Lib/_iwaray_coords.cc deleted file mode 100644 index 4acd7ec5..00000000 --- a/src/Molecule_Lib/_iwaray_coords.cc +++ /dev/null @@ -1,8 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" -#include "coordinates.h" - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Molecule_Lib/_iwaray_csubs.cc b/src/Molecule_Lib/_iwaray_csubs.cc deleted file mode 100644 index 92b50af2..00000000 --- a/src/Molecule_Lib/_iwaray_csubs.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" -#include "path.h" -#include "substructure.h" - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Molecule_Lib/_iwaray_ofsmtype.cc b/src/Molecule_Lib/_iwaray_ofsmtype.cc deleted file mode 100644 index 83a24b3d..00000000 --- a/src/Molecule_Lib/_iwaray_ofsmtype.cc +++ /dev/null @@ -1,8 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" -#include "ostream_and_type.h" - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Molecule_Lib/_iwaray_p_rmele.cc b/src/Molecule_Lib/_iwaray_p_rmele.cc deleted file mode 
100644 index 2541fa06..00000000 --- a/src/Molecule_Lib/_iwaray_p_rmele.cc +++ /dev/null @@ -1,8 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" -#include "rmele.h" - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Molecule_Lib/_iwaray_p_rs.cc b/src/Molecule_Lib/_iwaray_p_rs.cc deleted file mode 100644 index 5801cffc..00000000 --- a/src/Molecule_Lib/_iwaray_p_rs.cc +++ /dev/null @@ -1,10 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "substructure.h" - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Molecule_Lib/_iwaray_path_scoring.cc b/src/Molecule_Lib/_iwaray_path_scoring.cc deleted file mode 100644 index 3bbe02fa..00000000 --- a/src/Molecule_Lib/_iwaray_path_scoring.cc +++ /dev/null @@ -1,10 +0,0 @@ -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" - -#include "path_scoring.h" - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Molecule_Lib/_iwaray_rxn.cc b/src/Molecule_Lib/_iwaray_rxn.cc deleted file mode 100644 index 9e829c11..00000000 --- a/src/Molecule_Lib/_iwaray_rxn.cc +++ /dev/null @@ -1,136 +0,0 @@ -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" - -#include "iwreaction.h" - -template class resizable_array_p; -template class resizable_array_base ; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array; -template class resizable_array_base; - -template class resizable_array; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - 
-template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; - -template class resizable_array_p; -template class resizable_array_base; -template class resizable_array; - -#ifdef DONTNEEDTHISANYMORE - -static void -unused () -{ - resizable_array ra; - ra.resize (0); - ra.add (nullptr); - ra[0]; - - resizable_array_p srs; - resizable_array_base srsb; - - srs.add (nullptr); - srsb[0]; - - resizable_array_p rfc; - resizable_array_base rfcb; - - rfc.add (nullptr); - - rfc[0]; - - resizable_array_p rpi; - resizable_array_base rpib; - - rpi.add (nullptr); - - rpi[0]; - - resizable_array_p rce; - resizable_array_base rceb; - - rce.add (nullptr); - rce[0]; - - resizable_array_p rsc; - resizable_array_base rscb; - - rsc.add (nullptr); - rsc[0]; - rscb.last_item (); - - resizable_array_p ipb; - resizable_array_base ipbb; - - ipb.add (nullptr); - ipb[0]; - - resizable_array_p mae; - resizable_array maep; - resizable_array_base maeb; - - mae.add (nullptr); - mae[0]; - - resizable_array_p nr; - resizable_array_base nrb; - - nr.add (nullptr); - nr[0]; - - resizable_array_p sa; - resizable_array_base sab; - - sab.add (nullptr); - (void) sab[0]; - - return; -} 
- -#endif diff --git a/src/Molecule_Lib/_iwaray_set_of_atoms.cc b/src/Molecule_Lib/_iwaray_set_of_atoms.cc deleted file mode 100644 index b5731d2b..00000000 --- a/src/Molecule_Lib/_iwaray_set_of_atoms.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "molecule.h" - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Molecule_Lib/_iwaray_ssq.cc b/src/Molecule_Lib/_iwaray_ssq.cc deleted file mode 100644 index 23ccf2d5..00000000 --- a/src/Molecule_Lib/_iwaray_ssq.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "substructure.h" -#include "path.h" - -template class resizable_array_p; -template class resizable_array_base; - diff --git a/src/Molecule_Lib/_iwaray_sym.cc b/src/Molecule_Lib/_iwaray_sym.cc deleted file mode 100644 index 879664b1..00000000 --- a/src/Molecule_Lib/_iwaray_sym.cc +++ /dev/null @@ -1,8 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "molecule.h" -#include "symmetry.h" - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Molecule_Lib/_resizable_array_atom.cc b/src/Molecule_Lib/_resizable_array_atom.cc deleted file mode 100644 index b2223e04..00000000 --- a/src/Molecule_Lib/_resizable_array_atom.cc +++ /dev/null @@ -1,48 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "atom.h" - -//template class resizable_array_p; -//template class resizable_array; -//template class resizable_array_base; -//template class resizable_array_p; -// template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array::resizable_array(); -template resizable_array::~resizable_array(); -template resizable_array_base::~resizable_array_base(); -template resizable_array_base::resizable_array_base(); - -template int resizable_array_base::ok() const; -template Atom * 
resizable_array_p::remove_no_delete(int); -template int resizable_array_p::resize(int); -template int resizable_array_p::remove_items(int const*); -template int resizable_array_p::remove_item(int); -template int resizable_array_base::index(Atom*) const; -template int resizable_array_base::add(Atom*); -template int resizable_array_base::resize(int); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_base::remove_item(int); -template int resizable_array_base::remove_items(int const*); - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::~resizable_array_base(); -template resizable_array_base::resizable_array_base(); - -template int resizable_array_base::add(Connection*); -template int resizable_array_p::remove_item(int); -template int resizable_array_p::resize(int); -template int resizable_array_base::resize(int); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_base::remove_item(int); -template resizable_array_p & resizable_array_p::operator=(resizable_array_p && rhs); - - -#if ! 
defined(NDEBUG) -template Connection * & resizable_array_base::operator[](int) const; -template int resizable_array_base::ok() const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_atom_list.cc b/src/Molecule_Lib/_resizable_array_atom_list.cc deleted file mode 100644 index 41f98965..00000000 --- a/src/Molecule_Lib/_resizable_array_atom_list.cc +++ /dev/null @@ -1,8 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION - -#include "mdl_file_data.h" - -//template class resizable_array_p; -//template class resizable_array_base; diff --git a/src/Molecule_Lib/_resizable_array_bond.cc b/src/Molecule_Lib/_resizable_array_bond.cc deleted file mode 100644 index 65dec12c..00000000 --- a/src/Molecule_Lib/_resizable_array_bond.cc +++ /dev/null @@ -1,29 +0,0 @@ - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" -#include "bond.h" - -//template class resizable_array; -//template class resizable_array; -//template class resizable_array_base; - -template resizable_array::resizable_array(); -template resizable_array::~resizable_array(); - -template int resizable_array_base::add(Bond const*); -template int resizable_array_base::insert_before(int, Bond const*); - -template resizable_array::resizable_array(); -template resizable_array::~resizable_array(); - -template int resizable_array::add_if_not_already_present(Bond const*); -template int resizable_array_base::insert_before(int, Bond*); -template int resizable_array_base::swap_elements(int, int); -template int resizable_array::resize_keep_storage(int); - -#if ! 
defined(NDEBUG) -template Bond * resizable_array_base::last_item() const; -template int resizable_array_base::ok() const; -#endif - -template const Bond * & resizable_array_base::operator[](int) const; diff --git a/src/Molecule_Lib/_resizable_array_const_set_of_atoms.cc b/src/Molecule_Lib/_resizable_array_const_set_of_atoms.cc deleted file mode 100644 index c62aedcb..00000000 --- a/src/Molecule_Lib/_resizable_array_const_set_of_atoms.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "molecule.h" - -//template class resizable_array; -//template class resizable_array_base; diff --git a/src/Molecule_Lib/_resizable_array_element.cc b/src/Molecule_Lib/_resizable_array_element.cc deleted file mode 100644 index e642e478..00000000 --- a/src/Molecule_Lib/_resizable_array_element.cc +++ /dev/null @@ -1,26 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "element.h" - -//template class resizable_array; -//template class resizable_array_base; - - -template resizable_array::resizable_array(); -template resizable_array::resizable_array(int); -template resizable_array::~resizable_array(); - -template int resizable_array_base::add(Element const*); -template void resizable_array_base::operator+=(resizable_array_base const&); -template resizable_array & resizable_array::operator=(resizable_array const&); -template int resizable_array_base::contains(Element const*) const; -template int resizable_array::resize_keep_storage(int); -template int resizable_array::add_if_not_already_present(Element const*); - -template int resizable_array_base::ok() const; -template const Element * resizable_array_base::item(int) const; - -#if ! 
defined(NDEBUG) -template const Element * & resizable_array_base::operator[](int) const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_lors.cc b/src/Molecule_Lib/_resizable_array_lors.cc deleted file mode 100644 index 3f531895..00000000 --- a/src/Molecule_Lib/_resizable_array_lors.cc +++ /dev/null @@ -1,10 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "molecule.h" - -//template class resizable_array_p; -//template class resizable_array_base; -#if ! defined(NDEBUG) -template List_of_Ring_Sizes * & resizable_array_base::operator[](int) const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_mdl_atom.cc b/src/Molecule_Lib/_resizable_array_mdl_atom.cc deleted file mode 100644 index 5d0920cb..00000000 --- a/src/Molecule_Lib/_resizable_array_mdl_atom.cc +++ /dev/null @@ -1,49 +0,0 @@ -#include - - -#define RESIZABLE_ARRAY_IMPLEMENTATION - -#include "mdl_file_data.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); - -template int resizable_array_base::swap_elements(int, int); -template int resizable_array_p::resize_keep_storage(int); -template int resizable_array_p::remove_item(int); -template int resizable_array_base::add(MDL_Atom_Data*); - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); -template resizable_array_base::~resizable_array_base(); -template resizable_array_base::resizable_array_base(); -template int resizable_array_p::resize(int); -template int resizable_array_p::resize_no_delete(int); - -template int resizable_array_base::add(MDL_Bond_Data*); 
-template int resizable_array_p::remove_item(int); -template int resizable_array_p::resize_keep_storage(int); -template int resizable_array_p::resize(int); -template int resizable_array_base::remove_item(int); -template int resizable_array_base::resize(int); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_base::remove_item(int); - -template int resizable_array_base::resize(int); -template resizable_array_p & resizable_array_p::operator=(resizable_array_p && rhs); -template resizable_array_p & resizable_array_p::operator=(resizable_array_p && rhs); - -#if ! defined(NDEBUG) -template MDL_Atom_Data * & resizable_array_base::operator[](int) const; -template MDL_Bond_Data * & resizable_array_base::operator[](int) const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_molecule.cc b/src/Molecule_Lib/_resizable_array_molecule.cc deleted file mode 100644 index 742de554..00000000 --- a/src/Molecule_Lib/_resizable_array_molecule.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "molecule.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::~resizable_array_base(); -template resizable_array_base::resizable_array_base(); - -template int resizable_array_base::add(Molecule*); -template int resizable_array_p::remove_item(int); -template void resizable_array_base::sort(int (*)(Molecule* const*, Molecule* const*)); -template int resizable_array_base::resize(int); -template int resizable_array_p::resize(int); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_p::resize_keep_storage(int); -template int resizable_array_base::remove_item(int); - -#if ! 
defined(NDEBUG) -template Molecule * & resizable_array_base::operator[](int) const; -template int resizable_array_base::ok() const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_molecule_to_match.cc b/src/Molecule_Lib/_resizable_array_molecule_to_match.cc deleted file mode 100644 index a0020899..00000000 --- a/src/Molecule_Lib/_resizable_array_molecule_to_match.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "target.h" - -//template class resizable_array_p; -//template class resizable_array_base; diff --git a/src/Molecule_Lib/_resizable_array_p_beep.cc b/src/Molecule_Lib/_resizable_array_p_beep.cc deleted file mode 100644 index 0a6324fd..00000000 --- a/src/Molecule_Lib/_resizable_array_p_beep.cc +++ /dev/null @@ -1,28 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" -#include "pearlman.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::~resizable_array_base(); - -template int resizable_array_p::remove_item(int); -template int resizable_array_base::add(Beep*); -template void resizable_array_base::sort(int (*)(Beep* const*, Beep* const*)); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_p::resize(int); -template int resizable_array_base::resize(int); -template resizable_array_base::resizable_array_base(); -template int resizable_array_base::remove_item(int); - -#if ! 
defined(NDEBUG) -template Beep * & resizable_array_base::operator[](int) const; -template int resizable_array_base::ok() const; -template int resizable_array_base::ok() const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_p_bond.cc b/src/Molecule_Lib/_resizable_array_p_bond.cc deleted file mode 100644 index 02f7b30e..00000000 --- a/src/Molecule_Lib/_resizable_array_p_bond.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" -#include "bond.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); - -template resizable_array_base::resizable_array_base(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); - -template int resizable_array_p::resize_keep_storage(int); -template Bond * resizable_array_base::item(int) const; -template int resizable_array_p::remove_item(int); -template Bond * resizable_array_p::remove_no_delete(int); -template int resizable_array_base::remove_first(Bond*); -template int resizable_array_base::ok() const; -template void resizable_array_base::sort(int (*)(Bond* const*, Bond* const*)); -template int resizable_array_base::add(Bond*); -template int resizable_array_p::resize(int); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_base::remove_item(int); -template int resizable_array_base::resize(int); -template Bond * & resizable_array_base::operator[](int) const; -template resizable_array_p & resizable_array_p::operator=(resizable_array_p && rhs); - -template resizable_array_base::~resizable_array_base(); -template int resizable_array_base::resize(int); - -#if ! 
defined(NDEBUG) -template const Bond * & resizable_array_base::operator[](int) const; -template Bond * resizable_array_base::last_item() const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_p_cs.cc b/src/Molecule_Lib/_resizable_array_p_cs.cc deleted file mode 100644 index 57aabf07..00000000 --- a/src/Molecule_Lib/_resizable_array_p_cs.cc +++ /dev/null @@ -1,36 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION - -#include "chiral_centre.h" - -//template class resizable_array_p; -//template class resizable_array_base; - - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); - -template int resizable_array_p::remove_item(int); -template int resizable_array_base::add(Chiral_Centre*); -template int resizable_array_p::resize_keep_storage(int); -template Chiral_Centre * resizable_array_p::remove_no_delete(int); -template int resizable_array_p::resize(int); -template int resizable_array_base::resize(int); -template int resizable_array_base::remove_item(int); -template resizable_array_p & resizable_array_p::operator=(resizable_array_p && rhs); - -template int resizable_array_p::resize_no_delete(int); - -template resizable_array::resizable_array(); -template resizable_array::~resizable_array(); -template int resizable_array::resize_keep_storage(int); - -#if ! 
defined(NDEBUG) -template Chiral_Centre * & resizable_array_base::operator[](int) const; -template Chiral_Centre * resizable_array_base::last_item() const; -template int resizable_array_base::ok() const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_p_ctb.cc b/src/Molecule_Lib/_resizable_array_p_ctb.cc deleted file mode 100644 index 41d596c5..00000000 --- a/src/Molecule_Lib/_resizable_array_p_ctb.cc +++ /dev/null @@ -1,8 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" -#include "cis_trans_bond.h" - -//template class resizable_array_p; -//template class resizable_array_base; diff --git a/src/Molecule_Lib/_resizable_array_p_element.cc b/src/Molecule_Lib/_resizable_array_p_element.cc deleted file mode 100644 index 1a9f9387..00000000 --- a/src/Molecule_Lib/_resizable_array_p_element.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "element.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); - -template int resizable_array_p::resize(int); -template int resizable_array_base::add(Element*); -template int resizable_array_base::resize(int); - -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_base::resize(int); -template Element * resizable_array_base::last_item() const; - -#if ! 
defined(NDEBUG) -template Element * & resizable_array_base::operator[](int) const; -template const Element * & resizable_array_base::operator[](int) const; -template int resizable_array_base::ok() const; -template const Element * resizable_array_base::item(int) const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_p_ema.cc b/src/Molecule_Lib/_resizable_array_p_ema.cc deleted file mode 100644 index 6fb08ca9..00000000 --- a/src/Molecule_Lib/_resizable_array_p_ema.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "ematch.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); -template int resizable_array_base::add(Element_Matcher*); -template int resizable_array_p::resize(int); -template resizable_array_base::~resizable_array_base(); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_base::resize(int); -template resizable_array_base::resizable_array_base(); - -#if ! 
defined(NDEBUG) -template int resizable_array_base::ok() const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_p_etr.cc b/src/Molecule_Lib/_resizable_array_p_etr.cc deleted file mode 100644 index 30ba401c..00000000 --- a/src/Molecule_Lib/_resizable_array_p_etr.cc +++ /dev/null @@ -1,24 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION - -#include "etrans.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::~resizable_array_base(); - -template resizable_array_base::resizable_array_base(); - -template int resizable_array_p::resize(int); -template int resizable_array_base::add(Element_Transformation*); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_base::resize(int); - -//template int resizable_array_p::resize(int); - -template int resizable_array_base::ok() const; diff --git a/src/Molecule_Lib/_resizable_array_p_fpc.cc b/src/Molecule_Lib/_resizable_array_p_fpc.cc deleted file mode 100644 index 83e8339c..00000000 --- a/src/Molecule_Lib/_resizable_array_p_fpc.cc +++ /dev/null @@ -1,18 +0,0 @@ -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "iwaray.h" -#include "dy_pool.h" - -#if defined(__GNUG__) || defined (__SUNPRO_CC) - -template class resizable_array_p; -template class resizable_array_base; - -#else - -static void -unused () -{ - resizable_array_p; -} - -#endif diff --git a/src/Molecule_Lib/_resizable_array_p_mgroup.cc b/src/Molecule_Lib/_resizable_array_p_mgroup.cc deleted file mode 100644 index 29457fb4..00000000 --- a/src/Molecule_Lib/_resizable_array_p_mgroup.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include - - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "iwaray.h" -#include "mgroup.h" - -#if defined(__GNUG__) || defined (__SUNPRO_CC) -template class resizable_array_p; -template class 
resizable_array_base; -#else - -static void -unused () -{ - resizable_array_p m; - resizable_array_base mb; - - mb[0]; - mb.add(nullptr); - - m.transfer_in (m); -} - -#endif diff --git a/src/Molecule_Lib/_resizable_array_p_path.cc b/src/Molecule_Lib/_resizable_array_p_path.cc deleted file mode 100644 index 80063215..00000000 --- a/src/Molecule_Lib/_resizable_array_p_path.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "path.h" - -//template class resizable_array_p; -//template class resizable_array_base; diff --git a/src/Molecule_Lib/_resizable_array_p_pthmsg.cc b/src/Molecule_Lib/_resizable_array_p_pthmsg.cc deleted file mode 100644 index 93d3b3e6..00000000 --- a/src/Molecule_Lib/_resizable_array_p_pthmsg.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" - -#include "pearlman.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_base::~resizable_array_base(); -template resizable_array_base::resizable_array_base(); - -template int resizable_array_p::remove_item(int); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_p::resize_keep_storage(int); -template int resizable_array_base::add(Path_Message*); -template int resizable_array_p::resize(int); -template int resizable_array_base::resize(int); -template int resizable_array_base::remove_item(int); - - -#if ! 
defined(NDEBUG) -template Path_Message * & resizable_array_base::operator[](int) const; -template const Path_Message * & resizable_array_base::operator[](int) const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_p_qry_dm.cc b/src/Molecule_Lib/_resizable_array_p_qry_dm.cc deleted file mode 100644 index c5c90016..00000000 --- a/src/Molecule_Lib/_resizable_array_p_qry_dm.cc +++ /dev/null @@ -1,17 +0,0 @@ -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "qry_and_demerit.h" -#include "path.h" - -#if defined(__GNUG__) || defined (__SUNPRO_CC) -template class resizable_array_p; -template class resizable_array_base; -#else - -static void -unused () -{ - resizable_array_p v; -} - -#endif - diff --git a/src/Molecule_Lib/_resizable_array_p_ring.cc b/src/Molecule_Lib/_resizable_array_p_ring.cc deleted file mode 100644 index 751ff683..00000000 --- a/src/Molecule_Lib/_resizable_array_p_ring.cc +++ /dev/null @@ -1,35 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "path.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); - -template int resizable_array_base::contains(Ring*) const; -template Ring * resizable_array_p::remove_no_delete(int); -template int resizable_array_p::transfer_in(resizable_array_p&); -template int resizable_array_base::remove_first(Ring*); -template void resizable_array_base::sort(int (*)(Ring* const*, Ring* const*)); -template int resizable_array_p::remove_item(int); -template int resizable_array_p::transfer_in(resizable_array_p&, int); -template int resizable_array_base::add(Ring*); -template int resizable_array_p::resize(int); -template int 
resizable_array_base::remove_item(int); -template int resizable_array_base::resize(int); -template int resizable_array_p::resize_no_delete(int); - -#if ! defined(NDEBUG) -template Ring * & resizable_array_base::operator[](int) const; -template const Ring * & resizable_array_base::operator[](int) const; -template int resizable_array_base::ok() const; -template int resizable_array_base::ok() const; -#endif - diff --git a/src/Molecule_Lib/_resizable_array_p_sms.cc b/src/Molecule_Lib/_resizable_array_p_sms.cc deleted file mode 100644 index a0518823..00000000 --- a/src/Molecule_Lib/_resizable_array_p_sms.cc +++ /dev/null @@ -1,6 +0,0 @@ -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "iwaray.h" -#include "dy_pool.h" - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Molecule_Lib/_resizable_array_path.cc b/src/Molecule_Lib/_resizable_array_path.cc deleted file mode 100644 index e300fbc6..00000000 --- a/src/Molecule_Lib/_resizable_array_path.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "iwaray.h" -#include "path.h" - -template class resizable_array; diff --git a/src/Molecule_Lib/_resizable_array_possible_lactim.cc b/src/Molecule_Lib/_resizable_array_possible_lactim.cc deleted file mode 100644 index ec2d3bc9..00000000 --- a/src/Molecule_Lib/_resizable_array_possible_lactim.cc +++ /dev/null @@ -1,18 +0,0 @@ -#define RESIZABLE_ARRAY_IMPLEMENTATION - -#include "molecule.h" - -#include "iwstandard.h" - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template int resizable_array_base::add(Possible_Lactim_Lactam*); -template Possible_Lactim_Lactam * & resizable_array_base::operator[](int) const; -template resizable_array::resizable_array(); -template resizable_array::~resizable_array(); -template int resizable_array_base::remove_item(int); -template int 
resizable_array_p::remove_item(int); - -#if ! defined(NDEBUG) -#endif diff --git a/src/Molecule_Lib/_resizable_array_ring.cc b/src/Molecule_Lib/_resizable_array_ring.cc deleted file mode 100644 index 09e91f37..00000000 --- a/src/Molecule_Lib/_resizable_array_ring.cc +++ /dev/null @@ -1,25 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "path.h" - -//template class resizable_array; -//template class resizable_array; -//template class resizable_array_base; - -template resizable_array::resizable_array(); -template resizable_array::~resizable_array(); - -template resizable_array::resizable_array(); -template resizable_array::~resizable_array(); - -template int resizable_array::add_if_not_already_present(Ring*); -template int resizable_array_base::resize(int); -template int resizable_array_base::add(Ring const*); -template int resizable_array_base::contains(Ring const*) const; -template resizable_array & resizable_array::operator=(resizable_array const&); -template int resizable_array::resize_keep_storage(int); - -#if ! 
defined(NDEBUG) -template Ring * & resizable_array_base::operator[](int) const; -#endif diff --git a/src/Molecule_Lib/_resizable_array_scc.cc b/src/Molecule_Lib/_resizable_array_scc.cc deleted file mode 100644 index 30eb75b4..00000000 --- a/src/Molecule_Lib/_resizable_array_scc.cc +++ /dev/null @@ -1,19 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "substructure.h" - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); - -template int resizable_array_p::resize(int); -template int resizable_array_base::resize(int); -template int resizable_array_p::resize_no_delete(int); - -template int resizable_array_base::add(Substructure_Chiral_Centre*); diff --git a/src/Molecule_Lib/_resizable_array_subs.cc b/src/Molecule_Lib/_resizable_array_subs.cc deleted file mode 100644 index 5d600609..00000000 --- a/src/Molecule_Lib/_resizable_array_subs.cc +++ /dev/null @@ -1,183 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#define EXTENDING_RESIZABLE_ARRAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" - -#include "substructure.h" - -//template class resizable_array; - -template resizable_array::resizable_array(); -template resizable_array::~resizable_array(); - -template int resizable_array::add_if_not_already_present(Substructure_Atom*); -template int resizable_array::resize_keep_storage(int); - -//template class resizable_array; -//template class resizable_array_p; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); - -template int resizable_array_p::transfer_in(resizable_array_p&, int); - -//template class resizable_array_base; - -template int 
resizable_array_base::remove_first(Substructure_Atom*); -template Substructure_Atom * resizable_array_base::pop(); -template int resizable_array_base::make_room_for_extra_items(int); -template int resizable_array_base::ok() const; -template int resizable_array::extend(int, Substructure_Atom*); -template resizable_array_base::~resizable_array_base(); -template int resizable_array_p::resize(int); -template int resizable_array_p::resize_no_delete(int); -//template int resizable_array_p::remove_no_delete(int); -//template int resizable_array_p::transfer_in(resizable_array_p&, int); -template Substructure_Atom * resizable_array_p::remove_no_delete(int); - -//template class extending_resizable_array; - -template Substructure_Atom * & extending_resizable_array::operator[](int); -template extending_resizable_array::extending_resizable_array(Substructure_Atom*); - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); - -template int resizable_array_p::remove_item(int); -template int resizable_array_p::resize(int); -template int resizable_array_base::add(Query_Atoms_Matched*); -template int resizable_array_p::resize_keep_storage(int); -template int resizable_array_base::resize(int); -template int resizable_array_p::resize_no_delete(int); -template resizable_array_base::~resizable_array_base(); -template int resizable_array_base::remove_item(int); -template int resizable_array_p::resize_no_delete(int); - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); - -template int resizable_array_base::add(Substructure_Bond*); -template int 
resizable_array_base::resize(int); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_p::resize(int); - -//template class resizable_array; -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); - -template int resizable_array_base::add(Substructure_Environment*); - -template resizable_array::~resizable_array(); -template resizable_array::resizable_array(); -template int resizable_array_base::resize(int); -template resizable_array_base::~resizable_array_base(); -template int resizable_array_p::resize(int); -template resizable_array_base::resizable_array_base(); - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); -template int resizable_array_p::resize(int); -template int resizable_array_p::resize_no_delete(int); - - -template int resizable_array_base::add(Substructure_Atom_Specifier*); -template resizable_array_base::~resizable_array_base(); - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::resizable_array_base(); -template resizable_array_base::~resizable_array_base(); - -template int resizable_array_base::add(Elements_Needed*); -template int resizable_array_base::resize(int); -template int resizable_array_p::resize(int); -template int resizable_array_p::resize_no_delete(int); - - -//template class resizable_array_p; -//template class resizable_array; -//template class resizable_array_base; - -//template class resizable_array_p; -//template class resizable_array_base; - -template resizable_array_base::resizable_array_base(); -template 
resizable_array_base::~resizable_array_base(); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_p::resize(int); -template int resizable_array_base::resize(int); - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::~resizable_array_p(); - -template int resizable_array_base::add(Link_Atom*); -template int resizable_array_p::transfer_in(resizable_array_p&); -template resizable_array::resizable_array(); -template resizable_array::~resizable_array(); - - -template resizable_array_p::resizable_array_p(); -template resizable_array_p::resizable_array_p(int); -template resizable_array_p::~resizable_array_p(); -template resizable_array_base::~resizable_array_base(); -template resizable_array_base::resizable_array_base(); - -template int resizable_array_p::resize(int); -template int resizable_array_p::resize_no_delete(int); -template int resizable_array_base::resize(int); -template resizable_array_p & resizable_array_p::operator=(resizable_array_p && rhs); - - -template int resizable_array_base::add(Substructure_Atom*); -template int resizable_array_base::resize(int); -template int resizable_array_base::remove_two_items(Substructure_Atom*, Substructure_Atom*); -template int resizable_array_base::remove_item(int); -//template int resizable_array_p::remove_no_delete(int); - -template int resizable_array_base::resize(int); - - -#if ! 
defined(NDEBUG) -template Substructure_Atom_Specifier * & resizable_array_base::operator[](int) const; -template Elements_Needed * & resizable_array_base::operator[](int) const; -template Substructure_Bond * & resizable_array_base::operator[](int) const; -template Substructure_Atom * & resizable_array_base::operator[](int) const; -template Substructure_Environment * & resizable_array_base::operator[](int) const; -template Query_Atoms_Matched * & resizable_array_base::operator[](int) const; -template Substructure_Atom * resizable_array_base::item(int) const; -template Substructure_Chiral_Centre * & resizable_array_base::operator[](int) const; -template int resizable_array_base::ok() const; -template Link_Atom * & resizable_array_base::operator[](int) const; -template int resizable_array_base::ok() const; -template int resizable_array_base::ok() const; -template int resizable_array_base::ok() const; -template int resizable_array_base::ok() const; -template int resizable_array_base::ok() const; -template int resizable_array_base::ok() const; -template int resizable_array_base::ok() const; -template int resizable_array_base::ok() const; - - -#endif - diff --git a/src/Molecule_Lib/_resizable_array_target.cc b/src/Molecule_Lib/_resizable_array_target.cc deleted file mode 100644 index 4c90e323..00000000 --- a/src/Molecule_Lib/_resizable_array_target.cc +++ /dev/null @@ -1,11 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#define IWARAY_IMPLEMENTATION -#include "Foundational/iwaray/iwaray.h" -#include "target.h" - -//template class iwaray; - -//template class resizable_array_p; -//template class resizable_array_base; diff --git a/src/Molecule_Lib/down_the_bond.cc b/src/Molecule_Lib/down_the_bond.cc index 8b5930f1..881a1cfb 100644 --- a/src/Molecule_Lib/down_the_bond.cc +++ b/src/Molecule_Lib/down_the_bond.cc @@ -1,6 +1,10 @@ +#include #include #include + +#define RESIZABLE_ARRAY_IMPLEMENTATION 1 + #include "Foundational/iwmisc/misc.h" #include "misc2.h" @@ 
-11,60 +15,742 @@ namespace down_the_bond { using std::cerr; -constexpr char open_brace = '{'; -constexpr char close_brace = '}'; +constexpr char kOpenBrace = '{'; +constexpr char kCloseBrace = '}'; +constexpr char kOpenSquareBracket = '['; +constexpr char kCloseSquareBracket = ']'; + +void +DownTheBond::DefaultValues() { + _match_as_match = 1; + _no_other_substituents_allowed = false; + _match_individual_substituent = false; + _all_queries_require_zero_hits = 0; +} DownTheBond::DownTheBond() { _a1 = -1; _a2 = -1; + DefaultValues(); } DownTheBond::DownTheBond(int a1) { _a1 = a1; _a2 = -1; + DefaultValues(); +} + +// Hold something like a>5; +// This means that the first token holds any operator specification, +// so parsing a>3;h[0] gets parsed into two tokens, with the first one +// holding the low priority and operator ';'. +// Note that currently operators are not supported. There is an implicit +// and operation for all specifications. +class DTB_Token { + public: + enum class Token { + kUndefined = 0, + kNatoms = 1, + kheteroatoms = 2, + kRingAtoms = 3, + kUnsaturation = 4, + kAromatic = 5, + kMaxDistance = 6, + kSmarts = 7 + }; + enum class Operator { + kUndefined = 0, + kHighPriorityAnd = 1, + kOr = 2, + kXor = 3, + kLowPriorityAnd = 4 + }; + + IWString _smarts; + + private: + // The compiler will probably pad this for alignment. 
+ const Token _type; + iwmatcher::Matcher _numeric; + Operator _op; + + // Private functions + int ParseRange(const const_IWSubstring& buffer, int& ndx); + int ParseMinMax(const const_IWSubstring& buffer, int& ndx); + + public: + DTB_Token(const Token token); + + int Build(const const_IWSubstring& buffer, int& ndx); + + Token type() const { + return _type; + } + + void set_smarts(const const_IWSubstring& buffer, int istart, int nchars) { + _smarts.set(buffer.data() + istart, nchars); + } + const IWString& smarts() const { + return _smarts; + } + + iwmatcher::Matcher numeric() const { + return _numeric; + } + + void set_operator(Operator s) { + _op = s; + } + + // Initialise a matcher based on the values held. + template void set_matcher(iwmatcher::Matcher& matcher) const; +}; + +DTB_Token::DTB_Token(const Token token) : _type(token) { + _op = Operator::kUndefined; +} + +template +void +DTB_Token::set_matcher(iwmatcher::Matcher& matcher) const { + matcher = _numeric; +} + +int +GetNumber(const const_IWSubstring& buffer, int& ndx, uint32_t& value) { + value = 0; + int rc = 0; + while (ndx < buffer.length() && isdigit(buffer[ndx])) { + value = value * 10 + buffer[ndx] - '0'; + ++ndx; + ++rc; + } + + return rc; +} + +// The letter part of the token has already been set, parse the numeric +// qualifier +// <3 +// >4 +// 6 +// {3-5} + +int +DTB_Token::Build(const const_IWSubstring& buffer, int& ndx) { + if (ndx >= buffer.length()) { + return 0; + } + + const char c = buffer[ndx]; + if (isdigit(c)) { + uint32_t value = 0; + if (! 
GetNumber(buffer, ndx, value)) { + return 0; + } + _numeric.add(value); + + return 1; + } + + // {3-5}, <4 >3 + if (c == kOpenBrace) { + return ParseRange(buffer, ndx); + } else if (c == '<') { + return ParseMinMax(buffer, ndx); + } else if (c == '>') { + return ParseMinMax(buffer, ndx); + } else { + cerr << "DTB_Token::Build:unrecognised numeric qualifier '" << c << "'\n"; + return 0; + } +} + +// ndx will be pointing at '>' or '<' +int +DTB_Token::ParseMinMax(const const_IWSubstring& buffer, int& ndx) { + if (ndx == buffer.length() - 1) { + cerr << "DTB_Token::ParseMinMax:no numeric qualifier\n"; + return 0; + } + + const char minmax = buffer[ndx]; + ++ndx; + uint32_t value; + if (! GetNumber(buffer, ndx, value)) { + cerr << "DTB_Token::ParseMinMax:invalid numeric\n"; + return 0; + } + + if (minmax == '<') { + _numeric.set_max(value - 1); + } else if (minmax == '>') { + _numeric.set_min(value + 1); + } else { + cerr << "DTB_Token::ParseMinMax:invalid operator '" << minmax << "'\n"; + return 0; + } + + return 1; } -// Various directives can be placed here. +// THis is not a robust parser. TODO:ianwatson see if we can use the +// code implemented in the substructure search code. +int +DTB_Token::ParseRange(const const_IWSubstring& buffer, int& ndx) { + assert(buffer[ndx] == kOpenBrace); + ++ndx; + if (ndx >= buffer.length() - 1) { + cerr << "DTB_Token::ParseRange:invalid range\n"; + return 0; + } + + uint32_t minval = 0; + if (! GetNumber(buffer, ndx, minval)) { + cerr << "DTB_Token::ParseRange:invalid minval\n"; + return 0; + } + + if (ndx >= buffer.length()) { + return 0; + } + + // {n} is OK, interpreted as a single value. 
+ if (buffer[ndx] == kCloseBrace) { + ++ndx; + _numeric.add(minval); + return 1; + } + + if (buffer[ndx] != '-') { + cerr << "DTB_Token::ParseRange:not - separating values, got '" << buffer[ndx] << "'\n"; + return 0; + } + ++ndx; + if (ndx >= buffer.length()) { + return 0; + } + if (buffer[ndx] == kCloseBrace) { + _numeric.set_min(minval); + ++ndx; + return 1; + } + + uint32_t maxval = 0; + if (! GetNumber(buffer, ndx, maxval)) { + cerr << "DTB_Token::ParseRange:invalid maxval\n"; + return 0; + } + + if (minval > maxval) { + cerr << "DTB_Token::ParseRange:invalid range " << minval << ' ' << maxval << '\n'; + return 0; + } + + if (buffer[ndx] != kCloseBrace) { + cerr << "DTB_Token::ParseRange:not closing brace\n"; + return 0; + } + + ++ndx; + + _numeric.set_min(minval); + _numeric.set_max(maxval); + + return 1; +} +int +QueryMatches::Build(const IWString& smarts, const iwmatcher::Matcher& numeric) { + _query = std::make_unique(); + if (! _query->construct_from_smarts_token(smarts.data(), smarts.size())) { + cerr << "QueryMatches::Build:invalid smarts '" << smarts << "'\n"; + return 0; + } + _query->count_attributes_specified(); + + _hits_needed = numeric; + + return 1; +} + +int +QueryMatches::RequiresZeroHits() const { + if (! _hits_needed.is_set()) { + return 0; + } + if (_hits_needed.match_any()) { + return 0; + } + + if (! 
_hits_needed.matches(0)) { + return 0; + } + + if (_hits_needed.size() != 1) { + return 0; + } + + uint32_t tmp; + if (_hits_needed.min(tmp) || _hits_needed.max(tmp)) { + return 0; + } + + return 1; +} + +int +FindClosingBracket(const const_IWSubstring& buffer, int ndx) { + assert(buffer[ndx] == kOpenSquareBracket); + + const int nchars = buffer.length(); + int bracket_level = 1; + + for (int i = ndx + 1; i < nchars; ++i) { + if (buffer[i] == kOpenSquareBracket) { + ++bracket_level; + } + if (buffer[i] == kCloseSquareBracket) { + --bracket_level; + if (bracket_level == 0) { + return i - ndx + 1; + } + } + } + + return -1; +} + +int +ParseToDTB(const const_IWSubstring& buffer, + resizable_array_p& tokens) { + int nchars = buffer.length(); + for (int i = 0; i < nchars; ++i) { + std::unique_ptr token; + int chars_consumed = 1; + switch (buffer[i]) { + case 'a': + token = std::make_unique(DTB_Token::Token::kNatoms); + break; + case 'h': + token = std::make_unique(DTB_Token::Token::kheteroatoms); + break; + case 'r': + token = std::make_unique(DTB_Token::Token::kRingAtoms); + break; + case 'u': + token = std::make_unique(DTB_Token::Token::kUnsaturation); + break; + case 'm': + token = std::make_unique(DTB_Token::Token::kAromatic); + break; + case 'd': + token = std::make_unique(DTB_Token::Token::kMaxDistance); + break; + case kOpenSquareBracket: + token = std::make_unique(DTB_Token::Token::kSmarts); + chars_consumed = FindClosingBracket(buffer, i); + if (chars_consumed < 0) { + cerr << "ParseToDTB:no closing brace '" << buffer << "'\n"; + return 0; + } + token->set_smarts(buffer, i, i + chars_consumed); + break; + default: + cerr << "ParseDTB:unrecognised property '" << buffer[i] << "'\n"; + return 0; + } + i += chars_consumed; + int isave = i; + if (! 
token->Build(buffer, i)) { + cerr << "ParseDTB:invalid specification '" << buffer << "'\n"; + cerr << " "; + for (int i = 0; i < isave; ++i) { + cerr << ' '; + } + cerr << "^\n"; + return 0; + } + + tokens << token.release(); + + if (i == buffer.length() - 1) { + break; + } + if (buffer[i] == ';') { + tokens.back()->set_operator(DTB_Token::Operator::kLowPriorityAnd); + } else if (buffer[i] == '&') { + tokens.back()->set_operator(DTB_Token::Operator::kHighPriorityAnd); + } else if (buffer[i] == '|') { + tokens.back()->set_operator(DTB_Token::Operator::kOr); + } else if (buffer[i] == '^') { + tokens.back()->set_operator(DTB_Token::Operator::kXor); + } + } + + return tokens.size(); +} + +// #define DEBUG_DOWN_THE_BOND_BUILD + +// A DownTheBond directive consists of a token followed +// by a numeric qualifier. +// a2, a>3, a<4, a[4-9] + +// The following tokens are recognised. // a total number of atoms seen down the bond. -// u total number of unmatched atoms seen downthe bond. +// h total number of heteroatoms seen down the bond. +// r total number of ring atoms seen down the bond. +// m total number of aromatic atoms seen down the bond. +// u total number of usaturated atoms seen down the bond. // d longest distance of any atom from the _a2. -// smarts for things that must NOT be present?? -// For now, only the 'a' directive is supported -// a2, a>3, a<4, a[4-9] -// Long term other directives would be supported via logical operators -// a9;d<4 +// [smarts] smarts for atoms seen down the bond. +// For now, logical operators are not supported. All tokens +// must be separated by a ';' operator. +// a[3-9];r0;[n]1;d<5 int DownTheBond::Build(const const_IWSubstring& buffer) { - // Temporary, pending more functionality being added. 
- if (buffer[0] != 'a') { - cerr << "DownTheBond::Build:not an atom directive '" << buffer << "'\n"; - return 0; + const_IWSubstring mybuffer(buffer); + if (mybuffer[0] == '!') { + _match_as_match = 0; } - const char * s = buffer.data(); - if (! substructure_spec::SmartsNumericQualifier(s + 1, buffer.length() - 1, _natoms)) { + + resizable_array_p tokens; + if (! ParseToDTB(mybuffer, tokens)) { cerr << "DownTheBond::Build:cannot parse '" << buffer << "'\n"; return 0; } +#ifdef DEBUG_DOWN_THE_BOND_BUILD + cerr << "Got " << tokens.size() << " Down the bond tokens\n"; +#endif + + // We cannot multiply specify a condition. + resizable_array seen; + seen.reserve(tokens.size()); + + for (const DTB_Token* dtb : tokens) { + if (dtb->type() == DTB_Token::Token::kUndefined) { + return 0; + } + // OK to have multiple smarts + if (dtb->type() == DTB_Token::Token::kSmarts) { + continue; + } + if (! seen.add_if_not_already_present(static_cast(dtb->type()))) { + cerr << "DownTheBond::Build:duplicate specification\n"; + return 0; + } + } + + for (const DTB_Token* dtb : tokens) { + switch (dtb->type()) { + case DTB_Token::Token::kUndefined: + return 0; + case DTB_Token::Token::kNatoms: + dtb->set_matcher(_natoms); + break; + case DTB_Token::Token::kRingAtoms: + dtb->set_matcher(_ring_atom_count); + break; + case DTB_Token::Token::kheteroatoms: + dtb->set_matcher(_heteroatom_count); + break; + case DTB_Token::Token::kUnsaturation: + dtb->set_matcher(_unsaturation_count); + break; + case DTB_Token::Token::kAromatic: + dtb->set_matcher(_aromatic_count); + break; + case DTB_Token::Token::kMaxDistance: + dtb->set_matcher(_max_distance); + break; + case DTB_Token::Token::kSmarts: + std::unique_ptr qm = std::make_unique(); + if (! 
qm->Build(dtb->smarts(), dtb->numeric())) { + cerr << "DownTheBond::Build:bad smarts '" << dtb->smarts() << "'\n"; + return 0; + } + _query << qm.release(); + break; + } + } + + _all_queries_require_zero_hits = AllQueriesRequireZeroHits(); + +#ifdef DEBUG_DOWN_THE_BOND_BUILD + cerr << "DownTheBond::Build:added " << _query.size() << " query specifications, _all_queries_require_zero_hits " << _all_queries_require_zero_hits << '\n'; +#endif return 1; } +static std::optional +IdentifyDTB(const Molecule& m, + atom_number_t zatom, + atom_number_t previous_atom, + atom_number_t avoid, + int* visited) { + int rc = 1; + visited[zatom] = 1; + for (const Bond* b : m[zatom]) { + atom_number_t o = b->other(zatom); + // cerr << "From " << zatom << " to " << o << " previous_atom " << previous_atom << " visited " << visited[o] << " avoid " << avoid << '\n'; + if (o == previous_atom) { + continue; + } + if (visited[o]) { + continue; + } + if (o == avoid) { + return std::nullopt; + } + + std::optional tmp = IdentifyDTB(m, o, zatom, avoid, visited); + if (! tmp) { + return std::nullopt; + } + rc += *tmp; + } + + return rc; +} + +// #define DEBUG_MATCHES_INDIVIDUAL_SUBSTITUENT + +// THis is somewhat inefficient. +// If _no_other_substituents_allowed is set, then once we find a non-match +// we should return 0. But instead we continue looking for matches. +// This is just about code complexity. +int +DownTheBond::MatchesIndividualSubstituent(Molecule& m, + atom_number_t a1, + atom_number_t a2, + int* visited) { + +#ifdef DEBUG_MATCHES_INDIVIDUAL_SUBSTITUENT + cerr << "DownTheBond::MatchesIndividualSubstituent:from " << a1 << " to " << a2 << '\n'; +#endif + int attachments_found = 0; + int attachments_matched = 0; + for (const Bond* b : m[a2]) { + const atom_number_t o = b->other(a2); + if (o == a1) { + continue; + } + + std::fill_n(visited, m.natoms(), 0); + visited[a1] = 1; + + std::optional natoms = IdentifyDTB(m, o, a2, a1, visited); + if (! 
natoms) { + // cerr << "No fragment found to atom " << o << ' ' << m.smarts_equivalent_for_atom(o) << '\n'; + continue; + } +#ifdef DEBUG_MATCHES_INDIVIDUAL_SUBSTITUENT + for (int i = 0; i < m.natoms(); ++i) { + cerr << " atom " << i << ' ' << m.smarts_equivalent_for_atom(i) << " visited " << visited[i] << '\n'; + } +#endif + visited[a1] = 0; + visited[a2] = 1; + +#ifdef DEBUG_MATCHES_INDIVIDUAL_SUBSTITUENT + cerr << "Found " << *natoms << " atoms in attachment\n"; +#endif + + ++attachments_found; + int matched_here = 0; + // Need to add 1 to natoms because a2 was not counted above. + // All other properties are computed from the visited array. + if (_natoms.is_set() && ! _natoms.matches(*natoms + 1)) { + continue; + } else { + matched_here = 1; + } + if (_heteroatom_count.is_set() && ! OkHeteratomCount(m, visited)) { + continue; + } else { + matched_here = 1; + } + if (_ring_atom_count.is_set() && ! OkRingAtomCount(m, visited)) { + continue; + } else { + matched_here = 1; + } + if (_unsaturation_count.is_set() && ! OkUnsaturationCount(m, visited)) { + continue; + } else { + matched_here = 1; + } + if (_aromatic_count.is_set() && ! OkAromaticCount(m, visited)) { + continue; + } else { + matched_here = 1; + } + if (_max_distance.is_set() && ! OkMaxDistance(m, a2, visited)) { + continue; + } else { + matched_here = 1; + } + + if (matched_here) { + ++attachments_matched; + } + } + +#ifdef DEBUG_MATCHES_INDIVIDUAL_SUBSTITUENT + cerr << "Found " << attachments_found << " attachments, " << attachments_matched << " matched\n"; +#endif + if (attachments_matched == 0) { + return ! _match_as_match; + } + + if (_no_other_substituents_allowed && attachments_matched < attachments_found) { + return ! 
_match_as_match; + } + + return 1; +} + +int +DownTheBond::OkHeteratomCount(const Molecule& m, const int* visited) const { + const int matoms = m.natoms(); + + int rc = 0; + for (int i = 0; i < matoms; ++i) { + if (visited[i] == 0) { + continue; + } + + if (m.atomic_number(i) != 6) { + ++rc; + } + } + + return _heteroatom_count.matches(rc); +} + +int +DownTheBond::OkUnsaturationCount(Molecule& m, const int* visited) const { + const int matoms = m.natoms(); + + int rc = 0; + for (int i = 0; i < matoms; ++i) { + if (visited[i] == 0) { + continue; + } + + if (m.is_aromatic(i)) { + continue; + } + + if (m.unsaturation(i)) { + ++rc; + } + } + + return _unsaturation_count.matches(rc); +} + +int +DownTheBond::OkAromaticCount(Molecule& m, const int* visited) const { + const int matoms = m.natoms(); + + int rc = 0; + for (int i = 0; i < matoms; ++i) { + if (visited[i] == 0) { + continue; + } + + if (m.is_aromatic(i)) { + ++rc; + } + } + + return _aromatic_count.matches(rc); +} + +int +DownTheBond::OkMaxDistance(Molecule& m, atom_number_t a2, const int* visited) const { + const int matoms = m.natoms(); + int maxdist = 0; + for (int i = 0; i < matoms; ++i) { + if (! visited[i]) { + continue; + } + const int d = m.bonds_between(a2, i); + if (d > maxdist) { + maxdist = d; + } + } + + return _max_distance.matches(maxdist); +} + +int +DownTheBond::OkRingAtomCount(Molecule& m, const int* visited) const { + const int matoms = m.natoms(); + + int rc = 0; + for (int i = 0; i < matoms; ++i) { + if (visited[i] == 0) { + continue; + } + + if (m.ring_bond_count(i) > 0) { + ++rc; + } + } + + return _ring_atom_count.matches(rc); +} + +// #define DEBUG_DOWN_THE_BOND_MATCHES + int DownTheBond::Matches(Molecule_to_Match& target, Query_Atoms_Matched & matched_atoms, int * visited) { - const Molecule& m = *target.molecule(); + Molecule& m = *target.molecule(); + + if (! matched_atoms.ok_index(_a1) || ! 
matched_atoms.ok_index(_a2)) { + cerr << "DownTheBond:Matches:invalid matched query atom number " << _a1 << " or " << _a2 << '\n'; + cerr << "Have " << matched_atoms.size() << " matched query atoms\n"; + return 0; + } const atom_number_t a1 = matched_atoms[_a1]->current_hold_atom()->atom_number(); const atom_number_t a2 = matched_atoms[_a2]->current_hold_atom()->atom_number(); - // cerr << "atoms " << a1 << " and " << a2 << '\n'; +#ifdef DEBUG_DOWN_THE_BOND_MATCHES + cerr << "atoms " << a1 << " and " << a2 << '\n'; +#endif if (! m.ok_atom_number(a1) || ! m.ok_atom_number(a2)) { cerr << "DownTheBond::Matches:invalid atom number " << a1 << " or " << a2 << '\n'; return 0; } + // cerr << "_match_individual_substituent " << _match_individual_substituent << '\n'; + if (_match_individual_substituent) { + return MatchesIndividualSubstituent(m, a1, a2, visited); + } + + const bool compute_natoms = _natoms.is_set(); + const bool compute_heteroatoms = _heteroatom_count.is_set(); + const bool compute_ring_atoms = _ring_atom_count.is_set(); + const bool compute_unsaturation = _unsaturation_count.is_set(); + const bool compute_aromatic = _aromatic_count.is_set(); + resizable_array atom_stack; + int heteroatoms = m.atomic_number(a2) != 6; + int ring_atoms = m.is_ring_atom(a2); + int aromatic = 0; + int unsaturation = 0; + if (m.is_aromatic(a2)) { + aromatic = 1; + } else if (! 
m.saturated(a2)) { + unsaturation = 1; + } + const Atom& atom2 = m.atom(a2); for (const Bond* b : atom2) { const atom_number_t j = b->other(a2); @@ -75,7 +761,7 @@ DownTheBond::Matches(Molecule_to_Match& target, } if (atom_stack.empty()) { - return _natoms.matches(1); + return NoAtomsDownTheBond(m, a1, a2); } std::fill_n(visited, m.natoms(), 0); @@ -91,10 +777,23 @@ DownTheBond::Matches(Molecule_to_Match& target, visited[i] = 1; ++number_visited; const Atom& atom = m.atom(i); + if (compute_heteroatoms && atom.atomic_number() != 6) { + ++heteroatoms; + } + if (compute_ring_atoms && m.ring_bond_count(i)) { + ++ring_atoms; + } + if (compute_unsaturation && ! m.is_aromatic(i) && atom.unsaturated()) { + ++unsaturation; + } + if (compute_aromatic && m.is_aromatic(i)) { + ++aromatic; + } + for (const Bond* bond : atom) { atom_number_t j = bond->other(i); if (j == a1) { // Must be part of a loop, must fail. - return 0; + return ! _match_as_match; } if (visited[j]) { continue; @@ -103,9 +802,154 @@ DownTheBond::Matches(Molecule_to_Match& target, } } - //cerr << "visited " << number_visited << " atom, matches " << _natoms.matches(number_visited) << '\n'; +#ifdef DEBUG_DOWN_THE_BOND_MATCHES + cerr << "From atom " << a1 << " visited " << number_visited << " atoms\n"; + cerr << "heteroatoms " << compute_heteroatoms << " value " << heteroatoms << '\n'; + cerr << "_heteroatom_count.matches " << _heteroatom_count.matches(heteroatoms) << '\n'; +#endif + + if (compute_heteroatoms && ! _heteroatom_count.matches(heteroatoms)) { + return ! _match_as_match; + } + if (compute_ring_atoms && ! _ring_atom_count.matches(ring_atoms)) { + return ! _match_as_match; + } + if (compute_unsaturation && ! _unsaturation_count.matches(unsaturation)) { + return ! _match_as_match; + } + if (compute_aromatic && ! _aromatic_count.matches(aromatic)) { + return ! _match_as_match; + } + if (compute_natoms && ! _natoms.matches(number_visited)) { + return ! 
_match_as_match; + } + +#ifdef DEBUG_DOWN_THE_BOND_MATCHES + cerr << "Have " << _query.size() << " query constraints\n"; +#endif + if (_query.empty()) { + return _match_as_match; + } + + const int matoms = m.natoms(); + // If any atoms have been visited, we run the queries. + // If no atoms have been visited, and all queries are for 0 occurrences, that is ok. + // If no atoms have been visited, and all queries require a positive match, that is a fail. + + // First case, some atoms visited. + if (std::any_of(visited, visited + matoms, [](int v) { + return v == 1; + })) { + } else if (_all_queries_require_zero_hits) { + return _match_as_match; + } else { + return !_match_as_match;; + } + + std::unique_ptr already_matched = std::make_unique(matoms); + + for (QueryMatches* qm : _query) { + uint32_t nhits = 0; + for (int i = 0; i < matoms; ++i) { + // cerr << i << ' ' << m.smarts_equivalent_for_atom(i) << " visited " << visited[i] << '\n'; + if (!visited[i]) { + continue; + } + std::fill_n(already_matched.get(), matoms, 0); + if (qm->ss_atom().matches(target[i], already_matched.get())) { + ++nhits; + } + } +#ifdef DEBUG_DOWN_THE_BOND_MATCHES + cerr << "Got " << nhits << " hits\n"; +#endif + if (! qm->Matches(nhits)) { + return ! _match_as_match; + } + } + + return _match_as_match; +} + +// If _query is empty, it does not matter what this function does. +int +DownTheBond::AllQueriesRequireZeroHits() const { + for (const QueryMatches* q : _query) { + if (!q->RequiresZeroHits()) { + return 0; + } + } + + return 1; +} + +// Atom a2 is terminal. There are no further atoms. By definition, atom a2 is +// included in what is downt he bond, so we have 1 matched atom. +int +DownTheBond::NoAtomsDownTheBond(Molecule& m, atom_number_t a1, atom_number_t a2) { + int positive_match_found = 0; + + if (! _natoms.is_set()) { + } else if (_natoms.matches(1)) { + ++positive_match_found; + } else { + return ! _match_as_match; + } + + if (! 
_heteroatom_count.is_set()) { + } else { + int h = (m.atomic_number(a2) != 6); + if (_heteroatom_count.matches(h)) { + ++positive_match_found; + } else { + return ! _match_as_match; + } + } + + if (! _ring_atom_count.is_set()) { + } else if (_ring_atom_count.matches(0)) { + ++positive_match_found; + } else { + return ! _match_as_match; + } + + if (! _unsaturation_count.is_set()) { + } else { + int u = 0; + if (m.is_aromatic(a2)) { + u = 0; + } else if (m.saturated(a2)) { + u = 0; + } else { + u = 1; + } + if (_unsaturation_count.matches(u)) { + ++positive_match_found; + } else { + return ! _match_as_match; + } + } + + if (! _aromatic_count.is_set()) { + } else if (_aromatic_count.matches(0)) { + ++positive_match_found; + } else { + return ! _match_as_match; + } + + if (! _max_distance.is_set()) { + } else if (_max_distance.matches(0)) { + ++positive_match_found; + } else { + return ! _match_as_match; + } + + // Nothing has rejected the value. If there was nothing specified, that is a match. + if (positive_match_found == 0) { + return 1; + } - return _natoms.matches(number_visited); + return _match_as_match; } } // namespace down_the_bond diff --git a/src/Molecule_Lib/etrans.h b/src/Molecule_Lib/etrans.h index 91fd7fef..f49e7a6a 100644 --- a/src/Molecule_Lib/etrans.h +++ b/src/Molecule_Lib/etrans.h @@ -58,7 +58,7 @@ class Element_Transformations : public resizable_array_p int active() const { return _number_elements;} - int construct_from_command_line(Command_Line &, int = 0, char = 't'); + int construct_from_command_line(Command_Line &, int verbose= 0, char = 't'); // Add a transformation directive 'Br=Cl' for example. 
int Add(const IWString& token); diff --git a/src/Molecule_Lib/istream_and_type.h b/src/Molecule_Lib/istream_and_type.h index 570f0011..90a5620b 100644 --- a/src/Molecule_Lib/istream_and_type.h +++ b/src/Molecule_Lib/istream_and_type.h @@ -363,6 +363,13 @@ data_source_and_type::next_molecule() { } } + if (moleculeio::ignore_all_chiral_information_on_input()) { + m->remove_all_chiral_centres(); + } + if (moleculeio::discard_directional_bonds_on_input()) { + m->revert_all_directional_bonds_to_non_directional(); + } + return m; } diff --git a/src/Molecule_Lib/iwmfingerprint.cc b/src/Molecule_Lib/iwmfingerprint.cc index ac66e9cc..2e8bd530 100644 --- a/src/Molecule_Lib/iwmfingerprint.cc +++ b/src/Molecule_Lib/iwmfingerprint.cc @@ -2310,9 +2310,10 @@ IWMFingerprint::_construct_fingerprint (Molecule & m, const int * include_these_atoms) { assert (nullptr == _bvector); - assert (0 == _nbits); - allocate_space_for_bits(bits_per_iwmfingerprint); + if (_nbits == 0) { + allocate_space_for_bits(bits_per_iwmfingerprint); + } _bvector = new_int(bits_per_iwmfingerprint); diff --git a/src/Molecule_Lib/iwreaction.cc b/src/Molecule_Lib/iwreaction.cc index 87bc2dcb..d39ee6ad 100644 --- a/src/Molecule_Lib/iwreaction.cc +++ b/src/Molecule_Lib/iwreaction.cc @@ -5070,7 +5070,7 @@ Sidechain_Reaction_Site::do_makes_breaks (Molecule & result, return Reaction_Site::do_makes_breaks(result, *embedding, offset, etmp); } -//#define DEBUG_MAKE_INTER_PARTICLE_BONDS +// #define DEBUG_MAKE_INTER_PARTICLE_BONDS int determine_atom_number(const Set_of_Atoms & scaffold_embedding, diff --git a/src/Molecule_Lib/iwsubstructure.cc b/src/Molecule_Lib/iwsubstructure.cc index 5f66fe1c..25882731 100644 --- a/src/Molecule_Lib/iwsubstructure.cc +++ b/src/Molecule_Lib/iwsubstructure.cc @@ -8,6 +8,7 @@ #include "Molecule_Lib/atom_typing.h" #include "Molecule_Lib/misc2.h" #include "Molecule_Lib/path.h" +#include "Molecule_Lib/rotbond_common.h" #include "Molecule_Lib/substructure.h" #include "Molecule_Lib/target.h" @@ 
-3252,13 +3253,80 @@ Single_Substructure_Query::_atom_type_groupings_matched(const Query_Atoms_Matche return 1; } +// Move from `a1` towards `a2` counting the number of rotatable bonds +// encountered. +int +SeparatedAtoms::RotatableBondsBetween(Molecule& m, + atom_number_t a1, + atom_number_t a2, + const int* bond_rotatable) const { + const int d = m.bonds_between(a1, a2); + + //cerr << "RotatableBondsBetween from " << a1 << " to " << a2 << " d " << d << '\n'; + + for (const Bond* b : m[a1]) { + const atom_number_t o = b->other(a1); + if (m.bonds_between(o, a2) != d - 1) { + continue; + } + + int rc = 0; + if (bond_rotatable[b->bond_number()]) { + rc = 1; + } + //cerr << " to atom " << o << " rotatable? " << rc << '\n'; + + if (o == a2) { + return rc; + } + + return rc + RotatableBondsBetween(m, o, a2, bond_rotatable); + } + + return 0; +} + +int +SeparatedAtoms::RotatableBondsBetween(Molecule& m, + atom_number_t a1, + atom_number_t a2) const { + static quick_rotbond::QuickRotatableBonds rotbond; + rotbond.set_calculation_type(quick_rotbond::QuickRotatableBonds::RotBond::kExpensive); + + std::unique_ptr bond_rotatable = std::make_unique(m.nedges()); + + const uint32_t rotatable_bonds = rotbond.Process(m, bond_rotatable.get()); + // If there is a minimum number of rotatable bonds between a1 and a2, and + // the total number of rotbonds in the molecule is below that, no match is possible. + // cerr << "MOlecule contains " << rotatable_bonds << " rotatable bonds\n"; + + uint32_t tmp; + if (_rotbond.min(tmp) && rotatable_bonds < tmp) { + return 0; + } + + return RotatableBondsBetween(m, a1, a2, bond_rotatable.get()); +} + int SeparatedAtoms::Matches(Molecule& m, const Set_of_Atoms& embedding) const { atom_number_t a1 = embedding[_a1]; atom_number_t a2 = embedding[_a2]; // cerr << "Matched atoms are " << a1 << " and " << a2 << " betw " << m.bonds_between(a1, a2) << '\n'; - return _separation.matches(m.bonds_between(a1, a2)); + if (! 
_separation.matches(m.bonds_between(a1, a2))) { + return 0; + } + + if (_rotbond.is_set()) { + // int rbb = RotatableBondsBetween(m, a1, a2); + // cerr << " rbb " << rbb << " atoms " << a1 << ' ' << a2 << '\n'; + if (! _rotbond.matches(RotatableBondsBetween(m, a1, a2))) { + return 0; + } + } + + return 1; } int diff --git a/src/Molecule_Lib/mdl.cc b/src/Molecule_Lib/mdl.cc index 5904eaf6..36f55fe0 100644 --- a/src/Molecule_Lib/mdl.cc +++ b/src/Molecule_Lib/mdl.cc @@ -1033,9 +1033,17 @@ Molecule::_mdl_set_bond_directionality(atom_number_t a1, atom_number_t a2, } if (1 == directionality) { - b->set_wedge_up(); + if (a1 == b->a1()) { + b->set_wedge_up(); + } else { + b->set_wedge_down(); + } } else if (6 == directionality) { - b->set_wedge_down(); + if (a1 == b->a1()) { + b->set_wedge_down(); + } else { + b->set_wedge_up(); + } } else if (4 == directionality) { b->set_wedge_either(); } else if (3 == directionality) { @@ -2236,6 +2244,9 @@ Molecule::_discern_chirality_from_wedge_bond(atom_number_t a1, atom_number_t a2, /* Atom ZATOM is 4 connected and has a wedge bond to atom A2 + + THis is buggy and not correct. I need to figure out how to parse + wedge bonds. 
*/ // #define DEBUG_DISCERN_CHIRALITY_FROM_WEDGE_BOND_4 diff --git a/src/Molecule_Lib/mdl_v30.cc b/src/Molecule_Lib/mdl_v30.cc index 2a4fdb0a..9684a577 100644 --- a/src/Molecule_Lib/mdl_v30.cc +++ b/src/Molecule_Lib/mdl_v30.cc @@ -555,6 +555,11 @@ Molecule::_parse_v30_bond_record(const const_IWSubstring & buffer, // for backward compatibility, we need to convert back to V2 numbers if (cfg) { + static int issue_warning = 1; + if (issue_warning) { + cerr << "Warning reading MDL 3000 CFG= bond records does not work well\n"; + issue_warning = 0; + } #define CORRECT #ifdef CORRECT if (1 == cfg) diff --git a/src/Molecule_Lib/molecular_formula.cc b/src/Molecule_Lib/molecular_formula.cc index 74af97bc..81a03275 100644 --- a/src/Molecule_Lib/molecular_formula.cc +++ b/src/Molecule_Lib/molecular_formula.cc @@ -106,29 +106,15 @@ static int ignore[256] = { 1, // 89 Y 1, // 90 Z 0, // 91 [ - 1, // 92 \ + 1, // 92 backslash 0, // 93 ] 1, // 94 ^ 1, // 95 _ 1, // 96 ` 1, // 97 a 1, // 98 b -#ifdef NOT_SURE_WHAT_IS_GOING_ON - I build this with ruby -256.times do |i| - q = if i.chr.to_str =~ /[[:print:]]/ - i.chr.to_str - else - ' ' - end - - $stdout << " 0, // #{i} #{q}\n" -end -But for some reason, the lowercase letters are offset. No idea -what went wrong, but ignoring it for now. -#endif - 1, // 99 c - 0, // 100 d + 0, // 99 c + 1, // 100 d 1, // 101 e 1, // 102 f 1, // 103 g @@ -138,13 +124,13 @@ what went wrong, but ignoring it for now. 
1, // 107 k 1, // 108 l 1, // 109 m - 1, // 110 n + 0, // 110 n 0, // 111 o 0, // 112 p - 0, // 113 q + 1, // 113 q 1, // 114 r - 1, // 115 s - 0, // 116 t + 0, // 115 s + 1, // 116 t 1, // 117 u 1, // 118 v 1, // 119 w @@ -382,18 +368,15 @@ static int is_single_letter_element[256] = { -1, // 89 Y -1, // 90 Z -1, // 91 [ - -1, // 92 \ + -1, // 92 backslash -1, // 93 ] -1, // 94 ^ -1, // 95 _ -1, // 96 ` -1, // 97 a -1, // 98 b -#ifdef NOT_SURE_WHAT_IS_GOING_ON - same thing here -#endif - -1, // 99 c - 0, // 100 d + 0, // 99 c + -1, // 100 d -1, // 101 e -1, // 102 f -1, // 103 g @@ -403,13 +386,13 @@ static int is_single_letter_element[256] = { -1, // 107 k -1, // 108 l -1, // 109 m - -1, // 110 n - 1, // 111 o - 2, // 112 p - 4, // 113 q + 1, // 110 n + 2, // 111 o + 4, // 112 p + -1, // 113 q -1, // 114 r - -1, // 115 s - 5, // 116 t + 5, // 115 s + -1, // 116 t -1, // 117 u -1, // 118 v -1, // 119 w @@ -564,24 +547,15 @@ MolecularFormula::Build(const const_IWSubstring& smiles) { std::fill_n(_count, kOther + 1, 0); -#ifdef NOT_SURE_WHAT_IS_GOING_ON - // used to debug this problem. 
- if (smiles == "Cr") { - for (int i = 0; i < 256; ++i) { - char c = static_cast(i); - std::cerr << i << " '" << c << "' ignore " << ignore[i] << '\n'; - } - } -#endif - + // std::cerr << "Parsing smiles '" << smiles << "'\n"; int rc = 0; for (int i = 0; i < nchars; ++i) { unsigned char c = smiles[i]; + // std::cerr << "i = " << i << " examining '" << c << "'\n"; if (c == ' ' || c == '\t') { return rc; } - // std::cerr << i << " '" << c << " ignore " << ignore[c] << '\n'; if (ignore[c]) { continue; } diff --git a/src/Molecule_Lib/molecular_formula_test.cc b/src/Molecule_Lib/molecular_formula_test.cc index 3ab399af..1b624905 100644 --- a/src/Molecule_Lib/molecular_formula_test.cc +++ b/src/Molecule_Lib/molecular_formula_test.cc @@ -44,7 +44,7 @@ class TestMf: public testing::TestWithParam { TEST_P(TestMf, TestWorks) { const auto params = GetParam(); - std::cerr << "Testing '" << params.smiles << "'\n"; + // std::cerr << "Testing '" << params.smiles << "'\n"; ASSERT_EQ(_mf.Build(params.smiles), params.expected) << params.smiles; const uint32_t* count = _mf.mf(); diff --git a/src/Molecule_Lib/molecule.h b/src/Molecule_Lib/molecule.h index 4dc5b992..8656db65 100644 --- a/src/Molecule_Lib/molecule.h +++ b/src/Molecule_Lib/molecule.h @@ -1409,6 +1409,14 @@ class __attribute__((visibility("default"))) Molecule : private resizable_array_ int identify_side_of_bond(int* either_side, atom_number_t astart, int flag, atom_number_t avoid) const; + // If successful, return the number of atoms down the from->to bond (including to); + // `down_the_bond` will be 1 for those atoms identified. + // Fails if the from-to bond is in a ring. + // Note that this does NOT initialise `down_the_bond`. + // THis can be helpful since it allows you to process all the connections to + // an atom, and assign different flag values to each connection. + std::optional DownTheBond(atom_number_t from, atom_number_t to, + int* down_the_bond, int flag=1) const; // All these functions work the same. 
If the flag(last argument) is set, then there // will be no requirement for the atoms to be bonded. diff --git a/src/Molecule_Lib/moleculed.cc b/src/Molecule_Lib/moleculed.cc index 4a4ac9ec..88f88aba 100644 --- a/src/Molecule_Lib/moleculed.cc +++ b/src/Molecule_Lib/moleculed.cc @@ -1150,3 +1150,45 @@ Molecule::recompute_distance_matrix() return rc; } + +static constexpr int kStartingAtom = 309; + +static int +DownTheBondInternal(const Molecule & m, + atom_number_t zatom, + atom_number_t previous_atom, + int* down_the_bond, + int flag) { + down_the_bond[zatom] = flag; + int rc = 1; + + for (const Bond* b : m[zatom]) { + atom_number_t o = b->other(zatom); + if (o == previous_atom) { + continue; + } + if (down_the_bond[o] == 0) { + int tmp = DownTheBondInternal(m, o, zatom, down_the_bond, flag); + if (tmp < 0) { + return -1; + } + rc += tmp; + } else if (down_the_bond[o] == kStartingAtom) { + return -1; + } + } + + return rc; +} + +std::optional +Molecule::DownTheBond(atom_number_t from, atom_number_t to, int* down_the_bond, int flag) const { + down_the_bond[from] = kStartingAtom; + const int rc = DownTheBondInternal(*this, to, from, down_the_bond, flag); + down_the_bond[from] = 0; + if (rc < 0) { + return std::nullopt; + } + + return rc; +} diff --git a/src/Molecule_Lib/reaction_from_smirks.cc b/src/Molecule_Lib/reaction_from_smirks.cc index 1e9addc1..59d0057e 100644 --- a/src/Molecule_Lib/reaction_from_smirks.cc +++ b/src/Molecule_Lib/reaction_from_smirks.cc @@ -1,12 +1,14 @@ #include + #include #include #include "Foundational/iwmisc/misc.h" -//#define TESTING_STUFF +// #define TESTING_STUFF #ifdef TESTING_STUFF #include "Foundational/cmdline/cmdline.h" + #include "Molecule_Lib/aromatic.h" #endif @@ -18,31 +20,28 @@ using std::cerr; static int smirks_lost_atom_means_remove_frgment = 1; void -set_smirks_lost_atom_means_remove_frgment(int s) -{ +set_smirks_lost_atom_means_remove_frgment(int s) { smirks_lost_atom_means_remove_frgment = s; } int 
-IWReaction::construct_from_smirks(const const_IWSubstring & smirks) -{ - if (! smirks.starts_with("F:")) +IWReaction::construct_from_smirks(const const_IWSubstring& smirks) { + if (!smirks.starts_with("F:")) { return _construct_from_smirks(smirks); + } IWString fname(smirks); fname.remove_leading_chars(2); iwstring_data_source input(fname.null_terminated_chars()); - if (! input.good()) - { + if (!input.good()) { cerr << "IWReaction::construct_from_smirks:cannot open '" << fname << "'\n"; return 0; } const_IWSubstring buffer; - if (! input.next_record(buffer)) - { + if (!input.next_record(buffer)) { cerr << "IWReaction::const_IWSubstring:cannot read smirks from '" << fname << "'\n"; return 0; } @@ -51,36 +50,32 @@ IWReaction::construct_from_smirks(const const_IWSubstring & smirks) } int -IWReaction::_construct_from_smirks(const const_IWSubstring & smirks) -{ +IWReaction::_construct_from_smirks(const const_IWSubstring& smirks) { int square_bracket_level = 0; const int n = smirks.length(); int first_open_angle = -1; int second_open_angle = -1; - for (int i = 0; i < n; ++i) - { + for (int i = 0; i < n; ++i) { const char c = smirks[i]; - if (']' == c) + if (']' == c) { square_bracket_level--; - else if ('[' == c) + } else if ('[' == c) { square_bracket_level++; - else if (0 == square_bracket_level && '>' == c) - { - if (first_open_angle < 0) + } else if (0 == square_bracket_level && '>' == c) { + if (first_open_angle < 0) { first_open_angle = i; - else - { + } else { second_open_angle = i; break; } } } - if (second_open_angle < 0) - { - cerr << "IWReaction::build_from_smirks:yipes, did not find >> construct '" << smirks << "'\n"; + if (second_open_angle < 0) { + cerr << "IWReaction::build_from_smirks:yipes, did not find >> construct '" << smirks + << "'\n"; return 0; } @@ -88,39 +83,35 @@ IWReaction::_construct_from_smirks(const const_IWSubstring & smirks) smirks.from_to(0, first_open_angle - 1, reagents); -//cerr << "Reagents " << reagents << '\n'; + // cerr << 
"Reagents " << reagents << '\n'; smirks.from_to(second_open_angle + 1, smirks.length() - 1, products); -//cerr << "products " << products << '\n'; + // cerr << "products " << products << '\n'; return construct_from_smirks(reagents, products); } void -fetch_closing_paren(const_IWSubstring & buffer, - IWString & token) -{ +fetch_closing_paren(const_IWSubstring& buffer, IWString& token) { int paren_level = 1; - for (auto i = 1; i < buffer.length(); ++i) - { + for (auto i = 1; i < buffer.length(); ++i) { const auto c = buffer[i]; - if ('(' == c) + if ('(' == c) { paren_level++; - else if (')' == c) - { + } else if (')' == c) { paren_level--; - if (0 == paren_level) - { - buffer.from_to(1, i-1, token); + if (0 == paren_level) { + buffer.from_to(1, i - 1, token); buffer += i; return; } } } - cerr << "fetch_closing_paren:did not find closing paren '" << buffer << "'\n"; // should not happen + cerr << "fetch_closing_paren:did not find closing paren '" << buffer + << "'\n"; // should not happen return; } @@ -137,60 +128,53 @@ fetch_closing_paren(const_IWSubstring & buffer, #ifdef OLD_VERSION_DOES_NOT_WORK static int -tokenise_smarts_into_components(const_IWSubstring smarts, // pass by value - resizable_array_p & components) -{ -//cerr << "Smarts '" << smarts << "'\n"; +tokenise_smarts_into_components(const_IWSubstring smarts, // pass by value + resizable_array_p& components) { + // cerr << "Smarts '" << smarts << "'\n"; - while (smarts.length() > 0) - { -// cerr << " smarts '" << smarts << "'\n"; + while (smarts.length() > 0) { + // cerr << " smarts '" << smarts << "'\n"; - if (smarts.starts_with('(')) - { - IWString * c = new IWString; + if (smarts.starts_with('(')) { + IWString* c = new IWString; fetch_closing_paren(smarts, *c); components.add(c); smarts++; - if (smarts.starts_with('.')) + if (smarts.starts_with('.')) { smarts++; - } - else - { - IWString * s = new IWString; + } + } else { + IWString* s = new IWString; -// cerr << "Check " << smarts.length() << " 
characters\n"; - for (int i = 0; i < smarts.length(); ++i) - { + // cerr << "Check " << smarts.length() << " characters\n"; + for (int i = 0; i < smarts.length(); ++i) { const char c = smarts[i]; - if ('.' != c) - { + if ('.' != c) { s->add(c); continue; } - if (smarts.matches_at_position(i, "...", 3)) - { + if (smarts.matches_at_position(i, "...", 3)) { *s << "..."; i += 2; continue; } -// cerr << "Found '" << *s << "'\n"; + // cerr << "Found '" << *s << "'\n"; components.add(s); smarts += (i + 1); -// cerr << "Smarts updated to " << smarts << '\n'; + // cerr << "Smarts updated to " << smarts << '\n'; s = nullptr; break; } - if (s != nullptr && s->length() > 0) - { -// cerr << "At end of loop '" << *s << "'\n"; + if (s != nullptr && s->length() > 0) { + // cerr << "At end of loop '" << *s << "'\n"; components.add(s); -// cerr << "Smarts now " << smarts << " s '" << *s << "' s->length() " << s->length() << '\n'; + // cerr << "Smarts now " << smarts << " s '" << *s << "' s->length() " << + // s->length() << '\n'; smarts += s->length(); } } @@ -202,9 +186,8 @@ tokenise_smarts_into_components(const_IWSubstring smarts, // pass by value // This badly needs unit tests. 
static int -tokenise_smarts_into_components(const_IWSubstring smarts, // pass by value - resizable_array_p & components) -{ +tokenise_smarts_into_components(const_IWSubstring smarts, // pass by value + resizable_array_p& components) { #ifdef debug_TOKENISE_SMARTS_INTO_COMPONENTS cerr << "Smarts '" << smarts << "'\n"; #endif @@ -215,16 +198,15 @@ tokenise_smarts_into_components(const_IWSubstring smarts, // pass by value #endif if (smarts.starts_with('(')) { - IWString * c = new IWString; + IWString* c = new IWString; fetch_closing_paren(smarts, *c); components.add(c); smarts++; - if (smarts.starts_with('.')) + if (smarts.starts_with('.')) { smarts++; - } - else - { - IWString * s = new IWString; + } + } else { + IWString* s = new IWString; while (smarts.length() > 0) { const char c = smarts[0]; @@ -256,14 +238,11 @@ tokenise_smarts_into_components(const_IWSubstring smarts, // pass by value return components.number_elements(); } - #ifdef NO_LONGER_USED_JJJJJ static void -file_scope_identify_atom_map_numbers(const Substructure_Query & q, - extending_resizable_array & numbers_in_use) -{ - for (int i = 0; i < q.number_elements(); ++i) - { +file_scope_identify_atom_map_numbers(const Substructure_Query& q, + extending_resizable_array& numbers_in_use) { + for (int i = 0; i < q.number_elements(); ++i) { q[i]->identify_atom_map_numbers(numbers_in_use); } @@ -271,17 +250,16 @@ file_scope_identify_atom_map_numbers(const Substructure_Query & q, } static void -file_scope_identify_atom_map_numbers(const Molecule & m, - extending_resizable_array & numbers_in_use) -{ +file_scope_identify_atom_map_numbers(const Molecule& m, + extending_resizable_array& numbers_in_use) { const int matoms = m.natoms(); - for (int i = 0; i < matoms; ++i) - { + for (int i = 0; i < matoms; ++i) { const auto a = m.atom_map_number(i); - if (0 == a) + if (0 == a) { continue; + } numbers_in_use[a]++; } @@ -291,48 +269,45 @@ file_scope_identify_atom_map_numbers(const Molecule & m, #endif int 
-IWReaction::construct_from_smirks(const const_IWSubstring & reagents, - const const_IWSubstring & products) -{ - if (0 != reagents.balance('(', ')')) - { - cerr << "IWReaction::construct_from_smirks:unbalanced parentheses in reagent specification '" << reagents << "'\n"; +IWReaction::construct_from_smirks(const const_IWSubstring& reagents, + const const_IWSubstring& products) { + if (0 != reagents.balance('(', ')')) { + cerr << "IWReaction::construct_from_smirks:unbalanced parentheses in reagent " + "specification '" + << reagents << "'\n"; return 0; } resizable_array_p smarts; - if (! tokenise_smarts_into_components(reagents, smarts)) - { + if (!tokenise_smarts_into_components(reagents, smarts)) { cerr << "IWReaction::construct_from_smirks:cannot separate reagent smarts\n"; return 0; } -//#define DEBUG_SMIRKS_TOKENISING +// #define DEBUG_SMIRKS_TOKENISING #ifdef DEBUG_SMIRKS_TOKENISING cerr << "From '" << reagents << "' generate\n"; - for (auto i = 0; i < smarts.number_elements(); ++i) - { - const IWString & s = *(smarts[i]); + for (auto i = 0; i < smarts.number_elements(); ++i) { + const IWString& s = *(smarts[i]); cerr << ' ' << s << '\n'; } #endif - const IWString & s = *(smarts[0]); - if (! this->create_from_smarts(s)) - { - cerr << "IWReaction::construct_from_smirks:cannot interpret scaffold smarts '" << s << "'\n"; + const IWString& s = *(smarts[0]); + if (!this->create_from_smarts(s)) { + cerr << "IWReaction::construct_from_smirks:cannot interpret scaffold smarts '" << s + << "'\n"; return 0; } - for (auto i = 1; i < smarts.number_elements(); ++i) - { - Sidechain_Reaction_Site * r = new Sidechain_Reaction_Site; + for (auto i = 1; i < smarts.number_elements(); ++i) { + Sidechain_Reaction_Site* r = new Sidechain_Reaction_Site; - if (! 
r->create_from_smarts(*smarts[i])) - { - cerr << "IWReaction::construct_from_smirks:invalid sidechain smarts '" << (*smarts[i]) << "'\n"; + if (!r->create_from_smarts(*smarts[i])) { + cerr << "IWReaction::construct_from_smirks:invalid sidechain smarts '" + << (*smarts[i]) << "'\n"; delete r; return 0; } @@ -342,73 +317,77 @@ IWReaction::construct_from_smirks(const const_IWSubstring & reagents, Substructure_Query product_molecule; - if (! product_molecule.create_from_smarts(products)) - { + if (!product_molecule.create_from_smarts(products)) { cerr << "IWReaction::construct_from_smirks:invalid product " << products << '\n'; return 0; } -// the RHS cannot contain ambiguous elements + // the RHS cannot contain ambiguous elements - if (product_molecule.any_query_atom([] (const Substructure_Atom * q) { if (q->element().number_elements() > 1) - return 1; - else - return 0; })) - { - cerr << "IWReaction::_identify_changes_from_smirks:RHS has multiple elements, impossible\n"; + if (product_molecule.any_query_atom([](const Substructure_Atom* q) { + if (q->element().number_elements() > 1) { + return 1; + } else { + return 0; + } + })) { + cerr << "IWReaction::_identify_changes_from_smirks:RHS has multiple elements, " + "impossible\n"; return 0; } -// GAther up all the atom map numbers that are in use on both sides + // GAther up all the atom map numbers that are in use on both sides - extending_resizable_array ramn; // reagent atom map numbers + extending_resizable_array ramn; // reagent atom map numbers this->identify_atom_map_numbers(ramn); const auto ns = _sidechains.number_elements(); - for (int i = 0; i < ns; ++i) - { + for (int i = 0; i < ns; ++i) { _sidechains[i]->identify_atom_map_numbers(ramn); } #ifdef DEBUG_CONSTRUCT_FROM_SMIRKS cerr << "Reagent atom numbers"; - for (int i = 0; i < ramn.number_elements(); ++i) - { - if (ramn[i] > 0) + for (int i = 0; i < ramn.number_elements(); ++i) { + if (ramn[i] > 0) { cerr << ' ' << i; + } } cerr << '\n'; #endif -// On the 
LHS, each atom map number must be used once + // On the LHS, each atom map number must be used once - int h = 0; // highest atom man number in use + int h = 0; // highest atom man number in use - for (int i = 1; i < ramn.number_elements(); ++i) - { - if (0 == ramn[i]) + for (int i = 1; i < ramn.number_elements(); ++i) { + if (0 == ramn[i]) { continue; + } h = i; - if (1 == ramn[i]) + if (1 == ramn[i]) { continue; + } - cerr << "IWReaction::construct_from_smirks:atom map number " << i << " found " << ramn[i] << " occurrences\n"; + cerr << "IWReaction::construct_from_smirks:atom map number " << i << " found " + << ramn[i] << " occurrences\n"; return 0; } const int h2 = product_molecule.highest_atom_map_number(); - if (h2 > h) + if (h2 > h) { h = h2; + } #ifdef DEBUG_CONSTRUCT_FROM_SMIRKS cerr << "Highest atom map number in use " << h << '\n'; #endif -// We need to assign atom map numbers to unmapped atoms in the product. These are, by definition -// orphan atoms. + // We need to assign atom map numbers to unmapped atoms in the product. These are, by + // definition orphan atoms. product_molecule.assign_atom_map_numbers(h); @@ -416,14 +395,13 @@ IWReaction::construct_from_smirks(const const_IWSubstring & reagents, cerr << "After assigning atom map number to product atoms, h = " << h << '\n'; #endif -// Ensure that all atom numbers are present. Identify any orphans - atoms in the products that -// do not have a corresponding atom in the reagents + // Ensure that all atom numbers are present. 
Identify any orphans - atoms in the + // products that do not have a corresponding atom in the reagents extending_resizable_array pamn(0); product_molecule.identify_atom_map_numbers(pamn); - if (pamn.empty()) - { + if (pamn.empty()) { cerr << "IWReaction::construct_from_smirks:No mapped atoms on RHS, cannot continue\n"; return 0; } @@ -432,83 +410,83 @@ IWReaction::construct_from_smirks(const const_IWSubstring & reagents, resizable_array orphan_atoms; - for (int i = 1; i < istop; i++) - { + for (int i = 1; i < istop; i++) { #ifdef DEBUG_CONSTRUCT_FROM_SMIRKS cerr << "i = " << i << " ran " << ramn[i] << " pan " << pamn[i] << '\n'; #endif - if (1 == ramn[i] && 1 == pamn[i]) // one occurrence on LHS and RHS + if (1 == ramn[i] && 1 == pamn[i]) { // one occurrence on LHS and RHS continue; + } - if (0 == ramn[i] && 0 == pamn[i]) // atom map number not used either side, OK + if (0 == ramn[i] && 0 == pamn[i]) { // atom map number not used either side, OK continue; + } - if (ramn[i] == pamn[i]) // equal, but not 1, bad + if (ramn[i] == pamn[i]) // equal, but not 1, bad { - cerr << "IWReaction::construct_from_smirks:atom map " << i << " used " << ramn[i] << " in reagents and " << pamn[i] << " in products\n"; + cerr << "IWReaction::construct_from_smirks:atom map " << i << " used " << ramn[i] + << " in reagents and " << pamn[i] << " in products\n"; return 0; } - if (1 == ramn[i] && 0 == pamn[i]) // atom disappears, this is OK + if (1 == ramn[i] && 0 == pamn[i]) { // atom disappears, this is OK continue; + } - if (0 == ramn[i] && 1 == pamn[i]) - { + if (0 == ramn[i] && 1 == pamn[i]) { orphan_atoms.add(i); continue; } - cerr << "IWReaction::construct_from_smirks:atom number mismatch, atom " << i << " in reagents " << ramn[i] << " in products " << pamn[i] << '\n'; + cerr << "IWReaction::construct_from_smirks:atom number mismatch, atom " << i + << " in reagents " << ramn[i] << " in products " << pamn[i] << '\n'; return 0; } Molecule orphan_molecule; - if 
(orphan_atoms.number_elements()) - { - Sidechain_Reaction_Site * x = new Sidechain_Reaction_Site(); + if (orphan_atoms.number_elements()) { + Sidechain_Reaction_Site* x = new Sidechain_Reaction_Site(); - if (! _create_orphan_molecule(orphan_atoms, orphan_molecule, product_molecule, *x)) - { - cerr << "IWReaction::_identify_changes_from_smirks:cannot construct orphan molecule, has " << orphan_atoms.number_elements() << " orphan atoms\n"; + if (!_create_orphan_molecule(orphan_atoms, orphan_molecule, product_molecule, *x)) { + cerr << "IWReaction::_identify_changes_from_smirks:cannot construct orphan " + "molecule, has " + << orphan_atoms.number_elements() << " orphan atoms\n"; delete x; return 0; } - + _sidechains.add(x); } - for (int i = 0; i < _sidechains.number_elements(); ++i) - { + for (int i = 0; i < _sidechains.number_elements(); ++i) { _sidechains[i]->set_sidechain_number(i); } - IWString foo ("FOO.qry"); - product_molecule.write_msi(foo); + // IWString foo ("FOO.qry"); + // product_molecule.write_msi(foo); - return _identify_changes_from_smirks(ramn, h+1, orphan_molecule, product_molecule); + return _identify_changes_from_smirks(ramn, h + 1, orphan_molecule, product_molecule); } static int -add_bond_to_orphan(Molecule & orphan_molecule, - const atom_number_t a1, - const atom_number_t a2, - const Substructure_Bond * b) -{ - if (orphan_molecule.are_bonded(a1, a2)) // this process will find most bonds twice +add_bond_to_orphan(Molecule& orphan_molecule, const atom_number_t a1, + const atom_number_t a2, const Substructure_Bond* b) { + if (orphan_molecule.are_bonded(a1, a2)) { // this process will find most bonds twice return 0; + } const bond_type_t bt = b->types_matched(); - if (IS_SINGLE_BOND(bt)) + if (IS_SINGLE_BOND(bt)) { orphan_molecule.add_bond(a1, a2, SINGLE_BOND); - else if (IS_DOUBLE_BOND(bt)) + } else if (IS_DOUBLE_BOND(bt)) { orphan_molecule.add_bond(a1, a2, DOUBLE_BOND); - else if (IS_TRIPLE_BOND(bt)) + } else if (IS_TRIPLE_BOND(bt)) { 
orphan_molecule.add_bond(a1, a2, TRIPLE_BOND); - else - { - cerr << "IWReaction::_create_orphan_molecule:bond type not unitary form " << bt << ", assuming single bond!!\n"; + } else { + cerr << "IWReaction::_create_orphan_molecule:bond type not unitary form " << bt + << ", assuming single bond!!\n"; orphan_molecule.add_bond(a1, a2, SINGLE_BOND); } @@ -516,89 +494,95 @@ add_bond_to_orphan(Molecule & orphan_molecule, } int -IWReaction::_create_orphan_molecule(const resizable_array & orphan_atoms, // atom map numbers - Molecule & orphan_molecule, - const Substructure_Query & product_molecule, - Sidechain_Reaction_Site & sc) -{ +IWReaction::_create_orphan_molecule( + const resizable_array& orphan_atoms, // atom map numbers + Molecule& orphan_molecule, const Substructure_Query& product_molecule, + Sidechain_Reaction_Site& sc) { const int n = orphan_atoms.number_elements(); const int mx = orphan_atoms.max_val(); -// convenient to have two cross reference arrays + // convenient to have two cross reference arrays - int * mol2qry = new int[n];std::unique_ptr free_mol2qry(mol2qry); - int * amap2mol = new_int(mx + 1, -1); std::unique_ptr free_amap2mol(amap2mol); + int* mol2qry = new int[n]; + std::unique_ptr free_mol2qry(mol2qry); + int* amap2mol = new_int(mx + 1, -1); + std::unique_ptr free_amap2mol(amap2mol); - const Substructure_Atom ** query_atoms = new const Substructure_Atom *[n]; std::unique_ptr free_query_atoms(query_atoms); -//resizable_array query_atoms; + const Substructure_Atom** query_atoms = new const Substructure_Atom*[n]; + std::unique_ptr free_query_atoms(query_atoms); + // resizable_array query_atoms; - for (int i = 0; i < orphan_atoms.number_elements(); ++i) - { + for (int i = 0; i < orphan_atoms.number_elements(); ++i) { const int oi = orphan_atoms[i]; - const Substructure_Atom * qi = product_molecule.query_atom_with_atom_map_number(oi); + const Substructure_Atom* qi = product_molecule.query_atom_with_atom_map_number(oi); - const 
Substructure_Atom_Specifier * si; - if (qi->ncomponents() > 0) // was complex smarts + const Substructure_Atom_Specifier* si; + if (qi->ncomponents() > 0) { // was complex smarts si = qi->component(0); - else // was simple + } else { // was simple si = qi; + } - assert (nullptr != qi); + assert(nullptr != qi); - const auto & e = si->element(); - if (e.empty()) - { - cerr << "IWReaction::_create_orphan_molecule:RHS orphan atom has no element specified, impossible. Atom map " << oi << '\n'; + const auto& e = si->element(); + if (e.empty()) { + cerr << "IWReaction::_create_orphan_molecule:RHS orphan atom has no element " + "specified, impossible. Atom map " + << oi << '\n'; return 0; } const int x = orphan_molecule.natoms(); - query_atoms[x] = qi; // the X'th query atom + query_atoms[x] = qi; // the X'th query atom - amap2mol[oi] = x; // atom map OI is atom X in molecule + amap2mol[oi] = x; // atom map OI is atom X in molecule orphan_molecule.add(e[0]); orphan_molecule.set_atom_map_number(x, oi); - const auto & iso = si->isotope(); - if (iso.number_elements() > 0) + const auto& iso = si->isotope(); + if (iso.number_elements() > 0) { orphan_molecule.set_isotope(x, iso[0]); - const auto & fc = si->formal_charge(); - if (fc.number_elements() > 0) + } + const auto& fc = si->formal_charge(); + if (fc.number_elements() > 0) { orphan_molecule.set_formal_charge(x, fc[0]); + } } -// Now the atoms have been added do the bonds. We need to handle both bonds within the orphan -// atoms, and bonds to other parts of the products + // Now the atoms have been added do the bonds. 
We need to handle both bonds within the + // orphan atoms, and bonds to other parts of the products - for (int i = 0; i < orphan_atoms.number_elements(); ++i) - { - const Substructure_Atom * qi = query_atoms[i]; + for (int i = 0; i < orphan_atoms.number_elements(); ++i) { + const Substructure_Atom* qi = query_atoms[i]; const int nc = qi->number_children(); - for (int j = 0; j < nc; ++j) // scan all atoms attached to the corresponding query atom + for (int j = 0; j < nc; + ++j) // scan all atoms attached to the corresponding query atom { - const Substructure_Atom * c = qi->child(j); - const Substructure_Bond * b = c->bond_to_parent(); + const Substructure_Atom* c = qi->child(j); + const Substructure_Bond* b = c->bond_to_parent(); const int atom_number_in_orphan = amap2mol[c->atom_map_number()]; - if (atom_number_in_orphan >= 0) + if (atom_number_in_orphan >= 0) { add_bond_to_orphan(orphan_molecule, i, atom_number_in_orphan, b); + } } - const auto & bonds = qi->bonds(); + const auto& bonds = qi->bonds(); - for (int j = 0; j < bonds.number_elements(); ++j) - { - const Substructure_Bond * b = bonds[j]; - const Substructure_Atom * a = b->a(); + for (int j = 0; j < bonds.number_elements(); ++j) { + const Substructure_Bond* b = bonds[j]; + const Substructure_Atom* a = b->a(); const int atom_number_in_orphan = amap2mol[a->atom_map_number()]; - if (atom_number_in_orphan >= 0) + if (atom_number_in_orphan >= 0) { add_bond_to_orphan(orphan_molecule, i, atom_number_in_orphan, b); + } } } @@ -617,8 +601,8 @@ IWReaction::_create_orphan_molecule(const resizable_array & orphan_atoms, if all product atoms are mapped to a query atom on the LHS, this reaction can be used for enumerations of numerous reagents. - If there are orphan atoms on the RHS, not present on the LHS, then we must assume this is a - fixed reagent sidechain. + If there are orphan atoms on the RHS, not present on the LHS, then we must assume this + is a fixed reagent sidechain. 
As we look at the molecules on the RHS, there will be two kinds of atoms. Those with a corresponding atom number on the reagent side. @@ -628,164 +612,185 @@ IWReaction::_create_orphan_molecule(const resizable_array & orphan_atoms, */ int -IWReaction::_identify_changes_from_smirks(const extending_resizable_array & atom_map_numbers_in_reagents, - const int istop, - const Molecule & orphan_molecule, - const Substructure_Query & product_molecule) -{ +IWReaction::_identify_changes_from_smirks( + const extending_resizable_array& atom_map_numbers_in_reagents, const int istop, + const Molecule& orphan_molecule, const Substructure_Query& product_molecule) { #ifdef DEBUG_CONSTRUCT_FROM_SMIRKS - cerr << "IWReaction::_identify_changes_from_smirks:must examine " << istop << " mapped atoms\n"; + cerr << "IWReaction::_identify_changes_from_smirks:must examine " << istop + << " mapped atoms\n"; product_molecule.print_connectivity_graph(cerr); #endif resizable_array atoms_lost; - for (int i = 0; i < istop; i++) - { -// cerr << "Atom map i " << i << " mapped in reagents " << atom_map_numbers_in_reagents[i] << '\n'; - if (atom_map_numbers_in_reagents[i] <= 0) // atom map number not present on LHS, orphan + for (int i = 0; i < istop; i++) { + // cerr << "Atom map i " << i << " mapped in reagents " << + // atom_map_numbers_in_reagents[i] << '\n'; + if (atom_map_numbers_in_reagents[i] <= + 0) { // atom map number not present on LHS, orphan continue; + } - Reaction_Site * r1 = _reaction_site_with_atom_map_number(i); - assert (nullptr != r1); + Reaction_Site* r1 = _reaction_site_with_atom_map_number(i); + assert(nullptr != r1); - const Substructure_Atom * q1 = r1->query_atom_with_atom_map_number(i); - assert (nullptr != q1); + const Substructure_Atom* q1 = r1->query_atom_with_atom_map_number(i); + assert(nullptr != q1); - const Substructure_Atom * q2 = product_molecule.query_atom_with_atom_map_number(i); + const Substructure_Atom* q2 = product_molecule.query_atom_with_atom_map_number(i); 
- if (nullptr == q2) // atom eliminated in products + if (nullptr == q2) { // atom eliminated in products atoms_lost.add(i); - else + } else { _discern_atomic_changes(*r1, *q1, *q2); + } } - if (atoms_lost.number_elements()) - { - for (int i = 0; i < atoms_lost.number_elements(); ++i) - { + if (atoms_lost.number_elements()) { + for (int i = 0; i < atoms_lost.number_elements(); ++i) { const int j = atoms_lost[i]; - Reaction_Site * r = _reaction_site_with_atom_map_number(j); - assert (nullptr != r); + Reaction_Site* r = _reaction_site_with_atom_map_number(j); + assert(nullptr != r); - const Substructure_Atom * s = r->query_atom_with_atom_map_number(j); - assert (nullptr != s); + const Substructure_Atom* s = r->query_atom_with_atom_map_number(j); + assert(nullptr != s); const int x = s->initial_atom_number(); - if (smirks_lost_atom_means_remove_frgment) + if (smirks_lost_atom_means_remove_frgment) { r->add_fragment_to_be_removed(x); - else + } else { r->add_atom_to_be_removed(x); + } } } #ifdef DEBUG_CONSTRUCT_FROM_SMIRKS - for (int i = 1; i < istop; ++i) - { - cerr << i << " atom_map_numbers_in_reagents " << atom_map_numbers_in_reagents[i] << '\n'; + for (int i = 1; i < istop; ++i) { + cerr << i << " atom_map_numbers_in_reagents " << atom_map_numbers_in_reagents[i] + << '\n'; } #endif -// Any bond changes. this must be done on a per sidechain basis because there may be bonding -// changes that occur just within a given sidechain -// e.g. ([CH3:1].[NH2D1:2]).[OH:3]-[C:4]=[O:5]>>[CH2:1]-[NH:2]-[C:4]=[O:5] + // Any bond changes. this must be done on a per sidechain basis because there may be + // bonding changes that occur just within a given sidechain e.g. 
+ // ([CH3:1].[NH2D1:2]).[OH:3]-[C:4]=[O:5]>>[CH2:1]-[NH:2]-[C:4]=[O:5] - for (int i = 0; i < istop; i++) - { -// cerr << i << " atom_map_numbers_in_reagents " << atom_map_numbers_in_reagents[i] << '\n'; + for (int i = 0; i < istop; i++) { + // cerr << i << " atom_map_numbers_in_reagents " << atom_map_numbers_in_reagents[i] + // << '\n'; - Reaction_Site * i_site = _reaction_site_with_atom_map_number(i); // may be NULL + Reaction_Site* i_site = _reaction_site_with_atom_map_number(i); // may be NULL - for (int j = i + 1; j < istop; j++) - { -// cerr << i << ',' << j << " atom_map_numbers_in_reagents " << atom_map_numbers_in_reagents[i] << " j " << ' ' << atom_map_numbers_in_reagents[j] << '\n'; - if (atom_map_numbers_in_reagents[i] <= 0 && atom_map_numbers_in_reagents[j] <= 0) // numbers not used + for (int j = i + 1; j < istop; j++) { + // cerr << i << ',' << j << " atom_map_numbers_in_reagents " << + // atom_map_numbers_in_reagents[i] << " j " << ' ' << + // atom_map_numbers_in_reagents[j] << '\n'; + if (atom_map_numbers_in_reagents[i] <= 0 && + atom_map_numbers_in_reagents[j] <= 0) { // numbers not used continue; + } - Reaction_Site * j_site = _reaction_site_with_atom_map_number(j); // may be NULL + Reaction_Site* j_site = _reaction_site_with_atom_map_number(j); // may be NULL - if (nullptr == i_site && nullptr == j_site) // should not happen + if (nullptr == i_site && nullptr == j_site) { // should not happen continue; + } - const Substructure_Bond * bonded_in_product = product_molecule.bond_between_atom_map_numbers(i, j); -// cerr << "Between mapped atoms " << i << " and " << j << " bnd " << bonded_in_product << " Product\n"; + const Substructure_Bond* bonded_in_product = + product_molecule.bond_between_atom_map_numbers(i, j); + // cerr << "Between mapped atoms " << i << " and " << j << " bnd " << + // bonded_in_product << " Product\n"; - const Substructure_Bond * bonded_in_reagent; - if (i_site == j_site) // two mapped atoms in same reagent + const 
Substructure_Bond* bonded_in_reagent; + if (i_site == j_site) { // two mapped atoms in same reagent bonded_in_reagent = i_site->bond_between_atom_map_numbers(i, j); - else // not in same reagent, therefore not bonded - bonded_in_reagent = nullptr; + } else { // not in same reagent, therefore not bonded + bonded_in_reagent = nullptr; + } -// cerr << "bonded? " << bonded_in_reagent << ' ' << bonded_in_product << '\n'; + // cerr << "bonded? " << bonded_in_reagent << ' ' << bonded_in_product << '\n'; - if (nullptr == bonded_in_reagent && nullptr == bonded_in_product) // no bond btw I and J either side + if (nullptr == bonded_in_reagent && + nullptr == bonded_in_product) { // no bond btw I and J either side continue; + } - bond_type_t btr = nullptr == bonded_in_reagent ? INVALID_BOND_TYPE : BOND_TYPE_ONLY(bonded_in_reagent->types_matched()); - bond_type_t btp = nullptr == bonded_in_product ? INVALID_BOND_TYPE : BOND_TYPE_ONLY(bonded_in_product->types_matched()); + bond_type_t btr = nullptr == bonded_in_reagent + ? INVALID_BOND_TYPE + : BOND_TYPE_ONLY(bonded_in_reagent->types_matched()); + bond_type_t btp = nullptr == bonded_in_product + ? INVALID_BOND_TYPE + : BOND_TYPE_ONLY(bonded_in_product->types_matched()); - if (btr == btp) // same bond type, no need to change anything + if (btr == btp) { // same bond type, no need to change anything continue; + } -// the bonding changes. Is it a removal, an addition or a change? What components are involved? + // the bonding changes. Is it a removal, an addition or a change? What components + // are involved? -// cerr << "Between atoms " << i << " and " << j << " reagent? " << bonded_in_reagent << " (" << btr << ") products " << bonded_in_product << " (" << btp << ")\n"; -// cerr << "Sites " << i_site << " and " << j_site << '\n'; + // cerr << "Between atoms " << i << " and " << j << " reagent? 
" << + // bonded_in_reagent << " (" << btr << ") products " << bonded_in_product << " (" + // << btp << ")\n"; cerr << "Sites " << i_site << " and " << j_site << '\n'; - if (i_site == j_site) // new/changed or removed bond in the same component + if (i_site == j_site) // new/changed or removed bond in the same component { - const Substructure_Atom * ai = i_site->query_atom_with_atom_map_number(i); - const Substructure_Atom * aj = j_site->query_atom_with_atom_map_number(j); -// cerr << "Atom numbers " << ai->initial_atom_number() << " and " << aj->initial_atom_number() << " xBTP " << btp << '\n'; - if (INVALID_BOND_TYPE == btp) // bond is removed - i_site->add_bond_to_be_broken(ai->initial_atom_number(), aj->initial_atom_number()); - else // new or changed - i_site->add_bond_to_be_made(ai->initial_atom_number(), aj->initial_atom_number(), btp); + const Substructure_Atom* ai = i_site->query_atom_with_atom_map_number(i); + const Substructure_Atom* aj = j_site->query_atom_with_atom_map_number(j); + // cerr << "Atom numbers " << ai->initial_atom_number() << " and " << + // aj->initial_atom_number() << " xBTP " << btp << '\n'; + if (INVALID_BOND_TYPE == btp) { // bond is removed + i_site->add_bond_to_be_broken(ai->initial_atom_number(), + aj->initial_atom_number()); + } else { // new or changed + i_site->add_bond_to_be_made(ai->initial_atom_number(), + aj->initial_atom_number(), btp); + } continue; } - if (nullptr != i_site && nullptr != j_site) // new inter partile bond btw existing reagents + if (nullptr != i_site && + nullptr != j_site) // new inter partile bond btw existing reagents { -// cerr << "Adding inter particle bond btw " << i << " and " << j << '\n'; - if (! 
_from_smirks_add_inter_particle_bond(i, i_site, j, j_site, btp)) + // cerr << "Adding inter particle bond btw " << i << " and " << j << '\n'; + if (!_from_smirks_add_inter_particle_bond(i, i_site, j, j_site, btp)) { return 0; + } continue; } -// One of these atom map numbers is not present on the LHS + // One of these atom map numbers is not present on the LHS -// cerr << "Bond involves different sites. I " << i_site << " J " << j_site << " type " << btp << '\n'; - assert (INVALID_BOND_TYPE != btp); + // cerr << "Bond involves different sites. I " << i_site << " J " << j_site << " + // type " << btp << '\n'; + assert(INVALID_BOND_TYPE != btp); - if (nullptr == i_site) - { - const Substructure_Atom * q2 = j_site->query_atom_with_atom_map_number(j); + if (nullptr == i_site) { + const Substructure_Atom* q2 = j_site->query_atom_with_atom_map_number(j); const atom_number_t x2 = q2->initial_atom_number(); const atom_number_t x1 = orphan_molecule.atom_with_atom_map_number(i); const int r = _sidechain_with_mapped_atom(j); _sidechains.last_item()->add_inter_particle_bond(r, x2, x1, btp); - } - else if (nullptr == j_site) - { - const Substructure_Atom * q1 = i_site->query_atom_with_atom_map_number(i); + } else if (nullptr == j_site) { + const Substructure_Atom* q1 = i_site->query_atom_with_atom_map_number(i); const atom_number_t x1 = q1->initial_atom_number(); const atom_number_t x2 = orphan_molecule.atom_with_atom_map_number(j); const int r = _sidechain_with_mapped_atom(i); _sidechains.last_item()->add_inter_particle_bond(r, x1, x2, btp); - } - else // HUH! + } else { // HUH! return 0; + } } } -// deal with chirality sometime - hard... + // deal with chirality sometime - hard... 
return 1; } @@ -798,39 +803,35 @@ IWReaction::_identify_changes_from_smirks(const extending_resizable_array & */ int -IWReaction::_from_smirks_add_inter_particle_bond(int amap1, - Reaction_Site * a1site, - int amap2, - Reaction_Site * a2site, - bond_type_t bt) -{ - assert (nullptr != a1site); - assert (nullptr != a2site); - - const Substructure_Atom * a1 = a1site->query_atom_with_atom_map_number(amap1); - const Substructure_Atom * a2 = a2site->query_atom_with_atom_map_number(amap2); - assert (nullptr != a1); - assert (nullptr != a2); - -// If either one of the sites is the scaffold, handle specially - - if (a1site == this) - { +IWReaction::_from_smirks_add_inter_particle_bond(int amap1, Reaction_Site* a1site, + int amap2, Reaction_Site* a2site, + bond_type_t bt) { + assert(nullptr != a1site); + assert(nullptr != a2site); + + const Substructure_Atom* a1 = a1site->query_atom_with_atom_map_number(amap1); + const Substructure_Atom* a2 = a2site->query_atom_with_atom_map_number(amap2); + assert(nullptr != a1); + assert(nullptr != a2); + + // If either one of the sites is the scaffold, handle specially + + if (a1site == this) { const int r2 = _sidechain_with_mapped_atom(amap2); - _sidechains[r2]->add_inter_particle_bond(-1, a1->initial_atom_number(), a2->initial_atom_number(), bt); - } - else if (a2site == this) - { + _sidechains[r2]->add_inter_particle_bond(-1, a1->initial_atom_number(), + a2->initial_atom_number(), bt); + } else if (a2site == this) { const int r1 = _sidechain_with_mapped_atom(amap1); - _sidechains[r1]->add_inter_particle_bond(-1, a2->initial_atom_number(), a1->initial_atom_number(), bt); - } - else // different sidechains + _sidechains[r1]->add_inter_particle_bond(-1, a2->initial_atom_number(), + a1->initial_atom_number(), bt); + } else // different sidechains { const int r1 = _sidechain_with_mapped_atom(amap1); const int r2 = _sidechain_with_mapped_atom(amap2); - _sidechains[r1]->add_inter_particle_bond(r2, a2->initial_atom_number(), 
a1->initial_atom_number(), bt); + _sidechains[r1]->add_inter_particle_bond(r2, a2->initial_atom_number(), + a1->initial_atom_number(), bt); } - + return 1; } @@ -840,15 +841,14 @@ IWReaction::_from_smirks_add_inter_particle_bond(int amap1, int IWReaction::_from_smirks_add_inter_particle_bond_involves_scaffold(int a1, int a2, - bond_type_t bt) -{ + bond_type_t bt) { int r2 = -1; - for (auto i = 0; i < _sidechains.number_elements(); ++i) - { - Substructure_Atom * x = _sidechains[i]->query_atom_with_atom_map_number(a2); - if (nullptr == x) + for (auto i = 0; i < _sidechains.number_elements(); ++i) { + Substructure_Atom* x = _sidechains[i]->query_atom_with_atom_map_number(a2); + if (nullptr == x) { continue; + } r2 = i; break; @@ -856,9 +856,10 @@ IWReaction::_from_smirks_add_inter_particle_bond_involves_scaffold(int a1, int a cerr << "Atom " << a1 << " in scaffold, atom " << a2 << " in sidechain " << r2 << '\n'; - if (r2 < 0) - { - cerr << "IWReaction::_from_smirks_add_inter_particle_bond_involves_scaffold:no sidechain for " << a2 << '\n'; + if (r2 < 0) { + cerr << "IWReaction::_from_smirks_add_inter_particle_bond_involves_scaffold:no " + "sidechain for " + << a2 << '\n'; return 0; } @@ -873,151 +874,148 @@ IWReaction::_from_smirks_add_inter_particle_bond_involves_scaffold(int a1, int a */ int -IWReaction::_discern_atomic_changes(Reaction_Site & r, - const Substructure_Atom & q1, - const Substructure_Atom & q2) -{ +IWReaction::_discern_atomic_changes(Reaction_Site& r, const Substructure_Atom& q1, + const Substructure_Atom& q2) { const int nc1 = q1.ncomponents(); const int nc2 = q2.ncomponents(); const auto a = q1.initial_atom_number(); - if (nc1 > 0 && nc2 > 0) + if (nc1 > 0 && nc2 > 0) { return _discern_atomic_changes_specifier(r, a, *q1.component(0), *q2.component(0)); - if (nc1 > 0) + } + if (nc1 > 0) { return _discern_atomic_changes_specifier(r, a, *q1.component(0), q2); - if (nc2 > 0) + } + if (nc2 > 0) { return _discern_atomic_changes_specifier(r, a, q1, 
*q2.component(0)); + } return _discern_atomic_changes_specifier(r, a, q1, q2); } int -IWReaction::_discern_atomic_changes_specifier(Reaction_Site & r, - const atom_number_t a, - const Substructure_Atom_Specifier & q1, - const Substructure_Atom_Specifier & q2) -{ - - const auto & e2 = q2.element(); - if (e2.number_elements() > 1) - { +IWReaction::_discern_atomic_changes_specifier(Reaction_Site& r, const atom_number_t a, + const Substructure_Atom_Specifier& q1, + const Substructure_Atom_Specifier& q2) { + const auto& e2 = q2.element(); + if (e2.number_elements() > 1) { cerr << "IWReaction::_discern_atomic_changes:RHS has multiple elements\n"; - for (int i = 0; i < e2.number_elements(); ++i) - { + for (int i = 0; i < e2.number_elements(); ++i) { cerr << ' ' << e2[i]; } cerr << '\n'; return 0; } - if (e2.number_elements() > 0) - { - Reaction_Change_Element * rce = new Reaction_Change_Element; + if (e2.number_elements() > 0) { + Reaction_Change_Element* rce = new Reaction_Change_Element; rce->set_atom(a); rce->set_element(e2[0]); r.add_element_to_be_changed(rce); } - const auto & fc1 = q1.Substructure_Atom_Specifier::formal_charge(); - const auto & fc2 = q2.Substructure_Atom_Specifier::formal_charge(); + const auto& fc1 = q1.Substructure_Atom_Specifier::formal_charge(); + const auto& fc2 = q2.Substructure_Atom_Specifier::formal_charge(); int need_to_set_formal_charge = 99; - if (fc1.empty() && fc2.empty()) // not specified either side + if (fc1.empty() && fc2.empty()) { // not specified either side ; - else if (fc1.number_elements() > 0 && fc2.empty()) // set on LHS, but not on right + } else if (fc1.number_elements() > 0 && fc2.empty()) { // set on LHS, but not on right need_to_set_formal_charge = 0; - else if (fc2.number_elements() > 0) // set on RHS, must set + } else if (fc2.number_elements() > 0) { // set on RHS, must set need_to_set_formal_charge = fc2[0]; + } - if (99 != need_to_set_formal_charge) + if (99 != need_to_set_formal_charge) { 
r.add_formal_charge_to_assign(a, need_to_set_formal_charge); + } - const auto & iso1 = q1.isotope(); - const auto & iso2 = q2.isotope(); + const auto& iso1 = q1.isotope(); + const auto& iso2 = q2.isotope(); - if (iso2.number_elements() > 1) - { + if (iso2.number_elements() > 1) { cerr << "IWReaction::_discern_atomic_changes:RHS has multiple isotopes\n"; return 0; } int need_to_set_isotope = 2776704; - if (iso1.empty() && iso2.empty()) + if (iso1.empty() && iso2.empty()) { ; - else if (iso1.number_elements() > 0 && iso2.empty()) + } else if (iso1.number_elements() > 0 && iso2.empty()) { need_to_set_isotope = 0; - else if (iso2.number_elements()) + } else if (iso2.number_elements()) { need_to_set_isotope = iso2[0]; + } - if (2776704 != need_to_set_isotope) + if (2776704 != need_to_set_isotope) { r.add_isotope_to_be_placed(a, need_to_set_isotope); + } -//cerr << "need_to_set_isotope " << need_to_set_isotope << '\n'; + // cerr << "need_to_set_isotope " << need_to_set_isotope << '\n'; const aromaticity_type_t arom1 = q1.Substructure_Atom_Specifier::aromaticity(); const aromaticity_type_t arom2 = q2.Substructure_Atom_Specifier::aromaticity(); - if (arom1 != arom2) // finish this sometime... + if (arom1 != arom2) // finish this sometime... 
{ } return 1; } - -const Element * -Substructure_Atom::first_specified_element() const -{ - if (_element.number_elements()) +const Element* +Substructure_Atom::first_specified_element() const { + if (_element.number_elements()) { return _element[0]; + } - for (int i = 0; i < _components.number_elements(); i++) - { - const Substructure_Atom_Specifier * c = _components[i]; + for (int i = 0; i < _components.number_elements(); i++) { + const Substructure_Atom_Specifier* c = _components[i]; - if (c->element().number_elements()) + if (c->element().number_elements()) { return c->element()[0]; + } } return nullptr; } int -Substructure_Atom::first_specified_formal_charge() const -{ - if (_formal_charge.number_elements()) +Substructure_Atom::first_specified_formal_charge() const { + if (_formal_charge.number_elements()) { return _formal_charge[0]; + } - for (int i = 0; i < _components.number_elements(); i++) - { - const Substructure_Atom_Specifier * c = _components[i]; + for (int i = 0; i < _components.number_elements(); i++) { + const Substructure_Atom_Specifier* c = _components[i]; - const Min_Max_Specifier & fc = c->formal_charge(); + const Min_Max_Specifier& fc = c->formal_charge(); - if (fc.number_elements()) + if (fc.number_elements()) { return fc[0]; + } } return 0; } int -Substructure_Atom::first_specified_isotope() const -{ - if (_isotope.number_elements()) +Substructure_Atom::first_specified_isotope() const { + if (_isotope.number_elements()) { return _isotope[0]; + } - for (int i = 0; i < _components.number_elements(); i++) - { - const Substructure_Atom_Specifier * c = _components[i]; + for (int i = 0; i < _components.number_elements(); i++) { + const Substructure_Atom_Specifier* c = _components[i]; - const iwmatcher::Matcher & iso = c->isotope(); + const iwmatcher::Matcher& iso = c->isotope(); - if (iso.number_elements()) + if (iso.number_elements()) { return iso[0]; + } } return 0; @@ -1028,20 +1026,16 @@ Substructure_Atom::first_specified_isotope() const static 
int verbose = 0; static void -usage(int rc) -{ +usage(int rc) { exit(rc); } template int -test_reaction_from_smirks_record(const const_IWSubstring & buffer, - T & output) -{ +test_reaction_from_smirks_record(const const_IWSubstring& buffer, T& output) { IWReaction rxn; - if (! rxn.construct_from_smirks(buffer)) - { + if (!rxn.construct_from_smirks(buffer)) { cerr << "Invalid smirks\n"; return 0; } @@ -1051,15 +1045,11 @@ test_reaction_from_smirks_record(const const_IWSubstring & buffer, template int -test_reaction_from_smirks(iwstring_data_source & input, - T & output) -{ +test_reaction_from_smirks(iwstring_data_source& input, T& output) { const_IWSubstring buffer; - while (input.next_record(buffer)) - { - if (! test_reaction_from_smirks_record(buffer, output)) - { + while (input.next_record(buffer)) { + if (!test_reaction_from_smirks_record(buffer, output)) { cerr << "Fatal error processing '" << buffer << "'\n"; return 0; } @@ -1070,13 +1060,10 @@ test_reaction_from_smirks(iwstring_data_source & input, template int -test_reaction_from_smirks(const char * fname, - T & output) -{ +test_reaction_from_smirks(const char* fname, T& output) { iwstring_data_source input(fname); - if (! input.good()) - { + if (!input.good()) { cerr << "Cannot open '" << fname << "'\n"; return 0; } @@ -1085,41 +1072,33 @@ test_reaction_from_smirks(const char * fname, } static int -test_reaction_from_smirks(const int argc, - char ** argv) -{ +test_reaction_from_smirks(const int argc, char** argv) { Command_Line cl(argc, argv, "vA:E:i:g:l"); - if (cl.unrecognised_options_encountered()) - { + if (cl.unrecognised_options_encountered()) { cerr << "Unrecognised options encountered\n"; usage(1); } verbose = cl.option_count('v'); - if (cl.option_present('A')) - { - if (! 
process_standard_aromaticity_options(cl, verbose, 'A')) - { + if (cl.option_present('A')) { + if (!process_standard_aromaticity_options(cl, verbose, 'A')) { cerr << "Cannot initialise aromaticity specifications\n"; usage(5); } - } - else + } else { set_global_aromaticity_type(Daylight); + } - if (cl.option_present('E')) - { - if (! process_elements(cl, verbose, 'E')) - { + if (cl.option_present('E')) { + if (!process_elements(cl, verbose, 'E')) { cerr << "Cannot initialise elements\n"; return 6; } } - if (cl.empty()) - { + if (cl.empty()) { cerr << "Insufficient arguments\n"; usage(1); } @@ -1128,8 +1107,7 @@ test_reaction_from_smirks(const int argc, } int -main(int argc, char **argv) -{ +main(int argc, char** argv) { int rc = test_reaction_from_smirks(argc, argv); return rc; diff --git a/src/Molecule_Lib/rotbond_common.cc b/src/Molecule_Lib/rotbond_common.cc index 7891007d..6cd55ceb 100644 --- a/src/Molecule_Lib/rotbond_common.cc +++ b/src/Molecule_Lib/rotbond_common.cc @@ -6,6 +6,7 @@ #include +#include #include #include @@ -370,7 +371,7 @@ IsAmide(Molecule& m, } int -QuickRotatableBonds::Process(Molecule& m) { +QuickRotatableBonds::Process(Molecule& m, int* bond_rotatable) { if (m.empty()) { return 0; } @@ -380,6 +381,11 @@ QuickRotatableBonds::Process(Molecule& m) { return 0; } + if (bond_rotatable != nullptr) { + std::fill_n(bond_rotatable, m.nedges(), 0); + m.assign_bond_numbers_to_bonds_if_needed(); + } + // Force sssr. 
m.ring_membership(); @@ -390,7 +396,7 @@ QuickRotatableBonds::Process(Molecule& m) { case RotBond::kQuick: return Quickest(m); case RotBond::kExpensive: - return Expensive(m); + return Expensive(m, bond_rotatable); default: cerr << "QuickRotatableBonds::Process:should not come here\n"; return 0; @@ -414,7 +420,7 @@ AtomsWithTripleBonds(const Molecule& m) { } int -QuickRotatableBonds::Expensive(Molecule& m) { +QuickRotatableBonds::Expensive(Molecule& m, int* bond_rotatable) { const int matoms = m.natoms(); resizable_array candidate_bonds; @@ -485,6 +491,11 @@ QuickRotatableBonds::Expensive(Molecule& m) { } ++rc; + + if (bond_rotatable != nullptr) { + bond_rotatable[b->bond_number()] = 1; + } + if (_isotope == 0) { continue; } @@ -540,4 +551,85 @@ QuickRotatableBonds::Quickest(Molecule& m) { return rc; } +int +RotatableBondsBetween(Molecule& m, + atom_number_t a1, + atom_number_t a2, + const int* bond_rotatable) { + const int d = m.bonds_between(a1, a2); + + //cerr << "RotatableBondsBetween from " << a1 << " to " << a2 << " d " << d << '\n'; + + for (const Bond* b : m[a1]) { + const atom_number_t o = b->other(a1); + if (m.bonds_between(o, a2) != d - 1) { + continue; + } + + int rc = 0; + if (bond_rotatable[b->bond_number()]) { + rc = 1; + } + + if (o == a2) { + return rc; + } + + return rc + RotatableBondsBetween(m, o, a2, bond_rotatable); + } + + return 0; +} + +int +QuickRotatableBonds::RotatableBondsBetween(Molecule& m, + atom_number_t a1, + atom_number_t a2) { + const int matoms = m.natoms(); + if (matoms < 2) { + return 0; + } + + if (m.fragment_membership(a1) != m.fragment_membership(a2)) { + cerr << "QuickRotatableBonds::RotatableBondsBetween:inconsistent fragment membership\n"; + return 0; + } + + std::unique_ptr bond_rotatable = std::make_unique(m.nedges()); + + Process(m, bond_rotatable.get()); + + return quick_rotbond::RotatableBondsBetween(m, a1, a2, bond_rotatable.get()); +} + +std::unique_ptr +QuickRotatableBonds::RotatableBondsBetween(Molecule& m) { + 
const int matoms = m.natoms(); + std::unique_ptr result = std::make_unique(matoms * matoms); + + if (matoms <= 1) { + return result; + } + + if (matoms == 2) { + std::fill_n(result.get(), 0, 2*2); + return result; + } + + std::unique_ptr bond_rotatable = std::make_unique(m.nedges() * m.nedges()); + + Process(m, bond_rotatable.get()); + + for (int i = 0; i < matoms; ++i) { + result[i * matoms + i] = 0; + for (int j = i + 1; j < matoms; ++j) { + int r = quick_rotbond::RotatableBondsBetween(m, i, j, bond_rotatable.get()); + result[i * matoms + j] = r; + result[j * matoms + i] = r; + } + } + + return result; +} + } // namespace quick_rotbond diff --git a/src/Molecule_Lib/rotbond_common.h b/src/Molecule_Lib/rotbond_common.h index 81abfe26..8862428c 100644 --- a/src/Molecule_Lib/rotbond_common.h +++ b/src/Molecule_Lib/rotbond_common.h @@ -42,7 +42,7 @@ class QuickRotatableBonds { int Quickest(Molecule& m); // the more expensive calculation handles CF3 and amides. - int Expensive(Molecule& m); + int Expensive(Molecule& m, int* bond_rotatable); // If requested, we can label the rotatable bonds. isotope_t _isotope; @@ -77,7 +77,23 @@ class QuickRotatableBonds { // Using whatever method is specified in _calculation, return the // number of rotatable bonds. - int Process(Molecule& m); + // If bond_rotatable is not null each bond, set + // the corresponding entry in `bond_rotatable` to 1 for rotatable bonds. + int Process(Molecule& m, int* bond_rotatable = nullptr); + + // Use the current computation to determine the number of rotatable bonds + // between atoms `a1` and `a2`. A greedy path is traced between the two + // atoms and the first traversal from `a1` to `a2` is used. + // Note that this could produce unstable results if there is a case of + // multiple paths with different rotatable bonds, but not sure that + // can happen. 
+ int RotatableBondsBetween(Molecule& m, + atom_number_t a1, + atom_number_t a2); + + // return an array of m.natoms()*m.natoms() with the rotatable bonds + // between atoms. + std::unique_ptr RotatableBondsBetween(Molecule& m); }; } // namespace quick_rotbond diff --git a/src/Molecule_Lib/rwmolecule.cc b/src/Molecule_Lib/rwmolecule.cc index feb04a42..013c6b89 100644 --- a/src/Molecule_Lib/rwmolecule.cc +++ b/src/Molecule_Lib/rwmolecule.cc @@ -17,7 +17,6 @@ #include "smiles.h" using std::cerr; -using std::endl; /* Fileconv can be run so as to ignore all chiral information on input @@ -675,7 +674,7 @@ create_file_with_appropriate_name(const const_IWSubstring & old_name, const char * new_suffix = suffix_for_file_type(file_type); if (nullptr == new_suffix) { cerr << "create_file_with_appropriate_name: unrecognised type " << - file_type << endl; + file_type << '\n'; new_name = "UNK_TYPE"; return 0; @@ -710,7 +709,7 @@ append_appropriate_suffix(IWString & fname, FileType file_type) if (nullptr == new_suffix) { cerr << "append_appropriate_suffix: unrecognised type " << - file_type << endl; + file_type << '\n'; return 0; } @@ -1694,7 +1693,9 @@ process_input_type(const Command_Line & cl, FileType & input_type) } else if (optval.starts_with("mdlsep=")) { optval.remove_leading_chars(7); - mdlfos->set_mdl_insert_between_sdf_name_tokens(optval); + IWString tmp(optval); + char_name_to_char(tmp, false /* no error message of not recognised */); + mdlfos->set_mdl_insert_between_sdf_name_tokens(tmp); } else if (optval == "mdlnwce") { mdlfos->set_mdl_display_invalid_chiral_connectivity(0); } else if ("sasge" == optval) { diff --git a/src/Molecule_Lib/rwsubstructure.h b/src/Molecule_Lib/rwsubstructure.h index 0489d6c4..7317ea6f 100644 --- a/src/Molecule_Lib/rwsubstructure.h +++ b/src/Molecule_Lib/rwsubstructure.h @@ -126,6 +126,7 @@ ReadFileOfProtoQueries(const const_IWSubstring fname, resizable_array_p& quer while (input.next_record(line)) { std::optional file_that_exists = 
iwmisc::FileOrPath(fname, line); + // std::cerr << "Looking for query file '" << fname << ' ' << line << '\n'; if (! file_that_exists) { std::cerr << "ReadFileOfProtoQueries:no file '" << line << "'\n"; return 0; @@ -934,7 +935,7 @@ process_cmdline_token(char option, { const_IWSubstring mytoken(token); -//std::cerr << "Examining token '" << mytoken << "'\n"; + // std::cerr << "process_cmdline_token:examining token '" << mytoken << "'\n"; if (mytoken.starts_with("F:") || mytoken.starts_with("Q:")) { mytoken.remove_leading_chars(2); diff --git a/src/Molecule_Lib/rwsubstructure_proto.cc b/src/Molecule_Lib/rwsubstructure_proto.cc index 578b885f..adb7a22e 100644 --- a/src/Molecule_Lib/rwsubstructure_proto.cc +++ b/src/Molecule_Lib/rwsubstructure_proto.cc @@ -2122,6 +2122,7 @@ SeparatedAtoms::Build(const SubstructureSearch::SeparatedAtoms& proto) { return 0; } } + MATCHER_FROM_PROTO(proto, rotbond, uint32_t, _rotbond); return 1; } @@ -2343,6 +2344,7 @@ Single_Substructure_Query::_construct_from_proto(const SubstructureSearch::Singl cerr << subst.ShortDebugString() << '\n'; return 0; } + substituent->set_is_ring_substituent(0); _substituent.add(substituent.release()); } } @@ -3303,16 +3305,16 @@ Link_Atom::ConstructFromProto(const SubstructureSearch::LinkAtoms & proto) _a1 = proto.a1(); _a2 = proto.a2(); - if (_a1 == _a2) - { + if (_a1 == _a2) { cerr << "Link_Atom::ConstructFromProto:atoms must be distinct " << proto.ShortDebugString() << "\n"; return 0; } static constexpr uint32_t no_limit = std::numeric_limits::max(); - if (!GETVALUES(proto, distance, 0, no_limit)) + if (!GETVALUES(proto, distance, 0, no_limit)) { return 0; + } return 1; } @@ -3326,17 +3328,45 @@ DownTheBond::ConstructFromProto(const SubstructureSearch::DownTheBond& proto) { _a1 = proto.a1(); _a2 = proto.a2(); + if (proto.has_match_as_match()) { + _match_as_match = proto.match_as_match(); + } + if (_a1 == _a2) { cerr << "DownTheBond::ConstructFromProto:a1 a2 the same " << proto.ShortDebugString() << 
'\n'; return 0; } + if (proto.has_no_other_substituents_allowed()) { + _no_other_substituents_allowed = proto.no_other_substituents_allowed(); + } + if (proto.has_match_individual_substituent()) { + _match_individual_substituent = proto.match_individual_substituent(); + } + if (_no_other_substituents_allowed) { + _match_individual_substituent = true; + } - static constexpr uint32_t no_limit = std::numeric_limits::max(); + MATCHER_FROM_PROTO(proto, natoms, uint32_t, _natoms); + MATCHER_FROM_PROTO(proto, heteroatom_count, uint32_t, _heteroatom_count); + MATCHER_FROM_PROTO(proto, ring_atom_count, uint32_t, _ring_atom_count); + MATCHER_FROM_PROTO(proto, unsaturation_count, uint32_t, _unsaturation_count); + MATCHER_FROM_PROTO(proto, aromatic_count, uint32_t, _aromatic_count); - if (! GETVALUES(proto, natoms, 0, no_limit)) { - return 0; + + for (const auto& query_match : proto.query_matches()) { + std::unique_ptr qm = std::make_unique(); + if (! qm->Build(query_match)) { + cerr << "DownTheBond::Build:invalid query match " << proto.ShortDebugString() << '\n'; + return 0; + } + _query << qm.release(); + } + + if (_query.size()) { + _all_queries_require_zero_hits = AllQueriesRequireZeroHits(); } + return 1; } @@ -3344,11 +3374,71 @@ int DownTheBond::BuildProto(SubstructureSearch::DownTheBond& proto) const { proto.set_a1(_a1); proto.set_a2(_a2); - SETPROTOVALUES(proto, natoms, int); + + if (! 
_match_as_match) { + proto.set_match_as_match(false); + } + + if (_no_other_substituents_allowed) { + proto.set_no_other_substituents_allowed(true); + } + if (_match_individual_substituent) { + proto.set_match_individual_substituent(true); + } + + PROTO_FROM_MATCHER(_natoms, natoms, uint32_t, proto); + PROTO_FROM_MATCHER(_heteroatom_count, heteroatom_count, uint32_t, proto); + PROTO_FROM_MATCHER(_ring_atom_count, ring_atom_count, uint32_t, proto); + PROTO_FROM_MATCHER(_unsaturation_count, unsaturation_count, uint32_t, proto); + PROTO_FROM_MATCHER(_aromatic_count, aromatic_count, uint32_t, proto); + + for (const auto* qm : _query) { + SubstructureSearch::QueryMatches* destination = proto.mutable_query_matches()->Add(); + qm->BuildProto(*destination); + } return 1; } +namespace down_the_bond { +int +QueryMatches::Build(const SubstructureSearch::QueryMatches& proto) { + if (! proto.has_smarts()) { + cerr << "QueryMatches::Build:no smarts attribute in " << proto.ShortDebugString() << '\n'; + return 0; + } + + // Might not be a smarts, but that is OK. + _smarts = proto.smarts(); + + _query = std::make_unique(); + if (! _query->construct_from_smarts_token(proto.smarts().data(), proto.smarts().size())) { + cerr << "QueryMatches::Build:cannot parse " << proto.smarts() << '\n'; + return 0; + } + _query->count_attributes_specified(); + + MATCHER_FROM_PROTO(proto, hits_needed, uint32_t, _hits_needed); + if (! _hits_needed.is_set()) { + cerr << "QueryMatches::Build:hits_needed not set " << proto.ShortDebugString() << '\n'; + return 0; + } + + return 1; +} + +int +QueryMatches::BuildProto(SubstructureSearch::QueryMatches& proto) const { + if (! 
_query || _smarts.empty()) { + return 1; + } + proto.set_smarts(_smarts.data(), _smarts.length()); + PROTO_FROM_MATCHER(_hits_needed, hits_needed, uint32_t, proto); + return 1; +} + +} // namespace down_the_bond + #ifdef IMPLEMENT_THIS TODO int @@ -3836,11 +3926,19 @@ SeparatedAtoms::BuildProto(SubstructureSearch::SeparatedAtoms& proto) const { proto.set_a1(_a1); proto.set_a2(_a2); SetProtoValues(_separation, "bonds_between", proto); + PROTO_FROM_MATCHER(_rotbond, rotbond, uint32_t, proto); return 1; } int Substituent::ConstructFromProto(const SubstructureSearch::Substituent& proto) { + if (proto.has_match_as_match()) { + _match_as_match_or_rejection = proto.match_as_match(); + } + if (proto.has_no_other_substituents_allowed()) { + _no_other_substituents_allowed = proto.no_other_substituents_allowed(); + } + if (!GETVALUES(proto, hits_needed, 0, no_limit)) return 0; if (!GETVALUES(proto, natoms, 1, no_limit)) @@ -3850,6 +3948,9 @@ Substituent::ConstructFromProto(const SubstructureSearch::Substituent& proto) { if (!GETVALUES(proto, length, 1, no_limit)) return 0; + MATCHER_FROM_PROTO(proto, heteroatom_count, uint32_t, _heteroatom_count); + MATCHER_FROM_PROTO(proto, unsaturation_count, uint32_t, _unsaturation_count); + if (proto.has_set_global_id()) { _set_global_id = proto.set_global_id(); } @@ -3883,6 +3984,12 @@ Substituent::ConstructFromProto(const SubstructureSearch::Substituent& proto) { int Substituent::BuildProto(SubstructureSearch::Substituent& proto) const { + if (_match_as_match_or_rejection == 0) { + proto.set_match_as_match(false); + } + if (_no_other_substituents_allowed) { + proto.set_no_other_substituents_allowed(true); + } SetProtoValues(_hits_needed, "hits_needed", proto); SetProtoValues(_natoms, "natoms", proto); SetProtoValues(_nrings, "nrings", proto); @@ -3899,6 +4006,8 @@ Substituent::BuildProto(SubstructureSearch::Substituent& proto) const { for (const IWString* smt : _disqualifying_smarts) { proto.add_disqualifying_smarts(smt->AsString()); } 
+ PROTO_FROM_MATCHER(_heteroatom_count, heteroatom_count, uint32_t, proto); + PROTO_FROM_MATCHER(_unsaturation_count, unsaturation_count, uint32_t, proto); cerr << "Substituent::BuildProto:implement this something\n"; return 0; @@ -4235,9 +4344,12 @@ Region::ConstructFromProto(const SubstructureSearch::Region& proto) { return 0; } + MATCHER_FROM_PROTO(proto, heteroatom_count, uint32_t, _heteroatom_count); + if (_natoms.is_set()) { } else if (_nrings.is_set()) { } else if (_atoms_not_on_shortest_path.is_set()) { + } else if (_heteroatom_count.is_set()) { } else { cerr << "Region::ConstructFromProto:nothing specified\n"; return 0; @@ -4264,6 +4376,10 @@ Region::BuildProto(SubstructureSearch::Region& proto) const { SetProtoValues(_atoms_not_on_shortest_path, "atoms_not_on_shortest_path", proto); } + if (_heteroatom_count.is_set()) { + PROTO_FROM_MATCHER(_heteroatom_count, heteroatom_count, uint32_t, proto); + } + return 1; } diff --git a/src/Molecule_Lib/smi.cc b/src/Molecule_Lib/smi.cc index dd994189..d3d1a070 100644 --- a/src/Molecule_Lib/smi.cc +++ b/src/Molecule_Lib/smi.cc @@ -2748,7 +2748,7 @@ Substructure_Atom::_parse_smarts_specifier(const const_IWSubstring & qsmarts, } else if (is_down_the_bond(qsmarts, characters_processed, down_the_bond_qualifier) && check_compatiability_table(previous_token_was, PREVIOUS_TOKEN_WAS_DOWN_THE_BOND)) { - std::unique_ptr dtb = std::make_unique(pst.last_query_atom_created()); + std::unique_ptr dtb = std::make_unique(previous_atom->unique_id()); if (! dtb->Build(down_the_bond_qualifier)) { cerr << "Substructure_Atom::parse_smiles_token:invalid down the bond {" << down_the_bond_qualifier << "}\n"; return 0; @@ -2779,7 +2779,7 @@ Substructure_Atom::_parse_smarts_specifier(const const_IWSubstring & qsmarts, return 0; } - assert(!previous_bond); +// assert(!previous_bond); // Push the various stacks. 
diff --git a/src/Molecule_Lib/ss_ring_base.cc b/src/Molecule_Lib/ss_ring_base.cc index 8784af0a..bd93bfd2 100644 --- a/src/Molecule_Lib/ss_ring_base.cc +++ b/src/Molecule_Lib/ss_ring_base.cc @@ -583,6 +583,11 @@ Substructure_Ring_Base::ExtendToCarbonyl(const Molecule& m, return rc; } +// A magic number that when encountered in the `in_substituent` array +// means this is the starting atom and it is OK to revisit it - the +// substituent is a ring. +static constexpr int kOkForRingClosure = 72997; + int IdentifySubstituent(const Molecule& m, atom_number_t zatom, @@ -597,6 +602,9 @@ IdentifySubstituent(const Molecule& m, continue; } + if (in_substituent[j] == kOkForRingClosure) { + continue; + } if (in_substituent[j] == 1 || in_substituent[j] == 3) { return -1; } @@ -613,6 +621,7 @@ IdentifySubstituent(const Molecule& m, return rc; } +#ifdef NO_LONGER_USED_SDASDASD int IdentifySubstituent(const Molecule& m, atom_number_t zatom, @@ -633,6 +642,7 @@ IdentifySubstituent(const Molecule& m, return rc; } +#endif void TranslateNumbers(int * storage, @@ -647,7 +657,10 @@ TranslateNumbers(int * storage, } Substituent::Substituent() { + _match_as_match_or_rejection = 1; _set_global_id = -1; + _is_ring_substituent = 1; + _no_other_substituents_allowed = 0; } // for all atoms for which storage[i] == flag, set the global_id @@ -702,7 +715,7 @@ InvalidateOtherAtoms(const int* subset, return rc; } -//#define DEBUG_SUBSTITUENT_MATCHES +// #define DEBUG_SUBSTITUENT_MATCHES // A wrapper for MatchesInner, that handles _set_global_id if needed. // If queries are present, things are more complex. We only want to @@ -715,7 +728,10 @@ int Substituent::Matches(Molecule_to_Match& target, const int * ring_atoms, int * storage, std::unique_ptr& matched_by_global_specs) { - const int rc = MatchesInner(target, ring_atoms, storage); + int rc = MatchesInner(target, ring_atoms, storage); + if (_match_as_match_or_rejection == 0) { + rc = ! 
rc; + } #ifdef DEBUG_SUBSTITUENT_MATCHES cerr << "Substituent::Matches:from Inner " << rc << '\n'; @@ -786,55 +802,86 @@ Substituent::MatchesInner(Molecule_to_Match& target, const int * ring_atoms, } const Atom& a = m.atom(i); // ring atoms will not have substituents if they are 2 connected. - // If we ever process things other than ring atoms, this will need to be changed. - if (a.ncon() == 2) { + if (_is_ring_substituent && a.ncon() == 2) { continue; } - // Turn off atoms left by a previously failed attempt. - TranslateNumbers(storage, matoms, 2, 0); - const int atoms_in_substituent = IdentifySubstituent(m, i, storage); + int substituents_this_atom = 0; + int matches_this_atom = 0; + for (const Bond* b : a) { + const atom_number_t o = b->other(i); + if (storage[o]) { + continue; + } + // Turn off atoms left by a previously failed attempt. + TranslateNumbers(storage, matoms, 2, 0); + storage[i] = kOkForRingClosure; + const int atoms_in_substituent = IdentifySubstituent(m, o, i, storage); + storage[i] = 1; #ifdef DEBUG_SUBSTITUENT_MATCHES - cerr << "from atom " << i << " " << atoms_in_substituent << " atoms_in_substituent\n"; + cerr << "from atom " << i << " to " << o << " find " << atoms_in_substituent << " atoms_in_substituent\n"; #endif - if (atoms_in_substituent <= 0) { - continue; - } + ++substituents_found; - ++substituents_found; + if (atoms_in_substituent <= 0) { + continue; + } - if (! _natoms.is_set()) { - } else if (_natoms.matches(atoms_in_substituent)) { - } else { - continue; - } + ++substituents_this_atom; - if (! _nrings.is_set()) { - } else if (OkNrings(m, storage, 2)) { - } else { - continue; - } + if (! _natoms.is_set()) { + } else if (_natoms.matches(atoms_in_substituent)) { + } else { + continue; + } - if (! _length.is_set()) { - } else if (OkLength(m, storage, i, 2)) { - } else { - continue; - } + if (! _nrings.is_set()) { + } else if (OkNrings(m, storage, 2)) { + } else { + continue; + } + + if (! 
_length.is_set()) { + } else if (OkLength(m, storage, i, 2)) { + } else { + continue; + } - if (_required.size() > 0 || _disqualifying.size() > 0) { - int got_required_match = 0; - int got_rejected_match = 0; - RunQueries(target, storage, 2, got_required_match, got_rejected_match); - if (got_rejected_match) { + if (! _heteroatom_count.is_set()) { + } else if (OkHeteratomCount(m, storage, i, 2)) { + } else { continue; } - if (_required.size() > 0 && ! got_required_match) { + + if (! _unsaturation_count.is_set()) { + } else if (OkUnsaturation(m, storage, i, 2)) { + } else { continue; } + + if (_required.size() > 0 || _disqualifying.size() > 0) { + int got_required_match = 0; + int got_rejected_match = 0; + RunQueries(target, storage, 2, got_required_match, got_rejected_match); + if (got_rejected_match) { + continue; + } + if (_required.size() > 0 && ! got_required_match) { + continue; + } + } + + TranslateNumbers(storage, matoms, 2, 3); // Mark as having a successful substituent match. + ++matches_found; + ++matches_this_atom; } - TranslateNumbers(storage, matoms, 2, 3); // Mark as having a successful substituent match. 
- ++matches_found; + if (substituents_this_atom == 0) { + continue; + } + if (_no_other_substituents_allowed && matches_this_atom < substituents_this_atom) { + return 0; + } } #ifdef DEBUG_SUBSTITUENT_MATCHES @@ -901,6 +948,63 @@ Substituent::OkLength(Molecule& m, const int* storage, atom_number_t anchor, int return _length.matches(longest_length); } +int +Substituent::OkHeteratomCount(const Molecule& m, const int* storage, + atom_number_t anchor, int flag) const { + assert(_heteroatom_count.is_set()); + + const int matoms = m.natoms(); + int h = 0; + for (int i = 0; i < matoms; ++i) { +#ifdef DEBUG_OKLENGTH + cerr << i << " " << m.smarts_equivalent_for_atom(i) << " storage " << storage[i] << '\n'; +#endif + if (storage[i] != flag) { + continue; + } + if (m.atomic_number(i) != 6) { + ++h; + } + } + +#ifdef DEBUG_OKLENGTH + cerr << "Count " << h << " heteratoms, match " << _heteroatom_count.matches(h) << '\n'; +#endif + + return _heteroatom_count.matches(h); +} + +int +Substituent::OkUnsaturation(Molecule& m, const int* storage, + atom_number_t anchor, int flag) const { + assert(_unsaturation_count.is_set()); + + const int matoms = m.natoms(); + int rc = 0; + for (int i = 0; i < matoms; ++i) { +#ifdef DEBUG_OKLENGTH + cerr << i << " " << m.smarts_equivalent_for_atom(i) << " storage " << storage[i] << '\n'; +#endif + if (storage[i] != flag) { + continue; + } + + if (m.is_aromatic(i)) { + continue; + } + + if (m.unsaturation(i)) { + ++rc; + } + } + +#ifdef DEBUG_OKLENGTH + cerr << "Count " << rc << " heteratoms, match " << _heteroatom_count.matches(rc) << '\n'; +#endif + + return _unsaturation_count.matches(rc); +} + int AllMatchedAtomsInStorage(const Substructure_Results& sresults, const int* storage, diff --git a/src/Molecule_Lib/ss_ring_base_test.cc b/src/Molecule_Lib/ss_ring_base_test.cc index b4e32384..167dbd0f 100644 --- a/src/Molecule_Lib/ss_ring_base_test.cc +++ b/src/Molecule_Lib/ss_ring_base_test.cc @@ -612,4 +612,64 @@ query { 
EXPECT_THAT(*_sresults.embedding(0), UnorderedElementsAreArray({3})); } +struct ProtoSmilesExpected { + std::string proto; + IWString smiles; + int expected; +}; + +std::ostream& +operator<<(std::ostream& output, const ProtoSmilesExpected& pse) { + output << pse.proto << ' ' << pse.smiles << " expect " << pse.expected; + + return output; +} + +class TestSubstructureP: public testing::TestWithParam { + protected: + SubstructureSearch::SubstructureQuery _proto; + Substructure_Query _query; + Molecule _mol; + Substructure_Results _sresults; +}; + +TEST_P(TestSubstructureP, Tests) { + const auto params = GetParam(); + ASSERT_TRUE(google::protobuf::TextFormat::ParseFromString(params.proto, &_proto)); + ASSERT_TRUE(_query.ConstructFromProto(_proto)); + ASSERT_TRUE(_mol.build_from_smiles(params.smiles)); + // cerr << "Testing " << params.smiles << " expecting " << params.expected << '\n'; + EXPECT_EQ(_query.substructure_search(_mol, _sresults), params.expected) << params; +} +INSTANTIATE_TEST_SUITE_P(TestSubstructureP, TestSubstructureP, testing::Values( + ProtoSmilesExpected{R"pb( +query { + ring_specifier { + base { + substituent { + natoms: 1; + heteroatom_count: 0 + set_global_id: 1 + hits_needed: 2 + } + } + } + smarts: "[/IWgid1C]" +} +)pb", "C1CC1(C)C", 2}, + + ProtoSmilesExpected{R"pb( +query { + substituent { + max_natoms: 3 + heteroatom_count: 0 + no_other_substituents_allowed: true + } + smarts: "[CD3T2](=O)-[ND2]" +} +)pb", "NCCC(=O)NCCC", 0} + +)); + + } // namespace diff --git a/src/Molecule_Lib/standardise.cc b/src/Molecule_Lib/standardise.cc index 13cfaf16..6b9260af 100644 --- a/src/Molecule_Lib/standardise.cc +++ b/src/Molecule_Lib/standardise.cc @@ -454,6 +454,7 @@ display_all_chemical_standardisation_options(std::ostream & os, char zoption) os << zopt << CS_SULFONYL_UREA << " convert S-C(=N)-N to S=C(-N)-N\n"; os << zopt << CS_124TRIAZINE << " convert S-C(=N)-N to S=C(-N)-N\n"; os << zopt << CS_ENOL_FUSED << " convert [O,S;D1]-c(:n):[aD3x3] to O=C 
form\n"; + os << zopt << CS_ISOTOPE << " convert all isotopic atoms to non isotopic forms\n"; os << zopt << CS_ALL << " ALL the above standardistions\n"; os << zopt << CS_REVERSE_NITRO << " convert O=N=O nitro groups to charge separated\n"; @@ -858,8 +859,9 @@ Chemical_Standardisation::Activate(const IWString& directive, else if (tmp == CS_ENOL_FUSED) { _transform_enol_fused.activate(); } - else - { + else if (tmp == CS_ISOTOPE) { + _transform_isotopes.activate(); + } else { cerr << "Chemical_Standardisation::Activate:unrecognized directive '" << directive << "'\n"; return 0; } @@ -3627,8 +3629,7 @@ Chemical_Standardisation::process(Molecule & m) const auto asave = global_aromaticity_type(); - if (Daylight != asave) - { + if (Daylight != asave) { set_global_aromaticity_type(Daylight); m.compute_aromaticity(); } @@ -3663,19 +3664,27 @@ Chemical_Standardisation::_process(Molecule & m) // Removing atoms can mess up anything which comes after, so make // sure we do that before anything else - if (_append_string_depending_on_what_changed) + if (_append_string_depending_on_what_changed) { _append_to_changed_molecules.resize_keep_storage(0); + } int rc = 0; - if (_remove_hydrogens.active()) + if (_transform_isotopes.active()) { + rc += _do_unset_isotopes(m); + } + + if (_remove_hydrogens.active()) { rc += _do_remove_hydrogens(m); + } - if (0 == m.natoms()) + if (0 == m.natoms()) { return 0; + } - if (_transform_covalent_metals.active()) + if (_transform_covalent_metals.active()) { _do_transform_covalent_metals(m); + } if (_convert_to_canonical_order != Canonicalise::kNone) { ConvertToCanonicalOrder(m); @@ -8311,6 +8320,8 @@ IWStandard_Current_Molecule::IWStandard_Current_Molecule() _nitrogens = 0; _oxygens = 0; + _isotope = 0; + return; } @@ -8650,10 +8661,13 @@ IWStandard_Current_Molecule::initialise(Molecule & m) atom_number_t first_singly_connected_oxygen = INVALID_ATOM_NUMBER; atom_number_t first_singly_connected_sulphur = INVALID_ATOM_NUMBER; - for (int i = 0; i < 
_matoms; i++) - { + for (int i = 0; i < _matoms; i++) { Atom * ai = const_cast(_atom[i]); + if (ai->isotope()) { + ++_isotope; + } + if (! ai->valence_ok()) _possible_valence_errors++; @@ -8945,7 +8959,8 @@ Chemical_Standardisation::_processing_needed(const IWStandard_Current_Molecule & (_transform_124_triazine.active() && current_molecule_data.nitrogens() > 2) || (_transform_charged_non_organic.active() && current_molecule_data.non_organic() && (current_molecule_data.npos() > 0 || current_molecule_data.nneg() > 0)) || - (_transform_enol_fused.active() && current_molecule_data.nrings() > 1) + (_transform_enol_fused.active() && current_molecule_data.nrings() > 1) || + (_transform_isotopes.active() && current_molecule_data.isotope() > 0) ) { return 1; } @@ -10470,3 +10485,19 @@ ExternalTransformation::Process(Molecule& m, return rc; } + +int +Chemical_Standardisation::_do_unset_isotopes(Molecule& m) { + const int rc = m.unset_isotopes(); + if (rc == 0) { + return 0; + } + + _transform_isotopes.extra(rc); + + if (_append_string_depending_on_what_changed) { + _append_to_changed_molecules << " STD:isotope"; + } + + return rc; +} diff --git a/src/Molecule_Lib/standardise.h b/src/Molecule_Lib/standardise.h index 17179a9d..9267d51e 100644 --- a/src/Molecule_Lib/standardise.h +++ b/src/Molecule_Lib/standardise.h @@ -217,6 +217,8 @@ class IWStandard_Current_Molecule int _possible_valence_errors; + int _isotope; + resizable_array_p _possible_lactam; // We save the rings because they may get recomputed during our changes @@ -306,6 +308,9 @@ class IWStandard_Current_Molecule int explicit_hydrogen_count () const { return _explicit_hydrogen_count;} int possible_valence_errors () const { return _possible_valence_errors;} const resizable_array_p & possible_lactam() const { return _possible_lactam;} + int isotope() const { + return _isotope; + } int remove_possible_guanidine (const atom_number_t); @@ -369,6 +374,7 @@ class IWStandard_Current_Molecule #define CS_124TRIAZINE 
"124-triazine" #define CS_ENOL_FUSED "enol-fused" #define CS_FCNO "fcno" +#define CS_ISOTOPE "isotope" namespace standardise { @@ -442,6 +448,7 @@ class Chemical_Standardisation Chemical_Transformation _transform_124_triazine; Chemical_Transformation _transform_enol_fused; Chemical_Transformation _transform_charged_non_organic; + Chemical_Transformation _transform_isotopes; // Various reverse direction transformations @@ -481,6 +488,7 @@ class Chemical_Standardisation void _do_transform_plus_minus_pair (Molecule & m, atom_number_t a1, atom_number_t a2, IWStandard_Current_Molecule & current_molecule_data); + int _do_unset_isotopes(Molecule& m); int _do_transform_amines (Molecule &, Set_of_Atoms &, IWStandard_Current_Molecule & current_molecule_data); int _do_transform_nitro (Molecule &, IWStandard_Current_Molecule & current_molecule_data); int _do_transform_nplus_ominus (Molecule &, IWStandard_Current_Molecule & current_molecule_data); diff --git a/src/Molecule_Lib/standardise_test.cc b/src/Molecule_Lib/standardise_test.cc index e296c1a5..9c7fc427 100644 --- a/src/Molecule_Lib/standardise_test.cc +++ b/src/Molecule_Lib/standardise_test.cc @@ -1,4 +1,5 @@ #include +#include #include "googletest/include/gtest/gtest.h" @@ -441,7 +442,7 @@ TEST_F(TestStandardisation, TestExternalChargedAcid) { } struct ForStd { - IWString std; + std::vector directives; IWString smiles; IWString expected; }; @@ -457,15 +458,19 @@ TEST_P(TestStandardisationP, Tests) { static constexpr int kVerbose = 0; - ASSERT_TRUE(_chemical_standardisation.Activate(params.std, kVerbose)); + for (const IWString& directive : params.directives) { + ASSERT_TRUE(_chemical_standardisation.Activate(directive, kVerbose)); + } ASSERT_TRUE(_m.build_from_smiles(params.smiles)); ASSERT_TRUE(_chemical_standardisation.process(_m)); EXPECT_EQ(_m.unique_smiles(), params.expected) << "got " << _m.unique_smiles() << " expected " << params.expected; } INSTANTIATE_TEST_SUITE_P(TestStandardisationP, TestStandardisationP, 
testing::Values( - ForStd{"rvnv5", "N1(=NC(=N(=O)C2=CC(=CC=C12)OCCCN1CCOCC1)CC)=O CHEMBL553213", - "CCc1[n][n+]([O-])c2c([n+]1[O-])cc(OCCCN1CCOCC1)cc2"} + ForStd{{"rvnv5"}, "N1(=NC(=N(=O)C2=CC(=CC=C12)OCCCN1CCOCC1)CC)=O CHEMBL553213", + "CCc1[n][n+]([O-])c2c([n+]1[O-])cc(OCCCN1CCOCC1)cc2"}, + ForStd{{"isotope"}, "[2H]-C", "C[H]"}, + ForStd{{"isotope", "all"}, "[2H]-C", "C"} )); } // namespace diff --git a/src/Molecule_Lib/substructure.h b/src/Molecule_Lib/substructure.h index 03f2dc59..4e05cc48 100644 --- a/src/Molecule_Lib/substructure.h +++ b/src/Molecule_Lib/substructure.h @@ -304,6 +304,37 @@ class No_Matched_Atoms_Between { namespace down_the_bond { + +// In a down the bond specification, we can specify substructure based requirements. +// Note that this is not a full substructure_query, just a Substructure_Atom. but +// note that recursive smarts can work there. +class QueryMatches { + private: + // Cannot be a Substructure_Atom since that is not defined yet. + std::unique_ptr _query; + + // We retain the smarts so we can be written if requested. + IWString _smarts; + + iwmatcher::Matcher _hits_needed; + + public: + int Build(const SubstructureSearch::QueryMatches& qm); + int BuildProto(SubstructureSearch::QueryMatches& proto) const; + int Build(const IWString& smarts, const iwmatcher::Matcher& numeric); + + Substructure_Atom& ss_atom() { + return *_query; + } + + // Returns true if _hits_needed only matches 0. + int RequiresZeroHits() const; + + int Matches(uint32_t n) const { + return _hits_needed.matches(n); + } +}; + // Looking down from one matched atom to another, how many atoms are // there - matched or otherwise. Includes matched atom _a2. // the -{} directive is parsed into one of these. @@ -313,7 +344,57 @@ class DownTheBond { int _a1; int _a2; - Min_Max_Specifier _natoms; + int _match_as_match; + + // By default, we aggregate all atoms that appear down the a1->a2 bond. 
+ // If this is set, then we process the unmatched attachments to a2 one at a + // time, and a match is returned if there is at least one substituent that + // individually matches the requirements. + boolean _match_individual_substituent; + + // By defaul, when matching individual substituents, a match is returned if at + // least one unmatched substituent on a2 matches the requirements. If this is set, + // then any non-matching other substituent will cause a failed match. + // If this is set, it automatically turns on match_individual_substituent. + boolean _no_other_substituents_allowed; + + // The number of atoms + iwmatcher::Matcher _natoms; + // The number of heteratoms. + iwmatcher::Matcher _heteroatom_count; + // Number of ring atoms. + iwmatcher::Matcher _ring_atom_count; + // Number of unsaturated atoms + iwmatcher::Matcher _unsaturation_count; + // Number of aromatic atoms. + iwmatcher::Matcher _aromatic_count; + // Max bond distance of any atom from `a2`. + iwmatcher::Matcher _max_distance; + // Note that if any more Matcher's are added here be sure to add logic to + // DownTheBond::NoAtomsDownTheBond(Molecule) + + resizable_array_p _query; + + // We need special handling for the case where there are no atoms + // down the bond. In that case, if all the queries require zero hits + // then that will be a match. Determine this once and store. 
+ int _all_queries_require_zero_hits; + + // Private functions + void DefaultValues(); + + int AllQueriesRequireZeroHits() const; + + int MatchesIndividualSubstituent(Molecule& m, + atom_number_t a1, + atom_number_t a2, + int* visited); + int OkHeteratomCount(const Molecule& m, const int* visited) const; + int OkUnsaturationCount(Molecule& m, const int* visited) const; + int OkAromaticCount(Molecule& m, const int* visited) const; + int OkRingAtomCount(Molecule& m, const int* visited) const; + int OkMaxDistance(Molecule& m, atom_number_t a2, const int* visited) const; + int NoAtomsDownTheBond(Molecule& m, atom_number_t a1, atom_number_t a2); public: DownTheBond(); @@ -1671,9 +1752,12 @@ class Substructure_Ring_Environment : public Substructure_Atom }; class Substructure_Query; + // A ring or ring system can have any number of Substituent's. class Substituent { private: + int _match_as_match_or_rejection; + // How many of these must there be attached to a ring. Min_Max_Specifier _hits_needed; // the atoms in the substituent. @@ -1695,9 +1779,26 @@ class Substituent { resizable_array_p _required_smarts; resizable_array_p _disqualifying_smarts; + // Number of heteroatoms in the substituent. + iwmatcher::Matcher _heteroatom_count; + iwmatcher::Matcher _unsaturation_count; + + // This class was developed for ring substituents, but was later + // adapted to general query matches. If processing a ring substituent + // there are different conditions that apply. + int _is_ring_substituent; + + // This is intended for use with matched atoms, and not really for use + // with rings and ring systems. + int _no_other_substituents_allowed; + // private functions. 
int OkNrings(Molecule& m, const int* storage, int flag) const; int OkLength(Molecule& m, const int* storage, atom_number_t anchor, int flag) const; + int OkHeteratomCount(const Molecule& m, const int* storage, + atom_number_t anchor, int flag) const; + int OkUnsaturation(Molecule& m, const int* storage, + atom_number_t anchor, int flag) const; int RunQueries(Molecule_to_Match& target, const int * storage, int flag, int& got_required_match, int& got_rejected_match); @@ -1712,6 +1813,10 @@ class Substituent { Substituent(); int ConstructFromProto(const SubstructureSearch::Substituent& proto); + void set_is_ring_substituent(int s) { + _is_ring_substituent = s; + } + int BuildProto(SubstructureSearch::Substituent& proto) const; // Given a set of matched atoms (which may, or may not) describe @@ -2468,6 +2573,19 @@ class SeparatedAtoms { // The bonds_between value that must be met. Min_Max_Specifier _separation; + // Constraints on the number of rotatable bonds along the shortest + // path between matched atom _a1 and matched atom _a2. 
+ iwmatcher::Matcher _rotbond; + + // Private functions + int RotatableBondsBetween(Molecule& m, + atom_number_t a1, + atom_number_t a2) const; + int RotatableBondsBetween(Molecule& m, + atom_number_t a1, + atom_number_t a2, + const int* bond_rotatable) const; + public: SeparatedAtoms(); @@ -2497,6 +2615,8 @@ class Region { Min_Max_Specifier _atoms_not_on_shortest_path; + iwmatcher::Matcher _heteroatom_count; + public: int ConstructFromProto(const SubstructureSearch::Region& proto); int BuildProto(SubstructureSearch::Region& proto) const; diff --git a/src/Molecule_Lib/substructure.proto b/src/Molecule_Lib/substructure.proto index 048ae80a..962d89ff 100644 --- a/src/Molecule_Lib/substructure.proto +++ b/src/Molecule_Lib/substructure.proto @@ -170,6 +170,10 @@ message Region { optional uint32 max_atoms_not_on_shortest_path = 9; repeated uint32 atoms_not_on_shortest_path = 10; + repeated uint32 heteroatom_count = 11; + optional uint32 min_heteroatom_count = 12; + optional uint32 max_heteroatom_count = 13; + // Many more things could be added here.... } @@ -181,8 +185,22 @@ message LinkAtoms { optional uint32 max_distance = 5; } +// Within a DownTheBond message, we can have instances of these. +// They describe a substructure match and the number of hits needed. +message QueryMatches { + // This can only be built from smarts. It builds a Substructure_Atom. + optional string smarts = 1; + + repeated uint32 hits_needed = 2; + optional uint32 min_hits_needed = 3; + optional uint32 max_hits_needed = 4; +} + // The -{} directive in smarts is parsed into this structure. // The perception is looking from a1->a2. Atom 2 is counted. +// The properties are for all atoms found down the a1->a2 bond, but +// this can be modified with the match_individual_substituent setting. 
+ message DownTheBond { // The matched atoms that describe the bond optional uint32 a1 = 1; @@ -193,6 +211,40 @@ message DownTheBond { optional uint32 max_natoms = 4; repeated uint32 natoms = 5; + // Number of heteroatoms encountered + repeated uint32 heteroatom_count = 6; + optional uint32 min_heteroatom_count = 7; + optional uint32 max_heteroatom_count = 8; + + // Number of ring atoms encountered + repeated uint32 ring_atom_count = 9; + optional uint32 min_ring_atom_count = 10; + optional uint32 max_ring_atom_count = 11; + + // Number of unsaturated atoms encountered + repeated uint32 unsaturation_count = 12; + optional uint32 min_unsaturation_count = 13; + optional uint32 max_unsaturation_count = 14; + + // Number of aromatic atoms encountered + repeated uint32 aromatic_count = 15; + optional uint32 min_aromatic_count = 16; + optional uint32 max_aromatic_count = 17; + + // By default, all substituents attached to the anchor atom are + // aggregated and the totals are for all atoms found down that bond. + // If this is set, totals are summed per connection, and a match is + // returned only if there is at least one substituent satisfying the requirements. + optional bool match_individual_substituent = 18; + + // If doing individual substituents, do we allow non-matching substituents. + optional bool no_other_substituents_allowed = 19; + + repeated QueryMatches query_matches = 20; + + // By default, DownTheBond directives are for a positive match. + // Set this to invert. + optional bool match_as_match = 21; } // How an environment is attached to matched atoms. @@ -380,12 +432,27 @@ message Substituent { optional uint32 max_length = 11; repeated uint32 length = 12; + // This only works if the Substituent is part of a ring specification. optional uint32 set_global_id = 13; // If any of `required_smarts` match, the substituent is accepted. repeated string required_smarts = 14; // If any of `disqualifying_smarts` match, the substituent is rejected. 
repeated string disqualifying_smarts = 15; + + repeated uint32 heteroatom_count = 16; + optional uint32 min_heteroatom_count = 17; + optional uint32 max_heteroatom_count = 18; + + repeated uint32 unsaturation_count = 19; + optional uint32 min_unsaturation_count = 20; + optional uint32 max_unsaturation_count = 21; + + // Mostly intended for use with a smarts match rather than a ring. + optional bool match_as_match = 22; + + // Mostly intended for use with a smarts match rather than a ring. + optional bool no_other_substituents_allowed = 23; } // When describing an inter-ring region with > 2 rings connected, we need to @@ -923,7 +990,7 @@ message MatchedAtomMatch { } // Specifications for pairs of matched atoms. -// This is redundant with linkers - as currently implemented. +// This is redundant with linkers, but new development will be here. message SeparatedAtoms { // A pair of matched atoms optional uint32 a1 = 1; @@ -933,6 +1000,12 @@ message SeparatedAtoms { repeated uint32 bonds_between = 3; optional uint32 min_bonds_between = 4; optional uint32 max_bonds_between = 5; + + // Constraints on the number of rotatable bonds found between the two + // matched atoms. 
+ repeated uint32 rotbond = 6; + optional uint32 min_rotbond = 7; + optional uint32 max_rotbond = 8; } // Before atom matching is attempted we can quickly check some whole diff --git a/src/Molecule_Lib/substructure_a.cc b/src/Molecule_Lib/substructure_a.cc index 99d7c6a9..238e9f90 100644 --- a/src/Molecule_Lib/substructure_a.cc +++ b/src/Molecule_Lib/substructure_a.cc @@ -959,7 +959,7 @@ Substructure_Atom::_matches(Target_Atom & target, const int * already_matched) const int m = Substructure_Atom_Specifier::matches(target); #ifdef DEBUG_ATOM_MATCHES - cerr << "Matching atom with " << _components.number_elements() << " components, underlying specifier match " << m << " match or rej " << _match_as_match_or_rejection << endl; + cerr << "Matching atom with " << _components.number_elements() << " components, underlying specifier match " << m << " match or rej " << _match_as_match_or_rejection << '\n'; #endif if (m && 0 == _match_as_match_or_rejection) { // matches, but we are a rejection criterion diff --git a/src/Molecule_Lib/substructure_dtb_test.cc b/src/Molecule_Lib/substructure_dtb_test.cc index 87a0d7b9..26f4b9e8 100644 --- a/src/Molecule_Lib/substructure_dtb_test.cc +++ b/src/Molecule_Lib/substructure_dtb_test.cc @@ -38,7 +38,13 @@ INSTANTIATE_TEST_SUITE_P(TestDTB, TestDTB, testing::Values( SmilesSmartsNhits{"OCC(C)C", "O-{a{1-3}}C", 0}, SmilesSmartsNhits{"OCC(C)C", "O-{a4}C", 1}, SmilesSmartsNhits{"OCC(C)CC", "O-{a{2-7}}C", 1}, - SmilesSmartsNhits{"O1CCCC1", "O-{a{1-3}}CC", 0} // contains ring + SmilesSmartsNhits{"O1CCCC1", "O-{a{1-3}}CC", 0}, // contains ring + SmilesSmartsNhits{"S1N=C(CC(N)C(=O)O)C(=O)N1 CHEMBL1094324", + "[OHD1]-[CD3R0](=O)-[CD3R0](-{[CD2]1;[$(O=c1nsnc1)]1;d3;m5;r5;u1;a7}*)-[ND1H2]", 1}, + SmilesSmartsNhits{"c1ccccc1O", "c1ccccc1-{a1}O", 2}, + SmilesSmartsNhits{"CN", "C-{h1}*", 1}, + SmilesSmartsNhits{"CN", "C-{a1}*", 1}, + SmilesSmartsNhits{"CN", "C-{a1;h1;r0;m0;u0;d0}*", 1} )); const std::string zero_atoms = R"pb( @@ -80,6 +86,13 @@ struct 
SmilesProtoNhits { int nhits; }; +std::ostream& +operator<<(std::ostream& output, const SmilesProtoNhits& proto) { + output << proto.smiles << ' ' << proto.proto << ' ' << proto.nhits; + + return output; +} + class TestDTBProto : public testing::TestWithParam { protected: Substructure_Query _query; @@ -95,7 +108,7 @@ TEST_P(TestDTBProto, Test1) { ASSERT_TRUE(_query.ConstructFromProto(proto)); // std::cerr << "TestingH '" << params.smiles << "' smarts '" << params.proto << " xpt " << params.nhits << '\n'; - EXPECT_EQ(_query.substructure_search(&_m), params.nhits); + EXPECT_EQ(_query.substructure_search(&_m), params.nhits) << "failed " << params; } INSTANTIATE_TEST_SUITE_P(TestDTBProto, TestDTBProto, testing::Values( SmilesProtoNhits{"OC", zero_atoms, 0}, @@ -103,7 +116,275 @@ INSTANTIATE_TEST_SUITE_P(TestDTBProto, TestDTBProto, testing::Values( SmilesProtoNhits{"OCC", one_atom, 0}, SmilesProtoNhits{"OC", two_atoms, 0}, SmilesProtoNhits{"OCC", two_atoms, 1}, - SmilesProtoNhits{"OCCC", two_atoms, 0} + SmilesProtoNhits{"OCCC", two_atoms, 0}, + SmilesProtoNhits{"OCCCC", + R"pb( +query { + smarts: "OC-{a3;r0;h0;m0}C" +} +)pb", 1}, + SmilesProtoNhits{"OCCCC", + R"pb( +query { + smarts: "OC-{m>0}C" +} +)pb", 0}, + SmilesProtoNhits{"OCCCn1cccc1", + R"pb( +query { + smarts: "OC-{m5}C" +} +)pb", 1}, + + SmilesProtoNhits{"OCCCn1cccc1", + R"pb( +query { + smarts: "OC-{m{5}}C" +}, +)pb", 1}, + + SmilesProtoNhits{"OCCCn1cccc1", + R"pb( +query { + smarts: "OC-{m{5-}}C" +} +)pb", 1}, + + SmilesProtoNhits{"OCCCn1cccc1", + R"pb( +query { + smarts: "OC-{m>4}C" +} +)pb", 1}, + + SmilesProtoNhits{"OCCCn1cccc1", + R"pb( +query { + smarts: "OC-{u>0}C" +} +)pb", 0}, + + SmilesProtoNhits{"OCCCn1cccc1", + R"pb( +query { + smarts: "OC-{r5}C" +} +)pb", 1}, + + SmilesProtoNhits{"OCCCn1cccc1", + R"pb( +query { + smarts: "OC-{r5;m>4;a7;u0}C" +} +)pb", 1}, + + SmilesProtoNhits{"OCCC(=O)N", + R"pb( +query { + smarts: "OC-{u1}C" +} +)pb", 0}, + + SmilesProtoNhits{"OCCC(=O)N", + R"pb( +query { + 
smarts: "OC-{u2}C" +} +)pb", 1}, + + SmilesProtoNhits{"OCCC(=O)N", + R"pb( +query { + smarts: "OC-{a4;h2;u2;r0;m0}C" +} +)pb", 1}, + + SmilesProtoNhits{"OCCC#C", + R"pb( +query { + smarts: "OC-{u2}C" +} +)pb", 1}, + + SmilesProtoNhits{"OCCC#C", + R"pb( +query { + smarts: "OC-{u2;h0;a3;r0;m0}C" +} +)pb", 1}, + + SmilesProtoNhits{"OCCC(=O)N(CCC)CO", + R"pb( +query { + smarts: "[CD3T2](=O)N" + down_the_bond { + a1: 0 + a2: 2 + heteroatom_count: 1 + } +} +)pb", 0}, + + SmilesProtoNhits{"OCCC(=O)N(CCC)CO", + R"pb( +query { + smarts: "[CD3T2](=O)N" + down_the_bond { + a1: 0 + a2: 2 + heteroatom_count: 1 + match_individual_substituent: true + } +} +)pb", 1}, + + SmilesProtoNhits{"OCCC(=O)N(CCC)CO", + R"pb( +query { + smarts: "[CD3T2](=O)N" + down_the_bond { + a1: 0 + a2: 2 + heteroatom_count: 1 + match_individual_substituent: true + no_other_substituents_allowed: true + } +} +)pb", 0}, + + SmilesProtoNhits{"P(=O)(O)(O)CCCC(N)C(O)=O CHEMBL28862", + R"pb( +query { + smarts: "[ND1H2]-[CD3x0](-[R0])-[CD3](=O)-[OD1]", + down_the_bond { + a1: 1 + a2: 2 + query_matches { + smarts: "[$(P(=O)-([OH])-[OH])]" + hits_needed: 0 + } + } +} +)pb", 0}, + + SmilesProtoNhits{"C(N)(C(=O)O)C(O)C CHEMBL30037", + R"pb( +query { + smarts: "[ND1H2]-[CD3x0](-[R0])-[CD3](=O)-[OD1]", + down_the_bond { + a1: 1 + a2: 2 + query_matches { + smarts: "[$([OD1]-[CD3])]" + hits_needed: 1 + } + } +} +)pb", 1}, + + SmilesProtoNhits{"C(=O)(O)[C@@H](N)[C@H](O)[C@H](O)C(=O)O CHEMBL28259", + R"pb( +query { + smarts: "[ND1H2]-[CD3x0](-[R0])-[CD3](=O)-[OD1]", + down_the_bond { + a1: 1 + a2: 2 + query_matches { + smarts: "[$([OD1]-[CD3G0])]" + min_hits_needed: 2 + } + } +}, +)pb", 1}, + + SmilesProtoNhits{"C(=O)(O)C(N)CCCCN CHEMBL28328", + R"pb( +query { + smarts: "[ND1H2]-[CD3x0](-[R0])-[CD3](=O)-[OD1]", + down_the_bond { + a1: 1 + a2: 2 + query_matches { + smarts: "[CD2]", + hits_needed: 4 + } + } +} +)pb", 1}, + + SmilesProtoNhits{"C(=O)(O)C(N)CCCCN CHEMBL28328", + R"pb( +query { + smarts: 
"[ND1H2]-[CD3x0](-[R0])-[CD3](=O)-[OD1]", + down_the_bond { + a1: 1 + a2: 2 + query_matches { + smarts: "[CD2]", + max_hits_needed: 3 + } + } +} +)pb", 0}, + + SmilesProtoNhits{"c1ccccc1CC(C)(C)C", + R"pb( +query { + smarts: "c-[CH2][CD4]" + down_the_bond { + a1: 1 + a2: 2 + natoms: 4 + } +} +)pb", 1}, + + SmilesProtoNhits{"c1ccccc1CC(C)(C)C", + R"pb( +query { + smarts: "c-[CH2][CD4]" + down_the_bond { + a1: 1 + a2: 2 + match_individual_substituent: true + natoms: 2 + heteroatom_count: 0 + ring_atom_count: 0 + unsaturation_count: 0 + aromatic_count: 0 + } +} +)pb", 1}, + + SmilesProtoNhits{"FC(C)N", + R"pb( +query { + smarts: "F[CD3]", + down_the_bond { + match_individual_substituent: true + a1: 0 + a2: 1 + natoms: 2 + heteroatom_count: 1 + } +} +)pb", 1}, + + SmilesProtoNhits{"NCCC(F)(F)F", + R"pb( +query { + smarts: "[ND1H2]-[CD2]", + down_the_bond { + a1: 0 + a2: 1 + query_matches { + smarts: "[$([CD4](F)(F)F)]", + hits_needed: 1 + } + } +} +)pb", 1} + )); } // namespace diff --git a/src/Molecule_Lib/substructure_nmab.cc b/src/Molecule_Lib/substructure_nmab.cc index 285daaba..229b073c 100644 --- a/src/Molecule_Lib/substructure_nmab.cc +++ b/src/Molecule_Lib/substructure_nmab.cc @@ -1359,6 +1359,27 @@ IdentifyShortestPath(Molecule& m, return rc; } +static int +CountHeteroatoms(Molecule& m, atom_number_t avoid1, atom_number_t avoid2, + const int* in_region) { + const int matoms = m.natoms(); + + int rc = 0; + for (int i = 0; i < matoms; ++i) { + if (! in_region[i]) { + continue; + } + if (i == avoid1 || i == avoid2) { + continue; + } + if (m.atomic_number(i) != 6) { + ++rc; + } + } + + return rc; +} + // #define DEBUG_REGION_MATCHES int @@ -1433,6 +1454,13 @@ Region::Matches(Molecule_to_Match& target, } } + if (_heteroatom_count.is_set()) { + int heteratoms = CountHeteroatoms(*m, edges[0], edges[1], tmp.get()); + if (! 
_heteroatom_count.matches(heteratoms)) { + return 0; + } + } + return 1; } diff --git a/src/Molecule_Lib/substructure_nmab_test.cc b/src/Molecule_Lib/substructure_nmab_test.cc index 548ca1e2..84189730 100644 --- a/src/Molecule_Lib/substructure_nmab_test.cc +++ b/src/Molecule_Lib/substructure_nmab_test.cc @@ -553,7 +553,44 @@ query { atoms_not_on_shortest_path: 0 } } -)pb", "CNCC1C(=O)C(C1)CNC", 1} +)pb", "CNCC1C(=O)C(C1)CNC", 1}, + + ProtoMolMatches{R"pb( +query { + smarts: "[N].[N]" + unique_embeddings_only: true + region { + atom: [0, 1] + atoms_not_on_shortest_path: 0 + heteroatom_count: 0 + } +} +)pb", "NCCCCCCN", 1}, + + ProtoMolMatches{R"pb( +query { + smarts: "[N].[N]" + unique_embeddings_only: true + region { + atom: [0, 1] + atoms_not_on_shortest_path: 0 + heteroatom_count: [1, 3] + } +} +)pb", "NCCCCCCN", 0}, + + ProtoMolMatches{R"pb( +query { + smarts: "[N].[N]" + unique_embeddings_only: true + region { + atom: [0, 1] + natoms: 6 + atoms_not_on_shortest_path: 0 + heteroatom_count: [2, 3] + } +} +)pb", "NCOCOCCN", 1} )); diff --git a/src/Molecule_Lib/substructure_sepma_test.cc b/src/Molecule_Lib/substructure_sepma_test.cc index 7bae1fae..0e8a9363 100644 --- a/src/Molecule_Lib/substructure_sepma_test.cc +++ b/src/Molecule_Lib/substructure_sepma_test.cc @@ -311,7 +311,7 @@ TEST_F(TestSeparatedAtoms, TestMultiple) { EXPECT_FALSE(_query.substructure_search(&_m)); } -TEST_F(TestSeparatedAtoms, TestRespedtInitialNumbering) { +TEST_F(TestSeparatedAtoms, TestRespectInitialNumbering) { _string_proto = R"(query { respect_initial_atom_numbering: true query_atom { @@ -355,4 +355,96 @@ TEST_F(TestSeparatedAtoms, TestRespedtInitialNumbering) { EXPECT_FALSE(_query.substructure_search(&_m)); } +struct ProtoSmilesResult { + std::string proto; + IWString smiles; + int expected; +}; + +std::ostream& +operator<< (std::ostream& output, const ProtoSmilesResult& psr) { + output << psr.proto << ' ' << psr.smiles << ' ' << psr.expected; + return output; +} + +class TestSepma: 
public testing::TestWithParam { + protected: + SubstructureSearch::SubstructureQuery _proto; + Substructure_Query _query; + Substructure_Results _sresults; + Molecule _mol; +}; +TEST_P(TestSepma, Tests) { + const auto params = GetParam(); + ASSERT_TRUE(google::protobuf::TextFormat::ParseFromString(params.proto, &_proto)); + ASSERT_TRUE(_query.ConstructFromProto(_proto)); + ASSERT_TRUE(_mol.build_from_smiles(params.smiles)); + // cerr << "Testing " << params.smiles << " expecting " << params.expected << '\n'; + EXPECT_EQ(_query.substructure_search(_mol, _sresults), params.expected) << params; +} +INSTANTIATE_TEST_SUITE_P(TestSepma, TestSepma, testing::Values( + ProtoSmilesResult{ + R"pb( +query { + smarts: "O.O" + unique_embeddings_only: true + separated_atoms { + a1: 0 + a2: 1 + rotbond: 3 + } +} +)pb", "OCCCCO", 1}, + + ProtoSmilesResult{ + R"pb( +query { + smarts: "O.O" + separated_atoms { + a1: 0 + a2: 1 + rotbond: 3 + } +} +)pb", "OCCCCO", 2}, + + ProtoSmilesResult{ + R"pb( +query { + smarts: "O.O" + separated_atoms { + a1: 0 + a2: 1 + bonds_between: 5 + } +} +)pb", "OC1CCC(O)CC1", 2}, + + ProtoSmilesResult{ + R"pb( +query { + smarts: "O.O" + separated_atoms { + a1: 0 + a2: 1 + bonds_between: 5 + rotbond: 0 + } +} +)pb", "OC1CCC(O)CC1", 2}, + + ProtoSmilesResult{ + R"pb( +query { + smarts: "O.O" + separated_atoms { + a1: 0 + a2: 1 + min_rotbond: 2 + max_rotbond: 2 + } +} +)pb", "OCC(=O)NCO", 2} +)); + } // namespace diff --git a/src/Molecule_Lib/substructure_spec.cc b/src/Molecule_Lib/substructure_spec.cc index 8f3b7d7e..828cabb1 100644 --- a/src/Molecule_Lib/substructure_spec.cc +++ b/src/Molecule_Lib/substructure_spec.cc @@ -2076,10 +2076,6 @@ Substructure_Atom::construct_from_smarts_token(const const_IWSubstring& smarts) Atomic_Smarts_Component* asc = &tokens; - // As tokenised, the operator comes in with the previous token, but - // needed to be added to _operator with the next component. 
- int op = IW_LOGEXP_UNDEFINED; - do { #ifdef DEBUG_ATOM_CONSTRUCT_FROM_SMARTS_TOKEN cerr << "Building component from '" << *asc << " op " << asc->op() << '\n'; diff --git a/src/Molecule_Lib/substructure_test.cc b/src/Molecule_Lib/substructure_test.cc index 752b6f75..c40a9967 100644 --- a/src/Molecule_Lib/substructure_test.cc +++ b/src/Molecule_Lib/substructure_test.cc @@ -2692,6 +2692,13 @@ struct ProtoMolMatches { int expected; }; +std::ostream& +operator<<(std::ostream& output, const ProtoMolMatches& pmm) { + output << pmm.proto << ' ' << " smiles " << pmm.smiles << " expected " << pmm.expected; + + return output; +} + TEST_F(TestSubstructure, TestAndWithTwoZeros) { ASSERT_TRUE(_query.create_from_smarts("0[<23C]&&0[>23C]")); @@ -2918,7 +2925,7 @@ TEST_P(TestSubstructureP, Tests) { ASSERT_TRUE(_query.ConstructFromProto(_proto)); ASSERT_TRUE(_mol.build_from_smiles(params.smiles)); // cerr << "Testing " << params.smiles << " expecting " << params.expected << '\n'; - EXPECT_EQ(_query.substructure_search(_mol, _sresults), params.expected); + EXPECT_EQ(_query.substructure_search(_mol, _sresults), params.expected) << params; } INSTANTIATE_TEST_SUITE_P(TestSubstructureP, TestSubstructureP, testing::Values( ProtoMolMatches{aminunch, "CN", 1}, @@ -2954,7 +2961,63 @@ INSTANTIATE_TEST_SUITE_P(TestSubstructureP, TestSubstructureP, testing::Values( ProtoMolMatches{substituent_isotope1, "C[1CH2](C)(C)C1CC1CN", 4}, ProtoMolMatches{substituent_isotope1, "C[1CH2](C)(C)C1CC1C[1NH2]", 6}, ProtoMolMatches{substituent_different_global_ids_23, "CCC1CC1C", 2}, - ProtoMolMatches{substituent_different_global_ids_33, "CCC1CC1C", 3} + ProtoMolMatches{substituent_different_global_ids_33, "CCC1CC1C", 3}, + + ProtoMolMatches{R"pb( +query { + smarts: "NC=O" + substituent { + heteroatom_count: 0 + } +} +)pb", "NC(=O)C", 1}, + + ProtoMolMatches{R"pb( +query { + smarts: "NC=O" + substituent { + heteroatom_count: 0 + } +} +)pb", "NC(=O)CCCO", 0}, + + ProtoMolMatches{R"pb( +query { + smarts: "NC=O" 
+ substituent { + heteroatom_count: 1 + } +} +)pb", "NC(=O)CCCO", 1}, + + ProtoMolMatches{R"pb( +query { + smarts: "NC=O" + substituent { + heteroatom_count: 1 + length: 4 + natoms: 4 + nrings: 0 + unsaturation_count: 0 + hits_needed: 1 + } +} +)pb", "NC(=O)CCCO", 1}, + + ProtoMolMatches{R"pb( +query { + smarts: "CCCC" + unique_embeddings_only: true + substituent { + heteroatom_count: 1 + length: 1 + natoms: 1 + nrings: 0 + unsaturation_count: 0 + hits_needed: 4 + } +} +)pb", "FC(F)(F)CCCF", 1} )); struct SmilesSmartsMatches { diff --git a/src/Molecule_Tools/BUILD b/src/Molecule_Tools/BUILD index 52e6e147..0382334e 100644 --- a/src/Molecule_Tools/BUILD +++ b/src/Molecule_Tools/BUILD @@ -16,6 +16,7 @@ local_install( ":common_names", ":dbf", ":dicer", + ":dicer_to_topological_types", ":echoqry", ":echorxn", ":ec_fingerprint", @@ -54,7 +55,6 @@ local_install( ":long_molecules", ":maccskeys", ":make_these_molecules", - ":marvin_pka", ":medchemwizard", ":minor_changes", ":mkfrag", @@ -91,6 +91,7 @@ local_install( ":ring_extraction", ":ring_fingerprint", ":ring_replacement", + ":ring_replacement_collate", ":ring_size_fingerprint", ":ring_substitution", ":ring_trimming", @@ -331,6 +332,20 @@ cc_proto_library( ], ) +proto_library( + name = "pharmacophore_2d_proto", + srcs = [ + "pharmacophore_2d.proto", + ], +) + +cc_proto_library( + name = "pharmacophore_2d_cc_proto", + deps = [ + "pharmacophore_2d_proto", + ], +) + proto_library( name = "random_molecular_permutations_proto", srcs = [ @@ -1256,18 +1271,6 @@ cc_binary( ], ) -cc_binary( - name = "marvin_pka", - srcs = [ - "marvin_pka.cc", - ], - deps = [ - "//Foundational/cmdline:iwcmdline", - "//Molecule_Lib:iwmolecule", - "//Molecule_Lib:moleculeio", - ], -) - cc_binary( name = "minor_changes", srcs = [ @@ -1520,9 +1523,11 @@ cc_binary( "pharmacophore_2d.cc", ], deps = [ + ":pharmacophore_2d_cc_proto", "//Foundational/cmdline:iwcmdline", "//Molecule_Lib:iwmolecule", "//Molecule_Lib:moleculeio", + 
"@com_google_protobuf//:protobuf", ], ) @@ -1819,6 +1824,23 @@ cc_binary( "//Molecule_Lib:iwmolecule", "//Molecule_Lib:moleculeio", "//Molecule_Lib:substructure_cc_proto", + "@com_google_protobuf//:protobuf", + ], +) + +cc_binary( + name = "ring_replacement_collate", + srcs = [ + "ring_replacement_collate.cc", + ], + deps = [ + ":replacement_ring_cc_proto", + "//Foundational/accumulator", + "//Foundational/cmdline_v2", + "//Foundational/iwaray", + "//Foundational/iwmisc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_protobuf//:protobuf", ], ) diff --git a/src/Molecule_Tools/INT_iwdescr_fp.cc b/src/Molecule_Tools/INT_iwdescr_fp.cc deleted file mode 100644 index e69de29b..00000000 diff --git a/src/Molecule_Tools/INT_tsubstructure_fp.cc b/src/Molecule_Tools/INT_tsubstructure_fp.cc deleted file mode 100644 index 0c6b5f4e..00000000 --- a/src/Molecule_Tools/INT_tsubstructure_fp.cc +++ /dev/null @@ -1,112 +0,0 @@ -#include "Foundational/iwmisc/sparse_fp_creator.h" -#include "Foundational/iwbits/iwbits.h" - -#include "Molecule_Lib/molecule.h" -#include "tsubstructure_fp.h" - -TSubstructure_FP::TSubstructure_FP() -{ - _work_as_filter = 0; - _bit_replicates = 1; - _default_fingerprint_nbits = 0; - - return; -} - -/* - We have two scenarios for fingerprint output. 
- We are reading a file of molecules - We are reading a TDT file as a filter -*/ - -int -TSubstructure_FP::_do_fingerprint_output(int nq, - const int * hits, - std::ostream & output) -{ - IW_Bits_Base fp; - if (_default_fingerprint_nbits) - fp.allocate_space_for_bits(_default_fingerprint_nbits * _bit_replicates); - - if (1 == _bit_replicates) - fp.construct_from_array_of_ints(hits, _default_fingerprint_nbits); // 2nd arg default_fingerprint_nbits rather than nq to make sure the bit vector is filled all the way - else - { - for (int i = 0; i < nq; i++) - { - if (0 == hits[i]) - continue; - - for (int j = 0; j < _bit_replicates; j++) - { - if (hits[j]) - fp.set(j * _default_fingerprint_nbits + i); - } - } - } - - fp.write_daylight_ascii_representation(output, _tag); - - return output.good(); -} - -int -TSubstructure_FP::_do_sparse_fingerprint_output (int nq, - const int * hits, - std::ostream & output) -{ - Sparse_Fingerprint_Creator sfp; - - if (1 == _bit_replicates) - sfp.create_from_array_of_ints(hits, nq); - else - { - for (int i = 0; i < nq; i++) - { - if (0 == hits[i]) - continue; - - for (int j = 0; j < _bit_replicates; j++) - { - sfp.hit_bit(j * _bit_replicates + i); - } - } - } - - IWString ascii_rep; - sfp.daylight_ascii_form_with_counts_encoded(ascii_rep); - - output << _tag << ascii_rep << ">\n"; - - return output.good(); -} - -int -write_smiles_and_pcn(Molecule & m, - std::ostream & output) -{ - output << "$SMI<" << m.smiles() << ">\n"; - output << "PCN<" << m.molecule_name() << ">\n"; - - return output.good(); -} - -int -TSubstructure_FP::do_fingerprint_output(Molecule & m, - int nq, - const int * hits, - std::ostream & output) -{ - if (! _work_as_filter) - write_smiles_and_pcn(m, output); - - if (_tag.starts_with("NC")) - (void) _do_sparse_fingerprint_output(nq, hits, output); - else - (void) _do_fingerprint_output(nq, hits, output); - - if (! 
_work_as_filter) - output << "|\n"; - - return output.good(); -} diff --git a/src/Molecule_Tools/_resizable_array_nas.cc b/src/Molecule_Tools/_resizable_array_nas.cc deleted file mode 100644 index 916b5cfa..00000000 --- a/src/Molecule_Tools/_resizable_array_nas.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION - -#include "nass.h" - -template class resizable_array_p; -template class resizable_array_base; - diff --git a/src/Molecule_Tools/dbf.cc b/src/Molecule_Tools/dbf.cc index 32e253b2..ad67a0d9 100644 --- a/src/Molecule_Tools/dbf.cc +++ b/src/Molecule_Tools/dbf.cc @@ -996,7 +996,7 @@ do_spatial_and_topological_distances_computation( int b; if (!ok_topology(m, *ci, *cj, b)) { - cerr << "Distance " << d << " rejected by topology\n"; + // cerr << "Distance " << d << " rejected by topology\n"; continue; } diff --git a/src/Molecule_Tools/fileconv.cc b/src/Molecule_Tools/fileconv.cc index 362d0618..afc7bc99 100644 --- a/src/Molecule_Tools/fileconv.cc +++ b/src/Molecule_Tools/fileconv.cc @@ -14,6 +14,7 @@ using std::cerr; #include "Foundational/accumulator/accumulator.h" #include "Foundational/cmdline/cmdline.h" #include "Foundational/iwmisc/misc.h" +#include "Foundational/iwmisc/report_progress.h" #include "Molecule_Lib/molecule.h" #include "Molecule_Lib/istream_and_type.h" @@ -56,6 +57,8 @@ struct LocalOptions { // Optionally we can write connection table errors to a destination. IWString connection_table_error_file; + Report_Progress report_progress; + // functions private: // If requested, write `smiles` to stream_for_smiles_before_filters @@ -75,6 +78,8 @@ struct LocalOptions { int ParseBadHandlingoptions(Command_Line& cl, char flag); + int SetupReporting(Command_Line& cl, char flag); + public: // Initialise settings from command line options. 
int Build(Command_Line& cl); @@ -114,6 +119,8 @@ LocalOptions::Build(Command_Line& cl) { SetDebugPrintEachMolecule(cl, 'Y'); + SetupReporting(cl, 'Y'); + audit_input = cl.option_present('a'); if (! ParseBadHandlingoptions(cl, 'B')) { @@ -282,6 +289,26 @@ LocalOptions::OpenPreFilteredSmilesFile(Command_Line& cl, char flag) { return 1; } +int +LocalOptions::SetupReporting(Command_Line& cl, char flag) { + const_IWSubstring y; + for (int i = 0; cl.value(flag, y, i); ++i) { + if (! y.starts_with("rpt=")) { + continue; + } + y.remove_leading_chars(4); + uint32_t rpt; + if (! y.numeric_value(rpt)) { + cerr << "Invalid -Y rpt=... directive '" << y << "'\n"; + return 0; + } + report_progress.set_report_every(rpt); + return 1; + } + + return 1; +} + void LocalOptions::SetupInputFile(data_source_and_type& input) const { if (verbose > 1) { @@ -395,6 +422,18 @@ LocalOptions::ReportResults(const Command_Line& cl, std::ostream& output) const int LocalOptions::Fileconv(Molecule& m, fileconv::FileconvConfig& config) { + + if (report_progress()) { + cerr << "Read " << molecules_read << " molecules"; + if (! 
audit_input) { + cerr << ", wrote " << molecules_written; + if (molecules_changed) { + cerr << molecules_changed << " molecules changed"; + } + } + cerr << '\n'; + } + if (debug_print_each_molecule) { m.compute_aromaticity_if_needed(); m.debug_print(cerr); diff --git a/src/Molecule_Tools/fingerprint_substructure.cc b/src/Molecule_Tools/fingerprint_substructure.cc index 4f27efaa..5c12f892 100644 --- a/src/Molecule_Tools/fingerprint_substructure.cc +++ b/src/Molecule_Tools/fingerprint_substructure.cc @@ -12,6 +12,7 @@ #include "Molecule_Lib/aromatic.h" #include "Molecule_Lib/atom_typing.h" +#include "Molecule_Lib/etrans.h" #include "Molecule_Lib/iwmfingerprint.h" #include "Molecule_Lib/istream_and_type.h" #include "Molecule_Lib/molecule.h" @@ -31,6 +32,8 @@ static int verbose = 0; static Chemical_Standardisation chemical_standardisation; +static Element_Transformations element_transformations; + static int reduce_to_largest_fragment = 0; static resizable_array_p queries; @@ -98,6 +101,7 @@ usage (int rc) cerr << " -P ... atom typing specification, enter '-P help' for info\n"; cerr << " -M produce atom pair fingerprints\n"; cerr << " -f work as a filter\n"; + cerr << " -T ... standard element transformations -T I=Cl -T Br=Cl ...\n"; cerr << " -e truncate any counted fingerprint to 0/1\n"; cerr << " -l reduce to largest fragment\n"; cerr << " -i input specification\n"; @@ -694,6 +698,10 @@ preprocess (Molecule & m) if (chemical_standardisation.active()) chemical_standardisation.process(m); + if (element_transformations.active()) { + element_transformations.process(m); + } + return; } @@ -969,6 +977,13 @@ fingerprint_substructure (int argc, char ** argv) } } + if (cl.option_present('T')) { + if (! 
element_transformations.construct_from_command_line(cl, verbose, 'T')) { + cerr << "Cannot initialise element transformations (-T)\n"; + return 0; + } + } + if (cl.option_present('e')) { truncate_counted_fingerprints_to_one = 1; @@ -977,8 +992,7 @@ fingerprint_substructure (int argc, char ** argv) cerr << "Will truncate counted fingerprints to 0/1\n"; } - if (cl.option_present('T')) - { + if (cl.option_present('T')) { cl.value('T', fingerprint_tag); if (verbose) diff --git a/src/Molecule_Tools/iwdescr.cc b/src/Molecule_Tools/iwdescr.cc index ba4201e5..1d33aad8 100644 --- a/src/Molecule_Tools/iwdescr.cc +++ b/src/Molecule_Tools/iwdescr.cc @@ -2279,6 +2279,7 @@ write_the_output(Molecule & m, } output << output_separator; + cerr << "writing " << i << std::endl; float v; if (descriptor[i].value(v)) { @@ -8528,7 +8529,7 @@ preprocess(Molecule & m) if (0 == reduce_to_largest_fragment) { cerr << "Fatal, '" << m.name() << " has " << m.number_fragments() << " components\n"; - iwabort(); + //iwabort(); } (void) m.reduce_to_largest_fragment_carefully(); @@ -9229,6 +9230,9 @@ iwdescr(int argc, char ** argv) if (verbose) cerr << "Undefined values will be written as '" << undefined_value << "'\n"; } + if (undefined_value == "none") { + undefined_value = ""; + } } // Ran into problems trying to do test with a charge assigner present diff --git a/src/Molecule_Tools/mol2qry.cc b/src/Molecule_Tools/mol2qry.cc index b08321db..3e4e6970 100644 --- a/src/Molecule_Tools/mol2qry.cc +++ b/src/Molecule_Tools/mol2qry.cc @@ -15,6 +15,7 @@ #include "Molecule_Lib/aromatic.h" #include "Molecule_Lib/ematch.h" +#include "Molecule_Lib/etrans.h" #include "Molecule_Lib/istream_and_type.h" #include "Molecule_Lib/mdl_molecule.h" #include "Molecule_Lib/molecule_to_query.h" @@ -33,7 +34,7 @@ constexpr char kCloseSquareBracket = ']'; constexpr char kOpenBrace = '{'; constexpr char kCloseBrace = '}'; -const char * prog_name = nullptr; +const char* prog_name = nullptr; int queries_written = 0; int verbose = 
0; @@ -50,6 +51,8 @@ int isotopically_labelled_from_slicer = 0; Chemical_Standardisation chemical_standardisation; +Element_Transformations element_transformations; + // We can recognise R atoms as substitution points int change_R_groups_to_substitutions = 0; @@ -75,11 +78,10 @@ int write_smarts_relationals_as_rdkit_ranges = 0; // People sometimes draw a two atom molecule across a ring in order to // signify that every atom on that ring could be a point of substitution -//static int kludge_for_ring_substitution = 0; +// static int kludge_for_ring_substitution = 0; int -WriteProto(Substructure_Query& query, - IWString_and_File_Descriptor& output) { +WriteProto(Substructure_Query& query, IWString_and_File_Descriptor& output) { SubstructureSearch::SubstructureQuery proto = query.BuildProto(); std::string as_string; @@ -90,12 +92,10 @@ WriteProto(Substructure_Query& query, return 1; } - // Output by this programme is complex. // struct Mol2QryOutput { - public: - + public: IWString stem_for_output; // If we are processing multiple molecules, we need to produce a different .qry @@ -121,18 +121,18 @@ struct Mol2QryOutput { std::unique_ptr proto_destination; - public: - Mol2QryOutput(); + public: + Mol2QryOutput(); - int Initialise(Command_Line& cl); + int Initialise(Command_Line& cl); - int SetOutputFnameIfNeeded(const char * ifile, IWString& output_fname) const; + int SetOutputFnameIfNeeded(const char* ifile, IWString& output_fname) const; - IWString NextFileNameStem(); + IWString NextFileNameStem(); - int NewFileCreated(const IWString& fname); + int NewFileCreated(const IWString& fname); - int DoOutput(Substructure_Query& query, IWString& fname); + int DoOutput(Substructure_Query& query, IWString& fname); }; Mol2QryOutput::Mol2QryOutput() { @@ -154,7 +154,8 @@ Mol2QryOutput::Initialise(Command_Line& cl) { } if (cl.option_present('b')) { - if (cl.option_present('F')) { cerr << "The -F and -b options don't make sense together\n"; + if (cl.option_present('F')) { + cerr << 
"The -F and -b options don't make sense together\n"; return 0; } @@ -163,11 +164,12 @@ Mol2QryOutput::Initialise(Command_Line& cl) { // because it would create a single query with many components, rather than many // queries. if (write_as_text_proto) { - cerr << "Mol2QryOutput::Initialise:cannot write multiple text proto to a single file, suggest using -P option\n"; - return 0; + cerr << "Mol2QryOutput::Initialise:cannot write multiple text proto to a single " + "file, suggest using -P option\n"; + return 0; } - if (! cl.option_present('S')) { + if (!cl.option_present('S')) { cerr << "Sorry, must specify the -S option with the -b option\n"; return 0; } @@ -194,15 +196,17 @@ Mol2QryOutput::Initialise(Command_Line& cl) { IWString fname(stem_for_output); if (write_as_text_proto) { fname << ".txtproto"; - if (! stream_for_all_queries_iwstring.open(fname.null_terminated_chars())) { - cerr << "Mol2QryOutput::Initialise:cannot open stream for all queries '" << fname << "'\n"; + if (!stream_for_all_queries_iwstring.open(fname.null_terminated_chars())) { + cerr << "Mol2QryOutput::Initialise:cannot open stream for all queries '" + << fname << "'\n"; return 0; } } else { fname << ".qry"; stream_for_all_queries.open(fname.null_terminated_chars(), std::ios::out); - if (! stream_for_all_queries.good()) { - cerr << "Mol2QryOutput::Initialise:cannot open stream for all queries '" << fname << "'\n"; + if (!stream_for_all_queries.good()) { + cerr << "Mol2QryOutput::Initialise:cannot open stream for all queries '" + << fname << "'\n"; return 0; } } @@ -221,7 +225,7 @@ Mol2QryOutput::Initialise(Command_Line& cl) { if (cl.option_present('P')) { IWString fname = cl.string_value('P'); proto_destination = std::make_unique(); - if (! 
proto_destination->Open(fname)) { + if (!proto_destination->Open(fname)) { cerr << "Cannot open binary serialized proto file " << fname << '\n'; return 0; } @@ -232,9 +236,9 @@ Mol2QryOutput::Initialise(Command_Line& cl) { } if (cl.option_present('F')) { - const char * f = cl.option_value('F'); + const char* f = cl.option_value('F'); - if ( !stream_for_names_of_query_files.open(f)) { + if (!stream_for_names_of_query_files.open(f)) { cerr << "Cannot open stream for query files '" << f << "'\n"; return 8; } @@ -250,20 +254,19 @@ Mol2QryOutput::Initialise(Command_Line& cl) { // Input file `ifile` is being processed. If this needs a specific output file name // for this input, set it in `output_fname`. int -Mol2QryOutput::SetOutputFnameIfNeeded(const char* ifile, - IWString& output_fname) const { +Mol2QryOutput::SetOutputFnameIfNeeded(const char* ifile, IWString& output_fname) const { if (all_queries_in_one_file) { // file already opened elsewhere return 1; - } + } if (proto_destination) { return 1; - } + } if (!stem_for_output.empty()) { output_fname = stem_for_output; return 1; - } + } output_fname = ifile; output_fname.remove_suffix(); @@ -298,18 +301,19 @@ Mol2QryOutput::NewFileCreated(const IWString& fname) { } int -Mol2QryOutput::DoOutput(Substructure_Query& query, - IWString& fname) { - // cerr << "write_as_text_proto " << write_as_text_proto << " proto_destination " << (proto_destination ? "yes":"no") << " fname '" << fname << "'\n"; +Mol2QryOutput::DoOutput(Substructure_Query& query, IWString& fname) { + // cerr << "write_as_text_proto " << write_as_text_proto << " proto_destination " << + // (proto_destination ? 
"yes":"no") << " fname '" << fname << "'\n"; if (write_as_text_proto) { - cerr << "write_as_text_proto, all_queries_in_one_file " << all_queries_in_one_file << '\n'; + cerr << "write_as_text_proto, all_queries_in_one_file " << all_queries_in_one_file + << '\n'; if (all_queries_in_one_file) { WriteProto(query, stream_for_all_queries_iwstring); stream_for_all_queries_iwstring.write_if_buffer_holds_more_than(4096); return 1; } IWString_and_File_Descriptor output; - if (! output.open(fname.null_terminated_chars())) { + if (!output.open(fname.null_terminated_chars())) { cerr << "Mol2QryOutput::DoOutput:cannot open '" << fname << "'\n"; return 0; } @@ -317,21 +321,21 @@ Mol2QryOutput::DoOutput(Substructure_Query& query, WriteProto(query, output); NewFileCreated(fname); return 1; - } - + } + if (proto_destination) { SubstructureSearch::SubstructureQuery proto = query.BuildProto(); std::string serialized; proto.SerializeToString(&serialized); return proto_destination->Write(serialized.data(), serialized.size()); - } + } if (all_queries_in_one_file) { return query.write_msi(stream_for_all_queries); } - + std::ofstream output(fname.null_terminated_chars(), std::ios::out); - if (! 
output.good()) { + if (!output.good()) { cerr << "Mol2QryOutput::DoOutput:cannot open '" << fname << "'\n"; return 0; } @@ -345,18 +349,17 @@ Mol2QryOutput::DoOutput(Substructure_Query& query, return 0; } - void -usage(int rc = 1) -{ +usage(int rc = 1) { // clang-format off #if defined(GIT_HASH) && defined(TODAY) cerr << __FILE__ << " compiled " << TODAY << " git hash " << GIT_HASH << '\n'; #else cerr << __FILE__ << " compiled " << __DATE__ << " " << __TIME__ << '\n'; #endif -// clang-format on -// clang-format off + // clang-format on + // clang-format off + cerr << "Converts a file of molecule(s) to query forms\n"; cerr << " -m all ncon and nbonds values are written as minima\n"; // does not work, not sure it makes sense cerr << " -r all ring bonds become type ANY\n"; cerr << " -j all atoms conserve their ring membership\n"; @@ -391,12 +394,13 @@ usage(int rc = 1) cerr << " -D ... create proto query files with GeometricConstraints, '-D help' for info\n"; cerr << " -B generate smarts instead of query file\n"; cerr << " -Y ... more obscure options, enter '-Y help' for info\n"; + cerr << " -T ... standard element transformations -T I=Cl -T Br=Cl ...\n"; cerr << " -i specify input file type\n"; display_standard_aromaticity_options(cerr); cerr << " -g ... chemical standardisation options, enter '-g help' for info\n"; cerr << " -v verbose operation\n"; -// clang-format on - + // clang-format on + exit(rc); } @@ -419,15 +423,14 @@ DisplayGeometricConfigOptions(std::ostream& output) { } int -BuildGeometricConfig(Command_Line & cl, char flag, - GeometryConfig& geometry_config) { +BuildGeometricConfig(Command_Line& cl, char flag, GeometryConfig& geometry_config) { IWString d; for (int i = 0; cl.value(flag, d, i); ++i) { d.to_lowercase(); if (d.starts_with("tol=")) { d.remove_leading_chars(4); - if (! 
d.numeric_value(geometry_config.tolerance) || - geometry_config.tolerance < 0.0) { + if (!d.numeric_value(geometry_config.tolerance) || + geometry_config.tolerance < 0.0) { cerr << "BuildGeometricConfig:invalid tolerance " << d << '\n'; return 0; } @@ -459,23 +462,22 @@ DistanceRange(float distance, const GeometryConfig& config) { // A smarts is constructed, that consists only of the atomic // number. That should be refined. int -ToGeometricConstraints(MDL_Molecule& m, - const IWString& name_stem, +ToGeometricConstraints(MDL_Molecule& m, const IWString& name_stem, const GeometryConfig& config) { IWString smarts; const int matoms = m.natoms(); std::unique_ptr ecount(new_int(HIGHEST_ATOMIC_NUMBER + 1)); std::unique_ptr atom_xref(new_int(matoms, -1)); - int ndx = 0; // A count of the number of active atoms. + int ndx = 0; // A count of the number of active atoms. for (int i = 0; i < matoms; ++i) { - const Atom * a = m.atomi(i); + const Atom* a = m.atomi(i); int atnum = a->atomic_number(); if (atnum == 0 || atnum > HIGHEST_ATOMIC_NUMBER) { cerr << "ToGeometricConstraints:invalid atomic number " << atnum << '\n'; return 0; } - if (atnum == 1 && ! 
config.include_hydrogen) { + if (atnum == 1 && !config.include_hydrogen) { continue; } atom_xref[i] = ndx; @@ -487,7 +489,7 @@ ToGeometricConstraints(MDL_Molecule& m, } SubstructureSearch::SubstructureQuery proto; - SubstructureSearch::SingleSubstructureQuery * qry = proto.add_query(); + SubstructureSearch::SingleSubstructureQuery* qry = proto.add_query(); qry->set_smarts(smarts.data(), smarts.length()); const IWString& mname = m.name(); @@ -498,13 +500,13 @@ ToGeometricConstraints(MDL_Molecule& m, if (ecount[i] == 0) { continue; } - SubstructureSearch::ElementsNeeded * needed - = qry->mutable_required_molecular_properties()->add_elements_needed(); + SubstructureSearch::ElementsNeeded* needed = + qry->mutable_required_molecular_properties()->add_elements_needed(); needed->add_atomic_number(i); needed->set_min_hits_needed(ecount[i]); } - GeometricConstraints::SetOfConstraints * constraints = qry->add_geometric_constraints(); + GeometricConstraints::SetOfConstraints* constraints = qry->add_geometric_constraints(); for (int i = 0; i < matoms; ++i) { if (atom_xref[i] < 0) { @@ -515,10 +517,10 @@ ToGeometricConstraints(MDL_Molecule& m, continue; } const float d = m.distance_between_atoms(i, j); - GeometricConstraints::Distance * dconstraint = constraints->add_distance(); + GeometricConstraints::Distance* dconstraint = constraints->add_distance(); dconstraint->set_a1(atom_xref[i]); dconstraint->set_a2(atom_xref[j]); - GeometricConstraints::Range * range = dconstraint->mutable_range(); + GeometricConstraints::Range* range = dconstraint->mutable_range(); const auto [min_dist, max_dist] = DistanceRange(d, config); range->set_min(min_dist); range->set_max(max_dist); @@ -539,58 +541,53 @@ ToGeometricConstraints(MDL_Molecule& m, } int -expand_isotopes(MDL_Molecule & m, - atom_number_t zatom, - int radius, - isotope_t iso) -{ - const Atom * a = m.atomi(zatom); +expand_isotopes(MDL_Molecule& m, atom_number_t zatom, int radius, isotope_t iso) { + const Atom* a = m.atomi(zatom); 
int acon = a->ncon(); -//cerr << "Expanding isotopes from " << zatom << '\n'; + // cerr << "Expanding isotopes from " << zatom << '\n'; - for (int i = 0; i < acon; i++) - { + for (int i = 0; i < acon; i++) { atom_number_t j = a->other(zatom, i); - if (iso == m.isotope(j)) + if (iso == m.isotope(j)) { continue; + } m.set_isotope(j, iso); - if (radius > 0) + if (radius > 0) { expand_isotopes(m, j, radius - 1, iso); + } } return 1; } int -identify_coordination_point_and_adjacent_atoms(MDL_Molecule & m) -{ +identify_coordination_point_and_adjacent_atoms(MDL_Molecule& m) { Substructure_Results sresults; Molecule_to_Match target(&m); const int nhits = coordination_point.substructure_search(target, sresults); - if (0 == nhits) - { + if (0 == nhits) { cerr << "Zero hits to coordination point substructure search\n"; return 0; } - if (verbose) + if (verbose) { cerr << m.name() << " " << nhits << " hits to coordination point query\n"; + } - for (const Set_of_Atoms* e : sresults.embeddings()) - { + for (const Set_of_Atoms* e : sresults.embeddings()) { const atom_number_t j = e->front(); - MDL_Atom_Data * mdlad = m.mdl_atom_data(j); + MDL_Atom_Data* mdlad = m.mdl_atom_data(j); - mdlad->set_substitution(-2); // means exactly as specified + mdlad->set_substitution(-2); // means exactly as specified m.set_isotope(j, 973); @@ -602,12 +599,9 @@ identify_coordination_point_and_adjacent_atoms(MDL_Molecule & m) } int -mol2qry(MDL_Molecule & m, - Molecule_to_Query_Specifications & mqs, - IWString& fname, - Mol2QryOutput& mol2qry_output) -{ - Set_of_Atoms & substitution_points = mqs.externally_specified_substitution_points(); +mol2qry(MDL_Molecule& m, Molecule_to_Query_Specifications& mqs, IWString& fname, + Mol2QryOutput& mol2qry_output) { + Set_of_Atoms& substitution_points = mqs.externally_specified_substitution_points(); substitution_points.resize_keep_storage(0); @@ -624,7 +618,7 @@ mol2qry(MDL_Molecule & m, } Substructure_Query query; - if (! 
query.create_from_molecule(m, mqs)) { // it inherits the molecule name + if (!query.create_from_molecule(m, mqs)) { // it inherits the molecule name cerr << "cannot create query from molecule '" << m.name() << "'\n"; return 1; } @@ -636,7 +630,7 @@ mol2qry(MDL_Molecule & m, return 0; } } - + if (append_to_comment.length()) { IWString tmp(m.name()); tmp.append_with_spacer(append_to_comment); @@ -651,14 +645,12 @@ mol2qry(MDL_Molecule & m, } int -mol2qry(MDL_Molecule & m, - Molecule_to_Query_Specifications & mqs, - const GeometryConfig& geometry_config, - const IWString & output_stem, - Mol2QryOutput& mol2qry_output) -{ +mol2qry(MDL_Molecule& m, Molecule_to_Query_Specifications& mqs, + const GeometryConfig& geometry_config, const IWString& output_stem, + Mol2QryOutput& mol2qry_output) { if (isotopically_labelled_from_slicer && 0 == m.number_isotopic_atoms()) { - cerr << "Warning, only substitute at isotopically labelled atoms, but no isotopes '" << m.name() << "'\n"; + cerr << "Warning, only substitute at isotopically labelled atoms, but no isotopes '" + << m.name() << "'\n"; } if (mol2qry_output.all_queries_in_one_file) { @@ -695,43 +687,46 @@ mol2qry(MDL_Molecule & m, */ void -preprocess(MDL_Molecule & m) -{ - if (remove_isotopes_from_input_molecules) +preprocess(MDL_Molecule& m) { + if (remove_isotopes_from_input_molecules) { m.transform_to_non_isotopic_form(); + } - if (chemical_standardisation.active()) + if (element_transformations.active()) { + element_transformations.process(m); + } + + if (chemical_standardisation.active()) { chemical_standardisation.process(m); + } - if (remove_chiral_centres) + if (remove_chiral_centres) { m.remove_all_chiral_centres(); + } - if (add_explicit_hydrogens) + if (add_explicit_hydrogens) { m.make_implicit_hydrogens_explicit(); + } return; } int -mol2qry(data_source_and_type & input, - Molecule_to_Query_Specifications & mqs, - const GeometryConfig& geometry_config, - IWString & output_fname, - Mol2QryOutput& mol2qry_output) -{ - 
MDL_Molecule * m; - - while (nullptr != (m = input.next_molecule())) - { +mol2qry(data_source_and_type& input, Molecule_to_Query_Specifications& mqs, + const GeometryConfig& geometry_config, IWString& output_fname, + Mol2QryOutput& mol2qry_output) { + MDL_Molecule* m; + + while (nullptr != (m = input.next_molecule())) { std::unique_ptr free_m(m); preprocess(*m); - if (! m->arrays_allocated()) { + if (!m->arrays_allocated()) { m->build(*m); } - if (! mol2qry(*m, mqs, geometry_config, output_fname, mol2qry_output)) { + if (!mol2qry(*m, mqs, geometry_config, output_fname, mol2qry_output)) { return 0; } } @@ -740,37 +735,31 @@ mol2qry(data_source_and_type & input, } int -mol2qry(const char * input_fname, - FileType input_type, - Molecule_to_Query_Specifications & mqs, - const GeometryConfig& geometry_config, - IWString & output_fname, - Mol2QryOutput& mol2qry_output) -{ +mol2qry(const char* input_fname, FileType input_type, + Molecule_to_Query_Specifications& mqs, const GeometryConfig& geometry_config, + IWString& output_fname, Mol2QryOutput& mol2qry_output) { if (FILE_TYPE_INVALID == input_type) { input_type = discern_file_type_from_name(input_fname); - assert (FILE_TYPE_INVALID != input_type); + assert(FILE_TYPE_INVALID != input_type); } data_source_and_type input(input_type, input_fname); - if (! 
input.ok()) { + if (!input.ok()) { cerr << prog_name << ": cannot read '" << input_fname << "'\n"; return 1; } - if (verbose > 1) + if (verbose > 1) { input.set_verbose(1); + } return mol2qry(input, mqs, geometry_config, output_fname, mol2qry_output); } int -mol2qry(const char * ifile, - const FileType input_type, - Molecule_to_Query_Specifications & mqs, - const GeometryConfig& geometry_config, - Mol2QryOutput& mol2qry_output) -{ +mol2qry(const char* ifile, const FileType input_type, + Molecule_to_Query_Specifications& mqs, const GeometryConfig& geometry_config, + Mol2QryOutput& mol2qry_output) { IWString output_fname; mol2qry_output.SetOutputFnameIfNeeded(ifile, output_fname); @@ -779,13 +768,11 @@ mol2qry(const char * ifile, } int -do_read_environment(const const_IWSubstring & fname, - Molecule_to_Query_Specifications & mqs) -{ +do_read_environment(const const_IWSubstring& fname, + Molecule_to_Query_Specifications& mqs) { iwstring_data_source input(fname); - if (! input.good()) - { + if (!input.good()) { cerr << "Cannot open '" << fname << "'\n"; return 0; } @@ -794,13 +781,11 @@ do_read_environment(const const_IWSubstring & fname, } int -do_read_environment_no_match(const const_IWSubstring & fname, - Molecule_to_Query_Specifications & mqs) -{ +do_read_environment_no_match(const const_IWSubstring& fname, + Molecule_to_Query_Specifications& mqs) { iwstring_data_source input(fname); - if (! input.good()) - { + if (!input.good()) { cerr << "Cannot open '" << fname << "'\n"; return 0; } @@ -808,15 +793,13 @@ do_read_environment_no_match(const const_IWSubstring & fname, return mqs.read_environment_no_match_specification(input); } - int -process_smiles_from_command_line(const IWString & smiles, - Molecule_to_Query_Specifications & mqs, +process_smiles_from_command_line(const IWString& smiles, + Molecule_to_Query_Specifications& mqs, const GeometryConfig& geometry_config, - Mol2QryOutput& mol2qry_output) -{ + Mol2QryOutput& mol2qry_output) { MDL_Molecule m; - if (! 
m.build_from_smiles(smiles)) { + if (!m.build_from_smiles(smiles)) { cerr << "Cannot parse -M smiles '" << smiles << "'\n"; return 54; } @@ -838,8 +821,7 @@ process_smiles_from_command_line(const IWString & smiles, // Only a limited set of functionality is supported. int -Mol2Smarts(Molecule_to_Query_Specifications& mqs, - Molecule& m, +Mol2Smarts(Molecule_to_Query_Specifications& mqs, Molecule& m, IWString_and_File_Descriptor& output) { m.compute_aromaticity_if_needed(); @@ -897,12 +879,11 @@ Mol2Smarts(Molecule_to_Query_Specifications& mqs, } int -Mol2Smarts(Molecule_to_Query_Specifications& mqs, - data_source_and_type& input, +Mol2Smarts(Molecule_to_Query_Specifications& mqs, data_source_and_type& input, IWString_and_File_Descriptor& output) { - Molecule * m; + Molecule* m; while ((m = input.next_molecule()) != nullptr) { - if (! Mol2Smarts(mqs, *m, output)) { + if (!Mol2Smarts(mqs, *m, output)) { cerr << "Cannot process " << m->name() << '\n'; return 0; } @@ -912,12 +893,10 @@ Mol2Smarts(Molecule_to_Query_Specifications& mqs, } int -Mol2Smarts(Molecule_to_Query_Specifications& mqs, - const char* fname, - FileType input_type, +Mol2Smarts(Molecule_to_Query_Specifications& mqs, const char* fname, FileType input_type, IWString_and_File_Descriptor& output) { - data_source_and_type input (input_type, fname); - if (! input.good()) { + data_source_and_type input(input_type, fname); + if (!input.good()) { cerr << "Mol2Smarts:cannot open '" << fname << "'\n"; return 0; } @@ -926,12 +905,10 @@ Mol2Smarts(Molecule_to_Query_Specifications& mqs, } int -Mol2Smarts(Molecule_to_Query_Specifications& mqs, - Command_Line& cl, - FileType input_type, +Mol2Smarts(Molecule_to_Query_Specifications& mqs, Command_Line& cl, FileType input_type, IWString_and_File_Descriptor& output) { - for (const char* fname: cl) { - if (! 
Mol2Smarts(mqs, fname, input_type, output)) { + for (const char* fname : cl) { + if (!Mol2Smarts(mqs, fname, input_type, output)) { cerr << "Mol2Smarts:error processing '" << fname << "'\n"; return 0; } @@ -941,22 +918,19 @@ Mol2Smarts(Molecule_to_Query_Specifications& mqs, } int -Mol2Smarts(Molecule_to_Query_Specifications& mqs, - Command_Line& cl, - FileType input_type, +Mol2Smarts(Molecule_to_Query_Specifications& mqs, Command_Line& cl, FileType input_type, IWString& fname) { IWString_and_File_Descriptor output; - if (! output.open(fname.null_terminated_chars())) { + if (!output.open(fname.null_terminated_chars())) { cerr << "Mol2Smarts:cannot open '" << fname << "'\n"; return 0; - } + } return Mol2Smarts(mqs, cl, input_type, output); } void -display_dash_y_options(std::ostream & os) -{ +display_dash_y_options(std::ostream& os) { os << R"( -Y minextra=n for a match, target must have at least N extra atoms -Y maxextra=n for a match, target must have at most N extra atoms @@ -981,8 +955,8 @@ display_dash_y_options(std::ostream & os) } int -mol2qry(int argc, char ** argv) { - Command_Line cl(argc, argv, "aA:S:P:nmvE:i:M:sV:X:F:f:R:btg:heu:ojK:Y:kl:L:IcdD:px:B:"); +mol2qry(int argc, char** argv) { + Command_Line cl(argc, argv, "aA:S:P:nmvE:i:M:sV:X:F:f:R:btg:heu:ojK:Y:kl:L:IcdD:px:B:T:"); verbose = cl.option_count('v'); @@ -990,34 +964,34 @@ mol2qry(int argc, char ** argv) { usage(2); } - if (! process_elements(cl)) { + if (!process_elements(cl)) { usage(3); } - if (! cl.option_present('A')) { + if (!cl.option_present('A')) { set_global_aromaticity_type(Daylight); cerr << "Using Daylight aromaticity by default\n"; - } - else if (! process_standard_aromaticity_options(cl, verbose)) { + } else if (!process_standard_aromaticity_options(cl, verbose)) { usage(4); } if (cl.option_present('g')) { - if (! 
chemical_standardisation.construct_from_command_line(cl, verbose > 1, 'g')) { + if (!chemical_standardisation.construct_from_command_line(cl, verbose > 1, 'g')) { cerr << "Cannot process chemical standardisation options (-g)\n"; usage(32); } } if (cl.option_present('K')) { - if (! process_standard_smiles_options(cl, verbose, 'K')) { + if (!process_standard_smiles_options(cl, verbose, 'K')) { cerr << "Cannot initialise standard smiles options (-K)\n"; return 4; } } - + if (cl.option_present('m') && cl.option_present('R')) { - cerr << "Sorry, the -m and -R options are mutually incompatible, contact LillyMol on github (https://github.com/EliLillyCo/LillyMol)\n"; + cerr << "Sorry, the -m and -R options are mutually incompatible, contact LillyMol on " + "github (https://github.com/EliLillyCo/LillyMol)\n"; return 3; } @@ -1038,10 +1012,10 @@ mol2qry(int argc, char ** argv) { non_ring_atoms_become_nrings_0 = 1; } -// Historical quirk. When I wrote this, the -R option meant regular expression. -// Then in May 2005, I needed to allow both regular expressions and element matches. -// The Element_Matcher object can do that, but for it to process a regular expression, -// the string must start with 'RX=' + // Historical quirk. When I wrote this, the -R option meant regular expression. + // Then in May 2005, I needed to allow both regular expressions and element matches. + // The Element_Matcher object can do that, but for it to process a regular expression, + // the string must start with 'RX=' if (cl.option_present('R')) { const_IWSubstring r = cl.string_value('R'); @@ -1054,7 +1028,7 @@ mol2qry(int argc, char ** argv) { tmp << "RX=" << r; } - if (! rgroup.construct_from_string(tmp)) { + if (!rgroup.construct_from_string(tmp)) { cerr << "Invalid R group matching specification '" << tmp << "'\n"; return 4; } @@ -1075,7 +1049,7 @@ mol2qry(int argc, char ** argv) { } if (cl.option_present('i')) { - if (! 
process_input_type(cl, input_type)) { + if (!process_input_type(cl, input_type)) { cerr << "Cannot parse -i directives\n"; usage(16); } @@ -1099,8 +1073,9 @@ mol2qry(int argc, char ** argv) { if (cl.option_present('e')) { mqs.set_just_atomic_number_and_connectivity(1); - if (verbose) + if (verbose) { cerr << "Queries will contain just atomic number and connectivity info\n"; + } } if (cl.option_present('s') && cl.option_present('w')) { @@ -1109,21 +1084,23 @@ mol2qry(int argc, char ** argv) { } if (cl.option_present('s') || cl.option_present('c') || cl.option_present('t')) { -// mqs.substitutions_only_at().create_from_smarts("[!0*]"); + // mqs.substitutions_only_at().create_from_smarts("[!0*]"); isotopically_labelled_from_slicer = 1; mqs.set_substituents_only_at_isotopic_atoms(1); if (cl.option_present('t')) { mqs.set_must_have_substituent_at_every_isotopic_atom(0); - if (verbose) + if (verbose) { cerr << "Not all isotopically labelled atoms need substituents\n"; + } } if (cl.option_present('c')) { mqs.set_isotope_count_means_extra_connections(1); - if (verbose) + if (verbose) { cerr << "Isotopic number indicates number of extra connections\n"; + } } } else if (cl.option_present('w')) { mqs.set_substituents_only_at_non_isotopic_atoms(1); @@ -1131,7 +1108,7 @@ mol2qry(int argc, char ** argv) { const_IWSubstring smarts; cl.value('u', smarts); - if (! mqs.substitutions_only_at().create_from_smarts(smarts)) { + if (!mqs.substitutions_only_at().create_from_smarts(smarts)) { cerr << "Invalid smarts for substitution point(s) '" << smarts << "'\n"; return 3; } @@ -1139,7 +1116,7 @@ mol2qry(int argc, char ** argv) { if (cl.option_present('x')) { int x; - if (! cl.value('x', x) || x < 1) { + if (!cl.value('x', x) || x < 1) { cerr << "The isotope becomes any atom option (-x) must have a whole +ve number\n"; return 1; } @@ -1153,7 +1130,7 @@ mol2qry(int argc, char ** argv) { int i = 0; const_IWSubstring f; while (cl.value('f', f, i++)) { - if (! 
mqs.set_smarts_for_atom(f)) { + if (!mqs.set_smarts_for_atom(f)) { cerr << "Invalid smarts for atom '" << f << "'\n"; return 1; } @@ -1162,33 +1139,37 @@ mol2qry(int argc, char ** argv) { if (cl.option_present('a')) { mqs.set_only_aromatic_atoms_match_aromatic_atoms(1); - if (verbose) + if (verbose) { cerr << "Only aromatic atoms will match aromatic atoms\n"; + } } if (cl.option_present('d')) { mqs.set_preserve_saturation(1); - if (verbose) + if (verbose) { cerr << "Atom saturation will be preserved\n"; + } } if (cl.option_present('V')) { const_IWSubstring v = cl.string_value('V'); - if (! do_read_environment(v, mqs)) { + if (!do_read_environment(v, mqs)) { cerr << "Cannot read query environment specification from '" << v << "'\n"; return 8; } - if (verbose) + if (verbose) { cerr << "Read query environment specification from '" << v << "'\n"; + } } if (cl.option_present('X')) { const_IWSubstring x = cl.string_value('X'); - if (! do_read_environment_no_match(x, mqs)) { - cerr << "Cannot read query environment rejection specification from '" << x << "'\n"; + if (!do_read_environment_no_match(x, mqs)) { + cerr << "Cannot read query environment rejection specification from '" << x + << "'\n"; return 8; } @@ -1213,23 +1194,27 @@ mol2qry(int argc, char ** argv) { } if (cl.option_present('L')) { - if (! cl.option_present('l')) { - cerr << "When specifying a coordination point (-L) must also specify bond radius (-l)\n"; + if (!cl.option_present('l')) { + cerr << "When specifying a coordination point (-L) must also specify bond radius " + "(-l)\n"; usage(3); } - if (! 
cl.value('l', radius_from_coordination_point) || radius_from_coordination_point < 1) { - cerr << "The radius from coordination point option (-l) must be a whole +ve number\n"; + if (!cl.value('l', radius_from_coordination_point) || + radius_from_coordination_point < 1) { + cerr << "The radius from coordination point option (-l) must be a whole +ve " + "number\n"; usage(3); } if (verbose) { - cerr << "Will include all atoms within " << radius_from_coordination_point << " bonds of coordination point\n"; + cerr << "Will include all atoms within " << radius_from_coordination_point + << " bonds of coordination point\n"; } const const_IWSubstring smt = cl.string_value('L'); - if (! coordination_point.create_from_smarts(smt)) { + if (!coordination_point.create_from_smarts(smt)) { cerr << "Invalid coordination point smarts '" << smt << "'\n"; return 3; } @@ -1249,189 +1234,182 @@ mol2qry(int argc, char ** argv) { } } + if (cl.option_present('T')) { + if (! element_transformations.construct_from_command_line(cl, verbose, 'T')) { + cerr << "Cannot initialise element transformations (-T)\n"; + return 0; + } + } + if (cl.option_present('Y')) { int i = 0; const_IWSubstring y; while (cl.value('Y', y, i++)) { - if (y.starts_with("minextra=")) - { + if (y.starts_with("minextra=")) { y.remove_leading_chars(9); int e; - if (! y.numeric_value(e) || e < 0) - { - cerr << "The min number extra atoms to be matched '-Y minextra=' must be a whole +ve number\n"; + if (!y.numeric_value(e) || e < 0) { + cerr << "The min number extra atoms to be matched '-Y minextra=' must be a " + "whole +ve number\n"; display_dash_y_options(cerr); } mqs.set_min_extra_atoms_in_target(e); - if (verbose) + if (verbose) { cerr << "Matches require at least " << e << " extra atoms\n"; - } - else if (y.starts_with("maxextra=")) - { + } + } else if (y.starts_with("maxextra=")) { y.remove_leading_chars(9); int e; - if (! 
y.numeric_value(e) || e < 0) - { - cerr << "The max number extra atoms to be matched '-Y minextra=' must be a whole +ve number\n"; + if (!y.numeric_value(e) || e < 0) { + cerr << "The max number extra atoms to be matched '-Y minextra=' must be a " + "whole +ve number\n"; display_dash_y_options(cerr); } mqs.set_max_extra_atoms_in_target(e); - if (verbose) + if (verbose) { cerr << "Matches require at most " << e << " extra atoms\n"; - } - else if (y.starts_with("ncon=")) - { + } + } else if (y.starts_with("ncon=")) { y.remove_leading_chars(5); int n; - if (! y.numeric_value(n) || n < 0) - { - cerr << "The number of connections to matched atoms '-Y ncon=' must be a whole +ve number\n"; + if (!y.numeric_value(n) || n < 0) { + cerr << "The number of connections to matched atoms '-Y ncon=' must be a whole " + "+ve number\n"; display_dash_y_options(cerr); } mqs.set_ncon(n); - if (verbose) + if (verbose) { cerr << "Matches can have only " << n << " connections to unmatched atoms\n"; - } - else if (y.starts_with("min_ncon=")) - { + } + } else if (y.starts_with("min_ncon=")) { y.remove_leading_chars(9); int n; - if (! y.numeric_value(n) || n < 0) - { - cerr << "The minimum number of connections to matched atoms '-Y min_ncon=' must be a whole +ve number\n"; + if (!y.numeric_value(n) || n < 0) { + cerr << "The minimum number of connections to matched atoms '-Y min_ncon=' " + "must be a whole +ve number\n"; display_dash_y_options(cerr); } mqs.set_min_ncon(n); - if (verbose) - cerr << "Matches must have at least " << n << " connections to unmatched atoms\n"; - } - else if (y.starts_with("max_ncon=")) - { + if (verbose) { + cerr << "Matches must have at least " << n + << " connections to unmatched atoms\n"; + } + } else if (y.starts_with("max_ncon=")) { y.remove_leading_chars(9); int n; - if (! 
y.numeric_value(n) || n < 0) - { - cerr << "The maximum number of connections to matched atoms '-Y max_ncon=' must be a whole +ve number\n"; + if (!y.numeric_value(n) || n < 0) { + cerr << "The maximum number of connections to matched atoms '-Y max_ncon=' " + "must be a whole +ve number\n"; display_dash_y_options(cerr); } mqs.set_max_ncon(n); - if (verbose) - cerr << "Matches must have at least " << n << " connections to unmatched atoms\n"; - } - else if ("exph" == y) - { + if (verbose) { + cerr << "Matches must have at least " << n + << " connections to unmatched atoms\n"; + } + } else if ("exph" == y) { add_explicit_hydrogens = 1; - if (verbose) + if (verbose) { cerr << "Explicit Hydrogens will be added to the molecules\n"; + } mqs.set_convert_explicit_hydrogens_to_match_any_atom(1); - } - else if ("ablk" == y) - { + } else if ("ablk" == y) { set_aromatic_bonds_lose_kekule_identity(1); - if (verbose) + if (verbose) { cerr << "Aromatic bonds will lose their Kekule identity\n"; - } - else if (y.starts_with("minfm=")) - { + } + } else if (y.starts_with("minfm=")) { y.remove_leading_chars(6); float f; - if (! y.numeric_value(f) || f < 0.0 || f > 1.0) - { - cerr << "The min fraction atoms matched directive (minfm=) must be a valid fraction\n"; + if (!y.numeric_value(f) || f < 0.0 || f > 1.0) { + cerr << "The min fraction atoms matched directive (minfm=) must be a valid " + "fraction\n"; return 2; } mqs.set_min_fraction_atoms_matched(f); - if (verbose) - cerr << "Matches will require a min fraction atom matched of " << f << '\n'; - } - else if (y.starts_with("maxfm=")) - { + if (verbose) { + cerr << "Matches will require a min fraction atom matched of " << f << '\n'; + } + } else if (y.starts_with("maxfm=")) { y.remove_leading_chars(6); float f; - if (! 
y.numeric_value(f) || f < 0.0 || f > 1.0) - { - cerr << "The max fraction atoms matched directive (maxfm=) must be a valid fraction\n"; + if (!y.numeric_value(f) || f < 0.0 || f > 1.0) { + cerr << "The max fraction atoms matched directive (maxfm=) must be a valid " + "fraction\n"; return 2; } mqs.set_max_fraction_atoms_matched(f); - if (verbose) - cerr << "Matches will require a max fraction atom matched of " << f << '\n'; - } - else if (y.starts_with("A2A=")) - { + if (verbose) { + cerr << "Matches will require a max fraction atom matched of " << f << '\n'; + } + } else if (y.starts_with("A2A=")) { y.remove_leading_chars(4); int a; - if (! y.numeric_value(a) || a < 1 || a > 3) - { + if (!y.numeric_value(a) || a < 1 || a > 3) { cerr << "The A2A= qualifier must be an int between 1 and 3\n"; return 0; } mqs.set_convert_all_aromatic_atoms_to_generic_aromatic(a); - if (verbose) + if (verbose) { cerr << "Convert aromatic atoms to generic aromatic directive " << a << '\n'; - } - else if ("rmiso" == y) - { + } + } else if ("rmiso" == y) { remove_isotopes_from_input_molecules = 1; - if (verbose) + if (verbose) { cerr << "Will immediately remove isotopes from molecules being read\n"; - } - else if (y.starts_with("APPC=")) - { + } + } else if (y.starts_with("APPC=")) { append_to_comment = y; append_to_comment.remove_leading_chars(5); - if (verbose) + if (verbose) { cerr << "Will append '" << append_to_comment << "' to each query name\n"; - } - else if ("test" == y) - { + } + } else if ("test" == y) { perform_matching_test = 1; - if (verbose) - cerr << "Will try a match into the originating molecule for each query formed\n"; - } - else if (y == "smtrange") { + if (verbose) { + cerr + << "Will try a match into the originating molecule for each query formed\n"; + } + } else if (y == "smtrange") { write_smarts_relationals_as_rdkit_ranges = 1; if (verbose) { cerr << "Smarts relational specifications written as rdkit ranges\n"; } - } - else if ("help" == y) - { - 
display_dash_y_options (cerr); - } - else - { + } else if ("help" == y) { + display_dash_y_options(cerr); + } else { cerr << "Unrecognised -Y qualifier '" << y << "'\n"; - display_dash_y_options (cerr); + display_dash_y_options(cerr); } } } GeometryConfig geometry_config; if (cl.option_present('D')) { - if (! BuildGeometricConfig(cl, 'D', geometry_config)) { + if (!BuildGeometricConfig(cl, 'D', geometry_config)) { cerr << "Cannot determine geometric constraints specifications (-D)\n"; return 1; } - if (verbose) + if (verbose) { cerr << "Will write geometric constraint query protos\n"; + } } if (cl.option_present('B')) { @@ -1440,7 +1418,7 @@ mol2qry(int argc, char ** argv) { } Mol2QryOutput mol2qry_output; - if (! mol2qry_output.Initialise(cl)) { + if (!mol2qry_output.Initialise(cl)) { cerr << "Cannot initialise output\n"; usage(1); } @@ -1463,17 +1441,16 @@ mol2qry(int argc, char ** argv) { cl.value('M', smiles); rc = process_smiles_from_command_line(smiles, mqs, geometry_config, mol2qry_output); - } - else if (cl.empty()) { + } else if (cl.empty()) { usage(1); - } else if (input_type == FILE_TYPE_INVALID && ! all_files_recognised_by_suffix(cl)) { + } else if (input_type == FILE_TYPE_INVALID && !all_files_recognised_by_suffix(cl)) { cerr << "Cannot discern input type(s) of command line files\n"; return 8; } - if (! cl.option_present('M')) { + if (!cl.option_present('M')) { for (int i = 0; i < cl.number_elements(); i++) { - if (! 
mol2qry(cl[i], input_type, mqs, geometry_config, mol2qry_output)) { + if (!mol2qry(cl[i], input_type, mqs, geometry_config, mol2qry_output)) { rc = i + 1; break; } @@ -1489,10 +1466,8 @@ mol2qry(int argc, char ** argv) { } // namespace mol2qry - int -main(int argc, char ** argv) -{ +main(int argc, char** argv) { mol2qry::prog_name = argv[0]; int rc = mol2qry::mol2qry(argc, argv); diff --git a/src/Molecule_Tools/molecule_filter.cc b/src/Molecule_Tools/molecule_filter.cc index eb77ac98..435c6c8c 100644 --- a/src/Molecule_Tools/molecule_filter.cc +++ b/src/Molecule_Tools/molecule_filter.cc @@ -28,7 +28,6 @@ namespace molecule_filter { using std::cerr; -// By convention the Usage function tells how to use the tool. void Usage(int rc) { // clang-format off @@ -39,10 +38,13 @@ Usage(int rc) { #endif // clang-format on // clang-format off - cerr << R"( - -F textproto describing constraints on filter. + cerr << R"(Filters molecules based on easy to compute molecular properties. +Designed for rapidly triaging large collections of molecules in the most +computationally efficient way possible. + -F textproto describing constraints on filter - required. 
-c remove chirality - -B write rejected moleculed to + -l reduce to largest fragment + -B write rejected molecules to -v verbose output )"; // clang-format on @@ -67,8 +69,8 @@ class Options { Chemical_Standardisation _chemical_standardisation; - int _molecules_read = 0; - int _molecules_passed = 0; + uint64_t _molecules_read = 0; + uint64_t _molecules_passed = 0; MoleculeFilterData::Requirements _requirements; @@ -77,38 +79,43 @@ class Options { IWString_and_File_Descriptor _reject_stream; // Values accumulated based on rejections - int _too_few_atoms = 0; - int _too_many_atoms = 0; - int _too_few_rings = 0; - int _too_many_rings = 0; - int _too_few_heteroatoms = 0; - int _min_heteroatom_fraction = 0; - int _max_heteroatom_fraction = 0; - int _too_few_aromatic_rings = 0; - int _too_many_aromatic_rings = 0; - int _too_few_aliphatic_rings = 0; - int _too_many_aliphatic_rings = 0; - int _ring_system_too_large = 0; - int _too_many_aromatic_rings_in_system = 0; - int _ring_too_large = 0; - int _non_organic = 0; - int _isotope = 0; - int _too_few_rotbond = 0; - int _too_many_rotbond = 0; - int _low_tpsa = 0; - int _high_tpsa = 0; - int _low_xlogp = 0; - int _high_xlogp = 0; - int _low_alogp = 0; - int _high_alogp = 0; - int _too_few_hba = 0; - int _too_many_hba = 0; - int _too_few_hbd = 0; - int _too_many_hbd = 0; - int _too_many_halogens = 0; - int _too_long = 0; - int _too_few_csp3 = 0; - int _aromdens_too_high = 0; + uint64_t _too_few_atoms = 0; + uint64_t _too_many_atoms = 0; + uint64_t _too_few_rings = 0; + uint64_t _too_many_rings = 0; + uint64_t _too_few_heteroatoms = 0; + uint64_t _min_heteroatom_fraction = 0; + uint64_t _max_heteroatom_fraction = 0; + uint64_t _too_few_aromatic_rings = 0; + uint64_t _too_many_aromatic_rings = 0; + uint64_t _too_few_aliphatic_rings = 0; + uint64_t _too_many_aliphatic_rings = 0; + uint64_t _ring_system_too_large = 0; + uint64_t _too_many_aromatic_rings_in_system = 0; + uint64_t _ring_too_large = 0; + uint64_t _non_organic = 0; + 
uint64_t _isotope = 0; + uint64_t _too_few_rotbond = 0; + uint64_t _too_many_rotbond = 0; + uint64_t _low_tpsa = 0; + uint64_t _high_tpsa = 0; + uint64_t _low_xlogp = 0; + uint64_t _high_xlogp = 0; + uint64_t _low_alogp = 0; + uint64_t _high_alogp = 0; + uint64_t _too_few_hba = 0; + uint64_t _too_many_hba = 0; + uint64_t _too_few_hbd = 0; + uint64_t _too_many_hbd = 0; + uint64_t _too_many_halogens = 0; + uint64_t _too_long = 0; + uint64_t _too_few_csp3 = 0; + uint64_t _aromdens_too_high = 0; + uint64_t _too_many_chiral = 0; + uint64_t _too_many_fragments = 0; + + uint64_t _matches_exclusion_smarts = 0; + uint64_t _no_match_required_smarts = 0; // Private functions int Process(Molecule& m); @@ -191,6 +198,24 @@ Options::Initialise(Command_Line& cl) { _requirements = std::move(*maybe_proto); } + if (_requirements.required_smarts_size() || + _requirements.must_not_have_smarts_size()) { + cerr << "Options::Initialise:smarts not implemented\n"; + return 0; + } + + if (_remove_chirality && _requirements.has_max_chiral()) { + cerr << "Options::Initialise:removing chirality has been specified (-c)\n"; + cerr << "But the config file contains 'max_chiral'. Impossible\n"; + return 0; + } + + if (_requirements.has_max_distance() && ! 
_reduce_to_largest_fragment) { + cerr << "Options::Initialise:max distance specified, but largest fragment not selected (-l)\n"; + cerr << "Automatically enabling largest fragment selection\n"; + _reduce_to_largest_fragment = 1; + } + if (cl.option_present('B')) { IWString fname = cl.string_value('B'); fname.EnsureEndsWith(".smi"); @@ -322,25 +347,13 @@ Options::Report(std::ostream& output) const { if (_requirements.has_max_aromatic_density()) { output << _aromdens_too_high << " aromatic density too high " << _requirements.max_aromatic_density() << '\n'; } - return 1; -} - -int -Options::Preprocess(Molecule& m) { - if (m.empty()) { - return 0; - } - if (_reduce_to_largest_fragment) { - m.reduce_to_largest_fragment_carefully(); + if (_requirements.has_max_chiral()) { + output << _too_many_chiral << " too many chiral centres " << _requirements.max_chiral() << '\n'; } - if (_remove_chirality) { - m.remove_all_chiral_centres(); - } - - if (_chemical_standardisation.active()) { - _chemical_standardisation.process(m); + if (_requirements.has_max_number_fragments()) { + output << _too_many_fragments << " too many fragments " << _requirements.max_number_fragments() << '\n'; } return 1; @@ -401,192 +414,6 @@ LargestFragment(const const_IWSubstring& smiles, return true; } -#ifdef NOW_IN_LIBRARY -std::tuple -MaxRingSystemSize(Molecule& m, std::unique_ptr& tmp) { - const int matoms = m.natoms(); - - m.compute_aromaticity_if_needed(); - - if (! tmp) { - tmp.reset(new int[matoms]); - } - std::fill_n(tmp.get(), matoms, 0); - - const int nrings = m.nrings(); - - std::unique_ptr ring_already_done = std::make_unique(nrings); - std::fill_n(ring_already_done.get(), nrings, 0); - - int max_system_size = 0; - int max_aromatic_rings_in_system = 0; - for (int i = 0; i < nrings; ++i) { - if (ring_already_done[i]) { - continue; - } - const Ring* ri = m.ringi(i); - if (! 
ri->is_fused()) { - continue; - } - - int system_size = 1; - int aromatic_rings_in_system; - if (ri->is_aromatic()) { - aromatic_rings_in_system = 1; - } else { - aromatic_rings_in_system = 0; - } - - - for (int j = i + 1; j < nrings; ++j) { - if (ring_already_done[j]) { - continue; - } - - ring_already_done[j] = 1; - const Ring* rj = m.ringi(j); - if (ri->fused_system_identifier() == rj->fused_system_identifier()) { - ++system_size; - if (rj->is_aromatic()) { - ++aromatic_rings_in_system; - } - } - } - if (system_size > max_system_size) { - max_system_size = system_size; - } - if (aromatic_rings_in_system > max_aromatic_rings_in_system) { - max_aromatic_rings_in_system = aromatic_rings_in_system; - } - } - - return std::make_tuple(max_system_size, max_aromatic_rings_in_system); -} - -// Lifted from iwdescr.cc -void -RuleOfFive(Molecule & m, int& acceptor, int& donor) { - acceptor = 0; - donor = 0; - - const int matoms = m.natoms(); - - for (int i = 0; i < matoms; i++) { - atomic_number_t z = m.atomic_number(i); - // Intercept the most common case. 
- if (z == 6) { - continue; - } - - if (z == 7 || z == 8) { - } else { - continue; - } - - ++acceptor; - - const int h = m.hcount(i); - - // acceptor - if (0 == h) { - continue; - } - - if (7 == z && h > 1) { - donor += 2; - } else { - donor += 1; - } - } -} - - -int -HalogenCount(const Molecule& m) { - static std::vector halogen = { - 0, // 0 - 0, // 1 - 0, // 2 - 0, // 3 - 0, // 4 - 0, // 5 - 0, // 6 - 0, // 7 - 0, // 8 - 1, // 9 - 0, // 10 - 0, // 11 - 0, // 12 - 0, // 13 - 0, // 14 - 0, // 15 - 0, // 16 - 1, // 17 - 0, // 18 - 0, // 19 - 0, // 20 - 0, // 21 - 0, // 22 - 0, // 23 - 0, // 24 - 0, // 25 - 0, // 26 - 0, // 27 - 0, // 28 - 0, // 29 - 0, // 30 - 0, // 31 - 0, // 32 - 0, // 33 - 0, // 34 - 0, // 35 - 0, // 36 - 1, // 37 - 0, // 38 - 0, // 39 - 0, // 40 - 0, // 41 - 0, // 42 - 0, // 43 - 0, // 44 - 0, // 45 - 0, // 46 - 0, // 47 - 0, // 48 - 0, // 49 - 0, // 50 - 0, // 51 - 0, // 52 - 1 // 53 - }; - - int rc = 0; - - for (const Atom* a : m) { - const uint32_t z = a->atomic_number(); - if (z < halogen.size()) { - rc += halogen[z]; - } - } - - return rc; -} - -int -Sp3Carbon(Molecule & m) { - int rc = 0; - - const int matoms = m.natoms(); - for (int i = 0; i < matoms; ++i) { - if (m.saturated(i)) { - ++rc; - } - } - - return rc; -} -#endif // NOW_IN_LIBRARY - // If chemical standardisation is in effect int Options::Process(const const_IWSubstring& line, @@ -598,6 +425,14 @@ Options::Process(const const_IWSubstring& line, bool smiles_changed = false; + if (_requirements.has_max_number_fragments()) { + const int nfrag = smiles.ccount('.') + 1; + if (nfrag > _requirements.max_number_fragments()) { + ++_too_many_fragments; + return 0; + } + } + const_IWSubstring largest_frag; int matoms = 0; int nrings = 0; @@ -644,6 +479,22 @@ Options::Process(const const_IWSubstring& line, return 0; } + if (m.empty()) { + cerr << "MoleculeFilterLine:no atoms '" << line << "'\n"; + return 0; + } + + if (_requirements.has_max_chiral() && + m.chiral_centres() > 
_requirements.max_chiral()) { + ++_too_many_chiral; + return 0; + } + + if (_remove_chirality) { + m.remove_all_chiral_centres(); + smiles_changed = true; + } + if (_chemical_standardisation.active()) { if (_chemical_standardisation.process(m)) { smiles_changed = true; @@ -727,6 +578,12 @@ Options::Process(Molecule& m, return 0; } + if (_requirements.has_max_chiral() && + m.chiral_centres() > _requirements.max_chiral()) { + ++_too_many_chiral; + return 0; + } + int arc = 0; int need_to_compute_aromatic_rings = 0; if (_requirements.has_min_aromatic_ring_count() || @@ -972,7 +829,7 @@ MoleculeFilter(int argc, char** argv) { Options options; if (! options.Initialise(cl)) { cerr << "Cannot initialise options\n"; - return 1; + Usage(1); } if (cl.empty()) { diff --git a/src/Molecule_Tools/molecule_filter.proto b/src/Molecule_Tools/molecule_filter.proto index b265078f..092f3c5c 100644 --- a/src/Molecule_Tools/molecule_filter.proto +++ b/src/Molecule_Tools/molecule_filter.proto @@ -45,6 +45,10 @@ message Requirements { // Note that this does NOT span spiro fusions. optional int32 largest_ring_size = 26; + // These are not implemented. Likely more efficient to + // run this tool and pipe the results to tsubstructure. + // If there were a small number of queries, this might make + // sense. For now, not implemented. 
repeated string required_smarts = 27; repeated string must_not_have_smarts = 28; @@ -61,5 +65,8 @@ message Requirements { optional float max_aromatic_density = 34; + optional int32 max_chiral = 35; + + optional int32 max_number_fragments = 36; } diff --git a/src/Molecule_Tools/pharmacophore_2d.cc b/src/Molecule_Tools/pharmacophore_2d.cc index f2b8f5b3..48046903 100644 --- a/src/Molecule_Tools/pharmacophore_2d.cc +++ b/src/Molecule_Tools/pharmacophore_2d.cc @@ -1,74 +1,584 @@ // Generate pharmacophore-like queries for 2D molecules #include +#include #include +#include -#include "google/protobuf/text_format.h" #include "google/protobuf/io/zero_copy_stream.h" #include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" #include "Foundational/accumulator/accumulator.h" #include "Foundational/cmdline/cmdline.h" +#include "Foundational/data_source/tfdatarecord.h" #include "Foundational/iwmisc/misc.h" +#include "Foundational/iwmisc/proto_support.h" #define RESIZABLE_ARRAY_IWQSORT_IMPLEMENTATION #include "Foundational/iwqsort/iwqsort.h" #include "Molecule_Lib/istream_and_type.h" #include "Molecule_Lib/molecule.h" +#include "Molecule_Lib/path.h" +#include "Molecule_Lib/rotbond_common.h" +#include "Molecule_Lib/smiles.h" #include "Molecule_Lib/substructure.h" #include "Molecule_Lib/substructure.pb.h" #include "Molecule_Lib/target.h" -namespace pharacaphore_2d { +#include "Molecule_Tools/pharmacophore_2d.pb.h" + +namespace pharmacophore_2d { + using std::cerr; -using std::cout; -int verbose = 0; +void +Usage(int rc) { +// clang-format off +#if defined(GIT_HASH) && defined(TODAY) + cerr << __FILE__ << " compiled " << TODAY << " git hash " << GIT_HASH << '\n'; +#else + cerr << __FILE__ << " compiled " << __DATE__ << " " << __TIME__ << '\n'; +#endif + // clang-format on + // clang-format off + cerr << R"(Generates query files based on 2D pharmacophore features. 
+Queries define pharmacophore features and query files are generated that describe the topological relationships +between the pharmacophoric features in the starting molecules. One query per starting molecule. +Parameters should most be specified via the -C option, but some are also available via command line options. + -C pharmacophore2d::Pharmacophore2DConfig configuration textproto + -S file name stem for generated query files - multiple textproto files are generated. + -G write names of query files generated to - use with -q PROTOFILE: + -t write serialized SubstructureSearch::SubstructureQuery protos to -S value. One file generated -q TFPROTO: + + -s define functional groups via smarts - or use the functional_group attribute in the -C file. + -q define functional groups via query files - or use the functional_group attribute in the -C file. + -n ncon values become min_ncon values in the query\n"; + -d min separation between atoms\n"; + -D max separation between atoms\n"; + -Y ... other options, enter '-Y help' for info\n"; + -v verbose output\n"; +)"; + // clang-format on + + ::exit(rc); +} -int molecules_read = 0; -int no_functional_groups = 0; -int only_one_functional_group = 0; +// The atomic properties that can be transferred to the query. 
+// We OR these into an uint32_t +static constexpr uint32_t kPropAtomicNumber = 1; +static constexpr uint32_t kPropRingBondCount = 2; +static constexpr uint32_t kPropAromatic = 4; +static constexpr uint32_t kPropNcon = 8; +static constexpr uint32_t kPropHasPiElectron = 16; +static constexpr uint32_t kPropPiElectronCount = 32; +static constexpr uint32_t kPropUnsaturation = 64; +static constexpr uint32_t kPropIsotope = 128; +static constexpr uint32_t kPropRingSize = 256; +static constexpr uint32_t kPropSpinach = 512; +static constexpr uint32_t kPropFusedSystemSize = 1024; +static constexpr uint32_t kPropHcount = 2048; -resizable_array_p atoms_to_ignore; +class PerMoleculeData { + private: + Molecule& _m; -Accumulator_Int functional_group_count; -Accumulator fraction_atoms_in_functional_groups; + const int _matoms; -// When generating distance constraints, ignore atoms outside the -// range specified here. -int min_separation = 1; -int max_separation = std::numeric_limits::max(); + int* _rotbond_between; -// When two atoms are separated by `d` bonds, the min_bonds_between -// will be `d - delta_shorter` (if positive) and the max_bonds_between -// will be `d + delta_longer`. 
-int delta_shorter = 0; -int delta_longer = 0; + int* _functional_group; -int ncon_becomes_min_ncon = 0; + quick_rotbond::QuickRotatableBonds _rotbond; -IWString_and_File_Descriptor stream_for_labelled_smiles; + public: + PerMoleculeData(Molecule& m); + ~PerMoleculeData(); -void -Usage(int rc) { - cerr << " -n ncon values become min_ncon values in the query\n"; - cerr << " -d min separation between atoms\n"; - cerr << " -D max separation between atoms\n"; - cerr << " -L write labelled molecule to \n"; - cerr << " -v verbose output\n"; - exit(rc); + int* + functional_group() { + return _functional_group; + } + + int functional_group(atom_number_t a) const { + return _functional_group[a]; + } + + int& functional_group(atom_number_t a) { + return _functional_group[a]; + } + + int rotbond_between(atom_number_t a1, atom_number_t a2) const { + return _rotbond_between[a1 * _matoms + a2]; + } +}; + +PerMoleculeData::PerMoleculeData(Molecule& m) : _m(m), _matoms(m.natoms()) { + _rotbond.set_calculation_type(quick_rotbond::QuickRotatableBonds::RotBond::kExpensive); + + _rotbond_between = _rotbond.RotatableBondsBetween(m).release(); + + _functional_group = new_int(_matoms, -1); +} + +PerMoleculeData::~PerMoleculeData() { + delete[] _rotbond_between; +} + +class Output { + private: + int _verbose; + + int _nfiles; + + IWString _stem; + + std::vector _files_generated; + + std::unique_ptr _tfdata; + + public: + Output(); + + int Initialise(Command_Line& cl); + + int Write(const SubstructureSearch::SubstructureQuery& qry); + + int nfiles() const { + return _nfiles; + } + + int WriteFilesGenerated(IWString& fname) const; + int WriteFilesGenerated(IWString_and_File_Descriptor& fname) const; +}; + +Output::Output() { + _verbose = 0; + + _nfiles = 0; + + _files_generated.reserve(10000); +} + +int +Output::Initialise(Command_Line& cl) { + _verbose = cl.option_present('v'); + + if (!cl.option_present('S')) { + cerr << "Output::Initialise:must specify output file name stem with the -S 
option\n"; + Usage(0); + } + + cl.value('S', _stem); + + if (cl.option_present('t')) { + _stem.EnsureEndsWith(".dat"); + _tfdata = std::make_unique(); + if (!_tfdata->Open(_stem)) { + cerr << "Output::Initialise:cannot open TFDataRecord file '" << _stem << "'\n"; + return 0; + } + + if (_verbose) { + cerr << "TFDataRecord serialized protos written to '" << _stem << "'\n"; + } + + return 1; + } + + if (_verbose) { + cerr << "Files written with stem '" << _stem << "'\n"; + } + + return 1; +} + +int +Output::Write(const SubstructureSearch::SubstructureQuery& qry) { + if (_tfdata) { + return _tfdata->WriteSerializedProto(qry); + } + + ++_nfiles; + IWString fname; + fname << _stem << _nfiles << ".textproto"; + + _files_generated.push_back(fname); + + return iwmisc::WriteTextProto(qry, fname); +} + +int +Output::WriteFilesGenerated(IWString& fname) const { + IWString_and_File_Descriptor output; + if (!output.open(fname.null_terminated_chars())) { + cerr << "Output::WriteFilesGenerated:cannot open '" << fname << "'\n"; + return 0; + } + + return WriteFilesGenerated(output); +} + +int +Output::WriteFilesGenerated(IWString_and_File_Descriptor& output) const { + if (_verbose) { + cerr << "writing " << _files_generated.size() << " generated query file names\n"; + } + + for (const IWString& fname : _files_generated) { + output << fname << '\n'; + output.write_if_buffer_holds_more_than(4096); + } + + return output.good(); +} + +class Options { + private: + int _verbose; + + int _molecules_read; + int _no_functional_groups; + int _only_one_functional_group; + + resizable_array_p _atoms_to_ignore; + + // We can specify certain arrangements of atoms that will be placed + // in functional groups, which might not necessarily be discovered + // otherwise. Acids are an example. + resizable_array_p _external_functional_groups; + + pharmacophore2d::Pharmacophore2DConfig _proto; + + // Statistics on how the functional groups match. 
+ extending_resizable_array _functional_group_count; + Accumulator _fraction_atoms_in_functional_groups; + + // When generating distance constraints, ignore atoms outside the + // range specified here. + int _min_separation = 1; + int _max_separation = std::numeric_limits::max(); + + // When two atoms are separated by `d` bonds, the min_bonds_between + // will be `d - delta_shorter` (if positive) and the max_bonds_between + // will be `d + delta_longer`. + int _delta_shorter = 0; + int _delta_longer = 0; + + int _ncon_becomes_min_ncon = 0; + + int _ring_bond_count_becomes_min_ring_bond_count = 0; + + IWString_and_File_Descriptor _stream_for_labelled_smiles; + + // If writing labelled smiles, we can write as either atom map numbers, default + // or as isotopic labels. This variable controls that behaviour. + int _label_with_isotopes = 0; + + // The atomic properties that will be transferred to the query atom. + // These are or'd values of the various kProp* variables. + uint32_t _atomic_properties; + + quick_rotbond::QuickRotatableBonds _rotbond; + + int _reduce_to_largest_fragment; + + // Private functions. 
+ void DisplayDashYOptions(std::ostream& output); + + int Pharmacophore2d(Molecule& m, PerMoleculeData& pmd, int number_functional_groups, + IWString& fname); + std::optional Pharmacophore2d( + Molecule& m, PerMoleculeData& pmd, int number_functional_groups); + + int IdentifyAtomsToIgnore(Molecule& m, int* ignore_atom); + std::tuple IdentifyFunctionalGroups(Molecule& m, int* fg, const int* ignore); + int WriteLabelledSmiles(const Molecule& m, const int* functional_group); + void IdentifyExternallySpecified(Molecule& m, int* fg, const int* ignore, + int& atoms_in_functional_groups, int& group_number); + int BuildQuery(Molecule& m, const int* functional_group, int group_number, + const resizable_array& atom_order_in_smiles, + SubstructureSearch::SingleSubstructureQuery& query); + void CopyAttributes(Molecule& m, atom_number_t zatom, + SubstructureSearch::SubstructureAtom& query_atom) const; + int AddDistanceConstraints(Molecule& m, PerMoleculeData& pmd, + SubstructureSearch::SingleSubstructureQuery& query); + int AddRotbond(Molecule& m, const PerMoleculeData& pmd, atom_number_t a1, + atom_number_t a2, SubstructureSearch::SeparatedAtoms& separated_atoms); + + public: + Options(); + + int Initialise(Command_Line& cl); + + void Preprocess(Molecule& m); + + int Process(Molecule& m, IWString& output_fname); + int Process(Molecule& m, IWString_and_File_Descriptor& output); + std::optional Process(Molecule& m); + + int verbose() const { + return _verbose; + } + + int molecules_read() const { + return _molecules_read; + } + + int Report(std::ostream& output) const; +}; + +Options::Options() { + _verbose = 0; + + _molecules_read = 0; + _no_functional_groups = 0; + _only_one_functional_group = 0; + + _min_separation = 1; + _max_separation = std::numeric_limits::max(); + + _delta_shorter = 0; + _delta_longer = 0; + + _ncon_becomes_min_ncon = 0; + + _ring_bond_count_becomes_min_ring_bond_count = 0; + + _label_with_isotopes = 0; + + _atomic_properties = (kPropAtomicNumber | 
kPropAromatic); + + _rotbond.set_calculation_type(quick_rotbond::QuickRotatableBonds::RotBond::kExpensive); + + _reduce_to_largest_fragment = 0; } void -Preprocess(Molecule& m) { +Options::DisplayDashYOptions(std::ostream& output) { + output << " -Y iso label functional groups in the -L file as isotopes " + "(default is atom map)\n"; + output << " -Y fname= file name to which labelled smiles are written\n"; + + ::exit(0); +} + +int +Options::Initialise(Command_Line& cl) { + _verbose = cl.option_present('v'); + + if (cl.option_present('l')) { + _reduce_to_largest_fragment = 1; + if (_verbose) { + cerr << "Will reduce to the largest fragment\n"; + } + } + + if (cl.option_present('C')) { + IWString fname = cl.string_value('C'); + std::optional p = + iwmisc::ReadTextProto(fname); + + if (!p) { + cerr << "Cannot read textproto config file '" << fname << "'\n"; + return 0; + } + + _proto = *p; + if (_verbose) { + cerr << "Read proto config from '" << fname << "'\n"; + } + + if (_proto.has_min_separation()) { + _min_separation = _proto.min_separation(); + } + if (_proto.has_max_separation()) { + _max_separation = _proto.max_separation(); + } + if (_proto.has_ncon_becomes_min_ncon()) { + _ncon_becomes_min_ncon = _proto.ncon_becomes_min_ncon(); + } + if (_proto.has_ring_bond_count_becomes_min_ring_bond_count()) { + _ring_bond_count_becomes_min_ring_bond_count = + _proto.ring_bond_count_becomes_min_ring_bond_count(); + } + + if (_proto.has_delta_shorter()) { + _delta_shorter = _proto.delta_shorter(); + } + if (_proto.has_delta_longer()) { + _delta_longer = _proto.delta_longer(); + } + + for (const std::string& q : _proto.functional_group()) { + IWString tmp(q); + if (!process_cmdline_token('*', tmp, _external_functional_groups, _verbose)) { + cerr << "Invalid functional group specification '" << q << "'\n"; + return 0; + } + } + + for (const std::string& q : _proto.atoms_to_ignore()) { + IWString tmp(q); + if (!process_cmdline_token('*', tmp, _atoms_to_ignore, _verbose)) { + 
cerr << "Invalid atoms to ignore specification '" << q << "'\n"; + return 0; + } + } + + if (_proto.atomic_property_size() > 0) { + _atomic_properties = 0; + for (auto ap : _proto.atomic_property()) { + switch (ap) { + case pharmacophore2d::ATOMIC_NUMBER: + _atomic_properties |= kPropAtomicNumber; + break; + case pharmacophore2d::NCON: + _atomic_properties |= kPropNcon; + break; + case pharmacophore2d::AP_AROMATIC: + _atomic_properties |= kPropAromatic; + break; + case pharmacophore2d::RING_BOND_COUNT: + _atomic_properties |= kPropRingBondCount; + break; + case pharmacophore2d::HAS_PI_ELECTRON: + _atomic_properties |= kPropHasPiElectron; + break; + case pharmacophore2d::PI_ELECTRON_COUNT: + _atomic_properties |= kPropPiElectronCount; + break; + case pharmacophore2d::UNSATURATION: + _atomic_properties |= kPropUnsaturation; + break; + case pharmacophore2d::ISOTOPE: + _atomic_properties |= kPropIsotope; + break; + case pharmacophore2d::RING_SIZE: + _atomic_properties |= kPropRingSize; + break; + case pharmacophore2d::SPINACH: + _atomic_properties |= kPropSpinach; + break; + case pharmacophore2d::FUSED_SYSTEM_SIZE: + _atomic_properties |= kPropFusedSystemSize; + break; + case pharmacophore2d::HCOUNT: + _atomic_properties |= kPropHcount; + break; + } + } + } + } + + if (cl.option_present('s')) { + const_IWSubstring smt; + for (int i = 0; cl.value('s', smt, i); ++i) { + std::unique_ptr q = std::make_unique(); + if (!q->create_from_smarts(smt)) { + cerr << "Invalid functional group smarts '" << smt << "'\n"; + return 1; + } + + _external_functional_groups << q.release(); + } + } + + if (cl.option_present('q')) { + if (!process_queries(cl, _external_functional_groups, _verbose, 'q')) { + cerr << "Cannot read external queries (-q)\n"; + return 0; + } + } + + if (_verbose && _external_functional_groups.size() > 0) { + cerr << "Defined " << _external_functional_groups.size() + << " externally specified functional groups\n"; + } + + if (cl.option_present('d')) { + if 
(!cl.value('d', _min_separation) || _min_separation < 1) { + cerr << "Invalid minimum separation (-d)\n"; + Usage(1); + } + if (_verbose) { + cerr << "Will ignore atoms separated by less than " << _min_separation + << " bonds\n"; + } + } + + if (cl.option_present('D')) { + if (!cl.value('D', _max_separation) || _max_separation < _min_separation) { + cerr << "Invalid maximum separation (-D)\n"; + Usage(1); + } + if (_verbose) { + cerr << "Will ignore atoms separated by more than " << _max_separation + << " bonds\n"; + } + } + + if (cl.option_present('n')) { + _ncon_becomes_min_ncon = 1; + if (_verbose) { + cerr << "Ncon values become min_ncon query attributes\n"; + } + } + + if (cl.option_present('Y')) { + IWString fname; + const_IWSubstring y; + for (int i = 0; cl.value('Y', y, i); ++i) { + if (y == "iso") { + _label_with_isotopes = 1; + if (_verbose) { + cerr << "Will write functional group labels as isotopes\n"; + } + } else if (y.starts_with("fname=")) { + y.remove_leading_chars(6); + fname = y; + } else if (y == "help") { + DisplayDashYOptions(cerr); + } else { + cerr << "Unrecognised -Y qualifier '" << y << "'\n"; + DisplayDashYOptions(cerr); + } + } + + if (fname.empty()) { + cerr << "If writing labelled smiles must specify file name\n"; + return 0; + } + + fname.EnsureEndsWith(".smi"); + if (!_stream_for_labelled_smiles.open(fname.null_terminated_chars())) { + cerr << "Cannot open stream for labelled smiles '" << fname << "'\n"; + return 0; + } + + if (_verbose) { + cerr << "Labelled molecules written to '" << fname << "'\n"; + } + } + + return 1; } void -AddDistanceConstraints(Molecule& m, - const int * functional_group, - SubstructureSearch::SingleSubstructureQuery& query) { +Options::Preprocess(Molecule& m) { + if (_reduce_to_largest_fragment) { + m.reduce_to_largest_fragment_carefully(); + } +} +int +Options::AddDistanceConstraints(Molecule& m, PerMoleculeData& pmd, + SubstructureSearch::SingleSubstructureQuery& query) { const int matoms = m.natoms(); + 
+ const int* functional_group = pmd.functional_group(); + + int rc = 0; for (int i = 0; i < matoms; ++i) { if (functional_group[i] < 0) { continue; @@ -82,31 +592,69 @@ AddDistanceConstraints(Molecule& m, } // Atoms in two different functional groups. const int d = m.bonds_between(i, j); - if (d < min_separation || d > max_separation) { + if (d < _min_separation || d > _max_separation) { continue; } + + ++rc; + SubstructureSearch::SeparatedAtoms* separated_atoms = query.add_separated_atoms(); separated_atoms->set_a1(i); separated_atoms->set_a2(j); - if (delta_longer == 0 && delta_shorter == 0) { + if (_delta_longer == 0 && _delta_shorter == 0) { separated_atoms->add_bonds_between(d); } else { - if (d - delta_shorter > 0) { - separated_atoms->set_min_bonds_between(d - delta_shorter); + if (_delta_shorter > 0 && d - _delta_shorter > 0) { + separated_atoms->set_min_bonds_between(d - _delta_shorter); + } + if (_delta_longer > 0) { + separated_atoms->set_max_bonds_between(d + _delta_longer); } - separated_atoms->set_max_bonds_between(d + delta_longer); + } + + if (_proto.preserve_rotbond() || _proto.has_extra_rotbond() || + _proto.has_less_rotbond()) { + AddRotbond(m, pmd, i, j, *separated_atoms); } } } + + return rc; } -void -AddDistanceConstraints(Molecule& m, - const int * functional_group, - int g1, - int g2, - SubstructureSearch::SingleSubstructureQuery& query) { +int +Options::AddRotbond(Molecule& m, const PerMoleculeData& pmd, atom_number_t a1, + atom_number_t a2, + SubstructureSearch::SeparatedAtoms& separated_atoms) { + int rb = pmd.rotbond_between(a1, a2); + + // If we set a min or max, we do not need to set a value. 
+ int add_rotbond = 1; + + if (_proto.has_extra_rotbond()) { + separated_atoms.set_max_rotbond(rb + _proto.extra_rotbond()); + add_rotbond = 0; + } + + if (_proto.has_less_rotbond()) { + int r = rb - _proto.less_rotbond(); + if (r > 0) { + separated_atoms.set_min_rotbond(r); + } + add_rotbond = 0; + } + + if (add_rotbond) { + separated_atoms.add_rotbond(rb); + } + return 1; +} + +#ifdef NO_LONGER_USED_JJJ +void +Options::AddDistanceConstraints(Molecule& m, const int* functional_group, int g1, int g2, + SubstructureSearch::SingleSubstructureQuery& query) { const int matoms = m.natoms(); for (int i = 0; i < matoms; ++i) { for (int j = 0; j < matoms; ++j) { @@ -117,57 +665,124 @@ AddDistanceConstraints(Molecule& m, continue; } const int d = m.bonds_between(i, j); - if (d < min_separation || d > max_separation) { + if (d < _min_separation || d > _max_separation) { continue; } + SubstructureSearch::SeparatedAtoms* separated_atoms = query.add_separated_atoms(); separated_atoms->set_a1(i); separated_atoms->set_a2(j); - if (delta_longer == 0 && delta_shorter == 0) { + if (_delta_longer == 0 && _delta_shorter == 0) { separated_atoms->add_bonds_between(d); } else { - if (d - delta_shorter > 0) { - separated_atoms->set_min_bonds_between(d - delta_shorter); + if (d - _delta_shorter > 0) { + separated_atoms->set_min_bonds_between(d - _delta_shorter); } - separated_atoms->set_max_bonds_between(d + delta_longer); + separated_atoms->set_max_bonds_between(d + _delta_longer); } } } } +#endif + +int +SpinachAttribute(Molecule& m, atom_number_t zatom) { + const int matoms = m.natoms(); + std::unique_ptr spinach = std::make_unique(matoms); + + m.identify_spinach(spinach.get()); + + if (spinach[zatom]) { + return 1; + } + + return 0; +} void -CopyAttributes(Molecule& m, - atom_number_t zatom, - SubstructureSearch::SubstructureAtom& query_atom) { +Options::CopyAttributes(Molecule& m, atom_number_t zatom, + SubstructureSearch::SubstructureAtom& query_atom) const { 
SubstructureSearch::SubstructureAtomSpecifier* qas = query_atom.add_atom_properties(); const Atom& atom = m.atom(zatom); - qas->add_atomic_number(atom.atomic_number()); + if (_atomic_properties & kPropAtomicNumber) { + qas->add_atomic_number(atom.atomic_number()); + } - if (ncon_becomes_min_ncon) { + if ((_atomic_properties & kPropNcon) == 0) { + } else if (_ncon_becomes_min_ncon) { qas->set_min_ncon(atom.ncon()); } else { qas->add_ncon(atom.ncon()); - qas->add_nbonds(m.nbonds(zatom)); } - // An open question as to whether ring membership should be allowed to change. - if (m.is_aromatic(zatom)) { - qas->set_aromatic(true); - } else { - qas->add_ring_bond_count(m.ring_bond_count(zatom)); + if (_atomic_properties & kPropRingBondCount) { + if (_ring_bond_count_becomes_min_ring_bond_count) { + qas->set_min_ring_bond_count(m.ring_bond_count(zatom)); + } else { + qas->add_ring_bond_count(m.ring_bond_count(zatom)); + } + } + + // Should we also set this if the atom is not aromatic. + if (_atomic_properties & kPropAromatic) { + if (m.is_aromatic(zatom)) { + qas->set_aromatic(true); + } + } + + if (_atomic_properties & kPropUnsaturation) { + if (m.saturated(zatom)) { + qas->add_unsaturation(0); + } else { + qas->set_min_unsaturation(1); // Or should we copy it. 
+ } + } + + if (_atomic_properties & kPropHasPiElectron) { + cerr << "Warning kPropHasPiElectron not implemented\n"; + } + if (_atomic_properties & kPropPiElectronCount) { + cerr << "Warning kPropPiElectronCount not implemented\n"; + } + + if (_atomic_properties & kPropSpinach) { + qas->set_match_spinach_only(SpinachAttribute(m, zatom)); + } + + if (_atomic_properties & kPropFusedSystemSize) { + if (m.ring_bond_count(zatom)) { + qas->add_fused_system_size(m.fused_system_size(zatom)); + } + } + + if (_atomic_properties & kPropRingSize) { + const Ring* r = m.ring_containing_atom(zatom); + if (r != nullptr) { + qas->add_ring_size(r->size()); + } + } + + if (_atomic_properties & kPropHcount) { + if (_proto.hcount_becomes_min()) { + qas->set_min_hcount(m.hcount(zatom)); + } else { + qas->add_hcount(m.hcount(zatom)); + } + } + + // It is an open question as to whether we should set isotope 0 or not. + if (_atomic_properties & kPropIsotope && m.isotope(zatom)) { + qas->add_isotope(m.isotope(zatom)); } } int -AddBonds(Molecule& m, - const resizable_array& atom_order_in_smiles, - const atom_number_t zatom, - const int * functional_group, - int group_number, +AddBonds(Molecule& m, const resizable_array& atom_order_in_smiles, + const atom_number_t zatom, const int* functional_group, int group_number, SubstructureSearch::SubstructureAtom& query_atom) { const Atom& atom = m.atom(zatom); int rc = 0; - for (const Bond * b : atom) { + for (const Bond* b : atom) { const atom_number_t other = b->other(zatom); if (functional_group[other] != group_number) { continue; @@ -175,7 +790,9 @@ AddBonds(Molecule& m, if (atom_order_in_smiles[other] > atom_order_in_smiles[zatom]) { continue; } -// cerr << "AddBonds:adding bond between " << zatom << " fg " << functional_group[zatom] << " and " << other << " fg " << functional_group[other] << '\n'; + // cerr << "AddBonds:adding bond between " << zatom << " fg " << + // functional_group[zatom] << " and " << other << " fg " << 
functional_group[other] + // << '\n'; SubstructureSearch::SubstructureBond* query_bond = query_atom.add_query_bond(); query_bond->set_other_end(other); if (b->is_aromatic()) { @@ -197,9 +814,7 @@ AddBonds(Molecule& m, // by `atom_order_in_smiles`. // natoms is the size of the `functional_group` array. Set_of_Atoms -AtomsInFunctionalGroup(int natoms, - const int * functional_group, - int group_number, +AtomsInFunctionalGroup(int natoms, const int* functional_group, int group_number, const resizable_array& atom_order_in_smiles) { Set_of_Atoms atoms_in_group; for (int i = 0; i < natoms; ++i) { @@ -208,26 +823,27 @@ AtomsInFunctionalGroup(int natoms, } } atoms_in_group.iwqsort_lambda([&atom_order_in_smiles](int a1, int a2) { - if (atom_order_in_smiles[a1] < atom_order_in_smiles[a2]) + if (atom_order_in_smiles[a1] < atom_order_in_smiles[a2]) { return -1; - if (atom_order_in_smiles[a1] > atom_order_in_smiles[a2]) + } + if (atom_order_in_smiles[a1] > atom_order_in_smiles[a2]) { return 1; + } return 0; // Should never happen here. 
}); - + return atoms_in_group; } int -BuildQuery(Molecule& m, - const int * functional_group, - int group_number, - const resizable_array& atom_order_in_smiles, - SubstructureSearch::SingleSubstructureQuery& query) { +Options::BuildQuery(Molecule& m, const int* functional_group, int group_number, + const resizable_array& atom_order_in_smiles, + SubstructureSearch::SingleSubstructureQuery& query) { const int matoms = m.natoms(); - const resizable_array atoms_in_functional_group = AtomsInFunctionalGroup(matoms, functional_group, group_number, atom_order_in_smiles); + const resizable_array atoms_in_functional_group = AtomsInFunctionalGroup( + matoms, functional_group, group_number, atom_order_in_smiles); for (atom_number_t a : atoms_in_functional_group) { - SubstructureSearch::SubstructureAtom * atom = query.add_query_atom(); + SubstructureSearch::SubstructureAtom* atom = query.add_query_atom(); atom->set_id(a); CopyAttributes(m, a, *atom); AddBonds(m, atom_order_in_smiles, a, functional_group, group_number, *atom); @@ -238,9 +854,7 @@ BuildQuery(Molecule& m, #ifdef NOT_NEEDED_ASDASD Set_of_Atoms -GetFunctionalGroup(const int * functional_group, - int group_number, - int n) { +GetFunctionalGroup(const int* functional_group, int group_number, int n) { Set_of_Atoms result; for (int i = 0; i < n; ++i) { if (functional_group[i] == group_number) { @@ -249,15 +863,13 @@ GetFunctionalGroup(const int * functional_group, } return result; } -#endif int -Pharacaphore2d(Molecule& m, - const int * functional_group, - int number_functional_groups, - IWString_and_File_Descriptor& output) { - if (verbose > 1) +Options::Pharmacophore2d(Molecule& m, PerMoleculeData& pmd, int number_functional_groups, + IWString& fname) { + if (_verbose > 1) { cerr << m.name() << " has " << number_functional_groups << " functional groups\n"; + } // Force a smiles computation, and copy the atom order. 
m.smiles(); @@ -265,32 +877,78 @@ Pharacaphore2d(Molecule& m, SubstructureSearch::SubstructureQuery composite_query; composite_query.set_name(m.name().data(), m.name().length()); - SubstructureSearch::SingleSubstructureQuery * query = composite_query.add_query(); + SubstructureSearch::SingleSubstructureQuery* query = composite_query.add_query(); query->set_respect_initial_atom_numbering(true); int group_number = 0; for (int i = 1; i <= number_functional_groups; ++i, ++group_number) { - BuildQuery(m, functional_group, i, atom_order_in_smiles, *query); + BuildQuery(m, pmd.functional_group(), i, atom_order_in_smiles, *query); + } + + if (!AddDistanceConstraints(m, pmd, *query)) { + return 0; + } + + if (_proto.has_max_extra_atoms()) { + auto* mpr = query->mutable_required_molecular_properties(); + mpr->set_max_natoms(m.natoms() + _proto.max_extra_atoms()); + } + + IWString_and_File_Descriptor output; + if (!output.open(fname.null_terminated_chars())) { + cerr << "Options::Pharmacophore2d:cannot open '" << fname << "'\n"; + return 0; } - - AddDistanceConstraints(m, functional_group, *query); - using google::protobuf::io::ZeroCopyOutputStream; using google::protobuf::io::FileOutputStream; - std::unique_ptr zero_copy_output(new FileOutputStream(output.fd())); - if (! google::protobuf::TextFormat::Print(composite_query, zero_copy_output.get())) { - cerr << "Pharacaphore2d:cannot write\n"; + using google::protobuf::io::ZeroCopyOutputStream; + std::unique_ptr zero_copy_output( + new FileOutputStream(output.fd())); + if (!google::protobuf::TextFormat::Print(composite_query, zero_copy_output.get())) { + cerr << "Pharmacophore2d:cannot write\n"; return 0; } return 1; } +#endif + +std::optional +Options::Pharmacophore2d(Molecule& m, PerMoleculeData& pmd, + int number_functional_groups) { + if (_verbose > 1) { + cerr << m.name() << " has " << number_functional_groups << " functional groups\n"; + } + + // Force a smiles computation, and copy the atom order. 
+ m.smiles(); + resizable_array atom_order_in_smiles = m.atom_order_in_smiles(); + + SubstructureSearch::SubstructureQuery result; + result.set_name(m.name().data(), m.name().length()); + SubstructureSearch::SingleSubstructureQuery* query = result.add_query(); + query->set_respect_initial_atom_numbering(true); + int group_number = 0; + for (int i = 1; i <= number_functional_groups; ++i, ++group_number) { + BuildQuery(m, pmd.functional_group(), i, atom_order_in_smiles, *query); + } + + if (!AddDistanceConstraints(m, pmd, *query)) { + return std::nullopt; + } + + if (_proto.has_max_extra_atoms()) { + auto* mpr = query->mutable_required_molecular_properties(); + mpr->set_max_natoms(m.natoms() + _proto.max_extra_atoms()); + } + + return result; +} +// Run the queries in _atoms_to_ignore and set `ignore_atom`. int -IdentifyAtomsToIgnore(Molecule& m, - resizable_array_p& atoms_to_ignore, - int * ignore_atom) { +Options::IdentifyAtomsToIgnore(Molecule& m, int* ignore_atom) { Molecule_to_Match target(&m); - for (Substructure_Query* q : atoms_to_ignore) { + for (Substructure_Query* q : _atoms_to_ignore) { Substructure_Results sresults; const int nhits = q->substructure_search(target, sresults); if (nhits == 0) { @@ -303,15 +961,12 @@ IdentifyAtomsToIgnore(Molecule& m, } int -IdentifyFunctionalGroup(Molecule& m, - const atom_number_t zatom, - int * fg, - int group_number, - const int * ignore) { +IdentifyFunctionalGroup(Molecule& m, const atom_number_t zatom, int* fg, int group_number, + const int* ignore) { fg[zatom] = group_number; int rc = 1; const Atom& a = m.atom(zatom); - for (const Bond * b : a) { + for (const Bond* b : a) { const atom_number_t other = b->other(zatom); if (ignore[other]) { continue; @@ -328,7 +983,8 @@ IdentifyFunctionalGroup(Molecule& m, continue; } // Always break at an aromatic ring. 
- if (b->is_single_bond() && b->nrings() == 0 && (m.is_aromatic(zatom) || m.is_aromatic(other))) { + if (b->is_single_bond() && b->nrings() == 0 && + (m.is_aromatic(zatom) || m.is_aromatic(other))) { continue; } rc += IdentifyFunctionalGroup(m, other, fg, group_number, ignore); @@ -337,16 +993,71 @@ IdentifyFunctionalGroup(Molecule& m, return rc; } -std::tuple -IdentifyFunctionalGroups(Molecule& m, - int * fg, - const int * ignore) { - const int matoms = m.natoms(); +int +AnyMembersPositive(const Set_of_Atoms& embedding, const int* values) { + for (atom_number_t a : embedding) { + if (values[a] >= 0) { + return 1; + } + } + + return 0; +} + +void +Options::IdentifyExternallySpecified(Molecule& m, int* fg, const int* ignore, + int& atoms_in_functional_groups, int& group_number) { + Molecule_to_Match target(&m); + + uint32_t queries_matching = 0; + + // cerr << "Testing " << _external_functional_groups.size() << " functional groups\n"; + for (Substructure_Query* q : _external_functional_groups) { + Substructure_Results sresults; + if (!q->substructure_search(target, sresults)) { + continue; + } + + ++queries_matching; + + for (const Set_of_Atoms* e : sresults.embeddings()) { + if (AnyMembersPositive(*e, fg)) { + continue; + } + if (e->any_members_set_in_array(ignore)) { + continue; + } + ++group_number; + e->set_vector(fg, group_number); + atoms_in_functional_groups += e->number_elements(); + } + } + + if (atoms_in_functional_groups == 0) { + return; + } + // If we do not match all functional groups, appear as if nothing matched. 
+ if (_proto.all_functional_groups_must_match() && + queries_matching != _external_functional_groups.size()) { + group_number = 0; + atoms_in_functional_groups = 0; + } +} + +std::tuple +Options::IdentifyFunctionalGroups(Molecule& m, int* fg, const int* ignore) { m.compute_aromaticity_if_needed(); int atoms_in_functional_groups = 0; int group_number = 0; + if (_external_functional_groups.size() > 0) { + IdentifyExternallySpecified(m, fg, ignore, atoms_in_functional_groups, group_number); + return {group_number, atoms_in_functional_groups}; + } + + const int matoms = m.natoms(); + for (int i = 0; i < matoms; ++i) { if (fg[i] > 0) { continue; @@ -365,25 +1076,74 @@ IdentifyFunctionalGroups(Molecule& m, } int -WriteLabelledSmiles(const Molecule & m, - const int * functional_group, - IWString_and_File_Descriptor& stream_for_labelled_smiles) { +Options::WriteLabelledSmiles(const Molecule& m, const int* functional_group) { Molecule mcopy(m); const int matoms = m.natoms(); for (int i = 0; i < matoms; ++i) { if (functional_group[i] < 0) { continue; } - mcopy.set_atom_map_number(i, functional_group[i]); + if (_label_with_isotopes) { + mcopy.set_isotope(i, functional_group[i] + 1); + } else { + mcopy.set_atom_map_number(i, functional_group[i] + 1); + } } - stream_for_labelled_smiles << mcopy.smiles() << ' ' << m.name() << '\n'; - stream_for_labelled_smiles.write_if_buffer_holds_more_than(8192); + + _stream_for_labelled_smiles << mcopy.smiles() << ' ' << m.name() << '\n'; + _stream_for_labelled_smiles.write_if_buffer_holds_more_than(8192); + return 1; } +std::optional +Options::Process(Molecule& m) { + ++_molecules_read; + + const int matoms = m.natoms(); + if (matoms == 0) { + cerr << "Ignoring empty molecule " << m.name() << '\n'; + return std::nullopt; + } + + // this is not implemented yet. 
+ std::unique_ptr ignore_atoms(new_int(matoms)); + if (_atoms_to_ignore.number_elements() > 0) { + IdentifyAtomsToIgnore(m, ignore_atoms.get()); + } + + PerMoleculeData pmd(m); + + const auto [number_functional_groups, atoms_in_functional_groups] = + IdentifyFunctionalGroups(m, pmd.functional_group(), ignore_atoms.get()); + if (_verbose) { + ++_functional_group_count[number_functional_groups]; + _fraction_atoms_in_functional_groups.extra( + static_cast(atoms_in_functional_groups) / static_cast(matoms)); + } + + if (number_functional_groups == 0) { + ++_no_functional_groups; + return std::nullopt; + } + + if (_stream_for_labelled_smiles.active()) { + WriteLabelledSmiles(m, pmd.functional_group()); + } + + if (number_functional_groups == 1) { + ++_only_one_functional_group; + return std::nullopt; + } + + return Pharmacophore2d(m, pmd, number_functional_groups); +} + +#ifdef NO_LONGER_USED_JJJ int -Pharacaphore2d(Molecule& m, - IWString_and_File_Descriptor& output) { +Options::Process(Molecule& m, IWString& fname) { + ++_molecules_read; + const int matoms = m.natoms(); if (matoms == 0) { cerr << "Ignoring empty molecule " << m.name() << '\n'; @@ -391,150 +1151,151 @@ Pharacaphore2d(Molecule& m, } std::unique_ptr ignore_atoms(new_int(matoms)); - if (atoms_to_ignore.number_elements() > 0) { - IdentifyAtomsToIgnore(m, atoms_to_ignore, ignore_atoms.get()); + if (_atoms_to_ignore.number_elements() > 0) { + IdentifyAtomsToIgnore(m, ignore_atoms.get()); } - std::unique_ptr functional_group(new_int(matoms, -1)); + PerMoleculeData pmd(m); + const auto [number_functional_groups, atoms_in_functional_groups] = - IdentifyFunctionalGroups(m, functional_group.get(), ignore_atoms.get()); - if (verbose) { - functional_group_count.extra(number_functional_groups); - fraction_atoms_in_functional_groups.extra(static_cast(atoms_in_functional_groups) / - static_cast(matoms)); + IdentifyFunctionalGroups(m, pmd.functional_group(), ignore_atoms.get()); + if (_verbose) { + 
++_functional_group_count[number_functional_groups]; + _fraction_atoms_in_functional_groups.extra( + static_cast(atoms_in_functional_groups) / static_cast(matoms)); } + if (number_functional_groups == 0) { - no_functional_groups++; - return 1; + ++_no_functional_groups; + return 0; } - - if (stream_for_labelled_smiles.active()) { - WriteLabelledSmiles(m, functional_group.get(), stream_for_labelled_smiles); + + if (_stream_for_labelled_smiles.active()) { + WriteLabelledSmiles(m, pmd.functional_group()); } if (number_functional_groups == 1) { - only_one_functional_group++; + ++_only_one_functional_group; return 1; } - return Pharacaphore2d(m, functional_group.get(), number_functional_groups, output); + return Pharmacophore2d(m, pmd, number_functional_groups, fname); +} +#endif + +int +Options::Report(std::ostream& output) const { + output << "Read " << _molecules_read << " molecules\n"; + output << _no_functional_groups << " molecules had no functional groups\n"; + output << _only_one_functional_group << " molecules had only one functional group\n"; + for (int i = 0; i < _functional_group_count.number_elements(); ++i) { + if (_functional_group_count[i]) { + output << _functional_group_count[i] << " molecules had " << i + << " functional groups\n"; + } + } + + return output.good(); } int -Pharacaphore2d(Molecule& m, - IWString& output_fname) { +Write(const SubstructureSearch::SubstructureQuery& qry, IWString& fname) { IWString_and_File_Descriptor output; - if (! 
output.open(output_fname)) { - cerr << "Cannot open " << output_fname << '\n'; + if (!output.open(fname.null_terminated_chars())) { + cerr << "Pharmacophore2d:cannot open '" << fname << "'\n"; return 0; } - return Pharacaphore2d(m, output); + using google::protobuf::io::FileOutputStream; + using google::protobuf::io::ZeroCopyOutputStream; + std::unique_ptr zero_copy_output( + new FileOutputStream(output.fd())); + if (!google::protobuf::TextFormat::Print(qry, zero_copy_output.get())) { + cerr << "Pharmacophore2d:cannot write\n"; + return 0; + } + + return 1; } int -Pharacaphore2d (data_source_and_type & input, - const IWString& output_stem) { - Molecule * m; +Pharmacophore2d(Options& options, data_source_and_type& input, Output& output) { + Molecule* m; while ((m = input.next_molecule()) != nullptr) { - molecules_read++; std::unique_ptr free_m(m); - Preprocess(*m); - IWString output_fname; - output_fname << output_stem << molecules_read << ".proto"; - if (! Pharacaphore2d(*m, output_fname)) { - cerr << "Fatal error processing " << m->name() << '\n'; - return 0; + + options.Preprocess(*m); + + std::optional maybe_qry = options.Process(*m); + if (!maybe_qry) { + continue; } + + output.Write(*maybe_qry); + } + + if (options.verbose()) { + cerr << "Generated " << output.nfiles() << " valid files\n"; } return 1; } int -Pharacaphore2d(const char * fname, - FileType input_type, - const IWString& output_stem) { +Pharmacophore2d(Options& options, const char* fname, FileType input_type, + Output& output) { if (input_type == FILE_TYPE_INVALID) { input_type = discern_file_type_from_name(fname); } data_source_and_type input(input_type, fname); - if (! 
input.good()) - { + if (!input.good()) { cerr << "Cannot open '" << fname << "'\n"; return 1; } - return Pharacaphore2d(input, output_stem); + return Pharmacophore2d(options, input, output); } int -Pharacaphore2d(int argc, char** argv) { - Command_Line cl(argc, argv, "vi:A:S:L:d:D:n"); +Pharmacophore2d(int argc, char** argv) { + Command_Line cl(argc, argv, "vi:A:lS:td:D:nY:s:q:C:G:"); if (cl.unrecognised_options_encountered()) { cerr << "unrecognised_options_encountered\n"; Usage(1); } - verbose = cl.option_count('v'); + int verbose = cl.option_count('v'); - if (cl.option_present('d')) { - if (! cl.value('d', min_separation) || min_separation < 1) { - cerr << "Invalid minimum separation (-d)\n"; - Usage(1); - } - if (verbose) { - cerr << "Will ignore atoms separated by less than " << min_separation << " bonds\n"; - } - } + Options options; - if (cl.option_present('D')) { - if (! cl.value('D', max_separation) || max_separation < min_separation) { - cerr << "Invalid maximum separation (-D)\n"; - Usage(1); - } - if (verbose) { - cerr << "Will ignore atoms separated by more than " << max_separation << " bonds\n"; - } + if (!options.Initialise(cl)) { + cerr << "Cannot initialise options\n"; + return 1; } - if (cl.option_present('n')) { - ncon_becomes_min_ncon = 1; - if (verbose) { - cerr << "Ncon values become min_ncon query attributes\n"; - } + Output output; + if (!output.Initialise(cl)) { + cerr << "Cannot initialise output\n"; + return 1; } FileType input_type = FILE_TYPE_INVALID; if (cl.option_present('i')) { - if (! process_input_type(cl, input_type)) { + if (!process_input_type(cl, input_type)) { cerr << "Cannot process -i option\n"; return 1; } - } else if (! all_files_recognised_by_suffix(cl)) { + } else if (!all_files_recognised_by_suffix(cl)) { cerr << "Cannot discern all file types, use the -i option\n"; return 4; } IWString output_stem; - if (! 
cl.option_present('S')) { + if (!cl.option_present('S')) { cerr << "Must specify the output stem (-S)\n"; Usage(1); } - if (cl.option_present('L')) { - IWString fname = cl.string_value('L'); - if (! fname.ends_with(".smi")) { - fname << ".smi"; - } - if (! stream_for_labelled_smiles.open(fname)) { - cerr << "Cannot open stream for labelled molecules '" << fname << "'\n"; - return 1; - } - if (verbose) - cerr << "Labelled molecules written to '" << fname << "'\n"; - } - cl.value('S', output_stem); if (cl.empty()) { @@ -543,25 +1304,28 @@ Pharacaphore2d(int argc, char** argv) { } for (const auto* fname : cl) { - if (! Pharacaphore2d(fname, input_type, output_stem)) { + if (!Pharmacophore2d(options, fname, input_type, output)) { cerr << "Error processing " << fname << "\n"; return 1; } } if (verbose) { - cerr << "Read " << molecules_read << " molecules\n"; - cerr << no_functional_groups << " molecules had no functional groups\n"; - cerr << only_one_functional_group << " molecules had only one functional group\n"; + options.Report(cerr); + } + + if (cl.option_present('G')) { + IWString fname = cl.string_value('G'); + output.WriteFilesGenerated(fname); } return 0; } -} // namespace pharacaphore_2d +} // namespace pharmacophore_2d int -main(int argc, char ** argv) { +main(int argc, char** argv) { GOOGLE_PROTOBUF_VERIFY_VERSION; - return pharacaphore_2d::Pharacaphore2d(argc, argv); + return pharmacophore_2d::Pharmacophore2d(argc, argv); } diff --git a/src/Molecule_Tools/pharmacophore_2d.proto b/src/Molecule_Tools/pharmacophore_2d.proto new file mode 100644 index 00000000..a8c3584e --- /dev/null +++ b/src/Molecule_Tools/pharmacophore_2d.proto @@ -0,0 +1,70 @@ +syntax = "proto3"; + +package pharmacophore2d; + +// These are the atomic properties that can be transferred to the atom +// in the SubstructureAtom created. 
+enum AtomicProperty { + UNSPECIFIED = 0; + ATOMIC_NUMBER = 1; + NCON = 2; + AP_AROMATIC = 3; + RING_BOND_COUNT = 4; + HAS_PI_ELECTRON = 5; // Not implemented, do not use + PI_ELECTRON_COUNT = 6; // Not implemented, do not use + UNSATURATION = 7; + ISOTOPE = 8; + RING_SIZE = 9; + SPINACH = 10; + FUSED_SYSTEM_SIZE = 11; + HCOUNT = 12; +} + +message Pharmacophore2DConfig { + + // Substructure query specifications for functional groups. + // Same syntax as the -q option to tsubstructure. + // "SMARTS:smt", "PROTO:/path/to/qry.textproto",... + repeated string functional_group = 1; + + // If set then each external functional group query must match. + optional bool all_functional_groups_must_match = 2; + + // Substructure query specifications for atoms to ignore. + repeated string atoms_to_ignore = 3; + + // We ignore pharmacophore features that are outside this distance range. + optional uint32 min_separation = 4; + optional uint32 max_separation = 5; + + // By default the bond separation in the query will be the same as + // what it is in the starting molecule. If either of these are specified + // the initial distance is converted to a range + // d - delta_shorter, d + delta_longer + optional uint32 delta_shorter = 6; + optional uint32 delta_longer = 7; + + // If set, all atomic ncon values become min_ncon in the query. + optional bool ncon_becomes_min_ncon = 8; + + // If set, all atomic ring_bond_count values become min_ring_bond_count + // specifications in the query. + optional bool ring_bond_count_becomes_min_ring_bond_count = 9; + + // The atomic properties that get transferred to the query atom. + repeated AtomicProperty atomic_property = 10; + + // If set, when the query is constructed, the separated_atom message + // will preserve the same rotatable bonds between matched atoms. + optional bool preserve_rotbond = 11; + + // Or we can specify a number of extra or fewer rotabable bonds allowed. 
+ optional uint32 extra_rotbond = 12; + optional uint32 less_rotbond = 13; + + // Impose a limit on the number of atoms in the target molecule relative + // to the starting molecule. + optional uint32 max_extra_atoms = 14; + + optional bool hcount_becomes_min = 15; +} diff --git a/src/Molecule_Tools/ring_replacement_collate.cc b/src/Molecule_Tools/ring_replacement_collate.cc new file mode 100644 index 00000000..fc9553f2 --- /dev/null +++ b/src/Molecule_Tools/ring_replacement_collate.cc @@ -0,0 +1,350 @@ +// Collate multiple independently generated ring replacement sets. + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" + +#include "google/protobuf/io/zero_copy_stream.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" + +#include "Foundational/accumulator/accumulator.h" +#include "Foundational/cmdline_v2/cmdline_v2.h" +#include "Foundational/data_source/iwstring_data_source.h" +#include "Foundational/iwmisc/iwre2.h" + +#include "Molecule_Tools/replacement_ring.pb.h" + +namespace ring_replacement_collate { + +using std::cerr; +namespace fs = std::filesystem; + +void +Usage(int rc) { +// clang-format off +#if defined(GIT_HASH) && defined(TODAY) + cerr << __FILE__ << " compiled " << TODAY << " git hash " << GIT_HASH << '\n'; +#else + cerr << __FILE__ << " compiled " << __DATE__ << " " << __TIME__ << '\n'; +#endif + // clang-format on + // clang-format off + cerr << R"( +)"; + // clang-format on + ::exit(rc); +} + +// We keep track of rings by type. And within each type there are protos +// describing the rings found. + +using RingData = absl::flat_hash_map>; + +class Options { + private: + // Regular expression describing the file names to be processed. 
+ std::unique_ptr _rx; + + int _ignore_errors; + int _errors_encountered; + + std::string _prefix; + + // private functions + int AggregateRings(const char* dirname, + const std::filesystem::directory_entry& fname, + RingData& rings); + int AggregateRings(iwstring_data_source& input, + absl::flat_hash_map& rings); + int AggregateRings(const RplRing::ReplacementRing& ring, + absl::flat_hash_map& rings); + int WriteRings(const absl::flat_hash_map& rings, + const std::string name_stem, + const std::string ring_type); + int WriteRings(const absl::flat_hash_map& rings, + IWString_and_File_Descriptor& output); + + public: + Options(); + + int Initialise(Command_Line_v2& cl); + + int AggregateRings(const char* dirname, + RingData& rings); + + // The name stem for where the output files are written. + // If this includes a directory path, that must have already been + // created. + // The file name will be prefix/_prefix_ringtype.smi + int WriteRings(const RingData& rings, const std::string& name_stem); + + int Report(std::ostream& output) const; +}; + +Options::Options() { + _ignore_errors = 0; + _errors_encountered = 0; + + const_IWSubstring tmp(R"(_([0-9,a,A]+)\.smi$)"); + iwre2::RE2Reset(_rx, tmp); + + _prefix = "ring_"; +} + +int +Options::Initialise(Command_Line_v2& cl) { + int verbose = cl.option_present('v'); + + if (cl.option_present("ignore_errors")) { + _ignore_errors = 1; + if (verbose) { + cerr << "Will ignore errors\n"; + } + } + + if (cl.option_present("prefix")) { + cl.value("prefix", _prefix); + if (verbose) { + cerr << "Prefix for output files '" << _prefix << "'\n"; + } + } + + return 1; +} + +int +Options::AggregateRings(const char* dirname, + RingData& rings) { + // for (auto it{fs::directory_iterator("sandbox")}; it != fs::directory_iterator(); ++it) + + for (auto& fname : fs::directory_iterator(dirname)) { + if (! 
AggregateRings(dirname, fname, rings)) { + cerr << "Error processing '" << fname << "'\n"; + return 0; + } + } + + return 1; +} + +int +Options::AggregateRings(const char* dirname, + const std::filesystem::directory_entry& fname, + RingData& rings) { + std::string ring_type; + const std::string x = fname.path(); + if (! RE2::PartialMatch(x, *_rx, &ring_type)) { + cerr << "Options::AggregateRings:cannot determine ring type '" << fname << + "' pattern '" << _rx->pattern() << "'\n"; + ++_errors_encountered; + return _ignore_errors; + } + + fs::path full_path_name(dirname); + full_path_name /= fname; + + iwstring_data_source input(full_path_name.c_str()); + if (! input.good()) { + cerr << "Options::AggregateRings:cannot open '" << full_path_name << "'\n"; + return 0; + } + + auto iter = rings.find(ring_type); + if (iter != rings.end()) { + return AggregateRings(input, iter->second); + } + + // New ring type, create; + absl::flat_hash_map proto; + auto iter2 = rings.emplace(ring_type, std::move(proto)); + + return AggregateRings(input, std::get<1>(*iter2.first)); +} + +int +Options::AggregateRings(iwstring_data_source& input, + absl::flat_hash_map& rings) { + const_IWSubstring buffer; + while (input.next_record(buffer)) { + RplRing::ReplacementRing proto; + google::protobuf::io::ArrayInputStream zero_copy_array(buffer.data(), buffer.nchars()); + if (!google::protobuf::TextFormat::Parse(&zero_copy_array, &proto)) { + cerr << "Options::AggregateRings:cannot parse " << buffer << "'\n"; + return 0; + } + + if (! 
AggregateRings(proto, rings)) { + cerr << "Options::AggregateRings:cannot parse proto " << buffer << '\n'; + return 0; + } + } + + // cerr << "At end of " << input.fname() << " hrve " << rings.size() << " lines_read " << input.lines_read() << '\n'; + + return 1; +} + +int +Options::AggregateRings(const RplRing::ReplacementRing& ring, + absl::flat_hash_map& rings) { + if (ring.usmi().size() == 0) { + cerr << "EMpty usmi " << ring.ShortDebugString() << '\n'; + return 0; + } + + auto iter = rings.find(ring.usmi()); + if (iter != rings.end()) { + const auto n = ring.n(); + iter->second.set_n(n + ring.n()); + return 1; + } + + rings.emplace(ring.usmi(), ring); + + return 1; +} + +int +Options::Report(std::ostream& output) const { + return 1; +} + +int +Options::WriteRings(const RingData& rings, const std::string& name_stem) { + for (const auto& [ring_type, myrings] : rings) { + if (! WriteRings(myrings, name_stem, ring_type)) { + cerr << "Options::WriteRings:cannot write " << name_stem << ' ' << ring_type << '\n'; + return 0; + } + } + + return 1; +} + +int +Options::WriteRings(const absl::flat_hash_map& rings, + const std::string destdir, + const std::string ring_type) { + IWString full_path_name; + full_path_name << destdir << '/' << _prefix << ring_type << ".smi"; + + IWString_and_File_Descriptor output; + if (! output.open(full_path_name)) { + cerr << "Options::WriteRings:cannot open '" << full_path_name << "'\n"; + return 0; + } + + return WriteRings(rings, output); +} + +int +Options::WriteRings(const absl::flat_hash_map& rings, + IWString_and_File_Descriptor& output) { + static google::protobuf::TextFormat::Printer printer; + printer.SetSingleLineMode(true); + + std::string buffer; + + for (const auto& [_, proto] : rings) { + if (! 
printer.PrintToString(proto, &buffer)) { + cerr << "Options::WriteRings write '" << proto.ShortDebugString() << "'\n"; + return 0; + } + + output << buffer; + output << '\n'; + + output.write_if_buffer_holds_more_than(8192); + } + + return 1; +} + + +int +Main(int argc, char** argv) { + Command_Line_v2 cl(argc, argv, "-v--destdir=s-ignore_errors-prefix=s"); + + if (cl.unrecognised_options_encountered()) { + cerr << "unrecognised_options_encountered\n"; + Usage(1); + } + + const int verbose = cl.option_present('v'); + + Options options; + if ( !options.Initialise(cl)) { + cerr << "Cannot initialise options\n"; + return 1; + } + + if (! cl.option_present("destdir")) { + cerr << "Must specify destination directory via the -destdir option\n"; + Usage(1); + } + + if (cl.empty()) { + cerr << "Insufficient arguments\n"; + Usage(1); + } + + RingData rings; + + for (const char* dir : cl) { + if (! options.AggregateRings(dir, rings)) { + cerr << "Error processing directory '" << dir << "'\n"; + return 1; + } + } + + if (rings.empty()) { + cerr << "NO rings found\n"; + return 1; + } + + if (verbose) { + cerr << "Read data on " << rings.size() << " ring types\n"; + } + + IWString tmp; + cl.value("destdir", tmp); + std::string destdir(tmp.AsString()); + fs::path path_name(destdir); + if (! fs::is_directory(path_name)) { + if (! fs::create_directories(path_name)) { + cerr << "Cannot create '" << path_name << "'\n"; + return 1; + } + } + + if (verbose) { + cerr << "Writing " << rings.size() << " ring types\n"; + Accumulator_Int acc; + for (const auto& [_, ring_type] : rings) { + for (const auto& [_, proto] : ring_type) { + acc.extra(proto.n()); + } + } + cerr << "Counts btw " << acc.minval() << " and " << acc.maxval() << + " ave " << acc.average() << '\n'; + } + + if (! 
options.WriteRings(rings, destdir)) { + cerr << "Cannot write rings to " << destdir << "'\n"; + return 1; + } + + return 0; +} + +} // namespace ring_replacement_collate + +int +main(int argc, char** argv) { + GOOGLE_PROTOBUF_VERIFY_VERSION; + return ring_replacement_collate::Main(argc, argv); +} diff --git a/src/Molecule_Tools/substructure_demerits.cc b/src/Molecule_Tools/substructure_demerits.cc index 6548ee32..90892e56 100644 --- a/src/Molecule_Tools/substructure_demerits.cc +++ b/src/Molecule_Tools/substructure_demerits.cc @@ -562,8 +562,10 @@ grow_chain(Molecule & m, assert(0 == already_done[tmp]); - if (6 != mz[tmp] || ncon[tmp] > 2 || m.is_ring_atom(tmp) || m.formal_charge(tmp)) + if (6 != mz[tmp] || ncon[tmp] > 2 || m.is_ring_atom(tmp) || m.formal_charge(tmp) || + ! m.saturated(tmp)) { return rc; + } zatom = tmp; } @@ -575,24 +577,35 @@ determine_chain(Molecule & m, const atomic_number_t * mz, const int * ncon, int * already_done) { -//int matoms = m.natoms(); - already_done[zatom] = 1; int rc = 1; assert(2 == ncon[zatom]); - const Atom * a = m.atomi(zatom); + const Atom& a = m[zatom]; + assert(a.ncon() == 2); - for (int i = 0; i < 2; i++) - { - atom_number_t a1 = a->other(zatom, i); + for (const Bond* b : a) { + const atom_number_t a1 = b->other(zatom); assert(a1 >= 0 && a1 < m.natoms() && 0 == already_done[a1]); - if (6 == mz[a1] && ncon[a1] <= 2 && m.is_non_ring_atom(a1) && 0 == m.formal_charge(a1)) - { - rc += grow_chain(m, a1, mz, ncon, already_done); + if (mz[a1] != 6) { + continue; + } + if (ncon[a1] > 2) { + continue; + } + if (m.ring_bond_count(a1)) { + continue; } + if (m.formal_charge(a1)) { + continue; + } + if (! 
m.saturated(a1)) { + continue; + } + + rc += grow_chain(m, a1, mz, ncon, already_done); } return rc; @@ -634,6 +647,7 @@ long_carbon_chains(const Molecule & m_in, Demerit & demerit, if (2 != ncon[i]) continue; + if (already_done[i]) continue; @@ -642,6 +656,10 @@ long_carbon_chains(const Molecule & m_in, Demerit & demerit, const Atom * a = m.atomi(i); + if (! a->fully_saturated()) { + continue; + } + if (2 == a->nbonds() && 0 == a->formal_charge()) { int path_length = determine_chain(m, i, mz, ncon, already_done); diff --git a/src/Molecule_Tools/trxn.cc b/src/Molecule_Tools/trxn.cc index 1fcef07d..b5ea6020 100644 --- a/src/Molecule_Tools/trxn.cc +++ b/src/Molecule_Tools/trxn.cc @@ -340,7 +340,10 @@ static void do_append_text(Molecule& m, const IWString& to_append) { IWString tmp = m.name(); - if (!tmp.ends_with(' ')) { + // Make sure there is a space between what is there already and what we are adding. + if (to_append.starts_with(' ')) { + } else if (tmp.ends_with(' ')) { + } else { tmp += ' '; } @@ -457,6 +460,33 @@ RemoveSizeNotOk(resizable_array_p& fragments) { }); } +static int +WriteMultiFragmentMolecule(Molecule& product, + Molecule_Output_Object& output) { + resizable_array_p fragments; + product.create_components(fragments); + RemoveSizeNotOk(fragments); + if (suppress_duplicate_molecules) { + IW_STL_Hash_Set seen; + for (Molecule* frag : fragments) { + if (seen.contains(frag->unique_smiles())) { + continue; + } + seen.insert(frag->unique_smiles()); + + frag->set_name(product.name()); + output.write(*frag); + } + } else { + for (Molecule* frag : fragments) { + frag->set_name(product.name()); + output.write(*frag); + } + } + + return 1; +} + static int do_write(Molecule_and_Embedding* sidechain, Molecule& product, int nhits, Molecule_Output_Object& output) { @@ -561,14 +591,7 @@ do_write(Molecule_and_Embedding* sidechain, Molecule& product, int nhits, if (write_multi_fragment_products_as_separate_molecules && product.number_fragments() > 1) { - 
resizable_array_p fragments; - product.create_components(fragments); - RemoveSizeNotOk(fragments); - for (Molecule* frag : fragments) { - frag->set_name(product.name()); - output.write(*frag); - } - return 1; + return WriteMultiFragmentMolecule(product, output); } return output.write(product); @@ -1165,14 +1188,14 @@ do_write(IW_TDT& tdt, Molecule& result, std::ostream& output) { tdt.set_dataitem_value(identifier_tag, result.name()); } else { IWString tmp(result.name()); - tmp << ' ' << append_to_changed_molecules; + tmp << append_to_changed_molecules; tdt.set_dataitem_value(identifier_tag, tmp); } } else { IWString tmp(result.name()); if (append_to_changed_molecules.length()) { - tmp << ' ' << append_to_changed_molecules; + tmp << append_to_changed_molecules; } tdt.add_dataitem(identifier_tag, tmp); diff --git a/src/Molecule_Tools/tsubstructure.cc b/src/Molecule_Tools/tsubstructure.cc index 1b7e3846..981edd80 100644 --- a/src/Molecule_Tools/tsubstructure.cc +++ b/src/Molecule_Tools/tsubstructure.cc @@ -11,6 +11,7 @@ #include "Foundational/iwbits/iwbits.h" #include "Foundational/iw_tdt/iw_tdt.h" #include "Foundational/iwmisc/misc.h" +#include "Foundational/iwmisc/iwre2.h" #include "Foundational/iwmisc/report_progress.h" #include "Molecule_Lib/aromatic.h" @@ -99,6 +100,14 @@ static int write_results_as_proto = 0; // Nov 2022. Pass our own Molecule_to_Query to substructur reading. static Molecule_to_Query_Specifications mqs; +// Jul 2024. +// Have a file with a bunch of different motifs. Need a simple way of +// adding something to the name for molecules that match any of the queries. +static IWString append_to_matches; + +// If set, only perform searches on molecules that match a regex. 
+static std::unique_ptr rx_match_only; + static void usage(int rc) { @@ -212,6 +221,8 @@ display_dash_M_options() cerr << " -M ucez interpret uppercase smarts letters as element specifiers only\n"; cerr << " -M ecount initialise element counts in targets - may help '-q M:...'\n"; cerr << " -M mmaq must match all queries in order for a match to be perceived\n"; + cerr << " -M app= append to molecules matching any query. For example -m NONM -M app=foo\n"; + cerr << " -M rx= only process molecules where the name matches - must useful with the app= directive\n"; cerr << " -M nsssr within target objects, nrings includes non-sssr rings\n"; cerr << " -M DCF=fname create a csib directcolorfile \n"; cerr << " -M CEH Condense Explicit Hydrogens to anchor atom(s)\n"; @@ -1290,7 +1301,7 @@ tsubstructure (Molecule & m, int * tmp = new_int(tsize); std::unique_ptr free_tmp(tmp); - int matoms = m.natoms(); + const int matoms = m.natoms(); isotope_t * atom_isotopic_label; @@ -1326,7 +1337,11 @@ tsubstructure (Molecule & m, } } - int nmatched = do_all_queries(m, queries, sresults, tmp, new_elements, atom_isotopic_label); + int nmatched = 0; + if (rx_match_only && ! iwre2::RE2PartialMatch(m.name(), *rx_match_only)) { + } else { + nmatched = do_all_queries(m, queries, sresults, tmp, new_elements, atom_isotopic_label); + } if (nullptr != atom_isotopic_label) delete [] atom_isotopic_label; @@ -1345,6 +1360,9 @@ tsubstructure (Molecule & m, if (nmatched) { molecules_which_match++; + if (! 
append_to_matches.empty()) { + m << append_to_matches; + } if (matched_structures_number_assigner.active()) matched_structures_number_assigner.process(m); if (write_matched_atoms_as_mdl_v30_atom_lists || write_matched_atoms_as_mdl_v30_bond_lists) @@ -1417,8 +1435,7 @@ tsubstructure (data_source_and_type & input, preprocess(*m); - if (0 == m->natoms()) - { + if (m->empty()) { if (verbose) cerr << "Empty molecule skipped\n"; continue; @@ -2509,6 +2526,25 @@ tsubstructure(int argc, char ** argv) if (verbose) cerr << "Match results written in proto form\n"; } + else if (m.starts_with("app=")) { + m.remove_leading_chars(4); + append_to_matches << ' ' << m; + if (verbose) { + cerr << "Will append '" << append_to_matches << "' to the names of matched molecules\n"; + } + NONM_append_non_match_query_details = 0; + } + else if (m.starts_with("rx=")) { + m.remove_leading_chars(3); + if (! iwre2::RE2Reset(rx_match_only, m)) { + cerr << "Invalid regex '" << m << "'\n"; + return 1; + } + + if (verbose) { + cerr << "Will only attempt matching on molecules matching '" << rx_match_only->pattern() << "'\n"; + } + } else if ("help" == m) { display_dash_M_options(); diff --git a/src/Molecule_Tools/xlogp.cc b/src/Molecule_Tools/xlogp.cc index 1e25c52a..3ca2ce0c 100644 --- a/src/Molecule_Tools/xlogp.cc +++ b/src/Molecule_Tools/xlogp.cc @@ -2787,6 +2787,9 @@ IdentifyTerminalGroups(Molecule& m, std::optional XLogPWFL(Molecule& m, int* status) { + // Always remove any explicit Hydrogen atoms, Sorry, this might change `m`. + m.remove_all(1); + const int matoms = m.natoms(); std::fill_n(status, matoms, 0); diff --git a/src/Molecule_Tools_Bdb/selimsteg_build.cc b/src/Molecule_Tools_Bdb/selimsteg_build.cc deleted file mode 100644 index 66c194a5..00000000 --- a/src/Molecule_Tools_Bdb/selimsteg_build.cc +++ /dev/null @@ -1,536 +0,0 @@ -/* - Build the selimsteg infrastructure. 
- Input is raw structure file -*/ - -#include -#include -using std::cerr; -using std::endl; - -#include "Foundational/cmdline/cmdline.h" -#include "Foundational/iwstring/iw_stl_hash_set.h" -#include "Foundational/iwmisc/misc.h" - -#include "Molecule_Lib/istream_and_type.h" -#include "Molecule_Lib/molecule.h" -#include "Molecule_Lib/aromatic.h" -#include "Molecule_Lib/standardise.h" - -const char * prog_name = nullptr; - -static int verbose = 0; - -static int molecules_read = 0; - -static int reduce_to_largest_fragment = 0; - -static Chemical_Standardisation chemical_standardisation; - -static int discard_molecule_if_multiple_fragments_larger_than = 0; - -static int lower_atom_count_cutoff = 0; -static int upper_atom_count_cutoff = 0; - -static IWString_and_File_Descriptor organic_stream; - -static IWString_and_File_Descriptor stream_for_discards; - -static int molecules_discarded = 0; - -static IWString_STL_Hash_Set lly_seen; - -static int perform_duplicate_id_checking = 0; - -static int duplicate_identifiers_found = 0; - -static int discarded_for_too_few_atoms = 0; -static int discarded_for_too_many_atoms = 0; -static int discarded_for_isotopes = 0; -static int discarded_for_bad_valence = 0; -static int discarded_for_non_organic = 0; -static int discarded_for_being_mixture = 0; - -static IWString_and_File_Descriptor stream_for_unstandardised_smiles; - -static void -usage (int rc) -{ - cerr << __FILE__ << " compiled " << __DATE__ << " " << __TIME__ << endl; - cerr << "Does processing for selimsteg infrastructure - input is raw structures\n"; - cerr << " -O file name for organic subset\n"; - cerr << " -c lower atom count for organic (based on atoms in largest fragment)\n"; - cerr << " -C upper atom count for organic (based on atoms in largest fragment)\n"; - cerr << " -B file for discarded smiles\n"; - cerr << " -d perform duplicate ID checking\n"; - cerr << " -x discard if multiple fragments with more than (mixture)\n"; - cerr << " -U stream for unstandardised 
smiles\n"; - cerr << " -i input specification\n"; - cerr << " -l strip to largest fragment\n"; - cerr << " -g ... chemical standardisation options\n"; - cerr << " -E ... standard element specifications\n"; - cerr << " -A ... standard aromaticity specifications\n"; - cerr << " -v verbose output\n"; - - exit(rc); -} - -static void -preprocess (Molecule & m) -{ - if (chemical_standardisation.active()) - chemical_standardisation.process(m); - - return; -} - -static int -write_discard_stream_if_open (Molecule & m, - const char * reason) -{ - molecules_discarded++; - - if (! stream_for_discards.is_open()) - return 1; - - stream_for_discards << m.smiles() << ' ' << m.name() << ' ' << reason << '\n'; - stream_for_discards.write_if_buffer_holds_more_than(32768); - - return 1; -} - -static int ok_elements[HIGHEST_ATOMIC_NUMBER + 1]; - -static int -contains_non_organics (const Molecule & m) -{ - if (m.organic_only()) - return 0; - - for (int i = m.natoms() - 1; i >= 0; i--) - { - const Element * e = m.elementi(i); - -// cerr << "Organic? '" << e->symbol() << "' " << e->organic() << endl; - - if (e->organic()) - continue; - - if (! 
ok_elements[e->atomic_number()]) - return 1; - - if (m.ncon(i) > 0) // cannot tolerate covalent inorganics - return 1; - } - - return 0; // no covalently bonded non organics -} - -static int -too_many_large_fragments (Molecule & m) -{ - int rc = 0; - - int nf = m.number_fragments(); - - for (int i = 0; i < nf; i++) - { - if (m.atoms_in_fragment(i) > discard_molecule_if_multiple_fragments_larger_than) - { - rc++; - - if (rc > 1) - return 1; - } - } - - return 0; -} - -/* - Only return 0 on a catastrophic error -*/ - -static int -selimsteg_build (Molecule & m, - IWString_and_File_Descriptor & output) -{ - if (m.number_isotopic_atoms() > 0) // always bad, test first - { - discarded_for_isotopes++; - return write_discard_stream_if_open(m, "isotope"); - } - - if (too_many_large_fragments(m)) // need to do before fragment stripping - { - discarded_for_being_mixture++; - return write_discard_stream_if_open(m, "mixture"); - } - - if (contains_non_organics(m)) - { - discarded_for_non_organic++; - return write_discard_stream_if_open(m, "nonorganic"); - } - - int matoms; - - if (reduce_to_largest_fragment) - { - m.reduce_to_largest_fragment_carefully(); - matoms = m.natoms(); - } - else - matoms = m.atoms_in_largest_fragment(); - - if (lower_atom_count_cutoff > 0 && matoms < lower_atom_count_cutoff) - { - discarded_for_too_few_atoms++; - return write_discard_stream_if_open(m, "too few atoms"); - } - - if (upper_atom_count_cutoff > 0 && matoms > upper_atom_count_cutoff) - { - discarded_for_too_many_atoms++; - return write_discard_stream_if_open(m, "too many atoms"); - } - - if (! m.valence_ok()) // should this be done before fragment stripping?? 
- { - discarded_for_bad_valence++; - return write_discard_stream_if_open(m, "bad valence"); - } - - output << m.smiles() << ' ' << m.name() << '\n'; - - output.write_if_buffer_holds_more_than(32768); - - return 1; -} - -static int -is_duplicate_id (const IWString & id) -{ - if (lly_seen.contains(id)) - { - cerr << "Duplicate identifier '" << id << "'\n"; - duplicate_identifiers_found++; - return 1; - } - - lly_seen.insert(id); - - return 0; -} - -static int -selimsteg_build (data_source_and_type & input, - IWString_and_File_Descriptor & output) -{ - Molecule * m; - while (NULL != (m = input.next_molecule())) - { - molecules_read++; - - std::unique_ptr free_m(m); - - if (! perform_duplicate_id_checking) - ; - else if (is_duplicate_id(m->name())) - continue; - - if (stream_for_unstandardised_smiles.is_open()) - { - stream_for_unstandardised_smiles << m->smiles() << ' ' << m->name() << '\n'; - stream_for_unstandardised_smiles.write_if_buffer_holds_more_than(32768); - } - - preprocess(*m); - - output << m->smiles() << ' ' << m->name() << '\n'; - - output.write_if_buffer_holds_more_than(32768); - - if (! selimsteg_build(*m, organic_stream)) // note different output stream - return 0; - } - - return 1; -} - -static int -selimsteg_build (const char * fname, FileType input_type, - IWString_and_File_Descriptor & output) -{ - assert (NULL != fname); - - if (input_type == FILE_TYPE_INVALID) - { - input_type = discern_file_type_from_name(fname); - assert (0 != input_type); - } - - data_source_and_type input(input_type, fname); - if (! 
input.good()) - { - cerr << prog_name << ": cannot open '" << fname << "'\n"; - return 0; - } - - if (verbose > 1) - input.set_verbose(1); - - return selimsteg_build(input, output); -} - -static int -selimsteg_build (int argc, char ** argv) -{ - Command_Line cl (argc, argv, "vA:E:i:g:lO:c:C:x:B:dU:"); - - if (cl.unrecognised_options_encountered()) - { - cerr << "Unrecognised options encountered\n"; - usage(1); - } - - verbose = cl.option_count('v'); - - if (cl.option_present('A')) - { - if (! process_standard_aromaticity_options(cl, verbose, 'A')) - { - cerr << "Cannot initialise aromaticity specifications\n"; - usage(5); - } - } - - if (cl.option_present('E')) - { - if (! process_elements(cl, verbose, 'E')) - { - cerr << "Cannot initialise elements\n"; - return 6; - } - } - - if (cl.option_present('g')) - { - if (! chemical_standardisation.construct_from_command_line(cl, verbose > 1, 'g')) - { - cerr << "Cannot process chemical standardisation options (-g)\n"; - usage(32); - } - } - - if (cl.option_present('l')) - { - reduce_to_largest_fragment = 1; - - if (verbose) - cerr << "Will reduce to largest 'organic' fragment\n"; - } - - if (cl.option_present('x')) - { - if (! cl.value('x', discard_molecule_if_multiple_fragments_larger_than) || discard_molecule_if_multiple_fragments_larger_than < 1) - { - cerr << "The discard molecules if mixtures of size (-x) must have a value +ve whole number\n"; - usage(3); - } - - if (verbose) - cerr << "Molecules considered mixtures if >1 fragment with " << discard_molecule_if_multiple_fragments_larger_than << " atoms\n"; - } - - if (cl.option_present('d')) - { - perform_duplicate_id_checking = 1; - - if (verbose) - cerr << "WIll perform duplicate id checking\n"; - } - - int need_to_check_atom_count_cutoffs = 0; - - if (cl.option_present('c')) - { - if (! 
cl.value('c', lower_atom_count_cutoff) || lower_atom_count_cutoff < 1) - { - cerr << "The lower atom count limit (-c) must be a whole +ve number\n"; - usage(4); - } - - if (verbose) - cerr << "Will discard molecules having fewer than " << lower_atom_count_cutoff << " atoms\n"; - - need_to_check_atom_count_cutoffs = 1; - } - - if (cl.option_present('C')) - { - if (! cl.value('C', upper_atom_count_cutoff) || upper_atom_count_cutoff < 1) - { - cerr << "The upper atom count limit (-C) must be a whole +ve number\n"; - usage(4); - } - - if (verbose) - cerr << "Will discard molecules having more than " << upper_atom_count_cutoff << " atoms\n"; - - need_to_check_atom_count_cutoffs = 1; - } - - if (! need_to_check_atom_count_cutoffs) - ; - else if (lower_atom_count_cutoff < upper_atom_count_cutoff) - ; - else if (0 == upper_atom_count_cutoff) - ; - else - { - cerr << "Inconsistent lower (" << lower_atom_count_cutoff << ") and upper (" << upper_atom_count_cutoff << ") atom count limits\n"; - return 3; - } - - FileType input_type = FILE_TYPE_INVALID; - - if (cl.option_present('i')) - { - if (! process_input_type(cl, input_type)) - { - cerr << "Cannot determine input type\n"; - usage (6); - } - } - else if (1 == cl.number_elements() && 0 == ::strncmp(cl[0], "-", 1)) - input_type = FILE_TYPE_SMI; - else if (! all_files_recognised_by_suffix(cl)) - return 4; - - if (0 == cl.number_elements()) - { - cerr << "Insufficient arguments\n"; - usage(2); - } - - if (! cl.option_present('O')) - { - cerr << "MUst specify file name for organic subset via the -O option\n"; - usage(3); - } - - if (cl.option_present('O')) - { - IWString o = cl.string_value('O'); - - if (! o.ends_with(".smi")) - o << ".smi"; - - if (! 
organic_stream.open(o.null_terminated_chars())) - { - cerr << "Cannot open stream for organic subset '" << o << "'\n"; - return 4; - } - - if (verbose) - cerr << "Organic subset written to '" << o << "'\n"; - } - - if (cl.option_present('B')) - { - IWString b = cl.string_value('B'); - - if (! b.ends_with(".smi")) - b << ".smi"; - - if (! stream_for_discards.open(b.null_terminated_chars())) - { - cerr << "Cannot open stream for discards '" << b << "'\n"; - return 4; - } - - if (verbose) - cerr << "Discarded molecules written to '" << b << "'\n"; - } - - if (cl.option_present('U')) - { - IWString u = cl.string_value('U'); - - if (! u.ends_with(".smi")) - u << ".smi"; - - if (! stream_for_unstandardised_smiles.open(u.null_terminated_chars())) - { - cerr << "Cannot open stream for unstandardised smiles '" << u << "'\n"; - return 2; - } - - if (verbose) - cerr << "Unstandardised smiles written to '" << u << "'\n"; - } - -// initialise the array of allowable elements - - set_vector(ok_elements, HIGHEST_ATOMIC_NUMBER + 1, 0); - - ok_elements[6] = 1; - ok_elements[7] = 1; - ok_elements[8] = 1; - ok_elements[9] = 1; - ok_elements[15] = 1; - ok_elements[16] = 1; - ok_elements[17] = 1; - ok_elements[35] = 1; - ok_elements[53] = 1; - ok_elements[3] = 1; // Li - ok_elements[11] = 1; // Na - ok_elements[12] = 1; // Mg - ok_elements[19] = 1; // K - ok_elements[20] = 1; // Ca - - IWString_and_File_Descriptor output(1); - - int rc = 0; - for (int i = 0; i < cl.number_elements(); i++) - { - if (! 
selimsteg_build(cl[i], input_type, output)) - { - rc = i + 1; - break; - } - } - - output.flush(); - if (organic_stream.is_open()) - organic_stream.close(); - if (stream_for_discards.is_open()) - stream_for_discards.close(); - - if (verbose) - { - cerr << "Read " << molecules_read << " molecules\n"; - cerr << molecules_discarded << " molecules discarded\n"; - cerr << discarded_for_too_few_atoms<< " discarded for too few atoms\n"; - cerr << discarded_for_too_many_atoms<< " discarded for too many atoms\n"; - cerr << discarded_for_isotopes<< " discarded for isotopes\n"; - cerr << discarded_for_bad_valence<< " discarded for bad valence\n"; - cerr << discarded_for_non_organic<< " discarded for non organic\n"; - cerr << discarded_for_being_mixture<< " discarded for being mixtures\n"; - if (perform_duplicate_id_checking) - cerr << duplicate_identifiers_found << " duplicate identifiers found\n"; - } - - return rc; -} - -int -main (int argc, char ** argv) -{ - prog_name = argv[0]; - - int rc = selimsteg_build (argc, argv); - - return rc; -} diff --git a/src/Utilities/GFP_Tools/BUILD b/src/Utilities/GFP_Tools/BUILD index c5ffc7d9..1010c074 100644 --- a/src/Utilities/GFP_Tools/BUILD +++ b/src/Utilities/GFP_Tools/BUILD @@ -48,6 +48,7 @@ local_install( ":nplotnn", ":parallel_nn_search_to_gfp_spread", ":random_fingerprint", + ":train_test_split_optimise", ], ) diff --git a/src/Utilities/GFP_Tools/_resizable_array_sid.cc b/src/Utilities/GFP_Tools/_resizable_array_sid.cc deleted file mode 100644 index 3db60556..00000000 --- a/src/Utilities/GFP_Tools/_resizable_array_sid.cc +++ /dev/null @@ -1,8 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION - -#include "smiles_id_dist.h" - -template class resizable_array_p; -template class resizable_array_base; diff --git a/src/Utilities/GFP_Tools/_resizable_array_spread_v2_object.cc b/src/Utilities/GFP_Tools/_resizable_array_spread_v2_object.cc deleted file mode 100644 index d42d5560..00000000 --- 
a/src/Utilities/GFP_Tools/_resizable_array_spread_v2_object.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include - -#define RESIZABLE_ARRAY_IMPLEMENTATION -#include "spread_v2.h" - -template class resizable_array; -template class resizable_array_base; diff --git a/src/Utilities/GFP_Tools/gfp_leader_v2.cc b/src/Utilities/GFP_Tools/gfp_leader_v2.cc deleted file mode 100644 index e31b567c..00000000 --- a/src/Utilities/GFP_Tools/gfp_leader_v2.cc +++ /dev/null @@ -1,1502 +0,0 @@ -/* - An implementation of the leader algorithm for fingerprints - This variant produces output that can be processed by nplotnn -*/ - -#include -#include - -#include -#include -#include - -#include "google/protobuf/text_format.h" - -#include "Foundational/accumulator/accumulator.h" -#include "Foundational/cmdline/cmdline.h" -#include "Foundational/iw_tdt/iw_tdt.h" - -#define RESIZABLE_ARRAY_IWQSORT_IMPLEMENTATION -#include "Foundational/iwqsort/iwqsort.h" -#include "leader.h" -#include "sparse_collection.h" -#include "tversky.h" - -#include "Utilities/GFP_Tools/nearneighbours.pb.h" - -using std::cerr; - -static Tversky tversky; - -// If there is a per-item score. Can be specified as either -// TDT dataitem value -// Numeric value in a column in the name. - -static IWString score_tag; -static int score_column = -1; - -/* - May 99. When looking for the next molecule to select, we can - add a multiple of the distance to the nearest cluster centre - to the score. 
That way, we can include some varying function - of diversity into the next cluster selection -*/ - -static similarity_type_t cluster_distance_scale_factor = 0.0; - -static resizable_array_p dataitems_to_echo; - -static int verbose = 0; - -static similarity_type_t abandon_distance_cutoff = -1.0; - -/* - The variables which control the clustering -*/ - -static int max_clusters_to_find = std::numeric_limits::max(); -static int clusters_found = 0; -static int max_cluster_size = 0; - -static similarity_type_t threshold = 0.0; - -static int threshold_column = -1; - -/* - when dealing with clusters which are decided by the threshold, we can - optinally sort the cluster members by their distance from the leader -*/ - -static int sort_by_distance_from_centre = 0; - -static int items_selected = 0; - -extending_resizable_array cluster_size; - -static Accumulator distance_stats; - -static std::ofstream stream_for_discarded_by_previously_selected; - -static IWString smiles_tag("$SMI<"); -static IWString identifier_tag("PCN<"); - -static IWString distance_tag("DIST<"); - -/* - Mar 2004. in order to - sample an SAR within each cluster. Note that the way this is implemented - now is somewhat strange because rather than marking the discarded molecules - in some way, they simply do not appear in the output! Change if this ever - becomes a problem. - - There are two ways of doing the within-cluster clustering. - A fixed threshold applied to all clusters - A constant multiple of the threshold used to form the cluster -*/ - -static similarity_type_t sub_cluster_threshold = static_cast(0.0); - -static float sub_cluster_threshold_ratio = static_cast(0.0); - -/* - A single variable that indicates whether or not any form of cluster clustering - is active -*/ - -static int leader_on_clusters = 0; - -// Sept 2023. Add proto output as an option. -static int write_as_proto = 0; - -// When writing textproto values a zero distance will not be written. 
-// We have the option of writing a small value instead of zero. -static int offset_zero_distances_in_textproto = 0; -// The actual distance used. -static constexpr float kNotZero = 0.00001f; - -GFP_L::GFP_L() { - _selected = 0; - - _score = 0.0; - - _shortest_distance_to_cluster_centre = static_cast(1.0); - - return; -} - -static IWString threshold_from_file_tag; - -static IWString max_cluster_size_tag; - -static int max_cluster_size_column = -1; - -int -GFP_L::construct_from_tdt(IW_TDT &tdt, int &fatal) { - if (!IW_General_Fingerprint::construct_from_tdt(tdt, fatal)) { - return 0; - } - - if (threshold_from_file_tag.length()) { - similarity_type_t tmp; - if (!tdt.dataitem_value(threshold_from_file_tag, tmp) || tmp < 0.0) { - cerr << "GFP_L::construct_from_tdt: invalid '" << threshold_from_file_tag - << "' in tdt\n"; - return 0; - } - - _threshold.set(tmp); - } else if (threshold_column >= 0) { - const_IWSubstring t; - if (!_id.word(threshold_column, t)) { - cerr << "GFP_L::construct_from_tdt: no " << threshold_column << " column in '" - << _id << "'"; - if (::threshold > 0.0) { - cerr << " using default threshold\n"; - } else { - cerr << '\n'; - return 0; - } - } else { - similarity_type_t d; - if (!t.numeric_value(d) || d < 0.0 || d > 1.0) { - cerr << "Invalid threshold '" << t << "' in '" << _id << "'\n"; - return 0; - } - - _threshold.set(d); - } - } - - if (score_tag.length()) { - if (!tdt.dataitem_value(score_tag, _score)) { - cerr << "GFP_L::construct_from_tdt: cannot extract '" << score_tag - << "' from tdt\n"; - return 0; - } - } - - if (score_column >= 0) { - const_IWSubstring c; - if (!_id.word(score_column, c)) { - cerr << "Cannot extract column " << score_column << " from '" << _id << "'\n"; - return 0; - } - - if (!c.numeric_value(_score)) { - cerr << "Invalid score, column " << score_column << " in '" << _id << "'\n"; - return 0; - } - - if (verbose > 2) { - cerr << _id << " set score to " << _score << '\n'; - } - } - - if 
(max_cluster_size_tag.length() > 0) { - int tmp; - if (!tdt.dataitem_value(max_cluster_size_tag, tmp) || tmp < 1) { - cerr << "GFP_L::construct_from_tdt: missing or invalid '" << max_cluster_size_tag - << "' in tdt\n"; - return 0; - } - - _max_cluster_size.set(tmp); - } - - if (max_cluster_size_column >= 0) { - const_IWSubstring c; - if (!_id.word(max_cluster_size_column, c)) { - cerr << "Cannot extract column " << max_cluster_size_column << " from '" << _id - << "'\n"; - return 0; - } - - int tmp; - if (!c.numeric_value(tmp) || tmp < 1) { - cerr << "Invalid maximum cluster size, column " << max_cluster_size_column - << " in '" << _id << "'\n"; - return 0; - } - - // cerr << "Max cluster size for '" << _id << "' is " << tmp << '\n'; - _max_cluster_size.set(tmp); - } - - return 1; -} - -/* - Our pool is an array of FP objects -*/ - -static GFP_L *pool = nullptr; - -static int pool_size = 0; - -/* - A cluster is a set of pointers to such objects -*/ - -typedef resizable_array Cluster; - -/* - Within the pool it is convenient to keep track of the first unselected - item -*/ - -static int first_unselected = 0; -static int last_unselected = 0; - -static int -echo_selected_dataitems(IW_TDT &tdt, const resizable_array_p &items_to_echo, - IWString_and_File_Descriptor &output) { - int ne = items_to_echo.number_elements(); - for (int i = 0; i < ne; i++) { - const IWString &tag = *(items_to_echo[i]); - - if (!tdt.echo_dataitem(tag, 0, output)) { - cerr << "Cannot echo '" << tag << "' from TDT\n"; - throw "Missing dataitem"; - return 0; - } - } - - return output.good(); -} - -static int -echo_all_dataitems(IW_TDT &tdt, IWString_and_File_Descriptor &output) { - return tdt.write_all_except_vbar(output); -} - -static int -write_cluster_data(IW_TDT &tdt, int clusters_found, int id_within_cluster, - similarity_type_t distance_to_centre, - IWString_and_File_Descriptor &output) { - if (dataitems_to_echo.number_elements()) { - echo_selected_dataitems(tdt, dataitems_to_echo, output); - 
} else { - echo_all_dataitems(tdt, output); - } - - if (distance_to_centre >= 0.0) { - output << distance_tag << distance_to_centre << ">\n"; - } - - if (!output.good()) { - throw "Bad output stream"; - } - - return output.good(); -} - -static int -get_tdt(IW_TDT &tdt, iwstring_data_source &input, const GFP_L &fp) { - off_t offset; - (void)fp.offset(offset); - - if (!input.seekg(offset)) { - cerr << "Cannot seek to offset '" << offset << '\n'; - return 0; - } - - return tdt.next(input); -} - -// We may write a small floating point number for zero distances just to -// ensure consistent tabular output in textproto form. -static int -WriteAsProto(const Cluster& cluster, - int cluster_number, - const IW_TDT& tdt, - iwstring_data_source& input, - IWString_and_File_Descriptor& output) { - nnbr::NearNeighbours proto; - IWString value; - tdt.dataitem_value(smiles_tag, value); - proto.set_smiles(value.data(), value.length()); - tdt.dataitem_value(identifier_tag, value); - proto.set_name(value.data(), value.length()); - proto.set_csize(cluster.size()); - proto.set_cluster(cluster_number); - - for (const GFP_L* c : cluster) { - IW_TDT tdt; - if (! 
get_tdt(tdt, input, *c)) { - continue; - } - - nnbr::Nbr* nbr = proto.add_nbr(); - - tdt.dataitem_value(smiles_tag, value); - nbr->set_smi(value.data(), value.length()); - tdt.dataitem_value(identifier_tag, value); - nbr->set_id(value.data(), value.length()); - if (c->distance() > 0.0f) { - nbr->set_dist(c->distance()); - } else if (offset_zero_distances_in_textproto) { - nbr->set_dist(kNotZero); - } - } - - static google::protobuf::TextFormat::Printer printer; - printer.SetSingleLineMode(true); - - std::string buffer; - if (!printer.PrintToString(proto, &buffer)) { - cerr << "WriteAsProto:cannot print " << proto.ShortDebugString() << '\n'; - return 0; - } - - output << buffer << '\n'; - output.write_if_buffer_holds_more_than(8192); - - return 1; -} - -static int -do_leader_on_clusters(Cluster &cluster, similarity_type_t threshold) { - similarity_type_t my_threshold; - if (sub_cluster_threshold > static_cast(0.0)) { - my_threshold = sub_cluster_threshold; - } else if (sub_cluster_threshold_ratio > static_cast(0.0)) { - my_threshold = threshold * sub_cluster_threshold_ratio; - } else { - cerr << "Not sure how to sub cluster\n"; - return 0; - } - - for (int i = 0; i < cluster.number_elements(); i++) { - GFP_L *ldr = cluster[i]; - - for (int j = i + 1; j < cluster.number_elements(); j++) { - GFP_L *cj = cluster[j]; - - similarity_type_t d = ldr->IW_General_Fingerprint::distance(*cj); - if (d <= my_threshold) { - cluster.remove_item(j); - j--; - } - } - } - - return cluster.number_elements(); -} - -int -distance_comparitor(GFP_L *const *ppfp1, GFP_L *const *ppfp2) { - const GFP_L *pfp1 = *ppfp1; - const GFP_L *pfp2 = *ppfp2; - - if (pfp1->distance() < pfp2->distance()) { - return -1; - } else if (pfp1->distance() > pfp2->distance()) { - return 1; - } else { - return 0; - } -} - -class Distance_Comparitor { - private: - public: - int operator()(GFP_L *const, GFP_L *const); -}; - -int -Distance_Comparitor::operator()(GFP_L *const p1, GFP_L *const p2) { - if 
(p1->distance() < p2->distance()) { - return -1; - } - - if (p1->distance() > p2->distance()) { - return 1; - } - - return 0; -} - -template void resizable_array_base::iwqsort( - Distance_Comparitor &); -template void iwqsort(GFP_L **, int, Distance_Comparitor &); -template void iwqsort(GFP_L **, int, Distance_Comparitor &, - void *); -template void compare_two_items(GFP_L **, - Distance_Comparitor &, - void *); -template void move_in_from_left(GFP_L **, int &, int &, int, - Distance_Comparitor &, - void *); -// template void move_in_from_right(GFP_L**, int&, int&, -// Distance_Comparitor&); -template void swap_elements(GFP_L *&, GFP_L *&, void *); -template void move_in_from_right(GFP_L **, int &, int &, - Distance_Comparitor &); - -static int -process_cluster(Cluster &cluster, similarity_type_t my_threshold, - iwstring_data_source &input, IWString_and_File_Descriptor &output) { - if (sort_by_distance_from_centre) { - Distance_Comparitor dc; - cluster.iwqsort(dc); - } - - if (leader_on_clusters) { - do_leader_on_clusters(cluster, my_threshold); - } - - int cs = cluster.number_elements(); - cluster_size[cs]++; // the leader isn't in the cluster - - GFP_L *centre = cluster[0]; - - if (verbose) { - cerr << "Cluster " << clusters_found << ' ' << cs << " items, centre '" - << cluster[0]->id() << "', "; - if (threshold_from_file_tag.length()) { - similarity_type_t threshold = 0.0f; - (void)centre->threshold(threshold); - cerr << "threshold " << threshold << ", "; - } - cerr << (items_selected + cs) << " items selected\n"; - } - - IW_TDT tdt; - if (!get_tdt(tdt, input, *centre)) { - return 0; - } - - if (write_as_proto) { - return WriteAsProto(cluster, clusters_found, tdt, input, output); - } - - if (dataitems_to_echo.number_elements()) { - echo_selected_dataitems(tdt, dataitems_to_echo, output); - } else { - echo_all_dataitems(tdt, output); - } - - output << "CLUSTER<" << clusters_found << ">\n"; - output << "CSIZE<" << cs << ">\n"; - - // start at 1, we've already done 
centre above - for (int i = 1; i < cs && output.good(); i++) { - GFP_L &fp = *(cluster[i]); - - if (!get_tdt(tdt, input, fp)) { - return 0; - } - - if (!write_cluster_data(tdt, clusters_found, i, fp.distance(), output)) { - return 0; - } - } - - output << "|\n"; - - return output.good(); -} - -static int -choose_next_centre(int &icentre) { - icentre = -1; - - // just grab the first unselected item - if (0 == score_tag.length() && score_column < 0) { - for (int i = 0; i < pool_size; i++) { - if (!pool[i].selected()) { - icentre = i; - return 1; - } - } - } else if (cluster_distance_scale_factor > static_cast(0.0)) { - score_t max_score = static_cast(0.0); - for (int i = 0; i < pool_size; i++) { - if (pool[i].selected()) { - continue; - } - - score_t s = pool[i].score() + cluster_distance_scale_factor * - pool[i].shortest_distance_to_cluster_centre(); - - if (icentre < 0 || s > max_score) { - max_score = s; - icentre = i; - } - } - } else // raw scores - { - score_t max_score = static_cast(0.0); - for (int i = 0; i < pool_size; i++) { - if (pool[i].selected()) { - continue; - } - - score_t s = pool[i].score(); - - if (icentre < 0 || s > max_score) { - max_score = s; - icentre = i; - } - } - } - - return icentre >= 0; -} - -static int -compute_the_distance(IW_General_Fingerprint &fp, IW_General_Fingerprint &p, - similarity_type_t &d) { - if (!can_be_compared(fp, p)) { - return 0; - } - - if (tversky.active()) { - d = static_cast(1.0) - - fp.IW_General_Fingerprint::tversky(p, tversky); - return 1; - } - - if (abandon_distance_cutoff > static_cast(0.0)) { - if (!fp.IW_General_Fingerprint::tanimoto(p, abandon_distance_cutoff, d)) { - return 0; - } - - d = static_cast(1.0) - d; - return 1; - } - - d = fp.IW_General_Fingerprint::distance(p); - - return 1; -} - -static int -form_cluster_threshold(int icentre, Cluster &cluster, - const similarity_type_t my_threshold) { - GFP_L &fp = pool[icentre]; - - if (verbose > 2) { - cerr << "Leader is " << fp.id() << '\n'; - } - - int 
istop = last_unselected; - - int next_first_unselected = -1; - - for (int i = first_unselected; i <= istop; i++) { - GFP_L &p = pool[i]; - - if (p.selected()) { - continue; - } - - if (next_first_unselected < 0) { - next_first_unselected = i; - } - - last_unselected = i; - - similarity_type_t d; - if (!compute_the_distance(fp, p, d)) { - continue; - } - - if (d <= my_threshold) { - cluster.add(&(pool[i])); - p.set_selected(1); - p.set_distance(d); - if (i == next_first_unselected) { - next_first_unselected = -1; - } - } else if (d < p.shortest_distance_to_cluster_centre()) { - p.set_shortest_distance_to_cluster_centre(d); - } - } - - if (next_first_unselected > first_unselected) { - first_unselected = next_first_unselected; - } - - return cluster.number_elements(); -} - -static int -form_cluster_max_cluster_size(int icentre, Cluster &cluster, - similarity_type_t my_threshold, - int max_cluster_size_this_molecule) { - assert(max_cluster_size_this_molecule > 0); - - cluster.resize(pool_size); - - GFP_L &fp = pool[icentre]; - - for (int i = 0; i < pool_size; i++) { - GFP_L &p = pool[i]; - - if (p.selected()) { - continue; - } - - if (!can_be_compared(fp, p)) { - continue; - } - - similarity_type_t d; - if (!compute_the_distance(fp, p, d)) { - continue; - } - - if (my_threshold > static_cast(0.0) && d > my_threshold) { - continue; - } - - cluster.add(&(pool[i])); - p.set_distance(d); - if (d < p.shortest_distance_to_cluster_centre()) { - p.set_shortest_distance_to_cluster_centre(d); - } - } - - cluster.sort(&distance_comparitor); - - cluster.resize_keep_storage(max_cluster_size_this_molecule); - - int istop; - if (cluster.number_elements() < max_cluster_size_this_molecule) { - istop = cluster.number_elements(); - } else { - istop = max_cluster_size_this_molecule; - } - - for (int i = 0; i < istop; i++) { - GFP_L *p = cluster[i]; - - p->set_selected(1); - } - - return 1; -} - -/* - The clustering will be limited either by the maximum number of items which - can be in a 
cluster, or a threshold -*/ - -static int -form_cluster(int icentre, Cluster &cluster, const similarity_type_t my_threshold, - int max_cluster_size_this_molecule) { - cluster.resize_keep_storage(0); - - cluster.add(&(pool[icentre])); - - pool[icentre].selected() = 1; - pool[icentre].set_distance(static_cast(0.0)); - - if (max_cluster_size_this_molecule) { - return form_cluster_max_cluster_size(icentre, cluster, my_threshold, - max_cluster_size_this_molecule); - } else { - return form_cluster_threshold(icentre, cluster, my_threshold); - } -} - -int -leader(iwstring_data_source &input, IWString_and_File_Descriptor &output) { - assert(pool_size > 1); - - assert(0 == items_selected); - assert(0 == clusters_found); - - first_unselected = 0; - last_unselected = pool_size - 1; - - int icentre; - if (!choose_next_centre(icentre)) { - cerr << "Yipes, cannot find initial leader\n"; - return 0; - } - - Cluster cluster; - if (!cluster.resize(pool_size)) { - cerr << "Yipes, cannot allocate " << pool_size << " elements in pool\n"; - return 0; - } - - while (items_selected < pool_size) { - GFP_L ¢re = pool[icentre]; - - similarity_type_t my_threshold = 0.0f; - if (centre.threshold(my_threshold)) { // has come from the file - ; - } else { - my_threshold = threshold; - } - - int max_cluster_size_this_molecule; - - if (max_cluster_size_tag.length() && - pool[icentre].max_cluster_size(max_cluster_size_this_molecule)) { - ; - } else if (max_cluster_size_column >= 0 && - pool[icentre].max_cluster_size(max_cluster_size_this_molecule)) { - ; - } else { - max_cluster_size_this_molecule = max_cluster_size; - } - - if (verbose > 1) { - cerr << "Start cluster " << clusters_found << ". 
ndx " << icentre - << ", threshold = " << my_threshold; - if (max_cluster_size_this_molecule > 0) { - cerr << ", max size " << max_cluster_size_this_molecule; - } - cerr << '\n'; - } - - (void)form_cluster(icentre, cluster, my_threshold, max_cluster_size_this_molecule); - - (void)process_cluster(cluster, my_threshold, input, output); - - clusters_found++; - if (clusters_found >= max_clusters_to_find) { - break; - } - - items_selected += cluster.number_elements(); - - if (!choose_next_centre(icentre)) { - break; - } - } - - return 1; -} - -static int -build_pool(iwstring_data_source &input) { - off_t offset = input.tellg(); - - int items_in_pool = 0; - - int tdts_read = 0; - - IW_TDT tdt; - while (tdt.next(input)) { - tdts_read++; - - int fatal; - if (!pool[items_in_pool].construct_from_tdt(tdt, fatal)) { - if (fatal) { - cerr << "Cannot parse tdt " << tdt; - return 0; - } - - offset = input.tellg(); - continue; - } - - pool[items_in_pool].set_offset(offset); - - items_in_pool++; - - if (items_in_pool == pool_size) { - cerr << "Pool is full, max " << pool_size << '\n'; - break; - } - - offset = input.tellg(); - } - - pool_size = items_in_pool; - - if (verbose) { - cerr << "Read " << tdts_read << " TDT's, pool contains " << pool_size - << " fingerprints\n"; - } - - return 1; -} - -static int -build_pool(const const_IWSubstring &fname, iwstring_data_source &input) { - IWString tmp(fname); - - if (!input.open(tmp)) // method is non-const on its argument! 
- { - cerr << "Cannot open '" << fname << "' for input\n"; - return 0; - } - - if (0 == pool_size) { - pool_size = input.count_records_starting_with(identifier_tag); - - if (0 == pool_size) { - cerr << "No occurrences of " << identifier_tag << "' in input\n"; - return 0; - } - - pool = new GFP_L[pool_size]; - if (nullptr == pool) { - cerr << "Yipes, could not allocate pool of size " << pool_size << '\n'; - return 62; - } - - cerr << "Pool automatically sized to " << pool_size << '\n'; - } - - return build_pool(input); -} - -/* - If we have a previously selected file, we keep track of the number - of members of the pool that get selected by the previously selected file -*/ - -static int molecules_selected_by_previously_selected_file = 0; - -static int -do_previously_selected_file(IW_General_Fingerprint &fp, similarity_type_t t) { - for (int i = 0; i < pool_size; i++) { - GFP_L &p = pool[i]; - - if (p.selected()) { // probably hit during an earlier pass - continue; - } - - if (!can_be_compared(fp, p)) { - continue; - } - - similarity_type_t d; - if (!compute_the_distance(p, fp, d)) { - continue; - } - - if (d > t) { - continue; - } - - p.set_selected(1); - - molecules_selected_by_previously_selected_file++; - - if (verbose > 1) { - cerr << p.id() << " distance " << d << " from previously selected '" << fp.id() - << "'\n"; - } - - if (stream_for_discarded_by_previously_selected.rdbuf()->is_open()) { - stream_for_discarded_by_previously_selected << identifier_tag << p.id() << ">\n"; - stream_for_discarded_by_previously_selected << identifier_tag << fp.id() << ">\n"; - stream_for_discarded_by_previously_selected << distance_tag << d << ">\n"; - stream_for_discarded_by_previously_selected << "|\n"; - } - } - - return 1; -} - -static int -do_previously_selected_file(IW_TDT &tdt, similarity_type_t t) { - IW_General_Fingerprint fp; - - int fatal; - if (!fp.construct_from_tdt(tdt, fatal)) { - cerr << "Cannot construct fingerprint from TDT\n"; - cerr << tdt; - - if (fatal) { 
- return 0; - } - - return 1; - } - - return do_previously_selected_file(fp, t); -} - -static int -do_previously_selected_file(iwstring_data_source &input, similarity_type_t t) { - IW_TDT tdt; - while (tdt.next(input)) { - if (!do_previously_selected_file(tdt, t)) { - return 0; - } - } - - return 1; -} - -static int -do_previously_selected_file(const IWString &fname, similarity_type_t t) { - iwstring_data_source input(fname); - - if (!input.ok()) { - cerr << "Cannot open previously selected file '" << fname << "'\n"; - return 0; - } - - return do_previously_selected_file(input, t); -} - -static int -set_thresholds_via_factor(score_t mvieth_factor) { - score_t global_max_score = -std::numeric_limits::max(); - for (int i = 0; i < pool_size; i++) { - score_t s = pool[i].score(); - if (s > global_max_score) { - global_max_score = s; - } - } - - if (verbose) { - cerr << "Max score in pool is " << global_max_score << '\n'; - } - - for (int i = 0; i < pool_size; i++) { - GFP_L &fp = pool[i]; - - similarity_type_t t = (global_max_score - fp.score()) / mvieth_factor; - fp.set_threshold(t); - - if (verbose > 1) { - cerr << "i = " << i << " '" << fp.id() << " score " << fp.score() - << " threshold set to " << t << '\n'; - } - } - - return 1; -} - -static int -process_dash_e_option(Command_Line &cl, char e, - resizable_array_p &items_to_echo) { - if (!cl.option_present(e)) { - items_to_echo.resize(2); - IWString *t = new IWString(smiles_tag); - items_to_echo.add(t); - t = new IWString(identifier_tag); - items_to_echo.add(t); - - return 1; - } - - int all_found = 0; - - const_IWSubstring evalue; - int i = 0; - while (cl.value(e, evalue, i++)) { - if ("ALL" == evalue) { - all_found = 1; - if (verbose) { - cerr << "Will echo entire tdt on output\n"; - } - } else { - IWString *t = new IWString(evalue); - items_to_echo.add(t); - if (verbose) { - cerr << "Will echo item '" << evalue << "'\n"; - } - } - } - - if (all_found && items_to_echo.number_elements()) { - cerr << "Using '-" << 
e << " ALL' and other -" << e - << " options doesn't make sense\n"; - return 0; - } - - return 1; -} - -static void -usage(int rc) { -// clang-format off -#if defined(GIT_HASH) && defined(TODAY) - cerr << __FILE__ << " compiled " << TODAY << " git hash " << GIT_HASH << '\n'; -#else - cerr << __FILE__ << " compiled " << __DATE__ << " " << __TIME__ << '\n'; -#endif -// clang-format on -// clang-format off - cerr << "Performs leader clustering on a set of fingerprints\n"; - cerr << "Usage \n"; - cerr << " -C maximum number of clusters to find\n"; - cerr << " -t specify distance threshold\n"; - cerr << " -t col= threshold is column of the name field\n"; - cerr << " -t tag= threshold for each molecule in dataitem \n"; - cerr << " -H threshold for each molecule in dataitem \n"; - cerr << " -m maximum cluster size\n"; - cerr << " -M max cluster size for each molecule in \n"; - cerr << " -M col=nn max cluster size is column of the name field\n"; - cerr << " -S score tag\n"; - cerr << " -S col=nn score is column in the name field\n"; - cerr << " -I specify identifier tag\n"; - cerr << " -r sort clusters by distance from leader\n"; - cerr << " -E specify pool object dataitems to be echo'd (default $SMI and PCN)\n"; - cerr << " -E ALL echo all dataitems from the pool file\n"; - // cerr << " -X abandon distance computation if any component > - // distance\n"; cerr << " -Y thresold = (max_score - score) / factor\n"; - // cerr << " -R score = score + * distance to cluster\n"; - cerr << " -A file(s) of previously selected molecules - discard all within threshold\n"; - cerr << " -a use as the threshold when comparing against the -A file\n"; - cerr << " -L write fingerprints discarded by -A file(s)\n"; - cerr << " -s specify max pool size\n"; - cerr << " -D ... miscellaneous options, enter '-D help' for info\n"; - cerr << " -F ... gfp options, enter '-F help' for details\n"; - cerr << " -V ... Tversky specification, enter '-V help' for details\n"; - cerr << " -k . 
write neighbours as textproto\n"; - cerr << " -v verbose output\n"; -// clang-format on - - exit(rc); -} - -static int -display_dash_D_options(std::ostream &output) { - output << " -D sbc= cluster each cluster, constant threshold \n"; - output << " -D sbcr= cluster each cluster. Use ratio of cluster threshold\n"; - - exit(0); - - return 0; -} - -static void -DisplayDashKOptions(std::ostream& output) { - output << R"(-k . default textproto output --k nozero write a small value rather than zero in textproto output -)"; - ::exit(0); -} - -static int -leader(int argc, char **argv) { - Command_Line cl(argc, argv, "vs:I:E:t:F:P:W:X:rH:S:C:Y:V:R:m:M:A:L:a:D:k:"); - - if (cl.unrecognised_options_encountered()) { - cerr << "Unrecognised options encountered\n"; - usage(1); - } - - verbose = cl.option_count('v'); - - if (cl.option_present('D')) { - int i = 0; - const_IWSubstring d; - while (cl.value('D', d, i++)) { - const_IWSubstring directive; - float dvalue; - - if (!d.split_into_directive_and_value(directive, '=', dvalue)) { - cerr << "Invalid -D qualifier '" << d << "'\n"; - display_dash_D_options(cerr); - } - - if ("sbc" == directive) { - sub_cluster_threshold = dvalue; - if (verbose) { - cerr << "Fixed sub-cluster threshold " << sub_cluster_threshold << '\n'; - } - - leader_on_clusters = 1; - } else if ("sbcr" == directive) { - sub_cluster_threshold_ratio = dvalue; - if (verbose) { - cerr << "Variable sub-cluster threshold ratio " << sub_cluster_threshold_ratio - << '\n'; - } - - leader_on_clusters = 1; - } else if ("help" == directive) { - display_dash_D_options(cerr); - } else { - cerr << "Unrecognised -D qualifier '" << d << "'\n"; - display_dash_D_options(cerr); - } - } - } - - if (0 == cl.number_elements()) { - cerr << "Insufficient arguments\n"; - usage(1); - } - - if (cl.option_present('s')) { - if (!cl.value('s', pool_size) || pool_size < 1) { - cerr << "The -s option must be followed by a whole positive number\n"; - usage(3); - } - - pool = new 
GFP_L[pool_size]; - if (nullptr == pool) { - cerr << "Yipes, could not allocate pool of size " << pool_size << '\n'; - return 62; - } - - if (verbose) { - cerr << "system sized to " << pool_size << '\n'; - } - } - - if (cl.option_present('V')) { - if (!tversky.parse_command_line(cl, 'V', verbose)) { - cerr << "Cannot parse Tversky specifications\n"; - usage(38); - } - } - - if (cl.option_present('C')) { - if (!cl.value('C', max_clusters_to_find) || max_clusters_to_find < 1) { - cerr << "The -C option (max clusters to find) must be followed by a positive " - "integer\n"; - usage(41); - } - - if (verbose) { - cerr << "Will find a max of " << max_clusters_to_find << " clusters\n"; - } - } - - if (cl.option_present('r')) { - sort_by_distance_from_centre = 1; - if (verbose) { - cerr << "Clusters sorted by distance from centre\n"; - } - } - - if (cl.option_present('k')) { - write_as_proto = 1; - const_IWSubstring k; - for (int i = 0; cl.value('k', k, i); ++i) { - if (k == '.') { - continue; - } - if (k == "nozero") { - offset_zero_distances_in_textproto = 1; - if (verbose) { - cerr << "Will offset zero distance values in textproto output\n"; - } - } else if (k == "help") { - DisplayDashKOptions(cerr); - } else { - cerr << "Unrecognised -k qualifier '" << k << "'\n"; - DisplayDashKOptions(cerr); - } - } - if (verbose) { - cerr << "Will write neighbours as nnbr::NearNeighbours textproto form\n"; - } - } - - // We need to be careful with the -i and -I options. 
Remember - // that the pool is built first - - if (cl.option_present('I')) { - (void)cl.value('I', identifier_tag); - - set_identifier_tag(identifier_tag); - - if (verbose) { - cerr << "Identifiers in dataitem '" << identifier_tag << "'\n"; - } - } - - if (cl.option_present('H')) { - cl.value('H', threshold_from_file_tag); - if (verbose) { - cerr << "Each threshold from the '" << threshold_from_file_tag - << "' dataitem in the input\n"; - } - - if (!threshold_from_file_tag.ends_with('<')) { - threshold_from_file_tag << '<'; - } - } - - if (cl.option_present('S')) { - const_IWSubstring s = cl.string_value('S'); - - if (s.starts_with("col=")) { - s.remove_leading_chars(4); - if (!s.numeric_value(score_column) || score_column < 1) { - cerr << "Invalid column for score '" << s << "'\n"; - usage(14); - } - - if (verbose) { - cerr << "Score for each item in column " << score_column << '\n'; - } - - score_column--; - } else { - score_tag = s; - if (verbose) { - cerr << "Score tag is " << score_tag << "'\n"; - } - } - } - - if (cl.option_present('R')) { - if (!cl.value('R', cluster_distance_scale_factor) || - cluster_distance_scale_factor <= 0.0) { - cerr << "The cluster distance scale factor option (-R) must be followed by a " - "positive number\n"; - usage(19); - } - - if (verbose) { - cerr << "Scores adjusted by " << cluster_distance_scale_factor - << " times distance to nearest cluster centre\n"; - } - } - - if (cl.option_present('F') || cl.option_present('P') || cl.option_present('W')) { - if (!initialise_fingerprints(cl, verbose)) { - cerr << "Cannot initialise general fingerprint options\n"; - usage(17); - } - } - - if (cl.option_present('X')) { - if (!cl.value('X', abandon_distance_cutoff) || abandon_distance_cutoff < 0.0 || - abandon_distance_cutoff > 1.0) { - cerr << "The -X option must be followed by a valid distance (0.0, 1.0)\n"; - usage(13); - } - - if (verbose) { - cerr << "Distance compuations abandoned if any component > " - << abandon_distance_cutoff << 
'\n'; - } - } - - if (!process_dash_e_option(cl, 'E', dataitems_to_echo)) { - cerr << "Cannot process -E option\n"; - usage(15); - } - - if (!cl.option_present('t') && !cl.option_present('H') && !cl.option_present('Y')) { - cerr << "Threshold distance must be specified via -t, -H or -V options\n"; - usage(28); - } - - if (need_to_call_initialise_fingerprints(cl)) { - if (!initialise_fingerprints(cl, verbose)) { - cerr << "Cannot initialise GFP options\n"; - usage(23); - } - } - - if (cl.option_present('t')) { - int i = 0; - const_IWSubstring t; - while (cl.value('t', t, i++)) { - if (t.starts_with("col=")) { - t.remove_leading_chars(4); - if (!t.numeric_value(threshold_column) || threshold_column < 1) { - cerr << "Invalid column for threshold '" << t << "'\n"; - usage(14); - } - - if (verbose) { - cerr << "Threshold for each item in column " << threshold_column << '\n'; - } - - threshold_column--; - } else if (t.starts_with("tag=")) { - threshold_from_file_tag = t; - threshold_from_file_tag.remove_leading_chars(4); - if (verbose) { - cerr << "Threshold in tag " << threshold_from_file_tag << "'\n"; - } - - if (!threshold_from_file_tag.ends_with('<')) { - threshold_from_file_tag.add('<'); - } - } else if (!t.numeric_value(threshold) || threshold < 0.0 || threshold > 1.0) { - cerr << "The -t option must be followed by a valid distance value\n"; - usage(12); - } else { - if (verbose) { - cerr << "Distance threshold set to " << threshold << '\n'; - } - } - } - } - - if (cl.option_present('m')) { - if (!cl.value('m', max_cluster_size) || max_cluster_size < 2) { - cerr << "The -m (max cluster size) option must be followed by a whole number > 1\n"; - usage(43); - } - - if (verbose) { - cerr << "Max cluster size " << max_cluster_size << '\n'; - } - } - - if (cl.option_present('M')) { - const_IWSubstring m = cl.string_value('M'); - - if (m.starts_with("col=")) { - m.remove_leading_chars(4); - if (!m.numeric_value(max_cluster_size_column) || max_cluster_size_column < 1) { - 
cerr << "The column for the per molecule maximum cluster size must be a whole " - "positive number\n"; - usage(11); - } - - if (verbose) { - cerr << "The maximum cluster size per molecule will be in column " - << max_cluster_size_column << '\n'; - } - - max_cluster_size_column--; - } else { - max_cluster_size_tag = m; - - if (verbose) { - cerr << "Max cluster size in '" << max_cluster_size_tag << "' tag\n"; - } - } - } - - if (cl.number_elements() > 1) { - cerr << "Extra arguments ignored\n"; - } - - iwstring_data_source pool_file; - - if (!build_pool(cl[0], pool_file) || 0 == pool_size) { - cerr << "Cannot build pool from '" << cl[0] << "'\n"; - return 21; - } - - if (cl.option_present('A')) { - if (!cl.option_present('t') && !cl.option_present('a')) { - cerr << "Must have a threshold available with the -A option (use -t or -a)\n"; - usage(11); - } - - if (cl.option_present('L')) { - IWString fname = cl.string_value('L'); - stream_for_discarded_by_previously_selected.open(fname.null_terminated_chars(), - std::ios::out); - if (!stream_for_discarded_by_previously_selected.good()) { - cerr << "Cannot open stream for molecules discarded by previously selected '" - << fname << "'\n"; - return 4; - } - - if (verbose) { - cerr << "Molecules too close to previously selected file(s) written to '" << fname - << "'\n"; - } - } - - similarity_type_t t; - - if (cl.option_present('a')) { - if (!cl.value('a', t) || t < 0.0 || t >= 1.0) { - cerr << "Invalid value for previously selected threshold (-a option)\n"; - usage(4); - } - - if (verbose) { - cerr << "Will use " << t - << " as the threshold for the previously selected list\n"; - } - } else { - t = threshold; - } - - IWString fname; - int i = 0; - while (cl.value('A', fname, i++)) { - if (!do_previously_selected_file(fname, t)) { - cerr << "Cannot process previously selected file (-A option)\n"; - return 8; - } - } - - if (verbose) { - cerr << "Rejected " << molecules_selected_by_previously_selected_file - << " molecules by 
previously selected file(s)\n"; - } - - if (molecules_selected_by_previously_selected_file == pool_size) { - cerr << "Yipes, the previously selected file knocked out the whole pool\n"; - return 1; - } - } - - /* - threshold = (MAX_SCORE - score) / factor - where factor is a user settable number - */ - - if (cl.option_present('Y')) { - if (!cl.option_present('S')) { - cerr << "Scores must be present(-S) in order to use -V\n"; - usage(42); - } - - if (cl.option_present('H')) { - cerr << "The -H (threshold in file) and -Y options are mutually exclusive\n"; - usage(31); - } - - score_t mvieth_factor; - - if (!cl.value('Y', mvieth_factor) || mvieth_factor <= 0.0) { - cerr << "The Vieth factor (-Y) must be followed by a positive number\n"; - usage(12); - } - - if (verbose) { - cerr << "Vieth factor " << mvieth_factor << '\n'; - } - - set_thresholds_via_factor(mvieth_factor); - } - - IWString_and_File_Descriptor output(1); - - try { - leader(pool_file, output); - } catch (const char *err) { - cerr << "Caught '" << err << "' terminated\n"; - return 81; - } - - std::cout.flush(); - - if (verbose) { - cerr << "Clustered " << pool_size << " fingerprints into " << clusters_found - << " clusters\n"; - int isum = 0; - for (int i = 0; i < cluster_size.number_elements(); i++) { - int j = cluster_size[i]; - if (0 == j) { - continue; - } - - cerr << j << " clusters were of size " << i << " members\n"; - - isum += j * i; - } - - cerr << "In clusters " << isum << '\n'; - } - - return 0; -} - -int -main(int argc, char **argv) { - int rc = leader(argc, argv); - - return rc; -} diff --git a/src/Utilities/GFP_Tools/gfp_spread_v2.cc b/src/Utilities/GFP_Tools/gfp_spread_v2.cc deleted file mode 100644 index e83fec7f..00000000 --- a/src/Utilities/GFP_Tools/gfp_spread_v2.cc +++ /dev/null @@ -1,1343 +0,0 @@ -/* - Spread implementation -*/ - -#include -#include - -#include "Foundational/accumulator/accumulator.h" -#include "Foundational/cmdline/cmdline.h" -#include 
"Foundational/data_source/iwstring_data_source.h" -#include "Foundational/iw_tdt/iw_tdt.h" -#include "Foundational/iwmisc/numeric_data_from_file.h" -#include "sparse_collection.h" -#include "spread_v2.h" -#include "tversky.h" - -using std::cerr; -using std::endl; - -static int verbose = 0; - -static Tversky tversky; - -/* - When doing a run with no pre-selected molecules, start with the - object which is furthest from the first fingerprint. -*/ - -static int start_with_object_furthest_from_first = 0; - -static int start_with_object_furthest_from_everything = 0; - -static int choose_first_item_randomly = 0; - -static int first_item_is_one_with_highest_scale_factor = 0; - -static int already_selected_molecules_present = 0; - -/* - Our pool is an array of FP objects -*/ - -static Spread_Object *pool = nullptr; - -static int pool_size = 0; - -static int poolptr = 0; // the next item in the pool to be filled - -static int number_to_select = 0; - -static similarity_type_t stop_once_distance_drops_below = 0.0f; - -static int report_establish_initial_distances = 0; - -/* - We keep track of the distances of the nearest selected items -*/ - -static Accumulator nearest_selected_neighbour_distance; - -static int retest_no_neighbours = 0; - -static IWString smiles_tag("$SMI<"); -static IWString identifier_tag("PCN<"); -static IWString distance_tag("DIST<"); - -static IWString previously_computed_nn_distance_tag; - -static int previous_computed_distance_column = -1; - -static Numeric_Data_From_File previously_computed_distances; - -/* - How often do we squeeze out selected items -*/ - -static int squeeze_pool = 0; - -/* - The normal output of the near neighbour programmes leaves $SMI and PCN tag in the - output file. Those would confuse things, so they must be transformed to something - else. 
We assume that has been done -*/ - -static IWString nn_smiles_tag("NNSMI<"); -static IWString nn_id_tag("NNID<"); - -static float blurr_distances = static_cast(0.0); - -static similarity_type_t longest_distance_recognised = - static_cast(0.0); - -static int -get_previously_computed_nearest_neighbour( - Spread_Object &p, const IW_STL_Hash_Map_float &previously_computed_distances) { - IW_STL_Hash_Map_float::const_iterator f = previously_computed_distances.find(p.id()); - - if (f == - previously_computed_distances.end()) { // OK if no previously computed distance - return 1; - } - - p.set_nearest_previously_selected_neighbour("C", "UNK", (*f).second); - - return 1; -} - -static int -get_previously_computed_nearest_neighbour( - const IW_TDT &tdt, Spread_Object &p, - const IWString &previously_computed_nn_distance_tag) { - similarity_type_t d; - if (!tdt.dataitem_value(previously_computed_nn_distance_tag, d)) { - return 0; - } - - IWString nnsmiles; - if (!tdt.dataitem_value(nn_smiles_tag, nnsmiles)) { - return 0; - } - - IWString nnid; - if (!tdt.dataitem_value(nn_id_tag, nnid)) { - return 0; - } - - p.set_nearest_previously_selected_neighbour(nnsmiles, nnid, d); - - // cerr << "Set distance to previous " << p.distance() << endl; - - return 1; -} - -static int -build_pool(iwstring_data_source &input, - const IWString &previously_computed_nn_distance_tag) { - assert(pool_size > 0); - // cerr << "Pool ptr " << poolptr << ", pool size " << pool_size << endl; - assert(poolptr >= 0 && poolptr < pool_size); - - int items_with_previously_computed_distances = 0; - - IW_TDT tdt; - while (tdt.next(input)) { - int fatal; - if (!pool[poolptr].construct_from_tdt(tdt, fatal)) { - if (fatal) { - cerr << "Cannot parse tdt " << tdt; - return 0; - } - - continue; - } - - if (previously_computed_nn_distance_tag.length()) { - get_previously_computed_nearest_neighbour( - tdt, pool[poolptr], - previously_computed_nn_distance_tag); // should really check for fatal errors - } else if 
(previously_computed_distances.size()) { - if (!get_previously_computed_nearest_neighbour(pool[poolptr], - previously_computed_distances)) { - return 0; - } - } else if (previous_computed_distance_column > 0) { - if (!pool[poolptr].set_distance_to_previously_selected_from_column( - previous_computed_distance_column)) { - cerr << "Cannot set previously selected distance based on column '" - << pool[poolptr].id() << "'\n"; - return 0; - } - } - - if (pool[poolptr].has_a_nearest_selected_neighbour()) { - items_with_previously_computed_distances++; - } - - poolptr++; - - if (poolptr >= pool_size) { - cerr << "Pool is full, max " << pool_size << endl; - break; - } - } - - poolptr--; - - if (verbose) { - cerr << "Pool now contains " << (poolptr + 1) << " objects\n"; - if (previously_computed_nn_distance_tag.length()) { - cerr << items_with_previously_computed_distances - << " items had previously computed distances\n"; - } - } - - return 1; -} - -static int -allocate_pool() { - assert(pool_size > 0); - assert(nullptr == pool); - - pool = new Spread_Object[pool_size]; - if (nullptr == pool) { - cerr << "Yipes, could not allocate pool of size " << pool_size << endl; - return 62; - } - - if (verbose) { - cerr << "system sized to " << pool_size << endl; - } - - return 1; -} - -static int -build_pool(const char *fname, const IWString &previously_computed_nn_distance_tag) { - iwstring_data_source input(fname); - if (!input.ok()) { - cerr << "Cannot open '" << fname << "' for input\n"; - return 0; - } - - if (0 == pool_size) { - pool_size = input.count_records_starting_with("PCN<"); - - if (0 == pool_size) { - cerr << "Zero occurrences of '" - << "PCN<" - << "' in '" << fname << "'\n"; - return 0; - } - - if (!allocate_pool()) { - return 0; - } - } - - return build_pool(input, previously_computed_nn_distance_tag); -} - -static int -do_squeeze_pool(int &istart, int &istop) { - int iptr = 0; - for (int i = 0; i < istart; i++) { - if (pool[i].selected()) { - continue; - } - - if 
(iptr != i) { - pool[iptr] = pool[i]; - } - - iptr++; - } - - istart = 0; - istop = iptr; - - return 1; -} - -/* - After establishing initial distances, some pool members don't have a nearest selected - neighbour. re-scan the pool -*/ - -static int -rescan_for_no_neighbours(iwstring_data_source &input) { - resizable_array to_scan; - to_scan.resize(pool_size); - - for (int i = 0; i < pool_size; i++) { - if (!pool[i].has_a_nearest_selected_neighbour()) { - to_scan.add(&(pool[i])); - } - } - - int nts = to_scan.number_elements(); - - if (0 == nts) { - return 1; - } - - if (verbose) { - cerr << "After reading previously selected, " << nts << " items with no neighbour\n"; - } - - if (!input.seekg(0)) { - cerr << "rescan_for_no_neighbours: yipes, cannot seek back to beginning of file\n"; - return 0; - } - - IW_TDT tdt; - - while (tdt.next(input)) { - int fatal; - Spread_Object fp; - if (!fp.construct_from_tdt(tdt, fatal)) { - if (fatal) { - cerr << "Cannot build fingerpint\n" << tdt << endl; - return 0; - } - - continue; - } - - for (int i = 0; i < to_scan.number_elements(); i++) { - Spread_Object *p = to_scan[i]; - - if (tversky.active()) { - p->object_has_been_selected(fp, tversky); - } else { - p->object_has_been_selected(fp); - } - } - } - - return 1; -} - -static int -establish_initial_distances(iwstring_data_source &input) { - int ntdt = 0; - - IW_TDT tdt; - while (tdt.next(input)) { - int fatal; - Spread_Object fp; - if (!fp.construct_from_tdt(tdt, fatal)) { - if (fatal) { - cerr << "Cannot build fingerpint\n" << tdt << endl; - return 0; - } - - continue; - } - - ntdt++; - - // Run this fingerprint against everything in the pool - - for (int i = 0; i < pool_size; i++) { - if (tversky.active()) { - pool[i].object_has_been_selected(fp, tversky); - } else if (static_cast(0.0) != longest_distance_recognised) { - pool[i].object_has_been_selected_max_distance(fp, longest_distance_recognised); - } else { - pool[i].object_has_been_selected(fp); - } - } - - if 
(report_establish_initial_distances && - 0 == ntdt % report_establish_initial_distances) { - cerr << "Established initial distances " << ntdt << endl; - } - } - - if (0 == ntdt) { - cerr << "establish_initial_distances:: warning, no TDT's read\n"; - } - - if (verbose > 1) { - cerr << "INitial distances established\n"; - for (int i = 0; i < pool_size; i++) { - cerr << "Pool " << i << " is " << pool[i].id() << " dist " << pool[i].distance() - << endl; - } - } - - if (retest_no_neighbours) { - rescan_for_no_neighbours(input); - } - - return 1; -} - -static int -establish_initial_distances(const const_IWSubstring &fname) { - static int first_call = 1; - - if (first_call) { - for (int i = 0; i < pool_size; i++) { - pool[i].set_distance(2.0); - } - - first_call = 0; - } - - iwstring_data_source input(fname); - if (!input.good()) { - cerr << "Cannot open already selected file '" << fname << "'\n"; - return 0; - } - - if (verbose) { - cerr << "Establishing initial distances wrt '" << fname << "'\n"; - } - - return establish_initial_distances(input); -} - -/* - We use the atom count windows in an unusual manner. - Usually atom count windows are used to NOT compare molecules - that have very different atom counts. - But when doing maximum dis-similarity selections, we actually - want to find things at large distances. Therefore, invert - the use of the atom count window. Initially, only consider - molecules that have very different atom counts. 
When it - seems no longer possible to do that, stop using the atom - count windows -*/ - -static int use_atom_count_window = 0; - -static void -do_object_has_been_selected_tversky(int isel) { - Spread_Object &fpsel = pool[isel]; - - for (int i = 0; i < pool_size; i++) { - if (pool[i].selected() || i == isel) { - continue; - } - - if (static_cast(0.0) != blurr_distances) { - pool[i].object_has_been_selected(fpsel, tversky, blurr_distances); - } else { - pool[i].object_has_been_selected(fpsel, tversky); - } - } - - return; -} - -static void -do_object_has_been_selected_no_blurring(int isel) { - Spread_Object &fpsel = pool[isel]; - - for (int i = 0; i < pool_size; i++) { - if (pool[i].selected() || i == isel) { - continue; - } - - pool[i].object_has_been_selected(fpsel); - } - - return; -} - -static void -do_object_has_been_selected_with_blurring(int isel) { - Spread_Object &fpsel = pool[isel]; - - for (int i = 0; i < pool_size; i++) { - if (pool[i].selected() || i == isel) { - continue; - } - - pool[i].object_has_been_selected(fpsel, blurr_distances); - } - - return; -} - -static void -do_object_has_been_selected_with_distance_cutoff(int isel) { - Spread_Object &fpsel = pool[isel]; - for (int i = 0; i < pool_size; i++) { - if (pool[i].selected() || i == isel) { - continue; - } - - pool[i].object_has_been_selected_max_distance(fpsel, longest_distance_recognised); - } - - return; -} - -static void -do_object_has_been_selected(int isel) { - if (tversky.active()) { - do_object_has_been_selected_tversky(isel); - } else if (static_cast(0.0) != blurr_distances) { - do_object_has_been_selected_with_blurring(isel); - } else if (static_cast(0.0) != longest_distance_recognised) { - do_object_has_been_selected_with_distance_cutoff(isel); - } else { - do_object_has_been_selected_no_blurring(isel); - } - - return; -} - -static similarity_type_t -compute_the_distance(Spread_Object &fp1, Spread_Object &fp2) { - if (tversky.active()) { - return static_cast(1.0) - - 
fp1.IW_General_Fingerprint::tversky(fp2, tversky); - } - - return fp1.IW_General_Fingerprint::distance(fp2); -} - -static int -choose_largest_previously_computed_distance() { - int rc = -1; - similarity_type_t dmax = static_cast(0.0); - - for (int i = 0; i < pool_size; i++) { - similarity_type_t d = pool[i].distance(); - - if (static_cast(1.0) == d) { - continue; - } - - if (d > dmax) { - rc = i; - dmax = d; - } - } - - if (rc < 0) { - cerr << "Warning, none of " << pool_size - << " items have previously computed distances!\n"; - rc = 0; - } - - return rc; -} - -/* - No checks as to whether things have scaling factors or not -*/ - -static int -item_with_highest_scale_factor() { - float highest_scale = pool[0].scale(); - int rc = 0; - - for (int i = 1; i < pool_size; i++) { - if (pool[i].scale() > highest_scale) { - highest_scale = pool[i].scale(); - rc = i; - } - } - - return rc; -} - -/* - Since Tversky is asymmetric, we should probably do both loops 1-pool_size -*/ - -static int -do_start_with_object_furthest_from_everything(int &istart) { - int id_of_further_distance_encountered = -1; - similarity_type_t furthest_distance_encountered = 0.0; - - for (int i = 0; i < pool_size; i++) { - Spread_Object &pi = pool[i]; - - for (int j = i + 1; j < pool_size; j++) { - similarity_type_t d = compute_the_distance(pi, pool[j]); - if (d > furthest_distance_encountered) { - furthest_distance_encountered = d; - id_of_further_distance_encountered = i; - } - } - } - - istart = id_of_further_distance_encountered; - - if (verbose) { - cerr << "Starting with '" << pool[id_of_further_distance_encountered].id() - << "' dist " << furthest_distance_encountered << endl; - } - - return 1; -} - -static int -do_start_with_object_furthest_from_first(int &istart) { - resizable_array already_done; - already_done.resize(start_with_object_furthest_from_first); - similarity_type_t furthest_distance_encountered = 0.0; - int id_of_further_distance_encountered = 0; - - for (int i = 0; i < 
start_with_object_furthest_from_first; i++) { - already_done.add(istart); - - Spread_Object &fp0 = pool[istart]; - - int furthest_away = -1; - - similarity_type_t d0 = 0.0; - for (int j = 1; j < pool_size; j++) { - similarity_type_t d = compute_the_distance(pool[j], fp0); - - if (d <= d0) { - continue; - } - - if (j == istart || already_done.contains(j)) { - continue; - } - - d0 = d; - furthest_away = j; - } - - assert(furthest_away > 0); - - if (verbose) { - cerr << "Furthest from first fingerprint is " << furthest_away << " '" - << pool[furthest_away].id() << "', distance " << d0 << endl; - } - - istart = furthest_away; - - if (d0 > furthest_distance_encountered) { - furthest_distance_encountered = d0; - id_of_further_distance_encountered = istart; - } - } - - istart = id_of_further_distance_encountered; - - if (verbose) { - cerr << "Starting with '" << pool[id_of_further_distance_encountered].id() - << "' dist " << furthest_distance_encountered << endl; - } - - return 1; -} - -static int -furthest_from_already_selected() { - int rc = 0; - similarity_type_t maxd = pool[0].distance(); - - for (int i = 1; i < pool_size; i++) { - similarity_type_t d = pool[i].distance(); - - if (d <= maxd) { - continue; - } - - maxd = d; - rc = i; - } - - return rc; -} - -static int -fpobj_spread(IWString_and_File_Descriptor &output) { - int first_selected; - - if (choose_first_item_randomly) { - std::random_device rd; - std::default_random_engine generator(rd()); - std::uniform_int_distribution u(0, pool_size - 1); - first_selected = u(generator); - } else if (already_selected_molecules_present) { - first_selected = furthest_from_already_selected(); - } else if (previously_computed_nn_distance_tag.length()) { - first_selected = choose_largest_previously_computed_distance(); - } else if (first_item_is_one_with_highest_scale_factor) { - first_selected = item_with_highest_scale_factor(); - } else { - first_selected = 0; - } - - if (start_with_object_furthest_from_first) { - 
do_start_with_object_furthest_from_first(first_selected); - } else if (start_with_object_furthest_from_everything) { - do_start_with_object_furthest_from_everything(first_selected); - } - - Spread_Object &fp0 = pool[first_selected]; - fp0.set_selected(); - do_object_has_been_selected(first_selected); - - if (verbose > 1) { - cerr << "First selected '" << fp0.id() << "'\n"; - } - - output << smiles_tag << fp0.smiles() << ">\n"; - output << identifier_tag << fp0.id() << ">\n"; - - const Smiles_ID_Dist &sid = fp0.nsn(); - - if (already_selected_molecules_present || - previously_computed_nn_distance_tag.length()) { - output << smiles_tag << sid.smiles() << ">\n"; - output << identifier_tag << sid.id() << ">\n"; - output << distance_tag - << static_cast(sid.distance() * pool[first_selected].scale()) << ">\n"; - } else { - output << smiles_tag << "*>\n"; - output << identifier_tag << "*>\n"; - output << distance_tag << "1>\n"; - } - output << "|\n"; - - // We need a heuristic about when to turn off the atom count window if being used - - int strikes = 0; - - int number_selected = 1; - - int istart = 0; // first unselected point - int istop = pool_size; // last unselected point - - while (number_selected < number_to_select) { - similarity_type_t maxdist = -1.0; - int ichoose = -1; - - int firstunselected = -1; - int lastunselected = -1; - - for (int i = istart; i < istop; i++) { - if (pool[i].selected()) { - continue; - } - - if (firstunselected < 0) { - firstunselected = i; - } - lastunselected = i; - - // cerr << number_selected << " distance " << i << " '" << pool[i].id() << "' is - // " << pool[i].distance() << endl; - - if (pool[i].distance() > maxdist) { - maxdist = pool[i].distance(); - ichoose = i; - } - } - // cerr << "next " << ichoose << " '" << pool[ichoose].id() << "'\n"; - - istart = firstunselected; - istop = lastunselected + 1; - - assert(ichoose >= 0); - - Spread_Object &fpsel = pool[ichoose]; - - output << smiles_tag << fpsel.smiles() << ">\n"; - output 
<< identifier_tag << fpsel.id() << ">\n"; - const Smiles_ID_Dist &sid = fpsel.nsn(); - output << smiles_tag << sid.smiles() << ">\n"; - output << identifier_tag << sid.id() << ">\n"; - if (static_cast(1.0) != fpsel.scale()) { - output << "SCALE<" << fpsel.scale() << ">\n"; - } - output << distance_tag << fpsel.distance() - << ">\n"; // the sid object does not know about any scaling of the distance - output << "|\n"; - - if (verbose > 1) { - cerr << "Selected " << number_selected << " '" << fpsel.id() << "' (index " - << ichoose << ") distance " << fpsel.distance() << " NSN '" << sid.id() - << "'\n"; - } - - output.write_if_buffer_holds_more_than(32768); - - nearest_selected_neighbour_distance.extra(fpsel.distance()); - - number_selected++; - - fpsel.set_selected(); - - int molecules_skipped_for_being_too_close = 0; - - for (int i = istart; i < istop; i++) { - if (pool[i].selected()) { - continue; - } - - if (!use_atom_count_window) { - ; - } else if (can_be_compared(fpsel, pool[i])) // atom counts too close - { - molecules_skipped_for_being_too_close++; - continue; - } - - if (static_cast(0.0) != blurr_distances) { - pool[i].object_has_been_selected(fpsel, blurr_distances); - } else if (static_cast(0.0) != longest_distance_recognised) { - pool[i].object_has_been_selected_max_distance(fpsel, longest_distance_recognised); - } else { - pool[i].object_has_been_selected(fpsel); - } - } - - if (verbose > 2) { - cerr << molecules_skipped_for_being_too_close - << " molecules_skipped_for_being_too_close\n"; - } - - if (!use_atom_count_window) { - ; - } else if (0 == molecules_skipped_for_being_too_close) { - ; - } else if (static_cast(molecules_skipped_for_being_too_close) / - static_cast(pool_size - number_selected) > - 0.8) { - strikes++; - - if (strikes > 10) // 10 consecutive selections each skip 80% of the pool - { - use_atom_count_window = 0; - if (verbose) { - cerr << "Atom count window turned off, selected " << number_selected << endl; - } - } - } else { - strikes = 
0; - } - - if (squeeze_pool && 0 == number_selected % squeeze_pool) { - do_squeeze_pool(istart, istop); - } - - if (fpsel.distance() < stop_once_distance_drops_below) { - break; - } - } - - if (verbose) { - cerr << "Returning with " << number_selected << " items selected\n"; - } - - return number_selected; -} - -static void -usage(int rc) { -// clang-format off -#if defined(GIT_HASH) && defined(TODAY) - cerr << __FILE__ << " compiled " << TODAY << " git hash " << GIT_HASH << '\n'; -#else - cerr << __FILE__ << " compiled " << __DATE__ << " " << __TIME__ << '\n'; -#endif -// clang-format on - cerr << "Usage \n"; - cerr << " -s specify max pool size\n"; - cerr << " -n specify how many items to select\n"; - cerr << " -t stop selection once distance drops below \n"; - cerr << " -A specify file of already selected items\n"; - cerr << " -N gfp_nearneighbours has been run and initial distances are " - "in \n"; - cerr << " -p COL= distance scaling factor is column of name\n"; - cerr << " -p specify distance scaling factor in \n"; - cerr << " -p FILE= distance scaling factors in \n"; - cerr << " -p oknoscale ok if not all items have a distance scaling factor - will " - "default to 1.0\n"; - cerr << " -r report progress of initial distance assignments\n"; - cerr << " -S ... options for specifying first item selected, enter '-S " - "help'\n"; - cerr << " -M ... miscellaneous options, enter '-M help' for info\n"; - cerr << " -b \"blurr\" distances to resolution per range\n"; - // cerr << " -i specify identifier tag in pool\n"; - // cerr << " -I specify identifier tag in input file\n"; - cerr << " -F,-P,-W ... 
gfp options, enter '-F help' for details\n"; - cerr << " -V <...> Tversky conditions, enter '-V help' for details\n"; - cerr << " -v verbose output\n"; - - exit(rc); -} - -static void -display_miscellaneous_options(std::ostream &os) { - os << " -M recomp recompute distance if no nbrs found\n"; - os << " -M nscale include scale factor of nbr with scale\n"; - os << " -M squeeze=nnn squeeze out selected molecules evern steps\n"; - os << " -M ldist= all distances truncated to \n"; - - exit(3); -} - -static void -display_first_item_selection_options(std::ostream &os) { - os << " -S rand randomly choose first item\n"; - os << " -S hsf start with item with highest scale factor\n"; - os << " -S furthest start with item furthest from all other items\n"; - os << " -S fff=nnn start with item nnn times furthest from first item\n"; - - exit(1); -} - -static int -fpobj_spread(int argc, char **argv) { - Command_Line cl(argc, argv, "vs:n:I:i:A:r:p:t:F:P:W:Q:O:N:V:b:S:M:"); - - if (cl.unrecognised_options_encountered()) { - cerr << "Unrecognised options encountered\n"; - usage(1); - } - - verbose = cl.option_count('v'); - - if (cl.option_present('W')) { - use_atom_count_window = 1; - - if (verbose) { - cerr << "Atom count window specified, special processing for spread\n"; - } - } - - if (cl.option_present('V')) { - if (!tversky.parse_command_line(cl, 'V', verbose)) { - cerr << "Cannot initialise Tversky parameters\n"; - return 8; - } - } - - if (cl.option_present('M')) { - int i = 0; - IWString m; - while (cl.value('M', m, i++)) { - m.to_lowercase(); - if ("recomp" == m) { - retest_no_neighbours = 1; - if (verbose) { - cerr << "Molecules with no initial nearest neighbour will be recomputed " - "without window\n"; - } - } else if ("nscale" == m) { - if (!cl.option_present('p')) { - cerr << "NO scaling data specified (-p) so cannot use 'nscale'\n"; - return 3; - } - - set_include_scale_of_nsn_with_scale(1); - if (verbose) { - cerr << "Will include scaling factor of nearest selected 
nbr with scale\n"; - } - } else if (m.starts_with("squeeze=")) { - m.remove_leading_chars(8); - if (!m.numeric_value(squeeze_pool) || squeeze_pool < 1) { - cerr << "The squeeze= directive must be a +ve number\n"; - exit(3); - } - - if (verbose) { - cerr << "Already selected molecules will be squeezed out every " << squeeze_pool - << " selections\n"; - } - } else if (m.starts_with("ldist=")) { - m.remove_leading_chars(6); - if (!m.numeric_value(longest_distance_recognised) || - longest_distance_recognised <= 0.0f || longest_distance_recognised > 1.0f) { - cerr << "The longest recognised distance directive 'ldist=' must be a valid " - "distance\n"; - exit(2); - } - - if (verbose) { - cerr << "Long distances truncated to " << longest_distance_recognised << endl; - } - } else if ("help" == m) { - display_miscellaneous_options(cerr); - } else { - cerr << "Unrecognised -M qualifier '" << m << "\n"; - display_miscellaneous_options(cerr); - } - } - } - - if (0 == cl.number_elements()) { - cerr << "Insufficient arguments\n"; - usage(1); - } - - if (need_to_call_initialise_fingerprints(cl)) { - if (!initialise_fingerprints(cl, verbose)) { - cerr << "Cannot initialise GFP options\n"; - usage(23); - } - } else if (!initialise_fingerprints(cl[0], verbose)) { - cerr << "Cannot initialise fingerprints from '" << cl[0] << "'\n"; - return 11; - } - - if (cl.option_present('s')) { - if (!cl.value('s', pool_size) || pool_size < 1) { - cerr << "The -s option must be followed by a whole positive number\n"; - usage(3); - } - - if (!allocate_pool()) { - return 83; - } - } - - // We need to be careful with the -i and -I options. 
Remember - // that the pool is built first - - if (cl.option_present('i')) { - const_IWSubstring id; - (void)cl.value('i', id); - - set_identifier_tag(id); - - if (verbose) { - cerr << "Identifiers in pool tagged as '" << id << "'\n"; - } - } - - int scaling_factor_specified = 0; - - if (cl.option_present('p')) { - IWString tag, fname, col; - - int i = 0; - const_IWSubstring p; - - while (cl.value('p', p, i++)) { - if (p.starts_with("FILE=") && 0 == fname.length()) { - p.remove_leading_chars(5); - fname = p; - } else if (p.starts_with("COL=") && 0 == col.length()) { - p.remove_leading_chars(4); - col = p; - } else if ("oknoscale" == p) { - set_every_object_must_have_a_scale_factor(0); - } else if (0 == tag.length()) { // tag can only be specified once - tag = p; - } else { - cerr << "Unrecognised -p qualifier '" << p << "'\n"; - usage(4); - } - } - - if (0 == tag.length() && 0 == fname.length() && 0 == col.length()) { - cerr << "Must specify either tag, file name or column for the weighting factor\n"; - usage(3); - } - - if (tag.length() && fname.length() && col.length()) { - cerr << "Must specify just one tag, just one file name or a column for the " - "weighting factor\n"; - usage(3); - } - - if (tag.length()) { - set_scale_tag(tag); - - if (verbose) { - cerr << "The scale factor will be the '" << tag << "' dataitem\n"; - } - } else if (fname.length()) { - if (!read_scaling_data(fname.null_terminated_chars(), verbose)) { - cerr << "Cannot read scaling data from '" << fname << "'\n"; - return 3; - } - } else if (col.length()) { - int c; - if (!col.numeric_value(c) || c < 2) { - cerr << "Invalid scaling factor column '" << col << "'\n"; - return 4; - } - - set_scaling_factor_column(c); - } - - scaling_factor_specified = 1; - } - - if (cl.option_present('N')) { - int nset = 0; - - IWString fname; - - int i = 0; - const_IWSubstring n; - - while (cl.value('N', n, i++)) { - if (n.starts_with("FILE=")) { - n.remove_leading_chars(5); - fname = n; - nset++; - } else if 
(n.starts_with("COL=")) { - n.remove_leading_chars(4); - if (!n.numeric_value(previous_computed_distance_column) || - previous_computed_distance_column < 1) { - cerr << "The previously computed distance column must be a whole +ve number\n"; - return 3; - } - if (verbose) { - cerr << "Previously computed near neighbour distances in column " - << previous_computed_distance_column << endl; - } - - previous_computed_distance_column--; - nset++; - } else if (0 == previously_computed_nn_distance_tag.length()) { - previously_computed_nn_distance_tag = n; - if (verbose) { - cerr << "Previously computed near neighbour distances in the '" - << previously_computed_nn_distance_tag << "' tag\n"; - } - - if (!previously_computed_nn_distance_tag.ends_with('<')) { - previously_computed_nn_distance_tag += '<'; - } - - nset++; - } else { - cerr << "Unrecognised -N qualifier '" << n << "'\n"; - usage(3); - } - } - - if (nset > 1) { - cerr << "Can specify just one of FILE=, COL= or tag for previously computed " - "distances\n"; - usage(3); - } - - if (fname.length()) { - if (!previously_computed_distances.read_data(fname)) { - cerr << "Cannot read previously computed distances from '" << fname << "'\n"; - return 4; - } - - if (verbose) { - cerr << "Read " << previously_computed_distances.size() - << " previously computed nn distances from '" << fname << "'\n"; - } - } - } - - // build the pool - - for (int i = 0; i < cl.number_elements(); i++) { - if (!build_pool(cl[i], previously_computed_nn_distance_tag)) { - cerr << "Yipes, cannot build pool from '" << cl[i] << "'\n"; - return i + 1; - } - } - - pool_size = poolptr + 1; - - if (verbose && scaling_factor_specified) { - const Accumulator &sfs = scale_factor_statistics(); - cerr << sfs.n() << " of " << pool_size << " pool objects had demerit/scale factors\n"; - - if (sfs.n() > 0) { - cerr << "Scale factors between " << sfs.minval() << " and " << sfs.maxval(); - if (sfs.n() > 1) { - cerr << ", ave " << sfs.average(); - } - cerr << 
endl; - } - } - - // Now that the pool is built, we can switch identifiers if needed - - if (cl.option_present('I')) { - const_IWSubstring id; - cl.value('I', id); - - set_identifier_tag(id); - - if (verbose) { - cerr << "Identifiers in input tagged as '" << id << "'\n"; - } - } - - if (cl.option_present('r')) { - if (!cl.value('r', report_establish_initial_distances) || - report_establish_initial_distances < 1) { - cerr << "The -r option must be followed by a whole positive number\n"; - usage(18); - } - - if (verbose) { - cerr << "Will report initial neighbour assignments every " - << report_establish_initial_distances << " fingerprints\n"; - } - } - - // There can be any number of already present files. - // Each can have its own filter if the name starts with FILTER: - - if (cl.option_present('A')) { - set_scaling_factor_column(-1); // turn off scaling factor stuff - set_scale_tag(""); - set_every_object_must_have_a_scale_factor(0); - - const_IWSubstring a; - int i = 0; - while (cl.value('A', a, i++)) { - const_IWSubstring fname = a; // by default - - if (!establish_initial_distances(fname)) { - cerr << "Cannot establish initial distances from '" << fname << "'\n"; - return 54; - } - } - - already_selected_molecules_present = 1; - } else { - already_selected_molecules_present = 0; - } - - if (cl.option_present('S')) { - if (cl.option_count('S') > 1) { - cerr << "There can be only one means of selecting the first item (-S)\n"; - usage(3); - } - - if (already_selected_molecules_present) { - cerr << "Cannot specify how to select first molecule if already selected molecules " - "present(-A)\n"; - usage(3); - } - - IWString s = cl.string_value('S'); - s.to_lowercase(); - - if ("rand" == s) { - choose_first_item_randomly = 1; - if (verbose) { - cerr << "Will choose the first item randomly"; - } - } else if ("furthest" == s) { - start_with_object_furthest_from_everything = 1; - if (verbose) { - cerr << "Will start with the item furthest from all other items\n"; - } - } 
else if ("hsf" == s) { - first_item_is_one_with_highest_scale_factor = 1; - if (verbose) { - cerr << "First selected will be molecule with highest scale factor\n"; - } - } else if (s.starts_with("fff=")) { - s.remove_leading_chars(4); - if (!s.numeric_value(start_with_object_furthest_from_first) || - start_with_object_furthest_from_first < 1) { - cerr << "The times furthest from first option 'fff=nnn' must be a +ve number\n"; - display_first_item_selection_options(cerr); - } - start_with_object_furthest_from_first = 1; - - if (verbose) { - cerr << "Will start with the molecule furthest from the first molecule in the " - "set\n"; - } - } else if ("help" == s) { - display_first_item_selection_options(cerr); - } else { - cerr << "Unrecognised first item selection directive '" << s << "'\n"; - display_first_item_selection_options(cerr); - } - } - if (cl.option_present('b')) { - if (!cl.value('b', blurr_distances) || blurr_distances < static_cast(0.0)) { - cerr << "The blurr distances option (-b) must be a non negative number\n"; - usage(4); - } - - std::random_device rd; - std::default_random_engine generator(rd()); - std::uniform_real_distribution u(0.0f, 1.0f); - - if (verbose) { - cerr << "Distance blurring factor set to " << blurr_distances << '\n'; - for (int i = 0; i < 5; i++) { - similarity_type_t r = u(generator); - cerr << "distance " << r << " becomes " << do_blurring(r, blurr_distances) - << '\n'; - } - } - } - - if (cl.option_present('n')) { - int n; - if (!cl.value('n', n) || n < 1) { - cerr << "the -n option must be followed by a whole positive number\n"; - usage(13); - } - - if (n > pool_size) { - cerr << "You asked for " << n << " molecules, but pool only contains " << pool_size - << ". 
Shortened\n"; - n = pool_size; - } - - number_to_select = n; - if (verbose) { - cerr << number_to_select << " molecules will be selected\n"; - } - } else { - number_to_select = pool_size; - } - - if (cl.option_present('t')) { - if (!cl.value('t', stop_once_distance_drops_below) || - stop_once_distance_drops_below < 0.0f || stop_once_distance_drops_below >= 1.0f) { - cerr << "The stop selection distance option (-t) must be a valid distance\n"; - usage(1); - } - - if (verbose) { - cerr << "Will stop selection once distance drops below " - << stop_once_distance_drops_below << endl; - } - } - - IWString_and_File_Descriptor output(1); - - (void)fpobj_spread(output); - - if (verbose) { - cerr << "Nearest previously selected item distances between " - << nearest_selected_neighbour_distance.minval() << " and " - << nearest_selected_neighbour_distance.maxval(); - if (nearest_selected_neighbour_distance.n() > 1) { - cerr << " ave " << nearest_selected_neighbour_distance.average(); - } - cerr << endl; - } - - // delete [] pool; leave this out for efficiency - - cerr << "Output can be processed with nplotnn\n"; - - return 0; -} - -int -main(int argc, char **argv) { - int rc = fpobj_spread(argc, argv); - - return rc; -} diff --git a/src/Utilities/GFP_Tools/gfp_to_descriptors_multiple.cc b/src/Utilities/GFP_Tools/gfp_to_descriptors_multiple.cc index e0c9cd60..9d485d18 100644 --- a/src/Utilities/GFP_Tools/gfp_to_descriptors_multiple.cc +++ b/src/Utilities/GFP_Tools/gfp_to_descriptors_multiple.cc @@ -184,7 +184,7 @@ fill_output_array(const Sparse_Fingerprint& fp, } // cerr << "Bit " << zbit << " found in column " << f->second << '\n'; - tmp[(*f).second] = zcount; + tmp[(*f).second] += zcount; rc++; } diff --git a/src/Utilities/GFP_Tools/spread_v2.cc b/src/Utilities/GFP_Tools/spread_v2.cc deleted file mode 100644 index 514cdf04..00000000 --- a/src/Utilities/GFP_Tools/spread_v2.cc +++ /dev/null @@ -1,390 +0,0 @@ -#include - -#include "Foundational/iw_tdt/iw_tdt.h" -#include 
"Foundational/iwmisc/numeric_data_from_file.h" - -#include "spread_v2.h" - -using std::cerr; -using std::endl; - -static IWString scale_tag; - -static IWString smiles_tag("$SMI<"); - -int -set_scale_tag(const const_IWSubstring & tag) -{ - scale_tag = tag; - - if (0 == scale_tag.length()) // has been turned off - ; - else if (! scale_tag.ends_with('<')) - scale_tag += '<'; - - return 1; -} - -static Numeric_Data_From_File id_to_scale; - -static int scaling_factor_column = -1; - -void -set_scaling_factor_column(int c) -{ - if (c >= 1) - scaling_factor_column = c - 1; // convert to word index - else if (0 == c) - cerr << "Column zero is invalid for scaling factor column\n"; - else - scaling_factor_column = c; - - return; -} - -static int every_object_must_have_a_scale_factor = 1; - -void -set_every_object_must_have_a_scale_factor(int s) -{ - every_object_must_have_a_scale_factor = s; -} - -static int include_scale_of_nsn_with_scale = 0; - -void -set_include_scale_of_nsn_with_scale(int s) -{ - include_scale_of_nsn_with_scale = s; -} - -int -read_scaling_data(const char * fname, - int verbose) -{ - if (! 
id_to_scale.read_data(fname)) - { - cerr << "Cannot read scaling factor data from '" << fname << "'\n"; - return 0; - } - - if (verbose) - cerr << "Read " << id_to_scale.size() << " id->scale values\n"; - - return 1; -} - -/* - We keep track of statistics on scale factors -*/ - -static Accumulator scale_stats; - -const Accumulator & -scale_factor_statistics() -{ - return scale_stats; -} - -Spread_Object::Spread_Object() -{ - _selected = 0; - - _scale = 1.0; - - _scale_of_nearest_selected_nbr = 1.0; - - _scaled_distance = 0.0; - - return; -} - -Spread_Object & -Spread_Object::operator=(const Spread_Object & rhs) -{ - IW_General_Fingerprint::operator=(rhs); - - _nearest_selected_neighbour = rhs._nearest_selected_neighbour; - - _smiles = rhs._smiles; - - _selected = rhs._selected; - _scale = rhs._scale; - _scale_of_nearest_selected_nbr = rhs._scale_of_nearest_selected_nbr; - _scaled_distance = rhs._scaled_distance; - return *this; -} - -void -Spread_Object::_update_nsn_stuff(const Spread_Object & fpsel, - similarity_type_t d) -{ - _nearest_selected_neighbour.set_distance(d); - _nearest_selected_neighbour.set_smiles(fpsel.smiles()); - _nearest_selected_neighbour.set_id(fpsel.id()); - - if (include_scale_of_nsn_with_scale) - { - _scale_of_nearest_selected_nbr = fpsel.scale(); - - _scaled_distance = _scale * _scale_of_nearest_selected_nbr * d; - } - else - _scaled_distance = _scale * d; - - return; -} - -int -Spread_Object::object_has_been_selected(Spread_Object & fpsel) -{ - similarity_type_t new_distance = IW_General_Fingerprint::distance(fpsel); - - if (new_distance >= _nearest_selected_neighbour.distance()) - return 0; - - _update_nsn_stuff(fpsel, new_distance); - - return 1; -} - -int -Spread_Object::object_has_been_selected(Spread_Object & fpsel, - float blurr_distances) -{ - similarity_type_t new_distance = IW_General_Fingerprint::distance(fpsel); - - new_distance = do_blurring(new_distance, blurr_distances); - - if (new_distance >= 
_nearest_selected_neighbour.distance()) - return 0; - - _update_nsn_stuff(fpsel, new_distance); - - return 1; -} - -int -Spread_Object::object_has_been_selected(Spread_Object & fpsel, - const Tversky & tversky) -{ - similarity_type_t new_distance = static_cast(1.0) - IW_General_Fingerprint::tversky(fpsel, tversky); - - if (new_distance >= _nearest_selected_neighbour.distance()) - return 0; - - _update_nsn_stuff(fpsel, new_distance); - - return 1; -} - -int -Spread_Object::object_has_been_selected(Spread_Object & fpsel, - const Tversky & tversky, - float blurr_distances) -{ - similarity_type_t new_distance = static_cast(1.0) - IW_General_Fingerprint::tversky(fpsel, tversky); - - new_distance = do_blurring(new_distance, blurr_distances); - - if (new_distance >= _nearest_selected_neighbour.distance()) - return 0; - - _update_nsn_stuff(fpsel, new_distance); - - return 1; -} - - -int -Spread_Object::object_has_been_selected_max_distance(Spread_Object & fpsel, - similarity_type_t max_dist) -{ - similarity_type_t new_distance = IW_General_Fingerprint::distance(fpsel); - - if (new_distance > max_dist) - new_distance = max_dist; - - if (new_distance >= _nearest_selected_neighbour.distance()) - return 0; - - _update_nsn_stuff(fpsel, new_distance); - - return 1; -} - -int -Spread_Object::_determine_scale_from_column(int c) -{ - if (_id.nwords() > c) // great, our token is there - ; - else if (every_object_must_have_a_scale_factor) - { - cerr << "Spread_Object::_determine_scale_from_column:no column data '" << _id << "'\n"; - cerr << "Column " << c << endl; - return 0; - } - else - { - _scale = static_cast(1.0); - return 1; - } - - const_IWSubstring token = _id.word(c); - - if (! token.numeric_value(_scale) || _scale < 0.0) // even though a zero value makes no sense. 
- { - cerr << "Spread_Object::_determine_scale_from_column:invalid scaling value '" << _id << "'\n"; - return 0; - } - -//cerr << "Converted '" << token << "' to " << _scale << endl; - - scale_stats.extra(_scale); - - return 1; -} - -int -Spread_Object::_determine_scale_from_hash(const IW_STL_Hash_Map_float & id_to_scale) -{ - IW_STL_Hash_Map_float::const_iterator f; - - if (_id.nwords() > 1) - { - IWString tmp(_id); - tmp.truncate_at_first(' '); - f = id_to_scale.find(tmp); - } - else - f = id_to_scale.find(_id); - - if (f != id_to_scale.end()) - { - _scale = (*f).second; - - scale_stats.extra(_scale); - - return 1; - } - - if (every_object_must_have_a_scale_factor) - { - cerr << "No scaling data for '" << _id << "'\n"; - return 0; - } - - return 1; -} - -int -Spread_Object::_determine_scale_from_tag(const IW_TDT & tdt, - const IWString & scale_tag) -{ - const_IWSubstring s; - if (! tdt.dataitem_value(scale_tag, s)) // no scale tag present - return 1; - - if (! s.numeric_value(_scale) || _scale <= 0.0) - { - cerr << "Spread_Object::create_from_tdt: invalid scale/demerit value\n"; - return 0; - } - - scale_stats.extra(_scale); - - return 1; -} - -int -Spread_Object::construct_from_tdt(IW_TDT & tdt, int & fatal) -{ - if (! IW_General_Fingerprint::construct_from_tdt(tdt, fatal)) - return 0; - - _scale = 1.0; - - if (! tdt.dataitem_value(smiles_tag, _smiles)) - { - cerr << "Spread_Object::construct_from_tdt: no smiles tag '" << smiles_tag << "' in TDT\n"; - return 0; - } - - if (scale_tag.length()) - { - if (! _determine_scale_from_tag(tdt, scale_tag)) - { - fatal = 1; - return 0; - } - } - else if (id_to_scale.size()) - { - if (! _determine_scale_from_hash(id_to_scale)) - { - fatal = 1; - return 0; - } - } - else if (scaling_factor_column > 0) - { - if (! 
_determine_scale_from_column(scaling_factor_column)) - { - fatal = 1; - return 0; - } - } - -//cerr << "Object " << _id << " with scale " << _scale << endl; - - return 1; -} - -void -Spread_Object::set_nearest_previously_selected_neighbour(const IWString & nnsmiles, - const IWString & nnid, - similarity_type_t d) -{ - _nearest_selected_neighbour.set_smiles(nnsmiles); - _nearest_selected_neighbour.set_id(nnid); - _nearest_selected_neighbour.set_distance(d); - - _scaled_distance = d; - _scale_of_nearest_selected_nbr = d; - - return; -} - -similarity_type_t -do_blurring(similarity_type_t d, - float blurr_distances) -{ - float tmp = d * blurr_distances + static_cast(0.499999); - - tmp = static_cast(static_cast(tmp)); - - return tmp / blurr_distances; -} - -int -Spread_Object::set_distance_to_previously_selected_from_column(int col) -{ - const_IWSubstring token; - - if (! _id.word(col, token)) - { - cerr << "Spread_Object::set_distance_to_previously_selected_from_column:cannot extract column " << (col+1) << endl; - return 0; - } - - similarity_type_t d; - if (! 
token.numeric_value(d) || d < 0.0) - { - cerr << "Spread_Object::set_distance_to_previously_selected_from_column:invalid distance " << token << "'\n"; - return 0; - } - - _nearest_selected_neighbour.set_distance(d); - - return 1; -} diff --git a/src/Utilities/GFP_Tools/spread_v2.h b/src/Utilities/GFP_Tools/spread_v2.h deleted file mode 100644 index 53099096..00000000 --- a/src/Utilities/GFP_Tools/spread_v2.h +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef SPREADV2_H -#define SPREADV2_H - -/* - All the variants of spread need a class like this -*/ - -#include "Foundational/accumulator/accumulator.h" -#include "Foundational/iwstring/iw_stl_hash_map.h" - -#include "gfp.h" -#include "smiles_id_dist.h" - -class Tversky; - -class Spread_Object : public IW_General_Fingerprint -{ - private: - int _selected; - - Smiles_ID_Dist _nearest_selected_neighbour; - -// Our own smiles - - IWString _smiles; - - float _scale; // the demerit value - - float _scale_of_nearest_selected_nbr; - - float _scaled_distance; - -// private functions - - int _determine_scale_from_hash (const IW_STL_Hash_Map_float & id_to_scale); - int _determine_scale_from_tag (const IW_TDT & tdt, const IWString & scale_tag); - int _determine_scale_from_column (int c); - - void _update_nsn_stuff (const Spread_Object & fpsel, similarity_type_t d); - - public: - Spread_Object (); - - Spread_Object & operator= (const Spread_Object & rhs); - - void set_scale (float s) { assert (s > 0.0); _scale = s;} - float scale () const { return _scale;} - -// similarity_type_t distance () const { return _scale * _nearest_selected_neighbour.distance (); } - similarity_type_t distance () const { return _scaled_distance;} - void set_distance (similarity_type_t d) { _nearest_selected_neighbour.set_distance (d); _scaled_distance = _scale * d;} - - int construct_from_tdt (IW_TDT &, int &); - - const IWString & smiles () const { return _smiles;} - - int selected () const { return _selected;} - void set_selected () { assert (! 
_selected); _selected = 1;} - void set_selected (int s) { _selected = s;} - - const Smiles_ID_Dist & nsn () const { return _nearest_selected_neighbour;} - int has_a_nearest_selected_neighbour () const { return _nearest_selected_neighbour.id ().length ();} - - void set_nearest_previously_selected_neighbour (const IWString &, const IWString &, similarity_type_t); - - int set_distance_to_previously_selected_from_column (int); - - int object_has_been_selected (Spread_Object &); - int object_has_been_selected (Spread_Object &, float blurr_distances); - int object_has_been_selected (Spread_Object &, const Tversky & tv); - int object_has_been_selected (Spread_Object &, const Tversky & tv, float blurr_distances); - int object_has_been_selected_max_distance (Spread_Object &, float max_dist); -}; - -extern int set_scale_tag (const const_IWSubstring &); -extern int read_scaling_data (const char *, int verbose); -extern void set_every_object_must_have_a_scale_factor (int s); -extern void set_scaling_factor_column (int c); - -extern const Accumulator & scale_factor_statistics (); - -extern similarity_type_t do_blurring (similarity_type_t d, float); - -extern void set_include_scale_of_nsn_with_scale (int); - -#endif diff --git a/src/Utilities/GeneExpression/BUILD b/src/Utilities/GeneExpression/BUILD new file mode 100644 index 00000000..78793160 --- /dev/null +++ b/src/Utilities/GeneExpression/BUILD @@ -0,0 +1,121 @@ +load("@rules_proto//proto:defs.bzl", "proto_library") +load("@com_google_protobuf//:protobuf.bzl", "py_proto_library") +load("//build_deps:install.bzl", "local_install") + +local_install( + name = "install", + srcs = [ + ":gene_expression_to_proto", + ":gene_expression_nearneighbours", + ], +) + +proto_library( + name = "gene_expression_proto", + srcs = [ + "gene_expression.proto", + ], +) + +cc_proto_library( + name = "gene_expression_cc_proto", + deps = [ + ":gene_expression_proto", + ] +) + +py_proto_library( + name = "gene_expression_py_proto", + srcs = [ + 
"gene_expression.proto", + ] +) + + +cc_library( + name = "gene_expression_lib", + srcs = [ + "gene_expression.cc", + ], + hdrs = [ + "gene_expression.h", + ], + deps = [ + ":gene_expression_cc_proto", + ] +) + +cc_library( + name = "needle_lib", + srcs = [ + "needle.cc", + ], + hdrs = [ + "needle.h", + ], + deps = [ + "//Foundational/accumulator", + "//Foundational/iwstring:iwstring", + ":gene_expression_cc_proto", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_protobuf//:protobuf", + ] +) + +#cc_binary( +# name = "gene_expression_benchmark", +# srcs = [ +# "gene_expression_benchmark.cc", +# ], +# deps = [ +# ":gene_expression_cc_proto", +# ":gene_expression_lib", +# "@benchmark", +# ] +#) + +cc_binary( + name = "gene_expression_nearneighbours", + srcs = [ + "gene_expression_nearneighbours.cc", + ], + deps = [ + ":gene_expression_cc_proto", + ":gene_expression_lib", + ":needle_lib", + "//Foundational/accumulator", + "//Foundational/cmdline_v2", + "//Foundational/data_source:iwtfdata_record", + "@com_google_protobuf//:protobuf", + ], + tags = [ + "gene_expression", + ], +) + +cc_binary( + name = "gene_expression_to_proto", + srcs = [ + "gene_expression_to_proto.cc", + ], + deps = [ + ":gene_expression_cc_proto", + ":gene_expression_lib", + "//Foundational/cmdline_v2", + "//Foundational/data_source:iwstring_data_source", + "//Foundational/data_source:iwtfdata_record", + "@com_google_protobuf//:protobuf", + ] +) + +cc_test ( + name = "needle_test", + srcs = [ + "needle_test.cc", + ], + deps = [ + "needle_lib", + ":gene_expression_cc_proto", + "@googletest//:gtest_main", + ], +) diff --git a/src/Utilities/GeneExpression/gene_expression.cc b/src/Utilities/GeneExpression/gene_expression.cc new file mode 100644 index 00000000..569d1cd6 --- /dev/null +++ b/src/Utilities/GeneExpression/gene_expression.cc @@ -0,0 +1,55 @@ +#include "gene_expression.h" + +namespace gene_expression { + +GeneProfile::GeneProfile() { + _number_genes = 0; + _rank = nullptr; + 
_max_positive_connection_strength = 0; +} + +GeneProfile::~GeneProfile() { + _number_genes = 0; + delete [] _rank; +} + +double +GeneProfile::Association(const GeneProfile& rhs) const { + assert(_number_genes == rhs._number_genes); + + int64_t sum = 0; + + // could use std::inner_product, but why... + for (uint32_t i = 0; i < _number_genes; ++i) { + sum += _rank[i] * rhs._rank[i]; + } + + return static_cast(sum) / _max_positive_connection_strength; +} + +int +GeneProfile::Build(const Profile& proto) { + _id = proto.name(); + +#ifdef REIMPLEMENT_QQ + _number_genes = proto.rank_size(); + + _rank = new int[_number_genes]; + + for (uint32_t i = 0; i < _number_genes; ++i) { + _rank[i] = proto.rank(i); + } + + uint64_t sum = 0; + + for (uint32_t i = 0; i < _number_genes; ++i) { + sum += _rank[i] * _rank[i]; + } + + _max_positive_connection_strength = static_cast(sum); +#endif + + return 1; +} + +} // namespace gene_expression diff --git a/src/Utilities/GeneExpression/gene_expression.h b/src/Utilities/GeneExpression/gene_expression.h new file mode 100644 index 00000000..84c8378b --- /dev/null +++ b/src/Utilities/GeneExpression/gene_expression.h @@ -0,0 +1,37 @@ +#ifndef UTILITIES_GENEEXPRESSION_GENE_EXPRESSION_H_ +#define UTILITIES_GENEEXPRESSION_GENE_EXPRESSION_H_ + +#include + +#include "Utilities/GeneExpression/gene_expression.pb.h" + +namespace gene_expression { + +class GeneProfile { + private: + uint32_t _number_genes; + int32_t* _rank; + + double _max_positive_connection_strength; + + // Profile _profile; + + std::string _id; + + public: + GeneProfile(); + ~GeneProfile(); + + int Build(const Profile& proto); + + double Association(const GeneProfile& rhs) const; + + const std::string& id() const { + return _id; + } +}; + + +} // namespace gene_expression + +#endif // UTILITIES_GENEEXPRESSION_GENE_EXPRESSION_H_ diff --git a/src/Utilities/GeneExpression/gene_expression.proto b/src/Utilities/GeneExpression/gene_expression.proto new file mode 100644 index 
00000000..81a76951 --- /dev/null +++ b/src/Utilities/GeneExpression/gene_expression.proto @@ -0,0 +1,14 @@ +syntax = "proto3"; + +package gene_expression; + +message Gene { + uint32 gene_id = 1; + float score = 2; +} + +message Profile { + optional string name = 1; + + repeated Gene gene = 2; +} diff --git a/src/Utilities/GeneExpression/gene_expression_benchmark.cc b/src/Utilities/GeneExpression/gene_expression_benchmark.cc new file mode 100644 index 00000000..0f5b472a --- /dev/null +++ b/src/Utilities/GeneExpression/gene_expression_benchmark.cc @@ -0,0 +1,70 @@ +// Benchmarking for the GeneProfile class + +#include +#include +#include +#include + +#include "benchmark/benchmark.h" + +#include "gene_expression.h" + +namespace gene_expression { + +namespace { +// FIll the `ngenes` rank values in `proto` with random values +void +FillRank(int ngenes, int* tmp, std::mt19937_64& rng, Profile& proto) { + std::iota(tmp, tmp + ngenes, 1); + std::shuffle(tmp, tmp + ngenes, rng); + + std::bernoulli_distribution bernouilli(0.50); + + for (int i = 0; i < ngenes; ++i) { + if (bernouilli(rng)) { + tmp[i] = - tmp[i]; + } + } + + proto.mutable_rank()->Reserve(ngenes); + for (int i = 0; i < ngenes; ++i) { + proto.add_rank(tmp[i]); + } +} + +void +BM_Association(benchmark::State& state) { + const int ngenes = state.range(0); + + GeneProfile p1; + GeneProfile p2; + + Profile proto1; + Profile proto2; + + proto1.set_name("id1"); + proto2.set_name("id2"); + + std::random_device rd; + std::mt19937_64 rng(rd()); + + std::unique_ptr tmp = std::make_unique(ngenes); + + FillRank(ngenes, tmp.get(), rng, proto1); + FillRank(ngenes, tmp.get(), rng, proto2); + + p1.Build(proto1); + p2.Build(proto2); + + double total = 0.0; + for (auto _ : state) { + total += p1.Association(p2); + } +} +BENCHMARK(BM_Association)->RangeMultiplier(2)->Range(64, 512); + +} // namespace + +} // namespace gene_expression + +BENCHMARK_MAIN(); diff --git a/src/Utilities/GeneExpression/gene_expression_nearneighbours.cc 
b/src/Utilities/GeneExpression/gene_expression_nearneighbours.cc new file mode 100644 index 00000000..7ed00aac --- /dev/null +++ b/src/Utilities/GeneExpression/gene_expression_nearneighbours.cc @@ -0,0 +1,414 @@ +// Nearest neighbours for serialized Profile gene expression data + +#include +#include +#include +#include + +#define RESIZABLE_ARRAY_IMPLEMENTATION + +#include "Foundational/accumulator/accumulator.h" +#include "Foundational/cmdline_v2/cmdline_v2.h" +#include "Foundational/data_source/tfdatarecord.h" +#include "Foundational/iwmisc/report_progress.h" + +#include "gene_expression.h" +#include "needle.h" + +namespace gene_expression { + +using std::cerr; +using iw_tf_data_record::TFDataReader; + +void +Usage(int rc) { +// clang-format off +#if defined(GIT_HASH) && defined(TODAY) + cerr << __FILE__ << " compiled " << TODAY << " git hash " << GIT_HASH << '\n'; +#else + cerr << __FILE__ << " compiled " << __DATE__ << " " << __TIME__ << '\n'; +#endif + cerr << R"(Near neighbour finder for serialized Profile gene expression data. +Implements ideas from: + A simple and robust method for connecting small-molecule drugs using gene-expression signatures + Shu-Dong Zhang* and Timothy W Gant* BMC Bioinformatics 2008, 9:258 doi:10.1186/1471-2105-9-258 + +Input consists of two TFDataRecord files of serialized Profile protos, likely generated by +`gene_expression_to_proto`. +One file, the -needles file, is a file of needles. The file(s) presented as arguments are the haystack. +Output is to stdout. + +For every item in the needles file, the -n closest neighbours from the haystack are accumulated. + -needles file of TFDataRecord serialized Profile protos - needles. + -Hmaxrank only consider the top ranked genes in the Haystack (suggest 100 or more) + -Nmaxrank only consider the top ranked genes in the Needles + If neither -Hmaxrank not -Nmaxrank are specified, all genes in the input + files are used. + -n number of neighbours to find. 
+ -maxgeneid gene ids above are placed in a hash rather than an array. + this will lower memory consumption at the expense of compute time. + Suggest a value like 100000 + -v verbose output. +)"; + // clang-format on + // clang-format off + ::exit(rc); +} + + +using needle::Needle; + +class Options { + private: + int _verbose; + + // There are two maxrank values. If neither are specified, we use + // all the data in the input files. + uint32_t _haystack_maxrank; + uint32_t _needle_maxrank; + + int _number_needles; + Needle* _needles; + + uint64_t _haystack_members_read; + + Report_Progress _report_progress; + + // The number of items retrieved with unit similarity. + uint64_t _exact_matches_found; + + // We can optionally write the gene_id's of the matched genes. + int _write_matched_genes; + + // Statistics across all needles and all nbrs + Accumulator _acc_score; + + public: + Options(); + ~Options(); + + int Initialise(Command_Line_v2& cl); + + int Compare(const gene_expression::Profile& haystack); + + uint64_t haystack_members_read() const { + return _haystack_members_read; + } + + // Non cost because it also gathers nearest neighbour statistics. + int WriteNeighbours(IWString_and_File_Descriptor& output); + + int Report(std::ostream& output) const; +}; + +Options::Options() { + _verbose = 0; + _haystack_maxrank = std::numeric_limits::max(); + _needle_maxrank = std::numeric_limits::max(); + + _haystack_members_read = 0; + + _number_needles = 0; + _needles = nullptr; + + _write_matched_genes = 0; + + _exact_matches_found = 0; +} + +Options::~Options() { + if (_needles != nullptr) { + delete [] _needles; + } +} + +uint32_t +RecordsRemaining(TFDataReader& tfdata_reader) { + for (uint32_t rc = 0; ; ++rc) { + std::optional s = tfdata_reader.Next(); + if (! s) { + tfdata_reader.seek_zero(); + return rc; + } + } + + cerr << "RecordsRemaining:should not come to here\n"; + + // Should not come here. 
+ return 0; +} + +int +Options::Initialise(Command_Line_v2& cl) { + _verbose = cl.option_present('v'); + + if (cl.option_present("Hmaxrank")) { + if (! cl.value("Hmaxrank", _haystack_maxrank) || _haystack_maxrank < 1) { + cerr << "Invalid -Hmaxrank " << _haystack_maxrank << '\n'; + return 0; + } + if (_verbose) { + cerr << "Will consider the " << _haystack_maxrank << " highest scores from the haystack\n"; + } + } + + if (cl.option_present("Nmaxrank")) { + if (! cl.value("Nmaxrank", _needle_maxrank) || _needle_maxrank < 1) { + cerr << "Invalid -Nmaxrank " << _needle_maxrank << '\n'; + return 0; + } + + if (_verbose) { + cerr << "Will consider the " << _needle_maxrank << " highest scores from the needles\n"; + } + } + + if (cl.option_present('n')) { + if (! cl.value('n', Needle::_number_neighbours)) { + cerr << "Invalid number of neighbours (-n)\n"; + return 0; + } + + if (_verbose) { + cerr << "Will keep a max of " << Needle::_number_neighbours << " neighbours\n"; + } + } + + if (cl.option_present("maxgeneid")) { + uint32_t tmp; + cl.value("maxgeneid", tmp); + if (tmp < 10000) { + cerr << "Unrealistic value for -maxgeneid " << tmp << '\n'; + return 0; + } + needle::set_large_gene_id_threshold(tmp); + if (_verbose) { + cerr << "Large gene id threshold " << tmp << '\n'; + } + } + + if (! cl.option_present("needles")) { + cerr << "Must specify file of needles via the -needles option\n"; + Usage(1); + } + + if (cl.option_present("needles")) { + IWString fname = cl.string_value("needles"); + + TFDataReader tfdata_reader(fname); + if (! 
tfdata_reader.good()) { + cerr << "Cannot open needles file '" << fname << "'\n"; + return 0; + } + + _number_needles = RecordsRemaining(tfdata_reader); + cerr << "Find " << _number_needles << " needles in input\n"; + if (_number_needles == 0) { + cerr << "Cannot determine number needles in '" << fname << "'\n"; + return 0; + } + + _needles = new Needle[_number_needles]; + + for (int i = 0; i < _number_needles; ++i) { + std::optional proto = tfdata_reader.ReadProto(); + if (! proto) { + cerr << "Reading needle failed " << i << '\n'; + return 0; + } + + _needles[i].Build(*proto, _needle_maxrank); + } + + Accumulator_Int acc_hash; + for (int i = 0; i < _number_needles; ++i) { + acc_hash.extra(_needles[i].GeneIdsInHash()); + } + + if (_verbose) { + cerr << "Read " << _number_needles << " needles from '" << fname << "'\n"; + cerr << "Have btw " << acc_hash.minval() << " and " << acc_hash.maxval() << + " items in the large gene id hash\n"; + } + } + + + if (cl.option_present("genes")) { + _write_matched_genes = 1; + if (_verbose) { + cerr << "Output will include a list of matched genes\n"; + } + } + + if (cl.option_present("rpt")) { + uint32_t rpt; + cl.value("rpt", rpt); + _report_progress.set_report_every(rpt); + if (_verbose) { + cerr << "Will report progress every " << rpt << " gene expression profiles read\n"; + } + } + + return 1; +} + +int +Options::Compare(const gene_expression::Profile& haystack) { + ++_haystack_members_read; + + if (_report_progress()) { + cerr << "Read " << _haystack_members_read << " haystack members\n"; + } + + static int first_call = true; + if (first_call) { + uint32_t genes_in_haystack = haystack.gene_size(); + cerr << "First call genes_in_haystack " << genes_in_haystack << '\n'; + if (_haystack_maxrank < genes_in_haystack) { + _needles[0].SetMaxPossibleAssociation(_haystack_maxrank); + } else { + _needles[0].SetMaxPossibleAssociation(genes_in_haystack); + } + + if (_write_matched_genes) { + needle::set_accumulate_matching_genes(1); + } 
+ + first_call = false; + } + + + // cerr << "Haystack has " << haystack.gene_size() << " genes stored\n"; + for (int i = 0; i < _number_needles; ++i) { + _needles[i].Compare(haystack, _needle_maxrank, _haystack_maxrank); + } + + return 1; +} + +int +Options::WriteNeighbours(IWString_and_File_Descriptor& output) { + // first work out the max number of genes in common. + Accumulator_Int acc_genes_in_common; + + for (int i = 0; i < _number_needles; ++i) { + uint32_t m = _needles[i].MaxNumberGenes(); + acc_genes_in_common.extra(m); + } + + if (_verbose && acc_genes_in_common.minval() < acc_genes_in_common.maxval()) { + cerr << "Genes in common btw " << acc_genes_in_common.minval() << + " and " << acc_genes_in_common.maxval() << '\n'; + } + + const uint32_t max_genes_in_common = acc_genes_in_common.maxval(); + + for (int i = 0; i < _number_needles; ++i) { + if (_verbose) { + _needles[i].UpdateNnbrStatistics(_exact_matches_found, _acc_score); + } + _needles[i].WriteNeighbours(max_genes_in_common, output); + } + + return output.good(); +} + +int +Options::Report(std::ostream& output) const { + output << "Report on " << _number_needles << " needles that compared " << + _haystack_members_read << " haystack members\n"; + + if (_verbose == 0) { + return output.good(); + } + + output << "Found " << _exact_matches_found << " exact matches\n"; + output << "Scores btw " << _acc_score.minval() << " and " << _acc_score.maxval(); + if (_acc_score.n() > 1) { + output << " ave " << static_cast(_acc_score.average()); + } + output << '\n'; + return output.good(); +} + +int +GeneExpressionNearNeighbours(Options& options, + const gene_expression::Profile& proto) { + return options.Compare(proto); +} + +int +GeneExpressionNearNeighbours(Options& options, + TFDataReader& input) { + while (true) { + std::optional maybe_proto = input.ReadProto(); + if (! maybe_proto) { + return 1; + } + + if (! 
GeneExpressionNearNeighbours(options, *maybe_proto)) { + cerr << "Error processing " << maybe_proto->ShortDebugString() << '\n'; + return 0; + } + + if (options.haystack_members_read() > std::numeric_limits::max()) { + cerr << "Early return\n"; + return 1; + } + } +} + +int +GeneExpressionNearNeighbours(Options& options, + const char* fname) { + TFDataReader input(fname); + if (! input.good()) { + cerr << "Cannot open '" << fname << "'\n"; + return 0; + } + + return GeneExpressionNearNeighbours(options, input); +} + +int +Main(int argc, char** argv) { + Command_Line_v2 cl(argc, argv, "-v-Hmaxrank=ipos-n=ipos-needles=sfile-rpt=ipos-Nmaxrank=ipos-genes-maxgeneid=ipos"); + if (cl.unrecognised_options_encountered()) { + cerr << "unrecognised_options_encountered\n"; + Usage(1); + } + + if (cl.empty()) { + cerr << "Must specify haystack file(s) as command line arguments\n"; + Usage(1); + } + + Options options; + if (! options.Initialise(cl)) { + return 1; + } + + for (const char* fname : cl) { + if (! GeneExpressionNearNeighbours(options, fname)) { + cerr << "Error processing '" << fname << "'\n"; + return 1; + } + } + + IWString_and_File_Descriptor output(1); + + options.WriteNeighbours(output); + + output.flush(); + + options.Report(cerr); + + return 0; +} + +} // namespace gene_expression + +int +main(int argc, char** argv) { + gene_expression::Main(argc, argv); +} diff --git a/src/Utilities/GeneExpression/gene_expression_to_proto.cc b/src/Utilities/GeneExpression/gene_expression_to_proto.cc new file mode 100644 index 00000000..a04b3f56 --- /dev/null +++ b/src/Utilities/GeneExpression/gene_expression_to_proto.cc @@ -0,0 +1,441 @@ +// Convert csv gene expression data to proto form. 
+ +#include +#include +#include +#include + +#include "Foundational/cmdline_v2/cmdline_v2.h" +#include "Foundational/data_source/iwstring_data_source.h" +#include "Foundational/data_source/tfdatarecord.h" +#include "Foundational/iwmisc/report_progress.h" + +#include "Utilities/GeneExpression/gene_expression.pb.h" + +namespace gene_expression_to_proto { + +using std::cerr; + +using iw_tf_data_record::TFDataWriter; + +namespace fs = std::filesystem; + +void +Usage(int rc) { +// clang-format off +#if defined(GIT_HASH) && defined(TODAY) + cerr << __FILE__ << " compiled " << TODAY << " git hash " << GIT_HASH << '\n'; +#else + cerr << __FILE__ << " compiled " << __DATE__ << " " << __TIME__ << '\n'; +#endif + // clang-format on + // clang-format off + cerr << R"(Converts csv of gene expression data to proto form +-d input files are to be interpreted as directories. All files ending in are processed. +-S name of output file (a .dat suffix will be added). +-n max number of items per file if generating multiple files. +-fname store only the file name, not the full path name. +-maxrank only store the top genes - can save a lot of space and time. +-rpt report progress every items processed +-v verbose output +)"; + + ::exit(rc); +} + +class Options { + private: + int _verbose; + + // By default, we put all output in one file, but if this is specified, we + // can create a sequence of files. + uint32_t _max_items_per_file; + + // We can create a sequence of files if _max_items_per_file is specified. + // When creating a sequence of files we need to be able to form the name + // of the next file to be created. + int _file_index; + + // If creating multiple files, we need to know how many items have + // been written to the current file. + uint32_t _items_in_current_file; + + int _ignore_errors; + + std::unique_ptr _output; + + // The file name stem. + IWString _stem; + + IWString _suffix; + + // We often get full path names as input. 
If set, store only the file + // name component. + int _name_is_file_name; + + // Deliberate choice to make this int rather than uint32_t since + // proto.gene_size() returns an int. + int _maxrank; + + uint64_t _protos_written; + + // Keep track of the number of genes written with each proto. + extending_resizable_array _gene_size; + + Report_Progress _report_progress; + + // private functions + int OpenNextFile(); + + public: + Options(); + int Initialise(Command_Line_v2& cl); + + int SetName(const IWString& fname, gene_expression::Profile& proto) const; + + int maxrank() const { + return _maxrank; + } + + void GeneSize(int s) { + ++_gene_size[s]; + } + + int Write(const gene_expression::Profile& proto); + + int Report(std::ostream& output) const; +}; + +Options::Options() { + _verbose = 0; + _file_index = 0; + _items_in_current_file = 0; + _protos_written = 0; + _maxrank = std::numeric_limits::max(); + _max_items_per_file = std::numeric_limits::max(); + _name_is_file_name = 0; + _ignore_errors = 0; + + _suffix = ".dat"; +} + +int +Options::Initialise(Command_Line_v2& cl) { + _verbose = cl.option_count('v'); + + if (! cl.option_present('S')) { + cerr << "Must specfy output file via the -S option\n"; + Usage(1); + } + + if (cl.option_present("ignore_errors")) { + _ignore_errors = 1; + if (_verbose) { + cerr << "Will ignore otherwise fatal errors\n"; + } + } + + cl.value('S', _stem); + if (_verbose) { + cerr << "Output to '" << _stem << "'\n"; + } + + if (cl.option_present('n')) { + if (! 
cl.value('n', _max_items_per_file)) { + cerr << "The max number of items per file must be a whole +ve number\n"; + Usage(1); + } + if (_verbose) { + cerr << "Will write a max of " << _max_items_per_file << " items to each output file\n"; + } + } + + if (cl.option_present("maxrank")) { + cl.value("maxrank", _maxrank); + if (_maxrank <= 0) { + cerr << "Invalid maxrank " << _maxrank << '\n'; + return 0; + } + if (_verbose) { + cerr << "Will write a max of " << _maxrank << " expression values\n"; + } + } + + if (cl.option_present("fname")) { + _name_is_file_name = 1; + if (_verbose) { + cerr << "Will only store the file name component\n"; + } + } + + if (cl.option_present("rpt")) { + uint32_t rpt; + cl.value("rpt", rpt); + _report_progress.set_report_every(rpt); + if (_verbose) { + cerr << "Will report progress every " << rpt << " protos stored\n"; + } + } + + return 1; +} + +int +Options::OpenNextFile() { + IWString fname; + if (_max_items_per_file == std::numeric_limits::max()) { + fname << _stem; + fname.EnsureEndsWith(_suffix); + } else { + fname << _stem << _file_index << _suffix; + } + + if (_output) { + _output.reset(); + } + + _output = std::make_unique(); + if (! _output->Open(fname)) { + cerr << "Open::OpenNextFile:cannot open '" << fname << "'\n"; + return 0; + } + + return 1; +} + +int +Options::Write(const gene_expression::Profile& proto) { + if (_verbose > 1) { + cerr << "Writing " << proto.name() << '\n'; + } + + if (_report_progress()) { + Report(cerr); + } + + if (! _output) { + if (! OpenNextFile()) { + return 0; + } + } else if (_items_in_current_file >= _max_items_per_file) { + if (! 
OpenNextFile()) { + cerr << "Options::Write:cannot open next file " << _file_index << '\n'; + return 0; + } + _items_in_current_file = 0; + } + + if (!_output->WriteSerializedProto(proto)) { + cerr << "Cannot write\n"; + return 0; + } + + ++_items_in_current_file; + ++_protos_written; + + return 1; +} + +int +Options::SetName(const IWString& fname, gene_expression::Profile& proto) const { + if (! _name_is_file_name) { + proto.set_name(fname.data(), fname.size()); + return 1; + } + + std::string tmp(fname.data(), fname.size()); + // cerr << "Setting name '" << fs::path(fname.data()).filename() << "'\n"; + proto.set_name(fs::path(tmp).filename()); + return 1; +} + +int +Options::Report(std::ostream& output) const { + output << "Wrote " << _protos_written << " protos\n"; + for (int i = 0; i < _gene_size.number_elements(); ++i) { + if (_gene_size[i]) { + output << _gene_size[i] << " protos had " << i << " genes\n"; + } + } + return 1; +} + +int +AddExpressionData(const const_IWSubstring& line, gene_expression::Profile& proto) { + int i = 0; + const_IWSubstring s_gene, s_expression; + + constexpr char kComma = ','; + + if (! line.nextword(s_gene, i, kComma) || ! line.nextword(s_expression, i, kComma) || + s_gene.empty() || s_expression.empty()) { + cerr << "Options::Process:invalid input '" << line << "'\n"; + return 0; + } + + uint32_t gene; + if (! s_gene.numeric_value(gene)) { + cerr << "Options::Process:invalid gene number '" << s_gene << "'\n"; + return 0; + } + + double expression; + if (! s_expression.numeric_value(expression)) { + cerr << "Options::Process:invalid expression '" << s_expression << "'\n"; + return 0; + } + + gene_expression::Gene* g = proto.add_gene(); + g->set_gene_id(gene); + g->set_score(static_cast(expression)); + + return 1; +} + +int +GeneExpressionToProtoInner(IWString& fname, Options& options) { + iwstring_data_source input; + if (! 
input.open(fname)) { + cerr << "GeneExpressionToProtoInner:cannot open '" << fname << "'\n"; + return 0; + } + + if (fname.ends_with(".csv")) { + fname.chop(4); + } + + gene_expression::Profile proto; + options.SetName(fname, proto); + + const_IWSubstring line; + + if (! input.next_record(line)) { + cerr << "Cannot read header\n"; + return 0; + } + + while (input.next_record(line)) { + if (! AddExpressionData(line, proto)) { + cerr << "Error processing '" << line << "'\n"; + return 0; + } + if (proto.gene_size() >= options.maxrank()) { + break; + } + } + + options.GeneSize(proto.gene_size()); + // cerr << "Writing proto with " << proto.gene_size() << " items\n"; + + return options.Write(proto); +} + +int +GeneExpressionToProtoFile(IWString& fname, Options& options) { + iwstring_data_source input; + if (! input.open(fname)) { + cerr << "GeneExpressionToProtoFile:cannot open '" << fname << "'\n"; + return 0; + } + + IWString line; + while (input.next_record(line)) { + if (! GeneExpressionToProtoInner(line, options)) { + cerr << "Fatal error processing '" << line << "'\n"; + return 0; + } + } + + return 1; +} + +int +GeneExpressionToProto(IWString& fname, Options& options) { + if (fname.starts_with("F:")) { + fname.remove_leading_chars(2); + return GeneExpressionToProtoFile(fname, options); + } else { + return GeneExpressionToProtoInner(fname, options); + } +} + +int +GeneExpressionToProtoDir(const std::string& dirname, const IWString& suffix, + Options& options) { + for (const auto& entry : fs::directory_iterator(dirname)) { + std::filesystem::path fname = entry.path(); + IWString tmp(fname.string()); + if (! tmp.ends_with(suffix)) { + continue; + } + if (! 
GeneExpressionToProtoInner(tmp, options)) { + cerr << "GeneExpressionToProtoDir:error processing '" << tmp << "'\n"; + return 0; + } + } + + return 1; +} + +int +Main(int argc, char** argv) { + Command_Line_v2 cl(argc, argv, "-v-S=s-n=ipos-rpt=ipos-d=s-fname-maxrank=i-ignore_errors"); + if (cl.unrecognised_options_encountered()) { + cerr << "unrecognised_options_encountered\n"; + Usage(1); + } + + const int verbose = cl.option_present('v'); + + if (cl.empty()) { + cerr << "Insufficient arguments\n"; + Usage(0); + } + + IWString directory_suffix; + if (cl.option_present('d')) { + cl.value('d', directory_suffix); + if (verbose) { + cerr << "Input files interpreted as directories\n"; + } + } + + Options options; + + if (! options.Initialise(cl)) { + cerr << "Cannot initialise output data\n"; + return 1; + } + + for (const char* tmp : cl) { + if (directory_suffix.empty()) { + IWString fname(tmp); + if (! GeneExpressionToProto(fname, options)) { + cerr << "Error processing '" << fname << "'\n"; + return 1; + } + } else { + std::string dirname(tmp); + if (! 
GeneExpressionToProtoDir(dirname, directory_suffix, options)) { + cerr << "Error processing directory '" << dirname << "'\n"; + return 1; + } + } + + } + + if (verbose) { + options.Report(cerr); + } + + return 0; +} + +} // namespace gene_expression_to_proto + +int +main(int argc, char **argv) { + int rc = gene_expression_to_proto::Main(argc, argv); + + return rc; +} diff --git a/src/Utilities/GeneExpression/needle.cc b/src/Utilities/GeneExpression/needle.cc new file mode 100644 index 00000000..748fc41b --- /dev/null +++ b/src/Utilities/GeneExpression/needle.cc @@ -0,0 +1,456 @@ +#include +#include +#include + +#include "Foundational/accumulator/accumulator.h" + +#include "needle.h" + +namespace needle { + +using std::cerr; + +uint32_t Needle::_number_neighbours = 1; +double* Needle::_max_possible_association = nullptr; +int Needle::_accumulate_matching_genes = 0; + +// If gene ids that are above this number (if non zero) then those +// gene ids get placed into a hash rather than to the _gene_to_index array. +uint32_t Needle::_large_gene_id_threshold = 100000; + +void +set_accumulate_matching_genes(int s) { + Needle::_accumulate_matching_genes = s; +} + +void set_large_gene_id_threshold(uint32_t s) { + Needle::_large_gene_id_threshold = s; +} + +Needle::Needle() { + _number_genes = 0; + _highest_gene_number = 0; + _gene_to_index = nullptr; + _sign = nullptr; + _lhs = nullptr; + _rank = nullptr; +} + +Needle::~Needle() { + if (_gene_to_index != nullptr) { + delete [] _gene_to_index; + } + + delete [] _sign; + delete [] _lhs; + delete [] _rank; +} + +void +Needle::SetMaxPossibleAssociation(uint32_t maxrank) { + + _max_possible_association = new double[maxrank + 1]; + + // This should never be used since this is a division. 
+ _max_possible_association[0] = 0; + + uint64_t sum = 0; + for (uint32_t i = 1; i <= maxrank; ++i) { + uint64_t tmp = i * i; + sum += tmp; + _max_possible_association[i] = static_cast(sum); + // cerr << i << " sum " << sum << '\n'; + } + + cerr << "Needle::SetMaxPossibleAssociation:max possible rank " << maxrank << + " max_sum " << _max_possible_association[maxrank - 1] << '\n'; + + return; +} + +static constexpr int kSwitchToSparse = 100000; + +int +Needle::Build(const gene_expression::Profile& proto, uint32_t maxrank) { + _name = proto.name(); + _number_genes = proto.gene_size(); + _lhs = new int[_number_genes]; + _rank = new int[_number_genes]; + _sign = new int[_number_genes]; + + _highest_gene_number = 0; + for (const gene_expression::Gene& g : proto.gene()) { + if (g.gene_id() > _highest_gene_number) { + _highest_gene_number = g.gene_id(); + } + } + + // cerr << " cmp lar " << _large_gene_id_threshold << " high " << _highest_gene_number << '\n'; + if (_large_gene_id_threshold > 0 && _highest_gene_number > _large_gene_id_threshold) { + _gene_to_index = new int[_large_gene_id_threshold + 1]; + _max_gene_id_in_index = _large_gene_id_threshold; + } else { + _gene_to_index = new int[_highest_gene_number + 1]; + _max_gene_id_in_index = _highest_gene_number; + } + + if (_max_gene_id_in_index > 10000000) { + cerr << "WARNING: highest gene number " << _highest_gene_number << + " use the -maxgeneid option to lower memory requirements\n"; + } + + std::fill_n(_gene_to_index, _max_gene_id_in_index + 1, -1); + + for (int i = 0; i < proto.gene_size(); ++i) { + uint32_t g = proto.gene(i).gene_id(); + if (g <= _max_gene_id_in_index) { + _gene_to_index[g] = i; + } else { + _large_gene_to_index[g] = i; + } + if (proto.gene(i).score() < 0.0) { + _sign[i] = -1; + } else { + _sign[i] = 1; + } + } + + return 1; +} + +int +Needle::GeneToIndex(uint32_t gene_id) const { + if (gene_id <= _max_gene_id_in_index) { + return _gene_to_index[gene_id]; + } + if (gene_id > 
_highest_gene_number) { + return -1; + } + + auto iter = _large_gene_to_index.find(gene_id); + if (iter == _large_gene_to_index.end()) { + return -1; + } + + return iter->second; +} + +template +int CompareScores(float s1, float s2, CMP) { + return CMP(std::abs(s1), std::abs(s2)); +} + +// #define CHECK_SORTED + +// `proto` is a member of the haystack. +// needle_max_rank and haystack_max_rank are what were specified on the command line. +int +Needle::Compare(const gene_expression::Profile& proto, + uint32_t needle_max_rank, + uint32_t haystack_max_rank) { + + uint32_t istop_rhs = proto.gene_size(); + // cerr << "Haystack has " << proto.gene_size() << " genes, cmp " << haystack_max_rank << '\n'; + if (istop_rhs > haystack_max_rank) { + istop_rhs = haystack_max_rank; + } + + uint32_t istop_lhs = _number_genes; + if (istop_lhs > needle_max_rank) { + istop_lhs = needle_max_rank; + } + +#ifdef CHECK_SORTED + cerr << "Check " << istop_lhs << " genes, needle_max_rank " << needle_max_rank << " istop_rhs " << istop_rhs << '\n'; +#endif + + // Mark all the genes as uninvolved. + std::fill_n(_lhs, istop_lhs, 0); + + // Make a list of the genes from the RHS that are being used. + + std::vector rhs_needed; + rhs_needed.reserve(istop_rhs); + + for (uint32_t i = 0; i < istop_rhs; ++i) { + const gene_expression::Gene& g = proto.gene(i); + uint32_t gene_id = g.gene_id(); + int ndx = GeneToIndex(gene_id); + if (ndx < 0) { + continue; + } + + if (static_cast(ndx) > needle_max_rank) { + continue; + } + + rhs_needed.push_back(i); + // Mark the left hand side of the comparison. + _lhs[ndx] = 1; + } + + // No genes in common within any maxrank settings. + if (rhs_needed.empty()) { + cerr << "No genes to be compared\n"; + return 1; + } + + // Assign Needle ranks. + int rank = 0; + for (uint32_t i = 0; i < istop_lhs; ++i) { + if (_lhs[i] == 0) { + continue; + } + _rank[i] = rank; + ++rank; + } + + // Loop through all the haystack genes being processed. 
+ // Fetch the corresponding item from the lhs, get ranks and compute... + + int64_t sum = 0; + uint32_t in_common = rhs_needed.size(); + for (uint32_t i = 0; i < in_common; ++i) { + uint32_t rhs = rhs_needed[i]; + const gene_expression::Gene& gene = proto.gene(rhs); + uint32_t gene_id = gene.gene_id(); + uint32_t lhs = GeneToIndex(gene_id); + + int64_t rhs_signed_rank; + if (gene.score() < 0.0) { + rhs_signed_rank = -static_cast(in_common - i); + } else { + rhs_signed_rank = static_cast(in_common - i); + } + // cerr << "in_common " << in_common << " i " << i << " rhs_signed_rank " << rhs_signed_rank << '\n'; + assert(_rank[lhs] >= 0); + + int64_t lhs_signed_rank; + if (_sign[lhs] < 0) { + lhs_signed_rank = - static_cast(in_common - _rank[lhs]); + } else { + lhs_signed_rank = static_cast(in_common - _rank[lhs]); + } + sum += lhs_signed_rank * rhs_signed_rank; + + // cerr << i << " rhs " << rhs << " gene_id " << gene_id << " lhs " << lhs << " rank " << lhs_signed_rank << " rhs " << rhs_signed_rank << " sum " << sum << '\n'; + } + + if (sum == 0) { + return 1; + } + + if (sum > _max_possible_association[in_common]) { + cerr << " sum " << sum << " in_common " << in_common << " max " << _max_possible_association[in_common] << '\n'; + } + float score = (static_cast(sum) / _max_possible_association[in_common]); + +#ifdef STORE_MATCHING_GENES + // Convert rhs_needed from indices into gene ids + for (uint32_t i = 0; i < rhs_needed.size(); ++i) { + uint32_t j = rhs_needed[i]; + rhs_needed[i] = proto.gene(j).gene_id(); + } +#endif + return InsertIntoNbrList(proto.name(), score, in_common, rhs_needed); +} + +int +Needle::InsertIntoNbrList(const std::string& name, float score, int in_common, + const std::vector& rhs_needed) { + + if (rhs_needed.empty()) { + cerr << "rhs_needed empty!\n"; + } +#ifdef CHECK_SORTED + cerr << "sum " << sum << " Storing " << score << " current size " << _nbr_list.size() << '\n'; + cerr << _max_possible_association[in_common] << " max possible, " << 
in_common << " in common\n"; +#endif + assert(std::abs(score) <= 1.0f); + + if (_nbr_list.empty()) { + // Need to reserve one extra so that the insert_before call below ramains valid + // if something is added near the end of a list. + _nbr_list.reserve(_number_neighbours + 1); + _nbr_list << new Neighbour(name, score, in_common); +#ifdef STORE_MATCHING_GENES + _nbr_list.back()->genes = std::move(rhs_needed); +#endif + return 1; + } + + // cerr << "Last item on list " << _nbr_list.back()->score << '\n'; + + // Lower score than the least similar we have. If the list is full + // just return, otherwise append to end. + if (std::abs(score) <= std::abs(_nbr_list.back()->score)) { + if (_nbr_list.size() < _number_neighbours) { + _nbr_list << new Neighbour(name, score, in_common); +#ifdef STORE_MATCHING_GENES + _nbr_list.back()->genes = std::move(rhs_needed); +#endif + } + return 1; + } + + // If better than first on the queue, insert at beginning. + if (std::abs(score) >= std::abs(_nbr_list[0]->score)) { + if (_nbr_list.size() >= _number_neighbours) { + _nbr_list.pop(); + } + + _nbr_list.insert_at_beginning(new Neighbour(name, score, in_common)); +#ifdef STORE_MATCHING_GENES + _nbr_list.front()->genes = std::move(rhs_needed); +#endif + return 1; + } + + // We need to find it. + + auto cmp = [](const Neighbour* n, float score) { + return std::abs(n->score) >= std::abs(score); + }; + auto iter = std::lower_bound(_nbr_list.rawdata(), _nbr_list.rawdata() + _nbr_list.size(), + score, cmp); +#ifdef CHECK_SORTED + for (uint32_t i = 0; i < _nbr_list.size(); ++i) { + cerr << i << ' ' << _nbr_list[i]->score << '\n'; + } + cerr << " score " << score << " goes at " << (iter - _nbr_list.rawdata()) << '\n'; +#endif + + // Very important that the pop operation be done after the insertion, and not before. 
+#ifdef STORE_MATCHING_GENES + Neighbour* tmp = new Neighbour(name, score, in_common); + tmp->genes = std::move(rhs_needed); + _nbr_list.insert_before(iter - _nbr_list.rawdata(), tmp); +#else + _nbr_list.insert_before(iter - _nbr_list.rawdata(), new Neighbour(name, score, in_common)); +#endif + + if (_nbr_list.size() > _number_neighbours) { + _nbr_list.pop(); + } + +#ifdef CHECK_SORTED + CheckSorted(); +#endif + + return 1; +} + +int +Needle::CheckSorted() const { + for (uint32_t i = 1; i < _nbr_list.size(); ++i) { + if (std::abs(_nbr_list[i-1]->score) >= std::abs(_nbr_list[i]->score)) { + continue; + } + cerr << "Needle::CheckSorted:out of order at " << i << ' ' << (_nbr_list[i-1]->score - _nbr_list[i]->score) << '\n'; + for (uint32_t j = 0; j < _nbr_list.size(); ++j) { + cerr << j << ' ' << _nbr_list[j]->score << '\n'; + } + return 0; + } + + return 1; +} + +int +Needle::WriteNeighbours(uint32_t max_genes_in_common, IWString_and_File_Descriptor& output) { + static constexpr char kSep = ' '; + +#ifdef STORE_MATCHING_GENES + static constexpr std::string_view kMissing = "-1"; +#endif + + // First Needle to write a neighbour list must write the header. 
+ static bool first_call = true; + if (first_call) { + output << "Needle" << kSep << "Haystack" << kSep << "Score" << kSep << "Common"; +#ifdef STORE_MATCHING_GENES + for (uint32_t i = 0; i < max_genes_in_common; ++i) { + output << kSep << "gid" << i; + } +#endif + output << "\n"; + + first_call = false; + } + + for (uint32_t i = 0; i < _nbr_list.size(); ++i) { + const Neighbour* n = _nbr_list[i]; + output << _name << kSep << n->name << kSep << n->score << kSep << n->genes_in_common; +#ifdef STORE_MATCHING_GENES + for (uint32_t gene : n->genes) { + output << kSep << gene; + } + + for (uint32_t i = n->genes.size(); i < max_genes_in_common; ++i) { + output << kSep << kMissing; + } +#endif + output << '\n'; + + output.write_if_buffer_holds_more_than(4096); + } + + return output.good(); +} + +void +Needle::UpdateNnbrStatistics(uint64_t& exact_matches_found, Accumulator& acc_sim) const { + if (_nbr_list.empty()) { + return; + } + if (abs(_nbr_list.front()->score - 1.0f) < 1.0e-06) { + ++exact_matches_found; + } + + _nbr_list.each_lambda([&acc_sim] (const Neighbour* nbr) { + acc_sim.extra(nbr->score); + }); +} + +bool +operator==(const Neighbour& n1, const Neighbour& n2) { +#ifdef DEBUG_COMPARE_NEIGHGOURS + cerr << "cmp " << n1.name << " and " << n2.name << '\n'; +#endif + if (n1.name != n2.name) { + return false; + } + +#ifdef DEBUG_COMPARE_NEIGHGOURS + cerr << " cmp score " << n1.score << " and " << n2.score << '\n'; +#endif + if (n1.score == n2.score) { + } else if (std::abs(n1.score - n2.score) < 1.0e-05) { + } else { + return false; + } + +#ifdef DEBUG_COMPARE_NEIGHGOURS + cerr << " genes_in_common " << n1.genes_in_common << " and " << n2.genes_in_common << '\n'; +#endif + return n1.genes_in_common == n2.genes_in_common; +} + +std::ostream& operator<<(std::ostream& output, const Neighbour& nbr) { + output << "name: " << nbr.name << " score: " << nbr.score << " common: " << nbr.genes_in_common; + return output; +} + +uint32_t +Needle::MaxNumberGenes() const { + 
uint32_t rc = 0; + for (const Neighbour* nbr : _nbr_list) { + if (nbr->genes.size() > rc) { + rc = nbr->genes.size(); + } + } + + return rc; +} + +} // namespace needle diff --git a/src/Utilities/GeneExpression/needle.h b/src/Utilities/GeneExpression/needle.h new file mode 100644 index 00000000..3782a356 --- /dev/null +++ b/src/Utilities/GeneExpression/needle.h @@ -0,0 +1,173 @@ +#ifndef UTILITIES_GENEEXPRESSION_NEEDLE_H_ +#define UTILITIES_GENEEXPRESSION_NEEDLE_H_ + +#include +#include + +#include "absl/container/flat_hash_map.h" + +#include "Foundational/accumulator/accumulator.h" +#include "Foundational/iwaray/iwaray.h" +#include "Foundational/iwstring/iwstring.h" + +#include "Utilities/GeneExpression/gene_expression.pb.h" + +// Set this in order to accumulate with each neighbour the +// list of matched genes. +#define STORE_MATCHING_GENES + +namespace needle { + +struct Neighbour { + public: + std::string name; + float score; + // genes_in_common will be the same as genes.size() + // but we may not always accumulate the matching genes. + uint32_t genes_in_common; + // A list of the gene ids shared. + std::vector genes; + + public: +}; + +struct NeedleCompare { + bool operator()(const Neighbour& n1, const Neighbour& n2) { + return n1.score < n2.score; + } +}; + +class Needle { + private: + // The number of genes we read. + uint32_t _number_genes; + // We read a bunch of gene ids from our proto. We record the highest number encountered + // so we can quickly figure out if we have data for a particular gene. + uint32_t _highest_gene_number; + + // This is a mapping from gene id + // to where that gene occurs in our proto. A negative entry means not present. + int* _gene_to_index; + + // Gene ids larger than this threshold are stored in a hash rather than the + // _gene_to_index array. That is because gene_id's can be quite large and + // that consumes a lot of memory. 
+ static uint32_t _large_gene_id_threshold; + + // The size of the _gene_to_index array can vary, so store the highest + // gene number that is in that array. Genes above that number will need + // to go to _large_gene_to_index. + uint32_t _max_gene_id_in_index; + + // For gene id's above _max_gene_id_in_index; + absl::flat_hash_map _large_gene_to_index; + + // Observe that we can get very high gene id's (100M) which will cause memory + // problems if we are using a large number of needles. We can have a sparse + // index for dealing with large gene id's. TODO:ianwatson figure this out. + + // For each gene, the sign of the response. + int* _sign; + + // When comparing with a haystack gene expression, this keeps track of + // which of our genes are to be used in the computation. + // This is a mapping from an index over the haystack member to + // our genes. + int* _lhs; + + // Once we have decided which items are needed, we scan the list and + // sequentially assign a rank to each. + int* _rank; + + // Each item will be assigned a signed rank, depending on whether or not the + // score is positive or negative. + int* _signed_rank; + + // std::priority_queue, NeedleCompare> _neighbours; + + resizable_array_p _nbr_list; + + // Every class instance needs to know the number of neighbours to retain. + public: + static uint32_t _number_neighbours; + private: + + // Every class instance needs to know the maximum possible association value for + // the number of genes found to be in common. + static double* _max_possible_association; + + // Whether or not we keep track of the matching gene ids during comparisons. 
+ static int _accumulate_matching_genes; + + IWString _name; + + // Private functions + int CheckSorted() const; + int GeneToIndex(uint32_t gene_id) const; + int InsertIntoNbrList(const std::string& name, float score, int in_common, + const std::vector& rhs_needed); + + public: + Needle(); + ~ Needle(); + + int Build(const gene_expression::Profile& proto, uint32_t maxrank); + + void set_name(IWString& s) { + _name = s; + } + + uint32_t number_genes() const { + return _number_genes; + } + + void SetMaxPossibleAssociation(uint32_t maxrank); + + int Compare(const gene_expression::Profile& proto, + uint32_t needle_max_rank, + uint32_t haystack_max_rank); + + void UpdateNnbrStatistics(uint64_t& exact_matches_found, Accumulator& acc_sim) const; + + // Scan the _neighbours list and return the max number of genes_in_common + // among those neighbours. + uint32_t MaxNumberGenes() const; + + // max_genes_in_common is only used if we are also writing the matching gene id's. + // It is simple to compute so it is always included, but may not get used. + int WriteNeighbours(uint32_t max_genes_in_common, IWString_and_File_Descriptor& output); + + int number_neighbours() const { + return _nbr_list.number_elements(); + } + const Neighbour* nbr(int ndx) const { + return _nbr_list[ndx]; + } + + // Now many gene ids are in the overflow hash. + uint32_t GeneIdsInHash() const { + return _large_gene_to_index.size(); + } + + friend void set_large_gene_id_threshold(uint32_t); + friend void set_accumulate_matching_genes(int); + +}; + +// control whether or not the list of matching gene id's is accumulated and +// written. This will have performance implications, since keeping track of +// the matching gene id's will be expensive. +void set_accumulate_matching_genes(int s); + +// Set the threshold above which genes id's are put into a hash rather +// than the _gene_to_index array. 
+void set_large_gene_id_threshold(uint32_t s); + +bool operator==(const Neighbour& n1, const Neighbour& n2); + +std::ostream& operator<<(std::ostream& output, const Neighbour& nbr); + +} // namespace needle + + +#endif // UTILITIES_GENEEXPRESSION_NEEDLE_H_ diff --git a/src/Utilities/GeneExpression/needle_test.cc b/src/Utilities/GeneExpression/needle_test.cc new file mode 100644 index 00000000..4b9c0561 --- /dev/null +++ b/src/Utilities/GeneExpression/needle_test.cc @@ -0,0 +1,1592 @@ +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "google/protobuf/text_format.h" + +#include "Utilities/GeneExpression/gene_expression.pb.h" + +#include "needle.h" + +namespace { + +using needle::Needle; +using needle::Neighbour; + +TEST(TestNeedle, TestIdentical) { + std::string string_proto1 = R"pb(name: "t1", +gene { + gene_id: 10 + score: 1.0 +} +gene { + gene_id: 12 + score: 0.5 +} +)pb"; + + gene_expression::Profile proto1; + ASSERT_TRUE(google::protobuf::TextFormat::ParseFromString(string_proto1, &proto1)); + + // This test is not about maxrank, so use a number larger than the number of genes. + constexpr uint32_t kLarge = 10; + + Needle needle; + ASSERT_TRUE(needle.Build(proto1, kLarge)); + + needle.SetMaxPossibleAssociation(kLarge); + needle::Needle::_number_neighbours = kLarge; + + needle.Compare(proto1, kLarge, kLarge); + + EXPECT_EQ(needle.number_neighbours(), 1); + + Neighbour expected{"t1", 1.0f, 2}; + EXPECT_EQ(*needle.nbr(0), expected); +} + +TEST(TestNeedle, TestIdenticalOppositeSign) { + std::string string_proto1 = R"pb(name: "t1", +gene { + gene_id: 10 + score: 1.0 +} +gene { + gene_id: 12 + score: 0.5 +} +)pb"; + + gene_expression::Profile proto1; + ASSERT_TRUE(google::protobuf::TextFormat::ParseFromString(string_proto1, &proto1)); + + // This test is not about maxrank, so use a number larger than the number of genes. 
+ constexpr uint32_t kLarge = 10; + + Needle needle; + ASSERT_TRUE(needle.Build(proto1, kLarge)); + + needle.SetMaxPossibleAssociation(kLarge); + needle::Needle::_number_neighbours = kLarge; + + proto1.mutable_gene(0)->set_score(-1.0); + proto1.mutable_gene(1)->set_score(-0.5); + needle.Compare(proto1, kLarge, kLarge); + + EXPECT_EQ(needle.number_neighbours(), 1); + + Neighbour expected{"t1", -1.0f, 2}; + EXPECT_EQ(*needle.nbr(0), expected); +} + +// Hmmm, made a mistake here. This struct cannot explore +// mutiple neighbours since there are only 2 string protos. +// We should have needle_string_proto and a vector of +// haystack_string_proto. But discovered that too late +// and did not want to change things. +// The neighbour list handling has been tested in the +// neighbour finding code. These tests really just test +// the score. +struct Cmp { + std::string string_proto1; + std::string string_proto2; + uint32_t max_association; + int number_neighbours; + uint32_t needle_maxrank; + uint32_t haystack_maxrank; + std::vector expected; +}; + +class TestGeneExpression : public testing::TestWithParam { + protected: + gene_expression::Profile _proto1; + gene_expression::Profile _proto2; + + Needle _n1; + Needle _n2; +}; + +TEST_P(TestGeneExpression, Test1) { + const auto params = GetParam(); + ASSERT_TRUE(google::protobuf::TextFormat::ParseFromString(params.string_proto1, &_proto1)); + ASSERT_TRUE(google::protobuf::TextFormat::ParseFromString(params.string_proto2, &_proto2)); + + ASSERT_TRUE(_n1.Build(_proto1, params.needle_maxrank)); + ASSERT_TRUE(_n2.Build(_proto2, params.haystack_maxrank)); + + _n1.SetMaxPossibleAssociation(params.max_association); + needle::Needle::_number_neighbours = params.number_neighbours; + + _n1.Compare(_proto2, params.needle_maxrank, params.haystack_maxrank); + + EXPECT_EQ(_n1.number_neighbours(), params.expected.size()); + if (_n1.number_neighbours() > 0) { + for (int i = 0; i < _n1.number_neighbours(); ++i) { + EXPECT_EQ(*_n1.nbr(i), 
params.expected[i]); + } + } +} +INSTANTIATE_TEST_SUITE_P(TestOrder, TestGeneExpression, testing::Values( + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 1 + score: 1.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 1.0, 1}} +}, + + // No genes in common + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 2 + score: 1 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{} +}, + + // extra genes in the haystack. + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 4 + score: 1 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 1.0f, 1}} +}, + + // extra genes in the haystack, diff sign + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: -1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 2 + score: 0.3 + } + gene { + gene_id: 55 + score: 0.3 + } + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 4 + score: 3.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", -1.0f, 1}} +}, + + // extra genes in the needle + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 8 + score: -1 + } + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 3 + score: -1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 1 + score: 2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 1.0f, 1}} +}, + + // same two genes, just order swapped + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: -1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 2 + 
score: 2.0 + } + gene { + gene_id: 1 + score: 3.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 0.8f, 2}} +}, + + // same two genes, order swapped and extra needle genes + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 9 + score: 1 + } + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 11 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 13 + score: -1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 2 + score: 2.0 + } + gene { + gene_id: 1 + score: 3.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 0.8f, 2}} +}, + + // same two genes, order swapped and exra haystack genes + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: -1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 53 + score: 2.0 + } + gene { + gene_id: 2 + score: 2.0 + } + gene { + gene_id: 55 + score: 2.0 + } + gene { + gene_id: 1 + score: 3.0 + } + gene { + gene_id: 65 + score: 2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 0.8f, 2}} +}, + + + // three genes in common same order, all same sign + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 2 + score: 2.0 + } + gene { + gene_id: 3 + score: 2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 1.0f, 3}} +}, + + // three genes in common same order, first pair different signs + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 1 + score: -2.0 + } + gene { + gene_id: 2 + 
score: 2.0 + } + gene { + gene_id: 3 + score: 2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", -4.0f/14.0f, 3}} +}, + + // three genes in common same order, second pair different signs + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 2 + score: -2.0 + } + gene { + gene_id: 3 + score: 2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 6.0f/14.0f, 3}} +}, + + // three genes in common same order, third pair different signs + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 2 + score: 2.0 + } + gene { + gene_id: 3 + score: -2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 12.0f/14.0f, 3}} +}, + + // three genes in common same order, third pair different signs. 
Extra genes + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 21 + score: 1 + } + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 31 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 41 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } + gene { + gene_id: 51 + score: 1 + } + gene { + gene_id: 61 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 91 + score: 2.0 + } + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 81 + score: 2.0 + } + gene { + gene_id: 2 + score: 2.0 + } + gene { + gene_id: 82 + score: 2.0 + } + gene { + gene_id: 3 + score: -2.0 + } + gene { + gene_id: 83 + score: 2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 12.0f/14.0f, 3}} +}, + + // three genes in common first two out of order + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 2 + score: 2.0 + } + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 3 + score: -2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 11.0f/14.0f, 3}} +}, + + // three genes in common, swap 1 and 3 + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 3 + score: 2.0 + } + gene { + gene_id: 2 + score: 2.0 + } + gene { + gene_id: 1 + score: -2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 4.0f/14.0f, 3}} +}, + + // three genes in common, swap 2 and 3 + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 3 + score: 2.0 + } + gene { + 
gene_id: 2 + score: -2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 9.0f/14.0f, 3}} +}, + + // Same as previous, extra genes added + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 101 + score: 1 + } + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 102 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 103 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } + gene { + gene_id: 104 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 201 + score: 1 + } + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 202 + score: 1 + } + gene { + gene_id: 3 + score: 2.0 + } + gene { + gene_id: 203 + score: 1 + } + gene { + gene_id: 2 + score: -2.0 + } + gene { + gene_id: 204 + score: 1 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 9.0f/14.0f, 3}} +}, + + // Same as previous, adjust haystack_maxrank + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 101 + score: 1 + } + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 102 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 103 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } + gene { + gene_id: 104 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 201 + score: 1 + } + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 202 + score: 1 + } + gene { + gene_id: 3 + score: 2.0 + } + gene { + gene_id: 203 + score: 1 + } + gene { + gene_id: 2 + score: -2.0 + } + gene { + gene_id: 204 + score: 1 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 2, // brings in gene 1 +{Neighbour{"t2", 14.0f/14.0f, 1}} +}, + + + // Same as previous, adjust haystack_maxrank + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 101 + score: 1 + } + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 102 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 103 + score: 1 + } + gene 
{ + gene_id: 3 + score: 1 + } + gene { + gene_id: 104 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 201 + score: 1 + } + gene { + gene_id: 1 + score: 2.0 + } + gene { + gene_id: 202 + score: 1 + } + gene { + gene_id: 3 + score: 2.0 + } + gene { + gene_id: 203 + score: 1 + } + gene { + gene_id: 2 + score: -2.0 + } + gene { + gene_id: 204 + score: 1 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 4, // brings in genes 1 and 3 +{Neighbour{"t2", 14.0f/14.0f, 2}} +}, + + // three genes in common, reverse order + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 3 + score: 2.0 + } + gene { + gene_id: 2 + score: 2.0 + } + gene { + gene_id: 1 + score: -2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 4.0f/14.0f, 3}} +}, + + // three genes in common, reverse same sign + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 3 + score: 2.0 + } + gene { + gene_id: 2 + score: 2.0 + } + gene { + gene_id: 1 + score: 2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", 10.0f/14.0f, 3}} +}, + + // three genes in common, reverse order, opposite sign + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 3 + score: -2.0 + } + gene { + gene_id: 2 + score: -2.0 + } + gene { + gene_id: 1 + score: -2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", -10.0f/14.0f, 3}} +}, + + // three genes in common, reverse order, opposite sign, added genes + 
Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 101 + score: 1 + } + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 102 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 103 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } + gene { + gene_id: 104 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 201 + score: 1 + } + gene { + gene_id: 3 + score: -2.0 + } + gene { + gene_id: 202 + score: 1 + } + gene { + gene_id: 2 + score: -2.0 + } + gene { + gene_id: 203 + score: 1 + } + gene { + gene_id: 1 + score: -2.0 + } + gene { + gene_id: 204 + score: 1 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", -10.0f/14.0f, 3}} +}, + + + // three genes in common, same order, opposite sign + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 1 + } + gene { + gene_id: 2 + score: 1 + } + gene { + gene_id: 3 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 1 + score: -2.0 + } + gene { + gene_id: 2 + score: -2.0 + } + gene { + gene_id: 3 + score: -2.0 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +10, 10, 10, 10, +{Neighbour{"t2", -14.0f/14.0f, 3}} +}, + + // example from Rick Higgs + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: 6.1 + } + gene { + gene_id: 2 + score: -4.2 + } + gene { + gene_id: 3 + score: 2.5 + } + gene { + gene_id: 4 + score: -2.1 + } + gene { + gene_id: 5 + score: 1.9 + } + gene { + gene_id: 6 + score: 1.8 + } + gene { + gene_id: 7 + score: -1.7 + } + gene { + gene_id: 8 + score: -1.6 + } + gene { + gene_id: 9 + score: 1.4 + } + gene { + gene_id: 10 + score: -1.2 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 2 + score: -5.9 + } + gene { + gene_id: 1 + score: 5.5 + } + gene { + gene_id: 4 + score: -2.8 + } + gene { + gene_id: 3 + score: 2.5 + } + gene { + gene_id: 5 + score: 2 + } + gene { + gene_id: 7 + score: -1.9 + } + gene { + gene_id: 6 + score: 1.8 + } + gene { + gene_id: 10 + score: -1.5 + 
} + gene { + gene_id: 9 + score: -1.4 + } + gene { + gene_id: 8 + score: -1.3 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +20, 20, 20, 20, +{Neighbour{"t2", 370.0f/385.0f, 10}} +}, + + + // example from Rick Higgs, with extra genes inserted + Cmp{ +R"pb( + name: "t1", + gene { + gene_id: 100 + score: 2.5 + } + gene { + gene_id: 101 + score: 2.5 + } + gene { + gene_id: 102 + score: 2.5 + } + gene { + gene_id: 103 + score: 2.5 + } + gene { + gene_id: 1 + score: 6.1 + } + gene { + gene_id: 104 + score: 2.5 + } + gene { + gene_id: 105 + score: 2.5 + } + gene { + gene_id: 106 + score: 2.5 + } + gene { + gene_id: 2 + score: -4.2 + } + gene { + gene_id: 107 + score: 2.5 + } + gene { + gene_id: 3 + score: 2.5 + } + gene { + gene_id: 108 + score: 2.5 + } + gene { + gene_id: 109 + score: 2.5 + } + gene { + gene_id: 110 + score: 2.5 + } + gene { + gene_id: 4 + score: -2.1 + } + gene { + gene_id: 111 + score: 2.5 + } + gene { + gene_id: 112 + score: 2.5 + } + gene { + gene_id: 113 + score: 2.5 + } + gene { + gene_id: 114 + score: 2.5 + } + gene { + gene_id: 5 + score: 1.9 + } + gene { + gene_id: 115 + score: 2.5 + } + gene { + gene_id: 6 + score: 1.8 + } + gene { + gene_id: 7 + score: -1.7 + } + gene { + gene_id: 116 + score: 2.5 + } + gene { + gene_id: 117 + score: 2.5 + } + gene { + gene_id: 8 + score: -1.6 + } + gene { + gene_id: 118 + score: 2.5 + } + gene { + gene_id: 9 + score: 1.4 + } + gene { + gene_id: 119 + score: 2.5 + } + gene { + gene_id: 120 + score: 2.5 + } + gene { + gene_id: 10 + score: -1.2 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 200 + score: -1.2 + } + gene { + gene_id: 201 + score: -1.2 + } + gene { + gene_id: 202 + score: -1.2 + } + gene { + gene_id: 2 + score: -5.9 + } + gene { + gene_id: 1 + score: 5.5 + } + gene { + gene_id: 4 + score: -2.8 + } + gene { + gene_id: 3 + score: 2.5 + } + gene { + gene_id: 203 + score: -1.2 + } + gene { + gene_id: 204 + score: -1.2 + } + gene { + gene_id: 205 + score: -1.2 + } + 
gene { + gene_id: 5 + score: 2 + } + gene { + gene_id: 206 + score: -1.2 + } + gene { + gene_id: 207 + score: -1.2 + } + gene { + gene_id: 7 + score: -1.9 + } + gene { + gene_id: 6 + score: 1.8 + } + gene { + gene_id: 10 + score: -1.5 + } + gene { + gene_id: 9 + score: -1.4 + } + gene { + gene_id: 208 + score: -1.2 + } + gene { + gene_id: 8 + score: -1.3 + } +)pb", +// max association, number nbrs, needle_maxrank, haystack_maxrank +40, 40, 40, 40, +{Neighbour{"t2", 370.0f/385.0f, 10}} +} +)); + + +struct CmpGenes { + std::string string_proto1; + std::string string_proto2; + uint32_t max_association; + int number_neighbours; + std::vector expected; +}; + +class TestGeneExpressionThreshold : public testing::TestWithParam> { + protected: + gene_expression::Profile _proto1; + gene_expression::Profile _proto2; + + Needle _n1; + Needle _n2; +}; + +TEST_P(TestGeneExpressionThreshold, Test1) { + auto p = GetParam(); + int large_gene_threshold = std::get<0>(p); + const CmpGenes& params = std::get<1>(p); + + ASSERT_TRUE(google::protobuf::TextFormat::ParseFromString(params.string_proto1, &_proto1)); + ASSERT_TRUE(google::protobuf::TextFormat::ParseFromString(params.string_proto2, &_proto2)); + + needle::set_large_gene_id_threshold(large_gene_threshold); + + constexpr int kMaxRank = 1000; + ASSERT_TRUE(_n1.Build(_proto1, kMaxRank)); + ASSERT_TRUE(_n2.Build(_proto2, kMaxRank)); + + // If no threshold, all genes should be in the array. 
+ if (large_gene_threshold == 0) { + EXPECT_EQ(_n1.GeneIdsInHash(), 0); + EXPECT_EQ(_n2.GeneIdsInHash(), 0); + } else { + EXPECT_GT(_n1.GeneIdsInHash(), 0); + EXPECT_GT(_n2.GeneIdsInHash(), 0); + } + + _n1.SetMaxPossibleAssociation(params.max_association); + needle::Needle::_number_neighbours = params.number_neighbours; + + _n1.Compare(_proto2, kMaxRank, kMaxRank); + + EXPECT_EQ(_n1.number_neighbours(), params.expected.size()); + if (_n1.number_neighbours() > 0) { + for (int i = 0; i < _n1.number_neighbours(); ++i) { + EXPECT_EQ(*_n1.nbr(i), params.expected[i]); + } + } +} +INSTANTIATE_TEST_SUITE_P(TestThreshold, TestGeneExpressionThreshold, + // Values for the large gene id threshold. + testing::Combine(testing::Range(0, 10), + testing::Values( +CmpGenes{ +R"pb( + name: "t1", + gene { + gene_id: 1 + score: -8.2 + } + gene { + gene_id: 2 + score: 8 + } + gene { + gene_id: 3 + score: -7 + } + gene { + gene_id: 4 + score: -6 + } + gene { + gene_id: 5 + score: 5 + } + gene { + gene_id: 6 + score: -4 + } + gene { + gene_id: 9 + score: 3 + } + gene { + gene_id: 10 + score: -2 + } + gene { + gene_id: 11 + score: 1 + } +)pb", +R"pb( + name: "t2", + gene { + gene_id: 6 + score: -8.2 + } + gene { + gene_id: 1 + score: 8 + } + gene { + gene_id: 10 + score: -7 + } + gene { + gene_id: 2 + score: -6 + } + gene { + gene_id: 3 + score: 5 + } + gene { + gene_id: 9 + score: -4 + } + gene { + gene_id: 5 + score: 3 + } + gene { + gene_id: 7 + score: -2 + } + gene { + gene_id: 4 + score: 1 + } +)pb", +// max association, number nbrs +40, 40, +{Neighbour{"t2", -88.0f/204.0f, 8}} +} +))); + +} // namespace diff --git a/src/Utilities/General/BUILD b/src/Utilities/General/BUILD index e1ed6725..dc5b7bbb 100644 --- a/src/Utilities/General/BUILD +++ b/src/Utilities/General/BUILD @@ -212,51 +212,6 @@ py_proto_library( ], ) -proto_library( - name = "xgboost_model_proto", - srcs = [ - "xgboost_model.proto", - ], -) - -cc_proto_library( - name = "xgboost_model_cc_proto", - deps = [ - 
"xgboost_model_proto", - ], - visibility = [ - "//visibility:public", - ], -) - -py_proto_library( - name = "xgboost_model_py_proto", - srcs = [ - "xgboost_model.proto", - ], -) - -proto_library( - name = "random_forest_model_proto", - srcs = [ - "random_forest_model.proto", - ], -) - -cc_proto_library( - name = "random_forest_model_cc_proto", - deps = [ - "random_forest_model_proto", - ] -) - -py_proto_library( - name = "random_forest_model_py_proto", - srcs = [ - "random_forest_model.proto", - ], -) - cc_library( name = "scaler", srcs = [ @@ -392,6 +347,7 @@ cc_binary( "//Foundational/data_source:iwtfdata_record", "//Foundational/iwstring", "//Molecule_Tools:dicer_fragments_cc_proto", + "@com_google_absl//absl/log:initialize", "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -979,24 +935,6 @@ cc_binary( ], ) -cc_binary( - name = "xgboost_model_evaluate", - srcs = [ - "xgboost_model_evaluate.cc", - ], - deps = [ - ":xgboost_model_cc_proto", - "//Foundational/cmdline_v2:cmdline_v2", - "//Foundational/data_source:iwstring_data_source", - "//Foundational/iwmisc", - "@xgboost//:xgboost", - "@dlmc//:dlmc", - ], - tags = [ - "xgboost", - ], -) - cc_test( name = "scaler_test", srcs = [ diff --git a/src/Utilities/General/dicer_fragments_collate.cc b/src/Utilities/General/dicer_fragments_collate.cc index 99fbbe04..01ef9874 100644 --- a/src/Utilities/General/dicer_fragments_collate.cc +++ b/src/Utilities/General/dicer_fragments_collate.cc @@ -5,10 +5,12 @@ #include #include +#include #include #include #include "absl/container/flat_hash_map.h" +#include "absl/log/initialize.h" #include "google/protobuf/io/zero_copy_stream_impl_lite.h" #include "google/protobuf/text_format.h" @@ -38,6 +40,7 @@ struct DicerData { IWString smiles; std::string par; uint32_t n; + uint32_t natoms; }; class DicerFragmentsCollate { @@ -59,6 +62,10 @@ class DicerFragmentsCollate { // non-unique smiles. 
int _has_leading_non_unique_smiles; + // It can be convenient to limit the number of atoms + uint32_t _max_atoms; + int _discarded_for_too_many_atoms; + // If it has a leading non unique smiles, keep track of them and // write them at the end. absl::flat_hash_map _usmi_to_non_unique; @@ -117,6 +124,9 @@ DicerFragmentsCollate::DicerFragmentsCollate() { _hash_value_is_proto = 1; + _max_atoms = std::numeric_limits::max(); + _discarded_for_too_many_atoms = 0; + _items_read = 0; _support = 0; _suppressed_by_support = 0; @@ -142,6 +152,17 @@ DicerFragmentsCollate::Initialise(Command_Line_v2& cl) { _has_leading_non_unique_smiles = 0; } + if (cl.option_present("maxat")) { + cl.value("maxat", _max_atoms); + if (_max_atoms < 1) { + cerr << "Invalid -maxat value\n"; + return 0; + } + if (_verbose) { + cerr << "Will discard input with more than " << _max_atoms << " atoms\n"; + } + } + if (cl.option_present('r')) { int rpt; if (! cl.value('r', rpt) || rpt < 1) { @@ -283,6 +304,11 @@ DicerFragmentsCollate::AccumulateRecord(dicer_data::DicerFragment& proto, cerr << '\n'; } + if (proto.nat() > _max_atoms) { + ++_discarded_for_too_many_atoms; + return 1; + } + if (_hash_value_is_proto) { return InsertIntoProtoHash(proto, non_unique_smiles); } else { @@ -323,6 +349,7 @@ DicerFragmentsCollate::InsertIntoDataHash(dicer_data::DicerFragment& proto, if (iter != _hash_data.end()) { const int n = iter->second.n; iter->second.n = n + 1; + iter->second.natoms = proto.nat(); return 1; } @@ -428,6 +455,7 @@ DicerFragmentsCollate::WriteFromDataHash(IWString_and_File_Descriptor& output) { output << " smi: " << usmi; output << " par: " << data.par; output << " n: " << data.n; + output << " nat: " << data.natoms; output << '\n'; output.write_if_buffer_holds_more_than(32768); @@ -454,6 +482,10 @@ int DicerFragmentsCollate::Report(std::ostream& output) const { output << "DicerFragmentsCollate::Report: read " << _items_read << " items, "; + if (_max_atoms != std::numeric_limits::max()) { + output << 
"discarded " << _discarded_for_too_many_atoms << + " inputs for having more than " << _max_atoms << " atoms\n"; + } if (_hash_value_is_proto) { output << _hash.size(); } else { @@ -482,6 +514,7 @@ Usage(int rc) { cerr << "Aggregates multiple dicer_data::DicerFragment text proto files\n"; cerr << " -p minimum support level (n: value) for inclusion\n"; cerr << " -nosmi each record does not contain a leading non-unique smiles\n"; + cerr << " -maxat discard input that contains more than atoms\n"; cerr << " -r report progress every items read\n"; cerr << " -minimal extract only the essential information from the protos\n"; cerr << " -tfdata data is TFDataRecord serialized protos\n"; @@ -493,7 +526,7 @@ Usage(int rc) { int Main(int argc, char** argv) { - Command_Line_v2 cl(argc, argv, "-v-p=ipos-nosmi-r=ipos-minimal-tfdata"); + Command_Line_v2 cl(argc, argv, "-v-p=ipos-nosmi-r=ipos-minimal-tfdata-maxat=ipos"); if (cl.unrecognised_options_encountered()) { cerr << "Unrecognised options present\n"; @@ -538,6 +571,8 @@ Main(int argc, char** argv) { int main(int argc, char** argv) { + absl::InitializeLog(); + int rc = dicer_fragments_collate::Main(argc, argv); return rc; diff --git a/src/Utilities/General/fetch_smiles_quick.cc b/src/Utilities/General/fetch_smiles_quick.cc index 0cae542a..3bb2c527 100644 --- a/src/Utilities/General/fetch_smiles_quick.cc +++ b/src/Utilities/General/fetch_smiles_quick.cc @@ -1,8 +1,9 @@ /* - Fetches records from a file based on identifiers in another file + Fetches records from a smiles file based on identifiers in another file */ #include + #include #include "Foundational/cmdline/cmdline.h" @@ -12,9 +13,8 @@ #include "Foundational/iwstring/iw_stl_hash_set.h" using std::cerr; -using std::endl; -const char * prog_name = nullptr; +const char* prog_name = nullptr; static int verbose = 0; @@ -55,21 +55,24 @@ static int identifier_file_is_descriptor_file = 0; static char identifier_file_column_separator = ' '; static char smiles_file_column_separator 
= ' '; +static int flush_output = 0; + static void -usage (int rc) -{ +usage(int rc) { // clang-format off #if defined(GIT_HASH) && defined(TODAY) cerr << __FILE__ << " compiled " << TODAY << " git hash " << GIT_HASH << '\n'; #else cerr << __FILE__ << " compiled " << __DATE__ << " " << __TIME__ << '\n'; #endif -// clang-format on + // clang-format on + // clang-format off cerr << "Fetches records from one file based on identifiers in one or more other file(s)\n"; cerr << prog_name << " identifier_file smiles_file > newfile\n"; cerr << " -c identifier column in identifier file\n"; cerr << " -C identifier column in smiles file\n"; cerr << " -C RX= identifier is whichever column(s) match \n"; + cerr << " -K one or more identifiers to fetch (multiple -K options)\n"; cerr << " -d ignore duplicate identifiers in identifier file\n"; cerr << " -q quietly ignore duplicate identifiers in identifier file\n"; cerr << " -a write all instances of identifiers in smiles file\n"; @@ -83,91 +86,84 @@ usage (int rc) cerr << " -z strip leading zero's from identifiers\n"; cerr << " -j identifier file is descriptor file, skip header record\n"; cerr << " -b stop processing identifier file on error, but continue processing (dangerous)\n"; + cerr << " -f do not fail if a specified column is not present in either the identifier file or smiles file\n"; + cerr << " -h flush output after each molecule written\n"; cerr << " -i column separator in identifier file\n"; cerr << " -I column separator in smiles file\n"; cerr << " -S first files are identifier files, last is haystack. 
Create many subsets\n"; cerr << " -u suffix for -S files created\n"; cerr << " -g start number for files created (-g 1 for dopattern)\n"; cerr << " -v verbose output\n"; + // clang-format on exit(rc); } static int -separate_into_id_and_data_2 (const const_IWSubstring & buffer, - int col, - const char sep, - IWString & id, - IWString & zdata) -{ - if (0 == col) // the most common case - { -// cerr << "Extracting column " << col << " from '" << buffer << "'\n"; +separate_into_id_and_data_2(const const_IWSubstring& buffer, int col, const char sep, + IWString& id, IWString& zdata) { + if (0 == col) { // the most common case + // cerr << "Extracting column " << col << " from '" << buffer << "'\n"; int ispace = buffer.index(sep); - if (ispace < 0) // just one token on the line - { + if (ispace < 0) { // just one token on the line id = buffer; return 1; } - const char * s = buffer.rawchars(); + const char* s = buffer.rawchars(); id.set(s, ispace); - zdata.set(s + ispace +1, buffer.length() - ispace - 1); + zdata.set(s + ispace + 1, buffer.length() - ispace - 1); -// cerr << "Split into '" << id << "' and '" << zdata << "'\n"; + // cerr << "Split into '" << id << "' and '" << zdata << "'\n"; return 1; } - if (col < 0) - { + if (col < 0) { int nw; - if (' ' == sep) + if (' ' == sep) { nw = buffer.nwords(); - else + } else { nw = buffer.nwords(sep); + } - if (-col > nw) - { - cerr << "Record contains only " << nw << " columns, cannot fetch column " << col << endl; + if (-col > nw) { + cerr << "Record contains only " << nw << " columns, cannot fetch column " << col + << '\n'; return 0; } col = nw + col; } - int j = 0; const_IWSubstring token; - if (' ' == sep) - { - for (auto c = 0; buffer.nextword(token, j); ++c) - { - if (c == col) + if (' ' == sep) { + for (auto c = 0; buffer.nextword(token, j); ++c) { + if (c == col) { id = token; - else if (include_identifier_file_info) + } else if (include_identifier_file_info) { zdata.append_with_spacer(token); + } } - } - else - { - 
for (auto c = 0; buffer.nextword_single_delimiter(token, j); ++c) - { - if (c == col) + } else { + for (auto c = 0; buffer.nextword_single_delimiter(token, j); ++c) { + if (c == col) { id = token; - else if (include_identifier_file_info) + } else if (include_identifier_file_info) { zdata.append_with_spacer(token); + } } } - if (0 == id.length()) - { - if (ignore_column_not_present) + if (0 == id.length()) { + if (ignore_column_not_present) { return 0; + } cerr << "Cannot extract column " << col << " from '" << buffer << "'\n"; return 0; @@ -177,45 +173,43 @@ separate_into_id_and_data_2 (const const_IWSubstring & buffer, } static int -separate_into_id_and_data (const const_IWSubstring & buffer, - int col, - const char sep, - IWString & id, - IWString & zdata) -{ - if (! separate_into_id_and_data_2(buffer, col, sep, id, zdata)) +separate_into_id_and_data(const const_IWSubstring& buffer, int col, const char sep, + IWString& id, IWString& zdata) { + if (!separate_into_id_and_data_2(buffer, col, sep, id, zdata)) { return 0; + } - if (strip_leading_zeros_from_identifiers) + if (strip_leading_zeros_from_identifiers) { id.remove_leading_chars('0'); + } return 1; } static int -read_identifiers_to_fetch (const const_IWSubstring & buffer, - int record_number, - IW_STL_Hash_Map_String & identifiers_to_fetch) -{ +read_identifiers_to_fetch(const const_IWSubstring& buffer, int record_number, + IW_STL_Hash_Map_String& identifiers_to_fetch) { IWString id, zdata; - if (separate_into_id_and_data(buffer, identifier_column_in_identifier_file, identifier_file_column_separator, id, zdata)) + if (separate_into_id_and_data(buffer, identifier_column_in_identifier_file, + identifier_file_column_separator, id, zdata)) { ; - else if (ignore_column_not_present) + } else if (ignore_column_not_present) { return 1; - else + } else { return 0; + } IW_STL_Hash_Map_String::const_iterator f = identifiers_to_fetch.find(id); - if (f == identifiers_to_fetch.end()) - { + if (f == 
identifiers_to_fetch.end()) { identifiers_to_fetch[id] = zdata; return 1; } - if (ignore_duplicate_identifiers_in_identifier_file) - { - if (1 == ignore_duplicate_identifiers_in_identifier_file) - cerr << "Ignoring duplicate identifer '" << id << "', line " << record_number << "\n"; + if (ignore_duplicate_identifiers_in_identifier_file) { + if (1 == ignore_duplicate_identifiers_in_identifier_file) { + cerr << "Ignoring duplicate identifer '" << id << "', line " << record_number + << "\n"; + } duplicate_identifiers_in_identifier_file++; return 1; } @@ -225,19 +219,18 @@ read_identifiers_to_fetch (const const_IWSubstring & buffer, } static int -read_identifiers_to_fetch (iwstring_data_source & input, - IW_STL_Hash_Map_String & identifiers_to_fetch) -{ +read_identifiers_to_fetch(iwstring_data_source& input, + IW_STL_Hash_Map_String& identifiers_to_fetch) { const_IWSubstring buffer; - if (identifier_file_is_descriptor_file) - (void) input.next_record(buffer); + if (identifier_file_is_descriptor_file) { + (void)input.next_record(buffer); + } - while (input.next_record(buffer)) - { - if (! read_identifiers_to_fetch(buffer, input.lines_read(), identifiers_to_fetch)) - { - cerr << "Fatal error processing identifier file '" << buffer << "', line " << input.lines_read() << endl; + while (input.next_record(buffer)) { + if (!read_identifiers_to_fetch(buffer, input.lines_read(), identifiers_to_fetch)) { + cerr << "Fatal error processing identifier file '" << buffer << "', line " + << input.lines_read() << '\n'; return 0; } } @@ -246,13 +239,11 @@ read_identifiers_to_fetch (iwstring_data_source & input, } static int -read_identifiers_to_fetch (const char * fname, - IW_STL_Hash_Map_String & identifiers_to_fetch) -{ +read_identifiers_to_fetch(const char* fname, + IW_STL_Hash_Map_String& identifiers_to_fetch) { iwstring_data_source input(fname); - if (! 
input.good()) - { + if (!input.good()) { cerr << "Cannot open '" << fname << "'\n"; return 0; } @@ -261,59 +252,57 @@ read_identifiers_to_fetch (const char * fname, } static int -handle_record_not_in_identifier_file (const const_IWSubstring & buffer, - const IWString & id) -{ - if (verbose > 2) +handle_record_not_in_identifier_file(const const_IWSubstring& buffer, + const IWString& id) { + if (verbose > 2) { cerr << "Smiles file identifier '" << id << "' not requested\n"; + } not_in_identifier_file++; - if (stream_for_not_in_identifier_file.rdbuf()->is_open()) + if (stream_for_not_in_identifier_file.rdbuf()->is_open()) { stream_for_not_in_identifier_file << buffer << '\n'; + } return 1; } static int -fetch_smiles_quick (const const_IWSubstring & buffer, - const IWString & id, - const IWString & zdata, - IW_STL_Hash_Map_String & identifiers_to_fetch, - IWString & output_buffer) -{ +fetch_smiles_quick(const const_IWSubstring& buffer, const IWString& id, + const IWString& zdata, IW_STL_Hash_Map_String& identifiers_to_fetch, + IWString& output_buffer) { IW_STL_Hash_Map_String::const_iterator f = identifiers_to_fetch.find(id); bool is_match; - if (f == identifiers_to_fetch.end()) + if (f == identifiers_to_fetch.end()) { is_match = false; - else + } else { is_match = true; + } -//if (is_match) -// cerr << "Got match for '" << id << "'\n"; + // if (is_match) + // cerr << "Got match for '" << id << "'\n"; - if (invert_fetching_operation) - { - if (is_match) - { + if (invert_fetching_operation) { + if (is_match) { is_match = false; -// cerr << "Switched to non-match for '" << id << "'\n"; - } - else + // cerr << "Switched to non-match for '" << id << "'\n"; + } else { is_match = true; + } } - if (! 
is_match) + if (!is_match) { return handle_record_not_in_identifier_file(buffer, id); - + } + output_buffer << buffer; - if (invert_fetching_operation) // the iterator may be bad + if (invert_fetching_operation) { // the iterator may be bad ; - else if (include_identifier_file_info) { - if (! f->second.empty()) { + } else if (include_identifier_file_info) { + if (!f->second.empty()) { output_buffer << string_before_identifier_file_info << (*f).second; } } @@ -322,31 +311,31 @@ fetch_smiles_quick (const const_IWSubstring & buffer, items_written++; - if (erase_identifiers_written) + if (erase_identifiers_written) { identifiers_to_fetch.erase(id); + } return 1; } static int -fetch_smiles_quick (const const_IWSubstring & buffer, - IW_STL_Hash_Map_String & identifiers_to_fetch, - IWString & output_buffer) -{ +fetch_smiles_quick(const const_IWSubstring& buffer, + IW_STL_Hash_Map_String& identifiers_to_fetch, + IWString& output_buffer) { IWString id, zdata; - if (identifier_regexp_in_smiles_file) - { + if (identifier_regexp_in_smiles_file) { int i = 0; const_IWSubstring token; int col = 0; - while (buffer.nextword(token, i)) - { - if (! iwre2::RE2PartialMatch(token, *identifier_regexp_in_smiles_file)) + while (buffer.nextword(token, i)) { + if (!iwre2::RE2PartialMatch(token, *identifier_regexp_in_smiles_file)) { continue; + } - if (! identifiers_to_fetch.contains(token)) + if (!identifiers_to_fetch.contains(token)) { continue; + } id = token; zdata = buffer; @@ -356,88 +345,90 @@ fetch_smiles_quick (const const_IWSubstring & buffer, } return handle_record_not_in_identifier_file(buffer, ""); - } - else if (! 
separate_into_id_and_data(buffer, identifier_column_in_smiles_file, smiles_file_column_separator, id, zdata)) - { - if (ignore_column_not_present) + } else if (!separate_into_id_and_data(buffer, identifier_column_in_smiles_file, + smiles_file_column_separator, id, zdata)) { + if (ignore_column_not_present) { return 1; - else + } else { return 0; - } - else + } + } else { return fetch_smiles_quick(buffer, id, zdata, identifiers_to_fetch, output_buffer); + } } static int -fetch_smiles_quick (iwstring_data_source & input, - IW_STL_Hash_Map_String & identifiers_to_fetch, - IWString_and_File_Descriptor & output) -{ +fetch_smiles_quick(iwstring_data_source& input, + IW_STL_Hash_Map_String& identifiers_to_fetch, + IWString_and_File_Descriptor& output) { const_IWSubstring buffer; - while (input.next_record(buffer)) - { - if (! fetch_smiles_quick(buffer, identifiers_to_fetch, output)) - { - cerr << "Fatal error processing '" << buffer << "', line " << input.lines_read() << endl; + while (input.next_record(buffer)) { + if (!fetch_smiles_quick(buffer, identifiers_to_fetch, output)) { + cerr << "Fatal error processing '" << buffer << "', line " << input.lines_read() + << '\n'; return 0; } output.write_if_buffer_holds_more_than(block_size); - if (0 == identifiers_to_fetch.size()) + if (0 == identifiers_to_fetch.size()) { return 1; + } } return 1; } static size_t -fetch_smiles_quick_multiple_files (iwstring_data_source & input, - const IW_STL_Hash_Map_String & file_contents, - IWString_and_File_Descriptor & output, - IWString_and_File_Descriptor & xout, - IWString_and_File_Descriptor & yout) -{ +fetch_smiles_quick_multiple_files(iwstring_data_source& input, + const IW_STL_Hash_Map_String& file_contents, + IWString_and_File_Descriptor& output, + IWString_and_File_Descriptor& xout, + IWString_and_File_Descriptor& yout) { IW_STL_Hash_Set written; bool fill_written_hash = false; - if (xout.is_open()) + if (xout.is_open()) { fill_written_hash = true; + } const_IWSubstring buffer; - 
while (input.next_record(buffer)) - { + while (input.next_record(buffer)) { IWString id, zdata; - if (! separate_into_id_and_data(buffer, identifier_column_in_identifier_file, identifier_file_column_separator, id, zdata)) + if (!separate_into_id_and_data(buffer, identifier_column_in_identifier_file, + identifier_file_column_separator, id, zdata)) { return 0; + } const auto f = file_contents.find(id); - if (f != file_contents.end()) - { + if (f != file_contents.end()) { output << (*f).second; - output.write_if_buffer_holds_more_than(block_size); - if (fill_written_hash) + if (flush_output) { + output.flush(); + } else { + output.write_if_buffer_holds_more_than(block_size); + } + if (fill_written_hash) { written.insert(id); + } continue; } -// The identifier was not in the haystack + // The identifier was not in the haystack - if (yout.is_open()) - { + if (yout.is_open()) { yout << id << '\n'; yout.write_if_buffer_holds_more_than(block_size); } } - if (xout.is_open()) - { - for (auto c : file_contents) - { - if (written.contains(c.first)) + if (xout.is_open()) { + for (auto c : file_contents) { + if (written.contains(c.first)) { continue; + } xout << c.second; xout.write_if_buffer_holds_more_than(block_size); @@ -448,46 +439,42 @@ fetch_smiles_quick_multiple_files (iwstring_data_source & input, } static size_t -fetch_smiles_quick_multiple_files (const char * ifile, - const IW_STL_Hash_Map_String & file_contents, - const char * ofile, - const char * xfile, - const char * yfile) -{ +fetch_smiles_quick_multiple_files(const char* ifile, + const IW_STL_Hash_Map_String& file_contents, + const char* ofile, const char* xfile, + const char* yfile) { iwstring_data_source input(ifile); - if (! input.good()) - { - cerr << "fetch_smiles_quick_multiple_files:cannot open input file '" << ifile << "'\n"; + if (!input.good()) { + cerr << "fetch_smiles_quick_multiple_files:cannot open input file '" << ifile + << "'\n"; return 0; } IWString_and_File_Descriptor output; - if (! 
output.open(ofile)) - { - cerr << "fetch_smiles_quick_multiple_files:cannot open output file '" << ofile << "'\n"; + if (!output.open(ofile)) { + cerr << "fetch_smiles_quick_multiple_files:cannot open output file '" << ofile + << "'\n"; return 0; } IWString_and_File_Descriptor xout, yout; - if (0 == ::strlen(xfile)) + if (0 == ::strlen(xfile)) { ; - else if (xout.open(xfile)) + } else if (xout.open(xfile)) { ; - else - { + } else { cerr << "fetch_smiles_quick_multiple_files:cannot open xfile '" << xfile << "'\n"; return 0; } - if (0 == ::strlen(yfile)) + if (0 == ::strlen(yfile)) { ; - else if (yout.open(yfile)) + } else if (yout.open(yfile)) { ; - else - { + } else { cerr << "fetch_smiles_quick_multiple_files:cannot open yfile '" << yfile << "'\n"; return 0; } @@ -496,34 +483,27 @@ fetch_smiles_quick_multiple_files (const char * ifile, } static int -read_file_contents (iwstring_data_source & input, - IW_STL_Hash_Map_String & file_contents) -{ +read_file_contents(iwstring_data_source& input, IW_STL_Hash_Map_String& file_contents) { IWString buffer; - while (input.next_record(buffer)) - { + while (input.next_record(buffer)) { IWString id; int i = 0; - if (! buffer.nextword(id, i) || ! buffer.nextword(id, i)) - { + if (!buffer.nextword(id, i) || !buffer.nextword(id, i)) { cerr << "read_file_contents:cannot read identifier from '" << buffer << "'\n"; return 0; } - if (! 
file_contents.contains(id)) - { + if (!file_contents.contains(id)) { buffer << '\n'; file_contents[id] = buffer; - } - else if (ignore_duplicate_identifiers_in_identifier_file) - { - if (1 == ignore_duplicate_identifiers_in_identifier_file) - cerr << "Ignoring duplicate identifer '" << id << "', line " << input.lines_read() << "\n"; + } else if (ignore_duplicate_identifiers_in_identifier_file) { + if (1 == ignore_duplicate_identifiers_in_identifier_file) { + cerr << "Ignoring duplicate identifer '" << id << "', line " << input.lines_read() + << "\n"; + } duplicate_identifiers_in_identifier_file++; continue; - } - else - { + } else { cerr << "read_file_contents:duplicate identifier '" << buffer << "'\n"; return 0; } @@ -533,13 +513,10 @@ read_file_contents (iwstring_data_source & input, } static int -read_file_contents (const char * fname, - IW_STL_Hash_Map_String & file_contents) -{ +read_file_contents(const char* fname, IW_STL_Hash_Map_String& file_contents) { iwstring_data_source input(fname); - if (! input.good()) - { + if (!input.good()) { cerr << "read_file_contents:cannot open '" << fname << "'\n"; return 0; } @@ -548,42 +525,37 @@ read_file_contents (const char * fname, } static void -append_int_if_non_zero_length (const IWString & s, - int ndx, - IWString & zresult) -{ - if (s.length() > 0) +append_int_if_non_zero_length(const IWString& s, int ndx, IWString& zresult) { + if (s.length() > 0) { zresult << s << ndx; + } return; } static int -fetch_smiles_quick_multiple_files (const Command_Line & cl, - const IWString & stem, - int seq_begin, - const IWString & suffix, - const IWString & xstem, - const IWString & ystem) -{ +fetch_smiles_quick_multiple_files(const Command_Line& cl, const IWString& stem, + int seq_begin, const IWString& suffix, + const IWString& xstem, const IWString& ystem) { IW_STL_Hash_Map_String file_contents; - if (! 
read_file_contents(cl.last_item(), file_contents)) - { - cerr << "fetch_smiles_quick_multiple_files:cannot read file contents '" << (cl.number_elements() - 1) << "'\n"; + if (!read_file_contents(cl.last_item(), file_contents)) { + cerr << "fetch_smiles_quick_multiple_files:cannot read file contents '" + << (cl.number_elements() - 1) << "'\n"; return 0; } - for (auto i = 0; i < cl.number_elements() - 1; ++i) - { + for (auto i = 0; i < cl.number_elements() - 1; ++i) { IWString stemi, xstemi, ystemi; append_int_if_non_zero_length(stem, i + seq_begin, stemi); - if (suffix.length()) + if (suffix.length()) { stemi << suffix; + } append_int_if_non_zero_length(xstem, i + seq_begin, xstemi); append_int_if_non_zero_length(ystem, i + seq_begin, ystemi); - if (! fetch_smiles_quick_multiple_files(cl[i], file_contents, stemi.null_terminated_chars(), xstemi.null_terminated_chars(), ystemi.null_terminated_chars())) - { + if (!fetch_smiles_quick_multiple_files( + cl[i], file_contents, stemi.null_terminated_chars(), + xstemi.null_terminated_chars(), ystemi.null_terminated_chars())) { cerr << "Cannot process fetching from '" << cl[i] << "'\n"; return 0; } @@ -593,14 +565,11 @@ fetch_smiles_quick_multiple_files (const Command_Line & cl, } static int -fetch_smiles_quick (const char * fname, - IW_STL_Hash_Map_String & identifiers_to_fetch, - IWString_and_File_Descriptor & output) -{ +fetch_smiles_quick(const char* fname, IW_STL_Hash_Map_String& identifiers_to_fetch, + IWString_and_File_Descriptor& output) { iwstring_data_source input(fname); - if (! 
input.good ()) - { + if (!input.good()) { cerr << "Cannot open '" << fname << "'\n"; return 0; } @@ -608,19 +577,18 @@ fetch_smiles_quick (const char * fname, input.set_dos(1); input.set_translate_tabs(1); - if (verbose > 0) + if (verbose > 0) { cerr << "Processing '" << fname << "'\n"; + } return fetch_smiles_quick(input, identifiers_to_fetch, output); } static int -determine_inter_column_separator(const Command_Line & cl, const char flag, char & sep) -{ +determine_inter_column_separator(const Command_Line& cl, const char flag, char& sep) { IWString i = cl.string_value(flag); - if (! char_name_to_char(i)) - { + if (!char_name_to_char(i)) { cerr << "Unrecognised column separator '" << i << "'\n"; return 0; } @@ -631,330 +599,331 @@ determine_inter_column_separator(const Command_Line & cl, const char flag, char } static int -fetch_smiles_quick (int argc, char ** argv) -{ - Command_Line cl(argc, argv, "vc:C:X:Y:wkdzB:an:xF:qbjK:S:u:g:fi:I:"); +fetch_smiles_quick(int argc, char** argv) { + Command_Line cl(argc, argv, "vc:C:X:Y:wkdzB:an:xF:qbjK:S:u:g:fi:I:h"); - if (cl.unrecognised_options_encountered()) - { + if (cl.unrecognised_options_encountered()) { cerr << "Unrecognised options encountered\n"; usage(1); } verbose = cl.option_count('v'); - if (0 == cl.number_elements()) - { + if (cl.empty()) { cerr << "Insufficient arguments\n"; usage(2); } - if (cl.option_present('a')) - { + if (cl.option_present('a')) { erase_identifiers_written = 0; - if (cl.option_present('Y')) - { + if (cl.option_present('Y')) { cerr << "Sorry, the -a and -Y options do not work together\n"; return 6; } - if (verbose) + if (verbose) { cerr << "Will write all instances of identifiers in the smiles file\n"; + } } - if (cl.option_present('i')) - { - if (! 
determine_inter_column_separator(cl, 'i', identifier_file_column_separator)) + if (cl.option_present('i')) { + if (!determine_inter_column_separator(cl, 'i', identifier_file_column_separator)) { return 1; + } } - if (cl.option_present('I')) - { - if (! determine_inter_column_separator(cl, 'I', smiles_file_column_separator)) + if (cl.option_present('I')) { + if (!determine_inter_column_separator(cl, 'I', smiles_file_column_separator)) { return 1; + } } - if (cl.option_present('n')) - { - if (cl.option_present('k')) - { + if (cl.option_present('n')) { + if (cl.option_present('k')) { cerr << "The -n and -k options are mutually exclusive\n"; - usage (5); + usage(5); } cl.value('n', string_before_identifier_file_info); - if (verbose) - cerr << "Will put '" << string_before_identifier_file_info << " before identifier file info\n"; + if (verbose) { + cerr << "Will put '" << string_before_identifier_file_info + << " before identifier file info\n"; + } } - if (cl.option_present('B')) - { - if (! 
cl.value('B', block_size) || block_size < 1) - { + if (cl.option_present('B')) { + if (!cl.value('B', block_size) || block_size < 1) { cerr << "INvalid block size (-B option)\n"; usage(5); } - if (verbose) + if (verbose) { cerr << "Will use " << block_size << " as the output block size\n"; + } } - if (cl.option_present('d')) - { + if (cl.option_present('d')) { ignore_duplicate_identifiers_in_identifier_file = 1; - if (verbose) + if (verbose) { cerr << "Will ignore duplicates in identifier file\n"; + } } - if (cl.option_present('q')) - { + if (cl.option_present('q')) { ignore_duplicate_identifiers_in_identifier_file = 2; - if (verbose) + if (verbose) { cerr << "Will quietly ignore duplicates in identifier file\n"; + } } - if (cl.option_present('f')) - { + if (cl.option_present('f')) { ignore_column_not_present = cl.option_count('f'); - if (verbose) - cerr << "Will ignore records (in both identifier file and target file) where not enough columns\n"; + if (verbose) { + cerr << "Will ignore records (in both identifier file and target file) where not " + "enough columns\n"; + } } - if (cl.option_present('z')) - { + if (cl.option_present('z')) { strip_leading_zeros_from_identifiers = 1; - if (verbose) + if (verbose) { cerr << "Will strip leading zero's from identifiers\n"; + } } - if (cl.option_present('c')) - { - if (! 
cl.value('c', identifier_column_in_identifier_file) || 0 == identifier_column_in_identifier_file) - { + if (cl.option_present('c')) { + if (!cl.value('c', identifier_column_in_identifier_file) || + 0 == identifier_column_in_identifier_file) { cerr << "The column in identifier file option (-c) must be a valid column number\n"; usage(5); } - if (verbose) - cerr << "Identifiers in identifier file in column " << identifier_column_in_identifier_file << endl; + if (verbose) { + cerr << "Identifiers in identifier file in column " + << identifier_column_in_identifier_file << '\n'; + } - if (identifier_column_in_identifier_file > 0) + if (identifier_column_in_identifier_file > 0) { identifier_column_in_identifier_file--; + } } - if (cl.option_present('C')) - { + if (cl.option_present('C')) { const_IWSubstring c = cl.string_value('C'); - if (c.starts_with("RX=")) - { + if (c.starts_with("RX=")) { c.remove_leading_chars(3); - if (! iwre2::RE2Reset(identifier_regexp_in_smiles_file, c)) - { + if (!iwre2::RE2Reset(identifier_regexp_in_smiles_file, c)) { cerr << "Invalid smiles file identifier regexp '" << c << "'\n"; return 5; } - if (verbose) - cerr << "Identifiers in smiles file must match '" << identifier_regexp_in_smiles_file->pattern() << "'\n"; + if (verbose) { + cerr << "Identifiers in smiles file must match '" + << identifier_regexp_in_smiles_file->pattern() << "'\n"; + } identifier_column_in_smiles_file = -1; } -// else if (! cl.value('C', identifier_column_in_smiles_file) || identifier_column_in_smiles_file < 1) - else if (! cl.value('C', identifier_column_in_smiles_file)) - { + // else if (! 
cl.value('C', identifier_column_in_smiles_file) || + // identifier_column_in_smiles_file < 1) + else if (!cl.value('C', identifier_column_in_smiles_file)) { cerr << "The column in smiles file option (-C) must be a whole +ve number\n"; usage(5); + } else if (verbose) { + cerr << "Identifiers in smiles file in column " << identifier_column_in_smiles_file + << '\n'; } - else if (verbose) - cerr << "Identifiers in smiles file in column " << identifier_column_in_smiles_file << endl; - if (identifier_column_in_smiles_file > 0) + if (identifier_column_in_smiles_file > 0) { identifier_column_in_smiles_file--; + } } - if (cl.option_present('k')) - { + if (cl.option_present('k')) { include_identifier_file_info = 0; - if (verbose) + if (verbose) { cerr << "Suppress addition of identifier file info\n"; + } } int fail_if_read_identifiers_to_fetch_fails = 1; - if (cl.option_present('b')) - { + if (cl.option_present('b')) { fail_if_read_identifiers_to_fetch_fails = 0; - if (verbose) - cerr << "Will stop processing identifier file on error, and continue with identifiers found so far\n"; + if (verbose) { + cerr << "Will stop processing identifier file on error, and continue with " + "identifiers found so far\n"; + } } - if (cl.option_present('j')) - { + if (cl.option_present('j')) { identifier_file_is_descriptor_file = 1; - if (verbose) + if (verbose) { cerr << "Will skip header record in identifier file\n"; + } } IW_STL_Hash_Map_String identifiers_to_fetch; identifiers_to_fetch.rehash(1000000); -// Where do we start scanning the command line + // Where do we start scanning the command line int clstart = 1; - if (cl.option_present('F')) // nobody uses this any more - { - const char * f = cl.option_value('F'); - if (! 
read_identifiers_to_fetch(f, identifiers_to_fetch)) - { + if (cl.option_present('F')) { // nobody uses this any more + const char* f = cl.option_value('F'); + if (!read_identifiers_to_fetch(f, identifiers_to_fetch)) { cerr << "Cannot read identifiers to fetch from '" << f << "'\n"; return 6; } clstart = 0; - } - else if (cl.option_present('K')) - { - int i = 0; + } else if (cl.option_present('K')) { IWString k; - while (cl.value('K', k, i++)) - { + for (int i = 0; cl.value('K', k, i); ++i) { identifiers_to_fetch[k] = ""; } clstart = 0; - } - else if (cl.number_elements() < 2) - { + } else if (cl.number_elements() < 2) { cerr << "Must specify at least two files\n"; usage(2); - } - else if (cl.option_present('S')) + } else if (cl.option_present('S')) { ; - else if (read_identifiers_to_fetch(cl[0], identifiers_to_fetch)) + } else if (read_identifiers_to_fetch(cl[0], identifiers_to_fetch)) { ; - else if (fail_if_read_identifiers_to_fetch_fails || 0 == identifiers_to_fetch.size()) - { + } else if (fail_if_read_identifiers_to_fetch_fails || + 0 == identifiers_to_fetch.size()) { cerr << "Cannot read identifiers to fetch from '" << cl[0] << "'\n"; return 6; - } - else - { - cerr << "Warning, possibly incomplete data read from '" << cl[0] << "', allowed by -b option\n"; + } else { + cerr << "Warning, possibly incomplete data read from '" << cl[0] + << "', allowed by -b option\n"; } - if (verbose) + if (verbose) { cerr << "Will fetch " << identifiers_to_fetch.size() << " identifiers\n"; + } - if (cl.option_present('x')) - { + if (cl.option_present('x')) { invert_fetching_operation = 1; - if (verbose) + if (verbose) { cerr << "Will discard rather than fetch identifiers\n"; + } } - if (cl.option_present('w')) - { + if (cl.option_present('w')) { write_stream_for_not_in_smiles_as_smiles = 1; - if (verbose) + if (verbose) { cerr << "Will write stream for not in smiles file as smiles\n"; + } + } + + if (cl.option_present('h')) { + flush_output = 1; + if (verbose) { + cerr << "Will 
flush output after each molecule found\n"; + } } IWString stem; - if (cl.option_present('S')) - { + if (cl.option_present('S')) { cl.value('S', stem); - if (verbose) + if (verbose) { cerr << "Files formed from stem '" << stem << "'\n"; + } IWString xstem, ystem; - if (cl.option_present('X')) + if (cl.option_present('X')) { cl.value('X', xstem); - if (cl.option_present('Y')) + } + if (cl.option_present('Y')) { cl.value('Y', ystem); + } IWString suffix; - if (cl.option_present('u')) + if (cl.option_present('u')) { cl.value('u', suffix); + } int seq_begin = 0; - if (cl.option_present('g')) + if (cl.option_present('g')) { cl.value('g', seq_begin); + } - if (! fetch_smiles_quick_multiple_files(cl, stem, seq_begin, suffix, xstem, ystem)) + if (!fetch_smiles_quick_multiple_files(cl, stem, seq_begin, suffix, xstem, ystem)) { return 2; - else + } else { return 1; + } } - if (cl.option_present('X')) - { + if (cl.option_present('X')) { IWString fname = cl.string_value('X'); - if (write_stream_for_not_in_smiles_as_smiles && ! fname.ends_with(".smi")) + if (write_stream_for_not_in_smiles_as_smiles && !fname.ends_with(".smi")) { fname << ".smi"; + } - if (fname == cl[0] || fname == cl[1]) - { + if (fname == cl[0] || fname == cl[1]) { cerr << "Cannot overwrite input file(s) '" << fname << "' (-X)\n"; return 1; } stream_for_not_in_identifier_file.open(fname.null_terminated_chars(), std::ios::out); - if (! stream_for_not_in_identifier_file.good()) - { - cerr << "Cannot open stream for not in identifier file (-X option), '" << fname << "'\n"; + if (!stream_for_not_in_identifier_file.good()) { + cerr << "Cannot open stream for not in identifier file (-X option), '" << fname + << "'\n"; return 4; } - if (verbose) + if (verbose) { cerr << "Smiles not in identifier file written to '" << fname << "'\n"; + } } - if (cl.option_present('Y')) - { + if (cl.option_present('Y')) { IWString fname = cl.string_value('Y'); - if (write_stream_for_not_in_smiles_as_smiles && ! 
fname.ends_with(".smi")) + if (write_stream_for_not_in_smiles_as_smiles && !fname.ends_with(".smi")) { fname << ".smi"; + } - if (fname == cl[0] || fname == cl[1]) - { + if (fname == cl[0] || fname == cl[1]) { cerr << "Cannot overwrite input file(s) '" << fname << "' (-Y)\n"; return 1; } stream_for_not_in_smiles_file.open(fname.null_terminated_chars(), std::ios::out); - if (! stream_for_not_in_smiles_file.good()) - { - cerr << "Cannot open stream for not in smiles file (-Y option), '" << fname << "'\n"; + if (!stream_for_not_in_smiles_file.good()) { + cerr << "Cannot open stream for not in smiles file (-Y option), '" << fname + << "'\n"; return 4; } - if (verbose) + if (verbose) { cerr << "Identifiers not in smiles file written to '" << fname << "'\n"; + } } IWString_and_File_Descriptor output(1); int rc = 0; - for (int i = clstart; i < cl.number_elements(); i++) - { - if (! fetch_smiles_quick(cl[i], identifiers_to_fetch, output)) - { + for (int i = clstart; i < cl.number_elements(); i++) { + if (!fetch_smiles_quick(cl[i], identifiers_to_fetch, output)) { rc = i + 1; break; } @@ -962,37 +931,39 @@ fetch_smiles_quick (int argc, char ** argv) output.flush(); - if (0 == items_written) + if (0 == items_written) { cerr << "Warning, nothing written!\n"; + } - if (verbose) - { + if (verbose) { cerr << items_written << " items written\n"; - if (not_in_identifier_file) + if (not_in_identifier_file) { cerr << not_in_identifier_file << " items not in identifier file\n"; - if (duplicate_identifiers_in_identifier_file) - cerr << "ignored " << duplicate_identifiers_in_identifier_file << " duplicate identifiers in identifier file\n"; + } + if (duplicate_identifiers_in_identifier_file) { + cerr << "ignored " << duplicate_identifiers_in_identifier_file + << " duplicate identifiers in identifier file\n"; + } } - if (stream_for_not_in_smiles_file.rdbuf()->is_open() && identifiers_to_fetch.size()) - { - if (verbose) + if (stream_for_not_in_smiles_file.rdbuf()->is_open() && 
identifiers_to_fetch.size()) { + if (verbose) { cerr << identifiers_to_fetch.size() << " identifiers not in smiles file\n"; + } - for (IW_STL_Hash_Map_String::const_iterator i = identifiers_to_fetch.begin(); i != identifiers_to_fetch.end(); ++i) - { - if (write_stream_for_not_in_smiles_as_smiles) - { - if ((*i).second.length()) + for (IW_STL_Hash_Map_String::const_iterator i = identifiers_to_fetch.begin(); + i != identifiers_to_fetch.end(); ++i) { + if (write_stream_for_not_in_smiles_as_smiles) { + if ((*i).second.length()) { stream_for_not_in_smiles_file << (*i).second << ' ' << (*i).first; - else + } else { stream_for_not_in_smiles_file << (*i).first; - } - else - { + } + } else { stream_for_not_in_smiles_file << (*i).first; - if ((*i).second.length()) + if ((*i).second.length()) { stream_for_not_in_smiles_file << ' ' << (*i).second; + } } stream_for_not_in_smiles_file << '\n'; } @@ -1002,8 +973,7 @@ fetch_smiles_quick (int argc, char ** argv) } int -main (int argc, char ** argv) -{ +main(int argc, char** argv) { prog_name = argv[0]; int rc = fetch_smiles_quick(argc, argv); diff --git a/src/Utilities/General/model_average.cc b/src/Utilities/General/model_average.cc index a7ad8b3b..23f8c7c3 100644 --- a/src/Utilities/General/model_average.cc +++ b/src/Utilities/General/model_average.cc @@ -13,7 +13,6 @@ #include "Foundational/data_source/iwstring_data_source.h" using std::cerr; -using std::endl; const char* prog_name = NULL; @@ -60,21 +59,23 @@ usage(int rc) #endif // clang-format on // clang-format off - cerr << "Performs model averaging for regression models. All values in one file\n"; - cerr << " -c model in column \n"; - cerr << " -c ,w=0.2 model in column , specify relative weight\n"; - cerr << " -a ... 
type of averaging to do\n"; - cerr << " -a ave average of all values (default)\n"; - cerr << " -a max final result is max of all values encountered\n"; - cerr << " -a sum final result is sum of all values encountered\n"; - cerr << " -a pairs many-class model, pair-wise models built\n"; - cerr << " -a ovsall many-class model, individual built\n"; - cerr << " -C classification model, convert result to class\n"; - cerr << " -d discern class labels from input\n"; - cerr << " -f write composite prediction first rather than last\n"; - cerr << " -n just write the composite prediction\n"; - cerr << " -r include range/average/standard deviation of component models\n"; - cerr << " -v verbose output\n"; + cerr << R"( +Performs model averaging for regression models. All values in one file + -c model in column . c1,c2,c3 and c1-c4 also recognised. + -c ,w=0.2 model in column , specify relative weight + -a ... type of averaging to do + -a ave average of all values (default) + -a max final result is max of all values encountered + -a sum final result is sum of all values encountered + -a pairs many-class model, pair-wise models built + -a ovsall many-class model, individual built + -C classification model, convert result to class + -d discern class labels from input + -f write composite prediction first rather than last + -n just write the composite prediction + -r include range/average/standard deviation of component models + -v verbose output +)"; // clang-format on exit(rc); diff --git a/src/Vendor/BUILD b/src/Vendor/BUILD index ad87b375..913f5662 100644 --- a/src/Vendor/BUILD +++ b/src/Vendor/BUILD @@ -5,6 +5,7 @@ local_install( name = "install", srcs = [ ":clogp2descriptors_biobyte", + ":marvin_pka", ":rcorina", ], tags = [ @@ -45,6 +46,18 @@ cc_binary( ], ) +cc_binary( + name = "marvin_pka", + srcs = [ + "marvin_pka.cc", + ], + deps = [ + "//Foundational/cmdline:iwcmdline", + "//Molecule_Lib:iwmolecule", + "//Molecule_Lib:moleculeio", + ], +) + cc_binary( name = "rcorina", 
srcs = [ diff --git a/src/Molecule_Tools/marvin_pka.cc b/src/Vendor/marvin_pka.cc similarity index 100% rename from src/Molecule_Tools/marvin_pka.cc rename to src/Vendor/marvin_pka.cc diff --git a/src/WORKSPACE b/src/WORKSPACE index 0e549251..75e057f5 100644 --- a/src/WORKSPACE +++ b/src/WORKSPACE @@ -160,7 +160,7 @@ new_local_repository( build_file_content = """ cc_library( name = "x86-simd.sort", - hdrs = + hdrs = glob( ["src/*hpp"], ), @@ -215,7 +215,7 @@ new_local_repository( cc_library( name = "xgboost", srcs = [ - "lib64/libxgboost.so", + "lib/libxgboost.so", ], hdrs = glob( diff --git a/src/build_from_src.sh b/src/build_from_src.sh deleted file mode 100755 index 5c7b151f..00000000 --- a/src/build_from_src.sh +++ /dev/null @@ -1,123 +0,0 @@ -#!/bin/bash - -echo "Builds and installs LillyMol executables" -echo "The assumption is that WORKSPACE and build_deps/install.bzl" -echo "have both been configured." -echo "" -echo "First task is to build and run C++ unit tests." - -# note that we do not check return codes from any of the invocations. - -# If bazelisk is available use it, otherwise try bazel, otherwise fail. - -if [[ ! -z "$(type -p bazelisk)" ]] ; then - bazel='bazelisk' -elif [[ ! -z "$(type -p bazel)" ]] ; then - bazel='bazel' -else - echo "No bazel or bazelisk, build will fail" - bazel='bazelisk' -fi - -# Adjust to local resource availability. -jobs='8' - -declare -i inside_lilly -if [[ $(hostname -d) =~ 'lilly.com' ]] ; then - inside_lilly=1 -else - inside_lilly=0 -fi - -# Options that are used by all bazelisk invocations. - -# bazel will not work on an NFS mounted file system. So if you are on an NFS -# file system, you must specify a value for --output_user_root that is -# locally mounted. -# Note that the bazel cache can get quite large, 1-2GB. 
- -# If inside Lilly, some local scratch storage -if [[ ${inside_lilly} -eq 1 && -d '/node/scratch' ]] ; then - bazel_options="--output_user_root=/node/scratch/${USER}" -elif [[ $(df -TP ${HOME}) =~ 'nfs' ]] ; then - echo "Your HOME dir is an NFS mounted file system. bazel will not work." - echo "Will attempt to use /tmp/ for bazel cache, that will need to be changed." - bazel_options='--output_user_root=/tmp' -else - # Even if outside Lilly, you may still need to set this - bazel_options="" -fi - -build_options="--cxxopt=-DGIT_HASH=\"$(git rev-parse --short --verify HEAD)\" --cxxopt=-DTODAY=\"$(date +%Y-%b-%d)\" --jobs=${jobs} -c opt" - -# All BerkeleyDB things are now isolated, so this probably makes no difference. -if [[ ! -v BUILD_BDB ]] ; then - build_options+=' --build_tag_filters=-berkeleydb' -fi - -# First task is unit tests - -${bazel} ${bazel_options} test ${build_options} Foundational/...:all -${bazel} ${bazel_options} test ${build_options} Molecule_Lib:all -${bazel} ${bazel_options} test ${build_options} Molecule_Tools:all -${bazel} ${bazel_options} test ${build_options} Utilities/...:all - -# Currently no tests in these. -if [[ -v BUILD_BDB ]] ; then - # ${bazel} ${bazel_options} test ${build_options} BerkeleyDB:all - # ${bazel} ${bazel_options} test ${build_options} Molecule_Tools_Bdb:all - echo "" -fi - -# Once the tests run, then executables can be built. -# donor_acceptor_test frequently fails due to lack of supporting files. - -if [[ ! 
-v BUILD_LIBRARY_ONLY ]] ; then - echo "Building tools" - ${bazel} ${bazel_options} build ${build_options} Molecule_Tools:all - ${bazel} ${bazel_options} build ${build_options} Obsolete:all - ${bazel} ${bazel_options} build ${build_options} Obsolete/Descriptor_Similarity:all - ${bazel} ${bazel_options} build ${build_options} Foundational/iw_tdt:all - ${bazel} ${bazel_options} build ${build_options} Utilities/...:all -fi - -if [[ ${inside_lilly} -eq 1 || -v BUILD_VENDOR ]] ; then - ${bazel} ${bazel_options} build ${build_options} Vendor/...:all -fi - -if [[ -v BUILD_BDB ]] ; then - ${bazel} ${bazel_options} build ${build_options} BerkeleyDB:all - ${bazel} ${bazel_options} build ${build_options} Molecule_Tools_Bdb:all -fi - -# Now install the targets - -if [[ ! -v BUILD_LIBRARY_ONLY ]] ; then - echo "Installing tools" - ${bazel} ${bazel_options} run ${build_options} Foundational/iw_tdt:install - ${bazel} ${bazel_options} run ${build_options} Molecule_Tools:install - ${bazel} ${bazel_options} run ${build_options} Obsolete:install - ${bazel} ${bazel_options} run ${build_options} Obsolete/Descriptor_Similarity:install - ${bazel} ${bazel_options} run ${build_options} Utilities/General:install - ${bazel} ${bazel_options} run ${build_options} Utilities/GFP_Tools:install - ${bazel} ${bazel_options} run ${build_options} Utilities/GFP_Knn:install - ${bazel} ${bazel_options} run ${build_options} Utilities/Distance_Matrix:install -fi - -if [[ ${inside_lilly} -eq 1 || -v BUILD_VENDOR ]] ; then - ${bazel} ${bazel_options} run ${build_options} Vendor:install -fi - -if [[ -v BUILD_BDB ]] ; then - ${bazel} ${bazel_options} run ${build_options} BerkeleyDB:install - ${bazel} ${bazel_options} run ${build_options} Molecule_Tools_Bdb:install -fi - -# Python if requested, build, install and test. -# Note that PYTHONPATH will need to be adjusted, or copy the shared -# libraries from LillyMol/lib to your default PYTHONPATH. 
- -if [[ -v BUILD_PYTHON ]] ; then - ${bazel} ${bazel_options} build ${build_options} pybind:all - ./copy_shared_libraries.sh ../lib -fi diff --git a/src/build_linux.sh b/src/build_linux.sh index e23cd1d8..3507204f 100755 --- a/src/build_linux.sh +++ b/src/build_linux.sh @@ -31,11 +31,11 @@ fi if [[ -v BUILD_PYTHON ]] ; then # Use python to update WORKSPACE for python locations. if [[ -s 'update_python_in_workspace.py' ]] ; then - cp WORKSPACE /tmp - python3 ./update_python_in_workspace.py /tmp/WORKSPACE > WORKSPACE + cp WORKSPACE "/tmp/WORKSPACE_${USER}" + python3 ./update_python_in_workspace.py "/tmp/WORKSPACE_${USER}" > WORKSPACE if [[ ! -s WORKSPACE ]] ; then echo "Updating WORKSPACE failed, restoring orignal, python bindings will not work" - cp -f /tmp/WORKSPACE WORKSPACE + cp -f "/tmp/WORKSPACE_${USER}" WORKSPACE fi echo "WORKSPACE updated" else @@ -52,13 +52,13 @@ fi bindir=$(echo $REPO_HOME/bin/$(uname) | sed -e 's/\//\\\//g') # Make a copy -cp build_deps/install.bzl /tmp/install.bzl.orig +cp build_deps/install.bzl /tmp/install.bzl.${USER} sed -i -e "s/default *= *\".*\",/default = \"${bindir}\",/" build_deps/install.bzl # Create bindir if not already present bindir=$REPO_HOME/bin/$(uname) -if [[ ! -d ${bindir} ]] ; then +if [[ ! -d ${bindir} ]] ; then mkdir -p ${bindir} fi @@ -80,14 +80,14 @@ third_party=$REPO_HOME/third_party echo "third_party in ${third_party}" if [[ ! -d "${third_party}" ]] ; then - mkdir -p "${third_party}" + mkdir -p "${third_party}" fi lib=$REPO_HOME/lib echo "lib in ${lib}" if [[ ! -d "${lib}" ]] ; then - mkdir -p "${lib}" + mkdir -p "${lib}" fi @@ -100,7 +100,7 @@ fi pushd $third_party if [[ ! 
-d ${third_party}/bin ]] ; then - mkdir -p ${third_party}/bin + mkdir -p ${third_party}/bin fi # If we clone the repo we must build it, even if the @@ -158,7 +158,7 @@ fi if [[ -v BUILD_XGBOOST ]] ; then git clone --recursive https://github.com/dmlc/xgboost mkdir xgboost/build - (cd xgboost/build && cmake -DCMAKE_INSTALL_PREFIX=${third_party} ..) + (cd xgboost/build && cmake -DCMAKE_INSTALL_PREFIX=${third_party} -DCMAKE_INSTALL_LIBDIR=${third_party}/lib ..) (cd xgboost/build && make -j${THREADS}) (cd xgboost/build && make install) fi @@ -215,6 +215,9 @@ build_options="--cxxopt=-DGIT_HASH=\"$(git rev-parse --short --verify HEAD)\" -- build_options="${build_options} --noincompatible_use_python_toolchains" +# Enable partial builds. +build_options="${build_options} -k" + # Outside Lily use native architective if [[ ${inside_lilly} -eq 1 ]] ; then build_options="${build_options} --cxxopt=-march=sandybridge --cxxopt=-mtune=sandybridge" @@ -247,6 +250,10 @@ if [[ ! -v BUILD_XGBOOST ]] ; then build_tag_filters+=('-xgboost') fi +if [[ ${inside_lilly} -eq 0 ]] ; then + build_tag_filters+=('-vendor') +fi + if [[ "${#build_tag_filters[@]}" -gt 0 ]] ; then tmp=$(IFS=,;printf '%s' "${build_tag_filters[*]}") build_options="${build_options} --build_tag_filters=${tmp}" @@ -292,6 +299,10 @@ if [[ -v BUILD_GO ]] ; then ${bazel} ${bazel_options} build ${build_options} go:all fi +if [[ -v BUILD_XGBOOST ]] ; then + ${bazel} ${bazel_options} build ${build_options} xgboost:all +fi + # Now install the targets if [[ ! -v BUILD_LIBRARY_ONLY ]] ; then @@ -301,8 +312,9 @@ if [[ ! 
-v BUILD_LIBRARY_ONLY ]] ; then ${bazel} ${bazel_options} run ${build_options} Obsolete:install ${bazel} ${bazel_options} run ${build_options} Obsolete/Descriptor_Similarity:install ${bazel} ${bazel_options} run ${build_options} Utilities/General:install - ${bazel} ${bazel_options} run ${build_options} Utilities/GFP_Tools:install + ${bazel} ${bazel_options} run ${build_options} Utilities/GeneExpression:install ${bazel} ${bazel_options} run ${build_options} Utilities/GFP_Knn:install + ${bazel} ${bazel_options} run ${build_options} Utilities/GFP_Tools:install ${bazel} ${bazel_options} run ${build_options} Utilities/Distance_Matrix:install ${bazel} ${bazel_options} run ${build_options} go:install fi @@ -316,6 +328,10 @@ if [[ -v BUILD_BDB ]] ; then ${bazel} ${bazel_options} run ${build_options} Molecule_Tools_Bdb:install fi +if [[ -v BUILD_XGBOOST ]] ; then + ${bazel} ${bazel_options} run ${build_options} xgboost:install +fi + # Python if requested, build, install and test. # Note that PYTHONPATH will need to be adjusted, or copy the shared # libraries from LillyMol/lib to your default PYTHONPATH. 
diff --git a/src/go/BUILD b/src/go/BUILD new file mode 100644 index 00000000..2d4df960 --- /dev/null +++ b/src/go/BUILD @@ -0,0 +1,56 @@ +load("@rules_go//go:def.bzl", "go_binary", "go_test") +load("//build_deps:install.bzl", "local_install") + +local_install( + name = "install", + srcs = [ + ":grep_sdf", + ":parallel_process_file", + ":no_spaces_in_file_name", + ":regression_to_classification", + ":rxn_reverse", + ":rxnsmiles2smi", + ], +) + +go_binary( + name = "parallel_process_file", + srcs = [ + "parallel_process_file.go", + ], +) + +go_binary( + name = "no_spaces_in_file_name", + srcs = [ + "no_spaces_in_file_name.go", + ], +) + +go_binary( + name = "regression_to_classification", + srcs = [ + "regression_to_classification.go", + ], +) + +go_binary( + name = "rxn_reverse", + srcs = [ + "rxn_reverse.go", + ], +) + +go_binary( + name = "rxnsmiles2smi", + srcs = [ + "rxnsmiles2smi.go", + ], +) + +go_binary( + name = "grep_sdf", + srcs = [ + "grep_sdf.go", + ], +) diff --git a/src/hello_world/BUILD b/src/hello_world/BUILD new file mode 100644 index 00000000..5a26b838 --- /dev/null +++ b/src/hello_world/BUILD @@ -0,0 +1,8 @@ +cc_binary( + name = "hello_world", + srcs = [ + "hello_world.cc", + ], + deps = [ + ], +) diff --git a/src/julia/BUILD b/src/julia/BUILD new file mode 100644 index 00000000..8059b5dc --- /dev/null +++ b/src/julia/BUILD @@ -0,0 +1,106 @@ +cc_library ( + name = "lillymol_julia", + srcs = [ + "lillymol.cc", + ], + deps = [ + ":resizable_array_holder", + "//Molecule_Lib:iwmolecule", + "//Molecule_Lib:ring_data", + "//Molecule_Lib:moleculeio", + "//Molecule_Lib:iwreaction", + "//Molecule_Tools:xlogp_lib", + "//Molecule_Tools:alogp_lib", + "@com_gitlab_libeigen_eigen//:eigen", + "@libcxxwrap//:libcxxwrap", + "@julia//:julia", + ], + tags = [ + "julia", + ], +) + +cc_library ( + name = "lillymol_substructure", + srcs = [ + "lillymol_substructure.cc", + ], + deps = [ + ":resizable_array_holder", + "//Molecule_Lib:iwmolecule", + 
"//Molecule_Lib:moleculeio", + "@libcxxwrap//:libcxxwrap", + "@julia//:julia", + ], + tags = [ + "julia", + ], +) + + +cc_library( + name = "bag_julia", + srcs = [ + "bag.cc", + ], + deps = [ + "@libcxxwrap//:libcxxwrap", + "@julia//:julia", + ], + tags = [ + "julia", + ], +) + +cc_shared_library( + name = "lillymol_shared", + deps = [ + ":lillymol_julia", + ], + dynamic_deps = [ + ], + shared_lib_name = "lillymol_julia.so", + tags = [ + "julia", + ], +) + +cc_shared_library( + name = "lillymol_substructure_shared", + deps = [ + ":lillymol_julia", + ], + dynamic_deps = [ + ], + shared_lib_name = "lillymol_substructure_julia.so", + tags = [ + "julia", + ], +) + + +cc_shared_library( + name = "box_shared", + deps = [ + ":bag_julia", + ], + dynamic_deps = [ + ], + shared_lib_name = "bag_julia.so", + tags = [ + "julia", + ], +) + +cc_library ( + name = "resizable_array_holder", + hdrs = [ + "resizable_array_holder.h" + ], + deps = [ + "//Foundational/iwaray", + ], + tags = [ + "julia", + ], +) diff --git a/src/pybind/lillymol_pybind.cc b/src/pybind/lillymol_pybind.cc index c8887ec9..c53a0ebd 100644 --- a/src/pybind/lillymol_pybind.cc +++ b/src/pybind/lillymol_pybind.cc @@ -145,6 +145,24 @@ ToScaffold(Molecule& m) { return 0; } + // Add back in singly connected =O and =N + // Note that we do this for all scaffold atoms, + // aromatic rings, aliphatic rings and in linker groups. + for (int i = 0; i < matoms; ++i) { + const Atom& a = m[i]; + if (a.ncon() != 1) { + continue; + } + const Bond* b = a[0]; + if (! 
b->is_double_bond()) { + continue; + } + atom_number_t o = b->other(i); + if (spinach[o] == 0) { + spinach[i] = 0; + } + } + return m.remove_atoms(spinach.get(), 1); } @@ -230,6 +248,7 @@ PYBIND11_MODULE(lillymol, m) .def("set_append_molecular_formula", &Mol2Graph::set_append_molecular_formula, "set_append_molecular_formula") .def("set_aromatic_distinguishing_formula", &Mol2Graph::set_aromatic_distinguishing_formula, "set_aromatic_distinguishing_formula") .def("set_remove_chiral_centres", &Mol2Graph::set_remove_chiral_centres, "set_remove_chiral_centres") + .def("turn_on_most_useful_options", &Mol2Graph::TurnOnMostUsefulOptions, "turn on the options you probably want") .def("set_active", &Mol2Graph::set_active, "Set active") .def("active", &Mol2Graph::active, "True if active") ; @@ -383,6 +402,16 @@ PYBIND11_MODULE(lillymol, m) }, "For each atom the ring system identifier" ) + .def("label_atoms_by_ring_system_including_spiro_fused_np", + [](Molecule& m)-> py::array_t { + py::array_t result = mkarray_via_buffer(m.natoms()); + auto req = result.request(); + int* ptr = static_cast(req.ptr); + m.label_atoms_by_ring_system_including_spiro_fused(ptr); + return result; + }, + "For each atom the ring system identifier" + ) .def("amw", static_cast(&Molecule::molecular_weight), "AMW") .def("exact_mass", static_cast(&Molecule::exact_mass), "Exact Mass") .def("ncon", static_cast(&Molecule::ncon), "Connections to Atom") @@ -488,6 +517,21 @@ PYBIND11_MODULE(lillymol, m) }, "Remove a set of atoms" ) + .def("remove_atoms", + [](Molecule& m, py::array_t to_remove, int flag)->int { + //auto req = to_remove.request(); + int* ptr = static_cast(to_remove.request().ptr); + int rc = 0; + for (int i = m.natoms() - 1; i >= 0; --i) { + if (ptr[i] == flag) { + m.remove_atom(i); + ++rc; + } + } + return rc; + }, + "Remove atoms where to_remove[i] == flag" + ) .def("sort_atoms", [](Molecule& m, const std::vector& order) { static constexpr int kAscending = 1; @@ -771,6 +815,16 @@ 
PYBIND11_MODULE(lillymol, m) }, "Set isotope for atoms in 's'" ) + .def("set_isotopes", + [](Molecule& m, py::array_t iso) { + int* ptr = static_cast(iso.request().ptr); + const int matoms = m.natoms(); + for (int i = 0; i < matoms; ++i) { + m.set_isotope(i, ptr[i]); + } + }, + "Set isotope for each atom" + ) .def("number_isotopic_atoms", static_cast(&Molecule::number_isotopic_atoms), "Number atoms with isotopes") .def("first_atom_with_isotope", [](const Molecule& m, isotope_t iso) -> atom_number_t { @@ -804,6 +858,46 @@ PYBIND11_MODULE(lillymol, m) }, "Most separated atoms" ) + .def("atoms_on_shortest_path", + [](Molecule& m, atom_number_t a1, atom_number_t a2) ->std::optional { + Set_of_Atoms result; + if (! m.atoms_between(a1, a2, result)) { + return std::nullopt; + } + + if (result.empty()) { + return std::nullopt; + } + + return result; + }, + "Return list of atoms on the shortest path between a1 and a2" + ) + .def("down_the_bond", + [](Molecule& m, atom_number_t a1, atom_number_t a2)->std::optional { + const int matoms = m.natoms(); + std::unique_ptr dtb = std::make_unique(matoms); + std::fill_n(dtb.get(), matoms, 0); + std::optional maybe_n = m.DownTheBond(a1, a2, dtb.get()); + if (! 
maybe_n) { + return std::nullopt; + } + // std::cerr << "Found " << *maybe_n << " atoms down the " << a1 << " " << a2 << " bond\n"; + Set_of_Atoms result; + result.reserve(*maybe_n); + for (int i = 0; i < matoms; ++i) { + if (i == a2) { + continue; + } + if (dtb[i]) { + result << i; + } + } + // std::cerr << "Returning " << result << '\n'; + return result; + }, + "Return all the atoms found looking down the bond from a1 to a2" + ) .def("reset_atom_map_numbers", static_cast(&Molecule::reset_all_atom_map_numbers), "Reset atom map numbers") .def("set_atom_map_number", static_cast(&Molecule::set_atom_map_number), "Set atom map number") @@ -1423,7 +1517,14 @@ PYBIND11_MODULE(lillymol, m) lhs += rhs; return lhs; }, - "add contents of RHS to LHS returning new Set_of_Atoms" + "add contents of RHS to LHS returning lhs" + ) + .def("__iadd__", + [](Set_of_Atoms& lhs, atom_number_t a)->Set_of_Atoms { + lhs.add(a); + return lhs; + }, + "Add atom `a` to lhs" ) ; @@ -1569,12 +1670,11 @@ PYBIND11_MODULE(lillymol, m) }, "Return a list of molecules" ); - m.def("set_auto_create_new_elements", &set_auto_create_new_elements, "auto create new elements"); + m.def("set_auto_create_new_elements", &set_auto_create_new_elements, "Allow arbitrary two letter elements"); m.def("set_atomic_symbols_can_have_arbitrary_length", &set_atomic_symbols_can_have_arbitrary_length, "any string is an element"); m.def("interpret_D_as_deuterium", &element::interpret_d_as_deuterium, "D means '[2H]'"); m.def("interpret_T_as_deuterium", &element::interpret_t_as_tritium, "T means '[3H]'"); m.def("set_display_strange_chemistry_messages", &set_display_strange_chemistry_messages, "turn off messages about bad valences"); - m.def("set_auto_create_new_elements", &set_auto_create_new_elements, "Allow arbitrary two letter elements"); m.def("set_atomic_symbols_can_have_arbitrary_length", &set_atomic_symbols_can_have_arbitrary_length, "Enable elements like 'Ala', 'Gly'"); 
m.def("set_display_smiles_interpretation_error_messages", &set_display_smiles_interpretation_error_messages, "Set smiles error messages"); m.def("count_atoms_in_smiles", @@ -1643,7 +1743,12 @@ PYBIND11_MODULE(lillymol, m) py::class_(m, "RotatableBonds") .def(py::init<>()) - .def("rotatable_bonds", &quick_rotbond::QuickRotatableBonds::Process) + .def("rotatable_bonds", + [](quick_rotbond::QuickRotatableBonds& rotb, Molecule& m)->int { + return rotb.Process(m, nullptr); + }, + "Number of rotatable bonds in `m`" + ) .def("set_calculation_type", &quick_rotbond::QuickRotatableBonds::set_calculation_type) ; diff --git a/src/pybind/lillymol_pybind_fingerprint.cc b/src/pybind/lillymol_pybind_fingerprint.cc index 9c82b436..7831eb31 100644 --- a/src/pybind/lillymol_pybind_fingerprint.cc +++ b/src/pybind/lillymol_pybind_fingerprint.cc @@ -81,6 +81,10 @@ class LinearFingerprintByte : public BaseFpGenerator { public: LinearFingerprintByte(int nb); + void set_max_length(uint32_t s) { + _fp.set_max_length(s); + } + py::array_t Fingerprint(Molecule& m); }; @@ -118,6 +122,10 @@ class ECFingerprintByte : public BaseFpGenerator { public: ECFingerprintByte(int nb); + void set_max_radius(uint32_t rad) { + _fp.set_max_radius(rad); + } + py::array_t Fingerprint(Molecule& m); }; @@ -264,6 +272,7 @@ PYBIND11_MODULE(lillymol_fingerprint, m) }, "Set atom type" ) + .def("set_max_radius", &ECFingerprintByte::set_max_radius, "Max radius for fingerprints") .def("fingerprint", [](ECFingerprintByte& fp_creator, Molecule& m) { return fp_creator.Fingerprint(m); @@ -281,6 +290,7 @@ PYBIND11_MODULE(lillymol_fingerprint, m) }, "Set atom type" ) + .def("set_max_length", &LinearFingerprintByte::set_max_length, "Max path length") .def("fingerprint", [](LinearFingerprintByte& fp_creator, Molecule& m) { return fp_creator.Fingerprint(m); @@ -298,6 +308,7 @@ PYBIND11_MODULE(lillymol_fingerprint, m) }, "Set atom type" ) + .def("set_max_separation", &AtomPairFingerprintByte::set_max_separation, "max separation 
between atoms") .def("fingerprint", [](AtomPairFingerprintByte& fp_creator, Molecule& m) { return fp_creator.Fingerprint(m); diff --git a/src/pybind/lillymol_pybind_reaction.cc b/src/pybind/lillymol_pybind_reaction.cc index b9f9d671..1f61b97d 100644 --- a/src/pybind/lillymol_pybind_reaction.cc +++ b/src/pybind/lillymol_pybind_reaction.cc @@ -195,8 +195,9 @@ PYBIND11_MODULE(lillymol_reaction, rxn) .def("perform_reaction", [](IWReaction& rxn, Molecule& scaffold, std::vector& sidechain)->std::optional { - // Just use default match conditions. + // Default conditions, multiple matches not allowed. Sidechain_Match_Conditions smc; + for (uint32_t i = 0; i < sidechain.size(); ++i) { if (! rxn.add_sidechain_reagent(i, sidechain[i], smc)) { std::cerr << "perform_reaction:cannot add sidechain reagent " << sidechain[i].name() << '\n'; @@ -208,6 +209,7 @@ PYBIND11_MODULE(lillymol_reaction, rxn) Substructure_Results sresults; if (rxn.substructure_search(scaffold, sresults) != 1) { std::cerr << "perform_reaction:not 1 match to scaffold " << scaffold.name() << '\n'; + rxn.remove_no_delete_all_reagents(); return std::nullopt; } @@ -224,6 +226,50 @@ PYBIND11_MODULE(lillymol_reaction, rxn) }, "React scaffold with sidechains, assuming one substructure match all round" ) + .def("perform_reaction_to_list", + [](IWReaction& rxn, Molecule& scaffold, std::vector& sidechain)->std::vector { + std::vector result; + + Sidechain_Match_Conditions smc; + // Multiple sidechain matches enumerated. + smc.set_make_new_reagent_for_each_hit(1); + + int number_reagents = 0; + for (uint32_t i = 0; i < sidechain.size(); ++i) { + if (! rxn.add_sidechain_reagent(i, sidechain[i], smc)) { + std::cerr << "perform_reaction:cannot add sidechain reagent " << sidechain[i].name() << '\n'; + rxn.remove_no_delete_all_reagents(); + return result; + } + number_reagents += rxn.sidechain(0)->number_reagents(); + } + + // Make allowances for 2 scaffold matches. Resizing is expected to be expensive. 
+ result.reserve(2 * number_reagents); + + Substructure_Results sresults; + if (rxn.substructure_search(scaffold, sresults) == 0) { + std::cerr << "perform_reaction:no match to scaffold " << scaffold.name() << '\n'; + rxn.remove_no_delete_all_reagents(); + return result; + } + + Reaction_Iterator iter; + for (iter.initialise(rxn); iter.active(); iter++) { + Molecule product; + if (! rxn.perform_reaction(&scaffold, sresults, iter, product)) { + std::cerr << "Reaction involving " << scaffold.name() << " failed, returning partial result\n"; + rxn.remove_no_delete_all_reagents(); + return result; + } + result.push_back(product); + } + + rxn.remove_no_delete_all_reagents(); + return result; + }, + "For each scaffold embedding, generate list of products" + ) ; diff --git a/src/pybind/lillymol_test.py b/src/pybind/lillymol_test.py index f215899a..129e1797 100644 --- a/src/pybind/lillymol_test.py +++ b/src/pybind/lillymol_test.py @@ -161,6 +161,15 @@ def test_isotopes(self): self.assertEqual(m.first_atom_with_isotope(1), 1) self.assertEqual(m.first_atom_with_isotope(7), -1) + def test_set_isotopes_numpy(self): + m = Molecule() + self.assertTrue(m.build_from_smiles("CCCCCCCC")); + iso = np.zeros(m.natoms()); + for i in range(m.natoms()): + iso[i] = i + + m.set_isotopes(iso); + self.assertEqual(m.smiles(), "C[1CH2][2CH2][3CH2][4CH2][5CH2][6CH2][7CH3]") def test_fragment_related(self): m = Molecule() @@ -303,6 +312,33 @@ def test_remove_atom(self): m.add_bond(1, 2, BondType.SINGLE_BOND) self.assertEqual(m.smiles(), "CNC") + def test_remove_atoms_set_of_atoms(self): + m = Molecule() + self.assertTrue(m.build_from_smiles("CCCCCCCC")) + for i in range(m.natoms()): + m.set_isotope(i, i + 1) + + to_remove = Set_of_Atoms() + to_remove += 4 + to_remove += 6 + to_remove += 3 + self.assertEqual(m.remove_atoms(to_remove), 3) + self.assertEqual(m.smiles(), "[1CH3][2CH2][3CH3].[6CH4].[8CH4]") + + def test_remove_atoms_numpy(self): + m = Molecule() + 
self.assertTrue(m.build_from_smiles("CCCCCCCC")) + for i in range(m.natoms()): + m.set_isotope(i, i + 1) + + to_remove = np.zeros(m.natoms(), dtype=int); + to_remove[3] = 1 + to_remove[4] = 1 + to_remove[6] = 1 + self.assertEqual(m.remove_atoms(to_remove, 1), 3); + self.assertEqual(m.smiles(), "[1CH3][2CH2][3CH3].[6CH4].[8CH4]") + + def test_set_bond_type_between_atoms(self): m = Molecule() self.assertTrue(m.build_from_smiles("CC")) @@ -389,6 +425,13 @@ def test_distance_matrix(self): self.assertEqual(m.bonds_between(0, 1), 1) self.assertEqual(m.bonds_between(0, 2), 2) + self.assertIsNone(m.atoms_on_shortest_path(0, 1)); + # logging.info("atoms_between %s", m.atoms_on_shortest_path(0, 2)) + self.assertCountEqual(m.atoms_on_shortest_path(0, 2), [1]) + # logging.info("atoms_on_shortest_path - 3 %s", m.atoms_on_shortest_path(0, 3)) + self.assertIsNone(m. atoms_on_shortest_path(0, 3)) + self.assertEqual(m.bonds_between(0, 1), 1) + self.assertTrue(m.build_from_smiles("N(CC1=CC=C(OCCCC2=CC=CC=C2)C=C1)(CC1=CC=C(OCCCC2=CC=CC=C2)C=C1)CCCCN CHEMBL349114")) self.assertEqual(m.fragment_membership(30), m.fragment_membership(39)) self.assertEqual(m.bonds_between(30, 39), 18) @@ -396,6 +439,17 @@ def test_distance_matrix(self): self.assertEqual(m.most_distant_pair(), (13, 30)) self.assertEqual(m.bonds_between(13, 30), 26) + + self.assertTrue(m.build_from_smiles("CCC")) + # logging.info("Atoms between 0 and 2 %s", m.atoms_on_shortest_path(0, 2)) + self.assertCountEqual(m.atoms_on_shortest_path(0, 2), [1]) + self.assertEqual(m.bonds_between(0, 2), 2) + + self.assertTrue(m.build_from_smiles("CC1CC(C)1")) + self.assertCountEqual(m.atoms_on_shortest_path(0, 4), [1, 3]) + return + self.assertEqual(m.bonds_between(0, 4), 3) + def test_atom_numbers(self): m = Molecule() self.assertTrue(m.build_from_smiles("C[CH2:1][CH2:2]C")) @@ -738,23 +792,27 @@ def test_scaffold(self): self.assertTrue(m.build_from_smiles("O1N=C(C(=O)N2CCCC2)C=C1COC1=CC=C2N=CC=CC2=C1 CHEMBL1589003")) m.to_scaffold() - 
self.assertEqual(m.smiles(), "O1N=C(CN2CCCC2)C=C1COC1=CC=C2N=CC=CC2=C1") + self.assertEqual(m.smiles(), "O1N=C(C(=O)N2CCCC2)C=C1COC1=CC=C2N=CC=CC2=C1") self.assertTrue(m.build_from_smiles("O=C(N(C1=CC=C(C)C=C1)CC(=O)NCCOC)CCC(=O)NC1=CC=CC=N1 CHEMBL1576099")) m.to_scaffold() - self.assertEqual(m.smiles(), "C(NC1=CC=CC=C1)CCCNC1=CC=CC=N1") + self.assertEqual(m.smiles(), "O=C(NC1=CC=CC=C1)CCC(=O)NC1=CC=CC=N1") self.assertTrue(m.build_from_smiles("O=C1N(C(=O)C2=C1C(=CC=C2)N(=O)=O)CC(=O)N1CC2=CC=CC=C2CC1 CHEMBL2134451")) m.to_scaffold() - self.assertEqual(m.smiles(), "C1N(CC2=C1C=CC=C2)CCN1CC2=CC=CC=C2CC1") + self.assertEqual(m.smiles(), "O=C1N(C(=O)C2=C1C=CC=C2)CC(=O)N1CC2=CC=CC=C2CC1") self.assertTrue(m.build_from_smiles("O=C(C1=CC=CN1CC(=O)NCC1N(CCC1)CC)C1=CC=CC=C1C CHEMBL1404612")) m.to_scaffold() - self.assertEqual(m.smiles(), "C(C1=CC=CN1CCNCC1NCCC1)C1=CC=CC=C1") + self.assertEqual(m.smiles(), "O=C(C1=CC=CN1CC(=O)NCC1NCCC1)C1=CC=CC=C1") self.assertTrue(m.build_from_smiles("O=C(N1[C@H](C(=O)NC2C3=CC=CC=C3CCC2)CCC1)[C@@H](NC(=O)[C@H](C)NC)CC(=O)O CHEMBL1570483")) m.to_scaffold() - self.assertEqual(m.smiles(), "N1[C@H](CNC2C3=CC=CC=C3CCC2)CCC1") + self.assertEqual(m.smiles(), "N1[C@H](C(=O)NC2C3=CC=CC=C3CCC2)CCC1") + + self.assertTrue(m.build_from_smiles("CC(C)(N)C1=NC(=CC(=O)N1)C(F)F CHEMBL3551873")) + m.to_scaffold() + self.assertEqual(m.smiles(), "C1=NC=CC(=O)N1") def test_coords(self): m = Molecule() @@ -941,5 +999,14 @@ def test_rotbond(self): self.assertTrue(m.build_from_smiles("C1CC1C")) self.assertEqual(rotbond_calc.rotatable_bonds(m), 0) + def test_down_the_bond(self): + m = Molecule(); + self.assertTrue(m.build_from_smiles("CCC")) + self.assertCountEqual(m.down_the_bond(0, 1), [2]) + self.assertCountEqual(m.down_the_bond(2, 1), [0]) + self.assertTrue(m.build_from_smiles("C1CC1C")) + self.assertCountEqual(m.down_the_bond(2, 3), []) + self.assertIsNone(m.down_the_bond(0, 1)) + if __name__ == '__main__': absltest.main() diff --git 
a/src/run_python_unit_tests.sh b/src/run_python_unit_tests.sh index bbf0a27a..70460140 100755 --- a/src/run_python_unit_tests.sh +++ b/src/run_python_unit_tests.sh @@ -8,6 +8,12 @@ if [[ ! -v PYTHONPATH ]] ; then export PYTHONPATH=${here} fi +# If tmpdir is not set, multiple people running absl tests will collide. +# Unique tmpdir for each user. +if [[ ! -v TMPDIR ]] ; then + export "TMPDIR=/tmp/absl_testing_${USER}" +fi + if [[ ! -s "${here}/../lib" ]] ; then echo "No shared libraries available ${here}, python unit tests not done" exit 1 diff --git a/src/update_bazel_configs.sh b/src/update_bazel_configs.sh deleted file mode 100755 index 15db0872..00000000 --- a/src/update_bazel_configs.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# Update WORKSPACE and install.bzl for the current location. -# Must be invoked from the src directory /path/to/LillyMol/src - -if [[ ! -s 'WORKSPACE' ]] ; then - echo "Must be invoked in the directory with WORKSPACE" - exit 1 -fi - -# Only build python if requested -if [[ -v BUILD_PYTHON ]] ; then - # Use python to update WORKSPACE for python locations. - if [[ -s 'update_python_in_workspace.py' ]] ; then - cp WORKSPACE /tmp - python3 ./update_python_in_workspace.py /tmp/WORKSPACE > WORKSPACE - if [[ ! -s WORKSPACE ]] ; then - echo "Updating WORKSPACE failed, restoring orignal, python bindings will not work" - cp -f /tmp/WORKSPACE WORKSPACE - fi - echo "WORKSPACE updated" - else - echo "Missing update_python_in_workspace.py, WORKSPACE not updated for python" - fi -fi - -# install.bzl does need to be updated. -echo 'Updating build_deps/install.bzl' -if [[ ! 
-s 'build_deps/install.bzl' ]] ; then - echo "build_deps/install.bzl not found" - exit 1 -fi - -bindir=$(echo ${PWD}/../bin/$(uname) | sed -e 's/\//\\\//g') - -# Make a copy -tmpinstall='/tmp/install.bzl' -cp build_deps/install.bzl /tmp/install.bzl.orig - -sed --in-place --regexp-extended -e "s/default = *\"..+\",/default =\"${bindir}\",/" build_deps/install.bzl > ${tmpinstall} - -# Create bindir if not already present -bindir=$(echo ${PWD}/../bin/$(uname)) -if [[ ! -d ${bindir} ]] ; then - mkdir -p ${bindir} -fi diff --git a/src/xgboost/BUILD b/src/xgboost/BUILD new file mode 100644 index 00000000..724898ca --- /dev/null +++ b/src/xgboost/BUILD @@ -0,0 +1,82 @@ +load("@rules_proto//proto:defs.bzl", "proto_library") +load("@com_google_protobuf//:protobuf.bzl", "py_proto_library") +load("//build_deps:install.bzl", "local_install") + +local_install( + name = "install", + srcs = [ + ":xgboost_model_evaluate", + ] +) + +proto_library( + name = "xgboost_model_proto", + srcs = [ + "xgboost_model.proto", + ], + tags = [ + "xgboost", + ], +) + +cc_proto_library( + name = "xgboost_model_cc_proto", + deps = [ + ":xgboost_model_proto", + ], + tags = [ + "xgboost", + ], + visibility = [ + "//visibility:public", + ], +) + +py_proto_library( + name = "xgboost_model_py_proto", + srcs = [ + "xgboost_model.proto", + ], + tags = [ + "xgboost", + ], +) + +proto_library( + name = "random_forest_model_proto", + srcs = [ + "random_forest_model.proto", + ], +) + +cc_proto_library( + name = "random_forest_model_cc_proto", + deps = [ + "random_forest_model_proto", + ] +) + +py_proto_library( + name = "random_forest_model_py_proto", + srcs = [ + "random_forest_model.proto", + ], +) + +cc_binary( + name = "xgboost_model_evaluate", + srcs = [ + "xgboost_model_evaluate.cc", + ], + deps = [ + ":xgboost_model_cc_proto", + "//Foundational/cmdline_v2:cmdline_v2", + "//Foundational/data_source:iwstring_data_source", + "//Foundational/iwmisc", + "@xgboost//:xgboost", + "@dlmc//:dlmc", + ], + tags = 
[ + "xgboost", + ], +) diff --git a/src/Utilities/General/random_forest_model.proto b/src/xgboost/random_forest_model.proto similarity index 100% rename from src/Utilities/General/random_forest_model.proto rename to src/xgboost/random_forest_model.proto diff --git a/src/Utilities/General/xgboost_model.proto b/src/xgboost/xgboost_model.proto similarity index 69% rename from src/Utilities/General/xgboost_model.proto rename to src/xgboost/xgboost_model.proto index 2a42dc5e..3c27f29b 100644 --- a/src/Utilities/General/xgboost_model.proto +++ b/src/xgboost/xgboost_model.proto @@ -2,6 +2,14 @@ syntax = "proto3"; package xgboost_model; +enum TreeMethod { + UNDEFINED = 0; + AUTO = 1; + EXACT = 2; + APPROX = 3; + HIST = 4; +} + message XGBoostParameters { optional float eta = 1; @@ -10,6 +18,16 @@ message XGBoostParameters { optional uint32 n_estimators = 3; optional uint32 min_samples_split = 4; + + optional float subsample = 5; + + optional float colsample_bytree = 6; + optional float colsample_bylevel = 7; + optional float colsample_bynode = 8; + + optional TreeMethod tree_method = 9; + + optional float scale_pos_weight = 10; } message LightGbmParameters { diff --git a/src/Utilities/General/xgboost_model_evaluate.cc b/src/xgboost/xgboost_model_evaluate.cc similarity index 99% rename from src/Utilities/General/xgboost_model_evaluate.cc rename to src/xgboost/xgboost_model_evaluate.cc index fe749ee2..722249b7 100644 --- a/src/Utilities/General/xgboost_model_evaluate.cc +++ b/src/xgboost/xgboost_model_evaluate.cc @@ -16,7 +16,7 @@ #include "xgboost/json_io.h" #include "xgboost/c_api.h" -#include "Utilities/General/xgboost_model.pb.h" +#include "xgboost/xgboost_model.pb.h" namespace lillymol_xgboost { diff --git a/test/buildsmidb_bdb/case_1/run_case.sh b/test/buildsmidb_bdb/case_1/run_case.sh index 61ece4ad..5945334b 100755 --- a/test/buildsmidb_bdb/case_1/run_case.sh +++ b/test/buildsmidb_bdb/case_1/run_case.sh @@ -28,7 +28,9 @@ fi dbname="/tmp/buildsmidb$$.bdb" 
-cmd="${build} -d ${dbname} -c -l ${insmi}" +stderr='stderr' + +cmd="${build} -d ${dbname} -c -l ${insmi}" 2> "${stderr}" ${cmd} if [[ $? -ne 0 ]] ; then @@ -43,7 +45,7 @@ fi found="/tmp/found$$" not_in_db="/tmp/notfound$$" -cmd="${lookup} -d ${dbname} -c -l -F ${found} -U ${not_in_db} ${insmi}" +cmd="${lookup} -d ${dbname} -c -l -F ${found} -U ${not_in_db} ${insmi}" 2> "${stderr}" ${cmd} if [[ $? -ne 0 ]] ; then @@ -64,4 +66,5 @@ fi unlink ${dbname} unlink "${found}.smi" unlink "${not_in_db}.smi" +unlink "${stderr}" exit 0 diff --git a/test/retrosynthesis/case_1/out/log.txt b/test/retrosynthesis/case_1/out/log.txt deleted file mode 100644 index 64c9300f..00000000 --- a/test/retrosynthesis/case_1/out/log.txt +++ /dev/null @@ -1,60 +0,0 @@ -[N+]([O-])(=O)[1C](C)(C)[1CH2][1N](C)CCO.O US08278482B2_0331 PARENT -O.OCC[1NH]C.[O-][N+](=O)[1CH](C)C.[1CH2]=O US08278482B2_0331 via US08278482B2_0331 CentroidRxnSmi_1 R 1 ALL -OCC[1NH]C.[O-][N+](=O)[1CH](C)C.[1CH2]=O US08278482B2_0331 via US08278482B2_0331 CentroidRxnSmi_1 R 1 SPFRM.1 -O=[N+]([O-])[1CH](C)C US08278482B2_0331 via US08278482B2_0331 CentroidRxnSmi_1 R 1 -O=[1CH2] US08278482B2_0331 via US08278482B2_0331 CentroidRxnSmi_1 R 1 -OCC[1NH]C US08278482B2_0331 via US08278482B2_0331 CentroidRxnSmi_1 R 1 -N1C2=C(C=CC=N2)[1C](=C1)[1CH2][1N](C)C.O US05976497_NA PARENT -O.O=[1CH2].C[1NH]C.[nH]1c[1cH]c2c1[n]ccc2 US05976497_NA via US04073911_NA CentroidRxnSmi_1 R 1 ALL -O=[1CH2].C[1NH]C.[nH]1c[1cH]c2c1[n]ccc2 US05976497_NA via US04073911_NA CentroidRxnSmi_1 R 1 SPFRM.1 -[nH]1c[1cH]c2c1[n]ccc2 US05976497_NA via US04073911_NA CentroidRxnSmi_1 R 1 -O=[1CH2] US05976497_NA via US04073911_NA CentroidRxnSmi_1 R 1 -C[1NH]C US05976497_NA via US04073911_NA CentroidRxnSmi_1 R 1 -[1CH2]([1N]1CCOCC1)[1C]#CCO.O US20020010182A1_0404 PARENT -O.OCC#[1CH].O=[1CH2].O1CC[1NH]CC1 US20020010182A1_0404 via US04012514_NA CentroidRxnSmi_1 R 1 ALL -OCC#[1CH].O=[1CH2].O1CC[1NH]CC1 US20020010182A1_0404 via US04012514_NA CentroidRxnSmi_1 R 1 SPFRM.1 
-O=[1CH2] US20020010182A1_0404 via US04012514_NA CentroidRxnSmi_1 R 1 -O1CC[1NH]CC1 US20020010182A1_0404 via US04012514_NA CentroidRxnSmi_1 R 1 -OCC#[1CH] US20020010182A1_0404 via US04012514_NA CentroidRxnSmi_1 R 1 -S1C=CC=C1C[1NH][1CH2][1C](C)(C)C(=O)C.O US20100190824A1_0154 PARENT -O.[1NH2]Cc1sccc1.[1CH2]=O.O=C([1CH](C)C)C US20100190824A1_0154 via US20100190824A1_0154 CentroidRxnSmi_1 R 1 ALL -[1NH2]Cc1sccc1.[1CH2]=O.O=C([1CH](C)C)C US20100190824A1_0154 via US20100190824A1_0154 CentroidRxnSmi_1 R 1 SPFRM.1 -[1NH2]Cc1sccc1 US20100190824A1_0154 via US20100190824A1_0154 CentroidRxnSmi_1 R 1 -O=[1CH2] US20100190824A1_0154 via US20100190824A1_0154 CentroidRxnSmi_1 R 1 -O=C([1CH](C)C)C US20100190824A1_0154 via US20100190824A1_0154 CentroidRxnSmi_1 R 1 -C[1N](C)[1CH2][1C]1=CNC2=C1C=CC(=C2)[N+]([O-])=O.O US20090227797A1_0469 PARENT -O.[O-][N+](=O)c1cc2[nH]c[1cH]c2cc1.[1CH2]=O.C[1NH]C US20090227797A1_0469 via US04073911_NA CentroidRxnSmi_1 R 1 ALL -[O-][N+](=O)c1cc2[nH]c[1cH]c2cc1.[1CH2]=O.C[1NH]C US20090227797A1_0469 via US04073911_NA CentroidRxnSmi_1 R 1 SPFRM.1 -C[1NH]C US20090227797A1_0469 via US04073911_NA CentroidRxnSmi_1 R 1 -O=[1CH2] US20090227797A1_0469 via US04073911_NA CentroidRxnSmi_1 R 1 -O=[N+]([O-])c1cc2[nH]c[1cH]c2cc1 US20090227797A1_0469 via US04073911_NA CentroidRxnSmi_1 R 1 -[1CH2]([1C](C)(C)C=O)[1N](C)C.O US20150368509A1_0289 PARENT -O.[1CH2]=O.O=C[1CH](C)C.C[1NH]C US20150368509A1_0289 via US20130261270A1_0288 CentroidRxnSmi_1 R 1 ALL -[1CH2]=O.O=C[1CH](C)C.C[1NH]C US20150368509A1_0289 via US20130261270A1_0288 CentroidRxnSmi_1 R 1 SPFRM.1 -O=[1CH2] US20150368509A1_0289 via US20130261270A1_0288 CentroidRxnSmi_1 R 1 -O=C[1CH](C)C US20150368509A1_0289 via US20130261270A1_0288 CentroidRxnSmi_1 R 1 -C[1NH]C US20150368509A1_0289 via US20130261270A1_0288 CentroidRxnSmi_1 R 1 -COC1=CC=CC=C1C1=CNC2=NC=C(C3=CC(O)=[1C](C=C3)[1CH2][1N]3CCOCC3)C=C12.O US20060030583A1_0277 PARENT -O.Oc1[1cH]ccc(c2c[n]c3[nH]cc(c3c2)c2c(OC)cccc2)c1.O=[1CH2].O1CC[1NH]CC1 
US20060030583A1_0277 via US03992389_NA CentroidRxnSmi_1 R 1 ALL -Oc1[1cH]ccc(c2c[n]c3[nH]cc(c3c2)c2c(OC)cccc2)c1.O=[1CH2].O1CC[1NH]CC1 US20060030583A1_0277 via US03992389_NA CentroidRxnSmi_1 R 1 SPFRM.1 -Oc1[1cH]ccc(c2c[n]c3[nH]cc(c3c2)c2c(OC)cccc2)c1 US20060030583A1_0277 via US03992389_NA CentroidRxnSmi_1 R 1 -O=[1CH2] US20060030583A1_0277 via US03992389_NA CentroidRxnSmi_1 R 1 -O1CC[1NH]CC1 US20060030583A1_0277 via US03992389_NA CentroidRxnSmi_1 R 1 -N1C2=NC=CC=C2[1C](=C1)[1CH2][1N](C)C.O US20090076046A1_0987 PARENT -O.O=[1CH2].C[1NH]C.[nH]1c[1cH]c2c1[n]ccc2 US20090076046A1_0987 via US04073911_NA CentroidRxnSmi_1 R 1 ALL -O=[1CH2].C[1NH]C.[nH]1c[1cH]c2c1[n]ccc2 US20090076046A1_0987 via US04073911_NA CentroidRxnSmi_1 R 1 SPFRM.1 -[nH]1c[1cH]c2c1[n]ccc2 US20090076046A1_0987 via US04073911_NA CentroidRxnSmi_1 R 1 -O=[1CH2] US20090076046A1_0987 via US04073911_NA CentroidRxnSmi_1 R 1 -C[1NH]C US20090076046A1_0987 via US04073911_NA CentroidRxnSmi_1 R 1 -C(=O)(C1=CC=CC=C1)[1CH2][1CH2][1N]1CCN(C2=CC=CC=C2)CC1.O US07893261B2_0040 PARENT -O.[1CH3]C(=O)c1ccccc1.[1CH2]=O.[1NH]1CCN(c2ccccc2)CC1 US07893261B2_0040 via US03998829_NA CentroidRxnSmi_1 R 1 ALL -[1CH3]C(=O)c1ccccc1.[1CH2]=O.[1NH]1CCN(c2ccccc2)CC1 US07893261B2_0040 via US03998829_NA CentroidRxnSmi_1 R 1 SPFRM.1 -O=C(c1ccccc1)[1CH3] US07893261B2_0040 via US03998829_NA CentroidRxnSmi_1 R 1 -O=[1CH2] US07893261B2_0040 via US03998829_NA CentroidRxnSmi_1 R 1 -[1NH]1CCN(c2ccccc2)CC1 US07893261B2_0040 via US03998829_NA CentroidRxnSmi_1 R 1 -N1C2=NC=CC=C2[1C](=C1)[1CH2][1N](C)C.O US20150265586A1_0985 PARENT -O.O=[1CH2].C[1NH]C.[nH]1c[1cH]c2c1[n]ccc2 US20150265586A1_0985 via US04073911_NA CentroidRxnSmi_1 R 1 ALL -O=[1CH2].C[1NH]C.[nH]1c[1cH]c2c1[n]ccc2 US20150265586A1_0985 via US04073911_NA CentroidRxnSmi_1 R 1 SPFRM.1 -[nH]1c[1cH]c2c1[n]ccc2 US20150265586A1_0985 via US04073911_NA CentroidRxnSmi_1 R 1 -O=[1CH2] US20150265586A1_0985 via US04073911_NA CentroidRxnSmi_1 R 1 -C[1NH]C US20150265586A1_0985 via US04073911_NA 
CentroidRxnSmi_1 R 1