diff --git a/src/eem.c b/src/eem.c index c4f888a..5ab6d15 100644 --- a/src/eem.c +++ b/src/eem.c @@ -199,7 +199,11 @@ void calculate_charges(struct subset * const ss, struct kappa_data * const kd) { starts[0] = 0; for(int i = 1; i < ts.molecules_count; i++) starts[i] = starts[i - 1] + ts.molecules[i - 1].atoms_count; - int nt = s.max_threads/s.de_threads; + int nt = s.max_threads; + if (s.params_method == PARAMS_DE) + nt /= s.de_threads; + if (s.params_method == PARAMS_GM) + nt /= s.gm_threads; int nthreads = ts.molecules_count < nt ? ts.molecules_count : nt; #pragma omp parallel for num_threads(nthreads) for(int i = 0; i < ts.molecules_count; i++) { diff --git a/src/guidedmin.c b/src/guidedmin.c new file mode 100644 index 0000000..66b624a --- /dev/null +++ b/src/guidedmin.c @@ -0,0 +1,265 @@ +/* + * NEEMP - guidedmin.c + * + * by Jana Pazurikova (pazurikova@ics.muni.cz) + * 2016 + * + * */ + +#include +#include +#include +#include +#include "omp.h" +#include "eem.h" +#include "kappa.h" +#include "neemp.h" +#include "parameters.h" +#include "settings.h" +#include "subset.h" +#include "statistics.h" +#include "structures.h" +#include "../externals/lhs/latin_random.h" +#include "guidedmin.h" + +extern const struct training_set ts; +extern const struct settings s; +extern const float ionenergies[]; +extern const float affinities[]; + + +/* Run guided minimization algorithm to find the best set of parameters for calculation of partial charges. */ +void run_guided_min(struct subset * const ss) { + + /* Create a set of random points in vector space of kappa_data */ + if (s.verbosity >= VERBOSE_KAPPA) + printf("GM Generating %d vectors\n", s.gm_size); + + /* Set bounds for each parameters in kappa_data */ + float *bounds = (float*) malloc((ts.atom_types_count*2+1)*2*sizeof(float)); + //compute bounds, 0 means set them to fixed numbers taken from Tomas's full scan, 1 means try to find them with broad search + compute_parameters_bounds(bounds, 0); + //generate population + fill_ss(ss, s.gm_size); + generate_random_population(ss, bounds, s.gm_size); + de_ss = ss; + //evaluate the fitness function for all points + if (s.verbosity >= VERBOSE_KAPPA) + printf("GM Calculating charges and evaluating fitness function for whole set\n"); + int i = 0; +#pragma omp parallel for num_threads(s.gm_threads) default(shared) private(i) + for (i = 0; i < ss->kappa_data_count; i++) { + calculate_charges(ss, &ss->data[i]); + calculate_statistics(ss, &ss->data[i]); + } + + //minimize part of population that has R>0.3 + int minimized_initial = 0; + if (s.verbosity >= VERBOSE_KAPPA) + printf("GM minimizing population\n"); + minimized_initial = minimize_part_of_gm_set(ss, s.gm_iterations_beg); + + //if we minimized zero data, inform user and suggest larger set + if (minimized_initial == 0) { + EXIT_ERROR(RUN_ERROR, "No vector in set was worth minimizing. Please choose larger set than %d (option --gm-size)\n.", s.gm_size); + } + + //find the best kappa_data + set_the_best(ss); + + struct kappa_data *so_far_best = (struct kappa_data*)malloc(sizeof(struct kappa_data)); + kd_init(so_far_best); + + //copy ss->best into so_far_best + kd_copy_parameters(ss->best, so_far_best); + calculate_charges(ss, so_far_best); + calculate_statistics(ss, so_far_best); + + /* Minimize the result */ + minimize_locally(so_far_best, s.gm_iterations_end); + + /* Tidying up and printing */ + free(bounds); + kd_copy_parameters(so_far_best, ss->best); + calculate_charges(ss, ss->best); + calculate_statistics(ss, ss->best); + kd_destroy(so_far_best); + free(so_far_best); +} + +/* Generate random population by Latin HyperCube Sampling */ +/*void generate_random_population(struct subset* ss, float *bounds, int size) { + // Get random numbers by Latin Hypercube Sampling + int dimensions_count = ts.atom_types_count*2+1; + int points_count = size; + int seed = rand(); + double* random_lhs = latin_random_new(dimensions_count, points_count, &seed); + + // Redistribute random_lhs[dim_num, point_num] to ss->data + + //every row contains values of one dimension for all population members + //start with alpha and beta for all atom types + for (int i = 0; i < points_count; i++) { + for (int j = 0; j < ts.atom_types_count; j++) { + ss->data[i].parameters_alpha[j] = interpolate_to_different_bounds((float)random_lhs[j*points_count + i], bounds[2+j*4], bounds[2+j*4+1]); + ss->data[i].parameters_beta[j] = interpolate_to_different_bounds((float)random_lhs[(j+1)*points_count + i], bounds[2+j*4+2], bounds[2+j*4+3]); + } + } + + //the last row of random_lhs is kappa for all population members + for (int i = 0; i < points_count; i++) { + if (s.fixed_kappa < 0) + ss->data[i].kappa = interpolate_to_different_bounds((float)random_lhs[(dimensions_count-2)*points_count + i], bounds[0], bounds[1]); + else + ss->data[i].kappa = s.fixed_kappa; + } + free(random_lhs); + +} */ + +/* Run local minimization on part of population */ +int minimize_part_of_gm_set(struct subset* ss, int min_iterations) { + int quite_good = 0; + int i = 0; + //we minimize all with R2>0.2 && R>0 +#pragma omp parallel for num_threads(s.gm_threads) shared(ss, quite_good) private(i) + for (i = 0; i < ss->kappa_data_count; i++) { + if (ss->data[i].full_stats.R2 > 0.2 && ss->data[i].full_stats.R > 0) { +#pragma omp critical + { + quite_good++; + } + struct kappa_data* m = (struct kappa_data*) malloc (sizeof(struct kappa_data)); + kd_init(m); + m->parent_subset = ss; + kd_copy_parameters(&ss->data[i], m); + minimize_locally(m, min_iterations); + kd_copy_parameters(m, &ss->data[i]); + kd_destroy(m); + free(m); + + } + } + if (s.verbosity >= VERBOSE_KAPPA) { + printf("Out of %d in population, we minimized %d\n", ss->kappa_data_count, quite_good); + } + return quite_good; +} + +/* Run local minimization with NEWUOA algorithm */ +/*void minimize_locally(struct kappa_data* t, int max_calls) { + int n = 2*ts.atom_types_count + 1; //number of variables + int npt = 2*n + 1; //number of interpolation conditions + double* x = (double*) malloc(n*sizeof(double)); + kappa_data_to_double_array(t, x); + double rhobeg = 0.2; + double rhoend = 0.0001; + int iprint = 0; + int maxfun = max_calls; + double* w = (double*) malloc(((npt+13)*(npt+n) + 3*n*(n+3)/2)*sizeof(double)); + //call fortran code NEWUOA for local minimization + newuoa_(&n, &npt, x, &rhobeg, &rhoend, &iprint, &maxfun, w); + double_array_to_kappa_data(x, t); + free(w); + free(x); +} */ + +/* Used by NEWUOA algorithm. Evaluates the vector in the local minimization: converts it to kappa_data, computes charges, computes statistics and return the fitness score that should be minimized */ +/*extern void calfun_(int n, double*x, double* f) { + struct kappa_data* t = (struct kappa_data*) malloc (sizeof(struct kappa_data)); + kd_init(t); + double_array_to_kappa_data(x, t); + calculate_charges(de_ss, t); + calculate_statistics_by_sort_mode(t); + float result = kd_sort_by_return_value(t); + switch (s.sort_by) { + case SORT_R: + case SORT_R2: + case SORT_RW: + case SORT_SPEARMAN: + *f = 1 - (double)(result) + n - n; //+n-n just to get rid of compilaiton warning + break; + default: + *f = (double) (result) + n -n; + } + kd_destroy(t); + free(t); +} */ + +/* Convert kappa_data into an array of doubles, used in local minimization */ +/*void kappa_data_to_double_array(struct kappa_data* t, double* x) { + x[0] = t->kappa; + for (int i = 0; i < ts.atom_types_count; i++) { + x[i*2 + 1] = t->parameters_alpha[i]; + x[i*2 + 2] = t->parameters_beta[i]; + } +} */ + +/* Convert array of doubles into kappa_data, used in local minimization */ +/*void double_array_to_kappa_data(double* x, struct kappa_data* t) { + t->kappa = (float)x[0]; + for (int i = 0; i < ts.atom_types_count; i++) { + t->parameters_alpha[i] = (float)x[i*2 + 1]; + t->parameters_beta[i] = (float)x[i*2 + 2]; + } +} */ + +/* Compute bounds for each parameter of each atom type */ +/*void compute_parameters_bounds(float* bounds, int by_atom_type) { + //returns bounds[kappa_low, kappa_high, alpha_1_low, alpha_1_high, beta_1_low, beta_1_high, alpha_2_low, ...] + float toH = 0.036749309; + bounds[0] = 0.0005; //kappa_low + bounds[1] = 3.5; //kappa_high + for (int j = 0; j < ts.atom_types_count; j++) { + //set bounds to values corresponding with their affinity and ionenergies + if (by_atom_type == 1) { + //set bounds for particular atom type + int atom_number = ts.atom_types[j].Z; + bounds[2 + j*4] = (((float)(ionenergies[atom_number]) + (float)(affinities[atom_number])*5)/2)*toH; //alpha_low + bounds[2 + j*4 + 1] = (float) (bounds[2 + j*4] + 0.1); //alpha_high + bounds[2 + j*4] -= 0.1; + //bounds[2 + j*4] *= toEV; + //bounds[2 + j*4 + 1] *= toEV; + bounds[2 + j*4 + 2] = ((float)(ionenergies[atom_number]) - (float)(affinities[atom_number])*5)/2*toH; //beta_low + bounds[2 + j*4 + 3] = (float) (bounds[2 + j*4 + 2] + 0.1); //beta_high + bounds[2 + j*4 + 2] -= 0.1; + //bounds[2 + j*4 + 2] *= toEV; + //bounds[2 + j*4 + 3] *= toEV; + } + //set bounds to constant values taken from previous experience + else if (by_atom_type == 0) { + bounds[2 + j*4] = 1.8; + bounds[2 + j*4 + 1] = 3.2; + bounds[2 + j*4 + 2] = 0; + bounds[2 + j*4 + 3] = 1.0; + } + } + if (s.verbosity >= VERBOSE_KAPPA) { + printf("DE Bounds set to:\n"); + for (int i = 0; i < ts.atom_types_count; i++) { + char buff[10]; + at_format_text(&ts.atom_types[i], buff); + printf("%s %5.3f - %5.3f, %5.3f - %5.3f\n", buff, bounds[2+i*4], bounds[2+i*4+1], bounds[2+i*4+2], bounds[2+i*4+3]); + } + } +} */ + +/* Get random float within the bounds */ +/*float get_random_float(float low, float high) { + //better random number generator would be nice, but this is sufficient + float n = low + (float)(rand())/((float)(RAND_MAX/(high-low))); + return n; +} */ + +/* Interpolate the number from [0,1] to [low, high] */ +/*float interpolate_to_different_bounds(float x, float low, float high) { + return low + x*(high-low); +} */ + +/* Compute the sum of the vector */ +/*int sum(int* vector, int size) { + int sum = 0; + for (int i = 0; i < size; i++) + sum+=vector[i]; + return sum; +} */ diff --git a/src/guidedmin.h b/src/guidedmin.h new file mode 100644 index 0000000..211859a --- /dev/null +++ b/src/guidedmin.h @@ -0,0 +1,29 @@ +/* + * NEEMP - guidedmin.h + * + * by Jana Pazurikova (pazurikova@ics.muni.cz) + * 2016 + * + * */ + +#ifndef __GUIDEDMIN_H__ +#define __GUIDEDMIN_H__ + +#include "subset.h" +#include "diffevolution.h" + +void run_guided_min(struct subset * const ss); +//void generate_random_population(struct subset* ss, float *bounds, int size); +int minimize_part_of_gm_set(struct subset* ss, int min_iterations); +//void compute_parameters_bounds(float* bounds, int by_atom_type); +//float get_random_float(float low, float high); +//float interpolate_to_different_bounds(float x, float low, float high); +//int sum(int* vector, int size); +//void minimize_locally(struct kappa_data* trial, int max_calls); +//extern void newuoa_(int* n, int* npt, double* x, double* rhobeg, double* rhoend, int* iprint, int* maxfun, double* w); +//extern void calfun_(int n, double*x, double* f); +//void kappa_data_to_double_array(struct kappa_data* trial, double* x); +//void double_array_to_kappa_data(double* x, struct kappa_data* trial); + +struct subset * de_ss; +#endif /* __GUIDEDMIN_H__ */ diff --git a/src/kappa.c b/src/kappa.c index b05189b..ccc3814 100644 --- a/src/kappa.c +++ b/src/kappa.c @@ -20,6 +20,7 @@ #include "statistics.h" #include "structures.h" #include "diffevolution.h" +#include "guidedmin.h" extern const struct training_set ts; extern const struct settings s; @@ -229,6 +230,11 @@ void find_the_best_parameters_for_subset(struct subset * const ss) { //runs a differential evolution algorithm to find the best parameters, ss->best is set after the call run_diff_evolution(ss); } + if (s.params_method == PARAMS_GM) { + //runs a guided minimization algorithm, ss->best is set after the call + run_guided_min(ss); + } + /* Determine the best parameters for computed data */ if(s.params_method == PARAMS_LR_FULL) { @@ -238,7 +244,7 @@ void find_the_best_parameters_for_subset(struct subset * const ss) { /* If Brent is used, the maximum is stored in the last item */ ss->best = &ss->data[ss->kappa_data_count - 1]; } - else if (s.params_method == PARAMS_DE) { + else if (s.params_method == PARAMS_DE || s.params_method == PARAMS_GM) { //well, nothing, the best structure has been already set } diff --git a/src/settings.c b/src/settings.c index 2ca07b1..04eaadd 100644 --- a/src/settings.c +++ b/src/settings.c @@ -61,6 +61,10 @@ static struct option long_options[] = { {"de-fix-kappa",required_argument, 0, 188}, {"de-threads",required_argument, 0, 189}, {"de-polish", required_argument, 0, 190}, + {"gm-size", required_argument, 0, 191}, + {"gm-iterations-beg", required_argument, 0, 192}, + {"gm-iterations-end", required_argument, 0, 193}, + {"gm-threads", required_argument, 0, 194}, {NULL, 0, 0, 0} }; @@ -95,6 +99,10 @@ void s_init(void) { s.limit_de_iters = NO_LIMIT_ITERS; s.limit_de_time = NO_LIMIT_TIME; s.polish = 0; //0 off, 1 only result, 2 result + during evolve, 3 result, evolve and some structures in initial population + s.gm_size = 100; + s.gm_iterations_beg = 1000; + s.gm_iterations_end = 2000; + s.gm_threads = 1; s.sort_by = SORT_R2; s.at_customization = AT_CUSTOM_ELEMENT_BOND; s.discard = DISCARD_OFF; @@ -128,7 +136,7 @@ static void print_help(void) { printf(" --version display version information and exit\n"); printf(" --max-threads N use up to N threads to solve EEM system in parallel\n"); printf(" -m, --mode MODE set mode for the NEEMP. Valid choices are: info, params, charges, cross, cover (required)\n"); - printf(" -p, --params-method METHOD set optimization method used for calculation of parameters. Valid choices are: lr-full, lr-full-brent, de (optional)\n"); + printf(" -p, --params-method METHOD set optimization method used for calculation of parameters. Valid choices are: lr-full, lr-full-brent, de, gm (optional)\n"); printf(" --sdf-file FILE SDF file (required)\n"); printf(" --atom-types-by METHOD classify atoms according to the METHOD. Valid choices are: Element, ElemBond.\n"); printf(" --list-omitted-molecules list names of molecules for which we don't have charges or parameters loaded (mode dependent).\n"); @@ -150,6 +158,10 @@ static void print_help(void) { printf(" --de-dither set the mutation constant to random value from [0.5;1] for ech iteration (optional).\n"); printf(" --de-polish VALUE apply polishing on parameters. Valid choices: 0 (off), 1 (result), 2 (during evolving), 3 (at the beginning). Strongly recommend to keep the default value.\n"); printf(" --de-fix-kappa set kappa to one fixed value (optional).\n"); + printf(" --gm-size set number of randomly generated vectors of parameters, those with reasonable stats will be minimized (optional).\n"); + printf(" --gm-iterations-beg set number of minimization iterations for each reasonable vector of parameters (optional).\n"); + printf(" --gm-iterations-end set number of minimization itertions for the best to polish the final result (optional).\n"); + printf(" --gm-threads set number of threads used for parallel minimization (optional).\n"); printf("Other options:\n"); printf(" --par-out-file FILE output the parameters to the FILE\n"); printf(" -d, --discard METHOD perform discarding with METHOD. Valid choices are: iterative, simple and off. Default is off.\n"); @@ -167,8 +179,8 @@ static void print_help(void) { printf("neemp -m params --sdf-file molecules.sdf --chg-file charges.chg --kappa-max 1.0 --fs-precision 0.2 --sort-by RMSD --fs-only.\n\ Compute parameters for the given molecules in file molecules.sdf and ab-initio charges in charges.chg. Set maximum value for kappa to 1.0, step for the full scan to 0.2, no iterative refinement, sort results according to the relative mean square deviation.\n"); - printf("neemp -m params -p de --sdf-file molecules.sdf --chg-file charges.chg --sort-by RMSD_avg --de-pop-size 250 --de-iters-max 500 -vv.\n\ - Compute parameters for the given molecules in file molecules.sdf and ab-initio charges in charges.chg. The chosen optimization method: differential evolution will create population of 250 sets of parameters and evolve these in maximum of 500 iterations. The fitness function evaluating the set of parameters is average per atom RMSD.\n"); + printf("neemp -m params -p gm --sdf-file molecules.sdf --chg-file charges.chg --sort-by RMSD_avg --gm-size 250 -gm-iterations-beg 1000 -gm-iterations-end 500 --random-seed 1234 -vv.\n\ + Compute parameters for the given molecules in file molecules.sdf and ab-initio charges in charges.chg. The chosen optimization method: guided minimization will create 250 vectors (each vector consists of all parameters) and minimized reasonably good ones for 1000 iterations. The best of them will be minimized again, for 500 iterations.\n"); printf("neemp -m charges --sdf-file molecules.sdf --par-file parameters --chg-out-file output.chg\n\ Calculate and store EEM charges to the file output.chg\n"); @@ -209,6 +221,8 @@ void parse_options(int argc, char **argv) { s.params_method = PARAMS_LR_FULL_BRENT; else if (!strcmp(optarg, "de")) s.params_method = PARAMS_DE; + else if (!strcmp(optarg, "gm")) + s.params_method = PARAMS_GM; else EXIT_ERROR(ARG_ERROR, "Invalid params-method: %s\n", optarg); break; @@ -397,6 +411,19 @@ void parse_options(int argc, char **argv) { case 190: s.polish = atoi(optarg); break; + //GM settings + case 191: + s.gm_size = atoi(optarg); + break; + case 192: + s.gm_iterations_beg = atoi(optarg); + break; + case 193: + s.gm_iterations_end = atoi(optarg); + break; + case 194: + s.gm_threads = atoi(optarg); + break; case '?': EXIT_ERROR(ARG_ERROR, "%s", "Try -h/--help.\n"); default: @@ -465,6 +492,15 @@ void check_settings(void) { } + if (s.params_method == PARAMS_GM) { //all settings are optional, so check for mistakes + if (s.gm_size < 1) + EXIT_ERROR(ARG_ERROR, "%s", "Size of GM set has to be positive.\n"); + if (s.gm_iterations_beg < 1 || s.gm_iterations_end < 1) + EXIT_ERROR(ARG_ERROR, "%s", "Number of minimization iterations for GM has to be positive.\n"); + if (s.gm_threads < 1 || s.gm_threads > s.max_threads) + EXIT_ERROR(ARG_ERROR, "%s", "Number of threads for minimization must be between 1 and maximum number of threads.\n"); + } + if (s.random_seed == -1) s.random_seed = 123; @@ -685,6 +721,15 @@ void print_settings(void) { } + + if (s.params_method == PARAMS_GM) { + printf("\nGuided minimization settings:\n"); + printf("\t - set size %d\n", s.gm_size); + printf("\t - iterations for set at the beginning %d\n", s.gm_iterations_beg); + printf("\t - iterations for the result at the end %d\n", s.gm_iterations_end); + printf("\t - threads used for minimization %d\n", s.gm_threads); + + } } printf("\n"); diff --git a/src/settings.h b/src/settings.h index a4a37b3..2930d99 100644 --- a/src/settings.h +++ b/src/settings.h @@ -27,6 +27,7 @@ enum params_calc_method { PARAMS_LR_FULL, PARAMS_LR_FULL_BRENT, PARAMS_DE, + PARAMS_GM, //guided minimization PARAMS_NOT_SET }; @@ -95,6 +96,12 @@ struct settings { time_t limit_de_time; int polish; //use NEWUOA minimization to polish trial or results + //settings regarding PARAMS_GM optimization method + int gm_size; + int gm_iterations_beg; + int gm_iterations_end; + int gm_threads; + //other settings int random_seed; enum verbosity_levels verbosity;