diff --git a/EDApy.egg-info/SOURCES.txt b/EDApy.egg-info/SOURCES.txt
deleted file mode 100644
index b3f7168..0000000
--- a/EDApy.egg-info/SOURCES.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-README.md
-setup.py
-EDApy/__init__.py
-EDApy.egg-info/PKG-INFO
-EDApy.egg-info/SOURCES.txt
-EDApy.egg-info/dependency_links.txt
-EDApy.egg-info/top_level.txt
-EDApy/optimization/__init__.py
-EDApy/optimization/multivariate/EDA_multivariate.py
-EDApy/optimization/multivariate/EDA_multivariate_gaussian.py
-EDApy/optimization/multivariate/__BayesianNetwork.py
-EDApy/optimization/multivariate/__clustering.py
-EDApy/optimization/multivariate/__init__.py
-EDApy/optimization/multivariate/__matrix.py
-EDApy/optimization/univariate/__init__.py
-EDApy/optimization/univariate/continuous.py
-EDApy/optimization/univariate/discrete.py
-tests/__init__.py
\ No newline at end of file
diff --git a/EDApy.egg-info/top_level.txt b/EDApy.egg-info/top_level.txt
deleted file mode 100644
index 9f64a44..0000000
--- a/EDApy.egg-info/top_level.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-EDApy
-tests
diff --git a/EDApy.egg-info/PKG-INFO b/EDAspy.egg-info/PKG-INFO
similarity index 75%
rename from EDApy.egg-info/PKG-INFO
rename to EDAspy.egg-info/PKG-INFO
index 5d77b66..439a2f1 100644
--- a/EDApy.egg-info/PKG-INFO
+++ b/EDAspy.egg-info/PKG-INFO
@@ -1,12 +1,13 @@
 Metadata-Version: 2.1
-Name: EDApy
-Version: 0.0.1
+Name: EDAspy
+Version: 0.1.0
 Summary: This is a package where some estimation of distribution algorithms are implemented.
-Home-page: https://github.com/VicentePerezSoloviev/EDApy
+Home-page: https://github.com/VicentePerezSoloviev/EDAspy
 Author: Vicente P. Soloviev
 Author-email: vicente.perez.soloviev@gmail.com
-License: UNKNOWN
-Description: # EDApy
+License: LGPL-2.1
+Download-URL: https://github.com/VicentePerezSoloviev/EDAspy/archive/0.1.0.tar.gz
+Description: # EDAspy

    ## Description

@@ -22,7 +23,7 @@ Description: # EDApy
    #### Binary univariate EDA
    It can be used as a simple example of an EDA, or for feature selection. The cost function to optimize is the metric of the model. An example is shown.
    ```python
-    from EDApy.optimization.univariate import EDA_discrete as EDAd
+    from EDAspy.optimization.univariate import EDA_discrete as EDAd
    import pandas as pd

    def check_solution_in_model(dictionary):
@@ -51,7 +52,7 @@ Description: # EDApy
    This EDA is used when some continuous parameters must be optimized.
    ```python
-    from EDApy.optimization.univariate import EDA_continuous as EDAc
+    from EDAspy.optimization.univariate import EDA_continuous as EDAc
    import pandas as pd
    import numpy as np

@@ -89,7 +90,7 @@ Description: # EDApy
    The optimizer will find the optimum values of the non-evidence variables based on the value of the evidences. This is widely used in problems where dependencies among variables must be considered.
    ```python
-    from EDApy.optimization.multivariate import EDA_multivariate as EDAm
+    from EDAspy.optimization.multivariate import EDA_multivariate as EDAm
    import pandas as pd

    blacklist = pd.DataFrame(columns=['from', 'to'])
@@ -115,12 +116,47 @@ Description: # EDApy
    In this case, the output is the self class, which can be saved as a pickle in order to explore the attributes. One of the attributes is the optimum structure of the optimum generation, from which the structure can be plotted to observe the dependencies among the variables.
    The function to plot the structure is the following:
    ```python
-    from EDApy.optimization.multivariate import print_structure
+    from EDAspy.optimization.multivariate import print_structure
    print_structure(structure=structure, var2optimize=['param2', 'param3', 'param4'], evidences=['param1', 'param5'])
    ```

    ![Structure graph plot](/structure.PNG "Structure of the optimum generation found by the EDA")

+    #### Another continuous multivariate EDA approach
+
+    In this EDA approach, new individuals are sampled from a multivariate normal distribution. Evidences are not allowed in this optimizer; if evidences are needed, the previous approach should be used.
+    The EDA is initialized, as in the univariate continuous EDA, with univariate mus and sigmas for the variables. During the execution, a multivariate Gaussian is built and sampled from. As it is multivariate, the correlation among the variables is considered.
+
+    ```python
+    import pandas as pd
+    from EDAspy.optimization.multivariate import EDA_multivariate_gaussian as EDAmg
+
+
+    def cost_function(dictionary):
+        suma = dictionary['param1'] + dictionary['param2']
+        if suma < 0:
+            return 999999999
+        return suma
+
+    mus = pd.DataFrame(columns=['param1', 'param2'])
+    mus.loc[0] = [10, 8]
+
+    sigma = pd.DataFrame(columns=['param1', 'param2'])
+    sigma.loc[0] = 5
+
+    EDAmulti = EDAmg(SIZE_GEN=40, MAX_ITER=1000, DEAD_ITER=50, ALPHA=0.6, aim='minimize',
+                     cost_function=cost_function, mus=mus, sigma=sigma)
+
+    bestcost, params, history = EDAmulti.run(output=True)
+    print(bestcost)
+    print(params)
+    print(history)
+    ```
+
+    The cost function to optimize is the sum of the two parameters, which is minimized. Both parameters are continuous, and two pandas dataframes are needed for the initialization: one with the mus and another with the sigmas (the diagonal of the covariance matrix).
+
+    The EDA returns the best cost, the best combination of parameters, and the history of best costs, in case it is to be plotted.
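+    As a quick illustration (not part of the package itself; it assumes matplotlib is installed), the returned history can be plotted directly:
+
+    ```python
+    import matplotlib.pyplot as plt
+
+    # 'history' is the list of best costs returned by EDAmulti.run above
+    plt.plot(range(len(history)), history)
+    plt.xlabel('iteration')
+    plt.ylabel('best cost of the generation')
+    plt.show()
+    ```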
+
    ## Getting started

    #### Prerequisites
@@ -129,12 +165,14 @@ Description: # EDApy
    #### Installing
    ```
-    pip install git+https://github.com/vicenteperezsoloviev/EDApy.git#egg=EDApy
+    pip install git+https://github.com/vicenteperezsoloviev/EDAspy.git#egg=EDAspy
    ```
+Keywords: EDA,estimation,bayesian,evolutionary,algorithm,optimization
 Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
 Classifier: Programming Language :: Python :: 3
-Classifier: License :: OSI Approved :: MIT License
+Classifier: License :: OSI Approved :: LGPL-2.1
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.6
 Description-Content-Type: text/markdown
diff --git a/EDAspy.egg-info/SOURCES.txt b/EDAspy.egg-info/SOURCES.txt
new file mode 100644
index 0000000..0ec1924
--- /dev/null
+++ b/EDAspy.egg-info/SOURCES.txt
@@ -0,0 +1,18 @@
+README.md
+setup.py
+EDAspy/__init__.py
+EDAspy.egg-info/PKG-INFO
+EDAspy.egg-info/SOURCES.txt
+EDAspy.egg-info/dependency_links.txt
+EDAspy.egg-info/top_level.txt
+EDAspy/optimization/__init__.py
+EDAspy/optimization/multivariate/EDA_multivariate.py
+EDAspy/optimization/multivariate/EDA_multivariate_gaussian.py
+EDAspy/optimization/multivariate/__BayesianNetwork.py
+EDAspy/optimization/multivariate/__clustering.py
+EDAspy/optimization/multivariate/__init__.py
+EDAspy/optimization/multivariate/__matrix.py
+EDAspy/optimization/univariate/__init__.py
+EDAspy/optimization/univariate/continuous.py
+EDAspy/optimization/univariate/discrete.py
+tests/__init__.py
\ No newline at end of file
diff --git a/EDApy.egg-info/dependency_links.txt b/EDAspy.egg-info/dependency_links.txt
similarity index 100%
rename from EDApy.egg-info/dependency_links.txt
rename to EDAspy.egg-info/dependency_links.txt
diff --git a/EDAspy.egg-info/top_level.txt b/EDAspy.egg-info/top_level.txt
new file mode 100644
index 0000000..5263bb9
--- /dev/null
+++ b/EDAspy.egg-info/top_level.txt
@@ -0,0 +1,2 @@
+EDAspy
+tests
diff --git a/EDApy/__init__.py b/EDAspy/__init__.py
similarity index 100%
rename from EDApy/__init__.py
rename to EDAspy/__init__.py
diff --git a/EDApy/optimization/__init__.py b/EDAspy/optimization/__init__.py
similarity index 100%
rename from EDApy/optimization/__init__.py
rename to EDAspy/optimization/__init__.py
diff --git a/EDApy/optimization/multivariate/EDA_multivariate.py b/EDAspy/optimization/multivariate/EDA_multivariate.py
similarity index 100%
rename from EDApy/optimization/multivariate/EDA_multivariate.py
rename to EDAspy/optimization/multivariate/EDA_multivariate.py
diff --git a/EDApy/optimization/multivariate/EDA_multivariate_gaussian.py b/EDAspy/optimization/multivariate/EDA_multivariate_gaussian.py
similarity index 100%
rename from EDApy/optimization/multivariate/EDA_multivariate_gaussian.py
rename to EDAspy/optimization/multivariate/EDA_multivariate_gaussian.py
diff --git a/EDApy/optimization/multivariate/__BayesianNetwork.py b/EDAspy/optimization/multivariate/__BayesianNetwork.py
similarity index 100%
rename from EDApy/optimization/multivariate/__BayesianNetwork.py
rename to EDAspy/optimization/multivariate/__BayesianNetwork.py
diff --git a/EDApy/optimization/multivariate/__clustering.py b/EDAspy/optimization/multivariate/__clustering.py
similarity index 100%
rename from EDApy/optimization/multivariate/__clustering.py
rename to EDAspy/optimization/multivariate/__clustering.py
diff --git a/EDApy/optimization/multivariate/__init__.py b/EDAspy/optimization/multivariate/__init__.py
similarity index 99%
rename from EDApy/optimization/multivariate/__init__.py
rename
to EDAspy/optimization/multivariate/__init__.py
index 6742f4a..76dbb15 100644
--- a/EDApy/optimization/multivariate/__init__.py
+++ b/EDAspy/optimization/multivariate/__init__.py
@@ -22,6 +22,7 @@ def check_package(installed_pack, package):
     else:
         return False

+
 if sys.version_info[0] < 3:
     raise Exception("Python version should be greater than 3")

diff --git a/EDApy/optimization/multivariate/__matrix.py b/EDAspy/optimization/multivariate/__matrix.py
similarity index 100%
rename from EDApy/optimization/multivariate/__matrix.py
rename to EDAspy/optimization/multivariate/__matrix.py
diff --git a/EDApy/optimization/univariate/__init__.py b/EDAspy/optimization/univariate/__init__.py
similarity index 100%
rename from EDApy/optimization/univariate/__init__.py
rename to EDAspy/optimization/univariate/__init__.py
diff --git a/EDApy/optimization/univariate/continuous.py b/EDAspy/optimization/univariate/continuous.py
similarity index 100%
rename from EDApy/optimization/univariate/continuous.py
rename to EDAspy/optimization/univariate/continuous.py
diff --git a/EDApy/optimization/univariate/discrete.py b/EDAspy/optimization/univariate/discrete.py
similarity index 100%
rename from EDApy/optimization/univariate/discrete.py
rename to EDAspy/optimization/univariate/discrete.py
diff --git a/README.md b/README.md
index 03caf54..1f97400 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# EDApy
+# EDAspy

 ## Description

@@ -14,7 +14,7 @@ Three EDAs have been implemented:
 #### Binary univariate EDA
 It can be used as a simple example of an EDA, or for feature selection. The cost function to optimize is the metric of the model. An example is shown.
 ```python
-from EDApy.optimization.univariate import EDA_discrete as EDAd
+from EDAspy.optimization.univariate import EDA_discrete as EDAd
 import pandas as pd

 def check_solution_in_model(dictionary):
@@ -43,7 +43,7 @@ Vector probabilities are usually initialized to 0.5 to start from an equilibrium
 This EDA is used when some continuous parameters must be optimized.
 ```python
-from EDApy.optimization.univariate import EDA_continuous as EDAc
+from EDAspy.optimization.univariate import EDA_continuous as EDAc
 import pandas as pd
 import numpy as np

@@ -81,7 +81,7 @@ In this case, dependencies among the variables are considered and managed with a
 The optimizer will find the optimum values of the non-evidence variables based on the value of the evidences. This is widely used in problems where dependencies among variables must be considered.
 ```python
-from EDApy.optimization.multivariate import EDA_multivariate as EDAm
+from EDAspy.optimization.multivariate import EDA_multivariate as EDAm
 import pandas as pd

 blacklist = pd.DataFrame(columns=['from', 'to'])
@@ -107,7 +107,7 @@ Due to the evidences, to help the structure learning algorithm to find the arcs,
 In this case, the output is the self class, which can be saved as a pickle in order to explore the attributes. One of the attributes is the optimum structure of the optimum generation, from which the structure can be plotted to observe the dependencies among the variables.
 The function to plot the structure is the following:
 ```python
-from EDApy.optimization.multivariate import print_structure
+from EDAspy.optimization.multivariate import print_structure
 print_structure(structure=structure, var2optimize=['param2', 'param3', 'param4'], evidences=['param1', 'param5'])
 ```
@@ -120,7 +120,7 @@ The EDA is initialized, as in the univariate continuous EDA, with univariate mus
 ```python
 import pandas as pd
-from EDApy.optimization.multivariate import EDA_multivariate_gaussian as EDAmg
+from EDAspy.optimization.multivariate import EDA_multivariate_gaussian as EDAmg


 def cost_function(dictionary):
@@ -151,10 +151,10 @@ The EDA returns the best cost, the combination and the history of costs if wante
 ## Getting started

 #### Prerequisites
-R must be installed to use the multivariate EDA, with installed libraries c("bnlearn", "dbnR", "data.table")
+R must be installed to use the multivariate EDA with Bayesian networks, with the following installed libraries: c("bnlearn", "dbnR", "data.table")
 To manage R from Python, the rpy2 package must also be installed.

 #### Installing
 ```
-pip install git+https://github.com/vicenteperezsoloviev/EDApy.git#egg=EDApy
+pip install git+https://github.com/vicenteperezsoloviev/EDAspy.git#egg=EDAspy
 ```
diff --git a/build/lib/EDApy/__init__.py b/build/lib/EDAspy/__init__.py
similarity index 100%
rename from build/lib/EDApy/__init__.py
rename to build/lib/EDAspy/__init__.py
diff --git a/build/lib/EDApy/optimization/__init__.py b/build/lib/EDAspy/optimization/__init__.py
similarity index 100%
rename from build/lib/EDApy/optimization/__init__.py
rename to build/lib/EDAspy/optimization/__init__.py
diff --git a/build/lib/EDApy/optimization/multivariate/EDA_multivariate.py b/build/lib/EDAspy/optimization/multivariate/EDA_multivariate.py
similarity index 100%
rename from build/lib/EDApy/optimization/multivariate/EDA_multivariate.py
rename to build/lib/EDAspy/optimization/multivariate/EDA_multivariate.py
diff --git a/build/lib/EDAspy/optimization/multivariate/EDA_multivariate_gaussian.py b/build/lib/EDAspy/optimization/multivariate/EDA_multivariate_gaussian.py
new file mode 100644
index 0000000..f206656
--- /dev/null
+++ b/build/lib/EDAspy/optimization/multivariate/EDA_multivariate_gaussian.py
@@ -0,0 +1,230 @@
+import pandas as pd
+import numpy as np
+
+
+class EDA_multivariate_gaussian:
+
+    """Multivariate continuous Estimation of Distribution algorithm.
+    New individuals are sampled from a multivariate normal distribution. Evidences are not allowed.
+
+    :param SIZE_GEN: total size of the generations in the execution of the algorithm
+    :type SIZE_GEN: int
+    :param MAX_ITER: total number of iterations in case the optimum is not found before. If reached, the optimum found is returned
+    :type MAX_ITER: int
+    :param DEAD_ITER: total number of iterations with no better solution found. If reached, the optimum found is returned
+    :type DEAD_ITER: int
+    :param ALPHA: percentage of the generation to take, in order to sample from them. The best-individuals selection
+    :type ALPHA: float [0-1]
+    :param aim: Represents the optimization aim.
+    :type aim: 'minimize' or 'maximize'.
+    :param cost_function: a callable function implemented by the user, to optimize.
+    :type cost_function: callable function which receives a dictionary as input and returns a numeric
+    :param mus: pandas dataframe with the initial mus of the multivariate gaussian
+    :type mus: pandas dataframe (one row)
+    :param sigma: pandas dataframe with the sigmas of the variables (diagonal of the covariance matrix)
+    :type sigma: pandas dataframe (one row)
+
+    :raises Exception: cost function is not callable
+
+    """
+
+    SIZE_GEN = -1
+    MAX_ITER = -1
+    DEAD_ITER = -1
+    alpha = -1
+    vector = -1
+
+    generation = -1
+
+    best_mae_global = -1
+    best_ind_global = -1
+
+    cost_function = -1
+    history = []
+
+    def __init__(self, SIZE_GEN, MAX_ITER, DEAD_ITER, ALPHA, aim, cost_function, mus, sigma):
+        """Constructor of the optimizer class
+        """
+
+        self.SIZE_GEN = SIZE_GEN
+        self.MAX_ITER = MAX_ITER
+        self.alpha = ALPHA
+
+        self.variables = list(sigma.columns)
+
+        if aim == 'minimize':
+            self.aim = 'min'
+            self.best_mae_global = 999999999999
+        elif aim == 'maximize':
+            self.aim = 'max'
+            self.best_mae_global = -999999999999
+        else:
+            raise Exception('ERROR when setting aim of optimizer. Only "minimize" or "maximize" is possible')
+
+        # check that cost_function is callable
+        if callable(cost_function):
+            self.cost_function = cost_function
+        else:
+            raise Exception('ERROR setting cost function. The cost function must be a callable function')
+
+        # self.DEAD_ITER must be fewer than MAX_ITER
+        if DEAD_ITER >= MAX_ITER:
+            raise Exception('ERROR setting DEAD_ITER. The dead iterations must be fewer than the maximum iterations')
+        else:
+            self.DEAD_ITER = DEAD_ITER
+
+        # multivariate
+        self.mus = mus
+
+        sigma_data = pd.DataFrame(columns=mus.columns)
+        sigma_data['vars'] = list(sigma_data.columns)
+        sigma_data = sigma_data.set_index('vars')
+        for var in list(sigma_data.columns):
+            sigma_data.loc[var, var] = float(sigma[var])
+        sigma_data = sigma_data.fillna(0)
+
+        self.sigma = sigma_data
+
+    # new individual
+    def __new_individual__(self):
+        """Sample a new individual from the multivariate normal distribution.
+        :return: a dictionary with the new individual; with the names of the parameters as keys and their values.
+        :rtype: dict
+        """
+        mus = self.mus.loc[0].values.tolist()
+        sigma = self.sigma.values.tolist()
+
+        rand = list(np.random.multivariate_normal(mus, sigma, 1)[0])
+        dic = {}
+        for i in range(len(rand)):
+            key = list(self.sigma.columns)[i]
+            dic[key] = rand[i]
+
+        return dic
+
+    # build a generation of size SIZE_GEN from the multivariate normal
+    def new_generation(self):
+        """Build a new generation sampled from the multivariate normal distribution. Updates the generation pandas dataframe
+        """
+        gen = pd.DataFrame(columns=self.variables)
+        while len(gen) < self.SIZE_GEN:
+            individual = self.__new_individual__()
+            gen = gen.append(individual, True)
+
+        # drop duplicate individuals
+        gen = gen.drop_duplicates()
+        gen = gen.reset_index()
+        del gen['index']
+
+        self.generation = gen
+
+    # truncate the generation at alpha percent
+    def truncation(self):
+        """Selection of the best individuals of the current generation.
+        Updates the generation by keeping only the best individuals
+        """
+
+        length = int(self.SIZE_GEN * self.alpha)
+
+        # depending on whether min or max is wanted
+        if self.aim == 'max':
+            self.generation = self.generation.nlargest(length, 'cost')
+        elif self.aim == 'min':
+            self.generation = self.generation.nsmallest(length, 'cost')
+
+    # evaluate the cost of each individual
+    def __check_individual__(self, individual):
+        """Check the cost of the individual in the cost function
+
+        :param individual: dictionary with the parameters to optimize as keys and their values as values
+        :type individual: dict
+        :return: the cost evaluated in the cost function to optimize
+        :rtype: float
+        """
+
+        cost = self.cost_function(individual)
+        return cost
+
+    # check each individual of the generation
+    def check_generation(self):
+        """Check the cost of each individual in the cost function implemented by the user
+        """
+
+        for ind in range(len(self.generation)):
+            cost = self.__check_individual__(self.generation.loc[ind])
+            self.generation.loc[ind, 'cost'] = cost
+
+    # update the sampling distribution
+    def update_vector(self):
+        """From the best individuals, update the multivariate normal distribution so that the next
+        generation can be sampled from it.
+        """
+
+        # build the covariance matrix from the selection
+        self.variables = list(self.sigma.columns)
+        self.generation = self.generation.astype(float)
+        covariance_matrix = self.generation[self.variables].cov()  # covariance matrix
+        self.sigma = covariance_matrix.copy()
+
+        for var in self.variables:
+            # update the mean
+            self.mus.loc[0, var] = float(self.generation[var].mean())
+
+            # keep the variance from collapsing: enforce a lower bound
+            if self.sigma.loc[var, var] <= 1:
+                self.sigma.loc[var, var] = 1
+
+    # internal function to compare the local cost with the global one
+    def __compare_costs__(self, local):
+        """Check if the local best cost is better than the global one
+        :param local: local best cost
+        :type local: float
+        :return: True if it is better, False if not
+        :rtype: bool
+        """
+
+        if self.aim == 'min':
+            return local <= self.best_mae_global
+        else:
+            return local >= self.best_mae_global
+
+    # run the class to find the optimum
+    def run(self, output=True):
+        """Run method to execute the EDA algorithm
+
+        :param output: True to print the progress of each iteration
+        :type output: bool
+        :return: best cost, best individual, history of costs along the execution
+        :rtype: float, pandas dataframe, list
+        """
+
+        not_better = 0
+        for i in range(self.MAX_ITER):
+            self.new_generation()
+            self.check_generation()
+            self.truncation()
+            self.update_vector()
+
+            if self.aim == 'min':
+                best_mae_local = self.generation['cost'].min()
+            else:
+                best_mae_local = self.generation['cost'].max()
+
+            self.history.append(best_mae_local)
+            best_ind_local = self.generation[self.generation['cost'] == best_mae_local]
+
+            # update the best values ever
+            if self.__compare_costs__(best_mae_local):
+                self.best_mae_global = best_mae_local
+                self.best_ind_global = best_ind_local
+                not_better = 0
+            else:
+                not_better = not_better + 1
+                if not_better == self.DEAD_ITER:
+                    return self.best_mae_global, self.best_ind_global, self.history
+
+            if output:
+                print('IT ', i, 'best cost ', best_mae_local)
+
+        return self.best_mae_global, self.best_ind_global, self.history
diff --git a/build/lib/EDApy/optimization/multivariate/__BayesianNetwork.py b/build/lib/EDAspy/optimization/multivariate/__BayesianNetwork.py
similarity index 100% rename from build/lib/EDApy/optimization/multivariate/__BayesianNetwork.py rename to build/lib/EDAspy/optimization/multivariate/__BayesianNetwork.py diff --git a/build/lib/EDApy/optimization/multivariate/__clustering.py b/build/lib/EDAspy/optimization/multivariate/__clustering.py similarity index 100% rename from build/lib/EDApy/optimization/multivariate/__clustering.py rename to build/lib/EDAspy/optimization/multivariate/__clustering.py diff --git a/build/lib/EDApy/optimization/multivariate/__init__.py b/build/lib/EDAspy/optimization/multivariate/__init__.py similarity index 99% rename from build/lib/EDApy/optimization/multivariate/__init__.py rename to build/lib/EDAspy/optimization/multivariate/__init__.py index 6742f4a..76dbb15 100644 --- a/build/lib/EDApy/optimization/multivariate/__init__.py +++ b/build/lib/EDAspy/optimization/multivariate/__init__.py @@ -22,6 +22,7 @@ def check_package(installed_pack, package): else: return False + if sys.version_info[0] < 3: raise Exception("Python version should be greater than 3") diff --git a/build/lib/EDApy/optimization/multivariate/__matrix.py b/build/lib/EDAspy/optimization/multivariate/__matrix.py similarity index 100% rename from build/lib/EDApy/optimization/multivariate/__matrix.py rename to build/lib/EDAspy/optimization/multivariate/__matrix.py diff --git a/build/lib/EDApy/optimization/univariate/__init__.py b/build/lib/EDAspy/optimization/univariate/__init__.py similarity index 100% rename from build/lib/EDApy/optimization/univariate/__init__.py rename to build/lib/EDAspy/optimization/univariate/__init__.py diff --git a/build/lib/EDApy/optimization/univariate/continuous.py b/build/lib/EDAspy/optimization/univariate/continuous.py similarity index 100% rename from build/lib/EDApy/optimization/univariate/continuous.py rename to build/lib/EDAspy/optimization/univariate/continuous.py diff --git a/build/lib/EDApy/optimization/univariate/discrete.py b/build/lib/EDAspy/optimization/univariate/discrete.py similarity index 100% rename from build/lib/EDApy/optimization/univariate/discrete.py rename to build/lib/EDAspy/optimization/univariate/discrete.py diff --git a/build/lib/tests/__init__.py b/build/lib/tests/__init__.py new file mode 100644 index 0000000..6f1a779 --- /dev/null +++ b/build/lib/tests/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python +# coding: utf-8 + +# __init__.py + +# tests + diff --git a/dist/EDApy-0.1.0.tar.gz b/dist/EDApy-0.1.0.tar.gz new file mode 100644 index 0000000..900a4c4 Binary files /dev/null and b/dist/EDApy-0.1.0.tar.gz differ diff --git a/dist/EDAspy-0.1.0-py3.7.egg b/dist/EDAspy-0.1.0-py3.7.egg new file mode 100644 index 0000000..48619de Binary files /dev/null and b/dist/EDAspy-0.1.0-py3.7.egg differ diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle index ed649e8..1df52ad 100644 Binary files a/docs/build/doctrees/environment.pickle and b/docs/build/doctrees/environment.pickle differ diff --git a/docs/build/doctrees/index.doctree b/docs/build/doctrees/index.doctree index 3e814d9..b18ddad 100644 Binary files a/docs/build/doctrees/index.doctree and b/docs/build/doctrees/index.doctree differ diff --git a/docs/build/html/.buildinfo b/docs/build/html/.buildinfo index c68f38b..5430c0b 100644 --- a/docs/build/html/.buildinfo +++ b/docs/build/html/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
-config: 3f8bbc54a5f68cb32333a85c91280d12
+config: 9a5f9cb4ba26550d85363ba438d629fe
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/docs/build/html/_modules/EDApy/optimization/multivariate/EDA_multivariate_gaussian.html b/docs/build/html/_modules/EDApy/optimization/multivariate/EDA_multivariate_gaussian.html
new file mode 100644
index 0000000..7f55e08
--- /dev/null
+++ b/docs/build/html/_modules/EDApy/optimization/multivariate/EDA_multivariate_gaussian.html
@@ -0,0 +1,327 @@
+[Sphinx-generated page "EDApy.optimization.multivariate.EDA_multivariate_gaussian — EDApy 0.0.1 documentation": the body is the "Source code for EDApy.optimization.multivariate.EDA_multivariate_gaussian" listing, a copy of the EDA_multivariate_gaussian.py source shown above; page markup and navigation omitted here.]
\ No newline at end of file
diff --git a/docs/build/html/_modules/EDAspy/optimization/multivariate/EDA_multivariate.html b/docs/build/html/_modules/EDAspy/optimization/multivariate/EDA_multivariate.html
new file mode 100644
index 0000000..422b607
--- /dev/null
+++ b/docs/build/html/_modules/EDAspy/optimization/multivariate/EDA_multivariate.html
@@ -0,0 +1,538 @@
+[Sphinx page "EDAspy.optimization.multivariate.EDA_multivariate — EDAspy 0.0.1 documentation"; page markup and navigation omitted, source listing follows.]
Source code for EDAspy.optimization.multivariate.EDA_multivariate

+#!/usr/bin/env python
+# coding: utf-8
+
+import pandas as pd
+import rpy2.robjects.packages as rp
+import numpy as np
+
+from .__BayesianNetwork import learn_structure, calculate_fit
+from .__clustering import clustering
+from .__matrix import nearestPD, normalizacion, is_invertible
+
+utils = rp.importr('utils')
+utils.chooseCRANmirror(ind=1)
+bnlearn_package = rp.importr("bnlearn")
+dbnRPac = rp.importr("dbnR")
+
+
+
[docs]class EDAgbn:
+
+    """Multivariate Estimation of Distribution algorithm. The best individuals of each generation are selected and
+    modelled by a Gaussian Bayesian network, from which new individuals are sampled. Some of the variables might be
+    evidences (fixed values). The optimizer will find the optimum values of the non-evidence variables for those
+    evidences. It is also possible to control the influence of the historic data (from which the EDA is initialized)
+    in the selection of the best individuals.
+
+    :param MAX_ITER: Maximum number of iterations of the algorithm
+    :type MAX_ITER: int
+    :param DEAD_ITER: Number of iterations with no best cost improvement, before stopping
+    :type DEAD_ITER: int
+    :param data: the historic data
+    :type data: pandas dataframe
+    :param ALPHA: percentage of the population to select in the truncation
+    :type ALPHA: float [0,1]
+    :param BETA: percentage of influence of the individual likelihood in the historic
+    :type BETA: float [0,1]
+    :param cost_function: cost function to minimize
+    :type cost_function: callable function which receives a dictionary as input and returns a numeric value
+    :param evidences: names of the evidence variables, and their fixed values.
+    :type evidences: two-fold list. A list that contains lists of size 2 with the name of the variable and its value [name, value]
+    :param black_list: forbidden arcs in the structures
+    :type black_list: pandas dataframe with two columns (from, to)
+    :param n_clusters: number of clusters in which the data can be grouped. The cluster is appended in each iteration
+    :type n_clusters: int
+    :param cluster_vars: list of names of the variables to consider in the clustering
+    :type cluster_vars: list of strings
+
+    :raises Exception: cost function is not callable
+
+    """
+
+    # initializations
+    # structure = -1
+
+    # best_ind_global = -1
+    # best_cost_global = 999999999999
+    # best_structure = -1
+    # history = -1
+    # history_cost = []
+    # dispersion = []
+
+    def __init__(self, MAX_ITER, DEAD_ITER, data, ALPHA, BETA, cost_function,
+                 evidences, black_list, n_clusters, cluster_vars):
+
+        """Constructor method
+        """
+
+        self.MAX_ITER = MAX_ITER
+        self.DEAD_ITER = DEAD_ITER
+        self.data = data
+        self.length = int(len(data) * ALPHA)
+        self.beta = BETA
+
+        if callable(cost_function):
+            self.cost_function = cost_function
+        else:
+            raise Exception('ERROR setting cost function. Cost function must be callable')
+
+        self.evidences = evidences
+
+        # calculate the historic fit
+        self.historic_fit = calculate_fit(data, 'hc', black_list)
+        self.black_list = black_list
+        self.generation = data
+
+        # definition of the variables to optimize
+        ev = [row[0] for row in evidences]
+        columns = list(data.columns)
+        self.variables2optimize = list(set(columns) - set(ev))
+
+        # clustering for soft restrictions
+        cluster = clustering(n_clusters, data, evidences, cluster_vars)
+        # add the cost to the clustering selection
+        indexes = list(cluster.index)
+        dic = {}
+        for index in indexes:
+            for var in self.variables2optimize:
+                dic[var] = float(cluster.loc[index, var])
+
+            cost = self.cost_function(dic)
+            cluster.loc[index, 'COSTE'] = cost
+        self.dt_aux = cluster.nsmallest(self.length, 'COSTE')  # selection of the best clustered individuals
+
+        # initializations
+        self.__best_cost_global = 99999999999
+        self.__best_structure = -1
+        self.__history = pd.DataFrame(columns=data.columns)
+        self.__history_cost = []
+        self.__dispersion = []
+        self.__best_ind_global = -1
+
+        self.structure = -1
+
+    def __initialize_data__(self):
+
+        """Initialize the dataset.
+        Assign a cost column ('COSTE') to each individual
+        """
+
+        indexes = list(self.generation.index)
+        dic = {}
+        for index in indexes:
+            for var in self.variables2optimize:
+                dic[var] = float(self.generation.loc[index, var])
+
+            cost = self.cost_function(dic)
+            self.generation.loc[index, 'COSTE'] = cost
+
[docs]    def truncate(self):
+
+        """Select the best individuals of the generation. In this case not only the cost is considered: the
+        likelihood of the individual with respect to the historic fit is also taken into account. This influence
+        is controlled by the beta parameter
+        """
+
+        likelihood_estimation = bnlearn_package.logLik_bn_fit
+        names = list(self.generation.columns)
+        to_test = list(set(names) - {'COSTE'})
+
+        logs = []
+        for index, row in self.generation[to_test].iterrows():
+            log = likelihood_estimation(self.historic_fit, pd.DataFrame(row).T)[0]
+            logs.append([index, log])
+
+        maximo = max([row[1] for row in logs])
+        for i in logs:
+            i[1] = maximo - i[1]
+            value = float(self.generation.loc[i[0], 'COSTE']) + self.beta * abs(i[1])
+            self.generation.loc[i[0], 'trun'] = value
+
+        self.generation = self.generation.nsmallest(self.length, 'trun')
+        del self.generation['trun']
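+
+    # i.e. each individual is ranked by: COSTE + BETA * (max_log_lik - log_lik),
+    # so individuals that are unlikely under the historic fit are penalized in proportion to BETA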
+ +
[docs]    def sampling_multivariate(self, fit):
+
+        """Calculate the parameters mu and sigma to sample from a multivariate normal distribution.
+
+        :param fit: bnfit object from R of the generation (structure and data)
+        :type fit: bnfit object from R
+        :return: names, in order, of the parameters returned; mu and sigma parameters of the multivariate normal distribution
+        :rtype: list, float, float
+        """
+
+        hierarchical_order = bnlearn_package.node_ordering(self.structure)
+        mu_calc, sigma_calc = dbnRPac.calc_mu, dbnRPac.calc_sigma
+
+        # mus array -> alphabetical order
+        mat_mus = mu_calc(fit)
+
+        # covariance matrix in hierarchical order
+        cov = list(sigma_calc(fit))
+        split = len(mat_mus)
+        mat_cov = np.array(cov).reshape(split, split)
+
+        # change the order of the columns and rows. Up-left must be the variables and down-right the evidences
+        sampled_nodes = [row[0] for row in self.evidences]
+        sampled_value = [row[1] for row in self.evidences]
+        nodes2sample = list(set(hierarchical_order) - set(sampled_nodes))
+
+        # order the evidences and the variables to sample in hierarchical order
+        nodes2sample_hierarchical_order = []
+        nodes_sampled_hierarchical_order = []
+
+        for i in hierarchical_order:
+            if i in nodes2sample:
+                nodes2sample_hierarchical_order.append(i)
+            else:
+                nodes_sampled_hierarchical_order.append(i)
+
+        mat_cov_order = nodes2sample_hierarchical_order + nodes_sampled_hierarchical_order  # desired order
+        order = []
+        for i in mat_cov_order:
+            order.append(hierarchical_order.index(i))  # place of each order[] entry in the hierarchical order
+
+        # covariance matrix
+        mat_cov[list(range(len(hierarchical_order))), :] = mat_cov[order, :]  # rows swap
+        mat_cov[:, list(range(len(hierarchical_order)))] = mat_cov[:, order]  # columns swap
+        mat_cov_evid = np.array([row[len(nodes2sample):] for row in mat_cov[len(nodes2sample):]])
+
+        from numpy.linalg import inv, pinv
+        if is_invertible(mat_cov):
+            mat_cov_inv = inv(mat_cov)
+        else:
+            mat_cov_inv = pinv(mat_cov, hermitian=True)
+
+        mat_cov_inv_data = np.array([row[:len(nodes2sample)] for row in mat_cov_inv[:len(nodes2sample)]])
+        sum_12 = [row[len(nodes2sample):] for row in mat_cov[:len(nodes2sample)]]
+
+        orden_mat_mus = sorted(hierarchical_order)  # hierarchical order sorted alphabetically
+
+        mus = []  # mus in hierarchical order
+        for i in hierarchical_order:  # append to mus the mu calculated at the position where i is
+            mus.append(mat_mus[orden_mat_mus.index(i)])
+
+        values = []  # sampled values in hierarchical order
+        order_values = []
+        for i in hierarchical_order:
+            if i not in nodes2sample:
+                order_values.append(i)
+                values.append(sampled_value[sampled_nodes.index(i)])
+
+        # mus in hierarchical order
+        mat_mus_data = []
+        mat_mus_evid = []
+
+        for i in hierarchical_order:
+            if i in nodes2sample:
+                mat_mus_data.append(mus[hierarchical_order.index(i)])
+            else:
+                mat_mus_evid.append(mus[hierarchical_order.index(i)])
+
+        # Murphy's conditional Gaussian formulas
+        aux = np.subtract(np.array(values), np.array(mat_mus_evid))
+        aux1 = np.matmul(sum_12, pinv(mat_cov_evid))
+        aux2 = np.matmul(aux1, aux)
+
+        mu_cond = np.add(mat_mus_data, np.array(aux2))
+
+        if is_invertible(mat_cov):
+            mat_cov_inv_data_inv = inv(mat_cov_inv_data)
+        else:
+            mat_cov_inv_data_inv = pinv(mat_cov_inv_data, hermitian=True)
+
+        return nodes2sample_hierarchical_order, mu_cond, mat_cov_inv_data_inv  # , densities
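+
+    # The conditional distribution computed in sampling_multivariate follows the standard
+    # Gaussian conditioning rule (the Murphy formulas referenced above):
+    #     mu_cond    = mu_1 + Sigma_12 * Sigma_22^{-1} * (x_2 - mu_2)
+    #     Sigma_cond = Sigma_11 - Sigma_12 * Sigma_22^{-1} * Sigma_21
+    # where block 1 holds the variables to sample and block 2 the evidences; Sigma_cond is
+    # recovered as the inverse of the (1,1) block of the precision matrix (a Schur complement)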
+ +
[docs] def new_generation(self): + + """Build a new generation from the parameters calculated and update the generation to the new group of individuals + + :return: mean and sigma of the individuals costs of the generation + :rtype: float, float + """ + + valid_individuals = 0 + hierarchical_order = bnlearn_package.node_ordering(self.structure) + + bn_fit = bnlearn_package.bn_fit + fit = bn_fit(self.structure, self.generation) + + nodes, mu, sigma = self.sampling_multivariate(fit) + + # precision errors solved + sigma = nearestPD(np.array(sigma)) + gen = pd.DataFrame(columns=hierarchical_order) + media = [] + counter = 0 + + while valid_individuals < len(self.data): + sample = np.random.multivariate_normal(mu, sigma, 1)[0] + # normalization + cost_position = nodes.index('COSTE') + values = [item for item in sample if list(sample).index(item) != cost_position] + ind_ = normalizacion(values) + ind = ind_[:cost_position] + [sample[cost_position]] + ind_[cost_position:] + + # append the evidences + individual = self.evidences[:] + for j in range(len(ind)): + individual.append([nodes[j], ind[j]]) + + # avoid solutions with the parameters under zero + flag = True + for i in individual: + for j in self.variables2optimize: + if i[0] == j: + if i[1] < 0: + flag = False + + # if restrictions are correct + if flag: + dic = {} + for i in individual: + aux = {i[0]: i[1]} + dic.update(aux) + + cost = self.cost_function(dic) + dic.update({'COSTE': cost}) + gen = gen.append(dic, ignore_index=True) + + valid_individuals = valid_individuals + 1 + media.append(cost) + counter = 0 + + else: + counter = counter + 1 + if counter == len(self.data) * 10: + break + + self.generation = gen.copy() + + from scipy.stats import norm + mu, sigma = norm.fit(media) + return mu, sigma
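+
+    # Note: the (mu, sigma) returned by new_generation describe the distribution of the sampled
+    # generation's costs (stored by run as a dispersion measure), not the parameters of the model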
+ +
[docs] def soft_restrictions(self, NOISE): + + """Add Gaussian noise to the evidence variables + + :param NOISE: std of the normal distribution from where noise is sampled + :type NOISE: float + """ + + number_samples = len(self.generation) + + # generate noise from a normal distribution + for i in self.evidences: + s = np.random.normal(i[1], NOISE, number_samples) + self.generation[i[0]] = s
+ + def __choose_best__(self): + + """Select the best individual of the generation + :return: cost of the individual, and the individual + :rtype: float, pandas dataframe + """ + + minimum = self.generation['COSTE'].min() + best_ind_local = self.generation[self.generation['COSTE'] == minimum] + return minimum, best_ind_local + +
[docs]    def run(self, output=True):
+        """Running method
+        :param output: True to print the progress of each iteration
+        :type output: boolean
+        :return: the class itself is returned, in order to explore all the attributes
+        :rtype: self python class
+        """
+
+        self.__initialize_data__()
+        dead_iter = 0
+
+        for ITER in range(self.MAX_ITER):
+            self.truncate()
+
+            # soften the values of the evidence variables
+            if ITER > 0:
+                self.soft_restrictions(0.01)
+                self.generation = self.generation.append(self.dt_aux, ignore_index=True)
+            else:
+                # first iteration
+                self.structure = learn_structure(self.generation, 'hc', self.black_list)
+
+            # print_structure(self.structure, self.variables2optimize, [row[0] for row in self.evidences])
+            mu, sigma = self.new_generation()
+            self.__dispersion.append([mu, sigma])  # info about the dispersion among individuals in each iteration
+            self.structure = learn_structure(self.generation, 'hc', self.black_list)
+
+            # if not all the individuals could be sampled
+            if len(self.generation) < len(self.data) / 2:
+                return self
+
+            # set the local best and update the global best
+            best_cost_local, best_ind_local = self.__choose_best__()
+            self.__history = self.__history.append(best_ind_local, ignore_index=True)  # append returns a new dataframe
+            self.__history_cost.append(best_cost_local)
+
+            # update the global cost, structure and best individual, if needed
+            if best_cost_local < self.__best_cost_global:
+                self.__best_cost_global = best_cost_local
+                self.__best_ind_global = best_ind_local
+                self.__best_structure = self.structure
+                dead_iter = 0
+            else:
+                dead_iter = dead_iter + 1
+                if dead_iter == self.DEAD_ITER:
+                    return self
+
+            if output:
+                print('ITER: ', ITER, 'dead: ', dead_iter,
+                      'bestLocal: ', best_cost_local, 'bestGlobal: ', self.best_cost_global)
+
+        return self
+ + """ + Getters of interesting attributes + """ + + @property + def best_cost_global(self): + """ + :return: best cost found ever at the end of the execution + :rtype: float + """ + return self.__best_cost_global + + @property + def best_ind_global(self): + """ + :return: best individual found ever at the end of the execution + :rtype: pandas dataframe slice + """ + return self.__best_ind_global + + @property + def best_structure(self): + """ + :return: best generation structure found ever at the end of the execution + :rtype: bnlearn structure + """ + return self.__best_structure + + @property + def history(self): + """ + :return: best individuals from all generations + :rtype: pandas dataframe + """ + return self.__history + + @property + def history_cost(self): + """ + :return: list of best costs found along the execution + :rtype: list + """ + return self.__history_cost + + @property + def dispersion(self): + """ + :return: list of double tuples with mean and variance of each generation + :rtype: list + """ + return self.__dispersion
+
\ No newline at end of file
diff --git a/docs/build/html/_modules/EDAspy/optimization/multivariate/EDA_multivariate_gaussian.html b/docs/build/html/_modules/EDAspy/optimization/multivariate/EDA_multivariate_gaussian.html
new file mode 100644
index 0000000..7fcc629
--- /dev/null
+++ b/docs/build/html/_modules/EDAspy/optimization/multivariate/EDA_multivariate_gaussian.html
@@ -0,0 +1,327 @@
+[Sphinx-generated page "EDAspy.optimization.multivariate.EDA_multivariate_gaussian — EDAspy 0.0.1 documentation": the body is the "Source code for EDAspy.optimization.multivariate.EDA_multivariate_gaussian" listing, a copy of the EDA_multivariate_gaussian.py source shown earlier in this diff; page markup and navigation omitted here.]
\ No newline at end of file
diff --git a/docs/build/html/_modules/EDAspy/optimization/univariate/continuous.html b/docs/build/html/_modules/EDAspy/optimization/univariate/continuous.html
new file mode 100644
index 0000000..c3d7e66
--- /dev/null
+++ b/docs/build/html/_modules/EDAspy/optimization/univariate/continuous.html
@@ -0,0 +1,333 @@
+[Sphinx page "EDAspy.optimization.univariate.continuous — EDAspy 0.0.1 documentation"; page markup and navigation omitted, source listing follows.]
Source code for EDAspy.optimization.univariate.continuous

+import pandas as pd
+import numpy as np
+
+'''
+In this version of UMDA, instead of a vector of probabilities, a vector of univariate normal distributions is used.
+When sampling, values are drawn from these Gaussians.
+vector is a table with columns as variables, and rows with mu, std, and optional max and min.
+'''
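+# For example, a minimal vector for two variables could be built as follows
+# ('param1' and 'param2' are placeholder names, not part of this module):
+#     vector = pd.DataFrame(columns=['param1', 'param2'])
+#     vector.loc['mu'] = [5.0, 8.0]
+#     vector.loc['std'] = [2.0, 2.0]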
+
+
+
[docs]class UMDAc:
+
+    """Univariate marginal Estimation of Distribution algorithm, continuous version.
+    New individuals are sampled from a vector of univariate normal distributions.
+
+    :param SIZE_GEN: total size of the generations in the execution of the algorithm
+    :type SIZE_GEN: int
+    :param MAX_ITER: total number of iterations in case the optimum is not found before. If reached, the optimum found is returned
+    :type MAX_ITER: int
+    :param DEAD_ITER: total number of iterations with no better solution found. If reached, the optimum found is returned
+    :type DEAD_ITER: int
+    :param ALPHA: percentage of the generation to take, in order to sample from them. The best-individuals selection
+    :type ALPHA: float [0-1]
+    :param vector: vector of normal distributions to sample from
+    :type vector: pandas dataframe with the variables as columns and rows ['mu', 'std'], with optional rows ['min', 'max']
+    :param aim: Represents the optimization aim.
+    :type aim: 'minimize' or 'maximize'.
+    :param cost_function: a callable function implemented by the user, to optimize.
+    :type cost_function: callable function which receives a dictionary as input and returns a numeric
+
+    :raises Exception: cost function is not callable
+
+    """
+
+    SIZE_GEN = -1
+    MAX_ITER = -1
+    DEAD_ITER = -1
+    alpha = -1
+    vector = -1
+
+    generation = -1
+
+    best_mae_global = -1
+    best_ind_global = -1
+
+    cost_function = -1
+    history = []
+
+    def __init__(self, SIZE_GEN, MAX_ITER, DEAD_ITER, ALPHA, vector, aim, cost_function):
+        """Constructor of the optimizer class
+        """
+
+        self.SIZE_GEN = SIZE_GEN
+        self.MAX_ITER = MAX_ITER
+        self.alpha = ALPHA
+        self.vector = vector
+
+        self.variables = list(vector.columns)
+
+        if aim == 'minimize':
+            self.aim = 'min'
+            self.best_mae_global = 999999999999
+        elif aim == 'maximize':
+            self.aim = 'max'
+            self.best_mae_global = -999999999999
+        else:
+            raise Exception('ERROR when setting aim of optimizer. Only "minimize" or "maximize" is possible')
+
+        # check that cost_function is callable
+        if callable(cost_function):
+            self.cost_function = cost_function
+        else:
+            raise Exception('ERROR setting cost function. The cost function must be a callable function')
+
+        # self.DEAD_ITER must be fewer than MAX_ITER
+        if DEAD_ITER >= MAX_ITER:
+            raise Exception('ERROR setting DEAD_ITER. The dead iterations must be fewer than the maximum iterations')
+        else:
+            self.DEAD_ITER = DEAD_ITER
+
+    # new individual
+    def __new_individual__(self):
+        """Sample a new individual from the vector of normal distributions.
+        :return: a dictionary with the new individual; with the names of the parameters as keys and their values.
+        :rtype: dict
+        """
+
+        dic = {}
+        for var in self.variables:
+            mu = int(self.vector.loc['mu', var])
+            std = int(self.vector.loc['std', var])
+
+            # if a min or max restriction exists
+            if 'max' in list(self.vector.index):
+                maximum = int(self.vector.loc['max', var])
+            else:
+                maximum = 999999999999
+            if 'min' in list(self.vector.index):
+                minimum = int(self.vector.loc['min', var])
+            else:
+                minimum = -999999999999
+
+            sample = np.random.normal(mu, std, 1)
+            while sample < minimum or sample > maximum:
+                sample = np.random.normal(mu, std, 1)
+
+            dic[var] = sample[0]
+        return dic
+
+    # build a generation of size SIZE_GEN from the vector of normal distributions
+    def new_generation(self):
+        """Build a new generation sampled from the vector of normal distributions. Updates the generation pandas dataframe
+        """
+        gen = pd.DataFrame(columns=self.variables)
+        while len(gen) < self.SIZE_GEN:
+            individual = self.__new_individual__()
+            gen = gen.append(individual, True)
+
+        # drop duplicate individuals and re-index
+        gen = gen.drop_duplicates()
+        gen = gen.reset_index()
+        del gen['index']
+
+        self.generation = gen
+
+    # truncate the generation at alpha percent
+    def truncation(self):
+        """Selection of the best individuals of the current generation. Updates the generation by keeping only the best individuals
+        """
+
+        length = int(self.SIZE_GEN * self.alpha)
+
+        # depending on whether the aim is min or max
+        if self.aim == 'max':
+            self.generation = self.generation.nlargest(length, 'cost')
+        elif self.aim == 'min':
+            self.generation = self.generation.nsmallest(length, 'cost')
+
+    # evaluate the cost of one individual
+    def __check_individual__(self, individual):
+        """Check the cost of the individual in the cost function
+
+        :param individual: dictionary with the parameters to optimize as keys and their values as values
+        :type individual: dict
+        :return: the cost evaluated in the cost function to optimize
+        :rtype: float
+        """
+
+        cost = self.cost_function(individual)
+        return cost
+
+    # check each individual of the generation
+    def check_generation(self):
+        """Check the cost of each individual in the cost function implemented by the user
+        """
+
+        for ind in range(len(self.generation)):
+            cost = self.__check_individual__(self.generation.loc[ind])
+            self.generation.loc[ind, 'cost'] = cost
+
+    # update the vector of distributions
+    def update_vector(self):
+        """Update the vector of normal distributions from the best individuals, so that the next
+        generation can be sampled from it
+        """
+
+        # fit a gaussian to the surviving values of each variable
+        from scipy.stats import norm
+
+        for var in self.variables:
+            array = self.generation[var].values
+
+            # maximum-likelihood estimates of mu and std from the data
+            mu, std = norm.fit(array)
+
+            # std should never be 0, or sampling would collapse
+            if std < 1:
+                std = 1
+
+            # update the vector of distributions
+            self.vector.loc['mu', var] = mu
+            self.vector.loc['std', var] = std
+
+    # internal helper to compare the local cost with the global one
+    def __compare_costs__(self, local):
+        """Check if the local best cost is better than the global one
+        :param local: local best cost
+        :type local: float
+        :return: True if it is better, False if not
+        :rtype: bool
+        """
+
+        if self.aim == 'min':
+            return local <= self.best_mae_global
+        else:
+            return local >= self.best_mae_global
+
+    # run the class to find the optimum
+    def run(self, output=True):
+        """Run method to execute the EDA algorithm
+
+        :param output: True to print the best cost of each iteration
+        :type output: bool
+        :return: best cost, best individual, history of costs along the execution
+        :rtype: float, pandas dataframe, list
+        """
+
+        not_better = 0
+        for i in range(self.MAX_ITER):
+            self.new_generation()
+            self.check_generation()
+            self.truncation()
+            self.update_vector()
+
+            if self.aim == 'min':
+                best_mae_local = self.generation['cost'].min()
+            else:
+                best_mae_local = self.generation['cost'].max()
+
+            self.history.append(best_mae_local)
+            best_ind_local = self.generation[self.generation['cost'] == best_mae_local]
+
+            # update the best values found so far
+            if self.__compare_costs__(best_mae_local):
+                self.best_mae_global = best_mae_local
+                self.best_ind_global = best_ind_local
+                not_better = 0
+            else:
+                not_better = not_better + 1
+                if not_better == self.DEAD_ITER:
+                    return self.best_mae_global, self.best_ind_global, self.history
+
+            if output:
+                print('IT ', i, 'best cost ', best_mae_local)
+
+        return self.best_mae_global, self.best_ind_global, self.history
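A minimal usage sketch of the class above, assuming `UMDAc` has been imported from this module; the quadratic cost function is a made-up example:

```python
import pandas as pd

# hypothetical cost function: a shifted quadratic with minimum at (3, -1)
def cost_function(solution):
    return (solution['param1'] - 3) ** 2 + (solution['param2'] + 1) ** 2

vector = pd.DataFrame(columns=['param1', 'param2'])
vector.loc['mu'] = [10, 10]
vector.loc['std'] = [5, 5]

umda = UMDAc(SIZE_GEN=30, MAX_ITER=100, DEAD_ITER=10, ALPHA=0.5,
             vector=vector, aim='minimize', cost_function=cost_function)

# run returns the best cost found, the best individual (a one-row dataframe)
# and the list of the best cost of each iteration
best_cost, best_individual, history = umda.run(output=False)
print(best_cost)
```

Note that `new_generation` relies on the legacy `DataFrame.append` API, so the class assumes an older pandas release.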
\ No newline at end of file
diff --git a/docs/build/html/_modules/EDAspy/optimization/univariate/discrete.html b/docs/build/html/_modules/EDAspy/optimization/univariate/discrete.html
new file mode 100644
index 0000000..ea6109e
--- /dev/null
+++ b/docs/build/html/_modules/EDAspy/optimization/univariate/discrete.html
@@ -0,0 +1,313 @@
Source code for EDAspy.optimization.univariate.discrete

+import numpy as np
+import pandas as pd
+
+
+
+class UMDAd:
+
+    """Univariate marginal Estimation of Distribution Algorithm.
+    New individuals are sampled from a vector of univariate probabilities. It is a binary optimizer that can be
+    used, for example, for feature selection.
+
+    :param SIZE_GEN: total size of each generation in the execution of the algorithm
+    :type SIZE_GEN: int
+    :param MAX_IT: total number of iterations in case the optimum is not found earlier. If reached, the best solution found is returned
+    :type MAX_IT: int
+    :param DEAD_ITER: total number of iterations with no improvement allowed. If reached, the best solution found is returned
+    :type DEAD_ITER: int
+    :param ALPHA: fraction of the generation to select as the best individuals, from which the next generation is sampled
+    :type ALPHA: float [0-1]
+    :param vector: vector of probabilities to sample from
+    :type vector: pandas dataframe with one row of probabilities and one column per variable
+    :param aim: 'minimize' or 'maximize'. Represents the optimization aim
+    :type aim: string ['minimize' or 'maximize']
+    :param cost_function: cost function to optimize
+    :type cost_function: callable function which receives a dictionary as input and returns a numeric value
+
+    :raises Exception: cost function is not callable
+    """
+
+    MAX_IT = -1
+    DEAD_ITER = -1
+    SIZE_GEN = -1
+    ALPHA = -1
+    vector = []
+    variables = []
+    cost_function = -1
+    esc = 15
+
+    history = []
+
+    generation = -1
+    best_MAE_global = -1
+    best_ind_global = -1
+
+    # init function
+    def __init__(self, MAX_IT, DEAD_ITER, SIZE_GEN, ALPHA, vector, cost_function, aim):
+        """Constructor of the optimizer class
+        """
+
+        self.ALPHA = ALPHA
+        self.SIZE_GEN = SIZE_GEN
+        self.MAX_IT = MAX_IT
+
+        self.vector = vector
+        self.variables = list(vector.columns)
+
+        # check that cost_function is a real function
+        if callable(cost_function):
+            self.cost_function = cost_function
+        else:
+            raise Exception('ERROR setting cost function. The cost function must be a callable function')
+
+        if aim == 'minimize':
+            self.aim = 'min'
+            self.best_MAE_global = 9999999999
+        elif aim == 'maximize':
+            self.aim = 'max'
+            self.best_MAE_global = -9999999999
+        else:
+            raise Exception('ERROR when setting aim of optimizer. Only "minimize" or "maximize" is possible')
+
+        # self.DEAD_ITER must be fewer than MAX_IT
+        if DEAD_ITER >= MAX_IT:
+            raise Exception(
+                'ERROR setting DEAD_ITER. The dead iterations must be fewer than the maximum iterations')
+        else:
+            self.DEAD_ITER = DEAD_ITER
+
+    # sample a new individual
+    def __new_individual__(self):
+        """Sample a new individual from the vector of probabilities.
+
+        :return: a dictionary with the new individual; the names of the parameters as keys and 0/1 as values
+        :rtype: dict
+        """
+
+        num_vars = len(self.variables)
+        sample = list(np.random.uniform(low=0, high=1, size=num_vars))
+        individual = {}
+        index = 0
+        for ind in self.variables:
+            # a variable is set to 1 with its probability in the vector (a Bernoulli draw)
+            if float(self.vector[ind]) >= sample[index]:
+                individual[ind] = 1
+            else:
+                individual[ind] = 0
+            index = index + 1
+        return individual
+
+    # new generation
+    def new_generation(self):
+        """Build a new generation sampled from the vector of probabilities and update the generation pandas dataframe
+        """
+        gen = pd.DataFrame(columns=self.variables)
+
+        while len(gen) < self.SIZE_GEN:
+            individual = self.__new_individual__()
+            gen = gen.append(individual, True)
+
+        self.generation = gen
+
+    # evaluate the cost of one individual
+    def __check_individual__(self, individual):
+        """Check the cost of the individual in the cost function
+        :param individual: dictionary with the parameters to optimize as keys and their values as values
+        :type individual: dict
+        :return: the cost evaluated in the cost function to optimize
+        :rtype: float
+        """
+
+        cost = self.cost_function(individual)
+        return cost
+
+    # check the cost of each individual of the generation
+    def check_generation(self):
+        """Check the cost of each individual in the cost function implemented by the user
+        """
+
+        for ind in range(len(self.generation)):
+            cost = self.__check_individual__(self.generation.loc[ind])
+            self.generation.loc[ind, 'cost'] = cost
+
+    # selection of the best individuals to sample the next generation from
+    def individuals_selection(self):
+        """Selection of the best individuals of the current generation. Updates the generation by keeping only the best
+        individuals
+        """
+
+        length = int(len(self.generation) * self.ALPHA)
+        if self.aim == 'min':
+            self.generation = self.generation.nsmallest(length, 'cost')
+        else:
+            self.generation = self.generation.nlargest(length, 'cost')
+
+        self.generation = self.generation.reset_index()
+        del self.generation['index']
+
+    # based on the best individuals of the selection, rebuild the probability vector
+    def update_vector(self):
+        """Update the vector of probabilities from the best individuals, so that the next generation can be
+        sampled from it
+        """
+
+        for ind in self.variables:
+            # the new probability is the marginal frequency of the variable among the survivors
+            total = self.generation[ind].sum()
+            prob = total / len(self.generation)
+            self.vector[ind] = prob
+
+    # internal helper to compare the local cost with the global one
+    def __compare_costs__(self, local):
+        """Check if the local best cost is better than the global one
+        :param local: local best cost
+        :type local: float
+        :return: True if it is better, False if not
+        :rtype: bool
+        """
+
+        if self.aim == 'min':
+            return local < self.best_MAE_global
+        else:
+            return local > self.best_MAE_global
+
+    # run method
+    def run(self, output=True):
+        """Run method to execute the EDA algorithm
+
+        :param output: True to print the best cost of each iteration
+        :type output: bool
+        :return: best cost, best individual, history of costs along the execution
+        :rtype: float, list, list
+        """
+
+        dead_iter = 0
+        for i in range(self.MAX_IT):
+            self.new_generation()
+            self.check_generation()
+            self.individuals_selection()
+            self.update_vector()
+
+            if self.aim == 'min':
+                best_mae_local = self.generation['cost'].min()
+            else:
+                best_mae_local = self.generation['cost'].max()
+
+            self.history.append(best_mae_local)
+            best_ind_local = []
+            best = self.generation[self.generation['cost'] == best_mae_local].loc[0]
+
+            # the best individual is returned as the list of variables set to 1
+            for j in self.variables:
+                if int(best[j]) == 1:
+                    best_ind_local.append(j)
+
+            # update the best values of the model
+            if self.__compare_costs__(best_mae_local):
+                self.best_MAE_global = best_mae_local
+                self.best_ind_global = best_ind_local
+                dead_iter = 0
+            else:
+                dead_iter = dead_iter + 1
+                if dead_iter == self.DEAD_ITER:
+                    return self.best_MAE_global, self.best_ind_global, self.history
+
+            if output:
+                print('IT ', i, 'best cost ', best_mae_local)
+                print(best_ind_local)
+
+        return self.best_MAE_global, self.best_ind_global, self.history
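A minimal usage sketch of the binary optimizer, assuming `UMDAd` has been imported from this module; note that the constructor argument order differs from `UMDAc`. The cost function, which keeps 'var1' and penalizes every extra selected variable, is a made-up example:

```python
import pandas as pd

# hypothetical cost function: 'var1' must be selected, and every
# additional selected variable adds 1 to the cost
def cost_function(solution):
    if solution['var1'] == 0:
        return 999999999
    return sum(solution[v] for v in ['var1', 'var2', 'var3'])

# one row of initial inclusion probabilities, one column per variable
vector = pd.DataFrame(columns=['var1', 'var2', 'var3'])
vector.loc[0] = 0.5

umda = UMDAd(MAX_IT=100, DEAD_ITER=10, SIZE_GEN=30, ALPHA=0.5,
             vector=vector, cost_function=cost_function, aim='minimize')

# the best individual is returned as the list of variables set to 1
best_cost, best_ind, history = umda.run(output=False)
print(best_cost, best_ind)
```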
\ No newline at end of file
diff --git a/docs/build/html/_modules/index.html b/docs/build/html/_modules/index.html
index ab5ada0..9e84f9a 100644
--- a/docs/build/html/_modules/index.html
+++ b/docs/build/html/_modules/index.html
@@ -4,7 +4,7 @@
-    Overview: module code — EDApy 0.0.1 documentation
+    Overview: module code — EDAspy 0.0.1 documentation
@@ -31,12 +31,10 @@

All modules for which code is available
-
@@ -45,7 +43,7 @@

All modules for which code is available