
Commit 271daa4

Initial commit
0 parents  commit 271daa4

29 files changed: 34,791 additions, 0 deletions

README.md

Lines changed: 52 additions & 0 deletions

# Machine Learning simulates Agent-based Model towards Policy-Making

Submitted. Under review.

### Proposed scheme

![](analysis/Model_Proposal.png)

### Results

![](analysis/graph_sorted_POLICIES_no_policy.png)

### Requires

````
python==3.7 numpy==1.20.2 pandas==1.2.4 matplotlib==3.3.4 scipy==1.6.2 scikit-learn==0.24.2
````
The program:

1. Reads the output of an ABM model and its parameter configuration
2. Creates a socioeconomic optimal output based on two ABM results of the modeler's choice
3. Organizes the data as X and Y matrices
4. Trains a set of Machine Learning algorithms
5. Generates random parameter configurations based on the mean and standard deviation of the original parameters
6. Applies the trained ML algorithms to the set of randomly generated data
7. Outputs the mean values for the actual data, the randomly generated data, and the optimal and non-optimal cases

A schematic sketch of this pipeline is shown below.
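The sketch is illustrative only: the function, the quantile rule used to label runs as optimal, and the choice of classifier are assumptions made for the example, not the actual contents of `main.py` or `machines.py`.

```python
# Illustrative sketch of the seven steps; names, the labelling rule and the
# model choice are assumptions, not the repository's implementation.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


def run_pipeline(abm_results: pd.DataFrame, params: list, target_cols: list,
                 n_random: int = 10_000) -> pd.DataFrame:
    # Steps 1-3: label each run as optimal (1) or not (0) from the chosen
    # output indicators, then organize X (parameters) and y (label).
    composite = abm_results[target_cols].mean(axis=1)
    y = (composite >= composite.quantile(0.75)).astype(int)
    X = abm_results[params]

    # Step 4: train an ML surrogate of the ABM.
    model = RandomForestClassifier().fit(X, y)

    # Step 5: draw random configurations from each parameter's mean and std.
    rng = np.random.default_rng(0)
    random_X = pd.DataFrame(
        {p: rng.normal(X[p].mean(), X[p].std(), n_random) for p in params})

    # Steps 6-7: apply the surrogate and report mean parameter values for the
    # predicted optimal (1) and non-optimal (0) cases.
    random_X['Tree'] = model.predict(random_X[params])
    return random_X.groupby('Tree')[params].mean()
```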
The original database is large (63.7 GB).
Thus, we provide pre-processed data to run the program.
The code that performs the data selection, however, is available here in `preparing_data.py`.
## Running the program

`python main.py`

Output will be produced in the `pre_processed` folder.
With access to the original 63.7 GB of data, the target parameters can be changed in `main.py`.
We have chosen GDP and the Gini coefficient as, together, they carry a powerful, simple message: larger production with less inequality.
Further work, with a combination (PCA) of output indicators, is being developed (PolicyMix).

You may change the parameters of the ML in `machines.py`,
or the size of the sample in `generating_random_conf.py`; a sketch of that sampling step is shown below.
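The sampling step is small enough to sketch. The snippet below assumes each parameter is drawn independently from a normal distribution fitted to the original sample, as described in step 5 above; it is not a copy of `generating_random_conf.py`, and the default sample size is arbitrary.

```python
# Hedged sketch of the random-configuration step, assuming independent
# normal draws per parameter; not the actual generating_random_conf.py.
import numpy as np
import pandas as pd


def generate_random_conf(original: pd.DataFrame, n: int = 100_000,
                         seed: int = 0) -> pd.DataFrame:
    rng = np.random.default_rng(seed)
    return pd.DataFrame({col: rng.normal(original[col].mean(),
                                         original[col].std(), n)
                         for col in original.columns})
```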
## Figures

1. To produce Figure 2, `cd analysis` and run `python read_comparison.py` to generate `IQR.csv`
2. Then, run `python plot_alternative_for_table.py`
3. To produce Figure 4, run `python means_comparison.py` and `python counting.py` to generate the input files
4. Then, run `python plot_z_score_parameters.py`

analysis/IQR.csv

Lines changed: 189 additions & 0 deletions
Large diffs are not rendered by default.

analysis/Model_Proposal.png

97.9 KB

analysis/counting.py

Lines changed: 112 additions & 0 deletions

import pandas as pd

import groups_cols
from groups_cols import abm_dummies as dummies
from groups_cols import abm_params as params


def getting_counting(data, name):
    """ Produces a csv with information regarding each dummy, i.e., when the dummy is active (=1).
    The final csv has three columns: sample size, optimal and non-optimal.
    All columns are percentages of a total: sample size in relation to the whole sample; optimal and
    non-optimal in relation to the subsample of that specific dummy.

    For example, a sample size of 0.12 for POLICIES_buy means that 12% of the sample had that dummy
    active; an optimal of 0.02 and a non-optimal of 0.98 mean that, when that dummy is active and the
    policy used is buy, 98% of those samples fall under the non-optimal category.

    :param data: base csv
    :param name: name of the file
    :return: returns nothing, but saves the csv
    """
    table = pd.DataFrame(columns=['size', 'optimal', 'non_optimal', 'optimal_count', 'non_optimal_count'])
    for key in dummies:
        for each in dummies[key]:
            sample_size = len(data[data[each] == 1]) / len(data)
            optimal = len(data[(data[each] == 1) & (data['Tree'] == 1)])
            non_optimal = len(data[(data[each] == 1) & (data['Tree'] == 0)])
            total = optimal + non_optimal
            print(f'{each}: size {sample_size:.04f}: optimal {optimal/total:.04f}: '
                  f'non-optimal {non_optimal/total:.04f}: optimal_count {optimal} non-optimal_count {non_optimal}')
            table.loc[each, 'size'] = sample_size
            table.loc[each, 'optimal'] = optimal / total
            table.loc[each, 'non_optimal'] = non_optimal / total
            table.loc[each, 'optimal_count'] = optimal
            table.loc[each, 'non_optimal_count'] = non_optimal
    table.to_csv(f'../pre_processed_data/counting_{name}.csv', sep=';')


# Parameters analysis
def coefficient_variation_comparison(simulated, ml):
    """ Compares the ABM simulated results to the ML surrogate results in order to identify the
    differences between the two methods: how far does the mean of the optimal cases fall from the
    full-sample mean, measured in standard deviations?

    Uses the standard score: (optimal sample mean - full sample mean) / full sample standard deviation.

    Also saves the difference between the two scores and the absolute optimal means for simulated and ML.

    :param simulated: the simulated database in csv
    :param ml: the ML surrogate database in csv
    :return: returns nothing, but saves the csv
    """
    table = pd.DataFrame(columns=['simulated_optimal', 'ml_optimal', 'difference'])
    for param in params:
        sim_mean = simulated[param].mean()
        sim_std = simulated[param].std()
        sim_optimal_mean = simulated[simulated['Tree'] == 1][param].mean()
        ml_mean = ml[param].mean()
        ml_std = ml[param].std()
        ml_optimal_mean = ml[ml['Tree'] == 1][param].mean()
        print(f'{param}: {(sim_optimal_mean - sim_mean) / sim_std:.06f}')
        print(f'{param}: {(ml_optimal_mean - ml_mean) / ml_std:.06f}')
        table.loc[param, 'simulated_optimal'] = (sim_optimal_mean - sim_mean) / sim_std
        table.loc[param, 'ml_optimal'] = (ml_optimal_mean - ml_mean) / ml_std
        table.loc[param, 'difference'] = table.loc[param, 'simulated_optimal'] - table.loc[param, 'ml_optimal']
        table.loc[param, 'abs_sim_optimal'] = sim_optimal_mean
        table.loc[param, 'abs_ml_optimal'] = ml_optimal_mean
    table.to_csv('../pre_processed_data/parameters_comparison.csv', sep=';')
    table.reset_index(inplace=True)
    table['Parameters'] = table['index'].map(groups_cols.abm_params_show)
    to_latex = table[['Parameters', 'abs_sim_optimal', 'abs_ml_optimal']]
    to_latex = to_latex.sort_values(by='Parameters')
    to_latex.set_index('Parameters', inplace=True)
    to_latex.to_latex('../pre_processed_data/parameters_comparison_latex.txt',
                      float_format="{:0.3f}".format)


# Parameters analysis
def normalize_and_optimal(simulated, ml):
    """ Min-max normalizes each parameter to [0, 1], then compares the optimal-case means of the
    normalized parameters between the simulated and the ML surrogate databases. """
    table = pd.DataFrame(columns=['z_simulated_optimal', 'z_ml_optimal'])
    for param in params:
        # min-max normalization of the parameter into [0, 1]
        simulated.loc[:, f'n_{param}'] = (simulated[param] - simulated[param].min()) / \
                                         (simulated[param].max() - simulated[param].min())
        ml.loc[:, f'n_{param}'] = (ml[param] - ml[param].min()) / (ml[param].max() - ml[param].min())
        sim_optimal_mean = simulated[simulated['Tree'] == 1][f'n_{param}'].mean()
        ml_optimal_mean = ml[ml['Tree'] == 1][f'n_{param}'].mean()
        print(f'{param}: {sim_optimal_mean:.06f}')
        print(f'{param}: {ml_optimal_mean:.06f}')
        table.loc[param, 'z_simulated_optimal'] = sim_optimal_mean
        table.loc[param, 'z_ml_optimal'] = ml_optimal_mean
        table.loc[param, 'difference'] = sim_optimal_mean - ml_optimal_mean
        table.loc[param, 'abs_difference'] = abs(sim_optimal_mean - ml_optimal_mean)
    table.to_csv('../pre_processed_data/parameters_norm_optimal.csv', sep=';')
    table.reset_index(inplace=True)
    table['Parameters'] = table['index'].map(groups_cols.abm_params_show)
    to_latex = table[['Parameters', 'z_simulated_optimal', 'z_ml_optimal']]
    to_latex = to_latex.sort_values(by='Parameters')
    to_latex.set_index('Parameters', inplace=True)
    to_latex.to_latex('../pre_processed_data/parameters_norm_optimal_latex.txt',
                      float_format="{:0.3f}".format)


if __name__ == '__main__':
    # c holds the simulated ABM results; th holds the ML ('Tree') surrogate results
    th = pd.read_csv('../output/Tree_gdp_index_75_gini_index_25_1000000_temp_stats.csv', sep=';')
    c = pd.read_csv('../output/current_gdp_index_75_gini_index_25_1000000_temp_stats.csv', sep=';')
    c.rename(columns={'0': 'Tree'}, inplace=True)
    getting_counting(th, 'Tree')
    getting_counting(c, 'Current')
    coefficient_variation_comparison(c, th)
    normalize_and_optimal(c, th)
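To make the expected input concrete, the toy frame below shows the shape `getting_counting` operates on: one row per run, 0/1 dummy columns, and a 0/1 `Tree` column flagging the optimal runs. The frame and its numbers are invented purely for illustration; the real inputs are the CSV files loaded in the `__main__` block above.

```python
import pandas as pd

# Invented toy data: four runs, two policy dummies, 'Tree' == 1 marks optimal runs.
toy = pd.DataFrame({'POLICIES_buy':       [1, 1, 0, 0],
                    'POLICIES_no_policy': [0, 0, 1, 1],
                    'Tree':               [1, 0, 0, 0]})

# The per-dummy tallies getting_counting computes, done by hand for one dummy:
each = 'POLICIES_buy'
sample_size = len(toy[toy[each] == 1]) / len(toy)              # 2/4 = 0.50
optimal = len(toy[(toy[each] == 1) & (toy['Tree'] == 1)])      # 1
non_optimal = len(toy[(toy[each] == 1) & (toy['Tree'] == 0)])  # 1
# optimal / (optimal + non_optimal) = 0.5: half of the 'buy' runs are optimal.
```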
(binary file, 370 KB; filename not rendered)

analysis/groups_cols.py

Lines changed: 177 additions & 0 deletions

abm_dummies = {'policies': ['POLICIES_buy',
                            'POLICIES_rent',
                            'POLICIES_wage',
                            'POLICIES_no_policy'],
               'interest': ['INTEREST_fixed',
                            'INTEREST_real',
                            'INTEREST_nominal'],
               'acps': ['PROCESSING_ACPS_BELO HORIZONTE',
                        'PROCESSING_ACPS_FORTALEZA',
                        'PROCESSING_ACPS_PORTO ALEGRE',
                        'PROCESSING_ACPS_CAMPINAS',
                        'PROCESSING_ACPS_SALVADOR',
                        'PROCESSING_ACPS_RECIFE',
                        'PROCESSING_ACPS_SAO PAULO',
                        'PROCESSING_ACPS_JOINVILLE',
                        'PROCESSING_ACPS_CAMPO GRANDE',
                        'PROCESSING_ACPS_JUNDIAI',
                        'PROCESSING_ACPS_FEIRA DE SANTANA',
                        'PROCESSING_ACPS_IPATINGA',
                        'PROCESSING_ACPS_LONDRINA',
                        'PROCESSING_ACPS_SOROCABA',
                        'PROCESSING_ACPS_JOAO PESSOA',
                        'PROCESSING_ACPS_SAO JOSE DO RIO PRETO',
                        'PROCESSING_ACPS_MACEIO',
                        'PROCESSING_ACPS_SAO JOSE DOS CAMPOS',
                        'PROCESSING_ACPS_ILHEUS - ITABUNA',
                        'PROCESSING_ACPS_SAO LUIS',
                        'PROCESSING_ACPS_UBERLANDIA',
                        'PROCESSING_ACPS_MARINGA',
                        'PROCESSING_ACPS_VITORIA',
                        'PROCESSING_ACPS_CUIABA',
                        'PROCESSING_ACPS_BELEM',
                        'PROCESSING_ACPS_NOVO HAMBURGO - SAO LEOPOLDO',
                        'PROCESSING_ACPS_TERESINA',
                        'PROCESSING_ACPS_MANAUS',
                        'PROCESSING_ACPS_BRASILIA',
                        'PROCESSING_ACPS_ARACAJU',
                        'PROCESSING_ACPS_CAMPINA GRANDE',
                        'PROCESSING_ACPS_CAMPOS DOS GOYTACAZES',
                        'PROCESSING_ACPS_CAXIAS DO SUL',
                        'PROCESSING_ACPS_CRAJUBAR',
                        'PROCESSING_ACPS_CURITIBA',
                        'PROCESSING_ACPS_FLORIANOPOLIS',
                        'PROCESSING_ACPS_GOIANIA',
                        'PROCESSING_ACPS_JUIZ DE FORA',
                        'PROCESSING_ACPS_MACAPA',
                        'PROCESSING_ACPS_NATAL',
                        'PROCESSING_ACPS_PELOTAS - RIO GRANDE',
                        'PROCESSING_ACPS_PETROLINA - JUAZEIRO',
                        'PROCESSING_ACPS_RIBEIRAO PRETO',
                        'PROCESSING_ACPS_RIO DE JANEIRO',
                        'PROCESSING_ACPS_SANTOS',
                        'PROCESSING_ACPS_VOLTA REDONDA - BARRA MANSA'],
               'r_licenses': ['T_LICENSES_PER_REGION_False',
                              'T_LICENSES_PER_REGION_True',
                              'T_LICENSES_PER_REGION_random'],
               'days': ['STARTING_DAY_2000-01-01',
                        'STARTING_DAY_2010-01-01'],
               'r_municipal_fund': ['FPM_DISTRIBUTION_False',
                                    'FPM_DISTRIBUTION_True'],
               'r_metro_fund': ['ALTERNATIVE0_False',
                                'ALTERNATIVE0_True']}

abm_dummies_show = {'POLICIES_buy': 'Policy: buy',
                    'POLICIES_rent': 'Policy: rent',
                    'POLICIES_wage': 'Policy: wage',
                    'POLICIES_no_policy': 'Policy: none',
                    'PROCESSING_ACPS_BELO HORIZONTE': 'Belo Horizonte',
                    'PROCESSING_ACPS_FORTALEZA': 'Fortaleza',
                    'PROCESSING_ACPS_PORTO ALEGRE': 'Porto Alegre',
                    'PROCESSING_ACPS_CAMPINAS': 'Campinas',
                    'PROCESSING_ACPS_SALVADOR': 'Salvador',
                    'PROCESSING_ACPS_RECIFE': 'Recife',
                    'PROCESSING_ACPS_SAO PAULO': 'São Paulo',
                    'PROCESSING_ACPS_JOINVILLE': 'Joinville',
                    'PROCESSING_ACPS_CAMPO GRANDE': 'Campo Grande',
                    'PROCESSING_ACPS_JUNDIAI': 'Jundiai',
                    'PROCESSING_ACPS_FEIRA DE SANTANA': 'Feira de Santana',
                    'PROCESSING_ACPS_IPATINGA': 'Ipatinga',
                    'PROCESSING_ACPS_LONDRINA': 'Londrina',
                    'PROCESSING_ACPS_SOROCABA': 'Sorocaba',
                    'PROCESSING_ACPS_JOAO PESSOA': 'João Pessoa',
                    'PROCESSING_ACPS_SAO JOSE DO RIO PRETO': 'SJRP',
                    'PROCESSING_ACPS_MACEIO': 'Maceio',
                    'PROCESSING_ACPS_SAO JOSE DOS CAMPOS': 'SJC',
                    'PROCESSING_ACPS_ILHEUS - ITABUNA': 'Ilheus-Itabuna',
                    'PROCESSING_ACPS_SAO LUIS': 'Sao Luis',
                    'PROCESSING_ACPS_UBERLANDIA': 'Uberlandia',
                    'PROCESSING_ACPS_MARINGA': 'Maringá',
                    'PROCESSING_ACPS_VITORIA': 'Vitória',
                    'PROCESSING_ACPS_CUIABA': 'Cuiabá',
                    'PROCESSING_ACPS_BELEM': 'Belém',
                    'PROCESSING_ACPS_NOVO HAMBURGO - SAO LEOPOLDO': 'NH-SL',
                    'PROCESSING_ACPS_TERESINA': 'Teresina',
                    'PROCESSING_ACPS_MANAUS': 'Manaus',
                    'PROCESSING_ACPS_BRASILIA': 'Brasília',
                    'T_LICENSES_PER_REGION_False': 'Licenses: False',
                    'T_LICENSES_PER_REGION_True': 'Licenses: True',
                    'T_LICENSES_PER_REGION_random': 'Licenses: Random',
                    'STARTING_DAY_2000-01-01': 'Jan. 2000',
                    'STARTING_DAY_2010-01-01': 'Jan. 2010',
                    'FPM_DISTRIBUTION_False': 'FPM: False',
                    'FPM_DISTRIBUTION_True': 'FPM: True',
                    'ALTERNATIVE0_False': 'Alternative0: False',
                    'ALTERNATIVE0_True': 'Alternative0: True',
                    'INTEREST_fixed': 'Interest: fixed',
                    'INTEREST_real': 'Interest: real',
                    'INTEREST_nominal': 'Interest: nominal',
                    'PROCESSING_ACPS_ARACAJU': 'Aracaju',
                    'PROCESSING_ACPS_CAMPINA GRANDE': 'Campina Grande',
                    'PROCESSING_ACPS_CAMPOS DOS GOYTACAZES': 'Campos',
                    'PROCESSING_ACPS_CAXIAS DO SUL': 'Caxias do Sul',
                    'PROCESSING_ACPS_CRAJUBAR': 'Crato',
                    'PROCESSING_ACPS_CURITIBA': 'Curitiba',
                    'PROCESSING_ACPS_FLORIANOPOLIS': 'Florianópolis',
                    'PROCESSING_ACPS_GOIANIA': 'Goiânia',
                    'PROCESSING_ACPS_JUIZ DE FORA': 'Juiz de Fora',
                    'PROCESSING_ACPS_MACAPA': 'Macapá',
                    'PROCESSING_ACPS_NATAL': 'Natal',
                    'PROCESSING_ACPS_PELOTAS - RIO GRANDE': 'Pelotas',
                    'PROCESSING_ACPS_PETROLINA - JUAZEIRO': 'Petrolina-Juazeiro',
                    'PROCESSING_ACPS_RIBEIRAO PRETO': 'Ribeirão Preto',
                    'PROCESSING_ACPS_RIO DE JANEIRO': 'Rio de Janeiro',
                    'PROCESSING_ACPS_SANTOS': 'Santos',
                    'PROCESSING_ACPS_VOLTA REDONDA - BARRA MANSA': 'Volta Redonda',
                    'all': 'All'}

# Parameters left out of abm_params:
# 'CONSTRUCTION_ACC_CASH_FLOW',
# 'LOT_COST',
# 'TAX_PROPERTY',
abm_params = ['HIRING_SAMPLE_SIZE',
              'LABOR_MARKET',
              'LOAN_PAYMENT_TO_PERMANENT_INCOME',
              'MARKUP',
              'MAX_LOAN_TO_VALUE',
              'MUNICIPAL_EFFICIENCY_MANAGEMENT',
              'NEIGHBORHOOD_EFFECT',
              'OFFER_SIZE_ON_PRICE',
              'PCT_DISTANCE_HIRING',
              'PERCENTAGE_ACTUAL_POP',
              'PERCENTAGE_ENTERING_ESTATE_MARKET',
              'PERCENT_CONSTRUCTION_FIRMS',
              'POLICY_COEFFICIENT',
              'POLICY_DAYS',
              'POLICY_QUANTILE',
              'PRIVATE_TRANSIT_COST',
              'PRODUCTIVITY_EXPONENT',
              'PRODUCTIVITY_MAGNITUDE_DIVISOR',
              'PUBLIC_TRANSIT_COST',
              'SIZE_MARKET',
              'STICKY_PRICES',
              'TAX_ESTATE_TRANSACTION',
              'TOTAL_DAYS']

abm_params_show = {'HIRING_SAMPLE_SIZE': 'Hiring sample size',
                   'LABOR_MARKET': 'Frequency of firms entering the labor market',
                   'LOAN_PAYMENT_TO_PERMANENT_INCOME': 'Loan/permanent income ratio',
                   'MARKUP': 'Markup',
                   'MAX_LOAN_TO_VALUE': 'Maximum Loan-to-Value',
                   'MUNICIPAL_EFFICIENCY_MANAGEMENT': 'Municipal efficiency management',
                   'NEIGHBORHOOD_EFFECT': 'Neighborhood effect',
                   'OFFER_SIZE_ON_PRICE': 'Supply-demand effect on real estate prices',
                   'PCT_DISTANCE_HIRING': '% firms analyzing commuting distance',
                   'PERCENTAGE_ACTUAL_POP': '% of population',
                   'PERCENTAGE_ENTERING_ESTATE_MARKET': '% families entering real estate market',
                   'PERCENT_CONSTRUCTION_FIRMS': '% of construction firms',
                   'POLICY_COEFFICIENT': 'Policy coefficient',
                   'POLICY_DAYS': 'Policy days',
                   'POLICY_QUANTILE': 'Policy Quantile',
                   'PRIVATE_TRANSIT_COST': 'Cost of private transit',
                   'PRODUCTIVITY_EXPONENT': 'Productivity: exponent',
                   'PRODUCTIVITY_MAGNITUDE_DIVISOR': 'Productivity: divisor',
                   'PUBLIC_TRANSIT_COST': 'Cost of public transit',
                   'SIZE_MARKET': 'Perceived market size',
                   'STICKY_PRICES': 'Sticky Prices',
                   'TAX_ESTATE_TRANSACTION': 'Tax over estate transactions',
                   'TOTAL_DAYS': 'Total Days'}
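A property implied by this grouping: within each key of `abm_dummies`, the dummies appear to be a one-hot encoding of a single categorical ABM setting (one policy, one interest regime, one metropolitan region, and so on), so exactly one dummy per group should be active in each run. Assuming a DataFrame `df` that carries these columns, a quick sanity check could look like the hypothetical helper below (not part of the repository):

```python
from groups_cols import abm_dummies


def check_one_hot(df):
    """Hypothetical sanity check: each dummy group should sum to 1 per row."""
    for group, cols in abm_dummies.items():
        assert (df[cols].sum(axis=1) == 1).all(), f'{group} is not one-hot'
```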
