diff --git a/GraphlaxyDataGen.py b/GraphlaxyDataGen.py
index b567c6a..eb7c4b5 100644
--- a/GraphlaxyDataGen.py
+++ b/GraphlaxyDataGen.py
@@ -13,10 +13,16 @@ def __init__(self):
             The available commands are:
                 optimization    Create a baseline dataset and optimize the parameters.
                 generate        Using the fitted parameters generate a synthetic graph dataset.
-                plots           Generate plots showing different characteristics of the baseline, sampled, and final datasets.
+                plots           Generate plots showing different characteristics of the baseline, sampled, and final datasets.
+                statistics      Print some basic statistics of the target dataset
             ''')
         parser.add_argument('command', help='Subcommand to run')
-        commands = {"optimization":self.optimization, "generate":self.generate, "plot": self.plot}
+        commands = {
+            "optimization":self.optimization,
+            "generate":self.generate,
+            "plots": self.plot,
+            "statistics": self.statistics
+        }
         args = parser.parse_args(sys.argv[1:2])
         if not args.command in commands:
             print('Unrecognized command')
@@ -32,8 +38,7 @@ def optimization(self):
             The available subcommands are:
                 baseline    Only creates the baseline dataset
                 metrics     Calculate the metrics of a dataset
-                optimize    Use sampling and the Powell method with cooperative bargaining to optimize the input RMat parameters
-                plot        Some plots to show analyze the results
+                optimize    Use sampling and the Powell method with cooperative bargaining to optimize the input RMat parameters
             *************************************
             To run the full optimization in steps:
 
@@ -45,8 +50,7 @@
         commands = {
             "baseline":self.baseline,
             "metrics":self.metrics,
-            "optimize": self.optimize,
-            "plot": self.plot
+            "optimize": self.optimize
         }
         args = parser.parse_args(sys.argv[2:3])
         if not args.subcommand in commands:
@@ -63,12 +67,12 @@ def generate(self):
         parser.add_argument('-s', "--dataset-size", metavar = "int", type = int,
             help = "The size of the dataset to generate.", default= 5000)
         parser.add_argument('-e', "--edges-between", nargs = 2, metavar = "int", type = int,
-            help = "The min and max vallue the edges argument can take.", default= (1000, 1000000))
+            help = "The min and max value the edges argument can take.", default= (100000, 2000000))
         parser.add_argument('-m', '--multiprocess', action="store_true",
             help = "Add to take advantage of multiple cores.")
         parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float,
             help = "List of waights for the beta distributions.",
-            default= ( 2.490744994387892,2.6031189695165597,0.5401027713447459,0.32109300386782624,0.6878348939570403,0.4389166002041694,0.22515465777238508,0.8146717281526472))
+            default= [1.3500523980958758,0.9756729865636893,1.4562248430720026,0.22767153268062393,1.055699069458428,0.9060404341929743,0.35052426603213255,1.157122011830607])
@@ -84,6 +88,19 @@ def generate(self):
         generate_result_dataset(args.from_file, args.custom_weights, args.parameters_file,
             args.name, args.folder, args.dataset_size, args.edges_between, args.multiprocess)
 
+
+    def statistics(self):
+        parser = argparse.ArgumentParser(description = "Calculate some statistics over a dataset.")
+
+        parser.add_argument('-f', "--folder", metavar = "str", type = str,
+            help = "Folder where the dataset to analyze was generated.", default= "data/validation_dataset")
+        parser.add_argument('-s', "--sample-size", metavar = "int", type = int,
+            help = "The size of the sample.", default= 1000)
+
+        args = parser.parse_args(sys.argv[2:])
+        from processes.statistics import statistics
+        statistics(args.folder, args.sample_size)
+
     def plot(self):
         parser = argparse.ArgumentParser(description = "Some plots to analyze the results.")
@@ -105,13 +122,13 @@ def plot(self):
             choices= choices)
         parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float,
             help = "List of waights for the beta distributions.",
-            default= (2.490744994387892,2.6031189695165597,0.5401027713447459,0.32109300386782624,0.6878348939570403,0.4389166002041694,0.22515465777238508,0.8146717281526472))
+            default= (1,1,1,1,1,1,1,1))
         choices = ["custom", "initial"]
         parser.add_argument('-ws', "--weight-source", metavar = "str", type = str,
             help = "Where to get the waights used for the plot from. Posible values: {}".format(choices), default= "custom",
             choices= choices)
         parser.add_argument('-n', "--name", metavar = "str", type = str,
-            help = "Name of the params to use for the fitness_evolution.", default= None)
+            help = "Name of the params to use for the fitness_evolution.", default= "result")
 
         args = parser.parse_args(sys.argv[2:])
@@ -127,7 +144,7 @@ def baseline(self):
         parser.add_argument('-s', "--dataset-size", metavar = "int", type = int,
             help = "The size of the baseline dataset.", default= 10000)
         parser.add_argument('-e', "--edges-between", nargs = 2, metavar = "int", type = int,
-            help = "The min and max vallue the edges argument can take.", default= (1000, 1000000))
+            help = "The min and max value the edges argument can take.", default= (100000, 2000000))
         parser.add_argument('-m', '--multiprocess', action="store_true",
             help = "Add to take advantage of multiple cores.")
         args = parser.parse_args(sys.argv[3:])
@@ -159,11 +176,14 @@ def optimize(self):
             help = "Folder where the dataset is.", default= "../baseline_dataset")
         parser.add_argument('-g', "--grid-size", metavar = "int", type = int,
             help = "The number of rows and columns the grid has.", default=15)
+        parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float,
+            help = "Initial weights for optimization.",
+            default= [1.3500523980958758,0.9756729865636893,1.4562248430720026,0.22767153268062393,1.055699069458428,0.9060404341929743,0.35052426603213255,1.157122011830607])
         args = parser.parse_args(sys.argv[3:])
 
         from processes.optimization import optimize
-        optimize(args.name, args.folder, args.grid_size)
+        optimize(args.name, args.folder, args.grid_size, args.custom_weights)
 
 
 if __name__ == "__main__":
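The command table above relies on argparse's two-stage, git-style dispatch: the top-level parser consumes only `sys.argv[1:2]` (the command name), and each handler builds its own parser for the remaining arguments, which is why the new `statistics` command can be wired in with a single dict entry and why `optimization` re-dispatches on `sys.argv[2:3]`. A minimal, self-contained sketch of the same pattern (command names here are illustrative, not part of the repo):

```python
import argparse
import sys

class CLI:
    def __init__(self):
        parser = argparse.ArgumentParser(usage="tool <command> [<args>]")
        parser.add_argument("command", help="Subcommand to run")
        # Parse only argv[1:2]: the command name, leaving its flags untouched.
        args = parser.parse_args(sys.argv[1:2])
        commands = {"generate": self.generate, "statistics": self.statistics}
        if args.command not in commands:
            print("Unrecognized command")
            parser.print_help()
            sys.exit(1)
        commands[args.command]()

    def generate(self):
        parser = argparse.ArgumentParser(description="Generate a dataset.")
        parser.add_argument("-s", "--dataset-size", type=int, default=5000)
        args = parser.parse_args(sys.argv[2:])  # the subcommand parses the rest
        print("would generate", args.dataset_size, "graphs")

    def statistics(self):
        parser = argparse.ArgumentParser(description="Print dataset statistics.")
        parser.add_argument("-f", "--folder", default="data/validation_dataset")
        args = parser.parse_args(sys.argv[2:])
        print("would summarize", args.folder)

if __name__ == "__main__":
    CLI()
```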
diff --git a/README.md b/README.md
index ae6bb82..33344f0 100644
--- a/README.md
+++ b/README.md
@@ -189,9 +189,15 @@ This work was done by the Barcelona Neural Network group (BNN) and is part of a
 
 ## Citing
 
-This repository is associated with a Paper. If you are using it for a study, please cite.
+This repository is associated with the paper ["Bias Reduction via Cooperative Bargaining in Synthetic Graph Dataset Generation"](https://arxiv.org/abs/2205.13901). If you are using it for a study, please cite.
 
-The citation text and BibTex will be available once the paper is published.
+```
+@inproceedings{Wassington2022BiasRV,
+  title={Bias Reduction via Cooperative Bargaining in Synthetic Graph Dataset Generation},
+  author={Axel Wassington and S. Abadal},
+  year={2022}
+}
+```
 
 ## License
diff --git a/data/validation_dataset/dataset_metrics.csv b/data/validation_dataset/dataset_metrics.csv
index 9181bc5..2c741fb 100644
--- a/data/validation_dataset/dataset_metrics.csv
+++ b/data/validation_dataset/dataset_metrics.csv
@@ -1,23 +1,14 @@
 name,clustering,density,density_log,max_degree,nodes,edges
 sx-superuser,0.26,4.3660716413995854e-05,-4.359909142376479,14296,189191,781375
 bcsstm27,0.672,0.038311181654455184,-1.4166744526085413,55,1224,28675
-co2010,0.408,2.4107766948891065e-05,-4.617843015532475,120,201062,487287
 cavity05,0.77,0.026261836093476664,-1.5806749135325535,63,1182,18330
-struct4,0.364,0.012799771647861974,-1.8927977782394834,91,4350,121074
-cage10,0.322,0.0012476265437571044,-2.903915394026621,26,11397,81021
 ca-HepPh,0.593,0.0018746094397688842,-2.727089200374292,491,11204,117649
 dblp-2010,0.637,2.7952531408644447e-05,-4.55357885586257,238,226413,716460
 cage11,0.283,0.00039205070982041877,-3.406657755502905,32,39082,299402
-cond-mat-2003,0.64,0.00030684255571491705,-3.5130844086392994,202,27519,116181
-coAuthorsDBLP,0.636,2.186197601957441e-05,-4.660310586411358,336,299067,977676
-cavity16,0.766,0.007106166004766985,-2.1483646513200694,63,4562,73930
 fe_rotor,0.399,0.00013350824470442644,-3.8744919139317995,125,99617,662431
 patents_main,0.044,2.085655015760525e-05,-4.680757525795384,212,230686,554949
 crystm01,0.526,0.013921182266009852,-1.8563238804006732,28,1625,18369
 fpga_dcop_10,0.561,0.00529861886254522,-2.2758373188035748,37,1220,3940
-cit-HepTh,0.31,0.0009379078516105468,-3.0278398284684074,2468,27400,352059
-wa2010,0.374,2.4770115585980415e-05,-4.606071966840228,158,195574,473716
-HEP-th,0.294,0.0009465711478355964,-3.023846737272162,2411,26870,341698
 ca-CondMat,0.647,0.0004003099279382089,-3.397603638984791,281,21363,91342
 email-Enron,0.49,0.0003185011711252004,-3.4968889664298017,1383,33696,180811
 wiki-Vote,0.149,0.004035793145569756,-2.39407110161249,1065,7066,100736
@@ -25,7 +16,6 @@ qc2534,0.988,0.07258473859342394,-1.1391546829004182,184,2534,232947
 psmigr_3,0.48,0.08398978943758713,-1.0757735075148394,2746,3140,413921
 com-Amazon,0.402,1.6513834036534368e-05,-4.782152084362822,549,334863,925872
 nemeth17,0.742,0.007073535770702365,-2.1503634459119887,75,9506,319563
-amazon0302,0.427,2.6194088195261075e-05,-4.581796714553647,420,262111,899792
 LeGresley_4908,0.762,0.0014934671607465277,-2.825804322482934,39,4908,17984
 soc-Epinions1,0.126,0.00014094905573394035,-3.8509378292702987,3044,75877,405739
 Linux_call_graph,0.086,2.3888194076898408e-05,-4.6218166812289105,15979,317926,1207269
@@ -34,23 +24,16 @@ internet,0.099,2.64909704107265e-05,-4.576902132442502,153,124651,205805
 coAuthorsCiteseer,0.693,3.151029750712278e-05,-4.501547496378784,1372,227320,814134
 usroads-48,0.024,2.035482734874879e-05,-4.691332576989012,7,126146,161950
 lhr07c,0.024,0.005765061357494149,-2.239196066152573,99,7337,155150
-delaunay_n18,0.44,2.2887223163681178e-05,-4.640406895741597,21,262144,786396
-or2010,0.455,2.5336823645872357e-05,-4.596247831423197,120,196621,489756
 cit-HepPh,0.29,0.0007112218884929657,-3.14799488602908,846,34401,420828
 Na5,0.393,0.009158925180889522,-2.0381554887117272,205,5832,155731
 p2p-Gnutella31,0.004,7.556716716096343e-05,-4.121666858176848,95,62561,147878
 language,0.556,1.4973538912500522e-05,-4.824675544444244,11611,399130,1192675
-soc-Slashdot0902,0.575,0.00017256407249715723,-3.7630496184724294,2554,82168,582533
 msc01440,0.514,0.023024283839085787,-1.6378138694729287,46,1440,23855
-astro-ph,0.655,0.0010859724564231906,-2.964181189641385,360,14845,119652
 loc-Gowalla,0.239,4.917880929251338e-05,-4.308221990521074,14730,196591,950327
-NotreDame_www,0.275,2.5423259717214715e-05,-4.594768765881246,10717,245529,766311
 TSOPF_RS_b162_c1,0.094,0.01402032838523068,-1.8532418141708273,2505,5374,202415
 598a,0.428,0.00012049809570614693,-3.919019816415646,26,110971,741934
 rajat17,0.728,8.445435249719433e-05,-4.073377963419152,28756,93342,367910
-sparsine,0.326,0.0006396079921598432,-3.1940861182071774,57,50000,799494
 EAT_SR,0.098,0.0011334632895532034,-2.9455925411772093,1092,23218,305498
-nemeth20,0.748,0.010861392339708922,-1.9641144982560987,121,9506,490688
 foldoc,0.325,0.0010256356780519167,-2.9890068800721705,728,13356,91471
 oh2010,0.383,1.3247637646887547e-05,-4.877861559341608,62,365344,884120
 dictionary28,0.236,0.00023035786353932204,-3.637596958000205,38,24831,71014
@@ -58,6 +41,4 @@ soc-Slashdot0811,0.617,0.0001826343750092322,-3.73841747722999,2541,77360,546487
 TSC_OPF_300,0.359,0.008696970975067726,-2.060631979320414,4207,9773,415288
 Wordnet3,0.036,4.215114110861255e-05,-4.375190663732821,543,75606,120472
 ca-AstroPh,0.663,0.0012295245160221869,-2.9102628072530363,504,17903,197031
-piston,0.752,0.024896306055726347,-1.6038650857806964,64,2025,51020
-web-NotreDame,0.236,2.1066408037288526e-05,-4.676409508177843,10721,325729,1117563
-la2010,0.368,2.346105764521534e-05,-4.629652413400064,581,204447,490317
\ No newline at end of file
+web-NotreDame,0.236,2.1066408037288526e-05,-4.676409508177843,10721,325729,1117563
\ No newline at end of file
diff --git a/img/result/partial_validation.svg b/img/result/partial_validation.svg
index 9e49000..c6bd13e 100644
--- a/img/result/partial_validation.svg
+++ b/img/result/partial_validation.svg
[Re-rendered matplotlib SVG omitted: the partial_validation scatter plot was regenerated (timestamp 2022-04-11 → 2022-05-23) against the pruned validation set; the diff is auto-generated path/marker-ID churn with no hand-edited content.]
diff --git a/processes/bargin.py b/processes/bargin.py
index 9d8d432..4763032 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -4,7 +4,7 @@
 from utils.probability import beta_cdf_interval, beta_cdf_mean, beta_cdf_mean_2d
 
 def get_grid(m=10,
-    limits = [(0,1),(-5.5,0)]):
+    limits = [(0,1),(-6,-1)]):
 
     block0 = np.linspace(limits[0][0], limits[0][1], m + 1)
     block1 = np.linspace(limits[1][0], limits[1][1], m + 1)
@@ -22,10 +22,30 @@ def gen_metric_grid(df, metrics, m):
     df["metric_bucket_2"] = pd.cut(df[metrics[1]], blocks[1], labels=list(range(m)), include_lowest =True)
 
 def interval_b(a):
-    return ((1-a)/3, min(a, 1- a))
+    return (max(0,1-3*a), min(a, 1-a))
 
 def interval_c(a,b):
-    return ((1-a-b)/2, min(b, 1-a-b))
+    return (max(0,1-2*a-b), min(a, 1-a-b))
+
+
+def interval_c_mean(a, b):
+    a_mean = (a.right + a.left) /2
+    b_mean = (b.right + b.left) /2
+    return interval_c(a_mean, b_mean)
+
+def interval_c_leftleft(a,b):
+    return interval_c(a.left, b.left)
+
+def interval_c_leftright(a,b):
+    return interval_c(a.left, b.right)
+
+
+def interval_c_rightleft(a,b):
+    return interval_c(a.right, b.left)
+
+
+def interval_c_rightright(a,b):
+    return interval_c(a.right, b.right)
 
 def interval_b_mean(a):
     a_maen = (a.right + a.left) /2
@@ -40,21 +60,24 @@ def interval_b_right(a):
 
 
 def gen_param_grid(df):
-    presition = 0.01
-    intervals = np.arange(0,1.001,presition)
-    df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 2))) / (df["E"] + 1)
+    precision = 0.05
+    intervals = np.arange(0,1.001,precision)
+    df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / df["E"]
 
     df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True)
     df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True)
     df["c_bucket"] = pd.cut(df["c"], intervals, include_lowest =True)
     df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True)
-    df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'NE_bucket'])[['a_bucket']].transform('count')
+    df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'c_bucket', 'NE_bucket'])[['a_bucket']].transform('count')
 
 
 def gen_weights(df, res):
-    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = res
-    weights = df.apply(lambda row: beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(0.25, 1)) *
+    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = res
+    weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(1/4, 1)) *
         beta_cdf_mean(row['b_bucket'],alfa_b, beta_b, interval_b_left(row['a_bucket']),
             interval_b_mean(row['a_bucket']), interval_b_right(row['a_bucket'])) *
-        beta_cdf_interval(row['NE_bucket'],alfa_N, beta_N, (0, 1)) / row["param_bucket_count"],
+        beta_cdf_mean_2d(row['c_bucket'],alfa_c, beta_c, interval_c_mean(row['a_bucket'], row['b_bucket']),
+            interval_c_leftleft(row['a_bucket'], row['b_bucket']), interval_c_leftright(row['a_bucket'], row['b_bucket']),
+            interval_c_rightleft(row['a_bucket'], row['b_bucket']), interval_c_rightright(row['a_bucket'], row['b_bucket'])) *
+        beta_cdf_interval(row['NE_bucket'],alfa_N, beta_N, (0, 1))) / row["param_bucket_count"],
         axis=1)
 
     weights[weights < 0] = 0
@@ -69,6 +92,7 @@ def _grid_bargin(params):
         gen_weights(df, params)
 
         total = df["weight"].sum()
+        print(total)  # progress output: total sampled weight for this parameter set
         buckets = df[(df["metric_bucket_1"] != np.NaN) & (df["metric_bucket_2"] != np.NaN)].groupby(["metric_bucket_1", "metric_bucket_2"])
         bucket_prob = buckets["weight"].sum() / total
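The rewritten bounds encode the R-MAT simplex constraints directly: with a + b + c + d = 1, the upper limits keep b and c below both a and the remaining probability mass, while the new lower limits max(0, 1-3a) and max(0, 1-2a-b) are exactly what is needed so the leftover parameters can also stay below a. The old bounds (1-a)/3 and (1-a-b)/2 with min(b, ...) enforced the stricter ordering a >= b >= c >= d; the new ones only require a to dominate. A quick property check of the new intervals, using the same formulas as the patch:

```python
import random

def interval_b(a):
    return (max(0, 1 - 3*a), min(a, 1 - a))

def interval_c(a, b):
    return (max(0, 1 - 2*a - b), min(a, 1 - a - b))

for _ in range(100_000):
    a = random.uniform(0.25, 1)            # a >= 1/4 is necessary for a to be the max
    b = random.uniform(*interval_b(a))
    c = random.uniform(*interval_c(a, b))
    d = 1 - a - b - c
    assert abs(a + b + c + d - 1) < 1e-9   # parameters stay on the simplex
    assert min(b, c, d) >= -1e-9           # all parameters non-negative
    assert a >= max(b, c, d) - 1e-9        # a remains the dominant parameter
print("constraints hold")
```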
diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index 4dd7940..25d4108 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -13,21 +13,24 @@ def generate_baseline(
 
     parameters = []
     for i in range(0,dataset_size):
-        E = random.randint(edges_between[0],edges_between[1])
-        n_0 = np.floor(np.sqrt(E * 2))
-        N_r = (n_0, E + 1)
-        N = int(np.floor(random.uniform(N_r[0],N_r[1])))
+        E = random.randint(edges_between[0],edges_between[1])
+        n_0 = np.floor(np.sqrt(E * 20))
+        N = int(np.floor(random.uniform(n_0, E)))
 
         a = random.uniform(0.25, 1)
-        b = random.uniform((1-a)/3, min(a, 1-a))
-        c = random.uniform((1-a-b)/2, min(b, 1-a-b))
+        b = random.uniform(max(0,1-3*a), min(a, 1-a))
+        c = random.uniform(max(0,1-2*a-b), min(a, 1-a-b))
         d = 1-a-b-c
 
-        parameters.append({
+        params = {
             "i": i, "N": N, "E": E,
             "a": a, "b": b, "c": c, "d": d
-        })
+        }
+
+        print("Queue params: ", params)
+
+        parameters.append(params)
 
     if multiprocess:
         from pebble import ProcessPool
@@ -37,7 +40,7 @@
         for param in parameters:
             future = pool.schedule(rmat_to_file,
                 args=(param['N'],param['E'],param['a'],param['b'],param['c'],param['d'],dataset_folder, param['i']),
-                timeout=300)
+                timeout=600)
             future.add_done_callback(pebble_timeout_callback)
     else:
         for param in parameters:
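The new node range appears to be what ties the generator to the optimization grid above: with N between floor(sqrt(20·E)) and E, the densest graphs have density roughly 2E/N² ≈ 0.1 (density_log ≈ -1), and the sparsest, at the new maximum of E = 2,000,000 edges, roughly 2/E = 10⁻⁶ (density_log ≈ -6), which lines up with the new grid limits (0,1) × (-6,-1) in bargin.py. A quick arithmetic check (plain numpy, no project code):

```python
import numpy as np

for E in (100_000, 2_000_000):            # new --edges-between defaults
    n_min = np.floor(np.sqrt(E * 20))     # smallest N: densest graphs
    n_max = E                             # largest N: sparsest graphs
    dens_hi = 2 * E / (n_min * (n_min - 1))
    dens_lo = 2 * E / (n_max * (n_max - 1))
    print(E, round(np.log10(dens_hi), 2), round(np.log10(dens_lo), 2))
# density_log stays within roughly (-6, -1) across the generated dataset
```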
diff --git a/processes/metrics.py b/processes/metrics.py
index b3b0ca0..db770f0 100644
--- a/processes/metrics.py
+++ b/processes/metrics.py
@@ -16,9 +16,10 @@ def _metrics(dataset_folder, row, trials):
     c = row['c']
     d = 1 - a - b - c
 
-    G = read_graph(row['name'])
-    Gcc = max(nx.connected_components(G), key=len)
-    G = G.subgraph(Gcc)
+    G = read_graph(Path(dataset_folder,"graphs", row['name']))
+    # Largest-component extraction now happens at generation time (see utils/rmat.py):
+    #Gcc = max(nx.connected_components(G), key=len)
+    #G = G.subgraph(Gcc)
 
     density = nx.density(G)
     clustering = nx.algorithms.approximation.clustering_coefficient.average_clustering(G,trials)
diff --git a/processes/optimization.py b/processes/optimization.py
index 5ef24c7..78a95e5 100644
--- a/processes/optimization.py
+++ b/processes/optimization.py
@@ -24,11 +24,13 @@ def store_params(dataset_folder, name, params, i = None):
 def optimize(
         name = 'result',
         dataset_folder = "../baseline_dataset",
-        grid_size = 10):
+        grid_size = 10,
+        custom_weights = [1] * 8):
 
     df_m = pd.read_csv(Path(dataset_folder, "dataset_metrics.csv"))
     df_d = pd.read_csv(Path(dataset_folder, "dataset_description.csv"))
     df = pd.merge(df_m, df_d, on="name")
+    df = df[df["density_log"] < -1]  # keep graphs inside the metric grid (upper density_log limit is -1)
 
     m = grid_size
 
@@ -42,10 +44,9 @@ def callback(x):
         store_params(dataset_folder, name, x, i)
         i += 1
 
-    initial_parameters = [1] * 8
-    store_params(dataset_folder, name, initial_parameters, 0)
-    res = minimize(grid_bargin(df, M), [1] * 8, bounds=[(1e-32,15)] * 8,
-        callback = callback)
+    store_params(dataset_folder, name, custom_weights, 0)
+    res = minimize(grid_bargin(df, M), custom_weights, bounds=[(1e-32,100)] * 8,
+        tol = 1e-3, callback = callback)
 
     print(res)
     store_params(dataset_folder, name, res["x"])
\ No newline at end of file
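optimize() now warm-starts from user-supplied weights instead of the flat [1] * 8 vector, with wider bounds and a looser tolerance, so a previous fit can be refined rather than recomputed from scratch. A minimal sketch of that warm-start pattern with scipy.optimize.minimize; the quadratic objective here is a stand-in for grid_bargin(df, M), not the real bargaining objective:

```python
import numpy as np
from scipy.optimize import minimize

def objective(x):
    # Stand-in objective: any smooth function of the 8 weights works for the demo.
    return np.sum((x - 2.0) ** 2)

warm_start = [1.35, 0.98, 1.46, 0.23, 1.06, 0.91, 0.35, 1.16]  # e.g. a previous fit
history = []
res = minimize(objective, warm_start, bounds=[(1e-32, 100)] * 8,
               tol=1e-3, callback=lambda x: history.append(x.copy()))
print(res.x, len(history), "iterations recorded")
```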
diff --git a/processes/plot.py b/processes/plot.py
index cc3a80c..870910e 100644
--- a/processes/plot.py
+++ b/processes/plot.py
@@ -14,35 +14,35 @@ def annotate_df(row, ax):
     ax.annotate(row["name"],
         row[["density_log","clustering"]],
-        xytext=(random.uniform(-10,10), random.uniform(-10,10)),
+        xytext=(3, -2),
         textcoords='offset points',
-        size=8,
+        size=12,
         color='darkslategrey')
 
 def plot_paramdensity(res, s):
-    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, _, _ = res
+    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = res
     param_list = []
     for _ in range(s):
-        a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        b = beta_rvs_shifted(alfa_b, beta_b, (1-a)/3, min(a, 1-a))
-        c = beta_rvs_shifted(alfa_c, beta_c, (1-a-b)/2, min(b, 1-a-b))
+        a = beta_rvs_shifted(alfa_a, beta_a, 1/4, 1)  # lower bound 1/4, matching result_dataset.py
+        b = beta_rvs_shifted(alfa_b, beta_b, max(0,1-3*a), min(a, 1-a))
+        c = beta_rvs_shifted(alfa_c, beta_c, max(0,1-2*a-b), min(a, 1-a-b))
         d = 1-a-b-c
         params = {'a': a, 'b': b, 'c': c, 'd': d}
         param_list.append(params)
     df = pd.DataFrame(param_list)
 
     plt.figure()
-    plt.hist(df, bins=20, label=["a","b","c","d"], stacked=False, density=True)
-    plt.xlabel("parameter value")
-    plt.ylabel("probability density")
+    plt.hist(df, bins=20, label=["a","b", "c","d"], stacked=False, density=True)
+    plt.xlabel("value")
+    plt.ylabel("density")
     plt.legend()
     plt.xlim(-0,1)
-    plt.ylim(0,15)
+    plt.ylim(0,20)
 
 def plot_clustering_density(df):
     plt.figure()
     plt.hist(df["clustering"], bins=20, density=True)
-    index = np.arange(0,1, 0.1)
+    index = (0,1)
     plt.plot(index, uniform.pdf(index), label='Uniform')
     plt.ylim(0,10)
     plt.legend()
@@ -53,21 +53,23 @@ def plot_clustering_density(df):
 def plot_dlog_density(df):
     plt.figure()
     plt.hist(df["density_log"], bins=20, density=True)
-    index = np.arange(-5.5,0, 0.01)
+    index = (-5.5,0)
     plt.plot(index, uniform.pdf(index, loc=-5.5, scale =5.5), label='Uniform')
     plt.ylim(0,0.5)
     plt.legend()
-    plt.xlabel("density_log")
+    plt.xlabel("Dlog")
     plt.ylabel("denisty")
 
 def plot_sample_paramdist(res):
-    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = res
+    alfa_a, beta_a, alfa_b, beta_b,alfa_c, beta_c, alfa_N, beta_N = res
     plt.figure()
     index = np.arange(0,1, 0.01)
     plt.plot(index, beta.pdf(index,alfa_a, beta_a), label='a')
     plt.plot(index, beta.pdf(index,alfa_b, beta_b), label='b')
     plt.plot(index, beta.pdf(index,alfa_c, beta_c), label='c')
     plt.plot(index, beta.pdf(index,alfa_N, beta_N), label='N')
+    plt.xlabel("value (before shifting and scaling)")
+    plt.ylabel("density")
     plt.legend()
@@ -88,7 +90,7 @@ def plot_sample_params(df):
 def plot_param_clustering(df):
     df.plot.scatter("NE","diff", c="clustering", colormap='magma')
     plt.xlabel("N / E")
-    plt.ylabel("d - a")
+    plt.ylabel("a - d")
 
 def plot_fitness_evolution(df, M, params, name):
     param_serie = params[params["name"].str.startswith("{}_".format(name))].copy()
@@ -118,7 +120,8 @@ def plot_param_dlog(df):
 def plot_validation(df, df_val):
     ax = df.plot.scatter("density_log","clustering", c="gray")
     df_val.plot.scatter("density_log","clustering", ax = ax)
-    plt.xlim(-5.5,0.01)
+    plt.xlabel("Dlog")
+    plt.xlim(-6,0.01)
     plt.ylim(-0.01,1.01)
 
     df_val.apply(lambda row: annotate_df(row,ax), axis=1)
@@ -130,7 +133,7 @@ def figure_print(show, folder, name, format):
 def plot(
         dataset_folder = "../baseline_dataset",
         validation_metrics = "../validation_dataset/dataset_metrics.csv",
-        samples = 1000,
+        samples = 0,
         show = True,
         format = 'svg',
         output_folder = "../plots/initial",
@@ -140,19 +143,21 @@ def plot(
         name = "r10"
     ):
 
-
+    plt.rcParams.update({'font.size': 22})
+
     if weight_source == "custom":
         weights = custom_weights
     elif weight_source == "initial":
         weights = [1] * 8
 
     print("Will plot:", plot_selection)
-    if set(["sample_grid", "sample_param", "validation", "dlog_density", "clustering_density", "fitness_evolution"]) & set(plot_selection):
+    if set(["sample_grid", "sample_param", "validation", "dlog_density", "clustering_density",
+            "fitness_evolution", "param_clustering", "param_dlog"]) & set(plot_selection):
         print("Loading Dataset...")
         df_m = pd.read_csv(Path(dataset_folder, "dataset_metrics.csv"))
         df_d = pd.read_csv(Path(dataset_folder, "dataset_description.csv"))
         df_b = pd.merge(df_m, df_d, on="name")
-        df_b = df_b.head(samples)
+        df_b = df_b.sample(samples) if samples > 0 else df_b
         df_b["NE"] = df_b["N"] / df_b["E"]
         df_b["diff"] = df_b["a"] - df_b["d"]
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index b703c76..c4b93d1 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -36,18 +36,23 @@ def generate_result_dataset(
 
     for i in range(0,dataset_size):
         E = random.randint(edges_between[0], edges_between[1])
-        n_0 = np.floor(np.sqrt(E * 2))
-        N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1)
+        n_0 = np.floor(np.sqrt(E * 20))
+        N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E)
 
-        a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        b = beta_rvs_shifted(alfa_b, beta_b, (1-a)/3, min(a, 1-a))
-        c = beta_rvs_shifted(alfa_c, beta_c, (1-a-b)/2, min(b, 1-a-b))
-        d = 1-a-b-c
+        a = beta_rvs_shifted(alfa_a, beta_a, 1/4, 1)
+        b = beta_rvs_shifted(alfa_b, beta_b, max(0,1-3*a), min(a, 1-a))
+        c = beta_rvs_shifted(alfa_c, beta_c, max(0,1-2*a-b), min(a, 1-a-b))
+        d = 1 - a - b - c
 
-        parameters.append({
+
+        params = {
             "i": i, "N": N, "E": E,
             "a": a, "b": b, "c": c, "d": d
-        })
+        }
+
+        print("Queue params: ", params)
+
+        parameters.append(params)
 
     if multiprocess:
         from pebble import ProcessPool
diff --git a/processes/statistics.py b/processes/statistics.py
new file mode 100644
index 0000000..0f70d23
--- /dev/null
+++ b/processes/statistics.py
@@ -0,0 +1,19 @@
+import pandas as pd
+from pathlib import Path
+
+def statistics(
+        dataset_folder = "../baseline_dataset",
+        samples = 1000
+    ):
+
+    print("Loading Dataset...")
+    df = pd.read_csv(Path(dataset_folder, "dataset_metrics.csv")).head(samples)
+
+    print("correlation: ", df["density_log"].corr(df["clustering"]))
+    print("covariance: ", df["density_log"].cov(df["clustering"]))
+    print("density_log min: ", df["density_log"].min())
+    print("density_log mean: ", df["density_log"].mean())
+    print("density_log max: ", df["density_log"].max())
+    print("clustering min: ", df["clustering"].min())
+    print("clustering mean: ", df["clustering"].mean())
+    print("clustering max: ", df["clustering"].max())
\ No newline at end of file
diff --git a/utils/rmat.py b/utils/rmat.py
index a73c13c..dca1e9a 100644
--- a/utils/rmat.py
+++ b/utils/rmat.py
@@ -8,15 +8,21 @@ lock = mp.Lock()
 
 def rmat_to_file(N, E, a, b, c, d, dataset_folder, s):
 
-    scale = np.log2(N)
+    scale = np.ceil(np.log2(N))
     factor = E/N
-    Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True).generate()
-    Graph = nk.graphtools.toUnweighted(Graph)
-    out_filename = Path(dataset_folder,'graphs','RMAT_{}.txt'.format(s))
-    print("Wrinting to:" + str(out_filename))
-    nk.writeGraph(Graph, str(out_filename), nk.Format.EdgeListTabOne)
-    with lock:
-        add_to_csv(Path(dataset_folder,"dataset_description.csv"), {
-            'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': out_filename, 'scale': scale, 'factor': factor
-        })
+    reduce = np.power(2, scale) - N
+
+    Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True, reduceNodes = reduce).generate()
+    Graph = nk.graph.Graph(Graph, False, False)  # to undirected and unweighted
+    Graph.removeSelfLoops()
+    Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True)
+    if Graph.numberOfEdges() > 100:
+        name = 'RMAT_{}.txt'.format(s)
+        out_filename = Path(dataset_folder,'graphs',name)
+        print("Writing to: " + str(out_filename))
+        nk.writeGraph(Graph, str(out_filename), nk.Format.EdgeListTabOne)
+        with lock:
+            add_to_csv(Path(dataset_folder,"dataset_description.csv"), {
+                'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': name, 'scale': scale, 'factor': factor, 'reduce': reduce
+            })
     return s
\ No newline at end of file