From a8cff3bf390c29e1b60960c4b04a2b83cc9eb84d Mon Sep 17 00:00:00 2001
From: axelwass
Date: Tue, 10 May 2022 20:02:03 +0200
Subject: [PATCH 01/42] Added remove nodes parameter to RMAT generator

---
 GraphlaxyDataGen.py                         | 29 ++++++++++++++++-----
 data/validation_dataset/dataset_metrics.csv | 21 +--------------
 processes/bargin.py                         | 13 +++++----
 processes/baseline_dataset.py               | 12 ++++-----
 processes/metrics.py                        |  4 +--
 processes/plot.py                           | 21 ++++++++-------
 processes/result_dataset.py                 |  8 +++---
 processes/statistics.py                     | 19 ++++++++++++++
 utils/rmat.py                               | 10 ++++---
 9 files changed, 80 insertions(+), 57 deletions(-)
 create mode 100644 processes/statistics.py

diff --git a/GraphlaxyDataGen.py b/GraphlaxyDataGen.py
index b567c6a..7108385 100644
--- a/GraphlaxyDataGen.py
+++ b/GraphlaxyDataGen.py
@@ -13,10 +13,16 @@ def __init__(self):
         The available commands are:
            optimization   Create a baseline dataset and optimize the parameters.
            generate       Using the fitted parameters generate a synthetic graph dataset.
-           plots          Generate plots showing different characteristics of the baseline, sampled, and final datasets.
+           plots          Generate plots showing different characteristics of the baseline, sampled, and final datasets.
+           statistics     Print some basic statistics of the target dataset
         ''')
         parser.add_argument('command', help='Subcommand to run')
-        commands = {"optimization":self.optimization, "generate":self.generate, "plot": self.plot}
+        commands = {
+            "optimization":self.optimization,
+            "generate":self.generate,
+            "plots": self.plot,
+            "statistics": self.statistics
+            }
         args = parser.parse_args(sys.argv[1:2])
         if not args.command in commands:
             print('Unrecognized command')
@@ -32,8 +38,7 @@ def optimization(self):
         The available subcommands are:
            baseline   Only creates the baseline dataset
            metrics    Calculate the metrics of a dataset
-           optimize   Use sampling and the Powell method with cooperative bargaining to optimize the input RMat parameters
-           plot       Some plots to show analyze the results
+           optimize   Use sampling and the Powell method with cooperative bargaining to optimize the input RMat parameters

         *************************************
         To run the full optimization in steps:
@@ -45,8 +50,7 @@ def optimization(self):
         commands = {
             "baseline":self.baseline,
             "metrics":self.metrics,
-            "optimize": self.optimize,
-            "plot": self.plot
+            "optimize": self.optimize
             }
         args = parser.parse_args(sys.argv[2:3])
         if not args.subcommand in commands:
@@ -84,6 +88,19 @@ def generate(self):
         generate_result_dataset(args.from_file, args.custom_weights, args.parameters_file,
             args.name, args.folder, args.dataset_size, args.edges_between, args.multiprocess)

+
+    def statistics(self):
+        parser = argparse.ArgumentParser(description = "Calculate some statistics over a dataset.")
+
+        parser.add_argument('-f', "--folder", metavar = "str", type = str,
+            help = "Folder where the dataset to analyze was generated.", default= "data/validation_dataset")
+        parser.add_argument('-s', "--sample-size", metavar = "int", type = int,
+            help = "The size of the sample.", default= 1000)
+
+        args = parser.parse_args(sys.argv[2:])
+        from processes.statistics import statistics
+        statistics(args.folder, args.sample_size)
+
     def plot(self):
         parser = argparse.ArgumentParser(description = "Some plots to analyze the results.")

diff --git a/data/validation_dataset/dataset_metrics.csv b/data/validation_dataset/dataset_metrics.csv
index 9181bc5..2c741fb 100644
--- a/data/validation_dataset/dataset_metrics.csv
+++ b/data/validation_dataset/dataset_metrics.csv
@@ -1,23 +1,14 @@
 name,clustering,density,density_log,max_degree,nodes,edges
 sx-superuser,0.26,4.3660716413995854e-05,-4.359909142376479,14296,189191,781375
 bcsstm27,0.672,0.038311181654455184,-1.4166744526085413,55,1224,28675
-co2010,0.408,2.4107766948891065e-05,-4.617843015532475,120,201062,487287
 cavity05,0.77,0.026261836093476664,-1.5806749135325535,63,1182,18330
-struct4,0.364,0.012799771647861974,-1.8927977782394834,91,4350,121074
-cage10,0.322,0.0012476265437571044,-2.903915394026621,26,11397,81021
 ca-HepPh,0.593,0.0018746094397688842,-2.727089200374292,491,11204,117649
 dblp-2010,0.637,2.7952531408644447e-05,-4.55357885586257,238,226413,716460
 cage11,0.283,0.00039205070982041877,-3.406657755502905,32,39082,299402
-cond-mat-2003,0.64,0.00030684255571491705,-3.5130844086392994,202,27519,116181
-coAuthorsDBLP,0.636,2.186197601957441e-05,-4.660310586411358,336,299067,977676
-cavity16,0.766,0.007106166004766985,-2.1483646513200694,63,4562,73930
 fe_rotor,0.399,0.00013350824470442644,-3.8744919139317995,125,99617,662431
 patents_main,0.044,2.085655015760525e-05,-4.680757525795384,212,230686,554949
 crystm01,0.526,0.013921182266009852,-1.8563238804006732,28,1625,18369
 fpga_dcop_10,0.561,0.00529861886254522,-2.2758373188035748,37,1220,3940
-cit-HepTh,0.31,0.0009379078516105468,-3.0278398284684074,2468,27400,352059
-wa2010,0.374,2.4770115585980415e-05,-4.606071966840228,158,195574,473716
-HEP-th,0.294,0.0009465711478355964,-3.023846737272162,2411,26870,341698
 ca-CondMat,0.647,0.0004003099279382089,-3.397603638984791,281,21363,91342
 email-Enron,0.49,0.0003185011711252004,-3.4968889664298017,1383,33696,180811
 wiki-Vote,0.149,0.004035793145569756,-2.39407110161249,1065,7066,100736
@@ -25,7 +16,6 @@ qc2534,0.988,0.07258473859342394,-1.1391546829004182,184,2534,232947
 psmigr_3,0.48,0.08398978943758713,-1.0757735075148394,2746,3140,413921
 com-Amazon,0.402,1.6513834036534368e-05,-4.782152084362822,549,334863,925872
 nemeth17,0.742,0.007073535770702365,-2.1503634459119887,75,9506,319563
-amazon0302,0.427,2.6194088195261075e-05,-4.581796714553647,420,262111,899792
 LeGresley_4908,0.762,0.0014934671607465277,-2.825804322482934,39,4908,17984
 soc-Epinions1,0.126,0.00014094905573394035,-3.8509378292702987,3044,75877,405739
 Linux_call_graph,0.086,2.3888194076898408e-05,-4.6218166812289105,15979,317926,1207269
@@ -34,23 +24,16 @@ internet,0.099,2.64909704107265e-05,-4.576902132442502,153,124651,205805
 coAuthorsCiteseer,0.693,3.151029750712278e-05,-4.501547496378784,1372,227320,814134
 usroads-48,0.024,2.035482734874879e-05,-4.691332576989012,7,126146,161950
 lhr07c,0.024,0.005765061357494149,-2.239196066152573,99,7337,155150
-delaunay_n18,0.44,2.2887223163681178e-05,-4.640406895741597,21,262144,786396
-or2010,0.455,2.5336823645872357e-05,-4.596247831423197,120,196621,489756
 cit-HepPh,0.29,0.0007112218884929657,-3.14799488602908,846,34401,420828
 Na5,0.393,0.009158925180889522,-2.0381554887117272,205,5832,155731
 p2p-Gnutella31,0.004,7.556716716096343e-05,-4.121666858176848,95,62561,147878
 language,0.556,1.4973538912500522e-05,-4.824675544444244,11611,399130,1192675
-soc-Slashdot0902,0.575,0.00017256407249715723,-3.7630496184724294,2554,82168,582533
 msc01440,0.514,0.023024283839085787,-1.6378138694729287,46,1440,23855
-astro-ph,0.655,0.0010859724564231906,-2.964181189641385,360,14845,119652
 loc-Gowalla,0.239,4.917880929251338e-05,-4.308221990521074,14730,196591,950327
-NotreDame_www,0.275,2.5423259717214715e-05,-4.594768765881246,10717,245529,766311
 TSOPF_RS_b162_c1,0.094,0.01402032838523068,-1.8532418141708273,2505,5374,202415
 598a,0.428,0.00012049809570614693,-3.919019816415646,26,110971,741934
 rajat17,0.728,8.445435249719433e-05,-4.073377963419152,28756,93342,367910
-sparsine,0.326,0.0006396079921598432,-3.1940861182071774,57,50000,799494
 EAT_SR,0.098,0.0011334632895532034,-2.9455925411772093,1092,23218,305498
-nemeth20,0.748,0.010861392339708922,-1.9641144982560987,121,9506,490688
 foldoc,0.325,0.0010256356780519167,-2.9890068800721705,728,13356,91471
 oh2010,0.383,1.3247637646887547e-05,-4.877861559341608,62,365344,884120
 dictionary28,0.236,0.00023035786353932204,-3.637596958000205,38,24831,71014
@@ -58,6 +41,4 @@ soc-Slashdot0811,0.617,0.0001826343750092322,-3.73841747722999,2541,77360,546487
 TSC_OPF_300,0.359,0.008696970975067726,-2.060631979320414,4207,9773,415288
 Wordnet3,0.036,4.215114110861255e-05,-4.375190663732821,543,75606,120472
 ca-AstroPh,0.663,0.0012295245160221869,-2.9102628072530363,504,17903,197031
-piston,0.752,0.024896306055726347,-1.6038650857806964,64,2025,51020
-web-NotreDame,0.236,2.1066408037288526e-05,-4.676409508177843,10721,325729,1117563
-la2010,0.368,2.346105764521534e-05,-4.629652413400064,581,204447,490317
\ No newline at end of file
+web-NotreDame,0.236,2.1066408037288526e-05,-4.676409508177843,10721,325729,1117563
\ No newline at end of file
diff --git a/processes/bargin.py b/processes/bargin.py
index 9d8d432..6179b16 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -44,17 +44,16 @@ def gen_param_grid(df):
     intervals = np.arange(0,1.001,presition)
     df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 2))) / (df["E"] + 1)
     df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True)
-    df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True)
-    df["c_bucket"] = pd.cut(df["c"], intervals, include_lowest =True)
+    df["d_bucket"] = pd.cut(df["d"], intervals, include_lowest =True)
     df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True)
-    df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'NE_bucket'])[['a_bucket']].transform('count')
+    df["param_bucket_count"] = df.groupby(['a_bucket', 'd_bucket', 'NE_bucket'])[['a_bucket']].transform('count')


 def gen_weights(df, res):
-    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = res
-    weights = df.apply(lambda row: beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(0.25, 1)) *
-        beta_cdf_mean(row['b_bucket'],alfa_b, beta_b, interval_b_left(row['a_bucket']), interval_b_mean(row['a_bucket']), interval_b_right(row['a_bucket'])) *
-        beta_cdf_interval(row['NE_bucket'],alfa_N, beta_N, (0, 1)) / row["param_bucket_count"],
+    alfa_a, beta_a, alfa_d, beta_d, alfa_N, beta_N = res
+    weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(0.25, 1)) *
+        beta_cdf_mean(row['d_bucket'],alfa_d, beta_d, interval_b_left(row['a_bucket']), interval_b_mean(row['a_bucket']), interval_b_right(row['a_bucket'])) *
+        beta_cdf_interval(row['NE_bucket'],alfa_N, beta_N, (0, 1))) / row["param_bucket_count"],
         axis=1)

     weights[weights < 0] = 0
diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index 4dd7940..2e92cbd 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -16,17 +16,17 @@ def generate_baseline(

         E = random.randint(edges_between[0],edges_between[1])
         n_0 = np.floor(np.sqrt(E * 2))
-        N_r = (n_0, E + 1)
-        N = int(np.floor(random.uniform(N_r[0],N_r[1])))
+        N = int(np.floor(random.uniform(n_0, E+1)))

         a = random.uniform(0.25, 1)
-        b = random.uniform((1-a)/3, min(a, 1-a))
-        c = random.uniform((1-a-b)/2, min(b, 1-a-b))
-        d = 1-a-b-c
+        d = random.uniform((1-a)/3,min(a,1-a))
+        bc = 1 - a - d
+
+        b = c = bc/2

         parameters.append({
             "i": i, "N": N, "E": E,
-            "a": a, "b": b, "c": c, "d": d
+            "a": a, "b": b, "c": c, "d": d, "bc": bc
         })

     if multiprocess:
diff --git a/processes/metrics.py b/processes/metrics.py
index b3b0ca0..a815460 100644
--- a/processes/metrics.py
+++ b/processes/metrics.py
@@ -17,8 +17,8 @@ def _metrics(dataset_folder, row, trials):
     d = 1 - a - b - c

     G = read_graph(row['name'])
-    Gcc = max(nx.connected_components(G), key=len)
-    G = G.subgraph(Gcc)
+    #Gcc = max(nx.connected_components(G), key=len)
+    #G = G.subgraph(Gcc)

     density = nx.density(G)
     clustering = nx.algorithms.approximation.clustering_coefficient.average_clustering(G,trials)
diff --git a/processes/plot.py b/processes/plot.py
index cc3a80c..dda47b8 100644
--- a/processes/plot.py
+++ b/processes/plot.py
@@ -14,9 +14,9 @@ def annotate_df(row, ax):
     ax.annotate(row["name"], row[["density_log","clustering"]],
-            xytext=(random.uniform(-10,10), random.uniform(-10,10)),
+            xytext=(3, -2),
             textcoords='offset points',
-            size=8,
+            size=12,
             color='darkslategrey')

 def plot_paramdensity(res, s):
@@ -36,13 +36,13 @@ def plot_paramdensity(res, s):
     plt.ylabel("probability density")
     plt.legend()
     plt.xlim(-0,1)
-    plt.ylim(0,15)
+    plt.ylim(0,20)


 def plot_clustering_density(df):
     plt.figure()
     plt.hist(df["clustering"], bins=20, density=True)
-    index = np.arange(0,1, 0.1)
+    index = (0,1)
     plt.plot(index, uniform.pdf(index), label='Uniform')
     plt.ylim(0,10)
     plt.legend()
@@ -53,11 +53,11 @@ def plot_clustering_density(df):
 def plot_dlog_density(df):
     plt.figure()
     plt.hist(df["density_log"], bins=20, density=True)
-    index = np.arange(-5.5,0, 0.01)
+    index = (-5.5,0)
     plt.plot(index, uniform.pdf(index, loc=-5.5, scale =5.5), label='Uniform')
     plt.ylim(0,0.5)
     plt.legend()
-    plt.xlabel("density_log")
+    plt.xlabel("Dlog")
     plt.ylabel("denisty")

 def plot_sample_paramdist(res):
@@ -88,7 +88,7 @@ def plot_sample_params(df):
 def plot_param_clustering(df):
     df.plot.scatter("NE","diff", c="clustering", colormap='magma')
     plt.xlabel("N / E")
-    plt.ylabel("d - a")
+    plt.ylabel("a - d")

 def plot_fitness_evolution(df, M, params, name):
     param_serie = params[params["name"].str.startswith("{}_".format(name))].copy()
@@ -118,6 +118,7 @@ def plot_param_dlog(df):
 def plot_validation(df, df_val):
     ax = df.plot.scatter("density_log","clustering", c="gray")
     df_val.plot.scatter("density_log","clustering", ax = ax)
+    plt.xlabel("Dlog")
     plt.xlim(-5.5,0.01)
     plt.ylim(-0.01,1.01)
     df_val.apply(lambda row: annotate_df(row,ax), axis=1)
@@ -140,14 +141,16 @@ def plot(
     name = "r10"
     ):
-
+    plt.rcParams.update({'font.size': 22})
+
     if weight_source == "custom":
         weights = custom_weights
     elif weight_source == "initial":
         weights = [1] * 8

     print("Will plot:", plot_selection)
-    if set(["sample_grid", "sample_param", "validation", "dlog_density", "clustering_density", "fitness_evolution"]) & set(plot_selection):
+    if set(["sample_grid", "sample_param", "validation", "dlog_density", "clustering_density",
+            "fitness_evolution", "param_clustering", "param_dlog"]) & set(plot_selection):
         print("Loading Dataset...")
         df_m = pd.read_csv(Path(dataset_folder, "dataset_metrics.csv"))
         df_d = pd.read_csv(Path(dataset_folder, "dataset_description.csv"))
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index b703c76..f9f703e 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -26,7 +26,7 @@ def generate_result_dataset(

     print(params)

-    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = params
+    alfa_a, beta_a, alfa_d, beta_d, alfa_N, beta_N = params


@@ -40,9 +40,9 @@ def generate_result_dataset(
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1)

         a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        b = beta_rvs_shifted(alfa_b, beta_b, (1-a)/3, min(a, 1-a))
-        c = beta_rvs_shifted(alfa_c, beta_c, (1-a-b)/2, min(b, 1-a-b))
-        d = 1-a-b-c
+        d = beta_rvs_shifted(alfa_d, beta_d, (1-a)/3, min(a, 1-a))
+        bc = 1-a-d
+        b = c = bc/2

         parameters.append({
             "i": i, "N": N, "E": E,
diff --git a/processes/statistics.py b/processes/statistics.py
new file mode 100644
index 0000000..0f70d23
--- /dev/null
+++ b/processes/statistics.py
@@ -0,0 +1,19 @@
+import pandas as pd
+from pathlib import Path
+
+def statistics(
+        dataset_folder = "../baseline_dataset",
+        samples = 1000
+    ):
+
+    print("Loading Dataset...")
+    df = pd.read_csv(Path(dataset_folder, "dataset_metrics.csv")).head(samples)
+
+    print("correlation: ", df["density_log"].corr(df["clustering"]))
+    print("covariance: ", df["density_log"].cov(df["clustering"]))
+    print("density_log min: ", df["density_log"].min())
+    print("density_log mean: ", df["density_log"].mean())
+    print("density_log max: ", df["density_log"].max())
+    print("clustering min: ", df["clustering"].min())
+    print("clustering mean: ", df["clustering"].mean())
+    print("clustering max: ", df["clustering"].max())
\ No newline at end of file
diff --git a/utils/rmat.py b/utils/rmat.py
index a73c13c..ce7b675 100644
--- a/utils/rmat.py
+++ b/utils/rmat.py
@@ -8,15 +8,19 @@
 lock = mp.Lock()

 def rmat_to_file(N, E, a, b, c, d, dataset_folder, s):
-    scale = np.log2(N)
+    scale = np.ceil(np.log2(N))
+    reduce = np.power(2, scale) - N
+
     factor = E/N

-    Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True).generate()
+    Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True, reduceNodes = reduce).generate()
     Graph = nk.graphtools.toUnweighted(Graph)
+    Graph.removeSelfLoops()
+    Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True)
     out_filename = Path(dataset_folder,'graphs','RMAT_{}.txt'.format(s))
     print("Wrinting to:" + str(out_filename))
     nk.writeGraph(Graph, str(out_filename), nk.Format.EdgeListTabOne)
     with lock:
         add_to_csv(Path(dataset_folder,"dataset_description.csv"), {
-            'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': out_filename, 'scale': scale, 'factor': factor
+            'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': out_filename, 'scale': scale, 'reduce': reduce, 'factor': factor
         })
     return s
\ No newline at end of file

From 8f1f095f1cf808155bcffb0c69abd4bbc83c8909 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 11 May 2022 10:26:10 +0200
Subject: [PATCH 02/42] change in number of optimization parameters

---
 processes/optimization.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/processes/optimization.py b/processes/optimization.py
index 5ef24c7..8e1cc01 100644
--- a/processes/optimization.py
+++ b/processes/optimization.py
@@ -12,12 +12,11 @@ def store_params(dataset_folder, name, params, i = None):
     print("{}: {}".format(name, params))

-    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = params
+    alfa_a, beta_a, alfa_d, beta_d, alfa_N, beta_N = params
     add_to_csv(Path(dataset_folder, "optimized_parameters.csv"),{
         'name': name, 'iteration': i,
         'alfa_a': alfa_a, 'beta_a': beta_a,
-        'alfa_b': alfa_b, 'beta_b': beta_b,
-        'alfa_c': alfa_c, 'beta_c': beta_c,
+        'alfa_d': alfa_d, 'beta_d': beta_d,
         'alfa_N': alfa_N, 'beta_N': beta_N,
     })

@@ -42,9 +41,9 @@ def callback(x):
         store_params(dataset_folder, name, x, i)
         i += 1

-    initial_parameters = [1] * 8
+    initial_parameters = [1] * 6
     store_params(dataset_folder, name, initial_parameters, 0)
-    res = minimize(grid_bargin(df, M), [1] * 8, bounds=[(1e-32,15)] * 8,
+    res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,15)] * 6,
         callback = callback)

     print(res)

From 981b85dd9d1458768df111b499fa4dbee2bec7eb Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 11 May 2022 10:52:43 +0200
Subject: [PATCH 03/42] new lower limit for d parameter

---
 processes/baseline_dataset.py | 2 +-
 processes/result_dataset.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index 2e92cbd..1695f3f 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -19,7 +19,7 @@ def generate_baseline(
         N = int(np.floor(random.uniform(n_0, E+1)))

         a = random.uniform(0.25, 1)
-        d = random.uniform((1-a)/3,min(a,1-a))
+        d = random.uniform(1 - 3 * a,min(a,1-a))
         bc = 1 - a - d

         b = c = bc/2
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index f9f703e..399ccf3 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -40,7 +40,7 @@ def generate_result_dataset(
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1)

         a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        d = beta_rvs_shifted(alfa_d, beta_d, (1-a)/3, min(a, 1-a))
+        d = beta_rvs_shifted(alfa_d, beta_d, 1 - 3 * a, min(a, 1-a))
         bc = 1-a-d
         b = c = bc/2

From 3c56af52580bf4a5b6d11d6c2dffe6e86e62c319 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 11 May 2022 10:57:49 +0200
Subject: [PATCH 04/42] new lower limit for d parameter fix

---
 processes/baseline_dataset.py | 2 +-
 processes/result_dataset.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index 1695f3f..4be810b 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -19,7 +19,7 @@ def generate_baseline(
         N = int(np.floor(random.uniform(n_0, E+1)))

         a = random.uniform(0.25, 1)
-        d = random.uniform(1 - 3 * a,min(a,1-a))
+        d = random.uniform(max(1 - 3 * a, 0),min(a,1-a))
         bc = 1 - a - d

         b = c = bc/2
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index 399ccf3..c055262 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -40,7 +40,7 @@ def generate_result_dataset(
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1)

         a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        d = beta_rvs_shifted(alfa_d, beta_d, 1 - 3 * a, min(a, 1-a))
+        d = beta_rvs_shifted(alfa_d, beta_d, max(1 - 3 * a, 0), min(a, 1-a))
         bc = 1-a-d
         b = c = bc/2

From 05efae49941ecfb8f6fd08e08dba9b8649effd2d Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 11 May 2022 11:01:20 +0200
Subject: [PATCH 05/42] new lower limit for d parameter fix

---
 processes/bargin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/processes/bargin.py b/processes/bargin.py
index 6179b16..8673c29 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -22,7 +22,7 @@ def gen_metric_grid(df, metrics, m):
     df["metric_bucket_2"] = pd.cut(df[metrics[1]], blocks[1], labels=list(range(m)), include_lowest =True)

 def interval_b(a):
-    return ((1-a)/3, min(a, 1- a))
+    return (max(0,(1-3*a)), min(a, 1- a))

 def interval_c(a,b):
     return ((1-a-b)/2, min(b, 1-a-b))

From e533cbf620221c7fa49d4644099f9f8b2541985 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 11 May 2022 14:50:34 +0200
Subject: [PATCH 06/42] new lower limit for d parameter fix

---
 processes/result_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index c055262..017a2b8 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -19,7 +19,7 @@ def generate_result_dataset(
     if from_file:
         df = pd.read_csv(param_file)
         params = df[df["name"] == name].iloc[-1][[
-            "alfa_a", "beta_a", "alfa_b", "beta_b", "alfa_c", "beta_c", "alfa_N", "beta_N"
+            "alfa_a", "beta_a", "alfa_d", "beta_d", "alfa_N", "beta_N"
        ]]
     else:
         params = custom_weights

From 91aa329708a398446cba5a4964828fe294984215 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 11 May 2022 16:40:11 +0200
Subject: [PATCH 07/42] fit b instead of d

---
 GraphlaxyDataGen.py           |  8 ++++----
 processes/bargin.py           | 10 +++++-----
 processes/baseline_dataset.py | 10 +++++-----
 processes/plot.py             |  5 ++---
 processes/result_dataset.py   | 11 ++++++-----
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/GraphlaxyDataGen.py b/GraphlaxyDataGen.py
index 7108385..39333fc 100644
--- a/GraphlaxyDataGen.py
+++ b/GraphlaxyDataGen.py
@@ -67,12 +67,12 @@ def generate(self):
         parser.add_argument('-s', "--dataset-size", metavar = "int", type = int,
             help = "The size of the dataset to generate.", default= 5000)
         parser.add_argument('-e', "--edges-between", nargs = 2, metavar = "int", type = int,
-            help = "The min and max vallue the edges argument can take.", default= (1000, 1000000))
+            help = "The min and max vallue the edges argument can take.", default= (100000, 2000000))
         parser.add_argument('-m', '--multiprocess', action="store_true",
             help = "Add to take advantage of multiple cores.")
         parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float,
             help = "List of waights for the beta distributions.",
-            default= ( 2.490744994387892,2.6031189695165597,0.5401027713447459,0.32109300386782624,0.6878348939570403,0.4389166002041694,0.22515465777238508,0.8146717281526472))
+            default= (1.3248478655922757,1.5089650653752031,0.5872691608423339,1.4899436857070154,0.14698869990820493,0.33680332568511046))
@@ -122,7 +122,7 @@ def plot(self):
         parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float,
             help = "List of waights for the beta distributions.",
-            default= (2.490744994387892,2.6031189695165597,0.5401027713447459,0.32109300386782624,0.6878348939570403,0.4389166002041694,0.22515465777238508,0.8146717281526472))
+            default= (1.3248478655922757,1.5089650653752031,0.5872691608423339,1.4899436857070154,0.14698869990820493,0.33680332568511046))
         choices = ["custom", "initial"]
         parser.add_argument('-ws', "--weight-source", metavar = "str", type = str,
             help = "Where to get the waights used for the plot from. Posible values: {}".format(choices), default= "custom",
diff --git a/processes/bargin.py b/processes/bargin.py
index 8673c29..406212b 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd

-from utils.probability import beta_cdf_interval, beta_cdf_mean, beta_cdf_mean_2d
+from utils.probability import beta_cdf_interval, beta_cdf_mean

 def get_grid(m=10,
     limits = [(0,1),(-5.5,0)]):
@@ -44,15 +44,15 @@ def gen_param_grid(df):
     presition = 0.01
     intervals = np.arange(0,1.001,presition)
     df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 2))) / (df["E"] + 1)
     df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True)
-    df["d_bucket"] = pd.cut(df["d"], intervals, include_lowest =True)
+    df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True)
     df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True)
-    df["param_bucket_count"] = df.groupby(['a_bucket', 'd_bucket', 'NE_bucket'])[['a_bucket']].transform('count')
+    df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'NE_bucket'])[['a_bucket']].transform('count')


 def gen_weights(df, res):
-    alfa_a, beta_a, alfa_d, beta_d, alfa_N, beta_N = res
+    alfa_a, beta_a, alfa_b, beta_b, alfa_N, beta_N = res
     weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(0.25, 1)) *
-        beta_cdf_mean(row['d_bucket'],alfa_d, beta_d, interval_b_left(row['a_bucket']), interval_b_mean(row['a_bucket']), interval_b_right(row['a_bucket'])) *
+        beta_cdf_mean(row['b_bucket'],alfa_b, beta_b, interval_b_left(row['a_bucket']), interval_b_mean(row['a_bucket']), interval_b_right(row['a_bucket'])) *
         beta_cdf_interval(row['NE_bucket'],alfa_N, beta_N, (0, 1))) / row["param_bucket_count"],
         axis=1)
diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index 4be810b..36d4114 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -19,14 +19,14 @@ def generate_baseline(
         N = int(np.floor(random.uniform(n_0, E+1)))

         a = random.uniform(0.25, 1)
-        d = random.uniform(max(1 - 3 * a, 0),min(a,1-a))
-        bc = 1 - a - d
-
-        b = c = bc/2
+        c = b = random.uniform(max(1 - 3 * a, 0),min(a,1-a))
+        d = 1 - a - b - c
+        #bc = 1 - a - d
+        #b = c = bc/2

         parameters.append({
             "i": i, "N": N, "E": E,
-            "a": a, "b": b, "c": c, "d": d, "bc": bc
+            "a": a, "b": b, "c": c, "d": d
         })
diff --git a/processes/plot.py b/processes/plot.py
index dda47b8..7db7245 100644
--- a/processes/plot.py
+++ b/processes/plot.py
@@ -61,12 +61,11 @@ def plot_dlog_density(df):
     plt.ylabel("denisty")

 def plot_sample_paramdist(res):
-    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = res
+    alfa_a, beta_a, alfa_d, beta_d, alfa_N, beta_N = res
     plt.figure()
     index = np.arange(0,1, 0.01)
     plt.plot(index, beta.pdf(index,alfa_a, beta_a), label='a')
-    plt.plot(index, beta.pdf(index,alfa_b, beta_b), label='b')
-    plt.plot(index, beta.pdf(index,alfa_c, beta_c), label='c')
+    plt.plot(index, beta.pdf(index,alfa_d, beta_d), label='d')
     plt.plot(index, beta.pdf(index,alfa_N, beta_N), label='N')
     plt.legend()
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index 017a2b8..95ab854 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -19,14 +19,14 @@ def generate_result_dataset(
     if from_file:
         df = pd.read_csv(param_file)
         params = df[df["name"] == name].iloc[-1][[
-            "alfa_a", "beta_a", "alfa_d", "beta_d", "alfa_N", "beta_N"
+            "alfa_a", "beta_a", "alfa_b", "beta_b", "alfa_N", "beta_N"
        ]]
     else:
         params = custom_weights

     print(params)

-    alfa_a, beta_a, alfa_d, beta_d, alfa_N, beta_N = params
+    alfa_a, beta_a, alfa_b, beta_b, alfa_N, beta_N = params
@@ -40,9 +40,10 @@ def generate_result_dataset(
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1)

         a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        d = beta_rvs_shifted(alfa_d, beta_d, max(1 - 3 * a, 0), min(a, 1-a))
-        bc = 1-a-d
-        b = c = bc/2
+        b = c = beta_rvs_shifted(alfa_b, beta_b, max(1 - 3 * a, 0), min(a, 1-a))
+        d = 1 - a - b - c
+        #bc = 1-a-d
+        #b = c = bc/2

         parameters.append({
             "i": i, "N": N, "E": E,

From 049f820dca890676add9d9c75fbafd07d423e44f Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 11 May 2022 16:45:10 +0200
Subject: [PATCH 08/42] correct lower limit parameter b

---
 processes/bargin.py           | 2 +-
 processes/baseline_dataset.py | 2 +-
 processes/result_dataset.py   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/processes/bargin.py b/processes/bargin.py
index 406212b..4b0a870 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -22,7 +22,7 @@ def gen_metric_grid(df, metrics, m):
     df["metric_bucket_2"] = pd.cut(df[metrics[1]], blocks[1], labels=list(range(m)), include_lowest =True)

 def interval_b(a):
-    return (max(0,(1-3*a)), min(a, 1- a))
+    return (max(0,(1 - 2 * a)/2), min(a, 1- a))

 def interval_c(a,b):
     return ((1-a-b)/2, min(b, 1-a-b))
diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index 36d4114..d1cced8 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -19,7 +19,7 @@ def generate_baseline(
         N = int(np.floor(random.uniform(n_0, E+1)))

         a = random.uniform(0.25, 1)
-        c = b = random.uniform(max(1 - 3 * a, 0),min(a,1-a))
+        c = b = random.uniform(max((1 - 2 * a)/2, 0),min(a,1-a))
         d = 1 - a - b - c
         #bc = 1 - a - d
         #b = c = bc/2
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index 95ab854..0e36ce8 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -40,7 +40,7 @@ def generate_result_dataset(
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1)

         a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        b = c = beta_rvs_shifted(alfa_b, beta_b, max(1 - 3 * a, 0), min(a, 1-a))
+        b = c = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, 1-a))
         d = 1 - a - b - c
         #bc = 1-a-d
         #b = c = bc/2

From 6c51b424d72fbb2829331929cc67f649a668142e Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 11 May 2022 16:51:02 +0200
Subject: [PATCH 09/42] correct lower limit parameter b

---
 processes/bargin.py           | 2 +-
 processes/baseline_dataset.py | 2 +-
 processes/result_dataset.py   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/processes/bargin.py b/processes/bargin.py
index 4b0a870..e776aa9 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -22,7 +22,7 @@ def gen_metric_grid(df, metrics, m):
     df["metric_bucket_2"] = pd.cut(df[metrics[1]], blocks[1], labels=list(range(m)), include_lowest =True)

 def interval_b(a):
-    return (max(0,(1 - 2 * a)/2), min(a, 1- a))
+    return (max(0,(1 - 2 * a)/2), min(a, (1-a)/2))

 def interval_c(a,b):
     return ((1-a-b)/2, min(b, 1-a-b))
diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index d1cced8..604e9b8 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -19,7 +19,7 @@ def generate_baseline(
         N = int(np.floor(random.uniform(n_0, E+1)))

         a = random.uniform(0.25, 1)
-        c = b = random.uniform(max((1 - 2 * a)/2, 0),min(a,1-a))
+        c = b = random.uniform(max((1 - 2 * a)/2, 0),min(a,(1-a)/2))
         d = 1 - a - b - c
         #bc = 1 - a - d
         #b = c = bc/2
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index 0e36ce8..73e3e3f 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -40,7 +40,7 @@ def generate_result_dataset(
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1)

         a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        b = c = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, 1-a))
+        b = c = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, (1-a)/2))
         d = 1 - a - b - c
         #bc = 1-a-d
         #b = c = bc/2

From d46bd8f277905e6350fba6434f0fd38f618a81d9 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Thu, 12 May 2022 19:26:30 +0200
Subject: [PATCH 10/42] use unweighted rmat

---
 GraphlaxyDataGen.py       | 4 ++--
 processes/optimization.py | 2 +-
 utils/rmat.py             | 3 +--
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/GraphlaxyDataGen.py b/GraphlaxyDataGen.py
index 39333fc..4d8e9c9 100644
--- a/GraphlaxyDataGen.py
+++ b/GraphlaxyDataGen.py
@@ -70,7 +70,7 @@ def generate(self):
             help = "The min and max vallue the edges argument can take.", default= (100000, 2000000))
         parser.add_argument('-m', '--multiprocess', action="store_true",
             help = "Add to take advantage of multiple cores.")
-        parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float,
+        parser.add_argument('-w', "--custom-weights", nargs = 6, metavar = "float", type = float,
             help = "List of waights for the beta distributions.",
             default= (1.3248478655922757,1.5089650653752031,0.5872691608423339,1.4899436857070154,0.14698869990820493,0.33680332568511046))
@@ -120,7 +120,7 @@ def plot(self):
         parser.add_argument('-p', "--plot-selection", nargs = '+', metavar = "str", type = str,
             help = "Selects the plots to make. Posible values: {}".format(choices), default= default,
Posible values: {}".format(choices), default= default, choices= choices) - parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float, + parser.add_argument('-w', "--custom-weights", nargs = 6, metavar = "float", type = float, help = "List of waights for the beta distributions.", default= (1.3248478655922757,1.5089650653752031,0.5872691608423339,1.4899436857070154,0.14698869990820493,0.33680332568511046)) choices = ["custom", "initial"] diff --git a/processes/optimization.py b/processes/optimization.py index 8e1cc01..8ab193c 100644 --- a/processes/optimization.py +++ b/processes/optimization.py @@ -16,7 +16,7 @@ def store_params(dataset_folder, name, params, i = None): add_to_csv(Path(dataset_folder, "optimized_parameters.csv"),{ 'name': name, 'iteration': i, 'alfa_a': alfa_a, 'beta_a': beta_a, - 'alfa_d': alfa_d, 'beta_d': beta_d, + 'alfa_b': alfa_d, 'beta_b': beta_d, 'alfa_N': alfa_N, 'beta_N': beta_N, }) diff --git a/utils/rmat.py b/utils/rmat.py index ce7b675..5b3eef2 100644 --- a/utils/rmat.py +++ b/utils/rmat.py @@ -12,8 +12,7 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s): reduce = np.power(2, scale) - N factor = E/N - Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True, reduceNodes = reduce).generate() - Graph = nk.graphtools.toUnweighted(Graph) + Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, reduceNodes = reduce).generate() Graph.removeSelfLoops() Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True) out_filename = Path(dataset_folder,'graphs','RMAT_{}.txt'.format(s)) From b18e12027a90bfff7dd9cf1f7f08946be4d3e618 Mon Sep 17 00:00:00 2001 From: axelwass Date: Fri, 13 May 2022 15:36:34 +0200 Subject: [PATCH 11/42] Change main driverfrom edges to nodes + cap the density --- processes/bargin.py | 2 +- processes/baseline_dataset.py | 7 ++----- processes/result_dataset.py | 8 +++----- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/processes/bargin.py b/processes/bargin.py index e776aa9..f713775 100644 --- a/processes/bargin.py +++ b/processes/bargin.py @@ -42,7 +42,7 @@ def interval_b_right(a): def gen_param_grid(df): presition = 0.01 intervals = np.arange(0,1.001,presition) - df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 2))) / (df["E"] + 1) + df["NE"] = 20 * (df["E"] - 1)/(df["N"] * (df["N"] - 1)) df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True) df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True) df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True) diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index 604e9b8..fef90a3 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -13,16 +13,13 @@ def generate_baseline( parameters = [] for i in range(0,dataset_size): - E = random.randint(edges_between[0],edges_between[1]) - n_0 = np.floor(np.sqrt(E * 2)) - N = int(np.floor(random.uniform(n_0, E+1))) + N = random.randint(edges_between[0],edges_between[1]) + E = N * int(np.floor(random.uniform(1, (N - 1)/20))) a = random.uniform(0.25, 1) c = b = random.uniform(max((1 - 2 * a)/2, 0),min(a,(1-a)/2)) d = 1 - a - b - c - #bc = 1 - a - d - #b = c = bc/2 parameters.append({ "i": i, "N": N, "E": E, diff --git a/processes/result_dataset.py b/processes/result_dataset.py index 73e3e3f..20aed59 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -35,15 +35,13 @@ def generate_result_dataset( parameters = [] for i in 
range(0,dataset_size): - E = random.randint(edges_between[0], edges_between[1]) - n_0 = np.floor(np.sqrt(E * 2)) - N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1) + + N = random.randint(edges_between[0],edges_between[1]) + E = N * beta_rvs_discrete_shifted(alfa_N, beta_N, 1, (N - 1)/20) a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1) b = c = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, (1-a)/2)) d = 1 - a - b - c - #bc = 1-a-d - #b = c = bc/2 parameters.append({ "i": i, "N": N, "E": E, From 5fac3cc3adb22022bab09e642a9c010319bbde72 Mon Sep 17 00:00:00 2001 From: axelwass Date: Fri, 13 May 2022 15:46:11 +0200 Subject: [PATCH 12/42] Added parameter print + change default node limits --- GraphlaxyDataGen.py | 4 ++-- processes/baseline_dataset.py | 8 ++++++-- processes/result_dataset.py | 9 +++++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/GraphlaxyDataGen.py b/GraphlaxyDataGen.py index 4d8e9c9..231b099 100644 --- a/GraphlaxyDataGen.py +++ b/GraphlaxyDataGen.py @@ -67,7 +67,7 @@ def generate(self): parser.add_argument('-s', "--dataset-size", metavar = "int", type = int, help = "The size of the dataset to generate.", default= 5000) parser.add_argument('-e', "--edges-between", nargs = 2, metavar = "int", type = int, - help = "The min and max vallue the edges argument can take.", default= (100000, 2000000)) + help = "The min and max vallue the edges argument can take.", default= (1000, 500000)) parser.add_argument('-m', '--multiprocess', action="store_true", help = "Add to take advantage of multiple cores.") parser.add_argument('-w', "--custom-weights", nargs = 6, metavar = "float", type = float, @@ -144,7 +144,7 @@ def baseline(self): parser.add_argument('-s', "--dataset-size", metavar = "int", type = int, help = "The size of the baseline dataset.", default= 10000) parser.add_argument('-e', "--edges-between", nargs = 2, metavar = "int", type = int, - help = "The min and max vallue the edges argument can take.", default= (100000, 2000000)) + help = "The min and max vallue the edges argument can take.", default= (1000, 500000)) parser.add_argument('-m', '--multiprocess', action="store_true", help = "Add to take advantage of multiple cores.") args = parser.parse_args(sys.argv[3:]) diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index fef90a3..9e422d7 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -21,10 +21,14 @@ def generate_baseline( c = b = random.uniform(max((1 - 2 * a)/2, 0),min(a,(1-a)/2)) d = 1 - a - b - c - parameters.append({ + params = { "i": i, "N": N, "E": E, "a": a, "b": b, "c": c, "d": d - }) + } + + print("Queue params: ", params) + + parameters.append(params) if multiprocess: from pebble import ProcessPool diff --git a/processes/result_dataset.py b/processes/result_dataset.py index 20aed59..26b1d90 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -43,10 +43,15 @@ def generate_result_dataset( b = c = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, (1-a)/2)) d = 1 - a - b - c - parameters.append({ + + params = { "i": i, "N": N, "E": E, "a": a, "b": b, "c": c, "d": d - }) + } + + print("Queue params: ", params) + + parameters.append(params) if multiprocess: from pebble import ProcessPool From 25892648c9dafe8604c68522316300abc84b728c Mon Sep 17 00:00:00 2001 From: axelwass Date: Fri, 13 May 2022 15:54:19 +0200 Subject: [PATCH 13/42] Rollback change from edges to nodes --- processes/bargin.py | 2 +- processes/baseline_dataset.py | 6 
++++-- processes/result_dataset.py | 6 +++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/processes/bargin.py b/processes/bargin.py index f713775..5319e3a 100644 --- a/processes/bargin.py +++ b/processes/bargin.py @@ -42,7 +42,7 @@ def interval_b_right(a): def gen_param_grid(df): presition = 0.01 intervals = np.arange(0,1.001,presition) - df["NE"] = 20 * (df["E"] - 1)/(df["N"] * (df["N"] - 1)) + df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / (df["E"] + 1) df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True) df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True) df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True) diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index 9e422d7..b9be566 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -14,8 +14,10 @@ def generate_baseline( parameters = [] for i in range(0,dataset_size): - N = random.randint(edges_between[0],edges_between[1]) - E = N * int(np.floor(random.uniform(1, (N - 1)/20))) + E = random.randint(edges_between[0],edges_between[1]) + n_0 = np.floor(np.sqrt(E * 20)) + N = int(np.floor(random.uniform(n_0, E+1))) + a = random.uniform(0.25, 1) c = b = random.uniform(max((1 - 2 * a)/2, 0),min(a,(1-a)/2)) diff --git a/processes/result_dataset.py b/processes/result_dataset.py index 26b1d90..2024c20 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -35,9 +35,9 @@ def generate_result_dataset( parameters = [] for i in range(0,dataset_size): - - N = random.randint(edges_between[0],edges_between[1]) - E = N * beta_rvs_discrete_shifted(alfa_N, beta_N, 1, (N - 1)/20) + E = random.randint(edges_between[0], edges_between[1]) + n_0 = np.floor(np.sqrt(E * 20)) + N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1) a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1) b = c = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, (1-a)/2)) From 5a04264934a2b97cd6409c329a8830676ca925e4 Mon Sep 17 00:00:00 2001 From: axelwass Date: Mon, 16 May 2022 11:41:30 +0200 Subject: [PATCH 14/42] non continious node count + new methos --- GraphlaxyDataGen.py | 4 ++-- processes/bargin.py | 4 ++-- processes/baseline_dataset.py | 2 +- processes/result_dataset.py | 2 +- utils/rmat.py | 8 ++++---- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/GraphlaxyDataGen.py b/GraphlaxyDataGen.py index 231b099..4d8e9c9 100644 --- a/GraphlaxyDataGen.py +++ b/GraphlaxyDataGen.py @@ -67,7 +67,7 @@ def generate(self): parser.add_argument('-s', "--dataset-size", metavar = "int", type = int, help = "The size of the dataset to generate.", default= 5000) parser.add_argument('-e', "--edges-between", nargs = 2, metavar = "int", type = int, - help = "The min and max vallue the edges argument can take.", default= (1000, 500000)) + help = "The min and max vallue the edges argument can take.", default= (100000, 2000000)) parser.add_argument('-m', '--multiprocess', action="store_true", help = "Add to take advantage of multiple cores.") parser.add_argument('-w', "--custom-weights", nargs = 6, metavar = "float", type = float, @@ -144,7 +144,7 @@ def baseline(self): parser.add_argument('-s', "--dataset-size", metavar = "int", type = int, help = "The size of the baseline dataset.", default= 10000) parser.add_argument('-e', "--edges-between", nargs = 2, metavar = "int", type = int, - help = "The min and max vallue the edges argument can take.", default= (1000, 500000)) + help = "The min and max vallue the edges argument can take.", 
default= (100000, 2000000)) parser.add_argument('-m', '--multiprocess', action="store_true", help = "Add to take advantage of multiple cores.") args = parser.parse_args(sys.argv[3:]) diff --git a/processes/bargin.py b/processes/bargin.py index 5319e3a..449eedd 100644 --- a/processes/bargin.py +++ b/processes/bargin.py @@ -4,7 +4,7 @@ from utils.probability import beta_cdf_interval, beta_cdf_mean def get_grid(m=10, - limits = [(0,1),(-5.5,0)]): + limits = [(0,1),(-6,-1)]): block0 = np.linspace(limits[0][0], limits[0][1], m + 1) block1 = np.linspace(limits[1][0], limits[1][1], m + 1) @@ -42,7 +42,7 @@ def interval_b_right(a): def gen_param_grid(df): presition = 0.01 intervals = np.arange(0,1.001,presition) - df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / (df["E"] + 1) + df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / df["E"] df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True) df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True) df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True) diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index b9be566..ab19aa0 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -16,7 +16,7 @@ def generate_baseline( E = random.randint(edges_between[0],edges_between[1]) n_0 = np.floor(np.sqrt(E * 20)) - N = int(np.floor(random.uniform(n_0, E+1))) + N = int(np.floor(random.uniform(n_0, E))) a = random.uniform(0.25, 1) diff --git a/processes/result_dataset.py b/processes/result_dataset.py index 2024c20..49e09b5 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -37,7 +37,7 @@ def generate_result_dataset( E = random.randint(edges_between[0], edges_between[1]) n_0 = np.floor(np.sqrt(E * 20)) - N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E + 1) + N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E) a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1) b = c = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, (1-a)/2)) diff --git a/utils/rmat.py b/utils/rmat.py index 5b3eef2..a3c6749 100644 --- a/utils/rmat.py +++ b/utils/rmat.py @@ -9,10 +9,10 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s): scale = np.ceil(np.log2(N)) - reduce = np.power(2, scale) - N - factor = E/N - Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, reduceNodes = reduce).generate() + + Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True).generate() + Graph = nk.graphtools.toUnweighted(Graph) Graph.removeSelfLoops() Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True) out_filename = Path(dataset_folder,'graphs','RMAT_{}.txt'.format(s)) @@ -20,6 +20,6 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s): nk.writeGraph(Graph, str(out_filename), nk.Format.EdgeListTabOne) with lock: add_to_csv(Path(dataset_folder,"dataset_description.csv"), { - 'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': out_filename, 'scale': scale, 'reduce': reduce, 'factor': factor + 'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': out_filename, 'scale': scale, 'factor': factor }) return s \ No newline at end of file From 3b0a6a4d55a85d25eeaff8349bf05b8d5ce3dd80 Mon Sep 17 00:00:00 2001 From: axelwass Date: Mon, 16 May 2022 16:55:57 +0200 Subject: [PATCH 15/42] continious node count c=0 --- processes/bargin.py | 2 +- processes/result_dataset.py | 3 ++- utils/rmat.py | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/processes/bargin.py 
index 449eedd..3a86040 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -22,7 +22,7 @@ def gen_metric_grid(df, metrics, m):
     df["metric_bucket_2"] = pd.cut(df[metrics[1]], blocks[1], labels=list(range(m)), include_lowest =True)

 def interval_b(a):
-    return (max(0,(1 - 2 * a)/2), min(a, (1-a)/2))
+    return (max(0,(1 - 2 * a)), min(a, (1-a)))

 def interval_c(a,b):
     return ((1-a-b)/2, min(b, 1-a-b))
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index 49e09b5..f802408 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -40,7 +40,8 @@ def generate_result_dataset(
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E)

         a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        b = c = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, (1-a)/2))
+        b = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a), 0), min(a, (1-a)))
+        c = 0
         d = 1 - a - b - c
diff --git a/utils/rmat.py b/utils/rmat.py
index a3c6749..d1ab577 100644
--- a/utils/rmat.py
+++ b/utils/rmat.py
@@ -10,8 +10,9 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s):
     scale = np.ceil(np.log2(N))
     factor = E/N
+    reduce = np.power(2, scale) - N

-    Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True).generate()
+    Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True, reduceNodes = reduce).generate()
     Graph = nk.graphtools.toUnweighted(Graph)
     Graph.removeSelfLoops()
     Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True)
@@ -20,6 +20,6 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s):
     nk.writeGraph(Graph, str(out_filename), nk.Format.EdgeListTabOne)
     with lock:
         add_to_csv(Path(dataset_folder,"dataset_description.csv"), {
-            'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': out_filename, 'scale': scale, 'factor': factor
+            'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': out_filename, 'scale': scale, 'factor': factor, 'reduce': reduce
         })
     return s
\ No newline at end of file

From d3f8cfc1065ae4e7d5e175b90c6472e60ea42248 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Mon, 16 May 2022 18:38:14 +0200
Subject: [PATCH 16/42] c=0

---
 processes/baseline_dataset.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index ab19aa0..45df83c 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -19,8 +19,9 @@ def generate_baseline(
         N = int(np.floor(random.uniform(n_0, E)))

-        a = random.uniform(0.25, 1)
-        c = b = random.uniform(max((1 - 2 * a)/2, 0),min(a,(1-a)/2))
+        a = random.uniform(0.33, 1)
+        b = random.uniform(max(1 - 2 * a, 0),min(a,1-a))
+        c = 0
         d = 1 - a - b - c

         params = {

From e2ca68b4b706ea26190e7293695dd10a459113c1 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Mon, 16 May 2022 20:20:48 +0200
Subject: [PATCH 17/42] change limits

---
 processes/bargin.py         | 4 ++--
 processes/result_dataset.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/processes/bargin.py b/processes/bargin.py
index 3a86040..a304b29 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -22,7 +22,7 @@ def gen_metric_grid(df, metrics, m):
     df["metric_bucket_2"] = pd.cut(df[metrics[1]], blocks[1], labels=list(range(m)), include_lowest =True)

 def interval_b(a):
-    return (max(0,(1 - 2 * a)), min(a, (1-a)))
+    return (max(0,1 - 2 * a), min(a, 1-a))

 def interval_c(a,b):
     return ((1-a-b)/2, min(b, 1-a-b))
@@ -51,7 +51,7 @@ def gen_weights(df, res):
     alfa_a, beta_a, alfa_b, beta_b, alfa_N, beta_N = res
-    weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(0.25, 1)) *
+    weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(0.33, 1)) *
         beta_cdf_mean(row['b_bucket'],alfa_b, beta_b, interval_b_left(row['a_bucket']), interval_b_mean(row['a_bucket']), interval_b_right(row['a_bucket'])) *
         beta_cdf_interval(row['NE_bucket'],alfa_N, beta_N, (0, 1))) / row["param_bucket_count"],
         axis=1)
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index f802408..cb900c1 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -39,8 +39,8 @@ def generate_result_dataset(
         n_0 = np.floor(np.sqrt(E * 20))
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E)

-        a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        b = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a), 0), min(a, (1-a)))
+        a = beta_rvs_shifted(alfa_a, beta_a, 0.33, 1)
+        b = beta_rvs_shifted(alfa_b, beta_b, max(1 - 2 * a, 0), min(a, 1-a))
         c = 0
         d = 1 - a - b - c

From 799173e6ec526c5d2816a91f8bf5afbca8e9378f Mon Sep 17 00:00:00 2001
From: axelwass
Date: Mon, 16 May 2022 20:22:52 +0200
Subject: [PATCH 18/42] change precision

---
 processes/bargin.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/processes/bargin.py b/processes/bargin.py
index a304b29..251ba72 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -40,8 +40,8 @@ def interval_b_right(a):

 def gen_param_grid(df):
-    presition = 0.01
-    intervals = np.arange(0,1.001,presition)
+    precision = 0.05
+    intervals = np.arange(0,1.001,precision)
     df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / df["E"]
     df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True)
     df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True)
     df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True)

From c502f490fb11afa875f209db34a3ce56539a065a Mon Sep 17 00:00:00 2001
From: axelwass
Date: Mon, 16 May 2022 23:06:07 +0200
Subject: [PATCH 19/42] corrected bug with a lower limit

---
 processes/bargin.py           |  4 ++--
 processes/baseline_dataset.py |  2 +-
 processes/optimization.py     |  2 +-
 processes/plot.py             | 15 ++++++++-------
 processes/result_dataset.py   |  2 +-
 5 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/processes/bargin.py b/processes/bargin.py
index 251ba72..f56f1b9 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -40,7 +40,7 @@ def interval_b_right(a):

 def gen_param_grid(df):
-    precision = 0.05
+    precision = 0.01
     intervals = np.arange(0,1.001,precision)
     df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / df["E"]
     df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True)
@@ -51,7 +51,7 @@ def gen_weights(df, res):
     alfa_a, beta_a, alfa_b, beta_b, alfa_N, beta_N = res
-    weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(0.33, 1)) *
+    weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(1/3, 1)) *
         beta_cdf_mean(row['b_bucket'],alfa_b, beta_b, interval_b_left(row['a_bucket']), interval_b_mean(row['a_bucket']), interval_b_right(row['a_bucket'])) *
         beta_cdf_interval(row['NE_bucket'],alfa_N, beta_N, (0, 1))) / row["param_bucket_count"],
         axis=1)
diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index 45df83c..a000cf8 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -19,7 +19,7 @@ def generate_baseline(
         N = int(np.floor(random.uniform(n_0, E)))

-        a = random.uniform(0.33, 1)
+        a = random.uniform(1/3, 1)
         b = random.uniform(max(1 - 2 * a, 0),min(a,1-a))
         c = 0
         d = 1 - a - b - c
diff --git a/processes/optimization.py b/processes/optimization.py
index 8ab193c..4a6e52e 100644
--- a/processes/optimization.py
+++ b/processes/optimization.py
@@ -43,7 +43,7 @@ def callback(x):
     initial_parameters = [1] * 6
     store_params(dataset_folder, name, initial_parameters, 0)
-    res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,15)] * 6,
+    res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,20)] * 6,
         callback = callback)

     print(res)
diff --git a/processes/plot.py b/processes/plot.py
index 7db7245..0581a17 100644
--- a/processes/plot.py
+++ b/processes/plot.py
@@ -20,18 +20,19 @@ def annotate_df(row, ax):
             color='darkslategrey')

 def plot_paramdensity(res, s):
-    alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, _, _ = res
+    alfa_a, beta_a, alfa_b, beta_b, alfa_N, beta_N = res
     param_list = []
     for _ in range(s):
-        a = beta_rvs_shifted(alfa_a, beta_a, 0.25, 1)
-        b = beta_rvs_shifted(alfa_b, beta_b, (1-a)/3, min(a, 1-a))
-        c = beta_rvs_shifted(alfa_c, beta_c, (1-a-b)/2, min(b, 1-a-b))
-        d = 1-a-b-c
-        params = {'a': a, 'b': b, 'c': c, 'd': d}
+        a = beta_rvs_shifted(alfa_a, beta_a, 1/3, 1)
+        print(a, max(0,1-2*a), min(a, 1-a))
+        b = beta_rvs_shifted(alfa_b, beta_b, max(0,1-2*a), min(a, 1-a))
+        #c = beta_rvs_shifted(alfa_c, beta_c, (1-a-b)/2, min(b, 1-a-b))
+        d = 1-a-b
+        params = {'a': a, 'b': b, 'd': d}
         param_list.append(params)
     df = pd.DataFrame(param_list)
     plt.figure()
-    plt.hist(df, bins=20, label=["a","b","c","d"], stacked=False, density=True)
+    plt.hist(df, bins=20, label=["a","b","d"], stacked=False, density=True)
     plt.xlabel("parameter value")
     plt.ylabel("probability density")
     plt.legend()
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index cb900c1..8ded494 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -39,7 +39,7 @@ def generate_result_dataset(
         n_0 = np.floor(np.sqrt(E * 20))
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E)

-        a = beta_rvs_shifted(alfa_a, beta_a, 0.33, 1)
+        a = beta_rvs_shifted(alfa_a, beta_a, 1/3, 1)
         b = beta_rvs_shifted(alfa_b, beta_b, max(1 - 2 * a, 0), min(a, 1-a))
         c = 0
         d = 1 - a - b - c

From cefc2902997646ad2b088c391445a0425f13251d Mon Sep 17 00:00:00 2001
From: axelwass
Date: Tue, 17 May 2022 14:34:27 +0200
Subject: [PATCH 20/42] b<2a

---
 GraphlaxyDataGen.py           | 2 +-
 processes/bargin.py           | 6 +++---
 processes/baseline_dataset.py | 6 +++---
 processes/optimization.py     | 2 +-
 processes/plot.py             | 2 +-
 processes/result_dataset.py   | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/GraphlaxyDataGen.py b/GraphlaxyDataGen.py
index 4d8e9c9..7b7fe6e 100644
--- a/GraphlaxyDataGen.py
+++ b/GraphlaxyDataGen.py
@@ -128,7 +128,7 @@ def plot(self):
             help = "Where to get the waights used for the plot from. Posible values: {}".format(choices), default= "custom",
             choices= choices)
         parser.add_argument('-n', "--name", metavar = "str", type = str,
-            help = "Name of the params to use for the fitness_evolution.", default= None)
+            help = "Name of the params to use for the fitness_evolution.", default= "result")

         args = parser.parse_args(sys.argv[2:])
diff --git a/processes/bargin.py b/processes/bargin.py
index f56f1b9..9713bf4 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -4,7 +4,7 @@
 from utils.probability import beta_cdf_interval, beta_cdf_mean

 def get_grid(m=10,
-    limits = [(0,1),(-6,-1)]):
+    limits = [(0,1),(-6,0)]):

     block0 = np.linspace(limits[0][0], limits[0][1], m + 1)
     block1 = np.linspace(limits[1][0], limits[1][1], m + 1)
@@ -40,9 +40,9 @@ def interval_b_right(a):

 def gen_param_grid(df):
-    precision = 0.01
+    precision = 0.02
     intervals = np.arange(0,1.001,precision)
-    df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / df["E"]
+    df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 2))) / df["E"]
     df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True)
     df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True)
     df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True)
diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py
index a000cf8..f4f6b59 100644
--- a/processes/baseline_dataset.py
+++ b/processes/baseline_dataset.py
@@ -15,12 +15,12 @@ def generate_baseline(
     for i in range(0,dataset_size):
         E = random.randint(edges_between[0],edges_between[1])
-        n_0 = np.floor(np.sqrt(E * 20))
+        n_0 = np.floor(np.sqrt(E * 2))
         N = int(np.floor(random.uniform(n_0, E)))

-        a = random.uniform(1/3, 1)
-        b = random.uniform(max(1 - 2*a, 0),min(a,1-a))
+        a = random.uniform(1/4, 1)
+        b = random.uniform(max(1 - 2*a, 0),min(2*a,1-a))
         c = 0
         d = 1 - a - b - c
diff --git a/processes/optimization.py b/processes/optimization.py
index 4a6e52e..0b2f286 100644
--- a/processes/optimization.py
+++ b/processes/optimization.py
@@ -43,7 +43,7 @@ def callback(x):
     initial_parameters = [1] * 6
     store_params(dataset_folder, name, initial_parameters, 0)
-    res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,20)] * 6,
+    res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,100)] * 6,
         callback = callback)

     print(res)
diff --git a/processes/plot.py b/processes/plot.py
index 0581a17..4cda302 100644
--- a/processes/plot.py
+++ b/processes/plot.py
@@ -94,7 +94,7 @@ def plot_fitness_evolution(df, M, params, name):
     param_serie = params[params["name"].str.startswith("{}_".format(name))].copy()
     param_serie["iteration"] = param_serie["name"].str.extract("_(\d+)$").astype(int)
     param_serie["fitness"] = param_serie[
-        ["alfa_a", "beta_a", "alfa_b", "beta_b", "alfa_c", "beta_c", "alfa_N", "beta_N"]
+        ["alfa_a", "beta_a", "alfa_b", "beta_b", "alfa_N", "beta_N"]
     ].apply(lambda row: grid_bargin(df, M)(row), axis=1)

     ax = param_serie.plot("iteration", "fitness", marker="o")
diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index 8ded494..4e9a429 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -36,7 +36,7 @@ def generate_result_dataset(
         E = random.randint(edges_between[0], edges_between[1])
-        n_0 = np.floor(np.sqrt(E * 20))
+        n_0 = np.floor(np.sqrt(E * 2))
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E)

         a = beta_rvs_shifted(alfa_a, beta_a, 1/3, 1)

From 64114e842a4e9afdea59531f8992e5715b85c832 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Tue, 17 May 2022 19:52:38 +0200
Subject: [PATCH 21/42] added missing to undirected --- processes/baseline_dataset.py | 4 ++-- utils/rmat.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index f4f6b59..b6460ef 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -19,8 +19,8 @@ def generate_baseline( N = int(np.floor(random.uniform(n_0, E))) - a = random.uniform(1/4, 1) - b = random.uniform(max(1 - 2*a, 0),min(2*a,1-a)) + a = random.uniform(1/3, 1) + b = random.uniform(max(1 - 2*a, 0),min(a,1-a)) c = 0 d = 1 - a - b - c diff --git a/utils/rmat.py b/utils/rmat.py index d1ab577..7dee6a0 100644 --- a/utils/rmat.py +++ b/utils/rmat.py @@ -13,7 +13,7 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s): reduce = np.power(2, scale) - N Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True, reduceNodes = reduce).generate() - Graph = nk.graphtools.toUnweighted(Graph) + Graph = nk.graph.Graph(Graph, False, False) # To undirected and unweigted Graph.removeSelfLoops() Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True) out_filename = Path(dataset_folder,'graphs','RMAT_{}.txt'.format(s)) From 257ad63a7bcde0900132ab117f3990af83e03825 Mon Sep 17 00:00:00 2001 From: axelwass Date: Tue, 17 May 2022 20:47:24 +0200 Subject: [PATCH 22/42] added missing to undirected --- processes/metrics.py | 4 ++-- utils/rmat.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/processes/metrics.py b/processes/metrics.py index a815460..b3b0ca0 100644 --- a/processes/metrics.py +++ b/processes/metrics.py @@ -17,8 +17,8 @@ def _metrics(dataset_folder, row, trials): d = 1 - a - b - c G = read_graph(row['name']) - #Gcc = max(nx.connected_components(G), key=len) - #G = G.subgraph(Gcc) + Gcc = max(nx.connected_components(G), key=len) + G = G.subgraph(Gcc) density = nx.density(G) clustering = nx.algorithms.approximation.clustering_coefficient.average_clustering(G,trials) diff --git a/utils/rmat.py b/utils/rmat.py index 7dee6a0..835ab27 100644 --- a/utils/rmat.py +++ b/utils/rmat.py @@ -13,9 +13,9 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s): reduce = np.power(2, scale) - N Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True, reduceNodes = reduce).generate() - Graph = nk.graph.Graph(Graph, False, False) # To undirected and unweigted + #Graph = nk.graph.Graph(Graph, False, False) # To undirected and unweigted Graph.removeSelfLoops() - Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True) + #Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True) out_filename = Path(dataset_folder,'graphs','RMAT_{}.txt'.format(s)) print("Wrinting to:" + str(out_filename)) nk.writeGraph(Graph, str(out_filename), nk.Format.EdgeListTabOne) From ff5368733da38795519e5ee2ef1cc741856639b3 Mon Sep 17 00:00:00 2001 From: axelwass Date: Wed, 18 May 2022 14:59:46 +0200 Subject: [PATCH 23/42] to unweighted --- utils/rmat.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utils/rmat.py b/utils/rmat.py index 835ab27..455da8f 100644 --- a/utils/rmat.py +++ b/utils/rmat.py @@ -13,14 +13,15 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s): reduce = np.power(2, scale) - N Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True, reduceNodes = reduce).generate() - #Graph = 
nk.graph.Graph(Graph, False, False) # To undirected and unweigted
+    Graph = nk.graph.Graph(Graph, False, False) # To undirected and unweigted
     Graph.removeSelfLoops()
     #Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True)
-    out_filename = Path(dataset_folder,'graphs','RMAT_{}.txt'.format(s))
+    name = 'RMAT_{}.txt'.format(s)
+    out_filename = Path(dataset_folder,'graphs',name)
     print("Wrinting to:" + str(out_filename))
     nk.writeGraph(Graph, str(out_filename), nk.Format.EdgeListTabOne)
     with lock:
         add_to_csv(Path(dataset_folder,"dataset_description.csv"), {
-            'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': out_filename, 'scale': scale, 'factor': factor, 'reduce': reduce
+            'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': name, 'scale': scale, 'factor': factor, 'reduce': reduce
         })
     return s
\ No newline at end of file

From 09aa1d8fc2cfa6dd5dcbdb054197b6bf6bcea2a2 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 18 May 2022 15:41:48 +0200
Subject: [PATCH 24/42] limit density to 0.1

---
 processes/bargin.py | 4 ++--
 processes/metrics.py | 6 +++---
 processes/optimization.py | 1 +
 processes/result_dataset.py | 2 +-
 utils/rmat.py | 6 +++---
 5 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/processes/bargin.py b/processes/bargin.py
index 9713bf4..a5da68b 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -4,7 +4,7 @@ from utils.probability import beta_cdf_interval, beta_cdf_mean

 def get_grid(m=10,
-        limits = [(0,1),(-6,0)]):
+        limits = [(0,1),(-6,-1)]):
     block0 = np.linspace(limits[0][0], limits[0][1], m + 1)
     block1 = np.linspace(limits[1][0], limits[1][1], m + 1)
@@ -42,7 +42,7 @@ def interval_b_right(a):
 def gen_param_grid(df):
     precision = 0.02
     intervals = np.arange(0,1.001,precision)
-    df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 2))) / df["E"]
+    df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / df["E"]
     df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True)
     df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True)
     df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True)
diff --git a/processes/metrics.py b/processes/metrics.py
index b3b0ca0..db770f0 100644
--- a/processes/metrics.py
+++ b/processes/metrics.py
@@ -16,9 +16,9 @@ def _metrics(dataset_folder, row, trials):
     c = row['c']
     d = 1 - a - b - c

-    G = read_graph(row['name'])
-    Gcc = max(nx.connected_components(G), key=len)
-    G = G.subgraph(Gcc)
+    G = read_graph(Path(dataset_folder,"graphs", row['name']))
+    #Gcc = max(nx.connected_components(G), key=len)
+    #G = G.subgraph(Gcc)

     density = nx.density(G)
     clustering = nx.algorithms.approximation.clustering_coefficient.average_clustering(G,trials)
diff --git a/processes/optimization.py b/processes/optimization.py
index 0b2f286..d53bbc3 100644
--- a/processes/optimization.py
+++ b/processes/optimization.py
@@ -28,6 +28,7 @@ def optimize(
     df_m = pd.read_csv(Path(dataset_folder, "dataset_metrics.csv"))
     df_d = pd.read_csv(Path(dataset_folder, "dataset_description.csv"))
     df = pd.merge(df_m, df_d, on="name")
+    df = df[df["density_log"] < -1]

     m = grid_size

diff --git a/processes/result_dataset.py b/processes/result_dataset.py
index 4e9a429..8ded494 100644
--- a/processes/result_dataset.py
+++ b/processes/result_dataset.py
@@ -36,7 +36,7 @@ def generate_result_dataset(
     for i in range(0,dataset_size):
         E = random.randint(edges_between[0], edges_between[1])

-        n_0 = np.floor(np.sqrt(E * 2))
+        n_0 = np.floor(np.sqrt(E * 20))
         N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E)

         a = 
beta_rvs_shifted(alfa_a, beta_a, 1/3, 1) diff --git a/utils/rmat.py b/utils/rmat.py index 455da8f..2dc23fa 100644 --- a/utils/rmat.py +++ b/utils/rmat.py @@ -12,10 +12,10 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s): factor = E/N reduce = np.power(2, scale) - N - Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True, reduceNodes = reduce).generate() - Graph = nk.graph.Graph(Graph, False, False) # To undirected and unweigted + Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, reduceNodes = reduce).generate() + #Graph = nk.graph.Graph(Graph, False, False) # To undirected and unweigted Graph.removeSelfLoops() - #Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True) + Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True) name = 'RMAT_{}.txt'.format(s) out_filename = Path(dataset_folder,'graphs',name) print("Wrinting to:" + str(out_filename)) From 7dab3294bdbad1b4b29a25a1b3626147ec87c7f9 Mon Sep 17 00:00:00 2001 From: axelwass Date: Wed, 18 May 2022 16:46:26 +0200 Subject: [PATCH 25/42] c=b --- processes/bargin.py | 4 ++-- processes/baseline_dataset.py | 10 +++++----- processes/result_dataset.py | 6 +++--- utils/rmat.py | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/processes/bargin.py b/processes/bargin.py index a5da68b..e4a8424 100644 --- a/processes/bargin.py +++ b/processes/bargin.py @@ -22,7 +22,7 @@ def gen_metric_grid(df, metrics, m): df["metric_bucket_2"] = pd.cut(df[metrics[1]], blocks[1], labels=list(range(m)), include_lowest =True) def interval_b(a): - return (max(0,1 - 2 * a), min(a, 1-a)) + return (max(0,(1 - 2 * a)/2), min(a, 1-a)) def interval_c(a,b): return ((1-a-b)/2, min(b, 1-a-b)) @@ -51,7 +51,7 @@ def gen_param_grid(df): def gen_weights(df, res): alfa_a, beta_a, alfa_b, beta_b, alfa_N, beta_N = res - weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(1/3, 1)) * + weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(1/4, 1)) * beta_cdf_mean(row['b_bucket'],alfa_b, beta_b, interval_b_left(row['a_bucket']), interval_b_mean(row['a_bucket']), interval_b_right(row['a_bucket'])) * beta_cdf_interval(row['NE_bucket'],alfa_N, beta_N, (0, 1))) / row["param_bucket_count"], axis=1) diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index b6460ef..9b0416a 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -15,13 +15,13 @@ def generate_baseline( for i in range(0,dataset_size): E = random.randint(edges_between[0],edges_between[1]) - n_0 = np.floor(np.sqrt(E * 2)) + n_0 = np.floor(np.sqrt(E * 20)) N = int(np.floor(random.uniform(n_0, E))) - a = random.uniform(1/3, 1) - b = random.uniform(max(1 - 2*a, 0),min(a,1-a)) - c = 0 + a = random.uniform(1/4, 1) + c = b = random.uniform(max((1 - 2*a)/2, 0),min(a,1-a)) + #c = 0 d = 1 - a - b - c params = { @@ -41,7 +41,7 @@ def generate_baseline( for param in parameters: future = pool.schedule(rmat_to_file, args=(param['N'],param['E'],param['a'],param['b'],param['c'],param['d'],dataset_folder, param['i']), - timeout=300) + timeout=600) future.add_done_callback(pebble_timeout_callback) else: for param in parameters: diff --git a/processes/result_dataset.py b/processes/result_dataset.py index 8ded494..dd320f4 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -39,9 +39,9 @@ def generate_result_dataset( n_0 = np.floor(np.sqrt(E * 
20)) N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E) - a = beta_rvs_shifted(alfa_a, beta_a, 1/3, 1) - b = beta_rvs_shifted(alfa_b, beta_b, max(1 - 2 * a, 0), min(a, 1-a)) - c = 0 + a = beta_rvs_shifted(alfa_a, beta_a, 1/4, 1) + c = b = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, 1-a)) + #c = 0 d = 1 - a - b - c diff --git a/utils/rmat.py b/utils/rmat.py index 2dc23fa..38ac5c5 100644 --- a/utils/rmat.py +++ b/utils/rmat.py @@ -12,8 +12,8 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s): factor = E/N reduce = np.power(2, scale) - N - Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, reduceNodes = reduce).generate() - #Graph = nk.graph.Graph(Graph, False, False) # To undirected and unweigted + Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d, weighted = True, reduceNodes = reduce).generate() + Graph = nk.graph.Graph(Graph, False, False) # To undirected and unweigted Graph.removeSelfLoops() Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True) name = 'RMAT_{}.txt'.format(s) From 64e31bfe43304cb65e42326029fb6da297f4f0d9 Mon Sep 17 00:00:00 2001 From: axelwass Date: Wed, 18 May 2022 16:49:17 +0200 Subject: [PATCH 26/42] c=b --- processes/bargin.py | 2 +- processes/baseline_dataset.py | 2 +- processes/result_dataset.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/processes/bargin.py b/processes/bargin.py index e4a8424..c2d6bd0 100644 --- a/processes/bargin.py +++ b/processes/bargin.py @@ -22,7 +22,7 @@ def gen_metric_grid(df, metrics, m): df["metric_bucket_2"] = pd.cut(df[metrics[1]], blocks[1], labels=list(range(m)), include_lowest =True) def interval_b(a): - return (max(0,(1 - 2 * a)/2), min(a, 1-a)) + return (max(0,(1 - 2 * a)/2), min(a, (1-a)/2)) def interval_c(a,b): return ((1-a-b)/2, min(b, 1-a-b)) diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index 9b0416a..f2d5028 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -20,7 +20,7 @@ def generate_baseline( a = random.uniform(1/4, 1) - c = b = random.uniform(max((1 - 2*a)/2, 0),min(a,1-a)) + c = b = random.uniform(max((1 - 2*a)/2, 0),min(a,(1-a)/2)) #c = 0 d = 1 - a - b - c diff --git a/processes/result_dataset.py b/processes/result_dataset.py index dd320f4..ec208c2 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -40,7 +40,7 @@ def generate_result_dataset( N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E) a = beta_rvs_shifted(alfa_a, beta_a, 1/4, 1) - c = b = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, 1-a)) + c = b = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, (1-a)/2)) #c = 0 d = 1 - a - b - c From 0feca85a14ba38e2d9280736006534ee9b5f399f Mon Sep 17 00:00:00 2001 From: axelwass Date: Wed, 18 May 2022 17:09:22 +0200 Subject: [PATCH 27/42] c=rand --- processes/baseline_dataset.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index f2d5028..9011699 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -19,10 +19,15 @@ def generate_baseline( N = int(np.floor(random.uniform(n_0, E))) - a = random.uniform(1/4, 1) - c = b = random.uniform(max((1 - 2*a)/2, 0),min(a,(1-a)/2)) + #a = random.uniform(1/4, 1) + #c = b = random.uniform(max((1 - 2*a)/2, 0),min(a,(1-a)/2)) #c = 0 - d = 1 - a - b - c + #d = 1 - a - b - c + + a = random.uniform(0.25, 1) + b = 
random.uniform((1-a)/3, min(a, 1-a)) + c = random.uniform((1-a-b)/2, min(b, 1-a-b)) + d = 1-a-b-c params = { "i": i, "N": N, "E": E, From 9c4effb77747b31c1761edb1aeb398f6a9f2bf1b Mon Sep 17 00:00:00 2001 From: axelwass Date: Wed, 18 May 2022 17:20:04 +0200 Subject: [PATCH 28/42] independent b c d --- processes/baseline_dataset.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index 9011699..caf36f0 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -18,15 +18,9 @@ def generate_baseline( n_0 = np.floor(np.sqrt(E * 20)) N = int(np.floor(random.uniform(n_0, E))) - - #a = random.uniform(1/4, 1) - #c = b = random.uniform(max((1 - 2*a)/2, 0),min(a,(1-a)/2)) - #c = 0 - #d = 1 - a - b - c - a = random.uniform(0.25, 1) - b = random.uniform((1-a)/3, min(a, 1-a)) - c = random.uniform((1-a-b)/2, min(b, 1-a-b)) + b = random.uniform(0, min(a, 1-a)) + c = random.uniform(max(0,1-2*a-b), min(a, 1-a-b)) d = 1-a-b-c params = { From dc8958c111d02b1a7eb1c4cd8262a5296f720bf4 Mon Sep 17 00:00:00 2001 From: axelwass Date: Wed, 18 May 2022 17:35:41 +0200 Subject: [PATCH 29/42] independent b c d --- processes/bargin.py | 6 ++++-- processes/baseline_dataset.py | 2 +- processes/result_dataset.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/processes/bargin.py b/processes/bargin.py index c2d6bd0..d860926 100644 --- a/processes/bargin.py +++ b/processes/bargin.py @@ -22,7 +22,8 @@ def gen_metric_grid(df, metrics, m): df["metric_bucket_2"] = pd.cut(df[metrics[1]], blocks[1], labels=list(range(m)), include_lowest =True) def interval_b(a): - return (max(0,(1 - 2 * a)/2), min(a, (1-a)/2)) + return (max(0,1-3*a), min(a, 1-a)) + #(max(0,(1 - 2 * a)/2), min(a, (1-a)/2)) def interval_c(a,b): return ((1-a-b)/2, min(b, 1-a-b)) @@ -40,7 +41,7 @@ def interval_b_right(a): def gen_param_grid(df): - precision = 0.02 + precision = 0.01 intervals = np.arange(0,1.001,precision) df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / df["E"] df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True) @@ -68,6 +69,7 @@ def _grid_bargin(params): gen_weights(df, params) total = df["weight"].sum() + print(total) buckets = df[(df["metric_bucket_1"] != np.NaN) & (df["metric_bucket_2"] != np.NaN)].groupby(["metric_bucket_1", "metric_bucket_2"]) bucket_prob = buckets["weight"].sum() / total diff --git a/processes/baseline_dataset.py b/processes/baseline_dataset.py index caf36f0..25d4108 100644 --- a/processes/baseline_dataset.py +++ b/processes/baseline_dataset.py @@ -19,7 +19,7 @@ def generate_baseline( N = int(np.floor(random.uniform(n_0, E))) a = random.uniform(0.25, 1) - b = random.uniform(0, min(a, 1-a)) + b = random.uniform(max(0,1-3*a), min(a, 1-a)) c = random.uniform(max(0,1-2*a-b), min(a, 1-a-b)) d = 1-a-b-c diff --git a/processes/result_dataset.py b/processes/result_dataset.py index ec208c2..2797b37 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -40,8 +40,8 @@ def generate_result_dataset( N = beta_rvs_discrete_shifted(alfa_N, beta_N, n_0, E) a = beta_rvs_shifted(alfa_a, beta_a, 1/4, 1) - c = b = beta_rvs_shifted(alfa_b, beta_b, max((1 - 2 * a)/2, 0), min(a, (1-a)/2)) - #c = 0 + b = beta_rvs_shifted(alfa_b, beta_b, max(0,1-3*a), min(a, 1-a)) + c = random.uniform(max(0,1-2*a-b), min(a, 1-a-b)) d = 1 - a - b - c From f0531c148f48d00570f2744b5caea51d86d25217 Mon Sep 17 00:00:00 2001 From: axelwass Date: Wed, 18 May 2022 17:39:17 +0200 Subject: [PATCH 
30/42] c=rand

---
 processes/bargin.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/processes/bargin.py b/processes/bargin.py
index d860926..1023a53 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -26,7 +26,8 @@ def interval_b(a):
     #(max(0,(1 - 2 * a)/2), min(a, (1-a)/2))

 def interval_c(a,b):
-    return ((1-a-b)/2, min(b, 1-a-b))
+    return (max(0,1-2*a-b), min(a, 1-a-b))
+    #((1-a-b)/2, min(b, 1-a-b))

 def interval_b_mean(a):
     a_maen = (a.right + a.left) /2

From c55e6c66be40c42d33e1cb0f616b00f2962710a3 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Thu, 19 May 2022 10:27:31 +0200
Subject: [PATCH 31/42] add c to optimization

---
 processes/bargin.py | 33 +++++++++++++++++++++++++++++----
 processes/optimization.py | 7 ++++---
 processes/plot.py | 5 +++--
 processes/result_dataset.py | 5 +++--
 4 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/processes/bargin.py b/processes/bargin.py
index 1023a53..d1b44bd 100644
--- a/processes/bargin.py
+++ b/processes/bargin.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd

-from utils.probability import beta_cdf_interval, beta_cdf_mean
+from utils.probability import beta_cdf_interval, beta_cdf_mean, beta_cdf_mean_2d

 def get_grid(m=10,
         limits = [(0,1),(-6,-1)]):
@@ -29,6 +29,26 @@ def interval_c(a,b):
     return (max(0,1-2*a-b), min(a, 1-a-b))
     #((1-a-b)/2, min(b, 1-a-b))

+
+def interval_c_mean(a, b):
+    a_mean = (a.right + a.left) /2
+    b_mean = (b.right + b.left) /2
+    return interval_c(a_mean, b_mean)
+
+def interval_c_leftleft(a,b):
+    return interval_c(a.left, b.left)
+
+def interval_c_leftright(a,b):
+    return interval_c(a.left, b.right)
+
+
+def interval_c_rightleft(a,b):
+    return interval_c(a.right, b.left)
+
+
+def interval_c_rightright(a,b):
+    return interval_c(a.right, b.right)
+
 def interval_b_mean(a):
     a_maen = (a.right + a.left) /2
     return interval_b(a_maen)
@@ -42,19 +62,24 @@ def interval_b_right(a):

 def gen_param_grid(df):
-    precision = 0.01
+    precision = 0.05
     intervals = np.arange(0,1.001,precision)
     df["NE"] = (df["N"] - np.floor(np.sqrt(df["E"] * 20))) / df["E"]
     df["a_bucket"] = pd.cut(df["a"], intervals, include_lowest =True)
     df["b_bucket"] = pd.cut(df["b"], intervals, include_lowest =True)
+    df["c_bucket"] = pd.cut(df["c"], intervals, include_lowest =True)
     df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True)
-    df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'NE_bucket'])[['a_bucket']].transform('count')
+    df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'c_bucket', 'NE_bucket'])[['a_bucket']].transform('count')
+    print(df["param_bucket_count"])


 def gen_weights(df, res):
-    alfa_a, beta_a, alfa_b, beta_b, alfa_N, beta_N = res
+    alfa_a, beta_a, alfa_b, beta_b,alfa_c, beta_c, alfa_N, beta_N = res
     weights = df.apply(lambda row: (beta_cdf_interval(row['a_bucket'],alfa_a, beta_a,(1/4, 1)) *
         beta_cdf_mean(row['b_bucket'],alfa_b, beta_b, interval_b_left(row['a_bucket']),
         interval_b_mean(row['a_bucket']), interval_b_right(row['a_bucket'])) *
+        beta_cdf_mean_2d(row['c_bucket'],alfa_c, beta_c, interval_c_mean(row['a_bucket'], row['b_bucket']),
+        interval_c_leftleft(row['a_bucket'], row['b_bucket']), interval_c_leftright(row['a_bucket'], row['b_bucket']),
+        interval_c_rightleft(row['a_bucket'], row['b_bucket']), interval_c_rightright(row['a_bucket'], row['b_bucket'])) *
         beta_cdf_interval(row['NE_bucket'],alfa_N, beta_N, (0, 1))) / row["param_bucket_count"], axis=1)

diff --git a/processes/optimization.py b/processes/optimization.py
index d53bbc3..e6eaa2b 100644
--- 
a/processes/optimization.py +++ b/processes/optimization.py @@ -12,11 +12,12 @@ def store_params(dataset_folder, name, params, i = None): print("{}: {}".format(name, params)) - alfa_a, beta_a, alfa_d, beta_d, alfa_N, beta_N = params + alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = params add_to_csv(Path(dataset_folder, "optimized_parameters.csv"),{ 'name': name, 'iteration': i, 'alfa_a': alfa_a, 'beta_a': beta_a, - 'alfa_b': alfa_d, 'beta_b': beta_d, + 'alfa_b': alfa_b, 'beta_b': beta_b, + 'alfa_c': alfa_c, 'beta_c': beta_c, 'alfa_N': alfa_N, 'beta_N': beta_N, }) @@ -42,7 +43,7 @@ def callback(x): store_params(dataset_folder, name, x, i) i += 1 - initial_parameters = [1] * 6 + initial_parameters = [1] * 8 store_params(dataset_folder, name, initial_parameters, 0) res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,100)] * 6, callback = callback) diff --git a/processes/plot.py b/processes/plot.py index 4cda302..a3191e4 100644 --- a/processes/plot.py +++ b/processes/plot.py @@ -62,11 +62,12 @@ def plot_dlog_density(df): plt.ylabel("denisty") def plot_sample_paramdist(res): - alfa_a, beta_a, alfa_d, beta_d, alfa_N, beta_N = res + alfa_a, beta_a, alfa_b, beta_b,alfa_c, beta_c, alfa_N, beta_N = res plt.figure() index = np.arange(0,1, 0.01) plt.plot(index, beta.pdf(index,alfa_a, beta_a), label='a') - plt.plot(index, beta.pdf(index,alfa_d, beta_d), label='d') + plt.plot(index, beta.pdf(index,alfa_b, beta_b), label='b') + plt.plot(index, beta.pdf(index,alfa_c, beta_c), label='c') plt.plot(index, beta.pdf(index,alfa_N, beta_N), label='N') plt.legend() diff --git a/processes/result_dataset.py b/processes/result_dataset.py index 2797b37..623a9df 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -26,7 +26,7 @@ def generate_result_dataset( print(params) - alfa_a, beta_a, alfa_b, beta_b, alfa_N, beta_N = params + alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = params @@ -41,7 +41,8 @@ def generate_result_dataset( a = beta_rvs_shifted(alfa_a, beta_a, 1/4, 1) b = beta_rvs_shifted(alfa_b, beta_b, max(0,1-3*a), min(a, 1-a)) - c = random.uniform(max(0,1-2*a-b), min(a, 1-a-b)) + c = beta_rvs_shifted(alfa_c, beta_c, max(0,1-3*a), min(a, 1-a)) + #c = random.uniform(max(0,1-2*a-b), min(a, 1-a-b)) d = 1 - a - b - c From 9c61105c15821c93f35386519a017d3332a9067f Mon Sep 17 00:00:00 2001 From: axelwass Date: Thu, 19 May 2022 11:19:10 +0200 Subject: [PATCH 32/42] add c to optimization --- processes/bargin.py | 1 + processes/optimization.py | 2 +- processes/plot.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/processes/bargin.py b/processes/bargin.py index d1b44bd..c51b56a 100644 --- a/processes/bargin.py +++ b/processes/bargin.py @@ -70,6 +70,7 @@ def gen_param_grid(df): df["c_bucket"] = pd.cut(df["c"], intervals, include_lowest =True) df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True) df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'c_bucket', 'NE_bucket'])[['a_bucket']].transform('count') + df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'NE_bucket'])[['a_bucket']].transform('count') print(df["param_bucket_count"]) diff --git a/processes/optimization.py b/processes/optimization.py index e6eaa2b..6333091 100644 --- a/processes/optimization.py +++ b/processes/optimization.py @@ -45,7 +45,7 @@ def callback(x): initial_parameters = [1] * 8 store_params(dataset_folder, name, initial_parameters, 0) - res = minimize(grid_bargin(df, M), initial_parameters, 
bounds=[(1e-32,100)] * 6, + res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,100)] * 8, callback = callback) print(res) diff --git a/processes/plot.py b/processes/plot.py index a3191e4..91b106e 100644 --- a/processes/plot.py +++ b/processes/plot.py @@ -95,7 +95,7 @@ def plot_fitness_evolution(df, M, params, name): param_serie = params[params["name"].str.startswith("{}_".format(name))].copy() param_serie["iteration"] = param_serie["name"].str.extract("_(\d+)$").astype(int) param_serie["fitness"] = param_serie[ - ["alfa_a", "beta_a", "alfa_b", "beta_b", "alfa_N", "beta_N"] + ["alfa_a", "beta_a", "alfa_b", "beta_b", "alfa_c", "beta_c", "alfa_N", "beta_N"] ].apply(lambda row: grid_bargin(df, M)(row), axis=1) ax = param_serie.plot("iteration", "fitness", marker="o") @@ -120,7 +120,7 @@ def plot_validation(df, df_val): ax = df.plot.scatter("density_log","clustering", c="gray") df_val.plot.scatter("density_log","clustering", ax = ax) plt.xlabel("Dlog") - plt.xlim(-5.5,0.01) + plt.xlim(-6,0.01) plt.ylim(-0.01,1.01) df_val.apply(lambda row: annotate_df(row,ax), axis=1) From d6018b2b4df2a1e51c7f6a878578998e9ec90633 Mon Sep 17 00:00:00 2001 From: axelwass Date: Thu, 19 May 2022 14:43:20 +0200 Subject: [PATCH 33/42] set tolerance --- processes/bargin.py | 1 - processes/optimization.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/processes/bargin.py b/processes/bargin.py index c51b56a..d1b44bd 100644 --- a/processes/bargin.py +++ b/processes/bargin.py @@ -70,7 +70,6 @@ def gen_param_grid(df): df["c_bucket"] = pd.cut(df["c"], intervals, include_lowest =True) df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True) df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'c_bucket', 'NE_bucket'])[['a_bucket']].transform('count') - df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'NE_bucket'])[['a_bucket']].transform('count') print(df["param_bucket_count"]) diff --git a/processes/optimization.py b/processes/optimization.py index 6333091..075c083 100644 --- a/processes/optimization.py +++ b/processes/optimization.py @@ -46,7 +46,7 @@ def callback(x): initial_parameters = [1] * 8 store_params(dataset_folder, name, initial_parameters, 0) res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,100)] * 8, - callback = callback) + tol = 0.01, callback = callback) print(res) store_params(dataset_folder, name, res["x"]) \ No newline at end of file From e6fd802c2013fd0f7e35d5e07697ebd58d4f07d1 Mon Sep 17 00:00:00 2001 From: axelwass Date: Thu, 19 May 2022 15:19:15 +0200 Subject: [PATCH 34/42] set tolerance --- processes/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/optimization.py b/processes/optimization.py index 075c083..41e92e5 100644 --- a/processes/optimization.py +++ b/processes/optimization.py @@ -46,7 +46,7 @@ def callback(x): initial_parameters = [1] * 8 store_params(dataset_folder, name, initial_parameters, 0) res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,100)] * 8, - tol = 0.01, callback = callback) + tol = 0.001, callback = callback) print(res) store_params(dataset_folder, name, res["x"]) \ No newline at end of file From 3966b1fe34837063c384b59682e3b0bbe6fb8969 Mon Sep 17 00:00:00 2001 From: axelwass Date: Thu, 19 May 2022 21:10:03 +0200 Subject: [PATCH 35/42] set tolerance --- processes/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/optimization.py b/processes/optimization.py index 41e92e5..06451fa 
100644 --- a/processes/optimization.py +++ b/processes/optimization.py @@ -46,7 +46,7 @@ def callback(x): initial_parameters = [1] * 8 store_params(dataset_folder, name, initial_parameters, 0) res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,100)] * 8, - tol = 0.001, callback = callback) + tol = 1e-4, callback = callback) print(res) store_params(dataset_folder, name, res["x"]) \ No newline at end of file From c48ddeaf79a55cf6a962a7c02e6b283bc7f4fa59 Mon Sep 17 00:00:00 2001 From: axelwass Date: Fri, 20 May 2022 14:49:13 +0200 Subject: [PATCH 36/42] add c to result param list --- processes/result_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/result_dataset.py b/processes/result_dataset.py index 623a9df..0df6a2f 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -19,7 +19,7 @@ def generate_result_dataset( if from_file: df = pd.read_csv(param_file) params = df[df["name"] == name].iloc[-1][[ - "alfa_a", "beta_a", "alfa_b", "beta_b", "alfa_N", "beta_N" + "alfa_a", "beta_a", "alfa_b", "beta_b", "alfa_c", "beta_c", "alfa_N", "beta_N" ]] else: params = custom_weights From 2098a22ce04cfd84c11d9d40ffcd53e54b1041a3 Mon Sep 17 00:00:00 2001 From: axelwass Date: Fri, 20 May 2022 14:51:22 +0200 Subject: [PATCH 37/42] add c to result param list --- processes/result_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/processes/result_dataset.py b/processes/result_dataset.py index 0df6a2f..c4b93d1 100644 --- a/processes/result_dataset.py +++ b/processes/result_dataset.py @@ -41,8 +41,7 @@ def generate_result_dataset( a = beta_rvs_shifted(alfa_a, beta_a, 1/4, 1) b = beta_rvs_shifted(alfa_b, beta_b, max(0,1-3*a), min(a, 1-a)) - c = beta_rvs_shifted(alfa_c, beta_c, max(0,1-3*a), min(a, 1-a)) - #c = random.uniform(max(0,1-2*a-b), min(a, 1-a-b)) + c = beta_rvs_shifted(alfa_c, beta_c, max(0,1-2*a-b), min(a, 1-a-b)) d = 1 - a - b - c From 9ce60a1893c6663fddccbb503f2e8123372e36b2 Mon Sep 17 00:00:00 2001 From: axelwass Date: Mon, 23 May 2022 09:28:19 +0200 Subject: [PATCH 38/42] add initial weights to optimization, rollback tolerance --- GraphlaxyDataGen.py | 13 ++++++++----- processes/optimization.py | 10 +++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/GraphlaxyDataGen.py b/GraphlaxyDataGen.py index 7b7fe6e..7c7c3a7 100644 --- a/GraphlaxyDataGen.py +++ b/GraphlaxyDataGen.py @@ -70,9 +70,9 @@ def generate(self): help = "The min and max vallue the edges argument can take.", default= (100000, 2000000)) parser.add_argument('-m', '--multiprocess', action="store_true", help = "Add to take advantage of multiple cores.") - parser.add_argument('-w', "--custom-weights", nargs = 6, metavar = "float", type = float, + parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float, help = "List of waights for the beta distributions.", - default= (1.3248478655922757,1.5089650653752031,0.5872691608423339,1.4899436857070154,0.14698869990820493,0.33680332568511046)) + default= (1,1,1,1,1,1,1,1)) @@ -120,9 +120,9 @@ def plot(self): parser.add_argument('-p', "--plot-selection", nargs = '+', metavar = "str", type = str, help = "Selects the plots to make. 
Posible values: {}".format(choices), default= default, choices= choices) - parser.add_argument('-w', "--custom-weights", nargs = 6, metavar = "float", type = float, + parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float, help = "List of waights for the beta distributions.", - default= (1.3248478655922757,1.5089650653752031,0.5872691608423339,1.4899436857070154,0.14698869990820493,0.33680332568511046)) + default= ((1,1,1,1,1,1,1,1))) choices = ["custom", "initial"] parser.add_argument('-ws', "--weight-source", metavar = "str", type = str, help = "Where to get the waights used for the plot from. Posible values: {}".format(choices), default= "custom", @@ -176,11 +176,14 @@ def optimize(self): help = "Folder where the dataset is.", default= "../baseline_dataset") parser.add_argument('-g', "--grid-size", metavar = "int", type = int, help = "The number of rows and columns the grid has.", default=15) + parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float, + help = "Initial weights for optimization.", + default= [1] * 8) args = parser.parse_args(sys.argv[3:]) from processes.optimization import optimize - optimize(args.name, args.folder, args.grid_size) + optimize(args.name, args.folder, args.grid_size, args.custom_weights) if __name__ == "__main__": diff --git a/processes/optimization.py b/processes/optimization.py index 06451fa..6183639 100644 --- a/processes/optimization.py +++ b/processes/optimization.py @@ -24,7 +24,8 @@ def store_params(dataset_folder, name, params, i = None): def optimize( name = 'result', dataset_folder = "../baseline_dataset", - grid_size = 10): + grid_size = 10, + custom_weights = [1] * 8): df_m = pd.read_csv(Path(dataset_folder, "dataset_metrics.csv")) df_d = pd.read_csv(Path(dataset_folder, "dataset_description.csv")) @@ -43,10 +44,9 @@ def callback(x): store_params(dataset_folder, name, x, i) i += 1 - initial_parameters = [1] * 8 - store_params(dataset_folder, name, initial_parameters, 0) - res = minimize(grid_bargin(df, M), initial_parameters, bounds=[(1e-32,100)] * 8, - tol = 1e-4, callback = callback) + store_params(dataset_folder, name, custom_weights, 0) + res = minimize(grid_bargin(df, M), custom_weights, bounds=[(1e-32,100)] * 8, + callback = callback) print(res) store_params(dataset_folder, name, res["x"]) \ No newline at end of file From 6b7aa4cfe5d0e1fb7a89ed544421ad00cae14721 Mon Sep 17 00:00:00 2001 From: axelwass Date: Tue, 24 May 2022 10:47:42 +0200 Subject: [PATCH 39/42] default weights set --- GraphlaxyDataGen.py | 4 ++-- processes/bargin.py | 3 --- processes/optimization.py | 2 +- processes/plot.py | 19 ++++++++++--------- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/GraphlaxyDataGen.py b/GraphlaxyDataGen.py index 7c7c3a7..eb7c4b5 100644 --- a/GraphlaxyDataGen.py +++ b/GraphlaxyDataGen.py @@ -72,7 +72,7 @@ def generate(self): parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float, help = "List of waights for the beta distributions.", - default= (1,1,1,1,1,1,1,1)) + default= [1.3500523980958758,0.9756729865636893,1.4562248430720026,0.22767153268062393,1.055699069458428,0.9060404341929743,0.35052426603213255,1.157122011830607]) @@ -178,7 +178,7 @@ def optimize(self): help = "The number of rows and columns the grid has.", default=15) parser.add_argument('-w', "--custom-weights", nargs = 8, metavar = "float", type = float, help = "Initial weights for optimization.", - default= [1] * 8) + default= 
[1.3500523980958758,0.9756729865636893,1.4562248430720026,0.22767153268062393,1.055699069458428,0.9060404341929743,0.35052426603213255,1.157122011830607]) args = parser.parse_args(sys.argv[3:]) diff --git a/processes/bargin.py b/processes/bargin.py index d1b44bd..4763032 100644 --- a/processes/bargin.py +++ b/processes/bargin.py @@ -23,11 +23,9 @@ def gen_metric_grid(df, metrics, m): def interval_b(a): return (max(0,1-3*a), min(a, 1-a)) - #(max(0,(1 - 2 * a)/2), min(a, (1-a)/2)) def interval_c(a,b): return (max(0,1-2*a-b), min(a, 1-a-b)) - #((1-a-b)/2, min(b, 1-a-b)) def interval_c_mean(a, b): @@ -70,7 +68,6 @@ def gen_param_grid(df): df["c_bucket"] = pd.cut(df["c"], intervals, include_lowest =True) df["NE_bucket"] = pd.cut(df["NE"], intervals, include_lowest =True) df["param_bucket_count"] = df.groupby(['a_bucket', 'b_bucket', 'c_bucket', 'NE_bucket'])[['a_bucket']].transform('count') - print(df["param_bucket_count"]) def gen_weights(df, res): diff --git a/processes/optimization.py b/processes/optimization.py index 6183639..78a95e5 100644 --- a/processes/optimization.py +++ b/processes/optimization.py @@ -46,7 +46,7 @@ def callback(x): store_params(dataset_folder, name, custom_weights, 0) res = minimize(grid_bargin(df, M), custom_weights, bounds=[(1e-32,100)] * 8, - callback = callback) + tol = 1e-3, callback = callback) print(res) store_params(dataset_folder, name, res["x"]) \ No newline at end of file diff --git a/processes/plot.py b/processes/plot.py index 91b106e..5b3d403 100644 --- a/processes/plot.py +++ b/processes/plot.py @@ -20,21 +20,20 @@ def annotate_df(row, ax): color='darkslategrey') def plot_paramdensity(res, s): - alfa_a, beta_a, alfa_b, beta_b, alfa_N, beta_N = res + alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N = res param_list = [] for _ in range(s): a = beta_rvs_shifted(alfa_a, beta_a, 1/3, 1) - print(a, max(0,1-2*a), min(a, 1-a)) - b = beta_rvs_shifted(alfa_b, beta_b, max(0,1-2*a), min(a, 1-a)) - #c = beta_rvs_shifted(alfa_c, beta_c, (1-a-b)/2, min(b, 1-a-b)) - d = 1-a-b - params = {'a': a, 'b': b, 'd': d} + b = beta_rvs_shifted(alfa_b, beta_b, max(0,1-3*a), min(a, 1-a)) + c = beta_rvs_shifted(alfa_c, beta_c, max(0,1-2*a-b), min(a, 1-a-b)) + d = 1-a-b-c + params = {'a': a, 'b': b, 'c': c, 'd': d} param_list.append(params) df = pd.DataFrame(param_list) plt.figure() - plt.hist(df, bins=20, label=["a","b","d"], stacked=False, density=True) - plt.xlabel("parameter value") - plt.ylabel("probability density") + plt.hist(df, bins=20, label=["a","b", "c","d"], stacked=False, density=True) + plt.xlabel("value") + plt.ylabel("density") plt.legend() plt.xlim(-0,1) plt.ylim(0,20) @@ -69,6 +68,8 @@ def plot_sample_paramdist(res): plt.plot(index, beta.pdf(index,alfa_b, beta_b), label='b') plt.plot(index, beta.pdf(index,alfa_c, beta_c), label='c') plt.plot(index, beta.pdf(index,alfa_N, beta_N), label='N') + plt.xlabel("value (before shifting and scaling)") + plt.ylabel("density") plt.legend() From cce08f56e3dd0802984c8a6dfbf16792256b7373 Mon Sep 17 00:00:00 2001 From: axelwass Date: Tue, 24 May 2022 10:52:42 +0200 Subject: [PATCH 40/42] updated validation image --- img/result/partial_validation.svg | 3430 +++++++++++++---------------- 1 file changed, 1559 insertions(+), 1871 deletions(-) diff --git a/img/result/partial_validation.svg b/img/result/partial_validation.svg index 9e49000..c6bd13e 100644 --- a/img/result/partial_validation.svg +++ b/img/result/partial_validation.svg @@ -1,12 +1,12 @@ - + - 2022-04-11T09:16:07.283194 + 2022-05-23T18:29:16.029792 
[... roughly 3,400 lines of regenerated SVG markup elided: path data, DejaVuSans glyph definitions, and axis-tick/scatter-marker <use> elements for the updated partial_validation.svg figure ...]
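Before the final two patches, it helps to spell out what the fitted defaults introduced in PATCH 39 mean: the eight weights are the (alfa, beta) pairs of four Beta distributions over the RMAT parameters a, b, c and the node-count ratio. The following is a minimal sketch of one parameter draw, mirroring the sampling loop in processes/result_dataset.py as of PATCH 37. `beta_rvs_shifted` is defined in utils/probability.py, outside these patches, so the scipy stand-in below is an assumption that it rescales a Beta(alfa, beta) sample into [lo, hi]; the real N draw additionally uses a discrete variant (`beta_rvs_discrete_shifted`).

```
import numpy as np
from scipy.stats import beta

def beta_rvs_shifted(alfa, beta_, lo, hi):
    # assumed equivalent of utils.probability.beta_rvs_shifted:
    # a Beta(alfa, beta_) sample rescaled from [0, 1] into [lo, hi]
    return beta.rvs(alfa, beta_, loc=lo, scale=hi - lo)

# fitted default weights from PATCH 39
(alfa_a, beta_a, alfa_b, beta_b, alfa_c, beta_c, alfa_N, beta_N) = (
    1.3500523980958758, 0.9756729865636893, 1.4562248430720026, 0.22767153268062393,
    1.055699069458428, 0.9060404341929743, 0.35052426603213255, 1.157122011830607)

E = np.random.randint(100000, 2000000)             # --edges-between default range
n_0 = np.floor(np.sqrt(E * 20))                    # node floor, keeps density below 0.1
N = int(beta_rvs_shifted(alfa_N, beta_N, n_0, E))  # discrete variant in the real code
a = beta_rvs_shifted(alfa_a, beta_a, 1/4, 1)       # a is the dominant RMAT quadrant
b = beta_rvs_shifted(alfa_b, beta_b, max(0, 1 - 3*a), min(a, 1 - a))
c = beta_rvs_shifted(alfa_c, beta_c, max(0, 1 - 2*a - b), min(a, 1 - a - b))
d = 1 - a - b - c                                  # the bounds above guarantee d >= 0
```

The max/min bounds are exactly the feasibility intervals the series converges on in patches 28-37: each keeps b, c, and the remainder d no larger than a while leaving enough probability mass for the remaining quadrants, so a + b + c + d = 1 with all four non-negative.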
From 3667660108f679a852b7c0e708c9660dbad25746 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Wed, 25 May 2022 17:04:30 +0200
Subject: [PATCH 41/42] replace head for sample in plot

---
 processes/plot.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/processes/plot.py b/processes/plot.py
index 5b3d403..870910e 100644
--- a/processes/plot.py
+++ b/processes/plot.py
@@ -133,7 +133,7 @@ def figure_print(show, folder, name, format):
 def plot(
     dataset_folder = "../baseline_dataset",
     validation_metrics = "../validation_dataset/dataset_metrics.csv",
-    samples = 1000,
+    samples = 0,
     show = True,
     format = 'svg',
     output_folder = "../plots/initial",
@@ -157,7 +157,7 @@ def plot(
     df_m = pd.read_csv(Path(dataset_folder, "dataset_metrics.csv"))
     df_d = pd.read_csv(Path(dataset_folder, "dataset_description.csv"))
     df_b = pd.merge(df_m, df_d, on="name")
-    df_b = df_b.head(samples)
+    df_b = df_b.sample(samples) if samples > 0 else df_b
     df_b["NE"] = df_b["N"] / df_b["E"]
     df_b["diff"] = df_b["a"] - df_b["d"]

From 1dfe30d10cb2134493197b9c607c3baeb5710ff7 Mon Sep 17 00:00:00 2001
From: axelwass
Date: Tue, 31 May 2022 15:46:03 +0200
Subject: [PATCH 42/42] lower limit for edges + add link to paper

---
 README.md | 10 ++++++++--
 utils/rmat.py | 17 +++++++++--------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 671d401..6689e57 100644
--- a/README.md
+++ b/README.md
@@ -189,9 +189,15 @@ This work was done by the Barcelona Neural Network group (BNN) and is part of a

 ## Citing

-This repository is associated with a Paper. If you are using it for a study, please cite.
+This repository is associated with the paper ["Bias Reduction via Cooperative Bargaining in Synthetic Graph Dataset Generation"](https://arxiv.org/abs/2205.13901). If you are using it for a study, please cite.

-The citation text and BibTex will be available once the paper is published.
+```
+@inproceedings{Wassington2022BiasRV,
+  title={Bias Reduction via Cooperative Bargaining in Synthetic Graph Dataset Generation},
+  author={Axel Wassington and S. Abadal},
+  year={2022}
+}
+```

 ## License

diff --git a/utils/rmat.py b/utils/rmat.py
index 38ac5c5..dca1e9a 100644
--- a/utils/rmat.py
+++ b/utils/rmat.py
@@ -16,12 +16,13 @@ def rmat_to_file(N, E, a, b, c, d, dataset_folder, s):
     Graph = nk.graph.Graph(Graph, False, False) # To undirected and unweigted
     Graph.removeSelfLoops()
     Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(Graph, compactGraph = True)
-    name = 'RMAT_{}.txt'.format(s)
-    out_filename = Path(dataset_folder,'graphs',name)
-    print("Wrinting to:" + str(out_filename))
-    nk.writeGraph(Graph, str(out_filename), nk.Format.EdgeListTabOne)
-    with lock:
-        add_to_csv(Path(dataset_folder,"dataset_description.csv"), {
-            'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': name, 'scale': scale, 'factor': factor, 'reduce': reduce
-        })
+    if Graph.numberOfEdges() > 100:
+        name = 'RMAT_{}.txt'.format(s)
+        out_filename = Path(dataset_folder,'graphs',name)
+        print("Writing to:" + str(out_filename))
+        nk.writeGraph(Graph, str(out_filename), nk.Format.EdgeListTabOne)
+        with lock:
+            add_to_csv(Path(dataset_folder,"dataset_description.csv"), {
+                'N': N, 'E':E, 'a': a, 'b': b, 'c': c, 'd': d, 'name': name, 'scale': scale, 'factor': factor, 'reduce': reduce
+            })
+
     return s
\ No newline at end of file
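As a closing note, the net state of utils/rmat.py after this series is easier to read in one piece than through the toggling hunks of patches 21-25 and 42. The condensed sketch below mirrors the file as of PATCH 42 using only calls that appear verbatim in the hunks; the `scale` line is an assumption, since that computation sits above every hunk shown here, and the CSV bookkeeping under `lock` is omitted.

```
import numpy as np
import networkit as nk

def rmat_to_file_sketch(N, E, a, b, c, d, out_path):
    scale = int(np.ceil(np.log2(N)))    # assumed: smallest power of two holding N nodes
    factor = E / N                      # average edges per node
    reduce = np.power(2, scale) - N     # surplus vertices dropped via reduceNodes
    Graph = nk.generators.RmatGenerator(scale, factor, a, b, c, d,
                                        weighted = True, reduceNodes = reduce).generate()
    Graph = nk.graph.Graph(Graph, False, False)  # to undirected and unweighted
    Graph.removeSelfLoops()
    Graph = nk.components.ConnectedComponents(Graph).extractLargestConnectedComponent(
        Graph, compactGraph = True)
    if Graph.numberOfEdges() > 100:     # PATCH 42: skip graphs that collapsed
        nk.writeGraph(Graph, str(out_path), nk.Format.EdgeListTabOne)
    return Graph
```

The 100-edge guard matters because self-loop removal and largest-component extraction can shrink a small or badly parameterized graph drastically; without it, near-empty graphs would still be written and registered in dataset_description.csv.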