diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..0f339e9 --- /dev/null +++ b/TODO.md @@ -0,0 +1,3 @@ +- remapping is very brittle as it relies on `get_neighbour_colours` + - this function depends on neighbour container (colour indices, as well as sorting) which depends on `multiset_hash` flag and the WL feature generation algorithm being used + - `bulk_pruner` also uses this function but assumes WL feature generation only, i.e. each tuple only gives one node \ No newline at end of file diff --git a/include/feature_generation/feature_generators/iwl.hpp b/include/feature_generation/feature_generators/iwl.hpp index 57c7311..58d35f7 100644 --- a/include/feature_generation/feature_generators/iwl.hpp +++ b/include/feature_generation/feature_generators/iwl.hpp @@ -34,7 +34,8 @@ namespace feature_generation { void collect_impl(const std::vector &graphs) override; void refine(const std::shared_ptr &graph, std::vector &colours, - std::vector &colours_tmp); + std::vector &colours_tmp, + int iteration); }; } // namespace feature_generation diff --git a/include/feature_generation/feature_generators/kwl2.hpp b/include/feature_generation/feature_generators/kwl2.hpp index d8fe82e..66de5cd 100644 --- a/include/feature_generation/feature_generators/kwl2.hpp +++ b/include/feature_generation/feature_generators/kwl2.hpp @@ -31,7 +31,7 @@ namespace feature_generation { Embedding embed(const std::shared_ptr &graph) override; protected: - std::vector get_neighbour_colour_indices(const std::vector &colours); + std::vector> get_neighbour_colours(const std::vector &colours); inline int get_initial_colour(int index, int u, int v, @@ -40,7 +40,8 @@ namespace feature_generation { void collect_impl(const std::vector &graphs) override; void refine(const std::shared_ptr &graph, std::vector &colours, - std::vector &colours_tmp); + std::vector &colours_tmp, + int iteration); }; } // namespace feature_generation diff --git a/include/feature_generation/feature_generators/lwl2.hpp b/include/feature_generation/feature_generators/lwl2.hpp index c6b2bf1..9bc1b7b 100644 --- a/include/feature_generation/feature_generators/lwl2.hpp +++ b/include/feature_generation/feature_generators/lwl2.hpp @@ -33,7 +33,8 @@ namespace feature_generation { void refine(const std::shared_ptr &graph, std::vector> &pair_to_neighbours, std::vector &colours, - std::vector &colours_tmp); + std::vector &colours_tmp, + int iteration); }; } // namespace feature_generation diff --git a/include/feature_generation/feature_generators/wl.hpp b/include/feature_generation/feature_generators/wl.hpp index 2258736..c915e07 100644 --- a/include/feature_generation/feature_generators/wl.hpp +++ b/include/feature_generation/feature_generators/wl.hpp @@ -28,11 +28,12 @@ namespace feature_generation { Embedding embed(const std::shared_ptr &graph) override; protected: - std::vector get_neighbour_colour_indices(const std::vector &colours); + std::vector> get_neighbour_colours(const std::vector &colours); void collect_impl(const std::vector &graphs) override; void refine(const std::shared_ptr &graph, std::vector &colours, - std::vector &colours_tmp); + std::vector &colours_tmp, + int iteration); }; } // namespace feature_generation diff --git a/include/feature_generation/features.hpp b/include/feature_generation/features.hpp index 60dd1b7..d4e8ba2 100644 --- a/include/feature_generation/features.hpp +++ b/include/feature_generation/features.hpp @@ -45,7 +45,8 @@ class int_vector_hasher { namespace feature_generation { using Embedding = std::vector; - using ColourHash = std::unordered_map, int, int_vector_hasher>; + using VecColourHash = std::vector, int, int_vector_hasher>>; + using StrColourHash = std::vector>; class Features { protected: @@ -58,7 +59,7 @@ namespace feature_generation { bool multiset_hash; // colouring [saved] - ColourHash colour_hash; + VecColourHash colour_hash; std::unordered_map colour_to_layer; std::vector> layer_to_colours; @@ -72,7 +73,6 @@ namespace feature_generation { bool collected; bool pruned; bool collecting; - int cur_collecting_layer; std::shared_ptr neighbour_container; // runtime statistics; int is faster than long but could cause overflow @@ -90,12 +90,16 @@ namespace feature_generation { std::vector convert_to_graphs(const data::Dataset dataset); // get hashed colour if it exists, and constructs it if it doesn't - int get_colour_hash(const std::vector &colour); + int get_colour_hash(const std::vector &colour, const int iteration); // reformat colour hash based on colours to throw out - void init_layer_to_colours(); + VecColourHash new_colour_hash() const; + std::vector> new_layer_to_colours() const; std::map remap_colour_hash(const std::set &to_prune); - virtual std::vector get_neighbour_colour_indices(const std::vector &colours) = 0; + + // TODO redesign this with neighbour container + virtual std::vector> + get_neighbour_colours(const std::vector &colours) = 0; std::vector remap_neighbour_colours(const std::vector &colours, const std::map &remap); @@ -128,6 +132,8 @@ namespace feature_generation { Embedding embed_state(const planning::State &state); virtual Embedding embed(const std::shared_ptr &graph) = 0; + void add_colour_to_x(int colour, int iteration, std::vector &x); + /* Pruning functions */ void prune_this_iteration(int iteration, @@ -161,7 +167,7 @@ namespace feature_generation { std::set get_iteration_colours(int iteration) const { return layer_to_colours.at(iteration); } - ColourHash get_colour_hash() { return colour_hash; } + VecColourHash get_colour_hash() { return colour_hash; } /* Util functions */ @@ -173,11 +179,11 @@ namespace feature_generation { void set_problem(const planning::Problem &problem); // conversion between vectors and strings - ColourHash str_to_int_colour_hash(std::unordered_map str_colour_hash) const; - std::unordered_map int_to_str_colour_hash(ColourHash int_colour_hash) const; + VecColourHash str_to_int_colour_hash(StrColourHash str_colour_hash) const; + StrColourHash int_to_str_colour_hash(VecColourHash int_colour_hash) const; // statistics functions - int get_n_features() const { return colour_hash.size(); } + int get_n_features() const; std::vector get_seen_counts() const { return seen_colour_statistics[1]; }; std::vector get_unseen_counts() const { return seen_colour_statistics[0]; }; int get_n_seen_graphs() const { return n_seen_graphs; } diff --git a/setup.py b/setup.py index 8deeeef..fc38955 100644 --- a/setup.py +++ b/setup.py @@ -7,11 +7,14 @@ # Read version from wlplan/__version__.py file exec(open("wlplan/__version__.py").read()) +# Debug mode +_DEBUG = False + # Compiler flags -COMPILER_FLAGS = [ - "-O3", - # "-DDEBUGMODE", -] +if _DEBUG: + COMPILER_FLAGS = ["-O0", "-g", "-DDEBUGMODE"] +else: + COMPILER_FLAGS = ["-O3"] files = [glob("src/*.cpp"), glob("src/**/*.cpp"), glob("src/**/**/*.cpp")] diff --git a/src/feature_generation/feature_generators/ccwl.cpp b/src/feature_generation/feature_generators/ccwl.cpp index 3a48a21..3422486 100644 --- a/src/feature_generation/feature_generators/ccwl.cpp +++ b/src/feature_generation/feature_generators/ccwl.cpp @@ -29,7 +29,7 @@ namespace feature_generation { // To change this to max, we just need to replace += occurrences with std::max. /* 1. Initialise embedding before pruning */ - int categorical_size = colour_hash.size(); + int categorical_size = get_n_features(); Embedding x0(categorical_size * 2, 0); /* 2. Set up memory for WL updates */ @@ -41,7 +41,7 @@ namespace feature_generation { int col; int is_seen_colour; for (int node_i = 0; node_i < n_nodes; node_i++) { - col = get_colour_hash({graph->nodes[node_i]}); + col = get_colour_hash({graph->nodes[node_i]}, 0); colours[node_i] = col; is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction seen_colour_statistics[is_seen_colour][0]++; @@ -53,7 +53,7 @@ namespace feature_generation { /* 4. Main WL loop */ for (int itr = 1; itr < iterations + 1; itr++) { - refine(graph, colours, colours_tmp); + refine(graph, colours, colours_tmp, itr); for (int node_i = 0; node_i < n_nodes; node_i++) { col = colours[node_i]; is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction diff --git a/src/feature_generation/feature_generators/iwl.cpp b/src/feature_generation/feature_generators/iwl.cpp index 301d1f8..bbef2fb 100644 --- a/src/feature_generation/feature_generators/iwl.cpp +++ b/src/feature_generation/feature_generators/iwl.cpp @@ -29,7 +29,8 @@ namespace feature_generation { void IWLFeatures::refine(const std::shared_ptr &graph, std::vector &colours, - std::vector &colours_tmp) { + std::vector &colours_tmp, + int iteration) { // memory for storing string and hashed int representation of colours std::vector new_colour; std::vector neighbour_vector; @@ -61,7 +62,7 @@ namespace feature_generation { new_colour.insert(new_colour.end(), neighbour_vector.begin(), neighbour_vector.end()); // hash seen colours - new_colour_compressed = get_colour_hash(new_colour); + new_colour_compressed = get_colour_hash(new_colour, iteration); end_of_iteration: colours_tmp[u] = new_colour_compressed; @@ -93,17 +94,15 @@ namespace feature_generation { graph->change_node_colour(node_i, INDIVIDUALISE_COLOUR); // init colours - cur_collecting_layer = 0; for (int u = 0; u < n_nodes; u++) { - int col = get_colour_hash({graph->nodes[u]}); + int col = get_colour_hash({graph->nodes[u]}, 0); colours[u] = col; seen_initial_colours.insert(col); } // main WL loop for (int iteration = 1; iteration < iterations + 1; iteration++) { - cur_collecting_layer = iteration; - refine(graph, colours, colours_tmp); + refine(graph, colours, colours_tmp, iteration); } // reset node colour @@ -119,7 +118,7 @@ namespace feature_generation { } /* 1. Initialise embedding */ - Embedding x0(colour_hash.size(), 0); + Embedding x0(get_n_features(), 0); /* 2. Set up memory for WL updates */ int n_nodes = graph->nodes.size(); @@ -134,22 +133,17 @@ namespace feature_generation { graph->change_node_colour(node_i, INDIVIDUALISE_COLOUR); /* 3. Compute initial colours */ - int is_seen_colour; for (int u = 0; u < n_nodes; u++) { - int col = get_colour_hash({graph->nodes[u]}); + int col = get_colour_hash({graph->nodes[u]}, 0); colours[u] = col; - is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction - seen_colour_statistics[is_seen_colour][0]++; - x0[col] += is_seen_colour; + add_colour_to_x(col, 0, x0); } /* 4. Main WL loop */ for (int itr = 1; itr < iterations + 1; itr++) { - refine(graph, colours, colours_tmp); + refine(graph, colours, colours_tmp, itr); for (const int col : colours) { - is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction - seen_colour_statistics[is_seen_colour][itr]++; - x0[col] += is_seen_colour; + add_colour_to_x(col, itr, x0); } } diff --git a/src/feature_generation/feature_generators/kwl2.cpp b/src/feature_generation/feature_generators/kwl2.cpp index 552e03e..dc262c4 100644 --- a/src/feature_generation/feature_generators/kwl2.cpp +++ b/src/feature_generation/feature_generators/kwl2.cpp @@ -26,10 +26,11 @@ namespace feature_generation { KWL2Features::KWL2Features(const std::string &filename) : Features(filename) {} - std::vector KWL2Features::get_neighbour_colour_indices(const std::vector &colours) { + std::vector> + KWL2Features::get_neighbour_colours(const std::vector &colours) { std::cout << "not implemented yet" << std::endl; exit(-1); - return std::vector(); + return std::vector>(); } int kwl2_pair_to_index_map(int n, int i, int j) { @@ -41,7 +42,8 @@ namespace feature_generation { void KWL2Features::refine(const std::shared_ptr &graph, std::vector &colours, - std::vector &colours_tmp) { + std::vector &colours_tmp, + int iteration) { // memory for storing string and hashed int representation of colours std::vector new_colour; std::vector neighbour_vector; @@ -78,7 +80,7 @@ namespace feature_generation { new_colour.insert(new_colour.end(), neighbour_vector.begin(), neighbour_vector.end()); // hash seen colours - new_colour_compressed = get_colour_hash(new_colour); + new_colour_compressed = get_colour_hash(new_colour, iteration); end_of_iteration: colours_tmp[index] = new_colour_compressed; @@ -109,7 +111,7 @@ namespace feature_generation { int u_col = graph->nodes[u]; int v_col = graph->nodes[v]; int e_col = pair_to_edge_label[index]; - int col = get_colour_hash({u_col, v_col, e_col}); + int col = get_colour_hash({u_col, v_col, e_col}, 0); return col; } @@ -136,7 +138,6 @@ namespace feature_generation { std::vector pair_to_edge_label = get_kwl2_pair_to_edge_label(graph); // init colours - cur_collecting_layer = 0; for (int u = 0; u < n_nodes; u++) { for (int v = 0; v < n_nodes; v++) { int index = kwl2_pair_to_index_map(n_nodes, u, v); @@ -148,8 +149,7 @@ namespace feature_generation { // main WL loop for (int iteration = 1; iteration < iterations + 1; iteration++) { - cur_collecting_layer = iteration; - refine(graph, colours, colours_tmp); + refine(graph, colours, colours_tmp, iteration); } } } @@ -161,7 +161,7 @@ namespace feature_generation { } /* 1. Initialise embedding before pruning */ - Embedding x0(colour_hash.size(), 0); + Embedding x0(get_n_features(), 0); /* 2. Set up memory for WL updates */ int n_nodes = graph->nodes.size(); @@ -172,25 +172,20 @@ namespace feature_generation { std::vector pair_to_edge_label = get_kwl2_pair_to_edge_label(graph); /* 3. Compute initial colours */ - int is_seen_colour; for (int u = 0; u < n_nodes; u++) { for (int v = 0; v < n_nodes; v++) { int index = kwl2_pair_to_index_map(n_nodes, u, v); int col = get_initial_colour(index, u, v, graph, pair_to_edge_label); colours[index] = col; - is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction - seen_colour_statistics[is_seen_colour][0]++; - x0[col] += is_seen_colour; + add_colour_to_x(col, 0, x0); } } /* 4. Main WL loop */ for (int itr = 1; itr < iterations + 1; itr++) { - refine(graph, colours, colours_tmp); + refine(graph, colours, colours_tmp, itr); for (const int col : colours) { - is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction - seen_colour_statistics[is_seen_colour][itr]++; - x0[col] += is_seen_colour; + add_colour_to_x(col, itr, x0); } } diff --git a/src/feature_generation/feature_generators/lwl2.cpp b/src/feature_generation/feature_generators/lwl2.cpp index 7445d4d..93249a4 100644 --- a/src/feature_generation/feature_generators/lwl2.cpp +++ b/src/feature_generation/feature_generators/lwl2.cpp @@ -28,7 +28,8 @@ namespace feature_generation { void LWL2Features::refine(const std::shared_ptr &graph, std::vector> &pair_to_neighbours, std::vector &colours, - std::vector &colours_tmp) { + std::vector &colours_tmp, + int iteration) { // memory for storing string and hashed int representation of colours std::vector new_colour; std::vector neighbour_vector; @@ -74,7 +75,7 @@ namespace feature_generation { new_colour.insert(new_colour.end(), neighbour_vector.begin(), neighbour_vector.end()); // hash seen colours - new_colour_compressed = get_colour_hash(new_colour); + new_colour_compressed = get_colour_hash(new_colour, iteration); end_of_iteration: colours_tmp[index] = new_colour_compressed; @@ -124,7 +125,7 @@ namespace feature_generation { int u_col = graph->nodes[u]; int v_col = graph->nodes[v]; int e_col = pair_to_edge_label[index]; - int col = get_colour_hash({std::min(u_col, v_col), std::max(u_col, v_col), e_col}); + int col = get_colour_hash({std::min(u_col, v_col), std::max(u_col, v_col), e_col}, 0); return col; } @@ -152,7 +153,6 @@ namespace feature_generation { std::vector> pair_to_neighbours = get_lwl2_pair_to_neighbours(graph); // init colours - cur_collecting_layer = 0; for (int u = 0; u < n_nodes; u++) { for (int v = u + 1; v < n_nodes; v++) { int index = lwl2_pair_to_index_map(n_nodes, u, v); @@ -164,8 +164,7 @@ namespace feature_generation { // main WL loop for (int iteration = 1; iteration < iterations + 1; iteration++) { - cur_collecting_layer = iteration; - refine(graph, pair_to_neighbours, colours, colours_tmp); + refine(graph, pair_to_neighbours, colours, colours_tmp, iteration); } } } @@ -177,7 +176,7 @@ namespace feature_generation { } /* 1. Initialise embedding before pruning */ - Embedding x0(colour_hash.size(), 0); + Embedding x0(get_n_features(), 0); /* 2. Set up memory for WL updates */ int n_nodes = graph->nodes.size(); @@ -189,25 +188,20 @@ namespace feature_generation { std::vector> pair_to_neighbours = get_lwl2_pair_to_neighbours(graph); /* 3. Compute initial colours */ - int is_seen_colour; for (int u = 0; u < n_nodes; u++) { for (int v = u + 1; v < n_nodes; v++) { int index = lwl2_pair_to_index_map(n_nodes, u, v); int col = get_initial_colour(index, u, v, graph, pair_to_edge_label); colours[index] = col; - is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction - seen_colour_statistics[is_seen_colour][0]++; - x0[col] += is_seen_colour; + add_colour_to_x(col, 0, x0); } } /* 4. Main WL loop */ for (int itr = 1; itr < iterations + 1; itr++) { - refine(graph, pair_to_neighbours, colours, colours_tmp); + refine(graph, pair_to_neighbours, colours, colours_tmp, itr); for (const int col : colours) { - is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction - seen_colour_statistics[is_seen_colour][itr]++; - x0[col] += is_seen_colour; + add_colour_to_x(col, itr, x0); } } diff --git a/src/feature_generation/feature_generators/wl.cpp b/src/feature_generation/feature_generators/wl.cpp index 6e6de82..c82b562 100644 --- a/src/feature_generation/feature_generators/wl.cpp +++ b/src/feature_generation/feature_generators/wl.cpp @@ -27,12 +27,19 @@ namespace feature_generation { WLFeatures::WLFeatures(const std::string &filename) : Features(filename) {} - std::vector WLFeatures::get_neighbour_colour_indices(const std::vector &colours) { - std::vector ret = {0}; - for (size_t i = 1; i < colours.size(); i++) { - // see neighbour container - if ((multiset_hash && (i % 3 == 2)) || (!multiset_hash && (i % 2 == 0))) { - ret.push_back(i); + std::vector> + WLFeatures::get_neighbour_colours(const std::vector &colours) { + std::vector> ret; + if (multiset_hash) { + for (size_t i = 1; i < colours.size(); i += 3) { + int occurrences = colours[i + 2]; + for (int j = 0; j < occurrences; j++) { + ret.push_back(std::make_pair(colours[i + 1], colours[i])); + } + } + } else { + for (size_t i = 1; i < colours.size(); i += 2) { + ret.push_back(std::make_pair(colours[i + 1], colours[i])); } } return ret; @@ -40,7 +47,8 @@ namespace feature_generation { void WLFeatures::refine(const std::shared_ptr &graph, std::vector &colours, - std::vector &colours_tmp) { + std::vector &colours_tmp, + int iteration) { // memory for storing string and hashed int representation of colours std::vector new_colour; std::vector neighbour_vector; @@ -74,7 +82,7 @@ namespace feature_generation { new_colour.insert(new_colour.end(), neighbour_vector.begin(), neighbour_vector.end()); // hash seen colours - new_colour_compressed = get_colour_hash(new_colour); + new_colour_compressed = get_colour_hash(new_colour, iteration); end_of_iteration: colours_tmp[u] = new_colour_compressed; @@ -90,8 +98,7 @@ namespace feature_generation { // init colours n_seen_graphs += graphs.size(); - cur_collecting_layer = 0; - std::cout << "collecting iteration " << cur_collecting_layer << std::endl; + std::cout << "collecting iteration 0" << std::endl; for (size_t graph_i = 0; graph_i < graphs.size(); graph_i++) { const auto graph = std::make_shared(graphs[graph_i]); int n_nodes = graph->nodes.size(); @@ -101,7 +108,7 @@ namespace feature_generation { std::vector colours(n_nodes, 0); for (int node_i = 0; node_i < n_nodes; node_i++) { - int col = get_colour_hash({graph->nodes[node_i]}); + int col = get_colour_hash({graph->nodes[node_i]}, 0); colours[node_i] = col; seen_initial_colours.insert(col); } @@ -111,12 +118,11 @@ namespace feature_generation { // main WL loop for (int iteration = 1; iteration < iterations + 1; iteration++) { - cur_collecting_layer = iteration; - std::cout << "collecting iteration " << cur_collecting_layer << std::endl; + std::cout << "collecting iteration " << iteration << std::endl; for (size_t graph_i = 0; graph_i < graphs.size(); graph_i++) { const auto graph = std::make_shared(graphs[graph_i]); - refine(graph, graph_colours[graph_i], graph_colours_tmp[graph_i]); + refine(graph, graph_colours[graph_i], graph_colours_tmp[graph_i], iteration); } // layer pruning @@ -135,28 +141,23 @@ namespace feature_generation { } /* 1. Initialise embedding before pruning, and set up memory */ - Embedding x0(colour_hash.size(), 0); + Embedding x0(get_n_features(), 0); int n_nodes = graph->nodes.size(); std::vector colours(n_nodes); std::vector colours_tmp(n_nodes); /* 2. Compute initial colours */ - int is_seen_colour; for (int node_i = 0; node_i < n_nodes; node_i++) { - int col = get_colour_hash({graph->nodes[node_i]}); + int col = get_colour_hash({graph->nodes[node_i]}, 0); colours[node_i] = col; - is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction - seen_colour_statistics[is_seen_colour][0]++; - x0[col] += is_seen_colour; + add_colour_to_x(col, 0, x0); } /* 3. Main WL loop */ for (int itr = 1; itr < iterations + 1; itr++) { - refine(graph, colours, colours_tmp); + refine(graph, colours, colours_tmp, itr); for (const int col : colours) { - is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction - seen_colour_statistics[is_seen_colour][itr]++; - x0[col] += is_seen_colour; + add_colour_to_x(col, itr, x0); } } diff --git a/src/feature_generation/features.cpp b/src/feature_generation/features.cpp index 9a0afb9..818c8bf 100644 --- a/src/feature_generation/features.cpp +++ b/src/feature_generation/features.cpp @@ -32,19 +32,30 @@ namespace feature_generation { pruned = false; collecting = false; neighbour_container = std::make_shared(multiset_hash); - seen_colour_statistics = std::vector>(2, std::vector(iterations + 1, 0)); + seen_colour_statistics = + std::vector>(2, std::vector(iterations + 1, 0)); store_weights = false; n_seen_graphs = 0; n_seen_nodes = 0; n_seen_edges = 0; seen_initial_colours = std::set(); - init_layer_to_colours(); + + colour_hash = new_colour_hash(); + layer_to_colours = new_layer_to_colours(); } - void Features::init_layer_to_colours() { + std::vector> Features::new_layer_to_colours() const { // plus 1 because zeroth iteration is also included - layer_to_colours = std::vector>(iterations + 1, std::set()); + return std::vector>(iterations + 1, std::set()); + } + + VecColourHash Features::new_colour_hash() const { + VecColourHash ret; + for (int i = 0; i < iterations + 1; i++) { + ret.push_back(std::unordered_map, int, int_vector_hasher>()); + } + return ret; } Features::Features(const std::string &filename) { @@ -76,8 +87,7 @@ namespace feature_generation { std::cout << "multiset_hash=" << multiset_hash << std::endl; // load colours - std::unordered_map colour_hash_str = - j.at("colour_hash").get>(); + StrColourHash colour_hash_str = j.at("colour_hash").get(); colour_hash = str_to_int_colour_hash(colour_hash_str); colour_to_layer = j.at("colour_to_layer").get>(); @@ -132,70 +142,76 @@ namespace feature_generation { /* Feature generation functions */ - int Features::get_colour_hash(const std::vector &colour) { - if (!collecting && colour_hash.count(colour) == 0) { + int Features::get_colour_hash(const std::vector &colour, const int iteration) { + if (colour.size() == 0) { return UNSEEN_COLOUR; - } else if (collecting && colour_hash.count(colour) == 0) { - int hash = (int)colour_hash.size(); - colour_hash[colour] = hash; - colour_to_layer[hash] = cur_collecting_layer; - layer_to_colours[cur_collecting_layer].insert(hash); + } else if (!collecting && !colour_hash[iteration].count(colour)) { +#ifdef DEBUGMODE + std::cout << "UNSEEN "; + debug_vec(colour); +#endif + return UNSEEN_COLOUR; + } else if (collecting && !colour_hash[iteration].count(colour)) { + int hash = get_n_features(); + colour_hash[iteration][colour] = hash; + colour_to_layer[hash] = iteration; + layer_to_colours[iteration].insert(hash); } - return colour_hash[colour]; + return colour_hash[iteration][colour]; } std::vector Features::remap_neighbour_colours(const std::vector &colours, const std::map &remap) { // make new_colours a copy of colours - std::vector new_colours = colours; + neighbour_container->clear(); #ifdef DEBUGMODE + std::cout << "REMAPPING "; debug_vec(colours); #endif // colours should always show up in remap by their construction - for (const int i : get_neighbour_colour_indices(colours)) { - new_colours[i] = remap.at(colours[i]); + for (const auto &[node_colour, edge_label] : get_neighbour_colours(colours)) { + neighbour_container->insert(remap.at(node_colour), edge_label); + } + + std::vector new_colours = {remap.at(colours[0])}; + for (const int i : neighbour_container->to_vector()) { + new_colours.push_back(i); } + return new_colours; } std::map Features::remap_colour_hash(const std::set &to_prune) { // remap values std::map remap; - std::vector, int>> new_hash_vec; + std::vector, int>>> new_hash_vec( + iterations + 1, std::vector, int>>()); std::unordered_map new_colour_layer; - // layer 0 colours (init colours) should remain consistent - for (const auto &[key, val] : colour_hash) { - int layer = colour_to_layer[val]; - if (seen_initial_colours.count(val) == 0) { - if (layer == 0) { - std::cout << "error: encountered refined colour with layer = " << layer << std::endl; - exit(-1); - } - continue; - } else { - if (layer != 0) { - std::cout << "error: encountered initial colour with layer = " << layer << std::endl; - exit(-1); - } - // keep the same for initial colours - new_hash_vec.push_back(std::make_pair(key, val)); - new_colour_layer[val] = layer; - remap[val] = val; - } + // layer 0 colours are the same + for (const auto &[key, val] : colour_hash[0]) { + new_hash_vec[0].push_back(std::make_pair(key, val)); + new_colour_layer[val] = colour_to_layer[val]; + remap[val] = val; } // deal with layer 1+ colours - for (int iteration = 1; iteration < iterations + 1; iteration++) { - for (const auto &[key, val] : colour_hash) { // this can be optimised - if (colour_to_layer[val] != iteration || to_prune.count(val) > 0) { + for (int itr = 1; itr < iterations + 1; itr++) { + for (const auto &[key, val] : colour_hash.at(itr)) { + if (to_prune.count(val) > 0) { continue; } - int new_val = (int)new_hash_vec.size(); + + // new value is size of new hash + number of layer 0 colours + int new_val = 0; + for (size_t i = 0; i < new_hash_vec.size(); i++) { + new_val += new_hash_vec[i].size(); + } + remap[val] = new_val; - new_hash_vec.push_back(std::make_pair(key, new_val)); + new_hash_vec[itr].push_back(std::make_pair(key, new_val)); new_colour_layer[new_val] = colour_to_layer[val]; } } @@ -207,9 +223,11 @@ namespace feature_generation { std::cout << "INITIAL " << i << std::endl; } std::cout << "old_hash" << std::endl; - for (const auto &[key, val] : colour_hash) { - std::cout << "HASH "; - debug_hash(key, val); + for (int itr = 1; itr < iterations + 1; itr++) { + for (const auto &[key, val] : colour_hash[itr]) { + std::cout << "HASH_ITR " << itr << " HASH "; + debug_hash(key, val); + } } std::cout << "to_prune" << std::endl; for (const int i : to_prune) { @@ -217,30 +235,41 @@ namespace feature_generation { } std::cout << "remap" << std::endl; for (const auto &[key, val] : remap) { - std::cout << "REMAP " << key << " -> " << val << std::endl; + std::cout << "REMAP " << key << " -> " << val << " LAYER: " << new_colour_layer[val] + << std::endl; } #endif ////////////////////////////////////////// // remap keys - ColourHash new_colour_hash; - for (size_t i = 0; i < new_hash_vec.size(); i++) { - std::vector key = new_hash_vec[i].first; - int val = new_hash_vec[i].second; - if (new_colour_layer[val] > 0) { - key = remap_neighbour_colours(key, remap); + VecColourHash new_hash(iterations + 1, + std::unordered_map, int, int_vector_hasher>()); + new_hash[0] = colour_hash[0]; + for (int itr = 1; itr < iterations + 1; itr++) { + for (size_t i = 0; i < new_hash_vec[itr].size(); i++) { + std::vector key = new_hash_vec[itr][i].first; + int val = new_hash_vec[itr][i].second; + if (new_colour_layer[val] > 0) { + key = remap_neighbour_colours(key, remap); + } + new_hash[itr][key] = val; } - new_colour_hash[key] = val; } // remap hash - colour_hash = new_colour_hash; + colour_hash = new_hash; // remap colours colour_to_layer = new_colour_layer; - init_layer_to_colours(); - for (const auto &[key, val] : colour_hash) { - layer_to_colours[colour_to_layer[val]].insert(val); + layer_to_colours = new_layer_to_colours(); + for (int itr = 0; itr < iterations + 1; itr++) { + for (const auto &[key, val] : colour_hash[itr]) { + if (colour_to_layer[val] != itr) { + std::cout << "error: colour layers not preserved during remap" << std::endl; + exit(-1); + } + layer_to_colours[itr].insert(val); + } } return remap; @@ -331,6 +360,14 @@ namespace feature_generation { return embed(graph_generator->to_graph(state)); } + void Features::add_colour_to_x(int col, int itr, std::vector &x) { + bool is_seen_colour = (col != UNSEEN_COLOUR); // prevent branch prediction + seen_colour_statistics[is_seen_colour][itr]++; + if (is_seen_colour) { + x[col]++; + } + } + /* Prediction functions */ double Features::predict(const std::shared_ptr &graph) { @@ -374,33 +411,36 @@ namespace feature_generation { } // hash type conversion functions - ColourHash - Features::str_to_int_colour_hash(std::unordered_map str_colour_hash) const { - ColourHash int_colour_hash; - for (const auto &pair : str_colour_hash) { - std::vector colour; - std::istringstream iss(pair.first); - std::string token; - while (std::getline(iss, token, '.')) { - colour.push_back(std::stoi(token)); + VecColourHash Features::str_to_int_colour_hash(StrColourHash str_colour_hash) const { + VecColourHash int_colour_hash = new_colour_hash(); + for (int itr = 0; itr < iterations + 1; itr++) { + for (const auto &pair : str_colour_hash[itr]) { + std::vector colour; + std::istringstream iss(pair.first); + std::string token; + while (std::getline(iss, token, '.')) { + colour.push_back(std::stoi(token)); + } + int_colour_hash[itr][colour] = pair.second; } - int_colour_hash[colour] = pair.second; } return int_colour_hash; } - std::unordered_map - Features::int_to_str_colour_hash(ColourHash int_colour_hash) const { - std::unordered_map str_colour_hash; - for (const auto &pair : int_colour_hash) { - std::string colour_str = ""; - for (size_t i = 0; i < pair.first.size(); i++) { - colour_str += std::to_string(pair.first[i]); - if (i < pair.first.size() - 1) { - colour_str += "."; + StrColourHash Features::int_to_str_colour_hash(VecColourHash int_colour_hash) const { + StrColourHash str_colour_hash; + for (int itr = 0; itr < iterations + 1; itr++) { + str_colour_hash.push_back(std::unordered_map()); + for (const auto &pair : int_colour_hash[itr]) { + std::string colour_str = ""; + for (size_t i = 0; i < pair.first.size(); i++) { + colour_str += std::to_string(pair.first[i]); + if (i < pair.first.size() - 1) { + colour_str += "."; + } } + str_colour_hash[itr][colour_str] = pair.second; } - str_colour_hash[colour_str] = pair.second; } return str_colour_hash; } @@ -430,6 +470,14 @@ namespace feature_generation { void Features::print_init_colours() const { graph_generator->print_init_colours(); } + int Features::get_n_features() const { + int ret = 0; + for (int i = 0; i < iterations + 1; i++) { + ret += colour_hash[i].size(); + } + return ret; + } + void Features::save(const std::string &filename) { // let Python handle file exceptions json j; diff --git a/src/feature_generation/neighbour_container.cpp b/src/feature_generation/neighbour_container.cpp index ab3cb7e..a62407c 100644 --- a/src/feature_generation/neighbour_container.cpp +++ b/src/feature_generation/neighbour_container.cpp @@ -25,17 +25,8 @@ namespace feature_generation { std::string NeighbourContainer::to_string() const { std::string str = ""; - if (multiset_hash) { - for (const auto &kv : neighbours_mset) { - str += "." + std::to_string(kv.first.first); // edge label - str += "." + std::to_string(kv.first.second); // node colour - str += "." + std::to_string(kv.second); // count in multiset - } - } else { - for (const auto &kv : neighbours_set) { - str += "." + std::to_string(kv.first); // edge label - str += "." + std::to_string(kv.second); // node colour - } + for (const int i : to_vector()) { + str += "." + std::to_string(i); } return str; } @@ -44,9 +35,9 @@ namespace feature_generation { std::vector vec; if (multiset_hash) { for (const auto &kv : neighbours_mset) { - vec.push_back(kv.first.first); // edge label i % 3 == 1 - vec.push_back(kv.first.second); // node colour i % 3 == 2 - vec.push_back(kv.second); // count in multiset i % 3 == 0 + vec.push_back(kv.first.first); // edge label + vec.push_back(kv.first.second); // node colour + vec.push_back(kv.second); // count in multiset } } else { for (const auto &kv : neighbours_set) { diff --git a/src/feature_generation/pruning/bulk_pruners.cpp b/src/feature_generation/pruning/bulk_pruners.cpp index 24cc115..fd25a40 100644 --- a/src/feature_generation/pruning/bulk_pruners.cpp +++ b/src/feature_generation/pruning/bulk_pruners.cpp @@ -73,21 +73,27 @@ namespace feature_generation { // 0. construct feature dependency graph int n_features = X.at(0).size(); - std::vector> edges_fw = - std::vector>(n_features, std::vector()); - std::vector> edges_bw = - std::vector>(n_features, std::vector()); - for (const auto &[neighbours, colour] : colour_hash) { // std::vector, int - std::vector indices = get_neighbour_colour_indices(neighbours); - for (const int i : indices) { - int ancestor = neighbours[i]; - edges_fw.at(ancestor).push_back(colour); - edges_bw.at(colour).push_back(ancestor); + std::vector> edges_fw = std::vector>(n_features, std::set()); + std::vector> edges_bw = std::vector>(n_features, std::set()); + + for (int itr = 1; itr < iterations + 1; itr++) { + for (const auto &[neighbours, colour] : colour_hash[itr]) { // std::vector, int + edges_fw.at(neighbours[0]).insert(colour); + edges_bw.at(colour).insert(neighbours[0]); + for (const auto &[ancestor, _] : get_neighbour_colours(neighbours)) { + edges_fw.at(ancestor).insert(colour); + edges_bw.at(colour).insert(ancestor); + } + } + } + #ifdef DEBUGMODE - std::cout << "FDG " << ancestor << " -> " << colour << std::endl; -#endif + for (int colour = 0; colour < n_features; colour++) { + for (const int child : edges_fw.at(colour)) { + std::cout << "FDG " << colour << " -> " << child << std::endl; } } +#endif // 1. compute equivalent features candidates std::cout << "Computing equivalent feature candidates." << std::endl; @@ -147,7 +153,8 @@ namespace feature_generation { } changed += mark_distinct_features(prune_candidates, feature_group, group_size); - std::cout << "changed: " << changed << ". candidates: " << prune_candidates.size() << std::endl; + std::cout << "changed: " << changed << ". candidates: " << prune_candidates.size() + << std::endl; if (changed == 0) { break; } diff --git a/tests/check_not_debug.py b/tests/check_not_debug.py index d0c37d1..b1ba854 100644 --- a/tests/check_not_debug.py +++ b/tests/check_not_debug.py @@ -12,7 +12,7 @@ def test_not_debug(): found_debug = False for line in setup_script.split("\n"): toks = line.split("#") - if len(toks) > 0 and "-DDEBUGMODE" in toks[0]: + if len(toks) > 0 and "_DEBUG = True" in toks[0]: found_debug = True break