From ac4379ef4d3736bd398e3bfb6b74d1b26b5b2e8f Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Wed, 29 Nov 2017 17:04:29 +0300 Subject: [PATCH 01/15] [ranker] Added beta wmd ranker --- include/meta/caching/maps/locking_map.h | 7 +- include/meta/caching/shard_cache.h | 3 +- include/meta/embeddings/word_embeddings.h | 22 +- include/meta/index/ranker/emd.h | 438 +++++++++++++ include/meta/index/ranker/wmd_base.h | 83 +++ include/meta/util/min_cost_flow.h | 145 ++++ include/meta/util/min_cost_flow.tcc | 764 ++++++++++++++++++++++ src/classify/classifier/knn.cpp | 2 +- src/embeddings/word_embeddings.cpp | 82 ++- src/index/ranker/CMakeLists.txt | 4 +- src/index/ranker/ranker_factory.cpp | 2 + src/index/ranker/wmd_base.cpp | 181 +++++ 12 files changed, 1722 insertions(+), 11 deletions(-) create mode 100644 include/meta/index/ranker/emd.h create mode 100644 include/meta/index/ranker/wmd_base.h create mode 100644 include/meta/util/min_cost_flow.h create mode 100644 include/meta/util/min_cost_flow.tcc create mode 100644 src/index/ranker/wmd_base.cpp diff --git a/include/meta/caching/maps/locking_map.h b/include/meta/caching/maps/locking_map.h index dfe009278..78e06f099 100644 --- a/include/meta/caching/maps/locking_map.h +++ b/include/meta/caching/maps/locking_map.h @@ -14,6 +14,7 @@ #include #include "meta/util/optional.h" +#include "meta/hashing/hash.h" namespace meta { @@ -76,10 +77,10 @@ class locking_map util::optional find(const Key& key) const; /// iterator type for locking_maps - using iterator = typename std::unordered_map::iterator; + using iterator = typename std::unordered_map>::iterator; /// const_iterator type for locking_maps using const_iterator = - typename std::unordered_map::const_iterator; + typename std::unordered_map>::const_iterator; /** * @return an iterator to the beginning of the map @@ -103,7 +104,7 @@ class locking_map private: /// the underlying map used for storage - std::unordered_map map_; + std::unordered_map> map_; /// the mutex that 
synchronizes accesses into the map mutable std::mutex mutables_; }; diff --git a/include/meta/caching/shard_cache.h b/include/meta/caching/shard_cache.h index 578b888df..b21392a88 100644 --- a/include/meta/caching/shard_cache.h +++ b/include/meta/caching/shard_cache.h @@ -85,7 +85,8 @@ class generic_shard_cache * The hash function used for determining which shard a key * belongs to. */ - std::hash hasher_; +// std::hash hasher_; + hashing::hash<> hasher_; }; /** diff --git a/include/meta/embeddings/word_embeddings.h b/include/meta/embeddings/word_embeddings.h index fac0ea535..2869736cf 100644 --- a/include/meta/embeddings/word_embeddings.h +++ b/include/meta/embeddings/word_embeddings.h @@ -63,12 +63,22 @@ class word_embeddings word_embeddings(std::istream& vocab, std::istream& first, std::istream& second); + + /** + * Loads word embeddings from txt file + * + * @param vectors The stream to read the vectors from + * @param num_lines Number of lines in the file + * @param dimension dimension of the embedding + */ + word_embeddings(std::istream& vectors, size_t num_lines, size_t dimension); + /** * @param term The term to look up * @return the embedding vector (as an array_view) for the given term, * or the vector for the unknown word as appropriate */ - embedding at(util::string_view term) const; + embedding at(std::string term) const; /** * @param tid The term id to look up @@ -76,6 +86,12 @@ class word_embeddings */ util::string_view term(std::size_t tid) const; + /** + * @param tid The term to look up + * @return the term id, or -1 if not found + */ + int64_t tid(std::string term) const; + /** * @param query A vector of the same length as a word embedding to * query for @@ -109,7 +125,9 @@ class word_embeddings util::aligned_vector id_to_term_; /// A hash table from a term to its id - hashing::probe_map term_to_id_; + hashing::probe_map term_to_id_; +// hashing::probe_map term_to_id_; + /// The embeddings matrix util::aligned_vector embeddings_; diff --git 
a/include/meta/index/ranker/emd.h b/include/meta/index/ranker/emd.h new file mode 100644 index 000000000..04975a781 --- /dev/null +++ b/include/meta/index/ranker/emd.h @@ -0,0 +1,438 @@ +// +// Created by lolik111 on 19.11.17. +// + +#ifndef META_EMD_H +#define META_EMD_H + + +#include +#include +#include +#include +#include +#include +#include +#include "meta/parallel/algorithm.h" +#include "meta/math/vector.h" +#include "meta/caching/all.h" +#include "meta/util/range.h" +#include "meta/hashing/hash.h" +#include "meta/util/min_cost_flow.h" + + +namespace meta +{ + +namespace index +{ + +class Document +{ +public: + size_t n_terms; + std::vector ids; + std::vector> vectors; + std::vector weights; +}; + + +class em_distance +{ +public: + using metric_type = std::function&, const + std::vector&)>; + + em_distance(const std::shared_ptr, double>> &cache_, + metric_type metric, + size_t dimension, + size_t nthreads = std::thread::hardware_concurrency()) + : nthreads_(nthreads), cache_(cache_), dimension_(dimension), + dist(metric) + { + } + + void fill(){ + auto f = [this](const Document &doc1, const Document &doc2){ + return this->emd_relaxed(doc1, doc2); + }; + std::function fz = f; + methods_.insert(std::make_pair(std::string("rwmd"), fz)); + + } + + + double emd(Document &doc1, Document &doc2) + { + std::vector supply(doc1.n_terms + doc2.n_terms, 0); + std::vector demand(doc1.n_terms + doc2.n_terms, 0); + + for (size_t i = 0; i < doc1.n_terms; ++i) + { + supply[i] = doc1.weights[i]; + } + + for (size_t i = 0; i < doc2.n_terms; ++i) + { + demand[doc1.n_terms + i] = doc2.weights[i]; + } + + std::vector> cost(supply.size(), + std::vector(supply.size(), 0)); + + auto f_c_distance = [&](size_t first, size_t second) + { + std::pair pair; + if (doc1.ids[first] < doc2.ids[second]) + { + pair = std::make_pair(doc1.ids[first], + doc2.ids[second]); + } else { + pair = std::make_pair(doc2.ids[second], + doc1.ids[first]); + } + + auto val = cache_->find(pair); + + if(!val) + { + 
val = dist(doc1.vectors[first], + doc2.vectors[second]); + cache_->insert(pair, val.value()); + } + return val.value(); + }; + + for (size_t i = 0; i < doc1.n_terms; ++i) + { + for (size_t j = 0; j < doc2.n_terms; ++j) + { + double dist = f_c_distance(i, j); + assert(dist >= 0); + cost[i][j + doc1.n_terms] = dist; + cost[j + doc1.n_terms][i] = dist; + } + } + util::min_cost_flow mcf; + auto score = mcf.emd_hat(supply, demand, supply, demand, cost, + -1.0); + + return score; + } + + double forward_emd(Document &doc1, Document &doc2){ +// std::vector supply; +// std::unordered_map check_map; +// +// for (size_t i = 0; i < doc1.n_terms; ++i) +// { +// check_map.insert({doc1.ids[i], i}); +// } +// +// for (size_t i = 0; i < doc2.n_terms; ++i) +// { +// std::unordered_map::iterator ind; +// if((ind = check_map.find(doc2.ids[i])) != check_map.end()){ +// auto k = ind->second; +// if (doc1.weights[k] < doc2.weights[i]){ +// doc2.weights[i] -= doc1.weights[k]; +// doc1.weights[k] = 0; +// } else { +// doc1.weights[k] -= doc2.weights[i]; +// doc2.weights[i] = 0; +// } +// } +// } +// +// +// for (size_t i = 0; i < doc1.n_terms; ++i) +// { +// if(doc1.weights[i] != 0) +// supply.push_back(doc1.weights[i]); +// } +// std::vector demand(supply.size(), 0); +// +// for (size_t i = 0; i < doc2.n_terms; ++i) +// { +// if(doc2.weights[i] != 0) +// demand.push_back(doc2.weights[i]); +// } +// supply.resize(demand.size(), 0); + std::vector supply(doc1.n_terms + doc2.n_terms, 0); + std::vector demand(doc1.n_terms + doc2.n_terms, 0); + std::vector xtra(doc1.n_terms + doc2.n_terms, 0); + + + for (size_t i = 0; i < doc1.n_terms; ++i) + { + supply[i] = doc1.weights[i]; + xtra[i] = doc1.weights[i]; + } + + for (size_t i = 0; i < doc2.n_terms; ++i) + { + demand[doc1.n_terms + i] = doc2.weights[i]; + xtra[doc1.n_terms + i] = -doc2.weights[i]; + } + + std::vector> cost(supply.size(), + std::vector(supply.size(), 0)); + + auto f_c_distance = [&](size_t first, size_t second) + { + std::pair 
pair; + if (doc1.ids[first] < doc2.ids[second]) + { + pair = std::make_pair(doc1.ids[first], + doc2.ids[second]); + } else { + pair = std::make_pair(doc2.ids[second], + doc1.ids[first]); + } + + auto val = cache_->find(pair); + + if(!val) + { + val = dist(doc1.vectors[first], + doc2.vectors[second]); + cache_->insert(pair, val.value()); + } + return val.value(); + }; + + std::vector>> edges; + + for (size_t i = 0; i < doc1.n_terms; ++i) + { + std::list> list; + for (size_t j = 0; j < doc2.n_terms; ++j) + { + double dist = f_c_distance(i, j); + list.push_back({doc1.n_terms + j, dist}); + + assert(dist >= 0); + cost[i][j + doc1.n_terms] = dist; + cost[j + doc1.n_terms][i] = dist; + } + edges.push_back(list); + } + + for (size_t i = 0; i < doc2.n_terms; ++i) + { + std::list> list; + edges.push_back(list); + } + + util::min_cost_flow mcf; + std::vector>> f(xtra.size()); + + auto score = mcf.compute_min_cost_flow(xtra, edges, f); + + return score; + + } + + double emd_relaxed2(Document &doc1, Document &doc2) + { + std::vector boilerplate(doc2.n_terms); + for (size_t i = 0; i < doc2.n_terms; i++) { + boilerplate[i] = i; + } + + double acc = 0; + for (size_t i = 0; i < doc1.n_terms; i++) { + + if (doc1.weights[i] != 0) { + std::sort( + boilerplate.begin(), + boilerplate.end(), + [&](const int a, const int b){ + bool ans; + ans = dist(doc1.vectors[i], doc2.vectors[a]) < + dist(doc1.vectors[i], doc2.vectors[b]); + return ans; + }); + + double remaining = doc1.weights[i]; + for (size_t j = 0; j < doc2.n_terms; j++) { + uint64_t w = boilerplate[j]; + if (remaining < doc2.weights[w]) { + acc += remaining * + dist(doc1.vectors[i], doc2.vectors[w]); + break; + } else { + remaining -= doc2.weights[w]; + acc += doc2.weights[w] * + dist(doc1.vectors[i], doc2.vectors[w]); + } + } + } + } + return acc; + } + + double emd_relaxed(const Document &doc1, const Document &doc2) + { + double score = 0; + parallel::thread_pool pool(nthreads_); + std::vector> futuress; + 
futuress.reserve(nthreads_); + + size_t part = doc1.n_terms / nthreads_; + size_t start = 0; + + std::vector ttimes(nthreads_ + 1); + for (size_t i = 0; i < nthreads_; i++) { + ttimes[i] = start; + start += part; + } + ttimes[nthreads_] = (int) doc1.n_terms; + + for (size_t j = 0; j < nthreads_; j++) { + + futuress.emplace_back( + pool.submit_task([&, j]{ + size_t st = ttimes[j]; + size_t en = ttimes[j + 1]; + return emd_relaxed_thread(st, en, doc1, doc2); + }) + ); + } + for (auto &fut: futuress) { + score += fut.get(); + } + return score; + } + + double + emd_relaxed_thread(const size_t start, const size_t end, const Document + &doc1, const Document &doc2) + { + double acc = 0; + std::vector ids(doc2.n_terms); + for (size_t i = 0; i < doc2.n_terms; ++i) { + ids[i] = i; + } + + auto f_c_distance = [&](size_t first, size_t second) + { + std::pair pair; + if (doc1.ids[first] < doc2.ids[second]) + { + pair = std::make_pair(doc1.ids[first], + doc2.ids[second]); + } else { + pair = std::make_pair(doc2.ids[second], + doc1.ids[first]); + } + + auto val = cache_->find(pair); + + if(!val) + { + val = dist(doc1.vectors[first], + doc2.vectors[second]); + cache_->insert(pair, val.value()); + } + return val.value(); + }; + + for (size_t i = start; i < end; i++) + { + if (doc1.weights[i] == 0) + continue; + + std::vector distances(doc2.n_terms); + + for(size_t j = 0; j < doc2.n_terms; j++) + { + + distances[j] = f_c_distance(i, j); + } + + std::sort(ids.begin(), + ids.end(), + [&] (const size_t a, const size_t b) -> bool + { + return distances[a] < distances[b]; + }); + + double remaining = doc1.weights[i]; + for (auto it = ids.begin(); + it != ids.end(); it++) { + uint64_t w = (uint64_t) *it; + if (remaining < doc2.weights[w]) { + acc += remaining * + dist(doc1.vectors[i], doc2.vectors[w]); + break; + } else { + remaining -= doc2.weights[w]; + acc += doc2.weights[w] * + dist(doc1.vectors[i], doc2.vectors[w]); + } + } + } + return acc; + } + + + double wcd(Document &doc1, 
Document &doc2) + { + using namespace meta::math::operators; + + std::vector res(dimension_); + auto start = doc1.vectors.begin(); + for (auto w1: doc1.weights) { + res = res + *start++ * w1; + } + + start = doc2.vectors.begin(); + for (auto w2: doc2.weights) { + res = res - *start++ * w2; + } + + return l2norm(res); + } + + static double + l2diff_norm(const std::vector &a, const std::vector &b) + { + double res = 0.0; + auto it1 = a.begin(); + auto it2 = b.begin(); + while (it1 != a.end()) { + double val = *it1 - *it2; + res += val * val; + it1++; + it2++; + } + + return res; + } + + static double + cosine(const std::vector &a, const std::vector &b) + { + return -std::inner_product(a.begin(), a.end(), b.begin(), 0.0); + } + +private: + + std::shared_ptr, double>> cache_; + + const size_t nthreads_; + const size_t dimension_; + const metric_type dist; + std::unordered_map> methods_; + +}; + + +} +} + +#endif //META_EMD_H diff --git a/include/meta/index/ranker/wmd_base.h b/include/meta/index/ranker/wmd_base.h new file mode 100644 index 000000000..ce82c4876 --- /dev/null +++ b/include/meta/index/ranker/wmd_base.h @@ -0,0 +1,83 @@ +// +// Created by lolik111 on 17.11.17. +// + +#ifndef META_WMD_BASE_H +#define META_WMD_BASE_H + + +#include "meta/index/ranker/ranker_factory.h" +#include "meta/index/ranker/ranker.h" +#include "meta/embeddings/word_embeddings.h" +#include "meta/util/string_view.h" +#include "meta/util/array_view.h" +#include "meta/index/ranker/emd.h" + + +namespace meta +{ + namespace index + { + +/** + * Implements word mover's distance model. 
+ * + * @see http://mkusner.github.io/publications/WMD.pdf + * + * Required config parameters: + * ~~~toml + * [ranker] + * method = "wmd" + * ~~~ + * + * Optional config parameters: + * ~~~toml + * mode # current mode: can be 'emd', 'wcd-emd', or 'rwmd' + * num-threads # number of threads used in the algorithm + * cache-per-thread # size of cache per each thread + * ~~~ + */ + class wmd_base : public ranker + { + public: + /// Identifier for this ranker. + const static util::string_view id; + + const static std::string default_mode; + + const static constexpr size_t default_cache_size = 1000000; + + wmd_base(std::shared_ptr fwd, + std::shared_ptr embeddings, + size_t nthreads, size_t cache_size); + + wmd_base(std::istream& in); + + void save(std::ostream& out) const override; + + std::vector + rank(ranker_context& ctx, uint64_t num_results, + const filter_function_type& filter) override; + + private: + std::shared_ptr fwd_; + std::shared_ptr, double>> cache_; + std::shared_ptr embeddings_; + const size_t nthreads_; + const size_t cache_size_; + + meta::index::Document create_document(std::vector> tf, ranker_context &ctx); + }; + +/** + * Specialization of the factory method used to create wmd + * rankers. + */ + template <> + std::unique_ptr + make_ranker(const cpptoml::table& global, + const cpptoml::table& local); + + } +} +#endif diff --git a/include/meta/util/min_cost_flow.h b/include/meta/util/min_cost_flow.h new file mode 100644 index 000000000..6e2e5dbfd --- /dev/null +++ b/include/meta/util/min_cost_flow.h @@ -0,0 +1,145 @@ +// +// Created by lolik111 on 29.11.17. +// + + +// +// Created by lolik111 on 28.11.17. 
+// + +#ifndef FAST_EMD_MIN_COST_FLOW_H +#define FAST_EMD_MIN_COST_FLOW_H + +#include +#include +#include +#include +#include +#include + +namespace meta +{ +namespace util +{ +template +struct edge; + +template +struct edge0; + +template +struct edge1; + +template +struct edge2; + +template +struct edge3; + +template +class min_cost_flow +{ + +public: + + // e - supply(positive) and demand(negative). + // c[i] - edges that goes from node i. first is the second nod + // x - the flow is returned in it + + NumT emd_hat(const std::vector &supply_orig, + const std::vector &demand_orig, + const std::vector &supply, + const std::vector &demand, + const std::vector> &cost, + NumT extra_mass_penalty); + + NumT compute_min_cost_flow(std::vector &e, + const std::vector>> &c, + std::vector>> &x); + + +private: + + size_t _num_nodes; + std::vector _nodes_to_demand; + + template + static T integral_emd_hat(const std::vector &supply_orig, + const std::vector &demand_orig, + const std::vector &supply, + const std::vector &demand, + const std::vector> &cost, + T extra_mass_penalty); + + void + compute_shortest_path(std::vector &d, std::vector &prev, + + size_t from, + std::vector>> &cost_forward, + std::vector>> &cost_backward, + + const std::vector &e, size_t &l); + + void heap_decrease_key(std::vector> &demand, + std::vector &nodes_to_demand, size_t v, + NumT alt); + + void heap_remove_first(std::vector> &demand, + std::vector &nodes_to_demand); + + void heapify(std::vector> &demand, + std::vector &nodes_to_demand, size_t i); + + void swap_heap(std::vector> &demand, + std::vector &nodes_to_demand, size_t i, size_t j); + + size_t LEFT(size_t i) + { + return 2 * (i + 1) - 1; + } + + size_t RIGHT(size_t i) + { + return 2 * (i + 1); // 2*(i+1)+1-1 + } + + size_t PARENT(size_t i) + { + return (i - 1) / 2; + } + +}; +// end min_cost_flow + +// Copyright (c) 2009-2012, Ofir Pele +// All rights reserved. 
+ +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of the The Hebrew University of Jerusalem nor the +// names of its contributors may be used to endorse or promote products +// derived from this software without specific prior written permission. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+} +} + +#include "min_cost_flow.tcc" + +#endif //FAST_EMD_MIN_COST_FLOW_H diff --git a/include/meta/util/min_cost_flow.tcc b/include/meta/util/min_cost_flow.tcc new file mode 100644 index 000000000..cc7fba9f5 --- /dev/null +++ b/include/meta/util/min_cost_flow.tcc @@ -0,0 +1,764 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "min_cost_flow.h" + +namespace meta +{ +namespace util +{ +template +struct edge +{ + edge(size_t to, CostType cost) : _to(to), _cost(cost) + { + } + + size_t _to; + CostType _cost; +}; + +template +struct edge0 +{ + edge0(size_t to, CostType cost, CostType flow) : _to(to), _cost(cost), + _flow(flow) + { + } + + size_t _to; + CostType _cost; + CostType _flow; +}; + +template +struct edge1 +{ + edge1(size_t to, CostType reduced_cost) : _to(to), + _reduced_cost(reduced_cost) + { + } + + size_t _to; + CostType _reduced_cost; +}; + +template +struct edge2 +{ + edge2(size_t to, CostType reduced_cost, CostType residual_capacity) + : _to(to), + _reduced_cost(reduced_cost), + _residual_capacity(residual_capacity) + { + } + + size_t _to; + CostType _reduced_cost; + CostType _residual_capacity; +}; + +template +struct edge3 +{ + edge3(size_t to = 0, DistType dist = 0) : _to(to), _dist(dist) + { + } + + size_t _to; + DistType _dist; +}; + +template +NumT min_cost_flow::compute_min_cost_flow(std::vector &e, + const std::vector>> &c, + std::vector>> &x) +{ + + assert(e.size() == c.size()); + assert(x.size() == c.size()); + + _num_nodes = e.size(); + _nodes_to_demand.resize(_num_nodes); + + for (size_t from = 0; from < _num_nodes; ++from) { + { + for (auto it = c[from].begin(); it != c[from].end(); ++it) { + x[from].push_back(edge0(it->_to, it->_cost, 0)); + x[it->_to].push_back(edge0(from, -it->_cost, 0)); + } + } // it + } + + // reduced costs for forward edges (c[i,j]-pi[i]+pi[j]) + // Note that for forward edges the residual capacity is infinity + std::vector>> r_cost_forward(_num_nodes); + { + for (size_t 
from = 0; from < _num_nodes; ++from) { + { + for (auto it = c[from].begin(); it != c[from].end(); ++it) { + r_cost_forward[from].push_back( + edge1(it->_to, it->_cost)); + } + } + } + } + + // reduced costs and capacity for backward edges (c[j,i]-pi[j]+pi[i]) + // Since the flow at the beginning is 0, the residual capacity is also zero + std::vector>> r_cost_cap_backward(_num_nodes); + + for (size_t from = 0; from < _num_nodes; ++from) { + { + for (auto it = c[from].begin(); it != c[from].end(); ++it) { + r_cost_cap_backward[it->_to].push_back( + edge2(from, -it->_cost, 0)); + } + } + } + + NumT U = 0; + { + for (size_t i = 0; i < _num_nodes; ++i) { + if (e[i] > U) + U = e[i]; + } + } + + std::vector d(_num_nodes); + std::vector prev(_num_nodes); + NumT delta = 1; + while (true) + { // until we break when S or T is empty + + NumT max_supply = 0; + size_t k = 0; + for (size_t i = 0; i < _num_nodes; ++i) { + if (e[i] > 0) { + if (max_supply < e[i]) { + max_supply = e[i]; + k = i; + } + } + } + if (max_supply == 0) + break; + delta = max_supply; + + size_t l; + compute_shortest_path(d, prev, k, r_cost_forward, + r_cost_cap_backward, e, l); + + //--------------------------------------------------------------- + // find delta (minimum on the path from k to l) + // delta= e[k]; + // if (-e[l]_to != to)) { + ++itccb; + } + if (itccb != r_cost_cap_backward[from].end()) { + if (itccb->_residual_capacity < delta) + delta = itccb->_residual_capacity; + } + + to = from; + } while (to != k); + + + // augment delta flow from k to l (backwards actually...) 
+ to = l; + do { + size_t from = prev[to]; + assert(from != to); + + // TODO - might do here O(n) can be done in O(1) + auto itx = x[from].begin(); + while (itx->_to != to) { + ++itx; + } + itx->_flow += delta; + + // update residual for backward edges + auto itccb = r_cost_cap_backward[to].begin(); + while ((itccb != r_cost_cap_backward[to].end()) + && (itccb->_to != from)) { + ++itccb; + } + if (itccb != r_cost_cap_backward[to].end()) { + itccb->_residual_capacity += delta; + } + itccb = r_cost_cap_backward[from].begin(); + while ((itccb != r_cost_cap_backward[from].end()) + && (itccb->_to != to)) + { + ++itccb; + } + if (itccb != r_cost_cap_backward[from].end()) + { + itccb->_residual_capacity -= delta; + } + + // update e + e[to] += delta; + e[from] -= delta; + + to = from; + } while (to != k); + + } // while true (until we break when S or T is empty) + + // compute distance from x + NumT dist = 0; + + for (size_t from = 0; from < _num_nodes; ++from) + { + for (auto it = x[from].begin(); it != x[from].end(); ++it) + { + dist += (it->_cost * it->_flow); + } + } + + return dist; +} + +template +void min_cost_flow:: +compute_shortest_path(std::vector &d, std::vector &prev, + + size_t from, + std::vector>> &cost_forward, + std::vector>> &cost_backward, + + const std::vector &e, size_t &l) +{ + // Making heap (all inf except 0, so we are saving comparisons...) + std::vector> demand(_num_nodes); + + demand[0]._to = from; + _nodes_to_demand[from] = 0; + demand[0]._dist = 0; + + size_t j = 1; + // TODO: both of these into a function? 
+ + for (size_t i = 0; i < from; ++i) { + demand[j]._to = i; + _nodes_to_demand[i] = j; + demand[j]._dist = std::numeric_limits::max(); + ++j; + } + + for (size_t i = from + 1; i < _num_nodes; ++i) { + demand[j]._to = i; + _nodes_to_demand[i] = j; + demand[j]._dist = std::numeric_limits::max(); + ++j; + } + + // main loop + std::vector final_nodes_flg(_num_nodes, false); + do { + size_t u = demand[0]._to; + + d[u] = demand[0]._dist; // final distance + final_nodes_flg[u] = true; + if (e[u] < 0) + { + l = u; + break; + } + + heap_remove_first(demand, _nodes_to_demand); + + for (auto it = cost_forward[u].begin(); it != cost_forward[u].end(); + ++it) + { + assert(it->_reduced_cost >= 0); + NumT alt = d[u] + it->_reduced_cost; + size_t v = it->_to; + if ((_nodes_to_demand[v] < demand.size()) + && (alt < demand[_nodes_to_demand[v]]._dist)) + { + heap_decrease_key(demand, _nodes_to_demand, v, alt); + prev[v] = u; + } + } + + for (auto it = cost_backward[u].begin(); + it != cost_backward[u].end(); ++it) + { + if (it->_residual_capacity > 0) + { + assert(it->_reduced_cost >= 0); + NumT alt = d[u] + it->_reduced_cost; + size_t v = it->_to; + if ((_nodes_to_demand[v] < demand.size()) + && (alt < demand[_nodes_to_demand[v]]._dist)) + { + heap_decrease_key(demand, _nodes_to_demand, v, alt); + prev[v] = u; + } + } + } + // it + + } while (!demand.empty()); + + + for (size_t node_from = 0; node_from < _num_nodes; ++node_from) { + + for (auto it = cost_forward[node_from].begin(); + it != cost_forward[node_from].end(); ++it) { + if (final_nodes_flg[node_from]) { + it->_reduced_cost += d[node_from] - d[l]; + } + if (final_nodes_flg[it->_to]) { + it->_reduced_cost -= d[it->_to] - d[l]; + } + } + + } + + + // reduced costs and capacity for backward edges (c[j,i]-pi[j]+pi[i]) + + for (size_t node_from = 0; node_from < _num_nodes; ++node_from) { + + for (auto it = cost_backward[node_from].begin(); + it != cost_backward[node_from].end(); ++it) { + if (final_nodes_flg[node_from]) { + 
it->_reduced_cost += d[node_from] - d[l]; + } + if (final_nodes_flg[it->_to]) { + it->_reduced_cost -= d[it->_to] - d[l]; + } + } + } + + +} + +template +void min_cost_flow::heap_decrease_key(std::vector> &demand, + std::vector &nodes_to_demand, + size_t v, + NumT alt) +{ + size_t i = nodes_to_demand[v]; + demand[i]._dist = alt; + while (i > 0 && demand[PARENT(i)]._dist > demand[i]._dist) { + swap_heap(demand, nodes_to_demand, i, PARENT(i)); + i = PARENT(i); + } +} // heap_decrease_key + +template +void min_cost_flow::heap_remove_first(std::vector> &demand, + std::vector &nodes_to_demand) +{ + swap_heap(demand, nodes_to_demand, 0, demand.size() - 1); + demand.pop_back(); + heapify(demand, nodes_to_demand, 0); +} // heap_remove_first + +template +void min_cost_flow::heapify(std::vector> &demand, + std::vector &nodes_to_demand, size_t i) +{ + + do { + // TODO: change to loop + size_t l = LEFT(i); + size_t r = RIGHT(i); + size_t smallest; + if ((l < demand.size()) && (demand[l]._dist < demand[i]._dist)) { + smallest = l; + } else { + smallest = i; + } + if ((r < demand.size()) + && (demand[r]._dist < demand[smallest]._dist)) { + smallest = r; + } + + if (smallest == i) + return; + + swap_heap(demand, nodes_to_demand, i, smallest); + i = smallest; + + } while (true); + +} + +template +void min_cost_flow::swap_heap(std::vector> &demand, + std::vector &nodes_to_demand, size_t i, + size_t j) +{ + edge3 tmp = demand[i]; + demand[i] = demand[j]; + demand[j] = tmp; + nodes_to_demand[demand[j]._to] = j; + nodes_to_demand[demand[i]._to] = i; +} + + +template +NumT min_cost_flow::emd_hat(const std::vector &supply_orig, + const std::vector &demand_orig, + const std::vector &supply, + const std::vector &demand, + const std::vector> &cost, + NumT extra_mass_penalty) +{ + if (std::is_integral::value) { + return integral_emd_hat(supply_orig, demand_orig, supply, demand, + cost, extra_mass_penalty); + } else { + + const double mult_factor = 1000000; + + // Constructing the input + 
const size_t n = supply.size(); + std::vector i_supply_orig(n); + std::vector i_demand_orig(n); + std::vector i_supply(n); + std::vector i_demand(n); + std::vector> i_cost(n, std::vector(n)); + + // Converting to uint64_t + double sum_supply = 0.0; + double sum_demand = 0.0; + double max_cost = cost[0][0]; + for (size_t i = 0; i < n; ++i) { + sum_supply += supply_orig[i]; + sum_demand += demand_orig[i]; + for (size_t j = 0; j < n; ++j) { + if (cost[i][j] > max_cost) + max_cost = cost[i][j]; + } + } + double min_sum = std::min(sum_supply, sum_demand); + double max_sum = std::max(sum_supply, sum_demand); + double supply_demand_norm_factor = mult_factor / max_sum; + double cost_norm_factor = mult_factor / max_cost; + for (size_t i = 0; i < n; ++i) { + i_supply_orig[i] = static_cast( + floor(supply_orig[i] * supply_demand_norm_factor + 0.5)); + i_demand_orig[i] = static_cast( + floor(demand_orig[i] * supply_demand_norm_factor + 0.5)); + i_supply[i] = static_cast(floor( + supply[i] * supply_demand_norm_factor + 0.5)); + i_demand[i] = static_cast(floor( + demand[i] * supply_demand_norm_factor + 0.5)); + for (size_t j = 0; j < n; ++j) { + i_cost[i][j] = static_cast( + floor(cost[i][j] * cost_norm_factor + 0.5)); + } + } + + // computing distance without extra mass penalty + double dist + = integral_emd_hat(i_supply_orig, i_demand_orig, + i_supply, i_demand, i_cost, 0); + + dist = dist / supply_demand_norm_factor; + dist = dist / cost_norm_factor; + + // adding extra mass penalty + if (extra_mass_penalty == -1) + extra_mass_penalty = max_cost; + dist += (max_sum - min_sum) * extra_mass_penalty; + + return dist; + + } + +} + +template +template< typename T> +T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, + const std::vector &demand_orig, + const std::vector &supply_c, + const std::vector &demand_c, + const std::vector> + &cost_c, + T extra_mass_penalty) +{ + size_t n = supply_c.size(); + assert(demand_c.size() == n); + + // Ensuring that the supplier - 
supply, have more mass. + std::vector supply; + std::vector demand; + std::vector> cost(cost_c); + T abs_diff_sum_supply_sum_denamd; + T sum_supply = 0; + T sum_demand = 0; + { + for (size_t i = 0; i < n; ++i) + sum_supply += supply_c[i]; + } + { + for (size_t i = 0; i < n; ++i) + sum_demand += demand_c[i]; + } + bool need_to_swap_flow = false; + if (sum_demand > sum_supply) { + need_to_swap_flow = true; + supply = demand_c; + demand = supply_c; + // transpose cost + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + cost[i][j] = cost_c[j][i]; + } + } + abs_diff_sum_supply_sum_denamd = sum_demand - sum_supply; + } else { + supply = supply_c; + demand = demand_c; + abs_diff_sum_supply_sum_denamd = sum_supply - sum_demand; + } + // if (need_to_swap_flow) cout << "need_to_swap_flow" << endl; + + // creating the b vector that contains all vertexes + std::vector b(2 * n + 2); + const size_t threshold_node = 2 * n; + const size_t artificial_node = 2 * n + 1; // need to be last ! 
+ { + for (size_t i = 0; i < n; ++i) { + b[i] = supply[i]; + } + } + { + for (size_t i = n; i < 2 * n; ++i) { + b[i] = (demand[i - n]); + } + } + b[threshold_node] = -abs_diff_sum_supply_sum_denamd; + b[artificial_node] = 0; + //------------------------------------------------------- + + //------------------------------------------------------- + T max_cost = 0; + { + for (size_t i = 0; i < n; ++i) { + { + for (size_t j = 0; j < n; ++j) { + assert(cost[i][j] >= 0); + if (cost[i][j] > max_cost) + max_cost = cost[i][j]; + } + } + } + } + if (extra_mass_penalty == -1) + extra_mass_penalty = max_cost; + //------------------------------------------------------- + + //============================================================= + std::set sources_that_flow_not_only_to_thresh; + std::set sinks_that_get_flow_not_only_from_thresh; + T pre_flow_cost = 0; + //============================================================= + + //============================================================= + // regular edges between sinks and sources without threshold edges + std::vector>> c(b.size()); + { + for (size_t i = 0; i < n; ++i) { + if (b[i] == 0) + continue; + { + for (size_t j = 0; j < n; ++j) { + if (b[j + n] == 0) + continue; + if (cost[i][j] == max_cost) + continue; + c[i].push_back(edge(j + n, cost[i][j])); + } + } // j + } + } // i + + // checking which are not isolated + { + for (size_t i = 0; i < n; ++i) { + if (b[i] == 0) + continue; + { + for (size_t j = 0; j < n; ++j) { + if (b[j + n] == 0) + continue; + if (cost[i][j] == max_cost) + continue; + sources_that_flow_not_only_to_thresh.insert(i); + sinks_that_get_flow_not_only_from_thresh.insert(j + n); + } + } // j + } + } // i + + // converting all sinks to negative + { + for (size_t i = n; i < 2 * n; ++i) { + b[i] = -b[i]; + } + } + + // add edges from/to threshold node, + // note that costs are reversed to the paper (see also remark* above) + // It is important that it will be this way because of remark* above. 
+ { + for (size_t i = 0; i < n; ++i) { + c[i].push_back(edge(threshold_node, 0)); + } + } + { + for (size_t j = 0; j < n; ++j) { + c[threshold_node].push_back(edge(j + n, max_cost)); + } + } + + // artificial arcs - Note the restriction that only one edge i,j is + // artificial so I ignore it... + { + for (size_t i = 0; i < artificial_node; ++i) { + c[i].push_back(edge(artificial_node, max_cost + 1)); + c[artificial_node].push_back(edge(i, max_cost + 1)); + } + } + //============================================================= + + //==================================================== + // remove nodes with supply demand of 0 + // and vertexes that are connected only to the + // threshold vertex + //==================================================== + int current_node_name = 0; + // Note here it should be vector and not vector + // as I'm using -1 as a special flag !!! + const int remove_node_flag = -1; + std::vector nodes_new_names(b.size(), remove_node_flag); + std::vector nodes_old_names; + nodes_old_names.reserve(b.size()); + { + for (size_t i = 0; i < n * 2; ++i) { + if (b[i] != 0) { + if (sources_that_flow_not_only_to_thresh.find(i) + != sources_that_flow_not_only_to_thresh.end() + || sinks_that_get_flow_not_only_from_thresh.find(i) + != sinks_that_get_flow_not_only_from_thresh + .end()) { + nodes_new_names[i] = current_node_name; + nodes_old_names.push_back(i); + ++current_node_name; + } else { + if (i >= n) { // sink + pre_flow_cost -= (b[i] * max_cost); + } + b[threshold_node] + += b[i]; // add mass(i=n) + } + } + } + } // i + nodes_new_names[threshold_node] = current_node_name; + nodes_old_names.push_back(threshold_node); + ++current_node_name; + nodes_new_names[artificial_node] = current_node_name; + nodes_old_names.push_back(artificial_node); + ++current_node_name; + + std::vector bb(current_node_name); + size_t j = 0; + { + for (size_t i = 0; i < b.size(); ++i) { + if (nodes_new_names[i] != remove_node_flag) { + bb[j] = b[i]; + ++j; + } + } + } + + 
std::vector>> cc(bb.size()); + { + for (size_t i = 0; i < c.size(); ++i) { + if (nodes_new_names[i] == remove_node_flag) + continue; + { + for (auto it = c[i].begin(); it != c[i].end(); ++it) { + if (nodes_new_names[it->_to] != remove_node_flag) { + cc[nodes_new_names[i]].push_back(edge( + nodes_new_names[it->_to], it->_cost)); + } + } + } + } + } + + min_cost_flow mcf; + + T my_dist; + + std::vector>> flows(bb.size()); + + T mcf_dist = mcf.compute_min_cost_flow(bb, cc, flows); + + my_dist = pre_flow_cost + // pre-flowing on cases where it was possible + mcf_dist + // solution of the transportation problem + (abs_diff_sum_supply_sum_denamd + * extra_mass_penalty); // emd-hat extra mass penalty + + return my_dist; + +} +} +} + + +// end min_cost_flow + +// Copyright (c) 2009-2012, Ofir Pele +// All rights reserved. + +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of the The Hebrew University of Jerusalem nor the +// names of its contributors may be used to endorse or promote products +// derived from this software without specific prior written permission. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/src/classify/classifier/knn.cpp b/src/classify/classifier/knn.cpp index f7b5ebb94..11e4dab86 100644 --- a/src/classify/classifier/knn.cpp +++ b/src/classify/classifier/knn.cpp @@ -159,7 +159,7 @@ std::unique_ptr make_multi_index_classifier( auto use_weighted = config.get_as("weighted").value_or(false); return make_unique(std::move(training), std::move(inv_idx), *k, - index::make_ranker(*ranker), use_weighted); + index::make_ranker(config, *ranker), use_weighted); } } } diff --git a/src/embeddings/word_embeddings.cpp b/src/embeddings/word_embeddings.cpp index e2013f98a..4372e0649 100644 --- a/src/embeddings/word_embeddings.cpp +++ b/src/embeddings/word_embeddings.cpp @@ -21,6 +21,42 @@ namespace embeddings using vocab_type = hashing::probe_map; + +word_embeddings::word_embeddings(std::istream &vectors, size_t num_lines, + size_t dimension) + : vector_size_{dimension}, + id_to_term_(num_lines), + term_to_id_{static_cast(std::ceil( + id_to_term_.size() / vocab_type::default_max_load_factor()))}, + embeddings_(vector_size_ * (id_to_term_.size() + 1)) +{ + printing::progress progress{" > Loading embeddings: ", id_to_term_.size()}; + + for (std::size_t tid = 0; tid < id_to_term_.size(); ++tid) + { + if (!vectors) + throw word_embeddings_exception{ + "embeddings stream ended unexpectedly"}; + + progress(tid); + + vectors >> id_to_term_[tid]; + term_to_id_[id_to_term_[tid]] = tid; + + auto vec = vector(tid); + 
std::generate(vec.begin(), vec.end(), + [&]() { + double v; + vectors >> v; + return v; }); + auto len = math::operators::l2norm(vec); + std::transform(vec.begin(), vec.end(), vec.begin(), + [=](double weight) { return weight / len; }); + } + + +} + word_embeddings::word_embeddings(std::istream& vocab, std::istream& vectors) : vector_size_{io::packed::read(vectors)}, id_to_term_(io::packed::read(vocab)), @@ -43,6 +79,7 @@ word_embeddings::word_embeddings(std::istream& vocab, std::istream& vectors) std::generate(vec.begin(), vec.end(), [&]() { return io::packed::read(vectors); }); } + } word_embeddings::word_embeddings(std::istream& vocab, std::istream& first, @@ -109,7 +146,7 @@ util::array_view word_embeddings::vector(std::size_t tid) const return {embeddings_.data() + tid * vector_size_, vector_size_}; } -embedding word_embeddings::at(util::string_view term) const +embedding word_embeddings::at(std::string term) const { std::size_t tid; auto v_it = term_to_id_.find(term); @@ -124,6 +161,22 @@ embedding word_embeddings::at(util::string_view term) const return {tid, vector(tid)}; } + +int64_t word_embeddings::tid(std::string term) const +{ + int64_t tid; + auto v_it = term_to_id_.find(term); + if (v_it == term_to_id_.end()) + { + tid = -1; + } + else + { + tid = v_it->value(); + } + return tid; +} + util::string_view word_embeddings::term(std::size_t tid) const { if (tid >= id_to_term_.size()) @@ -175,6 +228,32 @@ word_embeddings load_embeddings(const cpptoml::table& config) throw word_embeddings_exception{"embeddings directory does not exist: " + *prefix}; + auto mode = config.get_as("mode").value_or("average"); + + if (mode == "txt") + { + std::ifstream target{*prefix + "/embeddings.target.txt"}; + if (!target) + throw word_embeddings_exception{"missing target vectors in: " + + *prefix}; + auto lines = filesystem::num_lines(*prefix + "/embeddings.target.txt"); + auto dim = config.get_as("dim"); + if(!dim) + { + std::string line; + std::getline(target, line); + 
std::istringstream iss(line); + std::vector results((std::istream_iterator(iss)), + std::istream_iterator()); + dim = results.size() - 1; + + } + target.seekg(0, target.beg); + return {target, lines, *dim}; + + } + + std::ifstream vocab{*prefix + "/vocab.bin", std::ios::binary}; if (!vocab) throw word_embeddings_exception{"missing vocabulary file in: " @@ -184,7 +263,6 @@ word_embeddings load_embeddings(const cpptoml::table& config) std::ifstream context{*prefix + "/embeddings.context.bin", std::ios::binary}; - auto mode = config.get_as("mode").value_or("average"); if (mode == "average") { if (!target) diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt index 20518f751..d806d5deb 100644 --- a/src/index/ranker/CMakeLists.txt +++ b/src/index/ranker/CMakeLists.txt @@ -9,8 +9,8 @@ add_library(meta-ranker absolute_discount.cpp kl_divergence_prf.cpp rocchio.cpp ranker.cpp - ranker_factory.cpp) -target_link_libraries(meta-ranker meta-index) + ranker_factory.cpp wmd_base.cpp) +target_link_libraries(meta-ranker meta-index meta-embeddings meta-util) install(TARGETS meta-ranker EXPORT meta-exports diff --git a/src/index/ranker/ranker_factory.cpp b/src/index/ranker/ranker_factory.cpp index 86c1069af..9b0cb2d04 100644 --- a/src/index/ranker/ranker_factory.cpp +++ b/src/index/ranker/ranker_factory.cpp @@ -3,6 +3,7 @@ * @author Chase Geigle */ +#include #include "cpptoml.h" #include "meta/index/ranker/all.h" #include "meta/index/ranker/ranker_factory.h" @@ -31,6 +32,7 @@ ranker_factory::ranker_factory() reg(); reg(); reg(); + reg(); } std::unique_ptr make_ranker(const cpptoml::table& config) diff --git a/src/index/ranker/wmd_base.cpp b/src/index/ranker/wmd_base.cpp new file mode 100644 index 000000000..717cf2a6c --- /dev/null +++ b/src/index/ranker/wmd_base.cpp @@ -0,0 +1,181 @@ +// +// Created by lolik111 on 17.11.17. 
+// + +#include +#include "meta/index/ranker/wmd_base.h" +#include "meta/index/postings_data.h" +#include "meta/util/fixed_heap.h" +#include "meta/index/score_data.h" +#include "meta/logging/logger.h" +#include "meta/index/forward_index.h" + + +namespace meta +{ +namespace index +{ + + +const util::string_view wmd_base::id = "wmd-base"; + +const std::string wmd_base::default_mode = "rwmd"; + +const constexpr size_t wmd_base::default_cache_size; + +wmd_base::wmd_base(std::shared_ptr fwd, + std::shared_ptr embeddings, + size_t cache_size, size_t nthreads) + : fwd_{std::move(fwd)}, + embeddings_{embeddings}, + nthreads_{nthreads}, + cache_size_{cache_size}, + cache_{std::make_shared, double> > (nthreads, cache_size)} +{ + +} + +void wmd_base::save(std::ostream &out) const +{ + io::packed::write(out, id); + io::packed::write(out, nthreads_); + io::packed::write(out, cache_size_); + io::packed::write(out, fwd_->index_name()); + +} + +wmd_base::wmd_base(std::istream &in) : + nthreads_{io::packed::read(in)}, + cache_size_{io::packed::read(in)}, + cache_{std::make_shared, double> >(nthreads_, + cache_size_)} +{ + auto path = io::packed::read(in); + auto cfg = cpptoml::parse_file(path + "/config.toml"); + fwd_ = make_index(*cfg); + + embeddings_ = std::make_shared + (embeddings::load_embeddings(*cfg)); +} + + +std::vector +wmd_base::rank(ranker_context &ctx, uint64_t num_results, + const filter_function_type &filter) +{ + auto results = util::make_fixed_heap( + num_results, + [](const search_result &a, const search_result &b){ + return a.score < b.score; + }); + + meta::index::em_distance emd(cache_, index::em_distance::l2diff_norm, + embeddings_->vector_size()); + + for (auto doc : fwd_->docs()) { + if (!filter(doc)) continue; + + std::vector> tf = fwd_->search_primary( + doc)->counts(); + + auto doc1 = create_document(tf, ctx); + + std::vector> tf_pc; + std::vector pc = ctx.postings; + for (auto one: pc) { + tf_pc.push_back(std::pair(one.t_id, + one.query_term_weight)); + 
} + + auto doc2 = create_document(tf_pc, ctx); + +// double score1 = emd.emd_relaxed(doc1, doc2); +// double score2 = emd.emd_relaxed(doc2, doc1); +// results.emplace(search_result(doc, (float) std::max(score1, score2))); + auto time + = common::time([&]() { + results.emplace(search_result{doc, static_cast(emd + .forward_emd(doc1, doc2))}); + }); + int p = 3; + } + + return results.extract_top(); + +} + +meta::index::Document +wmd_base::create_document(std::vector> tf, + ranker_context &ctx) +{ + size_t unique_terms_count = tf.size(); + size_t all_terms_count = 0; + + meta::index::Document document; + document.vectors = std::vector>(); + document.vectors.reserve(unique_terms_count); + document.ids = std::vector(); + document.ids.reserve(unique_terms_count); + document.weights = std::vector(); + document.weights.reserve(unique_terms_count); + + for (auto term_data : tf) + { + std::string term = fwd_->term_text(term_data.first); + + auto vec_id = this->embeddings_->tid(term); + + if (vec_id >= 0) { + all_terms_count += term_data.second; + auto embedding = this->embeddings_->at(term); + document.vectors.emplace_back(std::vector( + embedding.v.begin(), embedding.v.end())); + document.weights.emplace_back(term_data.second); + document.ids.emplace_back(vec_id); + + } else { + unique_terms_count--; + } + } + + using namespace meta::math::operators; + + document.weights = document.weights / unique_terms_count; + document.n_terms = unique_terms_count; + + return document; +} + + +template<> +std::unique_ptr +make_ranker(const cpptoml::table &global, + const cpptoml::table &local) +{ + if (global.begin() == global.end()) + throw ranker_exception{"empty global configuration provided to " + "construction of wmd_base ranker"}; + auto embeddings = global.get_table("embeddings"); + if (!embeddings) + throw std::runtime_error{ + "\"embeddings\" group needed in config file!"}; + auto glove = embeddings::load_embeddings(*embeddings); + + auto mode = local.get_as("mode").value_or + 
(wmd_base::default_mode); + auto cache_size = local.get_as("cache-per-thread").value_or + (wmd_base::default_cache_size); + size_t nthreads = local.get_as("num-threads").value_or + (std::thread::hardware_concurrency()); + + auto f_idx = make_index(global); + + return make_unique(f_idx, + std::make_shared( + glove), cache_size, nthreads); +} +} +} + From 60e962e1886f3258819214d7dc879b28cb7ea7a0 Mon Sep 17 00:00:00 2001 From: Valiullin Albert Date: Wed, 29 Nov 2017 20:22:40 +0300 Subject: [PATCH 02/15] [ranker]: refactoring and applying codestyle to wmd --- include/meta/index/ranker/emd.h | 12 +- include/meta/index/ranker/wmd_base.h | 81 ++-- include/meta/util/min_cost_flow.h | 103 +++-- include/meta/util/min_cost_flow.tcc | 581 ++++++++++++--------------- src/index/ranker/wmd_base.cpp | 141 +++---- 5 files changed, 415 insertions(+), 503 deletions(-) diff --git a/include/meta/index/ranker/emd.h b/include/meta/index/ranker/emd.h index 04975a781..ea987bb33 100644 --- a/include/meta/index/ranker/emd.h +++ b/include/meta/index/ranker/emd.h @@ -1,6 +1,7 @@ -// -// Created by lolik111 on 19.11.17. -// +/** + * @file emd.h + * @author lolik111 + */ #ifndef META_EMD_H #define META_EMD_H @@ -115,8 +116,7 @@ class em_distance } } util::min_cost_flow mcf; - auto score = mcf.emd_hat(supply, demand, supply, demand, cost, - -1.0); + auto score = mcf.emd_hat(supply, demand, supply, demand, cost); return score; } @@ -362,7 +362,7 @@ class em_distance double remaining = doc1.weights[i]; for (auto it = ids.begin(); it != ids.end(); it++) { - uint64_t w = (uint64_t) *it; + auto w = (uint64_t) *it; if (remaining < doc2.weights[w]) { acc += remaining * dist(doc1.vectors[i], doc2.vectors[w]); diff --git a/include/meta/index/ranker/wmd_base.h b/include/meta/index/ranker/wmd_base.h index ce82c4876..c181885f1 100644 --- a/include/meta/index/ranker/wmd_base.h +++ b/include/meta/index/ranker/wmd_base.h @@ -1,23 +1,22 @@ -// -// Created by lolik111 on 17.11.17. 
-// +/** + * @file wmd_base.h + * @author lolik111 + */ #ifndef META_WMD_BASE_H #define META_WMD_BASE_H - -#include "meta/index/ranker/ranker_factory.h" -#include "meta/index/ranker/ranker.h" #include "meta/embeddings/word_embeddings.h" -#include "meta/util/string_view.h" -#include "meta/util/array_view.h" #include "meta/index/ranker/emd.h" - +#include "meta/index/ranker/ranker.h" +#include "meta/index/ranker/ranker_factory.h" +#include "meta/util/array_view.h" +#include "meta/util/string_view.h" namespace meta { - namespace index - { +namespace index +{ /** * Implements word mover's distance model. @@ -37,47 +36,49 @@ namespace meta * cache-per-thread # size of cache per each thread * ~~~ */ - class wmd_base : public ranker - { - public: - /// Identifier for this ranker. - const static util::string_view id; +class wmd_base : public ranker +{ + public: + /// Identifier for this ranker. + const static util::string_view id; - const static std::string default_mode; + const static std::string default_mode; - const static constexpr size_t default_cache_size = 1000000; + const static constexpr size_t default_cache_size = 1000000; - wmd_base(std::shared_ptr fwd, - std::shared_ptr embeddings, - size_t nthreads, size_t cache_size); + wmd_base(std::shared_ptr fwd, + std::shared_ptr embeddings, + size_t nthreads, size_t cache_size); - wmd_base(std::istream& in); + wmd_base(std::istream& in); - void save(std::ostream& out) const override; + void save(std::ostream& out) const override; - std::vector - rank(ranker_context& ctx, uint64_t num_results, - const filter_function_type& filter) override; + std::vector + rank(ranker_context& ctx, uint64_t num_results, + const filter_function_type& filter) override; - private: - std::shared_ptr fwd_; - std::shared_ptr, double>> cache_; - std::shared_ptr embeddings_; - const size_t nthreads_; - const size_t cache_size_; + private: + std::shared_ptr fwd_; + std::shared_ptr, + double>> + cache_; + std::shared_ptr embeddings_; + const size_t 
nthreads_; + const size_t cache_size_; - meta::index::Document create_document(std::vector> tf, ranker_context &ctx); - }; + meta::index::Document + create_document(std::vector> tf, + ranker_context& ctx); +}; /** * Specialization of the factory method used to create wmd * rankers. */ - template <> - std::unique_ptr - make_ranker(const cpptoml::table& global, - const cpptoml::table& local); - - } +template <> +std::unique_ptr make_ranker(const cpptoml::table& global, + const cpptoml::table& local); +} } #endif diff --git a/include/meta/util/min_cost_flow.h b/include/meta/util/min_cost_flow.h index 6e2e5dbfd..d0e06a2d1 100644 --- a/include/meta/util/min_cost_flow.h +++ b/include/meta/util/min_cost_flow.h @@ -1,11 +1,7 @@ -// -// Created by lolik111 on 29.11.17. -// - - -// -// Created by lolik111 on 28.11.17. -// +/** + * @file min_cost_flow.h + * @author lolik111 + */ #ifndef FAST_EMD_MIN_COST_FLOW_H #define FAST_EMD_MIN_COST_FLOW_H @@ -21,77 +17,69 @@ namespace meta { namespace util { -template +template struct edge; -template +template struct edge0; -template +template struct edge1; -template +template struct edge2; -template +template struct edge3; -template +template class min_cost_flow { -public: + public: + NumT emd_hat(const std::vector& supply_orig, + const std::vector& demand_orig, + const std::vector& supply, + const std::vector& demand, + const std::vector>& cost); // e - supply(positive) and demand(negative). // c[i] - edges that goes from node i. 
first is the second nod // x - the flow is returned in it + NumT compute_min_cost_flow(std::vector& e, + const std::vector>>& c, + std::vector>>& x); - NumT emd_hat(const std::vector &supply_orig, - const std::vector &demand_orig, - const std::vector &supply, - const std::vector &demand, - const std::vector> &cost, - NumT extra_mass_penalty); - - NumT compute_min_cost_flow(std::vector &e, - const std::vector>> &c, - std::vector>> &x); - - -private: - + private: size_t _num_nodes; std::vector _nodes_to_demand; - template - static T integral_emd_hat(const std::vector &supply_orig, - const std::vector &demand_orig, - const std::vector &supply, - const std::vector &demand, - const std::vector> &cost, - T extra_mass_penalty); + template + static T integral_emd_hat(const std::vector& supply_orig, + const std::vector& demand_orig, + const std::vector& supply, + const std::vector& demand, + const std::vector>& cost); void - compute_shortest_path(std::vector &d, std::vector &prev, - + compute_shortest_path(std::vector& d, std::vector& prev, size_t from, - std::vector>> &cost_forward, - std::vector>> &cost_backward, - - const std::vector &e, size_t &l); + std::vector>>& cost_forward, + std::vector>>& cost_backward, + const std::vector& e, size_t& l); - void heap_decrease_key(std::vector> &demand, - std::vector &nodes_to_demand, size_t v, + void heap_decrease_key(std::vector>& demand, + std::vector& nodes_to_demand, size_t v, NumT alt); - void heap_remove_first(std::vector> &demand, - std::vector &nodes_to_demand); + void heap_remove_first(std::vector>& demand, + std::vector& nodes_to_demand); - void heapify(std::vector> &demand, - std::vector &nodes_to_demand, size_t i); + void heapify(std::vector>& demand, + std::vector& nodes_to_demand, size_t i); - void swap_heap(std::vector> &demand, - std::vector &nodes_to_demand, size_t i, size_t j); + void swap_heap(std::vector>& demand, + std::vector& nodes_to_demand, size_t i, size_t j); size_t LEFT(size_t i) { @@ -107,9 +95,14 @@ 
class min_cost_flow { return (i - 1) / 2; } - }; -// end min_cost_flow +} +} + +#include "min_cost_flow.tcc" + +#endif //FAST_EMD_MIN_COST_FLOW_H + // Copyright (c) 2009-2012, Ofir Pele // All rights reserved. @@ -137,9 +130,3 @@ class min_cost_flow // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -} -} - -#include "min_cost_flow.tcc" - -#endif //FAST_EMD_MIN_COST_FLOW_H diff --git a/include/meta/util/min_cost_flow.tcc b/include/meta/util/min_cost_flow.tcc index cc7fba9f5..b3f476edc 100644 --- a/include/meta/util/min_cost_flow.tcc +++ b/include/meta/util/min_cost_flow.tcc @@ -1,18 +1,23 @@ +/** + * @file min_cost_flow.tcc + * @author lolik111 + */ + +#include "min_cost_flow.h" #include #include +#include #include #include #include -#include #include -#include -#include "min_cost_flow.h" +#include namespace meta { namespace util { -template +template struct edge { edge(size_t to, CostType cost) : _to(to), _cost(cost) @@ -23,11 +28,11 @@ struct edge CostType _cost; }; -template +template struct edge0 { - edge0(size_t to, CostType cost, CostType flow) : _to(to), _cost(cost), - _flow(flow) + edge0(size_t to, CostType cost, CostType flow) + : _to(to), _cost(cost), _flow(flow) { } @@ -36,11 +41,11 @@ struct edge0 CostType _flow; }; -template +template struct edge1 { - edge1(size_t to, CostType reduced_cost) : _to(to), - _reduced_cost(reduced_cost) + edge1(size_t to, CostType reduced_cost) + : _to(to), _reduced_cost(reduced_cost) { } @@ -48,13 +53,13 @@ struct edge1 CostType _reduced_cost; }; -template +template struct edge2 { edge2(size_t to, CostType reduced_cost, CostType residual_capacity) - : _to(to), - _reduced_cost(reduced_cost), - _residual_capacity(residual_capacity) + : _to(to), + _reduced_cost(reduced_cost), + _residual_capacity(residual_capacity) { } @@ -63,7 +68,7 @@ struct edge2 CostType 
_residual_capacity; }; -template +template struct edge3 { edge3(size_t to = 0, DistType dist = 0) : _to(to), _dist(dist) @@ -74,10 +79,10 @@ struct edge3 DistType _dist; }; -template -NumT min_cost_flow::compute_min_cost_flow(std::vector &e, - const std::vector>> &c, - std::vector>> &x) +template +NumT min_cost_flow::compute_min_cost_flow( + std::vector& e, const std::vector>>& c, + std::vector>>& x) { assert(e.size() == c.size()); @@ -86,48 +91,45 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, _num_nodes = e.size(); _nodes_to_demand.resize(_num_nodes); - for (size_t from = 0; from < _num_nodes; ++from) { + // init flow + for (size_t from = 0; from < _num_nodes; ++from) + { + for (auto it = c[from].begin(); it != c[from].end(); ++it) { - for (auto it = c[from].begin(); it != c[from].end(); ++it) { - x[from].push_back(edge0(it->_to, it->_cost, 0)); - x[it->_to].push_back(edge0(from, -it->_cost, 0)); - } - } // it + x[from].push_back(edge0(it->_to, it->_cost, 0)); + x[it->_to].push_back(edge0(from, -it->_cost, 0)); + } } // reduced costs for forward edges (c[i,j]-pi[i]+pi[j]) // Note that for forward edges the residual capacity is infinity std::vector>> r_cost_forward(_num_nodes); + for (size_t from = 0; from < _num_nodes; ++from) { - for (size_t from = 0; from < _num_nodes; ++from) { - { - for (auto it = c[from].begin(); it != c[from].end(); ++it) { - r_cost_forward[from].push_back( - edge1(it->_to, it->_cost)); - } - } + for (auto it = c[from].begin(); it != c[from].end(); ++it) + { + r_cost_forward[from].push_back(edge1(it->_to, it->_cost)); } } // reduced costs and capacity for backward edges (c[j,i]-pi[j]+pi[i]) // Since the flow at the beginning is 0, the residual capacity is also zero std::vector>> r_cost_cap_backward(_num_nodes); - - for (size_t from = 0; from < _num_nodes; ++from) { - { - for (auto it = c[from].begin(); it != c[from].end(); ++it) { - r_cost_cap_backward[it->_to].push_back( - edge2(from, -it->_cost, 0)); - } - } + for (size_t 
from = 0; from < _num_nodes; ++from) + { + for (auto it = c[from].begin(); it != c[from].end(); ++it) + { + r_cost_cap_backward[it->_to].push_back( + edge2(from, -it->_cost, 0)); } + } + // Max supply NumT U = 0; + for (size_t i = 0; i < _num_nodes; ++i) { - for (size_t i = 0; i < _num_nodes; ++i) { - if (e[i] > U) - U = e[i]; - } + if (e[i] > U) + U = e[i]; } std::vector d(_num_nodes); @@ -138,9 +140,12 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, NumT max_supply = 0; size_t k = 0; - for (size_t i = 0; i < _num_nodes; ++i) { - if (e[i] > 0) { - if (max_supply < e[i]) { + for (size_t i = 0; i < _num_nodes; ++i) + { + if (e[i] > 0) + { + if (max_supply < e[i]) + { max_supply = e[i]; k = i; } @@ -151,13 +156,10 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, delta = max_supply; size_t l; - compute_shortest_path(d, prev, k, r_cost_forward, - r_cost_cap_backward, e, l); + compute_shortest_path(d, prev, k, r_cost_forward, r_cost_cap_backward, + e, l); - //--------------------------------------------------------------- // find delta (minimum on the path from k to l) - // delta= e[k]; - // if (-e[l]::compute_min_cost_flow(std::vector &e, // residual auto itccb = r_cost_cap_backward[from].begin(); while ((itccb != r_cost_cap_backward[from].end()) - && (itccb->_to != to)) { + && (itccb->_to != to)) + { ++itccb; } - if (itccb != r_cost_cap_backward[from].end()) { + if (itccb != r_cost_cap_backward[from].end()) + { if (itccb->_residual_capacity < delta) delta = itccb->_residual_capacity; } @@ -178,16 +182,16 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, to = from; } while (to != k); - // augment delta flow from k to l (backwards actually...) 
to = l; - do { + do + { size_t from = prev[to]; assert(from != to); - // TODO - might do here O(n) can be done in O(1) auto itx = x[from].begin(); - while (itx->_to != to) { + while (itx->_to != to) + { ++itx; } itx->_flow += delta; @@ -195,10 +199,12 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, // update residual for backward edges auto itccb = r_cost_cap_backward[to].begin(); while ((itccb != r_cost_cap_backward[to].end()) - && (itccb->_to != from)) { + && (itccb->_to != from)) + { ++itccb; } - if (itccb != r_cost_cap_backward[to].end()) { + if (itccb != r_cost_cap_backward[to].end()) + { itccb->_residual_capacity += delta; } itccb = r_cost_cap_backward[from].begin(); @@ -218,32 +224,26 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, to = from; } while (to != k); - - } // while true (until we break when S or T is empty) + } // compute distance from x NumT dist = 0; - - for (size_t from = 0; from < _num_nodes; ++from) - { - for (auto it = x[from].begin(); it != x[from].end(); ++it) - { - dist += (it->_cost * it->_flow); - } - } - + for (size_t from = 0; from < _num_nodes; ++from) + { + for (auto it = x[from].begin(); it != x[from].end(); ++it) + { + dist += (it->_cost * it->_flow); + } + } return dist; } -template -void min_cost_flow:: -compute_shortest_path(std::vector &d, std::vector &prev, - - size_t from, - std::vector>> &cost_forward, - std::vector>> &cost_backward, - - const std::vector &e, size_t &l) +template +void min_cost_flow::compute_shortest_path( + std::vector& d, std::vector& prev, size_t from, + std::vector>>& cost_forward, + std::vector>>& cost_backward, + const std::vector& e, size_t& l) { // Making heap (all inf except 0, so we are saving comparisons...) std::vector> demand(_num_nodes); @@ -253,25 +253,26 @@ compute_shortest_path(std::vector &d, std::vector &prev, demand[0]._dist = 0; size_t j = 1; - // TODO: both of these into a function? 
- - for (size_t i = 0; i < from; ++i) { - demand[j]._to = i; - _nodes_to_demand[i] = j; - demand[j]._dist = std::numeric_limits::max(); - ++j; - } - - for (size_t i = from + 1; i < _num_nodes; ++i) { - demand[j]._to = i; - _nodes_to_demand[i] = j; - demand[j]._dist = std::numeric_limits::max(); - ++j; - } + for (size_t i = 0; i < from; ++i) + { + demand[j]._to = i; + _nodes_to_demand[i] = j; + demand[j]._dist = std::numeric_limits::max(); + ++j; + } + + for (size_t i = from + 1; i < _num_nodes; ++i) + { + demand[j]._to = i; + _nodes_to_demand[i] = j; + demand[j]._dist = std::numeric_limits::max(); + ++j; + } // main loop std::vector final_nodes_flg(_num_nodes, false); - do { + do + { size_t u = demand[0]._to; d[u] = demand[0]._dist; // final distance @@ -284,6 +285,7 @@ compute_shortest_path(std::vector &d, std::vector &prev, heap_remove_first(demand, _nodes_to_demand); + // neighbors of u for (auto it = cost_forward[u].begin(); it != cost_forward[u].end(); ++it) { @@ -297,13 +299,11 @@ compute_shortest_path(std::vector &d, std::vector &prev, prev[v] = u; } } - - for (auto it = cost_backward[u].begin(); - it != cost_backward[u].end(); ++it) + for (auto it = cost_backward[u].begin(); it != cost_backward[u].end(); ++it) { if (it->_residual_capacity > 0) { - assert(it->_reduced_cost >= 0); + assert(it->_reduced_cost >= 0); NumT alt = d[u] + it->_reduced_cost; size_t v = it->_to; if ((_nodes_to_demand[v] < demand.size()) @@ -314,84 +314,89 @@ compute_shortest_path(std::vector &d, std::vector &prev, } } } - // it } while (!demand.empty()); - - for (size_t node_from = 0; node_from < _num_nodes; ++node_from) { - + // reduced costs for forward edges (c[i,j]-pi[i]+pi[j]) + for (size_t node_from = 0; node_from < _num_nodes; ++node_from) + { for (auto it = cost_forward[node_from].begin(); - it != cost_forward[node_from].end(); ++it) { - if (final_nodes_flg[node_from]) { + it != cost_forward[node_from].end(); ++it) + { + if (final_nodes_flg[node_from]) + { it->_reduced_cost += 
d[node_from] - d[l]; } - if (final_nodes_flg[it->_to]) { + if (final_nodes_flg[it->_to]) + { it->_reduced_cost -= d[it->_to] - d[l]; } } - } - // reduced costs and capacity for backward edges (c[j,i]-pi[j]+pi[i]) - - for (size_t node_from = 0; node_from < _num_nodes; ++node_from) { - + for (size_t node_from = 0; node_from < _num_nodes; ++node_from) + { for (auto it = cost_backward[node_from].begin(); - it != cost_backward[node_from].end(); ++it) { - if (final_nodes_flg[node_from]) { + it != cost_backward[node_from].end(); ++it) + { + if (final_nodes_flg[node_from]) + { it->_reduced_cost += d[node_from] - d[l]; } - if (final_nodes_flg[it->_to]) { + if (final_nodes_flg[it->_to]) + { it->_reduced_cost -= d[it->_to] - d[l]; } } } - - } -template -void min_cost_flow::heap_decrease_key(std::vector> &demand, - std::vector &nodes_to_demand, - size_t v, - NumT alt) +template +void min_cost_flow::heap_decrease_key( + std::vector>& demand, std::vector& nodes_to_demand, + size_t v, NumT alt) { size_t i = nodes_to_demand[v]; demand[i]._dist = alt; - while (i > 0 && demand[PARENT(i)]._dist > demand[i]._dist) { + while (i > 0 && demand[PARENT(i)]._dist > demand[i]._dist) + { swap_heap(demand, nodes_to_demand, i, PARENT(i)); i = PARENT(i); } -} // heap_decrease_key +} -template -void min_cost_flow::heap_remove_first(std::vector> &demand, - std::vector &nodes_to_demand) +template +void min_cost_flow::heap_remove_first( + std::vector>& demand, std::vector& nodes_to_demand) { swap_heap(demand, nodes_to_demand, 0, demand.size() - 1); demand.pop_back(); heapify(demand, nodes_to_demand, 0); -} // heap_remove_first +} -template -void min_cost_flow::heapify(std::vector> &demand, - std::vector &nodes_to_demand, size_t i) +template +void min_cost_flow::heapify(std::vector>& demand, + std::vector& nodes_to_demand, + size_t i) { - do { + do + { // TODO: change to loop size_t l = LEFT(i); size_t r = RIGHT(i); size_t smallest; - if ((l < demand.size()) && (demand[l]._dist < demand[i]._dist)) { 
+ if ((l < demand.size()) && (demand[l]._dist < demand[i]._dist)) + { smallest = l; - } else { + } + else + { smallest = i; } - if ((r < demand.size()) - && (demand[r]._dist < demand[smallest]._dist)) { + if ((r < demand.size()) && (demand[r]._dist < demand[smallest]._dist)) + { smallest = r; } @@ -402,13 +407,12 @@ void min_cost_flow::heapify(std::vector> &demand, i = smallest; } while (true); - } -template -void min_cost_flow::swap_heap(std::vector> &demand, - std::vector &nodes_to_demand, size_t i, - size_t j) +template +void min_cost_flow::swap_heap(std::vector>& demand, + std::vector& nodes_to_demand, + size_t i, size_t j) { edge3 tmp = demand[i]; demand[i] = demand[j]; @@ -417,19 +421,20 @@ void min_cost_flow::swap_heap(std::vector> &demand, nodes_to_demand[demand[i]._to] = i; } - -template -NumT min_cost_flow::emd_hat(const std::vector &supply_orig, - const std::vector &demand_orig, - const std::vector &supply, - const std::vector &demand, - const std::vector> &cost, - NumT extra_mass_penalty) +template +NumT min_cost_flow::emd_hat(const std::vector& supply_orig, + const std::vector& demand_orig, + const std::vector& supply, + const std::vector& demand, + const std::vector>& cost) { - if (std::is_integral::value) { + if (std::is_integral::value) + { return integral_emd_hat(supply_orig, demand_orig, supply, demand, - cost, extra_mass_penalty); - } else { + cost); + } + else + { const double mult_factor = 1000000; @@ -445,241 +450,176 @@ NumT min_cost_flow::emd_hat(const std::vector &supply_orig, double sum_supply = 0.0; double sum_demand = 0.0; double max_cost = cost[0][0]; - for (size_t i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) + { sum_supply += supply_orig[i]; sum_demand += demand_orig[i]; - for (size_t j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) + { if (cost[i][j] > max_cost) max_cost = cost[i][j]; } } - double min_sum = std::min(sum_supply, sum_demand); + double max_sum = std::max(sum_supply, sum_demand); double 
supply_demand_norm_factor = mult_factor / max_sum; double cost_norm_factor = mult_factor / max_cost; - for (size_t i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) + { i_supply_orig[i] = static_cast( - floor(supply_orig[i] * supply_demand_norm_factor + 0.5)); + floor(supply_orig[i] * supply_demand_norm_factor + 0.5)); i_demand_orig[i] = static_cast( - floor(demand_orig[i] * supply_demand_norm_factor + 0.5)); - i_supply[i] = static_cast(floor( - supply[i] * supply_demand_norm_factor + 0.5)); - i_demand[i] = static_cast(floor( - demand[i] * supply_demand_norm_factor + 0.5)); - for (size_t j = 0; j < n; ++j) { + floor(demand_orig[i] * supply_demand_norm_factor + 0.5)); + i_supply[i] = static_cast( + floor(supply[i] * supply_demand_norm_factor + 0.5)); + i_demand[i] = static_cast( + floor(demand[i] * supply_demand_norm_factor + 0.5)); + for (size_t j = 0; j < n; ++j) + { i_cost[i][j] = static_cast( - floor(cost[i][j] * cost_norm_factor + 0.5)); + floor(cost[i][j] * cost_norm_factor + 0.5)); } } // computing distance without extra mass penalty - double dist - = integral_emd_hat(i_supply_orig, i_demand_orig, - i_supply, i_demand, i_cost, 0); + double dist = integral_emd_hat(i_supply_orig, i_demand_orig, + i_supply, i_demand, i_cost); + // unnormalize dist = dist / supply_demand_norm_factor; dist = dist / cost_norm_factor; - // adding extra mass penalty - if (extra_mass_penalty == -1) - extra_mass_penalty = max_cost; - dist += (max_sum - min_sum) * extra_mass_penalty; - return dist; - } - } -template -template< typename T> -T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, - const std::vector &demand_orig, - const std::vector &supply_c, - const std::vector &demand_c, - const std::vector> - &cost_c, - T extra_mass_penalty) +template +template +T min_cost_flow::integral_emd_hat( + const std::vector& supply_orig, const std::vector& demand_orig, + const std::vector& supply_c, const std::vector& demand_c, + const std::vector>& cost_c) { size_t n = 
supply_c.size(); assert(demand_c.size() == n); - // Ensuring that the supplier - supply, have more mass. - std::vector supply; - std::vector demand; std::vector> cost(cost_c); T abs_diff_sum_supply_sum_denamd; T sum_supply = 0; T sum_demand = 0; + for (size_t i = 0; i < n; ++i) { - for (size_t i = 0; i < n; ++i) - sum_supply += supply_c[i]; + sum_supply += supply_c[i]; + sum_demand += demand_c[i]; } - { - for (size_t i = 0; i < n; ++i) - sum_demand += demand_c[i]; - } - bool need_to_swap_flow = false; - if (sum_demand > sum_supply) { - need_to_swap_flow = true; - supply = demand_c; - demand = supply_c; - // transpose cost - for (size_t i = 0; i < n; ++i) { - for (size_t j = 0; j < n; ++j) { - cost[i][j] = cost_c[j][i]; - } - } - abs_diff_sum_supply_sum_denamd = sum_demand - sum_supply; - } else { - supply = supply_c; - demand = demand_c; - abs_diff_sum_supply_sum_denamd = sum_supply - sum_demand; - } - // if (need_to_swap_flow) cout << "need_to_swap_flow" << endl; - // creating the b vector that contains all vertexes + // creating the b vector that contains all vertices std::vector b(2 * n + 2); const size_t threshold_node = 2 * n; const size_t artificial_node = 2 * n + 1; // need to be last ! 
+ for (size_t i = 0; i < n; ++i) { - for (size_t i = 0; i < n; ++i) { - b[i] = supply[i]; - } - } - { - for (size_t i = n; i < 2 * n; ++i) { - b[i] = (demand[i - n]); - } + b[i] = supply_c[i]; + b[i + n] = demand_c[i]; } - b[threshold_node] = -abs_diff_sum_supply_sum_denamd; + + b[threshold_node] = 0; b[artificial_node] = 0; - //------------------------------------------------------- - //------------------------------------------------------- T max_cost = 0; + for (size_t i = 0; i < n; ++i) { - for (size_t i = 0; i < n; ++i) { - { - for (size_t j = 0; j < n; ++j) { - assert(cost[i][j] >= 0); - if (cost[i][j] > max_cost) - max_cost = cost[i][j]; - } - } + for (size_t j = 0; j < n; ++j) + { + assert(cost[i][j] >= 0); + if (cost[i][j] > max_cost) + max_cost = cost[i][j]; } } - if (extra_mass_penalty == -1) - extra_mass_penalty = max_cost; - //------------------------------------------------------- - //============================================================= std::set sources_that_flow_not_only_to_thresh; std::set sinks_that_get_flow_not_only_from_thresh; T pre_flow_cost = 0; - //============================================================= - //============================================================= // regular edges between sinks and sources without threshold edges std::vector>> c(b.size()); + for (size_t i = 0; i < n; ++i) { - for (size_t i = 0; i < n; ++i) { - if (b[i] == 0) + if (b[i] == 0) + continue; + for (size_t j = 0; j < n; ++j) + { + if (b[j + n] == 0) continue; - { - for (size_t j = 0; j < n; ++j) { - if (b[j + n] == 0) - continue; - if (cost[i][j] == max_cost) - continue; - c[i].push_back(edge(j + n, cost[i][j])); - } - } // j - } - } // i - - // checking which are not isolated - { - for (size_t i = 0; i < n; ++i) { - if (b[i] == 0) + if (cost[i][j] == max_cost) continue; - { - for (size_t j = 0; j < n; ++j) { - if (b[j + n] == 0) - continue; - if (cost[i][j] == max_cost) - continue; - sources_that_flow_not_only_to_thresh.insert(i); - 
sinks_that_get_flow_not_only_from_thresh.insert(j + n); - } - } // j + c[i].push_back(edge(j + n, cost[i][j])); + + // checking which are not isolated + sources_that_flow_not_only_to_thresh.insert(i); + sinks_that_get_flow_not_only_from_thresh.insert(j + n); } - } // i + } // converting all sinks to negative + for (size_t i = n; i < 2 * n; ++i) { - for (size_t i = n; i < 2 * n; ++i) { - b[i] = -b[i]; - } + b[i] = -b[i]; } // add edges from/to threshold node, // note that costs are reversed to the paper (see also remark* above) // It is important that it will be this way because of remark* above. + for (size_t i = 0; i < n; ++i) { - for (size_t i = 0; i < n; ++i) { - c[i].push_back(edge(threshold_node, 0)); - } + c[i].push_back(edge(threshold_node, 0)); } + for (size_t j = 0; j < n; ++j) { - for (size_t j = 0; j < n; ++j) { - c[threshold_node].push_back(edge(j + n, max_cost)); - } + c[threshold_node].push_back(edge(j + n, max_cost)); } // artificial arcs - Note the restriction that only one edge i,j is // artificial so I ignore it... + for (size_t i = 0; i < artificial_node; ++i) { - for (size_t i = 0; i < artificial_node; ++i) { - c[i].push_back(edge(artificial_node, max_cost + 1)); - c[artificial_node].push_back(edge(i, max_cost + 1)); - } + c[i].push_back(edge(artificial_node, max_cost + 1)); + c[artificial_node].push_back(edge(i, max_cost + 1)); } - //============================================================= - //==================================================== - // remove nodes with supply demand of 0 - // and vertexes that are connected only to the - // threshold vertex - //==================================================== + // remove nodes with supply demand of 0 and vertices that are connected only + // to the threshold vertex int current_node_name = 0; - // Note here it should be vector and not vector - // as I'm using -1 as a special flag !!! + // Note here it should be vector and not vector as I'm using -1 + // as a special flag !!! 
const int remove_node_flag = -1; std::vector nodes_new_names(b.size(), remove_node_flag); std::vector nodes_old_names; nodes_old_names.reserve(b.size()); + for (size_t i = 0; i < n * 2; ++i) { - for (size_t i = 0; i < n * 2; ++i) { - if (b[i] != 0) { - if (sources_that_flow_not_only_to_thresh.find(i) + if (b[i] != 0) + { + if (sources_that_flow_not_only_to_thresh.find(i) != sources_that_flow_not_only_to_thresh.end() - || sinks_that_get_flow_not_only_from_thresh.find(i) - != sinks_that_get_flow_not_only_from_thresh - .end()) { - nodes_new_names[i] = current_node_name; - nodes_old_names.push_back(i); - ++current_node_name; - } else { - if (i >= n) { // sink - pre_flow_cost -= (b[i] * max_cost); - } - b[threshold_node] - += b[i]; // add mass(i=n) + || sinks_that_get_flow_not_only_from_thresh.find(i) + != sinks_that_get_flow_not_only_from_thresh.end()) + { + nodes_new_names[i] = current_node_name; + nodes_old_names.push_back(i); + ++current_node_name; + } + else + { + if (i >= n) + { // sink + pre_flow_cost -= (b[i] * max_cost); } + b[threshold_node] += b[i]; // add mass(i=n) } } - } // i + } + nodes_new_names[threshold_node] = current_node_name; nodes_old_names.push_back(threshold_node); ++current_node_name; @@ -689,33 +629,31 @@ T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, std::vector bb(current_node_name); size_t j = 0; + for (size_t i = 0; i < b.size(); ++i) { - for (size_t i = 0; i < b.size(); ++i) { - if (nodes_new_names[i] != remove_node_flag) { - bb[j] = b[i]; - ++j; - } + if (nodes_new_names[i] != remove_node_flag) + { + bb[j] = b[i]; + ++j; } } std::vector>> cc(bb.size()); + for (size_t i = 0; i < c.size(); ++i) { - for (size_t i = 0; i < c.size(); ++i) { - if (nodes_new_names[i] == remove_node_flag) - continue; + if (nodes_new_names[i] == remove_node_flag) + continue; + for (auto it = c[i].begin(); it != c[i].end(); ++it) + { + if (nodes_new_names[it->_to] != remove_node_flag) { - for (auto it = c[i].begin(); it != c[i].end(); ++it) { - 
if (nodes_new_names[it->_to] != remove_node_flag) { - cc[nodes_new_names[i]].push_back(edge( - nodes_new_names[it->_to], it->_cost)); - } - } + cc[nodes_new_names[i]].push_back( + edge(nodes_new_names[it->_to], it->_cost)); } } } - min_cost_flow mcf; - + min_cost_flow mcf; T my_dist; std::vector>> flows(bb.size()); @@ -723,19 +661,14 @@ T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, T mcf_dist = mcf.compute_min_cost_flow(bb, cc, flows); my_dist = pre_flow_cost + // pre-flowing on cases where it was possible - mcf_dist + // solution of the transportation problem - (abs_diff_sum_supply_sum_denamd - * extra_mass_penalty); // emd-hat extra mass penalty + mcf_dist; // solution of the transportation problem return my_dist; - } } } -// end min_cost_flow - // Copyright (c) 2009-2012, Ofir Pele // All rights reserved. diff --git a/src/index/ranker/wmd_base.cpp b/src/index/ranker/wmd_base.cpp index 717cf2a6c..a4106a27a 100644 --- a/src/index/ranker/wmd_base.cpp +++ b/src/index/ranker/wmd_base.cpp @@ -1,22 +1,20 @@ -// -// Created by lolik111 on 17.11.17. 
-// +/** + * @file wmd_base.cpp + * @author lolik111 + */ -#include #include "meta/index/ranker/wmd_base.h" +#include "meta/index/forward_index.h" #include "meta/index/postings_data.h" -#include "meta/util/fixed_heap.h" #include "meta/index/score_data.h" #include "meta/logging/logger.h" -#include "meta/index/forward_index.h" - +#include "meta/util/fixed_heap.h" namespace meta { namespace index { - const util::string_view wmd_base::id = "wmd-base"; const std::string wmd_base::default_mode = "rwmd"; @@ -26,89 +24,84 @@ const constexpr size_t wmd_base::default_cache_size; wmd_base::wmd_base(std::shared_ptr fwd, std::shared_ptr embeddings, size_t cache_size, size_t nthreads) - : fwd_{std::move(fwd)}, - embeddings_{embeddings}, - nthreads_{nthreads}, - cache_size_{cache_size}, - cache_{std::make_shared, double> > (nthreads, cache_size)} + : fwd_{std::move(fwd)}, + embeddings_{embeddings}, + nthreads_{nthreads}, + cache_size_{cache_size}, + cache_{std::make_shared, + double>>(nthreads, + cache_size)} { - } -void wmd_base::save(std::ostream &out) const +void wmd_base::save(std::ostream& out) const { io::packed::write(out, id); io::packed::write(out, nthreads_); io::packed::write(out, cache_size_); io::packed::write(out, fwd_->index_name()); - } -wmd_base::wmd_base(std::istream &in) : - nthreads_{io::packed::read(in)}, - cache_size_{io::packed::read(in)}, - cache_{std::make_shared, double> >(nthreads_, - cache_size_)} +wmd_base::wmd_base(std::istream& in) + : nthreads_{io::packed::read(in)}, + cache_size_{io::packed::read(in)}, + cache_{std::make_shared, + double>>(nthreads_, + cache_size_)} { auto path = io::packed::read(in); auto cfg = cpptoml::parse_file(path + "/config.toml"); fwd_ = make_index(*cfg); - embeddings_ = std::make_shared - (embeddings::load_embeddings(*cfg)); + embeddings_ = std::make_shared( + embeddings::load_embeddings(*cfg)); } - -std::vector -wmd_base::rank(ranker_context &ctx, uint64_t num_results, - const filter_function_type &filter) +std::vector 
wmd_base::rank(ranker_context& ctx, + uint64_t num_results, + const filter_function_type& filter) { auto results = util::make_fixed_heap( - num_results, - [](const search_result &a, const search_result &b){ - return a.score < b.score; - }); + num_results, [](const search_result& a, const search_result& b) { + return a.score < b.score; + }); - meta::index::em_distance emd(cache_, index::em_distance::l2diff_norm, + meta::index::em_distance emd(cache_, index::em_distance::cosine, embeddings_->vector_size()); - for (auto doc : fwd_->docs()) { - if (!filter(doc)) continue; + for (auto doc : fwd_->docs()) + { + if (!filter(doc)) + continue; - std::vector> tf = fwd_->search_primary( - doc)->counts(); + std::vector> tf + = fwd_->search_primary(doc)->counts(); auto doc1 = create_document(tf, ctx); std::vector> tf_pc; std::vector pc = ctx.postings; - for (auto one: pc) { - tf_pc.push_back(std::pair(one.t_id, - one.query_term_weight)); + for (auto one : pc) + { + tf_pc.push_back( + std::pair(one.t_id, one.query_term_weight)); } auto doc2 = create_document(tf_pc, ctx); -// double score1 = emd.emd_relaxed(doc1, doc2); -// double score2 = emd.emd_relaxed(doc2, doc1); -// results.emplace(search_result(doc, (float) std::max(score1, score2))); - auto time - = common::time([&]() { - results.emplace(search_result{doc, static_cast(emd - .forward_emd(doc1, doc2))}); - }); - int p = 3; + double score1 = emd.emd_relaxed(doc1, doc2); + double score2 = emd.emd_relaxed(doc2, doc1); + results.emplace(search_result(doc, (float)std::max(score1, score2))); } return results.extract_top(); - } meta::index::Document wmd_base::create_document(std::vector> tf, - ranker_context &ctx) + ranker_context& ctx) { size_t unique_terms_count = tf.size(); size_t all_terms_count = 0; @@ -127,15 +120,17 @@ wmd_base::create_document(std::vector> tf, auto vec_id = this->embeddings_->tid(term); - if (vec_id >= 0) { + if (vec_id >= 0) + { all_terms_count += term_data.second; auto embedding = 
this->embeddings_->at(term); - document.vectors.emplace_back(std::vector( - embedding.v.begin(), embedding.v.end())); + document.vectors.emplace_back( + std::vector(embedding.v.begin(), embedding.v.end())); document.weights.emplace_back(term_data.second); document.ids.emplace_back(vec_id); - - } else { + } + else + { unique_terms_count--; } } @@ -148,34 +143,30 @@ wmd_base::create_document(std::vector> tf, return document; } - -template<> -std::unique_ptr -make_ranker(const cpptoml::table &global, - const cpptoml::table &local) +template <> +std::unique_ptr make_ranker(const cpptoml::table& global, + const cpptoml::table& local) { if (global.begin() == global.end()) throw ranker_exception{"empty global configuration provided to " - "construction of wmd_base ranker"}; + "construction of wmd_base ranker"}; auto embeddings = global.get_table("embeddings"); if (!embeddings) - throw std::runtime_error{ - "\"embeddings\" group needed in config file!"}; + throw std::runtime_error{"\"embeddings\" group needed in config file!"}; auto glove = embeddings::load_embeddings(*embeddings); - auto mode = local.get_as("mode").value_or - (wmd_base::default_mode); - auto cache_size = local.get_as("cache-per-thread").value_or - (wmd_base::default_cache_size); - size_t nthreads = local.get_as("num-threads").value_or - (std::thread::hardware_concurrency()); + auto mode + = local.get_as("mode").value_or(wmd_base::default_mode); + auto cache_size = local.get_as("cache-per-thread") + .value_or(wmd_base::default_cache_size); + size_t nthreads = local.get_as("num-threads") + .value_or(std::thread::hardware_concurrency()); auto f_idx = make_index(global); - return make_unique(f_idx, - std::make_shared( - glove), cache_size, nthreads); + return make_unique( + f_idx, std::make_shared(glove), cache_size, + nthreads); } } } - From fe240c7a5f47c54a9ec3a1641326f162d883227a Mon Sep 17 00:00:00 2001 From: Valiullin Albert Date: Wed, 29 Nov 2017 21:07:20 +0300 Subject: [PATCH 03/15] [ranker]: resolving 
warnings in wmd --- include/meta/index/ranker/emd.h | 5 ++--- include/meta/index/ranker/wmd_base.h | 9 ++++----- include/meta/util/min_cost_flow.h | 8 ++------ include/meta/util/min_cost_flow.tcc | 24 ++++++------------------ src/index/ranker/wmd_base.cpp | 7 +++---- 5 files changed, 17 insertions(+), 36 deletions(-) diff --git a/include/meta/index/ranker/emd.h b/include/meta/index/ranker/emd.h index ea987bb33..b07afc708 100644 --- a/include/meta/index/ranker/emd.h +++ b/include/meta/index/ranker/emd.h @@ -116,7 +116,7 @@ class em_distance } } util::min_cost_flow mcf; - auto score = mcf.emd_hat(supply, demand, supply, demand, cost); + auto score = mcf.emd_hat(supply, demand, cost); return score; } @@ -420,10 +420,9 @@ class em_distance private: + const size_t nthreads_; std::shared_ptr, double>> cache_; - - const size_t nthreads_; const size_t dimension_; const metric_type dist; std::unordered_map fwd_; - std::shared_ptr, - double>> - cache_; std::shared_ptr embeddings_; const size_t nthreads_; const size_t cache_size_; + std::shared_ptr, + double>> + cache_; meta::index::Document - create_document(std::vector> tf, - ranker_context& ctx); + create_document(std::vector> tf); }; /** diff --git a/include/meta/util/min_cost_flow.h b/include/meta/util/min_cost_flow.h index d0e06a2d1..242e38671 100644 --- a/include/meta/util/min_cost_flow.h +++ b/include/meta/util/min_cost_flow.h @@ -37,9 +37,7 @@ class min_cost_flow { public: - NumT emd_hat(const std::vector& supply_orig, - const std::vector& demand_orig, - const std::vector& supply, + NumT emd_hat(const std::vector& supply, const std::vector& demand, const std::vector>& cost); @@ -55,9 +53,7 @@ class min_cost_flow std::vector _nodes_to_demand; template - static T integral_emd_hat(const std::vector& supply_orig, - const std::vector& demand_orig, - const std::vector& supply, + static T integral_emd_hat(const std::vector& supply, const std::vector& demand, const std::vector>& cost); diff --git 
a/include/meta/util/min_cost_flow.tcc b/include/meta/util/min_cost_flow.tcc index b3f476edc..05399eb28 100644 --- a/include/meta/util/min_cost_flow.tcc +++ b/include/meta/util/min_cost_flow.tcc @@ -422,16 +422,13 @@ void min_cost_flow::swap_heap(std::vector>& demand, } template -NumT min_cost_flow::emd_hat(const std::vector& supply_orig, - const std::vector& demand_orig, - const std::vector& supply, +NumT min_cost_flow::emd_hat(const std::vector& supply, const std::vector& demand, const std::vector>& cost) { if (std::is_integral::value) { - return integral_emd_hat(supply_orig, demand_orig, supply, demand, - cost); + return integral_emd_hat(supply, demand, cost); } else { @@ -440,8 +437,6 @@ NumT min_cost_flow::emd_hat(const std::vector& supply_orig, // Constructing the input const size_t n = supply.size(); - std::vector i_supply_orig(n); - std::vector i_demand_orig(n); std::vector i_supply(n); std::vector i_demand(n); std::vector> i_cost(n, std::vector(n)); @@ -452,8 +447,8 @@ NumT min_cost_flow::emd_hat(const std::vector& supply_orig, double max_cost = cost[0][0]; for (size_t i = 0; i < n; ++i) { - sum_supply += supply_orig[i]; - sum_demand += demand_orig[i]; + sum_supply += supply[i]; + sum_demand += demand[i]; for (size_t j = 0; j < n; ++j) { if (cost[i][j] > max_cost) @@ -466,10 +461,6 @@ NumT min_cost_flow::emd_hat(const std::vector& supply_orig, double cost_norm_factor = mult_factor / max_cost; for (size_t i = 0; i < n; ++i) { - i_supply_orig[i] = static_cast( - floor(supply_orig[i] * supply_demand_norm_factor + 0.5)); - i_demand_orig[i] = static_cast( - floor(demand_orig[i] * supply_demand_norm_factor + 0.5)); i_supply[i] = static_cast( floor(supply[i] * supply_demand_norm_factor + 0.5)); i_demand[i] = static_cast( @@ -481,9 +472,8 @@ NumT min_cost_flow::emd_hat(const std::vector& supply_orig, } } - // computing distance without extra mass penalty - double dist = integral_emd_hat(i_supply_orig, i_demand_orig, - i_supply, i_demand, i_cost); + // computing 
distance + double dist = integral_emd_hat(i_supply, i_demand, i_cost); // unnormalize dist = dist / supply_demand_norm_factor; @@ -496,7 +486,6 @@ NumT min_cost_flow::emd_hat(const std::vector& supply_orig, template template T min_cost_flow::integral_emd_hat( - const std::vector& supply_orig, const std::vector& demand_orig, const std::vector& supply_c, const std::vector& demand_c, const std::vector>& cost_c) { @@ -504,7 +493,6 @@ T min_cost_flow::integral_emd_hat( assert(demand_c.size() == n); std::vector> cost(cost_c); - T abs_diff_sum_supply_sum_denamd; T sum_supply = 0; T sum_demand = 0; for (size_t i = 0; i < n; ++i) diff --git a/src/index/ranker/wmd_base.cpp b/src/index/ranker/wmd_base.cpp index a4106a27a..81c441e63 100644 --- a/src/index/ranker/wmd_base.cpp +++ b/src/index/ranker/wmd_base.cpp @@ -79,7 +79,7 @@ std::vector wmd_base::rank(ranker_context& ctx, std::vector> tf = fwd_->search_primary(doc)->counts(); - auto doc1 = create_document(tf, ctx); + auto doc1 = create_document(tf); std::vector> tf_pc; std::vector pc = ctx.postings; @@ -89,7 +89,7 @@ std::vector wmd_base::rank(ranker_context& ctx, std::pair(one.t_id, one.query_term_weight)); } - auto doc2 = create_document(tf_pc, ctx); + auto doc2 = create_document(tf_pc); double score1 = emd.emd_relaxed(doc1, doc2); double score2 = emd.emd_relaxed(doc2, doc1); @@ -100,8 +100,7 @@ std::vector wmd_base::rank(ranker_context& ctx, } meta::index::Document -wmd_base::create_document(std::vector> tf, - ranker_context& ctx) +wmd_base::create_document(std::vector> tf) { size_t unique_terms_count = tf.size(); size_t all_terms_count = 0; From d43c3ea8a4321838f5de8a43c32dc4aedc264eba Mon Sep 17 00:00:00 2001 From: Valiullin Albert Date: Thu, 30 Nov 2017 02:40:06 +0300 Subject: [PATCH 04/15] [ranker]: refactoring in wmd --- include/meta/index/ranker/emd.h | 2 +- include/meta/util/min_cost_flow.h | 35 +++---- include/meta/util/min_cost_flow.tcc | 142 ++++++++++------------------ 3 files changed, 65 insertions(+), 114 
deletions(-) diff --git a/include/meta/index/ranker/emd.h b/include/meta/index/ranker/emd.h index b07afc708..642150800 100644 --- a/include/meta/index/ranker/emd.h +++ b/include/meta/index/ranker/emd.h @@ -226,7 +226,7 @@ class em_distance } util::min_cost_flow mcf; - std::vector>> f(xtra.size()); + std::vector>> f(xtra.size()); auto score = mcf.compute_min_cost_flow(xtra, edges, f); diff --git a/include/meta/util/min_cost_flow.h b/include/meta/util/min_cost_flow.h index 242e38671..45805186f 100644 --- a/include/meta/util/min_cost_flow.h +++ b/include/meta/util/min_cost_flow.h @@ -21,16 +21,7 @@ template struct edge; template -struct edge0; - -template -struct edge1; - -template -struct edge2; - -template -struct edge3; +struct edge_weighted; template class min_cost_flow @@ -46,7 +37,7 @@ class min_cost_flow // x - the flow is returned in it NumT compute_min_cost_flow(std::vector& e, const std::vector>>& c, - std::vector>>& x); + std::vector>>& x); private: size_t _num_nodes; @@ -57,24 +48,23 @@ class min_cost_flow const std::vector& demand, const std::vector>& cost); - void - compute_shortest_path(std::vector& d, std::vector& prev, - size_t from, - std::vector>>& cost_forward, - std::vector>>& cost_backward, - const std::vector& e, size_t& l); + void compute_shortest_path( + std::vector& d, std::vector& prev, size_t from, + std::vector>>& cost_forward, + std::vector>>& cost_backward, + const std::vector& e, size_t& l); - void heap_decrease_key(std::vector>& demand, + void heap_decrease_key(std::vector>& demand, std::vector& nodes_to_demand, size_t v, NumT alt); - void heap_remove_first(std::vector>& demand, + void heap_remove_first(std::vector>& demand, std::vector& nodes_to_demand); - void heapify(std::vector>& demand, + void heapify(std::vector>& demand, std::vector& nodes_to_demand, size_t i); - void swap_heap(std::vector>& demand, + void swap_heap(std::vector>& demand, std::vector& nodes_to_demand, size_t i, size_t j); size_t LEFT(size_t i) @@ -97,8 +87,7 @@ 
class min_cost_flow #include "min_cost_flow.tcc" -#endif //FAST_EMD_MIN_COST_FLOW_H - +#endif // FAST_EMD_MIN_COST_FLOW_H // Copyright (c) 2009-2012, Ofir Pele // All rights reserved. diff --git a/include/meta/util/min_cost_flow.tcc b/include/meta/util/min_cost_flow.tcc index 05399eb28..c0be9ae74 100644 --- a/include/meta/util/min_cost_flow.tcc +++ b/include/meta/util/min_cost_flow.tcc @@ -17,72 +17,34 @@ namespace meta { namespace util { -template +template struct edge { - edge(size_t to, CostType cost) : _to(to), _cost(cost) + edge(size_t to = 0, T cost = 0) : _to(to), _cost(cost) { } size_t _to; - CostType _cost; + T _cost; }; -template -struct edge0 -{ - edge0(size_t to, CostType cost, CostType flow) - : _to(to), _cost(cost), _flow(flow) - { - } - - size_t _to; - CostType _cost; - CostType _flow; -}; - -template -struct edge1 -{ - edge1(size_t to, CostType reduced_cost) - : _to(to), _reduced_cost(reduced_cost) - { - } - - size_t _to; - CostType _reduced_cost; -}; - -template -struct edge2 -{ - edge2(size_t to, CostType reduced_cost, CostType residual_capacity) - : _to(to), - _reduced_cost(reduced_cost), - _residual_capacity(residual_capacity) - { - } - - size_t _to; - CostType _reduced_cost; - CostType _residual_capacity; -}; - -template -struct edge3 +template +struct edge_weighted { - edge3(size_t to = 0, DistType dist = 0) : _to(to), _dist(dist) + edge_weighted(size_t to, T cost, T amount) + : _to(to), _cost(cost), _amount(amount) { } size_t _to; - DistType _dist; + T _cost; + T _amount; }; template NumT min_cost_flow::compute_min_cost_flow( std::vector& e, const std::vector>>& c, - std::vector>>& x) + std::vector>>& x) { assert(e.size() == c.size()); @@ -96,31 +58,31 @@ NumT min_cost_flow::compute_min_cost_flow( { for (auto it = c[from].begin(); it != c[from].end(); ++it) { - x[from].push_back(edge0(it->_to, it->_cost, 0)); - x[it->_to].push_back(edge0(from, -it->_cost, 0)); + x[from].push_back(edge_weighted(it->_to, it->_cost, 0)); + 
x[it->_to].push_back(edge_weighted(from, -it->_cost, 0)); } } // reduced costs for forward edges (c[i,j]-pi[i]+pi[j]) // Note that for forward edges the residual capacity is infinity - std::vector>> r_cost_forward(_num_nodes); + std::vector>> r_cost_forward(_num_nodes); for (size_t from = 0; from < _num_nodes; ++from) { for (auto it = c[from].begin(); it != c[from].end(); ++it) { - r_cost_forward[from].push_back(edge1(it->_to, it->_cost)); + r_cost_forward[from].push_back(edge(it->_to, it->_cost)); } } // reduced costs and capacity for backward edges (c[j,i]-pi[j]+pi[i]) // Since the flow at the beginning is 0, the residual capacity is also zero - std::vector>> r_cost_cap_backward(_num_nodes); + std::vector>> r_cost_cap_backward(_num_nodes); for (size_t from = 0; from < _num_nodes; ++from) { for (auto it = c[from].begin(); it != c[from].end(); ++it) { r_cost_cap_backward[it->_to].push_back( - edge2(from, -it->_cost, 0)); + edge_weighted(from, -it->_cost, 0)); } } @@ -175,8 +137,8 @@ NumT min_cost_flow::compute_min_cost_flow( } if (itccb != r_cost_cap_backward[from].end()) { - if (itccb->_residual_capacity < delta) - delta = itccb->_residual_capacity; + if (itccb->_amount < delta) + delta = itccb->_amount; } to = from; @@ -194,7 +156,7 @@ NumT min_cost_flow::compute_min_cost_flow( { ++itx; } - itx->_flow += delta; + itx->_amount += delta; // update residual for backward edges auto itccb = r_cost_cap_backward[to].begin(); @@ -205,7 +167,7 @@ NumT min_cost_flow::compute_min_cost_flow( } if (itccb != r_cost_cap_backward[to].end()) { - itccb->_residual_capacity += delta; + itccb->_amount += delta; } itccb = r_cost_cap_backward[from].begin(); while ((itccb != r_cost_cap_backward[from].end()) @@ -215,7 +177,7 @@ NumT min_cost_flow::compute_min_cost_flow( } if (itccb != r_cost_cap_backward[from].end()) { - itccb->_residual_capacity -= delta; + itccb->_amount -= delta; } // update e @@ -232,7 +194,7 @@ NumT min_cost_flow::compute_min_cost_flow( { for (auto it = 
x[from].begin(); it != x[from].end(); ++it) { - dist += (it->_cost * it->_flow); + dist += (it->_cost * it->_amount); } } return dist; @@ -241,23 +203,23 @@ NumT min_cost_flow::compute_min_cost_flow( template void min_cost_flow::compute_shortest_path( std::vector& d, std::vector& prev, size_t from, - std::vector>>& cost_forward, - std::vector>>& cost_backward, + std::vector>>& cost_forward, + std::vector>>& cost_backward, const std::vector& e, size_t& l) { // Making heap (all inf except 0, so we are saving comparisons...) - std::vector> demand(_num_nodes); + std::vector> demand(_num_nodes); demand[0]._to = from; _nodes_to_demand[from] = 0; - demand[0]._dist = 0; + demand[0]._cost = 0; size_t j = 1; for (size_t i = 0; i < from; ++i) { demand[j]._to = i; _nodes_to_demand[i] = j; - demand[j]._dist = std::numeric_limits::max(); + demand[j]._cost = std::numeric_limits::max(); ++j; } @@ -265,7 +227,7 @@ void min_cost_flow::compute_shortest_path( { demand[j]._to = i; _nodes_to_demand[i] = j; - demand[j]._dist = std::numeric_limits::max(); + demand[j]._cost = std::numeric_limits::max(); ++j; } @@ -275,7 +237,7 @@ void min_cost_flow::compute_shortest_path( { size_t u = demand[0]._to; - d[u] = demand[0]._dist; // final distance + d[u] = demand[0]._cost; // final distance final_nodes_flg[u] = true; if (e[u] < 0) { @@ -289,25 +251,26 @@ void min_cost_flow::compute_shortest_path( for (auto it = cost_forward[u].begin(); it != cost_forward[u].end(); ++it) { - assert(it->_reduced_cost >= 0); - NumT alt = d[u] + it->_reduced_cost; + assert(it->_cost >= 0); + NumT alt = d[u] + it->_cost; size_t v = it->_to; if ((_nodes_to_demand[v] < demand.size()) - && (alt < demand[_nodes_to_demand[v]]._dist)) + && (alt < demand[_nodes_to_demand[v]]._cost)) { heap_decrease_key(demand, _nodes_to_demand, v, alt); prev[v] = u; } } - for (auto it = cost_backward[u].begin(); it != cost_backward[u].end(); ++it) + for (auto it = cost_backward[u].begin(); it != cost_backward[u].end(); + ++it) { - if 
(it->_residual_capacity > 0) + if (it->_amount > 0) { - assert(it->_reduced_cost >= 0); - NumT alt = d[u] + it->_reduced_cost; + assert(it->_cost >= 0); + NumT alt = d[u] + it->_cost; size_t v = it->_to; if ((_nodes_to_demand[v] < demand.size()) - && (alt < demand[_nodes_to_demand[v]]._dist)) + && (alt < demand[_nodes_to_demand[v]]._cost)) { heap_decrease_key(demand, _nodes_to_demand, v, alt); prev[v] = u; @@ -325,11 +288,11 @@ void min_cost_flow::compute_shortest_path( { if (final_nodes_flg[node_from]) { - it->_reduced_cost += d[node_from] - d[l]; + it->_cost += d[node_from] - d[l]; } if (final_nodes_flg[it->_to]) { - it->_reduced_cost -= d[it->_to] - d[l]; + it->_cost -= d[it->_to] - d[l]; } } } @@ -342,11 +305,11 @@ void min_cost_flow::compute_shortest_path( { if (final_nodes_flg[node_from]) { - it->_reduced_cost += d[node_from] - d[l]; + it->_cost += d[node_from] - d[l]; } if (final_nodes_flg[it->_to]) { - it->_reduced_cost -= d[it->_to] - d[l]; + it->_cost -= d[it->_to] - d[l]; } } } @@ -354,12 +317,12 @@ void min_cost_flow::compute_shortest_path( template void min_cost_flow::heap_decrease_key( - std::vector>& demand, std::vector& nodes_to_demand, + std::vector>& demand, std::vector& nodes_to_demand, size_t v, NumT alt) { size_t i = nodes_to_demand[v]; - demand[i]._dist = alt; - while (i > 0 && demand[PARENT(i)]._dist > demand[i]._dist) + demand[i]._cost = alt; + while (i > 0 && demand[PARENT(i)]._cost > demand[i]._cost) { swap_heap(demand, nodes_to_demand, i, PARENT(i)); i = PARENT(i); @@ -368,7 +331,7 @@ void min_cost_flow::heap_decrease_key( template void min_cost_flow::heap_remove_first( - std::vector>& demand, std::vector& nodes_to_demand) + std::vector>& demand, std::vector& nodes_to_demand) { swap_heap(demand, nodes_to_demand, 0, demand.size() - 1); demand.pop_back(); @@ -376,7 +339,7 @@ void min_cost_flow::heap_remove_first( } template -void min_cost_flow::heapify(std::vector>& demand, +void min_cost_flow::heapify(std::vector>& demand, std::vector& 
nodes_to_demand, size_t i) { @@ -387,7 +350,7 @@ void min_cost_flow::heapify(std::vector>& demand, size_t l = LEFT(i); size_t r = RIGHT(i); size_t smallest; - if ((l < demand.size()) && (demand[l]._dist < demand[i]._dist)) + if ((l < demand.size()) && (demand[l]._cost < demand[i]._cost)) { smallest = l; } @@ -395,7 +358,7 @@ void min_cost_flow::heapify(std::vector>& demand, { smallest = i; } - if ((r < demand.size()) && (demand[r]._dist < demand[smallest]._dist)) + if ((r < demand.size()) && (demand[r]._cost < demand[smallest]._cost)) { smallest = r; } @@ -410,11 +373,11 @@ void min_cost_flow::heapify(std::vector>& demand, } template -void min_cost_flow::swap_heap(std::vector>& demand, +void min_cost_flow::swap_heap(std::vector>& demand, std::vector& nodes_to_demand, size_t i, size_t j) { - edge3 tmp = demand[i]; + edge tmp = demand[i]; demand[i] = demand[j]; demand[j] = tmp; nodes_to_demand[demand[j]._to] = j; @@ -644,7 +607,7 @@ T min_cost_flow::integral_emd_hat( min_cost_flow mcf; T my_dist; - std::vector>> flows(bb.size()); + std::vector>> flows(bb.size()); T mcf_dist = mcf.compute_min_cost_flow(bb, cc, flows); @@ -656,7 +619,6 @@ T min_cost_flow::integral_emd_hat( } } - // Copyright (c) 2009-2012, Ofir Pele // All rights reserved. 
From 95d9696fc84941b08f48aad78eb8aa4f96e9ef0a Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Thu, 30 Nov 2017 04:13:33 +0300 Subject: [PATCH 05/15] [ranker] Added prefetch and prune mode --- include/meta/embeddings/word_embeddings.h | 7 + include/meta/index/ranker/emd.h | 454 +++++++--------------- include/meta/index/ranker/wmd_base.h | 13 +- src/classify/classifier/classifier.cpp | 6 +- src/embeddings/word_embeddings.cpp | 6 + src/index/ranker/wmd_base.cpp | 188 +++++++-- 6 files changed, 312 insertions(+), 362 deletions(-) diff --git a/include/meta/embeddings/word_embeddings.h b/include/meta/embeddings/word_embeddings.h index 2869736cf..3aab5d57b 100644 --- a/include/meta/embeddings/word_embeddings.h +++ b/include/meta/embeddings/word_embeddings.h @@ -111,6 +111,13 @@ class word_embeddings */ const util::aligned_vector& vocab() const; + /** + * @param term term_id to look up + * @return the embedding vector (as an array_view) for the given term, + * or the vector for the unknown word as appropriate + */ + util::array_view at(std::size_t tid) const; + private: util::array_view vector(std::size_t tid); diff --git a/include/meta/index/ranker/emd.h b/include/meta/index/ranker/emd.h index 642150800..13b3ff76b 100644 --- a/include/meta/index/ranker/emd.h +++ b/include/meta/index/ranker/emd.h @@ -6,21 +6,20 @@ #ifndef META_EMD_H #define META_EMD_H - -#include -#include -#include -#include -#include -#include -#include -#include "meta/parallel/algorithm.h" -#include "meta/math/vector.h" #include "meta/caching/all.h" -#include "meta/util/range.h" #include "meta/hashing/hash.h" +#include "meta/math/vector.h" +#include "meta/parallel/algorithm.h" #include "meta/util/min_cost_flow.h" - +#include "meta/util/range.h" +#include +#include +#include +#include +#include +#include +#include +#include namespace meta { @@ -30,41 +29,55 @@ namespace index class Document { -public: + public: size_t n_terms; std::vector ids; - std::vector> vectors; std::vector weights; }; - class 
em_distance { -public: - using metric_type = std::function&, const - std::vector&)>; - - em_distance(const std::shared_ptr, double>> &cache_, - metric_type metric, - size_t dimension, - size_t nthreads = std::thread::hardware_concurrency()) - : nthreads_(nthreads), cache_(cache_), dimension_(dimension), - dist(metric) + public: + using metric_type + = std::function&, + const util::array_view&)>; + + em_distance( + std:: + shared_ptr, + double>> + cache_, + std::shared_ptr embeddings, + std::string algorithm_type, metric_type metric, size_t nthreads = 1) + : nthreads_(nthreads), + cache_(cache_), + embeddings_(embeddings), + algorithm_type_(algorithm_type), + dimension_(embeddings->vector_size()), + dist(metric) { + methods_.emplace("rwmd", + [this](const Document& doc1, const Document& doc2) { + auto score1 = this->emd_relaxed(doc1, doc2); + auto score2 = this->emd_relaxed(doc2, doc1); + return std::max(score1, score2); + }); + methods_.emplace("wcd", + [this](const Document& doc1, const Document& doc2) { + return this->wcd(doc1, doc2); + }); + methods_.emplace("emd", + [this](const Document& doc1, const Document& doc2) { + return this->emd(doc1, doc2); + }); } - void fill(){ - auto f = [this](const Document &doc1, const Document &doc2){ - return this->emd_relaxed(doc1, doc2); - }; - std::function fz = f; - methods_.insert(std::make_pair(std::string("rwmd"), fz)); - + double score(const Document& doc1, const Document& doc2) + { + return methods_[algorithm_type_](doc1, doc2); } - - double emd(Document &doc1, Document &doc2) + double emd(const Document& doc1, const Document& doc2) { std::vector supply(doc1.n_terms + doc2.n_terms, 0); std::vector demand(doc1.n_terms + doc2.n_terms, 0); @@ -79,37 +92,14 @@ class em_distance demand[doc1.n_terms + i] = doc2.weights[i]; } - std::vector> cost(supply.size(), - std::vector(supply.size(), 0)); - - auto f_c_distance = [&](size_t first, size_t second) - { - std::pair pair; - if (doc1.ids[first] < doc2.ids[second]) - { - pair = 
std::make_pair(doc1.ids[first], - doc2.ids[second]); - } else { - pair = std::make_pair(doc2.ids[second], - doc1.ids[first]); - } - - auto val = cache_->find(pair); - - if(!val) - { - val = dist(doc1.vectors[first], - doc2.vectors[second]); - cache_->insert(pair, val.value()); - } - return val.value(); - }; + std::vector> cost( + supply.size(), std::vector(supply.size(), 0)); for (size_t i = 0; i < doc1.n_terms; ++i) { for (size_t j = 0; j < doc2.n_terms; ++j) { - double dist = f_c_distance(i, j); + double dist = f_c_distance(doc1, doc2, i, j); assert(dist >= 0); cost[i][j + doc1.n_terms] = dist; cost[j + doc1.n_terms][i] = dist; @@ -121,151 +111,45 @@ class em_distance return score; } - double forward_emd(Document &doc1, Document &doc2){ -// std::vector supply; -// std::unordered_map check_map; -// -// for (size_t i = 0; i < doc1.n_terms; ++i) -// { -// check_map.insert({doc1.ids[i], i}); -// } -// -// for (size_t i = 0; i < doc2.n_terms; ++i) -// { -// std::unordered_map::iterator ind; -// if((ind = check_map.find(doc2.ids[i])) != check_map.end()){ -// auto k = ind->second; -// if (doc1.weights[k] < doc2.weights[i]){ -// doc2.weights[i] -= doc1.weights[k]; -// doc1.weights[k] = 0; -// } else { -// doc1.weights[k] -= doc2.weights[i]; -// doc2.weights[i] = 0; -// } -// } -// } -// -// -// for (size_t i = 0; i < doc1.n_terms; ++i) -// { -// if(doc1.weights[i] != 0) -// supply.push_back(doc1.weights[i]); -// } -// std::vector demand(supply.size(), 0); -// -// for (size_t i = 0; i < doc2.n_terms; ++i) -// { -// if(doc2.weights[i] != 0) -// demand.push_back(doc2.weights[i]); -// } -// supply.resize(demand.size(), 0); - std::vector supply(doc1.n_terms + doc2.n_terms, 0); - std::vector demand(doc1.n_terms + doc2.n_terms, 0); - std::vector xtra(doc1.n_terms + doc2.n_terms, 0); - - - for (size_t i = 0; i < doc1.n_terms; ++i) - { - supply[i] = doc1.weights[i]; - xtra[i] = doc1.weights[i]; - } - - for (size_t i = 0; i < doc2.n_terms; ++i) + double emd_relaxed(const Document& 
doc1, const Document& doc2) + { + std::vector ids(doc2.n_terms); + for (size_t i = 0; i < doc2.n_terms; i++) { - demand[doc1.n_terms + i] = doc2.weights[i]; - xtra[doc1.n_terms + i] = -doc2.weights[i]; + ids[i] = i; } - std::vector> cost(supply.size(), - std::vector(supply.size(), 0)); - - auto f_c_distance = [&](size_t first, size_t second) - { - std::pair pair; - if (doc1.ids[first] < doc2.ids[second]) - { - pair = std::make_pair(doc1.ids[first], - doc2.ids[second]); - } else { - pair = std::make_pair(doc2.ids[second], - doc1.ids[first]); - } - - auto val = cache_->find(pair); - - if(!val) - { - val = dist(doc1.vectors[first], - doc2.vectors[second]); - cache_->insert(pair, val.value()); - } - return val.value(); - }; - - std::vector>> edges; - - for (size_t i = 0; i < doc1.n_terms; ++i) + double acc = 0; + for (size_t i = 0; i < doc1.n_terms; i++) { - std::list> list; + std::vector distance(doc2.n_terms); for (size_t j = 0; j < doc2.n_terms; ++j) { - double dist = f_c_distance(i, j); - list.push_back({doc1.n_terms + j, dist}); - - assert(dist >= 0); - cost[i][j + doc1.n_terms] = dist; - cost[j + doc1.n_terms][i] = dist; + distance[j] = f_c_distance(doc1, doc2, i, j); } - edges.push_back(list); - } - for (size_t i = 0; i < doc2.n_terms; ++i) - { - std::list> list; - edges.push_back(list); - } - - util::min_cost_flow mcf; - std::vector>> f(xtra.size()); - - auto score = mcf.compute_min_cost_flow(xtra, edges, f); - - return score; - - } - - double emd_relaxed2(Document &doc1, Document &doc2) - { - std::vector boilerplate(doc2.n_terms); - for (size_t i = 0; i < doc2.n_terms; i++) { - boilerplate[i] = i; - } - - double acc = 0; - for (size_t i = 0; i < doc1.n_terms; i++) { - - if (doc1.weights[i] != 0) { - std::sort( - boilerplate.begin(), - boilerplate.end(), - [&](const int a, const int b){ - bool ans; - ans = dist(doc1.vectors[i], doc2.vectors[a]) < - dist(doc1.vectors[i], doc2.vectors[b]); - return ans; - }); + if (doc1.weights[i] != 0) + { + 
std::sort(ids.begin(), ids.end(), + [&](const size_t a, const size_t b) { + bool ans; + ans = distance[a] < distance[b]; + return ans; + }); double remaining = doc1.weights[i]; - for (size_t j = 0; j < doc2.n_terms; j++) { - uint64_t w = boilerplate[j]; - if (remaining < doc2.weights[w]) { - acc += remaining * - dist(doc1.vectors[i], doc2.vectors[w]); + for (size_t j = 0; j < doc2.n_terms; j++) + { + uint64_t w = ids[j]; + if (remaining < doc2.weights[w]) + { + acc += remaining * distance[w]; break; - } else { + } + else + { remaining -= doc2.weights[w]; - acc += doc2.weights[w] * - dist(doc1.vectors[i], doc2.vectors[w]); + acc += doc2.weights[w] * distance[w]; } } } @@ -273,136 +157,36 @@ class em_distance return acc; } - double emd_relaxed(const Document &doc1, const Document &doc2) + double wcd(const Document& doc1, const Document& doc2) { - double score = 0; - parallel::thread_pool pool(nthreads_); - std::vector> futuress; - futuress.reserve(nthreads_); - - size_t part = doc1.n_terms / nthreads_; - size_t start = 0; - - std::vector ttimes(nthreads_ + 1); - for (size_t i = 0; i < nthreads_; i++) { - ttimes[i] = start; - start += part; - } - ttimes[nthreads_] = (int) doc1.n_terms; - - for (size_t j = 0; j < nthreads_; j++) { - - futuress.emplace_back( - pool.submit_task([&, j]{ - size_t st = ttimes[j]; - size_t en = ttimes[j + 1]; - return emd_relaxed_thread(st, en, doc1, doc2); - }) - ); - } - for (auto &fut: futuress) { - score += fut.get(); - } - return score; - } + using namespace meta::math::operators; - double - emd_relaxed_thread(const size_t start, const size_t end, const Document - &doc1, const Document &doc2) - { - double acc = 0; - std::vector ids(doc2.n_terms); - for (size_t i = 0; i < doc2.n_terms; ++i) { - ids[i] = i; - } + std::vector res1(dimension_, 0); + std::vector res2(dimension_, 0); - auto f_c_distance = [&](size_t first, size_t second) + auto start = doc1.ids.begin(); + for (auto w1 : doc1.weights) { - std::pair pair; - if (doc1.ids[first] 
< doc2.ids[second]) - { - pair = std::make_pair(doc1.ids[first], - doc2.ids[second]); - } else { - pair = std::make_pair(doc2.ids[second], - doc1.ids[first]); - } - - auto val = cache_->find(pair); - - if(!val) - { - val = dist(doc1.vectors[first], - doc2.vectors[second]); - cache_->insert(pair, val.value()); - } - return val.value(); - }; - - for (size_t i = start; i < end; i++) - { - if (doc1.weights[i] == 0) - continue; - - std::vector distances(doc2.n_terms); - - for(size_t j = 0; j < doc2.n_terms; j++) - { - - distances[j] = f_c_distance(i, j); - } - - std::sort(ids.begin(), - ids.end(), - [&] (const size_t a, const size_t b) -> bool - { - return distances[a] < distances[b]; - }); - - double remaining = doc1.weights[i]; - for (auto it = ids.begin(); - it != ids.end(); it++) { - auto w = (uint64_t) *it; - if (remaining < doc2.weights[w]) { - acc += remaining * - dist(doc1.vectors[i], doc2.vectors[w]); - break; - } else { - remaining -= doc2.weights[w]; - acc += doc2.weights[w] * - dist(doc1.vectors[i], doc2.vectors[w]); - } - } - } - return acc; - } - - - double wcd(Document &doc1, Document &doc2) - { - using namespace meta::math::operators; - - std::vector res(dimension_); - auto start = doc1.vectors.begin(); - for (auto w1: doc1.weights) { - res = res + *start++ * w1; + res1 = res1 + embeddings_->at(*start++) * w1; } - start = doc2.vectors.begin(); - for (auto w2: doc2.weights) { - res = res - *start++ * w2; + start = doc2.ids.begin(); + for (auto w2 : doc2.weights) + { + res2 = res2 + embeddings_->at(*start++) * w2; } - return l2norm(res); + return dist(res1, res2); } - static double - l2diff_norm(const std::vector &a, const std::vector &b) + static double l2diff_norm(const util::array_view& a, + const util::array_view& b) { double res = 0.0; auto it1 = a.begin(); auto it2 = b.begin(); - while (it1 != a.end()) { + while (it1 != a.end()) + { double val = *it1 - *it2; res += val * val; it1++; @@ -412,26 +196,54 @@ class em_distance return res; } - static 
double - cosine(const std::vector &a, const std::vector &b) + static double cosine(const util::array_view& a, + const util::array_view& b) { - return -std::inner_product(a.begin(), a.end(), b.begin(), 0.0); + if (a.begin() == b.begin()) + return 0; + return (1.0 - std::inner_product(a.begin(), a.end(), b.begin(), 0.0)) + / 2.0; } -private: - + private: const size_t nthreads_; - std::shared_ptr, double>> cache_; + std::shared_ptr, + double>> + cache_; + std::shared_ptr embeddings_; + const std::string algorithm_type_; const size_t dimension_; const metric_type dist; - std::unordered_map> methods_; -}; + std::unordered_map> + methods_; + + double f_c_distance(const Document& doc1, const Document& doc2, + size_t first, size_t second) + { + std::pair pair; + if (doc1.ids[first] < doc2.ids[second]) + { + pair = {doc1.ids[first], doc2.ids[second]}; + } + else + { + pair = {doc2.ids[second], doc1.ids[first]}; + } + auto val = cache_->find(pair); + if (!val) + { + val = dist(embeddings_->at(doc1.ids[first]), + embeddings_->at(doc2.ids[second])); + cache_->insert(pair, val.value()); + } + return val.value(); + } +}; } } -#endif //META_EMD_H +#endif // META_EMD_H diff --git a/include/meta/index/ranker/wmd_base.h b/include/meta/index/ranker/wmd_base.h index bc21cae11..de632fe0b 100644 --- a/include/meta/index/ranker/wmd_base.h +++ b/include/meta/index/ranker/wmd_base.h @@ -44,14 +44,22 @@ class wmd_base : public ranker const static std::string default_mode; + const static std::string default_distance_func; + const static constexpr size_t default_cache_size = 1000000; wmd_base(std::shared_ptr fwd, std::shared_ptr embeddings, - size_t nthreads, size_t cache_size); + size_t nthreads, size_t cache_size, std::string mode, + std::string distance_func); wmd_base(std::istream& in); + std::vector process(em_distance emd, + const filter_function_type& filter, + ranker_context& ctx, + std::vector docs); + void save(std::ostream& out) const override; std::vector @@ -66,7 +74,8 @@ class 
wmd_base : public ranker std::shared_ptr, double>> cache_; - + const std::string mode_; + const std::string distance_func_; meta::index::Document create_document(std::vector> tf); }; diff --git a/src/classify/classifier/classifier.cpp b/src/classify/classifier/classifier.cpp index d6398e0c3..0024b317f 100644 --- a/src/classify/classifier/classifier.cpp +++ b/src/classify/classifier/classifier.cpp @@ -19,8 +19,10 @@ confusion_matrix classifier::test(dataset_view_type docs) const confusion_matrix matrix; for (const auto& instance : docs) - matrix.add(predicted_label{classify(instance.weights)}, - docs.label(instance)); + try { + matrix.add(predicted_label{classify(instance.weights)}, + docs.label(instance)); + } catch(std::exception e){} return matrix; } diff --git a/src/embeddings/word_embeddings.cpp b/src/embeddings/word_embeddings.cpp index 4372e0649..1d45c7e63 100644 --- a/src/embeddings/word_embeddings.cpp +++ b/src/embeddings/word_embeddings.cpp @@ -161,6 +161,11 @@ embedding word_embeddings::at(std::string term) const return {tid, vector(tid)}; } +util::array_view word_embeddings::at(std::size_t tid) const +{ + return vector(tid); +} + int64_t word_embeddings::tid(std::string term) const { @@ -217,6 +222,7 @@ const util::aligned_vector& word_embeddings::vocab() const return id_to_term_; } + word_embeddings load_embeddings(const cpptoml::table& config) { auto prefix = config.get_as("prefix"); diff --git a/src/index/ranker/wmd_base.cpp b/src/index/ranker/wmd_base.cpp index 81c441e63..2812162b7 100644 --- a/src/index/ranker/wmd_base.cpp +++ b/src/index/ranker/wmd_base.cpp @@ -14,24 +14,28 @@ namespace meta { namespace index { - const util::string_view wmd_base::id = "wmd-base"; const std::string wmd_base::default_mode = "rwmd"; +const std::string wmd_base::default_distance_func = "cosine"; + const constexpr size_t wmd_base::default_cache_size; wmd_base::wmd_base(std::shared_ptr fwd, std::shared_ptr embeddings, - size_t cache_size, size_t nthreads) - : 
fwd_{std::move(fwd)}, - embeddings_{embeddings}, - nthreads_{nthreads}, - cache_size_{cache_size}, + size_t nthreads, size_t cache_size, std::string mode, + std::string distance_func) + : fwd_(fwd), + embeddings_(embeddings), + nthreads_(nthreads), + cache_size_(cache_size), cache_{std::make_shared, double>>(nthreads, - cache_size)} + cache_size)}, + mode_(mode), + distance_func_(distance_func) { } @@ -40,6 +44,8 @@ void wmd_base::save(std::ostream& out) const io::packed::write(out, id); io::packed::write(out, nthreads_); io::packed::write(out, cache_size_); + io::packed::write(out, mode_); + io::packed::write(out, distance_func_); io::packed::write(out, fwd_->index_name()); } @@ -49,7 +55,9 @@ wmd_base::wmd_base(std::istream& in) cache_{std::make_shared, double>>(nthreads_, - cache_size_)} + cache_size_)}, + mode_{io::packed::read(in)}, + distance_func_{io::packed::read(in)} { auto path = io::packed::read(in); auto cfg = cpptoml::parse_file(path + "/config.toml"); @@ -68,37 +76,144 @@ std::vector wmd_base::rank(ranker_context& ctx, return a.score < b.score; }); - meta::index::em_distance emd(cache_, index::em_distance::cosine, - embeddings_->vector_size()); + em_distance::metric_type distance; + if (distance_func_ == "cosine") + { + distance = em_distance::cosine; + } + else if (distance_func_ == "l2diff") + { + distance = em_distance::l2diff_norm; + } + else + { + distance = em_distance::cosine; + } + + parallel::thread_pool pool(nthreads_); + std::vector docs = fwd_->docs(); - for (auto doc : fwd_->docs()) + if (mode_ != "prefetch-prune") + { + meta::index::em_distance emd(cache_, embeddings_, mode_, distance); + auto scores = process(emd, filter, ctx, fwd_->docs()); + for (auto score : scores) + { + results.emplace(score); + } + } + else { - if (!filter(doc)) - continue; + index::em_distance wcd(cache_, embeddings_, "wcd", distance); + index::em_distance emd(cache_, embeddings_, "emd", distance); + index::em_distance rwmd(cache_, embeddings_, "rwmd", distance); 
- std::vector> tf - = fwd_->search_primary(doc)->counts(); + // wcd phase + auto scores = process(wcd, filter, ctx, fwd_->docs()); + std::sort(scores.begin(), scores.end(), + [&](const search_result a, const search_result b) { + bool ans; + ans = a.score < b.score; + return ans; + }); - auto doc1 = create_document(tf); + auto emd_heap = util::make_fixed_heap( + num_results, [](const search_result& a, const search_result& b) { + return a.score < b.score; + }); + std::vector k_docs; + for (size_t i = 0; i < num_results; i++) + { + k_docs.push_back(scores[i].d_id); + } + scores.erase(scores.begin(), scores.begin() + num_results); + // emd after wcd + auto k_emd = process(emd, filter, ctx, k_docs); + for(auto sr : k_emd) + { + results.emplace(sr); + } - std::vector> tf_pc; - std::vector pc = ctx.postings; - for (auto one : pc) + // worst result + auto last = (--results.end())->score; + + const size_t magic_constant = std::max(fwd_->docs().size() / 8, + num_results * 8); + std::vector rwmd_docs(magic_constant); + auto start = scores.begin(); + std::generate(rwmd_docs.begin(), rwmd_docs.end(), [&](){ + return (*start++).d_id; + }); + // rwmd phase + auto rwmd_results = process(rwmd, filter, ctx, rwmd_docs); + + std::vector pretend_docs; + + for(auto sr : rwmd_results) { - tf_pc.push_back( - std::pair(one.t_id, one.query_term_weight)); + if (sr.score < last) + { + pretend_docs.emplace_back(sr.d_id); + } } - auto doc2 = create_document(tf_pc); + if (!pretend_docs.empty()) + { // emd phase + auto pretend_results = process(emd, filter, ctx, pretend_docs); + for (auto sr : pretend_results) + { + results.emplace(sr); + } + } - double score1 = emd.emd_relaxed(doc1, doc2); - double score2 = emd.emd_relaxed(doc2, doc1); - results.emplace(search_result(doc, (float)std::max(score1, score2))); } return results.extract_top(); } +std::vector wmd_base::process(em_distance emd, + const filter_function_type& filter, + ranker_context& ctx, + std::vector docs) +{ + parallel::thread_pool 
pool(nthreads_); + + auto scores = parallel::for_each_block( + docs.begin(), docs.end(), pool, [&](std::vector::iterator start, + std::vector::iterator end) { + std::vector block_scores; + for (auto it = start; it != end; ++it) + { + if (!filter(*it)) + continue; + auto tf = fwd_->search_primary(*it)->counts(); + auto doc1 = create_document(tf); + + std::vector> tf_pc; + tf_pc.reserve(ctx.postings.size()); + for (auto one : ctx.postings) + { + tf_pc.push_back({one.t_id, one.query_term_weight}); + } + + auto doc2 = create_document(tf_pc); + auto score = static_cast(emd.score(doc1, doc2)); + block_scores.emplace_back(*it, score); + } + return block_scores; + }); + std::vector results; + results.reserve(fwd_->docs().size()); + for (auto& vec : scores) + { + for (auto sr : vec.get()) + { + results.emplace_back(sr); + } + } + return results; +} + meta::index::Document wmd_base::create_document(std::vector> tf) { @@ -106,8 +221,6 @@ wmd_base::create_document(std::vector> tf) size_t all_terms_count = 0; meta::index::Document document; - document.vectors = std::vector>(); - document.vectors.reserve(unique_terms_count); document.ids = std::vector(); document.ids.reserve(unique_terms_count); document.weights = std::vector(); @@ -116,15 +229,11 @@ wmd_base::create_document(std::vector> tf) for (auto term_data : tf) { std::string term = fwd_->term_text(term_data.first); - auto vec_id = this->embeddings_->tid(term); if (vec_id >= 0) { all_terms_count += term_data.second; - auto embedding = this->embeddings_->at(term); - document.vectors.emplace_back( - std::vector(embedding.v.begin(), embedding.v.end())); document.weights.emplace_back(term_data.second); document.ids.emplace_back(vec_id); } @@ -136,7 +245,7 @@ wmd_base::create_document(std::vector> tf) using namespace meta::math::operators; - document.weights = document.weights / unique_terms_count; + document.weights = document.weights / all_terms_count; document.n_terms = unique_terms_count; return document; @@ -149,23 +258,28 
@@ std::unique_ptr make_ranker(const cpptoml::table& global, if (global.begin() == global.end()) throw ranker_exception{"empty global configuration provided to " "construction of wmd_base ranker"}; + auto f_idx = make_index(global); + auto embeddings = global.get_table("embeddings"); if (!embeddings) throw std::runtime_error{"\"embeddings\" group needed in config file!"}; + auto glove = embeddings::load_embeddings(*embeddings); - auto mode - = local.get_as("mode").value_or(wmd_base::default_mode); auto cache_size = local.get_as("cache-per-thread") .value_or(wmd_base::default_cache_size); size_t nthreads = local.get_as("num-threads") .value_or(std::thread::hardware_concurrency()); - auto f_idx = make_index(global); + auto mode + = local.get_as("mode").value_or(wmd_base::default_mode); + + auto distance_func = local.get_as("distance-func") + .value_or(wmd_base::default_distance_func); return make_unique( - f_idx, std::make_shared(glove), cache_size, - nthreads); + f_idx, std::make_shared(glove), nthreads, + cache_size, mode, distance_func); } } } From d939b7dd5190d3a487206604cadf74a5f3ab1c2c Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Thu, 30 Nov 2017 04:43:57 +0300 Subject: [PATCH 06/15] [ranker] fixed bug --- src/embeddings/tools/interactive_embeddings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embeddings/tools/interactive_embeddings.cpp b/src/embeddings/tools/interactive_embeddings.cpp index d5c721287..2b467f0fc 100644 --- a/src/embeddings/tools/interactive_embeddings.cpp +++ b/src/embeddings/tools/interactive_embeddings.cpp @@ -45,7 +45,7 @@ parse_word(util::string_view& query, const embeddings::word_embeddings& glove) if (word.empty()) throw parse_exception{"invalid expression"}; parse_whitespace(query); - return glove.at(word).v; + return glove.at(word.to_string()).v; } std::vector parse_expression(util::string_view& query, From 873ef8812dfc30a94678c92a4d733457f4b0309e Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Thu, 30 Nov 
2017 06:54:28 +0300 Subject: [PATCH 07/15] [ranker] returned previous version of emd --- include/meta/index/ranker/emd.h | 2 +- include/meta/util/min_cost_flow.h | 106 +++-- include/meta/util/min_cost_flow.tcc | 665 ++++++++++++++++------------ 3 files changed, 459 insertions(+), 314 deletions(-) diff --git a/include/meta/index/ranker/emd.h b/include/meta/index/ranker/emd.h index 13b3ff76b..deb202796 100644 --- a/include/meta/index/ranker/emd.h +++ b/include/meta/index/ranker/emd.h @@ -106,7 +106,7 @@ class em_distance } } util::min_cost_flow mcf; - auto score = mcf.emd_hat(supply, demand, cost); + auto score = mcf.emd_hat(supply, demand, supply, demand, cost, -1.); return score; } diff --git a/include/meta/util/min_cost_flow.h b/include/meta/util/min_cost_flow.h index 45805186f..6e2e5dbfd 100644 --- a/include/meta/util/min_cost_flow.h +++ b/include/meta/util/min_cost_flow.h @@ -1,7 +1,11 @@ -/** - * @file min_cost_flow.h - * @author lolik111 - */ +// +// Created by lolik111 on 29.11.17. +// + + +// +// Created by lolik111 on 28.11.17. +// #ifndef FAST_EMD_MIN_COST_FLOW_H #define FAST_EMD_MIN_COST_FLOW_H @@ -17,55 +21,77 @@ namespace meta { namespace util { -template +template struct edge; -template -struct edge_weighted; +template +struct edge0; + +template +struct edge1; -template +template +struct edge2; + +template +struct edge3; + +template class min_cost_flow { - public: - NumT emd_hat(const std::vector& supply, - const std::vector& demand, - const std::vector>& cost); +public: // e - supply(positive) and demand(negative). // c[i] - edges that goes from node i. 
first is the second nod // x - the flow is returned in it - NumT compute_min_cost_flow(std::vector& e, - const std::vector>>& c, - std::vector>>& x); - private: + NumT emd_hat(const std::vector &supply_orig, + const std::vector &demand_orig, + const std::vector &supply, + const std::vector &demand, + const std::vector> &cost, + NumT extra_mass_penalty); + + NumT compute_min_cost_flow(std::vector &e, + const std::vector>> &c, + std::vector>> &x); + + +private: + size_t _num_nodes; std::vector _nodes_to_demand; - template - static T integral_emd_hat(const std::vector& supply, - const std::vector& demand, - const std::vector>& cost); + template + static T integral_emd_hat(const std::vector &supply_orig, + const std::vector &demand_orig, + const std::vector &supply, + const std::vector &demand, + const std::vector> &cost, + T extra_mass_penalty); + + void + compute_shortest_path(std::vector &d, std::vector &prev, + + size_t from, + std::vector>> &cost_forward, + std::vector>> &cost_backward, - void compute_shortest_path( - std::vector& d, std::vector& prev, size_t from, - std::vector>>& cost_forward, - std::vector>>& cost_backward, - const std::vector& e, size_t& l); + const std::vector &e, size_t &l); - void heap_decrease_key(std::vector>& demand, - std::vector& nodes_to_demand, size_t v, + void heap_decrease_key(std::vector> &demand, + std::vector &nodes_to_demand, size_t v, NumT alt); - void heap_remove_first(std::vector>& demand, - std::vector& nodes_to_demand); + void heap_remove_first(std::vector> &demand, + std::vector &nodes_to_demand); - void heapify(std::vector>& demand, - std::vector& nodes_to_demand, size_t i); + void heapify(std::vector> &demand, + std::vector &nodes_to_demand, size_t i); - void swap_heap(std::vector>& demand, - std::vector& nodes_to_demand, size_t i, size_t j); + void swap_heap(std::vector> &demand, + std::vector &nodes_to_demand, size_t i, size_t j); size_t LEFT(size_t i) { @@ -81,13 +107,9 @@ class min_cost_flow { return (i - 1) / 2; } 
-}; -} -} -#include "min_cost_flow.tcc" - -#endif // FAST_EMD_MIN_COST_FLOW_H +}; +// end min_cost_flow // Copyright (c) 2009-2012, Ofir Pele // All rights reserved. @@ -115,3 +137,9 @@ class min_cost_flow // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +} +} + +#include "min_cost_flow.tcc" + +#endif //FAST_EMD_MIN_COST_FLOW_H diff --git a/include/meta/util/min_cost_flow.tcc b/include/meta/util/min_cost_flow.tcc index c0be9ae74..cb439f140 100644 --- a/include/meta/util/min_cost_flow.tcc +++ b/include/meta/util/min_cost_flow.tcc @@ -1,50 +1,83 @@ -/** - * @file min_cost_flow.tcc - * @author lolik111 - */ - -#include "min_cost_flow.h" #include #include -#include #include #include #include -#include #include +#include +#include +#include "min_cost_flow.h" namespace meta { namespace util { -template +template struct edge { - edge(size_t to = 0, T cost = 0) : _to(to), _cost(cost) + edge(size_t to, CostType cost) : _to(to), _cost(cost) { } size_t _to; - T _cost; + CostType _cost; }; -template -struct edge_weighted +template +struct edge0 { - edge_weighted(size_t to, T cost, T amount) - : _to(to), _cost(cost), _amount(amount) + edge0(size_t to, CostType cost, CostType flow) : _to(to), _cost(cost), + _flow(flow) { } size_t _to; - T _cost; - T _amount; + CostType _cost; + CostType _flow; }; -template -NumT min_cost_flow::compute_min_cost_flow( - std::vector& e, const std::vector>>& c, - std::vector>>& x) +template +struct edge1 +{ + edge1(size_t to, CostType reduced_cost) : _to(to), + _reduced_cost(reduced_cost) + { + } + + size_t _to; + CostType _reduced_cost; +}; + +template +struct edge2 +{ + edge2(size_t to, CostType reduced_cost, CostType residual_capacity) + : _to(to), + _reduced_cost(reduced_cost), + _residual_capacity(residual_capacity) + { + } + + size_t _to; + CostType _reduced_cost; + CostType 
_residual_capacity; +}; + +template +struct edge3 +{ + edge3(size_t to = 0, DistType dist = 0) : _to(to), _dist(dist) + { + } + + size_t _to; + DistType _dist; +}; + +template +NumT min_cost_flow::compute_min_cost_flow(std::vector &e, + const std::vector>> &c, + std::vector>> &x) { assert(e.size() == c.size()); @@ -53,45 +86,49 @@ NumT min_cost_flow::compute_min_cost_flow( _num_nodes = e.size(); _nodes_to_demand.resize(_num_nodes); - // init flow for (size_t from = 0; from < _num_nodes; ++from) { - for (auto it = c[from].begin(); it != c[from].end(); ++it) - { - x[from].push_back(edge_weighted(it->_to, it->_cost, 0)); - x[it->_to].push_back(edge_weighted(from, -it->_cost, 0)); - } + for (auto it = c[from].begin(); it != c[from].end(); ++it) + { + x[from].push_back(edge0(it->_to, it->_cost, 0)); + x[it->_to].push_back(edge0(from, -it->_cost, 0)); + } + } // reduced costs for forward edges (c[i,j]-pi[i]+pi[j]) // Note that for forward edges the residual capacity is infinity - std::vector>> r_cost_forward(_num_nodes); - for (size_t from = 0; from < _num_nodes; ++from) + std::vector>> r_cost_forward(_num_nodes); { - for (auto it = c[from].begin(); it != c[from].end(); ++it) - { - r_cost_forward[from].push_back(edge(it->_to, it->_cost)); + for (size_t from = 0; from < _num_nodes; ++from) { + { + for (auto it = c[from].begin(); it != c[from].end(); ++it) { + r_cost_forward[from].push_back( + edge1(it->_to, it->_cost)); + } + } } } // reduced costs and capacity for backward edges (c[j,i]-pi[j]+pi[i]) // Since the flow at the beginning is 0, the residual capacity is also zero - std::vector>> r_cost_cap_backward(_num_nodes); - for (size_t from = 0; from < _num_nodes; ++from) - { - for (auto it = c[from].begin(); it != c[from].end(); ++it) - { - r_cost_cap_backward[it->_to].push_back( - edge_weighted(from, -it->_cost, 0)); + std::vector>> r_cost_cap_backward(_num_nodes); + + for (size_t from = 0; from < _num_nodes; ++from) { + { + for (auto it = c[from].begin(); it != 
c[from].end(); ++it) { + r_cost_cap_backward[it->_to].push_back( + edge2(from, -it->_cost, 0)); + } + } } - } - // Max supply NumT U = 0; - for (size_t i = 0; i < _num_nodes; ++i) { - if (e[i] > U) - U = e[i]; + for (size_t i = 0; i < _num_nodes; ++i) { + if (e[i] > U) + U = e[i]; + } } std::vector d(_num_nodes); @@ -102,12 +139,9 @@ NumT min_cost_flow::compute_min_cost_flow( NumT max_supply = 0; size_t k = 0; - for (size_t i = 0; i < _num_nodes; ++i) - { - if (e[i] > 0) - { - if (max_supply < e[i]) - { + for (size_t i = 0; i < _num_nodes; ++i) { + if (e[i] > 0) { + if (max_supply < e[i]) { max_supply = e[i]; k = i; } @@ -118,10 +152,12 @@ NumT min_cost_flow::compute_min_cost_flow( delta = max_supply; size_t l; - compute_shortest_path(d, prev, k, r_cost_forward, r_cost_cap_backward, - e, l); + compute_shortest_path(d, prev, k, r_cost_forward, + r_cost_cap_backward, e, l); // find delta (minimum on the path from k to l) + // delta= e[k]; + // if (-e[l]::compute_min_cost_flow( // residual auto itccb = r_cost_cap_backward[from].begin(); while ((itccb != r_cost_cap_backward[from].end()) - && (itccb->_to != to)) - { + && (itccb->_to != to)) { ++itccb; } - if (itccb != r_cost_cap_backward[from].end()) - { - if (itccb->_amount < delta) - delta = itccb->_amount; + if (itccb != r_cost_cap_backward[from].end()) { + if (itccb->_residual_capacity < delta) + delta = itccb->_residual_capacity; } to = from; } while (to != k); + // augment delta flow from k to l (backwards actually...) 
to = l; - do - { + do { size_t from = prev[to]; assert(from != to); + // TODO - might do here O(n) can be done in O(1) auto itx = x[from].begin(); - while (itx->_to != to) - { + while (itx->_to != to) { ++itx; } - itx->_amount += delta; + itx->_flow += delta; // update residual for backward edges auto itccb = r_cost_cap_backward[to].begin(); while ((itccb != r_cost_cap_backward[to].end()) - && (itccb->_to != from)) - { + && (itccb->_to != from)) { ++itccb; } - if (itccb != r_cost_cap_backward[to].end()) - { - itccb->_amount += delta; + if (itccb != r_cost_cap_backward[to].end()) { + itccb->_residual_capacity += delta; } itccb = r_cost_cap_backward[from].begin(); while ((itccb != r_cost_cap_backward[from].end()) @@ -177,7 +209,7 @@ NumT min_cost_flow::compute_min_cost_flow( } if (itccb != r_cost_cap_backward[from].end()) { - itccb->_amount -= delta; + itccb->_residual_capacity -= delta; } // update e @@ -186,58 +218,63 @@ NumT min_cost_flow::compute_min_cost_flow( to = from; } while (to != k); - } + + } // while true (until we break when S or T is empty) // compute distance from x NumT dist = 0; - for (size_t from = 0; from < _num_nodes; ++from) - { - for (auto it = x[from].begin(); it != x[from].end(); ++it) - { - dist += (it->_cost * it->_amount); - } - } + + for (size_t from = 0; from < _num_nodes; ++from) + { + for (auto it = x[from].begin(); it != x[from].end(); ++it) + { + dist += (it->_cost * it->_flow); + } + } + return dist; } -template -void min_cost_flow::compute_shortest_path( - std::vector& d, std::vector& prev, size_t from, - std::vector>>& cost_forward, - std::vector>>& cost_backward, - const std::vector& e, size_t& l) +template +void min_cost_flow:: +compute_shortest_path(std::vector &d, std::vector &prev, + + size_t from, + std::vector>> &cost_forward, + std::vector>> &cost_backward, + + const std::vector &e, size_t &l) { // Making heap (all inf except 0, so we are saving comparisons...) 
- std::vector> demand(_num_nodes); + std::vector> demand(_num_nodes); demand[0]._to = from; _nodes_to_demand[from] = 0; - demand[0]._cost = 0; + demand[0]._dist = 0; size_t j = 1; - for (size_t i = 0; i < from; ++i) - { - demand[j]._to = i; - _nodes_to_demand[i] = j; - demand[j]._cost = std::numeric_limits::max(); - ++j; - } - - for (size_t i = from + 1; i < _num_nodes; ++i) - { - demand[j]._to = i; - _nodes_to_demand[i] = j; - demand[j]._cost = std::numeric_limits::max(); - ++j; - } + // TODO: both of these into a function? + + for (size_t i = 0; i < from; ++i) { + demand[j]._to = i; + _nodes_to_demand[i] = j; + demand[j]._dist = std::numeric_limits::max(); + ++j; + } + + for (size_t i = from + 1; i < _num_nodes; ++i) { + demand[j]._to = i; + _nodes_to_demand[i] = j; + demand[j]._dist = std::numeric_limits::max(); + ++j; + } // main loop std::vector final_nodes_flg(_num_nodes, false); - do - { + do { size_t u = demand[0]._to; - d[u] = demand[0]._cost; // final distance + d[u] = demand[0]._dist; // final distance final_nodes_flg[u] = true; if (e[u] < 0) { @@ -247,119 +284,114 @@ void min_cost_flow::compute_shortest_path( heap_remove_first(demand, _nodes_to_demand); - // neighbors of u for (auto it = cost_forward[u].begin(); it != cost_forward[u].end(); ++it) { - assert(it->_cost >= 0); - NumT alt = d[u] + it->_cost; + assert(it->_reduced_cost >= 0); + NumT alt = d[u] + it->_reduced_cost; size_t v = it->_to; if ((_nodes_to_demand[v] < demand.size()) - && (alt < demand[_nodes_to_demand[v]]._cost)) + && (alt < demand[_nodes_to_demand[v]]._dist)) { heap_decrease_key(demand, _nodes_to_demand, v, alt); prev[v] = u; } } - for (auto it = cost_backward[u].begin(); it != cost_backward[u].end(); - ++it) + + for (auto it = cost_backward[u].begin(); + it != cost_backward[u].end(); ++it) { - if (it->_amount > 0) + if (it->_residual_capacity > 0) { - assert(it->_cost >= 0); - NumT alt = d[u] + it->_cost; + assert(it->_reduced_cost >= 0); + NumT alt = d[u] + it->_reduced_cost; 
size_t v = it->_to; if ((_nodes_to_demand[v] < demand.size()) - && (alt < demand[_nodes_to_demand[v]]._cost)) + && (alt < demand[_nodes_to_demand[v]]._dist)) { heap_decrease_key(demand, _nodes_to_demand, v, alt); prev[v] = u; } } } + // it } while (!demand.empty()); - // reduced costs for forward edges (c[i,j]-pi[i]+pi[j]) - for (size_t node_from = 0; node_from < _num_nodes; ++node_from) - { + + for (size_t node_from = 0; node_from < _num_nodes; ++node_from) { + for (auto it = cost_forward[node_from].begin(); - it != cost_forward[node_from].end(); ++it) - { - if (final_nodes_flg[node_from]) - { - it->_cost += d[node_from] - d[l]; + it != cost_forward[node_from].end(); ++it) { + if (final_nodes_flg[node_from]) { + it->_reduced_cost += d[node_from] - d[l]; } - if (final_nodes_flg[it->_to]) - { - it->_cost -= d[it->_to] - d[l]; + if (final_nodes_flg[it->_to]) { + it->_reduced_cost -= d[it->_to] - d[l]; } } + } + // reduced costs and capacity for backward edges (c[j,i]-pi[j]+pi[i]) - for (size_t node_from = 0; node_from < _num_nodes; ++node_from) - { + + for (size_t node_from = 0; node_from < _num_nodes; ++node_from) { + for (auto it = cost_backward[node_from].begin(); - it != cost_backward[node_from].end(); ++it) - { - if (final_nodes_flg[node_from]) - { - it->_cost += d[node_from] - d[l]; + it != cost_backward[node_from].end(); ++it) { + if (final_nodes_flg[node_from]) { + it->_reduced_cost += d[node_from] - d[l]; } - if (final_nodes_flg[it->_to]) - { - it->_cost -= d[it->_to] - d[l]; + if (final_nodes_flg[it->_to]) { + it->_reduced_cost -= d[it->_to] - d[l]; } } } + + } -template -void min_cost_flow::heap_decrease_key( - std::vector>& demand, std::vector& nodes_to_demand, - size_t v, NumT alt) +template +void min_cost_flow::heap_decrease_key(std::vector> &demand, + std::vector &nodes_to_demand, + size_t v, + NumT alt) { size_t i = nodes_to_demand[v]; - demand[i]._cost = alt; - while (i > 0 && demand[PARENT(i)]._cost > demand[i]._cost) - { + demand[i]._dist = alt; + 
while (i > 0 && demand[PARENT(i)]._dist > demand[i]._dist) { swap_heap(demand, nodes_to_demand, i, PARENT(i)); i = PARENT(i); } -} +} // heap_decrease_key -template -void min_cost_flow::heap_remove_first( - std::vector>& demand, std::vector& nodes_to_demand) +template +void min_cost_flow::heap_remove_first(std::vector> &demand, + std::vector &nodes_to_demand) { swap_heap(demand, nodes_to_demand, 0, demand.size() - 1); demand.pop_back(); heapify(demand, nodes_to_demand, 0); -} +} // heap_remove_first -template -void min_cost_flow::heapify(std::vector>& demand, - std::vector& nodes_to_demand, - size_t i) +template +void min_cost_flow::heapify(std::vector> &demand, + std::vector &nodes_to_demand, size_t i) { - do - { + do { // TODO: change to loop size_t l = LEFT(i); size_t r = RIGHT(i); size_t smallest; - if ((l < demand.size()) && (demand[l]._cost < demand[i]._cost)) - { + if ((l < demand.size()) && (demand[l]._dist < demand[i]._dist)) { smallest = l; - } - else - { + } else { smallest = i; } - if ((r < demand.size()) && (demand[r]._cost < demand[smallest]._cost)) - { + if ((r < demand.size()) + && (demand[r]._dist < demand[smallest]._dist)) { smallest = r; } @@ -370,36 +402,41 @@ void min_cost_flow::heapify(std::vector>& demand, i = smallest; } while (true); + } -template -void min_cost_flow::swap_heap(std::vector>& demand, - std::vector& nodes_to_demand, - size_t i, size_t j) +template +void min_cost_flow::swap_heap(std::vector> &demand, + std::vector &nodes_to_demand, size_t i, + size_t j) { - edge tmp = demand[i]; + edge3 tmp = demand[i]; demand[i] = demand[j]; demand[j] = tmp; nodes_to_demand[demand[j]._to] = j; nodes_to_demand[demand[i]._to] = i; } -template -NumT min_cost_flow::emd_hat(const std::vector& supply, - const std::vector& demand, - const std::vector>& cost) + +template +NumT min_cost_flow::emd_hat(const std::vector &supply_orig, + const std::vector &demand_orig, + const std::vector &supply, + const std::vector &demand, + const std::vector> &cost, + 
NumT extra_mass_penalty) { - if (std::is_integral::value) - { - return integral_emd_hat(supply, demand, cost); - } - else - { + if (std::is_integral::value) { + return integral_emd_hat(supply_orig, demand_orig, supply, demand, + cost, extra_mass_penalty); + } else { const double mult_factor = 1000000; // Constructing the input const size_t n = supply.size(); + std::vector i_supply_orig(n); + std::vector i_demand_orig(n); std::vector i_supply(n); std::vector i_demand(n); std::vector> i_cost(n, std::vector(n)); @@ -408,169 +445,241 @@ NumT min_cost_flow::emd_hat(const std::vector& supply, double sum_supply = 0.0; double sum_demand = 0.0; double max_cost = cost[0][0]; - for (size_t i = 0; i < n; ++i) - { - sum_supply += supply[i]; - sum_demand += demand[i]; - for (size_t j = 0; j < n; ++j) - { + for (size_t i = 0; i < n; ++i) { + sum_supply += supply_orig[i]; + sum_demand += demand_orig[i]; + for (size_t j = 0; j < n; ++j) { if (cost[i][j] > max_cost) max_cost = cost[i][j]; } } - + double min_sum = std::min(sum_supply, sum_demand); double max_sum = std::max(sum_supply, sum_demand); double supply_demand_norm_factor = mult_factor / max_sum; double cost_norm_factor = mult_factor / max_cost; - for (size_t i = 0; i < n; ++i) - { - i_supply[i] = static_cast( - floor(supply[i] * supply_demand_norm_factor + 0.5)); - i_demand[i] = static_cast( - floor(demand[i] * supply_demand_norm_factor + 0.5)); - for (size_t j = 0; j < n; ++j) - { + for (size_t i = 0; i < n; ++i) { + i_supply_orig[i] = static_cast( + floor(supply_orig[i] * supply_demand_norm_factor + 0.5)); + i_demand_orig[i] = static_cast( + floor(demand_orig[i] * supply_demand_norm_factor + 0.5)); + i_supply[i] = static_cast(floor( + supply[i] * supply_demand_norm_factor + 0.5)); + i_demand[i] = static_cast(floor( + demand[i] * supply_demand_norm_factor + 0.5)); + for (size_t j = 0; j < n; ++j) { i_cost[i][j] = static_cast( - floor(cost[i][j] * cost_norm_factor + 0.5)); + floor(cost[i][j] * cost_norm_factor + 0.5)); } } - 
// computing distance - double dist = integral_emd_hat(i_supply, i_demand, i_cost); + // computing distance without extra mass penalty + double dist + = integral_emd_hat(i_supply_orig, i_demand_orig, + i_supply, i_demand, i_cost, 0); - // unnormalize dist = dist / supply_demand_norm_factor; dist = dist / cost_norm_factor; + // adding extra mass penalty +// if (extra_mass_penalty == -1) +// extra_mass_penalty = max_cost; +// dist += (max_sum - min_sum) * extra_mass_penalty; + return dist; + } + } -template -template -T min_cost_flow::integral_emd_hat( - const std::vector& supply_c, const std::vector& demand_c, - const std::vector>& cost_c) +template +template< typename T> +T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, + const std::vector &demand_orig, + const std::vector &supply_c, + const std::vector &demand_c, + const std::vector> + &cost_c, + T extra_mass_penalty) { size_t n = supply_c.size(); assert(demand_c.size() == n); + // Ensuring that the supplier - supply, have more mass. 
+ std::vector supply; + std::vector demand; std::vector> cost(cost_c); + T abs_diff_sum_supply_sum_denamd; T sum_supply = 0; T sum_demand = 0; - for (size_t i = 0; i < n; ++i) { - sum_supply += supply_c[i]; - sum_demand += demand_c[i]; + for (size_t i = 0; i < n; ++i) + sum_supply += supply_c[i]; + } + { + for (size_t i = 0; i < n; ++i) + sum_demand += demand_c[i]; } + bool need_to_swap_flow = false; + if (sum_demand > sum_supply) { + need_to_swap_flow = true; + supply = demand_c; + demand = supply_c; + // transpose cost + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + cost[i][j] = cost_c[j][i]; + } + } + abs_diff_sum_supply_sum_denamd = sum_demand - sum_supply; + } else { + supply = supply_c; + demand = demand_c; + abs_diff_sum_supply_sum_denamd = sum_supply - sum_demand; + } + // if (need_to_swap_flow) cout << "need_to_swap_flow" << endl; - // creating the b vector that contains all vertices + // creating the b vector that contains all vertexes std::vector b(2 * n + 2); const size_t threshold_node = 2 * n; const size_t artificial_node = 2 * n + 1; // need to be last ! 
- for (size_t i = 0; i < n; ++i) { - b[i] = supply_c[i]; - b[i + n] = demand_c[i]; + for (size_t i = 0; i < n; ++i) { + b[i] = supply[i]; + } } - - b[threshold_node] = 0; + { + for (size_t i = n; i < 2 * n; ++i) { + b[i] = (demand[i - n]); + } + } + b[threshold_node] = -abs_diff_sum_supply_sum_denamd; b[artificial_node] = 0; + //------------------------------------------------------- + //------------------------------------------------------- T max_cost = 0; - for (size_t i = 0; i < n; ++i) { - for (size_t j = 0; j < n; ++j) - { - assert(cost[i][j] >= 0); - if (cost[i][j] > max_cost) - max_cost = cost[i][j]; + for (size_t i = 0; i < n; ++i) { + { + for (size_t j = 0; j < n; ++j) { + assert(cost[i][j] >= 0); + if (cost[i][j] > max_cost) + max_cost = cost[i][j]; + } + } } } + if (extra_mass_penalty == -1) + extra_mass_penalty = max_cost; + //------------------------------------------------------- + //============================================================= std::set sources_that_flow_not_only_to_thresh; std::set sinks_that_get_flow_not_only_from_thresh; T pre_flow_cost = 0; + //============================================================= + //============================================================= // regular edges between sinks and sources without threshold edges std::vector>> c(b.size()); - for (size_t i = 0; i < n; ++i) { - if (b[i] == 0) - continue; - for (size_t j = 0; j < n; ++j) - { - if (b[j + n] == 0) - continue; - if (cost[i][j] == max_cost) + for (size_t i = 0; i < n; ++i) { + if (b[i] == 0) continue; - c[i].push_back(edge(j + n, cost[i][j])); + { + for (size_t j = 0; j < n; ++j) { + if (b[j + n] == 0) + continue; + if (cost[i][j] == max_cost) + continue; + c[i].push_back(edge(j + n, cost[i][j])); + } + } // j + } + } // i - // checking which are not isolated - sources_that_flow_not_only_to_thresh.insert(i); - sinks_that_get_flow_not_only_from_thresh.insert(j + n); + // checking which are not isolated + { + for (size_t i = 0; i < n; ++i) { + if 
(b[i] == 0) + continue; + { + for (size_t j = 0; j < n; ++j) { + if (b[j + n] == 0) + continue; + if (cost[i][j] == max_cost) + continue; + sources_that_flow_not_only_to_thresh.insert(i); + sinks_that_get_flow_not_only_from_thresh.insert(j + n); + } + } // j } - } + } // i // converting all sinks to negative - for (size_t i = n; i < 2 * n; ++i) { - b[i] = -b[i]; + for (size_t i = n; i < 2 * n; ++i) { + b[i] = -b[i]; + } } // add edges from/to threshold node, // note that costs are reversed to the paper (see also remark* above) // It is important that it will be this way because of remark* above. - for (size_t i = 0; i < n; ++i) { - c[i].push_back(edge(threshold_node, 0)); + for (size_t i = 0; i < n; ++i) { + c[i].push_back(edge(threshold_node, 0)); + } } - for (size_t j = 0; j < n; ++j) { - c[threshold_node].push_back(edge(j + n, max_cost)); + for (size_t j = 0; j < n; ++j) { + c[threshold_node].push_back(edge(j + n, max_cost)); + } } // artificial arcs - Note the restriction that only one edge i,j is // artificial so I ignore it... - for (size_t i = 0; i < artificial_node; ++i) { - c[i].push_back(edge(artificial_node, max_cost + 1)); - c[artificial_node].push_back(edge(i, max_cost + 1)); + for (size_t i = 0; i < artificial_node; ++i) { + c[i].push_back(edge(artificial_node, max_cost + 1)); + c[artificial_node].push_back(edge(i, max_cost + 1)); + } } + //============================================================= - // remove nodes with supply demand of 0 and vertices that are connected only - // to the threshold vertex + //==================================================== + // remove nodes with supply demand of 0 + // and vertexes that are connected only to the + // threshold vertex + //==================================================== int current_node_name = 0; - // Note here it should be vector and not vector as I'm using -1 - // as a special flag !!! + // Note here it should be vector and not vector + // as I'm using -1 as a special flag !!! 
const int remove_node_flag = -1; std::vector nodes_new_names(b.size(), remove_node_flag); std::vector nodes_old_names; nodes_old_names.reserve(b.size()); - for (size_t i = 0; i < n * 2; ++i) { - if (b[i] != 0) - { - if (sources_that_flow_not_only_to_thresh.find(i) + for (size_t i = 0; i < n * 2; ++i) { + if (b[i] != 0) { + if (sources_that_flow_not_only_to_thresh.find(i) != sources_that_flow_not_only_to_thresh.end() - || sinks_that_get_flow_not_only_from_thresh.find(i) - != sinks_that_get_flow_not_only_from_thresh.end()) - { - nodes_new_names[i] = current_node_name; - nodes_old_names.push_back(i); - ++current_node_name; - } - else - { - if (i >= n) - { // sink - pre_flow_cost -= (b[i] * max_cost); + || sinks_that_get_flow_not_only_from_thresh.find(i) + != sinks_that_get_flow_not_only_from_thresh + .end()) { + nodes_new_names[i] = current_node_name; + nodes_old_names.push_back(i); + ++current_node_name; + } else { + if (i >= n) { // sink + pre_flow_cost -= (b[i] * max_cost); + } + b[threshold_node] + += b[i]; // add mass(i=n) } - b[threshold_node] += b[i]; // add mass(i=n) } } - } - + } // i nodes_new_names[threshold_node] = current_node_name; nodes_old_names.push_back(threshold_node); ++current_node_name; @@ -580,45 +689,53 @@ T min_cost_flow::integral_emd_hat( std::vector bb(current_node_name); size_t j = 0; - for (size_t i = 0; i < b.size(); ++i) { - if (nodes_new_names[i] != remove_node_flag) - { - bb[j] = b[i]; - ++j; + for (size_t i = 0; i < b.size(); ++i) { + if (nodes_new_names[i] != remove_node_flag) { + bb[j] = b[i]; + ++j; + } } } std::vector>> cc(bb.size()); - for (size_t i = 0; i < c.size(); ++i) { - if (nodes_new_names[i] == remove_node_flag) - continue; - for (auto it = c[i].begin(); it != c[i].end(); ++it) - { - if (nodes_new_names[it->_to] != remove_node_flag) + for (size_t i = 0; i < c.size(); ++i) { + if (nodes_new_names[i] == remove_node_flag) + continue; { - cc[nodes_new_names[i]].push_back( - edge(nodes_new_names[it->_to], it->_cost)); + for 
(auto it = c[i].begin(); it != c[i].end(); ++it) { + if (nodes_new_names[it->_to] != remove_node_flag) { + cc[nodes_new_names[i]].push_back(edge( + nodes_new_names[it->_to], it->_cost)); + } + } } } } - min_cost_flow mcf; + min_cost_flow mcf; + T my_dist; - std::vector>> flows(bb.size()); + std::vector>> flows(bb.size()); T mcf_dist = mcf.compute_min_cost_flow(bb, cc, flows); my_dist = pre_flow_cost + // pre-flowing on cases where it was possible - mcf_dist; // solution of the transportation problem + mcf_dist + // solution of the transportation problem + (abs_diff_sum_supply_sum_denamd + * extra_mass_penalty); // emd-hat extra mass penalty return my_dist; + } } } + +// end min_cost_flow + // Copyright (c) 2009-2012, Ofir Pele // All rights reserved. From f8fa4daf60496dae62a8c6f38496fc33666dc6ae Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Thu, 30 Nov 2017 07:18:24 +0300 Subject: [PATCH 08/15] [ranker] fixed bugs --- include/meta/classify/classifier_factory.h | 19 +++++++++++++++++++ include/meta/index/ranker/emd.h | 5 +++++ include/meta/util/min_cost_flow.tcc | 5 ++--- src/index/ranker/wmd_base.cpp | 6 ++++-- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/include/meta/classify/classifier_factory.h b/include/meta/classify/classifier_factory.h index f8bb931f4..a396e7b4e 100644 --- a/include/meta/classify/classifier_factory.h +++ b/include/meta/classify/classifier_factory.h @@ -58,6 +58,24 @@ class classifier_factory void reg_mi(); }; + +/** + * Convenience method for creating a classifier using the factory. 
+ * + * @param global global configuration file + * + * @param local The configuration group that specifies the configuration + * for the classifier to be created + * @param inv_idx The inverted_index to be passed to the classifier being + * created (if needed) + * + * @return a unique_ptr to the classifier created from the given + * configuration + */ + std::unique_ptr + make_classifier(const cpptoml::table& global, const cpptoml::table& local, multiclass_dataset_view training, + std::shared_ptr inv_idx = nullptr); + /** * Convenience method for creating a classifier using the factory. * @@ -115,6 +133,7 @@ make_multi_index_classifier(const cpptoml::table&, return make_unique(training, inv_idx); } + /** * Factory that is responsible for loading classifiers from input streams. * Clients should use the register_classifier method instead of this class diff --git a/include/meta/index/ranker/emd.h b/include/meta/index/ranker/emd.h index deb202796..6c4b4dfbc 100644 --- a/include/meta/index/ranker/emd.h +++ b/include/meta/index/ranker/emd.h @@ -185,6 +185,11 @@ class em_distance double res = 0.0; auto it1 = a.begin(); auto it2 = b.begin(); + if(it1 == it2) + { + return 0; + } + while (it1 != a.end()) { double val = *it1 - *it2; diff --git a/include/meta/util/min_cost_flow.tcc b/include/meta/util/min_cost_flow.tcc index cb439f140..ade9b4204 100644 --- a/include/meta/util/min_cost_flow.tcc +++ b/include/meta/util/min_cost_flow.tcc @@ -723,9 +723,8 @@ T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, T mcf_dist = mcf.compute_min_cost_flow(bb, cc, flows); my_dist = pre_flow_cost + // pre-flowing on cases where it was possible - mcf_dist + // solution of the transportation problem - (abs_diff_sum_supply_sum_denamd - * extra_mass_penalty); // emd-hat extra mass penalty + mcf_dist; // solution of the transportation problem + return my_dist; diff --git a/src/index/ranker/wmd_base.cpp b/src/index/ranker/wmd_base.cpp index 2812162b7..fcecf5bf4 100644 --- 
a/src/index/ranker/wmd_base.cpp +++ b/src/index/ranker/wmd_base.cpp @@ -7,7 +7,6 @@ #include "meta/index/forward_index.h" #include "meta/index/postings_data.h" #include "meta/index/score_data.h" -#include "meta/logging/logger.h" #include "meta/util/fixed_heap.h" namespace meta @@ -104,7 +103,7 @@ std::vector wmd_base::rank(ranker_context& ctx, } else { - index::em_distance wcd(cache_, embeddings_, "wcd", distance); + index::em_distance wcd(cache_, embeddings_, "wcd", em_distance::l2diff_norm); index::em_distance emd(cache_, embeddings_, "emd", distance); index::em_distance rwmd(cache_, embeddings_, "rwmd", distance); @@ -197,6 +196,9 @@ std::vector wmd_base::process(em_distance emd, } auto doc2 = create_document(tf_pc); + if(doc1.n_terms == 0 || doc2.n_terms == 0){ + continue; + } auto score = static_cast(emd.score(doc1, doc2)); block_scores.emplace_back(*it, score); } From 7ca3d58515972bcd801558c7d2e2748de407b400 Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Thu, 30 Nov 2017 07:21:41 +0300 Subject: [PATCH 09/15] [ranker] fix --- include/meta/classify/classifier_factory.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/include/meta/classify/classifier_factory.h b/include/meta/classify/classifier_factory.h index a396e7b4e..f8bb931f4 100644 --- a/include/meta/classify/classifier_factory.h +++ b/include/meta/classify/classifier_factory.h @@ -58,24 +58,6 @@ class classifier_factory void reg_mi(); }; - -/** - * Convenience method for creating a classifier using the factory. 
- * - * @param global global configuration file - * - * @param local The configuration group that specifies the configuration - * for the classifier to be created - * @param inv_idx The inverted_index to be passed to the classifier being - * created (if needed) - * - * @return a unique_ptr to the classifier created from the given - * configuration - */ - std::unique_ptr - make_classifier(const cpptoml::table& global, const cpptoml::table& local, multiclass_dataset_view training, - std::shared_ptr inv_idx = nullptr); - /** * Convenience method for creating a classifier using the factory. * @@ -133,7 +115,6 @@ make_multi_index_classifier(const cpptoml::table&, return make_unique(training, inv_idx); } - /** * Factory that is responsible for loading classifiers from input streams. * Clients should use the register_classifier method instead of this class From c2e3ad27d6cff11cbc904855256c607e88420123 Mon Sep 17 00:00:00 2001 From: Valiullin Albert Date: Thu, 30 Nov 2017 19:02:41 +0300 Subject: [PATCH 10/15] [ranker]: refactoring, resolve warnings in wmd --- include/meta/index/ranker/emd.h | 4 +- include/meta/util/min_cost_flow.h | 104 ++--- include/meta/util/min_cost_flow.tcc | 635 ++++++++++++---------------- src/index/ranker/wmd_base.cpp | 22 +- 4 files changed, 321 insertions(+), 444 deletions(-) diff --git a/include/meta/index/ranker/emd.h b/include/meta/index/ranker/emd.h index 6c4b4dfbc..9800e40cd 100644 --- a/include/meta/index/ranker/emd.h +++ b/include/meta/index/ranker/emd.h @@ -106,7 +106,7 @@ class em_distance } } util::min_cost_flow mcf; - auto score = mcf.emd_hat(supply, demand, supply, demand, cost, -1.); + auto score = mcf.emd_hat(supply, demand, cost); return score; } @@ -185,7 +185,7 @@ class em_distance double res = 0.0; auto it1 = a.begin(); auto it2 = b.begin(); - if(it1 == it2) + if (it1 == it2) { return 0; } diff --git a/include/meta/util/min_cost_flow.h b/include/meta/util/min_cost_flow.h index 6e2e5dbfd..1f2a5fecc 100644 --- 
a/include/meta/util/min_cost_flow.h +++ b/include/meta/util/min_cost_flow.h @@ -1,11 +1,7 @@ -// -// Created by lolik111 on 29.11.17. -// - - -// -// Created by lolik111 on 28.11.17. -// +/** + * @file min_cost_flow.h + * @author lolik111 + */ #ifndef FAST_EMD_MIN_COST_FLOW_H #define FAST_EMD_MIN_COST_FLOW_H @@ -21,77 +17,57 @@ namespace meta { namespace util { -template +template struct edge; -template -struct edge0; - -template -struct edge1; - -template -struct edge2; - -template -struct edge3; +template +struct edge_weighted; -template +template class min_cost_flow { -public: + public: + NumT emd_hat(const std::vector& supply, + const std::vector& demand, + const std::vector>& cost); // e - supply(positive) and demand(negative). // c[i] - edges that goes from node i. first is the second nod // x - the flow is returned in it + NumT compute_min_cost_flow(std::vector& e, + const std::vector>>& c, + std::vector>>& x); - NumT emd_hat(const std::vector &supply_orig, - const std::vector &demand_orig, - const std::vector &supply, - const std::vector &demand, - const std::vector> &cost, - NumT extra_mass_penalty); - - NumT compute_min_cost_flow(std::vector &e, - const std::vector>> &c, - std::vector>> &x); - - -private: - + private: size_t _num_nodes; std::vector _nodes_to_demand; - template - static T integral_emd_hat(const std::vector &supply_orig, - const std::vector &demand_orig, - const std::vector &supply, - const std::vector &demand, - const std::vector> &cost, - T extra_mass_penalty); + template + static T integral_emd_hat(const std::vector& supply, + const std::vector& demand, + const std::vector>& cost); - void - compute_shortest_path(std::vector &d, std::vector &prev, + void compute_shortest_path( + std::vector& d, std::vector& prev, - size_t from, - std::vector>> &cost_forward, - std::vector>> &cost_backward, + size_t from, std::vector>>& cost_forward, + std::vector>>& cost_backward, - const std::vector &e, size_t &l); + const std::vector& e, size_t& l); - 
void heap_decrease_key(std::vector> &demand, - std::vector &nodes_to_demand, size_t v, + void heap_decrease_key(std::vector>& demand, + std::vector& nodes_to_demand, size_t v, NumT alt); - void heap_remove_first(std::vector> &demand, - std::vector &nodes_to_demand); + void heap_remove_first(std::vector>& demand, + std::vector& nodes_to_demand); - void heapify(std::vector> &demand, - std::vector &nodes_to_demand, size_t i); + void heapify(std::vector>& demand, + std::vector& nodes_to_demand, size_t i); - void swap_heap(std::vector> &demand, - std::vector &nodes_to_demand, size_t i, size_t j); + void swap_heap(std::vector>& demand, + std::vector& nodes_to_demand, size_t i, size_t j); size_t LEFT(size_t i) { @@ -107,9 +83,13 @@ class min_cost_flow { return (i - 1) / 2; } - }; -// end min_cost_flow +} +} + +#include "min_cost_flow.tcc" + +#endif // FAST_EMD_MIN_COST_FLOW_H // Copyright (c) 2009-2012, Ofir Pele // All rights reserved. @@ -137,9 +117,3 @@ class min_cost_flow // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-} -} - -#include "min_cost_flow.tcc" - -#endif //FAST_EMD_MIN_COST_FLOW_H diff --git a/include/meta/util/min_cost_flow.tcc b/include/meta/util/min_cost_flow.tcc index ade9b4204..a82bc311b 100644 --- a/include/meta/util/min_cost_flow.tcc +++ b/include/meta/util/min_cost_flow.tcc @@ -1,83 +1,51 @@ +/** + * @file min_cost_flow.tcc + * @author lolik111 + */ + +#include "min_cost_flow.h" #include #include +#include #include +#include #include #include -#include #include -#include -#include "min_cost_flow.h" +#include namespace meta { namespace util { -template +template struct edge { - edge(size_t to, CostType cost) : _to(to), _cost(cost) - { - } - - size_t _to; - CostType _cost; -}; - -template -struct edge0 -{ - edge0(size_t to, CostType cost, CostType flow) : _to(to), _cost(cost), - _flow(flow) - { - } - - size_t _to; - CostType _cost; - CostType _flow; -}; - -template -struct edge1 -{ - edge1(size_t to, CostType reduced_cost) : _to(to), - _reduced_cost(reduced_cost) - { - } - - size_t _to; - CostType _reduced_cost; -}; - -template -struct edge2 -{ - edge2(size_t to, CostType reduced_cost, CostType residual_capacity) - : _to(to), - _reduced_cost(reduced_cost), - _residual_capacity(residual_capacity) + edge(size_t to = 0, T cost = 0) : _to(to), _cost(cost) { } size_t _to; - CostType _reduced_cost; - CostType _residual_capacity; + T _cost; }; -template -struct edge3 +template +struct edge_weighted { - edge3(size_t to = 0, DistType dist = 0) : _to(to), _dist(dist) + edge_weighted(size_t to, T cost, T amount) + : _to(to), _cost(cost), _amount(amount) { } size_t _to; - DistType _dist; + T _cost; + T _amount; }; -template -NumT min_cost_flow::compute_min_cost_flow(std::vector &e, - const std::vector>> &c, - std::vector>> &x) +template +NumT min_cost_flow::compute_min_cost_flow( + std::vector& e, const std::vector>>& c, + std::vector>>& x) { assert(e.size() == c.size()); @@ -86,49 +54,34 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, _num_nodes = e.size(); 
_nodes_to_demand.resize(_num_nodes); - for (size_t from = 0; from < _num_nodes; ++from) - { - for (auto it = c[from].begin(); it != c[from].end(); ++it) - { - x[from].push_back(edge0(it->_to, it->_cost, 0)); - x[it->_to].push_back(edge0(from, -it->_cost, 0)); - } - - } - // reduced costs for forward edges (c[i,j]-pi[i]+pi[j]) // Note that for forward edges the residual capacity is infinity - std::vector>> r_cost_forward(_num_nodes); - { - for (size_t from = 0; from < _num_nodes; ++from) { - { - for (auto it = c[from].begin(); it != c[from].end(); ++it) { - r_cost_forward[from].push_back( - edge1(it->_to, it->_cost)); - } - } - } - } + std::vector>> r_cost_forward(_num_nodes); // reduced costs and capacity for backward edges (c[j,i]-pi[j]+pi[i]) // Since the flow at the beginning is 0, the residual capacity is also zero - std::vector>> r_cost_cap_backward(_num_nodes); + std::vector>> r_cost_cap_backward(_num_nodes); - for (size_t from = 0; from < _num_nodes; ++from) { - { - for (auto it = c[from].begin(); it != c[from].end(); ++it) { - r_cost_cap_backward[it->_to].push_back( - edge2(from, -it->_cost, 0)); - } - } + for (size_t from = 0; from < _num_nodes; ++from) + { + for (auto it = c[from].begin(); it != c[from].end(); ++it) + { + // init flow + x[from].push_back(edge_weighted(it->_to, it->_cost, 0)); + x[it->_to].push_back(edge_weighted(from, -it->_cost, 0)); + + r_cost_forward[from].push_back(edge(it->_to, it->_cost)); + r_cost_cap_backward[it->_to].push_back( + edge_weighted(from, -it->_cost, 0)); } + } + // Max supply NumT U = 0; + for (size_t i = 0; i < _num_nodes; ++i) { - for (size_t i = 0; i < _num_nodes; ++i) { - if (e[i] > U) - U = e[i]; - } + if (e[i] > U) + U = e[i]; } std::vector d(_num_nodes); @@ -139,9 +92,12 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, NumT max_supply = 0; size_t k = 0; - for (size_t i = 0; i < _num_nodes; ++i) { - if (e[i] > 0) { - if (max_supply < e[i]) { + for (size_t i = 0; i < _num_nodes; ++i) + { + if (e[i] > 
0) + { + if (max_supply < e[i]) + { max_supply = e[i]; k = i; } @@ -152,12 +108,10 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, delta = max_supply; size_t l; - compute_shortest_path(d, prev, k, r_cost_forward, - r_cost_cap_backward, e, l); + compute_shortest_path(d, prev, k, r_cost_forward, r_cost_cap_backward, + e, l); // find delta (minimum on the path from k to l) - // delta= e[k]; - // if (-e[l]::compute_min_cost_flow(std::vector &e, // residual auto itccb = r_cost_cap_backward[from].begin(); while ((itccb != r_cost_cap_backward[from].end()) - && (itccb->_to != to)) { + && (itccb->_to != to)) + { ++itccb; } - if (itccb != r_cost_cap_backward[from].end()) { - if (itccb->_residual_capacity < delta) - delta = itccb->_residual_capacity; + if (itccb != r_cost_cap_backward[from].end()) + { + if (itccb->_amount < delta) + delta = itccb->_amount; } to = from; } while (to != k); - // augment delta flow from k to l (backwards actually...) to = l; - do { + do + { size_t from = prev[to]; assert(from != to); - // TODO - might do here O(n) can be done in O(1) auto itx = x[from].begin(); - while (itx->_to != to) { + while (itx->_to != to) + { ++itx; } - itx->_flow += delta; + itx->_amount += delta; // update residual for backward edges auto itccb = r_cost_cap_backward[to].begin(); while ((itccb != r_cost_cap_backward[to].end()) - && (itccb->_to != from)) { + && (itccb->_to != from)) + { ++itccb; } - if (itccb != r_cost_cap_backward[to].end()) { - itccb->_residual_capacity += delta; + if (itccb != r_cost_cap_backward[to].end()) + { + itccb->_amount += delta; } itccb = r_cost_cap_backward[from].begin(); while ((itccb != r_cost_cap_backward[from].end()) @@ -209,7 +167,7 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, } if (itccb != r_cost_cap_backward[from].end()) { - itccb->_residual_capacity -= delta; + itccb->_amount -= delta; } // update e @@ -218,63 +176,59 @@ NumT min_cost_flow::compute_min_cost_flow(std::vector &e, to = from; } while (to != 
k); - - } // while true (until we break when S or T is empty) + } // compute distance from x NumT dist = 0; - - for (size_t from = 0; from < _num_nodes; ++from) - { - for (auto it = x[from].begin(); it != x[from].end(); ++it) - { - dist += (it->_cost * it->_flow); - } - } + for (size_t from = 0; from < _num_nodes; ++from) + { + for (auto it = x[from].begin(); it != x[from].end(); ++it) + { + dist += (it->_cost * it->_amount); + } + } return dist; } -template -void min_cost_flow:: -compute_shortest_path(std::vector &d, std::vector &prev, - - size_t from, - std::vector>> &cost_forward, - std::vector>> &cost_backward, - - const std::vector &e, size_t &l) +template +void min_cost_flow::compute_shortest_path( + std::vector& d, std::vector& prev, size_t from, + std::vector>>& cost_forward, + std::vector>>& cost_backward, + const std::vector& e, size_t& l) { // Making heap (all inf except 0, so we are saving comparisons...) - std::vector> demand(_num_nodes); + std::vector> demand(_num_nodes); demand[0]._to = from; _nodes_to_demand[from] = 0; - demand[0]._dist = 0; + demand[0]._cost = 0; size_t j = 1; - // TODO: both of these into a function? 
- - for (size_t i = 0; i < from; ++i) { - demand[j]._to = i; - _nodes_to_demand[i] = j; - demand[j]._dist = std::numeric_limits::max(); - ++j; - } - - for (size_t i = from + 1; i < _num_nodes; ++i) { - demand[j]._to = i; - _nodes_to_demand[i] = j; - demand[j]._dist = std::numeric_limits::max(); - ++j; - } + for (size_t i = 0; i < from; ++i) + { + demand[j]._to = i; + _nodes_to_demand[i] = j; + demand[j]._cost = std::numeric_limits::max(); + ++j; + } + + for (size_t i = from + 1; i < _num_nodes; ++i) + { + demand[j]._to = i; + _nodes_to_demand[i] = j; + demand[j]._cost = std::numeric_limits::max(); + ++j; + } // main loop std::vector final_nodes_flg(_num_nodes, false); - do { + do + { size_t u = demand[0]._to; - d[u] = demand[0]._dist; // final distance + d[u] = demand[0]._cost; // final distance final_nodes_flg[u] = true; if (e[u] < 0) { @@ -284,114 +238,119 @@ compute_shortest_path(std::vector &d, std::vector &prev, heap_remove_first(demand, _nodes_to_demand); + // neighbors of capacity for (auto it = cost_forward[u].begin(); it != cost_forward[u].end(); ++it) { - assert(it->_reduced_cost >= 0); - NumT alt = d[u] + it->_reduced_cost; + assert(it->_cost >= 0); + NumT alt = d[u] + it->_cost; size_t v = it->_to; if ((_nodes_to_demand[v] < demand.size()) - && (alt < demand[_nodes_to_demand[v]]._dist)) + && (alt < demand[_nodes_to_demand[v]]._cost)) { heap_decrease_key(demand, _nodes_to_demand, v, alt); prev[v] = u; } } - for (auto it = cost_backward[u].begin(); - it != cost_backward[u].end(); ++it) + for (auto it = cost_backward[u].begin(); it != cost_backward[u].end(); + ++it) { - if (it->_residual_capacity > 0) + if (it->_amount > 0) { - assert(it->_reduced_cost >= 0); - NumT alt = d[u] + it->_reduced_cost; + assert(it->_cost >= 0); + NumT alt = d[u] + it->_cost; size_t v = it->_to; if ((_nodes_to_demand[v] < demand.size()) - && (alt < demand[_nodes_to_demand[v]]._dist)) + && (alt < demand[_nodes_to_demand[v]]._cost)) { heap_decrease_key(demand, _nodes_to_demand, v, 
alt); prev[v] = u; } } } - // it - } while (!demand.empty()); - - for (size_t node_from = 0; node_from < _num_nodes; ++node_from) { + // reduced costs for forward edges (cost[i,j]-pi[i]+pi[j]) + for (size_t node_from = 0; node_from < _num_nodes; ++node_from) + { for (auto it = cost_forward[node_from].begin(); - it != cost_forward[node_from].end(); ++it) { - if (final_nodes_flg[node_from]) { - it->_reduced_cost += d[node_from] - d[l]; + it != cost_forward[node_from].end(); ++it) + { + if (final_nodes_flg[node_from]) + { + it->_cost += d[node_from] - d[l]; } - if (final_nodes_flg[it->_to]) { - it->_reduced_cost -= d[it->_to] - d[l]; + if (final_nodes_flg[it->_to]) + { + it->_cost -= d[it->_to] - d[l]; } } - } - // reduced costs and capacity for backward edges (c[j,i]-pi[j]+pi[i]) - - for (size_t node_from = 0; node_from < _num_nodes; ++node_from) { - + for (size_t node_from = 0; node_from < _num_nodes; ++node_from) + { for (auto it = cost_backward[node_from].begin(); - it != cost_backward[node_from].end(); ++it) { - if (final_nodes_flg[node_from]) { - it->_reduced_cost += d[node_from] - d[l]; + it != cost_backward[node_from].end(); ++it) + { + if (final_nodes_flg[node_from]) + { + it->_cost += d[node_from] - d[l]; } - if (final_nodes_flg[it->_to]) { - it->_reduced_cost -= d[it->_to] - d[l]; + if (final_nodes_flg[it->_to]) + { + it->_cost -= d[it->_to] - d[l]; } } } - - } -template -void min_cost_flow::heap_decrease_key(std::vector> &demand, - std::vector &nodes_to_demand, - size_t v, - NumT alt) +template +void min_cost_flow::heap_decrease_key( + std::vector>& demand, std::vector& nodes_to_demand, + size_t v, NumT alt) { size_t i = nodes_to_demand[v]; - demand[i]._dist = alt; - while (i > 0 && demand[PARENT(i)]._dist > demand[i]._dist) { + demand[i]._cost = alt; + while (i > 0 && demand[PARENT(i)]._cost > demand[i]._cost) + { swap_heap(demand, nodes_to_demand, i, PARENT(i)); i = PARENT(i); } -} // heap_decrease_key +} -template -void 
min_cost_flow::heap_remove_first(std::vector> &demand, - std::vector &nodes_to_demand) +template +void min_cost_flow::heap_remove_first( + std::vector>& demand, std::vector& nodes_to_demand) { swap_heap(demand, nodes_to_demand, 0, demand.size() - 1); demand.pop_back(); heapify(demand, nodes_to_demand, 0); -} // heap_remove_first +} -template -void min_cost_flow::heapify(std::vector> &demand, - std::vector &nodes_to_demand, size_t i) +template +void min_cost_flow::heapify(std::vector>& demand, + std::vector& nodes_to_demand, + size_t i) { - - do { + do + { // TODO: change to loop size_t l = LEFT(i); size_t r = RIGHT(i); size_t smallest; - if ((l < demand.size()) && (demand[l]._dist < demand[i]._dist)) { + if ((l < demand.size()) && (demand[l]._cost < demand[i]._cost)) + { smallest = l; - } else { + } + else + { smallest = i; } - if ((r < demand.size()) - && (demand[r]._dist < demand[smallest]._dist)) { + if ((r < demand.size()) && (demand[r]._cost < demand[smallest]._cost)) + { smallest = r; } @@ -402,41 +361,36 @@ void min_cost_flow::heapify(std::vector> &demand, i = smallest; } while (true); - } -template -void min_cost_flow::swap_heap(std::vector> &demand, - std::vector &nodes_to_demand, size_t i, - size_t j) +template +void min_cost_flow::swap_heap(std::vector>& demand, + std::vector& nodes_to_demand, + size_t i, size_t j) { - edge3 tmp = demand[i]; + edge tmp = demand[i]; demand[i] = demand[j]; demand[j] = tmp; nodes_to_demand[demand[j]._to] = j; nodes_to_demand[demand[i]._to] = i; } - -template -NumT min_cost_flow::emd_hat(const std::vector &supply_orig, - const std::vector &demand_orig, - const std::vector &supply, - const std::vector &demand, - const std::vector> &cost, - NumT extra_mass_penalty) +template +NumT min_cost_flow::emd_hat(const std::vector& supply, + const std::vector& demand, + const std::vector>& cost) { - if (std::is_integral::value) { - return integral_emd_hat(supply_orig, demand_orig, supply, demand, - cost, extra_mass_penalty); - } else { 
+ if (std::is_integral::value) + { + return integral_emd_hat(supply, demand, cost); + } + else + { const double mult_factor = 1000000; // Constructing the input const size_t n = supply.size(); - std::vector i_supply_orig(n); - std::vector i_demand_orig(n); std::vector i_supply(n); std::vector i_demand(n); std::vector> i_cost(n, std::vector(n)); @@ -445,61 +399,47 @@ NumT min_cost_flow::emd_hat(const std::vector &supply_orig, double sum_supply = 0.0; double sum_demand = 0.0; double max_cost = cost[0][0]; - for (size_t i = 0; i < n; ++i) { - sum_supply += supply_orig[i]; - sum_demand += demand_orig[i]; - for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < n; ++i) + { + sum_supply += supply[i]; + sum_demand += demand[i]; + for (size_t j = 0; j < n; ++j) + { if (cost[i][j] > max_cost) max_cost = cost[i][j]; } } - double min_sum = std::min(sum_supply, sum_demand); double max_sum = std::max(sum_supply, sum_demand); double supply_demand_norm_factor = mult_factor / max_sum; double cost_norm_factor = mult_factor / max_cost; - for (size_t i = 0; i < n; ++i) { - i_supply_orig[i] = static_cast( - floor(supply_orig[i] * supply_demand_norm_factor + 0.5)); - i_demand_orig[i] = static_cast( - floor(demand_orig[i] * supply_demand_norm_factor + 0.5)); - i_supply[i] = static_cast(floor( - supply[i] * supply_demand_norm_factor + 0.5)); - i_demand[i] = static_cast(floor( - demand[i] * supply_demand_norm_factor + 0.5)); - for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < n; ++i) + { + i_supply[i] = static_cast( + floor(supply[i] * supply_demand_norm_factor + 0.5)); + i_demand[i] = static_cast( + floor(demand[i] * supply_demand_norm_factor + 0.5)); + for (size_t j = 0; j < n; ++j) + { i_cost[i][j] = static_cast( - floor(cost[i][j] * cost_norm_factor + 0.5)); + floor(cost[i][j] * cost_norm_factor + 0.5)); } } - // computing distance without extra mass penalty - double dist - = integral_emd_hat(i_supply_orig, i_demand_orig, - i_supply, i_demand, i_cost, 0); + // computing 
distance + double dist = integral_emd_hat(i_supply, i_demand, i_cost); dist = dist / supply_demand_norm_factor; dist = dist / cost_norm_factor; - // adding extra mass penalty -// if (extra_mass_penalty == -1) -// extra_mass_penalty = max_cost; -// dist += (max_sum - min_sum) * extra_mass_penalty; - return dist; - } - } -template -template< typename T> -T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, - const std::vector &demand_orig, - const std::vector &supply_c, - const std::vector &demand_c, - const std::vector> - &cost_c, - T extra_mass_penalty) +template +template +T min_cost_flow::integral_emd_hat( + const std::vector& supply_c, const std::vector& demand_c, + const std::vector>& cost_c) { size_t n = supply_c.size(); assert(demand_c.size() == n); @@ -511,147 +451,114 @@ T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, T abs_diff_sum_supply_sum_denamd; T sum_supply = 0; T sum_demand = 0; + for (size_t i = 0; i < n; ++i) { - for (size_t i = 0; i < n; ++i) - sum_supply += supply_c[i]; + sum_supply += supply_c[i]; + sum_demand += demand_c[i]; } + + if (sum_demand > sum_supply) { - for (size_t i = 0; i < n; ++i) - sum_demand += demand_c[i]; - } - bool need_to_swap_flow = false; - if (sum_demand > sum_supply) { - need_to_swap_flow = true; supply = demand_c; demand = supply_c; // transpose cost - for (size_t i = 0; i < n; ++i) { - for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < n; ++i) + { + for (size_t j = 0; j < n; ++j) + { cost[i][j] = cost_c[j][i]; } } abs_diff_sum_supply_sum_denamd = sum_demand - sum_supply; - } else { + } + else + { supply = supply_c; demand = demand_c; abs_diff_sum_supply_sum_denamd = sum_supply - sum_demand; } - // if (need_to_swap_flow) cout << "need_to_swap_flow" << endl; // creating the b vector that contains all vertexes std::vector b(2 * n + 2); const size_t threshold_node = 2 * n; const size_t artificial_node = 2 * n + 1; // need to be last ! 
+ for (size_t i = 0; i < n; ++i) { - for (size_t i = 0; i < n; ++i) { - b[i] = supply[i]; - } - } - { - for (size_t i = n; i < 2 * n; ++i) { - b[i] = (demand[i - n]); - } + b[i] = supply[i]; + b[i + n] = demand[i]; } + + // remark*) Deficit of the extra mass, as mass that flows to the threshold + // node can be absorbed from all sources with cost zero + // This makes sum of b zero. b[threshold_node] = -abs_diff_sum_supply_sum_denamd; b[artificial_node] = 0; - //------------------------------------------------------- - //------------------------------------------------------- T max_cost = 0; + for (size_t i = 0; i < n; ++i) { - for (size_t i = 0; i < n; ++i) { - { - for (size_t j = 0; j < n; ++j) { - assert(cost[i][j] >= 0); - if (cost[i][j] > max_cost) - max_cost = cost[i][j]; - } - } + for (size_t j = 0; j < n; ++j) + { + assert(cost[i][j] >= 0); + if (cost[i][j] > max_cost) + max_cost = cost[i][j]; } } - if (extra_mass_penalty == -1) - extra_mass_penalty = max_cost; - //------------------------------------------------------- - //============================================================= std::set sources_that_flow_not_only_to_thresh; std::set sinks_that_get_flow_not_only_from_thresh; T pre_flow_cost = 0; - //============================================================= - //============================================================= // regular edges between sinks and sources without threshold edges std::vector>> c(b.size()); { - for (size_t i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) + { if (b[i] == 0) continue; { - for (size_t j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) + { if (b[j + n] == 0) continue; if (cost[i][j] == max_cost) continue; c[i].push_back(edge(j + n, cost[i][j])); - } - } // j - } - } // i - // checking which are not isolated - { - for (size_t i = 0; i < n; ++i) { - if (b[i] == 0) - continue; - { - for (size_t j = 0; j < n; ++j) { - if (b[j + n] == 0) - continue; - if (cost[i][j] == max_cost) - continue; + // checking which 
are not isolated sources_that_flow_not_only_to_thresh.insert(i); sinks_that_get_flow_not_only_from_thresh.insert(j + n); } - } // j + } } - } // i + } // converting all sinks to negative + for (size_t i = n; i < 2 * n; ++i) { - for (size_t i = n; i < 2 * n; ++i) { - b[i] = -b[i]; - } + b[i] = -b[i]; } // add edges from/to threshold node, // note that costs are reversed to the paper (see also remark* above) // It is important that it will be this way because of remark* above. + for (size_t i = 0; i < n; ++i) { - for (size_t i = 0; i < n; ++i) { - c[i].push_back(edge(threshold_node, 0)); - } - } - { - for (size_t j = 0; j < n; ++j) { - c[threshold_node].push_back(edge(j + n, max_cost)); - } + c[i].push_back(edge(threshold_node, 0)); + c[threshold_node].push_back(edge(i + n, max_cost)); } // artificial arcs - Note the restriction that only one edge i,j is // artificial so I ignore it... + for (size_t i = 0; i < artificial_node; ++i) { - for (size_t i = 0; i < artificial_node; ++i) { - c[i].push_back(edge(artificial_node, max_cost + 1)); - c[artificial_node].push_back(edge(i, max_cost + 1)); - } + c[i].push_back(edge(artificial_node, max_cost + 1)); + c[artificial_node].push_back(edge(i, max_cost + 1)); } - //============================================================= - //==================================================== // remove nodes with supply demand of 0 - // and vertexes that are connected only to the + // and vertices that are connected only to the // threshold vertex - //==================================================== int current_node_name = 0; // Note here it should be vector and not vector // as I'm using -1 as a special flag !!! 
@@ -659,27 +566,31 @@ T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, std::vector nodes_new_names(b.size(), remove_node_flag); std::vector nodes_old_names; nodes_old_names.reserve(b.size()); + + for (size_t i = 0; i < n * 2; ++i) { - for (size_t i = 0; i < n * 2; ++i) { - if (b[i] != 0) { - if (sources_that_flow_not_only_to_thresh.find(i) + if (b[i] != 0) + { + if (sources_that_flow_not_only_to_thresh.find(i) != sources_that_flow_not_only_to_thresh.end() - || sinks_that_get_flow_not_only_from_thresh.find(i) - != sinks_that_get_flow_not_only_from_thresh - .end()) { - nodes_new_names[i] = current_node_name; - nodes_old_names.push_back(i); - ++current_node_name; - } else { - if (i >= n) { // sink - pre_flow_cost -= (b[i] * max_cost); - } - b[threshold_node] - += b[i]; // add mass(i=n) + || sinks_that_get_flow_not_only_from_thresh.find(i) + != sinks_that_get_flow_not_only_from_thresh.end()) + { + nodes_new_names[i] = current_node_name; + nodes_old_names.push_back(i); + ++current_node_name; + } + else + { + if (i >= n) + { // sink + pre_flow_cost -= (b[i] * max_cost); } + b[threshold_node] += b[i]; // add mass(i=n) } } - } // i + } + nodes_new_names[threshold_node] = current_node_name; nodes_old_names.push_back(threshold_node); ++current_node_name; @@ -689,52 +600,44 @@ T min_cost_flow::integral_emd_hat(const std::vector &supply_orig, std::vector bb(current_node_name); size_t j = 0; + for (size_t i = 0; i < b.size(); ++i) { - for (size_t i = 0; i < b.size(); ++i) { - if (nodes_new_names[i] != remove_node_flag) { - bb[j] = b[i]; - ++j; - } + if (nodes_new_names[i] != remove_node_flag) + { + bb[j] = b[i]; + ++j; } } std::vector>> cc(bb.size()); + for (size_t i = 0; i < c.size(); ++i) { - for (size_t i = 0; i < c.size(); ++i) { - if (nodes_new_names[i] == remove_node_flag) - continue; + if (nodes_new_names[i] == remove_node_flag) + continue; + for (auto it = c[i].begin(); it != c[i].end(); ++it) + { + if (nodes_new_names[it->_to] != remove_node_flag) { - 
for (auto it = c[i].begin(); it != c[i].end(); ++it) { - if (nodes_new_names[it->_to] != remove_node_flag) { - cc[nodes_new_names[i]].push_back(edge( - nodes_new_names[it->_to], it->_cost)); - } - } + cc[nodes_new_names[i]].push_back( + edge(nodes_new_names[it->_to], it->_cost)); } } } - min_cost_flow mcf; - + min_cost_flow mcf; T my_dist; - - std::vector>> flows(bb.size()); + std::vector>> flows(bb.size()); T mcf_dist = mcf.compute_min_cost_flow(bb, cc, flows); my_dist = pre_flow_cost + // pre-flowing on cases where it was possible - mcf_dist; // solution of the transportation problem - + mcf_dist; // solution of the transportation problem return my_dist; - } } } - -// end min_cost_flow - // Copyright (c) 2009-2012, Ofir Pele // All rights reserved. diff --git a/src/index/ranker/wmd_base.cpp b/src/index/ranker/wmd_base.cpp index fcecf5bf4..5f46a3c01 100644 --- a/src/index/ranker/wmd_base.cpp +++ b/src/index/ranker/wmd_base.cpp @@ -103,7 +103,8 @@ std::vector wmd_base::rank(ranker_context& ctx, } else { - index::em_distance wcd(cache_, embeddings_, "wcd", em_distance::l2diff_norm); + index::em_distance wcd(cache_, embeddings_, "wcd", + em_distance::l2diff_norm); index::em_distance emd(cache_, embeddings_, "emd", distance); index::em_distance rwmd(cache_, embeddings_, "rwmd", distance); @@ -128,7 +129,7 @@ std::vector wmd_base::rank(ranker_context& ctx, scores.erase(scores.begin(), scores.begin() + num_results); // emd after wcd auto k_emd = process(emd, filter, ctx, k_docs); - for(auto sr : k_emd) + for (auto sr : k_emd) { results.emplace(sr); } @@ -136,19 +137,18 @@ std::vector wmd_base::rank(ranker_context& ctx, // worst result auto last = (--results.end())->score; - const size_t magic_constant = std::max(fwd_->docs().size() / 8, - num_results * 8); + const size_t magic_constant + = std::max(fwd_->docs().size() / 8, num_results * 8); std::vector rwmd_docs(magic_constant); auto start = scores.begin(); - std::generate(rwmd_docs.begin(), rwmd_docs.end(), [&](){ - 
return (*start++).d_id; - }); + std::generate(rwmd_docs.begin(), rwmd_docs.end(), + [&]() { return (*start++).d_id; }); // rwmd phase auto rwmd_results = process(rwmd, filter, ctx, rwmd_docs); std::vector pretend_docs; - for(auto sr : rwmd_results) + for (auto sr : rwmd_results) { if (sr.score < last) { @@ -157,14 +157,13 @@ std::vector wmd_base::rank(ranker_context& ctx, } if (!pretend_docs.empty()) - { // emd phase + { // emd phase auto pretend_results = process(emd, filter, ctx, pretend_docs); for (auto sr : pretend_results) { results.emplace(sr); } } - } return results.extract_top(); @@ -196,7 +195,8 @@ std::vector wmd_base::process(em_distance emd, } auto doc2 = create_document(tf_pc); - if(doc1.n_terms == 0 || doc2.n_terms == 0){ + if (doc1.n_terms == 0 || doc2.n_terms == 0) + { continue; } auto score = static_cast(emd.score(doc1, doc2)); From 2065aaa6b8f9fd37dc504ca1c6549d4960ef2064 Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Thu, 30 Nov 2017 22:48:10 +0300 Subject: [PATCH 11/15] [ranker] moved wmd logic to embeddings --- .../{util => embeddings/wmd}/min_cost_flow.h | 7 +- .../wmd}/min_cost_flow.tcc | 21 +- include/meta/embeddings/wmd/wm_distance.h | 82 ++++++ include/meta/index/ranker/emd.h | 254 ------------------ include/meta/index/ranker/wmd_base.h | 23 +- src/classify/classifier/classifier.cpp | 6 +- src/embeddings/CMakeLists.txt | 1 + src/embeddings/wmd/CMakeLists.txt | 8 + src/embeddings/wmd/wm_distance.cpp | 217 +++++++++++++++ src/embeddings/word_embeddings.cpp | 33 +-- src/index/ranker/CMakeLists.txt | 5 +- src/index/ranker/wmd_base.cpp | 41 ++- 12 files changed, 374 insertions(+), 324 deletions(-) rename include/meta/{util => embeddings/wmd}/min_cost_flow.h (95%) rename include/meta/{util => embeddings/wmd}/min_cost_flow.tcc (98%) create mode 100644 include/meta/embeddings/wmd/wm_distance.h delete mode 100644 include/meta/index/ranker/emd.h create mode 100644 src/embeddings/wmd/CMakeLists.txt create mode 100644 src/embeddings/wmd/wm_distance.cpp 
diff --git a/include/meta/util/min_cost_flow.h b/include/meta/embeddings/wmd/min_cost_flow.h similarity index 95% rename from include/meta/util/min_cost_flow.h rename to include/meta/embeddings/wmd/min_cost_flow.h index 1f2a5fecc..842b540df 100644 --- a/include/meta/util/min_cost_flow.h +++ b/include/meta/embeddings/wmd/min_cost_flow.h @@ -1,6 +1,10 @@ /** * @file min_cost_flow.h * @author lolik111 + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. */ #ifndef FAST_EMD_MIN_COST_FLOW_H @@ -9,13 +13,12 @@ #include #include #include -#include #include #include namespace meta { -namespace util +namespace embeddings { template struct edge; diff --git a/include/meta/util/min_cost_flow.tcc b/include/meta/embeddings/wmd/min_cost_flow.tcc similarity index 98% rename from include/meta/util/min_cost_flow.tcc rename to include/meta/embeddings/wmd/min_cost_flow.tcc index a82bc311b..37f9bf512 100644 --- a/include/meta/util/min_cost_flow.tcc +++ b/include/meta/embeddings/wmd/min_cost_flow.tcc @@ -1,22 +1,20 @@ /** * @file min_cost_flow.tcc * @author lolik111 + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. 
*/ -#include "min_cost_flow.h" -#include -#include -#include -#include -#include #include -#include #include -#include + +#include "min_cost_flow.h" namespace meta { -namespace util +namespace embeddings { template struct edge @@ -380,7 +378,7 @@ NumT min_cost_flow::emd_hat(const std::vector& supply, const std::vector& demand, const std::vector>& cost) { - if (std::is_integral::value) + if (std::is_integral::value && std::is_signed::value) { return integral_emd_hat(supply, demand, cost); } @@ -411,6 +409,9 @@ NumT min_cost_flow::emd_hat(const std::vector& supply, } double max_sum = std::max(sum_supply, sum_demand); double supply_demand_norm_factor = mult_factor / max_sum; + if (max_cost < 1e-12){ + return 0.0; + } double cost_norm_factor = mult_factor / max_cost; for (size_t i = 0; i < n; ++i) { diff --git a/include/meta/embeddings/wmd/wm_distance.h b/include/meta/embeddings/wmd/wm_distance.h new file mode 100644 index 000000000..e02a9b473 --- /dev/null +++ b/include/meta/embeddings/wmd/wm_distance.h @@ -0,0 +1,82 @@ +/** + * @file wm_distance.h + * @author lolik111 + * + * All files in META are dual-licensed under the MIT and NCSA licenses. For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. 
+ */ + +#ifndef META_EMD_H +#define META_EMD_H + +#include +#include + +#include "meta/caching/all.h" +#include "meta/embeddings/word_embeddings.h" +#include "meta/math/vector.h" + +namespace meta +{ + +namespace embeddings +{ + +struct emb_document +{ + size_t n_terms; + std::vector ids; + std::vector weights; +}; + +class wm_distance +{ + public: + using metric_type + = std::function&, + const util::array_view&)>; + + wm_distance( + std:: + shared_ptr, + double>> + cache_, + std::shared_ptr embeddings, + metric_type metric, size_t nthreads = 1); + + double score(const std::string algorithm_type, const emb_document& doc1, + const emb_document& doc2); + + double emd(const emb_document& doc1, const emb_document& doc2); + + double emd_relaxed(const emb_document& doc1, const emb_document& doc2); + + double wcd(const emb_document& doc1, const emb_document& doc2); + + static double l2diff_norm(const util::array_view& a, + const util::array_view& b); + + static double cosine(const util::array_view& a, + const util::array_view& b); + + private: + const size_t nthreads_; + std::shared_ptr, + double>> + cache_; + std::shared_ptr embeddings_; + const size_t dimension_; + const metric_type dist; + + std::unordered_map> + methods_; + + double f_c_distance(const emb_document& doc1, const emb_document& doc2, + size_t first, size_t second); +}; +} +} + +#endif // META_EMD_H diff --git a/include/meta/index/ranker/emd.h b/include/meta/index/ranker/emd.h deleted file mode 100644 index 9800e40cd..000000000 --- a/include/meta/index/ranker/emd.h +++ /dev/null @@ -1,254 +0,0 @@ -/** - * @file emd.h - * @author lolik111 - */ - -#ifndef META_EMD_H -#define META_EMD_H - -#include "meta/caching/all.h" -#include "meta/hashing/hash.h" -#include "meta/math/vector.h" -#include "meta/parallel/algorithm.h" -#include "meta/util/min_cost_flow.h" -#include "meta/util/range.h" -#include -#include -#include -#include -#include -#include -#include -#include - -namespace meta -{ - -namespace index -{ - 
-class Document -{ - public: - size_t n_terms; - std::vector ids; - std::vector weights; -}; - -class em_distance -{ - public: - using metric_type - = std::function&, - const util::array_view&)>; - - em_distance( - std:: - shared_ptr, - double>> - cache_, - std::shared_ptr embeddings, - std::string algorithm_type, metric_type metric, size_t nthreads = 1) - : nthreads_(nthreads), - cache_(cache_), - embeddings_(embeddings), - algorithm_type_(algorithm_type), - dimension_(embeddings->vector_size()), - dist(metric) - { - methods_.emplace("rwmd", - [this](const Document& doc1, const Document& doc2) { - auto score1 = this->emd_relaxed(doc1, doc2); - auto score2 = this->emd_relaxed(doc2, doc1); - return std::max(score1, score2); - }); - methods_.emplace("wcd", - [this](const Document& doc1, const Document& doc2) { - return this->wcd(doc1, doc2); - }); - methods_.emplace("emd", - [this](const Document& doc1, const Document& doc2) { - return this->emd(doc1, doc2); - }); - } - - double score(const Document& doc1, const Document& doc2) - { - return methods_[algorithm_type_](doc1, doc2); - } - - double emd(const Document& doc1, const Document& doc2) - { - std::vector supply(doc1.n_terms + doc2.n_terms, 0); - std::vector demand(doc1.n_terms + doc2.n_terms, 0); - - for (size_t i = 0; i < doc1.n_terms; ++i) - { - supply[i] = doc1.weights[i]; - } - - for (size_t i = 0; i < doc2.n_terms; ++i) - { - demand[doc1.n_terms + i] = doc2.weights[i]; - } - - std::vector> cost( - supply.size(), std::vector(supply.size(), 0)); - - for (size_t i = 0; i < doc1.n_terms; ++i) - { - for (size_t j = 0; j < doc2.n_terms; ++j) - { - double dist = f_c_distance(doc1, doc2, i, j); - assert(dist >= 0); - cost[i][j + doc1.n_terms] = dist; - cost[j + doc1.n_terms][i] = dist; - } - } - util::min_cost_flow mcf; - auto score = mcf.emd_hat(supply, demand, cost); - - return score; - } - - double emd_relaxed(const Document& doc1, const Document& doc2) - { - std::vector ids(doc2.n_terms); - for (size_t i = 0; i 
< doc2.n_terms; i++) - { - ids[i] = i; - } - - double acc = 0; - for (size_t i = 0; i < doc1.n_terms; i++) - { - std::vector distance(doc2.n_terms); - for (size_t j = 0; j < doc2.n_terms; ++j) - { - distance[j] = f_c_distance(doc1, doc2, i, j); - } - - if (doc1.weights[i] != 0) - { - std::sort(ids.begin(), ids.end(), - [&](const size_t a, const size_t b) { - bool ans; - ans = distance[a] < distance[b]; - return ans; - }); - - double remaining = doc1.weights[i]; - for (size_t j = 0; j < doc2.n_terms; j++) - { - uint64_t w = ids[j]; - if (remaining < doc2.weights[w]) - { - acc += remaining * distance[w]; - break; - } - else - { - remaining -= doc2.weights[w]; - acc += doc2.weights[w] * distance[w]; - } - } - } - } - return acc; - } - - double wcd(const Document& doc1, const Document& doc2) - { - using namespace meta::math::operators; - - std::vector res1(dimension_, 0); - std::vector res2(dimension_, 0); - - auto start = doc1.ids.begin(); - for (auto w1 : doc1.weights) - { - res1 = res1 + embeddings_->at(*start++) * w1; - } - - start = doc2.ids.begin(); - for (auto w2 : doc2.weights) - { - res2 = res2 + embeddings_->at(*start++) * w2; - } - - return dist(res1, res2); - } - - static double l2diff_norm(const util::array_view& a, - const util::array_view& b) - { - double res = 0.0; - auto it1 = a.begin(); - auto it2 = b.begin(); - if (it1 == it2) - { - return 0; - } - - while (it1 != a.end()) - { - double val = *it1 - *it2; - res += val * val; - it1++; - it2++; - } - - return res; - } - - static double cosine(const util::array_view& a, - const util::array_view& b) - { - if (a.begin() == b.begin()) - return 0; - return (1.0 - std::inner_product(a.begin(), a.end(), b.begin(), 0.0)) - / 2.0; - } - - private: - const size_t nthreads_; - std::shared_ptr, - double>> - cache_; - std::shared_ptr embeddings_; - const std::string algorithm_type_; - const size_t dimension_; - const metric_type dist; - - std::unordered_map> - methods_; - - double f_c_distance(const Document& doc1, 
const Document& doc2, - size_t first, size_t second) - { - std::pair pair; - if (doc1.ids[first] < doc2.ids[second]) - { - pair = {doc1.ids[first], doc2.ids[second]}; - } - else - { - pair = {doc2.ids[second], doc1.ids[first]}; - } - - auto val = cache_->find(pair); - - if (!val) - { - val = dist(embeddings_->at(doc1.ids[first]), - embeddings_->at(doc2.ids[second])); - cache_->insert(pair, val.value()); - } - return val.value(); - } -}; -} -} - -#endif // META_EMD_H diff --git a/include/meta/index/ranker/wmd_base.h b/include/meta/index/ranker/wmd_base.h index de632fe0b..fa45d81f5 100644 --- a/include/meta/index/ranker/wmd_base.h +++ b/include/meta/index/ranker/wmd_base.h @@ -6,8 +6,8 @@ #ifndef META_WMD_BASE_H #define META_WMD_BASE_H +#include "meta/embeddings/wmd/wm_distance.h" #include "meta/embeddings/word_embeddings.h" -#include "meta/index/ranker/emd.h" #include "meta/index/ranker/ranker.h" #include "meta/index/ranker/ranker_factory.h" #include "meta/util/array_view.h" @@ -31,9 +31,11 @@ namespace index * * Optional config parameters: * ~~~toml - * mode # current mode: can be 'emd', 'wcd-emd', or 'rwmd' - * num-threads # number of threads used in the algorithm - * cache-per-thread # size of cache per each thread + * mode # current mode: can be "emd", "wcd", "rwmd", or + * "prefetch-prune" + * distance-func # type of the distance function: "l2diff" or "cosine" + * num-threads # number of threads used in the algorithm + * cache-per-thread # size of cache per each thread * ~~~ */ class wmd_base : public ranker @@ -55,11 +57,6 @@ class wmd_base : public ranker wmd_base(std::istream& in); - std::vector process(em_distance emd, - const filter_function_type& filter, - ranker_context& ctx, - std::vector docs); - void save(std::ostream& out) const override; std::vector @@ -76,8 +73,14 @@ class wmd_base : public ranker cache_; const std::string mode_; const std::string distance_func_; - meta::index::Document + embeddings::emb_document create_document(std::vector> tf); + 
+ std::vector process(embeddings::wm_distance emd, + const std::string mode, + const filter_function_type& filter, + ranker_context& ctx, + std::vector docs); }; /** diff --git a/src/classify/classifier/classifier.cpp b/src/classify/classifier/classifier.cpp index 0024b317f..d6398e0c3 100644 --- a/src/classify/classifier/classifier.cpp +++ b/src/classify/classifier/classifier.cpp @@ -19,10 +19,8 @@ confusion_matrix classifier::test(dataset_view_type docs) const confusion_matrix matrix; for (const auto& instance : docs) - try { - matrix.add(predicted_label{classify(instance.weights)}, - docs.label(instance)); - } catch(std::exception e){} + matrix.add(predicted_label{classify(instance.weights)}, + docs.label(instance)); return matrix; } diff --git a/src/embeddings/CMakeLists.txt b/src/embeddings/CMakeLists.txt index 25441f3be..ac667ffe9 100644 --- a/src/embeddings/CMakeLists.txt +++ b/src/embeddings/CMakeLists.txt @@ -2,6 +2,7 @@ project(meta-embeddings) add_subdirectory(tools) add_subdirectory(analyzers) +add_subdirectory(wmd) add_library(meta-embeddings cooccurrence_counter.cpp word_embeddings.cpp) target_link_libraries(meta-embeddings cpptoml meta-analyzers meta-util meta-io) diff --git a/src/embeddings/wmd/CMakeLists.txt b/src/embeddings/wmd/CMakeLists.txt new file mode 100644 index 000000000..306b35789 --- /dev/null +++ b/src/embeddings/wmd/CMakeLists.txt @@ -0,0 +1,8 @@ +project(meta-embeddings) + +add_library(meta-wmd wm_distance.cpp) +target_link_libraries(meta-wmd meta-embeddings) + +install(TARGETS meta-wmd + EXPORT meta-exports + DESTINATION lib) diff --git a/src/embeddings/wmd/wm_distance.cpp b/src/embeddings/wmd/wm_distance.cpp new file mode 100644 index 000000000..6edbcfb24 --- /dev/null +++ b/src/embeddings/wmd/wm_distance.cpp @@ -0,0 +1,217 @@ +/** + * @file wm_distance.cpp + * @author lolik111 + * + * All files in META are dual-licensed under the MIT and NCSA licenses. 
For more + * details, consult the file LICENSE.mit and LICENSE.ncsa in the root of the + * project. + */ + +//#include +//#include +//#include +//#include + +#include "meta/embeddings/wmd/wm_distance.h" +#include "meta/embeddings/wmd/min_cost_flow.h" +#include "meta/parallel/algorithm.h" + +namespace meta +{ + +namespace embeddings +{ + +wm_distance::wm_distance( + std::shared_ptr, + double>> + cache_, + std::shared_ptr embeddings, metric_type metric, + size_t nthreads /*= 1*/) + : nthreads_(nthreads), + cache_(cache_), + embeddings_(embeddings), + dimension_(embeddings->vector_size()), + dist(metric) +{ + methods_.emplace( + "rwmd", [this](const emb_document& doc1, const emb_document& doc2) { + auto score1 = this->emd_relaxed(doc1, doc2); + auto score2 = this->emd_relaxed(doc2, doc1); + return std::max(score1, score2); + }); + methods_.emplace( + "wcd", [this](const emb_document& doc1, const emb_document& doc2) { + return this->wcd(doc1, doc2); + }); + methods_.emplace( + "emd", [this](const emb_document& doc1, const emb_document& doc2) { + return this->emd(doc1, doc2); + }); +} + +double wm_distance::score(const std::string algorithm_type, + const emb_document& doc1, const emb_document& doc2) +{ + return methods_[algorithm_type](doc1, doc2); +} + +double wm_distance::emd(const emb_document& doc1, const emb_document& doc2) +{ + std::vector supply(doc1.n_terms + doc2.n_terms, 0); + std::vector demand(doc1.n_terms + doc2.n_terms, 0); + + for (size_t i = 0; i < doc1.n_terms; ++i) + { + supply[i] = doc1.weights[i]; + } + + for (size_t i = 0; i < doc2.n_terms; ++i) + { + demand[doc1.n_terms + i] = doc2.weights[i]; + } + + std::vector> cost( + supply.size(), std::vector(supply.size(), 0)); + + for (size_t i = 0; i < doc1.n_terms; ++i) + { + for (size_t j = 0; j < doc2.n_terms; ++j) + { + double dist = f_c_distance(doc1, doc2, i, j); + assert(dist >= 0); + cost[i][j + doc1.n_terms] = dist; + cost[j + doc1.n_terms][i] = dist; + } + } + embeddings::min_cost_flow mcf; + 
auto score = mcf.emd_hat(supply, demand, cost); + + return score; +} + +double wm_distance::emd_relaxed(const emb_document& doc1, + const emb_document& doc2) +{ + std::vector ids(doc2.n_terms); + for (size_t i = 0; i < doc2.n_terms; i++) + { + ids[i] = i; + } + + double acc = 0; + for (size_t i = 0; i < doc1.n_terms; i++) + { + std::vector distance(doc2.n_terms); + for (size_t j = 0; j < doc2.n_terms; ++j) + { + distance[j] = f_c_distance(doc1, doc2, i, j); + } + + if (doc1.weights[i] != 0) + { + std::sort(ids.begin(), ids.end(), + [&](const size_t a, const size_t b) { + bool ans; + ans = distance[a] < distance[b]; + return ans; + }); + + double remaining = doc1.weights[i]; + for (size_t j = 0; j < doc2.n_terms; j++) + { + uint64_t w = ids[j]; + if (remaining < doc2.weights[w]) + { + acc += remaining * distance[w]; + break; + } + else + { + remaining -= doc2.weights[w]; + acc += doc2.weights[w] * distance[w]; + } + } + } + } + return acc; +} + +double wm_distance::wcd(const emb_document& doc1, const emb_document& doc2) +{ + using namespace meta::math::operators; + + std::vector res1(dimension_, 0); + std::vector res2(dimension_, 0); + + auto start = doc1.ids.begin(); + for (auto w1 : doc1.weights) + { + res1 = res1 + embeddings_->at(*start++) * w1; + } + + start = doc2.ids.begin(); + for (auto w2 : doc2.weights) + { + res2 = res2 + embeddings_->at(*start++) * w2; + } + + return dist(res1, res2); +} + +double wm_distance::l2diff_norm(const util::array_view& a, + const util::array_view& b) +{ + double res = 0.0; + auto it1 = a.begin(); + auto it2 = b.begin(); + if (it1 == it2) + { + return 0; + } + + while (it1 != a.end()) + { + double val = *it1 - *it2; + res += val * val; + it1++; + it2++; + } + + return res; +} + +double wm_distance::cosine(const util::array_view& a, + const util::array_view& b) +{ + if (a.begin() == b.begin()) + return 0; + return (1.0 - std::inner_product(a.begin(), a.end(), b.begin(), 0.0)) / 2.0; +} + +double wm_distance::f_c_distance(const 
emb_document& doc1, + const emb_document& doc2, size_t first, + size_t second) +{ + std::pair pair; + if (doc1.ids[first] < doc2.ids[second]) + { + pair = {doc1.ids[first], doc2.ids[second]}; + } + else + { + pair = {doc2.ids[second], doc1.ids[first]}; + } + + auto val = cache_->find(pair); + + if (!val) + { + val = dist(embeddings_->at(doc1.ids[first]), + embeddings_->at(doc2.ids[second])); + cache_->insert(pair, val.value()); + } + return val.value(); +} +} +} diff --git a/src/embeddings/word_embeddings.cpp b/src/embeddings/word_embeddings.cpp index 1d45c7e63..56436aa0a 100644 --- a/src/embeddings/word_embeddings.cpp +++ b/src/embeddings/word_embeddings.cpp @@ -21,13 +21,12 @@ namespace embeddings using vocab_type = hashing::probe_map; - -word_embeddings::word_embeddings(std::istream &vectors, size_t num_lines, +word_embeddings::word_embeddings(std::istream& vectors, size_t num_lines, size_t dimension) : vector_size_{dimension}, id_to_term_(num_lines), term_to_id_{static_cast(std::ceil( - id_to_term_.size() / vocab_type::default_max_load_factor()))}, + id_to_term_.size() / vocab_type::default_max_load_factor()))}, embeddings_(vector_size_ * (id_to_term_.size() + 1)) { printing::progress progress{" > Loading embeddings: ", id_to_term_.size()}; @@ -36,7 +35,7 @@ word_embeddings::word_embeddings(std::istream &vectors, size_t num_lines, { if (!vectors) throw word_embeddings_exception{ - "embeddings stream ended unexpectedly"}; + "embeddings stream ended unexpectedly"}; progress(tid); @@ -44,17 +43,15 @@ word_embeddings::word_embeddings(std::istream &vectors, size_t num_lines, term_to_id_[id_to_term_[tid]] = tid; auto vec = vector(tid); - std::generate(vec.begin(), vec.end(), - [&]() { - double v; - vectors >> v; - return v; }); + std::generate(vec.begin(), vec.end(), [&]() { + double v; + vectors >> v; + return v; + }); auto len = math::operators::l2norm(vec); std::transform(vec.begin(), vec.end(), vec.begin(), [=](double weight) { return weight / len; }); } - - } 
word_embeddings::word_embeddings(std::istream& vocab, std::istream& vectors) @@ -243,23 +240,21 @@ word_embeddings load_embeddings(const cpptoml::table& config) throw word_embeddings_exception{"missing target vectors in: " + *prefix}; auto lines = filesystem::num_lines(*prefix + "/embeddings.target.txt"); - auto dim = config.get_as("dim"); - if(!dim) + auto dim = config.get_as("vector-size"); + if (!dim) { std::string line; std::getline(target, line); std::istringstream iss(line); - std::vector results((std::istream_iterator(iss)), - std::istream_iterator()); + std::vector results( + (std::istream_iterator(iss)), + std::istream_iterator()); dim = results.size() - 1; - + target.seekg(0, target.beg); } - target.seekg(0, target.beg); return {target, lines, *dim}; - } - std::ifstream vocab{*prefix + "/vocab.bin", std::ios::binary}; if (!vocab) throw word_embeddings_exception{"missing vocabulary file in: " diff --git a/src/index/ranker/CMakeLists.txt b/src/index/ranker/CMakeLists.txt index d806d5deb..43a3a5245 100644 --- a/src/index/ranker/CMakeLists.txt +++ b/src/index/ranker/CMakeLists.txt @@ -9,8 +9,9 @@ add_library(meta-ranker absolute_discount.cpp kl_divergence_prf.cpp rocchio.cpp ranker.cpp - ranker_factory.cpp wmd_base.cpp) -target_link_libraries(meta-ranker meta-index meta-embeddings meta-util) + ranker_factory.cpp + wmd_base.cpp) +target_link_libraries(meta-ranker meta-index meta-wmd) install(TARGETS meta-ranker EXPORT meta-exports diff --git a/src/index/ranker/wmd_base.cpp b/src/index/ranker/wmd_base.cpp index 5f46a3c01..75c775bb3 100644 --- a/src/index/ranker/wmd_base.cpp +++ b/src/index/ranker/wmd_base.cpp @@ -3,10 +3,10 @@ * @author lolik111 */ +#include "meta/parallel/parallel_for.h" #include "meta/index/ranker/wmd_base.h" #include "meta/index/forward_index.h" #include "meta/index/postings_data.h" -#include "meta/index/score_data.h" #include "meta/util/fixed_heap.h" namespace meta @@ -75,18 +75,18 @@ std::vector wmd_base::rank(ranker_context& ctx, return 
a.score < b.score; }); - em_distance::metric_type distance; + embeddings::wm_distance::metric_type distance; if (distance_func_ == "cosine") { - distance = em_distance::cosine; + distance = embeddings::wm_distance::cosine; } else if (distance_func_ == "l2diff") { - distance = em_distance::l2diff_norm; + distance = embeddings::wm_distance::l2diff_norm; } else { - distance = em_distance::cosine; + distance = embeddings::wm_distance::cosine; } parallel::thread_pool pool(nthreads_); @@ -94,8 +94,8 @@ std::vector wmd_base::rank(ranker_context& ctx, if (mode_ != "prefetch-prune") { - meta::index::em_distance emd(cache_, embeddings_, mode_, distance); - auto scores = process(emd, filter, ctx, fwd_->docs()); + embeddings ::wm_distance emd(cache_, embeddings_, distance); + auto scores = process(emd, mode_, filter, ctx, fwd_->docs()); for (auto score : scores) { results.emplace(score); @@ -103,13 +103,10 @@ std::vector wmd_base::rank(ranker_context& ctx, } else { - index::em_distance wcd(cache_, embeddings_, "wcd", - em_distance::l2diff_norm); - index::em_distance emd(cache_, embeddings_, "emd", distance); - index::em_distance rwmd(cache_, embeddings_, "rwmd", distance); + embeddings::wm_distance emd(cache_, embeddings_, distance); // wcd phase - auto scores = process(wcd, filter, ctx, fwd_->docs()); + auto scores = process(emd, "wcd", filter, ctx, fwd_->docs()); std::sort(scores.begin(), scores.end(), [&](const search_result a, const search_result b) { bool ans; @@ -117,10 +114,6 @@ std::vector wmd_base::rank(ranker_context& ctx, return ans; }); - auto emd_heap = util::make_fixed_heap( - num_results, [](const search_result& a, const search_result& b) { - return a.score < b.score; - }); std::vector k_docs; for (size_t i = 0; i < num_results; i++) { @@ -128,7 +121,7 @@ std::vector wmd_base::rank(ranker_context& ctx, } scores.erase(scores.begin(), scores.begin() + num_results); // emd after wcd - auto k_emd = process(emd, filter, ctx, k_docs); + auto k_emd = process(emd, 
"emd", filter, ctx, k_docs); for (auto sr : k_emd) { results.emplace(sr); @@ -144,7 +137,7 @@ std::vector wmd_base::rank(ranker_context& ctx, std::generate(rwmd_docs.begin(), rwmd_docs.end(), [&]() { return (*start++).d_id; }); // rwmd phase - auto rwmd_results = process(rwmd, filter, ctx, rwmd_docs); + auto rwmd_results = process(emd, "rwmd", filter, ctx, rwmd_docs); std::vector pretend_docs; @@ -158,7 +151,8 @@ std::vector wmd_base::rank(ranker_context& ctx, if (!pretend_docs.empty()) { // emd phase - auto pretend_results = process(emd, filter, ctx, pretend_docs); + auto pretend_results = process(emd, "emd", filter, ctx, + pretend_docs); for (auto sr : pretend_results) { results.emplace(sr); @@ -169,7 +163,8 @@ std::vector wmd_base::rank(ranker_context& ctx, return results.extract_top(); } -std::vector wmd_base::process(em_distance emd, +std::vector wmd_base::process(embeddings::wm_distance emd, const +std::string mode, const filter_function_type& filter, ranker_context& ctx, std::vector docs) @@ -199,7 +194,7 @@ std::vector wmd_base::process(em_distance emd, { continue; } - auto score = static_cast(emd.score(doc1, doc2)); + auto score = static_cast(emd.score(mode, doc1, doc2)); block_scores.emplace_back(*it, score); } return block_scores; @@ -216,13 +211,13 @@ std::vector wmd_base::process(em_distance emd, return results; } -meta::index::Document +embeddings::emb_document wmd_base::create_document(std::vector> tf) { size_t unique_terms_count = tf.size(); size_t all_terms_count = 0; - meta::index::Document document; + embeddings::emb_document document; document.ids = std::vector(); document.ids.reserve(unique_terms_count); document.weights = std::vector(); From 7882f8549be4e498abe7f7cd1db01e6e31ee13ce Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Fri, 1 Dec 2017 01:16:58 +0300 Subject: [PATCH 12/15] [ranker] fix in em_distance --- src/embeddings/wmd/wm_distance.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git 
a/src/embeddings/wmd/wm_distance.cpp b/src/embeddings/wmd/wm_distance.cpp index 6edbcfb24..e1f7c7e51 100644 --- a/src/embeddings/wmd/wm_distance.cpp +++ b/src/embeddings/wmd/wm_distance.cpp @@ -205,13 +205,14 @@ double wm_distance::f_c_distance(const emb_document& doc1, auto val = cache_->find(pair); - if (!val) - { - val = dist(embeddings_->at(doc1.ids[first]), - embeddings_->at(doc2.ids[second])); - cache_->insert(pair, val.value()); - } - return val.value(); + double def_distance; + + return val.value_or([&](){ + auto dst = dist(embeddings_->at(doc1.ids[first]), + embeddings_->at(doc2.ids[second])); + cache_ ->insert(pair, val.value()); + return dst; + }()); } } } From ec17c57a29b36777a5b61a8c97c00354bf5a02f3 Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Fri, 1 Dec 2017 01:37:29 +0300 Subject: [PATCH 13/15] [ranker] hotfix --- src/embeddings/wmd/wm_distance.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/embeddings/wmd/wm_distance.cpp b/src/embeddings/wmd/wm_distance.cpp index e1f7c7e51..d4f757dcc 100644 --- a/src/embeddings/wmd/wm_distance.cpp +++ b/src/embeddings/wmd/wm_distance.cpp @@ -210,7 +210,7 @@ double wm_distance::f_c_distance(const emb_document& doc1, return val.value_or([&](){ auto dst = dist(embeddings_->at(doc1.ids[first]), embeddings_->at(doc2.ids[second])); - cache_ ->insert(pair, val.value()); + cache_ ->insert(pair, dst); return dst; }()); } From 86262754c2741e9b84779ae048e8f7913484cec1 Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Fri, 1 Dec 2017 02:45:55 +0300 Subject: [PATCH 14/15] [ranker] fixes + comments --- include/meta/embeddings/wmd/wm_distance.h | 62 ++++++++++++++++++++--- include/meta/index/ranker/wmd_base.h | 16 +++++- src/embeddings/wmd/wm_distance.cpp | 25 ++++----- src/index/ranker/wmd_base.cpp | 60 +++++++++++++--------- 4 files changed, 118 insertions(+), 45 deletions(-) diff --git a/include/meta/embeddings/wmd/wm_distance.h b/include/meta/embeddings/wmd/wm_distance.h index e02a9b473..3454a0fb9 100644 
--- a/include/meta/embeddings/wmd/wm_distance.h +++ b/include/meta/embeddings/wmd/wm_distance.h @@ -22,14 +22,19 @@ namespace meta namespace embeddings { - +/** + * Struct representing one document in the wmd processing + */ struct emb_document { size_t n_terms; std::vector ids; std::vector weights; }; - +/** + * Class providing methods to calculate distance between two documents + * in a sense of word-embedding representation + */ class wm_distance { public: @@ -45,18 +50,57 @@ class wm_distance std::shared_ptr embeddings, metric_type metric, size_t nthreads = 1); + /** + * Calculates distance based on type of algorithm + * @param algorithm_type type of the algorithm: "wcd", "rwmd" or "emd" + * @param doc1 + * @param doc2 + * @return distance between two documents + */ double score(const std::string algorithm_type, const emb_document& doc1, const emb_document& doc2); + /** + * Calculates original word mover's distance (based on Matt J. Kusner's + * paper) + * Uses Ofir Pele Fast EMD algorithm + * @param doc1 + * @param doc2 + * @return distance between two documents + */ double emd(const emb_document& doc1, const emb_document& doc2); - + /** + * Calculates relaxed EM distance + * @param doc1 + * @param doc2 + * @return distance between two documents + */ double emd_relaxed(const emb_document& doc1, const emb_document& doc2); - + /** + * Calculates Word Centroid distance + * @param doc1 + * @param doc2 + * @return distance between two documents + */ double wcd(const emb_document& doc1, const emb_document& doc2); + /** + * L2 norm squared of the difference between two word embeddings + * |a - b|2^2 + * @param a + * @param b + * @return distance between two word embeddings + */ static double l2diff_norm(const util::array_view& a, const util::array_view& b); + /** + * Cosine measure between two word embeddings + * Since we want minimum between two similar terms it calculates (1 - cos)/2 + * @param a + * @param b + * @return distance between two word embeddings + */ 
static double cosine(const util::array_view& a, const util::array_view& b); @@ -73,8 +117,14 @@ class wm_distance const emb_document&)>> methods_; - double f_c_distance(const emb_document& doc1, const emb_document& doc2, - size_t first, size_t second); + /** + * Returns distance between two terms using cache + * @param first_word_id first term id + * @param second_word_id second term id + * @return distance between two terms + */ + double f_c_distance(const size_t first_word_id, + const size_t second_word_id); }; } } diff --git a/include/meta/index/ranker/wmd_base.h b/include/meta/index/ranker/wmd_base.h index fa45d81f5..11a9e6711 100644 --- a/include/meta/index/ranker/wmd_base.h +++ b/include/meta/index/ranker/wmd_base.h @@ -73,13 +73,27 @@ class wmd_base : public ranker cache_; const std::string mode_; const std::string distance_func_; + /** + * Creates document, omitting terms not presenting in the embeddings + * @param tf vector of term frequences + * @return Struct representing one document in the wmd processing + */ embeddings::emb_document create_document(std::vector> tf); + /** + * Calculates wmd based on the instance of the emd class and mode paralelly + * @param emd + * @param mode + * @param filter + * @param doc_to_compare + * @param docs documents + * @return vector of search results + */ std::vector process(embeddings::wm_distance emd, const std::string mode, const filter_function_type& filter, - ranker_context& ctx, + embeddings::emb_document doc_to_compare, std::vector docs); }; diff --git a/src/embeddings/wmd/wm_distance.cpp b/src/embeddings/wmd/wm_distance.cpp index d4f757dcc..c41c9684f 100644 --- a/src/embeddings/wmd/wm_distance.cpp +++ b/src/embeddings/wmd/wm_distance.cpp @@ -78,7 +78,7 @@ double wm_distance::emd(const emb_document& doc1, const emb_document& doc2) { for (size_t j = 0; j < doc2.n_terms; ++j) { - double dist = f_c_distance(doc1, doc2, i, j); + double dist = f_c_distance(doc1.ids[i], doc2.ids[j]); assert(dist >= 0); cost[i][j + 
doc1.n_terms] = dist; cost[j + doc1.n_terms][i] = dist; @@ -105,7 +105,7 @@ double wm_distance::emd_relaxed(const emb_document& doc1, std::vector distance(doc2.n_terms); for (size_t j = 0; j < doc2.n_terms; ++j) { - distance[j] = f_c_distance(doc1, doc2, i, j); + distance[j] = f_c_distance(doc1.ids[i], doc2.ids[j]); } if (doc1.weights[i] != 0) @@ -189,28 +189,25 @@ double wm_distance::cosine(const util::array_view& a, return (1.0 - std::inner_product(a.begin(), a.end(), b.begin(), 0.0)) / 2.0; } -double wm_distance::f_c_distance(const emb_document& doc1, - const emb_document& doc2, size_t first, - size_t second) +double wm_distance::f_c_distance(const size_t first_word_id, + const size_t second_word_id) { std::pair pair; - if (doc1.ids[first] < doc2.ids[second]) + if (first_word_id < second_word_id) { - pair = {doc1.ids[first], doc2.ids[second]}; + pair = {first_word_id, second_word_id}; } else { - pair = {doc2.ids[second], doc1.ids[first]}; + pair = {second_word_id, first_word_id}; } auto val = cache_->find(pair); - double def_distance; - - return val.value_or([&](){ - auto dst = dist(embeddings_->at(doc1.ids[first]), - embeddings_->at(doc2.ids[second])); - cache_ ->insert(pair, dst); + return val.value_or([&]() { + auto dst = dist(embeddings_->at(first_word_id), + embeddings_->at(second_word_id)); + cache_->insert(pair, dst); return dst; }()); } diff --git a/src/index/ranker/wmd_base.cpp b/src/index/ranker/wmd_base.cpp index 75c775bb3..13c0aa387 100644 --- a/src/index/ranker/wmd_base.cpp +++ b/src/index/ranker/wmd_base.cpp @@ -3,10 +3,10 @@ * @author lolik111 */ -#include "meta/parallel/parallel_for.h" #include "meta/index/ranker/wmd_base.h" #include "meta/index/forward_index.h" #include "meta/index/postings_data.h" +#include "meta/parallel/parallel_for.h" #include "meta/util/fixed_heap.h" namespace meta @@ -88,14 +88,26 @@ std::vector wmd_base::rank(ranker_context& ctx, { distance = embeddings::wm_distance::cosine; } + std::vector> tf_pc; + 
tf_pc.reserve(ctx.postings.size()); + for (auto one : ctx.postings) + { + tf_pc.push_back({one.t_id, one.query_term_weight}); + } + + auto doc_to_compare = create_document(tf_pc); + if (doc_to_compare.n_terms == 0) + { + return results.extract_top(); // empty + } parallel::thread_pool pool(nthreads_); std::vector docs = fwd_->docs(); if (mode_ != "prefetch-prune") { - embeddings ::wm_distance emd(cache_, embeddings_, distance); - auto scores = process(emd, mode_, filter, ctx, fwd_->docs()); + embeddings::wm_distance emd(cache_, embeddings_, distance); + auto scores = process(emd, mode_, filter, doc_to_compare, fwd_->docs()); for (auto score : scores) { results.emplace(score); @@ -106,7 +118,10 @@ std::vector wmd_base::rank(ranker_context& ctx, embeddings::wm_distance emd(cache_, embeddings_, distance); // wcd phase - auto scores = process(emd, "wcd", filter, ctx, fwd_->docs()); + auto scores = process( + {cache_, embeddings_, embeddings::wm_distance::l2diff_norm}, "wcd", + filter, doc_to_compare, fwd_->docs()); + std::sort(scores.begin(), scores.end(), [&](const search_result a, const search_result b) { bool ans; @@ -121,7 +136,7 @@ std::vector wmd_base::rank(ranker_context& ctx, } scores.erase(scores.begin(), scores.begin() + num_results); // emd after wcd - auto k_emd = process(emd, "emd", filter, ctx, k_docs); + auto k_emd = process(emd, "emd", filter, doc_to_compare, k_docs); for (auto sr : k_emd) { results.emplace(sr); @@ -130,14 +145,17 @@ std::vector wmd_base::rank(ranker_context& ctx, // worst result auto last = (--results.end())->score; + // how much documents compare using with rwmd const size_t magic_constant = std::max(fwd_->docs().size() / 8, num_results * 8); + std::vector rwmd_docs(magic_constant); auto start = scores.begin(); std::generate(rwmd_docs.begin(), rwmd_docs.end(), [&]() { return (*start++).d_id; }); // rwmd phase - auto rwmd_results = process(emd, "rwmd", filter, ctx, rwmd_docs); + auto rwmd_results + = process(emd, "rwmd", filter, 
doc_to_compare, rwmd_docs); std::vector pretend_docs; @@ -151,8 +169,8 @@ std::vector wmd_base::rank(ranker_context& ctx, if (!pretend_docs.empty()) { // emd phase - auto pretend_results = process(emd, "emd", filter, ctx, - pretend_docs); + auto pretend_results + = process(emd, "emd", filter, doc_to_compare, pretend_docs); for (auto sr : pretend_results) { results.emplace(sr); @@ -163,11 +181,11 @@ std::vector wmd_base::rank(ranker_context& ctx, return results.extract_top(); } -std::vector wmd_base::process(embeddings::wm_distance emd, const -std::string mode, - const filter_function_type& filter, - ranker_context& ctx, - std::vector docs) +std::vector +wmd_base::process(embeddings::wm_distance emd, const std::string mode, + const filter_function_type& filter, + embeddings::emb_document doc_to_compare, + std::vector docs) { parallel::thread_pool pool(nthreads_); @@ -179,26 +197,20 @@ std::string mode, { if (!filter(*it)) continue; - auto tf = fwd_->search_primary(*it)->counts(); - auto doc1 = create_document(tf); - std::vector> tf_pc; - tf_pc.reserve(ctx.postings.size()); - for (auto one : ctx.postings) - { - tf_pc.push_back({one.t_id, one.query_term_weight}); - } + auto doc = create_document(fwd_->search_primary(*it)->counts()); - auto doc2 = create_document(tf_pc); - if (doc1.n_terms == 0 || doc2.n_terms == 0) + if (doc.n_terms == 0) { continue; } - auto score = static_cast(emd.score(mode, doc1, doc2)); + auto score + = static_cast(emd.score(mode, doc, doc_to_compare)); block_scores.emplace_back(*it, score); } return block_scores; }); + std::vector results; results.reserve(fwd_->docs().size()); for (auto& vec : scores) From 8fc3d275cdef924d120197f29ad7a81116f907b7 Mon Sep 17 00:00:00 2001 From: Lolik111 Date: Fri, 1 Dec 2017 02:51:24 +0300 Subject: [PATCH 15/15] [ranker] fix --- src/index/ranker/wmd_base.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/index/ranker/wmd_base.cpp b/src/index/ranker/wmd_base.cpp index 13c0aa387..ce06b398f 
100644 --- a/src/index/ranker/wmd_base.cpp +++ b/src/index/ranker/wmd_base.cpp @@ -147,7 +147,8 @@ std::vector wmd_base::rank(ranker_context& ctx, // how much documents compare using with rwmd const size_t magic_constant - = std::max(fwd_->docs().size() / 8, num_results * 8); + = std::max(static_cast(fwd_->docs().size() / 8), + static_cast(num_results * 8)); std::vector rwmd_docs(magic_constant); auto start = scores.begin();