From 4e927798114684ecc2d05f34d676c52d9430e145 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 29 Mar 2024 21:42:56 +0900 Subject: [PATCH 01/21] Fix default value --- R/RcppExports.R | 4 ++-- src/collocations.cpp | 2 +- src/keyness.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 8e5bb08..e008a0d 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,11 +1,11 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -cpp_collocations <- function(texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread = 1L) { +cpp_collocations <- function(texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread = -1L) { .Call(`_quanteda_textstats_cpp_collocations`, texts_, types_, words_ignore_, count_min, sizes_, method, smoothing, thread) } -cpp_keyness <- function(mt, measure, correct, thread = 1L) { +cpp_keyness <- function(mt, measure, correct, thread = -1L) { .Call(`_quanteda_textstats_cpp_keyness`, mt, measure, correct, thread) } diff --git a/src/collocations.cpp b/src/collocations.cpp index db83e5c..e4b934e 100644 --- a/src/collocations.cpp +++ b/src/collocations.cpp @@ -224,7 +224,7 @@ DataFrame cpp_collocations(const List &texts_, const IntegerVector sizes_, const String &method, const double smoothing, - const int thread = 1){ + const int thread = -1){ Texts texts = as(texts_); std::vector sizes = as< std::vector >(sizes_); diff --git a/src/keyness.cpp b/src/keyness.cpp index 144c706..5a493fe 100644 --- a/src/keyness.cpp +++ b/src/keyness.cpp @@ -105,7 +105,7 @@ inline double pmi_lambda( Rcpp::NumericVector cpp_keyness(arma::sp_mat &mt, const std::string measure, const std::string correct, - const int thread = 1) { + const int thread = -1) { if (mt.n_rows != 2) throw std::range_error("Invalid DFM object"); From 99e8cfa615afe0a91f8f6355b873e4dd73aa9e7e Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 29 Mar 2024 22:06:09 +0900 Subject: [PATCH 02/21] Use quanteda:::get_threads() --- R/textstat_collocations.R | 2 +- R/textstat_keyness.R | 2 +- R/utils.R | 8 -------- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/R/textstat_collocations.R b/R/textstat_collocations.R index b63768c..457304a 100644 --- a/R/textstat_collocations.R +++ b/R/textstat_collocations.R @@ -160,7 +160,7 @@ textstat_collocations.tokens <- function(x, method = "lambda", if (is.null(id_ignore)) id_ignore <- integer() result <- cpp_collocations(x, types, id_ignore, min_count, size, if (method == "lambda1") "lambda1" else "lambda", - smoothing, get_threads()) + smoothing, quanteda:::get_threads()) # compute z for lambda methods result$z <- result$lambda / result$sigma diff --git a/R/textstat_keyness.R b/R/textstat_keyness.R index e84f70a..a12bfaa 100644 --- a/R/textstat_keyness.R +++ b/R/textstat_keyness.R @@ -148,7 +148,7 @@ textstat_keyness.dfm <- function(x, target = 1L, measure = c("chi2", "exact", "l warning("correction is always none for pmi") result <- data.frame( feature = featnames(temp), - stat = cpp_keyness(temp, measure, correction, get_threads()), + stat = cpp_keyness(temp, measure, correction, quanteda:::get_threads()), p = NA, n_target = as.vector(temp[1, ]), n_reference = as.vector(temp[2, ]), diff --git a/R/utils.R b/R/utils.R index 4a7ab3e..6306475 100644 --- a/R/utils.R +++ b/R/utils.R @@ -21,11 +21,3 @@ check_dots <- function(..., method = NULL) { warning(arg, " argument is not used.", call. = FALSE) } } - -get_threads <- function() { - value <- getOption("quanteda_threads", -1L) - if (!is.integer(value) || length(value) != 1L) - stop("Invalid value of threads in quanteda options") - return(value) -} - From af241c42778ee72789aab03c9a5ac0d5685429f3 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 29 Mar 2024 22:06:28 +0900 Subject: [PATCH 03/21] Remove RcppParallel --- DESCRIPTION | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 93c0689..8a8c122 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,15 +19,14 @@ License: GPL-3 Depends: R (>= 3.5.0) Imports: - quanteda, + quanteda (>= 4.0), Matrix (>= 1.5-0), methods, nsyllable, proxyC (>= 0.1.4), Rcpp (>= 0.12.12), - RcppParallel, stringi -LinkingTo: Rcpp, RcppParallel, RcppArmadillo (>= 0.7.600.1.0), quanteda +LinkingTo: Rcpp, RcppArmadillo (>= 0.7.600.1.0), quanteda Suggests: entropy, ExPosition, From bc7c8084ea64d0dc00139a3f9bba0d225b5d8ab5 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 29 Mar 2024 22:13:51 +0900 Subject: [PATCH 04/21] Update Makevars --- src/Makevars | 6 ++---- src/Makevars.win | 5 +---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/Makevars b/src/Makevars index 9321074..dca0434 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,4 +1,2 @@ -PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) `$(R_HOME)/bin/Rscript -e "RcppParallel::RcppParallelLibs()"` -#PKG_CXXFLAGS = -DARMA_64BIT_WORD=1 -PKG_CPPFLAGS = -DARMA_DONT_PRINT_OPENMP_WARNING -I../inst/include -#CXX_STD = CXX11 +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) +PKG_CPPFLAGS = -I../inst/include diff --git a/src/Makevars.win b/src/Makevars.win index c073a4e..8153d1d 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,5 +1,2 @@ PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) -PKG_LIBS += `$(R_HOME)/bin${R_ARCH_BIN}/Rscript.exe -e "RcppParallel::RcppParallelLibs()"` -PKG_CXXFLAGS = -DRCPP_PARALLEL_USE_TBB=1 -DARMA_64BIT_WORD=1 -PKG_CPPFLAGS = -I../inst/include -DARMA_DONT_PRINT_OPENMP_WARNING=1 -# CXX_STD = CXX11 +PKG_CPPFLAGS = -I../inst/include From eefdd066f62d9d1c8664dd55348594fdd9ccff19 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 29 Mar 2024 22:17:03 +0900 Subject: [PATCH 05/21] Update Makevars --- src/Makevars | 1 - src/Makevars.win | 1 - 2 files changed, 2 deletions(-) diff --git a/src/Makevars b/src/Makevars index dca0434..22ebc63 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1 @@ PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) -PKG_CPPFLAGS = -I../inst/include diff --git a/src/Makevars.win b/src/Makevars.win index 8153d1d..0485f92 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,2 +1 @@ PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) -PKG_CPPFLAGS = -I../inst/include From 5b193274d566b81ca910a08d2e5e929fd6bff462 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 29 Mar 2024 23:11:41 +0900 Subject: [PATCH 06/21] Add TBB flag --- inst/deftbb.R | 4 ++++ src/Makevars | 1 + src/Makevars.win | 1 + 3 files changed, 6 insertions(+) create mode 100644 inst/deftbb.R diff --git a/inst/deftbb.R b/inst/deftbb.R new file mode 100644 index 0000000..bac627b --- /dev/null +++ b/inst/deftbb.R @@ -0,0 +1,4 @@ +#' Print TBB flag in makevars +if (quanteda:::cpp_tbb_enabled()) { + cat("-DTBB") +} \ No newline at end of file diff --git a/src/Makevars b/src/Makevars index 22ebc63..fe080a3 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1 +1,2 @@ PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) +PKG_CXXFLAGS = `$(R_HOME)/bin${R_ARCH_BIN}/Rscript.exe ../inst/deftbb.R` diff --git a/src/Makevars.win b/src/Makevars.win index 0485f92..61e2def 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1 +1,2 @@ PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) +PKG_CXXFLAGS = -DTBB From 6044bbc140cf81837a37aedf0a7ba41c22dbc28d Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 30 Mar 2024 17:37:26 +0900 Subject: [PATCH 07/21] Remove seqs_all --- src/collocations.cpp | 45 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/collocations.cpp b/src/collocations.cpp index e4b934e..5015a79 100644 --- a/src/collocations.cpp +++ b/src/collocations.cpp @@ -8,13 +8,13 @@ float GLOBAL_PATTERN_MAX_LOAD_FACTOR = 0.05; float GLOBAL_NGRAMS_MAX_LOAD_FACTOR = 0.25; #endif +typedef std::atomic UintAtomic; #if QUANTEDA_USE_TBB -typedef tbb::atomic UintAtomic; // NOTE: changed to std::atomic for TBB 2021 typedef tbb::concurrent_vector> VecPair; typedef tbb::concurrent_unordered_map, hash_ngram, equal_ngram> MapNgramsPair; #else -typedef std::vector> VecPair; -typedef std::unordered_map, hash_ngram, equal_ngram> MapNgramsPair; +typedef std::vector> VecPair; +typedef std::unordered_map, hash_ngram, equal_ngram> MapNgramsPair; #endif // return the matching pattern between two words at each position, 0 for matching, 1 for not matching. @@ -107,7 +107,7 @@ Text mark(Text tokens, } void counts2(Text text, - MapNgramsPair &counts_seq, + MapNgramsPair &counts_all, const std::vector &sizes, const unsigned int &id_ignore){ @@ -140,7 +140,7 @@ void counts2(Text text, Text text_sub(text.begin() + i, text.begin() + i + size); //Rcout << "@" << i << " " << nested << ": "; //dev::print_ngram(text_sub); - auto &count = counts_seq[text_sub]; + auto &count = counts_all[text_sub]; count.first++; if (!padded) { if (nested) { @@ -156,7 +156,7 @@ void counts2(Text text, void estimates2(std::size_t i, VecNgrams &seqs, // seqs without padding - MapNgramsPair counts_seq, + MapNgramsPair counts_all, DoubleParams &dice, DoubleParams &pmi, DoubleParams &logratio, @@ -170,7 +170,7 @@ void estimates2(std::size_t i, if (n == 1) return; // ignore single words // output counts std::vector counts_bit(std::pow(2, n), smoothing); - for (auto it = counts_seq.begin(); it != counts_seq.end(); ++it) { + for (auto it = counts_all.begin(); it != counts_all.end(); ++it) { if (it->first.size() != n) continue; // skip different lengths int bit; bit = match_bit2(seqs[i], it->first); @@ -180,7 +180,7 @@ void estimates2(std::size_t i, void estimates_lambda2(std::size_t i, const VecNgrams &seqs, - const VecPair &seqs_all, + const MapNgramsPair &counts_all, DoubleParams &sgma, DoubleParams &lmda, const String &method, @@ -190,10 +190,11 @@ void estimates_lambda2(std::size_t i, if (n == 1) return; // ignore single words std::vector counts_bit(std::pow(2, n), smoothing); - for (std::size_t j = 0; j < seqs_all.size(); j++) { - if (seqs_all[j].first.size() != n) continue; // skip different lengths - int bit = match_bit2(seqs[i], seqs_all[j].first); - counts_bit[bit] += seqs_all[j].second; + //for (std::size_t j = 0; j < seqs_all.size(); j++) { + for (auto it = counts_all.begin(); it != counts_all.end(); ++it) { + if (it->first.size() != n) continue; // skip different lengths + int bit = match_bit2(seqs[i], it->first); + counts_bit[bit] += it->second.first; } //B-J algorithm @@ -255,8 +256,8 @@ DataFrame cpp_collocations(const List &texts_, } #endif - MapNgramsPair counts_seq; - counts_seq.max_load_factor(GLOBAL_PATTERN_MAX_LOAD_FACTOR); + MapNgramsPair counts_all; + counts_all.max_load_factor(GLOBAL_PATTERN_MAX_LOAD_FACTOR); //dev::Timer timer; //dev::start_timer("Count", timer); @@ -264,29 +265,25 @@ DataFrame cpp_collocations(const List &texts_, arena.execute([&]{ tbb::parallel_for(tbb::blocked_range(0, H), [&](tbb::blocked_range r) { for (int h = r.begin(); h < r.end(); ++h) { - counts2(texts[h], counts_seq, sizes, id_ignore); + counts2(texts[h], counts_all, sizes, id_ignore); } }); }); #else for (std::size_t h = 0; h < H; h++) { - counts2(texts[h], counts_seq, sizes, id_ignore); + counts2(texts[h], counts_all, sizes, id_ignore); } #endif //dev::stop_timer("Count", timer); VecNgrams seqs; - VecPair seqs_all; IntParams counts, counts_nested, lengths; - std::size_t len = counts_seq.size(); + std::size_t len = counts_all.size(); seqs.reserve(len); - seqs_all.reserve(len); counts.reserve(len); counts_nested.reserve(len); lengths.reserve(len); - for (auto it = counts_seq.begin(); it != counts_seq.end(); ++it) { - // conver to a vector for faster itteration - seqs_all.push_back(std::make_pair(it->first, it->second.first)); + for (auto it = counts_all.begin(); it != counts_all.end(); ++it) { if (it->second.first < count_min) continue; // estimate only sequences without padding if (std::none_of(it->first.begin(), it->first.end(), [](unsigned int v){ return v == 0; })) { @@ -307,13 +304,13 @@ DataFrame cpp_collocations(const List &texts_, arena.execute([&]{ tbb::parallel_for(tbb::blocked_range(0, I), [&](tbb::blocked_range r) { for (int i = r.begin(); i < r.end(); ++i) { - estimates_lambda2(i, seqs, seqs_all, sgma, lmda, method, smoothing); + estimates_lambda2(i, seqs, counts_all, sgma, lmda, method, smoothing); } }); }); #else for (std::size_t i = 0; i < I; i++) { - estimates_lambda2(i, seqs, seqs_all, sgma, lmda, method, smoothing); + estimates_lambda2(i, seqs, counts_all, sgma, lmda, method, smoothing); } #endif //dev::stop_timer("Estimate", timer); From a40aea29b2fc0ad7e0e05c5bf5d459d05e0e10d0 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 30 Mar 2024 17:43:54 +0900 Subject: [PATCH 08/21] Add libttbb.R --- inst/libtbb.R | 4 ++++ src/Makevars | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 inst/libtbb.R diff --git a/inst/libtbb.R b/inst/libtbb.R new file mode 100644 index 0000000..73fe8b7 --- /dev/null +++ b/inst/libtbb.R @@ -0,0 +1,4 @@ +#' Print TBB flag in makevars +if (quanteda:::cpp_tbb_enabled()) { + cat("-ltbb") +} diff --git a/src/Makevars b/src/Makevars index fe080a3..19a3ed7 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,2 @@ -PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) -PKG_CXXFLAGS = `$(R_HOME)/bin${R_ARCH_BIN}/Rscript.exe ../inst/deftbb.R` +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) `$(R_HOME)/bin${R_ARCH_BIN}/Rscript ../inst/libtbb.R` +PKG_CXXFLAGS = `$(R_HOME)/bin${R_ARCH_BIN}/Rscript ../inst/deftbb.R` From 1cf8a8ded7649e0a7aafd93b762d0accadab3c7a Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 30 Mar 2024 18:01:58 +0900 Subject: [PATCH 09/21] Update libtbb.R for Windows --- inst/libtbb.R | 12 ++++++++++-- src/Makevars.win | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/inst/libtbb.R b/inst/libtbb.R index 73fe8b7..b4dec7b 100644 --- a/inst/libtbb.R +++ b/inst/libtbb.R @@ -1,4 +1,12 @@ #' Print TBB flag in makevars -if (quanteda:::cpp_tbb_enabled()) { - cat("-ltbb") +if (Sys.info()[["sysname"]] == "Windows") { + if (getRversion() >= "4.3.0") { + cat("-ltbb12") + } else { + cat("-ltbb_static") + } +} else { + if (quanteda:::cpp_tbb_enabled()) { + cat("-ltbb") + } } diff --git a/src/Makevars.win b/src/Makevars.win index 61e2def..b08a5b6 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,2 +1,2 @@ -PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) +PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) `$(R_HOME)/bin${R_ARCH_BIN}/Rscript.exe ../inst/libtbb.R` -fstack-protector PKG_CXXFLAGS = -DTBB From 659fbde51b2f939d27107b9c8f14eb1da1a1a7e3 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 30 Mar 2024 18:02:41 +0900 Subject: [PATCH 10/21] Remove quanteda version requirement for testing --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8a8c122..b0dd61a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ License: GPL-3 Depends: R (>= 3.5.0) Imports: - quanteda (>= 4.0), + quanteda Matrix (>= 1.5-0), methods, nsyllable, From d15537baf5d2743aa81a412e8d6e441fc104d801 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 30 Mar 2024 19:28:08 +0900 Subject: [PATCH 11/21] Fix --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b0dd61a..23c76bc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ License: GPL-3 Depends: R (>= 3.5.0) Imports: - quanteda + quanteda, Matrix (>= 1.5-0), methods, nsyllable, From 4fc83807ed1a930da3c6fdcb4dd8f6aa0d027cb6 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 30 Mar 2024 20:05:53 +0900 Subject: [PATCH 12/21] Restore conversion to vector --- src/collocations.cpp | 78 +++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 48 deletions(-) diff --git a/src/collocations.cpp b/src/collocations.cpp index 5015a79..d03e7e7 100644 --- a/src/collocations.cpp +++ b/src/collocations.cpp @@ -10,12 +10,11 @@ float GLOBAL_NGRAMS_MAX_LOAD_FACTOR = 0.25; typedef std::atomic UintAtomic; #if QUANTEDA_USE_TBB -typedef tbb::concurrent_vector> VecPair; typedef tbb::concurrent_unordered_map, hash_ngram, equal_ngram> MapNgramsPair; #else -typedef std::vector> VecPair; typedef std::unordered_map, hash_ngram, equal_ngram> MapNgramsPair; #endif +typedef std::vector> VecNgramsPair; // return the matching pattern between two words at each position, 0 for matching, 1 for not matching. // for example, for 3-gram, bit = 000, 001, 010 ... 111 eg. 0-7 @@ -107,7 +106,7 @@ Text mark(Text tokens, } void counts2(Text text, - MapNgramsPair &counts_all, + MapNgramsPair &map_seqs, const std::vector &sizes, const unsigned int &id_ignore){ @@ -140,7 +139,7 @@ void counts2(Text text, Text text_sub(text.begin() + i, text.begin() + i + size); //Rcout << "@" << i << " " << nested << ": "; //dev::print_ngram(text_sub); - auto &count = counts_all[text_sub]; + auto &count = map_seqs[text_sub]; count.first++; if (!padded) { if (nested) { @@ -154,33 +153,9 @@ void counts2(Text text, } } -void estimates2(std::size_t i, - VecNgrams &seqs, // seqs without padding - MapNgramsPair counts_all, - DoubleParams &dice, - DoubleParams &pmi, - DoubleParams &logratio, - DoubleParams &chi2, - DoubleParams &gensim, - DoubleParams &lfmd, - const String &method, - const double smoothing) { - - std::size_t n = seqs[i].size(); //n=2:5, seqs - if (n == 1) return; // ignore single words - // output counts - std::vector counts_bit(std::pow(2, n), smoothing); - for (auto it = counts_all.begin(); it != counts_all.end(); ++it) { - if (it->first.size() != n) continue; // skip different lengths - int bit; - bit = match_bit2(seqs[i], it->first); - counts_bit[bit] += it->second.first; - } -} - void estimates_lambda2(std::size_t i, const VecNgrams &seqs, - const MapNgramsPair &counts_all, + const VecNgramsPair &seqs_counts, DoubleParams &sgma, DoubleParams &lmda, const String &method, @@ -190,11 +165,10 @@ void estimates_lambda2(std::size_t i, if (n == 1) return; // ignore single words std::vector counts_bit(std::pow(2, n), smoothing); - //for (std::size_t j = 0; j < seqs_all.size(); j++) { - for (auto it = counts_all.begin(); it != counts_all.end(); ++it) { - if (it->first.size() != n) continue; // skip different lengths - int bit = match_bit2(seqs[i], it->first); - counts_bit[bit] += it->second.first; + for (std::size_t j = 0; j < seqs_counts.size(); j++) { + if (seqs_counts[j].first.size() != n) continue; // skip different lengths + int bit = match_bit2(seqs[i], seqs_counts[j].first); + counts_bit[bit] += seqs_counts[j].second; } //B-J algorithm @@ -256,8 +230,8 @@ DataFrame cpp_collocations(const List &texts_, } #endif - MapNgramsPair counts_all; - counts_all.max_load_factor(GLOBAL_PATTERN_MAX_LOAD_FACTOR); + MapNgramsPair map_seqs; + map_seqs.max_load_factor(GLOBAL_PATTERN_MAX_LOAD_FACTOR); //dev::Timer timer; //dev::start_timer("Count", timer); @@ -265,26 +239,34 @@ DataFrame cpp_collocations(const List &texts_, arena.execute([&]{ tbb::parallel_for(tbb::blocked_range(0, H), [&](tbb::blocked_range r) { for (int h = r.begin(); h < r.end(); ++h) { - counts2(texts[h], counts_all, sizes, id_ignore); + counts2(texts[h], map_seqs, sizes, id_ignore); } }); }); #else for (std::size_t h = 0; h < H; h++) { - counts2(texts[h], counts_all, sizes, id_ignore); + counts2(texts[h], map_seqs, sizes, id_ignore); } #endif - //dev::stop_timer("Count", timer); - VecNgrams seqs; + std::size_t N = map_seqs.size(); + + // for estimation + VecNgramsPair seqs_count; // all the collocation + seqs_count.reserve(N); + VecNgrams seqs; // only eligible collocation + seqs.reserve(N); + + // for output IntParams counts, counts_nested, lengths; - std::size_t len = counts_all.size(); - seqs.reserve(len); - counts.reserve(len); - counts_nested.reserve(len); - lengths.reserve(len); - for (auto it = counts_all.begin(); it != counts_all.end(); ++it) { + counts.reserve(N); + counts_nested.reserve(N); + lengths.reserve(N); + + for (auto it = map_seqs.begin(); it != map_seqs.end(); ++it) { if (it->second.first < count_min) continue; + // convert to a vector for faster iteration + seqs_count.push_back(std::make_pair(it->first, (unsigned int)it->second.first)); // estimate only sequences without padding if (std::none_of(it->first.begin(), it->first.end(), [](unsigned int v){ return v == 0; })) { seqs.push_back(it->first); @@ -304,13 +286,13 @@ DataFrame cpp_collocations(const List &texts_, arena.execute([&]{ tbb::parallel_for(tbb::blocked_range(0, I), [&](tbb::blocked_range r) { for (int i = r.begin(); i < r.end(); ++i) { - estimates_lambda2(i, seqs, counts_all, sgma, lmda, method, smoothing); + estimates_lambda2(i, seqs, seqs_count, sgma, lmda, method, smoothing); } }); }); #else for (std::size_t i = 0; i < I; i++) { - estimates_lambda2(i, seqs, counts_all, sgma, lmda, method, smoothing); + estimates_lambda2(i, seqs, seqs_count, sgma, lmda, method, smoothing); } #endif //dev::stop_timer("Estimate", timer); From 3b03d6d026cbdbca1ed943ecc4ab16dac517fc6e Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sat, 30 Mar 2024 22:22:59 +0900 Subject: [PATCH 13/21] Fix --- src/collocations.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/collocations.cpp b/src/collocations.cpp index d03e7e7..dbc0966 100644 --- a/src/collocations.cpp +++ b/src/collocations.cpp @@ -264,9 +264,9 @@ DataFrame cpp_collocations(const List &texts_, lengths.reserve(N); for (auto it = map_seqs.begin(); it != map_seqs.end(); ++it) { - if (it->second.first < count_min) continue; // convert to a vector for faster iteration seqs_count.push_back(std::make_pair(it->first, (unsigned int)it->second.first)); + if (it->second.first < count_min) continue; // estimate only sequences without padding if (std::none_of(it->first.begin(), it->first.end(), [](unsigned int v){ return v == 0; })) { seqs.push_back(it->first); From 07b81f09fdbda4b9678351667c0b056ceb586fb9 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 31 Mar 2024 08:07:15 +0900 Subject: [PATCH 14/21] Fix NAMESPACE --- DESCRIPTION | 2 +- NAMESPACE | 1 - R/textstat_simil.R | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 23c76bc..7fab51d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,5 +42,5 @@ Encoding: UTF-8 BugReports: https://github.com/quanteda/quanteda.textstats/issues LazyData: TRUE Language: en-GB -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index 88189f8..74b7216 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,7 +53,6 @@ exportMethods(show) import(Matrix) import(methods) importFrom(Rcpp,evalCpp) -importFrom(RcppParallel,RcppParallelLibs) importFrom(nsyllable,nsyllable) importFrom(quanteda,as.corpus) importFrom(quanteda,as.dfm) diff --git a/R/textstat_simil.R b/R/textstat_simil.R index abf382f..4e21f2e 100644 --- a/R/textstat_simil.R +++ b/R/textstat_simil.R @@ -355,7 +355,6 @@ textstat_simil.dfm <- function(x, y = NULL, selection = NULL, #' @details `textstat_dist` options are: `"euclidean"` (default), #' `"manhattan"`, `"maximum"`, `"canberra"`, #' and `"minkowski"`. -#' @importFrom RcppParallel RcppParallelLibs #' @examples #' #' # distances for documents From 78fb5cf2eb3edb430add29f4615b9b087dfcc6e3 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 31 Mar 2024 10:22:35 +0900 Subject: [PATCH 15/21] Update github action to install quanteda v4.0 --- .github/workflows/R-CMD-check.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index a129c75..6214fac 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,6 +43,14 @@ jobs: with: extra-packages: any::rcmdcheck needs: check + + - name: Install remotes + run: | + Rscript -e "install.packages('remotes', repos='https://ftp.belnet.be/mirror/CRAN')" + + - name: Install quanteda from Github + run: | + Rscript -e "remotes::install_github('quanteda/quanteda')" - uses: r-lib/actions/check-r-package@v2 with: From 9c44c560207317eda9a148324b228c4c9bea0989 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 31 Mar 2024 10:30:34 +0900 Subject: [PATCH 16/21] Install TBB in Github action --- .github/workflows/R-CMD-check.yaml | 10 ++++++++++ DESCRIPTION | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 6214fac..021718e 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -29,6 +29,16 @@ jobs: R_KEEP_PKG_SOURCE: yes steps: + + - if: matrix.config.os == 'ubuntu-latest' + run: | + sudo apt update + sudo apt install libtbb-dev + - if: matrix.config.os == 'macos-latest' + run: | + brew update + brew install tbb + - uses: actions/checkout@v3 - uses: r-lib/actions/setup-pandoc@v2 diff --git a/DESCRIPTION b/DESCRIPTION index 7fab51d..8b03d4e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ License: GPL-3 Depends: R (>= 3.5.0) Imports: - quanteda, + quanteda (>= 4.0.0), Matrix (>= 1.5-0), methods, nsyllable, From 8fa8eba9711702f3eec1f14f15c47d555a9fff9d Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 31 Mar 2024 12:12:42 +0900 Subject: [PATCH 17/21] Convert arma::mat to std::vector --- src/keyness.cpp | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/src/keyness.cpp b/src/keyness.cpp index 5a493fe..31a77c3 100644 --- a/src/keyness.cpp +++ b/src/keyness.cpp @@ -5,6 +5,14 @@ using namespace quanteda; static const double epsilon = 0.000000001; // the same value as R code +inline std::vector to_vector(const arma::sp_mat& mt) { + return arma::conv_to< std::vector >::from(arma::mat(mt)); +} + +inline std::vector to_vector(const arma::rowvec& v) { + return arma::conv_to< std::vector >::from(v); +} + inline double yates_correction( const double &a, const double &b, @@ -39,11 +47,11 @@ inline double williams_correction( inline double chisq_lambda( const double &a, const double &b, - const arma::colvec &mrg, + const std::vector &mrg, const std::string &cor ) { - double tN = mrg(0); - double rN = mrg(1); + double tN = mrg[0]; + double rN = mrg[1]; double c = tN - a, d = rN - b, N = a + b + c + d, E = (a + b) * (a + c) / N; double delta = (cor == "default" || cor == "yates") ? yates_correction(a, b, c, d) : 0.0; double q = (cor == "williams") ? williams_correction(a, b, c, d) : 1.0; @@ -55,11 +63,11 @@ inline double chisq_lambda( inline double lr_lambda( const double &a, const double &b, - const arma::colvec &mrg, + const std::vector &mrg, const std::string &cor ) { - double tN = mrg(0); - double rN = mrg(1); + double tN = mrg[0]; + double rN = mrg[1]; double c = tN - a, d = rN - b, N = a + b + c + d, E = (a + b) * (a + c) / N; double aa = a, bb = b, cc = c , dd = d; @@ -87,12 +95,12 @@ inline double lr_lambda( inline double pmi_lambda( const double &a, const double &b, - const arma::colvec &mrg, + const std::vector &mrg, const bool normal = false ) { - const double tN = mrg(0); - const double rN = mrg(1); + const double tN = mrg[0]; + const double rN = mrg[1]; double c = tN - a, d = rN - b, N = a + b + c + d, E = (a + b) * (a + c) / N; double res = std::log(a / E + epsilon); if (normal) @@ -110,25 +118,27 @@ Rcpp::NumericVector cpp_keyness(arma::sp_mat &mt, if (mt.n_rows != 2) throw std::range_error("Invalid DFM object"); - arma::colvec margin(arma::sum(mt, 1)); - DoubleParams stats(mt.n_cols); - std::size_t I = mt.n_cols; + std::vector margin = to_vector(arma::sum(mt, 1)); + std::vector row0 = to_vector(mt.row(0)); + std::vector row1 = to_vector(mt.row(1)); + DoubleParams stats(mt.n_cols); + #if QUANTEDA_USE_TBB tbb::task_arena arena(thread); arena.execute([&]{ tbb::parallel_for(tbb::blocked_range(0, I), [&](tbb::blocked_range r) { if (measure == "chi2") { for (int i = r.begin(); i < r.end(); ++i) { - stats[i] = chisq_lambda(mt(0, i), mt(1, i), margin, correct); + stats[i] = chisq_lambda(row0[i], row1[i], margin, correct); } } else if (measure == "lr") { for (int i = r.begin(); i < r.end(); ++i) { - stats[i] = lr_lambda(mt(0, i), mt(1, i), margin, correct); + stats[i] = lr_lambda(row0[i], row1[i], margin, correct); } } else if (measure == "pmi") { for (int i = r.begin(); i < r.end(); ++i) { - stats[i] = pmi_lambda(mt(0, i), mt(1, i), margin, false); + stats[i] = pmi_lambda(row0[i], row1[i], margin, false); } } }); @@ -136,15 +146,15 @@ Rcpp::NumericVector cpp_keyness(arma::sp_mat &mt, #else if (measure == "chi2") { for (std::size_t i = 0; i < I; i++) { - stats[i] = chisq_lambda(mt(0, i), mt(1, i), margin, correct); + stats[i] = chisq_lambda(row0[i], row1[i], margin, correct); } } else if (measure == "lr") { for (std::size_t i = 0; i < I; i++) { - stats[i] = lr_lambda(mt(0, i), mt(1, i), margin, correct); + stats[i] = lr_lambda(row0[i], row1[i], margin, correct); } } else if (measure == "pmi") { for (std::size_t i = 0; i < I; i++) { - stats[i] = pmi_lambda(mt(0, i), mt(1, i), margin, false); + stats[i] = pmi_lambda(row0[i], row1[i], margin, false); } } #endif From 053a6852f22f585d2f8c43c94b08f04d3d443685 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 31 Mar 2024 12:12:56 +0900 Subject: [PATCH 18/21] Restore -DARMA_64BIT_WORD=1 --- src/Makevars | 2 +- src/Makevars.win | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makevars b/src/Makevars index 19a3ed7..43327ee 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,2 @@ PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) `$(R_HOME)/bin${R_ARCH_BIN}/Rscript ../inst/libtbb.R` -PKG_CXXFLAGS = `$(R_HOME)/bin${R_ARCH_BIN}/Rscript ../inst/deftbb.R` +PKG_CXXFLAGS = -DARMA_64BIT_WORD=1 `$(R_HOME)/bin${R_ARCH_BIN}/Rscript ../inst/deftbb.R` diff --git a/src/Makevars.win b/src/Makevars.win index b08a5b6..f6ba5eb 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,2 +1,2 @@ PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) `$(R_HOME)/bin${R_ARCH_BIN}/Rscript.exe ../inst/libtbb.R` -fstack-protector -PKG_CXXFLAGS = -DTBB +PKG_CXXFLAGS = -DARMA_64BIT_WORD=1 -DTBB From 8355aff3d09ec5d0a48bd1576828418e9b0a2e3f Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Sun, 31 Mar 2024 12:22:27 +0900 Subject: [PATCH 19/21] Remove version specificaiton --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8b03d4e..7fab51d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ License: GPL-3 Depends: R (>= 3.5.0) Imports: - quanteda (>= 4.0.0), + quanteda, Matrix (>= 1.5-0), methods, nsyllable, From 096a3d607a10becff37a4745557234426c173295 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 5 Apr 2024 06:44:16 +0900 Subject: [PATCH 20/21] Require quanteda v4.0 --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7fab51d..2ee1a66 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ License: GPL-3 Depends: R (>= 3.5.0) Imports: - quanteda, + quanteda (>= 4.0-0), Matrix (>= 1.5-0), methods, nsyllable, From 633fc4a1c0ee265498e9ba363775b35b8c81bdfc Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Fri, 5 Apr 2024 06:48:12 +0900 Subject: [PATCH 21/21] Fix --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2ee1a66..8b03d4e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,7 @@ License: GPL-3 Depends: R (>= 3.5.0) Imports: - quanteda (>= 4.0-0), + quanteda (>= 4.0.0), Matrix (>= 1.5-0), methods, nsyllable,