From e16e5fb69cca36e60a5633aa7f115911e4b00b59 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Mon, 4 Jan 2021 22:55:15 +0100 Subject: [PATCH 01/22] chore: improved docstrings --- chaine/crf.pyx | 40 --------------- chaine/training.py | 124 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 121 insertions(+), 43 deletions(-) diff --git a/chaine/crf.pyx b/chaine/crf.pyx index f066201..46c9565 100644 --- a/chaine/crf.pyx +++ b/chaine/crf.pyx @@ -35,40 +35,29 @@ cdef class Trainer: ------------------------------ min_freq : float, optional (default=0) Threshold value for minimum frequency of a feature occurring in training data - all_possible_states : bool, optional (default=False) Generate state features that do not even occur in the training data - all_possible_transitions : bool, optional (default=False) Generate transition features that do not even occur in the training data - max_iterations : int, optional (default=None) Maximum number of iterations (unlimited by default) - num_memories : int, optional (default=6) Number of limited memories for approximating the inverse hessian matrix - c1 : float, optional (default=0) Coefficient for L1 regularization - c2 : float, optional (default=1.0) Coefficient for L2 regularization - epsilon : float, optional (default=1e-5) Parameter that determines the condition of convergence - period : int, optional (default=10) Threshold value for iterations to test the stopping criterion - delta : float, optional (default=1e-5) Top iteration when log likelihood is not greater than this - linesearch : str, optional (default="MoreThuente") Line search algorithm used in updates: * MoreThuente: More and Thuente's method * Backtracking: Backtracking method with regular Wolfe condition * StrongBacktracking: Backtracking method with strong Wolfe condition - max_linesearch : int, optional (default=20) Maximum number of trials for the line search algorithm @@ -76,37 +65,26 @@ cdef class Trainer: ---------------------- min_freq : 
float, optional (default=0) Threshold value for minimum frequency of a feature occurring in training data - all_possible_states : bool, optional (default=False) Generate state features that do not even occur in the training data - all_possible_transitions : bool, optional (default=False) Generate transition features that do not even occur in the training data - max_iterations : int, optional (default=None) Maximum number of iterations (1000 by default) - c2 : float, optional (default=1.0) Coefficient for L2 regularization - period : int, optional (default=10) Threshold value for iterations to test the stopping criterion - delta : float, optional (default=1e-5) Top iteration when log likelihood is not greater than this - calibration_eta : float, optional (default=0.1) Initial value of learning rate (eta) used for calibration - calibration_rate : float, optional (default=2.0) Rate of increase/decrease of learning rate for calibration - calibration_samples : int, optional (default=1000) Number of instances used for calibration - calibration_candidates : int, optional (default=10) Number of candidates of learning rate - calibration_max_trials : int, optional (default=20) Maximum number of trials of learning rates for calibration @@ -114,16 +92,12 @@ cdef class Trainer: ------------------------------ min_freq : float, optional (default=0) Threshold value for minimum frequency of a feature occurring in training data - all_possible_states : bool, optional (default=False) Generate state features that do not even occur in the training data - all_possible_transitions : bool, optional (default=False) Generate transition features that do not even occur in the training data - max_iterations : int, optional (default=None) Maximum number of iterations (100 by default) - epsilon : float, optional (default=1e-5) Parameter that determines the condition of convergence @@ -131,31 +105,23 @@ cdef class Trainer: ----------------------------- min_freq : float, optional (default=0) 
Threshold value for minimum frequency of a feature occurring in training data - all_possible_states : bool, optional (default=False) Generate state features that do not even occur in the training data - all_possible_transitions : bool, optional (default=False) Generate transition features that do not even occur in the training data - max_iterations : int, optional (default=None) Maximum number of iterations (100 by default) - epsilon : float, optional (default=1e-5) Parameter that determines the condition of convergence - pa_type : int, optional (default=1) Strategy for updating feature weights: * 0: PA without slack variables * 1: PA type I * 2: PA type II - c : float, optional (default=1) Aggressiveness parameter (used only for PA-I and PA-II) - error_sensitive : bool, optional (default=True) Include square root of predicted incorrect labels into optimization routine - averaging : bool, optional (default=True) Compute average of feature weights at all updates @@ -163,22 +129,16 @@ cdef class Trainer: ---------------------------------------------------- min_freq : float, optional (default=0) Threshold value for minimum frequency of a feature occurring in training data - all_possible_states : bool, optional (default=False) Generate state features that do not even occur in the training data - all_possible_transitions : bool, optional (default=False) Generate transition features that do not even occur in the training data - max_iterations : int, optional (default=None) Maximum number of iterations (100 by default) - epsilon : float, optional (default=1e-5) Parameter that determines the condition of convergence - variance : float, optional (default=1) Initial variance of every feature weight - gamma : float, optional (default=1) Trade-off between loss function and changes of feature weights """ diff --git a/chaine/training.py b/chaine/training.py index 16a2298..f95290a 100644 --- a/chaine/training.py +++ b/chaine/training.py @@ -18,13 +18,131 @@ def train(dataset: 
Dataset, labels: Labels, **kwargs) -> Model: Dataset consisting of sequences of features labels : Labels Labels corresponding to each instance in the dataset + algorithm : str + Following algorithms are available: + * lbfgs: Limited-memory BFGS with L1/L2 regularization + * l2sgd: Stochastic gradient descent with L2 regularization + * ap: Averaged perceptron + * pa: Passive aggressive + * arow: Adaptive regularization of weights + + Limited-memory BFGS Parameters + ------------------------------ + min_freq : float, optional (default=0) + Threshold value for minimum frequency of a feature occurring in training data + all_possible_states : bool, optional (default=False) + Generate state features that do not even occur in the training data + all_possible_transitions : bool, optional (default=False) + Generate transition features that do not even occur in the training data + max_iterations : int, optional (default=None) + Maximum number of iterations (unlimited by default) + num_memories : int, optional (default=6) + Number of limited memories for approximating the inverse hessian matrix + c1 : float, optional (default=0) + Coefficient for L1 regularization + c2 : float, optional (default=1.0) + Coefficient for L2 regularization + epsilon : float, optional (default=1e-5) + Parameter that determines the condition of convergence + period : int, optional (default=10) + Threshold value for iterations to test the stopping criterion + delta : float, optional (default=1e-5) + Top iteration when log likelihood is not greater than this + linesearch : str, optional (default="MoreThuente") + Line search algorithm used in updates: + * MoreThuente: More and Thuente's method + * Backtracking: Backtracking method with regular Wolfe condition + * StrongBacktracking: Backtracking method with strong Wolfe condition + max_linesearch : int, optional (default=20) + Maximum number of trials for the line search algorithm + + SGD with L2 Parameters + ---------------------- + min_freq : float, 
optional (default=0) + Threshold value for minimum frequency of a feature occurring in training data + all_possible_states : bool, optional (default=False) + Generate state features that do not even occur in the training data + all_possible_transitions : bool, optional (default=False) + Generate transition features that do not even occur in the training data + max_iterations : int, optional (default=None) + Maximum number of iterations (1000 by default) + c2 : float, optional (default=1.0) + Coefficient for L2 regularization + period : int, optional (default=10) + Threshold value for iterations to test the stopping criterion + delta : float, optional (default=1e-5) + Top iteration when log likelihood is not greater than this + calibration_eta : float, optional (default=0.1) + Initial value of learning rate (eta) used for calibration + calibration_rate : float, optional (default=2.0) + Rate of increase/decrease of learning rate for calibration + calibration_samples : int, optional (default=1000) + Number of instances used for calibration + calibration_candidates : int, optional (default=10) + Number of candidates of learning rate + calibration_max_trials : int, optional (default=20) + Maximum number of trials of learning rates for calibration + + Averaged Perceptron Parameters + ------------------------------ + min_freq : float, optional (default=0) + Threshold value for minimum frequency of a feature occurring in training data + all_possible_states : bool, optional (default=False) + Generate state features that do not even occur in the training data + all_possible_transitions : bool, optional (default=False) + Generate transition features that do not even occur in the training data + max_iterations : int, optional (default=None) + Maximum number of iterations (100 by default) + epsilon : float, optional (default=1e-5) + Parameter that determines the condition of convergence + + Passive Aggressive Parameters + ----------------------------- + min_freq : float, 
optional (default=0) + Threshold value for minimum frequency of a feature occurring in training data + all_possible_states : bool, optional (default=False) + Generate state features that do not even occur in the training data + all_possible_transitions : bool, optional (default=False) + Generate transition features that do not even occur in the training data + max_iterations : int, optional (default=None) + Maximum number of iterations (100 by default) + epsilon : float, optional (default=1e-5) + Parameter that determines the condition of convergence + pa_type : int, optional (default=1) + Strategy for updating feature weights: + * 0: PA without slack variables + * 1: PA type I + * 2: PA type II + c : float, optional (default=1) + Aggressiveness parameter (used only for PA-I and PA-II) + error_sensitive : bool, optional (default=True) + Include square root of predicted incorrect labels into optimization routine + averaging : bool, optional (default=True) + Compute average of feature weights at all updates + + Adaptive Regularization of Weights (AROW) Parameters + ---------------------------------------------------- + min_freq : float, optional (default=0) + Threshold value for minimum frequency of a feature occurring in training data + all_possible_states : bool, optional (default=False) + Generate state features that do not even occur in the training data + all_possible_transitions : bool, optional (default=False) + Generate transition features that do not even occur in the training data + max_iterations : int, optional (default=None) + Maximum number of iterations (100 by default) + epsilon : float, optional (default=1e-5) + Parameter that determines the condition of convergence + variance : float, optional (default=1) + Initial variance of every feature weight + gamma : float, optional (default=1) + Trade-off between loss function and changes of feature weights Returns ------- - CRF - A conditional random field fitted on the dataset + Model + A conditional 
random field trained on the dataset """ - # start training + # initialize trainer and start training trainer = Trainer(**kwargs) trainer.train(dataset, labels, "model.crf") From 9f433078b93f26006c622ff1c2c8e2cc50c6f438 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Mon, 4 Jan 2021 22:56:49 +0100 Subject: [PATCH 02/22] fix: typos --- chaine/training.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chaine/training.py b/chaine/training.py index f95290a..c6295e5 100644 --- a/chaine/training.py +++ b/chaine/training.py @@ -1,8 +1,8 @@ """ -chaine.core -~~~~~~~~~~~ +chaine.training +~~~~~~~~~~~~~~~ -This module implements the high-level API +This module implements the high-level API to train a CRF """ from chaine.crf import Model, Trainer From 20fab7132e122a3613dcd9085d151c231ef5c963 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Tue, 5 Jan 2021 01:23:31 +0100 Subject: [PATCH 03/22] fix: typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5b390e3..7610d28 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ If you are interested in the theoretical concepts behind conditional random fiel ```python >>> import chaine ->>> tokens = [["John", "Lennon", "was", "born", "in" "Liverpool"]] +>>> tokens = [["John", "Lennon", "was", "born", "in", "Liverpool"]] >>> labels = [["B-PER", "I-PER", "O", "O", "O", "B-LOC"]] >>> model = chaine.train(tokens, labels, max_iterations=5) >>> model.predict(tokens) From d4d73bb6dc496b9f01791f05c17cc3c4292b5684 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Tue, 5 Jan 2021 01:23:44 +0100 Subject: [PATCH 04/22] chore: update description --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bce7743..e7487e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "chaine" -version = "0.2.2" -description = "A Lightweight Conditional Random Field" 
+version = "0.3.0" +description = "Linear-chain conditional random fields for natural language processing" authors = ["Severin Simmler "] readme = "README.md" build = "build.py" From c349974f66638abc6e2721e4d684b1eaabdc8158 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Tue, 5 Jan 2021 01:23:57 +0100 Subject: [PATCH 05/22] chore: update tests --- tests/test_logging.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/tests/test_logging.py b/tests/test_logging.py index a3a6345..bf18ef8 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -13,37 +13,3 @@ def test_logger(): assert logger.log_level == logger.DEBUG logger.log_level = logger.WARNING assert logger.log_level == logger.WARNING - - -def test_log_message(): - message = logging.LogMessage() - - assert message.iteration is None - assert message.loss is None - - message.iteration = "1" - message.loss = "1000.0" - assert message.iteration == "1" - assert message.loss == "1000.0" - assert str(message) == "Iteration: 1\tLoss: 1000.0" - - -def test_log_parser(): - parser = logging.LogParser() - - assert isinstance(parser.message, logging.LogMessage) - - text = parser.parse("Irrelevant message") - assert text is None - assert parser.message.iteration is None - assert parser.message.loss is None - - text = parser.parse("***** Iteration #1 *****\n") - assert text is None - assert parser.message.iteration == "1" - assert parser.message.loss is None - - text = parser.parse("Loss: 1000.0") - assert text == "Iteration: 1\tLoss: 1000.0" - assert parser.message.iteration is None - assert parser.message.loss is None From 8abe488f77343f6efa01ed1f3af32643eb8f393b Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Tue, 5 Jan 2021 01:24:22 +0100 Subject: [PATCH 06/22] fix: #22 --- chaine/crf.pyx | 17 ++--- chaine/crfsuite/lib/crf/src/crf1d_encode.c | 21 +------ chaine/crfsuite/lib/crf/src/crf1d_feature.c | 2 - chaine/crfsuite/lib/crf/src/crfsuite.c | 19 ------ 
chaine/crfsuite/lib/crf/src/crfsuite_train.c | 2 - chaine/crfsuite/lib/crf/src/logging.c | 13 +--- chaine/crfsuite/lib/crf/src/train_arow.c | 22 +------ .../lib/crf/src/train_averaged_perceptron.c | 18 +----- chaine/crfsuite/lib/crf/src/train_l2sgd.c | 62 ++++--------------- chaine/crfsuite/lib/crf/src/train_lbfgs.c | 37 ++--------- .../lib/crf/src/train_passive_aggressive.c | 22 +------ chaine/logging.py | 54 ---------------- 12 files changed, 36 insertions(+), 253 deletions(-) diff --git a/chaine/crf.pyx b/chaine/crf.pyx index 46c9565..131a28e 100644 --- a/chaine/crf.pyx +++ b/chaine/crf.pyx @@ -7,7 +7,7 @@ cimport crfsuite_api from libcpp.string cimport string import os -from chaine.logging import Logger, LogParser +from chaine.logging import Logger from chaine.typing import Dataset, Dict, Iterable, Labels, List, Path, Sequence LOGGER = Logger(__name__) @@ -202,7 +202,6 @@ cdef class Trainer: "variance": float, "gamma": float, } - _log_parser = LogParser() def __init__(self, algorithm="l2sgd", **kwargs): self._select_algorithm(algorithm) @@ -246,14 +245,13 @@ cdef class Trainer: features. One item consists only of the relevant features. Internally, the string features are hash-mapped and a sparse matrix is constructed. 
""" - LOGGER.info("Loading data") + LOGGER.info("Loading training data (this may take a while)") for i, (sequence, labels_) in enumerate(zip(dataset, labels)): - # log progress every 10000 data points - if i > 0 and i % 10000 == 0: - LOGGER.info(f"Processed sequences: {i}") + # log progress every 100 data points + if i > 0 and i % 100 == 0: + LOGGER.debug(f"{i} processed data points") self._append(sequence, labels_) - LOGGER.info("Start training") status_code = self._c_trainer.train(str(model_filepath), -1) if status_code != crfsuite_api.CRFSUITE_SUCCESS: LOGGER.error(f"An error ({status_code}) occured") @@ -270,9 +268,7 @@ cdef class Trainer: self._message(message) def _message(self, message): - event = self._log_parser.parse(message) - if event: - LOGGER.info(event) + LOGGER.info(message) def _append(self, sequence, labels, int group=0): # no generators allowed @@ -400,7 +396,6 @@ cdef class Model: """ return [self.predict_proba_single(sequence) for sequence in sequences] - def _load(self, filepath): filepath = str(filepath) self._check_model(filepath) diff --git a/chaine/crfsuite/lib/crf/src/crf1d_encode.c b/chaine/crfsuite/lib/crf/src/crf1d_encode.c index 3c90256..aa4cc41 100644 --- a/chaine/crfsuite/lib/crf/src/crf1d_encode.c +++ b/chaine/crfsuite/lib/crf/src/crf1d_encode.c @@ -477,11 +477,7 @@ crf1de_set_data( } /* Feature generation. 
*/ - logging(lg, "Feature generation\n"); - logging(lg, "type: CRF1d\n"); - logging(lg, "feature.minfreq: %f\n", opt->feature_minfreq); - logging(lg, "feature.possible_states: %d\n", opt->feature_possible_states); - logging(lg, "feature.possible_transitions: %d\n", opt->feature_possible_transitions); + logging(lg, "Processing training data"); begin = clock(); crf1de->features = crf1df_generate( &crf1de->num_features, @@ -498,9 +494,6 @@ crf1de_set_data( ret = CRFSUITEERR_OUTOFMEMORY; goto error_exit; } - logging(lg, "Number of features: %d\n", crf1de->num_features); - logging(lg, "Seconds required: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC); - logging(lg, "\n"); /* Initialize the feature references. */ crf1df_init_references( @@ -544,7 +537,7 @@ crf1de_save_model( int J = 0, B = 0; /* Start storing the model. */ - logging(lg, "Storing the model\n"); + logging(lg, "Saving model"); begin = clock(); /* Allocate and initialize the feature mapping. */ @@ -641,12 +634,7 @@ crf1de_save_model( goto error_exit; } - logging(lg, "Number of active features: %d (%d)\n", J, K); - logging(lg, "Number of active attributes: %d (%d)\n", B, A); - logging(lg, "Number of active labels: %d (%d)\n", L, L); - /* Write labels. */ - logging(lg, "Writing labels\n", L); if (ret = crf1dmw_open_labels(writer, L)) { goto error_exit; @@ -670,7 +658,6 @@ crf1de_save_model( } /* Write attributes. */ - logging(lg, "Writing attributes\n"); if (ret = crf1dmw_open_attrs(writer, B)) { goto error_exit; @@ -697,7 +684,6 @@ crf1de_save_model( } /* Write label feature references. */ - logging(lg, "Writing feature references for transitions\n"); if (ret = crf1dmw_open_labelrefs(writer, L + 2)) { goto error_exit; @@ -716,7 +702,6 @@ crf1de_save_model( } /* Write attribute feature references. */ - logging(lg, "Writing feature references for attributes\n"); if (ret = crf1dmw_open_attrrefs(writer, B)) { goto error_exit; @@ -739,8 +724,6 @@ crf1de_save_model( /* Close the writer. 
*/ crf1dmw_close(writer); - logging(lg, "Seconds required: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC); - logging(lg, "\n"); free(amap); free(fmap); diff --git a/chaine/crfsuite/lib/crf/src/crf1d_feature.c b/chaine/crfsuite/lib/crf/src/crf1d_feature.c index 010ea63..fc25f4f 100644 --- a/chaine/crfsuite/lib/crf/src/crf1d_feature.c +++ b/chaine/crfsuite/lib/crf/src/crf1d_feature.c @@ -190,8 +190,6 @@ crf1df_feature_t *crf1df_generate( set = featureset_new(); /* Loop over the sequences in the training data. */ - logging_progress_start(&lg); - for (s = 0; s < N; ++s) { int prev = L, cur = 0; diff --git a/chaine/crfsuite/lib/crf/src/crfsuite.c b/chaine/crfsuite/lib/crf/src/crfsuite.c index 474d35c..d6f6f01 100644 --- a/chaine/crfsuite/lib/crf/src/crfsuite.c +++ b/chaine/crfsuite/lib/crf/src/crfsuite.c @@ -477,8 +477,6 @@ void crfsuite_evaluation_output(crfsuite_evaluation_t *eval, crfsuite_dictionary lg.func = cbm; lg.instance = instance; - logging(&lg, "Performance by label (#match, #model, #ref) (precision, recall, F1):\n"); - for (i = 0; i < eval->num_labels; ++i) { const crfsuite_label_evaluation_t *lev = &eval->tbl[i]; @@ -487,25 +485,8 @@ void crfsuite_evaluation_output(crfsuite_evaluation_t *eval, crfsuite_dictionary if (lstr == NULL) lstr = "[UNKNOWN]"; - if (lev->num_observation == 0) - { - logging(&lg, " %s: (%d, %d, %d) (******, ******, ******)\n", - lstr, lev->num_correct, lev->num_model, lev->num_observation); - } - else - { - logging(&lg, " %s: (%d, %d, %d) (%1.4f, %1.4f, %1.4f)\n", - lstr, lev->num_correct, lev->num_model, lev->num_observation, - lev->precision, lev->recall, lev->fmeasure); - } labels->free(labels, lstr); } - logging(&lg, "Macro-average precision, recall, F1: (%f, %f, %f)\n", - eval->macro_precision, eval->macro_recall, eval->macro_fmeasure); - logging(&lg, "Item accuracy: %d / %d (%1.4f)\n", - eval->item_total_correct, eval->item_total_num, eval->item_accuracy); - logging(&lg, "Instance accuracy: %d / %d (%1.4f)\n", - 
eval->inst_total_correct, eval->inst_total_num, eval->inst_accuracy); } int crfsuite_interlocked_increment(int *count) diff --git a/chaine/crfsuite/lib/crf/src/crfsuite_train.c b/chaine/crfsuite/lib/crf/src/crfsuite_train.c index 7a6e200..3682192 100644 --- a/chaine/crfsuite/lib/crf/src/crfsuite_train.c +++ b/chaine/crfsuite/lib/crf/src/crfsuite_train.c @@ -150,8 +150,6 @@ static int crfsuite_train_train( if (0 <= holdout) { dataset_init_testset(&testset, (crfsuite_data_t *)data, holdout); - logging(lg, "Holdout group: %d\n", holdout + 1); - logging(lg, "\n"); } /* Set the training set to the CRF, and generate features. */ diff --git a/chaine/crfsuite/lib/crf/src/logging.c b/chaine/crfsuite/lib/crf/src/logging.c index 3e17ffa..75064e2 100644 --- a/chaine/crfsuite/lib/crf/src/logging.c +++ b/chaine/crfsuite/lib/crf/src/logging.c @@ -64,12 +64,6 @@ void logging_timestamp(logging_t *lg, const char *format) logging(lg, format, timestamp); } -void logging_progress_start(logging_t *lg) -{ - lg->percent = 0; - logging(lg, "0"); -} - void logging_progress(logging_t *lg, int percent) { while (lg->percent < percent) @@ -79,11 +73,7 @@ void logging_progress(logging_t *lg, int percent) { if (lg->percent % 10 == 0) { - logging(lg, "%d", lg->percent / 10); - } - else - { - logging(lg, "."); + logging(lg, "Processed %d%% of the training data", lg->percent); } } } @@ -92,5 +82,4 @@ void logging_progress(logging_t *lg, int percent) void logging_progress_end(logging_t *lg) { logging_progress(lg, 100); - logging(lg, "\n"); } diff --git a/chaine/crfsuite/lib/crf/src/train_arow.c b/chaine/crfsuite/lib/crf/src/train_arow.c index 4265017..cb30bb7 100644 --- a/chaine/crfsuite/lib/crf/src/train_arow.c +++ b/chaine/crfsuite/lib/crf/src/train_arow.c @@ -282,14 +282,7 @@ int crfsuite_train_arow( /* Initialize the covariance vector (diagnal matrix). */ vecset(cov, opt.variance, K); - /* Show the parameters. 
*/ - logging(lg, "Adaptive Regularization of Weights (AROW)\n"); - logging(lg, "variance: %f\n", opt.variance); - logging(lg, "gamma: %f\n", opt.gamma); - logging(lg, "max_iterations: %d\n", opt.max_iterations); - logging(lg, "epsilon: %f\n", opt.epsilon); - logging(lg, "\n"); - + logging(lg, "Start training with AROW"); beta = 1.0 / opt.gamma; /* Loop for epoch. */ @@ -381,10 +374,7 @@ int crfsuite_train_arow( } /* Output the progress. */ - logging(lg, "***** Iteration #%d *****\n", i + 1); - logging(lg, "Loss: %f\n", sum_loss); - logging(lg, "Feature norm: %f\n", sqrt(vecdot(mean, mean, K))); - logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - iteration_begin) / (double)CLOCKS_PER_SEC); + logging(lg, "Iteration %d, training loss: %f", i + 1, sum_loss); /* Holdout evaluation if necessary. */ if (testset != NULL) @@ -392,20 +382,14 @@ int crfsuite_train_arow( holdout_evaluation(gm, testset, mean, lg); } - logging(lg, "\n"); - /* Convergence test. */ if (sum_loss / N <= opt.epsilon) { - logging(lg, "Terminated with the stopping criterion\n"); - logging(lg, "\n"); + logging(lg, "Loss has converged, terminating training"); break; } } - logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC); - logging(lg, "\n"); - free(viterbi); free(prod); free(cov); diff --git a/chaine/crfsuite/lib/crf/src/train_averaged_perceptron.c b/chaine/crfsuite/lib/crf/src/train_averaged_perceptron.c index ae014e5..9947a8d 100644 --- a/chaine/crfsuite/lib/crf/src/train_averaged_perceptron.c +++ b/chaine/crfsuite/lib/crf/src/train_averaged_perceptron.c @@ -143,10 +143,7 @@ int crfsuite_train_averaged_perceptron( } /* Show the parameters. 
*/ - logging(lg, "Averaged perceptron\n"); - logging(lg, "max_iterations: %d\n", opt.max_iterations); - logging(lg, "epsilon: %f\n", opt.epsilon); - logging(lg, "\n"); + logging(lg, "Start training with AP"); c = 1; ud.w = w; @@ -207,10 +204,7 @@ int crfsuite_train_averaged_perceptron( vecasub(wa, 1. / c, ws, K); /* Output the progress. */ - logging(lg, "***** Iteration #%d *****\n", i + 1); - logging(lg, "Loss: %f\n", loss); - logging(lg, "Feature norm: %f\n", sqrt(vecdot(wa, wa, K))); - logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - iteration_begin) / (double)CLOCKS_PER_SEC); + logging(lg, "Iteration %d, training loss: %f", i + 1, loss); /* Holdout evaluation if necessary. */ if (testset != NULL) @@ -218,20 +212,14 @@ int crfsuite_train_averaged_perceptron( holdout_evaluation(gm, testset, wa, lg); } - logging(lg, "\n"); - /* Convergence test. */ if (loss / N < opt.epsilon) { - logging(lg, "Terminated with the stopping criterion\n"); - logging(lg, "\n"); + logging(lg, "Loss has converged, terminating training"); break; } } - logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC); - logging(lg, "\n"); - free(viterbi); free(ws); free(w); diff --git a/chaine/crfsuite/lib/crf/src/train_l2sgd.c b/chaine/crfsuite/lib/crf/src/train_l2sgd.c index 1ee6006..7fd8f78 100644 --- a/chaine/crfsuite/lib/crf/src/train_l2sgd.c +++ b/chaine/crfsuite/lib/crf/src/train_l2sgd.c @@ -44,7 +44,7 @@ written by Léon Bottou. The objective function to minimize is: - + f(w) = (lambda/2) * ||w||^2 + (1/N) * \sum_i^N log P^i(y|x) lambda = 2 * C / N @@ -172,7 +172,6 @@ static int l2sgd( if (!calibration) { - logging(lg, "***** Epoch #%d *****\n", epoch); /* Shuffle the training instances. */ dataset_shuffle(trainset); } @@ -200,7 +199,7 @@ static int l2sgd( /* Terminate when the loss is abnormal (NaN, -Inf, +Inf). 
*/ if (!isfinite(loss)) { - logging(lg, "ERROR: overflow loss\n"); + logging(lg, "Loss is abnormal"); ret = CRFSUITEERR_OVERFLOW; sum_loss = loss; goto error_exit; @@ -239,22 +238,11 @@ static int l2sgd( /* Store the current value of the objective function. */ pf[(epoch - 1) % period] = sum_loss; - logging(lg, "Loss: %f\n", sum_loss); + logging(lg, "Epoch %d, learning rate: %f, training loss: %f", epoch, eta, sum_loss); if (period < epoch) { - logging(lg, "Improvement ratio: %f\n", improvement); - } - logging(lg, "Feature L2-norm: %f\n", sqrt(norm2)); - logging(lg, "Learning rate (eta): %f\n", eta); - logging(lg, "Total number of feature updates: %.0f\n", t); - logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - clk_prev) / (double)CLOCKS_PER_SEC); - - /* Holdout evaluation if necessary. */ - if (testset != NULL) - { - holdout_evaluation(gm, testset, w, lg); + logging(lg, "Improvement ratio: %f", improvement); } - logging(lg, "\n"); /* Check for the stopping criterion. */ if (improvement < epsilon) @@ -272,17 +260,13 @@ static int l2sgd( { if (epoch < num_epochs) { - logging(lg, "SGD terminated with the stopping criteria\n"); + logging(lg, "Loss has converged, terminating training"); } else { - logging(lg, "SGD terminated with the maximum number of iterations\n"); + logging(lg, "Reached maximum number of iterations, terminating training"); } } - else - { - logging(lg, "SGD terminated with error code (%d)\n", ret); - } } /* Restore the best weights. 
*/ @@ -326,12 +310,7 @@ l2sgd_calibration( const floatval_t rate = opt->calibration_rate; const floatval_t lambda = opt->lambda; - logging(lg, "Calibrating the learning rate (eta)\n"); - logging(lg, "calibration.eta: %f\n", eta); - logging(lg, "calibration.rate: %f\n", rate); - logging(lg, "calibration.samples: %d\n", S); - logging(lg, "calibration.candidates: %d\n", num); - logging(lg, "calibration.max_trials: %d\n", opt->calibration_max_trials); + logging(lg, "Calibrating learning rate"); /* Initialize a permutation that shuffles the instances. */ dataset_shuffle(ds); @@ -353,12 +332,10 @@ l2sgd_calibration( init_loss += score; } init_loss += 0.5 * lambda * vecdot(w, w, K) * N; - logging(lg, "Initial loss: %f\n", init_loss); + logging(lg, "Initial training loss: %f", init_loss); while (num > 0 || !dec) { - logging(lg, "Trial #%d (eta = %f): ", trials, eta); - /* Perform SGD for one epoch. */ l2sgd( gm, @@ -370,15 +347,13 @@ l2sgd_calibration( /* Make sure that the learning rate decreases the log-likelihood. */ ok = isfinite(loss) && (loss < init_loss); + + logging(lg, "Trial %d, learning rate %f, training loss: %f", trials, eta, loss); + if (ok) { - logging(lg, "%f\n", loss); --num; } - else - { - logging(lg, "%f (worse)\n", loss); - } if (isfinite(loss) && loss < best_loss) { @@ -412,9 +387,7 @@ l2sgd_calibration( } eta = best_eta; - logging(lg, "Best learning rate (eta): %f\n", eta); - logging(lg, "Seconds required: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC); - logging(lg, "\n"); + logging(lg, "Best learning rate: %f", eta); return 1.0 / (lambda * eta); } @@ -491,12 +464,7 @@ int crfsuite_train_l2sgd( opt.lambda = 2. 
* opt.c2 / N; - logging(lg, "Stochastic Gradient Descent (SGD)\n"); - logging(lg, "c2: %f\n", opt.c2); - logging(lg, "max_iterations: %d\n", opt.max_iterations); - logging(lg, "period: %d\n", opt.period); - logging(lg, "delta: %f\n", opt.delta); - logging(lg, "\n"); + logging(lg, "Start training with SGD"); clk_begin = clock(); /* Calibrate the training rate (eta). */ @@ -518,10 +486,6 @@ int crfsuite_train_l2sgd( opt.delta, &loss); - logging(lg, "Loss: %f\n", loss); - logging(lg, "Total seconds required for training: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC); - logging(lg, "\n"); - *ptr_w = w; return ret; diff --git a/chaine/crfsuite/lib/crf/src/train_lbfgs.c b/chaine/crfsuite/lib/crf/src/train_lbfgs.c index 3148e9c..db553c0 100644 --- a/chaine/crfsuite/lib/crf/src/train_lbfgs.c +++ b/chaine/crfsuite/lib/crf/src/train_lbfgs.c @@ -143,14 +143,7 @@ static int lbfgs_progress( } /* Report the progress. */ - logging(lg, "***** Iteration #%d *****\n", k); - logging(lg, "Loss: %f\n", fx); - logging(lg, "Feature norm: %f\n", xnorm); - logging(lg, "Error norm: %f\n", gnorm); - logging(lg, "Active features: %d\n", num_active_features); - logging(lg, "Line search trials: %d\n", ls); - logging(lg, "Line search step: %f\n", step); - logging(lg, "Seconds required for this iteration: %.3f\n", duration / (double)CLOCKS_PER_SEC); + logging(lg, "Iteration %d, training loss: %f", k, fx); /* Send the tagger with the current parameters. */ if (testset != NULL) @@ -158,8 +151,6 @@ static int lbfgs_progress( holdout_evaluation(gm, testset, x, lg); } - logging(lg, "\n"); - /* Continue. */ return 0; } @@ -253,17 +244,7 @@ int crfsuite_train_lbfgs( /* Read the L-BFGS parameters. 
*/ exchange_options(params, &opt, -1); - logging(lg, "L-BFGS optimization\n"); - logging(lg, "c1: %f\n", opt.c1); - logging(lg, "c2: %f\n", opt.c2); - logging(lg, "num_memories: %d\n", opt.memory); - logging(lg, "max_iterations: %d\n", opt.max_iterations); - logging(lg, "epsilon: %f\n", opt.epsilon); - logging(lg, "stop: %d\n", opt.stop); - logging(lg, "delta: %f\n", opt.delta); - logging(lg, "linesearch: %s\n", opt.linesearch); - logging(lg, "linesearch.max_iterations: %d\n", opt.linesearch_max_iterations); - logging(lg, "\n"); + logging(lg, "Start training with L-BFGS"); /* Set parameters for L-BFGS. */ lbfgsparam.m = opt.memory; @@ -315,29 +296,21 @@ int crfsuite_train_lbfgs( &lbfgsparam); if (lbret == LBFGS_CONVERGENCE) { - logging(lg, "L-BFGS resulted in convergence\n"); + logging(lg, "Loss has converged, terminating training"); } else if (lbret == LBFGS_STOP) { - logging(lg, "L-BFGS terminated with the stopping criteria\n"); + logging(lg, "Terminated with the stopping criteria"); } else if (lbret == LBFGSERR_MAXIMUMITERATION) { - logging(lg, "L-BFGS terminated with the maximum number of iterations\n"); - } - else - { - logging(lg, "L-BFGS terminated with error code (%d)\n", lbret); + logging(lg, "Reached maximum number of iterations. terminating training"); } /* Set the best_w array (allocated by us) as the result array, which the * callee can safely `free`. */ *ptr_w = lbfgsi.best_w; - /* Report the run-time for the training. */ - logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC); - logging(lg, "\n"); - /* Exit with success. 
*/ lbfgs_free(w); return 0; diff --git a/chaine/crfsuite/lib/crf/src/train_passive_aggressive.c b/chaine/crfsuite/lib/crf/src/train_passive_aggressive.c index 06b3685..2075fbb 100644 --- a/chaine/crfsuite/lib/crf/src/train_passive_aggressive.c +++ b/chaine/crfsuite/lib/crf/src/train_passive_aggressive.c @@ -317,14 +317,7 @@ int crfsuite_train_passive_aggressive( } /* Show the parameters. */ - logging(lg, "Passive Aggressive\n"); - logging(lg, "type: %d\n", opt.type); - logging(lg, "c: %f\n", opt.c); - logging(lg, "error_sensitive: %d\n", opt.error_sensitive); - logging(lg, "averaging: %d\n", opt.averaging); - logging(lg, "max_iterations: %d\n", opt.max_iterations); - logging(lg, "epsilon: %f\n", opt.epsilon); - logging(lg, "\n"); + logging(lg, "Start training with PA"); u = 1; @@ -414,10 +407,7 @@ int crfsuite_train_passive_aggressive( } /* Output the progress. */ - logging(lg, "***** Iteration #%d *****\n", i + 1); - logging(lg, "Loss: %f\n", sum_loss); - logging(lg, "Feature norm: %f\n", sqrt(vecdot(w, w, K))); - logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - iteration_begin) / (double)CLOCKS_PER_SEC); + logging(lg, "Iteration %d, training loss: %f", i + 1, sum_loss); /* Holdout evaluation if necessary. */ if (testset != NULL) @@ -425,20 +415,14 @@ int crfsuite_train_passive_aggressive( holdout_evaluation(gm, testset, wa, lg); } - logging(lg, "\n"); - /* Convergence test. 
*/ if (sum_loss / N < opt.epsilon) { - logging(lg, "Terminated with the stopping criterion\n"); - logging(lg, "\n"); + logging(lg, "Loss has converged, terminating training"); break; } } - logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC); - logging(lg, "\n"); - free(viterbi); free(ws); free(w); diff --git a/chaine/logging.py b/chaine/logging.py index 7e4b983..2bba9d1 100644 --- a/chaine/logging.py +++ b/chaine/logging.py @@ -114,57 +114,3 @@ def formatter(self): def __repr__(self): return f"" - - -class LogMessage: - """CRFsuite log message - - Attributes - ---------- - iteration : Optional[str] - Current number of iterations - loss : Optional[str] - Current loss score - """ - - def __init__(self): - self.iteration = None - self.loss = None - - def __str__(self) -> str: - return f"Iteration {self.iteration}, train loss: {self.loss}" - - -class LogParser: - """Parser for CRFsuite's logfile - - Attributes - ---------- - message : LogMessage - Log message with current iteration and loss - """ - - def __init__(self): - self.message = LogMessage() - - def parse(self, line: str) -> Optional[str]: - """Parse one line of the logs - - Parameters - ---------- - line : str - One line of CRFsuite's logs - - Returns - ------- - str - Formatted log message with latest iteration and loss - """ - if (m := re.match(r"\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n", line)) : - self.message.iteration = m.group(1) - elif (m := re.match(r"Loss: (\d+\.\d+)", line)) : - self.message.loss = m.group(1) - if self.message.iteration: - text = str(self.message) - self.message = LogMessage() - return text From 7323b24d936f0e8f0fde738168c560d53af8d1f3 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Tue, 5 Jan 2021 16:07:49 +0100 Subject: [PATCH 07/22] fix: #14 From bccc6dd9869836075e3f5b938ee1121d76b30698 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Tue, 5 Jan 2021 16:12:27 +0100 Subject: [PATCH 08/22] fix: #18 --- 
chaine/crfsuite/include/crfsuite.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chaine/crfsuite/include/crfsuite.hpp b/chaine/crfsuite/include/crfsuite.hpp index 1357222..a9bcf80 100644 --- a/chaine/crfsuite/include/crfsuite.hpp +++ b/chaine/crfsuite/include/crfsuite.hpp @@ -125,7 +125,7 @@ namespace CRFSuite if (xseq.size() != yseq.size()) { std::stringstream ss; - ss << "The numbers of items and labels differ: |x| = " << xseq.size() << ", |y| = " << yseq.size(); + ss << "The number of items and labels differ: |x| = " << xseq.size() << ", |y| = " << yseq.size(); throw std::invalid_argument(ss.str()); } From 2ebb337820a0cfb1e274efca43450696d9410008 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Tue, 5 Jan 2021 21:53:51 +0100 Subject: [PATCH 09/22] chore: logging --- chaine/crfsuite/lib/crf/src/train_l2sgd.c | 4 ---- chaine/data.py | 19 +++++++++---------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/chaine/crfsuite/lib/crf/src/train_l2sgd.c b/chaine/crfsuite/lib/crf/src/train_l2sgd.c index 7fd8f78..e9e3e32 100644 --- a/chaine/crfsuite/lib/crf/src/train_l2sgd.c +++ b/chaine/crfsuite/lib/crf/src/train_l2sgd.c @@ -239,10 +239,6 @@ static int l2sgd( pf[(epoch - 1) % period] = sum_loss; logging(lg, "Epoch %d, learning rate: %f, training loss: %f", epoch, eta, sum_loss); - if (period < epoch) - { - logging(lg, "Improvement ratio: %f", improvement); - } /* Check for the stopping criterion. 
*/ if (improvement < epsilon) diff --git a/chaine/data.py b/chaine/data.py index d612cd6..eab82a2 100644 --- a/chaine/data.py +++ b/chaine/data.py @@ -56,13 +56,12 @@ def is_upper(self) -> bool: """True if token is upper case, False otherwise""" return self.text.isupper() - -class TokenSequence: - def __init__(self, tokens): - if not all(isinstance(token, Token) for token in tokens): - tokens = [Token(index, text) for index, text in enumerate(tokens)] - self.tokens = tokens - - def __iter__(self): - for token in self.tokens: - yield token + @property + def features(self): + return {"num_characters": len(self), + "text": self.lower(), + "shape": self.shape, + "is_digit": self.is_digit, + "is_lower": self.is_lower, + "is_title": self.is_title, + "is_upper": self.is_upper} From 6607a618a725a74530cf80414a0c4c3e530a1a3c Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Tue, 5 Jan 2021 21:54:05 +0100 Subject: [PATCH 10/22] chore: how it works --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7610d28..53a9176 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ $ pip install chaine If you are interested in the theoretical concepts behind conditional random fields, please refer to the introducing paper by [Lafferty et al](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers). -## Example +## Minimal working example ```python >>> import chaine @@ -31,6 +31,11 @@ If you are interested in the theoretical concepts behind conditional random fiel Check out the introducing [Jupyter notebook](https://github.com/severinsimmler/chaine/blob/master/notebooks/tutorial.ipynb). 
+## How it works + + + + ## Credits This library makes use of and is partially based on: From 48da240333c5b60f10191bc58adf5eed7b0b4ef4 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Tue, 5 Jan 2021 21:54:12 +0100 Subject: [PATCH 11/22] chore: rename folder --- examples/training.py | 30 ++++++++++ notebooks/tutorial.ipynb | 115 --------------------------------------- 2 files changed, 30 insertions(+), 115 deletions(-) create mode 100644 examples/training.py delete mode 100644 notebooks/tutorial.ipynb diff --git a/examples/training.py b/examples/training.py new file mode 100644 index 0000000..e16bf08 --- /dev/null +++ b/examples/training.py @@ -0,0 +1,30 @@ +import chaine +from chaine.data import Token +from flair.models import SequenceTagger +from flair.data import Sentence +import tqdm +import datasets + +TAGGER = SequenceTagger.load("pos-multi-fast") +DATASET = datasets.load_dataset("germaner") + + +def preprocess(dataset): + for tokens in tqdm.tqdm(dataset): + sentence = Sentence(" ".join(tokens), use_tokenizer=False) + TAGGER.predict(sentence) + pos_tags = [token.get_tag("upos").value for token in sentence] + features = [Token(i, text).features for i, text in enumerate(tokens)] + for token, pos in zip(features, pos_tags): + token["pos"] = pos + yield features + + + + + +if __name__ == "__main__": + tokens = preprocess(DATASET["train"]["tokens"][:10]) + labels = DATASET["train"]["ner_tags"][:10] + + model = chaine.train(tokens, labels) diff --git a/notebooks/tutorial.ipynb b/notebooks/tutorial.ipynb deleted file mode 100644 index 2dad489..0000000 --- a/notebooks/tutorial.ipynb +++ /dev/null @@ -1,115 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Named entity recognition" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import chaine" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "tokens 
= [[\"John\", \"Lennon\", \"was\", \"rhythm\", \"guitarist\" \"of\", \"The\", \"Beatles\", \".\"]]\n", - "labels = [[\"B-PER\", \"I-PER\", \"O\", \"O\", \"O\" \"O\", \"B-ORG\", \"I-ORG\", \"O\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-12-06 23:50:12,783] [INFO] Loading data\n", - "[2020-12-06 23:50:12,785] [INFO] Start training\n", - "[2020-12-06 23:50:12,789] [INFO] Iteration: 1\tLoss: 14.334076\n", - "[2020-12-06 23:50:12,792] [INFO] Iteration: 2\tLoss: 14.334064\n", - "[2020-12-06 23:50:12,793] [INFO] Iteration: 3\tLoss: 14.334053\n", - "[2020-12-06 23:50:12,794] [INFO] Iteration: 4\tLoss: 14.334041\n", - "[2020-12-06 23:50:12,796] [INFO] Iteration: 5\tLoss: 14.334029\n" - ] - } - ], - "source": [ - "crf = chaine.train(tokens, labels, max_iterations=5)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[['I-PER', 'I-PER', 'O', 'O', 'OO', 'B-ORG', 'I-ORG', 'O']]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "crf.predict(tokens)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Feature extraction\n", - "\n", - "```\n", - "identity of wi, identity of neighboring words\n", - "embeddings for wi, embeddings for neighboring words\n", - "part of speech of wi, part of speech of neighboring words\n", - "base-phrase syntactic chunk label of wi and neighboring words\n", - "presence of wi in a gazetteer\n", - "wi contains a particular prefix (from all prefixes of length ≤ 4)\n", - "wi contains a particular suffix (from all suffixes of length ≤ 4)\n", - "wi is all upper case\n", - "word shape of wi, word shape of neighboring words\n", - "short word shape of wi, short word shape of neighboring words\n", - "presence of hyphen\n", - "```" - ] - } - ], - "metadata": { - "kernelspec": 
{ - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 5052beaf7ce5605f23e986d946785edf70102192 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 21:24:44 +0100 Subject: [PATCH 12/22] fix: unit tests --- tests/{test_training.py => test_api.py} | 4 ++-- tests/test_crf.py | 18 ------------------ tests/test_data.py | 13 ------------- 3 files changed, 2 insertions(+), 33 deletions(-) rename tests/{test_training.py => test_api.py} (78%) delete mode 100644 tests/test_data.py diff --git a/tests/test_training.py b/tests/test_api.py similarity index 78% rename from tests/test_training.py rename to tests/test_api.py index 952c6d8..25e8538 100644 --- a/tests/test_training.py +++ b/tests/test_api.py @@ -1,4 +1,4 @@ -from chaine import training +from chaine import api from chaine.crf import Model @@ -6,7 +6,7 @@ def test_train(): sequences = [[{"foo"}, {"bar"}] for _ in range(50)] labels = [["O", "O"] for _ in range(50)] - crf = training.train(sequences, labels) + crf = api.train(sequences, labels) assert isinstance(crf, Model) assert crf.labels == {"O"} diff --git a/tests/test_crf.py b/tests/test_crf.py index 0cc5090..6ea6874 100644 --- a/tests/test_crf.py +++ b/tests/test_crf.py @@ -25,19 +25,6 @@ def model(serialized_model): return crf.Model(serialized_model) -def test_intbool(): - value = crf._intbool("0") - assert isinstance(value, bool) - assert value == False - - value = crf._intbool("1") - assert isinstance(value, bool) - assert value == True - - with pytest.raises(ValueError): - crf._intbool("foo") - - def test_trainer_algorithm_selection(): for algorithm in { "lbfgs", @@ -181,11 +168,6 @@ def test_arow_params(): assert param 
in trainer.params.keys() -def test_trainer_log_parser(): - trainer = crf.Trainer() - assert hasattr(trainer, "_log_parser") - - def test_training(tmpdir, dataset): trainer = crf.Trainer() model_filepath = Path(tmpdir.join("model.crf")) diff --git a/tests/test_data.py b/tests/test_data.py deleted file mode 100644 index f9213a2..0000000 --- a/tests/test_data.py +++ /dev/null @@ -1,13 +0,0 @@ -from chaine import data - - -def test_token(): - token = data.Token(0, "Foo") - assert len(token) == 3 - assert repr(token) == "" - assert str(token) == "Foo" - assert token.lower() == "foo" - assert token.is_digit == False - assert token.is_lower == False - assert token.is_title == True - assert token.is_upper == False From 37d26d1345a394be0e74b3f1429a29cea00d8360 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 21:24:59 +0100 Subject: [PATCH 13/22] chore: tune version number --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e7487e9..ba1133b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "chaine" -version = "0.3.0" +version = "1.0.0" description = "Linear-chain conditional random fields for natural language processing" authors = ["Severin Simmler "] readme = "README.md" From 3b305705114d35ec47aa1de64201e435d2557f59 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 21:25:16 +0100 Subject: [PATCH 14/22] chore: cleanup --- chaine/__init__.py | 4 +- chaine/{training.py => api.py} | 2 +- chaine/crf.pyx | 30 +++++++-------- chaine/data.py | 67 ---------------------------------- chaine/logging.py | 7 +--- chaine/typing.py | 10 ++--- 6 files changed, 22 insertions(+), 98 deletions(-) rename chaine/{training.py => api.py} (99%) delete mode 100644 chaine/data.py diff --git a/chaine/__init__.py b/chaine/__init__.py index 78eda0f..ea245da 100644 --- a/chaine/__init__.py +++ b/chaine/__init__.py @@ -1,2 +1,2 @@ -from chaine.training import 
train -from chaine.crf import Model, Trainer +from chaine.api import train +from chaine import crf diff --git a/chaine/training.py b/chaine/api.py similarity index 99% rename from chaine/training.py rename to chaine/api.py index c6295e5..4878d4e 100644 --- a/chaine/training.py +++ b/chaine/api.py @@ -15,7 +15,7 @@ def train(dataset: Dataset, labels: Labels, **kwargs) -> Model: Parameters ---------- dataset : Dataset - Dataset consisting of sequences of features + Dataset consisting of sequences of feature sets labels : Labels Labels corresponding to each instance in the dataset algorithm : str diff --git a/chaine/crf.pyx b/chaine/crf.pyx index 131a28e..4f16374 100644 --- a/chaine/crf.pyx +++ b/chaine/crf.pyx @@ -8,16 +8,11 @@ from libcpp.string cimport string import os from chaine.logging import Logger -from chaine.typing import Dataset, Dict, Iterable, Labels, List, Path, Sequence +from chaine.typing import Dataset, Dict, Iterable, Labels, List, Filepath, Sequence LOGGER = Logger(__name__) -def _intbool(value: str) -> bool: - """Helper function to cast a string to an integer to a boolean""" - return bool(int(value)) - - cdef class Trainer: """Model trainer @@ -148,6 +143,7 @@ cdef class Trainer: "lbfgs": "lbfgs", "limited-memory-bfgs": "lbfgs", "l2sgd": "l2sgd", + "sgd": "l2sgd", "stochastic-gradient-descent": "l2sgd", "ap": "averaged-perceptron", "averaged-perceptron": "averaged-perceptron", @@ -179,8 +175,8 @@ cdef class Trainer: } _parameter_types = { "feature.minfreq": float, - "feature.possible_states": _intbool, - "feature.possible_transitions": _intbool, + "feature.possible_states": lambda value: bool(int(value)), + "feature.possible_transitions": lambda value: bool(int(value)), "c1": float, "c2": float, "max_iterations": int, @@ -197,8 +193,8 @@ cdef class Trainer: "calibration.max_trials": int, "type": int, "c": float, - "error_sensitive": _intbool, - "averaging": _intbool, + "error_sensitive": lambda value: bool(int(value)), + "averaging": lambda value: 
bool(int(value)), "variance": float, "gamma": float, } @@ -210,14 +206,14 @@ cdef class Trainer: def __cinit__(self): self._c_trainer.set_handler(self, self._on_message) - self._c_trainer.select("lbfgs", "crf1d") + self._c_trainer.select("l2sgd", "crf1d") self._c_trainer._init_trainer() def __repr__(self): """Representation of the trainer""" return f"" - def train(self, dataset: Dataset, labels: Labels, model_filepath: Path): + def train(self, dataset: Dataset, labels: Labels, model_filepath: Filepath): """Train a conditional random field Parameters @@ -226,7 +222,7 @@ cdef class Trainer: Training data set labels : Labels Corresponding true labels - model_filepath : Path + model_filepath : Filepath Path the trained model is written to Note @@ -310,17 +306,17 @@ cdef class Trainer: cdef class Model: - """Conditional random field + """Linear-chain conditional random field Parameters ---------- - model_filepath : str + model_filepath : Filepath Path to the trained model """ cdef crfsuite_api.Tagger c_tagger - def __init__(self, model_filepath): - self._load(model_filepath) + def __init__(self, model_filepath: Filepath): + self._load(str(model_filepath)) def __repr__(self): """Representation of the model""" diff --git a/chaine/data.py b/chaine/data.py deleted file mode 100644 index eab82a2..0000000 --- a/chaine/data.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -chaine.data -~~~~~~~~~~~ - -This module provides basic data structures -""" - -import re -from dataclasses import dataclass - - -@dataclass -class Token: - index: int - text: str - - def __len__(self) -> int: - """Number of characters""" - return len(self.text) - - def __repr__(self) -> str: - """Representation of the token""" - return f"" - - def __str__(self) -> str: - """String representation of the token""" - return self.text - - def lower(self) -> str: - """Lower case of the token""" - return self.text.lower() - - @property - def shape(self) -> str: - text = re.sub("[A-Z]", "X", self.text) - text = 
re.sub("[a-z]", "x", text) - return re.sub("[0-9]", "d", text) - - @property - def is_digit(self) -> bool: - """True if token is a digit, False otherwise""" - return self.text.isdigit() - - @property - def is_lower(self) -> bool: - """True if token is lower case, False otherwise""" - return self.text.islower() - - @property - def is_title(self) -> bool: - """True if first letter is upper case, False otherwise""" - return self.text.istitle() - - @property - def is_upper(self) -> bool: - """True if token is upper case, False otherwise""" - return self.text.isupper() - - @property - def features(self): - return {"num_characters": len(self), - "text": self.lower(), - "shape": self.shape, - "is_digit": self.is_digit, - "is_lower": self.is_lower, - "is_title": self.is_title, - "is_upper": self.is_upper} diff --git a/chaine/logging.py b/chaine/logging.py index 2bba9d1..8abc41d 100644 --- a/chaine/logging.py +++ b/chaine/logging.py @@ -2,15 +2,12 @@ chaine.logging ~~~~~~~~~~~~~~ -This module implements a basic logger and a parser for CRFsuite +This module implements a basic logger """ import logging -import re import sys -from chaine.typing import Optional - class Logger(logging.Logger): DEBUG = logging.DEBUG @@ -81,7 +78,7 @@ def error(self, message: str): @property def log_level(self) -> int: - """Log level. 
+ """Log level Returns ------- diff --git a/chaine/typing.py b/chaine/typing.py index 72008d6..4a73aeb 100644 --- a/chaine/typing.py +++ b/chaine/typing.py @@ -5,12 +5,10 @@ A collection of type hints """ -from pathlib import Path as _Path +from pathlib import Path from typing import Any, Dict, Generator, Iterable, List, Optional, Set, Union -FeatureGenerator = Generator[List[str], None, None] -TokenGenerator = Generator["Token", None, None] -Labels = Iterable[str] +Labels = Iterable[Iterable[str]] Dataset = Iterable[Iterable[str]] -Path = Union[_Path, str] -Sequence = List[Set[str]] +Filepath = Union[Path, str] +Sequence = List[Union[Set[str], Dict[str, Union[int, float, str, bool]]]] From 602101d6a46300800535561f82e0963814fd0a27 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 22:41:03 +0100 Subject: [PATCH 15/22] chore: add dev dependencies --- poetry.lock | 396 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 + 2 files changed, 397 insertions(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 7ee636c..a60fc12 100644 --- a/poetry.lock +++ b/poetry.lock @@ -155,6 +155,35 @@ category = "dev" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "datasets" +version = "1.2.0" +description = "HuggingFace/Datasets is an open library of NLP datasets." 
+category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +dill = "*" +multiprocess = "*" +numpy = ">=1.17" +pandas = "*" +pyarrow = ">=0.17.1" +requests = ">=2.19.0" +tqdm = ">=4.27,<4.50.0" +xxhash = "*" + +[package.extras] +apache-beam = ["apache-beam"] +benchmarks = ["numpy (==1.18.5)", "tensorflow (==2.3.0)", "torch (==1.6.0)", "transformers (==3.0.2)"] +dev = ["apache-beam", "absl-py", "bs4", "conllu", "elasticsearch", "faiss-cpu", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "pytest", "pytest-xdist", "tensorflow", "torch", "tldextract", "transformers", "zstandard", "rarfile", "black", "isort", "flake8 (==3.7.9)"] +docs = ["recommonmark", "sphinx (==3.1.2)", "sphinx-markdown-tables", "sphinx-rtd-theme (==0.4.3)", "sphinx-copybutton"] +quality = ["black", "isort", "flake8 (==3.7.9)"] +tensorflow = ["tensorflow (>=2.2.0)"] +tensorflow_gpu = ["tensorflow-gpu (>=2.2.0)"] +tests = ["apache-beam", "absl-py", "bs4", "conllu", "elasticsearch", "faiss-cpu", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "pytest", "pytest-xdist", "tensorflow", "torch", "tldextract", "transformers", "zstandard", "rarfile"] +torch = ["torch"] + [[package]] name = "decorator" version = "4.4.2" @@ -171,6 +200,17 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "dill" +version = "0.3.3" +description = "serialize all of python" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*" + +[package.extras] +graph = ["objgraph (>=1.7.2)"] + [[package]] name = "entrypoints" version = "0.3" @@ -294,6 +334,14 @@ MarkupSafe = ">=0.23" [package.extras] i18n = ["Babel (>=0.8)"] +[[package]] +name = "joblib" +version = "1.0.0" +description = "Lightweight pipelining with Python functions" +category = "dev" +optional = false +python-versions = ">=3.6" + [[package]] name = "json5" version = "0.9.5" @@ -415,6 +463,17 @@ category = "dev" 
optional = false python-versions = "*" +[[package]] +name = "multiprocess" +version = "0.70.11.1" +description = "better multiprocessing and multithreading in python" +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +dill = ">=0.3.3" + [[package]] name = "mypy-extensions" version = "0.4.3" @@ -528,6 +587,14 @@ docs = ["sphinx", "nbsphinx", "sphinxcontrib-github-alt", "sphinx-rtd-theme"] json-logging = ["json-logging"] test = ["pytest", "coverage", "requests", "nbval", "selenium", "pytest-cov", "requests-unixsocket"] +[[package]] +name = "numpy" +version = "1.19.5" +description = "NumPy is the fundamental package for array computing with Python." +category = "dev" +optional = false +python-versions = ">=3.6" + [[package]] name = "packaging" version = "20.8" @@ -539,6 +606,22 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [package.dependencies] pyparsing = ">=2.0.2" +[[package]] +name = "pandas" +version = "1.2.0" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "dev" +optional = false +python-versions = ">=3.7.1" + +[package.dependencies] +numpy = ">=1.16.5" +python-dateutil = ">=2.7.3" +pytz = ">=2017.3" + +[package.extras] +test = ["pytest (>=5.0.1)", "pytest-xdist", "hypothesis (>=3.58)"] + [[package]] name = "pandocfilters" version = "1.4.3" @@ -635,6 +718,17 @@ category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "pyarrow" +version = "2.0.0" +description = "Python library for Apache Arrow" +category = "dev" +optional = false +python-versions = ">=3.5" + +[package.dependencies] +numpy = ">=1.14" + [[package]] name = "pycparser" version = "2.20" @@ -699,6 +793,14 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" [package.dependencies] six = ">=1.5" +[[package]] +name = "pytz" +version = "2020.5" +description = "World timezone definitions, modern and historical" +category = "dev" +optional = 
false +python-versions = "*" + [[package]] name = "pywin32" version = "300" @@ -753,6 +855,37 @@ urllib3 = ">=1.21.1,<1.27" security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"] socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] +[[package]] +name = "scikit-learn" +version = "0.24.0" +description = "A set of python modules for machine learning and data mining" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +joblib = ">=0.11" +numpy = ">=1.13.3" +scipy = ">=0.19.1" +threadpoolctl = ">=2.0.0" + +[package.extras] +benchmark = ["matplotlib (>=2.1.1)", "pandas (>=0.25.0)", "memory-profiler (>=0.57.0)"] +docs = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)", "memory-profiler (>=0.57.0)", "sphinx (>=3.2.0)", "sphinx-gallery (>=0.7.0)", "numpydoc (>=1.0.0)", "Pillow (>=7.1.2)", "sphinx-prompt (>=1.3.0)"] +examples = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)"] +tests = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "flake8 (>=3.8.2)", "mypy (>=0.770)", "pyamg (>=4.0.0)"] + +[[package]] +name = "scipy" +version = "1.6.0" +description = "SciPy: Scientific Library for Python" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +numpy = ">=1.16.5" + [[package]] name = "send2trash" version = "1.5.0" @@ -761,6 +894,18 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "seqeval" +version = "1.2.2" +description = "Testing framework for sequence labeling" +category = "dev" +optional = false +python-versions = "*" + +[package.dependencies] +numpy = ">=1.14.0" +scikit-learn = ">=0.21.3" + [[package]] name = "six" version = "1.15.0" @@ -793,6 +938,14 @@ python-versions = "*" [package.extras] test = ["pathlib2"] +[[package]] +name = "threadpoolctl" +version = "2.1.0" +description = "threadpoolctl" +category = 
"dev" +optional = false +python-versions = ">=3.5" + [[package]] name = "toml" version = "0.10.2" @@ -809,6 +962,17 @@ category = "dev" optional = false python-versions = ">= 3.5" +[[package]] +name = "tqdm" +version = "4.49.0" +description = "Fast, Extensible Progress Meter" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*" + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "argopt", "pydoc-markdown"] + [[package]] name = "traitlets" version = "5.0.5" @@ -868,10 +1032,18 @@ category = "dev" optional = false python-versions = "*" +[[package]] +name = "xxhash" +version = "2.0.0" +description = "Python binding for xxHash" +category = "dev" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "4b1a7d194de7b81ec20900062a23abb84d66e4e7ec3f89d9fcc3329ff6fcafc3" +content-hash = "cddf572f73652339004a5238d90b348b711178c3daf5685ff302ea3175f66ce7" [metadata.files] appdirs = [ @@ -1016,6 +1188,10 @@ cython = [ {file = "Cython-0.29.21-py2.py3-none-any.whl", hash = "sha256:5c4276fdcbccdf1e3c1756c7aeb8395e9a36874fa4d30860e7694f43d325ae13"}, {file = "Cython-0.29.21.tar.gz", hash = "sha256:e57acb89bd55943c8d8bf813763d20b9099cc7165c0f16b707631a7654be9cad"}, ] +datasets = [ + {file = "datasets-1.2.0-py3-none-any.whl", hash = "sha256:4f60447d0b80c2ce26e54893fb515c6798742990c3217ca2c60b86758fd29f49"}, + {file = "datasets-1.2.0.tar.gz", hash = "sha256:695ba8d7644b03dc56bee1339447cc22a1fc358efc64ab0e7eb42510e8f9a4ac"}, +] decorator = [ {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"}, {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"}, @@ -1024,6 +1200,10 @@ defusedxml = [ {file = "defusedxml-0.6.0-py2.py3-none-any.whl", hash = "sha256:6687150770438374ab581bb7a1b327a847dd9c5749e396102de3fad4e8a3ef93"}, {file = 
"defusedxml-0.6.0.tar.gz", hash = "sha256:f684034d135af4c6cbb949b8a4d2ed61634515257a67299e5f940fbaa34377f5"}, ] +dill = [ + {file = "dill-0.3.3-py2.py3-none-any.whl", hash = "sha256:78370261be6ea49037ace8c17e0b7dd06d0393af6513cc23f9b222d9367ce389"}, + {file = "dill-0.3.3.zip", hash = "sha256:efb7f6cb65dba7087c1e111bb5390291ba3616741f96840bfc75792a1a9b5ded"}, +] entrypoints = [ {file = "entrypoints-0.3-py2.py3-none-any.whl", hash = "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19"}, {file = "entrypoints-0.3.tar.gz", hash = "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"}, @@ -1060,6 +1240,10 @@ jinja2 = [ {file = "Jinja2-2.11.2-py2.py3-none-any.whl", hash = "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"}, {file = "Jinja2-2.11.2.tar.gz", hash = "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0"}, ] +joblib = [ + {file = "joblib-1.0.0-py3-none-any.whl", hash = "sha256:75ead23f13484a2a414874779d69ade40d4fa1abe62b222a23cd50d4bc822f6f"}, + {file = "joblib-1.0.0.tar.gz", hash = "sha256:7ad866067ac1fdec27d51c8678ea760601b70e32ff1881d4dc8e1171f2b64b24"}, +] json5 = [ {file = "json5-0.9.5-py2.py3-none-any.whl", hash = "sha256:af1a1b9a2850c7f62c23fde18be4749b3599fd302f494eebf957e2ada6b9e42c"}, {file = "json5-0.9.5.tar.gz", hash = "sha256:703cfee540790576b56a92e1c6aaa6c4b0d98971dc358ead83812aa4d06bdb96"}, @@ -1127,6 +1311,19 @@ mistune = [ {file = "mistune-0.8.4-py2.py3-none-any.whl", hash = "sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4"}, {file = "mistune-0.8.4.tar.gz", hash = "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e"}, ] +multiprocess = [ + {file = "multiprocess-0.70.11.1-cp27-cp27m-macosx_10_8_x86_64.whl", hash = "sha256:8f0d0640642acc654fe2fb5cb529ebbe116468a1dd1544d484db6e79033767c8"}, + {file = "multiprocess-0.70.11.1-cp27-cp27m-manylinux1_i686.whl", hash = 
"sha256:4b33a0111e341fad5e3c6bb6dd7f592596f2974cc5ecddee06b9a999bac4cbb0"}, + {file = "multiprocess-0.70.11.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:0eab6e0e87acba9586e5d6869d21271cc865d72d74b7f6b30b6290dffca5caae"}, + {file = "multiprocess-0.70.11.1-cp27-cp27m-win32.whl", hash = "sha256:4d97020a50a18862fbb1f84d81914a2a28f2d78bc315de9a6699459682df2a67"}, + {file = "multiprocess-0.70.11.1-cp27-cp27m-win_amd64.whl", hash = "sha256:217e96638fbfd951a203b8dc17410839e4aea8aa3fb9cc393c37e491dcac2c65"}, + {file = "multiprocess-0.70.11.1-py35-none-any.whl", hash = "sha256:ebb92b67a61b901bfc277c4525e86afba24a60638d192b62f8c332933da995f4"}, + {file = "multiprocess-0.70.11.1-py36-none-any.whl", hash = "sha256:d8e87b086373fbd19c28659391e5b8888aadeaeb88f0e448e55502578bde4920"}, + {file = "multiprocess-0.70.11.1-py37-none-any.whl", hash = "sha256:164c77448e357ebee0dc6abc7ee8c823e40e295e629a5fc6d31725109a3a7ee9"}, + {file = "multiprocess-0.70.11.1-py38-none-any.whl", hash = "sha256:7761fed45cae123aa4b7bb918e77a5cfef6fd436c65bc87453e76bf2bdc3e29e"}, + {file = "multiprocess-0.70.11.1-py39-none-any.whl", hash = "sha256:ae026110257fc551fc949d96d69160768810d9019786c8c84c0c28d1f88fab67"}, + {file = "multiprocess-0.70.11.1.zip", hash = "sha256:9d5e417f3ebce4d027a3c900995840f167f316d9f73c0a7a1fbb4ac0116298d0"}, +] mypy-extensions = [ {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, @@ -1151,10 +1348,65 @@ notebook = [ {file = "notebook-6.1.6-py3-none-any.whl", hash = "sha256:e6a62188e319a5d45dd2ed24719f646adf88bef8be1f654ebd0ab360ece6d7a6"}, {file = "notebook-6.1.6.tar.gz", hash = "sha256:cf40d4f81541401db5a2fda1707ca7877157abd41f04ef7b88f02b67f3c61791"}, ] +numpy = [ + {file = "numpy-1.19.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = 
"sha256:cc6bd4fd593cb261332568485e20a0712883cf631f6f5e8e86a52caa8b2b50ff"}, + {file = "numpy-1.19.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:aeb9ed923be74e659984e321f609b9ba54a48354bfd168d21a2b072ed1e833ea"}, + {file = "numpy-1.19.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:8b5e972b43c8fc27d56550b4120fe6257fdc15f9301914380b27f74856299fea"}, + {file = "numpy-1.19.5-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:43d4c81d5ffdff6bae58d66a3cd7f54a7acd9a0e7b18d97abb255defc09e3140"}, + {file = "numpy-1.19.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:a4646724fba402aa7504cd48b4b50e783296b5e10a524c7a6da62e4a8ac9698d"}, + {file = "numpy-1.19.5-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:2e55195bc1c6b705bfd8ad6f288b38b11b1af32f3c8289d6c50d47f950c12e76"}, + {file = "numpy-1.19.5-cp36-cp36m-win32.whl", hash = "sha256:39b70c19ec771805081578cc936bbe95336798b7edf4732ed102e7a43ec5c07a"}, + {file = "numpy-1.19.5-cp36-cp36m-win_amd64.whl", hash = "sha256:dbd18bcf4889b720ba13a27ec2f2aac1981bd41203b3a3b27ba7a33f88ae4827"}, + {file = "numpy-1.19.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:603aa0706be710eea8884af807b1b3bc9fb2e49b9f4da439e76000f3b3c6ff0f"}, + {file = "numpy-1.19.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:cae865b1cae1ec2663d8ea56ef6ff185bad091a5e33ebbadd98de2cfa3fa668f"}, + {file = "numpy-1.19.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:36674959eed6957e61f11c912f71e78857a8d0604171dfd9ce9ad5cbf41c511c"}, + {file = "numpy-1.19.5-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:06fab248a088e439402141ea04f0fffb203723148f6ee791e9c75b3e9e82f080"}, + {file = "numpy-1.19.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:6149a185cece5ee78d1d196938b2a8f9d09f5a5ebfbba66969302a778d5ddd1d"}, + {file = "numpy-1.19.5-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:50a4a0ad0111cc1b71fa32dedd05fa239f7fb5a43a40663269bb5dc7877cfd28"}, + {file = "numpy-1.19.5-cp37-cp37m-win32.whl", hash = 
"sha256:d051ec1c64b85ecc69531e1137bb9751c6830772ee5c1c426dbcfe98ef5788d7"}, + {file = "numpy-1.19.5-cp37-cp37m-win_amd64.whl", hash = "sha256:a12ff4c8ddfee61f90a1633a4c4afd3f7bcb32b11c52026c92a12e1325922d0d"}, + {file = "numpy-1.19.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf2402002d3d9f91c8b01e66fbb436a4ed01c6498fffed0e4c7566da1d40ee1e"}, + {file = "numpy-1.19.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1ded4fce9cfaaf24e7a0ab51b7a87be9038ea1ace7f34b841fe3b6894c721d1c"}, + {file = "numpy-1.19.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:012426a41bc9ab63bb158635aecccc7610e3eff5d31d1eb43bc099debc979d94"}, + {file = "numpy-1.19.5-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:759e4095edc3c1b3ac031f34d9459fa781777a93ccc633a472a5468587a190ff"}, + {file = "numpy-1.19.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:a9d17f2be3b427fbb2bce61e596cf555d6f8a56c222bd2ca148baeeb5e5c783c"}, + {file = "numpy-1.19.5-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:99abf4f353c3d1a0c7a5f27699482c987cf663b1eac20db59b8c7b061eabd7fc"}, + {file = "numpy-1.19.5-cp38-cp38-win32.whl", hash = "sha256:384ec0463d1c2671170901994aeb6dce126de0a95ccc3976c43b0038a37329c2"}, + {file = "numpy-1.19.5-cp38-cp38-win_amd64.whl", hash = "sha256:811daee36a58dc79cf3d8bdd4a490e4277d0e4b7d103a001a4e73ddb48e7e6aa"}, + {file = "numpy-1.19.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c843b3f50d1ab7361ca4f0b3639bf691569493a56808a0b0c54a051d260b7dbd"}, + {file = "numpy-1.19.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:d6631f2e867676b13026e2846180e2c13c1e11289d67da08d71cacb2cd93d4aa"}, + {file = "numpy-1.19.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7fb43004bce0ca31d8f13a6eb5e943fa73371381e53f7074ed21a4cb786c32f8"}, + {file = "numpy-1.19.5-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2ea52bd92ab9f768cc64a4c3ef8f4b2580a17af0a5436f6126b08efbd1838371"}, + {file = "numpy-1.19.5-cp39-cp39-manylinux2010_x86_64.whl", hash = 
"sha256:400580cbd3cff6ffa6293df2278c75aef2d58d8d93d3c5614cd67981dae68ceb"}, + {file = "numpy-1.19.5-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:df609c82f18c5b9f6cb97271f03315ff0dbe481a2a02e56aeb1b1a985ce38e60"}, + {file = "numpy-1.19.5-cp39-cp39-win32.whl", hash = "sha256:ab83f24d5c52d60dbc8cd0528759532736b56db58adaa7b5f1f76ad551416a1e"}, + {file = "numpy-1.19.5-cp39-cp39-win_amd64.whl", hash = "sha256:0eef32ca3132a48e43f6a0f5a82cb508f22ce5a3d6f67a8329c81c8e226d3f6e"}, + {file = "numpy-1.19.5-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:a0d53e51a6cb6f0d9082decb7a4cb6dfb33055308c4c44f53103c073f649af73"}, + {file = "numpy-1.19.5.zip", hash = "sha256:a76f502430dd98d7546e1ea2250a7360c065a5fdea52b2dffe8ae7180909b6f4"}, +] packaging = [ {file = "packaging-20.8-py2.py3-none-any.whl", hash = "sha256:24e0da08660a87484d1602c30bb4902d74816b6985b93de36926f5bc95741858"}, {file = "packaging-20.8.tar.gz", hash = "sha256:78598185a7008a470d64526a8059de9aaa449238f280fc9eb6b13ba6c4109093"}, ] +pandas = [ + {file = "pandas-1.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cba93d4fd3b0a42858b2b599495aff793fb5d94587979f45a14177d1217ba446"}, + {file = "pandas-1.2.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:9e18631d996fe131de6cb31a8bdae18965cc8f39eb23fdfbbf42808ecc63dabf"}, + {file = "pandas-1.2.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7b54c14130a3448d81eed1348f52429c23e27188d9db6e6d4afeae792bc49c11"}, + {file = "pandas-1.2.0-cp37-cp37m-win32.whl", hash = "sha256:6c1a57e4d0d6f9633a07817c44e6b36d81c265fe4c52d0c0505513a2d0f7953c"}, + {file = "pandas-1.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:43482789c55cbabeed9482263cfc98a11e8fcae900cb63ef038948acb4a72570"}, + {file = "pandas-1.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0be6102dd99910513e75ed6536284743ead810349c51bdeadd2a5b6649f30abb"}, + {file = "pandas-1.2.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:9c6692cea6d56da8650847172bdb148622f545e7782d17995822434c79d7a211"}, + 
{file = "pandas-1.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:272675a98fa4954b9fc0933df775596fc942e50015d7e75d8f19548808a2bfdf"}, + {file = "pandas-1.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:33318fa24b192b1a4684347ff76679a7267fd4e547da9f71556a5914f0dc10e7"}, + {file = "pandas-1.2.0-cp38-cp38-win32.whl", hash = "sha256:3bc6d2be03cb75981d8cbeda09503cd9d6d699fc0dc28a65e197165ad527b7b8"}, + {file = "pandas-1.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:7904ee438549b5223ce8dc008772458dd7c5cf0ccc64cf903e81202400702235"}, + {file = "pandas-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f8b87d2f541cd9bc4ecfe85a561abac85c33fe4de4ce70cca36b2768af2611f5"}, + {file = "pandas-1.2.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:91fd0b94e7b98528177a05e6f65efea79d7ef9dec15ee48c7c69fc39fdd87235"}, + {file = "pandas-1.2.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:8f92b07cdbfa3704d85b4264e52c216cafe6c0059b0d07cdad8cb29e0b90f2b8"}, + {file = "pandas-1.2.0-cp39-cp39-win32.whl", hash = "sha256:2d8b4f532db37418121831a461fd107d826c240b098f52e7a1b4ab3d5aaa4fb2"}, + {file = "pandas-1.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:616478c1bd8fe1e600f521ae2da434e021c11e7a4e5da3451d02906143d3629a"}, + {file = "pandas-1.2.0.tar.gz", hash = "sha256:e03386615b970b8b41da6a68afe717626741bb2431cec993640685614c0680e4"}, +] pandocfilters = [ {file = "pandocfilters-1.4.3.tar.gz", hash = "sha256:bc63fbb50534b4b1f8ebe1860889289e8af94a23bff7445259592df25a3906eb"}, ] @@ -1194,6 +1446,33 @@ py = [ {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, ] +pyarrow = [ + {file = "pyarrow-2.0.0-cp35-cp35m-macosx_10_13_intel.whl", hash = "sha256:6afc71cc9c234f3cdbe971297468755ec3392966cb19d3a6caf42fd7dbc6aaa9"}, + {file = "pyarrow-2.0.0-cp35-cp35m-macosx_10_9_intel.whl", hash = 
"sha256:eb05038b750a6e16a9680f9d2c40d050796284ea1f94690da8f4f28805af0495"}, + {file = "pyarrow-2.0.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:3e33e9003794c9062f4c963a10f2a0d787b83d4d1a517a375294f2293180b778"}, + {file = "pyarrow-2.0.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:ffb306951b5925a0638dc2ef1ab7ce8033f39e5b4e0fef5787b91ef4fa7da19d"}, + {file = "pyarrow-2.0.0-cp35-cp35m-manylinux2014_x86_64.whl", hash = "sha256:dc0d04c42632e65c4fcbe2f82c70109c5f347652844ead285bc1285dc3a67660"}, + {file = "pyarrow-2.0.0-cp35-cp35m-win_amd64.whl", hash = "sha256:916b593a24f2812b9a75adef1143b1dd89d799e1803282fea2829c5dc0b828ea"}, + {file = "pyarrow-2.0.0-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:c801e59ec4e8d9d871e299726a528c3ba3139f2ce2d9cdab101f8483c52eec7c"}, + {file = "pyarrow-2.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0bf43e520c33ceb1dd47263a5326830fca65f18d827f7f7b8fe7e64fc4364d88"}, + {file = "pyarrow-2.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0b358773eb9fb1b31c8217c6c8c0b4681c3dff80562dc23ad5b379f0279dad69"}, + {file = "pyarrow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:1000e491e9a539588ec33a2c2603cf05f1d4629aef375345bfd64f2ab7bc8529"}, + {file = "pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:ce0462cec7f81c4ff87ce1a95c82a8d467606dce6c72e92906ac251c6115f32b"}, + {file = "pyarrow-2.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:16ec87163a2fb4abd48bf79cbdf70a7455faa83740e067c2280cfa45a63ed1f3"}, + {file = "pyarrow-2.0.0-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:acdd18fd83c0be0b53a8e734c0a650fb27bbf4e7d96a8f7eb0a7506ea58bd594"}, + {file = "pyarrow-2.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9a8d3c6baa6e159017d97e8a028ae9eaa2811d8f1ab3d22710c04dcddc0dd7a1"}, + {file = "pyarrow-2.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:652c5dff97624375ed0f97cc8ad6f88ee01953f15c17083917735de171f03fe0"}, + {file = "pyarrow-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl", 
hash = "sha256:00d8fb8a9b2d9bb2f0ced2765b62c5d72689eed06c47315bca004584b0ccda60"}, + {file = "pyarrow-2.0.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:fb69672e69e1b752744ee1e236fdf03aad78ffec905fc5c19adbaf88bac4d0fd"}, + {file = "pyarrow-2.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ccff3a72f70ebfcc002bf75f5ad1248065e5c9c14e0dcfa599a438ea221c5658"}, + {file = "pyarrow-2.0.0-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:bc8c3713086e4a137b3fda4b149440458b1b0bd72f67b1afa2c7068df1edc060"}, + {file = "pyarrow-2.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f4ba9ab479c0172e532f5d73c68e30a31c16b01e09bb21eba9201561231f722"}, + {file = "pyarrow-2.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0db5156a66615591a4a8c66a9a30890a364a259de8d2a6ccb873c7d1740e6c75"}, + {file = "pyarrow-2.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:cf9bf10daadbbf1a360ac1c7dab0b4f8381d81a3f452737bd6ed310d57a88be8"}, + {file = "pyarrow-2.0.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:dd661b6598ce566c6f41d31cc1fc4482308613c2c0c808bd8db33b0643192f84"}, + {file = "pyarrow-2.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:14b02a629986c25e045f81771799e07a8bb3f339898c111314066436769a3dd4"}, + {file = "pyarrow-2.0.0.tar.gz", hash = "sha256:b5e6cd217457e8febcc98a6c279b96f72d5c31a24cd2bffd8d3b2da701d2025c"}, +] pycparser = [ {file = "pycparser-2.20-py2.py3-none-any.whl", hash = "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"}, {file = "pycparser-2.20.tar.gz", hash = "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0"}, @@ -1217,6 +1496,10 @@ python-dateutil = [ {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"}, {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"}, ] +pytz = [ + {file = "pytz-2020.5-py2.py3-none-any.whl", hash = 
"sha256:16962c5fb8db4a8f63a26646d8886e9d769b6c511543557bc84e9569fb9a9cb4"}, + {file = "pytz-2020.5.tar.gz", hash = "sha256:180befebb1927b16f6b57101720075a984c019ac16b1b7575673bea42c6c3da5"}, +] pywin32 = [ {file = "pywin32-300-cp35-cp35m-win32.whl", hash = "sha256:1c204a81daed2089e55d11eefa4826c05e604d27fe2be40b6bf8db7b6a39da63"}, {file = "pywin32-300-cp35-cp35m-win_amd64.whl", hash = "sha256:350c5644775736351b77ba68da09a39c760d75d2467ecec37bd3c36a94fbed64"}, @@ -1319,10 +1602,61 @@ requests = [ {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"}, {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"}, ] +scikit-learn = [ + {file = "scikit-learn-0.24.0.tar.gz", hash = "sha256:076369634ee72b5a5941440661e2f306ff4ac30903802dc52031c7e9199ac640"}, + {file = "scikit_learn-0.24.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:890d7d588f65acb0c4f6c083347c9076916bda5e6bd8400f06244b1afc1009af"}, + {file = "scikit_learn-0.24.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:e534f5f3796db6781c87e9835dcd51b7854c8c5a379c9210b93605965c1941fd"}, + {file = "scikit_learn-0.24.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:d7fe05fcb44eadd6d6c874c768f085f5de1239db3a3b7be4d3d23d12e4120589"}, + {file = "scikit_learn-0.24.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:7f654befc5ad413690cc58f3f34a3e906caf825195ce0fda00a8e9565e1403e6"}, + {file = "scikit_learn-0.24.0-cp36-cp36m-win32.whl", hash = "sha256:afeb06dc69847927634e58579b9cdc72e1390b79497336b2324b1b173f33bd47"}, + {file = "scikit_learn-0.24.0-cp36-cp36m-win_amd64.whl", hash = "sha256:26f66b3726b54dfb76ea51c5d9c2431ed17ebc066cb4527662b9e851a3e7ba61"}, + {file = "scikit_learn-0.24.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c08b27cb78ee8d2dc781a7affed09859441f5b624f9f92da59ac0791c8774dfc"}, + {file = "scikit_learn-0.24.0-cp37-cp37m-manylinux2010_i686.whl", hash = 
"sha256:905d8934d1e27a686698864a5863ff2c0e13a2ae1adb78a8a848aacc8a49927d"}, + {file = "scikit_learn-0.24.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:d819d625832fb2969911a243e009cfa135cb8ef1e150866e417d6e9d75290087"}, + {file = "scikit_learn-0.24.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:18f7131e62265bf2691ed1d0303c640313894ccfe4278427478c6b2f45094b53"}, + {file = "scikit_learn-0.24.0-cp37-cp37m-win32.whl", hash = "sha256:b0d13fd56d26cf3de0314a4fd48037108c638fe126d813f5c1222bb0f08b6a76"}, + {file = "scikit_learn-0.24.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c912247e42114f389858ae05d63f4359d4e667ea72aaabee191aee9ad3f9774a"}, + {file = "scikit_learn-0.24.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:758619e49cd7c17282e6cc60d5cc73c02c072b47c9a10010bb3bb47e0d976e50"}, + {file = "scikit_learn-0.24.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:66f27bf21202a850bcd7b6303916e4907f6e22ec59a14974ede4955aed5c7ed0"}, + {file = "scikit_learn-0.24.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:5e6e3c042cea83f2e20a45e563b8eabc1f8f72446251fe23ebefdf111a173a33"}, + {file = "scikit_learn-0.24.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:2a5348585aa793bc8cc5a72f8e9067c9380834b0aadbd55f924843b071f13282"}, + {file = "scikit_learn-0.24.0-cp38-cp38-win32.whl", hash = "sha256:743b6edd98c98991be46c08e6b21df3861d5ae915f91d59f988384d93f7263e7"}, + {file = "scikit_learn-0.24.0-cp38-cp38-win_amd64.whl", hash = "sha256:2951f87d35e72f007701c6e028aa230f6df6212a3194677c0c950486066a454d"}, + {file = "scikit_learn-0.24.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:44e452ea8491225c5783d49577aad0f36202dfd52aec7f82c0fdfe5fbd5f7400"}, + {file = "scikit_learn-0.24.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:800aaf63f8838c00e85db2267dd226f89858594843fd03932a9eda95746d2c40"}, + {file = "scikit_learn-0.24.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:3eeff086f7329521d27249a082ea3c48c085cedb110db5f65968ab55c3ba2e09"}, + {file = 
"scikit_learn-0.24.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:4395e91b3548005f4a645018435b5a94f8cce232b5b70753020e606c6a750656"}, + {file = "scikit_learn-0.24.0-cp39-cp39-win32.whl", hash = "sha256:80ca024154b84b6ac4cfc86930ba13fdc348a209753bf2c16129db6f9eb8a80b"}, + {file = "scikit_learn-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:490436b44b3a1957cb625e871764b0aa330b34cc416aea4abc6c38ca63d0d682"}, +] +scipy = [ + {file = "scipy-1.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3d4303e3e21d07d9557b26a1707bb9fc065510ee8501c9bf22a0157249a82fd0"}, + {file = "scipy-1.6.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:1bc5b446600c4ff7ab36bade47180673141322f0febaa555f1c433fe04f2a0e3"}, + {file = "scipy-1.6.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:8840a9adb4ede3751f49761653d3ebf664f25195fdd42ada394ffea8903dd51d"}, + {file = "scipy-1.6.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:8629135ee00cc2182ac8be8e75643b9f02235942443732c2ed69ab48edcb6614"}, + {file = "scipy-1.6.0-cp37-cp37m-win32.whl", hash = "sha256:58731bbe0103e96b89b2f41516699db9b63066e4317e31b8402891571f6d358f"}, + {file = "scipy-1.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:876badc33eec20709d4e042a09834f5953ebdac4088d45a4f3a1f18b56885718"}, + {file = "scipy-1.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c0911f3180de343643f369dc5cfedad6ba9f939c2d516bddea4a6871eb000722"}, + {file = "scipy-1.6.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:b8af26839ae343655f3ca377a5d5e5466f1d3b3ac7432a43449154fe958ae0e0"}, + {file = "scipy-1.6.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:4f1d9cc977ac6a4a63c124045c1e8bf67ec37098f67c699887a93736961a00ae"}, + {file = "scipy-1.6.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:eb7928275f3560d47e5538e15e9f32b3d64cd30ea8f85f3e82987425476f53f6"}, + {file = "scipy-1.6.0-cp38-cp38-win32.whl", hash = "sha256:31ab217b5c27ab429d07428a76002b33662f98986095bbce5d55e0788f7e8b15"}, + {file = 
"scipy-1.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:2f1c2ebca6fd867160e70102200b1bd07b3b2d31a3e6af3c58d688c15d0d07b7"}, + {file = "scipy-1.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:155225621df90fcd151e25d51c50217e412de717475999ebb76e17e310176981"}, + {file = "scipy-1.6.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:f68d5761a2d2376e2b194c8e9192bbf7c51306ca176f1a0889990a52ef0d551f"}, + {file = "scipy-1.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:d902d3a5ad7f28874c0a82db95246d24ca07ad932741df668595fe00a4819870"}, + {file = "scipy-1.6.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:aef3a2dbc436bbe8f6e0b635f0b5fe5ed024b522eee4637dbbe0b974129ca734"}, + {file = "scipy-1.6.0-cp39-cp39-win32.whl", hash = "sha256:cdbc47628184a0ebeb5c08f1892614e1bd4a51f6e0d609c6eed253823a960f5b"}, + {file = "scipy-1.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:313785c4dab65060f9648112d025f6d2fec69a8a889c714328882d678a95f053"}, + {file = "scipy-1.6.0.tar.gz", hash = "sha256:cb6dc9f82dfd95f6b9032a8d7ea70efeeb15d5b5fd6ed4e8537bb3c673580566"}, +] send2trash = [ {file = "Send2Trash-1.5.0-py3-none-any.whl", hash = "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b"}, {file = "Send2Trash-1.5.0.tar.gz", hash = "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2"}, ] +seqeval = [ + {file = "seqeval-1.2.2.tar.gz", hash = "sha256:f28e97c3ab96d6fcd32b648f6438ff2e09cfba87f05939da9b3970713ec56e6f"}, +] six = [ {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"}, {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"}, @@ -1335,6 +1669,10 @@ testpath = [ {file = "testpath-0.4.4-py2.py3-none-any.whl", hash = "sha256:bfcf9411ef4bf3db7579063e0546938b1edda3d69f4e1fb8756991f5951f85d4"}, {file = "testpath-0.4.4.tar.gz", hash = "sha256:60e0a3261c149755f4399a1fff7d37523179a70fdc3abdf78de9fc2604aeec7e"}, ] 
+threadpoolctl = [ + {file = "threadpoolctl-2.1.0-py3-none-any.whl", hash = "sha256:38b74ca20ff3bb42caca8b00055111d74159ee95c4370882bbff2b93d24da725"}, + {file = "threadpoolctl-2.1.0.tar.gz", hash = "sha256:ddc57c96a38beb63db45d6c159b5ab07b6bced12c45a1f07b2b92f272aebfa6b"}, +] toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, @@ -1382,6 +1720,10 @@ tornado = [ {file = "tornado-6.1-cp39-cp39-win_amd64.whl", hash = "sha256:548430be2740e327b3fe0201abe471f314741efcb0067ec4f2d7dcfb4825f3e4"}, {file = "tornado-6.1.tar.gz", hash = "sha256:33c6e81d7bd55b468d2e793517c909b139960b6c790a60b7991b9b6b76fb9791"}, ] +tqdm = [ + {file = "tqdm-4.49.0-py2.py3-none-any.whl", hash = "sha256:8f3c5815e3b5e20bc40463fa6b42a352178859692a68ffaa469706e6d38342a5"}, + {file = "tqdm-4.49.0.tar.gz", hash = "sha256:faf9c671bd3fad5ebaeee366949d969dca2b2be32c872a7092a1e1a9048d105b"}, +] traitlets = [ {file = "traitlets-5.0.5-py3-none-any.whl", hash = "sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426"}, {file = "traitlets-5.0.5.tar.gz", hash = "sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396"}, @@ -1435,3 +1777,55 @@ webencodings = [ {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, ] +xxhash = [ + {file = "xxhash-2.0.0-cp27-cp27m-macosx_10_6_intel.whl", hash = "sha256:df8d1ebdef86bd5d772d81c91d5d111a5ee8e4b68b8fc6b6edfa5aa825dd2a3d"}, + {file = "xxhash-2.0.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:f01c59f5bad2e46bb4235b71b36c56be353f08b6d514a3bd0deb9bf56e4b180a"}, + {file = "xxhash-2.0.0-cp27-cp27m-manylinux1_x86_64.whl", hash = 
"sha256:cb4feeb8881eb89b9ddd0fae797deb078ebdaad6b1ae6c185b9993d241ed365a"}, + {file = "xxhash-2.0.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:2912d7810bcf7e39b3929fb186fe46ff83b1bd4a3d6b7eba956d57fa1516ac0c"}, + {file = "xxhash-2.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:48b99c55fc643b32f5efca9c35fcaac6ea553958cf503e202c10eb62718e7a0e"}, + {file = "xxhash-2.0.0-cp27-cp27m-win32.whl", hash = "sha256:3221f1a5bc2ee1f150b84a0c4c7cddc7724aaa01460f3353cf63fd667d89f593"}, + {file = "xxhash-2.0.0-cp27-cp27m-win_amd64.whl", hash = "sha256:cba4b6d174b524623ac8b64bda734601d574f95033f87ddf9c495c69a70135e8"}, + {file = "xxhash-2.0.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:b94f13f4f946500f3cc78f11da4ec4b340bd92c5200b5fe4e6aeac96064aa1fd"}, + {file = "xxhash-2.0.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:635b1d7fa85d215112f41d089bd113ac139f6a42769fcc49c73e779904160f7f"}, + {file = "xxhash-2.0.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:0f5f1b9ae8e2cf2ff606018769f7e46147df70291312f64e1b80d10482ca8c0b"}, + {file = "xxhash-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:8f90deec6567a38e1da29feff36973468691e309b2db8235e64936e61df77c43"}, + {file = "xxhash-2.0.0-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:8b7e930a60dfe7380e52466aa27941290dd575a5750c622158c86941797eaa1b"}, + {file = "xxhash-2.0.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:44b26872fd63f1eaf1ab527817aebbd455a3fdcbd56ff6df74fd42a6a137cff4"}, + {file = "xxhash-2.0.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:3d25b540148f1ebf4852e4115f3f4819b585ecd36f121a1f388e8966d69d3a1c"}, + {file = "xxhash-2.0.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:bcd1e9f3ba8df23edefe1d0a886f16b4e27602acbd8575b39540fea26e1aa6d2"}, + {file = "xxhash-2.0.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:fc03a399205268815742125b17d967afa9f23b08cdafe185e41368cf7ba9b278"}, + {file = "xxhash-2.0.0-cp35-cp35m-manylinux2014_aarch64.whl", hash = 
"sha256:bdbc195231c87d63b0503785d9c5264f4275a92da41d9f28fdf08fb321453356"}, + {file = "xxhash-2.0.0-cp35-cp35m-win32.whl", hash = "sha256:7291392bdb1d38c44557dfd3fcd4fd04c363a696dbfa7e6592700a31e4ff6657"}, + {file = "xxhash-2.0.0-cp35-cp35m-win_amd64.whl", hash = "sha256:e0fc170c3a00ca008d992c2e6324da3f1467b30044b5835d2feb27870645d38c"}, + {file = "xxhash-2.0.0-cp36-cp36m-macosx_10_6_intel.whl", hash = "sha256:5b3c0c84187556d463626ceed85f0d735a5b8ea1678da3e858d3934f38f23915"}, + {file = "xxhash-2.0.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:2f0ca6673fcbae988389576a779c00a62a28718a18ddc7b2e5b32d7fb30c6f98"}, + {file = "xxhash-2.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:d1859d54837af16ae2a7975477e619793ac698a374d909f533e317c3b384b223"}, + {file = "xxhash-2.0.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:9d0311fcd78dabe04ab3b4034659628b00ac220e77e37648f73aebbf4cb13680"}, + {file = "xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:0ecea927fd3df8f3f3a1d6e5bc85838eb44a69ea2f4c9263dfd0f68c4e17e483"}, + {file = "xxhash-2.0.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:4167f22b037e128820f7642ecc1fbf1b4b4956346093a2e75081bee82b9cfb7e"}, + {file = "xxhash-2.0.0-cp36-cp36m-win32.whl", hash = "sha256:85c5de6c56335b75beef2cba713f95a1b62422be5e27dad30b5083419c6839c4"}, + {file = "xxhash-2.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:ade1c356acd0b0454a3d3cf42442afe7ad0f46fc944ea1e84720b3858bfdb772"}, + {file = "xxhash-2.0.0-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:fca7d0fb6fde33d1ac5f97298f44e711e5fe1b4587832864be8c6545cb072a54"}, + {file = "xxhash-2.0.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e296b0dee072a54c40c04f09ca35bb9902bb74b54f0fffeafabfc937b3ec85f9"}, + {file = "xxhash-2.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:02476c5cef803cfd1350662b1e543e47ad64bd5f7f792033d94d590f9674da11"}, + {file = "xxhash-2.0.0-cp37-cp37m-manylinux2010_i686.whl", hash = 
"sha256:28c1f0bb6dadc11162d1f2e203d7a12d38b511b87fbb5ffa729594fd456f48e6"}, + {file = "xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:922ae5b1efa1f9a9cc959f7197113a623ad110853622e990433242a9d8d00d5c"}, + {file = "xxhash-2.0.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:7709bc8a5e30c74b07203553f33232531e7739458f72204908cedb08a00bd546"}, + {file = "xxhash-2.0.0-cp37-cp37m-win32.whl", hash = "sha256:fb3c9760598009b1d8bbe57785e278aeb956efb7372d8f9b0bb43cd46f420dff"}, + {file = "xxhash-2.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3f29f6d455388cc415fe52c0f63f442aaea674cee35a2252d8d4dc8d640938c6"}, + {file = "xxhash-2.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bf360465dc3d24b1501b799c85815c82ddcfc0ffbcba0232968f3a7cd64306fc"}, + {file = "xxhash-2.0.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:5d2edbb50025a67f061d09d381c54c7d0948c1572f6c9bd15ee238a303d368d9"}, + {file = "xxhash-2.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7943ede91d8aedfcacb7178b2d881b7498145590206ff61c3e84dc66e6a51d6a"}, + {file = "xxhash-2.0.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:b5c2edb8b0a2acc5bdac984b3177711f206463b970aa03087221771c2b0d8f1d"}, + {file = "xxhash-2.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:16e4b7d508bb49b6fc84bf077f2f7f51263b5618cc61f33a64ed43786ec2c6cf"}, + {file = "xxhash-2.0.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:80903d4ce7337921bbc8e5ac695b45691b43c0a00b21964c76e19ea21b9108ea"}, + {file = "xxhash-2.0.0-cp38-cp38-win32.whl", hash = "sha256:e37b25182e969212d5aec60a8da7d1e6a960dbffdb9ba4c63e2240de3605c184"}, + {file = "xxhash-2.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:fabee25186b6649bbf6ff258f23941339902374786f8317b0422144ddaa505df"}, + {file = "xxhash-2.0.0-pp27-pypy_73-manylinux1_x86_64.whl", hash = "sha256:be93004b832717234a7d2f47dc555428ab1e8712f99cad7d212cebe0e27d3d48"}, + {file = "xxhash-2.0.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = 
"sha256:1b86f49b36c25ebdbd1b5539d428a37d9051ad49eb576a3edd964a8770bc8f3a"}, + {file = "xxhash-2.0.0-pp27-pypy_73-win32.whl", hash = "sha256:bde4d39997de901d0a66ebd631b34f9cf106676fec0878f36b7baf630cb3965a"}, + {file = "xxhash-2.0.0-pp36-pypy36_pp73-manylinux1_x86_64.whl", hash = "sha256:99b5412a3eddb1aa9aaf36cdbf93be4eca99ad83ff8c692672fdeedc7fb597de"}, + {file = "xxhash-2.0.0-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:33c4832e689f429539d70baf69162b41dfbabc7f31ca542b5b772cb8a55e7a79"}, + {file = "xxhash-2.0.0-pp36-pypy36_pp73-win32.whl", hash = "sha256:82034c9ed54db20f051133cba01de959b5208fe2900e67ebb4c9631f1fd523fd"}, + {file = "xxhash-2.0.0.tar.gz", hash = "sha256:58ca818554c1476fa1456f6cd4b87002e2294f09baf0f81e5a2a4968e62c423c"}, +] diff --git a/pyproject.toml b/pyproject.toml index ba1133b..e7408ed 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,8 @@ isort = "^5.6.4" pytest = "^6.1.2" cython = "^0.29.21" jupyterlab = "^2.2.9" +datasets = "^1.2.0" +seqeval = "^1.2.2" [tool.isort] line_length = 88 From 754f6f181c4545b7a70eb32c937fad9280372a76 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 22:41:18 +0100 Subject: [PATCH 16/22] chore: add example --- examples/README.md | 23 +++++++ examples/conll.py | 140 +++++++++++++++++++++++++++++++++++++++++++ examples/training.py | 30 ---------- 3 files changed, 163 insertions(+), 30 deletions(-) create mode 100644 examples/README.md create mode 100644 examples/conll.py delete mode 100644 examples/training.py diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..d19b1e6 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,23 @@ +datasets +seqeval + + +## How it works + + + +``` +["John", "Lennon"] +``` + +becomes + +``` +{"text": "john", "is_capitalized": True} +``` + +becomes + +``` +{"text": "john", "is_capitalized": True, "text+1": "lennon", "is_capitalized+1": True} +``` diff --git a/examples/conll.py b/examples/conll.py new file mode 
100644 index 0000000..6388f74 --- /dev/null +++ b/examples/conll.py @@ -0,0 +1,140 @@ +from typing import Any, Dict, List, Union + +import datasets +from seqeval.metrics import classification_report + +import chaine +from chaine.logging import Logger + +Sentence = List[str] +Tags = List[str] +Features = Dict[str, Union[float, int, str, bool]] +Dataset = Dict[str, Dict[str, Any]] + +LOGGER = Logger(__name__) + + +def featurize_token(token_index: int, sentence: Sentence, pos_tags: Tags) -> Features: + """Extract features from a token in a sentence + + Parameters + ---------- + token_index : int + Index of the token in the sentence + sentence : Sentence + Tokens of the sentence + pos_tags : Tags + POS tag of each token, indexed like the sentence + + Returns + ------- + Features + Feature names mapped to values for the token and its direct neighbors + """ + token = sentence[token_index] + pos_tag = pos_tags[token_index] + + features = { + "bias": 1.0, + "token.lower()": token.lower(), + "token[-3:]": token[-3:], + "token[-2:]": token[-2:], + "token.isupper()": token.isupper(), + "token.istitle()": token.istitle(), + "token.isdigit()": token.isdigit(), + "pos_tag": pos_tag, + } + if token_index > 0: + previous_token = sentence[token_index - 1] + previous_pos_tag = pos_tags[token_index - 1] + features.update( + { + "-1:token.lower()": previous_token.lower(), + "-1:token.istitle()": previous_token.istitle(), + "-1:token.isupper()": previous_token.isupper(), + "-1:pos_tag": previous_pos_tag, + } + ) + else: + features["BOS"] = True + + if token_index < len(sentence) - 1: + next_token = sentence[token_index + 1] + next_pos_tag = pos_tags[token_index + 1] + features.update( + { + "+1:token.lower()": next_token.lower(), + "+1:token.istitle()": next_token.istitle(), + "+1:token.isupper()": next_token.isupper(), + "+1:pos_tag": next_pos_tag, + } + ) + else: + features["EOS"] = True + + return features + + +def featurize_sentence(sentence: List[str], pos_tags: List[str]) -> List[Features]: + """Extract features from tokens in a sentence + + Parameters + ---------- + sentence : Sentence + Tokens of the sentence + pos_tags : Tags + POS tag of each token, indexed like the sentence + + Returns
------- + List[Features] + One feature mapping per token, in sentence order + """ + return [ + featurize_token(token_index, sentence, pos_tags) + for token_index in range(len(sentence)) + ] + + +def featurize_dataset(dataset: Dataset) -> List[List[Features]]: + """Extract features from all sentences in a dataset + + Parameters + ---------- + dataset : Dataset + Dataset providing parallel "tokens" and "pos_tags" entries, one per sentence + + Returns + ------- + List[List[Features]] + One list of token feature mappings per sentence + """ + return [ + featurize_sentence(sentence, pos_tags) + for sentence, pos_tags in zip(dataset["tokens"], dataset["pos_tags"]) + ] + +def preprocess_labels(dataset: Dataset) -> List[List[str]]: + labels = dataset.features["ner_tags"].feature.names + return [[labels[index] for index in indices] for indices in dataset["ner_tags"]] + + +if __name__ == "__main__": + LOGGER.info("Loading raw dataset") + dataset = datasets.load_dataset("conll2003") + + LOGGER.info("Extracting features from dataset for training") + sentences = featurize_dataset(dataset["train"]) + labels = preprocess_labels(dataset["train"]) + + model = chaine.train(sentences, labels) + + LOGGER.info("Extracting features from dataset for evaluation") + sentences = featurize_dataset(dataset["test"]) + labels = preprocess_labels(dataset["test"]) + + LOGGER.info("Evaluating the model") + predictions = model.predict(sentences) + + print("\nEvaluation:") + print(classification_report(labels, predictions)) diff --git a/examples/training.py b/examples/training.py deleted file mode 100644 index e16bf08..0000000 --- a/examples/training.py +++ /dev/null @@ -1,30 +0,0 @@ -import chaine -from chaine.data import Token -from flair.models import SequenceTagger -from flair.data import Sentence -import tqdm -import datasets - -TAGGER = SequenceTagger.load("pos-multi-fast") -DATASET = datasets.load_dataset("germaner") - - -def preprocess(dataset): - for tokens in tqdm.tqdm(dataset): - sentence = Sentence(" ".join(tokens), use_tokenizer=False) - TAGGER.predict(sentence) - pos_tags = [token.get_tag("upos").value for token in sentence] - features = [Token(i,
text).features for i, text in enumerate(tokens)] - for token, pos in zip(features, pos_tags): - token["pos"] = pos - yield features - - - - - -if __name__ == "__main__": - tokens = preprocess(DATASET["train"]["tokens"][:10]) - labels = DATASET["train"]["ner_tags"][:10] - - model = chaine.train(tokens, labels) From e1e111ebb100d581f0c6c4a00c1b0333f2844146 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 22:41:28 +0100 Subject: [PATCH 17/22] chore: update readme --- README.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 53a9176..85bfe6c 100644 --- a/README.md +++ b/README.md @@ -28,17 +28,12 @@ If you are interested in the theoretical concepts behind conditional random fiel [['B-PER', 'I-PER', 'O', 'O', 'O', 'B-LOC']] ``` -Check out the introducing [Jupyter notebook](https://github.com/severinsimmler/chaine/blob/master/notebooks/tutorial.ipynb). - - -## How it works - - +Check out the [examples](https://github.com/severinsimmler/chaine/blob/master/examples). ## Credits -This library makes use of and is partially based on: +This project makes use of and is partially based on: - [CRFsuite](https://github.com/chokkan/crfsuite) - [libLBFGS](https://github.com/chokkan/liblbfgs) From 0b389ff6da734faa4c72fac000b0dbdbdd8484e1 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 22:43:11 +0100 Subject: [PATCH 18/22] chore: update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 85bfe6c..363763b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ Linear-chain conditional random fields for natural language processing. -Chaine is a modern Python library without third-party dependencies and a backend written in C. 
You can train conditional random fields for natural language processing tasks like [named entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) or [part-of-speech tagging](https://en.wikipedia.org/wiki/Part-of-speech_tagging). +Chaine is a modern Python library without third-party dependencies and a backend written in C. You can train conditional random fields for natural language processing tasks like [named entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition). - **Lightweight**: No use of bloated third-party libraries. - **Fast**: Performance critical parts are written in C and thus [blazingly fast](http://www.chokkan.org/software/crfsuite/benchmark.html). From 12459002f8fb167fdc5c9f6299d303b8e0cd7791 Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 22:44:22 +0100 Subject: [PATCH 19/22] chore: update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 363763b..cf466e0 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ You can install the latest stable version from [PyPI](https://pypi.org/project/c $ pip install chaine ``` -If you are interested in the theoretical concepts behind conditional random fields, please refer to the introducing paper by [Lafferty et al](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers). +Please refer to the introducing paper by [Lafferty et al](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers) for the theoretical concepts behind conditional random fields. 
## Minimal working example From fc97158d3f94d690bf1e21e54db5d30cd23d03fd Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 22:44:40 +0100 Subject: [PATCH 20/22] chore: update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cf466e0..31fd141 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ You can install the latest stable version from [PyPI](https://pypi.org/project/c $ pip install chaine ``` -Please refer to the introducing paper by [Lafferty et al](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers) for the theoretical concepts behind conditional random fields. +Please refer to the introducing paper by [Lafferty et al.](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers) for the theoretical concepts behind conditional random fields. ## Minimal working example From 7deff0501a91304f645bd96af32d11ac347d4b6b Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 22:45:23 +0100 Subject: [PATCH 21/22] chore: update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 31fd141..e20fbb7 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ Please refer to the introducing paper by [Lafferty et al.](https://repository.up [['B-PER', 'I-PER', 'O', 'O', 'O', 'B-LOC']] ``` -Check out the [examples](https://github.com/severinsimmler/chaine/blob/master/examples). +Check out the [examples](https://github.com/severinsimmler/chaine/blob/master/examples) for a more real-world use case. 
## Credits From bce8633a69606fce15604d418496381c2ddf021c Mon Sep 17 00:00:00 2001 From: severinsimmler Date: Wed, 6 Jan 2021 22:46:41 +0100 Subject: [PATCH 22/22] fix: docstring --- chaine/api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/chaine/api.py b/chaine/api.py index 4878d4e..e5eb8aa 100644 --- a/chaine/api.py +++ b/chaine/api.py @@ -1,8 +1,8 @@ """ -chaine.training -~~~~~~~~~~~~~~~ +chaine.api +~~~~~~~~~~ -This module implements the high-level API to train a CRF +This module implements the high-level API to train a conditional random field """ from chaine.crf import Model, Trainer