From e16e5fb69cca36e60a5633aa7f115911e4b00b59 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Mon, 4 Jan 2021 22:55:15 +0100
Subject: [PATCH 01/22] chore: improved docstrings

---
 chaine/crf.pyx     |  40 ---------------
 chaine/training.py | 124 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 121 insertions(+), 43 deletions(-)

diff --git a/chaine/crf.pyx b/chaine/crf.pyx
index f066201..46c9565 100644
--- a/chaine/crf.pyx
+++ b/chaine/crf.pyx
@@ -35,40 +35,29 @@ cdef class Trainer:
     ------------------------------
     min_freq : float, optional (default=0)
         Threshold value for minimum frequency of a feature occurring in training data
-
     all_possible_states : bool, optional (default=False)
         Generate state features that do not even occur in the training data
-
     all_possible_transitions : bool, optional (default=False)
         Generate transition features that do not even occur in the training data
-
     max_iterations : int, optional (default=None)
         Maximum number of iterations (unlimited by default)
-
     num_memories : int, optional (default=6)
         Number of limited memories for approximating the inverse hessian matrix
-
     c1 : float, optional (default=0)
         Coefficient for L1 regularization
-
     c2 : float, optional (default=1.0)
         Coefficient for L2 regularization
-
     epsilon : float, optional (default=1e-5)
         Parameter that determines the condition of convergence
-
     period : int, optional (default=10)
         Threshold value for iterations to test the stopping criterion
-
     delta : float, optional (default=1e-5)
         Top iteration when log likelihood is not greater than this
-
     linesearch : str, optional (default="MoreThuente")
         Line search algorithm used in updates:
             * MoreThuente: More and Thuente's method
             * Backtracking: Backtracking method with regular Wolfe condition
             * StrongBacktracking: Backtracking method with strong Wolfe condition
-
     max_linesearch : int, optional (default=20)
         Maximum number of trials for the line search algorithm
 
@@ -76,37 +65,26 @@ cdef class Trainer:
     ----------------------
     min_freq : float, optional (default=0)
         Threshold value for minimum frequency of a feature occurring in training data
-
     all_possible_states : bool, optional (default=False)
         Generate state features that do not even occur in the training data
-
     all_possible_transitions : bool, optional (default=False)
         Generate transition features that do not even occur in the training data
-
     max_iterations : int, optional (default=None)
         Maximum number of iterations (1000 by default)
-
     c2 : float, optional (default=1.0)
         Coefficient for L2 regularization
-
     period : int, optional (default=10)
         Threshold value for iterations to test the stopping criterion
-
     delta : float, optional (default=1e-5)
         Top iteration when log likelihood is not greater than this
-
     calibration_eta : float, optional (default=0.1)
         Initial value of learning rate (eta) used for calibration
-
     calibration_rate : float, optional (default=2.0)
         Rate of increase/decrease of learning rate for calibration
-
     calibration_samples : int, optional (default=1000)
         Number of instances used for calibration
-
     calibration_candidates : int, optional (default=10)
         Number of candidates of learning rate
-
     calibration_max_trials : int, optional (default=20)
         Maximum number of trials of learning rates for calibration
 
@@ -114,16 +92,12 @@ cdef class Trainer:
     ------------------------------
     min_freq : float, optional (default=0)
         Threshold value for minimum frequency of a feature occurring in training data
-
     all_possible_states : bool, optional (default=False)
         Generate state features that do not even occur in the training data
-
     all_possible_transitions : bool, optional (default=False)
         Generate transition features that do not even occur in the training data
-
     max_iterations : int, optional (default=None)
         Maximum number of iterations (100 by default)
-
     epsilon : float, optional (default=1e-5)
         Parameter that determines the condition of convergence
 
@@ -131,31 +105,23 @@ cdef class Trainer:
     -----------------------------
     min_freq : float, optional (default=0)
         Threshold value for minimum frequency of a feature occurring in training data
-
     all_possible_states : bool, optional (default=False)
         Generate state features that do not even occur in the training data
-
     all_possible_transitions : bool, optional (default=False)
         Generate transition features that do not even occur in the training data
-
     max_iterations : int, optional (default=None)
         Maximum number of iterations (100 by default)
-
     epsilon : float, optional (default=1e-5)
         Parameter that determines the condition of convergence
-
     pa_type : int, optional (default=1)
         Strategy for updating feature weights:
             * 0: PA without slack variables
             * 1: PA type I
             * 2: PA type II
-
     c : float, optional (default=1)
         Aggressiveness parameter (used only for PA-I and PA-II)
-
     error_sensitive : bool, optional (default=True)
         Include square root of predicted incorrect labels into optimization routine
-
     averaging : bool, optional (default=True)
         Compute average of feature weights at all updates
 
@@ -163,22 +129,16 @@ cdef class Trainer:
     ----------------------------------------------------
     min_freq : float, optional (default=0)
         Threshold value for minimum frequency of a feature occurring in training data
-
     all_possible_states : bool, optional (default=False)
         Generate state features that do not even occur in the training data
-
     all_possible_transitions : bool, optional (default=False)
         Generate transition features that do not even occur in the training data
-
     max_iterations : int, optional (default=None)
         Maximum number of iterations (100 by default)
-
     epsilon : float, optional (default=1e-5)
         Parameter that determines the condition of convergence
-
     variance : float, optional (default=1)
         Initial variance of every feature weight
-
     gamma : float, optional (default=1)
         Trade-off between loss function and changes of feature weights
     """
diff --git a/chaine/training.py b/chaine/training.py
index 16a2298..f95290a 100644
--- a/chaine/training.py
+++ b/chaine/training.py
@@ -18,13 +18,131 @@ def train(dataset: Dataset, labels: Labels, **kwargs) -> Model:
         Dataset consisting of sequences of features
     labels : Labels
         Labels corresponding to each instance in the dataset
+    algorithm : str, optional (default="l2sgd")
+        The following algorithms are available:
+            * lbfgs: Limited-memory BFGS with L1/L2 regularization
+            * l2sgd: Stochastic gradient descent with L2 regularization
+            * ap: Averaged perceptron
+            * pa: Passive aggressive
+            * arow: Adaptive regularization of weights
+
+    Limited-memory BFGS Parameters
+    ------------------------------
+    min_freq : float, optional (default=0)
+        Threshold value for minimum frequency of a feature occurring in training data
+    all_possible_states : bool, optional (default=False)
+        Generate state features that do not even occur in the training data
+    all_possible_transitions : bool, optional (default=False)
+        Generate transition features that do not even occur in the training data
+    max_iterations : int, optional (default=None)
+        Maximum number of iterations (unlimited by default)
+    num_memories : int, optional (default=6)
+        Number of limited memories for approximating the inverse Hessian matrix
+    c1 : float, optional (default=0)
+        Coefficient for L1 regularization
+    c2 : float, optional (default=1.0)
+        Coefficient for L2 regularization
+    epsilon : float, optional (default=1e-5)
+        Parameter that determines the condition of convergence
+    period : int, optional (default=10)
+        Threshold value for iterations to test the stopping criterion
+    delta : float, optional (default=1e-5)
+        Stop iterating when the improvement of the log likelihood is not greater than this
+    linesearch : str, optional (default="MoreThuente")
+        Line search algorithm used in updates:
+            * MoreThuente: More and Thuente's method
+            * Backtracking: Backtracking method with regular Wolfe condition
+            * StrongBacktracking: Backtracking method with strong Wolfe condition
+    max_linesearch : int, optional (default=20)
+        Maximum number of trials for the line search algorithm
+
+    SGD with L2 Parameters
+    ----------------------
+    min_freq : float, optional (default=0)
+        Threshold value for minimum frequency of a feature occurring in training data
+    all_possible_states : bool, optional (default=False)
+        Generate state features that do not even occur in the training data
+    all_possible_transitions : bool, optional (default=False)
+        Generate transition features that do not even occur in the training data
+    max_iterations : int, optional (default=None)
+        Maximum number of iterations (1000 by default)
+    c2 : float, optional (default=1.0)
+        Coefficient for L2 regularization
+    period : int, optional (default=10)
+        Threshold value for iterations to test the stopping criterion
+    delta : float, optional (default=1e-5)
+        Stop iterating when the improvement of the log likelihood is not greater than this
+    calibration_eta : float, optional (default=0.1)
+        Initial value of learning rate (eta) used for calibration
+    calibration_rate : float, optional (default=2.0)
+        Rate of increase/decrease of learning rate for calibration
+    calibration_samples : int, optional (default=1000)
+        Number of instances used for calibration
+    calibration_candidates : int, optional (default=10)
+        Number of candidates of learning rate
+    calibration_max_trials : int, optional (default=20)
+        Maximum number of trials of learning rates for calibration
+
+    Averaged Perceptron Parameters
+    ------------------------------
+    min_freq : float, optional (default=0)
+        Threshold value for minimum frequency of a feature occurring in training data
+    all_possible_states : bool, optional (default=False)
+        Generate state features that do not even occur in the training data
+    all_possible_transitions : bool, optional (default=False)
+        Generate transition features that do not even occur in the training data
+    max_iterations : int, optional (default=None)
+        Maximum number of iterations (100 by default)
+    epsilon : float, optional (default=1e-5)
+        Parameter that determines the condition of convergence
+
+    Passive Aggressive Parameters
+    -----------------------------
+    min_freq : float, optional (default=0)
+        Threshold value for minimum frequency of a feature occurring in training data
+    all_possible_states : bool, optional (default=False)
+        Generate state features that do not even occur in the training data
+    all_possible_transitions : bool, optional (default=False)
+        Generate transition features that do not even occur in the training data
+    max_iterations : int, optional (default=None)
+        Maximum number of iterations (100 by default)
+    epsilon : float, optional (default=1e-5)
+        Parameter that determines the condition of convergence
+    pa_type : int, optional (default=1)
+        Strategy for updating feature weights:
+            * 0: PA without slack variables
+            * 1: PA type I
+            * 2: PA type II
+    c : float, optional (default=1)
+        Aggressiveness parameter (used only for PA-I and PA-II)
+    error_sensitive : bool, optional (default=True)
+        Add square root of the number of mispredicted labels to the objective function
+    averaging : bool, optional (default=True)
+        Compute average of feature weights at all updates
+
+    Adaptive Regularization of Weights (AROW) Parameters
+    ----------------------------------------------------
+    min_freq : float, optional (default=0)
+        Threshold value for minimum frequency of a feature occurring in training data
+    all_possible_states : bool, optional (default=False)
+        Generate state features that do not even occur in the training data
+    all_possible_transitions : bool, optional (default=False)
+        Generate transition features that do not even occur in the training data
+    max_iterations : int, optional (default=None)
+        Maximum number of iterations (100 by default)
+    epsilon : float, optional (default=1e-5)
+        Parameter that determines the condition of convergence
+    variance : float, optional (default=1)
+        Initial variance of every feature weight
+    gamma : float, optional (default=1)
+        Trade-off between loss function and changes of feature weights
 
     Returns
     -------
-    CRF
-        A conditional random field fitted on the dataset
+    Model
+        A conditional random field trained on the dataset
     """
-    # start training
+    # initialize trainer and start training
     trainer = Trainer(**kwargs)
     trainer.train(dataset, labels, "model.crf")
 

From 9f433078b93f26006c622ff1c2c8e2cc50c6f438 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Mon, 4 Jan 2021 22:56:49 +0100
Subject: [PATCH 02/22] fix: typos

---
 chaine/training.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/chaine/training.py b/chaine/training.py
index f95290a..c6295e5 100644
--- a/chaine/training.py
+++ b/chaine/training.py
@@ -1,8 +1,8 @@
 """
-chaine.core
-~~~~~~~~~~~
+chaine.training
+~~~~~~~~~~~~~~~
 
-This module implements the high-level API
+This module implements the high-level API to train a CRF
 """
 
 from chaine.crf import Model, Trainer

From 20fab7132e122a3613dcd9085d151c231ef5c963 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Tue, 5 Jan 2021 01:23:31 +0100
Subject: [PATCH 03/22] fix: typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 5b390e3..7610d28 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ If you are interested in the theoretical concepts behind conditional random fiel
 
 ```python
 >>> import chaine
->>> tokens = [["John", "Lennon", "was", "born", "in" "Liverpool"]]
+>>> tokens = [["John", "Lennon", "was", "born", "in", "Liverpool"]]
 >>> labels = [["B-PER", "I-PER", "O", "O", "O", "B-LOC"]]
 >>> model = chaine.train(tokens, labels, max_iterations=5)
 >>> model.predict(tokens)

From d4d73bb6dc496b9f01791f05c17cc3c4292b5684 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Tue, 5 Jan 2021 01:23:44 +0100
Subject: [PATCH 04/22] chore: update description

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bce7743..e7487e9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "chaine"
-version = "0.2.2"
-description = "A Lightweight Conditional Random Field"
+version = "0.3.0"
+description = "Linear-chain conditional random fields for natural language processing"
 authors = ["Severin Simmler <severin.simmler@posteo.de>"]
 readme = "README.md"
 build = "build.py"

From c349974f66638abc6e2721e4d684b1eaabdc8158 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Tue, 5 Jan 2021 01:23:57 +0100
Subject: [PATCH 05/22] chore: update tests

---
 tests/test_logging.py | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/tests/test_logging.py b/tests/test_logging.py
index a3a6345..bf18ef8 100644
--- a/tests/test_logging.py
+++ b/tests/test_logging.py
@@ -13,37 +13,3 @@ def test_logger():
     assert logger.log_level == logger.DEBUG
     logger.log_level = logger.WARNING
     assert logger.log_level == logger.WARNING
-
-
-def test_log_message():
-    message = logging.LogMessage()
-
-    assert message.iteration is None
-    assert message.loss is None
-
-    message.iteration = "1"
-    message.loss = "1000.0"
-    assert message.iteration == "1"
-    assert message.loss == "1000.0"
-    assert str(message) == "Iteration: 1\tLoss: 1000.0"
-
-
-def test_log_parser():
-    parser = logging.LogParser()
-
-    assert isinstance(parser.message, logging.LogMessage)
-
-    text = parser.parse("Irrelevant message")
-    assert text is None
-    assert parser.message.iteration is None
-    assert parser.message.loss is None
-
-    text = parser.parse("***** Iteration #1 *****\n")
-    assert text is None
-    assert parser.message.iteration == "1"
-    assert parser.message.loss is None
-
-    text = parser.parse("Loss: 1000.0")
-    assert text == "Iteration: 1\tLoss: 1000.0"
-    assert parser.message.iteration is None
-    assert parser.message.loss is None

From 8abe488f77343f6efa01ed1f3af32643eb8f393b Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Tue, 5 Jan 2021 01:24:22 +0100
Subject: [PATCH 06/22] fix: #22

---
 chaine/crf.pyx                                | 17 ++---
 chaine/crfsuite/lib/crf/src/crf1d_encode.c    | 21 +------
 chaine/crfsuite/lib/crf/src/crf1d_feature.c   |  2 -
 chaine/crfsuite/lib/crf/src/crfsuite.c        | 19 ------
 chaine/crfsuite/lib/crf/src/crfsuite_train.c  |  2 -
 chaine/crfsuite/lib/crf/src/logging.c         | 13 +---
 chaine/crfsuite/lib/crf/src/train_arow.c      | 22 +------
 .../lib/crf/src/train_averaged_perceptron.c   | 18 +-----
 chaine/crfsuite/lib/crf/src/train_l2sgd.c     | 62 ++++---------------
 chaine/crfsuite/lib/crf/src/train_lbfgs.c     | 37 ++---------
 .../lib/crf/src/train_passive_aggressive.c    | 22 +------
 chaine/logging.py                             | 54 ----------------
 12 files changed, 36 insertions(+), 253 deletions(-)

diff --git a/chaine/crf.pyx b/chaine/crf.pyx
index 46c9565..131a28e 100644
--- a/chaine/crf.pyx
+++ b/chaine/crf.pyx
@@ -7,7 +7,7 @@ cimport crfsuite_api
 from libcpp.string cimport string
 import os
 
-from chaine.logging import Logger, LogParser
+from chaine.logging import Logger
 from chaine.typing import Dataset, Dict, Iterable, Labels, List, Path, Sequence
 
 LOGGER = Logger(__name__)
@@ -202,7 +202,6 @@ cdef class Trainer:
             "variance": float,
             "gamma": float,
         }
-    _log_parser = LogParser()
 
     def __init__(self, algorithm="l2sgd", **kwargs):
         self._select_algorithm(algorithm)
@@ -246,14 +245,13 @@ cdef class Trainer:
         features. One item consists only of the relevant features. Internally, the
         string features are hash-mapped and a sparse matrix is constructed.
         """
-        LOGGER.info("Loading data")
+        LOGGER.info("Loading training data (this may take a while)")
         for i, (sequence, labels_) in enumerate(zip(dataset, labels)):
-            # log progress every 10000 data points
-            if i > 0 and i % 10000 == 0:
-                LOGGER.info(f"Processed sequences: {i}")
+            # log progress every 100 data points
+            if i > 0 and i % 100 == 0:
+                LOGGER.debug(f"{i} processed data points")
             self._append(sequence, labels_)
 
-        LOGGER.info("Start training")
         status_code = self._c_trainer.train(str(model_filepath), -1)
         if status_code != crfsuite_api.CRFSUITE_SUCCESS:
             LOGGER.error(f"An error ({status_code}) occured")
@@ -270,9 +268,7 @@ cdef class Trainer:
         self._message(message)
 
     def _message(self, message):
-        event = self._log_parser.parse(message)
-        if event:
-            LOGGER.info(event)
+        LOGGER.info(message)
 
     def _append(self, sequence, labels, int group=0):
         # no generators allowed
@@ -400,7 +396,6 @@ cdef class Model:
         """
         return [self.predict_proba_single(sequence) for sequence in sequences]
 
-
     def _load(self, filepath):
         filepath = str(filepath)
         self._check_model(filepath)
diff --git a/chaine/crfsuite/lib/crf/src/crf1d_encode.c b/chaine/crfsuite/lib/crf/src/crf1d_encode.c
index 3c90256..aa4cc41 100644
--- a/chaine/crfsuite/lib/crf/src/crf1d_encode.c
+++ b/chaine/crfsuite/lib/crf/src/crf1d_encode.c
@@ -477,11 +477,7 @@ crf1de_set_data(
     }
 
     /* Feature generation. */
-    logging(lg, "Feature generation\n");
-    logging(lg, "type: CRF1d\n");
-    logging(lg, "feature.minfreq: %f\n", opt->feature_minfreq);
-    logging(lg, "feature.possible_states: %d\n", opt->feature_possible_states);
-    logging(lg, "feature.possible_transitions: %d\n", opt->feature_possible_transitions);
+    logging(lg, "Processing training data");
     begin = clock();
     crf1de->features = crf1df_generate(
         &crf1de->num_features,
@@ -498,9 +494,6 @@ crf1de_set_data(
         ret = CRFSUITEERR_OUTOFMEMORY;
         goto error_exit;
     }
-    logging(lg, "Number of features: %d\n", crf1de->num_features);
-    logging(lg, "Seconds required: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC);
-    logging(lg, "\n");
 
     /* Initialize the feature references. */
     crf1df_init_references(
@@ -544,7 +537,7 @@ crf1de_save_model(
     int J = 0, B = 0;
 
     /* Start storing the model. */
-    logging(lg, "Storing the model\n");
+    logging(lg, "Saving model");
     begin = clock();
 
     /* Allocate and initialize the feature mapping. */
@@ -641,12 +634,7 @@ crf1de_save_model(
         goto error_exit;
     }
 
-    logging(lg, "Number of active features: %d (%d)\n", J, K);
-    logging(lg, "Number of active attributes: %d (%d)\n", B, A);
-    logging(lg, "Number of active labels: %d (%d)\n", L, L);
-
     /* Write labels. */
-    logging(lg, "Writing labels\n", L);
     if (ret = crf1dmw_open_labels(writer, L))
     {
         goto error_exit;
@@ -670,7 +658,6 @@ crf1de_save_model(
     }
 
     /* Write attributes. */
-    logging(lg, "Writing attributes\n");
     if (ret = crf1dmw_open_attrs(writer, B))
     {
         goto error_exit;
@@ -697,7 +684,6 @@ crf1de_save_model(
     }
 
     /* Write label feature references. */
-    logging(lg, "Writing feature references for transitions\n");
     if (ret = crf1dmw_open_labelrefs(writer, L + 2))
     {
         goto error_exit;
@@ -716,7 +702,6 @@ crf1de_save_model(
     }
 
     /* Write attribute feature references. */
-    logging(lg, "Writing feature references for attributes\n");
     if (ret = crf1dmw_open_attrrefs(writer, B))
     {
         goto error_exit;
@@ -739,8 +724,6 @@ crf1de_save_model(
 
     /* Close the writer. */
     crf1dmw_close(writer);
-    logging(lg, "Seconds required: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC);
-    logging(lg, "\n");
 
     free(amap);
     free(fmap);
diff --git a/chaine/crfsuite/lib/crf/src/crf1d_feature.c b/chaine/crfsuite/lib/crf/src/crf1d_feature.c
index 010ea63..fc25f4f 100644
--- a/chaine/crfsuite/lib/crf/src/crf1d_feature.c
+++ b/chaine/crfsuite/lib/crf/src/crf1d_feature.c
@@ -190,8 +190,6 @@ crf1df_feature_t *crf1df_generate(
     set = featureset_new();
 
     /* Loop over the sequences in the training data. */
-    logging_progress_start(&lg);
-
     for (s = 0; s < N; ++s)
     {
         int prev = L, cur = 0;
diff --git a/chaine/crfsuite/lib/crf/src/crfsuite.c b/chaine/crfsuite/lib/crf/src/crfsuite.c
index 474d35c..d6f6f01 100644
--- a/chaine/crfsuite/lib/crf/src/crfsuite.c
+++ b/chaine/crfsuite/lib/crf/src/crfsuite.c
@@ -477,8 +477,6 @@ void crfsuite_evaluation_output(crfsuite_evaluation_t *eval, crfsuite_dictionary
     lg.func = cbm;
     lg.instance = instance;
 
-    logging(&lg, "Performance by label (#match, #model, #ref) (precision, recall, F1):\n");
-
     for (i = 0; i < eval->num_labels; ++i)
     {
         const crfsuite_label_evaluation_t *lev = &eval->tbl[i];
@@ -487,25 +485,8 @@ void crfsuite_evaluation_output(crfsuite_evaluation_t *eval, crfsuite_dictionary
         if (lstr == NULL)
             lstr = "[UNKNOWN]";
 
-        if (lev->num_observation == 0)
-        {
-            logging(&lg, "    %s: (%d, %d, %d) (******, ******, ******)\n",
-                    lstr, lev->num_correct, lev->num_model, lev->num_observation);
-        }
-        else
-        {
-            logging(&lg, "    %s: (%d, %d, %d) (%1.4f, %1.4f, %1.4f)\n",
-                    lstr, lev->num_correct, lev->num_model, lev->num_observation,
-                    lev->precision, lev->recall, lev->fmeasure);
-        }
         labels->free(labels, lstr);
     }
-    logging(&lg, "Macro-average precision, recall, F1: (%f, %f, %f)\n",
-            eval->macro_precision, eval->macro_recall, eval->macro_fmeasure);
-    logging(&lg, "Item accuracy: %d / %d (%1.4f)\n",
-            eval->item_total_correct, eval->item_total_num, eval->item_accuracy);
-    logging(&lg, "Instance accuracy: %d / %d (%1.4f)\n",
-            eval->inst_total_correct, eval->inst_total_num, eval->inst_accuracy);
 }
 
 int crfsuite_interlocked_increment(int *count)
diff --git a/chaine/crfsuite/lib/crf/src/crfsuite_train.c b/chaine/crfsuite/lib/crf/src/crfsuite_train.c
index 7a6e200..3682192 100644
--- a/chaine/crfsuite/lib/crf/src/crfsuite_train.c
+++ b/chaine/crfsuite/lib/crf/src/crfsuite_train.c
@@ -150,8 +150,6 @@ static int crfsuite_train_train(
     if (0 <= holdout)
     {
         dataset_init_testset(&testset, (crfsuite_data_t *)data, holdout);
-        logging(lg, "Holdout group: %d\n", holdout + 1);
-        logging(lg, "\n");
     }
 
     /* Set the training set to the CRF, and generate features. */
diff --git a/chaine/crfsuite/lib/crf/src/logging.c b/chaine/crfsuite/lib/crf/src/logging.c
index 3e17ffa..75064e2 100644
--- a/chaine/crfsuite/lib/crf/src/logging.c
+++ b/chaine/crfsuite/lib/crf/src/logging.c
@@ -64,12 +64,6 @@ void logging_timestamp(logging_t *lg, const char *format)
     logging(lg, format, timestamp);
 }
 
-void logging_progress_start(logging_t *lg)
-{
-    lg->percent = 0;
-    logging(lg, "0");
-}
-
 void logging_progress(logging_t *lg, int percent)
 {
     while (lg->percent < percent)
@@ -79,11 +73,7 @@ void logging_progress(logging_t *lg, int percent)
         {
             if (lg->percent % 10 == 0)
             {
-                logging(lg, "%d", lg->percent / 10);
-            }
-            else
-            {
-                logging(lg, ".");
+                logging(lg, "Processed %d%% of the training data", lg->percent);
             }
         }
     }
@@ -92,5 +82,4 @@ void logging_progress(logging_t *lg, int percent)
 void logging_progress_end(logging_t *lg)
 {
     logging_progress(lg, 100);
-    logging(lg, "\n");
 }
diff --git a/chaine/crfsuite/lib/crf/src/train_arow.c b/chaine/crfsuite/lib/crf/src/train_arow.c
index 4265017..cb30bb7 100644
--- a/chaine/crfsuite/lib/crf/src/train_arow.c
+++ b/chaine/crfsuite/lib/crf/src/train_arow.c
@@ -282,14 +282,7 @@ int crfsuite_train_arow(
     /* Initialize the covariance vector (diagnal matrix). */
     vecset(cov, opt.variance, K);
 
-    /* Show the parameters. */
-    logging(lg, "Adaptive Regularization of Weights (AROW)\n");
-    logging(lg, "variance: %f\n", opt.variance);
-    logging(lg, "gamma: %f\n", opt.gamma);
-    logging(lg, "max_iterations: %d\n", opt.max_iterations);
-    logging(lg, "epsilon: %f\n", opt.epsilon);
-    logging(lg, "\n");
-
+    logging(lg, "Start training with AROW");
     beta = 1.0 / opt.gamma;
 
     /* Loop for epoch. */
@@ -381,10 +374,7 @@ int crfsuite_train_arow(
         }
 
         /* Output the progress. */
-        logging(lg, "***** Iteration #%d *****\n", i + 1);
-        logging(lg, "Loss: %f\n", sum_loss);
-        logging(lg, "Feature norm: %f\n", sqrt(vecdot(mean, mean, K)));
-        logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - iteration_begin) / (double)CLOCKS_PER_SEC);
+        logging(lg, "Iteration %d, training loss: %f", i + 1, sum_loss);
 
         /* Holdout evaluation if necessary. */
         if (testset != NULL)
@@ -392,20 +382,14 @@ int crfsuite_train_arow(
             holdout_evaluation(gm, testset, mean, lg);
         }
 
-        logging(lg, "\n");
-
         /* Convergence test. */
         if (sum_loss / N <= opt.epsilon)
         {
-            logging(lg, "Terminated with the stopping criterion\n");
-            logging(lg, "\n");
+            logging(lg, "Loss has converged, terminating training");
             break;
         }
     }
 
-    logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC);
-    logging(lg, "\n");
-
     free(viterbi);
     free(prod);
     free(cov);
diff --git a/chaine/crfsuite/lib/crf/src/train_averaged_perceptron.c b/chaine/crfsuite/lib/crf/src/train_averaged_perceptron.c
index ae014e5..9947a8d 100644
--- a/chaine/crfsuite/lib/crf/src/train_averaged_perceptron.c
+++ b/chaine/crfsuite/lib/crf/src/train_averaged_perceptron.c
@@ -143,10 +143,7 @@ int crfsuite_train_averaged_perceptron(
     }
 
     /* Show the parameters. */
-    logging(lg, "Averaged perceptron\n");
-    logging(lg, "max_iterations: %d\n", opt.max_iterations);
-    logging(lg, "epsilon: %f\n", opt.epsilon);
-    logging(lg, "\n");
+    logging(lg, "Start training with AP");
 
     c = 1;
     ud.w = w;
@@ -207,10 +204,7 @@ int crfsuite_train_averaged_perceptron(
         vecasub(wa, 1. / c, ws, K);
 
         /* Output the progress. */
-        logging(lg, "***** Iteration #%d *****\n", i + 1);
-        logging(lg, "Loss: %f\n", loss);
-        logging(lg, "Feature norm: %f\n", sqrt(vecdot(wa, wa, K)));
-        logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - iteration_begin) / (double)CLOCKS_PER_SEC);
+        logging(lg, "Iteration %d, training loss: %f", i + 1, loss);
 
         /* Holdout evaluation if necessary. */
         if (testset != NULL)
@@ -218,20 +212,14 @@ int crfsuite_train_averaged_perceptron(
             holdout_evaluation(gm, testset, wa, lg);
         }
 
-        logging(lg, "\n");
-
         /* Convergence test. */
         if (loss / N < opt.epsilon)
         {
-            logging(lg, "Terminated with the stopping criterion\n");
-            logging(lg, "\n");
+            logging(lg, "Loss has converged, terminating training");
             break;
         }
     }
 
-    logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC);
-    logging(lg, "\n");
-
     free(viterbi);
     free(ws);
     free(w);
diff --git a/chaine/crfsuite/lib/crf/src/train_l2sgd.c b/chaine/crfsuite/lib/crf/src/train_l2sgd.c
index 1ee6006..7fd8f78 100644
--- a/chaine/crfsuite/lib/crf/src/train_l2sgd.c
+++ b/chaine/crfsuite/lib/crf/src/train_l2sgd.c
@@ -44,7 +44,7 @@
     written by Léon Bottou.
 
     The objective function to minimize is:
-        
+
         f(w) = (lambda/2) * ||w||^2 + (1/N) * \sum_i^N log P^i(y|x)
         lambda = 2 * C / N
 
@@ -172,7 +172,6 @@ static int l2sgd(
 
         if (!calibration)
         {
-            logging(lg, "***** Epoch #%d *****\n", epoch);
             /* Shuffle the training instances. */
             dataset_shuffle(trainset);
         }
@@ -200,7 +199,7 @@ static int l2sgd(
         /* Terminate when the loss is abnormal (NaN, -Inf, +Inf). */
         if (!isfinite(loss))
         {
-            logging(lg, "ERROR: overflow loss\n");
+            logging(lg, "Loss is abnormal");
             ret = CRFSUITEERR_OVERFLOW;
             sum_loss = loss;
             goto error_exit;
@@ -239,22 +238,11 @@ static int l2sgd(
             /* Store the current value of the objective function. */
             pf[(epoch - 1) % period] = sum_loss;
 
-            logging(lg, "Loss: %f\n", sum_loss);
+            logging(lg, "Epoch %d, learning rate: %f, training loss: %f", epoch, eta, sum_loss);
             if (period < epoch)
             {
-                logging(lg, "Improvement ratio: %f\n", improvement);
-            }
-            logging(lg, "Feature L2-norm: %f\n", sqrt(norm2));
-            logging(lg, "Learning rate (eta): %f\n", eta);
-            logging(lg, "Total number of feature updates: %.0f\n", t);
-            logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - clk_prev) / (double)CLOCKS_PER_SEC);
-
-            /* Holdout evaluation if necessary. */
-            if (testset != NULL)
-            {
-                holdout_evaluation(gm, testset, w, lg);
+                logging(lg, "Improvement ratio: %f", improvement);
             }
-            logging(lg, "\n");
 
             /* Check for the stopping criterion. */
             if (improvement < epsilon)
@@ -272,17 +260,13 @@ static int l2sgd(
         {
             if (epoch < num_epochs)
             {
-                logging(lg, "SGD terminated with the stopping criteria\n");
+                logging(lg, "Loss has converged, terminating training");
             }
             else
             {
-                logging(lg, "SGD terminated with the maximum number of iterations\n");
+                logging(lg, "Reached maximum number of iterations, terminating training");
             }
         }
-        else
-        {
-            logging(lg, "SGD terminated with error code (%d)\n", ret);
-        }
     }
 
     /* Restore the best weights. */
@@ -326,12 +310,7 @@ l2sgd_calibration(
     const floatval_t rate = opt->calibration_rate;
     const floatval_t lambda = opt->lambda;
 
-    logging(lg, "Calibrating the learning rate (eta)\n");
-    logging(lg, "calibration.eta: %f\n", eta);
-    logging(lg, "calibration.rate: %f\n", rate);
-    logging(lg, "calibration.samples: %d\n", S);
-    logging(lg, "calibration.candidates: %d\n", num);
-    logging(lg, "calibration.max_trials: %d\n", opt->calibration_max_trials);
+    logging(lg, "Calibrating learning rate");
 
     /* Initialize a permutation that shuffles the instances. */
     dataset_shuffle(ds);
@@ -353,12 +332,10 @@ l2sgd_calibration(
         init_loss += score;
     }
     init_loss += 0.5 * lambda * vecdot(w, w, K) * N;
-    logging(lg, "Initial loss: %f\n", init_loss);
+    logging(lg, "Initial training loss: %f", init_loss);
 
     while (num > 0 || !dec)
     {
-        logging(lg, "Trial #%d (eta = %f): ", trials, eta);
-
         /* Perform SGD for one epoch. */
         l2sgd(
             gm,
@@ -370,15 +347,13 @@ l2sgd_calibration(
 
         /* Make sure that the learning rate decreases the log-likelihood. */
         ok = isfinite(loss) && (loss < init_loss);
+
+        logging(lg, "Trial %d, learning rate %f, training loss: %f", trials, eta, loss);
+
         if (ok)
         {
-            logging(lg, "%f\n", loss);
             --num;
         }
-        else
-        {
-            logging(lg, "%f (worse)\n", loss);
-        }
 
         if (isfinite(loss) && loss < best_loss)
         {
@@ -412,9 +387,7 @@ l2sgd_calibration(
     }
 
     eta = best_eta;
-    logging(lg, "Best learning rate (eta): %f\n", eta);
-    logging(lg, "Seconds required: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
-    logging(lg, "\n");
+    logging(lg, "Best learning rate: %f", eta);
 
     return 1.0 / (lambda * eta);
 }
@@ -491,12 +464,7 @@ int crfsuite_train_l2sgd(
 
     opt.lambda = 2. * opt.c2 / N;
 
-    logging(lg, "Stochastic Gradient Descent (SGD)\n");
-    logging(lg, "c2: %f\n", opt.c2);
-    logging(lg, "max_iterations: %d\n", opt.max_iterations);
-    logging(lg, "period: %d\n", opt.period);
-    logging(lg, "delta: %f\n", opt.delta);
-    logging(lg, "\n");
+    logging(lg, "Start training with SGD");
     clk_begin = clock();
 
     /* Calibrate the training rate (eta). */
@@ -518,10 +486,6 @@ int crfsuite_train_l2sgd(
         opt.delta,
         &loss);
 
-    logging(lg, "Loss: %f\n", loss);
-    logging(lg, "Total seconds required for training: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
-    logging(lg, "\n");
-
     *ptr_w = w;
     return ret;
 
diff --git a/chaine/crfsuite/lib/crf/src/train_lbfgs.c b/chaine/crfsuite/lib/crf/src/train_lbfgs.c
index 3148e9c..db553c0 100644
--- a/chaine/crfsuite/lib/crf/src/train_lbfgs.c
+++ b/chaine/crfsuite/lib/crf/src/train_lbfgs.c
@@ -143,14 +143,7 @@ static int lbfgs_progress(
     }
 
     /* Report the progress. */
-    logging(lg, "***** Iteration #%d *****\n", k);
-    logging(lg, "Loss: %f\n", fx);
-    logging(lg, "Feature norm: %f\n", xnorm);
-    logging(lg, "Error norm: %f\n", gnorm);
-    logging(lg, "Active features: %d\n", num_active_features);
-    logging(lg, "Line search trials: %d\n", ls);
-    logging(lg, "Line search step: %f\n", step);
-    logging(lg, "Seconds required for this iteration: %.3f\n", duration / (double)CLOCKS_PER_SEC);
+    logging(lg, "Iteration %d, training loss: %f", k, fx);
 
     /* Send the tagger with the current parameters. */
     if (testset != NULL)
@@ -158,8 +151,6 @@ static int lbfgs_progress(
         holdout_evaluation(gm, testset, x, lg);
     }
 
-    logging(lg, "\n");
-
     /* Continue. */
     return 0;
 }
@@ -253,17 +244,7 @@ int crfsuite_train_lbfgs(
 
     /* Read the L-BFGS parameters. */
     exchange_options(params, &opt, -1);
-    logging(lg, "L-BFGS optimization\n");
-    logging(lg, "c1: %f\n", opt.c1);
-    logging(lg, "c2: %f\n", opt.c2);
-    logging(lg, "num_memories: %d\n", opt.memory);
-    logging(lg, "max_iterations: %d\n", opt.max_iterations);
-    logging(lg, "epsilon: %f\n", opt.epsilon);
-    logging(lg, "stop: %d\n", opt.stop);
-    logging(lg, "delta: %f\n", opt.delta);
-    logging(lg, "linesearch: %s\n", opt.linesearch);
-    logging(lg, "linesearch.max_iterations: %d\n", opt.linesearch_max_iterations);
-    logging(lg, "\n");
+    logging(lg, "Start training with L-BFGS");
 
     /* Set parameters for L-BFGS. */
     lbfgsparam.m = opt.memory;
@@ -315,29 +296,21 @@ int crfsuite_train_lbfgs(
         &lbfgsparam);
     if (lbret == LBFGS_CONVERGENCE)
     {
-        logging(lg, "L-BFGS resulted in convergence\n");
+        logging(lg, "Loss has converged, terminating training");
     }
     else if (lbret == LBFGS_STOP)
     {
-        logging(lg, "L-BFGS terminated with the stopping criteria\n");
+        logging(lg, "Terminated with the stopping criteria");
     }
     else if (lbret == LBFGSERR_MAXIMUMITERATION)
     {
-        logging(lg, "L-BFGS terminated with the maximum number of iterations\n");
-    }
-    else
-    {
-        logging(lg, "L-BFGS terminated with error code (%d)\n", lbret);
+        logging(lg, "Reached maximum number of iterations, terminating training");
     }
 
     /* Set the best_w array (allocated by us) as the result array, which the
      * callee can safely `free`. */
     *ptr_w = lbfgsi.best_w;
 
-    /* Report the run-time for the training. */
-    logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC);
-    logging(lg, "\n");
-
     /* Exit with success. */
     lbfgs_free(w);
     return 0;
diff --git a/chaine/crfsuite/lib/crf/src/train_passive_aggressive.c b/chaine/crfsuite/lib/crf/src/train_passive_aggressive.c
index 06b3685..2075fbb 100644
--- a/chaine/crfsuite/lib/crf/src/train_passive_aggressive.c
+++ b/chaine/crfsuite/lib/crf/src/train_passive_aggressive.c
@@ -317,14 +317,7 @@ int crfsuite_train_passive_aggressive(
     }
 
     /* Show the parameters. */
-    logging(lg, "Passive Aggressive\n");
-    logging(lg, "type: %d\n", opt.type);
-    logging(lg, "c: %f\n", opt.c);
-    logging(lg, "error_sensitive: %d\n", opt.error_sensitive);
-    logging(lg, "averaging: %d\n", opt.averaging);
-    logging(lg, "max_iterations: %d\n", opt.max_iterations);
-    logging(lg, "epsilon: %f\n", opt.epsilon);
-    logging(lg, "\n");
+    logging(lg, "Start training with PA");
 
     u = 1;
 
@@ -414,10 +407,7 @@ int crfsuite_train_passive_aggressive(
         }
 
         /* Output the progress. */
-        logging(lg, "***** Iteration #%d *****\n", i + 1);
-        logging(lg, "Loss: %f\n", sum_loss);
-        logging(lg, "Feature norm: %f\n", sqrt(vecdot(w, w, K)));
-        logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - iteration_begin) / (double)CLOCKS_PER_SEC);
+        logging(lg, "Iteration %d, training loss: %f", i + 1, sum_loss);
 
         /* Holdout evaluation if necessary. */
         if (testset != NULL)
@@ -425,20 +415,14 @@ int crfsuite_train_passive_aggressive(
             holdout_evaluation(gm, testset, wa, lg);
         }
 
-        logging(lg, "\n");
-
         /* Convergence test. */
         if (sum_loss / N < opt.epsilon)
         {
-            logging(lg, "Terminated with the stopping criterion\n");
-            logging(lg, "\n");
+            logging(lg, "Loss has converged, terminating training");
             break;
         }
     }
 
-    logging(lg, "Total seconds required for training: %.3f\n", (clock() - begin) / (double)CLOCKS_PER_SEC);
-    logging(lg, "\n");
-
     free(viterbi);
     free(ws);
     free(w);
diff --git a/chaine/logging.py b/chaine/logging.py
index 7e4b983..2bba9d1 100644
--- a/chaine/logging.py
+++ b/chaine/logging.py
@@ -114,57 +114,3 @@ def formatter(self):
 
     def __repr__(self):
         return f"<Logger: {self.name}>"
-
-
-class LogMessage:
-    """CRFsuite log message
-
-    Attributes
-    ----------
-    iteration : Optional[str]
-        Current number of iterations
-    loss : Optional[str]
-        Current loss score
-    """
-
-    def __init__(self):
-        self.iteration = None
-        self.loss = None
-
-    def __str__(self) -> str:
-        return f"Iteration {self.iteration}, train loss: {self.loss}"
-
-
-class LogParser:
-    """Parser for CRFsuite's logfile
-
-    Attributes
-    ----------
-    message : LogMessage
-        Log message with current iteration and loss
-    """
-
-    def __init__(self):
-        self.message = LogMessage()
-
-    def parse(self, line: str) -> Optional[str]:
-        """Parse one line of the logs
-
-        Parameters
-        ----------
-        line : str
-            One line of CRFsuite's logs
-
-        Returns
-        -------
-        str
-            Formatted log message with latest iteration and loss
-        """
-        if (m := re.match(r"\*{5} (?:Iteration|Epoch) #(\d+) \*{5}\n", line)) :
-            self.message.iteration = m.group(1)
-        elif (m := re.match(r"Loss: (\d+\.\d+)", line)) :
-            self.message.loss = m.group(1)
-            if self.message.iteration:
-                text = str(self.message)
-                self.message = LogMessage()
-                return text

From 7323b24d936f0e8f0fde738168c560d53af8d1f3 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Tue, 5 Jan 2021 16:07:49 +0100
Subject: [PATCH 07/22] fix: #14


From bccc6dd9869836075e3f5b938ee1121d76b30698 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Tue, 5 Jan 2021 16:12:27 +0100
Subject: [PATCH 08/22] fix: #18

---
 chaine/crfsuite/include/crfsuite.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chaine/crfsuite/include/crfsuite.hpp b/chaine/crfsuite/include/crfsuite.hpp
index 1357222..a9bcf80 100644
--- a/chaine/crfsuite/include/crfsuite.hpp
+++ b/chaine/crfsuite/include/crfsuite.hpp
@@ -125,7 +125,7 @@ namespace CRFSuite
         if (xseq.size() != yseq.size())
         {
             std::stringstream ss;
-            ss << "The numbers of items and labels differ: |x| = " << xseq.size() << ", |y| = " << yseq.size();
+            ss << "The number of items and labels differ: |x| = " << xseq.size() << ", |y| = " << yseq.size();
             throw std::invalid_argument(ss.str());
         }
 

From 2ebb337820a0cfb1e274efca43450696d9410008 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Tue, 5 Jan 2021 21:53:51 +0100
Subject: [PATCH 09/22] chore: logging

---
 chaine/crfsuite/lib/crf/src/train_l2sgd.c |  4 ----
 chaine/data.py                            | 19 +++++++++----------
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/chaine/crfsuite/lib/crf/src/train_l2sgd.c b/chaine/crfsuite/lib/crf/src/train_l2sgd.c
index 7fd8f78..e9e3e32 100644
--- a/chaine/crfsuite/lib/crf/src/train_l2sgd.c
+++ b/chaine/crfsuite/lib/crf/src/train_l2sgd.c
@@ -239,10 +239,6 @@ static int l2sgd(
             pf[(epoch - 1) % period] = sum_loss;
 
             logging(lg, "Epoch %d, learning rate: %f, training loss: %f", epoch, eta, sum_loss);
-            if (period < epoch)
-            {
-                logging(lg, "Improvement ratio: %f", improvement);
-            }
 
             /* Check for the stopping criterion. */
             if (improvement < epsilon)
diff --git a/chaine/data.py b/chaine/data.py
index d612cd6..eab82a2 100644
--- a/chaine/data.py
+++ b/chaine/data.py
@@ -56,13 +56,12 @@ def is_upper(self) -> bool:
         """True if token is upper case, False otherwise"""
         return self.text.isupper()
 
-
-class TokenSequence:
-    def __init__(self, tokens):
-        if not all(isinstance(token, Token) for token in tokens):
-            tokens = [Token(index, text) for index, text in enumerate(tokens)]
-        self.tokens = tokens
-
-    def __iter__(self):
-        for token in self.tokens:
-            yield token
+    @property
+    def features(self):
+        return {"num_characters": len(self),
+        "text": self.lower(),
+        "shape": self.shape,
+        "is_digit": self.is_digit,
+        "is_lower": self.is_lower,
+        "is_title": self.is_title,
+        "is_upper": self.is_upper}

From 6607a618a725a74530cf80414a0c4c3e530a1a3c Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Tue, 5 Jan 2021 21:54:05 +0100
Subject: [PATCH 10/22] chore: how it works

---
 README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7610d28..53a9176 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ $ pip install chaine
 If you are interested in the theoretical concepts behind conditional random fields, please refer to the introducing paper by [Lafferty et al](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers).
 
 
-## Example
+## Minimal working example
 
 ```python
 >>> import chaine
@@ -31,6 +31,11 @@ If you are interested in the theoretical concepts behind conditional random fiel
 Check out the introducing [Jupyter notebook](https://github.com/severinsimmler/chaine/blob/master/notebooks/tutorial.ipynb).
 
 
+## How it works
+
+
+
+
 ## Credits
 
 This library makes use of and is partially based on:

From 48da240333c5b60f10191bc58adf5eed7b0b4ef4 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Tue, 5 Jan 2021 21:54:12 +0100
Subject: [PATCH 11/22] chore: rename folder

---
 examples/training.py     |  30 ++++++++++
 notebooks/tutorial.ipynb | 115 ---------------------------------------
 2 files changed, 30 insertions(+), 115 deletions(-)
 create mode 100644 examples/training.py
 delete mode 100644 notebooks/tutorial.ipynb

diff --git a/examples/training.py b/examples/training.py
new file mode 100644
index 0000000..e16bf08
--- /dev/null
+++ b/examples/training.py
@@ -0,0 +1,30 @@
+import chaine
+from chaine.data import Token
+from flair.models import SequenceTagger
+from flair.data import Sentence
+import tqdm
+import datasets
+
+TAGGER = SequenceTagger.load("pos-multi-fast")
+DATASET = datasets.load_dataset("germaner")
+
+
+def preprocess(dataset):
+    for tokens in tqdm.tqdm(dataset):
+        sentence = Sentence(" ".join(tokens), use_tokenizer=False)
+        TAGGER.predict(sentence)
+        pos_tags = [token.get_tag("upos").value for token in sentence]
+        features = [Token(i, text).features for i, text in enumerate(tokens)]
+        for token, pos in zip(features, pos_tags):
+            token["pos"] = pos
+        yield features
+
+
+
+
+
+if __name__ == "__main__":
+    tokens = preprocess(DATASET["train"]["tokens"][:10])
+    labels = DATASET["train"]["ner_tags"][:10]
+
+    model = chaine.train(tokens, labels)
diff --git a/notebooks/tutorial.ipynb b/notebooks/tutorial.ipynb
deleted file mode 100644
index 2dad489..0000000
--- a/notebooks/tutorial.ipynb
+++ /dev/null
@@ -1,115 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Named entity recognition"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import chaine"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tokens = [[\"John\", \"Lennon\", \"was\", \"rhythm\", \"guitarist\" \"of\", \"The\", \"Beatles\", \".\"]]\n",
-    "labels = [[\"B-PER\", \"I-PER\", \"O\", \"O\", \"O\" \"O\", \"B-ORG\", \"I-ORG\", \"O\"]]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[2020-12-06 23:50:12,783] [INFO] Loading data\n",
-      "[2020-12-06 23:50:12,785] [INFO] Start training\n",
-      "[2020-12-06 23:50:12,789] [INFO] Iteration: 1\tLoss: 14.334076\n",
-      "[2020-12-06 23:50:12,792] [INFO] Iteration: 2\tLoss: 14.334064\n",
-      "[2020-12-06 23:50:12,793] [INFO] Iteration: 3\tLoss: 14.334053\n",
-      "[2020-12-06 23:50:12,794] [INFO] Iteration: 4\tLoss: 14.334041\n",
-      "[2020-12-06 23:50:12,796] [INFO] Iteration: 5\tLoss: 14.334029\n"
-     ]
-    }
-   ],
-   "source": [
-    "crf = chaine.train(tokens, labels, max_iterations=5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[['I-PER', 'I-PER', 'O', 'O', 'OO', 'B-ORG', 'I-ORG', 'O']]"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "crf.predict(tokens)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Feature extraction\n",
-    "\n",
-    "```\n",
-    "identity of wi, identity of neighboring words\n",
-    "embeddings for wi, embeddings for neighboring words\n",
-    "part of speech of wi, part of speech of neighboring words\n",
-    "base-phrase syntactic chunk label of wi and neighboring words\n",
-    "presence of wi in a gazetteer\n",
-    "wi contains a particular prefix (from all prefixes of length ≤ 4)\n",
-    "wi contains a particular suffix (from all suffixes of length ≤ 4)\n",
-    "wi is all upper case\n",
-    "word shape of wi, word shape of neighboring words\n",
-    "short word shape of wi, short word shape of neighboring words\n",
-    "presence of hyphen\n",
-    "```"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}

From 5052beaf7ce5605f23e986d946785edf70102192 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 21:24:44 +0100
Subject: [PATCH 12/22] fix: unit tests

---
 tests/{test_training.py => test_api.py} |  4 ++--
 tests/test_crf.py                       | 18 ------------------
 tests/test_data.py                      | 13 -------------
 3 files changed, 2 insertions(+), 33 deletions(-)
 rename tests/{test_training.py => test_api.py} (78%)
 delete mode 100644 tests/test_data.py

diff --git a/tests/test_training.py b/tests/test_api.py
similarity index 78%
rename from tests/test_training.py
rename to tests/test_api.py
index 952c6d8..25e8538 100644
--- a/tests/test_training.py
+++ b/tests/test_api.py
@@ -1,4 +1,4 @@
-from chaine import training
+from chaine import api
 from chaine.crf import Model
 
 
@@ -6,7 +6,7 @@ def test_train():
     sequences = [[{"foo"}, {"bar"}] for _ in range(50)]
     labels = [["O", "O"] for _ in range(50)]
 
-    crf = training.train(sequences, labels)
+    crf = api.train(sequences, labels)
 
     assert isinstance(crf, Model)
     assert crf.labels == {"O"}
diff --git a/tests/test_crf.py b/tests/test_crf.py
index 0cc5090..6ea6874 100644
--- a/tests/test_crf.py
+++ b/tests/test_crf.py
@@ -25,19 +25,6 @@ def model(serialized_model):
     return crf.Model(serialized_model)
 
 
-def test_intbool():
-    value = crf._intbool("0")
-    assert isinstance(value, bool)
-    assert value == False
-
-    value = crf._intbool("1")
-    assert isinstance(value, bool)
-    assert value == True
-
-    with pytest.raises(ValueError):
-        crf._intbool("foo")
-
-
 def test_trainer_algorithm_selection():
     for algorithm in {
         "lbfgs",
@@ -181,11 +168,6 @@ def test_arow_params():
         assert param in trainer.params.keys()
 
 
-def test_trainer_log_parser():
-    trainer = crf.Trainer()
-    assert hasattr(trainer, "_log_parser")
-
-
 def test_training(tmpdir, dataset):
     trainer = crf.Trainer()
     model_filepath = Path(tmpdir.join("model.crf"))
diff --git a/tests/test_data.py b/tests/test_data.py
deleted file mode 100644
index f9213a2..0000000
--- a/tests/test_data.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from chaine import data
-
-
-def test_token():
-    token = data.Token(0, "Foo")
-    assert len(token) == 3
-    assert repr(token) == "<Token 0: Foo>"
-    assert str(token) == "Foo"
-    assert token.lower() == "foo"
-    assert token.is_digit == False
-    assert token.is_lower == False
-    assert token.is_title == True
-    assert token.is_upper == False

From 37d26d1345a394be0e74b3f1429a29cea00d8360 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 21:24:59 +0100
Subject: [PATCH 13/22] chore: tune version number

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index e7487e9..ba1133b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "chaine"
-version = "0.3.0"
+version = "1.0.0"
 description = "Linear-chain conditional random fields for natural language processing"
 authors = ["Severin Simmler <severin.simmler@posteo.de>"]
 readme = "README.md"

From 3b305705114d35ec47aa1de64201e435d2557f59 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 21:25:16 +0100
Subject: [PATCH 14/22] chore: cleanup

---
 chaine/__init__.py             |  4 +-
 chaine/{training.py => api.py} |  2 +-
 chaine/crf.pyx                 | 30 +++++++--------
 chaine/data.py                 | 67 ----------------------------------
 chaine/logging.py              |  7 +---
 chaine/typing.py               | 10 ++---
 6 files changed, 22 insertions(+), 98 deletions(-)
 rename chaine/{training.py => api.py} (99%)
 delete mode 100644 chaine/data.py

diff --git a/chaine/__init__.py b/chaine/__init__.py
index 78eda0f..ea245da 100644
--- a/chaine/__init__.py
+++ b/chaine/__init__.py
@@ -1,2 +1,2 @@
-from chaine.training import train
-from chaine.crf import Model, Trainer
+from chaine.api import train
+from chaine import crf
diff --git a/chaine/training.py b/chaine/api.py
similarity index 99%
rename from chaine/training.py
rename to chaine/api.py
index c6295e5..4878d4e 100644
--- a/chaine/training.py
+++ b/chaine/api.py
@@ -15,7 +15,7 @@ def train(dataset: Dataset, labels: Labels, **kwargs) -> Model:
     Parameters
     ----------
     dataset : Dataset
-        Dataset consisting of sequences of features
+        Dataset consisting of sequences of feature sets
     labels : Labels
         Labels corresponding to each instance in the dataset
     algorithm : str
diff --git a/chaine/crf.pyx b/chaine/crf.pyx
index 131a28e..4f16374 100644
--- a/chaine/crf.pyx
+++ b/chaine/crf.pyx
@@ -8,16 +8,11 @@ from libcpp.string cimport string
 import os
 
 from chaine.logging import Logger
-from chaine.typing import Dataset, Dict, Iterable, Labels, List, Path, Sequence
+from chaine.typing import Dataset, Dict, Iterable, Labels, List, Filepath, Sequence
 
 LOGGER = Logger(__name__)
 
 
-def _intbool(value: str) -> bool:
-    """Helper function to cast a string to an integer to a boolean"""
-    return bool(int(value))
-
-
 cdef class Trainer:
     """Model trainer
 
@@ -148,6 +143,7 @@ cdef class Trainer:
         "lbfgs": "lbfgs",
         "limited-memory-bfgs": "lbfgs",
         "l2sgd": "l2sgd",
+        "sgd": "l2sgd",
         "stochastic-gradient-descent": "l2sgd",
         "ap": "averaged-perceptron",
         "averaged-perceptron": "averaged-perceptron",
@@ -179,8 +175,8 @@ cdef class Trainer:
     }
     _parameter_types = {
             "feature.minfreq": float,
-            "feature.possible_states": _intbool,
-            "feature.possible_transitions": _intbool,
+            "feature.possible_states": lambda value: bool(int(value)),
+            "feature.possible_transitions": lambda value: bool(int(value)),
             "c1": float,
             "c2": float,
             "max_iterations": int,
@@ -197,8 +193,8 @@ cdef class Trainer:
             "calibration.max_trials": int,
             "type": int,
             "c": float,
-            "error_sensitive": _intbool,
-            "averaging": _intbool,
+            "error_sensitive": lambda value: bool(int(value)),
+            "averaging": lambda value: bool(int(value)),
             "variance": float,
             "gamma": float,
         }
@@ -210,14 +206,14 @@ cdef class Trainer:
 
     def __cinit__(self):
         self._c_trainer.set_handler(self, <crfsuite_api.messagefunc>self._on_message)
-        self._c_trainer.select("lbfgs", "crf1d")
+        self._c_trainer.select("l2sgd", "crf1d")
         self._c_trainer._init_trainer()
 
     def __repr__(self):
         """Representation of the trainer"""
         return f"<Trainer: {self.params}>"
 
-    def train(self, dataset: Dataset, labels: Labels, model_filepath: Path):
+    def train(self, dataset: Dataset, labels: Labels, model_filepath: Filepath):
         """Train a conditional random field
 
         Parameters
@@ -226,7 +222,7 @@ cdef class Trainer:
             Training data set
         labels : Labels
             Corresponding true labels
-        model_filepath : Path
+        model_filepath : Filepath
             Path the trained model is written to
 
         Note
@@ -310,17 +306,17 @@ cdef class Trainer:
 
 
 cdef class Model:
-    """Conditional random field
+    """Linear-chain conditional random field
 
     Parameters
     ----------
-    model_filepath : str
+    model_filepath : Filepath
         Path to the trained model
     """
     cdef crfsuite_api.Tagger c_tagger
 
-    def __init__(self, model_filepath):
-        self._load(model_filepath)
+    def __init__(self, model_filepath: Filepath):
+        self._load(str(model_filepath))
 
     def __repr__(self):
         """Representation of the model"""
diff --git a/chaine/data.py b/chaine/data.py
deleted file mode 100644
index eab82a2..0000000
--- a/chaine/data.py
+++ /dev/null
@@ -1,67 +0,0 @@
-"""
-chaine.data
-~~~~~~~~~~~
-
-This module provides basic data structures
-"""
-
-import re
-from dataclasses import dataclass
-
-
-@dataclass
-class Token:
-    index: int
-    text: str
-
-    def __len__(self) -> int:
-        """Number of characters"""
-        return len(self.text)
-
-    def __repr__(self) -> str:
-        """Representation of the token"""
-        return f"<Token {self.index}: {self.text}>"
-
-    def __str__(self) -> str:
-        """String representation of the token"""
-        return self.text
-
-    def lower(self) -> str:
-        """Lower case of the token"""
-        return self.text.lower()
-
-    @property
-    def shape(self) -> str:
-        text = re.sub("[A-Z]", "X", self.text)
-        text = re.sub("[a-z]", "x", text)
-        return re.sub("[0-9]", "d", text)
-
-    @property
-    def is_digit(self) -> bool:
-        """True if token is a digit, False otherwise"""
-        return self.text.isdigit()
-
-    @property
-    def is_lower(self) -> bool:
-        """True if token is lower case, False otherwise"""
-        return self.text.islower()
-
-    @property
-    def is_title(self) -> bool:
-        """True if first letter is upper case, False otherwise"""
-        return self.text.istitle()
-
-    @property
-    def is_upper(self) -> bool:
-        """True if token is upper case, False otherwise"""
-        return self.text.isupper()
-
-    @property
-    def features(self):
-        return {"num_characters": len(self),
-        "text": self.lower(),
-        "shape": self.shape,
-        "is_digit": self.is_digit,
-        "is_lower": self.is_lower,
-        "is_title": self.is_title,
-        "is_upper": self.is_upper}
diff --git a/chaine/logging.py b/chaine/logging.py
index 2bba9d1..8abc41d 100644
--- a/chaine/logging.py
+++ b/chaine/logging.py
@@ -2,15 +2,12 @@
 chaine.logging
 ~~~~~~~~~~~~~~
 
-This module implements a basic logger and a parser for CRFsuite
+This module implements a basic logger
 """
 
 import logging
-import re
 import sys
 
-from chaine.typing import Optional
-
 
 class Logger(logging.Logger):
     DEBUG = logging.DEBUG
@@ -81,7 +78,7 @@ def error(self, message: str):
 
     @property
     def log_level(self) -> int:
-        """Log level.
+        """Log level
 
         Returns
         -------
diff --git a/chaine/typing.py b/chaine/typing.py
index 72008d6..4a73aeb 100644
--- a/chaine/typing.py
+++ b/chaine/typing.py
@@ -5,12 +5,10 @@
 A collection of type hints
 """
 
-from pathlib import Path as _Path
+from pathlib import Path
 from typing import Any, Dict, Generator, Iterable, List, Optional, Set, Union
 
-FeatureGenerator = Generator[List[str], None, None]
-TokenGenerator = Generator["Token", None, None]
-Labels = Iterable[str]
+Labels = Iterable[Iterable[str]]
 Dataset = Iterable[Iterable[str]]
-Path = Union[_Path, str]
-Sequence = List[Set[str]]
+Filepath = Union[Path, str]
+Sequence = List[Union[Set[str], Dict[str, Union[int, float, str, bool]]]]

From 602101d6a46300800535561f82e0963814fd0a27 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 22:41:03 +0100
Subject: [PATCH 15/22] chore: add dev dependencies

---
 poetry.lock    | 396 ++++++++++++++++++++++++++++++++++++++++++++++++-
 pyproject.toml |   2 +
 2 files changed, 397 insertions(+), 1 deletion(-)

diff --git a/poetry.lock b/poetry.lock
index 7ee636c..a60fc12 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -155,6 +155,35 @@ category = "dev"
 optional = false
 python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
 
+[[package]]
+name = "datasets"
+version = "1.2.0"
+description = "HuggingFace/Datasets is an open library of NLP datasets."
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+dill = "*"
+multiprocess = "*"
+numpy = ">=1.17"
+pandas = "*"
+pyarrow = ">=0.17.1"
+requests = ">=2.19.0"
+tqdm = ">=4.27,<4.50.0"
+xxhash = "*"
+
+[package.extras]
+apache-beam = ["apache-beam"]
+benchmarks = ["numpy (==1.18.5)", "tensorflow (==2.3.0)", "torch (==1.6.0)", "transformers (==3.0.2)"]
+dev = ["apache-beam", "absl-py", "bs4", "conllu", "elasticsearch", "faiss-cpu", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "pytest", "pytest-xdist", "tensorflow", "torch", "tldextract", "transformers", "zstandard", "rarfile", "black", "isort", "flake8 (==3.7.9)"]
+docs = ["recommonmark", "sphinx (==3.1.2)", "sphinx-markdown-tables", "sphinx-rtd-theme (==0.4.3)", "sphinx-copybutton"]
+quality = ["black", "isort", "flake8 (==3.7.9)"]
+tensorflow = ["tensorflow (>=2.2.0)"]
+tensorflow_gpu = ["tensorflow-gpu (>=2.2.0)"]
+tests = ["apache-beam", "absl-py", "bs4", "conllu", "elasticsearch", "faiss-cpu", "langdetect", "lxml", "mwparserfromhell", "nltk", "openpyxl", "py7zr", "pytest", "pytest-xdist", "tensorflow", "torch", "tldextract", "transformers", "zstandard", "rarfile"]
+torch = ["torch"]
+
 [[package]]
 name = "decorator"
 version = "4.4.2"
@@ -171,6 +200,17 @@ category = "dev"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 
+[[package]]
+name = "dill"
+version = "0.3.3"
+description = "serialize all of python"
+category = "dev"
+optional = false
+python-versions = ">=2.6, !=3.0.*"
+
+[package.extras]
+graph = ["objgraph (>=1.7.2)"]
+
 [[package]]
 name = "entrypoints"
 version = "0.3"
@@ -294,6 +334,14 @@ MarkupSafe = ">=0.23"
 [package.extras]
 i18n = ["Babel (>=0.8)"]
 
+[[package]]
+name = "joblib"
+version = "1.0.0"
+description = "Lightweight pipelining with Python functions"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
 [[package]]
 name = "json5"
 version = "0.9.5"
@@ -415,6 +463,17 @@ category = "dev"
 optional = false
 python-versions = "*"
 
+[[package]]
+name = "multiprocess"
+version = "0.70.11.1"
+description = "better multiprocessing and multithreading in python"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+dill = ">=0.3.3"
+
 [[package]]
 name = "mypy-extensions"
 version = "0.4.3"
@@ -528,6 +587,14 @@ docs = ["sphinx", "nbsphinx", "sphinxcontrib-github-alt", "sphinx-rtd-theme"]
 json-logging = ["json-logging"]
 test = ["pytest", "coverage", "requests", "nbval", "selenium", "pytest-cov", "requests-unixsocket"]
 
+[[package]]
+name = "numpy"
+version = "1.19.5"
+description = "NumPy is the fundamental package for array computing with Python."
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
 [[package]]
 name = "packaging"
 version = "20.8"
@@ -539,6 +606,22 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 [package.dependencies]
 pyparsing = ">=2.0.2"
 
+[[package]]
+name = "pandas"
+version = "1.2.0"
+description = "Powerful data structures for data analysis, time series, and statistics"
+category = "dev"
+optional = false
+python-versions = ">=3.7.1"
+
+[package.dependencies]
+numpy = ">=1.16.5"
+python-dateutil = ">=2.7.3"
+pytz = ">=2017.3"
+
+[package.extras]
+test = ["pytest (>=5.0.1)", "pytest-xdist", "hypothesis (>=3.58)"]
+
 [[package]]
 name = "pandocfilters"
 version = "1.4.3"
@@ -635,6 +718,17 @@ category = "dev"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
+[[package]]
+name = "pyarrow"
+version = "2.0.0"
+description = "Python library for Apache Arrow"
+category = "dev"
+optional = false
+python-versions = ">=3.5"
+
+[package.dependencies]
+numpy = ">=1.14"
+
 [[package]]
 name = "pycparser"
 version = "2.20"
@@ -699,6 +793,14 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
 [package.dependencies]
 six = ">=1.5"
 
+[[package]]
+name = "pytz"
+version = "2020.5"
+description = "World timezone definitions, modern and historical"
+category = "dev"
+optional = false
+python-versions = "*"
+
 [[package]]
 name = "pywin32"
 version = "300"
@@ -753,6 +855,37 @@ urllib3 = ">=1.21.1,<1.27"
 security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"]
 socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"]
 
+[[package]]
+name = "scikit-learn"
+version = "0.24.0"
+description = "A set of python modules for machine learning and data mining"
+category = "dev"
+optional = false
+python-versions = ">=3.6"
+
+[package.dependencies]
+joblib = ">=0.11"
+numpy = ">=1.13.3"
+scipy = ">=0.19.1"
+threadpoolctl = ">=2.0.0"
+
+[package.extras]
+benchmark = ["matplotlib (>=2.1.1)", "pandas (>=0.25.0)", "memory-profiler (>=0.57.0)"]
+docs = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)", "memory-profiler (>=0.57.0)", "sphinx (>=3.2.0)", "sphinx-gallery (>=0.7.0)", "numpydoc (>=1.0.0)", "Pillow (>=7.1.2)", "sphinx-prompt (>=1.3.0)"]
+examples = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)"]
+tests = ["matplotlib (>=2.1.1)", "scikit-image (>=0.13)", "pandas (>=0.25.0)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "flake8 (>=3.8.2)", "mypy (>=0.770)", "pyamg (>=4.0.0)"]
+
+[[package]]
+name = "scipy"
+version = "1.6.0"
+description = "SciPy: Scientific Library for Python"
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+
+[package.dependencies]
+numpy = ">=1.16.5"
+
 [[package]]
 name = "send2trash"
 version = "1.5.0"
@@ -761,6 +894,18 @@ category = "dev"
 optional = false
 python-versions = "*"
 
+[[package]]
+name = "seqeval"
+version = "1.2.2"
+description = "Testing framework for sequence labeling"
+category = "dev"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+numpy = ">=1.14.0"
+scikit-learn = ">=0.21.3"
+
 [[package]]
 name = "six"
 version = "1.15.0"
@@ -793,6 +938,14 @@ python-versions = "*"
 [package.extras]
 test = ["pathlib2"]
 
+[[package]]
+name = "threadpoolctl"
+version = "2.1.0"
+description = "threadpoolctl"
+category = "dev"
+optional = false
+python-versions = ">=3.5"
+
 [[package]]
 name = "toml"
 version = "0.10.2"
@@ -809,6 +962,17 @@ category = "dev"
 optional = false
 python-versions = ">= 3.5"
 
+[[package]]
+name = "tqdm"
+version = "4.49.0"
+description = "Fast, Extensible Progress Meter"
+category = "dev"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*"
+
+[package.extras]
+dev = ["py-make (>=0.1.0)", "twine", "argopt", "pydoc-markdown"]
+
 [[package]]
 name = "traitlets"
 version = "5.0.5"
@@ -868,10 +1032,18 @@ category = "dev"
 optional = false
 python-versions = "*"
 
+[[package]]
+name = "xxhash"
+version = "2.0.0"
+description = "Python binding for xxHash"
+category = "dev"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
+
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "4b1a7d194de7b81ec20900062a23abb84d66e4e7ec3f89d9fcc3329ff6fcafc3"
+content-hash = "cddf572f73652339004a5238d90b348b711178c3daf5685ff302ea3175f66ce7"
 
 [metadata.files]
 appdirs = [
@@ -1016,6 +1188,10 @@ cython = [
     {file = "Cython-0.29.21-py2.py3-none-any.whl", hash = "sha256:5c4276fdcbccdf1e3c1756c7aeb8395e9a36874fa4d30860e7694f43d325ae13"},
     {file = "Cython-0.29.21.tar.gz", hash = "sha256:e57acb89bd55943c8d8bf813763d20b9099cc7165c0f16b707631a7654be9cad"},
 ]
+datasets = [
+    {file = "datasets-1.2.0-py3-none-any.whl", hash = "sha256:4f60447d0b80c2ce26e54893fb515c6798742990c3217ca2c60b86758fd29f49"},
+    {file = "datasets-1.2.0.tar.gz", hash = "sha256:695ba8d7644b03dc56bee1339447cc22a1fc358efc64ab0e7eb42510e8f9a4ac"},
+]
 decorator = [
     {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"},
     {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"},
@@ -1024,6 +1200,10 @@ defusedxml = [
     {file = "defusedxml-0.6.0-py2.py3-none-any.whl", hash = "sha256:6687150770438374ab581bb7a1b327a847dd9c5749e396102de3fad4e8a3ef93"},
     {file = "defusedxml-0.6.0.tar.gz", hash = "sha256:f684034d135af4c6cbb949b8a4d2ed61634515257a67299e5f940fbaa34377f5"},
 ]
+dill = [
+    {file = "dill-0.3.3-py2.py3-none-any.whl", hash = "sha256:78370261be6ea49037ace8c17e0b7dd06d0393af6513cc23f9b222d9367ce389"},
+    {file = "dill-0.3.3.zip", hash = "sha256:efb7f6cb65dba7087c1e111bb5390291ba3616741f96840bfc75792a1a9b5ded"},
+]
 entrypoints = [
     {file = "entrypoints-0.3-py2.py3-none-any.whl", hash = "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19"},
     {file = "entrypoints-0.3.tar.gz", hash = "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"},
@@ -1060,6 +1240,10 @@ jinja2 = [
     {file = "Jinja2-2.11.2-py2.py3-none-any.whl", hash = "sha256:f0a4641d3cf955324a89c04f3d94663aa4d638abe8f733ecd3582848e1c37035"},
     {file = "Jinja2-2.11.2.tar.gz", hash = "sha256:89aab215427ef59c34ad58735269eb58b1a5808103067f7bb9d5836c651b3bb0"},
 ]
+joblib = [
+    {file = "joblib-1.0.0-py3-none-any.whl", hash = "sha256:75ead23f13484a2a414874779d69ade40d4fa1abe62b222a23cd50d4bc822f6f"},
+    {file = "joblib-1.0.0.tar.gz", hash = "sha256:7ad866067ac1fdec27d51c8678ea760601b70e32ff1881d4dc8e1171f2b64b24"},
+]
 json5 = [
     {file = "json5-0.9.5-py2.py3-none-any.whl", hash = "sha256:af1a1b9a2850c7f62c23fde18be4749b3599fd302f494eebf957e2ada6b9e42c"},
     {file = "json5-0.9.5.tar.gz", hash = "sha256:703cfee540790576b56a92e1c6aaa6c4b0d98971dc358ead83812aa4d06bdb96"},
@@ -1127,6 +1311,19 @@ mistune = [
     {file = "mistune-0.8.4-py2.py3-none-any.whl", hash = "sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4"},
     {file = "mistune-0.8.4.tar.gz", hash = "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e"},
 ]
+multiprocess = [
+    {file = "multiprocess-0.70.11.1-cp27-cp27m-macosx_10_8_x86_64.whl", hash = "sha256:8f0d0640642acc654fe2fb5cb529ebbe116468a1dd1544d484db6e79033767c8"},
+    {file = "multiprocess-0.70.11.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:4b33a0111e341fad5e3c6bb6dd7f592596f2974cc5ecddee06b9a999bac4cbb0"},
+    {file = "multiprocess-0.70.11.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:0eab6e0e87acba9586e5d6869d21271cc865d72d74b7f6b30b6290dffca5caae"},
+    {file = "multiprocess-0.70.11.1-cp27-cp27m-win32.whl", hash = "sha256:4d97020a50a18862fbb1f84d81914a2a28f2d78bc315de9a6699459682df2a67"},
+    {file = "multiprocess-0.70.11.1-cp27-cp27m-win_amd64.whl", hash = "sha256:217e96638fbfd951a203b8dc17410839e4aea8aa3fb9cc393c37e491dcac2c65"},
+    {file = "multiprocess-0.70.11.1-py35-none-any.whl", hash = "sha256:ebb92b67a61b901bfc277c4525e86afba24a60638d192b62f8c332933da995f4"},
+    {file = "multiprocess-0.70.11.1-py36-none-any.whl", hash = "sha256:d8e87b086373fbd19c28659391e5b8888aadeaeb88f0e448e55502578bde4920"},
+    {file = "multiprocess-0.70.11.1-py37-none-any.whl", hash = "sha256:164c77448e357ebee0dc6abc7ee8c823e40e295e629a5fc6d31725109a3a7ee9"},
+    {file = "multiprocess-0.70.11.1-py38-none-any.whl", hash = "sha256:7761fed45cae123aa4b7bb918e77a5cfef6fd436c65bc87453e76bf2bdc3e29e"},
+    {file = "multiprocess-0.70.11.1-py39-none-any.whl", hash = "sha256:ae026110257fc551fc949d96d69160768810d9019786c8c84c0c28d1f88fab67"},
+    {file = "multiprocess-0.70.11.1.zip", hash = "sha256:9d5e417f3ebce4d027a3c900995840f167f316d9f73c0a7a1fbb4ac0116298d0"},
+]
 mypy-extensions = [
     {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"},
     {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"},
@@ -1151,10 +1348,65 @@ notebook = [
     {file = "notebook-6.1.6-py3-none-any.whl", hash = "sha256:e6a62188e319a5d45dd2ed24719f646adf88bef8be1f654ebd0ab360ece6d7a6"},
     {file = "notebook-6.1.6.tar.gz", hash = "sha256:cf40d4f81541401db5a2fda1707ca7877157abd41f04ef7b88f02b67f3c61791"},
 ]
+numpy = [
+    {file = "numpy-1.19.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cc6bd4fd593cb261332568485e20a0712883cf631f6f5e8e86a52caa8b2b50ff"},
+    {file = "numpy-1.19.5-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:aeb9ed923be74e659984e321f609b9ba54a48354bfd168d21a2b072ed1e833ea"},
+    {file = "numpy-1.19.5-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:8b5e972b43c8fc27d56550b4120fe6257fdc15f9301914380b27f74856299fea"},
+    {file = "numpy-1.19.5-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:43d4c81d5ffdff6bae58d66a3cd7f54a7acd9a0e7b18d97abb255defc09e3140"},
+    {file = "numpy-1.19.5-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:a4646724fba402aa7504cd48b4b50e783296b5e10a524c7a6da62e4a8ac9698d"},
+    {file = "numpy-1.19.5-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:2e55195bc1c6b705bfd8ad6f288b38b11b1af32f3c8289d6c50d47f950c12e76"},
+    {file = "numpy-1.19.5-cp36-cp36m-win32.whl", hash = "sha256:39b70c19ec771805081578cc936bbe95336798b7edf4732ed102e7a43ec5c07a"},
+    {file = "numpy-1.19.5-cp36-cp36m-win_amd64.whl", hash = "sha256:dbd18bcf4889b720ba13a27ec2f2aac1981bd41203b3a3b27ba7a33f88ae4827"},
+    {file = "numpy-1.19.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:603aa0706be710eea8884af807b1b3bc9fb2e49b9f4da439e76000f3b3c6ff0f"},
+    {file = "numpy-1.19.5-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:cae865b1cae1ec2663d8ea56ef6ff185bad091a5e33ebbadd98de2cfa3fa668f"},
+    {file = "numpy-1.19.5-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:36674959eed6957e61f11c912f71e78857a8d0604171dfd9ce9ad5cbf41c511c"},
+    {file = "numpy-1.19.5-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:06fab248a088e439402141ea04f0fffb203723148f6ee791e9c75b3e9e82f080"},
+    {file = "numpy-1.19.5-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:6149a185cece5ee78d1d196938b2a8f9d09f5a5ebfbba66969302a778d5ddd1d"},
+    {file = "numpy-1.19.5-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:50a4a0ad0111cc1b71fa32dedd05fa239f7fb5a43a40663269bb5dc7877cfd28"},
+    {file = "numpy-1.19.5-cp37-cp37m-win32.whl", hash = "sha256:d051ec1c64b85ecc69531e1137bb9751c6830772ee5c1c426dbcfe98ef5788d7"},
+    {file = "numpy-1.19.5-cp37-cp37m-win_amd64.whl", hash = "sha256:a12ff4c8ddfee61f90a1633a4c4afd3f7bcb32b11c52026c92a12e1325922d0d"},
+    {file = "numpy-1.19.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf2402002d3d9f91c8b01e66fbb436a4ed01c6498fffed0e4c7566da1d40ee1e"},
+    {file = "numpy-1.19.5-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1ded4fce9cfaaf24e7a0ab51b7a87be9038ea1ace7f34b841fe3b6894c721d1c"},
+    {file = "numpy-1.19.5-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:012426a41bc9ab63bb158635aecccc7610e3eff5d31d1eb43bc099debc979d94"},
+    {file = "numpy-1.19.5-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:759e4095edc3c1b3ac031f34d9459fa781777a93ccc633a472a5468587a190ff"},
+    {file = "numpy-1.19.5-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:a9d17f2be3b427fbb2bce61e596cf555d6f8a56c222bd2ca148baeeb5e5c783c"},
+    {file = "numpy-1.19.5-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:99abf4f353c3d1a0c7a5f27699482c987cf663b1eac20db59b8c7b061eabd7fc"},
+    {file = "numpy-1.19.5-cp38-cp38-win32.whl", hash = "sha256:384ec0463d1c2671170901994aeb6dce126de0a95ccc3976c43b0038a37329c2"},
+    {file = "numpy-1.19.5-cp38-cp38-win_amd64.whl", hash = "sha256:811daee36a58dc79cf3d8bdd4a490e4277d0e4b7d103a001a4e73ddb48e7e6aa"},
+    {file = "numpy-1.19.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c843b3f50d1ab7361ca4f0b3639bf691569493a56808a0b0c54a051d260b7dbd"},
+    {file = "numpy-1.19.5-cp39-cp39-manylinux1_i686.whl", hash = "sha256:d6631f2e867676b13026e2846180e2c13c1e11289d67da08d71cacb2cd93d4aa"},
+    {file = "numpy-1.19.5-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:7fb43004bce0ca31d8f13a6eb5e943fa73371381e53f7074ed21a4cb786c32f8"},
+    {file = "numpy-1.19.5-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:2ea52bd92ab9f768cc64a4c3ef8f4b2580a17af0a5436f6126b08efbd1838371"},
+    {file = "numpy-1.19.5-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:400580cbd3cff6ffa6293df2278c75aef2d58d8d93d3c5614cd67981dae68ceb"},
+    {file = "numpy-1.19.5-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:df609c82f18c5b9f6cb97271f03315ff0dbe481a2a02e56aeb1b1a985ce38e60"},
+    {file = "numpy-1.19.5-cp39-cp39-win32.whl", hash = "sha256:ab83f24d5c52d60dbc8cd0528759532736b56db58adaa7b5f1f76ad551416a1e"},
+    {file = "numpy-1.19.5-cp39-cp39-win_amd64.whl", hash = "sha256:0eef32ca3132a48e43f6a0f5a82cb508f22ce5a3d6f67a8329c81c8e226d3f6e"},
+    {file = "numpy-1.19.5-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:a0d53e51a6cb6f0d9082decb7a4cb6dfb33055308c4c44f53103c073f649af73"},
+    {file = "numpy-1.19.5.zip", hash = "sha256:a76f502430dd98d7546e1ea2250a7360c065a5fdea52b2dffe8ae7180909b6f4"},
+]
 packaging = [
     {file = "packaging-20.8-py2.py3-none-any.whl", hash = "sha256:24e0da08660a87484d1602c30bb4902d74816b6985b93de36926f5bc95741858"},
     {file = "packaging-20.8.tar.gz", hash = "sha256:78598185a7008a470d64526a8059de9aaa449238f280fc9eb6b13ba6c4109093"},
 ]
+pandas = [
+    {file = "pandas-1.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cba93d4fd3b0a42858b2b599495aff793fb5d94587979f45a14177d1217ba446"},
+    {file = "pandas-1.2.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:9e18631d996fe131de6cb31a8bdae18965cc8f39eb23fdfbbf42808ecc63dabf"},
+    {file = "pandas-1.2.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7b54c14130a3448d81eed1348f52429c23e27188d9db6e6d4afeae792bc49c11"},
+    {file = "pandas-1.2.0-cp37-cp37m-win32.whl", hash = "sha256:6c1a57e4d0d6f9633a07817c44e6b36d81c265fe4c52d0c0505513a2d0f7953c"},
+    {file = "pandas-1.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:43482789c55cbabeed9482263cfc98a11e8fcae900cb63ef038948acb4a72570"},
+    {file = "pandas-1.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0be6102dd99910513e75ed6536284743ead810349c51bdeadd2a5b6649f30abb"},
+    {file = "pandas-1.2.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:9c6692cea6d56da8650847172bdb148622f545e7782d17995822434c79d7a211"},
+    {file = "pandas-1.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:272675a98fa4954b9fc0933df775596fc942e50015d7e75d8f19548808a2bfdf"},
+    {file = "pandas-1.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:33318fa24b192b1a4684347ff76679a7267fd4e547da9f71556a5914f0dc10e7"},
+    {file = "pandas-1.2.0-cp38-cp38-win32.whl", hash = "sha256:3bc6d2be03cb75981d8cbeda09503cd9d6d699fc0dc28a65e197165ad527b7b8"},
+    {file = "pandas-1.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:7904ee438549b5223ce8dc008772458dd7c5cf0ccc64cf903e81202400702235"},
+    {file = "pandas-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f8b87d2f541cd9bc4ecfe85a561abac85c33fe4de4ce70cca36b2768af2611f5"},
+    {file = "pandas-1.2.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:91fd0b94e7b98528177a05e6f65efea79d7ef9dec15ee48c7c69fc39fdd87235"},
+    {file = "pandas-1.2.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:8f92b07cdbfa3704d85b4264e52c216cafe6c0059b0d07cdad8cb29e0b90f2b8"},
+    {file = "pandas-1.2.0-cp39-cp39-win32.whl", hash = "sha256:2d8b4f532db37418121831a461fd107d826c240b098f52e7a1b4ab3d5aaa4fb2"},
+    {file = "pandas-1.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:616478c1bd8fe1e600f521ae2da434e021c11e7a4e5da3451d02906143d3629a"},
+    {file = "pandas-1.2.0.tar.gz", hash = "sha256:e03386615b970b8b41da6a68afe717626741bb2431cec993640685614c0680e4"},
+]
 pandocfilters = [
     {file = "pandocfilters-1.4.3.tar.gz", hash = "sha256:bc63fbb50534b4b1f8ebe1860889289e8af94a23bff7445259592df25a3906eb"},
 ]
@@ -1194,6 +1446,33 @@ py = [
     {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"},
     {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"},
 ]
+pyarrow = [
+    {file = "pyarrow-2.0.0-cp35-cp35m-macosx_10_13_intel.whl", hash = "sha256:6afc71cc9c234f3cdbe971297468755ec3392966cb19d3a6caf42fd7dbc6aaa9"},
+    {file = "pyarrow-2.0.0-cp35-cp35m-macosx_10_9_intel.whl", hash = "sha256:eb05038b750a6e16a9680f9d2c40d050796284ea1f94690da8f4f28805af0495"},
+    {file = "pyarrow-2.0.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:3e33e9003794c9062f4c963a10f2a0d787b83d4d1a517a375294f2293180b778"},
+    {file = "pyarrow-2.0.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:ffb306951b5925a0638dc2ef1ab7ce8033f39e5b4e0fef5787b91ef4fa7da19d"},
+    {file = "pyarrow-2.0.0-cp35-cp35m-manylinux2014_x86_64.whl", hash = "sha256:dc0d04c42632e65c4fcbe2f82c70109c5f347652844ead285bc1285dc3a67660"},
+    {file = "pyarrow-2.0.0-cp35-cp35m-win_amd64.whl", hash = "sha256:916b593a24f2812b9a75adef1143b1dd89d799e1803282fea2829c5dc0b828ea"},
+    {file = "pyarrow-2.0.0-cp36-cp36m-macosx_10_13_x86_64.whl", hash = "sha256:c801e59ec4e8d9d871e299726a528c3ba3139f2ce2d9cdab101f8483c52eec7c"},
+    {file = "pyarrow-2.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0bf43e520c33ceb1dd47263a5326830fca65f18d827f7f7b8fe7e64fc4364d88"},
+    {file = "pyarrow-2.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0b358773eb9fb1b31c8217c6c8c0b4681c3dff80562dc23ad5b379f0279dad69"},
+    {file = "pyarrow-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:1000e491e9a539588ec33a2c2603cf05f1d4629aef375345bfd64f2ab7bc8529"},
+    {file = "pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:ce0462cec7f81c4ff87ce1a95c82a8d467606dce6c72e92906ac251c6115f32b"},
+    {file = "pyarrow-2.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:16ec87163a2fb4abd48bf79cbdf70a7455faa83740e067c2280cfa45a63ed1f3"},
+    {file = "pyarrow-2.0.0-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:acdd18fd83c0be0b53a8e734c0a650fb27bbf4e7d96a8f7eb0a7506ea58bd594"},
+    {file = "pyarrow-2.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9a8d3c6baa6e159017d97e8a028ae9eaa2811d8f1ab3d22710c04dcddc0dd7a1"},
+    {file = "pyarrow-2.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:652c5dff97624375ed0f97cc8ad6f88ee01953f15c17083917735de171f03fe0"},
+    {file = "pyarrow-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:00d8fb8a9b2d9bb2f0ced2765b62c5d72689eed06c47315bca004584b0ccda60"},
+    {file = "pyarrow-2.0.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:fb69672e69e1b752744ee1e236fdf03aad78ffec905fc5c19adbaf88bac4d0fd"},
+    {file = "pyarrow-2.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:ccff3a72f70ebfcc002bf75f5ad1248065e5c9c14e0dcfa599a438ea221c5658"},
+    {file = "pyarrow-2.0.0-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:bc8c3713086e4a137b3fda4b149440458b1b0bd72f67b1afa2c7068df1edc060"},
+    {file = "pyarrow-2.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9f4ba9ab479c0172e532f5d73c68e30a31c16b01e09bb21eba9201561231f722"},
+    {file = "pyarrow-2.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:0db5156a66615591a4a8c66a9a30890a364a259de8d2a6ccb873c7d1740e6c75"},
+    {file = "pyarrow-2.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:cf9bf10daadbbf1a360ac1c7dab0b4f8381d81a3f452737bd6ed310d57a88be8"},
+    {file = "pyarrow-2.0.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:dd661b6598ce566c6f41d31cc1fc4482308613c2c0c808bd8db33b0643192f84"},
+    {file = "pyarrow-2.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:14b02a629986c25e045f81771799e07a8bb3f339898c111314066436769a3dd4"},
+    {file = "pyarrow-2.0.0.tar.gz", hash = "sha256:b5e6cd217457e8febcc98a6c279b96f72d5c31a24cd2bffd8d3b2da701d2025c"},
+]
 pycparser = [
     {file = "pycparser-2.20-py2.py3-none-any.whl", hash = "sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"},
     {file = "pycparser-2.20.tar.gz", hash = "sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0"},
@@ -1217,6 +1496,10 @@ python-dateutil = [
     {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"},
     {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"},
 ]
+pytz = [
+    {file = "pytz-2020.5-py2.py3-none-any.whl", hash = "sha256:16962c5fb8db4a8f63a26646d8886e9d769b6c511543557bc84e9569fb9a9cb4"},
+    {file = "pytz-2020.5.tar.gz", hash = "sha256:180befebb1927b16f6b57101720075a984c019ac16b1b7575673bea42c6c3da5"},
+]
 pywin32 = [
     {file = "pywin32-300-cp35-cp35m-win32.whl", hash = "sha256:1c204a81daed2089e55d11eefa4826c05e604d27fe2be40b6bf8db7b6a39da63"},
     {file = "pywin32-300-cp35-cp35m-win_amd64.whl", hash = "sha256:350c5644775736351b77ba68da09a39c760d75d2467ecec37bd3c36a94fbed64"},
@@ -1319,10 +1602,61 @@ requests = [
     {file = "requests-2.25.1-py2.py3-none-any.whl", hash = "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"},
     {file = "requests-2.25.1.tar.gz", hash = "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804"},
 ]
+scikit-learn = [
+    {file = "scikit-learn-0.24.0.tar.gz", hash = "sha256:076369634ee72b5a5941440661e2f306ff4ac30903802dc52031c7e9199ac640"},
+    {file = "scikit_learn-0.24.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:890d7d588f65acb0c4f6c083347c9076916bda5e6bd8400f06244b1afc1009af"},
+    {file = "scikit_learn-0.24.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:e534f5f3796db6781c87e9835dcd51b7854c8c5a379c9210b93605965c1941fd"},
+    {file = "scikit_learn-0.24.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:d7fe05fcb44eadd6d6c874c768f085f5de1239db3a3b7be4d3d23d12e4120589"},
+    {file = "scikit_learn-0.24.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:7f654befc5ad413690cc58f3f34a3e906caf825195ce0fda00a8e9565e1403e6"},
+    {file = "scikit_learn-0.24.0-cp36-cp36m-win32.whl", hash = "sha256:afeb06dc69847927634e58579b9cdc72e1390b79497336b2324b1b173f33bd47"},
+    {file = "scikit_learn-0.24.0-cp36-cp36m-win_amd64.whl", hash = "sha256:26f66b3726b54dfb76ea51c5d9c2431ed17ebc066cb4527662b9e851a3e7ba61"},
+    {file = "scikit_learn-0.24.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c08b27cb78ee8d2dc781a7affed09859441f5b624f9f92da59ac0791c8774dfc"},
+    {file = "scikit_learn-0.24.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:905d8934d1e27a686698864a5863ff2c0e13a2ae1adb78a8a848aacc8a49927d"},
+    {file = "scikit_learn-0.24.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:d819d625832fb2969911a243e009cfa135cb8ef1e150866e417d6e9d75290087"},
+    {file = "scikit_learn-0.24.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:18f7131e62265bf2691ed1d0303c640313894ccfe4278427478c6b2f45094b53"},
+    {file = "scikit_learn-0.24.0-cp37-cp37m-win32.whl", hash = "sha256:b0d13fd56d26cf3de0314a4fd48037108c638fe126d813f5c1222bb0f08b6a76"},
+    {file = "scikit_learn-0.24.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c912247e42114f389858ae05d63f4359d4e667ea72aaabee191aee9ad3f9774a"},
+    {file = "scikit_learn-0.24.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:758619e49cd7c17282e6cc60d5cc73c02c072b47c9a10010bb3bb47e0d976e50"},
+    {file = "scikit_learn-0.24.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:66f27bf21202a850bcd7b6303916e4907f6e22ec59a14974ede4955aed5c7ed0"},
+    {file = "scikit_learn-0.24.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:5e6e3c042cea83f2e20a45e563b8eabc1f8f72446251fe23ebefdf111a173a33"},
+    {file = "scikit_learn-0.24.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:2a5348585aa793bc8cc5a72f8e9067c9380834b0aadbd55f924843b071f13282"},
+    {file = "scikit_learn-0.24.0-cp38-cp38-win32.whl", hash = "sha256:743b6edd98c98991be46c08e6b21df3861d5ae915f91d59f988384d93f7263e7"},
+    {file = "scikit_learn-0.24.0-cp38-cp38-win_amd64.whl", hash = "sha256:2951f87d35e72f007701c6e028aa230f6df6212a3194677c0c950486066a454d"},
+    {file = "scikit_learn-0.24.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:44e452ea8491225c5783d49577aad0f36202dfd52aec7f82c0fdfe5fbd5f7400"},
+    {file = "scikit_learn-0.24.0-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:800aaf63f8838c00e85db2267dd226f89858594843fd03932a9eda95746d2c40"},
+    {file = "scikit_learn-0.24.0-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:3eeff086f7329521d27249a082ea3c48c085cedb110db5f65968ab55c3ba2e09"},
+    {file = "scikit_learn-0.24.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:4395e91b3548005f4a645018435b5a94f8cce232b5b70753020e606c6a750656"},
+    {file = "scikit_learn-0.24.0-cp39-cp39-win32.whl", hash = "sha256:80ca024154b84b6ac4cfc86930ba13fdc348a209753bf2c16129db6f9eb8a80b"},
+    {file = "scikit_learn-0.24.0-cp39-cp39-win_amd64.whl", hash = "sha256:490436b44b3a1957cb625e871764b0aa330b34cc416aea4abc6c38ca63d0d682"},
+]
+scipy = [
+    {file = "scipy-1.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3d4303e3e21d07d9557b26a1707bb9fc065510ee8501c9bf22a0157249a82fd0"},
+    {file = "scipy-1.6.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:1bc5b446600c4ff7ab36bade47180673141322f0febaa555f1c433fe04f2a0e3"},
+    {file = "scipy-1.6.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:8840a9adb4ede3751f49761653d3ebf664f25195fdd42ada394ffea8903dd51d"},
+    {file = "scipy-1.6.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:8629135ee00cc2182ac8be8e75643b9f02235942443732c2ed69ab48edcb6614"},
+    {file = "scipy-1.6.0-cp37-cp37m-win32.whl", hash = "sha256:58731bbe0103e96b89b2f41516699db9b63066e4317e31b8402891571f6d358f"},
+    {file = "scipy-1.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:876badc33eec20709d4e042a09834f5953ebdac4088d45a4f3a1f18b56885718"},
+    {file = "scipy-1.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c0911f3180de343643f369dc5cfedad6ba9f939c2d516bddea4a6871eb000722"},
+    {file = "scipy-1.6.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:b8af26839ae343655f3ca377a5d5e5466f1d3b3ac7432a43449154fe958ae0e0"},
+    {file = "scipy-1.6.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:4f1d9cc977ac6a4a63c124045c1e8bf67ec37098f67c699887a93736961a00ae"},
+    {file = "scipy-1.6.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:eb7928275f3560d47e5538e15e9f32b3d64cd30ea8f85f3e82987425476f53f6"},
+    {file = "scipy-1.6.0-cp38-cp38-win32.whl", hash = "sha256:31ab217b5c27ab429d07428a76002b33662f98986095bbce5d55e0788f7e8b15"},
+    {file = "scipy-1.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:2f1c2ebca6fd867160e70102200b1bd07b3b2d31a3e6af3c58d688c15d0d07b7"},
+    {file = "scipy-1.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:155225621df90fcd151e25d51c50217e412de717475999ebb76e17e310176981"},
+    {file = "scipy-1.6.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:f68d5761a2d2376e2b194c8e9192bbf7c51306ca176f1a0889990a52ef0d551f"},
+    {file = "scipy-1.6.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:d902d3a5ad7f28874c0a82db95246d24ca07ad932741df668595fe00a4819870"},
+    {file = "scipy-1.6.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:aef3a2dbc436bbe8f6e0b635f0b5fe5ed024b522eee4637dbbe0b974129ca734"},
+    {file = "scipy-1.6.0-cp39-cp39-win32.whl", hash = "sha256:cdbc47628184a0ebeb5c08f1892614e1bd4a51f6e0d609c6eed253823a960f5b"},
+    {file = "scipy-1.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:313785c4dab65060f9648112d025f6d2fec69a8a889c714328882d678a95f053"},
+    {file = "scipy-1.6.0.tar.gz", hash = "sha256:cb6dc9f82dfd95f6b9032a8d7ea70efeeb15d5b5fd6ed4e8537bb3c673580566"},
+]
 send2trash = [
     {file = "Send2Trash-1.5.0-py3-none-any.whl", hash = "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b"},
     {file = "Send2Trash-1.5.0.tar.gz", hash = "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2"},
 ]
+seqeval = [
+    {file = "seqeval-1.2.2.tar.gz", hash = "sha256:f28e97c3ab96d6fcd32b648f6438ff2e09cfba87f05939da9b3970713ec56e6f"},
+]
 six = [
     {file = "six-1.15.0-py2.py3-none-any.whl", hash = "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"},
     {file = "six-1.15.0.tar.gz", hash = "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259"},
@@ -1335,6 +1669,10 @@ testpath = [
     {file = "testpath-0.4.4-py2.py3-none-any.whl", hash = "sha256:bfcf9411ef4bf3db7579063e0546938b1edda3d69f4e1fb8756991f5951f85d4"},
     {file = "testpath-0.4.4.tar.gz", hash = "sha256:60e0a3261c149755f4399a1fff7d37523179a70fdc3abdf78de9fc2604aeec7e"},
 ]
+threadpoolctl = [
+    {file = "threadpoolctl-2.1.0-py3-none-any.whl", hash = "sha256:38b74ca20ff3bb42caca8b00055111d74159ee95c4370882bbff2b93d24da725"},
+    {file = "threadpoolctl-2.1.0.tar.gz", hash = "sha256:ddc57c96a38beb63db45d6c159b5ab07b6bced12c45a1f07b2b92f272aebfa6b"},
+]
 toml = [
     {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"},
     {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
@@ -1382,6 +1720,10 @@ tornado = [
     {file = "tornado-6.1-cp39-cp39-win_amd64.whl", hash = "sha256:548430be2740e327b3fe0201abe471f314741efcb0067ec4f2d7dcfb4825f3e4"},
     {file = "tornado-6.1.tar.gz", hash = "sha256:33c6e81d7bd55b468d2e793517c909b139960b6c790a60b7991b9b6b76fb9791"},
 ]
+tqdm = [
+    {file = "tqdm-4.49.0-py2.py3-none-any.whl", hash = "sha256:8f3c5815e3b5e20bc40463fa6b42a352178859692a68ffaa469706e6d38342a5"},
+    {file = "tqdm-4.49.0.tar.gz", hash = "sha256:faf9c671bd3fad5ebaeee366949d969dca2b2be32c872a7092a1e1a9048d105b"},
+]
 traitlets = [
     {file = "traitlets-5.0.5-py3-none-any.whl", hash = "sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426"},
     {file = "traitlets-5.0.5.tar.gz", hash = "sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396"},
@@ -1435,3 +1777,55 @@ webencodings = [
     {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"},
     {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"},
 ]
+xxhash = [
+    {file = "xxhash-2.0.0-cp27-cp27m-macosx_10_6_intel.whl", hash = "sha256:df8d1ebdef86bd5d772d81c91d5d111a5ee8e4b68b8fc6b6edfa5aa825dd2a3d"},
+    {file = "xxhash-2.0.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:f01c59f5bad2e46bb4235b71b36c56be353f08b6d514a3bd0deb9bf56e4b180a"},
+    {file = "xxhash-2.0.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:cb4feeb8881eb89b9ddd0fae797deb078ebdaad6b1ae6c185b9993d241ed365a"},
+    {file = "xxhash-2.0.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:2912d7810bcf7e39b3929fb186fe46ff83b1bd4a3d6b7eba956d57fa1516ac0c"},
+    {file = "xxhash-2.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:48b99c55fc643b32f5efca9c35fcaac6ea553958cf503e202c10eb62718e7a0e"},
+    {file = "xxhash-2.0.0-cp27-cp27m-win32.whl", hash = "sha256:3221f1a5bc2ee1f150b84a0c4c7cddc7724aaa01460f3353cf63fd667d89f593"},
+    {file = "xxhash-2.0.0-cp27-cp27m-win_amd64.whl", hash = "sha256:cba4b6d174b524623ac8b64bda734601d574f95033f87ddf9c495c69a70135e8"},
+    {file = "xxhash-2.0.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:b94f13f4f946500f3cc78f11da4ec4b340bd92c5200b5fe4e6aeac96064aa1fd"},
+    {file = "xxhash-2.0.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:635b1d7fa85d215112f41d089bd113ac139f6a42769fcc49c73e779904160f7f"},
+    {file = "xxhash-2.0.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:0f5f1b9ae8e2cf2ff606018769f7e46147df70291312f64e1b80d10482ca8c0b"},
+    {file = "xxhash-2.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:8f90deec6567a38e1da29feff36973468691e309b2db8235e64936e61df77c43"},
+    {file = "xxhash-2.0.0-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:8b7e930a60dfe7380e52466aa27941290dd575a5750c622158c86941797eaa1b"},
+    {file = "xxhash-2.0.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:44b26872fd63f1eaf1ab527817aebbd455a3fdcbd56ff6df74fd42a6a137cff4"},
+    {file = "xxhash-2.0.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:3d25b540148f1ebf4852e4115f3f4819b585ecd36f121a1f388e8966d69d3a1c"},
+    {file = "xxhash-2.0.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:bcd1e9f3ba8df23edefe1d0a886f16b4e27602acbd8575b39540fea26e1aa6d2"},
+    {file = "xxhash-2.0.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:fc03a399205268815742125b17d967afa9f23b08cdafe185e41368cf7ba9b278"},
+    {file = "xxhash-2.0.0-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:bdbc195231c87d63b0503785d9c5264f4275a92da41d9f28fdf08fb321453356"},
+    {file = "xxhash-2.0.0-cp35-cp35m-win32.whl", hash = "sha256:7291392bdb1d38c44557dfd3fcd4fd04c363a696dbfa7e6592700a31e4ff6657"},
+    {file = "xxhash-2.0.0-cp35-cp35m-win_amd64.whl", hash = "sha256:e0fc170c3a00ca008d992c2e6324da3f1467b30044b5835d2feb27870645d38c"},
+    {file = "xxhash-2.0.0-cp36-cp36m-macosx_10_6_intel.whl", hash = "sha256:5b3c0c84187556d463626ceed85f0d735a5b8ea1678da3e858d3934f38f23915"},
+    {file = "xxhash-2.0.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:2f0ca6673fcbae988389576a779c00a62a28718a18ddc7b2e5b32d7fb30c6f98"},
+    {file = "xxhash-2.0.0-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:d1859d54837af16ae2a7975477e619793ac698a374d909f533e317c3b384b223"},
+    {file = "xxhash-2.0.0-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:9d0311fcd78dabe04ab3b4034659628b00ac220e77e37648f73aebbf4cb13680"},
+    {file = "xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:0ecea927fd3df8f3f3a1d6e5bc85838eb44a69ea2f4c9263dfd0f68c4e17e483"},
+    {file = "xxhash-2.0.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:4167f22b037e128820f7642ecc1fbf1b4b4956346093a2e75081bee82b9cfb7e"},
+    {file = "xxhash-2.0.0-cp36-cp36m-win32.whl", hash = "sha256:85c5de6c56335b75beef2cba713f95a1b62422be5e27dad30b5083419c6839c4"},
+    {file = "xxhash-2.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:ade1c356acd0b0454a3d3cf42442afe7ad0f46fc944ea1e84720b3858bfdb772"},
+    {file = "xxhash-2.0.0-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:fca7d0fb6fde33d1ac5f97298f44e711e5fe1b4587832864be8c6545cb072a54"},
+    {file = "xxhash-2.0.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e296b0dee072a54c40c04f09ca35bb9902bb74b54f0fffeafabfc937b3ec85f9"},
+    {file = "xxhash-2.0.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:02476c5cef803cfd1350662b1e543e47ad64bd5f7f792033d94d590f9674da11"},
+    {file = "xxhash-2.0.0-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:28c1f0bb6dadc11162d1f2e203d7a12d38b511b87fbb5ffa729594fd456f48e6"},
+    {file = "xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:922ae5b1efa1f9a9cc959f7197113a623ad110853622e990433242a9d8d00d5c"},
+    {file = "xxhash-2.0.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:7709bc8a5e30c74b07203553f33232531e7739458f72204908cedb08a00bd546"},
+    {file = "xxhash-2.0.0-cp37-cp37m-win32.whl", hash = "sha256:fb3c9760598009b1d8bbe57785e278aeb956efb7372d8f9b0bb43cd46f420dff"},
+    {file = "xxhash-2.0.0-cp37-cp37m-win_amd64.whl", hash = "sha256:3f29f6d455388cc415fe52c0f63f442aaea674cee35a2252d8d4dc8d640938c6"},
+    {file = "xxhash-2.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bf360465dc3d24b1501b799c85815c82ddcfc0ffbcba0232968f3a7cd64306fc"},
+    {file = "xxhash-2.0.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:5d2edbb50025a67f061d09d381c54c7d0948c1572f6c9bd15ee238a303d368d9"},
+    {file = "xxhash-2.0.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7943ede91d8aedfcacb7178b2d881b7498145590206ff61c3e84dc66e6a51d6a"},
+    {file = "xxhash-2.0.0-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:b5c2edb8b0a2acc5bdac984b3177711f206463b970aa03087221771c2b0d8f1d"},
+    {file = "xxhash-2.0.0-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:16e4b7d508bb49b6fc84bf077f2f7f51263b5618cc61f33a64ed43786ec2c6cf"},
+    {file = "xxhash-2.0.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:80903d4ce7337921bbc8e5ac695b45691b43c0a00b21964c76e19ea21b9108ea"},
+    {file = "xxhash-2.0.0-cp38-cp38-win32.whl", hash = "sha256:e37b25182e969212d5aec60a8da7d1e6a960dbffdb9ba4c63e2240de3605c184"},
+    {file = "xxhash-2.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:fabee25186b6649bbf6ff258f23941339902374786f8317b0422144ddaa505df"},
+    {file = "xxhash-2.0.0-pp27-pypy_73-manylinux1_x86_64.whl", hash = "sha256:be93004b832717234a7d2f47dc555428ab1e8712f99cad7d212cebe0e27d3d48"},
+    {file = "xxhash-2.0.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:1b86f49b36c25ebdbd1b5539d428a37d9051ad49eb576a3edd964a8770bc8f3a"},
+    {file = "xxhash-2.0.0-pp27-pypy_73-win32.whl", hash = "sha256:bde4d39997de901d0a66ebd631b34f9cf106676fec0878f36b7baf630cb3965a"},
+    {file = "xxhash-2.0.0-pp36-pypy36_pp73-manylinux1_x86_64.whl", hash = "sha256:99b5412a3eddb1aa9aaf36cdbf93be4eca99ad83ff8c692672fdeedc7fb597de"},
+    {file = "xxhash-2.0.0-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:33c4832e689f429539d70baf69162b41dfbabc7f31ca542b5b772cb8a55e7a79"},
+    {file = "xxhash-2.0.0-pp36-pypy36_pp73-win32.whl", hash = "sha256:82034c9ed54db20f051133cba01de959b5208fe2900e67ebb4c9631f1fd523fd"},
+    {file = "xxhash-2.0.0.tar.gz", hash = "sha256:58ca818554c1476fa1456f6cd4b87002e2294f09baf0f81e5a2a4968e62c423c"},
+]
diff --git a/pyproject.toml b/pyproject.toml
index ba1133b..e7408ed 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,8 @@ isort = "^5.6.4"
 pytest = "^6.1.2"
 cython = "^0.29.21"
 jupyterlab = "^2.2.9"
+datasets = "^1.2.0"
+seqeval = "^1.2.2"
 
 [tool.isort]
 line_length = 88

From 754f6f181c4545b7a70eb32c937fad9280372a76 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 22:41:18 +0100
Subject: [PATCH 16/22] chore: add example

---
 examples/README.md   |  23 +++++++
 examples/conll.py    | 140 +++++++++++++++++++++++++++++++++++++++++++
 examples/training.py |  30 ----------
 3 files changed, 163 insertions(+), 30 deletions(-)
 create mode 100644 examples/README.md
 create mode 100644 examples/conll.py
 delete mode 100644 examples/training.py

diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..d19b1e6
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,23 @@
+datasets
+seqeval
+
+
+## How it works
+
+
+
+```
+["John", "Lennon"]
+```
+
+becomes
+
+```
+{"text": "john", "is_capitalized": True}
+```
+
+becomes
+
+```
+{"text": "john", "is_capitalized": True, "text+1": "lennon", "is_capitalized+1": True}
+```
diff --git a/examples/conll.py b/examples/conll.py
new file mode 100644
index 0000000..6388f74
--- /dev/null
+++ b/examples/conll.py
@@ -0,0 +1,140 @@
+from typing import Any, Dict, List, Union
+
+import datasets
+from seqeval.metrics import classification_report
+
+import chaine
+from chaine.logging import Logger
+
+Sentence = List[str]
+Tags = List[str]
+Features = Dict[str, Union[float, int, str, bool]]
+Dataset = Dict[str, Dict[str, Any]]
+
+LOGGER = Logger(__name__)
+
+
+def featurize_token(token_index: int, sentence: Sentence, pos_tags: Tags) -> Features:
+    """Extract features from a token in a sentence
+
+    Parameters
+    ----------
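+    token_index : int
+        Index of the token in the sentence to extract features from
+    sentence : Sentence
+        Tokens of the sentence the token belongs to
+    pos_tags : Tags
+        Part-of-speech tags of the sentence, one per token
+
+    Returns
+    -------
+    Features
+        Features of the token and of its direct neighbors in the sentence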
+    """
+    token = sentence[token_index]
+    pos_tag = pos_tags[token_index]
+
+    features = {
+        "bias": 1.0,
+        "token.lower()": token.lower(),
+        "token[-3:]": token[-3:],
+        "token[-2:]": token[-2:],
+        "token.isupper()": token.isupper(),
+        "token.istitle()": token.istitle(),
+        "token.isdigit()": token.isdigit(),
+        "pos_tag": pos_tag,
+    }
+    if token_index > 0:
+        previous_token = sentence[token_index - 1]
+        previous_pos_tag = pos_tags[token_index - 1]
+        features.update(
+            {
+                "-1:token.lower()": previous_token.lower(),
+                "-1:token.istitle()": previous_token.istitle(),
+                "-1:token.isupper()": previous_token.isupper(),
+                "-1:pos_tag": previous_pos_tag,
+            }
+        )
+    else:
+        features["BOS"] = True
+
+    if token_index < len(sentence) - 1:
+        next_token = sentence[token_index + 1]
+        next_pos_tag = pos_tags[token_index + 1]
+        features.update(
+            {
+                "+1:token.lower()": next_token.lower(),
+                "+1:token.istitle()": next_token.istitle(),
+                "+1:token.isupper()": next_token.isupper(),
+                "+1:pos_tag": next_pos_tag,
+            }
+        )
+    else:
+        features["EOS"] = True
+
+    return features
+
+
+def featurize_sentence(sentence: List[str], pos_tags: List[str]) -> List[Features]:
+    """Extract features from tokens in a sentence
+
+    Parameters
+    ----------
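+    sentence : Sentence
+        Tokens of the sentence
+    pos_tags : Tags
+        Part-of-speech tags of the sentence, one per token
+
+    Returns
+    -------
+    List[Features]
+        One feature dictionary per token, in sentence order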
+    """
+    return [
+        featurize_token(token_index, sentence, pos_tags)
+        for token_index in range(len(sentence))
+    ]
+
+
+def featurize_dataset(dataset: Dataset) -> List[List[Features]]:
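+    """Extract features from the tokens of all sentences in a dataset
+
+    Parameters
+    ----------
+    dataset : Dataset
+        Dataset split providing the columns "tokens" and "pos_tags"
+
+    Returns
+    -------
+    List[List[Features]]
+        One list of token features per sentence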
+    """
+    return [
+        featurize_sentence(sentence, pos_tags)
+        for sentence, pos_tags in zip(dataset["tokens"], dataset["pos_tags"])
+    ]
+
+def preprocess_labels(dataset: Dataset) -> List[List[str]]:
+    labels = dataset.features["ner_tags"].feature.names
+    return [[labels[index] for index in indices] for indices in dataset["ner_tags"]]
+
+
+if __name__ == "__main__":
+    LOGGER.info("Loading raw dataset")
+    dataset = datasets.load_dataset("conll2003")
+
+    LOGGER.info("Extracting features from dataset for training")
+    sentences = featurize_dataset(dataset["train"])
+    labels = preprocess_labels(dataset["train"])
+
+    model = chaine.train(sentences, labels)
+
+    LOGGER.info("Extracting features from dataset for evaluation")
+    sentences = featurize_dataset(dataset["test"])
+    labels = preprocess_labels(dataset["test"])
+
+    LOGGER.info("Evaluating the model")
+    predictions = model.predict(sentences)
+
+    print("\nEvaluation:")
+    print(classification_report(labels, predictions))
diff --git a/examples/training.py b/examples/training.py
deleted file mode 100644
index e16bf08..0000000
--- a/examples/training.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import chaine
-from chaine.data import Token
-from flair.models import SequenceTagger
-from flair.data import Sentence
-import tqdm
-import datasets
-
-TAGGER = SequenceTagger.load("pos-multi-fast")
-DATASET = datasets.load_dataset("germaner")
-
-
-def preprocess(dataset):
-    for tokens in tqdm.tqdm(dataset):
-        sentence = Sentence(" ".join(tokens), use_tokenizer=False)
-        TAGGER.predict(sentence)
-        pos_tags = [token.get_tag("upos").value for token in sentence]
-        features = [Token(i, text).features for i, text in enumerate(tokens)]
-        for token, pos in zip(features, pos_tags):
-            token["pos"] = pos
-        yield features
-
-
-
-
-
-if __name__ == "__main__":
-    tokens = preprocess(DATASET["train"]["tokens"][:10])
-    labels = DATASET["train"]["ner_tags"][:10]
-
-    model = chaine.train(tokens, labels)

From e1e111ebb100d581f0c6c4a00c1b0333f2844146 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 22:41:28 +0100
Subject: [PATCH 17/22] chore: update readme

---
 README.md | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 53a9176..85bfe6c 100644
--- a/README.md
+++ b/README.md
@@ -28,17 +28,12 @@ If you are interested in the theoretical concepts behind conditional random fiel
 [['B-PER', 'I-PER', 'O', 'O', 'O', 'B-LOC']]
 ```
 
-Check out the introducing [Jupyter notebook](https://github.com/severinsimmler/chaine/blob/master/notebooks/tutorial.ipynb).
-
-
-## How it works
-
-
+Check out the [examples](https://github.com/severinsimmler/chaine/blob/master/examples).
 
 
 ## Credits
 
-This library makes use of and is partially based on:
+This project makes use of and is partially based on:
 
 - [CRFsuite](https://github.com/chokkan/crfsuite)
 - [libLBFGS](https://github.com/chokkan/liblbfgs)

From 0b389ff6da734faa4c72fac000b0dbdbdd8484e1 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 22:43:11 +0100
Subject: [PATCH 18/22] chore: update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 85bfe6c..363763b 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 Linear-chain conditional random fields for natural language processing.
 
-Chaine is a modern Python library without third-party dependencies and a backend written in C. You can train conditional random fields for natural language processing tasks like [named entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) or [part-of-speech tagging](https://en.wikipedia.org/wiki/Part-of-speech_tagging).
+Chaine is a modern Python library without third-party dependencies and a backend written in C. You can train conditional random fields for natural language processing tasks like [named entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition).
 
 - **Lightweight**: No use of bloated third-party libraries.
 - **Fast**: Performance critical parts are written in C and thus [blazingly fast](http://www.chokkan.org/software/crfsuite/benchmark.html).

From 12459002f8fb167fdc5c9f6299d303b8e0cd7791 Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 22:44:22 +0100
Subject: [PATCH 19/22] chore: update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 363763b..cf466e0 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ You can install the latest stable version from [PyPI](https://pypi.org/project/c
 $ pip install chaine
 ```
 
-If you are interested in the theoretical concepts behind conditional random fields, please refer to the introducing paper by [Lafferty et al](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers).
+Please refer to the introducing paper by [Lafferty et al](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers) for the theoretical concepts behind conditional random fields.
 
 
 ## Minimal working example

From fc97158d3f94d690bf1e21e54db5d30cd23d03fd Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 22:44:40 +0100
Subject: [PATCH 20/22] chore: update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cf466e0..31fd141 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ You can install the latest stable version from [PyPI](https://pypi.org/project/c
 $ pip install chaine
 ```
 
-Please refer to the introducing paper by [Lafferty et al](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers) for the theoretical concepts behind conditional random fields.
+Please refer to the introducing paper by [Lafferty et al.](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers) for the theoretical concepts behind conditional random fields.
 
 
 ## Minimal working example

From 7deff0501a91304f645bd96af32d11ac347d4b6b Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 22:45:23 +0100
Subject: [PATCH 21/22] chore: update readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 31fd141..e20fbb7 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Please refer to the introducing paper by [Lafferty et al.](https://repository.up
 [['B-PER', 'I-PER', 'O', 'O', 'O', 'B-LOC']]
 ```
 
-Check out the [examples](https://github.com/severinsimmler/chaine/blob/master/examples).
+Check out the [examples](https://github.com/severinsimmler/chaine/blob/master/examples) for a more real-world use case.
 
 
 ## Credits

From bce8633a69606fce15604d418496381c2ddf021c Mon Sep 17 00:00:00 2001
From: severinsimmler <severin.simmler@posteo.de>
Date: Wed, 6 Jan 2021 22:46:41 +0100
Subject: [PATCH 22/22] fix: docstring

---
 chaine/api.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/chaine/api.py b/chaine/api.py
index 4878d4e..e5eb8aa 100644
--- a/chaine/api.py
+++ b/chaine/api.py
@@ -1,8 +1,8 @@
 """
-chaine.training
-~~~~~~~~~~~~~~~
+chaine.api
+~~~~~~~~~~
 
-This module implements the high-level API to train a CRF
+This module implements the high-level API to train a conditional random field
 """
 
 from chaine.crf import Model, Trainer