From 4e42226337c5e765b5dcacea6439e96c7edcc6b3 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 10:56:04 +0100 Subject: [PATCH 001/102] Prototype for retagging using spacy --- grants_tagger_light/cli.py | 2 + .../preprocessing/preprocess_mesh.py | 5 +- grants_tagger_light/retagging/__init__.py | 8 + grants_tagger_light/retagging/retagging.py | 184 +++++++ poetry.lock | 464 +++++++++++++++++- pyproject.toml | 1 + 6 files changed, 660 insertions(+), 4 deletions(-) create mode 100644 grants_tagger_light/retagging/__init__.py create mode 100644 grants_tagger_light/retagging/retagging.py diff --git a/grants_tagger_light/cli.py b/grants_tagger_light/cli.py index 62b9cf2a..2449a499 100644 --- a/grants_tagger_light/cli.py +++ b/grants_tagger_light/cli.py @@ -5,6 +5,7 @@ from grants_tagger_light.augmentation import augment_app from grants_tagger_light.download_epmc import download_epmc_cli from grants_tagger_light.evaluation import evaluate_app +from grants_tagger_light.retagging import retag_app from grants_tagger_light.predict import predict_cli from grants_tagger_light.preprocessing import preprocess_app from grants_tagger_light.tune_threshold import tune_threshold_cli @@ -18,6 +19,7 @@ app.add_typer(preprocess_app, name="preprocess") app.add_typer(augment_app, name="augment") app.add_typer(evaluate_app, name="evaluate") +app.add_typer(retag_app, name="retag") app.command("predict")(predict_cli) diff --git a/grants_tagger_light/preprocessing/preprocess_mesh.py b/grants_tagger_light/preprocessing/preprocess_mesh.py index ef1ca963..10401c6c 100644 --- a/grants_tagger_light/preprocessing/preprocess_mesh.py +++ b/grants_tagger_light/preprocessing/preprocess_mesh.py @@ -117,8 +117,7 @@ def preprocess_mesh( batch_size=batch_size, num_proc=num_proc, desc="Tokenizing", - fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"}, - load_from_cache_file=False, + fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"} ) logger.info("Time taken to 
tokenize: {}".format(time.time() - t1)) @@ -261,7 +260,7 @@ def preprocess_mesh_cli( if not data_path.endswith("jsonl"): logger.error( "It seems your input MeSH data is not in `jsonl` format. " - "Please, run first `scripts/mesh_json_to_jsonlpy.`" + "Please, run first `scripts/mesh_json_to_jsonl.py.`" ) exit(-1) diff --git a/grants_tagger_light/retagging/__init__.py b/grants_tagger_light/retagging/__init__.py new file mode 100644 index 00000000..004cd5cd --- /dev/null +++ b/grants_tagger_light/retagging/__init__.py @@ -0,0 +1,8 @@ +import typer +from .retagging import retag_cli + +retag_app = typer.Typer() +retag_app.command( + "mesh", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +)(retag_cli) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py new file mode 100644 index 00000000..d1fb3939 --- /dev/null +++ b/grants_tagger_light/retagging/retagging.py @@ -0,0 +1,184 @@ +import json +import logging +import multiprocessing +import os +import random + +import typer +from loguru import logger +import numpy as np + +from datasets import load_dataset + +from datasets import load_from_disk +import spacy +import spacy.cli +from spacy.util import minibatch, compounding + +retag_app = typer.Typer() + + +def _load_data(dset: list[str], limit=100, split=0.8): + """Load data from the IMDB dataset.""" + # Partition off part of the train data for evaluation + random.Random(42).shuffle(dset) + train_size = int(split * len(dset)) + test_size = int( (1-split) * len(dset)) + train_dset = dset[:train_size][:limit] + test_dset = dset[train_size:train_size+test_size][:limit] + return train_dset, test_dset + + +def evaluate(tokenizer, textcat, texts, cats): + docs = (tokenizer(text) for text in texts) + tp = 1e-8 # True positives + fp = 1e-8 # False positives + fn = 1e-8 # False negatives + tn = 1e-8 # True negatives + for i, doc in enumerate(textcat.pipe(docs)): + gold = cats[i] + for label, score in 
doc.cats.items(): + if label not in gold: + continue + if score >= 0.5 and gold[label] >= 0.5: + tp += 1. + elif score >= 0.5 and gold[label] < 0.5: + fp += 1. + elif score < 0.5 and gold[label] < 0.5: + tn += 1 + elif score < 0.5 and gold[label] >= 0.5: + fn += 1 + precision = tp / (tp + fp) + recall = tp / (tp + fn) + f_score = 2 * (precision * recall) / (precision + recall) + return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score} + +def retag( + data_path: str, + save_to_path: str, + model_key: str = "gpt-3.5-turbo", + num_proc: int = os.cpu_count(), + batch_size: int = 64, + concurrent_calls: int = os.cpu_count() * 2, + tags_file_path: str = None, +): + if model_key.strip().lower() not in ["gpt-3.5-turbo", "text-davinci", "gpt-4"]: + raise NotImplementedError( + f"{model_key} not implemented as an augmentation framework" + ) + + # We only have 1 file, so no sharding is available https://huggingface.co/docs/datasets/loading#multiprocessing + logging.info("Loading the MeSH jsonl...") + dset = load_dataset("json", data_files=data_path, num_proc=1) + if "train" in dset: + dset = dset["train"] + + with open(tags_file_path, 'r') as f: + tags = [x.strip() for x in f.readlines()] + + for tag in tags: + print(tag) + nlp = spacy.load("en_core_web_sm") + + textcat = nlp.create_pipe("textcat") + nlp.add_pipe("textcat", last=True) + + textcat.add_label("POSITIVE") + + logging.info(f"Obtaining positive examples for {tag}...") + positive_dset = dset.filter( + lambda x: tag in x["meshMajor"], num_proc=num_proc + ) + pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=100, split=0.8) + + train_data = list(zip(pos_x_train, [{'cats': {'POSITIVE': True}}])) + + logging.info(f"Obtaining negative examples for {tag}...") + negative_dset = dset.filter( + lambda x: tag not in x["meshMajor"], num_proc=num_proc + ) + neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=100, split=0.8) + + 
train_data.extend(list(zip(neg_x_train, [{'cats': {'POSITIVE': False}}]))) + + test = pos_x_test + test_cats = [{'cats': {'POSITIVE': True}} for _ in range(len(pos_x_test))] + test.extend(neg_x_test) + test_cats.extend([{'cats': {'POSITIVE': False}} for _ in range(len(neg_x_test))]) + + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] + + n_iter = 1 + with nlp.disable_pipes(*other_pipes): # only train textcat + optimizer = nlp.begin_training() + print("Training the model...") + print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) + for i in range(n_iter): + losses = {} + + # batch up the examples using spaCy's minibatch + batches = minibatch(train_data, size=compounding(4., 32., 1.001)) + for batch in batches: + texts, annotations = zip(*batch) + nlp.update(texts, annotations, sgd=optimizer, drop=0.2, + losses=losses) + with textcat.model.use_params(optimizer.averages): + # evaluate on the dev data split off in load_data() + scores = evaluate(nlp.tokenizer, textcat, test, test_cats) + print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table + .format(losses['textcat'], scores['textcat_p'], + scores['textcat_r'], scores['textcat_f'])) + break + +@retag_app.command() +def retag_cli( + data_path: str = typer.Argument(..., help="Path to mesh.jsonl"), + save_to_path: str = typer.Argument( + ..., help="Path where to save the retagged data" + ), + model_key: str = typer.Option( + "gpt-3.5-turbo", + help="LLM to use data augmentation. 
By now, only `openai` is supported", + ), + num_proc: int = typer.Option( + os.cpu_count(), help="Number of processes to use for data augmentation" + ), + batch_size: int = typer.Option( + 64, help="Preprocessing batch size (for dataset, filter, map, ...)" + ), + concurrent_calls: int = typer.Option( + os.cpu_count() * 2, + min=1, + help="Concurrent calls with 1 tag each to the different model", + ), + tags_file_path: str = typer.Option( + None, + help="Text file containing one line per tag to be considered. " + "The rest will be discarded.", + ), +): + if not data_path.endswith("jsonl"): + logger.error( + "It seems your input MeSH data is not in `jsonl` format. " + "Please, run first `scripts/mesh_json_to_jsonl.py.`" + ) + exit(-1) + + if tags_file_path is None: + logger.error( + "To understand which tags need to be augmented set the path to the tags file in --tags-file-path" + ) + exit(-1) + + spacy.cli.download("en_core_web_sm") + + retag( + data_path, + save_to_path, + model_key=model_key, + num_proc=num_proc, + batch_size=batch_size, + concurrent_calls=concurrent_calls, + tags_file_path=tags_file_path, + ) diff --git a/poetry.lock b/poetry.lock index a98613d6..782ece71 100644 --- a/poetry.lock +++ b/poetry.lock @@ -465,6 +465,46 @@ d = ["aiohttp (>=3.7.4)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "blis" +version = "0.7.10" +description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension." 
+optional = false +python-versions = "*" +files = [ + {file = "blis-0.7.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1fb4a9fca42d56533e28bf62b740f5c7d122e804742e5ea24b2704950151ae3c"}, + {file = "blis-0.7.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2167e656d6237443ef7d0cd7dcfbedc12fcd156c54112f2dc5ca9b0249ec835d"}, + {file = "blis-0.7.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a887165f2d7c08814dc92f96535232ca628e3e27927fb09cdeb8492781a28d04"}, + {file = "blis-0.7.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31a6a8c347ef764ef268b6e11ae7b47ce83aba7ea99fc9223f85543aaab09826"}, + {file = "blis-0.7.10-cp310-cp310-win_amd64.whl", hash = "sha256:67a17000e953d05f09a1ee7dad001c783ca5d5dc12e40dcfff049b86e74fed67"}, + {file = "blis-0.7.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:67c8270ea20cf7e9342e4e3ed8fd51123a5236b1aa35fa94fb2200a8e11d0081"}, + {file = "blis-0.7.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a86f1d2c6370d571dc88fc710416e8cab7dc6bb3a47ee9f27079ee34adf780d6"}, + {file = "blis-0.7.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:288247c424fd2bd3d43b750f1f54bba19fe2cbb11e5c028bc4762bc03bd54b9b"}, + {file = "blis-0.7.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2846d1a5116a5a1e4c09fa5c3cab6fbe13349c8036bc1c8746a738c556a751c4"}, + {file = "blis-0.7.10-cp311-cp311-win_amd64.whl", hash = "sha256:f5c4a7c0fa67fec5a06fb6c1656bf1b51e7ab414292a04d417512b1fb1247246"}, + {file = "blis-0.7.10-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec3e11e8ed6be18cf43152513bbfeabbc3f99a5d391786642fb7a14fb914ee61"}, + {file = "blis-0.7.10-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:148835c8c96ea4c8957111de0593a28e9044c5b0e4cbcc34b77d700394fa6f13"}, + {file = "blis-0.7.10-cp36-cp36m-win_amd64.whl", hash = 
"sha256:2df3d8703d23c39d8a0fb1e43be4681ec09f9010e08e9b35674fe799046c5fd5"}, + {file = "blis-0.7.10-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fa62e13631c89626365ccd2585a2be154847c5bbb30cfc2ea8fdcf4e83cedd69"}, + {file = "blis-0.7.10-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:adc7c70c5d482ce71c61a6008bcb44dfb15a0ac41ba176c59143f016658fa82d"}, + {file = "blis-0.7.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed4e31d32916f657842572b6640b235c5f2f679a70ec74808160b584c08399ce"}, + {file = "blis-0.7.10-cp37-cp37m-win_amd64.whl", hash = "sha256:9833fc44795c8d43617732df31a8eca9de3f54b181ff9f0008cc50356cc26d86"}, + {file = "blis-0.7.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0cca151d046f8b6b9d075b4f3a5ffee52993424b3080f0e0c2be419f20a477a7"}, + {file = "blis-0.7.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d3bb6c4b9ae45e88e6e69b46eca145858cb9b3cd0a43a6c6812fb34c5c80d871"}, + {file = "blis-0.7.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47c6a0230688ff7c29e31b78f0d207556044c0c84bb90e7c28b009a6765658c4"}, + {file = "blis-0.7.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:953dd85d4a8f79d4d69c17d27a0b783a5664aee0feafa33662199b7c78b0ee51"}, + {file = "blis-0.7.10-cp38-cp38-win_amd64.whl", hash = "sha256:ed181a90fef1edff76220cb883df65685aeca610a0abe22c91322a3300e1e89d"}, + {file = "blis-0.7.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:df7f746159d9ab11f427e00c72abe8de522c1671c7a33ca664739b2bd48b71c2"}, + {file = "blis-0.7.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dd7870a21aed12b25ec8692a75e6965e9451b1b7f2752e2cac4ae9f565d2de95"}, + {file = "blis-0.7.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4766e26721e37e028336b542c226eab9faf812ea2d89a1869531ed0cada6c359"}, + {file = "blis-0.7.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:bc8fac91353f20e747e130bc8d4010442c6700e4c7e5edc38d69bb844802ea81"}, + {file = "blis-0.7.10-cp39-cp39-win_amd64.whl", hash = "sha256:4329fef5b1050c88dbca6f7d87ecc02d56f09005afa60edf12d826d82544f88a"}, + {file = "blis-0.7.10.tar.gz", hash = "sha256:343e8b125784d70ff6e1f17a95ea71538705bf0bd3cc236a176d153590842647"}, +] + +[package.dependencies] +numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} + [[package]] name = "boto3" version = "1.26.161" @@ -503,6 +543,17 @@ urllib3 = ">=1.25.4,<1.27" [package.extras] crt = ["awscrt (==0.16.9)"] +[[package]] +name = "catalogue" +version = "2.0.9" +description = "Super lightweight function registries for your library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "catalogue-2.0.9-py3-none-any.whl", hash = "sha256:5817ce97de17ace366a15eadd4987ac022b28f262006147549cdb3467265dc4d"}, + {file = "catalogue-2.0.9.tar.gz", hash = "sha256:d204c423ec436f2545341ec8a0e026ae033b3ce5911644f95e94d6b887cf631c"}, +] + [[package]] name = "celery" version = "5.3.1" @@ -857,6 +908,21 @@ files = [ [package.extras] test = ["flake8 (==3.7.8)", "hypothesis (==3.55.3)"] +[[package]] +name = "confection" +version = "0.1.2" +description = "The sweetest config system for Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "confection-0.1.2-py3-none-any.whl", hash = "sha256:8bde19143fe36c38ea6e7241dec7be14b8a16e51c9d7ade93d19f72d9f8f1115"}, + {file = "confection-0.1.2.tar.gz", hash = "sha256:7163eb9bdde62cc61a71c6284fb0f0b50b2723b7ef8ab79c7061a7bd659a058e"}, +] + +[package.dependencies] +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +srsly = ">=2.4.0,<3.0.0" + [[package]] name = "configobj" version = "5.0.8" @@ -916,6 +982,43 @@ ssh = ["bcrypt (>=3.1.5)"] test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] +[[package]] +name = "cymem" +version = "2.0.7" +description = "Manage calls to calloc/free 
through Cython" +optional = false +python-versions = "*" +files = [ + {file = "cymem-2.0.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4981fc9182cc1fe54bfedf5f73bfec3ce0c27582d9be71e130c46e35958beef0"}, + {file = "cymem-2.0.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:42aedfd2e77aa0518a24a2a60a2147308903abc8b13c84504af58539c39e52a3"}, + {file = "cymem-2.0.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c183257dc5ab237b664f64156c743e788f562417c74ea58c5a3939fe2d48d6f6"}, + {file = "cymem-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d18250f97eeb13af2e8b19d3cefe4bf743b963d93320b0a2e729771410fd8cf4"}, + {file = "cymem-2.0.7-cp310-cp310-win_amd64.whl", hash = "sha256:864701e626b65eb2256060564ed8eb034ebb0a8f14ce3fbef337e88352cdee9f"}, + {file = "cymem-2.0.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:314273be1f143da674388e0a125d409e2721fbf669c380ae27c5cbae4011e26d"}, + {file = "cymem-2.0.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:df543a36e7000808fe0a03d92fd6cd8bf23fa8737c3f7ae791a5386de797bf79"}, + {file = "cymem-2.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e5e1b7de7952d89508d07601b9e95b2244e70d7ef60fbc161b3ad68f22815f8"}, + {file = "cymem-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2aa33f1dbd7ceda37970e174c38fd1cf106817a261aa58521ba9918156868231"}, + {file = "cymem-2.0.7-cp311-cp311-win_amd64.whl", hash = "sha256:10178e402bb512b2686b8c2f41f930111e597237ca8f85cb583ea93822ef798d"}, + {file = "cymem-2.0.7-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2971b7da5aa2e65d8fbbe9f2acfc19ff8e73f1896e3d6e1223cc9bf275a0207"}, + {file = "cymem-2.0.7-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85359ab7b490e6c897c04863704481600bd45188a0e2ca7375eb5db193e13cb7"}, + {file = "cymem-2.0.7-cp36-cp36m-win_amd64.whl", hash = 
"sha256:0ac45088abffbae9b7db2c597f098de51b7e3c1023cb314e55c0f7f08440cf66"}, + {file = "cymem-2.0.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:26e5d5c6958855d2fe3d5629afe85a6aae5531abaa76f4bc21b9abf9caaccdfe"}, + {file = "cymem-2.0.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:011039e12d3144ac1bf3a6b38f5722b817f0d6487c8184e88c891b360b69f533"}, + {file = "cymem-2.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f9e63e5ad4ed6ffa21fd8db1c03b05be3fea2f32e32fdace67a840ea2702c3d"}, + {file = "cymem-2.0.7-cp37-cp37m-win_amd64.whl", hash = "sha256:5ea6b027fdad0c3e9a4f1b94d28d213be08c466a60c72c633eb9db76cf30e53a"}, + {file = "cymem-2.0.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4302df5793a320c4f4a263c7785d2fa7f29928d72cb83ebeb34d64a610f8d819"}, + {file = "cymem-2.0.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:24b779046484674c054af1e779c68cb224dc9694200ac13b22129d7fb7e99e6d"}, + {file = "cymem-2.0.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c50794c612801ed8b599cd4af1ed810a0d39011711c8224f93e1153c00e08d1"}, + {file = "cymem-2.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9525ad563b36dc1e30889d0087a0daa67dd7bb7d3e1530c4b61cd65cc756a5b"}, + {file = "cymem-2.0.7-cp38-cp38-win_amd64.whl", hash = "sha256:48b98da6b906fe976865263e27734ebc64f972a978a999d447ad6c83334e3f90"}, + {file = "cymem-2.0.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e156788d32ad8f7141330913c5d5d2aa67182fca8f15ae22645e9f379abe8a4c"}, + {file = "cymem-2.0.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3da89464021fe669932fce1578343fcaf701e47e3206f50d320f4f21e6683ca5"}, + {file = "cymem-2.0.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f359cab9f16e25b3098f816c40acbf1697a3b614a8d02c56e6ebcb9c89a06b3"}, + {file = "cymem-2.0.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:f165d7bce55d6730930e29d8294569788aa127f1be8d1642d9550ed96223cb37"}, + {file = "cymem-2.0.7-cp39-cp39-win_amd64.whl", hash = "sha256:59a09cf0e71b1b88bfa0de544b801585d81d06ea123c1725e7c5da05b7ca0d20"}, + {file = "cymem-2.0.7.tar.gz", hash = "sha256:e6034badb5dd4e10344211c81f16505a55553a7164adc314c75bd80cf07e57a8"}, +] + [[package]] name = "datasets" version = "2.13.1" @@ -1827,6 +1930,20 @@ sqs = ["boto3 (>=1.26.143)", "pycurl (>=7.43.0.5)", "urllib3 (>=1.26.16)"] yaml = ["PyYAML (>=3.10)"] zookeeper = ["kazoo (>=2.8.0)"] +[[package]] +name = "langcodes" +version = "3.3.0" +description = "Tools for labeling human languages with IETF language tags" +optional = false +python-versions = ">=3.6" +files = [ + {file = "langcodes-3.3.0-py3-none-any.whl", hash = "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69"}, + {file = "langcodes-3.3.0.tar.gz", hash = "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"}, +] + +[package.extras] +data = ["language-data (>=1.1,<2.0)"] + [[package]] name = "libpecos" version = "1.0.0" @@ -2076,6 +2193,43 @@ files = [ [package.dependencies] dill = ">=0.3.6" +[[package]] +name = "murmurhash" +version = "1.0.9" +description = "Cython bindings for MurmurHash" +optional = false +python-versions = ">=3.6" +files = [ + {file = "murmurhash-1.0.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:697ed01454d92681c7ae26eb1adcdc654b54062bcc59db38ed03cad71b23d449"}, + {file = "murmurhash-1.0.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ef31b5c11be2c064dbbdd0e22ab3effa9ceb5b11ae735295c717c120087dd94"}, + {file = "murmurhash-1.0.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7a2bd203377a31bbb2d83fe3f968756d6c9bbfa36c64c6ebfc3c6494fc680bc"}, + {file = "murmurhash-1.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0eb0f8e652431ea238c11bcb671fef5c03aff0544bf7e098df81ea4b6d495405"}, + {file = 
"murmurhash-1.0.9-cp310-cp310-win_amd64.whl", hash = "sha256:cf0b3fe54dca598f5b18c9951e70812e070ecb4c0672ad2cc32efde8a33b3df6"}, + {file = "murmurhash-1.0.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5dc41be79ba4d09aab7e9110a8a4d4b37b184b63767b1b247411667cdb1057a3"}, + {file = "murmurhash-1.0.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c0f84ecdf37c06eda0222f2f9e81c0974e1a7659c35b755ab2fdc642ebd366db"}, + {file = "murmurhash-1.0.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:241693c1c819148eac29d7882739b1099c891f1f7431127b2652c23f81722cec"}, + {file = "murmurhash-1.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f5ca56c430230d3b581dfdbc54eb3ad8b0406dcc9afdd978da2e662c71d370"}, + {file = "murmurhash-1.0.9-cp311-cp311-win_amd64.whl", hash = "sha256:660ae41fc6609abc05130543011a45b33ca5d8318ae5c70e66bbd351ca936063"}, + {file = "murmurhash-1.0.9-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01137d688a6b259bde642513506b062364ea4e1609f886d9bd095c3ae6da0b94"}, + {file = "murmurhash-1.0.9-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b70bbf55d89713873a35bd4002bc231d38e530e1051d57ca5d15f96c01fd778"}, + {file = "murmurhash-1.0.9-cp36-cp36m-win_amd64.whl", hash = "sha256:3e802fa5b0e618ee99e8c114ce99fc91677f14e9de6e18b945d91323a93c84e8"}, + {file = "murmurhash-1.0.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:213d0248e586082e1cab6157d9945b846fd2b6be34357ad5ea0d03a1931d82ba"}, + {file = "murmurhash-1.0.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94b89d02aeab5e6bad5056f9d08df03ac7cfe06e61ff4b6340feb227fda80ce8"}, + {file = "murmurhash-1.0.9-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c2e2ee2d91a87952fe0f80212e86119aa1fd7681f03e6c99b279e50790dc2b3"}, 
+ {file = "murmurhash-1.0.9-cp37-cp37m-win_amd64.whl", hash = "sha256:8c3d69fb649c77c74a55624ebf7a0df3c81629e6ea6e80048134f015da57b2ea"}, + {file = "murmurhash-1.0.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab78675510f83e7a3c6bd0abdc448a9a2b0b385b0d7ee766cbbfc5cc278a3042"}, + {file = "murmurhash-1.0.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0ac5530c250d2b0073ed058555847c8d88d2d00229e483d45658c13b32398523"}, + {file = "murmurhash-1.0.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69157e8fa6b25c4383645227069f6a1f8738d32ed2a83558961019ca3ebef56a"}, + {file = "murmurhash-1.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2aebe2ae016525a662ff772b72a2c9244a673e3215fcd49897f494258b96f3e7"}, + {file = "murmurhash-1.0.9-cp38-cp38-win_amd64.whl", hash = "sha256:a5952f9c18a717fa17579e27f57bfa619299546011a8378a8f73e14eece332f6"}, + {file = "murmurhash-1.0.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef79202feeac68e83971239169a05fa6514ecc2815ce04c8302076d267870f6e"}, + {file = "murmurhash-1.0.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:799fcbca5693ad6a40f565ae6b8e9718e5875a63deddf343825c0f31c32348fa"}, + {file = "murmurhash-1.0.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9b995bc82eaf9223e045210207b8878fdfe099a788dd8abd708d9ee58459a9d"}, + {file = "murmurhash-1.0.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b129e1c5ebd772e6ff5ef925bcce695df13169bd885337e6074b923ab6edcfc8"}, + {file = "murmurhash-1.0.9-cp39-cp39-win_amd64.whl", hash = "sha256:379bf6b414bd27dd36772dd1570565a7d69918e980457370838bd514df0d91e9"}, + {file = "murmurhash-1.0.9.tar.gz", hash = "sha256:fe7a38cb0d3d87c14ec9dddc4932ffe2dbc77d75469ab80fd5014689b0e07b58"}, +] + [[package]] name = "mypy-extensions" version = "1.0.0" @@ -2353,6 +2507,28 @@ files = [ {file = 
"pathtools-0.1.2.tar.gz", hash = "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0"}, ] +[[package]] +name = "pathy" +version = "0.10.2" +description = "pathlib.Path subclasses for local and cloud bucket storage" +optional = false +python-versions = ">= 3.6" +files = [ + {file = "pathy-0.10.2-py3-none-any.whl", hash = "sha256:681bc98dbff28e7de3e50efa8246910f727e8ac254c4318c47ce341f7c1ce21d"}, + {file = "pathy-0.10.2.tar.gz", hash = "sha256:79c572ab7fed84dc46837346edae58565992d0477a789cd4691a41d8eab9917d"}, +] + +[package.dependencies] +smart-open = ">=5.2.1,<7.0.0" +typer = ">=0.3.0,<1.0.0" + +[package.extras] +all = ["azure-storage-blob", "boto3", "google-cloud-storage (>=1.26.0,<2.0.0)", "mock", "pytest", "pytest-coverage", "typer-cli"] +azure = ["azure-storage-blob"] +gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] +s3 = ["boto3"] +test = ["mock", "pytest", "pytest-coverage", "typer-cli"] + [[package]] name = "platformdirs" version = "3.9.1" @@ -2401,6 +2577,47 @@ nodeenv = ">=0.11.1" pyyaml = ">=5.1" virtualenv = ">=20.10.0" +[[package]] +name = "preshed" +version = "3.0.8" +description = "Cython hash table that trusts the keys are pre-hashed" +optional = false +python-versions = ">=3.6" +files = [ + {file = "preshed-3.0.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ea4b6df8ef7af38e864235256793bc3056e9699d991afcf6256fa298858582fc"}, + {file = "preshed-3.0.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e945fc814bdc29564a2ce137c237b3a9848aa1e76a1160369b6e0d328151fdd"}, + {file = "preshed-3.0.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9a4833530fe53001c351974e0c8bb660211b8d0358e592af185fec1ae12b2d0"}, + {file = "preshed-3.0.8-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1472ee231f323b4f4368b1b5f8f08481ed43af89697d45450c6ae4af46ac08a"}, + {file = "preshed-3.0.8-cp310-cp310-win_amd64.whl", hash = 
"sha256:c8a2e2931eea7e500fbf8e014b69022f3fab2e35a70da882e2fc753e5e487ae3"}, + {file = "preshed-3.0.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0e1bb8701df7861af26a312225bdf7c4822ac06fcf75aeb60fe2b0a20e64c222"}, + {file = "preshed-3.0.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e9aef2b0b7687aecef48b1c6ff657d407ff24e75462877dcb888fa904c4a9c6d"}, + {file = "preshed-3.0.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:854d58a8913ebf3b193b0dc8064155b034e8987de25f26838dfeca09151fda8a"}, + {file = "preshed-3.0.8-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:135e2ac0db1a3948d6ec295598c7e182b52c394663f2fcfe36a97ae51186be21"}, + {file = "preshed-3.0.8-cp311-cp311-win_amd64.whl", hash = "sha256:019d8fa4161035811fb2804d03214143298739e162d0ad24e087bd46c50970f5"}, + {file = "preshed-3.0.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a49ce52856fbb3ef4f1cc744c53f5d7e1ca370b1939620ac2509a6d25e02a50"}, + {file = "preshed-3.0.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdbc2957b36115a576c515ffe963919f19d2683f3c76c9304ae88ef59f6b5ca6"}, + {file = "preshed-3.0.8-cp36-cp36m-win_amd64.whl", hash = "sha256:09cc9da2ac1b23010ce7d88a5e20f1033595e6dd80be14318e43b9409f4c7697"}, + {file = "preshed-3.0.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e19c8069f1a1450f835f23d47724530cf716d581fcafb398f534d044f806b8c2"}, + {file = "preshed-3.0.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25b5ef5e387a0e17ff41202a8c1816184ab6fb3c0d0b847bf8add0ed5941eb8d"}, + {file = "preshed-3.0.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53d3e2456a085425c66af7baba62d7eaa24aa5e460e1a9e02c401a2ed59abd7b"}, + {file = "preshed-3.0.8-cp37-cp37m-win_amd64.whl", hash = 
"sha256:85e98a618fb36cdcc37501d8b9b8c1246651cc2f2db3a70702832523e0ae12f4"}, + {file = "preshed-3.0.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7f8837bf616335464f3713cbf562a3dcaad22c3ca9193f957018964ef871a68b"}, + {file = "preshed-3.0.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:720593baf2c2e295f855192974799e486da5f50d4548db93c44f5726a43cefb9"}, + {file = "preshed-3.0.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0ad3d860b9ce88a74cf7414bb4b1c6fd833813e7b818e76f49272c4974b19ce"}, + {file = "preshed-3.0.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd19d48440b152657966a52e627780c0ddbe9d907b8d7ee4598505e80a3c55c7"}, + {file = "preshed-3.0.8-cp38-cp38-win_amd64.whl", hash = "sha256:246e7c6890dc7fe9b10f0e31de3346b906e3862b6ef42fcbede37968f46a73bf"}, + {file = "preshed-3.0.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:67643e66691770dc3434b01671648f481e3455209ce953727ef2330b16790aaa"}, + {file = "preshed-3.0.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0ae25a010c9f551aa2247ee621457f679e07c57fc99d3fd44f84cb40b925f12c"}, + {file = "preshed-3.0.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6a7fcf7dd2e7711051b3f0432da9ec9c748954c989f49d2cd8eabf8c2d953e"}, + {file = "preshed-3.0.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5942858170c4f53d9afc6352a86bbc72fc96cc4d8964b6415492114a5920d3ed"}, + {file = "preshed-3.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:06793022a56782ef51d74f1399925a2ba958e50c5cfbc6fa5b25c4945e158a07"}, + {file = "preshed-3.0.8.tar.gz", hash = "sha256:6c74c70078809bfddda17be96483c41d06d717934b07cab7921011d81758b357"}, +] + +[package.dependencies] +cymem = ">=2.0.2,<2.1.0" +murmurhash = ">=0.28.0,<1.1.0" + [[package]] name = "prompt-toolkit" version = "3.0.39" @@ -3396,6 +3613,27 @@ files = [ {file = "six-1.16.0.tar.gz", hash = 
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "smart-open" +version = "6.4.0" +description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" +optional = false +python-versions = ">=3.6,<4.0" +files = [ + {file = "smart_open-6.4.0-py3-none-any.whl", hash = "sha256:8d3ef7e6997e8e42dd55c74166ed21e6ac70664caa32dd940b26d54a8f6b4142"}, + {file = "smart_open-6.4.0.tar.gz", hash = "sha256:be3c92c246fbe80ebce8fbacb180494a481a77fcdcb7c1aadb2ea5b9c2bee8b9"}, +] + +[package.extras] +all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "paramiko", "requests"] +azure = ["azure-common", "azure-core", "azure-storage-blob"] +gcs = ["google-cloud-storage (>=2.6.0)"] +http = ["requests"] +s3 = ["boto3"] +ssh = ["paramiko"] +test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "moto[server]", "paramiko", "pytest", "pytest-rerunfailures", "requests", "responses"] +webhdfs = ["requests"] + [[package]] name = "smmap" version = "5.0.0" @@ -3418,6 +3656,115 @@ files = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] +[[package]] +name = "spacy" +version = "3.6.1" +description = "Industrial-strength Natural Language Processing (NLP) in Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "spacy-3.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2fb23b9af51ee8baeea4920d6ffc8ef85bc3ea7a6338dbf330a0626cf6ac6ea9"}, + {file = "spacy-3.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cb00bc74f59b537518a398fd066c0f7a8f029c763cc88afa1a0a59914f639e83"}, + {file = "spacy-3.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f75430fef7e18e6a4c32ca7efa3fb17020eaaa5d7ca0aeac6f663748a32888d"}, + {file = "spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:479132dd3118024e97022735d6ad10d50c789f3979675a8db86e40f333fa335f"}, + {file = "spacy-3.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:385dd3e48a8bb980ec2b8a70831ab3d2d43496357bae91b486c0e99dedb991aa"}, + {file = "spacy-3.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:369c1102eadfcfe155ff1d8d540411b784fe163171e15f02e0b47e030af7c527"}, + {file = "spacy-3.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8ee28656f518e0d454dcc6840a17ec4c6141c055cda86e6b7a772ec6b55cde24"}, + {file = "spacy-3.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f426f312e945191218a3f753d7ce0068f08d27b253de0e30b9fbae81778bb90"}, + {file = "spacy-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c51ceb2e0352c99b1703ef97849c10cb27ceb58348cb76ab4734477d485035b"}, + {file = "spacy-3.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:c6b7184bac8c8f72c4e3dbfd7c82eb0541c03fbccded11412269ae906f0d16c9"}, + {file = "spacy-3.6.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643b69be30f092cc3215d576d9a194ee01a3da319accdc06ae5a521d83497093"}, + {file = "spacy-3.6.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17424ab01023ece5679fe5c9224241d4ba6b08069b756df77df5b0c857fa762c"}, + {file = "spacy-3.6.1-cp36-cp36m-win_amd64.whl", hash = "sha256:eb93b401f7070fb7e6be64b4d9ac5c69f6ed49c9a7c13532481b425a9ee5d980"}, + {file = "spacy-3.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:46c27249590a0227d33ad33871e99820c2e9890b59f970a37f8f95f4520ca2eb"}, + {file = "spacy-3.6.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590886ca51ad4509100eeae233d22086e3736ab3ff54bf588f356a0862cdb735"}, + {file = "spacy-3.6.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca97c6052e098f00c0bed89dfa7c0d9a7ea24667d67854baa7dba53c61c8c6f0"}, + {file = "spacy-3.6.1-cp37-cp37m-win_amd64.whl", hash = 
"sha256:13554a7bda6f9b148f54f3df0870b487c590921eaff0d7ce1a8be15b70e77a92"}, + {file = "spacy-3.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a110dc5bbc5b37176168bb24064f7e49b9f29f5a4857f09114e5953c3754b311"}, + {file = "spacy-3.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3abd2b82dd483c13aeb10720f52416523415ac0af84106f0c1eaae29240fe709"}, + {file = "spacy-3.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77ac5d89d909b30e64873caa93399aa5a1e72b363ae291e297c83a07db6b646f"}, + {file = "spacy-3.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3de915f5419ad28d8d1c614c77172ce05b0b59a7c57854f098b7f2da98e28f40"}, + {file = "spacy-3.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:738d806851760c2917e20046332af1ccbef78ff43eaebb23914f4d90ed060539"}, + {file = "spacy-3.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4b5350ad1b70fb9b9e17be220dd866c6b91a950a45cfe6ce524041ef52593621"}, + {file = "spacy-3.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3b797eedaf29b8726e5fb81e4b839b1734a07c835243a2d59a28cc974d2a9067"}, + {file = "spacy-3.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7762c1944cdacc0d04f5c781c79cc7beb1caa6cbc2b74687a997775f0846cec1"}, + {file = "spacy-3.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3fdee99625ee3c11537182598c81a17d4d4521c73b59e6c1d0ad6749c6654f16"}, + {file = "spacy-3.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:c9d112681d3666a75b07dea8c65a0b3f46ebebb9b90fda568089254134f0d28b"}, + {file = "spacy-3.6.1.tar.gz", hash = "sha256:6323a98706ae2d5561694b03a8b0b5751887a002903a4894e68aeb29cc672166"}, +] + +[package.dependencies] +catalogue = ">=2.0.6,<2.1.0" +cymem = ">=2.0.2,<2.1.0" +jinja2 = "*" +langcodes = ">=3.2.0,<4.0.0" +murmurhash = ">=0.28.0,<1.1.0" +numpy = ">=1.15.0" +packaging = ">=20.0" +pathy = ">=0.10.0" +preshed = ">=3.0.2,<3.1.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" 
+requests = ">=2.13.0,<3.0.0" +setuptools = "*" +smart-open = ">=5.2.1,<7.0.0" +spacy-legacy = ">=3.0.11,<3.1.0" +spacy-loggers = ">=1.0.0,<2.0.0" +srsly = ">=2.4.3,<3.0.0" +thinc = ">=8.1.8,<8.2.0" +tqdm = ">=4.38.0,<5.0.0" +typer = ">=0.3.0,<0.10.0" +wasabi = ">=0.9.1,<1.2.0" + +[package.extras] +apple = ["thinc-apple-ops (>=0.1.0.dev0,<1.0.0)"] +cuda = ["cupy (>=5.0.0b4,<13.0.0)"] +cuda-autodetect = ["cupy-wheel (>=11.0.0,<13.0.0)"] +cuda100 = ["cupy-cuda100 (>=5.0.0b4,<13.0.0)"] +cuda101 = ["cupy-cuda101 (>=5.0.0b4,<13.0.0)"] +cuda102 = ["cupy-cuda102 (>=5.0.0b4,<13.0.0)"] +cuda110 = ["cupy-cuda110 (>=5.0.0b4,<13.0.0)"] +cuda111 = ["cupy-cuda111 (>=5.0.0b4,<13.0.0)"] +cuda112 = ["cupy-cuda112 (>=5.0.0b4,<13.0.0)"] +cuda113 = ["cupy-cuda113 (>=5.0.0b4,<13.0.0)"] +cuda114 = ["cupy-cuda114 (>=5.0.0b4,<13.0.0)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4,<13.0.0)"] +cuda116 = ["cupy-cuda116 (>=5.0.0b4,<13.0.0)"] +cuda117 = ["cupy-cuda117 (>=5.0.0b4,<13.0.0)"] +cuda11x = ["cupy-cuda11x (>=11.0.0,<13.0.0)"] +cuda12x = ["cupy-cuda12x (>=11.5.0,<13.0.0)"] +cuda80 = ["cupy-cuda80 (>=5.0.0b4,<13.0.0)"] +cuda90 = ["cupy-cuda90 (>=5.0.0b4,<13.0.0)"] +cuda91 = ["cupy-cuda91 (>=5.0.0b4,<13.0.0)"] +cuda92 = ["cupy-cuda92 (>=5.0.0b4,<13.0.0)"] +ja = ["sudachidict-core (>=20211220)", "sudachipy (>=0.5.2,!=0.6.1)"] +ko = ["natto-py (>=0.9.0)"] +lookups = ["spacy-lookups-data (>=1.0.3,<1.1.0)"] +ray = ["spacy-ray (>=0.1.0,<1.0.0)"] +th = ["pythainlp (>=2.0)"] +transformers = ["spacy-transformers (>=1.1.2,<1.3.0)"] + +[[package]] +name = "spacy-legacy" +version = "3.0.12" +description = "Legacy registered functions for spaCy backwards compatibility" +optional = false +python-versions = ">=3.6" +files = [ + {file = "spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774"}, + {file = "spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f"}, +] + +[[package]] +name = 
"spacy-loggers" +version = "1.0.4" +description = "Logging utilities for SpaCy" +optional = false +python-versions = ">=3.6" +files = [ + {file = "spacy-loggers-1.0.4.tar.gz", hash = "sha256:e6f983bf71230091d5bb7b11bf64bd54415eca839108d5f83d9155d0ba93bf28"}, + {file = "spacy_loggers-1.0.4-py3-none-any.whl", hash = "sha256:e050bf2e63208b2f096b777e494971c962ad7c1dc997641c8f95c622550044ae"}, +] + [[package]] name = "sqltrie" version = "0.7.0" @@ -3438,6 +3785,46 @@ pygtrie = "*" dev = ["mypy (==0.971)", "pyinstaller", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-benchmark", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)"] tests = ["mypy (==0.971)", "pyinstaller", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-benchmark", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)"] +[[package]] +name = "srsly" +version = "2.4.7" +description = "Modern high-performance serialization utilities for Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "srsly-2.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:38506074cfac43f5581b6b22c335dc4d43ef9a82cbe9fe2557452e149d4540f5"}, + {file = "srsly-2.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:efd401ac0b239f3c7c0070fcd613f10a4a01478ff5fe7fc8527ea7a23dfa3709"}, + {file = "srsly-2.4.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd1be19502fda87108c8055bce6537ec332266057f595133623a4a18e56a91a1"}, + {file = "srsly-2.4.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87e86be5fd655ed554e4bf6b63a4eb3380ffb40752d0621323a3df879d3e6407"}, + {file = "srsly-2.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:7be5def9b6ac7896ce326997498b8155b9167ddc672fb209a200090c7fe45a4b"}, + {file = "srsly-2.4.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bb3d54563e33816d33695b58f9daaea410fcd0b9272aba27050410a5279ba8d8"}, + {file = "srsly-2.4.7-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:2848735a9fcb0ad9ec23a6986466de7942280a01dbcb7b66583288f1378afba1"}, + {file = "srsly-2.4.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:282d59a37c271603dd790ab25fa6521c3d3fdbca67bef3ee838fd664c773ea0d"}, + {file = "srsly-2.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7affecb281db0683fe78181d644f6d6a061948fa318884c5669a064b97869f54"}, + {file = "srsly-2.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:76d991167dc83f8684fb366a092a03f51f7582741885ba42444ab577e61ae198"}, + {file = "srsly-2.4.7-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7a7278470bbad3831c9d8abd7f7b9fa9a3d6cd29f797f913f7a04ade5668715"}, + {file = "srsly-2.4.7-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:654496a07fcf11ba823e9a16f263001271f04d8b1bfd8d94ba6130a1649fc6d8"}, + {file = "srsly-2.4.7-cp36-cp36m-win_amd64.whl", hash = "sha256:89e35ead948349b2a8d47600544dbf49ff737d15a899bc5a71928220daee2807"}, + {file = "srsly-2.4.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3e0f0410faf9d5dc5c58caf907a4b0b94e6dc766289e329a15ddf8adca264d1c"}, + {file = "srsly-2.4.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c3422ab7ed37438086a178e611be85b7001e0071882655fcb8dca83c4f5f57d"}, + {file = "srsly-2.4.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a81186f9c1beb0892fcef4fd6350e6ee0d2d700da5042e400ec6da65a0b52fb"}, + {file = "srsly-2.4.7-cp37-cp37m-win_amd64.whl", hash = "sha256:1fe4a9bf004174f0b73b3fc3a96d35811c218e0441f4246ac4cb3f06daf0ca12"}, + {file = "srsly-2.4.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:86501eb25c6615d934bde0aea98d705ce7edd11d070536162bd2fa8606034f0f"}, + {file = "srsly-2.4.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f46bc563a7b80f81aed8dd12f86ef43b93852d937666f44a3d04bcdaa630376c"}, + {file = "srsly-2.4.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6e60cd20f08b8a0e200017c6e8f5af51321878b17bf7da284dd81c7604825c6e"}, + {file = "srsly-2.4.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c90953a58dfde2eeaea15749c7dddad2a508b48b17d084b491d56d5213ef2a37"}, + {file = "srsly-2.4.7-cp38-cp38-win_amd64.whl", hash = "sha256:7c9a1dc7077b4a101fd018c1c567ec735203887e016a813588557f5c4ce2de8b"}, + {file = "srsly-2.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c8ada26613f49f72baa573dbd7e911f3af88b647c3559cb6641c97ca8dd7cfe0"}, + {file = "srsly-2.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:267f6ac1b8388a4649a6e6299114ff2f6af03bafd60fc8f267e890a9becf7057"}, + {file = "srsly-2.4.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75f2777cc44ad34c5f2239d44c8cd56b0263bf19bc6c1593dcc765e2a21fc5e7"}, + {file = "srsly-2.4.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2059d447cfe5bf6692634cbfbbb2d5663f554023b0aa0ee3d348387d9ec9345a"}, + {file = "srsly-2.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:422e44d702da4420c47012d309fc56b5081ca06a500393d83114eb09d71bf1ce"}, + {file = "srsly-2.4.7.tar.gz", hash = "sha256:93c2cc4588778261ccb23dd0543b24ded81015dd8ab4ec137cd7d04965035d08"}, +] + +[package.dependencies] +catalogue = ">=2.0.3,<2.1.0" + [[package]] name = "sympy" version = "1.12" @@ -3480,6 +3867,81 @@ files = [ [package.extras] doc = ["reno", "sphinx", "tornado (>=4.5)"] +[[package]] +name = "thinc" +version = "8.1.12" +description = "A refreshing functional take on deep learning, compatible with your favorite libraries" +optional = false +python-versions = ">=3.6" +files = [ + {file = "thinc-8.1.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:efda431bc1513e81e457dbff4ef1610592569ddc362f8df24422628b195d51f4"}, + {file = "thinc-8.1.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01dbe9063171c1d0df29374a3857ee500fb8acf8f33bd8a85d11214d7453ff7a"}, + {file = 
"thinc-8.1.12-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fcfe97b80aa02a6cdeef9f5e3127822a13497a9b6f58653da4ff3caf321e3c4"}, + {file = "thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c52d0657c61b7e1a382cb5ee1ee71692a0e9c47bef9f3e02ac3492b26056d27"}, + {file = "thinc-8.1.12-cp310-cp310-win_amd64.whl", hash = "sha256:b2078018c8bc36540b0c007cb1909f6c81c9a973b3180d15b934414f08988b28"}, + {file = "thinc-8.1.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:340171c1927592082c79509e5a964766e2d65c2e30c5e583489488935a9a2340"}, + {file = "thinc-8.1.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:88e8c9cd5119d5dbb0c4ed1bdde5acd6cf12fe1b3316647ecbd79fb12e3ef542"}, + {file = "thinc-8.1.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15c6cb31138814599426bd8855b9fc9d8d8ddb2bde1c91d204353b5e5af15deb"}, + {file = "thinc-8.1.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5dc3117db83ec0d423480b6c77de90f658dfaed5f7a2bbc3d640f1f6c7ff0fe7"}, + {file = "thinc-8.1.12-cp311-cp311-win_amd64.whl", hash = "sha256:f9ac43fd02e952c005753f85bd375c03baea5fa818a6a4942930177c31130eca"}, + {file = "thinc-8.1.12-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4241d0b8c9e813a1fbba05b6dc7d7056c0a2601b8a1119d372e85185068009e6"}, + {file = "thinc-8.1.12-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c141e42e610605a9c6def19e5dbb4877353839a610e3cdb1fa68e70f6b39492a"}, + {file = "thinc-8.1.12-cp36-cp36m-win_amd64.whl", hash = "sha256:9388c1427b4c3615967e1be19fa93427be61241392bdd5a84ab1da0f96c6bcfb"}, + {file = "thinc-8.1.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:f6fb12692fae1a056432800f94ec88fa714eb1111aff9eabd61d2dfe10beb713"}, + {file = "thinc-8.1.12-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e51c693d477e02eab164a67b588fcdbb3609bc54ec39de6084da2dd9a356b8f8"}, + {file = "thinc-8.1.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4265f902f9a597be294765479ef6535d679e497fa2fed955cbcabcfdd82f81ad"}, + {file = "thinc-8.1.12-cp37-cp37m-win_amd64.whl", hash = "sha256:4586d6709f3811db85e192fdf519620b3326d28e5f0193cef8544b057e20a951"}, + {file = "thinc-8.1.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e10a648872e9ebbe115fa5fba0d515e8226bd0e2de0abd41d55f1ae04017813c"}, + {file = "thinc-8.1.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:27231eb1d468e7eb97f255c3d1e985d5a0cb8e309e0ec01b29cce2de836b8db2"}, + {file = "thinc-8.1.12-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8ece3880ac05d6bb75ecdbd9c03298e6f9691e5cb7480c1f15e66e33fe34004"}, + {file = "thinc-8.1.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:285f1141ecd7a9b61e2fed58b609c194b40e6ae5daf1e1e8dec31616bc9ffca1"}, + {file = "thinc-8.1.12-cp38-cp38-win_amd64.whl", hash = "sha256:0400632aa235cfbbc0004014e90cdf54cd42333aa7f5e971ffe87c8125e607ed"}, + {file = "thinc-8.1.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2edb3ef3a02f966eae8c5c56feb80ad5b6e5c221c94fcd95eb413d09d0d82212"}, + {file = "thinc-8.1.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e078d3b00e51c597f3f301d3e2925d0842d0725f251ff9a53a1e1b4110d4b9c1"}, + {file = "thinc-8.1.12-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d0ac2f6a0b38ddb913f9b31d8c4b13b98a7f5f62db211e0d8ebefbda5138757"}, + {file = "thinc-8.1.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47cde897cf54bc731a3a7c2e51a6ef01a86687ab7ae90ab0e9fc5d2294fe0fba"}, + {file = "thinc-8.1.12-cp39-cp39-win_amd64.whl", hash = "sha256:1b846c35a24b5b33e5d240f514f3a9e8bac2b6a10491caa147753dc50740a400"}, + {file = "thinc-8.1.12.tar.gz", hash = "sha256:9dd12c5c79b176f077ce9416b49c9752782bd76ff0ea649d66527882e83ea353"}, +] + 
+[package.dependencies] +blis = ">=0.7.8,<0.8.0" +catalogue = ">=2.0.4,<2.1.0" +confection = ">=0.0.1,<1.0.0" +cymem = ">=2.0.2,<2.1.0" +murmurhash = ">=1.0.2,<1.1.0" +numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} +packaging = ">=20.0" +preshed = ">=3.0.2,<3.1.0" +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" +setuptools = "*" +srsly = ">=2.4.0,<3.0.0" +wasabi = ">=0.8.1,<1.2.0" + +[package.extras] +cuda = ["cupy (>=5.0.0b4)"] +cuda-autodetect = ["cupy-wheel (>=11.0.0)"] +cuda100 = ["cupy-cuda100 (>=5.0.0b4)"] +cuda101 = ["cupy-cuda101 (>=5.0.0b4)"] +cuda102 = ["cupy-cuda102 (>=5.0.0b4)"] +cuda110 = ["cupy-cuda110 (>=5.0.0b4)"] +cuda111 = ["cupy-cuda111 (>=5.0.0b4)"] +cuda112 = ["cupy-cuda112 (>=5.0.0b4)"] +cuda113 = ["cupy-cuda113 (>=5.0.0b4)"] +cuda114 = ["cupy-cuda114 (>=5.0.0b4)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4)"] +cuda116 = ["cupy-cuda116 (>=5.0.0b4)"] +cuda117 = ["cupy-cuda117 (>=5.0.0b4)"] +cuda11x = ["cupy-cuda11x (>=11.0.0)"] +cuda80 = ["cupy-cuda80 (>=5.0.0b4)"] +cuda90 = ["cupy-cuda90 (>=5.0.0b4)"] +cuda91 = ["cupy-cuda91 (>=5.0.0b4)"] +cuda92 = ["cupy-cuda92 (>=5.0.0b4)"] +datasets = ["ml-datasets (>=0.2.0,<0.3.0)"] +mxnet = ["mxnet (>=1.5.1,<1.6.0)"] +tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] +torch = ["torch (>=1.6.0)"] + [[package]] name = "threadpoolctl" version = "3.2.0" @@ -4189,4 +4651,4 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "cb53df7c0ffa68f5c40fe32b94d716f6ee3d944bb795e67d653389a0c0070d93" +content-hash = "c246890fd08f7d69ace373434b1d4cf0adc5bcb1f76177ee34d28ad12b839afa" diff --git a/pyproject.toml b/pyproject.toml index edcc4bf9..964f7a19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ loguru = "^0.7.0" wandb = "^0.15.4" openai = "0.27.8" openai-multi-client = "^0.1.1" +spacy = "^3.6.1" [tool.poetry.group.dev] From b0561789777c847a4df7648d439646844a511006 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 11:10:01 +0100 Subject: [PATCH 002/102] Prototype for retagging using spacy --- grants_tagger_light/retagging/retagging.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index d1fb3939..e3ce853e 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -12,7 +12,6 @@ from datasets import load_from_disk import spacy -import spacy.cli from spacy.util import minibatch, compounding retag_app = typer.Typer() @@ -78,7 +77,7 @@ def retag( for tag in tags: print(tag) - nlp = spacy.load("en_core_web_sm") + nlp = spacy.blank('en') textcat = nlp.create_pipe("textcat") nlp.add_pipe("textcat", last=True) @@ -100,11 +99,13 @@ def retag( neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=100, split=0.8) train_data.extend(list(zip(neg_x_train, [{'cats': {'POSITIVE': False}}]))) + logging.info(f"Train data size: {len(train_data)}. First example: {train_data[0]}") test = pos_x_test test_cats = [{'cats': {'POSITIVE': True}} for _ in range(len(pos_x_test))] test.extend(neg_x_test) test_cats.extend([{'cats': {'POSITIVE': False}} for _ in range(len(neg_x_test))]) + logging.info(f"Test data size: {len(test)}") # get names of other pipes to disable them during training other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] @@ -171,8 +172,6 @@ def retag_cli( ) exit(-1) - spacy.cli.download("en_core_web_sm") - retag( data_path, save_to_path, From b6c1444ae9757a480f44dceb0cf74b5ec2b3e2f0 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 11:17:28 +0100 Subject: [PATCH 003/102] Prototype for retagging using spacy --- grants_tagger_light/retagging/retagging.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index e3ce853e..c45754c8 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -21,10 +21,11 @@ def _load_data(dset: list[str], limit=100, split=0.8): """Load data from the IMDB dataset.""" # Partition off part of the train data for evaluation random.Random(42).shuffle(dset) + dset = dset[:limit] train_size = int(split * len(dset)) - test_size = int( (1-split) * len(dset)) - train_dset = dset[:train_size][:limit] - test_dset = dset[train_size:train_size+test_size][:limit] + test_size = limit - train_size + train_dset = dset[:train_size] + test_dset = dset[train_size:limit] return train_dset, test_dset @@ -90,7 +91,7 @@ def retag( ) pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=100, split=0.8) - train_data = list(zip(pos_x_train, [{'cats': {'POSITIVE': True}}])) + train_data = list(zip(pos_x_train, [{'cats': {tag: True, 'O': False}}])) logging.info(f"Obtaining negative examples for {tag}...") negative_dset = dset.filter( @@ -98,13 +99,13 @@ def retag( ) neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=100, split=0.8) - train_data.extend(list(zip(neg_x_train, [{'cats': {'POSITIVE': False}}]))) + train_data.extend(list(zip(neg_x_train, [{'cats': {tag: False, 'O': True}}]))) logging.info(f"Train data size: {len(train_data)}. 
First example: {train_data[0]}") test = pos_x_test - test_cats = [{'cats': {'POSITIVE': True}} for _ in range(len(pos_x_test))] + test_cats = [{'cats': {tag: True, 'O': False}} for _ in range(len(pos_x_test))] test.extend(neg_x_test) - test_cats.extend([{'cats': {'POSITIVE': False}} for _ in range(len(neg_x_test))]) + test_cats.extend([{'cats': {tag: False, 'O': True}} for _ in range(len(neg_x_test))]) logging.info(f"Test data size: {len(test)}") # get names of other pipes to disable them during training From d84788243254463ec0693bba534adb045dbc823f Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 11:23:38 +0100 Subject: [PATCH 004/102] Prototype for retagging using spacy --- grants_tagger_light/retagging/retagging.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index c45754c8..6623c798 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -42,11 +42,11 @@ def evaluate(tokenizer, textcat, texts, cats): continue if score >= 0.5 and gold[label] >= 0.5: tp += 1. - elif score >= 0.5 and gold[label] < 0.5: + elif score >= 0.5 > gold[label]: fp += 1. 
elif score < 0.5 and gold[label] < 0.5: tn += 1 - elif score < 0.5 and gold[label] >= 0.5: + elif score < 0.5 <= gold[label]: fn += 1 precision = tp / (tp + fp) recall = tp / (tp + fn) @@ -77,13 +77,14 @@ def retag( tags = [x.strip() for x in f.readlines()] for tag in tags: - print(tag) + logging.info(f"Retagging: {tag}") nlp = spacy.blank('en') textcat = nlp.create_pipe("textcat") nlp.add_pipe("textcat", last=True) - textcat.add_label("POSITIVE") + textcat.add_label(tag) + textcat.add_label("O") logging.info(f"Obtaining positive examples for {tag}...") positive_dset = dset.filter( @@ -114,8 +115,8 @@ def retag( n_iter = 1 with nlp.disable_pipes(*other_pipes): # only train textcat optimizer = nlp.begin_training() - print("Training the model...") - print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) + logging.info("Training the model...") + logging.info('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) for i in range(n_iter): losses = {} @@ -128,11 +129,12 @@ def retag( with textcat.model.use_params(optimizer.averages): # evaluate on the dev data split off in load_data() scores = evaluate(nlp.tokenizer, textcat, test, test_cats) - print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table + logging.info('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table .format(losses['textcat'], scores['textcat_p'], scores['textcat_r'], scores['textcat_f'])) break + @retag_app.command() def retag_cli( data_path: str = typer.Argument(..., help="Path to mesh.jsonl"), From 348ad3bc9c710c7e71ba0ce6066a740e4b71b738 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 12:03:16 +0100 Subject: [PATCH 005/102] Prototype for retagging using spacy --- grants_tagger_light/retagging/config.cfg | 112 ++++++++++++++++++++ grants_tagger_light/retagging/retagging.py | 115 ++++++++------------- 2 files changed, 156 insertions(+), 71 deletions(-) create mode 100644 grants_tagger_light/retagging/config.cfg diff --git a/grants_tagger_light/retagging/config.cfg b/grants_tagger_light/retagging/config.cfg new file mode 100644 index 00000000..2b03256f --- /dev/null +++ b/grants_tagger_light/retagging/config.cfg @@ -0,0 +1,112 @@ +[nlp] +lang = "en" +pipeline = ["textcat"] +batch_size = 16 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.textcat] +factory = "textcat" +scorer = {"@scorers":"spacy.textcat_scorer.v1"} +threshold = 0.5 + +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v2" +exclusive_classes = true +ngram_size = 1 +no_output_layer = false +nO = null + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +dev_corpus = "" +train_corpus = "" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +# max_steps = 20000 +max_steps = 100 +eval_frequency = 50 +frozen_components = [] +annotating_components = [] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + 
+[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +cats_score = 1.0 +cats_score_desc = null +cats_micro_p = null +cats_micro_r = null +cats_micro_f = null +cats_macro_p = null +cats_macro_r = null +cats_macro_f = null +cats_macro_auc = null +cats_f_per_type = null +cats_macro_auc_per_type = null + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 6623c798..f7339995 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -1,18 +1,15 @@ -import json import logging -import multiprocessing import os import random import typer from loguru import logger -import numpy as np from datasets import load_dataset -from datasets import load_from_disk import spacy -from spacy.util import minibatch, compounding +from spacy.tokens import DocBin +from spacy.cli.train import train as spacy_train retag_app = typer.Typer() @@ -21,38 +18,12 @@ def _load_data(dset: list[str], limit=100, split=0.8): """Load data from the IMDB dataset.""" # Partition off part of the train data for evaluation random.Random(42).shuffle(dset) - dset = dset[:limit] - train_size = int(split * len(dset)) - test_size = limit - train_size + train_size = int(split * limit) train_dset = dset[:train_size] test_dset = dset[train_size:limit] return train_dset, test_dset -def evaluate(tokenizer, textcat, texts, cats): - docs = (tokenizer(text) for text in texts) - tp = 1e-8 # True positives - fp = 1e-8 # False positives - fn = 1e-8 # False negatives - tn = 1e-8 # True negatives - for i, doc in 
enumerate(textcat.pipe(docs)): - gold = cats[i] - for label, score in doc.cats.items(): - if label not in gold: - continue - if score >= 0.5 and gold[label] >= 0.5: - tp += 1. - elif score >= 0.5 > gold[label]: - fp += 1. - elif score < 0.5 and gold[label] < 0.5: - tn += 1 - elif score < 0.5 <= gold[label]: - fn += 1 - precision = tp / (tp + fp) - recall = tp / (tp + fn) - f_score = 2 * (precision * recall) / (precision + recall) - return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score} - def retag( data_path: str, save_to_path: str, @@ -78,13 +49,14 @@ def retag( for tag in tags: logging.info(f"Retagging: {tag}") - nlp = spacy.blank('en') + # nlp = spacy.blank('en') + nlp = spacy.load("en_core_web_sm") - textcat = nlp.create_pipe("textcat") - nlp.add_pipe("textcat", last=True) + # textcat = nlp.create_pipe("textcat") + # nlp.add_pipe("textcat", last=True) - textcat.add_label(tag) - textcat.add_label("O") + # textcat.add_label(tag) + # textcat.add_label("O") logging.info(f"Obtaining positive examples for {tag}...") positive_dset = dset.filter( @@ -92,46 +64,47 @@ def retag( ) pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=100, split=0.8) - train_data = list(zip(pos_x_train, [{'cats': {tag: True, 'O': False}}])) - logging.info(f"Obtaining negative examples for {tag}...") negative_dset = dset.filter( lambda x: tag not in x["meshMajor"], num_proc=num_proc ) neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=100, split=0.8) - train_data.extend(list(zip(neg_x_train, [{'cats': {tag: False, 'O': True}}]))) - logging.info(f"Train data size: {len(train_data)}. 
First example: {train_data[0]}") - - test = pos_x_test - test_cats = [{'cats': {tag: True, 'O': False}} for _ in range(len(pos_x_test))] - test.extend(neg_x_test) - test_cats.extend([{'cats': {tag: False, 'O': True}} for _ in range(len(neg_x_test))]) - logging.info(f"Test data size: {len(test)}") - - # get names of other pipes to disable them during training - other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] - - n_iter = 1 - with nlp.disable_pipes(*other_pipes): # only train textcat - optimizer = nlp.begin_training() - logging.info("Training the model...") - logging.info('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F')) - for i in range(n_iter): - losses = {} - - # batch up the examples using spaCy's minibatch - batches = minibatch(train_data, size=compounding(4., 32., 1.001)) - for batch in batches: - texts, annotations = zip(*batch) - nlp.update(texts, annotations, sgd=optimizer, drop=0.2, - losses=losses) - with textcat.model.use_params(optimizer.averages): - # evaluate on the dev data split off in load_data() - scores = evaluate(nlp.tokenizer, textcat, test, test_cats) - logging.info('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}' # print a simple table - .format(losses['textcat'], scores['textcat_p'], - scores['textcat_r'], scores['textcat_f'])) + train_data = DocBin() + for doc in nlp.pipe(pos_x_train): + doc.cats[tag] = 1 + doc.cats['O'] = 0 + train_data.add(doc) + for doc in nlp.pipe(neg_x_train): + doc.cats[tag] = 0 + doc.cats['O'] = 1 + train_data.add(doc) + train_data.to_disk("train.spacy") + + test_data = DocBin() + for doc in nlp.pipe(pos_x_test): + doc.cats[tag] = 1 + doc.cats['O'] = 0 + test_data.add(doc) + for doc in nlp.pipe(pos_x_train): + doc.cats[tag] = 0 + doc.cats['O'] = 1 + test_data.add(doc) + test_data.to_disk("test.spacy") + + logging.info(f"Train data size: {len(train_data)}") + logging.info(f"Test data size: {len(test_data)}") + + config_path = "config.cfg" + output_model_path = "spacy_textcat" + spacy_train( + 
config_path, + output_path=output_model_path, + overrides={ + "paths.train": "train.spacy", + "paths.dev": "valid.spacy", + }, + ) break From b9b1e9f78ab774576c66eda02ac660d56394f689 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 12:04:56 +0100 Subject: [PATCH 006/102] Prototype for retagging using spacy --- grants_tagger_light/retagging/retagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index f7339995..72e1d4f9 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -102,7 +102,7 @@ def retag( output_path=output_model_path, overrides={ "paths.train": "train.spacy", - "paths.dev": "valid.spacy", + "paths.dev": "test.spacy", }, ) break From 06840fda49982f89fe33ed67223151a689a9fa50 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 12:11:55 +0100 Subject: [PATCH 007/102] Prototype for retagging using spacy --- grants_tagger_light/retagging/retagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 72e1d4f9..7436a155 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -86,7 +86,7 @@ def retag( doc.cats[tag] = 1 doc.cats['O'] = 0 test_data.add(doc) - for doc in nlp.pipe(pos_x_train): + for doc in nlp.pipe(pos_x_test): doc.cats[tag] = 0 doc.cats['O'] = 1 test_data.add(doc) @@ -95,7 +95,7 @@ def retag( logging.info(f"Train data size: {len(train_data)}") logging.info(f"Test data size: {len(test_data)}") - config_path = "config.cfg" + config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.cfg") output_model_path = "spacy_textcat" spacy_train( config_path, From b442043c9b93d2d53a9b41d294f51e7ad932090f Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 12:21:48 +0100 Subject: [PATCH 008/102] Prototype for retagging using spacy --- grants_tagger_light/retagging/config.cfg | 21 +++++++++++++++------ grants_tagger_light/retagging/retagging.py | 4 ++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/grants_tagger_light/retagging/config.cfg b/grants_tagger_light/retagging/config.cfg index 2b03256f..9d6f2f11 100644 --- a/grants_tagger_light/retagging/config.cfg +++ b/grants_tagger_light/retagging/config.cfg @@ -1,7 +1,17 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null + +[system] +gpu_allocator = null +seed = 0 + [nlp] lang = "en" pipeline = ["textcat"] -batch_size = 16 +batch_size = 1000 disabled = [] before_creation = null after_creation = null @@ -41,17 +51,16 @@ limit = 0 augmenter = null [training] -dev_corpus = "" -train_corpus = "" +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} dropout = 0.1 accumulate_gradient = 1 patience = 1600 max_epochs = 0 -# max_steps = 20000 -max_steps = 100 -eval_frequency = 50 +max_steps = 20000 +eval_frequency = 200 frozen_components = [] annotating_components = [] before_to_disk = null diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 7436a155..94ef2c2a 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -101,8 +101,8 @@ def retag( config_path, output_path=output_model_path, overrides={ - "paths.train": "train.spacy", - "paths.dev": "test.spacy", + "paths.train": os.path.join(os.path.dirname(os.path.realpath(__file__)), "train.spacy"), + "paths.dev": os.path.join(os.path.dirname(os.path.realpath(__file__)), "test.spacy"), }, ) break From 66156a8c9d5a489ccd6b9aead4a514096f31d992 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 12:27:09 +0100 Subject: [PATCH 009/102] Prototype for retagging using spacy --- grants_tagger_light/retagging/retagging.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 94ef2c2a..0b715bb8 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -79,7 +79,8 @@ def retag( doc.cats[tag] = 0 doc.cats['O'] = 1 train_data.add(doc) - train_data.to_disk("train.spacy") + train_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "train.spacy") + train_data.to_disk(train_data_path) test_data = DocBin() for doc in nlp.pipe(pos_x_test): @@ -90,7 +91,8 @@ def retag( doc.cats[tag] = 0 doc.cats['O'] = 1 test_data.add(doc) - test_data.to_disk("test.spacy") + test_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test.spacy") + test_data.to_disk(test_data_path) logging.info(f"Train data size: {len(train_data)}") logging.info(f"Test data size: {len(test_data)}") @@ -101,8 +103,8 @@ def retag( config_path, output_path=output_model_path, overrides={ - "paths.train": os.path.join(os.path.dirname(os.path.realpath(__file__)), "train.spacy"), - "paths.dev": os.path.join(os.path.dirname(os.path.realpath(__file__)), "test.spacy"), + "paths.train": train_data_path, + "paths.dev": test_data_path, }, ) break From f3f8c23572aa9e58893e2c82047e32277fef493d Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 12:38:25 +0100 Subject: [PATCH 010/102] Changes to cNN --- grants_tagger_light/retagging/config.cfg | 70 +++++++++++++----------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/grants_tagger_light/retagging/config.cfg b/grants_tagger_light/retagging/config.cfg index 9d6f2f11..4402991e 100644 --- a/grants_tagger_light/retagging/config.cfg +++ b/grants_tagger_light/retagging/config.cfg @@ -1,83 +1,89 @@ [paths] -train = null -dev = null -vectors = null +train = "" +dev = "" +raw = null init_tok2vec = null +vectors = null [system] -gpu_allocator = null seed = 0 +gpu_allocator = null [nlp] lang = "en" pipeline = ["textcat"] -batch_size = 1000 +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} disabled = [] before_creation = null after_creation = null after_pipeline_creation = null -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +batch_size = 1000 [components] [components.textcat] -factory = "textcat" -scorer = {"@scorers":"spacy.textcat_scorer.v1"} +factory = "textcat_multilabel" threshold = 0.5 [components.textcat.model] -@architectures = "spacy.TextCatBOW.v2" -exclusive_classes = true -ngram_size = 1 -no_output_layer = false +@architectures = "spacy.TextCatCNN.v1" +exclusive_classes = false nO = null +[components.textcat.model.tok2vec] +@architectures = "spacy.Tok2Vec.v2" + +[components.textcat.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.textcat.model.tok2vec.encode:width} +rows = [10000,5000,5000,5000] +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +include_static_vectors = false + +[components.textcat.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + [corpora] [corpora.dev] @readers = "spacy.Corpus.v1" -path = ${paths.dev} +path = ${paths:dev} +gold_preproc = ${corpora.train.gold_preproc} max_length = 0 -gold_preproc = false limit = 0 augmenter = null [corpora.train] @readers = "spacy.Corpus.v1" 
-path = ${paths.train} -max_length = 0 +path = ${paths:train} gold_preproc = false +max_length = 0 limit = 0 augmenter = null [training] -dev_corpus = "corpora.dev" train_corpus = "corpora.train" +dev_corpus = "corpora.dev" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.1 -accumulate_gradient = 1 +dropout = 0.2 patience = 1600 max_epochs = 0 max_steps = 20000 eval_frequency = 200 +accumulate_gradient = 1 frozen_components = [] -annotating_components = [] before_to_disk = null [training.batcher] -@batchers = "spacy.batch_by_words.v1" -discard_oversize = false -tolerance = 0.2 +@batchers = "spacy.batch_by_sequence.v1" +size = 32 get_length = null -[training.batcher.size] -@schedules = "compounding.v1" -start = 100 -stop = 1000 -compound = 1.001 -t = 0.0 - [training.logger] @loggers = "spacy.ConsoleLogger.v1" progress_bar = false @@ -89,12 +95,11 @@ beta2 = 0.999 L2_is_weight_decay = true L2 = 0.01 grad_clip = 1.0 -use_averages = false eps = 0.00000001 learn_rate = 0.001 +use_averages = true [training.score_weights] -cats_score = 1.0 cats_score_desc = null cats_micro_p = null cats_micro_r = null @@ -105,6 +110,7 @@ cats_macro_f = null cats_macro_auc = null cats_f_per_type = null cats_macro_auc_per_type = null +cats_score = 1.0 [pretraining] From f6c770eceb831c07740cb3af9fa961dca5715ba4 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 12:48:59 +0100 Subject: [PATCH 011/102] Changes to cNN --- grants_tagger_light/retagging/config.cfg | 4 ++-- grants_tagger_light/retagging/retagging.py | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/grants_tagger_light/retagging/config.cfg b/grants_tagger_light/retagging/config.cfg index 4402991e..e71cf867 100644 --- a/grants_tagger_light/retagging/config.cfg +++ b/grants_tagger_light/retagging/config.cfg @@ -6,8 +6,8 @@ init_tok2vec = null vectors = null [system] -seed = 0 -gpu_allocator = null +seed = 42 +gpu_allocator = "pytorch" [nlp] lang = "en" diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 0b715bb8..da0d67bf 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -49,14 +49,8 @@ def retag( for tag in tags: logging.info(f"Retagging: {tag}") - # nlp = spacy.blank('en') - nlp = spacy.load("en_core_web_sm") - - # textcat = nlp.create_pipe("textcat") - # nlp.add_pipe("textcat", last=True) - # textcat.add_label(tag) - # textcat.add_label("O") + nlp = spacy.load("en_core_web_sm") logging.info(f"Obtaining positive examples for {tag}...") positive_dset = dset.filter( @@ -150,6 +144,9 @@ def retag_cli( ) exit(-1) + spacy.cli.download("en_core_web_sm") + spacy.require_gpu() + retag( data_path, save_to_path, From ec2aeb647f217bbdc48dd16c4eb95b0a68f2923e Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 13:15:35 +0100 Subject: [PATCH 012/102] Adds cpu and gpu --- .../retagging/cnn_gpu_config.cfg | 127 ++++++++++++++++++ grants_tagger_light/retagging/config.cfg | 72 +++++----- grants_tagger_light/retagging/retagging.py | 1 - 3 files changed, 160 insertions(+), 40 deletions(-) create mode 100644 grants_tagger_light/retagging/cnn_gpu_config.cfg diff --git a/grants_tagger_light/retagging/cnn_gpu_config.cfg b/grants_tagger_light/retagging/cnn_gpu_config.cfg new file mode 100644 index 00000000..e71cf867 --- /dev/null +++ b/grants_tagger_light/retagging/cnn_gpu_config.cfg @@ -0,0 +1,127 @@ +[paths] +train = "" +dev = "" +raw = null +init_tok2vec = null +vectors = null + +[system] +seed = 42 +gpu_allocator = "pytorch" + +[nlp] +lang = "en" +pipeline = ["textcat"] +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +batch_size = 1000 + +[components] + +[components.textcat] +factory = "textcat_multilabel" +threshold = 0.5 + +[components.textcat.model] +@architectures = "spacy.TextCatCNN.v1" +exclusive_classes = false +nO = null + +[components.textcat.model.tok2vec] +@architectures = "spacy.Tok2Vec.v2" + +[components.textcat.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.textcat.model.tok2vec.encode:width} +rows = [10000,5000,5000,5000] +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +include_static_vectors = false + +[components.textcat.model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths:dev} +gold_preproc = ${corpora.train.gold_preproc} +max_length = 0 +limit = 0 +augmenter = null + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths:train} +gold_preproc = false +max_length = 0 +limit = 0 +augmenter = null + +[training] +train_corpus = "corpora.train" 
+dev_corpus = "corpora.dev" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.2 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +accumulate_gradient = 1 +frozen_components = [] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_sequence.v1" +size = 32 +get_length = null + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +eps = 0.00000001 +learn_rate = 0.001 +use_averages = true + +[training.score_weights] +cats_score_desc = null +cats_micro_p = null +cats_micro_r = null +cats_micro_f = null +cats_macro_p = null +cats_macro_r = null +cats_macro_f = null +cats_macro_auc = null +cats_f_per_type = null +cats_macro_auc_per_type = null +cats_score = 1.0 + +[pretraining] + +[initialize] +vectors = ${paths.vectors} +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/grants_tagger_light/retagging/config.cfg b/grants_tagger_light/retagging/config.cfg index e71cf867..9d6f2f11 100644 --- a/grants_tagger_light/retagging/config.cfg +++ b/grants_tagger_light/retagging/config.cfg @@ -1,89 +1,83 @@ [paths] -train = "" -dev = "" -raw = null -init_tok2vec = null +train = null +dev = null vectors = null +init_tok2vec = null [system] -seed = 42 -gpu_allocator = "pytorch" +gpu_allocator = null +seed = 0 [nlp] lang = "en" pipeline = ["textcat"] -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} +batch_size = 1000 disabled = [] before_creation = null after_creation = null after_pipeline_creation = null -batch_size = 1000 +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} [components] [components.textcat] -factory = "textcat_multilabel" +factory = "textcat" +scorer = {"@scorers":"spacy.textcat_scorer.v1"} 
threshold = 0.5 [components.textcat.model] -@architectures = "spacy.TextCatCNN.v1" -exclusive_classes = false +@architectures = "spacy.TextCatBOW.v2" +exclusive_classes = true +ngram_size = 1 +no_output_layer = false nO = null -[components.textcat.model.tok2vec] -@architectures = "spacy.Tok2Vec.v2" - -[components.textcat.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.textcat.model.tok2vec.encode:width} -rows = [10000,5000,5000,5000] -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -include_static_vectors = false - -[components.textcat.model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v2" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - [corpora] [corpora.dev] @readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${corpora.train.gold_preproc} +path = ${paths.dev} max_length = 0 +gold_preproc = false limit = 0 augmenter = null [corpora.train] @readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = false +path = ${paths.train} max_length = 0 +gold_preproc = false limit = 0 augmenter = null [training] -train_corpus = "corpora.train" dev_corpus = "corpora.dev" +train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 +dropout = 0.1 +accumulate_gradient = 1 patience = 1600 max_epochs = 0 max_steps = 20000 eval_frequency = 200 -accumulate_gradient = 1 frozen_components = [] +annotating_components = [] before_to_disk = null [training.batcher] -@batchers = "spacy.batch_by_sequence.v1" -size = 32 +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 get_length = null +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + [training.logger] @loggers = "spacy.ConsoleLogger.v1" progress_bar = false @@ -95,11 +89,12 @@ beta2 = 0.999 L2_is_weight_decay = true L2 = 0.01 grad_clip = 1.0 +use_averages = false eps = 0.00000001 learn_rate = 0.001 -use_averages = true 
[training.score_weights] +cats_score = 1.0 cats_score_desc = null cats_micro_p = null cats_micro_r = null @@ -110,7 +105,6 @@ cats_macro_f = null cats_macro_auc = null cats_f_per_type = null cats_macro_auc_per_type = null -cats_score = 1.0 [pretraining] diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index da0d67bf..ceea2b79 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -145,7 +145,6 @@ def retag_cli( exit(-1) spacy.cli.download("en_core_web_sm") - spacy.require_gpu() retag( data_path, From 4289f4f9b5f680b48860c1022cf80888df06a19c Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 13:22:38 +0100 Subject: [PATCH 013/102] Adds large model --- grants_tagger_light/retagging/config.cfg | 13 ++++++++----- grants_tagger_light/retagging/retagging.py | 5 +++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/grants_tagger_light/retagging/config.cfg b/grants_tagger_light/retagging/config.cfg index 9d6f2f11..576c704f 100644 --- a/grants_tagger_light/retagging/config.cfg +++ b/grants_tagger_light/retagging/config.cfg @@ -55,11 +55,13 @@ dev_corpus = "corpora.dev" train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} -dropout = 0.1 +# dropout = 0.1 +dropout = 0 accumulate_gradient = 1 -patience = 1600 -max_epochs = 0 -max_steps = 20000 +# patience = 1600 +patience = 0 +max_epochs = 15 +# max_steps = 20000 eval_frequency = 200 frozen_components = [] annotating_components = [] @@ -91,7 +93,8 @@ L2 = 0.01 grad_clip = 1.0 use_averages = false eps = 0.00000001 -learn_rate = 0.001 +#learn_rate = 0.001 +learn_rate = 0.005 [training.score_weights] cats_score = 1.0 diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index ceea2b79..baa467a4 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -50,7 
+50,7 @@ def retag( for tag in tags: logging.info(f"Retagging: {tag}") - nlp = spacy.load("en_core_web_sm") + nlp = spacy.load("en_core_web_lg") logging.info(f"Obtaining positive examples for {tag}...") positive_dset = dset.filter( @@ -64,6 +64,7 @@ def retag( ) neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=100, split=0.8) + logging.info(f"Processing corpus...") train_data = DocBin() for doc in nlp.pipe(pos_x_train): doc.cats[tag] = 1 @@ -144,7 +145,7 @@ def retag_cli( ) exit(-1) - spacy.cli.download("en_core_web_sm") + spacy.cli.download("en_core_web_lg") retag( data_path, From 96356ae3a6d8b9fac1fb68f9463f340d83ffb5e4 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 13:27:36 +0100 Subject: [PATCH 014/102] Adds large model --- grants_tagger_light/retagging/config.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/config.cfg b/grants_tagger_light/retagging/config.cfg index 576c704f..a5fb381e 100644 --- a/grants_tagger_light/retagging/config.cfg +++ b/grants_tagger_light/retagging/config.cfg @@ -56,7 +56,7 @@ train_corpus = "corpora.train" seed = ${system.seed} gpu_allocator = ${system.gpu_allocator} # dropout = 0.1 -dropout = 0 +dropout = 0.0 accumulate_gradient = 1 # patience = 1600 patience = 0 From e57e891e3eea449e3e6e301f83688a0adc8fff97 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 15:22:49 +0100 Subject: [PATCH 015/102] Adds sparknlp --- grants_tagger_light/retagging/retagging.py | 119 +++-- .../retagging/retagging_spacy.py | 158 ++++++ poetry.lock | 499 ++++++++++++++++-- pyproject.toml | 1 + 4 files changed, 693 insertions(+), 84 deletions(-) create mode 100644 grants_tagger_light/retagging/retagging_spacy.py diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index baa467a4..319a4810 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -7,9 +7,14 @@ from datasets import load_dataset -import spacy -from spacy.tokens import DocBin -from spacy.cli.train import train as spacy_train + +from johnsnowlabs import nlp + +import os + +from sklearn.metrics import classification_report + +spark = nlp.start() retag_app = typer.Typer() @@ -50,8 +55,6 @@ def retag( for tag in tags: logging.info(f"Retagging: {tag}") - nlp = spacy.load("en_core_web_lg") - logging.info(f"Obtaining positive examples for {tag}...") positive_dset = dset.filter( lambda x: tag in x["meshMajor"], num_proc=num_proc @@ -64,45 +67,71 @@ def retag( ) neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=100, split=0.8) - logging.info(f"Processing corpus...") - train_data = DocBin() - for doc in nlp.pipe(pos_x_train): - doc.cats[tag] = 1 - doc.cats['O'] = 0 - train_data.add(doc) - for doc in nlp.pipe(neg_x_train): - doc.cats[tag] = 0 - doc.cats['O'] = 1 - train_data.add(doc) - train_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "train.spacy") - train_data.to_disk(train_data_path) - - test_data = DocBin() - for doc in nlp.pipe(pos_x_test): - doc.cats[tag] = 1 - doc.cats['O'] = 0 - test_data.add(doc) - for doc in nlp.pipe(pos_x_test): - doc.cats[tag] = 0 - doc.cats['O'] = 1 - test_data.add(doc) - test_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test.spacy") - 
test_data.to_disk(test_data_path) - - logging.info(f"Train data size: {len(train_data)}") - logging.info(f"Test data size: {len(test_data)}") - - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.cfg") - output_model_path = "spacy_textcat" - spacy_train( - config_path, - output_path=output_model_path, - overrides={ - "paths.train": train_data_path, - "paths.dev": test_data_path, - }, - ) - break + train_data = [(x, tag) for x in pos_x_train] + train_data.extend([(x, 'other') for x in neg_x_train]) + + columns = ["text", "category"] + train_df = spark.createDataFrame(train_data, columns) + + test_data = [(x, tag) for x in pos_x_test] + test_data.extend([(x, 'other') for x in neg_x_test]) + test_df = spark.createDataFrame(test_data, columns) + + logging.info(train_df.groupBy("category") \ + .count() \ + .orderBy(col("count").desc()) \ + .show()) + + logging.info(test_df.groupBy("category") \ + .count() \ + .orderBy(col("count").desc()) \ + .show()) + + document_assembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = nlp.Tokenizer() \ + .setInputCols(["document"]) \ + .setOutputCol("token") + + bert_embeddings = nlp.BertEmbeddings().pretrained(name='small_bert_L4_256', lang='en') \ + .setInputCols(["document", 'token']) \ + .setOutputCol("embeddings") + + embeddingsSentence = nlp.SentenceEmbeddings() \ + .setInputCols(["document", "embeddings"]) \ + .setOutputCol("sentence_embeddings") \ + .setPoolingStrategy("AVERAGE") + + classsifierdl = nlp.ClassifierDLApproach() \ + .setInputCols(["sentence_embeddings"]) \ + .setOutputCol("label") \ + .setLabelColumn("category") \ + .setMaxEpochs(10) \ + .setLr(0.001) \ + .setBatchSize(8) \ + .setEnableOutputLogs(True) + # .setOutputLogsPath('logs') + + bert_clf_pipeline = nlp.Pipeline(stages=[document_assembler, + tokenizer, + bert_embeddings, + embeddingsSentence, + classsifierdl]) + + clf_pipelineModel = bert_clf_pipeline.fit(train_df) + preds = 
clf_pipelineModel.transform(test_df) + logging.info(preds.select('category', 'text', 'label.result').show(10, truncate=80)) + + preds_df = preds.select('category', 'text', 'label.result').toPandas() + + # The result is an array since in Spark NLP you can have multiple sentences. + # Let's explode the array and get the item(s) inside of result column out + preds_df['result'] = preds_df['result'].apply(lambda x: x[0]) + + print(classification_report(preds_df['category'], preds_df['result'])) + @retag_app.command() @@ -145,8 +174,6 @@ def retag_cli( ) exit(-1) - spacy.cli.download("en_core_web_lg") - retag( data_path, save_to_path, diff --git a/grants_tagger_light/retagging/retagging_spacy.py b/grants_tagger_light/retagging/retagging_spacy.py new file mode 100644 index 00000000..0ce0b26d --- /dev/null +++ b/grants_tagger_light/retagging/retagging_spacy.py @@ -0,0 +1,158 @@ +"""import logging +import os +import random + +import typer +from loguru import logger + +from datasets import load_dataset + +import spacy +from spacy.tokens import DocBin +from spacy.cli.train import train as spacy_train + +retag_app = typer.Typer() + + +def _load_data(dset: list[str], limit=100, split=0.8): + # Partition off part of the train data for evaluation + random.Random(42).shuffle(dset) + train_size = int(split * limit) + train_dset = dset[:train_size] + test_dset = dset[train_size:limit] + return train_dset, test_dset + + +def retag( + data_path: str, + save_to_path: str, + model_key: str = "gpt-3.5-turbo", + num_proc: int = os.cpu_count(), + batch_size: int = 64, + concurrent_calls: int = os.cpu_count() * 2, + tags_file_path: str = None, +): + if model_key.strip().lower() not in ["gpt-3.5-turbo", "text-davinci", "gpt-4"]: + raise NotImplementedError( + f"{model_key} not implemented as an augmentation framework" + ) + + # We only have 1 file, so no sharding is available https://huggingface.co/docs/datasets/loading#multiprocessing + logging.info("Loading the MeSH jsonl...") + dset = 
load_dataset("json", data_files=data_path, num_proc=1) + if "train" in dset: + dset = dset["train"] + + with open(tags_file_path, 'r') as f: + tags = [x.strip() for x in f.readlines()] + + for tag in tags: + logging.info(f"Retagging: {tag}") + + nlp = spacy.load("en_core_web_lg") + + logging.info(f"Obtaining positive examples for {tag}...") + positive_dset = dset.filter( + lambda x: tag in x["meshMajor"], num_proc=num_proc + ) + pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=100, split=0.8) + + logging.info(f"Obtaining negative examples for {tag}...") + negative_dset = dset.filter( + lambda x: tag not in x["meshMajor"], num_proc=num_proc + ) + neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=100, split=0.8) + + logging.info(f"Processing corpus...") + train_data = DocBin() + for doc in nlp.pipe(pos_x_train): + doc.cats[tag] = 1 + doc.cats['O'] = 0 + train_data.add(doc) + for doc in nlp.pipe(neg_x_train): + doc.cats[tag] = 0 + doc.cats['O'] = 1 + train_data.add(doc) + train_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "train.spacy") + train_data.to_disk(train_data_path) + + test_data = DocBin() + for doc in nlp.pipe(pos_x_test): + doc.cats[tag] = 1 + doc.cats['O'] = 0 + test_data.add(doc) + for doc in nlp.pipe(pos_x_test): + doc.cats[tag] = 0 + doc.cats['O'] = 1 + test_data.add(doc) + test_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test.spacy") + test_data.to_disk(test_data_path) + + logging.info(f"Train data size: {len(train_data)}") + logging.info(f"Test data size: {len(test_data)}") + + config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.cfg") + output_model_path = "spacy_textcat" + spacy_train( + config_path, + output_path=output_model_path, + overrides={ + "paths.train": train_data_path, + "paths.dev": test_data_path, + }, + ) + break + + +@retag_app.command() +def retag_cli( + data_path: str = typer.Argument(..., help="Path to 
mesh.jsonl"), + save_to_path: str = typer.Argument( + ..., help="Path where to save the retagged data" + ), + model_key: str = typer.Option( + "gpt-3.5-turbo", + help="LLM to use data augmentation. By now, only `openai` is supported", + ), + num_proc: int = typer.Option( + os.cpu_count(), help="Number of processes to use for data augmentation" + ), + batch_size: int = typer.Option( + 64, help="Preprocessing batch size (for dataset, filter, map, ...)" + ), + concurrent_calls: int = typer.Option( + os.cpu_count() * 2, + min=1, + help="Concurrent calls with 1 tag each to the different model", + ), + tags_file_path: str = typer.Option( + None, + help="Text file containing one line per tag to be considered. " + "The rest will be discarded.", + ), +): + if not data_path.endswith("jsonl"): + logger.error( + "It seems your input MeSH data is not in `jsonl` format. " + "Please, run first `scripts/mesh_json_to_jsonl.py.`" + ) + exit(-1) + + if tags_file_path is None: + logger.error( + "To understand which tags need to be augmented set the path to the tags file in --tags-file-path" + ) + exit(-1) + + spacy.cli.download("en_core_web_lg") + + retag( + data_path, + save_to_path, + model_key=model_key, + num_proc=num_proc, + batch_size=batch_size, + concurrent_calls=concurrent_calls, + tags_file_path=tags_file_path, + ) +""" \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 782ece71..ef747606 100644 --- a/poetry.lock +++ b/poetry.lock @@ -267,6 +267,17 @@ files = [ {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, ] +[[package]] +name = "appnope" +version = "0.1.3" +description = "Disable App Nap on macOS >= 10.9" +optional = false +python-versions = "*" +files = [ + {file = "appnope-0.1.3-py2.py3-none-any.whl", hash = "sha256:265a455292d0bd8a72453494fa24df5a11eb18373a60c7c0430889f22548605e"}, + {file = "appnope-0.1.3.tar.gz", hash = 
"sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24"}, +] + [[package]] name = "argilla" version = "1.10.0" @@ -298,6 +309,23 @@ postgresql = ["psycopg2 (>=2.9.5,<2.10.0)", "psycopg2-binary (>=2.9.5,<2.10.0)"] server = ["PyYAML (>=5.4.1,<6.1.0)", "SQLAlchemy (>=2.0.0,<2.1.0)", "aiofiles (>=0.6,<22.2)", "alembic (>=1.9.0,<1.10.0)", "brotli-asgi (>=1.1,<1.3)", "elasticsearch8[async] (>=8.7.0,<8.8.0)", "fastapi (>=0.75,<0.89)", "luqum (>=0.11,<0.13)", "opensearch-py (>=2.0.0,<2.1.0)", "passlib[bcrypt] (>=1.7.4,<1.8.0)", "psutil (>=5.8,<5.10)", "python-jose[cryptography] (>=3.2,<3.4)", "python-multipart (>=0.0.5,<0.1.0)", "scikit-learn (>=0.24.2)", "segment-analytics-python (==2.2.0)", "smart-open", "uvicorn[standard] (>=0.15.0,<0.21.0)"] tests = ["cleanlab (>=2.0.0,<2.1.0)", "datasets (>1.17.0,!=2.3.2)", "evaluate", "factory-boy (>=3.2.1,<3.3.0)", "faiss-cpu", "flair (>=0.12.2)", "flyingsquid", "huggingface-hub (>=0.5.0,<0.13)", "openai", "pgmpy", "plotly (>=4.1.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-mock", "rich (==13.0.1)", "seqeval", "setfit", "snorkel (>=0.9.7)", "spacy (==3.5.0)", "span-marker", "transformers[torch] (>=4.19.0)"] +[[package]] +name = "asttokens" +version = "2.4.0" +description = "Annotate AST trees with source code positions" +optional = false +python-versions = "*" +files = [ + {file = "asttokens-2.4.0-py2.py3-none-any.whl", hash = "sha256:cf8fc9e61a86461aa9fb161a14a0841a03c405fa829ac6b202670b3495d2ce69"}, + {file = "asttokens-2.4.0.tar.gz", hash = "sha256:2e0171b991b2c959acc6c49318049236844a5da1d65ba2672c4880c1c894834e"}, +] + +[package.dependencies] +six = ">=1.12.0" + +[package.extras] +test = ["astroid", "pytest"] + [[package]] name = "async-timeout" version = "4.0.2" @@ -398,6 +426,17 @@ redshift = ["redshift-connector (>=2.0.0,<3.0.0)"] sparql = ["SPARQLWrapper (>=2.0.0,<3.0.0)", "requests (>=2.0.0,<3.0.0)"] sqlserver = ["pyodbc (>=4.0.0,<5.0.0)"] +[[package]] +name = "backcall" +version = "0.2.0" 
+description = "Specifications for callback functions passed in to an API" +optional = false +python-versions = "*" +files = [ + {file = "backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"}, + {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, +] + [[package]] name = "backoff" version = "2.2.1" @@ -1019,6 +1058,51 @@ files = [ {file = "cymem-2.0.7.tar.gz", hash = "sha256:e6034badb5dd4e10344211c81f16505a55553a7164adc314c75bd80cf07e57a8"}, ] +[[package]] +name = "databricks-api" +version = "0.9.0" +description = "Databricks API client auto-generated from the official databricks-cli package" +optional = false +python-versions = ">=3.6,<4.0" +files = [ + {file = "databricks_api-0.9.0-py3-none-any.whl", hash = "sha256:51327fc1a06d9f4125a7a74d6764c3f1e99b6fb8f4b7f7cc178679b2c0d8ae5b"}, + {file = "databricks_api-0.9.0.tar.gz", hash = "sha256:40db26831ae37d2659d2700f4cb253615d895b6d440b99fb995aed51e67928f0"}, +] + +[package.dependencies] +databricks-cli = "*" + +[[package]] +name = "databricks-cli" +version = "0.17.7" +description = "A command line interface for Databricks" +optional = false +python-versions = "*" +files = [ + {file = "databricks-cli-0.17.7.tar.gz", hash = "sha256:5a545063449f3b9ad904644c0f251058485e29e564dedf8d4e4a7b45caf9549b"}, + {file = "databricks_cli-0.17.7-py2-none-any.whl", hash = "sha256:5b025943c70bbd374415264d38bfaddfb34ce070fadb083d851aec311e0f8901"}, +] + +[package.dependencies] +click = ">=7.0" +oauthlib = ">=3.1.0" +pyjwt = ">=1.7.0" +requests = ">=2.17.3" +six = ">=1.10.0" +tabulate = ">=0.7.7" +urllib3 = ">=1.26.7,<2.0.0" + +[[package]] +name = "dataclasses" +version = "0.6" +description = "A backport of the dataclasses module for Python 3.6" +optional = false +python-versions = "*" +files = [ + {file = "dataclasses-0.6-py3-none-any.whl", hash = 
"sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f"}, + {file = "dataclasses-0.6.tar.gz", hash = "sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84"}, +] + [[package]] name = "datasets" version = "2.13.1" @@ -1061,6 +1145,17 @@ tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "elast torch = ["torch"] vision = ["Pillow (>=6.2.1)"] +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +optional = false +python-versions = ">=3.5" +files = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] + [[package]] name = "deprecated" version = "1.2.14" @@ -1473,6 +1568,20 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "executing" +version = "1.2.0" +description = "Get the currently executing AST node of a frame, and other information" +optional = false +python-versions = "*" +files = [ + {file = "executing-1.2.0-py2.py3-none-any.whl", hash = "sha256:0314a69e37426e3608aada02473b4161d4caf5a4b244d1d0c48072b8fee7bacc"}, + {file = "executing-1.2.0.tar.gz", hash = "sha256:19da64c18d2d851112f09c287f8d3dbbdf725ab0e569077efb6cdcbd3497c107"}, +] + +[package.extras] +tests = ["asttokens", "littleutils", "pytest", "rich"] + [[package]] name = "filelock" version = "3.12.2" @@ -1821,6 +1930,45 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "ipython" +version = "8.15.0" +description = "IPython: Productive Interactive Computing" +optional = false +python-versions = ">=3.9" +files = [ + {file = "ipython-8.15.0-py3-none-any.whl", hash = "sha256:45a2c3a529296870a97b7de34eda4a31bee16bc7bf954e07d39abe49caf8f887"}, + {file = "ipython-8.15.0.tar.gz", hash = 
"sha256:2baeb5be6949eeebf532150f81746f8333e2ccce02de1c7eedde3f23ed5e9f1e"}, +] + +[package.dependencies] +appnope = {version = "*", markers = "sys_platform == \"darwin\""} +backcall = "*" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +decorator = "*" +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +jedi = ">=0.16" +matplotlib-inline = "*" +pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""} +pickleshare = "*" +prompt-toolkit = ">=3.0.30,<3.0.37 || >3.0.37,<3.1.0" +pygments = ">=2.4.0" +stack-data = "*" +traitlets = ">=5" + +[package.extras] +all = ["black", "curio", "docrepr", "exceptiongroup", "ipykernel", "ipyparallel", "ipywidgets", "matplotlib", "matplotlib (!=3.2.0)", "nbconvert", "nbformat", "notebook", "numpy (>=1.21)", "pandas", "pytest (<7)", "pytest (<7.1)", "pytest-asyncio", "qtconsole", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "trio", "typing-extensions"] +black = ["black"] +doc = ["docrepr", "exceptiongroup", "ipykernel", "matplotlib", "pytest (<7)", "pytest (<7.1)", "pytest-asyncio", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "typing-extensions"] +kernel = ["ipykernel"] +nbconvert = ["nbconvert"] +nbformat = ["nbformat"] +notebook = ["ipywidgets", "notebook"] +parallel = ["ipyparallel"] +qtconsole = ["qtconsole"] +test = ["pytest (<7.1)", "pytest-asyncio", "testpath"] +test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.21)", "pandas", "pytest (<7.1)", "pytest-asyncio", "testpath", "trio"] + [[package]] name = "isort" version = "5.12.0" @@ -1859,6 +2007,25 @@ requests = "*" dev = ["mypy (==0.971)", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)", "types-requests"] tests = ["mypy (==0.971)", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)", 
"types-requests"] +[[package]] +name = "jedi" +version = "0.19.0" +description = "An autocompletion tool for Python that can be used for text editors." +optional = false +python-versions = ">=3.6" +files = [ + {file = "jedi-0.19.0-py2.py3-none-any.whl", hash = "sha256:cb8ce23fbccff0025e9386b5cf85e892f94c9b822378f8da49970471335ac64e"}, + {file = "jedi-0.19.0.tar.gz", hash = "sha256:bcf9894f1753969cbac8022a8c2eaee06bfa3724e4192470aaffe7eb6272b0c4"}, +] + +[package.dependencies] +parso = ">=0.8.3,<0.9.0" + +[package.extras] +docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx (==1.8.5)", "sphinx-rtd-theme (==0.4.3)", "sphinxcontrib-serializinghtml (==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] +qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] +testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] + [[package]] name = "jinja2" version = "3.1.2" @@ -1898,6 +2065,29 @@ files = [ {file = "joblib-1.3.1.tar.gz", hash = "sha256:1f937906df65329ba98013dc9692fe22a4c5e4a648112de500508b18a21b41e3"}, ] +[[package]] +name = "johnsnowlabs" +version = "5.0.7" +description = "The John Snow Labs Library gives you access to all of John Snow Labs Enterprise And Open Source products in an easy and simple manner. Access 10000+ state-of-the-art NLP and OCR models for Finance, Legal and Medical domains. 
Easily scalable to Spark Cluster" +optional = false +python-versions = "*" +files = [ + {file = "johnsnowlabs-5.0.7-py3-none-any.whl", hash = "sha256:b95044738d93a6650081c87f00cd4e2ffa43288e45c508b916f8041da94bbddd"}, + {file = "johnsnowlabs-5.0.7.tar.gz", hash = "sha256:528c58164bea42e7d2311907568a6898565eeb61105f1554569b9caa72bf9fd7"}, +] + +[package.dependencies] +colorama = "*" +databricks-api = "*" +dataclasses = "*" +nlu = "5.0.0" +numpy = "*" +pydantic = "1.10.11" +pyspark = "3.1.2" +requests = "*" +spark-nlp = "5.0.2" +spark-nlp-display = "4.1" + [[package]] name = "kombu" version = "5.3.1" @@ -2056,6 +2246,20 @@ files = [ {file = "MarkupSafe-2.1.3.tar.gz", hash = "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad"}, ] +[[package]] +name = "matplotlib-inline" +version = "0.1.6" +description = "Inline Matplotlib backend for Jupyter" +optional = false +python-versions = ">=3.5" +files = [ + {file = "matplotlib-inline-0.1.6.tar.gz", hash = "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"}, + {file = "matplotlib_inline-0.1.6-py3-none-any.whl", hash = "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311"}, +] + +[package.dependencies] +traitlets = "*" + [[package]] name = "monotonic" version = "1.6" @@ -2269,6 +2473,24 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx- extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] +[[package]] +name = "nlu" +version = "5.0.0" +description = "John Snow Labs NLU provides state of the art algorithms for NLP&NLU with 20000+ of pretrained models in 200+ languages. It enables swift and simple development and research with its powerful Pythonic and Keras inspired API. It is powerd by John Snow Labs powerful Spark NLP library." 
+optional = false +python-versions = "*" +files = [ + {file = "nlu-5.0.0-py3-none-any.whl", hash = "sha256:4e9b62ab1e822d15881657dd320fed62c032856a6a5783e14172b92196116fbd"}, + {file = "nlu-5.0.0.tar.gz", hash = "sha256:e22d834839c1a7fe4a91aa6f21e79921798d0a4d1d643b03ef07f37d0bec7e75"}, +] + +[package.dependencies] +dataclasses = "*" +numpy = "*" +pandas = ">=1.3.5" +pyarrow = ">=0.16.0" +spark-nlp = ">=5.0.2" + [[package]] name = "nodeenv" version = "1.8.0" @@ -2320,6 +2542,22 @@ files = [ {file = "numpy-1.23.5.tar.gz", hash = "sha256:1b1766d6f397c18153d40015ddfc79ddb715cabadc04d2d228d4e5a8bc4ded1a"}, ] +[[package]] +name = "oauthlib" +version = "3.2.2" +description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +optional = false +python-versions = ">=3.6" +files = [ + {file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"}, + {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, +] + +[package.extras] +rsa = ["cryptography (>=3.0.0)"] +signals = ["blinker (>=1.4.0)"] +signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] + [[package]] name = "omegaconf" version = "2.3.0" @@ -2486,6 +2724,21 @@ pytz = ">=2020.1" [package.extras] test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +[[package]] +name = "parso" +version = "0.8.3" +description = "A Python Parser" +optional = false +python-versions = ">=3.6" +files = [ + {file = "parso-0.8.3-py2.py3-none-any.whl", hash = "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"}, + {file = "parso-0.8.3.tar.gz", hash = "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0"}, +] + +[package.extras] +qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] +testing = ["docopt", "pytest (<6.0.0)"] + [[package]] name = "pathspec" version = "0.11.1" @@ -2529,6 +2782,31 @@ gcs = ["google-cloud-storage 
(>=1.26.0,<2.0.0)"] s3 = ["boto3"] test = ["mock", "pytest", "pytest-coverage", "typer-cli"] +[[package]] +name = "pexpect" +version = "4.8.0" +description = "Pexpect allows easy control of interactive console applications." +optional = false +python-versions = "*" +files = [ + {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, + {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + +[[package]] +name = "pickleshare" +version = "0.7.5" +description = "Tiny 'shelve'-like database with concurrency support" +optional = false +python-versions = "*" +files = [ + {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, + {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, +] + [[package]] name = "platformdirs" version = "3.9.1" @@ -2680,6 +2958,42 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +optional = false +python-versions = "*" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + +[[package]] +name = "pure-eval" +version = "0.2.2" +description = "Safely evaluate AST nodes without side effects" +optional = false +python-versions = "*" +files = [ + {file = "pure_eval-0.2.2-py3-none-any.whl", hash = "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350"}, + {file = "pure_eval-0.2.2.tar.gz", hash = 
"sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"}, +] + +[package.extras] +tests = ["pytest"] + +[[package]] +name = "py4j" +version = "0.10.9" +description = "Enables Python programs to dynamically access arbitrary Java objects" +optional = false +python-versions = "*" +files = [ + {file = "py4j-0.10.9-py2.py3-none-any.whl", hash = "sha256:859ba728a7bb43e9c2bf058832759fb97a598bb28cc12f34f5fc4abdec08ede6"}, + {file = "py4j-0.10.9.tar.gz", hash = "sha256:36ec57f43ff8ced260a18aa9a4e46c3500a730cac8860e259cbaa546c2b9db2f"}, +] + [[package]] name = "pyarrow" version = "12.0.1" @@ -2730,47 +3044,47 @@ files = [ [[package]] name = "pydantic" -version = "1.10.12" +version = "1.10.11" description = "Data validation and settings management using python type hints" optional = false python-versions = ">=3.7" files = [ - {file = "pydantic-1.10.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a1fcb59f2f355ec350073af41d927bf83a63b50e640f4dbaa01053a28b7a7718"}, - {file = "pydantic-1.10.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b7ccf02d7eb340b216ec33e53a3a629856afe1c6e0ef91d84a4e6f2fb2ca70fe"}, - {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fb2aa3ab3728d950bcc885a2e9eff6c8fc40bc0b7bb434e555c215491bcf48b"}, - {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:771735dc43cf8383959dc9b90aa281f0b6092321ca98677c5fb6125a6f56d58d"}, - {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca48477862372ac3770969b9d75f1bf66131d386dba79506c46d75e6b48c1e09"}, - {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a5e7add47a5b5a40c49b3036d464e3c7802f8ae0d1e66035ea16aa5b7a3923ed"}, - {file = "pydantic-1.10.12-cp310-cp310-win_amd64.whl", hash = "sha256:e4129b528c6baa99a429f97ce733fff478ec955513630e61b49804b6cf9b224a"}, - {file = 
"pydantic-1.10.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0d191db0f92dfcb1dec210ca244fdae5cbe918c6050b342d619c09d31eea0cc"}, - {file = "pydantic-1.10.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:795e34e6cc065f8f498c89b894a3c6da294a936ee71e644e4bd44de048af1405"}, - {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69328e15cfda2c392da4e713443c7dbffa1505bc9d566e71e55abe14c97ddc62"}, - {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2031de0967c279df0d8a1c72b4ffc411ecd06bac607a212892757db7462fc494"}, - {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ba5b2e6fe6ca2b7e013398bc7d7b170e21cce322d266ffcd57cca313e54fb246"}, - {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2a7bac939fa326db1ab741c9d7f44c565a1d1e80908b3797f7f81a4f86bc8d33"}, - {file = "pydantic-1.10.12-cp311-cp311-win_amd64.whl", hash = "sha256:87afda5539d5140cb8ba9e8b8c8865cb5b1463924d38490d73d3ccfd80896b3f"}, - {file = "pydantic-1.10.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:549a8e3d81df0a85226963611950b12d2d334f214436a19537b2efed61b7639a"}, - {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:598da88dfa127b666852bef6d0d796573a8cf5009ffd62104094a4fe39599565"}, - {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba5c4a8552bff16c61882db58544116d021d0b31ee7c66958d14cf386a5b5350"}, - {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c79e6a11a07da7374f46970410b41d5e266f7f38f6a17a9c4823db80dadf4303"}, - {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab26038b8375581dc832a63c948f261ae0aa21f1d34c1293469f135fa92972a5"}, - {file = "pydantic-1.10.12-cp37-cp37m-win_amd64.whl", hash = 
"sha256:e0a16d274b588767602b7646fa05af2782576a6cf1022f4ba74cbb4db66f6ca8"}, - {file = "pydantic-1.10.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a9dfa722316f4acf4460afdf5d41d5246a80e249c7ff475c43a3a1e9d75cf62"}, - {file = "pydantic-1.10.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a73f489aebd0c2121ed974054cb2759af8a9f747de120acd2c3394cf84176ccb"}, - {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b30bcb8cbfccfcf02acb8f1a261143fab622831d9c0989707e0e659f77a18e0"}, - {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fcfb5296d7877af406ba1547dfde9943b1256d8928732267e2653c26938cd9c"}, - {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2f9a6fab5f82ada41d56b0602606a5506aab165ca54e52bc4545028382ef1c5d"}, - {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dea7adcc33d5d105896401a1f37d56b47d443a2b2605ff8a969a0ed5543f7e33"}, - {file = "pydantic-1.10.12-cp38-cp38-win_amd64.whl", hash = "sha256:1eb2085c13bce1612da8537b2d90f549c8cbb05c67e8f22854e201bde5d98a47"}, - {file = "pydantic-1.10.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef6c96b2baa2100ec91a4b428f80d8f28a3c9e53568219b6c298c1125572ebc6"}, - {file = "pydantic-1.10.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c076be61cd0177a8433c0adcb03475baf4ee91edf5a4e550161ad57fc90f523"}, - {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d5a58feb9a39f481eda4d5ca220aa8b9d4f21a41274760b9bc66bfd72595b86"}, - {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5f805d2d5d0a41633651a73fa4ecdd0b3d7a49de4ec3fadf062fe16501ddbf1"}, - {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1289c180abd4bd4555bb927c42ee42abc3aee02b0fb2d1223fb7c6e5bef87dbe"}, - {file = 
"pydantic-1.10.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5d1197e462e0364906cbc19681605cb7c036f2475c899b6f296104ad42b9f5fb"}, - {file = "pydantic-1.10.12-cp39-cp39-win_amd64.whl", hash = "sha256:fdbdd1d630195689f325c9ef1a12900524dceb503b00a987663ff4f58669b93d"}, - {file = "pydantic-1.10.12-py3-none-any.whl", hash = "sha256:b749a43aa51e32839c9d71dc67eb1e4221bb04af1033a32e3923d46f9effa942"}, - {file = "pydantic-1.10.12.tar.gz", hash = "sha256:0fe8a415cea8f340e7a9af9c54fc71a649b43e8ca3cc732986116b3cb135d303"}, + {file = "pydantic-1.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ff44c5e89315b15ff1f7fdaf9853770b810936d6b01a7bcecaa227d2f8fe444f"}, + {file = "pydantic-1.10.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a6c098d4ab5e2d5b3984d3cb2527e2d6099d3de85630c8934efcfdc348a9760e"}, + {file = "pydantic-1.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16928fdc9cb273c6af00d9d5045434c39afba5f42325fb990add2c241402d151"}, + {file = "pydantic-1.10.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0588788a9a85f3e5e9ebca14211a496409cb3deca5b6971ff37c556d581854e7"}, + {file = "pydantic-1.10.11-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e9baf78b31da2dc3d3f346ef18e58ec5f12f5aaa17ac517e2ffd026a92a87588"}, + {file = "pydantic-1.10.11-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:373c0840f5c2b5b1ccadd9286782852b901055998136287828731868027a724f"}, + {file = "pydantic-1.10.11-cp310-cp310-win_amd64.whl", hash = "sha256:c3339a46bbe6013ef7bdd2844679bfe500347ac5742cd4019a88312aa58a9847"}, + {file = "pydantic-1.10.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:08a6c32e1c3809fbc49debb96bf833164f3438b3696abf0fbeceb417d123e6eb"}, + {file = "pydantic-1.10.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a451ccab49971af043ec4e0d207cbc8cbe53dbf148ef9f19599024076fe9c25b"}, + {file = 
"pydantic-1.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5b02d24f7b2b365fed586ed73582c20f353a4c50e4be9ba2c57ab96f8091ddae"}, + {file = "pydantic-1.10.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f34739a89260dfa420aa3cbd069fbcc794b25bbe5c0a214f8fb29e363484b66"}, + {file = "pydantic-1.10.11-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:e297897eb4bebde985f72a46a7552a7556a3dd11e7f76acda0c1093e3dbcf216"}, + {file = "pydantic-1.10.11-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d185819a7a059550ecb85d5134e7d40f2565f3dd94cfd870132c5f91a89cf58c"}, + {file = "pydantic-1.10.11-cp311-cp311-win_amd64.whl", hash = "sha256:4400015f15c9b464c9db2d5d951b6a780102cfa5870f2c036d37c23b56f7fc1b"}, + {file = "pydantic-1.10.11-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2417de68290434461a266271fc57274a138510dca19982336639484c73a07af6"}, + {file = "pydantic-1.10.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:331c031ba1554b974c98679bd0780d89670d6fd6f53f5d70b10bdc9addee1713"}, + {file = "pydantic-1.10.11-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8268a735a14c308923e8958363e3a3404f6834bb98c11f5ab43251a4e410170c"}, + {file = "pydantic-1.10.11-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:44e51ba599c3ef227e168424e220cd3e544288c57829520dc90ea9cb190c3248"}, + {file = "pydantic-1.10.11-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d7781f1d13b19700b7949c5a639c764a077cbbdd4322ed505b449d3ca8edcb36"}, + {file = "pydantic-1.10.11-cp37-cp37m-win_amd64.whl", hash = "sha256:7522a7666157aa22b812ce14c827574ddccc94f361237ca6ea8bb0d5c38f1629"}, + {file = "pydantic-1.10.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc64eab9b19cd794a380179ac0e6752335e9555d214cfcb755820333c0784cb3"}, + {file = "pydantic-1.10.11-cp38-cp38-macosx_11_0_arm64.whl", hash = 
"sha256:8dc77064471780262b6a68fe67e013298d130414d5aaf9b562c33987dbd2cf4f"}, + {file = "pydantic-1.10.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe429898f2c9dd209bd0632a606bddc06f8bce081bbd03d1c775a45886e2c1cb"}, + {file = "pydantic-1.10.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:192c608ad002a748e4a0bed2ddbcd98f9b56df50a7c24d9a931a8c5dd053bd3d"}, + {file = "pydantic-1.10.11-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ef55392ec4bb5721f4ded1096241e4b7151ba6d50a50a80a2526c854f42e6a2f"}, + {file = "pydantic-1.10.11-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41e0bb6efe86281623abbeeb0be64eab740c865388ee934cd3e6a358784aca6e"}, + {file = "pydantic-1.10.11-cp38-cp38-win_amd64.whl", hash = "sha256:265a60da42f9f27e0b1014eab8acd3e53bd0bad5c5b4884e98a55f8f596b2c19"}, + {file = "pydantic-1.10.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:469adf96c8e2c2bbfa655fc7735a2a82f4c543d9fee97bd113a7fb509bf5e622"}, + {file = "pydantic-1.10.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e6cbfbd010b14c8a905a7b10f9fe090068d1744d46f9e0c021db28daeb8b6de1"}, + {file = "pydantic-1.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abade85268cc92dff86d6effcd917893130f0ff516f3d637f50dadc22ae93999"}, + {file = "pydantic-1.10.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e9738b0f2e6c70f44ee0de53f2089d6002b10c33264abee07bdb5c7f03038303"}, + {file = "pydantic-1.10.11-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:787cf23e5a0cde753f2eabac1b2e73ae3844eb873fd1f5bdbff3048d8dbb7604"}, + {file = "pydantic-1.10.11-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:174899023337b9fc685ac8adaa7b047050616136ccd30e9070627c1aaab53a13"}, + {file = "pydantic-1.10.11-cp39-cp39-win_amd64.whl", hash = "sha256:1954f8778489a04b245a1e7b8b22a9d3ea8ef49337285693cf6959e4b757535e"}, + {file = 
"pydantic-1.10.11-py3-none-any.whl", hash = "sha256:008c5e266c8aada206d0627a011504e14268a62091450210eda7c07fabe6963e"}, + {file = "pydantic-1.10.11.tar.gz", hash = "sha256:f66d479cf7eb331372c470614be6511eae96f1f120344c25f3f9bb59fb1b5528"}, ] [package.dependencies] @@ -2862,6 +3176,23 @@ files = [ {file = "pygtrie-2.5.0.tar.gz", hash = "sha256:203514ad826eb403dab1d2e2ddd034e0d1534bbe4dbe0213bb0593f66beba4e2"}, ] +[[package]] +name = "pyjwt" +version = "2.8.0" +description = "JSON Web Token implementation in Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"}, + {file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"}, +] + +[package.extras] +crypto = ["cryptography (>=3.4.0)"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] +docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] + [[package]] name = "pyparsing" version = "3.1.0" @@ -2876,6 +3207,24 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pyspark" +version = "3.1.2" +description = "Apache Spark Python API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pyspark-3.1.2.tar.gz", hash = "sha256:5e25ebb18756e9715f4d26848cc7e558035025da74b4fc325a0ebc05ff538e65"}, +] + +[package.dependencies] +py4j = "0.10.9" + +[package.extras] +ml = ["numpy (>=1.7)"] +mllib = ["numpy (>=1.7)"] +sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] + [[package]] name = "pytest" version = "7.4.0" @@ -3765,6 +4114,35 @@ files = [ {file = "spacy_loggers-1.0.4-py3-none-any.whl", hash = "sha256:e050bf2e63208b2f096b777e494971c962ad7c1dc997641c8f95c622550044ae"}, ] +[[package]] +name = 
"spark-nlp" +version = "5.0.2" +description = "John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment." +optional = false +python-versions = "*" +files = [ + {file = "spark-nlp-5.0.2.tar.gz", hash = "sha256:690a9509bea5adddb55557539ca8fc1a8b949e73fb69499007829ae857284050"}, + {file = "spark_nlp-5.0.2-py2.py3-none-any.whl", hash = "sha256:898da78131364934dcaa715d8a763ec751e06b2d901a07fe5ca0c1a03d51ce47"}, +] + +[[package]] +name = "spark-nlp-display" +version = "4.1" +description = "Visualization package for Spark NLP" +optional = false +python-versions = ">=2.7" +files = [ + {file = "spark-nlp-display-4.1.tar.gz", hash = "sha256:2ef6a3db7702b0e2b455c150b3322eb5505896b57482f5f6aafd5c1e149ff6b6"}, + {file = "spark_nlp_display-4.1-py3-none-any.whl", hash = "sha256:5af5ae18b8669cb9b2b9bea577e44ad609297a68d6f6c2e3d9ff9f52e26e0440"}, +] + +[package.dependencies] +ipython = "*" +numpy = "*" +pandas = "*" +spark-nlp = "*" +svgwrite = "1.4" + [[package]] name = "sqltrie" version = "0.7.0" @@ -3825,6 +4203,36 @@ files = [ [package.dependencies] catalogue = ">=2.0.3,<2.1.0" +[[package]] +name = "stack-data" +version = "0.6.2" +description = "Extract data from python stack frames and tracebacks for informative displays" +optional = false +python-versions = "*" +files = [ + {file = "stack_data-0.6.2-py3-none-any.whl", hash = "sha256:cbb2a53eb64e5785878201a97ed7c7b94883f48b87bfb0bbe8b623c74679e4a8"}, + {file = "stack_data-0.6.2.tar.gz", hash = "sha256:32d2dd0376772d01b6cb9fc996f3c8b57a357089dec328ed4b6553d037eaf815"}, +] + +[package.dependencies] +asttokens = ">=2.1.0" +executing = ">=1.2.0" +pure-eval = "*" + +[package.extras] +tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] + +[[package]] +name = "svgwrite" +version = "1.4" +description = "A Python library to create SVG 
drawings." +optional = false +python-versions = ">=3.6" +files = [ + {file = "svgwrite-1.4-py3-none-any.whl", hash = "sha256:fa842fb3129a9399d19b5e9602a022fcc7f2f3f24713550e765c488ffafd743d"}, + {file = "svgwrite-1.4.zip", hash = "sha256:b38ac03b67f81c728d81a33e4711aaf3ab136a57156d721bb17f88525d9909bb"}, +] + [[package]] name = "sympy" version = "1.12" @@ -4082,6 +4490,21 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "traitlets" +version = "5.9.0" +description = "Traitlets Python configuration system" +optional = false +python-versions = ">=3.7" +files = [ + {file = "traitlets-5.9.0-py3-none-any.whl", hash = "sha256:9e6ec080259b9a5940c797d58b613b5e31441c2257b87c2e795c5228ae80d2d8"}, + {file = "traitlets-5.9.0.tar.gz", hash = "sha256:f6cde21a9c68cf756af02035f72d5a723bf607e862e7be33ece505abf4a3bad9"}, +] + +[package.extras] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] +test = ["argcomplete (>=2.0)", "pre-commit", "pytest", "pytest-mock"] + [[package]] name = "transformers" version = "4.29.2" @@ -4651,4 +5074,4 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "c246890fd08f7d69ace373434b1d4cf0adc5bcb1f76177ee34d28ad12b839afa" +content-hash = "eb7bef16bff140a7569ebb217e8d0d8d8220a883340f533a0be071efe490b500" diff --git a/pyproject.toml b/pyproject.toml index 964f7a19..a9387b00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ wandb = "^0.15.4" openai = "0.27.8" openai-multi-client = "^0.1.1" spacy = "^3.6.1" +johnsnowlabs = "^5.0.7" [tool.poetry.group.dev] From e38c66803f3ef1221b74dbd7dd6b0835a4d2f72a Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 15:30:01 +0100 Subject: [PATCH 016/102] Adds sparknlp --- grants_tagger_light/retagging/retagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 319a4810..80722112 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -1,5 +1,4 @@ import logging -import os import random import typer @@ -13,6 +12,7 @@ import os from sklearn.metrics import classification_report +from pyspark.sql.functions import * spark = nlp.start() From da10221c7e57e03cadf7829f57a6dfe03d9ecfda Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 8 Sep 2023 14:54:46 +0000 Subject: [PATCH 017/102] Different config of sparknlp --- grants_tagger_light/retagging/retagging.py | 13 ++----------- tags_to_augment.txt | 1 + 2 files changed, 3 insertions(+), 11 deletions(-) create mode 100644 tags_to_augment.txt diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 80722112..0f0400be 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -91,18 +91,9 @@ def retag( .setInputCol("text") \ .setOutputCol("document") - tokenizer = nlp.Tokenizer() \ + embeddingsSentence = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_cased", "en") \ .setInputCols(["document"]) \ - .setOutputCol("token") - - bert_embeddings = nlp.BertEmbeddings().pretrained(name='small_bert_L4_256', lang='en') \ - .setInputCols(["document", 'token']) \ - .setOutputCol("embeddings") - - embeddingsSentence = nlp.SentenceEmbeddings() \ - .setInputCols(["document", "embeddings"]) \ .setOutputCol("sentence_embeddings") \ - .setPoolingStrategy("AVERAGE") classsifierdl = nlp.ClassifierDLApproach() \ .setInputCols(["sentence_embeddings"]) \ @@ -110,7 +101,7 @@ def retag( .setLabelColumn("category") \ .setMaxEpochs(10) \ .setLr(0.001) \ - 
.setBatchSize(8) \ + .setBatchSize(1) \ .setEnableOutputLogs(True) # .setOutputLogsPath('logs') diff --git a/tags_to_augment.txt b/tags_to_augment.txt new file mode 100644 index 00000000..7d3e95ef --- /dev/null +++ b/tags_to_augment.txt @@ -0,0 +1 @@ +Artificial Intelligence From 848fd7a8ef82b07dd85da134669af173c27271f9 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 16:08:24 +0100 Subject: [PATCH 018/102] Fixes pipeline --- grants_tagger_light/retagging/retagging.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 0f0400be..c7b18435 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -91,7 +91,7 @@ def retag( .setInputCol("text") \ .setOutputCol("document") - embeddingsSentence = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_cased", "en") \ + embeddings = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_cased", "en") \ .setInputCols(["document"]) \ .setOutputCol("sentence_embeddings") \ @@ -106,9 +106,7 @@ def retag( # .setOutputLogsPath('logs') bert_clf_pipeline = nlp.Pipeline(stages=[document_assembler, - tokenizer, - bert_embeddings, - embeddingsSentence, + embeddings, classsifierdl]) clf_pipelineModel = bert_clf_pipeline.fit(train_df) From bf06cc86485342a92251cb580f493395e091f12f Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 16:28:10 +0100 Subject: [PATCH 019/102] Prototypes retagging after training --- grants_tagger_light/retagging/retagging.py | 32 ++++++++++++++-------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index c7b18435..41518cdd 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -95,7 +95,7 @@ def retag( .setInputCols(["document"]) \ .setOutputCol("sentence_embeddings") \ - classsifierdl = nlp.ClassifierDLApproach() \ + classifierdl = nlp.ClassifierDLApproach() \ .setInputCols(["sentence_embeddings"]) \ .setOutputCol("label") \ .setLabelColumn("category") \ @@ -105,21 +105,31 @@ def retag( .setEnableOutputLogs(True) # .setOutputLogsPath('logs') - bert_clf_pipeline = nlp.Pipeline(stages=[document_assembler, - embeddings, - classsifierdl]) + clf_pipeline = nlp.Pipeline(stages=[document_assembler, + embeddings, + classifierdl]) - clf_pipelineModel = bert_clf_pipeline.fit(train_df) - preds = clf_pipelineModel.transform(test_df) + fit_clf_pipeline = clf_pipeline.fit(train_df) + preds = fit_clf_pipeline.transform(test_df) logging.info(preds.select('category', 'text', 'label.result').show(10, truncate=80)) - preds_df = preds.select('category', 'text', 'label.result').toPandas() - - # The result is an array since in Spark NLP you can have multiple sentences. 
- # Let's explode the array and get the item(s) inside of result column out preds_df['result'] = preds_df['result'].apply(lambda x: x[0]) + logging.info(classification_report(preds_df['category'], preds_df['result'])) - print(classification_report(preds_df['category'], preds_df['result'])) + logging.info("Retagging using the model...") + fit_clf_pipeline.stages[-1].write().overwrite().save('clf_tmp') + fit_clf_model = nlp.ClassifierDLModel.load('clf_tmp') + + pred_pipeline = nlp.Pipeline(stages=[document_assembler, + embeddings, + fit_clf_model]) + pred_df = spark.createDataFrame([['']]).toDF("text") + fit_pred_pipeline = pred_pipeline.fit(pred_df) + fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) + for i, elem in enumerate(dset["abstractText"]): + result = fit_pred_lightpipeline.annotate(elem) + print(result) + print(f"Tagged: {result['label']==tag} Expected: {tag in dset['meshMajor']}") From 8db1322e2565f600829370975a13ed6c141b8f04 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 16:37:13 +0100 Subject: [PATCH 020/102] Prototypes retagging after training --- grants_tagger_light/retagging/retagging.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 41518cdd..f6fd12f3 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -126,10 +126,9 @@ def retag( pred_df = spark.createDataFrame([['']]).toDF("text") fit_pred_pipeline = pred_pipeline.fit(pred_df) fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) - for i, elem in enumerate(dset["abstractText"]): - result = fit_pred_lightpipeline.annotate(elem) - print(result) - print(f"Tagged: {result['label']==tag} Expected: {tag in dset['meshMajor']}") + for text, old_tags in enumerate(dset["abstractText"], dset["meshMajor"]): + result = fit_pred_lightpipeline.annotate(text) + print(f"New tag: {result['label'][0]==tag} Old tag: {tag in old_tags}") From 5440a9963538a494a6222b357226ed6be41e6615 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 16:38:33 +0100 Subject: [PATCH 021/102] Prototypes retagging after training --- grants_tagger_light/retagging/retagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index f6fd12f3..3e3b4b4e 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -126,7 +126,7 @@ def retag( pred_df = spark.createDataFrame([['']]).toDF("text") fit_pred_pipeline = pred_pipeline.fit(pred_df) fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) - for text, old_tags in enumerate(dset["abstractText"], dset["meshMajor"]): + for text, old_tags in zip(dset["abstractText"], dset["meshMajor"]): result = fit_pred_lightpipeline.annotate(text) print(f"New tag: {result['label'][0]==tag} Old tag: {tag in old_tags}") From 98cc4e3329eb4a30dc66abf11146a45d99f632bf Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 17:03:11 +0100 Subject: [PATCH 022/102] Saves corrections --- grants_tagger_light/retagging/retagging.py | 25 ++++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 3e3b4b4e..ec271120 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -1,3 +1,4 @@ +import json import logging import random @@ -55,13 +56,13 @@ def retag( for tag in tags: logging.info(f"Retagging: {tag}") - logging.info(f"Obtaining positive examples for {tag}...") + logging.info(f"- Obtaining positive examples for {tag}...") positive_dset = dset.filter( lambda x: tag in x["meshMajor"], num_proc=num_proc ) pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=100, split=0.8) - logging.info(f"Obtaining negative examples for {tag}...") + logging.info(f"- Obtaining negative examples for {tag}...") negative_dset = 
dset.filter( lambda x: tag not in x["meshMajor"], num_proc=num_proc ) @@ -116,7 +117,7 @@ def retag( preds_df['result'] = preds_df['result'].apply(lambda x: x[0]) logging.info(classification_report(preds_df['category'], preds_df['result'])) - logging.info("Retagging using the model...") + logging.info("- Loading the model for prediction...") fit_clf_pipeline.stages[-1].write().overwrite().save('clf_tmp') fit_clf_model = nlp.ClassifierDLModel.load('clf_tmp') @@ -126,10 +127,20 @@ def retag( pred_df = spark.createDataFrame([['']]).toDF("text") fit_pred_pipeline = pred_pipeline.fit(pred_df) fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) - for text, old_tags in zip(dset["abstractText"], dset["meshMajor"]): - result = fit_pred_lightpipeline.annotate(text) - print(f"New tag: {result['label'][0]==tag} Old tag: {tag in old_tags}") - + logging.info(f"- Retagging {tag}...") + with open(save_to_path, 'a') as f: + for i, text in dset["abstractText"]: + result = fit_pred_lightpipeline.annotate(text) + before = tag in dset['meshMajor'][i] + after = result['label'][0] == tag + if before != after: + logging.info("- Corrected!") + row = dset[i] + if after is True: + row['meshMajor'].append(tag) + else: + row['meshMajor'].remove(tag) + json.dump(dset[i], f) @retag_app.command() From 35652e0539fad6499a091f32e4ffc11603adb0c3 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 17:10:46 +0100 Subject: [PATCH 023/102] Saves corrections --- grants_tagger_light/retagging/retagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index ec271120..8b405b0d 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -129,7 +129,7 @@ def retag( fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) logging.info(f"- Retagging {tag}...") with open(save_to_path, 'a') as f: - for i, text in dset["abstractText"]: + for i, text in enumerate(dset["abstractText"]): result = fit_pred_lightpipeline.annotate(text) before = tag in dset['meshMajor'][i] after = result['label'][0] == tag From 7a2b0edc46efdc103e97e3c2eda7d1bb4328f916 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 17:36:07 +0100 Subject: [PATCH 024/102] Saves corrections --- grants_tagger_light/retagging/retagging.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 8b405b0d..9dc0e9aa 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -133,6 +133,7 @@ def retag( result = fit_pred_lightpipeline.annotate(text) before = tag in dset['meshMajor'][i] after = result['label'][0] == tag + print(f"- Before: {before} After: {after}") if before != after: logging.info("- Corrected!") row = dset[i] @@ -140,7 +141,7 @@ def retag( row['meshMajor'].append(tag) else: row['meshMajor'].remove(tag) - json.dump(dset[i], f) + json.dump(row, f) @retag_app.command() From 2a64c86e2fc1a032ceb8475c202958876c7593d4 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 17:49:49 +0100 Subject: [PATCH 025/102] Saves corrections --- grants_tagger_light/retagging/retagging.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 9dc0e9aa..d910bc11 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -129,19 +129,21 @@ def retag( fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) logging.info(f"- Retagging {tag}...") with open(save_to_path, 'a') as f: - for i, text in enumerate(dset["abstractText"]): + counter = 0 + for text, old_tags in zip(dset["abstractText"], dset["meshMajor"]): result = fit_pred_lightpipeline.annotate(text) - before = tag in dset['meshMajor'][i] + before = tag in old_tags after = result['label'][0] == tag print(f"- Before: {before} After: {after}") if before != after: logging.info("- Corrected!") - row = dset[i] + row = dset[counter] if after is True: row['meshMajor'].append(tag) else: row['meshMajor'].remove(tag) json.dump(row, f) + counter += 1 @retag_app.command() From 47690b0d0fc62e3bb9423c1aad198bb19477d11d Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 18:01:23 +0100 Subject: [PATCH 026/102] Saves corrections --- grants_tagger_light/retagging/retagging.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index d910bc11..e759a693 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -134,15 +134,17 @@ def retag( result = fit_pred_lightpipeline.annotate(text) before = tag in old_tags after = result['label'][0] == tag - print(f"- Before: {before} After: {after}") if before != after: logging.info("- Corrected!") row = dset[counter] if after is True: row['meshMajor'].append(tag) + row['correction'] = f"+{tag}" else: row['meshMajor'].remove(tag) + row['correction'] = f"-{tag}" json.dump(row, f) + f.flush() counter += 1 From d7dd1cb80d97310866684a173f5dd639dc6bac30 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 8 Sep 2023 18:16:07 +0100 Subject: [PATCH 027/102] Saves corrections --- grants_tagger_light/retagging/retagging.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index e759a693..7f053b31 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -135,7 +135,6 @@ def retag( before = tag in old_tags after = result['label'][0] == tag if before != after: - logging.info("- Corrected!") row = dset[counter] if after is True: row['meshMajor'].append(tag) @@ -143,7 +142,9 @@ def retag( else: row['meshMajor'].remove(tag) row['correction'] = f"-{tag}" + logging.info(f"- Corrected: {row['correction']}") json.dump(row, f) + f.write("\n") f.flush() counter += 1 From 2951428f17f774a999e6aedf5b258a0e3c477f30 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Fri, 8 Sep 2023 18:36:50 +0100 Subject: [PATCH 028/102] 500 rows for better accuracy --- grants_tagger_light/retagging/retagging.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 7f053b31..b94a8736 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -60,13 +60,13 @@ def retag( positive_dset = dset.filter( lambda x: tag in x["meshMajor"], num_proc=num_proc ) - pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=100, split=0.8) + pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=250, split=0.8) logging.info(f"- Obtaining negative examples for {tag}...") negative_dset = dset.filter( lambda x: tag not in x["meshMajor"], num_proc=num_proc ) - neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=100, split=0.8) + neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=250, split=0.8) train_data = [(x, tag) for x in pos_x_train] train_data.extend([(x, 'other') for x in neg_x_train]) @@ -100,7 +100,7 @@ def retag( .setInputCols(["sentence_embeddings"]) \ .setOutputCol("label") \ .setLabelColumn("category") \ - .setMaxEpochs(10) \ + .setMaxEpochs(25) \ .setLr(0.001) \ .setBatchSize(1) \ .setEnableOutputLogs(True) From 42b3b824f4f6bf322d1ee994c9b585c98eaf339e Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Sat, 9 Sep 2023 12:42:47 +0100 Subject: [PATCH 029/102] Adds batching and refactors prediction --- grants_tagger_light/retagging/retagging.py | 84 +++++++++++++++------- 1 file changed, 57 insertions(+), 27 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index b94a8736..31350fde 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -30,14 +30,46 @@ def _load_data(dset: list[str], limit=100, split=0.8): return train_dset, test_dset +def _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset_row): + with open(save_to_path, 'a') as f: + # result = fit_pred_lightpipeline.fullAnnotate(text) + batch_texts = [x for x in current_batch[0]] + batch_tags = [x for x in current_batch[1]] + result = lightpipeline.fullAnnotate(batch_texts) + for r in range(len(result)): + prediction = result[r]['label'][0].result + prediction_confidence = float(result[r]['label'][0].metadata[tag]) + prediction_old_tags = batch_tags[r] + + if prediction_confidence < threshold: + continue + + before = tag in prediction_old_tags + after = prediction == tag + + if before != after: + if 'correction' not in dset_row: + dset_row['correction'] = [] + if after is True: + dset_row['meshMajor'].append(tag) + dset_row['correction'].append(f"+{tag}") + else: + dset_row['meshMajor'].remove(tag) + dset_row['correction'].append(f"-{tag}") + logging.info(f"- Corrected: {row['correction']}") + json.dump(dset_row, f) + f.write("\n") + f.flush() + + def retag( data_path: str, save_to_path: str, model_key: str = "gpt-3.5-turbo", num_proc: int = os.cpu_count(), batch_size: int = 64, - concurrent_calls: int = os.cpu_count() * 2, tags_file_path: str = None, + threshold: float = 0.8 ): if model_key.strip().lower() not in ["gpt-3.5-turbo", "text-davinci", "gpt-4"]: raise NotImplementedError( @@ -96,13 +128,14 @@ def retag( .setInputCols(["document"]) \ 
.setOutputCol("sentence_embeddings") \ + # I'm limiting the batch size to 8 since there are not many examples and big batch sizes will decrease accuracy classifierdl = nlp.ClassifierDLApproach() \ .setInputCols(["sentence_embeddings"]) \ .setOutputCol("label") \ .setLabelColumn("category") \ .setMaxEpochs(25) \ .setLr(0.001) \ - .setBatchSize(1) \ + .setBatchSize(max(batch_size, 8)) \ .setEnableOutputLogs(True) # .setOutputLogsPath('logs') @@ -128,25 +161,23 @@ def retag( fit_pred_pipeline = pred_pipeline.fit(pred_df) fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) logging.info(f"- Retagging {tag}...") - with open(save_to_path, 'a') as f: - counter = 0 - for text, old_tags in zip(dset["abstractText"], dset["meshMajor"]): - result = fit_pred_lightpipeline.annotate(text) - before = tag in old_tags - after = result['label'][0] == tag - if before != after: - row = dset[counter] - if after is True: - row['meshMajor'].append(tag) - row['correction'] = f"+{tag}" - else: - row['meshMajor'].remove(tag) - row['correction'] = f"-{tag}" - logging.info(f"- Corrected: {row['correction']}") - json.dump(row, f) - f.write("\n") - f.flush() - counter += 1 + + counter = 0 + current_batch = [] + for text, old_tags in zip(dset["abstractText"], dset["meshMajor"]): + if len(current_batch) < batch_size: + current_batch.append((text, old_tags)) + continue + else: + _process_prediction_batch(save_to_path, current_batch, fit_pred_lightpipeline, threshold, tag, + dset[counter]) + current_batch = [] + counter += 1 + + # Remaining + if len(current_batch) > 0: + _process_prediction_batch(save_to_path, current_batch, fit_pred_lightpipeline, threshold, tag, + dset[counter]) @retag_app.command() @@ -165,16 +196,15 @@ def retag_cli( batch_size: int = typer.Option( 64, help="Preprocessing batch size (for dataset, filter, map, ...)" ), - concurrent_calls: int = typer.Option( - os.cpu_count() * 2, - min=1, - help="Concurrent calls with 1 tag each to the different model", - ), 
tags_file_path: str = typer.Option( None, help="Text file containing one line per tag to be considered. " "The rest will be discarded.", ), + threshold: float = typer.Option( + 0.8, + help="Minimum threshold of confidence to retag a model. Default: 0.8" + ) ): if not data_path.endswith("jsonl"): logger.error( @@ -195,6 +225,6 @@ def retag_cli( model_key=model_key, num_proc=num_proc, batch_size=batch_size, - concurrent_calls=concurrent_calls, tags_file_path=tags_file_path, + threshold=threshold ) From c94a5a2b32c1f7a3c139a440ce30fcf5a28ddadf Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Sat, 9 Sep 2023 12:44:39 +0100 Subject: [PATCH 030/102] Adds batching and refactors prediction --- grants_tagger_light/retagging/retagging.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 31350fde..af181713 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -7,13 +7,12 @@ from datasets import load_dataset - from johnsnowlabs import nlp import os from sklearn.metrics import classification_report -from pyspark.sql.functions import * +from pyspark.sql.functions import col spark = nlp.start() @@ -56,7 +55,7 @@ def _process_prediction_batch(save_to_path, current_batch, lightpipeline, thresh else: dset_row['meshMajor'].remove(tag) dset_row['correction'].append(f"-{tag}") - logging.info(f"- Corrected: {row['correction']}") + logging.info(f"- Corrected: {dset_row['correction']}") json.dump(dset_row, f) f.write("\n") f.flush() @@ -110,15 +109,15 @@ def retag( test_data.extend([(x, 'other') for x in neg_x_test]) test_df = spark.createDataFrame(test_data, columns) - logging.info(train_df.groupBy("category") \ - .count() \ - .orderBy(col("count").desc()) \ - .show()) + logging.info(train_df.groupBy("category") + .count() + .orderBy(col("count").desc()) + .show()) - 
logging.info(train_df.groupBy("category") \ - .count() \ - .orderBy(col("count").desc()) \ - .show()) + logging.info(train_df.groupBy("category") + .count() + .orderBy(col("count").desc()) + .show()) document_assembler = nlp.DocumentAssembler() \ .setInputCol("text") \ From 430cc8acbfe11753ffbbd2cfe66c4b37ebfb8347 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Sat, 9 Sep 2023 12:46:06 +0100 Subject: [PATCH 031/102] Adds batching and refactors prediction --- grants_tagger_light/retagging/retagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index af181713..fd4829cb 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -51,10 +51,10 @@ def _process_prediction_batch(save_to_path, current_batch, lightpipeline, thresh dset_row['correction'] = [] if after is True: dset_row['meshMajor'].append(tag) - dset_row['correction'].append(f"+{tag}") + dset_row['correction'].append({'change': f"+{tag}", 'confidence': prediction_confidence}) else: dset_row['meshMajor'].remove(tag) - dset_row['correction'].append(f"-{tag}") + dset_row['correction'].append({'change': f"-{tag}", 'confidence': prediction_confidence}) logging.info(f"- Corrected: {dset_row['correction']}") json.dump(dset_row, f) f.write("\n") From c93b945f59981b22712806c319f8e3e5259630c6 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Sat, 9 Sep 2023 13:09:12 +0100 Subject: [PATCH 032/102] Adds batching and refactors prediction --- grants_tagger_light/retagging/retagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index fd4829cb..04687106 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -32,8 +32,8 @@ def _load_data(dset: list[str], limit=100, split=0.8): def _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset_row): with open(save_to_path, 'a') as f: # result = fit_pred_lightpipeline.fullAnnotate(text) - batch_texts = [x for x in current_batch[0]] - batch_tags = [x for x in current_batch[1]] + batch_texts = [x[0] for x in current_batch] + batch_tags = [x[1] for x in current_batch] result = lightpipeline.fullAnnotate(batch_texts) for r in range(len(result)): prediction = result[r]['label'][0].result From d8e318b1ff9c609319af8815362280f9ad33af3c Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Sat, 9 Sep 2023 13:22:39 +0100 Subject: [PATCH 033/102] Adds batching and refactors prediction --- grants_tagger_light/retagging/retagging.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 04687106..0ebf7a08 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -29,11 +29,12 @@ def _load_data(dset: list[str], limit=100, split=0.8): return train_dset, test_dset -def _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset_row): +def _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset): with open(save_to_path, 'a') as f: # result = fit_pred_lightpipeline.fullAnnotate(text) batch_texts = [x[0] for x in current_batch] batch_tags = [x[1] for x in current_batch] + batch_row_nums = [x[2] for x in current_batch] result = lightpipeline.fullAnnotate(batch_texts) for r in range(len(result)): prediction = result[r]['label'][0].result @@ -45,7 +46,7 @@ def _process_prediction_batch(save_to_path, current_batch, lightpipeline, thresh before = tag in prediction_old_tags after = prediction == tag - + dset_row = dset[batch_row_nums[r]] if before != after: if 'correction' not in dset_row: dset_row['correction'] = [] @@ -161,17 +162,16 @@ def retag( fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) logging.info(f"- Retagging {tag}...") - counter = 0 + counter = -1 current_batch = [] for text, old_tags in zip(dset["abstractText"], dset["meshMajor"]): + counter += 1 if len(current_batch) < batch_size: - current_batch.append((text, old_tags)) + current_batch.append((text, old_tags, counter)) continue else: - _process_prediction_batch(save_to_path, current_batch, fit_pred_lightpipeline, threshold, tag, - dset[counter]) + _process_prediction_batch(save_to_path, current_batch, fit_pred_lightpipeline, threshold, 
tag, dset) current_batch = [] - counter += 1 # Remaining if len(current_batch) > 0: From 0af5a85bc72f98fa40802dd991aceb8cd2b2007a Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Sat, 9 Sep 2023 13:41:40 +0100 Subject: [PATCH 034/102] Adds batching and refactors prediction --- grants_tagger_light/retagging/retagging.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 0ebf7a08..3c164b01 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -22,6 +22,7 @@ def _load_data(dset: list[str], limit=100, split=0.8): """Load data from the IMDB dataset.""" # Partition off part of the train data for evaluation + limit = min(len(dset), limit) random.Random(42).shuffle(dset) train_size = int(split * limit) train_dset = dset[:train_size] @@ -56,10 +57,10 @@ def _process_prediction_batch(save_to_path, current_batch, lightpipeline, thresh else: dset_row['meshMajor'].remove(tag) dset_row['correction'].append({'change': f"-{tag}", 'confidence': prediction_confidence}) - logging.info(f"- Corrected: {dset_row['correction']}") + # logging.info(f"- Corrected: {dset_row['correction']}") json.dump(dset_row, f) f.write("\n") - f.flush() + f.flush() def retag( @@ -92,6 +93,14 @@ def retag( positive_dset = dset.filter( lambda x: tag in x["meshMajor"], num_proc=num_proc ) + + if len(positive_dset['abstractText']) < 50: + logging.info(f"Skipping {tag}: low examples ({len(positive_dset['abstractText'])}. " + f"Check {save_to_path}.err for more information about skipped tags.") + with open(f"{save_to_path}.err", 'a') as f: + f.write(tag) + continue + pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=250, split=0.8) logging.info(f"- Obtaining negative examples for {tag}...") From e9362852e5a7bd0c2267fd25afcd91ef17375c0f Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Sat, 9 Sep 2023 13:43:39 +0100 Subject: [PATCH 035/102] Adds batching and refactors prediction --- grants_tagger_light/retagging/retagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 3c164b01..4e362ea3 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -31,7 +31,7 @@ def _load_data(dset: list[str], limit=100, split=0.8): def _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset): - with open(save_to_path, 'a') as f: + with open(f"{save_to_path}.{tag}.jsonl", "a") as f: # result = fit_pred_lightpipeline.fullAnnotate(text) batch_texts = [x[0] for x in current_batch] batch_tags = [x[1] for x in current_batch] From 44a790b0fb917c5ff2f7c317e91a3449d5a70e31 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Sat, 9 Sep 2023 19:38:55 +0100 Subject: [PATCH 036/102] Adds some logging, refactors --- grants_tagger_light/retagging/retagging.py | 43 +++++++++------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 4e362ea3..2c92a0d0 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -32,7 +32,6 @@ def _load_data(dset: list[str], limit=100, split=0.8): def _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset): with open(f"{save_to_path}.{tag}.jsonl", "a") as f: - # result = fit_pred_lightpipeline.fullAnnotate(text) batch_texts = [x[0] for x in current_batch] batch_tags = [x[1] for x in current_batch] batch_row_nums = [x[2] for x in current_batch] @@ -66,17 +65,11 @@ def _process_prediction_batch(save_to_path, current_batch, lightpipeline, thresh def retag( data_path: str, save_to_path: str, - model_key: str = "gpt-3.5-turbo", num_proc: int = 
os.cpu_count(), batch_size: int = 64, tags_file_path: str = None, threshold: float = 0.8 ): - if model_key.strip().lower() not in ["gpt-3.5-turbo", "text-davinci", "gpt-4"]: - raise NotImplementedError( - f"{model_key} not implemented as an augmentation framework" - ) - # We only have 1 file, so no sharding is available https://huggingface.co/docs/datasets/loading#multiprocessing logging.info("Loading the MeSH jsonl...") dset = load_dataset("json", data_files=data_path, num_proc=1) @@ -119,15 +112,13 @@ def retag( test_data.extend([(x, 'other') for x in neg_x_test]) test_df = spark.createDataFrame(test_data, columns) - logging.info(train_df.groupBy("category") - .count() - .orderBy(col("count").desc()) - .show()) + train_df.groupBy("category")\ + .count()\ + .orderBy(col("count").desc()) - logging.info(train_df.groupBy("category") - .count() - .orderBy(col("count").desc()) - .show()) + train_df.groupBy("category")\ + .count()\ + .orderBy(col("count").desc()) document_assembler = nlp.DocumentAssembler() \ .setInputCol("text") \ @@ -171,21 +162,28 @@ def retag( fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) logging.info(f"- Retagging {tag}...") - counter = -1 + row_counter = -1 + batch_counter = -1 + + batch_total = len(dset["abstractText"]) + current_batch = [] + for text, old_tags in zip(dset["abstractText"], dset["meshMajor"]): - counter += 1 + row_counter += 1 if len(current_batch) < batch_size: - current_batch.append((text, old_tags, counter)) + current_batch.append((text, old_tags, row_counter)) continue else: + batch_counter += 1 + print(f"Processing batch {batch_counter}/{batch_total}", end="\r", flush=True) + _process_prediction_batch(save_to_path, current_batch, fit_pred_lightpipeline, threshold, tag, dset) current_batch = [] # Remaining if len(current_batch) > 0: - _process_prediction_batch(save_to_path, current_batch, fit_pred_lightpipeline, threshold, tag, - dset[counter]) + _process_prediction_batch(save_to_path, current_batch, 
fit_pred_lightpipeline, threshold, tag, dset) @retag_app.command() @@ -194,10 +192,6 @@ def retag_cli( save_to_path: str = typer.Argument( ..., help="Path where to save the retagged data" ), - model_key: str = typer.Option( - "gpt-3.5-turbo", - help="LLM to use data augmentation. By now, only `openai` is supported", - ), num_proc: int = typer.Option( os.cpu_count(), help="Number of processes to use for data augmentation" ), @@ -230,7 +224,6 @@ def retag_cli( retag( data_path, save_to_path, - model_key=model_key, num_proc=num_proc, batch_size=batch_size, tags_file_path=tags_file_path, From d948f3ad3fb07c42ff7496a2399a55c3513f5d6b Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Sun, 10 Sep 2023 13:03:36 +0100 Subject: [PATCH 037/102] Adds pure spark --- grants_tagger_light/retagging/retagging.py | 172 ++++++++++++--------- 1 file changed, 99 insertions(+), 73 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 2c92a0d0..410a650a 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -6,30 +6,35 @@ from loguru import logger from datasets import load_dataset - +from datasets import Dataset from johnsnowlabs import nlp import os from sklearn.metrics import classification_report -from pyspark.sql.functions import col +from pyspark.sql.functions import col, array_contains spark = nlp.start() retag_app = typer.Typer() -def _load_data(dset: list[str], limit=100, split=0.8): +def _load_data(dset: Dataset, tag, limit=100, split=0.8): """Load data from the IMDB dataset.""" - # Partition off part of the train data for evaluation - limit = min(len(dset), limit) - random.Random(42).shuffle(dset) - train_size = int(split * limit) - train_dset = dset[:train_size] - test_dset = dset[train_size:limit] + min_limit = min(len(dset), limit) + dset = dset.select([x for x in range(limit)]) + # Not in parallel since the data is very small and it's worse to divide 
and conquer + dset.map( + lambda x: {'featured_tag': tag}, + desc=f"Adding featured tag ({tag})", + ) + train_size = int(split * min_limit) + train_dset = dset.select([x for x in range(train_size)]) + test_dset = dset.select([x for x in range(train_size, min_limit)]) return train_dset, test_dset +""" def _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset): with open(f"{save_to_path}.{tag}.jsonl", "a") as f: batch_texts = [x[0] for x in current_batch] @@ -59,7 +64,63 @@ def _process_prediction_batch(save_to_path, current_batch, lightpipeline, thresh # logging.info(f"- Corrected: {dset_row['correction']}") json.dump(dset_row, f) f.write("\n") - f.flush() + f.flush()""" + + +def _create_pipelines(batch_size, train_df, test_df): + """ + This method creates a Spark pipeline (to run on dataframes) and a Spark Lightpipeline (to run on arrays of str) + Lightpipelines are faster in less data. + Args: + batch_size: max size of the batch to train. Since data is small for training, I limit it to 8. 
+ train_df: Spark Dataframe of the train data + test_df: Spark Dataframe of the test data + + Returns: + a tuple of (pipeline, lightpipeline) + """ + document_assembler = nlp.DocumentAssembler() \ + .setInputCol("abstractText") \ + .setOutputCol("document") + + embeddings = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_cased", "en") \ + .setInputCols(["document"]) \ + .setOutputCol("sentence_embeddings") \ + \ + # I'm limiting the batch size to 8 since there are not many examples and big batch sizes will decrease accuracy + classifierdl = nlp.ClassifierDLApproach() \ + .setInputCols(["sentence_embeddings"]) \ + .setOutputCol("label") \ + .setLabelColumn("featured_tag") \ + .setMaxEpochs(25) \ + .setLr(0.001) \ + .setBatchSize(max(batch_size, 8)) \ + .setEnableOutputLogs(True) + # .setOutputLogsPath('logs') + + clf_pipeline = nlp.Pipeline(stages=[document_assembler, + embeddings, + classifierdl]) + + fit_clf_pipeline = clf_pipeline.fit(train_df) + preds = fit_clf_pipeline.transform(test_df) + logging.info(preds.select('category', 'text', 'label.result').show(10, truncate=80)) + preds_df = preds.select('category', 'text', 'label.result').toPandas() + preds_df['result'] = preds_df['result'].apply(lambda x: x[0]) + logging.info(classification_report(preds_df['category'], preds_df['result'])) + + logging.info("- Loading the model for prediction...") + fit_clf_pipeline.stages[-1].write().overwrite().save('clf_tmp') + fit_clf_model = nlp.ClassifierDLModel.load('clf_tmp') + + pred_pipeline = nlp.Pipeline(stages=[document_assembler, + embeddings, + fit_clf_model]) + pred_df = spark.createDataFrame([['']]).toDF("text") + fit_pred_pipeline = pred_pipeline.fit(pred_df) + fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) + + return fit_pred_pipeline, fit_pred_lightpipeline def retag( @@ -94,81 +155,45 @@ def retag( f.write(tag) continue - pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=250, split=0.8) + pos_x_train, pos_x_test = 
_load_data(positive_dset, tag, limit=250, split=0.8) - logging.info(f"- Obtaining negative examples for {tag}...") + logging.info(f"- Obtaining negative examples ('other') for {tag}...") negative_dset = dset.filter( lambda x: tag not in x["meshMajor"], num_proc=num_proc ) - neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=250, split=0.8) + neg_x_train, neg_x_test = _load_data(negative_dset, "other", limit=250, split=0.8) - train_data = [(x, tag) for x in pos_x_train] - train_data.extend([(x, 'other') for x in neg_x_train]) + train_df = spark.createDataFrame(pos_x_train) + train_df = train_df.union(neg_x_train) - columns = ["text", "category"] - train_df = spark.createDataFrame(train_data, columns) + test_df = spark.createDataFrame(pos_x_test) + test_df = test_df.union(neg_x_test) - test_data = [(x, tag) for x in pos_x_test] - test_data.extend([(x, 'other') for x in neg_x_test]) - test_df = spark.createDataFrame(test_data, columns) - - train_df.groupBy("category")\ - .count()\ + train_df.groupBy("featured_tag") \ + .count() \ .orderBy(col("count").desc()) - train_df.groupBy("category")\ - .count()\ + test_df.groupBy("featured_tag") \ + .count() \ .orderBy(col("count").desc()) - document_assembler = nlp.DocumentAssembler() \ - .setInputCol("text") \ - .setOutputCol("document") - - embeddings = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_cased", "en") \ - .setInputCols(["document"]) \ - .setOutputCol("sentence_embeddings") \ + pipeline, lightpipeline = _create_pipelines(batch_size, train_df, test_df) - # I'm limiting the batch size to 8 since there are not many examples and big batch sizes will decrease accuracy - classifierdl = nlp.ClassifierDLApproach() \ - .setInputCols(["sentence_embeddings"]) \ - .setOutputCol("label") \ - .setLabelColumn("category") \ - .setMaxEpochs(25) \ - .setLr(0.001) \ - .setBatchSize(max(batch_size, 8)) \ - .setEnableOutputLogs(True) - # .setOutputLogsPath('logs') - - clf_pipeline = 
nlp.Pipeline(stages=[document_assembler, - embeddings, - classifierdl]) - - fit_clf_pipeline = clf_pipeline.fit(train_df) - preds = fit_clf_pipeline.transform(test_df) - logging.info(preds.select('category', 'text', 'label.result').show(10, truncate=80)) - preds_df = preds.select('category', 'text', 'label.result').toPandas() - preds_df['result'] = preds_df['result'].apply(lambda x: x[0]) - logging.info(classification_report(preds_df['category'], preds_df['result'])) - - logging.info("- Loading the model for prediction...") - fit_clf_pipeline.stages[-1].write().overwrite().save('clf_tmp') - fit_clf_model = nlp.ClassifierDLModel.load('clf_tmp') - - pred_pipeline = nlp.Pipeline(stages=[document_assembler, - embeddings, - fit_clf_model]) - pred_df = spark.createDataFrame([['']]).toDF("text") - fit_pred_pipeline = pred_pipeline.fit(pred_df) - fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) logging.info(f"- Retagging {tag}...") - row_counter = -1 - batch_counter = -1 - - batch_total = len(dset["abstractText"]) + # This is the most performant way to do it in Spark: + # 1) You predict using transform. It leverages all the nodes. 
+ # 2) We filter on the fly - we only want rows predicted as {tag} but with that tag NOT (~) included in meshMajor + # 3) We save on the fly too + pipeline.transform(dset).\ + filter(~array_contains(col('meshMajor'), col('featured_tag'))).\ + write.mode('overwrite').save(f"{save_to_path}.{tag}") - current_batch = [] + # batch_counter = -1 + # batch_total = len(dset["abstractText"]) + # current_batch = [] + """ for text, old_tags in zip(dset["abstractText"], dset["meshMajor"]): row_counter += 1 if len(current_batch) < batch_size: @@ -178,13 +203,14 @@ def retag( batch_counter += 1 print(f"Processing batch {batch_counter}/{batch_total}", end="\r", flush=True) - _process_prediction_batch(save_to_path, current_batch, fit_pred_lightpipeline, threshold, tag, dset) + _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset) current_batch = [] - + + # Remaining if len(current_batch) > 0: - _process_prediction_batch(save_to_path, current_batch, fit_pred_lightpipeline, threshold, tag, dset) - + _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset) + """ @retag_app.command() def retag_cli( From 5850b2209eb4a6821aeceb5a9f2e4430e1fb143f Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Mon, 11 Sep 2023 12:52:30 +0100 Subject: [PATCH 038/102] Adds pure spark --- grants_tagger_light/retagging/retagging.py | 209 ++++++++++++--------- 1 file changed, 121 insertions(+), 88 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 410a650a..d1c9f165 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -1,20 +1,21 @@ import json import logging import random +import time import typer from loguru import logger -from datasets import load_dataset -from datasets import Dataset +from datasets import Dataset, load_dataset, concatenate_datasets from johnsnowlabs import nlp import os from sklearn.metrics import classification_report -from pyspark.sql.functions import col, array_contains +from pyspark.sql.functions import col, lit, array_contains spark = nlp.start() +spark.sparkContext.setLogLevel("OFF") retag_app = typer.Typer() @@ -34,39 +35,6 @@ def _load_data(dset: Dataset, tag, limit=100, split=0.8): return train_dset, test_dset -""" -def _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset): - with open(f"{save_to_path}.{tag}.jsonl", "a") as f: - batch_texts = [x[0] for x in current_batch] - batch_tags = [x[1] for x in current_batch] - batch_row_nums = [x[2] for x in current_batch] - result = lightpipeline.fullAnnotate(batch_texts) - for r in range(len(result)): - prediction = result[r]['label'][0].result - prediction_confidence = float(result[r]['label'][0].metadata[tag]) - prediction_old_tags = batch_tags[r] - - if prediction_confidence < threshold: - continue - - before = tag in prediction_old_tags - after = prediction == tag - dset_row = dset[batch_row_nums[r]] - if before != after: - if 'correction' not in dset_row: - dset_row['correction'] = [] - if after is True: - dset_row['meshMajor'].append(tag) - dset_row['correction'].append({'change': f"+{tag}", 'confidence': 
prediction_confidence}) - else: - dset_row['meshMajor'].remove(tag) - dset_row['correction'].append({'change': f"-{tag}", 'confidence': prediction_confidence}) - # logging.info(f"- Corrected: {dset_row['correction']}") - json.dump(dset_row, f) - f.write("\n") - f.flush()""" - - def _create_pipelines(batch_size, train_df, test_df): """ This method creates a Spark pipeline (to run on dataframes) and a Spark Lightpipeline (to run on arrays of str) @@ -83,11 +51,12 @@ def _create_pipelines(batch_size, train_df, test_df): .setInputCol("abstractText") \ .setOutputCol("document") - embeddings = nlp.BertSentenceEmbeddings.pretrained("sent_bert_base_cased", "en") \ + # Biobert Sentence Embeddings (clinical) + embeddings = nlp.BertSentenceEmbeddings.pretrained("sent_biobert_clinical_base_cased", "en") \ .setInputCols(["document"]) \ - .setOutputCol("sentence_embeddings") \ - \ - # I'm limiting the batch size to 8 since there are not many examples and big batch sizes will decrease accuracy + .setOutputCol("sentence_embeddings") + + # I'm limiting the batch size to 8 since there are not many examples and big batch sizes will decrease accuracy classifierdl = nlp.ClassifierDLApproach() \ .setInputCols(["sentence_embeddings"]) \ .setOutputCol("label") \ @@ -104,10 +73,9 @@ def _create_pipelines(batch_size, train_df, test_df): fit_clf_pipeline = clf_pipeline.fit(train_df) preds = fit_clf_pipeline.transform(test_df) - logging.info(preds.select('category', 'text', 'label.result').show(10, truncate=80)) - preds_df = preds.select('category', 'text', 'label.result').toPandas() + preds_df = preds.select('featured_tag', 'abstractText', 'label.result').toPandas() preds_df['result'] = preds_df['result'].apply(lambda x: x[0]) - logging.info(classification_report(preds_df['category'], preds_df['result'])) + logging.info(classification_report(preds_df['featured_tag'], preds_df['result'])) logging.info("- Loading the model for prediction...") 
fit_clf_pipeline.stages[-1].write().overwrite().save('clf_tmp') @@ -123,13 +91,71 @@ def _create_pipelines(batch_size, train_df, test_df): return fit_pred_pipeline, fit_pred_lightpipeline +def _annotate(save_to_path, dset, tag, limit, is_positive): + human_supervision = {} + curation_file = f"{save_to_path}.{tag}.curation.json" + if os.path.isfile(curation_file): + with open(curation_file, 'r') as f: + human_supervision = json.load(f) + prompt = f"File `{curation_file}` found. Do you want to reuse previous work? [y|n]: " + answer = input(prompt) + while answer not in ['y', 'n']: + answer = input(prompt) + if answer == 'n': + human_supervision[tag][is_positive] = [] + + if tag not in human_supervision: + human_supervision[tag] = {'positive': [], 'negative': []} + + field = 'positive' if is_positive else 'negative' + count = len(human_supervision[tag][field]) + logging.info(f"[{tag}] Annotated: {count} Required: {limit} Available: {len(dset) - count}") + finished = False + while count <= limit: + tries = 0 + random.seed(time.time()) + random_pos_row = random.randint(0, len(dset)) + id_ = dset[random_pos_row]['pmid'] + while id_ in [x['pmid'] for x in human_supervision[tag][field]]: + random_pos_row = random.randint(0, len(dset)) + id_ = dset[random_pos_row]['pmid'] + tries += 1 + if tries >= 10: + logger.error(f"Unable to find more examples for {field} {tag} which are not already tagged. " + f"Continuing with {count} examples...") + finished = True + break + if finished: + break + print("="*50) + print(dset[random_pos_row]['abstractText']) + print("=" * 50) + res = input(f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""} a `{tag}` text? 
' + f'[a to accept]: ') + if res == 'a': + human_supervision[tag][field].append(dset[random_pos_row]) + with open(curation_file, 'w') as f: + json.dump(human_supervision, f) + count = len(human_supervision[tag][field]) + + +def _curate(save_to_path, pos_dset, neg_dset, tag, limit): + logging.info("- Curating positive examples") + _annotate(save_to_path, pos_dset, tag, limit, is_positive=True) + + logging.info("- Curating negative examples") + _annotate(save_to_path, neg_dset, tag, limit, is_positive=False) + + def retag( data_path: str, save_to_path: str, num_proc: int = os.cpu_count(), batch_size: int = 64, tags_file_path: str = None, - threshold: float = 0.8 + threshold: float = 0.8, + train_examples: int = 100, + supervised: bool = True, ): # We only have 1 file, so no sharding is available https://huggingface.co/docs/datasets/loading#multiprocessing logging.info("Loading the MeSH jsonl...") @@ -155,62 +181,57 @@ def retag( f.write(tag) continue - pos_x_train, pos_x_test = _load_data(positive_dset, tag, limit=250, split=0.8) - logging.info(f"- Obtaining negative examples ('other') for {tag}...") negative_dset = dset.filter( lambda x: tag not in x["meshMajor"], num_proc=num_proc ) - neg_x_train, neg_x_test = _load_data(negative_dset, "other", limit=250, split=0.8) - train_df = spark.createDataFrame(pos_x_train) - train_df = train_df.union(neg_x_train) + if supervised: + logging.info(f"- Curating data...") + _curate(save_to_path, positive_dset, negative_dset, tag, train_examples) - test_df = spark.createDataFrame(pos_x_test) - test_df = test_df.union(neg_x_test) + curation_file = f"{save_to_path}.{tag}.curation.json" + if os.path.isfile(curation_file): + with open(curation_file, "r") as fr: + # I load the curated data file + human_supervision = json.load(fr) + positive_dset = Dataset.from_list(human_supervision[tag]['positive']) + negative_dset = Dataset.from_list(human_supervision[tag]['negative']) - train_df.groupBy("featured_tag") \ - .count() \ - 
.orderBy(col("count").desc()) + pos_x_train, pos_x_test = _load_data(positive_dset, tag, limit=train_examples, split=0.8) + neg_x_train, neg_x_test = _load_data(negative_dset, "other", limit=train_examples, split=0.8) - test_df.groupBy("featured_tag") \ - .count() \ - .orderBy(col("count").desc()) + pos_x_train = pos_x_train.add_column("featured_tag", [tag] * len(pos_x_train)) + pos_x_test = pos_x_test.add_column("featured_tag", [tag] * len(pos_x_test)) + neg_x_train = neg_x_train.add_column("featured_tag", ["other"] * len(neg_x_train)) + neg_x_test = neg_x_test.add_column("featured_tag", ["other"] * len(neg_x_test)) + logging.info(f"- Creating train/test sets...") + train = concatenate_datasets([pos_x_train, neg_x_train]) + train_df = spark.createDataFrame(train) + test = concatenate_datasets([pos_x_test, neg_x_test]) + test_df = spark.createDataFrame(test) + + logging.info(f"- Train dataset size: {train_df.count()}") + logging.info(f"- Test dataset size: {test_df.count()}") + + logging.info(f"- Creating `sparknlp` pipelines...") pipeline, lightpipeline = _create_pipelines(batch_size, train_df, test_df) logging.info(f"- Retagging {tag}...") # This is the most performant way to do it in Spark: # 1) You predict using transform. It leverages all the nodes. 
- # 2) We filter on the fly - we only want rows predicted as {tag} but with that tag NOT (~) included in meshMajor - # 3) We save on the fly too - pipeline.transform(dset).\ - filter(~array_contains(col('meshMajor'), col('featured_tag'))).\ - write.mode('overwrite').save(f"{save_to_path}.{tag}") - - # batch_counter = -1 - # batch_total = len(dset["abstractText"]) - - # current_batch = [] - """ - for text, old_tags in zip(dset["abstractText"], dset["meshMajor"]): - row_counter += 1 - if len(current_batch) < batch_size: - current_batch.append((text, old_tags, row_counter)) - continue - else: - batch_counter += 1 - print(f"Processing batch {batch_counter}/{batch_total}", end="\r", flush=True) - - _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset) - current_batch = [] - - - # Remaining - if len(current_batch) > 0: - _process_prediction_batch(save_to_path, current_batch, lightpipeline, threshold, tag, dset) - """ + # 2) Instead of returning the result, we save to disk and then load. + pipeline.transform(spark.createDataFrame(dset)). \ + write.mode('overwrite').save(f"{save_to_path}.{tag}.prediction") + + # 1) We load + # 2) We filter to get those results where the predicted tag was not initially in meshMajor + # 3) We filter by confidence > threshold + # predictions = spark.read.load(f"{save_to_path}.{tag}.prediction").\ + # filter(~array_contains(col('meshMajor'), tag)).\ + @retag_app.command() def retag_cli( @@ -230,8 +251,18 @@ def retag_cli( "The rest will be discarded.", ), threshold: float = typer.Option( - 0.8, - help="Minimum threshold of confidence to retag a model. Default: 0.8" + 0.9, + help="Minimum threshold of confidence to retag a model. 
Default: 0.9" + ), + train_examples: int = typer.Option( + 100, + help="Number of examples to use for training the retaggers" + ), + supervised: bool = typer.Option( + True, + help="Use human curation, showing a `limit` amount of positive and negative examples to curate data" + " for training the retaggers. The user will be required to accept or reject. When the limit is reached," + " the model will be train. All intermediary steps will be saved." ) ): if not data_path.endswith("jsonl"): @@ -253,5 +284,7 @@ def retag_cli( num_proc=num_proc, batch_size=batch_size, tags_file_path=tags_file_path, - threshold=threshold + threshold=threshold, + train_examples=train_examples, + supervised=supervised ) From f884a2516af5d8856195790d06787fa8012e9f17 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Mon, 11 Sep 2023 13:18:25 +0100 Subject: [PATCH 039/102] Adds spark.repartition --- grants_tagger_light/retagging/retagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index d1c9f165..1d96f9d0 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -223,7 +223,7 @@ def retag( # This is the most performant way to do it in Spark: # 1) You predict using transform. It leverages all the nodes. # 2) Instead of returning the result, we save to disk and then load. - pipeline.transform(spark.createDataFrame(dset)). \ + pipeline.transform(spark.createDataFrame(dset).repartition(num_proc)). \ write.mode('overwrite').save(f"{save_to_path}.{tag}.prediction") # 1) We load From 7ebabba8c742767394134fd1d625639e4493ef02 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Mon, 11 Sep 2023 13:55:20 +0100 Subject: [PATCH 040/102] Adds spark.repartition --- grants_tagger_light/retagging/retagging.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 1d96f9d0..2c48795d 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -12,9 +12,8 @@ import os from sklearn.metrics import classification_report -from pyspark.sql.functions import col, lit, array_contains -spark = nlp.start() +spark = nlp.start(spark_conf={'spark.executor.memory': '6g'}) spark.sparkContext.setLogLevel("OFF") retag_app = typer.Typer() From e86a3fe52c85e3a084df18874a9b73188092f17e Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Mon, 11 Sep 2023 14:10:57 +0100 Subject: [PATCH 041/102] Adds parquet optimization --- grants_tagger_light/retagging/retagging.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 2c48795d..b8642d4a 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -12,6 +12,7 @@ import os from sklearn.metrics import classification_report +import pyarrow.parquet as pq spark = nlp.start(spark_conf={'spark.executor.memory': '6g'}) spark.sparkContext.setLogLevel("OFF") @@ -217,12 +218,13 @@ def retag( logging.info(f"- Creating `sparknlp` pipelines...") pipeline, lightpipeline = _create_pipelines(batch_size, train_df, test_df) - logging.info(f"- Retagging {tag}...") + logging.info(f"- Optimizing dataframe in parquet...") + data_in_parquet = f"{save_to_path}.data.parquet" + pq.write_table(dset.data.table, data_in_parquet) + sdf = spark.read.load(data_in_parquet) - # This is the most performant way to do it in Spark: - # 1) You predict using transform. It leverages all the nodes. 
- # 2) Instead of returning the result, we save to disk and then load. - pipeline.transform(spark.createDataFrame(dset).repartition(num_proc)). \ + logging.info(f"- Retagging {tag}...") + pipeline.transform(sdf.repartition(num_proc)). \ write.mode('overwrite').save(f"{save_to_path}.{tag}.prediction") # 1) We load From 6083b228f925bb8f65c810768eef1cec8caaf0d9 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Mon, 11 Sep 2023 14:28:03 +0100 Subject: [PATCH 042/102] Adds parquet optimization --- grants_tagger_light/retagging/retagging.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index b8642d4a..078bc92a 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -14,7 +14,9 @@ from sklearn.metrics import classification_report import pyarrow.parquet as pq -spark = nlp.start(spark_conf={'spark.executor.memory': '6g'}) +spark = nlp.start(spark_conf={'spark.executor.memory': '6g', + 'spark.driver.maxResultSize': '6g'} + ) spark.sparkContext.setLogLevel("OFF") retag_app = typer.Typer() @@ -218,14 +220,19 @@ def retag( logging.info(f"- Creating `sparknlp` pipelines...") pipeline, lightpipeline = _create_pipelines(batch_size, train_df, test_df) - logging.info(f"- Optimizing dataframe in parquet...") + logging.info(f"- Optimizing dataframe...") + dset = dset.remove_columns(["title", "journal", "title"]) data_in_parquet = f"{save_to_path}.data.parquet" pq.write_table(dset.data.table, data_in_parquet) + del dset, train, train_df, test, test_df, pos_x_train, pos_x_test, neg_x_train, neg_x_test, positive_dset, \ + negative_dset sdf = spark.read.load(data_in_parquet) + logging.info(f"- Repartitioning...") + sdf = sdf.repartition(num_proc) + logging.info(f"- Retagging {tag}...") - pipeline.transform(sdf.repartition(num_proc)). 
\ - write.mode('overwrite').save(f"{save_to_path}.{tag}.prediction") + pipeline.transform(sdf).write.mode('overwrite').save(f"{save_to_path}.{tag}.prediction") # 1) We load # 2) We filter to get those results where the predicted tag was not initially in meshMajor From 4c3933dbe55148761985b1e56759c0dd75489648 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Mon, 11 Sep 2023 14:31:06 +0100 Subject: [PATCH 043/102] Adds parquet optimization --- grants_tagger_light/retagging/retagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 078bc92a..eaa2dfca 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -221,7 +221,7 @@ def retag( pipeline, lightpipeline = _create_pipelines(batch_size, train_df, test_df) logging.info(f"- Optimizing dataframe...") - dset = dset.remove_columns(["title", "journal", "title"]) + dset = dset.remove_columns(["title", "journal", "year"]) data_in_parquet = f"{save_to_path}.data.parquet" pq.write_table(dset.data.table, data_in_parquet) del dset, train, train_df, test, test_df, pos_x_train, pos_x_test, neg_x_train, neg_x_test, positive_dset, \ From 06b544687adb5c407a19ad9516806452605ef8f3 Mon Sep 17 00:00:00 2001 From: Juan Martinez Date: Mon, 11 Sep 2023 16:23:06 +0000 Subject: [PATCH 044/102] Adds spark context config --- grants_tagger_light/retagging/retagging.py | 6 +- ll.Artificial Intelligence.curation.json | 1 + poetry.lock | 1426 ++++++++++---------- 3 files changed, 719 insertions(+), 714 deletions(-) create mode 100644 ll.Artificial Intelligence.curation.json diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index eaa2dfca..80594b6b 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -14,8 +14,10 @@ from sklearn.metrics import classification_report import 
pyarrow.parquet as pq -spark = nlp.start(spark_conf={'spark.executor.memory': '6g', - 'spark.driver.maxResultSize': '6g'} +spark = nlp.start(spark_conf={'spark.executor.memory': '10g', + 'spark.driver.maxResultSize': '6g', + 'spark.executor.memoryOverhead': '1g', + 'spark.memory.fraction': '0.6'} ) spark.sparkContext.setLogLevel("OFF") diff --git a/ll.Artificial Intelligence.curation.json b/ll.Artificial Intelligence.curation.json new file mode 100644 index 00000000..6b86c3bd --- /dev/null +++ b/ll.Artificial Intelligence.curation.json @@ -0,0 +1 @@ +{"Artificial Intelligence": {"positive": [{"journal": "Zhonghua wei zhong bing ji jiu yi xue", "meshMajor": ["Artificial Intelligence", "Big Data", "Critical Care", "Critical Illness", "Humans"], "year": "2020", "abstractText": "Through the big data intelligent algorithm and application of artificial intelligence in critically ill patients, the value of the combination of clinical real-time warning and artificial intelligence in critical care medicine was explored. Artificial intelligence was used to simulate human thinking by studying, calculating, and analyzing a large amount of critical illness data in the medical work, and integrate a large number of clinical monitoring and treatment data generated in critical care medicine. The necessity, feasibility, relevance, data learning and application architecture of the application of artificial intelligence in the early warning of critical illness in medical work were analyzed, thus to promote the pioneering application of real-time warning of critical illness in clinical medicine. The development of critical care medicine in medical work requires the integration of big data and artificial intelligence. 
Through real-time early warning, accurate and scientific intelligent application of medical data, the life threatening uncertainties in the diagnosis and treatment of critically ill patients can be more effectively reduced and the success rate of the treatment of critically ill patients can be improved. The perfect combination of artificial intelligence technology and big data of critical care medicine can provide a favorable guarantee for the pioneering application of real-time warning of critical care medicine in clinical work.", "pmid": "33198854", "title": "[Artificial intelligence provides promotion of big data in medical work and contribution to people's health as soon as possible: real-time warning of critical illness is the pioneer of artificial intelligence in clinical medicine]."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Algorithms", "Artificial Intelligence", "Brain", "Brain Mapping", "Cognition", "Electroencephalography", "Humans", "Male", "Man-Machine Systems", "Mathematical Concepts", "Problem Solving", "Reaction Time", "Signal Processing, Computer-Assisted", "Time Factors"], "year": "2013", "abstractText": "The association of functional connectivity patterns with particular cognitive tasks has long been a topic of interest in neuroscience, e.g., studies of functional connectivity have demonstrated its potential use for decoding various brain states. However, the high-dimensionality of the pairwise functional connectivity limits its usefulness in some real-time applications. 
In the present study, the methodology of tensor subspace analysis (TSA) is used to reduce the initial high-dimensionality of the pairwise coupling in the original functional connectivity network to a space of condensed descriptive power, which would significantly decrease the computational cost and facilitate the differentiation of brain states. We assess the feasibility of the proposed method on EEG recordings when the subject was performing mental arithmetic task which differ only in the difficulty level (easy: 1-digit addition v.s. 3-digit additions). Two different cortical connective networks were detected, and by comparing the functional connectivity networks in different work states, it was found that the task-difficulty is best reflected in the connectivity structure of sub-graphs extending over parietooccipital sites. Incorporating this data-driven information within original TSA methodology, we succeeded in predicting the difficulty level from connectivity patterns in an efficient way that can be implemented so as to work in real-time. ", "pmid": "24110343", "title": "A tensorial approach to access cognitive workload related to mental arithmetic from EEG functional connectivity estimates."}, {"journal": "Radiology", "meshMajor": ["Adult", "Artificial Intelligence", "Deep Learning", "Humans", "Neural Networks, Computer", "ROC Curve", "Radiography, Thoracic", "Retrospective Studies", "Sensitivity and Specificity", "Triage"], "year": "2019", "abstractText": "Purpose To develop and test an artificial intelligence (AI) system, based on deep convolutional neural networks (CNNs), for automated real-time triaging of adult chest radiographs on the basis of the urgency of imaging appearances. Materials and Methods An AI system was developed by using 470 388 fully anonymized institutional adult chest radiographs acquired from 2007 to 2017. 
The free-text radiology reports were preprocessed by using an in-house natural language processing (NLP) system modeling radiologic language. The NLP system analyzed the free-text report to prioritize each radiograph as critical, urgent, nonurgent, or normal. An AI system for computer vision using an ensemble of two deep CNNs was then trained by using labeled radiographs to predict the clinical priority from radiologic appearances only. The system's performance in radiograph prioritization was tested in a simulation by using an independent set of 15 887 radiographs. Prediction performance was assessed with the area under the receiver operating characteristic curve; sensitivity, specificity, positive predictive value (PPV), and negative predictive value (NPV) were also determined. Nonparametric testing of the improvement in time to final report was determined at a nominal significance level of 5%. Results Normal chest radiographs were detected by our AI system with a sensitivity of 71%, specificity of 95%, PPV of 73%, and NPV of 94%. The average reporting delay was reduced from 11.2 to 2.7 days for critical imaging findings (P < .001) and from 7.6 to 4.1 days for urgent imaging findings (P < .001) in the simulation compared with historical data. Conclusion Automated real-time triaging of adult chest radiographs with use of an artificial intelligence system is feasible, with clinically acceptable performance. \u00a9 RSNA, 2019 Online supplemental material is available for this article. 
See also the editorial by Auffermann in this issue.", "pmid": "30667333", "title": "Automated Triaging of Adult Chest Radiographs with Deep Artificial Neural Networks."}, {"journal": "Amino acids", "meshMajor": ["Amino Acids", "Angiotensin-Converting Enzyme Inhibitors", "Animals", "Anti-Infective Agents", "Antimicrobial Cationic Peptides", "Artificial Intelligence", "Databases, Protein", "Dipeptides", "Humans", "Kinetics", "Models, Chemical", "Pancreatic Elastase", "Peptides", "Protein Conformation", "Quantitative Structure-Activity Relationship", "Software", "Staphylococcus aureus", "Statistics as Topic", "Swine"], "year": "2010", "abstractText": "In this study, structural topology scale (ST-scale) was recruited as a novel structural topological descriptor derived from principal component analysis on 827 structural variables of 167 amino acids. By using partial least squares (PLS), we applied ST-scale for the study of quantitative sequence-activity models (QSAMs) on three peptide datasets (58 angiotensin-converting enzyme (ACE) inhibitors, 34 antimicrobial peptides (AMPs) and 89 elastase substrates (ES)). The results of QSAMs were superior to that of the earlier studies, with determination coefficient (r(2)) and cross-validated (q(2)) equal to 0.855, 0.774; 0.79, 0.371 (OSC-PLS: 0.995, 0.848) and 0.846, 0.747, respectively. 
Therefore, ST-scale descriptors were considered to be competent to extract information from 827 structural variables and relate with their bioactivities.", "pmid": "19373543", "title": "ST-scale as a novel amino acid descriptor and its application in QSAM of peptides and analogues."}, {"journal": "Sensors (Basel, Switzerland)", "meshMajor": ["Architecture", "Artificial Intelligence", "Computer Simulation", "Geographic Information Systems", "Humans", "Image Interpretation, Computer-Assisted", "Maps as Topic", "Models, Biological", "Models, Theoretical", "Monte Carlo Method", "Pattern Recognition, Automated", "Robotics"], "year": "2010", "abstractText": "In this paper we deal with the problem of map building and localization of a mobile robot in an environment using the information provided by an omnidirectional vision sensor that is mounted on the robot. Our main objective consists of studying the feasibility of the techniques based in the global appearance of a set of omnidirectional images captured by this vision sensor to solve this problem. First, we study how to describe globally the visual information so that it represents correctly locations and the geometrical relationships between these locations. Then, we integrate this information using an approach based on a spring-mass-damper model, to create a topological map of the environment. Once the map is built, we propose the use of a Monte Carlo localization approach to estimate the most probable pose of the vision system and its trajectory within the map. We perform a comparison in terms of computational cost and error in localization. 
The experimental results we present have been obtained with real indoor omnidirectional images.", "pmid": "22163538", "title": "Map building and monte carlo localization using global appearance of omnidirectional images."}, {"journal": "The pharmacogenomics journal", "meshMajor": ["Algorithms", "Area Under Curve", "Artificial Intelligence", "Brain Neoplasms", "Color", "Databases, Genetic", "Endpoint Determination", "Gene Expression Profiling", "Humans", "Least-Squares Analysis", "Neuroblastoma", "Oligonucleotide Array Sequence Analysis", "Predictive Value of Tests", "Quality Control", "RNA, Neoplasm", "ROC Curve"], "year": "2010", "abstractText": "Microarray-based prediction of clinical endpoints may be performed using either a one-color approach reflecting mRNA abundance in absolute intensity values or a two-color approach yielding ratios of fluorescent intensities. In this study, as part of the MAQC-II project, we systematically compared the classification performance resulting from one- and two-color gene-expression profiles of 478 neuroblastoma samples. In total, 196 classification models were applied to these measurements to predict four clinical endpoints, and classification performances were compared in terms of accuracy, area under the curve, Matthews correlation coefficient and root mean-squared error. Whereas prediction performance varied with distinct clinical endpoints and classification models, equivalent performance metrics were observed for one- and two-color measurements in both internal and external validation. Furthermore, overlap of selected signature genes correlated inversely with endpoint prediction difficulty. 
In summary, our data strongly substantiate that the choice of platform is not a primary factor for successful gene expression based-prediction of clinical endpoints.", "pmid": "20676065", "title": "Comparison of performance of one-color and two-color gene-expression analyses in predicting clinical endpoints of neuroblastoma patients."}, {"journal": "Artificial intelligence in medicine", "meshMajor": ["Adult", "Affect", "Algorithms", "Artificial Intelligence", "Autonomic Nervous System", "Biosensing Techniques", "Bipolar Disorder", "Clothing", "Decision Support Techniques", "Diagnosis, Computer-Assisted", "Electrocardiography, Ambulatory", "Equipment Design", "Female", "Heart Rate", "Humans", "Male", "Middle Aged", "Models, Statistical", "Monitoring, Ambulatory", "Predictive Value of Tests", "Respiratory Rate", "Severity of Illness Index", "Signal Processing, Computer-Assisted", "Time Factors", "Transducers"], "year": "2013", "abstractText": "BACKGROUND: Bipolar disorders are characterized by a series of both depressive and manic or hypomanic episodes. Although common and expensive to treat, the clinical assessment of bipolar disorder is still ill-defined.OBJECTIVE: In the current literature several correlations between mood disorders and dysfunctions involving the autonomic nervous system (ANS) can be found. The objective of this work is to develop a novel mood recognition system based on a pervasive, wearable and personalized monitoring system using ANS-related biosignals.MATERIALS AND METHODS: The monitoring platform used in this study is the core sensing system of the personalized monitoring systems for care in mental health (PSYCHE) European project. It is comprised of a comfortable sensorized t-shirt that can acquire the inter-beat interval time series, the heart rate, and the respiratory dynamics for long-term monitoring during the day and overnight. 
In this study, three bipolar patients were followed for a period of 90 days during which up to six monitoring sessions and psychophysical evaluations were performed for each patient. Specific signal processing techniques and artificial intelligence algorithms were applied to analyze more than 120 h of data.RESULTS: Experimental results are expressed in terms of confusion matrices and an exhaustive descriptive statistics of the most relevant features is reported as well. A classification accuracy of about 97% is achieved for the intra-subject analysis. Such an accuracy was found in distinguishing relatively good affective balance state (euthymia) from severe clinical states (severe depression and mixed state) and is lower in distinguishing euthymia from the milder states (accuracy up to 88%).CONCLUSIONS: The PSYCHE platform could provide a viable decision support system in order to improve mood assessment in patient care. Evidences about the correlation between mood disorders and ANS dysfunctions were found and the obtained results are promising for an effective biosignal-based mood recognition.", "pmid": "23332576", "title": "Mood recognition in bipolar patients through the PSYCHE platform: preliminary evaluations and perspectives."}, {"journal": "PloS one", "meshMajor": ["Adult", "Aged", "Aged, 80 and over", "Algorithms", "Artificial Intelligence", "Bayes Theorem", "Biomarkers, Tumor", "Case-Control Studies", "Female", "Follow-Up Studies", "Genetic Predisposition to Disease", "Humans", "Inflammation", "Inflammation Mediators", "Male", "Middle Aged", "Polymorphism, Single Nucleotide", "Prognosis", "Risk Factors", "Smoking", "Texas", "Urinary Bladder Neoplasms", "Young Adult"], "year": "2013", "abstractText": "The relationship between inflammation and cancer is well established in several tumor types, including bladder cancer. 
We performed an association study between 886 inflammatory-gene variants and bladder cancer risk in 1,047 cases and 988 controls from the Spanish Bladder Cancer (SBC)/EPICURO Study. A preliminary exploration with the widely used univariate logistic regression approach did not identify any significant SNP after correcting for multiple testing. We further applied two more comprehensive methods to capture the complexity of bladder cancer genetic susceptibility: Bayesian Threshold LASSO (BTL), a regularized regression method, and AUC-Random Forest, a machine-learning algorithm. Both approaches explore the joint effect of markers. BTL analysis identified a signature of 37 SNPs in 34 genes showing an association with bladder cancer. AUC-RF detected an optimal predictive subset of 56 SNPs. 13 SNPs were identified by both methods in the total population. Using resources from the Texas Bladder Cancer study we were able to replicate 30% of the SNPs assessed. The associations between inflammatory SNPs and bladder cancer were reexamined among non-smokers to eliminate the effect of tobacco, one of the strongest and most prevalent environmental risk factor for this tumor. A 9 SNP-signature was detected by BTL. Here we report, for the first time, a set of SNP in inflammatory genes jointly associated with bladder cancer risk. These results highlight the importance of the complex structure of genetic susceptibility associated with cancer risk. 
", "pmid": "24391818", "title": "Application of multi-SNP approaches Bayesian LASSO and AUC-RF to detect main effects of inflammatory-gene variants associated with bladder cancer risk."}, {"journal": "Journal of the American Medical Informatics Association : JAMIA", "meshMajor": ["Adult", "Aged", "Aged, 80 and over", "Artificial Intelligence", "Cardiology", "Computer Simulation", "Diagnosis, Computer-Assisted", "Diagnosis, Differential", "Heart Diseases", "Hospitals, Teaching", "Humans", "Middle Aged", "Models, Cardiovascular", "Predictive Value of Tests", "Prospective Studies", "ROC Curve", "Sensitivity and Specificity"], "year": null, "abstractText": "CONTEXT: The Heart Disease Program (HDP) is a novel computerized diagnosis program incorporating a computer model of cardiovascular physiology. Physicians can enter standard clinical data and receive a differential diagnosis with explanations.OBJECTIVE: To evaluate the diagnostic performance of the HDP and its usability by physicians in a typical clinical setting.DESIGN: A prospective observational study of the HDP in use by physicians in departments of medicine and cardiology of a teaching hospital. Data came from 114 patients with a broad range of cardiac disorders, entered by six physicians.MEASUREMENTS: Sensitivity, specificity, and positive predictive value (PPV). Comprehensiveness: the proportion of final diagnoses suggested by the HDP or physicians for each case.RELEVANCE: the proportion of HDP or physicians' diagnoses that are correct. Area under the receiver operating characteristic (ROC) curve (AUC) for the HDP and the physicians. Performance was compared with a final diagnosis based on follow-up and further investigations.RESULTS: Compared with the final diagnoses, the HDP had a higher sensitivity (53.0% vs. 34.8%) and significantly higher comprehensiveness (57.2% vs. 39.5%, p < 0.0001) than the physicians. Physicians' PPV and relevance (56.2%, 56.0%) were higher than the HDP (25.4%, 28.1%). 
Combining the diagnoses of the physicians and the HDPs, sensitivity was 61.3% and comprehensiveness was 65.7%. These findings were significant in the two collection cohorts and for subanalysis of the most serious diagnoses. The AUCs were similar for the HDP and the physicians.CONCLUSIONS: The heart disease program has the potential to improve the differential diagnoses of physicians in a typical clinical setting.", "pmid": "12668689", "title": "Evaluation of a cardiac diagnostic program in a typical clinical setting."}, {"journal": "Computational intelligence and neuroscience", "meshMajor": ["Algorithms", "Artificial Intelligence", "Coronary Artery Disease", "Databases, Factual", "Diagnosis, Computer-Assisted", "Fuzzy Logic", "Humans", "Sensitivity and Specificity", "Statistics, Nonparametric", "Time Factors"], "year": "2014", "abstractText": "In the past decades, medical data mining has become a popular data mining subject. Researchers have proposed several tools and various methodologies for developing effective medical expert systems. Diagnosing heart diseases is one of the important topics and many researchers have tried to develop intelligent medical expert systems to help the physicians. In this paper, we propose the use of PSO algorithm with a boosting approach to extract rules for recognizing the presence or absence of coronary artery disease in a patient. The weight of training examples that are classified properly by the new rules is reduced by a boosting mechanism. Therefore, in the next rule generation cycle, the focus is on those fuzzy rules that account for the currently misclassified or uncovered instances. We have used coronary artery disease data sets taken from University of California Irvine, (UCI), to evaluate our new classification approach. Results show that the proposed method can detect the coronary artery disease with an acceptable accuracy. Also, the discovered rules have significant interpretability as well. 
", "pmid": "24817883", "title": "Coronary artery disease detection using a fuzzy-boosting PSO approach."}, {"journal": "NeuroImage", "meshMajor": ["Adult", "Algorithms", "Artificial Intelligence", "Brain", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Magnetic Resonance Imaging", "Organ Size", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2007", "abstractText": "An automated algorithm has been developed to segment stripped (non-brain tissue excluded) T1-weighted MRI brain volumes into left and right cerebral hemispheres and cerebellum+brainstem. The algorithm, which uses the Graph Cuts technique, performs a fully automated segmentation in approximately 30 s following pre-processing. It is robust and accurate and has been tested on datasets from two scanners using different field strengths and pulse sequences. We describe the Graph Cuts algorithm and compare the results of Graph Cuts segmentations against \"gold standard\" manual segmentations and segmentations produced by three popular software packages used by neuroimagers: BrainVisa, CLASP, and SurfRelax.", "pmid": "17150376", "title": "Automatic segmentation of left and right cerebral hemispheres from MRI brain volumes using the graph cuts algorithm."}, {"journal": "Molecular pharmaceutics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Deep Learning", "Drug Discovery", "Machine Learning", "Neural Networks, Computer"], "year": "2018", "abstractText": "Artificial Intelligence has advanced at an unprecedented pace, backing recent breakthroughs in natural language processing, speech recognition, and computer vision: domains where the data is euclidean in nature. More recently, considerable progress has been made in engineering deep-learning architectures that can accept non-Euclidean data such as graphs and manifolds: geometric deep learning. 
This progress is of considerable interest to the drug discovery community, as molecules can naturally be represented as graphs, where atoms are nodes and bonds are edges. In this work, we explore the performance of geometric deep-learning methods in the context of drug discovery, comparing machine learned features against the domain expert engineered features that are mainstream in the pharmaceutical industry.", "pmid": "29863875", "title": "Geometric Deep Learning Autonomously Learns Chemical Features That Outperform Those Engineered by Domain Experts."}, {"journal": "The oncologist", "meshMajor": ["Adult", "Aged", "Aged, 80 and over", "Antineoplastic Combined Chemotherapy Protocols", "Artificial Intelligence", "China", "Clinical Decision-Making", "Decision Support Systems, Clinical", "Disease-Free Survival", "Evidence-Based Medicine", "Female", "Humans", "Male", "Medical Oncology", "Middle Aged", "Neoplasms", "Patient Selection", "Practice Guidelines as Topic", "Retrospective Studies"], "year": "2019", "abstractText": "BACKGROUND: IBM Watson for Oncology (WFO), which can use natural language processing to evaluate data in structured and unstructured formats, has begun to be used in China. It provides physicians with evidence-based treatment options and ranks them in three categories for treatment decision support. This study was designed to examine the concordance between the treatment recommendation proposed by WFO and actual clinical decisions by oncologists in our cancer center, which would reflect the differences of cancer treatment between China and the U.S.PATIENTS AND METHODS: Retrospective data from 362 patients with cancer were ingested into WFO from April 2017 to October 2017. WFO recommendations were provided in three categories: recommended, for consideration, and not recommended. Concordance was analyzed by comparing the treatment decisions proposed by WFO with those of the multidisciplinary tumor board. 
Concordance was achieved when the oncologists' treatment decisions were in the recommended or for consideration categories in WFO.RESULTS: Ovarian cancer showed the highest concordance, which was 96%. Lung cancer and breast cancer obtained a concordance of slightly above 80%. The concordance of rectal cancer was 74%, whereas colon cancer and cervical cancer showed the same concordance of 64%. In particular, the concordance of gastric cancer was very low, only 12%, and 88% of cases were under physicians choice.CONCLUSION: Different cancer types showed different concordances, and only gastric cancers were significantly less likely to be concordant. Incidence and pharmaceuticals may be the major cause of discordance. To be comprehensively and rapidly applied in China, WFO needs to accelerate localization. ClinicalTrials.gov Identifier: NCT03400514.IMPLICATIONS FOR PRACTICE: IBM Watson for Oncology (WFO) has begun to be used in China. In this study, concordance was examined between the treatment recommendation proposed by WFO and clinical decisions for 362 patients in our cancer center, which could reflect the differences of cancer treatment between China and the U.S. Different cancer types showed different concordances, and only gastric cancers were significantly less likely to be concordant. Incidence and pharmaceuticals may be the major causes of discordance. To be comprehensively and rapidly applied in China, WFO needs to accelerate localization. 
This study may have a significant effect on application of artificial intelligence systems in China.", "pmid": "30181315", "title": "Concordance Study Between IBM Watson for Oncology and Clinical Practice for Patients with Cancer in China."}, {"journal": "Computational and mathematical methods in medicine", "meshMajor": ["Algorithms", "Artificial Intelligence", "Brain", "Cluster Analysis", "Diagnostic Imaging", "Fuzzy Logic", "Humans", "Image Interpretation, Computer-Assisted", "Image Processing, Computer-Assisted", "Medical Informatics", "Models, Statistical", "Normal Distribution", "Pattern Recognition, Automated", "Reproducibility of Results", "Software", "Tomography, X-Ray Computed"], "year": "2014", "abstractText": "Researchers recently apply an integrative approach to automate medical image segmentation for benefiting available methods and eliminating their disadvantages. Intensity inhomogeneity is a challenging and open problem in this area, which has received less attention by this approach. It has considerable effects on segmentation accuracy. This paper proposes a new kernel-based fuzzy level set algorithm by an integrative approach to deal with this problem. It can directly evolve from the initial level set obtained by Gaussian Kernel-Based Fuzzy C-Means (GKFCM). The controlling parameters of level set evolution are also estimated from the results of GKFCM. Moreover the proposed algorithm is enhanced with locally regularized evolution based on an image model that describes the composition of real-world images, in which intensity inhomogeneity is assumed as a component of an image. Such improvements make level set manipulation easier and lead to more robust segmentation in intensity inhomogeneity. The proposed algorithm has valuable benefits including automation, invariant of intensity inhomogeneity, and high accuracy. Performance evaluation of the proposed algorithm was carried on medical images from different modalities. 
The results confirm its effectiveness for medical image segmentation. ", "pmid": "24624225", "title": "A new kernel-based fuzzy level set method for automated segmentation of medical images in the presence of intensity inhomogeneity."}, {"journal": "Biochemical and biophysical research communications", "meshMajor": ["Artificial Intelligence", "Cell Cycle Proteins", "Cell Division", "Centrosome", "Kinetochores", "Sequence Analysis, Protein"], "year": "2010", "abstractText": "In the process of cell division, a great deal of proteins is assembled into three distinct organelles, namely midbody, centrosome and kinetochore. Knowing the localization of microkit (midbody, centrosome and kinetochore) proteins will facilitate drug target discovery and provide novel insights into understanding their functions. In this study, a support vector machine (SVM) model, MicekiPred, was presented to predict the localization of microkit proteins based on gene ontology (GO) information. A total accuracy of 77.51% was achieved using the jackknife cross-validation. This result shows that the model will be an effective complementary tool for future experimental study. The prediction model and dataset used in this article can be freely downloaded from http://cobi.uestc.edu.cn/people/hlin/tools/MicekiPred/.", "pmid": "20854791", "title": "Prediction of midbody, centrosome and kinetochore proteins based on gene ontology information."}, {"journal": "Physics in medicine and biology", "meshMajor": ["Alzheimer Disease", "Artificial Intelligence", "Diagnosis, Computer-Assisted", "Humans", "Image Interpretation, Computer-Assisted", "Tomography, Emission-Computed, Single-Photon"], "year": "2010", "abstractText": "This paper presents a computer-aided diagnosis technique for improving the accuracy of early diagnosis of Alzheimer-type dementia. 
The proposed methodology is based on the selection of voxels which present Welch's t-test between both classes, normal and Alzheimer images, greater than a given threshold. The mean and standard deviation of intensity values are calculated for selected voxels. They are chosen as feature vectors for two different classifiers: support vector machines with linear kernel and classification trees. The proposed methodology reaches greater than 95% accuracy in the classification task.", "pmid": "20413829", "title": "Computer-aided diagnosis of Alzheimer's disease using support vector machines and classification trees."}, {"journal": "Journal of computer-aided molecular design", "meshMajor": ["Artificial Intelligence", "Humans", "Learning", "Least-Squares Analysis", "Neural Networks, Computer", "Quantitative Structure-Activity Relationship"], "year": null, "abstractText": "Current practice in Quantitative Structure Activity Relationship (QSAR) methods usually involves generating a great number of chemical descriptors and then cutting them back with variable selection techniques. Variable selection is an effective method to reduce the dimensionality but may discard some valuable information. This paper introduces Locally Linear Embedding (LLE), a local non-linear dimensionality reduction technique, that can statistically discover a low-dimensional representation of the chemical data. 
LLE is shown to create more stable representations than other non-linear dimensionality reduction algorithms, and to be capable of capturing non-linearity in chemical data.", "pmid": "15729847", "title": "Locally linear embedding for dimensionality reduction in QSAR."}, {"journal": "Proceedings of the National Academy of Sciences of the United States of America", "meshMajor": ["Artificial Intelligence", "Cognition", "Conscience", "Consciousness", "Ethics", "Humans", "Intelligence", "Memory", "Thinking", "Unconscious, Psychology"], "year": "1992", "abstractText": "A complex system (CS) is defined as a set of elements, with connections between them, singled out of the environment, capable of getting information from the environment, capable of making decisions (i.e., of choosing between alternatives), and having purposefulness (i.e., an urge towards preferable states or other goals). Thinking is a process that takes place (or which can take place) in some of the CS and consists of (i) receiving information from the environment (and from itself), (ii) memorizing the information, (iii) the subconscious, and (iv) consciousness. Life is a process that takes place in some CS and consists of functions i and ii, as well as (v) reproduction with passing of hereditary information to progeny, and (vi) oriented energy and matter exchange with the environment sufficient for the maintenance of all life processes. Memory is a complex of processes of placing information in memory banks, keeping it there, and producing it according to prescriptions available in the system or to inquiries arising in it. Consciousness is a process of realization by the thinking CS of some set of algorithms consisting of the comparison of its knowledge, intentions, decisions, and actions with reality--i.e., with accumulated and continuously received internal and external information. 
Conscience is a realization of an algorithm of good and evil pattern recognition.", "pmid": "1631060", "title": "On the definition of the concepts thinking, consciousness, and conscience."}, {"journal": "Journal of the Royal Society of Medicine", "meshMajor": ["Artificial Intelligence", "Critical Pathways", "Delivery of Health Care, Integrated", "Forecasting", "Heuristics", "Humans", "Technology Assessment, Biomedical"], "year": "2019", "abstractText": "In recent years, there has been massive progress in artificial intelligence (AI) with the development of deep neural networks, natural language processing, computer vision and robotics. These techniques are now actively being applied in healthcare with many of the health service activities currently being delivered by clinicians and administrators predicted to be taken over by AI in the coming years. However, there has also been exceptional hype about the abilities of AI with a mistaken notion that AI will replace human clinicians altogether. These perspectives are inaccurate, and if a balanced perspective of the limitations and promise of AI is taken, one can gauge which parts of the health system AI can be integrated to make a meaningful impact. The four main areas where AI would have the most influence would be: patient administration, clinical decision support, patient monitoring and healthcare interventions. This health system where AI plays a central role could be termed an AI-enabled or AI-augmented health system. 
In this article, we discuss how this system can be developed based on a realistic assessment of current AI technologies and predicted developments.", "pmid": "30507284", "title": "Artificial intelligence-enabled healthcare delivery."}, {"journal": "IEEE transactions on neural networks", "meshMajor": ["Artificial Intelligence", "Computer Simulation", "Humans", "Learning", "Likelihood Functions", "Neural Networks, Computer", "Nonlinear Dynamics", "Recurrence"], "year": "2011", "abstractText": "Recurrent neural network (RNN) has emerged as a promising tool in modeling nonlinear dynamical systems, but the training convergence is still of concern. This paper aims to develop an effective extended Kalman filter-based RNN training approach with a controllable training convergence. The training convergence problem during extended Kalman filter-based RNN training has been proposed and studied by adapting two artificial training noise parameters: the covariance of measurement noise (R) and the covariance of process noise (Q) of Kalman filter. The R and Q adaption laws have been developed using the Lyapunov method and the maximum likelihood method, respectively. The effectiveness of the proposed adaption laws has been tested using a nonlinear dynamical benchmark system and further applied in cutting tool wear modeling. 
The results show that the R adaption law can effectively avoid the divergence problem and ensure the training convergence, whereas the Q adaption law helps improve the training convergence speed.", "pmid": "21402512", "title": "Convergence study in extended Kalman filter-based training of recurrent neural networks."}, {"journal": "Biomedical sciences instrumentation", "meshMajor": ["Artificial Intelligence", "Benchmarking", "Biomedical Engineering", "Computer Simulation", "Cost-Benefit Analysis", "Database Management Systems", "Decision Making, Computer-Assisted", "Efficiency, Organizational", "Electronics, Medical", "Information Storage and Retrieval", "Joint Commission on Accreditation of Healthcare Organizations", "Models, Statistical", "Quality Control", "Quality Indicators, Health Care", "Total Quality Management", "United States"], "year": "2003", "abstractText": "Healthcare is ever changing environment and with the Joint Commission for the Accreditation of Hospital Organization (JCAHO) emphasis on quality improvement during the past several years, and the cost-focused healthcare reforms of the 1990s, benchmarking with peer comparison, and more recently benchmarking against competitors, has taken on a new emphasis. All acute healthcare organizations accredited by JCAHO now require participation in a program titled ORYX, which is designed to use comparisons with other organizations and promote national benchmarks. The knowledge management system designed assists clinical engineering department to convert vast amounts of available data into information, which is ultimately transformed into knowledge to enable better decision-making. The systems assist in using the data as a comparison tool, to compare the performance internally and also compare performance with peer organizations using the same measures within the same measurement system. Collectively, these applications support better, faster data-driven decisions. 
This tool provides fast and easy access to financial and quality metrics to clinical engineering department managers, which increases their ability to perform sophisticated analysis to develop accurate models and forecasts, and make timely, data driven decisions. The project also provides a platform by means of which clinical engineering departmental procedures, data, and methods can be assessed and shared among institutions.", "pmid": "12724889", "title": "Knowledge management system for benchmarking performance indicators using statistical process control (SPC) and Virtual Instrumentation (VI)."}, {"journal": "BMC bioinformatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cluster Analysis", "Databases, Genetic", "Gene Expression Profiling", "Oligonucleotide Array Sequence Analysis", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2006", "abstractText": "BACKGROUND: Cluster analysis is an integral part of high dimensional data analysis. In the context of large scale gene expression data, a filtered set of genes are grouped together according to their expression profiles using one of numerous clustering algorithms that exist in the statistics and machine learning literature. A closely related problem is that of selecting a clustering algorithm that is \"optimal\" in some sense from a rather impressive list of clustering algorithms that currently exist.RESULTS: In this paper, we propose two validation measures each with two parts: one measuring the statistical consistency (stability) of the clusters produced and the other representing their biological functional congruence. Smaller values of these indices indicate better performance for a clustering algorithm. We illustrate this approach using two case studies with publicly available gene expression data sets: one involving a SAGE data of breast cancer patients and the other involving a time course cDNA microarray data on yeast. 
Six well known clustering algorithms UPGMA, K-Means, Diana, Fanny, Model-Based and SOM were evaluated.CONCLUSION: No single clustering algorithm may be best suited for clustering genes into functional groups via expression profiles for all data sets. The validation measures introduced in this paper can aid in the selection of an optimal algorithm, for a given data set, from a collection of available clustering algorithms.", "pmid": "17217509", "title": "Evaluation of clustering algorithms for gene expression data."}, {"journal": "ISA transactions", "meshMajor": ["Air Pressure", "Algorithms", "Artificial Intelligence", "Computer Systems", "Food", "Food Industry", "Food Packaging", "Models, Statistical", "Reinforcement, Psychology", "Steam", "Sterilization", "Temperature"], "year": "2011", "abstractText": "A control technique based on Reinforcement Learning is proposed for the thermal sterilization of canned foods. The proposed controller has the objective of ensuring a given degree of sterilization during Heating (by providing a minimum temperature inside the cans during a given time) and then a smooth Cooling, avoiding sudden pressure variations. For this, three automatic control valves are manipulated by the controller: a valve that regulates the admission of steam during Heating, and a valve that regulate the admission of air, together with a bleeder valve, during Cooling. As dynamical models of this kind of processes are too complex and involve many uncertainties, controllers based on learning are proposed. Thus, based on the control objectives and the constraints on input and output variables, the proposed controllers learn the most adequate control actions by looking up a certain matrix that contains the state-action mapping, starting from a preselected state-action space. This state-action matrix is constantly updated based on the performance obtained with the applied control actions. 
Experimental results at laboratory scale show the advantages of the proposed technique for this kind of processes.", "pmid": "20817160", "title": "Learning control for batch thermal sterilization of canned foods."}, {"journal": "Medical image computing and computer-assisted intervention : MICCAI ... International Conference on Medical Image Computing and Computer-Assisted Intervention", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cerebral Cortex", "Computer Simulation", "Databases, Factual", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Magnetic Resonance Imaging", "Models, Biological", "Models, Statistical", "Nonlinear Dynamics", "Pattern Recognition, Automated", "Principal Component Analysis", "Reference Values", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2005", "abstractText": "Because of the complex shape of human cortical gyri and great variation between individuals, development of effective representation schemes which allow establishment of correspondence between individuals, extraction of average structure of a population, and co-registration has proved very difficult. We introduce an approach which extracts line representations of gyri at different depths from high resolution MRI, labels main gyri semi-automatically, and extracts a template from a population using non-linear principal component analysis. The method has been tested on data from 96 healthy human volunteers. 
The model captures the most salient shape features of all major cortical gyri, and can be used for inter-subject registration, for investigating regionalized inter-subject variability, and for inter-hemispheric comparisons.", "pmid": "16686027", "title": "A construction of an averaged representation of human cortical gyri using non-linear principal component analysis."}, {"journal": "Artificial intelligence in medicine", "meshMajor": ["Anthropology, Cultural", "Artificial Intelligence", "Communication", "Computer Simulation", "Humans", "Information Services", "Interviews as Topic", "Medical History Taking", "Migraine Disorders", "Natural Language Processing", "Patient Education as Topic", "Physician-Patient Relations", "Systems Integration", "Terminology as Topic"], "year": "1995", "abstractText": "This paper is a report on the first phase of a long-term, interdisciplinary project whose goal is to increase the overall effectiveness of physicians' time, and thus the quality of health care, by improving the information exchange between physicians and patients in clinical settings. We are focusing on patients with long-term and chronic conditions, initially on migraine patients, who require periodic interaction with their physicians for effective management of their condition. We are using medical informatics to focus on the information needs of patients, as well as of physicians, and to address problems of information exchange. This requires understanding patients' concerns to design an appropriate system, and using state-of-the-art artificial intelligence techniques to build an interactive explanation system. In contrast to many other knowledge-based systems, our system's design is based on empirical data on actual information needs. We used ethnographic techniques to observe explanations actually given in clinic settings, and to conduct interviews with migraine sufferers and physicians. 
Our system has an extensive knowledge base that contains both general medical terminology and specific knowledge about migraine, such as common trigger factors and symptoms of migraine, the common therapies, and the most common effects and side effects of those therapies. The system consists of two main components: (a) an interactive history-taking module that collects information from patients prior to each visit, builds a patient model, and summarizes the patients' status for their physicians; and (b) an intelligent explanation module that produces an interactive information sheet containing explanations in everyday language that are tailored to individual patients, and responds intelligently to follow-up questions about topics covered in the information sheet.", "pmid": "7647838", "title": "An intelligent interactive system for delivering individualized information to patients."}, {"journal": "Computerized medical imaging and graphics : the official journal of the Computerized Medical Imaging Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Colon", "Colonography, Computed Tomographic", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2009", "abstractText": "An automatic method for the segmentation of the colonic wall is proposed for abdominal computed tomography (CT) of the cleansed and air-inflated colon. This multistage approach uses an adaptive 3D region-growing algorithm, with a self-adjusting growing condition depending on local variations of the intensity at the air-tissue boundary. The method was evaluated using retrospectively collected CT scans based on visual segmentation of the colon by expert radiologists. 
This evaluation showed that the procedure identifies 97% of the colon segments, representing 99.8% of the colon surface, and accurately replicates the anatomical profile of the colonic wall. The parameter settings and performance of the method are relatively independent of the scanner and acquisition conditions. The method is intended for application to the computer-aided detection of polyps in CT colonography.", "pmid": "19304454", "title": "An automatic method for colon segmentation in CT colonography."}, {"journal": "Journal of medical Internet research", "meshMajor": ["Adolescent", "Adult", "Aged", "Artificial Intelligence", "Female", "Humans", "Internet", "Interpersonal Relations", "Male", "Middle Aged", "Mobile Applications", "Social Support", "Young Adult"], "year": "2020", "abstractText": "BACKGROUND: Previous research suggests that artificial agents may be a promising source of social support for humans. However, the bulk of this research has been conducted in the context of social support interventions that specifically address stressful situations or health improvements. Little research has examined social support received from artificial agents in everyday contexts.OBJECTIVE: Considering that social support manifests in not only crises but also everyday situations and that everyday social support forms the basis of support received during more stressful events, we aimed to investigate the types of everyday social support that can be received from artificial agents.METHODS: In Study 1, we examined publicly available user reviews (N=1854) of Replika, a popular companion chatbot. In Study 2, a sample (n=66) of Replika users provided detailed open-ended responses regarding their experiences of using Replika. 
We conducted thematic analysis on both datasets to gain insight into the kind of everyday social support that users receive through interactions with Replika.RESULTS: Replika provides some level of companionship that can help curtail loneliness, provide a \"safe space\" in which users can discuss any topic without the fear of judgment or retaliation, increase positive affect through uplifting and nurturing messages, and provide helpful information/advice when normal sources of informational support are not available.CONCLUSIONS: Artificial agents may be a promising source of everyday social support, particularly companionship, emotional, informational, and appraisal support, but not as tangible support. Future studies are needed to determine who might benefit from these types of everyday social support the most and why. These results could potentially be used to help address global health issues or other crises early on in everyday situations before they potentially manifest into larger issues.", "pmid": "32141837", "title": "User Experiences of Social Support From Companion Chatbots in Everyday Contexts: Thematic Analysis."}, {"journal": "Philosophical transactions of the Royal Society of London. Series B, Biological sciences", "meshMajor": ["Artificial Intelligence", "Ethnic Groups", "Female", "Germany", "Humans", "Language", "Language Tests", "Learning", "Linguistics", "Memory", "Netherlands", "Reaction Time"], "year": "2012", "abstractText": "Processing non-adjacent dependencies is considered to be one of the hallmarks of human language. Assuming that sequence-learning tasks provide a useful way to tap natural-language-processing mechanisms, we cross-modally combined serial reaction time and artificial-grammar learning paradigms to investigate the processing of multiple nested (A(1)A(2)A(3)B(3)B(2)B(1)) and crossed dependencies (A(1)A(2)A(3)B(1)B(2)B(3)), containing either three or two dependencies. 
Both reaction times and prediction errors highlighted problems with processing the middle dependency in nested structures (A(1)A(2)A(3)B(3)_B(1)), reminiscent of the 'missing-verb effect' observed in English and French, but not with crossed structures (A(1)A(2)A(3)B(1)_B(3)). Prior linguistic experience did not play a major role: native speakers of German and Dutch-which permit nested and crossed dependencies, respectively-showed a similar pattern of results for sequences with three dependencies. As for sequences with two dependencies, reaction times and prediction errors were similar for both nested and crossed dependencies. The results suggest that constraints on the processing of multiple non-adjacent dependencies are determined by the specific ordering of the non-adjacent dependencies (i.e. nested or crossed), as well as the number of non-adjacent dependencies to be resolved (i.e. two or three). Furthermore, these constraints may not be specific to language but instead derive from limitations on structured sequence learning.", "pmid": "22688641", "title": "Processing multiple non-adjacent dependencies: evidence from sequence learning."}, {"journal": "Frontiers in bioscience : a journal and virtual library", "meshMajor": ["Algorithms", "Artificial Intelligence", "DNA", "Exons", "GC Rich Sequence", "Humans", "Introns", "Polymorphism, Single Nucleotide", "Sequence Analysis, DNA", "Thermodynamics"], "year": "2007", "abstractText": "Recently, SNP has gained substantial attention as genetic markers and is recognized as a key element in the development of personalized medicine. Computational prediction of SNP can be used as a guide for SNP discovery to reduce the cost and time needed for the development of personalized medicine. We have developed a method for SNP prediction based on support vector machines (SVMs) using different features extracted from the SNP data. 
Prediction rates of 60.9% was achieved by sequence feature, 59.1% by free-energy feature, 58.1% by GC content feature, 58.0% by melting temperature feature, 56.2% by enthalpy feature, 55.1% by entropy feature and 54.3% by the gene, exon and intron feature. We introduced a new feature, the SNP distribution score that achieved a prediction rate of 77.3%. Thus, the proposed SNP prediction algorithm can be used to in SNP discovery.", "pmid": "17127407", "title": "Predicting single nucleotide polymorphisms (SNP) from DNA sequence by support vector machine."}, {"journal": "Applied optics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Biomimetics", "Equipment Design", "Equipment Failure Analysis", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Models, Biological", "Models, Statistical", "Pattern Recognition, Automated", "Reproducibility of Results", "Robotics", "Sensitivity and Specificity", "Transducers", "Vision, Binocular"], "year": "2008", "abstractText": "One major research issue associated with 3D perception by robotic systems is the creation of efficient sensor systems that can generate dense range maps reliably. A visual sensor system for robotic applications is developed that is inherently equipped with two types of sensor, an active trinocular vision and a passive stereo vision. Unlike in conventional active vision systems that use a large number of images with variations of projected patterns for dense range map acquisition or from conventional passive vision systems that work well on specific environments with sufficient feature information, a cooperative bidirectional sensor fusion method for this visual sensor system enables us to acquire a reliable dense range map using active and passive information simultaneously. The fusion algorithms are composed of two parts, one in which the passive stereo vision helps active vision and the other in which the active trinocular vision helps the passive one. 
The first part matches the laser patterns in stereo laser images with the help of intensity images; the second part utilizes an information fusion technique using the dynamic programming method in which image regions between laser patterns are matched pixel-by-pixel with help of the fusion results obtained in the first part. To determine how the proposed sensor system and fusion algorithms can work in real applications, the sensor system is implemented on a robotic system, and the proposed algorithms are applied. A series of experimental tests is performed for a variety of configurations of robot and environments. The performance of the sensor system is discussed in detail.", "pmid": "18404193", "title": "Dense range map reconstruction from a versatile robotic sensor system with an active trinocular vision and a passive binocular vision."}, {"journal": "Journal of biomedical optics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Endoscopy", "Female", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Microscopy, Confocal", "Ovarian Neoplasms", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": null, "abstractText": "The confocal microendoscope is an instrument for imaging the surface of the human ovary. Images taken with this instrument from normal and diseased tissue show significant differences in cellular distribution. A real-time computer-aided system to facilitate the identification of ovarian cancer is introduced. The cellular-level structure present in ex vivo confocal microendoscope images is modeled as texture. Features are extracted based on first-order statistics, spatial gray-level-dependence matrices, and spatial-frequency content. Selection of the features is performed using stepwise discriminant analysis, forward sequential search, a nonparametric method, principal component analysis, and a heuristic technique that combines the results of these other methods. 
The selected features are used for classification, and the performance of various machine classifiers is compared by analyzing areas under their receiver operating characteristic curves. The machine classifiers studied included linear discriminant analysis, quadratic discriminant analysis, and the k-nearest-neighbor algorithm. The results suggest it is possible to automatically identify pathology based on texture features extracted from confocal microendoscope images and that the machine performance is superior to that of a human observer.", "pmid": "18465984", "title": "Computer-aided identification of ovarian cancer in confocal microendoscope images."}, {"journal": "International journal of data mining and bioinformatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Brain", "Brain Injuries", "Data Interpretation, Statistical", "Humans", "Intracranial Pressure"], "year": "2013", "abstractText": "This paper attempts to predict Intracranial Pressure (ICP) based on features extracted from non-invasively collected patient data. These features include midline shift measurement and textural features extracted from Computed axial Tomography (CT) images. A statistical analysis is performed to examine the relationship between ICP and midline shift. Machine learning is also applied to estimate ICP levels with a two-stage feature selection scheme. To avoid overfitting, all feature selections and parameter selections are performed using a nested 10-fold cross validation within the training data. 
The classification results demonstrate the effectiveness of the proposed method in ICP prediction.", "pmid": "24400523", "title": "Predictability of intracranial pressure level in traumatic brain injury: features extraction, statistical analysis and machine learning-based evaluation."}, {"journal": "Medical image analysis", "meshMajor": ["Aging", "Algorithms", "Alzheimer Disease", "Artificial Intelligence", "Brain", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Magnetic Resonance Imaging", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2014", "abstractText": "We develop a multivariate analysis of brain anatomy to identify the relevant shape deformation patterns and quantify the shape changes that explain corresponding variations in clinical neuropsychological measures. We use kernel Partial Least Squares (PLS) and formulate a regression model in the tangent space of the manifold of diffeomorphisms characterized by deformation momenta. The scalar deformation momenta completely encode the diffeomorphic changes in anatomical shape. In this model, the clinical measures are the response variables, while the anatomical variability is treated as the independent variable. To better understand the \"shape-clinical response\" relationship, we also control for demographic confounders, such as age, gender, and years of education in our regression model. We evaluate the proposed methodology on the Alzheimer's Disease Neuroimaging Initiative (ADNI) database using baseline structural MR imaging data and neuropsychological evaluation test scores. We demonstrate the ability of our model to quantify the anatomical deformations in units of clinical response. Our results also demonstrate that the proposed method is generic and generates reliable shape deformations both in terms of the extracted patterns and the amount of shape changes. 
We found that while the hippocampus and amygdala emerge as mainly responsible for changes in test scores for global measures of dementia and memory function, they are not a determinant factor for executive function. Another critical finding was the appearance of thalamus and putamen as most important regions that relate to executive function. These resulting anatomical regions were consistent with very high confidence irrespective of the size of the population used in the study. This data-driven global analysis of brain anatomy was able to reach similar conclusions as other studies in Alzheimer's disease based on predefined ROIs, together with the identification of other new patterns of deformation. The proposed methodology thus holds promise for discovering new patterns of shape changes in the human brain that could add to our understanding of disease progression in neurological disorders. ", "pmid": "24667299", "title": "Quantifying anatomical shape variations in neurological disorders."}, {"journal": "Studies in health technology and informatics", "meshMajor": ["Artificial Intelligence", "Codes of Ethics", "Health", "Humans", "Information Services", "Internet", "Medical Informatics", "Natural Language Processing", "Quality Control"], "year": "2007", "abstractText": "The number of medical websites is constantly growing [1]. Owing to the open nature of the Web, the reliability of information available on the Web is uneven. Internet users are overwhelmed by the quantity of information available on the Web. The situation is even more critical in the medical area, as the content proposed by health websites can have a direct impact on the users' well being. One way to control the reliability of health websites is to assess their quality and to make this assessment available to users. The HON Foundation has defined a set of eight ethical principles. HON's experts are working in order to manually define whether a given website complies with s the required principles. 
As the number of medical websites is constantly growing, manual expertise becomes insufficient and automatic systems should be used in order to help medical experts. In this paper we present the design and the evaluation of an automatic system conceived for the categorisation of medical and health documents according to he HONcode ethical principles. A first evaluation shows promising results. Currently the system shows 0.78 micro precision and 0.73 F-measure, with 0.06 errors.", "pmid": "17911808", "title": "Machine learning approach for automatic quality criteria detection of health web pages."}, {"journal": "IEEE transactions on systems, man, and cybernetics. Part B, Cybernetics : a publication of the IEEE Systems, Man, and Cybernetics Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Data Interpretation, Statistical", "Discriminant Analysis", "Information Storage and Retrieval", "Pattern Recognition, Automated", "Principal Component Analysis"], "year": "2008", "abstractText": "Fisher's linear discriminant analysis (LDA) is a traditional dimensionality reduction method that has been proven to be successful for decades. Numerous variants, such as the kernel-based Fisher discriminant analysis (KFDA), have been proposed to enhance the LDA's power for nonlinear discriminants. Although effective, the KFDA is computationally expensive, since the complexity increases with the size of the data set. In this correspondence, we suggest a novel strategy to enhance the computation for an entire family of the KFDAs. Rather than invoke the KFDA for the entire data set, we advocate that the data be first reduced into a smaller representative subset using a prototype reduction scheme and that the dimensionality reduction be achieved by invoking a KFDA on this reduced data set. 
In this way, data points that are ineffective in the dimension reduction and classification can be eliminated to obtain a significantly reduced kernel matrix K without degrading the performance. Our experimental results demonstrate that the proposed mechanism dramatically reduces the computation time without sacrificing the classification accuracy for artificial and real-life data sets.", "pmid": "18348939", "title": "On using prototype reduction schemes to optimize kernel-based fisher discriminant analysis."}, {"journal": "Computational intelligence and neuroscience", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Humans", "Learning", "Linear Models", "Models, Theoretical", "Reinforcement, Psychology", "Thinking"], "year": "2016", "abstractText": "To improve the convergence rate and the sample efficiency, two efficient learning methods AC-HMLP and RAC-HMLP (AC-HMLP with ?2-regularization) are proposed by combining actor-critic algorithm with hierarchical model learning and planning. The hierarchical models consisting of the local and the global models, which are learned at the same time during learning of the value function and the policy, are approximated by local linear regression (LLR) and linear function approximation (LFA), respectively. Both the local model and the global model are applied to generate samples for planning; the former is used only if the state-prediction error does not surpass the threshold at each time step, while the latter is utilized at the end of each episode. The purpose of taking both models is to improve the sample efficiency and accelerate the convergence rate of the whole algorithm through fully utilizing the local and global information. Experimentally, AC-HMLP and RAC-HMLP are compared with three representative algorithms on two Reinforcement Learning (RL) benchmark problems. 
The results demonstrate that they perform best in terms of convergence rate and sample efficiency.", "pmid": "27795704", "title": "Efficient Actor-Critic Algorithm with Hierarchical Model Learning and Planning."}, {"journal": "Neural networks : the official journal of the International Neural Network Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Data Interpretation, Statistical", "Ecosystem", "Geologic Sediments", "Linear Models", "Netherlands", "Neural Networks, Computer", "Predictive Value of Tests", "Time Factors"], "year": "2006", "abstractText": "The paper presents machine learning (ML) models that predict sedimentation in the harbour basin of the Port of Rotterdam. The important factors affecting the sedimentation process such as waves, wind, tides, surge, river discharge, etc. are studied, the corresponding time series data is analysed, missing values are estimated and the most important variables behind the process are chosen as the inputs. Two ML methods are used: MLP ANN and M5 model tree. The latter is a collection of piece-wise linear regression models, each being an expert for a particular region of the input space. The models are trained on the data collected during 1992-1998 and tested by the data of 1999-2000. The predictive accuracy of the models is found to be adequate for the potential use in the operational decision making.", "pmid": "16530383", "title": "Machine learning in sedimentation modelling."}, {"journal": "Neurological research", "meshMajor": ["Algorithms", "Artificial Intelligence", "Diagnostic Errors", "Electroencephalography", "Epilepsy", "Humans", "Neural Networks, Computer", "Predictive Value of Tests", "Reproducibility of Results", "Signal Processing, Computer-Assisted"], "year": "2004", "abstractText": "Diagnosis of epilepsy is primarily based on scalp-recorded electroencephalograms (EEG). 
Unfortunately the long-term recordings obtained from 'ambulatory recording systems' contain EEG data of up to one week duration, which has introduced new problems for clinical analysis. Traditional methods, where the entire EEG is reviewed by a trained professional, are very time-consuming when applied to recordings of this length. Therefore, several automated diagnostic aid approaches were proposed in recent years, in order to reduce expert effort in analyzing lengthy recordings. The most promising approaches to automated diagnosis are based on neural networks. This paper describes a method for automated detection of epileptic seizures from EEG signals using a multistage nonlinear pre-processing filter in combination with a diagnostic (LAMSTAR) Artificial Neural Network (ANN). Pre-processing via multistage nonlinear filtering, LAMSTAR input preparation, ANN training and system performance (1.6% miss rate, 97.2% overall accuracy when considering both false-alarms and 'misses') are discussed and are shown to compare favorably with earlier approaches presented in recent literature.", "pmid": "14977058", "title": "A neural-network-based detection of epilepsy."}, {"journal": "IEEE transactions on systems, man, and cybernetics. Part B, Cybernetics : a publication of the IEEE Systems, Man, and Cybernetics Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Biometry", "Face", "Facial Expression", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Information Storage and Retrieval", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2007", "abstractText": "Mosaicing entails the consolidation of information represented by multiple images through the application of a registration and blending procedure. We describe a face mosaicing scheme that generates a composite face image during enrollment based on the evidence provided by frontal and semiprofile face images of an individual. 
Face mosaicing obviates the need to store multiple face templates representing multiple poses of a user's face image. In the proposed scheme, the side profile images are aligned with the frontal image using a hierarchical registration algorithm that exploits neighborhood properties to determine the transformation relating the two images. Multiresolution splining is then used to blend the side profiles with the frontal image, thereby generating a composite face image of the user. A texture-based face recognition technique that is a slightly modified version of the C2 algorithm proposed by Serre et al. is used to compare a probe face image with the gallery face mosaic. Experiments conducted on three different databases indicate that face mosaicing, as described in this paper, offers significant benefits by accounting for the pose variations that are commonly observed in face images.", "pmid": "17926704", "title": "A mosaicing scheme for pose-invariant face recognition."}, {"journal": "Proteins", "meshMajor": ["Active Transport, Cell Nucleus", "Algorithms", "Artificial Intelligence", "Computational Biology", "DNA-Binding Proteins", "Databases, Protein", "Humans", "Internet", "Kinetics", "Models, Biological", "Nuclear Localization Signals", "Origin Recognition Complex", "Phosphorylation", "Protein Conformation", "Protein Interaction Domains and Motifs", "Protein Isoforms", "Protein Processing, Post-Translational", "RNA Splicing Factors", "Saccharomyces cerevisiae Proteins", "Serine", "Software Validation", "Transcription Factors", "alpha Karyopherins"], "year": "2014", "abstractText": "The binding affinity between a nuclear localization signal (NLS) and its import receptor is closely related to corresponding nuclear import activity. PTM-based modulation of the NLS binding affinity to the import receptor is one of the most understood mechanisms to regulate nuclear import of proteins. 
However, identification of such regulation mechanisms is challenging due to the difficulty of assessing the impact of PTM on corresponding nuclear import activities. In this study we proposed NIpredict, an effective algorithm to predict nuclear import activity given its NLS, in which molecular interaction energy components (MIECs) were used to characterize the NLS-import receptor interaction, and the support vector regression machine (SVR) was used to learn the relationship between the characterized NLS-import receptor interaction and the corresponding nuclear import activity. Our experiments showed that nuclear import activity change due to NLS change could be accurately predicted by the NIpredict algorithm. Based on NIpredict, we developed a systematic framework to identify potential PTM-based nuclear import regulations for human and yeast nuclear proteins. Application of this approach has identified the potential nuclear import regulation mechanisms by phosphorylation of two nuclear proteins including SF1 and ORC6.", "pmid": "25043850", "title": "Computational identification of post-translational modification-based nuclear import regulations by characterizing nuclear localization signal-import receptor interaction."}, {"journal": "Journal of biomedical informatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computational Biology", "Decision Theory", "MEDLINE", "Natural Language Processing"], "year": "2001", "abstractText": "With the growing use of Natural Language Processing (NLP) techniques for information extraction and concept indexing in the biomedical domain, a method that quickly and efficiently assigns the correct sense of an ambiguous biomedical term in a given context is needed concurrently. The current status of word sense disambiguation (WSD) in the biomedical domain is that handcrafted rules are used based on contextual material. 
The disadvantages of this approach are (i) generating WSD rules manually is a time-consuming and tedious task, (ii) maintenance of rule sets becomes increasingly difficult over time, and (iii) handcrafted rules are often incomplete and perform poorly in new domains comprised of specialized vocabularies and different genres of text. This paper presents a two-phase unsupervised method to build a WSD classifier for an ambiguous biomedical term W. The first phase automatically creates a sense-tagged corpus for W, and the second phase derives a classifier for W using the derived sense-tagged corpus as a training set. A formative experiment was performed, which demonstrated that classifiers trained on the derived sense-tagged corpora achieved an overall accuracy of about 97%, with greater than 90% accuracy for each individual ambiguous term.", "pmid": "11977807", "title": "Disambiguating ambiguous biomedical terms in biomedical narrative text: an unsupervised method."}, {"journal": "Journal of medical systems", "meshMajor": ["Acidosis", "Acidosis, Respiratory", "Adolescent", "Algorithms", "Artificial Intelligence", "Child", "Decision Making, Computer-Assisted", "Decision Support Systems, Clinical", "Decision Trees", "Diagnosis, Differential", "Expert Systems", "Female", "Humans", "Male", "Postoperative Complications"], "year": "1997", "abstractText": "The decision tree approach is one of the most common approaches in automatic learning and decision making. The automatic learning of decision trees and their use usually show very good results in various \"theoretical\" environments. But in real life it is often impossible to find the desired number of representative training objects for various reasons. The lack of possibilities to measure attribute values, high cost and complexity of such measurements, and unavailability of all attributes at the same time are the typical representatives. 
For this reason we decided to use the decision trees not for their primary task--the decision making--but for outlining the most important attributes. This was possible by using a well-known property of the decision trees--their knowledge representation, which can be easily understood by humans. In a delicate field of medical decision making, we cannot allow ourselves to make any inaccurate decisions and the \"tips,\" provided by the decision trees, can be of a great assistance. Our main interest was to discover a predisposition to two forms of acidosis: the metabolic acidosis and respiratory acidosis, which can both have serious effects on child's health. We decided to construct different decision trees from a set of training objects. Instead of using a test set for evaluation of a decision tree, we asked medical experts to take a closer look at the generated trees. They examined and evaluated the decision trees branch by branch. Their comments show that trees generated from the available training set mainly have surprisingly good branches, but on the other hand, for some, no medical explanation could be found.", "pmid": "9555627", "title": "The limitations of decision trees and automatic learning in real world medical decision making."}, {"journal": "IEEE transactions on cybernetics", "meshMajor": ["Adolescent", "Adult", "Algorithms", "Artificial Intelligence", "Emotions", "Face", "Facial Expression", "Female", "Humans", "Image Processing, Computer-Assisted", "Male", "Pattern Recognition, Automated", "Young Adult"], "year": "2015", "abstractText": "In this paper, we present a new idea to analyze facial expression by exploring some common and specific information among different expressions. Inspired by the observation that only a few facial parts are active in expression disclosure (e.g., around mouth, eye), we try to discover the common and specific patches which are important to discriminate all the expressions and only a particular expression, respectively. 
A two-stage multitask sparse learning (MTSL) framework is proposed to efficiently locate those discriminative patches. In the first stage MTSL, expression recognition tasks are combined to located common patches. Each of the tasks aims to find dominant patches for each expression. Secondly, two related tasks, facial expression recognition and face verification tasks, are coupled to learn specific facial patches for individual expression. The two-stage patch learning is performed on patches sampled by multiscale strategy. Extensive experiments validate the existence and significance of common and specific patches. Utilizing these learned patches, we achieve superior performances on expression recognition compared to the state-of-the-arts. ", "pmid": "25291808", "title": "Learning Multiscale Active Facial Patches for Expression Analysis."}, {"journal": "Magnetic resonance in medicine", "meshMajor": ["Adolescent", "Adult", "Aged", "Algorithms", "Artificial Intelligence", "Brain Neoplasms", "Child", "Female", "Glioma", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Magnetic Resonance Angiography", "Male", "Middle Aged", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Young Adult"], "year": "2010", "abstractText": "Dynamic susceptibility contrast magnetic resonance perfusion imaging (DSC-MRI) is a useful method to characterize gliomas. Recently, support vector machines (SVMs) have been introduced as means to prospectively characterize new patients based on information from previous patients. Based on features derived from automatically segmented tumor volumes from 101 DSC-MR examinations, four different SVM models were compared. All SVM models achieved high prediction accuracies (>82%) after rebalancing the training data sets to equal amounts of samples per class. Best discrimination was obtained using a SVM model with a radial basis function kernel. 
A correct prediction of low-grade glioma was obtained at 83% (true positive rate) and for high-grade glioma at 91% (true negative rate) on the independent test data set. In conclusion, the combination of automated tumor segmentation followed by SVM classification is feasible. Thereby, a powerful tool is available to characterize glioma presurgically in patients.", "pmid": "20564592", "title": "Support vector machines in DSC-based glioma imaging: suggestions for optimal characterization."}, {"journal": "IEEE transactions on cybernetics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Feedback", "Models, Theoretical", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2014", "abstractText": "In this paper, the problem of adaptive active fault-tolerant control for a class of nonlinear systems with unknown actuator fault is investigated. The actuator fault is assumed to have no traditional affine appearance of the system state variables and control input. The useful property of the basis function of the radial basis function neural network (NN), which will be used in the design of the fault tolerant controller, is explored. Based on the analysis of the design of normal and passive fault tolerant controllers, by using the implicit function theorem, a novel NN-based active fault-tolerant control scheme with fault alarm is proposed. Comparing with results in the literature, the fault-tolerant control scheme can minimize the time delay between fault occurrence and accommodation that is called the time delay due to fault diagnosis, and reduce the adverse effect on system performance. In addition, the FTC scheme has the advantages of a passive fault-tolerant control scheme as well as the traditional active fault-tolerant control scheme's properties. 
Furthermore, the fault-tolerant control scheme requires no additional fault detection and isolation model which is necessary in the traditional active fault-tolerant control scheme. Finally, simulation results are presented to demonstrate the efficiency of the developed techniques. ", "pmid": "25014982", "title": "Novel neural networks-based fault tolerant control scheme with fault alarm."}, {"journal": "Toxicology and applied pharmacology", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Decision Trees", "Environmental Monitoring", "Least-Squares Analysis", "Molecular Structure", "Quantitative Structure-Activity Relationship", "Regression Analysis", "Risk Assessment", "Species Specificity", "Stochastic Processes", "Tetrahymena pyriformis", "Toxicology"], "year": "2014", "abstractText": "Ensemble learning approach based decision treeboost (DTB) and decision tree forest (DTF) models are introduced in order to establish quantitative structure-toxicity relationship (QSTR) for the prediction of toxicity of 1450 diverse chemicals. Eight non-quantum mechanical molecular descriptors were derived. Structural diversity of the chemicals was evaluated using Tanimoto similarity index. Stochastic gradient boosting and bagging algorithms supplemented DTB and DTF models were constructed for classification and function optimization problems using the toxicity end-point in T. pyriformis. Special attention was drawn to prediction ability and robustness of the models, investigated both in external and 10-fold cross validation processes. In complete data, optimal DTB and DTF models rendered accuracies of 98.90%, 98.83% in two-category and 98.14%, 98.14% in four-category toxicity classifications. Both the models further yielded classification accuracies of 100% in external toxicity data of T. pyriformis. 
The constructed regression models (DTB and DTF) using five descriptors yielded correlation coefficients (R(2)) of 0.945, 0.944 between the measured and predicted toxicities with mean squared errors (MSEs) of 0.059, and 0.064 in complete T. pyriformis data. The T. pyriformis regression models (DTB and DTF) applied to the external toxicity data sets yielded R(2) and MSE values of 0.637, 0.655; 0.534, 0.507 (marine bacteria) and 0.741, 0.691; 0.155, 0.173 (algae). The results suggest for wide applicability of the inter-species models in predicting toxicity of new chemicals for regulatory purposes. These approaches provide useful strategy and robust tools in the screening of ecotoxicological risk or environmental hazard potential of chemicals. ", "pmid": "24463095", "title": "In silico prediction of toxicity of non-congeneric industrial chemicals using ensemble learning based modeling approaches."}, {"journal": "IEEE transactions on image processing : a publication of the IEEE Signal Processing Society", "meshMajor": ["Algorithms", "Artifacts", "Artificial Intelligence", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Neural Networks, Computer", "Pattern Recognition, Automated", "Photogrammetry", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2007", "abstractText": "We propose a modified self-organizing neural network to estimate the disparity map from a stereo pair of images. Novelty consists of the network architecture and of dispensing with the standard assumption of epipolar geometry. Quite distinct from the existing algorithms which, typically, involve area- and/or feature-matching, the network is first initialized to the right image, and then deformed until it is transformed into the left image, or vice versa, this deformation itself being the measure of disparity. 
Illustrative examples include two classes of stereo pairs: synthetic and natural (including random-dot stereograms and wire frames) and distorted. The latter has one of the following special characteristics: one image is blurred, one image is of a different size, there are salient features like discontinuous depth values at boundaries and surface wrinkles, and there exist occluded and half-occluded regions. While these examples serve, in general, to demonstrate that the technique performs better than many existing algorithms, the above-mentioned stereo pairs (in particular, the last two) bring out some of its limitations, thereby serving as possible motivation for further work.", "pmid": "17990758", "title": "On the application of a modified self-organizing neural network to estimate stereo disparity."}, {"journal": "Journal of biomedical informatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cluster Analysis", "Gene Expression Profiling", "Oligonucleotide Array Sequence Analysis", "Pattern Recognition, Automated", "Software", "User-Computer Interface"], "year": "2008", "abstractText": "MOTIVATION: A challenge in microarray data analysis is to interpret observed changes in terms of biological properties and relationships. One powerful approach is to make associations of gene expression clusters with biomedical ontologies and/or biological pathways. However, this approach evaluates only one cluster at a time, returning long unordered lists of annotations for clusters without considering the overall context of the experiment under investigation.RESULTS: BioLattice is a mathematical framework based on concept lattice analysis for the biological interpretation of gene expression data. By considering gene expression clusters as objects and associated annotations as attributes and by using set inclusion relationships BioLattice orders them to create a lattice of concepts, providing an 'executive' summary of the experimental context. 
External knowledge resources such as Gene Ontology trees and pathway graphs can be added incrementally. We propose two quantitative structural analysis methods, 'prominent sub-lattice' and 'core-periphery' analyses, enabling systematic comparison of experimental concepts and contexts. BioLattice is implemented as a web-based utility using Scalable Vector Graphics for interactive visualization. We applied it to real microarray datasets with improved biological interpretations of the experimental contexts.", "pmid": "18093880", "title": "BioLattice: a framework for the biological interpretation of microarray gene expression data using concept lattice analysis."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Adult", "Algorithms", "Artificial Intelligence", "Diagnosis, Computer-Assisted", "Electroencephalography", "Female", "Fourier Analysis", "Humans", "Male", "Middle Aged", "Pattern Recognition, Automated", "Polysomnography", "Reproducibility of Results", "Sensitivity and Specificity", "Sleep Stages", "Wavelet Analysis", "Young Adult"], "year": "2010", "abstractText": "An algorithm to detect automatically drowsiness episodes has been developed. It uses only one EEG channel to differentiate the stages of alertness and drowsiness. In this work the vectors features are building combining Power Spectral Density (PDS) and Wavelet Transform (WT). The feature extracted from the PSD of EEG signal are: Central frequency, the First Quartile Frequency, the Maximum Frequency, the Total Energy of the Spectrum, the Power of Theta and Alpha bands. In the Wavelet Domain, it was computed the number of Zero Crossing and the integrated from the scale 3, 4 and 5 of Daubechies 2 order WT. The classifying of epochs is being done with neural networks. 
The detection results obtained with this technique are 86.5 % for drowsiness stages and 81.7% for alertness segment. Those results show that the features extracted and the classifier are able to identify drowsiness EEG segments.", "pmid": "21096343", "title": "An automatic detector of drowsiness based on spectral analysis and wavelet decomposition of EEG records."}, {"journal": "IEEE transactions on medical imaging", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Elasticity Imaging Techniques", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Models, Biological", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2009", "abstractText": "In this paper, we propose the DT-REFinD algorithm for the diffeomorphic nonlinear registration of diffusion tensor images. Unlike scalar images, deforming tensor images requires choosing both a reorientation strategy and an interpolation scheme. Current diffusion tensor registration algorithms that use full tensor information face difficulties in computing the differential of the tensor reorientation strategy and consequently, these methods often approximate the gradient of the objective function. In the case of the finite-strain (FS) reorientation strategy, we borrow results from the pose estimation literature in computer vision to derive an analytical gradient of the registration objective function. By utilizing the closed-form gradient and the velocity field representation of one parameter subgroups of diffeomorphisms, the resulting registration algorithm is diffeomorphic and fast. We contrast the algorithm with a traditional FS alternative that ignores the reorientation in the gradient computation. We show that the exact gradient leads to significantly better registration at the cost of computation time. 
Independently of the choice of Euclidean or Log-Euclidean interpolation and sum of squared differences dissimilarity measure, the exact gradient achieves better alignment over an entire spectrum of deformation penalties. Alignment quality is assessed with a battery of metrics including tensor overlap, fractional anisotropy, inverse consistency and closeness to synthetic warps. The improvements persist even when a different reorientation scheme, preservation of principal directions, is used to apply the final deformations.", "pmid": "19556193", "title": "DT-REFinD: diffusion tensor registration with exact finite-strain differential."}, {"journal": "Sensors (Basel, Switzerland)", "meshMajor": ["Activities of Daily Living", "Artificial Intelligence", "Computer Communication Networks", "Equipment Design", "Equipment Failure Analysis", "Monitoring, Ambulatory", "Personal Autonomy", "Self-Help Devices", "Software", "Systems Integration", "Telemedicine", "Transducers"], "year": "2014", "abstractText": "The deployment of the Ambient Intelligence (AmI) paradigm requires designing and integrating user-centered smart environments to assist people in their daily life activities. This research paper details an integration and validation of multiple heterogeneous sensors with hybrid reasoners that support decision making in order to monitor personal and environmental data at a smart home in a private way. The results innovate on knowledge-based platforms, distributed sensors, connected objects, accessibility and authentication methods to promote independent living for elderly people. 
TALISMAN+, the AmI framework deployed, integrates four subsystems in the smart home: (i) a mobile biomedical telemonitoring platform to provide elderly patients with continuous disease management; (ii) an integration middleware that allows context capture from heterogeneous sensors to program environment's reaction; (iii) a vision system for intelligent monitoring of daily activities in the home; and (iv) an ontologies-based integrated reasoning platform to trigger local actions and manage private information in the smart home. The framework was integrated in two real running environments, the UPM Accessible Digital Home and MetalTIC house, and successfully validated by five experts in home care, elderly people and personal autonomy.", "pmid": "25232910", "title": "Integration of multisensor hybrid reasoners to support personal autonomy in the smart home."}, {"journal": "M.D. computing : computers in medical practice", "meshMajor": ["Artificial Intelligence", "Computer Communication Networks", "Dementia", "Diagnosis, Computer-Assisted", "Expert Systems", "Humans", "Software"], "year": null, "abstractText": "During the past decade, artificial neural networks have been established as promising psychological and computational models. The proponents of neural computing believe that it offers new solutions to problems that have been intractable so far. To study the suitability of neural networks for performing sequential diagnostic classification, I have used a network that, over time, becomes increasingly proficient at diagnosing dementia. A description of the implementation, training, and behavior of this network illustrates how neural-network technology might contribute to clinical computing.", "pmid": "2407923", "title": "A neural network as an approach to clinical diagnosis."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. 
Annual International Conference", "meshMajor": ["Adult", "Algorithms", "Artificial Intelligence", "Automation", "Bayes Theorem", "Computer Systems", "Electroencephalography", "Equipment Design", "Humans", "Male", "Models, Theoretical", "Neural Networks, Computer", "Normal Distribution", "Reproducibility of Results", "Signal Processing, Computer-Assisted"], "year": "2011", "abstractText": "EEG data has been used to discriminate levels of mental workload when classifiers are created for each subject, but the reliability of classifiers trained on multiple subjects has yet to be investigated. Artificial neural network and naive Bayesian classifiers were trained with data from single and multiple subjects and their ability to discriminate among three difficulty conditions was tested. When trained on data from multiple subjects, both types of classifiers poorly discriminated between the three levels. However, a novel model, the naive Bayesian classifier with a hidden node, performed nearly as well as the models trained and tested on individuals. In addition, a hierarchical Bayes model with a higher level constraint on the hidden node can further improve its performance.", "pmid": "22255836", "title": "An EEG workload classifier for multiple subjects."}, {"journal": "Bioinformatics (Oxford, England)", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cell Cycle Proteins", "Cluster Analysis", "Gene Expression Profiling", "Models, Genetic", "Models, Statistical", "Multivariate Analysis", "Normal Distribution", "Oligonucleotide Array Sequence Analysis", "Saccharomyces cerevisiae Proteins", "Sequence Alignment", "Sequence Analysis, DNA", "Software"], "year": "2004", "abstractText": "MOTIVATION: Grouping genes having similar expression patterns is called gene clustering, which has been proved to be a useful tool for extracting underlying biological information of gene expression data. 
Many clustering procedures have shown success in microarray gene clustering; most of them belong to the family of heuristic clustering algorithms. Model-based algorithms are alternative clustering algorithms, which are based on the assumption that the whole set of microarray data is a finite mixture of a certain type of distributions with different parameters. Application of the model-based algorithms to unsupervised clustering has been reported. Here, for the first time, we demonstrated the use of the model-based algorithm in supervised clustering of microarray data.RESULTS: We applied the proposed methods to real gene expression data and simulated data. We showed that the supervised model-based algorithm is superior over the unsupervised method and the support vector machines (SVM) method.AVAILABILITY: The program written in the SAS language implementing methods I-III in this report is available upon request. The software of SVMs is available in the website http://svm.sdsc.edu/cgi-bin/nph-SVMsubmit.cgi", "pmid": "15044244", "title": "Supervised cluster analysis for microarray data based on multivariate Gaussian mixture."}, {"journal": "IEEE transactions on pattern analysis and machine intelligence", "meshMajor": ["Algorithms", "Artifacts", "Artificial Intelligence", "Data Interpretation, Statistical", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Motion", "Pattern Recognition, Automated", "Photography", "Reproducibility of Results", "Sensitivity and Specificity", "Video Recording"], "year": "2013", "abstractText": "Turbulence mitigation refers to the stabilization of videos with nonuniform deformations due to the influence of optical turbulence. Typical approaches for turbulence mitigation follow averaging or dewarping techniques. Although these methods can reduce the turbulence, they distort the independently moving objects, which can often be of great interest. 
In this paper, we address the novel problem of simultaneous turbulence mitigation and moving object detection. We propose a novel three-term low-rank matrix decomposition approach in which we decompose the turbulence sequence into three components: the background, the turbulence, and the object. We simplify this extremely difficult problem into a minimization of nuclear norm, Frobenius norm, and l1 norm. Our method is based on two observations: First, the turbulence causes dense and Gaussian noise and therefore can be captured by Frobenius norm, while the moving objects are sparse and thus can be captured by l1 norm. Second, since the object's motion is linear and intrinsically different from the Gaussian-like turbulence, a Gaussian-based turbulence model can be employed to enforce an additional constraint on the search space of the minimization. We demonstrate the robustness of our approach on challenging sequences which are significantly distorted with atmospheric turbulence and include extremely tiny moving objects.", "pmid": "22529321", "title": "Simultaneous video stabilization and moving object detection in turbulence."}, {"journal": "IEEE transactions on bio-medical engineering", "meshMajor": ["Algorithms", "Artificial Intelligence", "Atrial Fibrillation", "Diagnosis, Computer-Assisted", "Electrocardiography", "Europe", "Humans", "Pattern Recognition, Automated", "Principal Component Analysis", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2006", "abstractText": "Analysis of atrial rhythm is important in the treatment and management of patients with atrial fibrillation. Several algorithms exist for extracting the atrial signal from the electrocardiogram (ECG) in atrial fibrillation, but there are few reports on how well these techniques are able to recover the atrial signal. We assessed and compared three algorithms for extracting the atrial signal from the 12-lead ECG. 
The 12-lead ECGs of 30 patients in atrial fibrillation were analyzed. Atrial activity was extracted by three algorithms, Spatiotemporal QRST cancellation (STC), principal component analysis (PCA), and independent component analysis (ICA). The amplitude and frequency characteristics of the extracted atrial signals were compared between algorithms and against reference data. Mean (standard deviation) amplitude of QRST segments of V1 was 0.99 (0.54) mV, compared to 0.18 (0.11) mV (STC), 0.19 (0.13) mV (PCA), and 0.29 (0.22) mV (ICA). Hence, for all algorithms there were significant reductions in the amplitude of the ventricular activity compared with that in V1. Reference atrial signal amplitude in V1 was 0.18 (0.11) mV, compared to 0.17 (0.10) mV (STC), 0.12 (0.09) mV (PCA), and 0.18 (0.13) mV (ICA) in the extracted atrial signals. PCA tended to attenuate the atrial signal in these segments. There were no significant differences for any of the algorithms when comparing the amplitude of the reference atrial signal with that of the extracted atrial signals in segments in which ventricular activity had been removed. There were no significant differences between algorithms in the frequency characteristics of the extracted atrial signals. There were discrepancies in amplitude and frequency characteristics of the atrial signal in only a few cases resulting from notable residual ventricular activity for PCA and ICA algorithms. In conclusion, the extracted atrial signals from these algorithms exhibit very similar amplitude and frequency characteristics. 
Users of these algorithms should be observant of residual ventricular activities which can affect the analysis of the fibrillatory waveform in clinical practice.", "pmid": "16485765", "title": "Comparison of atrial signal extraction algorithms in 12-lead ECGs with atrial fibrillation."}, {"journal": "IEEE transactions on information technology in biomedicine : a publication of the IEEE Engineering in Medicine and Biology Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Binding Sites", "Computer Simulation", "Enzyme Activation", "Models, Chemical", "Peptides", "Protein Binding", "Sequence Analysis, Protein", "Viral Nonstructural Proteins"], "year": "2007", "abstractText": "Although various machine learning approaches have been used for predicting protease cleavage sites, constructing a probabilistic model for these tasks is still challenging. This paper proposes a novel algorithm termed as a probabilistic peptide machine where estimating probability density functions and constructing a classifier for predicting protease cleavage sites are combined into one process. The simulation based on experimentally determined Hepatitis C virus (HCV) protease cleavage data has demonstrated the success of this new algorithm.", "pmid": "17912976", "title": "A probabilistic peptide machine for predicting hepatitis C virus protease cleavage sites."}, {"journal": "Journal of chemical information and modeling", "meshMajor": ["Algorithms", "Artificial Intelligence", "Flavonoids", "Genetics", "Nonlinear Dynamics", "Protein Binding", "Quantitative Structure-Activity Relationship", "Receptors, GABA-A", "Regression Analysis"], "year": "2009", "abstractText": "Several studies were conducted in past years which used the evolutionary process of Genetic Algorithms for optimizing the Support Vector Regression parameter values although, however, few of them were devoted to the simultaneously optimization of the type of kernel function involved in the established model. 
The present work introduces a new hybrid genetic-based Support Vector Regression approach, whose statistical quality and predictive capability is afterward analyzed and compared to other standard chemometric techniques, such as Partial Least Squares, Back-Propagation Artificial Neural Networks, and Support Vector Machines based on Cross-Validation. For this purpose, we employ a data set of experimentally determined binding affinity constants toward the benzodiazepine binding site of the GABA (A) receptor complex on 78 flavonoid ligands.", "pmid": "19492793", "title": "New hybrid genetic based Support Vector Regression as QSAR approach for analyzing flavonoids-GABA(A) complexes."}, {"journal": "Physical review. E, Statistical, nonlinear, and soft matter physics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Bayes Theorem", "Learning", "Mathematical Computing", "Models, Statistical", "Neural Networks, Computer", "Normal Distribution", "Thermodynamics"], "year": "2001", "abstractText": "We study the typical learning properties of the recently introduced soft margin classifiers (SMCs), learning realizable and unrealizable tasks, with the tools of statistical mechanics. We derive analytically the behavior of the learning curves in the regime of very large training sets. We obtain exponential and power laws for the decay of the generalization error towards the asymptotic value, depending on the task and on general characteristics of the distribution of stabilities of the patterns to be learned. The optimal learning curves of the SMCs, which give the minimal generalization error, are obtained by tuning the coefficient controlling the trade-off between the error and the regularization terms in the cost function. 
If the task is realizable by the SMC, the optimal performance is better than that of a hard margin support vector machine and is very close to that of a Bayesian classifier.", "pmid": "11580367", "title": "Statistical mechanics of learning with soft margin classifiers."}, {"journal": "Methods in molecular biology (Clifton, N.J.)", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Databases, Factual", "Proteins", "Quantitative Structure-Activity Relationship", "Small Molecule Libraries"], "year": "2011", "abstractText": "Support vector machine (SVM)-based selectivity searching has recently been introduced to identify compounds in virtual screening libraries that are not only active for a target protein, but also selective for this target over a closely related member of the same protein family. In simulated virtual screening calculations, SVM-based strategies termed preference ranking and one-versus-all ranking were successfully applied to rank a database and enrich high-ranking positions with selective compounds while removing nonselective molecules from high ranks. In contrast to the original SVM approach developed for binary classification, these strategies enable learning from more than two classes, considering that distinguishing between selective, promiscuously active, and inactive compounds gives rise to a three-class prediction problem. In this chapter, we describe the extension of the one-versus-all strategy to four training classes. 
Furthermore, we present an adaptation of the preference ranking strategy that leads to higher recall of selective compounds than previously investigated approaches and is applicable in situations where the removal of nonselective compounds from high-ranking positions is not required.", "pmid": "20838983", "title": "Application of support vector machine-based ranking strategies to search for target-selective compounds."}, {"journal": "Anadolu kardiyoloji dergisi : AKD = the Anatolian journal of cardiology", "meshMajor": ["Algorithms", "Artificial Intelligence", "Case-Control Studies", "Coronary Angiography", "Coronary Artery Disease", "Female", "Humans", "Image Interpretation, Computer-Assisted", "Male", "Middle Aged", "Neural Networks, Computer", "Predictive Value of Tests", "Prognosis", "Reproducibility of Results", "Retrospective Studies", "Sensitivity and Specificity"], "year": "2008", "abstractText": "OBJECTIVE: Eight different learning algorithms used for creating artificial neural network (ANN) models and the different ANN models in the prediction of coronary artery disease (CAD) are introduced.METHODS: This work was carried out as a retrospective case-control study. Overall, 124 consecutive patients who had been diagnosed with CAD by coronary angiography (at least 1 coronary stenosis > 50% in major epicardial arteries) were enrolled in the work. Angiographically, the 113 people (group 2) with normal coronary arteries were taken as control subjects. Multi-layered perceptrons ANN architecture were applied. The ANN models trained with different learning algorithms were performed in 237 records, divided into training (n=171) and testing (n=66) data sets. 
The performance of prediction was evaluated by sensitivity, specificity and accuracy values based on standard definitions.RESULTS: The results have demonstrated that ANN models trained with eight different learning algorithms are promising because of high (greater than 71%) sensitivity, specificity and accuracy values in the prediction of CAD. Accuracy, sensitivity and specificity values varied between 83.63%-100%, 86.46%-100% and 74.67%-100% for training, respectively. For testing, the values were more than 71% for sensitivity, 76% for specificity and 81% for accuracy.CONCLUSIONS: It may be proposed that the use of different learning algorithms other than backpropagation and larger sample sizes can improve the performance of prediction. The proposed ANN models trained with these learning algorithms could be used a promising approach for predicting CAD without the need for invasive diagnostic methods and could help in the prognostic clinical decision.", "pmid": "18676299", "title": "Predicting coronary artery disease using different artificial neural network models."}, {"journal": "PloS one", "meshMajor": ["Algorithms", "Artificial Intelligence", "Humans", "Imaging, Three-Dimensional", "Probability", "Signal Processing, Computer-Assisted", "Signal-To-Noise Ratio"], "year": "2018", "abstractText": "This paper presents a robust 3D point cloud registration algorithm based on bidirectional Maximum Correntropy Criterion (MCC). Comparing with traditional registration algorithm based on the mean square error (MSE), using the MCC is superior in dealing with complex registration problem with non-Gaussian noise and large outliers. Since the MCC is considered as a probability measure which weights the corresponding points for registration, the noisy points are penalized. Moreover, we propose to use bidirectional measures which can maximum the overlapping parts and avoid the registration result being trapped into a local minimum. 
Both of these strategies can better apply the information theory method to the point cloud registration problem, making the algorithm more robust. In the process of implementation, we integrate the fixed-point optimization technique based on the iterative closest point algorithm, resulting in the correspondence and transformation parameters that are solved iteratively. The comparison experiments under noisy conditions with related algorithms have demonstrated good performance of the proposed algorithm.", "pmid": "29799864", "title": "Robust 3D point cloud registration based on bidirectional Maximum Correntropy Criterion."}, {"journal": "Optics express", "meshMajor": ["Algorithms", "Artificial Intelligence", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2008", "abstractText": "In this paper, we propose an occlusion removal method using sub-image block matching for improved recognition of partially occluded 3D objects in computational integral imaging (CII). When 3D plane images are reconstructed in CII, occlusion degrades the resolution of reconstructed images. To overcome this problem, we apply the sub-image transform to elemental image array (EIA) and these sub-images are employed using block matching method for depth estimation. Based on the estimated depth information, we remove the unknown occlusion. After completing the occlusion removal for all sub-images, we obtain the modified EIA without occlusion information through the inverse sub-image transform. Finally, the 3D plane images are reconstructed by using a computational integral imaging reconstruction method with the modified EIA. The proposed method can provide a substantial gain in terms of the visual quality of 3D reconstructed images. 
To show the usefulness of the proposed method we carry out some experiments and the results are presented.", "pmid": "18852735", "title": "Occlusion removal method of partially occluded 3D object using sub-image block matching in computational integral imaging."}, {"journal": "IEEE transactions on pattern analysis and machine intelligence", "meshMajor": ["Adult", "Analysis of Variance", "Animals", "Artificial Intelligence", "Brain", "Brain Mapping", "Cercopithecidae", "Corpus Callosum", "Female", "Humans", "Image Processing, Computer-Assisted", "Lateral Ventricles", "Magnetic Resonance Imaging, Cine", "Male", "Middle Aged", "Principal Component Analysis", "Skull"], "year": "2009", "abstractText": "Localized Components Analysis (LoCA) is a new method for describing surface shape variation in an ensemble of objects using a linear subspace of spatially localized shape components. In contrast to earlier methods, LoCA optimizes explicitly for localized components and allows a flexible trade-off between localized and concise representations, and the formulation of locality is flexible enough to incorporate properties such as symmetry. This paper demonstrates that LoCA can provide intuitive presentations of shape differences associated with sex, disease state, and species in a broad range of biomedical specimens, including human brain regions and monkey crania.", "pmid": "19542583", "title": "Exploration of shape variation using localized components analysis."}, {"journal": "Neural computation", "meshMajor": ["Algorithms", "Artificial Intelligence", "Entropy", "Humans", "Least-Squares Analysis"], "year": "2015", "abstractText": "Regression aims at estimating the conditional mean of output given input. However, regression is not informative enough if the conditional density is multimodal, heteroskedastic, and asymmetric. 
In such a case, estimating the conditional density itself is preferable, but conditional density estimation (CDE) is challenging in high-dimensional space. A naive approach to coping with high dimensionality is to first perform dimensionality reduction (DR) and then execute CDE. However, a two-step process does not perform well in practice because the error incurred in the first DR step can be magnified in the second CDE step. In this letter, we propose a novel single-shot procedure that performs CDE and DR simultaneously in an integrated way. Our key idea is to formulate DR as the problem of minimizing a squared-loss variant of conditional entropy, and this is solved using CDE. Thus, an additional CDE step is not needed after DR. We demonstrate the usefulness of the proposed method through extensive experiments on various data sets, including humanoid robot transition and computer art. ", "pmid": "25380340", "title": "Conditional density estimation with dimensionality reduction via squared-loss conditional entropy minimization."}, {"journal": "Neural networks : the official journal of the International Neural Network Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Feedback", "Likelihood Functions", "Markov Chains", "Neural Networks, Computer", "Robotics", "Time Factors"], "year": "2008", "abstractText": "This paper proposes a novel learning method for a mixture of recurrent neural network (RNN) experts model, which can acquire the ability to generate desired sequences by dynamically switching between experts. Our method is based on maximum likelihood estimation, using a gradient descent algorithm. This approach is similar to that used in conventional methods; however, we modify the likelihood function by adding a mechanism to alter the variance for each expert. The proposed method is demonstrated to successfully learn Markov chain switching among a set of 9 Lissajous curves, for which the conventional method fails. 
The learning performance, analyzed in terms of the generalization capability, of the proposed method is also shown to be superior to that of the conventional method. With the addition of a gating network, the proposed method is successfully applied to the learning of sensory-motor flows for a small humanoid robot as a realistic problem of time series prediction and generation.", "pmid": "18938059", "title": "A model for learning to segment temporal sequences, utilizing a mixture of RNN experts together with adaptive variance."}, {"journal": "Information processing in medical imaging : proceedings of the ... conference", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cerebral Cortex", "Diffusion Tensor Imaging", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Nerve Fibers, Myelinated", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2013", "abstractText": "Group neuroimaging studies of the cerebral cortex benefit from accurate, surface-based, cross-subject alignment for investigating brain architecture, function and connectivity. There is an increasing amount of high quality data available. However, establishing how different modalities correlate across groups remains an open research question. One reason for this is that the current methods for registration, based on cortical folding, provide sub-optimal alignment of some functional subregions of the brain. A more flexible framework is needed that will allow robust alignment of multiple modalities. We adapt the Fast Primal-Dual (Fast-PD) approach for discrete Markov Random Field (MRF) optimisation to spherical registration by reframing the deformation labels as a discrete set of rotations and propose a novel regularisation term, derived from the geodesic distance between rotation matrices. 
This formulation allows significant flexibility in the choice of similarity metric. To this end we propose a new multivariate cost function based on the discretisation of a graph-based mutual information measure. Results are presented for alignment driven by scalar metrics of curvature and myelination, and multivariate features derived from functional task performance. These experiments demonstrate the potential of this approach for improving the integration of complementary brain data sets in the future.", "pmid": "24683992", "title": "Multimodal surface matching: fast and generalisable cortical registration using discrete optimisation."}, {"journal": "IEEE transactions on neural networks", "meshMajor": ["Algorithms", "Artificial Intelligence", "Forecasting", "Humans", "Memory", "Neural Networks, Computer", "Protein Structure, Secondary", "Proteins", "Sequence Analysis, Protein"], "year": "2009", "abstractText": "Conventional recurrent neural networks (RNNs) have difficulties in learning long-term dependencies. To tackle this problem, we propose an architecture called segmented-memory recurrent neural network (SMRNN). A symbolic sequence is broken into segments and then presented as inputs to the SMRNN one symbol per cycle. The SMRNN uses separate internal states to store symbol-level context, as well as segment-level context. The symbol-level context is updated for each symbol presented for input. The segment-level context is updated after each segment. The SMRNN is trained using an extended real-time recurrent learning algorithm. We test the performance of SMRNN on the information latching problem, the \"two-sequence problem\" and the problem of protein secondary structure (PSS) prediction. Our implementation results indicate that SMRNN performs better on long-term dependency problems than conventional RNNs. 
Besides, we also theoretically analyze how the segmented memory of SMRNN helps learning long-term temporal dependencies and study the impact of the segment length.", "pmid": "19605323", "title": "Segmented-memory recurrent neural networks."}, {"journal": "BMC bioinformatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Data Mining"], "year": "2013", "abstractText": "BACKGROUND: Negation occurs frequently in scientific literature, especially in biomedical literature. It has previously been reported that around 13% of sentences found in biomedical research articles contain negation. Historically, the main motivation for identifying negated events has been to ensure their exclusion from lists of extracted interactions. However, recently, there has been a growing interest in negative results, which has resulted in negation detection being identified as a key challenge in biomedical relation extraction. In this article, we focus on the problem of identifying negated bio-events, given gold standard event annotations.RESULTS: We have conducted a detailed analysis of three open access bio-event corpora containing negation information (i.e., GENIA Event, BioInfer and BioNLP'09 ST), and have identified the main types of negated bio-events. We have analysed the key aspects of a machine learning solution to the problem of detecting negated events, including selection of negation cues, feature engineering and the choice of learning algorithm. Combining the best solutions for each aspect of the problem, we propose a novel framework for the identification of negated bio-events. We have evaluated our system on each of the three open access corpora mentioned above. 
The performance of the system significantly surpasses the best results previously reported on the BioNLP'09 ST corpus, and achieves even better results on the GENIA Event and BioInfer corpora, both of which contain more varied and complex events.CONCLUSIONS: Recently, in the field of biomedical text mining, the development and enhancement of event-based systems has received significant interest. The ability to identify negated events is a key performance element for these systems. We have conducted the first detailed study on the analysis and identification of negated bio-events. Our proposed framework can be integrated with state-of-the-art event extraction systems. The resulting systems will be able to extract bio-events with attached polarities from textual documents, which can serve as the foundation for more elaborate systems that are able to detect mutually contradicting bio-events.", "pmid": "23323936", "title": "Negated bio-events: analysis and identification."}, {"journal": "Studies in health technology and informatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Humans", "Neural Networks, Computer", "Odorants", "Smell", "Software"], "year": "2000", "abstractText": "For practical applications, artificial neural networks have to meet several requirements: Mainly they should learn quick, classify accurate and behave robust. Programs should be user-friendly and should not need the presence of an expert for fine tuning diverse learning parameters. The present paper demonstrates an approach using an oversized network topology, adaptive propagation (APROP), a modified error function, and averaging outputs of four networks described for the first time. As an example, signals from different semiconductor gas sensors of an electronic nose were classified. The electronic nose smelt different types of edible oil with extremely different a-priori-probabilities. The fully-specified neural network classifier fulfilled the above mentioned demands. 
The new approach will be helpful not only for classifying olfactory signals automatically but also in many other fields in medicine, e.g. in data mining from medical databases.", "pmid": "11187516", "title": "Artificial neural networks for classifying olfactory signals."}, {"journal": "Applied spectroscopy", "meshMajor": ["Air Pollutants", "Aircraft", "Algorithms", "Artificial Intelligence", "Environmental Monitoring", "Ethanol", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Signal Processing, Computer-Assisted", "Spectroscopy, Fourier Transform Infrared"], "year": "2003", "abstractText": "Methodology is developed for the automated detection of heated plumes of ethanol vapor with airborne passive Fourier transform infrared spectrometry. Positioned in a fixed-wing aircraft in a downward-looking mode, the spectrometer is used to detect ground sources of ethanol vapor from an altitude of 2000-3000 ft. Challenges to the use of this approach for the routine detection of chemical plumes include (1) the presence of a constantly changing background radiance as the aircraft flies, (2) the cost and complexity of collecting the data needed to train the classification algorithms used in implementing the plume detection, and (3) the need for rapid interferogram scans to minimize the ground area viewed per scan. To address these challenges, this work couples a novel ground-based data collection and training protocol with the use of signal processing and pattern recognition methods based on short sections of the interferogram data collected by the spectrometer. In the data collection, heated plumes of ethanol vapor are released from a portable emission stack and viewed by the spectrometer from ground level against a synthetic background designed to simulate a terrestrial radiance source. Classifiers trained with these data are subsequently tested with airborne data collected over a period of 2.5 years. 
Two classifier architectures are compared in this work: support vector machines (SVM) and piecewise linear discriminant analysis (PLDA). When applied to the airborne test data, the SVM classifiers perform best, failing to detect ethanol in only 8% of the cases in which it is present. False detections occur at a rate of less than 0.5%. The classifier performs well in spite of differences between the backgrounds associated with the ground-based and airborne data collections and the instrumental drift arising from the long time span of the data collection. Further improvements in classification performance are judged to require increased sophistication in the ground-based data collection in order to provide a better match to the infrared backgrounds observed from the air.", "pmid": "14658159", "title": "Remote detection of heated ethanol plumes by airborne passive Fourier transform infrared spectrometry."}, {"journal": "Neural networks : the official journal of the International Neural Network Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Feedback", "Humans", "Neural Networks, Computer"], "year": "2007", "abstractText": "An important drawback of many artificial neural networks (ANN) is their lack of explanation capability [Andrews, R., Diederich, J., & Tickle, A. B. (1996). A survey and critique of techniques for extracting rules from trained artificial neural networks. Knowledge-Based Systems, 8, 373-389]. This paper starts with a survey of algorithms which attempt to explain the ANN output. We then present HYPINV, a new explanation algorithm which relies on network inversion; i.e. calculating the ANN input which produces a desired output. HYPINV is a pedagogical algorithm, that extracts rules, in the form of hyperplanes. It is able to generate rules with arbitrarily desired fidelity, maintaining a fidelity-complexity tradeoff. 
To our knowledge, HYPINV is the only pedagogical rule extraction method, which extracts hyperplane rules from continuous or binary attribute neural networks. Different network inversion techniques, involving gradient descent as well as an evolutionary algorithm, are presented. An information theoretic treatment of rule extraction is presented. HYPINV is applied to example synthetic problems, to a real aerospace problem, and compared with similar algorithms using benchmark problems.", "pmid": "17029713", "title": "Neural network explanation using inversion."}, {"journal": "BioTechniques", "meshMajor": ["Animals", "Artificial Intelligence", "Base Sequence", "DNA, Complementary", "False Positive Reactions", "Gene Library", "Genetic Testing", "Human Genome Project", "Humans", "Molecular Sequence Data", "Neural Networks, Computer", "Predictive Value of Tests"], "year": "1996", "abstractText": "A low order neural network-based filter was designed as a rapid screening agent for single-spanning transmembrane regions in an integrated informatics system. A rapid screening algorithm was seen as a compromise between costly structure-specific techniques and simple rules that gave a high false-positive rate for cDNA. The filter was applied to a library of 2123 anonymous cDNA sequences, which resulted in 61 detections. Evaluation of the detections with two other dissimilar computer prediction algorithms yielded strong transmembrane predictions for 15 of the detections, while 8 of the detections resulted in a definitive negative result. Homology searches performed on the sequences with detection reports yielded 13 homologs in the predicted reading frame, four of which are membrane associated.", "pmid": "8969840", "title": "High-throughput cDNA screening utilizing a low order neural network filter."}, {"journal": "Medical image computing and computer-assisted intervention : MICCAI ... 
International Conference on Medical Image Computing and Computer-Assisted Intervention", "meshMajor": ["Algorithms", "Artificial Intelligence", "Feasibility Studies", "Femoral Fractures", "Fracture Fixation, Internal", "Humans", "Radiographic Image Enhancement", "Radiographic Image Interpretation, Computer-Assisted", "Reproducibility of Results", "Sensitivity and Specificity", "Surgery, Computer-Assisted", "Tomography, X-Ray Computed", "Treatment Outcome"], "year": "2007", "abstractText": "An algorithm to globally register multiple 3D data sets (point sets) within a general reference frame is proposed. The algorithm uses the Unscented Kalman Filter algorithm to simultaneously compute the registration transformations that map the data sets together, and to calculate the variances of the registration parameters. The data sets are either randomly generated, or collected from a set of fractured bone phantoms using Computed Tomography (CT) images. The algorithm robustly converges for isotropic Gaussian noise that could have perturbed the point coordinates in the data sets. It is also computationally efficient, and enables real-time global registration of multiple data sets, with applications in computer-assisted orthopaedic trauma surgery.", "pmid": "18044659", "title": "Global registration of multiple point sets: feasibility and applications in multi-fragment fracture fixation."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Artificial Intelligence", "Automation", "Humans", "Imaging, Three-Dimensional", "Liver", "Support Vector Machine", "Tomography, X-Ray Computed"], "year": "2012", "abstractText": "This paper presents a semi-automatic approach to segmentation of liver parenchyma from 3D computed tomography (CT) images. 
Specifically, liver segmentation is formalized as a pattern recognition problem, where a given voxel is to be assigned a correct label - either in a liver or a non-liver class. Each voxel is associated with a feature vector that describes image textures. Based on the generated features, an Extreme Learning Machine (ELM) classifier is employed to perform the voxel classification. Since preliminary voxel segmentation tends to be less accurate at the boundary, and there are other non-liver tissue voxels with similar texture characteristics as liver parenchyma, morphological smoothing and 3D level set refinement are applied to enhance the accuracy of segmentation. Our approach is validated on a set of CT data. The experiment shows that the proposed approach with ELM has the reasonably good performance for liver parenchyma segmentation. It demonstrates a comparable result in accuracy of classification but with a much faster training and classification speed compared with support vector machine (SVM).", "pmid": "23366744", "title": "A semi-automatic approach to the segmentation of liver parenchyma from 3D CT images with Extreme Learning Machine."}, {"journal": "Computational intelligence and neuroscience", "meshMajor": ["Algorithms", "Artificial Intelligence", "Internet", "Models, Theoretical", "Software", "Travel"], "year": "2016", "abstractText": "Rapid growth of web and its applications has created a colossal importance for recommender systems. Being applied in various domains, recommender systems were designed to generate suggestions such as items or services based on user interests. Basically, recommender systems experience many issues which reflects dwindled effectiveness. Integrating powerful data management techniques to recommender systems can address such issues and the recommendations quality can be increased significantly. 
Recent research on recommender systems reveals an idea of utilizing social network data to enhance traditional recommender system with better prediction and improved accuracy. This paper expresses views on social network data based recommender systems by considering usage of various recommendation algorithms, functionalities of systems, different types of interfaces, filtering techniques, and artificial intelligence techniques. After examining the depths of objectives, methodologies, and data sources of the existing models, the paper helps anyone interested in the development of travel recommendation systems and facilitates future research direction. We have also proposed a location recommendation system based on social pertinent trust walker (SPTW) and compared the results with the existing baseline random walk models. Later, we have enhanced the SPTW model for group of users recommendations. The results obtained from the experiments have been presented. ", "pmid": "27069468", "title": "A Collaborative Location Based Travel Recommendation System through Enhanced Rating Prediction for the Group of Users."}, {"journal": "Sensors (Basel, Switzerland)", "meshMajor": ["Algorithms", "Artificial Intelligence", "Biometry", "Eyeglasses", "Face", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2013", "abstractText": "This paper presents a system that automatically extracts the position of the eyeglasses and the accurate shape and size of the frame lenses in facial images. The novelty brought by this paper consists in three key contributions. The first one is an original model for representing the shape of the eyeglasses lens, using Fourier descriptors. The second one is a method for generating the search space starting from a finite, relatively small number of representative lens shapes based on Fourier morphing. 
Finally, we propose an accurate lens contour extraction algorithm using a multi-stage Monte Carlo sampling technique. Multiple experiments demonstrate the effectiveness of our approach. ", "pmid": "24152926", "title": "Eyeglasses lens contour extraction from facial images using an efficient shape description."}, {"journal": "BMC bioinformatics", "meshMajor": ["Abstracting and Indexing", "Algorithms", "Artificial Intelligence", "Databases, Factual", "Information Storage and Retrieval", "Natural Language Processing", "Semantics", "Software", "Terminology as Topic", "Vocabulary, Controlled"], "year": "2006", "abstractText": "BACKGROUND: We study the adaptation of Link Grammar Parser to the biomedical sublanguage with a focus on domain terms not found in a general parser lexicon. Using two biomedical corpora, we implement and evaluate three approaches to addressing unknown words: automatic lexicon expansion, the use of morphological clues, and disambiguation using a part-of-speech tagger. We evaluate each approach separately for its effect on parsing performance and consider combinations of these approaches.RESULTS: In addition to a 45% increase in parsing efficiency, we find that the best approach, incorporating information from a domain part-of-speech tagger, offers a statistically significant 10% relative decrease in error.CONCLUSION: When available, a high-quality domain part-of-speech tagger is the best solution to unknown word issues in the domain adaptation of a general parser. In the absence of such a resource, surface clues can provide remarkably good coverage and performance when tuned to the domain. The adapted parser is available under an open-source license.", "pmid": "17134475", "title": "Lexical adaptation of link grammar to the biomedical sublanguage: a comparative evaluation of three approaches."}, {"journal": "Proceedings. 
Symposium on Computer Applications in Medical Care", "meshMajor": ["Artificial Intelligence", "Christianity", "Cross Infection", "Diagnosis, Computer-Assisted", "Expert Systems", "Hospitals, Religious", "Humans", "Infant, Newborn", "Reproducibility of Results", "Sensitivity and Specificity", "Utah"], "year": "1994", "abstractText": "Hospital-acquired infections are responsible for an increase in patient mortality and costs. Their detection is essential to permit better infection control. We developed an expert system specifically to detect infections in pediatric patients. The expert system is implemented at LDS Hospital that has a level three newborn intensive care unit and well baby units. We describe how the knowledge base of the expert system was developed, implemented, and validated in a retrospective study. The results of the system were compared to manual reviewer results. The expert system had a sensitivity of 84.5% and specificity of 92.8% in detecting hospital-acquired infections when compared to a physician reviewer. The Cohen's kappa between the expert system and the physician reviewer was 0.62 (p < .001).", "pmid": "7950013", "title": "Computerized detection of nosocomial infections in newborns."}, {"journal": "Medical image computing and computer-assisted intervention : MICCAI ... International Conference on Medical Image Computing and Computer-Assisted Intervention", "meshMajor": ["Algorithms", "Artificial Intelligence", "Brain", "Connectome", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Magnetic Resonance Imaging", "Mental Recall", "Nerve Net", "Pattern Recognition, Automated", "Recognition, Psychology", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2014", "abstractText": "We present a Riemannian approach for classifying fMRI connectivity patterns before and after intervention in longitudinal studies. 
A fundamental difficulty with using connectivity as features is that covariance matrices live on the positive semi-definite cone, which renders their elements inter-related. The implicit independent feature assumption in most classifier learning algorithms is thus violated. In this paper, we propose a matrix whitening transport for projecting the covariance estimates onto a common tangent space to reduce the statistical dependencies between their elements. We show on real data that our approach provides significantly higher classification accuracy than directly using Pearson's correlation. We further propose a non-parametric scheme for identifying significantly discriminative connections from classifier weights. Using this scheme, a number of neuroanatomically meaningful connections are found, whereas no significant connections are detected with pure permutation testing.", "pmid": "25485405", "title": "Transport on Riemannian manifold for functional connectivity-based classification."}, {"journal": "IEEE transactions on pattern analysis and machine intelligence", "meshMajor": ["Algorithms", "Artificial Intelligence", "Biometry", "Discriminant Analysis", "Gait", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Whole Body Imaging"], "year": "2007", "abstractText": "The traditional image representations are not suited to conventional classification methods, such as the linear discriminant analysis (LDA), because of the under sample problem (USP): the dimensionality of the feature space is much higher than the number of training samples. Motivated by the successes of the two dimensional LDA (2DLDA) for face recognition, we develop a general tensor discriminant analysis (GTDA) as a preprocessing step for LDA. 
The benefits of GTDA compared with existing preprocessing methods, e.g., principal component analysis (PCA) and 2DLDA, include 1) the USP is reduced in subsequent classification by, for example, LDA; 2) the discriminative information in the training tensors is preserved; and 3) GTDA provides stable recognition rates because the alternating projection optimization algorithm to obtain a solution of GTDA converges, while that of 2DLDA does not. We use human gait recognition to validate the proposed GTDA. The averaged gait images are utilized for gait representation. Given the popularity of Gabor function based image decompositions for image understanding and object recognition, we develop three different Gabor function based image representations: 1) the GaborD representation is the sum of Gabor filter responses over directions, 2) GaborS is the sum of Gabor filter responses over scales, and 3) GaborSD is the sum of Gabor filter responses over scales and directions. The GaborD, GaborS and GaborSD representations are applied to the problem of recognizing people from their averaged gait images.A large number of experiments were carried out to evaluate the effectiveness (recognition rate) of gait recognition based on first obtaining a Gabor, GaborD, GaborS or GaborSD image representation, then using GDTA to extract features and finally using LDA for classification. The proposed methods achieved good performance for gait recognition based on image sequences from the USF HumanID Database. 
Experimental comparisons are made with nine state of the art classification methods in gait recognition.", "pmid": "17699917", "title": "General tensor discriminant analysis and gabor features for gait recognition."}, {"journal": "Neural networks : the official journal of the International Neural Network Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cluster Analysis", "Computer Simulation", "Computing Methodologies", "Fuzzy Logic", "Image Interpretation, Computer-Assisted", "Models, Statistical", "Neural Networks, Computer", "Pattern Recognition, Automated", "Software", "Software Validation"], "year": "2007", "abstractText": "This paper focuses on the evolution of Fuzzy ARTMAP neural network classifiers, using genetic algorithms, with the objective of improving generalization performance (classification accuracy of the ART network on unseen test data) and alleviating the ART category proliferation problem (the problem of creating more than necessary ART network categories to solve a classification problem). We refer to the resulting architecture as GFAM. We demonstrate through extensive experimentation that GFAM exhibits good generalization and is of small size (creates few ART categories), while consuming reasonable computational effort. In a number of classification problems, GFAM produces the optimal classifier. Furthermore, we compare the performance of GFAM with other competitive ARTMAP classifiers that have appeared in the literature and addressed the category proliferation problem in ART. We illustrate that GFAM produces improved results over these architectures, as well as other competitive classifiers.", "pmid": "17851035", "title": "GFAM: evolving Fuzzy ARTMAP neural networks."}, {"journal": "Proceedings. 
Symposium on Computer Applications in Medical Care", "meshMajor": ["Algorithms", "Artificial Intelligence", "Classification", "Databases, Factual", "Decision Support Techniques", "Female", "Humans", "Infant, Newborn", "Infant, Premature", "Male", "Obstetric Labor, Premature", "Pregnancy", "Risk Assessment", "Software"], "year": "1994", "abstractText": "Prediction of preterm birth is a poorly understood domain. The existing manual methods of assessment of preterm birth are 17%-38% accurate. The machine learning system LERS was used for three different datasets about pregnant women. Rules induced by LERS were used in conjunction with a classification scheme of LERS, based on \"bucket brigade algorithm\" of genetic algorithms and enhanced by partial matching. The resulting prediction of preterm birth in new, unseen cases is much more accurate (68%-90%).", "pmid": "7950021", "title": "Improving prediction of preterm birth using a new classification scheme and rule induction."}, {"journal": "IEEE transactions on medical imaging", "meshMajor": ["Algorithms", "Artificial Intelligence", "Data Interpretation, Statistical", "Databases, Factual", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Likelihood Functions", "Positron-Emission Tomography", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2006", "abstractText": "We derive computationally efficient methods for the estimation of the mean and variance properties of penalized likelihood dynamic positron emission tomography (PET) images. This allows us to predict the accuracy of reconstructed activity estimates and to compare reconstruction algorithms theoretically. 
We combine a bin-mode approach in which data is modeled as a collection of independent Poisson random variables at each spatiotemporal bin with the space-time separabilities in the imaging equation and penalties to derive rapidly computable analytic mean and variance approximations. We use these approximations to compare bias/variance properties of our dynamic PET image reconstruction algorithm with those of multiframe static PET reconstructions.", "pmid": "16398413", "title": "Mean and covariance properties of dynamic PET reconstructions from list-mode data."}, {"journal": "International journal of neural systems", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Electronic Data Processing", "Fuzzy Logic", "Models, Neurological", "Neural Networks, Computer", "Neurolinguistic Programming", "Nonlinear Dynamics", "Stochastic Processes", "Synapses"], "year": "2001", "abstractText": "An artificial neural network with a two-layer feedback topology and generalized recurrent neurons, for solving nonlinear discrete dynamic optimization problems, is developed. A direct method to assign the weights of neural networks is presented. The method is based on Bellmann's Optimality Principle and on the interchange of information which occurs during the synaptic chemical processing among neurons. The neural network based algorithm is an advantageous approach for dynamic programming due to the inherent parallelism of the neural networks; further it reduces the severity of computational problems that can occur in methods like conventional methods. 
Some illustrative application examples are presented to show how this approach works out including the shortest path and fuzzy decision making problems.", "pmid": "11852439", "title": "A biologically inspired neural network for dynamic programming."}, {"journal": "NeuroImage", "meshMajor": ["Aged", "Aged, 80 and over", "Artificial Intelligence", "Atrophy", "Brain", "Brain Mapping", "Computer Simulation", "Female", "Fourier Analysis", "Humans", "Image Processing, Computer-Assisted", "Imaging, Three-Dimensional", "Longitudinal Studies", "Magnetic Resonance Imaging", "Male", "Middle Aged", "Multivariate Analysis", "Nonlinear Dynamics", "Numerical Analysis, Computer-Assisted", "Reproducibility of Results", "Software"], "year": "2004", "abstractText": "A high-dimensional shape transformation posed in a mass-preserving framework is used as a morphological signature of a brain image. Population differences with complex spatial patterns are then determined by applying a nonlinear support vector machine (SVM) pattern classification method to the morphological signatures. Significant reduction of the dimensionality of the morphological signatures is achieved via wavelet decomposition and feature reduction methods. Applying the method to MR images with simulated atrophy shows that the method can correctly detect subtle and spatially complex atrophy, even when the simulated atrophy represents only a 5% variation from the original image. Applying this method to actual MR images shows that brains can be correctly determined to be male or female with a successful classification rate of 97%, using the leave-one-out method. This proposed method also shows a high classification rate for old adults' age classification, even under difficult test scenarios. 
The main characteristic of the proposed methodology is that, by applying multivariate pattern classification methods, it can detect subtle and spatially complex patterns of morphological group differences which are often not detectable by voxel-based morphometric methods, because these methods analyze morphological measurements voxel-by-voxel and do not consider the entirety of the data simultaneously.", "pmid": "14741641", "title": "Morphological classification of brains via high-dimensional shape transformations and machine learning methods."}, {"journal": "Talanta", "meshMajor": ["Algorithms", "Animals", "Artificial Intelligence", "Discriminant Analysis", "Disease", "Gene Expression Profiling", "Humans", "Neoplasms", "Oligonucleotide Array Sequence Analysis", "Probability"], "year": "2009", "abstractText": "One problem with discriminant analysis of microarray data is representation of each sample by a large number of genes that are possibly irrelevant, insignificant or redundant. Methods of variable selection are, therefore, of great significance in microarray data analysis. To circumvent the problem, a new gene mining approach is proposed based on the similarity between probability density functions on each gene for the class of interest with respect to the others. This method allows the ascertainment of significant genes that are informative for discriminating each individual class rather than maximizing the separability of all classes. Then one can select genes containing important information about the particular subtypes of diseases. Based on the mined significant genes for individual classes, a support vector machine with local kernel transform is constructed for the classification of different diseases. The combination of the gene mining approach with support vector machine is demonstrated for cancer classification using two public data sets. 
The results reveal that significant genes are identified for each cancer, and the classification model shows satisfactory performance in training and prediction for both data sets.", "pmid": "19559875", "title": "Variable selection using probability density function similarity for support vector machine classification of high-dimensional microarray data."}, {"journal": "BMC bioinformatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Database Management Systems", "Databases, Bibliographic", "Information Storage and Retrieval", "Natural Language Processing", "Periodicals as Topic", "Software", "Vocabulary, Controlled"], "year": "2008", "abstractText": "BACKGROUND: Despite increasing interest in applying Natural Language Processing (NLP) to biomedical text, whether this technology can facilitate tasks such as database curation remains unclear.RESULTS: PaperBrowser is the first NLP-powered interface that was developed under a user-centered approach to improve the way in which FlyBase curators navigate an article. In this paper, we first discuss how observing curators at work informed the design and evaluation of PaperBrowser. Then, we present how we appraise PaperBrowser's navigational functionalities in a user-based study using a text highlighting task and evaluation criteria of Human-Computer Interaction. Our results show that PaperBrowser reduces the amount of interactions between two highlighting events and therefore improves navigational efficiency by about 58% compared to the navigational mechanism that was previously available to the curators. 
Moreover, PaperBrowser is shown to provide curators with enhanced navigational utility by over 74% irrespective of the different ways in which they highlight text in the article.CONCLUSION: We show that state-of-the-art performance in certain NLP tasks such as Named Entity Recognition and Anaphora Resolution can be combined with the navigational functionalities of PaperBrowser to support curation quite successfully.", "pmid": "18410678", "title": "Natural language processing in aid of FlyBase curators."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Aged", "Aged, 80 and over", "Aging", "Algorithms", "Artificial Intelligence", "Brain", "Cognitive Dysfunction", "Diffusion Tensor Imaging", "Humans", "Linear Models", "Male", "Normal Distribution"], "year": "2013", "abstractText": "In this study, we employed diffusion tensor imaging (DTI) to construct brain structural network and then derive the connection matrices from 96 healthy elderly subjects. The correlation analysis between these topological properties of network based on graph theory and the Cognitive Abilities Screening Instrument (CASI) index were processed to extract the significant network characteristics. These characteristics were then integrated to estimate the models by various machine-learning algorithms to predict user's cognitive performance. From the results, linear regression model and Gaussian processes model showed presented better abilities with lower mean absolute errors of 5.8120 and 6.25 to predict the cognitive performance respectively. Moreover, these extracted topological properties of brain structural network derived from DTI also could be regarded as the bio-signatures for further evaluation of brain degeneration in healthy aged and early diagnosis of mild cognitive impairment (MCI). 
", "pmid": "24109740", "title": "A prediction model for cognitive performance in health ageing using diffusion tensor imaging with graph theory."}, {"journal": "Medical image computing and computer-assisted intervention : MICCAI ... International Conference on Medical Image Computing and Computer-Assisted Intervention", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cluster Analysis", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2009", "abstractText": "Segmentation of anatomical objects is always a fundamental task for various clinical applications. Although many automatic segmentation methods have been designed to segment specific anatomical objects in a given imaging modality, a more generic solution that is directly applicable to different imaging modalities and different deformable surfaces is desired, if attainable. In this paper, we propose such a framework, which learns from examples the spatially adaptive appearance and shape of a 3D surface (either open or closed). The application to a new object/surface in a new modality requires only the annotation of training examples. Key contributions of our method include: (1) an automatic clustering and learning algorithm to capture the spatial distribution of appearance similarities/variations on the 3D surface. More specifically, the model vertices are hierarchically clustered into a set of anatomical primitives (sub-surfaces) using both geometric and appearance features. The appearance characteristics of each learned anatomical primitive are then captured through a cascaded boosting learning method. (2) To effectively incorporate non-Gaussian shape priors, we cluster the training shapes in order to build multiple statistical shape models. 
(3) To our best knowledge, this is the first time the same segmentation algorithm has been directly employed in two very diverse applications: (a) Liver segmentation (closed surface) in PET-CT, in which CT has very low-resolution and low-contrast; (b) Distal femur (condyle) surface (open surface) segmentation in MRI.", "pmid": "20426213", "title": "Cross modality deformable segmentation using hierarchical clustering and learning."}, {"journal": "IEEE transactions on systems, man, and cybernetics. Part B, Cybernetics : a publication of the IEEE Systems, Man, and Cybernetics Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Computer Systems", "Decision Support Techniques", "Models, Theoretical", "Motion", "Pattern Recognition, Automated", "Robotics"], "year": "2006", "abstractText": "This paper presents a simple yet efficient dynamic-programming (DP) shortest path algorithm for real-time collision-free robot-path planning applicable to situations in which targets and barriers are permitted to move. The algorithm works in real time and requires no prior knowledge of target or barrier movements. In the case that the barriers are stationary, this paper proves that this algorithm always results in the robot catching the target, provided it moves at a greater speed than the target, and the dynamic-system update frequency is sufficiently large. Like most robot-path-planning approaches, the environment is represented by a topologically organized map. Each grid point on the map has only local connections to its neighboring grid points from which it receives information in real time. The information stored at each point is a current estimate of the distance to the nearest target and the neighbor from which this distance was determined. 
Updating the distance estimate at each grid point is done using only the information gathered from the point's neighbors, that is, each point can be considered an independent processor, and the order in which grid points are updated is not determined based on global knowledge of the current distances at each point or the previous history of each point. The robot path is determined in real time completely from the information at the robot's current grid-point location. The computational effort to update each point is minimal, allowing for rapid propagation of the distance information outward along the grid from the target locations. In the static situation, where both the targets and the barriers do not move, this algorithm is a DP solution to the shortest path problem, but is restricted by lack of global knowledge. In this case, this paper proves that the dynamic system converges in a small number of iterations to a state where the minimal distance to a target is recorded at each grid point and shows that this robot-path-planning algorithm can be made to always choose an optimal path. The effectiveness of this algorithm is demonstrated through a number of simulations.", "pmid": "16903362", "title": "An efficient dynamic system for real-time robot-path planning."}, {"journal": "IEEE transactions on pattern analysis and machine intelligence", "meshMajor": ["Algorithms", "Artificial Intelligence", "Bayes Theorem", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Models, Statistical", "Pattern Recognition, Automated"], "year": "2006", "abstractText": "In this paper, the optimizations of three fundamental components of image understanding: segmentation/annotation, 3D sensing (stereo) and 3D fitting, are posed and integrated within a Bayesian framework. This approach benefits from recent advances in statistical learning which have resulted in greatly improved flexibility and robustness. 
The first two components produce annotation (region labeling) and depth maps for the input images, while the third module integrates and resolves the inconsistencies between region labels and depth maps to fit most likely 3D models. To illustrate the application of these ideas, we have focused on the difficult problem of fitting individual tree models to tree stands which is a major challenge for vision-based forestry inventory systems.", "pmid": "16640256", "title": "Component optimization for image understanding: a Bayesian approach."}, {"journal": "[Rinsho ketsueki] The Japanese journal of clinical hematology", "meshMajor": ["Artificial Intelligence", "Deep Learning", "Humans", "Neural Networks, Computer"], "year": "2020", "abstractText": "Artificial intelligence (AI) has been applied widely in medicine. For example, deep neural network-based deep learning is particularly effective for pattern recognition in static medical images. Additionally, dynamic time series data are analysed ubiquitously in biology and medicine, as in the application of BCR-ABL International Scale time series data measured from CML patients treated with tyrosine-kinase inhibitors. Nonlinear data analyses, rather than conventional deep learning, can be more powerful for this type of dynamic disease information. Here, I introduce our mathematical approaches that are applicable for disease dynamics, such as dynamical network biomarkers (DNB) and randomly distributed embedding (RDE), as examples of nonlinear data analyses. 
I also discuss the availability of neuroinspired and neuromorphic hardware systems, which we are developing for potential use in next-generation AI.", "pmid": "32507823", "title": "[Present and future perspectives of artificial intelligence: examples of mathematical approaches for analysis of disease dynamics]."}, {"journal": "Cell", "meshMajor": ["Algorithms", "Animals", "Artificial Intelligence", "Embryoid Bodies", "Embryonic Stem Cells", "Genomics", "Harringtonines", "High-Throughput Nucleotide Sequencing", "Kinetics", "Mice", "Open Reading Frames", "Peptide Chain Initiation, Translational", "Protein Biosynthesis", "RNA", "Ribosomes", "Sequence Analysis, RNA"], "year": "2011", "abstractText": "The ability to sequence genomes has far outstripped approaches for deciphering the information they encode. Here we present a suite of techniques, based on ribosome profiling (the deep sequencing of ribosome-protected mRNA fragments), to provide genome-wide maps of protein synthesis as well as a pulse-chase strategy for determining rates of translation elongation. We exploit the propensity of harringtonine to cause ribosomes to accumulate at sites of translation initiation together with a machine learning algorithm to define protein products systematically. Analysis of translation in mouse embryonic stem cells reveals thousands of strong pause sites and unannotated translation products. These include amino-terminal extensions and truncations and upstream open reading frames with regulatory potential, initiated at both AUG and non-AUG codons, whose translation changes after differentiation. We also define a class of short, polycistronic ribosome-associated coding RNAs (sprcRNAs) that encode small proteins. 
Our studies reveal an unanticipated complexity to mammalian proteomes.", "pmid": "22056041", "title": "Ribosome profiling of mouse embryonic stem cells reveals the complexity and dynamics of mammalian proteomes."}, {"journal": "TheScientificWorldJournal", "meshMajor": ["Algorithms", "Artificial Intelligence", "Humans", "Language", "Natural Language Processing", "Semantics"], "year": "2013", "abstractText": "Word sense disambiguation (WSD) is a fundamental problem in nature language processing, the objective of which is to identify the most proper sense for an ambiguous word in a given context. Although WSD has been researched over the years, the performance of existing algorithms in terms of accuracy and recall is still unsatisfactory. In this paper, we propose a novel approach to word sense disambiguation based on topical and semantic association. For a given document, supposing that its topic category is accurately discriminated, the correct sense of the ambiguous term is identified through the corresponding topic and semantic contexts. We firstly extract topic discriminative terms from document and construct topical graph based on topic span intervals to implement topic identification. We then exploit syntactic features, topic span features, and semantic features to disambiguate nouns and verbs in the context of ambiguous word. Finally, we conduct experiments on the standard data set SemCor to evaluate the performance of the proposed method, and the results indicate that our approach achieves relatively better performance than existing approaches. ", "pmid": "24294131", "title": "A novel approach to word sense disambiguation based on topical and semantic association."}, {"journal": "IEEE transactions on systems, man, and cybernetics. 
Part B, Cybernetics : a publication of the IEEE Systems, Man, and Cybernetics Society", "meshMajor": ["Accidents, Traffic", "Analysis of Variance", "Artificial Intelligence", "Attention", "Automobile Driving", "Eye Movement Measurements", "Head Movements", "Humans", "Protective Devices", "Systems Integration", "User-Computer Interface"], "year": "2009", "abstractText": "In this paper, we introduce a novel laser-based wide-area heads-up windshield display which is capable of actively interfacing with a human as part of a driver assistance system. The dynamic active display (DAD) is a unique prototype interface that presents safety-critical visual icons to the driver in a manner that minimizes the deviation of his or her gaze direction without adding to unnecessary visual clutter. As part of an automotive safety system, the DAD presents alerts in the field of view of the driver only if necessary, which is based upon the state and pose of the driver, vehicle, and environment. This paper examines the effectiveness of DAD through a comprehensive comparative experimental evaluation of a speed compliance driver assistance system, which is implemented on a vehicular test bed. Three different types of display protocols for assisting a driver to comply with speed limits are tested on actual roadways, and these are compared with a conventional dashboard display. Given the inclination, drivers who are given an overspeed warning alert reduced the time required to slow down to the speed limit by 38% (p < 0.01) as compared with the drivers not given the alert. Additionally, certain alerts decreased distraction levels by reducing the time spent looking away from the road by 63% (p < 0.01). 
Ultimately, these alerts demonstrate the utility and promise of the DAD system.", "pmid": "19068432", "title": "A novel active heads-up display for driver assistance."}, {"journal": "Healthcare informatics : the business magazine for information and communication systems", "meshMajor": ["Artificial Intelligence", "Hospital Information Systems", "United States"], "year": "1990", "abstractText": "Computers have manipulated data to increase the efficiency of their users. AI represents the next evolutionary step.", "pmid": "10120641", "title": "Artificial intelligence in healthcare management."}, {"journal": "Neuroscience letters", "meshMajor": ["Artificial Intelligence", "Child", "Diagnosis, Computer-Assisted", "Female", "Humans", "Hyperkinesis", "Male", "Mental Disorders", "Psychiatric Status Rating Scales", "Severity of Illness Index"], "year": "2011", "abstractText": "Automatic classification of different behavioral disorders with many similarities (e.g. in symptoms) by using an automated approach will help psychiatrists to concentrate on correct disorder and its treatment as soon as possible, to avoid wasting time on diagnosis, and to increase the accuracy of diagnosis. In this study, we tried to differentiate and classify (diagnose) 306 children with many similar symptoms and different behavioral disorders such as ADHD, depression, anxiety, comorbid depression and anxiety and conduct disorder with high accuracy. Classification was based on the symptoms and their severity. 
With examining 16 different available classifiers, by using \"Prtools\", we have proposed nearest mean classifier as the most accurate classifier with 96.92% accuracy in this research.", "pmid": "21396979", "title": "Automatic classification of hyperactive children: comparing multiple artificial intelligence approaches."}, {"journal": "The New phytologist", "meshMajor": ["Artificial Intelligence", "Fossils", "Image Processing, Computer-Assisted", "Internet", "Picea", "Pollen", "Reproducibility of Results", "Software"], "year": "2012", "abstractText": "Pollen is among the most ubiquitous of terrestrial fossils, preserving an extended record of vegetation change. However, this temporal continuity comes with a taxonomic tradeoff. Analytical methods that improve the taxonomic precision of pollen identifications would expand the research questions that could be addressed by pollen, in fields such as paleoecology, paleoclimatology, biostratigraphy, melissopalynology, and forensics. We developed a supervised, layered, instance-based machine-learning classification system that uses leave-one-out bias optimization and discriminates among small variations in pollen shape, size, and texture. We tested our system on black and white spruce, two paleoclimatically significant taxa in the North American Quaternary. We achieved > 93% grain-to-grain classification accuracies in a series of experiments with both fossil and reference material. More significantly, when applied to Quaternary samples, the learning system was able to replicate the count proportions of a human expert (R(2) = 0.78, P = 0.007), with one key difference - the machine achieved these ratios by including larger numbers of grains with low-confidence identifications. 
Our results demonstrate the capability of machine-learning systems to solve the most challenging palynological classification problem, the discrimination of congeneric species, extending the capabilities of the pollen analyst and improving the taxonomic resolution of the palynological record.", "pmid": "22943455", "title": "Classifying black and white spruce pollen using layered machine learning."}, {"journal": "American journal of ophthalmology", "meshMajor": ["Anterior Eye Segment", "Artificial Intelligence", "Deep Learning", "Female", "Glaucoma, Angle-Closure", "Gonioscopy", "Humans", "Male", "Middle Aged", "ROC Curve", "Tomography, Optical Coherence"], "year": "2019", "abstractText": "PURPOSE: Anterior segment optical coherence tomography (AS-OCT) provides an objective imaging modality for visually identifying anterior segment structures. An automated detection system could assist ophthalmologists in interpreting AS-OCT images for the presence of angle closure.DESIGN: Development of an artificial intelligence automated detection system for the presence of angle closure.METHODS: A deep learning system for automated angle-closure detection in AS-OCT images was developed, and this was compared with another automated angle-closure detection system based on quantitative features. A total of 4135 Visante AS-OCT images from 2113 subjects (8270 anterior chamber angle images with 7375 open-angle and 895 angle-closure) were examined. The deep learning angle-closure detection system for a 2-class classification problem was tested by 5-fold cross-validation. 
The deep learning system and the automated angle-closure detection system based on quantitative features were evaluated against clinicians' grading of AS-OCT images as the reference standard.RESULTS: The area under the receiver operating characteristic curve of the system using quantitative features was 0.90 (95% confidence interval [CI] 0.891-0.914) with a sensitivity of 0.79 \u00b1 0.037 and a specificity of 0.87 \u00b1 0.009, while the area under the receiver operating characteristic curve of the deep learning system was 0.96 (95% CI 0.953-0.968) with a sensitivity of 0.90 \u00b1 0.02 and a specificity of 0.92 \u00b1 0.008, against clinicians' grading of AS-OCT images as the reference standard.CONCLUSIONS: The results demonstrate the potential of the deep learning system for angle-closure detection in AS-OCT images.", "pmid": "30849350", "title": "A Deep Learning System for Automated Angle-Closure Detection in Anterior Segment Optical Coherence Tomography Images."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Algorithms", "Arrhythmias, Cardiac", "Artificial Intelligence", "Computer Simulation", "Diagnosis, Computer-Assisted", "Electrocardiography", "Heart Rate", "Humans", "Models, Cardiovascular", "Models, Statistical", "Pattern Recognition, Automated", "Principal Component Analysis", "Software"], "year": "2007", "abstractText": "In this paper, we developed the novel algorithm for cardiac arrhythmia classification. Until now, back propagation neural network (BPNN) was frequently used for these tasks. However, general gradient based learning method is far slower than what is required for their application. The proposed algorithm adapts Extreme Learning Machine(ELM) that has the advantage of very fast learning speed and high accuracy. 
In this paper, we classify beats into normal beat, left bundle branch block beat, right bundle branch block beat, premature ventricular contraction, atrial premature beat, paced beat, and ventricular escape beat. Experimental results show that we can obtain 97.45% in average accuracy, 97.44% in average sensitivity, 98.46% in average specificity, and 2.423 seconds in learning time.", "pmid": "18002690", "title": "Algorithm for classifying arrhythmia using Extreme Learning Machine and principal component analysis."}], "negative": [{"journal": "Proceedings of the National Academy of Sciences of the United States of America", "meshMajor": ["Alternative Splicing", "Base Sequence", "Cell Differentiation", "Cells, Cultured", "Embryonic Stem Cells", "Gene Expression Profiling", "Gene Expression Regulation, Developmental", "Humans", "Neurons", "RNA", "RNA, Messenger", "Sequence Analysis, DNA", "Transcription, Genetic"], "year": "2010", "abstractText": "To examine the fundamental mechanisms governing neural differentiation, we analyzed the transcriptome changes that occur during the differentiation of hESCs into the neural lineage. Undifferentiated hESCs as well as cells at three stages of early neural differentiation-N1 (early initiation), N2 (neural progenitor), and N3 (early glial-like)-were analyzed using a combination of single read, paired-end read, and long read RNA sequencing. The results revealed enormous complexity in gene transcription and splicing dynamics during neural cell differentiation. We found previously unannotated transcripts and spliced isoforms specific for each stage of differentiation. Interestingly, splicing isoform diversity is highest in undifferentiated hESCs and decreases upon differentiation, a phenomenon we call isoform specialization. 
During neural differentiation, we observed differential expression of many types of genes, including those involved in key signaling pathways, and a large number of extracellular receptors exhibit stage-specific regulation. These results provide a valuable resource for studying neural differentiation and reveal insights into the mechanisms underlying in vitro neural differentiation of hESCs, such as neural fate specification, neural progenitor cell identity maintenance, and the transition from a predominantly neuronal state into one with increased gliogenic potential.", "pmid": "20194744", "title": "Dynamic transcriptomes during neural differentiation of human embryonic stem cells revealed by short, long, and paired-end sequencing."}, {"journal": "Le Journal dentaire du Quebec", "meshMajor": ["Carcinogens", "Cell Transformation, Neoplastic", "Humans", "Neoplasms", "Neoplasms, Radiation-Induced", "Oncogenic Viruses"], "year": "1991", "abstractText": "This article briefly describes the fundamental principles associated with the development of cancerous cells in man. The manner in which these neoplastic cells appear and spread are discussed. A better understanding of these phenomena will allow the practitioner to realize the importance and necessity of a systematic evaluation of all patients.", "pmid": "1819607", "title": "[Oncology and its applications. 1. Basic principles of oncology]."}, {"journal": "LGBT health", "meshMajor": ["Adolescent", "Adult", "Comprehension", "Florida", "Health Communication", "Health Knowledge, Attitudes, Practice", "Health Policy", "Homosexuality, Male", "Humans", "Interviews as Topic", "Male", "Pre-Exposure Prophylaxis", "Public Health", "Qualitative Research", "Sexual and Gender Minorities", "Young Adult"], "year": "2016", "abstractText": "PURPOSE: Street markets in antiretroviral medications for HIV have been documented, but sources of demand are not well understood. 
We report unexpected findings from qualitative research suggesting that some demand is for informal pre-exposure prophylaxis (PrEP).METHODS: Focus groups with young men who have sex with men (N\u2009=\u200931) yielded information on their understanding and use of PrEP.RESULTS: Of those who had heard of it, few understood PrEP to be a physician-prescribed regimen; most believed it to be a pill taken before and/or after sex and acquired on the street or through HIV-positive friends.CONCLUSION: Implications for PrEP rollout and public health policy are discussed.", "pmid": "26720130", "title": "Misunderstanding of Pre-Exposure Prophylaxis Use Among Men Who Have Sex with Men: Public Health and Policy Implications."}, {"journal": "FEBS letters", "meshMajor": ["Adaptor Proteins, Signal Transducing", "Blotting, Western", "Carrier Proteins", "DNA Primers", "DNA-Binding Proteins", "Gene Expression Regulation, Enzymologic", "Genes, Reporter", "Glutathione Peroxidase", "Hydrogen Peroxide", "Lac Operon", "Models, Genetic", "Mutagenesis", "Oxidative Stress", "Phosphoproteins", "Point Mutation", "Polymerase Chain Reaction", "Promoter Regions, Genetic", "Protein Binding", "Response Elements", "Saccharomyces cerevisiae", "Saccharomyces cerevisiae Proteins", "Time Factors", "Transcription Factors", "beta-Galactosidase"], "year": "2004", "abstractText": "The GPX2 gene encodes a homologue of phospholipid hydroperoxide glutathione peroxidase in Saccharomyces cerevisiae. The GPX2 promoter contains three elements the sequence of which is completely consistent with the optimal sequence for the Yap1 response element (YRE). Here, we identify the intrinsic YRE that functions in the oxidative stress response of GPX2. In addition, we discovered a cis-acting element (5'-GGCCGGC-3') within the GPX2 promoter proximal to the functional YRE that is necessary for H(2)O(2)-induced expression of GPX2. 
We present evidence showing that Skn7 is necessary for the oxidative stress response of GPX2 and is able to bind to this sequence. We determine the optimal sequence for Skn7 to regulate GPX2 under conditions of oxidative stress to be 5'-GGC(C/T)GGC-3', and we designate this sequence the oxidative stress-responsive Skn7 response element.", "pmid": "15135069", "title": "Regulation of the yeast phospholipid hydroperoxide glutathione peroxidase GPX2 by oxidative stress is mediated by Yap1 and Skn7."}, {"journal": "New York state journal of medicine", "meshMajor": ["Cocaine", "Cross-Sectional Studies", "Female", "Fetal Blood", "Hospitals, Urban", "Humans", "Incidence", "Infant, Newborn", "New York City", "Pregnancy", "Pregnancy Complications", "Prenatal Care", "Retrospective Studies", "Substance-Related Disorders", "Syphilis", "Syphilis, Congenital"], "year": "1990", "abstractText": "The frequency of positive cord blood rapid plasma reagin (RPR) tests among newborns at an inner city hospital and associations with maternal cocaine use, prenatal care, and adequacy of syphilis therapy were retrospectively assessed. The incidence of positive cord blood RPRs increased from 1.1% of all live births in 1985 to 3.4% in 1988. In 1987, 98 babies were born with positive cord blood RPRs; 86 of their charts were available for review. Four infants had false positive RPRs, and one patient delivered twins, leaving 81 mothers who could be evaluated. Almost 37% of these patients had had no prenatal care. More than 55% had inadequate or not therapy for syphilis. Of these, only 17.4% had prenatal care. Slightly more than 40% of patients acknowledged using drugs during pregnancy, 87.9% of whom used cocaine. Among the patients who used drugs, 75.8% received no prenatal care, in contrast to 10.4% of mothers who did not use drugs (p less than 0.001). 
It appears that drug use, particularly use of cocaine, is associated with low levels of utilization of prenatal services and inadequate therapy for syphilis. This may lead to increased risk of congenital syphilis in newborns.", "pmid": "2234614", "title": "Syphilis among parturients at an inner city hospital: association with cocaine use and implications for congenital syphilis rates."}, {"journal": "International urology and nephrology", "meshMajor": ["Clinical Trials as Topic", "Drug Administration Schedule", "Drug Evaluation", "Drug Resistance", "Estramustine", "Estrogens", "Humans", "Male", "Nitrogen Mustard Compounds", "Prostatic Neoplasms"], "year": "1975", "abstractText": "The experience gained with Estracyt, kindly supplied by AB LEO, Sweden, is reported. On the basis of former data in the literature we only used the drug in estrogen resistant and advanced cases. Estracyt (estramustine phosphate) is a nitrogen mustard derivative of the urethan type, attached to oestradiol-17-phosphate. In histologically verified cases, it was administered in daily doses of 300 mg intravenously for three weeks, followed by maintenance doses of 300 mg a week in tablets for three months. During treatment, liver and bone marrow function was checked systematically. The changes in morphological picture were studied by means of biopsies during and at the end of treatment. In agreement with the data in the literature a favourable effect was observed in estrogen resistant patients, with no toxic effect whatever on the bone marrow. At the same time GOT and GPT and BSP retention examinations demonstrated a hepatotoxic side effect. The pathological values returned to normal after withdrawal of the drug. 
Histological examinations showed that the tumour cells had changed but failed to disappear after treatment.", "pmid": "1102475", "title": "Treatment of prostatic cancer with Estracyt (estramustine phosphate)."}, {"journal": "Evolution; international journal of organic evolution", "meshMajor": ["Analysis of Variance", "Climate", "Environment", "Evolution, Molecular", "Genome, Plant", "Geography", "Phylogeny", "Pinus", "Regression Analysis", "Reproduction", "Seeds"], "year": "2004", "abstractText": "Genome size has been suggested to be a fundamental biological attribute in determining life-history traits in many groups of organisms. We examined the relationships between pine genome sizes and pine phylogeny, environmental factors (latitude, elevation, annual rainfall), and biological traits (latitudinal and elevational ranges, seed mass, minimum generation time, interval between large seed crops, seed dispersal mode, relative growth rate, measures of potential and actual invasiveness, and level of rarity). Genome sizes were determined for 60 pine taxa and then combined with published values to make a dataset encompassing 85 species, or 70% of species in the genus. Supertrees were constructed using 20 published source phylogenies. Ancestral genome size was estimated as 32 pg. Genome size has apparently remained stable or increased over evolutionary time in subgenus Strobus, while it has decreased in most subsections in subgenus Pinus. We analyzed relationships between genome size and life-history variables using cross-species correlations and phylogenetically independent contrasts derived from supertree constructions. The generally assumed positive relation between genome size and minimum generation time could not be confirmed in phylogenetically controlled analyses. We found that the strongest correlation was between genome size and seed mass. 
Because the growth quantities specific leaf area and leaf area ratio (and to a lesser extent relative growth rate) are strongly negatively related to seed mass, they were also negatively correlated with genome size. Northern latitudinal limit was negatively correlated with genome size. Invasiveness, particularly of wind-dispersed species, was negatively associated with both genome size and seed mass. Seed mass and its relationships with seed number, dispersal mode, and growth rate contribute greatly to the differences in life-history strategies of pines. Many life-history patterns are therefore indirectly, but consistently, associated with genome size.", "pmid": "15446425", "title": "Evolution of genome size in pines (Pinus) and its life-history correlates: supertree analyses."}, {"journal": "Canadian journal of ophthalmology. Journal canadien d'ophtalmologie", "meshMajor": ["Acid Phosphatase", "Animals", "Autolysis", "Cornea", "Corneal Transplantation", "Hydrocortisone", "Hydrogen-Ion Concentration", "Lysosomes", "Membranes", "Rabbits", "Refrigeration", "Temperature", "Tissue Preservation", "Transplantation, Homologous"], "year": "1975", "abstractText": "Many eyes donated for use in corneal grafting are rejected because of signs of autolysis in the donor material. The purpose of this experimental study was to determine whether hydrocortisone acting as a lysosome membrane stabilizer could prevent or retard autolysis of the corneas under storage, and if so, what was the most efficacious concentration. Different groups of rabbit corneas were placed in saline as controls or in varying concentrations of hydrocortisone (10(-10) M to 10(-4) M at pH 7.4) at 37 degrees C and 4 degrees C. Acid phosphatase released after six hours was measured biochemically. This enzyme was used as a marker enzyme reflecting lysosomal labilization. Results showed a significant stabilization of the lysosomal membrane at 4 degrees C as compared to 37 degrees C. 
A trend towards stabilization of the lysosomal membrane was seen when 10(-8) M concentration of hydrocortisone at 37 degrees C was used, there being no demonstrable stabilization at 4 degrees C.", "pmid": "133", "title": "The prevention of autolysis of stored cornea using steroid as a lysosome membrane stabilizer."}, {"journal": "European review for medical and pharmacological sciences", "meshMajor": ["Betacoronavirus", "COVID-19", "Coronavirus Infections", "Humans", "Pandemics", "Pneumonia, Viral", "Prevalence", "Regression Analysis", "SARS-CoV-2", "Seroepidemiologic Studies", "Survival Rate"], "year": "2020", "abstractText": "OBJECTIVE: Our objective was to find an association between exposure of a population to Middle East Respiratory Syndrome Coronavirus (MERS-CoV) and mortality rate due to Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2) across different countries worldwide.MATERIALS AND METHODS: To find the relationship between exposure to MERS-CoV and mortality rate due to SARS-CoV-2, we collected and analyzed data of three possible factors that may have resulted in an exposure of a population to MERS-CoV: (1) the number of Middle East Respiratory Syndrome (MERS) cases reported among 16 countries since 2012; (2) data of MERS-CoV seroprevalence in camels across 23 countries, as working with camels increase risk of exposure to MERS-CoV; (3) data of travel history of people from 51 countries to Saudi Arabia was collected on the assumption that travel to a country where MERS is endemic, such as, Saudi Arabia, could also lead to exposure to MERS-CoV.RESULTS: We found a significantly lower number of Coronavirus disease 2019 (COVID-19) deaths per million (deaths/M) of a population in countries that are likely to be exposed to MERS-CoV than otherwise (t-stat=3.686, p<0.01). 
In addition, the number of COVID-19 deaths/M of a population was significantly lower in countries that reported a higher seroprevalence of MERS-CoV in camels than otherwise (t-stat=4.5077, p<0.01). Regression analysis showed that increased travelling history to Saudi Arabia is likely to be associated with a lower mortality rate due to COVID-19.CONCLUSIONS: This study provides empirical evidence that a population that was at an increased risk of exposure to MERS-CoV had a significantly lower mortality rate due to SARS-CoV-2, which might be due to cross-protective immunity against SARS-CoV-2 in that population because of an earlier exposure to MERS-CoV.", "pmid": "32965011", "title": "An association between exposure to Middle East Respiratory Syndrome (MERS) and mortality rate of Coronavirus Disease 2019 (COVID-19)."}, {"journal": "European heart journal", "meshMajor": ["Anticoagulants", "Chi-Square Distribution", "Coated Materials, Biocompatible", "Coronary Angiography", "Coronary Disease", "Equipment Design", "Female", "Heparin", "Humans", "Logistic Models", "Male", "Recurrence", "Reoperation", "Risk Factors", "Statistics, Nonparametric", "Stents", "Thrombosis", "Treatment Outcome"], "year": "2001", "abstractText": "AIMS: Heparin coating of stents is thought to reduce stent thrombosis and restenosis rates. However, clinical data comparing coated and uncoated stents of the same model are lacking. We compared the heparin coated (C) and the uncoated (U) version of the Jostent stent with regard to the clinical and angiographic outcome after 6 months.METHODS AND RESULTS: Provisional stenting was done in 277 patients and 306 lesions; only 40 were Benestent-II like lesions. Delivery success rate was 98.4%. Both groups (C/U: n=156/150 lesions) were comparable in clinical and procedural data. Post stenting, reference diameter (C/U: 2.68+/-0.56/2.66+/-0.53 mm) and minimal lumen diameter did not differ (C/U: 2.48+/-0.47/2.48+/-0.52 mm). 
During follow-up the rate of subacute stent thrombosis (C/U: 1.9%/1.3%) and myocardial infarction did not differ. Angiography at the 6-month follow-up (79.4%) revealed no difference in restenosis rate (C/U: 33.1%/30.3%). Risk factors for restenosis were a type B2/C lesion (P<0.02), a stented segment longer than 16 mm (P<0.006) and a stent inflation pressure <14 bar (P<0.0063).CONCLUSION: Corline heparin coating of the Jostent has no impact on the in-hospital complication rate, stent thrombosis or restenosis. The Jostent design gives a high procedural success rate and satisfying result at 6 months in an everyday patient population undergoing provisional stenting.", "pmid": "11549303", "title": "Comparison of the heparin coated vs the uncoated Jostent--no influence on restenosis or clinical outcome."}, {"journal": "Photosynthesis research", "meshMajor": ["Acclimatization", "Cell Membrane", "Electrophoresis, Polyacrylamide Gel", "Light", "Light-Harvesting Protein Complexes", "Models, Biological", "Operon", "Peptides", "Pigments, Biological", "Protein Structure, Quaternary", "Proteomics", "Rhodobacter sphaeroides", "Spectrophotometry, Infrared"], "year": "2011", "abstractText": "In order to obtain an improved understanding of the assembly of the bacterial photosynthetic apparatus, we have conducted a proteomic analysis of pigment-protein complexes isolated from the purple bacterium Rhodobacter sphaeroides undergoing acclimation to reduced incident light intensity. Photoheterotrophically growing cells were shifted from 1,100 to 100\u00a0W/m(2) and intracytoplasmic membrane (ICM) vesicles isolated over 24-h were subjected to clear native polyacrylamide gel electrophoresis. Bands containing the LH2 and reaction center (RC)-LH1 complexes were excised and subjected to in-gel trypsin digestion followed by liquid chromatography (LC)-mass spectroscopy (MS)/MS. 
The results revealed that the LH2 band contained distinct levels of the LH2-\u00e1 and -\u00e2 polypeptides encoded by the two puc operons. Polypeptide subunits encoded by the puc2AB operon predominated under high light and in the early stages of acclimation to low light, while after 24\u00a0h, the puc1BAC components were most abundant. Surprisingly, the Puc2A polypeptide containing a 251 residue C-terminal extension not present in Puc1A, was a protein of major abundance. A predominance of Puc2A components in the LH2 complex formed at high light intensity is followed by a >2.5-fold enrichment in Puc1B levels between 3 and 24\u00a0h of acclimation, accompanied by a nearly twofold decrease in Puc2A levels. This indicates that the puc1BAC operon is under more stringent light control, thought to reflect differences in the puc1 upstream regulatory region. In contrast, elevated levels of Puc2 polypeptides were seen 48\u00a0h after the gratuitous induction of ICM formation at low aeration in the dark, while after 24\u00a0h of acclimation to low light, an absence of alterations in Puc polypeptide distributions was observed in the upper LH2-enriched gel band, despite an approximate twofold increase in overall LH2 levels. 
This is consistent with the origin of this band from a pool of LH2 laid down early in development that is distinct from subsequently assembled LH2-only domains, forming the LH2 gel band.", "pmid": "21863386", "title": "Differential assembly of polypeptides of the light-harvesting 2 complex encoded by distinct operons during acclimation of Rhodobacter sphaeroides to low light intensity."}, {"journal": "Molekuliarnaia biologiia", "meshMajor": ["Aliivibrio fischeri", "Chaperonin 10", "Chaperonin 60", "Escherichia coli", "Escherichia coli Proteins", "Gene Expression Regulation, Bacterial", "Operon", "Plasmids", "Protease La", "Protein Folding", "Recombinant Fusion Proteins"], "year": null, "abstractText": "It was shown that the chaperonin GroEL/GroES and protease Lon influence the expression of the Vibrio fischeri lux regulon in Escherichia coli cells: E. coli groE mutants bearing hybrid plasmid with the lux regulon were weakly luminescent; cells of the E. coli lon- comprising the entire lux regulon display very intense bioluminescence, with no lag period in the induction curve characteristic of lon+ strains. The luxR gene was cloned from the Vibrio fischeri genome in the pGEX-KG vector. It was shown that the active fusion protein GST-LuxR by affinity chromatography on glutathione-sucrose colony is purified only with proteins GroEL and Lon. The present results showed that the LuxR, transcriptional activator of the V. fischeri lux operon, really complexes with GroEL chaperonin and Lon protease. We suppose, that the GroEL/GroES chaperonin systems is required for the folding of LuxR into an active protein, and the LuxR is the target for the ATP-dependent serine Lon protease of E. 
coli.", "pmid": "16637268", "title": "[Role of GroEL/GroES chaperonin system and Lon protease in regulation of expression Vibrio fischeri lux genes in Escherichia coli cells]."}, {"journal": "Journal of visualized experiments : JoVE", "meshMajor": ["Adenocarcinoma", "Animals", "Appendectomy", "Azoxymethane", "Carcinogens", "Cecum", "Chronic Disease", "Colitis", "Colon", "Colorectal Neoplasms", "Dextran Sulfate", "Disease Models, Animal", "Male", "Mice, Inbred C57BL"], "year": "2019", "abstractText": "The human appendix has been recently implicated to play important biological roles in the pathogenesis of various complex diseases, such as colorectal cancer, inflammatory bowel disease, and Parkinson's disease. To study the function of the appendix, a gut disease-associated murine appendectomy model has been established and its step-by-step protocol is described here. This report introduces a facile protocol for caecal patch removal in mice followed by the chemical induction of chronic colitis-associated colorectal cancer using a combination of dextran sulfate sodium (DSS) and azoxymethane (AOM). IgA specific cells and IgA concentration were significantly reduced upon removal of the caecal patch in male C57BL/6 mice\u00a0compared to those in the sham group. Simultaneously administering 2% DSS and AOM resulted in nearly 80% mice survival in both sham and appendectomy groups without significant body weight loss. Histological results confirmed colonic inflammation and different degrees of adenocarcinoma. 
This model can be used for the study of the functional role of the appendix in maintaining gut microbiota homeostasis and pathogenesis of gut colitis and malignancies, as well as for the potential development of drug targeting therapies.", "pmid": "31498319", "title": "Murine Appendectomy Model of Chronic Colitis Associated Colorectal Cancer by Precise Localization of Caecal Patch."}, {"journal": "Journal of plastic surgery and hand surgery", "meshMajor": ["Adult", "Amputation, Traumatic", "Attitude of Health Personnel", "Carpal Tunnel Syndrome", "Clinical Decision-Making", "Cross-Sectional Studies", "Female", "Hammer Toe Syndrome", "Hand Injuries", "Humans", "Male", "Middle Aged", "Perception", "Physical Therapists", "Practice Patterns, Physicians'", "Prognosis", "Surgeons", "Surveys and Questionnaires", "Tendon Injuries", "Treatment Outcome"], "year": "2018", "abstractText": "OBJECTIVE: The objectives of this survey were (1) to study if surgeons' perceptions of the benefit of six surgical procedures differ if they consider themselves as patients instead of treating a patient, (2) to evaluate the role of five predetermined factors that may influence decision-making, and (3) to assess how uniformly hand surgeons and hand therapists perceive the benefits of the surgical treatments.METHODS: The members of the national societies for Hand Surgery and Hand Therapy were asked to participate in the survey. Six patient cases with hand complaint (carpal tunnel syndrome, flexor tendon injury, dorsal wrist ganglion, thumb amputation, boxer's fracture, and mallet fracture) and a proposed operative procedure were presented, and the respondents rated the procedures in terms of the expected benefit. Half of the surgeons were advised to consider themselves as patients when filling out the survey.RESULTS: A survey was completed by 56 surgeons (61%) and 59 therapists (20%). 
Surgeons who considered themselves as patients had less confident perception on the benefit of carpal tunnel release compared with surgeons, who considered treating patients. Hand surgeons and hand therapists had similar perception of the benefits of surgery. The expected functional result was regarded as the most important factor in directing the decision about the treatment.CONCLUSIONS: Surgeons tended to be more unanimous in their opinions in cases, where there is limited evidence on treatment effect. The agreement between surgeons and therapists implies that the clinical perspectives are similar, and probably reflect the reality well.", "pmid": "28417701", "title": "Survey of hand surgeons' and therapists' perceptions of the benefit of common surgical procedures of the hand."}, {"journal": "Pathology international", "meshMajor": ["Adenocarcinoma, Mucinous", "Adult", "Cervix Uteri", "Cesarean Section", "Choristoma", "Diagnosis, Differential", "Female", "Humans", "Immunohistochemistry", "Phenotype", "Pregnancy", "Urinary Bladder Diseases", "Uterine Cervical Neoplasms"], "year": "2010", "abstractText": "Endocervicosis of the urinary bladder is a very rare tumor-like benign lesion. In the present report, a case in a 34-year-old woman, who has a prior Caesarean section at the age of 30 and 2-years history of dysuria, is described. Transvaginal ultrasound, cystoscopy and magnetic resonance imaging demonstrated a solid mass in the posterior wall of the bladder. The mass was removed and histology revealed a haphazard proliferation of endocervical-type mucinous glands scattered through the muscularis propria of bladder wall. Immunohistochemical phenotype of these glands was compared with three normal uterine endocervices and two cases of well-differentiated mucinous adenocarcinoma of the uterine cervix. 
Endocervicosis glands displayed positive reaction for antibodies against estrogen receptor, progesterone receptor, CAM 5.2, cytokeratin 7, CA125, HBME-1 and carcinoembryonic antigen, which showed positivity in normal endocervices. On the other hand, only glands of well-differentiated mucinous adenocarcinoma expressed human gastric mucin and showed high proliferative index of Ki-67. These results supported the hypothesis of its M?llerian origin. Furthermore, diffuse distribution of estrogen and progesterone receptors, lack of human gastric mucin and low proliferative activity were distinct features for endocervicosis compared to well-differentiated mucinous adenocarcinoma.", "pmid": "20594276", "title": "Immunohistochemical phenotype of the urinary bladder endocervicosis: comparison with normal endocervix and well-differentiated mucinous adenocarcinoma of uterine cervix."}, {"journal": "American journal of diseases of children (1960)", "meshMajor": ["Anemia", "Child, Preschool", "Diseases in Twins", "Erythroblasts", "Erythrocyte Count", "Erythrocytes", "Female", "Humans", "Infant", "Male", "Reticulocytes"], "year": "1981", "abstractText": "Seventeen patients aged 7 to 33 months, including a pair of identical twin girls, came to the Children's Memorial Hospital, Chicago, between January 1975 and December 1979 with transient normocytic anemia and reticulocytopenia. In 16 of the patients, bone marrow aspirates were obtained; 15 showed erythroblastopenia and one showed erythroid hyperplasia indicative of recovery. Except for a cluster of six cases occurring from July to October 1979, no seasonal variation was observed. Unlike patients with congenital hypoplastic anemia, all 17 patients were of normal stature. Other distinguishing features of transient erythroblastopenia of childhood included onset after early infancy, normocytosis, and rapid, spontaneous recovery.", "pmid": "7293995", "title": "Transient erythroblastopenia of childhood. 
Review of 17 cases, including a pair of identical twins."}, {"journal": "Brain research bulletin", "meshMajor": ["Animals", "In Vitro Techniques", "Medulla Oblongata", "Membrane Potentials", "Motor Neurons", "Neural Inhibition", "Rats", "Somatostatin", "Vagus Nerve"], "year": "1986", "abstractText": "Somatostatin-14 (SOM) effects on electrical properties of membrane in rat brainstem slice preparations were studied in vitro by intracellular recording. Vagal motoneurons in the nucleus dorsalis motoris nervi vagi (DMV) were hyperpolarized by SOM. SOM increased both negativity of membrane potential and input membrane conductance, and decreased synaptic noise. The effects persisted during synaptic blockade by tetrodotoxin (TTX) or [Ca]o-free-high-[Mg]o perfusion. The reversal potential of the hyperpolarization induced by SOM depended on [K]o concentration. Hill's coefficient calculated from the dose-response curve was 2. The results suggest that SOM may inhibit visceral organ functions through the DMV.", "pmid": "2876758", "title": "Effect of somatostatin on the vagal motor neuron in the rat."}, {"journal": "Journal of hand surgery (Edinburgh, Scotland)", "meshMajor": ["Adult", "Aged", "Amyloidosis", "Carpal Tunnel Syndrome", "Humans", "Middle Aged", "Renal Dialysis", "Synovial Membrane"], "year": "1988", "abstractText": "Over a five-year period (1981-1985), nine patients on haemodialysis developed carpal tunnel syndrome. Five patients, following biopsy of synovium in the carpal tunnel or biopsy of thickened epineurium of the median nerve, were found to have amyloid deposits in the soft tissues. The relationship between this condition, dialysis arthropathy and long-term haemodialysis is reviewed. 
In addition, in this small group of patients no relationship to the side of the fistula has been demonstrated and two patients developed recurrent problems despite initial open decompression of the carpal tunnel.", "pmid": "3249138", "title": "Amyloidosis as a cause of carpal tunnel syndrome in haemodialysis patients."}, {"journal": "Letters in applied microbiology", "meshMajor": ["Animals", "Anti-Infective Agents", "Bacillus", "Biological Products", "Nematoda", "Resveratrol", "Stilbenes"], "year": "2012", "abstractText": "AIMS: \u2002 The aim of the present study was to purify and characterize a natural antimicrobial compound from Bacillus sp. strain N associated with a novel rhabditid entomopathogenic nematode.METHODS AND RESULTS: \u2002 The cell-free culture filtrate of a bacterium associated with a novel entomopathogenic nematode (EPN), Rhabditis (Oscheius) sp. exhibited strong antimicrobial activity. The ethyl acetate extract of the bacterial culture filtrate was purified by column chromatography, and two bioactive compounds were isolated and their chemical structures were established based on spectral analysis. The compounds were identified as 3,4',5-trihydroxystilbene (1) and 3,5-dihydroxy-4-isopropylstilbene (2). The presence of 3,4',5-trihydroxystilbene (resveratrol) is reported for the first time in bacteria. Compound 1 showed antibacterial activity against all the four test bacteria, whereas compound 2 was effective against the Gram-positive bacteria only. Compounds 1 and 2 were active against all the five fungi tested and are more effective than bavistin, the standard fungicide. The antifungal activity of the compounds against the plant pathogenic fungi, Rhizoctonia solani is reported for the first time.CONCLUSIONS: \u2002 Cell-free extract of the bacterium and isolated stilbenes demonstrated high antibacterial activity against bacteria and fungi especially against plant pathogenic fungi. 
We conclude that the bacterium-associated EPN are promising sources of natural bioactive secondary metabolites.SIGNIFICANCE AND IMPACT OF THE STUDY: \u2002 Stilbene compounds can be used for the control of fungi and bacteria.", "pmid": "22332977", "title": "Bioactive stilbenes from a Bacillus sp. N strain associated with a novel rhabditid entomopathogenic nematode."}, {"journal": "Methods in molecular biology (Clifton, N.J.)", "meshMajor": ["Electron Probe Microanalysis", "Histocytological Preparation Techniques", "Microscopy", "Microscopy, Electron, Scanning", "Minerals", "Plant Cells"], "year": "2014", "abstractText": "This chapter describes protocols using formalin-acetic acid-alcohol (FAA) to fix plant tissues for studying biomineralization by means of scanning electron microscopy (SEM) and qualitative energy-dispersive X-ray microanalysis (EDX). Specimen preparation protocols for SEM and EDX mainly include fixation, dehydration, critical point drying (CPD), mounting, and coating. Gold-coated specimens are used for SEM imaging, while gold- and carbon-coated specimens are prepared for qualitative X-ray microanalyses separately to obtain complementary information on the elemental compositions of biominerals. During the specimen preparation procedure for SEM, some biominerals may be dislodged or scattered, making it difficult to determine their accurate locations, and light microscopy is used to complement SEM studies. Specimen preparation protocols for light microscopy generally include fixation, dehydration, infiltration and embedding with resin, microtome sectioning, and staining. In addition, microwave processing methods are adopted here to speed up the specimen preparation process for both SEM and light microscopy. 
", "pmid": "24357384", "title": "Application of SEM and EDX in studying biomineralization in plant tissues."}, {"journal": "Autophagy", "meshMajor": ["Animals", "Autophagy", "Autophagy-Related Proteins", "Cell Line", "Central Nervous System", "Disease Models, Animal", "Encephalomyelitis, Autoimmune, Experimental", "Female", "Inflammation", "Macrophages", "Mice", "Mice, Inbred C57BL", "MicroRNAs", "Microglia", "Microscopy, Electron, Transmission", "PPAR gamma", "Proto-Oncogene Proteins c-bcl-2", "Up-Regulation"], "year": "2019", "abstractText": "Microglia are innate immune cells in the central nervous system (CNS), that supplies neurons with key factors for executing autophagosomal/lysosomal functions. Macroautophagy/autophagy is a cellular catabolic process that maintains cell balance in response to stress-related stimulation. Abnormal autophagy occurs with many pathologies, such as cancer, and autoimmune and neurodegenerative diseases. Hence, clarification of the mechanisms of autophagy regulation is of utmost importance. Recently, researchers presented microRNAs (miRNAs) as novel and potent modulators of autophagic activity. Here, we found that Mir223 deficiency significantly ameliorated CNS inflammation, demyelination and the clinical symptoms of experimental autoimmune encephalomyelitis (EAE) and increased resting microglia and autophagy in brain microglial cells. In contrast, the autophagy inhibitor 3-methylademine (3-MA) aggravated the clinical symptoms of EAE in wild-type (WT) and Mir223-deficienct mice. Furthermore, it was confirmed that Mir223 deficiency in mice increased the protein expression of ATG16L1 (autophagy related 16-like 1 [S. cerevisiae]) and LC3-II in bone marrow-derived macrophage cells compared with cells from WT mice. Indeed, the cellular level of Atg16l1 was decreased in BV2 cells upon Mir223 overexpression and increased following the introduction of antagomirs. 
We also showed that the 3' UTR of Atg16l1 contained functional Mir223-responsive sequences and that overexpression of ATG16L1 returned autophagy to normal levels even in the presence of Mir223 mimics. Collectively, these data indicate that Mir223 is a novel and important regulator of autophagy and that Atg16l1 is a Mir223 target in this process, which may have implications for improving our understanding of the neuroinflammatory process of EAE. Abbreviations: 3-MA: 3-methylademine; ACTB/\u00e2-actin: actin, beta; ATG: autophagy related; ATG16L1: autophagy related 16-like 1 (S. cerevisiae); BECN1: beclin 1, autophagy related; CNR2: cannabinoid receptor 2 (macrophage); CNS: central nervous system; CQ: chloroquine; EAE: experimental autoimmune encephalomyelitis; FOXO3: forkhead box O3; GAPDH: glyceraldehyde-3-phosphate dehydrogenase; H&E: hematoxylin and eosin; ITGAM: integrin alpha M; LPS: lipoplysaccharide; MAP1LC3/LC3: microtubule-associated protein 1 light chain 3; miRNAs: microRNAs; MS: multiple sclerosis; PPARG: peroxisome proliferator activated receptor gamma; PTPRC: protein tyrosine phosphatase, receptor type, C; RA: rheumatoid arthritis; SQSTM1: sequestosome 1; TB: tuberculosis; TIMM23: translocase of inner mitochondrial membrane 23; TLR: toll-like receptor.", "pmid": "30208760", "title": "Mir223 restrains autophagy and promotes CNS inflammation by targeting ATG16L1."}, {"journal": "The Journal of hand surgery", "meshMajor": ["Computer Communication Networks", "Computers", "Education, Medical, Continuing", "Education, Medical, Graduate", "Faculty, Medical", "Hand", "Humans", "Internship and Residency", "Patient Care Team", "Periodicals as Topic", "Software", "Videoconferencing"], "year": "2014", "abstractText": "With our hand team scattered across several different locations, it is difficult to find a time to get together for our weekly didactic hand conference and monthly hand journal club. 
In addition, traffic and tight clinical schedules sometimes force conferences to start late or be canceled. Our solution was to set up an on-line conference. Using TeamViewer to host our conference and Skype to host our journal clubs, we experienced increased attendance by both faculty and residents in our\u00a0meetings. In this article, we establish a method of hosting effective on-line videoconferences to facilitate nearly universal participation of our hand team, and we hope to assist others who wish\u00a0to establish similar setups in their communities. ", "pmid": "24315487", "title": "How to establish an interactive eConference and eJournal Club."}, {"journal": "Acta tropica", "meshMajor": ["Animals", "Cote d'Ivoire", "Electrophoresis", "Humans", "Isoenzymes", "Male", "Swine", "Trypanosoma", "Trypanosomiasis, African"], "year": "1981", "abstractText": "'Mini-pigs' were infected with salivarian Trypanozoon clones to examine the persistence and stability of the human serum resistance [Blood Incubation Infectivity Test (BIIT)] and isoenzyme characteristics during infection in a new host. A stock regarded as Trypanosoma brucei, derived from a domestic pig in the Ivory Coast, retained its BIIT negative (serum sensitive), alanine aminotransferase (ALAT) and peptidase 2 (PEP 2) characteristics throughout 343 days of infection in pigs. Similarly there was no change in the BIIT positive (serum resistant) and different ALAT and PEP characteristics of a human isolate from the same area, and regarded as T. b. gambiense, during 154 days before the infection became undetectable. In mixed infections of the two clones in pigs, trypanosomes which were not treated with human serum and inoculated into Mastomys natalensis invariably displayed the 'T. b. brucei' characteristics. However, simultaneous inoculations of trypanosomes treated with human serum into M. natalensis always displayed the characteristics of the T. b. gambiense. Thus, in mixed infections, in which 'T. b. 
brucei' predominated, the minority 'T. b. gambiense' population was recoverable after treatment with human serum by subinoculation into Mastomys.", "pmid": "6123244", "title": "On the persistence of human serum resistance and isoenzyme patterns of Trypanozoon in experimentally infected pigs."}, {"journal": "The Journal of biological chemistry", "meshMajor": ["Adaptor Proteins, Signal Transducing", "Ataxia Telangiectasia Mutated Proteins", "BRCA1 Protein", "Cell Cycle", "Cell Cycle Proteins", "Checkpoint Kinase 1", "DNA Damage", "DNA, Complementary", "DNA-Binding Proteins", "Dose-Response Relationship, Drug", "Dose-Response Relationship, Radiation", "Down-Regulation", "Glutathione Transferase", "HeLa Cells", "Humans", "K562 Cells", "Microscopy, Fluorescence", "Nuclear Proteins", "Phosphorylation", "Plasmids", "Precipitin Tests", "Protein Binding", "Protein Kinases", "Protein Structure, Tertiary", "Protein-Serine-Threonine Kinases", "RNA, Small Interfering", "S Phase", "Time Factors", "Trans-Activators", "Transfection", "Tumor Suppressor Proteins"], "year": "2003", "abstractText": "BRCA1 is a tumor suppressor involved in DNA repair and damage-induced checkpoint controls. In response to DNA damage, BRCA1 relocalizes to nuclear foci at the sites of DNA lesions. However, little is known about the regulation of BRCA1 relocalization following DNA damage. Here we show that mediator of DNA damage checkpoint protein 1 (MDC1), previously named NFBD1 or Kiaa0170, is a proximate mediator of DNA damage responses that regulates BRCA1 function. MDC1 regulates ataxia-telangiectasia-mutated (ATM)-dependent phosphorylation events at the site of DNA damage. Importantly down-regulation of MDC1 abolishes the relocalization and hyperphosphorylation of BRCA1 following DNA damage, which coincides with defective G(2)/M checkpoint control in response to DNA damage. 
Taken together these data suggest that MDC1 regulates BRCA1 function in DNA damage checkpoint control.", "pmid": "12611903", "title": "Mediator of DNA damage checkpoint protein 1 regulates BRCA1 localization and phosphorylation in DNA damage checkpoint control."}, {"journal": "Molecular autism", "meshMajor": ["Attention", "Autistic Disorder", "Case-Control Studies", "Child", "Child, Preschool", "Female", "Head Movements", "Humans", "Male", "Neurologic Examination", "Social Behavior"], "year": "2018", "abstractText": "Background: Deficits in motor movement in children with autism spectrum disorder (ASD) have typically been characterized qualitatively by human observers. Although clinicians have noted the importance of atypical head positioning (e.g. social peering and repetitive head banging) when diagnosing children with ASD, a quantitative understanding of head movement in ASD is lacking. Here, we conduct a quantitative comparison of head movement dynamics in children with and without ASD using automated, person-independent computer-vision\u00a0based head tracking (Zface). Because children with ASD often exhibit preferential attention to nonsocial versus social stimuli, we investigated whether children with and without ASD differed in their head movement dynamics depending on stimulus sociality.Methods: The current study examined differences in head movement dynamics in children with (n\u2009=\u200921) and without ASD (n\u2009=\u200921). Children were video-recorded while watching a 16-min video of social and nonsocial stimuli. Three dimensions of rigid head movement-pitch (head nods), yaw (head turns), and roll (lateral head inclinations)-were tracked using Zface. 
The root mean square of pitch, yaw, and roll was calculated to index the magnitude of head angular displacement (quantity of head movement) and angular velocity (speed).Results: Compared with children without ASD, children with ASD exhibited greater yaw displacement, indicating greater head turning, and greater velocity of yaw and roll, indicating faster head turning and inclination. Follow-up analyses indicated that differences in head movement dynamics were specific to the social rather than the nonsocial stimulus condition.Conclusions: Head movement dynamics (displacement and velocity) were greater in children with ASD than in\u00a0children without ASD, providing a quantitative foundation for previous clinical reports. Head movement differences were evident in lateral (yaw and roll) but not vertical (pitch) movement and were specific to a social rather than nonsocial condition. When presented with social stimuli, children with ASD had higher levels of head movement and moved their heads more quickly than children without ASD. Children with ASD may use head movement to modulate their perception of social scenes.", "pmid": "29492241", "title": "Objective measurement of head movement differences in children with and without autism spectrum disorder."}, {"journal": "The International journal on drug policy", "meshMajor": ["Adolescent", "Armed Conflicts", "Epidemics", "Female", "Food", "HIV Infections", "Humans", "Illicit Drugs", "Interpersonal Relations", "Libya", "Male", "Parent-Child Relations", "Prisons", "Religion", "Risk Factors", "Schools", "Self Concept", "Substance Abuse Treatment Centers", "Substance Abuse, Intravenous", "Substance-Related Disorders", "Surveys and Questionnaires"], "year": "2018", "abstractText": "BACKGROUND: Libya is facing a rapidly growing epidemic of illicit drug use and HIV. This situation is fueled by a complex array of factors, mainly the consequences of the political and military turmoil of the Arab Spring. 
Although it is extensively documented in other settings that young people are one of the most vulnerable groups to both HIV and illicit drug use, no study has explored this issue among young people in Libya. The current study addresses this research gap.METHODS: This study is a qualitative study using in-depth interviews guided by a semi-structured questionnaire. We used a maximum variation, purposive sampling strategy to recruit male and female participants, aged 14-18 years, from schools, prisons, and community-based informal re-education and rehabilitation centers in Tripoli, Libya.RESULTS: In total, 31 participants were recruited: 6 females and 25 males. Sixteen participants were prisoners and residents of community-based informal re-education and rehabilitation centers, and 15 were recruited in schools. Risk factors for drug use included peer influence, the increased availability and affordability of drugs, disruption of social life and healthy recreational activities, and the distress and casualties of the war. Protective factors were religious beliefs and practices, good parent-child connectedness, and high self-esteem and future aspiration. Risk factors for HIV were insufficient knowledge related to HIV transmission and unsafe injection practices, such as sharing needles and syringes.CONCLUSION: We found individual, interpersonal, family, and structural-level factors that interplayed to shape the vulnerability of young people to drug use and HIV infection in Tripoli, Libya. Structural factors, including the increased availability and affordability of drugs, provided the frame within which other factors, such as peer influence, insufficient knowledge of substance use, and HIV, operated to increase the vulnerability of young people to drugs and HIV, while religious beliefs and parent-child connectedness acted as protective factors. 
Multisectoral efforts and studies to quantitatively evaluate the magnitude and distribution of these problems are urgently needed.", "pmid": "29272852", "title": "\"Now drugs in Libya are much cheaper than food\": A qualitative study on substance use among young Libyans in post-revolution Tripoli, Libya."}, {"journal": "Journal of the American Academy of Child and Adolescent Psychiatry", "meshMajor": ["Adolescent", "Adolescent Psychiatry", "Age Factors", "Female", "Health Status", "Humans", "Longitudinal Studies", "Male", "Mental Disorders", "Mental Health Services", "Patient Acceptance of Health Care", "Psychiatric Status Rating Scales", "Risk Factors", "Socialization", "Stress, Psychological"], "year": "1993", "abstractText": "OBJECTIVE: To determine the strength of association between mental health disorders in adolescence and disorder in early adulthood.METHOD: The study used mental health data from a longitudinal investigation of a New Zealand birth cohort. Of the 943 with prevalence data for DSM-III disorder at age 15, 890 had prevalence data for DSM-III-R disorder when aged 18 years.RESULTS: Two-thirds of those with disorder at age 15 had disorder at age 18. The residual form of attention deficit disorder, simple phobias, and oppositional disorders (with no other accompanying disorders) were associated with the lowest risk of later disorder and conduct disorder with the highest. With the exception of the overall symptom level, a variety of characteristics examined (e.g., social competence and adversity) could not differentiate between those with transient disorder and those with disorder at both ages. Comparisons of those with recurring disorder and those with new disorder at age 18 showed that in addition to characteristics of the disorder, disadvantage was strongly associated with recurrent disorder.CONCLUSIONS: The risk of later disorder for those with disorder in adolescence was high and differed across type of disorder. 
Findings suggest that to reduce the risk of disorder in early adulthood, clinicians could play a more active role in community interventions with direct social outcomes.", "pmid": "8282655", "title": "Mental health disorders from age 15 to age 18 years."}, {"journal": "Journal of the American Chemical Society", "meshMajor": ["Amino Acid Sequence", "Animals", "Cell Line", "Crystallography, X-Ray", "Glycoproteins", "Glycosylation", "Macrophage-Activating Factors", "Macrophages", "Mice", "Models, Molecular", "Molecular Sequence Data", "Phagocytosis", "Vitamin D-Binding Protein"], "year": "2006", "abstractText": "Rational protein design has been successfully used to create mimics of natural proteins that retain native activity. In the present work, de novo protein engineering is explored to develop a mini-protein analogue of Gc-MAF, a glycoprotein involved in the immune system activation that has shown anticancer activity in mice. Gc-MAF is derived in vivo from vitamin D binding protein (VDBP) via enzymatic processing of its glycosaccharide to leave a single GalNAc residue located on an exposed loop. We used molecular modeling tools in conjunction with structural analysis to splice the glycosylated loop onto a stable three-helix bundle (alpha3W, PDB entry 1LQ7). The resulting 69-residue model peptide, MM1, has been successfully synthesized by solid-phase synthesis both in the aglycosylated and the glycosylated (GalNAc-MM1) form. Circular dichroism spectroscopy confirmed the expected alpha-helical secondary structure. The thermodynamic stability as evaluated from chemical and thermal denaturation is comparable with that of the scaffold protein, alpha3W, indicating that the insertion of the exogenous loop of Gc-MAF did not significantly perturb the overall structure. GalNAc-MM1 retains the macrophage stimulation activity of natural Gc-MAF; in vitro tests show an identical enhancement of Fc-receptor-mediated phagocytosis in primary macrophages. 
GalNAc-MM1 provides a framework for the development of mutants with increased activity that could be used in place of Gc-MAF as an immunomodulatory agent in therapy.", "pmid": "16734450", "title": "A designed glycoprotein analogue of Gc-MAF exhibits native-like phagocytic activity."}, {"journal": "Chronobiology international", "meshMajor": ["Animals", "Astronomical Phenomena", "Astronomy", "Female", "Humans", "Periodicity", "Placenta", "Pregnancy", "Pregnancy, Animal", "Reproduction", "Species Specificity"], "year": "1988", "abstractText": "Weekly, twice-monthly, and monthly lunar related rhythms have been alleged for various animal reproductive processes. Herein gestation times of 213 types of terrestrial placental mammals were analyzed for best-fit integer multiples approximating length of any of the above lunar related rhythms. At the same time numeric controls were constituted of a completely random, a block randomized, and a sequential set of numbers spanning the data set. Among test integers 6 through 33, the number 30, approximating the 29.53-day lunar-synodic month, was consistently and statistically a best-fit multiple to the data. This might suggest a once-monthly lunar illumination, but not a twice-monthly gravitational or near-weekly tidal, influence upon animal reproduction. As for a receptor mechanism, the tapetum, or reflective layer of the retina, present in most land mammals, but absent in humans, enhances dim illumination. 
A suggestion is that because of this visual enhancer, cycling moonlight might be a circa-lunar physiologic timer for many terrestrial mammals.", "pmid": "3219755", "title": "Common 30-day multiple in gestation time of terrestrial placentals."}, {"journal": "European journal of surgical oncology : the journal of the European Society of Surgical Oncology and the British Association of Surgical Oncology", "meshMajor": ["Adenocarcinoma", "Adult", "Aged", "Aged, 80 and over", "Disease-Free Survival", "Humans", "Lymphocyte Count", "Male", "Middle Aged", "Neoplasm Staging", "Neutrophils", "Pancreatic Neoplasms", "Predictive Value of Tests", "Preoperative Period", "ROC Curve", "Risk Factors", "Survival Rate", "Tumor Burden"], "year": "2018", "abstractText": "BACKGROUND: The neutrophil-to-lymphocyte ratio (NLR), which reflects the cancer-induced systemic inflammation response, has been proposed as a risk factor for poor long-term prognosis in cancer. We\u00a0investigated the prognostic role of the NLR and the relationship between the NLR and TNM stage in pancreatic ductal adenocarcinoma (PDAC) patients following curative resection.METHODS: One-hundred thirty-eight consecutive patients with resected PDAC were enrolled between 2004 and 2014. Univariate and multivariate analyses identified variables associated with overall survival (OS) and recurrence-free survival (RFS). Patients were stratified according to the NLR, with an NLR cut-off value of 2.2 being estimated by receiver operating characteristic curve.RESULTS: Compared to patients with a low NLR (?2.2), those with a high preoperative NLR (>2.2) had worse OS and RFS (P\u00a0=\u00a00.017, P\u00a0=\u00a00.029, respectively). 
For early-stage tumors, tumor size ?20\u00a0mm and a high NLR were independent risk factors for poor OS (hazard ratio (HR): 3.255, 95% confidence interval (CI): 1.082-9.789, P\u00a0=\u00a00.036; HR: 3.690, 95% CI: 1.026-13.272, P\u00a0=\u00a00.046, respectively) and RFS (HR:\u00a03.575, 95% CI: 1.174-10.892, P\u00a0=\u00a00.025; HR: 5.380, 95% CI: 1.587-18.234, P\u00a0=\u00a00.007, respectively). The NLR was not correlated with prognosis in patients with advanced stages.CONCLUSIONS: An elevated preoperative NLR was an important prognosticator for early TNM stage PDAC. The NLR, which is calculated using inexpensive and readily available biomarkers, could be a novel tool for predicting long-term survival in patients, especially those with early stage PDAC.", "pmid": "29807728", "title": "Preoperative neutrophil-to-lymphocyte ratio as a prognosticator in early stage pancreatic ductal adenocarcinoma."}, {"journal": "Acta obstetricia et gynecologica Scandinavica", "meshMajor": ["Abortion, Legal", "Adult", "Cervix Uteri", "Chorionic Gonadotropin", "Dilatation", "Female", "Humans", "Laminaria", "Placenta", "Pregnancy"], "year": "1992", "abstractText": "Serum concentrations of hCG were determined in blood samples taken 18-20 h and immediately before vacuum aspiration in 45 women in gestational weeks 7-9, admitted for legal abortion. In 35 of the women, a laminaria tent was inserted for cervical dilatation immediately after the first blood sampling. Serum hCG values decreased significantly in the women pretreated with laminaria tent, but were unchanged in the untreated women. 
This finding may indicate that pretreatment with a laminaria tent induces a partial placental detachment.", "pmid": "1315097", "title": "Effect of predilatation of the uterine cervix by laminaria tent on activity of the placenta."}, {"journal": "Brazilian dental journal", "meshMajor": ["Adolescent", "Alveolar Process", "Cephalometry", "Child", "Chin", "Facial Bones", "Humans", "Incisor", "Malocclusion, Angle Class I", "Mandible", "Maxilla", "Molar", "Nasal Bone", "Palate", "Puberty", "Tooth", "Vertical Dimension"], "year": "2004", "abstractText": "The dental and skeletal dimensions of individuals with Class I skeletal pattern in puberty were compared. Eighty patients with Class I malocclusion were selected, independent of the vertical relations (overbite) of the incisors. The sample was divided into 3 groups: normal, short and excessive lower anterior face height, based on facial proportions. The dental and skeletal measurements of the 3 groups were compared among themselves. In the angular measurements, the results showed no correlation in the mandibular plane angle. In the linear measurements, the mandibular length was significantly greater in the group of patients with short lower anterior face height, with a positive correlation among the three groups. 
The dentoalveolar heights of the incisors had a positive correlation among the three groups in relation to the lower anterior face height, showing that they are responsible for its variation.", "pmid": "15322649", "title": "Dental-skeletal dimensions in growing individuals with variations in the lower facial height."}, {"journal": "Journal of nuclear medicine : official publication, Society of Nuclear Medicine", "meshMajor": ["Cardiomyopathy, Dilated", "Cardiomyopathy, Hypertrophic", "Electrocardiography", "Female", "Heart", "Humans", "Image Processing, Computer-Assisted", "Male", "Middle Aged", "Models, Structural", "Myocardial Contraction", "Myocardial Infarction", "Thallium Radioisotopes", "Tomography, Emission-Computed, Single-Photon", "Ventricular Function, Left"], "year": "1991", "abstractText": "We measured left ventricular (LV) systolic thickening expressed as a systolic thickening ratio in 28 patients, using 201Tl ECG-gated SPECT. Five normals, 15 patients with prior myocardial infarction, 5 with hypertrophic cardiomyopathy, and 3 with dilated cardiomyopathy were studied. The systolic thickening ratio was calculated as [(end-systolic--end-diastolic pixel counts) divided by end-diastolic pixel counts], using the circumferential profile technique of both end-diastolic and end-systolic short axial images. Functional images of the systolic thickening ratio were also displayed with the \"bull's-eye\" method. The mean systolic thickening ratio thus calculated were as follows: normals, 0.53 +/- 0.05 (mean +/- 1 s.d.); non-transmural prior myocardial infarction, 0.33 +/- 0.09; transmural prior myocardial infarction, 0.14 +/- 0.05; hypertrophic cardiomyopathy in relatively nonhypertrophied areas, 0.56 +/- 0.11; hypertrophic cardiomyopathy in hypertrophied areas, 0.23 +/- 0.07; and dilated cardiomyopathy, 0.19 +/- 0.02. 
The systolic thickening ratio analysis by gated thallium SPECT offers a unique approach for assessing LV function.", "pmid": "1869967", "title": "Assessment of systolic thickening with thallium-201 ECG-gated single-photon emission computed tomography: a parameter for local left ventricular function."}, {"journal": "Annals of plastic surgery", "meshMajor": ["Adolescent", "Adult", "Asian Continental Ancestry Group", "Cartilage", "Cohort Studies", "Esthetics", "Female", "Follow-Up Studies", "Humans", "Male", "Middle Aged", "Nasal Septum", "Nose Deformities, Acquired", "Patient Satisfaction", "Republic of Korea", "Retrospective Studies", "Rhinoplasty", "Risk Assessment", "Treatment Outcome", "Young Adult"], "year": "2017", "abstractText": "BACKGROUND: A deviated nose is a common deformity encountered in rhinoplasty. Over the past several decades, a variety of rhinoplasty techniques have been described focusing on the classification of bony and cartilaginous deviation. Nevertheless, corrective rhinoplasty is still a challenging procedure even for experienced surgeons because of the high recurrence rate of deviation. In attempt to reduce the recurrence rate, the author systematized the complex procedures by using a single technique regardless of the classification of a deviation.MATERIALS AND METHODS: Forty patients who underwent corrective rhinoplasty between June 2009 and December 2014 were reviewed retrospectively. All the patients were operated using 4 main surgical procedures: anterior approach septal correction, unilateral osteotomy, and medialization of the deviated complex to the contralateral intact side, and dorsal augmentation with a dermofat graft. Assessment of improvement was based on photo standardization. 
The degree of nasal deviation, nasofrontal angle, tip projection-to-nasal length ratio, vertical line of the upper lip-to-tip projection ratio, and columellar-labial angle were measured.RESULTS: Preoperative and postoperative anthropometric measurements revealed that the mean degree of deviation changed from 10.19\u00b0 to 3.43\u00b0 (P < 0.01), and the degree of nasofrontal angle changed from 131.55\u00b0 to 133.14\u00b0 (P < 0.01). All patients responded to both the preoperative and postoperative questionnaires. The questionnaires revealed a significant functional and cosmetic improvement from 36.84\u00b0 to 76.95\u00b0 and 39.45\u00b0 to 79.41\u00b0, respectively (P < 0.0001).CONCLUSIONS: This systematized strategy to correct the Asian deviated nose provided reproducible and consistent results It also resulted in low recurrence rates and high postoperative satisfaction among patients.", "pmid": "27922895", "title": "A Systematized Strategy in Corrective Rhinoplasty for the Asian Deviated Nose."}, {"journal": "PloS one", "meshMajor": ["Cancer Vaccines", "Computer Simulation", "Epitopes", "Gene Expression Regulation, Neoplastic", "Genome, Human", "Genomics", "Humans", "Immunotherapy", "Internet", "Mutation", "Neoplasms", "Peptides", "Precision Medicine"], "year": "2016", "abstractText": "Due to advancement in sequencing technology, genomes of thousands of cancer tissues or cell-lines have been sequenced. Identification of cancer-specific epitopes or neoepitopes from cancer genomes is one of the major challenges in the field of immunotherapy or vaccine development. This paper describes a platform Cancertope, developed for designing genome-based immunotherapy or vaccine against a cancer cell. Broadly, the integrated resources on this platform are apportioned into three precise sections. First section explains a cancer-specific database of neoepitopes generated from genome of 905 cancer cell lines. 
This database harbors wide range of epitopes (e.g., B-cell, CD8+ T-cell, HLA class I, HLA class II) against 60 cancer-specific vaccine antigens. Second section describes a partially personalized module developed for predicting potential neoepitopes against a user-specific cancer genome. Finally, we describe a fully personalized module developed for identification of neoepitopes from genomes of cancerous and healthy cells of a cancer-patient. In order to assist the scientific community, wide range of tools are incorporated in this platform that includes screening of epitopes against human reference proteome (http://www.imtech.res.in/raghava/cancertope/).", "pmid": "27832200", "title": "A Platform for Designing Genome-Based Personalized Immunotherapy or Vaccine against Cancer."}, {"journal": "Journal of applied physiology (Bethesda, Md. : 1985)", "meshMajor": ["Animals", "Isometric Contraction", "Male", "Muscle Fibers, Skeletal", "Muscle, Skeletal", "Rats", "Rats, Inbred F344", "Reflex", "Regeneration", "Transplantation, Autologous"], "year": "1998", "abstractText": "In rats, combinations of plantar flexor muscles representing approximately 20, 40, 60, and 80% of the mass of the total plantar flexor group were transferred orthotopically in the absence of synergistic muscles and allowed to recover for 120 days. We hypothesized that, compared with their individual control values for structural and functional variables, the transfers would display a hierarchical array of deficits, proportional to their initial mass and, consequently, inversely proportional to the relative load on the transfers. Surprisingly, compared with their individual control values, each muscle transfer displayed deficits of 30-40% in muscle mass, total fiber cross-sectional area, and maximum isometric force, with the exception of the smallest transfer, the plantaris (PLN) muscle, which recovered 100% of its control value for each of these variables. 
Therefore, except for the PLN transfer, the muscle transfers studied displayed deficits similar in magnitude to those reported for muscles transferred in the presence of synergistic muscles. The greater recovery of the PLN transfer was attributed to the relatively large requirement for force production imposed on this transfer due to the average force requirements of the total plantar flexor group.", "pmid": "9609778", "title": "Recovery of muscle transfers replacing the total plantar flexor muscle group in rats."}, {"journal": "Journal of nephrology", "meshMajor": ["Adult", "Aged", "Calcinosis", "Coronary Disease", "Exercise Test", "Female", "Heart", "Humans", "Kidney Transplantation", "Male", "Middle Aged", "Myocardial Ischemia", "Radionuclide Imaging"], "year": null, "abstractText": "Whether coronary artery calcium (CAC) screening in pretransplant patients may help predict silent myocardial ischemia is unknown. Accordingly, we performed CAC imaging on 46 nondiabetic patients awaiting kidneytransplant. All patients underwent multidetector computed tomography imaging for CAC quantification, and a vasodilator myocardial perfusion stress (MPS) test was performed only in patients with a total CAC score>300 or>100 in a single coronary artery. The mean patient's age was 46+/-14 years and the median dialysis vintage was 33 months (interquartile range 19-53). The median CAC score was 82 (interquartile range 0-700) and correlated with patients' age (p=0.006) and dialysis vintage (p=0.02). Nineteen patients qualified for MPS, but 5 refused the test. Of the remaining 14 patients, 7 patients had normal scans and 7 showed a minimal perfusion defect in the inferoposterior segment of the left ventricle. At the time of writing, 12 patients have undergone successful kidney transplantation without untoward complications. CAC screening does not appear to be associated with silent ischemia in pretransplant patients. 
Though CAC is extensive in dialysis patients, calcium may be associated with nonobstructive atherosclerotic lesions or calcification of the media layer of the vessel wall.", "pmid": "17048205", "title": "Screening for silent ischemia with coronary artery calcium and nuclear stress testing in nondiabetic patients prior to kidney transplant."}, {"journal": "Journal of molecular biology", "meshMajor": ["Anticodon", "Base Sequence", "Molecular Sequence Data", "Nuclear Magnetic Resonance, Biomolecular", "Nucleic Acid Conformation", "RNA, Transfer, Phe"], "year": "2003", "abstractText": "Post-transcriptional modifications contribute chemistry and structure to RNAs. Modifications of tRNA at nucleoside 37, 3'-adjacent to the anticodon, are particularly interesting because they facilitate codon recognition and negate translational frame-shifting. To assess if the functional contribution of a position 37-modified nucleoside defines a specific structure or restricts conformational flexibility, structures of the yeast tRNA(Phe) anticodon stem and loop (ASL(Phe)) with naturally occurring modified nucleosides differing only at position 37, ASL(Phe)-(Cm(32),Gm(34),m(5)C(40)), and ASL(Phe)-(Cm(32),Gm(34),m(1)G(37),m(5)C(40)), were determined by NMR spectroscopy and restrained molecular dynamics. The ASL structures had similarly resolved stems (RMSD approximately 0.6A) of five canonical base-pairs in standard A-form RNA. The \"NOE walk\" was evident on the 5' and 3' sides of the stems of both RNAs, and extended to the adjacent loop nucleosides. The NOESY cross-peaks involving U(33) H2' and characteristic of tRNA's anticodon domain U-turn were present but weak, whereas those involving the U(33) H1' proton were absent from the spectra of both ASLs. However, ASL(Phe)-(Cm(32),Gm(34),m(1)G(37),m(5)C(40)) exhibited the downfield shifted 31P resonance of U(33)pGm(34) indicative of U-turns; ASL(Phe)-(Cm(32),Gm(34),m(5)C(40)) did not. 
An unusual \"backwards\" NOE between Gm(34) and A(35) (Gm(34)/H8 to A(35)/H1') was observed in both molecules. The RNAs exhibited a protonated A(+)(38) resulting in the final structures having C(32).A(+)(38) intra-loop base-pairs, with that of ASL(Phe)-(Cm(32),Gm(34),m(1)G(37),m(5)C(40)) being especially well defined. A single family of low-energy structures of ASL(Phe)-(Cm(32),Gm(34), m(1)G(37),m(5)C(40)) (loop RMSD 0.98A) exhibited a significantly restricted conformational space for the anticodon loop in comparison to that of ASL(Phe)-(Cm(32),Gm(34),m(5)C(40)) (loop RMSD 2.58A). In addition, the ASL(Phe)-(Cm(32),Gm(34),m(1)G(37),m(5)C(40)) average structure had a greater degree of similarity to that of the yeast tRNA(Phe) crystal structure. A comparison of the resulting structures indicates that modification of position 37 affects the accuracy of decoding and the maintenance of the mRNA reading frame by restricting anticodon loop conformational space.", "pmid": "14643656", "title": "Naturally-occurring modification restricts the anticodon domain conformational space of tRNA(Phe)."}, {"journal": "The Plant journal : for cell and molecular biology", "meshMajor": ["Cysteine Endopeptidases", "Gene Knockdown Techniques", "Mass Spectrometry", "Methionine", "Plant Diseases", "Plants, Genetically Modified", "Potyvirus", "Protein Modification, Translational", "RNA Interference", "RNA, Viral", "RNA-Induced Silencing Complex", "Ribosomal Proteins", "Tobacco", "Viral Proteins"], "year": "2016", "abstractText": "Potyviral helper component proteinase (HCPro) is a well-characterized suppressor of antiviral RNA silencing, but its mechanism of action is not yet fully understood. In this study, we used affinity purification coupled with mass spectrometry to identify binding partners of HCPro in potyvirus-infected plant cells. 
This approach led to identification of various HCPro interactors, including two key enzymes of the methionine cycle, S-adenosyl-L-methionine synthase and S-adenosyl-L-homocysteine hydrolase. This finding, together with the results of enzymatic activity and gene knockdown experiments, suggests a mechanism in which HCPro complexes containing viral and host proteins act to suppress antiviral RNA silencing through local disruption of the methionine cycle. Another group of HCPro interactors identified in this study comprised ribosomal proteins. Immunoaffinity purification of ribosomes demonstrated that HCPro is associated with ribosomes in virus-infected cells. Furthermore, we show that HCPro and ARGONAUTE1 (AGO1), the core component of the RNA-induced silencing complex (RISC), interact with each other and are both associated with ribosomes in planta. These results, together with the fact that AGO1 association with ribosomes is a hallmark of RISC-mediated translational repression, suggest a second mechanism of HCPro action, whereby ribosome-associated multiprotein complexes containing HCPro relieve viral RNA translational repression through interaction with AGO1.", "pmid": "26611351", "title": "Molecular insights into the function of the viral RNA silencing suppressor HCPro."}, {"journal": "Biotechnology and applied biochemistry", "meshMajor": ["Aquatic Organisms", "Cytochrome P-450 Enzyme System", "Moritella", "Protein Conformation"], "year": null, "abstractText": "We have explored the adaptation of the cytochromes P450 (P450) of deep-sea bacteria to high hydrostatic pressures. Strict conservation of the protein fold and functional importance of protein-bound water make P450 a unique subject for the studies of high-pressure adaptation. Earlier, we expressed and purified a fatty-acid binding P450 from the deep-sea bacteria Photobacterium profundum SS9 (CYP261C1). 
Here, we report purification and initial characterization of its mesophilic ortholog from the shallow-water P. profundum 3TCK (CYP261C2), as well as another piezophilic enzyme, CYP261D1, from deep-sea Moritella sp. PE36. Comparison of the three enzymes revealed a striking peculiarity of the piezophilic enzymes. Both CYP261C1 and CYP261D1 possess an apparent pressure-induced conformational toggle actuated at the pressures commensurate with the physiological pressure of habitation of the host bacteria. Furthermore, in contrast to CYP261C2, the piezophilic CYP261 enzymes may be chromatographically separated into two fractions with different properties, and different thermodynamic parameters of spin equilibrium in particular. According to our concept, the changes in the energy landscape that evolved in pressure-tolerant enzymes must stabilize the less-hydrated, closed conformers, which may be transient in the catalytic mechanisms of nonpiezophilic enzymes. The studies of enzymes of piezophiles should help unravel the mechanisms that control water access during the catalytic cycle.", "pmid": "23586990", "title": "CYP261 enzymes from deep sea bacteria: a clue to conformational heterogeneity in cytochromes P450."}, {"journal": "Chemico-biological interactions", "meshMajor": ["Clinical Trials as Topic", "Drug Industry", "Humans", "Reproducibility of Results", "Toxicology"], "year": "2004", "abstractText": "Over the past decades, a number of drugs have been withdrawn or have required special labeling due to adverse effects observed post-marketing. Species differences in drug toxicity in preclinical safety tests and the lack of sensitive biomarkers and nonrepresentative patient population in clinical trials are probable reasons for the failures in predicting human drug toxicity. It is proposed that toxicology should evolve from an empirical practice to an investigative discipline. 
Accurate prediction of human drug toxicity requires resources and time to be spent in clearly defining key toxic pathways and corresponding risk factors, which hopefully, will be compensated by the benefits of a lower percentage of clinical failure due to toxicity and a decreased frequency of market withdrawal due to unacceptable adverse drug effects.", "pmid": "15522257", "title": "Accurate prediction of human drug toxicity: a major challenge in drug development."}, {"journal": "European journal of immunology", "meshMajor": ["Animals", "Antibodies, Monoclonal", "Antibody Specificity", "Antigens, Surface", "Cell Line", "Electrophoresis, Polyacrylamide Gel", "Female", "Leukemia, Lymphoid", "Mice", "Mice, Inbred BALB C", "T-Lymphocytes", "Thymus Gland", "beta 2-Microglobulin"], "year": "1982", "abstractText": "A monoclonal antibody, M241, was produced which binds to a human cell surface molecule with properties similar to the murine thymus leukemia (TL) antigen. This human TL-like antigen was found on thymocytes and some T cell lines derived from patients with acute lymphocytic leukemia, but was not found on peripheral blood lymphocytes or B cell lines. The monoclonal antibody M241 was used to immunoprecipitate a molecule from lysates of 125I surface-labeled MOLT 4 cells which had two subunits, a 43-kDa chain and a 12-kDa chain. The small subunit was shown to be beta 2-microglobulin (beta 2m) by immunoprecipitation with a monoclonal antibody, BBM.1, which recognizes human beta 2 m. The TL-like molecule recognized by M241 was shown to be serologically distinct from the HLA-A,B,C molecules recognized by three monoclonal antibodies W6/32, PA2.6 and BB7.8, and distinct from another human thymocyte antigen, the 49 kDa HTA 1 molecule, recognized by the monoclonal antibody NA1/34. 
Following removal of the HLA-A,B,C molecules, the HTA 1 molecules, and the M241-defined TL-like molecules from MOLT 4 lysates, additional beta 2m-associated molecules were immunoprecipitated with BBM.1. These molecules contained a 45-kDa subunit attached to beta 2m.", "pmid": "6754387", "title": "A monoclonal antibody recognizing a human thymus leukemia-like antigen associated with beta 2-microglobulin."}, {"journal": "Journal of the American Medical Women's Association (1972)", "meshMajor": ["Adult", "Case-Control Studies", "Delivery of Health Care", "Female", "Homeless Persons", "Humans", "Longitudinal Studies", "Poverty", "Stress Disorders, Post-Traumatic", "United States", "Women's Health"], "year": "2001", "abstractText": "OBJECTIVES: To identify childhood antecedents for lifetime post-traumatic stress disorder (PTSD) and to determine how this diagnosis relates to health and service use among extremely poor women.METHODS: We conducted a secondary data analysis of 425 women in the Worcester Family Research Project, a case-control longitudinal study of 220 sheltered homeless and 216 extremely poor housed (never homeless) women in Worcester, Massachusetts.RESULTS: We found that extremely poor women with lifetime PTSD were more likely to have grown up in family environments of violence, threat, and anger than those without PTSD. The strongest risk factor for PTSD was childhood sexual abuse with threat. Low-income women with lifetime PTSD had more bodily pain, even when controlling for other health and demographic factors. Women with PTSD experienced more chronic health conditions and had more problematic relationships with their health care providers and perceived more barriers to care.CONCLUSION: Many low-income women have difficulty using medical care appropriately because of childhood histories of physical and sexual abuse, the subsequent development of post-trauma responses, and structural barriers to care. 
Given these factors, it is critical that health care clinicians routinely screen for histories of violence and PTSD and develop treatment plans that ensure safety, link current symptoms with prior experiences, and provide support as necessary. A team approach coordinated by a case manager may be the best strategy. Without routine screening for PTSD and sensitive treatment, many extremely poor women will receive compromised health care and may even be retraumatized.", "pmid": "11326804", "title": "Post-traumatic stress disorder in extremely poor women: implications for health care clinicians."}, {"journal": "The Journal of organic chemistry", "meshMajor": ["Adrenergic Uptake Inhibitors", "Atomoxetine Hydrochloride", "Carbonates", "Cesium", "Cyclopropanes", "Methylation", "Models, Chemical", "Nitro Compounds", "Phenols", "Propylamines", "Stereoisomerism"], "year": "2008", "abstractText": "Nucleophilic ring opening of methyl 1-nitrocyclopropanecarboxylates by phenol derivatives in the presence of Cs2CO3 is described. The reaction tolerates a variety of substituents on both the aromatic alcohol and the cyclopropane and affords the products in good yields (53-84%) and with complete preservation of the enantiomeric excess at C-4. 
The methodology was applied in an enantioselective synthesis of the norepinephrine reuptake inhibitor atomoxetine (Strattera).", "pmid": "18671432", "title": "Nucleophilic addition of phenol derivatives to methyl 1-nitrocyclopropanecarboxylates."}, {"journal": "Viruses", "meshMajor": ["Animals", "Antibodies, Neutralizing", "Antibody Formation", "B-Lymphocytes", "Disease Models, Animal", "Germinal Center", "HIV Antibodies", "HIV Infections", "HIV-1", "Immunoglobulin G", "Immunologic Memory", "Immunophenotyping", "Male", "Mice", "Mice, Inbred BALB C", "Mice, Inbred C57BL", "Phenotype", "env Gene Products, Human Immunodeficiency Virus"], "year": "2014", "abstractText": "Continued efforts to define the immunogenic properties of the HIV-1 envelope glycoproteins (Env) are needed to elicit effective antibody (Ab) responses by vaccination. HIV-1 is a highly neutralization-resistant virus due to conformational and glycan shielding of conserved Ab determinants on the virus spike. Elicitation of broadly neutralizing Abs that bind poorly accessible epitope regions on Env is therefore extremely challenging and will likely require selective targeting of specific sub-determinants. To evaluate such approaches there is a pressing need for in vivo studies in both large and small animals, including mice. Currently, most mouse immunization studies are performed in the BALB/c strain; however, the C57BL/6 strain offers improved possibilities for mechanistic studies due to the availability of numerous knock-out strains on this genetic background. Here, we compared Env immunogenicity in BALB/c and C57BL/6 mice and found that the magnitude of the antigen-specific response was somewhat lower in C57BL/6 than in BALB/c mice by ELISA but not significantly different by B cell ELISpot measurements. 
We then established protocols for the isolation of single Env-specific memory B cells and germinal center (GC) B cells from immunized C57BL/6 mice to facilitate future studies of the elicited response at the monoclonal Ab level. We propose that these protocols can be used to gain an improved understanding of the early recruitment of Env-specific B cells to the GC as well as the archiving of such responses in the memory B cell pool following immunization. ", "pmid": "25198199", "title": "HIV-1 Env-specific memory and germinal center B cells in C57BL/6 mice."}, {"journal": "Medicinski arhiv", "meshMajor": ["Computer Simulation", "Humans", "Imaging, Three-Dimensional", "Joints", "Surgery, Computer-Assisted"], "year": "2004", "abstractText": "In this paper the authors give an overview of two systems (simulation and navigation) which are very important and give support to clinical work by making possible good visualization of the morphology and kinematics of joints. The approach to each patient with changes in the joints is individualized with a computer generated tomographical images which give very precise data which up to now had been inaccessible with clinical testing as the only alternative was the well known invasive diagnostic procedures. The first case concerns the COJOKS simulation system (COmputerized Joint Kinematics Simulation). The second case is of a navigation operative system which has recently been put into use and was developed on the basis of the GPS system (MSNT). This system is used for the precise determination of the bone structure of joints which is, by way of computer transformed into virtual 3D shape. 
This gives the surgeon all the data necessary during the operative procedure on bone and joint structures.", "pmid": "15137217", "title": "[Clinical views on simulation and navigation technologies in kinematics of joints in locomotor surgery]."}, {"journal": "Journal of virology", "meshMajor": ["Antibodies, Monoclonal", "Cell Line", "Genotype", "Influenza A virus", "Kinetics", "Mutation", "Phenotype"], "year": "1992", "abstractText": "The rates of mutation to the mar (monoclonal antibody-resistant) genotype of individual influenza virus plaque isolates, obtained from a stock generated after two successive cloning steps, have been determined by the fluctuation test. When a random sample of 60 clones was analyzed, 7 contained a proportion of mar mutants significantly higher than the average, and among them, 2 showed a mutation rate two to three times higher than the average value obtained for the virus population when the hemagglutinin-specific monoclonal antibody 2G10 was used. In order to look for mutants with higher mutation rates, a systematic search was carried out with a nonmutagenized virus stock, and several clones with increased mutation rates were isolated. One of them (mut43) was characterized further and was shown to have a mutation rate three to four times higher than that of the virus population at the sites defined by two nonoverlapping, hemagglutinin-specific monoclonal antibodies as well as at the site defined by a neuraminidase-specific monoclonal antibody. These results indicate that the mutation rate of an influenza virus is a weighted average of the contributions of a heterogeneous population. 
The consequences of this fact for the adaptive evolution of influenza viruses are discussed.", "pmid": "1548773", "title": "Heterogeneity of the mutation rates of influenza A viruses: isolation of mutator mutants."}, {"journal": "Acta biologica Hungarica", "meshMajor": ["Animals", "Cells, Cultured", "Dose-Response Relationship, Drug", "Estradiol", "Estrogens", "Female", "Growth Hormone", "Pituitary Gland, Anterior", "Prolactin", "Rats", "Rats, Sprague-Dawley"], "year": "1994", "abstractText": "Female rats were treated in vivo with estrogen for three weeks. The pituitaries were then removed and their responses to somatostatin, dopamine, TRH, hGHRH(1-44)NH2, or their combination were examined in a superfused pituitary cell system. Somatostatin did not decrease basal prolactin secretion in the control cells, but it caused a dose-dependent decrease in prolactin release from the estrogen pretreated cells. Estrogen pretreatment did not alter the sensitivity of pituitary cells to dopamine; dopamine was equally effective in the control and estrogen pretreated pituitaries in decreasing the basal prolactin secretion and TRH induced prolactin release. Prolactin release from the estrogen pretreated cells, stimulated by 25 nM TRH was inhibited by 1 nM somatostatin and nearly totally abolished by 25 nM somatostatin, whereas in the control cells only the higher dose of somatostatin caused some decrease in the prolactin release. Estrogen pretreated cells showed a reduced response to GHRH. Somatostatin did not decrease the basal secretion of GH in either group, but at 1 nM dose it completely abolished the GH release induced by equimolar concentration of GHRH. However, after somatostatin was eliminated from the system, a delayed GH release could be observed that was greater in the control pituitaries than in the estrogen pretreated pituitaries. 
It is concluded that in vivo treatment with estrogen reduces GH secretion in response to GHRH and increases prolactin secretion after TRH stimulation. After estrogen treatment, the basal and TRH stimulated prolactin release can be effectively reduced by somatostatin. These effects could be observed in vitro using estrogen free tissue culture medium for up to 36 hours after the removal of the pituitaries. The reciprocal changes in GH and prolactin secretion support the concept of the transdifferentiation of GH and prolactin secreting cells.", "pmid": "7725821", "title": "Reciprocal changes in prolactin and growth hormone secretion in vitro after in vivo estrogen treatment."}, {"journal": "The Annals of otology, rhinology, and laryngology", "meshMajor": ["Adult", "Dilatation", "Female", "Follow-Up Studies", "Humans", "Laryngostenosis", "Male", "Middle Aged", "Retrospective Studies", "Severity of Illness Index", "Tracheal Stenosis", "Treatment Outcome", "Voice Quality"], "year": "2015", "abstractText": "OBJECTIVE: To assess the impact of suspension microlaryngoscopy with balloon dilation on voice-related quality of life (V-RQOL) in laryngotracheal stenosis (LTS).METHODS: Retrospective chart review of LTS patients dilated at a tertiary-care academic hospital from 2010 to 2013. Data were obtained and then analyzed. LTS was stratified by (1) subglottic or tracheal stenosis and (2) multilevel stenosis (MLS; glottic and subglottic/tracheal). Pre- and postoperative V-RQOL and grade, roughness, breathiness, asthenia, strain (GRBAS) scores were compared. The number and frequency of balloon dilation procedures over the lifetime were secondary outcome variables.RESULTS: Thirty-eight patients were identified: 26 subglottic/tracheal and 12 multilevel. Of these, 71.4% required multiple dilations, with greatest dilations/patient for multilevel stenosis (4.8). V-RQOL improved in the 27 patients with completed pre- and postoperative scores from a mean of 70.4 to 80 (P=.025). 
Pre/postoperative V-RQOLs for tracheal/subglottic (mean, 82.8/93.8) were significantly higher (P=.0001/.0001) than multilevel stenosis (48/55.3). Voice quality-of-life improvement was significant for the subglottic/tracheal cohort (P=.036) but not for the MLS group. GRBAS was performed pre- and postoperatively in 10 patients with improvement in all domains except breathiness.CONCLUSION: Laryngotracheal stenosis is associated with dysphonia. Patients with glottic involvement have significantly worse voice quality of life than those with tracheal/subglottic stenosis. Endoscopic balloon dilation improves V-RQOL in patients with subglottic/tracheal stenosis.", "pmid": "25519815", "title": "Voice quality in laryngotracheal stenosis: impact of dilation and level of stenosis."}, {"journal": "Journal of neuroimmunology", "meshMajor": ["Adult", "Aged", "Antibodies", "Female", "Gangliosides", "Humans", "Immunoglobulin A", "Immunoglobulin G", "Immunoglobulin Isotypes", "Immunoglobulin M", "Male", "Motor Neuron Disease", "Nervous System Diseases", "Peripheral Nervous System Diseases", "Reference Values"], "year": "1995", "abstractText": "Antibodies of the IgM, IgG and IgA class against GM1, asialo-GM1, GD1b and GM2 gangliosides were determined in the sera of patients with motor neuron disease (MND), peripheral neuropathy, other neurological diseases (OND) and healthy individuals. Antibodies of the three immunoglobulin classes were present in healthy persons. MND patients did not differ from OND or controls in anti-GM1 titers of the three isotypes. In the group of peripheral neuropathy, no elevations of antibody titers were observed in patients with sensory or sensory-motor neuropathy; however, four out of 12 patients with the motor variety had very high levels of IgM or IgG antibodies. 
Two of these four patients also had increased titers of IgA antibodies, but no patients exhibited high titers restricted to this isotype.", "pmid": "7822479", "title": "Presence and isotype of anti-ganglioside antibodies in healthy persons, motor neuron disease, peripheral neuropathy, and other diseases of the nervous system."}, {"journal": "Connective tissue research", "meshMajor": ["Celecoxib", "Cells, Cultured", "Chondrocytes", "Cyclooxygenase 1", "Cyclooxygenase 2", "Cyclooxygenase Inhibitors", "Dinoprostone", "Humans", "Interleukin-1beta", "Pyrazoles", "Receptors, Prostaglandin E", "Receptors, Prostaglandin E, EP4 Subtype", "Sulfonamides"], "year": "2009", "abstractText": "Prostaglandin (PG) E(2), which exerts its actions via the PG receptors EP1-4, is produced from arachidonic acid by cyclooxygenase (COX)-1 and COX-2. The aim of this study was to investigate the mechanisms by which interleukin (IL)-1beta induces the expression of PG receptors in cultured human chondrocytes and to explore the role of PGE(2) in this process. The cells were cultured with 0, 10, or 100 U/mL IL-1beta with or without 1 muM celecoxib, a specific inhibitor of COX-2, for up to 28 days. Expression of the genes encoding COX-1, COX-2, and EP1-4 was quantified using real-time PCR, and expression of the corresponding proteins was examined using immunohistochemical staining. PGE(2) production was determined using ELISA. IL-1beta treatment caused a marked dose- and time-dependent increase in the levels of PGE(2), COX-2, and EP4 as compared with the untreated control. It did not affect the expression of COX-1, and it decreased the expression of EP1 and EP2. EP3 expression was not detected in either the absence or the presence of IL-1beta. When celecoxib was also present, IL-1beta failed to stimulate PGE(2) production and EP4 expression, but its stimulatory effect on COX-2 expression and its inhibitory effect on EP1 and EP2 expression were unchanged. 
IL-1beta increases the production of PGE(2), COX-2, and the PG receptor EP4 in cultured human chondrocytes. The increase in EP4 expression appears to be a result of the increased PGE(2) production.", "pmid": "19444759", "title": "IL-1beta stimulates the expression of prostaglandin receptor EP4 in human chondrocytes by increasing production of prostaglandin E2."}, {"journal": "Journal of cardiac surgery", "meshMajor": ["Cardiac Catheterization", "Child", "Child, Preschool", "Cineangiography", "Female", "Heart Defects, Congenital", "Humans", "Infant", "Lung", "Male", "Mucocutaneous Lymph Node Syndrome", "Pulmonary Artery", "Pulmonary Veins", "Regional Blood Flow", "Tetralogy of Fallot", "Vascular Resistance", "Ventilation-Perfusion Ratio"], "year": null, "abstractText": "OBJECTIVES: This study was done to clarify which diameter, that of the pulmonary arteries (PAs) or that of the pulmonary veins (PVs), more precisely reflects pulmonary blood flow (PBF) bilaterally and unilaterally.METHODS: To evaluate bilateral PBF, we studied 15 consecutive patients with Kawasaki disease as normal patients and 30 patients with tetralogy of Fallot who received cardiac catheterization. To evaluate unilateral PBF, 20 patients with various congenital heart diseases undergoing cineangiography and lung perfusion scintigraphy were studied. The diameter of PA was measured immediately proximal to the origin of the first lobar branches bilaterally, and right PA area, left PA area, PA area (mm2), and PA index (mm2/m2) were calculated. The diameter of PV was also measured distal to the junction with the left atrium. Right PV area, left PV area, PV area (mm2), and PV index (mm2/m2) were calculated from these diameters. Pulmonary blood flow (PBF) was obtained by the Fick method during catheterization. 
To evaluate unilateral PBF, PBF was divided into right and left PBF according to the right/left perfusion ratio measured by lung perfusion scintigraphy.RESULTS: Evaluation of bilateral PBF was as follows: in normal patients, PA and PV areas were correlated with body surface area (r = 0.88, p = 0.0001 and r = 0.93, p = 0.0001); PA index and PV index ranged from 248 to 436 (mean = 343) mm2/m2 and from 346 to 595 (mean = 466) mm2/m2, respectively, and were constant irrespective of body surface area; PA and PV areas were correlated with PBF in normal patients, as well as in patients with tetralogy of Fallot. There was a better correlation between PV area and PBF than between PA area and PBF in normal patients, as well as a significantly better correlation in patients with tetralogy of Fallot. Evaluation of unilateral PBF was as follows: right PV area was correlated with right PBF (p = 0.0002), while right PA area was not; left PV area and left PA area were correlated with left PBF; right/left PV area ratio was correlated with the right/left perfusion ratio with better agreement than right/left PA area ratio.CONCLUSION: Our data suggest that the size of PVs in patients with congenital heart disease may be more useful than the size of PAs to indicate bilateral and unilateral PBF than the size of PAs. 
Differences in PV area of each lung may be a suitable indicator of discrepancy in blood flow to each lung.", "pmid": "9591181", "title": "Diameters of the pulmonary arteries and veins as an indicator of bilateral and unilateral pulmonary blood flow in patients with congenital heart disease."}, {"journal": "Clinical hemorheology and microcirculation", "meshMajor": ["Adolescent", "Adult", "Aerobiosis", "Blood Viscosity", "Body Mass Index", "Diet", "Exercise Test", "Fibrinogen", "Football", "Heart Rate", "Hematocrit", "Humans", "Leisure Activities", "Oxygen Consumption", "Work Capacity Evaluation"], "year": "1998", "abstractText": "While it is well established that blood viscosity is decreased in sportsmen and related to fitness, the involvement of fibrinogen in this relationship is less well defined. Relationships among fitness, rheology and fibrinogen were investigated in 32 football players (age 17-33 years: 19 professionals and 13 leisure players). A submaximal 25 min exercise-test was performed and allowed the calculation of aerobic working capacity. Aerobic working capacity (W170 and VO2 max) was negatively correlated to fibrinogen (r = -0.531, p < 0.01 and r = -0.623, p < 0.01), while on the whole sample the correlation to viscosity and erythrocyte aggregation was not significant. When subjects were divided into two subgroups according to their plasma fibrinogen concentration, the aerobic working capacity (either expressed as W170 or VO2 max) is higher when plasma fibrinogen level is lower than 2.7 g/l. Thus, there is a highly significant negative correlation between fibrinogen and fitness in these sportsmen, independent of blood rheology. 
These data suggest that rheology and fibrinogen are to some extent separate determinants of an individual's fitness.", "pmid": "9874357", "title": "Fibrinogen is negatively correlated with aerobic working capacity in football players."}, {"journal": "Cadernos de saude publica", "meshMajor": ["Aged", "Brazil", "Cross-Sectional Studies", "Diabetes Mellitus", "Female", "Health Knowledge, Attitudes, Practice", "Humans", "Male", "Middle Aged", "Prevalence", "Socioeconomic Factors", "Surveys and Questionnaires"], "year": "2010", "abstractText": "The aim of the study was to assess the prevalence of self-reported diabetes in the elderly, identifying associated factors, knowledge, and practices related to treatment options. This was a cross-sectional population-based study with stratified clustered two-stage sampling in six municipalities in the State of S?o Paulo, Brazil. Among the 1,949 elderly, 15.4% presented self-reported diabetes. Body mass index and exercising were statistically associated with diabetes. There was a significant difference between diabetics and non-diabetics in terms of self-rated health, hospitalization, self-reported illness in the previous two weeks, and report of the following diseases: hypertension, anemia, chronic kidney disease, and heart disease. In terms of per capita family income, there was no difference in regular medical visits, participation in discussion groups, and control practices. The findings show the need for behavior changes to prevent and control diabetes and its complications. 
Educational interventions are needed to expand the coverage of diabetes care.", "pmid": "20209221", "title": "[Self-reported diabetes in the elderly: prevalence, associated factors, and control practices]."}, {"journal": "BMC developmental biology", "meshMajor": ["Arachis", "Aspergillus", "Disasters", "Expressed Sequence Tags", "Gene Expression Profiling", "Gene Expression Regulation, Plant", "Gene Library", "Genes, Plant", "Oligonucleotide Array Sequence Analysis", "Seeds"], "year": "2008", "abstractText": "BACKGROUND: Peanut (Arachis hypogaea L.) is an important crop economically and nutritionally, and is one of the most susceptible host crops to colonization of Aspergillus parasiticus and subsequent aflatoxin contamination. Knowledge from molecular genetic studies could help to devise strategies in alleviating this problem; however, few peanut DNA sequences are available in the public database. In order to understand the molecular basis of host resistance to aflatoxin contamination, a large-scale project was conducted to generate expressed sequence tags (ESTs) from developing seeds to identify resistance-related genes involved in defense response against Aspergillus infection and subsequent aflatoxin contamination.RESULTS: We constructed six different cDNA libraries derived from developing peanut seeds at three reproduction stages (R5, R6 and R7) from a resistant and a susceptible cultivated peanut genotypes, 'Tifrunner' (susceptible to Aspergillus infection with higher aflatoxin contamination and resistant to TSWV) and 'GT-C20' (resistant to Aspergillus with reduced aflatoxin contamination and susceptible to TSWV). The developing peanut seed tissues were challenged by A. parasiticus and drought stress in the field. A total of 24,192 randomly selected cDNA clones from six libraries were sequenced. After removing vector sequences and quality trimming, 21,777 high-quality EST sequences were generated. 
Sequence clustering and assembling resulted in 8,689 unique EST sequences with 1,741 tentative consensus EST sequences (TCs) and 6,948 singleton ESTs. Functional classification was performed according to MIPS functional catalogue criteria. The unique EST sequences were divided into twenty-two categories. A similarity search against the non-redundant protein database available from NCBI indicated that 84.78% of total ESTs showed significant similarity to known proteins, of which 165 genes had been previously reported in peanuts. There were differences in overall expression patterns in different libraries and genotypes. A number of sequences were expressed throughout all of the libraries, representing constitutive expressed sequences. In order to identify resistance-related genes with significantly differential expression, a statistical analysis to estimate the relative abundance (R) was used to compare the relative abundance of each gene transcripts in each cDNA library. Thirty six and forty seven unique EST sequences with threshold of R > 4 from libraries of 'GT-C20' and 'Tifrunner', respectively, were selected for examination of temporal gene expression patterns according to EST frequencies. Nine and eight resistance-related genes with significant up-regulation were obtained in 'GT-C20' and 'Tifrunner' libraries, respectively. Among them, three genes were common in both genotypes. Furthermore, a comparison of our EST sequences with other plant sequences in the TIGR Gene Indices libraries showed that the percentage of peanut EST matched to Arabidopsis thaliana, maize (Zea mays), Medicago truncatula, rapeseed (Brassica napus), rice (Oryza sativa), soybean (Glycine max) and wheat (Triticum aestivum) ESTs ranged from 33.84% to 79.46% with the sequence identity >/= 80%. 
These results revealed that peanut ESTs are more closely related to legume species than to cereal crops, and more homologous to dicot than to monocot plant species.CONCLUSION: The developed ESTs can be used to discover novel sequences or genes, to identify resistance-related genes and to detect the differences among alleles or markers between these resistant and susceptible peanut genotypes. Additionally, this large collection of cultivated peanut EST sequences will make it possible to construct microarrays for gene expression studies and for further characterization of host resistance mechanisms. It will be a valuable genomic resource for the peanut community. The 21,777 ESTs have been deposited to the NCBI GenBank database with accession numbers ES702769 to ES724546.", "pmid": "18248674", "title": "Peanut gene expression profiling in developing seeds at different reproduction stages during Aspergillus parasiticus infection."}, {"journal": "Proceedings of the National Academy of Sciences of the United States of America", "meshMajor": ["Amino Acids", "Animals", "Glutamates", "Glutathione", "Kidney", "Kinetics", "Methionine", "Mice", "Serine", "gamma-Glutamyltransferase"], "year": "1978", "abstractText": "The function of the gamma-glutamyl cycle was explored in in vivo studies in which amino acids and specific inhibitors of cycle enzymes (gamma-glutamyl transpeptidase, gamma-glutamyl cyclotransferase, gamma-glutamylcysteine synthetase, and 5-oxoprolinase) were administered to mice. 
The findings, which show that the gamma-glutamyl cycle functions in vivo, support the conclusion that gamma-glutamyl amino acids formed by gamma-glutamyl transpeptidase from externally supplied amino acids and intracellular glutathione are translocated into the cell and thus indicate that there is a significant physiological connection between the metabolism of glutathione and the transport of amino acids.", "pmid": "31622", "title": "Evidence that the gamma-glutamyl cycle functions in vivo using intracellular glutathione: effects of amino acids and selective inhibition of enzymes."}, {"journal": "Journal of cardiology", "meshMajor": ["Aged", "Angioplasty, Balloon, Coronary", "Coronary Circulation", "Female", "Humans", "Hypertension", "Male", "Middle Aged", "Myocardial Infarction", "Myocardial Revascularization", "Stents", "Stroke Volume", "Thrombolytic Therapy", "Ventricular Function, Left"], "year": "2005", "abstractText": "OBJECTIVES: To evaluate the effectiveness of distal protection with the GuardWire Plus during primary angioplasty in patients with acute myocardial infarction.METHODS: Thirty-eight consecutive patients undergoing stent implantation with distal protection using the GuardWire Plus (DP-group) were compared with a matched control group undergoing conventional stent implantation after balloon angioplasty without distal protection (NDP-group). Microvascular circulation after revascularization was assessed by Thrombolysis in Myocardial Infarction (TIMI) flow grade, myocardial blush grade (MBG), serum creatine kinase peak release, and ST resolution. Left ventricular ejection fraction was measured by echocardiography at discharge. Follow-up quantitative coronary angiography and left ventriculography were performed 6 months after percutaneous coronary intervention. 
Quantitative coronary angiography data, restenosis rate, target lesion revascularization rate and follow-up left ventricular ejection fraction were also compared between the two groups.RESULTS: No significant differences were observed in baseline clinical and angiographic characteristics between the two groups. The TIMI flow grade 3 (DP-group 81.6% vs NDP-group 57.9%)and MBG 3 (57.9% vs 30.6%)were significantly greater in the DP-group respectively (p < 0.05). Post procedural ST-segment resolution > or = 50% was found in a significantly higher percentage of patients in the DP-group (68.4% vs 42.1%, p < 0.05). Left ventricular ejection fraction at discharge was significantly greater in the DP-group (55.5 +/- 8.5% vs 45.7 +/- 11.1%, p < 0.05). However, 6 months after the percutaneous coronary intervention, no significant difference was observed between the two groups. Restenosis rate and target lesion revascularization rate were similar in the two groups.CONCLUSIONS: Distal protection with the GuardWire Plus improved the microvascular circulation as assessed by TIMI flow grade, MBG, and ST resolution. Furthermore, left ventricular ejection fraction at discharge was improved.", "pmid": "15801274", "title": "Effectiveness of distal protection with the GuardWire Plus during primary angioplasty for acute myocardial infarction."}, {"journal": "Clinical nuclear medicine", "meshMajor": ["Adolescent", "Cumulative Trauma Disorders", "Female", "Foot Injuries", "Fractures, Stress", "Humans", "Metatarsal Bones", "Music", "Radionuclide Imaging"], "year": "2007", "abstractText": "A 14-year-old girl presented with a painful right foot. She was an elite water-polo player and could recall no history of specific trauma to the foot. On close and persistent questioning, she admitted to having taken up playing the drums recently, with practice sessions of up to 4 h/d. 
She used the foot drum with her right foot and had noticed that this was becoming increasingly painful and prevented her playing the instrument for the last 2 days. Plain films of the foot were originally reported as normal, but revised to abnormal after the scintigraphic study. Bone scintigraphy confirmed a stress fracture of the right 3rd metatarsal bone. Stress fractures of the 3rd metatarsal bone are rare with only 2 previous reports in the literature.", "pmid": "17710033", "title": "Drummer's fracture of the third metatarsal bone."}, {"journal": "IEEE/ACM transactions on computational biology and bioinformatics", "meshMajor": ["Algorithms", "Benzene", "Computational Biology", "Computer Graphics", "Drug Design", "Isomerism", "Models, Chemical"], "year": null, "abstractText": "Enumeration of chemical structures is useful for drug design, which is one of the main targets of computational biology and bioinformatics. A chemical graph with no other cycles than benzene rings is called tree-like, and becomes a tree possibly with multiple edges if we contract each benzene ring into a single virtual atom of valence 6. All tree-like chemical graphs with a given tree representation are called the substituted benzene isomers of . When we replace each virtual atom in with a benzene ring to obtain a substituted benzene isomer, distinct isomers of are caused by the difference in arrangements of atom groups around a benzene ring. In this paper, we propose an efficient algorithm that enumerates all substituted benzene isomers of a given tree representation . Our algorithm first counts the number of all the isomers of the tree representation by a dynamic programming method. To enumerate all the isomers, for each , our algorithm then generates the th isomer by backtracking the counting phase of the dynamic programming. 
We also implemented our algorithm for computational experiments.", "pmid": "28113952", "title": "Enumerating Substituted Benzene Isomers of Tree-Like Chemical Graphs."}, {"journal": "Journal of molecular and cellular cardiology", "meshMajor": ["Adult", "Aged", "Cardiomyopathy, Hypertrophic", "Carrier Proteins", "Computer Simulation", "Female", "Gene Expression Profiling", "Genome, Human", "Humans", "Male", "MicroRNAs", "Middle Aged", "Mutation", "Myocardium", "Phosphorylation", "RNA, Messenger", "Reproducibility of Results", "Reverse Transcriptase Polymerase Chain Reaction", "Signal Transduction", "TRPM Cation Channels", "Transcriptome", "Troponin I", "Young Adult"], "year": "2013", "abstractText": "Hypertrophic cardiomyopathy (HCM) is predominantly caused by mutations in genes encoding sarcomeric proteins. One of the most frequent affected genes is MYBPC3, which encodes the thick filament protein cardiac myosin binding protein C. Despite the prevalence of HCM, disease pathology and clinical outcome of sarcomeric mutations are largely unknown. We hypothesized that microRNAs (miRNAs) could play a role in the disease process. To determine which miRNAs were changed in expression, miRNA arrays were performed on heart tissue from HCM patients with a MYBPC3 mutation (n=6) and compared with hearts of non-failing donors (n=6). 532 out of 664 analyzed miRNAs were expressed in at least one heart sample. 13 miRNAs were differentially expressed in HCM compared with donors (at p<0.01, fold change ? 2). The genomic context of these differentially expressed miRNAs revealed that miR-204 (fold change 2.4 in HCM vs. donor) was located in an intron of the TRPM3 gene, encoding an aspecific cation channel involved in calcium entry. RT-PCR analysis revealed a trend towards TRPM3 upregulation in HCM compared with donor myocardium (fold change 2.3, p=0.078). 
In silico identification of mRNA targets of differentially expressed miRNAs showed a large proportion of genes involved in cardiac hypertrophy and cardiac beta-adrenergic receptor signaling and we showed reduced phosphorylation of cardiac troponin I in the HCM myocardium when compared with donor. HCM patients with MYBPC3 mutations have a specific miRNA expression profile. Downstream mRNA targets reveal possible involvement in cardiac signaling pathways.", "pmid": "24083979", "title": "MicroRNA transcriptome profiling in cardiac tissue of hypertrophic cardiomyopathy patients with MYBPC3 mutations."}, {"journal": "Journal of food science", "meshMajor": ["Antibodies", "Enzyme-Linked Immunosorbent Assay", "Fermentation", "Heating", "Hot Temperature", "Humans", "Reproducibility of Results", "Soy Foods", "Soybean Proteins", "Soybeans"], "year": "2014", "abstractText": "UNLABELLED: Soybean is used in processed foods worldwide. Because soybean can cause adverse reactions in some atopic patients, appropriate labeling regarding its content in processed foods is needed to better protect consumers. In the previous study, we developed a reliable sandwich Enzyme Linked Immunosorbent Assay (ELISA) method with high sensitivity and specificity for detecting soybean proteins by using antibody to Gly m Bd 30K, which was originally characterized as a vacuolar protein with a molecular mass of 34 kDa in soybean. The ELISA displayed satisfactory repeatability and reproducibility in an interlaboratory evaluation. However, it could not detect soybean protein in fermented soybean products. We therefore developed an extraction method combined with a heating process to inhibit soybean protein degradation by microbial proteolytic enzymes in fermented soybean products. This extraction method enables the sensitive detection of soybean protein in fermented soybean products such as natto and miso. 
It was able to detect with high-sensitivity soybean protein present at 10 \u00b5g/g levels in model processed foods.
LPS levels were determined by using the recombinant Factor C assay.RESULTS: More than 60% of the annual endotoxin exposure was detected in the PM\u00a0>\u00a010 fraction, showing that bacteria do not aerosolize as independent units or aggregates but adhered to large particles. In Munich 70% of annual exposure was detected between June 12th and August 28th. Multivariate modeling showed that endotoxin levels could be explained by phenological parameters (ie, plant growth). Indeed, days with high airborne endotoxin levels correlated well with the amount of Artemisia pollen in the air. Pollen collected from plants across Europe (100 locations) showed that the highest levels of endotoxin were detected on Artemisia vulgaris (mugwort) pollen, with little on other pollen. Microbiome analysis showed that LPS concentrations on mugwort pollen were related to the presence of Pseudomonas species and Pantoea species communities. In a mouse model of allergic disease, the presence of LPS on mugwort pollen was needed for allergic sensitization.CONCLUSIONS: The majority of airborne endotoxin stems from bacteria dispersed with pollen of only one plant: mugwort. This\u00a0LPS was essential for inducing inflammation of the lung and allergic sensitization.", "pmid": "30012513", "title": "Artemisia pollen is the main vector for airborne endotoxin."}, {"journal": "PloS one", "meshMajor": ["Amino Acid Sequence", "Base Sequence", "Calcium-Binding Proteins", "Calmodulin", "DNA Primers", "HeLa Cells", "Humans", "Membrane Proteins", "Microscopy, Fluorescence", "Subcellular Fractions"], "year": "2011", "abstractText": "The CaBPs represent a subfamily of small EF-hand containing calcium (Ca(2+))-sensing proteins related to calmodulin that regulate key ion channels in the mammalian nervous system. 
In a recent bioinformatic analyses we determined that CaBP7 and CaBP8 form an evolutionarily distinct branch within the CaBPs (also known as the calneurons) a finding that is consistent with earlier observations characterising a putative C-terminal transmembrane (TM) spanning helix in each of these proteins which is essential for their sub-cellular targeting to the Golgi apparatus and constitutive secretory vesicles. The C-terminal position of the predicted TM-helix suggests that CaBP7 and CaBP8 could be processed in a manner analogous to tail-anchored integral membrane proteins which exhibit the ability to insert across membranes post-translationally. In this study we have investigated the topology of CaBP7 and CaBP8 within cellular membranes through a combination of trypsin protection and epitope accessibility analyses. Our results indicate that the TM-helices of CaBP7 and CaBP8 insert fully across membranes such that their extreme C-termini are luminal. The observed type-II membrane topology is consistent with processing of CaBP7 and CaBP8 as true tail-anchored proteins. This targeting mechanism is distinct from any other calmodulin related Ca(2+)-sensor and conceivably underpins unique physiological functions of these proteins.", "pmid": "21445352", "title": "Determination of the membrane topology of the small EF-hand Ca2+-sensing proteins CaBP7 and CaBP8."}, {"journal": "Journal of youth and adolescence", "meshMajor": ["Adolescent", "Adolescent Behavior", "Aggression", "Antisocial Personality Disorder", "Female", "Friends", "Humans", "Male", "Peer Group", "Peer Influence"], "year": "2019", "abstractText": "Growing evidence reveals heterogeneity in antisocial behavior and urges the need to distinguish between aggressive and nonaggressive rule-breaking behaviors. This study characterized how aggression and rule-breaking behaviors shaped peer selection and influence. 
Using a longitudinal social network modeling approach, these questions were addressed in a sample of 1034 ethno-racially diverse early adolescents (49.52% females, Mage\u2009=\u200912.1), who were assessed in fall and spring of the same year. The results showed no evidence of peer selection on aggressive and rule-breaking behaviors, and significant peer influence on aggressive behavior only. Rule-breaking also forecasted a decreased susceptibility to peer influence on aggressive behavior. The findings expanded our knowledge about complex pathways through which heterogeneity in antisocial behavior is reciprocally related to friendship networks.", "pmid": "31440880", "title": "Friendship Network Dynamics of Aggressive and Rule-Breaking Antisocial Behaviors in Adolescence."}, {"journal": "Psychiatric genetics", "meshMajor": ["Chromosome Mapping", "Chromosomes, Human, Pair 2", "Dyslexia", "Genetic Linkage", "Genetic Predisposition to Disease", "Homeodomain Proteins", "Humans", "Membrane Proteins", "Nerve Tissue Proteins", "Otx Transcription Factors", "Transcription Factors"], "year": "2002", "abstractText": "A locus on chromosome 2p12-16 has been implicated in dyslexia susceptibility by two independent linkage studies, including our own study of 119 nuclear twin-based families, each with at least one reading-disabled child. Nonetheless, no variant of any gene has been reported to show association with dyslexia, and no consistent clinical evidence exists to identify candidate genes with any strong a priori logic. We used 21 microsatellite markers spanning 2p12-16 to refine our 1-LOD unit linkage support interval to 12cM between D2S337 and D2S286. Then, in quantitative association analysis, two microsatellites yielded P values<0.05 across a range of reading-related measures (D2S2378 and D2S2114). The exon/intron borders of two positional candidate genes within the region were characterized, and the exons were screened for polymorphisms. 
The genes were Semaphorin4F (SEMA4F), which encodes a protein involved in axonal growth cone guidance, and OTX1, encoding a homeodomain transcription factor involved in forebrain development. Two non-synonymous single nucleotide polymorphisms were found in SEMA4F, each with a heterozygosity of 0.03. One intronic single nucleotide polymorphism between exons 12 and 13 of SEMA4F was tested for quantitative association, but no significant association was found. Only one single nucleotide polymorphism was found in OTX1, which was exonic but silent. Our data therefore suggest that linkage with reading disability at 2p12-16 is not caused by coding variants of SEMA4F or OTX1. Our study outlines the approach necessary for the identification of genetic variants causing dyslexia susceptibility in an epidemiological population of dyslexics.", "pmid": "11901358", "title": "Fine mapping of the chromosome 2p12-16 dyslexia susceptibility locus: quantitative association analysis and positional candidate genes SEMA4F and OTX1."}, {"journal": "PloS one", "meshMajor": ["Cells, Cultured", "Enzyme-Linked Immunosorbent Assay", "Humans", "Interleukin-8", "Lysophospholipids", "NF-kappa B", "Neutrophils", "Sphingosine"], "year": "2014", "abstractText": "The bioactive sphingolipid sphingosine 1-phosphate (S1P) is found in increased amounts in the airways of asthmatics. S1P can regulate airway smooth muscle functions associated with asthmatic inflammation and remodeling, including cytokine secretion. To date however, whether S1P induces secretion of an important chemokine responsible for neutrophilia in airway inflammation--IL-8--was unexplored. The aim of this study was to investigate whether S1P induces IL-8 gene expression and secretion to enhance neutrophil chemotaxis in vitro, as well as examine the molecular mechanisms responsible for repression by the corticosteroid dexamethasone. We show that S1P upregulates IL-8 secretion from ASM cells and enhance neutrophil chemotaxis in vitro. 
The corticosteroid dexamethasone significantly represses IL-8 mRNA expression and protein secretion in a concentration- and time-dependent manner. Additionally, we reveal that S1P-induced IL-8 secretion is p38 MAPK and ERK-dependent and that these key phosphoproteins act on the downstream effector mitogen- and stress-activated kinase 1 (MSK1) to control secretion of the neutrophil chemoattractant cytokine IL-8. The functional relevance of this in vitro data was demonstrated by neutrophil chemotaxis assays where S1P-induced effects can be significantly attenuated by pretreatment with dexamethasone, pharmacological inhibition of p38 MAPK- or ERK-mediated pathways, or by knocking down MSK-1 with siRNA. Taken together, our study reveals the molecular pathways responsible for IL-8 secretion from ASM cells in response to S1P and indicates ways in which the impact on IL-8-driven neutrophilia may be lessened.", "pmid": "24647471", "title": "Sphingosine 1-phosphate induces neutrophil chemoattractant IL-8: repression by steroids."}, {"journal": "Neuroscience letters", "meshMajor": ["Animals", "Behavior, Animal", "Disease Models, Animal", "Male", "Mice", "Mice, Inbred ICR", "Pain", "Peripheral Nervous System Diseases", "Reproducibility of Results", "Spinal Nerves", "Tail", "Temperature"], "year": "2002", "abstractText": "We attempted to develop a mouse model for peripheral neuropathy by a partial injury of the nerve supplying the tail. Under enflurane anesthesia, the unilateral superior caudal trunk was resected between the S3 and S4 spinal nerves. Tests for thermal allodynia were conducted by immersing the tail into 4 or 38 degrees C water. The mechanical allodynia was assessed by stimulating the tail with a von Frey hair (1.96 mN, 0.2 g). After the nerve injury, the experimental animals had shorter tail withdrawal latencies to cold and warm water immersion than the presurgical latency, and exhibited an increase in tail response to von Frey stimulation. 
We interpret these abnormal sensitivities as the signs of mechanical, cold and warm allodynia following the superior caudal trunk injury in the mouse.", "pmid": "11897161", "title": "A mouse model for peripheral neuropathy produced by a partial injury of the nerve supplying the tail."}, {"journal": "Zhongguo Zhong yao za zhi = Zhongguo zhongyao zazhi = China journal of Chinese materia medica", "meshMajor": ["Amino Acids", "Gas Chromatography-Mass Spectrometry", "Oils, Volatile", "Prunella", "Taste"], "year": "2014", "abstractText": "Volatile oil components and the contents and types of amino acid in spica of Prunella vulgaris were analysed by GC-MS and amino acid analyzer. Esters, fatty acids, aromatic hydrocarbon, ketone and several alcohol compounds were identified by mass spectrum comparison. In these ingredients, beta-ionone smelled aroma of cedar, raspberry, nerolidol showed weak sweet soft orange blossom flavor, neroli tasted sweet and fresh, nerolidol tasted sweet with light aroma of wood, hexadecanal showed a weak aroma of flowers and wax, alpha-sinensal had rich and fresh sweet orange flavor. To some extent, these types of aromatic substances can affect the taste of herbal tea or decoction made of Spica Prunellae. Among amino acids detected, natural amino acids accounted for a larger proportion, and those natural amino acids showed bitterness, slight bitterness, sourness (freshness), sweetness, slight sweetness, sourness (slight freshness). 
The results indicated that bitter and slightly bitter amino acids have the greatest impacts on the sense of Spica Prunellae.", "pmid": "24946541", "title": "[Preliminary analysis of bitter substances in spica of Prunella vulgaris]."}, {"journal": "Arquivos brasileiros de cardiologia", "meshMajor": ["Atrial Fibrillation", "Catheter Ablation", "Contrast Media", "Gadolinium", "Heart Atria", "Humans", "Recurrence", "Treatment Outcome"], "year": "2020", "abstractText": "BACKGROUND: Atrial fibrillation (AF) is known to induce atrial remodeling, which promotes fibrosis related to arrhythmogenesis. Accordingly, since scars induced by catheter ablation (CA) can reduce unablated fibrotic areas, greater extent of left atrial (LA) scarring may be associated with less AF recurrence after CA.OBJECTIVES: This study aims to investigate, through systematic review and meta-analysis, whether the amount of LA scarring, seen on late gadolinium enhancement magnetic resonance imaging, is associated with less AF recurrence after CA.METHODS: The recommendations of the MOOSE guideline were followed. Database search was conducted in PubMed and Cochrane Central Register of Controlled Trials (coment?rio 1) until January 2019 (coment?rio 2). Two authors performed screening, data extraction, and quality evaluation. All studies were graded as good quality. A funnel plot was generated, showing no publication bias. Statistical significance was defined as p value < 0.05.RESULTS: Eight observational studies were included in the systematic review, four of which were included in the meta-analysis. Six of the eight studies included in the systematic review showed that greater extension of LA scarring is associated with less AF recurrence after CA. Meta-analysis showed that greater extension of LA scarring is associated with less AF recurrence (SMD = 0.52; 95% CI 0.27 - 0.76; p < 0.0001).CONCLUSION: Greater extension of LA scarring is possibly associated with less AF recurrence after CA. 
Randomized studies that explore ablation methods based on this association are fundamental.", "pmid": "32074201", "title": "Extent of Left Atrial Ablation Lesions and Atrial Fibrillation Recurrence after Catheter Ablation - A Systematic Review and Meta-Analysis."}, {"journal": "Neuroscience letters", "meshMajor": ["Adult", "Color Perception", "Contrast Sensitivity", "Female", "Humans", "Male", "Photic Stimulation", "Retinal Cone Photoreceptor Cells", "Sensory Thresholds", "Visual Pathways", "Young Adult"], "year": "2018", "abstractText": "Flashing light stimulation is often used to investigate the visual system. However, the magnitude of the effect of this stimulus on the various subcortical pathways is not well investigated. The signals of conscious vision are conveyed by the magnocellular, parvocellular and koniocellular pathways. Parvocellular and koniocellular pathways (or more precisely, the L-M opponent and S-cone isolating channels) can be accessed by isoluminant red-green (L-M) and S-cone isolating stimuli, respectively. The main goal of the present study was to explore how costimulation with strong white extrafoveal light flashes alters the perception of stimuli specific to these pathways. Eleven healthy volunteers with negative neurological and ophthalmological history were enrolled for the study. Isoluminance of L-M and S-cone isolating sine-wave gratings was set individually, using the minimum motion procedure. The contrast thresholds for these stimuli as well as for achromatic gratings were determined by an adaptive staircase procedure where subjects had to indicate the orientation (horizontal, oblique or vertical) of the gratings. Thresholds were then determined again while a strong white peripheral light flash was presented 50\u202fms before each trial. Peripheral light flashes significantly (p\u202f<\u202f0.05) increased the contrast thresholds of the achromatic and S-cone isolating stimuli. 
The threshold elevation was especially marked in case of the achromatic stimuli. However, the contrast threshold for the L-M stimuli was not significantly influenced by the light flashes. We conclude that extrafoveally applied light flashes influence predominantly the perception of achromatic stimuli.", "pmid": "29751069", "title": "Extrafoveally applied flashing light affects contrast thresholds of achromatic and S-cone isolating, but not L-M cone modulated stimuli."}, {"journal": "Pediatric pulmonology", "meshMajor": ["Administration, Inhalation", "Bacteremia", "Dose-Response Relationship, Drug", "Female", "Hernia, Diaphragmatic", "Humans", "Hypoxia", "Infant, Newborn", "Infant, Premature", "Infant, Premature, Diseases", "Male", "Meconium Aspiration Syndrome", "Nitric Oxide", "Oxygen Consumption", "Respiratory Insufficiency", "Streptococcal Infections"], "year": "1995", "abstractText": "In acute hypoxemic respiratory failure of term and near-term neonates, extra- and intrapulmonary right-to-left shunting contribute to refractory hypoxemia. Inhaled nitric oxide (NO) decreases pulmonary arterial pressure and improves ventilation-perfusion mismatch in a variety of animal models and selected human patients. We report on 10 consecutive term and near-term newborns with severe acute hypoxemic respiratory failure due to diaphragmatic hernia, meconium aspiration syndrome, group B streptococcus sepsis, pneumonia or acute respiratory distress syndrome, who received increasing doses of inhaled NO (up to 80 ppm) to improve the arterial partial pressure of oxygen (PaO2). The response to NO and the optimum NO concentration which improved PaO2 varied considerably between patients. Improvement of PaO2 was absent or poor (less than 10 mm Hg) in the 4 newborns with meconium aspiration syndrome and in 1 patient with congenital diaphragmatic hernia, while in the other 5 patients inhaled NO increased the mean (+/- SE) PaO2 from 41 +/- 6 to 57 +/- 9 mm Hg (P < 0.05). 
Optimum NO concentrations determined by dose-response measurements performed during the first 8 hr of NO inhalation were 8-16 ppm except for 2 newborns with congenital diaphragmatic hernia who required 32 ppm to effectively increase PaO2. Four of the 5 patients in whom the PaO2 rose by more than 10 mm Hg received inhaled NO for extended periods of time (5 to 23 days) with no signs of tachyphylaxis. The optimum NO concentration dropped to less than 3 ppm after prolonged mechanical ventilation or when intravenous prostacyclin was given concomitantly.(ABSTRACT TRUNCATED AT 250 WORDS)", "pmid": "7567204", "title": "Dose-response to inhaled nitric oxide in acute hypoxemic respiratory failure of newborn infants: a preliminary report."}, {"journal": "Southern medical journal", "meshMajor": ["Diabetes Mellitus, Type 2", "Diagnostic Errors", "Female", "Fetal Hemoglobin", "Glycated Hemoglobin A", "Humans", "Middle Aged"], "year": "2000", "abstractText": "We present the case of a 61- year-old black woman with a diagnosis of type 2 diabetes and a falsely elevated hemoglobin A1c (HbA1c) due to hereditary persistence of fetal hemoglobin. 
Physicians and allied health care professionals are alerted to this potentially significant problem in the diagnosis and management of diabetes mellitus (DM), particularly in the wake of the Diabetes Complications and Control Trial when \"strict\" glycemic control assessed by HbA1c is now the standard of care.", "pmid": "10653068", "title": "Spurious elevation of hemoglobin A1c by hereditary persistence of fetal hemoglobin."}, {"journal": "European review for medical and pharmacological sciences", "meshMajor": ["Adult", "Antiviral Agents", "China", "Drug Resistance, Viral", "Female", "Genotype", "Hepacivirus", "Hepatitis C, Chronic", "Humans", "Male", "Middle Aged", "Mutation", "Prevalence"], "year": "2018", "abstractText": "OBJECTIVE: Although direct-acting antiviral agents (DAAs) for treating hepatitis C virus (HCV) infection have not yet been approved for clinical application at present in China, the development trend is irresistible. DAAs-containing therapeutic regimens have been approved and others are also under development worldwide. In vitro studies have shown that S282T mutation in the NS5B region of HCV is involved in DAAs resistance. The aim of this study was to investigate naturally occurring resistance mutation of S282T in different genotypes of HCV from DAA-treated na?ve Chinese patients who were chronically infected with HCV.PATIENTS AND METHODS: 250 Chinese patients chronically infected with HCV were enrolled in this study. All subjects were na?ve to DAAs. Direct sequencing of HCV NS5B region was performed in all enrolled patients.RESULTS: 70.4% (176/250) cases were infected with HCV genotype 1b, 19.2% (47/250) were 2a, 4.0% (11/250) were 6a, 3.6% (10/250) were 3b, 1.6% (4/250) were 1a and 1.2% (3/250) were 3a. Genotype 4, 5 and 7 were not observed. 
The S282T mutation was not found in any of the cases.CONCLUSIONS: The results showed that the S282T mutation was not prevalent in DAA-treated na?ve Chinese patients who were chronically infected with HCV.", "pmid": "30178855", "title": "Prevalence of S282T mutation in different genotypes of hepatitis C virus from DAA-treated na?ve Chinese patients who were chronically infected with HCV."}, {"journal": "Genome biology and evolution", "meshMajor": ["Ascomycota", "Evolution, Molecular", "Fungal Proteins", "Genes, Fungal", "Magnoliopsida", "Plant Tumors", "Species Specificity"], "year": "2014", "abstractText": "Taphrina fungi are biotrophic plant pathogens that cause plant deformity diseases. We sequenced the genomes of four Taphrina species-Taphrina wiesneri, T. deformans, T. flavorubra, and T. populina-which parasitize Prunus, Cerasus, and Populus hosts with varying severity of disease symptoms. High levels of gene synteny within Taphrina species were observed, and our comparative analysis further revealed that these fungi may utilize multiple strategies in coping with the host environment that are also found in some specialized dimorphic species. These include species-specific aneuploidy and clusters of highly diverged secreted proteins located at subtelomeres. We also identified species differences in plant hormone biosynthesis pathways, which may contribute to varying degree of disease symptoms. 
The genomes provide a rich resource for investigation into Taphrina biology and evolutionary studies across the basal ascomycetes clade.", "pmid": "24682155", "title": "Comparative genomics of Taphrina fungi causing varying degrees of tumorous deformity in plants."}, {"journal": "Journal of zoo and wildlife medicine : official publication of the American Association of Zoo Veterinarians", "meshMajor": ["Acaricides", "Animals", "Animals, Zoo", "Arvicolinae", "California", "Disease Eradication", "Endangered Species", "Ivermectin", "Mite Infestations", "Permethrin", "Rodent Diseases"], "year": "2018", "abstractText": "\u2003 Staff at a university laboratory responsible for management of a captive insurance colony of endangered Amargosa voles ( Microtus californicus scirpensis) discovered an outbreak of tropical rat mites ( Ornithonyssus bacoti) infesting 106 voles. This bloodsucking mesostigmatid mite typically occurs in laboratory settings and can cause weight loss, wounds, or other negative impacts on health. The source of the infestation was likely feral rodents, and the route was suspected to be straw bedding. Twenty-nine of the 106 (27.4%) infested voles developed ulcerated dorsal skin lesions that resolved when treated with topical selamectin. A triad approach was implemented to eradicate the mites, consisting of environmental management, individual animal treatment, and training. Voles were moved individually into a clean room containing only autoclaved materials (including straw), cages were treated with permethrin-impregnated cotton, treatment order was instituted to avoid transferring mites, and voles coming from outside were quarantined. All animals in an infested room were treated with topical selamectin, and personnel were trained on risks and new procedures. No adverse effects from the use of selamectin were identified, and this efficient protocol does not require the long-term use of acaricides. 
This report documents infestation of an endangered rodent with an exotic parasite, safe use of permethrin and selamectin in this species, and comprehensive management to manage a large infestation.", "pmid": "29900773", "title": "ERADICATION OF A TROPICAL RAT MITE ( ORNITHONYSSUS BACOTI) INFESTATION FROM A CAPTIVE COLONY OF ENDANGERED AMARGOSA VOLES ( MICROTUS CALIFORNICUS SCIRPENSIS)."}, {"journal": "Lancet (London, England)", "meshMajor": ["Attitude to Health", "Child", "Child, Preschool", "Diphtheria", "England", "Humans", "Immunization", "Immunization Schedule", "Infant", "Informed Consent", "Measles", "Parents", "Poliomyelitis", "Tetanus", "Vaccination", "Whooping Cough"], "year": "1977", "abstractText": "Immunisation levels in West Sussex for the period 1970 to 1976 were reviewed. The national decline in immunisation against whooping cough was reflected locally in 1974, 1975, and 1976 but, in contrast to the national experience, levels of immunisation against diphtheria, tetanus, poliomyelitis, or measles did not decline. It is suggested that this difference is accounted for by the use of a computer system which keeps an \"immunisation diary\" for parents and for medical and nursing staff.", "pmid": "72302", "title": "Immunisation levels---need they all decline?"}, {"journal": "Terapevticheskii arkhiv", "meshMajor": ["Asthma", "Case-Control Studies", "Forced Expiratory Volume", "Humans", "Obesity", "Plethysmography", "Total Lung Capacity", "Vital Capacity"], "year": "2019", "abstractText": "AIM: To assess the functional status of the small Airways in patients with bronchial asthma associated with obesity, by body plethysmography.MATERIALS AND METHODS: 65 patients with bronchial asthma of mild severity, partially controlled course, including 30 patients with normal body weight and 35 patients with obesity of I degree were examined. Control group-30 healthy volunteers. 
Examined forced vital capacity (FVC), forced expiratory volume in first second (FEV1) ratio of FEV1 to FVC (FEV1/FVC), maximum volumetric exhalation rate after 25.50 and 75% FVC (MEF75, MEF50, MEF25), average flow velocity in the exhalation interval 25-75% of FVC (MMEF25-75). Method bodyplethysmography was evaluated in bronchial resistance, functional residual capacity (FRC), residual volume of the lungs (RV), total lung capacity (TLC), the percentage of RV/TLC.RESULTS: Patients with bronchial asthma with obesity showed a reduction of indicators of bronchial obstruction: FEV1 of 14% (p=0.02), FEV1/FVC by 14% (p=0.001), MEF75 30% (p=0.001), MEF50 by 35% (p=0.001), MEF25 by 44% (p=0.003), MMEF25-75 by 38% (p=0.001). The increase of bronchial resistance on inhalation in 2 times (p=0.001), on exhalation in 3.3 times (p=0.003) was found, which is typical for generalized bronchial obstruction at the proximal level. An increase in RV by 24% (p=0.03), TLC - by 9% (p=0.03), RV/TLC - by 18% (p=0.03), indicating the presence of "air traps" and dysfunction of the small respiratory tract.CONCLUSION: In patients with asthma of mild severity associated with obesity, both the central bronchis and the distal lung are affected, which are manifested by generalized bronchial obstruction, the formation of "air traps" and dysfunction of the small respiratory tract.", "pmid": "31090373", "title": "Functional state of the small airways in patients with bronchial asthma associated with obesity."}, {"journal": "Neirofiziologiia = Neurophysiology", "meshMajor": ["Afferent Pathways", "Animals", "Brain Mapping", "Brain Stem", "Cats", "Diencephalon", "Hypothalamic Area, Lateral", "Hypothalamus", "Hypothalamus, Posterior", "Locomotion", "Mesencephalon"], "year": "1984", "abstractText": "Afferent brainstem projections to the functionally identified hypothalamic locomotor region were studied in cat by means of the horse-radish peroxidase technique. 
Cells of origin of such projections were found bilaterally in the different brainstem structures. Most of these cells were found in sites of location of monoaminergic systems (nucleus reticularis lateralis, locus coeruleus, nucleus tractus solitarius, raphe nuclei, substantia grisea centralis) parabrachial and certain sensory nuclei of the brainstem. The hypothalamic locomotor regions have mutual bilateral interconnections.", "pmid": "6462287", "title": "[Afferent brain stem connections of the hypothalamic locomotor area of the cat brain]."}, {"journal": "The Journal of infection", "meshMajor": ["Adolescent", "Adult", "Aged", "Aged, 80 and over", "Child", "Child, Preschool", "Female", "Humans", "Infant", "Infant, Newborn", "Male", "Middle Aged", "Multilocus Sequence Typing", "Pneumococcal Infections", "Pneumococcal Vaccines", "Prevalence", "Prospective Studies", "Serotyping", "Spain", "Streptococcus pneumoniae", "Vaccines, Conjugate", "Young Adult"], "year": "2011", "abstractText": "OBJECTIVES: The objective of this study was to learn the serotype distribution and clonal composition of pneumococci causing invasive pneumococcal disease (IPD) in children and adults in Spain before the introduction of new 10-valent (PCV10) and 13-valent (PCV13) conjugate vaccines.METHODS: This is a 1-year prospective study including all patients with culture-proved IPD admitted to 30 medical centers in Catalonia, Spain, during the year 2009.RESULTS: A total of 614 episodes of IPD occurred in 612 patients. The rates of IPD were highest in children aged <24 months and adults >64 years (64.5 and 44.7 per 100,000 population). The burden of disease was mainly due to pneumonia in all age ranges. 609 of 614 strains were serotyped and 47 different serotypes were found. Among the 609 IPD cases with known serotype, 12.2% were caused by PCV7 serotypes, 51% by PCV10 serotypes, and 71.7% by PCV13 serotypes. 608 of 614 isolates were characterized by MLST. 
The main clonal types detected were ST306, CC191 and CC230.CONCLUSIONS: PCV13 conjugate vaccine offers good coverage against IPD in Catalonia, Spain. However, the high genetic diversity of pneumococci highlights the importance of molecular surveillance systems for monitoring IPD during the vaccination period.SUMMARY: This study shows that 13-valent conjugate vaccine offers good coverage against invasive pneumococcal disease in children and adults in Spain. However, the high genetic diversity of pneumococci highlights the importance of molecular surveillance systems for monitoring IPD during the vaccination period.", "pmid": "21679725", "title": "Serotypes and clones causing invasive pneumococcal disease before the use of new conjugate vaccines in Catalonia, Spain."}, {"journal": "Langmuir : the ACS journal of surfaces and colloids", "meshMajor": ["Animals", "Cell Culture Techniques", "Cells, Cultured", "Dendrites", "Nerve Net", "Neurites", "Neurons", "Rats"], "year": "2018", "abstractText": "Despite significant progress, our knowledge of the functioning of the central nervous system still remains scarce to date. A better understanding of its behavior, in either normal or diseased conditions, goes through an increased knowledge of basic mechanisms involved in neuronal function, including at the single-cell level. This has motivated significant efforts for the development of miniaturized sensing devices to monitor neuronal activity with high spatial and signal resolution. One of the main challenges remaining to be addressed in this domain is, however, the ability to create in vitro spatially ordered neuronal networks at low density with a precise control of the cell location to ensure proper monitoring of the activity of a defined set of neurons. 
Here, we present a novel self-aligned chemical functionalization method, based on a repellant surface with patterned attractive areas, which permits the elaboration of low-density neuronal network down to individual cells with a high control of the soma location and axonal growth. This approach is compatible with complementary metal-oxide-semiconductor line technology at a wafer scale and allows performing the cell culture on packaged chip outside microelectronics facilities. Rat cortical neurons were cultured on such patterned surfaces for over one month and displayed a very high degree of organization in large networks. Indeed, more than 90% of the network nodes were settled by a soma and 100% of the connecting lines were occupied by a neurite, with a very good selectivity (low parasitic cell connections). After optimization, networks composed of 75% of unicellular nodes were obtained, together with a control at the micron scale of the location of the somas. Finally, we demonstrated that the dendritic neuronal growth was guided by the surface functionalization, even when micrometer scale topologies were encountered and we succeeded to control the extension growth along one-dimensional-aligned nanostructures with sub-micrometrical scale precision. 
This novel approach now opens the way for precise monitoring of neuronal network activity at the single-cell level.", "pmid": "29754481", "title": "Self-Aligned Functionalization Approach to Order Neuronal Networks at the Single-Cell Level."}, {"journal": "Langmuir : the ACS journal of surfaces and colloids", "meshMajor": ["Aluminum", "Biosensing Techniques", "DNA", "Electric Conductivity", "Electrochemistry", "Electrodes", "Ions", "Nanostructures", "Nucleic Acid Hybridization", "Particle Size", "Porosity", "Sensitivity and Specificity"], "year": "2005", "abstractText": "We show that nanoporous alumina modified with covalently linked DNA can be used to detect target DNA by monitoring the increase in impedance at the electrode upon DNA hybridization, which resulted from blocking the pores to ionic flow. Using cyclic voltammetry, direct current conductance, and impedance spectroscopy we confirm the importance of pore size: the effect is observed with 20-nm-diameter pores and is absent for 200-nm pores.", "pmid": "15896007", "title": "Sensing DNA hybridization via ionic conductance through a nanoporous electrode."}, {"journal": "Heart rhythm", "meshMajor": ["Administration, Oral", "Anticoagulants", "Atrial Fibrillation", "Electronics, Medical", "Humans", "Prostheses and Implants", "Risk Factors", "Sensitivity and Specificity", "Stroke"], "year": "2014", "abstractText": "The detection of atrial fibrillation (AF) by a cardiac implantable electronic device (CIED) in patients without a prior history of AF is increasing. This trend is the result of the increased number of CIEDs being implanted in a population whose multiple medical comorbidities are known to predispose to AF. Cardiac implantable electronic device-detected atrial fibrillation (CDAF) is independently associated with the development of ischemic stroke, and the annual risk may depend on both total AF burden and individual risk factors. 
No data evaluating the benefit of oral anticoagulation in this population are available, which makes the decision to initiate anticoagulation challenging. This review analyzes the available data on CDAF and the associated risk of ischemic stroke, and it presents a rationale for the use of long-term oral anticoagulation in this population. ", "pmid": "24394157", "title": "Clinical significance of atrial fibrillation detected by cardiac implantable electronic devices."}, {"journal": "Analytical chemistry", "meshMajor": ["Amino Acids", "Animals", "Ascorbic Acid", "Brain Ischemia", "Carbon", "Cerebral Cortex", "Copper", "Electrochemical Techniques", "Electrodes", "Ethylenediamines", "Ions", "Male", "Nanocomposites", "Pyridines", "Rats", "Rats, Sprague-Dawley", "Surface Properties", "Uric Acid"], "year": "2013", "abstractText": "Direct determination of cerebral metal ions in small volume biological samples is still the bottleneck for evaluating the roles that metal ions play in the physiological and pathological processes. In this work, selected copper ion (Cu(2+)) as a model, a facile and direct electrochemical method for detection of Cu(2+) has been developed on the basis of two new designed strategies: one is specific recognition molecule for Cu(2+)-AE-TPEA (N-(2-aminoethyl)-N,N',N'-tris(pyridine-2-yl-methyl)ethane-1,2-diamine); another is carbon dots (C-Dots) with high electrocatalytic activity. Based on the high affinity between TPEA and Cu(2+), the electrode assembled with C-Dot-TPEA hybridized nanocomposites shows high selectivity toward Cu(2+) over other metal ions, amino acids, and biological coexisting species, such as uric acid (UA), ascorbic acid (AA), and so on, which makes it possible to be used for determination of Cu(2+) in the complex brain system. By taking advantage of C-Dots, a dynamic linear range from 1 \u00ecM to 60 \u00ecM is first achieved with a detection limit of ?100 nM in aCSF solution. 
In addition, the developed method with theoretical simplicity and less instrumental demands exhibits long-term stability and good reproducibility. As a result, the present strategy has been successfully applied in detection of cerebral Cu(2+) in normal rat brain and that followed by global cerebral ischemia, combined with in vivo microdialysis. The determined concentrations of Cu(2+) in the rat brain microdialysates by the present method are found to be identical to those obtained by the conventional ICP-AES method.", "pmid": "23214718", "title": "Highly selective electrochemical strategy for monitoring of cerebral Cu2+ based on a carbon Dot-TPEA hybridized surface."}, {"journal": "Investigative ophthalmology & visual science", "meshMajor": ["Aged", "Aged, 80 and over", "Contrast Sensitivity", "Flicker Fusion", "Glaucoma, Open-Angle", "Humans", "Middle Aged", "Reproducibility of Results", "Vision Disorders", "Visual Acuity", "Visual Field Tests", "Visual Fields"], "year": "1997", "abstractText": "PURPOSE: The authors compared the efficacy of two different forms of flicker perimetry: temporal modulation perimetry (TMP), which measures contrast thresholds for a fixed temporal frequency, and critical flicker frequency (CFF), which measures the highest frequency for which flicker is detected at a fixed contrast.METHODS: The authors compared 16 patients with early to moderate glaucomatous visual field loss with 16 age-matched normal controls. Flicker stimuli consisted of 2 degrees diameter targets of 2 seconds in duration, presented in 44 locations throughout the central 30 degrees visual field. Flicker was presented within a cosine envelope to avoid temporal transients. For TMP, contrast sensitivity thresholds were measured for 8-Hz sinusoidal flicker; CFF thresholds were measured for a stimulus of 100% contrast.RESULTS: The results indicate that TMP and CFF produced similar test-retest reliability in normals. CFF had slightly better reliability in glaucoma patients. 
Receiver operating characteristic analysis revealed that TMP could provide better separation of normals and glaucoma patients than did CFF. Similar findings were obtained when the thresholds for both procedures were converted to Z scores.CONCLUSIONS: Both methods of flicker perimetry testing provide acceptable test-retest reliability, and both can distinguish normal subjects from glaucoma patients. However, TMP is more effective in separating normal subjects from glaucoma patients than CFF, suggesting that TMP is the method of choice for detecting glaucomatous damage using flicker perimetry.", "pmid": "9344350", "title": "Which method of flicker perimetry is most effective for detection of glaucomatous visual field loss?"}, {"journal": "Bio Systems", "meshMajor": ["Electronic Data Processing", "Models, Organizational", "Systems Biology"], "year": "2006", "abstractText": "The structure of a system influences its adaptability. An important result of adaptability theory is that subsystem independence increases adaptability [Conrad, M., 1983. Adaptability. Plenum Press, New York]. Adaptability is essential in systems that face an uncertain environment such as biological systems and organizations. Modern organizations are the product of human design. And so it is their structure and the effect that it has on their adaptability. In this paper we explore the potential effects of computer-based information processing on the adaptability of organizations. 
The integration of computer-based processes into the dynamics of the functions they support and the effect it has on subsystem independence are especially relevant to our analysis.", "pmid": "16757096", "title": "Adaptability and the integration of computer-based information processing into the dynamics of organizations."}, {"journal": "American journal of clinical pathology", "meshMajor": ["Adult", "Aged", "Aged, 80 and over", "Antigens, CD", "Bone Marrow", "Bone Marrow Cells", "Female", "Flow Cytometry", "Hematologic Neoplasms", "Humans", "Immunophenotyping", "Male", "Middle Aged", "Myelodysplastic Syndromes", "Myeloproliferative Disorders", "Young Adult"], "year": "2014", "abstractText": "OBJECTIVES: Flow cytometry immunophenotyping has been suggested as an adjunctive technique in the evaluation of myeloid malignancies, especially in the myelodysplastic syndromes. However, its use has been limited due to complexity and cost restraints. The goal of this study is to attempt a simpler approach to flow cytometry immunophenotyping in myeloid neoplasms.METHODS: We analyzed bone marrow specimens of 45 selected patients and an additional 99 consecutive random patients using a limited antibody panel.RESULTS: Normal CD34-positive blasts show a characteristic pattern of CD13/HLA-DR expression, with three readily identifiable subpopulations. 
In contrast, myeloid neoplasms frequently show loss of this heterogeneity.CONCLUSIONS: Analysis of a limited antibody panel with a focus on CD13/HLA-DR expression provides relatively high specificity and sensitivity for the detection of myeloid neoplasms.", "pmid": "25125617", "title": "Loss of blast heterogeneity in myelodysplastic syndrome and other chronic myeloid neoplasms."}, {"journal": "Oncology reports", "meshMajor": ["Activin Receptors, Type I", "Cell Division", "Humans", "Immunoprecipitation", "Membrane Proteins", "Neoplasm Invasiveness", "Oligonucleotides, Antisense", "Protein-Serine-Threonine Kinases", "RNA, Messenger", "Receptor, Transforming Growth Factor-beta Type I", "Receptor, Transforming Growth Factor-beta Type II", "Receptors, Transforming Growth Factor beta", "Reverse Transcriptase Polymerase Chain Reaction", "Stomach Neoplasms", "Thionucleotides", "Transforming Growth Factor beta", "Tumor Cells, Cultured"], "year": "2004", "abstractText": "Non-metastatic gene A (nma) has a homologue DNA sequence to a gene of bone morphogenetic proteins and activin membrane-bound inhibitor (BAMBI), which negatively regulates TGF-beta signaling. In this study, we analyzed the functional homology between Nma and BAMBI in human gastric carcinoma cell lines. Various levels of nma mRNA expression were detected by the RT-PCR technique in all human gastric carcinoma cell lines. Then, Nma antisense and sense S-oligodeoxynucleotide (ODN) were used to analyze the response of TGF-beta to cell growth and invasion gastric carcinoma cell lines. The cell growth was inhibited by TGF-beta in Nma antisense S-ODN treatment gastric carcinoma cell lines, MKN28, MKN1, MKN74 and TMK1. TGF-beta reduced cell growth and invasive activity of MKN28 treated with Nma antisense S-ODN in a dose and time-dependent manner. Furthermore, lysates of Nma sense or antisense S-ODN treated MKN28 cells were immunoprecipitated with anti-TGFbetaR-I or anti-TGFbetaR-II antibody. 
The 29 kDa signal considered as Nma appeared in sense S-ODN treated MKN28 cells immunoprecipitated with anti-TGFbetaR-I. These results indicate that Nma negatively regulates TGF-beta signaling, consequently playing an important role as one of the escape mechanisms from TGF-beta-mediated growth control similarly to BAMBI, and induce cell growth and invasion in human gastric carcinoma cell lines.", "pmid": "15138559", "title": "Effect of Nma on growth inhibition by TGF-betaa in human gastric carcinoma cell lines."}, {"journal": "Annals of hepatology", "meshMajor": ["Adolescent", "Alanine Transaminase", "Aspartate Aminotransferases", "Biomarkers", "Body Mass Index", "Chi-Square Distribution", "Child", "Cross-Sectional Studies", "Elasticity Imaging Techniques", "Female", "Humans", "Hypertension", "India", "Lipids", "Liver Cirrhosis", "Logistic Models", "Male", "Non-alcoholic Fatty Liver Disease", "Pediatric Obesity", "Platelet Count", "Predictive Value of Tests", "Prevalence", "Risk Factors"], "year": null, "abstractText": "\u00a0Background and rationale. Nonalcoholic fatty liver disease (NAFLD) is the most common cause of pediatric liver disease in western countries. Its prevalence in Indian subcontinent is not well studied.MATERIAL AND METHODS: In a school based cross sectional study we have screened overweight and obese children in the age group of 11 to 15 years for NAFLD. Ultrasonography, elevated serum transaminases, fibroscan were used for defining NAFLD. Dietary habits, blood pressure, serum lipid profile, blood counts and insulin resistance were recorded. The relation of fibrosis 4 score, pediatric NAFLD fibrosis index, aspartate transaminases to platelet ratio index (APRI) with fibroscan was evaluated.RESULTS: Out of 616 students screened 198 were overweight and obese. Hundred students and their parents gave informed consent for the further evaluation. The prevalence of NAFLD was 62% in overweight and obese children. 
Fatty liver was found in 50 % students on ultrasonography, liver stiffness (? 6.1 Kilopascals) in 23% and raised alanine transaminase in 30%. Hypertension, dyslipidemia, diabetes mellitus and insulin resistance were seen in 6%, 18%, 2% and 66% students respectively. Systolic hypertension, serum triglyceride, aspartate transaminase, APRI was significantly higher in the NAFLD group. On binary logistic regression only systolic hypertension was an independent risk factor for NAFLD.CONCLUSION: In conclusion NAFLD is common in asymptomatic overweight and obese Indian children. Systolic hypertension is the only independent factor associated with NAFLD. Fibroscan has limited role for screening. We recommend screening for NAFLD in this high risk group with alanine transaminases and ultrasonography.", "pmid": "27740518", "title": "\u00a0Most overweight and obese Indian children have nonalcoholic fatty liver disease."}, {"journal": "Journal of cataract and refractive surgery", "meshMajor": ["Conjunctiva", "Follow-Up Studies", "Glaucoma, Open-Angle", "Humans", "Intraocular Pressure", "Sclera", "Surgical Flaps", "Trabeculectomy"], "year": "1994", "abstractText": "A modified surgical technique is described for trabeculectomy using contemporary limbal incisional techniques for a simplified dissection of the lamellar scleral flap, a technically easier operation, with smoothly dissected surfaces. 
The results and complications are comparable to those using the standard method.", "pmid": "8064615", "title": "Trabeculectomy: a modified surgical technique."}, {"journal": "International journal of cancer", "meshMajor": ["Animals", "Antibodies", "Antibody-Dependent Cell Cytotoxicity", "Cell Line", "Isoantigens", "Killer Cells, Natural", "Mice", "Mice, Inbred BALB C", "Mice, Inbred C3H", "Mice, Inbred C57BL", "Mice, Nude", "Neoplasms, Experimental", "Receptors, Fc", "Spleen"], "year": "1981", "abstractText": "Antibodies reactive with effector cells were shown to augment the cytotoxicity of spleen cells from athymic nude and euthymic mice. The addition of alloantibody to the assay or pretreatment of the effector cells with alloantibody resulted in increased cytotoxicity against the human cell K562, a relatively poor target for spontaneous mouse NK activity. When monoclonal antibodies were tested, cytotoxicity was markedly increased by some antibodies, such as anti-H-2, anti-la, and anti-Thy 1.2, while others had no effect. The degree of augmentation of cytotoxicity was dependent on the concentration of antibody added. Nylon-wool-nonadherent nu/nu splenic effector cells mediated the antibody-induced cytotoxicity and anti-asialo GMI plus complement abolished activity, indicating that the cells mediating the cytotoxicity were NK cells and not mature T cells, B cells or macrophages. When spleen cells from mice having different levels of NK activity were evaluated in this system, the magnitude of augmentation by antibody correlated with the level of spontaneous NK activity and no increased cytotoxicity was found with cell populations that had low spontaneous NK activity. Testing a panel of target cells, showed that certain human and mouse cell lines, with low to moderate susceptibility to spontaneous NK activity, were sensitive to antibody-induced NK-cell-mediated cytotoxicity whereas others were completely resistant. 
Both Fc-IgG receptor-positive and -negative cell lines were susceptible target cells. These results indicate that antibodies reactive with murine NK cells can increase their cytolytic activity.", "pmid": "7287215", "title": "Antibody-induced augmentation of murine natural killer cell activity."}, {"journal": "Pediatrics", "meshMajor": ["Adolescent", "Adolescent Behavior", "Bulimia", "Comorbidity", "Depression", "Female", "Humans", "Hyperphagia", "Male", "Minnesota", "Obesity", "Prevalence", "Risk Assessment", "Self Concept", "Sex Distribution", "Suicide, Attempted"], "year": "2003", "abstractText": "OBJECTIVE: To assess the prevalence of overeating among adolescents and to examine associations between overeating and sociodemographic characteristics, weight status, dieting behaviors, body satisfaction, depressive mood, self-esteem, and suicide.METHOD: A school-based sample of 4746 boys and girls in public middle and high schools in Minnesota completed the Project EAT (Eating Among Teens) survey and anthropometric measurements of height and weight.RESULTS: Overall, 17.3% of girls and 7.8% of boys reported objective overeating in the past year. Youths who engaged in overeating were more likely to be overweight or obese, to have dieted in the past year, to be trying to lose weight currently, and to report that weight and shape are very important to their overall feelings about self. Youths who met criteria for binge eating syndrome (high frequency of objective overeating with loss of control and distress regarding the binge eating) scored significantly lower on measures of body satisfaction and self-esteem and higher on a measure of depressive mood than those who reported either subclinical or no binge eating. 
Overeating was associated with suicide risk; more than one fourth of girls (28.6%) and boys (27.8%) who met criteria for binge eating syndrome reported that they had attempted suicide.CONCLUSIONS: Overeating among adolescents is associated with a number of adverse behaviors and negative psychological experiences. As the current study is cross-sectional, it is not possible to ascertain cause and effect. Future research should seek to identify whether objective overeating is an early warning sign of additional psychological distress or is a potential consequence of compromised psychological health. Clinical implications are discussed.", "pmid": "12509556", "title": "Overeating among adolescents: prevalence and associations with weight-related characteristics and psychological health."}, {"journal": "Chemosphere", "meshMajor": ["Biotechnology", "Chemical Fractionation", "Conservation of Natural Resources", "Decontamination", "Kinetics", "Metals, Heavy", "Plants", "Saponins", "Soil Pollutants", "Surface-Active Agents"], "year": "2002", "abstractText": "A washing process was studied to evaluate the efficiency of saponin on remediating heavy metal contaminated soils. Three different types of soils (Andosol: soil A, Cambisol: soil B, Regosol: soil C) were washed with saponin in batch experiments. Utilization of saponin was effective for removal of heavy metals from soils, attaining 90-100% of Cd and 85-98% of Zn extractions. The fractionations of heavy metals removed by saponin were identified using the sequential extraction. Saponin was effective in removing the exchangeable and carbonated fractions of heavy metals from soils. In recovery procedures, the pH of soil leachates was increased to about 10.7, leading to separate heavy metals as hydroxide precipitates and saponin solute. In addition recycle of used saponin is considered to be effective for the subsequent utilization. 
The limits of Japanese leaching test were met for all of the soil residues after saponin treatment. As a whole, this study shows that saponin can be used as a cleaning agent for remediation of heavy metal contaminated soils.", "pmid": "12365835", "title": "Evaluation of remediation process with plant-derived biosurfactant for recovery of heavy metals from contaminated soils."}, {"journal": "International journal of hygiene and environmental health", "meshMajor": ["Animals", "Cattle", "Colony Count, Microbial", "DNA, Bacterial", "Escherichia coli", "Food Handling", "Meat"], "year": "2007", "abstractText": "A molecular-based detection method was developed to detect Escherichia coli O26, O111 and O157 in minced (ground) beef samples. This method consists of an initial overnight enrichment in modified tryptone soya broth (mTSB) and novobiocin prior to DNA extraction and subsequent serogrouping using a triplex PCR. This method has a low limit of detection and results are available within 24 hours of receipt of samples. Once optimized, this rapid method was utilized to determine the prevalence of these E. coli serogroups in six hundred minced beef samples all of which were previously examined by immunomagnetic separation (IMS) and selective plating for E. coli O26 and O111. Using IMS, two E. coli O26 isolates were detected. No E. coli O111 were recovered. The multiplex PCR technique described here did not detect E. coli O111 nor O157 in any of the samples, however six minced beef samples were positive for E. coli O26 using our method, only two of these were previously detected by IMS and culture. 
Application of molecular methods are useful to support culture-based approaches thereby further contributing to risk reduction along the food chain.", "pmid": "17118703", "title": "Development and assessment of a rapid method to detect Escherichia coli O26, O111 and O157 in retail minced beef."}, {"journal": "PloS one", "meshMajor": ["Aged", "Diabetes Mellitus, Type 2", "Electrocardiography", "Female", "Follow-Up Studies", "Glycated Hemoglobin A", "Guideline Adherence", "Humans", "Hypoglycemic Agents", "Kidney Function Tests", "Lipoproteins", "Luxembourg", "Male", "Middle Aged", "Patient Compliance", "Practice Guidelines as Topic", "Proportional Hazards Models"], "year": "2013", "abstractText": "INTRODUCTION: Type 2 diabetes is associated with severe micro- and macro-vascular complications. Physicians' and patients' adherence to follow-up guidelines permits postponing or reducing these complications. The objectives were to assess the level of adherence to fundamental follow-up guidelines and determine patients' characteristics associated with this level of adherence in the context of Luxembourg, where no guidelines were implemented.STUDY POPULATION: The exhaustive residing population treated for type 2 diabetes in Luxembourg during the 2000-2006 period (N = 21,068).METHODS: Seven fundamental criteria were extracted from international guidelines (consultation with the treating physician, HbA1c tests, electrocardiogram, retinal, dental, lipid and renal check-ups). The factors associated with the level of adherence to those criteria were identified using a partial proportional odds model.RESULTS: In 2006, despite 90% of the patients consulted at least 4 times their treating physician, only 0.6% completed all criteria; 55.0% had no HbA1c test (-8.6 points since 2000) and 31.1% had a renal check-up (+21.6 points). 
The sex (OR(male): 0.87 [95%CI, 0.83-0.92]), the nationality (OR(NonEU): 0.64 [0.52-0.78]), the type of antidiabetic treatment (ORoral: 1.48 [1.35-1.63], OR(mixed): 1.35 [1.20-1.52]) and the type of treating physician (ORG-ID: 0.47 [0.42-0.53]) were the main factors associated with the level of adherence in 2006 (3 or more criteria).CONCLUSION: A large percentage of patients were not provided with a systematic annual follow-up between 2000 and 2006. This study highlighted the necessity to promote guidelines in Luxembourg, education for physicians and to launch a national discussion on a disease management program for diabetic patients.", "pmid": "24244637", "title": "Adherence to international follow-up guidelines in type 2 diabetes: a longitudinal cohort study in Luxembourg."}, {"journal": "Canadian journal of microbiology", "meshMajor": ["Agriculture", "Animals", "Anti-Bacterial Agents", "Bacteria", "Cattle", "Ceftriaxone", "Chickens", "Dairying", "Drug Resistance, Bacterial", "Drug Resistance, Multiple, Bacterial", "Feces"], "year": "2006", "abstractText": "Approximately 40 samples of animal feces, drinking water, feed, bedding, pine wood shavings, compost, and manure slurry were collected from two animal research farms (one dairy and one poultry) and analyzed for ceftriaxone-resistant bacteria. Our study revealed that the total percentage of aerobic bacteria with reduced susceptibility to ceftriaxone (minimal inhibitory concentration (MIC) > or = 16 micro g/mL) ranged from 0.9% to 10.8% in dairy feces and from 0.05% to 3.93% in chicken feces. The percentages of ceftriaxone-resistant bacteria (MIC > or = 64 micro g/mL) were in the range of 0.01% - 2.3% in dairy feces and 0.01% - 0.79% in chicken feces. Environmental samples contained a wide range of ceftriaxone-resistant bacterial populations. 
Among those environmental samples, fresh pine wood shavings used as chicken bedding contained the highest percentages (41.5%) of ceftriaxone-resistant bacteria, as determined by a plating method. A total of 105 ceftriaxone-resistant (MIC > or = 128 micro g/mL) bacterial isolates were isolated from the above samples and tested for resistance to nine antibiotics: ampicillin, ceftriaxone, streptomycin, kanamycin, gentamicin, chloramphenicol, tetracycline, ciprofloxacin, and nalidixic acid. The most prevalent resistance pattern (34.3%) among isolates included resistance to all nine antibiotics. Results from this study suggest that ceftriaxone-resistant bacteria exist in farm environments, and the ceftriaxone resistance was frequently associated with resistance to multiple antibiotics. Environmental sources such as pine wood shavings used as bedding can be a potential reservoir for transmitting the multidrug-resistant bacteria.", "pmid": "17110962", "title": "Occurrence of ceftriaxone-resistant commensal bacteria on a dairy farm and a poultry farm."}, {"journal": "Lancet (London, England)", "meshMajor": ["Adult", "Chlorambucil", "Drug Administration Schedule", "Drug Evaluation", "Female", "Follow-Up Studies", "Glomerulonephritis, Membranous", "Humans", "Male", "Middle Aged", "Nephrotic Syndrome", "Pancytopenia", "Prednisolone", "Proteinuria", "Risk Factors"], "year": "1988", "abstractText": "Eight patients with idiopathic membranous nephropathy whose renal function was deteriorating were given a 6-month course of alternating monthly cycles of prednisolone and chlorambucil. Proteinuria was reduced in all eight, from a mean (SD) of 15.3 (5.9) g/24 h at the start of treatment to 2.1 (1.5) g/24 h at follow-up (p less than 0.05). Creatinine clearance increased in six, and the rate of decline was reduced in the other two (group mean 51.6 [17.8] ml/min at the start of treatment and 81.4 [36.8] ml/min at follow-up; p less than 0.05). 
Adverse effects of chlorambucil were severe, and the daily dose had to be reduced. Prednisolone and chlorambucil treatment can change the natural course of membranous nephropathy even when renal function has started to deteriorate, so treatment can be reserved for high-risk patients.", "pmid": "2902317", "title": "Prednisolone and chlorambucil treatment in idiopathic membranous nephropathy with deteriorating renal function."}, {"journal": "The Journal of neuroscience : the official journal of the Society for Neuroscience", "meshMajor": ["Animals", "Cocaine", "Cyclic AMP Response Element-Binding Protein", "Male", "Mice", "Mice, Inbred C57BL", "Mice, Transgenic", "Nucleus Accumbens", "Prosencephalon", "Rats", "Rats, Sprague-Dawley", "Reward"], "year": "2009", "abstractText": "The transcription factor cAMP response element-binding protein (CREB) within the nucleus accumbens (NAc) plays an important role in regulating mood. In rodents, increased CREB activity within the NAc produces depression-like signs including anhedonia, whereas disruption of CREB activity by expression of a dominant-negative CREB (mCREB, which acts as a CREB antagonist) has antidepressant-like effects. We examined how disruption of CREB activity affects brain reward processes using intracranial self-stimulation (ICSS) and inducible bitransgenic mice with enriched expression of mCREB in forebrain regions including the NAc. Mutant mice or littermate controls were prepared with lateral hypothalamic stimulating electrodes, and trained in the ICSS procedure to determine the frequency at which the stimulation becomes rewarding (threshold). Inducible expression of mCREB did not affect baseline sensitivity to brain stimulation itself. However, mCREB-expressing mice were more sensitive to the rewarding (threshold-lowering) effects of cocaine. Interestingly, mCREB mice were insensitive to the depressive-like (threshold-elevating) effects of the kappa-opioid receptor agonist U50,488. 
These behavioral differences were accompanied by decreased mRNA expression of G-protein receptor kinase-3 (GRK3), a protein involved in opioid receptor desensitization, within the NAc of mCREB mice. Disruption of CREB or GRK3 activity within the NAc specifically by viral-mediated gene transfer enhanced the rewarding impact of brain stimulation in rats, establishing the contribution of functional changes within this region. Together with previous findings, these studies raise the possibility that disruption of CREB in the NAc influences motivation by simultaneously facilitating reward and reducing depressive-like states such as anhedonia and dysphoria.", "pmid": "19211892", "title": "Altered sensitivity to rewarding and aversive drugs in mice with inducible disruption of cAMP response element-binding protein function within the nucleus accumbens."}, {"journal": "Tree physiology", "meshMajor": ["Acclimatization", "Droughts", "Environment", "Ericaceae", "Forests", "Photosynthesis", "Pinus", "Plant Leaves", "Quercus", "Seasons", "Spain", "Species Specificity"], "year": "2015", "abstractText": "The Mediterranean region is a hot spot of climate change vulnerable to increased droughts and heat waves. Scaling carbon fluxes from leaf to landscape levels is particularly challenging under drought conditions. We aimed to improve the mechanistic understanding of the seasonal acclimation of photosynthesis and morphology in sunlit and shaded leaves of four Mediterranean trees (Quercus ilex L., Pinus halepensis Mill., Arbutus unedo L. and Quercus pubescens Willd.) under natural conditions. Vc,max and Jmax were not constant, and mesophyll conductance was not infinite, as assumed in most terrestrial biosphere models, but varied significantly between seasons, tree species and leaf position. Favourable conditions in winter led to photosynthetic recovery and growth in the evergreens. 
Under moderate drought, adjustments in the photo/biochemistry and stomatal/mesophyllic diffusion behaviour effectively protected the photosynthetic machineries. Severe drought, however, induced early leaf senescence mostly in A. unedo and Q. pubescens, and significantly increased leaf mass per area in Q. ilex and P. halepensis. Shaded leaves had lower photosynthetic potentials but cushioned negative effects during stress periods. Species-specificity, seasonal variations and leaf position are key factors to explain vegetation responses to abiotic stress and hold great potential to reduce uncertainties in terrestrial biosphere models especially under drought conditions. ", "pmid": "25836361", "title": "Seasonal variability of foliar photosynthetic and morphological traits and drought impacts in a Mediterranean mixed forest."}, {"journal": "Nucleic acids research", "meshMajor": ["Antigens, Neoplasm", "Antineoplastic Agents", "Cell Line, Tumor", "DNA", "DNA Breaks", "DNA Topoisomerases, Type II", "DNA-Binding Proteins", "HL-60 Cells", "Humans", "Methionine", "Organoplatinum Compounds", "Podophyllotoxin", "Poly-ADP-Ribose Binding Proteins", "Protein Conformation", "Topoisomerase II Inhibitors"], "year": "2017", "abstractText": "Human type II topoisomerase (Top2) isoforms, hTop2\u00e1 and hTop2\u00e2, are targeted by some of the most successful anticancer drugs. These drugs induce Top2-mediated DNA cleavage to trigger cell-death pathways. The potency of these drugs correlates positively with their efficacy in stabilizing the enzyme-mediated DNA breaks. Structural analysis of hTop2\u00e1 and hTop2\u00e2 revealed the presence of methionine residues in the drug-binding pocket, we therefore tested whether a tighter Top2-drug association may be accomplished by introducing a methionine-reactive Pt2+ into a drug to further stabilize the DNA break. 
Herein, we synthesized an organoplatinum compound, etoplatin-N2\u00e2, by replacing the methionine-juxtaposing group of the drug etoposide with a cis-dichlorodiammineplatinum(II) moiety. Compared to etoposide, etoplatin-N2\u00e2 more potently inhibits both human Top2s. While the DNA breaks arrested by etoposide can be rejoined, those captured by etoplatin-N2\u00e2 are practically irreversible. Crystallographic analyses of hTop2\u00e2 complexed with DNA and etoplatin-N2\u00e2 demonstrate coordinate bond formation between Pt2+ and a flanking methionine. Notably, this stable coordinate tether can be loosened by disrupting the structural integrity of drug-binding pocket, suggesting that Pt2+ coordination chemistry may allow for the development of potent inhibitors with protein conformation-dependent reversibility. This approach may be exploited to achieve isoform-specific targeting of human Top2s.", "pmid": "28977631", "title": "Producing irreversible topoisomerase II-mediated DNA breaks by site-specific Pt(II)-methionine coordination chemistry."}, {"journal": "Journal of invertebrate pathology", "meshMajor": ["Animals", "Aphanomyces", "Astacoidea", "Bosnia and Herzegovina", "Host-Pathogen Interactions", "Infections"], "year": "2017", "abstractText": "Although the introduction of the crayfish plague pathogen Aphanomyces astaci to Europe is responsible for substantial declines in native crayfish populations throughout the whole continent, its presence has never been officially confirmed in many European regions, including most of the Balkan Peninsula. We demonstrate that the recent crayfish mortality observed in Bosnia and Herzegovina (Mostarsko blato karst field, Neretva river drainage) was caused by A. astaci. The causative strain is known only from European crayfish, indicating that A. 
astaci poses a threat to native species in this region, even in the absence of its main vectors, the North American crayfish.", "pmid": "28888767", "title": "Recent acute crayfish mortality reveals Aphanomyces astaci presence in Bosnia and Herzegovina."}, {"journal": "FEMS immunology and medical microbiology", "meshMajor": ["Animals", "Bacteremia", "Bacteria", "Bacterial Infections", "Biological Assay", "Bombyx", "Cell Wall", "Humans", "Larva", "Peptidoglycan", "Predictive Value of Tests", "Sensitivity and Specificity"], "year": "2000", "abstractText": "Silkworm larvae plasma (SLP) reagent, which is prepared from the body fluid of the silkworm, reacts with peptidoglycan (PG), a fragment of both the Gram-positive and Gram-negative bacterial cell wall, as well as with beta-glucan, a component of fungi. We developed a quantitative method for the detection of PG in human plasma from cases with bacterial infection using the SLP reagent. Tested in this way, the SLP method showed 86.2% sensitivity, 90.6% specificity, 89.3% positive predictive value, and 88.5% efficiency. 
The SLP method provides a valuable tool for the diagnosis of systemic infection using patients' blood.", "pmid": "10767607", "title": "Detection of peptidoglycan in human plasma using the silkworm larvae plasma test."}]}} \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index ef747606..41a57aa1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -30,120 +30,120 @@ testing = ["datasets", "deepspeed", "evaluate", "parameterized", "pytest", "pyte [[package]] name = "aiobotocore" -version = "2.5.2" +version = "2.5.4" description = "Async client for aws services using botocore and aiohttp" optional = false python-versions = ">=3.7" files = [ - {file = "aiobotocore-2.5.2-py3-none-any.whl", hash = "sha256:337429ffd3cc367532572d40be809a84c7b5335f3f8eca2f23e09dfaa9a9ef90"}, - {file = "aiobotocore-2.5.2.tar.gz", hash = "sha256:e7399f21570db1c287f1c0c814dd3475dfe1c8166722e2c77ce67f172cbcfa89"}, + {file = "aiobotocore-2.5.4-py3-none-any.whl", hash = "sha256:4b32218728ca3d0be83835b604603a0cd6c329066e884bb78149334267f92440"}, + {file = "aiobotocore-2.5.4.tar.gz", hash = "sha256:60341f19eda77e41e1ab11eef171b5a98b5dbdb90804f5334b6f90e560e31fae"}, ] [package.dependencies] aiohttp = ">=3.3.1,<4.0.0" aioitertools = ">=0.5.1,<1.0.0" -boto3 = {version = ">=1.26.161,<1.26.162", optional = true, markers = "extra == \"boto3\""} -botocore = ">=1.29.161,<1.29.162" +boto3 = {version = ">=1.28.17,<1.28.18", optional = true, markers = "extra == \"boto3\""} +botocore = ">=1.31.17,<1.31.18" wrapt = ">=1.10.10,<2.0.0" [package.extras] -awscli = ["awscli (>=1.27.161,<1.27.162)"] -boto3 = ["boto3 (>=1.26.161,<1.26.162)"] +awscli = ["awscli (>=1.29.17,<1.29.18)"] +boto3 = ["boto3 (>=1.28.17,<1.28.18)"] [[package]] name = "aiohttp" -version = "3.8.4" +version = "3.8.5" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.6" files = [ - {file = "aiohttp-3.8.4-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:5ce45967538fb747370308d3145aa68a074bdecb4f3a300869590f725ced69c1"}, - {file = "aiohttp-3.8.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b744c33b6f14ca26b7544e8d8aadff6b765a80ad6164fb1a430bbadd593dfb1a"}, - {file = "aiohttp-3.8.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a45865451439eb320784918617ba54b7a377e3501fb70402ab84d38c2cd891b"}, - {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a86d42d7cba1cec432d47ab13b6637bee393a10f664c425ea7b305d1301ca1a3"}, - {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee3c36df21b5714d49fc4580247947aa64bcbe2939d1b77b4c8dcb8f6c9faecc"}, - {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:176a64b24c0935869d5bbc4c96e82f89f643bcdf08ec947701b9dbb3c956b7dd"}, - {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c844fd628851c0bc309f3c801b3a3d58ce430b2ce5b359cd918a5a76d0b20cb5"}, - {file = "aiohttp-3.8.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5393fb786a9e23e4799fec788e7e735de18052f83682ce2dfcabaf1c00c2c08e"}, - {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e4b09863aae0dc965c3ef36500d891a3ff495a2ea9ae9171e4519963c12ceefd"}, - {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:adfbc22e87365a6e564c804c58fc44ff7727deea782d175c33602737b7feadb6"}, - {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:147ae376f14b55f4f3c2b118b95be50a369b89b38a971e80a17c3fd623f280c9"}, - {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:eafb3e874816ebe2a92f5e155f17260034c8c341dad1df25672fb710627c6949"}, - {file = "aiohttp-3.8.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c6cc15d58053c76eacac5fa9152d7d84b8d67b3fde92709195cb984cfb3475ea"}, - {file = 
"aiohttp-3.8.4-cp310-cp310-win32.whl", hash = "sha256:59f029a5f6e2d679296db7bee982bb3d20c088e52a2977e3175faf31d6fb75d1"}, - {file = "aiohttp-3.8.4-cp310-cp310-win_amd64.whl", hash = "sha256:fe7ba4a51f33ab275515f66b0a236bcde4fb5561498fe8f898d4e549b2e4509f"}, - {file = "aiohttp-3.8.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:3d8ef1a630519a26d6760bc695842579cb09e373c5f227a21b67dc3eb16cfea4"}, - {file = "aiohttp-3.8.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b3f2e06a512e94722886c0827bee9807c86a9f698fac6b3aee841fab49bbfb4"}, - {file = "aiohttp-3.8.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3a80464982d41b1fbfe3154e440ba4904b71c1a53e9cd584098cd41efdb188ef"}, - {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b631e26df63e52f7cce0cce6507b7a7f1bc9b0c501fcde69742130b32e8782f"}, - {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f43255086fe25e36fd5ed8f2ee47477408a73ef00e804cb2b5cba4bf2ac7f5e"}, - {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4d347a172f866cd1d93126d9b239fcbe682acb39b48ee0873c73c933dd23bd0f"}, - {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3fec6a4cb5551721cdd70473eb009d90935b4063acc5f40905d40ecfea23e05"}, - {file = "aiohttp-3.8.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80a37fe8f7c1e6ce8f2d9c411676e4bc633a8462844e38f46156d07a7d401654"}, - {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d1e6a862b76f34395a985b3cd39a0d949ca80a70b6ebdea37d3ab39ceea6698a"}, - {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cd468460eefef601ece4428d3cf4562459157c0f6523db89365202c31b6daebb"}, - {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = 
"sha256:618c901dd3aad4ace71dfa0f5e82e88b46ef57e3239fc7027773cb6d4ed53531"}, - {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:652b1bff4f15f6287550b4670546a2947f2a4575b6c6dff7760eafb22eacbf0b"}, - {file = "aiohttp-3.8.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80575ba9377c5171407a06d0196b2310b679dc752d02a1fcaa2bc20b235dbf24"}, - {file = "aiohttp-3.8.4-cp311-cp311-win32.whl", hash = "sha256:bbcf1a76cf6f6dacf2c7f4d2ebd411438c275faa1dc0c68e46eb84eebd05dd7d"}, - {file = "aiohttp-3.8.4-cp311-cp311-win_amd64.whl", hash = "sha256:6e74dd54f7239fcffe07913ff8b964e28b712f09846e20de78676ce2a3dc0bfc"}, - {file = "aiohttp-3.8.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:880e15bb6dad90549b43f796b391cfffd7af373f4646784795e20d92606b7a51"}, - {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb96fa6b56bb536c42d6a4a87dfca570ff8e52de2d63cabebfd6fb67049c34b6"}, - {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4a6cadebe132e90cefa77e45f2d2f1a4b2ce5c6b1bfc1656c1ddafcfe4ba8131"}, - {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f352b62b45dff37b55ddd7b9c0c8672c4dd2eb9c0f9c11d395075a84e2c40f75"}, - {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ab43061a0c81198d88f39aaf90dae9a7744620978f7ef3e3708339b8ed2ef01"}, - {file = "aiohttp-3.8.4-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9cb1565a7ad52e096a6988e2ee0397f72fe056dadf75d17fa6b5aebaea05622"}, - {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:1b3ea7edd2d24538959c1c1abf97c744d879d4e541d38305f9bd7d9b10c9ec41"}, - {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:7c7837fe8037e96b6dd5cfcf47263c1620a9d332a87ec06a6ca4564e56bd0f36"}, - {file = 
"aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:3b90467ebc3d9fa5b0f9b6489dfb2c304a1db7b9946fa92aa76a831b9d587e99"}, - {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:cab9401de3ea52b4b4c6971db5fb5c999bd4260898af972bf23de1c6b5dd9d71"}, - {file = "aiohttp-3.8.4-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:d1f9282c5f2b5e241034a009779e7b2a1aa045f667ff521e7948ea9b56e0c5ff"}, - {file = "aiohttp-3.8.4-cp36-cp36m-win32.whl", hash = "sha256:5e14f25765a578a0a634d5f0cd1e2c3f53964553a00347998dfdf96b8137f777"}, - {file = "aiohttp-3.8.4-cp36-cp36m-win_amd64.whl", hash = "sha256:4c745b109057e7e5f1848c689ee4fb3a016c8d4d92da52b312f8a509f83aa05e"}, - {file = "aiohttp-3.8.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:aede4df4eeb926c8fa70de46c340a1bc2c6079e1c40ccf7b0eae1313ffd33519"}, - {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ddaae3f3d32fc2cb4c53fab020b69a05c8ab1f02e0e59665c6f7a0d3a5be54f"}, - {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4eb3b82ca349cf6fadcdc7abcc8b3a50ab74a62e9113ab7a8ebc268aad35bb9"}, - {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bcb89336efa095ea21b30f9e686763f2be4478f1b0a616969551982c4ee4c3b"}, - {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c08e8ed6fa3d477e501ec9db169bfac8140e830aa372d77e4a43084d8dd91ab"}, - {file = "aiohttp-3.8.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c6cd05ea06daca6ad6a4ca3ba7fe7dc5b5de063ff4daec6170ec0f9979f6c332"}, - {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b7a00a9ed8d6e725b55ef98b1b35c88013245f35f68b1b12c5cd4100dddac333"}, - {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = 
"sha256:de04b491d0e5007ee1b63a309956eaed959a49f5bb4e84b26c8f5d49de140fa9"}, - {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:40653609b3bf50611356e6b6554e3a331f6879fa7116f3959b20e3528783e699"}, - {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dbf3a08a06b3f433013c143ebd72c15cac33d2914b8ea4bea7ac2c23578815d6"}, - {file = "aiohttp-3.8.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:854f422ac44af92bfe172d8e73229c270dc09b96535e8a548f99c84f82dde241"}, - {file = "aiohttp-3.8.4-cp37-cp37m-win32.whl", hash = "sha256:aeb29c84bb53a84b1a81c6c09d24cf33bb8432cc5c39979021cc0f98c1292a1a"}, - {file = "aiohttp-3.8.4-cp37-cp37m-win_amd64.whl", hash = "sha256:db3fc6120bce9f446d13b1b834ea5b15341ca9ff3f335e4a951a6ead31105480"}, - {file = "aiohttp-3.8.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:fabb87dd8850ef0f7fe2b366d44b77d7e6fa2ea87861ab3844da99291e81e60f"}, - {file = "aiohttp-3.8.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:91f6d540163f90bbaef9387e65f18f73ffd7c79f5225ac3d3f61df7b0d01ad15"}, - {file = "aiohttp-3.8.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d265f09a75a79a788237d7f9054f929ced2e69eb0bb79de3798c468d8a90f945"}, - {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d89efa095ca7d442a6d0cbc755f9e08190ba40069b235c9886a8763b03785da"}, - {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4dac314662f4e2aa5009977b652d9b8db7121b46c38f2073bfeed9f4049732cd"}, - {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fe11310ae1e4cd560035598c3f29d86cef39a83d244c7466f95c27ae04850f10"}, - {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ddb2a2026c3f6a68c3998a6c47ab6795e4127315d2e35a09997da21865757f8"}, - {file = "aiohttp-3.8.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", 
hash = "sha256:e75b89ac3bd27d2d043b234aa7b734c38ba1b0e43f07787130a0ecac1e12228a"}, - {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6e601588f2b502c93c30cd5a45bfc665faaf37bbe835b7cfd461753068232074"}, - {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a5d794d1ae64e7753e405ba58e08fcfa73e3fad93ef9b7e31112ef3c9a0efb52"}, - {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a1f4689c9a1462f3df0a1f7e797791cd6b124ddbee2b570d34e7f38ade0e2c71"}, - {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:3032dcb1c35bc330134a5b8a5d4f68c1a87252dfc6e1262c65a7e30e62298275"}, - {file = "aiohttp-3.8.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8189c56eb0ddbb95bfadb8f60ea1b22fcfa659396ea36f6adcc521213cd7b44d"}, - {file = "aiohttp-3.8.4-cp38-cp38-win32.whl", hash = "sha256:33587f26dcee66efb2fff3c177547bd0449ab7edf1b73a7f5dea1e38609a0c54"}, - {file = "aiohttp-3.8.4-cp38-cp38-win_amd64.whl", hash = "sha256:e595432ac259af2d4630008bf638873d69346372d38255774c0e286951e8b79f"}, - {file = "aiohttp-3.8.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5a7bdf9e57126dc345b683c3632e8ba317c31d2a41acd5800c10640387d193ed"}, - {file = "aiohttp-3.8.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:22f6eab15b6db242499a16de87939a342f5a950ad0abaf1532038e2ce7d31567"}, - {file = "aiohttp-3.8.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7235604476a76ef249bd64cb8274ed24ccf6995c4a8b51a237005ee7a57e8643"}, - {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea9eb976ffdd79d0e893869cfe179a8f60f152d42cb64622fca418cd9b18dc2a"}, - {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92c0cea74a2a81c4c76b62ea1cac163ecb20fb3ba3a75c909b9fa71b4ad493cf"}, - {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:493f5bc2f8307286b7799c6d899d388bbaa7dfa6c4caf4f97ef7521b9cb13719"}, - {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a63f03189a6fa7c900226e3ef5ba4d3bd047e18f445e69adbd65af433add5a2"}, - {file = "aiohttp-3.8.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10c8cefcff98fd9168cdd86c4da8b84baaa90bf2da2269c6161984e6737bf23e"}, - {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bca5f24726e2919de94f047739d0a4fc01372801a3672708260546aa2601bf57"}, - {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:03baa76b730e4e15a45f81dfe29a8d910314143414e528737f8589ec60cf7391"}, - {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:8c29c77cc57e40f84acef9bfb904373a4e89a4e8b74e71aa8075c021ec9078c2"}, - {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:03543dcf98a6619254b409be2d22b51f21ec66272be4ebda7b04e6412e4b2e14"}, - {file = "aiohttp-3.8.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:17b79c2963db82086229012cff93ea55196ed31f6493bb1ccd2c62f1724324e4"}, - {file = "aiohttp-3.8.4-cp39-cp39-win32.whl", hash = "sha256:34ce9f93a4a68d1272d26030655dd1b58ff727b3ed2a33d80ec433561b03d67a"}, - {file = "aiohttp-3.8.4-cp39-cp39-win_amd64.whl", hash = "sha256:41a86a69bb63bb2fc3dc9ad5ea9f10f1c9c8e282b471931be0268ddd09430b04"}, - {file = "aiohttp-3.8.4.tar.gz", hash = "sha256:bf2e1a9162c1e441bf805a1fd166e249d574ca04e03b34f97e2928769e91ab5c"}, + {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"}, + {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"}, + {file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"}, + {file = 
"aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"}, + {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"}, + {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"}, + {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"}, + {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"}, + {file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"}, + {file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"}, + {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"}, + {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"}, + {file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"}, + {file = 
"aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"}, + {file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"}, + {file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = 
"sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"}, + {file = "aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"}, + {file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"}, + {file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"}, + {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"}, + {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"}, + {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"}, + {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"}, + {file = 
"aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"}, + {file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"}, + {file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"}, + {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"}, + {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"}, + {file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"}, + {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"}, + {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"}, + {file 
= "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"}, + {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"}, + {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"}, + {file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = "sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"}, + {file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"}, + {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"}, + {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"}, + {file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"}, + {file 
= "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"}, + {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"}, + {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"}, + {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"}, + {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"}, + {file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"}, + {file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"}, + {file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"}, ] [package.dependencies] @@ -237,24 +237,24 @@ files = [ [[package]] name = "anyio" -version = "3.7.1" +version = "4.0.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "anyio-3.7.1-py3-none-any.whl", hash = "sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5"}, - {file = "anyio-3.7.1.tar.gz", hash = "sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780"}, + {file = "anyio-4.0.0-py3-none-any.whl", hash = "sha256:cfdb2b588b9fc25ede96d8db56ed50848b0b649dca3dd1df0b11f683bb9e0b5f"}, + {file = "anyio-4.0.0.tar.gz", hash = "sha256:f7ed51751b2c2add651e5747c891b47e26d2a21be5d32d9311dfe9692f3e5d7a"}, ] [package.dependencies] -exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +exceptiongroup = 
{version = ">=1.0.2", markers = "python_version < \"3.11\""} idna = ">=2.8" sniffio = ">=1.1" [package.extras] -doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-jquery"] -test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] -trio = ["trio (<0.22)"] +doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (>=0.22)"] [[package]] name = "appdirs" @@ -328,13 +328,13 @@ test = ["astroid", "pytest"] [[package]] name = "async-timeout" -version = "4.0.2" +version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, - {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, + {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, + {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] [[package]] @@ -392,13 +392,13 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte [[package]] name = "awswrangler" -version = "3.2.1" +version = "3.3.0" description = "Pandas on AWS." 
optional = false python-versions = ">=3.8,<4.0" files = [ - {file = "awswrangler-3.2.1-py3-none-any.whl", hash = "sha256:fcd950fa9fe48217e484db0d1ce10a009c63f8e71d321f92b22fc0cf63d89b41"}, - {file = "awswrangler-3.2.1.tar.gz", hash = "sha256:f12047d32463d8f0ca51c30cd50172efb742d890a0f602dc2e828878b02ba90b"}, + {file = "awswrangler-3.3.0-py3-none-any.whl", hash = "sha256:dfeac4aca8e2a2fcfb5627bf64ce6b1c01178ed2c36bb8f3ca319019dc14931f"}, + {file = "awswrangler-3.3.0.tar.gz", hash = "sha256:0bc07fe4f0e79bc9adef9b66718974ae4d18a498c6bf14ec3c7a2059d7efa69f"}, ] [package.dependencies] @@ -406,14 +406,14 @@ boto3 = ">=1.20.32,<2.0.0" botocore = ">=1.23.32,<2.0.0" numpy = ">=1.18,<2.0" packaging = ">=21.1,<24.0" -pandas = ">=1.2.0,<1.5.0 || >1.5.0,<3.0.0" +pandas = ">=1.2.0,<3.0.0" pyarrow = ">=7.0.0" typing-extensions = ">=4.4.0,<5.0.0" [package.extras] -deltalake = ["deltalake (>=0.6.4,<0.10.0)"] +deltalake = ["deltalake (>=0.6.4,<0.11.0)"] gremlin = ["gremlinpython (>=3.6.2,<4.0.0)", "requests (>=2.0.0,<3.0.0)"] -modin = ["modin (>=0.20.1,<0.21.0)"] +modin = ["modin (>=0.23.0,<0.24.0)"] mysql = ["pymysql (>=1.0.0,<2.0.0)"] opencypher = ["requests (>=2.0.0,<3.0.0)"] openpyxl = ["openpyxl (>=3.0.0,<4.0.0)"] @@ -421,7 +421,7 @@ opensearch = ["jsonpath-ng (>=1.5.3,<2.0.0)", "opensearch-py (>=2.0.0,<3.0.0)", oracle = ["oracledb (>=1.0.0,<2.0.0)"] postgres = ["pg8000 (>=1.29.0,<2.0.0)"] progressbar = ["progressbar2 (>=4.0.0,<5.0.0)"] -ray = ["ray[data,default] (>=2.0.0,<2.6.0)"] +ray = ["ray[data,default] (>=2.0.0,<2.7.0)"] redshift = ["redshift-connector (>=2.0.0,<3.0.0)"] sparql = ["SPARQLWrapper (>=2.0.0,<3.0.0)", "requests (>=2.0.0,<3.0.0)"] sqlserver = ["pyodbc (>=4.0.0,<5.0.0)"] @@ -461,33 +461,33 @@ files = [ [[package]] name = "black" -version = "23.7.0" +version = "23.9.1" description = "The uncompromising code formatter." 
optional = false python-versions = ">=3.8" files = [ - {file = "black-23.7.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:5c4bc552ab52f6c1c506ccae05681fab58c3f72d59ae6e6639e8885e94fe2587"}, - {file = "black-23.7.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:552513d5cd5694590d7ef6f46e1767a4df9af168d449ff767b13b084c020e63f"}, - {file = "black-23.7.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:86cee259349b4448adb4ef9b204bb4467aae74a386bce85d56ba4f5dc0da27be"}, - {file = "black-23.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:501387a9edcb75d7ae8a4412bb8749900386eaef258f1aefab18adddea1936bc"}, - {file = "black-23.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb074d8b213749fa1d077d630db0d5f8cc3b2ae63587ad4116e8a436e9bbe995"}, - {file = "black-23.7.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:b5b0ee6d96b345a8b420100b7d71ebfdd19fab5e8301aff48ec270042cd40ac2"}, - {file = "black-23.7.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:893695a76b140881531062d48476ebe4a48f5d1e9388177e175d76234ca247cd"}, - {file = "black-23.7.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:c333286dc3ddca6fdff74670b911cccedacb4ef0a60b34e491b8a67c833b343a"}, - {file = "black-23.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831d8f54c3a8c8cf55f64d0422ee875eecac26f5f649fb6c1df65316b67c8926"}, - {file = "black-23.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:7f3bf2dec7d541b4619b8ce526bda74a6b0bffc480a163fed32eb8b3c9aed8ad"}, - {file = "black-23.7.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:f9062af71c59c004cd519e2fb8f5d25d39e46d3af011b41ab43b9c74e27e236f"}, - {file = "black-23.7.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:01ede61aac8c154b55f35301fac3e730baf0c9cf8120f65a9cd61a81cfb4a0c3"}, - {file = "black-23.7.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:327a8c2550ddc573b51e2c352adb88143464bb9d92c10416feb86b0f5aee5ff6"}, - {file = 
"black-23.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d1c6022b86f83b632d06f2b02774134def5d4d4f1dac8bef16d90cda18ba28a"}, - {file = "black-23.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:27eb7a0c71604d5de083757fbdb245b1a4fae60e9596514c6ec497eb63f95320"}, - {file = "black-23.7.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:8417dbd2f57b5701492cd46edcecc4f9208dc75529bcf76c514864e48da867d9"}, - {file = "black-23.7.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:47e56d83aad53ca140da0af87678fb38e44fd6bc0af71eebab2d1f59b1acf1d3"}, - {file = "black-23.7.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:25cc308838fe71f7065df53aedd20327969d05671bac95b38fdf37ebe70ac087"}, - {file = "black-23.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:642496b675095d423f9b8448243336f8ec71c9d4d57ec17bf795b67f08132a91"}, - {file = "black-23.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:ad0014efc7acf0bd745792bd0d8857413652979200ab924fbf239062adc12491"}, - {file = "black-23.7.0-py3-none-any.whl", hash = "sha256:9fd59d418c60c0348505f2ddf9609c1e1de8e7493eab96198fc89d9f865e7a96"}, - {file = "black-23.7.0.tar.gz", hash = "sha256:022a582720b0d9480ed82576c920a8c1dde97cc38ff11d8d8859b3bd6ca9eedb"}, + {file = "black-23.9.1-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:d6bc09188020c9ac2555a498949401ab35bb6bf76d4e0f8ee251694664df6301"}, + {file = "black-23.9.1-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:13ef033794029b85dfea8032c9d3b92b42b526f1ff4bf13b2182ce4e917f5100"}, + {file = "black-23.9.1-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:75a2dc41b183d4872d3a500d2b9c9016e67ed95738a3624f4751a0cb4818fe71"}, + {file = "black-23.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13a2e4a93bb8ca74a749b6974925c27219bb3df4d42fc45e948a5d9feb5122b7"}, + {file = "black-23.9.1-cp310-cp310-win_amd64.whl", hash = 
"sha256:adc3e4442eef57f99b5590b245a328aad19c99552e0bdc7f0b04db6656debd80"}, + {file = "black-23.9.1-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:8431445bf62d2a914b541da7ab3e2b4f3bc052d2ccbf157ebad18ea126efb91f"}, + {file = "black-23.9.1-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:8fc1ddcf83f996247505db6b715294eba56ea9372e107fd54963c7553f2b6dfe"}, + {file = "black-23.9.1-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:7d30ec46de88091e4316b17ae58bbbfc12b2de05e069030f6b747dfc649ad186"}, + {file = "black-23.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:031e8c69f3d3b09e1aa471a926a1eeb0b9071f80b17689a655f7885ac9325a6f"}, + {file = "black-23.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:538efb451cd50f43aba394e9ec7ad55a37598faae3348d723b59ea8e91616300"}, + {file = "black-23.9.1-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:638619a559280de0c2aa4d76f504891c9860bb8fa214267358f0a20f27c12948"}, + {file = "black-23.9.1-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:a732b82747235e0542c03bf352c126052c0fbc458d8a239a94701175b17d4855"}, + {file = "black-23.9.1-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:cf3a4d00e4cdb6734b64bf23cd4341421e8953615cba6b3670453737a72ec204"}, + {file = "black-23.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf99f3de8b3273a8317681d8194ea222f10e0133a24a7548c73ce44ea1679377"}, + {file = "black-23.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:14f04c990259576acd093871e7e9b14918eb28f1866f91968ff5524293f9c573"}, + {file = "black-23.9.1-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:c619f063c2d68f19b2d7270f4cf3192cb81c9ec5bc5ba02df91471d0b88c4c5c"}, + {file = "black-23.9.1-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:6a3b50e4b93f43b34a9d3ef00d9b6728b4a722c997c99ab09102fd5efdb88325"}, + {file = "black-23.9.1-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c46767e8df1b7beefb0899c4a95fb43058fa8500b6db144f4ff3ca38eb2f6393"}, + {file = 
"black-23.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50254ebfa56aa46a9fdd5d651f9637485068a1adf42270148cd101cdf56e0ad9"}, + {file = "black-23.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:403397c033adbc45c2bd41747da1f7fc7eaa44efbee256b53842470d4ac5a70f"}, + {file = "black-23.9.1-py3-none-any.whl", hash = "sha256:6ccd59584cc834b6d127628713e4b6b968e5f79572da66284532525a042549f9"}, + {file = "black-23.9.1.tar.gz", hash = "sha256:24b6b3ff5c6d9ea08a8888f6977eae858e1f340d7260cf56d70a49823236b62d"}, ] [package.dependencies] @@ -497,6 +497,7 @@ packaging = ">=22.0" pathspec = ">=0.9.0" platformdirs = ">=2" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} [package.extras] colorama = ["colorama (>=0.4.3)"] @@ -546,17 +547,17 @@ numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} [[package]] name = "boto3" -version = "1.26.161" +version = "1.28.17" description = "The AWS SDK for Python" optional = false python-versions = ">= 3.7" files = [ - {file = "boto3-1.26.161-py3-none-any.whl", hash = "sha256:f66e5c9dbe7f34383bcf64fa6070771355c11a44dd75c7f1279f2f37e1c89183"}, - {file = "boto3-1.26.161.tar.gz", hash = "sha256:662731e464d14af1035f44fc6a46b0e3112ee011ac0a5ed416d205daa3e15f25"}, + {file = "boto3-1.28.17-py3-none-any.whl", hash = "sha256:bca0526f819e0f19c0f1e6eba3e2d1d6b6a92a45129f98c0d716e5aab6d9444b"}, + {file = "boto3-1.28.17.tar.gz", hash = "sha256:90f7cfb5e1821af95b1fc084bc50e6c47fa3edc99f32de1a2591faa0c546bea7"}, ] [package.dependencies] -botocore = ">=1.29.161,<1.30.0" +botocore = ">=1.31.17,<1.32.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -565,13 +566,13 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.29.161" +version = "1.31.17" description = "Low-level, data-driven core of boto 3." 
optional = false python-versions = ">= 3.7" files = [ - {file = "botocore-1.29.161-py3-none-any.whl", hash = "sha256:b906999dd53dda2ef0ef6f7f55fcc81a4b06b9f1c8a9f65c546e0b981f959f5f"}, - {file = "botocore-1.29.161.tar.gz", hash = "sha256:a50edd715eb510343e27849f36483804aae4b871590db4d4996aa53368dcac40"}, + {file = "botocore-1.31.17-py3-none-any.whl", hash = "sha256:6ac34a1d34aa3750e78b77b8596617e2bab938964694d651939dba2cbde2c12b"}, + {file = "botocore-1.31.17.tar.gz", hash = "sha256:396459065dba4339eb4da4ec8b4e6599728eb89b7caaceea199e26f7d824a41c"}, ] [package.dependencies] @@ -580,7 +581,7 @@ python-dateutil = ">=2.1,<3.0.0" urllib3 = ">=1.25.4,<1.27" [package.extras] -crt = ["awscrt (==0.16.9)"] +crt = ["awscrt (==0.16.26)"] [[package]] name = "catalogue" @@ -595,13 +596,13 @@ files = [ [[package]] name = "celery" -version = "5.3.1" +version = "5.3.4" description = "Distributed Task Queue." optional = false python-versions = ">=3.8" files = [ - {file = "celery-5.3.1-py3-none-any.whl", hash = "sha256:27f8f3f3b58de6e0ab4f174791383bbd7445aff0471a43e99cfd77727940753f"}, - {file = "celery-5.3.1.tar.gz", hash = "sha256:f84d1c21a1520c116c2b7d26593926581191435a03aa74b77c941b93ca1c6210"}, + {file = "celery-5.3.4-py3-none-any.whl", hash = "sha256:1e6ed40af72695464ce98ca2c201ad0ef8fd192246f6c9eac8bba343b980ad34"}, + {file = "celery-5.3.4.tar.gz", hash = "sha256:9023df6a8962da79eb30c0c84d5f4863d9793a466354cc931d7f72423996de28"}, ] [package.dependencies] @@ -610,14 +611,14 @@ click = ">=8.1.2,<9.0" click-didyoumean = ">=0.3.0" click-plugins = ">=1.1.1" click-repl = ">=0.2.0" -kombu = ">=5.3.1,<6.0" +kombu = ">=5.3.2,<6.0" python-dateutil = ">=2.8.2" tzdata = ">=2022.7" vine = ">=5.0.0,<6.0" [package.extras] -arangodb = ["pyArango (>=2.0.1)"] -auth = ["cryptography (==41.0.1)"] +arangodb = ["pyArango (>=2.0.2)"] +auth = ["cryptography (==41.0.3)"] azureblockblob = ["azure-storage-blob (>=12.15.0)"] brotli = ["brotli (>=1.0.0)", "brotlipy (>=0.7.0)"] cassandra = 
["cassandra-driver (>=3.25.0,<4)"] @@ -637,7 +638,7 @@ msgpack = ["msgpack (==1.0.5)"] pymemcache = ["python-memcached (==1.59)"] pyro = ["pyro4 (==4.82)"] pytest = ["pytest-celery (==0.0.0)"] -redis = ["redis (>=4.5.2,!=4.5.5)"] +redis = ["redis (>=4.5.2,!=4.5.5,<5.0.0)"] s3 = ["boto3 (>=1.26.143)"] slmq = ["softlayer-messaging (>=1.0.3)"] solar = ["ephem (==4.1.4)"] @@ -650,13 +651,13 @@ zstd = ["zstandard (==0.21.0)"] [[package]] name = "certifi" -version = "2023.5.7" +version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, - {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, + {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, + {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, ] [[package]] @@ -737,13 +738,13 @@ pycparser = "*" [[package]] name = "cfgv" -version = "3.3.1" +version = "3.4.0" description = "Validate configuration and produce human readable error messages." 
optional = false -python-versions = ">=3.6.1" +python-versions = ">=3.8" files = [ - {file = "cfgv-3.3.1-py2.py3-none-any.whl", hash = "sha256:c6a0883f3917a037485059700b9e75da2464e6c27051014ad85ba6aaa5884426"}, - {file = "cfgv-3.3.1.tar.gz", hash = "sha256:f5a830efb9ce7a445376bb66ec94c638a9787422f96264c98edc6bdeed8ab736"}, + {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, + {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, ] [[package]] @@ -832,13 +833,13 @@ files = [ [[package]] name = "click" -version = "8.1.5" +version = "8.1.7" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" files = [ - {file = "click-8.1.5-py3-none-any.whl", hash = "sha256:e576aa487d679441d7d30abb87e1b43d24fc53bffb8758443b1a9e1cee504548"}, - {file = "click-8.1.5.tar.gz", hash = "sha256:4be4b1af8d665c6d942909916d31a213a106800c47d0eeba73d34da3cbc11367"}, + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, ] [package.dependencies] @@ -895,28 +896,28 @@ testing = ["pytest (>=7.2.1)", "pytest-cov (>=4.0.0)", "tox (>=4.4.3)"] [[package]] name = "cmake" -version = "3.26.4" +version = "3.27.4.1" description = "CMake is an open-source, cross-platform family of tools designed to build, test and package software" optional = false python-versions = "*" files = [ - {file = "cmake-3.26.4-py2.py3-none-macosx_10_10_universal2.macosx_10_10_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl", hash = "sha256:230227bf99f36614de84cdc92ffce3a50eb2803020e946f8da945a08fcf766bf"}, - {file = "cmake-3.26.4-py2.py3-none-manylinux2010_i686.manylinux_2_12_i686.whl", hash = "sha256:248a90816abfc10ff6e1109b54b8235c3e62f0ac92da16541753deb3b5ae063d"}, 
- {file = "cmake-3.26.4-py2.py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:1b92f9f59f48c803106dbdd6750b0f571a0500e25d3a62c42ba84bb7a9240d10"}, - {file = "cmake-3.26.4-py2.py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3175442985558d5415b97f264a6a1bb0af5ecfe10e3f7510257b1ea66bd33848"}, - {file = "cmake-3.26.4-py2.py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1d887be5f1a3f17559a78707a6bc0560f4f8cb93cebb9d823d90a63e68bae09b"}, - {file = "cmake-3.26.4-py2.py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:235d8eac93a28dcce5a1cd7130412885a2aa53d5735cb2230e0f26f589347b65"}, - {file = "cmake-3.26.4-py2.py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:05cfd76c637eb22058c95e2dc383cadd4e0615e2643e637bb498a6cc24825790"}, - {file = "cmake-3.26.4-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:93015da6f1c0e1e5f2debf752f1803ea52d742d915ad674043d36e471f937507"}, - {file = "cmake-3.26.4-py2.py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:d726671ae7ae4aa6989e73d26b9f8f8e6af45163a26ea243949d72246566fdd8"}, - {file = "cmake-3.26.4-py2.py3-none-musllinux_1_1_i686.whl", hash = "sha256:432837364aa6cab2826a72e8a4cdd3586f5ac9ce495217ccd59aa70f2bba8120"}, - {file = "cmake-3.26.4-py2.py3-none-musllinux_1_1_ppc64le.whl", hash = "sha256:24110035aff586a04a6a6fcf4609270642e4f503c0620c962dff75b653f81414"}, - {file = "cmake-3.26.4-py2.py3-none-musllinux_1_1_s390x.whl", hash = "sha256:3e280e81713408987b7053f5b922c9f94e45668ca6efff1f02846309ca0b5b0f"}, - {file = "cmake-3.26.4-py2.py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:c3b0e72750c0f6c0373242c1299bc4ffdbebdd5004966ae6df0b2e9845aa6990"}, - {file = "cmake-3.26.4-py2.py3-none-win32.whl", hash = "sha256:e058e59154a1e490fb9425b420f87e28144292397607638d73e323509f7efae6"}, - {file = "cmake-3.26.4-py2.py3-none-win_amd64.whl", hash = 
"sha256:b7a6946c345497c14064e0c9585b30f5aaebbefdfc0b245b6bb5a978eb4fc85f"}, - {file = "cmake-3.26.4-py2.py3-none-win_arm64.whl", hash = "sha256:93a03bad17b9741acaff4a8651f8596496506602fa123e70fe67142f1b21ee2e"}, - {file = "cmake-3.26.4.tar.gz", hash = "sha256:d45b30b9ce7280829888c78650177ab525df2b6785e1a5b3d82b4c147d828c0e"}, + {file = "cmake-3.27.4.1-py2.py3-none-macosx_10_10_universal2.macosx_10_10_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl", hash = "sha256:65b79f1e8b6fa254697ee0b411aa4dff0d2309c1405af3448adf06cbd7ef0ac5"}, + {file = "cmake-3.27.4.1-py2.py3-none-manylinux2010_i686.manylinux_2_12_i686.whl", hash = "sha256:4a1d22ee72dcdc32d0f8bbf5691d2e9367585db8bfeafe7cffa2c4274127a801"}, + {file = "cmake-3.27.4.1-py2.py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:37e9cad75184fbefe837311528d026901278b606707e9d14b58e3767d49d0aa6"}, + {file = "cmake-3.27.4.1-py2.py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2f217fb281b068696fdcc4b198de62e9ded8bc0a93877684afc59db3507ccb44"}, + {file = "cmake-3.27.4.1-py2.py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:a3bd8b9d0e294bd2b3ce27a850c9d924aee7e4f4c0bb56d66641cc1544314f58"}, + {file = "cmake-3.27.4.1-py2.py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:871c8b5eaac959f079c2389c7a7f198fa5f86a029e8726fcb1f3e13d030c33e9"}, + {file = "cmake-3.27.4.1-py2.py3-none-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:ec4a5bc2376dfc57065bfde6806183b331165f33457b7cc0fc0511260dde7c72"}, + {file = "cmake-3.27.4.1-py2.py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c36eb106dec60198264b25d4bd23cd9ea30b0af9200a143ec1db887c095306f7"}, + {file = "cmake-3.27.4.1-py2.py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:2b0b53ec2e45cfe9982d0adf833b3519efc328c1e3cffae4d237841a1ed6edf4"}, + {file = "cmake-3.27.4.1-py2.py3-none-musllinux_1_1_i686.whl", hash = 
"sha256:a504815bcba0ece9aafb48a6b7770d6479756fda92f8b62f9ab7ff8a403a12d5"}, + {file = "cmake-3.27.4.1-py2.py3-none-musllinux_1_1_ppc64le.whl", hash = "sha256:1aca07fccfa042a0379bb027e30f090a8239b18fd3f959391c5d77c22dd0a809"}, + {file = "cmake-3.27.4.1-py2.py3-none-musllinux_1_1_s390x.whl", hash = "sha256:315eb37233e2d0b8fa01580e33439eaeaef65f1e41ad9ca269cbe68cc0a039a4"}, + {file = "cmake-3.27.4.1-py2.py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:94fec5a8bae1f3b62d8a8653ebcb7fa4007e2d0e713f94e4b2089f708c13548f"}, + {file = "cmake-3.27.4.1-py2.py3-none-win32.whl", hash = "sha256:8249a1ba7901b53661b44e59bdf6fd1e977e10843795788efe25d3374de6ed95"}, + {file = "cmake-3.27.4.1-py2.py3-none-win_amd64.whl", hash = "sha256:b72db11e13eafb46b9c53797d141e89886293db768feabef4b841accf666de54"}, + {file = "cmake-3.27.4.1-py2.py3-none-win_arm64.whl", hash = "sha256:0fb68660ce3954de99d1f41bedcf87063325c4cc891003f12de36472fa1efa28"}, + {file = "cmake-3.27.4.1.tar.gz", hash = "sha256:70526bbff5eeb7d4d6b921af1b80d2d29828302882f94a2cba93ad7d469b90f6"}, ] [package.extras] @@ -949,13 +950,13 @@ test = ["flake8 (==3.7.8)", "hypothesis (==3.55.3)"] [[package]] name = "confection" -version = "0.1.2" +version = "0.1.3" description = "The sweetest config system for Python" optional = false python-versions = ">=3.6" files = [ - {file = "confection-0.1.2-py3-none-any.whl", hash = "sha256:8bde19143fe36c38ea6e7241dec7be14b8a16e51c9d7ade93d19f72d9f8f1115"}, - {file = "confection-0.1.2.tar.gz", hash = "sha256:7163eb9bdde62cc61a71c6284fb0f0b50b2723b7ef8ab79c7061a7bd659a058e"}, + {file = "confection-0.1.3-py3-none-any.whl", hash = "sha256:58b125c9bc6786f32e37fe4d98bc3a03e5f509a4b9de02541b99c559f2026092"}, + {file = "confection-0.1.3.tar.gz", hash = "sha256:5a876d368a7698eec58791126757a75a3df16e26cc49653b52426e9ffd39f12f"}, ] [package.dependencies] @@ -978,34 +979,34 @@ six = "*" [[package]] name = "cryptography" -version = "41.0.2" +version = "41.0.3" description = "cryptography is a package which 
provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:01f1d9e537f9a15b037d5d9ee442b8c22e3ae11ce65ea1f3316a41c78756b711"}, - {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:079347de771f9282fbfe0e0236c716686950c19dee1b76240ab09ce1624d76d7"}, - {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:439c3cc4c0d42fa999b83ded80a9a1fb54d53c58d6e59234cfe97f241e6c781d"}, - {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f14ad275364c8b4e525d018f6716537ae7b6d369c094805cae45300847e0894f"}, - {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:84609ade00a6ec59a89729e87a503c6e36af98ddcd566d5f3be52e29ba993182"}, - {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:49c3222bb8f8e800aead2e376cbef687bc9e3cb9b58b29a261210456a7783d83"}, - {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d73f419a56d74fef257955f51b18d046f3506270a5fd2ac5febbfa259d6c0fa5"}, - {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:2a034bf7d9ca894720f2ec1d8b7b5832d7e363571828037f9e0c4f18c1b58a58"}, - {file = "cryptography-41.0.2-cp37-abi3-win32.whl", hash = "sha256:d124682c7a23c9764e54ca9ab5b308b14b18eba02722b8659fb238546de83a76"}, - {file = "cryptography-41.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:9c3fe6534d59d071ee82081ca3d71eed3210f76ebd0361798c74abc2bcf347d4"}, - {file = "cryptography-41.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a719399b99377b218dac6cf547b6ec54e6ef20207b6165126a280b0ce97e0d2a"}, - {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:182be4171f9332b6741ee818ec27daff9fb00349f706629f5cbf417bd50e66fd"}, - {file = 
"cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7a9a3bced53b7f09da251685224d6a260c3cb291768f54954e28f03ef14e3766"}, - {file = "cryptography-41.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f0dc40e6f7aa37af01aba07277d3d64d5a03dc66d682097541ec4da03cc140ee"}, - {file = "cryptography-41.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:674b669d5daa64206c38e507808aae49904c988fa0a71c935e7006a3e1e83831"}, - {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7af244b012711a26196450d34f483357e42aeddb04128885d95a69bd8b14b69b"}, - {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9b6d717393dbae53d4e52684ef4f022444fc1cce3c48c38cb74fca29e1f08eaa"}, - {file = "cryptography-41.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:192255f539d7a89f2102d07d7375b1e0a81f7478925b3bc2e0549ebf739dae0e"}, - {file = "cryptography-41.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f772610fe364372de33d76edcd313636a25684edb94cee53fd790195f5989d14"}, - {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b332cba64d99a70c1e0836902720887fb4529ea49ea7f5462cf6640e095e11d2"}, - {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9a6673c1828db6270b76b22cc696f40cde9043eb90373da5c2f8f2158957f42f"}, - {file = "cryptography-41.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:342f3767e25876751e14f8459ad85e77e660537ca0a066e10e75df9c9e9099f0"}, - {file = "cryptography-41.0.2.tar.gz", hash = "sha256:7d230bf856164de164ecb615ccc14c7fc6de6906ddd5b491f3af90d3514c925c"}, + {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"}, + {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"}, + {file = 
"cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"}, + {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"}, + {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"}, + {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"}, + {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = 
"sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"}, + {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"}, ] [package.dependencies] @@ -1206,13 +1207,13 @@ graph = ["objgraph (>=1.7.2)"] [[package]] name = "diskcache" -version = "5.6.1" +version = "5.6.3" description = "Disk Cache -- Disk and file backed persistent cache." 
optional = false python-versions = ">=3" files = [ - {file = "diskcache-5.6.1-py3-none-any.whl", hash = "sha256:558c6a2d5d7c721bb00e40711803d6804850c9f76c426ed81ecc627fe9d2ce2d"}, - {file = "diskcache-5.6.1.tar.gz", hash = "sha256:e4c978532feff5814c4cc00fe1e11e40501985946643d73220d41ee7737c72c3"}, + {file = "diskcache-5.6.3-py3-none-any.whl", hash = "sha256:5e31b2d5fbad117cc363ebaf6b689474db18a1f6438bc82358b024abd4c2ca19"}, + {file = "diskcache-5.6.3.tar.gz", hash = "sha256:2c3a3fa2743d8535d832ec61c2054a1641f41775aa7c556758a109941e33e4fc"}, ] [[package]] @@ -1264,67 +1265,67 @@ files = [ [[package]] name = "dulwich" -version = "0.21.5" +version = "0.21.6" description = "Python Git Library" optional = false python-versions = ">=3.7" files = [ - {file = "dulwich-0.21.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8864719bc176cdd27847332a2059127e2f7bab7db2ff99a999873cb7fff54116"}, - {file = "dulwich-0.21.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3800cdc17d144c1f7e114972293bd6c46688f5bcc2c9228ed0537ded72394082"}, - {file = "dulwich-0.21.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e2f676bfed8146966fe934ee734969d7d81548fbd250a8308582973670a9dab1"}, - {file = "dulwich-0.21.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4db330fb59fe3b9d253bdf0e49a521739db83689520c4921ab1c5242aaf77b82"}, - {file = "dulwich-0.21.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e8f6d4f4f4d01dd1d3c968e486d4cd77f96f772da7265941bc506de0944ddb9"}, - {file = "dulwich-0.21.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1cc0c9ba19ac1b2372598802bc9201a9c45e5d6f1f7a80ec40deeb10acc4e9ae"}, - {file = "dulwich-0.21.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:61e10242b5a7a82faa8996b2c76239cfb633620b02cdd2946e8af6e7eb31d651"}, - {file = "dulwich-0.21.5-cp310-cp310-win32.whl", hash = "sha256:7f357639b56146a396f48e5e0bc9bbaca3d6d51c8340bd825299272b588fff5f"}, - {file = 
"dulwich-0.21.5-cp310-cp310-win_amd64.whl", hash = "sha256:891d5c73e2b66d05dbb502e44f027dc0dbbd8f6198bc90dae348152e69d0befc"}, - {file = "dulwich-0.21.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:45d6198e804b539708b73a003419e48fb42ff2c3c6dd93f63f3b134dff6dd259"}, - {file = "dulwich-0.21.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c2a565d4e704d7f784cdf9637097141f6d47129c8fffc2fac699d57cb075a169"}, - {file = "dulwich-0.21.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:823091d6b6a1ea07dc4839c9752198fb39193213d103ac189c7669736be2eaff"}, - {file = "dulwich-0.21.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2c9931b657f2206abec0964ec2355ee2c1e04d05f8864e823ffa23c548c4548"}, - {file = "dulwich-0.21.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7dc358c2ee727322a09b7c6da43d47a1026049dbd3ad8d612eddca1f9074b298"}, - {file = "dulwich-0.21.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6155ab7388ee01c670f7c5d8003d4e133eebebc7085a856c007989f0ba921b36"}, - {file = "dulwich-0.21.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a605e10d72f90a39ea2e634fbfd80f866fc4df29a02ea6db52ae92e5fd4a2003"}, - {file = "dulwich-0.21.5-cp311-cp311-win32.whl", hash = "sha256:daa607370722c3dce99a0022397c141caefb5ed32032a4f72506f4817ea6405b"}, - {file = "dulwich-0.21.5-cp311-cp311-win_amd64.whl", hash = "sha256:5e56b2c1911c344527edb2bf1a4356e2fb7e086b1ba309666e1e5c2224cdca8a"}, - {file = "dulwich-0.21.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:85d3401d08b1ec78c7d58ae987c4bb7b768a438f3daa74aeb8372bebc7fb16fa"}, - {file = "dulwich-0.21.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:90479608e49db93d8c9e4323bc0ec5496678b535446e29d8fd67dc5bbb5d51bf"}, - {file = "dulwich-0.21.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9a6bf99f57bcac4c77fc60a58f1b322c91cc4d8c65dc341f76bf402622f89cb"}, - {file = 
"dulwich-0.21.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3e68b162af2aae995355e7920f89d50d72b53d56021e5ac0a546d493b17cbf7e"}, - {file = "dulwich-0.21.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0ab86d6d42e385bf3438e70f3c9b16de68018bd88929379e3484c0ef7990bd3c"}, - {file = "dulwich-0.21.5-cp37-cp37m-win32.whl", hash = "sha256:f2eeca6d61366cf5ee8aef45bed4245a67d4c0f0d731dc2383eabb80fa695683"}, - {file = "dulwich-0.21.5-cp37-cp37m-win_amd64.whl", hash = "sha256:1b20a3656b48c941d49c536824e1e5278a695560e8de1a83b53a630143c4552e"}, - {file = "dulwich-0.21.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:3932b5e17503b265a85f1eda77ede647681c3bab53bc9572955b6b282abd26ea"}, - {file = "dulwich-0.21.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6616132d219234580de88ceb85dd51480dc43b1bdc05887214b8dd9cfd4a9d40"}, - {file = "dulwich-0.21.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaf6c7fb6b13495c19c9aace88821c2ade3c8c55b4e216cd7cc55d3e3807d7fa"}, - {file = "dulwich-0.21.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be12a46f73023970125808a4a78f610c055373096c1ecea3280edee41613eba8"}, - {file = "dulwich-0.21.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baecef0d8b9199822c7912876a03a1af17833f6c0d461efb62decebd45897e49"}, - {file = "dulwich-0.21.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:82f632afb9c7c341a875d46aaa3e6c5e586c7a64ce36c9544fa400f7e4f29754"}, - {file = "dulwich-0.21.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82cdf482f8f51fcc965ffad66180b54a9abaea9b1e985a32e1acbfedf6e0e363"}, - {file = "dulwich-0.21.5-cp38-cp38-win32.whl", hash = "sha256:c8ded43dc0bd2e65420eb01e778034be5ca7f72e397a839167eda7dcb87c4248"}, - {file = "dulwich-0.21.5-cp38-cp38-win_amd64.whl", hash = "sha256:2aba0fdad2a19bd5bb3aad6882580cb33359c67b48412ccd4cfccd932012b35e"}, - {file = "dulwich-0.21.5-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:fd4ad079758514375f11469e081723ba8831ce4eaa1a64b41f06a3a866d5ac34"}, - {file = "dulwich-0.21.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7fe62685bf356bfb4d0738f84a3fcf0d1fc9e11fee152e488a20b8c66a52429e"}, - {file = "dulwich-0.21.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:aae448da7d80306dda4fc46292fed7efaa466294571ab3448be16714305076f1"}, - {file = "dulwich-0.21.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b24cb1fad0525dba4872e9381bc576ea2a6dcdf06b0ed98f8e953e3b1d719b89"}, - {file = "dulwich-0.21.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e39b7c2c9bda6acae83b25054650a8bb7e373e886e2334721d384e1479bf04b"}, - {file = "dulwich-0.21.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26456dba39d1209fca17187db06967130e27eeecad2b3c2bbbe63467b0bf09d6"}, - {file = "dulwich-0.21.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:281310644e02e3aa6d76bcaffe2063b9031213c4916b5f1a6e68c25bdecfaba4"}, - {file = "dulwich-0.21.5-cp39-cp39-win32.whl", hash = "sha256:4814ca3209dabe0fe7719e9545fbdad7f8bb250c5a225964fe2a31069940c4cf"}, - {file = "dulwich-0.21.5-cp39-cp39-win_amd64.whl", hash = "sha256:c922a4573267486be0ef85216f2da103fb38075b8465dc0e90457843884e4860"}, - {file = "dulwich-0.21.5-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:e52b20c4368171b7d32bd3ab0f1d2402e76ad4f2ea915ff9aa73bc9fa2b54d6d"}, - {file = "dulwich-0.21.5-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aeb736d777ee21f2117a90fc453ee181aa7eedb9e255b5ef07c51733f3fe5cb6"}, - {file = "dulwich-0.21.5-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e8a79c1ed7166f32ad21974fa98d11bf6fd74e94a47e754c777c320e01257c6"}, - {file = "dulwich-0.21.5-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:b943517e30bd651fbc275a892bb96774f3893d95fe5a4dedd84496a98eaaa8ab"}, - {file = "dulwich-0.21.5-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:32493a456358a3a6c15bbda07106fc3d4cc50834ee18bc7717968d18be59b223"}, - {file = "dulwich-0.21.5-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0aa44b812d978fc22a04531f5090c3c369d5facd03fa6e0501d460a661800c7f"}, - {file = "dulwich-0.21.5-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f46bcb6777e5f9f4af24a2bd029e88b77316269d24ce66be590e546a0d8f7b7"}, - {file = "dulwich-0.21.5-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:a917fd3b4493db3716da2260f16f6b18f68d46fbe491d851d154fc0c2d984ae4"}, - {file = "dulwich-0.21.5-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:684c52cff867d10c75a7238151ca307582b3d251bbcd6db9e9cffbc998ef804e"}, - {file = "dulwich-0.21.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9019189d7a8f7394df6a22cd5b484238c5776e42282ad5d6d6c626b4c5f43597"}, - {file = "dulwich-0.21.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:494024f74c2eef9988adb4352b3651ac1b6c0466176ec62b69d3d3672167ba68"}, - {file = "dulwich-0.21.5-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f9b6ac1b1c67fc6083c42b7b6cd3b211292c8a6517216c733caf23e8b103ab6d"}, - {file = "dulwich-0.21.5.tar.gz", hash = "sha256:70955e4e249ddda6e34a4636b90f74e931e558f993b17c52570fa6144b993103"}, + {file = "dulwich-0.21.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7f89bee4c97372e8aaf8ffaf5899f1bcd5184b5306d7eaf68738c1101ceba10e"}, + {file = "dulwich-0.21.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:847bb52562a211b596453a602e75739350c86d7edb846b5b1c46896a5c86b9bb"}, + {file = "dulwich-0.21.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4e09d0b4e985b371aa6728773781b19298d361a00772e20f98522868cf7edc6f"}, + {file = "dulwich-0.21.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8dfb50b3915e223a97f50fbac0dbc298d5fffeaac004eeeb3d552c57fe38416f"}, + {file = 
"dulwich-0.21.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a64eca1601e79c16df78afe08da9ac9497b934cbc5765990ca7d89a4b87453d9"}, + {file = "dulwich-0.21.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1fedd924763a5d640348db43a267a394aa80d551228ad45708e0b0cc2130bb62"}, + {file = "dulwich-0.21.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:edc21c3784dd9d9b85abd9fe53f81a884e2cdcc4e5e09ada17287420d64cfd46"}, + {file = "dulwich-0.21.6-cp310-cp310-win32.whl", hash = "sha256:daa3584beabfcf0da76df57535a23c80ff6d8ccde6ddbd23bdc79d317a0e20a7"}, + {file = "dulwich-0.21.6-cp310-cp310-win_amd64.whl", hash = "sha256:40623cc39a3f1634663d22d87f86e2e406cc8ff17ae7a3edc7fcf963c288992f"}, + {file = "dulwich-0.21.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:e8ed878553f0b76facbb620b455fafa0943162fe8e386920717781e490444efa"}, + {file = "dulwich-0.21.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a89b19f4960e759915dbc23a4dd0abc067b55d8d65e9df50961b73091b87b81a"}, + {file = "dulwich-0.21.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28acbd08d6b38720d99cc01da9dd307a2e0585e00436c95bcac6357b9a9a6f76"}, + {file = "dulwich-0.21.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c2f2683e0598f7c7071ef08a0822f062d8744549a0d45f2c156741033b7e3d7d"}, + {file = "dulwich-0.21.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54342cf96fe8a44648505c65f23d18889595762003a168d67d7263df66143bd2"}, + {file = "dulwich-0.21.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2a3fc071e5b14f164191286f7ffc02f60fe8b439d01fad0832697cc08c2237dd"}, + {file = "dulwich-0.21.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:32d7acfe3fe2ce4502446d8f7a5ab34cfd24c9ff8961e60337638410906a8fbb"}, + {file = "dulwich-0.21.6-cp311-cp311-win32.whl", hash = "sha256:5e58171a5d70f7910f73d25ff82a058edff09a4c1c3bd1de0dc6b1fbc9a42c3e"}, + {file = "dulwich-0.21.6-cp311-cp311-win_amd64.whl", hash 
= "sha256:ceabe8f96edfb9183034a860f5dc77586700b517457032867b64a03c44e5cf96"}, + {file = "dulwich-0.21.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:4fdc2f081bc3e9e120079c2cea4be213e3f127335aca7c0ab0c19fe791270caa"}, + {file = "dulwich-0.21.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fe957564108f74325d0d042d85e0c67ef470921ca92b6e7d330c7c49a3b9c1d"}, + {file = "dulwich-0.21.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2912c8a845c8ccbc79d068a89db7172e355adeb84eb31f062cd3a406d528b30"}, + {file = "dulwich-0.21.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:81e237a6b1b20c79ef62ca19a8fb231f5519bab874b9a1c2acf9c05edcabd600"}, + {file = "dulwich-0.21.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:513d045e74307eeb31592255c38f37042c9aa68ce845a167943018ab5138b0e3"}, + {file = "dulwich-0.21.6-cp37-cp37m-win32.whl", hash = "sha256:e1ac882afa890ef993b8502647e6c6d2b3977ce56e3fe80058ce64607cbc7107"}, + {file = "dulwich-0.21.6-cp37-cp37m-win_amd64.whl", hash = "sha256:5d2ccf3d355850674f75655154a6519bf1f1664176c670109fa7041019b286f9"}, + {file = "dulwich-0.21.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:28c9724a167c84a83fc6238e0781f4702b5fe8c53ede31604525fb1a9d1833f4"}, + {file = "dulwich-0.21.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c816be529680659b6a19798287b4ec6de49040f58160d40b1b2934fd6c28e93f"}, + {file = "dulwich-0.21.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b0545f0fa9444a0eb84977d08e302e3f55fd7c34a0466ec28bedc3c839b2fc1f"}, + {file = "dulwich-0.21.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b1682e8e826471ea3c22b8521435e93799e3db8ad05dd3c8f9b1aaacfa78147"}, + {file = "dulwich-0.21.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24ad45928a65f39ea0f451f9989b7aaedba9893d48c3189b544a70c6a1043f71"}, + {file = "dulwich-0.21.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:b1c9e55233f19cd19c484f607cd90ab578ac50ebfef607f77e3b35c2b6049470"}, + {file = "dulwich-0.21.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:18697b58e0fc5972de68b529b08ac9ddda3f39af27bcf3f6999635ed3da7ef68"}, + {file = "dulwich-0.21.6-cp38-cp38-win32.whl", hash = "sha256:22798e9ba59e32b8faff5d9067e2b5a308f6b0fba9b1e1e928571ad278e7b36c"}, + {file = "dulwich-0.21.6-cp38-cp38-win_amd64.whl", hash = "sha256:6c91e1ed20d3d9a6aaaed9e75adae37272b3fcbcc72bab1eb09574806da88563"}, + {file = "dulwich-0.21.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:8b84450766a3b151c3676fec3e3ed76304e52a84d5d69ade0f34fff2782c1b41"}, + {file = "dulwich-0.21.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a3da632648ee27b64bb5b285a3a94fddf297a596891cca12ac0df43c4f59448f"}, + {file = "dulwich-0.21.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cef50c0a19f322b7150248b8fa0862ce1652dec657e340c4020573721e85f215"}, + {file = "dulwich-0.21.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ac20dfcfd6057efb8499158d23f2c059f933aefa381e192100e6d8bc25d562"}, + {file = "dulwich-0.21.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81d10aa50c0a9a6dd495990c639358e3a3bbff39e17ff302179be6e93b573da7"}, + {file = "dulwich-0.21.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a9b52a08d49731375662936d05a12c4a64a6fe0ce257111f62638e475fb5d26d"}, + {file = "dulwich-0.21.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ed2f1f638b9adfba862719693b371ffe5d58e94d552ace9a23dea0fb0db6f468"}, + {file = "dulwich-0.21.6-cp39-cp39-win32.whl", hash = "sha256:bf90f2f9328a82778cf85ab696e4a7926918c3f315c75fc432ba31346bfa89b7"}, + {file = "dulwich-0.21.6-cp39-cp39-win_amd64.whl", hash = "sha256:e0dee3840c3c72e1d60c8f87a7a715d8eac023b9e1b80199d97790f7a1c60d9c"}, + {file = "dulwich-0.21.6-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:32d3a35caad6879d04711b358b861142440a543f5f4e02df67b13cbcd57f84a6"}, + {file = 
"dulwich-0.21.6-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c04df87098053b7767b46fc04b7943d75443f91c73560ca50157cdc22e27a5d3"}, + {file = "dulwich-0.21.6-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e07f145c7b0d82a9f77d157f493a61900e913d1c1f8b1f40d07d919ffb0929a4"}, + {file = "dulwich-0.21.6-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:008ff08629ab16d3638a9f36cfc6f5bd74b4d594657f2dc1583d8d3201794571"}, + {file = "dulwich-0.21.6-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bf469cd5076623c2aad69d01ce9d5392fcb38a5faef91abe1501be733453e37d"}, + {file = "dulwich-0.21.6-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6592ef2d16ac61a27022647cf64a048f5be6e0a6ab2ebc7322bfbe24fb2b971b"}, + {file = "dulwich-0.21.6-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99577b2b37f64bc87280079245fb2963494c345d7db355173ecec7ab3d64b949"}, + {file = "dulwich-0.21.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:d7cd9fb896c65e4c28cb9332f2be192817805978dd8dc299681c4fe83c631158"}, + {file = "dulwich-0.21.6-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:d9002094198e57e88fe77412d3aa64dd05978046ae725a16123ba621a7704628"}, + {file = "dulwich-0.21.6-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9b6f8a16f32190aa88c37ef013858b3e01964774bc983900bd0d74ecb6576e6"}, + {file = "dulwich-0.21.6-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee8aba4dec4d0a52737a8a141f3456229c87dcfd7961f8115786a27b6ebefed"}, + {file = "dulwich-0.21.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a780e2a0ff208c4f218e72eff8d13f9aff485ff9a6f3066c22abe4ec8cec7dcd"}, + {file = "dulwich-0.21.6.tar.gz", hash = "sha256:30fbe87e8b51f3813c131e2841c86d007434d160bd16db586b40d47f31dd05b0"}, ] [package.dependencies] @@ -1451,13 +1452,13 @@ tests = ["dvc[testing]", "flaky (==3.7.0)", "mypy 
(==0.910)", "pylint (==2.15.9) [[package]] name = "dvc-objects" -version = "0.23.1" +version = "0.25.0" description = "dvc objects" optional = false python-versions = ">=3.8" files = [ - {file = "dvc-objects-0.23.1.tar.gz", hash = "sha256:159ec9bede7443fbcbc64d33e53071ae51bea86fc82a764ab652655c35b58776"}, - {file = "dvc_objects-0.23.1-py3-none-any.whl", hash = "sha256:118640f4cf83415cd2bf104be39712660e470a9ac061e55ac688f7ab703677c4"}, + {file = "dvc-objects-0.25.0.tar.gz", hash = "sha256:6e13add661ab7766cc26493102c7981b5164351f0ca4ee33d080d1651d4b5899"}, + {file = "dvc_objects-0.25.0-py3-none-any.whl", hash = "sha256:09f318cbb376750f4d2ef0afcde4ae41ca3f3071d6192bfee676812acd1f6d1f"}, ] [package.dependencies] @@ -1512,13 +1513,13 @@ tests = ["Pygments (==2.10.0)", "collective.checkdocs (==0.2)", "dvc[testing]", [[package]] name = "dvc-studio-client" -version = "0.11.0" +version = "0.15.0" description = "Small library to post data from DVC/DVCLive to Iterative Studio" optional = false python-versions = ">=3.8" files = [ - {file = "dvc-studio-client-0.11.0.tar.gz", hash = "sha256:9179acc39bb9acfb54a5369142c835dc2428bd285e41281b005739ce63d9d55b"}, - {file = "dvc_studio_client-0.11.0-py3-none-any.whl", hash = "sha256:2832fe0bdf723dbe51320abcde238bd0a1e1a3befa15c1f05cc9ed2ca25fb39f"}, + {file = "dvc-studio-client-0.15.0.tar.gz", hash = "sha256:46dd508a0fb2c1c9986efd4111aa16ad3e40718c5e86a2be9f6e5ee509ff44a1"}, + {file = "dvc_studio_client-0.15.0-py3-none-any.whl", hash = "sha256:f51f36f9a86ea2bfcaed95b2ad6f532ed59a4d527c1febe079a938d79ff86796"}, ] [package.dependencies] @@ -1527,9 +1528,9 @@ requests = "*" voluptuous = "*" [package.extras] -dev = ["mkdocs (==1.3.1)", "mkdocs-gen-files (==0.3.5)", "mkdocs-material (==8.4.1)", "mkdocs-section-index (==0.3.4)", "mkdocstrings-python (==0.7.1)", "pytest (==7.2.0)", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)"] -docs = ["mkdocs (==1.3.1)", "mkdocs-gen-files (==0.3.5)", "mkdocs-material (==8.4.1)", 
"mkdocs-section-index (==0.3.4)", "mkdocstrings-python (==0.7.1)"] -tests = ["pytest (==7.2.0)", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)"] +dev = ["mkdocs (==1.5.2)", "mkdocs-gen-files (==0.5.0)", "mkdocs-material (==9.2.2)", "mkdocs-section-index (==0.3.5)", "mkdocstrings-python (==1.5.0)", "pytest (==7.4.0)", "pytest-cov (==4.1.0)", "pytest-mock (==3.11.1)", "pytest-sugar (==0.9.7)"] +docs = ["mkdocs (==1.5.2)", "mkdocs-gen-files (==0.5.0)", "mkdocs-material (==9.2.2)", "mkdocs-section-index (==0.3.5)", "mkdocstrings-python (==1.5.0)"] +tests = ["pytest (==7.4.0)", "pytest-cov (==4.1.0)", "pytest-mock (==3.11.1)", "pytest-sugar (==0.9.7)"] [[package]] name = "dvc-task" @@ -1556,13 +1557,13 @@ tests = ["celery-types (==0.15.0)", "flaky (==3.7.0)", "mypy (==0.971)", "pylint [[package]] name = "exceptiongroup" -version = "1.1.2" +version = "1.1.3" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.1.2-py3-none-any.whl", hash = "sha256:e346e69d186172ca7cf029c8c1d16235aa0e04035e5750b4b95039e65204328f"}, - {file = "exceptiongroup-1.1.2.tar.gz", hash = "sha256:12c3e887d6485d16943a309616de20ae5582633e0a2eda17f4e10fd61c1e8af5"}, + {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, + {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, ] [package.extras] @@ -1584,18 +1585,21 @@ tests = ["asttokens", "littleutils", "pytest", "rich"] [[package]] name = "filelock" -version = "3.12.2" +version = "3.12.3" description = "A platform independent file lock." 
optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "filelock-3.12.2-py3-none-any.whl", hash = "sha256:cbb791cdea2a72f23da6ac5b5269ab0a0d161e9ef0100e653b69049a7706d1ec"}, - {file = "filelock-3.12.2.tar.gz", hash = "sha256:002740518d8aa59a26b0c76e10fb8c6e15eae825d34b6fdf670333fd7b938d81"}, + {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"}, + {file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"}, ] +[package.dependencies] +typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.11\""} + [package.extras] -docs = ["furo (>=2023.5.20)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "diff-cover (>=7.5)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"] +docs = ["furo (>=2023.7.26)", "sphinx (>=7.1.2)", "sphinx-autodoc-typehints (>=1.24)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "pytest-timeout (>=2.1)"] [[package]] name = "flatten-dict" @@ -1613,13 +1617,13 @@ six = ">=1.12,<2.0" [[package]] name = "flufl-lock" -version = "8.0.1" +version = "8.0.2" description = "NFS-safe file locking with timeouts for POSIX and Windows" optional = false python-versions = ">=3.8" files = [ - {file = "flufl_lock-8.0.1-py3-none-any.whl", hash = "sha256:a3df854d76173d59813fdcba91671234b59e2a14db3390793745c77a7bb92d9d"}, - {file = "flufl_lock-8.0.1.tar.gz", hash = "sha256:edb7f1f3f8b4805ef6a6a23b9a3975bfc9b7c15eb33e10b0b086d0caa2a97e04"}, + {file = "flufl_lock-8.0.2-py3-none-any.whl", hash = "sha256:ca33fb581122d651e4f24775bebed1e58cd1ea85a95a505881902ba050ed170b"}, + {file = "flufl_lock-8.0.2.tar.gz", hash = 
"sha256:61c7246b34d6e5544c8a1fa4dae396d10e16ceb23371a31db22e0a2993d01432"}, ] [package.dependencies] @@ -1698,13 +1702,13 @@ files = [ [[package]] name = "fsspec" -version = "2023.6.0" +version = "2023.9.0" description = "File-system specification" optional = false python-versions = ">=3.8" files = [ - {file = "fsspec-2023.6.0-py3-none-any.whl", hash = "sha256:1cbad1faef3e391fba6dc005ae9b5bdcbf43005c9167ce78c915549c352c869a"}, - {file = "fsspec-2023.6.0.tar.gz", hash = "sha256:d0b2f935446169753e7a5c5c55681c54ea91996cc67be93c39a154fb3a2742af"}, + {file = "fsspec-2023.9.0-py3-none-any.whl", hash = "sha256:d55b9ab2a4c1f2b759888ae9f93e40c2aa72c0808132e87e282b549f9e6c4254"}, + {file = "fsspec-2023.9.0.tar.gz", hash = "sha256:4dbf0fefee035b7c6d3bbbe6bc99b2f201f40d4dca95b67c2b719be77bcd917f"}, ] [package.dependencies] @@ -1762,13 +1766,13 @@ smmap = ">=3.0.1,<6" [[package]] name = "gitpython" -version = "3.1.32" +version = "3.1.35" description = "GitPython is a Python library used to interact with Git repositories" optional = false python-versions = ">=3.7" files = [ - {file = "GitPython-3.1.32-py3-none-any.whl", hash = "sha256:e3d59b1c2c6ebb9dfa7a184daf3b6dd4914237e7488a1730a6d8f6f5d0b4187f"}, - {file = "GitPython-3.1.32.tar.gz", hash = "sha256:8d9b8cb1e80b9735e8717c9362079d3ce4c6e5ddeebedd0361b228c3a67a62f6"}, + {file = "GitPython-3.1.35-py3-none-any.whl", hash = "sha256:c19b4292d7a1d3c0f653858db273ff8a6614100d1eb1528b014ec97286193c09"}, + {file = "GitPython-3.1.35.tar.gz", hash = "sha256:9cbefbd1789a5fe9bcf621bb34d3f441f3a90c8461d377f84eda73e721d9b06b"}, ] [package.dependencies] @@ -1848,13 +1852,13 @@ socks = ["socksio (==1.*)"] [[package]] name = "huggingface-hub" -version = "0.16.4" +version = "0.17.0" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.16.4-py3-none-any.whl", hash = 
"sha256:0d3df29932f334fead024afc7cb4cc5149d955238b8b5e42dcf9740d6995a349"}, - {file = "huggingface_hub-0.16.4.tar.gz", hash = "sha256:608c7d4f3d368b326d1747f91523dbd1f692871e8e2e7a4750314a2dd8b63e14"}, + {file = "huggingface_hub-0.17.0-py3-none-any.whl", hash = "sha256:8111ef89ebf5514154b4e929662f57fc43818d06c95dabdfa4c77f9087383172"}, + {file = "huggingface_hub-0.17.0.tar.gz", hash = "sha256:a048c64e0f651c32afe41a1818bf2cd47de902ff65dfba395ff71b999d9d4655"}, ] [package.dependencies] @@ -1867,16 +1871,17 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (==23.7)", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (<2.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (>=23.1,<24.0)", "gradio", "jedi", "mypy (==0.982)", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (==23.7)", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (<2.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff 
(>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] +docs = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (==23.7)", "gradio", "hf-doc-builder", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (<2.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)", "watchdog"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] -inference = ["aiohttp", "pydantic"] -quality = ["black (>=23.1,<24.0)", "mypy (==0.982)", "ruff (>=0.0.241)"] +inference = ["aiohttp", "pydantic (<2.0)"] +quality = ["black (==23.7)", "mypy (==1.5.1)", "ruff (>=0.0.241)"] tensorflow = ["graphviz", "pydot", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic (<2.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] torch = ["torch"] -typing = ["pydantic", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] +typing = ["pydantic (<2.0)", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] [[package]] name = "hydra-core" @@ -1896,13 +1901,13 @@ packaging = "*" [[package]] name = "identify" -version = "2.5.24" +version = "2.5.27" description = "File identification library for Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "identify-2.5.24-py2.py3-none-any.whl", hash = 
"sha256:986dbfb38b1140e763e413e6feb44cd731faf72d1909543178aa79b0e258265d"}, - {file = "identify-2.5.24.tar.gz", hash = "sha256:0aac67d5b4812498056d28a9a512a483f5085cc28640b02b258a59dac34301d4"}, + {file = "identify-2.5.27-py2.py3-none-any.whl", hash = "sha256:fdb527b2dfe24602809b2201e033c2a113d7bdf716db3ca8e3243f735dcecaba"}, + {file = "identify-2.5.27.tar.gz", hash = "sha256:287b75b04a0e22d727bc9a41f0d4f3c1bcada97490fa6eabb5b28f0e9097e733"}, ] [package.extras] @@ -2056,31 +2061,31 @@ files = [ [[package]] name = "joblib" -version = "1.3.1" +version = "1.3.2" description = "Lightweight pipelining with Python functions" optional = false python-versions = ">=3.7" files = [ - {file = "joblib-1.3.1-py3-none-any.whl", hash = "sha256:89cf0529520e01b3de7ac7b74a8102c90d16d54c64b5dd98cafcd14307fdf915"}, - {file = "joblib-1.3.1.tar.gz", hash = "sha256:1f937906df65329ba98013dc9692fe22a4c5e4a648112de500508b18a21b41e3"}, + {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, + {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, ] [[package]] name = "johnsnowlabs" -version = "5.0.7" +version = "5.0.8" description = "The John Snow Labs Library gives you access to all of John Snow Labs Enterprise And Open Source products in an easy and simple manner. Access 10000+ state-of-the-art NLP and OCR models for Finance, Legal and Medical domains. 
Easily scalable to Spark Cluster" optional = false python-versions = "*" files = [ - {file = "johnsnowlabs-5.0.7-py3-none-any.whl", hash = "sha256:b95044738d93a6650081c87f00cd4e2ffa43288e45c508b916f8041da94bbddd"}, - {file = "johnsnowlabs-5.0.7.tar.gz", hash = "sha256:528c58164bea42e7d2311907568a6898565eeb61105f1554569b9caa72bf9fd7"}, + {file = "johnsnowlabs-5.0.8-py3-none-any.whl", hash = "sha256:a00f6c44684735716106d82ef420701f73085a3f9b1d9371060042a7d963ecd5"}, + {file = "johnsnowlabs-5.0.8.tar.gz", hash = "sha256:45edcf4a2d4dddc2718850f80dbb963bca80a3c31bba3646e165e84446c31bc6"}, ] [package.dependencies] colorama = "*" databricks-api = "*" dataclasses = "*" -nlu = "5.0.0" +nlu = "5.0.1" numpy = "*" pydantic = "1.10.11" pyspark = "3.1.2" @@ -2090,13 +2095,13 @@ spark-nlp-display = "4.1" [[package]] name = "kombu" -version = "5.3.1" +version = "5.3.2" description = "Messaging library for Python." optional = false python-versions = ">=3.8" files = [ - {file = "kombu-5.3.1-py3-none-any.whl", hash = "sha256:48ee589e8833126fd01ceaa08f8a2041334e9f5894e5763c8486a550454551e9"}, - {file = "kombu-5.3.1.tar.gz", hash = "sha256:fbd7572d92c0bf71c112a6b45163153dea5a7b6a701ec16b568c27d0fd2370f2"}, + {file = "kombu-5.3.2-py3-none-any.whl", hash = "sha256:b753c9cfc9b1e976e637a7cbc1a65d446a22e45546cd996ea28f932082b7dc9e"}, + {file = "kombu-5.3.2.tar.gz", hash = "sha256:0ba213f630a2cb2772728aef56ac6883dc3a2f13435e10048f6e97d48506dbbd"}, ] [package.dependencies] @@ -2171,13 +2176,13 @@ files = [ [[package]] name = "loguru" -version = "0.7.0" +version = "0.7.1" description = "Python logging made (stupidly) simple" optional = false python-versions = ">=3.5" files = [ - {file = "loguru-0.7.0-py3-none-any.whl", hash = "sha256:b93aa30099fa6860d4727f1b81f8718e965bb96253fa190fab2077aaad6d15d3"}, - {file = "loguru-0.7.0.tar.gz", hash = "sha256:1612053ced6ae84d7959dd7d5e431a0532642237ec21f7fd83ac73fe539e03e1"}, + {file = "loguru-0.7.1-py3-none-any.whl", hash = 
"sha256:046bf970cb3cad77a28d607cbf042ac25a407db987a1e801c7f7e692469982f9"}, + {file = "loguru-0.7.1.tar.gz", hash = "sha256:7ba2a7d81b79a412b0ded69bd921e012335e80fd39937a633570f273a343579e"}, ] [package.dependencies] @@ -2185,7 +2190,7 @@ colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""} win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""} [package.extras] -dev = ["Sphinx (==5.3.0)", "colorama (==0.4.5)", "colorama (==0.4.6)", "freezegun (==1.1.0)", "freezegun (==1.2.2)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v0.990)", "pre-commit (==3.2.1)", "pytest (==6.1.2)", "pytest (==7.2.1)", "pytest-cov (==2.12.1)", "pytest-cov (==4.0.0)", "pytest-mypy-plugins (==1.10.1)", "pytest-mypy-plugins (==1.9.3)", "sphinx-autobuild (==2021.3.14)", "sphinx-rtd-theme (==1.2.0)", "tox (==3.27.1)", "tox (==4.4.6)"] +dev = ["Sphinx (==7.2.5)", "colorama (==0.4.5)", "colorama (==0.4.6)", "freezegun (==1.1.0)", "freezegun (==1.2.2)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.4.1)", "pre-commit (==3.3.1)", "pytest (==6.1.2)", "pytest (==7.4.0)", "pytest-cov (==2.12.1)", "pytest-cov (==4.1.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.0.0)", "sphinx-autobuild (==2021.3.14)", "sphinx-rtd-theme (==1.3.0)", "tox (==3.27.1)", "tox (==4.11.0)"] [[package]] name = "markupsafe" @@ -2475,13 +2480,13 @@ test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] name = "nlu" -version = "5.0.0" +version = "5.0.1" description = "John Snow Labs NLU provides state of the art algorithms for NLP&NLU with 20000+ of pretrained models in 200+ languages. It enables swift and simple development and research with its powerful Pythonic and Keras inspired API. It is powerd by John Snow Labs powerful Spark NLP library." 
optional = false python-versions = "*" files = [ - {file = "nlu-5.0.0-py3-none-any.whl", hash = "sha256:4e9b62ab1e822d15881657dd320fed62c032856a6a5783e14172b92196116fbd"}, - {file = "nlu-5.0.0.tar.gz", hash = "sha256:e22d834839c1a7fe4a91aa6f21e79921798d0a4d1d643b03ef07f37d0bec7e75"}, + {file = "nlu-5.0.1-py3-none-any.whl", hash = "sha256:fd8126e99109c61f3fc01dcbcf81fe671d8ecc16b4dc5db3731103152dea4612"}, + {file = "nlu-5.0.1.tar.gz", hash = "sha256:05bc7508ef284ec5be0642e188f5039a9383f4b109de59540add721781d046d2"}, ] [package.dependencies] @@ -2613,57 +2618,71 @@ tenacity = "*" [[package]] name = "orjson" -version = "3.9.2" +version = "3.9.7" description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" optional = false python-versions = ">=3.7" files = [ - {file = "orjson-3.9.2-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7323e4ca8322b1ecb87562f1ec2491831c086d9faa9a6c6503f489dadbed37d7"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1272688ea1865f711b01ba479dea2d53e037ea00892fd04196b5875f7021d9d3"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0b9a26f1d1427a9101a1e8910f2e2df1f44d3d18ad5480ba031b15d5c1cb282e"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a5ca55b0d8f25f18b471e34abaee4b175924b6cd62f59992945b25963443141"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:877872db2c0f41fbe21f852ff642ca842a43bc34895b70f71c9d575df31fffb4"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a39c2529d75373b7167bf84c814ef9b8f3737a339c225ed6c0df40736df8748"}, - {file = "orjson-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:84ebd6fdf138eb0eb4280045442331ee71c0aab5e16397ba6645f32f911bfb37"}, - {file = 
"orjson-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a60a1cfcfe310547a1946506dd4f1ed0a7d5bd5b02c8697d9d5dcd8d2e9245e"}, - {file = "orjson-3.9.2-cp310-none-win_amd64.whl", hash = "sha256:c290c4f81e8fd0c1683638802c11610b2f722b540f8e5e858b6914b495cf90c8"}, - {file = "orjson-3.9.2-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:02ef014f9a605e84b675060785e37ec9c0d2347a04f1307a9d6840ab8ecd6f55"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:992af54265ada1c1579500d6594ed73fe333e726de70d64919cf37f93defdd06"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a40958f7af7c6d992ee67b2da4098dca8b770fc3b4b3834d540477788bfa76d3"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93864dec3e3dd058a2dbe488d11ac0345214a6a12697f53a63e34de7d28d4257"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16fdf5a82df80c544c3c91516ab3882cd1ac4f1f84eefeafa642e05cef5f6699"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275b5a18fd9ed60b2720543d3ddac170051c43d680e47d04ff5203d2c6d8ebf1"}, - {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b9aea6dcb99fcbc9f6d1dd84fca92322fda261da7fb014514bb4689c7c2097a8"}, - {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d74ae0e101d17c22ef67b741ba356ab896fc0fa64b301c2bf2bb0a4d874b190"}, - {file = "orjson-3.9.2-cp311-none-win_amd64.whl", hash = "sha256:6320b28e7bdb58c3a3a5efffe04b9edad3318d82409e84670a9b24e8035a249d"}, - {file = "orjson-3.9.2-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:368e9cc91ecb7ac21f2aa475e1901204110cf3e714e98649c2502227d248f947"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:58e9e70f0dcd6a802c35887f306b555ff7a214840aad7de24901fc8bd9cf5dde"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:00c983896c2e01c94c0ef72fd7373b2aa06d0c0eed0342c4884559f812a6835b"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ee743e8890b16c87a2f89733f983370672272b61ee77429c0a5899b2c98c1a7"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7b065942d362aad4818ff599d2f104c35a565c2cbcbab8c09ec49edba91da75"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e46e9c5b404bb9e41d5555762fd410d5466b7eb1ec170ad1b1609cbebe71df21"}, - {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8170157288714678ffd64f5de33039e1164a73fd8b6be40a8a273f80093f5c4f"}, - {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e3e2f087161947dafe8319ea2cfcb9cea4bb9d2172ecc60ac3c9738f72ef2909"}, - {file = "orjson-3.9.2-cp37-none-win_amd64.whl", hash = "sha256:d7de3dbbe74109ae598692113cec327fd30c5a30ebca819b21dfa4052f7b08ef"}, - {file = "orjson-3.9.2-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8cd4385c59bbc1433cad4a80aca65d2d9039646a9c57f8084897549b55913b17"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a74036aab1a80c361039290cdbc51aa7adc7ea13f56e5ef94e9be536abd227bd"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1aaa46d7d4ae55335f635eadc9be0bd9bcf742e6757209fc6dc697e390010adc"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e52c67ed6bb368083aa2078ea3ccbd9721920b93d4b06c43eb4e20c4c860046"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:1a6cdfcf9c7dd4026b2b01fdff56986251dc0cc1e980c690c79eec3ae07b36e7"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1882a70bb69595b9ec5aac0040a819e94d2833fe54901e2b32f5e734bc259a8b"}, - {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:fc05e060d452145ab3c0b5420769e7356050ea311fc03cb9d79c481982917cca"}, - {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8bc2c40d9bb26efefb10949d261a47ca196772c308babc538dd9f4b73e8d386"}, - {file = "orjson-3.9.2-cp38-none-win_amd64.whl", hash = "sha256:3164fc20a585ec30a9aff33ad5de3b20ce85702b2b2a456852c413e3f0d7ab09"}, - {file = "orjson-3.9.2-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7a6ccadf788531595ed4728aa746bc271955448d2460ff0ef8e21eb3f2a281ba"}, - {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3245d230370f571c945f69aab823c279a868dc877352817e22e551de155cb06c"}, - {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:205925b179550a4ee39b8418dd4c94ad6b777d165d7d22614771c771d44f57bd"}, - {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0325fe2d69512187761f7368c8cda1959bcb75fc56b8e7a884e9569112320e57"}, - {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:806704cd58708acc66a064a9a58e3be25cf1c3f9f159e8757bd3f515bfabdfa1"}, - {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03fb36f187a0c19ff38f6289418863df8b9b7880cdbe279e920bef3a09d8dab1"}, - {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:20925d07a97c49c6305bff1635318d9fc1804aa4ccacb5fb0deb8a910e57d97a"}, - {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eebfed53bec5674e981ebe8ed2cf00b3f7bcda62d634733ff779c264307ea505"}, - {file = 
"orjson-3.9.2-cp39-none-win_amd64.whl", hash = "sha256:869b961df5fcedf6c79f4096119b35679b63272362e9b745e668f0391a892d39"}, - {file = "orjson-3.9.2.tar.gz", hash = "sha256:24257c8f641979bf25ecd3e27251b5cc194cdd3a6e96004aac8446f5e63d9664"}, + {file = "orjson-3.9.7-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:b6df858e37c321cefbf27fe7ece30a950bcc3a75618a804a0dcef7ed9dd9c92d"}, + {file = "orjson-3.9.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5198633137780d78b86bb54dafaaa9baea698b4f059456cd4554ab7009619221"}, + {file = "orjson-3.9.7-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e736815b30f7e3c9044ec06a98ee59e217a833227e10eb157f44071faddd7c5"}, + {file = "orjson-3.9.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a19e4074bc98793458b4b3ba35a9a1d132179345e60e152a1bb48c538ab863c4"}, + {file = "orjson-3.9.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80acafe396ab689a326ab0d80f8cc61dec0dd2c5dca5b4b3825e7b1e0132c101"}, + {file = "orjson-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:355efdbbf0cecc3bd9b12589b8f8e9f03c813a115efa53f8dc2a523bfdb01334"}, + {file = "orjson-3.9.7-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3aab72d2cef7f1dd6104c89b0b4d6b416b0db5ca87cc2fac5f79c5601f549cc2"}, + {file = "orjson-3.9.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:36b1df2e4095368ee388190687cb1b8557c67bc38400a942a1a77713580b50ae"}, + {file = "orjson-3.9.7-cp310-none-win32.whl", hash = "sha256:e94b7b31aa0d65f5b7c72dd8f8227dbd3e30354b99e7a9af096d967a77f2a580"}, + {file = "orjson-3.9.7-cp310-none-win_amd64.whl", hash = "sha256:82720ab0cf5bb436bbd97a319ac529aee06077ff7e61cab57cee04a596c4f9b4"}, + {file = "orjson-3.9.7-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = 
"sha256:1f8b47650f90e298b78ecf4df003f66f54acdba6a0f763cc4df1eab048fe3738"}, + {file = "orjson-3.9.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f738fee63eb263530efd4d2e9c76316c1f47b3bbf38c1bf45ae9625feed0395e"}, + {file = "orjson-3.9.7-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38e34c3a21ed41a7dbd5349e24c3725be5416641fdeedf8f56fcbab6d981c900"}, + {file = "orjson-3.9.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:21a3344163be3b2c7e22cef14fa5abe957a892b2ea0525ee86ad8186921b6cf0"}, + {file = "orjson-3.9.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23be6b22aab83f440b62a6f5975bcabeecb672bc627face6a83bc7aeb495dc7e"}, + {file = "orjson-3.9.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5205ec0dfab1887dd383597012199f5175035e782cdb013c542187d280ca443"}, + {file = "orjson-3.9.7-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8769806ea0b45d7bf75cad253fba9ac6700b7050ebb19337ff6b4e9060f963fa"}, + {file = "orjson-3.9.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f9e01239abea2f52a429fe9d95c96df95f078f0172489d691b4a848ace54a476"}, + {file = "orjson-3.9.7-cp311-none-win32.whl", hash = "sha256:8bdb6c911dae5fbf110fe4f5cba578437526334df381b3554b6ab7f626e5eeca"}, + {file = "orjson-3.9.7-cp311-none-win_amd64.whl", hash = "sha256:9d62c583b5110e6a5cf5169ab616aa4ec71f2c0c30f833306f9e378cf51b6c86"}, + {file = "orjson-3.9.7-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:1c3cee5c23979deb8d1b82dc4cc49be59cccc0547999dbe9adb434bb7af11cf7"}, + {file = "orjson-3.9.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a347d7b43cb609e780ff8d7b3107d4bcb5b6fd09c2702aa7bdf52f15ed09fa09"}, + {file = "orjson-3.9.7-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:154fd67216c2ca38a2edb4089584504fbb6c0694b518b9020ad35ecc97252bb9"}, 
+ {file = "orjson-3.9.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ea3e63e61b4b0beeb08508458bdff2daca7a321468d3c4b320a758a2f554d31"}, + {file = "orjson-3.9.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1eb0b0b2476f357eb2975ff040ef23978137aa674cd86204cfd15d2d17318588"}, + {file = "orjson-3.9.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b9a20a03576c6b7022926f614ac5a6b0914486825eac89196adf3267c6489d"}, + {file = "orjson-3.9.7-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:915e22c93e7b7b636240c5a79da5f6e4e84988d699656c8e27f2ac4c95b8dcc0"}, + {file = "orjson-3.9.7-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f26fb3e8e3e2ee405c947ff44a3e384e8fa1843bc35830fe6f3d9a95a1147b6e"}, + {file = "orjson-3.9.7-cp312-none-win_amd64.whl", hash = "sha256:d8692948cada6ee21f33db5e23460f71c8010d6dfcfe293c9b96737600a7df78"}, + {file = "orjson-3.9.7-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7bab596678d29ad969a524823c4e828929a90c09e91cc438e0ad79b37ce41166"}, + {file = "orjson-3.9.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63ef3d371ea0b7239ace284cab9cd00d9c92b73119a7c274b437adb09bda35e6"}, + {file = "orjson-3.9.7-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2f8fcf696bbbc584c0c7ed4adb92fd2ad7d153a50258842787bc1524e50d7081"}, + {file = "orjson-3.9.7-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:90fe73a1f0321265126cbba13677dcceb367d926c7a65807bd80916af4c17047"}, + {file = "orjson-3.9.7-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:45a47f41b6c3beeb31ac5cf0ff7524987cfcce0a10c43156eb3ee8d92d92bf22"}, + {file = "orjson-3.9.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a2937f528c84e64be20cb80e70cea76a6dfb74b628a04dab130679d4454395c"}, + {file = 
"orjson-3.9.7-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b4fb306c96e04c5863d52ba8d65137917a3d999059c11e659eba7b75a69167bd"}, + {file = "orjson-3.9.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:410aa9d34ad1089898f3db461b7b744d0efcf9252a9415bbdf23540d4f67589f"}, + {file = "orjson-3.9.7-cp37-none-win32.whl", hash = "sha256:26ffb398de58247ff7bde895fe30817a036f967b0ad0e1cf2b54bda5f8dcfdd9"}, + {file = "orjson-3.9.7-cp37-none-win_amd64.whl", hash = "sha256:bcb9a60ed2101af2af450318cd89c6b8313e9f8df4e8fb12b657b2e97227cf08"}, + {file = "orjson-3.9.7-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:5da9032dac184b2ae2da4bce423edff7db34bfd936ebd7d4207ea45840f03905"}, + {file = "orjson-3.9.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7951af8f2998045c656ba8062e8edf5e83fd82b912534ab1de1345de08a41d2b"}, + {file = "orjson-3.9.7-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8e59650292aa3a8ea78073fc84184538783966528e442a1b9ed653aa282edcf"}, + {file = "orjson-3.9.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9274ba499e7dfb8a651ee876d80386b481336d3868cba29af839370514e4dce0"}, + {file = "orjson-3.9.7-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ca1706e8b8b565e934c142db6a9592e6401dc430e4b067a97781a997070c5378"}, + {file = "orjson-3.9.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83cc275cf6dcb1a248e1876cdefd3f9b5f01063854acdfd687ec360cd3c9712a"}, + {file = "orjson-3.9.7-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:11c10f31f2c2056585f89d8229a56013bc2fe5de51e095ebc71868d070a8dd81"}, + {file = "orjson-3.9.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:cf334ce1d2fadd1bf3e5e9bf15e58e0c42b26eb6590875ce65bd877d917a58aa"}, + {file = "orjson-3.9.7-cp38-none-win32.whl", hash = "sha256:76a0fc023910d8a8ab64daed8d31d608446d2d77c6474b616b34537aa7b79c7f"}, + {file = 
"orjson-3.9.7-cp38-none-win_amd64.whl", hash = "sha256:7a34a199d89d82d1897fd4a47820eb50947eec9cda5fd73f4578ff692a912f89"}, + {file = "orjson-3.9.7-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:e7e7f44e091b93eb39db88bb0cb765db09b7a7f64aea2f35e7d86cbf47046c65"}, + {file = "orjson-3.9.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01d647b2a9c45a23a84c3e70e19d120011cba5f56131d185c1b78685457320bb"}, + {file = "orjson-3.9.7-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0eb850a87e900a9c484150c414e21af53a6125a13f6e378cf4cc11ae86c8f9c5"}, + {file = "orjson-3.9.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f4b0042d8388ac85b8330b65406c84c3229420a05068445c13ca28cc222f1f7"}, + {file = "orjson-3.9.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd3e7aae977c723cc1dbb82f97babdb5e5fbce109630fbabb2ea5053523c89d3"}, + {file = "orjson-3.9.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c616b796358a70b1f675a24628e4823b67d9e376df2703e893da58247458956"}, + {file = "orjson-3.9.7-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c3ba725cf5cf87d2d2d988d39c6a2a8b6fc983d78ff71bc728b0be54c869c884"}, + {file = "orjson-3.9.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4891d4c934f88b6c29b56395dfc7014ebf7e10b9e22ffd9877784e16c6b2064f"}, + {file = "orjson-3.9.7-cp39-none-win32.whl", hash = "sha256:14d3fb6cd1040a4a4a530b28e8085131ed94ebc90d72793c59a713de34b60838"}, + {file = "orjson-3.9.7-cp39-none-win_amd64.whl", hash = "sha256:9ef82157bbcecd75d6296d5d8b2d792242afcd064eb1ac573f8847b52e58f677"}, + {file = "orjson-3.9.7.tar.gz", hash = "sha256:85e39198f78e2f7e054d296395f6c96f5e02892337746ef5b6a1bf3ed5910142"}, ] [[package]] @@ -2741,13 +2760,13 @@ testing = ["docopt", "pytest (<6.0.0)"] [[package]] name = "pathspec" -version = "0.11.1" +version = "0.11.2" description = "Utility library for gitignore 
style pattern matching of file paths." optional = false python-versions = ">=3.7" files = [ - {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, - {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, + {file = "pathspec-0.11.2-py3-none-any.whl", hash = "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20"}, + {file = "pathspec-0.11.2.tar.gz", hash = "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"}, ] [[package]] @@ -2809,28 +2828,28 @@ files = [ [[package]] name = "platformdirs" -version = "3.9.1" +version = "3.10.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." optional = false python-versions = ">=3.7" files = [ - {file = "platformdirs-3.9.1-py3-none-any.whl", hash = "sha256:ad8291ae0ae5072f66c16945166cb11c63394c7a3ad1b1bc9828ca3162da8c2f"}, - {file = "platformdirs-3.9.1.tar.gz", hash = "sha256:1b42b450ad933e981d56e59f1b97495428c9bd60698baab9f3eb3d00d5822421"}, + {file = "platformdirs-3.10.0-py3-none-any.whl", hash = "sha256:d7c24979f292f916dc9cbf8648319032f551ea8c49a4c9bf2fb556a02070ec1d"}, + {file = "platformdirs-3.10.0.tar.gz", hash = "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d"}, ] [package.extras] -docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.1)", "sphinx-autodoc-typehints (>=1.24)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)"] [[package]] name = "pluggy" -version = "1.2.0" +version = "1.3.0" description = "plugin and hook calling mechanisms for 
python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, - {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, + {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, + {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, ] [package.extras] @@ -2839,13 +2858,13 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "pre-commit" -version = "3.3.3" +version = "3.4.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." optional = false python-versions = ">=3.8" files = [ - {file = "pre_commit-3.3.3-py2.py3-none-any.whl", hash = "sha256:10badb65d6a38caff29703362271d7dca483d01da88f9d7e05d0b97171c136cb"}, - {file = "pre_commit-3.3.3.tar.gz", hash = "sha256:a2256f489cd913d575c145132ae196fe335da32d91a8294b7afe6622335dd023"}, + {file = "pre_commit-3.4.0-py2.py3-none-any.whl", hash = "sha256:96d529a951f8b677f730a7212442027e8ba53f9b04d217c4c67dc56c393ad945"}, + {file = "pre_commit-3.4.0.tar.gz", hash = "sha256:6bbd5129a64cad4c0dfaeeb12cd8f7ea7e15b77028d985341478c8af3c759522"}, ] [package.dependencies] @@ -2912,24 +2931,24 @@ wcwidth = "*" [[package]] name = "protobuf" -version = "4.23.4" +version = "4.24.3" description = "" optional = false python-versions = ">=3.7" files = [ - {file = "protobuf-4.23.4-cp310-abi3-win32.whl", hash = "sha256:5fea3c64d41ea5ecf5697b83e41d09b9589e6f20b677ab3c48e5f242d9b7897b"}, - {file = "protobuf-4.23.4-cp310-abi3-win_amd64.whl", hash = "sha256:7b19b6266d92ca6a2a87effa88ecc4af73ebc5cfde194dc737cf8ef23a9a3b12"}, - {file = "protobuf-4.23.4-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8547bf44fe8cec3c69e3042f5c4fb3e36eb2a7a013bb0a44c018fc1e427aafbd"}, 
- {file = "protobuf-4.23.4-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:fee88269a090ada09ca63551bf2f573eb2424035bcf2cb1b121895b01a46594a"}, - {file = "protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:effeac51ab79332d44fba74660d40ae79985901ac21bca408f8dc335a81aa597"}, - {file = "protobuf-4.23.4-cp37-cp37m-win32.whl", hash = "sha256:c3e0939433c40796ca4cfc0fac08af50b00eb66a40bbbc5dee711998fb0bbc1e"}, - {file = "protobuf-4.23.4-cp37-cp37m-win_amd64.whl", hash = "sha256:9053df6df8e5a76c84339ee4a9f5a2661ceee4a0dab019e8663c50ba324208b0"}, - {file = "protobuf-4.23.4-cp38-cp38-win32.whl", hash = "sha256:e1c915778d8ced71e26fcf43c0866d7499891bca14c4368448a82edc61fdbc70"}, - {file = "protobuf-4.23.4-cp38-cp38-win_amd64.whl", hash = "sha256:351cc90f7d10839c480aeb9b870a211e322bf05f6ab3f55fcb2f51331f80a7d2"}, - {file = "protobuf-4.23.4-cp39-cp39-win32.whl", hash = "sha256:6dd9b9940e3f17077e820b75851126615ee38643c2c5332aa7a359988820c720"}, - {file = "protobuf-4.23.4-cp39-cp39-win_amd64.whl", hash = "sha256:0a5759f5696895de8cc913f084e27fd4125e8fb0914bb729a17816a33819f474"}, - {file = "protobuf-4.23.4-py3-none-any.whl", hash = "sha256:e9d0be5bf34b275b9f87ba7407796556abeeba635455d036c7351f7c183ef8ff"}, - {file = "protobuf-4.23.4.tar.gz", hash = "sha256:ccd9430c0719dce806b93f89c91de7977304729e55377f872a92465d548329a9"}, + {file = "protobuf-4.24.3-cp310-abi3-win32.whl", hash = "sha256:20651f11b6adc70c0f29efbe8f4a94a74caf61b6200472a9aea6e19898f9fcf4"}, + {file = "protobuf-4.24.3-cp310-abi3-win_amd64.whl", hash = "sha256:3d42e9e4796a811478c783ef63dc85b5a104b44aaaca85d4864d5b886e4b05e3"}, + {file = "protobuf-4.24.3-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:6e514e8af0045be2b56e56ae1bb14f43ce7ffa0f68b1c793670ccbe2c4fc7d2b"}, + {file = "protobuf-4.24.3-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:ba53c2f04798a326774f0e53b9c759eaef4f6a568ea7072ec6629851c8435959"}, + {file = "protobuf-4.24.3-cp37-abi3-manylinux2014_x86_64.whl", hash = 
"sha256:f6ccbcf027761a2978c1406070c3788f6de4a4b2cc20800cc03d52df716ad675"}, + {file = "protobuf-4.24.3-cp37-cp37m-win32.whl", hash = "sha256:1b182c7181a2891e8f7f3a1b5242e4ec54d1f42582485a896e4de81aa17540c2"}, + {file = "protobuf-4.24.3-cp37-cp37m-win_amd64.whl", hash = "sha256:b0271a701e6782880d65a308ba42bc43874dabd1a0a0f41f72d2dac3b57f8e76"}, + {file = "protobuf-4.24.3-cp38-cp38-win32.whl", hash = "sha256:e29d79c913f17a60cf17c626f1041e5288e9885c8579832580209de8b75f2a52"}, + {file = "protobuf-4.24.3-cp38-cp38-win_amd64.whl", hash = "sha256:067f750169bc644da2e1ef18c785e85071b7c296f14ac53e0900e605da588719"}, + {file = "protobuf-4.24.3-cp39-cp39-win32.whl", hash = "sha256:2da777d34b4f4f7613cdf85c70eb9a90b1fbef9d36ae4a0ccfe014b0b07906f1"}, + {file = "protobuf-4.24.3-cp39-cp39-win_amd64.whl", hash = "sha256:f631bb982c5478e0c1c70eab383af74a84be66945ebf5dd6b06fc90079668d0b"}, + {file = "protobuf-4.24.3-py3-none-any.whl", hash = "sha256:f6f8dc65625dadaad0c8545319c2e2f0424fede988368893ca3844261342c11a"}, + {file = "protobuf-4.24.3.tar.gz", hash = "sha256:12e9ad2ec079b833176d2921be2cb24281fa591f0b119b208b788adc48c2561d"}, ] [[package]] @@ -2996,36 +3015,40 @@ files = [ [[package]] name = "pyarrow" -version = "12.0.1" +version = "13.0.0" description = "Python library for Apache Arrow" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pyarrow-12.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:6d288029a94a9bb5407ceebdd7110ba398a00412c5b0155ee9813a40d246c5df"}, - {file = "pyarrow-12.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:345e1828efdbd9aa4d4de7d5676778aba384a2c3add896d995b23d368e60e5af"}, - {file = "pyarrow-12.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d6009fdf8986332b2169314da482baed47ac053311c8934ac6651e614deacd6"}, - {file = "pyarrow-12.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:2d3c4cbbf81e6dd23fe921bc91dc4619ea3b79bc58ef10bce0f49bdafb103daf"}, - {file = "pyarrow-12.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:cdacf515ec276709ac8042c7d9bd5be83b4f5f39c6c037a17a60d7ebfd92c890"}, - {file = "pyarrow-12.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:749be7fd2ff260683f9cc739cb862fb11be376de965a2a8ccbf2693b098db6c7"}, - {file = "pyarrow-12.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6895b5fb74289d055c43db3af0de6e16b07586c45763cb5e558d38b86a91e3a7"}, - {file = "pyarrow-12.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1887bdae17ec3b4c046fcf19951e71b6a619f39fa674f9881216173566c8f718"}, - {file = "pyarrow-12.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2c9cb8eeabbadf5fcfc3d1ddea616c7ce893db2ce4dcef0ac13b099ad7ca082"}, - {file = "pyarrow-12.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:ce4aebdf412bd0eeb800d8e47db854f9f9f7e2f5a0220440acf219ddfddd4f63"}, - {file = "pyarrow-12.0.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:e0d8730c7f6e893f6db5d5b86eda42c0a130842d101992b581e2138e4d5663d3"}, - {file = "pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43364daec02f69fec89d2315f7fbfbeec956e0d991cbbef471681bd77875c40f"}, - {file = "pyarrow-12.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:051f9f5ccf585f12d7de836e50965b3c235542cc896959320d9776ab93f3b33d"}, - {file = "pyarrow-12.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:be2757e9275875d2a9c6e6052ac7957fbbfc7bc7370e4a036a9b893e96fedaba"}, - {file = "pyarrow-12.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:cf812306d66f40f69e684300f7af5111c11f6e0d89d6b733e05a3de44961529d"}, - {file = "pyarrow-12.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:459a1c0ed2d68671188b2118c63bac91eaef6fc150c77ddd8a583e3c795737bf"}, - {file = "pyarrow-12.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:85e705e33eaf666bbe508a16fd5ba27ca061e177916b7a317ba5a51bee43384c"}, - {file = "pyarrow-12.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9120c3eb2b1f6f516a3b7a9714ed860882d9ef98c4b17edcdc91d95b7528db60"}, - {file = "pyarrow-12.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:c780f4dc40460015d80fcd6a6140de80b615349ed68ef9adb653fe351778c9b3"}, - {file = "pyarrow-12.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:a3c63124fc26bf5f95f508f5d04e1ece8cc23a8b0af2a1e6ab2b1ec3fdc91b24"}, - {file = "pyarrow-12.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b13329f79fa4472324f8d32dc1b1216616d09bd1e77cfb13104dec5463632c36"}, - {file = "pyarrow-12.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb656150d3d12ec1396f6dde542db1675a95c0cc8366d507347b0beed96e87ca"}, - {file = "pyarrow-12.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6251e38470da97a5b2e00de5c6a049149f7b2bd62f12fa5dbb9ac674119ba71a"}, - {file = "pyarrow-12.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3de26da901216149ce086920547dfff5cd22818c9eab67ebc41e863a5883bac7"}, - {file = "pyarrow-12.0.1.tar.gz", hash = "sha256:cce317fc96e5b71107bf1f9f184d5e54e2bd14bbf3f9a3d62819961f0af86fec"}, + {file = "pyarrow-13.0.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:1afcc2c33f31f6fb25c92d50a86b7a9f076d38acbcb6f9e74349636109550148"}, + {file = "pyarrow-13.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:70fa38cdc66b2fc1349a082987f2b499d51d072faaa6b600f71931150de2e0e3"}, + {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd57b13a6466822498238877892a9b287b0a58c2e81e4bdb0b596dbb151cbb73"}, + {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8ce69f7bf01de2e2764e14df45b8404fc6f1a5ed9871e8e08a12169f87b7a26"}, + {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = 
"sha256:588f0d2da6cf1b1680974d63be09a6530fd1bd825dc87f76e162404779a157dc"}, + {file = "pyarrow-13.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6241afd72b628787b4abea39e238e3ff9f34165273fad306c7acf780dd850956"}, + {file = "pyarrow-13.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:fda7857e35993673fcda603c07d43889fca60a5b254052a462653f8656c64f44"}, + {file = "pyarrow-13.0.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:aac0ae0146a9bfa5e12d87dda89d9ef7c57a96210b899459fc2f785303dcbb67"}, + {file = "pyarrow-13.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d7759994217c86c161c6a8060509cfdf782b952163569606bb373828afdd82e8"}, + {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:868a073fd0ff6468ae7d869b5fc1f54de5c4255b37f44fb890385eb68b68f95d"}, + {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51be67e29f3cfcde263a113c28e96aa04362ed8229cb7c6e5f5c719003659d33"}, + {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d1b4e7176443d12610874bb84d0060bf080f000ea9ed7c84b2801df851320295"}, + {file = "pyarrow-13.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:69b6f9a089d116a82c3ed819eea8fe67dae6105f0d81eaf0fdd5e60d0c6e0944"}, + {file = "pyarrow-13.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:ab1268db81aeb241200e321e220e7cd769762f386f92f61b898352dd27e402ce"}, + {file = "pyarrow-13.0.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:ee7490f0f3f16a6c38f8c680949551053c8194e68de5046e6c288e396dccee80"}, + {file = "pyarrow-13.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e3ad79455c197a36eefbd90ad4aa832bece7f830a64396c15c61a0985e337287"}, + {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68fcd2dc1b7d9310b29a15949cdd0cb9bc34b6de767aff979ebf546020bf0ba0"}, + {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:dc6fd330fd574c51d10638e63c0d00ab456498fc804c9d01f2a61b9264f2c5b2"}, + {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:e66442e084979a97bb66939e18f7b8709e4ac5f887e636aba29486ffbf373763"}, + {file = "pyarrow-13.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:0f6eff839a9e40e9c5610d3ff8c5bdd2f10303408312caf4c8003285d0b49565"}, + {file = "pyarrow-13.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b30a27f1cddf5c6efcb67e598d7823a1e253d743d92ac32ec1eb4b6a1417867"}, + {file = "pyarrow-13.0.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:09552dad5cf3de2dc0aba1c7c4b470754c69bd821f5faafc3d774bedc3b04bb7"}, + {file = "pyarrow-13.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3896ae6c205d73ad192d2fc1489cd0edfab9f12867c85b4c277af4d37383c18c"}, + {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6647444b21cb5e68b593b970b2a9a07748dd74ea457c7dadaa15fd469c48ada1"}, + {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47663efc9c395e31d09c6aacfa860f4473815ad6804311c5433f7085415d62a7"}, + {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:b9ba6b6d34bd2563345488cf444510588ea42ad5613df3b3509f48eb80250afd"}, + {file = "pyarrow-13.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:d00d374a5625beeb448a7fa23060df79adb596074beb3ddc1838adb647b6ef09"}, + {file = "pyarrow-13.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:c51afd87c35c8331b56f796eff954b9c7f8d4b7fef5903daf4e05fcf017d23a8"}, + {file = "pyarrow-13.0.0.tar.gz", hash = "sha256:83333726e83ed44b0ac94d8d7a21bbdee4a05029c3b1e8db58a863eec8fd8a33"}, ] [package.dependencies] @@ -3110,56 +3133,47 @@ pyparsing = ">=2.1.4" [[package]] name = "pygit2" -version = "1.12.2" +version = "1.13.0" description = "Python bindings for libgit2." 
optional = false python-versions = ">=3.8" files = [ - {file = "pygit2-1.12.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:79fbd99d3e08ca7478150eeba28ca4d4103f564148eab8d00aba8f1e6fc60654"}, - {file = "pygit2-1.12.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:be3bb0139f464947523022a5af343a2e862c4ff250a57ec9f631449e7c0ba7c0"}, - {file = "pygit2-1.12.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4df3e5745fdf3111a6ccc905eae99f22f1a180728f714795138ca540cc2a50a"}, - {file = "pygit2-1.12.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:214bd214784fcbef7a8494d1d59e0cd3a731c0d24ce0f230dcc843322ee33b08"}, - {file = "pygit2-1.12.2-cp310-cp310-win32.whl", hash = "sha256:336c864ac961e7be8ba06e9ed8c999e4f624a8ccd90121cc4e40956d8b57acac"}, - {file = "pygit2-1.12.2-cp310-cp310-win_amd64.whl", hash = "sha256:fb9eb57b75ce586928053692a25aae2a50fef3ad36661c57c07d4902899b1df3"}, - {file = "pygit2-1.12.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8f813d35d836c5b0d1962c387754786bcc7f1c3c8e11207b9eeb30238ac4cc7"}, - {file = "pygit2-1.12.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:25a6548930328c5247bfb7c67d29104e63b036cb5390f032d9f91f63efb70434"}, - {file = "pygit2-1.12.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a365ffca23d910381749fdbcc367db52fe808f9aa4852914dd9ef8b711384a32"}, - {file = "pygit2-1.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec04c27be5d5af1ceecdcc0464e07081222f91f285f156dc53b23751d146569a"}, - {file = "pygit2-1.12.2-cp311-cp311-win32.whl", hash = "sha256:546091316c9a8c37b9867ddcc6c9f7402ca4d0b9db3f349212a7b5e71988e359"}, - {file = "pygit2-1.12.2-cp311-cp311-win_amd64.whl", hash = "sha256:8bf14196cbfffbcd286f459a1d4fc660c5d5dfa8fb422e21216961df575410d6"}, - {file = "pygit2-1.12.2-cp38-cp38-macosx_10_9_universal2.whl", hash = 
"sha256:7bb30ab1fdaa4c30821fed33892958b6d92d50dbd03c76f7775b4e5d62f53a2e"}, - {file = "pygit2-1.12.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e7e705aaecad85b883022e81e054fbd27d26023fc031618ee61c51516580517e"}, - {file = "pygit2-1.12.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac2b5f408eb882e79645ebb43039ac37739c3edd25d857cc97d7482a684b613f"}, - {file = "pygit2-1.12.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22e7f3ad2b7b0c80be991bb47d8a2f2535cc9bf090746eb8679231ee565fde81"}, - {file = "pygit2-1.12.2-cp38-cp38-win32.whl", hash = "sha256:5b3ab4d6302990f7adb2b015bcbda1f0715277008d0c66440497e6f8313bf9cb"}, - {file = "pygit2-1.12.2-cp38-cp38-win_amd64.whl", hash = "sha256:c74e7601cb8b8dc3d02fd32274e200a7761cffd20ee531442bf1fa115c8f99a5"}, - {file = "pygit2-1.12.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6a4083ba093c69142e0400114a4ef75e87834637d2bbfd77b964614bf70f624f"}, - {file = "pygit2-1.12.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:926f2e48c4eaa179249d417b8382290b86b0f01dbf41d289f763576209276b9f"}, - {file = "pygit2-1.12.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14ae27491347a0ac4bbe8347b09d752cfe7fea1121c14525415e0cca6db4a836"}, - {file = "pygit2-1.12.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f65483ab5e3563c58f60debe2acc0979fdf6fd633432fcfbddf727a9a265ba4"}, - {file = "pygit2-1.12.2-cp39-cp39-win32.whl", hash = "sha256:8da8517809635ea3da950d9cf99c6d1851352d92b6db309382db88a01c3b0bfd"}, - {file = "pygit2-1.12.2-cp39-cp39-win_amd64.whl", hash = "sha256:b9c2359b99eed8e7fac30c06e6b4ae277a6a0537d6b4b88a190828c3d7eb9ef2"}, - {file = "pygit2-1.12.2-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:685378852ef8eb081333bc80dbdfc4f1333cf4a8f3baf614c4135e02ad1ee38a"}, - {file = "pygit2-1.12.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:cdf655e5f801990f5cad721b6ccbe7610962f0a4f1c20373dbf9c0be39374a81"}, - {file = "pygit2-1.12.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:857c5cde635d470f58803d67bfb281dc4f6336065a0253bfbed001f18e2d0767"}, - {file = "pygit2-1.12.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:fe35a72af61961dbb7fb4abcdaa36d5f1c85b2cd3daae94137eeb9c07215cdd3"}, - {file = "pygit2-1.12.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f443d3641762b2bb9c76400bb18beb4ba27dd35bc098a8bfae82e6a190c52ab"}, - {file = "pygit2-1.12.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c1e26649e1540b6a774f812e2fc9890320ff4d33f16db1bb02626318b5ceae2"}, - {file = "pygit2-1.12.2.tar.gz", hash = "sha256:56e85d0e66de957d599d1efb2409d39afeefd8f01009bfda0796b42a4b678358"}, + {file = "pygit2-1.13.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1c9027b71a5d54b1de122c54f153b0b227ca270507d6308ed42b5a69fc740a2e"}, + {file = "pygit2-1.13.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0d7e31d9bfacc7e6670da65e77f441e9e21a7223f73c739dfd92d8c0c057009"}, + {file = "pygit2-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:713b38e69527ddc14b52408ba5e6e862903f838cfe426645839242801dcdc2b3"}, + {file = "pygit2-1.13.0-cp310-cp310-win32.whl", hash = "sha256:97d9a25a33354017f5bfdbdac1dd0133afee8b061a327b8e7c12d50d58c5b65d"}, + {file = "pygit2-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:8e8df423209869c148b23269d1b01f61c134a825e5e3e636736695e72cd7698b"}, + {file = "pygit2-1.13.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c03df33e4f09ffeb7fa5570edfbe96285276aeb6fec18c06f64ea2f8fc7de842"}, + {file = "pygit2-1.13.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb1a601b279d1e1a1e1ba8d9b6bf798103f4befc66bdc4d6c329a332eae6a7d7"}, + {file = 
"pygit2-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d5b650f3f1c304d07c884a0ac45b04169fc5e3ba27030015d96644b411df44c"}, + {file = "pygit2-1.13.0-cp311-cp311-win32.whl", hash = "sha256:d765610b087b81cb90d1a5eb266e243eda2a7c3f1b5458500c76bfd437fbe02c"}, + {file = "pygit2-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:b7686b82cd972b3be88e91e963dbcb35474814bb7127297cbc4b9ab874b7037b"}, + {file = "pygit2-1.13.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:947a774eb089f11fc79022c9e1c2ec20a9a0152e0b2390e6d733dfd25234530d"}, + {file = "pygit2-1.13.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e79e8eda586f0b713911076db8e46a1f05882fca6911e1f63a028337fd442192"}, + {file = "pygit2-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e235506031a616339eb2d5be2d24a70295937b9c4cd58265da9ee3ac206aff8"}, + {file = "pygit2-1.13.0-cp38-cp38-win32.whl", hash = "sha256:431de0d476b77a855614afc1cf634ded78df44e20c77a2c35147ea980b69eb65"}, + {file = "pygit2-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:696a0c334b6e8e710f9b5666af6dffeab9305c7154a3779f0da36189a1e9ee82"}, + {file = "pygit2-1.13.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6f5daab6670d3b353e4694a0fbb652674dcc2a4bc77e20342575d24dc974a4a5"}, + {file = "pygit2-1.13.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d646d975f3d33cedc947bfbf490bb26da2267ad57e3d25d88b8b0b675496d4a"}, + {file = "pygit2-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a21f09caf2ffdd5f94d7963622f1e0fad4a1160a68872d79b12a4326c32fa99"}, + {file = "pygit2-1.13.0-cp39-cp39-win32.whl", hash = "sha256:b48b113b2ff77bd5e2cd01c6cd29be2d06b057eb71be823e8083b435a41eda2a"}, + {file = "pygit2-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:faea44ade5254a836ed5fa525442d7174e4197ca20cdbf2704a3b91f77bf952f"}, + {file = "pygit2-1.13.0.tar.gz", hash = 
"sha256:6dde37436fab14264ad3d6cbc5aae3fd555eb9a9680a7bfdd6e564cd77b5e2b8"}, ] [package.dependencies] cffi = ">=1.9.1" +setuptools = {version = "*", markers = "python_version >= \"3.12\""} [[package]] name = "pygments" -version = "2.15.1" +version = "2.16.1" description = "Pygments is a syntax highlighting package written in Python." optional = false python-versions = ">=3.7" files = [ - {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, - {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, + {file = "Pygments-2.16.1-py3-none-any.whl", hash = "sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692"}, + {file = "Pygments-2.16.1.tar.gz", hash = "sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29"}, ] [package.extras] @@ -3195,13 +3209,13 @@ tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] [[package]] name = "pyparsing" -version = "3.1.0" +version = "3.1.1" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" files = [ - {file = "pyparsing-3.1.0-py3-none-any.whl", hash = "sha256:d554a96d1a7d3ddaf7183104485bc19fd80543ad6ac5bdb6426719d766fb06c1"}, - {file = "pyparsing-3.1.0.tar.gz", hash = "sha256:edb662d6fe322d6e990b1594b5feaeadf806803359e3d4d42f11e295e588f0ea"}, + {file = "pyparsing-3.1.1-py3-none-any.whl", hash = "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"}, + {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"}, ] [package.extras] @@ -3227,13 +3241,13 @@ sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] [[package]] name = "pytest" -version = "7.4.0" +version = "7.4.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = 
"pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, - {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, + {file = "pytest-7.4.2-py3-none-any.whl", hash = "sha256:1d881c6124e08ff0a1bb75ba3ec0bfd8b5354a01c194ddd5a0a870a48d99b002"}, + {file = "pytest-7.4.2.tar.gz", hash = "sha256:a766259cfab564a2ad52cb1aae1b881a75c3eb7e34ca3779697c23ed47c47069"}, ] [package.dependencies] @@ -3263,13 +3277,13 @@ six = ">=1.5" [[package]] name = "pytz" -version = "2023.3" +version = "2023.3.post1" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" files = [ - {file = "pytz-2023.3-py2.py3-none-any.whl", hash = "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"}, - {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"}, + {file = "pytz-2023.3.post1-py2.py3-none-any.whl", hash = "sha256:ce42d816b81b68506614c11e8937d3aa9e41007ceb50bfdcb0749b921bf646c7"}, + {file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"}, ] [[package]] @@ -3346,99 +3360,99 @@ files = [ [[package]] name = "regex" -version = "2023.6.3" +version = "2023.8.8" description = "Alternative regular expression module, to replace re." 
optional = false python-versions = ">=3.6" files = [ - {file = "regex-2023.6.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:824bf3ac11001849aec3fa1d69abcb67aac3e150a933963fb12bda5151fe1bfd"}, - {file = "regex-2023.6.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:05ed27acdf4465c95826962528f9e8d41dbf9b1aa8531a387dee6ed215a3e9ef"}, - {file = "regex-2023.6.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b49c764f88a79160fa64f9a7b425620e87c9f46095ef9c9920542ab2495c8bc"}, - {file = "regex-2023.6.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8e3f1316c2293e5469f8f09dc2d76efb6c3982d3da91ba95061a7e69489a14ef"}, - {file = "regex-2023.6.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:43e1dd9d12df9004246bacb79a0e5886b3b6071b32e41f83b0acbf293f820ee8"}, - {file = "regex-2023.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4959e8bcbfda5146477d21c3a8ad81b185cd252f3d0d6e4724a5ef11c012fb06"}, - {file = "regex-2023.6.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:af4dd387354dc83a3bff67127a124c21116feb0d2ef536805c454721c5d7993d"}, - {file = "regex-2023.6.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2239d95d8e243658b8dbb36b12bd10c33ad6e6933a54d36ff053713f129aa536"}, - {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:890e5a11c97cf0d0c550eb661b937a1e45431ffa79803b942a057c4fb12a2da2"}, - {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a8105e9af3b029f243ab11ad47c19b566482c150c754e4c717900a798806b222"}, - {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:25be746a8ec7bc7b082783216de8e9473803706723b3f6bef34b3d0ed03d57e2"}, - {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = 
"sha256:3676f1dd082be28b1266c93f618ee07741b704ab7b68501a173ce7d8d0d0ca18"}, - {file = "regex-2023.6.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:10cb847aeb1728412c666ab2e2000ba6f174f25b2bdc7292e7dd71b16db07568"}, - {file = "regex-2023.6.3-cp310-cp310-win32.whl", hash = "sha256:dbbbfce33cd98f97f6bffb17801b0576e653f4fdb1d399b2ea89638bc8d08ae1"}, - {file = "regex-2023.6.3-cp310-cp310-win_amd64.whl", hash = "sha256:c5f8037000eb21e4823aa485149f2299eb589f8d1fe4b448036d230c3f4e68e0"}, - {file = "regex-2023.6.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c123f662be8ec5ab4ea72ea300359023a5d1df095b7ead76fedcd8babbedf969"}, - {file = "regex-2023.6.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9edcbad1f8a407e450fbac88d89e04e0b99a08473f666a3f3de0fd292badb6aa"}, - {file = "regex-2023.6.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dcba6dae7de533c876255317c11f3abe4907ba7d9aa15d13e3d9710d4315ec0e"}, - {file = "regex-2023.6.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29cdd471ebf9e0f2fb3cac165efedc3c58db841d83a518b082077e612d3ee5df"}, - {file = "regex-2023.6.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:12b74fbbf6cbbf9dbce20eb9b5879469e97aeeaa874145517563cca4029db65c"}, - {file = "regex-2023.6.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c29ca1bd61b16b67be247be87390ef1d1ef702800f91fbd1991f5c4421ebae8"}, - {file = "regex-2023.6.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d77f09bc4b55d4bf7cc5eba785d87001d6757b7c9eec237fe2af57aba1a071d9"}, - {file = "regex-2023.6.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ea353ecb6ab5f7e7d2f4372b1e779796ebd7b37352d290096978fea83c4dba0c"}, - {file = "regex-2023.6.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:10590510780b7541969287512d1b43f19f965c2ece6c9b1c00fc367b29d8dce7"}, - {file = 
"regex-2023.6.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e2fbd6236aae3b7f9d514312cdb58e6494ee1c76a9948adde6eba33eb1c4264f"}, - {file = "regex-2023.6.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:6b2675068c8b56f6bfd5a2bda55b8accbb96c02fd563704732fd1c95e2083461"}, - {file = "regex-2023.6.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:74419d2b50ecb98360cfaa2974da8689cb3b45b9deff0dcf489c0d333bcc1477"}, - {file = "regex-2023.6.3-cp311-cp311-win32.whl", hash = "sha256:fb5ec16523dc573a4b277663a2b5a364e2099902d3944c9419a40ebd56a118f9"}, - {file = "regex-2023.6.3-cp311-cp311-win_amd64.whl", hash = "sha256:09e4a1a6acc39294a36b7338819b10baceb227f7f7dbbea0506d419b5a1dd8af"}, - {file = "regex-2023.6.3-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0654bca0cdf28a5956c83839162692725159f4cda8d63e0911a2c0dc76166525"}, - {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:463b6a3ceb5ca952e66550a4532cef94c9a0c80dc156c4cc343041951aec1697"}, - {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87b2a5bb5e78ee0ad1de71c664d6eb536dc3947a46a69182a90f4410f5e3f7dd"}, - {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6343c6928282c1f6a9db41f5fd551662310e8774c0e5ebccb767002fcf663ca9"}, - {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6192d5af2ccd2a38877bfef086d35e6659566a335b1492786ff254c168b1693"}, - {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74390d18c75054947e4194019077e243c06fbb62e541d8817a0fa822ea310c14"}, - {file = "regex-2023.6.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:742e19a90d9bb2f4a6cf2862b8b06dea5e09b96c9f2df1779e53432d7275331f"}, - {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_aarch64.whl", 
hash = "sha256:8abbc5d54ea0ee80e37fef009e3cec5dafd722ed3c829126253d3e22f3846f1e"}, - {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:c2b867c17a7a7ae44c43ebbeb1b5ff406b3e8d5b3e14662683e5e66e6cc868d3"}, - {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:d831c2f8ff278179705ca59f7e8524069c1a989e716a1874d6d1aab6119d91d1"}, - {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:ee2d1a9a253b1729bb2de27d41f696ae893507c7db224436abe83ee25356f5c1"}, - {file = "regex-2023.6.3-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:61474f0b41fe1a80e8dfa70f70ea1e047387b7cd01c85ec88fa44f5d7561d787"}, - {file = "regex-2023.6.3-cp36-cp36m-win32.whl", hash = "sha256:0b71e63226e393b534105fcbdd8740410dc6b0854c2bfa39bbda6b0d40e59a54"}, - {file = "regex-2023.6.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bbb02fd4462f37060122e5acacec78e49c0fbb303c30dd49c7f493cf21fc5b27"}, - {file = "regex-2023.6.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b862c2b9d5ae38a68b92e215b93f98d4c5e9454fa36aae4450f61dd33ff48487"}, - {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:976d7a304b59ede34ca2921305b57356694f9e6879db323fd90a80f865d355a3"}, - {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:83320a09188e0e6c39088355d423aa9d056ad57a0b6c6381b300ec1a04ec3d16"}, - {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9427a399501818a7564f8c90eced1e9e20709ece36be701f394ada99890ea4b3"}, - {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7178bbc1b2ec40eaca599d13c092079bf529679bf0371c602edaa555e10b41c3"}, - {file = "regex-2023.6.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:837328d14cde912af625d5f303ec29f7e28cdab588674897baafaf505341f2fc"}, - {file = 
"regex-2023.6.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d44dc13229905ae96dd2ae2dd7cebf824ee92bc52e8cf03dcead37d926da019"}, - {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d54af539295392611e7efbe94e827311eb8b29668e2b3f4cadcfe6f46df9c777"}, - {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:7117d10690c38a622e54c432dfbbd3cbd92f09401d622902c32f6d377e2300ee"}, - {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bb60b503ec8a6e4e3e03a681072fa3a5adcbfa5479fa2d898ae2b4a8e24c4591"}, - {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:65ba8603753cec91c71de423a943ba506363b0e5c3fdb913ef8f9caa14b2c7e0"}, - {file = "regex-2023.6.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:271f0bdba3c70b58e6f500b205d10a36fb4b58bd06ac61381b68de66442efddb"}, - {file = "regex-2023.6.3-cp37-cp37m-win32.whl", hash = "sha256:9beb322958aaca059f34975b0df135181f2e5d7a13b84d3e0e45434749cb20f7"}, - {file = "regex-2023.6.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fea75c3710d4f31389eed3c02f62d0b66a9da282521075061ce875eb5300cf23"}, - {file = "regex-2023.6.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8f56fcb7ff7bf7404becdfc60b1e81a6d0561807051fd2f1860b0d0348156a07"}, - {file = "regex-2023.6.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d2da3abc88711bce7557412310dfa50327d5769a31d1c894b58eb256459dc289"}, - {file = "regex-2023.6.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a99b50300df5add73d307cf66abea093304a07eb017bce94f01e795090dea87c"}, - {file = "regex-2023.6.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5708089ed5b40a7b2dc561e0c8baa9535b77771b64a8330b684823cfd5116036"}, - {file = "regex-2023.6.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:687ea9d78a4b1cf82f8479cab23678aff723108df3edeac098e5b2498879f4a7"}, - 
{file = "regex-2023.6.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4d3850beab9f527f06ccc94b446c864059c57651b3f911fddb8d9d3ec1d1b25d"}, - {file = "regex-2023.6.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8915cc96abeb8983cea1df3c939e3c6e1ac778340c17732eb63bb96247b91d2"}, - {file = "regex-2023.6.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:841d6e0e5663d4c7b4c8099c9997be748677d46cbf43f9f471150e560791f7ff"}, - {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9edce5281f965cf135e19840f4d93d55b3835122aa76ccacfd389e880ba4cf82"}, - {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:b956231ebdc45f5b7a2e1f90f66a12be9610ce775fe1b1d50414aac1e9206c06"}, - {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:36efeba71c6539d23c4643be88295ce8c82c88bbd7c65e8a24081d2ca123da3f"}, - {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:cf67ca618b4fd34aee78740bea954d7c69fdda419eb208c2c0c7060bb822d747"}, - {file = "regex-2023.6.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b4598b1897837067a57b08147a68ac026c1e73b31ef6e36deeeb1fa60b2933c9"}, - {file = "regex-2023.6.3-cp38-cp38-win32.whl", hash = "sha256:f415f802fbcafed5dcc694c13b1292f07fe0befdb94aa8a52905bd115ff41e88"}, - {file = "regex-2023.6.3-cp38-cp38-win_amd64.whl", hash = "sha256:d4f03bb71d482f979bda92e1427f3ec9b220e62a7dd337af0aa6b47bf4498f72"}, - {file = "regex-2023.6.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ccf91346b7bd20c790310c4147eee6ed495a54ddb6737162a36ce9dbef3e4751"}, - {file = "regex-2023.6.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b28f5024a3a041009eb4c333863d7894d191215b39576535c6734cd88b0fcb68"}, - {file = "regex-2023.6.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e0bb18053dfcfed432cc3ac632b5e5e5c5b7e55fb3f8090e867bfd9b054dbcbf"}, - {file = "regex-2023.6.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a5bfb3004f2144a084a16ce19ca56b8ac46e6fd0651f54269fc9e230edb5e4a"}, - {file = "regex-2023.6.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5c6b48d0fa50d8f4df3daf451be7f9689c2bde1a52b1225c5926e3f54b6a9ed1"}, - {file = "regex-2023.6.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:051da80e6eeb6e239e394ae60704d2b566aa6a7aed6f2890a7967307267a5dc6"}, - {file = "regex-2023.6.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a4c3b7fa4cdaa69268748665a1a6ff70c014d39bb69c50fda64b396c9116cf77"}, - {file = "regex-2023.6.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:457b6cce21bee41ac292d6753d5e94dcbc5c9e3e3a834da285b0bde7aa4a11e9"}, - {file = "regex-2023.6.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:aad51907d74fc183033ad796dd4c2e080d1adcc4fd3c0fd4fd499f30c03011cd"}, - {file = "regex-2023.6.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:0385e73da22363778ef2324950e08b689abdf0b108a7d8decb403ad7f5191938"}, - {file = "regex-2023.6.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c6a57b742133830eec44d9b2290daf5cbe0a2f1d6acee1b3c7b1c7b2f3606df7"}, - {file = "regex-2023.6.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:3e5219bf9e75993d73ab3d25985c857c77e614525fac9ae02b1bebd92f7cecac"}, - {file = "regex-2023.6.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e5087a3c59eef624a4591ef9eaa6e9a8d8a94c779dade95d27c0bc24650261cd"}, - {file = "regex-2023.6.3-cp39-cp39-win32.whl", hash = "sha256:20326216cc2afe69b6e98528160b225d72f85ab080cbdf0b11528cbbaba2248f"}, - {file = "regex-2023.6.3-cp39-cp39-win_amd64.whl", hash = "sha256:bdff5eab10e59cf26bc479f565e25ed71a7d041d1ded04ccf9aee1d9f208487a"}, - {file = 
"regex-2023.6.3.tar.gz", hash = "sha256:72d1a25bf36d2050ceb35b517afe13864865268dfb45910e2e17a84be6cbfeb0"}, + {file = "regex-2023.8.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:88900f521c645f784260a8d346e12a1590f79e96403971241e64c3a265c8ecdb"}, + {file = "regex-2023.8.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3611576aff55918af2697410ff0293d6071b7e00f4b09e005d614686ac4cd57c"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8a0ccc8f2698f120e9e5742f4b38dc944c38744d4bdfc427616f3a163dd9de5"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c662a4cbdd6280ee56f841f14620787215a171c4e2d1744c9528bed8f5816c96"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cf0633e4a1b667bfe0bb10b5e53fe0d5f34a6243ea2530eb342491f1adf4f739"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:551ad543fa19e94943c5b2cebc54c73353ffff08228ee5f3376bd27b3d5b9800"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54de2619f5ea58474f2ac211ceea6b615af2d7e4306220d4f3fe690c91988a61"}, + {file = "regex-2023.8.8-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5ec4b3f0aebbbe2fc0134ee30a791af522a92ad9f164858805a77442d7d18570"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3ae646c35cb9f820491760ac62c25b6d6b496757fda2d51be429e0e7b67ae0ab"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca339088839582d01654e6f83a637a4b8194d0960477b9769d2ff2cfa0fa36d2"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:d9b6627408021452dcd0d2cdf8da0534e19d93d070bfa8b6b4176f99711e7f90"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_s390x.whl", 
hash = "sha256:bd3366aceedf274f765a3a4bc95d6cd97b130d1dda524d8f25225d14123c01db"}, + {file = "regex-2023.8.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7aed90a72fc3654fba9bc4b7f851571dcc368120432ad68b226bd593f3f6c0b7"}, + {file = "regex-2023.8.8-cp310-cp310-win32.whl", hash = "sha256:80b80b889cb767cc47f31d2b2f3dec2db8126fbcd0cff31b3925b4dc6609dcdb"}, + {file = "regex-2023.8.8-cp310-cp310-win_amd64.whl", hash = "sha256:b82edc98d107cbc7357da7a5a695901b47d6eb0420e587256ba3ad24b80b7d0b"}, + {file = "regex-2023.8.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1e7d84d64c84ad97bf06f3c8cb5e48941f135ace28f450d86af6b6512f1c9a71"}, + {file = "regex-2023.8.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ce0f9fbe7d295f9922c0424a3637b88c6c472b75eafeaff6f910494a1fa719ef"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06c57e14ac723b04458df5956cfb7e2d9caa6e9d353c0b4c7d5d54fcb1325c46"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7a9aaa5a1267125eef22cef3b63484c3241aaec6f48949b366d26c7250e0357"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b7408511fca48a82a119d78a77c2f5eb1b22fe88b0d2450ed0756d194fe7a9a"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14dc6f2d88192a67d708341f3085df6a4f5a0c7b03dec08d763ca2cd86e9f559"}, + {file = "regex-2023.8.8-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:48c640b99213643d141550326f34f0502fedb1798adb3c9eb79650b1ecb2f177"}, + {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0085da0f6c6393428bf0d9c08d8b1874d805bb55e17cb1dfa5ddb7cfb11140bf"}, + {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:964b16dcc10c79a4a2be9f1273fcc2684a9eedb3906439720598029a797b46e6"}, + {file = 
"regex-2023.8.8-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7ce606c14bb195b0e5108544b540e2c5faed6843367e4ab3deb5c6aa5e681208"}, + {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:40f029d73b10fac448c73d6eb33d57b34607f40116e9f6e9f0d32e9229b147d7"}, + {file = "regex-2023.8.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3b8e6ea6be6d64104d8e9afc34c151926f8182f84e7ac290a93925c0db004bfd"}, + {file = "regex-2023.8.8-cp311-cp311-win32.whl", hash = "sha256:942f8b1f3b223638b02df7df79140646c03938d488fbfb771824f3d05fc083a8"}, + {file = "regex-2023.8.8-cp311-cp311-win_amd64.whl", hash = "sha256:51d8ea2a3a1a8fe4f67de21b8b93757005213e8ac3917567872f2865185fa7fb"}, + {file = "regex-2023.8.8-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e951d1a8e9963ea51efd7f150450803e3b95db5939f994ad3d5edac2b6f6e2b4"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:704f63b774218207b8ccc6c47fcef5340741e5d839d11d606f70af93ee78e4d4"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:22283c769a7b01c8ac355d5be0715bf6929b6267619505e289f792b01304d898"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91129ff1bb0619bc1f4ad19485718cc623a2dc433dff95baadbf89405c7f6b57"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de35342190deb7b866ad6ba5cbcccb2d22c0487ee0cbb251efef0843d705f0d4"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b993b6f524d1e274a5062488a43e3f9f8764ee9745ccd8e8193df743dbe5ee61"}, + {file = "regex-2023.8.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3026cbcf11d79095a32d9a13bbc572a458727bd5b1ca332df4a79faecd45281c"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_aarch64.whl", 
hash = "sha256:293352710172239bf579c90a9864d0df57340b6fd21272345222fb6371bf82b3"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:d909b5a3fff619dc7e48b6b1bedc2f30ec43033ba7af32f936c10839e81b9217"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:3d370ff652323c5307d9c8e4c62efd1956fb08051b0e9210212bc51168b4ff56"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:b076da1ed19dc37788f6a934c60adf97bd02c7eea461b73730513921a85d4235"}, + {file = "regex-2023.8.8-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:e9941a4ada58f6218694f382e43fdd256e97615db9da135e77359da257a7168b"}, + {file = "regex-2023.8.8-cp36-cp36m-win32.whl", hash = "sha256:a8c65c17aed7e15a0c824cdc63a6b104dfc530f6fa8cb6ac51c437af52b481c7"}, + {file = "regex-2023.8.8-cp36-cp36m-win_amd64.whl", hash = "sha256:aadf28046e77a72f30dcc1ab185639e8de7f4104b8cb5c6dfa5d8ed860e57236"}, + {file = "regex-2023.8.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:423adfa872b4908843ac3e7a30f957f5d5282944b81ca0a3b8a7ccbbfaa06103"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ae594c66f4a7e1ea67232a0846649a7c94c188d6c071ac0210c3e86a5f92109"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e51c80c168074faa793685656c38eb7a06cbad7774c8cbc3ea05552d615393d8"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:09b7f4c66aa9d1522b06e31a54f15581c37286237208df1345108fcf4e050c18"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e73e5243af12d9cd6a9d6a45a43570dbe2e5b1cdfc862f5ae2b031e44dd95a8"}, + {file = "regex-2023.8.8-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:941460db8fe3bd613db52f05259c9336f5a47ccae7d7def44cc277184030a116"}, + {file = 
"regex-2023.8.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f0ccf3e01afeb412a1a9993049cb160d0352dba635bbca7762b2dc722aa5742a"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:2e9216e0d2cdce7dbc9be48cb3eacb962740a09b011a116fd7af8c832ab116ca"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:5cd9cd7170459b9223c5e592ac036e0704bee765706445c353d96f2890e816c8"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4873ef92e03a4309b3ccd8281454801b291b689f6ad45ef8c3658b6fa761d7ac"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:239c3c2a339d3b3ddd51c2daef10874410917cd2b998f043c13e2084cb191684"}, + {file = "regex-2023.8.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1005c60ed7037be0d9dea1f9c53cc42f836188227366370867222bda4c3c6bd7"}, + {file = "regex-2023.8.8-cp37-cp37m-win32.whl", hash = "sha256:e6bd1e9b95bc5614a7a9c9c44fde9539cba1c823b43a9f7bc11266446dd568e3"}, + {file = "regex-2023.8.8-cp37-cp37m-win_amd64.whl", hash = "sha256:9a96edd79661e93327cfeac4edec72a4046e14550a1d22aa0dd2e3ca52aec921"}, + {file = "regex-2023.8.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f2181c20ef18747d5f4a7ea513e09ea03bdd50884a11ce46066bb90fe4213675"}, + {file = "regex-2023.8.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a2ad5add903eb7cdde2b7c64aaca405f3957ab34f16594d2b78d53b8b1a6a7d6"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9233ac249b354c54146e392e8a451e465dd2d967fc773690811d3a8c240ac601"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:920974009fb37b20d32afcdf0227a2e707eb83fe418713f7a8b7de038b870d0b"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd2b6c5dfe0929b6c23dde9624483380b170b6e34ed79054ad131b20203a1a63"}, + 
{file = "regex-2023.8.8-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96979d753b1dc3b2169003e1854dc67bfc86edf93c01e84757927f810b8c3c93"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ae54a338191e1356253e7883d9d19f8679b6143703086245fb14d1f20196be9"}, + {file = "regex-2023.8.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2162ae2eb8b079622176a81b65d486ba50b888271302190870b8cc488587d280"}, + {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c884d1a59e69e03b93cf0dfee8794c63d7de0ee8f7ffb76e5f75be8131b6400a"}, + {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf9273e96f3ee2ac89ffcb17627a78f78e7516b08f94dc435844ae72576a276e"}, + {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:83215147121e15d5f3a45d99abeed9cf1fe16869d5c233b08c56cdf75f43a504"}, + {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:3f7454aa427b8ab9101f3787eb178057c5250478e39b99540cfc2b889c7d0586"}, + {file = "regex-2023.8.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0640913d2c1044d97e30d7c41728195fc37e54d190c5385eacb52115127b882"}, + {file = "regex-2023.8.8-cp38-cp38-win32.whl", hash = "sha256:0c59122ceccb905a941fb23b087b8eafc5290bf983ebcb14d2301febcbe199c7"}, + {file = "regex-2023.8.8-cp38-cp38-win_amd64.whl", hash = "sha256:c12f6f67495ea05c3d542d119d270007090bad5b843f642d418eb601ec0fa7be"}, + {file = "regex-2023.8.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:82cd0a69cd28f6cc3789cc6adeb1027f79526b1ab50b1f6062bbc3a0ccb2dbc3"}, + {file = "regex-2023.8.8-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:bb34d1605f96a245fc39790a117ac1bac8de84ab7691637b26ab2c5efb8f228c"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:987b9ac04d0b38ef4f89fbc035e84a7efad9cdd5f1e29024f9289182c8d99e09"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dd6082f4e2aec9b6a0927202c85bc1b09dcab113f97265127c1dc20e2e32495"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7eb95fe8222932c10d4436e7a6f7c99991e3fdd9f36c949eff16a69246dee2dc"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7098c524ba9f20717a56a8d551d2ed491ea89cbf37e540759ed3b776a4f8d6eb"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b694430b3f00eb02c594ff5a16db30e054c1b9589a043fe9174584c6efa8033"}, + {file = "regex-2023.8.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b2aeab3895d778155054abea5238d0eb9a72e9242bd4b43f42fd911ef9a13470"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:988631b9d78b546e284478c2ec15c8a85960e262e247b35ca5eaf7ee22f6050a"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:67ecd894e56a0c6108ec5ab1d8fa8418ec0cff45844a855966b875d1039a2e34"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:14898830f0a0eb67cae2bbbc787c1a7d6e34ecc06fbd39d3af5fe29a4468e2c9"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:f2200e00b62568cfd920127782c61bc1c546062a879cdc741cfcc6976668dfcf"}, + {file = "regex-2023.8.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9691a549c19c22d26a4f3b948071e93517bdf86e41b81d8c6ac8a964bb71e5a6"}, + {file = "regex-2023.8.8-cp39-cp39-win32.whl", hash = "sha256:6ab2ed84bf0137927846b37e882745a827458689eb969028af8032b1b3dac78e"}, + {file = "regex-2023.8.8-cp39-cp39-win_amd64.whl", hash = "sha256:5543c055d8ec7801901e1193a51570643d6a6ab8751b1f7dd9af71af467538bb"}, + {file = 
"regex-2023.8.8.tar.gz", hash = "sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e"}, ] [[package]] @@ -3589,33 +3603,33 @@ files = [ [[package]] name = "s3fs" -version = "2023.6.0" +version = "2023.9.0" description = "Convenient Filesystem interface over S3" optional = false python-versions = ">= 3.8" files = [ - {file = "s3fs-2023.6.0-py3-none-any.whl", hash = "sha256:d1a0a423d0d2e17fb2a193d9531935dc3f45ba742693448a461b6b34f6a92a24"}, - {file = "s3fs-2023.6.0.tar.gz", hash = "sha256:63fd8ddf05eb722de784b7b503196107f2a518061298cf005a8a4715b4d49117"}, + {file = "s3fs-2023.9.0-py3-none-any.whl", hash = "sha256:98ad2b221514490f0fe49b730ccf4f0362031ee8ede6d5392cdd3977ca313b1a"}, + {file = "s3fs-2023.9.0.tar.gz", hash = "sha256:35057d4d59722cab9fe91c9a30147e3e5bddfc55ec14fde8776c512179c823dd"}, ] [package.dependencies] -aiobotocore = ">=2.5.0,<2.6.0" +aiobotocore = ">=2.5.4,<2.6.0" aiohttp = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1" -fsspec = "2023.6.0" +fsspec = "2023.9.0" [package.extras] -awscli = ["aiobotocore[awscli] (>=2.5.0,<2.6.0)"] -boto3 = ["aiobotocore[boto3] (>=2.5.0,<2.6.0)"] +awscli = ["aiobotocore[awscli] (>=2.5.4,<2.6.0)"] +boto3 = ["aiobotocore[boto3] (>=2.5.4,<2.6.0)"] [[package]] name = "s3transfer" -version = "0.6.1" +version = "0.6.2" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.7" files = [ - {file = "s3transfer-0.6.1-py3-none-any.whl", hash = "sha256:3c0da2d074bf35d6870ef157158641178a4204a6e689e82546083e31e0311346"}, - {file = "s3transfer-0.6.1.tar.gz", hash = "sha256:640bb492711f4c0c0905e1f62b6aaeb771881935ad27884852411f8e9cacbca9"}, + {file = "s3transfer-0.6.2-py3-none-any.whl", hash = "sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084"}, + {file = "s3transfer-0.6.2.tar.gz", hash = "sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861"}, ] [package.dependencies] @@ -3706,13 +3720,13 @@ test = ["asv", "gmpy2", "mpmath", "pytest", 
"pytest-cov", "pytest-xdist", "sciki [[package]] name = "scmrepo" -version = "1.0.4" +version = "1.3.1" description = "SCM wrapper and fsspec filesystem for Git for use in DVC" optional = false python-versions = ">=3.8" files = [ - {file = "scmrepo-1.0.4-py3-none-any.whl", hash = "sha256:048ffd98ab72afb6d3dfb177e5cefe14652ea28377b4258d9dac098cd4461036"}, - {file = "scmrepo-1.0.4.tar.gz", hash = "sha256:d03278d6a86caa5c7f1e85918bc28ef69040a5e5fd04c97cc0eea611ea8be13c"}, + {file = "scmrepo-1.3.1-py3-none-any.whl", hash = "sha256:143af909ac42eb05dc7f214bb4347b32dd7d8a37e2224f6b452606b7d781ecd1"}, + {file = "scmrepo-1.3.1.tar.gz", hash = "sha256:11fe5fb4815d2aa1c2d1bcc63898228f0eb7106a18529243dd9fad34b57ef0dc"}, ] [package.dependencies] @@ -3727,8 +3741,8 @@ pygtrie = ">=2.3.2" shortuuid = ">=0.5.0" [package.extras] -dev = ["mock (==5.0.1)", "mypy (==0.971)", "paramiko (==3.1.0)", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-asyncio (==0.18.3)", "pytest-cov (==3.0.0)", "pytest-docker (==0.12.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)", "pytest-test-utils (==0.0.8)", "types-certifi (==2021.10.8.3)", "types-mock (==5.0.0.6)", "types-paramiko (==3.0.0.10)"] -tests = ["mock (==5.0.1)", "mypy (==0.971)", "paramiko (==3.1.0)", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-asyncio (==0.18.3)", "pytest-cov (==3.0.0)", "pytest-docker (==0.12.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)", "pytest-test-utils (==0.0.8)", "types-certifi (==2021.10.8.3)", "types-mock (==5.0.0.6)", "types-paramiko (==3.0.0.10)"] +dev = ["mock (==5.0.1)", "mypy (==0.971)", "paramiko (==3.2.0)", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-asyncio (==0.18.3)", "pytest-cov (==3.0.0)", "pytest-docker (==0.12.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)", "pytest-test-utils (==0.0.8)", "types-certifi (==2021.10.8.3)", "types-mock (==5.0.0.6)", "types-paramiko (==3.2.0.1)"] +tests = ["mock (==5.0.1)", "mypy (==0.971)", "paramiko (==3.2.0)", "pylint (==2.15.0)", 
"pytest (==7.2.0)", "pytest-asyncio (==0.18.3)", "pytest-cov (==3.0.0)", "pytest-docker (==0.12.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)", "pytest-test-utils (==0.0.8)", "types-certifi (==2021.10.8.3)", "types-mock (==5.0.0.6)", "types-paramiko (==3.2.0.1)"] [[package]] name = "sentencepiece" @@ -3786,13 +3800,13 @@ files = [ [[package]] name = "sentry-sdk" -version = "1.28.1" +version = "1.30.0" description = "Python client for Sentry (https://sentry.io)" optional = false python-versions = "*" files = [ - {file = "sentry-sdk-1.28.1.tar.gz", hash = "sha256:dcd88c68aa64dae715311b5ede6502fd684f70d00a7cd4858118f0ba3153a3ae"}, - {file = "sentry_sdk-1.28.1-py2.py3-none-any.whl", hash = "sha256:6bdb25bd9092478d3a817cb0d01fa99e296aea34d404eac3ca0037faa5c2aa0a"}, + {file = "sentry-sdk-1.30.0.tar.gz", hash = "sha256:7dc873b87e1faf4d00614afd1058bfa1522942f33daef8a59f90de8ed75cd10c"}, + {file = "sentry_sdk-1.30.0-py2.py3-none-any.whl", hash = "sha256:2e53ad63f96bb9da6570ba2e755c267e529edcf58580a2c0d2a11ef26e1e678b"}, ] [package.dependencies] @@ -3815,6 +3829,7 @@ httpx = ["httpx (>=0.16.0)"] huey = ["huey (>=2)"] loguru = ["loguru (>=0.5)"] opentelemetry = ["opentelemetry-distro (>=0.35b0)"] +opentelemetry-experimental = ["opentelemetry-distro (>=0.40b0,<1.0)", "opentelemetry-instrumentation-aiohttp-client (>=0.40b0,<1.0)", "opentelemetry-instrumentation-django (>=0.40b0,<1.0)", "opentelemetry-instrumentation-fastapi (>=0.40b0,<1.0)", "opentelemetry-instrumentation-flask (>=0.40b0,<1.0)", "opentelemetry-instrumentation-requests (>=0.40b0,<1.0)", "opentelemetry-instrumentation-sqlite3 (>=0.40b0,<1.0)", "opentelemetry-instrumentation-urllib (>=0.40b0,<1.0)"] pure-eval = ["asttokens", "executing", "pure-eval"] pymongo = ["pymongo (>=3.1)"] pyspark = ["pyspark (>=2.4.4)"] @@ -3912,19 +3927,19 @@ test = ["pytest"] [[package]] name = "setuptools" -version = "68.0.0" +version = "68.2.0" description = "Easily download, build, install, upgrade, and uninstall Python 
packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "setuptools-68.0.0-py3-none-any.whl", hash = "sha256:11e52c67415a381d10d6b462ced9cfb97066179f0e871399e006c4ab101fc85f"}, - {file = "setuptools-68.0.0.tar.gz", hash = "sha256:baf1fdb41c6da4cd2eae722e135500da913332ab3f2f5c7d33af9b492acb5235"}, + {file = "setuptools-68.2.0-py3-none-any.whl", hash = "sha256:af3d5949030c3f493f550876b2fd1dd5ec66689c4ee5d5344f009746f71fd5a8"}, + {file = "setuptools-68.2.0.tar.gz", hash = "sha256:00478ca80aeebeecb2f288d3206b0de568df5cd2b8fada1209843cc9a8d88a48"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", 
"jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] name = "shortuuid" @@ -3939,13 +3954,13 @@ files = [ [[package]] name = "shtab" -version = "1.6.2" +version = "1.6.4" description = "Automagic shell tab completion for Python CLI applications" optional = false python-versions = ">=3.7" files = [ - {file = "shtab-1.6.2-py3-none-any.whl", hash = "sha256:a1f0b9693c2e3b5c6933783fb356cd8c80896960a97a6acdc258bf2ac31fe11d"}, - {file = "shtab-1.6.2.tar.gz", hash = "sha256:425d3b3e5d1b4ac59119fab5d40dfb01d4462676698e82dc404c707c6fdcd32c"}, + {file = "shtab-1.6.4-py3-none-any.whl", hash = "sha256:4be38887a912091a1640e06f5ccbcbd24e176cf2fcb9ef0c2e011ee22d63834f"}, + {file = "shtab-1.6.4.tar.gz", hash = "sha256:aba9e049bed54ffdb650cb2e02657282d8c0148024b0f500277052df124d47de"}, ] [package.extras] @@ -4105,13 +4120,13 @@ files = [ [[package]] name = "spacy-loggers" -version = "1.0.4" +version = "1.0.5" description = "Logging utilities for SpaCy" optional = false python-versions = ">=3.6" files = [ - {file = "spacy-loggers-1.0.4.tar.gz", hash = "sha256:e6f983bf71230091d5bb7b11bf64bd54415eca839108d5f83d9155d0ba93bf28"}, - {file = "spacy_loggers-1.0.4-py3-none-any.whl", hash = "sha256:e050bf2e63208b2f096b777e494971c962ad7c1dc997641c8f95c622550044ae"}, + {file = "spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24"}, + {file = "spacy_loggers-1.0.5-py3-none-any.whl", hash = 
"sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645"}, ] [[package]] @@ -4428,13 +4443,13 @@ files = [ [[package]] name = "tomlkit" -version = "0.11.8" +version = "0.12.1" description = "Style preserving TOML library" optional = false python-versions = ">=3.7" files = [ - {file = "tomlkit-0.11.8-py3-none-any.whl", hash = "sha256:8c726c4c202bdb148667835f68d68780b9a003a9ec34167b6c673b38eff2a171"}, - {file = "tomlkit-0.11.8.tar.gz", hash = "sha256:9330fc7faa1db67b541b28e62018c17d20be733177d290a13b24c62d1614e0c3"}, + {file = "tomlkit-0.12.1-py3-none-any.whl", hash = "sha256:712cbd236609acc6a3e2e97253dfc52d4c2082982a88f61b640ecf0817eab899"}, + {file = "tomlkit-0.12.1.tar.gz", hash = "sha256:38e1ff8edb991273ec9f6181244a6a391ac30e9f5098e7535640ea6be97a7c86"}, ] [[package]] @@ -4472,20 +4487,20 @@ reference = "torch-gpu" [[package]] name = "tqdm" -version = "4.65.0" +version = "4.66.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" files = [ - {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, - {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"}, + {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, + {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, ] [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} [package.extras] -dev = ["py-make (>=0.1.0)", "twine", "wheel"] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] @@ -4682,23 +4697,23 @@ files = [ [[package]] name = "virtualenv" -version = "20.24.0" +version = "20.24.5" description = "Virtual Python Environment builder" optional = false python-versions = 
">=3.7" files = [ - {file = "virtualenv-20.24.0-py3-none-any.whl", hash = "sha256:18d1b37fc75cc2670625702d76849a91ebd383768b4e91382a8d51be3246049e"}, - {file = "virtualenv-20.24.0.tar.gz", hash = "sha256:e2a7cef9da880d693b933db7654367754f14e20650dc60e8ee7385571f8593a3"}, + {file = "virtualenv-20.24.5-py3-none-any.whl", hash = "sha256:b80039f280f4919c77b30f1c23294ae357c4c8701042086e3fc005963e4e537b"}, + {file = "virtualenv-20.24.5.tar.gz", hash = "sha256:e8361967f6da6fbdf1426483bfe9fca8287c242ac0bc30429905721cefbff752"}, ] [package.dependencies] -distlib = ">=0.3.6,<1" -filelock = ">=3.12,<4" -platformdirs = ">=3.5.1,<4" +distlib = ">=0.3.7,<1" +filelock = ">=3.12.2,<4" +platformdirs = ">=3.9.1,<4" [package.extras] -docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] -test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.3.1)", "pytest-env (>=0.8.1)", "pytest-freezer (>=0.4.6)", "pytest-mock (>=3.10)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=67.8)", "time-machine (>=2.9)"] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] [[package]] name = "voluptuous" @@ -4713,13 +4728,13 @@ files = [ [[package]] name = "wandb" -version = "0.15.5" -description = "A CLI and library for interacting with the Weights and Biases API." +version = "0.15.10" +description = "A CLI and library for interacting with the Weights & Biases API." 
optional = false python-versions = ">=3.6" files = [ - {file = "wandb-0.15.5-py3-none-any.whl", hash = "sha256:8cfb8fdaaf0a35b636d0ca2c2c1262b0e3d835ac37f70fc3094b618f55f63f01"}, - {file = "wandb-0.15.5.tar.gz", hash = "sha256:40c1d9ae501194bff408bc9c555865ffcccf08d2d65dd413547df0c17ed20cb5"}, + {file = "wandb-0.15.10-py3-none-any.whl", hash = "sha256:bc810879fecd1b62ccd498658e55ade3702939090a94b99418630e77e1f43d50"}, + {file = "wandb-0.15.10.tar.gz", hash = "sha256:5ce6d2d6be904da726b591a19ac22d2e71aa4b6a9ed450d8cd817d1712969b13"}, ] [package.dependencies] @@ -4741,11 +4756,11 @@ async = ["httpx (>=0.22.0)"] aws = ["boto3"] azure = ["azure-identity", "azure-storage-blob"] gcp = ["google-cloud-storage"] -grpc = ["grpcio (>=1.27.2)"] kubeflow = ["google-cloud-storage", "kubernetes", "minio", "sh"] launch = ["awscli", "azure-containerregistry", "azure-identity", "azure-storage-blob", "boto3", "botocore", "chardet", "google-auth", "google-cloud-artifact-registry", "google-cloud-compute", "google-cloud-storage", "iso8601", "kubernetes", "nbconvert", "nbformat", "optuna", "typing-extensions"] media = ["bokeh", "moviepy", "numpy", "pillow", "plotly", "rdkit-pypi", "soundfile"] models = ["cloudpickle"] +perf = ["orjson"] sweeps = ["sweeps (>=0.2.0)"] [[package]] @@ -4862,109 +4877,96 @@ files = [ [[package]] name = "xxhash" -version = "3.2.0" +version = "3.3.0" description = "Python binding for xxHash" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "xxhash-3.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:af44b9e59c4b2926a4e3c7f9d29949ff42fcea28637ff6b8182e654461932be8"}, - {file = "xxhash-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1bdd57973e2b802ef32553d7bebf9402dac1557874dbe5c908b499ea917662cd"}, - {file = "xxhash-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b7c9aa77bbce61a5e681bd39cb6a804338474dcc90abe3c543592aa5d6c9a9b"}, - {file = 
"xxhash-3.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:11bf87dc7bb8c3b0b5e24b7b941a9a19d8c1f88120b6a03a17264086bc8bb023"}, - {file = "xxhash-3.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2783d41487ce6d379fdfaa7332fca5187bf7010b9bddcf20cafba923bc1dc665"}, - {file = "xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:561076ca0dcef2fbc20b2bc2765bff099e002e96041ae9dbe910a863ca6ee3ea"}, - {file = "xxhash-3.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a26eeb4625a6e61cedc8c1b39b89327c9c7e1a8c2c4d786fe3f178eb839ede6"}, - {file = "xxhash-3.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d93a44d0104d1b9b10de4e7aadf747f6efc1d7ec5ed0aa3f233a720725dd31bd"}, - {file = "xxhash-3.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:89585adc73395a10306d2e2036e50d6c4ac0cf8dd47edf914c25488871b64f6d"}, - {file = "xxhash-3.2.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a892b4b139126a86bfdcb97cd912a2f8c4e8623869c3ef7b50871451dd7afeb0"}, - {file = "xxhash-3.2.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:e998efb190653f70e0f30d92b39fc645145369a4823bee46af8ddfc244aa969d"}, - {file = "xxhash-3.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e8ed3bd2b8bb3277710843ca63e4f5c3ee6f8f80b083be5b19a7a9905420d11e"}, - {file = "xxhash-3.2.0-cp310-cp310-win32.whl", hash = "sha256:20181cbaed033c72cb881b2a1d13c629cd1228f113046133469c9a48cfcbcd36"}, - {file = "xxhash-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:a0f7a16138279d707db778a63264d1d6016ac13ffd3f1e99f54b2855d6c0d8e1"}, - {file = "xxhash-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5daff3fb5bfef30bc5a2cb143810d376d43461445aa17aece7210de52adbe151"}, - {file = "xxhash-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75bb5be3c5de702a547715f320ecf5c8014aeca750ed5147ca75389bd22e7343"}, - {file = 
"xxhash-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01f36b671ff55cb1d5c2f6058b799b697fd0ae4b4582bba6ed0999678068172a"}, - {file = "xxhash-3.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d4d4519123aac73c93159eb8f61db9682393862dd669e7eae034ecd0a35eadac"}, - {file = "xxhash-3.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:994e4741d5ed70fc2a335a91ef79343c6b1089d7dfe6e955dd06f8ffe82bede6"}, - {file = "xxhash-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:919bc1b010aa6ff0eb918838ff73a435aed9e9a19c3202b91acecd296bf75607"}, - {file = "xxhash-3.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:17b65454c5accbb079c45eca546c27c4782f5175aa320758fafac896b1549d27"}, - {file = "xxhash-3.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b0c094d5e65a46dbf3fe0928ff20873a747e6abfd2ed4b675beeb2750624bc2e"}, - {file = "xxhash-3.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:f94163ebe2d5546e6a5977e96d83621f4689c1054053428cf8d4c28b10f92f69"}, - {file = "xxhash-3.2.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cead7c0307977a00b3f784cff676e72c147adbcada19a2e6fc2ddf54f37cf387"}, - {file = "xxhash-3.2.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:a0e1bd0260c1da35c1883321ce2707ceea07127816ab625e1226ec95177b561a"}, - {file = "xxhash-3.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc8878935671490efe9275fb4190a6062b73277bd273237179b9b5a2aa436153"}, - {file = "xxhash-3.2.0-cp311-cp311-win32.whl", hash = "sha256:a433f6162b18d52f7068175d00bd5b1563b7405f926a48d888a97b90a160c40d"}, - {file = "xxhash-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:a32d546a1752e4ee7805d6db57944f7224afa7428d22867006b6486e4195c1f3"}, - {file = "xxhash-3.2.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = 
"sha256:82daaab720866bf690b20b49de5640b0c27e3b8eea2d08aa75bdca2b0f0cfb63"}, - {file = "xxhash-3.2.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3126df6520cbdbaddd87ce74794b2b6c45dd2cf6ac2b600a374b8cdb76a2548c"}, - {file = "xxhash-3.2.0-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e172c1ee40507ae3b8d220f4048aaca204f203e1e4197e8e652f5c814f61d1aa"}, - {file = "xxhash-3.2.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5384f1d9f30876f5d5b618464fb19ff7ce6c0fe4c690fbaafd1c52adc3aae807"}, - {file = "xxhash-3.2.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26cb52174a7e96a17acad27a3ca65b24713610ac479c99ac9640843822d3bebf"}, - {file = "xxhash-3.2.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fbcd613a5e76b1495fc24db9c37a6b7ee5f214fd85979187ec4e032abfc12ded"}, - {file = "xxhash-3.2.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:f988daf25f31726d5b9d0be6af636ca9000898f9ea43a57eac594daea25b0948"}, - {file = "xxhash-3.2.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:bbc30c98ab006ab9fc47e5ed439c00f706bc9d4441ff52693b8b6fea335163e0"}, - {file = "xxhash-3.2.0-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:2408d49260b0a4a7cc6ba445aebf38e073aeaf482f8e32767ca477e32ccbbf9e"}, - {file = "xxhash-3.2.0-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:3f4152fd0bf8b03b79f2f900fd6087a66866537e94b5a11fd0fd99ef7efe5c42"}, - {file = "xxhash-3.2.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:0eea848758e4823a01abdbcccb021a03c1ee4100411cbeeb7a5c36a202a0c13c"}, - {file = "xxhash-3.2.0-cp36-cp36m-win32.whl", hash = "sha256:77709139af5123c578ab06cf999429cdb9ab211047acd0c787e098dcb3f1cb4d"}, - {file = "xxhash-3.2.0-cp36-cp36m-win_amd64.whl", hash = "sha256:91687671fd9d484a4e201ad266d366b695a45a1f2b41be93d116ba60f1b8f3b3"}, - {file = "xxhash-3.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = 
"sha256:e4af8bc5c3fcc2192c266421c6aa2daab1a18e002cb8e66ef672030e46ae25cf"}, - {file = "xxhash-3.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8be562e2ce3e481d9209b6f254c3d7c5ff920eb256aba2380d2fb5ba75d4f87"}, - {file = "xxhash-3.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9eba0c7c12126b12f7fcbea5513f28c950d28f33d2a227f74b50b77789e478e8"}, - {file = "xxhash-3.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2198c4901a0223c48f6ec0a978b60bca4f4f7229a11ca4dc96ca325dd6a29115"}, - {file = "xxhash-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50ce82a71b22a3069c02e914bf842118a53065e2ec1c6fb54786e03608ab89cc"}, - {file = "xxhash-3.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b5019fb33711c30e54e4e57ae0ca70af9d35b589d385ac04acd6954452fa73bb"}, - {file = "xxhash-3.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:0d54ac023eef7e3ac9f0b8841ae8a376b933043bc2ad428121346c6fa61c491c"}, - {file = "xxhash-3.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c55fa832fc3fe64e0d29da5dc9b50ba66ca93312107cec2709300ea3d3bab5c7"}, - {file = "xxhash-3.2.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:f4ce006215497993ae77c612c1883ca4f3973899573ce0c52fee91f0d39c4561"}, - {file = "xxhash-3.2.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:1afb9b9d27fd675b436cb110c15979976d92d761ad6e66799b83756402f3a974"}, - {file = "xxhash-3.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:baa99cebf95c1885db21e119395f222a706a2bb75a545f0672880a442137725e"}, - {file = "xxhash-3.2.0-cp37-cp37m-win32.whl", hash = "sha256:75aa692936942ccb2e8fd6a386c81c61630ac1b6d6e921698122db8a930579c3"}, - {file = "xxhash-3.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:0a2cdfb5cae9fafb9f7b65fd52ecd60cf7d72c13bb2591ea59aaefa03d5a8827"}, - {file = "xxhash-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:3a68d1e8a390b660d94b9360ae5baa8c21a101bd9c4790a8b30781bada9f1fc6"}, - {file = "xxhash-3.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ce7c3ce28f94302df95eaea7c9c1e2c974b6d15d78a0c82142a97939d7b6c082"}, - {file = "xxhash-3.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0dcb419bf7b0bc77d366e5005c25682249c5521a63fd36c51f584bd91bb13bd5"}, - {file = "xxhash-3.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae521ed9287f86aac979eeac43af762f03d9d9797b2272185fb9ddd810391216"}, - {file = "xxhash-3.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0d16775094423088ffa357d09fbbb9ab48d2fb721d42c0856b801c86f616eec"}, - {file = "xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe454aeab348c42f56d6f7434ff758a3ef90787ac81b9ad5a363cd61b90a1b0b"}, - {file = "xxhash-3.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:052fd0efdd5525c2dbc61bebb423d92aa619c4905bba605afbf1e985a562a231"}, - {file = "xxhash-3.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:02badf3754e2133de254a4688798c4d80f0060635087abcb461415cb3eb82115"}, - {file = "xxhash-3.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:66b8a90b28c13c2aae7a71b32638ceb14cefc2a1c8cf23d8d50dfb64dfac7aaf"}, - {file = "xxhash-3.2.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:649cdf19df175925ad87289ead6f760cd840730ee85abc5eb43be326a0a24d97"}, - {file = "xxhash-3.2.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4b948a03f89f5c72d69d40975af8af241111f0643228796558dc1cae8f5560b0"}, - {file = "xxhash-3.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49f51fab7b762da7c2cee0a3d575184d3b9be5e2f64f26cae2dd286258ac9b3c"}, - {file = "xxhash-3.2.0-cp38-cp38-win32.whl", hash = "sha256:1a42994f0d42b55514785356722d9031f064fd34e495b3a589e96db68ee0179d"}, - {file = "xxhash-3.2.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:0a6d58ba5865475e53d6c2c4fa6a62e2721e7875e146e2681e5337a6948f12e7"}, - {file = "xxhash-3.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:aabdbc082030f8df613e2d2ea1f974e7ad36a539bdfc40d36f34e55c7e4b8e94"}, - {file = "xxhash-3.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:498843b66b9ca416e9d03037e5875c8d0c0ab9037527e22df3b39aa5163214cd"}, - {file = "xxhash-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a910b1193cd90af17228f5d6069816646df0148f14f53eefa6b2b11a1dedfcd0"}, - {file = "xxhash-3.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bb6d8ce31dc25faf4da92991320e211fa7f42de010ef51937b1dc565a4926501"}, - {file = "xxhash-3.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:883dc3d3942620f4c7dbc3fd6162f50a67f050b714e47da77444e3bcea7d91cc"}, - {file = "xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59dc8bfacf89b8f5be54d55bc3b4bd6d74d0c5320c8a63d2538ac7df5b96f1d5"}, - {file = "xxhash-3.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:61e6aa1d30c2af692aa88c4dd48709426e8b37bff6a574ee2de677579c34a3d6"}, - {file = "xxhash-3.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:314ec0bd21f0ee8d30f2bd82ed3759314bd317ddbbd8555668f3d20ab7a8899a"}, - {file = "xxhash-3.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:dad638cde3a5357ad3163b80b3127df61fb5b5e34e9e05a87697144400ba03c7"}, - {file = "xxhash-3.2.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:eaa3ea15025b56076d806b248948612289b093e8dcda8d013776b3848dffff15"}, - {file = "xxhash-3.2.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:7deae3a312feb5c17c97cbf18129f83cbd3f1f9ec25b0f50e2bd9697befb22e7"}, - {file = "xxhash-3.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:add774341c09853b1612c64a526032d95ab1683053325403e1afbe3ad2f374c5"}, - {file = "xxhash-3.2.0-cp39-cp39-win32.whl", hash = 
"sha256:9b94749130ef3119375c599bfce82142c2500ef9ed3280089157ee37662a7137"}, - {file = "xxhash-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:e57d94a1552af67f67b27db5dba0b03783ea69d5ca2af2f40e098f0ba3ce3f5f"}, - {file = "xxhash-3.2.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:92fd765591c83e5c5f409b33eac1d3266c03d3d11c71a7dbade36d5cdee4fbc0"}, - {file = "xxhash-3.2.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8970f6a411a9839a02b23b7e90bbbba4a6de52ace009274998566dc43f36ca18"}, - {file = "xxhash-3.2.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5f3e33fe6cbab481727f9aeb136a213aed7e33cd1ca27bd75e916ffacc18411"}, - {file = "xxhash-3.2.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:368265392cb696dd53907e2328b5a8c1bee81cf2142d0cc743caf1c1047abb36"}, - {file = "xxhash-3.2.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:3b1f3c6d67fa9f49c4ff6b25ce0e7143bab88a5bc0f4116dd290c92337d0ecc7"}, - {file = "xxhash-3.2.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:c5e8db6e1ee7267b7c412ad0afd5863bf7a95286b8333a5958c8097c69f94cf5"}, - {file = "xxhash-3.2.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:761df3c7e2c5270088b691c5a8121004f84318177da1ca1db64222ec83c44871"}, - {file = "xxhash-3.2.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2d15a707e7f689531eb4134eccb0f8bf3844bb8255ad50823aa39708d9e6755"}, - {file = "xxhash-3.2.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e6b2ba4ff53dd5f57d728095e3def7375eb19c90621ce3b41b256de84ec61cfd"}, - {file = "xxhash-3.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:61b0bcf946fdfd8ab5f09179dc2b5c74d1ef47cedfc6ed0ec01fdf0ee8682dd3"}, - {file = "xxhash-3.2.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:f7b79f0f302396d8e0d444826ceb3d07b61977793886ebae04e82796c02e42dc"}, - {file = "xxhash-3.2.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0773cd5c438ffcd5dbff91cdd503574f88a4b960e70cedeb67736583a17a918"}, - {file = "xxhash-3.2.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ec1f57127879b419a2c8d2db9d9978eb26c61ae17e5972197830430ae78d25b"}, - {file = "xxhash-3.2.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3d4b15c00e807b1d3d0b612338c814739dec310b80fb069bd732b98ddc709ad7"}, - {file = "xxhash-3.2.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:9d3f686e3d1c8900c5459eee02b60c7399e20ec5c6402364068a343c83a61d90"}, - {file = "xxhash-3.2.0.tar.gz", hash = "sha256:1afd47af8955c5db730f630ad53ae798cf7fae0acb64cebb3cf94d35c47dd088"}, + {file = "xxhash-3.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:70ef7288d1cb1ad16e02d101ea43bb0e392d985d60b9b0035aee80663530960d"}, + {file = "xxhash-3.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:44ff8c673cab50be46784e0aec62aa6f0ca9ea765e2b0690e8945d0cd950dcaf"}, + {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfebc90273ae2beb813d8118a2bfffb5a5a81ac054fbfd061ea18fd0a81db0ac"}, + {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9084e68bedbd665c7e9241a7b597c28f4775edeb3941bf608ecb38732a5f8fb5"}, + {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d72493a14a3e89564b1a6c7400b9b40621e8f4692410706ef27c66aeadc7b431"}, + {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98779cbe9068dd7734cc3210693894d5cc9b156920e9c336f10fb99f46bebbd8"}, + {file = "xxhash-3.3.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:499f8a12767dd28b98ab6b7c7da7d294564e4c9024a2aaa5d0b0b98a8bef2f92"}, + {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4dabda7f42c548f98d8e07e390bda2953fc58302c0e07ded7b3fe0637e7ecd2f"}, + {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:c416409646c793c46370f0f1859253302ee70aeda5278c2a0ca41462f8ec1244"}, + {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b8bd31aaad8a80a7302730676cec26bea3ef1fd9835875aa47fea073aca9fe05"}, + {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:3af8e3bcd630f905efbdfe7a51b51fc1ca3c9dca8b155f841925f3ad41685d41"}, + {file = "xxhash-3.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d86b79c707fc7025d967af71db652429a06a8179175e45bd2e9f17b8af6f5949"}, + {file = "xxhash-3.3.0-cp310-cp310-win32.whl", hash = "sha256:98fe771f36ee9d3a1f5741424a956a2ba9651d9508a9f64a024b57f2cf796414"}, + {file = "xxhash-3.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:0a65131f7f731ecf7e3dd27f09d877aff3000a79a446caaa2c0d8d0ec0bc7186"}, + {file = "xxhash-3.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a9761e425e79d23797fa0bec2d781dbadb9fe5dcc2bf69030855f5e393c3bec8"}, + {file = "xxhash-3.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d28c7ef1deb3c3ac5f5290176ca3d501daa97c2e1f7443bf5d8b61ac651794b2"}, + {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:701b7cefffc25de1b7ddfae6505da70a3b3a11e312c2e2b33b09e180bbceb43d"}, + {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b1644f8b8e19a242c3047a089541067248a651038cabb9fcab3c13eb1dfcd757"}, + {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20e7d0e3488cc0f0dbe360731b7fe32e1f2df46bf2de2db3317d301efb93084c"}, + {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:156c52eca2b20f9839723bef0b929a290f6c2f1c98ccb24e82f58f96f3c16007"}, + {file = "xxhash-3.3.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d6ce4d3828d79044ed08994e196c20f69c18133ed8a4286afe3e98989adeeac"}, + {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b85b63757ade2439c8d7d71842c40d42c0ab3b69279ed02afbd3b1635f7d2b4b"}, + {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b2b9051e40b7b649a9a2a38fb223ca6a593d332012df885746b81968948f9435"}, + {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:81b7ce050f26fc1daaaa0d24e320815306736d14608e1ba31920e693a7ca9afb"}, + {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:7442500fcce71669953ca959682dcd47452bc3f9c95c8d88315874aeabec9f82"}, + {file = "xxhash-3.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:36a05bf59a515cfb07f3f83373c527fff2ecaa77eaf30c968c788aea582070a1"}, + {file = "xxhash-3.3.0-cp311-cp311-win32.whl", hash = "sha256:da16f9cd62c6fde74683be1b28c28ef865e706da13e3bee4ba836fcc520de0cc"}, + {file = "xxhash-3.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:40fd49ef6964b1c90c0bea63cd184f6d0b36e59144a080e8b3ac2c4c06bf6bf2"}, + {file = "xxhash-3.3.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:672c60cce1f8026ae32c651f877aa64f342876083a36a4b1ff91bc876aaf0e34"}, + {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bb6c83d7a65dd3065566c77425ba72df96982174e8ef613d809052d68ae77ab"}, + {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a4170f3016b621e3200ebfcc18de6f50eb8e8fc1303e16324b1f5625afd51b57"}, + {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bfb9c45d502ab38c0f4edf98a678694ae0f345613ef4900ade98c71f64db4d78"}, + {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:48af026a2b1569666da42a478248a1f03f4e2350a34eb661afe3cb45429ca1d7"}, + {file = "xxhash-3.3.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe627de8fe8ddfa8b6477bda4ae5d5843ad1a0c83601dcff72247039465cc901"}, + {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:427fc60a188e345534f35b0aa76f7640c5ddf0354f1c9ad826a2bc086282982d"}, + {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:d80acb20c7f268fe3150ac0be6a6b798062af56a1795eef855b26c9eae11a99c"}, + {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:e71100818943422d1fbbe460e7be7fc4f2d2ba9371b2a745eb09e29ef0493f4a"}, + {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:e3b9bb5fdbe284c7b61c5d82c76688e52bbaf48ab1e53de98c072cc696fa331f"}, + {file = "xxhash-3.3.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1e25f6c8c46cf1ed8237f610abb231093a748c97d6c2c092789a7cad7e7ef290"}, + {file = "xxhash-3.3.0-cp37-cp37m-win32.whl", hash = "sha256:928208dfecc563be59ae91868d1658e78809cb1e6a0bd74960a96c915db6390c"}, + {file = "xxhash-3.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:bd1b4531a66da6dde1974662c1fd6fb1a2f27e40542e3df5e5e5dbab8ea4aee7"}, + {file = "xxhash-3.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:deebb296df92e082b6d0171a7d6227b503e2897cea4f8bdd3d708094974d4cf6"}, + {file = "xxhash-3.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd96e9cb0e2baa294e6d572207d9731c3bb8e2511f1ff70f2bf17266b4488bd9"}, + {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3756b44bf247e422a2e47a38f25d03cf4a5ed539fdc2be3c60043e872e6ff13d"}, + {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69550c3c053b8f135ceac97b85dc1b2bc54b7613a966f550f32b43bed81c788a"}, + {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:9fc8736fc3e0c5aad435520873b9d2e27ddcc5a830b07e00e9c4d3a61ded9675"}, + {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80ead7774392efbd95f9f701155048f9ca26cf55133db6f5bb5a0ec69376bda5"}, + {file = "xxhash-3.3.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b8737c9b3fd944d856faafa92c95f6198649ad57987935b6d965d086938be917"}, + {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2c8e078d0b9f85212801c41bd9eec8122003929686b0ee33360ffbfdf1a189ab"}, + {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f399269d20ef1dd910331f9ad49e8510c3ba2aa657b623293b536038f266a5c5"}, + {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:f3661decef5f9ff7ab50edbef463bf7dc717621b56755dbae5458a946a033b10"}, + {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:5ec374d0f1e7d43ef48a4ff643600833d7a325ecc6933b4d6ad9282f55751cf7"}, + {file = "xxhash-3.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:39a947ff02d9a85673f5ce1f6f34059e24c714a797440485bd81b2c3cb69a7ff"}, + {file = "xxhash-3.3.0-cp38-cp38-win32.whl", hash = "sha256:4a4f0645a0ec03b229fb04f2e66bdbcb1ffd341a70d6c86c3ee015ffdcd70fad"}, + {file = "xxhash-3.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:8af5a687c0fb4357c230eec8a57ca07d3172faa3cb69beb0cbad40672ae6fa4b"}, + {file = "xxhash-3.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e5bfafda019ecc6202af6f3cf08220fa66af9612ba16ef831033ae3ac7bd1f89"}, + {file = "xxhash-3.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d113b433bc817adf845689a051363777835577858263ec4325d1934fcb7e394"}, + {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56aacf4bf65f575c0392be958aceff719d850950bb6af7d804b32d4bc293159c"}, + {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:0f5d3e4e0937dad05585e9bd772bbdf0ca40cd8b2f54789d7a1f3091b608118c"}, + {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23605d7fc67bc7daa0d263b3a26de3375cfcc0b51ab7de5026625415c05b6fed"}, + {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe525be0392d493558a2b10d764bcaae9850cc262b417176a8b001f16e085fc6"}, + {file = "xxhash-3.3.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b234d08786884f5c8d55dfebb839cfbd846d812e3a052c39ca7e8ce7055fed68"}, + {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b031395b4b9c3085d9ea1ce89896ab01a65fc63172b2bfda5dd318fefe5e2f93"}, + {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:5afe44da46b48c75169e622a532dca3fe585343c0577cfd7c18ecd3f1200305d"}, + {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c59f233f38b6a49d5e4ddf16be910a5bbf36a2989b6b2c8591853fb9f5a5e691"}, + {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:ed016e278c5c4633270903c7cf3b9dfb0bd293b7335e43fe695cb95541da53c9"}, + {file = "xxhash-3.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a8bd6612fb35487e9ab329bb37b3df44f58baf752010dde9282593edbfed7e7"}, + {file = "xxhash-3.3.0-cp39-cp39-win32.whl", hash = "sha256:015a0498bde85364abc53fcc713af962dd4555391929736d9c0ff2c555436a03"}, + {file = "xxhash-3.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:06a484097af32caf1cfffadd60c3ca140c9e52b40a551fb1f6f0fdfd6f7f8977"}, + {file = "xxhash-3.3.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c3809740124bbc777d29e3ae53de24f4c13fd5e62878086a8feadf0dcb654a5"}, + {file = "xxhash-3.3.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae092f0daaeece2acdd6ec46e2ab307d8d6f22b01ecca14dc6078844dbd88339"}, + {file = "xxhash-3.3.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", 
hash = "sha256:3498e72ff2610b049b97bb81d1ea6e7bfa5b7a45efb3f255d77ec2fa2bc91653"}, + {file = "xxhash-3.3.0-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0004dded9d86f129961326e980420187640fb7ba65a184009429861c1d09df7"}, + {file = "xxhash-3.3.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:41c8bfd27191928bae6fd2b66872965532267785094a03c0ee5f358d9dba51c2"}, + {file = "xxhash-3.3.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:71db8498e329cef3588b0617f762a3fe31d899872e76a68ce2840e35a1318a5b"}, + {file = "xxhash-3.3.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d1d24d71b6209bc0124286932c4f0660c1103cb996fe34cb374bc12ac251940"}, + {file = "xxhash-3.3.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61004587a09b5b385e43d95ffe3a76c9d934dfd79ea38272d5c20ddfba8eab8f"}, + {file = "xxhash-3.3.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3f0c92e3fa826425c73acafb31e022a719c85423847a9433d3a9e61e4ac97543"}, + {file = "xxhash-3.3.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:367e03f1484ce471c94e731b98f5e4a05b43e7188b16692998e1cc89fd1159a5"}, + {file = "xxhash-3.3.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed04c47dfaab98fcda0b748af9ee6fe8c888a0a0fbd13720e0f0221671e387e1"}, + {file = "xxhash-3.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cbfde62516435ca198220aff048a8793383cb7047c7b88714a061968bca786d"}, + {file = "xxhash-3.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73682225faa973ee56743f0fcd36bfcbfec503be258e0e420fb34313f52f1e7b"}, + {file = "xxhash-3.3.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d49efdce2086c2c506af20ed18a1115b40af7aad6d4ee27cb31d7c810585a3f2"}, + {file = 
"xxhash-3.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:546a0bb8e5a657cadf0da290b30ccd561cb89c256a5421ab8d5eb12eaf087349"}, + {file = "xxhash-3.3.0.tar.gz", hash = "sha256:c3f9e322b1ebeebd44e3d9d2d9b124e0c550c1ef41bd552afdcdd719516ee41a"}, ] [[package]] From 629e6b44b050c8168af82675b631644cc0d5d757 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Mon, 11 Sep 2023 17:37:52 +0100 Subject: [PATCH 045/102] Spark config --- grants_tagger_light/retagging/retagging.py | 27 +++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 80594b6b..c28b6579 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -14,12 +14,27 @@ from sklearn.metrics import classification_report import pyarrow.parquet as pq -spark = nlp.start(spark_conf={'spark.executor.memory': '10g', - 'spark.driver.maxResultSize': '6g', - 'spark.executor.memoryOverhead': '1g', - 'spark.memory.fraction': '0.6'} - ) -spark.sparkContext.setLogLevel("OFF") +spark = nlp.start(spark_conf={ + 'spark.driver.memory': '8g', + 'spark.executor.memory': '8g', + # Fraction of heap space used for execution memory + 'spark.memory.fraction': '0.7', + # Fraction of heap space used for storage memory + 'spark.memory.storageFraction': '0.3', + # Enable off-heap storage (for large datasets) + 'spark.memory.offHeap.enabled': 'true', + # Off-heap memory size (adjust as needed) + 'spark.memory.offHeap.size': '8g', + 'spark.shuffle.manager': 'sort', + 'spark.shuffle.spill': 'true', + 'spark.master': f'local[{os.cpu_count()}]', + 'spark.default.parallelism': f'{os.cpu_count()*2}', + 'spark.speculation': 'false', + 'spark.task.maxFailures': '4', + 'spark.local.dir': f"{os.path.join(os.getcwd(), '.spark')}", + 'spark.eventLog.enabled': 'true', + 'spark.eventLog.dir': f"{os.path.join(os.getcwd(), '.sparklogs')}" +}) retag_app = typer.Typer() From 
127c8b4a9307fb8642e3d50c7b6823f5131e0f2a Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Mon, 11 Sep 2023 17:41:33 +0100 Subject: [PATCH 046/102] Spark config --- grants_tagger_light/retagging/retagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index c28b6579..1401b057 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -31,9 +31,9 @@ 'spark.default.parallelism': f'{os.cpu_count()*2}', 'spark.speculation': 'false', 'spark.task.maxFailures': '4', - 'spark.local.dir': f"{os.path.join(os.getcwd(), '.spark')}", + 'spark.local.dir': f"{os.path.join(os.getcwd())}", 'spark.eventLog.enabled': 'true', - 'spark.eventLog.dir': f"{os.path.join(os.getcwd(), '.sparklogs')}" + 'spark.eventLog.dir': f"{os.path.join(os.getcwd())}" }) retag_app = typer.Typer() From 2dd32434fc595a69196fa1e1758eb9699f1f547b Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Mon, 11 Sep 2023 17:49:46 +0100 Subject: [PATCH 047/102] Spark config --- grants_tagger_light/retagging/retagging.py | 26 +++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 1401b057..5941cc72 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -15,16 +15,16 @@ import pyarrow.parquet as pq spark = nlp.start(spark_conf={ - 'spark.driver.memory': '8g', - 'spark.executor.memory': '8g', + 'spark.driver.memory': '12g', + 'spark.executor.memory': '12g', # Fraction of heap space used for execution memory - 'spark.memory.fraction': '0.7', + 'spark.memory.fraction': '0.5', # Fraction of heap space used for storage memory - 'spark.memory.storageFraction': '0.3', + 'spark.memory.storageFraction': '0.5', # Enable off-heap storage (for large datasets) 'spark.memory.offHeap.enabled': 'true', # Off-heap memory size (adjust as needed) - 'spark.memory.offHeap.size': '8g', + 'spark.memory.offHeap.size': '16g', 'spark.shuffle.manager': 'sort', 'spark.shuffle.spill': 'true', 'spark.master': f'local[{os.cpu_count()}]', @@ -238,10 +238,20 @@ def retag( pipeline, lightpipeline = _create_pipelines(batch_size, train_df, test_df) logging.info(f"- Optimizing dataframe...") - dset = dset.remove_columns(["title", "journal", "year"]) data_in_parquet = f"{save_to_path}.data.parquet" - pq.write_table(dset.data.table, data_in_parquet) - del dset, train, train_df, test, test_df, pos_x_train, pos_x_test, neg_x_train, neg_x_test, positive_dset, \ + optimize=True + if os.path.isfile(data_in_parquet): + answer = input("Optimized dataframe found. Do you want to use it? [y|n]: ") + while answer not in ['y', 'n']: + answer = input("Optimized dataframe found. Do you want to use it? 
[y|n]: ") + if answer == 'y': + optimize = False + + if optimize: + dset = dset.remove_columns(["title", "journal", "year"]) + + pq.write_table(dset.data.table, data_in_parquet) + del dset, train, train_df, test, test_df, pos_x_train, pos_x_test, neg_x_train, neg_x_test, positive_dset,\ negative_dset sdf = spark.read.load(data_in_parquet) From f486608361367b0fab9931f2806889f1d7674de9 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Mon, 11 Sep 2023 17:58:09 +0100 Subject: [PATCH 048/102] Spark config --- grants_tagger_light/retagging/retagging.py | 74 ++++++++++++---------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 5941cc72..1c911c34 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -22,9 +22,9 @@ # Fraction of heap space used for storage memory 'spark.memory.storageFraction': '0.5', # Enable off-heap storage (for large datasets) - 'spark.memory.offHeap.enabled': 'true', + 'spark.memory.offHeap.enabled': 'false', # Off-heap memory size (adjust as needed) - 'spark.memory.offHeap.size': '16g', + # 'spark.memory.offHeap.size': '10g', 'spark.shuffle.manager': 'sort', 'spark.shuffle.spill': 'true', 'spark.master': f'local[{os.cpu_count()}]', @@ -54,11 +54,11 @@ def _load_data(dset: Dataset, tag, limit=100, split=0.8): return train_dset, test_dset -def _create_pipelines(batch_size, train_df, test_df): +def _create_pipelines(save_to_path, batch_size, train_df, test_df, tag): """ - This method creates a Spark pipeline (to run on dataframes) and a Spark Lightpipeline (to run on arrays of str) - Lightpipelines are faster in less data. + This method creates a Spark pipeline (to run on dataframes) Args: + save_to_path: path where to save the final results. batch_size: max size of the batch to train. Since data is small for training, I limit it to 8. 
train_df: Spark Dataframe of the train data test_df: Spark Dataframe of the test data @@ -75,39 +75,49 @@ def _create_pipelines(batch_size, train_df, test_df): .setInputCols(["document"]) \ .setOutputCol("sentence_embeddings") - # I'm limiting the batch size to 8 since there are not many examples and big batch sizes will decrease accuracy - classifierdl = nlp.ClassifierDLApproach() \ - .setInputCols(["sentence_embeddings"]) \ - .setOutputCol("label") \ - .setLabelColumn("featured_tag") \ - .setMaxEpochs(25) \ - .setLr(0.001) \ - .setBatchSize(max(batch_size, 8)) \ - .setEnableOutputLogs(True) - # .setOutputLogsPath('logs') - - clf_pipeline = nlp.Pipeline(stages=[document_assembler, - embeddings, - classifierdl]) - - fit_clf_pipeline = clf_pipeline.fit(train_df) - preds = fit_clf_pipeline.transform(test_df) - preds_df = preds.select('featured_tag', 'abstractText', 'label.result').toPandas() - preds_df['result'] = preds_df['result'].apply(lambda x: x[0]) - logging.info(classification_report(preds_df['featured_tag'], preds_df['result'])) - - logging.info("- Loading the model for prediction...") - fit_clf_pipeline.stages[-1].write().overwrite().save('clf_tmp') - fit_clf_model = nlp.ClassifierDLModel.load('clf_tmp') + retrain = True + clf_dir = f"{save_to_path}.{tag}_clf" + if os.path.isdir(clf_dir): + answer = "Classifier already trained. Do you want to reuse it? [y|n]: " + while answer not in ['y', 'n']: + answer = "Classifier already trained. Do you want to reuse it? 
[y|n]: " + if answer == 'n': + retrain = False + + if retrain: + # I'm limiting the batch size to 8 since there are not many examples and big batch sizes will decrease accuracy + classifierdl = nlp.ClassifierDLApproach() \ + .setInputCols(["sentence_embeddings"]) \ + .setOutputCol("label") \ + .setLabelColumn("featured_tag") \ + .setMaxEpochs(25) \ + .setLr(0.001) \ + .setBatchSize(max(batch_size, 8)) \ + .setEnableOutputLogs(True) + # .setOutputLogsPath('logs') + + clf_pipeline = nlp.Pipeline(stages=[document_assembler, + embeddings, + classifierdl]) + + fit_clf_pipeline = clf_pipeline.fit(train_df) + preds = fit_clf_pipeline.transform(test_df) + preds_df = preds.select('featured_tag', 'abstractText', 'label.result').toPandas() + preds_df['result'] = preds_df['result'].apply(lambda x: x[0]) + logging.info(classification_report(preds_df['featured_tag'], preds_df['result'])) + + logging.info("- Loading the model for prediction...") + fit_clf_pipeline.stages[-1].write().overwrite().save(clf_dir) + + fit_clf_model = nlp.ClassifierDLModel.load(clf_dir) pred_pipeline = nlp.Pipeline(stages=[document_assembler, embeddings, fit_clf_model]) pred_df = spark.createDataFrame([['']]).toDF("text") fit_pred_pipeline = pred_pipeline.fit(pred_df) - fit_pred_lightpipeline = nlp.LightPipeline(fit_pred_pipeline) - return fit_pred_pipeline, fit_pred_lightpipeline + return fit_pred_pipeline def _annotate(save_to_path, dset, tag, limit, is_positive): @@ -235,7 +245,7 @@ def retag( logging.info(f"- Test dataset size: {test_df.count()}") logging.info(f"- Creating `sparknlp` pipelines...") - pipeline, lightpipeline = _create_pipelines(batch_size, train_df, test_df) + pipeline = _create_pipelines(save_to_path, batch_size, train_df, test_df, tag) logging.info(f"- Optimizing dataframe...") data_in_parquet = f"{save_to_path}.data.parquet" From 6be482baf841665aa12399c4eb7878a16c910bf4 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Mon, 11 Sep 2023 18:01:04 +0100 Subject: [PATCH 049/102] Spark config --- grants_tagger_light/retagging/retagging.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 1c911c34..aacf300b 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -76,7 +76,7 @@ def _create_pipelines(save_to_path, batch_size, train_df, test_df, tag): .setOutputCol("sentence_embeddings") retrain = True - clf_dir = f"{save_to_path}.{tag}_clf" + clf_dir = f"{save_to_path}.{tag.replace(' ', '')}_clf" if os.path.isdir(clf_dir): answer = "Classifier already trained. Do you want to reuse it? [y|n]: " while answer not in ['y', 'n']: @@ -122,7 +122,7 @@ def _create_pipelines(save_to_path, batch_size, train_df, test_df, tag): def _annotate(save_to_path, dset, tag, limit, is_positive): human_supervision = {} - curation_file = f"{save_to_path}.{tag}.curation.json" + curation_file = f"{save_to_path}.{tag.replace(' ', '')}.curation.json" if os.path.isfile(curation_file): with open(curation_file, 'r') as f: human_supervision = json.load(f) @@ -219,7 +219,7 @@ def retag( logging.info(f"- Curating data...") _curate(save_to_path, positive_dset, negative_dset, tag, train_examples) - curation_file = f"{save_to_path}.{tag}.curation.json" + curation_file = f"{save_to_path}.{tag.replace(' ', '')}.curation.json" if os.path.isfile(curation_file): with open(curation_file, "r") as fr: # I load the curated data file @@ -269,7 +269,7 @@ def retag( sdf = sdf.repartition(num_proc) logging.info(f"- Retagging {tag}...") - pipeline.transform(sdf).write.mode('overwrite').save(f"{save_to_path}.{tag}.prediction") + pipeline.transform(sdf).write.mode('overwrite').save(f"{save_to_path}.{tag.replace(' ', '')}.prediction") # 1) We load # 2) We filter to get those results where the predicted tag was not initially in meshMajor From 
dca895b00e2a6a3f9834d24c076d9ba73713080c Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Mon, 11 Sep 2023 18:05:25 +0100 Subject: [PATCH 050/102] Spark config --- grants_tagger_light/retagging/retagging.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index aacf300b..28c1adbd 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -15,16 +15,16 @@ import pyarrow.parquet as pq spark = nlp.start(spark_conf={ - 'spark.driver.memory': '12g', - 'spark.executor.memory': '12g', + 'spark.driver.memory': '6g', + 'spark.executor.memory': '6g', # Fraction of heap space used for execution memory - 'spark.memory.fraction': '0.5', + 'spark.memory.fraction': '0.6', # Fraction of heap space used for storage memory - 'spark.memory.storageFraction': '0.5', + 'spark.memory.storageFraction': '0.4', # Enable off-heap storage (for large datasets) - 'spark.memory.offHeap.enabled': 'false', + 'spark.memory.offHeap.enabled': 'true', # Off-heap memory size (adjust as needed) - # 'spark.memory.offHeap.size': '10g', + 'spark.memory.offHeap.size': '10g', 'spark.shuffle.manager': 'sort', 'spark.shuffle.spill': 'true', 'spark.master': f'local[{os.cpu_count()}]', From c8584bd3f60da01618ac4b00a2d46361e2642495 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Mon, 11 Sep 2023 18:07:29 +0100 Subject: [PATCH 051/102] Spark config --- grants_tagger_light/retagging/retagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 28c1adbd..1563a925 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -78,9 +78,9 @@ def _create_pipelines(save_to_path, batch_size, train_df, test_df, tag): retrain = True clf_dir = f"{save_to_path}.{tag.replace(' ', '')}_clf" if os.path.isdir(clf_dir): - answer = "Classifier already trained. Do you want to reuse it? [y|n]: " + answer = input("Classifier already trained. Do you want to reuse it? [y|n]: ") while answer not in ['y', 'n']: - answer = "Classifier already trained. Do you want to reuse it? [y|n]: " + answer = input("Classifier already trained. Do you want to reuse it? [y|n]: ") if answer == 'n': retrain = False From 13dbf67f355409faac077bdf654df9006c6233d4 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Mon, 11 Sep 2023 18:09:43 +0100 Subject: [PATCH 052/102] Spark config --- grants_tagger_light/retagging/retagging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 1563a925..b7c126bd 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -81,7 +81,7 @@ def _create_pipelines(save_to_path, batch_size, train_df, test_df, tag): answer = input("Classifier already trained. Do you want to reuse it? [y|n]: ") while answer not in ['y', 'n']: answer = input("Classifier already trained. Do you want to reuse it? [y|n]: ") - if answer == 'n': + if answer == 'y': retrain = False if retrain: From c5667bee1ebba0fade0c47409ebad3a70003d3aa Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Mon, 11 Sep 2023 18:12:27 +0100 Subject: [PATCH 053/102] Spark config --- grants_tagger_light/retagging/retagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index b7c126bd..f9849c70 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -15,7 +15,7 @@ import pyarrow.parquet as pq spark = nlp.start(spark_conf={ - 'spark.driver.memory': '6g', + 'spark.driver.memory': '12g', 'spark.executor.memory': '6g', # Fraction of heap space used for execution memory 'spark.memory.fraction': '0.6', @@ -24,7 +24,7 @@ # Enable off-heap storage (for large datasets) 'spark.memory.offHeap.enabled': 'true', # Off-heap memory size (adjust as needed) - 'spark.memory.offHeap.size': '10g', + 'spark.memory.offHeap.size': '6g', 'spark.shuffle.manager': 'sort', 'spark.shuffle.spill': 'true', 'spark.master': f'local[{os.cpu_count()}]', From d71acf2cabacb8475668db4195b29e3ceeac226c Mon Sep 17 00:00:00 2001 From: Nick Sorros Date: Tue, 12 Sep 2023 11:19:28 +0300 Subject: [PATCH 054/102] Remove duplicates and add reference number' --- scripts/create_xlinear_bertmesh_comparison_csv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/create_xlinear_bertmesh_comparison_csv.py b/scripts/create_xlinear_bertmesh_comparison_csv.py index 1f525a2c..7341c1f2 100644 --- a/scripts/create_xlinear_bertmesh_comparison_csv.py +++ b/scripts/create_xlinear_bertmesh_comparison_csv.py @@ -130,8 +130,9 @@ def create_comparison_csv( active_grants = active_grants[~active_grants["Synopsis"].isna()] active_grants.sample(frac=1) active_grants_sample = active_grants.iloc[:active_portfolio_sample] - active_grants_sample = pd.DataFrame({"abstract": active_grants_sample["Synopsis"]}) + active_grants_sample = pd.DataFrame({"abstract": active_grants_sample["Synopsis"], "Reference": active_grants_sample["Reference"]}) 
active_grants_sample["active_portfolio"] = 1 + active_grants.drop_duplicates(subset="abstract", inplace=True) grants_sample = pd.concat([grants_sample, active_grants_sample]) abstracts = grants_sample["abstract"].tolist() From f7d70b089a7d67e6787a7c83dbaf847c4325063a Mon Sep 17 00:00:00 2001 From: Nick Sorros Date: Tue, 12 Sep 2023 11:20:41 +0300 Subject: [PATCH 055/102] Switch to save in excel format --- pipelines/generate_grants/dvc.yaml | 2 +- scripts/create_xlinear_bertmesh_comparison_csv.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/generate_grants/dvc.yaml b/pipelines/generate_grants/dvc.yaml index 0a982435..18ac053c 100644 --- a/pipelines/generate_grants/dvc.yaml +++ b/pipelines/generate_grants/dvc.yaml @@ -22,4 +22,4 @@ stages: - scripts/create_xlinear_bertmesh_comparison_csv.py wdir: "../.." outs: - - data/grants_comparison/comparison.csv + - data/grants_comparison/comparison.xlsx diff --git a/scripts/create_xlinear_bertmesh_comparison_csv.py b/scripts/create_xlinear_bertmesh_comparison_csv.py index 7341c1f2..b455ec39 100644 --- a/scripts/create_xlinear_bertmesh_comparison_csv.py +++ b/scripts/create_xlinear_bertmesh_comparison_csv.py @@ -193,7 +193,7 @@ def create_comparison_csv( ) # Output df to csv - grants_sample.to_csv(output_path, index=False) + grants_sample.to_excel(output_path, index=False) if __name__ == "__main__": From 5cb92d4cd89783802df62c28edb7383bbfabfd18 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 12 Sep 2023 08:33:48 +0000 Subject: [PATCH 056/102] Run pre-commit --- README.md | 20 +++++++++---------- examples/augment.sh | 2 +- examples/augment_specific_tags.sh | 2 +- examples/preprocess_splitting_by_fract.sh | 2 +- examples/preprocess_splitting_by_rows.sh | 2 +- examples/preprocess_splitting_by_years.sh | 2 +- examples/resume_train_by_epoch.sh | 2 +- examples/resume_train_by_steps.sh | 2 +- .../augmentation/prompt.template | 2 +- .../create_xlinear_bertmesh_comparison_csv.py | 7 ++++++- 10 files 
changed, 24 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 7535dde7..25657ed8 100644 --- a/README.md +++ b/README.md @@ -72,12 +72,12 @@ in square brackets the commands that are not implemented yet ## ⚙️Preprocess This process is optional to run, since it can be directly managed by the `Train` process. -- If you run it manually, it will store the data in local first, which can help if you need finetune in the future, +- If you run it manually, it will store the data in local first, which can help if you need finetune in the future, rerun, etc. -- If not run it, the `train` step will preprocess and then run, without any extra I/O operations on disk, +- If not run it, the `train` step will preprocess and then run, without any extra I/O operations on disk, which may add latency depending on the infrastructure. -It requires data in `jsonl` format for parallelization purposes. In `data/raw` you can find `allMesH_2021.jsonl` +It requires data in `jsonl` format for parallelization purposes. In `data/raw` you can find `allMesH_2021.jsonl` already prepared for the preprocessing step. If your data is in `json` format, trasnform it to `jsonl` with tools as `jq` or using Python. @@ -96,9 +96,9 @@ your own data under development. ### Preprocessing bertmesh ``` - Usage: grants-tagger preprocess mesh [OPTIONS] DATA_PATH SAVE_TO_PATH - MODEL_KEY - + Usage: grants-tagger preprocess mesh [OPTIONS] DATA_PATH SAVE_TO_PATH + MODEL_KEY + ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ * data_path TEXT Path to mesh.jsonl [default: None] [required] │ │ * save_to_path TEXT Path to save the serialized PyArrow dataset after preprocessing [default: None] [required] │ @@ -122,7 +122,7 @@ The command will train a model and save it to the specified path. 
Currently we s ### bertmesh ``` - Usage: grants-tagger train bertmesh [OPTIONS] MODEL_KEY DATA_PATH + Usage: grants-tagger train bertmesh [OPTIONS] MODEL_KEY DATA_PATH ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ * model_key TEXT Pretrained model key. Local path or HF location [default: None] [required] │ @@ -143,7 +143,7 @@ The command will train a model and save it to the specified path. Currently we s #### About `model_key` `model_key` possible values are: -- A HF location for a pretrained / finetuned model +- A HF location for a pretrained / finetuned model - "" to load a model by default and train from scratch (`microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract`) #### About `sharding` @@ -152,7 +152,7 @@ to improve performance on big datasets. To enable it: - set shards to something bigger than 1 (Recommended: same number as cpu cores) #### Other arguments -Besides those arguments, feel free to add any other TrainingArgument from Hugging Face or Wand DB. +Besides those arguments, feel free to add any other TrainingArgument from Hugging Face or Wand DB. This is the example used to train reaching a ~0.6 F1, also available at `examples/train_by_epochs.sh` ```commandline grants-tagger train bertmesh \ @@ -365,7 +365,7 @@ and you would be able to run `grants_tagger preprocess epmc_mesh ...` ## 🚦 Test -To run the test you need to have installed the `dev` dependencies first. +To run the test you need to have installed the `dev` dependencies first. This is done by running `poetry install --with dev` after you are in the sell (`poetry shell`) Run tests with `pytest`. 
If you want to write some additional tests, diff --git a/examples/augment.sh b/examples/augment.sh index 9ad482b1..9e801e77 100644 --- a/examples/augment.sh +++ b/examples/augment.sh @@ -1,3 +1,3 @@ grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \ --min-examples 25 \ - --concurrent-calls 25 \ No newline at end of file + --concurrent-calls 25 diff --git a/examples/augment_specific_tags.sh b/examples/augment_specific_tags.sh index 3ce920c8..1bd45293 100644 --- a/examples/augment_specific_tags.sh +++ b/examples/augment_specific_tags.sh @@ -2,4 +2,4 @@ grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \ --tags-file-path tags_to_augment.txt \ --examples 25 \ - --concurrent-calls 25 \ No newline at end of file + --concurrent-calls 25 diff --git a/examples/preprocess_splitting_by_fract.sh b/examples/preprocess_splitting_by_fract.sh index 526133f2..93a0ba67 100644 --- a/examples/preprocess_splitting_by_fract.sh +++ b/examples/preprocess_splitting_by_fract.sh @@ -1,2 +1,2 @@ grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \ - --test-size 0.05 \ No newline at end of file + --test-size 0.05 diff --git a/examples/preprocess_splitting_by_rows.sh b/examples/preprocess_splitting_by_rows.sh index 42ba15a4..0a4f82f6 100644 --- a/examples/preprocess_splitting_by_rows.sh +++ b/examples/preprocess_splitting_by_rows.sh @@ -1,2 +1,2 @@ grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \ - --test-size 25000 \ No newline at end of file + --test-size 25000 diff --git a/examples/preprocess_splitting_by_years.sh b/examples/preprocess_splitting_by_years.sh index 629e74fa..28870229 100644 --- a/examples/preprocess_splitting_by_years.sh +++ b/examples/preprocess_splitting_by_years.sh @@ -1,4 +1,4 @@ grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \ --test-size 25000 \ --train-years 
2016,2017,2018,2019 \ - --test-years 2020,2021 \ No newline at end of file + --test-years 2020,2021 diff --git a/examples/resume_train_by_epoch.sh b/examples/resume_train_by_epoch.sh index 38b520b4..792dcdac 100644 --- a/examples/resume_train_by_epoch.sh +++ b/examples/resume_train_by_epoch.sh @@ -34,4 +34,4 @@ grants-tagger train bertmesh \ --save_strategy epoch \ --wandb_project wellcome-mesh \ --wandb_name test-train-all \ - --wandb_api_key ${WANDB_API_KEY} \ No newline at end of file + --wandb_api_key ${WANDB_API_KEY} diff --git a/examples/resume_train_by_steps.sh b/examples/resume_train_by_steps.sh index 3c251cf3..d8309aa9 100644 --- a/examples/resume_train_by_steps.sh +++ b/examples/resume_train_by_steps.sh @@ -36,4 +36,4 @@ grants-tagger train bertmesh \ --save_steps 10000 \ --wandb_project wellcome-mesh \ --wandb_name test-train-all \ - --wandb_api_key ${WANDB_API_KEY} \ No newline at end of file + --wandb_api_key ${WANDB_API_KEY} diff --git a/grants_tagger_light/augmentation/prompt.template b/grants_tagger_light/augmentation/prompt.template index 45eb75d3..2e7f6c31 100644 --- a/grants_tagger_light/augmentation/prompt.template +++ b/grants_tagger_light/augmentation/prompt.template @@ -9,4 +9,4 @@ ABSTRACT: {ABSTRACT} TOPIC: -{TOPIC} \ No newline at end of file +{TOPIC} diff --git a/scripts/create_xlinear_bertmesh_comparison_csv.py b/scripts/create_xlinear_bertmesh_comparison_csv.py index b455ec39..b51b878e 100644 --- a/scripts/create_xlinear_bertmesh_comparison_csv.py +++ b/scripts/create_xlinear_bertmesh_comparison_csv.py @@ -130,7 +130,12 @@ def create_comparison_csv( active_grants = active_grants[~active_grants["Synopsis"].isna()] active_grants.sample(frac=1) active_grants_sample = active_grants.iloc[:active_portfolio_sample] - active_grants_sample = pd.DataFrame({"abstract": active_grants_sample["Synopsis"], "Reference": active_grants_sample["Reference"]}) + active_grants_sample = pd.DataFrame( + { + "abstract": active_grants_sample["Synopsis"], + 
"Reference": active_grants_sample["Reference"], + } + ) active_grants_sample["active_portfolio"] = 1 active_grants.drop_duplicates(subset="abstract", inplace=True) grants_sample = pd.concat([grants_sample, active_grants_sample]) From 902eda035371233a4868ef2f25552cd1b6460755 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Tue, 12 Sep 2023 11:31:42 +0100 Subject: [PATCH 057/102] Adds documentation, configurable tags, years, memory --- README.md | 144 +++++++++++++++- examples/retag.sh | 3 + grants_tagger_light/augmentation/augment.py | 17 +- .../preprocessing/preprocess_mesh.py | 19 ++- grants_tagger_light/retagging/retagging.py | 80 +++++---- .../retagging/retagging_spacy.py | 158 ------------------ ll.Artificial Intelligence.curation.json | 1 - tags_to_augment.txt | 1 - 8 files changed, 221 insertions(+), 202 deletions(-) create mode 100644 examples/retag.sh delete mode 100644 grants_tagger_light/retagging/retagging_spacy.py delete mode 100644 ll.Artificial Intelligence.curation.json delete mode 100644 tags_to_augment.txt diff --git a/README.md b/README.md index 7535dde7..bf7ca7fe 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,33 @@ For inference, CPU-support should suffice. You now have access to the `grants-tagger` command line interface! -## OPTIONAL: 3. Install MantisNLP `remote` to connect to a remote AWS instances +## 3. For `retagging` you will need to make sure you have `openjdk 8 (or 11)` installed to run Spark +First, make sure you don't have java installed or you have another version that it's not java 8 or 11. +```shell +java -version +``` + +If you don't or you have another version, install it (example for java 8): +```shell +sudo apt update +sudo apt install openjdk-8-jdk +``` + +Make sure you set by default the one we have just installed. 
Copy the path to the java folder from: +```shell +sudo update-alternatives --config java +``` + +And now, set your JAVA_HOME env var: +```shell +sudo vim /etc/environment +JAVA_HOME="[PATH_TO_THE_JAVA_FOLDER]" +``` + +Restart the shell or do `source /etc/environment` + + +## OPTIONAL: 4. Install MantisNLP `remote` to connect to a remote AWS instances `pip install git+https://github.com/ivyleavedtoadflax/remote.py.git` Then add your instance `remote config add [instance_name]` @@ -120,7 +146,7 @@ your own data under development. The command will train a model and save it to the specified path. Currently we support on BertMesh. -### bertmesh +### Training bertmesh ``` Usage: grants-tagger train bertmesh [OPTIONS] MODEL_KEY DATA_PATH @@ -154,7 +180,7 @@ to improve performance on big datasets. To enable it: #### Other arguments Besides those arguments, feel free to add any other TrainingArgument from Hugging Face or Wand DB. This is the example used to train reaching a ~0.6 F1, also available at `examples/train_by_epochs.sh` -```commandline +```shell grants-tagger train bertmesh \ "" \ [YOUR_PREPROCESSED_FOLDER] \ @@ -186,6 +212,118 @@ grants-tagger train bertmesh \ --wandb_api_key ${WANDB_API_KEY} ``` +## 📚 Augment +Data augmentation can be useful for low-represented classes. LLMs such as `openai GPT-3.5` can be used for that purpose. + +### Augmenting bertmesh +For bertmesh, we will augment the `allMeSH_2021.jsonl` file. We just need to select the path to that file (usually in `data/raw/allMeSH_2021.jsonl`) +and where to save the generated data (in jsonl). + +```shell +grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \ +``` + +### concurrent-calls param +By setting `concurrent-calls [number_of_calls]` you will use the multiclient openai library which will create +async calls to openai and work in parallel, improving the processing times. + +If `1`, vanilla `openai` library in sync mode will be used. + +### What tags do we augment?
By minimum examples +There are two ways to do it. First, `all tags` with less than `min-examples` examples. +In this case, There are two parameters which are important to know: +* `min-examples`: Example: 25. Is the min. number of examples you require from a tag. If less is found, the data augmentation will be triggered. +* `examples`: Example: 25. In case there are less than `min-examples`, how many examples we generate for that tag. + +```shell +grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \ + --min-examples 25 \ + --concurrent-calls 25 +``` + +### What tags do we augment? By tags file +Second way is to use a file with 1 line per tag. To do this, instead of `min-examples` use `tags-file-path` param. +```shell +grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \ + --tags-file-path tags_to_augment.txt \ + --examples 25 \ + --concurrent-calls 25 +``` + +### Other params +``` + Usage: grants-tagger augment mesh [OPTIONS] DATA_PATH SAVE_TO_PATH + +╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * data_path TEXT Path to mesh.jsonl [default: None] [required] │ +│ * save_to_path TEXT Path to save the new generated data in jsonl format +╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --model-key TEXT LLM to use data augmentation. 
By now, only `openai` is supported [default: gpt-3.5-turbo] │ +│ --num-proc INTEGER Number of processes to use for data augmentation [default: 8] │ +│ --batch-size INTEGER Preprocessing batch size (for dataset, filter, map, ...) [default: 64] │ +│ --min-examples INTEGER Minimum number of examples to require. Less than that will trigger data augmentation. [default: None] │ +│ --examples INTEGER Examples to generate per each tag. [default: 25] │ +│ --prompt-template TEXT File to use as a prompt. Make sure to ask the LLM to return a dict with two fields: `abstract` and `tags` │ +│ [default: grants_tagger_light/augmentation/prompt.template] │ +│ --concurrent-calls INTEGER RANGE [x>=1] Concurrent calls with 1 tag each to the different model [default: 16] │ +│ --temperature FLOAT RANGE [0<=x<=2] A value between 0 and 2. The bigger - the more creative. [default: 1.5] │ +│ --tags-file-path TEXT Text file containing one line per tag to be considered. The rest will be discarded. [default: None] │ +│ --help Show this message and exit. │ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + +## ✏ Retagging +Retagging is the process of correcting inconsistent tags in the data. + +## Retagging bertmesh +The data in `allMeSH_2021.jsonl` (`PubMed` labelled with `MeSH` tags) is highly inconsistent for many rows, leading to +bad performance of some ambiguous labels. + +Example: this is a row not being tagged as `Artificial Intelligence`, but talking about `Neural Networks`. 
+
+```python
+{"journal": "Nature communications", "meshMajor": ["Cell Cycle", "Image Processing, Computer-Assisted", "Microscopy", "Neural Networks, Computer", "Saccharomyces cerevisiae", "Software"], "year": "2020", "abstractText": "The identification of cell borders ('segmentation') in microscopy images constitutes a bottleneck for large-scale experiments. For the model organism Saccharomyces cerevisiae, current segmentation methods face challenges when cells bud, crowd, or exhibit irregular features. We present a convolutional neural network (CNN) named YeaZ, the underlying training set of high-quality segmented yeast images (>10 000 cells) including mutants, stressed cells, and time courses, as well as a graphical user interface and a web application ( www.quantsysbio.com/data-and-software ) to efficiently employ, test, and expand the system. A key feature is a cell-cell boundary test which avoids the need for fluorescent markers. Our CNN is highly accurate, including for buds, and outperforms existing methods on benchmark images, indicating it transfers well to other conditions. To demonstrate how efficient large-scale image processing uncovers new biology, we analyze the geometries of ?2200 wild-type and cyclin mutant cells and find that morphogenesis control occurs unexpectedly early and gradually.", "pmid": "33184262", "title": "A convolutional neural network segments yeast microscopy images with high accuracy."}
+```
+
+And this is another example. Same topic, but now it was tagged as `Artificial Intelligence`.
+```
+{"journal": "Nature communications", "meshMajor": ["Databases, Factual", "Deep Learning", "Diagnosis, Computer-Assisted", "False Positive Reactions", "Humans", "Image Processing, Computer-Assisted", "Neural Networks, Computer", "Stomach Neoplasms"], "year": "2020", "abstractText": "The early detection and accurate histopathological diagnosis of gastric cancer increase the chances of successful treatment. 
The worldwide shortage of pathologists offers a unique opportunity for the use of artificial intelligence assistance systems to alleviate the workload and increase diagnostic accuracy. Here, we report a clinically applicable system developed at the Chinese PLA General Hospital, China, using a deep convolutional neural network trained with 2,123 pixel-level annotated H&E-stained whole slide images. The model achieves a sensitivity near 100% and an average specificity of 80.6% on a real-world test dataset with 3,212 whole slide images digitalized by three scanners. We show that the system could aid pathologists in improving diagnostic accuracy and preventing misdiagnoses. Moreover, we demonstrate that our system performs robustly with 1,582 whole slide images from two other medical centres. Our study suggests the feasibility and benefits of using histopathological artificial intelligence assistance systems in routine practice scenarios.", "pmid": "32855423", "title": "Clinically applicable histopathological diagnosis system for gastric cancer detection using deep learning."}
+```
+
+For tags such as `Data Science`, `Artificial Intelligence`, `Data Collection`, `Deep Learning`, `Neural Networks, Computer`, `Machine Learning`, the situation is really dramatic.
+
+`Artificial Intelligence` with several thousand rows shows a performance of 0.1 F1, reflecting a lot of confusion with the other tags described above.
+
+We propose a solution: retagging the data. 
+ +``` +grants-tagger retag mesh data/raw/allMeSH_2021.jsonl ll --tags-file-path tags_to_augment.txt +``` + +### Other params +``` + Usage: grants-tagger retag mesh [OPTIONS] DATA_PATH SAVE_TO_PATH + +╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * data_path TEXT Path to mesh.jsonl [default: None] [required] │ +│ * save_to_path TEXT Path where to save the retagged data [default: None] [required] │ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --num-proc INTEGER Number of processes to use for data augmentation [default: 8] │ +│ --batch-size INTEGER Preprocessing batch size (for dataset, filter, map, ...) [default: 64] │ +│ --tags-file-path TEXT Text file containing one line per tag to be considered. The rest will be discarded. [default: None] │ +│ --threshold FLOAT Minimum threshold of confidence to retag a model. Default: 0.9 [default: 0.9] │ +│ --train-examples INTEGER Number of examples to use for training the retaggers [default: 100] │ +│ --supervised --no-supervised Use human curation, showing a `limit` amount of positive and negative examples to curate data for training the retaggers. The user will be required to accept │ +│ or reject. When the limit is reached, the model will be train. All intermediary steps will be saved. │ +│ [default: supervised] │ +│ --help Show this message and exit. 
│ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +``` + + + ## 📈 Evaluate Evaluate enables evaluation of the performance of various approaches including diff --git a/examples/retag.sh b/examples/retag.sh new file mode 100644 index 00000000..651f02b6 --- /dev/null +++ b/examples/retag.sh @@ -0,0 +1,3 @@ +grants-tagger retag mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FILE_HERE] \ + --tags "Artificial Intelligence,HIV" \ + --years 2017,2018,2019,2020,2021 \ No newline at end of file diff --git a/grants_tagger_light/augmentation/augment.py b/grants_tagger_light/augmentation/augment.py index 05a38cda..e97ff46d 100644 --- a/grants_tagger_light/augmentation/augment.py +++ b/grants_tagger_light/augmentation/augment.py @@ -156,26 +156,33 @@ def augment( @augment_app.command() def augment_cli( - data_path: str = typer.Argument(..., help="Path to mesh.jsonl"), + data_path: str = typer.Argument( + ..., + help="Path to mesh.jsonl"), save_to_path: str = typer.Argument( - ..., help="Path to save the serialized PyArrow dataset after preprocessing" + ..., + help="Path to save the new jsonl data" ), model_key: str = typer.Option( "gpt-3.5-turbo", help="LLM to use data augmentation. By now, only `openai` is supported", ), num_proc: int = typer.Option( - os.cpu_count(), help="Number of processes to use for data augmentation" + os.cpu_count(), + help="Number of processes to use for data augmentation" ), batch_size: int = typer.Option( - 64, help="Preprocessing batch size (for dataset, filter, map, ...)" + 64, + help="Preprocessing batch size (for dataset, filter, map, ...)" ), min_examples: int = typer.Option( None, help="Minimum number of examples to require. 
" "Less than that will trigger data augmentation.", ), - examples: int = typer.Option(25, help="Examples to generate per each tag."), + examples: int = typer.Option( + 25, + help="Examples to generate per each tag."), prompt_template: str = typer.Option( "grants_tagger_light/augmentation/prompt.template", help="File to use as a prompt. " diff --git a/grants_tagger_light/preprocessing/preprocess_mesh.py b/grants_tagger_light/preprocessing/preprocess_mesh.py index 10401c6c..586783cf 100644 --- a/grants_tagger_light/preprocessing/preprocess_mesh.py +++ b/grants_tagger_light/preprocessing/preprocess_mesh.py @@ -225,9 +225,13 @@ def preprocess_mesh( @preprocess_app.command() def preprocess_mesh_cli( - data_path: str = typer.Argument(..., help="Path to mesh.jsonl"), + data_path: str = typer.Argument( + ..., + help="Path to mesh.jsonl" + ), save_to_path: str = typer.Argument( - ..., help="Path to save the serialized PyArrow dataset after preprocessing" + ..., + help="Path to save the serialized PyArrow dataset after preprocessing" ), model_key: str = typer.Argument( ..., @@ -235,16 +239,21 @@ def preprocess_mesh_cli( "Leave blank if training from scratch", # noqa ), test_size: float = typer.Option( - None, help="Fraction of data to use for testing in (0,1] or number of rows" + None, + help="Fraction of data to use for testing in (0,1] or number of rows" ), num_proc: int = typer.Option( - os.cpu_count(), help="Number of processes to use for preprocessing" + os.cpu_count(), + help="Number of processes to use for preprocessing" ), max_samples: int = typer.Option( -1, help="Maximum number of samples to use for preprocessing", ), - batch_size: int = typer.Option(256, help="Size of the preprocessing batch"), + batch_size: int = typer.Option( + 256, + help="Size of the preprocessing batch" + ), tags: str = typer.Option( None, help="Comma-separated tags you want to include in the dataset " diff --git a/grants_tagger_light/retagging/retagging.py 
b/grants_tagger_light/retagging/retagging.py index f9849c70..b8eb434d 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -14,27 +14,9 @@ from sklearn.metrics import classification_report import pyarrow.parquet as pq -spark = nlp.start(spark_conf={ - 'spark.driver.memory': '12g', - 'spark.executor.memory': '6g', - # Fraction of heap space used for execution memory - 'spark.memory.fraction': '0.6', - # Fraction of heap space used for storage memory - 'spark.memory.storageFraction': '0.4', - # Enable off-heap storage (for large datasets) - 'spark.memory.offHeap.enabled': 'true', - # Off-heap memory size (adjust as needed) - 'spark.memory.offHeap.size': '6g', - 'spark.shuffle.manager': 'sort', - 'spark.shuffle.spill': 'true', - 'spark.master': f'local[{os.cpu_count()}]', - 'spark.default.parallelism': f'{os.cpu_count()*2}', - 'spark.speculation': 'false', - 'spark.task.maxFailures': '4', - 'spark.local.dir': f"{os.path.join(os.getcwd())}", - 'spark.eventLog.enabled': 'true', - 'spark.eventLog.dir': f"{os.path.join(os.getcwd())}" -}) +from grants_tagger_light.utils.years_tags_parser import parse_years, parse_tags + +import numpy as np retag_app = typer.Typer() @@ -54,7 +36,7 @@ def _load_data(dset: Dataset, tag, limit=100, split=0.8): return train_dset, test_dset -def _create_pipelines(save_to_path, batch_size, train_df, test_df, tag): +def _create_pipelines(save_to_path, batch_size, train_df, test_df, tag, spark): """ This method creates a Spark pipeline (to run on dataframes) Args: @@ -62,6 +44,7 @@ def _create_pipelines(save_to_path, batch_size, train_df, test_df, tag): batch_size: max size of the batch to train. Since data is small for training, I limit it to 8. 
train_df: Spark Dataframe of the train data test_df: Spark Dataframe of the test data + spark: the Spark Object Returns: a tuple of (pipeline, lightpipeline) @@ -179,21 +162,39 @@ def _curate(save_to_path, pos_dset, neg_dset, tag, limit): def retag( data_path: str, save_to_path: str, + spark_memory: int = 27, num_proc: int = os.cpu_count(), batch_size: int = 64, + tags: list = None, tags_file_path: str = None, threshold: float = 0.8, train_examples: int = 100, supervised: bool = True, + years: list = None, ): + + spark = nlp.start(spark_conf={ + 'spark.driver.memory': f'{spark_memory}g', + 'spark.executor.memory': f'{spark_memory}g', + }) + # We only have 1 file, so no sharding is available https://huggingface.co/docs/datasets/loading#multiprocessing logging.info("Loading the MeSH jsonl...") dset = load_dataset("json", data_files=data_path, num_proc=1) if "train" in dset: dset = dset["train"] - with open(tags_file_path, 'r') as f: - tags = [x.strip() for x in f.readlines()] + if years is not None: + logger.info(f"Removing all years which are not in {years}") + dset = dset.filter( + lambda x: any(np.isin(years, [str(x["year"])])), num_proc=num_proc + ) + + if tags_file_path is not None and os.path.isfile(tags_file_path): + with open(tags_file_path, 'r') as f: + tags = [x.strip() for x in f.readlines()] + + logging.info(f"Total tags detected: {tags}") for tag in tags: logging.info(f"Retagging: {tag}") @@ -245,7 +246,7 @@ def retag( logging.info(f"- Test dataset size: {test_df.count()}") logging.info(f"- Creating `sparknlp` pipelines...") - pipeline = _create_pipelines(save_to_path, batch_size, train_df, test_df, tag) + pipeline = _create_pipelines(save_to_path, batch_size, train_df, test_df, tag, spark) logging.info(f"- Optimizing dataframe...") data_in_parquet = f"{save_to_path}.data.parquet" @@ -290,6 +291,10 @@ def retag_cli( batch_size: int = typer.Option( 64, help="Preprocessing batch size (for dataset, filter, map, ...)" ), + tags: str = typer.Option( + None, + 
help="Comma separated list of tags to retag" + ), tags_file_path: str = typer.Option( None, help="Text file containing one line per tag to be considered. " @@ -308,7 +313,14 @@ def retag_cli( help="Use human curation, showing a `limit` amount of positive and negative examples to curate data" " for training the retaggers. The user will be required to accept or reject. When the limit is reached," " the model will be train. All intermediary steps will be saved." - ) + ), + spark_memory: int = typer.Option( + 20, + help="Gigabytes of memory to be used. Recommended at least 20 to run on MeSH." + ), + years: str = typer.Option( + None, help="Comma-separated years you want to include in the retagging process" + ), ): if not data_path.endswith("jsonl"): logger.error( @@ -317,19 +329,29 @@ def retag_cli( ) exit(-1) - if tags_file_path is None: + if tags_file_path is None and tags is None: + logger.error( + "To understand which tags need to be augmented, use --tags [tags separated by comma] or create a file with" + "a newline per tag and set the path in --tags-file-path" + ) + exit(-1) + + if tags_file_path is not None and not os.path.isfile(tags_file_path): logger.error( - "To understand which tags need to be augmented set the path to the tags file in --tags-file-path" + f"{tags_file_path} not found" ) exit(-1) retag( data_path, save_to_path, + spark_memory=spark_memory, num_proc=num_proc, batch_size=batch_size, + tags=parse_tags(tags), tags_file_path=tags_file_path, threshold=threshold, train_examples=train_examples, - supervised=supervised + supervised=supervised, + years=parse_years(years), ) diff --git a/grants_tagger_light/retagging/retagging_spacy.py b/grants_tagger_light/retagging/retagging_spacy.py deleted file mode 100644 index 0ce0b26d..00000000 --- a/grants_tagger_light/retagging/retagging_spacy.py +++ /dev/null @@ -1,158 +0,0 @@ -"""import logging -import os -import random - -import typer -from loguru import logger - -from datasets import load_dataset - -import 
spacy -from spacy.tokens import DocBin -from spacy.cli.train import train as spacy_train - -retag_app = typer.Typer() - - -def _load_data(dset: list[str], limit=100, split=0.8): - # Partition off part of the train data for evaluation - random.Random(42).shuffle(dset) - train_size = int(split * limit) - train_dset = dset[:train_size] - test_dset = dset[train_size:limit] - return train_dset, test_dset - - -def retag( - data_path: str, - save_to_path: str, - model_key: str = "gpt-3.5-turbo", - num_proc: int = os.cpu_count(), - batch_size: int = 64, - concurrent_calls: int = os.cpu_count() * 2, - tags_file_path: str = None, -): - if model_key.strip().lower() not in ["gpt-3.5-turbo", "text-davinci", "gpt-4"]: - raise NotImplementedError( - f"{model_key} not implemented as an augmentation framework" - ) - - # We only have 1 file, so no sharding is available https://huggingface.co/docs/datasets/loading#multiprocessing - logging.info("Loading the MeSH jsonl...") - dset = load_dataset("json", data_files=data_path, num_proc=1) - if "train" in dset: - dset = dset["train"] - - with open(tags_file_path, 'r') as f: - tags = [x.strip() for x in f.readlines()] - - for tag in tags: - logging.info(f"Retagging: {tag}") - - nlp = spacy.load("en_core_web_lg") - - logging.info(f"Obtaining positive examples for {tag}...") - positive_dset = dset.filter( - lambda x: tag in x["meshMajor"], num_proc=num_proc - ) - pos_x_train, pos_x_test = _load_data(positive_dset['abstractText'], limit=100, split=0.8) - - logging.info(f"Obtaining negative examples for {tag}...") - negative_dset = dset.filter( - lambda x: tag not in x["meshMajor"], num_proc=num_proc - ) - neg_x_train, neg_x_test = _load_data(negative_dset['abstractText'], limit=100, split=0.8) - - logging.info(f"Processing corpus...") - train_data = DocBin() - for doc in nlp.pipe(pos_x_train): - doc.cats[tag] = 1 - doc.cats['O'] = 0 - train_data.add(doc) - for doc in nlp.pipe(neg_x_train): - doc.cats[tag] = 0 - doc.cats['O'] = 1 - 
train_data.add(doc) - train_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "train.spacy") - train_data.to_disk(train_data_path) - - test_data = DocBin() - for doc in nlp.pipe(pos_x_test): - doc.cats[tag] = 1 - doc.cats['O'] = 0 - test_data.add(doc) - for doc in nlp.pipe(pos_x_test): - doc.cats[tag] = 0 - doc.cats['O'] = 1 - test_data.add(doc) - test_data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "test.spacy") - test_data.to_disk(test_data_path) - - logging.info(f"Train data size: {len(train_data)}") - logging.info(f"Test data size: {len(test_data)}") - - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.cfg") - output_model_path = "spacy_textcat" - spacy_train( - config_path, - output_path=output_model_path, - overrides={ - "paths.train": train_data_path, - "paths.dev": test_data_path, - }, - ) - break - - -@retag_app.command() -def retag_cli( - data_path: str = typer.Argument(..., help="Path to mesh.jsonl"), - save_to_path: str = typer.Argument( - ..., help="Path where to save the retagged data" - ), - model_key: str = typer.Option( - "gpt-3.5-turbo", - help="LLM to use data augmentation. By now, only `openai` is supported", - ), - num_proc: int = typer.Option( - os.cpu_count(), help="Number of processes to use for data augmentation" - ), - batch_size: int = typer.Option( - 64, help="Preprocessing batch size (for dataset, filter, map, ...)" - ), - concurrent_calls: int = typer.Option( - os.cpu_count() * 2, - min=1, - help="Concurrent calls with 1 tag each to the different model", - ), - tags_file_path: str = typer.Option( - None, - help="Text file containing one line per tag to be considered. " - "The rest will be discarded.", - ), -): - if not data_path.endswith("jsonl"): - logger.error( - "It seems your input MeSH data is not in `jsonl` format. 
" - "Please, run first `scripts/mesh_json_to_jsonl.py.`" - ) - exit(-1) - - if tags_file_path is None: - logger.error( - "To understand which tags need to be augmented set the path to the tags file in --tags-file-path" - ) - exit(-1) - - spacy.cli.download("en_core_web_lg") - - retag( - data_path, - save_to_path, - model_key=model_key, - num_proc=num_proc, - batch_size=batch_size, - concurrent_calls=concurrent_calls, - tags_file_path=tags_file_path, - ) -""" \ No newline at end of file diff --git a/ll.Artificial Intelligence.curation.json b/ll.Artificial Intelligence.curation.json deleted file mode 100644 index 6b86c3bd..00000000 --- a/ll.Artificial Intelligence.curation.json +++ /dev/null @@ -1 +0,0 @@ -{"Artificial Intelligence": {"positive": [{"journal": "Zhonghua wei zhong bing ji jiu yi xue", "meshMajor": ["Artificial Intelligence", "Big Data", "Critical Care", "Critical Illness", "Humans"], "year": "2020", "abstractText": "Through the big data intelligent algorithm and application of artificial intelligence in critically ill patients, the value of the combination of clinical real-time warning and artificial intelligence in critical care medicine was explored. Artificial intelligence was used to simulate human thinking by studying, calculating, and analyzing a large amount of critical illness data in the medical work, and integrate a large number of clinical monitoring and treatment data generated in critical care medicine. The necessity, feasibility, relevance, data learning and application architecture of the application of artificial intelligence in the early warning of critical illness in medical work were analyzed, thus to promote the pioneering application of real-time warning of critical illness in clinical medicine. The development of critical care medicine in medical work requires the integration of big data and artificial intelligence. 
Through real-time early warning, accurate and scientific intelligent application of medical data, the life threatening uncertainties in the diagnosis and treatment of critically ill patients can be more effectively reduced and the success rate of the treatment of critically ill patients can be improved. The perfect combination of artificial intelligence technology and big data of critical care medicine can provide a favorable guarantee for the pioneering application of real-time warning of critical care medicine in clinical work.", "pmid": "33198854", "title": "[Artificial intelligence provides promotion of big data in medical work and contribution to people's health as soon as possible: real-time warning of critical illness is the pioneer of artificial intelligence in clinical medicine]."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Algorithms", "Artificial Intelligence", "Brain", "Brain Mapping", "Cognition", "Electroencephalography", "Humans", "Male", "Man-Machine Systems", "Mathematical Concepts", "Problem Solving", "Reaction Time", "Signal Processing, Computer-Assisted", "Time Factors"], "year": "2013", "abstractText": "The association of functional connectivity patterns with particular cognitive tasks has long been a topic of interest in neuroscience, e.g., studies of functional connectivity have demonstrated its potential use for decoding various brain states. However, the high-dimensionality of the pairwise functional connectivity limits its usefulness in some real-time applications. 
In the present study, the methodology of tensor subspace analysis (TSA) is used to reduce the initial high-dimensionality of the pairwise coupling in the original functional connectivity network to a space of condensed descriptive power, which would significantly decrease the computational cost and facilitate the differentiation of brain states. We assess the feasibility of the proposed method on EEG recordings when the subject was performing mental arithmetic task which differ only in the difficulty level (easy: 1-digit addition v.s. 3-digit additions). Two different cortical connective networks were detected, and by comparing the functional connectivity networks in different work states, it was found that the task-difficulty is best reflected in the connectivity structure of sub-graphs extending over parietooccipital sites. Incorporating this data-driven information within original TSA methodology, we succeeded in predicting the difficulty level from connectivity patterns in an efficient way that can be implemented so as to work in real-time. ", "pmid": "24110343", "title": "A tensorial approach to access cognitive workload related to mental arithmetic from EEG functional connectivity estimates."}, {"journal": "Radiology", "meshMajor": ["Adult", "Artificial Intelligence", "Deep Learning", "Humans", "Neural Networks, Computer", "ROC Curve", "Radiography, Thoracic", "Retrospective Studies", "Sensitivity and Specificity", "Triage"], "year": "2019", "abstractText": "Purpose To develop and test an artificial intelligence (AI) system, based on deep convolutional neural networks (CNNs), for automated real-time triaging of adult chest radiographs on the basis of the urgency of imaging appearances. Materials and Methods An AI system was developed by using 470 388 fully anonymized institutional adult chest radiographs acquired from 2007 to 2017. 
The free-text radiology reports were preprocessed by using an in-house natural language processing (NLP) system modeling radiologic language. The NLP system analyzed the free-text report to prioritize each radiograph as critical, urgent, nonurgent, or normal. An AI system for computer vision using an ensemble of two deep CNNs was then trained by using labeled radiographs to predict the clinical priority from radiologic appearances only. The system's performance in radiograph prioritization was tested in a simulation by using an independent set of 15 887 radiographs. Prediction performance was assessed with the area under the receiver operating characteristic curve; sensitivity, specificity, positive predictive value (PPV), and negative predictive value (NPV) were also determined. Nonparametric testing of the improvement in time to final report was determined at a nominal significance level of 5%. Results Normal chest radiographs were detected by our AI system with a sensitivity of 71%, specificity of 95%, PPV of 73%, and NPV of 94%. The average reporting delay was reduced from 11.2 to 2.7 days for critical imaging findings (P < .001) and from 7.6 to 4.1 days for urgent imaging findings (P < .001) in the simulation compared with historical data. Conclusion Automated real-time triaging of adult chest radiographs with use of an artificial intelligence system is feasible, with clinically acceptable performance. \u00a9 RSNA, 2019 Online supplemental material is available for this article. 
See also the editorial by Auffermann in this issue.", "pmid": "30667333", "title": "Automated Triaging of Adult Chest Radiographs with Deep Artificial Neural Networks."}, {"journal": "Amino acids", "meshMajor": ["Amino Acids", "Angiotensin-Converting Enzyme Inhibitors", "Animals", "Anti-Infective Agents", "Antimicrobial Cationic Peptides", "Artificial Intelligence", "Databases, Protein", "Dipeptides", "Humans", "Kinetics", "Models, Chemical", "Pancreatic Elastase", "Peptides", "Protein Conformation", "Quantitative Structure-Activity Relationship", "Software", "Staphylococcus aureus", "Statistics as Topic", "Swine"], "year": "2010", "abstractText": "In this study, structural topology scale (ST-scale) was recruited as a novel structural topological descriptor derived from principal component analysis on 827 structural variables of 167 amino acids. By using partial least squares (PLS), we applied ST-scale for the study of quantitative sequence-activity models (QSAMs) on three peptide datasets (58 angiotensin-converting enzyme (ACE) inhibitors, 34 antimicrobial peptides (AMPs) and 89 elastase substrates (ES)). The results of QSAMs were superior to that of the earlier studies, with determination coefficient (r(2)) and cross-validated (q(2)) equal to 0.855, 0.774; 0.79, 0.371 (OSC-PLS: 0.995, 0.848) and 0.846, 0.747, respectively. 
Therefore, ST-scale descriptors were considered to be competent to extract information from 827 structural variables and relate with their bioactivities.", "pmid": "19373543", "title": "ST-scale as a novel amino acid descriptor and its application in QSAM of peptides and analogues."}, {"journal": "Sensors (Basel, Switzerland)", "meshMajor": ["Architecture", "Artificial Intelligence", "Computer Simulation", "Geographic Information Systems", "Humans", "Image Interpretation, Computer-Assisted", "Maps as Topic", "Models, Biological", "Models, Theoretical", "Monte Carlo Method", "Pattern Recognition, Automated", "Robotics"], "year": "2010", "abstractText": "In this paper we deal with the problem of map building and localization of a mobile robot in an environment using the information provided by an omnidirectional vision sensor that is mounted on the robot. Our main objective consists of studying the feasibility of the techniques based in the global appearance of a set of omnidirectional images captured by this vision sensor to solve this problem. First, we study how to describe globally the visual information so that it represents correctly locations and the geometrical relationships between these locations. Then, we integrate this information using an approach based on a spring-mass-damper model, to create a topological map of the environment. Once the map is built, we propose the use of a Monte Carlo localization approach to estimate the most probable pose of the vision system and its trajectory within the map. We perform a comparison in terms of computational cost and error in localization. 
The experimental results we present have been obtained with real indoor omnidirectional images.", "pmid": "22163538", "title": "Map building and monte carlo localization using global appearance of omnidirectional images."}, {"journal": "The pharmacogenomics journal", "meshMajor": ["Algorithms", "Area Under Curve", "Artificial Intelligence", "Brain Neoplasms", "Color", "Databases, Genetic", "Endpoint Determination", "Gene Expression Profiling", "Humans", "Least-Squares Analysis", "Neuroblastoma", "Oligonucleotide Array Sequence Analysis", "Predictive Value of Tests", "Quality Control", "RNA, Neoplasm", "ROC Curve"], "year": "2010", "abstractText": "Microarray-based prediction of clinical endpoints may be performed using either a one-color approach reflecting mRNA abundance in absolute intensity values or a two-color approach yielding ratios of fluorescent intensities. In this study, as part of the MAQC-II project, we systematically compared the classification performance resulting from one- and two-color gene-expression profiles of 478 neuroblastoma samples. In total, 196 classification models were applied to these measurements to predict four clinical endpoints, and classification performances were compared in terms of accuracy, area under the curve, Matthews correlation coefficient and root mean-squared error. Whereas prediction performance varied with distinct clinical endpoints and classification models, equivalent performance metrics were observed for one- and two-color measurements in both internal and external validation. Furthermore, overlap of selected signature genes correlated inversely with endpoint prediction difficulty. 
In summary, our data strongly substantiate that the choice of platform is not a primary factor for successful gene expression based-prediction of clinical endpoints.", "pmid": "20676065", "title": "Comparison of performance of one-color and two-color gene-expression analyses in predicting clinical endpoints of neuroblastoma patients."}, {"journal": "Artificial intelligence in medicine", "meshMajor": ["Adult", "Affect", "Algorithms", "Artificial Intelligence", "Autonomic Nervous System", "Biosensing Techniques", "Bipolar Disorder", "Clothing", "Decision Support Techniques", "Diagnosis, Computer-Assisted", "Electrocardiography, Ambulatory", "Equipment Design", "Female", "Heart Rate", "Humans", "Male", "Middle Aged", "Models, Statistical", "Monitoring, Ambulatory", "Predictive Value of Tests", "Respiratory Rate", "Severity of Illness Index", "Signal Processing, Computer-Assisted", "Time Factors", "Transducers"], "year": "2013", "abstractText": "BACKGROUND: Bipolar disorders are characterized by a series of both depressive and manic or hypomanic episodes. Although common and expensive to treat, the clinical assessment of bipolar disorder is still ill-defined.OBJECTIVE: In the current literature several correlations between mood disorders and dysfunctions involving the autonomic nervous system (ANS) can be found. The objective of this work is to develop a novel mood recognition system based on a pervasive, wearable and personalized monitoring system using ANS-related biosignals.MATERIALS AND METHODS: The monitoring platform used in this study is the core sensing system of the personalized monitoring systems for care in mental health (PSYCHE) European project. It is comprised of a comfortable sensorized t-shirt that can acquire the inter-beat interval time series, the heart rate, and the respiratory dynamics for long-term monitoring during the day and overnight. 
In this study, three bipolar patients were followed for a period of 90 days during which up to six monitoring sessions and psychophysical evaluations were performed for each patient. Specific signal processing techniques and artificial intelligence algorithms were applied to analyze more than 120 h of data.RESULTS: Experimental results are expressed in terms of confusion matrices and an exhaustive descriptive statistics of the most relevant features is reported as well. A classification accuracy of about 97% is achieved for the intra-subject analysis. Such an accuracy was found in distinguishing relatively good affective balance state (euthymia) from severe clinical states (severe depression and mixed state) and is lower in distinguishing euthymia from the milder states (accuracy up to 88%).CONCLUSIONS: The PSYCHE platform could provide a viable decision support system in order to improve mood assessment in patient care. Evidences about the correlation between mood disorders and ANS dysfunctions were found and the obtained results are promising for an effective biosignal-based mood recognition.", "pmid": "23332576", "title": "Mood recognition in bipolar patients through the PSYCHE platform: preliminary evaluations and perspectives."}, {"journal": "PloS one", "meshMajor": ["Adult", "Aged", "Aged, 80 and over", "Algorithms", "Artificial Intelligence", "Bayes Theorem", "Biomarkers, Tumor", "Case-Control Studies", "Female", "Follow-Up Studies", "Genetic Predisposition to Disease", "Humans", "Inflammation", "Inflammation Mediators", "Male", "Middle Aged", "Polymorphism, Single Nucleotide", "Prognosis", "Risk Factors", "Smoking", "Texas", "Urinary Bladder Neoplasms", "Young Adult"], "year": "2013", "abstractText": "The relationship between inflammation and cancer is well established in several tumor types, including bladder cancer. 
We performed an association study between 886 inflammatory-gene variants and bladder cancer risk in 1,047 cases and 988 controls from the Spanish Bladder Cancer (SBC)/EPICURO Study. A preliminary exploration with the widely used univariate logistic regression approach did not identify any significant SNP after correcting for multiple testing. We further applied two more comprehensive methods to capture the complexity of bladder cancer genetic susceptibility: Bayesian Threshold LASSO (BTL), a regularized regression method, and AUC-Random Forest, a machine-learning algorithm. Both approaches explore the joint effect of markers. BTL analysis identified a signature of 37 SNPs in 34 genes showing an association with bladder cancer. AUC-RF detected an optimal predictive subset of 56 SNPs. 13 SNPs were identified by both methods in the total population. Using resources from the Texas Bladder Cancer study we were able to replicate 30% of the SNPs assessed. The associations between inflammatory SNPs and bladder cancer were reexamined among non-smokers to eliminate the effect of tobacco, one of the strongest and most prevalent environmental risk factor for this tumor. A 9 SNP-signature was detected by BTL. Here we report, for the first time, a set of SNP in inflammatory genes jointly associated with bladder cancer risk. These results highlight the importance of the complex structure of genetic susceptibility associated with cancer risk. 
", "pmid": "24391818", "title": "Application of multi-SNP approaches Bayesian LASSO and AUC-RF to detect main effects of inflammatory-gene variants associated with bladder cancer risk."}, {"journal": "Journal of the American Medical Informatics Association : JAMIA", "meshMajor": ["Adult", "Aged", "Aged, 80 and over", "Artificial Intelligence", "Cardiology", "Computer Simulation", "Diagnosis, Computer-Assisted", "Diagnosis, Differential", "Heart Diseases", "Hospitals, Teaching", "Humans", "Middle Aged", "Models, Cardiovascular", "Predictive Value of Tests", "Prospective Studies", "ROC Curve", "Sensitivity and Specificity"], "year": null, "abstractText": "CONTEXT: The Heart Disease Program (HDP) is a novel computerized diagnosis program incorporating a computer model of cardiovascular physiology. Physicians can enter standard clinical data and receive a differential diagnosis with explanations.OBJECTIVE: To evaluate the diagnostic performance of the HDP and its usability by physicians in a typical clinical setting.DESIGN: A prospective observational study of the HDP in use by physicians in departments of medicine and cardiology of a teaching hospital. Data came from 114 patients with a broad range of cardiac disorders, entered by six physicians.MEASUREMENTS: Sensitivity, specificity, and positive predictive value (PPV). Comprehensiveness: the proportion of final diagnoses suggested by the HDP or physicians for each case.RELEVANCE: the proportion of HDP or physicians' diagnoses that are correct. Area under the receiver operating characteristic (ROC) curve (AUC) for the HDP and the physicians. Performance was compared with a final diagnosis based on follow-up and further investigations.RESULTS: Compared with the final diagnoses, the HDP had a higher sensitivity (53.0% vs. 34.8%) and significantly higher comprehensiveness (57.2% vs. 39.5%, p < 0.0001) than the physicians. Physicians' PPV and relevance (56.2%, 56.0%) were higher than the HDP (25.4%, 28.1%). 
Combining the diagnoses of the physicians and the HDPs, sensitivity was 61.3% and comprehensiveness was 65.7%. These findings were significant in the two collection cohorts and for subanalysis of the most serious diagnoses. The AUCs were similar for the HDP and the physicians.CONCLUSIONS: The heart disease program has the potential to improve the differential diagnoses of physicians in a typical clinical setting.", "pmid": "12668689", "title": "Evaluation of a cardiac diagnostic program in a typical clinical setting."}, {"journal": "Computational intelligence and neuroscience", "meshMajor": ["Algorithms", "Artificial Intelligence", "Coronary Artery Disease", "Databases, Factual", "Diagnosis, Computer-Assisted", "Fuzzy Logic", "Humans", "Sensitivity and Specificity", "Statistics, Nonparametric", "Time Factors"], "year": "2014", "abstractText": "In the past decades, medical data mining has become a popular data mining subject. Researchers have proposed several tools and various methodologies for developing effective medical expert systems. Diagnosing heart diseases is one of the important topics and many researchers have tried to develop intelligent medical expert systems to help the physicians. In this paper, we propose the use of PSO algorithm with a boosting approach to extract rules for recognizing the presence or absence of coronary artery disease in a patient. The weight of training examples that are classified properly by the new rules is reduced by a boosting mechanism. Therefore, in the next rule generation cycle, the focus is on those fuzzy rules that account for the currently misclassified or uncovered instances. We have used coronary artery disease data sets taken from University of California Irvine, (UCI), to evaluate our new classification approach. Results show that the proposed method can detect the coronary artery disease with an acceptable accuracy. Also, the discovered rules have significant interpretability as well. 
", "pmid": "24817883", "title": "Coronary artery disease detection using a fuzzy-boosting PSO approach."}, {"journal": "NeuroImage", "meshMajor": ["Adult", "Algorithms", "Artificial Intelligence", "Brain", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Magnetic Resonance Imaging", "Organ Size", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2007", "abstractText": "An automated algorithm has been developed to segment stripped (non-brain tissue excluded) T1-weighted MRI brain volumes into left and right cerebral hemispheres and cerebellum+brainstem. The algorithm, which uses the Graph Cuts technique, performs a fully automated segmentation in approximately 30 s following pre-processing. It is robust and accurate and has been tested on datasets from two scanners using different field strengths and pulse sequences. We describe the Graph Cuts algorithm and compare the results of Graph Cuts segmentations against \"gold standard\" manual segmentations and segmentations produced by three popular software packages used by neuroimagers: BrainVisa, CLASP, and SurfRelax.", "pmid": "17150376", "title": "Automatic segmentation of left and right cerebral hemispheres from MRI brain volumes using the graph cuts algorithm."}, {"journal": "Molecular pharmaceutics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Deep Learning", "Drug Discovery", "Machine Learning", "Neural Networks, Computer"], "year": "2018", "abstractText": "Artificial Intelligence has advanced at an unprecedented pace, backing recent breakthroughs in natural language processing, speech recognition, and computer vision: domains where the data is euclidean in nature. More recently, considerable progress has been made in engineering deep-learning architectures that can accept non-Euclidean data such as graphs and manifolds: geometric deep learning. 
This progress is of considerable interest to the drug discovery community, as molecules can naturally be represented as graphs, where atoms are nodes and bonds are edges. In this work, we explore the performance of geometric deep-learning methods in the context of drug discovery, comparing machine learned features against the domain expert engineered features that are mainstream in the pharmaceutical industry.", "pmid": "29863875", "title": "Geometric Deep Learning Autonomously Learns Chemical Features That Outperform Those Engineered by Domain Experts."}, {"journal": "The oncologist", "meshMajor": ["Adult", "Aged", "Aged, 80 and over", "Antineoplastic Combined Chemotherapy Protocols", "Artificial Intelligence", "China", "Clinical Decision-Making", "Decision Support Systems, Clinical", "Disease-Free Survival", "Evidence-Based Medicine", "Female", "Humans", "Male", "Medical Oncology", "Middle Aged", "Neoplasms", "Patient Selection", "Practice Guidelines as Topic", "Retrospective Studies"], "year": "2019", "abstractText": "BACKGROUND: IBM Watson for Oncology (WFO), which can use natural language processing to evaluate data in structured and unstructured formats, has begun to be used in China. It provides physicians with evidence-based treatment options and ranks them in three categories for treatment decision support. This study was designed to examine the concordance between the treatment recommendation proposed by WFO and actual clinical decisions by oncologists in our cancer center, which would reflect the differences of cancer treatment between China and the U.S.PATIENTS AND METHODS: Retrospective data from 362 patients with cancer were ingested into WFO from April 2017 to October 2017. WFO recommendations were provided in three categories: recommended, for consideration, and not recommended. Concordance was analyzed by comparing the treatment decisions proposed by WFO with those of the multidisciplinary tumor board. 
Concordance was achieved when the oncologists' treatment decisions were in the recommended or for consideration categories in WFO.RESULTS: Ovarian cancer showed the highest concordance, which was 96%. Lung cancer and breast cancer obtained a concordance of slightly above 80%. The concordance of rectal cancer was 74%, whereas colon cancer and cervical cancer showed the same concordance of 64%. In particular, the concordance of gastric cancer was very low, only 12%, and 88% of cases were under physicians choice.CONCLUSION: Different cancer types showed different concordances, and only gastric cancers were significantly less likely to be concordant. Incidence and pharmaceuticals may be the major cause of discordance. To be comprehensively and rapidly applied in China, WFO needs to accelerate localization. ClinicalTrials.gov Identifier: NCT03400514.IMPLICATIONS FOR PRACTICE: IBM Watson for Oncology (WFO) has begun to be used in China. In this study, concordance was examined between the treatment recommendation proposed by WFO and clinical decisions for 362 patients in our cancer center, which could reflect the differences of cancer treatment between China and the U.S. Different cancer types showed different concordances, and only gastric cancers were significantly less likely to be concordant. Incidence and pharmaceuticals may be the major causes of discordance. To be comprehensively and rapidly applied in China, WFO needs to accelerate localization. 
This study may have a significant effect on application of artificial intelligence systems in China.", "pmid": "30181315", "title": "Concordance Study Between IBM Watson for Oncology and Clinical Practice for Patients with Cancer in China."}, {"journal": "Computational and mathematical methods in medicine", "meshMajor": ["Algorithms", "Artificial Intelligence", "Brain", "Cluster Analysis", "Diagnostic Imaging", "Fuzzy Logic", "Humans", "Image Interpretation, Computer-Assisted", "Image Processing, Computer-Assisted", "Medical Informatics", "Models, Statistical", "Normal Distribution", "Pattern Recognition, Automated", "Reproducibility of Results", "Software", "Tomography, X-Ray Computed"], "year": "2014", "abstractText": "Researchers recently apply an integrative approach to automate medical image segmentation for benefiting available methods and eliminating their disadvantages. Intensity inhomogeneity is a challenging and open problem in this area, which has received less attention by this approach. It has considerable effects on segmentation accuracy. This paper proposes a new kernel-based fuzzy level set algorithm by an integrative approach to deal with this problem. It can directly evolve from the initial level set obtained by Gaussian Kernel-Based Fuzzy C-Means (GKFCM). The controlling parameters of level set evolution are also estimated from the results of GKFCM. Moreover the proposed algorithm is enhanced with locally regularized evolution based on an image model that describes the composition of real-world images, in which intensity inhomogeneity is assumed as a component of an image. Such improvements make level set manipulation easier and lead to more robust segmentation in intensity inhomogeneity. The proposed algorithm has valuable benefits including automation, invariant of intensity inhomogeneity, and high accuracy. Performance evaluation of the proposed algorithm was carried on medical images from different modalities. 
The results confirm its effectiveness for medical image segmentation. ", "pmid": "24624225", "title": "A new kernel-based fuzzy level set method for automated segmentation of medical images in the presence of intensity inhomogeneity."}, {"journal": "Biochemical and biophysical research communications", "meshMajor": ["Artificial Intelligence", "Cell Cycle Proteins", "Cell Division", "Centrosome", "Kinetochores", "Sequence Analysis, Protein"], "year": "2010", "abstractText": "In the process of cell division, a great deal of proteins is assembled into three distinct organelles, namely midbody, centrosome and kinetochore. Knowing the localization of microkit (midbody, centrosome and kinetochore) proteins will facilitate drug target discovery and provide novel insights into understanding their functions. In this study, a support vector machine (SVM) model, MicekiPred, was presented to predict the localization of microkit proteins based on gene ontology (GO) information. A total accuracy of 77.51% was achieved using the jackknife cross-validation. This result shows that the model will be an effective complementary tool for future experimental study. The prediction model and dataset used in this article can be freely downloaded from http://cobi.uestc.edu.cn/people/hlin/tools/MicekiPred/.", "pmid": "20854791", "title": "Prediction of midbody, centrosome and kinetochore proteins based on gene ontology information."}, {"journal": "Physics in medicine and biology", "meshMajor": ["Alzheimer Disease", "Artificial Intelligence", "Diagnosis, Computer-Assisted", "Humans", "Image Interpretation, Computer-Assisted", "Tomography, Emission-Computed, Single-Photon"], "year": "2010", "abstractText": "This paper presents a computer-aided diagnosis technique for improving the accuracy of early diagnosis of Alzheimer-type dementia. 
The proposed methodology is based on the selection of voxels which present Welch's t-test between both classes, normal and Alzheimer images, greater than a given threshold. The mean and standard deviation of intensity values are calculated for selected voxels. They are chosen as feature vectors for two different classifiers: support vector machines with linear kernel and classification trees. The proposed methodology reaches greater than 95% accuracy in the classification task.", "pmid": "20413829", "title": "Computer-aided diagnosis of Alzheimer's disease using support vector machines and classification trees."}, {"journal": "Journal of computer-aided molecular design", "meshMajor": ["Artificial Intelligence", "Humans", "Learning", "Least-Squares Analysis", "Neural Networks, Computer", "Quantitative Structure-Activity Relationship"], "year": null, "abstractText": "Current practice in Quantitative Structure Activity Relationship (QSAR) methods usually involves generating a great number of chemical descriptors and then cutting them back with variable selection techniques. Variable selection is an effective method to reduce the dimensionality but may discard some valuable information. This paper introduces Locally Linear Embedding (LLE), a local non-linear dimensionality reduction technique, that can statistically discover a low-dimensional representation of the chemical data. 
LLE is shown to create more stable representations than other non-linear dimensionality reduction algorithms, and to be capable of capturing non-linearity in chemical data.", "pmid": "15729847", "title": "Locally linear embedding for dimensionality reduction in QSAR."}, {"journal": "Proceedings of the National Academy of Sciences of the United States of America", "meshMajor": ["Artificial Intelligence", "Cognition", "Conscience", "Consciousness", "Ethics", "Humans", "Intelligence", "Memory", "Thinking", "Unconscious, Psychology"], "year": "1992", "abstractText": "A complex system (CS) is defined as a set of elements, with connections between them, singled out of the environment, capable of getting information from the environment, capable of making decisions (i.e., of choosing between alternatives), and having purposefulness (i.e., an urge towards preferable states or other goals). Thinking is a process that takes place (or which can take place) in some of the CS and consists of (i) receiving information from the environment (and from itself), (ii) memorizing the information, (iii) the subconscious, and (iv) consciousness. Life is a process that takes place in some CS and consists of functions i and ii, as well as (v) reproduction with passing of hereditary information to progeny, and (vi) oriented energy and matter exchange with the environment sufficient for the maintenance of all life processes. Memory is a complex of processes of placing information in memory banks, keeping it there, and producing it according to prescriptions available in the system or to inquiries arising in it. Consciousness is a process of realization by the thinking CS of some set of algorithms consisting of the comparison of its knowledge, intentions, decisions, and actions with reality--i.e., with accumulated and continuously received internal and external information. 
Conscience is a realization of an algorithm of good and evil pattern recognition.", "pmid": "1631060", "title": "On the definition of the concepts thinking, consciousness, and conscience."}, {"journal": "Journal of the Royal Society of Medicine", "meshMajor": ["Artificial Intelligence", "Critical Pathways", "Delivery of Health Care, Integrated", "Forecasting", "Heuristics", "Humans", "Technology Assessment, Biomedical"], "year": "2019", "abstractText": "In recent years, there has been massive progress in artificial intelligence (AI) with the development of deep neural networks, natural language processing, computer vision and robotics. These techniques are now actively being applied in healthcare with many of the health service activities currently being delivered by clinicians and administrators predicted to be taken over by AI in the coming years. However, there has also been exceptional hype about the abilities of AI with a mistaken notion that AI will replace human clinicians altogether. These perspectives are inaccurate, and if a balanced perspective of the limitations and promise of AI is taken, one can gauge which parts of the health system AI can be integrated to make a meaningful impact. The four main areas where AI would have the most influence would be: patient administration, clinical decision support, patient monitoring and healthcare interventions. This health system where AI plays a central role could be termed an AI-enabled or AI-augmented health system. 
In this article, we discuss how this system can be developed based on a realistic assessment of current AI technologies and predicted developments.", "pmid": "30507284", "title": "Artificial intelligence-enabled healthcare delivery."}, {"journal": "IEEE transactions on neural networks", "meshMajor": ["Artificial Intelligence", "Computer Simulation", "Humans", "Learning", "Likelihood Functions", "Neural Networks, Computer", "Nonlinear Dynamics", "Recurrence"], "year": "2011", "abstractText": "Recurrent neural network (RNN) has emerged as a promising tool in modeling nonlinear dynamical systems, but the training convergence is still of concern. This paper aims to develop an effective extended Kalman filter-based RNN training approach with a controllable training convergence. The training convergence problem during extended Kalman filter-based RNN training has been proposed and studied by adapting two artificial training noise parameters: the covariance of measurement noise (R) and the covariance of process noise (Q) of Kalman filter. The R and Q adaption laws have been developed using the Lyapunov method and the maximum likelihood method, respectively. The effectiveness of the proposed adaption laws has been tested using a nonlinear dynamical benchmark system and further applied in cutting tool wear modeling. 
The results show that the R adaption law can effectively avoid the divergence problem and ensure the training convergence, whereas the Q adaption law helps improve the training convergence speed.", "pmid": "21402512", "title": "Convergence study in extended Kalman filter-based training of recurrent neural networks."}, {"journal": "Biomedical sciences instrumentation", "meshMajor": ["Artificial Intelligence", "Benchmarking", "Biomedical Engineering", "Computer Simulation", "Cost-Benefit Analysis", "Database Management Systems", "Decision Making, Computer-Assisted", "Efficiency, Organizational", "Electronics, Medical", "Information Storage and Retrieval", "Joint Commission on Accreditation of Healthcare Organizations", "Models, Statistical", "Quality Control", "Quality Indicators, Health Care", "Total Quality Management", "United States"], "year": "2003", "abstractText": "Healthcare is ever changing environment and with the Joint Commission for the Accreditation of Hospital Organization (JCAHO) emphasis on quality improvement during the past several years, and the cost-focused healthcare reforms of the 1990s, benchmarking with peer comparison, and more recently benchmarking against competitors, has taken on a new emphasis. All acute healthcare organizations accredited by JCAHO now require participation in a program titled ORYX, which is designed to use comparisons with other organizations and promote national benchmarks. The knowledge management system designed assists clinical engineering department to convert vast amounts of available data into information, which is ultimately transformed into knowledge to enable better decision-making. The systems assist in using the data as a comparison tool, to compare the performance internally and also compare performance with peer organizations using the same measures within the same measurement system. Collectively, these applications support better, faster data-driven decisions. 
This tool provides fast and easy access to financial and quality metrics to clinical engineering department managers, which increases their ability to perform sophisticated analysis to develop accurate models and forecasts, and make timely, data driven decisions. The project also provides a platform by means of which clinical engineering departmental procedures, data, and methods can be assessed and shared among institutions.", "pmid": "12724889", "title": "Knowledge management system for benchmarking performance indicators using statistical process control (SPC) and Virtual Instrumentation (VI)."}, {"journal": "BMC bioinformatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cluster Analysis", "Databases, Genetic", "Gene Expression Profiling", "Oligonucleotide Array Sequence Analysis", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2006", "abstractText": "BACKGROUND: Cluster analysis is an integral part of high dimensional data analysis. In the context of large scale gene expression data, a filtered set of genes are grouped together according to their expression profiles using one of numerous clustering algorithms that exist in the statistics and machine learning literature. A closely related problem is that of selecting a clustering algorithm that is \"optimal\" in some sense from a rather impressive list of clustering algorithms that currently exist.RESULTS: In this paper, we propose two validation measures each with two parts: one measuring the statistical consistency (stability) of the clusters produced and the other representing their biological functional congruence. Smaller values of these indices indicate better performance for a clustering algorithm. We illustrate this approach using two case studies with publicly available gene expression data sets: one involving a SAGE data of breast cancer patients and the other involving a time course cDNA microarray data on yeast. 
Six well known clustering algorithms UPGMA, K-Means, Diana, Fanny, Model-Based and SOM were evaluated.CONCLUSION: No single clustering algorithm may be best suited for clustering genes into functional groups via expression profiles for all data sets. The validation measures introduced in this paper can aid in the selection of an optimal algorithm, for a given data set, from a collection of available clustering algorithms.", "pmid": "17217509", "title": "Evaluation of clustering algorithms for gene expression data."}, {"journal": "ISA transactions", "meshMajor": ["Air Pressure", "Algorithms", "Artificial Intelligence", "Computer Systems", "Food", "Food Industry", "Food Packaging", "Models, Statistical", "Reinforcement, Psychology", "Steam", "Sterilization", "Temperature"], "year": "2011", "abstractText": "A control technique based on Reinforcement Learning is proposed for the thermal sterilization of canned foods. The proposed controller has the objective of ensuring a given degree of sterilization during Heating (by providing a minimum temperature inside the cans during a given time) and then a smooth Cooling, avoiding sudden pressure variations. For this, three automatic control valves are manipulated by the controller: a valve that regulates the admission of steam during Heating, and a valve that regulate the admission of air, together with a bleeder valve, during Cooling. As dynamical models of this kind of processes are too complex and involve many uncertainties, controllers based on learning are proposed. Thus, based on the control objectives and the constraints on input and output variables, the proposed controllers learn the most adequate control actions by looking up a certain matrix that contains the state-action mapping, starting from a preselected state-action space. This state-action matrix is constantly updated based on the performance obtained with the applied control actions. 
Experimental results at laboratory scale show the advantages of the proposed technique for this kind of processes.", "pmid": "20817160", "title": "Learning control for batch thermal sterilization of canned foods."}, {"journal": "Medical image computing and computer-assisted intervention : MICCAI ... International Conference on Medical Image Computing and Computer-Assisted Intervention", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cerebral Cortex", "Computer Simulation", "Databases, Factual", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Magnetic Resonance Imaging", "Models, Biological", "Models, Statistical", "Nonlinear Dynamics", "Pattern Recognition, Automated", "Principal Component Analysis", "Reference Values", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2005", "abstractText": "Because of the complex shape of human cortical gyri and great variation between individuals, development of effective representation schemes which allow establishment of correspondence between individuals, extraction of average structure of a population, and co-registration has proved very difficult. We introduce an approach which extracts line representations of gyri at different depths from high resolution MRI, labels main gyri semi-automatically, and extracts a template from a population using non-linear principal component analysis. The method has been tested on data from 96 healthy human volunteers. 
The model captures the most salient shape features of all major cortical gyri, and can be used for inter-subject registration, for investigating regionalized inter-subject variability, and for inter-hemispheric comparisons.", "pmid": "16686027", "title": "A construction of an averaged representation of human cortical gyri using non-linear principal component analysis."}, {"journal": "Artificial intelligence in medicine", "meshMajor": ["Anthropology, Cultural", "Artificial Intelligence", "Communication", "Computer Simulation", "Humans", "Information Services", "Interviews as Topic", "Medical History Taking", "Migraine Disorders", "Natural Language Processing", "Patient Education as Topic", "Physician-Patient Relations", "Systems Integration", "Terminology as Topic"], "year": "1995", "abstractText": "This paper is a report on the first phase of a long-term, interdisciplinary project whose goal is to increase the overall effectiveness of physicians' time, and thus the quality of health care, by improving the information exchange between physicians and patients in clinical settings. We are focusing on patients with long-term and chronic conditions, initially on migraine patients, who require periodic interaction with their physicians for effective management of their condition. We are using medical informatics to focus on the information needs of patients, as well as of physicians, and to address problems of information exchange. This requires understanding patients' concerns to design an appropriate system, and using state-of-the-art artificial intelligence techniques to build an interactive explanation system. In contrast to many other knowledge-based systems, our system's design is based on empirical data on actual information needs. We used ethnographic techniques to observe explanations actually given in clinic settings, and to conduct interviews with migraine sufferers and physicians. 
Our system has an extensive knowledge base that contains both general medical terminology and specific knowledge about migraine, such as common trigger factors and symptoms of migraine, the common therapies, and the most common effects and side effects of those therapies. The system consists of two main components: (a) an interactive history-taking module that collects information from patients prior to each visit, builds a patient model, and summarizes the patients' status for their physicians; and (b) an intelligent explanation module that produces an interactive information sheet containing explanations in everyday language that are tailored to individual patients, and responds intelligently to follow-up questions about topics covered in the information sheet.", "pmid": "7647838", "title": "An intelligent interactive system for delivering individualized information to patients."}, {"journal": "Computerized medical imaging and graphics : the official journal of the Computerized Medical Imaging Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Colon", "Colonography, Computed Tomographic", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2009", "abstractText": "An automatic method for the segmentation of the colonic wall is proposed for abdominal computed tomography (CT) of the cleansed and air-inflated colon. This multistage approach uses an adaptive 3D region-growing algorithm, with a self-adjusting growing condition depending on local variations of the intensity at the air-tissue boundary. The method was evaluated using retrospectively collected CT scans based on visual segmentation of the colon by expert radiologists. 
This evaluation showed that the procedure identifies 97% of the colon segments, representing 99.8% of the colon surface, and accurately replicates the anatomical profile of the colonic wall. The parameter settings and performance of the method are relatively independent of the scanner and acquisition conditions. The method is intended for application to the computer-aided detection of polyps in CT colonography.", "pmid": "19304454", "title": "An automatic method for colon segmentation in CT colonography."}, {"journal": "Journal of medical Internet research", "meshMajor": ["Adolescent", "Adult", "Aged", "Artificial Intelligence", "Female", "Humans", "Internet", "Interpersonal Relations", "Male", "Middle Aged", "Mobile Applications", "Social Support", "Young Adult"], "year": "2020", "abstractText": "BACKGROUND: Previous research suggests that artificial agents may be a promising source of social support for humans. However, the bulk of this research has been conducted in the context of social support interventions that specifically address stressful situations or health improvements. Little research has examined social support received from artificial agents in everyday contexts.OBJECTIVE: Considering that social support manifests in not only crises but also everyday situations and that everyday social support forms the basis of support received during more stressful events, we aimed to investigate the types of everyday social support that can be received from artificial agents.METHODS: In Study 1, we examined publicly available user reviews (N=1854) of Replika, a popular companion chatbot. In Study 2, a sample (n=66) of Replika users provided detailed open-ended responses regarding their experiences of using Replika. 
We conducted thematic analysis on both datasets to gain insight into the kind of everyday social support that users receive through interactions with Replika.RESULTS: Replika provides some level of companionship that can help curtail loneliness, provide a \"safe space\" in which users can discuss any topic without the fear of judgment or retaliation, increase positive affect through uplifting and nurturing messages, and provide helpful information/advice when normal sources of informational support are not available.CONCLUSIONS: Artificial agents may be a promising source of everyday social support, particularly companionship, emotional, informational, and appraisal support, but not as tangible support. Future studies are needed to determine who might benefit from these types of everyday social support the most and why. These results could potentially be used to help address global health issues or other crises early on in everyday situations before they potentially manifest into larger issues.", "pmid": "32141837", "title": "User Experiences of Social Support From Companion Chatbots in Everyday Contexts: Thematic Analysis."}, {"journal": "Philosophical transactions of the Royal Society of London. Series B, Biological sciences", "meshMajor": ["Artificial Intelligence", "Ethnic Groups", "Female", "Germany", "Humans", "Language", "Language Tests", "Learning", "Linguistics", "Memory", "Netherlands", "Reaction Time"], "year": "2012", "abstractText": "Processing non-adjacent dependencies is considered to be one of the hallmarks of human language. Assuming that sequence-learning tasks provide a useful way to tap natural-language-processing mechanisms, we cross-modally combined serial reaction time and artificial-grammar learning paradigms to investigate the processing of multiple nested (A(1)A(2)A(3)B(3)B(2)B(1)) and crossed dependencies (A(1)A(2)A(3)B(1)B(2)B(3)), containing either three or two dependencies. 
Both reaction times and prediction errors highlighted problems with processing the middle dependency in nested structures (A(1)A(2)A(3)B(3)_B(1)), reminiscent of the 'missing-verb effect' observed in English and French, but not with crossed structures (A(1)A(2)A(3)B(1)_B(3)). Prior linguistic experience did not play a major role: native speakers of German and Dutch-which permit nested and crossed dependencies, respectively-showed a similar pattern of results for sequences with three dependencies. As for sequences with two dependencies, reaction times and prediction errors were similar for both nested and crossed dependencies. The results suggest that constraints on the processing of multiple non-adjacent dependencies are determined by the specific ordering of the non-adjacent dependencies (i.e. nested or crossed), as well as the number of non-adjacent dependencies to be resolved (i.e. two or three). Furthermore, these constraints may not be specific to language but instead derive from limitations on structured sequence learning.", "pmid": "22688641", "title": "Processing multiple non-adjacent dependencies: evidence from sequence learning."}, {"journal": "Frontiers in bioscience : a journal and virtual library", "meshMajor": ["Algorithms", "Artificial Intelligence", "DNA", "Exons", "GC Rich Sequence", "Humans", "Introns", "Polymorphism, Single Nucleotide", "Sequence Analysis, DNA", "Thermodynamics"], "year": "2007", "abstractText": "Recently, SNP has gained substantial attention as genetic markers and is recognized as a key element in the development of personalized medicine. Computational prediction of SNP can be used as a guide for SNP discovery to reduce the cost and time needed for the development of personalized medicine. We have developed a method for SNP prediction based on support vector machines (SVMs) using different features extracted from the SNP data. 
Prediction rates of 60.9% was achieved by sequence feature, 59.1% by free-energy feature, 58.1% by GC content feature, 58.0% by melting temperature feature, 56.2% by enthalpy feature, 55.1% by entropy feature and 54.3% by the gene, exon and intron feature. We introduced a new feature, the SNP distribution score that achieved a prediction rate of 77.3%. Thus, the proposed SNP prediction algorithm can be used to in SNP discovery.", "pmid": "17127407", "title": "Predicting single nucleotide polymorphisms (SNP) from DNA sequence by support vector machine."}, {"journal": "Applied optics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Biomimetics", "Equipment Design", "Equipment Failure Analysis", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Models, Biological", "Models, Statistical", "Pattern Recognition, Automated", "Reproducibility of Results", "Robotics", "Sensitivity and Specificity", "Transducers", "Vision, Binocular"], "year": "2008", "abstractText": "One major research issue associated with 3D perception by robotic systems is the creation of efficient sensor systems that can generate dense range maps reliably. A visual sensor system for robotic applications is developed that is inherently equipped with two types of sensor, an active trinocular vision and a passive stereo vision. Unlike in conventional active vision systems that use a large number of images with variations of projected patterns for dense range map acquisition or from conventional passive vision systems that work well on specific environments with sufficient feature information, a cooperative bidirectional sensor fusion method for this visual sensor system enables us to acquire a reliable dense range map using active and passive information simultaneously. The fusion algorithms are composed of two parts, one in which the passive stereo vision helps active vision and the other in which the active trinocular vision helps the passive one. 
The first part matches the laser patterns in stereo laser images with the help of intensity images; the second part utilizes an information fusion technique using the dynamic programming method in which image regions between laser patterns are matched pixel-by-pixel with help of the fusion results obtained in the first part. To determine how the proposed sensor system and fusion algorithms can work in real applications, the sensor system is implemented on a robotic system, and the proposed algorithms are applied. A series of experimental tests is performed for a variety of configurations of robot and environments. The performance of the sensor system is discussed in detail.", "pmid": "18404193", "title": "Dense range map reconstruction from a versatile robotic sensor system with an active trinocular vision and a passive binocular vision."}, {"journal": "Journal of biomedical optics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Endoscopy", "Female", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Microscopy, Confocal", "Ovarian Neoplasms", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": null, "abstractText": "The confocal microendoscope is an instrument for imaging the surface of the human ovary. Images taken with this instrument from normal and diseased tissue show significant differences in cellular distribution. A real-time computer-aided system to facilitate the identification of ovarian cancer is introduced. The cellular-level structure present in ex vivo confocal microendoscope images is modeled as texture. Features are extracted based on first-order statistics, spatial gray-level-dependence matrices, and spatial-frequency content. Selection of the features is performed using stepwise discriminant analysis, forward sequential search, a nonparametric method, principal component analysis, and a heuristic technique that combines the results of these other methods. 
The selected features are used for classification, and the performance of various machine classifiers is compared by analyzing areas under their receiver operating characteristic curves. The machine classifiers studied included linear discriminant analysis, quadratic discriminant analysis, and the k-nearest-neighbor algorithm. The results suggest it is possible to automatically identify pathology based on texture features extracted from confocal microendoscope images and that the machine performance is superior to that of a human observer.", "pmid": "18465984", "title": "Computer-aided identification of ovarian cancer in confocal microendoscope images."}, {"journal": "International journal of data mining and bioinformatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Brain", "Brain Injuries", "Data Interpretation, Statistical", "Humans", "Intracranial Pressure"], "year": "2013", "abstractText": "This paper attempts to predict Intracranial Pressure (ICP) based on features extracted from non-invasively collected patient data. These features include midline shift measurement and textural features extracted from Computed axial Tomography (CT) images. A statistical analysis is performed to examine the relationship between ICP and midline shift. Machine learning is also applied to estimate ICP levels with a two-stage feature selection scheme. To avoid overfitting, all feature selections and parameter selections are performed using a nested 10-fold cross validation within the training data. 
The classification results demonstrate the effectiveness of the proposed method in ICP prediction.", "pmid": "24400523", "title": "Predictability of intracranial pressure level in traumatic brain injury: features extraction, statistical analysis and machine learning-based evaluation."}, {"journal": "Medical image analysis", "meshMajor": ["Aging", "Algorithms", "Alzheimer Disease", "Artificial Intelligence", "Brain", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Magnetic Resonance Imaging", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2014", "abstractText": "We develop a multivariate analysis of brain anatomy to identify the relevant shape deformation patterns and quantify the shape changes that explain corresponding variations in clinical neuropsychological measures. We use kernel Partial Least Squares (PLS) and formulate a regression model in the tangent space of the manifold of diffeomorphisms characterized by deformation momenta. The scalar deformation momenta completely encode the diffeomorphic changes in anatomical shape. In this model, the clinical measures are the response variables, while the anatomical variability is treated as the independent variable. To better understand the \"shape-clinical response\" relationship, we also control for demographic confounders, such as age, gender, and years of education in our regression model. We evaluate the proposed methodology on the Alzheimer's Disease Neuroimaging Initiative (ADNI) database using baseline structural MR imaging data and neuropsychological evaluation test scores. We demonstrate the ability of our model to quantify the anatomical deformations in units of clinical response. Our results also demonstrate that the proposed method is generic and generates reliable shape deformations both in terms of the extracted patterns and the amount of shape changes. 
We found that while the hippocampus and amygdala emerge as mainly responsible for changes in test scores for global measures of dementia and memory function, they are not a determinant factor for executive function. Another critical finding was the appearance of thalamus and putamen as most important regions that relate to executive function. These resulting anatomical regions were consistent with very high confidence irrespective of the size of the population used in the study. This data-driven global analysis of brain anatomy was able to reach similar conclusions as other studies in Alzheimer's disease based on predefined ROIs, together with the identification of other new patterns of deformation. The proposed methodology thus holds promise for discovering new patterns of shape changes in the human brain that could add to our understanding of disease progression in neurological disorders. ", "pmid": "24667299", "title": "Quantifying anatomical shape variations in neurological disorders."}, {"journal": "Studies in health technology and informatics", "meshMajor": ["Artificial Intelligence", "Codes of Ethics", "Health", "Humans", "Information Services", "Internet", "Medical Informatics", "Natural Language Processing", "Quality Control"], "year": "2007", "abstractText": "The number of medical websites is constantly growing [1]. Owing to the open nature of the Web, the reliability of information available on the Web is uneven. Internet users are overwhelmed by the quantity of information available on the Web. The situation is even more critical in the medical area, as the content proposed by health websites can have a direct impact on the users' well being. One way to control the reliability of health websites is to assess their quality and to make this assessment available to users. The HON Foundation has defined a set of eight ethical principles. HON's experts are working in order to manually define whether a given website complies with s the required principles. 
As the number of medical websites is constantly growing, manual expertise becomes insufficient and automatic systems should be used in order to help medical experts. In this paper we present the design and the evaluation of an automatic system conceived for the categorisation of medical and health documents according to he HONcode ethical principles. A first evaluation shows promising results. Currently the system shows 0.78 micro precision and 0.73 F-measure, with 0.06 errors.", "pmid": "17911808", "title": "Machine learning approach for automatic quality criteria detection of health web pages."}, {"journal": "IEEE transactions on systems, man, and cybernetics. Part B, Cybernetics : a publication of the IEEE Systems, Man, and Cybernetics Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Data Interpretation, Statistical", "Discriminant Analysis", "Information Storage and Retrieval", "Pattern Recognition, Automated", "Principal Component Analysis"], "year": "2008", "abstractText": "Fisher's linear discriminant analysis (LDA) is a traditional dimensionality reduction method that has been proven to be successful for decades. Numerous variants, such as the kernel-based Fisher discriminant analysis (KFDA), have been proposed to enhance the LDA's power for nonlinear discriminants. Although effective, the KFDA is computationally expensive, since the complexity increases with the size of the data set. In this correspondence, we suggest a novel strategy to enhance the computation for an entire family of the KFDAs. Rather than invoke the KFDA for the entire data set, we advocate that the data be first reduced into a smaller representative subset using a prototype reduction scheme and that the dimensionality reduction be achieved by invoking a KFDA on this reduced data set. 
In this way, data points that are ineffective in the dimension reduction and classification can be eliminated to obtain a significantly reduced kernel matrix K without degrading the performance. Our experimental results demonstrate that the proposed mechanism dramatically reduces the computation time without sacrificing the classification accuracy for artificial and real-life data sets.", "pmid": "18348939", "title": "On using prototype reduction schemes to optimize kernel-based fisher discriminant analysis."}, {"journal": "Computational intelligence and neuroscience", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Humans", "Learning", "Linear Models", "Models, Theoretical", "Reinforcement, Psychology", "Thinking"], "year": "2016", "abstractText": "To improve the convergence rate and the sample efficiency, two efficient learning methods AC-HMLP and RAC-HMLP (AC-HMLP with ?2-regularization) are proposed by combining actor-critic algorithm with hierarchical model learning and planning. The hierarchical models consisting of the local and the global models, which are learned at the same time during learning of the value function and the policy, are approximated by local linear regression (LLR) and linear function approximation (LFA), respectively. Both the local model and the global model are applied to generate samples for planning; the former is used only if the state-prediction error does not surpass the threshold at each time step, while the latter is utilized at the end of each episode. The purpose of taking both models is to improve the sample efficiency and accelerate the convergence rate of the whole algorithm through fully utilizing the local and global information. Experimentally, AC-HMLP and RAC-HMLP are compared with three representative algorithms on two Reinforcement Learning (RL) benchmark problems. 
The results demonstrate that they perform best in terms of convergence rate and sample efficiency.", "pmid": "27795704", "title": "Efficient Actor-Critic Algorithm with Hierarchical Model Learning and Planning."}, {"journal": "Neural networks : the official journal of the International Neural Network Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Data Interpretation, Statistical", "Ecosystem", "Geologic Sediments", "Linear Models", "Netherlands", "Neural Networks, Computer", "Predictive Value of Tests", "Time Factors"], "year": "2006", "abstractText": "The paper presents machine learning (ML) models that predict sedimentation in the harbour basin of the Port of Rotterdam. The important factors affecting the sedimentation process such as waves, wind, tides, surge, river discharge, etc. are studied, the corresponding time series data is analysed, missing values are estimated and the most important variables behind the process are chosen as the inputs. Two ML methods are used: MLP ANN and M5 model tree. The latter is a collection of piece-wise linear regression models, each being an expert for a particular region of the input space. The models are trained on the data collected during 1992-1998 and tested by the data of 1999-2000. The predictive accuracy of the models is found to be adequate for the potential use in the operational decision making.", "pmid": "16530383", "title": "Machine learning in sedimentation modelling."}, {"journal": "Neurological research", "meshMajor": ["Algorithms", "Artificial Intelligence", "Diagnostic Errors", "Electroencephalography", "Epilepsy", "Humans", "Neural Networks, Computer", "Predictive Value of Tests", "Reproducibility of Results", "Signal Processing, Computer-Assisted"], "year": "2004", "abstractText": "Diagnosis of epilepsy is primarily based on scalp-recorded electroencephalograms (EEG). 
Unfortunately the long-term recordings obtained from 'ambulatory recording systems' contain EEG data of up to one week duration, which has introduced new problems for clinical analysis. Traditional methods, where the entire EEG is reviewed by a trained professional, are very time-consuming when applied to recordings of this length. Therefore, several automated diagnostic aid approaches were proposed in recent years, in order to reduce expert effort in analyzing lengthy recordings. The most promising approaches to automated diagnosis are based on neural networks. This paper describes a method for automated detection of epileptic seizures from EEG signals using a multistage nonlinear pre-processing filter in combination with a diagnostic (LAMSTAR) Artificial Neural Network (ANN). Pre-processing via multistage nonlinear filtering, LAMSTAR input preparation, ANN training and system performance (1.6% miss rate, 97.2% overall accuracy when considering both false-alarms and 'misses') are discussed and are shown to compare favorably with earlier approaches presented in recent literature.", "pmid": "14977058", "title": "A neural-network-based detection of epilepsy."}, {"journal": "IEEE transactions on systems, man, and cybernetics. Part B, Cybernetics : a publication of the IEEE Systems, Man, and Cybernetics Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Biometry", "Face", "Facial Expression", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Information Storage and Retrieval", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2007", "abstractText": "Mosaicing entails the consolidation of information represented by multiple images through the application of a registration and blending procedure. We describe a face mosaicing scheme that generates a composite face image during enrollment based on the evidence provided by frontal and semiprofile face images of an individual. 
Face mosaicing obviates the need to store multiple face templates representing multiple poses of a user's face image. In the proposed scheme, the side profile images are aligned with the frontal image using a hierarchical registration algorithm that exploits neighborhood properties to determine the transformation relating the two images. Multiresolution splining is then used to blend the side profiles with the frontal image, thereby generating a composite face image of the user. A texture-based face recognition technique that is a slightly modified version of the C2 algorithm proposed by Serre et al. is used to compare a probe face image with the gallery face mosaic. Experiments conducted on three different databases indicate that face mosaicing, as described in this paper, offers significant benefits by accounting for the pose variations that are commonly observed in face images.", "pmid": "17926704", "title": "A mosaicing scheme for pose-invariant face recognition."}, {"journal": "Proteins", "meshMajor": ["Active Transport, Cell Nucleus", "Algorithms", "Artificial Intelligence", "Computational Biology", "DNA-Binding Proteins", "Databases, Protein", "Humans", "Internet", "Kinetics", "Models, Biological", "Nuclear Localization Signals", "Origin Recognition Complex", "Phosphorylation", "Protein Conformation", "Protein Interaction Domains and Motifs", "Protein Isoforms", "Protein Processing, Post-Translational", "RNA Splicing Factors", "Saccharomyces cerevisiae Proteins", "Serine", "Software Validation", "Transcription Factors", "alpha Karyopherins"], "year": "2014", "abstractText": "The binding affinity between a nuclear localization signal (NLS) and its import receptor is closely related to corresponding nuclear import activity. PTM-based modulation of the NLS binding affinity to the import receptor is one of the most understood mechanisms to regulate nuclear import of proteins. 
However, identification of such regulation mechanisms is challenging due to the difficulty of assessing the impact of PTM on corresponding nuclear import activities. In this study we proposed NIpredict, an effective algorithm to predict nuclear import activity given its NLS, in which molecular interaction energy components (MIECs) were used to characterize the NLS-import receptor interaction, and the support vector regression machine (SVR) was used to learn the relationship between the characterized NLS-import receptor interaction and the corresponding nuclear import activity. Our experiments showed that nuclear import activity change due to NLS change could be accurately predicted by the NIpredict algorithm. Based on NIpredict, we developed a systematic framework to identify potential PTM-based nuclear import regulations for human and yeast nuclear proteins. Application of this approach has identified the potential nuclear import regulation mechanisms by phosphorylation of two nuclear proteins including SF1 and ORC6.", "pmid": "25043850", "title": "Computational identification of post-translational modification-based nuclear import regulations by characterizing nuclear localization signal-import receptor interaction."}, {"journal": "Journal of biomedical informatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computational Biology", "Decision Theory", "MEDLINE", "Natural Language Processing"], "year": "2001", "abstractText": "With the growing use of Natural Language Processing (NLP) techniques for information extraction and concept indexing in the biomedical domain, a method that quickly and efficiently assigns the correct sense of an ambiguous biomedical term in a given context is needed concurrently. The current status of word sense disambiguation (WSD) in the biomedical domain is that handcrafted rules are used based on contextual material. 
The disadvantages of this approach are (i) generating WSD rules manually is a time-consuming and tedious task, (ii) maintenance of rule sets becomes increasingly difficult over time, and (iii) handcrafted rules are often incomplete and perform poorly in new domains comprised of specialized vocabularies and different genres of text. This paper presents a two-phase unsupervised method to build a WSD classifier for an ambiguous biomedical term W. The first phase automatically creates a sense-tagged corpus for W, and the second phase derives a classifier for W using the derived sense-tagged corpus as a training set. A formative experiment was performed, which demonstrated that classifiers trained on the derived sense-tagged corpora achieved an overall accuracy of about 97%, with greater than 90% accuracy for each individual ambiguous term.", "pmid": "11977807", "title": "Disambiguating ambiguous biomedical terms in biomedical narrative text: an unsupervised method."}, {"journal": "Journal of medical systems", "meshMajor": ["Acidosis", "Acidosis, Respiratory", "Adolescent", "Algorithms", "Artificial Intelligence", "Child", "Decision Making, Computer-Assisted", "Decision Support Systems, Clinical", "Decision Trees", "Diagnosis, Differential", "Expert Systems", "Female", "Humans", "Male", "Postoperative Complications"], "year": "1997", "abstractText": "The decision tree approach is one of the most common approaches in automatic learning and decision making. The automatic learning of decision trees and their use usually show very good results in various \"theoretical\" environments. But in real life it is often impossible to find the desired number of representative training objects for various reasons. The lack of possibilities to measure attribute values, high cost and complexity of such measurements, and unavailability of all attributes at the same time are the typical representatives. 
For this reason we decided to use the decision trees not for their primary task--the decision making--but for outlining the most important attributes. This was possible by using a well-known property of the decision trees--their knowledge representation, which can be easily understood by humans. In a delicate field of medical decision making, we cannot allow ourselves to make any inaccurate decisions and the \"tips,\" provided by the decision trees, can be of a great assistance. Our main interest was to discover a predisposition to two forms of acidosis: the metabolic acidosis and respiratory acidosis, which can both have serious effects on child's health. We decided to construct different decision trees from a set of training objects. Instead of using a test set for evaluation of a decision tree, we asked medical experts to take a closer look at the generated trees. They examined and evaluated the decision trees branch by branch. Their comments show that trees generated from the available training set mainly have surprisingly good branches, but on the other hand, for some, no medical explanation could be found.", "pmid": "9555627", "title": "The limitations of decision trees and automatic learning in real world medical decision making."}, {"journal": "IEEE transactions on cybernetics", "meshMajor": ["Adolescent", "Adult", "Algorithms", "Artificial Intelligence", "Emotions", "Face", "Facial Expression", "Female", "Humans", "Image Processing, Computer-Assisted", "Male", "Pattern Recognition, Automated", "Young Adult"], "year": "2015", "abstractText": "In this paper, we present a new idea to analyze facial expression by exploring some common and specific information among different expressions. Inspired by the observation that only a few facial parts are active in expression disclosure (e.g., around mouth, eye), we try to discover the common and specific patches which are important to discriminate all the expressions and only a particular expression, respectively. 
A two-stage multitask sparse learning (MTSL) framework is proposed to efficiently locate those discriminative patches. In the first stage MTSL, expression recognition tasks are combined to located common patches. Each of the tasks aims to find dominant patches for each expression. Secondly, two related tasks, facial expression recognition and face verification tasks, are coupled to learn specific facial patches for individual expression. The two-stage patch learning is performed on patches sampled by multiscale strategy. Extensive experiments validate the existence and significance of common and specific patches. Utilizing these learned patches, we achieve superior performances on expression recognition compared to the state-of-the-arts. ", "pmid": "25291808", "title": "Learning Multiscale Active Facial Patches for Expression Analysis."}, {"journal": "Magnetic resonance in medicine", "meshMajor": ["Adolescent", "Adult", "Aged", "Algorithms", "Artificial Intelligence", "Brain Neoplasms", "Child", "Female", "Glioma", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Magnetic Resonance Angiography", "Male", "Middle Aged", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Young Adult"], "year": "2010", "abstractText": "Dynamic susceptibility contrast magnetic resonance perfusion imaging (DSC-MRI) is a useful method to characterize gliomas. Recently, support vector machines (SVMs) have been introduced as means to prospectively characterize new patients based on information from previous patients. Based on features derived from automatically segmented tumor volumes from 101 DSC-MR examinations, four different SVM models were compared. All SVM models achieved high prediction accuracies (>82%) after rebalancing the training data sets to equal amounts of samples per class. Best discrimination was obtained using a SVM model with a radial basis function kernel. 
A correct prediction of low-grade glioma was obtained at 83% (true positive rate) and for high-grade glioma at 91% (true negative rate) on the independent test data set. In conclusion, the combination of automated tumor segmentation followed by SVM classification is feasible. Thereby, a powerful tool is available to characterize glioma presurgically in patients.", "pmid": "20564592", "title": "Support vector machines in DSC-based glioma imaging: suggestions for optimal characterization."}, {"journal": "IEEE transactions on cybernetics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Feedback", "Models, Theoretical", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2014", "abstractText": "In this paper, the problem of adaptive active fault-tolerant control for a class of nonlinear systems with unknown actuator fault is investigated. The actuator fault is assumed to have no traditional affine appearance of the system state variables and control input. The useful property of the basis function of the radial basis function neural network (NN), which will be used in the design of the fault tolerant controller, is explored. Based on the analysis of the design of normal and passive fault tolerant controllers, by using the implicit function theorem, a novel NN-based active fault-tolerant control scheme with fault alarm is proposed. Comparing with results in the literature, the fault-tolerant control scheme can minimize the time delay between fault occurrence and accommodation that is called the time delay due to fault diagnosis, and reduce the adverse effect on system performance. In addition, the FTC scheme has the advantages of a passive fault-tolerant control scheme as well as the traditional active fault-tolerant control scheme's properties. 
Furthermore, the fault-tolerant control scheme requires no additional fault detection and isolation model which is necessary in the traditional active fault-tolerant control scheme. Finally, simulation results are presented to demonstrate the efficiency of the developed techniques. ", "pmid": "25014982", "title": "Novel neural networks-based fault tolerant control scheme with fault alarm."}, {"journal": "Toxicology and applied pharmacology", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Decision Trees", "Environmental Monitoring", "Least-Squares Analysis", "Molecular Structure", "Quantitative Structure-Activity Relationship", "Regression Analysis", "Risk Assessment", "Species Specificity", "Stochastic Processes", "Tetrahymena pyriformis", "Toxicology"], "year": "2014", "abstractText": "Ensemble learning approach based decision treeboost (DTB) and decision tree forest (DTF) models are introduced in order to establish quantitative structure-toxicity relationship (QSTR) for the prediction of toxicity of 1450 diverse chemicals. Eight non-quantum mechanical molecular descriptors were derived. Structural diversity of the chemicals was evaluated using Tanimoto similarity index. Stochastic gradient boosting and bagging algorithms supplemented DTB and DTF models were constructed for classification and function optimization problems using the toxicity end-point in T. pyriformis. Special attention was drawn to prediction ability and robustness of the models, investigated both in external and 10-fold cross validation processes. In complete data, optimal DTB and DTF models rendered accuracies of 98.90%, 98.83% in two-category and 98.14%, 98.14% in four-category toxicity classifications. Both the models further yielded classification accuracies of 100% in external toxicity data of T. pyriformis. 
The constructed regression models (DTB and DTF) using five descriptors yielded correlation coefficients (R(2)) of 0.945, 0.944 between the measured and predicted toxicities with mean squared errors (MSEs) of 0.059, and 0.064 in complete T. pyriformis data. The T. pyriformis regression models (DTB and DTF) applied to the external toxicity data sets yielded R(2) and MSE values of 0.637, 0.655; 0.534, 0.507 (marine bacteria) and 0.741, 0.691; 0.155, 0.173 (algae). The results suggest for wide applicability of the inter-species models in predicting toxicity of new chemicals for regulatory purposes. These approaches provide useful strategy and robust tools in the screening of ecotoxicological risk or environmental hazard potential of chemicals. ", "pmid": "24463095", "title": "In silico prediction of toxicity of non-congeneric industrial chemicals using ensemble learning based modeling approaches."}, {"journal": "IEEE transactions on image processing : a publication of the IEEE Signal Processing Society", "meshMajor": ["Algorithms", "Artifacts", "Artificial Intelligence", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Neural Networks, Computer", "Pattern Recognition, Automated", "Photogrammetry", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2007", "abstractText": "We propose a modified self-organizing neural network to estimate the disparity map from a stereo pair of images. Novelty consists of the network architecture and of dispensing with the standard assumption of epipolar geometry. Quite distinct from the existing algorithms which, typically, involve area- and/or feature-matching, the network is first initialized to the right image, and then deformed until it is transformed into the left image, or vice versa, this deformation itself being the measure of disparity. 
Illustrative examples include two classes of stereo pairs: synthetic and natural (including random-dot stereograms and wire frames) and distorted. The latter has one of the following special characteristics: one image is blurred, one image is of a different size, there are salient features like discontinuous depth values at boundaries and surface wrinkles, and there exist occluded and half-occluded regions. While these examples serve, in general, to demonstrate that the technique performs better than many existing algorithms, the above-mentioned stereo pairs (in particular, the last two) bring out some of its limitations, thereby serving as possible motivation for further work.", "pmid": "17990758", "title": "On the application of a modified self-organizing neural network to estimate stereo disparity."}, {"journal": "Journal of biomedical informatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cluster Analysis", "Gene Expression Profiling", "Oligonucleotide Array Sequence Analysis", "Pattern Recognition, Automated", "Software", "User-Computer Interface"], "year": "2008", "abstractText": "MOTIVATION: A challenge in microarray data analysis is to interpret observed changes in terms of biological properties and relationships. One powerful approach is to make associations of gene expression clusters with biomedical ontologies and/or biological pathways. However, this approach evaluates only one cluster at a time, returning long unordered lists of annotations for clusters without considering the overall context of the experiment under investigation.RESULTS: BioLattice is a mathematical framework based on concept lattice analysis for the biological interpretation of gene expression data. By considering gene expression clusters as objects and associated annotations as attributes and by using set inclusion relationships BioLattice orders them to create a lattice of concepts, providing an 'executive' summary of the experimental context. 
External knowledge resources such as Gene Ontology trees and pathway graphs can be added incrementally. We propose two quantitative structural analysis methods, 'prominent sub-lattice' and 'core-periphery' analyses, enabling systematic comparison of experimental concepts and contexts. BioLattice is implemented as a web-based utility using Scalable Vector Graphics for interactive visualization. We applied it to real microarray datasets with improved biological interpretations of the experimental contexts.", "pmid": "18093880", "title": "BioLattice: a framework for the biological interpretation of microarray gene expression data using concept lattice analysis."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Adult", "Algorithms", "Artificial Intelligence", "Diagnosis, Computer-Assisted", "Electroencephalography", "Female", "Fourier Analysis", "Humans", "Male", "Middle Aged", "Pattern Recognition, Automated", "Polysomnography", "Reproducibility of Results", "Sensitivity and Specificity", "Sleep Stages", "Wavelet Analysis", "Young Adult"], "year": "2010", "abstractText": "An algorithm to detect automatically drowsiness episodes has been developed. It uses only one EEG channel to differentiate the stages of alertness and drowsiness. In this work the vectors features are building combining Power Spectral Density (PDS) and Wavelet Transform (WT). The feature extracted from the PSD of EEG signal are: Central frequency, the First Quartile Frequency, the Maximum Frequency, the Total Energy of the Spectrum, the Power of Theta and Alpha bands. In the Wavelet Domain, it was computed the number of Zero Crossing and the integrated from the scale 3, 4 and 5 of Daubechies 2 order WT. The classifying of epochs is being done with neural networks. 
The detection results obtained with this technique are 86.5 % for drowsiness stages and 81.7% for alertness segment. Those results show that the features extracted and the classifier are able to identify drowsiness EEG segments.", "pmid": "21096343", "title": "An automatic detector of drowsiness based on spectral analysis and wavelet decomposition of EEG records."}, {"journal": "IEEE transactions on medical imaging", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Elasticity Imaging Techniques", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Models, Biological", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2009", "abstractText": "In this paper, we propose the DT-REFinD algorithm for the diffeomorphic nonlinear registration of diffusion tensor images. Unlike scalar images, deforming tensor images requires choosing both a reorientation strategy and an interpolation scheme. Current diffusion tensor registration algorithms that use full tensor information face difficulties in computing the differential of the tensor reorientation strategy and consequently, these methods often approximate the gradient of the objective function. In the case of the finite-strain (FS) reorientation strategy, we borrow results from the pose estimation literature in computer vision to derive an analytical gradient of the registration objective function. By utilizing the closed-form gradient and the velocity field representation of one parameter subgroups of diffeomorphisms, the resulting registration algorithm is diffeomorphic and fast. We contrast the algorithm with a traditional FS alternative that ignores the reorientation in the gradient computation. We show that the exact gradient leads to significantly better registration at the cost of computation time. 
Independently of the choice of Euclidean or Log-Euclidean interpolation and sum of squared differences dissimilarity measure, the exact gradient achieves better alignment over an entire spectrum of deformation penalties. Alignment quality is assessed with a battery of metrics including tensor overlap, fractional anisotropy, inverse consistency and closeness to synthetic warps. The improvements persist even when a different reorientation scheme, preservation of principal directions, is used to apply the final deformations.", "pmid": "19556193", "title": "DT-REFinD: diffusion tensor registration with exact finite-strain differential."}, {"journal": "Sensors (Basel, Switzerland)", "meshMajor": ["Activities of Daily Living", "Artificial Intelligence", "Computer Communication Networks", "Equipment Design", "Equipment Failure Analysis", "Monitoring, Ambulatory", "Personal Autonomy", "Self-Help Devices", "Software", "Systems Integration", "Telemedicine", "Transducers"], "year": "2014", "abstractText": "The deployment of the Ambient Intelligence (AmI) paradigm requires designing and integrating user-centered smart environments to assist people in their daily life activities. This research paper details an integration and validation of multiple heterogeneous sensors with hybrid reasoners that support decision making in order to monitor personal and environmental data at a smart home in a private way. The results innovate on knowledge-based platforms, distributed sensors, connected objects, accessibility and authentication methods to promote independent living for elderly people. 
TALISMAN+, the AmI framework deployed, integrates four subsystems in the smart home: (i) a mobile biomedical telemonitoring platform to provide elderly patients with continuous disease management; (ii) an integration middleware that allows context capture from heterogeneous sensors to program environment's reaction; (iii) a vision system for intelligent monitoring of daily activities in the home; and (iv) an ontologies-based integrated reasoning platform to trigger local actions and manage private information in the smart home. The framework was integrated in two real running environments, the UPM Accessible Digital Home and MetalTIC house, and successfully validated by five experts in home care, elderly people and personal autonomy.", "pmid": "25232910", "title": "Integration of multisensor hybrid reasoners to support personal autonomy in the smart home."}, {"journal": "M.D. computing : computers in medical practice", "meshMajor": ["Artificial Intelligence", "Computer Communication Networks", "Dementia", "Diagnosis, Computer-Assisted", "Expert Systems", "Humans", "Software"], "year": null, "abstractText": "During the past decade, artificial neural networks have been established as promising psychological and computational models. The proponents of neural computing believe that it offers new solutions to problems that have been intractable so far. To study the suitability of neural networks for performing sequential diagnostic classification, I have used a network that, over time, becomes increasingly proficient at diagnosing dementia. A description of the implementation, training, and behavior of this network illustrates how neural-network technology might contribute to clinical computing.", "pmid": "2407923", "title": "A neural network as an approach to clinical diagnosis."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. 
Annual International Conference", "meshMajor": ["Adult", "Algorithms", "Artificial Intelligence", "Automation", "Bayes Theorem", "Computer Systems", "Electroencephalography", "Equipment Design", "Humans", "Male", "Models, Theoretical", "Neural Networks, Computer", "Normal Distribution", "Reproducibility of Results", "Signal Processing, Computer-Assisted"], "year": "2011", "abstractText": "EEG data has been used to discriminate levels of mental workload when classifiers are created for each subject, but the reliability of classifiers trained on multiple subjects has yet to be investigated. Artificial neural network and naive Bayesian classifiers were trained with data from single and multiple subjects and their ability to discriminate among three difficulty conditions was tested. When trained on data from multiple subjects, both types of classifiers poorly discriminated between the three levels. However, a novel model, the naive Bayesian classifier with a hidden node, performed nearly as well as the models trained and tested on individuals. In addition, a hierarchical Bayes model with a higher level constraint on the hidden node can further improve its performance.", "pmid": "22255836", "title": "An EEG workload classifier for multiple subjects."}, {"journal": "Bioinformatics (Oxford, England)", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cell Cycle Proteins", "Cluster Analysis", "Gene Expression Profiling", "Models, Genetic", "Models, Statistical", "Multivariate Analysis", "Normal Distribution", "Oligonucleotide Array Sequence Analysis", "Saccharomyces cerevisiae Proteins", "Sequence Alignment", "Sequence Analysis, DNA", "Software"], "year": "2004", "abstractText": "MOTIVATION: Grouping genes having similar expression patterns is called gene clustering, which has been proved to be a useful tool for extracting underlying biological information of gene expression data. 
Many clustering procedures have shown success in microarray gene clustering; most of them belong to the family of heuristic clustering algorithms. Model-based algorithms are alternative clustering algorithms, which are based on the assumption that the whole set of microarray data is a finite mixture of a certain type of distributions with different parameters. Application of the model-based algorithms to unsupervised clustering has been reported. Here, for the first time, we demonstrated the use of the model-based algorithm in supervised clustering of microarray data.RESULTS: We applied the proposed methods to real gene expression data and simulated data. We showed that the supervised model-based algorithm is superior over the unsupervised method and the support vector machines (SVM) method.AVAILABILITY: The program written in the SAS language implementing methods I-III in this report is available upon request. The software of SVMs is available in the website http://svm.sdsc.edu/cgi-bin/nph-SVMsubmit.cgi", "pmid": "15044244", "title": "Supervised cluster analysis for microarray data based on multivariate Gaussian mixture."}, {"journal": "IEEE transactions on pattern analysis and machine intelligence", "meshMajor": ["Algorithms", "Artifacts", "Artificial Intelligence", "Data Interpretation, Statistical", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Motion", "Pattern Recognition, Automated", "Photography", "Reproducibility of Results", "Sensitivity and Specificity", "Video Recording"], "year": "2013", "abstractText": "Turbulence mitigation refers to the stabilization of videos with nonuniform deformations due to the influence of optical turbulence. Typical approaches for turbulence mitigation follow averaging or dewarping techniques. Although these methods can reduce the turbulence, they distort the independently moving objects, which can often be of great interest. 
In this paper, we address the novel problem of simultaneous turbulence mitigation and moving object detection. We propose a novel three-term low-rank matrix decomposition approach in which we decompose the turbulence sequence into three components: the background, the turbulence, and the object. We simplify this extremely difficult problem into a minimization of nuclear norm, Frobenius norm, and l1 norm. Our method is based on two observations: First, the turbulence causes dense and Gaussian noise and therefore can be captured by Frobenius norm, while the moving objects are sparse and thus can be captured by l1 norm. Second, since the object's motion is linear and intrinsically different from the Gaussian-like turbulence, a Gaussian-based turbulence model can be employed to enforce an additional constraint on the search space of the minimization. We demonstrate the robustness of our approach on challenging sequences which are significantly distorted with atmospheric turbulence and include extremely tiny moving objects.", "pmid": "22529321", "title": "Simultaneous video stabilization and moving object detection in turbulence."}, {"journal": "IEEE transactions on bio-medical engineering", "meshMajor": ["Algorithms", "Artificial Intelligence", "Atrial Fibrillation", "Diagnosis, Computer-Assisted", "Electrocardiography", "Europe", "Humans", "Pattern Recognition, Automated", "Principal Component Analysis", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2006", "abstractText": "Analysis of atrial rhythm is important in the treatment and management of patients with atrial fibrillation. Several algorithms exist for extracting the atrial signal from the electrocardiogram (ECG) in atrial fibrillation, but there are few reports on how well these techniques are able to recover the atrial signal. We assessed and compared three algorithms for extracting the atrial signal from the 12-lead ECG. 
The 12-lead ECGs of 30 patients in atrial fibrillation were analyzed. Atrial activity was extracted by three algorithms, Spatiotemporal QRST cancellation (STC), principal component analysis (PCA), and independent component analysis (ICA). The amplitude and frequency characteristics of the extracted atrial signals were compared between algorithms and against reference data. Mean (standard deviation) amplitude of QRST segments of V1 was 0.99 (0.54) mV, compared to 0.18 (0.11) mV (STC), 0.19 (0.13) mV (PCA), and 0.29 (0.22) mV (ICA). Hence, for all algorithms there were significant reductions in the amplitude of the ventricular activity compared with that in V1. Reference atrial signal amplitude in V1 was 0.18 (0.11) mV, compared to 0.17 (0.10) mV (STC), 0.12 (0.09) mV (PCA), and 0.18 (0.13) mV (ICA) in the extracted atrial signals. PCA tended to attenuate the atrial signal in these segments. There were no significant differences for any of the algorithms when comparing the amplitude of the reference atrial signal with that of the extracted atrial signals in segments in which ventricular activity had been removed. There were no significant differences between algorithms in the frequency characteristics of the extracted atrial signals. There were discrepancies in amplitude and frequency characteristics of the atrial signal in only a few cases resulting from notable residual ventricular activity for PCA and ICA algorithms. In conclusion, the extracted atrial signals from these algorithms exhibit very similar amplitude and frequency characteristics. 
Users of these algorithms should be observant of residual ventricular activities which can affect the analysis of the fibrillatory waveform in clinical practice.", "pmid": "16485765", "title": "Comparison of atrial signal extraction algorithms in 12-lead ECGs with atrial fibrillation."}, {"journal": "IEEE transactions on information technology in biomedicine : a publication of the IEEE Engineering in Medicine and Biology Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Binding Sites", "Computer Simulation", "Enzyme Activation", "Models, Chemical", "Peptides", "Protein Binding", "Sequence Analysis, Protein", "Viral Nonstructural Proteins"], "year": "2007", "abstractText": "Although various machine learning approaches have been used for predicting protease cleavage sites, constructing a probabilistic model for these tasks is still challenging. This paper proposes a novel algorithm termed as a probabilistic peptide machine where estimating probability density functions and constructing a classifier for predicting protease cleavage sites are combined into one process. The simulation based on experimentally determined Hepatitis C virus (HCV) protease cleavage data has demonstrated the success of this new algorithm.", "pmid": "17912976", "title": "A probabilistic peptide machine for predicting hepatitis C virus protease cleavage sites."}, {"journal": "Journal of chemical information and modeling", "meshMajor": ["Algorithms", "Artificial Intelligence", "Flavonoids", "Genetics", "Nonlinear Dynamics", "Protein Binding", "Quantitative Structure-Activity Relationship", "Receptors, GABA-A", "Regression Analysis"], "year": "2009", "abstractText": "Several studies were conducted in past years which used the evolutionary process of Genetic Algorithms for optimizing the Support Vector Regression parameter values although, however, few of them were devoted to the simultaneously optimization of the type of kernel function involved in the established model. 
The present work introduces a new hybrid genetic-based Support Vector Regression approach, whose statistical quality and predictive capability is afterward analyzed and compared to other standard chemometric techniques, such as Partial Least Squares, Back-Propagation Artificial Neural Networks, and Support Vector Machines based on Cross-Validation. For this purpose, we employ a data set of experimentally determined binding affinity constants toward the benzodiazepine binding site of the GABA (A) receptor complex on 78 flavonoid ligands.", "pmid": "19492793", "title": "New hybrid genetic based Support Vector Regression as QSAR approach for analyzing flavonoids-GABA(A) complexes."}, {"journal": "Physical review. E, Statistical, nonlinear, and soft matter physics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Bayes Theorem", "Learning", "Mathematical Computing", "Models, Statistical", "Neural Networks, Computer", "Normal Distribution", "Thermodynamics"], "year": "2001", "abstractText": "We study the typical learning properties of the recently introduced soft margin classifiers (SMCs), learning realizable and unrealizable tasks, with the tools of statistical mechanics. We derive analytically the behavior of the learning curves in the regime of very large training sets. We obtain exponential and power laws for the decay of the generalization error towards the asymptotic value, depending on the task and on general characteristics of the distribution of stabilities of the patterns to be learned. The optimal learning curves of the SMCs, which give the minimal generalization error, are obtained by tuning the coefficient controlling the trade-off between the error and the regularization terms in the cost function. 
If the task is realizable by the SMC, the optimal performance is better than that of a hard margin support vector machine and is very close to that of a Bayesian classifier.", "pmid": "11580367", "title": "Statistical mechanics of learning with soft margin classifiers."}, {"journal": "Methods in molecular biology (Clifton, N.J.)", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Databases, Factual", "Proteins", "Quantitative Structure-Activity Relationship", "Small Molecule Libraries"], "year": "2011", "abstractText": "Support vector machine (SVM)-based selectivity searching has recently been introduced to identify compounds in virtual screening libraries that are not only active for a target protein, but also selective for this target over a closely related member of the same protein family. In simulated virtual screening calculations, SVM-based strategies termed preference ranking and one-versus-all ranking were successfully applied to rank a database and enrich high-ranking positions with selective compounds while removing nonselective molecules from high ranks. In contrast to the original SVM approach developed for binary classification, these strategies enable learning from more than two classes, considering that distinguishing between selective, promiscuously active, and inactive compounds gives rise to a three-class prediction problem. In this chapter, we describe the extension of the one-versus-all strategy to four training classes. 
Furthermore, we present an adaptation of the preference ranking strategy that leads to higher recall of selective compounds than previously investigated approaches and is applicable in situations where the removal of nonselective compounds from high-ranking positions is not required.", "pmid": "20838983", "title": "Application of support vector machine-based ranking strategies to search for target-selective compounds."}, {"journal": "Anadolu kardiyoloji dergisi : AKD = the Anatolian journal of cardiology", "meshMajor": ["Algorithms", "Artificial Intelligence", "Case-Control Studies", "Coronary Angiography", "Coronary Artery Disease", "Female", "Humans", "Image Interpretation, Computer-Assisted", "Male", "Middle Aged", "Neural Networks, Computer", "Predictive Value of Tests", "Prognosis", "Reproducibility of Results", "Retrospective Studies", "Sensitivity and Specificity"], "year": "2008", "abstractText": "OBJECTIVE: Eight different learning algorithms used for creating artificial neural network (ANN) models and the different ANN models in the prediction of coronary artery disease (CAD) are introduced.METHODS: This work was carried out as a retrospective case-control study. Overall, 124 consecutive patients who had been diagnosed with CAD by coronary angiography (at least 1 coronary stenosis > 50% in major epicardial arteries) were enrolled in the work. Angiographically, the 113 people (group 2) with normal coronary arteries were taken as control subjects. Multi-layered perceptrons ANN architecture were applied. The ANN models trained with different learning algorithms were performed in 237 records, divided into training (n=171) and testing (n=66) data sets. 
The performance of prediction was evaluated by sensitivity, specificity and accuracy values based on standard definitions.RESULTS: The results have demonstrated that ANN models trained with eight different learning algorithms are promising because of high (greater than 71%) sensitivity, specificity and accuracy values in the prediction of CAD. Accuracy, sensitivity and specificity values varied between 83.63%-100%, 86.46%-100% and 74.67%-100% for training, respectively. For testing, the values were more than 71% for sensitivity, 76% for specificity and 81% for accuracy.CONCLUSIONS: It may be proposed that the use of different learning algorithms other than backpropagation and larger sample sizes can improve the performance of prediction. The proposed ANN models trained with these learning algorithms could be used a promising approach for predicting CAD without the need for invasive diagnostic methods and could help in the prognostic clinical decision.", "pmid": "18676299", "title": "Predicting coronary artery disease using different artificial neural network models."}, {"journal": "PloS one", "meshMajor": ["Algorithms", "Artificial Intelligence", "Humans", "Imaging, Three-Dimensional", "Probability", "Signal Processing, Computer-Assisted", "Signal-To-Noise Ratio"], "year": "2018", "abstractText": "This paper presents a robust 3D point cloud registration algorithm based on bidirectional Maximum Correntropy Criterion (MCC). Comparing with traditional registration algorithm based on the mean square error (MSE), using the MCC is superior in dealing with complex registration problem with non-Gaussian noise and large outliers. Since the MCC is considered as a probability measure which weights the corresponding points for registration, the noisy points are penalized. Moreover, we propose to use bidirectional measures which can maximum the overlapping parts and avoid the registration result being trapped into a local minimum. 
Both of these strategies can better apply the information theory method to the point cloud registration problem, making the algorithm more robust. In the process of implementation, we integrate the fixed-point optimization technique based on the iterative closest point algorithm, resulting in the correspondence and transformation parameters that are solved iteratively. The comparison experiments under noisy conditions with related algorithms have demonstrated good performance of the proposed algorithm.", "pmid": "29799864", "title": "Robust 3D point cloud registration based on bidirectional Maximum Correntropy Criterion."}, {"journal": "Optics express", "meshMajor": ["Algorithms", "Artificial Intelligence", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2008", "abstractText": "In this paper, we propose an occlusion removal method using sub-image block matching for improved recognition of partially occluded 3D objects in computational integral imaging (CII). When 3D plane images are reconstructed in CII, occlusion degrades the resolution of reconstructed images. To overcome this problem, we apply the sub-image transform to elemental image array (EIA) and these sub-images are employed using block matching method for depth estimation. Based on the estimated depth information, we remove the unknown occlusion. After completing the occlusion removal for all sub-images, we obtain the modified EIA without occlusion information through the inverse sub-image transform. Finally, the 3D plane images are reconstructed by using a computational integral imaging reconstruction method with the modified EIA. The proposed method can provide a substantial gain in terms of the visual quality of 3D reconstructed images. 
To show the usefulness of the proposed method we carry out some experiments and the results are presented.", "pmid": "18852735", "title": "Occlusion removal method of partially occluded 3D object using sub-image block matching in computational integral imaging."}, {"journal": "IEEE transactions on pattern analysis and machine intelligence", "meshMajor": ["Adult", "Analysis of Variance", "Animals", "Artificial Intelligence", "Brain", "Brain Mapping", "Cercopithecidae", "Corpus Callosum", "Female", "Humans", "Image Processing, Computer-Assisted", "Lateral Ventricles", "Magnetic Resonance Imaging, Cine", "Male", "Middle Aged", "Principal Component Analysis", "Skull"], "year": "2009", "abstractText": "Localized Components Analysis (LoCA) is a new method for describing surface shape variation in an ensemble of objects using a linear subspace of spatially localized shape components. In contrast to earlier methods, LoCA optimizes explicitly for localized components and allows a flexible trade-off between localized and concise representations, and the formulation of locality is flexible enough to incorporate properties such as symmetry. This paper demonstrates that LoCA can provide intuitive presentations of shape differences associated with sex, disease state, and species in a broad range of biomedical specimens, including human brain regions and monkey crania.", "pmid": "19542583", "title": "Exploration of shape variation using localized components analysis."}, {"journal": "Neural computation", "meshMajor": ["Algorithms", "Artificial Intelligence", "Entropy", "Humans", "Least-Squares Analysis"], "year": "2015", "abstractText": "Regression aims at estimating the conditional mean of output given input. However, regression is not informative enough if the conditional density is multimodal, heteroskedastic, and asymmetric. 
In such a case, estimating the conditional density itself is preferable, but conditional density estimation (CDE) is challenging in high-dimensional space. A naive approach to coping with high dimensionality is to first perform dimensionality reduction (DR) and then execute CDE. However, a two-step process does not perform well in practice because the error incurred in the first DR step can be magnified in the second CDE step. In this letter, we propose a novel single-shot procedure that performs CDE and DR simultaneously in an integrated way. Our key idea is to formulate DR as the problem of minimizing a squared-loss variant of conditional entropy, and this is solved using CDE. Thus, an additional CDE step is not needed after DR. We demonstrate the usefulness of the proposed method through extensive experiments on various data sets, including humanoid robot transition and computer art. ", "pmid": "25380340", "title": "Conditional density estimation with dimensionality reduction via squared-loss conditional entropy minimization."}, {"journal": "Neural networks : the official journal of the International Neural Network Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Feedback", "Likelihood Functions", "Markov Chains", "Neural Networks, Computer", "Robotics", "Time Factors"], "year": "2008", "abstractText": "This paper proposes a novel learning method for a mixture of recurrent neural network (RNN) experts model, which can acquire the ability to generate desired sequences by dynamically switching between experts. Our method is based on maximum likelihood estimation, using a gradient descent algorithm. This approach is similar to that used in conventional methods; however, we modify the likelihood function by adding a mechanism to alter the variance for each expert. The proposed method is demonstrated to successfully learn Markov chain switching among a set of 9 Lissajous curves, for which the conventional method fails. 
The learning performance, analyzed in terms of the generalization capability, of the proposed method is also shown to be superior to that of the conventional method. With the addition of a gating network, the proposed method is successfully applied to the learning of sensory-motor flows for a small humanoid robot as a realistic problem of time series prediction and generation.", "pmid": "18938059", "title": "A model for learning to segment temporal sequences, utilizing a mixture of RNN experts together with adaptive variance."}, {"journal": "Information processing in medical imaging : proceedings of the ... conference", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cerebral Cortex", "Diffusion Tensor Imaging", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Nerve Fibers, Myelinated", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2013", "abstractText": "Group neuroimaging studies of the cerebral cortex benefit from accurate, surface-based, cross-subject alignment for investigating brain architecture, function and connectivity. There is an increasing amount of high quality data available. However, establishing how different modalities correlate across groups remains an open research question. One reason for this is that the current methods for registration, based on cortical folding, provide sub-optimal alignment of some functional subregions of the brain. A more flexible framework is needed that will allow robust alignment of multiple modalities. We adapt the Fast Primal-Dual (Fast-PD) approach for discrete Markov Random Field (MRF) optimisation to spherical registration by reframing the deformation labels as a discrete set of rotations and propose a novel regularisation term, derived from the geodesic distance between rotation matrices. 
This formulation allows significant flexibility in the choice of similarity metric. To this end we propose a new multivariate cost function based on the discretisation of a graph-based mutual information measure. Results are presented for alignment driven by scalar metrics of curvature and myelination, and multivariate features derived from functional task performance. These experiments demonstrate the potential of this approach for improving the integration of complementary brain data sets in the future.", "pmid": "24683992", "title": "Multimodal surface matching: fast and generalisable cortical registration using discrete optimisation."}, {"journal": "IEEE transactions on neural networks", "meshMajor": ["Algorithms", "Artificial Intelligence", "Forecasting", "Humans", "Memory", "Neural Networks, Computer", "Protein Structure, Secondary", "Proteins", "Sequence Analysis, Protein"], "year": "2009", "abstractText": "Conventional recurrent neural networks (RNNs) have difficulties in learning long-term dependencies. To tackle this problem, we propose an architecture called segmented-memory recurrent neural network (SMRNN). A symbolic sequence is broken into segments and then presented as inputs to the SMRNN one symbol per cycle. The SMRNN uses separate internal states to store symbol-level context, as well as segment-level context. The symbol-level context is updated for each symbol presented for input. The segment-level context is updated after each segment. The SMRNN is trained using an extended real-time recurrent learning algorithm. We test the performance of SMRNN on the information latching problem, the \"two-sequence problem\" and the problem of protein secondary structure (PSS) prediction. Our implementation results indicate that SMRNN performs better on long-term dependency problems than conventional RNNs. 
Besides, we also theoretically analyze how the segmented memory of SMRNN helps learning long-term temporal dependencies and study the impact of the segment length.", "pmid": "19605323", "title": "Segmented-memory recurrent neural networks."}, {"journal": "BMC bioinformatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Data Mining"], "year": "2013", "abstractText": "BACKGROUND: Negation occurs frequently in scientific literature, especially in biomedical literature. It has previously been reported that around 13% of sentences found in biomedical research articles contain negation. Historically, the main motivation for identifying negated events has been to ensure their exclusion from lists of extracted interactions. However, recently, there has been a growing interest in negative results, which has resulted in negation detection being identified as a key challenge in biomedical relation extraction. In this article, we focus on the problem of identifying negated bio-events, given gold standard event annotations.RESULTS: We have conducted a detailed analysis of three open access bio-event corpora containing negation information (i.e., GENIA Event, BioInfer and BioNLP'09 ST), and have identified the main types of negated bio-events. We have analysed the key aspects of a machine learning solution to the problem of detecting negated events, including selection of negation cues, feature engineering and the choice of learning algorithm. Combining the best solutions for each aspect of the problem, we propose a novel framework for the identification of negated bio-events. We have evaluated our system on each of the three open access corpora mentioned above. 
The performance of the system significantly surpasses the best results previously reported on the BioNLP'09 ST corpus, and achieves even better results on the GENIA Event and BioInfer corpora, both of which contain more varied and complex events.CONCLUSIONS: Recently, in the field of biomedical text mining, the development and enhancement of event-based systems has received significant interest. The ability to identify negated events is a key performance element for these systems. We have conducted the first detailed study on the analysis and identification of negated bio-events. Our proposed framework can be integrated with state-of-the-art event extraction systems. The resulting systems will be able to extract bio-events with attached polarities from textual documents, which can serve as the foundation for more elaborate systems that are able to detect mutually contradicting bio-events.", "pmid": "23323936", "title": "Negated bio-events: analysis and identification."}, {"journal": "Studies in health technology and informatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Humans", "Neural Networks, Computer", "Odorants", "Smell", "Software"], "year": "2000", "abstractText": "For practical applications, artificial neural networks have to meet several requirements: Mainly they should learn quick, classify accurate and behave robust. Programs should be user-friendly and should not need the presence of an expert for fine tuning diverse learning parameters. The present paper demonstrates an approach using an oversized network topology, adaptive propagation (APROP), a modified error function, and averaging outputs of four networks described for the first time. As an example, signals from different semiconductor gas sensors of an electronic nose were classified. The electronic nose smelt different types of edible oil with extremely different a-priori-probabilities. The fully-specified neural network classifier fulfilled the above mentioned demands. 
The new approach will be helpful not only for classifying olfactory signals automatically but also in many other fields in medicine, e.g. in data mining from medical databases.", "pmid": "11187516", "title": "Artificial neural networks for classifying olfactory signals."}, {"journal": "Applied spectroscopy", "meshMajor": ["Air Pollutants", "Aircraft", "Algorithms", "Artificial Intelligence", "Environmental Monitoring", "Ethanol", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Signal Processing, Computer-Assisted", "Spectroscopy, Fourier Transform Infrared"], "year": "2003", "abstractText": "Methodology is developed for the automated detection of heated plumes of ethanol vapor with airborne passive Fourier transform infrared spectrometry. Positioned in a fixed-wing aircraft in a downward-looking mode, the spectrometer is used to detect ground sources of ethanol vapor from an altitude of 2000-3000 ft. Challenges to the use of this approach for the routine detection of chemical plumes include (1) the presence of a constantly changing background radiance as the aircraft flies, (2) the cost and complexity of collecting the data needed to train the classification algorithms used in implementing the plume detection, and (3) the need for rapid interferogram scans to minimize the ground area viewed per scan. To address these challenges, this work couples a novel ground-based data collection and training protocol with the use of signal processing and pattern recognition methods based on short sections of the interferogram data collected by the spectrometer. In the data collection, heated plumes of ethanol vapor are released from a portable emission stack and viewed by the spectrometer from ground level against a synthetic background designed to simulate a terrestrial radiance source. Classifiers trained with these data are subsequently tested with airborne data collected over a period of 2.5 years. 
Two classifier architectures are compared in this work: support vector machines (SVM) and piecewise linear discriminant analysis (PLDA). When applied to the airborne test data, the SVM classifiers perform best, failing to detect ethanol in only 8% of the cases in which it is present. False detections occur at a rate of less than 0.5%. The classifier performs well in spite of differences between the backgrounds associated with the ground-based and airborne data collections and the instrumental drift arising from the long time span of the data collection. Further improvements in classification performance are judged to require increased sophistication in the ground-based data collection in order to provide a better match to the infrared backgrounds observed from the air.", "pmid": "14658159", "title": "Remote detection of heated ethanol plumes by airborne passive Fourier transform infrared spectrometry."}, {"journal": "Neural networks : the official journal of the International Neural Network Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Feedback", "Humans", "Neural Networks, Computer"], "year": "2007", "abstractText": "An important drawback of many artificial neural networks (ANN) is their lack of explanation capability [Andrews, R., Diederich, J., & Tickle, A. B. (1996). A survey and critique of techniques for extracting rules from trained artificial neural networks. Knowledge-Based Systems, 8, 373-389]. This paper starts with a survey of algorithms which attempt to explain the ANN output. We then present HYPINV, a new explanation algorithm which relies on network inversion; i.e. calculating the ANN input which produces a desired output. HYPINV is a pedagogical algorithm, that extracts rules, in the form of hyperplanes. It is able to generate rules with arbitrarily desired fidelity, maintaining a fidelity-complexity tradeoff. 
To our knowledge, HYPINV is the only pedagogical rule extraction method, which extracts hyperplane rules from continuous or binary attribute neural networks. Different network inversion techniques, involving gradient descent as well as an evolutionary algorithm, are presented. An information theoretic treatment of rule extraction is presented. HYPINV is applied to example synthetic problems, to a real aerospace problem, and compared with similar algorithms using benchmark problems.", "pmid": "17029713", "title": "Neural network explanation using inversion."}, {"journal": "BioTechniques", "meshMajor": ["Animals", "Artificial Intelligence", "Base Sequence", "DNA, Complementary", "False Positive Reactions", "Gene Library", "Genetic Testing", "Human Genome Project", "Humans", "Molecular Sequence Data", "Neural Networks, Computer", "Predictive Value of Tests"], "year": "1996", "abstractText": "A low order neural network-based filter was designed as a rapid screening agent for single-spanning transmembrane regions in an integrated informatics system. A rapid screening algorithm was seen as a compromise between costly structure-specific techniques and simple rules that gave a high false-positive rate for cDNA. The filter was applied to a library of 2123 anonymous cDNA sequences, which resulted in 61 detections. Evaluation of the detections with two other dissimilar computer prediction algorithms yielded strong transmembrane predictions for 15 of the detections, while 8 of the detections resulted in a definitive negative result. Homology searches performed on the sequences with detection reports yielded 13 homologs in the predicted reading frame, four of which are membrane associated.", "pmid": "8969840", "title": "High-throughput cDNA screening utilizing a low order neural network filter."}, {"journal": "Medical image computing and computer-assisted intervention : MICCAI ... 
International Conference on Medical Image Computing and Computer-Assisted Intervention", "meshMajor": ["Algorithms", "Artificial Intelligence", "Feasibility Studies", "Femoral Fractures", "Fracture Fixation, Internal", "Humans", "Radiographic Image Enhancement", "Radiographic Image Interpretation, Computer-Assisted", "Reproducibility of Results", "Sensitivity and Specificity", "Surgery, Computer-Assisted", "Tomography, X-Ray Computed", "Treatment Outcome"], "year": "2007", "abstractText": "An algorithm to globally register multiple 3D data sets (point sets) within a general reference frame is proposed. The algorithm uses the Unscented Kalman Filter algorithm to simultaneously compute the registration transformations that map the data sets together, and to calculate the variances of the registration parameters. The data sets are either randomly generated, or collected from a set of fractured bone phantoms using Computed Tomography (CT) images. The algorithm robustly converges for isotropic Gaussian noise that could have perturbed the point coordinates in the data sets. It is also computationally efficient, and enables real-time global registration of multiple data sets, with applications in computer-assisted orthopaedic trauma surgery.", "pmid": "18044659", "title": "Global registration of multiple point sets: feasibility and applications in multi-fragment fracture fixation."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Artificial Intelligence", "Automation", "Humans", "Imaging, Three-Dimensional", "Liver", "Support Vector Machine", "Tomography, X-Ray Computed"], "year": "2012", "abstractText": "This paper presents a semi-automatic approach to segmentation of liver parenchyma from 3D computed tomography (CT) images. 
Specifically, liver segmentation is formalized as a pattern recognition problem, where a given voxel is to be assigned a correct label - either in a liver or a non-liver class. Each voxel is associated with a feature vector that describes image textures. Based on the generated features, an Extreme Learning Machine (ELM) classifier is employed to perform the voxel classification. Since preliminary voxel segmentation tends to be less accurate at the boundary, and there are other non-liver tissue voxels with similar texture characteristics as liver parenchyma, morphological smoothing and 3D level set refinement are applied to enhance the accuracy of segmentation. Our approach is validated on a set of CT data. The experiment shows that the proposed approach with ELM has the reasonably good performance for liver parenchyma segmentation. It demonstrates a comparable result in accuracy of classification but with a much faster training and classification speed compared with support vector machine (SVM).", "pmid": "23366744", "title": "A semi-automatic approach to the segmentation of liver parenchyma from 3D CT images with Extreme Learning Machine."}, {"journal": "Computational intelligence and neuroscience", "meshMajor": ["Algorithms", "Artificial Intelligence", "Internet", "Models, Theoretical", "Software", "Travel"], "year": "2016", "abstractText": "Rapid growth of web and its applications has created a colossal importance for recommender systems. Being applied in various domains, recommender systems were designed to generate suggestions such as items or services based on user interests. Basically, recommender systems experience many issues which reflects dwindled effectiveness. Integrating powerful data management techniques to recommender systems can address such issues and the recommendations quality can be increased significantly. 
Recent research on recommender systems reveals an idea of utilizing social network data to enhance traditional recommender system with better prediction and improved accuracy. This paper expresses views on social network data based recommender systems by considering usage of various recommendation algorithms, functionalities of systems, different types of interfaces, filtering techniques, and artificial intelligence techniques. After examining the depths of objectives, methodologies, and data sources of the existing models, the paper helps anyone interested in the development of travel recommendation systems and facilitates future research direction. We have also proposed a location recommendation system based on social pertinent trust walker (SPTW) and compared the results with the existing baseline random walk models. Later, we have enhanced the SPTW model for group of users recommendations. The results obtained from the experiments have been presented. ", "pmid": "27069468", "title": "A Collaborative Location Based Travel Recommendation System through Enhanced Rating Prediction for the Group of Users."}, {"journal": "Sensors (Basel, Switzerland)", "meshMajor": ["Algorithms", "Artificial Intelligence", "Biometry", "Eyeglasses", "Face", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2013", "abstractText": "This paper presents a system that automatically extracts the position of the eyeglasses and the accurate shape and size of the frame lenses in facial images. The novelty brought by this paper consists in three key contributions. The first one is an original model for representing the shape of the eyeglasses lens, using Fourier descriptors. The second one is a method for generating the search space starting from a finite, relatively small number of representative lens shapes based on Fourier morphing. 
Finally, we propose an accurate lens contour extraction algorithm using a multi-stage Monte Carlo sampling technique. Multiple experiments demonstrate the effectiveness of our approach. ", "pmid": "24152926", "title": "Eyeglasses lens contour extraction from facial images using an efficient shape description."}, {"journal": "BMC bioinformatics", "meshMajor": ["Abstracting and Indexing", "Algorithms", "Artificial Intelligence", "Databases, Factual", "Information Storage and Retrieval", "Natural Language Processing", "Semantics", "Software", "Terminology as Topic", "Vocabulary, Controlled"], "year": "2006", "abstractText": "BACKGROUND: We study the adaptation of Link Grammar Parser to the biomedical sublanguage with a focus on domain terms not found in a general parser lexicon. Using two biomedical corpora, we implement and evaluate three approaches to addressing unknown words: automatic lexicon expansion, the use of morphological clues, and disambiguation using a part-of-speech tagger. We evaluate each approach separately for its effect on parsing performance and consider combinations of these approaches.RESULTS: In addition to a 45% increase in parsing efficiency, we find that the best approach, incorporating information from a domain part-of-speech tagger, offers a statistically significant 10% relative decrease in error.CONCLUSION: When available, a high-quality domain part-of-speech tagger is the best solution to unknown word issues in the domain adaptation of a general parser. In the absence of such a resource, surface clues can provide remarkably good coverage and performance when tuned to the domain. The adapted parser is available under an open-source license.", "pmid": "17134475", "title": "Lexical adaptation of link grammar to the biomedical sublanguage: a comparative evaluation of three approaches."}, {"journal": "Proceedings. 
Symposium on Computer Applications in Medical Care", "meshMajor": ["Artificial Intelligence", "Christianity", "Cross Infection", "Diagnosis, Computer-Assisted", "Expert Systems", "Hospitals, Religious", "Humans", "Infant, Newborn", "Reproducibility of Results", "Sensitivity and Specificity", "Utah"], "year": "1994", "abstractText": "Hospital-acquired infections are responsible for an increase in patient mortality and costs. Their detection is essential to permit better infection control. We developed an expert system specifically to detect infections in pediatric patients. The expert system is implemented at LDS Hospital that has a level three newborn intensive care unit and well baby units. We describe how the knowledge base of the expert system was developed, implemented, and validated in a retrospective study. The results of the system were compared to manual reviewer results. The expert system had a sensitivity of 84.5% and specificity of 92.8% in detecting hospital-acquired infections when compared to a physician reviewer. The Cohen's kappa between the expert system and the physician reviewer was 0.62 (p < .001).", "pmid": "7950013", "title": "Computerized detection of nosocomial infections in newborns."}, {"journal": "Medical image computing and computer-assisted intervention : MICCAI ... International Conference on Medical Image Computing and Computer-Assisted Intervention", "meshMajor": ["Algorithms", "Artificial Intelligence", "Brain", "Connectome", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Magnetic Resonance Imaging", "Mental Recall", "Nerve Net", "Pattern Recognition, Automated", "Recognition, Psychology", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2014", "abstractText": "We present a Riemannian approach for classifying fMRI connectivity patterns before and after intervention in longitudinal studies. 
A fundamental difficulty with using connectivity as features is that covariance matrices live on the positive semi-definite cone, which renders their elements inter-related. The implicit independent feature assumption in most classifier learning algorithms is thus violated. In this paper, we propose a matrix whitening transport for projecting the covariance estimates onto a common tangent space to reduce the statistical dependencies between their elements. We show on real data that our approach provides significantly higher classification accuracy than directly using Pearson's correlation. We further propose a non-parametric scheme for identifying significantly discriminative connections from classifier weights. Using this scheme, a number of neuroanatomically meaningful connections are found, whereas no significant connections are detected with pure permutation testing.", "pmid": "25485405", "title": "Transport on Riemannian manifold for functional connectivity-based classification."}, {"journal": "IEEE transactions on pattern analysis and machine intelligence", "meshMajor": ["Algorithms", "Artificial Intelligence", "Biometry", "Discriminant Analysis", "Gait", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Whole Body Imaging"], "year": "2007", "abstractText": "The traditional image representations are not suited to conventional classification methods, such as the linear discriminant analysis (LDA), because of the under sample problem (USP): the dimensionality of the feature space is much higher than the number of training samples. Motivated by the successes of the two dimensional LDA (2DLDA) for face recognition, we develop a general tensor discriminant analysis (GTDA) as a preprocessing step for LDA. 
The benefits of GTDA compared with existing preprocessing methods, e.g., principal component analysis (PCA) and 2DLDA, include 1) the USP is reduced in subsequent classification by, for example, LDA; 2) the discriminative information in the training tensors is preserved; and 3) GTDA provides stable recognition rates because the alternating projection optimization algorithm to obtain a solution of GTDA converges, while that of 2DLDA does not. We use human gait recognition to validate the proposed GTDA. The averaged gait images are utilized for gait representation. Given the popularity of Gabor function based image decompositions for image understanding and object recognition, we develop three different Gabor function based image representations: 1) the GaborD representation is the sum of Gabor filter responses over directions, 2) GaborS is the sum of Gabor filter responses over scales, and 3) GaborSD is the sum of Gabor filter responses over scales and directions. The GaborD, GaborS and GaborSD representations are applied to the problem of recognizing people from their averaged gait images.A large number of experiments were carried out to evaluate the effectiveness (recognition rate) of gait recognition based on first obtaining a Gabor, GaborD, GaborS or GaborSD image representation, then using GDTA to extract features and finally using LDA for classification. The proposed methods achieved good performance for gait recognition based on image sequences from the USF HumanID Database. 
Experimental comparisons are made with nine state of the art classification methods in gait recognition.", "pmid": "17699917", "title": "General tensor discriminant analysis and gabor features for gait recognition."}, {"journal": "Neural networks : the official journal of the International Neural Network Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cluster Analysis", "Computer Simulation", "Computing Methodologies", "Fuzzy Logic", "Image Interpretation, Computer-Assisted", "Models, Statistical", "Neural Networks, Computer", "Pattern Recognition, Automated", "Software", "Software Validation"], "year": "2007", "abstractText": "This paper focuses on the evolution of Fuzzy ARTMAP neural network classifiers, using genetic algorithms, with the objective of improving generalization performance (classification accuracy of the ART network on unseen test data) and alleviating the ART category proliferation problem (the problem of creating more than necessary ART network categories to solve a classification problem). We refer to the resulting architecture as GFAM. We demonstrate through extensive experimentation that GFAM exhibits good generalization and is of small size (creates few ART categories), while consuming reasonable computational effort. In a number of classification problems, GFAM produces the optimal classifier. Furthermore, we compare the performance of GFAM with other competitive ARTMAP classifiers that have appeared in the literature and addressed the category proliferation problem in ART. We illustrate that GFAM produces improved results over these architectures, as well as other competitive classifiers.", "pmid": "17851035", "title": "GFAM: evolving Fuzzy ARTMAP neural networks."}, {"journal": "Proceedings. 
Symposium on Computer Applications in Medical Care", "meshMajor": ["Algorithms", "Artificial Intelligence", "Classification", "Databases, Factual", "Decision Support Techniques", "Female", "Humans", "Infant, Newborn", "Infant, Premature", "Male", "Obstetric Labor, Premature", "Pregnancy", "Risk Assessment", "Software"], "year": "1994", "abstractText": "Prediction of preterm birth is a poorly understood domain. The existing manual methods of assessment of preterm birth are 17%-38% accurate. The machine learning system LERS was used for three different datasets about pregnant women. Rules induced by LERS were used in conjunction with a classification scheme of LERS, based on \"bucket brigade algorithm\" of genetic algorithms and enhanced by partial matching. The resulting prediction of preterm birth in new, unseen cases is much more accurate (68%-90%).", "pmid": "7950021", "title": "Improving prediction of preterm birth using a new classification scheme and rule induction."}, {"journal": "IEEE transactions on medical imaging", "meshMajor": ["Algorithms", "Artificial Intelligence", "Data Interpretation, Statistical", "Databases, Factual", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Likelihood Functions", "Positron-Emission Tomography", "Reproducibility of Results", "Sensitivity and Specificity"], "year": "2006", "abstractText": "We derive computationally efficient methods for the estimation of the mean and variance properties of penalized likelihood dynamic positron emission tomography (PET) images. This allows us to predict the accuracy of reconstructed activity estimates and to compare reconstruction algorithms theoretically. 
We combine a bin-mode approach in which data is modeled as a collection of independent Poisson random variables at each spatiotemporal bin with the space-time separabilities in the imaging equation and penalties to derive rapidly computable analytic mean and variance approximations. We use these approximations to compare bias/variance properties of our dynamic PET image reconstruction algorithm with those of multiframe static PET reconstructions.", "pmid": "16398413", "title": "Mean and covariance properties of dynamic PET reconstructions from list-mode data."}, {"journal": "International journal of neural systems", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Electronic Data Processing", "Fuzzy Logic", "Models, Neurological", "Neural Networks, Computer", "Neurolinguistic Programming", "Nonlinear Dynamics", "Stochastic Processes", "Synapses"], "year": "2001", "abstractText": "An artificial neural network with a two-layer feedback topology and generalized recurrent neurons, for solving nonlinear discrete dynamic optimization problems, is developed. A direct method to assign the weights of neural networks is presented. The method is based on Bellmann's Optimality Principle and on the interchange of information which occurs during the synaptic chemical processing among neurons. The neural network based algorithm is an advantageous approach for dynamic programming due to the inherent parallelism of the neural networks; further it reduces the severity of computational problems that can occur in methods like conventional methods. 
Some illustrative application examples are presented to show how this approach works out including the shortest path and fuzzy decision making problems.", "pmid": "11852439", "title": "A biologically inspired neural network for dynamic programming."}, {"journal": "NeuroImage", "meshMajor": ["Aged", "Aged, 80 and over", "Artificial Intelligence", "Atrophy", "Brain", "Brain Mapping", "Computer Simulation", "Female", "Fourier Analysis", "Humans", "Image Processing, Computer-Assisted", "Imaging, Three-Dimensional", "Longitudinal Studies", "Magnetic Resonance Imaging", "Male", "Middle Aged", "Multivariate Analysis", "Nonlinear Dynamics", "Numerical Analysis, Computer-Assisted", "Reproducibility of Results", "Software"], "year": "2004", "abstractText": "A high-dimensional shape transformation posed in a mass-preserving framework is used as a morphological signature of a brain image. Population differences with complex spatial patterns are then determined by applying a nonlinear support vector machine (SVM) pattern classification method to the morphological signatures. Significant reduction of the dimensionality of the morphological signatures is achieved via wavelet decomposition and feature reduction methods. Applying the method to MR images with simulated atrophy shows that the method can correctly detect subtle and spatially complex atrophy, even when the simulated atrophy represents only a 5% variation from the original image. Applying this method to actual MR images shows that brains can be correctly determined to be male or female with a successful classification rate of 97%, using the leave-one-out method. This proposed method also shows a high classification rate for old adults' age classification, even under difficult test scenarios. 
The main characteristic of the proposed methodology is that, by applying multivariate pattern classification methods, it can detect subtle and spatially complex patterns of morphological group differences which are often not detectable by voxel-based morphometric methods, because these methods analyze morphological measurements voxel-by-voxel and do not consider the entirety of the data simultaneously.", "pmid": "14741641", "title": "Morphological classification of brains via high-dimensional shape transformations and machine learning methods."}, {"journal": "Talanta", "meshMajor": ["Algorithms", "Animals", "Artificial Intelligence", "Discriminant Analysis", "Disease", "Gene Expression Profiling", "Humans", "Neoplasms", "Oligonucleotide Array Sequence Analysis", "Probability"], "year": "2009", "abstractText": "One problem with discriminant analysis of microarray data is representation of each sample by a large number of genes that are possibly irrelevant, insignificant or redundant. Methods of variable selection are, therefore, of great significance in microarray data analysis. To circumvent the problem, a new gene mining approach is proposed based on the similarity between probability density functions on each gene for the class of interest with respect to the others. This method allows the ascertainment of significant genes that are informative for discriminating each individual class rather than maximizing the separability of all classes. Then one can select genes containing important information about the particular subtypes of diseases. Based on the mined significant genes for individual classes, a support vector machine with local kernel transform is constructed for the classification of different diseases. The combination of the gene mining approach with support vector machine is demonstrated for cancer classification using two public data sets. 
The results reveal that significant genes are identified for each cancer, and the classification model shows satisfactory performance in training and prediction for both data sets.", "pmid": "19559875", "title": "Variable selection using probability density function similarity for support vector machine classification of high-dimensional microarray data."}, {"journal": "BMC bioinformatics", "meshMajor": ["Algorithms", "Artificial Intelligence", "Database Management Systems", "Databases, Bibliographic", "Information Storage and Retrieval", "Natural Language Processing", "Periodicals as Topic", "Software", "Vocabulary, Controlled"], "year": "2008", "abstractText": "BACKGROUND: Despite increasing interest in applying Natural Language Processing (NLP) to biomedical text, whether this technology can facilitate tasks such as database curation remains unclear.RESULTS: PaperBrowser is the first NLP-powered interface that was developed under a user-centered approach to improve the way in which FlyBase curators navigate an article. In this paper, we first discuss how observing curators at work informed the design and evaluation of PaperBrowser. Then, we present how we appraise PaperBrowser's navigational functionalities in a user-based study using a text highlighting task and evaluation criteria of Human-Computer Interaction. Our results show that PaperBrowser reduces the amount of interactions between two highlighting events and therefore improves navigational efficiency by about 58% compared to the navigational mechanism that was previously available to the curators. 
Moreover, PaperBrowser is shown to provide curators with enhanced navigational utility by over 74% irrespective of the different ways in which they highlight text in the article.CONCLUSION: We show that state-of-the-art performance in certain NLP tasks such as Named Entity Recognition and Anaphora Resolution can be combined with the navigational functionalities of PaperBrowser to support curation quite successfully.", "pmid": "18410678", "title": "Natural language processing in aid of FlyBase curators."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Aged", "Aged, 80 and over", "Aging", "Algorithms", "Artificial Intelligence", "Brain", "Cognitive Dysfunction", "Diffusion Tensor Imaging", "Humans", "Linear Models", "Male", "Normal Distribution"], "year": "2013", "abstractText": "In this study, we employed diffusion tensor imaging (DTI) to construct brain structural network and then derive the connection matrices from 96 healthy elderly subjects. The correlation analysis between these topological properties of network based on graph theory and the Cognitive Abilities Screening Instrument (CASI) index were processed to extract the significant network characteristics. These characteristics were then integrated to estimate the models by various machine-learning algorithms to predict user's cognitive performance. From the results, linear regression model and Gaussian processes model showed presented better abilities with lower mean absolute errors of 5.8120 and 6.25 to predict the cognitive performance respectively. Moreover, these extracted topological properties of brain structural network derived from DTI also could be regarded as the bio-signatures for further evaluation of brain degeneration in healthy aged and early diagnosis of mild cognitive impairment (MCI). 
", "pmid": "24109740", "title": "A prediction model for cognitive performance in health ageing using diffusion tensor imaging with graph theory."}, {"journal": "Medical image computing and computer-assisted intervention : MICCAI ... International Conference on Medical Image Computing and Computer-Assisted Intervention", "meshMajor": ["Algorithms", "Artificial Intelligence", "Cluster Analysis", "Humans", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Pattern Recognition, Automated", "Reproducibility of Results", "Sensitivity and Specificity", "Subtraction Technique"], "year": "2009", "abstractText": "Segmentation of anatomical objects is always a fundamental task for various clinical applications. Although many automatic segmentation methods have been designed to segment specific anatomical objects in a given imaging modality, a more generic solution that is directly applicable to different imaging modalities and different deformable surfaces is desired, if attainable. In this paper, we propose such a framework, which learns from examples the spatially adaptive appearance and shape of a 3D surface (either open or closed). The application to a new object/surface in a new modality requires only the annotation of training examples. Key contributions of our method include: (1) an automatic clustering and learning algorithm to capture the spatial distribution of appearance similarities/variations on the 3D surface. More specifically, the model vertices are hierarchically clustered into a set of anatomical primitives (sub-surfaces) using both geometric and appearance features. The appearance characteristics of each learned anatomical primitive are then captured through a cascaded boosting learning method. (2) To effectively incorporate non-Gaussian shape priors, we cluster the training shapes in order to build multiple statistical shape models. 
(3) To our best knowledge, this is the first time the same segmentation algorithm has been directly employed in two very diverse applications: (a) Liver segmentation (closed surface) in PET-CT, in which CT has very low-resolution and low-contrast; (b) Distal femur (condyle) surface (open surface) segmentation in MRI.", "pmid": "20426213", "title": "Cross modality deformable segmentation using hierarchical clustering and learning."}, {"journal": "IEEE transactions on systems, man, and cybernetics. Part B, Cybernetics : a publication of the IEEE Systems, Man, and Cybernetics Society", "meshMajor": ["Algorithms", "Artificial Intelligence", "Computer Simulation", "Computer Systems", "Decision Support Techniques", "Models, Theoretical", "Motion", "Pattern Recognition, Automated", "Robotics"], "year": "2006", "abstractText": "This paper presents a simple yet efficient dynamic-programming (DP) shortest path algorithm for real-time collision-free robot-path planning applicable to situations in which targets and barriers are permitted to move. The algorithm works in real time and requires no prior knowledge of target or barrier movements. In the case that the barriers are stationary, this paper proves that this algorithm always results in the robot catching the target, provided it moves at a greater speed than the target, and the dynamic-system update frequency is sufficiently large. Like most robot-path-planning approaches, the environment is represented by a topologically organized map. Each grid point on the map has only local connections to its neighboring grid points from which it receives information in real time. The information stored at each point is a current estimate of the distance to the nearest target and the neighbor from which this distance was determined. 
Updating the distance estimate at each grid point is done using only the information gathered from the point's neighbors, that is, each point can be considered an independent processor, and the order in which grid points are updated is not determined based on global knowledge of the current distances at each point or the previous history of each point. The robot path is determined in real time completely from the information at the robot's current grid-point location. The computational effort to update each point is minimal, allowing for rapid propagation of the distance information outward along the grid from the target locations. In the static situation, where both the targets and the barriers do not move, this algorithm is a DP solution to the shortest path problem, but is restricted by lack of global knowledge. In this case, this paper proves that the dynamic system converges in a small number of iterations to a state where the minimal distance to a target is recorded at each grid point and shows that this robot-path-planning algorithm can be made to always choose an optimal path. The effectiveness of this algorithm is demonstrated through a number of simulations.", "pmid": "16903362", "title": "An efficient dynamic system for real-time robot-path planning."}, {"journal": "IEEE transactions on pattern analysis and machine intelligence", "meshMajor": ["Algorithms", "Artificial Intelligence", "Bayes Theorem", "Image Enhancement", "Image Interpretation, Computer-Assisted", "Imaging, Three-Dimensional", "Information Storage and Retrieval", "Models, Statistical", "Pattern Recognition, Automated"], "year": "2006", "abstractText": "In this paper, the optimizations of three fundamental components of image understanding: segmentation/annotation, 3D sensing (stereo) and 3D fitting, are posed and integrated within a Bayesian framework. This approach benefits from recent advances in statistical learning which have resulted in greatly improved flexibility and robustness. 
The first two components produce annotation (region labeling) and depth maps for the input images, while the third module integrates and resolves the inconsistencies between region labels and depth maps to fit most likely 3D models. To illustrate the application of these ideas, we have focused on the difficult problem of fitting individual tree models to tree stands which is a major challenge for vision-based forestry inventory systems.", "pmid": "16640256", "title": "Component optimization for image understanding: a Bayesian approach."}, {"journal": "[Rinsho ketsueki] The Japanese journal of clinical hematology", "meshMajor": ["Artificial Intelligence", "Deep Learning", "Humans", "Neural Networks, Computer"], "year": "2020", "abstractText": "Artificial intelligence (AI) has been applied widely in medicine. For example, deep neural network-based deep learning is particularly effective for pattern recognition in static medical images. Additionally, dynamic time series data are analysed ubiquitously in biology and medicine, as in the application of BCR-ABL International Scale time series data measured from CML patients treated with tyrosine-kinase inhibitors. Nonlinear data analyses, rather than conventional deep learning, can be more powerful for this type of dynamic disease information. Here, I introduce our mathematical approaches that are applicable for disease dynamics, such as dynamical network biomarkers (DNB) and randomly distributed embedding (RDE), as examples of nonlinear data analyses. 
I also discuss the availability of neuroinspired and neuromorphic hardware systems, which we are developing for potential use in next-generation AI.", "pmid": "32507823", "title": "[Present and future perspectives of artificial intelligence: examples of mathematical approaches for analysis of disease dynamics]."}, {"journal": "Cell", "meshMajor": ["Algorithms", "Animals", "Artificial Intelligence", "Embryoid Bodies", "Embryonic Stem Cells", "Genomics", "Harringtonines", "High-Throughput Nucleotide Sequencing", "Kinetics", "Mice", "Open Reading Frames", "Peptide Chain Initiation, Translational", "Protein Biosynthesis", "RNA", "Ribosomes", "Sequence Analysis, RNA"], "year": "2011", "abstractText": "The ability to sequence genomes has far outstripped approaches for deciphering the information they encode. Here we present a suite of techniques, based on ribosome profiling (the deep sequencing of ribosome-protected mRNA fragments), to provide genome-wide maps of protein synthesis as well as a pulse-chase strategy for determining rates of translation elongation. We exploit the propensity of harringtonine to cause ribosomes to accumulate at sites of translation initiation together with a machine learning algorithm to define protein products systematically. Analysis of translation in mouse embryonic stem cells reveals thousands of strong pause sites and unannotated translation products. These include amino-terminal extensions and truncations and upstream open reading frames with regulatory potential, initiated at both AUG and non-AUG codons, whose translation changes after differentiation. We also define a class of short, polycistronic ribosome-associated coding RNAs (sprcRNAs) that encode small proteins. 
Our studies reveal an unanticipated complexity to mammalian proteomes.", "pmid": "22056041", "title": "Ribosome profiling of mouse embryonic stem cells reveals the complexity and dynamics of mammalian proteomes."}, {"journal": "TheScientificWorldJournal", "meshMajor": ["Algorithms", "Artificial Intelligence", "Humans", "Language", "Natural Language Processing", "Semantics"], "year": "2013", "abstractText": "Word sense disambiguation (WSD) is a fundamental problem in nature language processing, the objective of which is to identify the most proper sense for an ambiguous word in a given context. Although WSD has been researched over the years, the performance of existing algorithms in terms of accuracy and recall is still unsatisfactory. In this paper, we propose a novel approach to word sense disambiguation based on topical and semantic association. For a given document, supposing that its topic category is accurately discriminated, the correct sense of the ambiguous term is identified through the corresponding topic and semantic contexts. We firstly extract topic discriminative terms from document and construct topical graph based on topic span intervals to implement topic identification. We then exploit syntactic features, topic span features, and semantic features to disambiguate nouns and verbs in the context of ambiguous word. Finally, we conduct experiments on the standard data set SemCor to evaluate the performance of the proposed method, and the results indicate that our approach achieves relatively better performance than existing approaches. ", "pmid": "24294131", "title": "A novel approach to word sense disambiguation based on topical and semantic association."}, {"journal": "IEEE transactions on systems, man, and cybernetics. 
Part B, Cybernetics : a publication of the IEEE Systems, Man, and Cybernetics Society", "meshMajor": ["Accidents, Traffic", "Analysis of Variance", "Artificial Intelligence", "Attention", "Automobile Driving", "Eye Movement Measurements", "Head Movements", "Humans", "Protective Devices", "Systems Integration", "User-Computer Interface"], "year": "2009", "abstractText": "In this paper, we introduce a novel laser-based wide-area heads-up windshield display which is capable of actively interfacing with a human as part of a driver assistance system. The dynamic active display (DAD) is a unique prototype interface that presents safety-critical visual icons to the driver in a manner that minimizes the deviation of his or her gaze direction without adding to unnecessary visual clutter. As part of an automotive safety system, the DAD presents alerts in the field of view of the driver only if necessary, which is based upon the state and pose of the driver, vehicle, and environment. This paper examines the effectiveness of DAD through a comprehensive comparative experimental evaluation of a speed compliance driver assistance system, which is implemented on a vehicular test bed. Three different types of display protocols for assisting a driver to comply with speed limits are tested on actual roadways, and these are compared with a conventional dashboard display. Given the inclination, drivers who are given an overspeed warning alert reduced the time required to slow down to the speed limit by 38% (p < 0.01) as compared with the drivers not given the alert. Additionally, certain alerts decreased distraction levels by reducing the time spent looking away from the road by 63% (p < 0.01). 
Ultimately, these alerts demonstrate the utility and promise of the DAD system.", "pmid": "19068432", "title": "A novel active heads-up display for driver assistance."}, {"journal": "Healthcare informatics : the business magazine for information and communication systems", "meshMajor": ["Artificial Intelligence", "Hospital Information Systems", "United States"], "year": "1990", "abstractText": "Computers have manipulated data to increase the efficiency of their users. AI represents the next evolutionary step.", "pmid": "10120641", "title": "Artificial intelligence in healthcare management."}, {"journal": "Neuroscience letters", "meshMajor": ["Artificial Intelligence", "Child", "Diagnosis, Computer-Assisted", "Female", "Humans", "Hyperkinesis", "Male", "Mental Disorders", "Psychiatric Status Rating Scales", "Severity of Illness Index"], "year": "2011", "abstractText": "Automatic classification of different behavioral disorders with many similarities (e.g. in symptoms) by using an automated approach will help psychiatrists to concentrate on correct disorder and its treatment as soon as possible, to avoid wasting time on diagnosis, and to increase the accuracy of diagnosis. In this study, we tried to differentiate and classify (diagnose) 306 children with many similar symptoms and different behavioral disorders such as ADHD, depression, anxiety, comorbid depression and anxiety and conduct disorder with high accuracy. Classification was based on the symptoms and their severity. 
With examining 16 different available classifiers, by using \"Prtools\", we have proposed nearest mean classifier as the most accurate classifier with 96.92% accuracy in this research.", "pmid": "21396979", "title": "Automatic classification of hyperactive children: comparing multiple artificial intelligence approaches."}, {"journal": "The New phytologist", "meshMajor": ["Artificial Intelligence", "Fossils", "Image Processing, Computer-Assisted", "Internet", "Picea", "Pollen", "Reproducibility of Results", "Software"], "year": "2012", "abstractText": "Pollen is among the most ubiquitous of terrestrial fossils, preserving an extended record of vegetation change. However, this temporal continuity comes with a taxonomic tradeoff. Analytical methods that improve the taxonomic precision of pollen identifications would expand the research questions that could be addressed by pollen, in fields such as paleoecology, paleoclimatology, biostratigraphy, melissopalynology, and forensics. We developed a supervised, layered, instance-based machine-learning classification system that uses leave-one-out bias optimization and discriminates among small variations in pollen shape, size, and texture. We tested our system on black and white spruce, two paleoclimatically significant taxa in the North American Quaternary. We achieved > 93% grain-to-grain classification accuracies in a series of experiments with both fossil and reference material. More significantly, when applied to Quaternary samples, the learning system was able to replicate the count proportions of a human expert (R(2) = 0.78, P = 0.007), with one key difference - the machine achieved these ratios by including larger numbers of grains with low-confidence identifications. 
Our results demonstrate the capability of machine-learning systems to solve the most challenging palynological classification problem, the discrimination of congeneric species, extending the capabilities of the pollen analyst and improving the taxonomic resolution of the palynological record.", "pmid": "22943455", "title": "Classifying black and white spruce pollen using layered machine learning."}, {"journal": "American journal of ophthalmology", "meshMajor": ["Anterior Eye Segment", "Artificial Intelligence", "Deep Learning", "Female", "Glaucoma, Angle-Closure", "Gonioscopy", "Humans", "Male", "Middle Aged", "ROC Curve", "Tomography, Optical Coherence"], "year": "2019", "abstractText": "PURPOSE: Anterior segment optical coherence tomography (AS-OCT) provides an objective imaging modality for visually identifying anterior segment structures. An automated detection system could assist ophthalmologists in interpreting AS-OCT images for the presence of angle closure.DESIGN: Development of an artificial intelligence automated detection system for the presence of angle closure.METHODS: A deep learning system for automated angle-closure detection in AS-OCT images was developed, and this was compared with another automated angle-closure detection system based on quantitative features. A total of 4135 Visante AS-OCT images from 2113 subjects (8270 anterior chamber angle images with 7375 open-angle and 895 angle-closure) were examined. The deep learning angle-closure detection system for a 2-class classification problem was tested by 5-fold cross-validation. 
The deep learning system and the automated angle-closure detection system based on quantitative features were evaluated against clinicians' grading of AS-OCT images as the reference standard.RESULTS: The area under the receiver operating characteristic curve of the system using quantitative features was 0.90 (95% confidence interval [CI] 0.891-0.914) with a sensitivity of 0.79 \u00b1 0.037 and a specificity of 0.87 \u00b1 0.009, while the area under the receiver operating characteristic curve of the deep learning system was 0.96 (95% CI 0.953-0.968) with a sensitivity of 0.90 \u00b1 0.02 and a specificity of 0.92 \u00b1 0.008, against clinicians' grading of AS-OCT images as the reference standard.CONCLUSIONS: The results demonstrate the potential of the deep learning system for angle-closure detection in AS-OCT images.", "pmid": "30849350", "title": "A Deep Learning System for Automated Angle-Closure Detection in Anterior Segment Optical Coherence Tomography Images."}, {"journal": "Annual International Conference of the IEEE Engineering in Medicine and Biology Society. IEEE Engineering in Medicine and Biology Society. Annual International Conference", "meshMajor": ["Algorithms", "Arrhythmias, Cardiac", "Artificial Intelligence", "Computer Simulation", "Diagnosis, Computer-Assisted", "Electrocardiography", "Heart Rate", "Humans", "Models, Cardiovascular", "Models, Statistical", "Pattern Recognition, Automated", "Principal Component Analysis", "Software"], "year": "2007", "abstractText": "In this paper, we developed the novel algorithm for cardiac arrhythmia classification. Until now, back propagation neural network (BPNN) was frequently used for these tasks. However, general gradient based learning method is far slower than what is required for their application. The proposed algorithm adapts Extreme Learning Machine(ELM) that has the advantage of very fast learning speed and high accuracy. 
In this paper, we classify beats into normal beat, left bundle branch block beat, right bundle branch block beat, premature ventricular contraction, atrial premature beat, paced beat, and ventricular escape beat. Experimental results show that we can obtain 97.45% in average accuracy, 97.44% in average sensitivity, 98.46% in average specificity, and 2.423 seconds in learning time.", "pmid": "18002690", "title": "Algorithm for classifying arrhythmia using Extreme Learning Machine and principal component analysis."}], "negative": [{"journal": "Proceedings of the National Academy of Sciences of the United States of America", "meshMajor": ["Alternative Splicing", "Base Sequence", "Cell Differentiation", "Cells, Cultured", "Embryonic Stem Cells", "Gene Expression Profiling", "Gene Expression Regulation, Developmental", "Humans", "Neurons", "RNA", "RNA, Messenger", "Sequence Analysis, DNA", "Transcription, Genetic"], "year": "2010", "abstractText": "To examine the fundamental mechanisms governing neural differentiation, we analyzed the transcriptome changes that occur during the differentiation of hESCs into the neural lineage. Undifferentiated hESCs as well as cells at three stages of early neural differentiation-N1 (early initiation), N2 (neural progenitor), and N3 (early glial-like)-were analyzed using a combination of single read, paired-end read, and long read RNA sequencing. The results revealed enormous complexity in gene transcription and splicing dynamics during neural cell differentiation. We found previously unannotated transcripts and spliced isoforms specific for each stage of differentiation. Interestingly, splicing isoform diversity is highest in undifferentiated hESCs and decreases upon differentiation, a phenomenon we call isoform specialization. 
During neural differentiation, we observed differential expression of many types of genes, including those involved in key signaling pathways, and a large number of extracellular receptors exhibit stage-specific regulation. These results provide a valuable resource for studying neural differentiation and reveal insights into the mechanisms underlying in vitro neural differentiation of hESCs, such as neural fate specification, neural progenitor cell identity maintenance, and the transition from a predominantly neuronal state into one with increased gliogenic potential.", "pmid": "20194744", "title": "Dynamic transcriptomes during neural differentiation of human embryonic stem cells revealed by short, long, and paired-end sequencing."}, {"journal": "Le Journal dentaire du Quebec", "meshMajor": ["Carcinogens", "Cell Transformation, Neoplastic", "Humans", "Neoplasms", "Neoplasms, Radiation-Induced", "Oncogenic Viruses"], "year": "1991", "abstractText": "This article briefly describes the fundamental principles associated with the development of cancerous cells in man. The manner in which these neoplastic cells appear and spread are discussed. A better understanding of these phenomena will allow the practitioner to realize the importance and necessity of a systematic evaluation of all patients.", "pmid": "1819607", "title": "[Oncology and its applications. 1. Basic principles of oncology]."}, {"journal": "LGBT health", "meshMajor": ["Adolescent", "Adult", "Comprehension", "Florida", "Health Communication", "Health Knowledge, Attitudes, Practice", "Health Policy", "Homosexuality, Male", "Humans", "Interviews as Topic", "Male", "Pre-Exposure Prophylaxis", "Public Health", "Qualitative Research", "Sexual and Gender Minorities", "Young Adult"], "year": "2016", "abstractText": "PURPOSE: Street markets in antiretroviral medications for HIV have been documented, but sources of demand are not well understood. 
We report unexpected findings from qualitative research suggesting that some demand is for informal pre-exposure prophylaxis (PrEP).METHODS: Focus groups with young men who have sex with men (N\u2009=\u200931) yielded information on their understanding and use of PrEP.RESULTS: Of those who had heard of it, few understood PrEP to be a physician-prescribed regimen; most believed it to be a pill taken before and/or after sex and acquired on the street or through HIV-positive friends.CONCLUSION: Implications for PrEP rollout and public health policy are discussed.", "pmid": "26720130", "title": "Misunderstanding of Pre-Exposure Prophylaxis Use Among Men Who Have Sex with Men: Public Health and Policy Implications."}, {"journal": "FEBS letters", "meshMajor": ["Adaptor Proteins, Signal Transducing", "Blotting, Western", "Carrier Proteins", "DNA Primers", "DNA-Binding Proteins", "Gene Expression Regulation, Enzymologic", "Genes, Reporter", "Glutathione Peroxidase", "Hydrogen Peroxide", "Lac Operon", "Models, Genetic", "Mutagenesis", "Oxidative Stress", "Phosphoproteins", "Point Mutation", "Polymerase Chain Reaction", "Promoter Regions, Genetic", "Protein Binding", "Response Elements", "Saccharomyces cerevisiae", "Saccharomyces cerevisiae Proteins", "Time Factors", "Transcription Factors", "beta-Galactosidase"], "year": "2004", "abstractText": "The GPX2 gene encodes a homologue of phospholipid hydroperoxide glutathione peroxidase in Saccharomyces cerevisiae. The GPX2 promoter contains three elements the sequence of which is completely consistent with the optimal sequence for the Yap1 response element (YRE). Here, we identify the intrinsic YRE that functions in the oxidative stress response of GPX2. In addition, we discovered a cis-acting element (5'-GGCCGGC-3') within the GPX2 promoter proximal to the functional YRE that is necessary for H(2)O(2)-induced expression of GPX2. 
We present evidence showing that Skn7 is necessary for the oxidative stress response of GPX2 and is able to bind to this sequence. We determine the optimal sequence for Skn7 to regulate GPX2 under conditions of oxidative stress to be 5'-GGC(C/T)GGC-3', and we designate this sequence the oxidative stress-responsive Skn7 response element.", "pmid": "15135069", "title": "Regulation of the yeast phospholipid hydroperoxide glutathione peroxidase GPX2 by oxidative stress is mediated by Yap1 and Skn7."}, {"journal": "New York state journal of medicine", "meshMajor": ["Cocaine", "Cross-Sectional Studies", "Female", "Fetal Blood", "Hospitals, Urban", "Humans", "Incidence", "Infant, Newborn", "New York City", "Pregnancy", "Pregnancy Complications", "Prenatal Care", "Retrospective Studies", "Substance-Related Disorders", "Syphilis", "Syphilis, Congenital"], "year": "1990", "abstractText": "The frequency of positive cord blood rapid plasma reagin (RPR) tests among newborns at an inner city hospital and associations with maternal cocaine use, prenatal care, and adequacy of syphilis therapy were retrospectively assessed. The incidence of positive cord blood RPRs increased from 1.1% of all live births in 1985 to 3.4% in 1988. In 1987, 98 babies were born with positive cord blood RPRs; 86 of their charts were available for review. Four infants had false positive RPRs, and one patient delivered twins, leaving 81 mothers who could be evaluated. Almost 37% of these patients had had no prenatal care. More than 55% had inadequate or not therapy for syphilis. Of these, only 17.4% had prenatal care. Slightly more than 40% of patients acknowledged using drugs during pregnancy, 87.9% of whom used cocaine. Among the patients who used drugs, 75.8% received no prenatal care, in contrast to 10.4% of mothers who did not use drugs (p less than 0.001). 
It appears that drug use, particularly use of cocaine, is associated with low levels of utilization of prenatal services and inadequate therapy for syphilis. This may lead to increased risk of congenital syphilis in newborns.", "pmid": "2234614", "title": "Syphilis among parturients at an inner city hospital: association with cocaine use and implications for congenital syphilis rates."}, {"journal": "International urology and nephrology", "meshMajor": ["Clinical Trials as Topic", "Drug Administration Schedule", "Drug Evaluation", "Drug Resistance", "Estramustine", "Estrogens", "Humans", "Male", "Nitrogen Mustard Compounds", "Prostatic Neoplasms"], "year": "1975", "abstractText": "The experience gained with Estracyt, kindly supplied by AB LEO, Sweden, is reported. On the basis of former data in the literature we only used the drug in estrogen resistant and advanced cases. Estracyt (estramustine phosphate) is a nitrogen mustard derivative of the urethan type, attached to oestradiol-17-phosphate. In histologically verified cases, it was administered in daily doses of 300 mg intravenously for three weeks, followed by maintenance doses of 300 mg a week in tablets for three months. During treatment, liver and bone marrow function was checked systematically. The changes in morphological picture were studied by means of biopsies during and at the end of treatment. In agreement with the data in the literature a favourable effect was observed in estrogen resistant patients, with no toxic effect whatever on the bone marrow. At the same time GOT and GPT and BSP retention examinations demonstrated a hepatotoxic side effect. The pathological values returned to normal after withdrawal of the drug. 
Histological examinations showed that the tumour cells had changed but failed to disappear after treatment.", "pmid": "1102475", "title": "Treatment of prostatic cancer with Estracyt (estramustine phosphate)."}, {"journal": "Evolution; international journal of organic evolution", "meshMajor": ["Analysis of Variance", "Climate", "Environment", "Evolution, Molecular", "Genome, Plant", "Geography", "Phylogeny", "Pinus", "Regression Analysis", "Reproduction", "Seeds"], "year": "2004", "abstractText": "Genome size has been suggested to be a fundamental biological attribute in determining life-history traits in many groups of organisms. We examined the relationships between pine genome sizes and pine phylogeny, environmental factors (latitude, elevation, annual rainfall), and biological traits (latitudinal and elevational ranges, seed mass, minimum generation time, interval between large seed crops, seed dispersal mode, relative growth rate, measures of potential and actual invasiveness, and level of rarity). Genome sizes were determined for 60 pine taxa and then combined with published values to make a dataset encompassing 85 species, or 70% of species in the genus. Supertrees were constructed using 20 published source phylogenies. Ancestral genome size was estimated as 32 pg. Genome size has apparently remained stable or increased over evolutionary time in subgenus Strobus, while it has decreased in most subsections in subgenus Pinus. We analyzed relationships between genome size and life-history variables using cross-species correlations and phylogenetically independent contrasts derived from supertree constructions. The generally assumed positive relation between genome size and minimum generation time could not be confirmed in phylogenetically controlled analyses. We found that the strongest correlation was between genome size and seed mass. 
Because the growth quantities specific leaf area and leaf area ratio (and to a lesser extent relative growth rate) are strongly negatively related to seed mass, they were also negatively correlated with genome size. Northern latitudinal limit was negatively correlated with genome size. Invasiveness, particularly of wind-dispersed species, was negatively associated with both genome size and seed mass. Seed mass and its relationships with seed number, dispersal mode, and growth rate contribute greatly to the differences in life-history strategies of pines. Many life-history patterns are therefore indirectly, but consistently, associated with genome size.", "pmid": "15446425", "title": "Evolution of genome size in pines (Pinus) and its life-history correlates: supertree analyses."}, {"journal": "Canadian journal of ophthalmology. Journal canadien d'ophtalmologie", "meshMajor": ["Acid Phosphatase", "Animals", "Autolysis", "Cornea", "Corneal Transplantation", "Hydrocortisone", "Hydrogen-Ion Concentration", "Lysosomes", "Membranes", "Rabbits", "Refrigeration", "Temperature", "Tissue Preservation", "Transplantation, Homologous"], "year": "1975", "abstractText": "Many eyes donated for use in corneal grafting are rejected because of signs of autolysis in the donor material. The purpose of this experimental study was to determine whether hydrocortisone acting as a lysosome membrane stabilizer could prevent or retard autolysis of the corneas under storage, and if so, what was the most efficacious concentration. Different groups of rabbit corneas were placed in saline as controls or in varying concentrations of hydrocortisone (10(-10) M to 10(-4) M at pH 7.4) at 37 degrees C and 4 degrees C. Acid phosphatase released after six hours was measured biochemically. This enzyme was used as a marker enzyme reflecting lysosomal labilization. Results showed a significant stabilization of the lysosomal membrane at 4 degrees C as compared to 37 degrees C. 
A trend towards stabilization of the lysosomal membrane was seen when 10(-8) M concentration of hydrocortisone at 37 degrees C was used, there being no demonstrable stabilization at 4 degrees C.", "pmid": "133", "title": "The prevention of autolysis of stored cornea using steroid as a lysosome membrane stabilizer."}, {"journal": "European review for medical and pharmacological sciences", "meshMajor": ["Betacoronavirus", "COVID-19", "Coronavirus Infections", "Humans", "Pandemics", "Pneumonia, Viral", "Prevalence", "Regression Analysis", "SARS-CoV-2", "Seroepidemiologic Studies", "Survival Rate"], "year": "2020", "abstractText": "OBJECTIVE: Our objective was to find an association between exposure of a population to Middle East Respiratory Syndrome Coronavirus (MERS-CoV) and mortality rate due to Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2) across different countries worldwide.MATERIALS AND METHODS: To find the relationship between exposure to MERS-CoV and mortality rate due to SARS-CoV-2, we collected and analyzed data of three possible factors that may have resulted in an exposure of a population to MERS-CoV: (1) the number of Middle East Respiratory Syndrome (MERS) cases reported among 16 countries since 2012; (2) data of MERS-CoV seroprevalence in camels across 23 countries, as working with camels increase risk of exposure to MERS-CoV; (3) data of travel history of people from 51 countries to Saudi Arabia was collected on the assumption that travel to a country where MERS is endemic, such as, Saudi Arabia, could also lead to exposure to MERS-CoV.RESULTS: We found a significantly lower number of Coronavirus disease 2019 (COVID-19) deaths per million (deaths/M) of a population in countries that are likely to be exposed to MERS-CoV than otherwise (t-stat=3.686, p<0.01). 
In addition, the number of COVID-19 deaths/M of a population was significantly lower in countries that reported a higher seroprevalence of MERS-CoV in camels than otherwise (t-stat=4.5077, p<0.01). Regression analysis showed that increased travelling history to Saudi Arabia is likely to be associated with a lower mortality rate due to COVID-19.CONCLUSIONS: This study provides empirical evidence that a population that was at an increased risk of exposure to MERS-CoV had a significantly lower mortality rate due to SARS-CoV-2, which might be due to cross-protective immunity against SARS-CoV-2 in that population because of an earlier exposure to MERS-CoV.", "pmid": "32965011", "title": "An association between exposure to Middle East Respiratory Syndrome (MERS) and mortality rate of Coronavirus Disease 2019 (COVID-19)."}, {"journal": "European heart journal", "meshMajor": ["Anticoagulants", "Chi-Square Distribution", "Coated Materials, Biocompatible", "Coronary Angiography", "Coronary Disease", "Equipment Design", "Female", "Heparin", "Humans", "Logistic Models", "Male", "Recurrence", "Reoperation", "Risk Factors", "Statistics, Nonparametric", "Stents", "Thrombosis", "Treatment Outcome"], "year": "2001", "abstractText": "AIMS: Heparin coating of stents is thought to reduce stent thrombosis and restenosis rates. However, clinical data comparing coated and uncoated stents of the same model are lacking. We compared the heparin coated (C) and the uncoated (U) version of the Jostent stent with regard to the clinical and angiographic outcome after 6 months.METHODS AND RESULTS: Provisional stenting was done in 277 patients and 306 lesions; only 40 were Benestent-II like lesions. Delivery success rate was 98.4%. Both groups (C/U: n=156/150 lesions) were comparable in clinical and procedural data. Post stenting, reference diameter (C/U: 2.68+/-0.56/2.66+/-0.53 mm) and minimal lumen diameter did not differ (C/U: 2.48+/-0.47/2.48+/-0.52 mm). 
During follow-up the rate of subacute stent thrombosis (C/U: 1.9%/1.3%) and myocardial infarction did not differ. Angiography at the 6-month follow-up (79.4%) revealed no difference in restenosis rate (C/U: 33.1%/30.3%). Risk factors for restenosis were a type B2/C lesion (P<0.02), a stented segment longer than 16 mm (P<0.006) and a stent inflation pressure <14 bar (P<0.0063).CONCLUSION: Corline heparin coating of the Jostent has no impact on the in-hospital complication rate, stent thrombosis or restenosis. The Jostent design gives a high procedural success rate and satisfying result at 6 months in an everyday patient population undergoing provisional stenting.", "pmid": "11549303", "title": "Comparison of the heparin coated vs the uncoated Jostent--no influence on restenosis or clinical outcome."}, {"journal": "Photosynthesis research", "meshMajor": ["Acclimatization", "Cell Membrane", "Electrophoresis, Polyacrylamide Gel", "Light", "Light-Harvesting Protein Complexes", "Models, Biological", "Operon", "Peptides", "Pigments, Biological", "Protein Structure, Quaternary", "Proteomics", "Rhodobacter sphaeroides", "Spectrophotometry, Infrared"], "year": "2011", "abstractText": "In order to obtain an improved understanding of the assembly of the bacterial photosynthetic apparatus, we have conducted a proteomic analysis of pigment-protein complexes isolated from the purple bacterium Rhodobacter sphaeroides undergoing acclimation to reduced incident light intensity. Photoheterotrophically growing cells were shifted from 1,100 to 100\u00a0W/m(2) and intracytoplasmic membrane (ICM) vesicles isolated over 24-h were subjected to clear native polyacrylamide gel electrophoresis. Bands containing the LH2 and reaction center (RC)-LH1 complexes were excised and subjected to in-gel trypsin digestion followed by liquid chromatography (LC)-mass spectroscopy (MS)/MS. 
The results revealed that the LH2 band contained distinct levels of the LH2-\u03b1 and -\u03b2 polypeptides encoded by the two puc operons. Polypeptide subunits encoded by the puc2AB operon predominated under high light and in the early stages of acclimation to low light, while after 24\u00a0h, the puc1BAC components were most abundant. Surprisingly, the Puc2A polypeptide containing a 251 residue C-terminal extension not present in Puc1A, was a protein of major abundance. A predominance of Puc2A components in the LH2 complex formed at high light intensity is followed by a >2.5-fold enrichment in Puc1B levels between 3 and 24\u00a0h of acclimation, accompanied by a nearly twofold decrease in Puc2A levels. This indicates that the puc1BAC operon is under more stringent light control, thought to reflect differences in the puc1 upstream regulatory region. In contrast, elevated levels of Puc2 polypeptides were seen 48\u00a0h after the gratuitous induction of ICM formation at low aeration in the dark, while after 24\u00a0h of acclimation to low light, an absence of alterations in Puc polypeptide distributions was observed in the upper LH2-enriched gel band, despite an approximate twofold increase in overall LH2 levels. 
This is consistent with the origin of this band from a pool of LH2 laid down early in development that is distinct from subsequently assembled LH2-only domains, forming the LH2 gel band.", "pmid": "21863386", "title": "Differential assembly of polypeptides of the light-harvesting 2 complex encoded by distinct operons during acclimation of Rhodobacter sphaeroides to low light intensity."}, {"journal": "Molekuliarnaia biologiia", "meshMajor": ["Aliivibrio fischeri", "Chaperonin 10", "Chaperonin 60", "Escherichia coli", "Escherichia coli Proteins", "Gene Expression Regulation, Bacterial", "Operon", "Plasmids", "Protease La", "Protein Folding", "Recombinant Fusion Proteins"], "year": null, "abstractText": "It was shown that the chaperonin GroEL/GroES and protease Lon influence the expression of the Vibrio fischeri lux regulon in Escherichia coli cells: E. coli groE mutants bearing hybrid plasmid with the lux regulon were weakly luminescent; cells of the E. coli lon- comprising the entire lux regulon display very intense bioluminescence, with no lag period in the induction curve characteristic of lon+ strains. The luxR gene was cloned from the Vibrio fischeri genome in the pGEX-KG vector. It was shown that the active fusion protein GST-LuxR by affinity chromatography on glutathione-sucrose colony is purified only with proteins GroEL and Lon. The present results showed that the LuxR, transcriptional activator of the V. fischeri lux operon, really complexes with GroEL chaperonin and Lon protease. We suppose, that the GroEL/GroES chaperonin systems is required for the folding of LuxR into an active protein, and the LuxR is the target for the ATP-dependent serine Lon protease of E. 
coli.", "pmid": "16637268", "title": "[Role of GroEL/GroES chaperonin system and Lon protease in regulation of expression Vibrio fischeri lux genes in Escherichia coli cells]."}, {"journal": "Journal of visualized experiments : JoVE", "meshMajor": ["Adenocarcinoma", "Animals", "Appendectomy", "Azoxymethane", "Carcinogens", "Cecum", "Chronic Disease", "Colitis", "Colon", "Colorectal Neoplasms", "Dextran Sulfate", "Disease Models, Animal", "Male", "Mice, Inbred C57BL"], "year": "2019", "abstractText": "The human appendix has been recently implicated to play important biological roles in the pathogenesis of various complex diseases, such as colorectal cancer, inflammatory bowel disease, and Parkinson's disease. To study the function of the appendix, a gut disease-associated murine appendectomy model has been established and its step-by-step protocol is described here. This report introduces a facile protocol for caecal patch removal in mice followed by the chemical induction of chronic colitis-associated colorectal cancer using a combination of dextran sulfate sodium (DSS) and azoxymethane (AOM). IgA specific cells and IgA concentration were significantly reduced upon removal of the caecal patch in male C57BL/6 mice\u00a0compared to those in the sham group. Simultaneously administering 2% DSS and AOM resulted in nearly 80% mice survival in both sham and appendectomy groups without significant body weight loss. Histological results confirmed colonic inflammation and different degrees of adenocarcinoma. 
This model can be used for the study of the functional role of the appendix in maintaining gut microbiota homeostasis and pathogenesis of gut colitis and malignancies, as well as for the potential development of drug targeting therapies.", "pmid": "31498319", "title": "Murine Appendectomy Model of Chronic Colitis Associated Colorectal Cancer by Precise Localization of Caecal Patch."}, {"journal": "Journal of plastic surgery and hand surgery", "meshMajor": ["Adult", "Amputation, Traumatic", "Attitude of Health Personnel", "Carpal Tunnel Syndrome", "Clinical Decision-Making", "Cross-Sectional Studies", "Female", "Hammer Toe Syndrome", "Hand Injuries", "Humans", "Male", "Middle Aged", "Perception", "Physical Therapists", "Practice Patterns, Physicians'", "Prognosis", "Surgeons", "Surveys and Questionnaires", "Tendon Injuries", "Treatment Outcome"], "year": "2018", "abstractText": "OBJECTIVE: The objectives of this survey were (1) to study if surgeons' perceptions of the benefit of six surgical procedures differ if they consider themselves as patients instead of treating a patient, (2) to evaluate the role of five predetermined factors that may influence decision-making, and (3) to assess how uniformly hand surgeons and hand therapists perceive the benefits of the surgical treatments.METHODS: The members of the national societies for Hand Surgery and Hand Therapy were asked to participate in the survey. Six patient cases with hand complaint (carpal tunnel syndrome, flexor tendon injury, dorsal wrist ganglion, thumb amputation, boxer's fracture, and mallet fracture) and a proposed operative procedure were presented, and the respondents rated the procedures in terms of the expected benefit. Half of the surgeons were advised to consider themselves as patients when filling out the survey.RESULTS: A survey was completed by 56 surgeons (61%) and 59 therapists (20%). 
Surgeons who considered themselves as patients had less confident perception on the benefit of carpal tunnel release compared with surgeons, who considered treating patients. Hand surgeons and hand therapists had similar perception of the benefits of surgery. The expected functional result was regarded as the most important factor in directing the decision about the treatment.CONCLUSIONS: Surgeons tended to be more unanimous in their opinions in cases, where there is limited evidence on treatment effect. The agreement between surgeons and therapists implies that the clinical perspectives are similar, and probably reflect the reality well.", "pmid": "28417701", "title": "Survey of hand surgeons' and therapists' perceptions of the benefit of common surgical procedures of the hand."}, {"journal": "Pathology international", "meshMajor": ["Adenocarcinoma, Mucinous", "Adult", "Cervix Uteri", "Cesarean Section", "Choristoma", "Diagnosis, Differential", "Female", "Humans", "Immunohistochemistry", "Phenotype", "Pregnancy", "Urinary Bladder Diseases", "Uterine Cervical Neoplasms"], "year": "2010", "abstractText": "Endocervicosis of the urinary bladder is a very rare tumor-like benign lesion. In the present report, a case in a 34-year-old woman, who has a prior Caesarean section at the age of 30 and 2-years history of dysuria, is described. Transvaginal ultrasound, cystoscopy and magnetic resonance imaging demonstrated a solid mass in the posterior wall of the bladder. The mass was removed and histology revealed a haphazard proliferation of endocervical-type mucinous glands scattered through the muscularis propria of bladder wall. Immunohistochemical phenotype of these glands was compared with three normal uterine endocervices and two cases of well-differentiated mucinous adenocarcinoma of the uterine cervix. 
Endocervicosis glands displayed positive reaction for antibodies against estrogen receptor, progesterone receptor, CAM 5.2, cytokeratin 7, CA125, HBME-1 and carcinoembryonic antigen, which showed positivity in normal endocervices. On the other hand, only glands of well-differentiated mucinous adenocarcinoma expressed human gastric mucin and showed high proliferative index of Ki-67. These results supported the hypothesis of its M\u00fcllerian origin. Furthermore, diffuse distribution of estrogen and progesterone receptors, lack of human gastric mucin and low proliferative activity were distinct features for endocervicosis compared to well-differentiated mucinous adenocarcinoma.", "pmid": "20594276", "title": "Immunohistochemical phenotype of the urinary bladder endocervicosis: comparison with normal endocervix and well-differentiated mucinous adenocarcinoma of uterine cervix."}, {"journal": "American journal of diseases of children (1960)", "meshMajor": ["Anemia", "Child, Preschool", "Diseases in Twins", "Erythroblasts", "Erythrocyte Count", "Erythrocytes", "Female", "Humans", "Infant", "Male", "Reticulocytes"], "year": "1981", "abstractText": "Seventeen patients aged 7 to 33 months, including a pair of identical twin girls, came to the Children's Memorial Hospital, Chicago, between January 1975 and December 1979 with transient normocytic anemia and reticulocytopenia. In 16 of the patients, bone marrow aspirates were obtained; 15 showed erythroblastopenia and one showed erythroid hyperplasia indicative of recovery. Except for a cluster of six cases occurring from July to October 1979, no seasonal variation was observed. Unlike patients with congenital hypoplastic anemia, all 17 patients were of normal stature. Other distinguishing features of transient erythroblastopenia of childhood included onset after early infancy, normocytosis, and rapid, spontaneous recovery.", "pmid": "7293995", "title": "Transient erythroblastopenia of childhood. 
Review of 17 cases, including a pair of identical twins."}, {"journal": "Brain research bulletin", "meshMajor": ["Animals", "In Vitro Techniques", "Medulla Oblongata", "Membrane Potentials", "Motor Neurons", "Neural Inhibition", "Rats", "Somatostatin", "Vagus Nerve"], "year": "1986", "abstractText": "Somatostatin-14 (SOM) effects on electrical properties of membrane in rat brainstem slice preparations were studied in vitro by intracellular recording. Vagal motoneurons in the nucleus dorsalis motoris nervi vagi (DMV) were hyperpolarized by SOM. SOM increased both negativity of membrane potential and input membrane conductance, and decreased synaptic noise. The effects persisted during synaptic blockade by tetrodotoxin (TTX) or [Ca]o-free-high-[Mg]o perfusion. The reversal potential of the hyperpolarization induced by SOM depended on [K]o concentration. Hill's coefficient calculated from the dose-response curve was 2. The results suggest that SOM may inhibit visceral organ functions through the DMV.", "pmid": "2876758", "title": "Effect of somatostatin on the vagal motor neuron in the rat."}, {"journal": "Journal of hand surgery (Edinburgh, Scotland)", "meshMajor": ["Adult", "Aged", "Amyloidosis", "Carpal Tunnel Syndrome", "Humans", "Middle Aged", "Renal Dialysis", "Synovial Membrane"], "year": "1988", "abstractText": "Over a five-year period (1981-1985), nine patients on haemodialysis developed carpal tunnel syndrome. Five patients, following biopsy of synovium in the carpal tunnel or biopsy of thickened epineurium of the median nerve, were found to have amyloid deposits in the soft tissues. The relationship between this condition, dialysis arthropathy and long-term haemodialysis is reviewed. 
In addition, in this small group of patients no relationship to the side of the fistula has been demonstrated and two patients developed recurrent problems despite initial open decompression of the carpal tunnel.", "pmid": "3249138", "title": "Amyloidosis as a cause of carpal tunnel syndrome in haemodialysis patients."}, {"journal": "Letters in applied microbiology", "meshMajor": ["Animals", "Anti-Infective Agents", "Bacillus", "Biological Products", "Nematoda", "Resveratrol", "Stilbenes"], "year": "2012", "abstractText": "AIMS: \u2002 The aim of the present study was to purify and characterize a natural antimicrobial compound from Bacillus sp. strain N associated with a novel rhabditid entomopathogenic nematode.METHODS AND RESULTS: \u2002 The cell-free culture filtrate of a bacterium associated with a novel entomopathogenic nematode (EPN), Rhabditis (Oscheius) sp. exhibited strong antimicrobial activity. The ethyl acetate extract of the bacterial culture filtrate was purified by column chromatography, and two bioactive compounds were isolated and their chemical structures were established based on spectral analysis. The compounds were identified as 3,4',5-trihydroxystilbene (1) and 3,5-dihydroxy-4-isopropylstilbene (2). The presence of 3,4',5-trihydroxystilbene (resveratrol) is reported for the first time in bacteria. Compound 1 showed antibacterial activity against all the four test bacteria, whereas compound 2 was effective against the Gram-positive bacteria only. Compounds 1 and 2 were active against all the five fungi tested and are more effective than bavistin, the standard fungicide. The antifungal activity of the compounds against the plant pathogenic fungi, Rhizoctonia solani is reported for the first time.CONCLUSIONS: \u2002 Cell-free extract of the bacterium and isolated stilbenes demonstrated high antibacterial activity against bacteria and fungi especially against plant pathogenic fungi. 
We conclude that the bacterium-associated EPN are promising sources of natural bioactive secondary metabolites.SIGNIFICANCE AND IMPACT OF THE STUDY: \u2002 Stilbene compounds can be used for the control of fungi and bacteria.", "pmid": "22332977", "title": "Bioactive stilbenes from a Bacillus sp. N strain associated with a novel rhabditid entomopathogenic nematode."}, {"journal": "Methods in molecular biology (Clifton, N.J.)", "meshMajor": ["Electron Probe Microanalysis", "Histocytological Preparation Techniques", "Microscopy", "Microscopy, Electron, Scanning", "Minerals", "Plant Cells"], "year": "2014", "abstractText": "This chapter describes protocols using formalin-acetic acid-alcohol (FAA) to fix plant tissues for studying biomineralization by means of scanning electron microscopy (SEM) and qualitative energy-dispersive X-ray microanalysis (EDX). Specimen preparation protocols for SEM and EDX mainly include fixation, dehydration, critical point drying (CPD), mounting, and coating. Gold-coated specimens are used for SEM imaging, while gold- and carbon-coated specimens are prepared for qualitative X-ray microanalyses separately to obtain complementary information on the elemental compositions of biominerals. During the specimen preparation procedure for SEM, some biominerals may be dislodged or scattered, making it difficult to determine their accurate locations, and light microscopy is used to complement SEM studies. Specimen preparation protocols for light microscopy generally include fixation, dehydration, infiltration and embedding with resin, microtome sectioning, and staining. In addition, microwave processing methods are adopted here to speed up the specimen preparation process for both SEM and light microscopy. 
", "pmid": "24357384", "title": "Application of SEM and EDX in studying biomineralization in plant tissues."}, {"journal": "Autophagy", "meshMajor": ["Animals", "Autophagy", "Autophagy-Related Proteins", "Cell Line", "Central Nervous System", "Disease Models, Animal", "Encephalomyelitis, Autoimmune, Experimental", "Female", "Inflammation", "Macrophages", "Mice", "Mice, Inbred C57BL", "MicroRNAs", "Microglia", "Microscopy, Electron, Transmission", "PPAR gamma", "Proto-Oncogene Proteins c-bcl-2", "Up-Regulation"], "year": "2019", "abstractText": "Microglia are innate immune cells in the central nervous system (CNS), that supplies neurons with key factors for executing autophagosomal/lysosomal functions. Macroautophagy/autophagy is a cellular catabolic process that maintains cell balance in response to stress-related stimulation. Abnormal autophagy occurs with many pathologies, such as cancer, and autoimmune and neurodegenerative diseases. Hence, clarification of the mechanisms of autophagy regulation is of utmost importance. Recently, researchers presented microRNAs (miRNAs) as novel and potent modulators of autophagic activity. Here, we found that Mir223 deficiency significantly ameliorated CNS inflammation, demyelination and the clinical symptoms of experimental autoimmune encephalomyelitis (EAE) and increased resting microglia and autophagy in brain microglial cells. In contrast, the autophagy inhibitor 3-methylademine (3-MA) aggravated the clinical symptoms of EAE in wild-type (WT) and Mir223-deficienct mice. Furthermore, it was confirmed that Mir223 deficiency in mice increased the protein expression of ATG16L1 (autophagy related 16-like 1 [S. cerevisiae]) and LC3-II in bone marrow-derived macrophage cells compared with cells from WT mice. Indeed, the cellular level of Atg16l1 was decreased in BV2 cells upon Mir223 overexpression and increased following the introduction of antagomirs. 
We also showed that the 3' UTR of Atg16l1 contained functional Mir223-responsive sequences and that overexpression of ATG16L1 returned autophagy to normal levels even in the presence of Mir223 mimics. Collectively, these data indicate that Mir223 is a novel and important regulator of autophagy and that Atg16l1 is a Mir223 target in this process, which may have implications for improving our understanding of the neuroinflammatory process of EAE. Abbreviations: 3-MA: 3-methylademine; ACTB/\u03b2-actin: actin, beta; ATG: autophagy related; ATG16L1: autophagy related 16-like 1 (S. cerevisiae); BECN1: beclin 1, autophagy related; CNR2: cannabinoid receptor 2 (macrophage); CNS: central nervous system; CQ: chloroquine; EAE: experimental autoimmune encephalomyelitis; FOXO3: forkhead box O3; GAPDH: glyceraldehyde-3-phosphate dehydrogenase; H&E: hematoxylin and eosin; ITGAM: integrin alpha M; LPS: lipoplysaccharide; MAP1LC3/LC3: microtubule-associated protein 1 light chain 3; miRNAs: microRNAs; MS: multiple sclerosis; PPARG: peroxisome proliferator activated receptor gamma; PTPRC: protein tyrosine phosphatase, receptor type, C; RA: rheumatoid arthritis; SQSTM1: sequestosome 1; TB: tuberculosis; TIMM23: translocase of inner mitochondrial membrane 23; TLR: toll-like receptor.", "pmid": "30208760", "title": "Mir223 restrains autophagy and promotes CNS inflammation by targeting ATG16L1."}, {"journal": "The Journal of hand surgery", "meshMajor": ["Computer Communication Networks", "Computers", "Education, Medical, Continuing", "Education, Medical, Graduate", "Faculty, Medical", "Hand", "Humans", "Internship and Residency", "Patient Care Team", "Periodicals as Topic", "Software", "Videoconferencing"], "year": "2014", "abstractText": "With our hand team scattered across several different locations, it is difficult to find a time to get together for our weekly didactic hand conference and monthly hand journal club. 
In addition, traffic and tight clinical schedules sometimes force conferences to start late or be canceled. Our solution was to set up an on-line conference. Using TeamViewer to host our conference and Skype to host our journal clubs, we experienced increased attendance by both faculty and residents in our\u00a0meetings. In this article, we establish a method of hosting effective on-line videoconferences to facilitate nearly universal participation of our hand team, and we hope to assist others who wish\u00a0to establish similar setups in their communities. ", "pmid": "24315487", "title": "How to establish an interactive eConference and eJournal Club."}, {"journal": "Acta tropica", "meshMajor": ["Animals", "Cote d'Ivoire", "Electrophoresis", "Humans", "Isoenzymes", "Male", "Swine", "Trypanosoma", "Trypanosomiasis, African"], "year": "1981", "abstractText": "'Mini-pigs' were infected with salivarian Trypanozoon clones to examine the persistence and stability of the human serum resistance [Blood Incubation Infectivity Test (BIIT)] and isoenzyme characteristics during infection in a new host. A stock regarded as Trypanosoma brucei, derived from a domestic pig in the Ivory Coast, retained its BIIT negative (serum sensitive), alanine aminotransferase (ALAT) and peptidase 2 (PEP 2) characteristics throughout 343 days of infection in pigs. Similarly there was no change in the BIIT positive (serum resistant) and different ALAT and PEP characteristics of a human isolate from the same area, and regarded as T. b. gambiense, during 154 days before the infection became undetectable. In mixed infections of the two clones in pigs, trypanosomes which were not treated with human serum and inoculated into Mastomys natalensis invariably displayed the 'T. b. brucei' characteristics. However, simultaneous inoculations of trypanosomes treated with human serum into M. natalensis always displayed the characteristics of the T. b. gambiense. Thus, in mixed infections, in which 'T. b. 
brucei' predominated, the minority 'T. b. gambiense' population was recoverable after treatment with human serum by subinoculation into Mastomys.", "pmid": "6123244", "title": "On the persistence of human serum resistance and isoenzyme patterns of Trypanozoon in experimentally infected pigs."}, {"journal": "The Journal of biological chemistry", "meshMajor": ["Adaptor Proteins, Signal Transducing", "Ataxia Telangiectasia Mutated Proteins", "BRCA1 Protein", "Cell Cycle", "Cell Cycle Proteins", "Checkpoint Kinase 1", "DNA Damage", "DNA, Complementary", "DNA-Binding Proteins", "Dose-Response Relationship, Drug", "Dose-Response Relationship, Radiation", "Down-Regulation", "Glutathione Transferase", "HeLa Cells", "Humans", "K562 Cells", "Microscopy, Fluorescence", "Nuclear Proteins", "Phosphorylation", "Plasmids", "Precipitin Tests", "Protein Binding", "Protein Kinases", "Protein Structure, Tertiary", "Protein-Serine-Threonine Kinases", "RNA, Small Interfering", "S Phase", "Time Factors", "Trans-Activators", "Transfection", "Tumor Suppressor Proteins"], "year": "2003", "abstractText": "BRCA1 is a tumor suppressor involved in DNA repair and damage-induced checkpoint controls. In response to DNA damage, BRCA1 relocalizes to nuclear foci at the sites of DNA lesions. However, little is known about the regulation of BRCA1 relocalization following DNA damage. Here we show that mediator of DNA damage checkpoint protein 1 (MDC1), previously named NFBD1 or Kiaa0170, is a proximate mediator of DNA damage responses that regulates BRCA1 function. MDC1 regulates ataxia-telangiectasia-mutated (ATM)-dependent phosphorylation events at the site of DNA damage. Importantly down-regulation of MDC1 abolishes the relocalization and hyperphosphorylation of BRCA1 following DNA damage, which coincides with defective G(2)/M checkpoint control in response to DNA damage. 
Taken together these data suggest that MDC1 regulates BRCA1 function in DNA damage checkpoint control.", "pmid": "12611903", "title": "Mediator of DNA damage checkpoint protein 1 regulates BRCA1 localization and phosphorylation in DNA damage checkpoint control."}, {"journal": "Molecular autism", "meshMajor": ["Attention", "Autistic Disorder", "Case-Control Studies", "Child", "Child, Preschool", "Female", "Head Movements", "Humans", "Male", "Neurologic Examination", "Social Behavior"], "year": "2018", "abstractText": "Background: Deficits in motor movement in children with autism spectrum disorder (ASD) have typically been characterized qualitatively by human observers. Although clinicians have noted the importance of atypical head positioning (e.g. social peering and repetitive head banging) when diagnosing children with ASD, a quantitative understanding of head movement in ASD is lacking. Here, we conduct a quantitative comparison of head movement dynamics in children with and without ASD using automated, person-independent computer-vision\u00a0based head tracking (Zface). Because children with ASD often exhibit preferential attention to nonsocial versus social stimuli, we investigated whether children with and without ASD differed in their head movement dynamics depending on stimulus sociality.Methods: The current study examined differences in head movement dynamics in children with (n\u2009=\u200921) and without ASD (n\u2009=\u200921). Children were video-recorded while watching a 16-min video of social and nonsocial stimuli. Three dimensions of rigid head movement-pitch (head nods), yaw (head turns), and roll (lateral head inclinations)-were tracked using Zface. 
The root mean square of pitch, yaw, and roll was calculated to index the magnitude of head angular displacement (quantity of head movement) and angular velocity (speed).Results: Compared with children without ASD, children with ASD exhibited greater yaw displacement, indicating greater head turning, and greater velocity of yaw and roll, indicating faster head turning and inclination. Follow-up analyses indicated that differences in head movement dynamics were specific to the social rather than the nonsocial stimulus condition.Conclusions: Head movement dynamics (displacement and velocity) were greater in children with ASD than in\u00a0children without ASD, providing a quantitative foundation for previous clinical reports. Head movement differences were evident in lateral (yaw and roll) but not vertical (pitch) movement and were specific to a social rather than nonsocial condition. When presented with social stimuli, children with ASD had higher levels of head movement and moved their heads more quickly than children without ASD. Children with ASD may use head movement to modulate their perception of social scenes.", "pmid": "29492241", "title": "Objective measurement of head movement differences in children with and without autism spectrum disorder."}, {"journal": "The International journal on drug policy", "meshMajor": ["Adolescent", "Armed Conflicts", "Epidemics", "Female", "Food", "HIV Infections", "Humans", "Illicit Drugs", "Interpersonal Relations", "Libya", "Male", "Parent-Child Relations", "Prisons", "Religion", "Risk Factors", "Schools", "Self Concept", "Substance Abuse Treatment Centers", "Substance Abuse, Intravenous", "Substance-Related Disorders", "Surveys and Questionnaires"], "year": "2018", "abstractText": "BACKGROUND: Libya is facing a rapidly growing epidemic of illicit drug use and HIV. This situation is fueled by a complex array of factors, mainly the consequences of the political and military turmoil of the Arab Spring. 
Although it is extensively documented in other settings that young people are one of the most vulnerable groups to both HIV and illicit drug use, no study has explored this issue among young people in Libya. The current study addresses this research gap.METHODS: This study is a qualitative study using in-depth interviews guided by a semi-structured questionnaire. We used a maximum variation, purposive sampling strategy to recruit male and female participants, aged 14-18 years, from schools, prisons, and community-based informal re-education and rehabilitation centers in Tripoli, Libya.RESULTS: In total, 31 participants were recruited: 6 females and 25 males. Sixteen participants were prisoners and residents of community-based informal re-education and rehabilitation centers, and 15 were recruited in schools. Risk factors for drug use included peer influence, the increased availability and affordability of drugs, disruption of social life and healthy recreational activities, and the distress and casualties of the war. Protective factors were religious beliefs and practices, good parent-child connectedness, and high self-esteem and future aspiration. Risk factors for HIV were insufficient knowledge related to HIV transmission and unsafe injection practices, such as sharing needles and syringes.CONCLUSION: We found individual, interpersonal, family, and structural-level factors that interplayed to shape the vulnerability of young people to drug use and HIV infection in Tripoli, Libya. Structural factors, including the increased availability and affordability of drugs, provided the frame within which other factors, such as peer influence, insufficient knowledge of substance use, and HIV, operated to increase the vulnerability of young people to drugs and HIV, while religious beliefs and parent-child connectedness acted as protective factors. 
Multisectoral efforts and studies to quantitatively evaluate the magnitude and distribution of these problems are urgently needed.", "pmid": "29272852", "title": "\"Now drugs in Libya are much cheaper than food\": A qualitative study on substance use among young Libyans in post-revolution Tripoli, Libya."}, {"journal": "Journal of the American Academy of Child and Adolescent Psychiatry", "meshMajor": ["Adolescent", "Adolescent Psychiatry", "Age Factors", "Female", "Health Status", "Humans", "Longitudinal Studies", "Male", "Mental Disorders", "Mental Health Services", "Patient Acceptance of Health Care", "Psychiatric Status Rating Scales", "Risk Factors", "Socialization", "Stress, Psychological"], "year": "1993", "abstractText": "OBJECTIVE: To determine the strength of association between mental health disorders in adolescence and disorder in early adulthood.METHOD: The study used mental health data from a longitudinal investigation of a New Zealand birth cohort. Of the 943 with prevalence data for DSM-III disorder at age 15, 890 had prevalence data for DSM-III-R disorder when aged 18 years.RESULTS: Two-thirds of those with disorder at age 15 had disorder at age 18. The residual form of attention deficit disorder, simple phobias, and oppositional disorders (with no other accompanying disorders) were associated with the lowest risk of later disorder and conduct disorder with the highest. With the exception of the overall symptom level, a variety of characteristics examined (e.g., social competence and adversity) could not differentiate between those with transient disorder and those with disorder at both ages. Comparisons of those with recurring disorder and those with new disorder at age 18 showed that in addition to characteristics of the disorder, disadvantage was strongly associated with recurrent disorder.CONCLUSIONS: The risk of later disorder for those with disorder in adolescence was high and differed across type of disorder. 
Findings suggest that to reduce the risk of disorder in early adulthood, clinicians could play a more active role in community interventions with direct social outcomes.", "pmid": "8282655", "title": "Mental health disorders from age 15 to age 18 years."}, {"journal": "Journal of the American Chemical Society", "meshMajor": ["Amino Acid Sequence", "Animals", "Cell Line", "Crystallography, X-Ray", "Glycoproteins", "Glycosylation", "Macrophage-Activating Factors", "Macrophages", "Mice", "Models, Molecular", "Molecular Sequence Data", "Phagocytosis", "Vitamin D-Binding Protein"], "year": "2006", "abstractText": "Rational protein design has been successfully used to create mimics of natural proteins that retain native activity. In the present work, de novo protein engineering is explored to develop a mini-protein analogue of Gc-MAF, a glycoprotein involved in the immune system activation that has shown anticancer activity in mice. Gc-MAF is derived in vivo from vitamin D binding protein (VDBP) via enzymatic processing of its glycosaccharide to leave a single GalNAc residue located on an exposed loop. We used molecular modeling tools in conjunction with structural analysis to splice the glycosylated loop onto a stable three-helix bundle (alpha3W, PDB entry 1LQ7). The resulting 69-residue model peptide, MM1, has been successfully synthesized by solid-phase synthesis both in the aglycosylated and the glycosylated (GalNAc-MM1) form. Circular dichroism spectroscopy confirmed the expected alpha-helical secondary structure. The thermodynamic stability as evaluated from chemical and thermal denaturation is comparable with that of the scaffold protein, alpha3W, indicating that the insertion of the exogenous loop of Gc-MAF did not significantly perturb the overall structure. GalNAc-MM1 retains the macrophage stimulation activity of natural Gc-MAF; in vitro tests show an identical enhancement of Fc-receptor-mediated phagocytosis in primary macrophages. 
GalNAc-MM1 provides a framework for the development of mutants with increased activity that could be used in place of Gc-MAF as an immunomodulatory agent in therapy.", "pmid": "16734450", "title": "A designed glycoprotein analogue of Gc-MAF exhibits native-like phagocytic activity."}, {"journal": "Chronobiology international", "meshMajor": ["Animals", "Astronomical Phenomena", "Astronomy", "Female", "Humans", "Periodicity", "Placenta", "Pregnancy", "Pregnancy, Animal", "Reproduction", "Species Specificity"], "year": "1988", "abstractText": "Weekly, twice-monthly, and monthly lunar related rhythms have been alleged for various animal reproductive processes. Herein gestation times of 213 types of terrestrial placental mammals were analyzed for best-fit integer multiples approximating length of any of the above lunar related rhythms. At the same time numeric controls were constituted of a completely random, a block randomized, and a sequential set of numbers spanning the data set. Among test integers 6 through 33, the number 30, approximating the 29.53-day lunar-synodic month, was consistently and statistically a best-fit multiple to the data. This might suggest a once-monthly lunar illumination, but not a twice-monthly gravitational or near-weekly tidal, influence upon animal reproduction. As for a receptor mechanism, the tapetum, or reflective layer of the retina, present in most land mammals, but absent in humans, enhances dim illumination. 
A suggestion is that because of this visual enhancer, cycling moonlight might be a circa-lunar physiologic timer for many terrestrial mammals.", "pmid": "3219755", "title": "Common 30-day multiple in gestation time of terrestrial placentals."}, {"journal": "European journal of surgical oncology : the journal of the European Society of Surgical Oncology and the British Association of Surgical Oncology", "meshMajor": ["Adenocarcinoma", "Adult", "Aged", "Aged, 80 and over", "Disease-Free Survival", "Humans", "Lymphocyte Count", "Male", "Middle Aged", "Neoplasm Staging", "Neutrophils", "Pancreatic Neoplasms", "Predictive Value of Tests", "Preoperative Period", "ROC Curve", "Risk Factors", "Survival Rate", "Tumor Burden"], "year": "2018", "abstractText": "BACKGROUND: The neutrophil-to-lymphocyte ratio (NLR), which reflects the cancer-induced systemic inflammation response, has been proposed as a risk factor for poor long-term prognosis in cancer. We\u00a0investigated the prognostic role of the NLR and the relationship between the NLR and TNM stage in pancreatic ductal adenocarcinoma (PDAC) patients following curative resection.METHODS: One-hundred thirty-eight consecutive patients with resected PDAC were enrolled between 2004 and 2014. Univariate and multivariate analyses identified variables associated with overall survival (OS) and recurrence-free survival (RFS). Patients were stratified according to the NLR, with an NLR cut-off value of 2.2 being estimated by receiver operating characteristic curve.RESULTS: Compared to patients with a low NLR (?2.2), those with a high preoperative NLR (>2.2) had worse OS and RFS (P\u00a0=\u00a00.017, P\u00a0=\u00a00.029, respectively). 
For early-stage tumors, tumor size ?20\u00a0mm and a high NLR were independent risk factors for poor OS (hazard ratio (HR): 3.255, 95% confidence interval (CI): 1.082-9.789, P\u00a0=\u00a00.036; HR: 3.690, 95% CI: 1.026-13.272, P\u00a0=\u00a00.046, respectively) and RFS (HR:\u00a03.575, 95% CI: 1.174-10.892, P\u00a0=\u00a00.025; HR: 5.380, 95% CI: 1.587-18.234, P\u00a0=\u00a00.007, respectively). The NLR was not correlated with prognosis in patients with advanced stages.CONCLUSIONS: An elevated preoperative NLR was an important prognosticator for early TNM stage PDAC. The NLR, which is calculated using inexpensive and readily available biomarkers, could be a novel tool for predicting long-term survival in patients, especially those with early stage PDAC.", "pmid": "29807728", "title": "Preoperative neutrophil-to-lymphocyte ratio as a prognosticator in early stage pancreatic ductal adenocarcinoma."}, {"journal": "Acta obstetricia et gynecologica Scandinavica", "meshMajor": ["Abortion, Legal", "Adult", "Cervix Uteri", "Chorionic Gonadotropin", "Dilatation", "Female", "Humans", "Laminaria", "Placenta", "Pregnancy"], "year": "1992", "abstractText": "Serum concentrations of hCG were determined in blood samples taken 18-20 h and immediately before vacuum aspiration in 45 women in gestational weeks 7-9, admitted for legal abortion. In 35 of the women, a laminaria tent was inserted for cervical dilatation immediately after the first blood sampling. Serum hCG values decreased significantly in the women pretreated with laminaria tent, but were unchanged in the untreated women. 
This finding may indicate that pretreatment with a laminaria tent induces a partial placental detachment.", "pmid": "1315097", "title": "Effect of predilatation of the uterine cervix by laminaria tent on activity of the placenta."}, {"journal": "Brazilian dental journal", "meshMajor": ["Adolescent", "Alveolar Process", "Cephalometry", "Child", "Chin", "Facial Bones", "Humans", "Incisor", "Malocclusion, Angle Class I", "Mandible", "Maxilla", "Molar", "Nasal Bone", "Palate", "Puberty", "Tooth", "Vertical Dimension"], "year": "2004", "abstractText": "The dental and skeletal dimensions of individuals with Class I skeletal pattern in puberty were compared. Eighty patients with Class I malocclusion were selected, independent of the vertical relations (overbite) of the incisors. The sample was divided into 3 groups: normal, short and excessive lower anterior face height, based on facial proportions. The dental and skeletal measurements of the 3 groups were compared among themselves. In the angular measurements, the results showed no correlation in the mandibular plane angle. In the linear measurements, the mandibular length was significantly greater in the group of patients with short lower anterior face height, with a positive correlation among the three groups. 
The dentoalveolar heights of the incisors had a positive correlation among the three groups in relation to the lower anterior face height, showing that they are responsible for its variation.", "pmid": "15322649", "title": "Dental-skeletal dimensions in growing individuals with variations in the lower facial height."}, {"journal": "Journal of nuclear medicine : official publication, Society of Nuclear Medicine", "meshMajor": ["Cardiomyopathy, Dilated", "Cardiomyopathy, Hypertrophic", "Electrocardiography", "Female", "Heart", "Humans", "Image Processing, Computer-Assisted", "Male", "Middle Aged", "Models, Structural", "Myocardial Contraction", "Myocardial Infarction", "Thallium Radioisotopes", "Tomography, Emission-Computed, Single-Photon", "Ventricular Function, Left"], "year": "1991", "abstractText": "We measured left ventricular (LV) systolic thickening expressed as a systolic thickening ratio in 28 patients, using 201Tl ECG-gated SPECT. Five normals, 15 patients with prior myocardial infarction, 5 with hypertrophic cardiomyopathy, and 3 with dilated cardiomyopathy were studied. The systolic thickening ratio was calculated as [(end-systolic--end-diastolic pixel counts) divided by end-diastolic pixel counts], using the circumferential profile technique of both end-diastolic and end-systolic short axial images. Functional images of the systolic thickening ratio were also displayed with the \"bull's-eye\" method. The mean systolic thickening ratio thus calculated were as follows: normals, 0.53 +/- 0.05 (mean +/- 1 s.d.); non-transmural prior myocardial infarction, 0.33 +/- 0.09; transmural prior myocardial infarction, 0.14 +/- 0.05; hypertrophic cardiomyopathy in relatively nonhypertrophied areas, 0.56 +/- 0.11; hypertrophic cardiomyopathy in hypertrophied areas, 0.23 +/- 0.07; and dilated cardiomyopathy, 0.19 +/- 0.02. 
The systolic thickening ratio analysis by gated thallium SPECT offers a unique approach for assessing LV function.", "pmid": "1869967", "title": "Assessment of systolic thickening with thallium-201 ECG-gated single-photon emission computed tomography: a parameter for local left ventricular function."}, {"journal": "Annals of plastic surgery", "meshMajor": ["Adolescent", "Adult", "Asian Continental Ancestry Group", "Cartilage", "Cohort Studies", "Esthetics", "Female", "Follow-Up Studies", "Humans", "Male", "Middle Aged", "Nasal Septum", "Nose Deformities, Acquired", "Patient Satisfaction", "Republic of Korea", "Retrospective Studies", "Rhinoplasty", "Risk Assessment", "Treatment Outcome", "Young Adult"], "year": "2017", "abstractText": "BACKGROUND: A deviated nose is a common deformity encountered in rhinoplasty. Over the past several decades, a variety of rhinoplasty techniques have been described focusing on the classification of bony and cartilaginous deviation. Nevertheless, corrective rhinoplasty is still a challenging procedure even for experienced surgeons because of the high recurrence rate of deviation. In attempt to reduce the recurrence rate, the author systematized the complex procedures by using a single technique regardless of the classification of a deviation.MATERIALS AND METHODS: Forty patients who underwent corrective rhinoplasty between June 2009 and December 2014 were reviewed retrospectively. All the patients were operated using 4 main surgical procedures: anterior approach septal correction, unilateral osteotomy, and medialization of the deviated complex to the contralateral intact side, and dorsal augmentation with a dermofat graft. Assessment of improvement was based on photo standardization. 
The degree of nasal deviation, nasofrontal angle, tip projection-to-nasal length ratio, vertical line of the upper lip-to-tip projection ratio, and columellar-labial angle were measured.RESULTS: Preoperative and postoperative anthropometric measurements revealed that the mean degree of deviation changed from 10.19\u00b0 to 3.43\u00b0 (P < 0.01), and the degree of nasofrontal angle changed from 131.55\u00b0 to 133.14\u00b0 (P < 0.01). All patients responded to both the preoperative and postoperative questionnaires. The questionnaires revealed a significant functional and cosmetic improvement from 36.84\u00b0 to 76.95\u00b0 and 39.45\u00b0 to 79.41\u00b0, respectively (P < 0.0001).CONCLUSIONS: This systematized strategy to correct the Asian deviated nose provided reproducible and consistent results It also resulted in low recurrence rates and high postoperative satisfaction among patients.", "pmid": "27922895", "title": "A Systematized Strategy in Corrective Rhinoplasty for the Asian Deviated Nose."}, {"journal": "PloS one", "meshMajor": ["Cancer Vaccines", "Computer Simulation", "Epitopes", "Gene Expression Regulation, Neoplastic", "Genome, Human", "Genomics", "Humans", "Immunotherapy", "Internet", "Mutation", "Neoplasms", "Peptides", "Precision Medicine"], "year": "2016", "abstractText": "Due to advancement in sequencing technology, genomes of thousands of cancer tissues or cell-lines have been sequenced. Identification of cancer-specific epitopes or neoepitopes from cancer genomes is one of the major challenges in the field of immunotherapy or vaccine development. This paper describes a platform Cancertope, developed for designing genome-based immunotherapy or vaccine against a cancer cell. Broadly, the integrated resources on this platform are apportioned into three precise sections. First section explains a cancer-specific database of neoepitopes generated from genome of 905 cancer cell lines. 
This database harbors wide range of epitopes (e.g., B-cell, CD8+ T-cell, HLA class I, HLA class II) against 60 cancer-specific vaccine antigens. Second section describes a partially personalized module developed for predicting potential neoepitopes against a user-specific cancer genome. Finally, we describe a fully personalized module developed for identification of neoepitopes from genomes of cancerous and healthy cells of a cancer-patient. In order to assist the scientific community, wide range of tools are incorporated in this platform that includes screening of epitopes against human reference proteome (http://www.imtech.res.in/raghava/cancertope/).", "pmid": "27832200", "title": "A Platform for Designing Genome-Based Personalized Immunotherapy or Vaccine against Cancer."}, {"journal": "Journal of applied physiology (Bethesda, Md. : 1985)", "meshMajor": ["Animals", "Isometric Contraction", "Male", "Muscle Fibers, Skeletal", "Muscle, Skeletal", "Rats", "Rats, Inbred F344", "Reflex", "Regeneration", "Transplantation, Autologous"], "year": "1998", "abstractText": "In rats, combinations of plantar flexor muscles representing approximately 20, 40, 60, and 80% of the mass of the total plantar flexor group were transferred orthotopically in the absence of synergistic muscles and allowed to recover for 120 days. We hypothesized that, compared with their individual control values for structural and functional variables, the transfers would display a hierarchical array of deficits, proportional to their initial mass and, consequently, inversely proportional to the relative load on the transfers. Surprisingly, compared with their individual control values, each muscle transfer displayed deficits of 30-40% in muscle mass, total fiber cross-sectional area, and maximum isometric force, with the exception of the smallest transfer, the plantaris (PLN) muscle, which recovered 100% of its control value for each of these variables. 
Therefore, except for the PLN transfer, the muscle transfers studied displayed deficits similar in magnitude to those reported for muscles transferred in the presence of synergistic muscles. The greater recovery of the PLN transfer was attributed to the relatively large requirement for force production imposed on this transfer due to the average force requirements of the total plantar flexor group.", "pmid": "9609778", "title": "Recovery of muscle transfers replacing the total plantar flexor muscle group in rats."}, {"journal": "Journal of nephrology", "meshMajor": ["Adult", "Aged", "Calcinosis", "Coronary Disease", "Exercise Test", "Female", "Heart", "Humans", "Kidney Transplantation", "Male", "Middle Aged", "Myocardial Ischemia", "Radionuclide Imaging"], "year": null, "abstractText": "Whether coronary artery calcium (CAC) screening in pretransplant patients may help predict silent myocardial ischemia is unknown. Accordingly, we performed CAC imaging on 46 nondiabetic patients awaiting kidneytransplant. All patients underwent multidetector computed tomography imaging for CAC quantification, and a vasodilator myocardial perfusion stress (MPS) test was performed only in patients with a total CAC score>300 or>100 in a single coronary artery. The mean patient's age was 46+/-14 years and the median dialysis vintage was 33 months (interquartile range 19-53). The median CAC score was 82 (interquartile range 0-700) and correlated with patients' age (p=0.006) and dialysis vintage (p=0.02). Nineteen patients qualified for MPS, but 5 refused the test. Of the remaining 14 patients, 7 patients had normal scans and 7 showed a minimal perfusion defect in the inferoposterior segment of the left ventricle. At the time of writing, 12 patients have undergone successful kidney transplantation without untoward complications. CAC screening does not appear to be associated with silent ischemia in pretransplant patients. 
Though CAC is extensive in dialysis patients, calcium may be associated with nonobstructive atherosclerotic lesions or calcification of the media layer of the vessel wall.", "pmid": "17048205", "title": "Screening for silent ischemia with coronary artery calcium and nuclear stress testing in nondiabetic patients prior to kidney transplant."}, {"journal": "Journal of molecular biology", "meshMajor": ["Anticodon", "Base Sequence", "Molecular Sequence Data", "Nuclear Magnetic Resonance, Biomolecular", "Nucleic Acid Conformation", "RNA, Transfer, Phe"], "year": "2003", "abstractText": "Post-transcriptional modifications contribute chemistry and structure to RNAs. Modifications of tRNA at nucleoside 37, 3'-adjacent to the anticodon, are particularly interesting because they facilitate codon recognition and negate translational frame-shifting. To assess if the functional contribution of a position 37-modified nucleoside defines a specific structure or restricts conformational flexibility, structures of the yeast tRNA(Phe) anticodon stem and loop (ASL(Phe)) with naturally occurring modified nucleosides differing only at position 37, ASL(Phe)-(Cm(32),Gm(34),m(5)C(40)), and ASL(Phe)-(Cm(32),Gm(34),m(1)G(37),m(5)C(40)), were determined by NMR spectroscopy and restrained molecular dynamics. The ASL structures had similarly resolved stems (RMSD approximately 0.6A) of five canonical base-pairs in standard A-form RNA. The \"NOE walk\" was evident on the 5' and 3' sides of the stems of both RNAs, and extended to the adjacent loop nucleosides. The NOESY cross-peaks involving U(33) H2' and characteristic of tRNA's anticodon domain U-turn were present but weak, whereas those involving the U(33) H1' proton were absent from the spectra of both ASLs. However, ASL(Phe)-(Cm(32),Gm(34),m(1)G(37),m(5)C(40)) exhibited the downfield shifted 31P resonance of U(33)pGm(34) indicative of U-turns; ASL(Phe)-(Cm(32),Gm(34),m(5)C(40)) did not. 
An unusual \"backwards\" NOE between Gm(34) and A(35) (Gm(34)/H8 to A(35)/H1') was observed in both molecules. The RNAs exhibited a protonated A(+)(38) resulting in the final structures having C(32).A(+)(38) intra-loop base-pairs, with that of ASL(Phe)-(Cm(32),Gm(34),m(1)G(37),m(5)C(40)) being especially well defined. A single family of low-energy structures of ASL(Phe)-(Cm(32),Gm(34), m(1)G(37),m(5)C(40)) (loop RMSD 0.98A) exhibited a significantly restricted conformational space for the anticodon loop in comparison to that of ASL(Phe)-(Cm(32),Gm(34),m(5)C(40)) (loop RMSD 2.58A). In addition, the ASL(Phe)-(Cm(32),Gm(34),m(1)G(37),m(5)C(40)) average structure had a greater degree of similarity to that of the yeast tRNA(Phe) crystal structure. A comparison of the resulting structures indicates that modification of position 37 affects the accuracy of decoding and the maintenance of the mRNA reading frame by restricting anticodon loop conformational space.", "pmid": "14643656", "title": "Naturally-occurring modification restricts the anticodon domain conformational space of tRNA(Phe)."}, {"journal": "The Plant journal : for cell and molecular biology", "meshMajor": ["Cysteine Endopeptidases", "Gene Knockdown Techniques", "Mass Spectrometry", "Methionine", "Plant Diseases", "Plants, Genetically Modified", "Potyvirus", "Protein Modification, Translational", "RNA Interference", "RNA, Viral", "RNA-Induced Silencing Complex", "Ribosomal Proteins", "Tobacco", "Viral Proteins"], "year": "2016", "abstractText": "Potyviral helper component proteinase (HCPro) is a well-characterized suppressor of antiviral RNA silencing, but its mechanism of action is not yet fully understood. In this study, we used affinity purification coupled with mass spectrometry to identify binding partners of HCPro in potyvirus-infected plant cells. 
This approach led to identification of various HCPro interactors, including two key enzymes of the methionine cycle, S-adenosyl-L-methionine synthase and S-adenosyl-L-homocysteine hydrolase. This finding, together with the results of enzymatic activity and gene knockdown experiments, suggests a mechanism in which HCPro complexes containing viral and host proteins act to suppress antiviral RNA silencing through local disruption of the methionine cycle. Another group of HCPro interactors identified in this study comprised ribosomal proteins. Immunoaffinity purification of ribosomes demonstrated that HCPro is associated with ribosomes in virus-infected cells. Furthermore, we show that HCPro and ARGONAUTE1 (AGO1), the core component of the RNA-induced silencing complex (RISC), interact with each other and are both associated with ribosomes in planta. These results, together with the fact that AGO1 association with ribosomes is a hallmark of RISC-mediated translational repression, suggest a second mechanism of HCPro action, whereby ribosome-associated multiprotein complexes containing HCPro relieve viral RNA translational repression through interaction with AGO1.", "pmid": "26611351", "title": "Molecular insights into the function of the viral RNA silencing suppressor HCPro."}, {"journal": "Biotechnology and applied biochemistry", "meshMajor": ["Aquatic Organisms", "Cytochrome P-450 Enzyme System", "Moritella", "Protein Conformation"], "year": null, "abstractText": "We have explored the adaptation of the cytochromes P450 (P450) of deep-sea bacteria to high hydrostatic pressures. Strict conservation of the protein fold and functional importance of protein-bound water make P450 a unique subject for the studies of high-pressure adaptation. Earlier, we expressed and purified a fatty-acid binding P450 from the deep-sea bacteria Photobacterium profundum SS9 (CYP261C1). 
Here, we report purification and initial characterization of its mesophilic ortholog from the shallow-water P. profundum 3TCK (CYP261C2), as well as another piezophilic enzyme, CYP261D1, from deep-sea Moritella sp. PE36. Comparison of the three enzymes revealed a striking peculiarity of the piezophilic enzymes. Both CYP261C1 and CYP261D1 possess an apparent pressure-induced conformational toggle actuated at the pressures commensurate with the physiological pressure of habitation of the host bacteria. Furthermore, in contrast to CYP261C2, the piezophilic CYP261 enzymes may be chromatographically separated into two fractions with different properties, and different thermodynamic parameters of spin equilibrium in particular. According to our concept, the changes in the energy landscape that evolved in pressure-tolerant enzymes must stabilize the less-hydrated, closed conformers, which may be transient in the catalytic mechanisms of nonpiezophilic enzymes. The studies of enzymes of piezophiles should help unravel the mechanisms that control water access during the catalytic cycle.", "pmid": "23586990", "title": "CYP261 enzymes from deep sea bacteria: a clue to conformational heterogeneity in cytochromes P450."}, {"journal": "Chemico-biological interactions", "meshMajor": ["Clinical Trials as Topic", "Drug Industry", "Humans", "Reproducibility of Results", "Toxicology"], "year": "2004", "abstractText": "Over the past decades, a number of drugs have been withdrawn or have required special labeling due to adverse effects observed post-marketing. Species differences in drug toxicity in preclinical safety tests and the lack of sensitive biomarkers and nonrepresentative patient population in clinical trials are probable reasons for the failures in predicting human drug toxicity. It is proposed that toxicology should evolve from an empirical practice to an investigative discipline. 
Accurate prediction of human drug toxicity requires resources and time to be spent in clearly defining key toxic pathways and corresponding risk factors, which hopefully, will be compensated by the benefits of a lower percentage of clinical failure due to toxicity and a decreased frequency of market withdrawal due to unacceptable adverse drug effects.", "pmid": "15522257", "title": "Accurate prediction of human drug toxicity: a major challenge in drug development."}, {"journal": "European journal of immunology", "meshMajor": ["Animals", "Antibodies, Monoclonal", "Antibody Specificity", "Antigens, Surface", "Cell Line", "Electrophoresis, Polyacrylamide Gel", "Female", "Leukemia, Lymphoid", "Mice", "Mice, Inbred BALB C", "T-Lymphocytes", "Thymus Gland", "beta 2-Microglobulin"], "year": "1982", "abstractText": "A monoclonal antibody, M241, was produced which binds to a human cell surface molecule with properties similar to the murine thymus leukemia (TL) antigen. This human TL-like antigen was found on thymocytes and some T cell lines derived from patients with acute lymphocytic leukemia, but was not found on peripheral blood lymphocytes or B cell lines. The monoclonal antibody M241 was used to immunoprecipitate a molecule from lysates of 125I surface-labeled MOLT 4 cells which had two subunits, a 43-kDa chain and a 12-kDa chain. The small subunit was shown to be beta 2-microglobulin (beta 2m) by immunoprecipitation with a monoclonal antibody, BBM.1, which recognizes human beta 2 m. The TL-like molecule recognized by M241 was shown to be serologically distinct from the HLA-A,B,C molecules recognized by three monoclonal antibodies W6/32, PA2.6 and BB7.8, and distinct from another human thymocyte antigen, the 49 kDa HTA 1 molecule, recognized by the monoclonal antibody NA1/34. 
Following removal of the HLA-A,B,C molecules, the HTA 1 molecules, and the M241-defined TL-like molecules from MOLT 4 lysates, additional beta 2m-associated molecules were immunoprecipitated with BBM.1. These molecules contained a 45-kDa subunit attached to beta 2m.", "pmid": "6754387", "title": "A monoclonal antibody recognizing a human thymus leukemia-like antigen associated with beta 2-microglobulin."}, {"journal": "Journal of the American Medical Women's Association (1972)", "meshMajor": ["Adult", "Case-Control Studies", "Delivery of Health Care", "Female", "Homeless Persons", "Humans", "Longitudinal Studies", "Poverty", "Stress Disorders, Post-Traumatic", "United States", "Women's Health"], "year": "2001", "abstractText": "OBJECTIVES: To identify childhood antecedents for lifetime post-traumatic stress disorder (PTSD) and to determine how this diagnosis relates to health and service use among extremely poor women.METHODS: We conducted a secondary data analysis of 425 women in the Worcester Family Research Project, a case-control longitudinal study of 220 sheltered homeless and 216 extremely poor housed (never homeless) women in Worcester, Massachusetts.RESULTS: We found that extremely poor women with lifetime PTSD were more likely to have grown up in family environments of violence, threat, and anger than those without PTSD. The strongest risk factor for PTSD was childhood sexual abuse with threat. Low-income women with lifetime PTSD had more bodily pain, even when controlling for other health and demographic factors. Women with PTSD experienced more chronic health conditions and had more problematic relationships with their health care providers and perceived more barriers to care.CONCLUSION: Many low-income women have difficulty using medical care appropriately because of childhood histories of physical and sexual abuse, the subsequent development of post-trauma responses, and structural barriers to care. 
Given these factors, it is critical that health care clinicians routinely screen for histories of violence and PTSD and develop treatment plans that ensure safety, link current symptoms with prior experiences, and provide support as necessary. A team approach coordinated by a case manager may be the best strategy. Without routine screening for PTSD and sensitive treatment, many extremely poor women will receive compromised health care and may even be retraumatized.", "pmid": "11326804", "title": "Post-traumatic stress disorder in extremely poor women: implications for health care clinicians."}, {"journal": "The Journal of organic chemistry", "meshMajor": ["Adrenergic Uptake Inhibitors", "Atomoxetine Hydrochloride", "Carbonates", "Cesium", "Cyclopropanes", "Methylation", "Models, Chemical", "Nitro Compounds", "Phenols", "Propylamines", "Stereoisomerism"], "year": "2008", "abstractText": "Nucleophilic ring opening of methyl 1-nitrocyclopropanecarboxylates by phenol derivatives in the presence of Cs2CO3 is described. The reaction tolerates a variety of substituents on both the aromatic alcohol and the cyclopropane and affords the products in good yields (53-84%) and with complete preservation of the enantiomeric excess at C-4. 
The methodology was applied in an enantioselective synthesis of the norepinephrine reuptake inhibitor atomoxetine (Strattera).", "pmid": "18671432", "title": "Nucleophilic addition of phenol derivatives to methyl 1-nitrocyclopropanecarboxylates."}, {"journal": "Viruses", "meshMajor": ["Animals", "Antibodies, Neutralizing", "Antibody Formation", "B-Lymphocytes", "Disease Models, Animal", "Germinal Center", "HIV Antibodies", "HIV Infections", "HIV-1", "Immunoglobulin G", "Immunologic Memory", "Immunophenotyping", "Male", "Mice", "Mice, Inbred BALB C", "Mice, Inbred C57BL", "Phenotype", "env Gene Products, Human Immunodeficiency Virus"], "year": "2014", "abstractText": "Continued efforts to define the immunogenic properties of the HIV-1 envelope glycoproteins (Env) are needed to elicit effective antibody (Ab) responses by vaccination. HIV-1 is a highly neutralization-resistant virus due to conformational and glycan shielding of conserved Ab determinants on the virus spike. Elicitation of broadly neutralizing Abs that bind poorly accessible epitope regions on Env is therefore extremely challenging and will likely require selective targeting of specific sub-determinants. To evaluate such approaches there is a pressing need for in vivo studies in both large and small animals, including mice. Currently, most mouse immunization studies are performed in the BALB/c strain; however, the C57BL/6 strain offers improved possibilities for mechanistic studies due to the availability of numerous knock-out strains on this genetic background. Here, we compared Env immunogenicity in BALB/c and C57BL/6 mice and found that the magnitude of the antigen-specific response was somewhat lower in C57BL/6 than in BALB/c mice by ELISA but not significantly different by B cell ELISpot measurements. 
We then established protocols for the isolation of single Env-specific memory B cells and germinal center (GC) B cells from immunized C57BL/6 mice to facilitate future studies of the elicited response at the monoclonal Ab level. We propose that these protocols can be used to gain an improved understanding of the early recruitment of Env-specific B cells to the GC as well as the archiving of such responses in the memory B cell pool following immunization. ", "pmid": "25198199", "title": "HIV-1 Env-specific memory and germinal center B cells in C57BL/6 mice."}, {"journal": "Medicinski arhiv", "meshMajor": ["Computer Simulation", "Humans", "Imaging, Three-Dimensional", "Joints", "Surgery, Computer-Assisted"], "year": "2004", "abstractText": "In this paper the authors give an overview of two systems (simulation and navigation) which are very important and give support to clinical work by making possible good visualization of the morphology and kinematics of joints. The approach to each patient with changes in the joints is individualized with a computer generated tomographical images which give very precise data which up to now had been inaccessible with clinical testing as the only alternative was the well known invasive diagnostic procedures. The first case concerns the COJOKS simulation system (COmputerized Joint Kinematics Simulation). The second case is of a navigation operative system which has recently been put into use and was developed on the basis of the GPS system (MSNT). This system is used for the precise determination of the bone structure of joints which is, by way of computer transformed into virtual 3D shape. 
This gives the surgeon all the data necessary during the operative procedure on bone and joint structures.", "pmid": "15137217", "title": "[Clinical views on simulation and navigation technologies in kinematics of joints in locomotor surgery]."}, {"journal": "Journal of virology", "meshMajor": ["Antibodies, Monoclonal", "Cell Line", "Genotype", "Influenza A virus", "Kinetics", "Mutation", "Phenotype"], "year": "1992", "abstractText": "The rates of mutation to the mar (monoclonal antibody-resistant) genotype of individual influenza virus plaque isolates, obtained from a stock generated after two successive cloning steps, have been determined by the fluctuation test. When a random sample of 60 clones was analyzed, 7 contained a proportion of mar mutants significantly higher than the average, and among them, 2 showed a mutation rate two to three times higher than the average value obtained for the virus population when the hemagglutinin-specific monoclonal antibody 2G10 was used. In order to look for mutants with higher mutation rates, a systematic search was carried out with a nonmutagenized virus stock, and several clones with increased mutation rates were isolated. One of them (mut43) was characterized further and was shown to have a mutation rate three to four times higher than that of the virus population at the sites defined by two nonoverlapping, hemagglutinin-specific monoclonal antibodies as well as at the site defined by a neuraminidase-specific monoclonal antibody. These results indicate that the mutation rate of an influenza virus is a weighted average of the contributions of a heterogeneous population. 
The consequences of this fact for the adaptive evolution of influenza viruses are discussed.", "pmid": "1548773", "title": "Heterogeneity of the mutation rates of influenza A viruses: isolation of mutator mutants."}, {"journal": "Acta biologica Hungarica", "meshMajor": ["Animals", "Cells, Cultured", "Dose-Response Relationship, Drug", "Estradiol", "Estrogens", "Female", "Growth Hormone", "Pituitary Gland, Anterior", "Prolactin", "Rats", "Rats, Sprague-Dawley"], "year": "1994", "abstractText": "Female rats were treated in vivo with estrogen for three weeks. The pituitaries were then removed and their responses to somatostatin, dopamine, TRH, hGHRH(1-44)NH2, or their combination were examined in a superfused pituitary cell system. Somatostatin did not decrease basal prolactin secretion in the control cells, but it caused a dose-dependent decrease in prolactin release from the estrogen pretreated cells. Estrogen pretreatment did not alter the sensitivity of pituitary cells to dopamine; dopamine was equally effective in the control and estrogen pretreated pituitaries in decreasing the basal prolactin secretion and TRH induced prolactin release. Prolactin release from the estrogen pretreated cells, stimulated by 25 nM TRH was inhibited by 1 nM somatostatin and nearly totally abolished by 25 nM somatostatin, whereas in the control cells only the higher dose of somatostatin caused some decrease in the prolactin release. Estrogen pretreated cells showed a reduced response to GHRH. Somatostatin did not decrease the basal secretion of GH in either group, but at 1 nM dose it completely abolished the GH release induced by equimolar concentration of GHRH. However, after somatostatin was eliminated from the system, a delayed GH release could be observed that was greater in the control pituitaries than in the estrogen pretreated pituitaries. 
It is concluded that in vivo treatment with estrogen reduces GH secretion in response to GHRH and increases prolactin secretion after TRH stimulation. After estrogen treatment, the basal and TRH stimulated prolactin release can be effectively reduced by somatostatin. These effects could be observed in vitro using estrogen free tissue culture medium for up to 36 hours after the removal of the pituitaries. The reciprocal changes in GH and prolactin secretion support the concept of the transdifferentiation of GH and prolactin secreting cells.", "pmid": "7725821", "title": "Reciprocal changes in prolactin and growth hormone secretion in vitro after in vivo estrogen treatment."}, {"journal": "The Annals of otology, rhinology, and laryngology", "meshMajor": ["Adult", "Dilatation", "Female", "Follow-Up Studies", "Humans", "Laryngostenosis", "Male", "Middle Aged", "Retrospective Studies", "Severity of Illness Index", "Tracheal Stenosis", "Treatment Outcome", "Voice Quality"], "year": "2015", "abstractText": "OBJECTIVE: To assess the impact of suspension microlaryngoscopy with balloon dilation on voice-related quality of life (V-RQOL) in laryngotracheal stenosis (LTS).METHODS: Retrospective chart review of LTS patients dilated at a tertiary-care academic hospital from 2010 to 2013. Data were obtained and then analyzed. LTS was stratified by (1) subglottic or tracheal stenosis and (2) multilevel stenosis (MLS; glottic and subglottic/tracheal). Pre- and postoperative V-RQOL and grade, roughness, breathiness, asthenia, strain (GRBAS) scores were compared. The number and frequency of balloon dilation procedures over the lifetime were secondary outcome variables.RESULTS: Thirty-eight patients were identified: 26 subglottic/tracheal and 12 multilevel. Of these, 71.4% required multiple dilations, with greatest dilations/patient for multilevel stenosis (4.8). V-RQOL improved in the 27 patients with completed pre- and postoperative scores from a mean of 70.4 to 80 (P=.025). 
Pre/postoperative V-RQOLs for tracheal/subglottic (mean, 82.8/93.8) were significantly higher (P=.0001/.0001) than multilevel stenosis (48/55.3). Voice quality-of-life improvement was significant for the subglottic/tracheal cohort (P=.036) but not for the MLS group. GRBAS was performed pre- and postoperatively in 10 patients with improvement in all domains except breathiness.CONCLUSION: Laryngotracheal stenosis is associated with dysphonia. Patients with glottic involvement have significantly worse voice quality of life than those with tracheal/subglottic stenosis. Endoscopic balloon dilation improves V-RQOL in patients with subglottic/tracheal stenosis.", "pmid": "25519815", "title": "Voice quality in laryngotracheal stenosis: impact of dilation and level of stenosis."}, {"journal": "Journal of neuroimmunology", "meshMajor": ["Adult", "Aged", "Antibodies", "Female", "Gangliosides", "Humans", "Immunoglobulin A", "Immunoglobulin G", "Immunoglobulin Isotypes", "Immunoglobulin M", "Male", "Motor Neuron Disease", "Nervous System Diseases", "Peripheral Nervous System Diseases", "Reference Values"], "year": "1995", "abstractText": "Antibodies of the IgM, IgG and IgA class against GM1, asialo-GM1, GD1b and GM2 gangliosides were determined in the sera of patients with motor neuron disease (MND), peripheral neuropathy, other neurological diseases (OND) and healthy individuals. Antibodies of the three immunoglobulin classes were present in healthy persons. MND patients did not differ from OND or controls in anti-GM1 titers of the three isotypes. In the group of peripheral neuropathy, no elevations of antibody titers were observed in patients with sensory or sensory-motor neuropathy; however, four out of 12 patients with the motor variety had very high levels of IgM or IgG antibodies. 
Two of these four patients also had increased titers of IgA antibodies, but no patients exhibited high titers restricted to this isotype.", "pmid": "7822479", "title": "Presence and isotype of anti-ganglioside antibodies in healthy persons, motor neuron disease, peripheral neuropathy, and other diseases of the nervous system."}, {"journal": "Connective tissue research", "meshMajor": ["Celecoxib", "Cells, Cultured", "Chondrocytes", "Cyclooxygenase 1", "Cyclooxygenase 2", "Cyclooxygenase Inhibitors", "Dinoprostone", "Humans", "Interleukin-1beta", "Pyrazoles", "Receptors, Prostaglandin E", "Receptors, Prostaglandin E, EP4 Subtype", "Sulfonamides"], "year": "2009", "abstractText": "Prostaglandin (PG) E(2), which exerts its actions via the PG receptors EP1-4, is produced from arachidonic acid by cyclooxygenase (COX)-1 and COX-2. The aim of this study was to investigate the mechanisms by which interleukin (IL)-1beta induces the expression of PG receptors in cultured human chondrocytes and to explore the role of PGE(2) in this process. The cells were cultured with 0, 10, or 100 U/mL IL-1beta with or without 1 muM celecoxib, a specific inhibitor of COX-2, for up to 28 days. Expression of the genes encoding COX-1, COX-2, and EP1-4 was quantified using real-time PCR, and expression of the corresponding proteins was examined using immunohistochemical staining. PGE(2) production was determined using ELISA. IL-1beta treatment caused a marked dose- and time-dependent increase in the levels of PGE(2), COX-2, and EP4 as compared with the untreated control. It did not affect the expression of COX-1, and it decreased the expression of EP1 and EP2. EP3 expression was not detected in either the absence or the presence of IL-1beta. When celecoxib was also present, IL-1beta failed to stimulate PGE(2) production and EP4 expression, but its stimulatory effect on COX-2 expression and its inhibitory effect on EP1 and EP2 expression were unchanged. 
IL-1beta increases the production of PGE(2), COX-2, and the PG receptor EP4 in cultured human chondrocytes. The increase in EP4 expression appears to be a result of the increased PGE(2) production.", "pmid": "19444759", "title": "IL-1beta stimulates the expression of prostaglandin receptor EP4 in human chondrocytes by increasing production of prostaglandin E2."}, {"journal": "Journal of cardiac surgery", "meshMajor": ["Cardiac Catheterization", "Child", "Child, Preschool", "Cineangiography", "Female", "Heart Defects, Congenital", "Humans", "Infant", "Lung", "Male", "Mucocutaneous Lymph Node Syndrome", "Pulmonary Artery", "Pulmonary Veins", "Regional Blood Flow", "Tetralogy of Fallot", "Vascular Resistance", "Ventilation-Perfusion Ratio"], "year": null, "abstractText": "OBJECTIVES: This study was done to clarify which diameter, that of the pulmonary arteries (PAs) or that of the pulmonary veins (PVs), more precisely reflects pulmonary blood flow (PBF) bilaterally and unilaterally.METHODS: To evaluate bilateral PBF, we studied 15 consecutive patients with Kawasaki disease as normal patients and 30 patients with tetralogy of Fallot who received cardiac catheterization. To evaluate unilateral PBF, 20 patients with various congenital heart diseases undergoing cineangiography and lung perfusion scintigraphy were studied. The diameter of PA was measured immediately proximal to the origin of the first lobar branches bilaterally, and right PA area, left PA area, PA area (mm2), and PA index (mm2/m2) were calculated. The diameter of PV was also measured distal to the junction with the left atrium. Right PV area, left PV area, PV area (mm2), and PV index (mm2/m2) were calculated from these diameters. Pulmonary blood flow (PBF) was obtained by the Fick method during catheterization. 
To evaluate unilateral PBF, PBF was divided into right and left PBF according to the right/left perfusion ratio measured by lung perfusion scintigraphy.RESULTS: Evaluation of bilateral PBF was as follows: in normal patients, PA and PV areas were correlated with body surface area (r = 0.88, p = 0.0001 and r = 0.93, p = 0.0001); PA index and PV index ranged from 248 to 436 (mean = 343) mm2/m2 and from 346 to 595 (mean = 466) mm2/m2, respectively, and were constant irrespective of body surface area; PA and PV areas were correlated with PBF in normal patients, as well as in patients with tetralogy of Fallot. There was a better correlation between PV area and PBF than between PA area and PBF in normal patients, as well as a significantly better correlation in patients with tetralogy of Fallot. Evaluation of unilateral PBF was as follows: right PV area was correlated with right PBF (p = 0.0002), while right PA area was not; left PV area and left PA area were correlated with left PBF; right/left PV area ratio was correlated with the right/left perfusion ratio with better agreement than right/left PA area ratio.CONCLUSION: Our data suggest that the size of PVs in patients with congenital heart disease may be more useful than the size of PAs to indicate bilateral and unilateral PBF than the size of PAs. 
Differences in PV area of each lung may be a suitable indicator of discrepancy in blood flow to each lung.", "pmid": "9591181", "title": "Diameters of the pulmonary arteries and veins as an indicator of bilateral and unilateral pulmonary blood flow in patients with congenital heart disease."}, {"journal": "Clinical hemorheology and microcirculation", "meshMajor": ["Adolescent", "Adult", "Aerobiosis", "Blood Viscosity", "Body Mass Index", "Diet", "Exercise Test", "Fibrinogen", "Football", "Heart Rate", "Hematocrit", "Humans", "Leisure Activities", "Oxygen Consumption", "Work Capacity Evaluation"], "year": "1998", "abstractText": "While it is well established that blood viscosity is decreased in sportsmen and related to fitness, the involvement of fibrinogen in this relationship is less well defined. Relationships among fitness, rheology and fibrinogen were investigated in 32 football players (age 17-33 years: 19 professionals and 13 leisure players). A submaximal 25 min exercise-test was performed and allowed the calculation of aerobic working capacity. Aerobic working capacity (W170 and VO2 max) was negatively correlated to fibrinogen (r = -0.531, p < 0.01 and r = -0.623, p < 0.01), while on the whole sample the correlation to viscosity and erythrocyte aggregation was not significant. When subjects were divided into two subgroups according to their plasma fibrinogen concentration, the aerobic working capacity (either expressed as W170 or VO2 max) is higher when plasma fibrinogen level is lower than 2.7 g/l. Thus, there is a highly significant negative correlation between fibrinogen and fitness in these sportsmen, independent of blood rheology. 
These data suggest that rheology and fibrinogen are to some extent separate determinants of an individual's fitness.", "pmid": "9874357", "title": "Fibrinogen is negatively correlated with aerobic working capacity in football players."}, {"journal": "Cadernos de saude publica", "meshMajor": ["Aged", "Brazil", "Cross-Sectional Studies", "Diabetes Mellitus", "Female", "Health Knowledge, Attitudes, Practice", "Humans", "Male", "Middle Aged", "Prevalence", "Socioeconomic Factors", "Surveys and Questionnaires"], "year": "2010", "abstractText": "The aim of the study was to assess the prevalence of self-reported diabetes in the elderly, identifying associated factors, knowledge, and practices related to treatment options. This was a cross-sectional population-based study with stratified clustered two-stage sampling in six municipalities in the State of São Paulo, Brazil. Among the 1,949 elderly, 15.4% presented self-reported diabetes. Body mass index and exercising were statistically associated with diabetes. There was a significant difference between diabetics and non-diabetics in terms of self-rated health, hospitalization, self-reported illness in the previous two weeks, and report of the following diseases: hypertension, anemia, chronic kidney disease, and heart disease. In terms of per capita family income, there was no difference in regular medical visits, participation in discussion groups, and control practices. The findings show the need for behavior changes to prevent and control diabetes and its complications. 
Educational interventions are needed to expand the coverage of diabetes care.", "pmid": "20209221", "title": "[Self-reported diabetes in the elderly: prevalence, associated factors, and control practices]."}, {"journal": "BMC developmental biology", "meshMajor": ["Arachis", "Aspergillus", "Disasters", "Expressed Sequence Tags", "Gene Expression Profiling", "Gene Expression Regulation, Plant", "Gene Library", "Genes, Plant", "Oligonucleotide Array Sequence Analysis", "Seeds"], "year": "2008", "abstractText": "BACKGROUND: Peanut (Arachis hypogaea L.) is an important crop economically and nutritionally, and is one of the most susceptible host crops to colonization of Aspergillus parasiticus and subsequent aflatoxin contamination. Knowledge from molecular genetic studies could help to devise strategies in alleviating this problem; however, few peanut DNA sequences are available in the public database. In order to understand the molecular basis of host resistance to aflatoxin contamination, a large-scale project was conducted to generate expressed sequence tags (ESTs) from developing seeds to identify resistance-related genes involved in defense response against Aspergillus infection and subsequent aflatoxin contamination.RESULTS: We constructed six different cDNA libraries derived from developing peanut seeds at three reproduction stages (R5, R6 and R7) from a resistant and a susceptible cultivated peanut genotypes, 'Tifrunner' (susceptible to Aspergillus infection with higher aflatoxin contamination and resistant to TSWV) and 'GT-C20' (resistant to Aspergillus with reduced aflatoxin contamination and susceptible to TSWV). The developing peanut seed tissues were challenged by A. parasiticus and drought stress in the field. A total of 24,192 randomly selected cDNA clones from six libraries were sequenced. After removing vector sequences and quality trimming, 21,777 high-quality EST sequences were generated. 
Sequence clustering and assembling resulted in 8,689 unique EST sequences with 1,741 tentative consensus EST sequences (TCs) and 6,948 singleton ESTs. Functional classification was performed according to MIPS functional catalogue criteria. The unique EST sequences were divided into twenty-two categories. A similarity search against the non-redundant protein database available from NCBI indicated that 84.78% of total ESTs showed significant similarity to known proteins, of which 165 genes had been previously reported in peanuts. There were differences in overall expression patterns in different libraries and genotypes. A number of sequences were expressed throughout all of the libraries, representing constitutive expressed sequences. In order to identify resistance-related genes with significantly differential expression, a statistical analysis to estimate the relative abundance (R) was used to compare the relative abundance of each gene transcripts in each cDNA library. Thirty six and forty seven unique EST sequences with threshold of R > 4 from libraries of 'GT-C20' and 'Tifrunner', respectively, were selected for examination of temporal gene expression patterns according to EST frequencies. Nine and eight resistance-related genes with significant up-regulation were obtained in 'GT-C20' and 'Tifrunner' libraries, respectively. Among them, three genes were common in both genotypes. Furthermore, a comparison of our EST sequences with other plant sequences in the TIGR Gene Indices libraries showed that the percentage of peanut EST matched to Arabidopsis thaliana, maize (Zea mays), Medicago truncatula, rapeseed (Brassica napus), rice (Oryza sativa), soybean (Glycine max) and wheat (Triticum aestivum) ESTs ranged from 33.84% to 79.46% with the sequence identity >/= 80%. 
These results revealed that peanut ESTs are more closely related to legume species than to cereal crops, and more homologous to dicot than to monocot plant species.CONCLUSION: The developed ESTs can be used to discover novel sequences or genes, to identify resistance-related genes and to detect the differences among alleles or markers between these resistant and susceptible peanut genotypes. Additionally, this large collection of cultivated peanut EST sequences will make it possible to construct microarrays for gene expression studies and for further characterization of host resistance mechanisms. It will be a valuable genomic resource for the peanut community. The 21,777 ESTs have been deposited to the NCBI GenBank database with accession numbers ES702769 to ES724546.", "pmid": "18248674", "title": "Peanut gene expression profiling in developing seeds at different reproduction stages during Aspergillus parasiticus infection."}, {"journal": "Proceedings of the National Academy of Sciences of the United States of America", "meshMajor": ["Amino Acids", "Animals", "Glutamates", "Glutathione", "Kidney", "Kinetics", "Methionine", "Mice", "Serine", "gamma-Glutamyltransferase"], "year": "1978", "abstractText": "The function of the gamma-glutamyl cycle was explored in in vivo studies in which amino acids and specific inhibitors of cycle enzymes (gamma-glutamyl transpeptidase, gamma-glutamyl cyclotransferase, gamma-glutamylcysteine synthetase, and 5-oxoprolinase) were administered to mice. 
The findings, which show that the gamma-glutamyl cycle functions in vivo, support the conclusion that gamma-glutamyl amino acids formed by gamma-glutamyl transpeptidase from externally supplied amino acids and intracellular glutathione are translocated into the cell and thus indicate that there is a significant physiological connection between the metabolism of glutathione and the transport of amino acids.", "pmid": "31622", "title": "Evidence that the gamma-glutamyl cycle functions in vivo using intracellular glutathione: effects of amino acids and selective inhibition of enzymes."}, {"journal": "Journal of cardiology", "meshMajor": ["Aged", "Angioplasty, Balloon, Coronary", "Coronary Circulation", "Female", "Humans", "Hypertension", "Male", "Middle Aged", "Myocardial Infarction", "Myocardial Revascularization", "Stents", "Stroke Volume", "Thrombolytic Therapy", "Ventricular Function, Left"], "year": "2005", "abstractText": "OBJECTIVES: To evaluate the effectiveness of distal protection with the GuardWire Plus during primary angioplasty in patients with acute myocardial infarction.METHODS: Thirty-eight consecutive patients undergoing stent implantation with distal protection using the GuardWire Plus (DP-group) were compared with a matched control group undergoing conventional stent implantation after balloon angioplasty without distal protection (NDP-group). Microvascular circulation after revascularization was assessed by Thrombolysis in Myocardial Infarction (TIMI) flow grade, myocardial blush grade (MBG), serum creatine kinase peak release, and ST resolution. Left ventricular ejection fraction was measured by echocardiography at discharge. Follow-up quantitative coronary angiography and left ventriculography were performed 6 months after percutaneous coronary intervention. 
Quantitative coronary angiography data, restenosis rate, target lesion revascularization rate and follow-up left ventricular ejection fraction were also compared between the two groups.RESULTS: No significant differences were observed in baseline clinical and angiographic characteristics between the two groups. The TIMI flow grade 3 (DP-group 81.6% vs NDP-group 57.9%)and MBG 3 (57.9% vs 30.6%)were significantly greater in the DP-group respectively (p < 0.05). Post procedural ST-segment resolution > or = 50% was found in a significantly higher percentage of patients in the DP-group (68.4% vs 42.1%, p < 0.05). Left ventricular ejection fraction at discharge was significantly greater in the DP-group (55.5 +/- 8.5% vs 45.7 +/- 11.1%, p < 0.05). However, 6 months after the percutaneous coronary intervention, no significant difference was observed between the two groups. Restenosis rate and target lesion revascularization rate were similar in the two groups.CONCLUSIONS: Distal protection with the GuardWire Plus improved the microvascular circulation as assessed by TIMI flow grade, MBG, and ST resolution. Furthermore, left ventricular ejection fraction at discharge was improved.", "pmid": "15801274", "title": "Effectiveness of distal protection with the GuardWire Plus during primary angioplasty for acute myocardial infarction."}, {"journal": "Clinical nuclear medicine", "meshMajor": ["Adolescent", "Cumulative Trauma Disorders", "Female", "Foot Injuries", "Fractures, Stress", "Humans", "Metatarsal Bones", "Music", "Radionuclide Imaging"], "year": "2007", "abstractText": "A 14-year-old girl presented with a painful right foot. She was an elite water-polo player and could recall no history of specific trauma to the foot. On close and persistent questioning, she admitted to having taken up playing the drums recently, with practice sessions of up to 4 h/d. 
She used the foot drum with her right foot and had noticed that this was becoming increasingly painful and prevented her playing the instrument for the last 2 days. Plain films of the foot were originally reported as normal, but revised to abnormal after the scintigraphic study. Bone scintigraphy confirmed a stress fracture of the right 3rd metatarsal bone. Stress fractures of the 3rd metatarsal bone are rare with only 2 previous reports in the literature.", "pmid": "17710033", "title": "Drummer's fracture of the third metatarsal bone."}, {"journal": "IEEE/ACM transactions on computational biology and bioinformatics", "meshMajor": ["Algorithms", "Benzene", "Computational Biology", "Computer Graphics", "Drug Design", "Isomerism", "Models, Chemical"], "year": null, "abstractText": "Enumeration of chemical structures is useful for drug design, which is one of the main targets of computational biology and bioinformatics. A chemical graph with no other cycles than benzene rings is called tree-like, and becomes a tree possibly with multiple edges if we contract each benzene ring into a single virtual atom of valence 6. All tree-like chemical graphs with a given tree representation are called the substituted benzene isomers of . When we replace each virtual atom in with a benzene ring to obtain a substituted benzene isomer, distinct isomers of are caused by the difference in arrangements of atom groups around a benzene ring. In this paper, we propose an efficient algorithm that enumerates all substituted benzene isomers of a given tree representation . Our algorithm first counts the number of all the isomers of the tree representation by a dynamic programming method. To enumerate all the isomers, for each , our algorithm then generates the th isomer by backtracking the counting phase of the dynamic programming. 
We also implemented our algorithm for computational experiments.", "pmid": "28113952", "title": "Enumerating Substituted Benzene Isomers of Tree-Like Chemical Graphs."}, {"journal": "Journal of molecular and cellular cardiology", "meshMajor": ["Adult", "Aged", "Cardiomyopathy, Hypertrophic", "Carrier Proteins", "Computer Simulation", "Female", "Gene Expression Profiling", "Genome, Human", "Humans", "Male", "MicroRNAs", "Middle Aged", "Mutation", "Myocardium", "Phosphorylation", "RNA, Messenger", "Reproducibility of Results", "Reverse Transcriptase Polymerase Chain Reaction", "Signal Transduction", "TRPM Cation Channels", "Transcriptome", "Troponin I", "Young Adult"], "year": "2013", "abstractText": "Hypertrophic cardiomyopathy (HCM) is predominantly caused by mutations in genes encoding sarcomeric proteins. One of the most frequent affected genes is MYBPC3, which encodes the thick filament protein cardiac myosin binding protein C. Despite the prevalence of HCM, disease pathology and clinical outcome of sarcomeric mutations are largely unknown. We hypothesized that microRNAs (miRNAs) could play a role in the disease process. To determine which miRNAs were changed in expression, miRNA arrays were performed on heart tissue from HCM patients with a MYBPC3 mutation (n=6) and compared with hearts of non-failing donors (n=6). 532 out of 664 analyzed miRNAs were expressed in at least one heart sample. 13 miRNAs were differentially expressed in HCM compared with donors (at p<0.01, fold change ? 2). The genomic context of these differentially expressed miRNAs revealed that miR-204 (fold change 2.4 in HCM vs. donor) was located in an intron of the TRPM3 gene, encoding an aspecific cation channel involved in calcium entry. RT-PCR analysis revealed a trend towards TRPM3 upregulation in HCM compared with donor myocardium (fold change 2.3, p=0.078). 
In silico identification of mRNA targets of differentially expressed miRNAs showed a large proportion of genes involved in cardiac hypertrophy and cardiac beta-adrenergic receptor signaling and we showed reduced phosphorylation of cardiac troponin I in the HCM myocardium when compared with donor. HCM patients with MYBPC3 mutations have a specific miRNA expression profile. Downstream mRNA targets reveal possible involvement in cardiac signaling pathways.", "pmid": "24083979", "title": "MicroRNA transcriptome profiling in cardiac tissue of hypertrophic cardiomyopathy patients with MYBPC3 mutations."}, {"journal": "Journal of food science", "meshMajor": ["Antibodies", "Enzyme-Linked Immunosorbent Assay", "Fermentation", "Heating", "Hot Temperature", "Humans", "Reproducibility of Results", "Soy Foods", "Soybean Proteins", "Soybeans"], "year": "2014", "abstractText": "UNLABELLED: Soybean is used in processed foods worldwide. Because soybean can cause adverse reactions in some atopic patients, appropriate labeling regarding its content in processed foods is needed to better protect consumers. In the previous study, we developed a reliable sandwich Enzyme Linked Immunosorbent Assay (ELISA) method with high sensitivity and specificity for detecting soybean proteins by using antibody to Gly m Bd 30K, which was originally characterized as a vacuolar protein with a molecular mass of 34 kDa in soybean. The ELISA displayed satisfactory repeatability and reproducibility in an interlaboratory evaluation. However, it could not detect soybean protein in fermented soybean products. We therefore developed an extraction method combined with a heating process to inhibit soybean protein degradation by microbial proteolytic enzymes in fermented soybean products. This extraction method enables the sensitive detection of soybean protein in fermented soybean products such as natto and miso. 
It was able to detect with high-sensitivity soybean protein present at 10 \u00b5g/g levels in model processed foods. This method is suitable for quantifying soybean protein in processed foods without the degrading effects of microbial proteolytic enzymes. The present extraction method can be used sensitively to monitor labeling systems in a reliable manner and should be useful for the mandatory inspections required under Japanese regulations.PRACTICAL APPLICATION: The extraction and ELISA methods that we developed enable sensitive detection of soybean protein in soybean products, including fermented foods. These methods should be useful for reliable and sensitive monitoring of product labeling systems and should help to solve the problem of insensitive in soybean labeling of processed foods.", "pmid": "24811351", "title": "Detection of soybean proteins in fermented soybean products by using heating extraction."}, {"journal": "The Journal of allergy and clinical immunology", "meshMajor": ["Air Pollution, Indoor", "Animals", "Artemisia", "Asthma", "Germany", "Humans", "Lipopolysaccharides", "Mice, Inbred BALB C", "Pantoea", "Particulate Matter", "Pollen", "Pseudomonas"], "year": "2019", "abstractText": "BACKGROUND: Endotoxin (LPS) released from gram-negative bacteria causes strong immunologic and inflammatory effects and, when airborne, can contribute to respiratory conditions, such as allergic asthma.OBJECTIVES: We sought to identify the source of airborne endotoxin and the effect of this endotoxin on allergic sensitization.METHODS: We determined LPS levels in outdoor air on a daily basis for 4 consecutive years in Munich (Germany) and Davos (Switzerland). Air was sampled as particulate matter (PM) greater than 10\u00a0\u00b5m (PM\u00a0>\u00a010) and PM between 2.5 and 10\u00a0\u00b5m. 
LPS levels were determined by using the recombinant Factor C assay.RESULTS: More than 60% of the annual endotoxin exposure was detected in the PM\u00a0>\u00a010 fraction, showing that bacteria do not aerosolize as independent units or aggregates but adhered to large particles. In Munich 70% of annual exposure was detected between June 12th and August 28th. Multivariate modeling showed that endotoxin levels could be explained by phenological parameters (ie, plant growth). Indeed, days with high airborne endotoxin levels correlated well with the amount of Artemisia pollen in the air. Pollen collected from plants across Europe (100 locations) showed that the highest levels of endotoxin were detected on Artemisia vulgaris (mugwort) pollen, with little on other pollen. Microbiome analysis showed that LPS concentrations on mugwort pollen were related to the presence of Pseudomonas species and Pantoea species communities. In a mouse model of allergic disease, the presence of LPS on mugwort pollen was needed for allergic sensitization.CONCLUSIONS: The majority of airborne endotoxin stems from bacteria dispersed with pollen of only one plant: mugwort. This\u00a0LPS was essential for inducing inflammation of the lung and allergic sensitization.", "pmid": "30012513", "title": "Artemisia pollen is the main vector for airborne endotoxin."}, {"journal": "PloS one", "meshMajor": ["Amino Acid Sequence", "Base Sequence", "Calcium-Binding Proteins", "Calmodulin", "DNA Primers", "HeLa Cells", "Humans", "Membrane Proteins", "Microscopy, Fluorescence", "Subcellular Fractions"], "year": "2011", "abstractText": "The CaBPs represent a subfamily of small EF-hand containing calcium (Ca(2+))-sensing proteins related to calmodulin that regulate key ion channels in the mammalian nervous system. 
In a recent bioinformatic analyses we determined that CaBP7 and CaBP8 form an evolutionarily distinct branch within the CaBPs (also known as the calneurons) a finding that is consistent with earlier observations characterising a putative C-terminal transmembrane (TM) spanning helix in each of these proteins which is essential for their sub-cellular targeting to the Golgi apparatus and constitutive secretory vesicles. The C-terminal position of the predicted TM-helix suggests that CaBP7 and CaBP8 could be processed in a manner analogous to tail-anchored integral membrane proteins which exhibit the ability to insert across membranes post-translationally. In this study we have investigated the topology of CaBP7 and CaBP8 within cellular membranes through a combination of trypsin protection and epitope accessibility analyses. Our results indicate that the TM-helices of CaBP7 and CaBP8 insert fully across membranes such that their extreme C-termini are luminal. The observed type-II membrane topology is consistent with processing of CaBP7 and CaBP8 as true tail-anchored proteins. This targeting mechanism is distinct from any other calmodulin related Ca(2+)-sensor and conceivably underpins unique physiological functions of these proteins.", "pmid": "21445352", "title": "Determination of the membrane topology of the small EF-hand Ca2+-sensing proteins CaBP7 and CaBP8."}, {"journal": "Journal of youth and adolescence", "meshMajor": ["Adolescent", "Adolescent Behavior", "Aggression", "Antisocial Personality Disorder", "Female", "Friends", "Humans", "Male", "Peer Group", "Peer Influence"], "year": "2019", "abstractText": "Growing evidence reveals heterogeneity in antisocial behavior and urges the need to distinguish between aggressive and nonaggressive rule-breaking behaviors. This study characterized how aggression and rule-breaking behaviors shaped peer selection and influence. 
Using a longitudinal social network modeling approach, these questions were addressed in a sample of 1034 ethno-racially diverse early adolescents (49.52% females, Mage\u2009=\u200912.1), who were assessed in fall and spring of the same year. The results showed no evidence of peer selection on aggressive and rule-breaking behaviors, and significant peer influence on aggressive behavior only. Rule-breaking also forecasted a decreased susceptibility to peer influence on aggressive behavior. The findings expanded our knowledge about complex pathways through which heterogeneity in antisocial behavior is reciprocally related to friendship networks.", "pmid": "31440880", "title": "Friendship Network Dynamics of Aggressive and Rule-Breaking Antisocial Behaviors in Adolescence."}, {"journal": "Psychiatric genetics", "meshMajor": ["Chromosome Mapping", "Chromosomes, Human, Pair 2", "Dyslexia", "Genetic Linkage", "Genetic Predisposition to Disease", "Homeodomain Proteins", "Humans", "Membrane Proteins", "Nerve Tissue Proteins", "Otx Transcription Factors", "Transcription Factors"], "year": "2002", "abstractText": "A locus on chromosome 2p12-16 has been implicated in dyslexia susceptibility by two independent linkage studies, including our own study of 119 nuclear twin-based families, each with at least one reading-disabled child. Nonetheless, no variant of any gene has been reported to show association with dyslexia, and no consistent clinical evidence exists to identify candidate genes with any strong a priori logic. We used 21 microsatellite markers spanning 2p12-16 to refine our 1-LOD unit linkage support interval to 12cM between D2S337 and D2S286. Then, in quantitative association analysis, two microsatellites yielded P values<0.05 across a range of reading-related measures (D2S2378 and D2S2114). The exon/intron borders of two positional candidate genes within the region were characterized, and the exons were screened for polymorphisms. 
The genes were Semaphorin4F (SEMA4F), which encodes a protein involved in axonal growth cone guidance, and OTX1, encoding a homeodomain transcription factor involved in forebrain development. Two non-synonymous single nucleotide polymorphisms were found in SEMA4F, each with a heterozygosity of 0.03. One intronic single nucleotide polymorphism between exons 12 and 13 of SEMA4F was tested for quantitative association, but no significant association was found. Only one single nucleotide polymorphism was found in OTX1, which was exonic but silent. Our data therefore suggest that linkage with reading disability at 2p12-16 is not caused by coding variants of SEMA4F or OTX1. Our study outlines the approach necessary for the identification of genetic variants causing dyslexia susceptibility in an epidemiological population of dyslexics.", "pmid": "11901358", "title": "Fine mapping of the chromosome 2p12-16 dyslexia susceptibility locus: quantitative association analysis and positional candidate genes SEMA4F and OTX1."}, {"journal": "PloS one", "meshMajor": ["Cells, Cultured", "Enzyme-Linked Immunosorbent Assay", "Humans", "Interleukin-8", "Lysophospholipids", "NF-kappa B", "Neutrophils", "Sphingosine"], "year": "2014", "abstractText": "The bioactive sphingolipid sphingosine 1-phosphate (S1P) is found in increased amounts in the airways of asthmatics. S1P can regulate airway smooth muscle functions associated with asthmatic inflammation and remodeling, including cytokine secretion. To date however, whether S1P induces secretion of an important chemokine responsible for neutrophilia in airway inflammation--IL-8--was unexplored. The aim of this study was to investigate whether S1P induces IL-8 gene expression and secretion to enhance neutrophil chemotaxis in vitro, as well as examine the molecular mechanisms responsible for repression by the corticosteroid dexamethasone. We show that S1P upregulates IL-8 secretion from ASM cells and enhance neutrophil chemotaxis in vitro. 
The corticosteroid dexamethasone significantly represses IL-8 mRNA expression and protein secretion in a concentration- and time-dependent manner. Additionally, we reveal that S1P-induced IL-8 secretion is p38 MAPK and ERK-dependent and that these key phosphoproteins act on the downstream effector mitogen- and stress-activated kinase 1 (MSK1) to control secretion of the neutrophil chemoattractant cytokine IL-8. The functional relevance of this in vitro data was demonstrated by neutrophil chemotaxis assays where S1P-induced effects can be significantly attenuated by pretreatment with dexamethasone, pharmacological inhibition of p38 MAPK- or ERK-mediated pathways, or by knocking down MSK-1 with siRNA. Taken together, our study reveals the molecular pathways responsible for IL-8 secretion from ASM cells in response to S1P and indicates ways in which the impact on IL-8-driven neutrophilia may be lessened.", "pmid": "24647471", "title": "Sphingosine 1-phosphate induces neutrophil chemoattractant IL-8: repression by steroids."}, {"journal": "Neuroscience letters", "meshMajor": ["Animals", "Behavior, Animal", "Disease Models, Animal", "Male", "Mice", "Mice, Inbred ICR", "Pain", "Peripheral Nervous System Diseases", "Reproducibility of Results", "Spinal Nerves", "Tail", "Temperature"], "year": "2002", "abstractText": "We attempted to develop a mouse model for peripheral neuropathy by a partial injury of the nerve supplying the tail. Under enflurane anesthesia, the unilateral superior caudal trunk was resected between the S3 and S4 spinal nerves. Tests for thermal allodynia were conducted by immersing the tail into 4 or 38 degrees C water. The mechanical allodynia was assessed by stimulating the tail with a von Frey hair (1.96 mN, 0.2 g). After the nerve injury, the experimental animals had shorter tail withdrawal latencies to cold and warm water immersion than the presurgical latency, and exhibited an increase in tail response to von Frey stimulation. 
We interpret these abnormal sensitivities as the signs of mechanical, cold and warm allodynia following the superior caudal trunk injury in the mouse.", "pmid": "11897161", "title": "A mouse model for peripheral neuropathy produced by a partial injury of the nerve supplying the tail."}, {"journal": "Zhongguo Zhong yao za zhi = Zhongguo zhongyao zazhi = China journal of Chinese materia medica", "meshMajor": ["Amino Acids", "Gas Chromatography-Mass Spectrometry", "Oils, Volatile", "Prunella", "Taste"], "year": "2014", "abstractText": "Volatile oil components and the contents and types of amino acid in spica of Prunella vulgaris were analysed by GC-MS and amino acid analyzer. Esters, fatty acids, aromatic hydrocarbon, ketone and several alcohol compounds were identified by mass spectrum comparison. In these ingredients, beta-ionone smelled aroma of cedar, raspberry, nerolidol showed weak sweet soft orange blossom flavor, neroli tasted sweet and fresh, nerolidol tasted sweet with light aroma of wood, hexadecanal showed a weak aroma of flowers and wax, alpha-sinensal had rich and fresh sweet orange flavor. To some extent, these types of aromatic substances can affect the taste of herbal tea or decoction made of Spica Prunellae. Among amino acids detected, natural amino acids accounted for a larger proportion, and those natural amino acids showed bitterness, slight bitterness, sourness (freshness), sweetness, slight sweetness, sourness (slight freshness). 
The results indicated that bitter and slightly bitter amino acids have the greatest impacts on the sense of Spica Prunellae.", "pmid": "24946541", "title": "[Preliminary analysis of bitter substances in spica of Prunella vulgaris]."}, {"journal": "Arquivos brasileiros de cardiologia", "meshMajor": ["Atrial Fibrillation", "Catheter Ablation", "Contrast Media", "Gadolinium", "Heart Atria", "Humans", "Recurrence", "Treatment Outcome"], "year": "2020", "abstractText": "BACKGROUND: Atrial fibrillation (AF) is known to induce atrial remodeling, which promotes fibrosis related to arrhythmogenesis. Accordingly, since scars induced by catheter ablation (CA) can reduce unablated fibrotic areas, greater extent of left atrial (LA) scarring may be associated with less AF recurrence after CA.OBJECTIVES: This study aims to investigate, through systematic review and meta-analysis, whether the amount of LA scarring, seen on late gadolinium enhancement magnetic resonance imaging, is associated with less AF recurrence after CA.METHODS: The recommendations of the MOOSE guideline were followed. Database search was conducted in PubMed and Cochrane Central Register of Controlled Trials (coment\u00e1rio 1) until January 2019 (coment\u00e1rio 2). Two authors performed screening, data extraction, and quality evaluation. All studies were graded as good quality. A funnel plot was generated, showing no publication bias. Statistical significance was defined as p value < 0.05.RESULTS: Eight observational studies were included in the systematic review, four of which were included in the meta-analysis. Six of the eight studies included in the systematic review showed that greater extension of LA scarring is associated with less AF recurrence after CA. Meta-analysis showed that greater extension of LA scarring is associated with less AF recurrence (SMD = 0.52; 95% CI 0.27 - 0.76; p < 0.0001).CONCLUSION: Greater extension of LA scarring is possibly associated with less AF recurrence after CA. 
Randomized studies that explore ablation methods based on this association are fundamental.", "pmid": "32074201", "title": "Extent of Left Atrial Ablation Lesions and Atrial Fibrillation Recurrence after Catheter Ablation - A Systematic Review and Meta-Analysis."}, {"journal": "Neuroscience letters", "meshMajor": ["Adult", "Color Perception", "Contrast Sensitivity", "Female", "Humans", "Male", "Photic Stimulation", "Retinal Cone Photoreceptor Cells", "Sensory Thresholds", "Visual Pathways", "Young Adult"], "year": "2018", "abstractText": "Flashing light stimulation is often used to investigate the visual system. However, the magnitude of the effect of this stimulus on the various subcortical pathways is not well investigated. The signals of conscious vision are conveyed by the magnocellular, parvocellular and koniocellular pathways. Parvocellular and koniocellular pathways (or more precisely, the L-M opponent and S-cone isolating channels) can be accessed by isoluminant red-green (L-M) and S-cone isolating stimuli, respectively. The main goal of the present study was to explore how costimulation with strong white extrafoveal light flashes alters the perception of stimuli specific to these pathways. Eleven healthy volunteers with negative neurological and ophthalmological history were enrolled for the study. Isoluminance of L-M and S-cone isolating sine-wave gratings was set individually, using the minimum motion procedure. The contrast thresholds for these stimuli as well as for achromatic gratings were determined by an adaptive staircase procedure where subjects had to indicate the orientation (horizontal, oblique or vertical) of the gratings. Thresholds were then determined again while a strong white peripheral light flash was presented 50\u202fms before each trial. Peripheral light flashes significantly (p\u202f<\u202f0.05) increased the contrast thresholds of the achromatic and S-cone isolating stimuli. 
The threshold elevation was especially marked in case of the achromatic stimuli. However, the contrast threshold for the L-M stimuli was not significantly influenced by the light flashes. We conclude that extrafoveally applied light flashes influence predominantly the perception of achromatic stimuli.", "pmid": "29751069", "title": "Extrafoveally applied flashing light affects contrast thresholds of achromatic and S-cone isolating, but not L-M cone modulated stimuli."}, {"journal": "Pediatric pulmonology", "meshMajor": ["Administration, Inhalation", "Bacteremia", "Dose-Response Relationship, Drug", "Female", "Hernia, Diaphragmatic", "Humans", "Hypoxia", "Infant, Newborn", "Infant, Premature", "Infant, Premature, Diseases", "Male", "Meconium Aspiration Syndrome", "Nitric Oxide", "Oxygen Consumption", "Respiratory Insufficiency", "Streptococcal Infections"], "year": "1995", "abstractText": "In acute hypoxemic respiratory failure of term and near-term neonates, extra- and intrapulmonary right-to-left shunting contribute to refractory hypoxemia. Inhaled nitric oxide (NO) decreases pulmonary arterial pressure and improves ventilation-perfusion mismatch in a variety of animal models and selected human patients. We report on 10 consecutive term and near-term newborns with severe acute hypoxemic respiratory failure due to diaphragmatic hernia, meconium aspiration syndrome, group B streptococcus sepsis, pneumonia or acute respiratory distress syndrome, who received increasing doses of inhaled NO (up to 80 ppm) to improve the arterial partial pressure of oxygen (PaO2). The response to NO and the optimum NO concentration which improved PaO2 varied considerably between patients. Improvement of PaO2 was absent or poor (less than 10 mm Hg) in the 4 newborns with meconium aspiration syndrome and in 1 patient with congenital diaphragmatic hernia, while in the other 5 patients inhaled NO increased the mean (+/- SE) PaO2 from 41 +/- 6 to 57 +/- 9 mm Hg (P < 0.05). 
Optimum NO concentrations determined by dose-response measurements performed during the first 8 hr of NO inhalation were 8-16 ppm except for 2 newborns with congenital diaphragmatic hernia who required 32 ppm to effectively increase PaO2. Four of the 5 patients in whom the PaO2 rose by more than 10 mm Hg received inhaled NO for extended periods of time (5 to 23 days) with no signs of tachyphylaxis. The optimum NO concentration dropped to less than 3 ppm after prolonged mechanical ventilation or when intravenous prostacyclin was given concomitantly.(ABSTRACT TRUNCATED AT 250 WORDS)", "pmid": "7567204", "title": "Dose-response to inhaled nitric oxide in acute hypoxemic respiratory failure of newborn infants: a preliminary report."}, {"journal": "Southern medical journal", "meshMajor": ["Diabetes Mellitus, Type 2", "Diagnostic Errors", "Female", "Fetal Hemoglobin", "Glycated Hemoglobin A", "Humans", "Middle Aged"], "year": "2000", "abstractText": "We present the case of a 61- year-old black woman with a diagnosis of type 2 diabetes and a falsely elevated hemoglobin A1c (HbA1c) due to hereditary persistence of fetal hemoglobin. 
Physicians and allied health care professionals are alerted to this potentially significant problem in the diagnosis and management of diabetes mellitus (DM), particularly in the wake of the Diabetes Complications and Control Trial when \"strict\" glycemic control assessed by HbA1c is now the standard of care.", "pmid": "10653068", "title": "Spurious elevation of hemoglobin A1c by hereditary persistence of fetal hemoglobin."}, {"journal": "European review for medical and pharmacological sciences", "meshMajor": ["Adult", "Antiviral Agents", "China", "Drug Resistance, Viral", "Female", "Genotype", "Hepacivirus", "Hepatitis C, Chronic", "Humans", "Male", "Middle Aged", "Mutation", "Prevalence"], "year": "2018", "abstractText": "OBJECTIVE: Although direct-acting antiviral agents (DAAs) for treating hepatitis C virus (HCV) infection have not yet been approved for clinical application at present in China, the development trend is irresistible. DAAs-containing therapeutic regimens have been approved and others are also under development worldwide. In vitro studies have shown that S282T mutation in the NS5B region of HCV is involved in DAAs resistance. The aim of this study was to investigate naturally occurring resistance mutation of S282T in different genotypes of HCV from DAA-treated na\u00efve Chinese patients who were chronically infected with HCV.PATIENTS AND METHODS: 250 Chinese patients chronically infected with HCV were enrolled in this study. All subjects were na\u00efve to DAAs. Direct sequencing of HCV NS5B region was performed in all enrolled patients.RESULTS: 70.4% (176/250) cases were infected with HCV genotype 1b, 19.2% (47/250) were 2a, 4.0% (11/250) were 6a, 3.6% (10/250) were 3b, 1.6% (4/250) were 1a and 1.2% (3/250) were 3a. Genotype 4, 5 and 7 were not observed. 
The S282T mutation was not found in any of the cases.CONCLUSIONS: The results showed that the S282T mutation was not prevalent in DAA-treated na\u00efve Chinese patients who were chronically infected with HCV.", "pmid": "30178855", "title": "Prevalence of S282T mutation in different genotypes of hepatitis C virus from DAA-treated na\u00efve Chinese patients who were chronically infected with HCV."}, {"journal": "Genome biology and evolution", "meshMajor": ["Ascomycota", "Evolution, Molecular", "Fungal Proteins", "Genes, Fungal", "Magnoliopsida", "Plant Tumors", "Species Specificity"], "year": "2014", "abstractText": "Taphrina fungi are biotrophic plant pathogens that cause plant deformity diseases. We sequenced the genomes of four Taphrina species-Taphrina wiesneri, T. deformans, T. flavorubra, and T. populina-which parasitize Prunus, Cerasus, and Populus hosts with varying severity of disease symptoms. High levels of gene synteny within Taphrina species were observed, and our comparative analysis further revealed that these fungi may utilize multiple strategies in coping with the host environment that are also found in some specialized dimorphic species. These include species-specific aneuploidy and clusters of highly diverged secreted proteins located at subtelomeres. We also identified species differences in plant hormone biosynthesis pathways, which may contribute to varying degree of disease symptoms. 
The genomes provide a rich resource for investigation into Taphrina biology and evolutionary studies across the basal ascomycetes clade.", "pmid": "24682155", "title": "Comparative genomics of Taphrina fungi causing varying degrees of tumorous deformity in plants."}, {"journal": "Journal of zoo and wildlife medicine : official publication of the American Association of Zoo Veterinarians", "meshMajor": ["Acaricides", "Animals", "Animals, Zoo", "Arvicolinae", "California", "Disease Eradication", "Endangered Species", "Ivermectin", "Mite Infestations", "Permethrin", "Rodent Diseases"], "year": "2018", "abstractText": "\u2003 Staff at a university laboratory responsible for management of a captive insurance colony of endangered Amargosa voles ( Microtus californicus scirpensis) discovered an outbreak of tropical rat mites ( Ornithonyssus bacoti) infesting 106 voles. This bloodsucking mesostigmatid mite typically occurs in laboratory settings and can cause weight loss, wounds, or other negative impacts on health. The source of the infestation was likely feral rodents, and the route was suspected to be straw bedding. Twenty-nine of the 106 (27.4%) infested voles developed ulcerated dorsal skin lesions that resolved when treated with topical selamectin. A triad approach was implemented to eradicate the mites, consisting of environmental management, individual animal treatment, and training. Voles were moved individually into a clean room containing only autoclaved materials (including straw), cages were treated with permethrin-impregnated cotton, treatment order was instituted to avoid transferring mites, and voles coming from outside were quarantined. All animals in an infested room were treated with topical selamectin, and personnel were trained on risks and new procedures. No adverse effects from the use of selamectin were identified, and this efficient protocol does not require the long-term use of acaricides. 
This report documents infestation of an endangered rodent with an exotic parasite, safe use of permethrin and selamectin in this species, and comprehensive management to manage a large infestation.", "pmid": "29900773", "title": "ERADICATION OF A TROPICAL RAT MITE ( ORNITHONYSSUS BACOTI) INFESTATION FROM A CAPTIVE COLONY OF ENDANGERED AMARGOSA VOLES ( MICROTUS CALIFORNICUS SCIRPENSIS)."}, {"journal": "Lancet (London, England)", "meshMajor": ["Attitude to Health", "Child", "Child, Preschool", "Diphtheria", "England", "Humans", "Immunization", "Immunization Schedule", "Infant", "Informed Consent", "Measles", "Parents", "Poliomyelitis", "Tetanus", "Vaccination", "Whooping Cough"], "year": "1977", "abstractText": "Immunisation levels in West Sussex for the period 1970 to 1976 were reviewed. The national decline in immunisation against whooping cough was reflected locally in 1974, 1975, and 1976 but, in contrast to the national experience, levels of immunisation against diphtheria, tetanus, poliomyelitis, or measles did not decline. It is suggested that this difference is accounted for by the use of a computer system which keeps an \"immunisation diary\" for parents and for medical and nursing staff.", "pmid": "72302", "title": "Immunisation levels---need they all decline?"}, {"journal": "Terapevticheskii arkhiv", "meshMajor": ["Asthma", "Case-Control Studies", "Forced Expiratory Volume", "Humans", "Obesity", "Plethysmography", "Total Lung Capacity", "Vital Capacity"], "year": "2019", "abstractText": "AIM: To assess the functional status of the small Airways in patients with bronchial asthma associated with obesity, by body plethysmography.MATERIALS AND METHODS: 65 patients with bronchial asthma of mild severity, partially controlled course, including 30 patients with normal body weight and 35 patients with obesity of I degree were examined. Control group-30 healthy volunteers. 
Examined forced vital capacity (FVC), forced expiratory volume in first second (FEV1) ratio of FEV1 to FVC (FEV1/FVC), maximum volumetric exhalation rate after 25.50 and 75% FVC (MEF75, MEF50, MEF25), average flow velocity in the exhalation interval 25-75% of FVC (MMEF25-75). Method bodyplethysmography was evaluated in bronchial resistance, functional residual capacity (FRC), residual volume of the lungs (RV), total lung capacity (TLC), the percentage of RV/TLC.RESULTS: Patients with bronchial asthma with obesity showed a reduction of indicators of bronchial obstruction: FEV1 of 14% (p=0.02), FEV1/FVC by 14% (p=0.001), MEF75 30% (p=0.001), MEF50 by 35% (p=0.001), MEF25 by 44% (p=0.003), MMEF25-75 by 38% (p=0.001). The increase of bronchial resistance on inhalation in 2 times (p=0.001), on exhalation in 3.3 times (p=0.003) was found, which is typical for generalized bronchial obstruction at the proximal level. An increase in RV by 24% (p=0.03), TLC - by 9% (p=0.03), RV/TLC - by 18% (p=0.03), indicating the presence of "air traps" and dysfunction of the small respiratory tract.CONCLUSION: In patients with asthma of mild severity associated with obesity, both the central bronchis and the distal lung are affected, which are manifested by generalized bronchial obstruction, the formation of "air traps" and dysfunction of the small respiratory tract.", "pmid": "31090373", "title": "Functional state of the small airways in patients with bronchial asthma associated with obesity."}, {"journal": "Neirofiziologiia = Neurophysiology", "meshMajor": ["Afferent Pathways", "Animals", "Brain Mapping", "Brain Stem", "Cats", "Diencephalon", "Hypothalamic Area, Lateral", "Hypothalamus", "Hypothalamus, Posterior", "Locomotion", "Mesencephalon"], "year": "1984", "abstractText": "Afferent brainstem projections to the functionally identified hypothalamic locomotor region were studied in cat by means of the horse-radish peroxidase technique. 
Cells of origin of such projections were found bilaterally in the different brainstem structures. Most of these cells were found in sites of location of monoaminergic systems (nucleus reticularis lateralis, locus coeruleus, nucleus tractus solitarius, raphe nuclei, substantia grisea centralis) parabrachial and certain sensory nuclei of the brainstem. The hypothalamic locomotor regions have mutual bilateral interconnections.", "pmid": "6462287", "title": "[Afferent brain stem connections of the hypothalamic locomotor area of the cat brain]."}, {"journal": "The Journal of infection", "meshMajor": ["Adolescent", "Adult", "Aged", "Aged, 80 and over", "Child", "Child, Preschool", "Female", "Humans", "Infant", "Infant, Newborn", "Male", "Middle Aged", "Multilocus Sequence Typing", "Pneumococcal Infections", "Pneumococcal Vaccines", "Prevalence", "Prospective Studies", "Serotyping", "Spain", "Streptococcus pneumoniae", "Vaccines, Conjugate", "Young Adult"], "year": "2011", "abstractText": "OBJECTIVES: The objective of this study was to learn the serotype distribution and clonal composition of pneumococci causing invasive pneumococcal disease (IPD) in children and adults in Spain before the introduction of new 10-valent (PCV10) and 13-valent (PCV13) conjugate vaccines.METHODS: This is a 1-year prospective study including all patients with culture-proved IPD admitted to 30 medical centers in Catalonia, Spain, during the year 2009.RESULTS: A total of 614 episodes of IPD occurred in 612 patients. The rates of IPD were highest in children aged <24 months and adults >64 years (64.5 and 44.7 per 100,000 population). The burden of disease was mainly due to pneumonia in all age ranges. 609 of 614 strains were serotyped and 47 different serotypes were found. Among the 609 IPD cases with known serotype, 12.2% were caused by PCV7 serotypes, 51% by PCV10 serotypes, and 71.7% by PCV13 serotypes. 608 of 614 isolates were characterized by MLST. 
The main clonal types detected were ST306, CC191 and CC230.CONCLUSIONS: PCV13 conjugate vaccine offers good coverage against IPD in Catalonia, Spain. However, the high genetic diversity of pneumococci highlights the importance of molecular surveillance systems for monitoring IPD during the vaccination period.SUMMARY: This study shows that 13-valent conjugate vaccine offers good coverage against invasive pneumococcal disease in children and adults in Spain. However, the high genetic diversity of pneumococci highlights the importance of molecular surveillance systems for monitoring IPD during the vaccination period.", "pmid": "21679725", "title": "Serotypes and clones causing invasive pneumococcal disease before the use of new conjugate vaccines in Catalonia, Spain."}, {"journal": "Langmuir : the ACS journal of surfaces and colloids", "meshMajor": ["Animals", "Cell Culture Techniques", "Cells, Cultured", "Dendrites", "Nerve Net", "Neurites", "Neurons", "Rats"], "year": "2018", "abstractText": "Despite significant progress, our knowledge of the functioning of the central nervous system still remains scarce to date. A better understanding of its behavior, in either normal or diseased conditions, goes through an increased knowledge of basic mechanisms involved in neuronal function, including at the single-cell level. This has motivated significant efforts for the development of miniaturized sensing devices to monitor neuronal activity with high spatial and signal resolution. One of the main challenges remaining to be addressed in this domain is, however, the ability to create in vitro spatially ordered neuronal networks at low density with a precise control of the cell location to ensure proper monitoring of the activity of a defined set of neurons. 
Here, we present a novel self-aligned chemical functionalization method, based on a repellant surface with patterned attractive areas, which permits the elaboration of low-density neuronal network down to individual cells with a high control of the soma location and axonal growth. This approach is compatible with complementary metal-oxide-semiconductor line technology at a wafer scale and allows performing the cell culture on packaged chip outside microelectronics facilities. Rat cortical neurons were cultured on such patterned surfaces for over one month and displayed a very high degree of organization in large networks. Indeed, more than 90% of the network nodes were settled by a soma and 100% of the connecting lines were occupied by a neurite, with a very good selectivity (low parasitic cell connections). After optimization, networks composed of 75% of unicellular nodes were obtained, together with a control at the micron scale of the location of the somas. Finally, we demonstrated that the dendritic neuronal growth was guided by the surface functionalization, even when micrometer scale topologies were encountered and we succeeded to control the extension growth along one-dimensional-aligned nanostructures with sub-micrometrical scale precision. 
This novel approach now opens the way for precise monitoring of neuronal network activity at the single-cell level.", "pmid": "29754481", "title": "Self-Aligned Functionalization Approach to Order Neuronal Networks at the Single-Cell Level."}, {"journal": "Langmuir : the ACS journal of surfaces and colloids", "meshMajor": ["Aluminum", "Biosensing Techniques", "DNA", "Electric Conductivity", "Electrochemistry", "Electrodes", "Ions", "Nanostructures", "Nucleic Acid Hybridization", "Particle Size", "Porosity", "Sensitivity and Specificity"], "year": "2005", "abstractText": "We show that nanoporous alumina modified with covalently linked DNA can be used to detect target DNA by monitoring the increase in impedance at the electrode upon DNA hybridization, which resulted from blocking the pores to ionic flow. Using cyclic voltammetry, direct current conductance, and impedance spectroscopy we confirm the importance of pore size: the effect is observed with 20-nm-diameter pores and is absent for 200-nm pores.", "pmid": "15896007", "title": "Sensing DNA hybridization via ionic conductance through a nanoporous electrode."}, {"journal": "Heart rhythm", "meshMajor": ["Administration, Oral", "Anticoagulants", "Atrial Fibrillation", "Electronics, Medical", "Humans", "Prostheses and Implants", "Risk Factors", "Sensitivity and Specificity", "Stroke"], "year": "2014", "abstractText": "The detection of atrial fibrillation (AF) by a cardiac implantable electronic device (CIED) in patients without a prior history of AF is increasing. This trend is the result of the increased number of CIEDs being implanted in a population whose multiple medical comorbidities are known to predispose to AF. Cardiac implantable electronic device-detected atrial fibrillation (CDAF) is independently associated with the development of ischemic stroke, and the annual risk may depend on both total AF burden and individual risk factors. 
No data evaluating the benefit of oral anticoagulation in this population are available, which makes the decision to initiate anticoagulation challenging. This review analyzes the available data on CDAF and the associated risk of ischemic stroke, and it presents a rationale for the use of long-term oral anticoagulation in this population. ", "pmid": "24394157", "title": "Clinical significance of atrial fibrillation detected by cardiac implantable electronic devices."}, {"journal": "Analytical chemistry", "meshMajor": ["Amino Acids", "Animals", "Ascorbic Acid", "Brain Ischemia", "Carbon", "Cerebral Cortex", "Copper", "Electrochemical Techniques", "Electrodes", "Ethylenediamines", "Ions", "Male", "Nanocomposites", "Pyridines", "Rats", "Rats, Sprague-Dawley", "Surface Properties", "Uric Acid"], "year": "2013", "abstractText": "Direct determination of cerebral metal ions in small volume biological samples is still the bottleneck for evaluating the roles that metal ions play in the physiological and pathological processes. In this work, selected copper ion (Cu(2+)) as a model, a facile and direct electrochemical method for detection of Cu(2+) has been developed on the basis of two new designed strategies: one is specific recognition molecule for Cu(2+)-AE-TPEA (N-(2-aminoethyl)-N,N',N'-tris(pyridine-2-yl-methyl)ethane-1,2-diamine); another is carbon dots (C-Dots) with high electrocatalytic activity. Based on the high affinity between TPEA and Cu(2+), the electrode assembled with C-Dot-TPEA hybridized nanocomposites shows high selectivity toward Cu(2+) over other metal ions, amino acids, and biological coexisting species, such as uric acid (UA), ascorbic acid (AA), and so on, which makes it possible to be used for determination of Cu(2+) in the complex brain system. By taking advantage of C-Dots, a dynamic linear range from 1 \u00ecM to 60 \u00ecM is first achieved with a detection limit of ?100 nM in aCSF solution. 
In addition, the developed method with theoretical simplicity and less instrumental demands exhibits long-term stability and good reproducibility. As a result, the present strategy has been successfully applied in detection of cerebral Cu(2+) in normal rat brain and that followed by global cerebral ischemia, combined with in vivo microdialysis. The determined concentrations of Cu(2+) in the rat brain microdialysates by the present method are found to be identical to those obtained by the conventional ICP-AES method.", "pmid": "23214718", "title": "Highly selective electrochemical strategy for monitoring of cerebral Cu2+ based on a carbon Dot-TPEA hybridized surface."}, {"journal": "Investigative ophthalmology & visual science", "meshMajor": ["Aged", "Aged, 80 and over", "Contrast Sensitivity", "Flicker Fusion", "Glaucoma, Open-Angle", "Humans", "Middle Aged", "Reproducibility of Results", "Vision Disorders", "Visual Acuity", "Visual Field Tests", "Visual Fields"], "year": "1997", "abstractText": "PURPOSE: The authors compared the efficacy of two different forms of flicker perimetry: temporal modulation perimetry (TMP), which measures contrast thresholds for a fixed temporal frequency, and critical flicker frequency (CFF), which measures the highest frequency for which flicker is detected at a fixed contrast.METHODS: The authors compared 16 patients with early to moderate glaucomatous visual field loss with 16 age-matched normal controls. Flicker stimuli consisted of 2 degrees diameter targets of 2 seconds in duration, presented in 44 locations throughout the central 30 degrees visual field. Flicker was presented within a cosine envelope to avoid temporal transients. For TMP, contrast sensitivity thresholds were measured for 8-Hz sinusoidal flicker; CFF thresholds were measured for a stimulus of 100% contrast.RESULTS: The results indicate that TMP and CFF produced similar test-retest reliability in normals. CFF had slightly better reliability in glaucoma patients. 
Receiver operating characteristic analysis revealed that TMP could provide better separation of normals and glaucoma patients than did CFF. Similar findings were obtained when the thresholds for both procedures were converted to Z scores.CONCLUSIONS: Both methods of flicker perimetry testing provide acceptable test-retest reliability, and both can distinguish normal subjects from glaucoma patients. However, TMP is more effective in separating normal subjects from glaucoma patients than CFF, suggesting that TMP is the method of choice for detecting glaucomatous damage using flicker perimetry.", "pmid": "9344350", "title": "Which method of flicker perimetry is most effective for detection of glaucomatous visual field loss?"}, {"journal": "Bio Systems", "meshMajor": ["Electronic Data Processing", "Models, Organizational", "Systems Biology"], "year": "2006", "abstractText": "The structure of a system influences its adaptability. An important result of adaptability theory is that subsystem independence increases adaptability [Conrad, M., 1983. Adaptability. Plenum Press, New York]. Adaptability is essential in systems that face an uncertain environment such as biological systems and organizations. Modern organizations are the product of human design. And so it is their structure and the effect that it has on their adaptability. In this paper we explore the potential effects of computer-based information processing on the adaptability of organizations. 
The integration of computer-based processes into the dynamics of the functions they support and the effect it has on subsystem independence are especially relevant to our analysis.", "pmid": "16757096", "title": "Adaptability and the integration of computer-based information processing into the dynamics of organizations."}, {"journal": "American journal of clinical pathology", "meshMajor": ["Adult", "Aged", "Aged, 80 and over", "Antigens, CD", "Bone Marrow", "Bone Marrow Cells", "Female", "Flow Cytometry", "Hematologic Neoplasms", "Humans", "Immunophenotyping", "Male", "Middle Aged", "Myelodysplastic Syndromes", "Myeloproliferative Disorders", "Young Adult"], "year": "2014", "abstractText": "OBJECTIVES: Flow cytometry immunophenotyping has been suggested as an adjunctive technique in the evaluation of myeloid malignancies, especially in the myelodysplastic syndromes. However, its use has been limited due to complexity and cost restraints. The goal of this study is to attempt a simpler approach to flow cytometry immunophenotyping in myeloid neoplasms.METHODS: We analyzed bone marrow specimens of 45 selected patients and an additional 99 consecutive random patients using a limited antibody panel.RESULTS: Normal CD34-positive blasts show a characteristic pattern of CD13/HLA-DR expression, with three readily identifiable subpopulations. 
In contrast, myeloid neoplasms frequently show loss of this heterogeneity.CONCLUSIONS: Analysis of a limited antibody panel with a focus on CD13/HLA-DR expression provides relatively high specificity and sensitivity for the detection of myeloid neoplasms.", "pmid": "25125617", "title": "Loss of blast heterogeneity in myelodysplastic syndrome and other chronic myeloid neoplasms."}, {"journal": "Oncology reports", "meshMajor": ["Activin Receptors, Type I", "Cell Division", "Humans", "Immunoprecipitation", "Membrane Proteins", "Neoplasm Invasiveness", "Oligonucleotides, Antisense", "Protein-Serine-Threonine Kinases", "RNA, Messenger", "Receptor, Transforming Growth Factor-beta Type I", "Receptor, Transforming Growth Factor-beta Type II", "Receptors, Transforming Growth Factor beta", "Reverse Transcriptase Polymerase Chain Reaction", "Stomach Neoplasms", "Thionucleotides", "Transforming Growth Factor beta", "Tumor Cells, Cultured"], "year": "2004", "abstractText": "Non-metastatic gene A (nma) has a homologue DNA sequence to a gene of bone morphogenetic proteins and activin membrane-bound inhibitor (BAMBI), which negatively regulates TGF-beta signaling. In this study, we analyzed the functional homology between Nma and BAMBI in human gastric carcinoma cell lines. Various levels of nma mRNA expression were detected by the RT-PCR technique in all human gastric carcinoma cell lines. Then, Nma antisense and sense S-oligodeoxynucleotide (ODN) were used to analyze the response of TGF-beta to cell growth and invasion gastric carcinoma cell lines. The cell growth was inhibited by TGF-beta in Nma antisense S-ODN treatment gastric carcinoma cell lines, MKN28, MKN1, MKN74 and TMK1. TGF-beta reduced cell growth and invasive activity of MKN28 treated with Nma antisense S-ODN in a dose and time-dependent manner. Furthermore, lysates of Nma sense or antisense S-ODN treated MKN28 cells were immunoprecipitated with anti-TGFbetaR-I or anti-TGFbetaR-II antibody. 
The 29 kDa signal considered as Nma appeared in sense S-ODN treated MKN28 cells immunoprecipitated with anti-TGFbetaR-I. These results indicate that Nma negatively regulates TGF-beta signaling, consequently playing an important role as one of the escape mechanisms from TGF-beta-mediated growth control similarly to BAMBI, and induce cell growth and invasion in human gastric carcinoma cell lines.", "pmid": "15138559", "title": "Effect of Nma on growth inhibition by TGF-betaa in human gastric carcinoma cell lines."}, {"journal": "Annals of hepatology", "meshMajor": ["Adolescent", "Alanine Transaminase", "Aspartate Aminotransferases", "Biomarkers", "Body Mass Index", "Chi-Square Distribution", "Child", "Cross-Sectional Studies", "Elasticity Imaging Techniques", "Female", "Humans", "Hypertension", "India", "Lipids", "Liver Cirrhosis", "Logistic Models", "Male", "Non-alcoholic Fatty Liver Disease", "Pediatric Obesity", "Platelet Count", "Predictive Value of Tests", "Prevalence", "Risk Factors"], "year": null, "abstractText": "\u00a0Background and rationale. Nonalcoholic fatty liver disease (NAFLD) is the most common cause of pediatric liver disease in western countries. Its prevalence in Indian subcontinent is not well studied.MATERIAL AND METHODS: In a school based cross sectional study we have screened overweight and obese children in the age group of 11 to 15 years for NAFLD. Ultrasonography, elevated serum transaminases, fibroscan were used for defining NAFLD. Dietary habits, blood pressure, serum lipid profile, blood counts and insulin resistance were recorded. The relation of fibrosis 4 score, pediatric NAFLD fibrosis index, aspartate transaminases to platelet ratio index (APRI) with fibroscan was evaluated.RESULTS: Out of 616 students screened 198 were overweight and obese. Hundred students and their parents gave informed consent for the further evaluation. The prevalence of NAFLD was 62% in overweight and obese children. 
Fatty liver was found in 50 % students on ultrasonography, liver stiffness (? 6.1 Kilopascals) in 23% and raised alanine transaminase in 30%. Hypertension, dyslipidemia, diabetes mellitus and insulin resistance were seen in 6%, 18%, 2% and 66% students respectively. Systolic hypertension, serum triglyceride, aspartate transaminase, APRI was significantly higher in the NAFLD group. On binary logistic regression only systolic hypertension was an independent risk factor for NAFLD.CONCLUSION: In conclusion NAFLD is common in asymptomatic overweight and obese Indian children. Systolic hypertension is the only independent factor associated with NAFLD. Fibroscan has limited role for screening. We recommend screening for NAFLD in this high risk group with alanine transaminases and ultrasonography.", "pmid": "27740518", "title": "\u00a0Most overweight and obese Indian children have nonalcoholic fatty liver disease."}, {"journal": "Journal of cataract and refractive surgery", "meshMajor": ["Conjunctiva", "Follow-Up Studies", "Glaucoma, Open-Angle", "Humans", "Intraocular Pressure", "Sclera", "Surgical Flaps", "Trabeculectomy"], "year": "1994", "abstractText": "A modified surgical technique is described for trabeculectomy using contemporary limbal incisional techniques for a simplified dissection of the lamellar scleral flap, a technically easier operation, with smoothly dissected surfaces. 
The results and complications are comparable to those using the standard method.", "pmid": "8064615", "title": "Trabeculectomy: a modified surgical technique."}, {"journal": "International journal of cancer", "meshMajor": ["Animals", "Antibodies", "Antibody-Dependent Cell Cytotoxicity", "Cell Line", "Isoantigens", "Killer Cells, Natural", "Mice", "Mice, Inbred BALB C", "Mice, Inbred C3H", "Mice, Inbred C57BL", "Mice, Nude", "Neoplasms, Experimental", "Receptors, Fc", "Spleen"], "year": "1981", "abstractText": "Antibodies reactive with effector cells were shown to augment the cytotoxicity of spleen cells from athymic nude and euthymic mice. The addition of alloantibody to the assay or pretreatment of the effector cells with alloantibody resulted in increased cytotoxicity against the human cell K562, a relatively poor target for spontaneous mouse NK activity. When monoclonal antibodies were tested, cytotoxicity was markedly increased by some antibodies, such as anti-H-2, anti-la, and anti-Thy 1.2, while others had no effect. The degree of augmentation of cytotoxicity was dependent on the concentration of antibody added. Nylon-wool-nonadherent nu/nu splenic effector cells mediated the antibody-induced cytotoxicity and anti-asialo GMI plus complement abolished activity, indicating that the cells mediating the cytotoxicity were NK cells and not mature T cells, B cells or macrophages. When spleen cells from mice having different levels of NK activity were evaluated in this system, the magnitude of augmentation by antibody correlated with the level of spontaneous NK activity and no increased cytotoxicity was found with cell populations that had low spontaneous NK activity. Testing a panel of target cells, showed that certain human and mouse cell lines, with low to moderate susceptibility to spontaneous NK activity, were sensitive to antibody-induced NK-cell-mediated cytotoxicity whereas others were completely resistant. 
Both Fc-IgG receptor-positive and -negative cell lines were susceptible target cells. These results indicate that antibodies reactive with murine NK cells can increase their cytolytic activity.", "pmid": "7287215", "title": "Antibody-induced augmentation of murine natural killer cell activity."}, {"journal": "Pediatrics", "meshMajor": ["Adolescent", "Adolescent Behavior", "Bulimia", "Comorbidity", "Depression", "Female", "Humans", "Hyperphagia", "Male", "Minnesota", "Obesity", "Prevalence", "Risk Assessment", "Self Concept", "Sex Distribution", "Suicide, Attempted"], "year": "2003", "abstractText": "OBJECTIVE: To assess the prevalence of overeating among adolescents and to examine associations between overeating and sociodemographic characteristics, weight status, dieting behaviors, body satisfaction, depressive mood, self-esteem, and suicide.METHOD: A school-based sample of 4746 boys and girls in public middle and high schools in Minnesota completed the Project EAT (Eating Among Teens) survey and anthropometric measurements of height and weight.RESULTS: Overall, 17.3% of girls and 7.8% of boys reported objective overeating in the past year. Youths who engaged in overeating were more likely to be overweight or obese, to have dieted in the past year, to be trying to lose weight currently, and to report that weight and shape are very important to their overall feelings about self. Youths who met criteria for binge eating syndrome (high frequency of objective overeating with loss of control and distress regarding the binge eating) scored significantly lower on measures of body satisfaction and self-esteem and higher on a measure of depressive mood than those who reported either subclinical or no binge eating. 
Overeating was associated with suicide risk; more than one fourth of girls (28.6%) and boys (27.8%) who met criteria for binge eating syndrome reported that they had attempted suicide.CONCLUSIONS: Overeating among adolescents is associated with a number of adverse behaviors and negative psychological experiences. As the current study is cross-sectional, it is not possible to ascertain cause and effect. Future research should seek to identify whether objective overeating is an early warning sign of additional psychological distress or is a potential consequence of compromised psychological health. Clinical implications are discussed.", "pmid": "12509556", "title": "Overeating among adolescents: prevalence and associations with weight-related characteristics and psychological health."}, {"journal": "Chemosphere", "meshMajor": ["Biotechnology", "Chemical Fractionation", "Conservation of Natural Resources", "Decontamination", "Kinetics", "Metals, Heavy", "Plants", "Saponins", "Soil Pollutants", "Surface-Active Agents"], "year": "2002", "abstractText": "A washing process was studied to evaluate the efficiency of saponin on remediating heavy metal contaminated soils. Three different types of soils (Andosol: soil A, Cambisol: soil B, Regosol: soil C) were washed with saponin in batch experiments. Utilization of saponin was effective for removal of heavy metals from soils, attaining 90-100% of Cd and 85-98% of Zn extractions. The fractionations of heavy metals removed by saponin were identified using the sequential extraction. Saponin was effective in removing the exchangeable and carbonated fractions of heavy metals from soils. In recovery procedures, the pH of soil leachates was increased to about 10.7, leading to separate heavy metals as hydroxide precipitates and saponin solute. In addition recycle of used saponin is considered to be effective for the subsequent utilization. 
The limits of Japanese leaching test were met for all of the soil residues after saponin treatment. As a whole, this study shows that saponin can be used as a cleaning agent for remediation of heavy metal contaminated soils.", "pmid": "12365835", "title": "Evaluation of remediation process with plant-derived biosurfactant for recovery of heavy metals from contaminated soils."}, {"journal": "International journal of hygiene and environmental health", "meshMajor": ["Animals", "Cattle", "Colony Count, Microbial", "DNA, Bacterial", "Escherichia coli", "Food Handling", "Meat"], "year": "2007", "abstractText": "A molecular-based detection method was developed to detect Escherichia coli O26, O111 and O157 in minced (ground) beef samples. This method consists of an initial overnight enrichment in modified tryptone soya broth (mTSB) and novobiocin prior to DNA extraction and subsequent serogrouping using a triplex PCR. This method has a low limit of detection and results are available within 24 hours of receipt of samples. Once optimized, this rapid method was utilized to determine the prevalence of these E. coli serogroups in six hundred minced beef samples all of which were previously examined by immunomagnetic separation (IMS) and selective plating for E. coli O26 and O111. Using IMS, two E. coli O26 isolates were detected. No E. coli O111 were recovered. The multiplex PCR technique described here did not detect E. coli O111 nor O157 in any of the samples, however six minced beef samples were positive for E. coli O26 using our method, only two of these were previously detected by IMS and culture. 
Application of molecular methods are useful to support culture-based approaches thereby further contributing to risk reduction along the food chain.", "pmid": "17118703", "title": "Development and assessment of a rapid method to detect Escherichia coli O26, O111 and O157 in retail minced beef."}, {"journal": "PloS one", "meshMajor": ["Aged", "Diabetes Mellitus, Type 2", "Electrocardiography", "Female", "Follow-Up Studies", "Glycated Hemoglobin A", "Guideline Adherence", "Humans", "Hypoglycemic Agents", "Kidney Function Tests", "Lipoproteins", "Luxembourg", "Male", "Middle Aged", "Patient Compliance", "Practice Guidelines as Topic", "Proportional Hazards Models"], "year": "2013", "abstractText": "INTRODUCTION: Type 2 diabetes is associated with severe micro- and macro-vascular complications. Physicians' and patients' adherence to follow-up guidelines permits postponing or reducing these complications. The objectives were to assess the level of adherence to fundamental follow-up guidelines and determine patients' characteristics associated with this level of adherence in the context of Luxembourg, where no guidelines were implemented.STUDY POPULATION: The exhaustive residing population treated for type 2 diabetes in Luxembourg during the 2000-2006 period (N = 21,068).METHODS: Seven fundamental criteria were extracted from international guidelines (consultation with the treating physician, HbA1c tests, electrocardiogram, retinal, dental, lipid and renal check-ups). The factors associated with the level of adherence to those criteria were identified using a partial proportional odds model.RESULTS: In 2006, despite 90% of the patients consulted at least 4 times their treating physician, only 0.6% completed all criteria; 55.0% had no HbA1c test (-8.6 points since 2000) and 31.1% had a renal check-up (+21.6 points). 
The sex (OR(male): 0.87 [95%CI, 0.83-0.92]), the nationality (OR(NonEU): 0.64 [0.52-0.78]), the type of antidiabetic treatment (ORoral: 1.48 [1.35-1.63], OR(mixed): 1.35 [1.20-1.52]) and the type of treating physician (ORG-ID: 0.47 [0.42-0.53]) were the main factors associated with the level of adherence in 2006 (3 or more criteria).CONCLUSION: A large percentage of patients were not provided with a systematic annual follow-up between 2000 and 2006. This study highlighted the necessity to promote guidelines in Luxembourg, education for physicians and to launch a national discussion on a disease management program for diabetic patients.", "pmid": "24244637", "title": "Adherence to international follow-up guidelines in type 2 diabetes: a longitudinal cohort study in Luxembourg."}, {"journal": "Canadian journal of microbiology", "meshMajor": ["Agriculture", "Animals", "Anti-Bacterial Agents", "Bacteria", "Cattle", "Ceftriaxone", "Chickens", "Dairying", "Drug Resistance, Bacterial", "Drug Resistance, Multiple, Bacterial", "Feces"], "year": "2006", "abstractText": "Approximately 40 samples of animal feces, drinking water, feed, bedding, pine wood shavings, compost, and manure slurry were collected from two animal research farms (one dairy and one poultry) and analyzed for ceftriaxone-resistant bacteria. Our study revealed that the total percentage of aerobic bacteria with reduced susceptibility to ceftriaxone (minimal inhibitory concentration (MIC) > or = 16 micro g/mL) ranged from 0.9% to 10.8% in dairy feces and from 0.05% to 3.93% in chicken feces. The percentages of ceftriaxone-resistant bacteria (MIC > or = 64 micro g/mL) were in the range of 0.01% - 2.3% in dairy feces and 0.01% - 0.79% in chicken feces. Environmental samples contained a wide range of ceftriaxone-resistant bacterial populations. 
Among those environmental samples, fresh pine wood shavings used as chicken bedding contained the highest percentages (41.5%) of ceftriaxone-resistant bacteria, as determined by a plating method. A total of 105 ceftriaxone-resistant (MIC > or = 128 micro g/mL) bacterial isolates were isolated from the above samples and tested for resistance to nine antibiotics: ampicillin, ceftriaxone, streptomycin, kanamycin, gentamicin, chloramphenicol, tetracycline, ciprofloxacin, and nalidixic acid. The most prevalent resistance pattern (34.3%) among isolates included resistance to all nine antibiotics. Results from this study suggest that ceftriaxone-resistant bacteria exist in farm environments, and the ceftriaxone resistance was frequently associated with resistance to multiple antibiotics. Environmental sources such as pine wood shavings used as bedding can be a potential reservoir for transmitting the multidrug-resistant bacteria.", "pmid": "17110962", "title": "Occurrence of ceftriaxone-resistant commensal bacteria on a dairy farm and a poultry farm."}, {"journal": "Lancet (London, England)", "meshMajor": ["Adult", "Chlorambucil", "Drug Administration Schedule", "Drug Evaluation", "Female", "Follow-Up Studies", "Glomerulonephritis, Membranous", "Humans", "Male", "Middle Aged", "Nephrotic Syndrome", "Pancytopenia", "Prednisolone", "Proteinuria", "Risk Factors"], "year": "1988", "abstractText": "Eight patients with idiopathic membranous nephropathy whose renal function was deteriorating were given a 6-month course of alternating monthly cycles of prednisolone and chlorambucil. Proteinuria was reduced in all eight, from a mean (SD) of 15.3 (5.9) g/24 h at the start of treatment to 2.1 (1.5) g/24 h at follow-up (p less than 0.05). Creatinine clearance increased in six, and the rate of decline was reduced in the other two (group mean 51.6 [17.8] ml/min at the start of treatment and 81.4 [36.8] ml/min at follow-up; p less than 0.05). 
Adverse effects of chlorambucil were severe, and the daily dose had to be reduced. Prednisolone and chlorambucil treatment can change the natural course of membranous nephropathy even when renal function has started to deteriorate, so treatment can be reserved for high-risk patients.", "pmid": "2902317", "title": "Prednisolone and chlorambucil treatment in idiopathic membranous nephropathy with deteriorating renal function."}, {"journal": "The Journal of neuroscience : the official journal of the Society for Neuroscience", "meshMajor": ["Animals", "Cocaine", "Cyclic AMP Response Element-Binding Protein", "Male", "Mice", "Mice, Inbred C57BL", "Mice, Transgenic", "Nucleus Accumbens", "Prosencephalon", "Rats", "Rats, Sprague-Dawley", "Reward"], "year": "2009", "abstractText": "The transcription factor cAMP response element-binding protein (CREB) within the nucleus accumbens (NAc) plays an important role in regulating mood. In rodents, increased CREB activity within the NAc produces depression-like signs including anhedonia, whereas disruption of CREB activity by expression of a dominant-negative CREB (mCREB, which acts as a CREB antagonist) has antidepressant-like effects. We examined how disruption of CREB activity affects brain reward processes using intracranial self-stimulation (ICSS) and inducible bitransgenic mice with enriched expression of mCREB in forebrain regions including the NAc. Mutant mice or littermate controls were prepared with lateral hypothalamic stimulating electrodes, and trained in the ICSS procedure to determine the frequency at which the stimulation becomes rewarding (threshold). Inducible expression of mCREB did not affect baseline sensitivity to brain stimulation itself. However, mCREB-expressing mice were more sensitive to the rewarding (threshold-lowering) effects of cocaine. Interestingly, mCREB mice were insensitive to the depressive-like (threshold-elevating) effects of the kappa-opioid receptor agonist U50,488. 
These behavioral differences were accompanied by decreased mRNA expression of G-protein receptor kinase-3 (GRK3), a protein involved in opioid receptor desensitization, within the NAc of mCREB mice. Disruption of CREB or GRK3 activity within the NAc specifically by viral-mediated gene transfer enhanced the rewarding impact of brain stimulation in rats, establishing the contribution of functional changes within this region. Together with previous findings, these studies raise the possibility that disruption of CREB in the NAc influences motivation by simultaneously facilitating reward and reducing depressive-like states such as anhedonia and dysphoria.", "pmid": "19211892", "title": "Altered sensitivity to rewarding and aversive drugs in mice with inducible disruption of cAMP response element-binding protein function within the nucleus accumbens."}, {"journal": "Tree physiology", "meshMajor": ["Acclimatization", "Droughts", "Environment", "Ericaceae", "Forests", "Photosynthesis", "Pinus", "Plant Leaves", "Quercus", "Seasons", "Spain", "Species Specificity"], "year": "2015", "abstractText": "The Mediterranean region is a hot spot of climate change vulnerable to increased droughts and heat waves. Scaling carbon fluxes from leaf to landscape levels is particularly challenging under drought conditions. We aimed to improve the mechanistic understanding of the seasonal acclimation of photosynthesis and morphology in sunlit and shaded leaves of four Mediterranean trees (Quercus ilex L., Pinus halepensis Mill., Arbutus unedo L. and Quercus pubescens Willd.) under natural conditions. Vc,max and Jmax were not constant, and mesophyll conductance was not infinite, as assumed in most terrestrial biosphere models, but varied significantly between seasons, tree species and leaf position. Favourable conditions in winter led to photosynthetic recovery and growth in the evergreens. 
Under moderate drought, adjustments in the photo/biochemistry and stomatal/mesophyllic diffusion behaviour effectively protected the photosynthetic machineries. Severe drought, however, induced early leaf senescence mostly in A. unedo and Q. pubescens, and significantly increased leaf mass per area in Q. ilex and P. halepensis. Shaded leaves had lower photosynthetic potentials but cushioned negative effects during stress periods. Species-specificity, seasonal variations and leaf position are key factors to explain vegetation responses to abiotic stress and hold great potential to reduce uncertainties in terrestrial biosphere models especially under drought conditions. ", "pmid": "25836361", "title": "Seasonal variability of foliar photosynthetic and morphological traits and drought impacts in a Mediterranean mixed forest."}, {"journal": "Nucleic acids research", "meshMajor": ["Antigens, Neoplasm", "Antineoplastic Agents", "Cell Line, Tumor", "DNA", "DNA Breaks", "DNA Topoisomerases, Type II", "DNA-Binding Proteins", "HL-60 Cells", "Humans", "Methionine", "Organoplatinum Compounds", "Podophyllotoxin", "Poly-ADP-Ribose Binding Proteins", "Protein Conformation", "Topoisomerase II Inhibitors"], "year": "2017", "abstractText": "Human type II topoisomerase (Top2) isoforms, hTop2\u00e1 and hTop2\u00e2, are targeted by some of the most successful anticancer drugs. These drugs induce Top2-mediated DNA cleavage to trigger cell-death pathways. The potency of these drugs correlates positively with their efficacy in stabilizing the enzyme-mediated DNA breaks. Structural analysis of hTop2\u00e1 and hTop2\u00e2 revealed the presence of methionine residues in the drug-binding pocket, we therefore tested whether a tighter Top2-drug association may be accomplished by introducing a methionine-reactive Pt2+ into a drug to further stabilize the DNA break. 
Herein, we synthesized an organoplatinum compound, etoplatin-N2\u00e2, by replacing the methionine-juxtaposing group of the drug etoposide with a cis-dichlorodiammineplatinum(II) moiety. Compared to etoposide, etoplatin-N2\u00e2 more potently inhibits both human Top2s. While the DNA breaks arrested by etoposide can be rejoined, those captured by etoplatin-N2\u00e2 are practically irreversible. Crystallographic analyses of hTop2\u00e2 complexed with DNA and etoplatin-N2\u00e2 demonstrate coordinate bond formation between Pt2+ and a flanking methionine. Notably, this stable coordinate tether can be loosened by disrupting the structural integrity of drug-binding pocket, suggesting that Pt2+ coordination chemistry may allow for the development of potent inhibitors with protein conformation-dependent reversibility. This approach may be exploited to achieve isoform-specific targeting of human Top2s.", "pmid": "28977631", "title": "Producing irreversible topoisomerase II-mediated DNA breaks by site-specific Pt(II)-methionine coordination chemistry."}, {"journal": "Journal of invertebrate pathology", "meshMajor": ["Animals", "Aphanomyces", "Astacoidea", "Bosnia and Herzegovina", "Host-Pathogen Interactions", "Infections"], "year": "2017", "abstractText": "Although the introduction of the crayfish plague pathogen Aphanomyces astaci to Europe is responsible for substantial declines in native crayfish populations throughout the whole continent, its presence has never been officially confirmed in many European regions, including most of the Balkan Peninsula. We demonstrate that the recent crayfish mortality observed in Bosnia and Herzegovina (Mostarsko blato karst field, Neretva river drainage) was caused by A. astaci. The causative strain is known only from European crayfish, indicating that A. 
astaci poses a threat to native species in this region, even in the absence of its main vectors, the North American crayfish.", "pmid": "28888767", "title": "Recent acute crayfish mortality reveals Aphanomyces astaci presence in Bosnia and Herzegovina."}, {"journal": "FEMS immunology and medical microbiology", "meshMajor": ["Animals", "Bacteremia", "Bacteria", "Bacterial Infections", "Biological Assay", "Bombyx", "Cell Wall", "Humans", "Larva", "Peptidoglycan", "Predictive Value of Tests", "Sensitivity and Specificity"], "year": "2000", "abstractText": "Silkworm larvae plasma (SLP) reagent, which is prepared from the body fluid of the silkworm, reacts with peptidoglycan (PG), a fragment of both the Gram-positive and Gram-negative bacterial cell wall, as well as with beta-glucan, a component of fungi. We developed a quantitative method for the detection of PG in human plasma from cases with bacterial infection using the SLP reagent. Tested in this way, the SLP method showed 86.2% sensitivity, 90.6% specificity, 89.3% positive predictive value, and 88.5% efficiency. 
The SLP method provides a valuable tool for the diagnosis of systemic infection using patients' blood.", "pmid": "10767607", "title": "Detection of peptidoglycan in human plasma using the silkworm larvae plasma test."}]}} \ No newline at end of file diff --git a/tags_to_augment.txt b/tags_to_augment.txt deleted file mode 100644 index 7d3e95ef..00000000 --- a/tags_to_augment.txt +++ /dev/null @@ -1 +0,0 @@ -Artificial Intelligence From 45013427f8d2e5f9b1bb76bef6eb059b61058c36 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 12 Sep 2023 12:05:17 +0000 Subject: [PATCH 058/102] Add openpyxl to support excel --- poetry.lock | 56 ++++++++++++++++++- pyproject.toml | 1 + .../create_xlinear_bertmesh_comparison_csv.py | 5 +- 3 files changed, 56 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index a98613d6..74125c70 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "accelerate" @@ -1356,6 +1356,17 @@ dev = ["celery-types (==0.15.0)", "flaky (==3.7.0)", "mkdocs (==1.3.1)", "mkdocs docs = ["mkdocs (==1.3.1)", "mkdocs-gen-files (==0.3.5)", "mkdocs-material (==8.4.1)", "mkdocs-section-index (==0.3.4)", "mkdocstrings-python (==0.7.1)"] tests = ["celery-types (==0.15.0)", "flaky (==3.7.0)", "mypy (==0.971)", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-celery", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.6)", "pytest-test-utils (>=0.0.6)"] +[[package]] +name = "et-xmlfile" +version = "1.1.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = false +python-versions = ">=3.6" +files = [ + {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, + {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, +] + [[package]] name = "exceptiongroup" version = "1.1.2" @@ -1907,6 +1918,16 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, + {file = 
"MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, + {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -2219,6 +2240,20 @@ aioprocessing = "*" openai = "*" tenacity = "*" +[[package]] +name = "openpyxl" +version = "3.1.2" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = false +python-versions = ">=3.6" +files = [ + {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"}, + {file = "openpyxl-3.1.2.tar.gz", hash = 
"sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "orjson" version = "3.9.2" @@ -2234,6 +2269,7 @@ files = [ {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a39c2529d75373b7167bf84c814ef9b8f3737a339c225ed6c0df40736df8748"}, {file = "orjson-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:84ebd6fdf138eb0eb4280045442331ee71c0aab5e16397ba6645f32f911bfb37"}, {file = "orjson-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a60a1cfcfe310547a1946506dd4f1ed0a7d5bd5b02c8697d9d5dcd8d2e9245e"}, + {file = "orjson-3.9.2-cp310-none-win32.whl", hash = "sha256:2ae61f5d544030a6379dbc23405df66fea0777c48a0216d2d83d3e08b69eb676"}, {file = "orjson-3.9.2-cp310-none-win_amd64.whl", hash = "sha256:c290c4f81e8fd0c1683638802c11610b2f722b540f8e5e858b6914b495cf90c8"}, {file = "orjson-3.9.2-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:02ef014f9a605e84b675060785e37ec9c0d2347a04f1307a9d6840ab8ecd6f55"}, {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:992af54265ada1c1579500d6594ed73fe333e726de70d64919cf37f93defdd06"}, @@ -2243,6 +2279,7 @@ files = [ {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275b5a18fd9ed60b2720543d3ddac170051c43d680e47d04ff5203d2c6d8ebf1"}, {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b9aea6dcb99fcbc9f6d1dd84fca92322fda261da7fb014514bb4689c7c2097a8"}, {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d74ae0e101d17c22ef67b741ba356ab896fc0fa64b301c2bf2bb0a4d874b190"}, + {file = "orjson-3.9.2-cp311-none-win32.whl", hash = "sha256:a9a7d618f99b2d67365f2b3a588686195cb6e16666cd5471da603a01315c17cc"}, {file = "orjson-3.9.2-cp311-none-win_amd64.whl", hash = 
"sha256:6320b28e7bdb58c3a3a5efffe04b9edad3318d82409e84670a9b24e8035a249d"}, {file = "orjson-3.9.2-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:368e9cc91ecb7ac21f2aa475e1901204110cf3e714e98649c2502227d248f947"}, {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58e9e70f0dcd6a802c35887f306b555ff7a214840aad7de24901fc8bd9cf5dde"}, @@ -2252,6 +2289,7 @@ files = [ {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e46e9c5b404bb9e41d5555762fd410d5466b7eb1ec170ad1b1609cbebe71df21"}, {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8170157288714678ffd64f5de33039e1164a73fd8b6be40a8a273f80093f5c4f"}, {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e3e2f087161947dafe8319ea2cfcb9cea4bb9d2172ecc60ac3c9738f72ef2909"}, + {file = "orjson-3.9.2-cp37-none-win32.whl", hash = "sha256:373b7b2ad11975d143556fdbd2c27e1150b535d2c07e0b48dc434211ce557fe6"}, {file = "orjson-3.9.2-cp37-none-win_amd64.whl", hash = "sha256:d7de3dbbe74109ae598692113cec327fd30c5a30ebca819b21dfa4052f7b08ef"}, {file = "orjson-3.9.2-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8cd4385c59bbc1433cad4a80aca65d2d9039646a9c57f8084897549b55913b17"}, {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a74036aab1a80c361039290cdbc51aa7adc7ea13f56e5ef94e9be536abd227bd"}, @@ -2261,6 +2299,7 @@ files = [ {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1882a70bb69595b9ec5aac0040a819e94d2833fe54901e2b32f5e734bc259a8b"}, {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:fc05e060d452145ab3c0b5420769e7356050ea311fc03cb9d79c481982917cca"}, {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:f8bc2c40d9bb26efefb10949d261a47ca196772c308babc538dd9f4b73e8d386"}, + {file = "orjson-3.9.2-cp38-none-win32.whl", hash = "sha256:302d80198d8d5b658065627da3a356cbe5efa082b89b303f162f030c622e0a17"}, {file = "orjson-3.9.2-cp38-none-win_amd64.whl", hash = "sha256:3164fc20a585ec30a9aff33ad5de3b20ce85702b2b2a456852c413e3f0d7ab09"}, {file = "orjson-3.9.2-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7a6ccadf788531595ed4728aa746bc271955448d2460ff0ef8e21eb3f2a281ba"}, {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3245d230370f571c945f69aab823c279a868dc877352817e22e551de155cb06c"}, @@ -2270,6 +2309,7 @@ files = [ {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03fb36f187a0c19ff38f6289418863df8b9b7880cdbe279e920bef3a09d8dab1"}, {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:20925d07a97c49c6305bff1635318d9fc1804aa4ccacb5fb0deb8a910e57d97a"}, {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eebfed53bec5674e981ebe8ed2cf00b3f7bcda62d634733ff779c264307ea505"}, + {file = "orjson-3.9.2-cp39-none-win32.whl", hash = "sha256:ba60f09d735f16593950c6adf033fbb526faa94d776925579a87b777db7d0838"}, {file = "orjson-3.9.2-cp39-none-win_amd64.whl", hash = "sha256:869b961df5fcedf6c79f4096119b35679b63272362e9b745e668f0391a892d39"}, {file = "orjson-3.9.2.tar.gz", hash = "sha256:24257c8f641979bf25ecd3e27251b5cc194cdd3a6e96004aac8446f5e63d9664"}, ] @@ -2323,8 +2363,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] python-dateutil = ">=2.8.1" pytz = ">=2020.1" @@ -2741,6 +2781,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", 
hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -2748,8 +2789,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -2766,6 +2814,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = 
"sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -2773,6 +2822,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -4189,4 +4239,4 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "cb53df7c0ffa68f5c40fe32b94d716f6ee3d944bb795e67d653389a0c0070d93" +content-hash = "b21685519eac62559ab3dc3149bf13c71f1a446b0fb6c41161b575300ef042de" diff --git a/pyproject.toml b/pyproject.toml index edcc4bf9..7b044dc8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ loguru = "^0.7.0" wandb = "^0.15.4" openai = "0.27.8" openai-multi-client = "^0.1.1" +openpyxl = "^3.1.2" [tool.poetry.group.dev] diff --git a/scripts/create_xlinear_bertmesh_comparison_csv.py 
b/scripts/create_xlinear_bertmesh_comparison_csv.py index b51b878e..cca0c650 100644 --- a/scripts/create_xlinear_bertmesh_comparison_csv.py +++ b/scripts/create_xlinear_bertmesh_comparison_csv.py @@ -128,8 +128,8 @@ def create_comparison_csv( # Add active portfolio active_grants = pd.read_csv(active_portfolio_path) active_grants = active_grants[~active_grants["Synopsis"].isna()] - active_grants.sample(frac=1) - active_grants_sample = active_grants.iloc[:active_portfolio_sample] + active_grants.drop_duplicates(subset="Synopsis", inplace=True) + active_grants_sample = active_grants.sample(n=active_portfolio_sample) active_grants_sample = pd.DataFrame( { "abstract": active_grants_sample["Synopsis"], @@ -137,7 +137,6 @@ def create_comparison_csv( } ) active_grants_sample["active_portfolio"] = 1 - active_grants.drop_duplicates(subset="abstract", inplace=True) grants_sample = pd.concat([grants_sample, active_grants_sample]) abstracts = grants_sample["abstract"].tolist() From 7b78843344eb6f058590614181adfd5ca81affdb Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 12 Sep 2023 12:31:10 +0000 Subject: [PATCH 059/102] Run dvc --- data/grants_comparison/.gitignore | 1 + pipelines/generate_grants/dvc.lock | 12 ++++++------ pipelines/generate_grants/dvc.yaml | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/data/grants_comparison/.gitignore b/data/grants_comparison/.gitignore index 1fd228cc..cd00363b 100644 --- a/data/grants_comparison/.gitignore +++ b/data/grants_comparison/.gitignore @@ -1,2 +1,3 @@ /meshterms_list.txt /comparison.csv +/comparison.xlsx diff --git a/pipelines/generate_grants/dvc.lock b/pipelines/generate_grants/dvc.lock index 8c6a5d0c..d2b12d87 100644 --- a/pipelines/generate_grants/dvc.lock +++ b/pipelines/generate_grants/dvc.lock @@ -7,12 +7,12 @@ stages: --active-portfolio-path data/raw/active_grants_last_5_years.csv --bertmesh-path Wellcome/WellcomeBertMesh --bertmesh-thresh 0.5 --pre-annotate-bertmesh --xlinear-path 
models/xlinear-0.2.5/model --xlinear-label-binarizer-path models/xlinear-0.2.5/label_binarizer.pkl - --xlinear-thresh 0.2 --pre-annotate-xlinear --output-path data/grants_comparison/comparison.csv + --xlinear-thresh 0.2 --pre-annotate-xlinear --output-path data/grants_comparison/comparison.xlsx deps: - path: scripts/create_xlinear_bertmesh_comparison_csv.py - md5: 0a91bf23be4068bdc7c4b7a32d80ff2d - size: 8214 + md5: a8ced1e8851e43f1902ba0d9dbf98781 + size: 8350 outs: - - path: data/grants_comparison/comparison.csv - md5: bc4fd9f4a670409dad07ffd03cf421f1 - size: 596654 + - path: data/grants_comparison/comparison.xlsx + md5: bd9ecabc26224fab96816fc7a6bd8be8 + size: 195402 diff --git a/pipelines/generate_grants/dvc.yaml b/pipelines/generate_grants/dvc.yaml index 18ac053c..1a30a4ec 100644 --- a/pipelines/generate_grants/dvc.yaml +++ b/pipelines/generate_grants/dvc.yaml @@ -17,7 +17,7 @@ stages: --xlinear-label-binarizer-path models/xlinear-0.2.5/label_binarizer.pkl --xlinear-thresh 0.2 --pre-annotate-xlinear - --output-path data/grants_comparison/comparison.csv + --output-path data/grants_comparison/comparison.xlsx deps: - scripts/create_xlinear_bertmesh_comparison_csv.py wdir: "../.." From 22fad690d30743dbf00227b4882b37a63c542f3b Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Tue, 12 Sep 2023 17:07:52 +0100 Subject: [PATCH 060/102] Total refactor: XLinear --- examples/retag.sh | 4 +- grants_tagger_light/models/xlinear/model.py | 13 +- grants_tagger_light/retagging/retagging.py | 237 ++++------- poetry.lock | 425 +------------------- pyproject.toml | 1 - 5 files changed, 97 insertions(+), 583 deletions(-) diff --git a/examples/retag.sh b/examples/retag.sh index 651f02b6..97257f97 100644 --- a/examples/retag.sh +++ b/examples/retag.sh @@ -1,3 +1,5 @@ +# run in c5.9xlarge with at least 72GB of RAM grants-tagger retag mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FILE_HERE] \ --tags "Artificial Intelligence,HIV" \ - --years 2017,2018,2019,2020,2021 \ No newline at end of file + --years 2016,2017,2018,2019,2020,2021 \ + --supervised \ No newline at end of file diff --git a/grants_tagger_light/models/xlinear/model.py b/grants_tagger_light/models/xlinear/model.py index 674bb3ab..312e0e10 100644 --- a/grants_tagger_light/models/xlinear/model.py +++ b/grants_tagger_light/models/xlinear/model.py @@ -68,6 +68,13 @@ def __init__( # Those are MeshXLinear params self.threshold = threshold + self.model_path = None + self.xlinear_model_ = None + self.vectorizer_ = None + + self.label_binarizer_path = label_binarizer_path + self.label_binarizer_ = None + if label_binarizer_path is not None: self.load_label_binarizer(label_binarizer_path) @@ -167,7 +174,6 @@ def predict_tags( """ X: list or numpy array of texts model_path: path to trained model - label_binarizer_path: path to trained label_binarizer probabilities: bool, default False. When true probabilities are returned along with tags threshold: float, default 0.5. Probability threshold to be used to assign tags. 
@@ -217,6 +223,9 @@ def load(self, model_path, is_predict_only=True): with open(params_path, "r") as f: self.__dict__.update(json.load(f)) + self.load_label_binarizer(self.label_binarizer_path) + self.model_path = model_path + if self.vectorizer_library == "sklearn": self.vectorizer_ = load_pickle(vectorizer_path) else: @@ -229,6 +238,8 @@ def load(self, model_path, is_predict_only=True): model_path, is_predict_only=is_predict_only ) + return self + def load_label_binarizer(self, label_binarizer_path): with open(label_binarizer_path, "rb") as f: self.label_binarizer_ = pickle.loads(f.read()) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index b8eb434d..4c3fce14 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -6,17 +6,19 @@ import typer from loguru import logger -from datasets import Dataset, load_dataset, concatenate_datasets -from johnsnowlabs import nlp +from datasets import Dataset, load_dataset, concatenate_datasets, load_from_disk import os -from sklearn.metrics import classification_report -import pyarrow.parquet as pq +from sklearn import preprocessing +from grants_tagger_light.models.xlinear import MeshXLinear from grants_tagger_light.utils.years_tags_parser import parse_years, parse_tags +import scipy +import pickle as pkl import numpy as np +import tqdm retag_app = typer.Typer() @@ -36,94 +38,22 @@ def _load_data(dset: Dataset, tag, limit=100, split=0.8): return train_dset, test_dset -def _create_pipelines(save_to_path, batch_size, train_df, test_df, tag, spark): - """ - This method creates a Spark pipeline (to run on dataframes) - Args: - save_to_path: path where to save the final results. - batch_size: max size of the batch to train. Since data is small for training, I limit it to 8. 
- train_df: Spark Dataframe of the train data - test_df: Spark Dataframe of the test data - spark: the Spark Object - - Returns: - a tuple of (pipeline, lightpipeline) - """ - document_assembler = nlp.DocumentAssembler() \ - .setInputCol("abstractText") \ - .setOutputCol("document") - - # Biobert Sentence Embeddings (clinical) - embeddings = nlp.BertSentenceEmbeddings.pretrained("sent_biobert_clinical_base_cased", "en") \ - .setInputCols(["document"]) \ - .setOutputCol("sentence_embeddings") - - retrain = True - clf_dir = f"{save_to_path}.{tag.replace(' ', '')}_clf" - if os.path.isdir(clf_dir): - answer = input("Classifier already trained. Do you want to reuse it? [y|n]: ") - while answer not in ['y', 'n']: - answer = input("Classifier already trained. Do you want to reuse it? [y|n]: ") - if answer == 'y': - retrain = False - - if retrain: - # I'm limiting the batch size to 8 since there are not many examples and big batch sizes will decrease accuracy - classifierdl = nlp.ClassifierDLApproach() \ - .setInputCols(["sentence_embeddings"]) \ - .setOutputCol("label") \ - .setLabelColumn("featured_tag") \ - .setMaxEpochs(25) \ - .setLr(0.001) \ - .setBatchSize(max(batch_size, 8)) \ - .setEnableOutputLogs(True) - # .setOutputLogsPath('logs') - - clf_pipeline = nlp.Pipeline(stages=[document_assembler, - embeddings, - classifierdl]) - - fit_clf_pipeline = clf_pipeline.fit(train_df) - preds = fit_clf_pipeline.transform(test_df) - preds_df = preds.select('featured_tag', 'abstractText', 'label.result').toPandas() - preds_df['result'] = preds_df['result'].apply(lambda x: x[0]) - logging.info(classification_report(preds_df['featured_tag'], preds_df['result'])) - - logging.info("- Loading the model for prediction...") - fit_clf_pipeline.stages[-1].write().overwrite().save(clf_dir) - - fit_clf_model = nlp.ClassifierDLModel.load(clf_dir) - - pred_pipeline = nlp.Pipeline(stages=[document_assembler, - embeddings, - fit_clf_model]) - pred_df = 
spark.createDataFrame([['']]).toDF("text") - fit_pred_pipeline = pred_pipeline.fit(pred_df) - - return fit_pred_pipeline - - -def _annotate(save_to_path, dset, tag, limit, is_positive): - human_supervision = {} - curation_file = f"{save_to_path}.{tag.replace(' ', '')}.curation.json" +def _annotate(curation_file, dset, tag, limit, is_positive): + field = 'positive' if is_positive else 'negative' + human_supervision = {tag: {'positive': [], 'negative': []}} if os.path.isfile(curation_file): - with open(curation_file, 'r') as f: - human_supervision = json.load(f) prompt = f"File `{curation_file}` found. Do you want to reuse previous work? [y|n]: " answer = input(prompt) while answer not in ['y', 'n']: answer = input(prompt) - if answer == 'n': - human_supervision[tag][is_positive] = [] - - if tag not in human_supervision: - human_supervision[tag] = {'positive': [], 'negative': []} + if answer == 'y': + with open(curation_file, 'r') as f: + human_supervision = json.load(f) - field = 'positive' if is_positive else 'negative' count = len(human_supervision[tag][field]) logging.info(f"[{tag}] Annotated: {count} Required: {limit} Available: {len(dset) - count}") finished = False - while count <= limit: + while count < limit: tries = 0 random.seed(time.time()) random_pos_row = random.randint(0, len(dset)) @@ -148,7 +78,7 @@ def _annotate(save_to_path, dset, tag, limit, is_positive): human_supervision[tag][field].append(dset[random_pos_row]) with open(curation_file, 'w') as f: json.dump(human_supervision, f) - count = len(human_supervision[tag][field]) + count = len(human_supervision[tag]) def _curate(save_to_path, pos_dset, neg_dset, tag, limit): @@ -162,9 +92,8 @@ def _curate(save_to_path, pos_dset, neg_dset, tag, limit): def retag( data_path: str, save_to_path: str, - spark_memory: int = 27, num_proc: int = os.cpu_count(), - batch_size: int = 64, + batch_size: int = 1024, tags: list = None, tags_file_path: str = None, threshold: float = 0.8, @@ -173,11 +102,6 @@ def retag( 
years: list = None, ): - spark = nlp.start(spark_conf={ - 'spark.driver.memory': f'{spark_memory}g', - 'spark.executor.memory': f'{spark_memory}g', - }) - # We only have 1 file, so no sharding is available https://huggingface.co/docs/datasets/loading#multiprocessing logging.info("Loading the MeSH jsonl...") dset = load_dataset("json", data_files=data_path, num_proc=1) @@ -194,18 +118,19 @@ def retag( with open(tags_file_path, 'r') as f: tags = [x.strip() for x in f.readlines()] - logging.info(f"Total tags detected: {tags}") + logging.info(f"- Total tags detected: {tags}.") + logging.info("- Training classifiers (retaggers)") for tag in tags: - logging.info(f"Retagging: {tag}") - + os.makedirs(os.path.join(save_to_path, tag.replace(" ", "")), exist_ok=True) logging.info(f"- Obtaining positive examples for {tag}...") positive_dset = dset.filter( lambda x: tag in x["meshMajor"], num_proc=num_proc ) - if len(positive_dset['abstractText']) < 50: - logging.info(f"Skipping {tag}: low examples ({len(positive_dset['abstractText'])}. " + if len(positive_dset['abstractText']) < train_examples: + logging.info(f"Skipping {tag}: low examples ({len(positive_dset['abstractText'])} vs " + f"expected {train_examples}). 
" f"Check {save_to_path}.err for more information about skipped tags.") with open(f"{save_to_path}.err", 'a') as f: f.write(tag) @@ -216,80 +141,85 @@ def retag( lambda x: tag not in x["meshMajor"], num_proc=num_proc ) + curation_file = os.path.join(save_to_path, tag.replace(' ', ''), "curation") if supervised: - logging.info(f"- Curating data...") - _curate(save_to_path, positive_dset, negative_dset, tag, train_examples) + logging.info(f"- Curating {tag}...") + _curate(curation_file, positive_dset, negative_dset, tag, train_examples) + else: + with open(curation_file, 'w') as f: + json.dump({tag: {'positive': [positive_dset[i] for i in range(train_examples)], + 'negative': [negative_dset[i] for i in range(train_examples)] + } + }, f) + + logging.info("- Retagging...") - curation_file = f"{save_to_path}.{tag.replace(' ', '')}.curation.json" - if os.path.isfile(curation_file): - with open(curation_file, "r") as fr: - # I load the curated data file - human_supervision = json.load(fr) - positive_dset = Dataset.from_list(human_supervision[tag]['positive']) - negative_dset = Dataset.from_list(human_supervision[tag]['negative']) + models = {} + for tag in tags: + curation_file = os.path.join(save_to_path, tag.replace(' ', ''), "curation") + if not os.path.isfile(curation_file): + logger.info(f"Skipping `{tag}` retagging as no curation data was found. " + f"Maybe there were too little examples? 
(check {save_to_path}.err)") + continue + with open(curation_file, "r") as fr: + data = json.load(fr) + positive_dset = Dataset.from_list(data[tag]['positive']) + negative_dset = Dataset.from_list(data[tag]['negative']) pos_x_train, pos_x_test = _load_data(positive_dset, tag, limit=train_examples, split=0.8) neg_x_train, neg_x_test = _load_data(negative_dset, "other", limit=train_examples, split=0.8) - pos_x_train = pos_x_train.add_column("featured_tag", [tag] * len(pos_x_train)) - pos_x_test = pos_x_test.add_column("featured_tag", [tag] * len(pos_x_test)) - neg_x_train = neg_x_train.add_column("featured_tag", ["other"] * len(neg_x_train)) - neg_x_test = neg_x_test.add_column("featured_tag", ["other"] * len(neg_x_test)) + pos_x_train = pos_x_train.add_column("tag", [tag] * len(pos_x_train)) + pos_x_test = pos_x_test.add_column("tag", [tag] * len(pos_x_test)) + neg_x_train = neg_x_train.add_column("tag", ["other"] * len(neg_x_train)) + neg_x_test = neg_x_test.add_column("tag", ["other"] * len(neg_x_test)) logging.info(f"- Creating train/test sets...") train = concatenate_datasets([pos_x_train, neg_x_train]) - train_df = spark.createDataFrame(train) test = concatenate_datasets([pos_x_test, neg_x_test]) - test_df = spark.createDataFrame(test) - - logging.info(f"- Train dataset size: {train_df.count()}") - logging.info(f"- Test dataset size: {test_df.count()}") - - logging.info(f"- Creating `sparknlp` pipelines...") - pipeline = _create_pipelines(save_to_path, batch_size, train_df, test_df, tag, spark) - - logging.info(f"- Optimizing dataframe...") - data_in_parquet = f"{save_to_path}.data.parquet" - optimize=True - if os.path.isfile(data_in_parquet): - answer = input("Optimized dataframe found. Do you want to use it? [y|n]: ") - while answer not in ['y', 'n']: - answer = input("Optimized dataframe found. Do you want to use it? 
[y|n]: ") - if answer == 'y': - optimize = False - if optimize: - dset = dset.remove_columns(["title", "journal", "year"]) - - pq.write_table(dset.data.table, data_in_parquet) - del dset, train, train_df, test, test_df, pos_x_train, pos_x_test, neg_x_train, neg_x_test, positive_dset,\ - negative_dset - sdf = spark.read.load(data_in_parquet) - - logging.info(f"- Repartitioning...") - sdf = sdf.repartition(num_proc) - - logging.info(f"- Retagging {tag}...") - pipeline.transform(sdf).write.mode('overwrite').save(f"{save_to_path}.{tag.replace(' ', '')}.prediction") - - # 1) We load - # 2) We filter to get those results where the predicted tag was not initially in meshMajor - # 3) We filter by confidence > threshold - # predictions = spark.read.load(f"{save_to_path}.{tag}.prediction").\ - # filter(~array_contains(col('meshMajor'), tag)).\ + label_binarizer = preprocessing.LabelBinarizer() + label_binarizer_path = os.path.join(save_to_path, tag.replace(" ", ""), 'labelbinarizer') + labels = [1 if x == tag else 0 for x in train["tag"]] + label_binarizer.fit(labels) + with open(label_binarizer_path, 'wb') as f: + pkl.dump(label_binarizer, f) + + model = MeshXLinear(label_binarizer_path=label_binarizer_path) + model.fit(train["abstractText"], scipy.sparse.csr_matrix(label_binarizer.transform(labels))) + models[tag] = model + model_path = os.path.join(save_to_path, tag.replace(" ", ""), "clf") + os.makedirs(model_path, exist_ok=True) + model.save(model_path) + + logging.info("- Predicting all tags") + for b in tqdm.tqdm(range(int(len(dset) / batch_size))): + start = b * batch_size + end = min(len(dset), (b+1) * batch_size) + batch = dset[start:end]["abstractText"] + for tag in tags: + if tag not in models: + logger.info(f"Skipping {tag} - classifier not trained. 
Maybe there were little data?") + continue + models[tag](batch, threshold=threshold) @retag_app.command() def retag_cli( - data_path: str = typer.Argument(..., help="Path to mesh.jsonl"), + data_path: str = typer.Argument( + ..., + help="Path to allMeSH_2021.jsonl"), save_to_path: str = typer.Argument( - ..., help="Path where to save the retagged data" + ..., + help="Path where to save the retagged data" ), num_proc: int = typer.Option( - os.cpu_count(), help="Number of processes to use for data augmentation" + os.cpu_count(), + help="Number of processes to use for data augmentation" ), batch_size: int = typer.Option( - 64, help="Preprocessing batch size (for dataset, filter, map, ...)" + 1024, + help="Preprocessing batch size (for dataset, filter, map, ...)" ), tags: str = typer.Option( None, @@ -309,15 +239,11 @@ def retag_cli( help="Number of examples to use for training the retaggers" ), supervised: bool = typer.Option( - True, + False, help="Use human curation, showing a `limit` amount of positive and negative examples to curate data" " for training the retaggers. The user will be required to accept or reject. When the limit is reached," " the model will be train. All intermediary steps will be saved." ), - spark_memory: int = typer.Option( - 20, - help="Gigabytes of memory to be used. Recommended at least 20 to run on MeSH." 
- ), years: str = typer.Option( None, help="Comma-separated years you want to include in the retagging process" ), @@ -345,7 +271,6 @@ def retag_cli( retag( data_path, save_to_path, - spark_memory=spark_memory, num_proc=num_proc, batch_size=batch_size, tags=parse_tags(tags), diff --git a/poetry.lock b/poetry.lock index 41a57aa1..708dc739 100644 --- a/poetry.lock +++ b/poetry.lock @@ -267,17 +267,6 @@ files = [ {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, ] -[[package]] -name = "appnope" -version = "0.1.3" -description = "Disable App Nap on macOS >= 10.9" -optional = false -python-versions = "*" -files = [ - {file = "appnope-0.1.3-py2.py3-none-any.whl", hash = "sha256:265a455292d0bd8a72453494fa24df5a11eb18373a60c7c0430889f22548605e"}, - {file = "appnope-0.1.3.tar.gz", hash = "sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24"}, -] - [[package]] name = "argilla" version = "1.10.0" @@ -309,23 +298,6 @@ postgresql = ["psycopg2 (>=2.9.5,<2.10.0)", "psycopg2-binary (>=2.9.5,<2.10.0)"] server = ["PyYAML (>=5.4.1,<6.1.0)", "SQLAlchemy (>=2.0.0,<2.1.0)", "aiofiles (>=0.6,<22.2)", "alembic (>=1.9.0,<1.10.0)", "brotli-asgi (>=1.1,<1.3)", "elasticsearch8[async] (>=8.7.0,<8.8.0)", "fastapi (>=0.75,<0.89)", "luqum (>=0.11,<0.13)", "opensearch-py (>=2.0.0,<2.1.0)", "passlib[bcrypt] (>=1.7.4,<1.8.0)", "psutil (>=5.8,<5.10)", "python-jose[cryptography] (>=3.2,<3.4)", "python-multipart (>=0.0.5,<0.1.0)", "scikit-learn (>=0.24.2)", "segment-analytics-python (==2.2.0)", "smart-open", "uvicorn[standard] (>=0.15.0,<0.21.0)"] tests = ["cleanlab (>=2.0.0,<2.1.0)", "datasets (>1.17.0,!=2.3.2)", "evaluate", "factory-boy (>=3.2.1,<3.3.0)", "faiss-cpu", "flair (>=0.12.2)", "flyingsquid", "huggingface-hub (>=0.5.0,<0.13)", "openai", "pgmpy", "plotly (>=4.1.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-mock", "rich (==13.0.1)", "seqeval", "setfit", "snorkel (>=0.9.7)", "spacy (==3.5.0)", 
"span-marker", "transformers[torch] (>=4.19.0)"] -[[package]] -name = "asttokens" -version = "2.4.0" -description = "Annotate AST trees with source code positions" -optional = false -python-versions = "*" -files = [ - {file = "asttokens-2.4.0-py2.py3-none-any.whl", hash = "sha256:cf8fc9e61a86461aa9fb161a14a0841a03c405fa829ac6b202670b3495d2ce69"}, - {file = "asttokens-2.4.0.tar.gz", hash = "sha256:2e0171b991b2c959acc6c49318049236844a5da1d65ba2672c4880c1c894834e"}, -] - -[package.dependencies] -six = ">=1.12.0" - -[package.extras] -test = ["astroid", "pytest"] - [[package]] name = "async-timeout" version = "4.0.3" @@ -426,17 +398,6 @@ redshift = ["redshift-connector (>=2.0.0,<3.0.0)"] sparql = ["SPARQLWrapper (>=2.0.0,<3.0.0)", "requests (>=2.0.0,<3.0.0)"] sqlserver = ["pyodbc (>=4.0.0,<5.0.0)"] -[[package]] -name = "backcall" -version = "0.2.0" -description = "Specifications for callback functions passed in to an API" -optional = false -python-versions = "*" -files = [ - {file = "backcall-0.2.0-py2.py3-none-any.whl", hash = "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255"}, - {file = "backcall-0.2.0.tar.gz", hash = "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e"}, -] - [[package]] name = "backoff" version = "2.2.1" @@ -1059,51 +1020,6 @@ files = [ {file = "cymem-2.0.7.tar.gz", hash = "sha256:e6034badb5dd4e10344211c81f16505a55553a7164adc314c75bd80cf07e57a8"}, ] -[[package]] -name = "databricks-api" -version = "0.9.0" -description = "Databricks API client auto-generated from the official databricks-cli package" -optional = false -python-versions = ">=3.6,<4.0" -files = [ - {file = "databricks_api-0.9.0-py3-none-any.whl", hash = "sha256:51327fc1a06d9f4125a7a74d6764c3f1e99b6fb8f4b7f7cc178679b2c0d8ae5b"}, - {file = "databricks_api-0.9.0.tar.gz", hash = "sha256:40db26831ae37d2659d2700f4cb253615d895b6d440b99fb995aed51e67928f0"}, -] - -[package.dependencies] -databricks-cli = "*" - -[[package]] -name = "databricks-cli" 
-version = "0.17.7" -description = "A command line interface for Databricks" -optional = false -python-versions = "*" -files = [ - {file = "databricks-cli-0.17.7.tar.gz", hash = "sha256:5a545063449f3b9ad904644c0f251058485e29e564dedf8d4e4a7b45caf9549b"}, - {file = "databricks_cli-0.17.7-py2-none-any.whl", hash = "sha256:5b025943c70bbd374415264d38bfaddfb34ce070fadb083d851aec311e0f8901"}, -] - -[package.dependencies] -click = ">=7.0" -oauthlib = ">=3.1.0" -pyjwt = ">=1.7.0" -requests = ">=2.17.3" -six = ">=1.10.0" -tabulate = ">=0.7.7" -urllib3 = ">=1.26.7,<2.0.0" - -[[package]] -name = "dataclasses" -version = "0.6" -description = "A backport of the dataclasses module for Python 3.6" -optional = false -python-versions = "*" -files = [ - {file = "dataclasses-0.6-py3-none-any.whl", hash = "sha256:454a69d788c7fda44efd71e259be79577822f5e3f53f029a22d08004e951dc9f"}, - {file = "dataclasses-0.6.tar.gz", hash = "sha256:6988bd2b895eef432d562370bb707d540f32f7360ab13da45340101bc2307d84"}, -] - [[package]] name = "datasets" version = "2.13.1" @@ -1146,17 +1062,6 @@ tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0,<2.44.0)", "elast torch = ["torch"] vision = ["Pillow (>=6.2.1)"] -[[package]] -name = "decorator" -version = "5.1.1" -description = "Decorators for Humans" -optional = false -python-versions = ">=3.5" -files = [ - {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, - {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, -] - [[package]] name = "deprecated" version = "1.2.14" @@ -1569,20 +1474,6 @@ files = [ [package.extras] test = ["pytest (>=6)"] -[[package]] -name = "executing" -version = "1.2.0" -description = "Get the currently executing AST node of a frame, and other information" -optional = false -python-versions = "*" -files = [ - {file = "executing-1.2.0-py2.py3-none-any.whl", hash = 
"sha256:0314a69e37426e3608aada02473b4161d4caf5a4b244d1d0c48072b8fee7bacc"}, - {file = "executing-1.2.0.tar.gz", hash = "sha256:19da64c18d2d851112f09c287f8d3dbbdf725ab0e569077efb6cdcbd3497c107"}, -] - -[package.extras] -tests = ["asttokens", "littleutils", "pytest", "rich"] - [[package]] name = "filelock" version = "3.12.3" @@ -1935,45 +1826,6 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] -[[package]] -name = "ipython" -version = "8.15.0" -description = "IPython: Productive Interactive Computing" -optional = false -python-versions = ">=3.9" -files = [ - {file = "ipython-8.15.0-py3-none-any.whl", hash = "sha256:45a2c3a529296870a97b7de34eda4a31bee16bc7bf954e07d39abe49caf8f887"}, - {file = "ipython-8.15.0.tar.gz", hash = "sha256:2baeb5be6949eeebf532150f81746f8333e2ccce02de1c7eedde3f23ed5e9f1e"}, -] - -[package.dependencies] -appnope = {version = "*", markers = "sys_platform == \"darwin\""} -backcall = "*" -colorama = {version = "*", markers = "sys_platform == \"win32\""} -decorator = "*" -exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} -jedi = ">=0.16" -matplotlib-inline = "*" -pexpect = {version = ">4.3", markers = "sys_platform != \"win32\""} -pickleshare = "*" -prompt-toolkit = ">=3.0.30,<3.0.37 || >3.0.37,<3.1.0" -pygments = ">=2.4.0" -stack-data = "*" -traitlets = ">=5" - -[package.extras] -all = ["black", "curio", "docrepr", "exceptiongroup", "ipykernel", "ipyparallel", "ipywidgets", "matplotlib", "matplotlib (!=3.2.0)", "nbconvert", "nbformat", "notebook", "numpy (>=1.21)", "pandas", "pytest (<7)", "pytest (<7.1)", "pytest-asyncio", "qtconsole", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "trio", "typing-extensions"] -black = ["black"] -doc = ["docrepr", "exceptiongroup", "ipykernel", "matplotlib", "pytest (<7)", "pytest (<7.1)", "pytest-asyncio", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", 
"stack-data", "testpath", "typing-extensions"] -kernel = ["ipykernel"] -nbconvert = ["nbconvert"] -nbformat = ["nbformat"] -notebook = ["ipywidgets", "notebook"] -parallel = ["ipyparallel"] -qtconsole = ["qtconsole"] -test = ["pytest (<7.1)", "pytest-asyncio", "testpath"] -test-extra = ["curio", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.21)", "pandas", "pytest (<7.1)", "pytest-asyncio", "testpath", "trio"] - [[package]] name = "isort" version = "5.12.0" @@ -2012,25 +1864,6 @@ requests = "*" dev = ["mypy (==0.971)", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)", "types-requests"] tests = ["mypy (==0.971)", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)", "types-requests"] -[[package]] -name = "jedi" -version = "0.19.0" -description = "An autocompletion tool for Python that can be used for text editors." -optional = false -python-versions = ">=3.6" -files = [ - {file = "jedi-0.19.0-py2.py3-none-any.whl", hash = "sha256:cb8ce23fbccff0025e9386b5cf85e892f94c9b822378f8da49970471335ac64e"}, - {file = "jedi-0.19.0.tar.gz", hash = "sha256:bcf9894f1753969cbac8022a8c2eaee06bfa3724e4192470aaffe7eb6272b0c4"}, -] - -[package.dependencies] -parso = ">=0.8.3,<0.9.0" - -[package.extras] -docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx (==1.8.5)", "sphinx-rtd-theme (==0.4.3)", "sphinxcontrib-serializinghtml (==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] -qa = ["flake8 (==5.0.4)", "mypy (==0.971)", 
"types-setuptools (==67.2.0.1)"] -testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] - [[package]] name = "jinja2" version = "3.1.2" @@ -2070,29 +1903,6 @@ files = [ {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, ] -[[package]] -name = "johnsnowlabs" -version = "5.0.8" -description = "The John Snow Labs Library gives you access to all of John Snow Labs Enterprise And Open Source products in an easy and simple manner. Access 10000+ state-of-the-art NLP and OCR models for Finance, Legal and Medical domains. Easily scalable to Spark Cluster" -optional = false -python-versions = "*" -files = [ - {file = "johnsnowlabs-5.0.8-py3-none-any.whl", hash = "sha256:a00f6c44684735716106d82ef420701f73085a3f9b1d9371060042a7d963ecd5"}, - {file = "johnsnowlabs-5.0.8.tar.gz", hash = "sha256:45edcf4a2d4dddc2718850f80dbb963bca80a3c31bba3646e165e84446c31bc6"}, -] - -[package.dependencies] -colorama = "*" -databricks-api = "*" -dataclasses = "*" -nlu = "5.0.1" -numpy = "*" -pydantic = "1.10.11" -pyspark = "3.1.2" -requests = "*" -spark-nlp = "5.0.2" -spark-nlp-display = "4.1" - [[package]] name = "kombu" version = "5.3.2" @@ -2251,20 +2061,6 @@ files = [ {file = "MarkupSafe-2.1.3.tar.gz", hash = "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad"}, ] -[[package]] -name = "matplotlib-inline" -version = "0.1.6" -description = "Inline Matplotlib backend for Jupyter" -optional = false -python-versions = ">=3.5" -files = [ - {file = "matplotlib-inline-0.1.6.tar.gz", hash = "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304"}, - {file = "matplotlib_inline-0.1.6-py3-none-any.whl", hash = "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311"}, -] - -[package.dependencies] -traitlets = "*" - [[package]] name = "monotonic" version = "1.6" @@ -2478,24 +2274,6 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx- 
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] -[[package]] -name = "nlu" -version = "5.0.1" -description = "John Snow Labs NLU provides state of the art algorithms for NLP&NLU with 20000+ of pretrained models in 200+ languages. It enables swift and simple development and research with its powerful Pythonic and Keras inspired API. It is powerd by John Snow Labs powerful Spark NLP library." -optional = false -python-versions = "*" -files = [ - {file = "nlu-5.0.1-py3-none-any.whl", hash = "sha256:fd8126e99109c61f3fc01dcbcf81fe671d8ecc16b4dc5db3731103152dea4612"}, - {file = "nlu-5.0.1.tar.gz", hash = "sha256:05bc7508ef284ec5be0642e188f5039a9383f4b109de59540add721781d046d2"}, -] - -[package.dependencies] -dataclasses = "*" -numpy = "*" -pandas = ">=1.3.5" -pyarrow = ">=0.16.0" -spark-nlp = ">=5.0.2" - [[package]] name = "nodeenv" version = "1.8.0" @@ -2547,22 +2325,6 @@ files = [ {file = "numpy-1.23.5.tar.gz", hash = "sha256:1b1766d6f397c18153d40015ddfc79ddb715cabadc04d2d228d4e5a8bc4ded1a"}, ] -[[package]] -name = "oauthlib" -version = "3.2.2" -description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" -optional = false -python-versions = ">=3.6" -files = [ - {file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"}, - {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, -] - -[package.extras] -rsa = ["cryptography (>=3.0.0)"] -signals = ["blinker (>=1.4.0)"] -signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] - [[package]] name = "omegaconf" version = "2.3.0" @@ -2743,21 +2505,6 @@ pytz = ">=2020.1" [package.extras] test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] -[[package]] -name = "parso" -version = "0.8.3" -description = "A Python Parser" -optional = 
false -python-versions = ">=3.6" -files = [ - {file = "parso-0.8.3-py2.py3-none-any.whl", hash = "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75"}, - {file = "parso-0.8.3.tar.gz", hash = "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0"}, -] - -[package.extras] -qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] -testing = ["docopt", "pytest (<6.0.0)"] - [[package]] name = "pathspec" version = "0.11.2" @@ -2801,31 +2548,6 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] s3 = ["boto3"] test = ["mock", "pytest", "pytest-coverage", "typer-cli"] -[[package]] -name = "pexpect" -version = "4.8.0" -description = "Pexpect allows easy control of interactive console applications." -optional = false -python-versions = "*" -files = [ - {file = "pexpect-4.8.0-py2.py3-none-any.whl", hash = "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937"}, - {file = "pexpect-4.8.0.tar.gz", hash = "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c"}, -] - -[package.dependencies] -ptyprocess = ">=0.5" - -[[package]] -name = "pickleshare" -version = "0.7.5" -description = "Tiny 'shelve'-like database with concurrency support" -optional = false -python-versions = "*" -files = [ - {file = "pickleshare-0.7.5-py2.py3-none-any.whl", hash = "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"}, - {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, -] - [[package]] name = "platformdirs" version = "3.10.0" @@ -2977,42 +2699,6 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] -[[package]] -name = "ptyprocess" -version = "0.7.0" -description = "Run a subprocess in a pseudo terminal" -optional = false -python-versions = "*" -files = [ - {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, - {file = 
"ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, -] - -[[package]] -name = "pure-eval" -version = "0.2.2" -description = "Safely evaluate AST nodes without side effects" -optional = false -python-versions = "*" -files = [ - {file = "pure_eval-0.2.2-py3-none-any.whl", hash = "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350"}, - {file = "pure_eval-0.2.2.tar.gz", hash = "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"}, -] - -[package.extras] -tests = ["pytest"] - -[[package]] -name = "py4j" -version = "0.10.9" -description = "Enables Python programs to dynamically access arbitrary Java objects" -optional = false -python-versions = "*" -files = [ - {file = "py4j-0.10.9-py2.py3-none-any.whl", hash = "sha256:859ba728a7bb43e9c2bf058832759fb97a598bb28cc12f34f5fc4abdec08ede6"}, - {file = "py4j-0.10.9.tar.gz", hash = "sha256:36ec57f43ff8ced260a18aa9a4e46c3500a730cac8860e259cbaa546c2b9db2f"}, -] - [[package]] name = "pyarrow" version = "13.0.0" @@ -3190,23 +2876,6 @@ files = [ {file = "pygtrie-2.5.0.tar.gz", hash = "sha256:203514ad826eb403dab1d2e2ddd034e0d1534bbe4dbe0213bb0593f66beba4e2"}, ] -[[package]] -name = "pyjwt" -version = "2.8.0" -description = "JSON Web Token implementation in Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"}, - {file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"}, -] - -[package.extras] -crypto = ["cryptography (>=3.4.0)"] -dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] -docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] -tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] - [[package]] name = 
"pyparsing" version = "3.1.1" @@ -3221,24 +2890,6 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] -[[package]] -name = "pyspark" -version = "3.1.2" -description = "Apache Spark Python API" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pyspark-3.1.2.tar.gz", hash = "sha256:5e25ebb18756e9715f4d26848cc7e558035025da74b4fc325a0ebc05ff538e65"}, -] - -[package.dependencies] -py4j = "0.10.9" - -[package.extras] -ml = ["numpy (>=1.7)"] -mllib = ["numpy (>=1.7)"] -sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] - [[package]] name = "pytest" version = "7.4.2" @@ -4129,35 +3780,6 @@ files = [ {file = "spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645"}, ] -[[package]] -name = "spark-nlp" -version = "5.0.2" -description = "John Snow Labs Spark NLP is a natural language processing library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines, that scale easily in a distributed environment." 
-optional = false -python-versions = "*" -files = [ - {file = "spark-nlp-5.0.2.tar.gz", hash = "sha256:690a9509bea5adddb55557539ca8fc1a8b949e73fb69499007829ae857284050"}, - {file = "spark_nlp-5.0.2-py2.py3-none-any.whl", hash = "sha256:898da78131364934dcaa715d8a763ec751e06b2d901a07fe5ca0c1a03d51ce47"}, -] - -[[package]] -name = "spark-nlp-display" -version = "4.1" -description = "Visualization package for Spark NLP" -optional = false -python-versions = ">=2.7" -files = [ - {file = "spark-nlp-display-4.1.tar.gz", hash = "sha256:2ef6a3db7702b0e2b455c150b3322eb5505896b57482f5f6aafd5c1e149ff6b6"}, - {file = "spark_nlp_display-4.1-py3-none-any.whl", hash = "sha256:5af5ae18b8669cb9b2b9bea577e44ad609297a68d6f6c2e3d9ff9f52e26e0440"}, -] - -[package.dependencies] -ipython = "*" -numpy = "*" -pandas = "*" -spark-nlp = "*" -svgwrite = "1.4" - [[package]] name = "sqltrie" version = "0.7.0" @@ -4218,36 +3840,6 @@ files = [ [package.dependencies] catalogue = ">=2.0.3,<2.1.0" -[[package]] -name = "stack-data" -version = "0.6.2" -description = "Extract data from python stack frames and tracebacks for informative displays" -optional = false -python-versions = "*" -files = [ - {file = "stack_data-0.6.2-py3-none-any.whl", hash = "sha256:cbb2a53eb64e5785878201a97ed7c7b94883f48b87bfb0bbe8b623c74679e4a8"}, - {file = "stack_data-0.6.2.tar.gz", hash = "sha256:32d2dd0376772d01b6cb9fc996f3c8b57a357089dec328ed4b6553d037eaf815"}, -] - -[package.dependencies] -asttokens = ">=2.1.0" -executing = ">=1.2.0" -pure-eval = "*" - -[package.extras] -tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] - -[[package]] -name = "svgwrite" -version = "1.4" -description = "A Python library to create SVG drawings." 
-optional = false -python-versions = ">=3.6" -files = [ - {file = "svgwrite-1.4-py3-none-any.whl", hash = "sha256:fa842fb3129a9399d19b5e9602a022fcc7f2f3f24713550e765c488ffafd743d"}, - {file = "svgwrite-1.4.zip", hash = "sha256:b38ac03b67f81c728d81a33e4711aaf3ab136a57156d721bb17f88525d9909bb"}, -] - [[package]] name = "sympy" version = "1.12" @@ -4505,21 +4097,6 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] -[[package]] -name = "traitlets" -version = "5.9.0" -description = "Traitlets Python configuration system" -optional = false -python-versions = ">=3.7" -files = [ - {file = "traitlets-5.9.0-py3-none-any.whl", hash = "sha256:9e6ec080259b9a5940c797d58b613b5e31441c2257b87c2e795c5228ae80d2d8"}, - {file = "traitlets-5.9.0.tar.gz", hash = "sha256:f6cde21a9c68cf756af02035f72d5a723bf607e862e7be33ece505abf4a3bad9"}, -] - -[package.extras] -docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] -test = ["argcomplete (>=2.0)", "pre-commit", "pytest", "pytest-mock"] - [[package]] name = "transformers" version = "4.29.2" @@ -5076,4 +4653,4 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "eb7bef16bff140a7569ebb217e8d0d8d8220a883340f533a0be071efe490b500" +content-hash = "c246890fd08f7d69ace373434b1d4cf0adc5bcb1f76177ee34d28ad12b839afa" diff --git a/pyproject.toml b/pyproject.toml index a9387b00..964f7a19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,6 @@ wandb = "^0.15.4" openai = "0.27.8" openai-multi-client = "^0.1.1" spacy = "^3.6.1" -johnsnowlabs = "^5.0.7" [tool.poetry.group.dev] From 2bc3e219fa4a4d43b896322d678a9185b0fed8df Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Wed, 13 Sep 2023 12:36:33 +0100 Subject: [PATCH 061/102] Total refactor finished for retagging: XLinear --- examples/retag.sh | 2 +- grants_tagger_light/retagging/retagging.py | 35 ++++++++++++++++------ 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/examples/retag.sh b/examples/retag.sh index 97257f97..b62980cf 100644 --- a/examples/retag.sh +++ b/examples/retag.sh @@ -1,5 +1,5 @@ -# run in c5.9xlarge with at least 72GB of RAM grants-tagger retag mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FILE_HERE] \ --tags "Artificial Intelligence,HIV" \ --years 2016,2017,2018,2019,2020,2021 \ + --batch-size 10000 \ --supervised \ No newline at end of file diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 4c3fce14..3ca81b1b 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -1,3 +1,4 @@ +import io import json import logging import random @@ -193,15 +194,31 @@ def retag( model.save(model_path) logging.info("- Predicting all tags") - for b in tqdm.tqdm(range(int(len(dset) / batch_size))): - start = b * batch_size - end = min(len(dset), (b+1) * batch_size) - batch = dset[start:end]["abstractText"] - for tag in tags: - if tag not in models: - logger.info(f"Skipping {tag} - classifier not trained. 
Maybe there were little data?") - continue - models[tag](batch, threshold=threshold) + dset = dset.add_column("changes", [[]] * len(dset)) + with open(os.path.join(save_to_path, 'corrections'), 'w') as f: + for b in tqdm.tqdm(range(int(len(dset) / batch_size))): + start = b * batch_size + end = min(len(dset), (b+1) * batch_size) + batch = dset.select([i for i in range(start, end)]) + batch_buffer = [x for x in batch] + for tag in models.keys(): + batch_preds = models[tag](batch["abstractText"], threshold=threshold) + for i, bp in enumerate(batch_preds): + is_predicted = bp == [0] + is_expected = tag in batch[i]['meshMajor'] + if is_predicted != is_expected: + if is_predicted: + batch_buffer[i]['meshMajor'].append(tag) + batch_buffer[i]['changes'].append(f"+{tag}") + else: + batch_buffer[i]['meshMajor'].remove(tag) + batch_buffer[i]['changes'].append(f"-{tag}") + # batch = Dataset.from_list(batch_buffer) + # buffer = io.BytesIO() + # batch.to_json(buffer) + # f.write(buffer.getvalue().decode('utf-8')) + batch_buffer = [json.dumps(x) for x in batch_buffer] + f.write('\n'.join(batch_buffer)) @retag_app.command() From f4b1f32b5b7aef8bd2b3bb309b9d61379a75afd0 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Wed, 13 Sep 2023 12:42:52 +0100 Subject: [PATCH 062/102] Black --- grants_tagger_light/augmentation/augment.py | 19 +- .../preprocessing/preprocess_mesh.py | 21 +-- .../retagging/cnn_gpu_config.cfg | 127 ------------- grants_tagger_light/retagging/config.cfg | 124 ------------- grants_tagger_light/retagging/retagging.py | 167 ++++++++++-------- 5 files changed, 102 insertions(+), 356 deletions(-) delete mode 100644 grants_tagger_light/retagging/cnn_gpu_config.cfg delete mode 100644 grants_tagger_light/retagging/config.cfg diff --git a/grants_tagger_light/augmentation/augment.py b/grants_tagger_light/augmentation/augment.py index e97ff46d..900f8817 100644 --- a/grants_tagger_light/augmentation/augment.py +++ b/grants_tagger_light/augmentation/augment.py @@ -156,33 +156,24 @@ def augment( @augment_app.command() def augment_cli( - data_path: str = typer.Argument( - ..., - help="Path to mesh.jsonl"), - save_to_path: str = typer.Argument( - ..., - help="Path to save the new jsonl data" - ), + data_path: str = typer.Argument(..., help="Path to mesh.jsonl"), + save_to_path: str = typer.Argument(..., help="Path to save the new jsonl data"), model_key: str = typer.Option( "gpt-3.5-turbo", help="LLM to use data augmentation. By now, only `openai` is supported", ), num_proc: int = typer.Option( - os.cpu_count(), - help="Number of processes to use for data augmentation" + os.cpu_count(), help="Number of processes to use for data augmentation" ), batch_size: int = typer.Option( - 64, - help="Preprocessing batch size (for dataset, filter, map, ...)" + 64, help="Preprocessing batch size (for dataset, filter, map, ...)" ), min_examples: int = typer.Option( None, help="Minimum number of examples to require. 
" "Less than that will trigger data augmentation.", ), - examples: int = typer.Option( - 25, - help="Examples to generate per each tag."), + examples: int = typer.Option(25, help="Examples to generate per each tag."), prompt_template: str = typer.Option( "grants_tagger_light/augmentation/prompt.template", help="File to use as a prompt. " diff --git a/grants_tagger_light/preprocessing/preprocess_mesh.py b/grants_tagger_light/preprocessing/preprocess_mesh.py index 586783cf..92a06eb8 100644 --- a/grants_tagger_light/preprocessing/preprocess_mesh.py +++ b/grants_tagger_light/preprocessing/preprocess_mesh.py @@ -117,7 +117,7 @@ def preprocess_mesh( batch_size=batch_size, num_proc=num_proc, desc="Tokenizing", - fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"} + fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"}, ) logger.info("Time taken to tokenize: {}".format(time.time() - t1)) @@ -225,13 +225,9 @@ def preprocess_mesh( @preprocess_app.command() def preprocess_mesh_cli( - data_path: str = typer.Argument( - ..., - help="Path to mesh.jsonl" - ), + data_path: str = typer.Argument(..., help="Path to mesh.jsonl"), save_to_path: str = typer.Argument( - ..., - help="Path to save the serialized PyArrow dataset after preprocessing" + ..., help="Path to save the serialized PyArrow dataset after preprocessing" ), model_key: str = typer.Argument( ..., @@ -239,21 +235,16 @@ def preprocess_mesh_cli( "Leave blank if training from scratch", # noqa ), test_size: float = typer.Option( - None, - help="Fraction of data to use for testing in (0,1] or number of rows" + None, help="Fraction of data to use for testing in (0,1] or number of rows" ), num_proc: int = typer.Option( - os.cpu_count(), - help="Number of processes to use for preprocessing" + os.cpu_count(), help="Number of processes to use for preprocessing" ), max_samples: int = typer.Option( -1, help="Maximum number of samples to use for preprocessing", ), - batch_size: int = typer.Option( - 256, - help="Size of 
the preprocessing batch" - ), + batch_size: int = typer.Option(256, help="Size of the preprocessing batch"), tags: str = typer.Option( None, help="Comma-separated tags you want to include in the dataset " diff --git a/grants_tagger_light/retagging/cnn_gpu_config.cfg b/grants_tagger_light/retagging/cnn_gpu_config.cfg deleted file mode 100644 index e71cf867..00000000 --- a/grants_tagger_light/retagging/cnn_gpu_config.cfg +++ /dev/null @@ -1,127 +0,0 @@ -[paths] -train = "" -dev = "" -raw = null -init_tok2vec = null -vectors = null - -[system] -seed = 42 -gpu_allocator = "pytorch" - -[nlp] -lang = "en" -pipeline = ["textcat"] -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} -disabled = [] -before_creation = null -after_creation = null -after_pipeline_creation = null -batch_size = 1000 - -[components] - -[components.textcat] -factory = "textcat_multilabel" -threshold = 0.5 - -[components.textcat.model] -@architectures = "spacy.TextCatCNN.v1" -exclusive_classes = false -nO = null - -[components.textcat.model.tok2vec] -@architectures = "spacy.Tok2Vec.v2" - -[components.textcat.model.tok2vec.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.textcat.model.tok2vec.encode:width} -rows = [10000,5000,5000,5000] -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -include_static_vectors = false - -[components.textcat.model.tok2vec.encode] -@architectures = "spacy.MaxoutWindowEncoder.v2" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[corpora] - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths:dev} -gold_preproc = ${corpora.train.gold_preproc} -max_length = 0 -limit = 0 -augmenter = null - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = ${paths:train} -gold_preproc = false -max_length = 0 -limit = 0 -augmenter = null - -[training] -train_corpus = "corpora.train" -dev_corpus = "corpora.dev" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -dropout = 0.2 -patience = 1600 -max_epochs = 0 -max_steps = 20000 
-eval_frequency = 200 -accumulate_gradient = 1 -frozen_components = [] -before_to_disk = null - -[training.batcher] -@batchers = "spacy.batch_by_sequence.v1" -size = 32 -get_length = null - -[training.logger] -@loggers = "spacy.ConsoleLogger.v1" -progress_bar = false - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -eps = 0.00000001 -learn_rate = 0.001 -use_averages = true - -[training.score_weights] -cats_score_desc = null -cats_micro_p = null -cats_micro_r = null -cats_micro_f = null -cats_macro_p = null -cats_macro_r = null -cats_macro_f = null -cats_macro_auc = null -cats_f_per_type = null -cats_macro_auc_per_type = null -cats_score = 1.0 - -[pretraining] - -[initialize] -vectors = ${paths.vectors} -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.tokenizer] \ No newline at end of file diff --git a/grants_tagger_light/retagging/config.cfg b/grants_tagger_light/retagging/config.cfg deleted file mode 100644 index a5fb381e..00000000 --- a/grants_tagger_light/retagging/config.cfg +++ /dev/null @@ -1,124 +0,0 @@ -[paths] -train = null -dev = null -vectors = null -init_tok2vec = null - -[system] -gpu_allocator = null -seed = 0 - -[nlp] -lang = "en" -pipeline = ["textcat"] -batch_size = 1000 -disabled = [] -before_creation = null -after_creation = null -after_pipeline_creation = null -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} - -[components] - -[components.textcat] -factory = "textcat" -scorer = {"@scorers":"spacy.textcat_scorer.v1"} -threshold = 0.5 - -[components.textcat.model] -@architectures = "spacy.TextCatBOW.v2" -exclusive_classes = true -ngram_size = 1 -no_output_layer = false -nO = null - -[corpora] - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -max_length = 0 -gold_preproc = false -limit = 0 -augmenter = null - -[corpora.train] -@readers = 
"spacy.Corpus.v1" -path = ${paths.train} -max_length = 0 -gold_preproc = false -limit = 0 -augmenter = null - -[training] -dev_corpus = "corpora.dev" -train_corpus = "corpora.train" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -# dropout = 0.1 -dropout = 0.0 -accumulate_gradient = 1 -# patience = 1600 -patience = 0 -max_epochs = 15 -# max_steps = 20000 -eval_frequency = 200 -frozen_components = [] -annotating_components = [] -before_to_disk = null - -[training.batcher] -@batchers = "spacy.batch_by_words.v1" -discard_oversize = false -tolerance = 0.2 -get_length = null - -[training.batcher.size] -@schedules = "compounding.v1" -start = 100 -stop = 1000 -compound = 1.001 -t = 0.0 - -[training.logger] -@loggers = "spacy.ConsoleLogger.v1" -progress_bar = false - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = false -eps = 0.00000001 -#learn_rate = 0.001 -learn_rate = 0.005 - -[training.score_weights] -cats_score = 1.0 -cats_score_desc = null -cats_micro_p = null -cats_micro_r = null -cats_micro_f = null -cats_macro_p = null -cats_macro_r = null -cats_macro_f = null -cats_macro_auc = null -cats_f_per_type = null -cats_macro_auc_per_type = null - -[pretraining] - -[initialize] -vectors = ${paths.vectors} -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.tokenizer] \ No newline at end of file diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 3ca81b1b..eb73f793 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -30,7 +30,7 @@ def _load_data(dset: Dataset, tag, limit=100, split=0.8): dset = dset.select([x for x in range(limit)]) # Not in parallel since the data is very small and it's worse to divide and conquer dset.map( - lambda x: 
{'featured_tag': tag}, + lambda x: {"featured_tag": tag}, desc=f"Adding featured tag ({tag})", ) train_size = int(split * min_limit) @@ -40,44 +40,52 @@ def _load_data(dset: Dataset, tag, limit=100, split=0.8): def _annotate(curation_file, dset, tag, limit, is_positive): - field = 'positive' if is_positive else 'negative' - human_supervision = {tag: {'positive': [], 'negative': []}} + field = "positive" if is_positive else "negative" + human_supervision = {tag: {"positive": [], "negative": []}} if os.path.isfile(curation_file): - prompt = f"File `{curation_file}` found. Do you want to reuse previous work? [y|n]: " + prompt = ( + f"File `{curation_file}` found. Do you want to reuse previous work? [y|n]: " + ) answer = input(prompt) - while answer not in ['y', 'n']: + while answer not in ["y", "n"]: answer = input(prompt) - if answer == 'y': - with open(curation_file, 'r') as f: + if answer == "y": + with open(curation_file, "r") as f: human_supervision = json.load(f) count = len(human_supervision[tag][field]) - logging.info(f"[{tag}] Annotated: {count} Required: {limit} Available: {len(dset) - count}") + logging.info( + f"[{tag}] Annotated: {count} Required: {limit} Available: {len(dset) - count}" + ) finished = False while count < limit: tries = 0 random.seed(time.time()) random_pos_row = random.randint(0, len(dset)) - id_ = dset[random_pos_row]['pmid'] - while id_ in [x['pmid'] for x in human_supervision[tag][field]]: + id_ = dset[random_pos_row]["pmid"] + while id_ in [x["pmid"] for x in human_supervision[tag][field]]: random_pos_row = random.randint(0, len(dset)) - id_ = dset[random_pos_row]['pmid'] + id_ = dset[random_pos_row]["pmid"] tries += 1 if tries >= 10: - logger.error(f"Unable to find more examples for {field} {tag} which are not already tagged. " - f"Continuing with {count} examples...") + logger.error( + f"Unable to find more examples for {field} {tag} which are not already tagged. " + f"Continuing with {count} examples..." 
+ ) finished = True break if finished: break - print("="*50) - print(dset[random_pos_row]['abstractText']) print("=" * 50) - res = input(f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""} a `{tag}` text? ' - f'[a to accept]: ') - if res == 'a': + print(dset[random_pos_row]["abstractText"]) + print("=" * 50) + res = input( + f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""} a `{tag}` text? ' + f"[a to accept]: " + ) + if res == "a": human_supervision[tag][field].append(dset[random_pos_row]) - with open(curation_file, 'w') as f: + with open(curation_file, "w") as f: json.dump(human_supervision, f) count = len(human_supervision[tag]) @@ -102,7 +110,6 @@ def retag( supervised: bool = True, years: list = None, ): - # We only have 1 file, so no sharding is available https://huggingface.co/docs/datasets/loading#multiprocessing logging.info("Loading the MeSH jsonl...") dset = load_dataset("json", data_files=data_path, num_proc=1) @@ -116,7 +123,7 @@ def retag( ) if tags_file_path is not None and os.path.isfile(tags_file_path): - with open(tags_file_path, 'r') as f: + with open(tags_file_path, "r") as f: tags = [x.strip() for x in f.readlines()] logging.info(f"- Total tags detected: {tags}.") @@ -125,15 +132,15 @@ def retag( for tag in tags: os.makedirs(os.path.join(save_to_path, tag.replace(" ", "")), exist_ok=True) logging.info(f"- Obtaining positive examples for {tag}...") - positive_dset = dset.filter( - lambda x: tag in x["meshMajor"], num_proc=num_proc - ) - - if len(positive_dset['abstractText']) < train_examples: - logging.info(f"Skipping {tag}: low examples ({len(positive_dset['abstractText'])} vs " - f"expected {train_examples}). 
" - f"Check {save_to_path}.err for more information about skipped tags.") - with open(f"{save_to_path}.err", 'a') as f: + positive_dset = dset.filter(lambda x: tag in x["meshMajor"], num_proc=num_proc) + + if len(positive_dset["abstractText"]) < train_examples: + logging.info( + f"Skipping {tag}: low examples ({len(positive_dset['abstractText'])} vs " + f"expected {train_examples}). " + f"Check {save_to_path}.err for more information about skipped tags." + ) + with open(f"{save_to_path}.err", "a") as f: f.write(tag) continue @@ -142,33 +149,48 @@ def retag( lambda x: tag not in x["meshMajor"], num_proc=num_proc ) - curation_file = os.path.join(save_to_path, tag.replace(' ', ''), "curation") + curation_file = os.path.join(save_to_path, tag.replace(" ", ""), "curation") if supervised: logging.info(f"- Curating {tag}...") _curate(curation_file, positive_dset, negative_dset, tag, train_examples) else: - with open(curation_file, 'w') as f: - json.dump({tag: {'positive': [positive_dset[i] for i in range(train_examples)], - 'negative': [negative_dset[i] for i in range(train_examples)] - } - }, f) + with open(curation_file, "w") as f: + json.dump( + { + tag: { + "positive": [ + positive_dset[i] for i in range(train_examples) + ], + "negative": [ + negative_dset[i] for i in range(train_examples) + ], + } + }, + f, + ) logging.info("- Retagging...") models = {} for tag in tags: - curation_file = os.path.join(save_to_path, tag.replace(' ', ''), "curation") + curation_file = os.path.join(save_to_path, tag.replace(" ", ""), "curation") if not os.path.isfile(curation_file): - logger.info(f"Skipping `{tag}` retagging as no curation data was found. " - f"Maybe there were too little examples? (check {save_to_path}.err)") + logger.info( + f"Skipping `{tag}` retagging as no curation data was found. " + f"Maybe there were too little examples? 
(check {save_to_path}.err)" + ) continue with open(curation_file, "r") as fr: data = json.load(fr) - positive_dset = Dataset.from_list(data[tag]['positive']) - negative_dset = Dataset.from_list(data[tag]['negative']) + positive_dset = Dataset.from_list(data[tag]["positive"]) + negative_dset = Dataset.from_list(data[tag]["negative"]) - pos_x_train, pos_x_test = _load_data(positive_dset, tag, limit=train_examples, split=0.8) - neg_x_train, neg_x_test = _load_data(negative_dset, "other", limit=train_examples, split=0.8) + pos_x_train, pos_x_test = _load_data( + positive_dset, tag, limit=train_examples, split=0.8 + ) + neg_x_train, neg_x_test = _load_data( + negative_dset, "other", limit=train_examples, split=0.8 + ) pos_x_train = pos_x_train.add_column("tag", [tag] * len(pos_x_train)) pos_x_test = pos_x_test.add_column("tag", [tag] * len(pos_x_test)) @@ -180,14 +202,19 @@ def retag( test = concatenate_datasets([pos_x_test, neg_x_test]) label_binarizer = preprocessing.LabelBinarizer() - label_binarizer_path = os.path.join(save_to_path, tag.replace(" ", ""), 'labelbinarizer') + label_binarizer_path = os.path.join( + save_to_path, tag.replace(" ", ""), "labelbinarizer" + ) labels = [1 if x == tag else 0 for x in train["tag"]] label_binarizer.fit(labels) - with open(label_binarizer_path, 'wb') as f: + with open(label_binarizer_path, "wb") as f: pkl.dump(label_binarizer, f) model = MeshXLinear(label_binarizer_path=label_binarizer_path) - model.fit(train["abstractText"], scipy.sparse.csr_matrix(label_binarizer.transform(labels))) + model.fit( + train["abstractText"], + scipy.sparse.csr_matrix(label_binarizer.transform(labels)), + ) models[tag] = model model_path = os.path.join(save_to_path, tag.replace(" ", ""), "clf") os.makedirs(model_path, exist_ok=True) @@ -195,71 +222,61 @@ def retag( logging.info("- Predicting all tags") dset = dset.add_column("changes", [[]] * len(dset)) - with open(os.path.join(save_to_path, 'corrections'), 'w') as f: + with 
open(os.path.join(save_to_path, "corrections"), "w") as f: for b in tqdm.tqdm(range(int(len(dset) / batch_size))): start = b * batch_size - end = min(len(dset), (b+1) * batch_size) + end = min(len(dset), (b + 1) * batch_size) batch = dset.select([i for i in range(start, end)]) batch_buffer = [x for x in batch] for tag in models.keys(): batch_preds = models[tag](batch["abstractText"], threshold=threshold) for i, bp in enumerate(batch_preds): is_predicted = bp == [0] - is_expected = tag in batch[i]['meshMajor'] + is_expected = tag in batch[i]["meshMajor"] if is_predicted != is_expected: if is_predicted: - batch_buffer[i]['meshMajor'].append(tag) - batch_buffer[i]['changes'].append(f"+{tag}") + batch_buffer[i]["meshMajor"].append(tag) + batch_buffer[i]["changes"].append(f"+{tag}") else: - batch_buffer[i]['meshMajor'].remove(tag) - batch_buffer[i]['changes'].append(f"-{tag}") + batch_buffer[i]["meshMajor"].remove(tag) + batch_buffer[i]["changes"].append(f"-{tag}") # batch = Dataset.from_list(batch_buffer) # buffer = io.BytesIO() # batch.to_json(buffer) # f.write(buffer.getvalue().decode('utf-8')) batch_buffer = [json.dumps(x) for x in batch_buffer] - f.write('\n'.join(batch_buffer)) + f.write("\n".join(batch_buffer)) @retag_app.command() def retag_cli( - data_path: str = typer.Argument( - ..., - help="Path to allMeSH_2021.jsonl"), + data_path: str = typer.Argument(..., help="Path to allMeSH_2021.jsonl"), save_to_path: str = typer.Argument( - ..., - help="Path where to save the retagged data" + ..., help="Path where to save the retagged data" ), num_proc: int = typer.Option( - os.cpu_count(), - help="Number of processes to use for data augmentation" + os.cpu_count(), help="Number of processes to use for data augmentation" ), batch_size: int = typer.Option( - 1024, - help="Preprocessing batch size (for dataset, filter, map, ...)" - ), - tags: str = typer.Option( - None, - help="Comma separated list of tags to retag" + 1024, help="Preprocessing batch size (for dataset, 
filter, map, ...)" ), + tags: str = typer.Option(None, help="Comma separated list of tags to retag"), tags_file_path: str = typer.Option( None, help="Text file containing one line per tag to be considered. " "The rest will be discarded.", ), threshold: float = typer.Option( - 0.9, - help="Minimum threshold of confidence to retag a model. Default: 0.9" + 0.9, help="Minimum threshold of confidence to retag a model. Default: 0.9" ), train_examples: int = typer.Option( - 100, - help="Number of examples to use for training the retaggers" + 100, help="Number of examples to use for training the retaggers" ), supervised: bool = typer.Option( False, help="Use human curation, showing a `limit` amount of positive and negative examples to curate data" - " for training the retaggers. The user will be required to accept or reject. When the limit is reached," - " the model will be train. All intermediary steps will be saved." + " for training the retaggers. The user will be required to accept or reject. When the limit is reached," + " the model will be train. All intermediary steps will be saved.", ), years: str = typer.Option( None, help="Comma-separated years you want to include in the retagging process" @@ -280,9 +297,7 @@ def retag_cli( exit(-1) if tags_file_path is not None and not os.path.isfile(tags_file_path): - logger.error( - f"{tags_file_path} not found" - ) + logger.error(f"{tags_file_path} not found") exit(-1) retag( From 4ecec1cf99bebd045361f368b9b4069604c63408 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Wed, 13 Sep 2023 14:38:01 +0100 Subject: [PATCH 063/102] Adds retagging tests and updates documentation --- README.md | 47 ++++++++++++---- examples/retag.sh | 1 + grants_tagger_light/retagging/retagging.py | 9 ++-- tests/test_retagging.py | 63 ++++++++++++++++++++++ 4 files changed, 104 insertions(+), 16 deletions(-) create mode 100644 tests/test_retagging.py diff --git a/README.md b/README.md index bf7ca7fe..a921ff5c 100644 --- a/README.md +++ b/README.md @@ -83,14 +83,16 @@ And then connect and attach to your machine with a tunnel # ⌨️ Commands -| Commands | Description | Needs dev | -|-----------------|--------------------------------------------------------------|-----------| -| 🔥 train | preprocesses the data and trains a new model | True | -| ⚙ preprocess | (Optional) preprocess and save the data outside training | False | -| 📈 evaluate | evaluate performance of pretrained model | True | -| 🔖 predict | predict tags given a grant abstract using a pretrained model | False | -| 🎛 tune | tune params and threshold | True | -| ⬇ download | download data from EPMC | False | +| Commands | Description | Needs dev | +|--------------|--------------------------------------------------------------|-----------| +| ⚙ preprocess | preprocess and save the data outside training | False | +| 🔥 train | preprocesses the data and trains a new model | True | +| 📚 augment | augments data using an LLM (gpt) | False | +| ✏ retag | retags data using XLinear to correct errors | False | +| 📈 evaluate | evaluate performance of pretrained model | True | +| 🔖 predict | predict tags given a grant abstract using a pretrained model | False | +| 🎛 tune | tune params and threshold | True | +| ⬇ download | download data from EPMC | False | in square brackets the commands that are not implemented yet @@ -295,11 +297,35 @@ For tags as `Data Science`, `Artificial Intelligence`, `Data Collection`, `Deep `Artificial Intelligence` with several thousand rows shows a performance of 0.1 
F1, showing a lot of confusion with the other tags described above. -We propose a solution: retagging the data. +We propose a solution: retagging the original data with a small curated dataset of examples and a quick Machine Learning light classifier: XLinear. ``` -grants-tagger retag mesh data/raw/allMeSH_2021.jsonl ll --tags-file-path tags_to_augment.txt +grants-tagger retag mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FILE_HERE] \ + --tags "Artificial Intelligence,HIV" \ + --years 2016,2017,2018,2019,2020,2021 \ + --train-examples 100 \ + --batch-size 10000 \ + --supervised ``` +Let's take a look at some of the params: +- *tags*: A comma-separated (and quoted) list of tags you want to retag. +- *years*: A comma-separated list of years you want to include +- *train-examples*: The number of examples to include for training the classifier. Default: 100 +- *batch-size*: The size of the processing batch. Keep it high as the memory consumption is really small. Default: 10000 + +### Getting the curation data: Supervised or Unsupervised? +For using the retagger, you need a small +- *supervised*: If you want to be asked for *train-examples* examples to curate a dataset for training the classifier. Recommended. + +If not set, the model will randomly get *train-examples* and train the classifier without your supervision, which will reduce the performance of the classifiers. + +### Artifacts created +As a result of the proces, you will find a folder at *save_to_path*. Inside, you will find: +- One folder per tag, including: + - `clf` (a classifier), + - `curation` (a dataset of positive and negative examples for the tag) + - `labelbinarizer` (a label binarizer to encode the labels) +- a `corrections` file, the new allMeSH_2021.jsonl with your tags corrected. 
### Other params ``` @@ -323,7 +349,6 @@ grants-tagger retag mesh data/raw/allMeSH_2021.jsonl ll --tags-file-path tags_to ``` - ## 📈 Evaluate Evaluate enables evaluation of the performance of various approaches including diff --git a/examples/retag.sh b/examples/retag.sh index b62980cf..4bce6dc0 100644 --- a/examples/retag.sh +++ b/examples/retag.sh @@ -1,5 +1,6 @@ grants-tagger retag mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FILE_HERE] \ --tags "Artificial Intelligence,HIV" \ --years 2016,2017,2018,2019,2020,2021 \ + --train-examples 100 \ --batch-size 10000 \ --supervised \ No newline at end of file diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index eb73f793..980b5edf 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -143,11 +143,12 @@ def retag( with open(f"{save_to_path}.err", "a") as f: f.write(tag) continue - + logging.info(f"-- Total positive examples for {tag}: {len(positive_dset)}") logging.info(f"- Obtaining negative examples ('other') for {tag}...") negative_dset = dset.filter( lambda x: tag not in x["meshMajor"], num_proc=num_proc ) + logging.info(f"-- Total negative examples for {tag}: {len(negative_dset)}") curation_file = os.path.join(save_to_path, tag.replace(" ", ""), "curation") if supervised: @@ -199,6 +200,8 @@ def retag( logging.info(f"- Creating train/test sets...") train = concatenate_datasets([pos_x_train, neg_x_train]) + + # TODO: Use Evaluation on `test` to see if the model is good enough test = concatenate_datasets([pos_x_test, neg_x_test]) label_binarizer = preprocessing.LabelBinarizer() @@ -240,10 +243,6 @@ def retag( else: batch_buffer[i]["meshMajor"].remove(tag) batch_buffer[i]["changes"].append(f"-{tag}") - # batch = Dataset.from_list(batch_buffer) - # buffer = io.BytesIO() - # batch.to_json(buffer) - # f.write(buffer.getvalue().decode('utf-8')) batch_buffer = [json.dumps(x) for x in batch_buffer] 
f.write("\n".join(batch_buffer)) diff --git a/tests/test_retagging.py b/tests/test_retagging.py new file mode 100644 index 00000000..f276e7a6 --- /dev/null +++ b/tests/test_retagging.py @@ -0,0 +1,63 @@ +import os +import tempfile +import unittest +import pytest +from grants_tagger_light.retagging.retagging import retag + + +# Note dummy data is not necessarily annotated correctly +dummy_data = """{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} 
+{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"}""" # noqa + + +@pytest.fixture +def data_path(): + with tempfile.TemporaryDirectory() as tmpdirname: + data_path = tmpdirname + "/data.jsonl" + with open(data_path, "w") as f: + f.write(dummy_data) + yield data_path + + +def test_retagging(data_path): + with tempfile.TemporaryDirectory() 
as tmpdirname: + save_to_path = tmpdirname + "/output" + + tag = 'COVID-19' + retag(data_path, + save_to_path, + num_proc=1, + batch_size=1, + tags=[tag], + years=['2023'], + threshold=0.9, + train_examples=10, + supervised=False) + + assert os.path.isdir(save_to_path) + assert os.path.isdir(os.path.join(save_to_path, tag)) + assert os.path.isdir(os.path.join(save_to_path, tag, 'clf')) + assert os.path.isfile(os.path.join(save_to_path, tag, 'curation')) + assert os.path.isfile(os.path.join(save_to_path, tag, 'labelbinarizer')) + + +if __name__ == '__main__': + unittest.main() From 8bc7b4df42dfda5b52e029934daf796c52d99f2d Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Wed, 13 Sep 2023 16:36:58 +0100 Subject: [PATCH 064/102] Adds retagging tests and updates documentation --- README.md | 23 ++++++++++++++++++++++ grants_tagger_light/retagging/retagging.py | 6 +++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a921ff5c..8ddb2efe 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,29 @@ Let's take a look at some of the params: For using the retagger, you need a small - *supervised*: If you want to be asked for *train-examples* examples to curate a dataset for training the classifier. Recommended. +``` +================================================== +The SD BIOLINE HIV/Syphilis Duo assay is the first World Health Organization prequalified dual rapid diagnostic test for + simultaneous detection of HIV and Treponema pallidum antibodies in human blood. Prior to introducing the test into + antenatal clinics across South Sudan, a field evaluation of its clinical performance in diagnosing both HIV and + syphilis in pregnant women was conducted. 
SD Bioline test performance on venous blood samples was compared with (i) + Vironostika HIV1/2 Uniform II Ag/Ab reference standard and Alere Determine HIV 1/2 non-reference standard for HIV + diagnosis, and (ii) Treponema pallidum hemagglutination reference standard and Rapid plasma reagin non-reference + standard for syphilis. Sensitivity, specificity, positive predictive value (PPN), negative predictive value (NPV) + and kappa (κ) value were calculated for each component against the reference standards within 95% confidence + intervals (CIs); agreements between Determine HIV 1/2 and SD Bioline HIV tests were also calculated. Of 442 pregnant + women recruited, eight (1.8%) were HIV positive, 22 (5.0%) had evidence of syphilis exposure; 14 (3.2%) had active + infection. For HIV diagnosis, the sensitivity, specificity, PPV and NPV were 100% (95% CI: 63.1-100), 100% + (95% CI: 99.2-100), 100% (95% CI: 63.1-100) and 100% (95% CI: 99.2-100) respectively with κ value of 1 + (95% CI: 0.992-1.000). Overall agreement of the Duo HIV component and Determine test was 99.1% (95% CI: 0.977-0.998) + with 66.7% (95% CI: 34.9-90.1) positive and 100% (95% CI: 0.992-1.000) negative percent agreements. For syphilis, + the Duo assay sensitivity was 86.4% (95% CI: 65.1-97.1) and specificity 100% (95% CI: 99.1-100) with PPV 100% + (95% CI: 82.4-100), NPV 99.2% (95% CI: 97.9-99.9) and κ value 0.92 (95% CI: 0.980-0.999). Our findings suggest the SD Bioline HIV/Syphilis Duo Assay could be suitable for HIV and syphilis testing in women attending antenatal services across South Sudan. Women with positive syphilis results should receive treatment immediately, whereas HIV positive women should undergo confirmatory testing following national HIV testing guidelines. +================================================== +[2/100]> Is this a `HIV` text?
[a to accept]: + +``` + If not set, the model will randomly get *train-examples* and train the classifier without your supervision, which will reduce the performance of the classifiers. ### Artifacts created diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 980b5edf..5eabb71a 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -61,10 +61,10 @@ def _annotate(curation_file, dset, tag, limit, is_positive): while count < limit: tries = 0 random.seed(time.time()) - random_pos_row = random.randint(0, len(dset)) + random_pos_row = random.randint(0, len(dset)-1) id_ = dset[random_pos_row]["pmid"] while id_ in [x["pmid"] for x in human_supervision[tag][field]]: - random_pos_row = random.randint(0, len(dset)) + random_pos_row = random.randint(0, len(dset)-1) id_ = dset[random_pos_row]["pmid"] tries += 1 if tries >= 10: @@ -87,7 +87,7 @@ def _annotate(curation_file, dset, tag, limit, is_positive): human_supervision[tag][field].append(dset[random_pos_row]) with open(curation_file, "w") as f: json.dump(human_supervision, f) - count = len(human_supervision[tag]) + count = len(human_supervision[tag][field]) def _curate(save_to_path, pos_dset, neg_dset, tag, limit): From 58f39fab8d0b759b320cc6df94784c3784a72763 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Wed, 13 Sep 2023 16:45:22 +0100 Subject: [PATCH 065/102] Black --- grants_tagger_light/retagging/retagging.py | 4 +-- tests/test_retagging.py | 30 ++++++++++++---------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 5eabb71a..f042f088 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -61,10 +61,10 @@ def _annotate(curation_file, dset, tag, limit, is_positive): while count < limit: tries = 0 random.seed(time.time()) - random_pos_row = random.randint(0, len(dset)-1) + random_pos_row = random.randint(0, len(dset) - 1) id_ = dset[random_pos_row]["pmid"] while id_ in [x["pmid"] for x in human_supervision[tag][field]]: - random_pos_row = random.randint(0, len(dset)-1) + random_pos_row = random.randint(0, len(dset) - 1) id_ = dset[random_pos_row]["pmid"] tries += 1 if tries >= 10: diff --git a/tests/test_retagging.py b/tests/test_retagging.py index f276e7a6..bc761132 100644 --- a/tests/test_retagging.py +++ b/tests/test_retagging.py @@ -41,23 +41,25 @@ def test_retagging(data_path): with tempfile.TemporaryDirectory() as tmpdirname: save_to_path = tmpdirname + "/output" - tag = 'COVID-19' - retag(data_path, - save_to_path, - num_proc=1, - batch_size=1, - tags=[tag], - years=['2023'], - threshold=0.9, - train_examples=10, - supervised=False) + tag = "COVID-19" + retag( + data_path, + save_to_path, + num_proc=1, + batch_size=1, + tags=[tag], + years=["2023"], + threshold=0.9, + train_examples=10, + supervised=False, + ) assert os.path.isdir(save_to_path) assert os.path.isdir(os.path.join(save_to_path, tag)) - assert os.path.isdir(os.path.join(save_to_path, tag, 'clf')) - assert os.path.isfile(os.path.join(save_to_path, tag, 'curation')) - assert os.path.isfile(os.path.join(save_to_path, tag, 'labelbinarizer')) + assert os.path.isdir(os.path.join(save_to_path, tag, "clf")) + assert 
os.path.isfile(os.path.join(save_to_path, tag, "curation")) + assert os.path.isfile(os.path.join(save_to_path, tag, "labelbinarizer")) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From ad845830974ec1fddf3a44bd4e0dec0080969f34 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Wed, 13 Sep 2023 16:49:08 +0100 Subject: [PATCH 066/102] rufus --- grants_tagger_light/retagging/retagging.py | 45 ++++++++++++++-------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index f042f088..5470579e 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -1,4 +1,3 @@ -import io import json import logging import random @@ -7,7 +6,7 @@ import typer from loguru import logger -from datasets import Dataset, load_dataset, concatenate_datasets, load_from_disk +from datasets import Dataset, load_dataset, concatenate_datasets import os @@ -69,7 +68,8 @@ def _annotate(curation_file, dset, tag, limit, is_positive): tries += 1 if tries >= 10: logger.error( - f"Unable to find more examples for {field} {tag} which are not already tagged. " + f"Unable to find more examples for {field} {tag} " + f"which are not already tagged. " f"Continuing with {count} examples..." ) finished = True @@ -80,8 +80,8 @@ def _annotate(curation_file, dset, tag, limit, is_positive): print(dset[random_pos_row]["abstractText"]) print("=" * 50) res = input( - f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""} a `{tag}` text? ' - f"[a to accept]: " + f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""}' + f' a `{tag}` text? 
[a to accept]: ' ) if res == "a": human_supervision[tag][field].append(dset[random_pos_row]) @@ -136,9 +136,11 @@ def retag( if len(positive_dset["abstractText"]) < train_examples: logging.info( - f"Skipping {tag}: low examples ({len(positive_dset['abstractText'])} vs " + f"Skipping {tag}: low examples " + f"({len(positive_dset['abstractText'])} vs " f"expected {train_examples}). " - f"Check {save_to_path}.err for more information about skipped tags." + f"Check {save_to_path}.err for more information " + f"about skipped tags." ) with open(f"{save_to_path}.err", "a") as f: f.write(tag) @@ -193,10 +195,14 @@ def retag( negative_dset, "other", limit=train_examples, split=0.8 ) - pos_x_train = pos_x_train.add_column("tag", [tag] * len(pos_x_train)) - pos_x_test = pos_x_test.add_column("tag", [tag] * len(pos_x_test)) - neg_x_train = neg_x_train.add_column("tag", ["other"] * len(neg_x_train)) - neg_x_test = neg_x_test.add_column("tag", ["other"] * len(neg_x_test)) + pos_x_train = pos_x_train.add_column("tag", + [tag] * len(pos_x_train)) + pos_x_test = pos_x_test.add_column("tag", + [tag] * len(pos_x_test)) + neg_x_train = neg_x_train.add_column("tag", + ["other"] * len(neg_x_train)) + neg_x_test = neg_x_test.add_column("tag", + ["other"] * len(neg_x_test)) logging.info(f"- Creating train/test sets...") train = concatenate_datasets([pos_x_train, neg_x_train]) @@ -266,19 +272,23 @@ def retag_cli( "The rest will be discarded.", ), threshold: float = typer.Option( - 0.9, help="Minimum threshold of confidence to retag a model. Default: 0.9" + 0.9, help="Minimum threshold of confidence to retag a model. " + "Default: 0.9" ), train_examples: int = typer.Option( 100, help="Number of examples to use for training the retaggers" ), supervised: bool = typer.Option( False, - help="Use human curation, showing a `limit` amount of positive and negative examples to curate data" - " for training the retaggers. The user will be required to accept or reject. 
When the limit is reached," - " the model will be train. All intermediary steps will be saved.", + help="Use human curation, showing a `limit` amount of positive " + "and negative examples to curate data" + " for training the retaggers. The user will be required to accept" + " or reject. When the limit is reached," + " the model will be train. All intermediary steps will be saved.", ), years: str = typer.Option( - None, help="Comma-separated years you want to include in the retagging process" + None, help="Comma-separated years you want to include in " + "the retagging process" ), ): if not data_path.endswith("jsonl"): @@ -290,7 +300,8 @@ def retag_cli( if tags_file_path is None and tags is None: logger.error( - "To understand which tags need to be augmented, use --tags [tags separated by comma] or create a file with" + "To understand which tags need to be augmented, use --tags " + "[tags separated by comma] or create a file with " "a newline per tag and set the path in --tags-file-path" ) exit(-1) From 5d41bfea8353509e71fe1047855ccda5889c3b86 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Wed, 13 Sep 2023 16:53:18 +0100 Subject: [PATCH 067/102] rufus and black --- grants_tagger_light/retagging/retagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 5470579e..b1b12965 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -204,11 +204,11 @@ def retag( neg_x_test = neg_x_test.add_column("tag", ["other"] * len(neg_x_test)) - logging.info(f"- Creating train/test sets...") + logging.info("- Creating train/test sets...") train = concatenate_datasets([pos_x_train, neg_x_train]) # TODO: Use Evaluation on `test` to see if the model is good enough - test = concatenate_datasets([pos_x_test, neg_x_test]) + # test = concatenate_datasets([pos_x_test, neg_x_test]) label_binarizer = preprocessing.LabelBinarizer() label_binarizer_path = os.path.join( From 96c88cef4c8d73a48fca987be80ba4b2a9646609 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Wed, 13 Sep 2023 16:56:54 +0100 Subject: [PATCH 068/102] black --- grants_tagger_light/retagging/retagging.py | 29 +++++++++------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index b1b12965..fe927983 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -81,7 +81,7 @@ def _annotate(curation_file, dset, tag, limit, is_positive): print("=" * 50) res = input( f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""}' - f' a `{tag}` text? [a to accept]: ' + f" a `{tag}` text? 
[a to accept]: " ) if res == "a": human_supervision[tag][field].append(dset[random_pos_row]) @@ -195,14 +195,10 @@ def retag( negative_dset, "other", limit=train_examples, split=0.8 ) - pos_x_train = pos_x_train.add_column("tag", - [tag] * len(pos_x_train)) - pos_x_test = pos_x_test.add_column("tag", - [tag] * len(pos_x_test)) - neg_x_train = neg_x_train.add_column("tag", - ["other"] * len(neg_x_train)) - neg_x_test = neg_x_test.add_column("tag", - ["other"] * len(neg_x_test)) + pos_x_train = pos_x_train.add_column("tag", [tag] * len(pos_x_train)) + pos_x_test = pos_x_test.add_column("tag", [tag] * len(pos_x_test)) + neg_x_train = neg_x_train.add_column("tag", ["other"] * len(neg_x_train)) + neg_x_test = neg_x_test.add_column("tag", ["other"] * len(neg_x_test)) logging.info("- Creating train/test sets...") train = concatenate_datasets([pos_x_train, neg_x_train]) @@ -272,8 +268,7 @@ def retag_cli( "The rest will be discarded.", ), threshold: float = typer.Option( - 0.9, help="Minimum threshold of confidence to retag a model. " - "Default: 0.9" + 0.9, help="Minimum threshold of confidence to retag a model. " "Default: 0.9" ), train_examples: int = typer.Option( 100, help="Number of examples to use for training the retaggers" @@ -281,14 +276,14 @@ def retag_cli( supervised: bool = typer.Option( False, help="Use human curation, showing a `limit` amount of positive " - "and negative examples to curate data" - " for training the retaggers. The user will be required to accept" - " or reject. When the limit is reached," - " the model will be train. All intermediary steps will be saved.", + "and negative examples to curate data" + " for training the retaggers. The user will be required to accept" + " or reject. When the limit is reached," + " the model will be train. 
All intermediary steps will be saved.", ), years: str = typer.Option( - None, help="Comma-separated years you want to include in " - "the retagging process" + None, + help="Comma-separated years you want to include in " "the retagging process", ), ): if not data_path.endswith("jsonl"): From c70bdad9b23d5398d0f4860a27d0cb365eecce5a Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Wed, 13 Sep 2023 18:00:17 +0100 Subject: [PATCH 069/102] Removes spacy --- poetry.lock | 464 +------------------------------------------------ pyproject.toml | 1 - 2 files changed, 1 insertion(+), 464 deletions(-) diff --git a/poetry.lock b/poetry.lock index 708dc739..78b1da32 100644 --- a/poetry.lock +++ b/poetry.lock @@ -466,46 +466,6 @@ d = ["aiohttp (>=3.7.4)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] -[[package]] -name = "blis" -version = "0.7.10" -description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension." -optional = false -python-versions = "*" -files = [ - {file = "blis-0.7.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1fb4a9fca42d56533e28bf62b740f5c7d122e804742e5ea24b2704950151ae3c"}, - {file = "blis-0.7.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2167e656d6237443ef7d0cd7dcfbedc12fcd156c54112f2dc5ca9b0249ec835d"}, - {file = "blis-0.7.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a887165f2d7c08814dc92f96535232ca628e3e27927fb09cdeb8492781a28d04"}, - {file = "blis-0.7.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31a6a8c347ef764ef268b6e11ae7b47ce83aba7ea99fc9223f85543aaab09826"}, - {file = "blis-0.7.10-cp310-cp310-win_amd64.whl", hash = "sha256:67a17000e953d05f09a1ee7dad001c783ca5d5dc12e40dcfff049b86e74fed67"}, - {file = "blis-0.7.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:67c8270ea20cf7e9342e4e3ed8fd51123a5236b1aa35fa94fb2200a8e11d0081"}, - {file = "blis-0.7.10-cp311-cp311-macosx_11_0_arm64.whl", hash 
= "sha256:a86f1d2c6370d571dc88fc710416e8cab7dc6bb3a47ee9f27079ee34adf780d6"}, - {file = "blis-0.7.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:288247c424fd2bd3d43b750f1f54bba19fe2cbb11e5c028bc4762bc03bd54b9b"}, - {file = "blis-0.7.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2846d1a5116a5a1e4c09fa5c3cab6fbe13349c8036bc1c8746a738c556a751c4"}, - {file = "blis-0.7.10-cp311-cp311-win_amd64.whl", hash = "sha256:f5c4a7c0fa67fec5a06fb6c1656bf1b51e7ab414292a04d417512b1fb1247246"}, - {file = "blis-0.7.10-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec3e11e8ed6be18cf43152513bbfeabbc3f99a5d391786642fb7a14fb914ee61"}, - {file = "blis-0.7.10-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:148835c8c96ea4c8957111de0593a28e9044c5b0e4cbcc34b77d700394fa6f13"}, - {file = "blis-0.7.10-cp36-cp36m-win_amd64.whl", hash = "sha256:2df3d8703d23c39d8a0fb1e43be4681ec09f9010e08e9b35674fe799046c5fd5"}, - {file = "blis-0.7.10-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fa62e13631c89626365ccd2585a2be154847c5bbb30cfc2ea8fdcf4e83cedd69"}, - {file = "blis-0.7.10-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:adc7c70c5d482ce71c61a6008bcb44dfb15a0ac41ba176c59143f016658fa82d"}, - {file = "blis-0.7.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed4e31d32916f657842572b6640b235c5f2f679a70ec74808160b584c08399ce"}, - {file = "blis-0.7.10-cp37-cp37m-win_amd64.whl", hash = "sha256:9833fc44795c8d43617732df31a8eca9de3f54b181ff9f0008cc50356cc26d86"}, - {file = "blis-0.7.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0cca151d046f8b6b9d075b4f3a5ffee52993424b3080f0e0c2be419f20a477a7"}, - {file = "blis-0.7.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d3bb6c4b9ae45e88e6e69b46eca145858cb9b3cd0a43a6c6812fb34c5c80d871"}, - {file = "blis-0.7.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:47c6a0230688ff7c29e31b78f0d207556044c0c84bb90e7c28b009a6765658c4"}, - {file = "blis-0.7.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:953dd85d4a8f79d4d69c17d27a0b783a5664aee0feafa33662199b7c78b0ee51"}, - {file = "blis-0.7.10-cp38-cp38-win_amd64.whl", hash = "sha256:ed181a90fef1edff76220cb883df65685aeca610a0abe22c91322a3300e1e89d"}, - {file = "blis-0.7.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:df7f746159d9ab11f427e00c72abe8de522c1671c7a33ca664739b2bd48b71c2"}, - {file = "blis-0.7.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dd7870a21aed12b25ec8692a75e6965e9451b1b7f2752e2cac4ae9f565d2de95"}, - {file = "blis-0.7.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4766e26721e37e028336b542c226eab9faf812ea2d89a1869531ed0cada6c359"}, - {file = "blis-0.7.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc8fac91353f20e747e130bc8d4010442c6700e4c7e5edc38d69bb844802ea81"}, - {file = "blis-0.7.10-cp39-cp39-win_amd64.whl", hash = "sha256:4329fef5b1050c88dbca6f7d87ecc02d56f09005afa60edf12d826d82544f88a"}, - {file = "blis-0.7.10.tar.gz", hash = "sha256:343e8b125784d70ff6e1f17a95ea71538705bf0bd3cc236a176d153590842647"}, -] - -[package.dependencies] -numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} - [[package]] name = "boto3" version = "1.28.17" @@ -544,17 +504,6 @@ urllib3 = ">=1.25.4,<1.27" [package.extras] crt = ["awscrt (==0.16.26)"] -[[package]] -name = "catalogue" -version = "2.0.9" -description = "Super lightweight function registries for your library" -optional = false -python-versions = ">=3.6" -files = [ - {file = "catalogue-2.0.9-py3-none-any.whl", hash = "sha256:5817ce97de17ace366a15eadd4987ac022b28f262006147549cdb3467265dc4d"}, - {file = "catalogue-2.0.9.tar.gz", hash = "sha256:d204c423ec436f2545341ec8a0e026ae033b3ce5911644f95e94d6b887cf631c"}, -] - [[package]] name = "celery" version = "5.3.4" @@ -909,21 +858,6 @@ files = [ 
[package.extras] test = ["flake8 (==3.7.8)", "hypothesis (==3.55.3)"] -[[package]] -name = "confection" -version = "0.1.3" -description = "The sweetest config system for Python" -optional = false -python-versions = ">=3.6" -files = [ - {file = "confection-0.1.3-py3-none-any.whl", hash = "sha256:58b125c9bc6786f32e37fe4d98bc3a03e5f509a4b9de02541b99c559f2026092"}, - {file = "confection-0.1.3.tar.gz", hash = "sha256:5a876d368a7698eec58791126757a75a3df16e26cc49653b52426e9ffd39f12f"}, -] - -[package.dependencies] -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" -srsly = ">=2.4.0,<3.0.0" - [[package]] name = "configobj" version = "5.0.8" @@ -983,43 +917,6 @@ ssh = ["bcrypt (>=3.1.5)"] test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] -[[package]] -name = "cymem" -version = "2.0.7" -description = "Manage calls to calloc/free through Cython" -optional = false -python-versions = "*" -files = [ - {file = "cymem-2.0.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4981fc9182cc1fe54bfedf5f73bfec3ce0c27582d9be71e130c46e35958beef0"}, - {file = "cymem-2.0.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:42aedfd2e77aa0518a24a2a60a2147308903abc8b13c84504af58539c39e52a3"}, - {file = "cymem-2.0.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c183257dc5ab237b664f64156c743e788f562417c74ea58c5a3939fe2d48d6f6"}, - {file = "cymem-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d18250f97eeb13af2e8b19d3cefe4bf743b963d93320b0a2e729771410fd8cf4"}, - {file = "cymem-2.0.7-cp310-cp310-win_amd64.whl", hash = "sha256:864701e626b65eb2256060564ed8eb034ebb0a8f14ce3fbef337e88352cdee9f"}, - {file = "cymem-2.0.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:314273be1f143da674388e0a125d409e2721fbf669c380ae27c5cbae4011e26d"}, - {file = "cymem-2.0.7-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:df543a36e7000808fe0a03d92fd6cd8bf23fa8737c3f7ae791a5386de797bf79"}, - {file = "cymem-2.0.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e5e1b7de7952d89508d07601b9e95b2244e70d7ef60fbc161b3ad68f22815f8"}, - {file = "cymem-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2aa33f1dbd7ceda37970e174c38fd1cf106817a261aa58521ba9918156868231"}, - {file = "cymem-2.0.7-cp311-cp311-win_amd64.whl", hash = "sha256:10178e402bb512b2686b8c2f41f930111e597237ca8f85cb583ea93822ef798d"}, - {file = "cymem-2.0.7-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2971b7da5aa2e65d8fbbe9f2acfc19ff8e73f1896e3d6e1223cc9bf275a0207"}, - {file = "cymem-2.0.7-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85359ab7b490e6c897c04863704481600bd45188a0e2ca7375eb5db193e13cb7"}, - {file = "cymem-2.0.7-cp36-cp36m-win_amd64.whl", hash = "sha256:0ac45088abffbae9b7db2c597f098de51b7e3c1023cb314e55c0f7f08440cf66"}, - {file = "cymem-2.0.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:26e5d5c6958855d2fe3d5629afe85a6aae5531abaa76f4bc21b9abf9caaccdfe"}, - {file = "cymem-2.0.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:011039e12d3144ac1bf3a6b38f5722b817f0d6487c8184e88c891b360b69f533"}, - {file = "cymem-2.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f9e63e5ad4ed6ffa21fd8db1c03b05be3fea2f32e32fdace67a840ea2702c3d"}, - {file = "cymem-2.0.7-cp37-cp37m-win_amd64.whl", hash = "sha256:5ea6b027fdad0c3e9a4f1b94d28d213be08c466a60c72c633eb9db76cf30e53a"}, - {file = "cymem-2.0.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4302df5793a320c4f4a263c7785d2fa7f29928d72cb83ebeb34d64a610f8d819"}, - {file = "cymem-2.0.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:24b779046484674c054af1e779c68cb224dc9694200ac13b22129d7fb7e99e6d"}, - {file = "cymem-2.0.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6c50794c612801ed8b599cd4af1ed810a0d39011711c8224f93e1153c00e08d1"}, - {file = "cymem-2.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9525ad563b36dc1e30889d0087a0daa67dd7bb7d3e1530c4b61cd65cc756a5b"}, - {file = "cymem-2.0.7-cp38-cp38-win_amd64.whl", hash = "sha256:48b98da6b906fe976865263e27734ebc64f972a978a999d447ad6c83334e3f90"}, - {file = "cymem-2.0.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e156788d32ad8f7141330913c5d5d2aa67182fca8f15ae22645e9f379abe8a4c"}, - {file = "cymem-2.0.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3da89464021fe669932fce1578343fcaf701e47e3206f50d320f4f21e6683ca5"}, - {file = "cymem-2.0.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f359cab9f16e25b3098f816c40acbf1697a3b614a8d02c56e6ebcb9c89a06b3"}, - {file = "cymem-2.0.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f165d7bce55d6730930e29d8294569788aa127f1be8d1642d9550ed96223cb37"}, - {file = "cymem-2.0.7-cp39-cp39-win_amd64.whl", hash = "sha256:59a09cf0e71b1b88bfa0de544b801585d81d06ea123c1725e7c5da05b7ca0d20"}, - {file = "cymem-2.0.7.tar.gz", hash = "sha256:e6034badb5dd4e10344211c81f16505a55553a7164adc314c75bd80cf07e57a8"}, -] - [[package]] name = "datasets" version = "2.13.1" @@ -1935,20 +1832,6 @@ sqs = ["boto3 (>=1.26.143)", "pycurl (>=7.43.0.5)", "urllib3 (>=1.26.16)"] yaml = ["PyYAML (>=3.10)"] zookeeper = ["kazoo (>=2.8.0)"] -[[package]] -name = "langcodes" -version = "3.3.0" -description = "Tools for labeling human languages with IETF language tags" -optional = false -python-versions = ">=3.6" -files = [ - {file = "langcodes-3.3.0-py3-none-any.whl", hash = "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69"}, - {file = "langcodes-3.3.0.tar.gz", hash = "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"}, -] - -[package.extras] -data = ["language-data (>=1.1,<2.0)"] - [[package]] name = "libpecos" version = "1.0.0" @@ -2198,43 
+2081,6 @@ files = [ [package.dependencies] dill = ">=0.3.6" -[[package]] -name = "murmurhash" -version = "1.0.9" -description = "Cython bindings for MurmurHash" -optional = false -python-versions = ">=3.6" -files = [ - {file = "murmurhash-1.0.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:697ed01454d92681c7ae26eb1adcdc654b54062bcc59db38ed03cad71b23d449"}, - {file = "murmurhash-1.0.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ef31b5c11be2c064dbbdd0e22ab3effa9ceb5b11ae735295c717c120087dd94"}, - {file = "murmurhash-1.0.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7a2bd203377a31bbb2d83fe3f968756d6c9bbfa36c64c6ebfc3c6494fc680bc"}, - {file = "murmurhash-1.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0eb0f8e652431ea238c11bcb671fef5c03aff0544bf7e098df81ea4b6d495405"}, - {file = "murmurhash-1.0.9-cp310-cp310-win_amd64.whl", hash = "sha256:cf0b3fe54dca598f5b18c9951e70812e070ecb4c0672ad2cc32efde8a33b3df6"}, - {file = "murmurhash-1.0.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5dc41be79ba4d09aab7e9110a8a4d4b37b184b63767b1b247411667cdb1057a3"}, - {file = "murmurhash-1.0.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c0f84ecdf37c06eda0222f2f9e81c0974e1a7659c35b755ab2fdc642ebd366db"}, - {file = "murmurhash-1.0.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:241693c1c819148eac29d7882739b1099c891f1f7431127b2652c23f81722cec"}, - {file = "murmurhash-1.0.9-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f5ca56c430230d3b581dfdbc54eb3ad8b0406dcc9afdd978da2e662c71d370"}, - {file = "murmurhash-1.0.9-cp311-cp311-win_amd64.whl", hash = "sha256:660ae41fc6609abc05130543011a45b33ca5d8318ae5c70e66bbd351ca936063"}, - {file = "murmurhash-1.0.9-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:01137d688a6b259bde642513506b062364ea4e1609f886d9bd095c3ae6da0b94"}, - {file = "murmurhash-1.0.9-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1b70bbf55d89713873a35bd4002bc231d38e530e1051d57ca5d15f96c01fd778"}, - {file = "murmurhash-1.0.9-cp36-cp36m-win_amd64.whl", hash = "sha256:3e802fa5b0e618ee99e8c114ce99fc91677f14e9de6e18b945d91323a93c84e8"}, - {file = "murmurhash-1.0.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:213d0248e586082e1cab6157d9945b846fd2b6be34357ad5ea0d03a1931d82ba"}, - {file = "murmurhash-1.0.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94b89d02aeab5e6bad5056f9d08df03ac7cfe06e61ff4b6340feb227fda80ce8"}, - {file = "murmurhash-1.0.9-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c2e2ee2d91a87952fe0f80212e86119aa1fd7681f03e6c99b279e50790dc2b3"}, - {file = "murmurhash-1.0.9-cp37-cp37m-win_amd64.whl", hash = "sha256:8c3d69fb649c77c74a55624ebf7a0df3c81629e6ea6e80048134f015da57b2ea"}, - {file = "murmurhash-1.0.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab78675510f83e7a3c6bd0abdc448a9a2b0b385b0d7ee766cbbfc5cc278a3042"}, - {file = "murmurhash-1.0.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0ac5530c250d2b0073ed058555847c8d88d2d00229e483d45658c13b32398523"}, - {file = "murmurhash-1.0.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69157e8fa6b25c4383645227069f6a1f8738d32ed2a83558961019ca3ebef56a"}, - {file = "murmurhash-1.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2aebe2ae016525a662ff772b72a2c9244a673e3215fcd49897f494258b96f3e7"}, - {file = "murmurhash-1.0.9-cp38-cp38-win_amd64.whl", hash = "sha256:a5952f9c18a717fa17579e27f57bfa619299546011a8378a8f73e14eece332f6"}, - {file = "murmurhash-1.0.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:ef79202feeac68e83971239169a05fa6514ecc2815ce04c8302076d267870f6e"}, - {file = "murmurhash-1.0.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:799fcbca5693ad6a40f565ae6b8e9718e5875a63deddf343825c0f31c32348fa"}, - {file = "murmurhash-1.0.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9b995bc82eaf9223e045210207b8878fdfe099a788dd8abd708d9ee58459a9d"}, - {file = "murmurhash-1.0.9-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b129e1c5ebd772e6ff5ef925bcce695df13169bd885337e6074b923ab6edcfc8"}, - {file = "murmurhash-1.0.9-cp39-cp39-win_amd64.whl", hash = "sha256:379bf6b414bd27dd36772dd1570565a7d69918e980457370838bd514df0d91e9"}, - {file = "murmurhash-1.0.9.tar.gz", hash = "sha256:fe7a38cb0d3d87c14ec9dddc4932ffe2dbc77d75469ab80fd5014689b0e07b58"}, -] - [[package]] name = "mypy-extensions" version = "1.0.0" @@ -2526,28 +2372,6 @@ files = [ {file = "pathtools-0.1.2.tar.gz", hash = "sha256:7c35c5421a39bb82e58018febd90e3b6e5db34c5443aaaf742b3f33d4655f1c0"}, ] -[[package]] -name = "pathy" -version = "0.10.2" -description = "pathlib.Path subclasses for local and cloud bucket storage" -optional = false -python-versions = ">= 3.6" -files = [ - {file = "pathy-0.10.2-py3-none-any.whl", hash = "sha256:681bc98dbff28e7de3e50efa8246910f727e8ac254c4318c47ce341f7c1ce21d"}, - {file = "pathy-0.10.2.tar.gz", hash = "sha256:79c572ab7fed84dc46837346edae58565992d0477a789cd4691a41d8eab9917d"}, -] - -[package.dependencies] -smart-open = ">=5.2.1,<7.0.0" -typer = ">=0.3.0,<1.0.0" - -[package.extras] -all = ["azure-storage-blob", "boto3", "google-cloud-storage (>=1.26.0,<2.0.0)", "mock", "pytest", "pytest-coverage", "typer-cli"] -azure = ["azure-storage-blob"] -gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] -s3 = ["boto3"] -test = ["mock", "pytest", "pytest-coverage", "typer-cli"] - [[package]] name = "platformdirs" version = "3.10.0" @@ -2596,47 +2420,6 @@ nodeenv = ">=0.11.1" pyyaml = 
">=5.1" virtualenv = ">=20.10.0" -[[package]] -name = "preshed" -version = "3.0.8" -description = "Cython hash table that trusts the keys are pre-hashed" -optional = false -python-versions = ">=3.6" -files = [ - {file = "preshed-3.0.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ea4b6df8ef7af38e864235256793bc3056e9699d991afcf6256fa298858582fc"}, - {file = "preshed-3.0.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e945fc814bdc29564a2ce137c237b3a9848aa1e76a1160369b6e0d328151fdd"}, - {file = "preshed-3.0.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f9a4833530fe53001c351974e0c8bb660211b8d0358e592af185fec1ae12b2d0"}, - {file = "preshed-3.0.8-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1472ee231f323b4f4368b1b5f8f08481ed43af89697d45450c6ae4af46ac08a"}, - {file = "preshed-3.0.8-cp310-cp310-win_amd64.whl", hash = "sha256:c8a2e2931eea7e500fbf8e014b69022f3fab2e35a70da882e2fc753e5e487ae3"}, - {file = "preshed-3.0.8-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0e1bb8701df7861af26a312225bdf7c4822ac06fcf75aeb60fe2b0a20e64c222"}, - {file = "preshed-3.0.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e9aef2b0b7687aecef48b1c6ff657d407ff24e75462877dcb888fa904c4a9c6d"}, - {file = "preshed-3.0.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:854d58a8913ebf3b193b0dc8064155b034e8987de25f26838dfeca09151fda8a"}, - {file = "preshed-3.0.8-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:135e2ac0db1a3948d6ec295598c7e182b52c394663f2fcfe36a97ae51186be21"}, - {file = "preshed-3.0.8-cp311-cp311-win_amd64.whl", hash = "sha256:019d8fa4161035811fb2804d03214143298739e162d0ad24e087bd46c50970f5"}, - {file = "preshed-3.0.8-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a49ce52856fbb3ef4f1cc744c53f5d7e1ca370b1939620ac2509a6d25e02a50"}, - {file = 
"preshed-3.0.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdbc2957b36115a576c515ffe963919f19d2683f3c76c9304ae88ef59f6b5ca6"}, - {file = "preshed-3.0.8-cp36-cp36m-win_amd64.whl", hash = "sha256:09cc9da2ac1b23010ce7d88a5e20f1033595e6dd80be14318e43b9409f4c7697"}, - {file = "preshed-3.0.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e19c8069f1a1450f835f23d47724530cf716d581fcafb398f534d044f806b8c2"}, - {file = "preshed-3.0.8-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25b5ef5e387a0e17ff41202a8c1816184ab6fb3c0d0b847bf8add0ed5941eb8d"}, - {file = "preshed-3.0.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:53d3e2456a085425c66af7baba62d7eaa24aa5e460e1a9e02c401a2ed59abd7b"}, - {file = "preshed-3.0.8-cp37-cp37m-win_amd64.whl", hash = "sha256:85e98a618fb36cdcc37501d8b9b8c1246651cc2f2db3a70702832523e0ae12f4"}, - {file = "preshed-3.0.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7f8837bf616335464f3713cbf562a3dcaad22c3ca9193f957018964ef871a68b"}, - {file = "preshed-3.0.8-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:720593baf2c2e295f855192974799e486da5f50d4548db93c44f5726a43cefb9"}, - {file = "preshed-3.0.8-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0ad3d860b9ce88a74cf7414bb4b1c6fd833813e7b818e76f49272c4974b19ce"}, - {file = "preshed-3.0.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd19d48440b152657966a52e627780c0ddbe9d907b8d7ee4598505e80a3c55c7"}, - {file = "preshed-3.0.8-cp38-cp38-win_amd64.whl", hash = "sha256:246e7c6890dc7fe9b10f0e31de3346b906e3862b6ef42fcbede37968f46a73bf"}, - {file = "preshed-3.0.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:67643e66691770dc3434b01671648f481e3455209ce953727ef2330b16790aaa"}, - {file = "preshed-3.0.8-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:0ae25a010c9f551aa2247ee621457f679e07c57fc99d3fd44f84cb40b925f12c"}, - {file = "preshed-3.0.8-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6a7fcf7dd2e7711051b3f0432da9ec9c748954c989f49d2cd8eabf8c2d953e"}, - {file = "preshed-3.0.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5942858170c4f53d9afc6352a86bbc72fc96cc4d8964b6415492114a5920d3ed"}, - {file = "preshed-3.0.8-cp39-cp39-win_amd64.whl", hash = "sha256:06793022a56782ef51d74f1399925a2ba958e50c5cfbc6fa5b25c4945e158a07"}, - {file = "preshed-3.0.8.tar.gz", hash = "sha256:6c74c70078809bfddda17be96483c41d06d717934b07cab7921011d81758b357"}, -] - -[package.dependencies] -cymem = ">=2.0.2,<2.1.0" -murmurhash = ">=0.28.0,<1.1.0" - [[package]] name = "prompt-toolkit" version = "3.0.39" @@ -3628,27 +3411,6 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] -[[package]] -name = "smart-open" -version = "6.4.0" -description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" -optional = false -python-versions = ">=3.6,<4.0" -files = [ - {file = "smart_open-6.4.0-py3-none-any.whl", hash = "sha256:8d3ef7e6997e8e42dd55c74166ed21e6ac70664caa32dd940b26d54a8f6b4142"}, - {file = "smart_open-6.4.0.tar.gz", hash = "sha256:be3c92c246fbe80ebce8fbacb180494a481a77fcdcb7c1aadb2ea5b9c2bee8b9"}, -] - -[package.extras] -all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "paramiko", "requests"] -azure = ["azure-common", "azure-core", "azure-storage-blob"] -gcs = ["google-cloud-storage (>=2.6.0)"] -http = ["requests"] -s3 = ["boto3"] -ssh = ["paramiko"] -test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "moto[server]", "paramiko", "pytest", "pytest-rerunfailures", "requests", "responses"] -webhdfs = ["requests"] - [[package]] 
name = "smmap" version = "5.0.0" @@ -3671,115 +3433,6 @@ files = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] -[[package]] -name = "spacy" -version = "3.6.1" -description = "Industrial-strength Natural Language Processing (NLP) in Python" -optional = false -python-versions = ">=3.6" -files = [ - {file = "spacy-3.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2fb23b9af51ee8baeea4920d6ffc8ef85bc3ea7a6338dbf330a0626cf6ac6ea9"}, - {file = "spacy-3.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cb00bc74f59b537518a398fd066c0f7a8f029c763cc88afa1a0a59914f639e83"}, - {file = "spacy-3.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f75430fef7e18e6a4c32ca7efa3fb17020eaaa5d7ca0aeac6f663748a32888d"}, - {file = "spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:479132dd3118024e97022735d6ad10d50c789f3979675a8db86e40f333fa335f"}, - {file = "spacy-3.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:385dd3e48a8bb980ec2b8a70831ab3d2d43496357bae91b486c0e99dedb991aa"}, - {file = "spacy-3.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:369c1102eadfcfe155ff1d8d540411b784fe163171e15f02e0b47e030af7c527"}, - {file = "spacy-3.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8ee28656f518e0d454dcc6840a17ec4c6141c055cda86e6b7a772ec6b55cde24"}, - {file = "spacy-3.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f426f312e945191218a3f753d7ce0068f08d27b253de0e30b9fbae81778bb90"}, - {file = "spacy-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c51ceb2e0352c99b1703ef97849c10cb27ceb58348cb76ab4734477d485035b"}, - {file = "spacy-3.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:c6b7184bac8c8f72c4e3dbfd7c82eb0541c03fbccded11412269ae906f0d16c9"}, - {file = "spacy-3.6.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:643b69be30f092cc3215d576d9a194ee01a3da319accdc06ae5a521d83497093"}, - {file = "spacy-3.6.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17424ab01023ece5679fe5c9224241d4ba6b08069b756df77df5b0c857fa762c"}, - {file = "spacy-3.6.1-cp36-cp36m-win_amd64.whl", hash = "sha256:eb93b401f7070fb7e6be64b4d9ac5c69f6ed49c9a7c13532481b425a9ee5d980"}, - {file = "spacy-3.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:46c27249590a0227d33ad33871e99820c2e9890b59f970a37f8f95f4520ca2eb"}, - {file = "spacy-3.6.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:590886ca51ad4509100eeae233d22086e3736ab3ff54bf588f356a0862cdb735"}, - {file = "spacy-3.6.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca97c6052e098f00c0bed89dfa7c0d9a7ea24667d67854baa7dba53c61c8c6f0"}, - {file = "spacy-3.6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:13554a7bda6f9b148f54f3df0870b487c590921eaff0d7ce1a8be15b70e77a92"}, - {file = "spacy-3.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a110dc5bbc5b37176168bb24064f7e49b9f29f5a4857f09114e5953c3754b311"}, - {file = "spacy-3.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3abd2b82dd483c13aeb10720f52416523415ac0af84106f0c1eaae29240fe709"}, - {file = "spacy-3.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77ac5d89d909b30e64873caa93399aa5a1e72b363ae291e297c83a07db6b646f"}, - {file = "spacy-3.6.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3de915f5419ad28d8d1c614c77172ce05b0b59a7c57854f098b7f2da98e28f40"}, - {file = "spacy-3.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:738d806851760c2917e20046332af1ccbef78ff43eaebb23914f4d90ed060539"}, - {file = "spacy-3.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4b5350ad1b70fb9b9e17be220dd866c6b91a950a45cfe6ce524041ef52593621"}, - {file = "spacy-3.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:3b797eedaf29b8726e5fb81e4b839b1734a07c835243a2d59a28cc974d2a9067"}, - {file = "spacy-3.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7762c1944cdacc0d04f5c781c79cc7beb1caa6cbc2b74687a997775f0846cec1"}, - {file = "spacy-3.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3fdee99625ee3c11537182598c81a17d4d4521c73b59e6c1d0ad6749c6654f16"}, - {file = "spacy-3.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:c9d112681d3666a75b07dea8c65a0b3f46ebebb9b90fda568089254134f0d28b"}, - {file = "spacy-3.6.1.tar.gz", hash = "sha256:6323a98706ae2d5561694b03a8b0b5751887a002903a4894e68aeb29cc672166"}, -] - -[package.dependencies] -catalogue = ">=2.0.6,<2.1.0" -cymem = ">=2.0.2,<2.1.0" -jinja2 = "*" -langcodes = ">=3.2.0,<4.0.0" -murmurhash = ">=0.28.0,<1.1.0" -numpy = ">=1.15.0" -packaging = ">=20.0" -pathy = ">=0.10.0" -preshed = ">=3.0.2,<3.1.0" -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" -requests = ">=2.13.0,<3.0.0" -setuptools = "*" -smart-open = ">=5.2.1,<7.0.0" -spacy-legacy = ">=3.0.11,<3.1.0" -spacy-loggers = ">=1.0.0,<2.0.0" -srsly = ">=2.4.3,<3.0.0" -thinc = ">=8.1.8,<8.2.0" -tqdm = ">=4.38.0,<5.0.0" -typer = ">=0.3.0,<0.10.0" -wasabi = ">=0.9.1,<1.2.0" - -[package.extras] -apple = ["thinc-apple-ops (>=0.1.0.dev0,<1.0.0)"] -cuda = ["cupy (>=5.0.0b4,<13.0.0)"] -cuda-autodetect = ["cupy-wheel (>=11.0.0,<13.0.0)"] -cuda100 = ["cupy-cuda100 (>=5.0.0b4,<13.0.0)"] -cuda101 = ["cupy-cuda101 (>=5.0.0b4,<13.0.0)"] -cuda102 = ["cupy-cuda102 (>=5.0.0b4,<13.0.0)"] -cuda110 = ["cupy-cuda110 (>=5.0.0b4,<13.0.0)"] -cuda111 = ["cupy-cuda111 (>=5.0.0b4,<13.0.0)"] -cuda112 = ["cupy-cuda112 (>=5.0.0b4,<13.0.0)"] -cuda113 = ["cupy-cuda113 (>=5.0.0b4,<13.0.0)"] -cuda114 = ["cupy-cuda114 (>=5.0.0b4,<13.0.0)"] -cuda115 = ["cupy-cuda115 (>=5.0.0b4,<13.0.0)"] -cuda116 = ["cupy-cuda116 (>=5.0.0b4,<13.0.0)"] -cuda117 = ["cupy-cuda117 (>=5.0.0b4,<13.0.0)"] -cuda11x = ["cupy-cuda11x (>=11.0.0,<13.0.0)"] -cuda12x = 
["cupy-cuda12x (>=11.5.0,<13.0.0)"] -cuda80 = ["cupy-cuda80 (>=5.0.0b4,<13.0.0)"] -cuda90 = ["cupy-cuda90 (>=5.0.0b4,<13.0.0)"] -cuda91 = ["cupy-cuda91 (>=5.0.0b4,<13.0.0)"] -cuda92 = ["cupy-cuda92 (>=5.0.0b4,<13.0.0)"] -ja = ["sudachidict-core (>=20211220)", "sudachipy (>=0.5.2,!=0.6.1)"] -ko = ["natto-py (>=0.9.0)"] -lookups = ["spacy-lookups-data (>=1.0.3,<1.1.0)"] -ray = ["spacy-ray (>=0.1.0,<1.0.0)"] -th = ["pythainlp (>=2.0)"] -transformers = ["spacy-transformers (>=1.1.2,<1.3.0)"] - -[[package]] -name = "spacy-legacy" -version = "3.0.12" -description = "Legacy registered functions for spaCy backwards compatibility" -optional = false -python-versions = ">=3.6" -files = [ - {file = "spacy-legacy-3.0.12.tar.gz", hash = "sha256:b37d6e0c9b6e1d7ca1cf5bc7152ab64a4c4671f59c85adaf7a3fcb870357a774"}, - {file = "spacy_legacy-3.0.12-py2.py3-none-any.whl", hash = "sha256:476e3bd0d05f8c339ed60f40986c07387c0a71479245d6d0f4298dbd52cda55f"}, -] - -[[package]] -name = "spacy-loggers" -version = "1.0.5" -description = "Logging utilities for SpaCy" -optional = false -python-versions = ">=3.6" -files = [ - {file = "spacy-loggers-1.0.5.tar.gz", hash = "sha256:d60b0bdbf915a60e516cc2e653baeff946f0cfc461b452d11a4d5458c6fe5f24"}, - {file = "spacy_loggers-1.0.5-py3-none-any.whl", hash = "sha256:196284c9c446cc0cdb944005384270d775fdeaf4f494d8e269466cfa497ef645"}, -] - [[package]] name = "sqltrie" version = "0.7.0" @@ -3800,46 +3453,6 @@ pygtrie = "*" dev = ["mypy (==0.971)", "pyinstaller", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-benchmark", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)"] tests = ["mypy (==0.971)", "pyinstaller", "pylint (==2.15.0)", "pytest (==7.2.0)", "pytest-benchmark", "pytest-cov (==3.0.0)", "pytest-mock (==3.8.2)", "pytest-sugar (==0.9.5)"] -[[package]] -name = "srsly" -version = "2.4.7" -description = "Modern high-performance serialization utilities for Python" -optional = false -python-versions = ">=3.6" -files = [ - {file = 
"srsly-2.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:38506074cfac43f5581b6b22c335dc4d43ef9a82cbe9fe2557452e149d4540f5"}, - {file = "srsly-2.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:efd401ac0b239f3c7c0070fcd613f10a4a01478ff5fe7fc8527ea7a23dfa3709"}, - {file = "srsly-2.4.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd1be19502fda87108c8055bce6537ec332266057f595133623a4a18e56a91a1"}, - {file = "srsly-2.4.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87e86be5fd655ed554e4bf6b63a4eb3380ffb40752d0621323a3df879d3e6407"}, - {file = "srsly-2.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:7be5def9b6ac7896ce326997498b8155b9167ddc672fb209a200090c7fe45a4b"}, - {file = "srsly-2.4.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:bb3d54563e33816d33695b58f9daaea410fcd0b9272aba27050410a5279ba8d8"}, - {file = "srsly-2.4.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2848735a9fcb0ad9ec23a6986466de7942280a01dbcb7b66583288f1378afba1"}, - {file = "srsly-2.4.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:282d59a37c271603dd790ab25fa6521c3d3fdbca67bef3ee838fd664c773ea0d"}, - {file = "srsly-2.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7affecb281db0683fe78181d644f6d6a061948fa318884c5669a064b97869f54"}, - {file = "srsly-2.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:76d991167dc83f8684fb366a092a03f51f7582741885ba42444ab577e61ae198"}, - {file = "srsly-2.4.7-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7a7278470bbad3831c9d8abd7f7b9fa9a3d6cd29f797f913f7a04ade5668715"}, - {file = "srsly-2.4.7-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:654496a07fcf11ba823e9a16f263001271f04d8b1bfd8d94ba6130a1649fc6d8"}, - {file = "srsly-2.4.7-cp36-cp36m-win_amd64.whl", hash = "sha256:89e35ead948349b2a8d47600544dbf49ff737d15a899bc5a71928220daee2807"}, - {file = 
"srsly-2.4.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3e0f0410faf9d5dc5c58caf907a4b0b94e6dc766289e329a15ddf8adca264d1c"}, - {file = "srsly-2.4.7-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c3422ab7ed37438086a178e611be85b7001e0071882655fcb8dca83c4f5f57d"}, - {file = "srsly-2.4.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a81186f9c1beb0892fcef4fd6350e6ee0d2d700da5042e400ec6da65a0b52fb"}, - {file = "srsly-2.4.7-cp37-cp37m-win_amd64.whl", hash = "sha256:1fe4a9bf004174f0b73b3fc3a96d35811c218e0441f4246ac4cb3f06daf0ca12"}, - {file = "srsly-2.4.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:86501eb25c6615d934bde0aea98d705ce7edd11d070536162bd2fa8606034f0f"}, - {file = "srsly-2.4.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f46bc563a7b80f81aed8dd12f86ef43b93852d937666f44a3d04bcdaa630376c"}, - {file = "srsly-2.4.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e60cd20f08b8a0e200017c6e8f5af51321878b17bf7da284dd81c7604825c6e"}, - {file = "srsly-2.4.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c90953a58dfde2eeaea15749c7dddad2a508b48b17d084b491d56d5213ef2a37"}, - {file = "srsly-2.4.7-cp38-cp38-win_amd64.whl", hash = "sha256:7c9a1dc7077b4a101fd018c1c567ec735203887e016a813588557f5c4ce2de8b"}, - {file = "srsly-2.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c8ada26613f49f72baa573dbd7e911f3af88b647c3559cb6641c97ca8dd7cfe0"}, - {file = "srsly-2.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:267f6ac1b8388a4649a6e6299114ff2f6af03bafd60fc8f267e890a9becf7057"}, - {file = "srsly-2.4.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75f2777cc44ad34c5f2239d44c8cd56b0263bf19bc6c1593dcc765e2a21fc5e7"}, - {file = "srsly-2.4.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2059d447cfe5bf6692634cbfbbb2d5663f554023b0aa0ee3d348387d9ec9345a"}, - {file = 
"srsly-2.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:422e44d702da4420c47012d309fc56b5081ca06a500393d83114eb09d71bf1ce"}, - {file = "srsly-2.4.7.tar.gz", hash = "sha256:93c2cc4588778261ccb23dd0543b24ded81015dd8ab4ec137cd7d04965035d08"}, -] - -[package.dependencies] -catalogue = ">=2.0.3,<2.1.0" - [[package]] name = "sympy" version = "1.12" @@ -3882,81 +3495,6 @@ files = [ [package.extras] doc = ["reno", "sphinx", "tornado (>=4.5)"] -[[package]] -name = "thinc" -version = "8.1.12" -description = "A refreshing functional take on deep learning, compatible with your favorite libraries" -optional = false -python-versions = ">=3.6" -files = [ - {file = "thinc-8.1.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:efda431bc1513e81e457dbff4ef1610592569ddc362f8df24422628b195d51f4"}, - {file = "thinc-8.1.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01dbe9063171c1d0df29374a3857ee500fb8acf8f33bd8a85d11214d7453ff7a"}, - {file = "thinc-8.1.12-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fcfe97b80aa02a6cdeef9f5e3127822a13497a9b6f58653da4ff3caf321e3c4"}, - {file = "thinc-8.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c52d0657c61b7e1a382cb5ee1ee71692a0e9c47bef9f3e02ac3492b26056d27"}, - {file = "thinc-8.1.12-cp310-cp310-win_amd64.whl", hash = "sha256:b2078018c8bc36540b0c007cb1909f6c81c9a973b3180d15b934414f08988b28"}, - {file = "thinc-8.1.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:340171c1927592082c79509e5a964766e2d65c2e30c5e583489488935a9a2340"}, - {file = "thinc-8.1.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:88e8c9cd5119d5dbb0c4ed1bdde5acd6cf12fe1b3316647ecbd79fb12e3ef542"}, - {file = "thinc-8.1.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15c6cb31138814599426bd8855b9fc9d8d8ddb2bde1c91d204353b5e5af15deb"}, - {file = "thinc-8.1.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:5dc3117db83ec0d423480b6c77de90f658dfaed5f7a2bbc3d640f1f6c7ff0fe7"}, - {file = "thinc-8.1.12-cp311-cp311-win_amd64.whl", hash = "sha256:f9ac43fd02e952c005753f85bd375c03baea5fa818a6a4942930177c31130eca"}, - {file = "thinc-8.1.12-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4241d0b8c9e813a1fbba05b6dc7d7056c0a2601b8a1119d372e85185068009e6"}, - {file = "thinc-8.1.12-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c141e42e610605a9c6def19e5dbb4877353839a610e3cdb1fa68e70f6b39492a"}, - {file = "thinc-8.1.12-cp36-cp36m-win_amd64.whl", hash = "sha256:9388c1427b4c3615967e1be19fa93427be61241392bdd5a84ab1da0f96c6bcfb"}, - {file = "thinc-8.1.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:f6fb12692fae1a056432800f94ec88fa714eb1111aff9eabd61d2dfe10beb713"}, - {file = "thinc-8.1.12-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e51c693d477e02eab164a67b588fcdbb3609bc54ec39de6084da2dd9a356b8f8"}, - {file = "thinc-8.1.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4265f902f9a597be294765479ef6535d679e497fa2fed955cbcabcfdd82f81ad"}, - {file = "thinc-8.1.12-cp37-cp37m-win_amd64.whl", hash = "sha256:4586d6709f3811db85e192fdf519620b3326d28e5f0193cef8544b057e20a951"}, - {file = "thinc-8.1.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e10a648872e9ebbe115fa5fba0d515e8226bd0e2de0abd41d55f1ae04017813c"}, - {file = "thinc-8.1.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:27231eb1d468e7eb97f255c3d1e985d5a0cb8e309e0ec01b29cce2de836b8db2"}, - {file = "thinc-8.1.12-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8ece3880ac05d6bb75ecdbd9c03298e6f9691e5cb7480c1f15e66e33fe34004"}, - {file = "thinc-8.1.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:285f1141ecd7a9b61e2fed58b609c194b40e6ae5daf1e1e8dec31616bc9ffca1"}, - {file = "thinc-8.1.12-cp38-cp38-win_amd64.whl", hash = 
"sha256:0400632aa235cfbbc0004014e90cdf54cd42333aa7f5e971ffe87c8125e607ed"}, - {file = "thinc-8.1.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2edb3ef3a02f966eae8c5c56feb80ad5b6e5c221c94fcd95eb413d09d0d82212"}, - {file = "thinc-8.1.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e078d3b00e51c597f3f301d3e2925d0842d0725f251ff9a53a1e1b4110d4b9c1"}, - {file = "thinc-8.1.12-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d0ac2f6a0b38ddb913f9b31d8c4b13b98a7f5f62db211e0d8ebefbda5138757"}, - {file = "thinc-8.1.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47cde897cf54bc731a3a7c2e51a6ef01a86687ab7ae90ab0e9fc5d2294fe0fba"}, - {file = "thinc-8.1.12-cp39-cp39-win_amd64.whl", hash = "sha256:1b846c35a24b5b33e5d240f514f3a9e8bac2b6a10491caa147753dc50740a400"}, - {file = "thinc-8.1.12.tar.gz", hash = "sha256:9dd12c5c79b176f077ce9416b49c9752782bd76ff0ea649d66527882e83ea353"}, -] - -[package.dependencies] -blis = ">=0.7.8,<0.8.0" -catalogue = ">=2.0.4,<2.1.0" -confection = ">=0.0.1,<1.0.0" -cymem = ">=2.0.2,<2.1.0" -murmurhash = ">=1.0.2,<1.1.0" -numpy = {version = ">=1.19.0", markers = "python_version >= \"3.9\""} -packaging = ">=20.0" -preshed = ">=3.0.2,<3.1.0" -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" -setuptools = "*" -srsly = ">=2.4.0,<3.0.0" -wasabi = ">=0.8.1,<1.2.0" - -[package.extras] -cuda = ["cupy (>=5.0.0b4)"] -cuda-autodetect = ["cupy-wheel (>=11.0.0)"] -cuda100 = ["cupy-cuda100 (>=5.0.0b4)"] -cuda101 = ["cupy-cuda101 (>=5.0.0b4)"] -cuda102 = ["cupy-cuda102 (>=5.0.0b4)"] -cuda110 = ["cupy-cuda110 (>=5.0.0b4)"] -cuda111 = ["cupy-cuda111 (>=5.0.0b4)"] -cuda112 = ["cupy-cuda112 (>=5.0.0b4)"] -cuda113 = ["cupy-cuda113 (>=5.0.0b4)"] -cuda114 = ["cupy-cuda114 (>=5.0.0b4)"] -cuda115 = ["cupy-cuda115 (>=5.0.0b4)"] -cuda116 = ["cupy-cuda116 (>=5.0.0b4)"] -cuda117 = ["cupy-cuda117 (>=5.0.0b4)"] -cuda11x = ["cupy-cuda11x (>=11.0.0)"] -cuda80 = ["cupy-cuda80 (>=5.0.0b4)"] -cuda90 = ["cupy-cuda90 
(>=5.0.0b4)"] -cuda91 = ["cupy-cuda91 (>=5.0.0b4)"] -cuda92 = ["cupy-cuda92 (>=5.0.0b4)"] -datasets = ["ml-datasets (>=0.2.0,<0.3.0)"] -mxnet = ["mxnet (>=1.5.1,<1.6.0)"] -tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] -torch = ["torch (>=1.6.0)"] - [[package]] name = "threadpoolctl" version = "3.2.0" @@ -4653,4 +4191,4 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "c246890fd08f7d69ace373434b1d4cf0adc5bcb1f76177ee34d28ad12b839afa" +content-hash = "0c80743677efb7db358ce4088a77a8f3dc1200ca71f02489f552d90f9b50a1d4" diff --git a/pyproject.toml b/pyproject.toml index 964f7a19..edcc4bf9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ loguru = "^0.7.0" wandb = "^0.15.4" openai = "0.27.8" openai-multi-client = "^0.1.1" -spacy = "^3.6.1" [tool.poetry.group.dev] From 487f8cf47440ed54484063c1b84ed3df70b29dc6 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 12:51:32 +0100 Subject: [PATCH 070/102] Adds tests for augment --- grants_tagger_light/augmentation/augment.py | 24 ++++++-- grants_tagger_light/retagging/retagging.py | 4 +- pipelines/bertmesh/dvc.yaml | 36 ++++++++--- pytest.ini | 2 + tests/test_augment.py | 68 +++++++++++++++++++++ 5 files changed, 118 insertions(+), 16 deletions(-) create mode 100644 pytest.ini create mode 100644 tests/test_augment.py diff --git a/grants_tagger_light/augmentation/augment.py b/grants_tagger_light/augmentation/augment.py index 900f8817..80b00ece 100644 --- a/grants_tagger_light/augmentation/augment.py +++ b/grants_tagger_light/augmentation/augment.py @@ -50,6 +50,7 @@ def augment( prompt_template: str = "grants_tagger_light/augmentation/prompt.template", concurrent_calls: int = os.cpu_count() * 2, temperature: float = 1.5, + tags: list = None, tags_file_path: str = None, ): if model_key.strip().lower() not in ["gpt-3.5-turbo", "text-davinci", "gpt-4"]: @@ -60,6 +61,7 @@ def augment( dset = load_from_disk(os.path.join(data_path, "dataset")) if 
"train" in dset: dset = dset["train"] + logger.info("Obtaining count values from the labels...") pool = multiprocessing.Pool(processes=num_proc) element_counts_list = pool.map(_count_elements_in_sublist, dset["meshMajor"]) @@ -71,16 +73,20 @@ def augment( merged_element_counts.items(), key=lambda x: x[1], reverse=True ) sorted_merged_element_counts_dict = dict(sorted_merged_element_counts) + + if tags is None: + tags = [] if tags_file_path is not None: with open(tags_file_path, "r") as f: - tags = f.read().split("\n") + tags.extend([x.strip() for x in f.readlines()]) logger.info( f"Tags file path found. Filtering {len(tags)} tags " f"(examples found: {tags[:15]}...)" ) - sorted_merged_element_counts_dict = { - k: v for k, v in sorted_merged_element_counts_dict.items() if k in tags - } + if len(tags) > 0: + sorted_merged_element_counts_dict = { + k: v for k, v in sorted_merged_element_counts_dict.items() if k in tags + } if min_examples is not None: sorted_merged_element_counts_dict = { @@ -191,6 +197,7 @@ def augment_cli( max=2, help="A value between 0 and 2. The bigger - the more creative.", ), + tags: str = typer.Option(None, help="Comma separated list of tags to retag"), tags_file_path: str = typer.Option( None, help="Text file containing one line per tag to be considered. 
" @@ -204,13 +211,17 @@ def augment_cli( ) exit(-1) - if tags_file_path is None and min_examples is None: + if tags_file_path is None and tags is None and min_examples is None: logger.error( "To understand which tags need to be augmented, " - "set either --min-examples or --tags-file-path" + "set either --min-examples or --tags-file-path or --tags" ) exit(-1) + if tags_file_path is not None and not os.path.isfile(tags_file_path): + logger.error(f"{tags_file_path} not found") + exit(-1) + if float(temperature) > 2.0 or float(temperature) < -2.0: logger.error("Temperature should be in the range [-2, 2]") exit(-1) @@ -226,5 +237,6 @@ def augment_cli( prompt_template=prompt_template, concurrent_calls=concurrent_calls, temperature=temperature, + tags=parse_tags(tags), tags_file_path=tags_file_path, ) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index fe927983..70cbd25c 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -122,9 +122,11 @@ def retag( lambda x: any(np.isin(years, [str(x["year"])])), num_proc=num_proc ) + if tags is None: + tags = [] if tags_file_path is not None and os.path.isfile(tags_file_path): with open(tags_file_path, "r") as f: - tags = [x.strip() for x in f.readlines()] + tags.extend([x.strip() for x in f.readlines()]) logging.info(f"- Total tags detected: {tags}.") logging.info("- Training classifiers (retaggers)") diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index f302c669..27052038 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -1,5 +1,5 @@ vars: - - data_path: "../../data/raw/allMeSH_2021.json" + - data_path: "../../data/raw/allMeSH_2021.jsonl" - script_loc: "../../grants_tagger_light/training" - output_dir: "../../bertmesh_outs/pipeline_test" @@ -8,15 +8,33 @@ stages: cmd: >- python ${script_loc}/train.py --model_key "" - --data_path ${data_path} - --output_dir ${output_dir} - 
--max_samples 10 - --per_device_train_batch_size 4 - --per_device_eval_batch_size 4 + --data_path ${data_path} + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 1 \ + --multilabel_attention True \ + --freeze_backbone unfreeze \ + --num_train_epochs 7 \ + --learning_rate 5e-5 \ + --dropout 0.1 \ + --hidden_size 1024 \ + --warmup_steps 5000 \ + --max_grad_norm 2.0 \ + --scheduler_type cosine_hard_restart \ + --weight_decay 0.2 \ + --correct_bias True \ + --threshold 0.25 \ + --prune_labels_in_evaluation True \ + --hidden_dropout_prob 0.2 \ + --attention_probs_dropout_prob 0.2 \ + --fp16 \ + --torch_compile \ + --evaluation_strategy epoch \ + --eval_accumulation_steps 20 \ + --save_strategy epoch \ + --wandb_project wellcome-mesh \ + --wandb_name test-train-all \ + --wandb_api_key ${WANDB_API_KEY} deps: - ${data_path} - params: - - ../../grants_tagger_light/training/cli_args/train_args.py: - - BertMeshTrainingArguments outs: - ${output_dir}/best diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..039d0a2f --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +log_cli = true \ No newline at end of file diff --git a/tests/test_augment.py b/tests/test_augment.py new file mode 100644 index 00000000..d22aa1d5 --- /dev/null +++ b/tests/test_augment.py @@ -0,0 +1,68 @@ +import logging +import os +import tempfile +import unittest +import pytest + +from grants_tagger_light.augmentation.augment import augment +from grants_tagger_light.preprocessing.preprocess_mesh import preprocess_mesh + +# Note dummy data is not necessarily annotated correctly +dummy_data = """{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} 
+{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["COVID-19","SARS-CoV-2"],"year":"2023","abstractText":"This is an article about coronavirus.","title":"article1","pmid":"pmid1"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} 
+{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"} +{"journal":"dummyJournal","meshMajor":["Malaria"],"year":"2023","abstractText":"This is an article about malaria", "title": "article3", "pmid": "pmid3"}""" # noqa + + +@pytest.fixture +def data_path(): + with tempfile.TemporaryDirectory() as tmpdirname: + data_path = tmpdirname + "/data.jsonl" + with open(data_path, "w") as f: + f.write(dummy_data) + yield data_path + + +@pytest.mark.skipif('OPENAI_API_KEY' not in os.environ, + reason="requires OPENAI_API_KEY installed") +def test_augment(data_path): + logging.basicConfig(level=logging.INFO) + with tempfile.TemporaryDirectory() as tmpdirname: + preprocessing_path = tmpdirname + save_to_path = tmpdirname + "/augmented.jsonl" + + preprocess_mesh( + data_path=data_path, save_to_path=preprocessing_path, model_key="", num_proc=2, batch_size=1, test_size=0.5 + ) + + augment( + preprocessing_path, + save_to_path, + model_key="gpt-3.5-turbo", + examples=1, + concurrent_calls=1, + tags=['COVID-19'] + ) + + with open(save_to_path, 'r') as f: + logging.info(f.read()) + + +if __name__ == "__main__": + unittest.main() From efacba7a84fe69a8e64f0bbe0461bf3acafecb76 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 12:54:47 +0100 Subject: [PATCH 071/102] Black --- tests/test_augment.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_augment.py b/tests/test_augment.py index d22aa1d5..51745674 100644 --- a/tests/test_augment.py +++ b/tests/test_augment.py @@ -39,8 +39,9 @@ def data_path(): yield data_path -@pytest.mark.skipif('OPENAI_API_KEY' not in os.environ, - reason="requires OPENAI_API_KEY installed") +@pytest.mark.skipif( + "OPENAI_API_KEY" not in os.environ, reason="requires OPENAI_API_KEY installed" +) def test_augment(data_path): logging.basicConfig(level=logging.INFO) with tempfile.TemporaryDirectory() as tmpdirname: @@ -48,7 +49,12 @@ def test_augment(data_path): save_to_path = tmpdirname + "/augmented.jsonl" preprocess_mesh( - data_path=data_path, save_to_path=preprocessing_path, model_key="", num_proc=2, batch_size=1, test_size=0.5 + data_path=data_path, + save_to_path=preprocessing_path, + model_key="", + num_proc=2, + batch_size=1, + test_size=0.5, ) augment( @@ -57,10 +63,10 @@ def test_augment(data_path): model_key="gpt-3.5-turbo", examples=1, concurrent_calls=1, - tags=['COVID-19'] + tags=["COVID-19"], ) - with open(save_to_path, 'r') as f: + with open(save_to_path, "r") as f: logging.info(f.read()) From 6e01ee00da914c3874ee471535bdd3c11a4fbada Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 13:26:11 +0100 Subject: [PATCH 072/102] Better error management. 
Black --- examples/augment.sh | 6 ++++-- examples/augment_specific_tags.sh | 5 ----- 2 files changed, 4 insertions(+), 7 deletions(-) delete mode 100644 examples/augment_specific_tags.sh diff --git a/examples/augment.sh b/examples/augment.sh index 9ad482b1..1f5fa1ae 100644 --- a/examples/augment.sh +++ b/examples/augment.sh @@ -1,3 +1,5 @@ -grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \ - --min-examples 25 \ +# Augments data using a file with 1 label per line and years +grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FILE] \ + --tags "Environmental Science" \ + --examples 25 \ --concurrent-calls 25 \ No newline at end of file diff --git a/examples/augment_specific_tags.sh b/examples/augment_specific_tags.sh deleted file mode 100644 index 3ce920c8..00000000 --- a/examples/augment_specific_tags.sh +++ /dev/null @@ -1,5 +0,0 @@ -# Augments data using a file with 1 label per line and years -grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \ - --tags-file-path tags_to_augment.txt \ - --examples 25 \ - --concurrent-calls 25 \ No newline at end of file From 3e91ec665b6255066117732a6ca29ed4e76d4fcc Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 13:27:16 +0100 Subject: [PATCH 073/102] "Better error management. 
Black" --- examples/augment.sh | 4 ++-- grants_tagger_light/augmentation/augment.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/examples/augment.sh b/examples/augment.sh index 1f5fa1ae..b02b1d2e 100644 --- a/examples/augment.sh +++ b/examples/augment.sh @@ -1,5 +1,5 @@ # Augments data using a file with 1 label per line and years grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FILE] \ - --tags "Environmental Science" \ + --tags "Mathematics" \ --examples 25 \ - --concurrent-calls 25 \ No newline at end of file + --concurrent-calls 1 \ No newline at end of file diff --git a/grants_tagger_light/augmentation/augment.py b/grants_tagger_light/augmentation/augment.py index 80b00ece..e548d36b 100644 --- a/grants_tagger_light/augmentation/augment.py +++ b/grants_tagger_light/augmentation/augment.py @@ -14,6 +14,7 @@ from grants_tagger_light.augmentation.parallel_augment_openai import ( ParallelAugmentOpenAI, ) +from grants_tagger_light.utils.years_tags_parser import parse_tags augment_app = typer.Typer() @@ -74,6 +75,7 @@ def augment( ) sorted_merged_element_counts_dict = dict(sorted_merged_element_counts) + print(f"Tags: {tags}") if tags is None: tags = [] if tags_file_path is not None: @@ -87,6 +89,7 @@ def augment( sorted_merged_element_counts_dict = { k: v for k, v in sorted_merged_element_counts_dict.items() if k in tags } + logger.info(f"Tags count dictionary: {sorted_merged_element_counts_dict}") if min_examples is not None: sorted_merged_element_counts_dict = { @@ -95,11 +98,27 @@ def augment( if v < min_examples } + if len(sorted_merged_element_counts_dict.keys()) < 1: + logger.error( + "I did not find any examples for your tags in your preprocessed folder. 
Try:\n" + "- Other train/set split in `preprocess`;\n" + "- Other years;\n" + "- Other tags;" + ) + exit(-1) + with open(f"{save_to_path}.count", "w") as f: f.write(json.dumps(sorted_merged_element_counts_dict, indent=2)) tags_to_augment = list(sorted_merged_element_counts_dict.keys()) + if len(tags_to_augment) < concurrent_calls: + logger.error( + "Found less tags than concurrent calls to OpenAI." + f" Overwritting `concurrent-calls` to {len(tags_to_augment)}" + ) + concurrent_calls = len(tags_to_augment) + biggest_tags_to_augment = [ f"{k}({sorted_merged_element_counts_dict[k]})" for k in tags_to_augment[:5] ] From 10c4468df3d2fc9734802b61c660372884a8dbde Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 13:28:12 +0100 Subject: [PATCH 074/102] Ruff --- grants_tagger_light/augmentation/augment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/grants_tagger_light/augmentation/augment.py b/grants_tagger_light/augmentation/augment.py index e548d36b..fbb58046 100644 --- a/grants_tagger_light/augmentation/augment.py +++ b/grants_tagger_light/augmentation/augment.py @@ -100,7 +100,8 @@ def augment( if len(sorted_merged_element_counts_dict.keys()) < 1: logger.error( - "I did not find any examples for your tags in your preprocessed folder. Try:\n" + "I did not find any examples for your tags " + "in your preprocessed folder. Try:\n" "- Other train/set split in `preprocess`;\n" "- Other years;\n" "- Other tags;" From 7206885967291aedbc58230a98c79fbbe08b3fcb Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 16:17:31 +0100 Subject: [PATCH 075/102] Feedback from PR comments. Merging latest main into this branch. 
--- README.md | 37 ++----------- examples/retag.sh | 2 +- grants_tagger_light/augmentation/augment.py | 2 +- grants_tagger_light/retagging/retagging.py | 61 ++++++++++++++------- poetry.lock | 2 +- pyproject.toml | 1 + 6 files changed, 51 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 99bab44d..271de34e 100644 --- a/README.md +++ b/README.md @@ -48,33 +48,7 @@ For inference, CPU-support should suffice. You now have access to the `grants-tagger` command line interface! -## 3. For `retagging` you will need to make sure you have `openjdk 8 (or 11)` installed to run Spark -First, make sure you don't have java installed or you have another version that it's not java 8 or 11. -```shell -java -version -``` - -If you don't or you have another version, install it (example for java 8): -```shell -sudo apt update -sudo apt install openjdk-8-jdk -``` - -Make sure you set by default the one we have just installed. Copy the path to the java folder from: -```shell -sudo update-alternatives --config java -``` - -And now, set your JAVA_HOME env var: -```shell -sudo vim /etc/environment -JAVA_HOME="[PATH_TO_THE_JAVA_FOLDER] -``` - -Restar the shell or do `source /etc/environment` - - -## OPTIONAL: 4. Install MantisNLP `remote` to connect to a remote AWS instances +## OPTIONAL: 3. 
Install MantisNLP `remote` to connect to a remote AWS instances `pip install git+https://github.com/ivyleavedtoadflax/remote.py.git` Then add your instance `remote config add [instance_name]` @@ -254,14 +228,14 @@ grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_ ### Other params ``` - Usage: grants-tagger augment mesh [OPTIONS] DATA_PATH SAVE_TO_PATH +Usage: grants-tagger augment mesh [OPTIONS] DATA_PATH SAVE_TO_PATH ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ * data_path TEXT Path to mesh.jsonl [default: None] [required] │ -│ * save_to_path TEXT Path to save the new generated data in jsonl format +│ * data_path TEXT Path to folder after `preprocess` [default: None] [required] │ +│ * save_to_path TEXT Path to save the new jsonl data [default: None] [required] │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --model-key TEXT LLM to use data augmentation. By now, only `openai` is supported [default: gpt-3.5-turbo] │ +│ --model-key TEXT LLM to use data augmentation. By now, only openai models are supported [default: gpt-3.5-turbo] │ │ --num-proc INTEGER Number of processes to use for data augmentation [default: 8] │ │ --batch-size INTEGER Preprocessing batch size (for dataset, filter, map, ...) [default: 64] │ │ --min-examples INTEGER Minimum number of examples to require. Less than that will trigger data augmentation. 
[default: None] │ @@ -270,6 +244,7 @@ grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_ │ [default: grants_tagger_light/augmentation/prompt.template] │ │ --concurrent-calls INTEGER RANGE [x>=1] Concurrent calls with 1 tag each to the different model [default: 16] │ │ --temperature FLOAT RANGE [0<=x<=2] A value between 0 and 2. The bigger - the more creative. [default: 1.5] │ +│ --tags TEXT Comma separated list of tags to retag [default: None] │ │ --tags-file-path TEXT Text file containing one line per tag to be considered. The rest will be discarded. [default: None] │ │ --help Show this message and exit. │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git a/examples/retag.sh b/examples/retag.sh index 4bce6dc0..2a8c3a32 100644 --- a/examples/retag.sh +++ b/examples/retag.sh @@ -1,5 +1,5 @@ grants-tagger retag mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FILE_HERE] \ - --tags "Artificial Intelligence,HIV" \ + --tags "Artificial Intelligence,HIV,Data Collection,Mathematics,Geography" \ --years 2016,2017,2018,2019,2020,2021 \ --train-examples 100 \ --batch-size 10000 \ diff --git a/grants_tagger_light/augmentation/augment.py b/grants_tagger_light/augmentation/augment.py index fbb58046..3bfc1ed0 100644 --- a/grants_tagger_light/augmentation/augment.py +++ b/grants_tagger_light/augmentation/augment.py @@ -182,7 +182,7 @@ def augment( @augment_app.command() def augment_cli( - data_path: str = typer.Argument(..., help="Path to mesh.jsonl"), + data_path: str = typer.Argument(..., help="Path to folder after `preprocess`"), save_to_path: str = typer.Argument(..., help="Path to save the new jsonl data"), model_key: str = typer.Option( "gpt-3.5-turbo", diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 
70cbd25c..6ad4a133 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -2,6 +2,7 @@ import logging import random import time +from colorama import Fore, Back, Style import typer from loguru import logger @@ -42,15 +43,19 @@ def _annotate(curation_file, dset, tag, limit, is_positive): field = "positive" if is_positive else "negative" human_supervision = {tag: {"positive": [], "negative": []}} if os.path.isfile(curation_file): - prompt = ( - f"File `{curation_file}` found. Do you want to reuse previous work? [y|n]: " - ) - answer = input(prompt) - while answer not in ["y", "n"]: + with open(curation_file, "r") as f: + human_supervision = json.load(f) + if len(human_supervision[tag][field]) > 0: + prompt = ( + f"{field.title()} examples for `{curation_file}` found. " + f"Do you want to reuse them? [y|n]: " + ) answer = input(prompt) - if answer == "y": - with open(curation_file, "r") as f: - human_supervision = json.load(f) + while answer not in ["y", "n"]: + logging.error(f"{answer} not valid (only `y` or `n` accepted)") + answer = input(prompt) + if answer == "n": + human_supervision[tag][field] = [] count = len(human_supervision[tag][field]) logging.info( @@ -77,12 +82,21 @@ def _annotate(curation_file, dset, tag, limit, is_positive): if finished: break print("=" * 50) - print(dset[random_pos_row]["abstractText"]) + keywords = [] + for k in tag.split(' '): + keywords.extend(k.split(',')) + text = Fore.YELLOW + dset[random_pos_row]["abstractText"] + Style.RESET_ALL + for k in keywords: + text = text.replace(k.lower(), Back.BLUE + k.lower() + Back.RESET) + text = text.replace(k.upper(), Back.BLUE + k.upper() + Back.RESET) + text = text.replace(k.title(), Back.BLUE + k.title() + Back.RESET) + text = text.replace(k.capitalize(), Back.BLUE + k.capitalize() + Back.RESET) + print(text) print("=" * 50) - res = input( - f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""}' - f" a `{tag}` text? 
[a to accept]: " - ) + res = input(Style.BRIGHT + + f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""}' + f" a `{tag}` text? [a to accept]: " + Style.RESET_ALL + ) if res == "a": human_supervision[tag][field].append(dset[random_pos_row]) with open(curation_file, "w") as f: @@ -131,6 +145,8 @@ def retag( logging.info(f"- Total tags detected: {tags}.") logging.info("- Training classifiers (retaggers)") + # First, we annotate all the tags. + # We don't run classification until all tags are annotated for tag in tags: os.makedirs(os.path.join(save_to_path, tag.replace(" ", "")), exist_ok=True) logging.info(f"- Obtaining positive examples for {tag}...") @@ -176,6 +192,7 @@ def retag( logging.info("- Retagging...") + # Second, for each tag, we train classifiers. models = {} for tag in tags: curation_file = os.path.join(save_to_path, tag.replace(" ", ""), "curation") @@ -198,14 +215,14 @@ def retag( ) pos_x_train = pos_x_train.add_column("tag", [tag] * len(pos_x_train)) - pos_x_test = pos_x_test.add_column("tag", [tag] * len(pos_x_test)) neg_x_train = neg_x_train.add_column("tag", ["other"] * len(neg_x_train)) - neg_x_test = neg_x_test.add_column("tag", ["other"] * len(neg_x_test)) logging.info("- Creating train/test sets...") train = concatenate_datasets([pos_x_train, neg_x_train]) # TODO: Use Evaluation on `test` to see if the model is good enough + # pos_x_test = pos_x_test.add_column("tag", [tag] * len(pos_x_test)) + # neg_x_test = neg_x_test.add_column("tag", ["other"] * len(neg_x_test)) # test = concatenate_datasets([pos_x_test, neg_x_test]) label_binarizer = preprocessing.LabelBinarizer() @@ -227,18 +244,22 @@ def retag( os.makedirs(model_path, exist_ok=True) model.save(model_path) + # Third, we predict logging.info("- Predicting all tags") dset = dset.add_column("changes", [[]] * len(dset)) with open(os.path.join(save_to_path, "corrections"), "w") as f: - for b in tqdm.tqdm(range(int(len(dset) / batch_size))): - start = b * batch_size - end = 
min(len(dset), (b + 1) * batch_size) + for batch_index in tqdm.tqdm(range(0, len(dset), batch_size)): + start = batch_index * batch_size + end = min(len(dset), (batch_index + 1) * batch_size) batch = dset.select([i for i in range(start, end)]) batch_buffer = [x for x in batch] for tag in models.keys(): batch_preds = models[tag](batch["abstractText"], threshold=threshold) - for i, bp in enumerate(batch_preds): - is_predicted = bp == [0] + for i, pred in enumerate(batch_preds): + # pred is an array of classes returned. + # if pred is [0] - positive (or just len(ped)>0) + # if pred is empty [] - negative + is_predicted = len(pred) > 0 is_expected = tag in batch[i]["meshMajor"] if is_predicted != is_expected: if is_predicted: diff --git a/poetry.lock b/poetry.lock index d7d360d7..d8ea3549 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4288,4 +4288,4 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "0c80743677efb7db358ce4088a77a8f3dc1200ca71f02489f552d90f9b50a1d4" +content-hash = "019a45e6955dd283a0127e78e50df6fe8c29cebbdfba847bc9cb39d9f7a509bb" diff --git a/pyproject.toml b/pyproject.toml index 7b044dc8..ed65aacb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ wandb = "^0.15.4" openai = "0.27.8" openai-multi-client = "^0.1.1" openpyxl = "^3.1.2" +colorama = "^0.4.6" [tool.poetry.group.dev] From 867fddb811b2bd9b3ed38aa48400c95f43487c93 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 16:18:51 +0100 Subject: [PATCH 076/102] Black --- grants_tagger_light/retagging/retagging.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 6ad4a133..9a18e038 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -83,8 +83,8 @@ def _annotate(curation_file, dset, tag, limit, is_positive): break print("=" * 50) keywords = [] - for k in tag.split(' '): - keywords.extend(k.split(',')) + for k in tag.split(" "): + keywords.extend(k.split(",")) text = Fore.YELLOW + dset[random_pos_row]["abstractText"] + Style.RESET_ALL for k in keywords: text = text.replace(k.lower(), Back.BLUE + k.lower() + Back.RESET) @@ -93,10 +93,11 @@ def _annotate(curation_file, dset, tag, limit, is_positive): text = text.replace(k.capitalize(), Back.BLUE + k.capitalize() + Back.RESET) print(text) print("=" * 50) - res = input(Style.BRIGHT + - f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""}' - f" a `{tag}` text? [a to accept]: " + Style.RESET_ALL - ) + res = input( + Style.BRIGHT + + f'[{count}/{limit}]> Is this {"NOT " if not is_positive else ""}' + f" a `{tag}` text? [a to accept]: " + Style.RESET_ALL + ) if res == "a": human_supervision[tag][field].append(dset[random_pos_row]) with open(curation_file, "w") as f: From b1f1b405912b5974d9ac0345b4550bfb496f144d Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 17:05:42 +0100 Subject: [PATCH 077/102] Adds retagging dvc data --- data/raw/.gitignore | 1 + data/raw/retagging.dvc | 5 +++++ grants_tagger_light/retagging/retagging.py | 6 ++++-- pipelines/bertmesh/dvc.yaml | 5 +---- 4 files changed, 11 insertions(+), 6 deletions(-) mode change 100644 => 100755 data/raw/.gitignore create mode 100644 data/raw/retagging.dvc diff --git a/data/raw/.gitignore b/data/raw/.gitignore old mode 100644 new mode 100755 index b0254cca..f8959d3d --- a/data/raw/.gitignore +++ b/data/raw/.gitignore @@ -3,3 +3,4 @@ /desc2021.xml /disease_tags_validation_grants.xlsx /active_grants_last_5_years.csv +/retagging diff --git a/data/raw/retagging.dvc b/data/raw/retagging.dvc new file mode 100644 index 00000000..797836fd --- /dev/null +++ b/data/raw/retagging.dvc @@ -0,0 +1,5 @@ +outs: +- md5: ec54844641e11a97dcbfafeec2fa525d.dir + size: 5546175163 + nfiles: 6 + path: retagging diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 9a18e038..69c0abdd 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -258,8 +258,10 @@ def retag( batch_preds = models[tag](batch["abstractText"], threshold=threshold) for i, pred in enumerate(batch_preds): # pred is an array of classes returned. 
- # if pred is [0] - positive (or just len(ped)>0) - # if pred is empty [] - negative + # Since it's binary clf, possibilities are: + # - [0] - true (class 0 - only class we have) + # - [] - false (no class) + # len(ped)>0 checks if prediction is tag is_predicted = len(pred) > 0 is_expected = tag in batch[i]["meshMajor"] if is_predicted != is_expected: diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 27052038..cb0e42a7 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -30,10 +30,7 @@ stages: --torch_compile \ --evaluation_strategy epoch \ --eval_accumulation_steps 20 \ - --save_strategy epoch \ - --wandb_project wellcome-mesh \ - --wandb_name test-train-all \ - --wandb_api_key ${WANDB_API_KEY} + --save_strategy epoch deps: - ${data_path} outs: From 88fdd2a76964dd31899fc85ca8ffe57f52a2e8d5 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 18:44:52 +0100 Subject: [PATCH 078/102] Adds retagging dvc data --- README.md | 2 +- data/raw/retagging.dvc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 271de34e..2f3b39b9 100644 --- a/README.md +++ b/README.md @@ -530,7 +530,7 @@ To run the test you need to have installed the `dev` dependencies first. This is done by running `poetry install --with dev` after you are in the sell (`poetry shell`) Run tests with `pytest`. If you want to write some additional tests, -they should go in the subfolde `tests/` +they should go in the subfolder `tests/` ## ✍️ Scripts diff --git a/data/raw/retagging.dvc b/data/raw/retagging.dvc index 797836fd..0b53c3c7 100644 --- a/data/raw/retagging.dvc +++ b/data/raw/retagging.dvc @@ -1,5 +1,5 @@ outs: -- md5: ec54844641e11a97dcbfafeec2fa525d.dir +- md5: 57bb45839cacd8fe9970d1c2a2bc3131.dir size: 5546175163 nfiles: 6 path: retagging From f98e5981ce0592254bcc892e0495aaea40fc3418 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 20:36:41 +0100 Subject: [PATCH 079/102] Adds retagging dvc data --- data/raw/retagging.dvc | 2 +- pipelines/bertmesh/dvc.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/raw/retagging.dvc b/data/raw/retagging.dvc index 0b53c3c7..7154103c 100644 --- a/data/raw/retagging.dvc +++ b/data/raw/retagging.dvc @@ -1,5 +1,5 @@ outs: -- md5: 57bb45839cacd8fe9970d1c2a2bc3131.dir +- md5: 4f033c8a383a91587d0f83cc166e40a3.dir size: 5546175163 nfiles: 6 path: retagging diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index cb0e42a7..b3b84cbd 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -6,9 +6,9 @@ vars: stages: train: cmd: >- - python ${script_loc}/train.py - --model_key "" - --data_path ${data_path} + python ${script_loc}/train.py \ + --model_key "" \ + --data_path ${data_path} \ --per_device_train_batch_size 16 \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ From 71eebbff4749901f6894cae2cf8a10affc28e700 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 20:39:32 +0100 Subject: [PATCH 080/102] Changes reference fail from original to retagged --- examples/preprocess_and_train_by_epochs.sh | 3 ++- examples/preprocess_and_train_by_steps.sh | 3 ++- examples/preprocess_splitting_by_fract.sh | 2 +- examples/preprocess_splitting_by_rows.sh | 2 +- examples/preprocess_splitting_by_years.sh | 2 +- pipelines/bertmesh/dvc.yaml | 3 ++- 6 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/preprocess_and_train_by_epochs.sh b/examples/preprocess_and_train_by_epochs.sh index 0e2f2f29..29142912 100644 --- a/examples/preprocess_and_train_by_epochs.sh +++ b/examples/preprocess_and_train_by_epochs.sh @@ -1,7 +1,8 @@ # Run on g5.12xlarge instance # Without saving (on-the-fly) -SOURCE="data/raw/allMeSH_2021.jsonl" +#SOURCE="data/raw/allMeSH_2021.jsonl" +SOURCE="data/raw/retagging/allMeSH_2021.2016-2021.jsonl" grants-tagger train bertmesh \ "" \ diff --git a/examples/preprocess_and_train_by_steps.sh b/examples/preprocess_and_train_by_steps.sh index 83ec7478..f7f4f866 100644 --- a/examples/preprocess_and_train_by_steps.sh +++ b/examples/preprocess_and_train_by_steps.sh @@ -1,7 +1,8 @@ # Run on g5.12xlarge instance # Without saving (on-the-fly) -SOURCE="data/raw/allMeSH_2021.jsonl" +# SOURCE="data/raw/allMeSH_2021.jsonl" +SOURCE="data/raw/retagging/allMeSH_2021.2016-2021.jsonl" grants-tagger train bertmesh \ "" \ diff --git a/examples/preprocess_splitting_by_fract.sh b/examples/preprocess_splitting_by_fract.sh index 93a0ba67..7c0e87ff 100644 --- a/examples/preprocess_splitting_by_fract.sh +++ b/examples/preprocess_splitting_by_fract.sh @@ -1,2 +1,2 @@ -grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \ +grants-tagger preprocess mesh data/raw/retagging/allMeSH_2021.2016-2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \ --test-size 0.05 diff --git a/examples/preprocess_splitting_by_rows.sh b/examples/preprocess_splitting_by_rows.sh index 
0a4f82f6..cf49da23 100644 --- a/examples/preprocess_splitting_by_rows.sh +++ b/examples/preprocess_splitting_by_rows.sh @@ -1,2 +1,2 @@ -grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \ +grants-tagger preprocess mesh data/raw/retagging/allMeSH_2021.2016-2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \ --test-size 25000 diff --git a/examples/preprocess_splitting_by_years.sh b/examples/preprocess_splitting_by_years.sh index 28870229..a6bcd4a7 100644 --- a/examples/preprocess_splitting_by_years.sh +++ b/examples/preprocess_splitting_by_years.sh @@ -1,4 +1,4 @@ -grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \ +grants-tagger preprocess mesh data/raw/retagging/allMeSH_2021.2016-2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \ --test-size 25000 \ --train-years 2016,2017,2018,2019 \ --test-years 2020,2021 diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index b3b84cbd..8cdd85e6 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -1,5 +1,6 @@ vars: - - data_path: "../../data/raw/allMeSH_2021.jsonl" + # - data_path: "../../data/raw/allMeSH_2021.jsonl" + - data_path: "../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl" - script_loc: "../../grants_tagger_light/training" - output_dir: "../../bertmesh_outs/pipeline_test" From 2660b7ae81f776761e18d8a78ad8af074fbcbc32 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 20:52:33 +0100 Subject: [PATCH 081/102] Changes reference fail from original to retagged --- data/raw/retagging.dvc | 2 +- pipelines/bertmesh/dvc.yaml | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/data/raw/retagging.dvc b/data/raw/retagging.dvc index 7154103c..e3c6bed6 100644 --- a/data/raw/retagging.dvc +++ b/data/raw/retagging.dvc @@ -1,5 +1,5 @@ outs: -- md5: 4f033c8a383a91587d0f83cc166e40a3.dir +- md5: 1a64ed7c09ef3bc49b1bfcc17f5d7e1f.dir size: 5546175163 nfiles: 6 path: retagging diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 8cdd85e6..04ef9b9d 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -1,10 +1,14 @@ vars: - # - data_path: "../../data/raw/allMeSH_2021.jsonl" + # ORIGINAL: - data_path: "../../data/raw/allMeSH_2021.jsonl" + # RETAGGED: - data_path: "../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl" - script_loc: "../../grants_tagger_light/training" - output_dir: "../../bertmesh_outs/pipeline_test" stages: + create_dir: + cmd: >- + mkdir -p ${output_dir} train: cmd: >- python ${script_loc}/train.py \ From a8f86a6cca4517b66b44a09b873cdf878e05f00b Mon Sep 17 00:00:00 2001 From: Juan Martinez Date: Thu, 14 Sep 2023 19:56:27 +0000 Subject: [PATCH 082/102] Adds mkdir to yaml --- pipelines/bertmesh/dvc.lock | 30 +++++++++++++++++------------- pipelines/bertmesh/dvc.yaml | 8 +++----- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/pipelines/bertmesh/dvc.lock b/pipelines/bertmesh/dvc.lock index 85a33960..f8243e4a 100644 --- a/pipelines/bertmesh/dvc.lock +++ b/pipelines/bertmesh/dvc.lock @@ -1,19 +1,23 @@ schema: '2.0' stages: train: - cmd: python ../../grants_tagger_light/training/train.py --model_key "" --data_path - ../../data/raw/allMeSH_2021.json --output_dir ../../bertmesh_outs/pipeline_test - --max_samples 10 --per_device_train_batch_size 4 --per_device_eval_batch_size - 4 + cmd: + - mkdir -p 
../../bertmesh_outs/pipeline_test/best + - python ../../grants_tagger_light/training/train.py \ --model_key "" \ --data_path + ../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl \ --per_device_train_batch_size + 16 \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone + unfreeze \ --num_train_epochs 7 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size + 1024 \ --warmup_steps 5000 \ --max_grad_norm 2.0 \ --scheduler_type cosine_hard_restart + \ --weight_decay 0.2 \ --correct_bias True \ --threshold 0.25 \ --prune_labels_in_evaluation + True \ --hidden_dropout_prob 0.2 \ --attention_probs_dropout_prob 0.2 \ --fp16 + \ --torch_compile \ --evaluation_strategy epoch \ --eval_accumulation_steps + 20 \ --save_strategy epoch deps: - - path: ../../data/raw/allMeSH_2021.json - md5: e827a6b8062d1312664dcf075c12d89f - size: 27547042745 - params: - ../../grants_tagger_light/training/cli_args/train_args.py: - BertMeshTrainingArguments: {} + - path: ../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl + md5: 5011944fabc5ac7d6cfb2b20c17ae4d4 + size: 5544198887 outs: - path: ../../bertmesh_outs/pipeline_test/best - md5: b96ff54ecd600460dbb18b3e82d8b517.dir - size: 439905869 - nfiles: 3 + md5: d751713988987e9331980363e24189ce.dir + size: 0 + nfiles: 0 diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 04ef9b9d..53e7a6ad 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -6,12 +6,10 @@ vars: - output_dir: "../../bertmesh_outs/pipeline_test" stages: - create_dir: - cmd: >- - mkdir -p ${output_dir} train: - cmd: >- - python ${script_loc}/train.py \ + cmd: + - mkdir -p ${output_dir}/best + - python ${script_loc}/train.py \ --model_key "" \ --data_path ${data_path} \ --per_device_train_batch_size 16 \ From 0e8aa1c9fa005afee213d75c39a5093c1069eb5f Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 21:05:04 +0100 Subject: [PATCH 083/102] Adds two steps to the dvc pipeline --- pipelines/bertmesh/dvc.yaml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 53e7a6ad..f0e5d544 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -2,10 +2,22 @@ vars: # ORIGINAL: - data_path: "../../data/raw/allMeSH_2021.jsonl" # RETAGGED: - data_path: "../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl" + - preprocessing_output: "../../preprocessed_results" - script_loc: "../../grants_tagger_light/training" - output_dir: "../../bertmesh_outs/pipeline_test" stages: + preprocess: + cmd: + - grants-tagger preprocess mesh ${data_path} ${preprocessing_output} '' \ + --test-size 25000 \ + --train-years 2016,2017,2018,2019 \ + --test-years 2020,2021 + deps: + - ${data_path} + outs: + - ${preprocessing_output} + train: cmd: - mkdir -p ${output_dir}/best @@ -35,6 +47,6 @@ stages: --eval_accumulation_steps 20 \ --save_strategy epoch deps: - - ${data_path} + - ${preprocessing_output} outs: - ${output_dir}/best From d79d80bbcede7851c137f35d4c8661de468ad3c5 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 21:10:34 +0100 Subject: [PATCH 084/102] Adds two steps to the dvc pipeline --- README.md | 6 +++++ pipelines/bertmesh/dvc.yaml | 54 ++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 2f3b39b9..81a84d2e 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,12 @@ For inference, CPU-support should suffice. You now have access to the `grants-tagger` command line interface! +## 3. Pull the data +`dvc pull` + +Make sure the `data/raw/allMeSH_2021.jsonl` file was pulled (the original data) as well as +`data/raw/retagging/allMeSH_2021.2016-2021.jsonl` file with the retagged data so far. + ## OPTIONAL: 3. 
Install MantisNLP `remote` to connect to a remote AWS instances `pip install git+https://github.com/ivyleavedtoadflax/remote.py.git` Then add your instance diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index f0e5d544..37fc9c1e 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -21,31 +21,35 @@ stages: train: cmd: - mkdir -p ${output_dir}/best - - python ${script_loc}/train.py \ - --model_key "" \ - --data_path ${data_path} \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 1 \ - --multilabel_attention True \ - --freeze_backbone unfreeze \ - --num_train_epochs 7 \ - --learning_rate 5e-5 \ - --dropout 0.1 \ - --hidden_size 1024 \ - --warmup_steps 5000 \ - --max_grad_norm 2.0 \ - --scheduler_type cosine_hard_restart \ - --weight_decay 0.2 \ - --correct_bias True \ - --threshold 0.25 \ - --prune_labels_in_evaluation True \ - --hidden_dropout_prob 0.2 \ - --attention_probs_dropout_prob 0.2 \ - --fp16 \ - --torch_compile \ - --evaluation_strategy epoch \ - --eval_accumulation_steps 20 \ - --save_strategy epoch + grants-tagger train bertmesh \ + "" \ + $SOURCE \ + --output_dir bertmesh_outs/pipeline_test/ \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 1 \ + --multilabel_attention True \ + --freeze_backbone unfreeze \ + --num_train_epochs 7 \ + --learning_rate 5e-5 \ + --dropout 0.1 \ + --hidden_size 1024 \ + --warmup_steps 5000 \ + --max_grad_norm 2.0 \ + --scheduler_type cosine_hard_restart \ + --weight_decay 0.2 \ + --correct_bias True \ + --threshold 0.25 \ + --prune_labels_in_evaluation True \ + --hidden_dropout_prob 0.2 \ + --attention_probs_dropout_prob 0.2 \ + --fp16 \ + --torch_compile \ + --evaluation_strategy epoch \ + --eval_accumulation_steps 20 \ + --save_strategy epoch \ + --wandb_project wellcome-mesh \ + --wandb_name test-train-all \ + --wandb_api_key ${WANDB_API_KEY} deps: - ${preprocessing_output} outs: From 0adb58efb47524d983145f2a36cf2f5ca7c8a55d Mon Sep 17 
00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 21:13:03 +0100 Subject: [PATCH 085/102] Adds two steps to the dvc pipeline --- pipelines/bertmesh/dvc.yaml | 59 +++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 37fc9c1e..394e665c 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -4,7 +4,7 @@ vars: - data_path: "../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl" - preprocessing_output: "../../preprocessed_results" - script_loc: "../../grants_tagger_light/training" - - output_dir: "../../bertmesh_outs/pipeline_test" + - output_dir: "../../bertmesh_outs/pipeline_test/best" stages: preprocess: @@ -20,37 +20,32 @@ stages: train: cmd: - - mkdir -p ${output_dir}/best - grants-tagger train bertmesh \ - "" \ - $SOURCE \ - --output_dir bertmesh_outs/pipeline_test/ \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 1 \ - --multilabel_attention True \ - --freeze_backbone unfreeze \ - --num_train_epochs 7 \ - --learning_rate 5e-5 \ - --dropout 0.1 \ - --hidden_size 1024 \ - --warmup_steps 5000 \ - --max_grad_norm 2.0 \ - --scheduler_type cosine_hard_restart \ - --weight_decay 0.2 \ - --correct_bias True \ - --threshold 0.25 \ - --prune_labels_in_evaluation True \ - --hidden_dropout_prob 0.2 \ - --attention_probs_dropout_prob 0.2 \ - --fp16 \ - --torch_compile \ - --evaluation_strategy epoch \ - --eval_accumulation_steps 20 \ - --save_strategy epoch \ - --wandb_project wellcome-mesh \ - --wandb_name test-train-all \ - --wandb_api_key ${WANDB_API_KEY} + - mkdir -p ${output_dir} + - grants-tagger train bertmesh "" ${preprocessing_output} \ + --output_dir ${output_dir} \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 1 \ + --multilabel_attention True \ + --freeze_backbone unfreeze \ + --num_train_epochs 7 \ + --learning_rate 5e-5 \ + --dropout 0.1 \ + --hidden_size 1024 \ + 
--warmup_steps 5000 \ + --max_grad_norm 2.0 \ + --scheduler_type cosine_hard_restart \ + --weight_decay 0.2 \ + --correct_bias True \ + --threshold 0.25 \ + --prune_labels_in_evaluation True \ + --hidden_dropout_prob 0.2 \ + --attention_probs_dropout_prob 0.2 \ + --fp16 \ + --torch_compile \ + --evaluation_strategy epoch \ + --eval_accumulation_steps 20 \ + --save_strategy epoch deps: - ${preprocessing_output} outs: - - ${output_dir}/best + - ${output_dir} From 43046edbe321a1799fee144318fa9103f3f166f6 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 21:27:06 +0100 Subject: [PATCH 086/102] Adds two steps to the dvc pipeline --- pipelines/bertmesh/dvc.yaml | 50 ++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 394e665c..6fa8fd4c 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -13,37 +13,37 @@ stages: --test-size 25000 \ --train-years 2016,2017,2018,2019 \ --test-years 2020,2021 + - mkdir -p ${output_dir} deps: - ${data_path} outs: - ${preprocessing_output} train: - cmd: - - mkdir -p ${output_dir} - - grants-tagger train bertmesh "" ${preprocessing_output} \ - --output_dir ${output_dir} \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 1 \ - --multilabel_attention True \ - --freeze_backbone unfreeze \ - --num_train_epochs 7 \ - --learning_rate 5e-5 \ - --dropout 0.1 \ - --hidden_size 1024 \ - --warmup_steps 5000 \ - --max_grad_norm 2.0 \ - --scheduler_type cosine_hard_restart \ - --weight_decay 0.2 \ - --correct_bias True \ - --threshold 0.25 \ - --prune_labels_in_evaluation True \ - --hidden_dropout_prob 0.2 \ - --attention_probs_dropout_prob 0.2 \ - --fp16 \ - --torch_compile \ - --evaluation_strategy epoch \ - --eval_accumulation_steps 20 \ + cmd: >- + grants-tagger train bertmesh "" ${preprocessing_output} + --output_dir ${output_dir} + --per_device_train_batch_size 16 + 
--per_device_eval_batch_size 1 + --multilabel_attention True + --freeze_backbone unfreeze + --num_train_epochs 7 + --learning_rate 5e-5 + --dropout 0.1 + --hidden_size 1024 + --warmup_steps 5000 + --max_grad_norm 2.0 + --scheduler_type cosine_hard_restart + --weight_decay 0.2 + --correct_bias True + --threshold 0.25 + --prune_labels_in_evaluation True + --hidden_dropout_prob 0.2 + --attention_probs_dropout_prob 0.2 + --fp16 + --torch_compile + --evaluation_strategy epoch + --eval_accumulation_steps 20 --save_strategy epoch deps: - ${preprocessing_output} From d8a30b99e8eb927df73b51470e47126f49ac0c82 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 21:30:38 +0100 Subject: [PATCH 087/102] Adds two steps to the dvc pipeline --- pipelines/bertmesh/dvc.yaml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 6fa8fd4c..dca4202f 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -7,15 +7,19 @@ vars: - output_dir: "../../bertmesh_outs/pipeline_test/best" stages: + preparation: + cmd: >- + mkdir -p ${output_dir} + outs: + - {output_dir} preprocess: - cmd: - - grants-tagger preprocess mesh ${data_path} ${preprocessing_output} '' \ - --test-size 25000 \ - --train-years 2016,2017,2018,2019 \ + cmd: >- + grants-tagger preprocess mesh ${data_path} ${preprocessing_output} "" + --test-size 25000 + --train-years 2016,2017,2018,2019 --test-years 2020,2021 - - mkdir -p ${output_dir} deps: - - ${data_path} + - ${output_dir} outs: - ${preprocessing_output} From 441ee9856574eb0937fb19c393671f80af4fc0c6 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 21:31:24 +0100 Subject: [PATCH 088/102] Adds two steps to the dvc pipeline --- pipelines/bertmesh/dvc.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index dca4202f..a243ec5f 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -11,7 +11,8 @@ stages: cmd: >- mkdir -p ${output_dir} outs: - - {output_dir} + - ${output_dir} + preprocess: cmd: >- grants-tagger preprocess mesh ${data_path} ${preprocessing_output} "" From b1736da832c1709d4373db1fd631650bc5f820b8 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 21:31:53 +0100 Subject: [PATCH 089/102] Adds two steps to the dvc pipeline --- pipelines/bertmesh/dvc.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index a243ec5f..10ffea66 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -52,5 +52,3 @@ stages: --save_strategy epoch deps: - ${preprocessing_output} - outs: - - ${output_dir} From 5859bbe62b3e5d5a86483a366c3c8ca763a51c7e Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 21:33:50 +0100 Subject: [PATCH 090/102] Adds two steps to the dvc pipeline --- pipelines/bertmesh/dvc.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 10ffea66..c2b0b42a 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -15,9 +15,9 @@ stages: preprocess: cmd: >- - grants-tagger preprocess mesh ${data_path} ${preprocessing_output} "" - --test-size 25000 - --train-years 2016,2017,2018,2019 + grants-tagger preprocess mesh ${data_path} ${preprocessing_output} "" \ + --test-size 25000 \ + --train-years 2016,2017,2018,2019 \ --test-years 2020,2021 deps: - ${output_dir} From 25aa0e6041dcfcde802dbe97eab871068533f7f2 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 21:39:28 +0100 Subject: [PATCH 091/102] Adds two steps to the dvc pipeline --- pipelines/bertmesh/dvc.yaml | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index c2b0b42a..c82cf76c 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -7,20 +7,10 @@ vars: - output_dir: "../../bertmesh_outs/pipeline_test/best" stages: - preparation: - cmd: >- - mkdir -p ${output_dir} - outs: - - ${output_dir} - preprocess: - cmd: >- - grants-tagger preprocess mesh ${data_path} ${preprocessing_output} "" \ - --test-size 25000 \ - --train-years 2016,2017,2018,2019 \ - --test-years 2020,2021 - deps: - - ${output_dir} + cmd: + - mkdir -p ${output_dir} + - grants-tagger preprocess mesh ${data_path} ${preprocessing_output} "" --test-size 25000 --train-years 2016,2017,2018,2019 --test-years 2020,2021 outs: - ${preprocessing_output} @@ -52,3 +42,5 @@ stages: --save_strategy epoch deps: - ${preprocessing_output} + outs: + - ${output_dir} From cf64bb1708e071a8baf0729b30250de8a90da692 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 21:45:23 +0100 Subject: [PATCH 092/102] Adds two steps to the dvc pipeline --- pipelines/bertmesh/dvc.yaml | 46 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index c82cf76c..7f46473b 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -16,29 +16,29 @@ stages: train: cmd: >- - grants-tagger train bertmesh "" ${preprocessing_output} - --output_dir ${output_dir} - --per_device_train_batch_size 16 - --per_device_eval_batch_size 1 - --multilabel_attention True - --freeze_backbone unfreeze - --num_train_epochs 7 - --learning_rate 5e-5 - --dropout 0.1 - --hidden_size 1024 - --warmup_steps 5000 - --max_grad_norm 2.0 - --scheduler_type cosine_hard_restart - --weight_decay 0.2 - --correct_bias True - --threshold 0.25 - --prune_labels_in_evaluation True - --hidden_dropout_prob 0.2 - --attention_probs_dropout_prob 0.2 - --fp16 - --torch_compile - --evaluation_strategy epoch - --eval_accumulation_steps 20 + grants-tagger train bertmesh "" ${preprocessing_output} \ + --output_dir ${output_dir} \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 1 \ + --multilabel_attention True \ + --freeze_backbone unfreeze \ + --num_train_epochs 7 \ + --learning_rate 5e-5 \ + --dropout 0.1 \ + --hidden_size 1024 \ + --warmup_steps 5000 \ + --max_grad_norm 2.0 \ + --scheduler_type cosine_hard_restart \ + --weight_decay 0.2 \ + --correct_bias True \ + --threshold 0.25 \ + --prune_labels_in_evaluation True \ + --hidden_dropout_prob 0.2 \ + --attention_probs_dropout_prob 0.2 \ + --fp16 \ + --torch_compile \ + --evaluation_strategy epoch \ + --eval_accumulation_steps 20 \ --save_strategy epoch deps: - ${preprocessing_output} From f5c8da3c9eefdf482e4b3e28b9afdf4edfa69a9c Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 21:50:46 +0100 Subject: [PATCH 093/102] Default wandb --- .../training/cli_args/wandb_args.py | 7 +-- pipelines/bertmesh/dvc.yaml | 48 +++++++++---------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/grants_tagger_light/training/cli_args/wandb_args.py b/grants_tagger_light/training/cli_args/wandb_args.py index b004353f..bf6f9573 100644 --- a/grants_tagger_light/training/cli_args/wandb_args.py +++ b/grants_tagger_light/training/cli_args/wandb_args.py @@ -1,3 +1,4 @@ +import datetime import os from dataclasses import dataclass, field, fields @@ -11,17 +12,17 @@ class WandbArguments: """ wandb_api_key: str = field( - default=None, + default=os.environ['WANDB_API_KEY'], metadata={"help": "Wandb API key"}, ) wandb_project: str = field( - default=None, + default="bertmesh", metadata={"help": "Wandb project name"}, ) wandb_name: str = field( - default=None, + default=str(datetime.datetime.now()), metadata={"help": "Wandb run name"}, ) diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 7f46473b..0f207546 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -16,30 +16,30 @@ stages: train: cmd: >- - grants-tagger train bertmesh "" ${preprocessing_output} \ - --output_dir ${output_dir} \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 1 \ - --multilabel_attention True \ - --freeze_backbone unfreeze \ - --num_train_epochs 7 \ - --learning_rate 5e-5 \ - --dropout 0.1 \ - --hidden_size 1024 \ - --warmup_steps 5000 \ - --max_grad_norm 2.0 \ - --scheduler_type cosine_hard_restart \ - --weight_decay 0.2 \ - --correct_bias True \ - --threshold 0.25 \ - --prune_labels_in_evaluation True \ - --hidden_dropout_prob 0.2 \ - --attention_probs_dropout_prob 0.2 \ - --fp16 \ - --torch_compile \ - --evaluation_strategy epoch \ - --eval_accumulation_steps 20 \ - --save_strategy epoch + grants-tagger train bertmesh "" ${preprocessing_output} + --output_dir ${output_dir} + 
--per_device_train_batch_size 16 + --per_device_eval_batch_size 1 + --multilabel_attention True + --freeze_backbone unfreeze + --num_train_epochs 7 + --learning_rate 5e-5 + --dropout 0.1 + --hidden_size 1024 + --warmup_steps 5000 + --max_grad_norm 2.0 + --scheduler_type cosine_hard_restart + --weight_decay 0.2 + --correct_bias True + --threshold 0.25 + --prune_labels_in_evaluation True + --hidden_dropout_prob 0.2 + --attention_probs_dropout_prob 0.2 + --fp16 + --torch_compile + --evaluation_strategy epoch + --eval_accumulation_steps 20 + --save_strategy epoch deps: - ${preprocessing_output} outs: From a45d982349b8c43cf23f64c3adb7f545064d4c7e Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Thu, 14 Sep 2023 21:54:24 +0100 Subject: [PATCH 094/102] Documentation about wandb and dvc repro --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 81a84d2e..dbe6972b 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ your own data under development. ## 🔥 Train -The command will train a model and save it to the specified path. Currently we support on BertMesh. +The command will train a model and save it to the specified path. Currently, we support BertMesh. ### Training bertmesh ``` @@ -194,6 +194,18 @@ grants-tagger train bertmesh \ --wandb_api_key ${WANDB_API_KEY} ``` +## WANDB +Make sure you track the training. To do that, either set --wandb_api_key or define 'WANDB_API_KEY' in the environment variables. + +## DVC for preprocessing and training +You can run preprocessing and training by doing: +```bash +cd pipelines/bertmesh +dvc repro +``` + +Make sure you have your 'WANDB_API_KEY' in the environment variables! + ## 📚 Augment Data augmentation can be useful for low represented classes. LLMs as `openai GPT-3.5` can be used to that purpose. From 823bc64a837cd851f96f37cda430f3ff91cb0d64 Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Thu, 14 Sep 2023 22:15:20 +0100 Subject: [PATCH 095/102] 12 epochs --- examples/preprocess_and_train_by_epochs.sh | 2 +- examples/preprocess_and_train_by_steps.sh | 2 +- examples/train_by_epochs.sh | 2 +- examples/train_by_steps.sh | 2 +- pipelines/bertmesh/dvc.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/preprocess_and_train_by_epochs.sh b/examples/preprocess_and_train_by_epochs.sh index 29142912..310777a8 100644 --- a/examples/preprocess_and_train_by_epochs.sh +++ b/examples/preprocess_and_train_by_epochs.sh @@ -15,7 +15,7 @@ grants-tagger train bertmesh \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone unfreeze \ - --num_train_epochs 7 \ + --num_train_epochs 12 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size 1024 \ diff --git a/examples/preprocess_and_train_by_steps.sh b/examples/preprocess_and_train_by_steps.sh index f7f4f866..75254a0a 100644 --- a/examples/preprocess_and_train_by_steps.sh +++ b/examples/preprocess_and_train_by_steps.sh @@ -15,7 +15,7 @@ grants-tagger train bertmesh \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone unfreeze \ - --num_train_epochs 7 \ + --num_train_epochs 12 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size 1024 \ diff --git a/examples/train_by_epochs.sh b/examples/train_by_epochs.sh index bf06afac..433e0b0b 100644 --- a/examples/train_by_epochs.sh +++ b/examples/train_by_epochs.sh @@ -11,7 +11,7 @@ grants-tagger train bertmesh \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone unfreeze \ - --num_train_epochs 7 \ + --num_train_epochs 12 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size 1024 \ diff --git a/examples/train_by_steps.sh b/examples/train_by_steps.sh index 67cabcd6..c35fc8c6 100644 --- a/examples/train_by_steps.sh +++ b/examples/train_by_steps.sh @@ -11,7 +11,7 @@ grants-tagger train bertmesh \ --per_device_eval_batch_size 1 \ --multilabel_attention 
True \ --freeze_backbone unfreeze \ - --num_train_epochs 7 \ + --num_train_epochs 12 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size 1024 \ diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 0f207546..96c03a7e 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -22,7 +22,7 @@ stages: --per_device_eval_batch_size 1 --multilabel_attention True --freeze_backbone unfreeze - --num_train_epochs 7 + --num_train_epochs 12 --learning_rate 5e-5 --dropout 0.1 --hidden_size 1024 From ac928f125a1695f2e9e7f667e497defc9c56d266 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 15 Sep 2023 10:29:03 +0100 Subject: [PATCH 096/102] Fixex bug with the batch calculation --- grants_tagger_light/retagging/retagging.py | 5 ++--- grants_tagger_light/training/cli_args/wandb_args.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/grants_tagger_light/retagging/retagging.py b/grants_tagger_light/retagging/retagging.py index 69c0abdd..9ef73d6c 100644 --- a/grants_tagger_light/retagging/retagging.py +++ b/grants_tagger_light/retagging/retagging.py @@ -249,9 +249,8 @@ def retag( logging.info("- Predicting all tags") dset = dset.add_column("changes", [[]] * len(dset)) with open(os.path.join(save_to_path, "corrections"), "w") as f: - for batch_index in tqdm.tqdm(range(0, len(dset), batch_size)): - start = batch_index * batch_size - end = min(len(dset), (batch_index + 1) * batch_size) + for start in tqdm.tqdm(range(0, len(dset), batch_size)): + end = min(len(dset), start + batch_size) batch = dset.select([i for i in range(start, end)]) batch_buffer = [x for x in batch] for tag in models.keys(): diff --git a/grants_tagger_light/training/cli_args/wandb_args.py b/grants_tagger_light/training/cli_args/wandb_args.py index bf6f9573..65304e1c 100644 --- a/grants_tagger_light/training/cli_args/wandb_args.py +++ b/grants_tagger_light/training/cli_args/wandb_args.py @@ -12,7 +12,7 @@ class WandbArguments: """ 
wandb_api_key: str = field( - default=os.environ['WANDB_API_KEY'], + default=os.environ['WANDB_API_KEY'] if 'WANDB_API_KEY' in os.environ else '', metadata={"help": "Wandb API key"}, ) From e41dcbd9e703304695c467db8088a06c331ff8a3 Mon Sep 17 00:00:00 2001 From: "Jose J. Martinez" Date: Fri, 15 Sep 2023 10:31:19 +0100 Subject: [PATCH 097/102] black --- grants_tagger_light/training/cli_args/wandb_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grants_tagger_light/training/cli_args/wandb_args.py b/grants_tagger_light/training/cli_args/wandb_args.py index 65304e1c..6210c628 100644 --- a/grants_tagger_light/training/cli_args/wandb_args.py +++ b/grants_tagger_light/training/cli_args/wandb_args.py @@ -12,7 +12,7 @@ class WandbArguments: """ wandb_api_key: str = field( - default=os.environ['WANDB_API_KEY'] if 'WANDB_API_KEY' in os.environ else '', + default=os.environ["WANDB_API_KEY"] if "WANDB_API_KEY" in os.environ else "", metadata={"help": "Wandb API key"}, ) From c021da749d8b3ffb78b710e439dd51f6594b5ad9 Mon Sep 17 00:00:00 2001 From: Juan Martinez Date: Fri, 15 Sep 2023 10:16:33 +0000 Subject: [PATCH 098/102] Adds last trained model --- .gitignore | 1 + bertmesh_before_retagging.dvc | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 bertmesh_before_retagging.dvc diff --git a/.gitignore b/.gitignore index 22124d82..feac557b 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,4 @@ cython_debug/ # Folder where training outputs are stored bertmesh_outs/ wandb/ +/bertmesh_before_retagging diff --git a/bertmesh_before_retagging.dvc b/bertmesh_before_retagging.dvc new file mode 100644 index 00000000..d94c2ed6 --- /dev/null +++ b/bertmesh_before_retagging.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 9a2106b99f825089d4c0a00aae6551e5.dir + size: 2593104083 + nfiles: 4 + path: bertmesh_before_retagging From ae92b789ef842de7aff92871cb53246e5d68b68f Mon Sep 17 00:00:00 2001 From: "Jose J. 
Martinez" Date: Sat, 16 Sep 2023 15:30:38 +0100 Subject: [PATCH 099/102] Adds last metrics bertmesh --- bertmesh_before_retagging.dvc | 6 +++--- examples/preprocess_and_train_by_epochs.sh | 2 +- examples/preprocess_and_train_by_steps.sh | 2 +- examples/train_by_epochs.sh | 2 +- examples/train_by_steps.sh | 2 +- pipelines/bertmesh/dvc.yaml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) mode change 100644 => 100755 bertmesh_before_retagging.dvc diff --git a/bertmesh_before_retagging.dvc b/bertmesh_before_retagging.dvc old mode 100644 new mode 100755 index d94c2ed6..b0aa04f5 --- a/bertmesh_before_retagging.dvc +++ b/bertmesh_before_retagging.dvc @@ -1,5 +1,5 @@ outs: -- md5: 9a2106b99f825089d4c0a00aae6551e5.dir - size: 2593104083 - nfiles: 4 +- md5: 4964c2e8f83f071bcb7c467a859726a6.dir + size: 2593104471 + nfiles: 5 path: bertmesh_before_retagging diff --git a/examples/preprocess_and_train_by_epochs.sh b/examples/preprocess_and_train_by_epochs.sh index 310777a8..29142912 100644 --- a/examples/preprocess_and_train_by_epochs.sh +++ b/examples/preprocess_and_train_by_epochs.sh @@ -15,7 +15,7 @@ grants-tagger train bertmesh \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone unfreeze \ - --num_train_epochs 12 \ + --num_train_epochs 7 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size 1024 \ diff --git a/examples/preprocess_and_train_by_steps.sh b/examples/preprocess_and_train_by_steps.sh index 75254a0a..f7f4f866 100644 --- a/examples/preprocess_and_train_by_steps.sh +++ b/examples/preprocess_and_train_by_steps.sh @@ -15,7 +15,7 @@ grants-tagger train bertmesh \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone unfreeze \ - --num_train_epochs 12 \ + --num_train_epochs 7 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size 1024 \ diff --git a/examples/train_by_epochs.sh b/examples/train_by_epochs.sh index 433e0b0b..bf06afac 100644 --- a/examples/train_by_epochs.sh +++ b/examples/train_by_epochs.sh 
@@ -11,7 +11,7 @@ grants-tagger train bertmesh \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone unfreeze \ - --num_train_epochs 12 \ + --num_train_epochs 7 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size 1024 \ diff --git a/examples/train_by_steps.sh b/examples/train_by_steps.sh index c35fc8c6..67cabcd6 100644 --- a/examples/train_by_steps.sh +++ b/examples/train_by_steps.sh @@ -11,7 +11,7 @@ grants-tagger train bertmesh \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone unfreeze \ - --num_train_epochs 12 \ + --num_train_epochs 7 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size 1024 \ diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 96c03a7e..0f207546 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -22,7 +22,7 @@ stages: --per_device_eval_batch_size 1 --multilabel_attention True --freeze_backbone unfreeze - --num_train_epochs 12 + --num_train_epochs 7 --learning_rate 5e-5 --dropout 0.1 --hidden_size 1024 From f823fd467503785b9f9fbc85151029876c7ef349 Mon Sep 17 00:00:00 2001 From: Nick Sorros Date: Tue, 19 Sep 2023 10:22:47 +0000 Subject: [PATCH 100/102] tag all active grants --- pipelines/generate_grants/dvc.lock | 8 +-- poetry.lock | 67 ++++--------------- pyproject.toml | 1 + .../create_xlinear_bertmesh_comparison_csv.py | 16 ++--- 4 files changed, 24 insertions(+), 68 deletions(-) diff --git a/pipelines/generate_grants/dvc.lock b/pipelines/generate_grants/dvc.lock index d2b12d87..30c625f4 100644 --- a/pipelines/generate_grants/dvc.lock +++ b/pipelines/generate_grants/dvc.lock @@ -10,9 +10,9 @@ stages: --xlinear-thresh 0.2 --pre-annotate-xlinear --output-path data/grants_comparison/comparison.xlsx deps: - path: scripts/create_xlinear_bertmesh_comparison_csv.py - md5: a8ced1e8851e43f1902ba0d9dbf98781 - size: 8350 + md5: d960c5b83b112d390b80f9db8467f8a6 + size: 8089 outs: - path: data/grants_comparison/comparison.xlsx - md5: 
bd9ecabc26224fab96816fc7a6bd8be8 - size: 195402 + md5: 91983b99e7a5386f5e21d3a2ede38642 + size: 937396 diff --git a/poetry.lock b/poetry.lock index d8ea3549..285a0f13 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "accelerate" @@ -2326,57 +2326,6 @@ files = [ {file = "orjson-3.9.7-cp39-none-win32.whl", hash = "sha256:14d3fb6cd1040a4a4a530b28e8085131ed94ebc90d72793c59a713de34b60838"}, {file = "orjson-3.9.7-cp39-none-win_amd64.whl", hash = "sha256:9ef82157bbcecd75d6296d5d8b2d792242afcd064eb1ac573f8847b52e58f677"}, {file = "orjson-3.9.7.tar.gz", hash = "sha256:85e39198f78e2f7e054d296395f6c96f5e02892337746ef5b6a1bf3ed5910142"}, - {file = "orjson-3.9.2-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7323e4ca8322b1ecb87562f1ec2491831c086d9faa9a6c6503f489dadbed37d7"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1272688ea1865f711b01ba479dea2d53e037ea00892fd04196b5875f7021d9d3"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0b9a26f1d1427a9101a1e8910f2e2df1f44d3d18ad5480ba031b15d5c1cb282e"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a5ca55b0d8f25f18b471e34abaee4b175924b6cd62f59992945b25963443141"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:877872db2c0f41fbe21f852ff642ca842a43bc34895b70f71c9d575df31fffb4"}, - {file = "orjson-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a39c2529d75373b7167bf84c814ef9b8f3737a339c225ed6c0df40736df8748"}, - {file = "orjson-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = 
"sha256:84ebd6fdf138eb0eb4280045442331ee71c0aab5e16397ba6645f32f911bfb37"}, - {file = "orjson-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a60a1cfcfe310547a1946506dd4f1ed0a7d5bd5b02c8697d9d5dcd8d2e9245e"}, - {file = "orjson-3.9.2-cp310-none-win32.whl", hash = "sha256:2ae61f5d544030a6379dbc23405df66fea0777c48a0216d2d83d3e08b69eb676"}, - {file = "orjson-3.9.2-cp310-none-win_amd64.whl", hash = "sha256:c290c4f81e8fd0c1683638802c11610b2f722b540f8e5e858b6914b495cf90c8"}, - {file = "orjson-3.9.2-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:02ef014f9a605e84b675060785e37ec9c0d2347a04f1307a9d6840ab8ecd6f55"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:992af54265ada1c1579500d6594ed73fe333e726de70d64919cf37f93defdd06"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a40958f7af7c6d992ee67b2da4098dca8b770fc3b4b3834d540477788bfa76d3"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93864dec3e3dd058a2dbe488d11ac0345214a6a12697f53a63e34de7d28d4257"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16fdf5a82df80c544c3c91516ab3882cd1ac4f1f84eefeafa642e05cef5f6699"}, - {file = "orjson-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275b5a18fd9ed60b2720543d3ddac170051c43d680e47d04ff5203d2c6d8ebf1"}, - {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b9aea6dcb99fcbc9f6d1dd84fca92322fda261da7fb014514bb4689c7c2097a8"}, - {file = "orjson-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7d74ae0e101d17c22ef67b741ba356ab896fc0fa64b301c2bf2bb0a4d874b190"}, - {file = "orjson-3.9.2-cp311-none-win32.whl", hash = "sha256:a9a7d618f99b2d67365f2b3a588686195cb6e16666cd5471da603a01315c17cc"}, - {file = "orjson-3.9.2-cp311-none-win_amd64.whl", hash = 
"sha256:6320b28e7bdb58c3a3a5efffe04b9edad3318d82409e84670a9b24e8035a249d"}, - {file = "orjson-3.9.2-cp37-cp37m-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:368e9cc91ecb7ac21f2aa475e1901204110cf3e714e98649c2502227d248f947"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58e9e70f0dcd6a802c35887f306b555ff7a214840aad7de24901fc8bd9cf5dde"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:00c983896c2e01c94c0ef72fd7373b2aa06d0c0eed0342c4884559f812a6835b"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ee743e8890b16c87a2f89733f983370672272b61ee77429c0a5899b2c98c1a7"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b7b065942d362aad4818ff599d2f104c35a565c2cbcbab8c09ec49edba91da75"}, - {file = "orjson-3.9.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e46e9c5b404bb9e41d5555762fd410d5466b7eb1ec170ad1b1609cbebe71df21"}, - {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8170157288714678ffd64f5de33039e1164a73fd8b6be40a8a273f80093f5c4f"}, - {file = "orjson-3.9.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e3e2f087161947dafe8319ea2cfcb9cea4bb9d2172ecc60ac3c9738f72ef2909"}, - {file = "orjson-3.9.2-cp37-none-win32.whl", hash = "sha256:373b7b2ad11975d143556fdbd2c27e1150b535d2c07e0b48dc434211ce557fe6"}, - {file = "orjson-3.9.2-cp37-none-win_amd64.whl", hash = "sha256:d7de3dbbe74109ae598692113cec327fd30c5a30ebca819b21dfa4052f7b08ef"}, - {file = "orjson-3.9.2-cp38-cp38-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:8cd4385c59bbc1433cad4a80aca65d2d9039646a9c57f8084897549b55913b17"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a74036aab1a80c361039290cdbc51aa7adc7ea13f56e5ef94e9be536abd227bd"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1aaa46d7d4ae55335f635eadc9be0bd9bcf742e6757209fc6dc697e390010adc"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e52c67ed6bb368083aa2078ea3ccbd9721920b93d4b06c43eb4e20c4c860046"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1a6cdfcf9c7dd4026b2b01fdff56986251dc0cc1e980c690c79eec3ae07b36e7"}, - {file = "orjson-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1882a70bb69595b9ec5aac0040a819e94d2833fe54901e2b32f5e734bc259a8b"}, - {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:fc05e060d452145ab3c0b5420769e7356050ea311fc03cb9d79c481982917cca"}, - {file = "orjson-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8bc2c40d9bb26efefb10949d261a47ca196772c308babc538dd9f4b73e8d386"}, - {file = "orjson-3.9.2-cp38-none-win32.whl", hash = "sha256:302d80198d8d5b658065627da3a356cbe5efa082b89b303f162f030c622e0a17"}, - {file = "orjson-3.9.2-cp38-none-win_amd64.whl", hash = "sha256:3164fc20a585ec30a9aff33ad5de3b20ce85702b2b2a456852c413e3f0d7ab09"}, - {file = "orjson-3.9.2-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:7a6ccadf788531595ed4728aa746bc271955448d2460ff0ef8e21eb3f2a281ba"}, - {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3245d230370f571c945f69aab823c279a868dc877352817e22e551de155cb06c"}, - {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:205925b179550a4ee39b8418dd4c94ad6b777d165d7d22614771c771d44f57bd"}, - {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0325fe2d69512187761f7368c8cda1959bcb75fc56b8e7a884e9569112320e57"}, - {file = 
"orjson-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:806704cd58708acc66a064a9a58e3be25cf1c3f9f159e8757bd3f515bfabdfa1"}, - {file = "orjson-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03fb36f187a0c19ff38f6289418863df8b9b7880cdbe279e920bef3a09d8dab1"}, - {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:20925d07a97c49c6305bff1635318d9fc1804aa4ccacb5fb0deb8a910e57d97a"}, - {file = "orjson-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eebfed53bec5674e981ebe8ed2cf00b3f7bcda62d634733ff779c264307ea505"}, - {file = "orjson-3.9.2-cp39-none-win32.whl", hash = "sha256:ba60f09d735f16593950c6adf033fbb526faa94d776925579a87b777db7d0838"}, - {file = "orjson-3.9.2-cp39-none-win_amd64.whl", hash = "sha256:869b961df5fcedf6c79f4096119b35679b63272362e9b745e668f0391a892d39"}, - {file = "orjson-3.9.2.tar.gz", hash = "sha256:24257c8f641979bf25ecd3e27251b5cc194cdd3a6e96004aac8446f5e63d9664"}, ] [[package]] @@ -3967,7 +3916,6 @@ async = ["httpx (>=0.22.0)"] aws = ["boto3"] azure = ["azure-identity", "azure-storage-blob"] gcp = ["google-cloud-storage"] -grpc = ["grpcio (>=1.27.2)"] kubeflow = ["google-cloud-storage", "kubernetes", "minio", "sh"] launch = ["awscli", "azure-containerregistry", "azure-identity", "azure-storage-blob", "boto3", "botocore", "chardet", "google-auth", "google-cloud-artifact-registry", "google-cloud-compute", "google-cloud-storage", "iso8601", "kubernetes", "nbconvert", "nbformat", "optuna", "typing-extensions"] media = ["bokeh", "moviepy", "numpy", "pillow", "plotly", "rdkit-pypi", "soundfile"] @@ -4087,6 +4035,17 @@ files = [ {file = "wrapt-1.14.1.tar.gz", hash = "sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d"}, ] +[[package]] +name = "xlsxwriter" +version = "3.1.4" +description = "A Python module for creating Excel XLSX files." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "XlsxWriter-3.1.4-py3-none-any.whl", hash = "sha256:29c7bf5ade4de1f0bb487882eb45d4845eebc3ff72a68b2090df94d83e10b92e"}, + {file = "XlsxWriter-3.1.4.tar.gz", hash = "sha256:f4b1b1ba046b50aefc0b634d465bce5bf8497530bc8625e216cf30a84ed97a46"}, +] + [[package]] name = "xxhash" version = "3.3.0" @@ -4288,4 +4247,4 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "019a45e6955dd283a0127e78e50df6fe8c29cebbdfba847bc9cb39d9f7a509bb" +content-hash = "106227ddfe680190f88567d57c7cd3f7e69fea2a621b07f082ffe420f6d0b0e4" diff --git a/pyproject.toml b/pyproject.toml index ed65aacb..f757fcef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ openai = "0.27.8" openai-multi-client = "^0.1.1" openpyxl = "^3.1.2" colorama = "^0.4.6" +xlsxwriter = "^3.1.4" [tool.poetry.group.dev] diff --git a/scripts/create_xlinear_bertmesh_comparison_csv.py b/scripts/create_xlinear_bertmesh_comparison_csv.py index cca0c650..10b13330 100644 --- a/scripts/create_xlinear_bertmesh_comparison_csv.py +++ b/scripts/create_xlinear_bertmesh_comparison_csv.py @@ -90,7 +90,6 @@ def create_comparison_csv( mesh_metadata_path: str, mesh_terms_list_path: str, active_portfolio_path: str, - active_portfolio_sample: int, pre_annotate_bertmesh: bool, bertmesh_path: str, bertmesh_thresh: float, @@ -129,15 +128,14 @@ def create_comparison_csv( active_grants = pd.read_csv(active_portfolio_path) active_grants = active_grants[~active_grants["Synopsis"].isna()] active_grants.drop_duplicates(subset="Synopsis", inplace=True) - active_grants_sample = active_grants.sample(n=active_portfolio_sample) - active_grants_sample = pd.DataFrame( + active_grants = pd.DataFrame( { - "abstract": active_grants_sample["Synopsis"], - "Reference": active_grants_sample["Reference"], + "abstract": active_grants["Synopsis"], + "Reference": active_grants["Reference"], } ) - active_grants_sample["active_portfolio"] = 1 - 
grants_sample = pd.concat([grants_sample, active_grants_sample]) + active_grants["active_portfolio"] = 1 + grants_sample = pd.concat([grants_sample, active_grants]) abstracts = grants_sample["abstract"].tolist() print(f"{len(abstracts)} abstracts to tag") @@ -197,7 +195,7 @@ def create_comparison_csv( ) # Output df to csv - grants_sample.to_excel(output_path, index=False) + grants_sample.to_excel(output_path, index=False, engine="xlsxwriter") if __name__ == "__main__": @@ -208,7 +206,6 @@ def create_comparison_csv( parser.add_argument("--mesh-metadata-path", type=str) parser.add_argument("--mesh-terms-list-path", type=str) parser.add_argument("--active-portfolio-path", type=str) - parser.add_argument("--active-portfolio-sample", type=int, default=200) parser.add_argument("--pre-annotate-bertmesh", action="store_true") parser.add_argument( "--bertmesh-path", type=str, default="Wellcome/WellcomeBertMesh" @@ -229,7 +226,6 @@ def create_comparison_csv( mesh_metadata_path=args.mesh_metadata_path, mesh_terms_list_path=args.mesh_terms_list_path, active_portfolio_path=args.active_portfolio_path, - active_portfolio_sample=args.active_portfolio_sample, pre_annotate_bertmesh=args.pre_annotate_bertmesh, bertmesh_path=args.bertmesh_path, bertmesh_thresh=args.bertmesh_thresh, From f66cbbb31771d83691d433f82430cf1eab01a94a Mon Sep 17 00:00:00 2001 From: agombert Date: Tue, 24 Oct 2023 13:50:34 +0000 Subject: [PATCH 101/102] :wrench: change to steps saving / eval strategy --- pipelines/bertmesh/dvc.lock | 17 +++++++++++++---- pipelines/bertmesh/dvc.yaml | 16 +++++++++------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/pipelines/bertmesh/dvc.lock b/pipelines/bertmesh/dvc.lock index f8243e4a..ca8f4eca 100644 --- a/pipelines/bertmesh/dvc.lock +++ b/pipelines/bertmesh/dvc.lock @@ -8,10 +8,9 @@ stages: 16 \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone unfreeze \ --num_train_epochs 7 \ --learning_rate 5e-5 \ --dropout 0.1 \ 
--hidden_size 1024 \ --warmup_steps 5000 \ --max_grad_norm 2.0 \ --scheduler_type cosine_hard_restart - \ --weight_decay 0.2 \ --correct_bias True \ --threshold 0.25 \ --prune_labels_in_evaluation - True \ --hidden_dropout_prob 0.2 \ --attention_probs_dropout_prob 0.2 \ --fp16 - \ --torch_compile \ --evaluation_strategy epoch \ --eval_accumulation_steps - 20 \ --save_strategy epoch + \ --weight_decay 0.2 \ --correct_bias True \ --threshold 0.25 \ --hidden_dropout_prob 0.2 \ --attention_probs_dropout_prob 0.2 \ --fp16 + \ --torch_compile \ --evaluation_strategy steps \eval_steps 10000 \ --eval_accumulation_steps + 20 \ --save_strategy steps \ --gradient_accumulation_step 2 \ --save_steps 20000 deps: - path: ../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl md5: 5011944fabc5ac7d6cfb2b20c17ae4d4 @@ -21,3 +20,13 @@ stages: md5: d751713988987e9331980363e24189ce.dir size: 0 nfiles: 0 + preprocess: + cmd: + - mkdir -p ../../bertmesh_outs/pipeline_test/best + - grants-tagger preprocess mesh ../../data/raw/allMeSH_2021.jsonl ../../preprocessed_results + "" --test-size 25000 --train-years 2016,2017,2018,2019 --test-years 2020,2021 + outs: + - path: ../../preprocessed_results + md5: fc1fd7a533d2acf3b2225e2a62cebab3.dir + size: 12552341517 + nfiles: 199 diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 0f207546..44b9f44d 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -1,7 +1,7 @@ vars: - # ORIGINAL: - data_path: "../../data/raw/allMeSH_2021.jsonl" - # RETAGGED: - - data_path: "../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl" + # RETAGGED: - data_path: "../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl" + # ORIGINAL: + - data_path: "../../data/raw/allMeSH_2021.jsonl" - preprocessing_output: "../../preprocessed_results" - script_loc: "../../grants_tagger_light/training" - output_dir: "../../bertmesh_outs/pipeline_test/best" @@ -18,7 +18,8 @@ stages: cmd: >- grants-tagger train bertmesh "" 
${preprocessing_output} --output_dir ${output_dir} - --per_device_train_batch_size 16 + --per_device_train_batch_size 8 + --gradient_accumulation_step 2 --per_device_eval_batch_size 1 --multilabel_attention True --freeze_backbone unfreeze @@ -32,14 +33,15 @@ stages: --weight_decay 0.2 --correct_bias True --threshold 0.25 - --prune_labels_in_evaluation True --hidden_dropout_prob 0.2 --attention_probs_dropout_prob 0.2 --fp16 --torch_compile - --evaluation_strategy epoch + --evaluation_strategy steps --eval_accumulation_steps 20 - --save_strategy epoch + --save_strategy steps + --save_steps 20000 + --eval_steps 10000 deps: - ${preprocessing_output} outs: From 58ef63e98b488bd27db451ffccbb840b508b3aeb Mon Sep 17 00:00:00 2001 From: agombert Date: Wed, 1 Nov 2023 12:21:03 +0000 Subject: [PATCH 102/102] retrained evaluation --- .gitignore | 1 + .../evaluation/evaluate_model.py | 21 +- grants_tagger_light/training/train.py | 13 +- grants_tagger_light/utils/utils.py | 24 +- pipelines/bertmesh/dvc.lock | 40 +- pipelines/bertmesh/dvc.yaml | 10 +- scripts/evaluation_ss.py | 358 ++++++++++++++++++ 7 files changed, 432 insertions(+), 35 deletions(-) create mode 100644 scripts/evaluation_ss.py diff --git a/.gitignore b/.gitignore index feac557b..2bbb10fb 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,4 @@ cython_debug/ bertmesh_outs/ wandb/ /bertmesh_before_retagging +/preprocessed_results diff --git a/grants_tagger_light/evaluation/evaluate_model.py b/grants_tagger_light/evaluation/evaluate_model.py index eb43b720..8ace28c2 100644 --- a/grants_tagger_light/evaluation/evaluate_model.py +++ b/grants_tagger_light/evaluation/evaluate_model.py @@ -6,6 +6,7 @@ from typing import Optional from transformers import pipeline from transformers.pipelines import PIPELINE_REGISTRY +from tqdm.auto import tqdm import scipy.sparse as sp import typer @@ -33,13 +34,20 @@ def evaluate_model( model = BertMesh.from_pretrained(model_path) label_binarizer = MultiLabelBinarizer() - 
label_binarizer.fit([list(model.id2label.values())]) + id2labels = [0 for i in range(model.config.num_labels)] + for k, v in model.id2label.items(): + id2labels[k] = v + label_binarizer.fit([id2labels]) pipe = pipeline( "grants-tagging", model=model, tokenizer="Wellcome/WellcomeBertMesh", + device=0, ) + def data(): + for x in X_test: + yield x if split_data: print( @@ -48,13 +56,14 @@ def evaluate_model( ) _, X_test, _, Y_test = load_train_test_data(data_path, label_binarizer) else: - X_test, Y_test, _ = load_data(data_path, label_binarizer) - - Y_pred_proba = pipe(X_test, return_labels=False) - + X_test, Y_test, _ = load_data(data_path, label_binarizer, model_id2labels=model.id2label) + + Y_pred_proba = [] + for out in tqdm(pipe(data(), return_labels=False)): + Y_pred_proba.append(out) Y_pred_proba = torch.vstack(Y_pred_proba) - Y_pred_proba = sp.csr_matrix(Y_pred_proba) + #Y_pred_proba = sp.csr_matrix(Y_pred_proba) if not isinstance(threshold, list): threshold = [threshold] diff --git a/grants_tagger_light/training/train.py b/grants_tagger_light/training/train.py index 680e8fd6..285f80a8 100644 --- a/grants_tagger_light/training/train.py +++ b/grants_tagger_light/training/train.py @@ -161,13 +161,19 @@ def sklearn_metrics(prediction: EvalPrediction): # This is a batch, so it's an array (rows) of array (labels) # Array of arrays with probas [[5.4e-5 1.3e-3...] [5.4e-5 1.3e-3...] ... ] y_pred = prediction.predictions + logger.info("predictions made before sigmoid") + logger.info(np.sum(np.int64(y_pred > training_args.threshold))) + + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + y_pred = np.array([sigmoid(y) for y in prediction.predictions]) # Transformed to 0-1 if bigger than threshold [[0 1 0...] [0 0 1...] ... ] y_pred = np.int64(y_pred > training_args.threshold) # Array of arrays with 0/1 [[0 0 1 ...] [0 1 0 ...] ... 
] y_true = prediction.label_ids - - # report = classification_report(y_pred, y_true, output_dict=True) + logger.info("predictions made after sigmoid") + logger.info(np.sum(y_pred)) if training_args.prune_labels_in_evaluation: mask = np.zeros(y_pred.shape, dtype=bool) @@ -178,9 +184,10 @@ def sklearn_metrics(prediction: EvalPrediction): else: filtered_y_pred = y_pred filtered_y_true = y_true + logger.info("classification report...") report = classification_report( - filtered_y_pred, filtered_y_true, output_dict=True + filtered_y_true, filtered_y_pred, output_dict=True ) metric_dict = { diff --git a/grants_tagger_light/utils/utils.py b/grants_tagger_light/utils/utils.py index 8264ffe2..68ca6c07 100644 --- a/grants_tagger_light/utils/utils.py +++ b/grants_tagger_light/utils/utils.py @@ -11,6 +11,7 @@ from sklearn.model_selection import train_test_split from transformers import AutoModel from sklearn.preprocessing import MultiLabelBinarizer +from datasets import load_from_disk logger = logging.getLogger(__name__) @@ -36,10 +37,31 @@ def yield_tags(data_path, label_binarizer=None): else: yield item["tags"] +def load_data_from_dataset(data_path, model_id2labels, label_binarizer): + df = load_from_disk(data_path) + texts = df['test']['abstractText'] + label_ids = df['test']['label_ids'] + labels = [] + for label_id in label_ids: + labels.append([model_id2labels[label] for label in label_id]) + + # create on-hot-encoding + tags = [] + for label_set in label_ids: + tag = [0] * len(label_binarizer.classes_) + for label in label_set: + tag[label] = 1 + tags.append(tag) + + + return texts, tags, None + -def load_data(data_path, label_binarizer=None, X_format="List"): +def load_data(data_path, label_binarizer=None, X_format="List", model_id2labels=None): """Load data from the dataset.""" print("Loading data...") + if ".json" not in str(data_path): + return load_data_from_dataset(data_path, model_id2labels, label_binarizer) texts = [] tags = [] diff --git 
a/pipelines/bertmesh/dvc.lock b/pipelines/bertmesh/dvc.lock index ca8f4eca..b824c2c8 100644 --- a/pipelines/bertmesh/dvc.lock +++ b/pipelines/bertmesh/dvc.lock @@ -1,32 +1,32 @@ schema: '2.0' stages: train: - cmd: - - mkdir -p ../../bertmesh_outs/pipeline_test/best - - python ../../grants_tagger_light/training/train.py \ --model_key "" \ --data_path - ../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl \ --per_device_train_batch_size - 16 \ --per_device_eval_batch_size 1 \ --multilabel_attention True \ --freeze_backbone - unfreeze \ --num_train_epochs 7 \ --learning_rate 5e-5 \ --dropout 0.1 \ --hidden_size - 1024 \ --warmup_steps 5000 \ --max_grad_norm 2.0 \ --scheduler_type cosine_hard_restart - \ --weight_decay 0.2 \ --correct_bias True \ --threshold 0.25 \ --hidden_dropout_prob 0.2 \ --attention_probs_dropout_prob 0.2 \ --fp16 - \ --torch_compile \ --evaluation_strategy steps \eval_steps 10000 \ --eval_accumulation_steps - 20 \ --save_strategy steps \ --gradient_accumulation_step 2 \ --save_steps 20000 + cmd: grants-tagger train bertmesh "" ../../preprocessed_results --output_dir ../../bertmesh_outs/pipeline_test/best + --per_device_train_batch_size 8 --gradient_accumulation_step 2 --per_device_eval_batch_size + 1 --multilabel_attention True --freeze_backbone unfreeze_bias --num_train_epochs + 7 --learning_rate 5e-5 --dropout 0.1 --hidden_size 1024 --warmup_steps 5000 + --max_grad_norm 2.0 --scheduler_type cosine_hard_restart --weight_decay 0.2 + --correct_bias True --threshold 0.25 --hidden_dropout_prob 0.2 --attention_probs_dropout_prob + 0.2 --fp16 --torch_compile --evaluation_strategy steps --eval_accumulation_steps + 20 --save_strategy steps --save_steps 10000 --eval_steps 1000 deps: - - path: ../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl - md5: 5011944fabc5ac7d6cfb2b20c17ae4d4 - size: 5544198887 + - path: ../../preprocessed_results + md5: f4e12c892d00f2ec079cf9fb782c8bc7.dir + size: 12530480449 + nfiles: 199 outs: - path: 
../../bertmesh_outs/pipeline_test/best - md5: d751713988987e9331980363e24189ce.dir - size: 0 - nfiles: 0 + md5: b0dc8fcdb92f068115081d77d0c4f93f.dir + size: 12410643000 + nfiles: 44 preprocess: cmd: - mkdir -p ../../bertmesh_outs/pipeline_test/best - - grants-tagger preprocess mesh ../../data/raw/allMeSH_2021.jsonl ../../preprocessed_results - "" --test-size 25000 --train-years 2016,2017,2018,2019 --test-years 2020,2021 + - grants-tagger preprocess mesh ../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl + ../../preprocessed_results "" --test-size 25000 --train-years 2016,2017,2018,2019 + --test-years 2020,2021 outs: - path: ../../preprocessed_results - md5: fc1fd7a533d2acf3b2225e2a62cebab3.dir - size: 12552341517 + md5: f4e12c892d00f2ec079cf9fb782c8bc7.dir + size: 12530480449 nfiles: 199 diff --git a/pipelines/bertmesh/dvc.yaml b/pipelines/bertmesh/dvc.yaml index 44b9f44d..5f1eb6be 100644 --- a/pipelines/bertmesh/dvc.yaml +++ b/pipelines/bertmesh/dvc.yaml @@ -1,7 +1,7 @@ vars: # RETAGGED: - data_path: "../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl" - # ORIGINAL: - - data_path: "../../data/raw/allMeSH_2021.jsonl" + # ORIGINAL: - data_path: "../../data/raw/allMeSH_2021.jsonl" + - data_path: "../../data/raw/retagging/allMeSH_2021.2016-2021.jsonl" - preprocessing_output: "../../preprocessed_results" - script_loc: "../../grants_tagger_light/training" - output_dir: "../../bertmesh_outs/pipeline_test/best" @@ -22,7 +22,7 @@ stages: --gradient_accumulation_step 2 --per_device_eval_batch_size 1 --multilabel_attention True - --freeze_backbone unfreeze + --freeze_backbone unfreeze_bias --num_train_epochs 7 --learning_rate 5e-5 --dropout 0.1 @@ -40,8 +40,8 @@ stages: --evaluation_strategy steps --eval_accumulation_steps 20 --save_strategy steps - --save_steps 20000 - --eval_steps 10000 + --save_steps 10000 + --eval_steps 1000 deps: - ${preprocessing_output} outs: diff --git a/scripts/evaluation_ss.py b/scripts/evaluation_ss.py new file mode 100644 index 
00000000..28c3643b --- /dev/null +++ b/scripts/evaluation_ss.py @@ -0,0 +1,358 @@ +## Quick Evaluation script based on the training script, for preprocessed tags +from transformers import ( + Trainer, + TrainingArguments, + EvalPrediction, + HfArgumentParser, + AutoConfig, + AdamW, + get_cosine_schedule_with_warmup, + get_constant_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup, +) +from grants_tagger_light.models.bert_mesh import BertMesh +from grants_tagger_light.preprocessing.preprocess_mesh import preprocess_mesh +from grants_tagger_light.training.cli_args import ( + BertMeshTrainingArguments, + WandbArguments, + BertMeshModelArguments, +) +from grants_tagger_light.training.dataloaders import ( + MultilabelDataCollator, +) +from sklearn.metrics import classification_report +from loguru import logger +from pprint import pformat +import typer +import numpy as np +import os +import transformers +import json +from datasets import load_from_disk + +from grants_tagger_light.utils.sharding import Sharding +from grants_tagger_light.utils.years_tags_parser import parse_years, parse_tags + +transformers.set_seed(42) + + +def evaluate_bertmesh( + model_key: str, + data_path: str, + training_args: TrainingArguments, + model_args: BertMeshModelArguments = None, + max_samples: int = -1, + test_size: float = None, + num_proc: int = os.cpu_count(), + shards: int = os.cpu_count(), + from_checkpoint: str = None, + tags: list = None, + train_years: list = None, + test_years: list = None, +): + if not model_key: + assert isinstance(model_args, BertMeshModelArguments), ( + "If model_key is not provided, " + "must provide model_args of type BertMeshModelArguments" + ) # noqa + + logger.info(f"Preprocessing the dataset at {data_path}...") + if os.path.isdir(data_path): + logger.info( + "Train/test data found in a folder, which means you preprocessed and " + "save the data before. Loading that split from disk..." 
+ ) + dset = load_from_disk(os.path.join(data_path, "dataset")) + with open(os.path.join(data_path, "label2id"), "r") as f: + label2id = json.load(f) + with open(os.path.join(data_path, "id2label"), "r") as f: + id2label = json.load(f) + else: + logger.info("Preprocessing the data on the fly...") + dset, label2id, id2label = preprocess_mesh( + data_path=data_path, + model_key=model_key, + test_size=test_size, + num_proc=num_proc, + max_samples=max_samples, + batch_size=training_args.per_device_train_batch_size, + tags=tags, + train_years=train_years, + test_years=test_years, + ) + + train_dset, val_dset = dset["train"], dset["test"] + + metric_labels = [] + for x in train_dset["label_ids"]: + metric_labels.extend(x) + + train_dset_size = len(train_dset) + logger.info(f"Training dataset size: {train_dset_size}") + if max_samples > 0: + train_dset_size = min(max_samples, train_dset_size) + logger.info(f"Training max samples: {train_dset_size}.") + train_dset.filter( + lambda example, idx: idx < train_dset_size, + with_indices=True, + num_proc=num_proc, + ) + else: + logger.info("Training with all data...") + + if shards > 0: + logger.info("Sharding training dataset...") + train_dset = Sharding(num_shards=shards).shard(train_dset) + + if not model_key: + logger.info( + f"Model key not found. 
" + f"Training from scratch {model_args.pretrained_model_key}" + ) + + # Instantiate model from scratch + logger.info(f"Loading `{model_args.pretrained_model_key}` tokenizer...") + config = AutoConfig.from_pretrained(model_args.pretrained_model_key) + + config.update( + { + "pretrained_model": model_args.pretrained_model_key, + "num_labels": len(label2id), + "hidden_size": model_args.hidden_size, + "dropout": model_args.dropout, + "multilabel_attention": model_args.multilabel_attention, + "label2id": label2id, + "id2label": id2label, + "freeze_backbone": model_args.freeze_backbone, + "hidden_dropout_prob": model_args.hidden_dropout_prob, + "attention_probs_dropout_prob": model_args.attention_probs_dropout_prob, + } + ) + logger.info(f"Hidden size: {config.hidden_size}") + logger.info(f"Dropout: {config.dropout}") + logger.info(f"Multilabel Attention: {config.multilabel_attention}") + logger.info(f"Freeze Backbone: {config.freeze_backbone}") + logger.info(f"Num labels: {config.num_labels}") + logger.info(f"hidden_dropout_prob: {config.hidden_dropout_prob}") + logger.info( + f"attention_probs_dropout_prob: {config.attention_probs_dropout_prob}" + ) + + model = BertMesh(config) + + else: + logger.info(f"Training from pretrained key {model_key}") + model = BertMesh.from_pretrained(model_key, trust_remote_code=True) + + if model_args.freeze_backbone is None: + model_args.freeze_backbone = "freeze" + + def sklearn_metrics(prediction: EvalPrediction): + # This is a batch, so it's an array (rows) of array (labels) + # Array of arrays with probas [[5.4e-5 1.3e-3...] [5.4e-5 1.3e-3...] ... ] + y_pred = prediction.predictions + logger.info("predictions made before sigmoid") + logger.info(np.sum(np.int64(y_pred > training_args.threshold))) + + def sigmoid(x): + return 1 / (1 + np.exp(-x)) + y_pred = np.array([sigmoid(y) for y in prediction.predictions]) + # Transformed to 0-1 if bigger than threshold [[0 1 0...] [0 0 1...] ... 
] + y_pred = np.int64(y_pred > training_args.threshold) + + # Array of arrays with 0/1 [[0 0 1 ...] [0 1 0 ...] ... ] + y_true = prediction.label_ids + logger.info("predictions made after sigmoid") + logger.info(np.sum(y_pred)) + + if training_args.prune_labels_in_evaluation: + mask = np.zeros(y_pred.shape, dtype=bool) + mask[np.arange(y_pred.shape[0])[:, np.newaxis], metric_labels] = True + + filtered_y_pred = y_pred[mask].reshape(y_pred.shape[0], -1) + filtered_y_true = y_true[mask].reshape(y_true.shape[0], -1) + else: + filtered_y_pred = y_pred + filtered_y_true = y_true + logger.info("classification report...") + + report = classification_report( + filtered_y_true, filtered_y_pred, output_dict=True + ) + averages = {idx: r for idx, r in report.items() if "avg" in idx} + report = { + id2label[idx]: r + for idx, r in report.items() + if "avg" not in idx + } + + report = {**averages, **report} + + metric_dict = { + "micro_avg": report["micro avg"], + "macro_avg": report["macro avg"], + "weighted_avg": report["weighted avg"], + "samples_avg": report["samples avg"], + "full_report": report, + } + + return metric_dict + + logger.info("Collating labels...") + collator = MultilabelDataCollator(label2id=label2id) + + if shards > 0: + logger.info("Calculating max steps for IterableDatasets shards...") + max_steps = Sharding.calculate_max_steps(training_args, train_dset_size) + training_args.max_steps = max_steps + + optimizer = AdamW( + model.parameters(), + lr=training_args.learning_rate, + weight_decay=training_args.weight_decay, + correct_bias=training_args.correct_bias + if hasattr(training_args, "correct_bias") + else True, + ) + + if training_args.warmup_steps is None: + training_args.warmup_steps = 0 + + if training_args.scheduler_type.lower().strip() == "cosine": + scheduler = get_cosine_schedule_with_warmup( + optimizer, + num_warmup_steps=training_args.warmup_steps, + num_training_steps=training_args.max_steps, + ) + elif 
training_args.scheduler_type.lower().strip() == "constant": + scheduler = get_constant_schedule_with_warmup( + optimizer, num_warmup_steps=training_args.warmup_steps + ) + elif training_args.scheduler_type.lower().strip() == "cosine_hard_restart": + scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( + optimizer, + num_warmup_steps=training_args.warmup_steps, + num_training_steps=training_args.max_steps, + num_cycles=training_args.num_train_epochs, + ) + elif training_args.scheduler_type.lower().strip() == "linear": + scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=training_args.warmup_steps, + num_training_steps=training_args.max_steps, + ) + else: + logger.warning( + f"{training_args.scheduler_type}: not found or not valid. " + f"Falling back to `linear`" + ) + scheduler = get_linear_schedule_with_warmup( + optimizer, + num_warmup_steps=training_args.warmup_steps, + num_training_steps=training_args.max_steps, + ) + + logger.info(f"Optimizer: {optimizer}") + logger.info(f"Scheduler: {training_args.scheduler_type}") + + training_args.optim = optimizer + training_args.lr_scheduler_type = scheduler + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dset, + eval_dataset=val_dset, + data_collator=collator, + compute_metrics=sklearn_metrics, + optimizers=(optimizer, scheduler), + ) + + + logger.info("Evaluating...") + metrics = trainer.evaluate(eval_dataset=val_dset) + + logger.info(pformat(metrics)) + with open("preprocessed_results/metrics_old.json", "w") as f: + f.write(pformat(metrics)) + + +train_app = typer.Typer() + + +@train_app.command() +def evaluate_bertmesh_cli( + ctx: typer.Context, + model_key: str = typer.Argument( + ..., help="Pretrained model key. 
" "Local path or HF location" + ), + data_path: str = typer.Argument( + ..., + help="Path to allMeSH_2021.jsonl (or similar) " + "or to a folder after preprocessing and saving to disk", + ), + test_size: float = typer.Option( + None, help="Fraction of data to use for testing (0,1] or number of rows" + ), + num_proc: int = typer.Option( + os.cpu_count(), help="Number of processes to use for preprocessing" + ), + max_samples: int = typer.Option( + -1, + help="Maximum number of samples to use from the json", + ), + shards: int = typer.Option( + os.cpu_count(), + help="Number os shards to divide training " + "IterativeDataset to (improves performance)", + ), + from_checkpoint: str = typer.Option( + None, help="Name of the checkpoint to resume training" + ), + tags: str = typer.Option( + None, + help="Comma-separated tags you want to include in the dataset " + "(the rest will be discarded)", + ), + train_years: str = typer.Option( + None, help="Comma-separated years you want to include in the training dataset" + ), + test_years: str = typer.Option( + None, help="Comma-separated years you want to include in the test dataset" + ), +): + parser = HfArgumentParser( + ( + BertMeshTrainingArguments, + WandbArguments, + BertMeshModelArguments, + ) + ) + ( + training_args, + wandb_args, + model_args, + ) = parser.parse_args_into_dataclasses(ctx.args) + + logger.info("Model args: {}".format(pformat(model_args))) + logger.info("Training args: {}".format(pformat(training_args))) + logger.info("Wandb args: {}".format(pformat(wandb_args))) + + evaluate_bertmesh( + model_key=model_key, + data_path=data_path, + training_args=training_args, + model_args=model_args, + max_samples=max_samples, + test_size=test_size, + num_proc=num_proc, + shards=shards, + from_checkpoint=from_checkpoint, + tags=parse_tags(tags), + train_years=parse_years(train_years), + test_years=parse_years(test_years), + ) + +if __name__ == "__main__": + train_app() \ No newline at end of file