chore: prepare release v1.3.0 (#2304)
# Changelog

All notable changes to this project will be documented in this file. See
[standard-version](https://github.com/conventional-changelog/standard-version)
for commit guidelines.

## [1.3.0](v1.2.1...v1.3.0) (2023-02-09)


### Features

* better log error handling
([#2245](#2245))
([66e5cce](66e5cce)),
closes [#2005](#2005)
* Change view mode order in sidebar
([#2215](#2215))
([dff1ea1](dff1ea1)),
closes [#2214](#2214)
* **Client:** Expose keywords dataset metrics
([#2290](#2290))
([a945c5e](a945c5e)),
closes [#2135](#2135)
* **Client:** relax client constraints for rules management
([#2242](#2242))
([6e749b7](6e749b7)),
closes [#2048](#2048)
* Create a multiple contextual help component
([#2255](#2255))
([a35fae2](a35fae2)),
closes [#1926](#1926)
* Include record event_timestamp
([#2156](#2156))
([3992b8f](3992b8f)),
closes [#1911](#1911)
* updated the `prepare_for_training` methods
([#2225](#2225))
([e53c201](e53c201)),
closes [#2154](#2154)
[#2132](#2132)
[#2122](#2122)
[#2045](#2045)
[#1697](#1697)


### Bug Fixes

* **Client:** formatting caused offset in prediction
([#2241](#2241))
([d65db5a](d65db5a))
* **Client:** Log remaining data when shutting down the dataset consumer
([#2269](#2269))
([d78963e](d78963e)),
closes [#2189](#2189)
* validate predictions fails on text2text
([#2271](#2271))
([f68856e](f68856e)),
closes [#2252](#2252)


### Visual enhancements

* Fine tune menu record card
([#2240](#2240))
([62148e5](62148e5)),
closes [#2224](#2224)
* Rely on box-shadow to provide the secondary underline
([#2283](#2283))
([d786171](d786171)),
closes [#2282](#2282)
[#2282](#2282)


### Documentation

* Add deploy on Spaces buttons
([#2293](#2293))
([60164a0](60164a0))
* fix typo in documentation
([#2296](#2296))
([ab8e85e](ab8e85e))
* Improve deployment and quickstart docs and tutorials
([#2201](#2201))
([075bf94](075bf94)),
closes [#2162](#2162)
* More spaces! ([#2309](#2309))
([f02eb60](f02eb60))
* Remove cut-off sentence in docs codeblock
([#2287](#2287))
([7e87f20](7e87f20))
* Rephrase `to know more` into `to learn more` in Quickstart login page
([#2305](#2305))
([6082a26](6082a26))
* Replace leftover `rubrix.apikey` with `argilla.apikey`
([#2286](#2286))
([4871127](4871127)),
closes [#2254](#2254)
[#2254](#2254)
* Simplify token attributions code block
([#2322](#2322))
([4cb6ae1](4cb6ae1))
* Tutorial buttons
([#2310](#2310))
([d6e02de](d6e02de))
* Update colab guide
([#2320](#2320))
([e48a7cc](e48a7cc))
* Update HF Spaces creation image
([#2314](#2314))
([e4b2a04](e4b2a04))


## As always, thanks to our amazing contributors!


- add repr method for Rule, Dataset. (#2148) by @Ankush-Chander
- opensearch docker compose file doesn't run (#2228) by @kayvane1
- Docs: fix typo in documentation (#2296) by @anakin87
frascuchon authored Feb 9, 2023
2 parents 0bb3187 + e55ea3e commit de0983f
Showing 162 changed files with 5,413 additions and 2,060 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/link-2-teamwork.yml
@@ -0,0 +1,18 @@
name: Link new issues to Team work

on:
  issues:
    types:
      - opened

jobs:
  add-to-project:
    name: Add issue to project
    runs-on: ubuntu-latest
    steps:
      - uses: actions/add-to-project@v0.4.0
        with:
          # You can target a repository in a different organization
          # to the issue
          project-url: https://github.com/orgs/argilla-io/projects/4
          github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
61 changes: 30 additions & 31 deletions .pre-commit-config.yaml
@@ -1,50 +1,49 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: check-yaml
- id: end-of-file-fixer
exclude_types: [text, jupyter]
- id: trailing-whitespace
- id: trailing-whitespace

- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.3.1
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.4.2
hooks:
- id: insert-license
- id: insert-license
name: "Insert license header in Python source files"
files: \.py$
args:
- --license-filepath
- src/license_header.txt
- --fuzzy-match-generates-todo
# - --remove-header
- --license-filepath
- src/license_header.txt
- --fuzzy-match-generates-todo
# - --remove-header

- repo: https://github.com/psf/black
- repo: https://github.com/psf/black
rev: 22.12.0
hooks:
- id: black
additional_dependencies: ['click==8.0.4']
- id: black
additional_dependencies: ["click==8.0.4"]

- repo: https://github.com/pycqa/isort
rev: 5.11.4
- repo: https://github.com/pycqa/isort
rev: 5.11.5
hooks:
- id: isort
- id: isort

- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
rev: v9.3.0
- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
rev: v9.4.0
hooks:
- id: commitlint
- id: commitlint
stages: [commit-msg]
additional_dependencies: ['@commitlint/config-conventional']
additional_dependencies: ["@commitlint/config-conventional"]

ci:
autofix_commit_msg: |
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
autofix_prs: true
autoupdate_branch: ''
autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
autoupdate_schedule: weekly
skip: []
submodules: false
autofix_commit_msg: |
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
autofix_prs: true
autoupdate_branch: ""
autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
autoupdate_schedule: weekly
skip: []
submodules: false
8 changes: 6 additions & 2 deletions README.md
@@ -19,10 +19,14 @@
</p>

<h2 align="center">Open-source framework for data-centric NLP</h2>
<p align="center">Data Labeling + Data Curation + Inference Store</p>
<p align="center">Data Labeling, Curation, and Inference Store</p>
<p align="center">Designed for MLOps & Feedback Loops</p>

#### 🆕 Play with [Argilla live-demo](https://argilla-live-demo.hf.space) powered by Hugging Face Spaces. (username and password: `huggingface`/ `1234` ) </h3>

> 🆕 🔥 Play with Argilla UI with this [live-demo](https://argilla-live-demo.hf.space) powered by Hugging Face Spaces (login:`argilla`, password:`1234`)
> 🆕 🔥 Since `1.2.0` Argilla supports vector search for finding the most similar records to a given one. This feature uses vector or semantic search combined with more traditional search (keyword and filter based). Learn more on this [deep-dive guide](https://docs.argilla.io/en/latest/guides/features/semantic-search.html)

![imagen](https://user-images.githubusercontent.com/1107111/204772677-facee627-9b3b-43ca-8533-bbc9b4e2d0aa.png)

4 changes: 2 additions & 2 deletions docker-compose.opensearch.yaml
@@ -30,7 +30,7 @@ services:
expose:
- "5601" # Expose port 5601 for web access to OpenSearch Dashboards
environment:
DISABLE_SECURITY_DASHBOARDS_PLUGIN: true
DISABLE_SECURITY_DASHBOARDS_PLUGIN: 1
OPENSEARCH_HOSTS: '["http://opensearch-node1:9200"]'
networks:
- argilla
@@ -39,4 +39,4 @@ networks:
argilla:
driver: bridge
volumes:
opensearch-data:
opensearch-data:
@@ -53,10 +53,15 @@ By default, if you don't configure a `users.yml` file, your Argilla instance is

- username: `argilla`
- password: `1234`
- api_key: `rubrix.apikey`
- api_key: `argilla.apikey`

For security reasons, we recommend changing at least the password and API key.

:::{note}
To connect to an old Argilla server using client `>=1.3.0`, you should specify the default user API key `rubrix.apikey`.
Otherwise, connections will fail with an Unauthorized server error.
:::
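A minimal sketch of that workaround, assuming the Python client's standard `ARGILLA_API_URL`/`ARGILLA_API_KEY` environment variables (the server URL is a placeholder, not taken from this page):

```shell
# Point a >=1.3.0 Argilla client at a pre-1.3.0 server by exporting
# the old default API key before starting your session.
export ARGILLA_API_URL="http://localhost:6900"  # placeholder: your old server
export ARGILLA_API_KEY="rubrix.apikey"          # old default key, per the note above
```

Equivalently, the key can be passed directly to `rg.init(api_key="rubrix.apikey")` in Python.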

### Override default API key

To override the default API key you can set the following environment variable before launching the server:
@@ -105,7 +110,7 @@ To configure your Argilla instance for various users, you just need to create a
workspaces: ['client_projects'] # access to her user workspace and the client_projects workspace
- username: user3
hashed_password: <generated-hashed-password> # See the previous section above
api_key: "ThisIsTheUser2APIKEY" # this user can access all workspaces (including
api_key: "ThisIsTheUser2APIKEY" # this user can access all workspaces
- ...
```
@@ -23,7 +23,7 @@ You need to define the **Owner** (your personal account or an organization), a *
</div>

:::{tip}
If you want to customize the title, emojis, and colors of your space, go to "Files and Versions" and edit the metadata of your README.md file.
If you want to customize the title, emojis, and colors of your space, go to "Files and Versions" and edit the metadata of your README.md file.
:::

You'll see the `Building` status, and once it becomes `Running`, your space is ready to go. If you don't see the Argilla login UI, refresh the page.
36 changes: 32 additions & 4 deletions docs/_source/guides/log_load_and_prepare_data.ipynb
@@ -539,7 +539,19 @@
"## Prepare dataset for training\n",
"\n",
"If you want to train a Hugging Face transformer or a spaCy NER pipeline, we provide a handy method to prepare your dataset: `DatasetFor*.prepare_for_training()`.\n",
"It will return a Hugging Face dataset or a spaCy DocBin, optimized for the training process with the Hugging Face Trainer or the spaCy cli. Our [training tutorials](../tutorials/steps/2_training.md), show entire training workflows for your favorite packages."
"It will return a Hugging Face dataset, a spaCy DocBin or a SparkNLP-formatted DataFrame, optimized for the training process with the Hugging Face Trainer, the spaCy cli or the SparkNLP API. Our [libraries deepdive](../libraries/libraries.html) and [training tutorials](../../tutorials/steps/2_training.md) show entire training workflows for your favorite packages."
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "f6ae027b",
"metadata": {},
"source": [
"### Train-test split\n",
"\n",
"It is possible to directly create train-test splits with `prepare_for_training` by passing the `train_size` and `test_size` parameters.\n",
"\n"
]
},
{
@@ -567,9 +579,21 @@
" ]\n",
")\n",
"\n",
"dataset_rg.prepare_for_training()[0]\n",
"dataset_rg.prepare_for_training(framework=\"transformers\")[0]\n",
"# Output:\n",
"# {'title': 'My title', 'content': 'My content', 'label': 0}\n",
"\n",
"import spacy\n",
"\n",
"nlp = spacy.blank(\"en\")\n",
"dataset_rg.prepare_for_training(framework=\"spacy\", lang=nlp)\n",
"# Output:\n",
"# <spacy.tokens._serialize.DocBin object at 0x280613af0>\n",
"\n",
"\n",
"dataset_rg.prepare_for_training(framework=\"spark-nlp\")\n",
"# Output:\n",
"# {'title': 'My title', 'content': 'My content', 'label': 0}"
"# <pd.DataFrame>"
]
},
{
@@ -612,7 +636,11 @@
"nlp = spacy.blank(\"en\")\n",
"dataset_rg.prepare_for_training(framework=\"spacy\", lang=nlp)\n",
"# Output:\n",
"# <spacy.tokens._serialize.DocBin object at 0x280613af0>"
"# <spacy.tokens._serialize.DocBin object at 0x280613af0>\n",
"\n",
"dataset_rg.prepare_for_training(framework=\"spark-nlp\")\n",
"# Output:\n",
"# <pd.DataFrame>"
]
}
],
7 changes: 7 additions & 0 deletions docs/_source/reference/python/python_metrics.rst
@@ -5,9 +5,16 @@ Metrics

Here we describe the available metrics in Argilla:

- Common metrics: Metrics available for all datasets
- Text classification: Metrics for text classification
- Token classification: Metrics for token classification

Common metrics
--------------

.. automodule:: argilla.metrics.commons
:members:

Text classification
-------------------

2 changes: 1 addition & 1 deletion docs/_source/reference/webapp/features.md
@@ -168,7 +168,7 @@ If you struggle to increase the overall coverage, try to filter for the records
Here you will see a list of your saved rules.
You can edit a rule by clicking on its name, or delete it by clicking on the trash icon.

## Smenatic search
## Semantic search
This feature enables you to find similar records when exploring or annotating records. This leverages semantic search to find "semantically" similar records. In order to use this feature, your dataset records should contain vectors which can be associated when logging the dataset into Argilla. Check the Deep Dive Feature guide about [Semantic Search](../../guides/features/semantic-search.html) to understand how to benefit from this feature.

![Similarity Search](../../_static/reference/webapp/features-similaritysearch.png)
2 changes: 1 addition & 1 deletion environment_dev.yml
@@ -35,7 +35,7 @@ dependencies:
- pre-commit==2.15.0
# extra test dependencies
- cleanlab~=2.0.0 # With this version, tests are failing
- datasets>1.17.0,<2.3.0 # TODO: push_to_hub fails up to 2.3.2, check patches when they come out eventually
- datasets>1.17.0,!= 2.3.2 # TODO: push_to_hub fails up to 2.3.2, check patches when they come out eventually
- huggingface_hub != 0.5.0 # some backward comp. problems introduced in 0.5.0
- flair==0.10
- faiss-cpu
2 changes: 1 addition & 1 deletion frontend/assets/icons/index.js
@@ -14,11 +14,11 @@ require('./external')
require('./filter')
require('./hand-labeling')
require('./info')
require('./kebab-menu')
require('./link')
require('./log-out')
require('./matching')
require('./math-plus')
require('./meatballs')
require('./no-matching')
require('./progress')
require('./refresh')
27 changes: 0 additions & 27 deletions frontend/assets/icons/kebab-menu.js

This file was deleted.

27 changes: 27 additions & 0 deletions frontend/assets/icons/meatballs.js
@@ -0,0 +1,27 @@
/*
* coding=utf-8
* Copyright 2021-present, the Recognai S.L. team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* eslint-disable */
var icon = require('vue-svgicon')
icon.register({
  'meatballs': {
    width: 16,
    height: 16,
    viewBox: '0 0 30 8',
    data: '<path pid="0" d="M7.5 4A3.75 3.75 0 110 4a3.75 3.75 0 017.5 0zM18.75 4a3.75 3.75 0 11-7.5 0 3.75 3.75 0 017.5 0zM26.25 7.75a3.75 3.75 0 100-7.5 3.75 3.75 0 000 7.5z" _fill="#000"/>'
  }
})