feat: add nutrisight improvements (#362)

* fix(nutrisight): add minor fixes to some nutrisight scripts
* fix(nutrisight): rename folder 'dataset-generation' to 'dataset'
* docs(nutrisight): improve README.md
* fix(nutrisight): improve dataset scripts
openfoodfacts · Nov 21, 2024 · fb59153 · fb59153
1 parent d0f977e
commit fb59153
Show file tree

Hide file tree

Showing 17 changed files with 60 additions and 25 deletions.
diff --git a/nutrisight/README.md b/nutrisight/README.md
@@ -1,5 +1,9 @@
 # NutriSight
 
+## Dataset management
+
+Every script related to dataset management and generation is located in the `dataset` directory.
+
 ## Export
 
 To export the model to ONNX format, create a virtualenv with the following dependencies:
@@ -13,3 +17,10 @@ Then run the following command:
 ```bash
 optimum-cli export onnx -m openfoodfacts/nutrition-extractor --opset 19 --task token-classification nutrition-extractor-onnx-19
 ```
+
+## Thanks to our sponsors!
+
+The NutriSight project has indirectly received funding from the European Union’s Horizon Europe research and innovation action programme, via the DRG4FOOD – Open Call #1 issued and executed under the DRG4FOOD project (Grant Agreement no. 101086523).
+
+<img src="./assets/DRG4FOOD_Logo_Icon+Type-Black.png" alt="Funded by DRG4Food" title="Funded by DRG4Food" height="100" />  
+<img src="./assets/EN_FundedbytheEU_RGB_POS.png" alt="Funded by the EU" title="Funded by the EU" height="100" />
diff --git a/nutrisight/assets/DRG4FOOD_Logo_Icon+Type-Black.png b/nutrisight/assets/DRG4FOOD_Logo_Icon+Type-Black.png
diff --git a/nutrisight/assets/EN_FundedbytheEU_RGB_POS.png b/nutrisight/assets/EN_FundedbytheEU_RGB_POS.png
diff --git a/...-generation/10_get_prepared_as_samples.py → ...ght/dataset/10_get_prepared_as_samples.py b/...-generation/10_get_prepared_as_samples.py → ...ght/dataset/10_get_prepared_as_samples.py
@@ -41,7 +41,7 @@ def run(
 
         if "prepared-values" in issues:
             print(
-                f"https://annotate.openfoodfacts.org/projects/42/data?tab=61&task={task.id}"
+                f"{LABEL_STUDIO_URL}/projects/{project_id}/data?tab={view_id}&task={task.id}"
             )
 
 

diff --git a/...ht/dataset-generation/1_create_dataset.py → nutrisight/dataset/1_create_dataset.py b/...ht/dataset-generation/1_create_dataset.py → nutrisight/dataset/1_create_dataset.py
@@ -2,6 +2,7 @@
 import random
 import re
 import string
+import typing
 from collections import defaultdict
 from typing import Iterator, Optional
 
@@ -17,12 +18,17 @@
 
 logger = get_logger()
 
+# We use Redis to save the state of the dataset generation
 client = redis.Redis(host="localhost", port=6379, db=0)
 
 
 def create_annotation_results(
-    word_text: str, pre_annotation: str, vertices: list[tuple[int, int]], width, height
-):
+    word_text: str,
+    pre_annotation: str,
+    vertices: list[tuple[int, int]],
+    width: int,
+    height: int,
+) -> list[dict]:
     x_min = min(v[0] for v in vertices) * 100
     x_max = max(v[0] for v in vertices) * 100
     y_min = min(v[1] for v in vertices) * 100
@@ -196,12 +202,12 @@ def format_sample(product: dict, min_threshold: Optional[int] = None):
         ocr_url = generate_json_ocr_url(barcode, image_id=image_id)
 
         try:
-            ocr_result = OCRResult.from_url(ocr_url)
+            ocr_result = typing.cast(OCRResult, OCRResult.from_url(ocr_url))
         except openfoodfacts.ocr.OCRResultGenerationException as e:
             logger.info(f"Error generating OCR result: {e}")
             continue
 
-        if not ocr_result.full_text_annotation:
+        if ocr_result.full_text_annotation is None:
             continue
 
         words = [

diff --git a/...ht/dataset-generation/2_create_project.py → nutrisight/dataset/2_create_project.py b/...ht/dataset-generation/2_create_project.py → nutrisight/dataset/2_create_project.py
@@ -7,7 +7,7 @@
 
 logger = get_logger()
 
-LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
+LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
 
 
 def create_project(
@@ -16,8 +16,9 @@ def create_project(
         file_okay=True, dir_okay=False, exists=True
     ),
     title: str = typer.Option(help="Project title"),
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
 ):
-    ls = LabelStudio(base_url=LABEL_STUDIO_URL, api_key=api_key)
+    ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
     label_config = Path(label_config_path).read_text()
 
     project = ls.projects.create(title=title, label_config=label_config)

diff --git a/...ht/dataset-generation/3_upload_dataset.py → nutrisight/dataset/3_upload_dataset.py b/...ht/dataset-generation/3_upload_dataset.py → nutrisight/dataset/3_upload_dataset.py
@@ -7,14 +7,15 @@
 from label_studio_sdk.client import LabelStudio
 from more_itertools import chunked
 
-LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
+LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
 
 
 def upload_dataset(
     api_key: Annotated[str, typer.Argument(envvar="LABEL_STUDIO_API_KEY")],
     project_id: int = 42,
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
 ):
-    ls = LabelStudio(base_url=LABEL_STUDIO_URL, api_key=api_key)
+    ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
 
     with Path("./dataset.jsonl").open() as f:
         for batch in chunked(tqdm.tqdm(map(json.loads, f), desc="tasks"), 25):

diff --git a/nutrisight/dataset-generation/4_add_batch.py → nutrisight/dataset/4_add_batch.py b/nutrisight/dataset-generation/4_add_batch.py → nutrisight/dataset/4_add_batch.py
@@ -3,28 +3,28 @@
 
 import tqdm
 import typer
-from label_studio_sdk import Client
 from label_studio_sdk.client import LabelStudio
 from openfoodfacts.utils import get_logger
 
 logger = get_logger()
 
-LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
+LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
 
 
 def assign_batch_to_samples(
     api_key: Annotated[str, typer.Argument(envvar="LABEL_STUDIO_API_KEY")],
     project_id: int = 42,
     start_batch_id: int = 1,
     batch_size: int = 100,
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
 ):
     """Assign a batch (an integer starting from 1) to samples in the label
     studio project.
 
     All samples are fetched, sorted randomly and a unique batch number is
     assigned to each sample.
     """
-    ls = LabelStudio(base_url=LABEL_STUDIO_URL, api_key=api_key)
+    ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
     tasks = list(ls.tasks.list(project=project_id, include="data,id", page_size=500))
     logger.info(f"Found {len(tasks)} tasks in the project")
     # get tasks without batch ID

diff --git a/...aset-generation/5_update_checked_field.py → nutrisight/dataset/5_update_checked_field.py b/...aset-generation/5_update_checked_field.py → nutrisight/dataset/5_update_checked_field.py
@@ -6,15 +6,33 @@
 
 logger = get_logger()
 
-LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
+LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
 
 
 def update_checked_field(
     api_key: Annotated[str, typer.Argument(envvar="LABEL_STUDIO_API_KEY")],
     project_id: int = 42,
     view_id: int = 64,
-):
-    ls = Client(url=LABEL_STUDIO_URL, api_key=api_key)
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
+) -> None:
+    """The `checked` field is a boolean field that indicates if the task has
+    been checked by the annotator. When the second annotator marks the task
+    as checked (this information is saved in the annotation result), a Google
+    Cloud Function is triggered to update the task in the Label Studio project
+    so that the `data.checked` field is set to True. This allows us to filter
+    out the tasks that have not been checked yet in the Label Studio UI.
+
+    This script is used to update the `data.checked` field when the Google
+    Cloud Function failed for some reason to update the task in the Label
+    Studio project.
+
+    Args:
+        api_key (str): The API key for the Label Studio project.
+        project_id (int): The ID of the Label Studio project.
+        view_id (int): The ID of the Label Studio view.
+        label_studio_url (str): The URL of the Label Studio instance.
+    """
+    ls = Client(url=label_studio_url, api_key=api_key)
     ls.check_connection()
 
     project = ls.get_project(project_id)

diff --git a/...ight/dataset-generation/6_push_dataset.py → nutrisight/dataset/6_push_dataset.py b/...ight/dataset-generation/6_push_dataset.py → nutrisight/dataset/6_push_dataset.py
diff --git a/...ight/dataset-generation/7_check_errors.py → nutrisight/dataset/7_check_errors.py b/...ight/dataset-generation/7_check_errors.py → nutrisight/dataset/7_check_errors.py
@@ -10,6 +10,8 @@
 
 logger = get_logger()
 
+LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
+
 
 def is_bounding_box_modified(word_annotation: dict, word_prediction: dict):
     if word_annotation.keys() != word_prediction.keys():
@@ -146,9 +148,7 @@ def check_errors(
     # Label Studio instance
     project_id: Annotated[int, typer.Option(..., help="Label Studio project ID")] = 42,
     batch_ids: Optional[list[int]] = None,
-    label_studio_url: Annotated[
-        str, typer.Option()
-    ] = "https://annotate.openfoodfacts.org",
+    label_studio_url: Annotated[str, typer.Option()] = LABEL_STUDIO_DEFAULT_URL,
 ):
     logger.info("Fetching tasks from Label Studio, project %s", project_id)
     tasks = get_tasks(label_studio_url, api_key, project_id, batch_ids)
@@ -158,7 +158,4 @@ def check_errors(
 
 
 if __name__ == "__main__":
-    # typer.run(check_errors)
-    import os
-
-    check_errors(os.environ["LABEL_STUDIO_API_KEY"])
+    typer.run(check_errors)
diff --git a/...ght/dataset-generation/8_update_errors.py → nutrisight/dataset/8_update_errors.py b/...ght/dataset-generation/8_update_errors.py → nutrisight/dataset/8_update_errors.py
@@ -7,15 +7,16 @@
 
 logger = get_logger(level="DEBUG")
 
-LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
+LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
 
 
 def update_checked_field(
     api_key: Annotated[str, typer.Argument(envvar="LABEL_STUDIO_API_KEY")],
     project_id: int = 42,
     view_id: int = 62,
+    label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
 ):
-    ls = Client(url=LABEL_STUDIO_URL, api_key=api_key)
+    ls = Client(url=label_studio_url, api_key=api_key)
     ls.check_connection()
 
     project = ls.get_project(project_id)

diff --git a/...dataset-generation/9_add_checked_field.py → nutrisight/dataset/9_add_checked_field.py b/...dataset-generation/9_add_checked_field.py → nutrisight/dataset/9_add_checked_field.py
@@ -7,7 +7,7 @@
 
 logger = get_logger(level="DEBUG")
 
-LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
+LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"
 
 
 def add_checked_field(

diff --git a/...neration/cloud_function/cloud_function.py → .../dataset/cloud_function/cloud_function.py b/...neration/cloud_function/cloud_function.py → .../dataset/cloud_function/cloud_function.py
diff --git a/...eneration/cloud_function/requirements.txt → ...t/dataset/cloud_function/requirements.txt b/...eneration/cloud_function/requirements.txt → ...t/dataset/cloud_function/requirements.txt
diff --git a/...sight/dataset-generation/label_config.xml → nutrisight/dataset/label_config.xml b/...sight/dataset-generation/label_config.xml → nutrisight/dataset/label_config.xml
diff --git a/...sight/dataset-generation/requirements.txt → nutrisight/dataset/requirements.txt b/...sight/dataset-generation/requirements.txt → nutrisight/dataset/requirements.txt
-Original file line number
+Diff line change
@@ Expand Up / @@ -41,7 +41,7 @@ def run( @@
             if "prepared-values" in issues:
                 print(
-                    f"https://annotate.openfoodfacts.org/projects/42/data?tab=61&task={task.id}"
+                    f"{LABEL_STUDIO_URL}/projects/{project_id}/data?tab={view_id}&task={task.id}"
                 )
@@ Expand Down @@