Skip to content

Commit

Permalink
feat: add nutrisight improvements (#362)
Browse files Browse the repository at this point in the history
* fix(nutrisight): add minor fixes on some nutrisight scripts

* fix(nutrisight): rename folder 'dataset-generation' into 'dataset'

* docs(nutrisight): improve README.md

* fix(nutrisight): improve dataset scripts
  • Loading branch information
raphael0202 authored Nov 21, 2024
1 parent d0f977e commit fb59153
Show file tree
Hide file tree
Showing 17 changed files with 60 additions and 25 deletions.
11 changes: 11 additions & 0 deletions nutrisight/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# NutriSight

## Dataset management

Every script related to dataset management and generation is located in the `dataset` directory.

## Export

To export the model to ONNX format, create a virtualenv with the following dependencies:
Expand All @@ -13,3 +17,10 @@ Then run the following command:
```bash
optimum-cli export onnx -m openfoodfacts/nutrition-extractor --opset 19 --task token-classification nutrition-extractor-onnx-19
```

## Thanks to our sponsors!

The NutriSight project has indirectly received funding from the European Union’s Horizon Europe research and innovation action programme, via the DRG4FOOD – Open Call #1 issued and executed under the DRG4FOOD project (Grant Agreement no. 101086523).

<img src="./assets/DRG4FOOD_Logo_Icon+Type-Black.png" alt="Funded by DRG4FOOD" title="Funded by DRG4FOOD" height="100" />
<img src="./assets/EN_FundedbytheEU_RGB_POS.png" alt="Funded by the EU" title="Funded by the EU" height="100" />
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added nutrisight/assets/EN_FundedbytheEU_RGB_POS.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def run(

if "prepared-values" in issues:
print(
f"https://annotate.openfoodfacts.org/projects/42/data?tab=61&task={task.id}"
f"{LABEL_STUDIO_URL}/projects/{project_id}/data?tab={view_id}&task={task.id}"
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import random
import re
import string
import typing
from collections import defaultdict
from typing import Iterator, Optional

Expand All @@ -17,12 +18,17 @@

logger = get_logger()

# We use Redis to save the state of the dataset generation
client = redis.Redis(host="localhost", port=6379, db=0)


def create_annotation_results(
word_text: str, pre_annotation: str, vertices: list[tuple[int, int]], width, height
):
word_text: str,
pre_annotation: str,
vertices: list[tuple[int, int]],
width: int,
height: int,
) -> list[dict]:
x_min = min(v[0] for v in vertices) * 100
x_max = max(v[0] for v in vertices) * 100
y_min = min(v[1] for v in vertices) * 100
Expand Down Expand Up @@ -196,12 +202,12 @@ def format_sample(product: dict, min_threshold: Optional[int] = None):
ocr_url = generate_json_ocr_url(barcode, image_id=image_id)

try:
ocr_result = OCRResult.from_url(ocr_url)
ocr_result = typing.cast(OCRResult, OCRResult.from_url(ocr_url))
except openfoodfacts.ocr.OCRResultGenerationException as e:
logger.info(f"Error generating OCR result: {e}")
continue

if not ocr_result.full_text_annotation:
if ocr_result.full_text_annotation is None:
continue

words = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

logger = get_logger()

LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"


def create_project(
Expand All @@ -16,8 +16,9 @@ def create_project(
file_okay=True, dir_okay=False, exists=True
),
title: str = typer.Option(help="Project title"),
label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
):
ls = LabelStudio(base_url=LABEL_STUDIO_URL, api_key=api_key)
ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
label_config = Path(label_config_path).read_text()

project = ls.projects.create(title=title, label_config=label_config)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
from label_studio_sdk.client import LabelStudio
from more_itertools import chunked

LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"


def upload_dataset(
api_key: Annotated[str, typer.Argument(envvar="LABEL_STUDIO_API_KEY")],
project_id: int = 42,
label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
):
ls = LabelStudio(base_url=LABEL_STUDIO_URL, api_key=api_key)
ls = LabelStudio(base_url=label_studio_url, api_key=api_key)

with Path("./dataset.jsonl").open() as f:
for batch in chunked(tqdm.tqdm(map(json.loads, f), desc="tasks"), 25):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,28 @@

import tqdm
import typer
from label_studio_sdk import Client
from label_studio_sdk.client import LabelStudio
from openfoodfacts.utils import get_logger

logger = get_logger()

LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"


def assign_batch_to_samples(
api_key: Annotated[str, typer.Argument(envvar="LABEL_STUDIO_API_KEY")],
project_id: int = 42,
start_batch_id: int = 1,
batch_size: int = 100,
label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
):
"""Assign a batch (an integer starting from 1) to samples in the label
studio project.
All samples are fetched, sorted randomly and a unique batch number is
assigned to each sample.
"""
ls = LabelStudio(base_url=LABEL_STUDIO_URL, api_key=api_key)
ls = LabelStudio(base_url=label_studio_url, api_key=api_key)
tasks = list(ls.tasks.list(project=project_id, include="data,id", page_size=500))
logger.info(f"Found {len(tasks)} tasks in the project")
# get tasks without batch ID
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,33 @@

logger = get_logger()

LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"


def update_checked_field(
api_key: Annotated[str, typer.Argument(envvar="LABEL_STUDIO_API_KEY")],
project_id: int = 42,
view_id: int = 64,
):
ls = Client(url=LABEL_STUDIO_URL, api_key=api_key)
label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
) -> None:
"""The `checked` field is a boolean field that indicates if the task has
been checked by the annotator. When the second annotator marks the task
as checked (this information is saved in the annotation result), a Google
Cloud Function is triggered to update the task in the Label Studio project
so that the `data.checked` field is set to True. This allows us to filter
out the tasks that have not been checked yet in the Label Studio UI.
This script is used to update the `data.checked` field when the Google
Cloud Function failed for some reason to update the task in the Label
Studio project.
Args:
api_key (str): The API key for the Label Studio project.
project_id (int): The ID of the Label Studio project.
view_id (int): The ID of the Label Studio view.
label_studio_url (str): The URL of the Label Studio instance.
"""
ls = Client(url=label_studio_url, api_key=api_key)
ls.check_connection()

project = ls.get_project(project_id)
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

logger = get_logger()

LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"


def is_bounding_box_modified(word_annotation: dict, word_prediction: dict):
if word_annotation.keys() != word_prediction.keys():
Expand Down Expand Up @@ -146,9 +148,7 @@ def check_errors(
# Label Studio instance
project_id: Annotated[int, typer.Option(..., help="Label Studio project ID")] = 42,
batch_ids: Optional[list[int]] = None,
label_studio_url: Annotated[
str, typer.Option()
] = "https://annotate.openfoodfacts.org",
label_studio_url: Annotated[str, typer.Option()] = LABEL_STUDIO_DEFAULT_URL,
):
logger.info("Fetching tasks from Label Studio, project %s", project_id)
tasks = get_tasks(label_studio_url, api_key, project_id, batch_ids)
Expand All @@ -158,7 +158,4 @@ def check_errors(


if __name__ == "__main__":
# typer.run(check_errors)
import os

check_errors(os.environ["LABEL_STUDIO_API_KEY"])
typer.run(check_errors)
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,16 @@

logger = get_logger(level="DEBUG")

LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"


def update_checked_field(
api_key: Annotated[str, typer.Argument(envvar="LABEL_STUDIO_API_KEY")],
project_id: int = 42,
view_id: int = 62,
label_studio_url: str = LABEL_STUDIO_DEFAULT_URL,
):
ls = Client(url=LABEL_STUDIO_URL, api_key=api_key)
ls = Client(url=label_studio_url, api_key=api_key)
ls.check_connection()

project = ls.get_project(project_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

logger = get_logger(level="DEBUG")

LABEL_STUDIO_URL = "https://annotate.openfoodfacts.org"
LABEL_STUDIO_DEFAULT_URL = "https://annotate.openfoodfacts.org"


def add_checked_field(
Expand Down
File renamed without changes.
File renamed without changes.

0 comments on commit fb59153

Please sign in to comment.