From f861c843ce2fefef4473559aad33a9d149996fde Mon Sep 17 00:00:00 2001 From: grant-Kolena <146848587+grant-Kolena@users.noreply.github.com> Date: Wed, 3 Jan 2024 11:43:35 -0800 Subject: [PATCH] move dataset out of experimental (#384) * move dataset out of experimental * change split type for experimental tests * change split type for experimental tests * new cache key * only run experimental once * moving test out * test fix * remove unused function * added dataset pytest group * fix test * stand alone dataset test --- .circleci/config.yml | 2 ++ .circleci/continue_config.yml | 25 ++++++++++++++--- .../question_answering/register_dataset.py | 2 +- kolena/_experimental/dataset/__init__.py | 27 ------------------- kolena/dataset/__init__.py | 14 +++++++--- kolena/{_experimental => }/dataset/common.py | 0 .../_dataset.py => dataset/dataset.py} | 8 +++--- .../_evaluation.py => dataset/evaluation.py} | 26 +++++++----------- .../{_experimental => }/dataset/__init__.py | 0 .../dataset/test_dataset.py | 6 ++--- .../dataset/test_evaluation.py | 10 +++---- .../{_experimental => }/dataset/__init__.py | 0 .../unit/{_experimental => }/dataset/data.py | 0 .../dataset/test_common.py | 4 +-- .../dataset/test_dataset.py | 22 +++++++-------- 15 files changed, 69 insertions(+), 77 deletions(-) delete mode 100644 kolena/_experimental/dataset/__init__.py rename kolena/{_experimental => }/dataset/common.py (100%) rename kolena/{_experimental/dataset/_dataset.py => dataset/dataset.py} (97%) rename kolena/{_experimental/dataset/_evaluation.py => dataset/evaluation.py} (86%) rename tests/integration/{_experimental => }/dataset/__init__.py (100%) rename tests/integration/{_experimental => }/dataset/test_dataset.py (98%) rename tests/integration/{_experimental => }/dataset/test_evaluation.py (98%) rename tests/unit/{_experimental => }/dataset/__init__.py (100%) rename tests/unit/{_experimental => }/dataset/data.py (100%) rename tests/unit/{_experimental => }/dataset/test_common.py (96%) rename tests/unit/{_experimental => }/dataset/test_dataset.py (95%) diff --git a/.circleci/config.yml b/.circleci/config.yml index f15c15dfd..948d150e8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -28,6 +28,8 @@ jobs: tests/integration/fr/.* fr true kolena/workflow/.* workflow true tests/integration/workflow/.* workflow true + kolena/dataset/.* dataset true + tests/integration/dataset/.* dataset true kolena/_(api|utils)/.* all true kolena/[^/]*.py all true tests/integration/[^/]*.py all true diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml index 771de76ad..5a4a875ac 100644 --- a/.circleci/continue_config.yml +++ b/.circleci/continue_config.yml @@ -16,6 +16,9 @@ parameters: workflow: type: boolean default: false + dataset: + type: boolean + default: false misc: type: boolean default: false @@ -219,9 +222,13 @@ jobs: export KOLENA_TOKEN=${!token} - when: condition: - equal: - - "none" - - << parameters.extras >> + and: + - equal: + - "none" + - << parameters.extras >> + - not_equal: + - _experimental + - << parameters.pytest-group >> steps: - run: name: Run << parameters.pytest-group >> integration tests @@ -341,11 +348,21 @@ workflows: matrix: parameters: python-version: [ "3.9.18" ] - extras: [ "none", "metrics" ] + extras: [ "metrics" ] pytest-group: _experimental enabled: true requires: - ci-base-<< matrix.python-version >>-<< matrix.extras >> + - integration-test: + name: integration-test-dataset-<< matrix.python-version >>-<< matrix.extras >> + matrix: + parameters: + python-version: [ "3.9.18" ] + extras: [ "none" ] + pytest-group: dataset + enabled: true + requires: + - ci-base-<< matrix.python-version >>-<< matrix.extras >> - integration-test: name: integration-test-misc-<< matrix.python-version >>-<< matrix.extras >> matrix: diff --git a/examples/dataset/question_answering/question_answering/register_dataset.py b/examples/dataset/question_answering/question_answering/register_dataset.py index 6d11cb23f..c3e5031e1 100644 --- a/examples/dataset/question_answering/question_answering/register_dataset.py +++ b/examples/dataset/question_answering/question_answering/register_dataset.py @@ -19,7 +19,7 @@ from question_answering.constants import TRUTHFULQA import kolena -from kolena._experimental.dataset import register_dataset +from kolena.dataset import register_dataset from kolena.workflow.io import dataframe_from_csv DATASETS = { diff --git a/kolena/_experimental/dataset/__init__.py b/kolena/_experimental/dataset/__init__.py deleted file mode 100644 index c85ffe360..000000000 --- a/kolena/_experimental/dataset/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright 2021-2024 Kolena Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# noreorder -from kolena._experimental.dataset._dataset import fetch_dataset_history -from kolena._experimental.dataset._dataset import fetch_dataset -from kolena._experimental.dataset._dataset import register_dataset -from kolena._experimental.dataset._evaluation import fetch_results -from kolena._experimental.dataset._evaluation import test - -__all__ = [ - "register_dataset", - "fetch_dataset_history", - "fetch_dataset", - "fetch_results", - "test", -] diff --git a/kolena/dataset/__init__.py b/kolena/dataset/__init__.py index c72fdb728..00916112e 100644 --- a/kolena/dataset/__init__.py +++ b/kolena/dataset/__init__.py @@ -11,12 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - # noreorder -from kolena.workflow import annotation -from kolena.workflow import asset +from kolena.dataset.dataset import fetch_dataset_history +from kolena.dataset.dataset import fetch_dataset +from kolena.dataset.dataset import register_dataset +from kolena.dataset.evaluation import fetch_results +from kolena.dataset.evaluation import test +from kolena.workflow import annotation, asset __all__ = [ + "register_dataset", + "fetch_dataset_history", + "fetch_dataset", + "fetch_results", + "test", "annotation", "asset", ] diff --git a/kolena/_experimental/dataset/common.py b/kolena/dataset/common.py similarity index 100% rename from kolena/_experimental/dataset/common.py rename to kolena/dataset/common.py diff --git a/kolena/_experimental/dataset/_dataset.py b/kolena/dataset/dataset.py similarity index 97% rename from kolena/_experimental/dataset/_dataset.py rename to kolena/dataset/dataset.py index cdca5ae5d..152b8f661 100644 --- a/kolena/_experimental/dataset/_dataset.py +++ b/kolena/dataset/dataset.py @@ -33,10 +33,6 @@ from kolena._api.v2.dataset import LoadDatasetByNameRequest from kolena._api.v2.dataset import Path from kolena._api.v2.dataset import RegisterRequest -from kolena._experimental.dataset.common import COL_DATAPOINT -from kolena._experimental.dataset.common import COL_DATAPOINT_ID_OBJECT -from kolena._experimental.dataset.common import validate_batch_size -from kolena._experimental.dataset.common import validate_dataframe_ids from kolena._utils import krequests_v2 as krequests from kolena._utils import log from kolena._utils.batched_load import _BatchedLoader @@ -46,6 +42,10 @@ from kolena._utils.endpoints import get_dataset_url from kolena._utils.serde import from_dict from kolena._utils.state import API_V2 +from kolena.dataset.common import COL_DATAPOINT +from kolena.dataset.common import COL_DATAPOINT_ID_OBJECT +from kolena.dataset.common import validate_batch_size +from kolena.dataset.common import validate_dataframe_ids from kolena.errors import InputValidationError from kolena.workflow._datatypes import _deserialize_dataobject from kolena.workflow._datatypes import _serialize_dataobject diff --git a/kolena/_experimental/dataset/_evaluation.py b/kolena/dataset/evaluation.py similarity index 86% rename from kolena/_experimental/dataset/_evaluation.py rename to kolena/dataset/evaluation.py index 2bb127361..461edbf0b 100644 --- a/kolena/_experimental/dataset/_evaluation.py +++ b/kolena/dataset/evaluation.py @@ -26,16 +26,6 @@ from kolena._api.v2.model import LoadResultsRequest from kolena._api.v2.model import Path from kolena._api.v2.model import UploadResultsRequest -from kolena._experimental.dataset._dataset import _iter_dataset_raw -from kolena._experimental.dataset._dataset import _to_deserialized_dataframe -from kolena._experimental.dataset._dataset import _to_serialized_dataframe -from kolena._experimental.dataset._dataset import load_dataset -from kolena._experimental.dataset.common import COL_DATAPOINT -from kolena._experimental.dataset.common import COL_DATAPOINT_ID_OBJECT -from kolena._experimental.dataset.common import COL_EVAL_CONFIG -from kolena._experimental.dataset.common import COL_RESULT -from kolena._experimental.dataset.common import validate_batch_size -from kolena._experimental.dataset.common import validate_dataframe_ids from kolena._utils import krequests_v2 as krequests from kolena._utils import log from kolena._utils.batched_load import _BatchedLoader @@ -43,6 +33,15 @@ from kolena._utils.batched_load import upload_data_frame from kolena._utils.consts import BatchSize from kolena._utils.state import API_V2 +from kolena.dataset.common import COL_DATAPOINT +from kolena.dataset.common import COL_DATAPOINT_ID_OBJECT +from kolena.dataset.common import COL_EVAL_CONFIG +from kolena.dataset.common import COL_RESULT +from kolena.dataset.common import validate_batch_size +from kolena.dataset.common import validate_dataframe_ids +from kolena.dataset.dataset import _to_deserialized_dataframe +from kolena.dataset.dataset import _to_serialized_dataframe +from kolena.dataset.dataset import load_dataset from kolena.errors import IncorrectUsageError from kolena.errors import NotFoundError @@ -50,13 +49,6 @@ TEST_ON_TYPE = Optional[Union[str, List[str]]] -def _fetch_dataset(dataset: str) -> pd.DataFrame: - df_data_batch = list(_iter_dataset_raw(dataset)) - df_datapoints = pd.concat(df_data_batch) if df_data_batch else pd.DataFrame(columns=["id", COL_DATAPOINT]) - df_datapoints.rename(columns={"id": "datapoint_id"}, inplace=True) - return df_datapoints - - def _iter_result_raw(dataset: str, model: str, batch_size: int) -> Iterator[pd.DataFrame]: validate_batch_size(batch_size) init_request = LoadResultsRequest(dataset=dataset, model=model, batch_size=batch_size) diff --git a/tests/integration/_experimental/dataset/__init__.py b/tests/integration/dataset/__init__.py similarity index 100% rename from tests/integration/_experimental/dataset/__init__.py rename to tests/integration/dataset/__init__.py diff --git a/tests/integration/_experimental/dataset/test_dataset.py b/tests/integration/dataset/test_dataset.py similarity index 98% rename from tests/integration/_experimental/dataset/test_dataset.py rename to tests/integration/dataset/test_dataset.py index a65f689af..17f265d2d 100644 --- a/tests/integration/_experimental/dataset/test_dataset.py +++ b/tests/integration/dataset/test_dataset.py @@ -21,9 +21,9 @@ from pandas.testing import assert_frame_equal from kolena._api.v2.dataset import CommitData -from kolena._experimental.dataset import fetch_dataset -from kolena._experimental.dataset import fetch_dataset_history -from kolena._experimental.dataset import register_dataset +from kolena.dataset import fetch_dataset +from kolena.dataset import fetch_dataset_history +from kolena.dataset import register_dataset from kolena.errors import NotFoundError from kolena.workflow.annotation import BoundingBox from kolena.workflow.annotation import LabeledBoundingBox diff --git a/tests/integration/_experimental/dataset/test_evaluation.py b/tests/integration/dataset/test_evaluation.py similarity index 98% rename from tests/integration/_experimental/dataset/test_evaluation.py rename to tests/integration/dataset/test_evaluation.py index 9138292f7..465b4da42 100644 --- a/tests/integration/_experimental/dataset/test_evaluation.py +++ b/tests/integration/dataset/test_evaluation.py @@ -20,13 +20,13 @@ import pytest from pandas.testing import assert_frame_equal -from kolena._experimental.dataset import fetch_dataset -from kolena._experimental.dataset import fetch_results -from kolena._experimental.dataset import register_dataset -from kolena._experimental.dataset import test +from kolena.dataset import fetch_dataset +from kolena.dataset import fetch_results +from kolena.dataset import register_dataset +from kolena.dataset import test from kolena.errors import IncorrectUsageError from kolena.errors import NotFoundError -from tests.integration._experimental.dataset.test_dataset import batch_iterator +from tests.integration.dataset.test_dataset import batch_iterator from tests.integration.helper import fake_locator from tests.integration.helper import with_test_prefix diff --git a/tests/unit/_experimental/dataset/__init__.py b/tests/unit/dataset/__init__.py similarity index 100% rename from tests/unit/_experimental/dataset/__init__.py rename to tests/unit/dataset/__init__.py diff --git a/tests/unit/_experimental/dataset/data.py b/tests/unit/dataset/data.py similarity index 100% rename from tests/unit/_experimental/dataset/data.py rename to tests/unit/dataset/data.py diff --git a/tests/unit/_experimental/dataset/test_common.py b/tests/unit/dataset/test_common.py similarity index 96% rename from tests/unit/_experimental/dataset/test_common.py rename to tests/unit/dataset/test_common.py index f93c16bbb..d8956b637 100644 --- a/tests/unit/_experimental/dataset/test_common.py +++ b/tests/unit/dataset/test_common.py @@ -16,8 +16,8 @@ import pandas as pd import pytest -from kolena._experimental.dataset.common import validate_dataframe_ids -from kolena._experimental.dataset.common import validate_id_fields +from kolena.dataset.common import validate_dataframe_ids +from kolena.dataset.common import validate_id_fields from kolena.errors import InputValidationError diff --git a/tests/unit/_experimental/dataset/test_dataset.py b/tests/unit/dataset/test_dataset.py similarity index 95% rename from tests/unit/_experimental/dataset/test_dataset.py rename to tests/unit/dataset/test_dataset.py index 76a595e2f..6402394f0 100644 --- a/tests/unit/_experimental/dataset/test_dataset.py +++ b/tests/unit/dataset/test_dataset.py @@ -22,17 +22,17 @@ from .data import a_text from .data import b_text from kolena._api.v2.dataset import EntityData -from kolena._experimental.dataset._dataset import _add_datatype -from kolena._experimental.dataset._dataset import _flatten_composite -from kolena._experimental.dataset._dataset import _infer_datatype -from kolena._experimental.dataset._dataset import _infer_datatype_value -from kolena._experimental.dataset._dataset import _infer_id_fields -from kolena._experimental.dataset._dataset import _to_deserialized_dataframe -from kolena._experimental.dataset._dataset import _to_serialized_dataframe -from kolena._experimental.dataset._dataset import DatapointType -from kolena._experimental.dataset._dataset import resolve_id_fields -from kolena._experimental.dataset.common import COL_DATAPOINT -from kolena._experimental.dataset.common import COL_RESULT +from kolena.dataset.common import COL_DATAPOINT +from kolena.dataset.common import COL_RESULT +from kolena.dataset.dataset import _add_datatype +from kolena.dataset.dataset import _flatten_composite +from kolena.dataset.dataset import _infer_datatype +from kolena.dataset.dataset import _infer_datatype_value +from kolena.dataset.dataset import _infer_id_fields +from kolena.dataset.dataset import _to_deserialized_dataframe +from kolena.dataset.dataset import _to_serialized_dataframe +from kolena.dataset.dataset import DatapointType +from kolena.dataset.dataset import resolve_id_fields from kolena.errors import InputValidationError from kolena.workflow._datatypes import DATA_TYPE_FIELD from kolena.workflow.annotation import BoundingBox