From 90ccc2ff5f6a757f58269cf919887cf21685c280 Mon Sep 17 00:00:00 2001 From: Yoohee Choi <17771952+y27choi@users.noreply.github.com> Date: Wed, 20 Dec 2023 11:55:28 -0500 Subject: [PATCH] Moving V1 example scripts to example/datasets folder (#369) * Moving V1 example scripts to example/datasets folder * Separate mypy pre-commit check for examples/datasets folder --- .pre-commit-config.yaml | 10 +++++-- .../datasets/question_answering/README.md | 26 +++++++++++++++++++ .../question_answering/pyproject.toml | 20 ++++++++++++++ .../question_answering/__init__.py | 13 ++++++++++ .../question_answering/constants.py | 4 +++ .../question_answering/register_dataset.py | 0 examples/question_answering/pyproject.toml | 2 +- 7 files changed, 72 insertions(+), 3 deletions(-) create mode 100644 examples/datasets/question_answering/README.md create mode 100644 examples/datasets/question_answering/pyproject.toml create mode 100644 examples/datasets/question_answering/question_answering/__init__.py rename examples/{ => datasets}/question_answering/question_answering/constants.py (91%) rename examples/{ => datasets}/question_answering/question_answering/register_dataset.py (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2bab91437..25dd438ff 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -47,8 +47,14 @@ repos: - repo: https://github.com/pre-commit/mirrors-mypy rev: v0.910 hooks: - - id: mypy - additional_dependencies: [types-all, "pydantic<2.0"] + - id: mypy + name: mypy-default + exclude: ^examples/datasets + additional_dependencies: [ types-all, "pydantic<2.0" ] + - id: mypy + name: mypy-examples-dataset + files: ^examples/datasets + additional_dependencies: [ types-all, "pydantic<2.0" ] - repo: meta hooks: - id: check-hooks-apply diff --git a/examples/datasets/question_answering/README.md b/examples/datasets/question_answering/README.md new file mode 100644 index 000000000..3fc0f6fb9 --- /dev/null +++ 
b/examples/datasets/question_answering/README.md @@ -0,0 +1,26 @@ +# Example Integration: Question Answering + +This example integration uses the [TruthfulQA (open-domain)](https://github.com/sylinrl/TruthfulQA) and the +[HaluEval (closed-domain)](https://github.com/RUCAIBox/HaluEval/tree/main/evaluation) datasets and OpenAI's GPT models +to demonstrate the question answering workflow in Kolena. + +## Setup + +This project uses [Poetry](https://python-poetry.org/) for packaging and Python dependency management. To get started, +install project dependencies from [`pyproject.toml`](./pyproject.toml) by running: + +```shell +poetry update && poetry install +``` + +## Usage + +The data for this example integration lives in the publicly accessible S3 bucket `s3://kolena-public-datasets`. + +First, ensure that the `KOLENA_TOKEN` environment variable is populated in your environment. See our +[initialization documentation](https://docs.kolena.io/installing-kolena/#initialization) for details. + +This project defines one script that performs the following operation: + +1. [`register_dataset.py`](question_answering/register_dataset.py) registers both datasets by default. You can also +select the dataset to register by specifying `--datasets`. 
diff --git a/examples/datasets/question_answering/pyproject.toml b/examples/datasets/question_answering/pyproject.toml new file mode 100644 index 000000000..1a299dcbc --- /dev/null +++ b/examples/datasets/question_answering/pyproject.toml @@ -0,0 +1,20 @@ +[tool.poetry] +name = "question_answering" +version = "0.1.0" +description = " Kolena Datasets Example integration for question answering" +authors = ["Kolena Engineering "] +license = "Apache-2.0" + +[tool.poetry.dependencies] +python = ">=3.8,<3.11" +kolena = ">=0.99.0,<1" +s3fs = "^2022.7.1" + +[tool.poetry.group.dev.dependencies] +pre-commit = "^2.17" +pytest = "^7" +pytest-depends = "^1.0.1" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" diff --git a/examples/datasets/question_answering/question_answering/__init__.py b/examples/datasets/question_answering/question_answering/__init__.py new file mode 100644 index 000000000..124812a88 --- /dev/null +++ b/examples/datasets/question_answering/question_answering/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2021-2023 Kolena Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/question_answering/question_answering/constants.py b/examples/datasets/question_answering/question_answering/constants.py similarity index 91% rename from examples/question_answering/question_answering/constants.py rename to examples/datasets/question_answering/question_answering/constants.py index 92b0d04e3..de4ca47b0 100644 --- a/examples/question_answering/question_answering/constants.py +++ b/examples/datasets/question_answering/question_answering/constants.py @@ -15,3 +15,7 @@ BUCKET = "kolena-public-datasets" TRUTHFULQA = "TruthfulQA" HALUEVALQA = "HaluEval-QA" +MODELS = [ + "gpt-3.5-turbo", + "gpt-4-1106-preview", +] diff --git a/examples/question_answering/question_answering/register_dataset.py b/examples/datasets/question_answering/question_answering/register_dataset.py similarity index 100% rename from examples/question_answering/question_answering/register_dataset.py rename to examples/datasets/question_answering/question_answering/register_dataset.py diff --git a/examples/question_answering/pyproject.toml b/examples/question_answering/pyproject.toml index 0226efde5..f7655a3b7 100644 --- a/examples/question_answering/pyproject.toml +++ b/examples/question_answering/pyproject.toml @@ -7,7 +7,7 @@ license = "Apache-2.0" [tool.poetry.dependencies] python = ">=3.8,<3.11" -kolena = ">=0.99.0,<1" +kolena = ">=0.94.0,<1" s3fs = "^2022.7.1" torch = [ {markers = "sys_platform == 'darwin' and platform_machine == 'arm64'", url = "https://download.pytorch.org/whl/cpu/torch-2.0.1-cp39-none-macosx_11_0_arm64.whl"},