From 2aff54a93350eb9ba9408759a142c5f15274b4ec Mon Sep 17 00:00:00 2001 From: Radu Rogojanu Date: Wed, 5 Feb 2025 14:12:57 +0100 Subject: [PATCH 01/58] feat: add support for language categorical encoding and analysis, wip1 --- examples/language_encoding_types.ipynb | 163 ++++++++++++++++++ .../_encoding_types/language/categorical.py | 115 ++++++++++++ mostlyai/engine/analysis.py | 25 ++- mostlyai/engine/domain.py | 3 + 4 files changed, 302 insertions(+), 4 deletions(-) create mode 100644 examples/language_encoding_types.ipynb create mode 100644 mostlyai/engine/_encoding_types/language/categorical.py diff --git a/examples/language_encoding_types.ipynb b/examples/language_encoding_types.ipynb new file mode 100644 index 0000000..35fce18 --- /dev/null +++ b/examples/language_encoding_types.ipynb @@ -0,0 +1,163 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Language Model: flat data, without context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mostly-ai/mostlyai-engine/blob/main/examples/language.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-02-05 13:43:49,105] INFO : TRAIN_LANGUAGE started\n", + "[2025-02-05 13:43:49,115] INFO : numpy=1.26.4, pandas=2.2.3\n", + "[2025-02-05 13:43:49,118] INFO : torch=2.5.1, opacus=1.5.2\n", + "[2025-02-05 13:43:49,123] INFO : transformers=4.46.3, peft=0.11.1\n", + "[2025-02-05 13:43:49,124] INFO : device=device(type='cpu')\n", + "[2025-02-05 13:43:49,126] INFO : bf16_supported=False\n", + "[2025-02-05 13:43:49,126] INFO : use_mixed_precision=False\n", + "[2025-02-05 13:43:49,127] INFO : model_id='MOSTLY_AI/LSTMFromScratch-3m'\n", + "[2025-02-05 13:43:49,127] INFO : enable_flexible_generation=True\n", + "[2025-02-05 13:43:49,128] INFO : max_training_time=12.0s\n", + "[2025-02-05 13:43:49,128] INFO : max_epochs=100.0\n", + "[2025-02-05 13:43:49,129] INFO : with_dp=False\n", + "[2025-02-05 13:43:49,130] INFO : model_state_strategy=\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "0.01s - Debugger warning: It seems that frozen modules are being used, which may\n", + "0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off\n", + "0.00s - to python to disable frozen modules.\n", + "0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-02-05 13:44:01,027] INFO : create training model\n", + "[2025-02-05 13:44:01,031] INFO : model weights not found; change strategy from ModelStateStrategy.reset to RESET\n", + "[2025-02-05 13:44:01,032] INFO : model_state_strategy=\n", + "[2025-02-05 13:44:01,033] INFO : clear existing checkpoint files\n", + "[2025-02-05 13:44:01,035] INFO : start training progress from epoch=0.0, steps=0\n", + "[2025-02-05 13:44:01,233] INFO : model loading time: 0.20s\n", + "[2025-02-05 13:44:01,234] INFO : no_of_model_params=595591\n", + "[2025-02-05 13:44:01,235] INFO : no_of_trainable_model_params=595591\n", + "[2025-02-05 13:44:01,235] INFO : tokenizer=LlamaTokenizerFast(name_or_path='', vocab_size=135, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': ''}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", + "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t3: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "}\n", + "[2025-02-05 13:44:01,992] INFO : trn_cnt=20768, val_cnt=2308\n", + "[2025-02-05 13:44:01,992] INFO : trn_batch_size=64, val_batch_size=32\n", + "[2025-02-05 13:44:01,993] INFO : trn_steps=324, val_steps=72\n", + "[2025-02-05 13:44:01,993] INFO : batch_size=32, gradient_accumulation_steps=2, initial_lr=0.0004\n", + "[2025-02-05 13:44:02,226] INFO : {'epoch': 0.0, 'is_checkpoint': 0, 'steps': 1, 'samples': 64, 'trn_loss': None, 'val_loss': None, 'total_time': 0.2, 'learn_rate': 0.0004, 'dp_eps': None, 'dp_delta': None}\n", + "[2025-02-05 13:44:14,525] INFO : saving model weights, as none were saved so far\n", + "[2025-02-05 13:44:15,834] INFO : {'epoch': 0.46, 'is_checkpoint': 1, 'steps': 148, 'samples': 9472, 'trn_loss': None, 'val_loss': 0.2542, 'total_time': 13.8, 'learn_rate': 0.0004, 'dp_eps': None, 'dp_delta': None}\n", + "[2025-02-05 13:44:15,842] INFO : TRAIN_LANGUAGE finished in 26.74s\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "import pandas as pd\n", + "from mostlyai import engine\n", + "\n", + "# init workspace and logging\n", + "# ws = Path(\"ws-language-flat\")\n", + "ws = Path(\"ws-language-categorical-flat\")\n", + "engine.init_logging()\n", + "\n", + "# # load original data\n", + "# url = \"https://github.com/mostly-ai/public-demo-data/raw/refs/heads/dev/arxiv\"\n", + "# # trn_df = pd.read_parquet(f\"{url}/synthetic-data-papers.parquet\")[['category', 'title']]\n", + "# trn_df = pd.read_parquet(f\"{url}/synthetic-data-papers.parquet\")[['category']]\n", + "\n", + "# execute the engine steps\n", + "# engine.split( # split data as PQT files for `trn` + `val` to `{ws}/OriginalData/tgt-data`\n", + "# workspace_dir=ws,\n", + "# tgt_data=trn_df,\n", + "# # model_type=\"LANGUAGE\",\n", + "# tgt_encoding_types={\"category\": \"LANGUAGE_CATEGORICAL\"},\n", + "# )\n", + "# engine.analyze(workspace_dir=ws) # generate column-level statistics to `{ws}/ModelStore/tgt-stats/stats.json`\n", + "# engine.encode(workspace_dir=ws) # encode training data to `{ws}/OriginalData/encoded-data`\n", + "# engine.train( # train model and store to `{ws}/ModelStore/model-data`\n", + "# workspace_dir=ws,\n", + "# model=\"MOSTLY_AI/LSTMFromScratch-3m\", # use a light-weight LSTM model, trained from scratch (GPU recommended)\n", + "# # model=\"microsoft/phi-1.5\", # or alternatively use a HF-hosted LLM model (GPU required)\n", + "# max_training_time=0.2, # limit TRAIN to 10 minute for demo purposes\n", + "# )\n", + "engine.generate( # use model to generate synthetic samples to `{ws}/SyntheticData`\n", + " workspace_dir=ws, \n", + " sample_size=100,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "syn_tgt_df = pd.read_parquet(ws / \"SyntheticData\") # load synthetic data\n", + "syn_tgt_df.head(5)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/mostlyai/engine/_encoding_types/language/categorical.py b/mostlyai/engine/_encoding_types/language/categorical.py new file mode 100644 index 0000000..7796516 --- /dev/null +++ b/mostlyai/engine/_encoding_types/language/categorical.py @@ -0,0 +1,115 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Categorical encoding for language models. +""" + +import numpy as np +import pandas as pd + +from mostlyai.engine._common import safe_convert_string + +CATEGORICAL_UNKNOWN_TOKEN = "_RARE_" +CATEGORICAL_NULL_TOKEN = "<>" +CATEGORICAL_SUB_COL_SUFFIX = "cat" +CATEGORICAL_ESCAPE_CHAR = "\x01" + + +def safe_language_categorical_escape(values: pd.Series) -> pd.Series: + """Inplace escaping of categorical values""" + reserved_tokens = (CATEGORICAL_UNKNOWN_TOKEN, CATEGORICAL_NULL_TOKEN) + reserved_tokens_replacement_map = {t: CATEGORICAL_ESCAPE_CHAR + t for t in reserved_tokens} + # first, prefix values starting with escape char with another escape char + mask = values.str.startswith(CATEGORICAL_ESCAPE_CHAR, na=False) + values.loc[mask] = values.loc[mask].str.slice_replace(stop=1, repl=CATEGORICAL_ESCAPE_CHAR * 2) + # second, add escape char to all reserved tokens + values = values.replace(reserved_tokens_replacement_map) + return values + + +def safe_language_categorical_unescape(values: pd.Series) -> pd.Series: + """Inplace un-escaping of categorical values""" + # de-prefix all values starting with escape char by removing just the first one + mask = values.str.startswith(CATEGORICAL_ESCAPE_CHAR, na=False) + values.loc[mask] = values.loc[mask].str[1:] + return values + + +def analyze_language_categorical(values: pd.Series, root_keys: pd.Series, _: pd.Series | None = None) -> dict: + # ensure a safe representation of values: 1. string dtype; 2. escape reserved tokens + values = safe_language_categorical_escape(safe_convert_string(values)) + # count distinct root_keys per categorical value for rare-category protection + df = pd.concat([root_keys, values], axis=1) + cnt_values = df.groupby(values.name)[root_keys.name].nunique().to_dict() + stats = {"has_nan": sum(values.isna()) > 0, "cnt_values": cnt_values} + return stats + + +def analyze_reduce_language_categorical(stats_list: list[dict], value_protection: bool = True) -> dict: + # sum up all counts for each categorical value + cnt_values: dict[str, int] = {} + for item in stats_list: + for value, count in item["cnt_values"].items(): + cnt_values[value] = cnt_values.get(value, 0) + count + # create alphabetically sorted list of non-rare categories + known_categories = [k for k in sorted(cnt_values.keys())] + if value_protection: + # stochastic threshold for rare categories + rare_min = 5 + int(3 * np.random.uniform()) + else: + rare_min = 0 + categories = [k for k in known_categories if cnt_values[k] >= rare_min] + no_of_rare_categories = len(known_categories) - len(categories) + # add special token for MISSING categories, if any are present + if any([j["has_nan"] for j in stats_list]): + categories = [CATEGORICAL_NULL_TOKEN] + categories + # add special token for UNKNOWN categories at first position + categories = [CATEGORICAL_UNKNOWN_TOKEN] + categories + stats = { + "no_of_rare_categories": no_of_rare_categories, + "codes": {categories[i]: i for i in range(len(categories))}, + "cardinalities": {CATEGORICAL_SUB_COL_SUFFIX: len(categories)}, + } + return stats + + +def encode_language_categorical(values: pd.Series, stats: dict, _: pd.Series | None = None) -> pd.DataFrame: + # ensure a safe representation of values: 1. string dtype; 2. escape reserved tokens + values = safe_language_categorical_escape(safe_convert_string(values)) + known_categories = [str(k) for k in stats["codes"].keys()] + values = values.copy() + if CATEGORICAL_NULL_TOKEN in known_categories: + values[values.isna()] = CATEGORICAL_NULL_TOKEN + values[~values.isin(known_categories)] = CATEGORICAL_UNKNOWN_TOKEN + + # map categories to their corresponding codes + codes = pd.Series( + pd.Categorical(values, categories=known_categories).codes, + name=CATEGORICAL_SUB_COL_SUFFIX, + index=values.index, + ) + return codes.to_frame() + + +def decode_language_categorical(df_encoded: pd.DataFrame, stats: dict) -> pd.Series: + categories = stats["codes"].keys() + values = pd.Series( + pd.Categorical.from_codes(df_encoded[CATEGORICAL_SUB_COL_SUFFIX], categories=categories), + dtype="string", + ) + values[values == CATEGORICAL_NULL_TOKEN] = pd.NA + # convert escaped values to their original representation + values = safe_language_categorical_unescape(values) + return values diff --git a/mostlyai/engine/analysis.py b/mostlyai/engine/analysis.py index 1094c61..ab8ab3e 100644 --- a/mostlyai/engine/analysis.py +++ b/mostlyai/engine/analysis.py @@ -66,6 +66,10 @@ analyze_text, analyze_reduce_text, ) +from mostlyai.engine._encoding_types.language.categorical import ( + analyze_language_categorical, + analyze_reduce_language_categorical, +) from mostlyai.engine.domain import ModelEncodingType from mostlyai.engine._workspace import ( @@ -222,7 +226,8 @@ def _analyze_partition( ctx_root_keys = ctx_primary_keys.rename("__rkey") # analyze all target columns - with parallel_config("loky", n_jobs=n_jobs): + # with parallel_config("loky", n_jobs=n_jobs): + with parallel_config("loky", n_jobs=1): results = Parallel()( delayed(_analyze_col)( values=tgt_df[column], @@ -263,7 +268,8 @@ def _analyze_partition( # analyze all context columns assert isinstance(ctx_encoding_types, dict) - with parallel_config("loky", n_jobs=n_jobs): + # with parallel_config("loky", n_jobs=n_jobs): + with parallel_config("loky", n_jobs=1): results = Parallel()( delayed(_analyze_col)( values=ctx_df[column], @@ -379,6 +385,12 @@ def get_unique_tables(qualified_column_names: Iterable[str]) -> list[str]: ) elif encoding_type == ModelEncodingType.language_text: stats_col = analyze_reduce_text(stats_list=column_stats_list) + elif encoding_type == ModelEncodingType.language_categorical: + stats_col = analyze_reduce_text(stats_list=column_stats_list) + stats_col |= analyze_reduce_language_categorical( + stats_list=column_stats_list, + value_protection=value_protection, + ) else: raise RuntimeError(f"unknown encoding type {encoding_type}") @@ -405,9 +417,10 @@ def get_argn_processor(mode, is_flat) -> str: if not is_flat: stats_col["seq_len"] = _analyze_reduce_seq_len([column_stats_list[0]["seq_len"]]) - if encoding_type == ModelEncodingType.language_text: + if encoding_type in (ModelEncodingType.language_text, ModelEncodingType.language_categorical): _LOG.info( - f"analyzed column `{column}`: {stats_col['encoding_type']} nchar_max={stats_col['nchar_max']} nchar_avg={stats_col['nchar_avg']}" + # f"analyzed column `{column}`: {stats_col['encoding_type']} nchar_max={stats_col['nchar_max']} nchar_avg={stats_col['nchar_avg']}" + f"analyzed column `{column}`: {stats_col['encoding_type']} " ) else: _LOG.info(f"analyzed column `{column}`: {stats_col['encoding_type']} {stats_col['cardinalities']}") @@ -513,6 +526,10 @@ def _analyze_flat_col( stats = analyze_latlong(values, root_keys, context_keys) elif encoding_type == ModelEncodingType.language_text: stats = analyze_text(values, root_keys, context_keys) + elif encoding_type == ModelEncodingType.language_categorical: + stats = analyze_text(values, root_keys, context_keys) + stats2 = analyze_language_categorical(values, root_keys, context_keys) + stats |= stats2 else: raise RuntimeError(f"unknown encoding type: `{encoding_type}` for `{values.name}`") return stats diff --git a/mostlyai/engine/domain.py b/mostlyai/engine/domain.py index 6afab60..4c944be 100644 --- a/mostlyai/engine/domain.py +++ b/mostlyai/engine/domain.py @@ -61,6 +61,9 @@ class ModelEncodingType(str, Enum): tabular_datetime_relative = "TABULAR_DATETIME_RELATIVE" tabular_lat_long = "TABULAR_LAT_LONG" language_text = "LANGUAGE_TEXT" + # language_numeric = "LANGUAGE_NUMERIC" + language_categorical = "LANGUAGE_CATEGORICAL" + # language_datetime = "LANGUAGE_DATETIME" class ModelStateStrategy(str, Enum): From 1c7d5472ecc48cfabcdd97fa18372702b0abb679 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Wed, 5 Feb 2025 14:59:17 +0100 Subject: [PATCH 02/58] wip --- examples/language_encoding_types.ipynb | 482 ++++++++++++++++--- mostlyai/engine/_language/formatron_utils.py | 17 +- mostlyai/engine/_language/generation.py | 8 +- 3 files changed, 435 insertions(+), 72 deletions(-) diff --git a/examples/language_encoding_types.ipynb b/examples/language_encoding_types.ipynb index 35fce18..4512b4a 100644 --- a/examples/language_encoding_types.ipynb +++ b/examples/language_encoding_types.ipynb @@ -18,64 +18,132 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, + "execution_count": 11, + "metadata": { + "execution": { + "iopub.execute_input": "2025-02-05T13:52:43.092585Z", + "iopub.status.busy": "2025-02-05T13:52:43.092035Z", + "iopub.status.idle": "2025-02-05T13:56:20.259209Z", + "shell.execute_reply": "2025-02-05T13:56:20.258849Z", + "shell.execute_reply.started": "2025-02-05T13:52:43.092556Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[2025-02-05 13:43:49,105] INFO : TRAIN_LANGUAGE started\n", - "[2025-02-05 13:43:49,115] INFO : numpy=1.26.4, pandas=2.2.3\n", - "[2025-02-05 13:43:49,118] INFO : torch=2.5.1, opacus=1.5.2\n", - "[2025-02-05 13:43:49,123] INFO : transformers=4.46.3, peft=0.11.1\n", - "[2025-02-05 13:43:49,124] INFO : device=device(type='cpu')\n", - "[2025-02-05 13:43:49,126] INFO : bf16_supported=False\n", - "[2025-02-05 13:43:49,126] INFO : use_mixed_precision=False\n", - "[2025-02-05 13:43:49,127] INFO : model_id='MOSTLY_AI/LSTMFromScratch-3m'\n", - "[2025-02-05 13:43:49,127] INFO : enable_flexible_generation=True\n", - "[2025-02-05 13:43:49,128] INFO : max_training_time=12.0s\n", - "[2025-02-05 13:43:49,128] INFO : max_epochs=100.0\n", - "[2025-02-05 13:43:49,129] INFO : with_dp=False\n", - "[2025-02-05 13:43:49,130] INFO : model_state_strategy=\n" + "[2025-02-05 14:52:45,343] INFO : SPLIT started\n", + "[2025-02-05 14:52:45,344] INFO : clean `ws-language-categorical-flat/OriginalData/tgt-data`\n", + "[2025-02-05 14:52:45,345] INFO : clean `ws-language-categorical-flat/OriginalData/tgt-meta`\n", + "[2025-02-05 14:52:45,346] INFO : model_type='LANGUAGE'\n", + "[2025-02-05 14:52:45,346] INFO : tgt_encoding_types={'category': 'LANGUAGE_CATEGORICAL', 'title': 'LANGUAGE_TEXT'}\n", + "[2025-02-05 14:52:45,360] INFO : SPLIT finished in 0.02s\n", + "[2025-02-05 14:52:45,361] INFO : ANALYZE started\n", + "[2025-02-05 14:52:45,363] INFO : clean `ws-language-categorical-flat/ModelStore/tgt-stats`\n", + "[2025-02-05 14:52:45,364] INFO : analyzing 2 partitions in parallel\n", + "[2025-02-05 14:52:45,413] INFO : analyzed target partition 000000-trn (20768, 2)\n", + "[2025-02-05 14:52:45,422] INFO : analyzed target partition 000000-val (2308, 2)\n", + "[2025-02-05 14:52:45,422] INFO : combine partition statistics\n", + "[2025-02-05 14:52:45,423] INFO : analyzed column `category`: LANGUAGE_CATEGORICAL \n", + "[2025-02-05 14:52:45,423] INFO : analyzed column `title`: LANGUAGE_TEXT \n", + "[2025-02-05 14:52:45,424] INFO : analyzed 23,076 records: 20,768 training / 2,308 validation\n", + "[2025-02-05 14:52:45,425] INFO : tgt sequence length deciles: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n", + "[2025-02-05 14:52:45,425] INFO : is_sequential: False\n", + "[2025-02-05 14:52:45,425] INFO : write statistics to `ws-language-categorical-flat/ModelStore/tgt-stats/stats.json`\n", + "[2025-02-05 14:52:45,426] INFO : ANALYZE finished in 0.06s\n", + "[2025-02-05 14:52:45,427] INFO : ENCODE_LANGUAGE started\n", + "[2025-02-05 14:52:45,428] INFO : clean `ws-language-categorical-flat/OriginalData/encoded-data`\n", + "[2025-02-05 14:52:45,428] INFO : clean `ws-language-categorical-flat/OriginalData/encoded-data`\n", + "[2025-02-05 14:52:45,429] INFO : clean `ws-language-categorical-flat/OriginalData/encoded-data`\n", + "[2025-02-05 14:52:45,434] INFO : Formatting context columns [] to JSON\n", + "[2025-02-05 14:52:45,443] INFO : Formatting target columns ['category', 'title'] to JSON\n", + "[2025-02-05 14:52:46,161] INFO : token statistics of this partition: \n", + " #pretokens #chars\n", + "min 16.0 50.0\n", + "50% 27.0 115.0\n", + "max 67.0 208.0\n", + "[2025-02-05 14:52:46,172] INFO : encoded partition part.000000-trn.parquet (20768, 2)\n", + "[2025-02-05 14:52:46,177] INFO : Formatting context columns [] to JSON\n", + "[2025-02-05 14:52:46,181] INFO : Formatting target columns ['category', 'title'] to JSON\n", + "[2025-02-05 14:52:46,289] INFO : token statistics of this partition: \n", + " #pretokens #chars\n", + "min 16.0 52.0\n", + "50% 27.0 115.0\n", + "max 52.0 199.0\n", + "[2025-02-05 14:52:46,292] INFO : encoded partition part.000000-val.parquet (2308, 2)\n", + "[2025-02-05 14:52:46,292] INFO : ENCODE_LANGUAGE finished in 0.87s\n", + "[2025-02-05 14:52:46,293] INFO : TRAIN_LANGUAGE started\n", + "[2025-02-05 14:52:46,303] INFO : numpy=1.26.4, pandas=2.2.3\n", + "[2025-02-05 14:52:46,305] INFO : torch=2.5.1, opacus=1.5.2\n", + "[2025-02-05 14:52:46,309] INFO : transformers=4.46.3, peft=0.11.1\n", + "[2025-02-05 14:52:46,309] INFO : device=device(type='cpu')\n", + "[2025-02-05 14:52:46,309] INFO : bf16_supported=False\n", + "[2025-02-05 14:52:46,310] INFO : use_mixed_precision=False\n", + "[2025-02-05 14:52:46,310] INFO : model_id='MOSTLY_AI/LSTMFromScratch-3m'\n", + "[2025-02-05 14:52:46,310] INFO : enable_flexible_generation=True\n", + "[2025-02-05 14:52:46,310] INFO : max_training_time=60s\n", + "[2025-02-05 14:52:46,311] INFO : max_epochs=100.0\n", + "[2025-02-05 14:52:46,311] INFO : with_dp=False\n", + "[2025-02-05 14:52:46,311] INFO : model_state_strategy=\n", + "[2025-02-05 14:52:52,892] INFO : create training model\n", + "[2025-02-05 14:52:52,893] INFO : model_state_strategy=\n", + "[2025-02-05 14:52:52,893] INFO : clear existing checkpoint files\n", + "[2025-02-05 14:52:52,895] INFO : start training progress from epoch=0.0, steps=0\n", + "[2025-02-05 14:52:53,274] INFO : model loading time: 0.38s\n", + "[2025-02-05 14:52:53,274] INFO : no_of_model_params=2668111\n", + "[2025-02-05 14:52:53,274] INFO : no_of_trainable_model_params=2668111\n", + "[2025-02-05 14:52:53,275] INFO : tokenizer=LlamaTokenizerFast(name_or_path='', vocab_size=4175, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': ''}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", + "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "\t3: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", + "}\n", + "[2025-02-05 14:52:54,202] INFO : trn_cnt=20768, val_cnt=2308\n", + "[2025-02-05 14:52:54,202] INFO : trn_batch_size=64, val_batch_size=32\n", + "[2025-02-05 14:52:54,203] INFO : trn_steps=324, val_steps=72\n", + "[2025-02-05 14:52:54,203] INFO : batch_size=32, gradient_accumulation_steps=2, initial_lr=0.0004\n", + "[2025-02-05 14:52:54,677] INFO : {'epoch': 0.0, 'is_checkpoint': 0, 'steps': 1, 'samples': 64, 'trn_loss': None, 'val_loss': None, 'total_time': 0.5, 'learn_rate': 0.0004, 'dp_eps': None, 'dp_delta': None}\n", + "[2025-02-05 14:53:54,325] INFO : saving model weights, as none were saved so far\n", + "[2025-02-05 14:53:57,632] INFO : {'epoch': 0.63, 'is_checkpoint': 1, 'steps': 205, 'samples': 13120, 'trn_loss': None, 'val_loss': 2.9262, 'total_time': 63.4, 'learn_rate': 0.0004, 'dp_eps': None, 'dp_delta': None}\n", + "[2025-02-05 14:53:57,634] INFO : TRAIN_LANGUAGE finished in 71.34s\n", + "[2025-02-05 14:53:57,643] INFO : GENERATE_LANGUAGE started\n", + "[2025-02-05 14:53:57,643] INFO : device=device(type='cpu')\n", + "[2025-02-05 14:53:57,644] INFO : sampling_temperature=1.0, sampling_top_p=1.0\n", + "[2025-02-05 14:53:57,644] INFO : clean `ws-language-categorical-flat/SyntheticData`\n", + "[2025-02-05 14:53:57,649] INFO : seed_data.shape=(10000, 0)\n", + "[2025-02-05 14:53:57,650] INFO : Formatting context columns [] to JSON\n", + "[2025-02-05 14:53:57,674] INFO : token statistics of this partition: \n", + " #pretokens #chars\n", + "min 1.0 3.0\n", + "50% 1.0 3.0\n", + "max 1.0 3.0\n", + "[2025-02-05 14:53:57,675] INFO : max_new_tokens=151\n", + "[2025-02-05 14:53:57,774] INFO : inference engine: HuggingFaceEngine\n", + "[2025-02-05 14:53:57,774] INFO : model loading time: 0.10s\n", + "[2025-02-05 14:53:57,775] INFO : batch_size=128\n", + "[2025-02-05 14:53:57,775] INFO : enforce_json_output=True\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "0.01s - Debugger warning: It seems that frozen modules are being used, which may\n", - "0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off\n", - "0.00s - to python to disable frozen modules.\n", - "0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.\n" + "02/05/2025 14:53:57:WARNING:The following bytes are not present in any token: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247]. This likely indicates that the vocabulary loading code is wrong, the tokenizer is doing some creepy processing or the tokenizer is not UTF-8 compatible. Check the vocabulary loading code and the tokenizer code to fix any bug and/or consider processing the vocab like the tokenizer.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[2025-02-05 13:44:01,027] INFO : create training model\n", - "[2025-02-05 13:44:01,031] INFO : model weights not found; change strategy from ModelStateStrategy.reset to RESET\n", - "[2025-02-05 13:44:01,032] INFO : model_state_strategy=\n", - "[2025-02-05 13:44:01,033] INFO : clear existing checkpoint files\n", - "[2025-02-05 13:44:01,035] INFO : start training progress from epoch=0.0, steps=0\n", - "[2025-02-05 13:44:01,233] INFO : model loading time: 0.20s\n", - "[2025-02-05 13:44:01,234] INFO : no_of_model_params=595591\n", - "[2025-02-05 13:44:01,235] INFO : no_of_trainable_model_params=595591\n", - "[2025-02-05 13:44:01,235] INFO : tokenizer=LlamaTokenizerFast(name_or_path='', vocab_size=135, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': ''}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", - "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", - "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", - "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", - "\t3: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", - "}\n", - "[2025-02-05 13:44:01,992] INFO : trn_cnt=20768, val_cnt=2308\n", - "[2025-02-05 13:44:01,992] INFO : trn_batch_size=64, val_batch_size=32\n", - "[2025-02-05 13:44:01,993] INFO : trn_steps=324, val_steps=72\n", - "[2025-02-05 13:44:01,993] INFO : batch_size=32, gradient_accumulation_steps=2, initial_lr=0.0004\n", - "[2025-02-05 13:44:02,226] INFO : {'epoch': 0.0, 'is_checkpoint': 0, 'steps': 1, 'samples': 64, 'trn_loss': None, 'val_loss': None, 'total_time': 0.2, 'learn_rate': 0.0004, 'dp_eps': None, 'dp_delta': None}\n", - "[2025-02-05 13:44:14,525] INFO : saving model weights, as none were saved so far\n", - "[2025-02-05 13:44:15,834] INFO : {'epoch': 0.46, 'is_checkpoint': 1, 'steps': 148, 'samples': 9472, 'trn_loss': None, 'val_loss': 0.2542, 'total_time': 13.8, 'learn_rate': 0.0004, 'dp_eps': None, 'dp_delta': None}\n", - "[2025-02-05 13:44:15,842] INFO : TRAIN_LANGUAGE finished in 26.74s\n" + "[2025-02-05 14:56:20,197] INFO : num_samples_max_length_limit=0\n", + "[2025-02-05 14:56:20,241] INFO : percentage of invalid values: {'category': '0.00%', 'title': '0.00%'}\n", + "[2025-02-05 14:56:20,241] INFO : decoded (10000, 2) from 79 batches in 1.30s\n", + "[2025-02-05 14:56:20,244] INFO : persisted (10000, 2) to `part.000000.000000.parquet` in 0.00s\n", + "[2025-02-05 14:56:20,246] INFO : total_tokenize_fn_time=0.21s\n", + "[2025-02-05 14:56:20,246] INFO : total_logits_processor_build_time=1.37s\n", + "[2025-02-05 14:56:20,246] INFO : total_generate_fn_time=139.43s\n", + "[2025-02-05 14:56:20,246] INFO : GENERATE_LANGUAGE finished in 142.60s\n" ] } ], @@ -90,45 +158,331 @@ "engine.init_logging()\n", "\n", "# # load original data\n", - "# url = \"https://github.com/mostly-ai/public-demo-data/raw/refs/heads/dev/arxiv\"\n", - "# # trn_df = pd.read_parquet(f\"{url}/synthetic-data-papers.parquet\")[['category', 'title']]\n", + "url = \"https://github.com/mostly-ai/public-demo-data/raw/refs/heads/dev/arxiv\"\n", + "trn_df = pd.read_parquet(f\"{url}/synthetic-data-papers.parquet\")[['category', 'title']]\n", "# trn_df = pd.read_parquet(f\"{url}/synthetic-data-papers.parquet\")[['category']]\n", "\n", "# execute the engine steps\n", - "# engine.split( # split data as PQT files for `trn` + `val` to `{ws}/OriginalData/tgt-data`\n", - "# workspace_dir=ws,\n", - "# tgt_data=trn_df,\n", - "# # model_type=\"LANGUAGE\",\n", - "# tgt_encoding_types={\"category\": \"LANGUAGE_CATEGORICAL\"},\n", - "# )\n", - "# engine.analyze(workspace_dir=ws) # generate column-level statistics to `{ws}/ModelStore/tgt-stats/stats.json`\n", - "# engine.encode(workspace_dir=ws) # encode training data to `{ws}/OriginalData/encoded-data`\n", - "# engine.train( # train model and store to `{ws}/ModelStore/model-data`\n", - "# workspace_dir=ws,\n", - "# model=\"MOSTLY_AI/LSTMFromScratch-3m\", # use a light-weight LSTM model, trained from scratch (GPU recommended)\n", - "# # model=\"microsoft/phi-1.5\", # or alternatively use a HF-hosted LLM model (GPU required)\n", - "# max_training_time=0.2, # limit TRAIN to 10 minute for demo purposes\n", - "# )\n", + "engine.split( # split data as PQT files for `trn` + `val` to `{ws}/OriginalData/tgt-data`\n", + " workspace_dir=ws,\n", + " tgt_data=trn_df,\n", + " # model_type=\"LANGUAGE\",\n", + " tgt_encoding_types={\"category\": \"LANGUAGE_CATEGORICAL\", \"title\": \"LANGUAGE_TEXT\"},\n", + ")\n", + "engine.analyze(workspace_dir=ws) # generate column-level statistics to `{ws}/ModelStore/tgt-stats/stats.json`\n", + "engine.encode(workspace_dir=ws) # encode training data to `{ws}/OriginalData/encoded-data`\n", + "engine.train( # train model and store to `{ws}/ModelStore/model-data`\n", + " workspace_dir=ws,\n", + " model=\"MOSTLY_AI/LSTMFromScratch-3m\", # use a light-weight LSTM model, trained from scratch (GPU recommended)\n", + " # model=\"microsoft/phi-1.5\", # or alternatively use a HF-hosted LLM model (GPU required)\n", + " max_training_time=1, # limit TRAIN to 10 minute for demo purposes\n", + ")\n", "engine.generate( # use model to generate synthetic samples to `{ws}/SyntheticData`\n", " workspace_dir=ws, \n", - " sample_size=100,\n", + " sample_size=10000,\n", ")" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 12, + "metadata": { + "execution": { + "iopub.execute_input": "2025-02-05T13:56:20.260268Z", + "iopub.status.busy": "2025-02-05T13:56:20.260149Z", + "iopub.status.idle": "2025-02-05T13:56:20.269394Z", + "shell.execute_reply": "2025-02-05T13:56:20.268803Z", + "shell.execute_reply.started": "2025-02-05T13:56:20.260257Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'chao-dyn',\n", + " 'cmp-lg',\n", + " 'comp-gas',\n", + " 'cond-mat.other',\n", + " 'cond-mat.quant-gas',\n", + " 'cond-mat.supr-con',\n", + " 'cs.CC',\n", + " 'cs.DL',\n", + " 'cs.FL',\n", + " 'cs.OS',\n", + " 'cs.PL',\n", + " 'cs.SC',\n", + " 'econ.TH',\n", + " 'math.CA',\n", + " 'math.CT',\n", + " 'math.DG',\n", + " 'math.FA',\n", + " 'math.GM',\n", + " 'math.GN',\n", + " 'math.GR',\n", + " 'math.MG',\n", + " 'math.SP',\n", + " 'nlin.AO',\n", + " 'nucl-ex',\n", + " 'nucl-th',\n", + " 'q-bio.CB',\n", + " 'q-bio.OT',\n", + " 'q-bio.SC',\n", + " 'q-fin.EC',\n", + " 'q-fin.MF',\n", + " 'q-fin.PR'}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "syn_tgt_df = pd.read_parquet(ws / \"SyntheticData\") # load synthetic data\n", - "syn_tgt_df.head(5)" + "set(trn_df['category']) - set(syn_tgt_df['category']) " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "execution": { + "iopub.execute_input": "2025-02-05T13:56:20.270196Z", + "iopub.status.busy": "2025-02-05T13:56:20.270014Z", + "iopub.status.idle": "2025-02-05T13:56:20.279656Z", + "shell.execute_reply": "2025-02-05T13:56:20.278913Z", + "shell.execute_reply.started": "2025-02-05T13:56:20.270181Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'_RARE_'}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set(syn_tgt_df['category']) - set(trn_df['category'])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "execution": { + "iopub.execute_input": "2025-02-05T13:56:20.282741Z", + "iopub.status.busy": "2025-02-05T13:56:20.281808Z", + "iopub.status.idle": "2025-02-05T13:56:20.288184Z", + "shell.execute_reply": "2025-02-05T13:56:20.287499Z", + "shell.execute_reply.started": "2025-02-05T13:56:20.282643Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 , \n", + "1 category\n", + "2 : A- for \n", + "3 : \n", + "4 , to\n", + "5 -..ML\n", + "6 D the\n", + "7 -Oed Learning with-to for-c- and Data ofe\n", + " \n", + "8 S from: a Learning ofn- for Synthetic\n", + "9 \n", + "Name: title, dtype: string" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "syn_tgt_df['title'].head(10)" ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "execution": { + "iopub.execute_input": "2025-02-05T13:56:20.289566Z", + "iopub.status.busy": "2025-02-05T13:56:20.289133Z", + "iopub.status.idle": "2025-02-05T13:56:20.296014Z", + "shell.execute_reply": "2025-02-05T13:56:20.295608Z", + "shell.execute_reply.started": "2025-02-05T13:56:20.289552Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Conception d'un banc d'essais d\\'ecisionnel\n", + "1 Monotonicity Analysis over Chains and Curves\n", + "2 An active curve approach for tomographic recon...\n", + "3 Application of the HLSVD technique to the filt...\n", + "4 Phase retrieval by iterated projections\n", + "5 DIRC for a Higher Luminosity B Factory\n", + "6 Analysis of approximate nearest neighbor searc...\n", + "7 Efficient Retrieval of Similar Time Sequences ...\n", + "8 Mining Generalized Graph Patterns based on Use...\n", + "9 ARACNE: An Algorithm for the Reconstruction of...\n", + "Name: title, dtype: object" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trn_df['title'].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "execution": { + "iopub.execute_input": "2025-02-05T13:56:20.296770Z", + "iopub.status.busy": "2025-02-05T13:56:20.296615Z", + "iopub.status.idle": "2025-02-05T13:56:20.302894Z", + "shell.execute_reply": "2025-02-05T13:56:20.302345Z", + "shell.execute_reply.started": "2025-02-05T13:56:20.296758Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categorytitle
0cs.CV,
1cs.CYcategory
2stat.ML: A- for
3cs.CV:
4cs.LG, to
.........
9995cs.LGcategory
9996stat.ME:
9997cs.LGD
9998cs.CL:
9999stat.MLC.
\n", + "

10000 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " category title\n", + "0 cs.CV , \n", + "1 cs.CY category\n", + "2 stat.ML : A- for \n", + "3 cs.CV : \n", + "4 cs.LG , to\n", + "... ... ...\n", + "9995 cs.LG category\n", + "9996 stat.ME : \n", + "9997 cs.LG D \n", + "9998 cs.CL : \n", + "9999 stat.ML C.\n", + "\n", + "[10000 rows x 2 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "syn_tgt_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": ".venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -142,7 +496,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.8" + "version": "3.10.16" }, "toc": { "base_numbering": 1, diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index d6ef79f..6dee026 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -64,18 +64,31 @@ def monkey_patch_formatron(): def get_formatter_builders( - *, seed_df: pd.DataFrame | None = None, size: int | None = None, unseeded_fields: list[str] + *, seed_df: pd.DataFrame | None = None, size: int | None = None, stats: dict ) -> list[FormatterBuilder]: assert (seed_df is not None) ^ (size is not None), "exactly one of seed_df or size must be provided" formatter_builders = [] if seed_df is None: seed_df = pd.DataFrame(index=range(size)) + unseeded_fields = [c for c in list(stats["columns"].keys()) if c not in seed_df.columns.to_list()] + categorical_fields = [ + column + for column, column_stats in stats["columns"].items() + if column_stats["encoding_type"] == "LANGUAGE_CATEGORICAL" + ] for _, seed_row in seed_df.iterrows(): formatter_builder = FormatterBuilder() model_dict = {} if not seed_row.empty: model_dict |= {field_name: (Literal[seed_value], ...) for field_name, seed_value in seed_row.items()} - model_dict |= {field_name: (str, ...) for field_name in unseeded_fields} + for field_name in unseeded_fields: + if field_name in categorical_fields: + model_dict[field_name] = ( + Literal[tuple(cat for cat in stats["columns"][field_name]["codes"].keys())], + ..., + ) + else: + model_dict[field_name] = (str, ...) schema = create_model("TargetModel", **model_dict, __base__=MostlyClassSchema) formatter_builder.append_str(f"{formatter_builder.json(schema, capture_name=None)}") formatter_builders.append(formatter_builder) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 6c57cd5..d5394ad 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -247,7 +247,6 @@ def tqdm_disabled(): # prepare seed data for clean consumption by formatron seed_data = prepare_seed_for_formatron(seed_data, engine.tokenizer) seeded_tgt_columns = seed_data.columns.to_list() - unseeded_tgt_columns = [c for c in tgt_text_columns if c not in seeded_tgt_columns] total_tokenize_fn_time = 0 total_logits_processor_build_time = 0 @@ -259,7 +258,7 @@ def tqdm_disabled(): if enforce_json_output and len(seeded_tgt_columns) == 0: t0 = time.time() - formatter_builders = get_formatter_builders(size=batch_size, unseeded_fields=unseeded_tgt_columns) + formatter_builders = get_formatter_builders(size=batch_size, stats=tgt_stats) engine.initialize_logits_processors(formatter_builders, formatron_vocab_processors) total_logits_processor_build_time += time.time() - t0 @@ -277,10 +276,7 @@ def tqdm_disabled(): if enforce_json_output and len(seeded_tgt_columns) > 0: t0 = time.time() # some columns are seeded, so we need to create a new logits processor for each batch - formatter_builders = get_formatter_builders( - seed_df=sample_seed_batch, - unseeded_fields=unseeded_tgt_columns, - ) + formatter_builders = get_formatter_builders(seed_df=sample_seed_batch, stats=tgt_stats) engine.initialize_logits_processors(formatter_builders, formatron_vocab_processors) total_logits_processor_build_time += time.time() - t0 From a16a6a19b2d1e298196a223ed074c89f1649982c Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 6 Feb 2025 12:42:52 +0100 Subject: [PATCH 03/58] language_numeric --- mostlyai/engine/analysis.py | 19 ++++++++++++++----- mostlyai/engine/domain.py | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/mostlyai/engine/analysis.py b/mostlyai/engine/analysis.py index ab8ab3e..837c0af 100644 --- a/mostlyai/engine/analysis.py +++ b/mostlyai/engine/analysis.py @@ -41,6 +41,7 @@ ProgressCallback, ProgressCallbackWrapper, ) +from mostlyai.engine._encoding_types.language.numeric import analyze_language_numeric, analyze_reduce_language_numeric from mostlyai.engine._encoding_types.tabular.categorical import ( analyze_categorical, analyze_reduce_categorical, @@ -386,8 +387,12 @@ def get_unique_tables(qualified_column_names: Iterable[str]) -> list[str]: elif encoding_type == ModelEncodingType.language_text: stats_col = analyze_reduce_text(stats_list=column_stats_list) elif encoding_type == ModelEncodingType.language_categorical: - stats_col = analyze_reduce_text(stats_list=column_stats_list) - stats_col |= analyze_reduce_language_categorical( + stats_col = analyze_reduce_text(stats_list=column_stats_list) | analyze_reduce_language_categorical( + stats_list=column_stats_list, + value_protection=value_protection, + ) + elif encoding_type == ModelEncodingType.language_numeric: + stats_col = analyze_reduce_text(stats_list=column_stats_list) | analyze_reduce_language_numeric( stats_list=column_stats_list, value_protection=value_protection, ) @@ -527,9 +532,13 @@ def _analyze_flat_col( elif encoding_type == ModelEncodingType.language_text: stats = analyze_text(values, root_keys, context_keys) elif encoding_type == ModelEncodingType.language_categorical: - stats = analyze_text(values, root_keys, context_keys) - stats2 = analyze_language_categorical(values, root_keys, context_keys) - stats |= stats2 + stats = analyze_text(values, root_keys, context_keys) | analyze_language_categorical( + values, root_keys, context_keys + ) + elif encoding_type == ModelEncodingType.language_numeric: + stats = analyze_text(values, root_keys, context_keys) | analyze_language_numeric( + values, root_keys, context_keys + ) else: raise RuntimeError(f"unknown encoding type: `{encoding_type}` for `{values.name}`") return stats diff --git a/mostlyai/engine/domain.py b/mostlyai/engine/domain.py index 4c944be..6365614 100644 --- a/mostlyai/engine/domain.py +++ b/mostlyai/engine/domain.py @@ -61,8 +61,8 @@ class ModelEncodingType(str, Enum): tabular_datetime_relative = "TABULAR_DATETIME_RELATIVE" tabular_lat_long = "TABULAR_LAT_LONG" language_text = "LANGUAGE_TEXT" - # language_numeric = "LANGUAGE_NUMERIC" language_categorical = "LANGUAGE_CATEGORICAL" + language_numeric = "LANGUAGE_NUMERIC" # language_datetime = "LANGUAGE_DATETIME" From 827679aa562fd8d26662500787f5f15ab472019c Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 6 Feb 2025 13:05:24 +0100 Subject: [PATCH 04/58] language_numeric --- .../_encoding_types/language/numeric.py | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 mostlyai/engine/_encoding_types/language/numeric.py diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py new file mode 100644 index 0000000..689502e --- /dev/null +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -0,0 +1,110 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from mostlyai.engine._common import safe_convert_numeric +from mostlyai.engine._encoding_types.tabular.numeric import split_sub_columns_digit, NUMERIC_DIGIT_MAX_DECIMAL +from mostlyai.engine.domain import ModelEncodingType + + +def analyze_language_numeric(values: pd.Series, root_keys: pd.Series, _: pd.Series | None = None) -> dict: + values = safe_convert_numeric(values) + + # determine lowest/highest values by root ID, and return Top 10 + df = pd.concat([root_keys, values], axis=1) + min_values = df.groupby(root_keys.name)[values.name].min().dropna() + min11 = min_values.sort_values(ascending=True).head(11).astype("float").tolist() + max_values = df.groupby(root_keys.name)[values.name].max().dropna() + max11 = max_values.sort_values(ascending=False).head(11).astype("float").tolist() + + # split values into digits; used for digit numeric encoding, plus to determine precision + df_split = split_sub_columns_digit(values) + is_not_nan = df_split["nan"] == 0 + has_nan = sum(df_split["nan"]) > 0 + has_neg = sum(df_split["neg"]) > 0 + + # extract min/max digit for each position to determine valid value range for digit encoding + if any(is_not_nan): + min_digits = {k: int(df_split[k][is_not_nan].min()) for k in df_split if k.startswith("E")} + max_digits = {k: int(df_split[k][is_not_nan].max()) for k in df_split if k.startswith("E")} + else: + min_digits = {k: 0 for k in df_split if k.startswith("E")} + max_digits = {k: 0 for k in df_split if k.startswith("E")} + + # return stats + stats = { + "has_nan": has_nan, + "has_neg": has_neg, + "min_digits": min_digits, + "max_digits": max_digits, + "min11": min11, + "max11": max11, + } + return stats + + +def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bool = True) -> dict: + # check for occurrence of NaN values + has_nan = any([j["has_nan"] for j in stats_list]) + # check if there are negative values + has_neg = any([j["has_neg"] for j in stats_list]) + + # determine precision to apply rounding of sampled values during generation + keys = stats_list[0]["max_digits"].keys() + min_digits = {k: min([j["min_digits"][k] for j in stats_list]) for k in keys} + max_digits = {k: max([j["max_digits"][k] for j in stats_list]) for k in keys} + non_zero_prec = [k for k in keys if max_digits[k] > 0 and k.startswith("E")] + min_decimal = min([int(k[1:]) for k in non_zero_prec]) if len(non_zero_prec) > 0 else 0 + + # determine min / max 5 values to map too low / too high values to + min11 = sorted([v for min11 in [j["min11"] for j in stats_list] for v in min11], reverse=False)[:11] + max11 = sorted([v for max11 in [j["max11"] for j in stats_list] for v in max11], reverse=True)[:11] + if value_protection: + # extreme value protection - discard lowest/highest 5 values + if len(min11) < 11 or len(max11) < 11: + # less than 11 subjects with non-NULL values; we need to protect all + min5 = [] + max5 = [] + else: + min5 = min11[5:10] # drop 1 to 5th lowest; keep 6th to 10th lowest + max5 = max11[5:10] # drop 1 to 5th highest; keep 6th to 10th highest + else: + min5 = min11[0:5] + max5 = max11[0:5] + + if len(min5) > 0 or len(max5) > 0: + max_abs = np.max(np.abs(np.array([min5[0], max5[0]]))) + max_decimal = int(np.floor(np.log10(max_abs))) if max_abs >= 10 else 0 + else: + max_decimal = 0 + # don't allow more digits than the capped value for it + decimal_cap = [d[1:] for d in keys][0] + decimal_cap = int(decimal_cap) if decimal_cap.isnumeric() else NUMERIC_DIGIT_MAX_DECIMAL + max_decimal = min(max(min_decimal, max_decimal), decimal_cap) + + stats = { + "encoding_type": ModelEncodingType.tabular_numeric_digit.value, + "has_nan": has_nan, + "has_neg": has_neg, + "min_digits": min_digits, + "max_digits": max_digits, + "max_decimal": max_decimal, + "min_decimal": min_decimal, + "min5": min5, + "max5": max5, + } + + return stats From 2e10629ba93a8f14d1cc7d134abfcc79079aa104 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 6 Feb 2025 13:05:59 +0100 Subject: [PATCH 05/58] language_numeric --- mostlyai/engine/_encoding_types/language/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index 689502e..344e1de 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -96,7 +96,7 @@ def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bo max_decimal = min(max(min_decimal, max_decimal), decimal_cap) stats = { - "encoding_type": ModelEncodingType.tabular_numeric_digit.value, + "encoding_type": ModelEncodingType.language_numeric_digit.value, "has_nan": has_nan, "has_neg": has_neg, "min_digits": min_digits, From e4e0e4f4b7c95733e5585847fc5a51e6f28607b4 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 6 Feb 2025 13:06:24 +0100 Subject: [PATCH 06/58] language_numeric --- mostlyai/engine/_encoding_types/language/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index 344e1de..10dd985 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -96,7 +96,7 @@ def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bo max_decimal = min(max(min_decimal, max_decimal), decimal_cap) stats = { - "encoding_type": ModelEncodingType.language_numeric_digit.value, + "encoding_type": ModelEncodingType.language_numeric.value, "has_nan": has_nan, "has_neg": has_neg, "min_digits": min_digits, From a9b681d54a2a82b5fa4ec0b370feba130759daa8 Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 6 Feb 2025 13:09:24 +0100 Subject: [PATCH 07/58] add test and beginnings of numeric --- mostlyai/engine/_language/formatron_utils.py | 11 ++++++ tests/end_to_end/test_language.py | 38 ++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 6dee026..9aa4226 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -76,6 +76,11 @@ def get_formatter_builders( for column, column_stats in stats["columns"].items() if column_stats["encoding_type"] == "LANGUAGE_CATEGORICAL" ] + numeric_fields = [ + column + for column, column_stats in stats["columns"].items() + if column_stats["encoding_type"] == "LANGUAGE_NUMERIC" + ] for _, seed_row in seed_df.iterrows(): formatter_builder = FormatterBuilder() model_dict = {} @@ -87,6 +92,12 @@ def get_formatter_builders( Literal[tuple(cat for cat in stats["columns"][field_name]["codes"].keys())], ..., ) + elif field_name in numeric_fields: + max_decimals = stats["columns"][field_name]["max_decimals"] + if max_decimals == 0: + model_dict[field_name] = (int, ...) + else: + model_dict[field_name] = (float, ...) else: model_dict[field_name] = (str, ...) schema = create_model("TargetModel", **model_dict, __base__=MostlyClassSchema) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 38d3ea3..0319154 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -72,6 +72,31 @@ def encoded_text_dataset(tmp_path_factory): return workspace_dir +@pytest.fixture(scope="session") +def encoded_numeric_categorical_dataset(tmp_path_factory): + workspace_dir = tmp_path_factory.mktemp("ws") + no_of_records = 20 + data = pd.DataFrame( + { + "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4), + "age": [20, 30, 40, 50] * int(no_of_records / 4), + } + ) + tgt_encoding_types = { + "age": ModelEncodingType.language_numeric.value, + "gender": ModelEncodingType.language_categorical.value, + } + split( + tgt_data=data, + workspace_dir=workspace_dir, + model_type="LANGUAGE", + tgt_encoding_types=tgt_encoding_types, + ) + analyze(workspace_dir=workspace_dir) + encode(workspace_dir=workspace_dir) + return workspace_dir + + @pytest.fixture(scope="session") def single_record_text_dataset(tmp_path_factory): workspace_dir = tmp_path_factory.mktemp("ws-single-record") @@ -117,6 +142,19 @@ def test_tgt_only(tgt_only_text_dataset): assert str(syn["bio"].dtype).startswith("string") +def test_categorical_numeric(encoded_numeric_categorical_dataset): + workspace_dir = encoded_numeric_categorical_dataset + train(workspace_dir=workspace_dir, model=LSTMFromScratchConfig.model_id) + generate(workspace_dir=workspace_dir, sample_size=10) + + syn_data_path = workspace_dir / "SyntheticData" + syn = pd.read_parquet(syn_data_path) + assert len(syn) == 10 + assert set(syn.columns) == {"age", "gender"} + assert str(syn["age"].dtype).startswith("int64") + assert str(syn["gender"].dtype).startswith("string") + + @pytest.mark.parametrize( ("model_name", "sampling_temperature"), [ From ab5ce5a20aac530acbec02a0e82daeb3e2700500 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 6 Feb 2025 13:09:34 +0100 Subject: [PATCH 08/58] language_numeric --- mostlyai/engine/analysis.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mostlyai/engine/analysis.py b/mostlyai/engine/analysis.py index 837c0af..08569ea 100644 --- a/mostlyai/engine/analysis.py +++ b/mostlyai/engine/analysis.py @@ -422,10 +422,13 @@ def get_argn_processor(mode, is_flat) -> str: if not is_flat: stats_col["seq_len"] = _analyze_reduce_seq_len([column_stats_list[0]["seq_len"]]) - if encoding_type in (ModelEncodingType.language_text, ModelEncodingType.language_categorical): + if encoding_type in ( + ModelEncodingType.language_text, + ModelEncodingType.language_categorical, + ModelEncodingType.language_numeric, + ): _LOG.info( - # f"analyzed column `{column}`: {stats_col['encoding_type']} nchar_max={stats_col['nchar_max']} nchar_avg={stats_col['nchar_avg']}" - f"analyzed column `{column}`: {stats_col['encoding_type']} " + f"analyzed column `{column}`: {stats_col['encoding_type']} nchar_max={stats_col['nchar_max']} nchar_avg={stats_col['nchar_avg']}" ) else: _LOG.info(f"analyzed column `{column}`: {stats_col['encoding_type']} {stats_col['cardinalities']}") From 52c8303f765b1b7926bb0c4a46d8c979316062c7 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 6 Feb 2025 13:27:09 +0100 Subject: [PATCH 09/58] fix --- mostlyai/engine/_language/formatron_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 9aa4226..3cdc84e 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -93,8 +93,8 @@ def get_formatter_builders( ..., ) elif field_name in numeric_fields: - max_decimals = stats["columns"][field_name]["max_decimals"] - if max_decimals == 0: + max_decimal = stats["columns"][field_name]["max_decimal"] + if max_decimal == 0: model_dict[field_name] = (int, ...) else: model_dict[field_name] = (float, ...) From ce61fede1c5bb01c1b19590ac31944f48a050faf Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 6 Feb 2025 13:28:26 +0100 Subject: [PATCH 10/58] remove unnecessary --- mostlyai/engine/_encoding_types/language/numeric.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index 10dd985..05b065b 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -38,17 +38,14 @@ def analyze_language_numeric(values: pd.Series, root_keys: pd.Series, _: pd.Seri # extract min/max digit for each position to determine valid value range for digit encoding if any(is_not_nan): - min_digits = {k: int(df_split[k][is_not_nan].min()) for k in df_split if k.startswith("E")} max_digits = {k: int(df_split[k][is_not_nan].max()) for k in df_split if k.startswith("E")} else: - min_digits = {k: 0 for k in df_split if k.startswith("E")} max_digits = {k: 0 for k in df_split if k.startswith("E")} # return stats stats = { "has_nan": has_nan, "has_neg": has_neg, - "min_digits": min_digits, "max_digits": max_digits, "min11": min11, "max11": max11, @@ -64,7 +61,6 @@ def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bo # determine precision to apply rounding of sampled values during generation keys = stats_list[0]["max_digits"].keys() - min_digits = {k: min([j["min_digits"][k] for j in stats_list]) for k in keys} max_digits = {k: max([j["max_digits"][k] for j in stats_list]) for k in keys} non_zero_prec = [k for k in keys if max_digits[k] > 0 and k.startswith("E")] min_decimal = min([int(k[1:]) for k in non_zero_prec]) if len(non_zero_prec) > 0 else 0 @@ -99,8 +95,6 @@ def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bo "encoding_type": ModelEncodingType.language_numeric.value, "has_nan": has_nan, "has_neg": has_neg, - "min_digits": min_digits, - "max_digits": max_digits, "max_decimal": max_decimal, "min_decimal": min_decimal, "min5": min5, From e2e23a4c1ade271ed68fd597fa463d9225db536a Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 6 Feb 2025 13:41:07 +0100 Subject: [PATCH 11/58] change test to AMD, fix max_decimal --- tests/end_to_end/test_language.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 0319154..87c32fd 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -142,9 +142,16 @@ def test_tgt_only(tgt_only_text_dataset): assert str(syn["bio"].dtype).startswith("string") -def test_categorical_numeric(encoded_numeric_categorical_dataset): +@pytest.mark.parametrize( + ("model_name"), + [ + # LSTMFromScratchConfig.model_id, # FIXME: this fails due to `RuntimeError: probability tensor contains either `inf`, `nan` or element < 0`, potentially due to missing numeric unicode tokens + "amd/AMD-Llama-135m", + ], +) +def test_categorical_numeric(encoded_numeric_categorical_dataset, model_name): workspace_dir = encoded_numeric_categorical_dataset - train(workspace_dir=workspace_dir, model=LSTMFromScratchConfig.model_id) + train(workspace_dir=workspace_dir, model=model_name) generate(workspace_dir=workspace_dir, sample_size=10) syn_data_path = workspace_dir / "SyntheticData" From 988d3c8d80538994036204bcd22d5b58b8e44b8e Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 6 Feb 2025 14:24:16 +0100 Subject: [PATCH 12/58] fix decimal --- mostlyai/engine/_language/formatron_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 3cdc84e..a2265cd 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -93,8 +93,8 @@ def get_formatter_builders( ..., ) elif field_name in numeric_fields: - max_decimal = stats["columns"][field_name]["max_decimal"] - if max_decimal == 0: + min_decimal = stats["columns"][field_name]["min_decimal"] + if min_decimal >= 0: model_dict[field_name] = (int, ...) else: model_dict[field_name] = (float, ...) From e3dc8108780a2e2c54c2c0975545517cd79c0693 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 6 Feb 2025 14:24:51 +0100 Subject: [PATCH 13/58] max_scale --- .../engine/_encoding_types/language/numeric.py | 18 +++--------------- mostlyai/engine/_language/formatron_utils.py | 4 ++-- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index 05b065b..d5edd34 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import pandas as pd from mostlyai.engine._common import safe_convert_numeric -from mostlyai.engine._encoding_types.tabular.numeric import split_sub_columns_digit, NUMERIC_DIGIT_MAX_DECIMAL +from mostlyai.engine._encoding_types.tabular.numeric import split_sub_columns_digit from mostlyai.engine.domain import ModelEncodingType @@ -57,7 +56,6 @@ def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bo # check for occurrence of NaN values has_nan = any([j["has_nan"] for j in stats_list]) # check if there are negative values - has_neg = any([j["has_neg"] for j in stats_list]) # determine precision to apply rounding of sampled values during generation keys = stats_list[0]["max_digits"].keys() @@ -81,22 +79,12 @@ def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bo min5 = min11[0:5] max5 = max11[0:5] - if len(min5) > 0 or len(max5) > 0: - max_abs = np.max(np.abs(np.array([min5[0], max5[0]]))) - max_decimal = int(np.floor(np.log10(max_abs))) if max_abs >= 10 else 0 - else: - max_decimal = 0 - # don't allow more digits than the capped value for it - decimal_cap = [d[1:] for d in keys][0] - decimal_cap = int(decimal_cap) if decimal_cap.isnumeric() else NUMERIC_DIGIT_MAX_DECIMAL - max_decimal = min(max(min_decimal, max_decimal), decimal_cap) + max_scale = abs(min(min_decimal, 0)) stats = { "encoding_type": ModelEncodingType.language_numeric.value, "has_nan": has_nan, - "has_neg": has_neg, - "max_decimal": max_decimal, - "min_decimal": min_decimal, + "max_scale": max_scale, "min5": min5, "max5": max5, } diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index a2265cd..0b63ebf 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -93,8 +93,8 @@ def get_formatter_builders( ..., ) elif field_name in numeric_fields: - min_decimal = stats["columns"][field_name]["min_decimal"] - if min_decimal >= 0: + max_scale = stats["columns"][field_name]["max_scale"] + if max_scale == 0: model_dict[field_name] = (int, ...) else: model_dict[field_name] = (float, ...) From 926ec2b2b6a97169e7ac57982e9dd5648da3047e Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 6 Feb 2025 14:57:49 +0100 Subject: [PATCH 14/58] encode string/numeric --- mostlyai/engine/_language/generation.py | 37 ++++++++++++++++++------- tests/end_to_end/test_language.py | 4 +-- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index d5394ad..029d9cd 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -21,6 +21,7 @@ import time from pathlib import Path +import numpy as np import pandas as pd import torch from huggingface_hub import constants as hf_constants @@ -53,7 +54,7 @@ def decode_buffered_samples( buffer: FixedSizeSampleBuffer, tokenizer: PreTrainedTokenizerBase, - tgt_text_columns: list[str], + tgt_stats: dict[str, str], tgt_context_key: str, max_new_tokens: int, ): @@ -87,8 +88,8 @@ def parse_json(x, columns: list[str]): tgt_seed = pd.concat(tgt_seed, axis=0).reset_index(drop=True) # The model works with un-prefixed column names, but we need to recover prefixed column names for the final output tgt_data = pd.DataFrame( - [parse_json(text, tgt_text_columns) for text in output_texts], - columns=tgt_text_columns, + [parse_json(text, tgt_stats["columns"].keys()) for text in output_texts], + columns=tgt_stats["columns"].keys(), index=ctx_keys.index, dtype="string", ) @@ -98,19 +99,37 @@ def parse_json(x, columns: list[str]): ) # overwrite generated columns with the seeded values tgt_data.update(tgt_seed) - # ensure STRING type - tgt_data = tgt_data.astype(STRING) + # prepend the context keys to the data (if not dummy context) if ctx_keys.name != DUMMY_CONTEXT_KEY: tgt_data = pd.concat([ctx_keys, tgt_data], axis=1) - invalid_percentage = ((tgt_data[tgt_text_columns] == INVALID_VALUE).sum() / len(tgt_data) * 100.0).map( + invalid_percentage = ((tgt_data[tgt_stats["columns"].keys()] == INVALID_VALUE).sum() / len(tgt_data) * 100.0).map( "{:.2f}%".format ) + + for col in tgt_stats["columns"].keys(): + col_stats = tgt_stats["columns"][col] + if col_stats["encoding_type"] == "LANGUAGE_NUMERIC": + tgt_data[col] = _decode_numeric(tgt_data[col], col_stats) + else: + tgt_data[col] = _decode_string(tgt_data[col], col_stats) + _LOG.info(f"percentage of invalid values: {invalid_percentage.to_dict()}") _LOG.info(f"decoded {tgt_data.shape} from {len(buffer.buffer)} batches in {time.time() - t0:.2f}s") return tgt_data +def _decode_string(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: + return x.astype(STRING) + + +def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: + x[x == ""] = np.nan + if col_stats["max_scale"] == 0: + return x.astype("Int64") + return x.astype(float) + + def generate( *, ctx_data: pd.DataFrame | None = None, @@ -291,7 +310,7 @@ def tqdm_disabled(): buffer.add((outputs, ctx_keys, sample_seed_batch)) if buffer.is_full(): decoded_data = decode_buffered_samples( - buffer, engine.tokenizer, tgt_text_columns, tgt_context_key, max_new_tokens + buffer, engine.tokenizer, tgt_stats, tgt_context_key, max_new_tokens ) persist_data_part( decoded_data, @@ -303,9 +322,7 @@ def tqdm_disabled(): samples_processed += len(ctx_batch) if not buffer.is_empty(): - decoded_data = decode_buffered_samples( - buffer, engine.tokenizer, tgt_text_columns, tgt_context_key, max_new_tokens - ) + decoded_data = decode_buffered_samples(buffer, engine.tokenizer, tgt_stats, tgt_context_key, max_new_tokens) persist_data_part( decoded_data, output_path, diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 87c32fd..579cf47 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -158,8 +158,8 @@ def test_categorical_numeric(encoded_numeric_categorical_dataset, model_name): syn = pd.read_parquet(syn_data_path) assert len(syn) == 10 assert set(syn.columns) == {"age", "gender"} - assert str(syn["age"].dtype).startswith("int64") - assert str(syn["gender"].dtype).startswith("string") + assert syn["age"].dtype == "Int64" + assert syn["gender"].dtype == "string" @pytest.mark.parametrize( From abda5d55de1777c806df84ddbc729213154022e5 Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 6 Feb 2025 15:23:11 +0100 Subject: [PATCH 15/58] placeholder datetime --- mostlyai/engine/_language/formatron_utils.py | 13 +++++++++++-- mostlyai/engine/_language/generation.py | 9 ++++++++- tests/end_to_end/test_language.py | 14 +++++++++++--- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 0b63ebf..e706c7a 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -24,6 +24,8 @@ from pydantic import create_model from transformers import PreTrainedTokenizerBase +from mostlyai.engine.domain import ModelEncodingType + JSON_NULL = "null" @@ -74,12 +76,17 @@ def get_formatter_builders( categorical_fields = [ column for column, column_stats in stats["columns"].items() - if column_stats["encoding_type"] == "LANGUAGE_CATEGORICAL" + if column_stats["encoding_type"] == ModelEncodingType.LANGUAGE_CATEGORICAL ] numeric_fields = [ column for column, column_stats in stats["columns"].items() - if column_stats["encoding_type"] == "LANGUAGE_NUMERIC" + if column_stats["encoding_type"] == ModelEncodingType.LANGUAGE_NUMERIC + ] + datetime_fields = [ + column + for column, column_stats in stats["columns"].items() + if column_stats["encoding_type"] == ModelEncodingType.LANGUAGE_DATETIME ] for _, seed_row in seed_df.iterrows(): formatter_builder = FormatterBuilder() @@ -98,6 +105,8 @@ def get_formatter_builders( model_dict[field_name] = (int, ...) else: model_dict[field_name] = (float, ...) + elif field_name in datetime_fields: + model_dict[field_name] = (str, ...) # FIXME: temp else: model_dict[field_name] = (str, ...) schema = create_model("TargetModel", **model_dict, __base__=MostlyClassSchema) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 029d9cd..f6c0bcf 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -45,6 +45,7 @@ prepare_seed_for_formatron, get_vocab_processors, ) +from mostlyai.engine.domain import ModelEncodingType INVALID_VALUE = "_INVALID_" # when JSON parsing fails, the values of target columns will be set to this DUMMY_CONTEXT_KEY = "__dummy_context_key" @@ -109,8 +110,10 @@ def parse_json(x, columns: list[str]): for col in tgt_stats["columns"].keys(): col_stats = tgt_stats["columns"][col] - if col_stats["encoding_type"] == "LANGUAGE_NUMERIC": + if col_stats["encoding_type"] == ModelEncodingType.LANGUAGE_NUMERIC: tgt_data[col] = _decode_numeric(tgt_data[col], col_stats) + elif col_stats["encoding_type"] == ModelEncodingType.LANGUAGE_DATETIME: + tgt_data[col] = _decode_datetime(tgt_data[col], col_stats) else: tgt_data[col] = _decode_string(tgt_data[col], col_stats) @@ -130,6 +133,10 @@ def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: return x.astype(float) +def _decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: + return pd.to_datetime(x) + + def generate( *, ctx_data: pd.DataFrame | None = None, diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 579cf47..7109a6f 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -73,18 +73,26 @@ def encoded_text_dataset(tmp_path_factory): @pytest.fixture(scope="session") -def encoded_numeric_categorical_dataset(tmp_path_factory): +def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): workspace_dir = tmp_path_factory.mktemp("ws") no_of_records = 20 data = pd.DataFrame( { "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4), "age": [20, 30, 40, 50] * int(no_of_records / 4), + "date": [ + pd.Timestamp("2020-01-01"), + pd.Timestamp("2020-01-02"), + pd.Timestamp("2023-01-03"), + pd.Timestamp("2025-01-04"), + ] + * int(no_of_records / 4), } ) tgt_encoding_types = { "age": ModelEncodingType.language_numeric.value, "gender": ModelEncodingType.language_categorical.value, + "date": ModelEncodingType.language_datetime.value, } split( tgt_data=data, @@ -149,8 +157,8 @@ def test_tgt_only(tgt_only_text_dataset): "amd/AMD-Llama-135m", ], ) -def test_categorical_numeric(encoded_numeric_categorical_dataset, model_name): - workspace_dir = encoded_numeric_categorical_dataset +def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_dataset, model_name): + workspace_dir = encoded_numeric_categorical_datetime_dataset train(workspace_dir=workspace_dir, model=model_name) generate(workspace_dir=workspace_dir, sample_size=10) From 337003c0d3b80add40cf2fb0f72114db627cfebc Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 6 Feb 2025 15:24:19 +0100 Subject: [PATCH 16/58] add dt test asserts --- tests/end_to_end/test_language.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 7109a6f..1007c81 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -165,9 +165,10 @@ def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_datas syn_data_path = workspace_dir / "SyntheticData" syn = pd.read_parquet(syn_data_path) assert len(syn) == 10 - assert set(syn.columns) == {"age", "gender"} + assert set(syn.columns) == {"age", "gender", "date"} assert syn["age"].dtype == "Int64" assert syn["gender"].dtype == "string" + assert syn["date"].dtype == "datetime64[ns]" @pytest.mark.parametrize( From c668e67a4f4791ff72aa1296bd0d591749b4cb91 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 6 Feb 2025 15:40:13 +0100 Subject: [PATCH 17/58] datetime --- .../_encoding_types/language/datetime.py | 82 +++++++++++++++++++ mostlyai/engine/_language/formatron_utils.py | 6 +- mostlyai/engine/_language/generation.py | 4 +- mostlyai/engine/analysis.py | 14 ++++ mostlyai/engine/domain.py | 5 +- 5 files changed, 105 insertions(+), 6 deletions(-) create mode 100644 mostlyai/engine/_encoding_types/language/datetime.py diff --git a/mostlyai/engine/_encoding_types/language/datetime.py b/mostlyai/engine/_encoding_types/language/datetime.py new file mode 100644 index 0000000..f6d633d --- /dev/null +++ b/mostlyai/engine/_encoding_types/language/datetime.py @@ -0,0 +1,82 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd + +from mostlyai.engine._common import safe_convert_datetime +from mostlyai.engine._encoding_types.tabular.datetime import split_sub_columns_datetime, DATETIME_PARTS + + +def analyze_language_datetime(values: pd.Series, root_keys: pd.Series, _: pd.Series | None = None) -> dict: + values = safe_convert_datetime(values) + df = pd.concat([root_keys, values], axis=1) + # determine lowest/highest values by root ID, and return Top 10 + min_dates = df.groupby(root_keys.name)[values.name].min().dropna() + min11 = min_dates.sort_values(ascending=True).head(11).astype(str).tolist() + max_dates = df.groupby(root_keys.name)[values.name].max().dropna() + max11 = max_dates.sort_values(ascending=False).head(11).astype(str).tolist() + # split into datetime parts + df_split = split_sub_columns_datetime(values) + is_not_nan = df_split["nan"] == 0 + has_nan = any(df_split["nan"] == 1) + # extract min/max value for each part to determine valid value range + if any(is_not_nan): + min_values = {k: int(df_split[k][is_not_nan].min()) for k in DATETIME_PARTS} + max_values = {k: int(df_split[k][is_not_nan].max()) for k in DATETIME_PARTS} + else: + def_values = {"year": 2022, "month": 1, "day": 1} + min_values = {k: 0 for k in DATETIME_PARTS} | def_values + max_values = {k: 0 for k in DATETIME_PARTS} | def_values + # return stats + stats = { + "has_nan": has_nan, + "min_values": min_values, + "max_values": max_values, + "min11": min11, + "max11": max11, + } + return stats + + +def analyze_reduce_language_datetime(stats_list: list[dict], value_protection: bool = True) -> dict: + # check if there are missing values + has_nan = any([j["has_nan"] for j in stats_list]) + # determine min/max values for each part + keys = stats_list[0]["min_values"].keys() + min_values = {k: min([j["min_values"][k] for j in stats_list]) for k in keys} + max_values = {k: max([j["max_values"][k] for j in stats_list]) for k in keys} + # determine min / max 5 values to map too low / too high values to + min11 = sorted([v for min11 in [j["min11"] for j in stats_list] for v in min11], reverse=False)[:11] + max11 = sorted([v for max11 in [j["max11"] for j in stats_list] for v in max11], reverse=True)[:11] + if value_protection: + # extreme value protection - discard lowest/highest 5 values + if len(min11) < 11 or len(max11) < 11: + # less than 11 subjects with non-NULL values; we need to protect all + min5 = [] + max5 = [] + else: + min5 = [str(v) for v in min11[5:10]] # drop 1 to 5th lowest; keep 6th to 10th lowest + max5 = [str(v) for v in max11[5:10]] # drop 1 to 5th highest; keep 6th to 10th highest + # update min/max year based on first four letters of protected min/max dates + max_values["year"] = int(max5[0][0:4]) + min_values["year"] = int(min5[0][0:4]) + else: + min5 = min11[0:4] + max5 = max11[0:4] + stats = { + "has_nan": has_nan, + "min5": min5, + "max5": max5, + } + return stats diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index e706c7a..b5ab3ff 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -76,17 +76,17 @@ def get_formatter_builders( categorical_fields = [ column for column, column_stats in stats["columns"].items() - if column_stats["encoding_type"] == ModelEncodingType.LANGUAGE_CATEGORICAL + if column_stats["encoding_type"] == ModelEncodingType.language_categorical ] numeric_fields = [ column for column, column_stats in stats["columns"].items() - if column_stats["encoding_type"] == ModelEncodingType.LANGUAGE_NUMERIC + if column_stats["encoding_type"] == ModelEncodingType.language_numeric ] datetime_fields = [ column for column, column_stats in stats["columns"].items() - if column_stats["encoding_type"] == ModelEncodingType.LANGUAGE_DATETIME + if column_stats["encoding_type"] == ModelEncodingType.language_datetime ] for _, seed_row in seed_df.iterrows(): formatter_builder = FormatterBuilder() diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index f6c0bcf..236f4cc 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -110,9 +110,9 @@ def parse_json(x, columns: list[str]): for col in tgt_stats["columns"].keys(): col_stats = tgt_stats["columns"][col] - if col_stats["encoding_type"] == ModelEncodingType.LANGUAGE_NUMERIC: + if col_stats["encoding_type"] == ModelEncodingType.language_numeric: tgt_data[col] = _decode_numeric(tgt_data[col], col_stats) - elif col_stats["encoding_type"] == ModelEncodingType.LANGUAGE_DATETIME: + elif col_stats["encoding_type"] == ModelEncodingType.language_datetime: tgt_data[col] = _decode_datetime(tgt_data[col], col_stats) else: tgt_data[col] = _decode_string(tgt_data[col], col_stats) diff --git a/mostlyai/engine/analysis.py b/mostlyai/engine/analysis.py index 08569ea..eea3bac 100644 --- a/mostlyai/engine/analysis.py +++ b/mostlyai/engine/analysis.py @@ -41,6 +41,10 @@ ProgressCallback, ProgressCallbackWrapper, ) +from mostlyai.engine._encoding_types.language.datetime import ( + analyze_reduce_language_datetime, + analyze_language_datetime, +) from mostlyai.engine._encoding_types.language.numeric import analyze_language_numeric, analyze_reduce_language_numeric from mostlyai.engine._encoding_types.tabular.categorical import ( analyze_categorical, @@ -396,6 +400,11 @@ def get_unique_tables(qualified_column_names: Iterable[str]) -> list[str]: stats_list=column_stats_list, value_protection=value_protection, ) + elif encoding_type == ModelEncodingType.language_datetime: + stats_col = analyze_reduce_text(stats_list=column_stats_list) | analyze_reduce_language_datetime( + stats_list=column_stats_list, + value_protection=value_protection, + ) else: raise RuntimeError(f"unknown encoding type {encoding_type}") @@ -426,6 +435,7 @@ def get_argn_processor(mode, is_flat) -> str: ModelEncodingType.language_text, ModelEncodingType.language_categorical, ModelEncodingType.language_numeric, + ModelEncodingType.language_datetime, ): _LOG.info( f"analyzed column `{column}`: {stats_col['encoding_type']} nchar_max={stats_col['nchar_max']} nchar_avg={stats_col['nchar_avg']}" @@ -542,6 +552,10 @@ def _analyze_flat_col( stats = analyze_text(values, root_keys, context_keys) | analyze_language_numeric( values, root_keys, context_keys ) + elif encoding_type == ModelEncodingType.language_datetime: + stats = analyze_text(values, root_keys, context_keys) | analyze_language_datetime( + values, root_keys, context_keys + ) else: raise RuntimeError(f"unknown encoding type: `{encoding_type}` for `{values.name}`") return stats diff --git a/mostlyai/engine/domain.py b/mostlyai/engine/domain.py index 6365614..1449e66 100644 --- a/mostlyai/engine/domain.py +++ b/mostlyai/engine/domain.py @@ -48,6 +48,9 @@ class ModelEncodingType(str, Enum): - `TABULAR_DATETIME_RELATIVE`: Model samples the relative difference between datetimes within a sequence. - `TABULAR_LAT_LONG`: Model samples a latitude-longitude column. The format is "latitude,longitude". - `LANGUAGE_TEXT`: Model will train a distinct LANGUAGE model for this column, to then generate free text. + - `LANGUAGE_CATEGORICAL`: TODO + - `LANGUAGE_NUMERIC`: TODO + - `LANGUAGE_DATETIME`: TODO """ auto = "AUTO" @@ -63,7 +66,7 @@ class ModelEncodingType(str, Enum): language_text = "LANGUAGE_TEXT" language_categorical = "LANGUAGE_CATEGORICAL" language_numeric = "LANGUAGE_NUMERIC" - # language_datetime = "LANGUAGE_DATETIME" + language_datetime = "LANGUAGE_DATETIME" class ModelStateStrategy(str, Enum): From 8d5e764add56c5395db5650c2a492cce05e61f34 Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 6 Feb 2025 16:04:28 +0100 Subject: [PATCH 18/58] add coercion --- mostlyai/engine/_language/formatron_utils.py | 7 +++++-- mostlyai/engine/_language/generation.py | 6 ++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index b5ab3ff..8a85ff2 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -63,6 +63,9 @@ def monkey_patch_formatron(): array_begin ::= #"\\[{SPACE_NONTERMINAL}"; array_end ::= #"{SPACE_NONTERMINAL}\\]"; """ + # date ::= #"(19\d{2}|20\d{2})-(0[1-9]|1[0-2])-(0[1-9]|1[0-9]|2[0-9]|3[0-1])"; + # time ::= #"([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])"; + # datetime ::= date|date " " time; def get_formatter_builders( @@ -92,11 +95,11 @@ def get_formatter_builders( formatter_builder = FormatterBuilder() model_dict = {} if not seed_row.empty: - model_dict |= {field_name: (Literal[seed_value], ...) for field_name, seed_value in seed_row.items()} + model_dict |= {field_name: (Literal[seed_value], ...) for field_name, seed_value in seed_row.items()} # type: ignore[valid-type] for field_name in unseeded_fields: if field_name in categorical_fields: model_dict[field_name] = ( - Literal[tuple(cat for cat in stats["columns"][field_name]["codes"].keys())], + Literal[tuple(cat for cat in stats["columns"][field_name]["codes"].keys())], # type: ignore[valid-type] ..., ) elif field_name in numeric_fields: diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 236f4cc..dbf5251 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -127,14 +127,16 @@ def _decode_string(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - x[x == ""] = np.nan + # FIXME revisit for invalid values -- sample from values / nan / or other + x[(x == "") | (x == "__INVALID__")] = np.nan if col_stats["max_scale"] == 0: return x.astype("Int64") return x.astype(float) def _decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - return pd.to_datetime(x) + # FIXME revisit for invalid values -- sample from values / nan / or other + return pd.to_datetime(x, errors="coerce") def generate( From 5913da9965122f46be34bcfc8df3da26aee28f32 Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Fri, 7 Feb 2025 10:48:08 +0100 Subject: [PATCH 19/58] fix invalid to nan for numeric --- mostlyai/engine/_language/generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index dbf5251..119c365 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -128,7 +128,7 @@ def _decode_string(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: # FIXME revisit for invalid values -- sample from values / nan / or other - x[(x == "") | (x == "__INVALID__")] = np.nan + x[(x == "") | (x == "_INVALID_")] = np.nan if col_stats["max_scale"] == 0: return x.astype("Int64") return x.astype(float) From 87fb34fa5974608a1f205118327872cb1f14622d Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Fri, 7 Feb 2025 14:21:52 +0100 Subject: [PATCH 20/58] coerce datetimes to valid dates, add datetime grammar, change test to use gpt2 for now. temporarily disable categorical test due to error --- mostlyai/engine/_language/engine/hf_engine.py | 7 +- mostlyai/engine/_language/formatron_utils.py | 67 ++++-- mostlyai/engine/_language/generation.py | 28 +++ mostlyai/engine/_language/temp_formatron.py | 220 ++++++++++++++++++ tests/end_to_end/test_language.py | 109 ++++----- 5 files changed, 346 insertions(+), 85 deletions(-) create mode 100644 mostlyai/engine/_language/temp_formatron.py diff --git a/mostlyai/engine/_language/engine/hf_engine.py b/mostlyai/engine/_language/engine/hf_engine.py index 18c3c5f..e651822 100644 --- a/mostlyai/engine/_language/engine/hf_engine.py +++ b/mostlyai/engine/_language/engine/hf_engine.py @@ -24,7 +24,7 @@ from transformers import AutoTokenizer from mostlyai.engine._language.common import load_base_model_and_config from mostlyai.engine._language.tokenizer_utils import tokenize_fn -from mostlyai.engine._language.formatron_utils import monkey_patch_formatron +# from mostlyai.engine._language.formatron_utils import monkey_patch_formatron from mostlyai.engine._language.engine.base import EngineMetrics, LanguageEngine from formatron.formatter import FormatterBuilder @@ -66,11 +66,6 @@ def __init__( self.tokenizer.special_tokens_map ) self._json_enforcing_possible = is_peft_adapter or is_trained_lstm_tokenizer - - # apply all necessary monkey patches to the formatron library - if self._json_enforcing_possible: - monkey_patch_formatron() - self._logits_processors = None def get_default_batch_size(self) -> int: diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 8a85ff2..d56cef9 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. + +import datetime import typing import pandas as pd @@ -23,6 +25,9 @@ from formatron.formats import json from pydantic import create_model from transformers import PreTrainedTokenizerBase +from mostlyai.engine._language.temp_formatron import JsonExtractor +import collections +from formatron.schemas.schema import Schema from mostlyai.engine.domain import ModelEncodingType @@ -43,29 +48,40 @@ def transform(x: str | None) -> str: return sample_seed.astype("string[pyarrow]").map(transform) -def monkey_patch_formatron(): - # alter the Grammar of formatron's json schema - FORMATRON_WHITESPACE_MAX_REPETITIONS = 10 - SPACE_NONTERMINAL = f"[ \t\n\r]{{0,{FORMATRON_WHITESPACE_MAX_REPETITIONS}}}" - - json.GRAMMAR_HEADER = rf"""integer ::= #"-?(0|[1-9]\\d*)"; - number ::= #"-?(0|[1-9]\\d*)(\\.\\d+)?([eE][+-]?\\d+)?"; - string ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}})*"'; - boolean ::= "true"|"false"; - null ::= "null"; - array ::= array_begin (json_value (comma json_value)*)? array_end; - object ::= object_begin (string colon json_value (comma string colon json_value)*)? object_end; - json_value ::= number|string|boolean|null|array|object; - comma ::= #"{SPACE_NONTERMINAL},{SPACE_NONTERMINAL}"; - colon ::= #"{SPACE_NONTERMINAL}:{SPACE_NONTERMINAL}"; - object_begin ::= #" \\{{{SPACE_NONTERMINAL}"; - object_end ::= #"{SPACE_NONTERMINAL}\\}}"; - array_begin ::= #"\\[{SPACE_NONTERMINAL}"; - array_end ::= #"{SPACE_NONTERMINAL}\\]"; - """ - # date ::= #"(19\d{2}|20\d{2})-(0[1-9]|1[0-2])-(0[1-9]|1[0-9]|2[0-9]|3[0-1])"; - # time ::= #"([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])"; - # datetime ::= date|date " " time; +class MostlyFormatterBuilder(FormatterBuilder): + def __init__(self): + super().__init__() + + def json(self, schema: type[Schema] | collections.abc.Sequence, *, capture_name: str = None) -> JsonExtractor: + """ + Create a JSON extractor. Check out the JsonExtractor docs for more details. + + Args: + schema: The schema for extraction. + capture_name: The capture name of the extractor, or `None` if the extractor does not capture. + Returns: + The JSON extractor. + """ + + def to_json(_json: str): + local_schema = schema + origin = typing.get_origin(local_schema) + if origin is not None: + local_schema = origin + if isinstance(local_schema, type) and issubclass(local_schema, Schema): + try: + return local_schema.from_json(_json) + except JSONDecodeError: # make ChoiceExtractor work appropriately + return None + else: + try: + return json.loads(_json) + except JSONDecodeError: + return None + + return self._add_extractor( + "json", lambda nonterminal: JsonExtractor(nonterminal, capture_name, schema, to_json) + ) def get_formatter_builders( @@ -92,7 +108,7 @@ def get_formatter_builders( if column_stats["encoding_type"] == ModelEncodingType.language_datetime ] for _, seed_row in seed_df.iterrows(): - formatter_builder = FormatterBuilder() + formatter_builder = MostlyFormatterBuilder() model_dict = {} if not seed_row.empty: model_dict |= {field_name: (Literal[seed_value], ...) for field_name, seed_value in seed_row.items()} # type: ignore[valid-type] @@ -109,7 +125,8 @@ def get_formatter_builders( else: model_dict[field_name] = (float, ...) elif field_name in datetime_fields: - model_dict[field_name] = (str, ...) # FIXME: temp + # model_dict[field_name] = (str, Field(pattern=r"19\\d{2}|20\\d{2}-0[1-9]|1[0-2]-0[1-9]|1[0-9]|2[0-9]|3[0-1]")) - might be able to make this work, but it fails + model_dict[field_name] = (datetime.datetime, ...) else: model_dict[field_name] = (str, ...) schema = create_model("TargetModel", **model_dict, __base__=MostlyClassSchema) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 119c365..2731a5b 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import calendar import contextlib +import datetime import json import os @@ -134,8 +136,34 @@ def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: return x.astype(float) +def coerce_datetime(text: str) -> str: + """ + Ensure that the text is a valid date or datetime string. + """ + if text == "" or text == "_INVALID_": + return text + # FIXME copy paste from datallm, see if should be cleaned up + + # extract year, month, and day from the ISO formatted text + y, m, d = int(text[:4]), int(text[5:7]), int(text[8:10]) + # set to last day of month, in case of too large day value + last_day = calendar.monthrange(y, m)[1] + d = min(d, last_day) + dt_str = f"{y:04d}-{m:02d}-{d:02d}" + text[10:] + # convert to date and back to check for valid date + try: + dt_str = datetime.datetime.fromisoformat(dt_str).isoformat().replace("T", " ") + except ValueError: + dt_str = text # FIXME revisit, if e.g. a cutoff date, then we just return the original text + # trim to original length + dt_str = dt_str[: len(text)] + return dt_str + + def _decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: + print(x) # FIXME revisit for invalid values -- sample from values / nan / or other + x = x.map(coerce_datetime) return pd.to_datetime(x, errors="coerce") diff --git a/mostlyai/engine/_language/temp_formatron.py b/mostlyai/engine/_language/temp_formatron.py new file mode 100644 index 0000000..b4c9a71 --- /dev/null +++ b/mostlyai/engine/_language/temp_formatron.py @@ -0,0 +1,220 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The module defines the `JsonExtractor` class, which is used to extract data from a string in JSON format. +""" + +import collections +import datetime +import typing + +from formatron import extractor, schemas +from formatron.formats.json import _type_to_nonterminals + +__all__ = ["JsonExtractor"] + + +FORMATRON_WHITESPACE_MAX_REPETITIONS = 10 +SPACE_NONTERMINAL = f"[ \t\n\r]{{0,{FORMATRON_WHITESPACE_MAX_REPETITIONS}}}" + +GRAMMAR_HEADER = rf"""integer ::= #"-?(0|[1-9]\\d*)"; +number ::= #"-?(0|[1-9]\\d*)(\\.\\d+)?([eE][+-]?\\d+)?"; +string ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}})*"'; +boolean ::= "true"|"false"; +null ::= "null"; +array ::= array_begin (json_value (comma json_value)*)? array_end; +object ::= object_begin (string colon json_value (comma string colon json_value)*)? object_end; +json_value ::= number|string|boolean|null|array|object; +datetime ::= #'"(19\\d{{2}}|20\\d{{2}})-(0[1-9]|1[0-2])-(0[1-9]|1[0-9]|2[0-9]|3[0-1]) ([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])"'; +comma ::= #"{SPACE_NONTERMINAL},{SPACE_NONTERMINAL}"; +colon ::= #"{SPACE_NONTERMINAL}:{SPACE_NONTERMINAL}"; +object_begin ::= #" \\{{{SPACE_NONTERMINAL}"; +object_end ::= #"{SPACE_NONTERMINAL}\\}}"; +array_begin ::= #"\\[{SPACE_NONTERMINAL}"; +array_end ::= #"{SPACE_NONTERMINAL}\\]"; +""" + +_type_id_to_nonterminal = { + id(int): "integer", + id(float): "number", + id(str): "string", + id(bool): "boolean", + id(type(None)): "null", + id(list): "array", + id(dict): "object", + id(typing.Any): "json_value", + id(datetime.datetime): "datetime", +} + + +def _generate_kbnf_grammar(schema: schemas.schema.Schema | collections.abc.Sequence, start_nonterminal: str) -> str: + """ + Generate a KBNF grammar string from a schema for JSON format. + + Args: + schema: The schema to generate a grammar for. + start_nonterminal: The start nonterminal of the grammar. Default is "start". + + Returns: + The generated KBNF grammar string. + """ + result = [GRAMMAR_HEADER] + nonterminals = set() + stack = [(schema, start_nonterminal)] + while stack: + (current, nonterminal) = stack.pop() + type_id = id(current) + if type_id in _type_id_to_nonterminal: + line = f"{nonterminal} ::= {_type_id_to_nonterminal[type_id]};\n" + result.append(line) + continue + _type_id_to_nonterminal[type_id] = nonterminal + for i in _type_to_nonterminals: + value = i(current, nonterminal) + if value is not None: + line, to_stack = value + result.append(line) + stack.extend(to_stack) + nonterminals.add(nonterminal) + break + else: + raise TypeError(f"{current} from {nonterminal} is not supported in json_generators!") + return "".join(result) + + +class JsonExtractor(extractor.NonterminalExtractor): + """ + An extractor that loads json data to an object from a string. + """ + + def __init__( + self, + nonterminal: str, + capture_name: str | None, + schema: schemas.schema.Schema | collections.abc.Sequence, + to_object: typing.Callable[[str], schemas.schema.Schema], + ): + """ + Create a json extractor from a given schema or a list of supported types. + + Currently, the following data types are supported: + + - bool + - int + - positive int + - negative int + - nonnegative int + - nonpositive int + - float + - positive float + - negative float + - nonnegative float + - nonpositive float + - str + - optionally with min_length, max_length and pattern constraints + - length is measured in UTF-8 character number after json parsing + - *Warning*: too large difference between min_length and max_length can lead to enormous memory consumption! + - pattern is mutually exclusive with min_length and max_length + - pattern will be compiled to a regular expression so all caveats of regular expressions apply + - pattern currently is automatically anchored at both ends + - the generated json could be invalid if the pattern allows invalid content between the json string's quotes. + - for example, `pattern=".*"` will allow '\"' to appear in the json string which is forbidden by JSON standard. + - also supports substring_of constraint which constrains the string to be a substring of a given string + - the generated json could be invalid if the given string contains invalid content when put into the json string's quotes. + - for example, `substring_of="abc\""` will allow '\"' to appear in the json string which is forbidden by JSON standard. + - NoneType + - typing.Any + - Subclasses of collections.abc.Mapping[str,T] and typing.Mapping[str,T] where T is a supported type, + - Subclasses of collections.abc.Sequence[T] and typing.Sequence[T] where T is a supported type. + - optionally with `minItems`, `maxItems`, `prefixItems` constraints + - *Warning*: too large difference between minItems and maxItems can lead to very slow performance! + - *Warning*: By json schema definition, prefixItems by default allows additional items and missing items in the prefixItems, which may not be the desired behavior and can lead to very slow performance if prefixItems is long! + - tuple[T1,T2,...] where T1,T2,... are supported types. The order, type and number of elements will be preserved. + - typing.Literal[x1,x2,...] where x1, x2, ... are instances of int, string, bool or NoneType, or another typing.Literal[y1,y2,...] + - typing.Union[T1,T2,...] where T1,T2,... are supported types. + - schemas.Schema where all its fields' data types are supported. Recursive schema definitions are supported as well. + - *Warning*: while not required field is supported, they can lead to very slow performance and/or enormous memory consumption if there are too many of them! + - Custom types registered via register_type_nonterminal() + + Args: + nonterminal: The nonterminal representing the extractor. + capture_name: The capture name of the extractor, or `None` if the extractor does not capture. + schema: The schema. + to_object: A callable to convert the extracted string to a schema instance. + """ + super().__init__(nonterminal, capture_name) + self._to_object = to_object + self._rule_str = _generate_kbnf_grammar( + schema, self.nonterminal + ) # FIXME, probably just monkey patch this instead + + def extract(self, input_str: str) -> tuple[str, schemas.schema.Schema] | None: + """ + Extract a schema instance from a string. + + Args: + input_str: The input string to extract from. + + Returns: + A tuple of the remaining string and the extracted schema instance, or `None` if extraction failed. + """ + + # Ensure the input string starts with '{' or '[' after stripping leading whitespace + input_str = input_str.lstrip() + if not input_str.startswith(("{", "[")): + return None + + # Variables to track the balance of brackets and the position in the string + bracket_count = 0 + position = 0 + in_string = False + escape_next = False + start_char = input_str[0] + end_char = "}" if start_char == "{" else "]" + + # Iterate over the string to find where the JSON object or array ends + for char in input_str: + if not in_string: + if char == start_char: + bracket_count += 1 + elif char == end_char: + bracket_count -= 1 + elif char == '"': + in_string = True + else: + if char == '"' and not escape_next: + in_string = False + elif char == "\\": + escape_next = not escape_next + else: + escape_next = False + + # Move to the next character + position += 1 + + # If brackets are balanced and we're not in a string, stop processing + if bracket_count == 0 and not in_string: + break + else: + return None + # The position now points to the character after the last '}', so we slice to position + json_str = input_str[:position] + remaining_str = input_str[position:] + # Return the unparsed remainder of the string and the decoded JSON object + return remaining_str, self._to_object(json_str) + + @property + def kbnf_definition(self): + return self._rule_str diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 1007c81..e8df75d 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -72,39 +72,6 @@ def encoded_text_dataset(tmp_path_factory): return workspace_dir -@pytest.fixture(scope="session") -def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): - workspace_dir = tmp_path_factory.mktemp("ws") - no_of_records = 20 - data = pd.DataFrame( - { - "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4), - "age": [20, 30, 40, 50] * int(no_of_records / 4), - "date": [ - pd.Timestamp("2020-01-01"), - pd.Timestamp("2020-01-02"), - pd.Timestamp("2023-01-03"), - pd.Timestamp("2025-01-04"), - ] - * int(no_of_records / 4), - } - ) - tgt_encoding_types = { - "age": ModelEncodingType.language_numeric.value, - "gender": ModelEncodingType.language_categorical.value, - "date": ModelEncodingType.language_datetime.value, - } - split( - tgt_data=data, - workspace_dir=workspace_dir, - model_type="LANGUAGE", - tgt_encoding_types=tgt_encoding_types, - ) - analyze(workspace_dir=workspace_dir) - encode(workspace_dir=workspace_dir) - return workspace_dir - - @pytest.fixture(scope="session") def single_record_text_dataset(tmp_path_factory): workspace_dir = tmp_path_factory.mktemp("ws-single-record") @@ -150,27 +117,6 @@ def test_tgt_only(tgt_only_text_dataset): assert str(syn["bio"].dtype).startswith("string") -@pytest.mark.parametrize( - ("model_name"), - [ - # LSTMFromScratchConfig.model_id, # FIXME: this fails due to `RuntimeError: probability tensor contains either `inf`, `nan` or element < 0`, potentially due to missing numeric unicode tokens - "amd/AMD-Llama-135m", - ], -) -def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_dataset, model_name): - workspace_dir = encoded_numeric_categorical_datetime_dataset - train(workspace_dir=workspace_dir, model=model_name) - generate(workspace_dir=workspace_dir, sample_size=10) - - syn_data_path = workspace_dir / "SyntheticData" - syn = pd.read_parquet(syn_data_path) - assert len(syn) == 10 - assert set(syn.columns) == {"age", "gender", "date"} - assert syn["age"].dtype == "Int64" - assert syn["gender"].dtype == "string" - assert syn["date"].dtype == "datetime64[ns]" - - @pytest.mark.parametrize( ("model_name", "sampling_temperature"), [ @@ -464,3 +410,58 @@ def test_special_character_column_name(tmp_path_factory): syn_data = pd.read_parquet(workspace_dir / "SyntheticData") assert len(syn_data) == 2 assert set(syn_data.columns) == set([TEMPORARY_PRIMARY_KEY] + list(tgt_encoding_types.keys())) + + +@pytest.fixture(scope="session") +def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): + workspace_dir = tmp_path_factory.mktemp("ws") + no_of_records = 20 # 20 + data = pd.DataFrame( + { + # "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4), + "age": [20, 30, 40, 50] * int(no_of_records / 4), + "date": [ + pd.Timestamp("2020-01-01"), + pd.Timestamp("2020-01-02"), + pd.Timestamp("2023-01-03"), + pd.Timestamp("2025-01-04"), + ] + * int(no_of_records / 4), + } + ) + tgt_encoding_types = { + "age": ModelEncodingType.language_numeric.value, + # "gender": ModelEncodingType.language_categorical.value, # FIXME had to comment out due to some issue with formatron + "date": ModelEncodingType.language_datetime.value, + } + split( + tgt_data=data, + workspace_dir=workspace_dir, + model_type="LANGUAGE", + tgt_encoding_types=tgt_encoding_types, + ) + analyze(workspace_dir=workspace_dir) + encode(workspace_dir=workspace_dir) + return workspace_dir + + +@pytest.mark.parametrize( + ("model_name"), + [ + # LSTMFromScratchConfig.model_id, # FIXME: this fails due to `RuntimeError: probability tensor contains either `inf`, `nan` or element < 0`, potentially due to missing numeric unicode tokens (missing ASCII) + # "amd/AMD-Llama-135m", # FIXME this model is horrible so we're skipping it + "openai-community/gpt2" # TEMP, better model than AMD + ], +) +def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_dataset, model_name): + workspace_dir = encoded_numeric_categorical_datetime_dataset + train(workspace_dir=workspace_dir, model=model_name) + generate(workspace_dir=workspace_dir, sample_size=10) + + syn_data_path = workspace_dir / "SyntheticData" + syn = pd.read_parquet(syn_data_path) + assert len(syn) == 10 + assert set(syn.columns) == {"age", "date"} # "gender", + assert syn["age"].dtype == "Int64" + # assert syn["gender"].dtype == "string" # FIXME had to comment out due to some issue with formatron + assert syn["date"].dtype == "datetime64[ns]" From 548f11a15087ee5745159c64ce5834da3c6a9d8e Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Fri, 7 Feb 2025 15:44:34 +0100 Subject: [PATCH 21/58] comments and refactor --- mostlyai/engine/_language/engine/hf_engine.py | 1 - mostlyai/engine/_language/generation.py | 3 +- mostlyai/engine/_language/temp_formatron.py | 34 +++++++++++-------- mostlyai/engine/_language/training.py | 1 + 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/mostlyai/engine/_language/engine/hf_engine.py b/mostlyai/engine/_language/engine/hf_engine.py index e651822..744fcaa 100644 --- a/mostlyai/engine/_language/engine/hf_engine.py +++ b/mostlyai/engine/_language/engine/hf_engine.py @@ -24,7 +24,6 @@ from transformers import AutoTokenizer from mostlyai.engine._language.common import load_base_model_and_config from mostlyai.engine._language.tokenizer_utils import tokenize_fn -# from mostlyai.engine._language.formatron_utils import monkey_patch_formatron from mostlyai.engine._language.engine.base import EngineMetrics, LanguageEngine from formatron.formatter import FormatterBuilder diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 2731a5b..26a63f8 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -130,6 +130,7 @@ def _decode_string(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: # FIXME revisit for invalid values -- sample from values / nan / or other + # FIXME add programmatic constraint x[(x == "") | (x == "_INVALID_")] = np.nan if col_stats["max_scale"] == 0: return x.astype("Int64") @@ -161,8 +162,8 @@ def coerce_datetime(text: str) -> str: def _decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - print(x) # FIXME revisit for invalid values -- sample from values / nan / or other + # TODO clamp datetime to valid range x = x.map(coerce_datetime) return pd.to_datetime(x, errors="coerce") diff --git a/mostlyai/engine/_language/temp_formatron.py b/mostlyai/engine/_language/temp_formatron.py index b4c9a71..9ac1c9e 100644 --- a/mostlyai/engine/_language/temp_formatron.py +++ b/mostlyai/engine/_language/temp_formatron.py @@ -29,6 +29,7 @@ FORMATRON_WHITESPACE_MAX_REPETITIONS = 10 SPACE_NONTERMINAL = f"[ \t\n\r]{{0,{FORMATRON_WHITESPACE_MAX_REPETITIONS}}}" +# Copy from formatron, altered to have limited whitespace repetitions and datetime format GRAMMAR_HEADER = rf"""integer ::= #"-?(0|[1-9]\\d*)"; number ::= #"-?(0|[1-9]\\d*)(\\.\\d+)?([eE][+-]?\\d+)?"; string ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}})*"'; @@ -46,19 +47,10 @@ array_end ::= #"{SPACE_NONTERMINAL}\\]"; """ -_type_id_to_nonterminal = { - id(int): "integer", - id(float): "number", - id(str): "string", - id(bool): "boolean", - id(type(None)): "null", - id(list): "array", - id(dict): "object", - id(typing.Any): "json_value", - id(datetime.datetime): "datetime", -} +# FIXME add grammar constraint of integer and number +# Copy from formatron except `datetime` def _generate_kbnf_grammar(schema: schemas.schema.Schema | collections.abc.Sequence, start_nonterminal: str) -> str: """ Generate a KBNF grammar string from a schema for JSON format. @@ -70,17 +62,28 @@ def _generate_kbnf_grammar(schema: schemas.schema.Schema | collections.abc.Seque Returns: The generated KBNF grammar string. """ + type_id_to_nonterminal = { + id(int): "integer", + id(float): "number", + id(str): "string", + id(bool): "boolean", + id(type(None)): "null", + id(list): "array", + id(dict): "object", + id(typing.Any): "json_value", + id(datetime.datetime): "datetime", # altered + } result = [GRAMMAR_HEADER] nonterminals = set() stack = [(schema, start_nonterminal)] while stack: (current, nonterminal) = stack.pop() type_id = id(current) - if type_id in _type_id_to_nonterminal: - line = f"{nonterminal} ::= {_type_id_to_nonterminal[type_id]};\n" + if type_id in type_id_to_nonterminal: + line = f"{nonterminal} ::= {type_id_to_nonterminal[type_id]};\n" result.append(line) continue - _type_id_to_nonterminal[type_id] = nonterminal + type_id_to_nonterminal[type_id] = nonterminal for i in _type_to_nonterminals: value = i(current, nonterminal) if value is not None: @@ -94,6 +97,7 @@ def _generate_kbnf_grammar(schema: schemas.schema.Schema | collections.abc.Seque return "".join(result) +# Copy from formatron except it uses `_generate_kbnf_grammar` from this file to construct self._rule_str class JsonExtractor(extractor.NonterminalExtractor): """ An extractor that loads json data to an object from a string. @@ -158,7 +162,7 @@ def __init__( self._to_object = to_object self._rule_str = _generate_kbnf_grammar( schema, self.nonterminal - ) # FIXME, probably just monkey patch this instead + ) # altered FIXME can we monkey patch this instead? def extract(self, input_str: str) -> tuple[str, schemas.schema.Schema] | None: """ diff --git a/mostlyai/engine/_language/training.py b/mostlyai/engine/_language/training.py index 4a21da8..a09f0d2 100644 --- a/mostlyai/engine/_language/training.py +++ b/mostlyai/engine/_language/training.py @@ -352,6 +352,7 @@ def concat_prompt_and_response(x): for i in range(0, len(content_dataset["train"]), 1_000) ) # train a custom tokenizer and convert it to a LlamaTokenizerFast object + # FIXME add stats arg, use modelencodingtype to set initial vocab (e.g. numeric --> add "-+[0-9]" and if max_scale > 0 also add "".Ee"") see `temp_formatron.py` grammar tokenizer = train_tokenizer(tokenizer_train_iter, tokenizer_kwargs=tokenizer_args) model_config = LSTMFromScratchConfig(vocab_size=len(tokenizer), with_dp=with_dp) model = LSTMFromScratchLMHeadModel(model_config).to(device) From 31fea138088a3d9caece6d688895db17670bc546 Mon Sep 17 00:00:00 2001 From: michdr Date: Fri, 7 Feb 2025 15:46:52 +0100 Subject: [PATCH 22/58] tiny refactor --- mostlyai/engine/_language/formatron_utils.py | 22 +++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index d56cef9..b7e3d22 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -92,21 +92,13 @@ def get_formatter_builders( if seed_df is None: seed_df = pd.DataFrame(index=range(size)) unseeded_fields = [c for c in list(stats["columns"].keys()) if c not in seed_df.columns.to_list()] - categorical_fields = [ - column - for column, column_stats in stats["columns"].items() - if column_stats["encoding_type"] == ModelEncodingType.language_categorical - ] - numeric_fields = [ - column - for column, column_stats in stats["columns"].items() - if column_stats["encoding_type"] == ModelEncodingType.language_numeric - ] - datetime_fields = [ - column - for column, column_stats in stats["columns"].items() - if column_stats["encoding_type"] == ModelEncodingType.language_datetime - ] + field_types = { + t: [col for col, col_stats in stats["columns"].items() if col_stats["encoding_type"] == t] + for t in ModelEncodingType + } + categorical_fields = field_types.get(ModelEncodingType.language_categorical, []) + numeric_fields = field_types.get(ModelEncodingType.language_numeric, []) + datetime_fields = field_types.get(ModelEncodingType.language_datetime, []) for _, seed_row in seed_df.iterrows(): formatter_builder = MostlyFormatterBuilder() model_dict = {} From 62287826db3d9c92cd3e013c92986cbdb76e8802 Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Fri, 7 Feb 2025 16:20:48 +0100 Subject: [PATCH 23/58] add back categorical test --- mostlyai/engine/_language/generation.py | 2 ++ tests/end_to_end/test_language.py | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 26a63f8..dcba0c0 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -132,6 +132,8 @@ def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: # FIXME revisit for invalid values -- sample from values / nan / or other # FIXME add programmatic constraint x[(x == "") | (x == "_INVALID_")] = np.nan + # FIXME consider if this try/catch is correct approach + # FIXME can result in OverFlowError when turning string into int in _decode_numeric in generation.py, from age '-5555555555555555555555555' -> OverflowError: Python int too large to convert to C long if col_stats["max_scale"] == 0: return x.astype("Int64") return x.astype(float) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index e8df75d..c70c63f 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -415,10 +415,10 @@ def test_special_character_column_name(tmp_path_factory): @pytest.fixture(scope="session") def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): workspace_dir = tmp_path_factory.mktemp("ws") - no_of_records = 20 # 20 + no_of_records = 20 data = pd.DataFrame( { - # "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4), + "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4), "age": [20, 30, 40, 50] * int(no_of_records / 4), "date": [ pd.Timestamp("2020-01-01"), @@ -431,7 +431,7 @@ def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): ) tgt_encoding_types = { "age": ModelEncodingType.language_numeric.value, - # "gender": ModelEncodingType.language_categorical.value, # FIXME had to comment out due to some issue with formatron + "gender": ModelEncodingType.language_categorical.value, "date": ModelEncodingType.language_datetime.value, } split( @@ -461,7 +461,7 @@ def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_datas syn_data_path = workspace_dir / "SyntheticData" syn = pd.read_parquet(syn_data_path) assert len(syn) == 10 - assert set(syn.columns) == {"age", "date"} # "gender", + assert set(syn.columns) == {"age", "gender", "date"} assert syn["age"].dtype == "Int64" - # assert syn["gender"].dtype == "string" # FIXME had to comment out due to some issue with formatron + assert syn["gender"].dtype == "string" assert syn["date"].dtype == "datetime64[ns]" From 386d9766fe119548291f7caf16e33f851b83bc88 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 10 Feb 2025 14:03:45 +0100 Subject: [PATCH 24/58] Simpler categorical analyze / analyze reduce; rare category protection (#32) --- .../_encoding_types/language/categorical.py | 72 ++++--------------- mostlyai/engine/_language/encoding.py | 44 ++++++++---- mostlyai/engine/_language/formatron_utils.py | 2 +- mostlyai/engine/_language/generation.py | 5 +- tests/end_to_end/test_language.py | 8 ++- 5 files changed, 51 insertions(+), 80 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/categorical.py b/mostlyai/engine/_encoding_types/language/categorical.py index 7796516..72e018e 100644 --- a/mostlyai/engine/_encoding_types/language/categorical.py +++ b/mostlyai/engine/_encoding_types/language/categorical.py @@ -22,34 +22,10 @@ from mostlyai.engine._common import safe_convert_string CATEGORICAL_UNKNOWN_TOKEN = "_RARE_" -CATEGORICAL_NULL_TOKEN = "<>" -CATEGORICAL_SUB_COL_SUFFIX = "cat" -CATEGORICAL_ESCAPE_CHAR = "\x01" - - -def safe_language_categorical_escape(values: pd.Series) -> pd.Series: - """Inplace escaping of categorical values""" - reserved_tokens = (CATEGORICAL_UNKNOWN_TOKEN, CATEGORICAL_NULL_TOKEN) - reserved_tokens_replacement_map = {t: CATEGORICAL_ESCAPE_CHAR + t for t in reserved_tokens} - # first, prefix values starting with escape char with another escape char - mask = values.str.startswith(CATEGORICAL_ESCAPE_CHAR, na=False) - values.loc[mask] = values.loc[mask].str.slice_replace(stop=1, repl=CATEGORICAL_ESCAPE_CHAR * 2) - # second, add escape char to all reserved tokens - values = values.replace(reserved_tokens_replacement_map) - return values - - -def safe_language_categorical_unescape(values: pd.Series) -> pd.Series: - """Inplace un-escaping of categorical values""" - # de-prefix all values starting with escape char by removing just the first one - mask = values.str.startswith(CATEGORICAL_ESCAPE_CHAR, na=False) - values.loc[mask] = values.loc[mask].str[1:] - return values def analyze_language_categorical(values: pd.Series, root_keys: pd.Series, _: pd.Series | None = None) -> dict: - # ensure a safe representation of values: 1. string dtype; 2. escape reserved tokens - values = safe_language_categorical_escape(safe_convert_string(values)) + values = safe_convert_string(values) # count distinct root_keys per categorical value for rare-category protection df = pd.concat([root_keys, values], axis=1) cnt_values = df.groupby(values.name)[root_keys.name].nunique().to_dict() @@ -72,44 +48,22 @@ def analyze_reduce_language_categorical(stats_list: list[dict], value_protection rare_min = 0 categories = [k for k in known_categories if cnt_values[k] >= rare_min] no_of_rare_categories = len(known_categories) - len(categories) - # add special token for MISSING categories, if any are present + # add None to categories, if any are present if any([j["has_nan"] for j in stats_list]): - categories = [CATEGORICAL_NULL_TOKEN] + categories + categories = [None] + categories # add special token for UNKNOWN categories at first position - categories = [CATEGORICAL_UNKNOWN_TOKEN] + categories - stats = { - "no_of_rare_categories": no_of_rare_categories, - "codes": {categories[i]: i for i in range(len(categories))}, - "cardinalities": {CATEGORICAL_SUB_COL_SUFFIX: len(categories)}, - } + if no_of_rare_categories > 0: + categories = [CATEGORICAL_UNKNOWN_TOKEN] + categories + stats = {"no_of_rare_categories": no_of_rare_categories, "categories": categories} return stats -def encode_language_categorical(values: pd.Series, stats: dict, _: pd.Series | None = None) -> pd.DataFrame: - # ensure a safe representation of values: 1. string dtype; 2. escape reserved tokens - values = safe_language_categorical_escape(safe_convert_string(values)) - known_categories = [str(k) for k in stats["codes"].keys()] +def encode_categorical(values: pd.Series, stats: dict) -> pd.DataFrame: + values = safe_convert_string(values) values = values.copy() - if CATEGORICAL_NULL_TOKEN in known_categories: - values[values.isna()] = CATEGORICAL_NULL_TOKEN - values[~values.isin(known_categories)] = CATEGORICAL_UNKNOWN_TOKEN - - # map categories to their corresponding codes - codes = pd.Series( - pd.Categorical(values, categories=known_categories).codes, - name=CATEGORICAL_SUB_COL_SUFFIX, - index=values.index, - ) - return codes.to_frame() - - -def decode_language_categorical(df_encoded: pd.DataFrame, stats: dict) -> pd.Series: - categories = stats["codes"].keys() - values = pd.Series( - pd.Categorical.from_codes(df_encoded[CATEGORICAL_SUB_COL_SUFFIX], categories=categories), - dtype="string", - ) - values[values == CATEGORICAL_NULL_TOKEN] = pd.NA - # convert escaped values to their original representation - values = safe_language_categorical_unescape(values) + known_categories = stats["categories"] + mask = ~values.isin(known_categories) + if None in known_categories: + mask &= ~pd.isna(values) + values[mask] = CATEGORICAL_UNKNOWN_TOKEN return values diff --git a/mostlyai/engine/_language/encoding.py b/mostlyai/engine/_language/encoding.py index cb74ada..6562e64 100644 --- a/mostlyai/engine/_language/encoding.py +++ b/mostlyai/engine/_language/encoding.py @@ -24,17 +24,30 @@ from mostlyai.engine._common import is_sequential, ProgressCallback, ProgressCallbackWrapper, TABLE_COLUMN_INFIX from mostlyai.engine._workspace import ensure_workspace_dir, Workspace, reset_dir +from mostlyai.engine._encoding_types.language.categorical import encode_categorical _LOG = logging.getLogger(__name__) -def format_df(df: pd.DataFrame, columns: list[str], is_target: bool = False) -> pd.DataFrame: - df = df[columns].copy() +def apply_encoding_types(df: pd.DataFrame, stats: dict) -> pd.DataFrame: + for col, col_stats in stats["columns"].items(): + if col_stats["encoding_type"] == "LANGUAGE_CATEGORICAL": + df[col] = encode_categorical(df[col], col_stats) + return df + + +def drop_sequential_columns(df: pd.DataFrame) -> pd.DataFrame: # Some columns (e.g., SCP columns) may contain np.ndarray, which are not JSON serializable # We need to drop them before converting the DataFrame to JSON sequential_columns = [col for col in df.columns if is_sequential(df[col])] df = df.drop(columns=sequential_columns) - _LOG.info(f"Formatting {'target' if is_target else 'context'} columns {df.columns.tolist()} to JSON") + return df + + +def format_df(df: pd.DataFrame, stats: dict, is_target: bool = False) -> pd.DataFrame: + columns = list(stats["columns"].keys()) + df = df[columns].copy() + _LOG.info(f"Formatting {'target' if is_target else 'context'} columns {columns} to JSON") # convert date format to ISO so that it's JSON serializable for col in df.columns: if is_datetime64_any_dtype(df[col]): @@ -76,15 +89,21 @@ def row_to_json(row: pd.Series, is_target: bool = False) -> str: def encode_df( ctx_df: pd.DataFrame, - ctx_columns: list[str], + ctx_stats: dict | None = None, tgt_df: pd.DataFrame | None = None, - tgt_columns: list[str] | None = None, + tgt_stats: dict | None = None, ) -> pd.DataFrame: - assert (tgt_df is None) == (tgt_columns is None), "tgt_df and tgt_columns must be both None or both not None" + assert (tgt_df is None) == (tgt_stats is None), "tgt_df and tgt_stats must be both None or both not None" + if ctx_stats is None: + ctx_stats = {"columns": {}} df = pd.DataFrame() - df["ctx"] = format_df(ctx_df, columns=ctx_columns, is_target=False) - if tgt_df is not None and tgt_columns is not None: - df["tgt"] = format_df(tgt_df, columns=tgt_columns, is_target=True) + ctx_df = drop_sequential_columns(ctx_df) + ctx_df = apply_encoding_types(ctx_df, stats=ctx_stats) + df["ctx"] = format_df(ctx_df, stats=ctx_stats, is_target=False) + if tgt_df is not None and tgt_stats is not None: + tgt_df = drop_sequential_columns(tgt_df) + tgt_df = apply_encoding_types(tgt_df, stats=tgt_stats) + df["tgt"] = format_df(tgt_df, stats=tgt_stats, is_target=True) # log the bounds of n_tokens in this partition content = df["ctx"] + df["tgt"] if "tgt" in df.columns else df["ctx"] @@ -107,19 +126,16 @@ def _encode_partition( ctx_stats: dict | None = None, ) -> None: tgt_df = pd.read_parquet(tgt_partition_file) - tgt_columns = list(tgt_stats.get("columns", {}).keys()) if ctx_partition_file: ctx_df = pd.read_parquet(ctx_partition_file) - ctx_columns = list(ctx_stats.get("columns", {}).keys()) else: # create on-the-fly context ctx_df = pd.DataFrame(index=range(len(tgt_df))) - ctx_columns = [] df = encode_df( ctx_df=ctx_df, - ctx_columns=ctx_columns, + ctx_stats=ctx_stats, tgt_df=tgt_df, - tgt_columns=tgt_columns, + tgt_stats=tgt_stats, ) # shuffle and persist to disk as parquet files df = df.sample(frac=1) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index b7e3d22..9f94cd7 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -107,7 +107,7 @@ def get_formatter_builders( for field_name in unseeded_fields: if field_name in categorical_fields: model_dict[field_name] = ( - Literal[tuple(cat for cat in stats["columns"][field_name]["codes"].keys())], # type: ignore[valid-type] + Literal[tuple(stats["columns"][field_name]["categories"])], # type: ignore[valid-type] ..., ) elif field_name in numeric_fields: diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index dcba0c0..f4b8870 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -221,7 +221,6 @@ def tqdm_disabled(): if has_context: ctx_stats = workspace.ctx_stats.read() - ctx_columns = list(ctx_stats["columns"].keys()) ctx_primary_key = ctx_stats["keys"].get("primary_key") # ensure ctx_data exists @@ -246,11 +245,11 @@ def tqdm_disabled(): sample_size = len(ctx_data) _LOG.info(f"{sample_size=}") else: + ctx_stats = None # create on-the-fly context if sample_size is None: trn_sample_size = tgt_stats["no_of_training_records"] + tgt_stats["no_of_validation_records"] sample_size = trn_sample_size if sample_size is None else sample_size - ctx_columns = [] ctx_primary_key = tgt_context_key = DUMMY_CONTEXT_KEY ctx_data = pd.DataFrame({ctx_primary_key: range(sample_size)}) @@ -272,7 +271,7 @@ def tqdm_disabled(): return # encode context data - encoded_ctx_data = encode_df(ctx_df=ctx_data, ctx_columns=ctx_columns) + encoded_ctx_data = encode_df(ctx_df=ctx_data, ctx_stats=ctx_stats) # estimate max new tokens based on char length of original data; consider JSON overhead max_new_tokens = estimate_max_tokens(tgt_stats) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index c70c63f..812178e 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -418,15 +418,16 @@ def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): no_of_records = 20 data = pd.DataFrame( { - "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4), - "age": [20, 30, 40, 50] * int(no_of_records / 4), + "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4) + ["rare"], + "age": [20, 30, 40, 50] * int(no_of_records / 4) + [50], "date": [ pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02"), pd.Timestamp("2023-01-03"), pd.Timestamp("2025-01-04"), ] - * int(no_of_records / 4), + * int(no_of_records / 4) + + [pd.Timestamp("2025-01-04")], } ) tgt_encoding_types = { @@ -464,4 +465,5 @@ def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_datas assert set(syn.columns) == {"age", "gender", "date"} assert syn["age"].dtype == "Int64" assert syn["gender"].dtype == "string" + assert "rare" not in syn["gender"].values assert syn["date"].dtype == "datetime64[ns]" From 6afe58f895677b3152ef3aa5a71e764133cb5dd7 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 10 Feb 2025 15:49:03 +0100 Subject: [PATCH 25/58] expose rare_category_replacement_method for LANGUAGE --- mostlyai/engine/_language/formatron_utils.py | 17 +++++++++++------ mostlyai/engine/_language/generation.py | 13 ++++++++++--- mostlyai/engine/generation.py | 10 +++------- tests/end_to_end/test_language.py | 15 +++++++++++++-- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 9f94cd7..7c43ab7 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -25,11 +25,12 @@ from formatron.formats import json from pydantic import create_model from transformers import PreTrainedTokenizerBase +from mostlyai.engine._encoding_types.language.categorical import CATEGORICAL_UNKNOWN_TOKEN from mostlyai.engine._language.temp_formatron import JsonExtractor import collections from formatron.schemas.schema import Schema -from mostlyai.engine.domain import ModelEncodingType +from mostlyai.engine.domain import ModelEncodingType, RareCategoryReplacementMethod JSON_NULL = "null" @@ -85,7 +86,11 @@ def to_json(_json: str): def get_formatter_builders( - *, seed_df: pd.DataFrame | None = None, size: int | None = None, stats: dict + *, + seed_df: pd.DataFrame | None = None, + size: int | None = None, + stats: dict, + rare_category_replacement_method: RareCategoryReplacementMethod, ) -> list[FormatterBuilder]: assert (seed_df is not None) ^ (size is not None), "exactly one of seed_df or size must be provided" formatter_builders = [] @@ -106,10 +111,10 @@ def get_formatter_builders( model_dict |= {field_name: (Literal[seed_value], ...) for field_name, seed_value in seed_row.items()} # type: ignore[valid-type] for field_name in unseeded_fields: if field_name in categorical_fields: - model_dict[field_name] = ( - Literal[tuple(stats["columns"][field_name]["categories"])], # type: ignore[valid-type] - ..., - ) + categories = stats["columns"][field_name]["categories"] + if rare_category_replacement_method == RareCategoryReplacementMethod.sample and len(categories) > 1: + categories = [c for c in categories if c != CATEGORICAL_UNKNOWN_TOKEN] + model_dict[field_name] = (Literal[tuple(categories)], ...) # type: ignore[valid-type] elif field_name in numeric_fields: max_scale = stats["columns"][field_name]["max_scale"] if max_scale == 0: diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index f4b8870..01ace69 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -47,7 +47,7 @@ prepare_seed_for_formatron, get_vocab_processors, ) -from mostlyai.engine.domain import ModelEncodingType +from mostlyai.engine.domain import ModelEncodingType, RareCategoryReplacementMethod INVALID_VALUE = "_INVALID_" # when JSON parsing fails, the values of target columns will be set to this DUMMY_CONTEXT_KEY = "__dummy_context_key" @@ -178,6 +178,7 @@ def generate( batch_size: int | None = None, sampling_temperature: float = 1.0, sampling_top_p: float = 1.0, + rare_category_replacement_method: RareCategoryReplacementMethod | str = RareCategoryReplacementMethod.constant, device: torch.device | str | None = None, workspace_dir: str | Path = "engine-ws", update_progress: ProgressCallback | None = None, @@ -316,7 +317,9 @@ def tqdm_disabled(): if enforce_json_output and len(seeded_tgt_columns) == 0: t0 = time.time() - formatter_builders = get_formatter_builders(size=batch_size, stats=tgt_stats) + formatter_builders = get_formatter_builders( + size=batch_size, stats=tgt_stats, rare_category_replacement_method=rare_category_replacement_method + ) engine.initialize_logits_processors(formatter_builders, formatron_vocab_processors) total_logits_processor_build_time += time.time() - t0 @@ -334,7 +337,11 @@ def tqdm_disabled(): if enforce_json_output and len(seeded_tgt_columns) > 0: t0 = time.time() # some columns are seeded, so we need to create a new logits processor for each batch - formatter_builders = get_formatter_builders(seed_df=sample_seed_batch, stats=tgt_stats) + formatter_builders = get_formatter_builders( + seed_df=sample_seed_batch, + stats=tgt_stats, + rare_category_replacement_method=rare_category_replacement_method, + ) engine.initialize_logits_processors(formatter_builders, formatron_vocab_processors) total_logits_processor_build_time += time.time() - t0 diff --git a/mostlyai/engine/generation.py b/mostlyai/engine/generation.py index efc8639..e2a1b92 100644 --- a/mostlyai/engine/generation.py +++ b/mostlyai/engine/generation.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import inspect from pathlib import Path import pandas as pd @@ -36,7 +35,7 @@ def generate( sampling_temperature: float = 1.0, sampling_top_p: float = 1.0, device: str | None = None, - rare_category_replacement_method: RareCategoryReplacementMethod | str | None = None, + rare_category_replacement_method: RareCategoryReplacementMethod | str = RareCategoryReplacementMethod.constant, rebalancing: RebalancingConfig | dict | None = None, imputation: ImputationConfig | dict | None = None, fairness: FairnessConfig | dict | None = None, @@ -76,9 +75,7 @@ def generate( batch_size=batch_size, sampling_temperature=sampling_temperature, sampling_top_p=sampling_top_p, - rare_category_replacement_method=inspect.signature(generate_tabular) - .parameters["rare_category_replacement_method"] - .default, + rare_category_replacement_method=rare_category_replacement_method, rebalancing=rebalancing, imputation=imputation, fairness=fairness, @@ -95,8 +92,6 @@ def generate( raise ValueError("fairness is not supported for language models") if rebalancing is not None: raise ValueError("rebalancing is not supported for language models") - if rare_category_replacement_method is not None: - raise ValueError("rare_category_replacement_method is not supported for language models") return generate_language( ctx_data=ctx_data, seed_data=seed_data, @@ -104,6 +99,7 @@ def generate( batch_size=batch_size, sampling_temperature=sampling_temperature, sampling_top_p=sampling_top_p, + rare_category_replacement_method=rare_category_replacement_method, device=device, workspace_dir=workspace_dir, update_progress=update_progress, diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 812178e..777b2de 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -27,10 +27,16 @@ from mostlyai.engine._language.encoding import encode from mostlyai.engine.analysis import analyze from mostlyai.engine._common import TEMPORARY_PRIMARY_KEY +from mostlyai.engine._encoding_types.language.categorical import CATEGORICAL_UNKNOWN_TOKEN from mostlyai.engine._language.lstm import LSTMFromScratchConfig from mostlyai.engine._language.tokenizer_utils import MostlyDataCollatorForLanguageModeling from mostlyai.engine._language.training import train -from mostlyai.engine.domain import ModelEncodingType, ModelStateStrategy, DifferentialPrivacyConfig +from mostlyai.engine.domain import ( + ModelEncodingType, + ModelStateStrategy, + DifferentialPrivacyConfig, + RareCategoryReplacementMethod, +) from mostlyai.engine._language.formatron_utils import get_formatter_builders from formatron.integrations.transformers import create_formatter_logits_processor_list @@ -457,7 +463,11 @@ def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_dataset, model_name): workspace_dir = encoded_numeric_categorical_datetime_dataset train(workspace_dir=workspace_dir, model=model_name) - generate(workspace_dir=workspace_dir, sample_size=10) + generate( + workspace_dir=workspace_dir, + sample_size=10, + rare_category_replacement_method=RareCategoryReplacementMethod.sample, + ) syn_data_path = workspace_dir / "SyntheticData" syn = pd.read_parquet(syn_data_path) @@ -466,4 +476,5 @@ def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_datas assert syn["age"].dtype == "Int64" assert syn["gender"].dtype == "string" assert "rare" not in syn["gender"].values + assert CATEGORICAL_UNKNOWN_TOKEN not in syn["gender"].values assert syn["date"].dtype == "datetime64[ns]" From f7948bcd4763e425f51400d750d1a143ca760e39 Mon Sep 17 00:00:00 2001 From: Shuang Wu <149689370+shuangwu5@users.noreply.github.com> Date: Mon, 10 Feb 2025 18:29:58 +0100 Subject: [PATCH 26/58] MSD-XXX: add initial alphabets to untrained tokenizer if needed (#33) --- mostlyai/engine/_language/tokenizer_utils.py | 24 +++++++++++++++++++- mostlyai/engine/_language/training.py | 3 +-- tests/end_to_end/test_language.py | 4 ++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/mostlyai/engine/_language/tokenizer_utils.py b/mostlyai/engine/_language/tokenizer_utils.py index ff2cba0..64c13e5 100644 --- a/mostlyai/engine/_language/tokenizer_utils.py +++ b/mostlyai/engine/_language/tokenizer_utils.py @@ -19,13 +19,19 @@ from transformers import DataCollatorForLanguageModeling, BatchEncoding, PreTrainedTokenizerFast, LlamaTokenizerFast from transformers.data.data_collator import pad_without_fast_tokenizer_warning, _torch_collate_batch +from mostlyai.engine.domain import ModelEncodingType + ################# ### TOKENIZER ### ################# -def train_tokenizer(training_iterator: Iterator | list | None = None, tokenizer_kwargs=None): +def train_tokenizer( + training_iterator: Iterator | list | None = None, + tokenizer_kwargs: dict[str, Any] | None = None, + tgt_stats: dict[str, Any] | None = None, +): if tokenizer_kwargs is None: tokenizer_kwargs = {} from tokenizers import Tokenizer, decoders @@ -46,10 +52,26 @@ def train_tokenizer(training_iterator: Iterator | list | None = None, tokenizer_ MIN_FREQ_MERGE = 20 VOCAB_SIZE = 5000 + # add initial alphabet for numeric and datetime columns if needed + has_numeric_columns = any( + col_stats["encoding_type"] == ModelEncodingType.language_numeric for col_stats in tgt_stats["columns"].values() + ) + has_datetime_columns = any( + col_stats["encoding_type"] == ModelEncodingType.language_datetime for col_stats in tgt_stats["columns"].values() + ) + initial_alphabet = set() + if has_numeric_columns: + # FIXME: maybe the set can be more fine-grained based on max_scale in stats + initial_alphabet |= {str(i) for i in range(10)} | {".", "-", "+", "e", "E"} + if has_datetime_columns: + initial_alphabet |= {str(i) for i in range(10)} | {".", "-", ":", "T", "Z"} + initial_alphabet = list(initial_alphabet) + # Builds a BPE raw_tokenizer, and optionally trains it based on provided text training_iterator = training_iterator or [] # allow easy training skip raw_tokenizer = Tokenizer(BPE(unk_token=special_tokens["unk_token"])) trainer = BpeTrainer( + initial_alphabet=initial_alphabet, special_tokens=SPECIAL_TOKENS, min_frequency=MIN_FREQ_MERGE, vocab_size=VOCAB_SIZE, diff --git a/mostlyai/engine/_language/training.py b/mostlyai/engine/_language/training.py index a09f0d2..95edc70 100644 --- a/mostlyai/engine/_language/training.py +++ b/mostlyai/engine/_language/training.py @@ -352,8 +352,7 @@ def concat_prompt_and_response(x): for i in range(0, len(content_dataset["train"]), 1_000) ) # train a custom tokenizer and convert it to a LlamaTokenizerFast object - # FIXME add stats arg, use modelencodingtype to set initial vocab (e.g. numeric --> add "-+[0-9]" and if max_scale > 0 also add "".Ee"") see `temp_formatron.py` grammar - tokenizer = train_tokenizer(tokenizer_train_iter, tokenizer_kwargs=tokenizer_args) + tokenizer = train_tokenizer(tokenizer_train_iter, tokenizer_kwargs=tokenizer_args, tgt_stats=tgt_stats) model_config = LSTMFromScratchConfig(vocab_size=len(tokenizer), with_dp=with_dp) model = LSTMFromScratchLMHeadModel(model_config).to(device) else: diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 777b2de..38c3d85 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -455,8 +455,8 @@ def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): @pytest.mark.parametrize( ("model_name"), [ - # LSTMFromScratchConfig.model_id, # FIXME: this fails due to `RuntimeError: probability tensor contains either `inf`, `nan` or element < 0`, potentially due to missing numeric unicode tokens (missing ASCII) - # "amd/AMD-Llama-135m", # FIXME this model is horrible so we're skipping it + # LSTMFromScratchConfig.model_id, # FIXME: failed when generating incomplete datetime or overflow numbers atm + # "amd/AMD-Llama-135m", # FIXME failed when generating incomplete datetime or overflow numbers atm "openai-community/gpt2" # TEMP, better model than AMD ], ) From ac2c1cc04db54e8821df5d88a242da8039ad9551 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Tue, 11 Feb 2025 14:15:41 +0100 Subject: [PATCH 27/58] simplify numeric --- .../_encoding_types/language/numeric.py | 40 ++++++++----------- .../engine/_encoding_types/tabular/numeric.py | 2 +- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index d5edd34..065b6e1 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -15,37 +15,37 @@ import pandas as pd from mostlyai.engine._common import safe_convert_numeric -from mostlyai.engine._encoding_types.tabular.numeric import split_sub_columns_digit from mostlyai.engine.domain import ModelEncodingType def analyze_language_numeric(values: pd.Series, root_keys: pd.Series, _: pd.Series | None = None) -> dict: values = safe_convert_numeric(values) - # determine lowest/highest values by root ID, and return Top 10 + # determine lowest/highest values by root ID, and return top 11 df = pd.concat([root_keys, values], axis=1) min_values = df.groupby(root_keys.name)[values.name].min().dropna() min11 = min_values.sort_values(ascending=True).head(11).astype("float").tolist() max_values = df.groupby(root_keys.name)[values.name].max().dropna() max11 = max_values.sort_values(ascending=False).head(11).astype("float").tolist() - # split values into digits; used for digit numeric encoding, plus to determine precision - df_split = split_sub_columns_digit(values) - is_not_nan = df_split["nan"] == 0 - has_nan = sum(df_split["nan"]) > 0 - has_neg = sum(df_split["neg"]) > 0 + # determine if there are any NaN values + has_nan = bool(values.isna().any()) - # extract min/max digit for each position to determine valid value range for digit encoding - if any(is_not_nan): - max_digits = {k: int(df_split[k][is_not_nan].max()) for k in df_split if k.startswith("E")} - else: - max_digits = {k: 0 for k in df_split if k.startswith("E")} + # determine max scale + def count_scale(num: float) -> int: + # represent number as fixed point string, remove trailing zeros and decimal point + num = format(num, "f").rstrip("0").rstrip(".") + if "." in num: + # in case of decimal, return number of digits after decimal point + return len(num.split(".")[1]) + # in case of integer, return 0 + return 0 + + max_scale = int(values.apply(count_scale).max()) - # return stats stats = { "has_nan": has_nan, - "has_neg": has_neg, - "max_digits": max_digits, + "max_scale": max_scale, "min11": min11, "max11": max11, } @@ -55,13 +55,9 @@ def analyze_language_numeric(values: pd.Series, root_keys: pd.Series, _: pd.Seri def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bool = True) -> dict: # check for occurrence of NaN values has_nan = any([j["has_nan"] for j in stats_list]) - # check if there are negative values - # determine precision to apply rounding of sampled values during generation - keys = stats_list[0]["max_digits"].keys() - max_digits = {k: max([j["max_digits"][k] for j in stats_list]) for k in keys} - non_zero_prec = [k for k in keys if max_digits[k] > 0 and k.startswith("E")] - min_decimal = min([int(k[1:]) for k in non_zero_prec]) if len(non_zero_prec) > 0 else 0 + # determine max scale + max_scale = max([j["max_scale"] for j in stats_list]) # determine min / max 5 values to map too low / too high values to min11 = sorted([v for min11 in [j["min11"] for j in stats_list] for v in min11], reverse=False)[:11] @@ -79,8 +75,6 @@ def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bo min5 = min11[0:5] max5 = max11[0:5] - max_scale = abs(min(min_decimal, 0)) - stats = { "encoding_type": ModelEncodingType.language_numeric.value, "has_nan": has_nan, diff --git a/mostlyai/engine/_encoding_types/tabular/numeric.py b/mostlyai/engine/_encoding_types/tabular/numeric.py index 44cf58c..8edff93 100644 --- a/mostlyai/engine/_encoding_types/tabular/numeric.py +++ b/mostlyai/engine/_encoding_types/tabular/numeric.py @@ -165,7 +165,7 @@ def analyze_numeric( # do not count values, if there are too many cnt_values = None - # determine lowest/highest values by root ID, and return Top 10 + # determine lowest/highest values by root ID, and return top 11 df = pd.concat([root_keys, values], axis=1) min_values = df.groupby(root_keys.name)[values.name].min().dropna() min11 = min_values.sort_values(ascending=True).head(11).astype("float").tolist() From 65b9dd634f23342a453e965681dbd674b4e11f45 Mon Sep 17 00:00:00 2001 From: Shuang Wu <149689370+shuangwu5@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:18:07 +0100 Subject: [PATCH 28/58] fix tests (#34) --- tests/end_to_end/test_language.py | 4 +++- tests/unit/test_encoding.py | 28 ++++++++++++++-------------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 38c3d85..124e4c4 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -262,7 +262,9 @@ def test_conditional_generation(tmp_path_factory): def test_formatter(): lone_leading_surrogate_issue = '{"E0": "[b]\\ud83c\\udc00\\ud83d\\ud8bc}{"}' unexpected_end_of_hex_escape_issue = '{"E0": "』』』\u200f』 avex\\ud8dd"}' - formatter_builders = get_formatter_builders(size=1, unseeded_fields=["some_field"]) + formatter_builders = get_formatter_builders( + size=1, stats={"columns": {}}, rare_category_replacement_method=RareCategoryReplacementMethod.constant + ) tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M", legacy=True) logits_processor = create_formatter_logits_processor_list(tokenizer, formatter_builders) formatter = logits_processor[0]._formatters[0] diff --git a/tests/unit/test_encoding.py b/tests/unit/test_encoding.py index 62356af..0d7db9b 100644 --- a/tests/unit/test_encoding.py +++ b/tests/unit/test_encoding.py @@ -162,21 +162,21 @@ def test_long_sequential_values(self): class TestLanguageEncode: @pytest.fixture(scope="class") - def ctx_encoding_types(self): + def ctx_stats(self): return { - "table0::col_obj": ModelEncodingType.tabular_categorical, - "table1::col_int": ModelEncodingType.tabular_numeric_auto, - "table1::col_float": ModelEncodingType.tabular_numeric_auto, - "table1::col_bool": ModelEncodingType.tabular_categorical, - "table2::col_date": ModelEncodingType.tabular_datetime, - "table3::col_datetime": ModelEncodingType.tabular_datetime, + "columns": { + "table0::col_obj": {}, + "table1::col_int": {}, + "table1::col_float": {}, + "table1::col_bool": {}, + "table2::col_date": {}, + "table3::col_datetime": {}, + } } @pytest.fixture(scope="class") - def tgt_encoding_types(self): - return { - "table3::col_str": ModelEncodingType.language_text, - } + def tgt_stats(self): + return {"columns": {"table3::col_str": {}}} @pytest.fixture(scope="class") def ctx_df(self): @@ -208,9 +208,9 @@ def tgt_df(self): ) return df - def test_format_df(self, ctx_df, tgt_df, ctx_encoding_types, tgt_encoding_types): - formatted_ctx_df = format_df(ctx_df, is_target=False, columns=list(ctx_encoding_types.keys())) - formatted_tgt_df = format_df(tgt_df, is_target=True, columns=list(tgt_encoding_types.keys())) + def test_format_df(self, ctx_df, tgt_df, ctx_stats, tgt_stats): + formatted_ctx_df = format_df(ctx_df, is_target=False, stats=ctx_stats) + formatted_tgt_df = format_df(tgt_df, is_target=True, stats=tgt_stats) ctx = formatted_ctx_df.iloc[0] tgt = formatted_tgt_df.iloc[0] From a1bdade90ab2fac9f55834f3c7cea3de35dc22fb Mon Sep 17 00:00:00 2001 From: Shuang Wu <149689370+shuangwu5@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:45:38 +0100 Subject: [PATCH 29/58] refactor temp_formatron.py (#35) --- mostlyai/engine/_language/formatron_utils.py | 8 +- mostlyai/engine/_language/temp_formatron.py | 127 ++----------------- 2 files changed, 13 insertions(+), 122 deletions(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 7c43ab7..908cc39 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -26,7 +26,7 @@ from pydantic import create_model from transformers import PreTrainedTokenizerBase from mostlyai.engine._encoding_types.language.categorical import CATEGORICAL_UNKNOWN_TOKEN -from mostlyai.engine._language.temp_formatron import JsonExtractor +from mostlyai.engine._language.temp_formatron import MostlyJsonExtractor import collections from formatron.schemas.schema import Schema @@ -53,9 +53,9 @@ class MostlyFormatterBuilder(FormatterBuilder): def __init__(self): super().__init__() - def json(self, schema: type[Schema] | collections.abc.Sequence, *, capture_name: str = None) -> JsonExtractor: + def json(self, schema: type[Schema] | collections.abc.Sequence, *, capture_name: str = None) -> MostlyJsonExtractor: """ - Create a JSON extractor. Check out the JsonExtractor docs for more details. + Create a JSON extractor. Check out the MostlyJsonExtractor docs for more details. Args: schema: The schema for extraction. @@ -81,7 +81,7 @@ def to_json(_json: str): return None return self._add_extractor( - "json", lambda nonterminal: JsonExtractor(nonterminal, capture_name, schema, to_json) + "json", lambda nonterminal: MostlyJsonExtractor(nonterminal, capture_name, schema, to_json) ) diff --git a/mostlyai/engine/_language/temp_formatron.py b/mostlyai/engine/_language/temp_formatron.py index 9ac1c9e..c88ddc9 100644 --- a/mostlyai/engine/_language/temp_formatron.py +++ b/mostlyai/engine/_language/temp_formatron.py @@ -13,17 +13,17 @@ # limitations under the License. """ -The module defines the `JsonExtractor` class, which is used to extract data from a string in JSON format. +The module defines the `MostlyJsonExtractor` class, which is used to extract data from a string in JSON format. """ import collections import datetime import typing -from formatron import extractor, schemas -from formatron.formats.json import _type_to_nonterminals +from formatron import schemas +from formatron.formats.json import _type_to_nonterminals, JsonExtractor -__all__ = ["JsonExtractor"] +__all__ = ["MostlyJsonExtractor"] FORMATRON_WHITESPACE_MAX_REPETITIONS = 10 @@ -97,10 +97,10 @@ def _generate_kbnf_grammar(schema: schemas.schema.Schema | collections.abc.Seque return "".join(result) -# Copy from formatron except it uses `_generate_kbnf_grammar` from this file to construct self._rule_str -class JsonExtractor(extractor.NonterminalExtractor): +class MostlyJsonExtractor(JsonExtractor): """ - An extractor that loads json data to an object from a string. + Same as the parent class from formatron + except that it uses `_generate_kbnf_grammar` from this file to construct self._rule_str """ def __init__( @@ -110,115 +110,6 @@ def __init__( schema: schemas.schema.Schema | collections.abc.Sequence, to_object: typing.Callable[[str], schemas.schema.Schema], ): - """ - Create a json extractor from a given schema or a list of supported types. - - Currently, the following data types are supported: - - - bool - - int - - positive int - - negative int - - nonnegative int - - nonpositive int - - float - - positive float - - negative float - - nonnegative float - - nonpositive float - - str - - optionally with min_length, max_length and pattern constraints - - length is measured in UTF-8 character number after json parsing - - *Warning*: too large difference between min_length and max_length can lead to enormous memory consumption! - - pattern is mutually exclusive with min_length and max_length - - pattern will be compiled to a regular expression so all caveats of regular expressions apply - - pattern currently is automatically anchored at both ends - - the generated json could be invalid if the pattern allows invalid content between the json string's quotes. - - for example, `pattern=".*"` will allow '\"' to appear in the json string which is forbidden by JSON standard. - - also supports substring_of constraint which constrains the string to be a substring of a given string - - the generated json could be invalid if the given string contains invalid content when put into the json string's quotes. - - for example, `substring_of="abc\""` will allow '\"' to appear in the json string which is forbidden by JSON standard. - - NoneType - - typing.Any - - Subclasses of collections.abc.Mapping[str,T] and typing.Mapping[str,T] where T is a supported type, - - Subclasses of collections.abc.Sequence[T] and typing.Sequence[T] where T is a supported type. - - optionally with `minItems`, `maxItems`, `prefixItems` constraints - - *Warning*: too large difference between minItems and maxItems can lead to very slow performance! - - *Warning*: By json schema definition, prefixItems by default allows additional items and missing items in the prefixItems, which may not be the desired behavior and can lead to very slow performance if prefixItems is long! - - tuple[T1,T2,...] where T1,T2,... are supported types. The order, type and number of elements will be preserved. - - typing.Literal[x1,x2,...] where x1, x2, ... are instances of int, string, bool or NoneType, or another typing.Literal[y1,y2,...] - - typing.Union[T1,T2,...] where T1,T2,... are supported types. - - schemas.Schema where all its fields' data types are supported. Recursive schema definitions are supported as well. - - *Warning*: while not required field is supported, they can lead to very slow performance and/or enormous memory consumption if there are too many of them! - - Custom types registered via register_type_nonterminal() - - Args: - nonterminal: The nonterminal representing the extractor. - capture_name: The capture name of the extractor, or `None` if the extractor does not capture. - schema: The schema. - to_object: A callable to convert the extracted string to a schema instance. - """ - super().__init__(nonterminal, capture_name) + super(JsonExtractor, self).__init__(nonterminal, capture_name) self._to_object = to_object - self._rule_str = _generate_kbnf_grammar( - schema, self.nonterminal - ) # altered FIXME can we monkey patch this instead? - - def extract(self, input_str: str) -> tuple[str, schemas.schema.Schema] | None: - """ - Extract a schema instance from a string. - - Args: - input_str: The input string to extract from. - - Returns: - A tuple of the remaining string and the extracted schema instance, or `None` if extraction failed. - """ - - # Ensure the input string starts with '{' or '[' after stripping leading whitespace - input_str = input_str.lstrip() - if not input_str.startswith(("{", "[")): - return None - - # Variables to track the balance of brackets and the position in the string - bracket_count = 0 - position = 0 - in_string = False - escape_next = False - start_char = input_str[0] - end_char = "}" if start_char == "{" else "]" - - # Iterate over the string to find where the JSON object or array ends - for char in input_str: - if not in_string: - if char == start_char: - bracket_count += 1 - elif char == end_char: - bracket_count -= 1 - elif char == '"': - in_string = True - else: - if char == '"' and not escape_next: - in_string = False - elif char == "\\": - escape_next = not escape_next - else: - escape_next = False - - # Move to the next character - position += 1 - - # If brackets are balanced and we're not in a string, stop processing - if bracket_count == 0 and not in_string: - break - else: - return None - # The position now points to the character after the last '}', so we slice to position - json_str = input_str[:position] - remaining_str = input_str[position:] - # Return the unparsed remainder of the string and the decoded JSON object - return remaining_str, self._to_object(json_str) - - @property - def kbnf_definition(self): - return self._rule_str + self._rule_str = _generate_kbnf_grammar(schema, self.nonterminal) From f3ec656142245fce43bd411e8883c1d1ae4ef722 Mon Sep 17 00:00:00 2001 From: michdr Date: Tue, 11 Feb 2025 16:18:33 +0100 Subject: [PATCH 30/58] fix several _decode_numeric and _decode_datetime FIXMEs --- mostlyai/engine/_language/generation.py | 94 +++++++++++++++++-------- 1 file changed, 64 insertions(+), 30 deletions(-) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 01ace69..d1f2337 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -128,46 +128,80 @@ def _decode_string(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: return x.astype(STRING) +def _clip_numeric(x: pd.Series, min5: list, max5: list) -> pd.Series: + x_numeric = pd.to_numeric(x, errors="coerce") + min_arr = np.array(min5, dtype=x_numeric.dtype) + max_arr = np.array(max5, dtype=x_numeric.dtype) + n = len(x_numeric) + random_mins = np.random.choice(min_arr, size=n) + random_maxs = np.random.choice(max_arr, size=n) + clipped = np.minimum(np.maximum(x_numeric.to_numpy(), random_mins), random_maxs) + return pd.Series(clipped, index=x.index) + + +def _clip_datetime(x: pd.Series, min5: list, max5: list) -> pd.Series: + x_dt = pd.to_datetime(x, errors="coerce") + min_arr = pd.to_datetime(min5).to_numpy(dtype="datetime64[ns]") + max_arr = pd.to_datetime(max5).to_numpy(dtype="datetime64[ns]") + n = len(x_dt) + random_mins = np.random.choice(min_arr, size=n) + random_maxs = np.random.choice(max_arr, size=n) + clipped = np.minimum(np.maximum(x_dt.to_numpy(dtype="datetime64[ns]"), random_mins), random_maxs) + return pd.Series(clipped, index=x.index) + + def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - # FIXME revisit for invalid values -- sample from values / nan / or other # FIXME add programmatic constraint - x[(x == "") | (x == "_INVALID_")] = np.nan - # FIXME consider if this try/catch is correct approach + x = pd.to_numeric(x, errors="coerce") + x = _clip_numeric(x, col_stats["min5"], col_stats["max5"]) # FIXME can result in OverFlowError when turning string into int in _decode_numeric in generation.py, from age '-5555555555555555555555555' -> OverflowError: Python int too large to convert to C long if col_stats["max_scale"] == 0: return x.astype("Int64") return x.astype(float) -def coerce_datetime(text: str) -> str: - """ - Ensure that the text is a valid date or datetime string. - """ - if text == "" or text == "_INVALID_": - return text - # FIXME copy paste from datallm, see if should be cleaned up - - # extract year, month, and day from the ISO formatted text - y, m, d = int(text[:4]), int(text[5:7]), int(text[8:10]) - # set to last day of month, in case of too large day value - last_day = calendar.monthrange(y, m)[1] - d = min(d, last_day) - dt_str = f"{y:04d}-{m:02d}-{d:02d}" + text[10:] - # convert to date and back to check for valid date - try: - dt_str = datetime.datetime.fromisoformat(dt_str).isoformat().replace("T", " ") - except ValueError: - dt_str = text # FIXME revisit, if e.g. a cutoff date, then we just return the original text - # trim to original length - dt_str = dt_str[: len(text)] - return dt_str +def _decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: + x = x.where(~x.isin(["", "_INVALID_"]), np.nan) + valid_mask = ( + x.str.len().ge(10) + & x.str.slice(0, 4).str.isdigit() + & x.str.slice(5, 7).str.isdigit() + & x.str.slice(8, 10).str.isdigit() + ) + if valid_mask.sum() > 0: # expected "YYYY-MM-DD" prefix + # handle the date portion, ensuring validity + years = x[valid_mask].str.slice(0, 4).astype(int) + months = x[valid_mask].str.slice(5, 7).astype(int) + days = x[valid_mask].str.slice(8, 10).astype(int) + + # clamp days according to maximum possible day of the month of a given year + last_days = np.array([calendar.monthrange(y, m)[1] for y, m in zip(years, months)]) + clamped_days = np.minimum(days, last_days) + + # rebuild the date portion + new_date = ( + years.astype(str).str.zfill(4) + + "-" + + months.astype(str).str.zfill(2) + + "-" + + pd.Series(clamped_days, index=years.index).astype(str).str.zfill(2) + ) -def _decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - # FIXME revisit for invalid values -- sample from values / nan / or other - # TODO clamp datetime to valid range - x = x.map(coerce_datetime) - return pd.to_datetime(x, errors="coerce") + # handle the time portion, ensuring validity + remainder = x[valid_mask].str.slice(10) + + time_regex = r"^[ T]?(\d{2}:\d{2}:\d{2}(?:\.\d+)?)" + valid_time = remainder.str.extract(time_regex, expand=False) + valid_time = valid_time.fillna("00:00:00") + valid_time = " " + valid_time + + new_date = new_date + valid_time + x.loc[valid_mask] = new_date + + x = pd.to_datetime(x, errors="coerce") + x.loc[valid_mask] = _clip_datetime(x.loc[valid_mask], col_stats["min5"], col_stats["max5"]) + return x.astype("datetime64[ns]") def generate( From 53b5e4acff9e64525c9615681f13640912732979 Mon Sep 17 00:00:00 2001 From: michdr Date: Tue, 11 Feb 2025 16:20:45 +0100 Subject: [PATCH 31/58] ruff --- mostlyai/engine/_language/generation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index d1f2337..235ef1a 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -14,7 +14,6 @@ import calendar import contextlib -import datetime import json import os From c4fb39b3cc4aae2541db1bfcf0f4c7ab557e1feb Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Tue, 11 Feb 2025 16:42:04 +0100 Subject: [PATCH 32/58] Extreme value protection for LANGUAGE_NUMERIC (#36) --- .../_encoding_types/language/categorical.py | 2 +- .../_encoding_types/language/numeric.py | 31 +++++++++++++++++++ mostlyai/engine/_language/encoding.py | 7 +++-- tests/end_to_end/test_language.py | 3 +- 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/categorical.py b/mostlyai/engine/_encoding_types/language/categorical.py index 72e018e..4f8d8ce 100644 --- a/mostlyai/engine/_encoding_types/language/categorical.py +++ b/mostlyai/engine/_encoding_types/language/categorical.py @@ -58,7 +58,7 @@ def analyze_reduce_language_categorical(stats_list: list[dict], value_protection return stats -def encode_categorical(values: pd.Series, stats: dict) -> pd.DataFrame: +def encode_language_categorical(values: pd.Series, stats: dict) -> pd.Series: values = safe_convert_string(values) values = values.copy() known_categories = stats["categories"] diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index 065b6e1..356d03a 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -15,6 +15,7 @@ import pandas as pd from mostlyai.engine._common import safe_convert_numeric +from mostlyai.engine._encoding_types.tabular.numeric import _type_safe_numeric_series from mostlyai.engine.domain import ModelEncodingType @@ -84,3 +85,33 @@ def analyze_reduce_language_numeric(stats_list: list[dict], value_protection: bo } return stats + + +def encode_language_numeric(values: pd.Series, stats: dict, _: pd.Series | None = None) -> pd.DataFrame: + values = safe_convert_numeric(values) + # try to convert to int, if possible + dtype = "Int64" if stats["max_scale"] == 0 else "Float64" + if dtype == "Int64": + values = values.round() + try: + values = values.astype(dtype) + except TypeError: + if dtype == "Int64": # if couldn't safely convert to int, stick to float + dtype = "Float64" + values = values.astype(dtype) + # reset index, as `values.mask` can throw errors for misaligned indices + values.reset_index(drop=True, inplace=True) + # replace extreme values with randomly sampled 5-th to 10-th largest/smallest values + min5 = _type_safe_numeric_series(stats["min5"] or [0], dtype) + max5 = _type_safe_numeric_series(stats["max5"] or [0], dtype) + values.mask( + values < min5[0], + min5.sample(n=len(values), replace=True, ignore_index=True), + inplace=True, + ) + values.mask( + values > max5[0], + max5.sample(n=len(values), replace=True, ignore_index=True), + inplace=True, + ) + return values diff --git a/mostlyai/engine/_language/encoding.py b/mostlyai/engine/_language/encoding.py index 6562e64..782573e 100644 --- a/mostlyai/engine/_language/encoding.py +++ b/mostlyai/engine/_language/encoding.py @@ -24,7 +24,8 @@ from mostlyai.engine._common import is_sequential, ProgressCallback, ProgressCallbackWrapper, TABLE_COLUMN_INFIX from mostlyai.engine._workspace import ensure_workspace_dir, Workspace, reset_dir -from mostlyai.engine._encoding_types.language.categorical import encode_categorical +from mostlyai.engine._encoding_types.language.categorical import encode_language_categorical +from mostlyai.engine._encoding_types.language.numeric import encode_language_numeric _LOG = logging.getLogger(__name__) @@ -32,7 +33,9 @@ def apply_encoding_types(df: pd.DataFrame, stats: dict) -> pd.DataFrame: for col, col_stats in stats["columns"].items(): if col_stats["encoding_type"] == "LANGUAGE_CATEGORICAL": - df[col] = encode_categorical(df[col], col_stats) + df[col] = encode_language_categorical(df[col], col_stats) + elif col_stats["encoding_type"] == "LANGUAGE_NUMERIC": + df[col] = encode_language_numeric(df[col], col_stats) return df diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 124e4c4..e741c61 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -427,7 +427,7 @@ def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): data = pd.DataFrame( { "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4) + ["rare"], - "age": [20, 30, 40, 50] * int(no_of_records / 4) + [50], + "age": [20, 30, 40, 50] * int(no_of_records / 4) + [60], "date": [ pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02"), @@ -476,6 +476,7 @@ def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_datas assert len(syn) == 10 assert set(syn.columns) == {"age", "gender", "date"} assert syn["age"].dtype == "Int64" + assert all((syn["age"] >= 30) & (syn["age"] <= 50)) assert syn["gender"].dtype == "string" assert "rare" not in syn["gender"].values assert CATEGORICAL_UNKNOWN_TOKEN not in syn["gender"].values From 74567607b8487f5d129d9e758f45ea1372ac9389 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Tue, 11 Feb 2025 17:01:30 +0100 Subject: [PATCH 33/58] extreme value protection for datetimes --- .../_encoding_types/language/datetime.py | 48 ++++++++++--------- mostlyai/engine/_language/encoding.py | 3 ++ tests/end_to_end/test_language.py | 13 ++++- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/datetime.py b/mostlyai/engine/_encoding_types/language/datetime.py index f6d633d..68d5662 100644 --- a/mostlyai/engine/_encoding_types/language/datetime.py +++ b/mostlyai/engine/_encoding_types/language/datetime.py @@ -15,7 +15,6 @@ import pandas as pd from mostlyai.engine._common import safe_convert_datetime -from mostlyai.engine._encoding_types.tabular.datetime import split_sub_columns_datetime, DATETIME_PARTS def analyze_language_datetime(values: pd.Series, root_keys: pd.Series, _: pd.Series | None = None) -> dict: @@ -26,23 +25,11 @@ def analyze_language_datetime(values: pd.Series, root_keys: pd.Series, _: pd.Ser min11 = min_dates.sort_values(ascending=True).head(11).astype(str).tolist() max_dates = df.groupby(root_keys.name)[values.name].max().dropna() max11 = max_dates.sort_values(ascending=False).head(11).astype(str).tolist() - # split into datetime parts - df_split = split_sub_columns_datetime(values) - is_not_nan = df_split["nan"] == 0 - has_nan = any(df_split["nan"] == 1) - # extract min/max value for each part to determine valid value range - if any(is_not_nan): - min_values = {k: int(df_split[k][is_not_nan].min()) for k in DATETIME_PARTS} - max_values = {k: int(df_split[k][is_not_nan].max()) for k in DATETIME_PARTS} - else: - def_values = {"year": 2022, "month": 1, "day": 1} - min_values = {k: 0 for k in DATETIME_PARTS} | def_values - max_values = {k: 0 for k in DATETIME_PARTS} | def_values + # determine if there are any NaN values + has_nan = bool(values.isna().any()) # return stats stats = { "has_nan": has_nan, - "min_values": min_values, - "max_values": max_values, "min11": min11, "max11": max11, } @@ -52,10 +39,6 @@ def analyze_language_datetime(values: pd.Series, root_keys: pd.Series, _: pd.Ser def analyze_reduce_language_datetime(stats_list: list[dict], value_protection: bool = True) -> dict: # check if there are missing values has_nan = any([j["has_nan"] for j in stats_list]) - # determine min/max values for each part - keys = stats_list[0]["min_values"].keys() - min_values = {k: min([j["min_values"][k] for j in stats_list]) for k in keys} - max_values = {k: max([j["max_values"][k] for j in stats_list]) for k in keys} # determine min / max 5 values to map too low / too high values to min11 = sorted([v for min11 in [j["min11"] for j in stats_list] for v in min11], reverse=False)[:11] max11 = sorted([v for max11 in [j["max11"] for j in stats_list] for v in max11], reverse=True)[:11] @@ -68,9 +51,6 @@ def analyze_reduce_language_datetime(stats_list: list[dict], value_protection: b else: min5 = [str(v) for v in min11[5:10]] # drop 1 to 5th lowest; keep 6th to 10th lowest max5 = [str(v) for v in max11[5:10]] # drop 1 to 5th highest; keep 6th to 10th highest - # update min/max year based on first four letters of protected min/max dates - max_values["year"] = int(max5[0][0:4]) - min_values["year"] = int(min5[0][0:4]) else: min5 = min11[0:4] max5 = max11[0:4] @@ -80,3 +60,27 @@ def analyze_reduce_language_datetime(stats_list: list[dict], value_protection: b "max5": max5, } return stats + + +def encode_language_datetime(values: pd.Series, stats: dict, _: pd.Series | None = None) -> pd.Series: + # convert + values = safe_convert_datetime(values) + values = values.copy() + # reset index, as `values.mask` can throw errors for misaligned indices + values.reset_index(drop=True, inplace=True) + # replace extreme values with randomly sampled 5-th to 10-th largest/smallest values + min5 = stats["min5"] if len(stats["min5"]) > 0 else [0] + max5 = stats["max5"] if len(stats["max5"]) > 0 else [0] + min5 = pd.Series(min5, dtype=values.dtype) + max5 = pd.Series(max5, dtype=values.dtype) + values.mask( + values < min5[0], + min5.sample(n=len(values), replace=True, ignore_index=True), + inplace=True, + ) + values.mask( + values > max5[0], + max5.sample(n=len(values), replace=True, ignore_index=True), + inplace=True, + ) + return values diff --git a/mostlyai/engine/_language/encoding.py b/mostlyai/engine/_language/encoding.py index 782573e..752624f 100644 --- a/mostlyai/engine/_language/encoding.py +++ b/mostlyai/engine/_language/encoding.py @@ -26,6 +26,7 @@ from mostlyai.engine._workspace import ensure_workspace_dir, Workspace, reset_dir from mostlyai.engine._encoding_types.language.categorical import encode_language_categorical from mostlyai.engine._encoding_types.language.numeric import encode_language_numeric +from mostlyai.engine._encoding_types.language.datetime import encode_language_datetime _LOG = logging.getLogger(__name__) @@ -36,6 +37,8 @@ def apply_encoding_types(df: pd.DataFrame, stats: dict) -> pd.DataFrame: df[col] = encode_language_categorical(df[col], col_stats) elif col_stats["encoding_type"] == "LANGUAGE_NUMERIC": df[col] = encode_language_numeric(df[col], col_stats) + elif col_stats["encoding_type"] == "LANGUAGE_DATETIME": + df[col] = encode_language_datetime(df[col], col_stats) return df diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index e741c61..978abf3 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -435,7 +435,7 @@ def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): pd.Timestamp("2025-01-04"), ] * int(no_of_records / 4) - + [pd.Timestamp("2025-01-04")], + + [pd.Timestamp("2025-01-05")], } ) tgt_encoding_types = { @@ -475,9 +475,18 @@ def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_datas syn = pd.read_parquet(syn_data_path) assert len(syn) == 10 assert set(syn.columns) == {"age", "gender", "date"} + assert syn["age"].dtype == "Int64" - assert all((syn["age"] >= 30) & (syn["age"] <= 50)) + # test extreme value protection + assert syn["age"].min() >= 20 + assert syn["age"].max() <= 50 + assert syn["gender"].dtype == "string" + # test rare category protection assert "rare" not in syn["gender"].values assert CATEGORICAL_UNKNOWN_TOKEN not in syn["gender"].values + assert syn["date"].dtype == "datetime64[ns]" + # test extreme value protection + assert syn["date"].min() >= pd.Timestamp("2020-01-02") + assert syn["date"].max() <= pd.Timestamp("2025-01-04") From 28516f698b0b374a7e67e2cddfc7de6cc299c2d6 Mon Sep 17 00:00:00 2001 From: Shuang Wu <149689370+shuangwu5@users.noreply.github.com> Date: Tue, 11 Feb 2025 18:26:02 +0100 Subject: [PATCH 34/58] build: uv run without re-syncing the environment (#37) --- .github/workflows/run-tests-cpu.yaml | 10 +++++----- .github/workflows/run-tests-gpu.yaml | 6 +++--- Makefile | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/run-tests-cpu.yaml b/.github/workflows/run-tests-cpu.yaml index bf66900..6a36d09 100644 --- a/.github/workflows/run-tests-cpu.yaml +++ b/.github/workflows/run-tests-cpu.yaml @@ -32,16 +32,16 @@ jobs: run: uv sync --frozen --extra cpu - name: Run | Tests -> unit - run: uv run pytest tests/unit + run: uv run --no-sync pytest tests/unit - name: Build mkdocs - run: uv run mkdocs build --strict + run: uv run --no-sync mkdocs build --strict - name: Run tests -> end_to_end -> sequential - run: uv run pytest tests/end_to_end/test_tabular_sequential.py + run: uv run --no-sync pytest tests/end_to_end/test_tabular_sequential.py - name: Run tests -> end_to_end -> sequential context - run: uv run pytest tests/end_to_end/test_tabular_sequential_context.py + run: uv run --no-sync pytest tests/end_to_end/test_tabular_sequential_context.py run-tests-cpu-end-to-end-nonsequential: runs-on: ubuntu-latest @@ -66,4 +66,4 @@ jobs: run: uv sync --frozen --extra cpu --no-group docs - name: Run tests -> end_to_end all except sequential - run: uv run pytest --ignore=tests/end_to_end/test_tabular_sequential.py --ignore=tests/end_to_end/test_tabular_sequential_context.py tests/end_to_end/ + run: uv run --no-sync pytest --ignore=tests/end_to_end/test_tabular_sequential.py --ignore=tests/end_to_end/test_tabular_sequential_context.py tests/end_to_end/ diff --git a/.github/workflows/run-tests-gpu.yaml b/.github/workflows/run-tests-gpu.yaml index 3958cbc..269f8a9 100644 --- a/.github/workflows/run-tests-gpu.yaml +++ b/.github/workflows/run-tests-gpu.yaml @@ -42,10 +42,10 @@ jobs: run: nvidia-smi - name: Run tests -> end_to_end -> sequential - run: uv run pytest tests/end_to_end/test_tabular_sequential.py + run: uv run --no-sync pytest tests/end_to_end/test_tabular_sequential.py - name: Run tests -> end_to_end -> sequential context - run: uv run pytest tests/end_to_end/test_tabular_sequential_context.py + run: uv run --no-sync pytest tests/end_to_end/test_tabular_sequential_context.py - name: Run tests -> end_to_end all except sequential - run: uv run pytest --ignore=tests/end_to_end/test_tabular_sequential.py --ignore=tests/end_to_end/test_tabular_sequential_context.py tests/end_to_end/ + run: uv run --no-sync pytest --ignore=tests/end_to_end/test_tabular_sequential.py --ignore=tests/end_to_end/test_tabular_sequential_context.py tests/end_to_end/ diff --git a/Makefile b/Makefile index 853ab26..9fc72a1 100644 --- a/Makefile +++ b/Makefile @@ -12,11 +12,11 @@ install: # Install dependencies .PHONY: lint lint: ## Run lints - uv run pre-commit run --all-files + uv run --no-sync pre-commit run --all-files .PHONY: test test: ## Run tests - uv run pytest + uv run --no-sync pytest .PHONY: all all: clean install lint test ## Run the commands: clean install lint test From 5d8011f7043d0ce6f5eab7e1af8c809b8c4546d8 Mon Sep 17 00:00:00 2001 From: michdr Date: Tue, 11 Feb 2025 18:38:18 +0100 Subject: [PATCH 35/58] temp fix for datetime validation --- mostlyai/engine/_language/formatron_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 908cc39..b7fe2d0 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -156,9 +156,13 @@ def from_json(cls, _json: str) -> "MostlyClassSchema": try: return cls.model_validate_json(_json) except ValidationError as e: + do_raise = True # FIXME temporary work-around for error in e.errors(): if error["type"] == "json_invalid": raise JSONDecodeError( f"Caught pydantic ValidationError {e}, reraising as JSONDecodeError", _json, 0 ) - raise e + elif "day value is outside expected range" in error.get("msg"): + do_raise = False # FIXME: make flexible datetime validation instead + if do_raise: + raise e From 4eccc0dcca85113b9e01e8942c7b7144334ac4a5 Mon Sep 17 00:00:00 2001 From: michdr Date: Wed, 12 Feb 2025 10:37:34 +0100 Subject: [PATCH 36/58] enhance test_categorical_numeric_datetime --- tests/end_to_end/test_language.py | 33 ++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 978abf3..bfcf5de 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -423,21 +423,31 @@ def test_special_character_column_name(tmp_path_factory): @pytest.fixture(scope="session") def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): workspace_dir = tmp_path_factory.mktemp("ws") - no_of_records = 20 + no_of_records = 40 data = pd.DataFrame( { - "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4) + ["rare"], - "age": [20, 30, 40, 50] * int(no_of_records / 4) + [60], + "gender": ["m", "f", "x", pd.NA] * int(no_of_records / 4), + "age": [20, 30, 40, 50] * int(no_of_records / 4), "date": [ pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02"), pd.Timestamp("2023-01-03"), pd.Timestamp("2025-01-04"), ] - * int(no_of_records / 4) - + [pd.Timestamp("2025-01-05")], + * int(no_of_records / 4), + } + ) + rare_df = pd.DataFrame( + { + "gender": [f"rare{i + 1}" for i in range(20)], + "age": list(range(10, 20)) + list(range(51, 61)), + "date": ( + [pd.Timestamp("2019-01-01") + pd.Timedelta(days=i) for i in range(10)] + + [pd.Timestamp("2026-01-01") + pd.Timedelta(days=i) for i in range(10)] + ), } ) + data = pd.concat([data, rare_df], ignore_index=True) tgt_encoding_types = { "age": ModelEncodingType.language_numeric.value, "gender": ModelEncodingType.language_categorical.value, @@ -467,26 +477,27 @@ def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_datas train(workspace_dir=workspace_dir, model=model_name) generate( workspace_dir=workspace_dir, - sample_size=10, + sample_size=40, rare_category_replacement_method=RareCategoryReplacementMethod.sample, ) syn_data_path = workspace_dir / "SyntheticData" syn = pd.read_parquet(syn_data_path) - assert len(syn) == 10 + assert len(syn) == 40 assert set(syn.columns) == {"age", "gender", "date"} assert syn["age"].dtype == "Int64" # test extreme value protection - assert syn["age"].min() >= 20 - assert syn["age"].max() <= 50 + assert syn["age"].min() >= 15 + assert syn["age"].max() <= 55 assert syn["gender"].dtype == "string" # test rare category protection assert "rare" not in syn["gender"].values assert CATEGORICAL_UNKNOWN_TOKEN not in syn["gender"].values + assert syn["gender"].nunique(dropna=False) == 4 assert syn["date"].dtype == "datetime64[ns]" # test extreme value protection - assert syn["date"].min() >= pd.Timestamp("2020-01-02") - assert syn["date"].max() <= pd.Timestamp("2025-01-04") + assert syn["date"].min() >= pd.Timestamp("2019-01-06") + assert syn["date"].max() <= pd.Timestamp("2026-01-05") From d52772ec078dab4707e96a076aa38242c22f0807 Mon Sep 17 00:00:00 2001 From: michdr Date: Wed, 12 Feb 2025 11:19:05 +0100 Subject: [PATCH 37/58] enable all models in test_categorical_numeric_datetime --- mostlyai/engine/_language/generation.py | 4 +++- tests/end_to_end/test_language.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 235ef1a..c6a6c3a 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -124,7 +124,9 @@ def parse_json(x, columns: list[str]): def _decode_string(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - return x.astype(STRING) + x = x.astype(STRING) + allowed_categories = col_stats.get("categories", []) + return x.where(x.isin(allowed_categories), other=None) def _clip_numeric(x: pd.Series, min5: list, max5: list) -> pd.Series: diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index bfcf5de..de4b626 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -467,9 +467,9 @@ def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): @pytest.mark.parametrize( ("model_name"), [ - # LSTMFromScratchConfig.model_id, # FIXME: failed when generating incomplete datetime or overflow numbers atm - # "amd/AMD-Llama-135m", # FIXME failed when generating incomplete datetime or overflow numbers atm - "openai-community/gpt2" # TEMP, better model than AMD + LSTMFromScratchConfig.model_id, + "amd/AMD-Llama-135m", + "openai-community/gpt2", # TEMP, better model than AMD ], ) def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_dataset, model_name): @@ -495,9 +495,11 @@ def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_datas # test rare category protection assert "rare" not in syn["gender"].values assert CATEGORICAL_UNKNOWN_TOKEN not in syn["gender"].values - assert syn["gender"].nunique(dropna=False) == 4 + assert syn["gender"].nunique(dropna=False) <= 4 assert syn["date"].dtype == "datetime64[ns]" # test extreme value protection - assert syn["date"].min() >= pd.Timestamp("2019-01-06") - assert syn["date"].max() <= pd.Timestamp("2026-01-05") + dates = syn["date"].dropna() + if not dates.empty: + assert dates.min() >= pd.Timestamp("2019-01-06") + assert dates.max() <= pd.Timestamp("2026-01-05") From e343ace5c7bdf9b787e06c39f3899f5b524e701c Mon Sep 17 00:00:00 2001 From: michdr Date: Wed, 12 Feb 2025 17:08:51 +0100 Subject: [PATCH 38/58] fix _decode_datetime --- mostlyai/engine/_language/generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index c6a6c3a..0cf500c 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -201,7 +201,7 @@ def _decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: x.loc[valid_mask] = new_date x = pd.to_datetime(x, errors="coerce") - x.loc[valid_mask] = _clip_datetime(x.loc[valid_mask], col_stats["min5"], col_stats["max5"]) + x = _clip_datetime(x, col_stats["min5"], col_stats["max5"]) return x.astype("datetime64[ns]") From 9453080b2792d8d84fce374970aea3e38de35e82 Mon Sep 17 00:00:00 2001 From: andre-mostly <149782207+andre-mostly@users.noreply.github.com> Date: Thu, 13 Feb 2025 11:22:34 +0100 Subject: [PATCH 39/58] feat: constrain numeric and simplify datetime (#38) --- mostlyai/engine/_language/engine/hf_engine.py | 3 + .../engine/_language/engine/vllm_engine.py | 2 + mostlyai/engine/_language/formatron_utils.py | 66 ++---- mostlyai/engine/_language/generation.py | 1 - mostlyai/engine/_language/temp_formatron.py | 194 ++++++++++-------- 5 files changed, 126 insertions(+), 140 deletions(-) diff --git a/mostlyai/engine/_language/engine/hf_engine.py b/mostlyai/engine/_language/engine/hf_engine.py index 744fcaa..e64fce0 100644 --- a/mostlyai/engine/_language/engine/hf_engine.py +++ b/mostlyai/engine/_language/engine/hf_engine.py @@ -23,6 +23,7 @@ from transformers import AutoTokenizer from mostlyai.engine._language.common import load_base_model_and_config +from mostlyai.engine._language.temp_formatron import monkey_patch_formatron from mostlyai.engine._language.tokenizer_utils import tokenize_fn from mostlyai.engine._language.engine.base import EngineMetrics, LanguageEngine @@ -65,6 +66,8 @@ def __init__( self.tokenizer.special_tokens_map ) self._json_enforcing_possible = is_peft_adapter or is_trained_lstm_tokenizer + if self.supports_json_enforcing(): + monkey_patch_formatron() self._logits_processors = None def get_default_batch_size(self) -> int: diff --git a/mostlyai/engine/_language/engine/vllm_engine.py b/mostlyai/engine/_language/engine/vllm_engine.py index 247479a..d66552d 100644 --- a/mostlyai/engine/_language/engine/vllm_engine.py +++ b/mostlyai/engine/_language/engine/vllm_engine.py @@ -26,6 +26,7 @@ from peft import PeftConfig from transformers import AutoTokenizer, AutoConfig, PreTrainedTokenizerBase +from mostlyai.engine._language.temp_formatron import monkey_patch_formatron from vllm import LLM, SamplingParams from vllm.lora.request import LoRARequest from vllm.config import _get_and_verify_max_len @@ -123,6 +124,7 @@ def __init__( add_eos_token=False, ) self._logits_processors = None + monkey_patch_formatron() def get_default_batch_size(self) -> int: return 192 diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index b7fe2d0..1a83c92 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -13,23 +13,17 @@ # limitations under the License. -import datetime import typing import pandas as pd from formatron.schemas.pydantic import ClassSchema from json import JSONDecodeError -from pydantic import ValidationError +from pydantic import Field, SkipValidation, ValidationError from formatron.formatter import FormatterBuilder from typing import Literal -from formatron.formats import json from pydantic import create_model from transformers import PreTrainedTokenizerBase from mostlyai.engine._encoding_types.language.categorical import CATEGORICAL_UNKNOWN_TOKEN -from mostlyai.engine._language.temp_formatron import MostlyJsonExtractor -import collections -from formatron.schemas.schema import Schema - from mostlyai.engine.domain import ModelEncodingType, RareCategoryReplacementMethod JSON_NULL = "null" @@ -49,42 +43,6 @@ def transform(x: str | None) -> str: return sample_seed.astype("string[pyarrow]").map(transform) -class MostlyFormatterBuilder(FormatterBuilder): - def __init__(self): - super().__init__() - - def json(self, schema: type[Schema] | collections.abc.Sequence, *, capture_name: str = None) -> MostlyJsonExtractor: - """ - Create a JSON extractor. Check out the MostlyJsonExtractor docs for more details. - - Args: - schema: The schema for extraction. - capture_name: The capture name of the extractor, or `None` if the extractor does not capture. - Returns: - The JSON extractor. - """ - - def to_json(_json: str): - local_schema = schema - origin = typing.get_origin(local_schema) - if origin is not None: - local_schema = origin - if isinstance(local_schema, type) and issubclass(local_schema, Schema): - try: - return local_schema.from_json(_json) - except JSONDecodeError: # make ChoiceExtractor work appropriately - return None - else: - try: - return json.loads(_json) - except JSONDecodeError: - return None - - return self._add_extractor( - "json", lambda nonterminal: MostlyJsonExtractor(nonterminal, capture_name, schema, to_json) - ) - - def get_formatter_builders( *, seed_df: pd.DataFrame | None = None, @@ -105,7 +63,7 @@ def get_formatter_builders( numeric_fields = field_types.get(ModelEncodingType.language_numeric, []) datetime_fields = field_types.get(ModelEncodingType.language_datetime, []) for _, seed_row in seed_df.iterrows(): - formatter_builder = MostlyFormatterBuilder() + formatter_builder = FormatterBuilder() model_dict = {} if not seed_row.empty: model_dict |= {field_name: (Literal[seed_value], ...) for field_name, seed_value in seed_row.items()} # type: ignore[valid-type] @@ -117,13 +75,19 @@ def get_formatter_builders( model_dict[field_name] = (Literal[tuple(categories)], ...) # type: ignore[valid-type] elif field_name in numeric_fields: max_scale = stats["columns"][field_name]["max_scale"] + # min_min5 = min(stats["columns"][field_name]["min5"]) + # max_max5 = max(stats["columns"][field_name]["max5"]) if max_scale == 0: - model_dict[field_name] = (int, ...) + model_dict[field_name] = (SkipValidation[int], ...) # , Field(ge=min_min5, le=max_max5)) else: - model_dict[field_name] = (float, ...) + model_dict[field_name] = (SkipValidation[float], ...) # , Field(ge=min_min5, le=max_max5)) elif field_name in datetime_fields: - # model_dict[field_name] = (str, Field(pattern=r"19\\d{2}|20\\d{2}-0[1-9]|1[0-2]-0[1-9]|1[0-9]|2[0-9]|3[0-1]")) - might be able to make this work, but it fails - model_dict[field_name] = (datetime.datetime, ...) + model_dict[field_name] = ( + SkipValidation[str], + Field( + pattern=r"""(19\\d{2}|20\\d{2})-(0[1-9]|1[0-2])-(0[1-9]|1[0-9]|2[0-9]|3[0-1]) ([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])""" + ), + ) else: model_dict[field_name] = (str, ...) schema = create_model("TargetModel", **model_dict, __base__=MostlyClassSchema) @@ -156,13 +120,9 @@ def from_json(cls, _json: str) -> "MostlyClassSchema": try: return cls.model_validate_json(_json) except ValidationError as e: - do_raise = True # FIXME temporary work-around for error in e.errors(): if error["type"] == "json_invalid": raise JSONDecodeError( f"Caught pydantic ValidationError {e}, reraising as JSONDecodeError", _json, 0 ) - elif "day value is outside expected range" in error.get("msg"): - do_raise = False # FIXME: make flexible datetime validation instead - if do_raise: - raise e + raise e diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 0cf500c..61ff577 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -152,7 +152,6 @@ def _clip_datetime(x: pd.Series, min5: list, max5: list) -> pd.Series: def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - # FIXME add programmatic constraint x = pd.to_numeric(x, errors="coerce") x = _clip_numeric(x, col_stats["min5"], col_stats["max5"]) # FIXME can result in OverFlowError when turning string into int in _decode_numeric in generation.py, from age '-5555555555555555555555555' -> OverflowError: Python int too large to convert to C long diff --git a/mostlyai/engine/_language/temp_formatron.py b/mostlyai/engine/_language/temp_formatron.py index c88ddc9..91c0ad8 100644 --- a/mostlyai/engine/_language/temp_formatron.py +++ b/mostlyai/engine/_language/temp_formatron.py @@ -12,104 +12,126 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" -The module defines the `MostlyJsonExtractor` class, which is used to extract data from a string in JSON format. -""" - -import collections -import datetime import typing from formatron import schemas -from formatron.formats.json import _type_to_nonterminals, JsonExtractor +from formatron.formats import json -__all__ = ["MostlyJsonExtractor"] +def monkey_patch_formatron(): + FORMATRON_WHITESPACE_MAX_REPETITIONS = 10 + SPACE_NONTERMINAL = f"[ \t\n\r]{{0,{FORMATRON_WHITESPACE_MAX_REPETITIONS}}}" -FORMATRON_WHITESPACE_MAX_REPETITIONS = 10 -SPACE_NONTERMINAL = f"[ \t\n\r]{{0,{FORMATRON_WHITESPACE_MAX_REPETITIONS}}}" + # Copy from formatron, altered to have limited whitespace repetitions and datetime format + json.GRAMMAR_HEADER = rf"""integer ::= #"-?(0|[1-9]\\d*)"; + number ::= #"-?(0|[1-9]\\d*)(\\.\\d+)?([eE][+-]?\\d+)?"; + string ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}})*"'; + boolean ::= "true"|"false"; + null ::= "null"; + array ::= array_begin (json_value (comma json_value)*)? array_end; + object ::= object_begin (string colon json_value (comma string colon json_value)*)? object_end; + json_value ::= number|string|boolean|null|array|object; + comma ::= #"{SPACE_NONTERMINAL},{SPACE_NONTERMINAL}"; + colon ::= #"{SPACE_NONTERMINAL}:{SPACE_NONTERMINAL}"; + object_begin ::= #" \\{{{SPACE_NONTERMINAL}"; + object_end ::= #"{SPACE_NONTERMINAL}\\}}"; + array_begin ::= #"\\[{SPACE_NONTERMINAL}"; + array_end ::= #"{SPACE_NONTERMINAL}\\]"; + """ -# Copy from formatron, altered to have limited whitespace repetitions and datetime format -GRAMMAR_HEADER = rf"""integer ::= #"-?(0|[1-9]\\d*)"; -number ::= #"-?(0|[1-9]\\d*)(\\.\\d+)?([eE][+-]?\\d+)?"; -string ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}})*"'; -boolean ::= "true"|"false"; -null ::= "null"; -array ::= array_begin (json_value (comma json_value)*)? array_end; -object ::= object_begin (string colon json_value (comma string colon json_value)*)? object_end; -json_value ::= number|string|boolean|null|array|object; -datetime ::= #'"(19\\d{{2}}|20\\d{{2}})-(0[1-9]|1[0-2])-(0[1-9]|1[0-9]|2[0-9]|3[0-1]) ([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])"'; -comma ::= #"{SPACE_NONTERMINAL},{SPACE_NONTERMINAL}"; -colon ::= #"{SPACE_NONTERMINAL}:{SPACE_NONTERMINAL}"; -object_begin ::= #" \\{{{SPACE_NONTERMINAL}"; -object_end ::= #"{SPACE_NONTERMINAL}\\}}"; -array_begin ::= #"\\[{SPACE_NONTERMINAL}"; -array_end ::= #"{SPACE_NONTERMINAL}\\]"; -""" + # direct copy from formatron + def string_metadata(current: type, nonterminal: str): + min_length = current.metadata.get("min_length") + max_length = current.metadata.get("max_length") + pattern = current.metadata.get("pattern") + substring_of = current.metadata.get("substring_of") + if pattern: + assert not (min_length or max_length or substring_of), ( + "pattern is mutually exclusive with min_length, max_length and substring_of" + ) + if substring_of: + assert not (min_length or max_length or pattern), ( + "substring_of is mutually exclusive with min_length, max_length and pattern" + ) + repetition_map = { + (True, False): f"{{{min_length},}}", + (False, True): f"{{0,{max_length}}}", + (True, True): f"{{{min_length},{max_length}}}", + } + repetition = repetition_map.get((min_length is not None, max_length is not None)) + if repetition is not None: + return ( + rf"""{nonterminal} ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}}){repetition}"'; + """, + [], + ) + if pattern is not None: + pattern = pattern.replace("'", "\\'") + return f"""{nonterminal} ::= #'"{pattern}"';\n""", [] + if substring_of is not None: + return f"""{nonterminal} ::= '"' #substrs{repr(substring_of)} '"';\n""", [] -# FIXME add grammar constraint of integer and number + # altered + def number_metadata(current: type, nonterminal: str): + # For now only constrains number of digits and whether it is negative + gt = current.metadata.get("gt") + ge = current.metadata.get("ge") + lt = current.metadata.get("lt") + le = current.metadata.get("le") + if lt is not None or gt is not None: + raise NotImplementedError("gt and lt are not supported for number metadata") + if le < ge: + raise ValueError("le must be greater than or equal to ge") + pattern_parts = [] + if issubclass(current.type, float): + le, le_frac = le.split(".") + ge, ge_frac = ge.split(".") -# Copy from formatron except `datetime` -def _generate_kbnf_grammar(schema: schemas.schema.Schema | collections.abc.Sequence, start_nonterminal: str) -> str: - """ - Generate a KBNF grammar string from a schema for JSON format. + if ge is not None and le is not None: + if ge < 0 and le < 0: + pattern_parts.append("-") + min_num = abs(le) + max_num = abs(ge) + max_digits = len(str(max_num)) + min_digits = len(str(min_num)) + pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") + elif ge > 0: + min_num = ge + max_num = le + max_digits = len(str(max_num)) + min_digits = len(str(min_num)) + pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") + else: + if ge < 0: + pattern_parts.append("-?") + max_digits = max(len(str(abs(ge))), len(str(abs(le)))) + pattern_parts.append(rf"(0|[1-9][0-9]{{0,{max_digits - 1}}})") - Args: - schema: The schema to generate a grammar for. - start_nonterminal: The start nonterminal of the grammar. Default is "start". + if issubclass(current.type, float): + # FIXME: currently is not constrained + pattern_parts.append(r"(\\.\\d+)?") - Returns: - The generated KBNF grammar string. - """ - type_id_to_nonterminal = { - id(int): "integer", - id(float): "number", - id(str): "string", - id(bool): "boolean", - id(type(None)): "null", - id(list): "array", - id(dict): "object", - id(typing.Any): "json_value", - id(datetime.datetime): "datetime", # altered - } - result = [GRAMMAR_HEADER] - nonterminals = set() - stack = [(schema, start_nonterminal)] - while stack: - (current, nonterminal) = stack.pop() - type_id = id(current) - if type_id in type_id_to_nonterminal: - line = f"{nonterminal} ::= {type_id_to_nonterminal[type_id]};\n" - result.append(line) - continue - type_id_to_nonterminal[type_id] = nonterminal - for i in _type_to_nonterminals: - value = i(current, nonterminal) - if value is not None: - line, to_stack = value - result.append(line) - stack.extend(to_stack) - nonterminals.add(nonterminal) - break - else: - raise TypeError(f"{current} from {nonterminal} is not supported in json_generators!") - return "".join(result) + pattern = "".join(pattern_parts) + return f"""{nonterminal} ::= #"{pattern}";\n""", [] + # removed sequence metadata since unnecessary and altered number_metadata to use ours + def metadata(current: type, nonterminal: str): + if isinstance(current, schemas.schema.TypeWithMetadata): + original = typing.get_origin(current.type) + if original is None: + original = current.type + if not current.metadata: + return "", [(current.type, nonterminal)] + if isinstance(current.type, type) and issubclass(current.type, str): + return string_metadata(current, nonterminal) + elif isinstance(current.type, type) and issubclass(current.type, (int, float)): + return number_metadata(current, nonterminal) + return None -class MostlyJsonExtractor(JsonExtractor): - """ - Same as the parent class from formatron - except that it uses `_generate_kbnf_grammar` from this file to construct self._rule_str - """ + def alter_type_to_nonterminals_inplace(type_to_nonterminals: list[typing.Callable]): + metadata_idx = [idx for idx, fn in enumerate(type_to_nonterminals) if fn.__name__ == "metadata"] + assert len(metadata_idx) == 1, "metadata function must be present and unique" + type_to_nonterminals[metadata_idx[0]] = metadata - def __init__( - self, - nonterminal: str, - capture_name: str | None, - schema: schemas.schema.Schema | collections.abc.Sequence, - to_object: typing.Callable[[str], schemas.schema.Schema], - ): - super(JsonExtractor, self).__init__(nonterminal, capture_name) - self._to_object = to_object - self._rule_str = _generate_kbnf_grammar(schema, self.nonterminal) + alter_type_to_nonterminals_inplace(json._type_to_nonterminals) From d66670d81e69aa5ffaa500df38b823d5d35a388e Mon Sep 17 00:00:00 2001 From: michdr Date: Thu, 13 Feb 2025 11:58:51 +0100 Subject: [PATCH 40/58] refactor: move language decode functions + tabular encoding types unit tests --- .../_encoding_types/language/categorical.py | 8 +- .../_encoding_types/language/datetime.py | 57 +++++++++++ .../_encoding_types/language/numeric.py | 22 ++++- mostlyai/engine/_language/generation.py | 95 ++----------------- .../unit/encoding_types/language/__init__.py | 13 +++ tests/unit/encoding_types/tabular/__init__.py | 13 +++ .../{ => tabular}/test_categorical.py | 0 .../{ => tabular}/test_character.py | 0 .../{ => tabular}/test_datetime.py | 0 .../encoding_types/{ => tabular}/test_itt.py | 0 .../{ => tabular}/test_lat_long.py | 0 .../{ => tabular}/test_numeric.py | 0 12 files changed, 118 insertions(+), 90 deletions(-) create mode 100644 tests/unit/encoding_types/language/__init__.py create mode 100644 tests/unit/encoding_types/tabular/__init__.py rename tests/unit/encoding_types/{ => tabular}/test_categorical.py (100%) rename tests/unit/encoding_types/{ => tabular}/test_character.py (100%) rename tests/unit/encoding_types/{ => tabular}/test_datetime.py (100%) rename tests/unit/encoding_types/{ => tabular}/test_itt.py (100%) rename tests/unit/encoding_types/{ => tabular}/test_lat_long.py (100%) rename tests/unit/encoding_types/{ => tabular}/test_numeric.py (100%) diff --git a/mostlyai/engine/_encoding_types/language/categorical.py b/mostlyai/engine/_encoding_types/language/categorical.py index 4f8d8ce..46f57f1 100644 --- a/mostlyai/engine/_encoding_types/language/categorical.py +++ b/mostlyai/engine/_encoding_types/language/categorical.py @@ -19,7 +19,7 @@ import numpy as np import pandas as pd -from mostlyai.engine._common import safe_convert_string +from mostlyai.engine._common import safe_convert_string, STRING CATEGORICAL_UNKNOWN_TOKEN = "_RARE_" @@ -67,3 +67,9 @@ def encode_language_categorical(values: pd.Series, stats: dict) -> pd.Series: mask &= ~pd.isna(values) values[mask] = CATEGORICAL_UNKNOWN_TOKEN return values + + +def decode_categorical(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: + x = x.astype(STRING) + allowed_categories = col_stats.get("categories", []) + return x.where(x.isin(allowed_categories), other=None) diff --git a/mostlyai/engine/_encoding_types/language/datetime.py b/mostlyai/engine/_encoding_types/language/datetime.py index 68d5662..cc7aa04 100644 --- a/mostlyai/engine/_encoding_types/language/datetime.py +++ b/mostlyai/engine/_encoding_types/language/datetime.py @@ -11,7 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import calendar +import numpy as np import pandas as pd from mostlyai.engine._common import safe_convert_datetime @@ -84,3 +86,58 @@ def encode_language_datetime(values: pd.Series, stats: dict, _: pd.Series | None inplace=True, ) return values + + +def _clip_datetime(x: pd.Series, min5: list, max5: list) -> pd.Series: + x_dt = pd.to_datetime(x, errors="coerce") + min_arr = pd.to_datetime(min5).to_numpy(dtype="datetime64[ns]") + max_arr = pd.to_datetime(max5).to_numpy(dtype="datetime64[ns]") + n = len(x_dt) + random_mins = np.random.choice(min_arr, size=n) + random_maxs = np.random.choice(max_arr, size=n) + clipped = np.minimum(np.maximum(x_dt.to_numpy(dtype="datetime64[ns]"), random_mins), random_maxs) + return pd.Series(clipped, index=x.index) + + +def decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: + x = x.where(~x.isin(["", "_INVALID_"]), np.nan) + + valid_mask = ( + x.str.len().ge(10) + & x.str.slice(0, 4).str.isdigit() + & x.str.slice(5, 7).str.isdigit() + & x.str.slice(8, 10).str.isdigit() + ) + if valid_mask.sum() > 0: # expected "YYYY-MM-DD" prefix + # handle the date portion, ensuring validity + years = x[valid_mask].str.slice(0, 4).astype(int) + months = x[valid_mask].str.slice(5, 7).astype(int) + days = x[valid_mask].str.slice(8, 10).astype(int) + + # clamp days according to maximum possible day of the month of a given year + last_days = np.array([calendar.monthrange(y, m)[1] for y, m in zip(years, months)]) + clamped_days = np.minimum(days, last_days) + + # rebuild the date portion + new_date = ( + years.astype(str).str.zfill(4) + + "-" + + months.astype(str).str.zfill(2) + + "-" + + pd.Series(clamped_days, index=years.index).astype(str).str.zfill(2) + ) + + # handle the time portion, ensuring validity + remainder = x[valid_mask].str.slice(10) + + time_regex = r"^[ T]?(\d{2}:\d{2}:\d{2}(?:\.\d+)?)" + valid_time = remainder.str.extract(time_regex, expand=False) + valid_time = valid_time.fillna("00:00:00") + valid_time = " " + valid_time + + new_date = new_date + valid_time + x.loc[valid_mask] = new_date + + x = pd.to_datetime(x, errors="coerce") + x = _clip_datetime(x, col_stats["min5"], col_stats["max5"]) + return x.astype("datetime64[ns]") diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index 356d03a..8a99820 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import numpy as np import pandas as pd from mostlyai.engine._common import safe_convert_numeric @@ -115,3 +115,23 @@ def encode_language_numeric(values: pd.Series, stats: dict, _: pd.Series | None inplace=True, ) return values + + +def _clip_numeric(x: pd.Series, min5: list, max5: list) -> pd.Series: + x_numeric = pd.to_numeric(x, errors="coerce") + min_arr = np.array(min5, dtype=x_numeric.dtype) + max_arr = np.array(max5, dtype=x_numeric.dtype) + n = len(x_numeric) + random_mins = np.random.choice(min_arr, size=n) + random_maxs = np.random.choice(max_arr, size=n) + clipped = np.minimum(np.maximum(x_numeric.to_numpy(), random_mins), random_maxs) + return pd.Series(clipped, index=x.index) + + +def decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: + x = pd.to_numeric(x, errors="coerce") + x = _clip_numeric(x, col_stats["min5"], col_stats["max5"]) + # FIXME can result in OverFlowError when turning string into int in _decode_numeric in generation.py, from age '-5555555555555555555555555' -> OverflowError: Python int too large to convert to C long + if col_stats["max_scale"] == 0: + return x.astype("Int64") + return x.astype(float) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 61ff577..d999b6c 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import calendar import contextlib import json import os @@ -22,7 +21,6 @@ import time from pathlib import Path -import numpy as np import pandas as pd import torch from huggingface_hub import constants as hf_constants @@ -34,10 +32,12 @@ from mostlyai.engine._common import ( persist_data_part, FixedSizeSampleBuffer, - STRING, ProgressCallback, ProgressCallbackWrapper, ) +from mostlyai.engine._encoding_types.language.categorical import decode_categorical +from mostlyai.engine._encoding_types.language.datetime import decode_datetime +from mostlyai.engine._encoding_types.language.numeric import decode_numeric from mostlyai.engine._language.common import estimate_max_tokens, MAX_LENGTH from mostlyai.engine._language.encoding import encode_df from mostlyai.engine._workspace import ensure_workspace_dir, Workspace, reset_dir @@ -112,98 +112,17 @@ def parse_json(x, columns: list[str]): for col in tgt_stats["columns"].keys(): col_stats = tgt_stats["columns"][col] if col_stats["encoding_type"] == ModelEncodingType.language_numeric: - tgt_data[col] = _decode_numeric(tgt_data[col], col_stats) + tgt_data[col] = decode_numeric(tgt_data[col], col_stats) elif col_stats["encoding_type"] == ModelEncodingType.language_datetime: - tgt_data[col] = _decode_datetime(tgt_data[col], col_stats) - else: - tgt_data[col] = _decode_string(tgt_data[col], col_stats) + tgt_data[col] = decode_datetime(tgt_data[col], col_stats) + elif col_stats["encoding_type"] == ModelEncodingType.language_categorical: + tgt_data[col] = decode_categorical(tgt_data[col], col_stats) _LOG.info(f"percentage of invalid values: {invalid_percentage.to_dict()}") _LOG.info(f"decoded {tgt_data.shape} from {len(buffer.buffer)} batches in {time.time() - t0:.2f}s") return tgt_data -def _decode_string(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - x = x.astype(STRING) - allowed_categories = col_stats.get("categories", []) - return x.where(x.isin(allowed_categories), other=None) - - -def _clip_numeric(x: pd.Series, min5: list, max5: list) -> pd.Series: - x_numeric = pd.to_numeric(x, errors="coerce") - min_arr = np.array(min5, dtype=x_numeric.dtype) - max_arr = np.array(max5, dtype=x_numeric.dtype) - n = len(x_numeric) - random_mins = np.random.choice(min_arr, size=n) - random_maxs = np.random.choice(max_arr, size=n) - clipped = np.minimum(np.maximum(x_numeric.to_numpy(), random_mins), random_maxs) - return pd.Series(clipped, index=x.index) - - -def _clip_datetime(x: pd.Series, min5: list, max5: list) -> pd.Series: - x_dt = pd.to_datetime(x, errors="coerce") - min_arr = pd.to_datetime(min5).to_numpy(dtype="datetime64[ns]") - max_arr = pd.to_datetime(max5).to_numpy(dtype="datetime64[ns]") - n = len(x_dt) - random_mins = np.random.choice(min_arr, size=n) - random_maxs = np.random.choice(max_arr, size=n) - clipped = np.minimum(np.maximum(x_dt.to_numpy(dtype="datetime64[ns]"), random_mins), random_maxs) - return pd.Series(clipped, index=x.index) - - -def _decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - x = pd.to_numeric(x, errors="coerce") - x = _clip_numeric(x, col_stats["min5"], col_stats["max5"]) - # FIXME can result in OverFlowError when turning string into int in _decode_numeric in generation.py, from age '-5555555555555555555555555' -> OverflowError: Python int too large to convert to C long - if col_stats["max_scale"] == 0: - return x.astype("Int64") - return x.astype(float) - - -def _decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: - x = x.where(~x.isin(["", "_INVALID_"]), np.nan) - - valid_mask = ( - x.str.len().ge(10) - & x.str.slice(0, 4).str.isdigit() - & x.str.slice(5, 7).str.isdigit() - & x.str.slice(8, 10).str.isdigit() - ) - if valid_mask.sum() > 0: # expected "YYYY-MM-DD" prefix - # handle the date portion, ensuring validity - years = x[valid_mask].str.slice(0, 4).astype(int) - months = x[valid_mask].str.slice(5, 7).astype(int) - days = x[valid_mask].str.slice(8, 10).astype(int) - - # clamp days according to maximum possible day of the month of a given year - last_days = np.array([calendar.monthrange(y, m)[1] for y, m in zip(years, months)]) - clamped_days = np.minimum(days, last_days) - - # rebuild the date portion - new_date = ( - years.astype(str).str.zfill(4) - + "-" - + months.astype(str).str.zfill(2) - + "-" - + pd.Series(clamped_days, index=years.index).astype(str).str.zfill(2) - ) - - # handle the time portion, ensuring validity - remainder = x[valid_mask].str.slice(10) - - time_regex = r"^[ T]?(\d{2}:\d{2}:\d{2}(?:\.\d+)?)" - valid_time = remainder.str.extract(time_regex, expand=False) - valid_time = valid_time.fillna("00:00:00") - valid_time = " " + valid_time - - new_date = new_date + valid_time - x.loc[valid_mask] = new_date - - x = pd.to_datetime(x, errors="coerce") - x = _clip_datetime(x, col_stats["min5"], col_stats["max5"]) - return x.astype("datetime64[ns]") - - def generate( *, ctx_data: pd.DataFrame | None = None, diff --git a/tests/unit/encoding_types/language/__init__.py b/tests/unit/encoding_types/language/__init__.py new file mode 100644 index 0000000..a18e33e --- /dev/null +++ b/tests/unit/encoding_types/language/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/encoding_types/tabular/__init__.py b/tests/unit/encoding_types/tabular/__init__.py new file mode 100644 index 0000000..a18e33e --- /dev/null +++ b/tests/unit/encoding_types/tabular/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/encoding_types/test_categorical.py b/tests/unit/encoding_types/tabular/test_categorical.py similarity index 100% rename from tests/unit/encoding_types/test_categorical.py rename to tests/unit/encoding_types/tabular/test_categorical.py diff --git a/tests/unit/encoding_types/test_character.py b/tests/unit/encoding_types/tabular/test_character.py similarity index 100% rename from tests/unit/encoding_types/test_character.py rename to tests/unit/encoding_types/tabular/test_character.py diff --git a/tests/unit/encoding_types/test_datetime.py b/tests/unit/encoding_types/tabular/test_datetime.py similarity index 100% rename from tests/unit/encoding_types/test_datetime.py rename to tests/unit/encoding_types/tabular/test_datetime.py diff --git a/tests/unit/encoding_types/test_itt.py b/tests/unit/encoding_types/tabular/test_itt.py similarity index 100% rename from tests/unit/encoding_types/test_itt.py rename to tests/unit/encoding_types/tabular/test_itt.py diff --git a/tests/unit/encoding_types/test_lat_long.py b/tests/unit/encoding_types/tabular/test_lat_long.py similarity index 100% rename from tests/unit/encoding_types/test_lat_long.py rename to tests/unit/encoding_types/tabular/test_lat_long.py diff --git a/tests/unit/encoding_types/test_numeric.py b/tests/unit/encoding_types/tabular/test_numeric.py similarity index 100% rename from tests/unit/encoding_types/test_numeric.py rename to tests/unit/encoding_types/tabular/test_numeric.py From ccbb1e0ba778bb5d368f077d1fbd52358b25f188 Mon Sep 17 00:00:00 2001 From: michdr Date: Thu, 13 Feb 2025 12:31:57 +0100 Subject: [PATCH 41/58] add decode_text --- mostlyai/engine/_encoding_types/language/text.py | 5 ++++- mostlyai/engine/_language/generation.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/mostlyai/engine/_encoding_types/language/text.py b/mostlyai/engine/_encoding_types/language/text.py index 2e89b75..fbec61f 100644 --- a/mostlyai/engine/_encoding_types/language/text.py +++ b/mostlyai/engine/_encoding_types/language/text.py @@ -14,7 +14,7 @@ import pandas as pd -from mostlyai.engine._common import safe_convert_string +from mostlyai.engine._common import safe_convert_string, STRING def analyze_text(values: pd.Series, root_keys: pd.Series, _: pd.Series | None = None) -> dict: @@ -39,3 +39,6 @@ def analyze_reduce_text(stats_list: list[dict], _: bool = True) -> dict: "nchar_max": nchar_max, } return stats + +def decode_text(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: + return x.astype(STRING) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index d999b6c..08762ce 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -38,6 +38,7 @@ from mostlyai.engine._encoding_types.language.categorical import decode_categorical from mostlyai.engine._encoding_types.language.datetime import decode_datetime from mostlyai.engine._encoding_types.language.numeric import decode_numeric +from mostlyai.engine._encoding_types.language.text import decode_text from mostlyai.engine._language.common import estimate_max_tokens, MAX_LENGTH from mostlyai.engine._language.encoding import encode_df from mostlyai.engine._workspace import ensure_workspace_dir, Workspace, reset_dir @@ -117,6 +118,8 @@ def parse_json(x, columns: list[str]): tgt_data[col] = decode_datetime(tgt_data[col], col_stats) elif col_stats["encoding_type"] == ModelEncodingType.language_categorical: tgt_data[col] = decode_categorical(tgt_data[col], col_stats) + else: + tgt_data[col] = decode_text(tgt_data[col], col_stats) _LOG.info(f"percentage of invalid values: {invalid_percentage.to_dict()}") _LOG.info(f"decoded {tgt_data.shape} from {len(buffer.buffer)} batches in {time.time() - t0:.2f}s") From c643d253473c67ee6acc456302e69018b3e0eaf4 Mon Sep 17 00:00:00 2001 From: michdr Date: Thu, 13 Feb 2025 12:32:59 +0100 Subject: [PATCH 42/58] ruff --- mostlyai/engine/_encoding_types/language/text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mostlyai/engine/_encoding_types/language/text.py b/mostlyai/engine/_encoding_types/language/text.py index fbec61f..245699a 100644 --- a/mostlyai/engine/_encoding_types/language/text.py +++ b/mostlyai/engine/_encoding_types/language/text.py @@ -40,5 +40,6 @@ def analyze_reduce_text(stats_list: list[dict], _: bool = True) -> dict: } return stats + def decode_text(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: return x.astype(STRING) From 88b57729b8402f179d2482be7be85ee0b787761a Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 13 Feb 2025 15:02:48 +0100 Subject: [PATCH 43/58] restrict number to correct number of decimal points in grammar, negative/positive, and number of digits, and whether 0 can be a member --- mostlyai/engine/_language/formatron_utils.py | 11 +- mostlyai/engine/_language/generation.py | 1 + mostlyai/engine/_language/temp_formatron.py | 192 ++++++++++--------- tests/end_to_end/test_language.py | 50 ++++- 4 files changed, 156 insertions(+), 98 deletions(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 1a83c92..66f7068 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -75,12 +75,15 @@ def get_formatter_builders( model_dict[field_name] = (Literal[tuple(categories)], ...) # type: ignore[valid-type] elif field_name in numeric_fields: max_scale = stats["columns"][field_name]["max_scale"] - # min_min5 = min(stats["columns"][field_name]["min5"]) - # max_max5 = max(stats["columns"][field_name]["max5"]) + min_min5 = min(stats["columns"][field_name]["min5"]) + max_max5 = max(stats["columns"][field_name]["max5"]) if max_scale == 0: - model_dict[field_name] = (SkipValidation[int], ...) # , Field(ge=min_min5, le=max_max5)) + model_dict[field_name] = (SkipValidation[int], Field(ge=min_min5, le=max_max5)) else: - model_dict[field_name] = (SkipValidation[float], ...) # , Field(ge=min_min5, le=max_max5)) + model_dict[field_name] = ( + SkipValidation[float], + Field(ge=min_min5, le=max_max5, decimal_places=max_scale), + ) elif field_name in datetime_fields: model_dict[field_name] = ( SkipValidation[str], diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 08762ce..0d7d395 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -82,6 +82,7 @@ def parse_json(x, columns: list[str]): num_samples_max_length_limit += sum(1 for tokens in num_tokens_by_row if tokens >= max_new_tokens) except AttributeError: num_samples_max_length_limit = float("-inf") + outputs_text = tokenizer.batch_decode(outputs_ids, skip_special_tokens=True) output_texts.extend(outputs_text) ctx_keys.append(keys_df) diff --git a/mostlyai/engine/_language/temp_formatron.py b/mostlyai/engine/_language/temp_formatron.py index 91c0ad8..6c3c242 100644 --- a/mostlyai/engine/_language/temp_formatron.py +++ b/mostlyai/engine/_language/temp_formatron.py @@ -18,6 +18,102 @@ from formatron.formats import json +# direct copy from formatron +def _string_metadata(current: type, nonterminal: str): + min_length = current.metadata.get("min_length") + max_length = current.metadata.get("max_length") + pattern = current.metadata.get("pattern") + substring_of = current.metadata.get("substring_of") + if pattern: + assert not (min_length or max_length or substring_of), ( + "pattern is mutually exclusive with min_length, max_length and substring_of" + ) + if substring_of: + assert not (min_length or max_length or pattern), ( + "substring_of is mutually exclusive with min_length, max_length and pattern" + ) + repetition_map = { + (True, False): f"{{{min_length},}}", + (False, True): f"{{0,{max_length}}}", + (True, True): f"{{{min_length},{max_length}}}", + } + repetition = repetition_map.get((min_length is not None, max_length is not None)) + if repetition is not None: + return ( + rf"""{nonterminal} ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}}){repetition}"'; +""", + [], + ) + if pattern is not None: + pattern = pattern.replace("'", "\\'") + return f"""{nonterminal} ::= #'"{pattern}"';\n""", [] + if substring_of is not None: + return f"""{nonterminal} ::= '"' #substrs{repr(substring_of)} '"';\n""", [] + + +# completely altered +def _number_metadata(current: type, nonterminal: str): + # For now only constrains number of digits and whether it is negative + gt = current.metadata.get("gt") + ge = current.metadata.get("ge") + lt = current.metadata.get("lt") + le = current.metadata.get("le") + if lt is not None or gt is not None: + raise NotImplementedError("gt and lt are not supported for number metadata") + if le < ge: + raise ValueError("le must be greater than or equal to ge") + + pattern_parts = [] + if issubclass(current.type, float): + le, le_frac = str(le).split(".") + ge, ge_frac = str(ge).split(".") + le, le_frac = int(le), int(le_frac) + ge, ge_frac = int(ge), int(ge_frac) + decimal_places = current.metadata.get("decimal_places") + + if ge is not None and le is not None: + if ge < 0 and le < 0: + pattern_parts.append("-") + min_num = abs(le) + max_num = abs(ge) + max_digits = len(str(max_num)) + min_digits = len(str(min_num)) + pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") + elif ge > 0: + min_num = ge + max_num = le + max_digits = len(str(max_num)) + min_digits = len(str(min_num)) + pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") + else: + if ge < 0: + pattern_parts.append("-?") + max_digits = max(len(str(abs(ge))), len(str(abs(le)))) + pattern_parts.append(rf"(0|[1-9][0-9]{{0,{max_digits - 1}}})") + + if issubclass(current.type, float): + # FIXME: currently is not constrained + pattern_parts.append(rf"(\\.[0-9]{{0,{decimal_places}}})?") + + pattern = "".join(pattern_parts) + return f"""{nonterminal} ::= #"{pattern}";\n""", [] + + +# removed sequence metadata since unnecessary and altered number_metadata to use ours +def _metadata(current: type, nonterminal: str): + if isinstance(current, schemas.schema.TypeWithMetadata): + original = typing.get_origin(current.type) + if original is None: + original = current.type + if not current.metadata: + return "", [(current.type, nonterminal)] + if isinstance(current.type, type) and issubclass(current.type, str): + return _string_metadata(current, nonterminal) + elif isinstance(current.type, type) and issubclass(current.type, (int, float)): + return _number_metadata(current, nonterminal) + return None + + def monkey_patch_formatron(): FORMATRON_WHITESPACE_MAX_REPETITIONS = 10 SPACE_NONTERMINAL = f"[ \t\n\r]{{0,{FORMATRON_WHITESPACE_MAX_REPETITIONS}}}" @@ -39,99 +135,9 @@ def monkey_patch_formatron(): array_end ::= #"{SPACE_NONTERMINAL}\\]"; """ - # direct copy from formatron - def string_metadata(current: type, nonterminal: str): - min_length = current.metadata.get("min_length") - max_length = current.metadata.get("max_length") - pattern = current.metadata.get("pattern") - substring_of = current.metadata.get("substring_of") - if pattern: - assert not (min_length or max_length or substring_of), ( - "pattern is mutually exclusive with min_length, max_length and substring_of" - ) - if substring_of: - assert not (min_length or max_length or pattern), ( - "substring_of is mutually exclusive with min_length, max_length and pattern" - ) - repetition_map = { - (True, False): f"{{{min_length},}}", - (False, True): f"{{0,{max_length}}}", - (True, True): f"{{{min_length},{max_length}}}", - } - repetition = repetition_map.get((min_length is not None, max_length is not None)) - if repetition is not None: - return ( - rf"""{nonterminal} ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}}){repetition}"'; - """, - [], - ) - if pattern is not None: - pattern = pattern.replace("'", "\\'") - return f"""{nonterminal} ::= #'"{pattern}"';\n""", [] - if substring_of is not None: - return f"""{nonterminal} ::= '"' #substrs{repr(substring_of)} '"';\n""", [] - - # altered - def number_metadata(current: type, nonterminal: str): - # For now only constrains number of digits and whether it is negative - gt = current.metadata.get("gt") - ge = current.metadata.get("ge") - lt = current.metadata.get("lt") - le = current.metadata.get("le") - if lt is not None or gt is not None: - raise NotImplementedError("gt and lt are not supported for number metadata") - if le < ge: - raise ValueError("le must be greater than or equal to ge") - - pattern_parts = [] - if issubclass(current.type, float): - le, le_frac = le.split(".") - ge, ge_frac = ge.split(".") - - if ge is not None and le is not None: - if ge < 0 and le < 0: - pattern_parts.append("-") - min_num = abs(le) - max_num = abs(ge) - max_digits = len(str(max_num)) - min_digits = len(str(min_num)) - pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") - elif ge > 0: - min_num = ge - max_num = le - max_digits = len(str(max_num)) - min_digits = len(str(min_num)) - pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") - else: - if ge < 0: - pattern_parts.append("-?") - max_digits = max(len(str(abs(ge))), len(str(abs(le)))) - pattern_parts.append(rf"(0|[1-9][0-9]{{0,{max_digits - 1}}})") - - if issubclass(current.type, float): - # FIXME: currently is not constrained - pattern_parts.append(r"(\\.\\d+)?") - - pattern = "".join(pattern_parts) - return f"""{nonterminal} ::= #"{pattern}";\n""", [] - - # removed sequence metadata since unnecessary and altered number_metadata to use ours - def metadata(current: type, nonterminal: str): - if isinstance(current, schemas.schema.TypeWithMetadata): - original = typing.get_origin(current.type) - if original is None: - original = current.type - if not current.metadata: - return "", [(current.type, nonterminal)] - if isinstance(current.type, type) and issubclass(current.type, str): - return string_metadata(current, nonterminal) - elif isinstance(current.type, type) and issubclass(current.type, (int, float)): - return number_metadata(current, nonterminal) - return None - - def alter_type_to_nonterminals_inplace(type_to_nonterminals: list[typing.Callable]): + def alter_type_to_nonterminals_metadata_inplace(type_to_nonterminals: list[typing.Callable]): metadata_idx = [idx for idx, fn in enumerate(type_to_nonterminals) if fn.__name__ == "metadata"] assert len(metadata_idx) == 1, "metadata function must be present and unique" - type_to_nonterminals[metadata_idx[0]] = metadata + type_to_nonterminals[metadata_idx[0]] = _metadata - alter_type_to_nonterminals_inplace(json._type_to_nonterminals) + alter_type_to_nonterminals_metadata_inplace(json._type_to_nonterminals) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index de4b626..a181540 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -37,7 +37,7 @@ DifferentialPrivacyConfig, RareCategoryReplacementMethod, ) - +from mostlyai.engine._language.temp_formatron import _number_metadata from mostlyai.engine._language.formatron_utils import get_formatter_builders from formatron.integrations.transformers import create_formatter_logits_processor_list @@ -503,3 +503,51 @@ def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_datas if not dates.empty: assert dates.min() >= pd.Timestamp("2019-01-06") assert dates.max() <= pd.Timestamp("2026-01-05") + + +def test_number_metadata(): + class TypeWithMetadata: + def __init__(self, type, metadata): + self.type = type + self.metadata = metadata + + # test positive integer range + number_type = TypeWithMetadata(int, {"ge": 10, "le": 450}) + pattern, deps = _number_metadata(number_type, "test_number") + + assert deps == [] + # should match 2-3 digit numbers between 10-999 + assert 'test_number ::= #"([1-9][0-9]{1,2})";\n' in pattern + + # test negative integer range + number_type = TypeWithMetadata(int, {"ge": -269, "le": -10}) + pattern, deps = _number_metadata(number_type, "test_number") + + # should match negative 2-3 digit numbers + assert 'test_number ::= #"-([1-9][0-9]{1,2})";\n' in pattern + + # test range including both negative and positive + number_type = TypeWithMetadata(int, {"ge": -10, "le": 100}) + pattern, deps = _number_metadata(number_type, "test_number") + + # should allow optional negative sign and up to 3 digits and 0 + assert 'test_number ::= #"-?(0|[1-9][0-9]{0,2})";\n' in pattern + + # test float with decimal places + number_type = TypeWithMetadata(float, {"ge": 0.0, "le": 100.0, "decimal_places": 2}) + pattern, deps = _number_metadata(number_type, "test_number") + + # should match numbers with optional decimal part + assert r'test_number ::= #"(0|[1-9][0-9]{0,2})(\\.[0-9]{0,2})?";' + "\n" in pattern + + # test invalid range where le < ge + number_type = TypeWithMetadata(int, {"ge": 100, "le": 10}) + + with pytest.raises(ValueError, match="le must be greater than or equal to ge"): + _number_metadata(number_type, "test_number") + + # test unsupported gt/lt constraints + number_type = TypeWithMetadata(int, {"gt": 10, "lt": 100}) + + with pytest.raises(NotImplementedError, match="gt and lt are not supported for number metadata"): + _number_metadata(number_type, "test_number") From 8a3506cd36bba4fb692e95f7ddf44d343f53958b Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 13 Feb 2025 15:37:47 +0100 Subject: [PATCH 44/58] fix datetime pattern --- mostlyai/engine/_language/formatron_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 66f7068..e3e72c9 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -88,7 +88,7 @@ def get_formatter_builders( model_dict[field_name] = ( SkipValidation[str], Field( - pattern=r"""(19\\d{2}|20\\d{2})-(0[1-9]|1[0-2])-(0[1-9]|1[0-9]|2[0-9]|3[0-1]) ([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])""" + pattern=r"""(19\\d{2}|20\\d{2})-(0[1-9]|1[0-2])-(0[1-9]|1[0-9]|2[0-9]|3[0-1])T([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])""" ), ) else: From 6ee92e17e9780a49ac67fcf98d7bbea86f344c66 Mon Sep 17 00:00:00 2001 From: michdr Date: Thu, 13 Feb 2025 16:14:39 +0100 Subject: [PATCH 45/58] add unit tests + improve decode_numeric --- .../_encoding_types/language/numeric.py | 6 +- tests/end_to_end/test_language.py | 4 +- .../language/test_categorical.py | 33 ++++++++ .../encoding_types/language/test_datetime.py | 77 +++++++++++++++++++ .../encoding_types/language/test_numeric.py | 66 ++++++++++++++++ 5 files changed, 181 insertions(+), 5 deletions(-) create mode 100644 tests/unit/encoding_types/language/test_categorical.py create mode 100644 tests/unit/encoding_types/language/test_datetime.py create mode 100644 tests/unit/encoding_types/language/test_numeric.py diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index 8a99820..a460ac1 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -130,8 +130,8 @@ def _clip_numeric(x: pd.Series, min5: list, max5: list) -> pd.Series: def decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: x = pd.to_numeric(x, errors="coerce") + x = x.round(col_stats["max_scale"]) x = _clip_numeric(x, col_stats["min5"], col_stats["max5"]) # FIXME can result in OverFlowError when turning string into int in _decode_numeric in generation.py, from age '-5555555555555555555555555' -> OverflowError: Python int too large to convert to C long - if col_stats["max_scale"] == 0: - return x.astype("Int64") - return x.astype(float) + dtype = "Int64" if col_stats["max_scale"] == 0 else float + return x.astype(dtype) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index a181540..9b0e051 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -468,8 +468,8 @@ def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): ("model_name"), [ LSTMFromScratchConfig.model_id, - "amd/AMD-Llama-135m", - "openai-community/gpt2", # TEMP, better model than AMD + # "amd/AMD-Llama-135m", + # "openai-community/gpt2", # TEMP, better model than AMD ], ) def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_dataset, model_name): diff --git a/tests/unit/encoding_types/language/test_categorical.py b/tests/unit/encoding_types/language/test_categorical.py new file mode 100644 index 0000000..4790d59 --- /dev/null +++ b/tests/unit/encoding_types/language/test_categorical.py @@ -0,0 +1,33 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +from mostlyai.engine._encoding_types.language.categorical import decode_categorical + + +class TestCategoricalDecode: + @pytest.fixture + def col_stats(self): + return {"categories": ["apple", "banana", "cherry"]} + + @pytest.fixture + def sample_values(self): + return pd.Series(["apple", "durian", "banana", "elderberry", "cherry", "fig", None]) + + def test_decode_categorical(self, sample_values, col_stats): + decoded = decode_categorical(sample_values, col_stats) + expected = pd.Series(["apple", None, "banana", None, "cherry", None, None], dtype=decoded.dtype) + pd.testing.assert_series_equal(decoded, expected) diff --git a/tests/unit/encoding_types/language/test_datetime.py b/tests/unit/encoding_types/language/test_datetime.py new file mode 100644 index 0000000..5bb4cc3 --- /dev/null +++ b/tests/unit/encoding_types/language/test_datetime.py @@ -0,0 +1,77 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pandas as pd +import pytest + +from mostlyai.engine._encoding_types.language.datetime import decode_datetime +from mostlyai.engine.domain import ModelEncodingType + + +class TestDatetimeDecode: + @pytest.fixture + def datetime_stats(self): + return { + "encoding_type": ModelEncodingType.language_datetime, + "has_nan": True, + "min5": ["2000-01-01"] * 5, + "max5": ["2024-12-31"] * 5, + } + + @pytest.fixture + def no_clip_stats(self): + return { + "encoding_type": ModelEncodingType.language_datetime, + "has_nan": True, + "min5": ["1900-01-01"] * 5, + "max5": ["2100-01-01"] * 5, + } + + @pytest.fixture + def sample_dates(self): + return pd.Series( + [ + "2021-05-20 14:30:00", # valid datetime with time + "2020-02-30", # Feb 30 is invalid; should be clamped to Feb 29, 2020 + "1999-12-31", # below the min bound -> will be clipped upward + "2025-01-01", # above the max bound -> will be clipped downward + "abcd", # invalid date string -> becomes NaT + "", # empty string -> becomes NaT + "_INVALID_", # marked as invalid -> becomes NaT + "2010-10-10", # valid date without explicit time (defaults to 00:00:00) + ] + ) + + def test_datetime_dtype_bounds_and_invalids(self, sample_dates, datetime_stats): + decoded = decode_datetime(sample_dates, datetime_stats) + assert decoded.dtype == "datetime64[ns]" + non_null = decoded.dropna() + min_bound = pd.to_datetime(datetime_stats["min5"][0]) + max_bound = pd.to_datetime(datetime_stats["max5"][0]) + for dt in non_null: + assert dt >= min_bound + assert dt <= max_bound + assert all(pd.isna(decoded.iloc[4:7])) + + def test_date_day_clamping(self, no_clip_stats): + s = pd.Series(["2021-04-31"]) + decoded = decode_datetime(s, no_clip_stats) + expected = pd.Timestamp("2021-04-30 00:00:00") + assert decoded.iloc[0] == expected + + def test_time_extraction(self, no_clip_stats): + s = pd.Series(["2021-07-15T23:59:59.123"]) + decoded = decode_datetime(s, no_clip_stats) + expected = pd.Timestamp("2021-07-15 23:59:59.123") + assert decoded.iloc[0] == expected diff --git a/tests/unit/encoding_types/language/test_numeric.py b/tests/unit/encoding_types/language/test_numeric.py new file mode 100644 index 0000000..41fcca8 --- /dev/null +++ b/tests/unit/encoding_types/language/test_numeric.py @@ -0,0 +1,66 @@ +# Copyright 2025 MOSTLY AI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + +from mostlyai.engine._encoding_types.language.numeric import decode_numeric +from mostlyai.engine.domain import ModelEncodingType + + +class TestNumericDecode: + @pytest.fixture + def int_stats(self): + return { + "encoding_type": ModelEncodingType.language_numeric, + "has_nan": False, + "max5": [91] * 5, + "max_scale": 0, + "min5": [17] * 5, + } + + @pytest.fixture + def float_stats(self): + return { + "encoding_type": ModelEncodingType.language_numeric, + "has_nan": False, + "max5": [91.12] * 5, + "max_scale": 2, + "min5": [17.0] * 5, + } + + @pytest.fixture + def sample_values(self): + return pd.Series(["25.3541", "99.99", "-312.0", "61", None, "35.10091", "-1.223"]) + + @pytest.mark.parametrize( + "stats_name, expected_dtype", + [ + ("int_stats", "Int64"), + ("float_stats", float), + ], + ) + def test_decode_numeric(self, sample_values, request, stats_name, expected_dtype): + stats = request.getfixturevalue(stats_name) + decoded = decode_numeric(sample_values, stats) + assert decoded.dtype == expected_dtype + non_null = decoded.dropna() # we don't enforce compatability with "has_nan" + max_val = stats["max5"][0] + min_val = stats["min5"][0] + round_digits = stats["max_scale"] + for v in non_null: + assert np.isclose(v, round(v, round_digits), atol=1e-8) + assert all(non_null <= max_val) + assert all(non_null >= min_val) From 9c20ee2283eb30be52c5276df69a0e1eee753656 Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 13 Feb 2025 16:44:14 +0100 Subject: [PATCH 46/58] fix monkey patch --- mostlyai/engine/_language/temp_formatron.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mostlyai/engine/_language/temp_formatron.py b/mostlyai/engine/_language/temp_formatron.py index 6c3c242..02cf848 100644 --- a/mostlyai/engine/_language/temp_formatron.py +++ b/mostlyai/engine/_language/temp_formatron.py @@ -137,7 +137,7 @@ def monkey_patch_formatron(): def alter_type_to_nonterminals_metadata_inplace(type_to_nonterminals: list[typing.Callable]): metadata_idx = [idx for idx, fn in enumerate(type_to_nonterminals) if fn.__name__ == "metadata"] - assert len(metadata_idx) == 1, "metadata function must be present and unique" - type_to_nonterminals[metadata_idx[0]] = _metadata + if len(metadata_idx) == 1: + type_to_nonterminals[metadata_idx[0]] = _metadata alter_type_to_nonterminals_metadata_inplace(json._type_to_nonterminals) From 089cf62736456350411ba6e43faf814be2ef2f3e Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 13 Feb 2025 16:49:30 +0100 Subject: [PATCH 47/58] remove temp_formatron.py and move code into formatron_utils.py --- mostlyai/engine/_language/engine/hf_engine.py | 2 +- .../engine/_language/engine/vllm_engine.py | 2 +- mostlyai/engine/_language/formatron_utils.py | 128 ++++++++++++++++ mostlyai/engine/_language/temp_formatron.py | 143 ------------------ tests/end_to_end/test_language.py | 3 +- 5 files changed, 131 insertions(+), 147 deletions(-) delete mode 100644 mostlyai/engine/_language/temp_formatron.py diff --git a/mostlyai/engine/_language/engine/hf_engine.py b/mostlyai/engine/_language/engine/hf_engine.py index e64fce0..88f30a2 100644 --- a/mostlyai/engine/_language/engine/hf_engine.py +++ b/mostlyai/engine/_language/engine/hf_engine.py @@ -23,7 +23,7 @@ from transformers import AutoTokenizer from mostlyai.engine._language.common import load_base_model_and_config -from mostlyai.engine._language.temp_formatron import monkey_patch_formatron +from mostlyai.engine._language.formatron_utils import monkey_patch_formatron from mostlyai.engine._language.tokenizer_utils import tokenize_fn from mostlyai.engine._language.engine.base import EngineMetrics, LanguageEngine diff --git a/mostlyai/engine/_language/engine/vllm_engine.py b/mostlyai/engine/_language/engine/vllm_engine.py index d66552d..afa1268 100644 --- a/mostlyai/engine/_language/engine/vllm_engine.py +++ b/mostlyai/engine/_language/engine/vllm_engine.py @@ -26,7 +26,7 @@ from peft import PeftConfig from transformers import AutoTokenizer, AutoConfig, PreTrainedTokenizerBase -from mostlyai.engine._language.temp_formatron import monkey_patch_formatron +from mostlyai.engine._language.formatron_utils import monkey_patch_formatron from vllm import LLM, SamplingParams from vllm.lora.request import LoRARequest from vllm.config import _get_and_verify_max_len diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index e3e72c9..6f1180c 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -15,11 +15,14 @@ import typing + import pandas as pd from formatron.schemas.pydantic import ClassSchema from json import JSONDecodeError from pydantic import Field, SkipValidation, ValidationError from formatron.formatter import FormatterBuilder +from formatron import schemas +from formatron.formats import json from typing import Literal from pydantic import create_model from transformers import PreTrainedTokenizerBase @@ -129,3 +132,128 @@ def from_json(cls, _json: str) -> "MostlyClassSchema": f"Caught pydantic ValidationError {e}, reraising as JSONDecodeError", _json, 0 ) raise e + + +# copy formatron: direct copy from formatron +def _string_metadata(current: type, nonterminal: str): + min_length = current.metadata.get("min_length") + max_length = current.metadata.get("max_length") + pattern = current.metadata.get("pattern") + substring_of = current.metadata.get("substring_of") + if pattern: + assert not (min_length or max_length or substring_of), ( + "pattern is mutually exclusive with min_length, max_length and substring_of" + ) + if substring_of: + assert not (min_length or max_length or pattern), ( + "substring_of is mutually exclusive with min_length, max_length and pattern" + ) + repetition_map = { + (True, False): f"{{{min_length},}}", + (False, True): f"{{0,{max_length}}}", + (True, True): f"{{{min_length},{max_length}}}", + } + repetition = repetition_map.get((min_length is not None, max_length is not None)) + if repetition is not None: + return ( + rf"""{nonterminal} ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}}){repetition}"'; +""", + [], + ) + if pattern is not None: + pattern = pattern.replace("'", "\\'") + return f"""{nonterminal} ::= #'"{pattern}"';\n""", [] + if substring_of is not None: + return f"""{nonterminal} ::= '"' #substrs{repr(substring_of)} '"';\n""", [] + + +# completely altered vs formatron +def _number_metadata(current: type, nonterminal: str): + # For now only constrains number of digits and whether it is negative + gt = current.metadata.get("gt") + ge = current.metadata.get("ge") + lt = current.metadata.get("lt") + le = current.metadata.get("le") + if lt is not None or gt is not None: + raise NotImplementedError("gt and lt are not supported for number metadata") + if le < ge: + raise ValueError("le must be greater than or equal to ge") + + pattern_parts = [] + if issubclass(current.type, float): + le, le_frac = str(le).split(".") + ge, ge_frac = str(ge).split(".") + le, le_frac = int(le), int(le_frac) + ge, ge_frac = int(ge), int(ge_frac) + decimal_places = current.metadata.get("decimal_places") + + if ge is not None and le is not None: + if ge < 0 and le < 0: + pattern_parts.append("-") + min_num = abs(le) + max_num = abs(ge) + max_digits = len(str(max_num)) + min_digits = len(str(min_num)) + pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") + elif ge > 0: + min_num = ge + max_num = le + max_digits = len(str(max_num)) + min_digits = len(str(min_num)) + pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") + else: + if ge < 0: + pattern_parts.append("-?") + max_digits = max(len(str(abs(ge))), len(str(abs(le)))) + pattern_parts.append(rf"(0|[1-9][0-9]{{0,{max_digits - 1}}})") + + if issubclass(current.type, float): + # FIXME: currently is not constrained + pattern_parts.append(rf"(\\.[0-9]{{0,{decimal_places}}})?") + + pattern = "".join(pattern_parts) + return f"""{nonterminal} ::= #"{pattern}";\n""", [] + + +# copy formatron: removed sequence metadata since unnecessary and altered number_metadata to use ours +def _metadata(current: type, nonterminal: str): + if isinstance(current, schemas.schema.TypeWithMetadata): + original = typing.get_origin(current.type) + if original is None: + original = current.type + if not current.metadata: + return "", [(current.type, nonterminal)] + if isinstance(current.type, type) and issubclass(current.type, str): + return _string_metadata(current, nonterminal) + elif isinstance(current.type, type) and issubclass(current.type, (int, float)): + return _number_metadata(current, nonterminal) + return None + + +def monkey_patch_formatron(): + FORMATRON_WHITESPACE_MAX_REPETITIONS = 10 + SPACE_NONTERMINAL = f"[ \t\n\r]{{0,{FORMATRON_WHITESPACE_MAX_REPETITIONS}}}" + + # Copy from formatron, altered to have limited whitespace repetitions and datetime format + json.GRAMMAR_HEADER = rf"""integer ::= #"-?(0|[1-9]\\d*)"; + number ::= #"-?(0|[1-9]\\d*)(\\.\\d+)?([eE][+-]?\\d+)?"; + string ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}})*"'; + boolean ::= "true"|"false"; + null ::= "null"; + array ::= array_begin (json_value (comma json_value)*)? array_end; + object ::= object_begin (string colon json_value (comma string colon json_value)*)? object_end; + json_value ::= number|string|boolean|null|array|object; + comma ::= #"{SPACE_NONTERMINAL},{SPACE_NONTERMINAL}"; + colon ::= #"{SPACE_NONTERMINAL}:{SPACE_NONTERMINAL}"; + object_begin ::= #" \\{{{SPACE_NONTERMINAL}"; + object_end ::= #"{SPACE_NONTERMINAL}\\}}"; + array_begin ::= #"\\[{SPACE_NONTERMINAL}"; + array_end ::= #"{SPACE_NONTERMINAL}\\]"; + """ + + def alter_type_to_nonterminals_metadata_inplace(type_to_nonterminals: list[typing.Callable]): + metadata_idx = [idx for idx, fn in enumerate(type_to_nonterminals) if fn.__name__ == "metadata"] + if len(metadata_idx) == 1: + type_to_nonterminals[metadata_idx[0]] = _metadata + + alter_type_to_nonterminals_metadata_inplace(json._type_to_nonterminals) diff --git a/mostlyai/engine/_language/temp_formatron.py b/mostlyai/engine/_language/temp_formatron.py deleted file mode 100644 index 02cf848..0000000 --- a/mostlyai/engine/_language/temp_formatron.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright 2025 MOSTLY AI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import typing - -from formatron import schemas -from formatron.formats import json - - -# direct copy from formatron -def _string_metadata(current: type, nonterminal: str): - min_length = current.metadata.get("min_length") - max_length = current.metadata.get("max_length") - pattern = current.metadata.get("pattern") - substring_of = current.metadata.get("substring_of") - if pattern: - assert not (min_length or max_length or substring_of), ( - "pattern is mutually exclusive with min_length, max_length and substring_of" - ) - if substring_of: - assert not (min_length or max_length or pattern), ( - "substring_of is mutually exclusive with min_length, max_length and pattern" - ) - repetition_map = { - (True, False): f"{{{min_length},}}", - (False, True): f"{{0,{max_length}}}", - (True, True): f"{{{min_length},{max_length}}}", - } - repetition = repetition_map.get((min_length is not None, max_length is not None)) - if repetition is not None: - return ( - rf"""{nonterminal} ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}}){repetition}"'; -""", - [], - ) - if pattern is not None: - pattern = pattern.replace("'", "\\'") - return f"""{nonterminal} ::= #'"{pattern}"';\n""", [] - if substring_of is not None: - return f"""{nonterminal} ::= '"' #substrs{repr(substring_of)} '"';\n""", [] - - -# completely altered -def _number_metadata(current: type, nonterminal: str): - # For now only constrains number of digits and whether it is negative - gt = current.metadata.get("gt") - ge = current.metadata.get("ge") - lt = current.metadata.get("lt") - le = current.metadata.get("le") - if lt is not None or gt is not None: - raise NotImplementedError("gt and lt are not supported for number metadata") - if le < ge: - raise ValueError("le must be greater than or equal to ge") - - pattern_parts = [] - if issubclass(current.type, float): - le, le_frac = str(le).split(".") - ge, ge_frac = str(ge).split(".") - le, le_frac = int(le), int(le_frac) - ge, ge_frac = int(ge), int(ge_frac) - decimal_places = current.metadata.get("decimal_places") - - if ge is not None and le is not None: - if ge < 0 and le < 0: - pattern_parts.append("-") - min_num = abs(le) - max_num = abs(ge) - max_digits = len(str(max_num)) - min_digits = len(str(min_num)) - pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") - elif ge > 0: - min_num = ge - max_num = le - max_digits = len(str(max_num)) - min_digits = len(str(min_num)) - pattern_parts.append(rf"([1-9][0-9]{{{min_digits - 1},{max_digits - 1}}})") - else: - if ge < 0: - pattern_parts.append("-?") - max_digits = max(len(str(abs(ge))), len(str(abs(le)))) - pattern_parts.append(rf"(0|[1-9][0-9]{{0,{max_digits - 1}}})") - - if issubclass(current.type, float): - # FIXME: currently is not constrained - pattern_parts.append(rf"(\\.[0-9]{{0,{decimal_places}}})?") - - pattern = "".join(pattern_parts) - return f"""{nonterminal} ::= #"{pattern}";\n""", [] - - -# removed sequence metadata since unnecessary and altered number_metadata to use ours -def _metadata(current: type, nonterminal: str): - if isinstance(current, schemas.schema.TypeWithMetadata): - original = typing.get_origin(current.type) - if original is None: - original = current.type - if not current.metadata: - return "", [(current.type, nonterminal)] - if isinstance(current.type, type) and issubclass(current.type, str): - return _string_metadata(current, nonterminal) - elif isinstance(current.type, type) and issubclass(current.type, (int, float)): - return _number_metadata(current, nonterminal) - return None - - -def monkey_patch_formatron(): - FORMATRON_WHITESPACE_MAX_REPETITIONS = 10 - SPACE_NONTERMINAL = f"[ \t\n\r]{{0,{FORMATRON_WHITESPACE_MAX_REPETITIONS}}}" - - # Copy from formatron, altered to have limited whitespace repetitions and datetime format - json.GRAMMAR_HEADER = rf"""integer ::= #"-?(0|[1-9]\\d*)"; - number ::= #"-?(0|[1-9]\\d*)(\\.\\d+)?([eE][+-]?\\d+)?"; - string ::= #'"([^\\\\"\u0000-\u001f]|\\\\["\\\\bfnrt/]|\\\\u[0-9A-Fa-f]{{4}})*"'; - boolean ::= "true"|"false"; - null ::= "null"; - array ::= array_begin (json_value (comma json_value)*)? array_end; - object ::= object_begin (string colon json_value (comma string colon json_value)*)? object_end; - json_value ::= number|string|boolean|null|array|object; - comma ::= #"{SPACE_NONTERMINAL},{SPACE_NONTERMINAL}"; - colon ::= #"{SPACE_NONTERMINAL}:{SPACE_NONTERMINAL}"; - object_begin ::= #" \\{{{SPACE_NONTERMINAL}"; - object_end ::= #"{SPACE_NONTERMINAL}\\}}"; - array_begin ::= #"\\[{SPACE_NONTERMINAL}"; - array_end ::= #"{SPACE_NONTERMINAL}\\]"; - """ - - def alter_type_to_nonterminals_metadata_inplace(type_to_nonterminals: list[typing.Callable]): - metadata_idx = [idx for idx, fn in enumerate(type_to_nonterminals) if fn.__name__ == "metadata"] - if len(metadata_idx) == 1: - type_to_nonterminals[metadata_idx[0]] = _metadata - - alter_type_to_nonterminals_metadata_inplace(json._type_to_nonterminals) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 9b0e051..596fde9 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -37,8 +37,7 @@ DifferentialPrivacyConfig, RareCategoryReplacementMethod, ) -from mostlyai.engine._language.temp_formatron import _number_metadata -from mostlyai.engine._language.formatron_utils import get_formatter_builders +from mostlyai.engine._language.formatron_utils import get_formatter_builders, _number_metadata from formatron.integrations.transformers import create_formatter_logits_processor_list From 669225825109f9547d8e428ec43ece68a7895160 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Thu, 13 Feb 2025 17:25:52 +0100 Subject: [PATCH 48/58] fix numeric training --- mostlyai/engine/_language/training.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mostlyai/engine/_language/training.py b/mostlyai/engine/_language/training.py index 95edc70..15eebc4 100644 --- a/mostlyai/engine/_language/training.py +++ b/mostlyai/engine/_language/training.py @@ -38,7 +38,6 @@ from torch.utils.data import DataLoader from mostlyai.engine._common import ( - STRING, ProgressCallback, ProgressCallbackWrapper, TABLE_COLUMN_INFIX, @@ -272,7 +271,7 @@ def train( raw_dataset = load_dataset("parquet", data_files=data_files) def shuffle_tgt_columns(x): - x_tgt = pd.DataFrame([json.loads(x.pop("tgt"))], dtype=STRING) # convert to DataFrame + x_tgt = pd.DataFrame([json.loads(x.pop("tgt"))]) # convert to DataFrame x_tgt = x_tgt.sample(frac=1, axis=1) # shuffle columns x_tgt = row_to_json( x_tgt.add_prefix("tgt" + TABLE_COLUMN_INFIX).squeeze(axis=0), is_target=True From da28b5cab4812cc73af6be32f19b172a98915072 Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Thu, 13 Feb 2025 17:40:14 +0100 Subject: [PATCH 49/58] make max5, min5 maintain numeric dtype (int for int and float for float) rather than always float --- mostlyai/engine/_encoding_types/language/numeric.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index a460ac1..4e98710 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -25,9 +25,9 @@ def analyze_language_numeric(values: pd.Series, root_keys: pd.Series, _: pd.Seri # determine lowest/highest values by root ID, and return top 11 df = pd.concat([root_keys, values], axis=1) min_values = df.groupby(root_keys.name)[values.name].min().dropna() - min11 = min_values.sort_values(ascending=True).head(11).astype("float").tolist() + min11 = min_values.sort_values(ascending=True).head(11).tolist() max_values = df.groupby(root_keys.name)[values.name].max().dropna() - max11 = max_values.sort_values(ascending=False).head(11).astype("float").tolist() + max11 = max_values.sort_values(ascending=False).head(11).tolist() # determine if there are any NaN values has_nan = bool(values.isna().any()) From 7b4bc2f40b8173617c3bc4c30643bb9558e186c9 Mon Sep 17 00:00:00 2001 From: Michael Platzer Date: Fri, 14 Feb 2025 08:04:21 +0100 Subject: [PATCH 50/58] added description for new enc types --- mostlyai/engine/domain.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mostlyai/engine/domain.py b/mostlyai/engine/domain.py index 1449e66..ac71625 100644 --- a/mostlyai/engine/domain.py +++ b/mostlyai/engine/domain.py @@ -47,10 +47,10 @@ class ModelEncodingType(str, Enum): - `TABULAR_DATETIME`: Model samples each part of a datetime value. - `TABULAR_DATETIME_RELATIVE`: Model samples the relative difference between datetimes within a sequence. - `TABULAR_LAT_LONG`: Model samples a latitude-longitude column. The format is "latitude,longitude". - - `LANGUAGE_TEXT`: Model will train a distinct LANGUAGE model for this column, to then generate free text. - - `LANGUAGE_CATEGORICAL`: TODO - - `LANGUAGE_NUMERIC`: TODO - - `LANGUAGE_DATETIME`: TODO + - `LANGUAGE_TEXT`: Model will sample free text, using a LANGUAGE model. + - `LANGUAGE_CATEGORICAL`: Model samples from existing (non-rare) categories, using a LANGUAGE model. + - `LANGUAGE_NUMERIC`: Model samples from the valid numeric value range, using a LANGUAGE model. + - `LANGUAGE_DATETIME`: Model samples from the valid datetime value range, using a LANGUAGE model. """ auto = "AUTO" From 6e00e98cf23f4e83e1398a280c59e15b3cd2f140 Mon Sep 17 00:00:00 2001 From: michdr Date: Fri, 14 Feb 2025 10:07:12 +0100 Subject: [PATCH 51/58] re-add disabled models in test_categorical_numeric_datetime --- tests/end_to_end/test_language.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/end_to_end/test_language.py b/tests/end_to_end/test_language.py index 596fde9..b5a3add 100644 --- a/tests/end_to_end/test_language.py +++ b/tests/end_to_end/test_language.py @@ -467,8 +467,8 @@ def encoded_numeric_categorical_datetime_dataset(tmp_path_factory): ("model_name"), [ LSTMFromScratchConfig.model_id, - # "amd/AMD-Llama-135m", - # "openai-community/gpt2", # TEMP, better model than AMD + "amd/AMD-Llama-135m", + "openai-community/gpt2", # TEMP, better model than AMD ], ) def test_categorical_numeric_datetime(encoded_numeric_categorical_datetime_dataset, model_name): From 743ec3e922b37d99f0b4a994e21f8e736396f838 Mon Sep 17 00:00:00 2001 From: andre-mostly Date: Mon, 17 Feb 2025 10:27:09 +0100 Subject: [PATCH 52/58] fix comments --- mostlyai/engine/_encoding_types/language/numeric.py | 1 - mostlyai/engine/_language/formatron_utils.py | 1 - 2 files changed, 2 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index 4e98710..bd60490 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -132,6 +132,5 @@ def decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: x = pd.to_numeric(x, errors="coerce") x = x.round(col_stats["max_scale"]) x = _clip_numeric(x, col_stats["min5"], col_stats["max5"]) - # FIXME can result in OverFlowError when turning string into int in _decode_numeric in generation.py, from age '-5555555555555555555555555' -> OverflowError: Python int too large to convert to C long dtype = "Int64" if col_stats["max_scale"] == 0 else float return x.astype(dtype) diff --git a/mostlyai/engine/_language/formatron_utils.py b/mostlyai/engine/_language/formatron_utils.py index 6f1180c..ec6df8b 100644 --- a/mostlyai/engine/_language/formatron_utils.py +++ b/mostlyai/engine/_language/formatron_utils.py @@ -208,7 +208,6 @@ def _number_metadata(current: type, nonterminal: str): pattern_parts.append(rf"(0|[1-9][0-9]{{0,{max_digits - 1}}})") if issubclass(current.type, float): - # FIXME: currently is not constrained pattern_parts.append(rf"(\\.[0-9]{{0,{decimal_places}}})?") pattern = "".join(pattern_parts) From 77e55c087dbb76e9190abfe8bf486f6e1675255d Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 17 Feb 2025 11:08:31 +0100 Subject: [PATCH 53/58] refactor analyze --- mostlyai/engine/analysis.py | 81 ++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 37 deletions(-) diff --git a/mostlyai/engine/analysis.py b/mostlyai/engine/analysis.py index eea3bac..b3b286c 100644 --- a/mostlyai/engine/analysis.py +++ b/mostlyai/engine/analysis.py @@ -93,6 +93,9 @@ ModelEncodingType.tabular_numeric_binned, ModelEncodingType.tabular_datetime, ModelEncodingType.tabular_datetime_relative, + ModelEncodingType.language_categorical, + ModelEncodingType.language_numeric, + ModelEncodingType.language_datetime, ) @@ -324,22 +327,7 @@ def _analyze_reduce( column: column_stats.get("encoding_type") for column, column_stats in stats_list[0]["columns"].items() } - # build mapping of original column name to ARGN table and column identifiers - def get_table(qualified_column_name: str) -> str: - # column names are assumed to be :: - return qualified_column_name.split(TABLE_COLUMN_INFIX)[0] - - def get_unique_tables(qualified_column_names: Iterable[str]) -> list[str]: - duplicated_tables = [get_table(c) for c in qualified_column_names] - return list(dict.fromkeys(duplicated_tables)) - - unique_tables = get_unique_tables(encoding_types.keys()) - argn_identifiers: dict[str, tuple[str, str]] = { - c: (f"t{unique_tables.index(get_table(qualified_column_name=c))}", f"c{idx}") - for idx, c in enumerate(encoding_types.keys()) - } - - for i, column in enumerate(encoding_types.keys()): + for column in encoding_types: encoding_type = encoding_types[column] column_stats_list = [item["columns"][column] for item in stats_list] column_stats_list = [ @@ -414,34 +402,53 @@ def get_unique_tables(qualified_column_names: Iterable[str]) -> list[str]: if encoding_type in _VALUE_PROTECTION_ENCODING_TYPES: stats_col = {"value_protection": value_protection} | stats_col - # select model pipeline to process given column - def get_argn_processor(mode, is_flat) -> str: - if mode == "tgt": - return TGT - else: # mode == "ctx" - return CTXFLT if is_flat else CTXSEQ - - is_flat = "seq_len" not in column_stats_list[0] - stats_col[ARGN_PROCESSOR] = get_argn_processor(mode, is_flat) - ( - stats_col[ARGN_TABLE], - stats_col[ARGN_COLUMN], - ) = argn_identifiers[column] - - if not is_flat: + is_flat_column = "seq_len" not in column_stats_list[0] + if not is_flat_column: stats_col["seq_len"] = _analyze_reduce_seq_len([column_stats_list[0]["seq_len"]]) - if encoding_type in ( + is_language_column = encoding_type in ( ModelEncodingType.language_text, ModelEncodingType.language_categorical, ModelEncodingType.language_numeric, ModelEncodingType.language_datetime, - ): - _LOG.info( - f"analyzed column `{column}`: {stats_col['encoding_type']} nchar_max={stats_col['nchar_max']} nchar_avg={stats_col['nchar_avg']}" + ) + + if not is_language_column: + # build mapping of original column name to ARGN table and column identifiers + def get_table(qualified_column_name: str) -> str: + # column names are assumed to be
:: + return qualified_column_name.split(TABLE_COLUMN_INFIX)[0] + + def get_unique_tables(qualified_column_names: Iterable[str]) -> list[str]: + duplicated_tables = [get_table(c) for c in qualified_column_names] + return list(dict.fromkeys(duplicated_tables)) + + unique_tables = get_unique_tables(encoding_types.keys()) + argn_identifiers: dict[str, tuple[str, str]] = { + c: (f"t{unique_tables.index(get_table(qualified_column_name=c))}", f"c{idx}") + for idx, c in enumerate(encoding_types.keys()) + } + + def get_argn_processor(mode, is_flat) -> str: + if mode == "tgt": + return TGT + else: # mode == "ctx" + return CTXFLT if is_flat else CTXSEQ + + stats_col[ARGN_PROCESSOR] = get_argn_processor(mode, is_flat="seq_len" not in column_stats_list[0]) + ( + stats_col[ARGN_TABLE], + stats_col[ARGN_COLUMN], + ) = argn_identifiers[column] + + _LOG.info( + f"analyzed column `{column}`: {stats_col['encoding_type']} " + + ( + f"nchar_max={stats_col['nchar_max']} nchar_avg={stats_col['nchar_avg']}" + if is_language_column + else f"{stats_col['cardinalities']}" ) - else: - _LOG.info(f"analyzed column `{column}`: {stats_col['encoding_type']} {stats_col['cardinalities']}") + ) stats["columns"][column] = stats_col if mode == "ctx": From 33a3b1e6246aa768a81dc83e630226c748d33bdc Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 17 Feb 2025 12:41:26 +0100 Subject: [PATCH 54/58] LANGUAGE CATEGORICAL (#43) --- .../_encoding_types/language/categorical.py | 2 +- mostlyai/engine/_language/generation.py | 4 +- .../language/test_categorical.py | 67 +++++++++++++++++-- 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/categorical.py b/mostlyai/engine/_encoding_types/language/categorical.py index 46f57f1..aebfc39 100644 --- a/mostlyai/engine/_encoding_types/language/categorical.py +++ b/mostlyai/engine/_encoding_types/language/categorical.py @@ -69,7 +69,7 @@ def encode_language_categorical(values: pd.Series, stats: dict) -> pd.Series: return values -def decode_categorical(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: +def decode_language_categorical(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: x = x.astype(STRING) allowed_categories = col_stats.get("categories", []) return x.where(x.isin(allowed_categories), other=None) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 0d7d395..7b60108 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -35,7 +35,7 @@ ProgressCallback, ProgressCallbackWrapper, ) -from mostlyai.engine._encoding_types.language.categorical import decode_categorical +from mostlyai.engine._encoding_types.language.categorical import decode_language_categorical from mostlyai.engine._encoding_types.language.datetime import decode_datetime from mostlyai.engine._encoding_types.language.numeric import decode_numeric from mostlyai.engine._encoding_types.language.text import decode_text @@ -118,7 +118,7 @@ def parse_json(x, columns: list[str]): elif col_stats["encoding_type"] == ModelEncodingType.language_datetime: tgt_data[col] = decode_datetime(tgt_data[col], col_stats) elif col_stats["encoding_type"] == ModelEncodingType.language_categorical: - tgt_data[col] = decode_categorical(tgt_data[col], col_stats) + tgt_data[col] = decode_language_categorical(tgt_data[col], col_stats) else: tgt_data[col] = decode_text(tgt_data[col], col_stats) diff --git a/tests/unit/encoding_types/language/test_categorical.py b/tests/unit/encoding_types/language/test_categorical.py index 4790d59..3721436 100644 --- a/tests/unit/encoding_types/language/test_categorical.py +++ b/tests/unit/encoding_types/language/test_categorical.py @@ -14,20 +14,77 @@ import pandas as pd import pytest +import numpy as np -from mostlyai.engine._encoding_types.language.categorical import decode_categorical +from mostlyai.engine._encoding_types.language.categorical import ( + CATEGORICAL_UNKNOWN_TOKEN, + decode_language_categorical, + analyze_language_categorical, + analyze_reduce_language_categorical, + encode_language_categorical, +) -class TestCategoricalDecode: +class TestLanguageCategoricalAnalyze: + def test_3_frequent_and_1_rare_values(self): + values = pd.Series(np.repeat(["secret", "male", "female", pd.NA], 100), name="gender") + ids = pd.Series( + np.concatenate([np.repeat(0, 100), range(100), range(100, 200), range(200, 300)]), + name="subject_id", + ) + stats = analyze_language_categorical(values, ids) + assert stats == { + "cnt_values": {"female": 100, "male": 100, "secret": 1}, + "has_nan": True, + } + + +class TestLanguageCategoricalAnalyzeReduce: + @pytest.fixture + def stats_list(self): + stats1 = { + "cnt_values": {"secret1": 1, "male": 100}, + "has_nan": True, + } + stats2 = { + "cnt_values": {"secret2": 1, "male": 100, "female": 100}, + "has_nan": False, + } + return stats1, stats2 + + def test_with_value_protection(self, stats_list): + stats1, stats2 = stats_list + stats = analyze_reduce_language_categorical([stats1, stats2], value_protection=True) + assert stats == { + "categories": [CATEGORICAL_UNKNOWN_TOKEN, None, "female", "male"], + "no_of_rare_categories": 2, + } + + +class TestLanguageCategoricalEncode: + def test_2_frequent_and_1_rare_and_1_null_values(self): + values = pd.Series(np.repeat(["secret", "male", "female", pd.NA], 100), name="gender") + stats = { + "categories": [CATEGORICAL_UNKNOWN_TOKEN, None, "female", "male"], + "no_of_rare_categories": 1, + } + expected = pd.Series( + np.repeat([CATEGORICAL_UNKNOWN_TOKEN, "male", "female", pd.NA], 100), name="gender", dtype="string" + ) + encoded = encode_language_categorical(values, stats) + pd.testing.assert_series_equal(encoded, expected) + + +class TestLanguageCategoricalDecode: @pytest.fixture def col_stats(self): - return {"categories": ["apple", "banana", "cherry"]} + return {"categories": [CATEGORICAL_UNKNOWN_TOKEN, None, "apple", "banana", "cherry"]} @pytest.fixture def sample_values(self): return pd.Series(["apple", "durian", "banana", "elderberry", "cherry", "fig", None]) - def test_decode_categorical(self, sample_values, col_stats): - decoded = decode_categorical(sample_values, col_stats) + def test_language_categorical_decode(self, sample_values, col_stats): + decoded = decode_language_categorical(sample_values, col_stats) expected = pd.Series(["apple", None, "banana", None, "cherry", None, None], dtype=decoded.dtype) pd.testing.assert_series_equal(decoded, expected) From 7875908e48ccc3e42290c02aafe0216eff38600d Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 17 Feb 2025 13:55:53 +0100 Subject: [PATCH 55/58] n_jobs --- mostlyai/engine/analysis.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mostlyai/engine/analysis.py b/mostlyai/engine/analysis.py index b3b286c..2a27b77 100644 --- a/mostlyai/engine/analysis.py +++ b/mostlyai/engine/analysis.py @@ -234,8 +234,7 @@ def _analyze_partition( ctx_root_keys = ctx_primary_keys.rename("__rkey") # analyze all target columns - # with parallel_config("loky", n_jobs=n_jobs): - with parallel_config("loky", n_jobs=1): + with parallel_config("loky", n_jobs=n_jobs): results = Parallel()( delayed(_analyze_col)( values=tgt_df[column], @@ -276,8 +275,7 @@ def _analyze_partition( # analyze all context columns assert isinstance(ctx_encoding_types, dict) - # with parallel_config("loky", n_jobs=n_jobs): - with parallel_config("loky", n_jobs=1): + with parallel_config("loky", n_jobs=n_jobs): results = Parallel()( delayed(_analyze_col)( values=ctx_df[column], From be41a73cca2e78ccd2d1cf8c77ed80b826527197 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 17 Feb 2025 14:28:18 +0100 Subject: [PATCH 56/58] kill examples/language_encoding_types.ipynb --- examples/language_encoding_types.ipynb | 517 ------------------------- 1 file changed, 517 deletions(-) delete mode 100644 examples/language_encoding_types.ipynb diff --git a/examples/language_encoding_types.ipynb b/examples/language_encoding_types.ipynb deleted file mode 100644 index 4512b4a..0000000 --- a/examples/language_encoding_types.ipynb +++ /dev/null @@ -1,517 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "# Language Model: flat data, without context" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mostly-ai/mostlyai-engine/blob/main/examples/language.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "execution": { - "iopub.execute_input": "2025-02-05T13:52:43.092585Z", - "iopub.status.busy": "2025-02-05T13:52:43.092035Z", - "iopub.status.idle": "2025-02-05T13:56:20.259209Z", - "shell.execute_reply": "2025-02-05T13:56:20.258849Z", - "shell.execute_reply.started": "2025-02-05T13:52:43.092556Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-02-05 14:52:45,343] INFO : SPLIT started\n", - "[2025-02-05 14:52:45,344] INFO : clean `ws-language-categorical-flat/OriginalData/tgt-data`\n", - "[2025-02-05 14:52:45,345] INFO : clean `ws-language-categorical-flat/OriginalData/tgt-meta`\n", - "[2025-02-05 14:52:45,346] INFO : model_type='LANGUAGE'\n", - "[2025-02-05 14:52:45,346] INFO : tgt_encoding_types={'category': 'LANGUAGE_CATEGORICAL', 'title': 'LANGUAGE_TEXT'}\n", - "[2025-02-05 14:52:45,360] INFO : SPLIT finished in 0.02s\n", - "[2025-02-05 14:52:45,361] INFO : ANALYZE started\n", - "[2025-02-05 14:52:45,363] INFO : clean `ws-language-categorical-flat/ModelStore/tgt-stats`\n", - "[2025-02-05 14:52:45,364] INFO : analyzing 2 partitions in parallel\n", - "[2025-02-05 14:52:45,413] INFO : analyzed target partition 000000-trn (20768, 2)\n", - "[2025-02-05 14:52:45,422] INFO : analyzed target partition 000000-val (2308, 2)\n", - "[2025-02-05 14:52:45,422] INFO : combine partition statistics\n", - "[2025-02-05 14:52:45,423] INFO : analyzed column `category`: LANGUAGE_CATEGORICAL \n", - "[2025-02-05 14:52:45,423] INFO : analyzed column `title`: LANGUAGE_TEXT \n", - "[2025-02-05 14:52:45,424] INFO : analyzed 23,076 records: 20,768 training / 2,308 validation\n", - "[2025-02-05 14:52:45,425] INFO : tgt sequence length deciles: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n", - "[2025-02-05 14:52:45,425] INFO : is_sequential: False\n", - "[2025-02-05 14:52:45,425] INFO : write statistics to `ws-language-categorical-flat/ModelStore/tgt-stats/stats.json`\n", - "[2025-02-05 14:52:45,426] INFO : ANALYZE finished in 0.06s\n", - "[2025-02-05 14:52:45,427] INFO : ENCODE_LANGUAGE started\n", - "[2025-02-05 14:52:45,428] INFO : clean `ws-language-categorical-flat/OriginalData/encoded-data`\n", - "[2025-02-05 14:52:45,428] INFO : clean `ws-language-categorical-flat/OriginalData/encoded-data`\n", - "[2025-02-05 14:52:45,429] INFO : clean `ws-language-categorical-flat/OriginalData/encoded-data`\n", - "[2025-02-05 14:52:45,434] INFO : Formatting context columns [] to JSON\n", - "[2025-02-05 14:52:45,443] INFO : Formatting target columns ['category', 'title'] to JSON\n", - "[2025-02-05 14:52:46,161] INFO : token statistics of this partition: \n", - " #pretokens #chars\n", - "min 16.0 50.0\n", - "50% 27.0 115.0\n", - "max 67.0 208.0\n", - "[2025-02-05 14:52:46,172] INFO : encoded partition part.000000-trn.parquet (20768, 2)\n", - "[2025-02-05 14:52:46,177] INFO : Formatting context columns [] to JSON\n", - "[2025-02-05 14:52:46,181] INFO : Formatting target columns ['category', 'title'] to JSON\n", - "[2025-02-05 14:52:46,289] INFO : token statistics of this partition: \n", - " #pretokens #chars\n", - "min 16.0 52.0\n", - "50% 27.0 115.0\n", - "max 52.0 199.0\n", - "[2025-02-05 14:52:46,292] INFO : encoded partition part.000000-val.parquet (2308, 2)\n", - "[2025-02-05 14:52:46,292] INFO : ENCODE_LANGUAGE finished in 0.87s\n", - "[2025-02-05 14:52:46,293] INFO : TRAIN_LANGUAGE started\n", - "[2025-02-05 14:52:46,303] INFO : numpy=1.26.4, pandas=2.2.3\n", - "[2025-02-05 14:52:46,305] INFO : torch=2.5.1, opacus=1.5.2\n", - "[2025-02-05 14:52:46,309] INFO : transformers=4.46.3, peft=0.11.1\n", - "[2025-02-05 14:52:46,309] INFO : device=device(type='cpu')\n", - "[2025-02-05 14:52:46,309] INFO : bf16_supported=False\n", - "[2025-02-05 14:52:46,310] INFO : use_mixed_precision=False\n", - "[2025-02-05 14:52:46,310] INFO : model_id='MOSTLY_AI/LSTMFromScratch-3m'\n", - "[2025-02-05 14:52:46,310] INFO : enable_flexible_generation=True\n", - "[2025-02-05 14:52:46,310] INFO : max_training_time=60s\n", - "[2025-02-05 14:52:46,311] INFO : max_epochs=100.0\n", - "[2025-02-05 14:52:46,311] INFO : with_dp=False\n", - "[2025-02-05 14:52:46,311] INFO : model_state_strategy=\n", - "[2025-02-05 14:52:52,892] INFO : create training model\n", - "[2025-02-05 14:52:52,893] INFO : model_state_strategy=\n", - "[2025-02-05 14:52:52,893] INFO : clear existing checkpoint files\n", - "[2025-02-05 14:52:52,895] INFO : start training progress from epoch=0.0, steps=0\n", - "[2025-02-05 14:52:53,274] INFO : model loading time: 0.38s\n", - "[2025-02-05 14:52:53,274] INFO : no_of_model_params=2668111\n", - "[2025-02-05 14:52:53,274] INFO : no_of_trainable_model_params=2668111\n", - "[2025-02-05 14:52:53,275] INFO : tokenizer=LlamaTokenizerFast(name_or_path='', vocab_size=4175, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': '', 'pad_token': ''}, clean_up_tokenization_spaces=False), added_tokens_decoder={\n", - "\t0: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", - "\t1: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", - "\t2: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", - "\t3: AddedToken(\"\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n", - "}\n", - "[2025-02-05 14:52:54,202] INFO : trn_cnt=20768, val_cnt=2308\n", - "[2025-02-05 14:52:54,202] INFO : trn_batch_size=64, val_batch_size=32\n", - "[2025-02-05 14:52:54,203] INFO : trn_steps=324, val_steps=72\n", - "[2025-02-05 14:52:54,203] INFO : batch_size=32, gradient_accumulation_steps=2, initial_lr=0.0004\n", - "[2025-02-05 14:52:54,677] INFO : {'epoch': 0.0, 'is_checkpoint': 0, 'steps': 1, 'samples': 64, 'trn_loss': None, 'val_loss': None, 'total_time': 0.5, 'learn_rate': 0.0004, 'dp_eps': None, 'dp_delta': None}\n", - "[2025-02-05 14:53:54,325] INFO : saving model weights, as none were saved so far\n", - "[2025-02-05 14:53:57,632] INFO : {'epoch': 0.63, 'is_checkpoint': 1, 'steps': 205, 'samples': 13120, 'trn_loss': None, 'val_loss': 2.9262, 'total_time': 63.4, 'learn_rate': 0.0004, 'dp_eps': None, 'dp_delta': None}\n", - "[2025-02-05 14:53:57,634] INFO : TRAIN_LANGUAGE finished in 71.34s\n", - "[2025-02-05 14:53:57,643] INFO : GENERATE_LANGUAGE started\n", - "[2025-02-05 14:53:57,643] INFO : device=device(type='cpu')\n", - "[2025-02-05 14:53:57,644] INFO : sampling_temperature=1.0, sampling_top_p=1.0\n", - "[2025-02-05 14:53:57,644] INFO : clean `ws-language-categorical-flat/SyntheticData`\n", - "[2025-02-05 14:53:57,649] INFO : seed_data.shape=(10000, 0)\n", - "[2025-02-05 14:53:57,650] INFO : Formatting context columns [] to JSON\n", - "[2025-02-05 14:53:57,674] INFO : token statistics of this partition: \n", - " #pretokens #chars\n", - "min 1.0 3.0\n", - "50% 1.0 3.0\n", - "max 1.0 3.0\n", - "[2025-02-05 14:53:57,675] INFO : max_new_tokens=151\n", - "[2025-02-05 14:53:57,774] INFO : inference engine: HuggingFaceEngine\n", - "[2025-02-05 14:53:57,774] INFO : model loading time: 0.10s\n", - "[2025-02-05 14:53:57,775] INFO : batch_size=128\n", - "[2025-02-05 14:53:57,775] INFO : enforce_json_output=True\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "02/05/2025 14:53:57:WARNING:The following bytes are not present in any token: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247]. This likely indicates that the vocabulary loading code is wrong, the tokenizer is doing some creepy processing or the tokenizer is not UTF-8 compatible. Check the vocabulary loading code and the tokenizer code to fix any bug and/or consider processing the vocab like the tokenizer.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-02-05 14:56:20,197] INFO : num_samples_max_length_limit=0\n", - "[2025-02-05 14:56:20,241] INFO : percentage of invalid values: {'category': '0.00%', 'title': '0.00%'}\n", - "[2025-02-05 14:56:20,241] INFO : decoded (10000, 2) from 79 batches in 1.30s\n", - "[2025-02-05 14:56:20,244] INFO : persisted (10000, 2) to `part.000000.000000.parquet` in 0.00s\n", - "[2025-02-05 14:56:20,246] INFO : total_tokenize_fn_time=0.21s\n", - "[2025-02-05 14:56:20,246] INFO : total_logits_processor_build_time=1.37s\n", - "[2025-02-05 14:56:20,246] INFO : total_generate_fn_time=139.43s\n", - "[2025-02-05 14:56:20,246] INFO : GENERATE_LANGUAGE finished in 142.60s\n" - ] - } - ], - "source": [ - "from pathlib import Path\n", - "import pandas as pd\n", - "from mostlyai import engine\n", - "\n", - "# init workspace and logging\n", - "# ws = Path(\"ws-language-flat\")\n", - "ws = Path(\"ws-language-categorical-flat\")\n", - "engine.init_logging()\n", - "\n", - "# # load original data\n", - "url = \"https://github.com/mostly-ai/public-demo-data/raw/refs/heads/dev/arxiv\"\n", - "trn_df = pd.read_parquet(f\"{url}/synthetic-data-papers.parquet\")[['category', 'title']]\n", - "# trn_df = pd.read_parquet(f\"{url}/synthetic-data-papers.parquet\")[['category']]\n", - "\n", - "# execute the engine steps\n", - "engine.split( # split data as PQT files for `trn` + `val` to `{ws}/OriginalData/tgt-data`\n", - " workspace_dir=ws,\n", - " tgt_data=trn_df,\n", - " # model_type=\"LANGUAGE\",\n", - " tgt_encoding_types={\"category\": \"LANGUAGE_CATEGORICAL\", \"title\": \"LANGUAGE_TEXT\"},\n", - ")\n", - "engine.analyze(workspace_dir=ws) # generate column-level statistics to `{ws}/ModelStore/tgt-stats/stats.json`\n", - "engine.encode(workspace_dir=ws) # encode training data to `{ws}/OriginalData/encoded-data`\n", - "engine.train( # train model and store to `{ws}/ModelStore/model-data`\n", - " workspace_dir=ws,\n", - " model=\"MOSTLY_AI/LSTMFromScratch-3m\", # use a light-weight LSTM model, trained from scratch (GPU recommended)\n", - " # model=\"microsoft/phi-1.5\", # or alternatively use a HF-hosted LLM model (GPU required)\n", - " max_training_time=1, # limit TRAIN to 10 minute for demo purposes\n", - ")\n", - "engine.generate( # use model to generate synthetic samples to `{ws}/SyntheticData`\n", - " workspace_dir=ws, \n", - " sample_size=10000,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "execution": { - "iopub.execute_input": "2025-02-05T13:56:20.260268Z", - "iopub.status.busy": "2025-02-05T13:56:20.260149Z", - "iopub.status.idle": "2025-02-05T13:56:20.269394Z", - "shell.execute_reply": "2025-02-05T13:56:20.268803Z", - "shell.execute_reply.started": "2025-02-05T13:56:20.260257Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'chao-dyn',\n", - " 'cmp-lg',\n", - " 'comp-gas',\n", - " 'cond-mat.other',\n", - " 'cond-mat.quant-gas',\n", - " 'cond-mat.supr-con',\n", - " 'cs.CC',\n", - " 'cs.DL',\n", - " 'cs.FL',\n", - " 'cs.OS',\n", - " 'cs.PL',\n", - " 'cs.SC',\n", - " 'econ.TH',\n", - " 'math.CA',\n", - " 'math.CT',\n", - " 'math.DG',\n", - " 'math.FA',\n", - " 'math.GM',\n", - " 'math.GN',\n", - " 'math.GR',\n", - " 'math.MG',\n", - " 'math.SP',\n", - " 'nlin.AO',\n", - " 'nucl-ex',\n", - " 'nucl-th',\n", - " 'q-bio.CB',\n", - " 'q-bio.OT',\n", - " 'q-bio.SC',\n", - " 'q-fin.EC',\n", - " 'q-fin.MF',\n", - " 'q-fin.PR'}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "syn_tgt_df = pd.read_parquet(ws / \"SyntheticData\") # load synthetic data\n", - "set(trn_df['category']) - set(syn_tgt_df['category']) " - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "execution": { - "iopub.execute_input": "2025-02-05T13:56:20.270196Z", - "iopub.status.busy": "2025-02-05T13:56:20.270014Z", - "iopub.status.idle": "2025-02-05T13:56:20.279656Z", - "shell.execute_reply": "2025-02-05T13:56:20.278913Z", - "shell.execute_reply.started": "2025-02-05T13:56:20.270181Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_RARE_'}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "set(syn_tgt_df['category']) - set(trn_df['category'])" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "execution": { - "iopub.execute_input": "2025-02-05T13:56:20.282741Z", - "iopub.status.busy": "2025-02-05T13:56:20.281808Z", - "iopub.status.idle": "2025-02-05T13:56:20.288184Z", - "shell.execute_reply": "2025-02-05T13:56:20.287499Z", - "shell.execute_reply.started": "2025-02-05T13:56:20.282643Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0 , \n", - "1 category\n", - "2 : A- for \n", - "3 : \n", - "4 , to\n", - "5 -..ML\n", - "6 D the\n", - "7 -Oed Learning with-to for-c- and Data ofe\n", - " \n", - "8 S from: a Learning ofn- for Synthetic\n", - "9 \n", - "Name: title, dtype: string" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "syn_tgt_df['title'].head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "execution": { - "iopub.execute_input": "2025-02-05T13:56:20.289566Z", - "iopub.status.busy": "2025-02-05T13:56:20.289133Z", - "iopub.status.idle": "2025-02-05T13:56:20.296014Z", - "shell.execute_reply": "2025-02-05T13:56:20.295608Z", - "shell.execute_reply.started": "2025-02-05T13:56:20.289552Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "0 Conception d'un banc d'essais d\\'ecisionnel\n", - "1 Monotonicity Analysis over Chains and Curves\n", - "2 An active curve approach for tomographic recon...\n", - "3 Application of the HLSVD technique to the filt...\n", - "4 Phase retrieval by iterated projections\n", - "5 DIRC for a Higher Luminosity B Factory\n", - "6 Analysis of approximate nearest neighbor searc...\n", - "7 Efficient Retrieval of Similar Time Sequences ...\n", - "8 Mining Generalized Graph Patterns based on Use...\n", - "9 ARACNE: An Algorithm for the Reconstruction of...\n", - "Name: title, dtype: object" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_df['title'].head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "execution": { - "iopub.execute_input": "2025-02-05T13:56:20.296770Z", - "iopub.status.busy": "2025-02-05T13:56:20.296615Z", - "iopub.status.idle": "2025-02-05T13:56:20.302894Z", - "shell.execute_reply": "2025-02-05T13:56:20.302345Z", - "shell.execute_reply.started": "2025-02-05T13:56:20.296758Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
categorytitle
0cs.CV,
1cs.CYcategory
2stat.ML: A- for
3cs.CV:
4cs.LG, to
.........
9995cs.LGcategory
9996stat.ME:
9997cs.LGD
9998cs.CL:
9999stat.MLC.
\n", - "

10000 rows × 2 columns

\n", - "" - ], - "text/plain": [ - " category title\n", - "0 cs.CV , \n", - "1 cs.CY category\n", - "2 stat.ML : A- for \n", - "3 cs.CV : \n", - "4 cs.LG , to\n", - "... ... ...\n", - "9995 cs.LG category\n", - "9996 stat.ME : \n", - "9997 cs.LG D \n", - "9998 cs.CL : \n", - "9999 stat.ML C.\n", - "\n", - "[10000 rows x 2 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "syn_tgt_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.16" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": false, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From d0452c4b44d0e2d0a14844bec1e7d0d37c9924b4 Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 17 Feb 2025 15:38:14 +0100 Subject: [PATCH 57/58] datetime tests --- .../_encoding_types/language/datetime.py | 2 +- mostlyai/engine/_language/generation.py | 4 +- .../encoding_types/language/test_datetime.py | 88 +++++++++++++++++-- 3 files changed, 86 insertions(+), 8 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/datetime.py b/mostlyai/engine/_encoding_types/language/datetime.py index cc7aa04..c7a1b37 100644 --- a/mostlyai/engine/_encoding_types/language/datetime.py +++ b/mostlyai/engine/_encoding_types/language/datetime.py @@ -99,7 +99,7 @@ def _clip_datetime(x: pd.Series, min5: list, max5: list) -> pd.Series: return pd.Series(clipped, index=x.index) -def decode_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: +def decode_language_datetime(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: x = x.where(~x.isin(["", "_INVALID_"]), np.nan) valid_mask = ( diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 7b60108..4c23d65 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -36,7 +36,7 @@ ProgressCallbackWrapper, ) from mostlyai.engine._encoding_types.language.categorical import decode_language_categorical -from mostlyai.engine._encoding_types.language.datetime import decode_datetime +from mostlyai.engine._encoding_types.language.datetime import decode_language_datetime from mostlyai.engine._encoding_types.language.numeric import decode_numeric from mostlyai.engine._encoding_types.language.text import decode_text from mostlyai.engine._language.common import estimate_max_tokens, MAX_LENGTH @@ -116,7 +116,7 @@ def parse_json(x, columns: list[str]): if col_stats["encoding_type"] == ModelEncodingType.language_numeric: tgt_data[col] = decode_numeric(tgt_data[col], col_stats) elif col_stats["encoding_type"] == ModelEncodingType.language_datetime: - tgt_data[col] = decode_datetime(tgt_data[col], col_stats) + tgt_data[col] = decode_language_datetime(tgt_data[col], col_stats) elif col_stats["encoding_type"] == ModelEncodingType.language_categorical: tgt_data[col] = decode_language_categorical(tgt_data[col], col_stats) else: diff --git a/tests/unit/encoding_types/language/test_datetime.py b/tests/unit/encoding_types/language/test_datetime.py index 5bb4cc3..15eab3e 100644 --- a/tests/unit/encoding_types/language/test_datetime.py +++ b/tests/unit/encoding_types/language/test_datetime.py @@ -15,11 +15,89 @@ import pandas as pd import pytest -from mostlyai.engine._encoding_types.language.datetime import decode_datetime +from mostlyai.engine._encoding_types.language.datetime import ( + analyze_language_datetime, + analyze_reduce_language_datetime, + decode_language_datetime, + encode_language_datetime, +) from mostlyai.engine.domain import ModelEncodingType -class TestDatetimeDecode: +class TestLanguageDatetimeAnalyze: + def test_analyze_language_datetime(self): + birth_dates = pd.Series( + [ + "1910-01-01", + "", + "1930-01-31", + "1940-02-12", + "", + "1971-09-01", + "1983-05-19", + "1998-05-24", + ] + * 11, + name="birth_date", + ) + keys = pd.Series(range(len(birth_dates)), name="id") + stats = analyze_language_datetime(birth_dates, keys) + assert stats["has_nan"] is True + assert stats["min11"] == ["1910-01-01"] * 11 + assert stats["max11"] == ["1998-05-24"] * 11 + + +class TestLanguageDatetimeAnalyzeReduce: + def test_analyze_reduce_language_datetime(self): + stats1 = { + "has_nan": True, + "min11": ["1910-01-01"] * 11, + "max11": ["1998-05-24"] * 11, + } + stats2 = { + "has_nan": False, + "min11": ["2000-01-01"] * 11, + "max11": ["2024-12-31"] * 11, + } + reduced = analyze_reduce_language_datetime([stats1, stats2]) + assert reduced["has_nan"] is True + assert reduced["min5"] == ["1910-01-01"] * 5 + assert reduced["max5"] == ["2024-12-31"] * 5 + + +class TestLanguageDatetimeEncode: + def test_encode_language_datetime(self): + values = pd.Series( + [ + "1910-01-01", + "", + "1930-01-31", + "1940-02-12", + "", + "1971-09-01", + "1983-05-19", + "1998-05-24", + ], + name="birth_date", + ) + stats = { + "has_nan": True, + "min5": ["1930-01-31"] * 5, + "max5": ["2024-12-31"] * 5, + } + encoded = encode_language_datetime(values, stats) + assert encoded.dtype == "datetime64[us]" + assert encoded.isna().sum() == 2 + assert encoded.iloc[0] == pd.Timestamp("1930-01-31") + assert encoded.iloc[1] is pd.NaT + assert encoded.iloc[2] == pd.Timestamp("1930-01-31") + assert encoded.iloc[3] == pd.Timestamp("1940-02-12") + assert encoded.iloc[4] is pd.NaT + assert encoded.iloc[5] == pd.Timestamp("1971-09-01") + assert encoded.iloc[6] == pd.Timestamp("1983-05-19") + + +class TestLanguageDatetimeDecode: @pytest.fixture def datetime_stats(self): return { @@ -54,7 +132,7 @@ def sample_dates(self): ) def test_datetime_dtype_bounds_and_invalids(self, sample_dates, datetime_stats): - decoded = decode_datetime(sample_dates, datetime_stats) + decoded = decode_language_datetime(sample_dates, datetime_stats) assert decoded.dtype == "datetime64[ns]" non_null = decoded.dropna() min_bound = pd.to_datetime(datetime_stats["min5"][0]) @@ -66,12 +144,12 @@ def test_datetime_dtype_bounds_and_invalids(self, sample_dates, datetime_stats): def test_date_day_clamping(self, no_clip_stats): s = pd.Series(["2021-04-31"]) - decoded = decode_datetime(s, no_clip_stats) + decoded = decode_language_datetime(s, no_clip_stats) expected = pd.Timestamp("2021-04-30 00:00:00") assert decoded.iloc[0] == expected def test_time_extraction(self, no_clip_stats): s = pd.Series(["2021-07-15T23:59:59.123"]) - decoded = decode_datetime(s, no_clip_stats) + decoded = decode_language_datetime(s, no_clip_stats) expected = pd.Timestamp("2021-07-15 23:59:59.123") assert decoded.iloc[0] == expected From 25484319ccf2f4b53152ab4386f1b193173d388d Mon Sep 17 00:00:00 2001 From: Lukasz Kolodziejczyk Date: Mon, 17 Feb 2025 15:48:06 +0100 Subject: [PATCH 58/58] numeric tests --- .../_encoding_types/language/numeric.py | 2 +- mostlyai/engine/_language/generation.py | 4 +- .../encoding_types/language/test_numeric.py | 66 +++++++++++++++++-- 3 files changed, 65 insertions(+), 7 deletions(-) diff --git a/mostlyai/engine/_encoding_types/language/numeric.py b/mostlyai/engine/_encoding_types/language/numeric.py index bd60490..a3723b4 100644 --- a/mostlyai/engine/_encoding_types/language/numeric.py +++ b/mostlyai/engine/_encoding_types/language/numeric.py @@ -128,7 +128,7 @@ def _clip_numeric(x: pd.Series, min5: list, max5: list) -> pd.Series: return pd.Series(clipped, index=x.index) -def decode_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: +def decode_language_numeric(x: pd.Series, col_stats: dict[str, str]) -> pd.Series: x = pd.to_numeric(x, errors="coerce") x = x.round(col_stats["max_scale"]) x = _clip_numeric(x, col_stats["min5"], col_stats["max5"]) diff --git a/mostlyai/engine/_language/generation.py b/mostlyai/engine/_language/generation.py index 4c23d65..e712af1 100644 --- a/mostlyai/engine/_language/generation.py +++ b/mostlyai/engine/_language/generation.py @@ -37,7 +37,7 @@ ) from mostlyai.engine._encoding_types.language.categorical import decode_language_categorical from mostlyai.engine._encoding_types.language.datetime import decode_language_datetime -from mostlyai.engine._encoding_types.language.numeric import decode_numeric +from mostlyai.engine._encoding_types.language.numeric import decode_language_numeric from mostlyai.engine._encoding_types.language.text import decode_text from mostlyai.engine._language.common import estimate_max_tokens, MAX_LENGTH from mostlyai.engine._language.encoding import encode_df @@ -114,7 +114,7 @@ def parse_json(x, columns: list[str]): for col in tgt_stats["columns"].keys(): col_stats = tgt_stats["columns"][col] if col_stats["encoding_type"] == ModelEncodingType.language_numeric: - tgt_data[col] = decode_numeric(tgt_data[col], col_stats) + tgt_data[col] = decode_language_numeric(tgt_data[col], col_stats) elif col_stats["encoding_type"] == ModelEncodingType.language_datetime: tgt_data[col] = decode_language_datetime(tgt_data[col], col_stats) elif col_stats["encoding_type"] == ModelEncodingType.language_categorical: diff --git a/tests/unit/encoding_types/language/test_numeric.py b/tests/unit/encoding_types/language/test_numeric.py index 41fcca8..1331468 100644 --- a/tests/unit/encoding_types/language/test_numeric.py +++ b/tests/unit/encoding_types/language/test_numeric.py @@ -16,11 +16,69 @@ import pandas as pd import pytest -from mostlyai.engine._encoding_types.language.numeric import decode_numeric +from mostlyai.engine._encoding_types.language.numeric import ( + analyze_language_numeric, + analyze_reduce_language_numeric, + decode_language_numeric, + encode_language_numeric, +) from mostlyai.engine.domain import ModelEncodingType -class TestNumericDecode: +class TestLanguageNumericAnalyze: + def test_analyze_language_numeric(self): + values = pd.Series([0, 1, 2, 3, 4, 5] * 11, name="value") + ids = pd.Series(range(len(values)), name="id") + stats = analyze_language_numeric(values, ids) + assert stats["has_nan"] is False + assert stats["max11"] == [5] * 11 + assert stats["min11"] == [0] * 11 + + +class TestLanguageNumericAnalyzeReduce: + def test_analyze_reduce_language_numeric(self): + stats1 = { + "has_nan": False, + "max11": [5] * 11, + "min11": [0] * 11, + "max_scale": 0, + } + stats2 = { + "has_nan": True, + "max11": [10] * 11, + "min11": [6] * 11, + "max_scale": 1, + } + reduced = analyze_reduce_language_numeric([stats1, stats2]) + assert reduced["has_nan"] is True + assert reduced["max5"] == [10] * 5 + assert reduced["min5"] == [0] * 5 + assert reduced["max_scale"] == 1 + + +class TestLanguageNumericEncode: + def test_encode_language_numeric(self): + values = pd.Series([-1, 0, 1, 2, 3, 4, 5, 6], name="value") + stats = { + "has_nan": False, + "max5": [5] * 5, + "min5": [0] * 5, + "max_scale": 0, + } + encoded = encode_language_numeric(values, stats) + assert encoded.dtype == "Int64" + assert encoded.isna().sum() == 0 + assert encoded.iloc[0] == 0 + assert encoded.iloc[1] == 0 + assert encoded.iloc[2] == 1 + assert encoded.iloc[3] == 2 + assert encoded.iloc[4] == 3 + assert encoded.iloc[5] == 4 + assert encoded.iloc[6] == 5 + assert encoded.iloc[7] == 5 + + +class TestLanguageNumericDecode: @pytest.fixture def int_stats(self): return { @@ -52,9 +110,9 @@ def sample_values(self): ("float_stats", float), ], ) - def test_decode_numeric(self, sample_values, request, stats_name, expected_dtype): + def test_decode_language_numeric(self, sample_values, request, stats_name, expected_dtype): stats = request.getfixturevalue(stats_name) - decoded = decode_numeric(sample_values, stats) + decoded = decode_language_numeric(sample_values, stats) assert decoded.dtype == expected_dtype non_null = decoded.dropna() # we don't enforce compatability with "has_nan" max_val = stats["max5"][0]