diff --git a/python/di_to_cu_migration_tool/.sample_env b/python/di_to_cu_migration_tool/.sample_env
index b8106c8..49ff430 100644
--- a/python/di_to_cu_migration_tool/.sample_env
+++ b/python/di_to_cu_migration_tool/.sample_env
@@ -1,6 +1,6 @@
 # Rename to .env
 HOST=""
-API_VERSION = "2025-05-01-preview"
+API_VERSION = "2025-11-01"
 SUBSCRIPTION_KEY = "" # This is your API Key if you have one or can be your Subscription ID
diff --git a/python/di_to_cu_migration_tool/README.md b/python/di_to_cu_migration_tool/README.md
index e473ad0..53717dc 100644
--- a/python/di_to_cu_migration_tool/README.md
+++ b/python/di_to_cu_migration_tool/README.md
@@ -1,13 +1,13 @@
 # Document Intelligence to Content Understanding Migration Tool (Python)
 
-Welcome! This tool helps convert your Document Intelligence (DI) datasets to the Content Understanding (CU) **Preview.2** 2025-05-01-preview format, as used in AI Foundry. The following DI versions are supported:
+Welcome! This tool helps convert your Document Intelligence (DI) datasets to the Content Understanding (CU) **GA** 2025-11-01 format, as used in AI Foundry. The following DI versions are supported:
 - Custom Extraction Model DI 3.1 GA (2023-07-31) to DI 4.0 GA (2024-11-30) (Document Intelligence Studio) → DI-version = neural
 - Document Field Extraction Model 4.0 Preview (2024-07-31-preview) (AI Foundry / AI Services / Vision + Document / Document Field Extraction) → DI-version = generative
 
 To identify the version of your Document Intelligence dataset, please consult the sample documents in this folder to match your format. You can also verify the version by reviewing your DI project's user experience. For instance, Custom Extraction DI 3.1/4.0 GA appears in Document Intelligence Studio (https://documentintelligence.ai.azure.com/studio), whereas Document Field Extraction DI 4.0 Preview is only available on Azure AI Foundry's preview service (https://ai.azure.com/explore/aiservices/vision/document/extraction).
 
-For migrating from these DI versions to Content Understanding Preview.2, this tool first converts the DI dataset into a CU-compatible format. After conversion, you can create a Content Understanding Analyzer trained on your converted CU dataset. Additionally, you have the option to test its quality against any sample documents.
+For migrating from these DI versions to Content Understanding GA (2025-11-01), this tool first converts the DI dataset into a CU-compatible format. After conversion, you can create a Content Understanding Analyzer trained on your converted CU dataset. Additionally, you have the option to test its quality against any sample documents.
 
 ## Details About the Tools
 
@@ -27,8 +27,26 @@ Here is a detailed breakdown of the three CLI tools and their functionality:
 * **call_analyze.py**
   * This CLI tool verifies that the migration completed successfully and assesses the quality of the created analyzer.
+
 ## Setup
+### Prerequisites
+
+⚠️ **IMPORTANT: Before using this migration tool**, ensure your Azure AI Foundry resource is properly configured for Content Understanding:
+
+1. **Configure Default Model Deployments**: You must set default model deployments for Content Understanding in your Foundry resource before creating or running analyzers.
+
+   To do this, walk through the prerequisites here:
+   - [REST API Quickstart Guide](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-rest-api?tabs=portal%2Cdocument)
+
+   For more details about defaults, check out this documentation:
+   - [Models and Deployments Documentation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/models-deployments)
+
+2. **Verify you can create and use a basic Content Understanding analyzer** in your Azure AI Foundry resource before attempting migration. This ensures all prerequisites are met.
+
+3. Complete all setup steps outlined in the REST API documentation above, including authentication and model deployment configuration.
+
+### Tool Setup
 Please follow these steps to set up the tool:
 1. Install dependencies by running:
@@ -43,7 +61,7 @@ Please follow these steps to set up the tool:
   - **SUBSCRIPTION_KEY:** Update to your Azure AI Service API Key or Subscription ID to authenticate the API requests.
     - Locate your API Key here: ![Azure AI Service Endpoints With Keys](assets/endpoint-with-keys.png)
     - If using Azure Active Directory (AAD), please refer to your Subscription ID: ![Azure AI Service Subscription ID](assets/subscription-id.png)
-  - **API_VERSION:** This is preset to the CU Preview.2 version; no changes are needed.
+  - **API_VERSION:** This is preset to the CU GA version (2025-11-01); no changes are needed.
 
 ## How to Locate Your Document Field Extraction Dataset for Migration
 
@@ -73,6 +91,7 @@ To obtain SAS URLs for a file or folder for any container URL arguments, please
 
 3. Configure permissions and expiry for your SAS URL as follows:
    - For the **DI source dataset**, please select permissions: _**Read & List**_
+   - For the **CU target dataset**, please select permissions: _**Read, Add, Create, & Write**_
 
 After configuring, click **Generate SAS Token and URL** and copy the URL shown under **Blob SAS URL**.
@@ -155,7 +174,7 @@ Below are common issues you might encounter when creating an analyzer or running
 - **400 Bad Request** errors: Please validate the following:
   - The endpoint URL is valid. Example:
-    `https://yourEndpoint/contentunderstanding/analyzers/yourAnalyzerID?api-version=2025-05-01-preview`
+    `https://yourEndpoint/contentunderstanding/analyzers/yourAnalyzerID?api-version=2025-11-01`
   - Your converted CU dataset respects the naming constraints below. If needed, please manually correct the `analyzer.json` fields:
     - Field names start with a letter or underscore
     - Field name length must be between 1 and 64 characters
@@ -174,7 +193,7 @@ Below are common issues you might encounter when creating an analyzer or running
 - **400 Bad Request**:
   This implies that you might have an incorrect endpoint or SAS URL. Please ensure that your endpoint is valid and that you are using the correct SAS URL for the document:
-  `https://yourendpoint/contentunderstanding/analyzers/yourAnalyzerID:analyze?api-version=2025-05-01-preview`
+  `https://yourendpoint/contentunderstanding/analyzers/yourAnalyzerID:analyze?api-version=2025-11-01`
   Confirm you are using the correct SAS URL for the document.
 - **401 Unauthorized**:
@@ -189,4 +208,4 @@ Below are common issues you might encounter when creating an analyzer or running
 2. Signature field types (e.g., in previous DI versions) are not yet supported in Content Understanding. These will be ignored during migration when creating the analyzer.
 3. The content of your training documents is retained in the CU model's metadata, under storage specifically.
    You can find more details at: https://learn.microsoft.com/en-us/legal/cognitive-services/content-understanding/transparency-note?toc=%2Fazure%2Fai-services%2Fcontent-understanding%2Ftoc.json&bc=%2Fazure%2Fai-services%2Fcontent-understanding%2Fbreadcrumb%2Ftoc.json
-4. All conversions are for Content Understanding preview.2 version only.
\ No newline at end of file
+4. All conversions are for Content Understanding GA (2025-11-01) version.
\ No newline at end of file
diff --git a/python/di_to_cu_migration_tool/constants.py b/python/di_to_cu_migration_tool/constants.py
index 09dc972..5944645 100644
--- a/python/di_to_cu_migration_tool/constants.py
+++ b/python/di_to_cu_migration_tool/constants.py
@@ -1,6 +1,10 @@
 # Supported DI versions
 DI_VERSIONS = ["generative", "neural"]
-CU_API_VERSION = "2025-05-01-preview"
+CU_API_VERSION = "2025-11-01"
+
+# Models
+COMPLETION_MODEL = "gpt-4.1"
+EMBEDDING_MODEL = "text-embedding-3-large"
 
 # constants
 MAX_FIELD_COUNT = 100
@@ -8,10 +12,12 @@
 
 # standard file names
 FIELDS_JSON = "fields.json"
+ANALYZER_JSON = "analyzer.json"
 LABELS_JSON = ".labels.json"
 VALIDATION_TXT = "validation.txt"
 PDF = ".pdf"
 OCR_JSON = ".ocr.json"
+RESULT_JSON = ".result.json"
 
 # for field type conversion
 SUPPORT_FIELD_TYPE = [
diff --git a/python/di_to_cu_migration_tool/cu_converter_generative.py b/python/di_to_cu_migration_tool/cu_converter_generative.py
index f27938d..1873468 100644
--- a/python/di_to_cu_migration_tool/cu_converter_generative.py
+++ b/python/di_to_cu_migration_tool/cu_converter_generative.py
@@ -12,7 +12,7 @@
 from rich import print # For colored output
 
 # imports from same project
-from constants import CU_API_VERSION, MAX_FIELD_LENGTH, VALID_CU_FIELD_TYPES
+from constants import CU_API_VERSION, MAX_FIELD_LENGTH, VALID_CU_FIELD_TYPES, COMPLETION_MODEL, EMBEDDING_MODEL
 from field_definitions import FieldDefinitions
 
 # schema constants subject to change
@@ -48,7 +48,7 @@ def format_angle(angle: float) -> float:
     formatted_num = f"{rounded_angle:.7f}".rstrip('0') # Remove trailing zeros
     return float(formatted_num)
 
-def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions) -> dict:
+def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions, target_container_sas_url: str = None, target_blob_folder: str = None) -> dict:
     """
     Convert DI 4.0 preview Custom Document fields.json to analyzer.json format.
     Args:
@@ -79,7 +79,11 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional
     # build analyzer.json appropriately
     analyzer_data = {
         "analyzerId": analyzer_id,
-        "baseAnalyzerId": "prebuilt-documentAnalyzer",
+        "baseAnalyzerId": "prebuilt-document",
+        "models": {
+            "completion": COMPLETION_MODEL,
+            "embedding": EMBEDDING_MODEL
+        },
         "config": {
             "returnDetails": True,
             # Add the following line as a temp workaround before service issue is fixed.
@@ -121,6 +125,17 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional
     else:
         analyzer_json_path = fields_json_path.parent / 'analyzer.json'
 
+    # Add knowledgeSources section if container info is provided
+    if target_container_sas_url and target_blob_folder:
+        analyzer_data["knowledgeSources"] = [
+            {
+                "kind": "labeledData",
+                "containerUrl": target_container_sas_url,
+                "prefix": target_blob_folder,
+                "fileListPath": ""
+            }
+        ]
+
     # Ensure target directory exists
     analyzer_json_path.parent.mkdir(parents=True, exist_ok=True)
 
@@ -287,7 +302,11 @@ def recursive_convert_di_label_to_cu_helper(value: dict) -> dict:
             di_label["valueDate"] = date_string # going with the default
     elif value_type == "number":
         try:
-            di_label["valueNumber"] = float(value.get("content")) # content can be easily converted to a float
+            content_val = value.get("content")
+            if not content_val:
+                di_label["valueNumber"] = None
+            else:
+                di_label["valueNumber"] = float(content_val) # content can be easily converted to a float
         except Exception as ex:
             # strip the string of all non-numerical values and periods
             string_value = value.get("content")
@@ -296,16 +315,27 @@
             # if more than one period exists, remove them all
             if cleaned_string.count('.') > 1:
                 print("More than one decimal point exists, so will be removing them all.")
-                cleaned_string = cleaned_string = re.sub(r'\.', '', string_value)
-            di_label["valueNumber"] = float(cleaned_string)
+                cleaned_string = re.sub(r'\.', '', string_value)
+
+            if not cleaned_string:
+                di_label["valueNumber"] = None
+            else:
+                di_label["valueNumber"] = float(cleaned_string)
     elif value_type == "integer":
         try:
-            di_label["valueInteger"] = int(value.get("content")) # content can be easily converted to an int
+            content_val = value.get("content")
+            if not content_val:
+                di_label["valueInteger"] = None
+            else:
+                di_label["valueInteger"] = int(content_val) # content can be easily converted to an int
         except Exception as ex:
             # strip the string of all non-numerical values
             string_value = value.get("content")
             cleaned_string = re.sub(r'[^0-9]', '', string_value)
-            di_label["valueInteger"] = int(cleaned_string)
+            if not cleaned_string:
+                di_label["valueInteger"] = None
+            else:
+                di_label["valueInteger"] = int(cleaned_string)
     else:
         di_label[value_part] = value.get("content")
     di_label["spans"] = value.get("spans", [])
diff --git a/python/di_to_cu_migration_tool/cu_converter_neural.py b/python/di_to_cu_migration_tool/cu_converter_neural.py
index d825f10..6a925aa 100644
--- a/python/di_to_cu_migration_tool/cu_converter_neural.py
+++ b/python/di_to_cu_migration_tool/cu_converter_neural.py
@@ -12,7 +12,7 @@
 from rich import print # For colored output
 
 # imports from same project
-from constants import COMPLETE_DATE_FORMATS, CU_API_VERSION, MAX_FIELD_LENGTH, VALID_CU_FIELD_TYPES
+from constants import COMPLETE_DATE_FORMATS, CU_API_VERSION, MAX_FIELD_LENGTH, VALID_CU_FIELD_TYPES, COMPLETION_MODEL, EMBEDDING_MODEL, ANALYZER_JSON
 from field_definitions import FieldDefinitions
 
 # schema constants subject to change
@@ -37,7 +37,7 @@ def convert_bounding_regions_to_source(page_number: int, polygon: list) -> str:
     source = f"D({page_number},{polygon_str})"
     return source
 
-def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Optional[Path], field_definitions: FieldDefinitions) -> Tuple[dict, dict]:
+def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Optional[Path], field_definitions: FieldDefinitions, target_container_sas_url: str = None, target_blob_folder: str = None) -> Tuple[dict, dict]:
     """
     Convert DI 3.1/4.0GA Custom Neural fields.json to analyzer.json format.
     Args:
@@ -67,7 +67,11 @@ def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: O
     # Build analyzer.json content
     analyzer_data = {
         "analyzerId": analyzer_prefix,
-        "baseAnalyzerId": "prebuilt-documentAnalyzer",
+        "baseAnalyzerId": "prebuilt-document",
+        "models": {
+            "completion": COMPLETION_MODEL,
+            "embedding": EMBEDDING_MODEL
+        },
         "config": {
             "returnDetails": True,
             # Add the following line as a temp workaround before service issue is fixed.
@@ -128,10 +132,21 @@ def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: O
     # Determine output path
     if target_dir:
-        analyzer_json_path = target_dir / 'analyzer.json'
+        analyzer_json_path = target_dir / ANALYZER_JSON
     else:
-        analyzer_json_path = fields_json_path.parent / 'analyzer.json'
+        analyzer_json_path = fields_json_path.parent / ANALYZER_JSON
 
+    # Add knowledgeSources section if container info is provided
+    if target_container_sas_url and target_blob_folder:
+        analyzer_data["knowledgeSources"] = [
+            {
+                "kind": "labeledData",
+                "containerUrl": target_container_sas_url,
+                "prefix": target_blob_folder,
+                "fileListPath": ""
+            }
+        ]
+
     # Ensure target directory exists
     analyzer_json_path.parent.mkdir(parents=True, exist_ok=True)
 
@@ -405,16 +420,25 @@ def creating_cu_label_for_neural(label:dict, label_type: str) -> dict:
             # if more than one period exists, remove them all
             if cleaned_string.count('.') > 1:
                 print("More than one decimal point exists, so will be removing them all.")
-                cleaned_string = cleaned_string = re.sub(r'\.', '', string_value)
-            final_content = float(cleaned_string)
+                cleaned_string = re.sub(r'\.', '', string_value)
+            if not cleaned_string:
+                final_content = None
+            else:
+                final_content = float(cleaned_string)
     elif label_type == "integer":
         try:
-            final_content = int(final_content)
+            if not final_content:
+                final_content = None
+            else:
+                final_content = int(final_content)
         except Exception as ex:
             # strip the string of all non-numerical values
             string_value = final_content
             cleaned_string = re.sub(r'[^0-9]', '', string_value)
-            final_content = int(cleaned_string)
+            if not cleaned_string:
+                final_content = None
+            else:
+                final_content = int(cleaned_string)
     elif label_type == "date":
         # dates can be dmy, mdy, ydm, or not specified
         # for CU, the format of our dates should be "%Y-%m-%d"
diff --git a/python/di_to_cu_migration_tool/di_to_cu_converter.py b/python/di_to_cu_migration_tool/di_to_cu_converter.py
index 5de14d9..0198b23 100644
--- a/python/di_to_cu_migration_tool/di_to_cu_converter.py
+++ b/python/di_to_cu_migration_tool/di_to_cu_converter.py
@@ -8,19 +8,21 @@
 import shutil
 import tempfile
 import typer
-from typing import Tuple
+from typing import Optional, Tuple
 
 # imports from external packages (in requirements.txt)
 from rich import print # For colored output
 
 # imports from same project
-from constants import DI_VERSIONS, FIELDS_JSON, LABELS_JSON, MAX_FIELD_COUNT, OCR_JSON, VALIDATION_TXT
+from constants import DI_VERSIONS, FIELDS_JSON, LABELS_JSON, MAX_FIELD_COUNT, OCR_JSON, VALIDATION_TXT, ANALYZER_JSON, RESULT_JSON
 import cu_converter_neural as cu_converter_neural
 import cu_converter_generative as cu_converter_generative
 from field_definitions import FieldDefinitions
 import field_type_conversion
 from get_ocr import run_cu_layout_ocr
 
+NON_DOCUMENT_FILES = {FIELDS_JSON, VALIDATION_TXT, ANALYZER_JSON}
+
 app = typer.Typer()
 
 def validate_field_count(DI_version, byte_fields) -> Tuple[int, bool]:
@@ -161,7 +163,7 @@ def main(
         print(f"[yellow]WARNING: The following signatures were removed from the dataset: {removed_signatures}[/yellow]\n")
 
     print("Second: Running DI to CU dataset conversion...")
-    analyzer_data, ocr_files = running_cu_conversion(temp_dir, temp_target_dir, DI_version, analyzer_prefix, removed_signatures)
+    analyzer_data, ocr_files = running_cu_conversion(temp_dir, temp_target_dir, DI_version, analyzer_prefix, removed_signatures, target_container_sas_url, target_blob_folder)
 
     # Run OCR on the pdf files
     run_cu_layout_ocr(ocr_files, temp_target_dir, subscription_key)
@@ -232,7 +234,7 @@ def running_field_type_conversion(temp_source_dir: Path, temp_dir: Path, DI_vers
 
     return removed_signatures
 
-def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str, analyzer_prefix: str, removed_signatures: list) -> Tuple[dict, list]:
+def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str, analyzer_prefix: Optional[str], removed_signatures: list, target_container_sas_url: str, target_blob_folder: str) -> Tuple[dict, list]:
     """
     Function to run the DI to CU conversion
     Args:
@@ -241,6 +243,8 @@ def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str
         DI_version (str): The version of DI being used
         analyzer_prefix (str): The prefix for the analyzer name
         removed_signatures (list): The list of removed signatures that will not be used in the CU converter
+        target_container_sas_url (str): The target container SAS URL for training data
+        target_blob_folder (str): The target blob folder prefix for training data
     """
     # Creating a FieldDefinitons object to handle the converison of definitions in the fields.json
    field_definitions = FieldDefinitions()
@@ -251,14 +255,14 @@ def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str
     assert fields_path.exists(), "fields.json is needed. Fields.json is missing from the given dataset."
     if DI_version == "generative":
-        analyzer_data = cu_converter_generative.convert_fields_to_analyzer(fields_path, analyzer_prefix, temp_target_dir, field_definitions)
+        analyzer_data = cu_converter_generative.convert_fields_to_analyzer(fields_path, analyzer_prefix, temp_target_dir, field_definitions, target_container_sas_url, target_blob_folder)
     elif DI_version == "neural":
-        analyzer_data, fields_dict = cu_converter_neural.convert_fields_to_analyzer_neural(fields_path, analyzer_prefix, temp_target_dir, field_definitions)
+        analyzer_data, fields_dict = cu_converter_neural.convert_fields_to_analyzer_neural(fields_path, analyzer_prefix, temp_target_dir, field_definitions, target_container_sas_url, target_blob_folder)
 
     ocr_files = [] # List to store paths to pdf files to get OCR results from later
     for file in files:
         file_path = root_path / file
-        if (file_path.name == FIELDS_JSON or file_path.name == VALIDATION_TXT):
+        if file_path.name in NON_DOCUMENT_FILES or file_path.name.endswith(RESULT_JSON):
             continue
         # Converting DI labels to CU labels
         if (file.endswith(LABELS_JSON)):
@@ -276,4 +280,3 @@ def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str
 
 if __name__ == "__main__":
     app()
-
diff --git a/python/di_to_cu_migration_tool/get_ocr.py b/python/di_to_cu_migration_tool/get_ocr.py
index a1b849b..403e6ae 100644
--- a/python/di_to_cu_migration_tool/get_ocr.py
+++ b/python/di_to_cu_migration_tool/get_ocr.py
@@ -14,6 +14,8 @@
 import requests
 from rich import print # For colored output
 import typer
+# imports from same project (in constants.py)
+from constants import CU_API_VERSION, COMPLETION_MODEL, EMBEDDING_MODEL
 
 def is_token_expired(token) -> bool:
     """
@@ -46,75 +48,6 @@ def get_token(credential, current_token = None) -> str:
     print("Successfully refreshed token")
     return current_token
 
-def build_analyzer(credential, current_token, host, api_version, subscriptionKey) -> str:
-    """
-    Function to create an analyzer with empty schema to get CU Layout results
-    Args:
-        credential: The Azure credential object to use for authentication.
-        current_token: The current token object to check for expiration.
-        host: The host URL for the Cognitive Services API.
-        api_version: The API version enviornmental variable to use.
-        subscriptionKey: The subscription key for the Cognitive Services API.
-    Returns:
-        str: The analyzer ID of the created analyzer.
- """ - # Get a valid token - current_token = get_token(credential, current_token) - access_token = current_token.token - headers = { - "Authorization": f"Bearer {access_token}", - "Ocp-Apim-Subscription-Key": f"{subscriptionKey}", - "Content-Type": "application/json" - } - analyzer_id = "sampleAnalyzer" + str(random.randint(0, 1000000)) - request_body = { - "analyzerId": analyzer_id, - "description": "Sample analyzer", - "baseAnalyzerId": "prebuilt-documentAnalyzer", - "config": { - "returnDetails": True, - "enableOcr": True, - "enableLayout": True, - "enableFormula": False, - "disableContentFiltering": False, - "estimateFieldSourceAndConfidence": False - }, - "fieldSchema": {}, - "warnings": [], - "status": "ready", - "processingLocation": "geography", - "mode": "standard" - } - endpoint = f"{host}/contentunderstanding/analyzers/{analyzer_id}?api-version={api_version}" - print("[yellow]Creating sample analyzer to attain CU Layout results...[/yellow]") - response = requests.put( - url=endpoint, - headers=headers, - json=request_body, - ) - response.raise_for_status() - operation_location = response.headers.get("Operation-Location", None) - if not operation_location: - print("Error: 'Operation-Location' header is missing.") - - while True: - poll_response = requests.get(operation_location, headers=headers) - poll_response.raise_for_status() - - result = poll_response.json() - status = result.get("status", "").lower() - - if status == "succeeded": - print(f"[green]Successfully created sample analyzer to gather Layout results[/green]") - break - elif status == "failed": - print(f"[red]Failed: {result}[/red]") - break - else: - print(".", end="", flush=True) - time.sleep(0.5) - return analyzer_id - def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_key: str) -> None: """ Function to run the CU Layout OCR on the list of pdf files and write to the given output directory @@ -129,7 +62,7 @@ def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_ke load_dotenv() # Set the global variables - api_version = os.getenv("API_VERSION") + api_version = os.getenv("API_VERSION") or CU_API_VERSION host = os.getenv("HOST") credential = DefaultAzureCredential() @@ -138,9 +71,8 @@ def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_ke output_dir = Path(output_dir_string) output_dir.mkdir(parents=True, exist_ok=True) - # Need to create analyzer with empty schema - analyzer_id = build_analyzer(credential, current_token, host, api_version, subscription_key) - url = f"{host}/contentunderstanding/analyzers/{analyzer_id}:analyze?api-version={api_version}" + # Use prebuilt-layout analyzer directly - no need to create a custom analyzer + url = f"{host.rstrip('/')}/contentunderstanding/analyzers/prebuilt-layout:analyzeBinary?api-version={api_version}" for file in input_files: try: @@ -150,8 +82,8 @@ def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_ke current_token = get_token(credential, current_token) headers = { "Authorization": f"Bearer {current_token.token}", - "Apim-Subscription-id": f"{subscription_key}", - "Content-Type": "application/pdf", + "Ocp-Apim-Subscription-Key": f"{subscription_key}", + "Content-Type": "application/octet-stream", } with open(file, "rb") as f: