diff --git a/notebooks/move_training_data_across_analyzers.ipynb b/notebooks/move_training_data_across_analyzers.ipynb new file mode 100644 index 00000000..428d9dbb --- /dev/null +++ b/notebooks/move_training_data_across_analyzers.ipynb @@ -0,0 +1,1299 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e3ff63c1", + "metadata": {}, + "source": [ + "# Move Training Data Across Analyzers\n", + "\n", + "This notebook demonstrates how to reuse training data from an existing analyzer when creating a new analyzer in the same Azure AI Content Understanding resource.\n", + "\n", + "## Overview\n", + "\n", + "When you have an analyzer with training data and want to create a new analyzer using the same labeled examples, you can reference the existing blob storage location without duplicating or moving the data.\n", + "\n", + "### Benefits\n", + "- **No data duplication**: Reuse existing training data without copying\n", + "- **Same resource**: Both analyzers access the same blob storage\n", + "- **Field portability**: Maintain stable `fieldId`s across analyzers\n", + "- **Rapid iteration**: Test schema variations quickly\n", + "\n", + "### Prerequisites\n", + "1. An existing analyzer with training data already configured\n", + "2. Azure AI service configured by following the [configuration steps](../README.md#configure-azure-ai-service-resource)\n", + "3. Required packages installed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f76b866", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r ../requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "a0032373", + "metadata": {}, + "source": [ + "## Create Azure AI Content Understanding Client\n", + "\n", + "> The [AzureContentUnderstandingClient](../python/content_understanding_client.py) is a utility class providing functions to interact with the Content Understanding API. Before the official release of the Content Understanding SDK, this acts as a lightweight SDK.\n", + "\n", + "> ⚠️ **Important**: Update the code below to match your Azure authentication method. Look for the `# IMPORTANT` comments and modify those sections accordingly.\n", + "\n", + "> ⚠️ **Note**: Using a subscription key works, but using a token provider with Azure Active Directory (AAD) is safer and highly recommended for production environments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcea7936", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import json\n", + "import os\n", + "import sys\n", + "import uuid\n", + "from pathlib import Path\n", + "from dotenv import find_dotenv, load_dotenv\n", + "from azure.identity import DefaultAzureCredential, get_bearer_token_provider\n", + "\n", + "load_dotenv(find_dotenv())\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "# For authentication, you can use either token-based authentication or a subscription key; only one method is required.\n", + "AZURE_AI_ENDPOINT = os.getenv(\"AZURE_AI_ENDPOINT\")\n", + "# IMPORTANT: Replace with your actual subscription key or set it in the \".env\" file if not using token authentication.\n", + "AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n", + "AZURE_AI_API_VERSION = os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\")\n", + "\n", + "# Add the parent directory to the path to use shared modules\n", + "parent_dir = Path(Path.cwd()).parent\n", + "sys.path.append(str(parent_dir))\n", + "from python.content_understanding_client import AzureContentUnderstandingClient\n", + "\n", + "credential = DefaultAzureCredential()\n", + "token_provider = get_bearer_token_provider(credential, \"https://cognitiveservices.azure.com/.default\")\n", + "\n", + "client = AzureContentUnderstandingClient(\n", + " endpoint=AZURE_AI_ENDPOINT,\n", + " api_version=AZURE_AI_API_VERSION,\n", + " # IMPORTANT: Comment out token_provider if using subscription key\n", + " token_provider=token_provider,\n", + " # IMPORTANT: Uncomment this if using subscription key\n", + " # subscription_key=AZURE_AI_API_KEY,\n", + " x_ms_useragent=\"azure-ai-content-understanding-python/move_training_data\",\n", + ")\n", + "\n", + "print(\"✅ Content Understanding client initialized successfully!\")" + ] + }, + { + "cell_type": "markdown", + "id": "92e5f27f", + "metadata": {}, + "source": [ + "## Step 1: List Available Analyzers\n", + "\n", + "First, let's see what analyzers are available in your resource. We'll look for analyzers that have training data configured." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcbc218a", + "metadata": {}, + "outputs": [], + "source": [ + "# Get all analyzers in your resource\n", + "all_analyzers = client.get_all_analyzers()\n", + "analyzers_list = all_analyzers.get('value', [])\n", + "\n", + "print(f\"Found {len(analyzers_list)} analyzer(s) in your resource\\n\")\n", + "\n", + "# Display analyzer names and IDs\n", + "if analyzers_list:\n", + " print(\"Available analyzers:\")\n", + " for idx, analyzer in enumerate(analyzers_list, 1):\n", + " analyzer_id = analyzer.get('analyzerId', 'N/A')\n", + " analyzer_name = analyzer.get('name', 'N/A')\n", + " print(f\"{idx}. ID: {analyzer_id}\")\n", + " print(f\" Name: {analyzer_name}\")\n", + " print()\n", + "else:\n", + " print(\"No analyzers found. 
Please create an analyzer with training data first.\")\n", + "    print(\"See: notebooks/analyzer_training.ipynb for guidance.\")" + ] + }, + { + "cell_type": "markdown", + "id": "8e6ae2ac", + "metadata": {}, + "source": [ + "## Step 2: Select Source Analyzer\n", + "\n", + "Specify the ID of the analyzer whose training data you want to reuse.\n", + "\n", + "Set `SOURCE_ANALYZER_ID` to an existing analyzer ID from the list above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9772b0f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Source Analyzer ID: invoiceLabeledData\n" + ] + } + ], + "source": [ + "# OPTION 1: Specify an existing analyzer ID that has training data\n", + "\n", + "# ⚠️ REQUIRED: Replace \"MyAnalyzer\" with your actual analyzer ID from the list above\n", + "# You can find available analyzer IDs in the output of the previous cell\n", + "SOURCE_ANALYZER_ID = \"MyAnalyzer\" # ← CHANGE THIS!\n", + "\n", + "# OPTION 2: Uncomment to use the first analyzer from the list\n", + "# if analyzers_list:\n", + "# SOURCE_ANALYZER_ID = analyzers_list[0].get('analyzerId')\n", + "# print(f\"Using first analyzer: {SOURCE_ANALYZER_ID}\")\n", + "\n", + "print(f\"Source Analyzer ID: {SOURCE_ANALYZER_ID}\")" + ] + }, + { + "cell_type": "markdown", + "id": "d9b1bc93", + "metadata": {}, + "source": [ + "## Step 3: Retrieve Source Analyzer Details\n", + "\n", + "Now we'll fetch the complete definition of the source analyzer, including its training data configuration." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b2c9ae0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Source Analyzer: invoiceLabeledData\n", + "Name: N/A\n", + "Description: \n", + "\n", + "Full analyzer definition:\n", + "{\n", + " \"analyzerId\": \"invoiceLabeledData\",\n", + " \"description\": \"\",\n", + " \"tags\": {\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + " },\n", + " \"createdAt\": \"2025-10-22T22:03:08Z\",\n", + " \"lastModifiedAt\": \"2025-10-22T22:03:11Z\",\n", + " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " \"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\",\n", + " \"estimateFieldSourceAndConfidence\": false\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n",
+ " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + " },\n", + " \"trainingData\": {\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\"\n", + " },\n", + " \"warnings\": [],\n", + " \"status\": \"ready\",\n", + " \"processingLocation\": \"geography\",\n", + " \"mode\": \"standard\"\n", + "}\n" + ] + } + ], + "source": [ + "# Get detailed information about the source analyzer\n", + "source_analyzer = client.get_analyzer_detail_by_id(SOURCE_ANALYZER_ID)\n", + "\n", + "print(f\"Source Analyzer: {SOURCE_ANALYZER_ID}\")\n", + "print(f\"Name: {source_analyzer.get('name', 'N/A')}\")\n", + "print(f\"Description: {source_analyzer.get('description', 'N/A')}\")\n", + "print(\"\\nFull analyzer definition:\")\n", + "print(json.dumps(source_analyzer, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "3eb0b65d", + "metadata": {}, + "source": [ + "## Step 4: Extract Training Data Configuration\n", + "\n", + "Extract the training data configuration from the source analyzer. 
This includes:\n", + "- **trainingData**: The blob container location with labeled examples\n", + "- **fieldSchema**: The field definitions\n", + "- **tags**: Project and template metadata (important for Azure AI Foundry project association)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7c57655f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📦 Training Data Configuration:\n", + "{\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\"\n", + "}\n", + "\n", + "✅ Found training data at: https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\n", + " Path prefix: labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\n", + "\n", + "📚 Knowledge Sources Configuration:\n", + "No knowledge sources configured (this is normal for standard mode)\n", + "\n", + "📋 Field Schema:\n", + "{\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + "}\n", + "\n", + "🏷️ Tags (Project & Template Metadata):\n", + "{\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + "}\n", + "\n", + "✅ Found Project ID: d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\n", + "✅ Found Template ID: 
document-2025-05-01\n", + "\n", + "💡 These tags will be copied to ensure the new analyzer appears in the same Azure AI Foundry project.\n" + ] + } + ], + "source": [ + "# Extract training data configuration\n", + "training_data_config = source_analyzer.get('trainingData')\n", + "knowledge_sources_config = source_analyzer.get('knowledgeSources')\n", + "field_schema = source_analyzer.get('fieldSchema', {})\n", + "tags = source_analyzer.get('tags', {})\n", + "\n", + "print(\"📦 Training Data Configuration:\")\n", + "if training_data_config:\n", + " print(json.dumps(training_data_config, indent=2))\n", + " container_url = training_data_config.get('containerUrl', 'N/A')\n", + " prefix = training_data_config.get('prefix', '')\n", + " print(f\"\\n✅ Found training data at: {container_url}\")\n", + " print(f\" Path prefix: {prefix}\")\n", + "else:\n", + " print(\"⚠️ No training data found in this analyzer.\")\n", + " print(\" Please select an analyzer that has training data configured.\")\n", + "\n", + "print(\"\\n📚 Knowledge Sources Configuration:\")\n", + "if knowledge_sources_config:\n", + " print(json.dumps(knowledge_sources_config, indent=2))\n", + "else:\n", + " print(\"No knowledge sources configured (this is normal for standard mode)\")\n", + "\n", + "print(\"\\n📋 Field Schema:\")\n", + "print(json.dumps(field_schema, indent=2))\n", + "\n", + "print(\"\\n🏷️ Tags (Project & Template Metadata):\")\n", + "if tags:\n", + " print(json.dumps(tags, indent=2))\n", + " project_id = tags.get('projectId')\n", + " template_id = tags.get('templateId')\n", + " if project_id:\n", + " print(f\"\\n✅ Found Project ID: {project_id}\")\n", + " if template_id:\n", + " print(f\"✅ Found Template ID: {template_id}\")\n", + " print(\"\\n💡 These tags will be copied to ensure the new analyzer appears in the same Azure AI Foundry project.\")\n", + "else:\n", + " print(\"No tags found (the new analyzer may not be associated with a Foundry project)\")" + ] + }, + { + "cell_type": "markdown", + "id": "e7770461", + "metadata": {}, + "source": [ + "## Step 5: Create New Analyzer with Existing Training Data\n", + "\n", + "Now we'll create a new analyzer that references the same training data. 
This new analyzer will:\n", + "- Use the same blob storage container and path\n", + "- Start with the same field schema (you can modify this)\n", + "- Have its own unique ID\n", + "- **Include the same tags** (projectId and templateId) to ensure it appears in the correct Azure AI Foundry project\n", + "\n", + "### Key Points:\n", + "- **Same resource**: Both analyzers are in the same Azure AI resource\n", + "- **No data duplication**: The training data stays in one place\n", + "- **Same project**: Tags ensure the analyzer appears in the same Foundry project\n", + "- **Independent lifecycle**: Each analyzer can be updated or deleted independently" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "98b0c9c3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Including tags from source analyzer (ensures correct project association in Foundry)\n", + " Project ID: d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\n", + " Template ID: document-2025-05-01\n", + "\n", + "Creating new analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "\n", + "New analyzer payload (ordered to match API structure):\n", + "{\n", + " \"description\": \"Created from invoiceLabeledData with reused training data\",\n", + " \"tags\": {\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + " },\n", + " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " \"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\",\n", + " \"estimateFieldSourceAndConfidence\": false\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " \"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " 
\"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + " },\n", + " \"mode\": \"standard\"\n", + "}\n", + "\n", + "📦 Training data will be configured separately:\n", + " Container URL: https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\n", + " Prefix: labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train\n" + ] + } + ], + "source": [ + "# Verify we have training data before proceeding\n", + "if not training_data_config:\n", + " raise ValueError(\n", + " \"Cannot proceed: Source analyzer does not have training data. \"\n", + " \"Please select an analyzer with training data or create one using the optional cell above.\"\n", + " )\n", + "\n", + "# Create a new analyzer ID\n", + "# Analyzer names must be 1-64 characters and only contain letters, numbers, dots, underscores, or hyphens\n", + "NEW_ANALYZER_ID = \"cloned-analyzer-\" + str(uuid.uuid4())\n", + "\n", + "# Build the new analyzer payload in the correct order matching the API structure\n", + "# Note: Read-only fields like createdAt, lastModifiedAt, status, etc. are omitted as they're set by the service\n", + "new_analyzer_payload = {}\n", + "\n", + "# 1. Analyzer ID (not needed as it's passed separately, but kept for reference)\n", + "# new_analyzer_payload[\"analyzerId\"] = NEW_ANALYZER_ID\n", + "\n", + "# 2. Description\n", + "new_analyzer_payload[\"description\"] = f\"Created from {SOURCE_ANALYZER_ID} with reused training data\"\n", + "\n", + "# 3. Tags (projectId and templateId) - IMPORTANT for Foundry project association\n", + "if tags:\n", + " new_analyzer_payload[\"tags\"] = tags\n", + " print(\"✅ Including tags from source analyzer (ensures correct project association in Foundry)\")\n", + " print(f\" Project ID: {tags.get('projectId', 'N/A')}\")\n", + " print(f\" Template ID: {tags.get('templateId', 'N/A')}\")\n", + "else:\n", + " print(\"⚠️ No tags found in source analyzer - new analyzer may not appear in Foundry project\")\n", + "\n", + "# 4. Base Analyzer ID (if present)\n", + "if 'baseAnalyzerId' in source_analyzer:\n", + " new_analyzer_payload['baseAnalyzerId'] = source_analyzer['baseAnalyzerId']\n", + "\n", + "# 5. Config settings\n", + "if 'config' in source_analyzer:\n", + " new_analyzer_payload['config'] = source_analyzer['config']\n", + "\n", + "# 6. Field Schema\n", + "new_analyzer_payload[\"fieldSchema\"] = field_schema\n", + "\n", + "# 7. Training Data - Will be passed separately to begin_create_analyzer()\n", + "# Note: We extract the container URL and prefix to pass as separate parameters\n", + "training_container_sas_url = training_data_config.get('containerUrl', '')\n", + "training_container_prefix = training_data_config.get('prefix', '')\n", + "\n", + "# 8. 
Knowledge Sources (if present - typically for Pro mode)\n", + "# Extract these separately if they exist\n", + "pro_mode_container_sas_url = \"\"\n", + "pro_mode_container_prefix = \"\"\n", + "if knowledge_sources_config and isinstance(knowledge_sources_config, list) and len(knowledge_sources_config) > 0:\n", + " # Get the first knowledge source (typically there's only one)\n", + " first_knowledge_source = knowledge_sources_config[0]\n", + " pro_mode_container_sas_url = first_knowledge_source.get('containerUrl', '')\n", + " pro_mode_container_prefix = first_knowledge_source.get('prefix', '')\n", + "\n", + "# 9. Mode (if present)\n", + "if 'mode' in source_analyzer:\n", + " new_analyzer_payload['mode'] = source_analyzer['mode']\n", + "\n", + "print(f\"\\nCreating new analyzer: {NEW_ANALYZER_ID}\")\n", + "print(\"\\nNew analyzer payload (ordered to match API structure):\")\n", + "print(json.dumps(new_analyzer_payload, indent=2))\n", + "\n", + "print(\"\\n📦 Training data will be configured separately:\")\n", + "print(f\" Container URL: {training_container_sas_url}\")\n", + "print(f\" Prefix: {training_container_prefix}\")\n", + "\n", + "if pro_mode_container_sas_url:\n", + " print(\"\\n📚 Pro mode reference docs will be configured separately:\")\n", + " print(f\" Container URL: {pro_mode_container_sas_url}\")\n", + " print(f\" Prefix: {pro_mode_container_prefix}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "385a0867", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzer cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b create request accepted.\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request 
a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + 
"INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request 
a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request a22ddf12-3156-4a9a-9675-7b85789a8686 in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 152.25 seconds.\n", + "INFO:python.content_understanding_client:Request result is ready after 152.25 seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Successfully created new analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "\n", + "Creation result:\n", + "{\n", + " \"id\": \"a22ddf12-3156-4a9a-9675-7b85789a8686\",\n", + " \"status\": \"Succeeded\",\n", + " \"result\": {\n", + " \"analyzerId\": \"cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\",\n", + " \"description\": \"Created from invoiceLabeledData with reused training data\",\n", + " \"tags\": {\n", + " \"projectId\": \"d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb\",\n", + " \"templateId\": \"document-2025-05-01\"\n", + " },\n", + " \"createdAt\": \"2025-10-22T22:44:56Z\",\n", + " \"lastModifiedAt\": \"2025-10-22T22:47:27Z\",\n", + " \"baseAnalyzerId\": \"prebuilt-documentAnalyzer\",\n", + " \"config\": {\n", + " \"returnDetails\": true,\n", + " \"enableOcr\": true,\n", + " \"enableLayout\": true,\n", + " \"enableFormula\": false,\n", + " \"disableContentFiltering\": false,\n", + " \"tableFormat\": \"html\",\n", + " \"estimateFieldSourceAndConfidence\": false\n", + " },\n", + " \"fieldSchema\": {\n", + " \"fields\": {\n", + " \"CompanyName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the pharmaceutical company involved in the rebate program\"\n", + " },\n", + " \"ProductDetails\": {\n", + " \"type\": \"array\",\n", + " \"description\": \"List of products with rebate and unit details\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"description\": \"Details of a single product\",\n", + " \"properties\": {\n", + " \"ProductPackageCode\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Code representing the product or package\"\n", + " },\n", + " \"ProductName\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Name of the product\"\n", + " },\n", + " \"FfsimcoRecordId\": {\n", + " \"type\": \"string\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Record ID for FFSIMCO\"\n", + " },\n", + " 
\"RebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Rebate amount per unit of the product\"\n", + " },\n", + " \"AdjustedRebatePerUnit\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Adjusted rebate amount per unit\"\n", + " },\n", + " \"UnitsInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units invoiced\"\n", + " },\n", + " \"UnitsPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Number of units for which payment was made\"\n", + " },\n", + " \"RebateAmountInvoiced\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount invoiced\"\n", + " },\n", + " \"RebateAmountPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total rebate amount paid\"\n", + " }\n", + " }\n", + " }\n", + " },\n", + " \"TotalPaid\": {\n", + " \"type\": \"number\",\n", + " \"method\": \"extract\",\n", + " \"description\": \"Total payment amount \"\n", + " }\n", + " }\n", + " },\n", + " \"trainingData\": {\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train/\"\n", + " },\n", + " \"warnings\": [],\n", + " \"status\": \"ready\",\n", + " \"processingLocation\": \"geography\",\n", + " \"mode\": \"standard\"\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "# Create the new analyzer\n", + "# Pass training data and knowledge sources as separate parameters\n", + "response = client.begin_create_analyzer(\n", + " NEW_ANALYZER_ID,\n", + " analyzer_template=new_analyzer_payload,\n", + " training_storage_container_sas_url=training_container_sas_url,\n", + " training_storage_container_path_prefix=training_container_prefix,\n", + ")\n", + "\n", + "result = client.poll_result(response)\n", + "\n", + "if result and result.get('status') == 'Succeeded':\n", + " print(f\"✅ Successfully created new analyzer: {NEW_ANALYZER_ID}\")\n", + " print(\"\\nCreation result:\")\n", + " print(json.dumps(result, indent=2))\n", + "else:\n", + " print(\"⚠️ Analyzer creation encountered an issue.\")\n", + " print(json.dumps(result, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "63295659", + "metadata": {}, + "source": [ + "## Step 6: Verify the New Analyzer\n", + "\n", + "Let's confirm the new analyzer was created correctly and is using the same training data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "685ff06f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "New Analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + "Name: N/A\n", + "Description: Created from invoiceLabeledData with reused training data\n", + "\n", + "Training Data Configuration:\n", + "{\n", + " \"containerUrl\": \"https://staistudiote203841201294.blob.core.windows.net/7c123b64-9378-4fa7-a807-081efa839c00-cu\",\n", + " \"kind\": \"blob\",\n", + " \"prefix\": \"labelingProjects/d7afeaa4-fe05-4df7-bd7c-46f3a94a96cb/train/\"\n", + "}\n", + "\n", + "✅ Verification successful: Both analyzers reference the same training data location!\n" + ] + } + ], + "source": [ + "# Get details of the newly created analyzer\n", + "new_analyzer = client.get_analyzer_detail_by_id(NEW_ANALYZER_ID)\n", + "\n", + "print(f\"New Analyzer: {NEW_ANALYZER_ID}\")\n", + "print(f\"Name: {new_analyzer.get('name', 'N/A')}\")\n", + "print(f\"Description: {new_analyzer.get('description', 'N/A')}\")\n", + "print(\"\\nTraining Data Configuration:\")\n", + "print(json.dumps(new_analyzer.get('trainingData', {}), indent=2))\n", + "\n", + "# Verify the training data location matches\n", + "new_training_data = new_analyzer.get('trainingData', {})\n", + "original_container = training_data_config.get('containerUrl', '')\n", + "new_container = new_training_data.get('containerUrl', '')\n", + "\n", + "if original_container == new_container:\n", + " print(\"\\n✅ Verification successful: Both analyzers reference the same training data location!\")\n", + "else:\n", + " print(\"\\n⚠️ Warning: Training data locations don't match.\")\n", + " print(f\"Original: {original_container}\")\n", + " print(f\"New: {new_container}\")" + ] + }, + { + "cell_type": "markdown", + "id": "fe3352c9", + "metadata": {}, + "source": [ + "## Step 7: Test Both Analyzers\n", + "\n", + "Now let's test both analyzers with a sample file to verify they both work correctly with the shared training data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "cc934efd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing with file: ../data/receipt.png\n" + ] + } + ], + "source": [ + "# Specify a test file - adjust this path based on your analyzer type\n", + "# For receipt analyzers:\n", + "test_file = \"../data/receipt.png\"\n", + "\n", + "# For invoice analyzers:\n", + "# test_file = \"../data/invoice.pdf\"\n", + "\n", + "# For custom documents:\n", + "# test_file = \"../data/your-document.pdf\"\n", + "\n", + "# Verify the file exists\n", + "if not Path(test_file).exists():\n", + " print(f\"⚠️ Test file not found: {test_file}\")\n", + " print(\"Please adjust the test_file path to match your use case.\")\n", + "else:\n", + " print(f\"Testing with file: {test_file}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "273dd85c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📝 Analyzing with SOURCE analyzer: invoiceLabeledData\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzing file ../data/receipt.png with analyzer: invoiceLabeledData\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request 80b00372-a498-4564-9ff1-1e6901778a2d in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 4.71 seconds.\n", + "INFO:python.content_understanding_client:Request result is ready after 4.71 seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Source Analyzer Results:\n", + "Extracted 3 field(s)\n", + " - CompanyName: {'type': 'string', 'valueString': 'Contoso'}\n", + " - ProductDetails: {'type': 'array'}\n", + " - TotalPaid: {'type': 'number', 'valueNumber': 2516.28}\n" + ] + } + ], + "source": [ + "# Test the original analyzer\n", + "if Path(test_file).exists():\n", + " print(f\"\\n📝 Analyzing with SOURCE analyzer: {SOURCE_ANALYZER_ID}\")\n", + " response_source = client.begin_analyze(SOURCE_ANALYZER_ID, file_location=test_file)\n", + " result_source = client.poll_result(response_source)\n", + " \n", + " print(\"\\nSource Analyzer Results:\")\n", + " # Print a summary of extracted fields\n", + " if result_source.get('status') == 'Succeeded':\n", + " result_data = result_source.get('result', {})\n", + " fields = result_data.get('contents', [{}])[0].get('fields', {})\n", + " print(f\"Extracted {len(fields)} field(s)\")\n", + " for field_name, field_value in fields.items():\n", + " print(f\" - {field_name}: {field_value}\")\n", + " else:\n", + " print(json.dumps(result_source, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "e9654313", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "📝 Analyzing with NEW analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:python.content_understanding_client:Analyzing file ../data/receipt.png with analyzer: cloned-analyzer-c073f24d-5659-42ed-8ac8-b083bde79a9b\n", + 
"INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request 5d982b83-4b1c-4e99-b045-48e36cb5a7e3 in progress ...\n", + "INFO:python.content_understanding_client:Request result is ready after 4.72 seconds.\n", + "INFO:python.content_understanding_client:Request result is ready after 4.72 seconds.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "New Analyzer Results:\n", + "Extracted 3 field(s)\n", + " - CompanyName: {'type': 'string', 'valueString': 'Contoso'}\n", + " - ProductDetails: {'type': 'array'}\n", + " - TotalPaid: {'type': 'number', 'valueNumber': 2516.28}\n", + "\n", + "✅ Both analyzers successfully processed the file using the shared training data!\n" + ] + } + ], + "source": [ + "# Test the new analyzer\n", + "if Path(test_file).exists():\n", + " print(f\"\\n📝 Analyzing with NEW analyzer: {NEW_ANALYZER_ID}\")\n", + " response_new = client.begin_analyze(NEW_ANALYZER_ID, file_location=test_file)\n", + " result_new = client.poll_result(response_new)\n", + " \n", + " print(\"\\nNew Analyzer Results:\")\n", + " # Print a summary of extracted fields\n", + " if result_new.get('status') == 'Succeeded':\n", + " result_data = result_new.get('result', {})\n", + " fields = result_data.get('contents', [{}])[0].get('fields', {})\n", + " print(f\"Extracted {len(fields)} field(s)\")\n", + " for field_name, field_value in fields.items():\n", + " print(f\" - {field_name}: {field_value}\")\n", + " else:\n", + " print(json.dumps(result_new, indent=2))\n", + " \n", + " print(\"\\n✅ Both analyzers successfully processed the file using the shared training data!\")" + ] + }, + { + "cell_type": "markdown", + "id": "f913b6dd", + "metadata": {}, + "source": [ + "## Step 8: Compare Results (Optional)\n", + "\n", + "Let's compare the full results from both analyzers side by side." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6467b3f", + "metadata": {}, + "outputs": [], + "source": [ + "if Path(test_file).exists():\n", + " print(\"=\" * 80)\n", + " print(\"SOURCE ANALYZER FULL RESULTS\")\n", + " print(\"=\" * 80)\n", + " print(json.dumps(result_source, indent=2))\n", + " \n", + " print(\"\\n\" + \"=\" * 80)\n", + " print(\"NEW ANALYZER FULL RESULTS\")\n", + " print(\"=\" * 80)\n", + " print(json.dumps(result_new, indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "5f65f05c", + "metadata": {}, + "source": [ + "## Step 9: Cleanup (Optional)\n", + "\n", + "If you want to clean up the test analyzers, you can delete them. In production, you typically keep analyzers for reuse.\n", + "\n", + "⚠️ **Warning**: This will permanently delete the analyzer. The training data in blob storage will remain unaffected." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00cde3ff", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment to delete the new analyzer\n", + "# print(f\"Deleting new analyzer: {NEW_ANALYZER_ID}\")\n", + "# client.delete_analyzer(NEW_ANALYZER_ID)\n", + "# print(\"✅ New analyzer deleted\")\n", + "\n", + "# Uncomment to also delete the source analyzer (be careful!)\n", + "# print(f\"Deleting source analyzer: {SOURCE_ANALYZER_ID}\")\n", + "# client.delete_analyzer(SOURCE_ANALYZER_ID)\n", + "# print(\"✅ Source analyzer deleted\")" + ] + }, + { + "cell_type": "markdown", + "id": "d952dfef", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "🎉 **Congratulations!** You have successfully:\n", + "\n", + "✅ Retrieved an existing analyzer with training data \n", + "✅ Extracted the training data configuration \n", + "✅ Created a new analyzer referencing the same training data \n", + "✅ Verified both analyzers work correctly \n", + "✅ Tested both analyzers with a sample file \n", + "\n", + "### Key Takeaways\n", + "\n", + "- **No data duplication**: Both analyzers reference the same blob storage location\n", + "- **Same resource**: Both analyzers use the same authentication and access permissions\n", + "- **Field portability**: You can maintain stable `fieldId`s across different analyzer versions\n", + "- **Rapid iteration**: Test schema changes quickly without re-uploading training data\n", + "\n", + "### Best Practices\n", + "\n", + "1. **Stable field IDs**: Keep `fieldId`s consistent across analyzers for easier migration\n", + "2. **Version control**: Maintain analyzer schemas in source control\n", + "3. **Documentation**: Document which blob paths contain which training datasets\n", + "4. **Testing**: Always test a new analyzer before deleting the original\n", + "5. **Naming conventions**: Use descriptive analyzer IDs that indicate purpose and version\n", + "\n", + "### Next Steps\n", + "\n", + "- Modify the field schema in the new analyzer to test different configurations\n", + "- Add additional training data to improve both analyzers\n", + "- Use this pattern to create A/B testing scenarios\n", + "- Explore other notebooks:\n", + " - [analyzer_training.ipynb](./analyzer_training.ipynb) - Create analyzers with training data\n", + " - [field_extraction.ipynb](./field_extraction.ipynb) - Extract fields from documents\n", + " - [management.ipynb](./management.ipynb) - Manage analyzer lifecycle" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python/di_to_cu_migration_tool/README.md b/python/di_to_cu_migration_tool/README.md index e473ad0a..737a4ba5 100644 --- a/python/di_to_cu_migration_tool/README.md +++ b/python/di_to_cu_migration_tool/README.md @@ -1,13 +1,13 @@ # Document Intelligence to Content Understanding Migration Tool (Python) -Welcome! This tool helps convert your Document Intelligence (DI) datasets to the Content Understanding (CU) **Preview.2** 2025-05-01-preview format, as used in AI Foundry. The following DI versions are supported: +Welcome! 
This tool helps convert your Document Intelligence (DI) datasets to the Content Understanding (CU) **GA** 2025-11-01 format, as used in AI Foundry. The following DI versions are supported: - Custom Extraction Model DI 3.1 GA (2023-07-31) to DI 4.0 GA (2024-11-30) (Document Intelligence Studio) → DI-version = neural - Document Field Extraction Model 4.0 Preview (2024-07-31-preview) (AI Foundry / AI Services / Vision + Document / Document Field Extraction) → DI-version = generative To identify the version of your Document Intelligence dataset, please consult the sample documents in this folder to match your format. You can also verify the version by reviewing your DI project's user experience. For instance, Custom Extraction DI 3.1/4.0 GA appears in Document Intelligence Studio (https://documentintelligence.ai.azure.com/studio), whereas Document Field Extraction DI 4.0 Preview is only available on Azure AI Foundry's preview service (https://ai.azure.com/explore/aiservices/vision/document/extraction). -For migrating from these DI versions to Content Understanding Preview.2, this tool first converts the DI dataset into a CU-compatible format. After conversion, you can create a Content Understanding Analyzer trained on your converted CU dataset. Additionally, you have the option to test its quality against any sample documents. +For migrating from these DI versions to Content Understanding GA (2025-11-01), this tool first converts the DI dataset into a CU-compatible format. After conversion, you can create a Content Understanding Analyzer trained on your converted CU dataset. Additionally, you have the option to test its quality against any sample documents. ## Details About the Tools @@ -27,8 +27,26 @@ Here is a detailed breakdown of the three CLI tools and their functionality: * **call_analyze.py** * This CLI tool verifies that the migration completed successfully and assesses the quality of the created analyzer. + ## Setup +## Prerequisites + +⚠️ **IMPORTANT: Before using this migration tool**, ensure your Azure AI Foundry resource is properly configured for Content Understanding: + +1. **Configure Default Model Deployments**: You must set default model deployments for Content Understanding in your Azure AI Foundry resource before creating or running analyzers. + + To do this, walk through the prerequisites here: + - [REST API Quickstart Guide](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-rest-api?tabs=portal%2Cdocument) + + For more details about default model deployments, see this documentation: + - [Models and Deployments Documentation](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/concepts/models-deployments) + +2. **Verify you can create and use a basic Content Understanding analyzer** in your Azure AI Foundry resource before attempting migration. This ensures all prerequisites are met. + +3. Complete all setup steps outlined in the REST API documentation above, including authentication and model deployment configuration. + +### Tool Setup Please follow these steps to set up the tool: 1. Install dependencies by running: @@ -43,7 +61,7 @@ Please follow these steps to set up the tool: - **SUBSCRIPTION_KEY:** Update to your Azure AI Service API Key or Subscription ID to authenticate the API requests.
   - Locate your API Key here: ![Azure AI Service Endpoints With Keys](assets/endpoint-with-keys.png)
   - If using Azure Active Directory (AAD), please refer to your Subscription ID: ![Azure AI Service Subscription ID](assets/subscription-id.png)
-  - **API_VERSION:** This is preset to the CU Preview.2 version; no changes are needed.
+  - **API_VERSION:** This is preset to the CU GA version (2025-11-01); no changes are needed.
 
 ## How to Locate Your Document Field Extraction Dataset for Migration
 
@@ -73,8 +91,12 @@ To obtain SAS URLs for a file or folder for any container URL arguments, please
 3. Configure permissions and expiry for your SAS URL as follows:
    - For the **DI source dataset**, please select permissions: _**Read & List**_
+     Example SAS URL (signature redacted): `https://<storage-account>.blob.core.windows.net/<di-container>?sv=...&sr=c&sp=rl&sig=<signature>`
+
   - For the **CU target dataset**, please select permissions: _**Read, Add, Create, & Write**_
+     Example SAS URL (signature redacted): `https://<storage-account>.blob.core.windows.net/<cu-container>?sv=...&sr=c&sp=racwl&sig=<signature>`
+
 After configuring, click **Generate SAS Token and URL** and copy the URL shown under **Blob SAS URL**.
 ![Generate SAS Pop-Up](assets/generate-sas-pop-up.png)
 
@@ -155,7 +177,7 @@ Below are common issues you might encounter when creating an analyzer or running
 - **400 Bad Request** errors: Please validate the following:
   - The endpoint URL is valid. Example:
-    `https://yourEndpoint/contentunderstanding/analyzers/yourAnalyzerID?api-version=2025-05-01-preview`
+    `https://yourEndpoint/contentunderstanding/analyzers/yourAnalyzerID?api-version=2025-11-01`
   - Your converted CU dataset respects the naming constraints below. If needed, please manually correct the `analyzer.json` fields:
     - Field names start with a letter or underscore
     - Field name length must be between 1 and 64 characters
@@ -174,7 +196,7 @@ Below are common issues you might encounter when creating an analyzer or running
 - **400 Bad Request**: This implies that you might have an incorrect endpoint or SAS URL. Please ensure that your endpoint is valid and that you are using the correct SAS URL for the document:
-  `https://yourendpoint/contentunderstanding/analyzers/yourAnalyzerID:analyze?api-version=2025-05-01-preview`
+  `https://yourendpoint/contentunderstanding/analyzers/yourAnalyzerID:analyze?api-version=2025-11-01`
   Confirm you are using the correct SAS URL for the document.
 - **401 Unauthorized**:
@@ -189,4 +211,4 @@ Below are common issues you might encounter when creating an analyzer or running
 2. Signature field types (e.g., in previous DI versions) are not yet supported in Content Understanding. These will be ignored during migration when creating the analyzer.
 3. The content of your training documents is retained in the CU model's metadata, under storage specifically. You can find more details at: https://learn.microsoft.com/en-us/legal/cognitive-services/content-understanding/transparency-note?toc=%2Fazure%2Fai-services%2Fcontent-understanding%2Ftoc.json&bc=%2Fazure%2Fai-services%2Fcontent-understanding%2Fbreadcrumb%2Ftoc.json
-4. All conversions are for Content Understanding preview.2 version only.
\ No newline at end of file
+4. All conversions are for the Content Understanding GA (2025-11-01) version only. A sketch of the GA-format analyzer payload produced by the conversion appears below.
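For reference, here is a minimal sketch of the GA-format analyzer payload this tool now builds. The `baseAnalyzerId`, `models`, and `knowledgeSources` entries mirror the converter changes in `cu_converter_generative.py` and `cu_converter_neural.py`, while the analyzer ID, container SAS URL, and blob folder are placeholders you would supply:

```python
# Illustrative sketch only: the shape of the analyzer.json emitted for the CU GA (2025-11-01) API.
# The analyzer ID, container SAS URL, and blob folder below are placeholders.
analyzer_payload = {
    "analyzerId": "myMigratedAnalyzer",
    "baseAnalyzerId": "prebuilt-document",
    "models": {
        "completion": "gpt-4.1",                # default completion model deployment
        "embedding": "text-embedding-3-large",  # default embedding model deployment
    },
    "config": {
        "returnDetails": True,
    },
    "knowledgeSources": [
        {
            "kind": "labeledData",
            "containerUrl": "<your-target-container-sas-url>",
            "prefix": "<your-target-blob-folder>",
            "fileListPath": "",
        }
    ],
}
```

Note that the converters only add the `knowledgeSources` block when a target container SAS URL and blob folder are provided.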
\ No newline at end of file diff --git a/python/di_to_cu_migration_tool/constants.py b/python/di_to_cu_migration_tool/constants.py index 09dc9721..73f9e0ce 100644 --- a/python/di_to_cu_migration_tool/constants.py +++ b/python/di_to_cu_migration_tool/constants.py @@ -1,6 +1,6 @@ # Supported DI versions DI_VERSIONS = ["generative", "neural"] -CU_API_VERSION = "2025-05-01-preview" +CU_API_VERSION = "2025-11-01" # constants MAX_FIELD_COUNT = 100 diff --git a/python/di_to_cu_migration_tool/cu_converter_generative.py b/python/di_to_cu_migration_tool/cu_converter_generative.py index f27938d1..f384dc78 100644 --- a/python/di_to_cu_migration_tool/cu_converter_generative.py +++ b/python/di_to_cu_migration_tool/cu_converter_generative.py @@ -48,7 +48,7 @@ def format_angle(angle: float) -> float: formatted_num = f"{rounded_angle:.7f}".rstrip('0') # Remove trailing zeros return float(formatted_num) -def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions) -> dict: +def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Path, field_definitions: FieldDefinitions, target_container_sas_url: str = None, target_blob_folder: str = None) -> dict: """ Convert DI 4.0 preview Custom Document fields.json to analyzer.json format. Args: @@ -79,7 +79,11 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional # build analyzer.json appropriately analyzer_data = { "analyzerId": analyzer_id, - "baseAnalyzerId": "prebuilt-documentAnalyzer", + "baseAnalyzerId": "prebuilt-document", + "models": { + "completion": "gpt-4.1", + "embedding": "text-embedding-3-large" + }, "config": { "returnDetails": True, # Add the following line as a temp workaround before service issue is fixed. @@ -121,6 +125,17 @@ def convert_fields_to_analyzer(fields_json_path: Path, analyzer_prefix: Optional else: analyzer_json_path = fields_json_path.parent / 'analyzer.json' + # Add knowledgeSources section if container info is provided + if target_container_sas_url and target_blob_folder: + analyzer_data["knowledgeSources"] = [ + { + "kind": "labeledData", + "containerUrl": target_container_sas_url, + "prefix": target_blob_folder, + "fileListPath": "" + } + ] + # Ensure target directory exists analyzer_json_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/python/di_to_cu_migration_tool/cu_converter_neural.py b/python/di_to_cu_migration_tool/cu_converter_neural.py index d825f10e..64d4d33b 100644 --- a/python/di_to_cu_migration_tool/cu_converter_neural.py +++ b/python/di_to_cu_migration_tool/cu_converter_neural.py @@ -37,7 +37,7 @@ def convert_bounding_regions_to_source(page_number: int, polygon: list) -> str: source = f"D({page_number},{polygon_str})" return source -def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Optional[Path], field_definitions: FieldDefinitions) -> Tuple[dict, dict]: +def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: Optional[str], target_dir: Optional[Path], field_definitions: FieldDefinitions, target_container_sas_url: str = None, target_blob_folder: str = None) -> Tuple[dict, dict]: """ Convert DI 3.1/4.0GA Custom Neural fields.json to analyzer.json format. 
Args: @@ -67,7 +67,11 @@ def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: O # Build analyzer.json content analyzer_data = { "analyzerId": analyzer_prefix, - "baseAnalyzerId": "prebuilt-documentAnalyzer", + "baseAnalyzerId": "prebuilt-document", + "models": { + "completion": "gpt-4.1", + "embedding": "text-embedding-3-large" + }, "config": { "returnDetails": True, # Add the following line as a temp workaround before service issue is fixed. @@ -132,6 +136,17 @@ def convert_fields_to_analyzer_neural(fields_json_path: Path, analyzer_prefix: O else: analyzer_json_path = fields_json_path.parent / 'analyzer.json' + # Add knowledgeSources section if container info is provided + if target_container_sas_url and target_blob_folder: + analyzer_data["knowledgeSources"] = [ + { + "kind": "labeledData", + "containerUrl": target_container_sas_url, + "prefix": target_blob_folder, + "fileListPath": "" + } + ] + # Ensure target directory exists analyzer_json_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/python/di_to_cu_migration_tool/di_to_cu_converter.py b/python/di_to_cu_migration_tool/di_to_cu_converter.py index 5de14d91..c84111b8 100644 --- a/python/di_to_cu_migration_tool/di_to_cu_converter.py +++ b/python/di_to_cu_migration_tool/di_to_cu_converter.py @@ -8,7 +8,7 @@ import shutil import tempfile import typer -from typing import Tuple +from typing import Optional, Tuple # imports from external packages (in requirements.txt) from rich import print # For colored output @@ -161,7 +161,7 @@ def main( print(f"[yellow]WARNING: The following signatures were removed from the dataset: {removed_signatures}[/yellow]\n") print("Second: Running DI to CU dataset conversion...") - analyzer_data, ocr_files = running_cu_conversion(temp_dir, temp_target_dir, DI_version, analyzer_prefix, removed_signatures) + analyzer_data, ocr_files = running_cu_conversion(temp_dir, temp_target_dir, DI_version, analyzer_prefix, removed_signatures, target_container_sas_url, target_blob_folder) # Run OCR on the pdf files run_cu_layout_ocr(ocr_files, temp_target_dir, subscription_key) @@ -232,15 +232,17 @@ def running_field_type_conversion(temp_source_dir: Path, temp_dir: Path, DI_vers return removed_signatures -def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str, analyzer_prefix: str, removed_signatures: list) -> Tuple[dict, list]: +def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str, analyzer_prefix: Optional[str], removed_signatures: list, target_container_sas_url: str, target_blob_folder: str) -> Tuple[dict, list]: """ - Function to run the DI to CU conversion + Function to run the CU conversion Args: temp_dir (Path): The path to the source directory temp_target_dir (Path): The path to the target directory DI_version (str): The version of DI being used analyzer_prefix (str): The prefix for the analyzer name removed_signatures (list): The list of removed signatures that will not be used in the CU converter + target_container_sas_url (str): The target container SAS URL for training data + target_blob_folder (str): The target blob folder prefix for training data """ # Creating a FieldDefinitons object to handle the converison of definitions in the fields.json field_definitions = FieldDefinitions() @@ -251,9 +253,9 @@ def running_cu_conversion(temp_dir: Path, temp_target_dir: Path, DI_version: str assert fields_path.exists(), "fields.json is needed. Fields.json is missing from the given dataset." 
if DI_version == "generative": - analyzer_data = cu_converter_generative.convert_fields_to_analyzer(fields_path, analyzer_prefix, temp_target_dir, field_definitions) + analyzer_data = cu_converter_generative.convert_fields_to_analyzer(fields_path, analyzer_prefix, temp_target_dir, field_definitions, target_container_sas_url, target_blob_folder) elif DI_version == "neural": - analyzer_data, fields_dict = cu_converter_neural.convert_fields_to_analyzer_neural(fields_path, analyzer_prefix, temp_target_dir, field_definitions) + analyzer_data, fields_dict = cu_converter_neural.convert_fields_to_analyzer_neural(fields_path, analyzer_prefix, temp_target_dir, field_definitions, target_container_sas_url, target_blob_folder) ocr_files = [] # List to store paths to pdf files to get OCR results from later for file in files: diff --git a/python/di_to_cu_migration_tool/get_ocr.py b/python/di_to_cu_migration_tool/get_ocr.py index a1b849bf..32c0584f 100644 --- a/python/di_to_cu_migration_tool/get_ocr.py +++ b/python/di_to_cu_migration_tool/get_ocr.py @@ -70,7 +70,11 @@ def build_analyzer(credential, current_token, host, api_version, subscriptionKey request_body = { "analyzerId": analyzer_id, "description": "Sample analyzer", - "baseAnalyzerId": "prebuilt-documentAnalyzer", + "baseAnalyzerId": "prebuilt-document", + "models": { + "completion": "gpt-4.1", + "embedding": "text-embedding-3-large" + }, "config": { "returnDetails": True, "enableOcr": True, @@ -82,8 +86,7 @@ def build_analyzer(credential, current_token, host, api_version, subscriptionKey "fieldSchema": {}, "warnings": [], "status": "ready", - "processingLocation": "geography", - "mode": "standard" + "processingLocation": "geography" } endpoint = f"{host}/contentunderstanding/analyzers/{analyzer_id}?api-version={api_version}" print("[yellow]Creating sample analyzer to attain CU Layout results...[/yellow]") @@ -138,9 +141,8 @@ def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_ke output_dir = Path(output_dir_string) output_dir.mkdir(parents=True, exist_ok=True) - # Need to create analyzer with empty schema - analyzer_id = build_analyzer(credential, current_token, host, api_version, subscription_key) - url = f"{host}/contentunderstanding/analyzers/{analyzer_id}:analyze?api-version={api_version}" + # Use prebuilt-read analyzer directly - no need to create a custom analyzer + url = f"{host}/contentunderstanding/analyzers/prebuilt-read:analyze?api-version={api_version}" for file in input_files: try: @@ -150,7 +152,7 @@ def run_cu_layout_ocr(input_files: list, output_dir_string: str, subscription_ke current_token = get_token(credential, current_token) headers = { "Authorization": f"Bearer {current_token.token}", - "Apim-Subscription-id": f"{subscription_key}", + "Ocp-Apim-Subscription-Key": f"{subscription_key}", "Content-Type": "application/pdf", } diff --git a/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json b/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json index bfa151fd..f1507dcd 100644 --- a/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json +++ b/python/di_to_cu_migration_tool/sample_documents/analyzer_result.json @@ -3,7 +3,7 @@ "status": "Succeeded", "result": { "analyzerId": "mySampleAnalyzer", - "apiVersion": "2025-05-01-preview", + "apiVersion": "2025-11-01", "createdAt": "2025-05-30T15:47:15Z", "warnings": [], "contents": [