From b160962ae901ea564f54e5c6a5a1a3c676cab47b Mon Sep 17 00:00:00 2001 From: Chien Yuan Chang Date: Wed, 19 Nov 2025 14:17:44 -0800 Subject: [PATCH] docs: review notebooks/analyzer_training.ipynb --- notebooks/analyzer_training.ipynb | 46 +++++++++++++++---------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/notebooks/analyzer_training.ipynb b/notebooks/analyzer_training.ipynb index 056a0b0..0e4fe1b 100644 --- a/notebooks/analyzer_training.ipynb +++ b/notebooks/analyzer_training.ipynb @@ -15,7 +15,7 @@ "\n", "Labeled data consists of samples that have been tagged with one or more labels to add context or meaning. This additional information is used to improve the analyzer's performance.\n", "\n", - "In your own projects, you can use [Azure AI Foundry](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-ai-foundry) to annotate your data with the labeling tool.\n", + "For your own projects, you can use [Azure AI Foundry](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-ai-foundry) to annotate your data with the labeling tool.\n", "\n", "This notebook demonstrates how to create an analyzer using your labeled data and how to analyze your files afterward.\n", "\n", @@ -23,10 +23,10 @@ "## Prerequisites\n", "1. Ensure your Azure AI service is configured by following the [configuration steps](../README.md#configure-azure-ai-service-resource).\n", "2. Set environment variables related to training data by following the steps in [Set env for training data](../docs/set_env_for_training_data_and_reference_doc.md) and adding them to the [.env](./.env) file.\n", - " - You can either set `TRAINING_DATA_SAS_URL` directly with the SAS URL for your Azure Blob container,\n", + " - You can either set `TRAINING_DATA_SAS_URL` directly with the SAS URL for your Azure Blob container.\n", " - Or set both `TRAINING_DATA_STORAGE_ACCOUNT_NAME` and `TRAINING_DATA_CONTAINER_NAME` to generate the SAS URL automatically during later steps.\n", " - Also set `TRAINING_DATA_PATH` to specify the folder path within the container where the training data will be uploaded.\n", - "3. Install the packages required to run the sample:\n" + "3. Please install the packages required to run the sample:\n" ] }, { @@ -67,11 +67,11 @@ "## Create Azure Content Understanding Client\n", "> The [AzureContentUnderstandingClient](../python/content_understanding_client.py) is a utility class that contains helper functions. Before the official release of the Content Understanding SDK, please consider it a lightweight SDK.\n", ">\n", - "> Fill in the constants **AZURE_AI_ENDPOINT**, **AZURE_AI_API_VERSION**, and **AZURE_AI_API_KEY** with the information from your Azure AI Service.\n", + "> Please fill in the constants **AZURE_AI_ENDPOINT**, **AZURE_AI_API_VERSION**, and **AZURE_AI_API_KEY** with the information from your Azure AI Service.\n", "\n", "> āš ļø Important:\n", "You must update the code below to match your Azure authentication method.\n", - "Look for the `# IMPORTANT` comments and modify those sections accordingly.\n", + "Look for the `# IMPORTANT` comments and please modify those sections accordingly.\n", "If you skip this step, the sample may not run correctly.\n", "\n", "> āš ļø Note: While using a subscription key works, using a token provider with Azure Active Directory (AAD) is safer and highly recommended for production environments." 
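For reference alongside the client-creation guidance above, here is a minimal sketch of the token-provider approach the note recommends over a subscription key. `DefaultAzureCredential` and `get_bearer_token_provider` come from the `azure-identity` package and `load_dotenv` from `python-dotenv`; the `AzureContentUnderstandingClient` constructor arguments in the commented-out portion are assumptions for illustration only, so check `python/content_understanding_client.py` in this repo for the exact signature.

```python
import os

from dotenv import load_dotenv  # python-dotenv: reads the notebooks/.env file referenced above
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# Load AZURE_AI_ENDPOINT / AZURE_AI_API_VERSION / AZURE_AI_API_KEY from .env
load_dotenv()

# AAD token provider scoped to Cognitive Services -- recommended over a raw API key
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(),
    "https://cognitiveservices.azure.com/.default",
)

# Assumed constructor arguments; verify against the utility class before use:
# from content_understanding_client import AzureContentUnderstandingClient
# client = AzureContentUnderstandingClient(
#     endpoint=os.environ["AZURE_AI_ENDPOINT"],
#     api_version=os.environ["AZURE_AI_API_VERSION"],
#     token_provider=token_provider,
# )
```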
@@ -153,18 +153,18 @@ "\n", "> **šŸ’” Note:** This step is only required **once per Azure Content Understanding resource**, unless the GPT deployment has been changed. You can skip this section if:\n", "> - This configuration has already been run once for your resource, or\n", - "> - Your administrator has already configured the model deployments for you\n", + "> - Your administrator has already configured the model deployments for you.\n", "\n", "Before using prebuilt analyzers, you need to configure the default model deployment mappings. This tells Content Understanding which model deployments to use.\n", "\n", "**Model Requirements:**\n", - "- **GPT-4.1** - Required for most prebuilt analyzers (e.g., `prebuilt-invoice`, `prebuilt-receipt`, `prebuilt-idDocument`)\n", - "- **GPT-4.1-mini** - Required for RAG analyzers (e.g., `prebuilt-documentSearch`, `prebuilt-audioSearch`, `prebuilt-videoSearch`)\n", - "- **text-embedding-3-large** - Required for all prebuilt analyzers that use embeddings\n", + "- **GPT-4.1** - Required for most prebuilt analyzers (e.g., `prebuilt-invoice`, `prebuilt-receipt`, `prebuilt-idDocument`).\n", + "- **GPT-4.1-mini** - Required for RAG analyzers (e.g., `prebuilt-documentSearch`, `prebuilt-audioSearch`, `prebuilt-videoSearch`).\n", + "- **text-embedding-3-large** - Required for all prebuilt analyzers that use embeddings.\n", "\n", "**Prerequisites:**\n", - "1. Deploy **GPT-4.1**, **GPT-4.1-mini**, and **text-embedding-3-large** models in Azure AI Foundry\n", - "2. Set `GPT_4_1_DEPLOYMENT`, `GPT_4_1_MINI_DEPLOYMENT`, and `TEXT_EMBEDDING_3_LARGE_DEPLOYMENT` in your `.env` file with the deployment names" + "1. Deploy **GPT-4.1**, **GPT-4.1-mini**, and **text-embedding-3-large** models in Azure AI Foundry.\n", + "2. Set `GPT_4_1_DEPLOYMENT`, `GPT_4_1_MINI_DEPLOYMENT`, and `TEXT_EMBEDDING_3_LARGE_DEPLOYMENT` in your `.env` file with the deployment names." ] }, { @@ -193,12 +193,12 @@ " print(f\" - {deployment}\")\n", " print(\"\\n Prebuilt analyzers require GPT-4.1, GPT-4.1-mini, and text-embedding-3-large deployments.\")\n", " print(\" Please:\")\n", - " print(\" 1. Deploy all three models in Azure AI Foundry\")\n", + " print(\" 1. Deploy all three models in Azure AI Foundry.\")\n", " print(\" 2. Add the following to notebooks/.env:\")\n", " print(\" GPT_4_1_DEPLOYMENT=\")\n", " print(\" GPT_4_1_MINI_DEPLOYMENT=\")\n", " print(\" TEXT_EMBEDDING_3_LARGE_DEPLOYMENT=\")\n", - " print(\" 3. Restart the kernel and run this cell again\")\n", + " print(\" 3. 
Restart the kernel and run this cell again.\")\n", "else:\n", " print(f\"šŸ“‹ Configuring default model deployments...\")\n", " print(f\" GPT-4.1 deployment: {GPT_4_1_DEPLOYMENT}\")\n", @@ -220,8 +220,8 @@ " except Exception as e:\n", " print(f\"āŒ Failed to configure defaults: {e}\")\n", " print(f\" This may happen if:\")\n", - " print(f\" - One or more deployment names don't exist in your Azure AI Foundry project\")\n", - " print(f\" - You don't have permission to update defaults\")\n", + " print(f\" - One or more deployment names don't exist in your Azure AI Foundry project.\")\n", + " print(f\" - You don't have permission to update defaults.\")\n", " raise\n" ] }, @@ -231,7 +231,7 @@ "source": [ "## Prepare Labeled Data\n", "In this step, we will:\n", - "- Use the environment variables `TRAINING_DATA_PATH` and SAS URL related variables set in the Prerequisites step.\n", + "- Use the environment variables `TRAINING_DATA_PATH` and SAS URL related variables set in the Prerequisites section.\n", "- Attempt to get the SAS URL from the environment variable `TRAINING_DATA_SAS_URL`.\n", "- If `TRAINING_DATA_SAS_URL` is not set, try generating it automatically using `TRAINING_DATA_STORAGE_ACCOUNT_NAME` and `TRAINING_DATA_CONTAINER_NAME` environment variables.\n", "- Verify that each document file in the local folder has corresponding `.labels.json` and `.result.json` files.\n", @@ -311,7 +311,7 @@ "metadata": {}, "source": [ "## Create Analyzer with Defined Schema\n", - "Before creating the analyzer, fill in the constant `ANALYZER_ID` with a relevant name for your task. In this example, we generate a unique suffix so that this cell can be run multiple times to create different analyzers.\n", + "Before creating the analyzer, please fill in the constant `ANALYZER_ID` with a relevant name for your task. In this example, we generate a unique suffix so that this cell can be run multiple times to create different analyzers.\n", "\n", "We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** as set in the [.env](./.env) file and used in the previous step." ] @@ -493,7 +493,7 @@ " elif val.get('type') == 'number':\n", " print(f\" {key}: {val.get('valueNumber')}\")\n", " else:\n", - " print(\"No fields extracted\")\n", + " print(\"No fields extracted.\")\n", " \n", " # Display content metadata\n", " print(f\"\\nšŸ“‹ Content Metadata:\")\n", @@ -527,16 +527,16 @@ " col_count = table.get(\"columnCount\", 0)\n", " print(f\" Table {idx}: {row_count} rows x {col_count} columns\")\n", " else:\n", - " print(\"\\nšŸ“š Document Information: Not available for this content type\")\n", + " print(\"\\nšŸ“š Document Information: Not available for this content type.\")\n", " else:\n", - " print(\"No contents available in analysis result\")\n", + " print(\"No contents available in analysis result.\")\n", " \n", " # Save the analysis result to a file\n", " saved_file_path = save_json_to_file(analysis_result, filename_prefix=\"analyzer_training_result\")\n", " # Print the full analysis result as a JSON string\n", " print(json.dumps(analysis_result, indent=2))\n", "else:\n", - " print(\"No analysis result available\")" + " print(\"No analysis result available.\")" ] }, { @@ -544,7 +544,7 @@ "metadata": {}, "source": [ "## Delete Existing Analyzer in Content Understanding Service\n", - "This snippet is optional and is included to prevent test analyzers from remaining in your service. Without deletion, the analyzer will stay in your service and may be reused in subsequent operations." 
+ "This snippet is optional and is included to help prevent test analyzers from remaining in your service. Without deletion, the analyzer will stay in your service and may be reused in subsequent operations." ] }, { @@ -580,4 +580,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file