From 5243cae77b7a3edb2aac2f4170950d1497cc1fce Mon Sep 17 00:00:00 2001
From: Amna Mubashar
Date: Thu, 25 Sep 2025 15:13:19 +0200
Subject: [PATCH] Update the tutorial

---
 index.toml                                    |  10 +-
 ...=> 28_Structured_Output_With_OpenAI.ipynb} | 240 ++++++++++--------
 2 files changed, 142 insertions(+), 108 deletions(-)
 rename tutorials/{28_Structured_Output_With_Loop.ipynb => 28_Structured_Output_With_OpenAI.ipynb} (63%)

diff --git a/index.toml b/index.toml
index ccd03cc..50db52f 100644
--- a/index.toml
+++ b/index.toml
@@ -16,13 +16,13 @@ dependencies = ["datasets>=2.6.1", "sentence-transformers>=4.1.0"]
 featured = true
 
 [[tutorial]]
-title = "Generating Structured Output with Loop-Based Auto-Correction"
-description = "Learn how to extract structured data using an LLM, and to validate the generated output against a predefined schema."
-level = "intermediate"
+title = "Generating Structured Output with OpenAIChatGenerator"
+description = "Learn how to generate structured output using OpenAIChatGenerator and validate the generated output against a predefined schema."
+level = "beginner"
 weight = 71
-notebook = "28_Structured_Output_With_Loop.ipynb"
+notebook = "28_Structured_Output_With_OpenAI.ipynb"
 aliases = []
-completion_time = "15 min"
+completion_time = "10 min"
 created_at = 2023-11-30
 dependencies = ["colorama"]
 
diff --git a/tutorials/28_Structured_Output_With_Loop.ipynb b/tutorials/28_Structured_Output_With_OpenAI.ipynb
similarity index 63%
rename from tutorials/28_Structured_Output_With_Loop.ipynb
rename to tutorials/28_Structured_Output_With_OpenAI.ipynb
index 5041dbc..80ade77 100644
--- a/tutorials/28_Structured_Output_With_Loop.ipynb
+++ b/tutorials/28_Structured_Output_With_OpenAI.ipynb
@@ -6,18 +6,18 @@
    "id": "AVBtOVlNJ51C"
   },
   "source": [
-    "# Tutorial: Generating Structured Output with Loop-Based Auto-Correction\n",
+    "# Tutorial: Generating Structured Output with OpenAIChatGenerator\n",
     "\n",
-    "- **Level**: Intermediate\n",
-    "- **Time to complete**: 15 minutes\n",
+    "- **Level**: Beginner\n",
+    "- **Time to complete**: 10 minutes\n",
     "- **Prerequisites**: You must have an API key from an active OpenAI account as this tutorial is using the gpt-4o-mini model by OpenAI.\n",
-    "- **Components Used**: `PromptBuilder`, `OpenAIChatGenerator`, `OutputValidator` (Custom component)\n",
-    "- **Goal**: After completing this tutorial, you will have built a system that extracts unstructured data, puts it in a JSON schema, and automatically corrects errors in the JSON output from a large language model (LLM) to make sure it follows the specified structure.\n",
+    "- **Components Used**: `ChatPromptBuilder`, `OpenAIChatGenerator`, `OutputValidator` (Custom component)\n",
+    "- **Goal**: Learn how to generate structured outputs with `OpenAIChatGenerator` using a Pydantic model or a JSON schema. Optionally, use a custom `OutputValidator` to confirm the output matches the provided Pydantic model.\n",
     "\n",
     "## Overview\n",
-    "This tutorial demonstrates how to use Haystack's advanced [looping pipelines](https://docs.haystack.deepset.ai/docs/pipelines#loops) with LLMs for more dynamic and flexible data processing. You'll learn how to extract structured data from unstructured data using an LLM, and to validate the generated output against a predefined schema.\n",
+    "This tutorial shows how to produce structured outputs by providing either a [Pydantic](https://github.com/pydantic/pydantic) model or a JSON schema to `OpenAIChatGenerator`. We’ll use `OutputValidator` to verify the generated output against the schema.\n",
     "\n",
-    "This tutorial uses `gpt-4o-mini` to change unstructured passages into JSON outputs that follow the [Pydantic](https://github.com/pydantic/pydantic) schema. It uses a custom OutputValidator component to validate the JSON and loop back to make corrections, if necessary."
+    "Note: Structured outputs are supported only by newer OpenAI models, starting with `gpt-4o-mini` and `gpt-4o-2024-08-06`.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "kcc1AlLQd_jI",
-    "outputId": "efc4bbab-a9fe-46ee-d8af-9d86edacaf04"
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
-    "pip install haystack-ai\n",
+    "pip install \"haystack-ai>=2.18.0\"\n",
    "pip install colorama"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Cmjfa8CiCeFl"
   },
   "source": [
-    "## Defining a Schema to Parse the JSON Object\n",
+    "## Structured Output Using a Pydantic Model\n",
     "\n",
-    "Define a simple JSON schema for the data you want to extract from a text passsage using the LLM. As the first step, define two [Pydantic models](https://docs.pydantic.dev/1.10/usage/models/), `City` and `CitiesData`, with suitable fields and types."
+    "First, we'll see how to pass a Pydantic model to `OpenAIChatGenerator`. For this purpose, we define two [Pydantic models](https://docs.pydantic.dev/1.10/usage/models/), `City` and `CitiesData`, that specify the fields and types of the data structure we want."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "metadata": {
    "id": "xwKrDOOGdaAz"
   },
    "> You can change these models according to the format you wish to extract from the text."
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "ouk1mAOUCeFl"
-   },
-   "source": [
-    "Then, generate a JSON schema from Pydantic models using `schema_json()`. You will later on use this schema in the prompt to instruct the LLM.\n",
-    "\n",
-    "To learn more about the JSON schemas, visit [Pydantic Schema](https://docs.pydantic.dev/1.10/usage/schema/). "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "id": "8Lg9_72jCeFl"
-   },
-   "outputs": [],
-   "source": [
-    "json_schema = CitiesData.schema_json(indent=2)"
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {
   },
   "source": [
    "## Creating a Custom Component: OutputValidator\n",
    "\n",
-    "`OutputValidator` is a custom component that validates if the JSON object the LLM generates complies with the provided [Pydantic model](https://docs.pydantic.dev/1.10/usage/models/). If it doesn't, OutputValidator returns an error message along with the incorrect JSON object to get it fixed in the next loop.\n",
+    "`OutputValidator` is a custom component that validates if the JSON object the LLM generates complies with the provided [Pydantic model](https://docs.pydantic.dev/1.10/usage/models/). If it doesn't, OutputValidator returns an error message along with the invalid reply.\n",
     "\n",
-    "For more details about custom components, see [Creating Custom Components](https://docs.haystack.deepset.ai/docs/custom-components)."
+    "For more details about custom components, see [Creating Custom Components](https://docs.haystack.deepset.ai/docs/custom-components).\n",
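+    "\n",
+    "Under the hood, the validation step is just Pydantic's `model_validate`, which raises a `ValidationError` whenever the data doesn't match the model. A quick standalone sketch using the `CitiesData` model from above:\n",
+    "\n",
+    "```python\n",
+    "from pydantic import ValidationError\n",
+    "\n",
+    "# A dict that matches the schema validates silently:\n",
+    "CitiesData.model_validate({\"cities\": [{\"name\": \"Berlin\", \"country\": \"Germany\", \"population\": 3850809}]})\n",
+    "\n",
+    "# A dict with missing fields raises a ValidationError:\n",
+    "try:\n",
+    "    CitiesData.model_validate({\"cities\": [{\"name\": \"Berlin\"}]})\n",
+    "except ValidationError as err:\n",
+    "    print(err)\n",
+    "```\n",
+    "\n",
+    "The component below wraps exactly this check and routes valid and invalid replies to separate outputs."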
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {
    "id": "yr6D8RN2d7Vy"
   },
   "outputs": [],
   "source": [
    "import json\n",
-    "import random\n",
    "import pydantic\n",
    "from pydantic import ValidationError\n",
    "from typing import Optional, List\n",
    "\n",
    "    @component.output_types(valid_replies=List[str], invalid_replies=Optional[List[str]], error_message=Optional[str])\n",
    "    def run(self, replies: List[ChatMessage]):\n",
    "\n",
-    "        self.iteration_counter += 1\n",
-    "\n",
    "        ## Try to parse the LLM's reply ##\n",
    "        # If the LLM's reply is a valid object, return `\"valid_replies\"`\n",
    "        try:\n",
    "            output_dict = json.loads(replies[0].text)\n",
-    "            self.pydantic_model.parse_obj(output_dict)\n",
+    "            self.pydantic_model.model_validate(output_dict)\n",
    "            print(\n",
    "                Fore.GREEN\n",
-    "                + f\"OutputValidator at Iteration {self.iteration_counter}: Valid JSON from LLM - No need for looping: {replies[0]}\"\n",
+    "                + \"Valid JSON from LLM\"\n",
    "            )\n",
    "            return {\"valid_replies\": replies}\n",
    "\n",
-    "        # If the LLM's reply is corrupted or not valid, return \"invalid_replies\" and the \"error_message\" for LLM to try again\n",
+    "        # If the LLM's reply is corrupted or not valid, return \"invalid_replies\" and the \"error_message\"\n",
    "        except (ValueError, ValidationError) as e:\n",
    "            print(\n",
    "                Fore.RED\n",
-    "                + f\"OutputValidator at Iteration {self.iteration_counter}: Invalid JSON from LLM - Let's try again.\\n\"\n",
-    "                f\"Output from LLM:\\n {replies[0]} \\n\"\n",
+    "                + f\"Output from LLM:\\n {replies[0]} \\n\"\n",
    "                f\"Error from OutputValidator: {e}\"\n",
    "            )\n",
    "            return {\"invalid_replies\": replies, \"error_message\": str(e)}"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
   "metadata": {
    "id": "bhPCLCBCCeFm"
   },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "xcIWKjW4k42r"
   },
   "source": [
-    "## Creating the Prompt\n",
-    "\n",
-    "Write instructions for the LLM for converting a passage into a JSON format. Ensure the instructions explain how to identify and correct errors if the JSON doesn't match the required schema. Once you create the prompt, initialize PromptBuilder to use it. \n",
+    "## Creating the Prompt for the LLM\n",
     "\n",
+    "Use `ChatPromptBuilder` in the pipeline to pass the user’s message to `OpenAIChatGenerator`.\n",
     "For information about Jinja2 template and ChatPromptBuilder, see [ChatPromptBuilder](https://docs.haystack.deepset.ai/docs/chatpromptbuilder)."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
   "metadata": {
    "id": "ohPpNALjdVKt"
   },
   "outputs": [],
   "source": [
    "from haystack.components.builders import ChatPromptBuilder\n",
    "\n",
-    "\n",
-    "prompt_template = [\n",
-    "    ChatMessage.from_user(\n",
-    "        \"\"\"\n",
-    "Create a JSON object from the information present in this passage: {{passage}}.\n",
-    "Only use information that is present in the passage. Follow this JSON schema, but only return the actual instances without any additional schema definition:\n",
-    "{{schema}}\n",
-    "Make sure your response is a dict and not a list.\n",
-    "{% if invalid_replies and error_message %}\n",
-    "  You already created the following output in a previous attempt: {{invalid_replies}}\n",
-    "  However, this doesn't comply with the format requirements from above and triggered this Python exception: {{error_message}}\n",
-    "  Correct the output and try again. Just return the corrected output without any extra explanations.\n",
-    "{% endif %}\n",
-    "\"\"\"\n",
-    "    )\n",
-    "]\n",
-    "prompt_builder = ChatPromptBuilder(template=prompt_template)"
+    "prompt_template = [ChatMessage.from_user(\"User Input: {{passage}}\")]\n",
+    "prompt_builder = ChatPromptBuilder(template=prompt_template, required_variables=[\"passage\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KM9-Zq2FL7Nn"
   },
   "source": [
-    "## Initalizing the ChatGenerator\n",
+    "## Initializing the ChatGenerator to Produce Structured Outputs\n",
     "\n",
     "[OpenAIChatGenerator](https://docs.haystack.deepset.ai/docs/openaichatgenerator) generates\n",
-    "text using OpenAI's `gpt-4o-mini` model by default. Set the `OPENAI_API_KEY` variable and provide a model name to the ChatGenerator."
+    "text using OpenAI's `gpt-4o-mini` model by default. We pass our Pydantic model to the `response_format` parameter in `generation_kwargs`.\n",
+    "\n",
+    "We also need to set the `OPENAI_API_KEY` environment variable.\n",
+    "\n",
+    "Note: You can also set the `response_format` parameter in the `run` method of the chat generator."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
   "metadata": {
    "id": "Z4cQteIgunUR"
   },
   "outputs": [],
   "source": [
    "\n",
    "if \"OPENAI_API_KEY\" not in os.environ:\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n",
-    "chat_generator = OpenAIChatGenerator()"
+    "chat_generator = OpenAIChatGenerator(generation_kwargs={\"response_format\": CitiesData})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
   },
   "source": [
    "## Building the Pipeline\n",
    "\n",
-    "Add all components to your pipeline and connect them. Add connections from `output_validator` back to the `prompt_builder` for cases where the produced JSON doesn't comply with the JSON schema. Set `max_runs_per_component` to avoid infinite looping."
+    "Add all components to your pipeline and connect them. Connecting two components means the output of one is passed on as the input of the next. As a rough sketch, the pipeline automates this manual chain:\n",
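+    "\n",
+    "```python\n",
+    "# A rough manual equivalent of the pipeline below, using the components\n",
+    "# defined earlier in this notebook:\n",
+    "messages = prompt_builder.run(passage=\"Berlin is the capital of Germany.\")[\"prompt\"]\n",
+    "replies = chat_generator.run(messages=messages)[\"replies\"]\n",
+    "checked = output_validator.run(replies=replies)\n",
+    "```\n",
+    "\n",
+    "With a `Pipeline`, you declare these connections once and Haystack moves the data between components for you."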
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
   "metadata": {
    "id": "eFglN9YEv-1W"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "<haystack.core.pipeline.pipeline.Pipeline object at 0x...>\n",
+       "<haystack.core.pipeline.pipeline.Pipeline object at 0x...>\n",
        "🚅 Components\n",
        "  - prompt_builder: ChatPromptBuilder\n",
        "  - llm: OpenAIChatGenerator\n",
        "  - output_validator: OutputValidator\n",
        "🛤️ Connections\n",
-       "  - prompt_builder.prompt -> llm.messages (List[ChatMessage])\n",
-       "  - llm.replies -> output_validator.replies (List[ChatMessage])\n",
-       "  - output_validator.invalid_replies -> prompt_builder.invalid_replies (Optional[List[str]])\n",
-       "  - output_validator.error_message -> prompt_builder.error_message (Optional[str])"
+       "  - prompt_builder.prompt -> llm.messages (list[ChatMessage])\n",
+       "  - llm.replies -> output_validator.replies (list[ChatMessage])"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "# Now, connect the components to each other\n",
    "pipeline.connect(\"prompt_builder.prompt\", \"llm.messages\")\n",
-    "pipeline.connect(\"llm.replies\", \"output_validator\")\n",
-    "# If a component has more than one output or input, explicitly specify the connections:\n",
-    "pipeline.connect(\"output_validator.invalid_replies\", \"prompt_builder.invalid_replies\")\n",
-    "pipeline.connect(\"output_validator.error_message\", \"prompt_builder.error_message\")"
+    "pipeline.connect(\"llm.replies\", \"output_validator\")\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
   "metadata": {
    "id": "RZJg6YHId300"
   },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "yIoMedb6eKia",
    "outputId": "4a9ef924-cf26-4908-d83f-b0bc0dc03b54"
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[32mValid JSON from LLM\n"
+     ]
+    }
+   ],
   "source": [
    "passage = \"Berlin is the capital of Germany. It has a population of 3,850,809. Paris, France's capital, has 2.161 million residents. Lisbon is the capital and the largest city of Portugal with the population of 504,718.\"\n",
-    "result = pipeline.run({\"prompt_builder\": {\"passage\": passage, \"schema\": json_schema}})"
+    "result = pipeline.run({\"prompt_builder\": {\"passage\": passage}})"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "WWxmPgADS_Fa"
-   },
-   "source": [
-    "> If you encounter `PipelineMaxLoops: Maximum loops count (5) exceeded for component 'prompt_builder'.` error, consider increasing the maximum loop count or simply rerun the pipeline."
-   ]
-  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "print(valid_json)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Structured Output Using a JSON Schema\n",
+    "\n",
+    "Now, we’ll create a JSON schema for the `CitiesData` model and pass it to `OpenAIChatGenerator`. OpenAI expects schemas in a specific format, so the schema generated with `model_json_schema()` cannot be used directly.\n",
+    "\n",
+    "For details on how to create schemas for OpenAI, see the [OpenAI Structured Outputs guide](https://platform.openai.com/docs/guides/structured-outputs#supported-schemas).\n",
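+    "\n",
+    "If you'd rather not write this wrapper by hand, you can also post-process the Pydantic schema. The helper below is only a sketch of that idea (our own illustration, not a Haystack or OpenAI utility): it adds the `\"additionalProperties\": false` that strict mode requires to every object and wraps the schema in the expected envelope. It assumes every model field is required, as is the case for `City` and `CitiesData`.\n",
+    "\n",
+    "```python\n",
+    "def to_openai_response_format(model):\n",
+    "    def strictify(node):\n",
+    "        # OpenAI's strict mode requires \"additionalProperties\": false on every object\n",
+    "        if isinstance(node, dict):\n",
+    "            if node.get(\"type\") == \"object\":\n",
+    "                node[\"additionalProperties\"] = False\n",
+    "            for value in node.values():\n",
+    "                strictify(value)\n",
+    "        elif isinstance(node, list):\n",
+    "            for item in node:\n",
+    "                strictify(item)\n",
+    "\n",
+    "    schema = model.model_json_schema()\n",
+    "    strictify(schema)\n",
+    "    return {\n",
+    "        \"type\": \"json_schema\",\n",
+    "        \"json_schema\": {\"name\": model.__name__, \"schema\": schema, \"strict\": True},\n",
+    "    }\n",
+    "```\n",
+    "\n",
+    "In this tutorial, we instead write the schema out by hand so the required format is easy to see:"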
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "cities_data_schema={\n", + " \"type\": \"json_schema\",\n", + " \"json_schema\": {\n", + " \"name\": \"CitiesData\",\n", + " \"schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"cities\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"name\": { \"type\": \"string\" },\n", + " \"country\": { \"type\": \"string\" },\n", + " \"population\": { \"type\": \"integer\" }\n", + " },\n", + " \"required\": [\"name\", \"country\", \"population\"],\n", + " \"additionalProperties\": False\n", + " }\n", + " }\n", + " },\n", + " \"required\": [\"cities\"],\n", + " \"additionalProperties\": False\n", + " },\n", + " \"strict\": True\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pass this json schema to the `response_format` parameter in chat generator. We run the generator indivdually to see the output." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"cities\":[{\"name\":\"Berlin\",\"country\":\"Germany\",\"population\":3850809},{\"name\":\"Paris\",\"country\":\"France\",\"population\":2161000},{\"name\":\"Lisbon\",\"country\":\"Portugal\",\"population\":504718}]}\n" + ] + } + ], + "source": [ + "chat_generator = OpenAIChatGenerator(generation_kwargs={\"response_format\": cities_data_schema})\n", + "\n", + "messages = [ChatMessage.from_user(\"Berlin is the capital of Germany. It has a population of 3,850,809. Paris, France's capital, has 2.161 million residents. Lisbon is the capital and the largest city of Portugal with the population of 504,718.\")]\n", + "\n", + "result = chat_generator.run(messages=messages)\n", + "\n", + "print(result[\"replies\"][0].text)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -474,7 +508,7 @@ "source": [ "## What's next\n", "\n", - "🎉 Congratulations! You've built a system that generates structured JSON out of unstructured text passages, and auto-corrects it by using the looping functionality of Haystack pipelines.\n", + "🎉 Congratulations! You've learnt how to easily produce structured ouputs with `OpenAIChatGenerator` using Pydantic models and Json schema.\n", "\n", "To stay up to date on the latest Haystack developments, you can [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates) and [join Haystack discord community](https://discord.gg/haystack).\n", "\n", @@ -489,7 +523,7 @@ "provenance": [] }, "kernelspec": { - "display_name": ".venv", + "display_name": "python312", "language": "python", "name": "python3" }, @@ -503,7 +537,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.12.11" } }, "nbformat": 4,