From 8abe25970aa1b483676dde17b7972359c8c55475 Mon Sep 17 00:00:00 2001 From: Ilya Nikokoshev Date: Tue, 15 Oct 2024 17:24:41 +0300 Subject: [PATCH] [Auto Import] Fix cases where LLM generates incorrect array field access (#196207) ## Release Note Fixes cases where LLM was likely to generate invalid processors containing array access in Automatic Import. ## Context Previously, it happened from time to time that the LLM attempts to add related fields or apply categorization conditions that use a field, path to which goes through an array. The problem is that such an access is invalid and leads to an immediate error (key part highlighted): Even including explicit instructions to avoid brackets or an array access did not seem enough, as the LLM would try to use a different syntax, owing to the aggressiveness of our review instructions. The suggested solution is to remove all arrays from the information shown to the LLM in the related chain. This guarantees that no illegal access will ever be attempted. ### Summary - Introduces a utility function to remove all arrays from a JSON object. - Applies this function for all LLM calls in the related chain. - Modifies the prompts of related and categorization chain to skip the arrays as well. --------- Co-authored-by: Bharat Pasupula <123897612+bhapas@users.noreply.github.com> --- .../server/graphs/categorization/prompts.ts | 3 + .../server/graphs/related/prompts.ts | 6 +- .../server/graphs/related/related.ts | 3 +- .../server/graphs/related/review.ts | 3 +- .../server/graphs/related/util.test.ts | 135 ++++++++++++++++++ .../server/graphs/related/util.ts | 37 +++++ 6 files changed, 183 insertions(+), 4 deletions(-) create mode 100644 x-pack/plugins/integration_assistant/server/graphs/related/util.test.ts create mode 100644 x-pack/plugins/integration_assistant/server/graphs/related/util.ts diff --git a/x-pack/plugins/integration_assistant/server/graphs/categorization/prompts.ts b/x-pack/plugins/integration_assistant/server/graphs/categorization/prompts.ts index 2f90e426dc552f..baf9d5d5b3adad 100644 --- a/x-pack/plugins/integration_assistant/server/graphs/categorization/prompts.ts +++ b/x-pack/plugins/integration_assistant/server/graphs/categorization/prompts.ts @@ -53,6 +53,7 @@ You ALWAYS follow these guidelines when writing your response: - You can add as many processor objects as you need to cover all the unique combinations that was detected. - If conditions should always use a ? character when accessing nested fields, in case the field might not always be available, see example processors above. +- You can access nested dictionaries with the ctx.field?.another_field syntax, but it's not possible to access elements of an array. Never use brackets in an if statement. - When an if condition is not needed the argument it should not be included in that specific object of your response. - When using a range based if condition like > 0, you first need to check that the field is not null, for example: ctx.somefield?.production != null && ctx.somefield?.production > 0 - If no good match is found for any of the pipeline results, then respond with an empty array [] as valid JSON enclosed with 3 backticks (\`). @@ -110,6 +111,7 @@ You ALWAYS follow these guidelines when writing your response: - You can use as many processor objects as you need to add all relevant ECS categories and types combinations. - If conditions should always use a ? character when accessing nested fields, in case the field might not always be available, see example processors above. +- You can access nested dictionaries with the ctx.field?.another_field syntax, but it's not possible to access elements of an array. Never use brackets in an if statement. - When an if condition is not needed the argument should not be used for the processor object. - If updates are needed you respond with the initially provided array of processors. - If an update removes the last remaining processor object you respond with an empty array [] as valid JSON enclosed with 3 backticks (\`). @@ -159,6 +161,7 @@ You ALWAYS follow these guidelines when writing your response: - If the error complains about having event.type or event.category not in the allowed values , fix the corresponding append processors to use the allowed values mentioned in the error. - If the error is about event.type not compatible with any event.category, please refer to the 'compatible_types' in the context to fix the corresponding append processors to use valid combination of event.type and event.category - If resolving the validation removes the last remaining processor object, respond with an empty array [] as valid JSON enclosed with 3 backticks (\`). +- Reminder: you can access nested dictionaries with the ctx.field?.another_field syntax, but it's not possible to access elements of an array. Never use brackets in an if statement. - Do not respond with anything except the complete updated array of processors as a valid JSON object enclosed with 3 backticks (\`), see example response below. diff --git a/x-pack/plugins/integration_assistant/server/graphs/related/prompts.ts b/x-pack/plugins/integration_assistant/server/graphs/related/prompts.ts index 9fa50d5900806f..9b76ddb96ba8db 100644 --- a/x-pack/plugins/integration_assistant/server/graphs/related/prompts.ts +++ b/x-pack/plugins/integration_assistant/server/graphs/related/prompts.ts @@ -27,7 +27,7 @@ Here are some context for you to reference for your task, read it carefully as y For each pipeline result you find matching values that would fit any of the related fields perform the follow steps: 1. Identify which related field the value would fit in. -2. Create a new processor object with the field value set to the correct related.field, and the value_field set to the full path of the field that contains the value which we want to append. +2. Create a new processor object with the field value set to the correct related.field, and the value_field set to the full path of the field that contains the value which we want to append, if that path can be encoded as a string of dict key accesses. 3. Always check if the related.ip, related.hash, related.user and related.host fields are common in the ecs context above. 4. The value_field argument in your response consist of only one value. @@ -35,6 +35,7 @@ You ALWAYS follow these guidelines when writing your response: - The \`message\` field may not be part of related fields. - You can use as many processor objects as needed to map all relevant pipeline result fields to any of the ECS related fields. +- You can access nested dictionaries with the field.another_field syntax, but it's not possible to access elements of an array; skip them instead. - If no relevant fields or values are found that could be mapped confidently to any of the related fields, then respond with an empty array [] as valid JSON enclosed with 3 backticks (\`). - Do not respond with anything except the array of processors as a valid JSON objects enclosed with 3 backticks (\`), see example response below. @@ -82,6 +83,7 @@ You ALWAYS follow these guidelines when writing your response: - The \`message\` field may not be part of related fields. - Never use "split" in template values, only use the field name inside the triple brackets. If the error mentions "Improperly closed variable in query-template" then check each "value" field for any special characters and remove them. +- You can access nested dictionaries with the field.another_field syntax, but it's not possible to access elements of an array. Never use brackets in the field name, never try to access array elements. - If solving an error means removing the last processor in the list, then return an empty array [] as valid JSON enclosed with 3 backticks (\`). - Do not respond with anything except the complete updated array of processors as a valid JSON object enclosed with 3 backticks (\`), see example response below. @@ -123,7 +125,7 @@ Please review the pipeline results and the array of current processors above, an For each pipeline result you find matching values that would fit any of the related fields perform the follow steps: 1. Identify which related field the value would fit in. -2. Create a new processor object with the field value set to the correct related.field, and the value_field set to the full path of the field that contains the value which we want to append. +2. Create a new processor object with the field value set to the correct related.field, and the value_field set to the full path of the field that contains the value which we want to append. You can access fields inside nested dictionaries with the field.another_field syntax, but it's not possible to access elements of an array, so skip a field if it's path contains an array. 3. If previous errors above is not empty, do not add any processors that would cause any of the same errors again, if you are unsure, then remove the processor from the list. 4. If no updates are needed, then respond with the initially provided current processors. diff --git a/x-pack/plugins/integration_assistant/server/graphs/related/related.ts b/x-pack/plugins/integration_assistant/server/graphs/related/related.ts index 902427a1c484fa..4298fb1ab24fac 100644 --- a/x-pack/plugins/integration_assistant/server/graphs/related/related.ts +++ b/x-pack/plugins/integration_assistant/server/graphs/related/related.ts @@ -11,6 +11,7 @@ import type { RelatedState, SimplifiedProcessor, SimplifiedProcessors } from '.. import { combineProcessors } from '../../util/processors'; import { RELATED_MAIN_PROMPT } from './prompts'; import type { RelatedNodeParams } from './types'; +import { deepCopySkipArrays } from './util'; export async function handleRelated({ state, @@ -21,7 +22,7 @@ export async function handleRelated({ const relatedMainGraph = relatedMainPrompt.pipe(model).pipe(outputParser); const currentProcessors = (await relatedMainGraph.invoke({ - pipeline_results: JSON.stringify(state.pipelineResults, null, 2), + pipeline_results: JSON.stringify(state.pipelineResults.map(deepCopySkipArrays), null, 2), ex_answer: state.exAnswer, ecs: state.ecs, })) as SimplifiedProcessor[]; diff --git a/x-pack/plugins/integration_assistant/server/graphs/related/review.ts b/x-pack/plugins/integration_assistant/server/graphs/related/review.ts index 300f33144b52ac..37c0008304958c 100644 --- a/x-pack/plugins/integration_assistant/server/graphs/related/review.ts +++ b/x-pack/plugins/integration_assistant/server/graphs/related/review.ts @@ -11,6 +11,7 @@ import type { RelatedState, SimplifiedProcessors, SimplifiedProcessor } from '.. import type { RelatedNodeParams } from './types'; import { combineProcessors } from '../../util/processors'; import { RELATED_REVIEW_PROMPT } from './prompts'; +import { deepCopySkipArrays } from './util'; export async function handleReview({ state, @@ -24,7 +25,7 @@ export async function handleReview({ current_processors: JSON.stringify(state.currentProcessors, null, 2), ex_answer: state.exAnswer, previous_error: state.previousError, - pipeline_results: JSON.stringify(state.pipelineResults, null, 2), + pipeline_results: JSON.stringify(state.pipelineResults.map(deepCopySkipArrays), null, 2), })) as SimplifiedProcessor[]; const processors = { diff --git a/x-pack/plugins/integration_assistant/server/graphs/related/util.test.ts b/x-pack/plugins/integration_assistant/server/graphs/related/util.test.ts new file mode 100644 index 00000000000000..c81369f98e56d4 --- /dev/null +++ b/x-pack/plugins/integration_assistant/server/graphs/related/util.test.ts @@ -0,0 +1,135 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +import { deepCopySkipArrays } from './util'; + +describe('deepCopySkipArrays', () => { + it('should skip arrays and deeply copy objects', () => { + const input = { + field: ['a', 'b'], + another: { field: 'c' }, + }; + + const expectedOutput = { + another: { field: 'c' }, + }; + + expect(deepCopySkipArrays(input)).toEqual(expectedOutput); + }); + + it('should return primitive types as is', () => { + expect(deepCopySkipArrays(42)).toBe(42); + expect(deepCopySkipArrays('string')).toBe('string'); + expect(deepCopySkipArrays(true)).toBe(true); + }); + + it('should handle nested objects and skip nested arrays', () => { + const input = { + level1: { + level2: { + array: [1, 2, 3], + value: 'test', + }, + }, + }; + + const expectedOutput = { + level1: { + level2: { + value: 'test', + }, + }, + }; + + expect(deepCopySkipArrays(input)).toEqual(expectedOutput); + }); + + it('should return undefined for arrays', () => { + expect(deepCopySkipArrays([1, 2, 3])).toBeUndefined(); + }); + + it('should handle null and undefined values', () => { + expect(deepCopySkipArrays(null)).toBeNull(); + expect(deepCopySkipArrays(undefined)).toBeUndefined(); + }); + + it('should handle empty objects', () => { + expect(deepCopySkipArrays({})).toEqual({}); + }); + + it('should handle objects with mixed types', () => { + const input = { + number: 1, + string: 'test', + boolean: true, + object: { key: 'value' }, + array: [1, 2, 3], + }; + + const expectedOutput = { + number: 1, + string: 'test', + boolean: true, + object: { key: 'value' }, + }; + + expect(deepCopySkipArrays(input)).toEqual(expectedOutput); + }); + + // Test case + it('should skip arrays and deeply copy objects with nested arrays', () => { + const input = { + field: ['a', 'b'], + another: { field: 'c' }, + }; + + const expectedOutput = { + another: { field: 'c' }, + }; + + expect(deepCopySkipArrays(input)).toEqual(expectedOutput); + }); + + it('should handle objects with nested empty arrays', () => { + const input = { + field: [], + another: { field: 'c' }, + }; + + const expectedOutput = { + another: { field: 'c' }, + }; + + expect(deepCopySkipArrays(input)).toEqual(expectedOutput); + }); + + it('should handle objects with nested arrays containing objects', () => { + const input = { + field: [{ key: 'value' }], + another: { field: 'c' }, + }; + + const expectedOutput = { + another: { field: 'c' }, + }; + + expect(deepCopySkipArrays(input)).toEqual(expectedOutput); + }); + + it('should handle objects with nested arrays containing mixed types', () => { + const input = { + field: [1, 'string', true, { key: 'value' }], + another: { field: 'c' }, + }; + + const expectedOutput = { + another: { field: 'c' }, + }; + + expect(deepCopySkipArrays(input)).toEqual(expectedOutput); + }); +}); diff --git a/x-pack/plugins/integration_assistant/server/graphs/related/util.ts b/x-pack/plugins/integration_assistant/server/graphs/related/util.ts new file mode 100644 index 00000000000000..b939e939fed327 --- /dev/null +++ b/x-pack/plugins/integration_assistant/server/graphs/related/util.ts @@ -0,0 +1,37 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +/** + * Deeply copies a JSON object, skipping all arrays. + * + * @param value - The JSON value to be deeply copied, which can be an array, object, or other types. + * @returns A new object that is a deep copy of the input value, but with arrays skipped. + * + * This function recursively traverses the provided value. If the value is an array, it skips it. + * If the value is a regular object, it continues traversing its properties and copying them. + */ +export function deepCopySkipArrays(value: unknown): unknown { + if (Array.isArray(value)) { + // Skip arrays + return undefined; + } + + if (typeof value === 'object' && value !== null) { + // Regular dictionary, continue traversing. + const result: Record = {}; + for (const [k, v] of Object.entries(value)) { + const copiedValue = deepCopySkipArrays(v); + if (copiedValue !== undefined) { + result[k] = copiedValue; + } + } + return result; + } + + // For primitive types, return the value as is. + return value; +}