
Commit

feat(ai-model): support JSON 100% limit capability for gpt-4o-2024-08-06 model (#86)
zhoushaw authored Sep 5, 2024
1 parent cfa92b3 commit c5077a2
Showing 13 changed files with 240 additions and 25 deletions.
2 changes: 1 addition & 1 deletion packages/midscene/package.json
@@ -59,7 +59,7 @@
},
"dependencies": {
"node-fetch": "2.6.7",
"openai": "4.47.1",
"openai": "4.57.1",
"optional": "0.1.4",
"@midscene/shared": "workspace:*"
},
50 changes: 50 additions & 0 deletions packages/midscene/src/ai-model/automation/planning.ts
@@ -1,3 +1,5 @@
import type { ResponseFormatJSONSchema } from 'openai/resources';

export function systemPromptToTaskPlanning() {
return `
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
@@ -51,3 +53,51 @@ export function systemPromptToTaskPlanning() {
}
`;
}

export const planSchema: ResponseFormatJSONSchema = {
type: 'json_schema',
json_schema: {
name: 'action_items',
strict: true,
schema: {
type: 'object',
properties: {
queryLanguage: {
type: 'string',
description: 'Language of the description of the task',
},
actions: {
type: 'array',
items: {
type: 'object',
properties: {
thought: {
type: 'string',
description:
'Reasons for generating this task, and why this task is feasible on this page',
},
type: {
type: 'string',
description: 'Type of action, like "Tap", "Hover", etc.',
},
param: {
type: ['object', 'null'],
description: 'Parameter towards the task type, can be null',
},
},
required: ['thought', 'type', 'param'],
additionalProperties: false,
},
description: 'List of actions to be performed',
},
error: {
type: ['string', 'null'],
description:
        'Overall error messages. If any error occurs during task planning, summarize the errors and put the messages here',
},
},
required: ['queryLanguage', 'actions', 'error'],
additionalProperties: false,
},
},
};
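
For context on how a strict json_schema response format such as planSchema is consumed, here is a minimal sketch against the OpenAI Node SDK (4.57.x). The model name, prompts, and parsing below are illustrative assumptions rather than code from this commit, and the import paths assume the sketch sits next to the ai-model sources:

import OpenAI from 'openai';
import { planSchema } from './automation/planning';

// Minimal sketch: with strict structured outputs, gpt-4o-2024-08-06 is guaranteed
// to return JSON that conforms to the schema, so the parsed object always has the
// queryLanguage / actions / error fields declared above.
async function demoPlanCall() {
  const openai = new OpenAI(); // assumes OPENAI_API_KEY is set in the environment
  const completion = await openai.chat.completions.create({
    model: 'gpt-4o-2024-08-06',
    messages: [
      { role: 'system', content: 'Plan UI automation actions for the given page.' }, // placeholder prompt
      { role: 'user', content: 'Tap the "Sign in" button' },                          // placeholder instruction
    ],
    response_format: planSchema,
  });
  return JSON.parse(completion.choices[0].message.content ?? '{}');
}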
3 changes: 2 additions & 1 deletion packages/midscene/src/ai-model/common.ts
@@ -1,3 +1,4 @@
import type OpenAI from 'openai';
import type {
ChatCompletionSystemMessageParam,
ChatCompletionUserMessageParam,
Expand Down Expand Up @@ -32,7 +33,7 @@ export async function callAiFn<T>(options: {
}) {
const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
if (useOpenAIModel(useModel)) {
-    const parseResult = await callToGetJSONObject<T>(msgs);
+    const parseResult = await callToGetJSONObject<T>(msgs, AIActionTypeValue);
return parseResult;
}

39 changes: 35 additions & 4 deletions packages/midscene/src/ai-model/openai/index.ts
@@ -3,6 +3,10 @@ import { AIResponseFormat } from '@/types';
import { wrapOpenAI } from 'langsmith/wrappers';
import OpenAI, { type ClientOptions } from 'openai';
import type { ChatCompletionMessageParam } from 'openai/resources';
import { planSchema } from '../automation/planning';
import { AIActionType } from '../common';
import { findElementSchema } from '../prompt/element_inspector';
import { assertSchema } from '../prompt/util';

export const MIDSCENE_OPENAI_INIT_CONFIG_JSON =
'MIDSCENE_OPENAI_INIT_CONFIG_JSON';
@@ -48,7 +52,9 @@ async function createOpenAI() {

export async function call(
messages: ChatCompletionMessageParam[],
-  responseFormat?: AIResponseFormat,
+  responseFormat?:
+    | OpenAI.ChatCompletionCreateParams['response_format']
+    | OpenAI.ResponseFormatJSONObject,
): Promise<string> {
const openai = await createOpenAI();

@@ -58,21 +64,46 @@
const completion = await openai.chat.completions.create({
model,
messages,
-    response_format: { type: responseFormat },
+    response_format: responseFormat,
temperature: 0.2,
});
shouldPrintTiming && console.timeEnd('Midscene - AI call');
shouldPrintTiming && console.log('Midscene - AI usage', completion.usage);

const { content } = completion.choices[0].message;
assert(content, 'empty content');
return content;
}

export async function callToGetJSONObject<T>(
messages: ChatCompletionMessageParam[],
AIActionTypeValue: AIActionType,
): Promise<T> {
-  const response = await call(messages, AIResponseFormat.JSON);
  // gpt-4o-2024-05-13 only supports the json_object response format
let responseFormat:
| OpenAI.ChatCompletionCreateParams['response_format']
| OpenAI.ResponseFormatJSONObject = {
type: AIResponseFormat.JSON,
};

if (model === 'gpt-4o-2024-08-06') {
switch (AIActionTypeValue) {
case AIActionType.ASSERT:
responseFormat = assertSchema;
break;
case AIActionType.INSPECT_ELEMENT:
responseFormat = findElementSchema;
break;
case AIActionType.EXTRACT_DATA:
        // TODO: strict json_schema output only supports a subset of JSON Schema, and the way the
        // extract API is used needs to be adjusted to constrain the user's data the same way.
        // responseFormat = extractDataSchema;
break;
case AIActionType.PLAN:
responseFormat = planSchema;
break;
}
}

const response = await call(messages, responseFormat);
assert(response, 'empty response');
return JSON.parse(response);
}
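
A rough usage sketch of the updated callToGetJSONObject signature follows; the import paths assume a caller inside src/ai-model, and the message contents and expected plan shape are assumptions for illustration only:

import type { ChatCompletionMessageParam } from 'openai/resources';
import { systemPromptToTaskPlanning } from './automation/planning';
import { AIActionType } from './common';
import { callToGetJSONObject } from './openai';

// Sketch: the caller now passes an action type so the matching strict schema
// (planSchema for PLAN) is selected when the configured model is gpt-4o-2024-08-06;
// other models keep the plain json_object response format.
async function planDemo() {
  const msgs: ChatCompletionMessageParam[] = [
    { role: 'system', content: systemPromptToTaskPlanning() },
    { role: 'user', content: 'Hover over the search box, then type "midscene"' }, // placeholder instruction
  ];
  // The generic argument is an assumed shape mirroring planSchema.
  return callToGetJSONObject<{ queryLanguage: string; actions: unknown[]; error: string | null }>(
    msgs,
    AIActionType.PLAN,
  );
}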
47 changes: 47 additions & 0 deletions packages/midscene/src/ai-model/prompt/element_inspector.ts
@@ -1,3 +1,5 @@
import type { ResponseFormatJSONSchema } from 'openai/resources';

export function systemPromptToFindElement() {
return `
## Role:
@@ -135,3 +137,48 @@ export function multiDescription(multi: boolean) {
? 'multiple elements matching the description (two or more)'
: 'The element closest to the description (only one)';
}

export const findElementSchema: ResponseFormatJSONSchema = {
type: 'json_schema',
json_schema: {
name: 'find_elements',
strict: true,
schema: {
type: 'object',
properties: {
elements: {
type: 'array',
items: {
type: 'object',
properties: {
reason: {
type: 'string',
description: 'Reason for finding this element',
},
text: {
type: 'string',
description: 'Text content of the element',
},
id: {
type: 'string',
description: 'ID of this element',
},
},
required: ['reason', 'text', 'id'],
additionalProperties: false,
},
description: 'List of found elements',
},
errors: {
type: 'array',
items: {
type: 'string',
},
description: 'List of error messages, if any',
},
},
required: ['elements', 'errors'],
additionalProperties: false,
},
},
};
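
For readability, the response shape that findElementSchema enforces can be written as a TypeScript type; the type name is illustrative and not exported by the package:

// Illustrative mirror of findElementSchema; strict mode guarantees every field is present.
interface FindElementsResponse {
  elements: Array<{
    reason: string; // reason for finding this element
    text: string;   // text content of the element
    id: string;     // id of the matched element
  }>;
  errors: string[]; // empty when the lookup succeeds
}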
55 changes: 55 additions & 0 deletions packages/midscene/src/ai-model/prompt/util.ts
@@ -7,6 +7,7 @@ import type {
UIContext,
UISection,
} from '@/types';
import type { ResponseFormatJSONSchema } from 'openai/resources';

const characteristic =
'You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.';
@@ -123,6 +124,37 @@ Return in the following JSON format:
`;
}

export const extractDataSchema: ResponseFormatJSONSchema = {
type: 'json_schema',
json_schema: {
name: 'extract_data',
strict: true,
schema: {
type: 'object',
properties: {
language: {
type: 'string',
enum: ['en', 'zh'],
description: 'The language of the page',
},
data: {
type: 'object',
description: 'The extracted data from extract_data_from_UI skill',
},
errors: {
type: 'array',
items: {
type: 'string',
},
description: 'Error messages, if any',
},
},
required: ['language', 'data', 'errors'],
additionalProperties: false,
},
},
};
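
Note that extractDataSchema is defined here but is not yet selected in callToGetJSONObject; per the TODO in the EXTRACT_DATA branch, it stays commented out for now. Once wired in, its guaranteed shape would mirror this illustrative type (the name and generic are assumptions, not exported by the package):

// Illustrative mirror of extractDataSchema; the open-ended "data" object is
// what the TODO in callToGetJSONObject refers to.
interface ExtractDataResponse<T = Record<string, unknown>> {
  language: 'en' | 'zh'; // language of the page
  data: T;               // data extracted according to the caller's query
  errors: string[];      // error messages, if any
}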

export function systemPromptToAssert() {
return `
${characteristic}
@@ -138,6 +170,29 @@ Return in the following JSON format:
`;
}

export const assertSchema: ResponseFormatJSONSchema = {
type: 'json_schema',
json_schema: {
name: 'assert',
strict: true,
schema: {
type: 'object',
properties: {
thought: {
type: 'string',
description: 'The thought process behind the assertion',
},
pass: {
type: 'boolean',
description: 'Whether the assertion passed or failed',
},
},
required: ['thought', 'pass'],
additionalProperties: false,
},
},
};
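
Likewise, the assertion result constrained by assertSchema corresponds to a small TypeScript type (illustrative name only):

// Illustrative mirror of assertSchema.
interface AssertResponse {
  thought: string; // reasoning behind the verdict
  pass: boolean;   // whether the asserted condition holds on the page
}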

/*
To modify the response format:
1. update the function `describeSectionResponseFormat` here
@@ -2,6 +2,7 @@ import { readFileSync } from 'node:fs';
import path from 'node:path';
import { AiInspectElement } from '@/ai-model';
import { expect, test } from 'vitest';
import { repeatTime } from '../util';
import {
getPageTestData,
repeat,
@@ -36,8 +37,6 @@
},
];

-const repeatTime = process.env.GITHUB_ACTIONS ? 1 : 5;

repeat(repeatTime, (repeatIndex) => {
test(
'xicha: inspect element',
@@ -619,6 +619,29 @@
"htmlNode": null,
"indexId": "26"
},
{
"id": "3f353e2096",
"nodePath": "0-5-5-1-1-1-5-5-1-3",
"nodeHashId": "3f353e2096",
"nodeType": "TEXT Node",
"locator": "",
"attributes": {
"nodeType": "TEXT Node"
},
"center": [
239,
604
],
"content": ".",
"rect": {
"left": 237,
"top": 597,
"width": 3,
"height": 13
},
"htmlNode": null,
"indexId": "27"
},
{
"id": "7eb7a9b4da",
"nodePath": "0-5-5-1-1-3-3",
@@ -643,6 +666,6 @@
395
],
"htmlNode": null,
"indexId": "27"
"indexId": "28"
}
]
5 changes: 2 additions & 3 deletions packages/midscene/tests/ai/inspector/todo_inspector.test.ts
@@ -3,6 +3,7 @@ import { AiInspectElement } from '@/ai-model';
import { useCozeModel } from '@/ai-model/coze';
import { AiAssert } from '@/ai-model/inspect';
import { expect, it } from 'vitest';
import { repeatTime } from '../util';
import {
getPageTestData,
repeat,
@@ -39,8 +40,6 @@
modelList.push('coze');
}

-const repeatTime = process.env.GITHUB_ACTIONS ? 1 : 2;

modelList.forEach((model) => {
repeat(repeatTime, (repeatIndex) => {
it(
@@ -82,7 +81,7 @@ modelList.forEach((model) => {
);
});

-repeat(2, () => {
+repeat(repeatTime, () => {
it(
`todo: assert ${model}`,
async () => {
2 changes: 2 additions & 0 deletions packages/midscene/tests/ai/util.ts
@@ -15,3 +15,5 @@ export function makePlanResultStable(plans: PlanningAction[]) {
export const modelList: Array<'openAI' | 'coze'> = useCozeModel('coze')
? ['openAI', 'coze']
: ['openAI'];

export const repeatTime = process.env.GITHUB_ACTIONS ? 2 : 6;
2 changes: 1 addition & 1 deletion packages/web-integration/package.json
@@ -74,7 +74,7 @@
},
"files": ["dist", "README.md"],
"dependencies": {
"openai": "4.47.1",
"openai": "4.57.1",
"inquirer": "10.1.5",
"@midscene/core": "workspace:*",
"@midscene/shared": "workspace:*",
2 changes: 1 addition & 1 deletion packages/web-integration/playwright.config.ts
@@ -18,7 +18,7 @@ dotenv.config({
*/
export default defineConfig({
// testDir: './tests/ai/e2e',
-  // testIgnore: 'generate-test-data.spec.ts',
+  testIgnore: 'generate-test-data.spec.ts',
timeout: 900 * 1000,
/* Run tests in files in parallel */
fullyParallel: true,