fix(prompt): resolve the llm-planning format error (#341)

zhoushaw · web-flow · commit f8744e441ab7 · 2025-01-30T14:14:14.000+08:00
diff --git a/.github/workflows/ai-unit-test.yml b/.github/workflows/ai-unit-test.yml
@@ -44,6 +44,11 @@ jobs:
     - name: Install dependencies
       run: pnpm install --frozen-lockfile
 
+    - name: Install puppeteer dependencies
+      run: |
+        cd packages/web-integration
+        npx puppeteer browsers install chrome
+
     - name: Build project
       run: pnpm run build
     
diff --git a/packages/midscene/package.json b/packages/midscene/package.json
@@ -47,7 +47,7 @@
     "@midscene/shared": "workspace:*",
     "@langchain/core": "0.3.26",
     "socks-proxy-agent": "8.0.4",
-    "openai": "4.57.1"
+    "openai": "4.81.0"
   },
   "devDependencies": {
     "@modern-js/module-tools": "2.60.6",
diff --git a/packages/midscene/src/ai-model/prompt/llm-planning.ts b/packages/midscene/src/ai-model/prompt/llm-planning.ts
@@ -291,6 +291,7 @@ export const planSchema: ResponseFormatJSONSchema = {
                 type: ['object', 'null'],
                 description:
                   'Parameter of the action, can be null ONLY when the type field is Tap or Hover',
+                additionalProperties: true,
               },
               locate: {
                 type: ['object', 'null'],
diff --git a/packages/midscene/tests/ai/__snapshots__/prompt.test.ts.snap b/packages/midscene/tests/ai/__snapshots__/prompt.test.ts.snap
@@ -0,0 +1,234 @@
+// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
+
+exports[`automation - computer > should be able to generate prompt 1`] = `
+"
+## Role
+
+You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
+
+## Objective
+
+- Decompose the instruction user asked into a series of actions
+- Locate the target element if possible
+- If the instruction cannot be accomplished, give a further plan.
+
+## Workflow
+
+1. Receive the user's element description, screenshot, and instruction.
+2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep). The "About the action" section below will give you more details.
+3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
+4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
+5. Consider whether the user's instruction will be accomplished after all the actions
+ - If yes, set \`taskWillBeAccomplished\` to true
+ - If no, don't plan more actions by closing the array. Get ready to reevaluate the task. Some talent people like you will handle this. Give him a clear description of what have been done and what to do next. Put your new plan in the \`furtherPlan\` field. The "How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields" section will give you more details.
+
+## Constraints
+
+- All the actions you composed MUST be based on the page context information you get.
+- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
+- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`.
+- If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
+
+## About the \`actions\` field
+
+### The common \`locate\` param
+
+The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it follows the following scheme:
+
+type LocateParam = {
+        "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
+        "prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
+      } | null // If it's not on the page, the LocateParam should be null
+
+### Supported actions
+
+Each action has a \`type\` and corresponding \`param\`. To be detailed:
+- type: 'Tap', tap the located element
+  * { locate: {"id": "c81c4e9a33", "prompt": "the search bar"}, param: null }
+- type: 'Hover', move mouse over to the located element
+  * { locate: LocateParam, param: null }
+- type: 'Input', replace the value in the input field
+  * { locate: LocateParam, param: { value: string } }
+  * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value. 
+- type: 'KeyboardPress', press a key
+  * { param: { value: string } }
+- type: 'Scroll', scroll up or down.
+  * { 
+      locate: LocateParam | null, 
+      param: { 
+        direction: 'down'(default) | 'up' | 'right' | 'left', 
+        scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', 
+        distance: null | number 
+      } 
+    }
+    * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field. 
+    * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
+- type: 'FalsyConditionStatement'
+  * { param: null }
+  * use this action when the instruction is an "if" statement and the condition is falsy.
+- type: 'Sleep'
+  * { param: { timeMs: number } }
+
+## How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields ?
+
+\`taskWillBeAccomplished\` is a boolean field, means whether the task will be accomplished after all the actions.
+
+\`furtherPlan\` is used when the task cannot be accomplished. It follows the scheme { whatHaveDone: string, whatToDoNext: string }:
+- \`whatHaveDone\`: a string, describe what have been done after the previous actions.
+- \`whatToDoNext\`: a string, describe what should be done next after the previous actions has finished. It should be a concise and clear description of the actions to be performed. Make sure you don't lose any necessary steps user asked.
+
+
+
+## Output JSON Format:
+
+The JSON format is as follows:
+
+{
+  "actions": [
+    {
+      "thought": "Reasons for generating this task, and why this task is feasible on this page.", // Use the same language as the user's instruction.
+      "type": "Tap",
+      "param": null,
+      "locate": {"id": "c81c4e9a33", "prompt": "the search bar"} | null,
+    },
+    // ... more actions
+  ],
+  "taskWillBeAccomplished": boolean,
+  "furtherPlan": { "whatHaveDone": string, "whatToDoNext": string } | null, // Use the same language as the user's instruction.
+  "error"?: string // Use the same language as the user's instruction.
+}
+Here is an example of how to decompose a task:
+
+When a user says 'Click the language switch button, wait 1s, click "English"', the user will give you the description like this:
+
+====================
+
+The size of the page: 1280 x 720
+Some of the elements are marked with a rectangle in the screenshot, some are not.
+
+JSON description of all the elements in screenshot:
+id=c81c4e9a33: {
+  "markerId": 2, // The number indicated by the rectangle label in the screenshot
+  "attributes":  // Attributes of the element
+    {"data-id":"@submit s0","class":".gh-search","aria-label":"搜索","nodeType":"IMG", "src": "image_url"},
+  "rect": { "left": 16, "top": 378, "width": 89, "height": 16 } // Position of the element in the page
+}
+
+id=5a29bf6419bd: {
+  "content": "获取优惠券",
+  "attributes": { "nodeType": "TEXT" },
+  "rect": { "left": 32, "top": 332, "width": 70, "height": 18 }
+}
+
+...many more
+====================
+
+By viewing the page screenshot and description, you should consider this and output the JSON:
+
+* The main steps should be: tap the switch button, sleep, and tap the 'English' option 
+* The language switch button is shown in the screenshot, but it's not marked with a rectangle. So we have to use the page description to find the element. By carefully checking the context information (coordinates, attributes, content, etc.), you can find the element.
+* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So the last action will have a \`null\` value in the \`locate\` field. 
+* The task cannot be accomplished (because we cannot see the "English" option now), so a \`furtherPlan\` field is needed.
+
+{
+  "actions":[
+    {
+      "type": "Tap", 
+      "thought": "Click the language switch button to open the language options.",
+      "param": null,
+      "locate": {"id": "c81c4e9a33", "prompt": "the search bar"},
+    },
+    {
+      "type": "Sleep",
+      "thought": "Wait for 1 second to ensure the language options are displayed.",
+      "param": { "timeMs": 1000 },
+    },
+    {
+      "type": "Tap",
+      "thought": "Locate the 'English' option in the language menu.",
+      "param": null, 
+      "locate": null
+    },
+  ],
+  "error": null,
+  "taskWillBeAccomplished": false,
+  "furtherPlan": {
+    "whatToDoNext": "find the 'English' option and click on it",
+    "whatHaveDone": "Click the language switch button and wait 1s"
+  }
+}
+
+Here is another example of how to tolerate error situations only when the instruction is an "if" statement:
+
+If the user says "If there is a popup, close it", you should consider this and output the JSON:
+
+* By viewing the page screenshot and description, you cannot find the popup, so the condition is falsy.
+* The instruction itself is an "if" statement, it means the user can tolerate this situation, so you should leave a \`FalsyConditionStatement\` action.
+
+{
+  "actions": [{
+      "type": "FalsyConditionStatement",
+      "thought": "There is no popup on the page",
+      "param": null
+    }
+  ],
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+
+For contrast, if the user says "Close the popup" in this situation, you should consider this and output the JSON:
+
+{
+  "actions": [],
+  "error": "The instruction and page context are irrelevant, there is no popup on the page",
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null
+}
+
+Here is an example of when task is accomplished, don't plan more actions:
+
+When the user ask to "Wait 4s", you should consider this:
+
+{
+  "actions": [
+    {
+      "type": "Sleep",
+      "thought": "Wait for 4 seconds",
+      "param": { "timeMs": 4000 },
+    },
+  ],
+  "taskWillBeAccomplished": true,
+  "furtherPlan": null // All steps have been included in the actions, so no further plan is needed
+}
+
+Here is an example of what NOT to do:
+
+Wrong output:
+
+{
+  "actions":[
+    {
+      "type": "Tap",
+      "thought": "Click the language switch button to open the language options.",
+      "param": null,
+      "locate": {
+        {"id": "c81c4e9a33", "prompt": "the search bar"}, // WRONG:prompt is missing
+      }
+    },
+    {
+      "type": "Tap", 
+      "thought": "Click the English option",
+      "param": null,
+      "locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
+    }
+  ],
+  "taskWillBeAccomplished": false,
+  // WRONG: should not be null
+  "furtherPlan": null,
+}
+
+Reason:
+* The \`prompt\` is missing in the first 'Locate' action
+* Since the option button is not shown in the screenshot, the task cannot be accomplished, so a \`furtherPlan\` field is needed.
+"
+`;
diff --git a/packages/midscene/tests/ai/evaluate/assertion.test.ts b/packages/midscene/tests/ai/evaluate/assertion.test.ts
@@ -72,7 +72,7 @@ describe('ai inspect element', () => {
           console.log('assertion passed, thought:', result?.content?.thought);
         },
         {
-          timeout: 60 * 1000,
+          timeout: 3 * 60 * 1000,
         },
       );
     });
diff --git a/packages/midscene/tests/ai/evaluate/plan/__snapshots__/planning-input.test.ts.snap b/packages/midscene/tests/ai/evaluate/plan/__snapshots__/planning-input.test.ts.snap
@@ -5,7 +5,7 @@ exports[`automation - planning input > input value 1`] = `
   {
     "locate": {
       "id": "fbc2d002",
-      "prompt": "the input field with placeholder 'What needs to be done?'",
+      "prompt": "the input field labeled 'What needs to be done?'",
     },
     "param": {
       "value": "learning english",
@@ -21,7 +21,7 @@ exports[`automation - planning input > input value 2`] = `
   {
     "locate": {
       "id": "fbc2d002",
-      "prompt": "the input field labeled 'What needs to be done?'",
+      "prompt": "the input field with placeholder 'What needs to be done?'",
     },
     "param": {
       "value": "learning english",
@@ -45,7 +45,7 @@ exports[`automation - planning input > input value Add, delete, correct and chec
   {
     "locate": {
       "id": "fbc2d002",
-      "prompt": "the task input box with content 'Learn English'",
+      "prompt": "the task input box with the content 'Learn English'",
     },
     "param": {
       "value": "Learn English tomorrow",
@@ -61,7 +61,7 @@ exports[`automation - planning input > input value Add, delete, correct and chec
   {
     "locate": {
       "id": "fbc2d002",
-      "prompt": "the task input box containing 'Learn English'",
+      "prompt": "the input box containing 'Learn English'",
     },
     "param": {
       "value": "Learn Skiing",
diff --git a/packages/midscene/tests/ai/extract/__snapshots__/extract.test.ts.snap b/packages/midscene/tests/ai/extract/__snapshots__/extract.test.ts.snap
@@ -13,7 +13,7 @@ exports[`extract > online order 1`] = `
     },
   ],
   "errors": [],
-  "language": "zh",
+  "language": "en",
 }
 `;
 
diff --git a/packages/midscene/tests/ai/prompt.test.ts b/packages/midscene/tests/ai/prompt.test.ts
@@ -4,8 +4,7 @@ import { describe, expect, it, test } from 'vitest';
 describe('automation - computer', () => {
   it('should be able to generate prompt', async () => {
     const prompt = await systemPromptToTaskPlanning();
-    console.log(prompt);
     expect(prompt).toBeDefined();
+    expect(prompt).toMatchSnapshot();
   });
 });
-test('inspect with quick answer', async () => {});
diff --git a/packages/web-integration/package.json b/packages/web-integration/package.json
@@ -130,7 +130,7 @@
     "cors": "2.8.5",
     "express": "4.21.1",
     "inquirer": "10.1.5",
-    "openai": "4.57.1",
+    "openai": "4.81.0",
     "socket.io": "4.8.1",
     "socket.io-client": "4.8.1"
   },
diff --git a/packages/web-integration/tests/ai/web/playwright/ai-auto-todo.spec.ts b/packages/web-integration/tests/ai/web/playwright/ai-auto-todo.spec.ts
@@ -34,7 +34,9 @@ test('ai todo', async ({ ai, aiQuery }) => {
   await ai('Click the checkbox next to the second task');
   await ai('Click the "completed" Status button below the task list');
 
-  const taskList = await aiQuery<string[]>('string[], tasks in the list');
+  const taskList = await aiQuery<string[]>(
+    'string[], Extract all task names from the list',
+  );
   expect(taskList.length).toBe(1);
   expect(taskList[0]).toBe('Learning AI the day after tomorrow');
 
diff --git a/packages/web-integration/tests/ai/web/puppeteer/showcase.test.ts b/packages/web-integration/tests/ai/web/puppeteer/showcase.test.ts
@@ -91,17 +91,27 @@ describe(
       expect(names.length).toBeGreaterThan(5);
     });
 
-    it('search engine', async () => {
-      const { originPage, reset } = await launchPage('https://www.baidu.com/');
-      resetFn = reset;
-      const mid = new PuppeteerAgent(originPage);
-      await mid.aiAction('type "AI 101" in search box');
-      await mid.aiAction(
-        'type "Hello world" in search box, hit Enter, wait 2s, click the second result, wait 4s',
-      );
+    it(
+      'search engine',
+      async () => {
+        const { originPage, reset } = await launchPage(
+          'https://www.baidu.com/',
+        );
+        resetFn = reset;
+        const mid = new PuppeteerAgent(originPage);
+        await mid.aiAction('type "AI 101" in search box');
+        await mid.aiAction(
+          'type "Hello world" in search box, hit Enter, wait 2s, click the second result, wait 4s',
+        );
 
-      await mid.aiWaitFor('there are some search results about "Hello world"');
-    });
+        await mid.aiWaitFor(
+          'there are some search results about "Hello world"',
+        );
+      },
+      {
+        timeout: 3 * 60 * 1000,
+      },
+    );
 
     it('scroll', async () => {
       const htmlPath = path.join(__dirname, 'scroll.html');
@@ -152,6 +162,6 @@ describe(
     });
   },
   {
-    timeout: 60 * 1000,
+    timeout: 4 * 60 * 1000,
   },
 );
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml

Original file line number	Diff line number	Diff line change
`@@ -72,7 +72,7 @@ describe('ai inspect element', () => {`
`72`	`72`	`console.log('assertion passed, thought:', result?.content?.thought);`
`73`	`73`	`},`
`74`	`74`	`{`
`75`		`- timeout: 60 * 1000,`
	`75`	`+ timeout: 3 * 60 * 1000,`
`76`	`76`	`},`
`77`	`77`	`);`
`78`	`78`	`});`
Original file line number	Diff line number	Diff line change
`` @@ -13,7 +13,7 @@ exports[`extract > online order 1`] = ` ``
`13`	`13`	`},`
`14`	`14`	`],`
`15`	`15`	`"errors": [],`
`16`		`- "language": "zh",`
	`16`	`+ "language": "en",`
`17`	`17`	`}`
`18`	`18`	`` `; ``
`19`	`19`