web-infra-dev · yuyutaotao · Feb 7, 2025 · Jan 27, 2025 · Jan 27, 2025 · Feb 2, 2025
diff --git a/.gitignore b/.gitignore
@@ -96,6 +96,7 @@ test-results/
 playwright-report/
 blob-report/
 playwright/.cache/
+iife-script/
 
 # Midscene.js dump files
 __ai_responses__/

diff --git a/biome.json b/biome.json
@@ -16,6 +16,7 @@
       "**/doc_build",
       "*-dump.json",
       "test-results/**",
+      "iife-script/**",
       "script_get_all_texts.tmp.js",
       "**/playwright-report/**",
       "**/todo-report.spec.ts-snapshots/**",

diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json
@@ -7,7 +7,7 @@
     "forceConsistentCasingInFileNames": true,
     "isolatedModules": true,
     "jsx": "preserve",
-    "lib": ["ESNext"],
+    "lib": ["ESNext", "DOM"],
     "moduleResolution": "node",
     "paths": {
       "@/*": ["./src/*"]

diff --git a/packages/midscene/modern.config.ts b/packages/midscene/modern.config.ts
@@ -10,6 +10,7 @@ export default defineConfig({
       index: 'src/index.ts',
       env: 'src/env.ts',
       utils: 'src/utils.ts',
+      tree: 'src/tree.ts',
       'ai-model': 'src/ai-model/index.ts',
     },
     outDir: 'dist/lib',

diff --git a/packages/midscene/package.json b/packages/midscene/package.json
@@ -13,14 +13,16 @@
     ".": "./dist/lib/index.js",
     "./env": "./dist/lib/env.js",
     "./utils": "./dist/lib/utils.js",
-    "./ai-model": "./dist/lib/ai-model.js"
+    "./ai-model": "./dist/lib/ai-model.js",
+    "./tree": "./dist/lib/tree.js"
   },
   "typesVersions": {
     "*": {
       ".": ["./dist/lib/types/index.d.ts"],
       "env": ["./dist/lib/types/env.d.ts"],
       "utils": ["./dist/lib/types/utils.d.ts"],
-      "ai-model": ["./dist/lib/types/ai-model.d.ts"]
+      "ai-model": ["./dist/lib/types/ai-model.d.ts"],
+      "tree": ["./dist/lib/types/tree.d.ts"]
     }
   },
   "scripts": {

diff --git a/packages/midscene/src/ai-model/inspect.ts b/packages/midscene/src/ai-model/inspect.ts
@@ -8,6 +8,7 @@ import type {
   AIUsageInfo,
   BaseElement,
   ElementById,
+  ElementTreeNode,
   Size,
   UIContext,
 } from '@/types';
@@ -52,9 +53,8 @@ function transformToAbsoluteCoords(
 // let index = 0;
 export async function transformElementPositionToId(
   aiResult: AIElementResponse | [number, number],
-  elementsInfo: BaseElement[],
+  treeRoot: ElementTreeNode<BaseElement>,
   size: { width: number; height: number },
-  screenshotBase64: string,
 ) {
   if (Array.isArray(aiResult)) {
     const relativePosition = aiResult;
@@ -67,7 +67,7 @@ export async function transformElementPositionToId(
     );
 
     const element = elementByPositionWithElementInfo(
-      elementsInfo,
+      treeRoot,
       absolutePosition,
     );
     assert(
@@ -96,7 +96,7 @@ function getQuickAnswer(
     | Partial<AISingleElementResponse>
     | Partial<AISingleElementResponseByPosition>
     | undefined,
-  elementsInfo: BaseElement[],
+  tree: ElementTreeNode<BaseElement>,
   elementById: ElementById,
   insertElementByPosition: (position: { x: number; y: number }) => BaseElement,
 ) {
@@ -115,10 +115,7 @@ function getQuickAnswer(
   }
 
   if ('position' in quickAnswer && quickAnswer.position) {
-    let element = elementByPositionWithElementInfo(
-      elementsInfo,
-      quickAnswer.position,
-    );
+    let element = elementByPositionWithElementInfo(tree, quickAnswer.position);
     if (!element) {
       element = insertElementByPosition(quickAnswer.position);
     }
@@ -156,7 +153,7 @@ export async function AiInspectElement<
   // meet quick answer
   const quickAnswer = getQuickAnswer(
     options.quickAnswer,
-    context.content,
+    context.tree,
     elementById,
     insertElementByPosition,
   );
@@ -202,9 +199,8 @@ export async function AiInspectElement<
   return {
     parseResult: await transformElementPositionToId(
       res.content,
-      context.content,
+      context.tree,
       size,
-      screenshotBase64,
     ),
     rawResponse: res.content,
     elementById,
@@ -282,7 +278,6 @@ export async function AiAssert<
   assert(assertion, 'assertion should be a string');
 
   const { screenshotBase64 } = context;
-  const { description } = await describeUserPage(context, liteContextConfig);
   const systemPrompt = systemPromptToAssert();
 
   const msgs: AIArgs = [

diff --git a/packages/midscene/src/ai-model/llm-planning.ts b/packages/midscene/src/ai-model/llm-planning.ts
@@ -20,8 +20,7 @@ export async function plan(
 ): Promise<PlanningAIResponse> {
   const { callAI, context } = opts || {};
   const { screenshotBase64, screenshotBase64WithElementMarker } = context;
-  const { description: pageDescription, elementByPosition } =
-    await describeUserPage(context);
+  const { description: pageDescription } = await describeUserPage(context);
 
   const systemPrompt = await systemPromptToTaskPlanning();
   const taskBackgroundContextText = generateTaskBackgroundContext(

diff --git a/packages/midscene/src/ai-model/prompt/llm-planning.ts b/packages/midscene/src/ai-model/prompt/llm-planning.ts
@@ -65,7 +65,7 @@ You are a versatile professional in software UI automation. Your outstanding con
 
 - All the actions you composed MUST be based on the page context information you get.
 - Trust the "What have been done" field about the task (if any), don't repeat actions in it.
-- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`.
+- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`.
 - If you cannot plan any action at all (i.e. empty actions array), set reason in the \`error\` field.
 
 ## About the \`actions\` field
@@ -80,7 +80,7 @@ type LocateParam = {locateParam}
 
 Each action has a \`type\` and corresponding \`param\`. To be detailed:
 - type: 'Tap', tap the located element
-  * {{ locate: {sample}, param: null }}
+  * {{ locate: LocateParam, param: null }}
 - type: 'Hover', move mouse over to the located element
   * {{ locate: LocateParam, param: null }}
 - type: 'Input', replace the value in the input field
@@ -133,7 +133,10 @@ The JSON format is as follows:
   "furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null, // Use the same language as the user's instruction.
   "error"?: string // Use the same language as the user's instruction.
 }}
-Here is an example of how to decompose a task:
+
+## Examples
+
+### Example 1: Decompose a task
 
 When a user says 'Click the language switch button, wait 1s, click "English"', the user will give you the description like this:
 
@@ -176,7 +179,7 @@ By viewing the page screenshot and description, you should consider this and out
   }}
 }}
 
-Here is another example of how to tolerate error situations only when the instruction is an "if" statement:
+### Example 2: Tolerate error situations only when the instruction is an "if" statement
 
 If the user says "If there is a popup, close it", you should consider this and output the JSON:
 
@@ -203,7 +206,7 @@ For contrast, if the user says "Close the popup" in this situation, you should c
   "furtherPlan": null
 }}
 
-Here is an example of when task is accomplished, don't plan more actions:
+### Example 3: When the task is accomplished, don't plan more actions
 
 When the user ask to "Wait 4s", you should consider this:
 
@@ -219,7 +222,7 @@ When the user ask to "Wait 4s", you should consider this:
   "furtherPlan": null // All steps have been included in the actions, so no further plan is needed
 }}
 
-Here is an example of what NOT to do:
+### Example 4: What NOT to do
 
 Wrong output:
 
@@ -230,7 +233,7 @@ Wrong output:
       "thought": "Click the language switch button to open the language options.",
       "param": null,
       "locate": {{
-        {sample}, // WRONG:prompt is missing
+        {{"id": "c81c4e9a33"}}, // WRONG: prompt is missing
       }}
     }},
     {{