Skip to content

Commit

Permalink
refactor(ai-model): optimize model evaluation method (#98)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhoushaw authored Sep 29, 2024
1 parent 032b505 commit 10757a8
Show file tree
Hide file tree
Showing 143 changed files with 11,231 additions and 3,761 deletions.
3 changes: 1 addition & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,5 @@
"source.organizeImports.biome": "explicit"
},
"editor.defaultFormatter": "biomejs.biome",
"editor.formatOnSave": true,
"editor.formatOnSaveMode": "modifications"
"editor.formatOnSave": true
}
2 changes: 2 additions & 0 deletions biome.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
"**/dist",
"**/test-data/**",
"dist",
"__ai_responses__",
"ai-data/**",
"**/doc_build",
"*-dump.json",
"script_get_all_texts.tmp.js",
Expand Down
28 changes: 15 additions & 13 deletions packages/midscene/src/ai-model/automation/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import assert from 'node:assert';
import type { PlanningAIResponse, PlanningAction, UIContext } from '@/types';
import { AIActionType, type AIArgs, callAiFn } from '../common';
import {
AIActionType,
type AIArgs,
callAiFn,
transformUserMessages,
} from '../common';
import { describeUserPage } from '../prompt/util';
import { systemPromptToTaskPlanning } from './planning';

Expand All @@ -22,7 +27,7 @@ export async function plan(
{ role: 'system', content: systemPrompt },
{
role: 'user',
content: [
content: transformUserMessages([
{
type: 'image_url',
image_url: {
Expand All @@ -33,19 +38,16 @@ export async function plan(
{
type: 'text',
text: `
pageDescription: ${pageDescription}
pageDescription:\n
${pageDescription}
\n
Here is the description of the task. Just go ahead:
=====================================
${userPrompt}
=====================================
`,
},
{
type: 'text',
text: `
Here is the description of the task. Just go ahead:
=====================================
${userPrompt}
=====================================
`,
},
],
]),
},
];

Expand Down
4 changes: 2 additions & 2 deletions packages/midscene/src/ai-model/automation/planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ export function systemPromptToTaskPlanning() {
2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
Return in the following JSON format:
Please return the result in JSON format as follows:
{
queryLanguage: '', // language of the description of the task
actions: [ // always return in Array
Expand Down
19 changes: 18 additions & 1 deletion packages/midscene/src/ai-model/common.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type OpenAI from 'openai';
import type {
ChatCompletionContentPart,
ChatCompletionSystemMessageParam,
ChatCompletionUserMessageParam,
} from 'openai/resources';
Expand All @@ -12,7 +13,11 @@ import {
transfromOpenAiArgsToCoze,
useCozeModel,
} from './coze';
import { callToGetJSONObject, useOpenAIModel } from './openai';
import {
MIDSCENE_MODEL_TEXT_ONLY,
callToGetJSONObject,
useOpenAIModel,
} from './openai';

export type AIArgs = [
ChatCompletionSystemMessageParam,
Expand Down Expand Up @@ -64,3 +69,15 @@ export async function callAiFn<T>(options: {
'Cannot find Coze or OpenAI config. You should set at least one of them.',
);
}

/**
 * Adapts the content parts of a user message to the configured model.
 *
 * When the environment variable named by `MIDSCENE_MODEL_TEXT_ONLY` is
 * enabled, every non-text part (such as `image_url`) is dropped and the text
 * parts are concatenated, in order, into a single string. Otherwise the
 * original array is returned untouched.
 *
 * Environment variables are always strings, so a plain truthiness check would
 * treat `"false"` or `"0"` as enabled. Those values, as well as an unset or
 * blank variable, are treated as disabled here.
 *
 * @param msgs - Content parts of the user message.
 * @returns The same `msgs` array when text-only mode is off; otherwise the
 * concatenated text of all `text` parts.
 */
export function transformUserMessages(
  msgs: ChatCompletionContentPart[],
): string | ChatCompletionContentPart[] {
  // Normalize the raw flag so that "False", " 0 " and similar spellings are
  // recognized as "off" rather than being truthy non-empty strings.
  const flag = (process.env[MIDSCENE_MODEL_TEXT_ONLY] ?? '')
    .trim()
    .toLowerCase();
  const textOnly = flag !== '' && flag !== '0' && flag !== 'false';
  if (!textOnly) return msgs;

  // Non-text parts contribute nothing; text parts are joined without separator.
  return msgs.map((msg) => (msg.type === 'text' ? msg.text : '')).join('');
}
60 changes: 25 additions & 35 deletions packages/midscene/src/ai-model/inspect.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import type {
ChatCompletionSystemMessageParam,
ChatCompletionUserMessageParam,
} from 'openai/resources';
import { AIActionType, callAiFn } from './common';
import { AIActionType, callAiFn, transformUserMessages } from './common';
import {
multiDescription,
systemPromptToFindElement,
Expand Down Expand Up @@ -45,34 +45,29 @@ export async function AiInspectElement<
{ role: 'system', content: systemPrompt },
{
role: 'user',
content: [
content: transformUserMessages([
{
type: 'image_url',
image_url: {
url: screenshotBase64,
detail: 'high',
},
},
{
type: 'text',
text: `
pageDescription: \n
${description}
`,
},
{
type: 'text',
text: `
Here is the description of the findElement. Just go ahead:
=====================================
${JSON.stringify({
description: findElementDescription,
multi: multiDescription(multi),
})}
=====================================
`,
pageDescription: \n
${description}
Here is the description of the findElement. Just go ahead:
=====================================
${JSON.stringify({
description: findElementDescription,
multi: multiDescription(multi),
})}
=====================================
`,
},
],
]),
},
];

Expand Down Expand Up @@ -117,7 +112,7 @@ export async function AiExtractElementInfo<
{ role: 'system', content: systemPrompt },
{
role: 'user',
content: [
content: transformUserMessages([
{
type: 'image_url',
image_url: {
Expand All @@ -142,7 +137,7 @@ ${typeof dataQuery === 'string' ? dataQuery : JSON.stringify(dataQuery, null, 2)
DATA_DEMAND ends.
`,
},
],
]),
},
];

Expand Down Expand Up @@ -176,7 +171,7 @@ export async function AiAssert<
{ role: 'system', content: systemPrompt },
{
role: 'user',
content: [
content: transformUserMessages([
{
type: 'image_url',
image_url: {
Expand All @@ -186,20 +181,15 @@ export async function AiAssert<
{
type: 'text',
text: `
pageDescription: \n
${description}
`,
},
{
type: 'text',
text: `
Here is the description of the assertion. Just go ahead:
=====================================
${assertion}
=====================================
`,
pageDescription: \n
${description}
Here is the description of the assertion. Just go ahead:
=====================================
${assertion}
=====================================
`,
},
],
]),
},
];

Expand Down
32 changes: 31 additions & 1 deletion packages/midscene/src/ai-model/openai/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export const MIDSCENE_MODEL_NAME = 'MIDSCENE_MODEL_NAME';
export const MIDSCENE_LANGSMITH_DEBUG = 'MIDSCENE_LANGSMITH_DEBUG';
export const MIDSCENE_DEBUG_AI_PROFILE = 'MIDSCENE_DEBUG_AI_PROFILE';
export const OPENAI_API_KEY = 'OPENAI_API_KEY';
export const MIDSCENE_MODEL_TEXT_ONLY = 'MIDSCENE_MODEL_TEXT_ONLY';

const OPENAI_USE_AZURE = 'OPENAI_USE_AZURE';

Expand Down Expand Up @@ -73,6 +74,7 @@ export async function call(
messages,
response_format: responseFormat,
temperature: 0.2,
stream: false,
});
shouldPrintTiming && console.timeEnd('Midscene - AI call');
shouldPrintTiming && console.log('Midscene - AI usage', completion.usage);
Expand Down Expand Up @@ -110,7 +112,35 @@ export async function callToGetJSONObject<T>(
}
}

if (model.startsWith('gemini')) {
responseFormat = { type: AIResponseFormat.TEXT };
}

const response = await call(messages, responseFormat);
assert(response, 'empty response');
return JSON.parse(response.replace(/^```json\n|\n```$/g, ''));
const jsonContent = extractJSONFromCodeBlock(response);
return JSON.parse(jsonContent);
}

/**
 * Pulls the JSON object text out of a raw model response.
 *
 * Three patterns are tried in order and the first one that matches wins:
 *  1. the whole response, ignoring surrounding whitespace, is a `{ ... }` span;
 *  2. a fenced code block (optionally tagged `json`) wrapping a `{ ... }` span;
 *  3. the span from the first `{` to the last `}` anywhere in the text.
 *
 * No parsing or validation is performed; when nothing matches, the response
 * is returned unchanged.
 *
 * @param response - Raw text returned by the model.
 * @returns The extracted candidate JSON text, or `response` itself.
 */
export function extractJSONFromCodeBlock(response: string) {
  // `group` selects which capture holds the payload (0 = the entire match).
  const strategies: Array<{ pattern: RegExp; group: number }> = [
    { pattern: /^\s*(\{[\s\S]*\})\s*$/, group: 1 },
    { pattern: /```(?:json)?\s*(\{[\s\S]*?\})\s*```/, group: 1 },
    { pattern: /\{[\s\S]*\}/, group: 0 },
  ];

  for (const { pattern, group } of strategies) {
    const found = pattern.exec(response);
    if (found !== null) {
      return found[group];
    }
  }

  // Nothing JSON-like was found; hand the text back as-is.
  return response;
}
3 changes: 3 additions & 0 deletions packages/midscene/src/ai-model/prompt/element_inspector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ You are an expert in software page image (2D) and page element text analysis.
- The returned data must conform to the specified JSON format.
## Output Format:
Please return the result in JSON format as follows:
\`\`\`json
{
"elements": [
Expand Down
Loading

0 comments on commit 10757a8

Please sign in to comment.