HumeAI · twitchard · May 8, 2025
diff --git a/src/evals/index.ts b/src/evals/index.ts
@@ -123,17 +123,17 @@ const runMultipleEvals = async (
 ): Promise<EvalResult[]> => {
   await fs.mkdir(outputDir, { recursive: true });
 
-  const descriptionsSource =
-    descriptions === DESCRIPTIONS ? "default" : "custom";
-  const timestamp =
-    customTimestamp || new Date().toISOString().replace(/[:.]/g, "-");
+  // Use the caller-supplied timestamp if given; otherwise fall back to Unix seconds for shorter filenames
+  const unixTimestamp = customTimestamp || 
+    Math.floor(new Date().getTime() / 1000).toString();
 
   console.error(`Running ${count} evaluations of ${scenarioName} in parallel`);
 
   const evalTasks = Array.from({ length: count }, (_, i) => {
+    // Keep filenames short: <scenarioName>-<timestamp>-<1-based run index>.json
     const outputPath = path.join(
       outputDir,
-      `${scenarioName}-${descriptionsSource}-${timestamp}-${i + 1}.json`,
+      `${scenarioName}-${unixTimestamp}-${i + 1}.json`,
     );
     return {
       index: i + 1,
@@ -193,9 +193,8 @@ const run = async (
 ): Promise<void> => {
   await fs.mkdir(outputDir, { recursive: true });
 
-  const descriptionsSource =
-    descriptions === DESCRIPTIONS ? "default" : "custom";
-  const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
+  // Use a Unix timestamp (whole seconds) for shorter filenames
+  const timestamp = Math.floor(new Date().getTime() / 1000).toString();
 
   console.error(
     `Running ${count} evaluations of ${scenarioNames.length} scenarios: ${scenarioNames.join(", ")}`,
@@ -213,7 +212,7 @@ const run = async (
       outputDir,
       modelName,
       descriptions,
-      `${scenarioName}-${descriptionsSource}-${timestamp}`,
+      `${scenarioName}-${timestamp}`,
     );
 
     return { scenarioName, results };

diff --git a/src/evals/scenario/ai-playwright.ts b/src/evals/scenario/ai-playwright.ts
@@ -33,33 +33,29 @@ export const aiPlaywrightScenario = async (
           displayUse: mockDisplayUse,
           displayResult: mockDisplayResult,
         })),
-        get_scene: getContent(
-          "This tool is able to retrieve dialogue for the play.",
-          {
-            full_scene: dialogueContent,
-          },
-        ),
       },
       initialMessage:
-        "I have a scene from my play at /with a mentor and apprentice discussing courage. Could you help me read it with different voices for each character?",
-      roleplayerPrompt: `You are roleplaying a user who has written a play and wants to hear it performed with different character voices.
+        `Can you help me perform the following scene from my play with AI voices?\n\n${dialogueContent}`,
+      roleplayerPrompt: `You are roleplaying a user who has written a play and wants to create audio files of the dialogue being read aloud by two distinct engaging AI voices. 
 
-      You have a scene with two distinct characters: an elderly wise mentor and a young, enthusiastic apprentice discussing the concept of courage. You want the AI to help you access this dialogue and then read it using distinct voices that match each character.
+    Be unsatisfied with the initial voices the AI chooses and provide feedback. Become satisfied after a couple iterations.
 
-      After the agent helps you find the dialogue, express interest in hearing it performed. When the agent uses the tts tool to perform the dialogue, provide feedback on the voices used.
-
-      ${commonInstructions}
-
-      End the roleplay when you've heard a satisfactory performance of the dialogue that captures both characters with appropriate voices.
-      `,
+    ${commonInstructions}
+
+    End the roleplay when the entire text of your play has been read aloud correctly by each character.
+    `,
     },
     criteria: {
       ...commonCriteria,
       ...voiceDesignCriteria,
       diarization:
         "The agent should use save_voice to create a voice for each character, and each utterance dialogue belonging to that character should always be voiced with that voice.",
       one_speaker_per_request:
-        "All utterances within a single tts call should be spoken by the same character. The agent should not mix voices within a single tts call.",
+        `All text passed to the tts tool should be for a single voice. The agent should split the text into separate requests for each speaker. Not just separate UTTERANCES, separate REQUESTS.
+Ok: TTS(utterances[{text: ..., voiceName: a}, {text: ..., voiceName: a}])
+Not ok: TTS(utterances[{text: ..., voiceName: a}, {text: ..., voiceName: b}])
+Ok: TTS(utterances[{text: ..., voiceName: a}]), TTS(utterances[{text: ..., voiceName: b}])
+`,
       only_speech:
         "The 'text' passed to the tts tool should contain only the text meant to be spoken. It should be stripped of any stage directions, or speaker names, or section titles",
     },

diff --git a/src/evals/scenario/common.ts b/src/evals/scenario/common.ts
@@ -45,7 +45,8 @@ export const commonInstructions = `
 const voiceDesignText = await fs.readFile(
   path.join(__dirname, "/../data/voice_design.txt"),
   "utf-8",
-);
+)
 export const voiceDesignCriteria = {
+  voice_design_collaborative: `When the agent designs a new voice, it should ALWAYS STOP and ask for feedback on how the voice sounds BEFORE saving the voice or generating a significant amount of text with it. There should be a message from the roleplayer providing feedback before any voice is saved or any large source text is spoken.`,
   voice_design_well_done: `When crafting voice descriptions, or presenting the user with voice options, or guiding the user through the process of voice design, the agent should abide by the following directions:\n\n ${voiceDesignText}`,
 };
diff --git a/src/evals/scenario/picky-screenreader.ts b/src/evals/scenario/picky-screenreader.ts
@@ -69,6 +69,6 @@ export const pickyScreenreaderScenario = async (
       voice_options:
         "The agent should offer or demonstrate different voice options after the user expresses dissatisfaction.",
     },
-    maxTurns: 25,
+    maxTurns: 35,
   };
 };