Skip to content

Commit 46c379b

Browse files
authored
fix eval in ci (#90)
* add bananalyzer init to git action workflow
* use headless on ci
* make all remaining evals headless
* add sshx for debugging the eval
* move envs to container level
* debugging evals ci
* only keep last twenty github commits eval for debugging
* update playground
* add more evals back in
* increase ci container size
* revert back to small container
* remove sshx
* add bananalyzer evals back in
* debugging twitter_signup
* testing bigger docker container in ci
* revert back to ubuntu-latest
* >= 20
* remove timeout from git evals
* cleanup
* remove costar + more reliable output checking
* back to ===
* increase trial count to 3
* cleanup playground
* remove trial count 3 until we switch to browserbase (unstable rn)
1 parent 72f408f commit 46c379b

File tree

4 files changed

+69
-104
lines changed

4 files changed

+69
-104
lines changed

.github/workflows/ci.yml

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ jobs:
99
run-evals:
1010
runs-on: ubuntu-latest
1111
timeout-minutes: 25
12+
env:
13+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
14+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
15+
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
16+
HEADLESS: true
1217

1318
steps:
1419
- name: Check out repository code
@@ -17,7 +22,7 @@ jobs:
1722
- name: Set up Node.js
1823
uses: actions/setup-node@v4
1924
with:
20-
node-version: '20'
25+
node-version: "20"
2126

2227
- name: Install pnpm
2328
run: npm install -g pnpm
@@ -28,11 +33,13 @@ jobs:
2833
- name: Install Playwright browsers
2934
run: pnpm exec playwright install --with-deps
3035

36+
- name: Initialize Banana Analyzer Evals
37+
run: ./evals/bananalyzer-ts/init.sh
38+
3139
- name: Run Evals
32-
env:
33-
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
34-
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
35-
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
36-
HEADLESS: true
3740
run: pnpm evals
3841
timeout-minutes: 20
42+
43+
# Uncomment the lines below to ssh into the container for debugging
44+
# - name: Install sshx
45+
# run: curl -sSf https://sshx.io/get | sh -s run

evals/bananalyzer-ts/index.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,11 @@ export async function evaluateExample(
5959
return false;
6060
}
6161

62-
const stagehand = new Stagehand({ env: "LOCAL", verbose: 1 });
62+
const stagehand = new Stagehand({
63+
env: "LOCAL",
64+
verbose: 1,
65+
headless: process.env.HEADLESS !== "false",
66+
});
6367
await stagehand.init();
6468

6569
let server: Server | null = null;

evals/index.eval.ts

Lines changed: 50 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -123,120 +123,76 @@ const peeler_complex = async () => {
123123

124124
return price !== null;
125125
};
126+
126127
const extract_collaborators_from_github_repository = async () => {
127-
const stagehand = new Stagehand({ env: "LOCAL", verbose: 1 });
128+
const stagehand = new Stagehand({
129+
env: "LOCAL",
130+
verbose: 1,
131+
headless: process.env.HEADLESS !== "false",
132+
});
128133
await stagehand.init();
129134

130-
const timeoutPromise = new Promise((_, reject) =>
131-
setTimeout(() => reject(new Error("Operation timed out")), 60000),
132-
);
133-
134135
try {
135-
const extractionPromise = (async () => {
136-
await stagehand.page.goto("https://github.com/facebook/react");
137-
await stagehand.act({
138-
action: "find the contributors section",
139-
});
140-
141-
await stagehand.waitForSettledDom();
142-
143-
const { contributors } = await stagehand.extract({
144-
instruction: "Extract top 20 contributors of this repository",
145-
schema: z.object({
146-
contributors: z.array(
147-
z.object({
148-
github_username: z.string(),
149-
information: z.string(),
150-
}),
151-
),
152-
}),
153-
modelName: "gpt-4o-2024-08-06",
154-
});
155-
156-
console.log("Extracted collaborators:", contributors);
157-
return contributors.length === 20;
158-
})();
159-
160-
const result = await Promise.race([extractionPromise, timeoutPromise]);
161-
await stagehand.context.close();
162-
return result;
163-
} catch (error) {
164-
console.error("Error or timeout occurred:", error);
165-
await stagehand.context.close();
166-
return false;
167-
}
168-
};
136+
await stagehand.page.goto("https://github.com/facebook/react");
137+
await stagehand.act({
138+
action: "find the contributors section",
139+
});
169140

170-
const extract_last_twenty_github_commits = async () => {
171-
const stagehand = new Stagehand({ env: "LOCAL", verbose: 1 });
172-
await stagehand.init();
141+
await stagehand.waitForSettledDom();
173142

174-
const timeoutPromise = new Promise((_, reject) =>
175-
setTimeout(() => reject(new Error("Operation timed out")), 60000),
176-
);
143+
const { contributors } = await stagehand.extract({
144+
instruction: "Extract top 20 contributors of this repository",
145+
schema: z.object({
146+
contributors: z.array(
147+
z.object({
148+
github_username: z.string(),
149+
information: z.string(),
150+
}),
151+
),
152+
}),
153+
modelName: "gpt-4o-2024-08-06",
154+
});
177155

178-
try {
179-
const extractionPromise = (async () => {
180-
await stagehand.page.goto("https://github.com/facebook/react");
181-
182-
await stagehand.waitForSettledDom();
183-
184-
const { commits } = await stagehand.extract({
185-
instruction: "Extract last 20 commits",
186-
schema: z.object({
187-
commits: z.array(
188-
z.object({
189-
commit_message: z.string(),
190-
commit_url: z.string(),
191-
commit_hash: z.string(),
192-
}),
193-
),
194-
}),
195-
modelName: "gpt-4o-2024-08-06",
196-
});
197-
198-
console.log("Extracted commits:", commits);
199-
return commits.length === 20;
200-
})();
201-
202-
const result = await Promise.race([extractionPromise, timeoutPromise]);
156+
console.log("Extracted collaborators:", contributors);
203157
await stagehand.context.close();
204-
return result;
158+
return contributors.length === 20;
205159
} catch (error) {
206160
console.error("Error or timeout occurred:", error);
207161
await stagehand.context.close();
208162
return false;
209163
}
210164
};
211165

212-
const twitter_signup = async () => {
213-
const stagehand = new Stagehand({ env: "LOCAL", verbose: 1 });
166+
const extract_last_twenty_github_commits = async () => {
167+
const stagehand = new Stagehand({
168+
env: "LOCAL",
169+
verbose: 1,
170+
headless: process.env.HEADLESS !== "false",
171+
});
214172
await stagehand.init();
215173

216-
const timeoutPromise = new Promise((_, reject) =>
217-
setTimeout(() => reject(new Error("Operation timed out")), 120000),
218-
);
219-
220174
try {
221-
const signupPromise = (async () => {
222-
await stagehand.page.goto("https://twitter.com");
223-
224-
await stagehand.act({
225-
action:
226-
'sign up with email "{random 12 digit number}@gmail.com", password "TEstTEst.1234". Use whatever else you want for all other fields. You can only stop if you have reached the verification stage.',
227-
});
228-
229-
await stagehand.waitForSettledDom();
175+
await stagehand.page.goto("https://github.com/facebook/react");
230176

231-
// Add a check here to verify if signup was successful
232-
// For example, check if a certain element is visible after signup
177+
await stagehand.waitForSettledDom();
233178

234-
return true; // Return true if signup was successful
235-
})();
179+
const { commits } = await stagehand.extract({
180+
instruction: "Extract last 20 commits",
181+
schema: z.object({
182+
commits: z.array(
183+
z.object({
184+
commit_message: z.string(),
185+
commit_url: z.string(),
186+
commit_hash: z.string(),
187+
}),
188+
),
189+
}),
190+
modelName: "gpt-4o-2024-08-06",
191+
});
236192

237-
const result = await Promise.race([signupPromise, timeoutPromise]);
193+
console.log("Extracted commits:", commits);
238194
await stagehand.context.close();
239-
return result;
195+
return commits.length === 20;
240196
} catch (error) {
241197
console.error("Error or timeout occurred:", error);
242198
await stagehand.context.close();
@@ -396,7 +352,6 @@ const tasks = {
396352
simple_google_search,
397353
extract_collaborators_from_github_repository,
398354
extract_last_twenty_github_commits,
399-
twitter_signup,
400355
costar,
401356
google_jobs,
402357
};
@@ -406,7 +361,7 @@ const exactMatch = (args: { input; output; expected? }) => {
406361

407362
return {
408363
name: "Exact match",
409-
score: Boolean(args.output) ? 1 : 0,
364+
score: args.output === true || args.output?.success == true,
410365
};
411366
};
412367

@@ -433,7 +388,6 @@ const testcases = [
433388
{ input: { name: "simple_google_search" } },
434389
{ input: { name: "extract_collaborators_from_github_repository" } },
435390
{ input: { name: "extract_last_twenty_github_commits" } },
436-
{ input: { name: "twitter_signup" } },
437391
// { input: { name: "costar" } },
438392
{ input: { name: "google_jobs" } },
439393
...chosenBananalyzerEvals.map((evalItem: any) => ({
@@ -496,4 +450,5 @@ Eval("stagehand", {
496450
}
497451
},
498452
scores: [exactMatch],
453+
// trialCount: 3,
499454
});

evals/playground.ts

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import { Eval } from "braintrust";
21
import { Stagehand } from "../lib";
32
import { z } from "zod";
43

@@ -116,7 +115,7 @@ async function main() {
116115
// const [costarResult] = await Promise.all([costar()]);
117116
const [homedepotResult] = await Promise.all([homedepot()]);
118117

119-
console.log("Homedepot result:", homedepotResult);
118+
console.log("Result:", homedepotResult);
120119
}
121120

122121
main().catch(console.error);

0 commit comments

Comments
 (0)