Skip to content

Commit f49e9f6

Browse files
authored
x/evals: CoT-consensus (#2961)
* More infra * storage using sqlite * consensus working * fix log * finalStats * more results
1 parent a5fcb71 commit f49e9f6

File tree

13 files changed

+1245
-200
lines changed

13 files changed

+1245
-200
lines changed

x/spolu/research/evals/RESULTS.log

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# CoT total=128
2+
3+
## Game24
4+
5+
Finished run: algorithm=CoT dataset=Game24 provider=openai model=gpt-3.5-turbo-1106 check=2 total=128
6+
Finished run: algorithm=CoT dataset=Game24 provider=openai model=gpt-4-1106-preview check=5 total=128
7+
8+
Finished run: algorithm=CoT dataset=Game24 provider=mistral model=mistral-small check=0 total=128
9+
Finished run: algorithm=CoT dataset=Game24 provider=mistral model=mistral-medium check=0 total=128
10+
11+
## MATH
12+
13+
Finished run: algorithm=CoT dataset=MATH provider=openai model=gpt-3.5-turbo-1106 check=51 total=128
14+
Finished run: algorithm=CoT dataset=MATH provider=openai model=gpt-4-1106-preview check=88 total=128
15+
16+
17+
Finished run: algorithm=CoT dataset=MATH provider=mistral model=mistral-small check=39 total=128
18+
Finished run: algorithm=CoT dataset=MATH provider=mistral model=mistral-medium check=49 total=128
19+
20+
# CoT-Consensus total=128 pool=32
21+
22+
## Game24
23+
24+
Finished run: algorithm=CoT-consensus dataset=Game24 provider=openai model=gpt-3.5-turbo-1106
25+
Result: algorithm=CoT-consensus poolSize=1 dataset=Game24 provider=openai model=gpt-3.5-turbo-1106 check=5 total=128
26+
Result: algorithm=CoT-consensus poolSize=2 dataset=Game24 provider=openai model=gpt-3.5-turbo-1106 check=5 total=128
27+
Result: algorithm=CoT-consensus poolSize=4 dataset=Game24 provider=openai model=gpt-3.5-turbo-1106 check=5 total=128
28+
Result: algorithm=CoT-consensus poolSize=8 dataset=Game24 provider=openai model=gpt-3.5-turbo-1106 check=7 total=128
29+
Result: algorithm=CoT-consensus poolSize=16 dataset=Game24 provider=openai model=gpt-3.5-turbo-1106 check=7 total=128
30+
Result: algorithm=CoT-consensus poolSize=32 dataset=Game24 provider=openai model=gpt-3.5-turbo-1106 check=10 total=128
31+
Final stats: rate=17.28/s promptTokensRate=12608.702/s completionTokensRate=891.42/s promptTokensTotal=2988275 completionTokensTotal=211267
32+
33+
Finished run: algorithm=CoT-consensus dataset=Game24 provider=mistral model=mistral-small
34+
Result: algorithm=CoT-consensus poolSize=1 dataset=Game24 provider=mistral model=mistral-small check=2 total=128
35+
Result: algorithm=CoT-consensus poolSize=2 dataset=Game24 provider=mistral model=mistral-small check=2 total=128
36+
Result: algorithm=CoT-consensus poolSize=4 dataset=Game24 provider=mistral model=mistral-small check=2 total=128
37+
Result: algorithm=CoT-consensus poolSize=8 dataset=Game24 provider=mistral model=mistral-small check=1 total=128
38+
Result: algorithm=CoT-consensus poolSize=16 dataset=Game24 provider=mistral model=mistral-small check=0 total=128
39+
Result: algorithm=CoT-consensus poolSize=32 dataset=Game24 provider=mistral model=mistral-small check=0 total=128
40+
Final stats: rate=2.56/s promptTokensRate=2222.717/s completionTokensRate=221.41/s promptTokensTotal=3551411 completionTokensTotal=353772
41+
42+
## MATH
43+
44+
Finished run: algorithm=CoT-consensus dataset=MATH provider=openai model=gpt-3.5-turbo-1106
45+
Result: algorithm=CoT-consensus poolSize=1 dataset=MATH provider=openai model=gpt-3.5-turbo-1106 check=55 total=128
46+
Result: algorithm=CoT-consensus poolSize=2 dataset=MATH provider=openai model=gpt-3.5-turbo-1106 check=55 total=128
47+
Result: algorithm=CoT-consensus poolSize=4 dataset=MATH provider=openai model=gpt-3.5-turbo-1106 check=63 total=128
48+
Result: algorithm=CoT-consensus poolSize=8 dataset=MATH provider=openai model=gpt-3.5-turbo-1106 check=66 total=128
49+
Result: algorithm=CoT-consensus poolSize=16 dataset=MATH provider=openai model=gpt-3.5-turbo-1106 check=70 total=128
50+
Result: algorithm=CoT-consensus poolSize=32 dataset=MATH provider=openai model=gpt-3.5-turbo-1106 check=68 total=128
51+
Final stats: rate=3.33/s promptTokensRate=7484.167/s completionTokensRate=606.85/s promptTokensTotal=9267718 completionTokensTotal=802822
52+
53+
Finished run: algorithm=CoT-consensus dataset=MATH provider=mistral model=mistral-small
54+
Result: algorithm=CoT-consensus poolSize=1 dataset=MATH provider=mistral model=mistral-small check=36 total=128
55+
Result: algorithm=CoT-consensus poolSize=2 dataset=MATH provider=mistral model=mistral-small check=36 total=128
56+
Result: algorithm=CoT-consensus poolSize=4 dataset=MATH provider=mistral model=mistral-small check=42 total=128
57+
Result: algorithm=CoT-consensus poolSize=8 dataset=MATH provider=mistral model=mistral-small check=51 total=128
58+
Result: algorithm=CoT-consensus poolSize=16 dataset=MATH provider=mistral model=mistral-small check=59 total=128
59+
Result: algorithm=CoT-consensus poolSize=32 dataset=MATH provider=mistral model=mistral-small check=60 total=128
60+
Final stats: rate=2.59/s promptTokensRate=6575.136/s completionTokensRate=617.24/s promptTokensTotal=10387169 completionTokensTotal=975095

x/spolu/research/evals/lib/algorithms.ts

Lines changed: 118 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import PQueue from "p-queue";
2+
import { Database, open } from "sqlite";
3+
import sqlite3 from "sqlite3";
24

35
import { Dataset, ProblemId, Test } from "@app/lib/datasets";
4-
import { ChatCompletion, Model } from "@app/lib/models";
6+
import { ChatCompletion, ChatQuery, hashQuery, Model } from "@app/lib/models";
57

6-
export const ValidAlgorithmTypes = ["CoT"] as const;
8+
export const ValidAlgorithmTypes = ["CoT", "CoT-consensus"] as const;
79
export type AlgorithmType = (typeof ValidAlgorithmTypes)[number];
810

911
export type TestResult = {
@@ -12,39 +14,108 @@ export type TestResult = {
1214
check: boolean;
1315
};
1416

15-
const CONCURRENCY = 4;
16-
1717
export abstract class Algorithm {
18-
abstract readonly algorithm: AlgorithmType;
19-
2018
private history: {
2119
createdAt: number;
20+
runId: string;
2221
test: ProblemId;
22+
queryHash: string;
2323
completion: ChatCompletion;
2424
check: boolean;
2525
}[];
2626

27-
constructor() {
27+
private _sqlite: Database | null = null;
28+
29+
protected dataset: Dataset;
30+
protected model: Model;
31+
32+
/**
 * Binds the algorithm to the dataset and model it runs against and starts
 * with an empty in-memory completion history.
 */
constructor(dataset: Dataset, model: Model) {
  this.dataset = dataset;
  this.model = model;
  this.history = [];
}
3037

31-
storeCompletion({
38+
abstract algorithm(): AlgorithmType;
39+
40+
/**
 * Lazily opens (and memoizes) the SQLite store backing this run.
 *
 * The database file is `stores/<runId>.sqlite`; the `store` table is created
 * on first open if it does not exist yet.
 *
 * @returns the shared database handle for this algorithm instance.
 */
async sqlite() {
  if (this._sqlite === null) {
    const db = await open({
      filename: `stores/${this.runId()}.sqlite`,
      driver: sqlite3.Database,
    });

    // `INTEGER PRIMARY KEY` (not the PostgreSQL-only `BIGSERIAL`) is what makes
    // SQLite alias the column to the rowid and auto-assign it on INSERT; with
    // any other type name the column is left NULL since inserts never set it.
    await db.exec(
      "CREATE TABLE IF NOT EXISTS store (" +
        "id INTEGER PRIMARY KEY, " +
        "created_at INTEGER NOT NULL, " +
        "run_id TEXT NOT NULL, " +
        "test TEXT NOT NULL, " +
        "query_hash TEXT NOT NULL, " +
        "completion TEXT NOT NULL, " +
        "is_check INTEGER NOT NULL" +
        ")"
    );

    // Publish the handle only once the table exists. Several callers can get
    // here concurrently (each awaited above while `_sqlite` was still null), so
    // re-check at runtime: the first one wins, the others drop their redundant
    // connection instead of leaking it.
    if (this._sqlite === null) {
      this._sqlite = db;
    } else {
      await db.close();
    }
  }
  return this._sqlite;
}
61+
62+
/**
 * Builds the identifier of this run as
 * `<provider>-<model>-<dataset>-<algorithm>`.
 */
runId(): string {
  const provider = this.model.provider;
  const modelName = this.model.model();
  const datasetName = this.dataset.dataset;
  const algorithmName = this.algorithm();

  return `${provider}-${modelName}-${datasetName}-${algorithmName}`;
}
67+
68+
/**
 * Records one completion both in the SQLite store and in the in-memory
 * history.
 *
 * @param test - the test the completion was produced for (its `id` is stored).
 * @param query - the query that produced the completion; stored as its hash.
 * @param completion - the completion, serialized as JSON in the store.
 * @param check - whether the completion passed; stored as 1/0 in `is_check`.
 */
async storeCompletion({
  test,
  query,
  completion,
  check,
}: {
  test: Test;
  query: ChatQuery;
  completion: ChatCompletion;
  check: boolean;
}): Promise<void> {
  const db = await this.sqlite();

  // Compute the row values once so the persisted row and the in-memory entry
  // carry the same timestamp, run id and query hash.
  const now = Date.now();
  const runId = this.runId();
  const queryHash = hashQuery(query);

  await db.run(
    "INSERT INTO store (created_at, run_id, test, query_hash, completion, is_check) VALUES (?, ?, ?, ?, ?, ?)",
    [now, runId, test.id, queryHash, JSON.stringify(completion), check ? 1 : 0]
  );

  this.history.push({
    createdAt: now,
    runId,
    test: test.id,
    queryHash,
    completion,
    check,
  });
}
47104

105+
/**
 * Resolves a completion for `query`, preferring a row already present in the
 * SQLite store for this run id and query hash and otherwise asking the model
 * (with retry).
 *
 * @param query - the chat query to complete.
 * @returns the stored completion (parsed from JSON) or a fresh one.
 */
async runCompletion(query: ChatQuery): Promise<ChatCompletion> {
  const store = await this.sqlite();

  const cached = await store.get(
    "SELECT * FROM store WHERE run_id = ? AND query_hash = ?",
    [this.runId(), hashQuery(query)]
  );

  // Nothing stored for this query yet: fall through to the model.
  if (!cached) {
    return await this.model.completionWithRetry(query);
  }

  return JSON.parse(cached.completion);
}
118+
48119
stats() {
49120
const now = Date.now();
50121
const window = this.history.filter((x) => x.createdAt > now - 60000);
@@ -83,38 +154,60 @@ export abstract class Algorithm {
83154
);
84155
}
85156

157+
/**
 * Logs aggregate throughput and token usage over the whole in-memory history.
 *
 * Nothing is logged unless the history holds more than one entry. Rates are
 * computed over the time elapsed between the first and the last history
 * entries.
 */
finalStats() {
  if (this.history.length > 1) {
    const first = this.history[0];
    const last = this.history[this.history.length - 1];
    const seconds = (last.createdAt - first.createdAt) / 1000;

    const completionTokensTotal = this.history.reduce(
      (acc, x) => acc + x.completion.usage.completionTokens,
      0
    );
    const promptTokensTotal = this.history.reduce(
      (acc, x) => acc + x.completion.usage.promptTokens,
      0
    );

    const rate = this.history.length / seconds;
    const completionTokensRate = completionTokensTotal / seconds;
    const promptTokensRate = promptTokensTotal / seconds;

    console.log(
      `Final stats: ` +
        // Trailing space is required here, otherwise the output reads
        // `rate=1.23/spromptTokensRate=...`.
        `rate=${rate.toFixed(2)}/s ` +
        `promptTokensRate=${promptTokensRate.toFixed(3)}/s ` +
        `completionTokensRate=${completionTokensRate.toFixed(2)}/s ` +
        `promptTokensTotal=${promptTokensTotal} ` +
        `completionTokensTotal=${completionTokensTotal}`
    );
  }
}
184+
86185
abstract runOne({
87-
model,
88-
dataset,
89186
test,
90187
debug,
188+
iteration,
91189
}: {
92-
model: Model;
93-
dataset: Dataset;
94190
test: Test;
95191
debug?: boolean;
192+
iteration?: number;
96193
}): Promise<TestResult>;
97194

98195
async run({
99-
model,
100-
dataset,
101196
tests,
197+
concurrency,
102198
debug,
103199
}: {
104-
model: Model;
105-
dataset: Dataset;
106200
tests: Test[];
201+
concurrency: number;
107202
debug?: boolean;
108203
}): Promise<TestResult[]> {
109204
const queue = new PQueue({
110-
concurrency: CONCURRENCY,
205+
concurrency,
111206
});
112207

113208
const results = (
114209
await Promise.all(
115-
tests.map((test) =>
116-
queue.add(() => this.runOne({ model, dataset, test, debug }))
117-
)
210+
tests.map((test) => queue.add(() => this.runOne({ test, debug })))
118211
)
119212
).filter((x) => x);
120213

@@ -124,4 +217,6 @@ export abstract class Algorithm {
124217

125218
return results as TestResult[];
126219
}
220+
221+
abstract computeResults(): void;
127222
}

0 commit comments

Comments
 (0)