From 4540656000ddb4216c74b04d82bc469bd2f7472d Mon Sep 17 00:00:00 2001 From: Stanislas Polu Date: Wed, 20 Dec 2023 08:46:58 +0100 Subject: [PATCH] CoT results --- x/spolu/research/evals/RESULTS.log | 36 ++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/x/spolu/research/evals/RESULTS.log b/x/spolu/research/evals/RESULTS.log index 28bf0b709873..e08a6359bd56 100644 --- a/x/spolu/research/evals/RESULTS.log +++ b/x/spolu/research/evals/RESULTS.log @@ -30,6 +30,15 @@ Result: algorithm=CoT-consensus poolSize=16 dataset=Game24 provider=openai model Result: algorithm=CoT-consensus poolSize=32 dataset=Game24 provider=openai model=gpt-3.5-turbo-1106 check=10 total=128 Final stats: rate=17.28/spromptTokensRate=12608.702/s completionTokensRate=891.42/s promptTokensTotal=2988275 completionTokensTotal=211267 +Finished run: algorithm=CoT-consensus dataset=Game24 provider=openai model=gpt-4-1106-preview +Result: algorithm=CoT-consensus poolSize=1 dataset=Game24 provider=openai model=gpt-4-1106-preview check=5 total=128 +Result: algorithm=CoT-consensus poolSize=2 dataset=Game24 provider=openai model=gpt-4-1106-preview check=5 total=128 +Result: algorithm=CoT-consensus poolSize=4 dataset=Game24 provider=openai model=gpt-4-1106-preview check=5 total=128 +Result: algorithm=CoT-consensus poolSize=8 dataset=Game24 provider=openai model=gpt-4-1106-preview check=6 total=128 +Result: algorithm=CoT-consensus poolSize=16 dataset=Game24 provider=openai model=gpt-4-1106-preview check=5 total=128 +Result: algorithm=CoT-consensus poolSize=32 dataset=Game24 provider=openai model=gpt-4-1106-preview check=4 total=128 +Final stats: rate=1.21/spromptTokensRate=885.027/s completionTokensRate=100.03/s promptTokensTotal=2988275 completionTokensTotal=337743 + Finished run: algorithm=CoT-consensus dataset=Game24 provider=mistral model=mistral-small Result: algorithm=CoT-consensus poolSize=1 dataset=Game24 provider=mistral model=mistral-small check=2 total=128 Result: algorithm=CoT-consensus poolSize=2 dataset=Game24 provider=mistral model=mistral-small check=2 total=128 @@ -39,6 +48,15 @@ Result: algorithm=CoT-consensus poolSize=16 dataset=Game24 provider=mistral mode Result: algorithm=CoT-consensus poolSize=32 dataset=Game24 provider=mistral model=mistral-small check=0 total=128 Final stats: rate=2.56/spromptTokensRate=2222.717/s completionTokensRate=221.41/s promptTokensTotal=3551411 completionTokensTotal=353772 +Finished run: algorithm=CoT-consensus dataset=Game24 provider=mistral model=mistral-medium +Result: algorithm=CoT-consensus poolSize=1 dataset=Game24 provider=mistral model=mistral-medium check=0 total=128 +Result: algorithm=CoT-consensus poolSize=2 dataset=Game24 provider=mistral model=mistral-medium check=0 total=128 +Result: algorithm=CoT-consensus poolSize=4 dataset=Game24 provider=mistral model=mistral-medium check=0 total=128 +Result: algorithm=CoT-consensus poolSize=8 dataset=Game24 provider=mistral model=mistral-medium check=0 total=128 +Result: algorithm=CoT-consensus poolSize=16 dataset=Game24 provider=mistral model=mistral-medium check=0 total=128 +Result: algorithm=CoT-consensus poolSize=32 dataset=Game24 provider=mistral model=mistral-medium check=0 total=128 +Final stats: rate=1.84/spromptTokensRate=1594.544/s completionTokensRate=178.56/s promptTokensTotal=3555507 completionTokensTotal=398154 + ## MATH Finished run: algorithm=CoT-consensus dataset=MATH provider=openai model=gpt-3.5-turbo-1106 @@ -50,6 +68,15 @@ Result: algorithm=CoT-consensus poolSize=16 dataset=MATH provider=openai model=g Result: algorithm=CoT-consensus poolSize=32 dataset=MATH provider=openai model=gpt-3.5-turbo-1106 check=68 total=128 Final stats: rate=3.33/s promptTokensRate=7484.167/s completionTokensRate=606.85/s promptTokensTotal=9267718 completionTokensTotal=802822 +Finished run: algorithm=CoT-consensus dataset=MATH provider=openai model=gpt-4-1106-preview +Result: algorithm=CoT-consensus poolSize=1 dataset=MATH provider=openai model=gpt-4-1106-preview check=80 total=128 +Result: algorithm=CoT-consensus poolSize=2 dataset=MATH provider=openai model=gpt-4-1106-preview check=80 total=128 +Result: algorithm=CoT-consensus poolSize=4 dataset=MATH provider=openai model=gpt-4-1106-preview check=83 total=128 +Result: algorithm=CoT-consensus poolSize=8 dataset=MATH provider=openai model=gpt-4-1106-preview check=91 total=128 +Result: algorithm=CoT-consensus poolSize=16 dataset=MATH provider=openai model=gpt-4-1106-preview check=92 total=128 +Result: algorithm=CoT-consensus poolSize=32 dataset=MATH provider=openai model=gpt-4-1106-preview check=91 total=128 +Final stats: rate=0.44/spromptTokensRate=989.536/s completionTokensRate=119.40/s promptTokensTotal=9267718 completionTokensTotal=1118264 + Finished run: algorithm=CoT-consensus dataset=MATH provider=mistral model=mistral-small Result: algorithm=CoT-consensus poolSize=1 dataset=MATH provider=mistral model=mistral-small check=36 total=128 Result: algorithm=CoT-consensus poolSize=2 dataset=MATH provider=mistral model=mistral-small check=36 total=128 @@ -58,3 +85,12 @@ Result: algorithm=CoT-consensus poolSize=8 dataset=MATH provider=mistral model=m Result: algorithm=CoT-consensus poolSize=16 dataset=MATH provider=mistral model=mistral-small check=59 total=128 Result: algorithm=CoT-consensus poolSize=32 dataset=MATH provider=mistral model=mistral-small check=60 total=128 Final stats: rate=2.59/spromptTokensRate=6575.136/s completionTokensRate=617.24/s promptTokensTotal=10387169 completionTokensTotal=975095 + +Finished run: algorithm=CoT-consensus dataset=MATH provider=mistral model=mistral-medium +Result: algorithm=CoT-consensus poolSize=1 dataset=MATH provider=mistral model=mistral-medium check=41 total=128 +Result: algorithm=CoT-consensus poolSize=2 dataset=MATH provider=mistral model=mistral-medium check=41 total=128 +Result: algorithm=CoT-consensus poolSize=4 dataset=MATH provider=mistral model=mistral-medium check=50 total=128 +Result: algorithm=CoT-consensus poolSize=8 dataset=MATH provider=mistral model=mistral-medium check=56 total=128 +Result: algorithm=CoT-consensus poolSize=16 dataset=MATH provider=mistral model=mistral-medium check=64 total=128 +Result: algorithm=CoT-consensus poolSize=32 dataset=MATH provider=mistral model=mistral-medium check=68 total=128 +Final stats: rate=0.96/spromptTokensRate=2407.524/s completionTokensRate=255.92/s promptTokensTotal=10323371 completionTokensTotal=1097376