perf: add benchmark results for all gpt models (#89)
* feat: add benchmark results for all gpt models and minor bug fix

* feat: add cost per $ metric to suites and readme

* Update README.md

* Update util.py

---------

Co-authored-by: Silvery D. Fu <163332118+zenodflow@users.noreply.github.com>
2 people authored and Char15Xu committed Aug 13, 2024
1 parent c38a707 commit 6fbb72a
Showing 7 changed files with 131 additions and 69 deletions.
165 changes: 106 additions & 59 deletions benchmark/README.md
@@ -66,77 +66,96 @@ python -m benchmark.run -s <suite-name>

----

#### GPT-4o-2024-08-06

| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) |
| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: |
| abt-buy | 92.92 | 95.63 | 94.26 | 0.6579 | 1831 | 110 |
| amazon-google | 71.32 | 83.26 | 76.83 | 1.324 | 926 | 83 |
| beer | 91.67 | 78.57 | 84.62 | 0.04477 | 2032 | 18 |
| dblp-acm | 95.79 | 100 | 97.85 | 0.5626 | 2216 | 120 |
| dblp-scholar | 91.57 | 91.2 | 91.38 | 0.764 | 1630 | 100 |
| fodors-zagats | 100 | 95.45 | 97.67 | 0.08398 | 2250 | 40 |
| itunes-amazon | 100 | 57.69 | 73.17 | 0.07134 | 1471 | 98 |
| walmart-amazon | 90.26 | 91.19 | 90.72 | 0.6333 | 1883 | 110 |

#### GPT-4o-2024-05-13

| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) |
| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: |
| abt-buy | 90.18 | 98.06 | 93.95 | 1.206 | 999 | 95 |
| amazon-google | 69.71 | 81.97 | 75.35 | 1.045 | 1174 | 75 |
| beer | 91.67 | 78.57 | 84.62 | 0.06916 | 1315 | 54 |
| dblp-acm | 96.51 | 99.6 | 98.03 | 1.035 | 1204 | 90 |
| dblp-scholar | 92.92 | 89.2 | 91.02 | 0.9502 | 1311 | 140 |
| fodors-zagats | 100 | 95.45 | 97.67 | 0.1674 | 1129 | 120 |
| itunes-amazon | 100 | 88.46 | 93.88 | 0.1422 | 738 | 46 |
| walmart-amazon | 91.01 | 89.12 | 90.05 | 1.01 | 1181 | 110 |

#### GPT-4o-mini

| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) |
| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: |
| abt-buy | 94.61 | 76.7 | 84.72 | 0.0362 | 33287 | 140 |
| amazon-google | 68.32 | 76.82 | 72.32 | 0.02291 | 53557 | 110 |
| beer | 100 | 28.57 | 44.44 | 0.002078 | 43792 | 87 |
| dblp-acm | 96.79 | 84.4 | 90.17 | 0.03115 | 40032 | 120 |
| dblp-scholar | 90.75 | 62.8 | 74.23 | 0.02864 | 43505 | 120 |
| fodors-zagats | 100 | 77.27 | 87.18 | 0.005028 | 37589 | 28 |
| itunes-amazon | 100 | 46.15 | 63.16 | 0.004256 | 24671 | 34 |
| walmart-amazon | 96.3 | 67.36 | 79.27 | 0.03037 | 39282 | 82 |

#### GPT-4-turbo

| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) |
| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: |
| abt-buy | 92.06 | 84.47 | 88.1 | 2.486 | 484 | 86 |
| amazon-google | 69.92 | 73.82 | 71.82 | 1.531 | 801 | 110 |
| beer | 100 | 50 | 66.67 | 0.1406 | 647 | 63 |
| dblp-acm | 95.75 | 99.2 | 97.45 | 2.139 | 582 | 79 |
| dblp-scholar | 95.12 | 78 | 85.71 | 2.064 | 603 | 58 |
| fodors-zagats | 100 | 86.36 | 92.68 | 0.3252 | 581 | 51 |
| itunes-amazon | 100 | 57.69 | 73.17 | 0.2998 | 350 | 9.8 |
| walmart-amazon | 92.18 | 85.49 | 88.71 | 2.106 | 566 | 47 |

#### GPT-4

| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) |
| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: |
| abt-buy | 95.02 | 92.72 | 93.86 | 7.26 | 165 | 140 |
| amazon-google | 63.44 | 90.13 | 74.47 | 4.44 | 276 | 94 |
| beer | 90 | 64.29 | 75 | 0.4133 | 220 | 74 |
| dblp-acm | 96.15 | 100 | 98.04 | 6.232 | 200 | 130 |
| dblp-scholar | 91.56 | 82.4 | 86.74 | 5.694 | 218 | 130 |
| fodors-zagats | 100 | 86.36 | 92.68 | 0.9667 | 195 | 73 |
| itunes-amazon | 100 | 46.15 | 63.16 | 0.853 | 123 | 71 |
| walmart-amazon | 90.91 | 88.08 | 89.47 | 6.032 | 197 | 140 |

#### GPT-3.5-turbo

| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) |
| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: |
| abt-buy | 100 | 15.05 | 26.16 | 0.3649 | 3302 | 22 |
| amazon-google | 68.6 | 35.62 | 46.89 | 0.2209 | 5554 | 160 |
| beer | 100 | 35.71 | 52.63 | 0.02057 | 4423 | 78 |
| dblp-acm | 99.38 | 64 | 77.86 | 0.3106 | 4014 | 140 |
| dblp-scholar | 92.41 | 29.2 | 44.38 | 0.2834 | 4396 | 150 |
| fodors-zagats | 100 | 40.91 | 58.06 | 0.04857 | 3891 | 54 |
| itunes-amazon | 100 | 19.23 | 32.26 | 0.04254 | 2468 | 87 |
| walmart-amazon | 94.44 | 35.23 | 51.32 | 0.3007 | 3967 | 150 |

### Meta-Llama3-8B-Instruct-8bit

| Benchmark | Precision | Recall | F1 | Throughput (pps) |
|:--------------:|:---------:|:------:|:-----:|:----------------:|
| abt-buy | 70.36 | 95.63 | 81.07 | 0.74 |
| amazon-google | 51.46 | 75.54 | 61.22 | 1.2 |
| beer | 90.0 | 64.29 | 75.0 | 0.89 |
| dblp-acm | 88.58 | 90.0 | 89.29 | 0.99 |
| dblp-scholar | 81.68 | 85.6 | 83.59 | 1.1 |
| fodors-zagats | 89.47 | 77.27 | 82.93 | 0.92 |
| itunes-amazon | 50.0 | 69.23 | 58.06 | 0.66 |
| walmart-amazon | 54.9 | 87.05 | 67.33 | 0.98 |

For more results on Llama models, please see [Llama.md](Llama.md).

@@ -184,3 +203,31 @@ Varying the batch size when performing EM over the `abt-buy` benchmark:
| 128 | 94.54 | 22.32 | 0.11 | 0.99 |
| 256 | 93.43 | 32.93 | 0.10 | 0.99 |
| 512 | 70.87 | 74.44 | 0.08 | 0.95 |
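
The rows above come from batching multiple candidate pairs into a single prompt; a minimal chunking sketch of that idea (the helper name is illustrative, not the library's API):

```python
def batches(pairs, batch_size):
    # Yield consecutive slices of at most batch_size candidate pairs.
    for i in range(0, len(pairs), batch_size):
        yield pairs[i:i + batch_size]

# 10 pairs at batch size 4 split into chunks of 4, 4, and 2
sizes = [len(b) for b in batches(list(range(10)), 4)]
```

Larger batches amortize prompt overhead (lower cost, higher throughput), at the accuracy cost visible in the biggest batch sizes above.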

### Data preparation (w/ vs. w/o schema)

To run a single benchmark in `/classic` without schema:

```shell
python -m benchmark.run -n <benchmark-name> -p -1 --no-schema
```

#### Setup

| Setup | Model | Tools | Data Preparation |
| :------ | :------- | :---- | :------------------------------------ |
| S1 | `gpt-4o` | Off | Field values only separated by spaces |
| S2 | `gpt-4o` | Off | Schema inline with entity description |
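
A toy serializer contrasting the two preparations (function and field names are illustrative, not the benchmark's actual code):

```python
def serialize(record: dict, schema: bool) -> str:
    if schema:
        # S2: keep the schema inline with the entity description.
        return " ".join(f"{k}: {v}" for k, v in record.items())
    # S1: field values only, separated by spaces.
    return " ".join(str(v) for v in record.values())

item = {"title": "Sony MDR-7506", "price": 99.0}
s1 = serialize(item, schema=False)
s2 = serialize(item, schema=True)
```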

#### Results

| Benchmark | Precision (S1, S2) | Recall (S1, S2) | F1 (S1, S2) |
| :------------- | :----------------: | :--------------: | :-------------: |
| abt-buy | 84.0, **89.9** | 99.5, **99.5** | 91.1, **94.5** |
| amazon-google | 60.0, **67.4** | 89.7, **92.7** | 71.9, **78.1** |
| beer | 92.3, **92.3** | 85.7, **85.7** | 88.9, **88.9** |
| dblp-acm | 80.4, **94.7** | **100.0**, 99.6 | 89.1, **97.1** |
| dblp-scholar | 78.4, **88.3** | 98.8, **93.6** | 87.4, **90.9** |
| fodors-zagats | 95.7, **100.0** | 100.0, **100.0** | 97.8, **100.0** |
| itunes-amazon  |   89.3, **100.0**  |  92.6, **96.3**  | 90.9, **98.1**  |
| walmart-amazon | 75.4, **85.4** | 95.3, **91.2** | 84.2, **88.2** |
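
F1 in these tables is the harmonic mean of precision and recall, so any row can be checked directly, e.g. abt-buy under S2:

```python
def f1(precision: float, recall: float) -> float:
    # Harmonic mean of precision and recall (inputs in percent).
    return 2 * precision * recall / (precision + recall)

# abt-buy under S2: precision 89.9, recall 99.5 -> F1 of about 94.5
score = round(f1(89.9, 99.5), 1)
```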
4 changes: 3 additions & 1 deletion benchmark/suite/gpt_35_turbo.py
@@ -37,14 +37,16 @@ def run(args):
save(df, name)

# generate markdown table
df["pair_per_$"] = df["num_pairs"] // df["cost"]
df = df[["benchmark", "precision", "recall", "f1",
         "cost", "pair_per_$", "throughput"]]
field_names = {
"benchmark": "Benchmark",
"precision": "Precision",
"recall": "Recall",
"f1": "F1",
"cost": "Cost ($)",
"pair_per_$": "Pair per $",
"throughput": "Throughput (pps)",
}
df = df.rename(columns=field_names)
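
The new `pair_per_$` column floor-divides the number of scored pairs by the run's dollar cost; a standalone sketch with toy counts chosen to roughly match the GPT-4o abt-buy and beer rows:

```python
import pandas as pd

# Toy report rows; the real values come from each benchmark run.
df = pd.DataFrame({
    "benchmark": ["abt-buy", "beer"],
    "num_pairs": [1205, 91],
    "cost": [0.6579, 0.04477],
})

# Floor division yields the whole number of pairs matched per dollar.
df["pair_per_$"] = df["num_pairs"] // df["cost"]
result = df["pair_per_$"].tolist()
```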
4 changes: 3 additions & 1 deletion benchmark/suite/gpt_4.py
@@ -37,14 +37,16 @@ def run(args):
save(df, name)

# generate markdown table
df["pair_per_$"] = df["num_pairs"] // df["cost"]
df = df[["benchmark", "precision", "recall", "f1",
         "cost", "pair_per_$", "throughput"]]
field_names = {
"benchmark": "Benchmark",
"precision": "Precision",
"recall": "Recall",
"f1": "F1",
"cost": "Cost ($)",
"pair_per_$": "Pair per $",
"throughput": "Throughput (pps)",
}
df = df.rename(columns=field_names)
4 changes: 3 additions & 1 deletion benchmark/suite/gpt_4_turbo.py
@@ -37,14 +37,16 @@ def run(args):
save(df, name)

# generate markdown table
df["pair_per_$"] = df["num_pairs"] // df["cost"]
df = df[["benchmark", "precision", "recall", "f1",
         "cost", "pair_per_$", "throughput"]]
field_names = {
"benchmark": "Benchmark",
"precision": "Precision",
"recall": "Recall",
"f1": "F1",
"cost": "Cost ($)",
"pair_per_$": "Pair per $",
"throughput": "Throughput (pps)",
}
df = df.rename(columns=field_names)
4 changes: 3 additions & 1 deletion benchmark/suite/gpt_4o.py
@@ -37,14 +37,16 @@ def run(args):
save(df, name)

# generate markdown table
df["pair_per_$"] = df["num_pairs"] // df["cost"]
df = df[["benchmark", "precision", "recall", "f1",
         "cost", "pair_per_$", "throughput"]]
field_names = {
"benchmark": "Benchmark",
"precision": "Precision",
"recall": "Recall",
"f1": "F1",
"cost": "Cost ($)",
"pair_per_$": "Pair per $",
"throughput": "Throughput (pps)",
}
df = df.rename(columns=field_names)
4 changes: 3 additions & 1 deletion benchmark/suite/gpt_4o_mini.py
@@ -37,14 +37,16 @@ def run(args):
save(df, name)

# generate markdown table
df["pair_per_$"] = df["num_pairs"] // df["cost"]
df = df[["benchmark", "precision", "recall", "f1",
         "cost", "pair_per_$", "throughput"]]
field_names = {
"benchmark": "Benchmark",
"precision": "Precision",
"recall": "Recall",
"f1": "F1",
"cost": "Cost ($)",
"pair_per_$": "Pair per $",
"throughput": "Throughput (pps)",
}
df = df.rename(columns=field_names)
15 changes: 10 additions & 5 deletions benchmark/suite/util.py
@@ -7,11 +7,14 @@
import benchmark as bm


def run_benchmark(benchmark: str, args):
    libem.reset()

    # deep copy args so per-benchmark settings do not leak between runs
    _args = copy.deepcopy(args)
    _args.name = benchmark
    benchmark = bm.benchmarks[benchmark]

    return benchmark(_args)
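
The deep copy in `run_benchmark` is what keeps per-benchmark settings (like `name`) from leaking across runs in a suite; a minimal sketch of the pattern with a stand-in args object (names are illustrative):

```python
import copy
from types import SimpleNamespace

def run_one(benchmark: str, args):
    # Copy first so mutating per-run settings never touches the caller's args.
    _args = copy.deepcopy(args)
    _args.name = benchmark
    return _args

shared = SimpleNamespace(name=None, batch_size=16)
first = run_one("abt-buy", shared)
second = run_one("beer", shared)
```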


def report_to_dataframe(reports, key_col: str = "benchmark"):
@@ -28,7 +31,8 @@ def report_to_dataframe(reports, key_col: str = "benchmark"):
return pd.concat(rows)


def tabulate(df: pd.DataFrame, name: str):
    name = name.replace('_', '-')
    output_file = os.path.join(
        bm.table_dir,
        f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}-"
@@ -43,7 +47,8 @@ def plot(df: pd.DataFrame):
    pass


def save(df: pd.DataFrame, name: str):
    name = name.replace('_', '-')
    output_file = os.path.join(
        bm.result_dir,
        f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}-"
