diff --git a/benchmark/README.md b/benchmark/README.md index 29c6e1c..3094e62 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -66,77 +66,96 @@ python -m benchmark.run -s ---- -### Setup - -| Setup | Model | Tools | Data Preparation | -| :------ | :------- | :---- | :------------------------------------ | -| S1 | `gpt-4o` | Off | Field values only separated by spaces | -| S2 | `gpt-4o` | Off | Schema inline with entity description | - -### Results - -| Benchmark | Precision (S1, S2) | Recall (S1, S2) | F1 (S1, S2) | -| :------------- | :----------------: | :--------------: | :-------------: | -| abt-buy | 84.0, **89.9** | 99.5, **99.5** | 91.1, **94.5** | -| amazon-google | 60.0, **67.4** | 89.7, **92.7** | 71.9, **78.1** | -| beer | 92.3, **92.3** | 85.7, **85.7** | 88.9, **88.9** | -| dblp-acm | 80.4, **94.7** | **100.0**, 99.6 | 89.1, **97.1** | -| dblp-scholar | 78.4, **88.3** | 98.8, **93.6** | 87.4, **90.9** | -| fodors-zagats | 95.7, **100.0** | 100.0, **100.0** | 97.8, **100.0** | -| itunes-amazon | 89.3, **100.0** | 92.6, **96.3** | 90.9, **98.11** | -| walmart-amazon | 75.4, **85.4** | 95.3, **91.2** | 84.2, **88.2** | +#### GPT-4o-2024-08-06 + +| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) | +| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: | +| abt-buy | 92.92 | 95.63 | 94.26 | 0.6579 | 1831 | 110 | +| amazon-google | 71.32 | 83.26 | 76.83 | 1.324 | 926 | 83 | +| beer | 91.67 | 78.57 | 84.62 | 0.04477 | 2032 | 18 | +| dblp-acm | 95.79 | 100 | 97.85 | 0.5626 | 2216 | 120 | +| dblp-scholar | 91.57 | 91.2 | 91.38 | 0.764 | 1630 | 100 | +| fodors-zagats | 100 | 95.45 | 97.67 | 0.08398 | 2250 | 40 | +| itunes-amazon | 100 | 57.69 | 73.17 | 0.07134 | 1471 | 98 | +| walmart-amazon | 90.26 | 91.19 | 90.72 | 0.6333 | 1883 | 110 | + +#### GPT-4o-2024-05-13 + +| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) | +| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: | +| abt-buy | 90.18 | 98.06 | 93.95 | 1.206 | 999 | 95 | +| amazon-google | 69.71 | 81.97 | 75.35 | 1.045 | 1174 | 75 | +| beer | 91.67 | 78.57 | 84.62 | 0.06916 | 1315 | 54 | +| dblp-acm | 96.51 | 99.6 | 98.03 | 1.035 | 1204 | 90 | +| dblp-scholar | 92.92 | 89.2 | 91.02 | 0.9502 | 1311 | 140 | +| fodors-zagats | 100 | 95.45 | 97.67 | 0.1674 | 1129 | 120 | +| itunes-amazon | 100 | 88.46 | 93.88 | 0.1422 | 738 | 46 | +| walmart-amazon | 91.01 | 89.12 | 90.05 | 1.01 | 1181 | 110 | #### GPT-4o-mini -| Benchmark | Precision | Recall | F1 | Cost ($) | Throughput (pps) | -| :------------- | :-------: | :----: | :---: | :------: | :--------------: | -| abt-buy | 94.61 | 76.7 | 84.72 | 0.0362 | 140 | -| amazon-google | 68.32 | 76.82 | 72.32 | 0.02291 | 110 | -| beer | 100 | 28.57 | 44.44 | 0.002078 | 87 | -| dblp-acm | 96.79 | 84.4 | 90.17 | 0.03115 | 120 | -| dblp-scholar | 90.75 | 62.8 | 74.23 | 0.02864 | 120 | -| fodors-zagats | 100 | 77.27 | 87.18 | 0.005028 | 28 | -| itunes-amazon | 100 | 46.15 | 63.16 | 0.004256 | 34 | -| walmart-amazon | 96.3 | 67.36 | 79.27 | 0.03037 | 82 | +| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) | +| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: | +| abt-buy | 94.61 | 76.7 | 84.72 | 0.0362 | 33287 | 140 | +| amazon-google | 68.32 | 76.82 | 72.32 | 0.02291 | 53557 | 110 | +| beer | 100 | 28.57 | 44.44 | 0.002078 | 43792 | 87 | +| dblp-acm | 96.79 | 84.4 | 90.17 | 0.03115 | 40032 | 120 | +| dblp-scholar | 90.75 | 62.8 | 74.23 | 0.02864 | 43505 | 120 | +| fodors-zagats | 100 | 77.27 | 87.18 | 0.005028 | 37589 | 28 | +| itunes-amazon | 100 | 46.15 | 63.16 | 0.004256 | 24671 | 34 | +| walmart-amazon | 96.3 | 67.36 | 79.27 | 0.03037 | 39282 | 82 | + +#### GPT-4-turbo + +| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) | +| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: | +| abt-buy | 92.06 | 84.47 | 88.1 | 2.486 | 484 | 86 | +| amazon-google | 69.92 | 73.82 | 71.82 | 1.531 | 801 | 110 | +| beer | 100 | 50 | 66.67 | 0.1406 | 647 | 63 | +| dblp-acm | 95.75 | 99.2 | 97.45 | 2.139 | 582 | 79 | +| dblp-scholar | 95.12 | 78 | 85.71 | 2.064 | 603 | 58 | +| fodors-zagats | 100 | 86.36 | 92.68 | 0.3252 | 581 | 51 | +| itunes-amazon | 100 | 57.69 | 73.17 | 0.2998 | 350 | 9.8 | +| walmart-amazon | 92.18 | 85.49 | 88.71 | 2.106 | 566 | 47 | #### GPT-4 -| Benchmark | Precision | Recall | F1 | Cost ($) | Throughput (pps) | -| :------------- | :-------: | :----: | :---: | :------: | :--------------: | -| abt-buy | 95.02 | 92.72 | 93.86 | 7.26 | 140 | -| amazon-google | 63.44 | 90.13 | 74.47 | 4.44 | 94 | -| beer | 90 | 64.29 | 75 | 0.4133 | 74 | -| dblp-acm | 96.15 | 100 | 98.04 | 6.232 | 130 | -| dblp-scholar | 91.56 | 82.4 | 86.74 | 5.694 | 130 | -| fodors-zagats | 100 | 86.36 | 92.68 | 0.9667 | 73 | -| itunes-amazon | 100 | 46.15 | 63.16 | 0.853 | 71 | -| walmart-amazon | 90.91 | 88.08 | 89.47 | 6.032 | 140 | +| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) | +| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: | +| abt-buy | 95.02 | 92.72 | 93.86 | 7.26 | 165 | 140 | +| amazon-google | 63.44 | 90.13 | 74.47 | 4.44 | 276 | 94 | +| beer | 90 | 64.29 | 75 | 0.4133 | 220 | 74 | +| dblp-acm | 96.15 | 100 | 98.04 | 6.232 | 200 | 130 | +| dblp-scholar | 91.56 | 82.4 | 86.74 | 5.694 | 218 | 130 | +| fodors-zagats | 100 | 86.36 | 92.68 | 0.9667 | 195 | 73 | +| itunes-amazon | 100 | 46.15 | 63.16 | 0.853 | 123 | 71 | +| walmart-amazon | 90.91 | 88.08 | 89.47 | 6.032 | 197 | 140 | #### GPT-3.5-turbo -| Benchmark | Precision | Recall | F1 | Cost ($) | Throughput (pps) | -| :------------- | :-------: | :----: | :---: | :------: | :--------------: | -| abt-buy | 100 | 15.05 | 26.16 | 0.3649 | 22 | -| amazon-google | 68.6 | 35.62 | 46.89 | 0.2209 | 160 | -| beer | 100 | 35.71 | 52.63 | 0.02057 | 78 | -| dblp-acm | 99.38 | 64 | 77.86 | 0.3106 | 140 | -| dblp-scholar | 92.41 | 29.2 | 44.38 | 0.2834 | 150 | -| fodors-zagats | 100 | 40.91 | 58.06 | 0.04857 | 54 | -| itunes-amazon | 100 | 19.23 | 32.26 | 0.04254 | 87 | -| walmart-amazon | 94.44 | 35.23 | 51.32 | 0.3007 | 150 | +| Benchmark | Precision | Recall | F1 | Cost ($) | Pairs per $ | Throughput (pps) | +| :------------- | :-------: | :----: | :---: | :------: | :---------: | :--------------: | +| abt-buy | 100 | 15.05 | 26.16 | 0.3649 | 3302 | 22 | +| amazon-google | 68.6 | 35.62 | 46.89 | 0.2209 | 5554 | 160 | +| beer | 100 | 35.71 | 52.63 | 0.02057 | 4423 | 78 | +| dblp-acm | 99.38 | 64 | 77.86 | 0.3106 | 4014 | 140 | +| dblp-scholar | 92.41 | 29.2 | 44.38 | 0.2834 | 4396 | 150 | +| fodors-zagats | 100 | 40.91 | 58.06 | 0.04857 | 3891 | 54 | +| itunes-amazon | 100 | 19.23 | 32.26 | 0.04254 | 2468 | 87 | +| walmart-amazon | 94.44 | 35.23 | 51.32 | 0.3007 | 3967 | 150 | ### Meta-Llama3-8B-Instruct-8bit -| Benchmark | Precision | Recall | F1 | Cost ($) | Throughput (pps) | -|:--------------:|:---------:|:------:|:-----:|:--------:|:----------------:| -| abt-buy | 70.36 | 95.63 | 81.07 | - | 0.74 | -| amazon-google | 51.46 | 75.54 | 61.22 | - | 1.2 | -| beer | 90.0 | 64.29 | 75.0 | - | 0.89 | -| dblp-acm | 88.58 | 90.0 | 89.29 | - | 0.99 | -| dblp-scholar | 81.68 | 85.6 | 83.59 | - | 1.1 | -| fodors-zagats | 89.47 | 77.27 | 82.93 | - | 0.92 | -| itunes-amazon | 50.0 | 69.23 | 58.06 | - | 0.66 | -| walmart-amazon | 54.9 | 87.05 | 67.33 | - | 0.98 | +| Benchmark | Precision | Recall | F1 | Throughput (pps) | +|:--------------:|:---------:|:------:|:-----:|:----------------:| +| abt-buy | 70.36 | 95.63 | 81.07 | 0.74 | +| amazon-google | 51.46 | 75.54 | 61.22 | 1.2 | +| beer | 90.0 | 64.29 | 75.0 | 0.89 | +| dblp-acm | 88.58 | 90.0 | 89.29 | 0.99 | +| dblp-scholar | 81.68 | 85.6 | 83.59 | 1.1 | +| fodors-zagats | 89.47 | 77.27 | 82.93 | 0.92 | +| itunes-amazon | 50.0 | 69.23 | 58.06 | 0.66 | +| walmart-amazon | 54.9 | 87.05 | 67.33 | 0.98 | For more results on Llama models, please see [Llama.md](Llama.md) @@ -184,3 +203,31 @@ Varying the batch size when performing EM over the `abt-buy` benchmark: | 128 | 94.54 | 22.32 | 0.11 | 0.99 | | 256 | 93.43 | 32.93 | 0.10 | 0.99 | | 512 | 70.87 | 74.44 | 0.08 | 0.95 | + +### Data preparation (w/ vs. w/o schema) + +To run a single benchmark in `/classic` without schema: + +``` +python -m benchmark.run -n -p -1 --no-schema +``` + +#### Setup + +| Setup | Model | Tools | Data Preparation | +| :------ | :------- | :---- | :------------------------------------ | +| S1 | `gpt-4o` | Off | Field values only separated by spaces | +| S2 | `gpt-4o` | Off | Schema inline with entity description | + +#### Results + +| Benchmark | Precision (S1, S2) | Recall (S1, S2) | F1 (S1, S2) | +| :------------- | :----------------: | :--------------: | :-------------: | +| abt-buy | 84.0, **89.9** | 99.5, **99.5** | 91.1, **94.5** | +| amazon-google | 60.0, **67.4** | 89.7, **92.7** | 71.9, **78.1** | +| beer | 92.3, **92.3** | 85.7, **85.7** | 88.9, **88.9** | +| dblp-acm | 80.4, **94.7** | **100.0**, 99.6 | 89.1, **97.1** | +| dblp-scholar | 78.4, **88.3** | 98.8, **93.6** | 87.4, **90.9** | +| fodors-zagats | 95.7, **100.0** | 100.0, **100.0** | 97.8, **100.0** | +| itunes-amazon | 89.3, **100.0** | 92.6, **96.3** | 90.9, **98.11** | +| walmart-amazon | 75.4, **85.4** | 95.3, **91.2** | 84.2, **88.2** | diff --git a/benchmark/suite/gpt_35_turbo.py b/benchmark/suite/gpt_35_turbo.py index d10feae..8ae9b5f 100644 --- a/benchmark/suite/gpt_35_turbo.py +++ b/benchmark/suite/gpt_35_turbo.py @@ -37,14 +37,16 @@ def run(args): save(df, name) # generate markdown table + df["pair_per_$"] = df["num_pairs"] // df["cost"] df = df[["benchmark", "precision", "recall", "f1", - "cost", "throughput"]] + "cost", "pair_per_$", "throughput"]] field_names = { "benchmark": "Benchmark", "precision": "Precision", "recall": "Recall", "f1": "F1", "cost": "Cost ($)", + "pair_per_$": "Pair per $", "throughput": "Throughput (pps)", } df = df.rename(columns=field_names) diff --git a/benchmark/suite/gpt_4.py b/benchmark/suite/gpt_4.py index 458b9a9..1b0a2a5 100644 --- a/benchmark/suite/gpt_4.py +++ b/benchmark/suite/gpt_4.py @@ -37,14 +37,16 @@ def run(args): save(df, name) # generate markdown table + df["pair_per_$"] = df["num_pairs"] // df["cost"] df = df[["benchmark", "precision", "recall", "f1", - "cost", "throughput"]] + "cost", "pair_per_$", "throughput"]] field_names = { "benchmark": "Benchmark", "precision": "Precision", "recall": "Recall", "f1": "F1", "cost": "Cost ($)", + "pair_per_$": "Pair per $", "throughput": "Throughput (pps)", } df = df.rename(columns=field_names) diff --git a/benchmark/suite/gpt_4_turbo.py b/benchmark/suite/gpt_4_turbo.py index 51d7029..4de717c 100644 --- a/benchmark/suite/gpt_4_turbo.py +++ b/benchmark/suite/gpt_4_turbo.py @@ -37,14 +37,16 @@ def run(args): save(df, name) # generate markdown table + df["pair_per_$"] = df["num_pairs"] // df["cost"] df = df[["benchmark", "precision", "recall", "f1", - "cost", "throughput"]] + "cost", "pair_per_$", "throughput"]] field_names = { "benchmark": "Benchmark", "precision": "Precision", "recall": "Recall", "f1": "F1", "cost": "Cost ($)", + "pair_per_$": "Pair per $", "throughput": "Throughput (pps)", } df = df.rename(columns=field_names) diff --git a/benchmark/suite/gpt_4o.py b/benchmark/suite/gpt_4o.py index da9ac60..0297a03 100644 --- a/benchmark/suite/gpt_4o.py +++ b/benchmark/suite/gpt_4o.py @@ -37,14 +37,16 @@ def run(args): save(df, name) # generate markdown table + df["pair_per_$"] = df["num_pairs"] // df["cost"] df = df[["benchmark", "precision", "recall", "f1", - "cost", "throughput"]] + "cost", "pair_per_$", "throughput"]] field_names = { "benchmark": "Benchmark", "precision": "Precision", "recall": "Recall", "f1": "F1", "cost": "Cost ($)", + "pair_per_$": "Pair per $", "throughput": "Throughput (pps)", } df = df.rename(columns=field_names) diff --git a/benchmark/suite/gpt_4o_mini.py b/benchmark/suite/gpt_4o_mini.py index d3db055..3423e74 100644 --- a/benchmark/suite/gpt_4o_mini.py +++ b/benchmark/suite/gpt_4o_mini.py @@ -37,14 +37,16 @@ def run(args): save(df, name) # generate markdown table + df["pair_per_$"] = df["num_pairs"] // df["cost"] df = df[["benchmark", "precision", "recall", "f1", - "cost", "throughput"]] + "cost", "pair_per_$", "throughput"]] field_names = { "benchmark": "Benchmark", "precision": "Precision", "recall": "Recall", "f1": "F1", "cost": "Cost ($)", + "pair_per_$": "Pair per $", "throughput": "Throughput (pps)", } df = df.rename(columns=field_names) diff --git a/benchmark/suite/util.py b/benchmark/suite/util.py index 7ca56df..9d5cf52 100644 --- a/benchmark/suite/util.py +++ b/benchmark/suite/util.py @@ -7,11 +7,14 @@ import benchmark as bm -def run_benchmark(benchmark, args): - # create a deep copy of args to pass into benchmark +def run_benchmark(benchmark: str, args): libem.reset() + + _args = copy.deepcopy(args) + _args.name = benchmark benchmark = bm.benchmarks[benchmark] - return benchmark(copy.deepcopy(args)) + + return benchmark(_args) def report_to_dataframe(reports, key_col: str = "benchmark"): @@ -28,7 +31,8 @@ def report_to_dataframe(reports, key_col: str = "benchmark"): return pd.concat(rows) -def tabulate(df: pd.DataFrame, name): +def tabulate(df: pd.DataFrame, name: str): + name = name.replace('_', '-') output_file = os.path.join( bm.table_dir, f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}-" @@ -43,7 +47,8 @@ def plot(df: pd.DataFrame): pass -def save(df: pd.DataFrame, name): +def save(df: pd.DataFrame, name: str): + name = name.replace('_', '-') output_file = os.path.join( bm.result_dir, f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}-"