diff --git a/benchmark/src/bench_overlap_parallel.py b/benchmark/src/bench_overlap_parallel.py
new file mode 100755
index 0000000..decd9c8
--- /dev/null
+++ b/benchmark/src/bench_overlap_parallel.py
@@ -0,0 +1,208 @@
+import json
+import os
+import timeit
+
+import bioframe as bf
+import numpy as np
+import pandas as pd
+import pyranges as pr
+import pyranges1 as pr1
+from rich import print
+from rich.box import MARKDOWN
+from rich.table import Table
+
+import polars_bio as pb
+
+BENCH_DATA_ROOT = os.getenv("BENCH_DATA_ROOT")  # root directory of the parquet datasets
+
+if BENCH_DATA_ROOT is None:
+    raise ValueError("BENCH_DATA_ROOT is not set")
+
+pb.ctx.set_option("datafusion.optimizer.repartition_joins", "true")
+
+columns = ("contig", "pos_start", "pos_end")  # interval column names passed to overlap
+
+test_threads = [1, 2, 4, 8, 16]  # values set as datafusion.execution.target_partitions
+
+test_cases = [  # parquet glob pairs to benchmark; commented-out entries are disabled
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/fBrain-DS14718/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/exons/*.parquet",
+    #     "name": "1-2",
+    # },
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/exons/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/ex-anno/*.parquet",
+    #     "name": "2-7",
+    # },
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/fBrain-DS14718/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet",
+    #     "name": "1-0",
+    # },
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/ex-anno/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet",
+    #     "name": "7-0",
+    # },
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/ex-anno/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/chainOrnAna1/*.parquet",
+    #     "name": "7-3",
+    # },
+    {
+        "df_path_1": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet",
+        "df_path_2": f"{BENCH_DATA_ROOT}/ex-rna/*.parquet",
+        "name": "0-8",
+    },
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/chainVicPac2/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/ex-rna/*.parquet",
+    #     "name": "4-8",
+    # },
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/ex-anno/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/ex-rna/*.parquet",
+    #     "name": "7-8",
+    # },
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/chainOrnAna1/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet",
+    #     "name": "3-0",
+    # },
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/chainVicPac2/*.parquet",
+    #     "name": "0-4",
+    # },
+    # {
+    #     "df_path_1": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet",
+    #     "df_path_2": f"{BENCH_DATA_ROOT}/chainXenTro3Link/*.parquet",
+    #     "name": "0-5",
+    # },
+]
+
+
+# pyranges0
+def df2pr0(df):
+    return pr.PyRanges(
+        chromosomes=df.contig,
+        starts=df.pos_start,
+        ends=df.pos_end,
+    )
+
+
+# df_1_pr0 = df2pr0(df_1)
+# df_2_pr0 = df2pr0(df_2)
+
+
+### pyranges1
+def df2pr1(df):
+    return pr1.PyRanges(
+        {
+            "Chromosome": df.contig,
+            "Start": df.pos_start,
+            "End": df.pos_end,
+        }
+    )
+
+
+def polars_bio(df_path_1, df_path_2):
+    pb.overlap(df_path_1, df_path_2, col1=columns, col2=columns).collect().count()
+
+
+def pyranges0(df_1_pr0, df_2_pr0):
+    len(df_1_pr0.join(df_2_pr0))
+
+
+functions = [
+    pyranges0,
+    polars_bio,
+]
+
+
+# timeit.repeat settings used by the main loop: `num_repeats` samples,
+# each timing `num_executions` consecutive calls.
+num_repeats = 3
+num_executions = 3
+
+# Make sure the output directory for the per-test-case JSON files exists.
+os.makedirs("results", exist_ok=True)
+
+for t in test_cases:
+    results = []
+    df_1 = pd.read_parquet(t["df_path_1"].replace("*.parquet", ""), engine="pyarrow")
+    df_2 = pd.read_parquet(t["df_path_2"].replace("*.parquet", ""), engine="pyarrow")
+    df_1_pr0 = df2pr0(df_1)
+    df_2_pr0 = df2pr0(df_2)
+    df_1_pr1 = df2pr1(df_1)
+    df_2_pr1 = df2pr1(df_2)
+    for p in test_threads:
+        pb.ctx.set_option("datafusion.execution.target_partitions", str(p))
+        for func in functions:
+            times = None
+            print(f"Running {func.__name__}...")
+            if func == polars_bio:
+                times = timeit.repeat(
+                    lambda: func(t["df_path_1"], t["df_path_2"]),
+                    repeat=num_repeats,
+                    number=num_executions,
+                )
+            elif func == pyranges0 and p == 1:
+                times = timeit.repeat(
+                    lambda: func(df_1_pr0, df_2_pr0),
+                    repeat=num_repeats,
+                    number=num_executions,
+                )
+            else:
+                continue
+            per_run_times = [
+                time / num_executions for time in times
+            ]  # Convert to per-run times
+            results.append(
+                {
+                    "name": f"{func.__name__}-{p}",
+                    "min": min(per_run_times),
+                    "max": max(per_run_times),
+                    "mean": np.mean(per_run_times),
+                }
+            )
+
+        # fastest_mean = min(result["mean"] for result in results)
+        fastest_mean = results[1]["mean"]
+        for result in results:
+            result["speedup"] = fastest_mean / result["mean"]
+
+        # Create Rich table
+        table = Table(title="Benchmark Results", box=MARKDOWN)
+        table.add_column("Library", justify="left", style="cyan", no_wrap=True)
+        table.add_column("Min (s)", justify="right", style="green")
+        table.add_column("Max (s)", justify="right", style="green")
+        table.add_column("Mean (s)", justify="right", style="green")
+        table.add_column("Speedup", justify="right", style="magenta")
+
+        # Add rows to the table
+        for result in results:
+            table.add_row(
+                result["name"],
+                f"{result['min']:.6f}",
+                f"{result['max']:.6f}",
+                f"{result['mean']:.6f}",
+                f"{result['speedup']:.2f}x",
+            )
+
+    # Display the table
+    benchmark_results = {
+        "inputs": {
+            "df_1_num": len(df_1),
+            "df_2_num": len(df_2),
+        },
+        "output_num": len(
+            bf.overlap(df_1, df_2, cols1=columns, cols2=columns, how="inner")
+        ),
+        "results": results,
+    }
+    print(t["name"])
+    print(json.dumps(benchmark_results, indent=4))
+    json.dump(benchmark_results, open(f"results/{t['name']}.json", "w"))
+    print(table)
diff --git a/docs/performance.md b/docs/performance.md
index e891f3f..a3a6eb4 100644
--- a/docs/performance.md
+++ b/docs/performance.md
@@ -378,6 +378,12 @@
 
 ##### L-size (7-8), output: 307, 184,634
 
+| Library    |   Min (s) |   Max (s) |  Mean (s) | Speedup   |
+|------------|-----------|-----------|-----------|-----------|
+| bioframe   | 51.923368 | 52.840132 | 52.354141 | 0.14x     |
+| polars_bio |  6.604371 |  7.975253 |  7.151908 | **1.00x** |
+| pyranges0  | 41.702499 | 42.557826 | 42.027393 | **0.17x** |
+| pyranges1  | 73.713501 | 76.161131 | 74.770918 | 0.10x     |
 
 
 ### Google Axion
@@ -407,12 +413,74 @@
 
 ### Parallel execution and scalability
 
+#### Intel
+
+- cpu architecture: `x86_64`
+- cpu name: `INTEL(R) XEON(R) PLATINUM 8581C CPU @ 2.30GHz`
+- cpu cores: `16`
+- memory: `118 GB`
+- kernel: `#27~22.04.1-Ubuntu SMP Tue Jul 16 23:03:39 UTC 2024`
+- system: `Linux`
+- os-release: `Linux-6.5.0-1025-gcp-x86_64-with-glibc2.35`
+- python: `3.12.8`
+- polars-bio: `0.3.0`
+
+#### 0-8 (input: 2,350,965 and 9,944,559, output: 164,196,784)
+
+##### Apple Silicon
+| Library       |  Min (s) |  Max (s) | Mean (s) | Speedup   |
+|---------------|----------|----------|----------|-----------|
+| pyranges0-1   | 9.331440 | 9.399316 | 9.358115 | 0.31x     |
+| polars_bio-1  | 2.810053 | 3.163260 | 2.935647 | **1.00x** |
+| polars_bio-2  | 1.353191 | 1.422477 | 1.376621 | 2.13x     |
+| polars_bio-4  | 1.020456 | 1.029563 | 1.024929 | 2.86x     |
+| polars_bio-8  | 0.734393 | 0.738268 | 0.735762 | **3.99x** |
+
+
+
+##### Intel
+| Library       |   Min (s) |   Max (s) |  Mean (s) | Speedup   |
+|---------------|-----------|-----------|-----------|-----------|
+| pyranges0-1   | 22.856168 | 23.086879 | 22.958235 | 0.27x     |
+| polars_bio-1  |  5.935124 |  6.694116 |  6.203911 | **1.00x** |
+| polars_bio-2  |  3.763082 |  3.913454 |  3.815991 | 1.63x     |
+| polars_bio-4  |  2.331916 |  2.358274 |  2.342218 | 2.65x     |
+| polars_bio-8  |  1.317331 |  1.326317 |  1.322318 | **4.69x** |
+
+
+
+#### 2-5 (input: 438,694 and 50,980,975, output: 52,395,369)
+
+##### Apple Silicon
+| Library       |   Min (s) |   Max (s) |  Mean (s) | Speedup   |
+|---------------|-----------|-----------|-----------|-----------|
+| pyranges0-1   | 11.836572 | 12.033881 | 11.943536 | 0.41x     |
+| polars_bio-1  |  4.878542 |  4.944363 |  4.912092 | **1.00x** |
+| polars_bio-2  |  3.109014 |  3.113733 |  3.111639 | 1.58x     |
+| polars_bio-4  |  1.928374 |  1.944733 |  1.935807 | 2.54x     |
+| polars_bio-8  |  1.319147 |  1.334540 |  1.324507 | 3.71x     |
+| polars_bio-16 |  0.751453 |  0.758128 |  0.754517 | **6.51x** |
+
+
+#### 2-6 (input: 438,694 and 128,186,542, output: 116,300,901)
+
+| Library       |   Min (s) |   Max (s) |  Mean (s) | Speedup   |
+|---------------|-----------|-----------|-----------|-----------|
+| pyranges0-1   | 29.674772 | 31.891295 | 30.546541 | 0.37x     |
+| polars_bio-1  | 11.379310 | 11.423765 | 11.399042 | **1.00x** |
+| polars_bio-2  |  7.134765 |  7.209546 |  7.163538 | 1.59x     |
+| polars_bio-4  |  4.409859 |  4.462592 |  4.429911 | 2.57x     |
+| polars_bio-8  |  3.069381 |  3.080261 |  3.073801 | 3.71x     |
+| polars_bio-16 |  1.698058 |  1.736596 |  1.717683 | **6.64x** |
+
+
 ### Native, Pandas, Polars performance comparison
 
 ## How to run the benchmarks
 ```bash
 poetry env use python3.12
 poetry update
+poetry shell
 RUSTFLAGS="-Ctarget-cpu=native" maturin develop --release  -m Cargo.toml
-poetry run python benchmark/src/bench_overlap.py
+python benchmark/src/bench_overlap.py
 ```
\ No newline at end of file