diff --git a/benchmark/results/apple-silicon/0-1.json b/benchmark/results/apple-silicon/0-1.json new file mode 100644 index 0000000..d0870dd --- /dev/null +++ b/benchmark/results/apple-silicon/0-1.json @@ -0,0 +1,58 @@ +{ + "inputs": { + "df_1_num": 2350965, + "df_2_num": 198621 + }, + "output_num": 320955, + "results": [ + { + "name": "bioframe", + "min": 0.4824440833326662, + "max": 0.4864661946630804, + "mean": 0.48469012044208487, + "speedup": 0.16860836806000543 + }, + { + "name": "polars_bio", + "min": 0.09075104166792396, + "max": 0.09702943066561905, + "mean": 0.09311920377755693, + "speedup": 0.8776150021403402 + }, + { + "name": "pyranges0", + "min": 0.07799816666617214, + "max": 0.08790026399947237, + "mean": 0.0817228102225474, + "speedup": 1.0 + }, + { + "name": "pyranges1", + "min": 0.23589159733577011, + "max": 0.2378742219977236, + "mean": 0.23700583333427658, + "speedup": 0.344813497089265 + }, + { + "name": "pybedtools0", + "min": 1.4111382639966905, + "max": 1.4246090833330527, + "mean": 1.416416699110414, + "speedup": 0.05769687004811066 + }, + { + "name": "pygenomics", + "min": 1.390716375002133, + "max": 1.402293403002356, + "mean": 1.395175759334557, + "speedup": 0.05857527962034397 + }, + { + "name": "genomicranges", + "min": 4.046394958335441, + "max": 4.0558332916649915, + "mean": 4.050494356556253, + "speedup": 0.020176008908707254 + } + ] +} diff --git a/benchmark/results/apple-silicon/0-7.json b/benchmark/results/apple-silicon/0-7.json new file mode 100644 index 0000000..65f3249 --- /dev/null +++ b/benchmark/results/apple-silicon/0-7.json @@ -0,0 +1,58 @@ +{ + "inputs": { + "df_1_num": 2350965, + "df_2_num": 1194285 + }, + "output_num": 2761621, + "results": [ + { + "name": "bioframe", + "min": 0.2980392776662484, + "max": 0.30927084732684307, + "mean": 0.30290485188662486, + "speedup": 0.29821783673328733 + }, + { + "name": "polars_bio", + "min": 0.08932411100249738, + "max": 0.09219969466600257, + "mean": 0.09033162966564608, + 
"speedup": 1.0 + }, + { + "name": "pyranges0", + "min": 0.09647758333206487, + "max": 0.10345558333210647, + "mean": 0.10102296299818488, + "speedup": 0.8941692758236471 + }, + { + "name": "pyranges1", + "min": 0.19562062499850677, + "max": 0.19802473599944884, + "mean": 0.19714646755407253, + "speedup": 0.45819552734755586 + }, + { + "name": "pybedtools0", + "min": 1.0045766390006368, + "max": 1.0130972083328136, + "mean": 1.0077013518894091, + "speedup": 0.08964127069620086 + }, + { + "name": "pygenomics", + "min": 4.264574694330804, + "max": 4.275965152997135, + "mean": 4.26905469455394, + "speedup": 0.021159632782611733 + }, + { + "name": "genomicranges", + "min": 2.919674833334284, + "max": 2.9267854306672234, + "mean": 2.9235494306647323, + "speedup": 0.0308979313700563 + } + ] +} \ No newline at end of file diff --git a/benchmark/results/apple-silicon/0-8.json b/benchmark/results/apple-silicon/0-8.json new file mode 100644 index 0000000..705429b --- /dev/null +++ b/benchmark/results/apple-silicon/0-8.json @@ -0,0 +1,58 @@ +{ + "inputs": { + "df_1_num": 2350965, + "df_2_num": 9944559 + }, + "output_num": 164196784, + "results": [ + { + "name": "bioframe", + "min": 15.630508430331247, + "max": 16.719792805665445, + "mean": 16.080008865665555, + "speedup": 0.18642745012968612 + }, + { + "name": "polars_bio", + "min": 2.882899638667974, + "max": 3.1351002083344306, + "mean": 2.997755050888776, + "speedup": 1.0 + }, + { + "name": "pyranges0", + "min": 9.276095166666588, + "max": 10.15810941666617, + "mean": 9.761879657443691, + "speedup": 0.30708789250468876 + }, + { + "name": "pyranges1", + "min": 13.076820124998145, + "max": 13.510233986002277, + "mean": 13.329948374999931, + "speedup": 0.22488872173811336 + }, + { + "name": "pybedtools0", + "min": 322.9229150833368, + "max": 335.12307094466814, + "mean": 329.65914210189015, + "speedup": 0.00909349891459172 + }, + { + "name": "pygenomics", + "min": 128.84953573599827, + "max": 132.10968911099675, + "mean": 
130.08909561566543, + "speedup": 0.02304386110689345 + }, + { + "name": "genomicranges", + "min": 234.23743527799766, + "max": 239.3151572223384, + "mean": 236.504564745557, + "speedup": 0.012675252395715514 + } + ] +} diff --git a/benchmark/results/apple-silicon/1-2.json b/benchmark/results/apple-silicon/1-2.json new file mode 100644 index 0000000..deb64e3 --- /dev/null +++ b/benchmark/results/apple-silicon/1-2.json @@ -0,0 +1,58 @@ +{ + "inputs": { + "df_1_num": 198621, + "df_2_num": 438694 + }, + "output_num": 54246, + "results": [ + { + "name": "bioframe", + "min": 0.10073762500057153, + "max": 0.10154148600122426, + "mean": 0.10111909722197905, + "speedup": 0.25305574464594827 + }, + { + "name": "polars_bio", + "min": 0.032156180668001376, + "max": 0.0355005000019446, + "mean": 0.03339360188854496, + "speedup": 0.7662775800837371 + }, + { + "name": "pyranges0", + "min": 0.024099944668705575, + "max": 0.02827072200307157, + "mean": 0.025588768445433948, + "speedup": 1.0 + }, + { + "name": "pyranges1", + "min": 0.05377036133237804, + "max": 0.054647471998274945, + "mean": 0.05412140744253217, + "speedup": 0.472803085777193 + }, + { + "name": "pybedtools0", + "min": 0.2819691666663857, + "max": 0.2833851113343068, + "mean": 0.28285702322156997, + "speedup": 0.09046538125160689 + }, + { + "name": "pygenomics", + "min": 1.4249745693329412, + "max": 1.436369222336604, + "mean": 1.430531254667181, + "speedup": 0.0178875983044406 + }, + { + "name": "genomicranges", + "min": 0.9727168609970249, + "max": 0.9790132083338298, + "mean": 0.9757608008868476, + "speedup": 0.0262244275668554 + } + ] +} diff --git a/benchmark/results/apple-silicon/2-7.json b/benchmark/results/apple-silicon/2-7.json new file mode 100644 index 0000000..fc23141 --- /dev/null +++ b/benchmark/results/apple-silicon/2-7.json @@ -0,0 +1,58 @@ +{ + "inputs": { + "df_1_num": 438694, + "df_2_num": 1194285 + }, + "output_num": 273500, + "results": [ + { + "name": "bioframe", + "min": 0.2980392776662484, + 
"max": 0.30927084732684307, + "mean": 0.30290485188662486, + "speedup": 0.29821783673328733 + }, + { + "name": "polars_bio", + "min": 0.08932411100249738, + "max": 0.09219969466600257, + "mean": 0.09033162966564608, + "speedup": 1.0 + }, + { + "name": "pyranges0", + "min": 0.09647758333206487, + "max": 0.10345558333210647, + "mean": 0.10102296299818488, + "speedup": 0.8941692758236471 + }, + { + "name": "pyranges1", + "min": 0.19562062499850677, + "max": 0.19802473599944884, + "mean": 0.19714646755407253, + "speedup": 0.45819552734755586 + }, + { + "name": "pybedtools0", + "min": 1.0045766390006368, + "max": 1.0130972083328136, + "mean": 1.0077013518894091, + "speedup": 0.08964127069620086 + }, + { + "name": "pygenomics", + "min": 4.264574694330804, + "max": 4.275965152997135, + "mean": 4.26905469455394, + "speedup": 0.021159632782611733 + }, + { + "name": "genomicranges", + "min": 2.919674833334284, + "max": 2.9267854306672234, + "mean": 2.9235494306647323, + "speedup": 0.0308979313700563 + } + ] +} diff --git a/benchmark/results/apple-silicon/3-7.json b/benchmark/results/apple-silicon/3-7.json new file mode 100644 index 0000000..ca800f9 --- /dev/null +++ b/benchmark/results/apple-silicon/3-7.json @@ -0,0 +1,58 @@ +{ + "inputs": { + "df_1_num": 1956864, + "df_2_num": 1194285 + }, + "output_num": 4408383, + "results": [ + { + "name": "bioframe", + "min": 0.9651126806663038, + "max": 0.9735246386650639, + "mean": 0.9680251064419281, + "speedup": 0.21091899669168662 + }, + { + "name": "polars_bio", + "min": 0.19807397232701382, + "max": 0.21415665267462222, + "mean": 0.20417488422309463, + "speedup": 1.0 + }, + { + "name": "pyranges0", + "min": 0.299306430669579, + "max": 0.3361351666681003, + "mean": 0.3225051574441345, + "speedup": 0.6330902917683185 + }, + { + "name": "pyranges1", + "min": 1.0402721390031122, + "max": 1.0440249723324087, + "mean": 1.0427527454448864, + "speedup": 0.1958037369021591 + }, + { + "name": "pybedtools0", + "min": 8.970137541667404, + 
"max": 9.048951763669416, + "mean": 9.003954847223618, + "speedup": 0.02267613373095182 + }, + { + "name": "pygenomics", + "min": 8.626979291671887, + "max": 8.637451514000228, + "mean": 8.632298328781163, + "speedup": 0.023652436054295067 + }, + { + "name": "genomicranges", + "min": 10.57524026366688, + "max": 10.5942114443363, + "mean": 10.584906157222576, + "speedup": 0.019289248406210626 + } + ] +} diff --git a/benchmark/results/apple-silicon/4-8.json b/benchmark/results/apple-silicon/4-8.json new file mode 100644 index 0000000..49e1793 --- /dev/null +++ b/benchmark/results/apple-silicon/4-8.json @@ -0,0 +1,58 @@ +{ + "inputs": { + "df_1_num": 7684066, + "df_2_num": 9944559 + }, + "output_num": 227832153, + "results": [ + { + "name": "bioframe", + "min": 22.91120584734017, + "max": 23.118099846993573, + "mean": 23.030571874890786, + "speedup": 0.15997506012713292 + }, + { + "name": "polars_bio", + "min": 3.5413247360071787, + "max": 3.9377595280044866, + "mean": 3.68431712044791, + "speedup": 1.0 + }, + { + "name": "pyranges0", + "min": 13.035069347339837, + "max": 13.510202666666979, + "mean": 13.225004953778503, + "speedup": 0.27858720154167255 + }, + { + "name": "pyranges1", + "min": 20.924920874997042, + "max": 21.65729676366512, + "mean": 21.398281009111088, + "speedup": 0.17217818192401432 + }, + { + "name": "pybedtools0", + "min": 505.89715663900523, + "max": 521.2392763610018, + "mean": 511.3106857593359, + "speedup": 0.007205632941107839 + }, + { + "name": "pygenomics", + "min": 159.8838469166658, + "max": 160.94232931933948, + "mean": 160.30697036100253, + "speedup": 0.022982887844184376 + }, + { + "name": "genomicranges", + "min": 322.217280152671, + "max": 322.49039133332553, + "mean": 322.3716621758876, + "speedup": 0.011428787181789347 + } + ] +} diff --git a/benchmark/results/apple-silicon/7-8.json b/benchmark/results/apple-silicon/7-8.json new file mode 100644 index 0000000..3a86879 --- /dev/null +++ b/benchmark/results/apple-silicon/7-8.json @@ 
-0,0 +1,107 @@ +{ + "inputs": { + "df_1_num": 1194285, + "df_2_num": 9944559 + }, + "output_num": 307184634, + "results": [ + { + "name": "bioframe", + "min": 22.91120584734017, + "max": 23.118099846993573, + "mean": 23.030571874890786, + "speedup": 0.15150635688648495 + }, + { + "name": "polars_bio", + "min": 3.5413247360071787, + "max": 3.9377595280044866, + "mean": 3.68431712044791, + "speedup": 0.9470623531323078 + }, + { + "name": "pyranges0", + "min": 13.035069347339837, + "max": 13.510202666666979, + "mean": 13.225004953778503, + "speedup": 0.2638394506446009 + }, + { + "name": "pyranges1", + "min": 20.924920874997042, + "max": 21.65729676366512, + "mean": 21.398281009111088, + "speedup": 0.1630634741309996 + }, + { + "name": "pybedtools0", + "min": 505.89715663900523, + "max": 521.2392763610018, + "mean": 511.3106857593359, + "speedup": 0.006824183689013262 + }, + { + "name": "pygenomics", + "min": 159.8838469166658, + "max": 160.94232931933948, + "mean": 160.30697036100253, + "speedup": 0.021766227843489167 + }, + { + "name": "genomicranges", + "min": 322.217280152671, + "max": 322.49039133332553, + "mean": 322.3716621758876, + "speedup": 0.010823774081833775 + }, + { + "name": "bioframe", + "min": 29.128663666672463, + "max": 29.9931820416629, + "mean": 29.518215175889463, + "speedup": 0.11820762268265783 + }, + { + "name": "polars_bio", + "min": 3.26043755565964, + "max": 3.8972603473327276, + "mean": 3.489278041777046, + "speedup": 1.0 + }, + { + "name": "pyranges0", + "min": 16.615282875000656, + "max": 16.983202430671856, + "mean": 16.753369328779325, + "speedup": 0.20827321199104012 + }, + { + "name": "pyranges1", + "min": 44.154733083337, + "max": 44.4963572776663, + "mean": 44.37964724533311, + "speedup": 0.07862338387883361 + }, + { + "name": "pybedtools0", + "min": 555.4805318053308, + "max": 559.9474212220035, + "mean": 556.9867719350021, + "speedup": 0.006264561777033062 + }, + { + "name": "pygenomics", + "min": 156.7244200556694, + "max": 
157.32151397200263, + "mean": 156.9354242082239, + "speedup": 0.022233845923450832 + }, + { + "name": "genomicranges", + "min": 416.0955732223326, + "max": 417.2842364446648, + "mean": 416.6999997594459, + "speedup": 0.008373597417305853 + } + ] +} diff --git a/benchmark/src/bench_overlap.py b/benchmark/src/bench_overlap.py index 0ffba54..0bd6909 100755 --- a/benchmark/src/bench_overlap.py +++ b/benchmark/src/bench_overlap.py @@ -1,10 +1,17 @@ +import itertools +import json import timeit import bioframe as bf import numpy as np import pandas as pd +import pybedtools import pyranges as pr +import pyranges1 as pr1 +from genomicranges import GenomicRanges +from pygenomics.interval import GenomicBase from rich import print +from rich.box import MARKDOWN from rich.table import Table import polars_bio as pb @@ -13,11 +20,13 @@ # BENCH_DATA_ROOT = os.getenv("BENCH_DATA_ROOT") # polars_bio +# 0-1 # df_path_1 = f"{BENCH_DATA_ROOT}/chainRn4/*.parquet" # df_path_2 = f"{BENCH_DATA_ROOT}/fBrain-DS14718/*.parquet" -df_path_1 = f"{BENCH_DATA_ROOT}/chainRn4/*.parquet" -df_path_2 = f"{BENCH_DATA_ROOT}/ex-rna/*.parquet" +# df_path_1 = f"{BENCH_DATA_ROOT}/chainRn4/*.parquet" +# df_path_2 = f"{BENCH_DATA_ROOT}/ex-rna/*.parquet" +# 7-8 # df_path_1 = f"{BENCH_DATA_ROOT}/ex-anno/*.parquet" # df_path_2 = f"{BENCH_DATA_ROOT}/ex-rna/*.parquet" pb.ctx.set_option("datafusion.optimizer.repartition_joins", "false") @@ -25,13 +34,72 @@ columns = ("contig", "pos_start", "pos_end") -# bioframe -df_1 = pd.read_parquet(df_path_1.replace("*.parquet", ""), engine="pyarrow") -df_2 = pd.read_parquet(df_path_2.replace("*.parquet", ""), engine="pyarrow") +# df read +# df_1 = pd.read_parquet(df_path_1.replace("*.parquet", ""), engine="pyarrow") +# df_2 = pd.read_parquet(df_path_2.replace("*.parquet", ""), engine="pyarrow") + + +test_cases = [ + # { + # "df_path_1": f"{BENCH_DATA_ROOT}/fBrain-DS14718/*.parquet", + # "df_path_2": f"{BENCH_DATA_ROOT}/exons/*.parquet", + # "name": "1-2", + # }, + # { + # 
"df_path_1": f"{BENCH_DATA_ROOT}/exons/*.parquet", + # "df_path_2": f"{BENCH_DATA_ROOT}/ex-anno/*.parquet", + # "name": "2-7", + # }, + # { + # "df_path_1": f"{BENCH_DATA_ROOT}/fBrain-DS14718/*.parquet", + # "df_path_2": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet", + # "name": "1-0", + # }, + # { + # "df_path_1": f"{BENCH_DATA_ROOT}/ex-anno/*.parquet", + # "df_path_2": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet", + # "name": "7-0", + # }, + # { + # "df_path_1": f"{BENCH_DATA_ROOT}/ex-anno/*.parquet", + # "df_path_2": f"{BENCH_DATA_ROOT}/chainOrnAna1/*.parquet", + # "name": "7-3", + # }, + # { + # "df_path_1": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet", + # "df_path_2": f"{BENCH_DATA_ROOT}/ex-rna/*.parquet", + # "name": "0-8", + # }, + # { + # "df_path_1": f"{BENCH_DATA_ROOT}/chainVicPac2/*.parquet", + # "df_path_2": f"{BENCH_DATA_ROOT}/ex-rna/*.parquet", + # "name": "4-8", + # }, + # { + # "df_path_1": f"{BENCH_DATA_ROOT}/ex-anno/*.parquet", + # "df_path_2": f"{BENCH_DATA_ROOT}/ex-rna/*.parquet", + # "name": "7-8", + # }, + { + "df_path_1": f"{BENCH_DATA_ROOT}/chainOrnAna1/*.parquet", + "df_path_2": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet", + "name": "3-0", + }, + { + "df_path_1": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet", + "df_path_2": f"{BENCH_DATA_ROOT}/chainVicPac2/*.parquet", + "name": "0-4", + }, + { + "df_path_1": f"{BENCH_DATA_ROOT}/chainRn4/*.parquet", + "df_path_2": f"{BENCH_DATA_ROOT}/chainXenTro3Link/*.parquet", + "name": "0-5", + }, +] -# pyranges -def df2pr(df): +# pyranges0 +def df2pr0(df): return pr.PyRanges( chromosomes=df.contig, starts=df.pos_start, @@ -39,63 +107,200 @@ def df2pr(df): ) -df_1_pr = df2pr(df_1) -df_2_pr = df2pr(df_2) +# df_1_pr0 = df2pr0(df_1) +# df_2_pr0 = df2pr0(df_2) + + +### pyranges1 +def df2pr1(df): + return pr1.PyRanges( + { + "Chromosome": df.contig, + "Start": df.pos_start, + "End": df.pos_end, + } + ) + + +# df_1_pr1 = df2pr1(df_1) +# df_2_pr1 = df2pr1(df_2) -def bioframe(): - bf.overlap(df_1, df_2, cols1=columns, cols2=columns, 
how="inner").count() +def bioframe(df_1, df_2): + len(bf.overlap(df_1, df_2, cols1=columns, cols2=columns, how="inner")) -def polars_bio(): +def polars_bio(df_path_1, df_path_2): pb.overlap(df_path_1, df_path_2, col1=columns, col2=columns).collect().count() -def pyranges(): - df_1_pr.join(df_2_pr) +def pyranges0(df_1_pr0, df_2_pr0): + len(df_1_pr0.join(df_2_pr0)) -functions = [bioframe, polars_bio, pyranges] +def pyranges1(df_1_pr1, df_2_pr1): + len(df_1_pr1.overlap(df_2_pr1)) -num_repeats = 3 -num_executions = 5 -results = [] -for func in functions: - times = timeit.repeat(func, repeat=num_repeats, number=num_executions) - per_run_times = [ - time / num_executions for time in times - ] # Convert to per-run times - results.append( - { - "name": func.__name__, - "min": min(per_run_times), - "max": max(per_run_times), - "mean": np.mean(per_run_times), - } +def pybedtools0(df_1_bed, df_2_bed): + len(df_1_bed.intersect(df_2_bed)) + + +def pygenomics(df_1_pg, df_2_array): + len( + list( + itertools.chain.from_iterable( + [df_1_pg.find_all((r[0], r[1], r[2])) for r in df_2_array] + ) + ) ) -fastest_mean = min(result["mean"] for result in results) -for result in results: - result["speedup"] = fastest_mean / result["mean"] - -# Create Rich table -table = Table(title="Benchmark Results") -table.add_column("Function", justify="left", style="cyan", no_wrap=True) -table.add_column("Min (s)", justify="right", style="green") -table.add_column("Max (s)", justify="right", style="green") -table.add_column("Mean (s)", justify="right", style="green") -table.add_column("Speedup", justify="right", style="magenta") - -# Add rows to the table -for result in results: - table.add_row( - result["name"], - f"{result['min']:.6f}", - f"{result['max']:.6f}", - f"{result['mean']:.6f}", - f"{result['speedup']:.2f}x", + +def genomicranges(df_1, df_2): + len(df_1.find_overlaps(df_2, ignore_strand=True, query_type="any")) + + +functions = [ + bioframe, + polars_bio, + pyranges0, + pyranges1, + 
pybedtools0, + pygenomics, + genomicranges, +] + + +# functions = [ +# bioframe, +# polars_bio, +# pyranges0, +# pyranges1, +# pybedtools0, +# pygenomics, +# genomicranges, +# ] + +num_repeats = 2 +num_executions = 3 + + +for t in test_cases: + results = [] + df_1 = pd.read_parquet(t["df_path_1"].replace("*.parquet", ""), engine="pyarrow") + df_2 = pd.read_parquet(t["df_path_2"].replace("*.parquet", ""), engine="pyarrow") + df_1_pr0 = df2pr0(df_1) + df_2_pr0 = df2pr0(df_2) + df_1_pr1 = df2pr1(df_1) + df_2_pr1 = df2pr1(df_2) + df_0_bed = pybedtools.BedTool.from_dataframe(df_1) + df_1_bed = pybedtools.BedTool.from_dataframe(df_2) + df_1_pg = GenomicBase( + [(r.contig, r.pos_start, r.pos_end) for r in df_1.itertuples()] ) + df_2_array = df_2.values.tolist() + + df_0_gr = GenomicRanges.from_pandas( + df_1.rename( + columns={"contig": "seqnames", "pos_start": "starts", "pos_end": "ends"} + ) + ) + df_1_gr = GenomicRanges.from_pandas( + df_2.rename( + columns={"contig": "seqnames", "pos_start": "starts", "pos_end": "ends"} + ) + ) + + for func in functions: + times = None + print(f"Running {func.__name__}...") + if func == bioframe: + times = timeit.repeat( + lambda: func(df_1, df_2), repeat=num_repeats, number=num_executions + ) + elif func == polars_bio: + times = timeit.repeat( + lambda: func(t["df_path_1"], t["df_path_2"]), + repeat=num_repeats, + number=num_executions, + ) + elif func == pyranges0: + times = timeit.repeat( + lambda: func(df_1_pr0, df_2_pr0), + repeat=num_repeats, + number=num_executions, + ) + elif func == pyranges1: + times = timeit.repeat( + lambda: func(df_1_pr1, df_2_pr1), + repeat=num_repeats, + number=num_executions, + ) + elif func == pybedtools0: + times = timeit.repeat( + lambda: func(df_0_bed, df_1_bed), + repeat=num_repeats, + number=num_executions, + ) + elif func == pygenomics: + times = timeit.repeat( + lambda: func(df_1_pg, df_2_array), + repeat=num_repeats, + number=num_executions, + ) + elif func == genomicranges: + times = 
timeit.repeat( + lambda: func(df_0_gr, df_1_gr), + repeat=num_repeats, + number=num_executions, + ) + + per_run_times = [ + time / num_executions for time in times + ] # Convert to per-run times + results.append( + { + "name": func.__name__, + "min": min(per_run_times), + "max": max(per_run_times), + "mean": np.mean(per_run_times), + } + ) + + fastest_mean = min(result["mean"] for result in results) + for result in results: + result["speedup"] = fastest_mean / result["mean"] + + # Create Rich table + table = Table(title="Benchmark Results", box=MARKDOWN) + table.add_column("Library", justify="left", style="cyan", no_wrap=True) + table.add_column("Min (s)", justify="right", style="green") + table.add_column("Max (s)", justify="right", style="green") + table.add_column("Mean (s)", justify="right", style="green") + table.add_column("Speedup", justify="right", style="magenta") + + # Add rows to the table + for result in results: + table.add_row( + result["name"], + f"{result['min']:.6f}", + f"{result['max']:.6f}", + f"{result['mean']:.6f}", + f"{result['speedup']:.2f}x", + ) -# Display the table -print(table) + # Display the table + benchmark_results = { + "inputs": { + "df_1_num": len(df_1), + "df_2_num": len(df_2), + }, + "output_num": len( + bf.overlap(df_1, df_2, cols1=columns, cols2=columns, how="inner") + ), + "results": results, + } + print(t["name"]) + print(json.dumps(benchmark_results, indent=4)) + json.dump(benchmark_results, open(f"results/{t['name']}.json", "w")) + print(table) diff --git a/benchmark/src/results/1-0.json b/benchmark/src/results/1-0.json new file mode 100644 index 0000000..2c01ef4 --- /dev/null +++ b/benchmark/src/results/1-0.json @@ -0,0 +1 @@ +{"inputs": {"df_1_num": 198621, "df_2_num": 2350965}, "output_num": 320955, "results": [{"name": "bioframe", "min": 0.5210934306669515, "max": 0.5496743470042323, "mean": 0.5340839861116061, "speedup": 0.27565411588012717}, {"name": "polars_bio", "min": 0.13541077766179418, "max": 
0.1685700970022784, "mean": 0.14722244899732892, "speedup": 1.0}, {"name": "pyranges0", "min": 0.27153947233455256, "max": 0.2820805696634731, "mean": 0.27629844455643454, "speedup": 0.532838500895934}, {"name": "pyranges1", "min": 0.4189718473353423, "max": 0.426372694336654, "mean": 0.42206015744401765, "speedup": 0.3488186373452145}, {"name": "pybedtools0", "min": 1.25882765299563, "max": 1.269215235991093, "mean": 1.2646743379947214, "speedup": 0.11641135158223113}, {"name": "pygenomics", "min": 7.877380986339024, "max": 7.9085313053316595, "mean": 7.89410838888984, "speedup": 0.01864966146202523}, {"name": "genomicranges", "min": 4.222081680665724, "max": 4.266591763996985, "mean": 4.244864842663649, "speedup": 0.03468248211760424}]} \ No newline at end of file diff --git a/benchmark/src/results/1-2.json b/benchmark/src/results/1-2.json new file mode 100644 index 0000000..dbf2c04 --- /dev/null +++ b/benchmark/src/results/1-2.json @@ -0,0 +1 @@ +{"inputs": {"df_1_num": 198621, "df_2_num": 438694}, "output_num": 54246, "results": [{"name": "bioframe", "min": 0.10484341666839707, "max": 0.11148261100364228, "mean": 0.1072822545570994, "speedup": 0.24727368834339988}, {"name": "polars_bio", "min": 0.0334676806620943, "max": 0.035909083332323156, "mean": 0.034654666666433, "speedup": 0.7654980217664296}, {"name": "pyranges0", "min": 0.025483972334768623, "max": 0.02856297233180764, "mean": 0.026528078778129488, "speedup": 1.0}, {"name": "pyranges1", "min": 0.05592556967167184, "max": 0.05716152800596319, "mean": 0.05647209744590024, "speedup": 0.4697555071961538}, {"name": "pybedtools0", "min": 0.29112999999779277, "max": 0.29761261099095765, "mean": 0.2935548795503564, "speedup": 0.09036837956419955}, {"name": "pygenomics", "min": 1.4335990416584536, "max": 1.438006500005334, "mean": 1.4357903611089569, "speedup": 0.01847628978205431}, {"name": "genomicranges", "min": 0.9786961113277357, "max": 0.9859378749970347, "mean": 0.981327458330068, "speedup": 
0.027032850811361694}]} \ No newline at end of file diff --git a/benchmark/src/results/2-7.json b/benchmark/src/results/2-7.json new file mode 100644 index 0000000..4d089da --- /dev/null +++ b/benchmark/src/results/2-7.json @@ -0,0 +1 @@ +{"inputs": {"df_1_num": 438694, "df_2_num": 1194285}, "output_num": 273500, "results": [{"name": "bioframe", "min": 0.2993143889956021, "max": 0.30426263899425976, "mean": 0.3012149584408487, "speedup": 0.2997951766306785}, {"name": "polars_bio", "min": 0.08901551400776953, "max": 0.0912378056673333, "mean": 0.09030279166957673, "speedup": 1.0}, {"name": "pyranges0", "min": 0.10240074999940892, "max": 0.10710211100134377, "mean": 0.10427152777750355, "speedup": 0.8660349914721346}, {"name": "pyranges1", "min": 0.19792461100344858, "max": 0.2004592779946203, "mean": 0.1990527963336919, "speedup": 0.4536625123225761}, {"name": "pybedtools0", "min": 1.0202673056628555, "max": 1.0225228890000533, "mean": 1.021485282441265, "speedup": 0.08840341923846476}, {"name": "pygenomics", "min": 4.2786070556612685, "max": 4.289572486003938, "mean": 4.283311361110665, "speedup": 0.021082471960703123}, {"name": "genomicranges", "min": 2.899962486330575, "max": 2.947669222330054, "mean": 2.9298967315537285, "speedup": 0.03082115171400223}]} \ No newline at end of file diff --git a/benchmark/src/results/7-0.json b/benchmark/src/results/7-0.json new file mode 100644 index 0000000..07b6f7e --- /dev/null +++ b/benchmark/src/results/7-0.json @@ -0,0 +1 @@ +{"inputs": {"df_1_num": 1194285, "df_2_num": 2350965}, "output_num": 2761621, "results": [{"name": "bioframe", "min": 0.9351406250013193, "max": 0.9907102916719547, "mean": 0.9703449953360379, "speedup": 0.22289779927662567}, {"name": "polars_bio", "min": 0.21388038899749517, "max": 0.22015119467202263, "mean": 0.21628776399949046, "speedup": 1.0}, {"name": "pyranges0", "min": 0.40863704166258685, "max": 0.4342615970041758, "mean": 0.4223802823334053, "speedup": 0.5120687992456144}, {"name": 
"pyranges1", "min": 1.1300036109945115, "max": 1.1627242360070038, "mean": 1.1472921341127302, "speedup": 0.18852021866842028}, {"name": "pybedtools0", "min": 6.415976124999967, "max": 6.467303972002507, "mean": 6.444268189778086, "speedup": 0.033562812352000905}, {"name": "pygenomics", "min": 9.588232111331308, "max": 9.70503543067025, "mean": 9.65336835666676, "speedup": 0.02240541912503721}, {"name": "genomicranges", "min": 9.017885528000383, "max": 9.058916444337228, "mean": 9.033964495333043, "speedup": 0.023941622098606317}]} \ No newline at end of file diff --git a/benchmark/src/results/7-3.json b/benchmark/src/results/7-3.json new file mode 100644 index 0000000..635227c --- /dev/null +++ b/benchmark/src/results/7-3.json @@ -0,0 +1 @@ +{"inputs": {"df_1_num": 1194285, "df_2_num": 1956864}, "output_num": 4408383, "results": [{"name": "bioframe", "min": 0.9547651946680465, "max": 0.9693065000077089, "mean": 0.9597036760015828, "speedup": 0.21155808236442464}, {"name": "polars_bio", "min": 0.1986074443363274, "max": 0.20890630533297858, "mean": 0.20303306933298396, "speedup": 1.0}, {"name": "pyranges0", "min": 0.42527733333796885, "max": 0.43052738900102366, "mean": 0.428593564891748, "speedup": 0.47371936016880006}, {"name": "pyranges1", "min": 1.1906214443345864, "max": 1.199869138993866, "mean": 1.1945928147761151, "speedup": 0.16996006239250272}, {"name": "pybedtools0", "min": 9.403818319668062, "max": 9.491574458331646, "mean": 9.45340162499795, "speedup": 0.02147724992410105}, {"name": "pygenomics", "min": 8.638967846995607, "max": 8.662197041664816, "mean": 8.647763953667083, "speedup": 0.02347810028358693}, {"name": "genomicranges", "min": 10.514233041671105, "max": 10.556003902999995, "mean": 10.540376555669354, "speedup": 0.019262411381667243}]} \ No newline at end of file diff --git a/docs/performance.md b/docs/performance.md index 219ee51..e891f3f 100644 --- a/docs/performance.md +++ b/docs/performance.md @@ -1,83 +1,418 @@ -## Test environment - 
-```python exec="on" session="benchmark" -import os -import platform -from textwrap import dedent -import polars_bio -import cpuinfo -import psutil -import numpy as np -BENCH_DATA_ROOT = os.getenv("BENCH_DATA_ROOT") -BENCH_SRC_ROOT = os.getenv("BENCH_SRC_ROOT") -OUTPUT_MD = "test.md" - -print( - dedent( - f""" - - cpu architecture: `{platform.machine()}` - - cpu name: `{cpuinfo.get_cpu_info()['brand_raw']}` - - cpu cores: `{psutil.cpu_count(logical=False)}` - - memory: `{int(np.round(psutil.virtual_memory().total / (1024. **3)))} GB` - - kernel: `{platform.version()}` - - system: `{platform.system()}` - - os-release: `{platform.platform()}` - - python: `{platform.python_version()}` - - polars-bio: `{polars_bio.__version__}` - """ - ) -) -``` - -### Overlap operation -```python exec="on" session="benchmark" -import os -import subprocess - -subprocess.run(["hyperfine", "python benchmark/src/overlap/test-polars-bio.py", "--export-markdown", OUTPUT_MD, "-u", "millisecond", "--show-output"]) -markdown = open("test.md").read() -print(markdown) -os.remove(OUTPUT_MD) -``` - - -## Benchmarking -polars-bio significantly outperforms other libraries in terms of speed and memory usage. -It was benchmarked against following libraries: +## Test datasets +[AIList](https://github.com/databio/AIList) dataset was used for benchmarking. + +|Dataset# | Name |size(x1000) |non-flatness | +|:---------|:----------------|:-----------|:------------| +|0 | chainRn4 |2,351 |6 | +|1 | fBrain |199 |1 | +|2 | exons |439 |2 | +|3 | chainOrnAna1 |1,957 |6 | +|4 | chainVicPac2 |7,684 |8 | +|5 | chainXenTro3Link|50,981 |7 | +|6 | chainMonDom5Link|128,187 |7 | +|7 | ex-anno |1,194 |2 | +|8 | ex-rna |9,945 |7 | + +!!! 
note + Test dataset in *Parquet* format can be downloaded from: + + * for [single-threaded](https://drive.google.com/file/d/1lctmude31mSAh9fWjI60K1bDrbeDPGfm/view?usp=sharing) tests + * for [parallel](https://drive.google.com/file/d/1Sj7nTB5gCUq9nbeQOg4zzS4tKO37M5Nd/view?usp=sharing) tests (8 partitions per dataset) + +## Test libraries - [Bioframe](https://github.com/open2c/bioframe)-0.7.2 -- [PyRanges](https://github.com/pyranges/pyranges)-0.0.132 +- [PyRanges0](https://github.com/pyranges/pyranges)-0.0.132 +- [PyRanges1](https://github.com/pyranges/pyranges_1.x)-master - [pybedtools](https://github.com/daler/pybedtools)-0.10.0 - [PyGenomics](https://gitlab.com/gtamazian/pygenomics)-0.1.1 -- [GenomicRanges](https://github.com/BiocPy/GenomicRanges)-0.4.34 +- [GenomicRanges](https://github.com/BiocPy/GenomicRanges)-0.5.0 -!!! todo - 1. Add more details about the benchmarking process. - 2. GenomicRanges +## Binary operations -### Test datasets -[AIList](https://github.com/databio/AIList) dataset was used for benchmarking. +### Apple Silicon -|Dataset# |Name(.bed) |size(x1000) |non-flatness | -|:---------|:-----------------|:-----------|:------------| -|0 |chainRn4 |2,351 |6 | -|1 |fBrain |199 |1 | -|2 |exons |439 |2 | -|3 |chainOrnAna1 |1,957 |6 | -|4 |chainVicPac2 |7,684 |8 | -|5 |chainXenTro3Link |50,981 |7 | -|6 |chainMonDom5Link |128,187 |7 | -|7 |ex-anno |1,194 |2 | -|8 |ex-rna |9,945 |7 | +- cpu architecture: `arm64` +- cpu name: `Apple M3 Max` +- cpu cores: `16` +- memory: `64 GB` +- kernel: `Darwin Kernel Version 24.2.0: Fri Dec 6 19:02:12 PST 2024; root:xnu-11215.61.5~2/RELEASE_ARM64_T6031` +- system: `Darwin` +- os-release: `macOS-15.2-arm64-arm-64bit` +- python: `3.12.4` +- polars-bio: `0.3.0` -!!! 
note - Test dataset in *Parquet* format can be downloaded from: +#### Overlap operation +#### S-size, output < 1,000,000 +##### S-size (1-2) - output: 54,246 - * for [single-threaded](https://drive.google.com/file/d/1lctmude31mSAh9fWjI60K1bDrbeDPGfm/view?usp=sharing) tests - * for [parallel](https://drive.google.com/file/d/1Sj7nTB5gCUq9nbeQOg4zzS4tKO37M5Nd/view?usp=sharing) tests (8 partitions per dataset) -## Results -### Overlap operation -![results-overlap-0.1.1.png](assets/results-overlap-0.1.1.png) +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|----------|----------|----------|-----------| +| bioframe | 0.100738 | 0.101541 | 0.101119 | 0.25x | +| polars_bio | 0.032156 | 0.035501 | 0.033394 | **0.77x** | +| pyranges0 | 0.024100 | 0.028271 | 0.025589 | **1.00x** | +| pyranges1 | 0.053770 | 0.054647 | 0.054121 | 0.47x | +| pybedtools0 | 0.281969 | 0.283385 | 0.282857 | 0.09x | +| pygenomics | 1.424975 | 1.436369 | 1.430531 | 0.02x | +| genomicranges | 0.972717 | 0.979013 | 0.975761 | 0.03x | + +##### S-size (2-7), output: 273,500 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|----------|----------|----------|-----------| +| bioframe | 0.298039 | 0.309271 | 0.302905 | 0.30x | +| polars_bio | 0.089324 | 0.092200 | 0.090332 | **1.00x** | +| pyranges0 | 0.096478 | 0.103456 | 0.101023 | **0.89x** | +| pyranges1 | 0.195621 | 0.198025 | 0.197146 | 0.46x | +| pybedtools0 | 1.004577 | 1.013097 | 1.007701 | 0.09x | +| pygenomics | 4.264575 | 4.275965 | 4.269055 | 0.02x | +| genomicranges | 2.919675 | 2.926785 | 2.923549 | 0.03x | + + +##### S-size (1-0) - output: 320,955 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|----------|----------|----------|-----------| +| bioframe | 0.521093 | 0.549674 | 0.534084 | 0.28x | +| polars_bio | 0.135411 | 0.168570 | 0.147222 | **1.00x** | +| pyranges0 | 0.271539 | 0.282081 | 0.276298 | **0.53x** | +| pyranges1 | 0.418972 | 0.426373 | 0.422060 | 0.35x | +| 
pybedtools0 | 1.258828 | 1.269215 | 1.264674 | 0.12x | +| pygenomics | 7.877381 | 7.908531 | 7.894108 | 0.02x | +| genomicranges | 4.222082 | 4.266592 | 4.244865 | 0.03x | + + + + + +#### M-size, 1,000,000 < output < 100,000,000 + + +##### M-size (7-0), output: 2,761,621 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|----------|----------|----------|-----------| +| bioframe | 0.935141 | 0.990710 | 0.970345 | 0.22x | +| polars_bio | 0.213880 | 0.220151 | 0.216288 | **1.00x** | +| pyranges0 | 0.408637 | 0.434262 | 0.422380 | **0.51x** | +| pyranges1 | 1.130004 | 1.162724 | 1.147292 | 0.19x | +| pybedtools0 | 6.415976 | 6.467304 | 6.444268 | 0.03x | +| pygenomics | 9.588232 | 9.705035 | 9.653368 | 0.02x | +| genomicranges | 9.017886 | 9.058916 | 9.033964 | 0.02x | + + + +##### M-size (7-3), output: 4,408,383 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|-----------|-----------|-----------|-----------| +| bioframe | 0.954765 | 0.969307 | 0.959704 | 0.21x | +| polars_bio | 0.198607 | 0.208906 | 0.203033 | **1.00x** | +| pyranges0 | 0.425277 | 0.430527 | 0.428594 | **0.47x** | +| pyranges1 | 1.190621 | 1.199869 | 1.194593 | 0.17x | +| pybedtools0 | 9.403818 | 9.491574 | 9.453402 | 0.02x | +| pygenomics | 8.638968 | 8.662197 | 8.647764 | 0.02x | +| genomicranges | 10.514233 | 10.556004 | 10.540377 | 0.02x | + + +##### L-size (0-8), output: 164,196,784 +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|------------|------------|------------|-----------| +| bioframe | 15.630508 | 16.719793 | 16.080009 | 0.19x | +| polars_bio | 2.882900 | 3.135100 | 2.997755 | **1.00x** | +| pyranges0 | 9.276095 | 10.158109 | 9.761880 | **0.31x** | +| pyranges1 | 13.076820 | 13.510234 | 13.329948 | 0.22x | +| pybedtools0 | 322.922915 | 335.123071 | 329.659142 | 0.01x | +| pygenomics | 128.849536 | 132.109689 | 130.089096 | 0.02x | +| genomicranges | 234.237435 | 239.315157 | 236.504565 | 0.01x | + + +##### L-size 
(4-8), output: 227,832,153 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|------------|------------|------------|-----------| +| bioframe | 22.911206 | 23.118100 | 23.030572 | 0.16x | +| polars_bio | 3.541325 | 3.937760 | 3.684317 | **1.00x** | +| pyranges0 | 13.035069 | 13.510203 | 13.225005 | **0.28x** | +| pyranges1 | 20.924921 | 21.657297 | 21.398281 | 0.17x | +| pybedtools0 | 505.897157 | 521.239276 | 511.310686 | 0.01x | +| pygenomics | 159.883847 | 160.942329 | 160.306970 | 0.02x | +| genomicranges | 322.217280 | 322.490391 | 322.371662 | 0.01x | + + +##### L-size (7-8), output: 307,184,634 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|------------|------------|------------|-----------| +| bioframe | 29.128664 | 29.993182 | 29.518215 | 0.12x | +| polars_bio | 3.260438 | 3.897260 | 3.489278 | **1.00x** | +| pyranges0 | 16.615283 | 16.983202 | 16.753369 | **0.21x** | +| pyranges1 | 44.154733 | 44.496357 | 44.379647 | 0.08x | +| pybedtools0 | 555.480532 | 559.947421 | 556.986772 | 0.01x | +| pygenomics | 156.724420 | 157.321514 | 156.935424 | 0.02x | +| genomicranges | 416.095573 | 417.284236 | 416.700000 | 0.01x | + + +#### XL-size, output > 1,000,000,000 + +##### XL-size (3-0), output: 1,086,692,495 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|------------|------------|------------|------------|-----------| +| bioframe | 124.244987 | 126.569689 | 125.435831 | 0.12x | +| polars_bio | 12.650240 | 15.858913 | 14.776997 | **1.00x** | +| pyranges0 | 85.652054 | 94.383934 | 88.712706 | **0.17x** | +| pyranges1 | 92.802026 | 94.400313 | 93.447716 | 0.16x | + + +##### XL-size (0-4), output: xxx + +##### XL-size (0-5), output: xxx + +### AMD +- cpu architecture: `x86_64` +- cpu name: `AMD EPYC 9B14` +- cpu cores: `4` +- memory: `63 GB` +- kernel: `#22~22.04.1-Ubuntu SMP Mon Dec 9 20:42:57 UTC 2024` +- system: `Linux` +- os-release: `Linux-6.8.0-1020-gcp-x86_64-with-glibc2.35` +- python: `3.12.8` +- 
polars-bio: `0.3.0` + +#### Overlap operation +#### S-size, output < 1,000,000 +##### S-size (1-2) - output: 54,246 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|----------|----------|----------|-----------| +| bioframe | 0.094509 | 0.095311 | 0.094797 | 0.61x | +| polars_bio | 0.058527 | 0.066444 | 0.061503 | **0.95x** | +| pyranges0 | 0.057583 | 0.059461 | 0.058245 | **1.00x** | +| pyranges1 | 0.098868 | 0.107992 | 0.101964 | 0.57x | +| pybedtools0 | 0.382701 | 0.384930 | 0.383619 | 0.15x | +| pygenomics | 2.335400 | 2.340616 | 2.338876 | 0.02x | +| genomicranges | 1.648289 | 1.663941 | 1.657652 | 0.04x | + + +##### S-size (2-7), output: 273,500 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|----------|----------|----------|-----------| +| bioframe | 0.273727 | 0.275239 | 0.274383 | 0.60x | +| polars_bio | 0.161882 | 0.164253 | 0.163334 | **1.00x** | +| pyranges0 | 0.169721 | 0.171931 | 0.170678 | **0.96x** | +| pyranges1 | 0.304432 | 0.323747 | 0.311284 | 0.52x | +| pybedtools0 | 1.477541 | 1.478301 | 1.477841 | 0.11x | +| pygenomics | 6.929725 | 6.932875 | 6.931662 | 0.02x | +| genomicranges | 5.096514 | 5.105638 | 5.100280 | 0.03x | + +##### S-size (1-0) - output: 320,955 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|-----------|-----------|-----------|-----------| +| bioframe | 0.457869 | 0.460473 | 0.459397 | 0.55x | +| polars_bio | 0.251083 | 0.252582 | 0.251673 | **1.00x** | +| pyranges0 | 0.365083 | 0.376212 | 0.369148 | **0.68x** | +| pyranges1 | 0.593858 | 0.605304 | 0.600537 | 0.42x | +| pybedtools0 | 1.834958 | 1.858740 | 1.844379 | 0.14x | +| pygenomics | 12.730241 | 12.771149 | 12.756920 | 0.02x | +| genomicranges | 7.090998 | 7.121029 | 7.107298 | 0.04x | + + +##### M-size (7-0), output: 2,761,621 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|-----------|-----------|-----------|-----------| +| bioframe | 0.873343 | 0.875288 | 0.874457 | 
0.50x | +| polars_bio | 0.420260 | 0.450565 | 0.433827 | **1.00x** | +| pyranges0 | 0.559251 | 0.564516 | 0.561273 | **0.77x** | +| pyranges1 | 1.876350 | 1.888463 | 1.880867 | 0.23x | +| pybedtools0 | 10.379844 | 10.430488 | 10.404292 | 0.04x | +| pygenomics | 15.553783 | 15.567857 | 15.562953 | 0.03x | +| genomicranges | 15.517461 | 15.548186 | 15.535206 | 0.03x | + + + +##### M-size (7-3), output: 4,408,383 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|-----------|-----------|-----------|-----------| +| bioframe | 1.022998 | 1.028002 | 1.024980 | 0.40x | +| polars_bio | 0.397203 | 0.426743 | 0.412704 | **1.00x** | +| pyranges0 | 0.590809 | 0.602570 | 0.594928 | **0.69x** | +| pyranges1 | 2.027123 | 2.074861 | 2.045372 | 0.20x | +| pybedtools0 | 15.957823 | 16.006681 | 15.988963 | 0.03x | +| pygenomics | 13.983596 | 13.994300 | 13.990662 | 0.03x | +| genomicranges | 18.602139 | 18.625446 | 18.615777 | 0.02x | + +##### L-size (0-8), output: 164,196,784 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|------------|------------|------------|-----------| +| bioframe | 21.459718 | 21.516023 | 21.480410 | 0.29x | +| polars_bio | 5.713430 | 6.952107 | 6.129996 | **1.00x** | +| pyranges0 | 15.898455 | 16.227408 | 16.011707 | **0.38x** | +| pyranges1 | 21.721230 | 22.272518 | 21.917855 | 0.28x | +| pybedtools0 | 575.612739 | 578.021023 | 577.165597 | 0.01x | +| pygenomics | 244.510614 | 245.508453 | 245.063967 | 0.03x | +| genomicranges | 440.650408 | 440.737924 | 440.706206 | 0.01x | + + +##### L-size (4-8), output: 227,832,153 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|------------|-----------|-----------|-----------|-----------| +| bioframe | 29.460466 | 29.864740 | 29.633731 | 0.34x | +| polars_bio | 9.731893 | 10.180046 | 9.968996 | **1.00x** | +| pyranges0 | 21.637592 | 22.724399 | 22.011753 | **0.45x** | +| pyranges1 | 37.035666 | 37.531010 | 37.218867 | 0.27x | + +##### L-size (7-8), output: 307, 
184,634 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|------------|-----------|-----------|-----------|-----------| +| bioframe | 38.547761 | 38.593432 | 38.573512 | 0.18x | +| polars_bio | 6.356472 | 8.204682 | 6.980182 | **1.00x** | +| pyranges0 | 28.664496 | 28.878972 | 28.751498 | **0.24x** | +| pyranges1 | 80.373241 | 80.871479 | 80.546908 | 0.09x | + + +### Intel + +- cpu architecture: `x86_64` +- cpu name: `INTEL(R) XEON(R) PLATINUM 8581C CPU @ 2.30GHz` +- cpu cores: `4` +- memory: `61 GB` +- kernel: `#27~22.04.1-Ubuntu SMP Tue Jul 16 23:03:39 UTC 2024` +- system: `Linux` +- os-release: `Linux-6.5.0-1025-gcp-x86_64-with-glibc2.35` +- python: `3.12.8` +- polars-bio: `0.3.0` + +#### Overlap operation +#### S-size, output < 1,000,000 +##### S-size (1-2) - output: 54,246 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|----------|----------|----------|-----------| +| bioframe | 0.080274 | 0.083350 | 0.082125 | 0.67x | +| polars_bio | 0.051923 | 0.060853 | 0.055115 | **1.00x** | +| pyranges0 | 0.057737 | 0.063692 | 0.060233 | **0.92x** | +| pyranges1 | 0.092273 | 0.104232 | 0.096598 | 0.57x | +| pybedtools0 | 0.342928 | 0.350446 | 0.345739 | 0.16x | +| pygenomics | 1.933479 | 1.980263 | 1.958915 | 0.03x | +| genomicranges | 1.317808 | 1.365975 | 1.345268 | 0.04x | + +##### S-size (2-7), output: 273,500 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|----------|----------|----------|-----------| +| bioframe | 0.242910 | 0.250233 | 0.246872 | 0.59x | +| polars_bio | 0.142933 | 0.151324 | 0.146654 | **1.00x** | +| pyranges0 | 0.181919 | 0.184524 | 0.183063 | **0.80x** | +| pyranges1 | 0.303359 | 0.305036 | 0.304166 | 0.48x | +| pybedtools0 | 1.303765 | 1.318575 | 1.310322 | 0.11x | +| pygenomics | 5.744573 | 5.917737 | 5.816145 | 0.03x | +| genomicranges | 4.202981 | 4.298941 | 4.243175 | 0.03x | + + +##### S-size (1-0) - output: 320,955 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | 
+|---------------|-----------|-----------|-----------|-----------| +| bioframe | 0.421461 | 0.449266 | 0.434152 | 0.53x | +| polars_bio | 0.228252 | 0.233000 | 0.230004 | **1.00x** | +| pyranges0 | 0.383663 | 0.401601 | 0.391000 | **0.59x** | +| pyranges1 | 0.563753 | 0.575554 | 0.570290 | 0.40x | +| pybedtools0 | 1.617740 | 1.643310 | 1.631340 | 0.14x | +| pygenomics | 10.491757 | 10.753130 | 10.636810 | 0.02x | +| genomicranges | 5.806456 | 5.880285 | 5.851234 | 0.04x | + + +##### M-size (7-0), output: 2,761,621 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|-----------|-----------|-----------|-----------| +| bioframe | 0.900843 | 0.928098 | 0.917930 | 0.43x | +| polars_bio | 0.380828 | 0.408791 | 0.390157 | **1.00x** | +| pyranges0 | 0.580401 | 0.607483 | 0.595004 | **0.66x** | +| pyranges1 | 1.697365 | 1.705109 | 1.699965 | 0.23x | +| pybedtools0 | 9.120270 | 9.384526 | 9.211789 | 0.04x | +| pygenomics | 13.123205 | 13.179993 | 13.160740 | 0.03x | +| genomicranges | 13.230635 | 13.690668 | 13.472020 | 0.03x | + + +##### M-size (7-3), output: 4,408,383 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|---------------|-----------|-----------|-----------|-----------| +| bioframe | 1.137155 | 1.142985 | 1.140749 | 0.35x | +| polars_bio | 0.382198 | 0.411443 | 0.396179 | **1.00x** | +| pyranges0 | 0.650236 | 0.675971 | 0.659619 | **0.60x** | +| pyranges1 | 1.818395 | 1.841851 | 1.826528 | 0.22x | +| pybedtools0 | 14.588216 | 14.666769 | 14.621019 | 0.03x | +| pygenomics | 11.975859 | 12.196851 | 12.121281 | 0.03x | +| genomicranges | 15.640415 | 15.839974 | 15.736289 | 0.03x | + + +##### L-size (0-8), output: 164,196,784 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|------------|-----------|-----------|-----------|-----------| +| bioframe | 28.818453 | 28.956365 | 28.884398 | 0.21x | +| polars_bio | 5.904987 | 6.562457 | 6.145784 | **1.00x** | +| pyranges0 | 22.664353 | 22.997717 | 22.806512 | **0.27x** | +| pyranges1 | 
24.446387 | 24.804753 | 24.613135 | 0.25x | + + +##### L-size (4-8), output: 227,832,153 + +| Library | Min (s) | Max (s) | Mean (s) | Speedup | +|------------|-----------|-----------|-----------|-----------| +| bioframe | 39.868340 | 40.109302 | 39.951601 | 0.25x | +| polars_bio | 9.736690 | 10.277895 | 10.021107 | **1.00x** | +| pyranges0 | 31.146222 | 31.290984 | 31.208499 | **0.32x** | +| pyranges1 | 39.407547 | 40.279563 | 39.843926 | 0.25x | + + +##### L-size (7-8), output: 307,184,634 + + + +### Google Axion +[//]: # (## Benchmarking) + +[//]: # (polars-bio significantly outperforms other libraries in terms of speed and memory usage.) + +[//]: # (It was benchmarked against the following libraries:) + +[//]: # () +[//]: # () +[//]: # () +[//]: # () +[//]: # () +[//]: # (## Results) + +[//]: # (### Overlap operation) + +[//]: # (![results-overlap-0.1.1.png](assets/results-overlap-0.1.1.png)) + +[//]: # () +[//]: # (### Nearest interval operation) + +[//]: # (![results-nearest-0.1.1.png](assets/results-nearest-0.1.1.png)) + +#### Nearest (closest) operation + +### Parallel execution and scalability + +### Native, Pandas, Polars performance comparison -### Nearest interval operation -![results-nearest-0.1.1.png](assets/results-nearest-0.1.1.png) \ No newline at end of file +## How to run the benchmarks +```bash +poetry env use python3.12 +poetry update +RUSTFLAGS="-Ctarget-cpu=native" maturin develop --release -m Cargo.toml +poetry run python benchmark/src/bench_overlap.py +``` \ No newline at end of file diff --git a/polars_bio/__init__.py b/polars_bio/__init__.py index 3708fd0..d32bc01 100644 --- a/polars_bio/__init__.py +++ b/polars_bio/__init__.py @@ -7,5 +7,5 @@ logger = logging.getLogger("polars_bio") logger.setLevel(logging.INFO) -__version__ = "0.3.1" +__version__ = "0.3.0" __all__ = ["overlap", "nearest", "ctx", "FilterOp", "vizualize_intervals"] diff --git a/pyproject.toml b/pyproject.toml index cf4cf1f..9f9063f 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -36,7 +36,7 @@ authors = ["Marek Wiewiórka "] readme = "README.md" [tool.poetry.dependencies] -python = ">=3.9,<3.13" +python = ">=3.12,<3.13" polars = "^1.16.0" pandas = "^2.2.3" pyarrow = "^18.1.0" @@ -62,4 +62,8 @@ py-cpuinfo = "^9.0.0" jupyter_client = "^8.6.3" psutil = "^6.1.1" rich = "^13.9.4" -pyranges = "^0.0.1" \ No newline at end of file +pyranges = {git = "https://github.com/pyranges/pyranges.git", rev = "4f0a153336e7153cdfea15b141ce4ea35a24e233" } +GenomicRanges = "^0.5.0" +pyranges1 = { git = "https://github.com/mwiewior/pyranges1.git", rev = "949d7c15c1c2e217f4404415f79b386f326b6f8d"} +pybedtools = "^0.10.0" +pygenomics = { git = "https://gitlab.com/gtamazian/pygenomics.git", rev = "0.1.1"} \ No newline at end of file