Skip to content

Commit

Permalink
Closes #3613: Add fixed-length flag to string read benchmark for Parq…
Browse files Browse the repository at this point in the history
…uet (#3614)

* Add fixed-length flag to string read benchmark for Parquet

This PR introduces a fixed-length flag to the string read
benchmark to facilitate performance evaluation of Parquet's
fixed-length optimization. By enforcing consistent file
sizes during the write process, the benchmark can leverage
the fixed_size flag when reading, eliminating the need for
manual size calculations. This change enables more
accurate and efficient performance measurements of the
Parquet fixed-length optimization.

* Thread through Parquet

* Fix short flag naming
  • Loading branch information
bmcdonald3 authored Aug 5, 2024
1 parent e40150b commit e727765
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 16 deletions.
41 changes: 29 additions & 12 deletions benchmarks/IO.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class FileFormat(Enum):
PARQUET = 2
CSV = 3

def time_ak_write(N_per_locale, numfiles, trials, dtype, path, seed, fileFormat, comps=None):
def time_ak_write(N_per_locale, numfiles, trials, dtype, path, seed, fileFormat, comps=None, fixed_size=-1):
if comps is None or comps == [""]:
comps = COMPRESSIONS

Expand All @@ -50,7 +50,10 @@ def time_ak_write(N_per_locale, numfiles, trials, dtype, path, seed, fileFormat,
elif dtype == "uint64":
a = ak.randint(0, 2**32, N, dtype=ak.uint64, seed=seed)
elif dtype == "str":
a = ak.random_strings_uniform(1, 16, N, seed=seed)
if fixed_size > 0:
a = ak.random_strings_uniform(fixed_size, fixed_size+1, N, seed=seed)
else:
a = ak.random_strings_uniform(1, 16, N, seed=seed)

times = {}
if fileFormat == FileFormat.PARQUET:
Expand Down Expand Up @@ -93,7 +96,7 @@ def time_ak_write(N_per_locale, numfiles, trials, dtype, path, seed, fileFormat,
print("write Average rate {} = {:.4f} GiB/sec".format(key, nb / 2**30 / times[key]))


def time_ak_read(N_per_locale, numfiles, trials, dtype, path, fileFormat, comps=None):
def time_ak_read(N_per_locale, numfiles, trials, dtype, path, fileFormat, comps=None, fixed_size=-1):
if comps is None or comps == [""]:
comps = COMPRESSIONS

Expand All @@ -111,15 +114,26 @@ def time_ak_read(N_per_locale, numfiles, trials, dtype, path, fileFormat, comps=

times = {}
if fileFormat == FileFormat.PARQUET:
for comp in COMPRESSIONS:
if comp in comps:
readtimes = []
for i in range(trials):
start = time.time()
a = ak.read_parquet(path + comp + "*").popitem()[1]
end = time.time()
readtimes.append(end - start)
times[comp] = sum(readtimes) / trials
if fixed_size < 1:
for comp in COMPRESSIONS:
if comp in comps:
readtimes = []
for i in range(trials):
start = time.time()
a = ak.read_parquet(path + comp + "*").popitem()[1]
end = time.time()
readtimes.append(end - start)
times[comp] = sum(readtimes) / trials
else:
for comp in COMPRESSIONS:
if comp in comps:
readtimes = []
for i in range(trials):
start = time.time()
a = ak.read_parquet(path + comp + "*",fixed_len=fixed_size).popitem()[1]
end = time.time()
readtimes.append(end - start)
times[comp] = sum(readtimes) / trials

elif fileFormat == FileFormat.HDF5:
readtimes = []
Expand Down Expand Up @@ -211,6 +225,9 @@ def create_parser():
parser.add_argument(
"-n", "--size", type=int, default=10**8, help="Problem size: length of array to write/read"
)
parser.add_argument(
"--fixed-size", type=int, default=-1, help="Fixed size length of string for Parquet"
)
parser.add_argument(
"-t", "--trials", type=int, default=1, help="Number of times to run the benchmark"
)
Expand Down
9 changes: 7 additions & 2 deletions benchmarks/parquetIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ def create_parser():
parser.add_argument(
"-n", "--size", type=int, default=10**6, help="Problem size: length of array to read/write"
)
parser.add_argument(
"--fixed-size", type=int, default=-1, help="Fixed size length of string for Parquet"
)
parser.add_argument(
"-t", "--trials", type=int, default=1, help="Number of times to run the benchmark"
)
Expand Down Expand Up @@ -105,9 +108,10 @@ def create_parser():
args.seed,
FileFormat.PARQUET,
comp_types,
args.fixed_size,
)
elif args.only_read:
time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types)
time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types, args.fixed_size)
else:
time_ak_write(
args.size,
Expand All @@ -118,8 +122,9 @@ def create_parser():
args.seed,
FileFormat.PARQUET,
comp_types,
args.fixed_size,
)
time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types)
time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types, args.fixed_size)
remove_files(args.path)

sys.exit(0)
9 changes: 7 additions & 2 deletions benchmarks/parquetMultiIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ def create_parser():
parser.add_argument(
"-n", "--size", type=int, default=10**7, help="Problem size: length of array to write/read"
)
parser.add_argument(
"--fixed-size", type=int, default=-1, help="Fixed size length of string for Parquet"
)
parser.add_argument(
"-t", "--trials", type=int, default=1, help="Number of times to run the benchmark"
)
Expand Down Expand Up @@ -107,9 +110,10 @@ def create_parser():
args.seed,
FileFormat.PARQUET,
comp_types,
args.fixed_size
)
elif args.only_read:
time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types)
time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types, args.fixed_size)
elif args.only_delete:
remove_files(args.path)
else:
Expand All @@ -122,8 +126,9 @@ def create_parser():
args.seed,
FileFormat.PARQUET,
comp_types,
args.fixed_size
)
time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types)
time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types, args.fixed_size)
remove_files(args.path)

sys.exit(0)

0 comments on commit e727765

Please sign in to comment.