diff --git a/benchmarks/IO.py b/benchmarks/IO.py
index 943a0981b4..0b3581ea02 100644
--- a/benchmarks/IO.py
+++ b/benchmarks/IO.py
@@ -29,7 +29,7 @@ class FileFormat(Enum):
     PARQUET = 2
     CSV = 3
 
-def time_ak_write(N_per_locale, numfiles, trials, dtype, path, seed, fileFormat, comps=None):
+def time_ak_write(N_per_locale, numfiles, trials, dtype, path, seed, fileFormat, comps=None, fixed_size=-1):
     if comps is None or comps == [""]:
         comps = COMPRESSIONS
 
@@ -50,7 +50,10 @@ def time_ak_write(N_per_locale, numfiles, trials, dtype, path, seed, fileFormat,
     elif dtype == "uint64":
         a = ak.randint(0, 2**32, N, dtype=ak.uint64, seed=seed)
     elif dtype == "str":
-        a = ak.random_strings_uniform(1, 16, N, seed=seed)
+        if fixed_size > 0:
+            a = ak.random_strings_uniform(fixed_size, fixed_size+1, N, seed=seed)
+        else:
+            a = ak.random_strings_uniform(1, 16, N, seed=seed)
 
     times = {}
     if fileFormat == FileFormat.PARQUET:
@@ -93,7 +96,7 @@ def time_ak_write(N_per_locale, numfiles, trials, dtype, path, seed, fileFormat,
         print("write Average rate {} = {:.4f} GiB/sec".format(key, nb / 2**30 / times[key]))
 
 
-def time_ak_read(N_per_locale, numfiles, trials, dtype, path, fileFormat, comps=None):
+def time_ak_read(N_per_locale, numfiles, trials, dtype, path, fileFormat, comps=None, fixed_size=-1):
     if comps is None or comps == [""]:
         comps = COMPRESSIONS
 
@@ -111,15 +114,26 @@ def time_ak_read(N_per_locale, numfiles, trials, dtype, path, fileFormat, comps=
 
     times = {}
     if fileFormat == FileFormat.PARQUET:
-        for comp in COMPRESSIONS:
-            if comp in comps:
-                readtimes = []
-                for i in range(trials):
-                    start = time.time()
-                    a = ak.read_parquet(path + comp + "*").popitem()[1]
-                    end = time.time()
-                    readtimes.append(end - start)
-                times[comp] = sum(readtimes) / trials
+        if fixed_size < 1:
+            for comp in COMPRESSIONS:
+                if comp in comps:
+                    readtimes = []
+                    for i in range(trials):
+                        start = time.time()
+                        a = ak.read_parquet(path + comp + "*").popitem()[1]
+                        end = time.time()
+                        readtimes.append(end - start)
+                    times[comp] = sum(readtimes) / trials
+        else:
+            for comp in COMPRESSIONS:
+                if comp in comps:
+                    readtimes = []
+                    for i in range(trials):
+                        start = time.time()
+                        a = ak.read_parquet(path + comp + "*",fixed_len=fixed_size).popitem()[1]
+                        end = time.time()
+                        readtimes.append(end - start)
+                    times[comp] = sum(readtimes) / trials
 
     elif fileFormat == FileFormat.HDF5:
         readtimes = []
@@ -211,6 +225,9 @@ def create_parser():
     parser.add_argument(
         "-n", "--size", type=int, default=10**8, help="Problem size: length of array to write/read"
     )
+    parser.add_argument(
+        "--fixed-size", type=int, default=-1, help="Fixed size length of string for Parquet"
+    )
     parser.add_argument(
         "-t", "--trials", type=int, default=1, help="Number of times to run the benchmark"
     )
diff --git a/benchmarks/parquetIO.py b/benchmarks/parquetIO.py
index 4f83857773..385bc0a4ab 100644
--- a/benchmarks/parquetIO.py
+++ b/benchmarks/parquetIO.py
@@ -19,6 +19,9 @@ def create_parser():
     parser.add_argument(
         "-n", "--size", type=int, default=10**6, help="Problem size: length of array to read/write"
     )
+    parser.add_argument(
+        "--fixed-size", type=int, default=-1, help="Fixed size length of string for Parquet"
+    )
     parser.add_argument(
         "-t", "--trials", type=int, default=1, help="Number of times to run the benchmark"
     )
@@ -105,9 +108,10 @@ def create_parser():
             args.seed,
             FileFormat.PARQUET,
             comp_types,
+            args.fixed_size,
         )
     elif args.only_read:
-        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types)
+        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types, args.fixed_size)
     else:
         time_ak_write(
             args.size,
@@ -118,8 +122,9 @@ def create_parser():
             args.seed,
             FileFormat.PARQUET,
             comp_types,
+            args.fixed_size,
         )
-        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types)
+        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types, args.fixed_size)
 
     remove_files(args.path)
     sys.exit(0)
diff --git a/benchmarks/parquetMultiIO.py b/benchmarks/parquetMultiIO.py
index b0a8916ec6..b2af578454 100644
--- a/benchmarks/parquetMultiIO.py
+++ b/benchmarks/parquetMultiIO.py
@@ -21,6 +21,9 @@ def create_parser():
     parser.add_argument(
         "-n", "--size", type=int, default=10**7, help="Problem size: length of array to write/read"
     )
+    parser.add_argument(
+        "--fixed-size", type=int, default=-1, help="Fixed size length of string for Parquet"
+    )
     parser.add_argument(
         "-t", "--trials", type=int, default=1, help="Number of times to run the benchmark"
     )
@@ -107,9 +110,10 @@ def create_parser():
             args.seed,
             FileFormat.PARQUET,
             comp_types,
+            args.fixed_size
         )
     elif args.only_read:
-        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types)
+        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types, args.fixed_size)
     elif args.only_delete:
         remove_files(args.path)
     else:
@@ -122,8 +126,9 @@ def create_parser():
             args.seed,
             FileFormat.PARQUET,
             comp_types,
+            args.fixed_size
         )
-        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types)
+        time_ak_read(args.size, args.files_per_loc, args.trials, args.dtype, args.path, FileFormat.PARQUET, comp_types, args.fixed_size)
 
     remove_files(args.path)
     sys.exit(0)