diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py b/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
index ff3f2d00ff..e2de830c5a 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
@@ -44,7 +44,7 @@
 }


-def downloadDlioOutputs(dlioWorkloads):
+def downloadDlioOutputs(dlioWorkloads: set, instanceId: str):
   for dlioWorkload in dlioWorkloads:
     print(f"Downloading DLIO logs from the bucket {dlioWorkload.bucket}...")
     result = subprocess.run(
@@ -54,8 +54,8 @@ def downloadDlioOutputs(dlioWorkloads):
             "-q",  # download silently without any logs
             "cp",
             "-r",
-            f"gs://{dlioWorkload.bucket}/logs",
-            LOCAL_LOGS_LOCATION,
+            f"gs://{dlioWorkload.bucket}/logs/{instanceId}",
+            LOCAL_LOGS_LOCATION + "/logs",
         ],
         capture_output=False,
         text=True,
@@ -91,17 +91,22 @@ def downloadDlioOutputs(dlioWorkloads):
       ),
       required=True,
   )
+  parser.add_argument(
+      "--instance-id",
+      help="unique string ID for current test-run",
+      required=True,
+  )
   args = parser.parse_args()

   try:
-    os.makedirs(LOCAL_LOGS_LOCATION)
+    os.makedirs(LOCAL_LOGS_LOCATION + "/logs")
   except FileExistsError:
     pass

   dlioWorkloads = dlio_workload.ParseTestConfigForDlioWorkloads(
       args.workload_config
   )
-  downloadDlioOutputs(dlioWorkloads)
+  downloadDlioOutputs(dlioWorkloads, args.instance_id)

   """
     "{num_files_train}-{mean_file_size}-{batch_size}":
@@ -119,7 +124,9 @@ def downloadDlioOutputs(dlioWorkloads):
   if not mash_installed:
     print("Mash is not installed, will skip parsing CPU and memory usage.")

-  for root, _, files in os.walk(LOCAL_LOGS_LOCATION):
+  for root, _, files in os.walk(
+      LOCAL_LOGS_LOCATION + "/logs/" + args.instance_id
+  ):
     if files:
       print(f"Parsing directory {root} ...")
       per_epoch_stats_file = root + "/per_epoch_stats.json"
@@ -152,9 +159,9 @@ def downloadDlioOutputs(dlioWorkloads):

         if key not in output:
           output[key] = {
-              "num_files_train": part_list[2],
-              "mean_file_size": part_list[3],
-              "batch_size": part_list[4],
+              "num_files_train": part_list[-3],
+              "mean_file_size": part_list[-2],
+              "batch_size": part_list[-1],
               "records": {
                   "local-ssd": [],
                   "gcsfuse-generic": [],
@@ -166,7 +173,7 @@ def downloadDlioOutputs(dlioWorkloads):
           r = record.copy()
           r["pod_name"] = summary_data["hostname"]
           r["epoch"] = i + 1
-          r["scenario"] = "-".join(part_list[5:])
+          r["scenario"] = root.split("/")[-1]
           r["train_au_percentage"] = round(
               summary_data["metric"]["train_au_percentage"][i], 2
           )
@@ -220,7 +227,7 @@ def downloadDlioOutputs(dlioWorkloads):
       " (s),GPU Utilization (%),Throughput (sample/s),Throughput"
       " (MB/s),Throughput over Local SSD (%),GCSFuse Lowest Memory (MB),GCSFuse"
       " Highest Memory (MB),GCSFuse Lowest CPU (core),GCSFuse Highest CPU"
-      " (core),Pod,Start,End,GcsfuseMountOptions\n"
+      " (core),Pod,Start,End,GcsfuseMountOptions,InstanceID\n"
   )

   for key in output:
@@ -241,19 +248,25 @@ def downloadDlioOutputs(dlioWorkloads):
         ):
           for i in range(len(record_set["records"]["local-ssd"])):
             r = record_set["records"][scenario][i]
-            r["throughput_over_local_ssd"] = round(
-                r["train_throughput_mb_per_second"]
-                / record_set["records"]["local-ssd"][i][
-                    "train_throughput_mb_per_second"
-                ]
-                * 100,
-                2,
-            )
+            try:
+              r["throughput_over_local_ssd"] = round(
+                  r["train_throughput_mb_per_second"]
+                  / record_set["records"]["local-ssd"][i][
+                      "train_throughput_mb_per_second"
+                  ]
+                  * 100,
+                  2,
+              )
+            except ZeroDivisionError:
+              print("Got ZeroDivisionError. Ignoring it.")
+              r["throughput_over_local_ssd"] = 0
+            except:
+              raise
             output_file.write(
                 f"{record_set['mean_file_size']},{record_set['num_files_train']},{total_size},{record_set['batch_size']},{scenario},"
             )
             output_file.write(
-                f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\"\n"
+                f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{args.instance_id}\n"
             )
         else:
           for i in range(len(record_set["records"][scenario])):
@@ -263,7 +276,7 @@ def downloadDlioOutputs(dlioWorkloads):
                f"{record_set['mean_file_size']},{record_set['num_files_train']},{total_size},{record_set['batch_size']},{scenario},"
            )
            output_file.write(
-               f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\"\n"
+               f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{args.instance_id}\n"
            )

   output_file.close()
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py b/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
index 0a95ff387c..7e117e2f05 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/run_tests.py
@@ -33,7 +33,7 @@ def run_command(command: str):
     print(result.stderr)


-def createHelmInstallCommands(dlioWorkloads: set):
+def createHelmInstallCommands(dlioWorkloads: list, instanceId: str):
   """Create helm install commands for the given set of dlioWorkload objects."""
   helm_commands = []
   for dlioWorkload in dlioWorkloads:
@@ -41,13 +41,14 @@
     commands = [
         (
            'helm install'
-           f' {dlioWorkload.bucket}-{batchSize}-{dlioWorkload.scenario} unet3d-loading-test'
+           f' dlio-unet3d-{dlioWorkload.scenario}-{dlioWorkload.numFilesTrain}-{dlioWorkload.recordLength}-{batchSize} unet3d-loading-test'
         ),
         f'--set bucketName={dlioWorkload.bucket}',
         f'--set scenario={dlioWorkload.scenario}',
         f'--set dlio.numFilesTrain={dlioWorkload.numFilesTrain}',
         f'--set dlio.recordLength={dlioWorkload.recordLength}',
         f'--set dlio.batchSize={batchSize}',
+        f'--set instanceId={instanceId}',
     ]

     helm_command = ' '.join(commands)
@@ -59,7 +60,9 @@ def main(args) -> None:
   dlioWorkloads = dlio_workload.ParseTestConfigForDlioWorkloads(
       args.workload_config
   )
-  helmInstallCommands = createHelmInstallCommands(dlioWorkloads)
+  helmInstallCommands = createHelmInstallCommands(
+      dlioWorkloads, args.instance_id
+  )
   for helmInstallCommand in helmInstallCommands:
     print(f'{helmInstallCommand}')
     if not args.dry_run:
@@ -80,6 +83,11 @@ def main(args) -> None:
       help='Runs DLIO Unet3d tests using this JSON workload configuration.',
       required=True,
   )
+  parser.add_argument(
+      '--instance-id',
+      help='unique string ID for current test-run',
+      required=True,
+  )
   parser.add_argument(
       '-n',
       '--dry-run',
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml
index 7a117da56b..bda1029baf 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/templates/dlio-tester.yaml
@@ -16,7 +16,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: dlio-tester-{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}-{{ .Values.scenario }}
+  name: dlio-tester-{{ .Values.scenario }}-{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}
 {{- if ne .Values.scenario "local-ssd" }}
   annotations:
     gke-gcsfuse/volumes: "true"
@@ -84,14 +84,14 @@ spec:
           ++workload.reader.batch_size={{ .Values.dlio.batchSize }} \
           ++workload.dataset.record_length={{ .Values.dlio.recordLength }} \
           ++workload.reader.read_threads={{ .Values.dlio.readThreads }} \
-          ++workload.output.folder=/logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}
+          ++workload.output.folder=/logs/{{ .Values.instanceId }}/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}

           # dump the gcsfuse-mount-configuration to a file in output-directory.
           {{ if eq .Values.scenario "gcsfuse-generic"}}
-          echo "{{ .Values.gcsfuse.mountOptions }}" > /logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}/gcsfuse_mount_options
+          echo "{{ .Values.gcsfuse.mountOptions }}" > /logs/{{ .Values.instanceId }}/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}/gcsfuse_mount_options
           {{ end }}

-          gsutil -m cp -R /logs gs://{{ .Values.bucketName }}/logs/$(date +"%Y-%m-%d-%H-%M")
+          gsutil -m cp -R /logs/{{ .Values.instanceId }} gs://{{ .Values.bucketName }}/logs/{{ .Values.instanceId }}/$(date +"%Y-%m-%d-%H-%M")
       volumeMounts:
         - name: dshm
           mountPath: /dev/shm
diff --git a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml
index c96e4ba9f0..e7c0c006f4 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/dlio/unet3d-loading-test/values.yaml
@@ -21,6 +21,7 @@ image: jiaxun/dlio:v1.2.0
 bucketName: gke-dlio-test-data
 scenario: local-ssd
 nodeType: n2-standard-96
+instanceId: ldap-yyyymmdd-hhmmss

 resourceLimits:
   cpu: 0
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml
index 368fba13af..d9c79eb8d4 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/templates/fio-tester.yaml
@@ -16,7 +16,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
-  name: fio-tester-{{ .Values.fio.readType }}-{{ lower .Values.fio.fileSize }}-{{ lower .Values.fio.blockSize }}-{{ .Values.scenario }}
+  name: fio-tester-{{ .Values.instanceId }}-{{ .Values.scenario }}-{{ .Values.fio.readType }}-{{ lower .Values.fio.fileSize }}-{{ lower .Values.fio.blockSize }}-{{ .Values.fio.numThreads }}-{{ .Values.fio.filesPerThread }}
 {{- if ne .Values.scenario "local-ssd" }}
   annotations:
     gke-gcsfuse/volumes: "true"
@@ -45,7 +45,12 @@ spec:
         echo "Install dependencies..."
         apt-get update
         apt-get install -y libaio-dev gcc make git time wget
-
+
+        no_of_files_per_thread={{ .Values.fio.filesPerThread }}
+        block_size={{ .Values.fio.blockSize }}
+        file_size={{ .Values.fio.fileSize }}
+        num_of_threads={{ .Values.fio.numThreads }}
+
         {{ if eq .Values.scenario "local-ssd" }}
         echo "Installing gsutil..."
         apt-get update && apt-get install -y apt-transport-https ca-certificates gnupg curl
@@ -105,12 +110,8 @@ spec:

         echo "Setup default values..."
         epoch=4
-        no_of_files_per_thread={{ .Values.fio.filesPerThread }}
         read_type={{ .Values.fio.readType }}
         pause_in_seconds=20
-        block_size={{ .Values.fio.blockSize }}
-        file_size={{ .Values.fio.fileSize }}
-        num_of_threads={{ .Values.fio.numThreads }}
         workload_dir=/data

         # Cleaning the pagecache, dentries and inode cache before the starting the workload.
@@ -125,18 +126,18 @@ spec:
         time ls -R $workload_dir 1> /dev/null

         echo "Run fio tests..."
-        mkdir -p /data/fio-output/{{ .Values.scenario }}/$read_type
+        output_dir=/data/fio-output/{{ .Values.instanceId }}/${file_size}-{{ lower .Values.fio.blockSize}}-${num_of_threads}-${no_of_files_per_thread}/{{ .Values.scenario }}/$read_type
+        mkdir -p ${output_dir}

         # dump the gcsfuse-mount-configuration to a file in output-directory.
         {{ if eq .Values.scenario "gcsfuse-generic" }}
-        echo "{{ .Values.gcsfuse.mountOptions }}" > /data/fio-output/{{ .Values.scenario }}/$read_type/gcsfuse_mount_options
+        echo "{{ .Values.gcsfuse.mountOptions }}" > ${output_dir}/gcsfuse_mount_options
         {{ end }}

         for i in $(seq $epoch); do
-
           echo "[Epoch ${i}] start time:" `date +%s`
           free -mh # Memory usage before workload start.
-          NUMJOBS=$num_of_threads NRFILES=$no_of_files_per_thread FILE_SIZE=$file_size BLOCK_SIZE=$block_size READ_TYPE=$read_type DIR=$workload_dir fio ${filename} --alloc-size=1048576 --output-format=json --output="/data/fio-output/{{ .Values.scenario }}/${read_type}/epoch${i}.json"
+          NUMJOBS=$num_of_threads NRFILES=$no_of_files_per_thread FILE_SIZE=$file_size BLOCK_SIZE=$block_size READ_TYPE=$read_type DIR=$workload_dir fio ${filename} --alloc-size=1048576 --output-format=json --output="${output_dir}/epoch${i}.json"

           free -mh # Memory usage after workload completion.
           echo "[Epoch ${i}] end time:" `date +%s`
@@ -154,7 +155,7 @@ spec:
         done

         {{ if eq .Values.scenario "local-ssd" }}
-        gsutil -m cp -R /data/fio-output/local-ssd gs://{{ .Values.bucketName }}/fio-output
+        gsutil -m cp -R /data/fio-output/{{ .Values.instanceId }}/* gs://{{ .Values.bucketName }}/fio-output/{{ .Values.instanceId }}/
         {{ end }}

         echo "fio job completed!"
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml
index ad8a969cfa..b1995da8c4 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/loading-test/values.yaml
@@ -21,6 +21,7 @@ image: ubuntu:24.04
 bucketName: gke-dlio-test-data
 scenario: local-ssd
 nodeType: n2-standard-96
+instanceId: ldap-yyyymmdd-hhmmss

 resourceLimits:
   cpu: 0
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py b/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
index 59d5efb196..60ec92c81c 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
@@ -46,10 +46,11 @@
 }


-def downloadFioOutputs(fioWorkloads):
+def downloadFioOutputs(fioWorkloads: set, instanceId: str):
   for fioWorkload in fioWorkloads:
+    dstDir = LOCAL_LOGS_LOCATION + "/" + instanceId + "/" + fioWorkload.fileSize
     try:
-      os.makedirs(LOCAL_LOGS_LOCATION + "/" + fioWorkload.fileSize)
+      os.makedirs(dstDir)
     except FileExistsError:
       pass

@@ -61,8 +62,8 @@ def downloadFioOutputs(fioWorkloads):
         "-q",  # download silently without any logs
         "cp",
         "-r",
-        f"gs://{fioWorkload.bucket}/fio-output",
-        LOCAL_LOGS_LOCATION + "/" + fioWorkload.fileSize,
+        f"gs://{fioWorkload.bucket}/fio-output/{instanceId}/*",
+        dstDir,
     ],
     capture_output=False,
     text=True,
@@ -98,6 +99,11 @@ def downloadFioOutputs(fioWorkloads):
       ),
       required=True,
   )
+  parser.add_argument(
+      "--instance-id",
+      help="unique string ID for current test-run",
+      required=True,
+  )
   args = parser.parse_args()

   try:
@@ -108,10 +114,10 @@ def downloadFioOutputs(fioWorkloads):
   fioWorkloads = fio_workload.ParseTestConfigForFioWorkloads(
       args.workload_config
   )
-  downloadFioOutputs(fioWorkloads)
+  downloadFioOutputs(fioWorkloads, args.instance_id)

   """
-    "{read_type}-{mean_file_size}":
+    "{read_type}-{mean_file_size}-{bs}-{numjobs}-{nrfiles}":
         "mean_file_size": str
         "read_type": str
         "records":
@@ -125,7 +131,7 @@ def downloadFioOutputs(fioWorkloads):
   if not mash_installed:
     print("Mash is not installed, will skip parsing CPU and memory usage.")

-  for root, _, files in os.walk(LOCAL_LOGS_LOCATION):
+  for root, _, files in os.walk(LOCAL_LOGS_LOCATION + "/" + args.instance_id):
     for file in files:
       per_epoch_output = root + f"/{file}"
       if not per_epoch_output.endswith(".json"):
@@ -138,13 +144,6 @@ def downloadFioOutputs(fioWorkloads):
         with open(gcsfuse_mount_options_file) as f:
           gcsfuse_mount_options = f.read().strip()

-      print(f"Now parsing file {per_epoch_output} ...")
-      root_split = root.split("/")
-      mean_file_size = root_split[-4]
-      scenario = root_split[-2]
-      read_type = root_split[-1]
-      epoch = int(file.split(".")[0][-1])
-
       with open(per_epoch_output, "r") as f:
         try:
           per_epoch_output_data = json.load(f)
@@ -152,14 +151,36 @@ def downloadFioOutputs(fioWorkloads):
           print(f"failed to json-parse {per_epoch_output}, so skipping it.")
           continue

+      if (
+          not "jobs" in per_epoch_output_data
+          or not per_epoch_output_data["jobs"]
+          or not "job options" in per_epoch_output_data["jobs"][0]
+          or not "bs" in per_epoch_output_data["jobs"][0]["job options"]
+      ):
+        print(
+            f'Did not find "[jobs][0][job options][bs]" in {per_epoch_output},'
+            " so ignoring this file"
+        )
+        continue
+
+      print(f"Now parsing file {per_epoch_output} ...")
+      root_split = root.split("/")
+      mean_file_size = root_split[-4]
+      scenario = root_split[-2]
+      read_type = root_split[-1]
+      epoch = int(file.split(".")[0][-1])
+
       if "global options" not in per_epoch_output_data:
         print(f"field: 'global options' missing in {per_epoch_output}")
         continue

       global_options = per_epoch_output_data["global options"]
       nrfiles = int(global_options["nrfiles"])
       numjobs = int(global_options["numjobs"])
+      bs = per_epoch_output_data["jobs"][0]["job options"]["bs"]

-      key = "-".join([read_type, mean_file_size])
+      key = "-".join(
+          [read_type, mean_file_size, bs, str(numjobs), str(nrfiles)]
+      )
       if key not in output:
         output[key] = {
             "mean_file_size": mean_file_size,
@@ -175,7 +196,7 @@ def downloadFioOutputs(fioWorkloads):
           r = record.copy()
           bs = per_epoch_output_data["jobs"][0]["job options"]["bs"]
           r["pod_name"] = (
-              f"fio-tester-{read_type}-{mean_file_size.lower()}-{bs.lower()}-{scenario}"
+              f"fio-tester-{args.instance_id}-{scenario}-{read_type}-{mean_file_size.lower()}-{bs.lower()}-{numjobs}-{nrfiles}"
           )
           r["epoch"] = epoch
           r["scenario"] = scenario
@@ -229,13 +250,16 @@ def downloadFioOutputs(fioWorkloads):
       " (s),Throughput (MB/s),IOPS,Throughput over Local SSD (%),GCSFuse Lowest"
       " Memory (MB),GCSFuse Highest Memory (MB),GCSFuse Lowest CPU"
       " (core),GCSFuse Highest CPU"
-      " (core),Pod,Start,End,GcsfuseMoutOptions,BlockSize,FilesPerThread,NumThreads\n"
+      " (core),Pod,Start,End,GcsfuseMoutOptions,BlockSize,FilesPerThread,NumThreads,InstanceID\n"
   )

   for key in output:
     record_set = output[key]

     for scenario in scenario_order:
+      if not record_set["records"][scenario]:
+        continue
+
       for i in range(len(record_set["records"][scenario])):
         if ("local-ssd" in record_set["records"]) and (
             len(record_set["records"]["local-ssd"])
@@ -259,7 +283,7 @@ def downloadFioOutputs(fioWorkloads):
             continue
           else:
             output_file.write(
-                f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']}\n"
+                f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']},{args.instance_id}\n"
             )
         else:
           try:
@@ -273,6 +297,6 @@ def downloadFioOutputs(fioWorkloads):
             continue
          else:
            output_file.write(
-               f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},'Unknown',{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']}\n"
+               f"{record_set['mean_file_size']},{record_set['read_type']},{scenario},{r['epoch']},{r['duration']},{r['throughput_mb_per_second']},{r['IOPS']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{r['blockSize']},{r['filesPerThread']},{r['numThreads']},{args.instance_id}\n"
            )

   output_file.close()
diff --git a/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py b/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py
index 739b902b74..7bf3d544fa 100644
--- a/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py
+++ b/perfmetrics/scripts/testing_on_gke/examples/fio/run_tests.py
@@ -33,7 +33,7 @@ def run_command(command: str):
     print(result.stderr)


-def createHelmInstallCommands(fioWorkloads):
+def createHelmInstallCommands(fioWorkloads: list, instanceId: str):
   """Create helm install commands for the given set of fioWorkload objects."""
   helm_commands = []
   for fioWorkload in fioWorkloads:
@@ -41,7 +41,7 @@ def createHelmInstallCommands(fioWorkloads):
     commands = [
         (
            'helm install'
-           f' fio-loading-test-{fioWorkload.fileSize.lower()}-{readType}-{fioWorkload.scenario} loading-test'
+           f' fio-load-{fioWorkload.scenario}-{readType}-{fioWorkload.fileSize.lower()}-{fioWorkload.blockSize.lower()}-{fioWorkload.numThreads}-{fioWorkload.filesPerThread} loading-test'
         ),
         f'--set bucketName={fioWorkload.bucket}',
         f'--set scenario={fioWorkload.scenario}',
@@ -50,6 +50,7 @@ def createHelmInstallCommands(fioWorkloads):
         f'--set fio.blockSize={fioWorkload.blockSize}',
         f'--set fio.filesPerThread={fioWorkload.filesPerThread}',
         f'--set fio.numThreads={fioWorkload.numThreads}',
+        f'--set instanceId={instanceId}',
     ]

     helm_command = ' '.join(commands)
@@ -61,7 +62,9 @@ def main(args) -> None:
   fioWorkloads = fio_workload.ParseTestConfigForFioWorkloads(
       args.workload_config
   )
-  helmInstallCommands = createHelmInstallCommands(fioWorkloads)
+  helmInstallCommands = createHelmInstallCommands(
+      fioWorkloads, args.instance_id
+  )
   for helmInstallCommand in helmInstallCommands:
     print(f'{helmInstallCommand}')
     if not args.dry_run:
@@ -82,6 +85,11 @@ def main(args) -> None:
       help='Runs FIO tests using this JSON workload configuration',
       required=True,
   )
+  parser.add_argument(
+      '--instance-id',
+      help='unique string ID for current test-run',
+      required=True,
+  )
   parser.add_argument(
       '-n',
       '--dry-run',
diff --git a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
index f05ac4b555..ee98043148 100755
--- a/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
+++ b/perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
@@ -45,6 +45,7 @@ readonly gcsfuse_branch=garnitin/add-gke-load-testing/v1
 # GCSFuse configuration related
 readonly DEFAULT_GCSFUSE_MOUNT_OPTIONS="implicit-dirs"
 # Test runtime configuration
+readonly DEFAULT_INSTANCE_ID=${USER}-$(date +%Y%m%d-%H%M%S)
 readonly DEFAULT_POD_WAIT_TIME_IN_SECONDS=300

 function printHelp() {
@@ -71,7 +72,8 @@ function printHelp() {
   # GCSFuse configuration related
   echo "gcsfuse_mount_options=<\"comma-separated-gcsfuse-mount-options\" e.g. \""${DEFAULT_GCSFUSE_MOUNT_OPTIONS}"\">"
   # Test runtime configuration
-  echo "pod_wait_time_in_seconds="
+  echo "pod_wait_time_in_seconds="
+  echo "instance_id="
@@ … @@
   echo "gcsfuse_mount_options=\"${gcsfuse_mount_options}\""
   # Test runtime configuration
   echo "pod_wait_time_in_seconds=\"${pod_wait_time_in_seconds}\""
+  echo "instance_id=\"${instance_id}\""
   echo ""
   echo ""
   echo ""
@@ -411,12 +415,12 @@ function deleteAllPods() {

 function deployAllFioHelmCharts() {
   echo "Deploying all fio helm charts ..."
- cd "${gke_testing_dir}"/examples/fio && python3 ./run_tests.py && cd - + cd "${gke_testing_dir}"/examples/fio && python3 ./run_tests.py --workload-config ${gke_testing_dir}/examples/workloads.json --instance-id ${instance_id} && cd - } function deployAllDlioHelmCharts() { echo "Deploying all dlio helm charts ..." - cd "${gke_testing_dir}"/examples/dlio && python3 ./run_tests.py && cd - + cd "${gke_testing_dir}"/examples/dlio && python3 ./run_tests.py --workload-config ${gke_testing_dir}/examples/workloads.json --instance-id ${instance_id} && cd - } function listAllHelmCharts() { @@ -500,7 +504,7 @@ function revertPodConfigsFilesAfterTestRuns() { # echo "" # fioDataLoaderBucketNames | while read bucket; do # echo "${bucket}:" - # gcloud storage ls -l gs://${bucket}/fio-output/*/* | (grep -e 'json\|gcsfuse_mount_options' || true) + # gcloud storage ls -l gs://${bucket}/fio-output/${instance_id}/*/*/* | (grep -e 'json\|gcsfuse_mount_options' || true) # done # } # @@ -508,7 +512,7 @@ function revertPodConfigsFilesAfterTestRuns() { # echo "" # dlioDataLoaderBucketNames | while read bucket; do # echo "${bucket}:" - # gcloud storage ls -l gs://${bucket}/logs/*/*/* | (grep -e 'summary\.json\|per_epoch_stats\.json\|gcsfuse_mount_options' || true) + # gcloud storage ls -l gs://${bucket}/logs/${instance_id}/*/*/* | (grep -e 'summary\.json\|per_epoch_stats\.json\|gcsfuse_mount_options' || true) # done # } @@ -549,14 +553,14 @@ function revertPodConfigsFilesAfterTestRuns() { function fetchAndParseFioOutputs() { echo "Fetching and parsing fio outputs ..." cd "${gke_testing_dir}"/examples/fio - python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json + python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} cd - } function fetchAndParseDlioOutputs() { echo "Fetching and parsing dlio outputs ..." cd "${gke_testing_dir}"/examples/dlio - python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json + python3 parse_logs.py --project-number=${project_number} --workload-config="${gke_testing_dir}"/examples/workloads.json --instance-id ${instance_id} cd - }