Skip to content

Commit

Permalink
Collecting rename metrics for nested folders (#2370)
Browse files Browse the repository at this point in the history
* added check directory functions and its unit tests

* adding function to parse config file and generate dir in bucket

* exit_code set to 1

* correct format

* refactors

* test formatting

* function to avoid code repetition

* added check directory functions and its unit tests

* moving mount functions to utils

* compute metrics from time of operation

* uploading metrics to gsheet

* nits

* adding logic to collect nested folder metrics

* added logic to accumulate nested rename folder metrics

* updated unit tests

* adding unit tests for new func

* adding correct nested test case scenario

* added a unit test and function description
  • Loading branch information
anushka567 authored Sep 4, 2024
1 parent 199cfc1 commit 6bdb29d
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 34 deletions.
112 changes: 83 additions & 29 deletions perfmetrics/scripts/hns_rename_folders_metrics/renaming_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,72 @@ def _upload_to_gsheet(worksheet, data, spreadsheet_id) -> (int):
return exit_code


def _calculate_num_files(folder_structure):
"""
Calculate the total number of files across folders specified in folder structure.
Args:
folder_structure: JSON list containing JSON objects representing folders.
Returns:
Total count of files.
Examples:
folder_structure:[
{
...
num_files:1 ,
...
},
{
...
num_files:1,
...
}
]
For the above structure, the function returns 2.
"""
count=0
for folder in folder_structure:
count+=folder["num_files"]
return count


def _create_row_of_values(operation,test_type,num_files,num_folders,metrics):
"""
Creates rows of values from the metrics dict to be uploaded to gsheet.
Args:
operation: Type of rename operation (whether involves nested folders or not)
test_type: flat or hns
num_files: Total number of files involved in the rename operation(filepath got affected)
num_folders: Total number of folders renamed/folderpath got affected
metrics: Dict object containing metrics to be uploaded.
Returns:
A row containing values to be uploaded.
"""
row = [
operation,
test_type,
num_files,
num_folders,
metrics['Number of samples'],
metrics['Mean'],
metrics['Median'],
metrics['Standard Dev'],
metrics['Min'],
metrics['Max'],
metrics['Quantiles']['0 %ile'],
metrics['Quantiles']['20 %ile'],
metrics['Quantiles']['50 %ile'],
metrics['Quantiles']['90 %ile'],
metrics['Quantiles']['95 %ile'],
metrics['Quantiles']['98 %ile'],
metrics['Quantiles']['99 %ile'],
metrics['Quantiles']['99.5 %ile'],
metrics['Quantiles']['99.9 %ile'],
metrics['Quantiles']['100 %ile']

]
return row


def _get_values_to_export(dir, metrics, test_type):
"""
This function takes in extracted metrics data, filters it, rearranges it,
Expand All @@ -86,31 +152,15 @@ def _get_values_to_export(dir, metrics, test_type):
num_files = folder["num_files"]
num_folders = 1

row = [
'Renaming Operation',
test_type,
num_files,
num_folders,
metrics[folder["name"]]['Number of samples'],
metrics[folder["name"]]['Mean'],
metrics[folder["name"]]['Median'],
metrics[folder["name"]]['Standard Dev'],
metrics[folder["name"]]['Min'],
metrics[folder["name"]]['Max'],
metrics[folder["name"]]['Quantiles']['0 %ile'],
metrics[folder["name"]]['Quantiles']['20 %ile'],
metrics[folder["name"]]['Quantiles']['50 %ile'],
metrics[folder["name"]]['Quantiles']['90 %ile'],
metrics[folder["name"]]['Quantiles']['95 %ile'],
metrics[folder["name"]]['Quantiles']['98 %ile'],
metrics[folder["name"]]['Quantiles']['99 %ile'],
metrics[folder["name"]]['Quantiles']['99.5 %ile'],
metrics[folder["name"]]['Quantiles']['99.9 %ile'],
metrics[folder["name"]]['Quantiles']['100 %ile']
row=_create_row_of_values('Renaming Operation',test_type,num_files,num_folders,metrics[folder["name"]])
metrics_data.append(row)

]
nested_folder_name=dir["nested_folders"]["folder_name"]
num_files= _calculate_num_files(dir["nested_folders"]["folder_structure"])
num_folders=dir["nested_folders"]["num_folders"]

metrics_data.append(row)
row=_create_row_of_values('Renaming Operation Nested',test_type,num_files,num_folders,metrics[nested_folder_name])
metrics_data.append(row)

return metrics_data

Expand Down Expand Up @@ -174,25 +224,25 @@ def _parse_results(dir, results, num_samples):
folder_name = folder["name"]
metrics[folder_name] = _compute_metrics_from_time_of_operation(
num_samples, results[folder_name])
#TODO add logic for metrics parsing for nested folder

metrics[dir["nested_folders"]["folder_name"]]= _compute_metrics_from_time_of_operation(
num_samples, results[dir["nested_folders"]["folder_name"]])
return metrics


def _record_time_for_folder_rename(mount_point,folder,num_samples):
def _record_time_for_folder_rename(parent_dir,folder,num_samples):
"""
This function records the time of rename operation for folder,for num_samples
number of test runs.
Args:
mount_point: Mount point for the GCS bucket.
parent_dir: Parent directory for the folder.
folder: JSON object representing the folder being renamed.
num_samples: Number of samples to collect for each test.
Returns:
A list containing time of rename operations in seconds.
"""
folder_name= '{}/{}'.format(mount_point,folder["name"])
folder_name= '{}/{}'.format(parent_dir,folder["name"])
folder_rename = folder_name+"_renamed"
time_op = []
for iter in range(num_samples):
Expand Down Expand Up @@ -237,7 +287,11 @@ def _record_time_of_operation(mount_point, dir, num_samples):
# Collecting metrics for non-nested folders.
for folder in dir["folders"]["folder_structure"]:
results[folder["name"]] = _record_time_for_folder_rename(mount_point,folder,num_samples)
#TODO Add metric collection logic for nested-folders

nested_folder={
"name": dir["nested_folders"]["folder_name"]
}
results[dir["nested_folders"]["folder_name"]] = _record_time_for_folder_rename(mount_point,nested_folder,num_samples)
return results


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,34 @@

class TestRenamingBenchmark(unittest.TestCase):

def test_calculate_num_files(self):
dir = {
"name": "gcs_bucket",
"nested_folders": {
"folder_name": "nested_folder",
"num_folders":2,
"folder_structure": [
{
'name': "test_nfolder1",
"num_files": 2,
"file_name_prefix": "file",
"file_size": "1kb"
},
{
'name': "test_nfolder2",
"num_files": 1,
"file_name_prefix": "file",
"file_size": "1kb"
}
]
}
}
expected_count_of_files = 3

num_files = renaming_benchmark._calculate_num_files(dir["nested_folders"]["folder_structure"])

self.assertEqual(num_files, expected_count_of_files)

@patch('subprocess.call')
@patch('time.time')
def test_record_time_for_folder_rename(self,mock_time,mock_subprocess):
Expand Down Expand Up @@ -51,9 +79,15 @@ def test_record_time_of_operation(self,mock_time,mock_subprocess):
"num_files": 1,
"file_name_prefix": "file",
"file_size": "1kb"
},
}
]
},
"nested_folders": {
"folder_name": "nested_folder",
"num_folders":1,
"folder_structure": [
{
'name': "test_folder2",
'name': "test_nfolder1",
"num_files": 1,
"file_name_prefix": "file",
"file_size": "1kb"
Expand All @@ -63,11 +97,11 @@ def test_record_time_of_operation(self,mock_time,mock_subprocess):
}
num_samples=2
mock_time.side_effect = [1.0, 2.0, 3.0, 4.0,1.0, 2.0, 3.0, 4.0]
expected_time_of_operation={'test_folder1':[1.0,1.0] ,'test_folder2':[1.0,1.0]}
expected_time_of_operation={'test_folder1':[1.0,1.0] ,'nested_folder':[1.0,1.0]}
expected_subprocess_calls=[call("mv ./gcs_bucket/test_folder1 ./gcs_bucket/test_folder1_renamed",shell=True),
call("mv ./gcs_bucket/test_folder1_renamed ./gcs_bucket/test_folder1",shell=True),
call("mv ./gcs_bucket/test_folder2 ./gcs_bucket/test_folder2_renamed",shell=True),
call("mv ./gcs_bucket/test_folder2_renamed ./gcs_bucket/test_folder2",shell=True),]
call("mv ./gcs_bucket/nested_folder ./gcs_bucket/nested_folder_renamed",shell=True),
call("mv ./gcs_bucket/nested_folder_renamed ./gcs_bucket/nested_folder",shell=True),]

time_op=renaming_benchmark._record_time_of_operation(mount_point,dir,num_samples)

Expand Down Expand Up @@ -164,6 +198,34 @@ def test_compute_metrics_from_op_time(self):

self.assertEqual(metrics,expected_metrics)

def test_create_row_of_values(self):
metrics={
'Number of samples':2,
'Mean':1.0,
'Median':1.0,
'Standard Dev':0,
'Min': 1.0,
'Max':1.0,
'Quantiles':{'0 %ile': 1.0, '20 %ile': 1.0, '50 %ile': 1.0,
'90 %ile': 1.0, '95 %ile': 1.0, '98 %ile': 1.0,
'99 %ile': 1.0, '99.5 %ile': 1.0, '99.9 %ile': 1.0,
'100 %ile': 1.0}
}
operation="renaming test"
test_type="flat"
num_files=1
num_folders=1
expected_row=[
"renaming test",
"flat",
1,1,2,1.0,1.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
]

row=renaming_benchmark._create_row_of_values(operation,test_type,num_files,num_folders,metrics)

self.assertEqual(row,expected_row)


def test_get_values_to_export(self):
dir = {
"name": "gcs_bucket",
Expand All @@ -176,6 +238,18 @@ def test_get_values_to_export(self):
"file_size": "1kb"
}
]
},
"nested_folders": {
"folder_name": "nested_folder",
"num_folders":1,
"folder_structure": [
{
'name': "test_nfolder1",
"num_files": 1,
"file_name_prefix": "file",
"file_size": "1kb"
}
]
}
}
metrics={
Expand All @@ -190,10 +264,24 @@ def test_get_values_to_export(self):
'90 %ile': 1.0, '95 %ile': 1.0, '98 %ile': 1.0,
'99 %ile': 1.0, '99.5 %ile': 1.0, '99.9 %ile': 1.0,
'100 %ile': 1.0}
},
"nested_folder": {
'Number of samples':2,
'Mean':1.0,
'Median':1.0,
'Standard Dev':0,
'Min': 1.0,
'Max':1.0,
'Quantiles':{'0 %ile': 1.0, '20 %ile': 1.0, '50 %ile': 1.0,
'90 %ile': 1.0, '95 %ile': 1.0, '98 %ile': 1.0,
'99 %ile': 1.0, '99.5 %ile': 1.0, '99.9 %ile': 1.0,
'100 %ile': 1.0}
}
}
test_type="flat"
expected_export_values=[['Renaming Operation','flat',1,1,2,1.0,1.0,0,1.0,1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],
['Renaming Operation Nested','flat',1,1,2,1.0,1.0,0,1.0,1.0,
1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]]

values_to_export = renaming_benchmark._get_values_to_export(dir,metrics,test_type)
Expand Down

0 comments on commit 6bdb29d

Please sign in to comment.