-
Notifications
You must be signed in to change notification settings - Fork 0
/
normalize-csv.py
85 lines (74 loc) · 3.13 KB
/
normalize-csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
Normalizes magnitude of values in specified column of a CSV file, removes unit and places it in the column header.
Usage: python .\normalize-csv.py "file.csv" Mean
"""
import argparse
import csv
import re
import os
parser = argparse.ArgumentParser(description="Clean up units in CSV file.")
parser.add_argument('filename', metavar='filename',
help='file to process.')
parser.add_argument('column', metavar='column',
help='name of column to process.')
parser.add_argument('unit', metavar='unit', nargs='?',
help='target unit. Leave blank to use the most popular unit.')
args = parser.parse_args()
REGEX = r"([\d,]*(\.\d+)?)\s*((\D)*)"
FACTORS = {
'ns': 1, 'us': 1000, 'ms': 1000*1000, 's': 1000*1000*1000,
'B': 1, 'KB': 1000*1000, 'MB': 1000*1000*1000, 'GB': 1000*1000*1000*1000
}
# Back up the file
originalName = args.filename + '.original'
os.replace(args.filename, originalName)
with open(originalName, 'r') as csvfile:
reader = csv.DictReader(csvfile)
rows = list(reader)
headers = list(rows[0])
unit = args.unit
if unit is None:
# Find the most common unit
unitHistogram = {}
for row in rows:
target = row[args.column]
if target is None:
raise ValueError()
matches = re.search(REGEX, target)
currentUnit = matches.group(3)
unitHistogram.setdefault(currentUnit, 0)
unitHistogram[currentUnit] = unitHistogram[currentUnit] + 1
unit = max(unitHistogram, key=unitHistogram.get)
print('Determined target conversion unit: ' + unit)
else:
print('Using provided conversion unit: ' + unit)
# Replace the header
newColumnName = "{0} [{1}]".format(args.column, unit)
for header in headers:
if header == args.column:
headers[headers.index(header)] = newColumnName
# Convert values to the target unit
with open(args.filename, 'w+', newline='') as newFile:
writer = csv.DictWriter(newFile, fieldnames = headers)
writer.writeheader()
for row in rows:
target = row[args.column]
if target is None:
raise ValueError()
matches = re.search(REGEX, target)
currentUnit = matches.group(3)
newValue = matches.group(1)
if (currentUnit not in FACTORS):
print('No known conversion for ' + matches.group(3))
newValue = float(matches.group(1).replace(',',''))
elif (currentUnit == unit):
#Debug: print('No conversion necessary for ' + matches.group(1) + '[' + matches.group(3) + ']')
newValue = float(matches.group(1).replace(',',''))
else:
factor = FACTORS[currentUnit] / FACTORS[unit]
newValue = float(matches.group(1).replace(',','')) * factor
#Debug: print('Converting ' + matches.group(1) + '[' + matches.group(3) + '] into ' + str(newValue) + ' [' + unit + ']' )
row.pop(args.column)
row[newColumnName] = newValue
writer.writerow(row)
print('Done.')