-
Notifications
You must be signed in to change notification settings - Fork 0
/
removeUnchangedSubjects.py
46 lines (38 loc) · 1.2 KB
/
removeUnchangedSubjects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import argparse
from datetime import datetime
import pandas as pd
def find_unchanged(df_subjects: pd.DataFrame) -> pd.Series:
    """Return a boolean mask marking the rows whose key and value are unchanged.

    A row counts as unchanged when ``oldKey`` equals the whitespace-stripped
    ``newKey`` AND ``oldValue`` equals the whitespace-stripped ``newValue``.
    Only the "new" columns are stripped; the "old" columns are compared as-is.

    Missing cells (NaN) never compare equal, so a row with a missing key or
    value is treated as changed instead of raising on ``.strip()``.

    Args:
        df_subjects: frame with ``oldKey``, ``newKey``, ``oldValue`` and
            ``newValue`` columns; the two "new" columns must hold text.

    Returns:
        Boolean Series aligned with ``df_subjects``; True means "unchanged".
        The input frame is not modified.
    """
    new_key = df_subjects['newKey'].str.strip()
    new_value = df_subjects['newValue'].str.strip()
    same_key = df_subjects['oldKey'] == new_key
    same_value = df_subjects['oldValue'] == new_value
    return same_key & same_value


def main():
    """Read a subjects CSV, report value counts, and write only changed rows.

    The input file comes from ``-f/--file`` or, if absent, an interactive
    prompt. Side effects: writes ``newValuecounts.csv``,
    ``oldValuecounts.csv`` and a timestamped ``test_<datetime>.csv`` in the
    current directory, and prints the number of distinct new/old values
    followed by every dropped (unchanged) key/value pair.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--file', help='enter filename with csv.')
    args = parser.parse_args()
    filename = args.file or input('Enter filename (including \'.csv\'): ')

    df_subjects = pd.read_csv(filename, header=0)
    # Normalise newValue once, up front, so the counts and the output file
    # both see the stripped text.
    df_subjects['newValue'] = df_subjects['newValue'].str.strip()

    new_unique = df_subjects['newValue'].value_counts()
    new_unique.to_csv('newValuecounts.csv')
    print(len(new_unique))

    old_unique = df_subjects['oldValue'].value_counts()
    old_unique.to_csv('oldValuecounts.csv')
    print(len(old_unique))

    unchanged = find_unchanged(df_subjects)

    # Echo each row that is about to be dropped so the run can be audited.
    dropped = df_subjects[unchanged]
    dropped_pairs = zip(
        dropped['oldKey'],
        dropped['newKey'].str.strip(),
        dropped['oldValue'],
        dropped['newValue'],
    )
    for old_key, new_key, old_value, new_value in dropped_pairs:
        print(old_key, new_key)
        print(old_value, new_value)
        print("")

    # Filtering with the mask (rather than rebuilding a frame from a list of
    # rows) keeps the column header even when every row was dropped.
    dt = datetime.now().strftime('%Y-%m-%d %H.%M.%S')
    df_subjects[~unchanged].to_csv('test_'+dt+'.csv', index=False)


if __name__ == '__main__':
    main()