-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_given_data_preprocessing.py
66 lines (60 loc) · 2.38 KB
/
run_given_data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re
import os
import shutil
from run_rbm2supervised import read_data
import numpy as np
from PIL import Image
# Filename layouts recognised in the given data, tried in order.  Each entry is
# (compiled pattern, extension used for the copy).  Patterns are matched from
# the start of the name only (``re.match``) and are not anchored at the end, so
# every "...jpg...png" variant must be listed before its plain "...jpg"
# counterpart, otherwise the jpg pattern would shadow it.
_LABELLED_NAME_PATTERNS = (
    (re.compile(r"(?P<digit>\d)_\d+\..*png"), "png"),
    (re.compile(r"\d+\.(?P<digit>\d).*png"), "png"),
    (re.compile(r"(?P<digit>\d)\.jpg.*png"), "png"),
    (re.compile(r"(?P<digit>\d)\.jpg"), "jpg"),
    (re.compile(r"\d+-(?P<digit>\d)\.jpg.*png"), "png"),
    (re.compile(r"\d+-(?P<digit>\d)\.jpg"), "jpg"),
)


def rename_given_data_by_label(path_to_input, path_to_output):
    """
    Copy the given data into one flat directory using the 'idx[digit].*' format.

    Walks ``path_to_input`` recursively.  Every regular file whose name matches
    one of the known layouts is copied (the source is left in place) to
    ``path_to_output`` as ``"<idx>[<digit>].<ext>"``, where ``digit`` is the
    label parsed from the original name and ``idx`` is a running counter.
    Paths of files that match no layout are printed so they can be inspected.

    The counter is the module-level global ``idx``: numbering continues across
    the recursive calls and across separate invocations.  If the global has not
    been defined yet it starts at 0.

    :param path_to_input: directory holding the original (possibly nested) data
    :param path_to_output: directory receiving the copies; created if missing
    """
    global idx
    # Keep an existing counter value; only fall back to 0 when the module was
    # imported without anybody initialising ``idx`` first.
    idx = globals().get("idx", 0)

    os.makedirs(path_to_output, exist_ok=True)

    for entry in os.listdir(path_to_input):
        full_path = os.path.join(path_to_input, entry)

        if os.path.isdir(full_path):
            # Sub-directories are flattened into the same output directory.
            rename_given_data_by_label(full_path, path_to_output)
            continue
        if not os.path.isfile(full_path):
            continue

        for pattern, image_format in _LABELLED_NAME_PATTERNS:
            match = pattern.match(entry)
            if match:
                target = "%d[%s].%s" % (idx, match.group("digit"), image_format)
                shutil.copy(full_path, os.path.join(path_to_output, target))
                idx += 1
                break
        else:
            # No known layout: report the file instead of guessing a label.
            print(full_path)
def clean_repeated_train_data(path_train, path_test):
    """
    Delete every training image that is pixel-identical to some test image.

    The test set is loaded once through ``read_data`` at 32x32.  Each file in
    ``path_train`` is opened, converted to 8-bit grayscale ("L"), resized to the
    same size and compared against all test images; when the summed absolute
    difference to at least one of them is zero, the training file is removed
    from disk and its path is printed.

    NOTE(review): assumes ``read_data`` returns the test images as an array of
    shape (N, 32, 32) on the same 0-255 grayscale scale as the array built
    here - confirm against ``run_rbm2supervised.read_data``.

    :param path_train: directory whose duplicate images are deleted in place
    :param path_test: directory with the reference (test) images
    """
    size = (32, 32)
    test_data, _ = read_data(path_test, size)

    for filename in os.listdir(path_train):
        fullpath = os.path.join(path_train, filename)
        if not os.path.isfile(fullpath):
            # Only regular files can be images; leave sub-directories alone.
            continue

        # Close the image file deterministically before a possible removal.
        with Image.open(fullpath) as image:
            arr = np.asarray(image.convert("L").resize(size))

        # Per-test-image sum of absolute pixel differences; 0 means identical.
        distance = np.sum(np.abs(test_data - np.expand_dims(arr, 0)), axis=(-2, -1))
        same = np.count_nonzero(distance == 0)
        if same != 0:
            os.remove(fullpath)
            print("remove:", fullpath)
if __name__ == "__main__":
    # Module-level counter consumed by rename_given_data_by_label().
    idx = 0

    # Step 1 (currently disabled): flatten and relabel the raw data.
    # rename_given_data_by_label("GIVEN_TRAIN_DATA", "TRAIN")

    # Step 2: drop training images that also occur in the test set.
    train_dir, test_dir = "TRAIN", "TEST"
    clean_repeated_train_data(train_dir, test_dir)