-
Notifications
You must be signed in to change notification settings - Fork 1
/
latent_reprocessor.py
71 lines (64 loc) · 2.27 KB
/
latent_reprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
Iterates through the output of the prediction loop from QuackAutoencoder and structures the files for use
as a dataset.
"""
from autoencoder import item_path
from argparse import ArgumentParser
from pathlib import Path
import os
import pickle
import shutil
def main() -> None:
    """
    The reprocessing logic.

    Walks the source tree, loads each pickled item, tallies its censorship
    label, and moves the file into the structured storage tree expected by
    the QuackImageDataset. A `metadata.pyc` summary pickle is written at the
    storage root when the walk completes.

    **Required** arguments are:

    `--source_path`
        *str* **Required** The path to top dir of the QuackIterableDataset.
    `--storage_path`
        *str* **Required** The top directory of the data storage tree for the QuackImageDataset.

    Returns
    -------
    void
    """
    # Add args to make a more flexible cli tool.
    arg_parser = ArgumentParser()
    arg_parser.add_argument('--source_path', type=str, required=True)
    arg_parser.add_argument('--storage_path', type=str, required=True)
    args = arg_parser.parse_args()
    # Initialize: one counter per censorship label, plus the running total.
    count = 0
    metadata = {
        'censored': 0,
        'undetermined': 0,
        'uncensored': 0,
        'length': 0
    }
    # dirs/files from os.walk are unused: we glob each directory instead.
    for root, _dirs, _files in os.walk(args.source_path):
        for file in Path(root).glob('*.pyc'):
            # NOTE(security): pickle.load executes arbitrary code on crafted
            # input. Only run this tool on data produced by QuackAutoencoder.
            with file.open('rb') as source:
                item = pickle.load(source)
            # Ensure storage is ready. item_path is assumed to return a path
            # fragment meant for string concatenation onto storage_path; do
            # not switch to Path joining, since a leading separator in the
            # fragment would make Path discard the base directory.
            storage_path = Path(args.storage_path + item_path(count, dir_only=True))
            storage_path.mkdir(parents=True, exist_ok=True)
            data_storage = Path(args.storage_path + item_path(count, 'pyc'))
            # Count: tally by the item's censorship label (1 / 0 / -1).
            censored = item['metadata']['censored']
            if censored == 1:
                metadata['censored'] += 1
            elif censored == 0:
                metadata['undetermined'] += 1
            elif censored == -1:
                metadata['uncensored'] += 1
            # Move: done after the `with` block so the file handle is closed
            # first — moving a file that is still open fails on Windows.
            shutil.move(file, data_storage)
            count += 1
            if count % 10000 == 0:
                print(f'Processed {count:,} items.')
    metadata['length'] = count
    # Persist the aggregate counts at the storage root.
    root_meta = Path(args.storage_path + '/metadata.pyc')
    with root_meta.open(mode='wb') as stored_dict:
        pickle.dump(metadata, stored_dict)
    # Use the same thousands-separator formatting as the progress messages.
    print(f'{count:,} items re-stored as filtered data.')
    for key, value in metadata.items():
        print(f'{key}: {value}')
# Script entry point: run the reprocessor only when executed directly.
if __name__ == '__main__':
    main()