-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRandomSampleSelection.py
172 lines (129 loc) · 6.26 KB
/
RandomSampleSelection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 15 12:49:46 2024
@author: Gouwe-Gozer
"""
###############################################################################
# 15/03/2024
# This script loads files listed in a folder (or subfolders of a folder), sets
# certain requirements for the files and then takes a random subsample of the
# files.
# The file names can then be written to a csv-file and/or the files can be copied
# to a different folder.
###############################################################################
### Loading packages ###
try: # First see if the packages are already installed
import os
import random
import shutil
import pandas as pd
from datetime import datetime
except ModuleNotFoundError: # Else install the packages first
# !pip install pandas   (os, random, shutil and datetime ship with Python and cannot be pip-installed)
import os
import random
import shutil
import pandas as pd
from datetime import datetime
# The imports above will still raise a ModuleNotFoundError until the kernel has been restarted:
# Console > Restart kernel, or Ctrl + .
# Then rerun the code above.
############################# Variables ###################################
# This section holds every setting the script reads. Edit the values below;
# nothing after the end of this section should need changing.

# Folder where your files are located.
input_folder = 'C:/Users/User/Documents/main_folder/'
# Are your files nested?
# i.e. are the files located in subfolders of the input_folder?
# True  -> input_folder and all of its subfolders are searched (os.walk)
# False -> only input_folder itself is searched (os.listdir)
nested = True
# How many files should the random subsample contain?
# This cannot be larger than the number of files that meet the requirements.
sample_size = 120
# Do you want to save the names of the files in your subsample to a csv?
write_to_csv = True
# If so, list the directory you wish to save the csv to.
# End the directory with a path separator ("/").
output_csv = "C:/Users/User/Documents/"
# Base name of the csv. Today's date (YYYYMMDD_) is put in front of it and
# ".csv" is added behind it.
csv_name = "subsample"
# Do you want to copy the selected files to a new folder?
copy_to_new_dir = True
# If so, list the folder you wish to copy the files to.
# This folder has to be empty, otherwise the script stops with an error.
output_folder = "C:/Users/User/Documents/subset_folder/"
#### File name requirements ####
# Do you only want to select certain file types/ a certain file type?
extension_requirement = True
# File names are converted to lower case before they are compared, so write
# the extension in lower case.
extension = ".png"
# Do you only want to select files that contain a certain string?
name_requirement = True
# File name must contain (not case sensitive, write it in lower case):
must_contain = "parrot"
# Alternatively, do you want to exclude any files based on their file name?
exclude = True
# If so, file name cannot contain (not case sensitive, write it in lower case):
cannot_contain = "donkey"
### End of variable section ###
### The script should not require any user input after these lines ###
###############################################################################
###############################################################################
# Dedicated exception type so problems are reported to the end user with a
# readable message instead of a generic Python error.
class GouweGozerSays(Exception):
    """Error raised by this script to tell the end user what went wrong."""
# --- Collect every file that meets the file-name requirements ---------------
# file_list holds the bare file names; relative_path_list (nested mode only)
# holds the matching paths in the same order.

# Lower-case the filter strings once: they are compared against lower-cased
# file names, so this is what makes the filters truly case-insensitive.
_extension_lc = extension.lower()
_must_contain_lc = must_contain.lower()
_cannot_contain_lc = cannot_contain.lower()


def _meets_requirements(file_name):
    """Return True when *file_name* passes every enabled file-name filter.

    The three filters (extension, must-contain, cannot-contain) are
    independent: each one is only checked when its switch is turned on, and a
    file has to pass all of the enabled ones.
    """
    name_lc = file_name.lower()
    if extension_requirement and not name_lc.endswith(_extension_lc):
        return False
    if name_requirement and _must_contain_lc not in name_lc:
        return False
    # Checked on its own: previously this test was nested inside the
    # name_requirement clause, so `exclude` was ignored whenever
    # name_requirement was switched off.
    if exclude and _cannot_contain_lc in name_lc:
        return False
    return True


file_list = []
relative_path_list = []

if nested:
    # os.walk visits the input folder and all of its subfolders; `files`
    # only contains regular file names, never directories.
    for root, _dirs, files in os.walk(input_folder):
        for file in files:
            if _meets_requirements(file):
                file_list.append(file)
                # Remember where the file lives so it can be copied later
                relative_path_list.append(os.path.join(root, file))
else:
    # os.listdir also returns subfolder names, so keep regular files only;
    # a directory cannot be copied with shutil.copy later on.
    for file in os.listdir(input_folder):
        if _meets_requirements(file) and os.path.isfile(os.path.join(input_folder, file)):
            file_list.append(file)

print(f"I found a total of {len(file_list)} files in the folder that met your specifications. Does that seem right to you?")
# --- Draw the random subsample ----------------------------------------------
# Validate first: random.sample raises a bare ValueError when asked for more
# items than exist, and the percentage below divides by len(file_list).
if not file_list:
    raise GouweGozerSays("I could not find any files that meet your specifications, so there is nothing to sample from. Check the input folder and the file name requirements.")
if not 0 <= sample_size <= len(file_list):
    raise GouweGozerSays(f"The sample size ({sample_size}) must be between 0 and the number of files I found ({len(file_list)}). Lower the sample size or relax the file name requirements.")

# Sampling is without replacement, so every position in file_list is picked
# at most once.
subsample = random.sample(file_list, sample_size)
print(f"I created a random subsample of {sample_size} from all the files that I found. That's {(sample_size/len(file_list))*100}% of all the files! ")
# --- Look up the path of every sampled file (nested mode only) --------------
if nested:
    # Group the collected paths by file name once, instead of rescanning the
    # whole path list for every sampled file. os.path.basename is used rather
    # than splitting on "\\" so the lookup works with any path separator.
    _paths_by_name = {}
    for path in relative_path_list:
        _paths_by_name.setdefault(os.path.basename(path), []).append(path)

    # subsample_paths[i] is the path that belongs to subsample[i].
    subsample_paths = []
    _times_used = {}
    for filename in subsample:
        candidates = _paths_by_name.get(filename, [])
        used = _times_used.get(filename, 0)
        if used >= len(candidates):
            raise GouweGozerSays(f"I could not find the location of {filename} in the input folder.")
        # When the same file name occurs in several subfolders and is sampled
        # more than once, hand out a different path each time instead of
        # always returning the first match.
        subsample_paths.append(candidates[used])
        _times_used[filename] = used + 1
# --- Copy the sampled files to the output folder ----------------------------
if copy_to_new_dir:
    # Create the output folder when it does not exist yet; os.listdir would
    # otherwise fail with FileNotFoundError.
    os.makedirs(output_folder, exist_ok=True)

    # Refuse to mix the subset with files that are already there.
    # (This used to raise the undefined name `MinnertSays`, which produced a
    # NameError instead of the message below.)
    if os.listdir(output_folder):
        raise GouweGozerSays(f"The output folder ({output_folder}) is not empty. Make sure you move or delete all current files before copying the subset files to the folder. This way the output folder only contians your subset. ")

    if nested:
        # Every sampled name needs exactly one source path; stop with a clear
        # message rather than silently copying fewer files.
        if len(subsample_paths) != len(subsample):
            raise GouweGozerSays(f"I found the location of {len(subsample_paths)} files, but the subsample contains {len(subsample)} files.")
        # NOTE(review): files are stored under their bare file name, so two
        # sampled files with the same name from different subfolders end up
        # at the same destination and the later copy replaces the earlier one.
        for source_file_path, file in zip(subsample_paths, subsample):
            destination_file_path = os.path.join(output_folder, file)
            shutil.copy(source_file_path, destination_file_path)
    else:
        for file in subsample:
            source_file_path = os.path.join(input_folder, file)
            destination_file_path = os.path.join(output_folder, file)
            shutil.copy(source_file_path, destination_file_path)

    # Count what is actually in the folder instead of assuming success
    print(f"{len(os.listdir(output_folder))} files copied to {output_folder}")
# --- Write the names of the sampled files to a csv ---------------------------
if write_to_csv:
    # One column, one row per sampled file name
    csv_subsample = pd.DataFrame({
        'file name': subsample,
    })

    # Prefix the csv name with today's date (YYYYMMDD_)
    today = datetime.today().date()
    today_string = today.strftime("%Y%m%d") + "_"

    # Join directory and file name with os.path.join so the csv lands inside
    # output_csv even when that setting has no trailing path separator.
    file_name_csv = os.path.join(output_csv, today_string + csv_name + ".csv")

    csv_subsample.to_csv(file_name_csv, index=False)
    print(f"list of subsample file names written to {file_name_csv}")