-
Notifications
You must be signed in to change notification settings - Fork 0
/
clams_processing.py
executable file
·359 lines (268 loc) · 14.7 KB
/
clams_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
import glob
import os
import re
from datetime import timedelta
import numpy as np
import pandas as pd
def clean_all_clams_data(directory_path):
    """Reformat all CLAMS data files (.csv) in the provided directory by dropping unnecessary rows.

    Parameters:
        directory_path (string): directory containing .csv files to clean

    Returns:
        Nothing. Prints new filenames saved to "Cleaned_CLAMS_data" directory.
    """

    def clean_file(file_path, output_directory):
        """Clean a single raw CLAMS CSV and write it to output_directory.

        Raises:
            ValueError: if the file header contains no 'Subject ID' line.
        """
        # Read the file as plain text to extract metadata
        with open(file_path, 'r') as f:
            lines = f.readlines()

        # Extract the "Subject ID" value from the metadata header.
        # A file without a 'Subject ID' line previously crashed with an
        # opaque NameError; fail fast with a clear message instead.
        subject_id = None
        for line in lines:
            if 'Subject ID' in line:
                subject_id = line.split(',')[1].strip()
                break
        if subject_id is None:
            raise ValueError(f"No 'Subject ID' line found in {file_path}")

        # Read the data chunk of the CSV file (first 22 rows are metadata)
        df = pd.read_csv(file_path, skiprows=range(0, 22))
        # Drop additional 2 formatting rows
        df.drop([0, 1], inplace=True)

        # Construct the new file name: "<base>_ID<subject id><ext>"
        file_name = os.path.basename(file_path)
        base_name, ext = os.path.splitext(file_name)
        ext = ext.lower()
        new_file_name = f"{base_name}_ID{subject_id}{ext}"

        # Save the cleaned data to the new directory
        output_path = os.path.join(output_directory, new_file_name)
        df.to_csv(output_path, index=False)
        print(f"Cleaning {file_name}")

    # Create the output directory if it doesn't exist
    output_directory = os.path.join(directory_path, "Cleaned_CLAMS_data")
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Process all CSV files in the directory, regardless of extension case
    csv_pattern = re.compile(r"\.csv$", re.IGNORECASE)
    all_files = glob.iglob(os.path.join(directory_path, "*"))
    csv_files = [file_path for file_path in all_files if csv_pattern.search(file_path)]

    for file_path in csv_files:
        clean_file(file_path, output_directory)
def trim_all_clams_data(directory_path, trim_hours, keep_hours, start_dark):
    """Trims all cleaned CLAMS data files in the specified directory.

    Trimming removes at least `trim_hours` hours from the start of each file,
    then advances to the next light-cycle change so the kept window starts at
    the beginning of a dark cycle (if `start_dark`) or a light cycle
    (otherwise), and keeps `keep_hours` hours from that point.

    Parameters:
        directory_path (string): path to the directory containing cleaned .csv files
        trim_hours (int): number of hours to trim from the beginning
        keep_hours (int): number of hours to keep in the resulting file
        start_dark (bool): True to start the kept window on a dark cycle
            (LED LIGHTNESS == 0), False to start on a light cycle

    Returns:
        Nothing. Saves the trimmed data to new CSV files in the "Trimmed_CLAMS_data" directory.
    """
    # Create a new directory for trimmed files if it doesn't exist
    trimmed_directory = os.path.join(directory_path, "Trimmed_CLAMS_data")
    if not os.path.exists(trimmed_directory):
        os.makedirs(trimmed_directory)
    # Get the path to the cleaned data files
    cleaned_directory = os.path.join(directory_path, "Cleaned_CLAMS_data")
    # List all files in the directory
    files = [f for f in os.listdir(cleaned_directory) if
             os.path.isfile(os.path.join(cleaned_directory, f)) and f.endswith('.csv')]
    for file in files:
        file_path = os.path.join(cleaned_directory, file)
        # Read the cleaned CSV file
        df = pd.read_csv(file_path)
        # Convert the 'DATE/TIME' column to datetime format
        df['DATE/TIME'] = pd.to_datetime(df['DATE/TIME'], errors='coerce')
        # Index of the first row at least trim_hours after the first timestamp.
        # NOTE(review): mixes the label from .index[0] with .iloc positions —
        # only valid because read_csv yields a default RangeIndex; confirm.
        start_index = df[df['DATE/TIME'] >= df['DATE/TIME'].iloc[0] + timedelta(hours=trim_hours)].index[0]
        # Note the value in the "LED LIGHTNESS" column after trimming
        initial_led_value = df['LED LIGHTNESS'].iloc[start_index]
        # Advance to the next change in the "LED LIGHTNESS" value.
        # NOTE(review): raises IndexError if no light change occurs before the
        # end of the file — assumes recordings span at least one cycle change.
        while df['LED LIGHTNESS'].iloc[start_index] == initial_led_value:
            start_index += 1
        # If the first light change does not match the cycle requested by the
        # caller (dark == LED LIGHTNESS 0), advance to the following change
        if (start_dark and df['LED LIGHTNESS'].iloc[start_index] != 0) or (not start_dark and df['LED LIGHTNESS'].iloc[start_index] == 0):
            initial_led_value = df['LED LIGHTNESS'].iloc[start_index]
            while df['LED LIGHTNESS'].iloc[start_index] == initial_led_value:
                start_index += 1
        # Re-baseline accumulative columns against the row just before the kept
        # window, so accumulated values start near zero regardless of trim length
        columns_to_zero = ['ACCO2', 'ACCCO2', 'FEED1 ACC', 'WHEEL ACC']
        for col in columns_to_zero:
            df[col] = (df[col] - df[col].iloc[start_index - 1]).round(2)
        # Calculate the ending timestamp
        end_time = df['DATE/TIME'].iloc[start_index] + timedelta(hours=keep_hours)
        # Filter the dataframe from calculated start_index to end_time
        df_result = df[(df.index >= start_index) & (df['DATE/TIME'] <= end_time)]
        # Save the resulting data to a new CSV file in the "Trimmed_CLAMS_data" directory
        file_name = os.path.basename(file_path)
        base_name, ext = os.path.splitext(file)
        ext = ext.lower()
        new_file_name = os.path.join(trimmed_directory, f"{base_name}_trimmed{ext}")
        df_result.to_csv(new_file_name, index=False)
        print(f"Trimming {file_name}")
def bin_clams_data(file_path, bin_hours):
    """Aggregate a trimmed CLAMS CSV into fixed-width time bins.

    Bin labels are assigned separately within each "LED LIGHTNESS" phase
    (a new bin starts once `bin_hours` have elapsed since the bin's first
    timestamp). Rows are then grouped by (LED LIGHTNESS, BIN): accumulative
    columns keep their last value, per-interval counts are summed, and all
    remaining numeric columns are averaged.

    Parameters:
        file_path (string): path to a trimmed CLAMS .csv file
        bin_hours (int): width of each bin, in hours

    Returns:
        Nothing. Writes the binned data to a parallel
        "<bin_hours>hour_bins_Binned_CLAMS_data" directory.
    """
    df = pd.read_csv(file_path)
    bin_hours = int(bin_hours)
    # Convert 'DATE/TIME' column to datetime format
    df['DATE/TIME'] = pd.to_datetime(df['DATE/TIME'])
    # Drop unnecessary columns (ignore any that are already absent)
    columns_to_drop = ["STATUS1", "O2IN", "O2OUT", "DO2", "CO2IN", "CO2OUT", "DCO2", "XTOT", "YTOT", "LED HUE",
                       "LED SATURATION", "BIN"]
    df = df.drop(columns=columns_to_drop, errors='ignore')
    # Add AMB (total X+Y ambulatory counts) & AMB ACC (its running sum)
    df['AMB'] = df['XAMB'] + df['YAMB']
    df['AMB ACC'] = df['AMB'].cumsum()
    # Create a new column for bin labels
    df['BIN'] = np.nan
    # For each unique "LED LIGHTNESS" value, assign bin labels:
    # increment the label once bin_hours have passed since the bin started
    for led_value in df['LED LIGHTNESS'].unique():
        subset = df[df['LED LIGHTNESS'] == led_value].copy()
        start_time = subset['DATE/TIME'].iloc[0]
        bin_label = 0
        bin_labels = []
        for timestamp in subset['DATE/TIME']:
            if (timestamp - start_time) >= timedelta(hours=bin_hours):
                bin_label += 1
                start_time = timestamp
            bin_labels.append(bin_label)
        df.loc[subset.index, 'BIN'] = bin_labels
    # Columns to retain the last value in the bin (accumulative/positional)
    last_val_columns = ["INTERVAL", "CHAN", "DATE/TIME", "ACCO2", "ACCCO2", "FEED1 ACC", "WHEEL ACC", "AMB ACC"]
    # Columns to sum within the bin
    sum_columns = ["WHEEL", "FEED1", "AMB"]
    # Columns to average (excluding the ones we're taking the last value or summing)
    avg_columns = df.columns.difference(last_val_columns + sum_columns + ['BIN', 'LED LIGHTNESS'])
    # Group by "LED LIGHTNESS" and "BIN" and calculate the mean, sum, or last value as appropriate
    df_binned = df.groupby(['LED LIGHTNESS', 'BIN']).agg({**{col: 'last' for col in last_val_columns},
                                                          **{col: 'mean' for col in avg_columns},
                                                          **{col: 'sum' for col in sum_columns}}).reset_index()
    # Add start and end time columns for each bin
    start_times = df.groupby(['LED LIGHTNESS', 'BIN'])['DATE/TIME'].first().reset_index(name='DATE/TIME_start')
    end_times = df.groupby(['LED LIGHTNESS', 'BIN'])['DATE/TIME'].last().reset_index(name='DATE/TIME_end')
    df_binned = pd.merge(df_binned, start_times, on=['LED LIGHTNESS', 'BIN'])
    df_binned = pd.merge(df_binned, end_times, on=['LED LIGHTNESS', 'BIN'])
    # Add start and end interval columns
    start_intervals = df.groupby(['LED LIGHTNESS', 'BIN'])['INTERVAL'].first().reset_index(name='INTERVAL_start')
    end_intervals = df.groupby(['LED LIGHTNESS', 'BIN'])['INTERVAL'].last().reset_index(name='INTERVAL_end')
    df_binned = pd.merge(df_binned, start_intervals, on=['LED LIGHTNESS', 'BIN'])
    df_binned = pd.merge(df_binned, end_intervals, on=['LED LIGHTNESS', 'BIN'])
    # Calculate the duration of each bin in hours
    df_binned['DURATION'] = (df_binned['DATE/TIME_end'] - df_binned['DATE/TIME_start']).dt.total_seconds() / 3600
    # Drop rows with a duration of 0 (degenerate single-row bins)
    df_binned = df_binned[df_binned['DURATION'] != 0]
    # Sort chronologically based on INTERVAL_start
    df_binned = df_binned.sort_values(by='INTERVAL_start')
    # Add a DAY column from the per-phase bin label.
    # NOTE(review): the divisor 12 assumes 12-hour light/dark phases — confirm
    df_binned['DAY'] = (df_binned['BIN'] // (12 / bin_hours) + 1).astype(int)
    # Reset index and add a new 'HOUR' column starting from 1
    df_binned.reset_index(drop=True, inplace=True)
    df_binned['HOUR'] = df_binned.index
    # Add a within-day bin counter (wraps every 24 h worth of bins)
    df_binned['24 HOUR'] = df_binned['HOUR'] % ( 24 // bin_hours)
    # Convert HOUR & 24 HOUR counters to elapsed hours
    df_binned['HOUR'] = (df_binned['HOUR'] + 1) * bin_hours
    df_binned['24 HOUR'] = (df_binned['24 HOUR'] + 1) * bin_hours
    # Reorder columns into the desired output layout
    desired_order = ["CHAN", "INTERVAL_start", "INTERVAL_end", "DATE/TIME_start", "DATE/TIME_end", "DURATION",
                     "VO2", "ACCO2", "VCO2", "ACCCO2", "RER", "HEAT", "FLOW", "PRESSURE", "FEED1", "FEED1 ACC",
                     "AMB", "AMB ACC", "WHEEL", "WHEEL ACC", "ENCLOSURE TEMP", "ENCLOSURE SETPOINT", "LED LIGHTNESS", "DAY", "HOUR", "24 HOUR"]
    df_binned = df_binned[desired_order]
    # Round all variables to 4 decimal places
    df_binned = df_binned.round(4)
    # Save the binned data to a new CSV file in a parallel binned-data directory
    output_path = file_path.replace(
        "Trimmed_CLAMS_data", f"{bin_hours}hour_bins_Binned_CLAMS_data"
    ).replace(
        ".csv", f"_{bin_hours}hour_bins.csv"
    )
    # Check if the directory exists, if not, create it
    output_directory = os.path.dirname(output_path)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    df_binned.to_csv(output_path, index=False)
def process_directory(directory_path, bin_hours):
    """Bin every trimmed CLAMS CSV found under directory_path.

    Looks in the "Trimmed_CLAMS_data" subdirectory and runs bin_clams_data
    on each .csv file it contains, printing a progress line per file.
    """
    trimmed_directory = os.path.join(directory_path, "Trimmed_CLAMS_data")
    for entry in os.listdir(trimmed_directory):
        full_path = os.path.join(trimmed_directory, entry)
        # Skip anything that is not a regular .csv file
        if not entry.endswith('.csv') or not os.path.isfile(full_path):
            continue
        bin_clams_data(full_path, bin_hours)
        print(f"Binning {entry}")
def extract_id_number(filename):
    """Return the digits following 'ID' in filename, or None if absent."""
    found = re.search(r'ID(\d+)', filename)
    return found.group(1) if found else None
def recombine_columns(directory_path, experiment_config_file, bin_hours):
    """Combine per-animal binned CLAMS files into one CSV per output variable.

    Each binned file contributes its rows, tagged with the animal ID parsed
    from the filename and the GROUP_LABEL looked up in the experiment config;
    one "<VARIABLE>.csv" is then written per variable in output_variables.

    Parameters:
        directory_path (string): root directory containing the
            "<bin_hours>hour_bins_Binned_CLAMS_data" directory
        experiment_config_file (string): path to a CSV with at least 'ID' and
            'GROUP_LABEL' columns mapping each animal to its group
        bin_hours (int): bin width (hours) used when the files were binned

    Returns:
        Nothing. Writes the combined per-variable CSVs into the
        "<bin_hours>hour_bins_Combined_CLAMS_data" directory.
    """
    # Use bin_hours to create a directory for each binned version
    bin_hours = int(bin_hours)
    # Define Combined CLAMS data directory for each bin window
    combined_directory = os.path.join(directory_path, f"{bin_hours}hour_bins_Combined_CLAMS_data")
    if not os.path.exists(combined_directory):
        os.makedirs(combined_directory)
    # Define input directory
    input_directory = os.path.join(directory_path, f"{bin_hours}hour_bins_Binned_CLAMS_data")
    # Desired output variables
    output_variables = ['ACCCO2', 'ACCO2', 'FEED1 ACC', 'FEED1', 'RER', 'AMB', 'AMB ACC', 'VCO2', 'VO2', 'WHEEL ACC', 'WHEEL']
    # Define columns to include in the output
    selected_columns = ['ID', 'GROUP_LABEL', 'DAY', 'HOUR', '24 HOUR'] + output_variables
    # Read the experiment configuration
    config_df = pd.read_csv(experiment_config_file)
    # NOTE(review): leftover debug print of the config columns — consider removing
    print(f'CONFIGRESULTS: {config_df.columns}')
    # Create an empty DataFrame to store the combined data
    combined_data = pd.DataFrame(columns=selected_columns)
    # Loop through all files in the specified directory
    for filename in os.listdir(input_directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(input_directory, filename)
            # Read the current .csv file into a DataFrame
            df = pd.read_csv(file_path)
            # Get the 'ID' number from the file name (e.g. "..._ID42_...")
            file_id = extract_id_number(filename)
            # Find the GROUP_LABEL for the current ID; blank if not configured
            group_label = config_df[config_df['ID'] == int(file_id)]['GROUP_LABEL'].values
            if len(group_label) > 0:
                group_label = group_label[0]
            else:
                group_label = ""
            # Add columns 'ID', 'DAY', 'HOUR', '24 HOUR'
            df['ID'] = file_id
            df['GROUP_LABEL'] = group_label
            df['DAY'] = df['DAY'].astype(int)
            df['HOUR'] = df['HOUR'].astype(int)
            df['24 HOUR'] = df['24 HOUR'].astype(int)
            # Filter and reorder columns
            df = df[selected_columns]
            # Append the data to the combined DataFrame
            combined_data = pd.concat([combined_data, df], ignore_index=True)
    # Group the combined data by the output variables and save to separate .csv files
    for variable in output_variables:
        output_filename = os.path.join(combined_directory, f"{variable}.csv")
        variable_data = combined_data[['ID', 'GROUP_LABEL', 'DAY', 'HOUR', '24 HOUR', variable]]
        variable_data.to_csv(output_filename, index=False)
def reformat_csv(input_csv_path, output_csv_path):
    """Reformat a CLAMS CSV file to a "tidy" wide format.

    Pivots the input so each row is one (ID, GROUP_LABEL, DAY) combination
    and each "24 HOUR" value becomes its own column named
    "<variable>_<hour>", where <variable> is the input's last column.

    Parameters:
        input_csv_path (string): path to a combined single-variable CSV with
            'ID', 'GROUP_LABEL', 'DAY', '24 HOUR' columns and the value
            column last
        output_csv_path (string): where to write the pivoted CSV

    Returns:
        Nothing. Writes the pivot table to output_csv_path.
    """
    df = pd.read_csv(input_csv_path)
    # Replace missing values in "GROUP_LABEL" with a placeholder value.
    # Assign the result rather than calling fillna(inplace=True) on a column
    # selection: chained inplace updates are deprecated and silently fail to
    # modify the frame under pandas copy-on-write.
    df["GROUP_LABEL"] = df["GROUP_LABEL"].fillna("NO_LABEL")
    # The last column holds the measured variable (e.g. VO2)
    last_column_name = df.columns[-1]
    # Pivot using "ID", "GROUP_LABEL", and "DAY" as the index and "24 HOUR"
    # as the columns; keep the first value per cell
    pivot_table = df.pivot_table(index=["ID", "GROUP_LABEL", "DAY"],
                                 columns="24 HOUR", values=last_column_name,
                                 aggfunc="first").reset_index()
    # Flatten the column index and rename the hour columns
    pivot_table.columns = ["ID", "GROUP_LABEL", "DAY"] + [f"{last_column_name}_{hour}" for hour in
                                                          pivot_table.columns[3:]]
    # Save the pivot table to a new CSV file
    pivot_table.to_csv(output_csv_path, index=False)
# Function to process all CSV files in a directory
def reformat_csvs_in_directory(input_dir):
    """Reformat every .csv in input_dir into a "Reformatted_CSVs" subdirectory.

    Fixes broken output naming: the filename previously contained a literal
    "(unknown)" placeholder, so every input wrote to the same
    "reformatted_(unknown)" path and clobbered earlier outputs. The real
    filename is now interpolated so each input maps to a unique output.

    Parameters:
        input_dir (string): directory containing combined per-variable CSVs

    Returns:
        Nothing. Writes "reformatted_<name>.csv" files and prints progress.
    """
    output_dir = os.path.join(input_dir, "Reformatted_CSVs")
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if filename.endswith(".csv"):
            input_csv_path = os.path.join(input_dir, filename)
            output_csv_path = os.path.join(output_dir, f"reformatted_{filename}")
            reformat_csv(input_csv_path, output_csv_path)
            print(f"Reformatting '{filename}' to 'reformatted_{filename}'")