diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py index c6186ce2..d8f398e5 100644 --- a/src/coral_models/prepare_raw_data.py +++ b/src/coral_models/prepare_raw_data.py @@ -344,9 +344,16 @@ def prepare_raw_data( # audio. read_aloud_duration = 0.0 conversation_duration = 0.0 + rows_to_remove: list[int] = [] for row_i, row in tqdm(recordings.iterrows()): filename = input_path / row["filename"] + # Check if the file is empty, and if it is, remove it from the dataframe + # and continue to the next file + if filename.stat().st_size < 200000: # Any file smaller than this is empty + rows_to_remove.append(row_i) + continue + # Get the new filename # New filename is in the format is for conversations: # "recording_id_speaker_id1_speaker_id2_recorder_speaker_id_conversation.wav" @@ -400,6 +407,9 @@ def prepare_raw_data( except FileNotFoundError: pass + # Remove rows with empty files + recordings = recordings.drop(rows_to_remove).reset_index(drop=True) + # Write a README file readme = make_readme() with open(output_path / "README.md", "w") as f: