From 552c60175d3be8209ac6aff81263822c8c42dae9 Mon Sep 17 00:00:00 2001 From: Anders Jess Pedersen Date: Wed, 29 Nov 2023 14:13:09 +0100 Subject: [PATCH 1/2] fix: remove empty files. --- src/coral_models/prepare_raw_data.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py index c6186ce2..4e7f428b 100644 --- a/src/coral_models/prepare_raw_data.py +++ b/src/coral_models/prepare_raw_data.py @@ -344,9 +344,16 @@ def prepare_raw_data( # audio. read_aloud_duration = 0.0 conversation_duration = 0.0 + rows_to_remove = [] for row_i, row in tqdm(recordings.iterrows()): filename = input_path / row["filename"] + # Check if the file is empty, and if it is, remove it from the dataframe + # and continue to the next file + if filename.stat().st_size < 200000: # Any file smaller than this is empty + rows_to_remove.append(row_i) + continue + # Get the new filename # New filename is in the format is for conversations: # "recording_id_speaker_id1_speaker_id2_recorder_speaker_id_conversation.wav" @@ -400,6 +407,9 @@ def prepare_raw_data( except FileNotFoundError: pass + # Remove rows with empty files + recordings = recordings.drop(rows_to_remove).reset_index(drop=True) + # Write a README file readme = make_readme() with open(output_path / "README.md", "w") as f: From ff2a3fedfefa23dda6fc6aa24c75f981962f991d Mon Sep 17 00:00:00 2001 From: Anders Jess Pedersen <38854604+AJDERS@users.noreply.github.com> Date: Thu, 30 Nov 2023 10:15:22 +0100 Subject: [PATCH 2/2] Update src/coral_models/prepare_raw_data.py Co-authored-by: Dan Saattrup Nielsen <47701536+saattrupdan@users.noreply.github.com> --- src/coral_models/prepare_raw_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py index 4e7f428b..d8f398e5 100644 --- a/src/coral_models/prepare_raw_data.py +++ b/src/coral_models/prepare_raw_data.py @@ -344,7 +344,7 @@ def prepare_raw_data( # audio. read_aloud_duration = 0.0 conversation_duration = 0.0 - rows_to_remove = [] + rows_to_remove: list[int] = [] for row_i, row in tqdm(recordings.iterrows()): filename = input_path / row["filename"]