From 552c60175d3be8209ac6aff81263822c8c42dae9 Mon Sep 17 00:00:00 2001
From: Anders Jess Pedersen <anders.j.pedersen@alexandra.dk>
Date: Wed, 29 Nov 2023 14:13:09 +0100
Subject: [PATCH 1/2] fix: skip empty audio files and drop their rows.

---
 src/coral_models/prepare_raw_data.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py
index c6186ce2..4e7f428b 100644
--- a/src/coral_models/prepare_raw_data.py
+++ b/src/coral_models/prepare_raw_data.py
@@ -344,9 +344,16 @@ def prepare_raw_data(
     # audio.
     read_aloud_duration = 0.0
     conversation_duration = 0.0
+    rows_to_remove = []
     for row_i, row in tqdm(recordings.iterrows()):
         filename = input_path / row["filename"]
 
+        # Skip files that are too small to hold real audio: record the row so it
+        # can be dropped from the dataframe later, and continue to the next file
+        if filename.stat().st_size < 200000:  # Under 200,000 bytes counts as empty
+            rows_to_remove.append(row_i)
+            continue
+
         # Get the new filename
         # New filename is in the format is for conversations:
         # "recording_id_speaker_id1_speaker_id2_recorder_speaker_id_conversation.wav"
@@ -400,6 +407,9 @@ def prepare_raw_data(
         except FileNotFoundError:
             pass
 
+    # Remove rows with empty files
+    recordings = recordings.drop(rows_to_remove).reset_index(drop=True)
+
     # Write a README file
     readme = make_readme()
     with open(output_path / "README.md", "w") as f:

From ff2a3fedfefa23dda6fc6aa24c75f981962f991d Mon Sep 17 00:00:00 2001
From: Anders Jess Pedersen <38854604+AJDERS@users.noreply.github.com>
Date: Thu, 30 Nov 2023 10:15:22 +0100
Subject: [PATCH 2/2] Update src/coral_models/prepare_raw_data.py

Co-authored-by: Dan Saattrup Nielsen <47701536+saattrupdan@users.noreply.github.com>
---
 src/coral_models/prepare_raw_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coral_models/prepare_raw_data.py b/src/coral_models/prepare_raw_data.py
index 4e7f428b..d8f398e5 100644
--- a/src/coral_models/prepare_raw_data.py
+++ b/src/coral_models/prepare_raw_data.py
@@ -344,7 +344,7 @@ def prepare_raw_data(
     # audio.
     read_aloud_duration = 0.0
     conversation_duration = 0.0
-    rows_to_remove = []
+    rows_to_remove: list[int] = []
     for row_i, row in tqdm(recordings.iterrows()):
         filename = input_path / row["filename"]