From 9ff75e39de191d78426a008d598db9e6e262d417 Mon Sep 17 00:00:00 2001
From: Jack <jack20220723@gmail.com>
Date: Sat, 5 Oct 2024 17:17:56 +0800
Subject: [PATCH] Initial release

---
 .streamlit/config.toml |  2 +-
 utils.py               | 96 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100644 utils.py

diff --git a/.streamlit/config.toml b/.streamlit/config.toml
index bde5044..5125201 100644
--- a/.streamlit/config.toml
+++ b/.streamlit/config.toml
@@ -5,4 +5,4 @@ secondaryBackgroundColor="#F0F2F6"
 textColor="#262730"
 font="Ubuntu"
 [server]
-maxUploadSize=1028
\ No newline at end of file
+maxUploadSize=1028
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..ae54176
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,96 @@
+import textwrap
+import zlib
+from typing import Iterator, TextIO
+
+
+def exact_div(x, y):  # integer quotient of x by y; asserts that y divides x exactly
+    assert not x % y
+    return x // y
+
+
+def str2bool(string):
+    """Convert the literal "True" or "False" to a bool; raise ValueError otherwise."""
+    choices = {"True": True, "False": False}
+    if string not in choices:
+        raise ValueError(f"Expected one of {set(choices.keys())}, got {string}")
+    return choices[string]
+
+
+def optional_int(string):  # the literal "None" maps to None; anything else goes through int()
+    return int(string) if string != "None" else None
+
+
+def optional_float(string):  # the literal "None" maps to None; anything else goes through float()
+    return float(string) if string != "None" else None
+
+
+def compression_ratio(text) -> float:  # UTF-8 byte length / zlib-compressed byte length (both in bytes)
+    return len(text.encode("utf-8")) / len(zlib.compress(text.encode("utf-8")))
+
+
+def format_timestamp(seconds: float, always_include_hours: bool = False, fractionalSeperator: str = '.'):
+    """Render a non-negative number of seconds as [HH:]MM:SS<sep>mmm.
+
+    The value is rounded to whole milliseconds. The hours field is emitted only
+    when it is non-zero or always_include_hours is set; fractionalSeperator is
+    placed between the seconds and the three-digit millisecond field.
+    """
+    assert seconds >= 0, "non-negative timestamp expected"
+    total_ms = round(seconds * 1000.0)
+
+    hours, rest = divmod(total_ms, 3_600_000)
+    minutes, rest = divmod(rest, 60_000)
+    secs, millis = divmod(rest, 1_000)
+    prefix = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+    return f"{prefix}{minutes:02d}:{secs:02d}{fractionalSeperator}{millis:03d}"
+
+
+def write_txt(transcript: Iterator[dict], file: TextIO):  # one stripped segment text per output line
+    for line in (entry['text'].strip() for entry in transcript):
+        print(line, file=file, flush=True)
+
+
+def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
+    """Write the transcript as WebVTT: a "WEBVTT" header, then one cue per segment.
+
+    Text is stripped (as write_srt does) and passed through processText; "-->" in
+    the text is rewritten to "->" because that sequence separates the cue timings.
+    """
+    print("WEBVTT\n", file=file)
+    for segment in transcript:
+        text = processText(segment['text'].strip(), maxLineWidth).replace('-->', '->')
+        timing = f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}"
+        print(f"{timing}\n{text}\n", file=file, flush=True)
+
+
+def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
+    """
+    Write a transcript to a file in SRT format.
+
+    Every segment becomes one cue: its 1-based index, an HH:MM:SS,mmm timing
+    line, the stripped text (passed through processText, with any "-->"
+    replaced by "->"), and a trailing blank line.
+
+    Example usage:
+        from pathlib import Path
+        result = transcribe(model, audio_path, temperature=temperature, **args)
+        # save SRT
+        audio_basename = Path(audio_path).stem
+        with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
+            write_srt(result["segments"], file=srt)
+    """
+    for number, segment in enumerate(transcript, start=1):
+        body = processText(segment['text'].strip(), maxLineWidth).replace('-->', '->')
+        begin = format_timestamp(segment['start'], always_include_hours=True, fractionalSeperator=',')
+        finish = format_timestamp(segment['end'], always_include_hours=True, fractionalSeperator=',')
+
+        # index line, timing line, text, then the blank line that closes the cue
+        print(f"{number}\n{begin} --> {finish}\n{body}\n",
+              file=file, flush=True)
+
+def processText(text: str, maxLineWidth=None):
+    """Wrap text to maxLineWidth columns; None or a non-positive width returns it unchanged."""
+    if maxLineWidth is None or maxLineWidth <= 0:
+        # textwrap.wrap raises ValueError for width <= 0, so 0 also means "no wrapping"
+        return text
+    return '\n'.join(textwrap.wrap(text, width=maxLineWidth, tabsize=4))