def generate_subs_based_on_punc(self, text: str) -> str:
    """
    Generate WebVTT subtitles with one cue per punctuation-delimited clause.

    ``text`` is cut into clauses at the delimiters listed in ``punctuation``.
    Each clause becomes one cue that starts at the first not-yet-used recorded
    word and ends at the last recorded word that can be located, in order,
    inside the clause. The clause list is also stored on ``self.text_list``.

    Args:
        text (str): The text the recorded words (``self.subs``) came from.

    Returns:
        str: The subtitles in WebVTT format. Only the ``WEBVTT`` header is
            returned when there are no clauses or no recorded words.

    Raises:
        ValueError: If ``self.subs`` and ``self.offset`` differ in length.
    """
    if len(self.subs) != len(self.offset):
        raise ValueError("subs and offset are not of the same length")

    # Literal clause delimiters in ASCII and full-width form. ". " requires
    # the trailing space so that decimals such as "3.14" are not split.
    punctuation = (
        ",", "。", "!", "?", ";", ":", "\n", "“", "”",
        ",", "!", "?", ";", ":", ". ",
    )
    punctuation_chars = frozenset("".join(punctuation))
    # Every delimiter is escaped (a bare "?" is a regex quantifier and would
    # make the pattern invalid); the capture group makes re.split keep the
    # delimiters so they can be re-attached to the clause they terminate.
    splitter = re.compile("(" + "|".join(map(re.escape, punctuation)) + ")")

    def split_into_clauses(raw: str) -> List[str]:
        """Split ``raw`` into clauses that keep their trailing punctuation."""
        clauses: List[str] = []
        # Punctuation seen before any real text; it is prepended to the first
        # clause instead of becoming a cue of its own.
        prefix = ""
        for piece in splitter.split(raw):
            # Empty pieces come from adjacent delimiters; line breaks only
            # separate clauses and are not shown.
            if not piece or piece == "\n":
                continue
            if all(ch in punctuation_chars or ch.isspace() for ch in piece):
                if clauses:
                    clauses[-1] += piece
                else:
                    prefix += piece
            else:
                clauses.append(prefix + piece)
                prefix = ""
        return clauses

    self.text_list = split_into_clauses(text)

    word_count = len(self.subs)
    cues: List[str] = []
    j = 0  # index of the next unused recorded word
    for clause_text in self.text_list:
        if j >= word_count:
            # More clauses than recorded words: nothing left to time.
            break
        start_time = self.offset[j][0]

        # The first unused word always belongs to this clause, so every cue
        # advances by at least one word even if the word cannot be located.
        found = clause_text.find(self.subs[j])
        cursor = found + len(self.subs[j]) if found != -1 else 0

        # Consume following words only while they occur *after* the previous
        # match. A plain substring test would also swallow a word of the next
        # clause that merely appears somewhere earlier in this one.
        while j + 1 < word_count:
            next_word = self.subs[j + 1]
            found = clause_text.find(next_word, cursor)
            if found == -1:
                break
            cursor = found + len(next_word)
            j += 1

        cues.append(formatter(start_time, self.offset[j][1], clause_text))
        j += 1

    return "WEBVTT\r\n\r\n" + "".join(cues)
async def amain() -> None:
    """
    Synthesize TEXT and write the audio and punctuation-based subtitles.

    Audio chunks from the stream are written to OUTPUT_FILE as they arrive;
    every WordBoundary chunk is recorded in a SubMaker, whose
    generate_subs_based_on_punc() output is then written to WEBVTT_FILE.
    """
    communicate = Communicate(TEXT)
    submaker = edge_tts.SubMaker()
    with open(OUTPUT_FILE, "wb") as audio_file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                submaker.create_sub(
                    (chunk["offset"], chunk["duration"]), chunk["text"]
                )

    with open(WEBVTT_FILE, "w", encoding="utf-8") as vtt_file:
        vtt_file.write(submaker.generate_subs_based_on_punc(TEXT))


if __name__ == "__main__":
    # asyncio.run creates and closes the event loop itself; the guard keeps
    # the synthesis from starting when this module is merely imported.
    asyncio.run(amain())