From 42737550ad7090bed93e9c9509633d64c0edbe10 Mon Sep 17 00:00:00 2001
From: Ng Ky <15950171+ng-ky@users.noreply.github.com>
Date: Sun, 27 Oct 2024 23:32:55 +0800
Subject: [PATCH] Add support for "auto" when translating files

---
Review notes: the get_texts() implementations now close files and archives
through context managers, and EPUB members that are neither OPF/NCX metadata
nor (X)HTML are skipped instead of being decoded as text. The blob ids on
the "index" lines below were not recomputed for this revision.

 argostranslatefiles/abstract_file.py          |  5 ++++
 argostranslatefiles/argostranslatefiles.py    | 18 +++++++++++
 argostranslatefiles/formats/epub.py           | 31 +++++++++++++++++++
 argostranslatefiles/formats/html.py           | 10 ++++++
 .../formats/opendocument/odt.py               | 15 +++++++++
 argostranslatefiles/formats/openxml/docx.py   | 15 +++++++++
 argostranslatefiles/formats/openxml/pptx.py   | 15 +++++++++
 argostranslatefiles/formats/txt.py            |  6 ++++
 8 files changed, 115 insertions(+)

diff --git a/argostranslatefiles/abstract_file.py b/argostranslatefiles/abstract_file.py
index 43765ae..58c9ab3 100644
--- a/argostranslatefiles/abstract_file.py
+++ b/argostranslatefiles/abstract_file.py
@@ -21,3 +21,8 @@ def get_output_path(self, underlying_translation: ITranslation, file_path: str):
     @abc.abstractmethod
     def translate(self, underlying_translation: ITranslation, file_path: str):
         raise NotImplementedError
+
+    @abc.abstractmethod
+    def get_texts(self, file_path: str):
+        """Return text extracted from the file at ``file_path``."""
+        raise NotImplementedError
diff --git a/argostranslatefiles/argostranslatefiles.py b/argostranslatefiles/argostranslatefiles.py
index 9090cfd..09b3cc2 100644
--- a/argostranslatefiles/argostranslatefiles.py
+++ b/argostranslatefiles/argostranslatefiles.py
@@ -37,3 +37,21 @@ def translate_file(underlying_translation: ITranslation, file_path: str):
             return supported_format.translate(underlying_translation, file_path)
 
     return False
+
+
+def get_texts(file_path: str):
+    """Get text from a file using the first format that supports it.
+
+    Args:
+        file_path (str): file path
+
+    Returns:
+        The value returned by the matching format's ``get_texts``, or
+        ``False`` when no supported format handles the file.
+    """
+
+    for supported_format in get_supported_formats():
+        if supported_format.support(file_path):
+            return supported_format.get_texts(file_path)
+
+    return False
diff --git a/argostranslatefiles/formats/epub.py b/argostranslatefiles/formats/epub.py
index 8c99f1a..7fb2d42 100644
--- a/argostranslatefiles/formats/epub.py
+++ b/argostranslatefiles/formats/epub.py
@@ -54,3 +54,34 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outzip.close()
 
         return outzip_path
+
+    def get_texts(self, file_path: str):
+        """Collect text from the EPUB's OPF/NCX metadata and the raw markup of its (X)HTML members.
+
+        Stops reading further members once more than 4096 characters are gathered.
+        """
+        translatable_xml_filenames = ["OPS/content.opf", "OPS/toc.ncx", "OEBPS/content.opf", "OEBPS/toc.ncx"]
+        # NOTE(review): confirm this prefix matches the one stripped in translate().
+        head = '\n'
+        texts = ""
+
+        # The context manager closes the archive even if a member fails to parse.
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+
+                if inzipinfo.filename in translatable_xml_filenames:
+                    soup = BeautifulSoup(inzip.read(inzipinfo), 'xml')
+                    texts += self.itag_of_soup(soup).text()
+                elif inzipinfo.filename.endswith(('.html', '.xhtml')):
+                    content = str(inzip.read(inzipinfo), 'utf-8')
+
+                    if content.startswith(head):
+                        content = content[len(head):]
+
+                    texts += content
+                # Remaining members (images, fonts, stylesheets, ...) are skipped:
+                # decoding binary data as text can raise UnicodeDecodeError.
+
+        return texts
diff --git a/argostranslatefiles/formats/html.py b/argostranslatefiles/formats/html.py
index a4214ab..a08af21 100644
--- a/argostranslatefiles/formats/html.py
+++ b/argostranslatefiles/formats/html.py
@@ -2,6 +2,7 @@
 from argostranslate.translate import ITranslation
 
 from argostranslatefiles.abstract_file import AbstractFile
+from bs4 import BeautifulSoup
 
 
 class Html(AbstractFile):
@@ -32,3 +33,12 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outfile.close()
 
         return outfile_path
+
+    def get_texts(self, file_path: str):
+        """Return at most 4096 characters of the text parsed from the HTML file."""
+        # Close the file as soon as it has been read instead of leaking the handle.
+        with open(file_path, "r") as infile:
+            content = infile.read()
+
+        soup = BeautifulSoup(content, "html.parser")
+        return translatehtml.itag_of_soup(soup).text()[0:4096]
diff --git a/argostranslatefiles/formats/opendocument/odt.py b/argostranslatefiles/formats/opendocument/odt.py
index dbac8bb..727be4f 100644
--- a/argostranslatefiles/formats/opendocument/odt.py
+++ b/argostranslatefiles/formats/opendocument/odt.py
@@ -33,3 +33,18 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outzip.close()
 
         return outzip_path
+
+    def get_texts(self, file_path: str):
+        """Return the text of the ``content.xml`` member of the ODT archive."""
+        texts = ""
+
+        # The context manager closes the archive even if parsing raises.
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                if inzipinfo.filename == "content.xml":
+                    soup = BeautifulSoup(inzip.read(inzipinfo), 'xml')
+                    texts += self.itag_of_soup(soup).text()
+
+        return texts
diff --git a/argostranslatefiles/formats/openxml/docx.py b/argostranslatefiles/formats/openxml/docx.py
index bc37489..b3f5f91 100644
--- a/argostranslatefiles/formats/openxml/docx.py
+++ b/argostranslatefiles/formats/openxml/docx.py
@@ -33,3 +33,18 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outzip.close()
 
         return outzip_path
+
+    def get_texts(self, file_path: str):
+        """Return the text of the ``word/document.xml`` member of the DOCX archive."""
+        texts = ""
+
+        # The context manager closes the archive even if parsing raises.
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                if inzipinfo.filename == "word/document.xml":
+                    soup = BeautifulSoup(inzip.read(inzipinfo), 'xml')
+                    texts += self.itag_of_soup(soup).text()
+
+        return texts
diff --git a/argostranslatefiles/formats/openxml/pptx.py b/argostranslatefiles/formats/openxml/pptx.py
index 1c115df..29605da 100644
--- a/argostranslatefiles/formats/openxml/pptx.py
+++ b/argostranslatefiles/formats/openxml/pptx.py
@@ -34,3 +34,18 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outzip.close()
 
         return outzip_path
+
+    def get_texts(self, file_path: str):
+        """Return text gathered from the slide XML members of the PPTX archive."""
+        texts = ""
+
+        # The context manager closes the archive even if parsing raises.
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                if re.match(r"ppt\/slides\/slide[0-9]*\.xml", inzipinfo.filename):
+                    soup = BeautifulSoup(inzip.read(inzipinfo), 'xml')
+                    texts += self.itag_of_soup(soup).text()
+
+        return texts
diff --git a/argostranslatefiles/formats/txt.py b/argostranslatefiles/formats/txt.py
index d38fe50..a8fab7e 100644
--- a/argostranslatefiles/formats/txt.py
+++ b/argostranslatefiles/formats/txt.py
@@ -19,3 +19,9 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outfile.close()
 
         return outfile_path
+
+    def get_texts(self, file_path: str):
+        """Return at most the first 4096 characters of the text file."""
+        # Use a context manager so the file handle is closed after reading.
+        with open(file_path, "r") as infile:
+            return infile.read(4096)