Merge pull request #6 from ng-ky/main

Add support for "auto" when translating files
LibreTranslate · Oct 28, 2024 · f071dce · f071dce
2 parents ed3b701 + 4273755
commit f071dce
Show file tree

Hide file tree

Showing 8 changed files with 113 additions and 0 deletions.
diff --git a/argostranslatefiles/abstract_file.py b/argostranslatefiles/abstract_file.py
@@ -21,3 +21,6 @@ def get_output_path(self, underlying_translation: ITranslation, file_path: str):
 
     @abc.abstractmethod
     def translate(self, underlying_translation: ITranslation, file_path: str): raise NotImplementedError
+
+    @abc.abstractmethod
+    def get_texts(self, file_path: str):
+        """Return text read from the file at ``file_path``.
+
+        Abstract hook: this base implementation only raises.
+
+        Args:
+            file_path (str): file path
+
+        Raises:
+            NotImplementedError: always, in this base implementation
+        """
+        raise NotImplementedError
diff --git a/argostranslatefiles/argostranslatefiles.py b/argostranslatefiles/argostranslatefiles.py
@@ -37,3 +37,20 @@ def translate_file(underlying_translation: ITranslation, file_path: str):
             return supported_format.translate(underlying_translation, file_path)
 
     return False
+
+
+def get_texts(file_path: str):
+    """Read the text of a file through the first format that supports it.
+
+    Args:
+        file_path (str): file path
+
+    Returns:
+        Whatever the matching format's ``get_texts`` returns, or ``False``
+        when no supported format accepts ``file_path``.
+    """
+
+    # Lazily pick the first format whose support() check accepts the path.
+    matching_format = next(
+        (candidate for candidate in get_supported_formats() if candidate.support(file_path)),
+        None,
+    )
+
+    if matching_format is None:
+        return False
+
+    return matching_format.get_texts(file_path)
diff --git a/argostranslatefiles/formats/epub.py b/argostranslatefiles/formats/epub.py
@@ -54,3 +54,33 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outzip.close()
 
         return outzip_path
+
+    def get_texts(self, file_path: str):
+        """Collect a text sample from an EPUB archive.
+
+        Entries are visited in archive order and their content is appended
+        until more than 4096 characters have been gathered. The result is
+        not truncated, so it can be longer than 4096 characters.
+
+        Args:
+            file_path (str): path of the EPUB file
+
+        Returns:
+            str: concatenated content of the visited entries
+        """
+        translatable_xml_filenames = ("OPS/content.opf", "OPS/toc.ncx", "OEBPS/content.opf", "OEBPS/toc.ncx")
+        head = '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'
+
+        texts = ""
+
+        # The context manager closes the archive even when an entry fails to parse.
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                with inzip.open(inzipinfo) as infile:
+                    raw = infile.read()
+
+                if inzipinfo.filename in translatable_xml_filenames:
+                    soup = BeautifulSoup(raw, 'xml')
+                    texts += self.itag_of_soup(soup).text()
+                elif inzipinfo.filename.endswith(('.html', '.xhtml')):
+                    content = str(raw, 'utf-8')
+                    # Drop the XML/doctype prologue so it does not end up in the sample.
+                    if content.startswith(head):
+                        content = content[len(head):]
+                    texts += content
+                else:
+                    try:
+                        texts += raw.decode()
+                    except UnicodeDecodeError:
+                        # Images, fonts and other binary resources are not text; skip them
+                        # instead of aborting the whole extraction.
+                        continue
+
+        return texts
diff --git a/argostranslatefiles/formats/html.py b/argostranslatefiles/formats/html.py
@@ -2,6 +2,7 @@
 from argostranslate.translate import ITranslation
 
 from argostranslatefiles.abstract_file import AbstractFile
+from bs4 import BeautifulSoup
 
 
 class Html(AbstractFile):
@@ -32,3 +33,11 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outfile.close()
 
         return outfile_path
+
+    def get_texts(self, file_path: str):
+        """Extract a text sample from an HTML file.
+
+        Args:
+            file_path (str): path of the HTML file
+
+        Returns:
+            The first 4096 characters of the text of the parsed document.
+        """
+        # Close the handle deterministically instead of leaving it open.
+        with open(file_path, "r") as infile:
+            content = infile.read()
+
+        soup = BeautifulSoup(content, "html.parser")
+        return translatehtml.itag_of_soup(soup).text()[0:4096]
diff --git a/argostranslatefiles/formats/opendocument/odt.py b/argostranslatefiles/formats/opendocument/odt.py
@@ -33,3 +33,20 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outzip.close()
 
         return outzip_path
+
+    def get_texts(self, file_path: str):
+        """Collect a text sample from an ODT archive.
+
+        Only the ``content.xml`` entry is read. The result is not truncated,
+        so it can be longer than 4096 characters.
+
+        Args:
+            file_path (str): path of the ODT file
+
+        Returns:
+            str: text of ``content.xml``, or an empty string when the
+            archive has no such entry
+        """
+        texts = ""
+
+        # The context manager closes the archive even when parsing fails.
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                # Entries other than content.xml are never read, so do not open them.
+                if inzipinfo.filename != "content.xml":
+                    continue
+                with inzip.open(inzipinfo) as infile:
+                    soup = BeautifulSoup(infile.read(), 'xml')
+                texts += self.itag_of_soup(soup).text()
+
+        return texts
diff --git a/argostranslatefiles/formats/openxml/docx.py b/argostranslatefiles/formats/openxml/docx.py
@@ -33,3 +33,20 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outzip.close()
 
         return outzip_path
+
+    def get_texts(self, file_path: str):
+        """Collect a text sample from a DOCX archive.
+
+        Only the ``word/document.xml`` entry is read. The result is not
+        truncated, so it can be longer than 4096 characters.
+
+        Args:
+            file_path (str): path of the DOCX file
+
+        Returns:
+            str: text of ``word/document.xml``, or an empty string when the
+            archive has no such entry
+        """
+        texts = ""
+
+        # The context manager closes the archive even when parsing fails.
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                # Entries other than the main document are never read, so do not open them.
+                if inzipinfo.filename != "word/document.xml":
+                    continue
+                with inzip.open(inzipinfo) as infile:
+                    soup = BeautifulSoup(infile.read(), 'xml')
+                texts += self.itag_of_soup(soup).text()
+
+        return texts
diff --git a/argostranslatefiles/formats/openxml/pptx.py b/argostranslatefiles/formats/openxml/pptx.py
@@ -34,3 +34,20 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outzip.close()
 
         return outzip_path
+
+    def get_texts(self, file_path: str):
+        """Collect a text sample from a PPTX archive.
+
+        Slide entries (``ppt/slides/slide<N>.xml``) are read in archive order
+        until more than 4096 characters have been gathered. The result is not
+        truncated, so it can be longer than 4096 characters.
+
+        Args:
+            file_path (str): path of the PPTX file
+
+        Returns:
+            str: concatenated text of the visited slides, or an empty string
+            when the archive has no slide entries
+        """
+        texts = ""
+
+        # The context manager closes the archive even when parsing fails.
+        with zipfile.ZipFile(file_path, "r") as inzip:
+            for inzipinfo in inzip.infolist():
+                if len(texts) > 4096:
+                    break
+                # Non-slide entries are never read, so do not open them.
+                if not re.match(r"ppt\/slides\/slide[0-9]*\.xml", inzipinfo.filename):
+                    continue
+                with inzip.open(inzipinfo) as infile:
+                    soup = BeautifulSoup(infile.read(), 'xml')
+                texts += self.itag_of_soup(soup).text()
+
+        return texts
diff --git a/argostranslatefiles/formats/txt.py b/argostranslatefiles/formats/txt.py
@@ -19,3 +19,6 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
         outfile.close()
 
         return outfile_path
+
+    def get_texts(self, file_path: str):
+        """Read the beginning of a plain-text file.
+
+        Args:
+            file_path (str): path of the text file
+
+        Returns:
+            str: at most the first 4096 characters of the file
+        """
+        # Close the handle deterministically instead of leaving it open.
+        with open(file_path, "r") as infile:
+            return infile.read(4096)