Skip to content

Commit

Permalink
Merge pull request #6 from ng-ky/main
Browse files Browse the repository at this point in the history
Add support for "auto" when translating files
  • Loading branch information
pierotofy authored Oct 28, 2024
2 parents ed3b701 + 4273755 commit f071dce
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 0 deletions.
3 changes: 3 additions & 0 deletions argostranslatefiles/abstract_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ def get_output_path(self, underlying_translation: ITranslation, file_path: str):

@abc.abstractmethod
def translate(self, underlying_translation: ITranslation, file_path: str):
    """Abstract hook: translate the file at ``file_path``.

    The base implementation only raises; concrete formats must override it.

    Args:
        underlying_translation (ITranslation): translation object handed to
            the concrete format.
        file_path (str): file path

    Raises:
        NotImplementedError: always, when the method is not overridden.
    """
    raise NotImplementedError

@abc.abstractmethod
def get_texts(self, file_path: str):
    """Abstract hook: return text read from the file at ``file_path``.

    The base implementation only raises; concrete formats must override it.

    Args:
        file_path (str): file path

    Raises:
        NotImplementedError: always, when the method is not overridden.
    """
    raise NotImplementedError
17 changes: 17 additions & 0 deletions argostranslatefiles/argostranslatefiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,20 @@ def translate_file(underlying_translation: ITranslation, file_path: str):
return supported_format.translate(underlying_translation, file_path)

return False


def get_texts(file_path: str):
    """Read text from a file through the first format that supports it.

    Args:
        file_path (str): file path

    Returns:
        Whatever the matching format's ``get_texts`` returns, or ``False``
        when none of the supported formats accepts the file.
    """
    # Lazily pick the first format whose ``support`` check accepts the path.
    matching_format = next(
        (
            candidate
            for candidate in get_supported_formats()
            if candidate.support(file_path)
        ),
        None,
    )

    if matching_format is None:
        return False

    return matching_format.get_texts(file_path)
30 changes: 30 additions & 0 deletions argostranslatefiles/formats/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,33 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
outzip.close()

return outzip_path

def get_texts(self, file_path: str):
    """Collect a text sample from the members of an EPUB archive.

    Members are read in archive order and their text is appended until more
    than 4096 characters have been gathered; the result may therefore be
    longer than 4096 characters.

    Args:
        file_path (str): path of the archive to read

    Returns:
        str: concatenated text of the members that were read
    """
    # Loop invariants, built once instead of once per archive member.
    translatable_xml_filenames = ("OPS/content.opf", "OPS/toc.ncx", "OEBPS/content.opf", "OEBPS/toc.ncx")
    head = '<?xml version="1.0" encoding="utf-8"?>\n<!DOCTYPE html>'

    texts = ""

    # The context manager closes the archive even if parsing a member raises.
    with zipfile.ZipFile(file_path, "r") as inzip:
        for inzipinfo in inzip.infolist():
            if len(texts) > 4096:
                break

            with inzip.open(inzipinfo) as infile:
                raw = infile.read()

            if inzipinfo.filename in translatable_xml_filenames:
                soup = BeautifulSoup(raw, 'xml')
                texts += self.itag_of_soup(soup).text()
            elif inzipinfo.filename.endswith(('.html', '.xhtml')):
                content = str(raw, 'utf-8')

                # Drop the XML/doctype prologue when the document starts with it.
                if content.startswith(head):
                    content = content[len(head):]

                texts += content
            else:
                try:
                    texts += raw.decode()
                except UnicodeDecodeError:
                    # Binary members (images, fonts, ...) are not text;
                    # skip them instead of aborting the whole extraction.
                    continue

    return texts
9 changes: 9 additions & 0 deletions argostranslatefiles/formats/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from argostranslate.translate import ITranslation

from argostranslatefiles.abstract_file import AbstractFile
from bs4 import BeautifulSoup


class Html(AbstractFile):
Expand Down Expand Up @@ -32,3 +33,11 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
outfile.close()

return outfile_path

def get_texts(self, file_path: str):
    """Return up to the first 4096 characters of text from an HTML file.

    Args:
        file_path (str): path of the HTML file to read

    Returns:
        The first 4096 characters of the text produced by
        ``translatehtml.itag_of_soup`` for the parsed document.
    """
    # Close the file as soon as it has been read instead of leaking the handle.
    with open(file_path, "r") as infile:
        content = infile.read()

    soup = BeautifulSoup(content, "html.parser")
    return translatehtml.itag_of_soup(soup).text()[0:4096]
17 changes: 17 additions & 0 deletions argostranslatefiles/formats/opendocument/odt.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,20 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
outzip.close()

return outzip_path

def get_texts(self, file_path: str):
    """Collect text from the ``content.xml`` member of an archive.

    Scanning stops once more than 4096 characters have been gathered.

    Args:
        file_path (str): path of the archive to read

    Returns:
        str: the gathered text, or an empty string when the archive has no
        ``content.xml`` member
    """
    texts = ""

    # The context manager closes the archive even if parsing raises.
    with zipfile.ZipFile(file_path, "r") as inzip:
        for inzipinfo in inzip.infolist():
            if len(texts) > 4096:
                break

            # Only open the member that is actually read.
            if inzipinfo.filename != "content.xml":
                continue

            with inzip.open(inzipinfo) as infile:
                soup = BeautifulSoup(infile.read(), 'xml')
                texts += self.itag_of_soup(soup).text()

    return texts
17 changes: 17 additions & 0 deletions argostranslatefiles/formats/openxml/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,20 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
outzip.close()

return outzip_path

def get_texts(self, file_path: str):
    """Collect text from the ``word/document.xml`` member of an archive.

    Scanning stops once more than 4096 characters have been gathered.

    Args:
        file_path (str): path of the archive to read

    Returns:
        str: the gathered text, or an empty string when the archive has no
        ``word/document.xml`` member
    """
    texts = ""

    # The context manager closes the archive even if parsing raises.
    with zipfile.ZipFile(file_path, "r") as inzip:
        for inzipinfo in inzip.infolist():
            if len(texts) > 4096:
                break

            # Only open the member that is actually read.
            if inzipinfo.filename != "word/document.xml":
                continue

            with inzip.open(inzipinfo) as infile:
                soup = BeautifulSoup(infile.read(), 'xml')
                texts += self.itag_of_soup(soup).text()

    return texts
17 changes: 17 additions & 0 deletions argostranslatefiles/formats/openxml/pptx.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,20 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
outzip.close()

return outzip_path

def get_texts(self, file_path: str):
    """Collect text from the slide XML members of an archive.

    Members whose name matches ``ppt/slides/slide<digits>.xml`` are read in
    archive order until more than 4096 characters have been gathered.

    Args:
        file_path (str): path of the archive to read

    Returns:
        str: the gathered text, or an empty string when no member matches
    """
    texts = ""

    # The context manager closes the archive even if parsing raises.
    with zipfile.ZipFile(file_path, "r") as inzip:
        for inzipinfo in inzip.infolist():
            if len(texts) > 4096:
                break

            # Only open the members that are actually read.
            if not re.match(r"ppt\/slides\/slide[0-9]*\.xml", inzipinfo.filename):
                continue

            with inzip.open(inzipinfo) as infile:
                soup = BeautifulSoup(infile.read(), 'xml')
                texts += self.itag_of_soup(soup).text()

    return texts
3 changes: 3 additions & 0 deletions argostranslatefiles/formats/txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@ def translate(self, underlying_translation: ITranslation, file_path: str):
outfile.close()

return outfile_path

def get_texts(self, file_path: str):
    """Return up to the first 4096 characters of a text file.

    Args:
        file_path (str): path of the text file to read

    Returns:
        str: at most 4096 characters read from the start of the file
    """
    # Close the file deterministically instead of leaking the handle.
    with open(file_path, "r") as infile:
        return infile.read(4096)

0 comments on commit f071dce

Please sign in to comment.