# translate_pdf.py
import os
import shutil
import fitz
import requests
from PyPDF2 import PdfReader
def extract_text_and_split_sentences(pdf_path):
    """Extract the text of a PDF and split it into rough sentences.

    The text of every page is concatenated, newlines are replaced by spaces,
    and the result is split on the literal delimiter ``". "``.  The delimiter
    is consumed by the split, so a fragment that was followed by ``". "`` is
    returned without its trailing period.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A list of non-empty sentence fragments with surrounding whitespace
        removed.  The list is empty when no text could be extracted.
    """
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        text = "".join(page.get_text() for page in doc)

    fragments = (part.strip() for part in text.replace("\n", " ").split(". "))
    # Drop blank fragments (e.g. the one after the final period) so callers
    # never submit an empty string for translation.
    return [fragment for fragment in fragments if fragment]
def translate_sentence(sentence, src_lang, tgt_lang, server_url, endpoint, timeout=120):
    """Translate one sentence by POSTing it to the translation server.

    The sentence and the language codes are sent as URL query parameters to
    ``{server_url}/{endpoint}``.  The translated text is read from
    ``["translation"][0]["translation_text"]`` of the JSON response.

    Args:
        sentence: Text to translate.
        src_lang: Source language code, sent as ``src_lang``.
        tgt_lang: Target language code, sent as ``tgt_lang``.
        server_url: Base URL of the translation server.
        endpoint: Path appended to ``server_url``.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.post``.

    Returns:
        The translated text extracted from the response.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError, TypeError: If the JSON does not have the
            expected structure.
    """
    url = f"{server_url}/{endpoint}"
    params = {
        "inputs": sentence,
        "src_lang": src_lang,
        "tgt_lang": tgt_lang,
    }
    headers = {"Content-Type": "application/json"}
    # Without a timeout a stalled server would block the whole run forever.
    response = requests.post(url, params=params, headers=headers, timeout=timeout)

    try:
        # Fail on 4xx/5xx instead of trying to read an error page as a translation.
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.HTTPError, ValueError):
        print(f"Error translating sentence: '{sentence}'")
        print(f"Response: {response.text}")
        raise

    try:
        return payload["translation"][0]["translation_text"]
    except (KeyError, IndexError, TypeError):
        # Show the already-parsed payload so the unexpected shape is visible.
        print(f"Error translating sentence: '{sentence}'")
        print(f"Response: {payload}")
        raise
def main():
    """Interactively translate one PDF from ``/tmp/pdf_translator``.

    Picks the alphabetically first ``.pdf`` file in the working directory,
    splits its text into sentences, prompts for the language codes and the
    translation server details, then translates sentence by sentence.

    Two UTF-8 text files are written next to the PDF:

    * ``<name>_<src_lang>_<tgt_lang>.txt`` with "Source:"/"Target:" pairs.
    * ``<name>_<tgt_lang>.txt`` with one translation per line.

    Returns without doing anything else when the directory holds no PDF.
    Any exception raised while translating a sentence propagates and ends
    the run; what was written before that point stays in the output files.
    """
    temp_dir = "/tmp/pdf_translator"
    os.makedirs(temp_dir, exist_ok=True)

    # Get the list of PDF files in the mounted directory.  os.listdir gives
    # names in arbitrary order, so sort to make the selected file predictable.
    pdf_files = sorted(f for f in os.listdir(temp_dir) if f.endswith('.pdf'))
    if not pdf_files:
        print("No PDF files found in the mounted directory.")
        return

    input_pdf_name = pdf_files[0]
    print(f"Translating PDF file: {input_pdf_name}")
    input_pdf_path = os.path.join(temp_dir, input_pdf_name)
    sentences = extract_text_and_split_sentences(input_pdf_path)

    src_lang = input("Enter the source language code (e.g., en_XX): ")
    tgt_lang = input("Enter the target language code (e.g., de_DE): ")
    server_url = input("Enter the translation server URL (e.g., http://192.168.xx.xx:7001): ")
    endpoint = input("Enter the translation endpoint (e.g., translate): ")

    base_name = os.path.splitext(input_pdf_name)[0]
    source_target_path = os.path.join(temp_dir, f"{base_name}_{src_lang}_{tgt_lang}.txt")
    target_path = os.path.join(temp_dir, f"{base_name}_{tgt_lang}.txt")
    total = len(sentences)

    with open(source_target_path, "w", encoding="utf-8") as source_target_file, \
            open(target_path, "w", encoding="utf-8") as target_file:
        for i, sentence in enumerate(sentences, start=1):
            print(f"Translating sentence: '{sentence}'")
            translation = translate_sentence(sentence, src_lang, tgt_lang, server_url, endpoint)
            print(translation)  # Print the translation result
            source_target_file.write(f"Source: {sentence}\n")
            source_target_file.write(f"Target: {translation}\n\n")
            target_file.write(f"{translation}\n")
            print(f"Translated {i}/{total} sentences.")
# Run the interactive translation workflow only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()