-
Notifications
You must be signed in to change notification settings - Fork 0
/
Translate_NL.py
142 lines (113 loc) · 5.82 KB
/
Translate_NL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
from pathlib import Path
import translatepy
import shutil
import re
import time # For pausing between chunks
# Regular expression matching a subtitle entry that sits on a single line:
#   group 1 = cue index, group 2 = start timestamp,
#   group 3 = end timestamp, group 4 = the remaining text on the line.
# Both timestamps use the SRT layout HH:MM:SS,mmm (two-digit hours, minutes
# and seconds followed by three-digit milliseconds).
subtitle_block_pattern = re.compile(
    r'(\d+)\s*(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\s*(.*)'
)
# Function to clean up timestamps (remove extra spaces)
def clean_timestamps(text):
    """Repair SRT timestamp lines whose spacing or arrow has been mangled.

    Two normalisations are applied, in order:

    1. Whitespace around the ``:`` and ``,`` separators inside a timestamp
       is removed, so ``00: 00: 01, 000`` becomes ``00:00:01,000``.
    2. Whatever arrow sits between two timestamps (``->``, ``-->``, ``- >``,
       or a variant using an en/em dash) is rewritten to exactly `` --> ``.

    The arrow is only rewritten when it sits between two timestamps, so an
    already correct ``-->`` is left untouched and a ``->`` in ordinary
    dialogue is not modified.

    Args:
        text: Subtitle text, typically the full contents of an SRT file.

    Returns:
        The text with timestamp lines normalised; all other text unchanged.
    """
    # Step 1: collapse stray whitespace inside each HH:MM:SS,mmm timestamp.
    cleaned_text = re.sub(
        r'(\d{2})\s*:\s*(\d{2})\s*:\s*(\d{2})\s*,\s*(\d{3})',
        r'\1:\2:\3,\4',
        text,
    )
    # Step 2: normalise the arrow, anchored on the timestamps either side so
    # that neither a well-formed "-->" nor dialogue text gets corrupted.
    cleaned_text = re.sub(
        r'(\d{2}:\d{2}:\d{2},\d{3})\s*[-\u2013\u2014]+\s*>\s*(\d{2}:\d{2}:\d{2},\d{3})',
        r'\1 --> \2',
        cleaned_text,
    )
    return cleaned_text
# Function to perform translation
def translate_text(text, target_lang='nl'):
    """Translate *text* with translatepy and return the translated string.

    This is a best-effort helper: any failure is reported on stdout and
    turned into an empty string, so callers only need a truthiness check.

    Args:
        text: The text to translate (here, a block of subtitle lines).
        target_lang: Language passed to ``Translator.translate``; defaults
            to ``'nl'``.

    Returns:
        The translated text, or ``""`` when the input is blank, the
        translator returns no result, or an exception is raised.
    """
    # Nothing to translate: avoid a pointless round-trip to the backend.
    if not text or not text.strip():
        return ""

    try:
        # Using translatepy to translate the entire subtitle block
        translator = translatepy.Translator()
        translated_text = translator.translate(text, target_lang)
        result = translated_text.result
    except Exception as e:
        # Deliberately broad: one failed chunk must not abort the whole run.
        print(f"Error during translation: {e}")
        return ""

    # Check for an empty result *before* slicing it for the debug print, so
    # a missing result is reported as such rather than as a TypeError.
    if not result:
        print(f"Warning: No translation found for text: {text[:30]}...")
        return ""  # Return an empty string if no translation is found

    print(f"Translation result: {result[:50]}...")  # Debug: print part of the result
    return result
# Function to process and translate subtitle chunks
def process_chunk(file_path, chunk_start, chunk_size=100,
                  output_file="/tmp/SRT_Translate/test.srt"):
    """Translate one slice of a subtitle file and append it to *output_file*.

    The file is read in full, lines ``chunk_start`` up to (but excluding)
    ``chunk_start + chunk_size`` are stripped and joined into one text
    block, the block is passed to ``translate_text`` and a non-empty result
    is appended to *output_file*.

    Args:
        file_path: Path of the subtitle file to read.
        chunk_start: Zero-based index of the first line of the chunk.
        chunk_size: Maximum number of lines in the chunk.
        output_file: File the translated block is appended to. Its parent
            directory is created when missing. Defaults to the scratch
            file used by the rest of this script.

    Returns:
        None. Progress and debug information is printed to stdout.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()
    print(f"Processing subtitle file: {file_path}")
    print(f"Total number of lines: {len(lines)}")

    chunk_end = min(chunk_start + chunk_size, len(lines))
    parts = []
    # Read the chunk of subtitle lines (chunk_start to chunk_end)
    for i in range(chunk_start, chunk_end):
        line = lines[i].strip()
        print(f"Processing line {i+1}: {line}")  # Debugging: Print each line
        match = subtitle_block_pattern.match(line)
        if match:
            # NOTE(review): the pattern only matches when index, timestamps
            # and text share one line; in a conventional SRT layout they are
            # on separate lines and fall through to the else branch.
            parts.append(
                f"{match.group(1)}\n{match.group(2)} --> {match.group(3)}\n{match.group(4)}\n\n"
            )
        else:
            parts.append(f"{line}\n")  # Include non-matching lines as well
    subtitle_block = "".join(parts)

    # A chunk with no visible text has nothing to translate; skip it rather
    # than sending whitespace to the translator.
    if not subtitle_block.strip():
        print(f"Chunk starting at line {chunk_start + 1} is empty; skipping translation.")
        return

    # Print the subtitle block for debugging
    print(f"\nChunk starting at line {chunk_start + 1}:\n{subtitle_block}")
    time.sleep(0.4)  # Short pause (0.4 s) between chunks

    # Translate the subtitle block
    translated_block = translate_text(subtitle_block)

    # Write the translated block to the output file
    if translated_block:
        output_dir = os.path.dirname(output_file)
        if output_dir:
            # os.makedirs("") raises, so only create a real directory part.
            os.makedirs(output_dir, exist_ok=True)
        with open(output_file, 'a') as out_file:
            out_file.write(f"{translated_block}\n")
        print(f"Translated subtitle written to {output_file}")
    else:
        print("No translation was returned.")
# Function to clean up timestamps in the entire translated file
def clean_translated_file(output_file):
    """Rewrite *output_file* in place with its timestamps normalised.

    The whole file is loaded, passed through ``clean_timestamps`` and the
    result is written back over the original contents.

    Args:
        output_file: Path of the translated subtitle file to clean.
    """
    target = Path(output_file)
    # Compute the cleaned text fully before the file is reopened for
    # writing, so a failure while cleaning cannot leave it truncated.
    normalised = clean_timestamps(target.read_text())
    target.write_text(normalised)
    print(f"Cleaned the timestamps in the translated file: {output_file}")
# Function to copy the translated file, rename it, and clear test.srt
def copy_and_rename(output_file, source_directory, original_file):
    """Publish the translated scratch file under a ``.nld.srt`` name.

    The destination name is the base name of *original_file* with every
    ``.en.srt`` replaced by ``.nld.srt``; the copy is placed directly in
    *source_directory*. Afterwards *output_file* is emptied so it can be
    reused for the next subtitle file.

    Args:
        output_file: Path of the scratch file holding the translation.
        source_directory: Directory that receives the renamed copy.
        original_file: Path of the original subtitle file; only its base
            name is used.
    """
    # Only the file name matters; the directory part of original_file is dropped.
    _, source_name = os.path.split(original_file)
    destination = os.path.join(
        source_directory,
        source_name.replace('.en.srt', '.nld.srt'),
    )

    shutil.copy(output_file, destination)
    print(f"Copied and renamed file to: {destination}")

    # Opening in write mode truncates the scratch file to zero length.
    with open(output_file, 'w'):
        pass
    print(f"Cleared the content of {output_file}")
# Main function to process all .en.srt files in the directory
def main():
    """Translate every ``*.en.srt`` file found under a user-supplied directory.

    The directory is read from stdin and searched recursively. Each matching
    file is translated in fixed-size line chunks via ``process_chunk``, which
    appends to a shared scratch file. When that file exists afterwards, its
    timestamps are cleaned and it is copied into the source directory under
    a ``.nld.srt`` name, which also empties the scratch file.
    """
    source_directory = input("Enter the source directory: ")

    # Check that the source directory exists and really is a directory
    if not os.path.isdir(source_directory):
        print(f"Source directory '{source_directory}' does not exist or is not a directory.")
        return

    output_file = "/tmp/SRT_Translate/test.srt"
    chunk_size = 100  # Number of lines handed to process_chunk per call

    # Loop through all .en.srt files in the directory tree
    for file_path in Path(source_directory).rglob("*.en.srt"):
        if not file_path.is_file():
            continue  # e.g. a directory whose name happens to end in .en.srt

        # process_chunk appends to the scratch file, so leftovers from an
        # interrupted earlier run must not leak into this translation.
        if os.path.exists(output_file):
            os.remove(output_file)

        # Count the lines so the chunk offsets can be derived
        with open(file_path, 'r') as file:
            total_lines = sum(1 for _ in file)

        for chunk_start in range(0, total_lines, chunk_size):
            process_chunk(str(file_path), chunk_start, chunk_size)

        # The scratch file is only created once a chunk was translated; an
        # empty input file or failed translations leave nothing to publish.
        if not os.path.exists(output_file):
            print(f"No translated output was produced for {file_path}; skipping.")
            continue

        # After processing all chunks, clean the timestamps in the output file
        clean_translated_file(output_file)

        # Copy and rename the file, then clear the scratch file.
        # NOTE(review): the copy goes to the top-level source directory even
        # when the original was found in a subdirectory - confirm intended.
        copy_and_rename(output_file, source_directory, str(file_path))


if __name__ == "__main__":
    main()