main.py
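
"""
Japanese immersion practice loop.

Listens to the user's Japanese speech (or accepts typed input), shows an
English translation for confirmation, sends the confirmed text to a local
Ollama model prompted to act as a Japanese teacher, and reads the model's
reply aloud through VoiceVox.

Expects the `ollama` CLI with the phi3 model available and a VoiceVox
engine running on localhost:50021.
"""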
import speech_recognition as sr
import subprocess
import os
import time
import requests
from pydub import AudioSegment
from pydub.playback import play
from translate import Translator
import keyboard

OLLAMA_MODEL = "phi3"
OLLAMA_PROMPT = "You are a native Japanese teacher who is teaching Japanese. Please speak only in Japanese and explain in Japanese. Please do not use other languages."
VOICEVOX_SPEAKER_ID = 61  # Change this based on the speaker ID in VoiceVox
TRANSLATION_LIMIT = 500  # max characters per translation request; long replies are split into chunks of this size

# Japanese -> English translator used for the on-screen translations
translator_ja_en = Translator(from_lang="ja", to_lang="en")

# Send a prompt to the local Ollama model via the CLI and return its reply
def ollama_response(prompt):
    try:
        process = subprocess.Popen(
            ['ollama', 'run', OLLAMA_MODEL],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
        )
        stdout, stderr = process.communicate(input=prompt)
        if process.returncode == 0:
            return stdout.strip()
        else:
            print(f"Error running Ollama: {stderr}")
            return ""
    except Exception as e:
        print(f"Connection error with Ollama: {e}")
        return ""

# Text to speech function using VoiceVox
def japanese_text_to_speech(text, speaker_id=VOICEVOX_SPEAKER_ID):
    headers = {"Content-Type": "application/json"}
    try:
        # First build an audio query for the text (params= URL-encodes the Japanese text safely)
        response = requests.post(
            "http://localhost:50021/audio_query",
            params={"text": text, "speaker": speaker_id},
            headers=headers,
        )
        response.raise_for_status()
        audio_query = response.json()

        # Then synthesise the query to WAV audio
        synthesis_response = requests.post(
            "http://localhost:50021/synthesis",
            params={"speaker": speaker_id},
            headers=headers,
            json=audio_query,
        )
        synthesis_response.raise_for_status()

        audio_path = "output_jp.wav"
        with open(audio_path, "wb") as audio_file:
            audio_file.write(synthesis_response.content)
        return audio_path
    except requests.exceptions.RequestException as e:
        print(f"VoiceVox connection error: {e}")
        return ""

# Speech recognition function: push-to-talk ('v') or typed input ('e')
def recognize_speech():
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print("Press 'v' to start talking or 'e' to write the message...")
        while True:
            if keyboard.is_pressed('v'):
                print("Listening... Press 'b' to stop.")
                # listen() already returns after a pause in speech; the loop
                # below just waits for the stop key before transcribing
                audio = r.listen(source)
                while not keyboard.is_pressed('b'):
                    time.sleep(0.05)
                try:
                    text = r.recognize_google(audio, language="ja-JP")
                    print(f"You said: {text}")
                    return text
                except sr.UnknownValueError:
                    print("I couldn't understand what you said.")
                    return ""
                except sr.RequestError as e:
                    print(f"Error requesting Google Speech Recognition results; {e}")
                    return ""
            elif keyboard.is_pressed('e'):
                text = input("Write your message: ")
                print(f"You said: {text}")
                return text
            time.sleep(0.05)  # avoid busy-waiting while polling the keyboard

# Split text into chunks of at most `limit` characters
def split_text(text, limit):
    return [text[i:i + limit] for i in range(0, len(text), limit)]
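
# Example: split_text("あいうえおかきくけこ", 4) -> ["あいうえ", "おかきく", "けこ"]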

def main():
    print("Japanese immersive environment started. Press 'v' to start speaking and 'b' to stop, or 'e' to type the message.")
    while True:
        user_input = recognize_speech()
        if user_input:
            # Show an English translation so the user can confirm what was heard
            translation = translator_ja_en.translate(user_input)
            print(f"You said (translation): {translation}")
            confirm = input("Type 'c' to confirm or 'r' to listen again: ").strip().lower()
            if confirm == 'c':
                response = ollama_response(f"{OLLAMA_PROMPT}\n{user_input}")
                if response:
                    # Translate and speak the reply in chunks that fit the translation limit
                    response_parts = split_text(response, TRANSLATION_LIMIT)
                    for part in response_parts:
                        response_translation = translator_ja_en.translate(part)
                        print(f"Answer (Japanese): {part}")
                        print(f"Answer (translation): {response_translation}")
                        audio_path = japanese_text_to_speech(part, speaker_id=VOICEVOX_SPEAKER_ID)
                        if audio_path:
                            audio = AudioSegment.from_file(audio_path)
                            play(audio)
                            os.remove(audio_path)
                else:
                    print("No response was obtained.")
            elif confirm == 'r':
                print("Listening again...")

if __name__ == "__main__":
    main()