teller_of_tales_coqui.py

# on the first run, download the required NLTK data by uncommenting and running:
# import nltk
# nltk.download()
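# (assumed sufficient: downloading just the "punkt" tokenizer data,
#  e.g. nltk.download('punkt'), which is what sent_tokenize/word_tokenize use)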
from nltk.tokenize import sent_tokenize, word_tokenize
from datetime import datetime
import time
import openai
import json
import os
import re
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
import torch
# from gtts import gTTS
from moviepy.editor import *
from tqdm.auto import tqdm
# imports for coqui-ai/TTS
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
# read the OpenAI API key from the OPENAI_TOKEN environment variable
openai.api_key = os.environ['OPENAI_TOKEN']
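# example shell setup (assumed, not part of the script):
#   export OPENAI_TOKEN="sk-..."    # Linux/macOS
#   set OPENAI_TOKEN=sk-...         # Windows cmd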
# show step by step debug info?
DEBUG = True
# minimum number of words to put in each story fragment
FRAGMENT_LENGTH = 10
# select model to use
model_engine = "text-davinci-003"
# configure coqui-ai/TTS
path = "env/Lib/site-packages/TTS/.models.json"
model_manager = ModelManager(path)
model_path, config_path, model_item = model_manager.download_model("tts_models/en/ljspeech/tacotron2-DDC_ph")
voc_path, voc_config_path, _ = model_manager.download_model(model_item["default_vocoder"])
syn = Synthesizer(
    tts_checkpoint=model_path,
    tts_config_path=config_path,
    vocoder_checkpoint=voc_path,
    vocoder_config=voc_config_path,
)
def write_list(a_list, filename):
    print("Started writing list data into a json file")
    with open(filename, "w") as fp:
        json.dump(a_list, fp)
    print("Done writing JSON data into .json file")
def read_list(filename):
    # json.load also accepts a file opened in binary mode
    with open(filename, "rb") as fp:
        n_list = json.load(fp)
    return n_list
def showTime():
    # use UTC so the "[... UTC]" label in the log output is accurate
    return str("[" + datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f") + " UTC]")
def pause():
    input("Press the <ENTER> key to continue...")
def createFolders():
    if not os.path.exists("text"):
        print("no path")
        os.makedirs("text")
    if not os.path.exists("audio"):
        os.makedirs("audio")
    if not os.path.exists("images"):
        os.makedirs("images")
    if not os.path.exists("videos"):
        os.makedirs("videos")
def load_and_split_to_sentences(filename):
    # read the raw story from a txt file
    with open(filename, "r", encoding="utf-8") as file:
        story_raw = file.read()
    # remove curly quotes and long dashes from the story
    story = story_raw.replace('“', '').replace('”', '').replace('——', ' ')
    # split the story into a list of sentences
    story_sentences_list = sent_tokenize(story)
    write_list(story_sentences_list, "text/story_sentences_list.json")
    if DEBUG:
        # display the story, enumerating each sentence
        for i, story_sentence in enumerate(story_sentences_list):
            print(i, story_sentence)
        print("\n!!!!!!!!!!!!!!\nThis is the last chance to make changes in the story_sentences_list.json file\n!!!!!!!!!!!!!!")
        pause()
        story_sentences_list = read_list("text/story_sentences_list.json")
    return story_sentences_list
def sentences_to_fragments(story_sentences_list, FRAGMENT_LENGTH):
    # the story divided into fragments
    story_fragments = []
    # fragment currently being worked on
    current_fragment = None
    # word counter for the current fragment
    current_fragment_word_counter = 0
    # combine the sentences from the story into fragments
    for story_sentence in story_sentences_list:
        # start a new fragment if the current one is empty
        if current_fragment is None:
            current_fragment = story_sentence
        # otherwise append the sentence to the current fragment
        else:
            current_fragment += ' ' + story_sentence
        # count the words in the fragment
        current_fragment_word_counter = len(word_tokenize(current_fragment))
        # if the minimum length requirement is met
        if current_fragment_word_counter > FRAGMENT_LENGTH:
            if DEBUG:
                print(current_fragment_word_counter)
            # add the current fragment to the story fragments
            story_fragments.append(current_fragment)
            # reset the temporary variables
            current_fragment = None
            current_fragment_word_counter = 0
    # add the last fragment
    if current_fragment is not None:
        story_fragments.append(current_fragment)
    write_list(story_fragments, "text/story_fragments.json")
    if DEBUG:
        # display the story, enumerating each fragment
        for i, story_fragment in enumerate(story_fragments):
            print(i, story_fragment)
        print("\n!!!!!!!!!!!!!!\nThis is the last chance to make changes in the story_fragments.json file\n!!!!!!!!!!!!!!")
        pause()
        story_fragments = read_list("text/story_fragments.json")
    return story_fragments
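# Illustrative example (note that word_tokenize counts punctuation as tokens):
# with FRAGMENT_LENGTH = 10, the sentences
#   ["It was night.", "The wind howled through the trees.", "Nobody moved."]
# become two fragments:
#   "It was night. The wind howled through the trees."  (11 tokens, closes the fragment)
#   "Nobody moved."                                      (leftover, appended at the end)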
def prompt_to_image(i, image_prompt, image_width, image_height):
    # clear the cuda cache
    with torch.no_grad():
        torch.cuda.empty_cache()
    # set parameters for the image
    seed = 1337
    positive_prompt_suffix = " (extremely detailed CG unity 8k wallpaper), nostalgia, professional majestic oil painting, trending on ArtStation, trending on CGSociety, Intricate, High Detail, Sharp focus, dramatic, by midjourney and greg rutkowski, realism, beautiful and detailed lighting, shadows, by Jeremy Lipking"
    negative_prompt = "disfigured, kitsch, ugly, oversaturated, grain, low-res, Deformed, blurry, bad anatomy, disfigured, poorly drawn face, mutation, mutated, extra limb, ugly, poorly drawn hands, missing limb, blurry, floating limbs, disconnected limbs, malformed hands, blur, out of focus, long neck, long body, ugly, disgusting, poorly drawn, childish, mutilated, mangled, old, surreal, text"
    model_id = "darkstorm2150/Protogen_v2.2_Official_Release"
    pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe = pipe.to("cuda")
    generator = torch.Generator("cuda").manual_seed(seed)
    # chunk the attention computation to reduce GPU memory usage
    pipe.enable_attention_slicing()
    prompt = image_prompt + positive_prompt_suffix
    image = pipe(prompt=prompt, negative_prompt=negative_prompt, height=image_height, width=image_width, guidance_scale=7.5, generator=generator, num_inference_steps=25).images[0]
    image.save(f"images/image{i}.jpg")
def createVideoClip(i, story_fragment):
    '''
    # create a gTTS instance and save the voiceover to a file
    tts = gTTS(text=story_fragment, lang='en', slow=False)
    tts.save(f"audio/voiceover{i}.mp3")
    # load the audio file using moviepy
    audio_clip = AudioFileClip(f"audio/voiceover{i}.mp3")
    audio_duration = audio_clip.duration
    '''
    # audio creation using coqui-ai/TTS
    # synthesize the voiceover and save it to a file
    outputs = syn.tts(story_fragment.replace('\n', ' '))
    syn.save_wav(outputs, f"audio/voiceover{i}.mp3")
    # load the audio file using moviepy
    audio_clip = AudioFileClip(f"audio/voiceover{i}.mp3")
    audio_duration = audio_clip.duration
    # load the image file using moviepy
    image_clip = ImageClip(f"images/image{i}.jpg").set_duration(audio_duration)
    # use moviepy to create a text clip from the story fragment
    # (uses the image_width / image_height globals set in __main__)
    screensize = (image_width, image_height)
    text_clip = TextClip(story_fragment, fontsize=35, font="Impact", color="black", stroke_color="white", stroke_width=1.5, size=screensize, method='caption', align="South")
    text_clip = text_clip.set_duration(audio_duration)
    # combine the audio, image, and text clips
    clip = image_clip.set_audio(audio_clip)
    video = CompositeVideoClip([clip, text_clip])
    # save the video clip to a file
    video.write_videofile(f"videos/video{i}.mp4", fps=24)
    print(f"{showTime()} The Video{i} Has Been Created Successfully!")
def askChatGPT(text, model_engine):
    # legacy OpenAI Completions endpoint (openai-python < 1.0)
    completions = openai.Completion.create(
        engine=model_engine,
        prompt=text,
        max_tokens=100,
        n=1,
        stop=None,
        temperature=0.9,
    )
    return completions.choices[0].text
def createListOfClips():
    clips = []
    l_files = os.listdir("videos")
    # sort the files numerically by the number embedded in each filename
    l_files.sort(key=lambda f: int(re.sub(r'\D', '', f)))
    for file in l_files:
        clip = VideoFileClip(f"videos/{file}")
        clips.append(clip)
    return clips
if __name__ == "__main__":
    print(f"{showTime()}")
    if DEBUG:
        pause()
    # create directories for text, audio, image and video files
    createFolders()
    # load the story and split it into sentences
    story_sentences_list = load_and_split_to_sentences("story3.txt")
    # group sentences into story fragments of a given minimum length
    story_fragments = sentences_to_fragments(story_sentences_list, FRAGMENT_LENGTH)
    # convert each story fragment into a prompt and use it to generate an image
    image_width = 1024
    image_height = 576
    image_prompts = []
    # for each story fragment
    for i, story_fragment in enumerate(story_fragments):
        print(f"{showTime()}")
        prefix = "Suggest good image to illustrate the following fragment from story, make description short and precise, one sentence, max 10 words: "
        # translate the fragment into an image prompt
        try:
            image_prompt = askChatGPT(prefix + story_fragment, model_engine).strip()
            print(i, image_prompt)
            image_prompts.append(image_prompt)
            write_list(image_prompts, "text/image_prompts.json")
            image_prompts = read_list("text/image_prompts.json")
        except Exception:
            print(f"{showTime()} Cannot connect with OpenAI servers. \nProbable cause: no Internet connection, invalid API token, too many calls in a short time")
            exit()
        # generate an image from the prompt
        prompt_to_image(i, image_prompt, image_width, image_height)
        # create a video clip using the story fragment and the generated image
        createVideoClip(i, story_fragment)
        # if DEBUG:
        #     pause()
    # create a sorted list of clips
    print(f"{showTime()} Fixing order of video clips")
    clips = createListOfClips()
    # add audio fades to prevent audio glitches when combining multiple clips
    clips = [clip.audio_fadein(0.05).audio_fadeout(0.05) for clip in clips]
    # combine all clips into the final video
    print(f"{showTime()} Concatenate all clips into final video...")
    final_video = concatenate_videoclips(clips, method="compose")
    final_video.write_videofile("final_video.mp4")
    print(f"{showTime()} Final video created successfully!")