diff --git a/README.md b/README.md index 630156c..a37aa0f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,11 @@ -This script takes an epub (or text file) and reads it to an mp3 or an m4b audiobook file, using TTS by https://github.com/coqui-ai/TTS +This script takes an epub (or text file) and reads it to an m4b audiobook file, using TTS by https://github.com/coqui-ai/TTS or OpenAI. The audio files are created in discrete chunks then transcribed using whisper speech-to-text. The transcription is compared to the original text, and if they don't match well it tries again. Finally, all silence longer than a second is removed from all audio segments, and the audio is cleaned up before being combined into an m4b audiobook file. I recognize this is not very user friendly, but I wanted to share in case folks thought it was useful. If there are a few more people than myself that find this is useful I will keep working on turning it into something that could be used by someone without dev experience. +**NOTE: BIG UPDATE for XTTS!** The Coqui team released v2 of their XTTS model and the quality is amazing! This latest release includes significant refactoring, and uses streaming inference for XTTS. Suggested usage is to include up to three wav file speaker samples, up to 30 seconds each. Check out the XTTS sample to get an idea of the quality you can expect. + +Example usage: `epub2tts my-book.epub --start 4 --end 20 --xtts shadow-1.wav,shadow-2.wav,shadow-3.wav` + **NOTE:** Now with [OpenAI TTS](https://platform.openai.com/docs/guides/text-to-speech) support! It's not free, but the average cost for a few books I tested was around $7. If you use `--openai <API key>` flag epub2tts will provide a cost estimate and prompt you to approve before continuing. **NOTE:** HUGE thanks to a recent PR from [wonka929](https://github.com/wonka929), epub2tts now recognizes when a CUDA GPU is available and will use it automatically. In a brief test I did, the speedup was incredible! 
@@ -13,17 +17,13 @@ Usage: TEXT: `epub2tts my-book.txt` - URL: `epub2tts --url https://www.example.com/page --name example-page` - -To use Coqui XTTS, add: `--xtts <sample.wav>` (GPU absolutely required, and even then it's slow but sounds amazing!) +To use Coqui XTTS, add: `--xtts <sample-1.wav>,<sample-2.wav>,<sample-3.wav>` (GPU required, slow but sounds amazing!) To use OpenAI TTS, add: `--openai <API key>` (Use speaker option to specify voice other than onyx: `--speaker shimmer`) -To change speaker (ex p307 for a good male voice), add: `--speaker p307` - -To output in mp3 format instead of m4b, add: `--mp3` +To change speaker (ex p307 for a good male voice w/Coqui TTS), add: `--speaker p307` -To skip reading any links, add: `--skip-links` +To skip reading any links, add: `--skiplinks` Using `--scan` will list excerpts of each chapter, then exit. This is helpful for finding which chapter to start and end on if you want to skip bibliography, TOC, etc. diff --git a/epub2tts.py b/epub2tts.py index 64a6d2d..aefa768 100644 --- a/epub2tts.py +++ b/epub2tts.py @@ -1,393 +1,375 @@ -# Inspired by this medium article: -# https://medium.com/@zazazakaria18/turn-your-ebook-to-text-with-python-in-seconds-2a1e42804913 -# and this post which just cleaned up what was in the medium article: -# https://xwiki.recursos.uoc.edu/wiki/mat00001ca/view/Research%20on%20Translation%20Technologies/Working%20with%20PDF%20files%20using%20Python/ -# -# Usage: `epub2tts my-book.epub` -# To change speaker (ex p307 for a good male voice), add: `--speaker p307` -# To output in mp3 format instead of m4b, add: `--mp3` -# To skip reading any links, add: `--skip-links` -# Using `--scan` will list excerpts of each chapter, then exit. This is helpful -# for finding which chapter to start and end on if you want to skip bibliography, TOC, etc. 
-# To specify which chapter to start on (ex 3): `--start 3` -# To specify which chapter to end on (ex 20): `--end 20` -# To specify bitrate: --bitrate 30k -# Output will be an m4b or mp3 with each chapter read by Coqui TTS: https://github.com/coqui-ai/TTS - +import argparse import os -import requests +import re import string import subprocess import sys import time +import warnings import wave - from bs4 import BeautifulSoup import ebooklib from ebooklib import epub +from fuzzywuzzy import fuzz from newspaper import Article +import noisereduce +from openai import OpenAI +from pedalboard import Pedalboard, Compressor, Gain, NoiseGate, LowShelfFilter +from pedalboard.io import AudioFile from pydub import AudioSegment +from pydub.silence import split_on_silence import pysbd -from TTS.api import TTS +import requests import torch, gc -from openai import OpenAI - - -# Verify if CUDA or mps is available and select it -if torch.cuda.is_available(): - device = "cuda" -#except mps doesn't work right for this yet :( -#elif torch.backends.mps.is_available(): -# device = "mps" -else: - device = "cpu" -print(f"Using device: {device}") - -blacklist = ['[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script'] -ffmetadatafile = "FFMETADATAFILE" - -usage = """ -Usage: - EPUB: epub2tts my-book.epub - TEXT: epub2tts my-book.txt - URL: epub2tts --url https://www.example.com/page --name example-page - -Adding --scan will list excerpts of each chapter, then exit. This is -helpful for finding which chapter to start and end on if you want to -skip TOC, bibliography, etc. - -To use Coqui XTTS, add: --xtts (GPU absolutely required, and even then it's slow but sounds amazing!) 
-To use OpenAI TTS, add: --openai (Use speaker option to specify voice other than onyx: `--speaker shimmer`) -To change speaker (ex p307 for a good male voice), add: --speaker p307 -To output in mp3 format instead of m4b, add: --mp3 -To skip reading any links, add: --skip-links -To specify which chapter to start on (ex 3): --start 3 -To specify which chapter to end on (ex 20): --end 20 -To specify bitrate (ex 30k): --bitrate 30k -""" - -def chap2text(chap): - output = '' - soup = BeautifulSoup(chap, 'html.parser') - if "--skip-links" in sys.argv: - # Remove everything that is an href +import torchaudio +from TTS.api import TTS +from TTS.tts.configs.xtts_config import XttsConfig +from TTS.tts.models.xtts import Xtts +from TTS.utils.generic_utils import get_user_data_dir +from tqdm import tqdm +import whisper + + +class EpubToAudiobook: + def __init__(self, source, start, end, skiplinks, engine, minratio, debug): + self.source = source + self.bookname = os.path.splitext(os.path.basename(source))[0] + self.start = start - 1 + self.end = end + self.skiplinks = skiplinks + self.engine = engine + self.minratio = minratio + self.debug = debug + self.output_filename = self.bookname + ".m4b" + self.chapters = [] + self.chapters_to_read = [] + if source.endswith('.epub'): + self.book = epub.read_epub(source) + self.sourcetype = 'epub' + elif source.endswith('.txt'): + self.sourcetype = 'txt' + else: + print("Can only handle epub or txt as source.") + sys.exit() + self.tts_dir = str(get_user_data_dir("tts")) + self.xtts_model = self.tts_dir + "/tts_models--multilingual--multi-dataset--xtts_v2" + self.whispermodel = whisper.load_model("tiny") + self.ffmetadatafile = "FFMETADATAFILE" + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + def generate_metadata(self, files, title, author): + chap = 1 + start_time = 0 + with open(self.ffmetadatafile, "w") as file: + file.write(";FFMETADATA1\n") + file.write("ARTIST=" + str(author) + "\n") + 
file.write("ALBUM=" + str(title) + "\n") + for file_name in files: + duration = self.get_wav_duration(file_name) + file.write("[CHAPTER]\n") + file.write("TIMEBASE=1/1000\n") + file.write("START=" + str(start_time) + "\n") + file.write("END=" + str(start_time + duration) + "\n") + file.write("title=Part " + str(chap) + "\n") + chap += 1 + start_time += duration + + def get_wav_duration(self, file_path): + with wave.open(file_path, 'rb') as wav_file: + num_frames = wav_file.getnframes() + frame_rate = wav_file.getframerate() + duration = num_frames / frame_rate + duration_milliseconds = duration * 1000 + return int(duration_milliseconds) + + def get_length(self, start, end, chapters_to_read): + total_chars = 0 + for i in range(start, end): + total_chars += len(chapters_to_read[i]) + return (total_chars) + + def chap2text(self, chap): + blacklist = ['[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script'] + output = '' + soup = BeautifulSoup(chap, 'html.parser') + if self.skiplinks: + # Remove everything that is an href + for a in soup.findAll('a', href=True): + a.extract() + # Always skip reading links that are just a number (footnotes) for a in soup.findAll('a', href=True): - a.extract() - #Always skip reading links that are just a number (footnotes) - for a in soup.findAll('a', href=True): - if a.text.isdigit(): - a.extract() - text = soup.find_all(string=True) - for t in text: - if t.parent.name not in blacklist: - output += '{} '.format(t) - return output - - -def get_wav_duration(file_path): - with wave.open(file_path, 'rb') as wav_file: - num_frames = wav_file.getnframes() - frame_rate = wav_file.getframerate() - duration = num_frames / frame_rate - duration_milliseconds = duration * 1000 - return int(duration_milliseconds) - - -def gen_ffmetadata(files, title, author): - chap = 1 - start_time = 0 - with open(ffmetadatafile, "w") as file: - file.write(";FFMETADATA1\n") - file.write("ARTIST=" + str(author) + "\n") - file.write("ALBUM=" + 
str(title) + "\n") - for file_name in files: - duration = get_wav_duration(file_name) - file.write("[CHAPTER]\n") - file.write("TIMEBASE=1/1000\n") - file.write("START=" + str(start_time) + "\n") - file.write("END=" + str(start_time + duration) + "\n") - file.write("title=Part " + str(chap) + "\n") - chap += 1 - start_time += duration - -def get_bookname(): - bookname = '' - for i, arg in enumerate(sys.argv): - if arg.endswith('.txt') or arg.endswith('.epub'): - bookname = arg - if ("--url" in sys.argv) and ("--name" in sys.argv): - index = sys.argv.index("--name") - bookname = sys.argv[index + 1] + ".url" - if len(bookname) > 0: - print(f"Book filename: {bookname}") - return(bookname) - elif ("--url" in sys.argv) and ("--name" in sys.argv): - return(".url") - else: - print(usage) - sys.exit() - -def get_url(): - index = sys.argv.index("--url") - url = sys.argv[index + 1] - return(url) - -def get_speaker(): - if "--speaker" in sys.argv: - index = sys.argv.index("--speaker") - speaker_used = sys.argv[index + 1] - elif "--openai" in sys.argv: - speaker_used = "onyx" - elif "--xtts" in sys.argv: - speaker_used = "xtts" - else: - speaker_used = "p335" - print(f"Speaker: {speaker_used}") - return(speaker_used) - -def get_bitrate(): - if "--bitrate" in sys.argv: - index = sys.argv.index("--bitrate") - bitrate = sys.argv[index + 1] - else: - bitrate = "69k" - print(f"Bitrate: {bitrate}") - return(bitrate) - -def get_chapters_epub(book, bookname): - chapters = [] - for item in book.get_items(): - if item.get_type() == ebooklib.ITEM_DOCUMENT: - chapters.append(item.get_content()) - - chapters_to_read = [] - for i in range(len(chapters)): - #strip some characters that might have caused TTS to choke - text = chap2text(chapters[i]) - text = text.replace("—", ", ") - allowed_chars = string.ascii_letters + string.digits + "-,.!? 
'" - text = ''.join(c for c in text if c in allowed_chars) - if len(text) < 150: - #too short to bother with - continue - outputwav = str(i)+"-"+bookname.split(".")[0]+".wav" - print(outputwav + " Length: " + str(len(text))) - print("Part: " + str(len(chapters_to_read)+1)) + if a.text.isdigit(): + a.extract() + text = soup.find_all(string=True) + for t in text: + if t.parent.name not in blacklist: + output += '{} '.format(t) + return output + + def get_chapters_epub(self): + for item in self.book.get_items(): + if item.get_type() == ebooklib.ITEM_DOCUMENT: + self.chapters.append(item.get_content()) + + for i in range(len(self.chapters)): + #strip some characters that might have caused TTS to choke + text = self.chap2text(self.chapters[i]) + text = text.replace("—", ", ").replace("--", ", ").replace(";", ", ").replace(":", ", ").replace("''", ", ") + allowed_chars = string.ascii_letters + string.digits + "-,.!? " + text = ''.join(c for c in text if c in allowed_chars) + if len(text) < 150: + #too short to bother with + continue + print("Length: " + str(len(text))) + print("Part: " + str(len(self.chapters_to_read) + 1)) + print(text[:256]) + self.chapters_to_read.append(text) # append the last piece of text (shorter than max_len) + print("Number of chapters to read: " + str(len(self.chapters_to_read))) + if self.end == 999: + self.end = len(self.chapters_to_read) + + def get_chapters_text(self): + with open(self.source, 'r') as file: + text = file.read() print(text[:256]) - chapters_to_read.append(text) # append the last piece of text (shorter than max_len) - print("Number of chapters to read: " + str(len(chapters_to_read))) - if "--scan" in sys.argv: - sys.exit() - return(chapters_to_read) - -def get_chapters_text(text): - chapters_to_read = [] - max_len = 50000 - while len(text) > max_len: - pos = text.rfind(' ', 0, max_len) # find the last space within the limit - chapters_to_read.append(text[:pos]) - print("Part: " + str(len(chapters_to_read))) - 
print(str(chapters_to_read[-1])[:256]) - text = text[pos+1:] # +1 to avoid starting the next chapter with a space - chapters_to_read.append(text) - return(chapters_to_read) - -def get_text(bookname): - with open(bookname, 'r') as file: - text = file.read() - return(text) - -def get_url_text(url): - article = Article(url) - article.download() - article.parse() - return(article.text) - -def get_length(start, end, chapters_to_read): - total_chars = 0 - for i in range(start, end): - total_chars += len(chapters_to_read[i]) - return(total_chars) - -def get_start(): -# There are definitely better ways to handle arguments, this should be fixed - if "--start" in sys.argv: - start = int(sys.argv[sys.argv.index("--start") + 1]) - 1 - else: - start = 0 - return(start) - -def get_end(chapters_to_read): -# There are definitely better ways to handle arguments, this should be fixed - if "--end" in sys.argv: - end = int(sys.argv[sys.argv.index("--end") + 1]) - else: - end = len(chapters_to_read) - return(end) - -def get_api_key(): - if "--openai" in sys.argv: - key = str(sys.argv[sys.argv.index("--openai") + 1]) - else: - key = '' - print(key) - return(key) - -def combine_sentences(sentences, length=3500): - combined = "" - for sentence in sentences: - if len(combined) + len(sentence) <= length: - combined += sentence + " " - else: - yield combined - combined = sentence - yield combined - -def main(): - if "--xtts" in sys.argv: - model_name = "tts_models/multilingual/multi-dataset/xtts_v2" - index = sys.argv.index("--xtts") - speaker_wav = sys.argv[index + 1] - else: - model_name = "tts_models/en/vctk/vits" - bookname = get_bookname() #detect .txt, .epub or https - booktype = bookname.split('.')[-1] - speaker_used = get_speaker() - openai_api_key = get_api_key() - if booktype == "epub": - book = epub.read_epub(bookname) - chapters_to_read = get_chapters_epub(book, bookname) - elif booktype == "txt": - print("Detected TEXT for file type, --scan, --start and --end will be ignored") - 
text = get_text(bookname) - chapters_to_read = get_chapters_text(text) - elif booktype == "url": - print("Detected URL for file type, --scan, --start and --end will be ignored") - url = get_url() - text = get_url_text(url) - print("Name: " + bookname) - print(text) - while True: - user_input = input("Look good, continue? (y/n): ") - if user_input.lower() not in ['y', 'n']: - print("Invalid input. Please enter y for yes or n for no.") - elif user_input.lower() == 'n': - sys.exit() + self.chapters_to_read.append(text) + self.end = len(self.chapters_to_read) + + def read_chunk_xtts(self, sentences, wav_file_path): + #takes list of sentences to read, reads through them and saves to wave file + t0 = time.time() + wav_chunks = [] + segmenter = pysbd.Segmenter(language="en", clean=True) + sentence_list = segmenter.segment(sentences) + for i, sentence in enumerate(sentence_list): + # Run TTS for each sentence + print(sentence) if self.debug else None + chunks = self.model.inference_stream( + sentence, + "en", + self.gpt_cond_latent, + self.speaker_embedding, + stream_chunk_size=60, + temperature=0.60, + repetition_penalty=10.0, + enable_text_splitting=True + ) + for j, chunk in enumerate(chunks): + if i == 0: + print(f"Time to first chunck: {time.time() - t0}") if self.debug else None + print(f"Received chunk {i} of audio length {chunk.shape[-1]}") if self.debug else None + wav_chunks.append(chunk.to(device=self.device)) # Move chunk to available device + # Add a short pause between sentences (e.g., X.XX seconds of silence) + if i < len(sentence_list) - 1: + silence_duration = int(24000 * 1.0) + silence = torch.zeros((silence_duration,), dtype=torch.float32, + device=self.device) # Move silence tensor to available device + wav_chunks.append(silence) + wav = torch.cat(wav_chunks, dim=0) + torchaudio.save(wav_file_path, wav.squeeze().unsqueeze(0).cpu(), 24000) + with AudioFile(wav_file_path).resampled_to(24000) as f: + audio = f.read(f.frames) + reduced_noise = 
noisereduce.reduce_noise(y=audio, sr=24000, stationary=True, prop_decrease=0.75) + board = Pedalboard([ + NoiseGate(threshold_db=-30, ratio=1.5, release_ms=250), + Compressor(threshold_db=12, ratio=2.5), + LowShelfFilter(cutoff_frequency_hz=400, gain_db=5, q=1), + Gain(gain_db=0) + ]) + result = board(reduced_noise, 24000) + with AudioFile(wav_file_path, 'w', 24000, result.shape[0]) as f: + f.write(result) + + def compare(self, text, wavfile): + result = self.whispermodel.transcribe(wavfile) + text = re.sub(' +', ' ', text).lower().strip() + ratio = fuzz.ratio(text, result["text"].lower()) + print("Transcript: " + result["text"].lower()) if self.debug else None + print("Text to transcript comparison ratio: " + str(ratio)) if self.debug else None + return (ratio) + + def combine_sentences(self, sentences, length=1000): + combined = "" + for sentence in sentences: + if len(combined) + len(sentence) <= length: + combined += sentence + " " else: - print("Continuing...") - break - chapters_to_read = get_chapters_text(text) - start = get_start() - end = get_end(chapters_to_read) - total_chars = get_length(start, end, chapters_to_read) - print("Total characters: " + str(total_chars)) - if "--openai" in sys.argv: - while True: - openai_sdcost = (total_chars/1000) * 0.015 - print("OpenAI TTS SD Cost: $" + str(openai_sdcost)) - user_input = input("This will not be free, continue? (y/n): ") - if user_input.lower() not in ['y', 'n']: - print("Invalid input. 
Please enter y for yes or n for no.") - elif user_input.lower() == 'n': - sys.exit() - else: - print("Continuing...") - break - files = [] - position = 0 - start_time = time.time() - if "--openai" in sys.argv: - client = OpenAI(api_key=openai_api_key) - else: - tts = TTS(model_name).to(device) - - for i in range(start, end): - outputwav = bookname.split(".")[0]+"-"+str(i+1)+".wav" - print("Reading " + str(i)) - if os.path.isfile(outputwav): - print(outputwav + " exists, skipping to next chapter") + yield combined + combined = sentence + yield combined + + def read_book(self, voice_samples, engine, openai, model_name, speaker, bitrate): + self.model_name = model_name + self.openai = openai + if engine == 'xtts': + self.voice_samples = voice_samples.split(",") + voice_name = "-" + re.split('-|\d+|\.', self.voice_samples[0])[0] + elif engine == 'openai': + if speaker == 'p335': + speaker = 'onyx' + voice_name = "-" + speaker else: - if "--openai" in sys.argv: + voice_name = "-" + speaker + self.output_filename = re.sub('.m4b', voice_name + ".m4b", self.output_filename) + print("Saving to " + self.output_filename) + total_chars = self.get_length(self.start, self.end, self.chapters_to_read) + print("Total characters: " + str(total_chars)) + if engine == "xtts": + print("Loading model: " + self.xtts_model) + config = XttsConfig() + model_json = self.xtts_model + "/config.json" + config.load_json(model_json) + self.model = Xtts.init_from_config(config) + self.model.load_checkpoint(config, + checkpoint_dir=self.xtts_model, + use_deepspeed=False) + self.model.cuda() + print("Computing speaker latents...") + self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents( + audio_path=self.voice_samples) + elif engine == "openai": + while True: + openai_sdcost = (total_chars/1000) * 0.015 + print("OpenAI TTS SD Cost: $" + str(openai_sdcost)) + user_input = input("This will not be free, continue? 
(y/n): ") + if user_input.lower() not in ['y', 'n']: + print("Invalid input. Please enter y for yes or n for no.") + elif user_input.lower() == 'n': + sys.exit() + else: + print("Continuing...") + break + client = OpenAI(api_key=self.openai) + else: + print("Engine is TTS, model is " + model_name) + self.tts = TTS(model_name).to(self.device) + + files = [] + position = 0 + start_time = time.time() + print("Reading from " + str(self.start) + " to " + str(self.end)) + for i in range(self.start, self.end): + outputwav = self.bookname + "-" + str(i+1) + ".wav" + if os.path.isfile(outputwav): + print(outputwav + " exists, skipping to next chapter") + else: + #print("Debug is " + str(self.debug)) tempfiles = [] segmenter = pysbd.Segmenter(language="en", clean=True) - sentences = segmenter.segment(chapters_to_read[i]) - sentence_groups = list(combine_sentences(sentences)) - for x in range(len(sentence_groups)): - tempwav = "temp" + str(x) + ".mp3" - print(sentence_groups[x]) - response = client.audio.speech.create( model="tts-1", voice=speaker_used, input=sentence_groups[x]) - response.stream_to_file(tempwav) + sentences = segmenter.segment(self.chapters_to_read[i]) + sentence_groups = list(self.combine_sentences(sentences)) + for x in tqdm(range(len(sentence_groups))): + retries = 1 + tempwav = "temp" + str(x) + ".wav" + if os.path.isfile(tempwav): + print(tempwav + " exists, skipping to next chunk") + else: + while retries > 0: + try: + if engine == "xtts": + self.read_chunk_xtts(sentence_groups[x], tempwav) + elif engine == "openai": + response = client.audio.speech.create( model="tts-1", voice=speaker, input=sentence_groups[x]) + response.stream_to_file(tempwav) + elif engine == "tts": + if model_name == 'tts_models/en/vctk/vits': + #assume we're using a multi-speaker model + self.tts.tts_to_file(text = sentence_groups[x], speaker = speaker, file_path = tempwav) + else: + self.tts.tts_to_file(text = sentence_groups[x], file_path = tempwav) + ratio = 
self.compare(sentence_groups[x], tempwav) + if ratio < self.minratio: + raise Exception("Spoken text did not sound right - " +str(ratio)) + break + except Exception as e: + retries -= 1 + print(f"Error: {str(e)} ... Retrying ({retries} retries left)") + if retries == 0: + print("Something is wrong with the audio (" + str(ratio) + "): " + tempwav) + #sys.exit() tempfiles.append(tempwav) tempwavfiles = [AudioSegment.from_mp3(f"{f}") for f in tempfiles] concatenated = sum(tempwavfiles) concatenated.export(outputwav, format="wav") for f in tempfiles: os.remove(f) - else: - if "--xtts" in sys.argv: -#look at all this disgusting duplicated code! FIX IT!!! - tempfiles = [] - segmenter = pysbd.Segmenter(language="en", clean=True) - sentences = segmenter.segment(chapters_to_read[i]) - sentence_groups = list(combine_sentences(sentences, 1000)) - for x in range(len(sentence_groups)): - tempwav = "temp" + str(x) + ".wav" - tts.tts_to_file(text=sentence_groups[x], speaker_wav = speaker_wav, file_path=tempwav, language="en") - tempfiles.append(tempwav) - tempwavfiles = [AudioSegment.from_mp3(f"{f}") for f in tempfiles] - concatenated = sum(tempwavfiles) - concatenated.export(outputwav, format="wav") - for f in tempfiles: - os.remove(f) - - else: - tts.tts_to_file(text = chapters_to_read[i], speaker = speaker_used, file_path = outputwav) - - - files.append(outputwav) - position += len(chapters_to_read[i]) - percentage = (position / total_chars) *100 - print(f"{percentage:.2f}% spoken so far.") - elapsed_time = time.time() - start_time - chars_remaining = total_chars - position - estimated_total_time = elapsed_time / position * total_chars - estimated_time_remaining = estimated_total_time - elapsed_time - print(f"Elapsed: {int(elapsed_time / 60)} minutes, ETA: {int((estimated_time_remaining) / 60)} minutes") - - # Clean GPU cache to have it all available for next step - if device == 'cuda': + files.append(outputwav) + position += len(self.chapters_to_read[i]) + percentage = 
(position / total_chars) * 100 + print(f"{percentage:.2f}% spoken so far.") + elapsed_time = time.time() - start_time + chars_remaining = total_chars - position + estimated_total_time = elapsed_time / position * total_chars + estimated_time_remaining = estimated_total_time - elapsed_time + print(f"Elapsed: {int(elapsed_time / 60)} minutes, ETA: {int((estimated_time_remaining) / 60)} minutes") gc.collect() torch.cuda.empty_cache() - else: - pass - - - #Load all WAV files and concatenate into one object - wav_files = [AudioSegment.from_wav(f"{f}") for f in files] - concatenated = sum(wav_files) - if "--mp3" in sys.argv: - outputmp3 = bookname.split(".")[0]+"-"+speaker_used+".mp3" - concatenated.export(outputmp3, format="mp3", parameters=["-write_xing", "0", "-filter:a", "speechnorm=e=6.25:r=0.00001:l=1"]) - else: - outputm4a = bookname.split(".")[0]+"-"+speaker_used+".m4a" - outputm4b = outputm4a.replace("m4a", "m4b") - bitrate = get_bitrate() + # Load all WAV files and concatenate into one object + wav_files = [AudioSegment.from_wav(f"{f}") for f in files] + one_sec_silence = AudioSegment.silent(duration=1000) + concatenated = AudioSegment.empty() + for audio in wav_files: + # Split audio into chunks where detected silence is longer than one second + chunks = split_on_silence(audio, min_silence_len=1000, silence_thresh=-50) + # Iterate through each chunk + for i, chunk in enumerate(tqdm(chunks)): + concatenated += chunk + concatenated += one_sec_silence + outputm4a = self.output_filename.replace("m4b", "m4a") concatenated.export(outputm4a, format="ipod", bitrate=bitrate) - if booktype == 'epub': - author = book.get_metadata('DC', 'creator')[0][0] - title = book.get_metadata('DC', 'title')[0][0] + if self.sourcetype == 'epub': + author = self.book.get_metadata('DC', 'creator')[0][0] + title = self.book.get_metadata('DC', 'title')[0][0] else: author = "Unknown" - title = bookname - gen_ffmetadata(files, title, author) - ffmpeg_command = 
["ffmpeg","-i",outputm4a,"-i",ffmetadatafile,"-map_metadata","1","-codec","copy",outputm4b] + title = self.bookname + self.generate_metadata(files, title, author) + ffmpeg_command = ["ffmpeg","-i",outputm4a,"-i",self.ffmetadatafile,"-map_metadata","1","-codec","copy",self.output_filename] subprocess.run(ffmpeg_command) - os.remove(ffmetadatafile) + os.remove(self.ffmetadatafile) os.remove(outputm4a) - #cleanup, delete the wav files we no longer need - for f in files: - os.remove(f) + for f in files: + os.remove(f) + print(self.output_filename + " complete") + +def main(): + warnings.filterwarnings("ignore", category=UserWarning) + parser = argparse.ArgumentParser( + prog='EpubToAudiobook', + description='Read an epub (or other source) to audiobook format') + parser.add_argument('sourcefile', type=str, help='The epub or text file to process') + parser.add_argument('--engine', type=str, default='tts', nargs='?', const='tts', help='Which TTS to use [tts|xtts|openai]') + parser.add_argument('--xtts', type=str, nargs='?', const="zzz", default="zzz", help='Sample wave file(s) for XTTS training separated by commas') + parser.add_argument('--openai', type=str, nargs='?', const="zzz", default="zzz", help='OpenAI API key if engine is OpenAI') + parser.add_argument('--model', type=str, nargs='?', const='tts_models/en/vctk/vits', default='tts_models/en/vctk/vits', help='TTS model to use, default: tts_models/en/vctk/vits') + parser.add_argument('--speaker', type=str, default='p335', nargs='?', const='p335', help='Speaker to use (ex p335 for VITS, or onyx for OpenAI)') + parser.add_argument("--scan", action='store_true', help='Scan the epub to show beginning of chapters, then exit') + parser.add_argument('--start', type=int, nargs='?', const=1, default=1, help='Chapter/part to start from') + parser.add_argument('--end', type=int, nargs='?', const=999, default=999, help='Chapter/part to end with') + parser.add_argument('--minratio', type=int, nargs='?', const=88, default=88, 
help='Minimum match ratio between text and transcript') + parser.add_argument('--skiplinks', action='store_true', help='Skip reading any HTML links') + parser.add_argument('--bitrate', type=str, nargs='?', const="69k", default="69k", help="Specify bitrate for output file") + parser.add_argument('--debug', action='store_true', help='Enable debug output') + args = parser.parse_args() + print(args) + + if args.openai != "zzz": + args.engine = "openai" + if args.xtts != "zzz": + args.engine = "xtts" + mybook = EpubToAudiobook(source=args.sourcefile, start=args.start, end=args.end, skiplinks=args.skiplinks, engine=args.engine, minratio=args.minratio, debug=args.debug) + if mybook.sourcetype == 'epub': + mybook.get_chapters_epub() + else: + mybook.get_chapters_text() + if args.scan: + sys.exit() + mybook.read_book(voice_samples=args.xtts, engine=args.engine, openai=args.openai, model_name=args.model, speaker=args.speaker, bitrate=args.bitrate) + if __name__ == '__main__': main() diff --git a/requirements.txt b/requirements.txt index bb31778..3477d0c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,16 @@ -TTS -ebooklib beautifulsoup4 +ebooklib +fuzzywuzzy +newspaper3k +noisereduce openai +openai-whisper +pedalboard pydub pysbd -newspaper3k \ No newline at end of file +python-Levenshtein +requests +torch +torchaudio +TTS +tqdm diff --git a/sample-onyx-openai.m4b b/sample-onyx-openai.m4b new file mode 100644 index 0000000..37bb752 Binary files /dev/null and b/sample-onyx-openai.m4b differ diff --git a/sample-p307-coquiTTS.m4b b/sample-p307-coquiTTS.m4b new file mode 100644 index 0000000..c6397c0 Binary files /dev/null and b/sample-p307-coquiTTS.m4b differ diff --git a/sample-p335-coquiTTS.m4b b/sample-p335-coquiTTS.m4b new file mode 100644 index 0000000..0683d67 Binary files /dev/null and b/sample-p335-coquiTTS.m4b differ diff --git a/sample-shadow-coquiXTTS.m4b b/sample-shadow-coquiXTTS.m4b new file mode 100644 index 0000000..f314bb8 Binary files /dev/null 
and b/sample-shadow-coquiXTTS.m4b differ diff --git a/sample.txt b/sample.txt new file mode 100644 index 0000000..84c3df4 --- /dev/null +++ b/sample.txt @@ -0,0 +1,11 @@ +This script takes an epub (or text file) and reads it to an m4b audiobook file, +using TTS by Coqui or OpenAI. The audiofiles are created in discrete chunks then +transcribed using whisper speech-to-text. The transcription is compared to the +original text, and if they don't match well it tries again. Finally all silence +longer than a second is removed from all audio segments, and the audio is cleaned +up before being combined into an m4b audiobook file. + +I recognize this is not very user friendly, but I wanted to share in case folks +thought it was useful. If there are a few more people than myself that find this +is useful I will keep working on turning it into something that could be used by +someone without dev experience. diff --git a/setup.py b/setup.py index 2d6c0d0..c39ecf0 100644 --- a/setup.py +++ b/setup.py @@ -5,12 +5,12 @@ setup( name='epub2tts', - description='Tool to read an epub to mp3 using TTS', - author='Christopher Aedo', + description='Tool to read an epub to audiobook using AI TTS', + author='Christopher Aedo linkedin.com/in/aedo', author_email='doc@aedo.net', url='https://github.com/aedocw/epub2tts', license='Apache License, Version 2.0', - version='1.5.0', + version='2.0.0', packages=find_packages(), install_requires=requirements, py_modules=['epub2tts'],