Skip to content

Commit

Permalink
Merge pull request #12 from sul-dlss/google
Browse files Browse the repository at this point in the history
Google transcription
  • Loading branch information
edsu authored Apr 9, 2024
2 parents b36fe90 + ac4cc7c commit 4254b07
Show file tree
Hide file tree
Showing 52 changed files with 129 additions and 25 deletions.
6 changes: 5 additions & 1 deletion env-example
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_TRANSCRIBE_S3_BUCKET="sul-transcription-test"
AWS_TRANSCRIBE_S3_BUCKET=sul-transcription-test

GOOGLE_APPLICATION_CREDENTIALS=/path/to/google/service-account/credentials.json
GOOGLE_CLOUD_PROJECT_ID=sul-ai-sandbox
GOOGLE_TRANSCRIBE_GCS_BUCKET=sul-dlss-transcription-test

# Depending on your AWS credentials you may need these too:
#
Expand Down
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
black
boto3
google-cloud-speech
google-cloud-storage
jiwer
jupyterlab
matplotlib
numpy
openai-whisper
pandas
pydub
pytest
python-dotenv
Expand Down
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file.
Empty file.
6 changes: 3 additions & 3 deletions test/test_aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_transcript():
result = aws.transcribe(path.join(TEST_DATA, "en.wav"))
assert result == {
"language": "en-US",
"transcript": "This is a test for whisper reading in English.",
"text": "This is a test for whisper reading in English.",
}


Expand All @@ -25,7 +25,7 @@ def test_transcript_with_silence():
result = aws.transcribe(path.join(TEST_DATA, "en-with-silence.wav"))
assert result == {
"language": "en-US",
"transcript": "This is a test for whisper reading in English.",
"text": "This is a test for whisper reading in English.",
}


Expand All @@ -34,5 +34,5 @@ def test_transcript_fr():
result = aws.transcribe(path.join(TEST_DATA, "fr.wav"))
assert result == {
"language": "fr-FR",
"transcript": "Il s'agit d'un test de lecture de Whisper en français.",
"text": "Il s'agit d'un test de lecture de Whisper en français.",
}
22 changes: 18 additions & 4 deletions test/test_google.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,27 @@

dotenv.load_dotenv()

NO_GOOGLE = os.environ.get("GOOGLE") is None
NO_GOOGLE = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") is None


@mark.skipif(NO_GOOGLE, reason="no Google keys")
def test_transcript():
    """Transcribing the English sample returns the expected language and text."""
    expected = {
        "language": "en-us",
        "text": "this is a test for whisper reading in English",
    }
    assert google.transcribe("test/data/en.wav") == expected


@mark.skipif(NO_GOOGLE, reason="no Google keys")
def test_copy_file():
    """copy_file returns the gs:// URI of the uploaded file in the configured bucket."""
    # Build the expectation from the same environment variable copy_file
    # reads, so the test is not tied to one developer's bucket name.
    bucket_name = os.environ["GOOGLE_TRANSCRIBE_GCS_BUCKET"]
    assert google.copy_file("test/data/en.wav") == f"gs://{bucket_name}/en.wav"


def test_convert_to_wav():
    """convert_to_wav produces a non-empty file whose name ends in .wav."""
    path = google.convert_to_wav("test/data/en.wav")
    try:
        assert path.endswith(".wav")
        assert os.path.getsize(path) > 0
    finally:
        # Don't leave the converted file behind in the temp directory.
        if os.path.exists(path):
            os.remove(path)
11 changes: 8 additions & 3 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@


def test_get_files():
    """get_files finds the three media files in the test bags, in a fixed order."""
    found = utils.get_files(path.join(TEST_DATA, "bags"))
    assert len(found) == 3

    names = [path.basename(media_file) for media_file in found]
    assert names == [
        "bb158br2509_sl.m4a",
        "gj097zq7635_a_sl.m4a",
        "gk220dt2833_Ali_Shan_10of10_sl.mp4",
    ]


def test_get_reference_file():
files = sorted(utils.get_files(path.join(TEST_DATA, "bags")))
files = utils.get_files(path.join(TEST_DATA, "bags"))
assert (
path.basename(utils.get_reference_file(files[0], "en"))
== "bb158br2509_script.txt"
Expand All @@ -22,6 +23,10 @@ def test_get_reference_file():
path.basename(utils.get_reference_file(files[1], "en"))
== "gj097zq7635_a_sl_script.txt"
)
assert (
path.basename(utils.get_reference_file(files[2], "en"))
== "gk220dt2833_Ali_Shan_10of10_sl_script.txt"
)


def test_compare_transcripts():
Expand Down
4 changes: 2 additions & 2 deletions transcribe/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def run(bags_dir, output_dir):

reference = utils.get_reference(file, transcription["language"])

result = utils.compare_transcripts(reference, transcription["transcript"])
result = utils.compare_transcripts(reference, transcription["text"])
result["language"] = transcription["language"]
result["file"] = os.path.basename(file)
result["runtime"] = runtime
Expand Down Expand Up @@ -66,7 +66,7 @@ def transcribe(media_file):
# return the detected language and the transcript
return {
"language": results["results"]["language_code"],
"transcript": results["results"]["transcripts"][0]["transcript"],
"text": results["results"]["transcripts"][0]["transcript"],
}


Expand Down
93 changes: 84 additions & 9 deletions transcribe/google.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,100 @@
import datetime
import json
import logging
import os
import subprocess
import tempfile
from collections import Counter

import tqdm
from google.api_core.exceptions import NotFound
from google.cloud import speech, storage

from . import utils


def run(bags_dir, output_dir):
    """Transcribe every media file under *bags_dir* with Google Speech-to-Text.

    For each file returned by ``utils.get_files`` the transcription is written
    to ``<output_dir>/<media file name>-google.json`` and compared against the
    reference transcript. All comparison results are written to
    ``<output_dir>/report-google.csv``.
    """
    results = []
    for file in tqdm.tqdm(utils.get_files(bags_dir)):
        logging.info(f"running google speech-to-text with {file}")

        start_time = datetime.datetime.now()
        transcription = transcribe(file)
        runtime = utils.get_runtime(start_time)

        # ensure_ascii=False emits non-ASCII characters verbatim, so the file
        # encoding must not depend on the platform default.
        json_path = os.path.join(output_dir, f"{os.path.basename(file)}-google.json")
        with open(json_path, "w", encoding="utf-8") as fh:
            json.dump(transcription, fh, ensure_ascii=False)

        # utils.get_reference_file only returns the *path* of the reference
        # script; load the reference the same way transcribe/aws.py does.
        # NOTE(review): assumes utils.get_reference returns the reference
        # text, as its use in aws.run suggests -- confirm.
        reference = utils.get_reference(file, "en")
        result = utils.compare_transcripts(reference, transcription["text"])
        result["language"] = transcription["language"]
        result["file"] = os.path.basename(file)
        result["runtime"] = runtime
        logging.info(f"result: {result}")
        results.append(result)

    csv_filename = os.path.join(output_dir, "report-google.csv")
    utils.write_report(results, csv_filename)


def transcribe(media_file):
    """Transcribe *media_file* with Google Cloud Speech-to-Text.

    The media file is converted to a single channel wav file, uploaded to
    Google Cloud Storage and submitted as a long running recognition job
    (waiting up to two hours for the result).

    Returns a dict with two keys:
      "language": the most common language code among the results, or the
                  requested language code when the job returned no results
      "text":     the transcripts of all results joined together
    """
    language_code = "en-US"

    # convert the media file to single channel wav and upload to google cloud
    wav_file = convert_to_wav(media_file)
    try:
        blob_uri = copy_file(wav_file)
        audio = speech.RecognitionAudio(uri=blob_uri)

        # send the transcription job to google
        logging.info(f"starting speech-to-text job for {wav_file}")
        config = speech.RecognitionConfig(
            language_code=language_code, model="latest_long"
        )
        client = speech.SpeechClient()
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=60 * 60 * 2)
    finally:
        # remove the temporary wav file, also when the upload or the job fails
        try:
            os.remove(wav_file)
        except FileNotFoundError:
            pass

    results = list(response.results)

    # join together all the text chunks in results, skipping any result that
    # came back without alternatives
    text = "".join(
        result.alternatives[0].transcript for result in results if result.alternatives
    )

    # get the most common language; there is nothing to count when the job
    # returned no results (e.g. silent audio), so fall back to what was asked
    languages = Counter(result.language_code for result in results)
    language = languages.most_common(1)[0][0] if languages else language_code

    return {
        "language": language,
        "text": text,
    }


def copy_file(media_file):
    """Upload *media_file* to Google Cloud Storage and return its ``gs://`` URI.

    The bucket name is read from the GOOGLE_TRANSCRIBE_GCS_BUCKET environment
    variable; the bucket is created when it is not found. The blob is named
    after the base name of *media_file*.

    Raises:
        ValueError: if GOOGLE_TRANSCRIBE_GCS_BUCKET is unset or empty.
    """
    bucket_name = os.environ.get("GOOGLE_TRANSCRIBE_GCS_BUCKET")
    if not bucket_name:
        # Fail early with a clear message instead of handing None to the
        # storage client.
        raise ValueError(
            "GOOGLE_TRANSCRIBE_GCS_BUCKET must be set to the name of the "
            "Google Cloud Storage bucket to upload to"
        )

    logging.info(f"copying {media_file} to google storage bucket {bucket_name}")
    storage_client = storage.Client()

    try:
        bucket = storage_client.get_bucket(bucket_name)
    except NotFound:
        bucket = storage_client.create_bucket(bucket_name)

    filename = os.path.basename(media_file)
    blob = bucket.blob(filename)
    blob.upload_from_filename(media_file)

    return f"gs://{bucket_name}/{filename}"


def convert_to_wav(media_file):
    """Convert *media_file* to a single channel wav file using ffmpeg.

    The wav file is written to the system temp directory and is named after
    *media_file* with its extension replaced by ``.wav``. An existing file at
    that path is overwritten (ffmpeg ``-y``).

    Returns:
        The path of the wav file.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    stem, _ = os.path.splitext(os.path.basename(media_file))
    wav_file = os.path.join(tempfile.gettempdir(), f"{stem}.wav")

    logging.info(f"ffmpeg converting {media_file} to {wav_file}")
    # "-loglevel panic" hides ffmpeg's own error output, so the exit status is
    # the only failure signal: check it rather than returning a path to a file
    # that was never written.
    subprocess.run(
        ["ffmpeg", "-y", "-loglevel", "panic", "-i", media_file, "-ac", "1", wav_file],
        check=True,
    )
    return wav_file
7 changes: 4 additions & 3 deletions transcribe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,13 @@


def get_files(bags_dir):
# TODO: simplify this
folder = f"{bags_dir}/*"
files_with_transcript = []
folders = glob.glob(folder + os.path.sep)
files = [glob.glob("{}data/content/*_sl*.m*".format(folder)) for folder in folders]
for folder in files:
for file in folder:
for file in sorted(folder):
if (
len(
glob.glob(f"{file.rsplit('.', 1)[0].replace('_sl', '')}*script.txt")
Expand All @@ -37,7 +38,7 @@ def get_files(bags_dir):
):
files_with_transcript.append(file)
break
return files_with_transcript
return list(sorted(files_with_transcript))


def get_reference_file(file, language):
Expand All @@ -46,7 +47,7 @@ def get_reference_file(file, language):
)
find_file = list(filter(lambda x: "_{}".format(language) in x, reference_files))
reference_file = find_file if len(find_file) > 0 else reference_files
return reference_file[0]
return list(sorted(reference_file))[0]


def get_reference(file, language):
Expand Down

0 comments on commit 4254b07

Please sign in to comment.