Skip to content

Commit

Permalink
CyberBuddy - Backport management of MP3, WAV and WEBM files
Browse files Browse the repository at this point in the history
  • Loading branch information
csavelief committed Nov 13, 2024
1 parent c554de9 commit 573ebaf
Show file tree
Hide file tree
Showing 7 changed files with 397 additions and 292 deletions.
8 changes: 8 additions & 0 deletions app/Modules/CyberBuddy/Helpers/ApiUtils.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,14 @@

class ApiUtils
{
/**
 * Post a URL and a language to the `/api/whisper` endpoint.
 *
 * @param string $url  Forwarded as the `url` field of the request payload
 *                     (presumably the location of an audio file — confirm against the API).
 * @param string $lang Forwarded as the `lang` field; defaults to 'fr'
 *                     (presumably a language code — confirm against the API).
 *
 * @return array The value returned by `post()` for this endpoint.
 *               NOTE(review): the response shape is not visible here — verify the
 *               keys callers rely on against the API contract.
 */
public function whisper(string $url, string $lang = 'fr'): array
{
    return $this->post('/api/whisper', [
        'url' => $url,
        'lang' => $lang,
    ]);
}

public function file_input(string $client, string $url): array
{
return $this->post('/api/file-input', [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public function rules()
'files.*' => [
'required',
'file',
'mimes:pdf,doc,docx,txt',
'mimes:pdf,doc,docx,txt,mp3,wav,webm',
'max:10240',
],
];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public function rules()
'file' => [
'required',
'file',
'mimes:pdf,doc,docx,txt',
'mimes:pdf,doc,docx,txt,mp3,wav,webm',
'max:10240',
],
];
Expand Down
152 changes: 119 additions & 33 deletions app/Modules/CyberBuddy/Listeners/IngestFileListener.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
use App\Modules\CyberBuddy\Rules\IsValidCollectionName;
use Illuminate\Support\Facades\Auth;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Str;
use Symfony\Component\Process\Process;

class IngestFileListener extends AbstractListener
{
Expand Down Expand Up @@ -40,48 +42,132 @@ protected function handle2($event)
$file = File::find($event->fileId);

if (!$file) {
Log::error("Invalid file id : {$event->fileId}");
} else {
throw new \Exception("Invalid file id : {$event->fileId}");
}

// webm to mp3
if ($file->mime_type === 'audio/webm') {

$webmFileContent = file_get_contents($file->downloadUrl());

if ($webmFileContent === false) {
throw new \Exception("Failed to download webm file : {$file->downloadUrl()}");
}

// Download webm file
$webmFilePath = "/tmp/{$file->name_normalized}.{$file->extension}";
file_put_contents($webmFilePath, $webmFileContent);

$response = ApiUtils::file_input($event->user->client(), $file->downloadUrl());
// webm to mp3
$mp3FilePath = "/tmp/{$file->name_normalized}.mp3";
$process = Process::fromShellCommandline("ffmpeg -i " . escapeshellarg($webmFilePath) . " " . escapeshellarg($mp3FilePath));
$process->run();

// Cleanup
unlink($webmFilePath);

if (!$process->isSuccessful() || !file_exists($mp3FilePath)) {
throw new \Exception("Failed to convert webm to mp3 : {$file->downloadUrl()}");
}

// Upload file to S3
$collection = $file->collection()->where('is_deleted', false)->first();

if (!$collection) {
throw new \Exception("Unknown file collection : {$file->downloadUrl()}");
}
if (!Storage::disk('files-s3')->putFileAs($this->storageFilePath($collection), new \Illuminate\Http\File($mp3FilePath), $this->storageFileName($file, 'mp3'))) {
throw new \Exception("Failed to upload mp3 file : {$mp3FilePath}");
}

// Replace the webm reference by the mp3 one
$file->extension = 'mp3';
$file->mime_type = 'audio/mpeg';
$file->save();

// Cleanup
unlink($mp3FilePath);
}

// Speech-to-text
if ($file->mime_type === 'audio/mpeg' || $file->mime_type === 'audio/wav') {

$response = ApiUtils::whisper($file->downloadUrl());

if ($response['error']) {
Log::error($response['error_details']);
throw new \Exception($response['error_details']);
}

// Write text to disk
$txtFilePath = "/tmp/{$file->name_normalized}.txt";
file_put_contents($txtFilePath, $response['text']);

// Move file to storage
$collection = $file->collection()->where('is_deleted', false)->first();

if (!$collection) {
throw new \Exception("Unknown file collection : {$file->downloadUrl()}");
}
if (!Storage::disk('files-s3')->putFileAs($this->storageFilePath($collection), new \Illuminate\Http\File($txtFilePath), $this->storageFileName($file, 'txt'))) {
throw new \Exception("Failed to upload text file : {$txtFilePath}");
}

// Replace the mp3 reference by the txt one
$file->extension = 'txt';
$file->mime_type = 'text/plain';
$file->save();

// Cleanup
unlink($txtFilePath);
}

$response = ApiUtils::file_input($event->user->client(), $file->downloadUrl());

if ($response['error']) {
throw new \Exception($response['error_details']);
}

$fragments = $response['response'];

foreach ($fragments as $fragment) {

$tags = explode('>', $fragment['metadata']['title']);
$page = $fragment['metadata']['page_idx'] + 1;

if ($fragment['metadata']['tag'] === 'list') {
$text = trim($fragment['metadata']['prevPara']['text']) . "\n" . trim($fragment['text']);
} else {
$text = trim($fragment['text']);
}

/** @var Chunk $chunk */
$chunk = $collection->chunks()->create([
'file_id' => $file->id,
'url' => $file->downloadUrl(),
'page' => $page,
'text' => $text,
]);

$fragments = $response['response'];

foreach ($fragments as $fragment) {

$tags = explode('>', $fragment['metadata']['title']);
$page = $fragment['metadata']['page_idx'] + 1;

if ($fragment['metadata']['tag'] === 'list') {
$text = trim($fragment['metadata']['prevPara']['text']) . "\n" . trim($fragment['text']);
} else {
$text = trim($fragment['text']);
}

/** @var Chunk $chunk */
$chunk = $collection->chunks()->create([
'file_id' => $file->id,
'url' => $file->downloadUrl(),
'page' => $page,
'text' => $text,
]);

foreach ($tags as $tag) {
$chunk->tags()->create(['tag' => Str::lower($tag)]);
}
}
if (!Chunk::where('file_id', $file->id)->exists()) { // no chunks -> no embeddings -> processing is complete
$file->is_embedded = true;
$file->save();
}
foreach ($tags as $tag) {
$chunk->tags()->create(['tag' => Str::lower($tag)]);
}
}
if (!Chunk::where('file_id', $file->id)->exists()) { // no chunks -> no embeddings -> processing is complete
$file->is_embedded = true;
$file->save();
}
} catch (\Exception $exception) {
Log::error($exception->getMessage());
}
}

/**
 * Build the name under which a file is stored.
 *
 * The result has the form `<id>_<name_normalized>.<extension>`.
 *
 * @param File   $file      Model whose `id` and `name_normalized` attributes are used.
 * @param string $extension Extension appended after the dot (no dot is added to or stripped from it).
 *
 * @return string The assembled file name.
 */
private function storageFileName(File $file, string $extension): string
{
    $baseName = $file->id . '_' . $file->name_normalized;

    return $baseName . '.' . $extension;
}

/**
 * Build the storage directory used for a collection.
 *
 * The result is `/cyber-buddy/` followed by the collection's `id`.
 *
 * @param \App\Modules\CyberBuddy\Models\Collection $collection Collection whose `id` names the directory.
 *
 * @return string The directory path, with a leading slash and no trailing slash.
 */
private function storageFilePath(\App\Modules\CyberBuddy\Models\Collection $collection): string
{
    return '/cyber-buddy/' . $collection->id;
}
}
Loading

0 comments on commit 573ebaf

Please sign in to comment.