Skip to content

Commit

Permalink
feat add youtube-dl loader (#100)
Browse files Browse the repository at this point in the history
* chore: add youtube-dl loader

* chore: remove comments

* fix
  • Loading branch information
henomis authored Aug 4, 2023
1 parent f501cf2 commit d6a9ba5
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 0 deletions.
37 changes: 37 additions & 0 deletions examples/loader/youtube-dl/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package main

import (
"context"
"fmt"

"github.com/henomis/lingoose/llm/openai"
"github.com/henomis/lingoose/loader"
)

// main loads the subtitles of a YouTube video through the youtube-dl loader,
// prints the resulting transcription, and then asks an OpenAI completion
// model to summarize it. Any failure aborts the example with a panic.
func main() {
	ctx := context.Background()

	// The youtube-dl binary location is overridden for a Homebrew install.
	videoLoader := loader.
		NewYoutubeDLLoader("https://www.youtube.com/watch?v=--khbXchTeE").
		WithYoutubeDLPath("/opt/homebrew/bin/youtube-dl")

	documents, err := videoLoader.Load(ctx)
	if err != nil {
		panic(err)
	}

	fmt.Println("Transcription:")
	fmt.Println(documents[0].Content)

	completion := openai.NewCompletion()
	prompt := fmt.Sprintf("Summarize the following text:\n\nTranscription:\n%s", documents[0].Content)

	summary, err := completion.Completion(ctx, prompt)
	if err != nil {
		panic(err)
	}

	fmt.Println("Summary:")
	fmt.Println(summary)
}
159 changes: 159 additions & 0 deletions loader/youtube-dl.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package loader

import (
"bufio"
"context"
"fmt"
"os"
"os/exec"
"regexp"
"strings"

"github.com/henomis/lingoose/document"
"github.com/henomis/lingoose/types"
)

// ErrYoutubeDLNotFound is the error Load returns when the configured
// youtube-dl path does not pass the file check.
var ErrYoutubeDLNotFound = fmt.Errorf("youtube-dl not found")

// Defaults applied by NewYoutubeDLLoader until overridden by the With*
// methods.
var (
	// defaultYoutubeDLPath is the location of the youtube-dl executable.
	defaultYoutubeDLPath = "/usr/bin/youtube-dl"

	// defaultYoutubeDLSubtitleLanguage is the value given to --sub-lang.
	defaultYoutubeDLSubtitleLanguage = "en"

	// defaultYoutubeDLSubtitleMode is the youtube-dl flag selecting which
	// kind of subtitles to write.
	defaultYoutubeDLSubtitleMode = "--write-sub"
)

// YoutubeDLLoader loads the subtitles of a video as documents by invoking an
// external youtube-dl executable.
type YoutubeDLLoader struct {
	// loader carries the optional text splitter applied by Load.
	loader Loader

	// youtubeDlPath is the filesystem path of the youtube-dl executable.
	youtubeDlPath string
	// path is the video URL handed to youtube-dl; it is also recorded as the
	// "source" metadata of the produced document.
	path string
	// language is the subtitle language passed to --sub-lang and expected in
	// the name of the subtitle file youtube-dl writes.
	language string
	// subtitlesMode is the youtube-dl flag that selects which subtitles are
	// written ("--write-sub" or "--write-auto-sub").
	subtitlesMode string
}

// NewYoutubeDLLoader returns a loader for the video at url, configured with
// the default youtube-dl path, subtitle language and subtitle mode. Use the
// With* methods to override any of them.
func NewYoutubeDLLoader(url string) *YoutubeDLLoader {
	y := new(YoutubeDLLoader)
	y.youtubeDlPath = defaultYoutubeDLPath
	y.path = url
	y.language = defaultYoutubeDLSubtitleLanguage
	y.subtitlesMode = defaultYoutubeDLSubtitleMode
	return y
}

// WithYoutubeDLPath sets the path of the youtube-dl executable that Load
// runs, replacing the default. It returns the receiver so calls can be
// chained.
func (y *YoutubeDLLoader) WithYoutubeDLPath(youtubeDLPath string) *YoutubeDLLoader {
	y.youtubeDlPath = youtubeDLPath
	return y
}

// WithTextSplitter sets the text splitter that Load applies to the loaded
// documents before returning them. It returns the receiver so calls can be
// chained.
func (y *YoutubeDLLoader) WithTextSplitter(textSplitter TextSplitter) *YoutubeDLLoader {
	y.loader.textSplitter = textSplitter
	return y
}

// WithLanguage sets the subtitle language requested from youtube-dl through
// --sub-lang, replacing the default. It returns the receiver so calls can be
// chained.
func (y *YoutubeDLLoader) WithLanguage(language string) *YoutubeDLLoader {
	y.language = language
	return y
}

// WithAutoSubtitlesMode makes the loader invoke youtube-dl with
// "--write-auto-sub" instead of the default "--write-sub". It returns the
// receiver so calls can be chained.
func (y *YoutubeDLLoader) WithAutoSubtitlesMode() *YoutubeDLLoader {
	y.subtitlesMode = "--write-auto-sub"
	return y
}

// Load fetches the video subtitles and returns them as documents.
//
// It returns ErrYoutubeDLNotFound when the configured youtube-dl path fails
// the file check, and otherwise propagates any error from the subtitle
// download. When a text splitter has been configured, the loaded documents
// are passed through it before being returned.
func (y *YoutubeDLLoader) Load(ctx context.Context) ([]document.Document, error) {
	if isFile(y.youtubeDlPath) != nil {
		return nil, ErrYoutubeDLNotFound
	}

	documents, err := y.loadVideo(ctx)
	if err != nil {
		return nil, err
	}

	if splitter := y.loader.textSplitter; splitter != nil {
		return splitter.SplitDocuments(documents), nil
	}

	return documents, nil
}

// loadVideo runs youtube-dl to write the subtitles of y.path into a
// temporary directory, converts the resulting VTT file to plain text, and
// returns it as a single document whose "source" metadata is y.path.
//
// The temporary directory is removed before returning. youtube-dl's stdout
// and stderr are forwarded to the current process. Errors are wrapped with
// the step that failed; the underlying error stays reachable through
// errors.Is / errors.As.
func (y *YoutubeDLLoader) loadVideo(ctx context.Context) ([]document.Document, error) {
	tempDir, err := os.MkdirTemp("", "youtube-dl")
	if err != nil {
		return nil, fmt.Errorf("creating temporary directory: %w", err)
	}
	defer os.RemoveAll(tempDir)

	args := []string{
		y.subtitlesMode,
		"--sub-lang", y.language,
		"--skip-download",
		"-o", fmt.Sprintf("%s/subtitles", tempDir),
		// End-of-options marker: without it a URL or video ID that starts
		// with "-" would be parsed by youtube-dl as a command-line option.
		"--",
		y.path,
	}

	cmd := exec.CommandContext(ctx, y.youtubeDlPath, args...)
	cmd.Stderr = os.Stderr
	cmd.Stdout = os.Stdout

	if err = cmd.Run(); err != nil {
		return nil, fmt.Errorf("running %s: %w", y.youtubeDlPath, err)
	}

	// youtube-dl names the subtitle file <output>.<language>.vtt. The file
	// may be missing even after a successful run (for example when the video
	// has no subtitles in the requested language), so say which subtitles
	// were expected instead of surfacing a bare "no such file" error.
	subtitlesFile := fmt.Sprintf("%s/subtitles.%s.vtt", tempDir, y.language)
	plainText, err := convertVTTtoPlainText(subtitlesFile)
	if err != nil {
		return nil, fmt.Errorf("reading %q subtitles of %s: %w", y.language, y.path, err)
	}

	return []document.Document{
		{
			Content: plainText,
			Metadata: types.Meta{
				"source": y.path,
			},
		},
	}, nil
}

// vttCueTagRegex matches any markup enclosed in angle brackets inside WebVTT
// cue text: class/voice/formatting tags such as <c.color>, </c>, <i>, and
// inline timestamps such as <00:00:00.880>. It is compiled once at package
// scope rather than once per subtitle line.
var vttCueTagRegex = regexp.MustCompile(`<[^>]*>`)

// convertVTTtoPlainText reads the WebVTT file at filename and returns only
// its spoken text, one non-empty line per output line, each terminated by
// "\n".
//
// The following are removed:
//   - the header block (the "WEBVTT" signature line and every line up to the
//     first blank line, e.g. "Kind:" / "Language:" metadata);
//   - cue timing lines, including any cue settings that follow the timestamps
//     (e.g. "align:start position:0%");
//   - all angle-bracket tags, including inline timestamps;
//   - "&nbsp;" entities and surrounding whitespace.
//
// It returns an error if the file cannot be opened or read.
func convertVTTtoPlainText(filename string) (string, error) {
	file, err := os.Open(filename)
	if err != nil {
		return "", err
	}
	defer file.Close()

	var plainText strings.Builder
	inHeader := false

	scanner := bufio.NewScanner(file)
	for lineNumber := 0; scanner.Scan(); lineNumber++ {
		line := scanner.Text()

		if lineNumber == 0 {
			// Tolerate a UTF-8 byte order mark in front of the signature.
			line = strings.TrimPrefix(line, "\ufeff")
			if strings.HasPrefix(line, "WEBVTT") {
				inHeader = true
				continue
			}
		}

		if inHeader {
			// The header ends at the first blank line.
			inHeader = strings.TrimSpace(line) != ""
			continue
		}

		// "-->" may not appear in cue text, so any line containing it is a
		// timing line; drop it whole so trailing cue settings go with it.
		if strings.Contains(line, "-->") {
			continue
		}

		line = vttCueTagRegex.ReplaceAllString(line, "")
		line = strings.ReplaceAll(line, "&nbsp;", "")
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		plainText.WriteString(line)
		plainText.WriteString("\n")
	}

	if err := scanner.Err(); err != nil {
		return "", err
	}

	return plainText.String(), nil
}

0 comments on commit d6a9ba5

Please sign in to comment.