-
Notifications
You must be signed in to change notification settings - Fork 61
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds the ability to fetch transcripts from YouTube
- Loading branch information
1 parent
6681fb9
commit 126ca2a
Showing
18 changed files
with
27,329 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
require "message_pb" | ||
|
||
module Youtube | ||
class Transcript | ||
attr_reader :response | ||
|
||
def get_vtt(video_id) | ||
message = {one: "asr", two: "en"} | ||
typedef = MessageType | ||
two = get_base64_protobuf(message, typedef) | ||
|
||
message = {one: video_id, two: two} | ||
params = get_base64_protobuf(message, typedef) | ||
|
||
url = "https://www.youtube.com/youtubei/v1/get_transcript" | ||
headers = {"Content-Type" => "application/json"} | ||
body = { | ||
context: { | ||
client: { | ||
clientName: "WEB", | ||
clientVersion: "2.20240313" | ||
} | ||
}, | ||
params: params | ||
} | ||
|
||
@response = HTTParty.post(url, headers: headers, body: body.to_json) | ||
convert_to_vtt(JSON.parse(response.body)) | ||
end | ||
|
||
def self.get_vtt(video_id) | ||
new.get_vtt(video_id) | ||
end | ||
|
||
private | ||
|
||
def encode_message(message, typedef) | ||
encoded_message = typedef.new(message) | ||
encoded_message.to_proto | ||
end | ||
|
||
def get_base64_protobuf(message, typedef) | ||
encoded_data = encode_message(message, typedef) | ||
Base64.encode64(encoded_data).delete("\n") | ||
end | ||
|
||
def convert_to_vtt(transcript) | ||
vtt_content = "WEBVTT\n\n" | ||
events = transcript.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer", "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer", "initialSegments") | ||
if events | ||
events.each_with_index do |event, index| | ||
segment = event["transcriptSegmentRenderer"] | ||
start_time = format_time(segment["startMs"].to_i) | ||
end_time = format_time(segment["endMs"].to_i) | ||
text = segment.dig("snippet", "runs")&.map { |run| run["text"] }&.join || "" | ||
vtt_content += "#{index + 1}\n" | ||
vtt_content += "#{start_time} --> #{end_time}\n" | ||
vtt_content += "#{text}\n\n" | ||
end | ||
else | ||
vtt_content += "NOTE No transcript data available\n" | ||
end | ||
vtt_content | ||
end | ||
|
||
def format_time(ms) | ||
hours = ms / (1000 * 60 * 60) | ||
minutes = (ms % (1000 * 60 * 60)) / (1000 * 60) | ||
seconds = (ms % (1000 * 60)) / 1000 | ||
milliseconds = ms % 1000 | ||
format("%02d:%02d:%02d.%03d", hours, minutes, seconds, milliseconds) | ||
end | ||
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
require "webvtt" | ||
|
||
# Converts between a WebVTT string and an array of cue hashes of the form
# {start_time:, end_time:, text:}. Exposes the +dump+/+load+ pair.
class WebVTTSerializer
  TIMING_SEPARATOR = " --> "
  private_constant :TIMING_SEPARATOR

  # Serializes cues to a WebVTT string.
  #
  # @param transcript [Array<Hash>, String, nil] cue hashes, or a raw VTT
  #   string (which is parsed first and re-emitted without cue identifiers)
  # @return [String] WebVTT document, or "" when +transcript+ is blank
  def self.dump(transcript)
    return "" if blank_transcript?(transcript)

    # If transcript is a raw VTT string, convert it to cues first
    cues = transcript.is_a?(String) ? load(transcript) : transcript

    body = cues.map do |cue|
      "#{cue[:start_time]}#{TIMING_SEPARATOR}#{cue[:end_time]}\n#{cue[:text]}\n\n"
    end
    "WEBVTT\n\n#{body.join}".strip
  end

  # Parses a WebVTT string into cue hashes.
  #
  # Each blank-line-separated block is searched for its timing line (the line
  # containing " --> "). Lines before it, such as a cue identifier, are ignored;
  # lines after it become the cue text. Blocks without a timing line (the
  # "WEBVTT" header, NOTE blocks) and cues without any text line are skipped.
  #
  # @param transcript [String, nil] raw VTT content
  # @return [Array<Hash>] cues as {start_time:, end_time:, text:}
  def self.load(transcript)
    return [] if blank_transcript?(transcript)

    # Normalize CRLF and treat any run of blank lines as one block boundary.
    transcript.gsub("\r\n", "\n").split(/\n{2,}/).filter_map do |block|
      lines = block.split("\n")
      timing_index = lines.index { |line| line.include?(TIMING_SEPARATOR) }
      next unless timing_index

      text_lines = lines[(timing_index + 1)..]
      next if text_lines.empty?

      start_time, end_time = lines[timing_index].split(TIMING_SEPARATOR)
      {start_time: start_time, end_time: end_time, text: text_lines.join("\n")}
    end
  end

  # True for nil/false, whitespace-only strings and empty collections, so the
  # class does not depend on ActiveSupport's Object#blank? being loaded.
  def self.blank_transcript?(value)
    return true unless value
    return value.match?(/\A[[:space:]]*\z/) if value.is_a?(String)

    value.respond_to?(:empty?) ? value.empty? : false
  end
  private_class_method :blank_transcript?
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
# Meilisearch client settings.
# The API key is read from the MEILI_MASTER_KEY environment variable.
# NOTE(review): the non-local URL is a hard-coded IP over plain HTTP - confirm this is intended.
# NOTE(review): per_environment presumably keeps indexes separate per Rails environment -
# confirm against the meilisearch-rails documentation.
MeiliSearch::Rails.configuration = {
  meilisearch_url: Rails.env.local? ? "http://localhost:7700" : "http://91.107.208.207:7700", # example: http://localhost:7700
  meilisearch_api_key: ENV["MEILI_MASTER_KEY"],
  per_environment: true
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Adds a non-null text column `transcript` to `talks`, defaulting to "".
class AddTranscriptToTalk < ActiveRecord::Migration[7.1]
  def change
    change_table :talks do |t|
      t.column :transcript, :text, default: "", null: false
    end
  end
end
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.