So this adds in a VadSession::forward #16

Merged: 3 commits, Aug 27, 2024
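
The core change makes the model invocation available as a public `VadSession::forward`, which runs one audio frame through the ONNX network and returns the raw output tensor while updating the internal `h`/`c` state. A minimal caller-side sketch (not part of this diff; the constructor, error type, and frame size are assumptions, and the crate import path is omitted):

```rust
// Sketch only: VadSession::new()/VadConfig::default() and anyhow::Result are
// assumed from the surrounding crate; the frame below is illustrative silence.
fn speech_probability(session: &mut VadSession) -> anyhow::Result<f32> {
    // Roughly 30 ms of 16 kHz mono audio (480 samples), per the "30-96ms"
    // guidance in the doc comment in the diff below.
    let frame: Vec<f32> = vec![0.0; 480];

    // forward() runs the ONNX model on the frame and hands back the raw
    // "output" tensor, updating the internal h/c tensors as a side effect.
    let output = session.forward(frame)?;

    // Pull out the single speech probability, mirroring what process_internal()
    // does with the returned value later in this PR.
    let prob = *output.try_extract_tensor::<f32>()?.first().unwrap();
    Ok(prob)
}
```
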
14 changes: 13 additions & 1 deletion scripts/plot_audio.py
@@ -48,8 +48,13 @@ def rust_duration_to_seconds(obj):
vad = data["summary"][args.audio]
silence_samples = vad["current_silence_samples"]
speech_samples = vad["current_speech_samples"]
likelihoods = vad["likelihoods"]
redemption_time = rust_duration_to_seconds(data["config"]["redemption_time"])
pre_speech_pad = rust_duration_to_seconds(data["config"]["pre_speech_pad"])

positive_thresh = float(data["config"]["positive_speech_threshold"]) * 100
negative_thresh = float(data["config"]["negative_speech_threshold"]) * 100

print(f"redemption time: {redemption_time}")
for segment in vad["transitions"]:
if "SpeechStart" in segment:
@@ -84,14 +89,17 @@ def rust_duration_to_seconds(obj):

times = np.linspace(0, n_samples/sample_freq, num=n_samples)

fig, (ax, ax2) = plt.subplots(2)
fig, (ax, ax2, ax3) = plt.subplots(3)

ax.plot(times, signal_array)

ax.set(xlabel="Time (s)", ylabel="Signal", title="Audio")
ax2.set(title = "Buffer Sizes")
ax3.set(title = "Network likelihoods")

ax2.plot(silence_samples, label = "Current silence samples")
ax2.plot(speech_samples, label = "Current speech samples")
ax3.plot(likelihoods, label = "network likelihoods")
labeled_start = False
labeled_end = False
for (i, (start, end)) in enumerate(speech_segments):
@@ -121,6 +129,10 @@ def rust_duration_to_seconds(obj):
ax2.axhline(y=redemption_time_samples, color = 'r', linestyle = 'dashed', label = "redemption_time")
ax2.legend()

ax3.axhline(y=positive_thresh, color = 'g', linestyle = 'dashed', label = "positive threshold")
ax3.axhline(y=negative_thresh, color = 'r', linestyle = 'dashed', label = "negative threshold")
ax3.legend()

fill_regions = [False] * len(signal_array)

for i in range(len(signal_array)):
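
The script now also reads `likelihoods` from the per-file summary and the two speech thresholds from the dumped config. For reference, a deserialization-side sketch of the JSON shape those lookups imply; the field names come straight from the script's key accesses, but the crate's actual dump structs are not shown in this diff, so the types and the `Duration` encoding are assumptions:

```rust
// Sketch only: schema inferred from the keys plot_audio.py reads, not from the
// crate's real dump structs.
use std::collections::HashMap;

use serde::Deserialize;

#[derive(Deserialize)]
struct Dump {
    config: Config,
    /// Keyed by audio file name, matching data["summary"][args.audio].
    summary: HashMap<String, Summary>,
}

#[derive(Deserialize)]
struct Config {
    /// Serialized Rust Durations; the script converts them via rust_duration_to_seconds().
    redemption_time: serde_json::Value,
    pre_speech_pad: serde_json::Value,
    positive_speech_threshold: f32,
    negative_speech_threshold: f32,
}

#[derive(Deserialize)]
struct Summary {
    current_silence_samples: Vec<usize>,
    current_speech_samples: Vec<usize>,
    /// Per-frame network outputs plotted on the new third subplot.
    likelihoods: Vec<f32>,
    /// SpeechStart/SpeechEnd entries; left untyped here.
    transitions: Vec<serde_json::Value>,
}
```
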
52 changes: 27 additions & 25 deletions src/lib.rs
@@ -154,16 +154,10 @@ impl VadSession {
Ok(transitions)
}

/// Advance the VAD state machine with an audio frame. Keep between 30-96ms in length.
/// Return indicates if a transition from speech to silence (or silence to speech) occurred.
///
/// Important: don't implement your own endpointing logic.
/// Instead, when a `SpeechEnd` is returned, you can use the `get_current_speech()` method to retrieve the audio.
fn process_internal(&mut self, range: Range<usize>) -> Result<Option<VadTransition>> {
let audio_frame = &self.session_audio[range];
let samples = audio_frame.len();
let audio_tensor = Array2::from_shape_vec((1, samples), audio_frame.to_vec())?;
let result = self.model.run(ort::inputs![
pub fn forward(&mut self, input: Vec<f32>) -> Result<ort::Value> {
let samples = input.len();
let audio_tensor = Array2::from_shape_vec((1, samples), input)?;
let mut result = self.model.run(ort::inputs![
audio_tensor.view(),
self.sample_rate_tensor.view(),
self.h_tensor.view(),
@@ -174,27 +168,35 @@ impl VadSession {
self.h_tensor = result
.get("hn")
.unwrap()
.try_extract_tensor::<f32>()
.unwrap()
.try_extract_tensor::<f32>()?
.to_owned()
.into_shape((2, 1, 64))
.expect("Shape mismatch for h_tensor");
.context("Shape mismatch for h_tensor")?;

self.c_tensor = result
.get("cn")
.unwrap()
.try_extract_tensor::<f32>()
.unwrap()
.try_extract_tensor::<f32>()?
.to_owned()
.into_shape((2, 1, 64))
.expect("Shape mismatch for h_tensor");
.context("Shape mismatch for h_tensor")?;

let prob = *result
.get("output")
.unwrap()
.try_extract_tensor::<f32>()
.unwrap()
.first()
.unwrap();
let prob_tensor = result.remove("output").unwrap();
Ok(prob_tensor)
}

/// Advance the VAD state machine with an audio frame. Keep between 30-96ms in length.
/// Return indicates if a transition from speech to silence (or silence to speech) occurred.
///
/// Important: don't implement your own endpointing logic.
/// Instead, when a `SpeechEnd` is returned, you can use the `get_current_speech()` method to retrieve the audio.
fn process_internal(&mut self, range: Range<usize>) -> Result<Option<VadTransition>> {
let audio_frame = self.session_audio[range].to_vec();
let samples = audio_frame.len();

let result = self.forward(audio_frame)?;

let prob = *result.try_extract_tensor::<f32>().unwrap().first().unwrap();

let mut vad_change = None;

@@ -243,7 +245,7 @@ impl VadSession {
} else {
if current_silence > self.config.redemption_time {
if *redemption_passed {
let speech_end = (self.processed_samples + audio_frame.len()
let speech_end = (self.processed_samples + samples
- self.silent_samples)
/ (self.config.sample_rate / 1000);
vad_change = Some(VadTransition::SpeechEnd {
@@ -258,7 +260,7 @@ impl VadSession {
}
};

self.processed_samples += audio_frame.len();
self.processed_samples += samples;

Ok(vad_change)
}
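
The doc comment carried over in the diff above still tells callers not to roll their own endpointing and to pull audio via `get_current_speech()` once a `SpeechEnd` transition comes back. A caller-side sketch of that pattern (the public `process()` entry point and its exact signature are assumed; only the `SpeechEnd`/`get_current_speech()` pairing is taken from the doc comment):

```rust
// Sketch only: process() is assumed to be the public wrapper that drives
// process_internal() and returns the VadTransitions it produced.
fn handle_chunk(session: &mut VadSession, chunk: &[f32]) -> anyhow::Result<()> {
    for transition in session.process(chunk)? {
        // Per the doc comment: when a SpeechEnd arrives, fetch the buffered
        // utterance instead of implementing custom endpointing.
        if let VadTransition::SpeechEnd { .. } = transition {
            let speech = session.get_current_speech();
            println!("utterance captured: {} samples", speech.len());
        }
    }
    Ok(())
}
```
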