Skip to content

Commit 14bf689

Browse files
authored
Fix pyannote processor post_process_speaker_diarization (#1082)
1 parent 9914e7a commit 14bf689

File tree

2 files changed

+63
-54
lines changed

2 files changed

+63
-54
lines changed

src/models/pyannote/feature_extraction_pyannote.js

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
22
import { Tensor } from '../../utils/tensor.js';
3+
import { max, softmax } from '../../utils/maths.js';
34

45

56
export class PyAnnoteFeatureExtractor extends FeatureExtractor {
@@ -25,4 +26,59 @@ export class PyAnnoteFeatureExtractor extends FeatureExtractor {
2526
};
2627
}
2728

29+
/**
30+
* NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
31+
* @param {number} samples The number of samples in the audio.
32+
* @returns {number} The number of frames in the audio.
33+
*/
34+
samples_to_frames(samples) {
35+
return ((samples - this.config.offset) / this.config.step);
36+
}
37+
38+
/**
39+
* Post-processes the speaker diarization logits output by the model.
40+
* @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
41+
* @param {number} num_samples Number of samples in the input audio.
42+
* @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
43+
*/
44+
post_process_speaker_diarization(logits, num_samples) {
45+
const ratio = (
46+
num_samples / this.samples_to_frames(num_samples)
47+
) / this.config.sampling_rate;
48+
49+
const results = [];
50+
for (const scores of logits.tolist()) {
51+
const accumulated_segments = [];
52+
53+
let current_speaker = -1;
54+
for (let i = 0; i < scores.length; ++i) {
55+
const probabilities = softmax(scores[i]);
56+
const [score, id] = max(probabilities);
57+
const [start, end] = [i, i + 1];
58+
59+
if (id !== current_speaker) {
60+
// Speaker has changed
61+
current_speaker = id;
62+
accumulated_segments.push({ id, start, end, score });
63+
} else {
64+
// Continue the current segment
65+
accumulated_segments.at(-1).end = end;
66+
accumulated_segments.at(-1).score += score;
67+
}
68+
}
69+
70+
results.push(accumulated_segments.map(
71+
// Convert frame-space to time-space
72+
// and compute the confidence
73+
({ id, start, end, score }) => ({
74+
id,
75+
start: start * ratio,
76+
end: end * ratio,
77+
confidence: score / (end - start),
78+
})
79+
));
80+
}
81+
return results;
82+
}
83+
2884
}
Lines changed: 7 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
import { Processor } from '../../base/processing_utils.js';
2-
import { AutoFeatureExtractor } from '../auto/feature_extraction_auto.js';
3-
import { max, softmax } from '../../utils/maths.js';
2+
import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js';
43

54
export class PyAnnoteProcessor extends Processor {
6-
static feature_extractor_class = AutoFeatureExtractor
5+
static feature_extractor_class = PyAnnoteFeatureExtractor
76

87
/**
98
* Calls the feature_extractor function with the given audio input.
@@ -14,58 +13,12 @@ export class PyAnnoteProcessor extends Processor {
1413
return await this.feature_extractor(audio)
1514
}
1615

17-
/**
18-
* NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
19-
* @param {number} samples The number of samples in the audio.
20-
* @returns {number} The number of frames in the audio.
21-
*/
22-
samples_to_frames(samples) {
23-
return ((samples - this.config.offset) / this.config.step);
16+
/** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization']} */
17+
post_process_speaker_diarization(...args) {
18+
return /** @type {PyAnnoteFeatureExtractor} */(this.feature_extractor).post_process_speaker_diarization(...args);
2419
}
2520

26-
/**
27-
* Post-processes the speaker diarization logits output by the model.
28-
* @param {import('../../utils/tensor.js').Tensor} logits The speaker diarization logits output by the model.
29-
* @param {number} num_samples Number of samples in the input audio.
30-
* @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>>} The post-processed speaker diarization results.
31-
*/
32-
post_process_speaker_diarization(logits, num_samples) {
33-
const ratio = (
34-
num_samples / this.samples_to_frames(num_samples)
35-
) / this.config.sampling_rate;
36-
37-
const results = [];
38-
for (const scores of logits.tolist()) {
39-
const accumulated_segments = [];
40-
41-
let current_speaker = -1;
42-
for (let i = 0; i < scores.length; ++i) {
43-
const probabilities = softmax(scores[i]);
44-
const [score, id] = max(probabilities);
45-
const [start, end] = [i, i + 1];
46-
47-
if (id !== current_speaker) {
48-
// Speaker has changed
49-
current_speaker = id;
50-
accumulated_segments.push({ id, start, end, score });
51-
} else {
52-
// Continue the current segment
53-
accumulated_segments.at(-1).end = end;
54-
accumulated_segments.at(-1).score += score;
55-
}
56-
}
57-
58-
results.push(accumulated_segments.map(
59-
// Convert frame-space to time-space
60-
// and compute the confidence
61-
({ id, start, end, score }) => ({
62-
id,
63-
start: start * ratio,
64-
end: end * ratio,
65-
confidence: score / (end - start),
66-
})
67-
));
68-
}
69-
return results;
21+
get sampling_rate() {
22+
return this.feature_extractor.config.sampling_rate;
7023
}
7124
}

0 commit comments

Comments
 (0)