import { Processor } from '../../base/processing_utils.js';
import { PyAnnoteFeatureExtractor } from './feature_extraction_pyannote.js';
export class PyAnnoteProcessor extends Processor {
6
- static feature_extractor_class = AutoFeatureExtractor
5
+ static feature_extractor_class = PyAnnoteFeatureExtractor
7
6
8
7
/**
9
8
* Calls the feature_extractor function with the given audio input.
@@ -14,58 +13,12 @@ export class PyAnnoteProcessor extends Processor {
14
13
return await this . feature_extractor ( audio )
15
14
}
16
15
17
- /**
18
- * NOTE: Can return fractional values. `Math.ceil` will ensure correct value.
19
- * @param {number } samples The number of frames in the audio.
20
- * @returns {number } The number of frames in the audio.
21
- */
22
- samples_to_frames ( samples ) {
23
- return ( ( samples - this . config . offset ) / this . config . step ) ;
16
+ /** @type {PyAnnoteFeatureExtractor['post_process_speaker_diarization'] } */
17
+ post_process_speaker_diarization ( ...args ) {
18
+ return /** @type {PyAnnoteFeatureExtractor } */ ( this . feature_extractor ) . post_process_speaker_diarization ( ...args ) ;
24
19
}
25
20
26
- /**
27
- * Post-processes the speaker diarization logits output by the model.
28
- * @param {import('../../utils/tensor.js').Tensor } logits The speaker diarization logits output by the model.
29
- * @param {number } num_samples Number of samples in the input audio.
30
- * @returns {Array<Array<{ id: number, start: number, end: number, confidence: number }>> } The post-processed speaker diarization results.
31
- */
32
- post_process_speaker_diarization ( logits , num_samples ) {
33
- const ratio = (
34
- num_samples / this . samples_to_frames ( num_samples )
35
- ) / this . config . sampling_rate ;
36
-
37
- const results = [ ] ;
38
- for ( const scores of logits . tolist ( ) ) {
39
- const accumulated_segments = [ ] ;
40
-
41
- let current_speaker = - 1 ;
42
- for ( let i = 0 ; i < scores . length ; ++ i ) {
43
- const probabilities = softmax ( scores [ i ] ) ;
44
- const [ score , id ] = max ( probabilities ) ;
45
- const [ start , end ] = [ i , i + 1 ] ;
46
-
47
- if ( id !== current_speaker ) {
48
- // Speaker has changed
49
- current_speaker = id ;
50
- accumulated_segments . push ( { id, start, end, score } ) ;
51
- } else {
52
- // Continue the current segment
53
- accumulated_segments . at ( - 1 ) . end = end ;
54
- accumulated_segments . at ( - 1 ) . score += score ;
55
- }
56
- }
57
-
58
- results . push ( accumulated_segments . map (
59
- // Convert frame-space to time-space
60
- // and compute the confidence
61
- ( { id, start, end, score } ) => ( {
62
- id,
63
- start : start * ratio ,
64
- end : end * ratio ,
65
- confidence : score / ( end - start ) ,
66
- } )
67
- ) ) ;
68
- }
69
- return results ;
21
+ get sampling_rate ( ) {
22
+ return this . feature_extractor . config . sampling_rate ;
70
23
}
71
24
}