diff --git a/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp b/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp
index 5505d8878..3b6123ca9 100644
--- a/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp
+++ b/src/algorithms/machinelearning/tensorflowpredicteffnetdiscogs.cpp
@@ -156,6 +156,11 @@ const char* TensorflowPredictEffnetDiscogs::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "References:\n"
   "\n"
   "1. Supported models at https://essentia.upf.edu/models/\n\n");
diff --git a/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp b/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
index a280c6dde..9f8f29204 100644
--- a/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
+++ b/src/algorithms/machinelearning/tensorflowpredictfsdsinet.cpp
@@ -159,12 +159,17 @@ const char* TensorflowPredictFSDSINet::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "Note: The FSD-SINet models were trained on normalized audio clips. "
   "Clip-level normalization is only implemented in standard mode since in streaming there is no access to the entire audio clip. "
   "In the streaming case, the user is responsible for controlling the dynamic range of the input signal. "
   "Ideally, the signal should be zero-mean (no DC) and normalized to the full dynamic range (-1, 1).\n\n"
   "References:\n"
-  " [1] Fonseca, E., Ferraro, A., & Serra, X. (2021). Improving sound event classification by increasing shift invariance in convolutional neural networks. arXiv preprint arXiv:2107.00623.\n"
+  " [1] Fonseca, E., Ferraro, A., & Serra, X. (2021). Improving sound event classification by increasing shift invariance in convolutional neural networks. arXiv preprint arXiv:2107.00623.\n\n"
   " [2] https://github.com/edufonseca/shift_sec"
 );
diff --git a/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp b/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp
index cd037085e..59aa13ce3 100644
--- a/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp
+++ b/src/algorithms/machinelearning/tensorflowpredictmusicnn.cpp
@@ -158,6 +158,11 @@ const char* TensorflowPredictMusiCNN::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "References:\n"
   "\n"
   "1. Pons, J., & Serra, X. (2019). musicnn: Pre-trained convolutional neural "
diff --git a/src/algorithms/machinelearning/tensorflowpredictvggish.cpp b/src/algorithms/machinelearning/tensorflowpredictvggish.cpp
index 3f628c16b..4f7076227 100644
--- a/src/algorithms/machinelearning/tensorflowpredictvggish.cpp
+++ b/src/algorithms/machinelearning/tensorflowpredictvggish.cpp
@@ -156,6 +156,11 @@ const char* TensorflowPredictVGGish::description = DOC(
   "Note: This algorithm does not make any check on the input model so it is "
   "the user's responsibility to make sure it is a valid one.\n"
   "\n"
+  "Note: The output of this algorithm is 2D, which is suitable for extracting embeddings or "
+  "class activations (the output shape is, e.g., [time, number of classes]). If the output "
+  "parameter is set to an intermediate layer with more dimensions, the output will be "
+  "flattened to 2D.\n"
+  "\n"
   "References:\n"
   "\n"
   "1. Gemmeke, J. et. al., AudioSet: An ontology and human-labelled dataset "
diff --git a/src/algorithms/standard/tensortovectorreal.cpp b/src/algorithms/standard/tensortovectorreal.cpp
index a5092ec40..001e477ff 100644
--- a/src/algorithms/standard/tensortovectorreal.cpp
+++ b/src/algorithms/standard/tensortovectorreal.cpp
@@ -36,6 +36,7 @@ void TensorToVectorReal::configure() {
   _channels = 0;
   _timeStamps = 0;
   _featsSize = 0;
+  _warned = false;
 }


@@ -44,6 +45,7 @@ void TensorToVectorReal::reset() {
   _channels = 0;
   _timeStamps = 0;
   _featsSize = 0;
+  _warned = false;
 }


@@ -66,6 +68,11 @@ AlgorithmStatus TensorToVectorReal::process() {
   _timeStamps = tensor.dimension(2);
   _featsSize = tensor.dimension(3);

+  if (_channels != 1 && !_warned) {
+    E_WARNING("TensorToVectorReal: The channel axis (dimension 1) of the input tensor has size larger than 1, but the output of this algorithm is 2D. The batch, channel, and time axes (dimensions 0, 1, 2) will be flattened to the first dimension of the output matrix.");
+    _warned = true;
+  }
+
   _frame.setAcquireSize(_timeStamps * _channels * _batchSize);
   _frame.setReleaseSize(_timeStamps * _channels *_batchSize);

diff --git a/src/algorithms/standard/tensortovectorreal.h b/src/algorithms/standard/tensortovectorreal.h
index 1f196b18a..b2e04612b 100644
--- a/src/algorithms/standard/tensortovectorreal.h
+++ b/src/algorithms/standard/tensortovectorreal.h
@@ -37,6 +37,7 @@ class TensorToVectorReal : public Algorithm {
   int _channels;
   int _timeStamps;
   int _featsSize;
+  bool _warned;

  public:
   TensorToVectorReal(){
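
Below is a minimal standard-mode usage sketch illustrating the 2D [time, number of classes] output described by the docstring notes above. It is not part of the patch: the audio file name, the model file "msd-musicnn-1.pb", and the output node "model/dense/BiasAdd" are placeholder assumptions; use a model and layer name from https://essentia.upf.edu/models/, and build Essentia with TensorFlow support.

// Sketch only: file names and the output node are hypothetical placeholders.
#include <vector>
#include <essentia/essentia.h>
#include <essentia/algorithmfactory.h>

using namespace essentia;
using namespace essentia::standard;

int main() {
  essentia::init();
  AlgorithmFactory& factory = AlgorithmFactory::instance();

  // MusiCNN models expect 16 kHz mono audio.
  Algorithm* loader = factory.create("MonoLoader",
                                     "filename", "audio.wav",
                                     "sampleRate", 16000);
  Algorithm* model = factory.create("TensorflowPredictMusiCNN",
                                    "graphFilename", "msd-musicnn-1.pb",
                                    "output", "model/dense/BiasAdd");

  std::vector<Real> signal;
  // Predictions are always 2D: one row per patch, one column per class/unit.
  std::vector<std::vector<Real> > predictions;

  loader->output("audio").set(signal);
  loader->compute();

  model->input("signal").set(signal);
  model->output("predictions").set(predictions);
  model->compute();

  delete loader;
  delete model;
  essentia::shutdown();
  return 0;
}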