model.py

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Keyword spotter model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import sys
import audio_recorder
import mel_features
import numpy as np
import queue
import tflite_runtime.interpreter as tflite
import platform

EDGETPU_SHARED_LIB = {
    'Linux': 'libedgetpu.so.1',
    'Darwin': 'libedgetpu.1.dylib',
    'Windows': 'edgetpu.dll'
}[platform.system()]

q = queue.Queue()

logging.basicConfig(
    stream=sys.stdout,
    format="%(levelname)-8s %(asctime)-15s %(name)s %(message)s")
audio_recorder.logger.setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)


def get_queue():
  return q


class Uint8LogMelFeatureExtractor(object):
  """Provide uint8 log mel spectrogram slices from an AudioRecorder object.

  This class provides one public method, get_next_spectrogram(), which gets
  a specified number of spectral slices from an AudioRecorder.
  """

  def __init__(self, num_frames_hop=33):
    self.spectrogram_window_length_seconds = 0.025
    self.spectrogram_hop_length_seconds = 0.010
    self.num_mel_bins = 32
    self.frame_length_spectra = 198
    if self.frame_length_spectra % num_frames_hop:
        raise ValueError('Invalid num_frames_hop value (%d), '
                         'must devide %d' % (num_frames_hop,
                                             self.frame_length_spectra))
    self.frame_hop_spectra = num_frames_hop
    self._norm_factor = 3
    self._clear_buffers()

  def _clear_buffers(self):
    self._audio_buffer = np.array([], dtype=np.int16).reshape(0, 1)
    self._spectrogram = np.zeros((self.frame_length_spectra, self.num_mel_bins),
                                 dtype=np.float32)

  def _spectrogram_underlap_samples(self, audio_sample_rate_hz):
    return int((self.spectrogram_window_length_seconds -
                self.spectrogram_hop_length_seconds) * audio_sample_rate_hz)

  def _frame_duration_seconds(self, num_spectra):
    return (self.spectrogram_window_length_seconds +
            (num_spectra - 1) * self.spectrogram_hop_length_seconds)

  def _compute_spectrogram(self, audio_samples, audio_sample_rate_hz):
    """Compute log-mel spectrogram and scale it to uint8."""
    samples = audio_samples.flatten() / float(2**15)
    spectrogram = 30 * (
        mel_features.log_mel_spectrogram(
            samples,
            audio_sample_rate_hz,
            log_offset=0.001,
            window_length_secs=self.spectrogram_window_length_seconds,
            hop_length_secs=self.spectrogram_hop_length_seconds,
            num_mel_bins=self.num_mel_bins,
            lower_edge_hertz=60,
            upper_edge_hertz=3800) - np.log(1e-3))
    return spectrogram

  def _get_next_spectra(self, recorder, num_spectra):
    """Returns the next spectrogram.

    Compute num_spectra spectrogram samples from an AudioRecorder.
    Blocks until num_spectra spectrogram slices are available.

    Args:
      recorder: an AudioRecorder object from which to get raw audio samples.
      num_spectra: the number of spectrogram slices to return.

    Returns:
      num_spectra spectrogram slices computed from the samples.
    """
    required_audio_duration_seconds = self._frame_duration_seconds(num_spectra)
    logger.info("required_audio_duration_seconds %f",
                required_audio_duration_seconds)
    required_num_samples = int(
        np.ceil(required_audio_duration_seconds *
                recorder.audio_sample_rate_hz))
    logger.info("required_num_samples %d, %s", required_num_samples,
                str(self._audio_buffer.shape))
    audio_samples = np.concatenate(
        (self._audio_buffer,
         recorder.get_audio(required_num_samples - len(self._audio_buffer))[0]))
    self._audio_buffer = audio_samples[
        required_num_samples -
        self._spectrogram_underlap_samples(recorder.audio_sample_rate_hz):]
    spectrogram = self._compute_spectrogram(
        audio_samples[:required_num_samples], recorder.audio_sample_rate_hz)
    assert len(spectrogram) == num_spectra
    return spectrogram

  def get_next_spectrogram(self, recorder):
    """Get the most recent spectrogram frame.

    Blocks until the frame is available.

    Args:
      recorder: an AudioRecorder instance which provides the audio samples.

    Returns:
      The next spectrogram frame as a uint8 numpy array.
    """
    assert recorder.is_active
    logger.info("self._spectrogram shape %s", str(self._spectrogram.shape))
    self._spectrogram[:-self.frame_hop_spectra] = (
        self._spectrogram[self.frame_hop_spectra:])
    self._spectrogram[-self.frame_hop_spectra:] = (
        self._get_next_spectra(recorder, self.frame_hop_spectra))
    # Return a copy of the internal state that's safe to persist and won't
    # change the next time we call this function.
    logger.info("self._spectrogram shape %s", str(self._spectrogram.shape))
    spectrogram = self._spectrogram.copy()
    spectrogram -= np.mean(spectrogram, axis=0)
    if self._norm_factor:
      spectrogram /= self._norm_factor * np.std(spectrogram, axis=0)
      spectrogram += 1
      spectrogram *= 127.5
    return np.maximum(0, np.minimum(255, spectrogram)).astype(np.uint8)


def read_labels(filename):
  # The labels file can be made something like this.
  f = open(filename, "r")
  lines = f.readlines()
  return ['negative'] + [l.rstrip() for l in lines]


def read_commands(filename):
  # commands should consist of a label, a command and a confidence.
  f = open(filename, "r")
  commands = {}
  lines = f.readlines()
  for command, key, confidence in [l.rstrip().split(',') for l in lines]:
    commands[command] = { 'key': key, 'conf': 0.4}
    if confidence and 0 <= float(confidence) <= 1:
      commands[command]['conf'] = float(confidence)
  return commands


def get_output(interpreter):
    """Returns entire output, threshold is applied later."""
    return output_tensor(interpreter, 0)

def output_tensor(interpreter, i):
    """Returns dequantized output tensor if quantized before."""
    output_details = interpreter.get_output_details()[i]
    output_data = np.squeeze(interpreter.tensor(output_details['index'])())
    if 'quantization' not in output_details:
        return output_data
    scale, zero_point = output_details['quantization']
    if scale == 0:
        return output_data - zero_point
    return scale * (output_data - zero_point)


def input_tensor(interpreter):
    """Returns the input tensor view as numpy array."""
    tensor_index = interpreter.get_input_details()[0]['index']
    return interpreter.tensor(tensor_index)()[0]


def set_input(interpreter, data):
    """Copies data to input tensor."""
    interpreter_shape = interpreter.get_input_details()[0]['shape']
    input_tensor(interpreter)[:,:] = np.reshape(data, interpreter_shape[1:3])


def make_interpreter(model_file):
    model_file, *device = model_file.split('@')
    return tflite.Interpreter(
      model_path=model_file,
      experimental_delegates=[
          tflite.load_delegate(EDGETPU_SHARED_LIB,
                               {'device': device[0]} if device else {})
      ])


def add_model_flags(parser):
  parser.add_argument(
      "--model_file",
      help="File path of TFlite model.",
      default="models/voice_commands_v0.7_edgetpu.tflite")
  parser.add_argument("--mic", default=None,
                      help="Optional: Input source microphone ID.")
  parser.add_argument(
      "--num_frames_hop",
      default=33,
      help="Optional: Number of frames to wait between model inference "
      "calls. Smaller numbers will reduce the latancy while increasing "
      "compute cost. Must devide 198. Defaults to 33.")
  parser.add_argument(
      "--sample_rate_hz",
      default=16000,
      help="Optional: Sample Rate. The model expects 16000. "
      "However you may alternative sampling rate that may or may not work."
      "If you specify 48000 it will be downsampled to 16000.")

def classify_audio(audio_device_index, interpreter, labels_file,
                   commands_file=None,
                   result_callback=None, dectection_callback=None,
                   sample_rate_hz=16000,
                   negative_threshold=0.6, num_frames_hop=33):
  """Acquire audio, preprocess, and classify."""
  # Initialize recorder.
  AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
  downsample_factor = 1
  if AUDIO_SAMPLE_RATE_HZ == 48000:
    downsample_factor = 3
  # Most microphones support this
  # Because the model expects 16KHz audio, we downsample 3 fold
  recorder = audio_recorder.AudioRecorder(
      AUDIO_SAMPLE_RATE_HZ,
      downsample_factor=downsample_factor,
      device_index=audio_device_index)
  feature_extractor = Uint8LogMelFeatureExtractor(num_frames_hop=num_frames_hop)
  labels = read_labels(labels_file)
  if commands_file:
    commands = read_commands(commands_file)
  else:
    commands = {}
  logger.info("Loaded commands: %s", str(commands))
  logger.info("Recording")
  timed_out = False
  with recorder:
    last_detection = -1
    while not timed_out:
      spectrogram = feature_extractor.get_next_spectrogram(recorder)
      set_input(interpreter, spectrogram.flatten())
      interpreter.invoke()
      result = get_output(interpreter)
      if result_callback:
        result_callback(result, commands, labels)
      if dectection_callback:
        detection = -1
        if result[0] < negative_threshold:
          top3 = np.argsort(-result)[:3]
          for p in range(3):
            label = labels[top3[p]]
            if label not in commands.keys():
              continue
            if top3[p] and result[top3[p]] > commands[label]['conf']:
              if detection < 0:
                detection = top3[p]
        if detection < 0 and last_detection > 0:
          print("---------------")
          last_detection = 0
        if labels[detection] in commands.keys() and detection != last_detection:
          print(labels[detection], commands[labels[detection]])
          dectection_callback(commands[labels[detection]]['key'])
          last_detection = detection
      if spectrogram.mean() < 0.001:
        print("Warning: Input audio signal is nearly 0. Mic may be off ?")