[micro_wake_word] Use RingBufferAudioSource (#16595)

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
2026-06-24 16:04:55 +00:00 · 2026-05-24 15:34:51 -04:00
parent 747787ae98
commit 9fcb638f33
2 changed files with 71 additions and 54 deletions
--- a/esphome/components/micro_wake_word/micro_wake_word.cpp
+++ b/esphome/components/micro_wake_word/micro_wake_word.cpp
@@ -33,7 +33,8 @@ static const uint32_t INFERENCE_TASK_STACK_SIZE = 3072;
 static const UBaseType_t INFERENCE_TASK_PRIORITY = 3;

 enum EventGroupBits : uint32_t {
-  COMMAND_STOP = (1 << 0),  // Signals the inference task should stop
+  COMMAND_STOP = (1 << 0),               // Signals the inference task should stop
+  COMMAND_RESET_RING_BUFFER = (1 << 1),  // Signals the inference task to discard buffered audio

  TASK_STARTING = (1 << 3),
  TASK_RUNNING = (1 << 4),
@@ -114,13 +115,13 @@ void MicroWakeWord::setup() {
    }
    std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer_.lock();
    if (this->ring_buffer_.use_count() > 1) {
-      size_t bytes_free = temp_ring_buffer->free();
-
-      if (bytes_free < data.size()) {
-        xEventGroupSetBits(this->event_group_, EventGroupBits::WARNING_FULL_RING_BUFFER);
-        temp_ring_buffer->reset();
+      // Producer-only write: never touches consumer state. If the buffer is full, ask the inference task
+      // to drain it - reset() is a consumer operation and must run on the inference task's thread.
+      // Disable partial writes so audio chunks are either fully accepted or rejected and handled below.
+      if (temp_ring_buffer->write_without_replacement(data.data(), data.size(), 0, false) == 0) {
+        xEventGroupSetBits(this->event_group_,
+                           EventGroupBits::WARNING_FULL_RING_BUFFER | EventGroupBits::COMMAND_RESET_RING_BUFFER);
      }
-      temp_ring_buffer->write((void *) data.data(), data.size());
    }
  });

@@ -146,56 +147,65 @@ void MicroWakeWord::inference_task(void *params) {

  {  // Ensures any C++ objects fall out of scope to deallocate before deleting the task

-    const size_t new_bytes_to_process =
-        this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(this_mww->features_step_size_);
-    std::unique_ptr<audio::AudioSourceTransferBuffer> audio_buffer;
+    const auto &stream_info = this_mww->microphone_source_->get_audio_stream_info();
+    const size_t bytes_per_frame = stream_info.frames_to_bytes(1);
+    const size_t max_fill_bytes = stream_info.ms_to_bytes(this_mww->features_step_size_);
+    std::unique_ptr<audio::RingBufferAudioSource> audio_source;
    int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE];

    if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
-      // Allocate audio transfer buffer
-      audio_buffer = audio::AudioSourceTransferBuffer::create(new_bytes_to_process);
-
-      if (audio_buffer == nullptr) {
+      // Round ring buffer size down to a frame multiple so the wrap boundary never splits an int16 sample.
+      const size_t ring_buffer_size =
+          (stream_info.ms_to_bytes(RING_BUFFER_DURATION_MS) / bytes_per_frame) * bytes_per_frame;
+      std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(ring_buffer_size);
+      if (temp_ring_buffer == nullptr) {
        xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
+      } else {
+        audio_source = audio::RingBufferAudioSource::create(temp_ring_buffer, max_fill_bytes,
+                                                            static_cast<uint8_t>(bytes_per_frame));
+        if (audio_source == nullptr) {
+          xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
+        } else {
+          this_mww->ring_buffer_ = temp_ring_buffer;
+        }
      }
    }

-    if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
-      // Allocate ring buffer
-      std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(
-          this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(RING_BUFFER_DURATION_MS));
-      if (temp_ring_buffer.use_count() == 0) {
-        xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
-      }
-      audio_buffer->set_source(temp_ring_buffer);
-      this_mww->ring_buffer_ = temp_ring_buffer;
-    }
-
    if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
      this_mww->microphone_source_->start();
      xEventGroupSetBits(this_mww->event_group_, EventGroupBits::TASK_RUNNING);

-      while (!(xEventGroupGetBits(this_mww->event_group_) & COMMAND_STOP)) {
-        audio_buffer->transfer_data_from_source(pdMS_TO_TICKS(DATA_TIMEOUT_MS));
-
-        if (audio_buffer->available() < new_bytes_to_process) {
-          // Insufficient data to generate new spectrogram features, read more next iteration
-          continue;
+      while (!(xEventGroupGetBits(this_mww->event_group_) & (COMMAND_STOP | ERROR_BITS))) {
+        if (xEventGroupGetBits(this_mww->event_group_) & EventGroupBits::COMMAND_RESET_RING_BUFFER) {
+          // Producer asked us to drain; run the consumer-side reset from this thread.
+          audio_source->clear_buffered_data();
+          xEventGroupClearBits(this_mww->event_group_, EventGroupBits::COMMAND_RESET_RING_BUFFER);
        }

-        // Generate new spectrogram features
-        uint32_t processed_samples = this_mww->generate_features_(
-            (int16_t *) audio_buffer->get_buffer_start(), audio_buffer->available() / sizeof(int16_t), features_buffer);
-        audio_buffer->decrease_buffer_length(processed_samples * sizeof(int16_t));
+        audio_source->fill(pdMS_TO_TICKS(DATA_TIMEOUT_MS), false);

-        // Run inference using the new spectorgram features
-        if (!this_mww->update_model_probabilities_(features_buffer)) {
-          xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_INFERENCE);
-          break;
+        // The frontend buffers samples internally and only emits a feature once it has a full window, so we can
+        // hand it whatever the source exposes. The frontend consumes at least one sample per call, so available()
+        // strictly decreases and this loop always terminates.
+        while (audio_source->available() >= sizeof(int16_t)) {
+          const size_t samples_available = audio_source->available() / sizeof(int16_t);
+          const int16_t *audio_data = reinterpret_cast<const int16_t *>(audio_source->data());
+
+          size_t processed_samples = 0;
+          const bool feature_generated =
+              this_mww->generate_features_(audio_data, samples_available, features_buffer, &processed_samples);
+          audio_source->consume(processed_samples * sizeof(int16_t));
+
+          if (feature_generated) {
+            if (!this_mww->update_model_probabilities_(features_buffer)) {
+              xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_INFERENCE);
+              break;
+            }
+
+            // Process each model's probabilities and possibly send a Detection Event to the queue
+            this_mww->process_probabilities_();
+          }
        }
-
-        // Process each model's probabilities and possibly send a Detection Event to the queue
-        this_mww->process_probabilities_();
      }
    }
  }
@@ -386,11 +396,15 @@ void MicroWakeWord::set_state_(State state) {
  }
 }

-size_t MicroWakeWord::generate_features_(int16_t *audio_buffer, size_t samples_available,
-                                         int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE]) {
-  size_t processed_samples = 0;
+bool MicroWakeWord::generate_features_(const int16_t *audio_buffer, size_t samples_available,
+                                       int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE], size_t *processed_samples) {
+  *processed_samples = 0;
  struct FrontendOutput frontend_output =
-      FrontendProcessSamples(&this->frontend_state_, audio_buffer, samples_available, &processed_samples);
+      FrontendProcessSamples(&this->frontend_state_, audio_buffer, samples_available, processed_samples);
+
+  if (frontend_output.size == 0) {
+    return false;
+  }

  for (size_t i = 0; i < frontend_output.size; ++i) {
    // These scaling values are set to match the TFLite audio frontend int8 output.
@@ -415,7 +429,7 @@ size_t MicroWakeWord::generate_features_(int16_t *audio_buffer, size_t samples_a
    features_buffer[i] = static_cast<int8_t>(clamp<int32_t>(value, INT8_MIN, INT8_MAX));
  }

-  return processed_samples;
+  return true;
 }

 void MicroWakeWord::process_probabilities_() {
--- a/esphome/components/micro_wake_word/micro_wake_word.h
+++ b/esphome/components/micro_wake_word/micro_wake_word.h
@@ -115,13 +115,16 @@ class MicroWakeWord : public Component

  void set_state_(State state);

-  /// @brief Generates spectrogram features from an input buffer of audio samples
-  /// @param audio_buffer (int16_t *) Buffer containing input audio samples
-  /// @param samples_available (size_t) Number of samples avaiable in the input buffer
-  /// @param features_buffer (int8_t *) Buffer to store generated features
-  /// @return (size_t) Number of samples processed from the input buffer
-  size_t generate_features_(int16_t *audio_buffer, size_t samples_available,
-                            int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE]);
+  /// @brief Generates a spectrogram feature from an input buffer of audio samples. The frontend buffers samples
+  /// internally, so callers may stream arbitrary-sized chunks; a feature is only emitted once enough samples have
+  /// accumulated to fill a full analysis window.
+  /// @param audio_buffer (const int16_t *) Buffer containing input audio samples
+  /// @param samples_available (size_t) Number of samples available in the input buffer
+  /// @param features_buffer (int8_t *) Buffer to store the generated feature, valid only when the return value is true
+  /// @param processed_samples (size_t *) Set to the number of samples consumed from the input buffer
+  /// @return True if a new feature was generated; false if more samples are required
+  bool generate_features_(const int16_t *audio_buffer, size_t samples_available,
+                          int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE], size_t *processed_samples);

  /// @brief Processes any new probabilities for each model. If any wake word is detected, it will send a DetectionEvent
  /// to the detection_queue_.