[i2s_audio] Properly track DMA input/output (#16317)

2026-06-24 12:53:26 +00:00 · 2026-05-12 21:36:26 -04:00
parent 3df0527c1f
commit 65b53692bd
3 changed files with 178 additions and 80 deletions
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
@@ -99,7 +99,7 @@ void I2SAudioSpeakerBase::loop() {
  }

  if (event_group_bits & SpeakerEventGroupBits::ERR_ESP_NO_MEM) {
-    ESP_LOGE(TAG, "Not enough memory");
+    ESP_LOGE(TAG, "Speaker task setup failed (allocation, preload, or channel enable)");
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
  }

--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
@@ -36,9 +36,7 @@ enum SpeakerEventGroupBits : uint32_t {
  ERR_ESP_NO_MEM = (1 << 19),

  ERR_DROPPED_EVENT = (1 << 20),    // ISR overflowed the event queue, dropping a completion event
-  ERR_PARTIAL_WRITE = (1 << 21),    // a DMA write returned fewer bytes than requested (or the encoder
-                                    // failed to commit a complete block), which breaks the lockstep
-                                    // invariant for every subsequent event
+  ERR_PARTIAL_WRITE = (1 << 21),    // i2s_channel_write returned fewer bytes than requested
  ERR_LOCKSTEP_DESYNC = (1 << 22),  // i2s_event_queue_ and write_records_queue_ fell out of sync

  ALL_BITS = 0x00FFFFFF,  // All valid FreeRTOS event group bits
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp
@@ -17,7 +17,14 @@ namespace esphome::i2s_audio {
 static const char *const TAG = "i2s_audio.speaker.std";

 static constexpr size_t DMA_BUFFERS_COUNT = 4;
-static constexpr size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT + 1;
+// Sized to comfortably absorb scheduling jitter: at most DMA_BUFFERS_COUNT events can be in flight,
+// doubled so that a transient backlog never overruns the queue (which would desync the lockstep
+// invariant between i2s_event_queue_ and write_records_queue_).
+static constexpr size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT * 2;
+// Generous timeout for ``i2s_channel_write`` blocking. A buffer frees roughly every
+// DMA_BUFFER_DURATION_MS, so a multiple of that gives plenty of slack against scheduling jitter
+// without masking real failures.
+static constexpr TickType_t WRITE_TIMEOUT_TICKS = pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * (DMA_BUFFERS_COUNT + 1));

 void I2SAudioSpeaker::dump_config() {
  I2SAudioSpeakerBase::dump_config();
@@ -49,30 +56,73 @@ void I2SAudioSpeaker::run_speaker_task() {
  // avoids unnecessary single-frame splices.
  const size_t ring_buffer_size =
      (this->current_stream_info_.ms_to_bytes(ring_buffer_duration) / bytes_per_frame) * bytes_per_frame;
-  const uint32_t frames_to_fill_single_dma_buffer = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
-  const size_t bytes_to_fill_single_dma_buffer =
-      this->current_stream_info_.frames_to_bytes(frames_to_fill_single_dma_buffer);
+  const uint32_t frames_per_dma_buffer = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
+  const size_t dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(frames_per_dma_buffer);

  bool successful_setup = false;
+
  std::unique_ptr<audio::RingBufferAudioSource> audio_source;

-  {
+  // Pre-zeroed buffer used to silence-pad each DMA descriptor whenever real audio doesn't fully fill it.
+  RAMAllocator<uint8_t> silence_allocator;
+  uint8_t *silence_buffer = silence_allocator.allocate(dma_buffer_bytes);
+
+  if (silence_buffer != nullptr) {
+    memset(silence_buffer, 0, dma_buffer_bytes);
+
    std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(ring_buffer_size);
-    audio_source = audio::RingBufferAudioSource::create(temp_ring_buffer, bytes_to_fill_single_dma_buffer,
-                                                        static_cast<uint8_t>(bytes_per_frame));
+    audio_source =
+        audio::RingBufferAudioSource::create(temp_ring_buffer, dma_buffer_bytes, static_cast<uint8_t>(bytes_per_frame));
+
    if (audio_source != nullptr) {
+      // Reaching here means the ring buffer allocated: audio_source is nullptr when that allocation fails
      this->audio_ring_buffer_ = temp_ring_buffer;
      successful_setup = true;
    }
  }

+  if (successful_setup) {
+    // Preload every DMA descriptor with silence and push a matching zero-real-frames record per buffer.
+    // This guarantees that every on_sent event has a corresponding write record from the start, so
+    // ``i2s_event_queue_`` and ``write_records_queue_`` stay in lockstep for the entire task lifetime.
+    for (size_t i = 0; i < DMA_BUFFERS_COUNT; i++) {
+      size_t bytes_loaded = 0;
+      esp_err_t err = i2s_channel_preload_data(this->tx_handle_, silence_buffer, dma_buffer_bytes, &bytes_loaded);
+      if (err != ESP_OK || bytes_loaded != dma_buffer_bytes) {
+        ESP_LOGV(TAG, "Failed to preload silence into DMA buffer %u (err=%d, loaded=%u)", (unsigned) i, (int) err,
+                 (unsigned) bytes_loaded);
+        successful_setup = false;
+        break;
+      }
+      uint32_t zero_real_frames = 0;
+      if (xQueueSend(this->write_records_queue_, &zero_real_frames, 0) != pdTRUE) {
+        // Should never happen: the queue was just reset and is sized for DMA_BUFFERS_COUNT * 2 entries.
+        ESP_LOGV(TAG, "Failed to push preload write record");
+        successful_setup = false;
+        break;
+      }
+    }
+  }
+
+  if (successful_setup) {
+    // Register the on_sent callback BEFORE enabling the channel so the very first transmitted buffer
+    // generates a queued event that pairs with the first preloaded silence record.
+    const i2s_event_callbacks_t callbacks = {.on_sent = i2s_on_sent_cb};
+    i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
+
+    if (i2s_channel_enable(this->tx_handle_) != ESP_OK) {
+      ESP_LOGV(TAG, "Failed to enable I2S channel");
+      successful_setup = false;
+    }
+  }
+
  if (!successful_setup) {
    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
  } else {
    bool stop_gracefully = false;
-    bool tx_dma_underflow = true;
-
-    uint32_t frames_written = 0;
+    // Number of records currently in ``write_records_queue_`` that carry real audio. Used by graceful
+    // stop to wait until every real-audio buffer has been confirmed played by an ISR event.
+    uint32_t pending_real_buffers = 0;
    uint32_t last_data_received_time = millis();

    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_RUNNING);
@@ -81,11 +131,21 @@ void I2SAudioSpeaker::run_speaker_task() {
    // - Paused, OR
    // - No timeout configured, OR
    // - Timeout hasn't elapsed since last data
+    //
+    // Always-fill model: every iteration writes exactly one DMA buffer's worth, mixing real audio
+    // and silence padding as needed. The blocking ``i2s_channel_write`` paces the loop at the DMA
+    // consumption rate, and every buffer write is matched 1:1 with a record on ``write_records_queue_``.
+    //
+    // While paused, the real-audio fill is skipped and the entire DMA buffer is filled with silence;
+    // the same blocking ``i2s_channel_write`` provides natural pacing (one buffer per ~DMA_BUFFER_DURATION_MS),
+    // so the lockstep invariant is preserved without burning CPU.
    while (this->pause_state_ || !this->timeout_.has_value() ||
           (millis() - last_data_received_time) <= this->timeout_.value()) {
      uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);

      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
+        // COMMAND_STOP is set both by user-initiated stop() and by the ISR when it drops a completion
+        // event (paired with ERR_DROPPED_EVENT so loop() can distinguish the two cases).
        xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
        ESP_LOGV(TAG, "Exiting: COMMAND_STOP received");
        break;
@@ -101,80 +161,115 @@ void I2SAudioSpeaker::run_speaker_task() {
        break;
      }

+      // Drain ISR-stamped completion events. Each event corresponds 1:1 with a write_records_queue_
+      // entry by construction (preloaded records at startup, plus exactly one record pushed per
+      // iteration alongside exactly one DMA-buffer-sized write).
      int64_t write_timestamp;
+      bool lockstep_broken = false;
      while (xQueueReceive(this->i2s_event_queue_, &write_timestamp, 0)) {
-        // Receives timing events from the I2S on_sent callback. If actual audio data was sent in this event, it passes
-        // on the timing info via the audio_output_callback.
-        uint32_t frames_sent = frames_to_fill_single_dma_buffer;
-        if (frames_to_fill_single_dma_buffer > frames_written) {
-          tx_dma_underflow = true;
-          frames_sent = frames_written;
-          const uint32_t frames_zeroed = frames_to_fill_single_dma_buffer - frames_written;
-          write_timestamp -= this->current_stream_info_.frames_to_microseconds(frames_zeroed);
-        } else {
-          tx_dma_underflow = false;
-        }
-        frames_written -= frames_sent;
-
-        // Standard I2S mode: fire callback immediately for each event
-        if (frames_sent > 0) {
-          this->audio_output_callback_(frames_sent, write_timestamp);
-        }
-      }
-
-      if (this->pause_state_) {
-        // Pause state is accessed atomically, so thread safe
-        // Delay so the task yields, then skip transferring audio data
-        vTaskDelay(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
-        continue;
-      }
-
-      // Wait half the duration of the data already written to the DMA buffers for new audio data
-      // The millisecond helper modifies the frames_written variable, so use the microsecond helper and divide by 1000
-      uint32_t read_delay = (this->current_stream_info_.frames_to_microseconds(frames_written) / 1000) / 2;
-
-      size_t bytes_read = audio_source->fill(pdMS_TO_TICKS(read_delay), false);
-      uint8_t *new_data = audio_source->mutable_data();
-
-      if (bytes_read > 0) {
-        this->apply_software_volume_(new_data, bytes_read);
-        this->swap_esp32_mono_samples_(new_data, bytes_read);
-      }
-
-      if (audio_source->available() == 0) {
-        if (stop_gracefully && tx_dma_underflow) {
+        uint32_t real_frames = 0;
+        if (xQueueReceive(this->write_records_queue_, &real_frames, 0) != pdTRUE) {
+          // Should never happen: would indicate the lockstep invariant is broken.
+          ESP_LOGV(TAG, "Event without matching write record");
+          xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_LOCKSTEP_DESYNC);
+          lockstep_broken = true;
          break;
        }
-        vTaskDelay(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS / 2));
-      } else {
-        size_t bytes_written = 0;
-
-        if (tx_dma_underflow) {
-          // Temporarily disable channel and callback to reset the I2S driver's internal DMA buffer queue
-          i2s_channel_disable(this->tx_handle_);
-          const i2s_event_callbacks_t null_callbacks = {.on_sent = nullptr};
-          i2s_channel_register_event_callback(this->tx_handle_, &null_callbacks, this);
-          i2s_channel_preload_data(this->tx_handle_, audio_source->data(), audio_source->available(), &bytes_written);
-        } else {
-          // Audio is already playing, use regular write to add to the DMA buffers
-          i2s_channel_write(this->tx_handle_, audio_source->data(), audio_source->available(), &bytes_written,
-                            DMA_BUFFER_DURATION_MS);
+        if (real_frames > 0) {
+          pending_real_buffers--;
+          // Real audio is packed at the start of each DMA buffer with any silence padding on the
+          // tail, so the real audio finished playing earlier than the buffer-completion timestamp
+          // by the duration of the trailing zeros.
+          const uint32_t silence_frames = frames_per_dma_buffer - real_frames;
+          const int64_t adjusted_ts =
+              write_timestamp - this->current_stream_info_.frames_to_microseconds(silence_frames);
+          this->audio_output_callback_(real_frames, adjusted_ts);
        }
+      }
+      if (lockstep_broken) {
+        break;
+      }

-        if (bytes_written > 0) {
-          last_data_received_time = millis();
-          frames_written += this->current_stream_info_.bytes_to_frames(bytes_written);
-          audio_source->consume(bytes_written);
+      // Graceful stop: exit only after the source's exposed chunk is drained, the underlying ring
+      // buffer has nothing left to hand over, and every real-audio buffer we submitted has been
+      // confirmed played. ``has_buffered_data()`` reports whether bytes are still sitting in the
+      // ring buffer awaiting fill().
+      if (stop_gracefully && audio_source->available() == 0 && !this->has_buffered_data() &&
+          pending_real_buffers == 0) {
+        ESP_LOGV(TAG, "Exiting: graceful stop complete");
+        break;
+      }

-          if (tx_dma_underflow) {
-            tx_dma_underflow = false;
-            // Enable the on_sent callback and channel after preload
-            xQueueReset(this->i2s_event_queue_);
-            const i2s_event_callbacks_t callbacks = {.on_sent = i2s_on_sent_cb};
-            i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
-            i2s_channel_enable(this->tx_handle_);
+      // Compose exactly one DMA buffer's worth: drain as much real audio as the source currently
+      // exposes (may take multiple fill() calls when crossing a ring buffer wrap), then pad any
+      // remainder with silence. All writes pack into the next free DMA descriptor in order, so the
+      // descriptor ends up holding [real audio][silence padding].
+      size_t bytes_written_total = 0;
+      size_t real_bytes_total = 0;
+      bool partial_write_failure = false;
+
+      if (!this->pause_state_) {
+        while (bytes_written_total < dma_buffer_bytes) {
+          size_t bytes_read = audio_source->fill(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS) / 2, false);
+          if (bytes_read > 0) {
+            uint8_t *new_data = audio_source->mutable_data() + audio_source->available() - bytes_read;
+            this->apply_software_volume_(new_data, bytes_read);
+            this->swap_esp32_mono_samples_(new_data, bytes_read);
          }
+
+          const size_t to_write = std::min(audio_source->available(), dma_buffer_bytes - bytes_written_total);
+          if (to_write == 0) {
+            // Ring buffer has nothing more to hand over right now; pad the rest of this DMA buffer
+            // with silence so the lockstep invariant (one full DMA buffer per iteration) is preserved.
+            break;
+          }
+
+          size_t bw = 0;
+          i2s_channel_write(this->tx_handle_, audio_source->data(), to_write, &bw, WRITE_TIMEOUT_TICKS);
+          if (bw != to_write) {
+            // A short real-audio write breaks DMA descriptor alignment for every subsequent event;
+            // the only safe recovery is to restart the task.
+            ESP_LOGV(TAG, "Partial real audio write: %u of %u bytes", (unsigned) bw, (unsigned) to_write);
+            xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_PARTIAL_WRITE);
+            partial_write_failure = true;
+            break;
+          }
+          audio_source->consume(bw);
+          bytes_written_total += bw;
+          real_bytes_total += bw;
        }
+        if (real_bytes_total > 0) {
+          last_data_received_time = millis();
+        }
+      }
+
+      if (partial_write_failure) {
+        break;
+      }
+
+      const size_t silence_bytes = dma_buffer_bytes - bytes_written_total;
+      if (silence_bytes > 0) {
+        size_t bw = 0;
+        i2s_channel_write(this->tx_handle_, silence_buffer, silence_bytes, &bw, WRITE_TIMEOUT_TICKS);
+        if (bw != silence_bytes) {
+          // Same descriptor-alignment hazard as a partial real-audio write.
+          ESP_LOGV(TAG, "Partial silence write: %u of %u bytes", (unsigned) bw, (unsigned) silence_bytes);
+          xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_PARTIAL_WRITE);
+          break;
+        }
+      }
+
+      const uint32_t real_frames_in_buffer = this->current_stream_info_.bytes_to_frames(real_bytes_total);
+      // Push the matching write record. Capacity headroom in I2S_EVENT_QUEUE_COUNT guarantees this
+      // succeeds even with a transient backlog of unprocessed events; if it ever fails the lockstep
+      // invariant is broken and every subsequent timestamp would be silently wrong, so bail.
+      if (xQueueSend(this->write_records_queue_, &real_frames_in_buffer, 0) != pdTRUE) {
+        ESP_LOGV(TAG, "Exiting: write records queue full");
+        xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_LOCKSTEP_DESYNC);
+        break;
+      }
+      if (real_frames_in_buffer > 0) {
+        pending_real_buffers++;
      }
    }
  }
@@ -183,6 +278,11 @@ void I2SAudioSpeaker::run_speaker_task() {

  audio_source.reset();

+  if (silence_buffer != nullptr) {
+    silence_allocator.deallocate(silence_buffer, dma_buffer_bytes);
+    silence_buffer = nullptr;
+  }
+
  xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_STOPPED);

  while (true) {
@@ -301,7 +401,7 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver(audio::AudioStreamInfo &audio_stream
    return err;
  }

-  i2s_channel_enable(this->tx_handle_);
+  // The speaker task will enable the channel after preloading.

  return ESP_OK;
 }