[i2s_audio] Refactor SPDIF output, fixing synchronization problems (#16319)

2026-06-24 12:53:26 +00:00 · 2026-05-08 17:26:09 -04:00
parent 88c2a1c096
commit 70b9edfabe
5 changed files with 206 additions and 311 deletions
--- a/esphome/components/i2s_audio/speaker/i2s_audio_spdif.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_spdif.cpp
@@ -23,35 +23,14 @@ static const char *const TAG = "i2s_audio.spdif";
 // 15 buffers x 4ms = 60ms of DMA buffering (same as 4 x 15ms for standard)
 static constexpr size_t SPDIF_DMA_BUFFERS_COUNT = 15;

-// Timeout for flushing pending frames if no callback received.
-static constexpr uint32_t SPDIF_FLUSH_TIMEOUT_MS = 20;
-
 // Number of DMA events between upstream callbacks (~16ms = 4 events x 4ms each).
 // Matches non-SPDIF timing to prevent overwhelming upstream sync algorithms.
 static constexpr uint32_t SPDIF_DMA_EVENTS_PER_CALLBACK = 4;

-// Consider TX stalled only if no DMA callbacks have arrived for this long.
-// Zero-block non-blocking writes alone are not sufficient (they can happen when DMA is simply full).
-static constexpr uint32_t SPDIF_STALL_NO_DMA_MS = 80;
-
-// Fallback stall detector: force recovery if silence writes make no forward progress for too long,
-// even if occasional DMA callbacks are still observed.
-static constexpr uint32_t SPDIF_STALL_ZERO_PROGRESS_MS = 1000;
-
-// Minimum spacing between re-prime attempts to avoid churn.
-static constexpr uint32_t SPDIF_REPRIME_COOLDOWN_MS = 500;
-
-// Small waits used in SPDIF mode to keep DMA fed during rapid pipeline churn.
-static constexpr uint32_t SPDIF_EMPTY_READ_DELAY_MS = 1;
-static constexpr uint32_t SPDIF_SILENCE_LOOP_DELAY_MS = 1;
+// Brief retry wait used by play() to catch short free-space windows during rapid track transitions.
 static constexpr uint32_t SPDIF_PLAY_RETRY_WAIT_MS = 5;

-static constexpr size_t SPDIF_I2S_EVENT_QUEUE_COUNT = SPDIF_DMA_BUFFERS_COUNT + 1;
-
-// Static silence buffer for SPDIF continuous mode
-// 192 samples * 2 channels * 2 bytes per sample = 768 bytes
-// Stored in flash (.rodata section) to avoid stack/heap usage
-static const int16_t SPDIF_SILENCE_BUFFER[SPDIF_BLOCK_SAMPLES * 2] = {0};
+static constexpr size_t SPDIF_I2S_EVENT_QUEUE_COUNT = 2 * SPDIF_DMA_BUFFERS_COUNT;

 // Static callback functions for SPDIF encoder (avoids std::function overhead)
 static esp_err_t spdif_preload_cb(void *user_ctx, uint32_t *data, size_t size, TickType_t ticks_to_wait) {
@@ -59,7 +38,7 @@ static esp_err_t spdif_preload_cb(void *user_ctx, uint32_t *data, size_t size, T
  size_t bytes_written = 0;
  esp_err_t err = i2s_channel_preload_data(speaker->get_tx_handle(), data, size, &bytes_written);
  if (err != ESP_OK || bytes_written != size) {
-    ESP_LOGW(TAG, "Preload failed: %s (wrote %zu/%zu bytes)", esp_err_to_name(err), bytes_written, size);
+    ESP_LOGV(TAG, "Preload failed: %s (wrote %zu/%zu bytes)", esp_err_to_name(err), bytes_written, size);
    return (err != ESP_OK) ? err : ESP_ERR_NO_MEM;
  }
  return ESP_OK;
@@ -69,9 +48,8 @@ static esp_err_t spdif_write_cb(void *user_ctx, uint32_t *data, size_t size, Tic
  auto *speaker = static_cast<I2SAudioSpeakerSPDIF *>(user_ctx);
  size_t bytes_written = 0;
  esp_err_t err = i2s_channel_write(speaker->get_tx_handle(), data, size, &bytes_written, ticks_to_wait);
-  // ESP_ERR_TIMEOUT is expected under DMA backpressure in SPDIF mode.
-  if (err != ESP_OK && err != ESP_ERR_TIMEOUT) {
-    ESP_LOGW(TAG, "I2S write failed: %s (wrote %zu/%zu bytes)", esp_err_to_name(err), bytes_written, size);
+  if (err != ESP_OK) {
+    ESP_LOGV(TAG, "I2S write failed: %s (wrote %zu/%zu bytes)", esp_err_to_name(err), bytes_written, size);
  }
  return err;
 }
@@ -157,6 +135,9 @@ void I2SAudioSpeakerSPDIF::run_speaker_task() {
    this->spdif_encoder_->reset();
  }

+  // Reset lockstep records queue so it starts paired with the (also-reset) i2s_event_queue_.
+  xQueueReset(this->write_records_queue_);
+
  const uint32_t dma_buffers_duration_ms = DMA_BUFFER_DURATION_MS * SPDIF_DMA_BUFFERS_COUNT;
  // Ensure ring buffer duration is at least the duration of all DMA buffers
  const uint32_t ring_buffer_duration = std::max(dma_buffers_duration_ms, this->buffer_duration_ms_);
@@ -188,19 +169,16 @@ void I2SAudioSpeakerSPDIF::run_speaker_task() {
    // Preload DMA buffers with SPDIF-encoded silence before enabling the channel.
    // This ensures the first data transmitted is valid SPDIF (not raw zeros from
    // auto_clear) and prevents phantom DMA events before real audio is available.
-    // Track how many buffers were preloaded so the DMA event loop can skip
-    // frame accounting until the preloaded silence has fully drained.
-    uint32_t preload_buffers_remaining = 0;
+    // Each preloaded block pushes a 0-real-frame record so that the corresponding
+    // on_sent events drain in lockstep without crediting any audio frames.
    this->spdif_encoder_->set_preload_mode(true);
    for (size_t i = 0; i < SPDIF_DMA_BUFFERS_COUNT; i++) {
-      uint32_t preload_blocks = 0;
-      esp_err_t preload_err = this->spdif_encoder_->write(reinterpret_cast<const uint8_t *>(SPDIF_SILENCE_BUFFER),
-                                                          sizeof(SPDIF_SILENCE_BUFFER),
-                                                          pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS), &preload_blocks);
-      if (preload_err != ESP_OK || preload_blocks == 0) {
-        break;  // DMA buffers full or error
+      esp_err_t preload_err = this->spdif_encoder_->flush_with_silence(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
+      if (preload_err != ESP_OK) {
+        break;  // DMA preload buffer full or error
      }
-      preload_buffers_remaining += preload_blocks;
+      const uint32_t silence_record = 0;
+      xQueueSendToBack(this->write_records_queue_, &silence_record, 0);
    }
    this->spdif_encoder_->set_preload_mode(false);
    this->spdif_encoder_->reset();  // Clean encoder state for the main loop
@@ -211,299 +189,193 @@ void I2SAudioSpeakerSPDIF::run_speaker_task() {
    i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
    i2s_channel_enable(this->tx_handle_);

-    bool stop_gracefully = false;
-    bool tx_dma_underflow = true;
+    // Always-fill model: each iteration produces exactly one SPDIF block (= one DMA buffer).
+    // We drain real PCM up to one block from the ring buffer and silence-pad any remainder.
+    // Blocking writes pace the loop at the DMA consumption rate. This mirrors the standard
+    // I2S speaker pattern (PR #16317): fill what you can, then silence-pad whatever is still
+    // missing to complete the DMA buffer.
+    const uint32_t block_duration_us = this->current_stream_info_.frames_to_microseconds(SPDIF_BLOCK_SAMPLES);
+    // Sized to absorb the worst case where every DMA buffer is full when we issue the write.
+    const TickType_t write_timeout_ticks =
+        pdMS_TO_TICKS(((block_duration_us * (SPDIF_DMA_BUFFERS_COUNT + 1)) + 999) / 1000);
+    // Brief read budget when the ring buffer is empty (~half a block).
+    const TickType_t read_timeout_ticks = pdMS_TO_TICKS(((block_duration_us / 2) + 999) / 1000);

-    uint32_t frames_written = 0;
-
-    // SPDIF Continuous Silence Mode + Callback Decimation
-    //
-    // Key principles:
-    // 1. NEVER stop the I2S channel - always output a valid SPDIF stream
-    // 2. When no audio data, output silence-encoded SPDIF blocks (not zeros!)
-    // 3. Fire callbacks every 4 DMA events (~16ms), matching non-SPDIF timing
-    //
-    // This eliminates gaps that cause SPDIF receivers to re-sync, and reduces
-    // callback rate to prevent overwhelming upstream sync algorithms.
-    const uint32_t spdif_callback_threshold = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
+    // SPDIF Callback Decimation: fire every 4th DMA event (~16ms), matching non-SPDIF timing.
    uint32_t spdif_pending_frames = 0;
    int64_t spdif_pending_timestamp = 0;
-    uint32_t spdif_last_callback_time = millis();
-    // Count DMA events for decimation
    uint32_t spdif_dma_event_count = 0;
-    uint32_t spdif_last_dma_event_time = millis();
-    // Detect a stalled DMA path (many silence write attempts with zero accepted blocks).
-    uint32_t spdif_zero_block_streak = 0;
-    uint32_t spdif_last_block_progress_time = millis();
-    uint32_t spdif_last_reprime_time = 0;

    xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_RUNNING);

    // SPDIF continuous mode: loop runs indefinitely, outputting silence when no audio data
-    // to keep the receiver synced. Exits only via break (stream info change or silence timeout).
+    // to keep the receiver synced. Exits only via break (stream info change, silence timeout,
+    // lockstep desync, dropped event, or partial-write failure).
    while (true) {
      uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);

      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
        xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
-        // In SPDIF continuous mode, don't tear down or expose STOPPED here.
-        // Keep the task alive and transition to silence output.
+        // The ISR pairs COMMAND_STOP with ERR_DROPPED_EVENT when it has to discard a completion
+        // event; that desyncs the lockstep queues permanently and the only safe recovery is a full
+        // task restart.
+        if (event_group_bits & SpeakerEventGroupBits::ERR_DROPPED_EVENT) {
+          ESP_LOGV(TAG, "Exiting: ISR dropped event, restarting to recover lockstep");
+          break;
+        }
+        // User-initiated stop. In SPDIF continuous mode, transition to silence output rather
+        // than tearing the task down.
        this->spdif_silence_start_ = millis();
        ESP_LOGV(TAG, "COMMAND_STOP received, continuing in silence mode");
      }
      if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY) {
+        // SPDIF continuous mode never tears the channel down on graceful stop. Clear the flag and
+        // let the audio simply drain through the always-fill loop into the silence-timeout path.
        xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP_GRACEFULLY);
-        stop_gracefully = true;
      }

      if (this->audio_stream_info_ != this->current_stream_info_) {
-        // Audio stream info changed, stop the speaker task so it will restart with the proper settings.
        ESP_LOGV(TAG, "Exiting: stream info changed");
        break;
      }

+      // Drain ISR completion events, popping a matching record for each.
      int64_t write_timestamp;
+      bool lockstep_broken = false;
      while (xQueueReceive(this->i2s_event_queue_, &write_timestamp, 0)) {
-        spdif_last_dma_event_time = millis();
-
-        // Skip frame accounting for preloaded silence buffers still draining.
-        // These DMA events correspond to silence that was preloaded before the
-        // channel was enabled, not real audio written by the task.
-        if (preload_buffers_remaining > 0) {
-          preload_buffers_remaining--;
-          continue;
+        // Lockstep: pop the matching record (real audio frames packed into this DMA block).
+        // Records are pushed by the task right after each successful block commit, so the FIFO
+        // order matches DMA completion order. Empty records queue here means lockstep broke.
+        uint32_t real_frames = 0;
+        if (xQueueReceive(this->write_records_queue_, &real_frames, 0) != pdTRUE) {
+          ESP_LOGV(TAG, "Event without matching write record");
+          xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_LOCKSTEP_DESYNC);
+          lockstep_broken = true;
+          break;
        }

-        // Receives timing events from the I2S on_sent callback. If actual audio data was sent in this event, it passes
-        // on the timing info via the audio_output_callback.
-        uint32_t frames_sent = frames_to_fill_single_dma_buffer;
-        if (frames_to_fill_single_dma_buffer > frames_written) {
-          tx_dma_underflow = true;
-          frames_sent = frames_written;
-          const uint32_t frames_zeroed = frames_to_fill_single_dma_buffer - frames_written;
+        // Per-block timestamp adjustment: shift back by the silence-padding portion of the block
+        // so the reported timestamp reflects when the last real sample left the wire.
+        uint32_t frames_sent = real_frames;
+        if (real_frames < SPDIF_BLOCK_SAMPLES) {
+          const uint32_t frames_zeroed = SPDIF_BLOCK_SAMPLES - real_frames;
          write_timestamp -= this->current_stream_info_.frames_to_microseconds(frames_zeroed);
-        } else {
-          tx_dma_underflow = false;
        }
-        frames_written -= frames_sent;

-        // SPDIF Callback Decimation: fire every 4th DMA event (~16ms)
-        // This matches non-SPDIF timing and prevents overwhelming upstream.
-        if (spdif_callback_threshold > 0) {
-          spdif_dma_event_count++;
+        spdif_dma_event_count++;
+        // Accumulate frames; keep the latest timestamp so the callback reports when the last
+        // sample left the wire, not the first.
+        if (frames_sent > 0) {
+          spdif_pending_timestamp = write_timestamp;
+          spdif_pending_frames += frames_sent;
+        }

-          // Accumulate frames; always keep the latest timestamp so the
-          // callback reports when the last sample left the wire, not the first.
-          if (frames_sent > 0) {
-            spdif_pending_timestamp = write_timestamp;
-            spdif_pending_frames += frames_sent;
-          }
-
-          // Fire callback every 4 DMA events, or on timeout if we have pending frames
-          bool decimation_reached = (spdif_dma_event_count >= SPDIF_DMA_EVENTS_PER_CALLBACK);
-          bool timeout_flush =
-              (spdif_pending_frames > 0) && ((millis() - spdif_last_callback_time) >= SPDIF_FLUSH_TIMEOUT_MS);
-
-          if (decimation_reached || timeout_flush) {
-            if (spdif_pending_frames > 0) {
-              this->audio_output_callback_(spdif_pending_frames, spdif_pending_timestamp);
-              spdif_pending_frames = 0;
-              spdif_last_callback_time = millis();
-            }
-            spdif_dma_event_count = 0;  // Reset decimation counter
+        bool decimation_reached = (spdif_dma_event_count >= SPDIF_DMA_EVENTS_PER_CALLBACK);
+        // Partial blocks are silence-padded tails: the source ran dry mid-block (end of stream or
+        // underrun). Fire immediately so the back-shifted timestamp isn't overwritten by a later
+        // full audio block landing in the same decimation window.
+        bool partial_flush = (real_frames > 0 && real_frames < SPDIF_BLOCK_SAMPLES);
+
+        if (decimation_reached || partial_flush) {
+          if (spdif_pending_frames > 0) {
+            this->audio_output_callback_(spdif_pending_frames, spdif_pending_timestamp);
+            spdif_pending_frames = 0;
          }
+          spdif_dma_event_count = 0;
        }
      }
-
-      if (this->pause_state_) {
-        // Pause state is accessed atomically, so thread safe
-        // Delay so the task yields, then skip transferring audio data
-        vTaskDelay(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
-        continue;
+      if (lockstep_broken) {
+        ESP_LOGV(TAG, "Exiting: lockstep desync, restarting task");
+        break;
      }

-      // Wait half the duration of the data already written to the DMA buffers for new audio data
-      // The millisecond helper modifies the frames_written variable, so use the microsecond helper and divide by 1000
-      uint32_t read_delay = (this->current_stream_info_.frames_to_microseconds(frames_written) / 1000) / 2;
+      // Always-fill: produce exactly one SPDIF block this iteration. The blocking encoder write
+      // paces the task at the DMA consumption rate.
+      uint32_t real_frames_in_block = 0;
+      bool block_committed = false;
+      bool partial_write_failure = false;

-      // In SPDIF mode, if transfer buffer is empty (we're pumping silence), use a very short timeout.
-      // This ensures we can pump silence fast enough to keep the DMA fed (~250 blocks/sec needed).
-      // Otherwise the long timeout based on frames_written causes DMA to run dry.
-      if (transfer_buffer->available() == 0) {
-        read_delay = SPDIF_EMPTY_READ_DELAY_MS;
-      }
-
-      size_t bytes_read = transfer_buffer->transfer_data_from_source(pdMS_TO_TICKS(read_delay));
-      uint8_t *new_data = transfer_buffer->get_buffer_end() - bytes_read;
-
-      if (bytes_read > 0) {
-        this->apply_software_volume_(new_data, bytes_read);
-        this->swap_esp32_mono_samples_(new_data, bytes_read);
-      }
-
-      if (transfer_buffer->available() == 0) {
-        // SPDIF Continuous Silence Mode: always output valid SPDIF stream
-        // When no audio data, write silence-encoded blocks to keep receiver happy
-        if (this->spdif_encoder_ != nullptr) {
-          // "Graceful stop" means "drain buffered audio, then stop." In SPDIF
-          // continuous mode we never actually stop, so once audio is drained
-          // (we're here), reset the flag to re-enable silence writing and stall
-          // recovery. Without this, stop_gracefully stays true forever and
-          // blocks silence output, causing DMA to degrade on auto_clear zeros.
-          stop_gracefully = false;
-
-          // Track when we entered silence mode
-          if (this->spdif_silence_start_ == 0) {
-            this->spdif_silence_start_ = millis();
+      if (!this->pause_state_) {
+        while (real_frames_in_block < SPDIF_BLOCK_SAMPLES) {
+          if (transfer_buffer->available() == 0) {
+            size_t bytes_read = transfer_buffer->transfer_data_from_source(read_timeout_ticks);
+            if (bytes_read == 0) {
+              break;  // No upstream data within the read budget; silence-pad the remainder.
+            }
+            uint8_t *new_data = transfer_buffer->get_buffer_end() - bytes_read;
+            this->apply_software_volume_(new_data, bytes_read);
+            this->swap_esp32_mono_samples_(new_data, bytes_read);
          }

-          // If silence persists past the configured timeout, stop the task
-          // so components expecting timeout semantics can recover.
-          if (this->timeout_.has_value()) {
-            const uint32_t silence_duration = millis() - this->spdif_silence_start_;
-            if (silence_duration >= this->timeout_.value()) {
-              ESP_LOGV(TAG, "Silence timeout reached (%" PRIu32 "ms) - stopping speaker", silence_duration);
-              break;
-            }
-          }
+          const uint32_t frames_still_needed = SPDIF_BLOCK_SAMPLES - real_frames_in_block;
+          const size_t bytes_still_needed = this->current_stream_info_.frames_to_bytes(frames_still_needed);
+          const size_t bytes_to_feed = std::min(transfer_buffer->available(), bytes_still_needed);

-          // First flush any partial block with silence padding (non-blocking to avoid getting stuck).
-          // IMPORTANT: Credit any partial block frames to frames_written so the audio_output_callback_
-          // fires for them. Without this, pending_playback_frames_ in the mixer's SourceSpeaker never
-          // reaches 0 when a stream ends on a non-192-frame boundary, permanently blocking teardown.
-          if (this->spdif_encoder_->has_pending_data()) {
-            uint32_t partial_frames = this->spdif_encoder_->get_pending_frames();
-            // Use a tiny timeout to allow DMA queue progress without stalling the task.
-            esp_err_t flush_err = this->spdif_encoder_->flush_with_silence(pdMS_TO_TICKS(1));
-            if (flush_err == ESP_OK && partial_frames > 0) {
-              frames_written += partial_frames;
-            }
-          }
-
-          // CRITICAL: In SPDIF continuous mode, ALWAYS write silence when no audio data.
-          // We don't check tx_dma_underflow because:
-          // 1. When DMA runs empty, callbacks stop, so tx_dma_underflow doesn't update
-          // 2. The non-blocking write handles "DMA full" gracefully (just doesn't write)
-          // 3. We need continuous output to prevent receiver from losing sync
-          if (!stop_gracefully) {
-            uint32_t silence_blocks = 0;
-            esp_err_t write_err = this->spdif_encoder_->write(
-                reinterpret_cast<const uint8_t *>(SPDIF_SILENCE_BUFFER), sizeof(SPDIF_SILENCE_BUFFER), pdMS_TO_TICKS(1),
-                &silence_blocks);  // Non-blocking
-                                   // Don't count silence as frames_written - it's not real audio
-
-            // Recovery path for a stalled SPDIF TX channel:
-            // if silence writes repeatedly produce zero blocks AND DMA callbacks have stopped,
-            // re-prime DMA using preload mode.
-            const uint32_t ms_since_dma = millis() - spdif_last_dma_event_time;
-            const bool dma_events_stalled = ms_since_dma >= SPDIF_STALL_NO_DMA_MS;
-            if (silence_blocks > 0) {
-              spdif_last_block_progress_time = millis();
-            }
-            const bool long_zero_progress = (millis() - spdif_last_block_progress_time) >= SPDIF_STALL_ZERO_PROGRESS_MS;
-            if (dma_events_stalled && silence_blocks == 0 && (write_err == ESP_OK || write_err == ESP_ERR_TIMEOUT)) {
-              spdif_zero_block_streak++;
-            } else {
-              spdif_zero_block_streak = 0;
-            }
-
-            const uint32_t now_ms = millis();
-            const bool reprime_cooldown_elapsed =
-                (spdif_last_reprime_time == 0) || ((now_ms - spdif_last_reprime_time) >= SPDIF_REPRIME_COOLDOWN_MS);
-
-            if ((spdif_zero_block_streak >= 100 || long_zero_progress) && reprime_cooldown_elapsed) {
-              ESP_LOGV(TAG, "TX appears stalled, attempting DMA re-prime");
-
-              i2s_channel_disable(this->tx_handle_);
-
-              const i2s_event_callbacks_t null_callbacks = {.on_sent = nullptr};
-              i2s_channel_register_event_callback(this->tx_handle_, &null_callbacks, this);
-
-              this->spdif_encoder_->set_preload_mode(true);
-              uint32_t preload_blocks = 0;
-              esp_err_t preload_err = this->spdif_encoder_->write(
-                  reinterpret_cast<const uint8_t *>(SPDIF_SILENCE_BUFFER), sizeof(SPDIF_SILENCE_BUFFER),
-                  pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS), &preload_blocks);
-              this->spdif_encoder_->set_preload_mode(false);
-
-              xQueueReset(this->i2s_event_queue_);
-              const i2s_event_callbacks_t callbacks = {.on_sent = i2s_on_sent_cb};
-              i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
-              i2s_channel_enable(this->tx_handle_);
-
-              if (preload_err == ESP_OK && preload_blocks > 0) {
-                tx_dma_underflow = false;
-                preload_buffers_remaining = preload_blocks;
-                frames_written = 0;  // Stale after channel disable/enable cycle
-                ESP_LOGV(TAG, "DMA re-prime successful (%" PRIu32 " preload blocks)", preload_blocks);
-                spdif_last_block_progress_time = now_ms;
-              } else {
-                ESP_LOGW(TAG, "DMA re-prime failed (%s, blocks=%" PRIu32 ")", esp_err_to_name(preload_err),
-                         preload_blocks);
-              }
-              spdif_last_reprime_time = now_ms;
-              spdif_zero_block_streak = 0;
-            }
-          }
-        }
-
-        if (stop_gracefully && tx_dma_underflow) {
-          // In SPDIF continuous mode, don't break on graceful stop during silence
-          // Keep outputting silence until new audio arrives or explicit COMMAND_STOP
-          // (handled above which transitions to silence mode rather than breaking)
-        }
-
-        // In SPDIF mode, use a shorter delay to pump silence faster
-        // We need ~250 blocks/sec to keep DMA fed, so max 4ms per iteration
-        vTaskDelay(pdMS_TO_TICKS(SPDIF_SILENCE_LOOP_DELAY_MS));
-      } else {
-        // Have audio data to write
-        size_t bytes_written = 0;
-
-        // Clear silence timer since we have audio data now
-        if (this->spdif_silence_start_ != 0) {
-          uint32_t silence_duration = millis() - this->spdif_silence_start_;
-          if (silence_duration > 100) {
-            ESP_LOGV(TAG, "Exiting silence mode after %" PRIu32 "ms, have audio data", silence_duration);
-          }
-          this->spdif_silence_start_ = 0;
-        }
-
-        {
          uint32_t blocks_sent = 0;
-          size_t pcm_bytes_consumed = 0;
-
-          // Write audio data to encoder (which writes to DMA)
-          esp_err_t err =
-              this->spdif_encoder_->write(transfer_buffer->get_buffer_start(), transfer_buffer->available(),
-                                          pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS), &blocks_sent, &pcm_bytes_consumed);
-          if (err != ESP_OK && err != ESP_ERR_TIMEOUT) {
-            ESP_LOGW(TAG, "Write failed: %s", esp_err_to_name(err));
+          size_t pcm_consumed = 0;
+          esp_err_t err = this->spdif_encoder_->write(transfer_buffer->get_buffer_start(), bytes_to_feed,
+                                                      write_timeout_ticks, &blocks_sent, &pcm_consumed);
+          if (err != ESP_OK) {
+            // A failed (or timed-out) send leaves an unsent block in the encoder's stitch buffer;
+            // resuming would credit the next iteration's bytes against an old block. Bail and
+            // let loop() restart the task with a clean encoder.
+            xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_PARTIAL_WRITE);
+            partial_write_failure = true;
+            break;
          }

-          // Only consume source bytes that were actually accepted by the encoder.
-          bytes_written = pcm_bytes_consumed;
-
-          // Update frame accounting based on complete blocks sent (192 frames per block)
-          if (bytes_written > 0) {
-            frames_written += blocks_sent * SPDIF_BLOCK_SAMPLES;
-            transfer_buffer->decrease_buffer_length(bytes_written);
-            // Audio blocks count as DMA progress for the stall detector.
-            // Without this, a long uninterrupted audio stream makes the
-            // progress timer stale, triggering a spurious re-prime the
-            // instant we transition to silence.
-            spdif_last_block_progress_time = millis();
+          if (pcm_consumed > 0) {
+            transfer_buffer->decrease_buffer_length(pcm_consumed);
+            real_frames_in_block += this->current_stream_info_.bytes_to_frames(pcm_consumed);
+          }
+          if (blocks_sent > 0) {
+            block_committed = true;
+            break;
          }
        }
      }
-    }
-    // If we reach here, the while loop exited - either via break or condition became false
-    // In SPDIF mode, loop exit is expected when:
-    // 1. Timeout reached (user configured timeout)
-    // 2. Stream info changed
-    // Only warn if timeout is "never" since that should never exit
-    if (!this->timeout_.has_value()) {
-      ESP_LOGW(TAG, "Unexpected loop exit; set 'timeout: never' to prevent this");
+
+      if (partial_write_failure) {
+        break;
+      }
+
+      if (!block_committed) {
+        // Pad whatever real audio we managed to feed (if any) with silence to complete one block,
+        // or emit a full silence block if the encoder is empty.
+        esp_err_t err = this->spdif_encoder_->flush_with_silence(write_timeout_ticks);
+        if (err != ESP_OK) {
+          xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_PARTIAL_WRITE);
+          break;
+        }
+      }
+
+      // One block committed to DMA; push exactly one record carrying its real-audio frame count.
+      // Failure here means the records queue is full, which violates the lockstep invariant.
+      if (xQueueSendToBack(this->write_records_queue_, &real_frames_in_block, 0) != pdTRUE) {
+        xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_LOCKSTEP_DESYNC);
+        break;
+      }
+
+      // Silence-timeout tracking: time consecutive all-silence blocks and exit once timeout_ elapses.
+      if (real_frames_in_block == 0) {
+        if (this->spdif_silence_start_ == 0) {
+          this->spdif_silence_start_ = millis();
+        }
+
+        if (this->timeout_.has_value()) {
+          const uint32_t silence_duration = millis() - this->spdif_silence_start_;
+          if (silence_duration >= this->timeout_.value()) {
+            ESP_LOGV(TAG, "Silence timeout reached (%" PRIu32 "ms) - stopping speaker", silence_duration);
+            break;
+          }
+        }
+      } else if (this->spdif_silence_start_ != 0) {
+        uint32_t silence_duration = millis() - this->spdif_silence_start_;
+        if (silence_duration > 100) {
+          ESP_LOGV(TAG, "Exiting silence mode after %" PRIu32 "ms, have audio data", silence_duration);
+        }
+        this->spdif_silence_start_ = 0;
+      }
    }
  }

--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
@@ -69,6 +69,17 @@ void I2SAudioSpeakerBase::loop() {
  }
  if (event_group_bits & SpeakerEventGroupBits::TASK_STOPPING) {
    ESP_LOGV(TAG, "Stopping");
+    // Lockstep-breaking error bits are latched by the task (or by the ISR for ERR_DROPPED_EVENT) and cleared
+    // with all other bits when TASK_STOPPED is processed; log them here, exactly once, as the task winds down.
+    if (event_group_bits & SpeakerEventGroupBits::ERR_DROPPED_EVENT) {
+      ESP_LOGE(TAG, "ISR event queue overflow, restarting speaker task to recover timestamp sync");
+    }
+    if (event_group_bits & SpeakerEventGroupBits::ERR_PARTIAL_WRITE) {
+      ESP_LOGE(TAG, "Partial DMA write broke buffer alignment, restarting speaker task");
+    }
+    if (event_group_bits & SpeakerEventGroupBits::ERR_LOCKSTEP_DESYNC) {
+      ESP_LOGE(TAG, "Event/record queues desynced, restarting speaker task");
+    }
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::TASK_STOPPING);
    this->state_ = speaker::STATE_STOPPING;
  }
@@ -87,18 +98,11 @@ void I2SAudioSpeakerBase::loop() {
    this->state_ = speaker::STATE_STOPPED;
  }

-  // Log any errors encountered by the task
  if (event_group_bits & SpeakerEventGroupBits::ERR_ESP_NO_MEM) {
    ESP_LOGE(TAG, "Not enough memory");
    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
  }

-  // Warn if any playback timestamp events are dropped, which drastically reduces synced playback accuracy
-  if (event_group_bits & SpeakerEventGroupBits::WARN_DROPPED_EVENT) {
-    ESP_LOGW(TAG, "Event dropped, synchronized playback accuracy is reduced");
-    xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::WARN_DROPPED_EVENT);
-  }
-
  // Handle the speaker's state
  switch (this->state_) {
    case speaker::STATE_STARTING:
@@ -271,6 +275,22 @@ esp_err_t I2SAudioSpeakerBase::init_i2s_channel_(const i2s_chan_config_t &chan_c
    xQueueReset(this->i2s_event_queue_);
  }

+  // Lockstep records queue. One record per in-flight DMA buffer; sized to match the I2S event queue
+  // so a fully-saturated DMA pipeline cannot overflow either side before drain.
+  if (this->write_records_queue_ == nullptr) {
+    this->write_records_queue_ = xQueueCreate(event_queue_size, sizeof(uint32_t));
+  } else {
+    xQueueReset(this->write_records_queue_);
+  }
+
+  if (this->i2s_event_queue_ == nullptr || this->write_records_queue_ == nullptr) {
+    ESP_LOGE(TAG, "Failed to allocate I2S event queue(s)");
+    i2s_del_channel(this->tx_handle_);
+    this->tx_handle_ = nullptr;
+    this->parent_->unlock();
+    return ESP_ERR_NO_MEM;
+  }
+
  return ESP_OK;
 }

@@ -293,10 +313,16 @@ bool IRAM_ATTR I2SAudioSpeakerBase::i2s_on_sent_cb(i2s_chan_handle_t handle, i2s
  I2SAudioSpeakerBase *this_speaker = (I2SAudioSpeakerBase *) user_ctx;

  if (xQueueIsQueueFullFromISR(this_speaker->i2s_event_queue_)) {
-    // Queue is full, so discard the oldest event and set the warning flag to inform the user
+    // Queue is full, so discard the oldest event. Once we drop a completion event, ``i2s_event_queue_``
+    // and any per-buffer record queue maintained by the task are permanently desynced, so the task
+    // must restart to recover. Set both ERR_DROPPED_EVENT (so loop() can log it) and COMMAND_STOP
+    // (so the task bails immediately, closing the race where loop() could clear the error bit
+    // before the task observes it).
    int64_t dummy;
    xQueueReceiveFromISR(this_speaker->i2s_event_queue_, &dummy, &need_yield1);
-    xEventGroupSetBitsFromISR(this_speaker->event_group_, SpeakerEventGroupBits::WARN_DROPPED_EVENT, &need_yield2);
+    xEventGroupSetBitsFromISR(this_speaker->event_group_,
+                              SpeakerEventGroupBits::ERR_DROPPED_EVENT | SpeakerEventGroupBits::COMMAND_STOP,
+                              &need_yield2);
  }

  xQueueSendToBackFromISR(this_speaker->i2s_event_queue_, &now, &need_yield3);
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h
@@ -35,7 +35,11 @@ enum SpeakerEventGroupBits : uint32_t {

  ERR_ESP_NO_MEM = (1 << 19),

-  WARN_DROPPED_EVENT = (1 << 20),
+  ERR_DROPPED_EVENT = (1 << 20),    // ISR overflowed the event queue, dropping a completion event
+  ERR_PARTIAL_WRITE = (1 << 21),    // a DMA write returned fewer bytes than requested (or the encoder
+                                    // failed to commit a complete block), which breaks the lockstep
+                                    // invariant for every subsequent event
+  ERR_LOCKSTEP_DESYNC = (1 << 22),  // i2s_event_queue_ and write_records_queue_ fell out of sync

  ALL_BITS = 0x00FFFFFF,  // All valid FreeRTOS event group bits
 };
@@ -141,7 +145,9 @@ class I2SAudioSpeakerBase : public I2SAudioOut, public speaker::Speaker, public
  TaskHandle_t speaker_task_handle_{nullptr};
  EventGroupHandle_t event_group_{nullptr};

+  // Lockstepped DMA buffer queues: i2s_event (outgoing, filled by the ISR), write_records (incoming, one per buffer)
  QueueHandle_t i2s_event_queue_{nullptr};
+  QueueHandle_t write_records_queue_{nullptr};

  std::weak_ptr<ring_buffer::RingBuffer> audio_ring_buffer_;

--- a/esphome/components/i2s_audio/speaker/spdif_encoder.cpp
+++ b/esphome/components/i2s_audio/speaker/spdif_encoder.cpp
@@ -358,25 +358,15 @@ HOT esp_err_t SPDIFEncoder::write(const uint8_t *src, size_t size, TickType_t ti
 }

 esp_err_t SPDIFEncoder::flush_with_silence(TickType_t ticks_to_wait) {
-  // First, send any pending complete block from a previous failed send
-  if (this->spdif_block_ptr_ >= &this->spdif_block_buf_[SPDIF_BLOCK_SIZE_U32]) {
-    esp_err_t err = this->send_block_(ticks_to_wait);
-    if (err != ESP_OK) {
-      return err;
+  // If a complete block is already pending (from a previous failed send), emit just that block.
+  // Otherwise pad the partial block with silence (or generate a full silence block if empty)
+  // and send. Always emits exactly one block on success.
+  if (this->spdif_block_ptr_ < &this->spdif_block_buf_[SPDIF_BLOCK_SIZE_U32]) {
+    static const uint8_t SILENCE[2] = {0, 0};
+    while (this->spdif_block_ptr_ < &this->spdif_block_buf_[SPDIF_BLOCK_SIZE_U32]) {
+      this->encode_sample_(SILENCE);
    }
  }
-
-  if (!this->has_pending_data()) {
-    return ESP_OK;  // Nothing to flush
-  }
-
-  // Encode silence (zeros) until the block is complete
-  static const uint8_t SILENCE[2] = {0, 0};
-
-  while (this->spdif_block_ptr_ < &this->spdif_block_buf_[SPDIF_BLOCK_SIZE_U32]) {
-    this->encode_sample_(SILENCE);
-  }
-
  return this->send_block_(ticks_to_wait);
 }

--- a/esphome/components/i2s_audio/speaker/spdif_encoder.h
+++ b/esphome/components/i2s_audio/speaker/spdif_encoder.h
@@ -85,9 +85,10 @@ class SPDIFEncoder {
  /// @brief Check if there is a partial block pending
  bool has_pending_data() const { return this->spdif_block_ptr_ != this->spdif_block_buf_.get(); }

-  /// @brief Flush any pending partial block by padding with silence and sending
+  /// @brief Emit one complete SPDIF block: pad any pending partial block with silence and send,
+  /// or send a full silence block if nothing is pending. Always produces exactly one block on success.
  /// @param ticks_to_wait Timeout for blocking writes
-  /// @return esp_err_t as returned from the callback, or ESP_OK if nothing to flush
+  /// @return esp_err_t as returned from the callback
  esp_err_t flush_with_silence(TickType_t ticks_to_wait);

  /// @brief Reset the SPDIF block buffer and position tracking, discarding any partial block