[i2s_audio] Fix speaker DMA buffer sizing and validate bit depth at compile time (#16672)

2026-06-24 12:53:26 +00:00 · 2026-06-02 09:32:27 -04:00
parent 6197282f1a
commit 063770bcf4
3 changed files with 71 additions and 19 deletions
--- a/esphome/components/i2s_audio/init.py
+++ b/esphome/components/i2s_audio/init.py
@@ -170,7 +170,7 @@ def i2s_audio_component_schema(
                min=1
            ),
            cv.Optional(CONF_BITS_PER_SAMPLE, default=default_bits_per_sample): cv.All(
-                _validate_bits, cv.one_of(*I2S_BITS_PER_SAMPLE)
+                _validate_bits, cv.int_, cv.one_of(*I2S_BITS_PER_SAMPLE)
            ),
            cv.Optional(CONF_I2S_MODE, default=CONF_PRIMARY): cv.one_of(
                *I2S_MODE_OPTIONS, lower=True
--- a/esphome/components/i2s_audio/speaker/init.py
+++ b/esphome/components/i2s_audio/speaker/init.py
@@ -98,11 +98,19 @@ def _set_stream_limits(config):
            min_sample_rate=config.get(CONF_SAMPLE_RATE),
            max_sample_rate=config.get(CONF_SAMPLE_RATE),
        )(config)
-    elif config[CONF_I2S_MODE] == CONF_PRIMARY:
-        # Primary mode has modifiable stream settings
+        return config
+
+    # The original ESP32 cannot lay out sub-16-bit slots that match ESPHome's packed audio, so the smallest
+    # stream it accepts is 16-bit (see start_i2s_driver); the other variants handle 8-bit.
+    min_bits_per_sample = 16 if esp32.get_esp32_variant() == esp32.VARIANT_ESP32 else 8
+
+    if config[CONF_I2S_MODE] == CONF_PRIMARY:
+        # Primary mode can reconfigure the bus to the incoming sample rate and channel count, but the
+        # configured bits per sample is a hard ceiling: the speaker rejects any stream that exceeds the
+        # slot bit width it was set up with (see start_i2s_driver), so advertise that as the maximum.
        audio.set_stream_limits(
-            min_bits_per_sample=8,
-            max_bits_per_sample=32,
+            min_bits_per_sample=min_bits_per_sample,
+            max_bits_per_sample=config[CONF_BITS_PER_SAMPLE],
            min_channels=1,
            max_channels=2,
            min_sample_rate=16000,
@@ -111,13 +119,13 @@ def _set_stream_limits(config):
    else:
        # Secondary mode has unmodifiable max bits per sample and min/max sample rates
        audio.set_stream_limits(
-            min_bits_per_sample=8,
-            max_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
+            min_bits_per_sample=min_bits_per_sample,
+            max_bits_per_sample=config[CONF_BITS_PER_SAMPLE],
            min_channels=1,
            max_channels=2,
            min_sample_rate=config.get(CONF_SAMPLE_RATE),
            max_sample_rate=config.get(CONF_SAMPLE_RATE),
-        )
+        )(config)

    return config

@@ -134,12 +142,11 @@ def _validate_esp32_variant(config):
    if config[CONF_DAC_TYPE] == "internal":
        if variant not in INTERNAL_DAC_VARIANTS:
            raise cv.Invalid(f"{variant} does not have an internal DAC")
-    elif (
-        variant == esp32.VARIANT_ESP32
-        and config.get(CONF_BITS_PER_SAMPLE) == 8
-        and config.get(CONF_CHANNEL) in (CONF_MONO, CONF_LEFT, CONF_RIGHT)
-    ):
-        raise cv.Invalid("8-bit mono mode is not supported on ESP32")
+    elif variant == esp32.VARIANT_ESP32 and config[CONF_BITS_PER_SAMPLE] == 8:
+        # The original ESP32 I2S peripheral packs each sample into a whole number of 16-bit words, so an
+        # 8-bit slot does not line up with ESPHome's tightly packed audio (see start_i2s_driver). Reject it
+        # at config time rather than emitting corrupted output at runtime.
+        raise cv.Invalid("8-bit audio is not supported on the original ESP32")
    return config


--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp
@@ -3,6 +3,7 @@
 #ifdef USE_ESP32

 #include <driver/i2s_std.h>
+#include <hal/dma_types.h>

 #include "esphome/components/audio/audio.h"
 #include "esphome/components/audio/audio_transfer_buffer.h"
@@ -16,8 +17,16 @@ namespace esphome::i2s_audio {

 static const char *const TAG = "i2s_audio.speaker.std";

-static constexpr uint32_t DMA_BUFFER_DURATION_MS = 15;
-static constexpr size_t DMA_BUFFERS_COUNT = 4;
+static constexpr uint32_t DMA_BUFFER_DURATION_MS = 10;
+static constexpr size_t DMA_BUFFERS_COUNT = 5;
+// ESP-IDF clamps each DMA descriptor to this many bytes when allocating the channel (see i2s_get_buf_size in
+// the I2S driver). Mirror its target-dependent selection so the requested dma_frame_num stays in range; the
+// speaker task reads the size actually allocated back from the driver rather than relying on this value.
+#if SOC_CACHE_INTERNAL_MEM_VIA_L1CACHE
+static constexpr size_t I2S_DMA_BUFFER_MAX_SIZE = DMA_DESCRIPTOR_BUFFER_MAX_SIZE_64B_ALIGNED;
+#else
+static constexpr size_t I2S_DMA_BUFFER_MAX_SIZE = DMA_DESCRIPTOR_BUFFER_MAX_SIZE_4B_ALIGNED;
+#endif
 // Sized to comfortably absorb scheduling jitter: at most DMA_BUFFERS_COUNT events can be in flight,
 // doubled so that a transient backlog never overruns the queue (which would desync the lockstep
 // invariant between i2s_event_queue_ and write_records_queue_).
@@ -27,6 +36,17 @@ static constexpr size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT * 2;
 // without masking real failures.
 static constexpr TickType_t WRITE_TIMEOUT_TICKS = pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * (DMA_BUFFERS_COUNT + 1));

+// Frames to request per DMA buffer: DMA_BUFFER_DURATION_MS worth of audio for this stream, capped so one
+// buffer's byte size never exceeds I2S_DMA_BUFFER_MAX_SIZE (avoids ESP-IDF's noisy "dma frame num is out of
+// dma buffer size" warning at high sample rates or bit depths). This is only the value handed to the channel
+// config: ESP-IDF may still adjust it (e.g. cache-line rounding), so the speaker task reads the allocated size
+// back from the driver. NOTE(review): divides by frames_to_bytes(1); assumes it is non-zero - confirm.
+static uint32_t dma_buffer_frames(const audio::AudioStreamInfo &stream_info) {
+  const uint32_t frames_from_duration = stream_info.ms_to_frames(DMA_BUFFER_DURATION_MS);
+  const uint32_t max_frames = I2S_DMA_BUFFER_MAX_SIZE / stream_info.frames_to_bytes(1);
+  return std::min(frames_from_duration, max_frames);
+}
+
 void I2SAudioSpeaker::dump_config() {
  I2SAudioSpeakerBase::dump_config();
  const char *fmt_str;
@@ -57,8 +77,21 @@ void I2SAudioSpeaker::run_speaker_task() {
  // avoids unnecessary single-frame splices.
  const size_t ring_buffer_size =
      (this->current_stream_info_.ms_to_bytes(ring_buffer_duration) / bytes_per_frame) * bytes_per_frame;
-  const uint32_t frames_per_dma_buffer = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
-  const size_t dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(frames_per_dma_buffer);
+  // ESP-IDF may allocate smaller (or cache-line-rounded) DMA buffers than dma_buffer_frames() requested: it
+  // clamps each descriptor to the max DMA descriptor size and, on targets that route internal memory through
+  // the L1 cache (e.g. ESP32-P4), rounds the buffer to the cache line. Read the size the driver actually
+  // allocated so preload, silence padding, and the write/event lockstep all match it exactly. The channel is
+  // in the READY state here because start_i2s_driver() initialized it before this task was created.
+  size_t dma_buffer_bytes;
+  i2s_chan_info_t chan_info;
+  if (i2s_channel_get_info(this->tx_handle_, &chan_info) == ESP_OK && chan_info.total_dma_buf_size > 0) {
+    // total_dma_buf_size spans all DMA_BUFFERS_COUNT descriptors and is an exact multiple of the count.
+    dma_buffer_bytes = chan_info.total_dma_buf_size / DMA_BUFFERS_COUNT;
+  } else {
+    // Should not happen for a READY channel; fall back to the requested size.
+    dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(dma_buffer_frames(this->current_stream_info_));
+  }
+  const uint32_t frames_per_dma_buffer = this->current_stream_info_.bytes_to_frames(dma_buffer_bytes);

  bool successful_setup = false;

@@ -308,12 +341,24 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver(audio::AudioStreamInfo &audio_stream
    return ESP_ERR_NOT_SUPPORTED;
  }

+#ifdef USE_ESP32_VARIANT_ESP32
+  // The original ESP32 I2S peripheral stores each sample in a whole number of 16-bit words (a 24-bit sample
+  // occupies 4 bytes in the DMA buffer, an 8-bit sample 2 bytes), but ESPHome's audio pipeline packs samples
+  // tightly (3 bytes for 24-bit, 1 for 8-bit). The two layouts only line up when the bit depth is a multiple
+  // of 16, so reject anything else rather than emit corrupted audio.
+  if (audio_stream_info.get_bits_per_sample() % 16 != 0) {
+    ESP_LOGE(TAG, "ESP32 supports only 16- or 32-bit audio, got %u-bit",
+             (unsigned) audio_stream_info.get_bits_per_sample());
+    return ESP_ERR_NOT_SUPPORTED;
+  }
+#endif  // USE_ESP32_VARIANT_ESP32
+
  if (!this->parent_->try_lock()) {
    ESP_LOGE(TAG, "Parent bus is busy");
    return ESP_ERR_INVALID_STATE;
  }

-  uint32_t dma_buffer_length = audio_stream_info.ms_to_frames(DMA_BUFFER_DURATION_MS);
+  uint32_t dma_buffer_length = dma_buffer_frames(audio_stream_info);

  i2s_role_t i2s_role = this->i2s_role_;
  i2s_clock_src_t clk_src = I2S_CLK_SRC_DEFAULT;