diff --git a/esphome/components/i2s_audio/__init__.py b/esphome/components/i2s_audio/__init__.py index 951b8c0498..8e432695a1 100644 --- a/esphome/components/i2s_audio/__init__.py +++ b/esphome/components/i2s_audio/__init__.py @@ -170,7 +170,7 @@ def i2s_audio_component_schema( min=1 ), cv.Optional(CONF_BITS_PER_SAMPLE, default=default_bits_per_sample): cv.All( - _validate_bits, cv.one_of(*I2S_BITS_PER_SAMPLE) + _validate_bits, cv.int_, cv.one_of(*I2S_BITS_PER_SAMPLE) ), cv.Optional(CONF_I2S_MODE, default=CONF_PRIMARY): cv.one_of( *I2S_MODE_OPTIONS, lower=True diff --git a/esphome/components/i2s_audio/speaker/__init__.py b/esphome/components/i2s_audio/speaker/__init__.py index 8215d8b518..5ba2f4b1a5 100644 --- a/esphome/components/i2s_audio/speaker/__init__.py +++ b/esphome/components/i2s_audio/speaker/__init__.py @@ -98,11 +98,19 @@ def _set_stream_limits(config): min_sample_rate=config.get(CONF_SAMPLE_RATE), max_sample_rate=config.get(CONF_SAMPLE_RATE), )(config) - elif config[CONF_I2S_MODE] == CONF_PRIMARY: - # Primary mode has modifiable stream settings + return config + + # The original ESP32 cannot lay out sub-16-bit slots that match ESPHome's packed audio, so the smallest + # stream it accepts is 16-bit (see start_i2s_driver); the other variants handle 8-bit. + min_bits_per_sample = 16 if esp32.get_esp32_variant() == esp32.VARIANT_ESP32 else 8 + + if config[CONF_I2S_MODE] == CONF_PRIMARY: + # Primary mode can reconfigure the bus to the incoming sample rate and channel count, but the + # configured bits per sample is a hard ceiling: the speaker rejects any stream that exceeds the + # slot bit width it was set up with (see start_i2s_driver), so advertise that as the maximum. 
audio.set_stream_limits( - min_bits_per_sample=8, - max_bits_per_sample=32, + min_bits_per_sample=min_bits_per_sample, + max_bits_per_sample=config[CONF_BITS_PER_SAMPLE], min_channels=1, max_channels=2, min_sample_rate=16000, @@ -111,13 +119,13 @@ def _set_stream_limits(config): else: # Secondary mode has unmodifiable max bits per sample and min/max sample rates audio.set_stream_limits( - min_bits_per_sample=8, - max_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE), + min_bits_per_sample=min_bits_per_sample, + max_bits_per_sample=config[CONF_BITS_PER_SAMPLE], min_channels=1, max_channels=2, min_sample_rate=config.get(CONF_SAMPLE_RATE), max_sample_rate=config.get(CONF_SAMPLE_RATE), - ) + )(config) return config @@ -134,12 +142,11 @@ def _validate_esp32_variant(config): if config[CONF_DAC_TYPE] == "internal": if variant not in INTERNAL_DAC_VARIANTS: raise cv.Invalid(f"{variant} does not have an internal DAC") - elif ( - variant == esp32.VARIANT_ESP32 - and config.get(CONF_BITS_PER_SAMPLE) == 8 - and config.get(CONF_CHANNEL) in (CONF_MONO, CONF_LEFT, CONF_RIGHT) - ): - raise cv.Invalid("8-bit mono mode is not supported on ESP32") + elif variant == esp32.VARIANT_ESP32 and config[CONF_BITS_PER_SAMPLE] == 8: + # The original ESP32 I2S peripheral packs each sample into a whole number of 16-bit words, so an + # 8-bit slot does not line up with ESPHome's tightly packed audio (see start_i2s_driver). Reject it + # at config time rather than emitting corrupted output at runtime. 
+ raise cv.Invalid("8-bit audio is not supported on the original ESP32") return config diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp b/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp index ffe901504d..0afb67fb36 100644 --- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp +++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp @@ -3,6 +3,7 @@ #ifdef USE_ESP32 #include <driver/i2s_std.h> +#include <hal/dma_types.h> #include "esphome/components/audio/audio.h" #include "esphome/components/audio/audio_transfer_buffer.h" @@ -16,8 +17,16 @@ namespace esphome::i2s_audio { static const char *const TAG = "i2s_audio.speaker.std"; -static constexpr uint32_t DMA_BUFFER_DURATION_MS = 15; -static constexpr size_t DMA_BUFFERS_COUNT = 4; +static constexpr uint32_t DMA_BUFFER_DURATION_MS = 10; +static constexpr size_t DMA_BUFFERS_COUNT = 5; +// ESP-IDF clamps each DMA descriptor to this many bytes when allocating the channel (see i2s_get_buf_size in +// the I2S driver). Mirror its target-dependent selection so the requested dma_frame_num stays in range; the +// speaker task reads the size actually allocated back from the driver rather than relying on this value. +#if SOC_CACHE_INTERNAL_MEM_VIA_L1CACHE +static constexpr size_t I2S_DMA_BUFFER_MAX_SIZE = DMA_DESCRIPTOR_BUFFER_MAX_SIZE_64B_ALIGNED; +#else +static constexpr size_t I2S_DMA_BUFFER_MAX_SIZE = DMA_DESCRIPTOR_BUFFER_MAX_SIZE_4B_ALIGNED; +#endif // Sized to comfortably absorb scheduling jitter: at most DMA_BUFFERS_COUNT events can be in flight, // doubled so that a transient backlog never overruns the queue (which would desync the lockstep // invariant between i2s_event_queue_ and write_records_queue_). @@ -27,6 +36,17 @@ static constexpr size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT * 2; // without masking real failures. 
static constexpr TickType_t WRITE_TIMEOUT_TICKS = pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * (DMA_BUFFERS_COUNT + 1)); +// Frames to request per DMA buffer: the frame count covering DMA_BUFFER_DURATION_MS, capped at the number of whole +// frames that fit in I2S_DMA_BUFFER_MAX_SIZE bytes. NOTE(review): presumably frames_to_bytes(1) is never 0 - confirm. +// This is only the value handed to the channel config: ESP-IDF may still adjust it (e.g. cache-line rounding on some +// targets), so the speaker task reads the size actually allocated back from the driver. Capping here also avoids +// ESP-IDF's noisy "dma frame num is out of dma buffer size" warning at high sample rates or bit depths. +static uint32_t dma_buffer_frames(const audio::AudioStreamInfo &stream_info) { + const uint32_t frames_from_duration = stream_info.ms_to_frames(DMA_BUFFER_DURATION_MS); + const uint32_t max_frames = I2S_DMA_BUFFER_MAX_SIZE / stream_info.frames_to_bytes(1); + return std::min(frames_from_duration, max_frames); +} + void I2SAudioSpeaker::dump_config() { I2SAudioSpeakerBase::dump_config(); const char *fmt_str; @@ -57,8 +77,21 @@ void I2SAudioSpeaker::run_speaker_task() { // avoids unnecessary single-frame splices. const size_t ring_buffer_size = (this->current_stream_info_.ms_to_bytes(ring_buffer_duration) / bytes_per_frame) * bytes_per_frame; - const uint32_t frames_per_dma_buffer = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS); - const size_t dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(frames_per_dma_buffer); + // ESP-IDF may allocate smaller (or cache-line-rounded) DMA buffers than dma_buffer_frames() requested: it + // clamps each descriptor to the max DMA descriptor size and, on targets that route internal memory through + // the L1 cache (e.g. ESP32-P4), rounds the buffer to the cache line. Read the size the driver actually + // allocated so preload, silence padding, and the write/event lockstep all match it exactly. 
The channel is + // in the READY state here because start_i2s_driver() initialized it before this task was created. + size_t dma_buffer_bytes; + i2s_chan_info_t chan_info; + if (i2s_channel_get_info(this->tx_handle_, &chan_info) == ESP_OK && chan_info.total_dma_buf_size > 0) { + // total_dma_buf_size spans all DMA_BUFFERS_COUNT descriptors and is an exact multiple of the count. + dma_buffer_bytes = chan_info.total_dma_buf_size / DMA_BUFFERS_COUNT; + } else { + // Should not happen for a READY channel; fall back to the requested size. + dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(dma_buffer_frames(this->current_stream_info_)); + } + const uint32_t frames_per_dma_buffer = this->current_stream_info_.bytes_to_frames(dma_buffer_bytes); bool successful_setup = false; @@ -308,12 +341,24 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver(audio::AudioStreamInfo &audio_stream return ESP_ERR_NOT_SUPPORTED; } +#ifdef USE_ESP32_VARIANT_ESP32 + // The original ESP32 I2S peripheral stores each sample in a whole number of 16-bit words (a 24-bit sample + // occupies 4 bytes in the DMA buffer, an 8-bit sample 2 bytes), but ESPHome's audio pipeline packs samples + // tightly (3 bytes for 24-bit, 1 for 8-bit). The two layouts only line up when the bit depth is a multiple + // of 16, so reject anything else rather than emit corrupted audio. + if (audio_stream_info.get_bits_per_sample() % 16 != 0) { + ESP_LOGE(TAG, "ESP32 supports only 16- or 32-bit audio, got %u-bit", + (unsigned) audio_stream_info.get_bits_per_sample()); + return ESP_ERR_NOT_SUPPORTED; + } +#endif // USE_ESP32_VARIANT_ESP32 + if (!this->parent_->try_lock()) { ESP_LOGE(TAG, "Parent bus is busy"); return ESP_ERR_INVALID_STATE; } - uint32_t dma_buffer_length = audio_stream_info.ms_to_frames(DMA_BUFFER_DURATION_MS); + uint32_t dma_buffer_length = dma_buffer_frames(audio_stream_info); i2s_role_t i2s_role = this->i2s_role_; i2s_clock_src_t clk_src = I2S_CLK_SRC_DEFAULT;