[i2s_audio] Narrow wider streams to the speaker's configured bit depth (#16821)

Co-authored-by: Jonathan Swoboda <154711427+swoboda1337@users.noreply.github.com>
This commit is contained in:
Kevin Ahrendt
2026-06-21 17:52:05 -04:00
committed by GitHub
parent 73dbc8214b
commit 77a91853be
5 changed files with 99 additions and 45 deletions

View File

@@ -104,23 +104,26 @@ def _set_stream_limits(config):
# stream it accepts is 16-bit (see start_i2s_driver); the other variants handle 8-bit.
min_bits_per_sample = 16 if esp32.get_esp32_variant() == esp32.VARIANT_ESP32 else 8
# The configured bits per sample sets the I2S slot width, but the speaker narrows wider streams down to it
# in place before clocking them out (see start_i2s_driver). Advertise up to 32-bit so those wider streams
# are accepted rather than forcing an upstream conversion.
max_bits_per_sample = 32
if config[CONF_I2S_MODE] == CONF_PRIMARY:
# Primary mode can reconfigure the bus to the incoming sample rate and channel count, but the
# configured bits per sample is a hard ceiling: the speaker rejects any stream that exceeds the
# slot bit width it was set up with (see start_i2s_driver), so advertise that as the maximum.
# Primary mode can reconfigure the bus to the incoming sample rate and channel count.
audio.set_stream_limits(
min_bits_per_sample=min_bits_per_sample,
max_bits_per_sample=config[CONF_BITS_PER_SAMPLE],
max_bits_per_sample=max_bits_per_sample,
min_channels=1,
max_channels=2,
min_sample_rate=16000,
max_sample_rate=48000,
)(config)
else:
# Secondary mode has unmodifiable max bits per sample and min/max sample rates
# Secondary mode has unmodifiable min/max sample rates
audio.set_stream_limits(
min_bits_per_sample=min_bits_per_sample,
max_bits_per_sample=config[CONF_BITS_PER_SAMPLE],
max_bits_per_sample=max_bits_per_sample,
min_channels=1,
max_channels=2,
min_sample_rate=config.get(CONF_SAMPLE_RATE),

View File

@@ -404,6 +404,8 @@ void I2SAudioSpeakerSPDIF::run_speaker_task() {
esp_err_t I2SAudioSpeakerSPDIF::start_i2s_driver(audio::AudioStreamInfo &audio_stream_info) {
this->current_stream_info_ = audio_stream_info;
// SPDIF never narrows the bit depth; the encoder consumes the input format directly.
this->output_stream_info_ = audio_stream_info;
// SPDIF mode validation
if (this->sample_rate_ != audio_stream_info.get_sample_rate()) {

View File

@@ -354,7 +354,7 @@ void I2SAudioSpeakerBase::apply_software_volume_(uint8_t *data, size_t bytes_rea
void I2SAudioSpeakerBase::swap_esp32_mono_samples_(uint8_t *data, size_t bytes_read) {
#ifdef USE_ESP32_VARIANT_ESP32
// For ESP32 16-bit mono mode, adjacent samples need to be swapped.
if (this->current_stream_info_.get_channels() == 1 && this->current_stream_info_.get_bits_per_sample() == 16) {
if (this->output_stream_info_.get_channels() == 1 && this->output_stream_info_.get_bits_per_sample() == 16) {
int16_t *samples = reinterpret_cast<int16_t *>(data);
size_t sample_count = bytes_read / sizeof(int16_t);
for (size_t i = 0; i + 1 < sample_count; i += 2) {

View File

@@ -134,7 +134,8 @@ class I2SAudioSpeakerBase : public I2SAudioOut, public speaker::Speaker, public
void apply_software_volume_(uint8_t *data, size_t bytes_read);
/// @brief Swap adjacent 16-bit mono samples for ESP32 (non-variant) hardware quirk.
/// Only applies when running on original ESP32 with 16-bit mono audio.
/// Only applies when running on original ESP32 with 16-bit mono output. Operates on the data that is
/// handed to the I2S peripheral, so the check uses the output (post-narrowing) stream info.
/// @param data Pointer to audio sample data (modified in place)
/// @param bytes_read Number of bytes of audio data
void swap_esp32_mono_samples_(uint8_t *data, size_t bytes_read);
@@ -156,7 +157,11 @@ class I2SAudioSpeakerBase : public I2SAudioOut, public speaker::Speaker, public
int32_t q31_volume_factor_{INT32_MAX};
audio::AudioStreamInfo current_stream_info_; // The currently loaded driver's stream info
audio::AudioStreamInfo current_stream_info_; // Format of the audio in the ring buffer (the I2S input)
// Format actually clocked out of the I2S peripheral. Same channel count and sample rate as
// current_stream_info_, but the bits per sample may be narrower when the incoming stream is wider than
// the speaker's configured slot bit width. Set by start_i2s_driver before the speaker task starts.
audio::AudioStreamInfo output_stream_info_;
gpio_num_t dout_pin_;
i2s_chan_handle_t tx_handle_{nullptr};

View File

@@ -13,6 +13,9 @@
#include "esp_timer.h"
// esp-audio-libs
#include <pcm_convert.h>
namespace esphome::i2s_audio {
static const char *const TAG = "i2s_audio.speaker.std";
@@ -62,6 +65,12 @@ void I2SAudioSpeaker::dump_config() {
break;
}
ESP_LOGCONFIG(TAG, " Communication format: %s", fmt_str);
if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO) {
// The width of each I2S slot. It is also the narrowing ceiling: streams wider than this are narrowed to
// it. A stream narrower than the slot is left at its own width and clocked into the wider slot, so this
// is not necessarily the sample data width (which depends on the incoming stream).
ESP_LOGCONFIG(TAG, " Slot bit width: %u", (unsigned) static_cast<uint32_t>(this->slot_bit_width_));
}
}
void I2SAudioSpeaker::run_speaker_task() {
@@ -71,12 +80,19 @@ void I2SAudioSpeaker::run_speaker_task() {
// Ensure ring buffer duration is at least the duration of all DMA buffers
const uint32_t ring_buffer_duration = std::max(dma_buffers_duration_ms, this->buffer_duration_ms_);
// The DMA buffers may have more bits per sample, so calculate buffer sizes based on the input audio stream info
// The ring buffer holds input-format audio (what play() receives), so size it from the input stream info.
const size_t bytes_per_frame = this->current_stream_info_.frames_to_bytes(1);
// Round the ring buffer size down to a multiple of bytes_per_frame so the wrap boundary stays frame-aligned and
// avoids unnecessary single-frame splices.
const size_t ring_buffer_size =
(this->current_stream_info_.ms_to_bytes(ring_buffer_duration) / bytes_per_frame) * bytes_per_frame;
// Per-frame byte widths and whether the task must narrow the bit depth before writing to the I2S peripheral.
const uint8_t channels = this->current_stream_info_.get_channels();
const uint8_t input_bytes_per_sample = this->current_stream_info_.get_bits_per_sample() / 8;
const uint8_t output_bytes_per_sample = this->output_stream_info_.get_bits_per_sample() / 8;
const bool narrowing = input_bytes_per_sample != output_bytes_per_sample;
// ESP-IDF may allocate smaller (or cache-line-rounded) DMA buffers than dma_buffer_frames() requested: it
// clamps each descriptor to the max DMA descriptor size and, on targets that route internal memory through
// the L1 cache (e.g. ESP32-P4), rounds the buffer to the cache line. Read the size the driver actually
@@ -89,9 +105,12 @@ void I2SAudioSpeaker::run_speaker_task() {
dma_buffer_bytes = chan_info.total_dma_buf_size / DMA_BUFFERS_COUNT;
} else {
// Should not happen for a READY channel; fall back to the requested size.
dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(dma_buffer_frames(this->current_stream_info_));
dma_buffer_bytes = this->output_stream_info_.frames_to_bytes(dma_buffer_frames(this->output_stream_info_));
}
const uint32_t frames_per_dma_buffer = this->current_stream_info_.bytes_to_frames(dma_buffer_bytes);
// dma_buffer_bytes counts output-format bytes; convert with the output stream info.
const uint32_t frames_per_dma_buffer = this->output_stream_info_.bytes_to_frames(dma_buffer_bytes);
// Soft cap for each source read: enough input-format bytes to fill one DMA buffer's worth of frames.
const size_t dma_buffer_input_bytes = this->current_stream_info_.frames_to_bytes(frames_per_dma_buffer);
bool successful_setup = false;
@@ -105,8 +124,8 @@ void I2SAudioSpeaker::run_speaker_task() {
memset(silence_buffer, 0, dma_buffer_bytes);
std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(ring_buffer_size);
audio_source =
audio::RingBufferAudioSource::create(temp_ring_buffer, dma_buffer_bytes, static_cast<uint8_t>(bytes_per_frame));
audio_source = audio::RingBufferAudioSource::create(temp_ring_buffer, dma_buffer_input_bytes,
static_cast<uint8_t>(bytes_per_frame));
if (audio_source != nullptr) {
// audio_source is nullptr if the ring buffer fails to allocate
@@ -237,42 +256,61 @@ void I2SAudioSpeaker::run_speaker_task() {
// Compose exactly one DMA buffer's worth: drain as much real audio as the source currently
// exposes (may take multiple fill() calls when crossing a ring buffer wrap), then pad any
// remainder with silence. All writes pack into the next free DMA descriptor in order, so the
// descriptor ends up holding [real audio][silence padding].
// descriptor ends up holding [real audio][silence padding]. ``bytes_written_total`` counts
// output-format bytes so it tracks how full the DMA buffer is regardless of any narrowing.
size_t bytes_written_total = 0;
size_t real_bytes_total = 0;
uint32_t real_frames_total = 0;
bool partial_write_failure = false;
if (!this->pause_state_) {
while (bytes_written_total < dma_buffer_bytes) {
size_t bytes_read = audio_source->fill(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS) / 2, false);
if (bytes_read > 0) {
// Apply volume at the input bit depth, before any narrowing, so the full precision is scaled.
uint8_t *new_data = audio_source->mutable_data() + audio_source->available() - bytes_read;
this->apply_software_volume_(new_data, bytes_read);
this->swap_esp32_mono_samples_(new_data, bytes_read);
}
const size_t to_write = std::min(audio_source->available(), dma_buffer_bytes - bytes_written_total);
if (to_write == 0) {
// Convert as many whole frames as fit in the remaining DMA space, bounded by what the source
// currently exposes. Frame counts are shared between input and output; only the byte widths differ.
const uint32_t frames_available = this->current_stream_info_.bytes_to_frames(audio_source->available());
const uint32_t frames_room =
this->output_stream_info_.bytes_to_frames(dma_buffer_bytes - bytes_written_total);
const uint32_t frames_to_write = std::min(frames_available, frames_room);
if (frames_to_write == 0) {
// Ring buffer has nothing more to hand over right now; pad the rest of this DMA buffer
// with silence so the lockstep invariant (one write per iteration) is preserved.
break;
}
const size_t input_bytes = this->current_stream_info_.frames_to_bytes(frames_to_write);
const size_t output_bytes = this->output_stream_info_.frames_to_bytes(frames_to_write);
uint8_t *chunk = audio_source->mutable_data();
if (narrowing) {
// Narrow the bit depth in place: output exactly aliases input with the same channel count and a
// smaller width, which copy_frames handles as a single forward pass. Only the frames about to be
// consumed are overwritten, so any unprocessed tail stays intact for the next iteration.
esp_audio_libs::pcm_convert::copy_frames(chunk, chunk, input_bytes_per_sample, channels,
output_bytes_per_sample, channels, frames_to_write);
}
this->swap_esp32_mono_samples_(chunk, output_bytes);
size_t bw = 0;
i2s_channel_write(this->tx_handle_, audio_source->data(), to_write, &bw, WRITE_TIMEOUT_TICKS);
if (bw != to_write) {
i2s_channel_write(this->tx_handle_, chunk, output_bytes, &bw, WRITE_TIMEOUT_TICKS);
if (bw != output_bytes) {
// A short real-audio write breaks DMA descriptor alignment for every subsequent event;
// the only safe recovery is to restart the task.
ESP_LOGV(TAG, "Partial real audio write: %u of %u bytes", (unsigned) bw, (unsigned) to_write);
ESP_LOGV(TAG, "Partial real audio write: %u of %u bytes", (unsigned) bw, (unsigned) output_bytes);
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_PARTIAL_WRITE);
partial_write_failure = true;
break;
}
audio_source->consume(bw);
bytes_written_total += bw;
real_bytes_total += bw;
audio_source->consume(input_bytes);
bytes_written_total += output_bytes;
real_frames_total += frames_to_write;
}
if (real_bytes_total > 0) {
if (real_frames_total > 0) {
last_data_received_time = millis();
}
}
@@ -293,16 +331,15 @@ void I2SAudioSpeaker::run_speaker_task() {
}
}
const uint32_t real_frames_in_buffer = this->current_stream_info_.bytes_to_frames(real_bytes_total);
// Push the matching write record. Capacity headroom in I2S_EVENT_QUEUE_COUNT guarantees this
// succeeds even with a transient backlog of unprocessed events; if it ever fails the lockstep
// invariant is broken and every subsequent timestamp would be silently wrong, so bail.
if (xQueueSend(this->write_records_queue_, &real_frames_in_buffer, 0) != pdTRUE) {
if (xQueueSend(this->write_records_queue_, &real_frames_total, 0) != pdTRUE) {
ESP_LOGV(TAG, "Exiting: write records queue full");
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_LOCKSTEP_DESYNC);
break;
}
if (real_frames_in_buffer > 0) {
if (real_frames_total > 0) {
pending_real_buffers++;
}
}
@@ -334,21 +371,28 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver(audio::AudioStreamInfo &audio_stream
return ESP_ERR_NOT_SUPPORTED;
}
if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO &&
(i2s_slot_bit_width_t) audio_stream_info.get_bits_per_sample() > this->slot_bit_width_) {
// Currently can't handle the case when the incoming audio has more bits per sample than the configured value
ESP_LOGE(TAG, "Stream bits per sample must be less than or equal to the speaker's configuration");
return ESP_ERR_NOT_SUPPORTED;
// When the stream is wider than the configured slot bit width, the speaker task narrows each frame in place
// before handing it to the I2S peripheral. Compute the output format here so the driver, DMA buffers, and
// the task's conversion all agree on the clocked-out width. A stream no wider than the slot width is passed
// through unchanged (the slot may still be wider than the data, the existing behavior).
uint8_t output_bits_per_sample = audio_stream_info.get_bits_per_sample();
if (this->slot_bit_width_ != I2S_SLOT_BIT_WIDTH_AUTO) {
const uint8_t configured_bits = static_cast<uint8_t>(this->slot_bit_width_);
if (output_bits_per_sample > configured_bits) {
output_bits_per_sample = configured_bits;
}
}
this->output_stream_info_ = audio::AudioStreamInfo(output_bits_per_sample, audio_stream_info.get_channels(),
audio_stream_info.get_sample_rate());
#ifdef USE_ESP32_VARIANT_ESP32
// The original ESP32 I2S peripheral stores each sample in a whole number of 16-bit words (a 24-bit sample
// occupies 4 bytes in the DMA buffer, an 8-bit sample 2 bytes), but ESPHome's audio pipeline packs samples
// tightly (3 bytes for 24-bit, 1 for 8-bit). The two layouts only line up when the bit depth is a multiple
// of 16, so reject anything else rather than emit corrupted audio.
if (audio_stream_info.get_bits_per_sample() % 16 != 0) {
ESP_LOGE(TAG, "ESP32 supports only 16- or 32-bit audio, got %u-bit",
(unsigned) audio_stream_info.get_bits_per_sample());
// of 16. The check is on the output width since that is what reaches the peripheral; a wider input is fine
// as long as it narrows to a 16- or 32-bit slot.
if (output_bits_per_sample % 16 != 0) {
ESP_LOGE(TAG, "ESP32 supports only 16- or 32-bit output, got %u-bit", (unsigned) output_bits_per_sample);
return ESP_ERR_NOT_SUPPORTED;
}
#endif // USE_ESP32_VARIANT_ESP32
@@ -358,7 +402,8 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver(audio::AudioStreamInfo &audio_stream
return ESP_ERR_INVALID_STATE;
}
uint32_t dma_buffer_length = dma_buffer_frames(audio_stream_info);
// The DMA buffers hold output-format (post-narrowing) samples, so size them from the output stream info.
uint32_t dma_buffer_length = dma_buffer_frames(this->output_stream_info_);
i2s_role_t i2s_role = this->i2s_role_;
i2s_clock_src_t clk_src = I2S_CLK_SRC_DEFAULT;
@@ -398,19 +443,18 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver(audio::AudioStreamInfo &audio_stream
slot_mask = I2S_STD_SLOT_BOTH;
}
// Configure the data bit width from the output (post-narrowing) format, which is what is clocked out.
const i2s_data_bit_width_t data_bit_width = (i2s_data_bit_width_t) this->output_stream_info_.get_bits_per_sample();
i2s_std_slot_config_t slot_cfg;
switch (this->i2s_comm_fmt_) {
case I2SCommFmt::PCM:
slot_cfg =
I2S_STD_PCM_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) audio_stream_info.get_bits_per_sample(), slot_mode);
slot_cfg = I2S_STD_PCM_SLOT_DEFAULT_CONFIG(data_bit_width, slot_mode);
break;
case I2SCommFmt::MSB:
slot_cfg =
I2S_STD_MSB_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) audio_stream_info.get_bits_per_sample(), slot_mode);
slot_cfg = I2S_STD_MSB_SLOT_DEFAULT_CONFIG(data_bit_width, slot_mode);
break;
default:
slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG((i2s_data_bit_width_t) audio_stream_info.get_bits_per_sample(),
slot_mode);
slot_cfg = I2S_STD_PHILIPS_SLOT_DEFAULT_CONFIG(data_bit_width, slot_mode);
break;
}