mirror of
https://github.com/esphome/esphome.git
synced 2026-06-24 12:53:26 +00:00
[i2s_audio] Properly track DMA input/output (#16317)
This commit is contained in:
@@ -99,7 +99,7 @@ void I2SAudioSpeakerBase::loop() {
|
||||
}
|
||||
|
||||
if (event_group_bits & SpeakerEventGroupBits::ERR_ESP_NO_MEM) {
|
||||
ESP_LOGE(TAG, "Not enough memory");
|
||||
ESP_LOGE(TAG, "Speaker task setup failed (allocation, preload, or channel enable)");
|
||||
xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
|
||||
}
|
||||
|
||||
|
||||
@@ -36,9 +36,7 @@ enum SpeakerEventGroupBits : uint32_t {
|
||||
ERR_ESP_NO_MEM = (1 << 19),
|
||||
|
||||
ERR_DROPPED_EVENT = (1 << 20), // ISR overflowed the event queue, dropping a completion event
|
||||
ERR_PARTIAL_WRITE = (1 << 21), // a DMA write returned fewer bytes than requested (or the encoder
|
||||
// failed to commit a complete block), which breaks the lockstep
|
||||
// invariant for every subsequent event
|
||||
ERR_PARTIAL_WRITE = (1 << 21), // i2s_channel_write returned fewer bytes than requested
|
||||
ERR_LOCKSTEP_DESYNC = (1 << 22), // i2s_event_queue_ and write_records_queue_ fell out of sync
|
||||
|
||||
ALL_BITS = 0x00FFFFFF, // All valid FreeRTOS event group bits
|
||||
|
||||
@@ -17,7 +17,14 @@ namespace esphome::i2s_audio {
|
||||
static const char *const TAG = "i2s_audio.speaker.std";
|
||||
|
||||
static constexpr size_t DMA_BUFFERS_COUNT = 4;
|
||||
static constexpr size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT + 1;
|
||||
// Sized to comfortably absorb scheduling jitter: at most DMA_BUFFERS_COUNT events can be in flight,
|
||||
// doubled so that a transient backlog never overruns the queue (which would desync the lockstep
|
||||
// invariant between i2s_event_queue_ and write_records_queue_).
|
||||
static constexpr size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT * 2;
|
||||
// Generous timeout for ``i2s_channel_write`` blocking. A buffer frees roughly every
|
||||
// DMA_BUFFER_DURATION_MS, so a multiple of that gives plenty of slack against scheduling jitter
|
||||
// without masking real failures.
|
||||
static constexpr TickType_t WRITE_TIMEOUT_TICKS = pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * (DMA_BUFFERS_COUNT + 1));
|
||||
|
||||
void I2SAudioSpeaker::dump_config() {
|
||||
I2SAudioSpeakerBase::dump_config();
|
||||
@@ -49,30 +56,73 @@ void I2SAudioSpeaker::run_speaker_task() {
|
||||
// avoids unnecessary single-frame splices.
|
||||
const size_t ring_buffer_size =
|
||||
(this->current_stream_info_.ms_to_bytes(ring_buffer_duration) / bytes_per_frame) * bytes_per_frame;
|
||||
const uint32_t frames_to_fill_single_dma_buffer = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
|
||||
const size_t bytes_to_fill_single_dma_buffer =
|
||||
this->current_stream_info_.frames_to_bytes(frames_to_fill_single_dma_buffer);
|
||||
const uint32_t frames_per_dma_buffer = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
|
||||
const size_t dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(frames_per_dma_buffer);
|
||||
|
||||
bool successful_setup = false;
|
||||
|
||||
std::unique_ptr<audio::RingBufferAudioSource> audio_source;
|
||||
|
||||
{
|
||||
// Pre-zeroed buffer used to silence-pad each DMA descriptor whenever real audio doesn't fully fill it.
|
||||
RAMAllocator<uint8_t> silence_allocator;
|
||||
uint8_t *silence_buffer = silence_allocator.allocate(dma_buffer_bytes);
|
||||
|
||||
if (silence_buffer != nullptr) {
|
||||
memset(silence_buffer, 0, dma_buffer_bytes);
|
||||
|
||||
std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(ring_buffer_size);
|
||||
audio_source = audio::RingBufferAudioSource::create(temp_ring_buffer, bytes_to_fill_single_dma_buffer,
|
||||
static_cast<uint8_t>(bytes_per_frame));
|
||||
audio_source =
|
||||
audio::RingBufferAudioSource::create(temp_ring_buffer, dma_buffer_bytes, static_cast<uint8_t>(bytes_per_frame));
|
||||
|
||||
if (audio_source != nullptr) {
|
||||
// audio_source is nullptr if the ring buffer fails to allocate
|
||||
this->audio_ring_buffer_ = temp_ring_buffer;
|
||||
successful_setup = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (successful_setup) {
|
||||
// Preload every DMA descriptor with silence and push a matching zero-real-frames record per buffer.
|
||||
// This guarantees that every on_sent event has a corresponding write record from the start, so
|
||||
// ``i2s_event_queue_`` and ``write_records_queue_`` stay in lockstep for the entire task lifetime.
|
||||
for (size_t i = 0; i < DMA_BUFFERS_COUNT; i++) {
|
||||
size_t bytes_loaded = 0;
|
||||
esp_err_t err = i2s_channel_preload_data(this->tx_handle_, silence_buffer, dma_buffer_bytes, &bytes_loaded);
|
||||
if (err != ESP_OK || bytes_loaded != dma_buffer_bytes) {
|
||||
ESP_LOGV(TAG, "Failed to preload silence into DMA buffer %u (err=%d, loaded=%u)", (unsigned) i, (int) err,
|
||||
(unsigned) bytes_loaded);
|
||||
successful_setup = false;
|
||||
break;
|
||||
}
|
||||
uint32_t zero_real_frames = 0;
|
||||
if (xQueueSend(this->write_records_queue_, &zero_real_frames, 0) != pdTRUE) {
|
||||
// Should never happen: the queue was just reset and is sized for DMA_BUFFERS_COUNT * 2 entries.
|
||||
ESP_LOGV(TAG, "Failed to push preload write record");
|
||||
successful_setup = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (successful_setup) {
|
||||
// Register the on_sent callback BEFORE enabling the channel so the very first transmitted buffer
|
||||
// generates a queued event that pairs with the first preloaded silence record.
|
||||
const i2s_event_callbacks_t callbacks = {.on_sent = i2s_on_sent_cb};
|
||||
i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
|
||||
|
||||
if (i2s_channel_enable(this->tx_handle_) != ESP_OK) {
|
||||
ESP_LOGV(TAG, "Failed to enable I2S channel");
|
||||
successful_setup = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!successful_setup) {
|
||||
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_ESP_NO_MEM);
|
||||
} else {
|
||||
bool stop_gracefully = false;
|
||||
bool tx_dma_underflow = true;
|
||||
|
||||
uint32_t frames_written = 0;
|
||||
// Number of records currently in ``write_records_queue_`` that carry real audio. Used by graceful
|
||||
// stop to wait until every real-audio buffer has been confirmed played by an ISR event.
|
||||
uint32_t pending_real_buffers = 0;
|
||||
uint32_t last_data_received_time = millis();
|
||||
|
||||
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_RUNNING);
|
||||
@@ -81,11 +131,21 @@ void I2SAudioSpeaker::run_speaker_task() {
|
||||
// - Paused, OR
|
||||
// - No timeout configured, OR
|
||||
// - Timeout hasn't elapsed since last data
|
||||
//
|
||||
// Always-fill model: every iteration writes exactly one DMA buffer's worth, mixing real audio
|
||||
// and silence padding as needed. The blocking ``i2s_channel_write`` paces the loop at the DMA
|
||||
// consumption rate, and every buffer write is matched 1:1 with a record on ``write_records_queue_``.
|
||||
//
|
||||
// While paused, the real-audio fill is skipped and the entire DMA buffer is filled with silence;
|
||||
// the same blocking ``i2s_channel_write`` provides natural pacing (one buffer per ~DMA_BUFFER_DURATION_MS),
|
||||
// so the lockstep invariant is preserved without burning CPU.
|
||||
while (this->pause_state_ || !this->timeout_.has_value() ||
|
||||
(millis() - last_data_received_time) <= this->timeout_.value()) {
|
||||
uint32_t event_group_bits = xEventGroupGetBits(this->event_group_);
|
||||
|
||||
if (event_group_bits & SpeakerEventGroupBits::COMMAND_STOP) {
|
||||
// COMMAND_STOP is set both by user-initiated stop() and by the ISR when it drops a completion
|
||||
// event (paired with ERR_DROPPED_EVENT so loop() can distinguish the two cases).
|
||||
xEventGroupClearBits(this->event_group_, SpeakerEventGroupBits::COMMAND_STOP);
|
||||
ESP_LOGV(TAG, "Exiting: COMMAND_STOP received");
|
||||
break;
|
||||
@@ -101,80 +161,115 @@ void I2SAudioSpeaker::run_speaker_task() {
|
||||
break;
|
||||
}
|
||||
|
||||
// Drain ISR-stamped completion events. Each event corresponds 1:1 with a write_records_queue_
|
||||
// entry by construction (preloaded records at startup, plus exactly one record pushed per
|
||||
// iteration alongside exactly one DMA-buffer-sized write).
|
||||
int64_t write_timestamp;
|
||||
bool lockstep_broken = false;
|
||||
while (xQueueReceive(this->i2s_event_queue_, &write_timestamp, 0)) {
|
||||
// Receives timing events from the I2S on_sent callback. If actual audio data was sent in this event, it passes
|
||||
// on the timing info via the audio_output_callback.
|
||||
uint32_t frames_sent = frames_to_fill_single_dma_buffer;
|
||||
if (frames_to_fill_single_dma_buffer > frames_written) {
|
||||
tx_dma_underflow = true;
|
||||
frames_sent = frames_written;
|
||||
const uint32_t frames_zeroed = frames_to_fill_single_dma_buffer - frames_written;
|
||||
write_timestamp -= this->current_stream_info_.frames_to_microseconds(frames_zeroed);
|
||||
} else {
|
||||
tx_dma_underflow = false;
|
||||
}
|
||||
frames_written -= frames_sent;
|
||||
|
||||
// Standard I2S mode: fire callback immediately for each event
|
||||
if (frames_sent > 0) {
|
||||
this->audio_output_callback_(frames_sent, write_timestamp);
|
||||
}
|
||||
}
|
||||
|
||||
if (this->pause_state_) {
|
||||
// Pause state is accessed atomically, so thread safe
|
||||
// Delay so the task yields, then skip transferring audio data
|
||||
vTaskDelay(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Wait half the duration of the data already written to the DMA buffers for new audio data
|
||||
// The millisecond helper modifies the frames_written variable, so use the microsecond helper and divide by 1000
|
||||
uint32_t read_delay = (this->current_stream_info_.frames_to_microseconds(frames_written) / 1000) / 2;
|
||||
|
||||
size_t bytes_read = audio_source->fill(pdMS_TO_TICKS(read_delay), false);
|
||||
uint8_t *new_data = audio_source->mutable_data();
|
||||
|
||||
if (bytes_read > 0) {
|
||||
this->apply_software_volume_(new_data, bytes_read);
|
||||
this->swap_esp32_mono_samples_(new_data, bytes_read);
|
||||
}
|
||||
|
||||
if (audio_source->available() == 0) {
|
||||
if (stop_gracefully && tx_dma_underflow) {
|
||||
uint32_t real_frames = 0;
|
||||
if (xQueueReceive(this->write_records_queue_, &real_frames, 0) != pdTRUE) {
|
||||
// Should never happen: would indicate the lockstep invariant is broken.
|
||||
ESP_LOGV(TAG, "Event without matching write record");
|
||||
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_LOCKSTEP_DESYNC);
|
||||
lockstep_broken = true;
|
||||
break;
|
||||
}
|
||||
vTaskDelay(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS / 2));
|
||||
} else {
|
||||
size_t bytes_written = 0;
|
||||
|
||||
if (tx_dma_underflow) {
|
||||
// Temporarily disable channel and callback to reset the I2S driver's internal DMA buffer queue
|
||||
i2s_channel_disable(this->tx_handle_);
|
||||
const i2s_event_callbacks_t null_callbacks = {.on_sent = nullptr};
|
||||
i2s_channel_register_event_callback(this->tx_handle_, &null_callbacks, this);
|
||||
i2s_channel_preload_data(this->tx_handle_, audio_source->data(), audio_source->available(), &bytes_written);
|
||||
} else {
|
||||
// Audio is already playing, use regular write to add to the DMA buffers
|
||||
i2s_channel_write(this->tx_handle_, audio_source->data(), audio_source->available(), &bytes_written,
|
||||
DMA_BUFFER_DURATION_MS);
|
||||
if (real_frames > 0) {
|
||||
pending_real_buffers--;
|
||||
// Real audio is packed at the start of each DMA buffer with any silence padding on the
|
||||
// tail, so the real audio finished playing earlier than the buffer-completion timestamp
|
||||
// by the duration of the trailing zeros.
|
||||
const uint32_t silence_frames = frames_per_dma_buffer - real_frames;
|
||||
const int64_t adjusted_ts =
|
||||
write_timestamp - this->current_stream_info_.frames_to_microseconds(silence_frames);
|
||||
this->audio_output_callback_(real_frames, adjusted_ts);
|
||||
}
|
||||
}
|
||||
if (lockstep_broken) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (bytes_written > 0) {
|
||||
last_data_received_time = millis();
|
||||
frames_written += this->current_stream_info_.bytes_to_frames(bytes_written);
|
||||
audio_source->consume(bytes_written);
|
||||
// Graceful stop: exit only after the source's exposed chunk is drained, the underlying ring
|
||||
// buffer has nothing left to hand over, and every real-audio buffer we submitted has been
|
||||
// confirmed played. ``has_buffered_data()`` returns bytes still sitting in the ring buffer
|
||||
// awaiting fill().
|
||||
if (stop_gracefully && audio_source->available() == 0 && !this->has_buffered_data() &&
|
||||
pending_real_buffers == 0) {
|
||||
ESP_LOGV(TAG, "Exiting: graceful stop complete");
|
||||
break;
|
||||
}
|
||||
|
||||
if (tx_dma_underflow) {
|
||||
tx_dma_underflow = false;
|
||||
// Enable the on_sent callback and channel after preload
|
||||
xQueueReset(this->i2s_event_queue_);
|
||||
const i2s_event_callbacks_t callbacks = {.on_sent = i2s_on_sent_cb};
|
||||
i2s_channel_register_event_callback(this->tx_handle_, &callbacks, this);
|
||||
i2s_channel_enable(this->tx_handle_);
|
||||
// Compose exactly one DMA buffer's worth: drain as much real audio as the source currently
|
||||
// exposes (may take multiple fill() calls when crossing a ring buffer wrap), then pad any
|
||||
// remainder with silence. All writes pack into the next free DMA descriptor in order, so the
|
||||
// descriptor ends up holding [real audio][silence padding].
|
||||
size_t bytes_written_total = 0;
|
||||
size_t real_bytes_total = 0;
|
||||
bool partial_write_failure = false;
|
||||
|
||||
if (!this->pause_state_) {
|
||||
while (bytes_written_total < dma_buffer_bytes) {
|
||||
size_t bytes_read = audio_source->fill(pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS) / 2, false);
|
||||
if (bytes_read > 0) {
|
||||
uint8_t *new_data = audio_source->mutable_data() + audio_source->available() - bytes_read;
|
||||
this->apply_software_volume_(new_data, bytes_read);
|
||||
this->swap_esp32_mono_samples_(new_data, bytes_read);
|
||||
}
|
||||
|
||||
const size_t to_write = std::min(audio_source->available(), dma_buffer_bytes - bytes_written_total);
|
||||
if (to_write == 0) {
|
||||
// Ring buffer has nothing more to hand over right now; pad the rest of this DMA buffer
|
||||
// with silence so the lockstep invariant (one write per iteration) is preserved.
|
||||
break;
|
||||
}
|
||||
|
||||
size_t bw = 0;
|
||||
i2s_channel_write(this->tx_handle_, audio_source->data(), to_write, &bw, WRITE_TIMEOUT_TICKS);
|
||||
if (bw != to_write) {
|
||||
// A short real-audio write breaks DMA descriptor alignment for every subsequent event;
|
||||
// the only safe recovery is to restart the task.
|
||||
ESP_LOGV(TAG, "Partial real audio write: %u of %u bytes", (unsigned) bw, (unsigned) to_write);
|
||||
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_PARTIAL_WRITE);
|
||||
partial_write_failure = true;
|
||||
break;
|
||||
}
|
||||
audio_source->consume(bw);
|
||||
bytes_written_total += bw;
|
||||
real_bytes_total += bw;
|
||||
}
|
||||
if (real_bytes_total > 0) {
|
||||
last_data_received_time = millis();
|
||||
}
|
||||
}
|
||||
|
||||
if (partial_write_failure) {
|
||||
break;
|
||||
}
|
||||
|
||||
const size_t silence_bytes = dma_buffer_bytes - bytes_written_total;
|
||||
if (silence_bytes > 0) {
|
||||
size_t bw = 0;
|
||||
i2s_channel_write(this->tx_handle_, silence_buffer, silence_bytes, &bw, WRITE_TIMEOUT_TICKS);
|
||||
if (bw != silence_bytes) {
|
||||
// Same descriptor-alignment hazard as a partial real-audio write.
|
||||
ESP_LOGV(TAG, "Partial silence write: %u of %u bytes", (unsigned) bw, (unsigned) silence_bytes);
|
||||
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_PARTIAL_WRITE);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t real_frames_in_buffer = this->current_stream_info_.bytes_to_frames(real_bytes_total);
|
||||
// Push the matching write record. Capacity headroom in I2S_EVENT_QUEUE_COUNT guarantees this
|
||||
// succeeds even with a transient backlog of unprocessed events; if it ever fails the lockstep
|
||||
// invariant is broken and every subsequent timestamp would be silently wrong, so bail.
|
||||
if (xQueueSend(this->write_records_queue_, &real_frames_in_buffer, 0) != pdTRUE) {
|
||||
ESP_LOGV(TAG, "Exiting: write records queue full");
|
||||
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::ERR_LOCKSTEP_DESYNC);
|
||||
break;
|
||||
}
|
||||
if (real_frames_in_buffer > 0) {
|
||||
pending_real_buffers++;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -183,6 +278,11 @@ void I2SAudioSpeaker::run_speaker_task() {
|
||||
|
||||
audio_source.reset();
|
||||
|
||||
if (silence_buffer != nullptr) {
|
||||
silence_allocator.deallocate(silence_buffer, dma_buffer_bytes);
|
||||
silence_buffer = nullptr;
|
||||
}
|
||||
|
||||
xEventGroupSetBits(this->event_group_, SpeakerEventGroupBits::TASK_STOPPED);
|
||||
|
||||
while (true) {
|
||||
@@ -301,7 +401,7 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver(audio::AudioStreamInfo &audio_stream
|
||||
return err;
|
||||
}
|
||||
|
||||
i2s_channel_enable(this->tx_handle_);
|
||||
// The speaker task will enable the channel after preloading.
|
||||
|
||||
return ESP_OK;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user