From 747787ae98f3a19819aa1ecbbecafca558435267 Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Sun, 24 May 2026 15:34:15 -0400 Subject: [PATCH] [audio] Use RingBufferAudioSource for resampling (#16560) --- esphome/components/audio/audio_resampler.cpp | 61 +++++++++++++++----- esphome/components/audio/audio_resampler.h | 17 +++--- 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/esphome/components/audio/audio_resampler.cpp b/esphome/components/audio/audio_resampler.cpp index c04cc881f5..bef62ce190 100644 --- a/esphome/components/audio/audio_resampler.cpp +++ b/esphome/components/audio/audio_resampler.cpp @@ -12,16 +12,17 @@ static const uint32_t READ_WRITE_TIMEOUT_MS = 20; AudioResampler::AudioResampler(size_t input_buffer_size, size_t output_buffer_size) : input_buffer_size_(input_buffer_size), output_buffer_size_(output_buffer_size) { - this->input_transfer_buffer_ = AudioSourceTransferBuffer::create(input_buffer_size); this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(output_buffer_size); } esp_err_t AudioResampler::add_source(std::weak_ptr &input_ring_buffer) { - if (this->input_transfer_buffer_ != nullptr) { - this->input_transfer_buffer_->set_source(input_ring_buffer); - return ESP_OK; + // The zero-copy RingBufferAudioSource is created lazily on the first resample() call, once both the ring + // buffer (stored here) and the input stream info (set by start()) are available, in either order. + this->source_ring_buffer_ = input_ring_buffer.lock(); + if (this->source_ring_buffer_ == nullptr) { + return ESP_ERR_INVALID_STATE; } - return ESP_ERR_NO_MEM; + return ESP_OK; } esp_err_t AudioResampler::add_sink(std::weak_ptr &output_ring_buffer) { @@ -47,7 +48,7 @@ esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamI this->input_stream_info_ = input_stream_info; this->output_stream_info_ = output_stream_info; - if ((this->input_transfer_buffer_ == nullptr) || (this->output_transfer_buffer_ == nullptr)) { + if (this->output_transfer_buffer_ == nullptr) { return ESP_ERR_NO_MEM; } @@ -56,6 +57,13 @@ esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamI return ESP_ERR_NOT_SUPPORTED; } + // Reject frame sizes that can't be used as the zero-copy source's alignment up front, where the caller checks + // the return code. The lazy create() in resample() keeps its own guard since it runs before the uint8_t cast. + const size_t bytes_per_frame = this->input_stream_info_.frames_to_bytes(1); + if ((bytes_per_frame == 0) || (bytes_per_frame > RingBufferAudioSource::MAX_ALIGNMENT_BYTES)) { + return ESP_ERR_NOT_SUPPORTED; + } + if ((input_stream_info.get_sample_rate() != output_stream_info.get_sample_rate()) || (input_stream_info.get_bits_per_sample() != output_stream_info.get_bits_per_sample())) { this->resampler_ = make_unique( @@ -87,8 +95,27 @@ esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamI } AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_differential) { + if (this->audio_source_ == nullptr) { + // Lazily create the zero-copy source on first use. Frame-aligned reads ensure multi-channel frames are + // never split across the ring buffer's wrap boundary. + const size_t bytes_per_frame = this->input_stream_info_.frames_to_bytes(1); + if ((bytes_per_frame == 0) || (bytes_per_frame > RingBufferAudioSource::MAX_ALIGNMENT_BYTES)) { + // Stream info is unset or the frame is too large to use as an alignment; the uint8_t cast below would + // truncate it and could yield a source that tears frames. + return AudioResamplerState::FAILED; + } + // Pass the shared_ptr by copy so a failed create() leaves source_ring_buffer_ intact; release our + // reference only after the source has taken ownership. + this->audio_source_ = RingBufferAudioSource::create(this->source_ring_buffer_, this->input_buffer_size_, + static_cast(bytes_per_frame)); + if (this->audio_source_ == nullptr) { + return AudioResamplerState::FAILED; + } + this->source_ring_buffer_.reset(); + } + if (stop_gracefully) { - if (!this->input_transfer_buffer_->has_buffered_data() && (this->output_transfer_buffer_->available() == 0)) { + if (!this->audio_source_->has_buffered_data() && (this->output_transfer_buffer_->available() == 0)) { return AudioResamplerState::FINISHED; } } @@ -102,9 +129,11 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d delay(READ_WRITE_TIMEOUT_MS); } - this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS)); + // Expose a chunk of the ring buffer's internal storage. pre_shift is ignored by RingBufferAudioSource + // (there is no intermediate transfer buffer to compact). + this->audio_source_->fill(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false); - if (this->input_transfer_buffer_->available() == 0) { + if (this->audio_source_->available() == 0) { // No samples available to process return AudioResamplerState::RESAMPLING; } @@ -112,17 +141,17 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d const size_t bytes_free = this->output_transfer_buffer_->free(); const uint32_t frames_free = this->output_stream_info_.bytes_to_frames(bytes_free); - const size_t bytes_available = this->input_transfer_buffer_->available(); + const size_t bytes_available = this->audio_source_->available(); const uint32_t frames_available = this->input_stream_info_.bytes_to_frames(bytes_available); if ((this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) || (this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample())) { // Adjust gain by -3 dB to avoid clipping due to the resampling process esp_audio_libs::resampler::ResamplerResults results = - this->resampler_->resample(this->input_transfer_buffer_->get_buffer_start(), - this->output_transfer_buffer_->get_buffer_end(), frames_available, frames_free, -3); + this->resampler_->resample(this->audio_source_->data(), this->output_transfer_buffer_->get_buffer_end(), + frames_available, frames_free, -3); - this->input_transfer_buffer_->decrease_buffer_length(this->input_stream_info_.frames_to_bytes(results.frames_used)); + this->audio_source_->consume(this->input_stream_info_.frames_to_bytes(results.frames_used)); this->output_transfer_buffer_->increase_buffer_length( this->output_stream_info_.frames_to_bytes(results.frames_generated)); @@ -146,10 +175,10 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d const size_t bytes_to_transfer = std::min(this->output_stream_info_.frames_to_bytes(frames_free), this->input_stream_info_.frames_to_bytes(frames_available)); - std::memcpy((void *) this->output_transfer_buffer_->get_buffer_end(), - (void *) this->input_transfer_buffer_->get_buffer_start(), bytes_to_transfer); + std::memcpy((void *) this->output_transfer_buffer_->get_buffer_end(), (const void *) this->audio_source_->data(), + bytes_to_transfer); - this->input_transfer_buffer_->decrease_buffer_length(bytes_to_transfer); + this->audio_source_->consume(bytes_to_transfer); this->output_transfer_buffer_->increase_buffer_length(bytes_to_transfer); } diff --git a/esphome/components/audio/audio_resampler.h b/esphome/components/audio/audio_resampler.h index 575ad13692..c09070c0ce 100644 --- a/esphome/components/audio/audio_resampler.h +++ b/esphome/components/audio/audio_resampler.h @@ -22,7 +22,7 @@ namespace esphome::audio { enum class AudioResamplerState : uint8_t { RESAMPLING, // More data is available to resample FINISHED, // All file data has been resampled and transferred - FAILED, // Unused state included for consistency among Audio classes + FAILED, // Failed to allocate the audio source }; class AudioResampler { @@ -32,14 +32,16 @@ class AudioResampler { * component). Also supports converting bits per sample. */ public: - /// @brief Allocates the input and output transfer buffers - /// @param input_buffer_size Size of the input transfer buffer in bytes. + /// @brief Allocates the output transfer buffer. The input source is created later in resample(). + /// @param input_buffer_size Max bytes exposed per fill() call on the zero-copy input source. /// @param output_buffer_size Size of the output transfer buffer in bytes. AudioResampler(size_t input_buffer_size, size_t output_buffer_size); - /// @brief Adds a source ring buffer for audio data. Takes ownership of the ring buffer in a shared_ptr. - /// @param input_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership - /// @return ESP_OK if successsful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated + /// @brief Sets the ring buffer the audio is read from and takes shared ownership of it. The zero-copy + /// RingBufferAudioSource that reads directly from its internal storage is created lazily on the first + /// resample() call, so add_source() and start() may be called in any order. + /// @param input_ring_buffer weak_ptr of a shared_ptr of the source ring buffer to transfer ownership + /// @return ESP_OK if successful, ESP_ERR_INVALID_STATE if the ring buffer is no longer alive esp_err_t add_source(std::weak_ptr &input_ring_buffer); /// @brief Adds a sink ring buffer for resampled audio. Takes ownership of the ring buffer in a shared_ptr. @@ -78,7 +80,8 @@ class AudioResampler { void set_pause_output_state(bool pause_state) { this->pause_output_ = pause_state; } protected: - std::unique_ptr input_transfer_buffer_; + std::shared_ptr source_ring_buffer_; + std::unique_ptr audio_source_; std::unique_ptr output_transfer_buffer_; size_t input_buffer_size_;