[audio] Use RingBufferAudioSource for resampling (#16560)

2026-06-24 14:19:03 +00:00 · 2026-05-24 15:34:15 -04:00
parent 5cb7e62241
commit 747787ae98
2 changed files with 55 additions and 23 deletions
--- a/esphome/components/audio/audio_resampler.cpp
+++ b/esphome/components/audio/audio_resampler.cpp
@@ -12,16 +12,17 @@ static const uint32_t READ_WRITE_TIMEOUT_MS = 20;

 AudioResampler::AudioResampler(size_t input_buffer_size, size_t output_buffer_size)
    : input_buffer_size_(input_buffer_size), output_buffer_size_(output_buffer_size) {
-  this->input_transfer_buffer_ = AudioSourceTransferBuffer::create(input_buffer_size);
  this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(output_buffer_size);
 }

 esp_err_t AudioResampler::add_source(std::weak_ptr<ring_buffer::RingBuffer> &input_ring_buffer) {
-  if (this->input_transfer_buffer_ != nullptr) {
-    this->input_transfer_buffer_->set_source(input_ring_buffer);
-    return ESP_OK;
+  // The zero-copy RingBufferAudioSource is created lazily on the first resample() call, once both the ring
+  // buffer (stored here) and the input stream info (set by start()) are available, in either order.
+  this->source_ring_buffer_ = input_ring_buffer.lock();
+  if (this->source_ring_buffer_ == nullptr) {
+    return ESP_ERR_INVALID_STATE;
  }
-  return ESP_ERR_NO_MEM;
+  return ESP_OK;
 }

 esp_err_t AudioResampler::add_sink(std::weak_ptr<ring_buffer::RingBuffer> &output_ring_buffer) {
@@ -47,7 +48,7 @@ esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamI
  this->input_stream_info_ = input_stream_info;
  this->output_stream_info_ = output_stream_info;

-  if ((this->input_transfer_buffer_ == nullptr) || (this->output_transfer_buffer_ == nullptr)) {
+  if (this->output_transfer_buffer_ == nullptr) {
    return ESP_ERR_NO_MEM;
  }

@@ -56,6 +57,13 @@ esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamI
    return ESP_ERR_NOT_SUPPORTED;
  }

+  // Reject frame sizes that can't be used as the zero-copy source's alignment up front, where the caller checks
+  // the return code. The lazy create() in resample() keeps its own guard since it runs before the uint8_t cast.
+  const size_t bytes_per_frame = this->input_stream_info_.frames_to_bytes(1);
+  if ((bytes_per_frame == 0) || (bytes_per_frame > RingBufferAudioSource::MAX_ALIGNMENT_BYTES)) {
+    return ESP_ERR_NOT_SUPPORTED;
+  }
+
  if ((input_stream_info.get_sample_rate() != output_stream_info.get_sample_rate()) ||
      (input_stream_info.get_bits_per_sample() != output_stream_info.get_bits_per_sample())) {
    this->resampler_ = make_unique<esp_audio_libs::resampler::Resampler>(
@@ -87,8 +95,27 @@ esp_err_t AudioResampler::start(AudioStreamInfo &input_stream_info, AudioStreamI
 }

 AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_differential) {
+  if (this->audio_source_ == nullptr) {
+    // Lazily create the zero-copy source on first use. Frame-aligned reads ensure multi-channel frames are
+    // never split across the ring buffer's wrap boundary.
+    const size_t bytes_per_frame = this->input_stream_info_.frames_to_bytes(1);
+    if ((bytes_per_frame == 0) || (bytes_per_frame > RingBufferAudioSource::MAX_ALIGNMENT_BYTES)) {
+      // Stream info is unset or the frame is too large to use as an alignment; the uint8_t cast below would
+      // truncate it and could yield a source that tears frames.
+      return AudioResamplerState::FAILED;
+    }
+    // Pass the shared_ptr by copy so a failed create() leaves source_ring_buffer_ intact; release our
+    // reference only after the source has taken ownership.
+    this->audio_source_ = RingBufferAudioSource::create(this->source_ring_buffer_, this->input_buffer_size_,
+                                                        static_cast<uint8_t>(bytes_per_frame));
+    if (this->audio_source_ == nullptr) {
+      return AudioResamplerState::FAILED;
+    }
+    this->source_ring_buffer_.reset();
+  }
+
  if (stop_gracefully) {
-    if (!this->input_transfer_buffer_->has_buffered_data() && (this->output_transfer_buffer_->available() == 0)) {
+    if (!this->audio_source_->has_buffered_data() && (this->output_transfer_buffer_->available() == 0)) {
      return AudioResamplerState::FINISHED;
    }
  }
@@ -102,9 +129,11 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d
    delay(READ_WRITE_TIMEOUT_MS);
  }

-  this->input_transfer_buffer_->transfer_data_from_source(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS));
+  // Expose a chunk of the ring buffer's internal storage. pre_shift is ignored by RingBufferAudioSource
+  // (there is no intermediate transfer buffer to compact).
+  this->audio_source_->fill(pdMS_TO_TICKS(READ_WRITE_TIMEOUT_MS), false);

-  if (this->input_transfer_buffer_->available() == 0) {
+  if (this->audio_source_->available() == 0) {
    // No samples available to process
    return AudioResamplerState::RESAMPLING;
  }
@@ -112,17 +141,17 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d
  const size_t bytes_free = this->output_transfer_buffer_->free();
  const uint32_t frames_free = this->output_stream_info_.bytes_to_frames(bytes_free);

-  const size_t bytes_available = this->input_transfer_buffer_->available();
+  const size_t bytes_available = this->audio_source_->available();
  const uint32_t frames_available = this->input_stream_info_.bytes_to_frames(bytes_available);

  if ((this->input_stream_info_.get_sample_rate() != this->output_stream_info_.get_sample_rate()) ||
      (this->input_stream_info_.get_bits_per_sample() != this->output_stream_info_.get_bits_per_sample())) {
    // Adjust gain by -3 dB to avoid clipping due to the resampling process
    esp_audio_libs::resampler::ResamplerResults results =
-        this->resampler_->resample(this->input_transfer_buffer_->get_buffer_start(),
-                                   this->output_transfer_buffer_->get_buffer_end(), frames_available, frames_free, -3);
+        this->resampler_->resample(this->audio_source_->data(), this->output_transfer_buffer_->get_buffer_end(),
+                                   frames_available, frames_free, -3);

-    this->input_transfer_buffer_->decrease_buffer_length(this->input_stream_info_.frames_to_bytes(results.frames_used));
+    this->audio_source_->consume(this->input_stream_info_.frames_to_bytes(results.frames_used));
    this->output_transfer_buffer_->increase_buffer_length(
        this->output_stream_info_.frames_to_bytes(results.frames_generated));

@@ -146,10 +175,10 @@ AudioResamplerState AudioResampler::resample(bool stop_gracefully, int32_t *ms_d
    const size_t bytes_to_transfer = std::min(this->output_stream_info_.frames_to_bytes(frames_free),
                                              this->input_stream_info_.frames_to_bytes(frames_available));

-    std::memcpy((void *) this->output_transfer_buffer_->get_buffer_end(),
-                (void *) this->input_transfer_buffer_->get_buffer_start(), bytes_to_transfer);
+    std::memcpy((void *) this->output_transfer_buffer_->get_buffer_end(), (const void *) this->audio_source_->data(),
+                bytes_to_transfer);

-    this->input_transfer_buffer_->decrease_buffer_length(bytes_to_transfer);
+    this->audio_source_->consume(bytes_to_transfer);
    this->output_transfer_buffer_->increase_buffer_length(bytes_to_transfer);
  }

--- a/esphome/components/audio/audio_resampler.h
+++ b/esphome/components/audio/audio_resampler.h
@@ -22,7 +22,7 @@ namespace esphome::audio {
 enum class AudioResamplerState : uint8_t {
  RESAMPLING,  // More data is available to resample
  FINISHED,    // All file data has been resampled and transferred
-  FAILED,      // Unused state included for consistency among Audio classes
+  FAILED,      // Failed to allocate the audio source
 };

 class AudioResampler {
@@ -32,14 +32,16 @@ class AudioResampler {
   * component). Also supports converting bits per sample.
   */
 public:
-  /// @brief Allocates the input and output transfer buffers
-  /// @param input_buffer_size Size of the input transfer buffer in bytes.
+  /// @brief Allocates the output transfer buffer. The input source is created later in resample().
+  /// @param input_buffer_size Max bytes exposed per fill() call on the zero-copy input source.
  /// @param output_buffer_size Size of the output transfer buffer in bytes.
  AudioResampler(size_t input_buffer_size, size_t output_buffer_size);

-  /// @brief Adds a source ring buffer for audio data. Takes ownership of the ring buffer in a shared_ptr.
-  /// @param input_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership
-  /// @return ESP_OK if successsful, ESP_ERR_NO_MEM if the transfer buffer wasn't allocated
+  /// @brief Sets the ring buffer the audio is read from and takes shared ownership of it. The zero-copy
+  /// RingBufferAudioSource that reads directly from its internal storage is created lazily on the first
+  /// resample() call, so add_source() and start() may be called in any order.
+  /// @param input_ring_buffer weak_ptr of a shared_ptr of the source ring buffer to transfer ownership
+  /// @return ESP_OK if successful, ESP_ERR_INVALID_STATE if the ring buffer is no longer alive
  esp_err_t add_source(std::weak_ptr<ring_buffer::RingBuffer> &input_ring_buffer);

  /// @brief Adds a sink ring buffer for resampled audio. Takes ownership of the ring buffer in a shared_ptr.
@@ -78,7 +80,8 @@ class AudioResampler {
  void set_pause_output_state(bool pause_state) { this->pause_output_ = pause_state; }

 protected:
-  std::unique_ptr<AudioSourceTransferBuffer> input_transfer_buffer_;
+  std::shared_ptr<ring_buffer::RingBuffer> source_ring_buffer_;
+  std::unique_ptr<RingBufferAudioSource> audio_source_;
  std::unique_ptr<AudioSinkTransferBuffer> output_transfer_buffer_;

  size_t input_buffer_size_;