From 857e529803d10971ffd36089068b6590fac3dc6f Mon Sep 17 00:00:00 2001 From: Kevin Ahrendt Date: Mon, 4 May 2026 18:41:50 -0400 Subject: [PATCH] [audio] Use the microMP3 library instead of esp-audio-libs (#16236) --- esphome/components/audio/__init__.py | 1 + esphome/components/audio/audio_decoder.cpp | 108 ++++++++++----------- esphome/components/audio/audio_decoder.h | 13 +-- esphome/idf_component.yml | 2 + 4 files changed, 61 insertions(+), 63 deletions(-) diff --git a/esphome/components/audio/__init__.py b/esphome/components/audio/__init__.py index 8528e77ae7..60ff40ea4b 100644 --- a/esphome/components/audio/__init__.py +++ b/esphome/components/audio/__init__.py @@ -395,6 +395,7 @@ async def to_code(config): ) if data.mp3_support: cg.add_define("USE_AUDIO_MP3_SUPPORT") + add_idf_component(name="esphome/micro-mp3", ref="0.2.0") _emit_memory_pair( data.mp3.buffer_memory, "CONFIG_MP3_DECODER_PREFER_PSRAM", diff --git a/esphome/components/audio/audio_decoder.cpp b/esphome/components/audio/audio_decoder.cpp index baa4c41c06..65a4db4e10 100644 --- a/esphome/components/audio/audio_decoder.cpp +++ b/esphome/components/audio/audio_decoder.cpp @@ -20,14 +20,6 @@ AudioDecoder::AudioDecoder(size_t input_buffer_size, size_t output_buffer_size) this->output_transfer_buffer_ = AudioSinkTransferBuffer::create(output_buffer_size); } -AudioDecoder::~AudioDecoder() { -#ifdef USE_AUDIO_MP3_SUPPORT - if (this->audio_file_type_ == AudioFileType::MP3) { - esp_audio_libs::helix_decoder::MP3FreeDecoder(this->mp3_decoder_); - } -#endif -} - esp_err_t AudioDecoder::add_source(std::weak_ptr &input_ring_buffer) { auto source = AudioSourceTransferBuffer::create(this->input_buffer_size_); if (source == nullptr) { @@ -92,13 +84,10 @@ esp_err_t AudioDecoder::start(AudioFileType audio_file_type) { #endif #ifdef USE_AUDIO_MP3_SUPPORT case AudioFileType::MP3: - this->mp3_decoder_ = esp_audio_libs::helix_decoder::MP3InitDecoder(); - - // MP3 always has 1152 samples per chunk - this->free_buffer_required_ = 1152 * sizeof(int16_t) * 2; // samples * size per sample * channels - - // Always reallocate the output transfer buffer to the smallest necessary size - this->output_transfer_buffer_->reallocate(this->free_buffer_required_); + this->mp3_decoder_ = make_unique(); + this->free_buffer_required_ = + this->output_transfer_buffer_->capacity(); // Adjusted and reallocated after reading the header + this->decoder_buffers_internally_ = true; break; #endif #ifdef USE_AUDIO_OPUS_SUPPORT @@ -312,51 +301,56 @@ FileDecoderState AudioDecoder::decode_flac_() { #ifdef USE_AUDIO_MP3_SUPPORT FileDecoderState AudioDecoder::decode_mp3_() { - // Look for the next sync word - int buffer_length = (int) this->input_buffer_->available(); - int32_t offset = esp_audio_libs::helix_decoder::MP3FindSyncWord(this->input_buffer_->data(), buffer_length); + // microMP3's samples_decoded value is samples per channel; e.g., what ESPHome typically calls an audio frame. + // microMP3 uses the term frame to refer to an MP3 frame: an encoded packet that contains multiple audio frames. + size_t bytes_consumed = 0; + size_t samples_decoded = 0; - if (offset < 0) { - // New data may have the sync word - this->input_buffer_->consume(buffer_length); + // microMP3 buffers internally: it consumes from our input buffer at its own pace, emits MP3_STREAM_INFO_READY once + // the first frame header is parsed, and only then produces PCM. It handles sync-word search and ID3v2 tag skipping. + micro_mp3::Mp3Result result = this->mp3_decoder_->decode( + this->input_buffer_->data(), this->input_buffer_->available(), this->output_transfer_buffer_->get_buffer_end(), + this->output_transfer_buffer_->free(), bytes_consumed, samples_decoded); + + this->input_buffer_->consume(bytes_consumed); + + if (result == micro_mp3::MP3_OK) { + if (samples_decoded > 0 && this->audio_stream_info_.has_value()) { + this->output_transfer_buffer_->increase_buffer_length( + this->audio_stream_info_.value().frames_to_bytes(samples_decoded)); + } + } else if (result == micro_mp3::MP3_STREAM_INFO_READY) { + // First successful header parse: capture stream info and resize the output buffer to fit one full frame. + // microMP3 always outputs 16-bit PCM. + this->audio_stream_info_ = + audio::AudioStreamInfo(16, this->mp3_decoder_->get_channels(), this->mp3_decoder_->get_sample_rate()); + this->free_buffer_required_ = + this->mp3_decoder_->get_samples_per_frame() * this->mp3_decoder_->get_channels() * sizeof(int16_t); + if (!this->output_transfer_buffer_->reallocate(this->free_buffer_required_)) { + return FileDecoderState::FAILED; + } + } else if (result == micro_mp3::MP3_NEED_MORE_DATA) { + return FileDecoderState::MORE_TO_PROCESS; + } else if (result == micro_mp3::MP3_OUTPUT_BUFFER_TOO_SMALL) { + // Reallocate to decode the frame on the next call + if (this->mp3_decoder_->get_channels() > 0) { + this->free_buffer_required_ = + this->mp3_decoder_->get_samples_per_frame() * this->mp3_decoder_->get_channels() * sizeof(int16_t); + } else { + // Fallback to worst-case size if channel info isn't available + this->free_buffer_required_ = this->mp3_decoder_->get_min_output_buffer_bytes(); + } + if (!this->output_transfer_buffer_->reallocate(this->free_buffer_required_)) { + return FileDecoderState::FAILED; + } + } else if (result == micro_mp3::MP3_DECODE_ERROR) { + // Corrupt frame skipped; recoverable, retry on next call + ESP_LOGW(TAG, "MP3 decoder skipped a corrupt frame"); return FileDecoderState::POTENTIALLY_FAILED; - } - - // Advance read pointer to match the offset for the syncword - this->input_buffer_->consume(offset); - const uint8_t *buffer_start = this->input_buffer_->data(); - - buffer_length = (int) this->input_buffer_->available(); - int err = esp_audio_libs::helix_decoder::MP3Decode(this->mp3_decoder_, &buffer_start, &buffer_length, - (int16_t *) this->output_transfer_buffer_->get_buffer_end(), 0); - - size_t consumed = this->input_buffer_->available() - buffer_length; - this->input_buffer_->consume(consumed); - - if (err) { - switch (err) { - case esp_audio_libs::helix_decoder::ERR_MP3_OUT_OF_MEMORY: - [[fallthrough]]; - case esp_audio_libs::helix_decoder::ERR_MP3_NULL_POINTER: - return FileDecoderState::FAILED; - break; - default: - // Most errors are recoverable by moving on to the next frame, so mark as potentailly failed - return FileDecoderState::POTENTIALLY_FAILED; - break; - } } else { - esp_audio_libs::helix_decoder::MP3FrameInfo mp3_frame_info; - esp_audio_libs::helix_decoder::MP3GetLastFrameInfo(this->mp3_decoder_, &mp3_frame_info); - if (mp3_frame_info.outputSamps > 0) { - int bytes_per_sample = (mp3_frame_info.bitsPerSample / 8); - this->output_transfer_buffer_->increase_buffer_length(mp3_frame_info.outputSamps * bytes_per_sample); - - if (!this->audio_stream_info_.has_value()) { - this->audio_stream_info_ = - audio::AudioStreamInfo(mp3_frame_info.bitsPerSample, mp3_frame_info.nChans, mp3_frame_info.samprate); - } - } + // MP3_ALLOCATION_FAILED, MP3_INPUT_INVALID, or any future error -- not recoverable + ESP_LOGE(TAG, "MP3 decoder failed: %d", static_cast(result)); + return FileDecoderState::FAILED; } return FileDecoderState::MORE_TO_PROCESS; diff --git a/esphome/components/audio/audio_decoder.h b/esphome/components/audio/audio_decoder.h index 6e3a228a68..4cbe8b6720 100644 --- a/esphome/components/audio/audio_decoder.h +++ b/esphome/components/audio/audio_decoder.h @@ -16,9 +16,6 @@ #include "esp_err.h" // esp-audio-libs -#ifdef USE_AUDIO_MP3_SUPPORT -#include -#endif #include // micro-flac @@ -26,6 +23,11 @@ #include #endif +// micro-mp3 +#ifdef USE_AUDIO_MP3_SUPPORT +#include +#endif + // micro-opus #ifdef USE_AUDIO_OPUS_SUPPORT #include @@ -62,8 +64,7 @@ class AudioDecoder { /// @param output_buffer_size Size of the output transfer buffer in bytes. AudioDecoder(size_t input_buffer_size, size_t output_buffer_size); - /// @brief Deallocates the MP3 decoder (the flac, opus, and wav decoders are deallocated automatically) - ~AudioDecoder(); + ~AudioDecoder() = default; /// @brief Adds a source ring buffer for raw file data. Takes ownership of the ring buffer in a shared_ptr. /// @param input_ring_buffer weak_ptr of a shared_ptr of the sink ring buffer to transfer ownership @@ -125,7 +126,7 @@ class AudioDecoder { #endif #ifdef USE_AUDIO_MP3_SUPPORT FileDecoderState decode_mp3_(); - esp_audio_libs::helix_decoder::HMP3Decoder mp3_decoder_; + std::unique_ptr mp3_decoder_; #endif #ifdef USE_AUDIO_OPUS_SUPPORT FileDecoderState decode_opus_(); diff --git a/esphome/idf_component.yml b/esphome/idf_component.yml index f5a8dd8c60..5ad9090215 100644 --- a/esphome/idf_component.yml +++ b/esphome/idf_component.yml @@ -9,6 +9,8 @@ dependencies: version: 0.2.0 esphome/micro-flac: version: 0.1.1 + esphome/micro-mp3: + version: 0.2.0 esphome/micro-opus: version: 0.4.0 espressif/esp-dsp: