mirror of
https://github.com/esphome/esphome.git
synced 2026-06-24 16:04:55 +00:00
[micro_wake_word] Use RingBufferAudioSource (#16595)
Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -33,7 +33,8 @@ static const uint32_t INFERENCE_TASK_STACK_SIZE = 3072;
|
||||
static const UBaseType_t INFERENCE_TASK_PRIORITY = 3;
|
||||
|
||||
enum EventGroupBits : uint32_t {
|
||||
COMMAND_STOP = (1 << 0), // Signals the inference task should stop
|
||||
COMMAND_STOP = (1 << 0), // Signals the inference task should stop
|
||||
COMMAND_RESET_RING_BUFFER = (1 << 1), // Signals the inference task to discard buffered audio
|
||||
|
||||
TASK_STARTING = (1 << 3),
|
||||
TASK_RUNNING = (1 << 4),
|
||||
@@ -114,13 +115,13 @@ void MicroWakeWord::setup() {
|
||||
}
|
||||
std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer_.lock();
|
||||
if (this->ring_buffer_.use_count() > 1) {
|
||||
size_t bytes_free = temp_ring_buffer->free();
|
||||
|
||||
if (bytes_free < data.size()) {
|
||||
xEventGroupSetBits(this->event_group_, EventGroupBits::WARNING_FULL_RING_BUFFER);
|
||||
temp_ring_buffer->reset();
|
||||
// Producer-only write: never touches consumer state. If the buffer is full, ask the inference task
|
||||
// to drain it - reset() is a consumer operation and must run on the inference task's thread.
|
||||
// Disable partial writes so audio chunks are either fully accepted or rejected and handled below.
|
||||
if (temp_ring_buffer->write_without_replacement(data.data(), data.size(), 0, false) == 0) {
|
||||
xEventGroupSetBits(this->event_group_,
|
||||
EventGroupBits::WARNING_FULL_RING_BUFFER | EventGroupBits::COMMAND_RESET_RING_BUFFER);
|
||||
}
|
||||
temp_ring_buffer->write((void *) data.data(), data.size());
|
||||
}
|
||||
});
|
||||
|
||||
@@ -146,56 +147,65 @@ void MicroWakeWord::inference_task(void *params) {
|
||||
|
||||
{ // Ensures any C++ objects fall out of scope to deallocate before deleting the task
|
||||
|
||||
const size_t new_bytes_to_process =
|
||||
this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(this_mww->features_step_size_);
|
||||
std::unique_ptr<audio::AudioSourceTransferBuffer> audio_buffer;
|
||||
const auto &stream_info = this_mww->microphone_source_->get_audio_stream_info();
|
||||
const size_t bytes_per_frame = stream_info.frames_to_bytes(1);
|
||||
const size_t max_fill_bytes = stream_info.ms_to_bytes(this_mww->features_step_size_);
|
||||
std::unique_ptr<audio::RingBufferAudioSource> audio_source;
|
||||
int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE];
|
||||
|
||||
if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
|
||||
// Allocate audio transfer buffer
|
||||
audio_buffer = audio::AudioSourceTransferBuffer::create(new_bytes_to_process);
|
||||
|
||||
if (audio_buffer == nullptr) {
|
||||
// Round ring buffer size down to a frame multiple so the wrap boundary never splits an int16 sample.
|
||||
const size_t ring_buffer_size =
|
||||
(stream_info.ms_to_bytes(RING_BUFFER_DURATION_MS) / bytes_per_frame) * bytes_per_frame;
|
||||
std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(ring_buffer_size);
|
||||
if (temp_ring_buffer == nullptr) {
|
||||
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
|
||||
} else {
|
||||
audio_source = audio::RingBufferAudioSource::create(temp_ring_buffer, max_fill_bytes,
|
||||
static_cast<uint8_t>(bytes_per_frame));
|
||||
if (audio_source == nullptr) {
|
||||
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
|
||||
} else {
|
||||
this_mww->ring_buffer_ = temp_ring_buffer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
|
||||
// Allocate ring buffer
|
||||
std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = ring_buffer::RingBuffer::create(
|
||||
this_mww->microphone_source_->get_audio_stream_info().ms_to_bytes(RING_BUFFER_DURATION_MS));
|
||||
if (temp_ring_buffer.use_count() == 0) {
|
||||
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_MEMORY);
|
||||
}
|
||||
audio_buffer->set_source(temp_ring_buffer);
|
||||
this_mww->ring_buffer_ = temp_ring_buffer;
|
||||
}
|
||||
|
||||
if (!(xEventGroupGetBits(this_mww->event_group_) & ERROR_BITS)) {
|
||||
this_mww->microphone_source_->start();
|
||||
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::TASK_RUNNING);
|
||||
|
||||
while (!(xEventGroupGetBits(this_mww->event_group_) & COMMAND_STOP)) {
|
||||
audio_buffer->transfer_data_from_source(pdMS_TO_TICKS(DATA_TIMEOUT_MS));
|
||||
|
||||
if (audio_buffer->available() < new_bytes_to_process) {
|
||||
// Insufficient data to generate new spectrogram features, read more next iteration
|
||||
continue;
|
||||
while (!(xEventGroupGetBits(this_mww->event_group_) & (COMMAND_STOP | ERROR_BITS))) {
|
||||
if (xEventGroupGetBits(this_mww->event_group_) & EventGroupBits::COMMAND_RESET_RING_BUFFER) {
|
||||
// Producer asked us to drain; run the consumer-side reset from this thread.
|
||||
audio_source->clear_buffered_data();
|
||||
xEventGroupClearBits(this_mww->event_group_, EventGroupBits::COMMAND_RESET_RING_BUFFER);
|
||||
}
|
||||
|
||||
// Generate new spectrogram features
|
||||
uint32_t processed_samples = this_mww->generate_features_(
|
||||
(int16_t *) audio_buffer->get_buffer_start(), audio_buffer->available() / sizeof(int16_t), features_buffer);
|
||||
audio_buffer->decrease_buffer_length(processed_samples * sizeof(int16_t));
|
||||
audio_source->fill(pdMS_TO_TICKS(DATA_TIMEOUT_MS), false);
|
||||
|
||||
// Run inference using the new spectorgram features
|
||||
if (!this_mww->update_model_probabilities_(features_buffer)) {
|
||||
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_INFERENCE);
|
||||
break;
|
||||
// The frontend buffers samples internally and only emits a feature once it has a full window, so we can
|
||||
// hand it whatever the source exposes. The frontend consumes at least one sample per call, so available()
|
||||
// strictly decreases and this loop always terminates.
|
||||
while (audio_source->available() >= sizeof(int16_t)) {
|
||||
const size_t samples_available = audio_source->available() / sizeof(int16_t);
|
||||
const int16_t *audio_data = reinterpret_cast<const int16_t *>(audio_source->data());
|
||||
|
||||
size_t processed_samples = 0;
|
||||
const bool feature_generated =
|
||||
this_mww->generate_features_(audio_data, samples_available, features_buffer, &processed_samples);
|
||||
audio_source->consume(processed_samples * sizeof(int16_t));
|
||||
|
||||
if (feature_generated) {
|
||||
if (!this_mww->update_model_probabilities_(features_buffer)) {
|
||||
xEventGroupSetBits(this_mww->event_group_, EventGroupBits::ERROR_INFERENCE);
|
||||
break;
|
||||
}
|
||||
|
||||
// Process each model's probabilities and possibly send a Detection Event to the queue
|
||||
this_mww->process_probabilities_();
|
||||
}
|
||||
}
|
||||
|
||||
// Process each model's probabilities and possibly send a Detection Event to the queue
|
||||
this_mww->process_probabilities_();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -386,11 +396,15 @@ void MicroWakeWord::set_state_(State state) {
|
||||
}
|
||||
}
|
||||
|
||||
size_t MicroWakeWord::generate_features_(int16_t *audio_buffer, size_t samples_available,
|
||||
int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE]) {
|
||||
size_t processed_samples = 0;
|
||||
bool MicroWakeWord::generate_features_(const int16_t *audio_buffer, size_t samples_available,
|
||||
int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE], size_t *processed_samples) {
|
||||
*processed_samples = 0;
|
||||
struct FrontendOutput frontend_output =
|
||||
FrontendProcessSamples(&this->frontend_state_, audio_buffer, samples_available, &processed_samples);
|
||||
FrontendProcessSamples(&this->frontend_state_, audio_buffer, samples_available, processed_samples);
|
||||
|
||||
if (frontend_output.size == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < frontend_output.size; ++i) {
|
||||
// These scaling values are set to match the TFLite audio frontend int8 output.
|
||||
@@ -415,7 +429,7 @@ size_t MicroWakeWord::generate_features_(int16_t *audio_buffer, size_t samples_a
|
||||
features_buffer[i] = static_cast<int8_t>(clamp<int32_t>(value, INT8_MIN, INT8_MAX));
|
||||
}
|
||||
|
||||
return processed_samples;
|
||||
return true;
|
||||
}
|
||||
|
||||
void MicroWakeWord::process_probabilities_() {
|
||||
|
||||
@@ -115,13 +115,16 @@ class MicroWakeWord : public Component
|
||||
|
||||
void set_state_(State state);
|
||||
|
||||
/// @brief Generates spectrogram features from an input buffer of audio samples
|
||||
/// @param audio_buffer (int16_t *) Buffer containing input audio samples
|
||||
/// @param samples_available (size_t) Number of samples avaiable in the input buffer
|
||||
/// @param features_buffer (int8_t *) Buffer to store generated features
|
||||
/// @return (size_t) Number of samples processed from the input buffer
|
||||
size_t generate_features_(int16_t *audio_buffer, size_t samples_available,
|
||||
int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE]);
|
||||
/// @brief Generates a spectrogram feature from an input buffer of audio samples. The frontend buffers samples
|
||||
/// internally, so callers may stream arbitrary-sized chunks; a feature is only emitted once enough samples have
|
||||
/// accumulated to fill a full analysis window.
|
||||
/// @param audio_buffer (const int16_t *) Buffer containing input audio samples
|
||||
/// @param samples_available (size_t) Number of samples available in the input buffer
|
||||
/// @param features_buffer (int8_t *) Buffer to store the generated feature, valid only when the return value is true
|
||||
/// @param processed_samples (size_t *) Set to the number of samples consumed from the input buffer
|
||||
/// @return True if a new feature was generated; false if more samples are required
|
||||
bool generate_features_(const int16_t *audio_buffer, size_t samples_available,
|
||||
int8_t features_buffer[PREPROCESSOR_FEATURE_SIZE], size_t *processed_samples);
|
||||
|
||||
/// @brief Processes any new probabilities for each model. If any wake word is detected, it will send a DetectionEvent
|
||||
/// to the detection_queue_.
|
||||
|
||||
Reference in New Issue
Block a user