From f94735dc621dc494d7383233cdd904e631fbcdad Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 12 May 2026 20:38:39 -0500 Subject: [PATCH] [api][voice_assistant] Add second audio channel for voice_assistant (#16265) Co-authored-by: Kevin Ahrendt Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com> --- esphome/components/api/api.proto | 1 + esphome/components/api/api_pb2.cpp | 7 + esphome/components/api/api_pb2.h | 4 +- esphome/components/api/api_pb2_dump.cpp | 1 + .../components/voice_assistant/__init__.py | 38 +++-- .../voice_assistant/voice_assistant.cpp | 142 +++++++++++++++--- .../voice_assistant/voice_assistant.h | 8 + .../voice_assistant/common-idf.yaml | 14 +- .../voice_assistant/test.esp32-idf.yaml | 1 + 9 files changed, 179 insertions(+), 37 deletions(-) diff --git a/esphome/components/api/api.proto b/esphome/components/api/api.proto index 4d72be5407..f4f15c1042 100644 --- a/esphome/components/api/api.proto +++ b/esphome/components/api/api.proto @@ -2026,6 +2026,7 @@ message VoiceAssistantAudio { bytes data = 1 [(pointer_to_buffer) = true]; bool end = 2; + bytes data2 = 3 [(pointer_to_buffer) = true]; } enum VoiceAssistantTimerEvent { diff --git a/esphome/components/api/api_pb2.cpp b/esphome/components/api/api_pb2.cpp index 68be7550ee..c711ef167c 100644 --- a/esphome/components/api/api_pb2.cpp +++ b/esphome/components/api/api_pb2.cpp @@ -2893,6 +2893,11 @@ bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited this->data_len = value.size(); break; } + case 3: { + this->data2 = value.data(); + this->data2_len = value.size(); + break; + } default: return false; } @@ -2902,12 +2907,14 @@ uint8_t *VoiceAssistantAudio::encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG uint8_t *__restrict__ pos = buffer.get_pos(); ProtoEncode::encode_bytes(pos PROTO_ENCODE_DEBUG_ARG, 1, this->data, this->data_len); ProtoEncode::encode_bool(pos PROTO_ENCODE_DEBUG_ARG, 2, this->end); + ProtoEncode::encode_bytes(pos PROTO_ENCODE_DEBUG_ARG, 3, this->data2, this->data2_len); return pos; } uint32_t VoiceAssistantAudio::calculate_size() const { uint32_t size = 0; size += ProtoSize::calc_length(1, this->data_len); size += ProtoSize::calc_bool(1, this->end); + size += ProtoSize::calc_length(1, this->data2_len); return size; } bool VoiceAssistantTimerEventResponse::decode_varint(uint32_t field_id, proto_varint_value_t value) { diff --git a/esphome/components/api/api_pb2.h b/esphome/components/api/api_pb2.h index 7b82f1884d..7e926ee0d4 100644 --- a/esphome/components/api/api_pb2.h +++ b/esphome/components/api/api_pb2.h @@ -2436,13 +2436,15 @@ class VoiceAssistantEventResponse final : public ProtoDecodableMessage { class VoiceAssistantAudio final : public ProtoDecodableMessage { public: static constexpr uint8_t MESSAGE_TYPE = 106; - static constexpr uint8_t ESTIMATED_SIZE = 21; + static constexpr uint8_t ESTIMATED_SIZE = 40; #ifdef HAS_PROTO_MESSAGE_DUMP const LogString *message_name() const override { return LOG_STR("voice_assistant_audio"); } #endif const uint8_t *data{nullptr}; uint16_t data_len{0}; bool end{false}; + const uint8_t *data2{nullptr}; + uint16_t data2_len{0}; uint8_t *encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const; uint32_t calculate_size() const; #ifdef HAS_PROTO_MESSAGE_DUMP diff --git a/esphome/components/api/api_pb2_dump.cpp b/esphome/components/api/api_pb2_dump.cpp index 5258b355ce..850ad37bc9 100644 --- a/esphome/components/api/api_pb2_dump.cpp +++ b/esphome/components/api/api_pb2_dump.cpp @@ -2174,6 +2174,7 @@ const char *VoiceAssistantAudio::dump_to(DumpBuffer &out) const { MessageDumpHelper helper(out, ESPHOME_PSTR("VoiceAssistantAudio")); dump_bytes_field(out, ESPHOME_PSTR("data"), this->data, this->data_len); dump_field(out, ESPHOME_PSTR("end"), this->end); + dump_bytes_field(out, ESPHOME_PSTR("data2"), this->data2, this->data2_len); return out.c_str(); } const char *VoiceAssistantTimerEventResponse::dump_to(DumpBuffer &out) const { diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py index 9387797ba2..958d1cbf91 100644 --- a/esphome/components/voice_assistant/__init__.py +++ b/esphome/components/voice_assistant/__init__.py @@ -53,6 +53,8 @@ CONF_ON_TIMER_CANCELLED = "on_timer_cancelled" CONF_ON_TIMER_FINISHED = "on_timer_finished" CONF_ON_TIMER_TICK = "on_timer_tick" +MAX_MICROPHONE_SOURCES = 2 + voice_assistant_ns = cg.esphome_ns.namespace("voice_assistant") VoiceAssistant = voice_assistant_ns.class_("VoiceAssistant", cg.Component) @@ -90,13 +92,20 @@ CONFIG_SCHEMA = cv.All( cv.Schema( { cv.GenerateID(): cv.declare_id(VoiceAssistant), - cv.Optional( - CONF_MICROPHONE, default={} - ): microphone.microphone_source_schema( - min_bits_per_sample=16, - max_bits_per_sample=16, - min_channels=1, - max_channels=1, + cv.Optional(CONF_MICROPHONE, default=[{}]): cv.All( + cv.ensure_list( + microphone.microphone_source_schema( + min_bits_per_sample=16, + max_bits_per_sample=16, + min_channels=1, + max_channels=1, + ) + ), + cv.Length( + min=1, + max=MAX_MICROPHONE_SOURCES, + msg=f"Voice Assistant supports at most {MAX_MICROPHONE_SOURCES} microphone sources", + ), ), cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id( media_player.MediaPlayer @@ -179,10 +188,10 @@ CONFIG_SCHEMA = cv.All( FINAL_VALIDATE_SCHEMA = cv.All( cv.Schema( { - cv.Optional( - CONF_MICROPHONE - ): microphone.final_validate_microphone_source_schema( - "voice_assistant", sample_rate=16000 + cv.Optional(CONF_MICROPHONE): cv.ensure_list( + microphone.final_validate_microphone_source_schema( + "voice_assistant", sample_rate=16000 + ) ), }, extra=cv.ALLOW_EXTRA, @@ -194,9 +203,14 @@ async def to_code(config): var = cg.new_Pvariable(config[CONF_ID]) await cg.register_component(var, config) - mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE]) + mic_sources = config[CONF_MICROPHONE] + mic_source = await microphone.microphone_source_to_code(mic_sources[0]) cg.add(var.set_microphone_source(mic_source)) + if len(mic_sources) > 1: + mic_source2 = await microphone.microphone_source_to_code(mic_sources[1]) + cg.add(var.set_microphone_source2(mic_source2)) + if CONF_MICRO_WAKE_WORD in config: mww = await cg.get_variable(config[CONF_MICRO_WAKE_WORD]) cg.add(var.set_micro_wake_word(mww)) diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index 50a8265297..286e6645d2 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -31,11 +31,21 @@ VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; } void VoiceAssistant::setup() { this->mic_source_->add_data_callback([this](const std::vector &data) { std::shared_ptr temp_ring_buffer = this->ring_buffer_; - if (this->ring_buffer_.use_count() > 1) { + if (temp_ring_buffer != nullptr) { temp_ring_buffer->write((void *) data.data(), data.size()); } }); + // Second microphone channel + if (this->mic_source2_ != nullptr) { + this->mic_source2_->add_data_callback([this](const std::vector &data) { + std::shared_ptr temp_ring_buffer = this->ring_buffer2_; + if (temp_ring_buffer != nullptr) { + temp_ring_buffer->write((void *) data.data(), data.size()); + } + }); + } + #ifdef USE_MEDIA_PLAYER if (this->media_player_ != nullptr) { this->media_player_->add_on_state_callback([this](media_player::MediaPlayerState state) { @@ -115,9 +125,9 @@ bool VoiceAssistant::allocate_buffers_() { } #endif - if (this->ring_buffer_.use_count() == 0) { + if (this->ring_buffer_ == nullptr) { this->ring_buffer_ = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE); - if (this->ring_buffer_.use_count() == 0) { + if (this->ring_buffer_ == nullptr) { ESP_LOGE(TAG, "Could not allocate ring buffer"); return false; } @@ -132,6 +142,26 @@ bool VoiceAssistant::allocate_buffers_() { } } + // Second microphone channel + if (this->mic_source2_ != nullptr) { + if (this->ring_buffer2_ == nullptr) { + this->ring_buffer2_ = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE); + if (this->ring_buffer2_ == nullptr) { + ESP_LOGE(TAG, "Could not allocate second ring buffer"); + return false; + } + } + + if (this->send_buffer2_ == nullptr) { + RAMAllocator send_allocator; + this->send_buffer2_ = send_allocator.allocate(SEND_BUFFER_SIZE); + if (this->send_buffer2_ == nullptr) { + ESP_LOGW(TAG, "Could not allocate second send buffer"); + return false; + } + } + } + return true; } @@ -144,6 +174,15 @@ void VoiceAssistant::clear_buffers_() { this->ring_buffer_->reset(); } + // Second microphone channel + if (this->send_buffer2_ != nullptr) { + memset(this->send_buffer2_, 0, SEND_BUFFER_SIZE); + } + + if (this->ring_buffer2_ != nullptr) { + this->ring_buffer2_->reset(); + } + #ifdef USE_SPEAKER if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) { memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE); @@ -162,10 +201,17 @@ void VoiceAssistant::deallocate_buffers_() { this->send_buffer_ = nullptr; } - if (this->ring_buffer_.use_count() > 0) { - this->ring_buffer_.reset(); + this->ring_buffer_.reset(); + + // Second microphone channel + if (this->send_buffer2_ != nullptr) { + RAMAllocator send_deallocator; + send_deallocator.deallocate(this->send_buffer2_, SEND_BUFFER_SIZE); + this->send_buffer2_ = nullptr; } + this->ring_buffer2_.reset(); + #ifdef USE_SPEAKER if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) { RAMAllocator speaker_deallocator; @@ -183,7 +229,8 @@ void VoiceAssistant::reset_conversation_id() { void VoiceAssistant::loop() { if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE && this->state_ != State::STOPPING_MICROPHONE) { - if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) { + if (this->mic_source_->is_running() || (this->mic_source2_ && this->mic_source2_->is_running()) || + this->state_ == State::STARTING_MICROPHONE) { this->set_state_(State::STOP_MICROPHONE, State::IDLE); } else { this->set_state_(State::IDLE, State::IDLE); @@ -215,11 +262,14 @@ void VoiceAssistant::loop() { this->clear_buffers_(); this->mic_source_->start(); + if (this->mic_source2_) { + this->mic_source2_->start(); + } this->set_state_(State::STARTING_MICROPHONE); break; } case State::STARTING_MICROPHONE: { - if (this->mic_source_->is_running()) { + if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) { this->set_state_(this->desired_state_); } break; @@ -266,15 +316,44 @@ void VoiceAssistant::loop() { break; // State changed when udp server port received } case State::STREAMING_MICROPHONE: { - size_t available = this->ring_buffer_->available(); - while (available >= SEND_BUFFER_SIZE) { - size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0); - if (this->audio_mode_ == AUDIO_MODE_API) { + if (this->audio_mode_ == AUDIO_MODE_API) { + // API audio + // Both microphone channels are sent, if configured + bool is_available = this->ring_buffer_->available() >= SEND_BUFFER_SIZE; + bool is_available2 = false; + if (this->mic_source2_) { + is_available2 = this->ring_buffer2_->available() >= SEND_BUFFER_SIZE; + } + + while (is_available || is_available2) { api::VoiceAssistantAudio msg; - msg.data = this->send_buffer_; - msg.data_len = read_bytes; + + if (is_available) { + size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0); + msg.data = this->send_buffer_; + msg.data_len = read_bytes; + } + + // Second microphone channel + if (is_available2) { + size_t read_bytes = this->ring_buffer2_->read((void *) this->send_buffer2_, SEND_BUFFER_SIZE, 0); + msg.data2 = this->send_buffer2_; + msg.data2_len = read_bytes; + } + this->api_client_->send_message(msg); - } else { + is_available = this->ring_buffer_->available() >= SEND_BUFFER_SIZE; + if (this->mic_source2_) { + is_available2 = this->ring_buffer2_->available() >= SEND_BUFFER_SIZE; + } else { + is_available2 = false; + } + } + } else { + // UDP (will eventually be deprecated) + // Only the primary microphone channel is used + while (this->ring_buffer_->available() >= SEND_BUFFER_SIZE) { + size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0); if (!this->udp_socket_running_) { if (!this->start_udp_socket_()) { this->set_state_(State::STOP_MICROPHONE, State::IDLE); @@ -284,14 +363,23 @@ void VoiceAssistant::loop() { this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_, sizeof(this->dest_addr_)); } - available = this->ring_buffer_->available(); - } - + } // audio mode break; } case State::STOP_MICROPHONE: { - if (this->mic_source_->is_running()) { - this->mic_source_->stop(); + // Check both microphone channels + bool is_running = this->mic_source_->is_running(); + bool is_running2 = false; + if (this->mic_source2_) { + is_running2 = this->mic_source2_->is_running(); + } + if (is_running || is_running2) { + if (is_running) { + this->mic_source_->stop(); + } + if (is_running2) { + this->mic_source2_->stop(); + } this->set_state_(State::STOPPING_MICROPHONE); } else { this->set_state_(this->desired_state_); @@ -299,7 +387,13 @@ void VoiceAssistant::loop() { break; } case State::STOPPING_MICROPHONE: { - if (this->mic_source_->is_stopped()) { + // Check both microphone channels + bool is_stopped = this->mic_source_->is_stopped(); + bool is_stopped2 = true; + if (this->mic_source2_) { + is_stopped2 = this->mic_source2_->is_stopped(); + } + if (is_stopped && is_stopped2) { this->set_state_(this->desired_state_); } break; @@ -504,7 +598,8 @@ void VoiceAssistant::start_streaming() { ESP_LOGD(TAG, "Client started, streaming microphone"); this->audio_mode_ = AUDIO_MODE_API; - if (this->mic_source_->is_running()) { + // Both microphone channels + if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) { this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE); } else { this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE); @@ -520,6 +615,10 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por ESP_LOGD(TAG, "Client started, streaming microphone"); this->audio_mode_ = AUDIO_MODE_UDP; + if (this->mic_source2_ != nullptr) { + ESP_LOGW(TAG, "UDP audio mode does not support a second microphone channel; only the primary will be streamed"); + } + memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_)); if (this->dest_addr_.ss_family == AF_INET) { ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port); @@ -534,6 +633,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por return; } + // Only primary microphone channel over UDP if (this->mic_source_->is_running()) { this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE); } else { diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h index 3de4673001..c4fa7eb615 100644 --- a/esphome/components/voice_assistant/voice_assistant.h +++ b/esphome/components/voice_assistant/voice_assistant.h @@ -40,6 +40,7 @@ enum VoiceAssistantFeature : uint32_t { FEATURE_TIMERS = 1 << 3, FEATURE_ANNOUNCE = 1 << 4, FEATURE_START_CONVERSATION = 1 << 5, + FEATURE_MULTI_CHANNEL_AUDIO = 1 << 6, }; enum class State { @@ -120,6 +121,7 @@ class VoiceAssistant : public Component { void failed_to_start(); void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; } + void set_microphone_source2(microphone::MicrophoneSource *mic_source2) { this->mic_source2_ = mic_source2; } #ifdef USE_MICRO_WAKE_WORD void set_micro_wake_word(micro_wake_word::MicroWakeWord *mww) { this->micro_wake_word_ = mww; } #endif @@ -149,6 +151,9 @@ class VoiceAssistant : public Component { uint32_t flags = 0; flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT; flags |= VoiceAssistantFeature::FEATURE_API_AUDIO; + if (this->mic_source2_ != nullptr) { + flags |= VoiceAssistantFeature::FEATURE_MULTI_CHANNEL_AUDIO; + } #ifdef USE_SPEAKER if (this->speaker_ != nullptr) { flags |= VoiceAssistantFeature::FEATURE_SPEAKER; @@ -276,6 +281,7 @@ class VoiceAssistant : public Component { bool timer_tick_running_{false}; microphone::MicrophoneSource *mic_source_{nullptr}; + microphone::MicrophoneSource *mic_source2_{nullptr}; #ifdef USE_SPEAKER void write_speaker_(); speaker::Speaker *speaker_{nullptr}; @@ -301,6 +307,7 @@ class VoiceAssistant : public Component { std::string wake_word_; std::shared_ptr ring_buffer_; + std::shared_ptr ring_buffer2_; bool use_wake_word_; uint8_t noise_suppression_level_; @@ -309,6 +316,7 @@ class VoiceAssistant : public Component { uint32_t conversation_timeout_; uint8_t *send_buffer_{nullptr}; + uint8_t *send_buffer2_{nullptr}; bool continuous_{false}; bool silence_detection_; diff --git a/tests/components/voice_assistant/common-idf.yaml b/tests/components/voice_assistant/common-idf.yaml index 8565683700..0fa0903370 100644 --- a/tests/components/voice_assistant/common-idf.yaml +++ b/tests/components/voice_assistant/common-idf.yaml @@ -31,6 +31,11 @@ microphone: i2s_din_pin: ${i2s_din_pin} adc_type: external pdm: false + - platform: i2s_audio + id: mic_id_external2 + i2s_din_pin: ${i2s_din_pin2} + adc_type: external + pdm: false speaker: - platform: i2s_audio @@ -40,9 +45,12 @@ speaker: voice_assistant: microphone: - microphone: mic_id_external - gain_factor: 4 - channels: 0 + - microphone: mic_id_external + gain_factor: 4 + channels: 0 + - microphone: mic_id_external2 + gain_factor: 4 + channels: 0 speaker: speaker_id micro_wake_word: mww_id conversation_timeout: 60s diff --git a/tests/components/voice_assistant/test.esp32-idf.yaml b/tests/components/voice_assistant/test.esp32-idf.yaml index 1c5c9ddf99..0cc670a77e 100644 --- a/tests/components/voice_assistant/test.esp32-idf.yaml +++ b/tests/components/voice_assistant/test.esp32-idf.yaml @@ -3,6 +3,7 @@ substitutions: i2s_bclk_pin: GPIO5 i2s_mclk_pin: GPIO15 i2s_din_pin: GPIO13 + i2s_din_pin2: GPIO14 i2s_dout_pin: GPIO12 <<: !include common-idf.yaml