From f94735dc621dc494d7383233cdd904e631fbcdad Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Tue, 12 May 2026 20:38:39 -0500
Subject: [PATCH] [api][voice_assistant] Add second audio channel for
 voice_assistant (#16265)

Co-authored-by: Kevin Ahrendt <kevin.ahrendt@openhomefoundation.org>
Co-authored-by: Jesse Hills <3060199+jesserockz@users.noreply.github.com>
---
 esphome/components/api/api.proto              |   1 +
 esphome/components/api/api_pb2.cpp            |   7 +
 esphome/components/api/api_pb2.h              |   4 +-
 esphome/components/api/api_pb2_dump.cpp       |   1 +
 .../components/voice_assistant/__init__.py    |  38 +++--
 .../voice_assistant/voice_assistant.cpp       | 142 +++++++++++++++---
 .../voice_assistant/voice_assistant.h         |   8 +
 .../voice_assistant/common-idf.yaml           |  14 +-
 .../voice_assistant/test.esp32-idf.yaml       |   1 +
 9 files changed, 179 insertions(+), 37 deletions(-)

diff --git a/esphome/components/api/api.proto b/esphome/components/api/api.proto
index 4d72be5407..f4f15c1042 100644
--- a/esphome/components/api/api.proto
+++ b/esphome/components/api/api.proto
@@ -2026,6 +2026,7 @@ message VoiceAssistantAudio {
 
   bytes data = 1 [(pointer_to_buffer) = true];
   bool end = 2;
+  bytes data2 = 3 [(pointer_to_buffer) = true];
 }
 
 enum VoiceAssistantTimerEvent {
diff --git a/esphome/components/api/api_pb2.cpp b/esphome/components/api/api_pb2.cpp
index 68be7550ee..c711ef167c 100644
--- a/esphome/components/api/api_pb2.cpp
+++ b/esphome/components/api/api_pb2.cpp
@@ -2893,6 +2893,11 @@ bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited
       this->data_len = value.size();
       break;
     }
+    case 3: {
+      this->data2 = value.data();
+      this->data2_len = value.size();
+      break;
+    }
     default:
       return false;
   }
@@ -2902,12 +2907,14 @@ uint8_t *VoiceAssistantAudio::encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG
   uint8_t *__restrict__ pos = buffer.get_pos();
   ProtoEncode::encode_bytes(pos PROTO_ENCODE_DEBUG_ARG, 1, this->data, this->data_len);
   ProtoEncode::encode_bool(pos PROTO_ENCODE_DEBUG_ARG, 2, this->end);
+  ProtoEncode::encode_bytes(pos PROTO_ENCODE_DEBUG_ARG, 3, this->data2, this->data2_len);
   return pos;
 }
 uint32_t VoiceAssistantAudio::calculate_size() const {
   uint32_t size = 0;
   size += ProtoSize::calc_length(1, this->data_len);
   size += ProtoSize::calc_bool(1, this->end);
+  size += ProtoSize::calc_length(1, this->data2_len);
   return size;
 }
 bool VoiceAssistantTimerEventResponse::decode_varint(uint32_t field_id, proto_varint_value_t value) {
diff --git a/esphome/components/api/api_pb2.h b/esphome/components/api/api_pb2.h
index 7b82f1884d..7e926ee0d4 100644
--- a/esphome/components/api/api_pb2.h
+++ b/esphome/components/api/api_pb2.h
@@ -2436,13 +2436,15 @@ class VoiceAssistantEventResponse final : public ProtoDecodableMessage {
 class VoiceAssistantAudio final : public ProtoDecodableMessage {
  public:
   static constexpr uint8_t MESSAGE_TYPE = 106;
-  static constexpr uint8_t ESTIMATED_SIZE = 21;
+  static constexpr uint8_t ESTIMATED_SIZE = 40;
 #ifdef HAS_PROTO_MESSAGE_DUMP
   const LogString *message_name() const override { return LOG_STR("voice_assistant_audio"); }
 #endif
   const uint8_t *data{nullptr};
   uint16_t data_len{0};
   bool end{false};
+  const uint8_t *data2{nullptr};
+  uint16_t data2_len{0};
   uint8_t *encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const;
   uint32_t calculate_size() const;
 #ifdef HAS_PROTO_MESSAGE_DUMP
diff --git a/esphome/components/api/api_pb2_dump.cpp b/esphome/components/api/api_pb2_dump.cpp
index 5258b355ce..850ad37bc9 100644
--- a/esphome/components/api/api_pb2_dump.cpp
+++ b/esphome/components/api/api_pb2_dump.cpp
@@ -2174,6 +2174,7 @@ const char *VoiceAssistantAudio::dump_to(DumpBuffer &out) const {
   MessageDumpHelper helper(out, ESPHOME_PSTR("VoiceAssistantAudio"));
   dump_bytes_field(out, ESPHOME_PSTR("data"), this->data, this->data_len);
   dump_field(out, ESPHOME_PSTR("end"), this->end);
+  dump_bytes_field(out, ESPHOME_PSTR("data2"), this->data2, this->data2_len);
   return out.c_str();
 }
 const char *VoiceAssistantTimerEventResponse::dump_to(DumpBuffer &out) const {
diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py
index 9387797ba2..958d1cbf91 100644
--- a/esphome/components/voice_assistant/__init__.py
+++ b/esphome/components/voice_assistant/__init__.py
@@ -53,6 +53,8 @@ CONF_ON_TIMER_CANCELLED = "on_timer_cancelled"
 CONF_ON_TIMER_FINISHED = "on_timer_finished"
 CONF_ON_TIMER_TICK = "on_timer_tick"
 
+MAX_MICROPHONE_SOURCES = 2
+
 
 voice_assistant_ns = cg.esphome_ns.namespace("voice_assistant")
 VoiceAssistant = voice_assistant_ns.class_("VoiceAssistant", cg.Component)
@@ -90,13 +92,20 @@ CONFIG_SCHEMA = cv.All(
     cv.Schema(
         {
             cv.GenerateID(): cv.declare_id(VoiceAssistant),
-            cv.Optional(
-                CONF_MICROPHONE, default={}
-            ): microphone.microphone_source_schema(
-                min_bits_per_sample=16,
-                max_bits_per_sample=16,
-                min_channels=1,
-                max_channels=1,
+            cv.Optional(CONF_MICROPHONE, default=[{}]): cv.All(
+                cv.ensure_list(
+                    microphone.microphone_source_schema(
+                        min_bits_per_sample=16,
+                        max_bits_per_sample=16,
+                        min_channels=1,
+                        max_channels=1,
+                    )
+                ),
+                cv.Length(
+                    min=1,
+                    max=MAX_MICROPHONE_SOURCES,
+                    msg=f"Voice Assistant supports at most {MAX_MICROPHONE_SOURCES} microphone sources",
+                ),
             ),
             cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(
                 media_player.MediaPlayer
@@ -179,10 +188,10 @@ CONFIG_SCHEMA = cv.All(
 FINAL_VALIDATE_SCHEMA = cv.All(
     cv.Schema(
         {
-            cv.Optional(
-                CONF_MICROPHONE
-            ): microphone.final_validate_microphone_source_schema(
-                "voice_assistant", sample_rate=16000
+            cv.Optional(CONF_MICROPHONE): cv.ensure_list(
+                microphone.final_validate_microphone_source_schema(
+                    "voice_assistant", sample_rate=16000
+                )
             ),
         },
         extra=cv.ALLOW_EXTRA,
@@ -194,9 +203,14 @@ async def to_code(config):
     var = cg.new_Pvariable(config[CONF_ID])
     await cg.register_component(var, config)
 
-    mic_source = await microphone.microphone_source_to_code(config[CONF_MICROPHONE])
+    mic_sources = config[CONF_MICROPHONE]
+    mic_source = await microphone.microphone_source_to_code(mic_sources[0])
     cg.add(var.set_microphone_source(mic_source))
 
+    if len(mic_sources) > 1:
+        mic_source2 = await microphone.microphone_source_to_code(mic_sources[1])
+        cg.add(var.set_microphone_source2(mic_source2))
+
     if CONF_MICRO_WAKE_WORD in config:
         mww = await cg.get_variable(config[CONF_MICRO_WAKE_WORD])
         cg.add(var.set_micro_wake_word(mww))
diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp
index 50a8265297..286e6645d2 100644
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -31,11 +31,21 @@ VoiceAssistant::VoiceAssistant() { global_voice_assistant = this; }
 void VoiceAssistant::setup() {
   this->mic_source_->add_data_callback([this](const std::vector<uint8_t> &data) {
     std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer_;
-    if (this->ring_buffer_.use_count() > 1) {
+    if (temp_ring_buffer != nullptr) {
       temp_ring_buffer->write((void *) data.data(), data.size());
     }
   });
 
+  // Second microphone channel
+  if (this->mic_source2_ != nullptr) {
+    this->mic_source2_->add_data_callback([this](const std::vector<uint8_t> &data) {
+      std::shared_ptr<ring_buffer::RingBuffer> temp_ring_buffer = this->ring_buffer2_;
+      if (temp_ring_buffer != nullptr) {
+        temp_ring_buffer->write((void *) data.data(), data.size());
+      }
+    });
+  }
+
 #ifdef USE_MEDIA_PLAYER
   if (this->media_player_ != nullptr) {
     this->media_player_->add_on_state_callback([this](media_player::MediaPlayerState state) {
@@ -115,9 +125,9 @@ bool VoiceAssistant::allocate_buffers_() {
   }
 #endif
 
-  if (this->ring_buffer_.use_count() == 0) {
+  if (this->ring_buffer_ == nullptr) {
     this->ring_buffer_ = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE);
-    if (this->ring_buffer_.use_count() == 0) {
+    if (this->ring_buffer_ == nullptr) {
       ESP_LOGE(TAG, "Could not allocate ring buffer");
       return false;
     }
@@ -132,6 +142,26 @@ bool VoiceAssistant::allocate_buffers_() {
     }
   }
 
+  // Second microphone channel
+  if (this->mic_source2_ != nullptr) {
+    if (this->ring_buffer2_ == nullptr) {
+      this->ring_buffer2_ = ring_buffer::RingBuffer::create(RING_BUFFER_SIZE);
+      if (this->ring_buffer2_ == nullptr) {
+        ESP_LOGE(TAG, "Could not allocate second ring buffer");
+        return false;
+      }
+    }
+
+    if (this->send_buffer2_ == nullptr) {
+      RAMAllocator<uint8_t> send_allocator;
+      this->send_buffer2_ = send_allocator.allocate(SEND_BUFFER_SIZE);
+      if (this->send_buffer2_ == nullptr) {
+        ESP_LOGW(TAG, "Could not allocate second send buffer");
+        return false;
+      }
+    }
+  }
+
   return true;
 }
 
@@ -144,6 +174,15 @@ void VoiceAssistant::clear_buffers_() {
     this->ring_buffer_->reset();
   }
 
+  // Second microphone channel
+  if (this->send_buffer2_ != nullptr) {
+    memset(this->send_buffer2_, 0, SEND_BUFFER_SIZE);
+  }
+
+  if (this->ring_buffer2_ != nullptr) {
+    this->ring_buffer2_->reset();
+  }
+
 #ifdef USE_SPEAKER
   if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
     memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
@@ -162,10 +201,17 @@ void VoiceAssistant::deallocate_buffers_() {
     this->send_buffer_ = nullptr;
   }
 
-  if (this->ring_buffer_.use_count() > 0) {
-    this->ring_buffer_.reset();
+  this->ring_buffer_.reset();
+
+  // Second microphone channel
+  if (this->send_buffer2_ != nullptr) {
+    RAMAllocator<uint8_t> send_deallocator;
+    send_deallocator.deallocate(this->send_buffer2_, SEND_BUFFER_SIZE);
+    this->send_buffer2_ = nullptr;
   }
 
+  this->ring_buffer2_.reset();
+
 #ifdef USE_SPEAKER
   if ((this->speaker_ != nullptr) && (this->speaker_buffer_ != nullptr)) {
     RAMAllocator<uint8_t> speaker_deallocator;
@@ -183,7 +229,8 @@ void VoiceAssistant::reset_conversation_id() {
 void VoiceAssistant::loop() {
   if (this->api_client_ == nullptr && this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE &&
       this->state_ != State::STOPPING_MICROPHONE) {
-    if (this->mic_source_->is_running() || this->state_ == State::STARTING_MICROPHONE) {
+    if (this->mic_source_->is_running() || (this->mic_source2_ && this->mic_source2_->is_running()) ||
+        this->state_ == State::STARTING_MICROPHONE) {
       this->set_state_(State::STOP_MICROPHONE, State::IDLE);
     } else {
       this->set_state_(State::IDLE, State::IDLE);
@@ -215,11 +262,14 @@ void VoiceAssistant::loop() {
       this->clear_buffers_();
 
       this->mic_source_->start();
+      if (this->mic_source2_) {
+        this->mic_source2_->start();
+      }
       this->set_state_(State::STARTING_MICROPHONE);
       break;
     }
     case State::STARTING_MICROPHONE: {
-      if (this->mic_source_->is_running()) {
+      if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) {
         this->set_state_(this->desired_state_);
       }
       break;
@@ -266,15 +316,44 @@ void VoiceAssistant::loop() {
       break;  // State changed when udp server port received
     }
     case State::STREAMING_MICROPHONE: {
-      size_t available = this->ring_buffer_->available();
-      while (available >= SEND_BUFFER_SIZE) {
-        size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
-        if (this->audio_mode_ == AUDIO_MODE_API) {
+      if (this->audio_mode_ == AUDIO_MODE_API) {
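+        // API audio: a channel is included in a message only once it has SEND_BUFFER_SIZE bytes buffered.
+        // With a second microphone configured, a message may therefore carry data, data2, or both.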
+        bool is_available = this->ring_buffer_->available() >= SEND_BUFFER_SIZE;
+        bool is_available2 = false;
+        if (this->mic_source2_) {
+          is_available2 = this->ring_buffer2_->available() >= SEND_BUFFER_SIZE;
+        }
+
+        while (is_available || is_available2) {
           api::VoiceAssistantAudio msg;
-          msg.data = this->send_buffer_;
-          msg.data_len = read_bytes;
+
+          if (is_available) {
+            size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
+            msg.data = this->send_buffer_;
+            msg.data_len = read_bytes;
+          }
+
+          // Second microphone channel
+          if (is_available2) {
+            size_t read_bytes = this->ring_buffer2_->read((void *) this->send_buffer2_, SEND_BUFFER_SIZE, 0);
+            msg.data2 = this->send_buffer2_;
+            msg.data2_len = read_bytes;
+          }
+
           this->api_client_->send_message(msg);
-        } else {
+          is_available = this->ring_buffer_->available() >= SEND_BUFFER_SIZE;
+          if (this->mic_source2_) {
+            is_available2 = this->ring_buffer2_->available() >= SEND_BUFFER_SIZE;
+          } else {
+            is_available2 = false;
+          }
+        }
+      } else {
+        // UDP (will eventually be deprecated)
+        // Only the primary microphone channel is used
+        while (this->ring_buffer_->available() >= SEND_BUFFER_SIZE) {
+          size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
           if (!this->udp_socket_running_) {
             if (!this->start_udp_socket_()) {
               this->set_state_(State::STOP_MICROPHONE, State::IDLE);
@@ -284,14 +363,23 @@ void VoiceAssistant::loop() {
           this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
                                 sizeof(this->dest_addr_));
         }
-        available = this->ring_buffer_->available();
-      }
-
+      }  // audio mode
       break;
     }
     case State::STOP_MICROPHONE: {
-      if (this->mic_source_->is_running()) {
-        this->mic_source_->stop();
+      // Check both microphone channels
+      bool is_running = this->mic_source_->is_running();
+      bool is_running2 = false;
+      if (this->mic_source2_) {
+        is_running2 = this->mic_source2_->is_running();
+      }
+      if (is_running || is_running2) {
+        if (is_running) {
+          this->mic_source_->stop();
+        }
+        if (is_running2) {
+          this->mic_source2_->stop();
+        }
         this->set_state_(State::STOPPING_MICROPHONE);
       } else {
         this->set_state_(this->desired_state_);
@@ -299,7 +387,13 @@ void VoiceAssistant::loop() {
       break;
     }
     case State::STOPPING_MICROPHONE: {
-      if (this->mic_source_->is_stopped()) {
+      // Check both microphone channels
+      bool is_stopped = this->mic_source_->is_stopped();
+      bool is_stopped2 = true;
+      if (this->mic_source2_) {
+        is_stopped2 = this->mic_source2_->is_stopped();
+      }
+      if (is_stopped && is_stopped2) {
         this->set_state_(this->desired_state_);
       }
       break;
@@ -504,7 +598,8 @@ void VoiceAssistant::start_streaming() {
   ESP_LOGD(TAG, "Client started, streaming microphone");
   this->audio_mode_ = AUDIO_MODE_API;
 
-  if (this->mic_source_->is_running()) {
+  // Skip START_MICROPHONE only if every configured microphone channel is already running
+  if (this->mic_source_->is_running() && (!this->mic_source2_ || this->mic_source2_->is_running())) {
     this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
   } else {
     this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE);
@@ -520,6 +615,10 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
   ESP_LOGD(TAG, "Client started, streaming microphone");
   this->audio_mode_ = AUDIO_MODE_UDP;
 
+  if (this->mic_source2_ != nullptr) {
+    ESP_LOGW(TAG, "UDP audio mode does not support a second microphone channel; only the primary will be streamed");
+  }
+
   memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
   if (this->dest_addr_.ss_family == AF_INET) {
     ((struct sockaddr_in *) &this->dest_addr_)->sin_port = htons(port);
@@ -534,6 +633,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
     return;
   }
 
+  // Only primary microphone channel over UDP
   if (this->mic_source_->is_running()) {
     this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE);
   } else {
diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h
index 3de4673001..c4fa7eb615 100644
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@@ -40,6 +40,7 @@ enum VoiceAssistantFeature : uint32_t {
   FEATURE_TIMERS = 1 << 3,
   FEATURE_ANNOUNCE = 1 << 4,
   FEATURE_START_CONVERSATION = 1 << 5,
+  FEATURE_MULTI_CHANNEL_AUDIO = 1 << 6,
 };
 
 enum class State {
@@ -120,6 +121,7 @@ class VoiceAssistant : public Component {
   void failed_to_start();
 
   void set_microphone_source(microphone::MicrophoneSource *mic_source) { this->mic_source_ = mic_source; }
+  void set_microphone_source2(microphone::MicrophoneSource *mic_source2) { this->mic_source2_ = mic_source2; }
 #ifdef USE_MICRO_WAKE_WORD
   void set_micro_wake_word(micro_wake_word::MicroWakeWord *mww) { this->micro_wake_word_ = mww; }
 #endif
@@ -149,6 +151,9 @@ class VoiceAssistant : public Component {
     uint32_t flags = 0;
     flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT;
     flags |= VoiceAssistantFeature::FEATURE_API_AUDIO;
+    if (this->mic_source2_ != nullptr) {
+      flags |= VoiceAssistantFeature::FEATURE_MULTI_CHANNEL_AUDIO;
+    }
 #ifdef USE_SPEAKER
     if (this->speaker_ != nullptr) {
       flags |= VoiceAssistantFeature::FEATURE_SPEAKER;
@@ -276,6 +281,7 @@ class VoiceAssistant : public Component {
   bool timer_tick_running_{false};
 
   microphone::MicrophoneSource *mic_source_{nullptr};
+  microphone::MicrophoneSource *mic_source2_{nullptr};
 #ifdef USE_SPEAKER
   void write_speaker_();
   speaker::Speaker *speaker_{nullptr};
@@ -301,6 +307,7 @@ class VoiceAssistant : public Component {
   std::string wake_word_;
 
   std::shared_ptr<ring_buffer::RingBuffer> ring_buffer_;
+  std::shared_ptr<ring_buffer::RingBuffer> ring_buffer2_;
 
   bool use_wake_word_;
   uint8_t noise_suppression_level_;
@@ -309,6 +316,7 @@ class VoiceAssistant : public Component {
   uint32_t conversation_timeout_;
 
   uint8_t *send_buffer_{nullptr};
+  uint8_t *send_buffer2_{nullptr};
 
   bool continuous_{false};
   bool silence_detection_;
diff --git a/tests/components/voice_assistant/common-idf.yaml b/tests/components/voice_assistant/common-idf.yaml
index 8565683700..0fa0903370 100644
--- a/tests/components/voice_assistant/common-idf.yaml
+++ b/tests/components/voice_assistant/common-idf.yaml
@@ -31,6 +31,11 @@ microphone:
     i2s_din_pin: ${i2s_din_pin}
     adc_type: external
     pdm: false
+  - platform: i2s_audio
+    id: mic_id_external2
+    i2s_din_pin: ${i2s_din_pin2}
+    adc_type: external
+    pdm: false
 
 speaker:
   - platform: i2s_audio
@@ -40,9 +45,12 @@ speaker:
 
 voice_assistant:
   microphone:
-    microphone: mic_id_external
-    gain_factor: 4
-    channels: 0
+    - microphone: mic_id_external
+      gain_factor: 4
+      channels: 0
+    - microphone: mic_id_external2
+      gain_factor: 4
+      channels: 0
   speaker: speaker_id
   micro_wake_word: mww_id
   conversation_timeout: 60s
diff --git a/tests/components/voice_assistant/test.esp32-idf.yaml b/tests/components/voice_assistant/test.esp32-idf.yaml
index 1c5c9ddf99..0cc670a77e 100644
--- a/tests/components/voice_assistant/test.esp32-idf.yaml
+++ b/tests/components/voice_assistant/test.esp32-idf.yaml
@@ -3,6 +3,7 @@ substitutions:
   i2s_bclk_pin: GPIO5
   i2s_mclk_pin: GPIO15
   i2s_din_pin: GPIO13
+  i2s_din_pin2: GPIO14
   i2s_dout_pin: GPIO12
 
 <<: !include common-idf.yaml