[core] fast_select stats: spin-poll to bucket load-bearing hits by µs

Replaces the single load_bearing counter with three buckets measured via a bounded (100µs) spin-poll of the task notification value after the scan finds data: race (<10µs) — notification arrived within ~10µs: callback-ordering race between rcvevent write and xTaskNotifyGive a few instructions later. Scan is noise. micro (<100µs) — notification arrived within 100µs: still noise at loop_interval scale. stall (>=100µs) — notification did not arrive within the poll window. Only this case could represent a real latency spike that the scan is rescuing. The 100µs spin cap is intentional: if we are wrong and this IS a real stall, we only add 100µs to that one unlucky loop iteration. The immediate ESP_LOGW on each hit now includes gap_us and the bucket label so individual events can be investigated.
2026-06-24 11:07:33 +00:00 · 2026-04-10 20:15:15 -10:00
parent 8921a9dd60
commit c37ce25fb7
2 changed files with 69 additions and 7 deletions
--- a/esphome/core/application.cpp
+++ b/esphome/core/application.cpp
@@ -453,17 +453,62 @@ void Application::enable_pending_loops_() {
 std::atomic<uint32_t> Application::fast_select_scan_total_{0};
 std::atomic<uint32_t> Application::fast_select_scan_found_data_{0};
 std::atomic<uint32_t> Application::fast_select_scan_load_bearing_{0};
+std::atomic<uint32_t> Application::fast_select_scan_load_bearing_race_{0};
+std::atomic<uint32_t> Application::fast_select_scan_load_bearing_micro_{0};
+std::atomic<uint32_t> Application::fast_select_scan_load_bearing_stall_{0};

 void Application::log_fast_select_scan_stats_() {
  uint32_t total = fast_select_scan_total_.load(std::memory_order_relaxed);
  uint32_t found = fast_select_scan_found_data_.load(std::memory_order_relaxed);
  uint32_t load_bearing = fast_select_scan_load_bearing_.load(std::memory_order_relaxed);
-  ESP_LOGD(TAG, "fast_select scan: total=%" PRIu32 " found_data=%" PRIu32 " load_bearing=%" PRIu32, total, found,
-           load_bearing);
+  uint32_t lb_race = fast_select_scan_load_bearing_race_.load(std::memory_order_relaxed);
+  uint32_t lb_micro = fast_select_scan_load_bearing_micro_.load(std::memory_order_relaxed);
+  uint32_t lb_stall = fast_select_scan_load_bearing_stall_.load(std::memory_order_relaxed);
+  ESP_LOGD(TAG,
+           "fast_select scan: total=%" PRIu32 " found_data=%" PRIu32 " load_bearing=%" PRIu32 " (race<10us=%" PRIu32
+           " micro<100us=%" PRIu32 " stall>100us=%" PRIu32 ")",
+           total, found, load_bearing, lb_race, lb_micro, lb_stall);
 }

 void Application::note_fast_select_load_bearing_(struct lwip_sock *sock, uint32_t delay_ms) {
  uint32_t load_bearing = fast_select_scan_load_bearing_.fetch_add(1, std::memory_order_relaxed) + 1;
+
+  // Spin-poll the task notification value for a short bounded window to measure how long
+  // the counterfactual ulTaskNotifyTake would actually have blocked. This distinguishes
+  // three cases:
+  //   race  (<10µs)   — notification arrived within ~10µs of scan start: callback-ordering
+  //                     race between the lwip event_callback writing rcvevent and calling
+  //                     xTaskNotifyGive a few instructions later. Scan is noise.
+  //   micro (<100µs)  — notification arrived within 100µs: still noise at loop_interval scale.
+  //   stall (≥100µs)  — notification did not arrive within our polling window. This is the
+  //                     only case where the scan could be rescuing a real latency spike.
+  // Cap the spin at 100µs so that if we're wrong and this IS a real stall, we only add
+  // 100µs of extra work to that one unlucky loop iteration.
+  uint32_t t_start = micros();
+  uint32_t gap_us = UINT32_MAX;
+  while (true) {
+    if (ulTaskNotifyValueClear(nullptr, 0) != 0) {
+      gap_us = micros() - t_start;
+      break;
+    }
+    uint32_t elapsed = micros() - t_start;
+    if (elapsed >= 100) {
+      break;
+    }
+  }
+
+  const char *bucket;
+  if (gap_us == UINT32_MAX) {
+    fast_select_scan_load_bearing_stall_.fetch_add(1, std::memory_order_relaxed);
+    bucket = "STALL";
+  } else if (gap_us < 10) {
+    fast_select_scan_load_bearing_race_.fetch_add(1, std::memory_order_relaxed);
+    bucket = "race";
+  } else {
+    fast_select_scan_load_bearing_micro_.fetch_add(1, std::memory_order_relaxed);
+    bucket = "micro";
+  }
+
  // Find the socket's index in monitored_sockets_ for easier correlation with registration order.
  int index = -1;
  for (size_t i = 0; i < this->monitored_sockets_.size(); i++) {
@@ -483,11 +528,19 @@ void Application::note_fast_select_load_bearing_(struct lwip_sock *sock, uint32_
    if (esphome_lwip_socket_has_data(s))
      sockets_with_data++;
  }
-  ESP_LOGW(TAG,
-           "fast_select LOAD-BEARING hit #%" PRIu32 ": sock=%p idx=%d/%u rcvevent=%d delay_ms=%" PRIu32
-           " sockets_with_data=%u",
-           load_bearing, sock, index, static_cast<unsigned>(this->monitored_sockets_.size()), rcvevent, delay_ms,
-           static_cast<unsigned>(sockets_with_data));
+  if (gap_us == UINT32_MAX) {
+    ESP_LOGW(TAG,
+             "fast_select LOAD-BEARING #%" PRIu32 " [%s]: sock=%p idx=%d/%u rcvevent=%d delay_ms=%" PRIu32
+             " sockets_with_data=%u gap_us=>100",
+             load_bearing, bucket, sock, index, static_cast<unsigned>(this->monitored_sockets_.size()), rcvevent,
+             delay_ms, static_cast<unsigned>(sockets_with_data));
+  } else {
+    ESP_LOGW(TAG,
+             "fast_select LOAD-BEARING #%" PRIu32 " [%s]: sock=%p idx=%d/%u rcvevent=%d delay_ms=%" PRIu32
+             " sockets_with_data=%u gap_us=%" PRIu32,
+             load_bearing, bucket, sock, index, static_cast<unsigned>(this->monitored_sockets_.size()), rcvevent,
+             delay_ms, static_cast<unsigned>(sockets_with_data), gap_us);
+  }
 }

 bool Application::register_socket(struct lwip_sock *sock) {
--- a/esphome/core/application.h
+++ b/esphome/core/application.h
@@ -661,7 +661,16 @@ class Application {
  // These are static because yield_with_select_() is inlined at every call site.
  static std::atomic<uint32_t> fast_select_scan_total_;
  static std::atomic<uint32_t> fast_select_scan_found_data_;
+  // Umbrella counter: pre-scan notify peek was 0 and scan found data.
+  // Broken down into three buckets based on the post-scan spin-poll result:
+  //   _race_  — notify arrived in < 10µs   (callback-ordering race, scan is noise)
+  //   _micro_ — notify arrived in 10..100µs (still noise at loop_interval scale)
+  //   _stall_ — notify did not arrive within 100µs (the only case that could be a real stall)
+  // If _stall_ stays 0, the scan is provably irrelevant under this workload.
  static std::atomic<uint32_t> fast_select_scan_load_bearing_;
+  static std::atomic<uint32_t> fast_select_scan_load_bearing_race_;
+  static std::atomic<uint32_t> fast_select_scan_load_bearing_micro_;
+  static std::atomic<uint32_t> fast_select_scan_load_bearing_stall_;
  uint32_t fast_select_scan_stats_last_log_{0};
  void log_fast_select_scan_stats_();
  // Non-inline, called only on the rare load-bearing event so the hot path stays unchanged.