From 5a236697473e554d7b27b29d25f7b188163d3b49 Mon Sep 17 00:00:00 2001
From: "J. Nick Koston" <nick@koston.org>
Date: Fri, 3 Apr 2026 08:27:29 -1000
Subject: [PATCH] [scheduler] Fix unrealistic scheduler benchmarks missing
 periodic drain (#15396)

---
 tests/benchmarks/core/bench_scheduler.cpp | 124 +++++++++++++++++++---
 1 file changed, 107 insertions(+), 17 deletions(-)

diff --git a/tests/benchmarks/core/bench_scheduler.cpp b/tests/benchmarks/core/bench_scheduler.cpp
index 9357734cc8..214fe0e4b8 100644
--- a/tests/benchmarks/core/bench_scheduler.cpp
+++ b/tests/benchmarks/core/bench_scheduler.cpp
@@ -8,7 +8,24 @@ namespace esphome::benchmarks {
 // Inner iteration count to amortize CodSpeed instrumentation overhead.
 // Without this, the ~60ns per-iteration valgrind start/stop cost dominates
 // sub-microsecond benchmarks.
-static constexpr int kInnerIterations = 2000;
+// Must be divisible by all batch sizes used below (3, 10) to avoid
+// pool imbalance at iteration boundaries that causes spurious malloc.
+static constexpr int kInnerIterations = 2100;
+
+// Warm the scheduler pool by registering and replacing items twice.
+// The first batch allocates fresh items; the second batch cancels them and
+// populates the recycling pool with the cancelled items from the first batch.
+static void warm_pool(Scheduler &scheduler, Component *component, int batch_size, uint32_t delay) {
+  uint32_t now = millis();
+  for (int i = 0; i < batch_size; i++) {
+    scheduler.set_timeout(component, static_cast<uint32_t>(i), delay, []() {});
+  }
+  scheduler.call(++now);
+  for (int i = 0; i < batch_size; i++) {
+    scheduler.set_timeout(component, static_cast<uint32_t>(i), delay, []() {});
+  }
+  scheduler.call(++now);
+}
 
 // --- Scheduler fast path: no work to do ---
 
@@ -83,11 +100,21 @@ static void Scheduler_SetTimeout(benchmark::State &state) {
   Scheduler scheduler;
   Component dummy_component;
 
+  // Register 3 timeouts then call() — realistic worst case where multiple
+  // components schedule in the same loop iteration. Keeps item count within
+  // the recycling pool (MAX_POOL_SIZE=5) to avoid spurious malloc/free.
+  static constexpr int kBatchSize = 3;
+  static_assert(kInnerIterations % kBatchSize == 0, "kInnerIterations must be divisible by kBatchSize");
+  warm_pool(scheduler, &dummy_component, kBatchSize, 1000);
   for (auto _ : state) {
+    uint32_t now = millis();
     for (int i = 0; i < kInnerIterations; i++) {
-      scheduler.set_timeout(&dummy_component, static_cast<uint32_t>(i % 5), 1000, []() {});
+      scheduler.set_timeout(&dummy_component, static_cast<uint32_t>(i % kBatchSize), 1000, []() {});
+      if ((i + 1) % kBatchSize == 0) {
+        scheduler.call(++now);
+      }
     }
-    scheduler.process_to_add();
+    scheduler.call(++now);
     benchmark::DoNotOptimize(scheduler);
   }
   state.SetItemsProcessed(state.iterations() * kInnerIterations);
@@ -99,22 +126,22 @@ BENCHMARK(Scheduler_SetTimeout);
 static void Scheduler_SetInterval(benchmark::State &state) {
   Scheduler scheduler;
   Component dummy_component;
-  // Number of distinct interval keys; controls how many unique timers exist
-  // simultaneously and the drain cadence for process_to_add().
-  static constexpr int kKeyCount = 5;
 
+  // Register 3 intervals then call() — realistic worst case where multiple
+  // components schedule in the same loop iteration. Keeps item count within
+  // the recycling pool (MAX_POOL_SIZE=5) to avoid spurious malloc/free.
+  static constexpr int kBatchSize = 3;
+  static_assert(kInnerIterations % kBatchSize == 0, "kInnerIterations must be divisible by kBatchSize");
+  warm_pool(scheduler, &dummy_component, kBatchSize, 1000);
   for (auto _ : state) {
+    uint32_t now = millis();
     for (int i = 0; i < kInnerIterations; i++) {
-      scheduler.set_interval(&dummy_component, static_cast<uint32_t>(i % kKeyCount), 1000, []() {});
-      // Drain to_add_ periodically to reflect production behavior where
-      // process_to_add() runs each main loop iteration. Without this,
-      // cancelled items accumulate in to_add_ causing O(n²) scan cost.
-      if ((i + 1) % kKeyCount == 0) {
-        scheduler.process_to_add();
+      scheduler.set_interval(&dummy_component, static_cast<uint32_t>(i % kBatchSize), 1000, []() {});
+      if ((i + 1) % kBatchSize == 0) {
+        scheduler.call(++now);
       }
     }
-    // Final drain in case kInnerIterations is not a multiple of 5
-    scheduler.process_to_add();
+    scheduler.call(++now);
     benchmark::DoNotOptimize(scheduler);
   }
   state.SetItemsProcessed(state.iterations() * kInnerIterations);
@@ -128,16 +155,79 @@ static void Scheduler_Defer(benchmark::State &state) {
   Component dummy_component;
 
   // defer() is Component::defer which calls set_timeout(delay=0).
-  // Call set_timeout directly since defer() is protected.
+  // Component::defer(func) passes nullptr as the name, which skips
+  // cancel_item_locked_ entirely — matching production behavior where
+  // defers are anonymous fire-and-forget callbacks.
+  static constexpr int kBatchSize = 3;
+  static_assert(kInnerIterations % kBatchSize == 0, "kInnerIterations must be divisible by kBatchSize");
+  warm_pool(scheduler, &dummy_component, kBatchSize, 0);
   for (auto _ : state) {
+    uint32_t now = millis();
     for (int i = 0; i < kInnerIterations; i++) {
-      scheduler.set_timeout(&dummy_component, static_cast<uint32_t>(i % 5), 0, []() {});
+      scheduler.set_timeout(&dummy_component, static_cast<const char *>(nullptr), 0, []() {});
+      if ((i + 1) % kBatchSize == 0) {
+        scheduler.call(++now);
+      }
     }
-    scheduler.process_to_add();
+    scheduler.call(++now);
     benchmark::DoNotOptimize(scheduler);
   }
   state.SetItemsProcessed(state.iterations() * kInnerIterations);
 }
 BENCHMARK(Scheduler_Defer);
 
+// --- Scheduler: defer with same ID (cancel-and-replace pattern) ---
+
+static void Scheduler_Defer_SameID(benchmark::State &state) {
+  Scheduler scheduler;
+  Component dummy_component;
+
+  // Measures defer with a fixed numeric ID — each call cancels the previous
+  // pending defer before adding the new one. This is the pattern used by
+  // components that defer work but want to coalesce rapid updates.
+  static constexpr int kBatchSize = 3;
+  static_assert(kInnerIterations % kBatchSize == 0, "kInnerIterations must be divisible by kBatchSize");
+  warm_pool(scheduler, &dummy_component, kBatchSize, 0);
+  for (auto _ : state) {
+    uint32_t now = millis();
+    for (int i = 0; i < kInnerIterations; i++) {
+      scheduler.set_timeout(&dummy_component, static_cast<uint32_t>(0), 0, []() {});
+      if ((i + 1) % kBatchSize == 0) {
+        scheduler.call(++now);
+      }
+    }
+    scheduler.call(++now);
+    benchmark::DoNotOptimize(scheduler);
+  }
+  state.SetItemsProcessed(state.iterations() * kInnerIterations);
+}
+BENCHMARK(Scheduler_Defer_SameID);
+
+// --- Scheduler: set_timeout with batch size exceeding pool (cliff test) ---
+
+static void Scheduler_SetTimeout_ExceedPool(benchmark::State &state) {
+  Scheduler scheduler;
+  Component dummy_component;
+
+  // Register 10 timeouts then call() — exceeds MAX_POOL_SIZE=5 to measure
+  // the performance cliff when the recycling pool is exhausted and items
+  // must be malloc'd/freed.
+  static constexpr int kBatchSize = 10;
+  static_assert(kInnerIterations % kBatchSize == 0, "kInnerIterations must be divisible by kBatchSize");
+  warm_pool(scheduler, &dummy_component, kBatchSize, 1000);
+  for (auto _ : state) {
+    uint32_t now = millis();
+    for (int i = 0; i < kInnerIterations; i++) {
+      scheduler.set_timeout(&dummy_component, static_cast<uint32_t>(i % kBatchSize), 1000, []() {});
+      if ((i + 1) % kBatchSize == 0) {
+        scheduler.call(++now);
+      }
+    }
+    scheduler.call(++now);
+    benchmark::DoNotOptimize(scheduler);
+  }
+  state.SetItemsProcessed(state.iterations() * kInnerIterations);
+}
+BENCHMARK(Scheduler_SetTimeout_ExceedPool);
+
 }  // namespace esphome::benchmarks