[core] Attribute "took a long time" blocking warning to the owning script (#16768)

This commit is contained in:
J. Nick Koston
2026-06-15 15:46:33 -05:00
committed by GitHub
parent 963465a0a6
commit 3420cff316
13 changed files with 389 additions and 103 deletions

View File

@@ -47,7 +47,7 @@ class RuntimeStatsCollector {
// overhead between Phase A and stats belongs to "residual").
// Residual overhead at log time = active Σ(component) before tail,
// which captures per-iteration inter-component bookkeeping (set_current_component,
// WarnIfComponentBlockingGuard construction/destruction, feed_wdt_with_time calls,
// LoopBlockingGuard construction/destruction, feed_wdt_with_time calls,
// the for-loop itself).
void record_loop_active(uint32_t active_us, uint32_t before_us, uint32_t tail_us) {
this->period_active_count_++;

View File

@@ -3,6 +3,7 @@
#include <list>
#include <memory>
#include <tuple>
#include "esphome/core/application.h"
#include "esphome/core/automation.h"
#include "esphome/core/component.h"
#include "esphome/core/helpers.h"
@@ -57,6 +58,14 @@ template<typename... Ts> class Script : public ScriptLogger, public Trigger<Ts..
this->execute(std::get<S>(tuple)...);
}
// Run the action chain with this script's name published as the current source (RAII save/restore,
// so nesting composes), so deferred work inside the script is attributed to it in blocking
// warnings. Force-inlined to fold into the always-inlined trigger chain (no extra stack frame).
inline void run_actions_(const Ts &...x) ESPHOME_ALWAYS_INLINE {
ScopedSourceGuard source_guard{this->name_};
this->trigger(x...);
}
const LogString *name_{nullptr};
};
@@ -74,7 +83,7 @@ template<typename... Ts> class SingleScript : public Script<Ts...> {
return;
}
this->trigger(x...);
this->run_actions_(x...);
}
};
@@ -91,7 +100,7 @@ template<typename... Ts> class RestartScript : public Script<Ts...> {
this->stop_action();
}
this->trigger(x...);
this->run_actions_(x...);
}
};
@@ -136,7 +145,7 @@ template<typename... Ts> class QueueingScript : public Script<Ts...>, public Com
return;
}
this->trigger(x...);
this->run_actions_(x...);
// Check if the trigger was immediate and we can continue right away.
this->loop();
}
@@ -175,7 +184,7 @@ template<typename... Ts> class QueueingScript : public Script<Ts...>, public Com
}
template<size_t... S> void trigger_tuple_(const std::tuple<Ts...> &tuple, std::index_sequence<S...> /*unused*/) {
this->trigger(std::get<S>(tuple)...);
this->run_actions_(std::get<S>(tuple)...);
}
int num_queued_ = 0; // Number of queued instances (not including currently running)
@@ -197,7 +206,7 @@ template<typename... Ts> class ParallelScript : public Script<Ts...> {
LOG_STR_ARG(this->name_));
return;
}
this->trigger(x...);
this->run_actions_(x...);
}
void set_max_runs(int max_runs) { max_runs_ = max_runs; }

View File

@@ -104,9 +104,13 @@ class Application {
void register_area(Area *area) { this->areas_.push_back(area); }
#endif
void set_current_component(Component *component) { this->current_component_ = component; }
Component *get_current_component() { return this->current_component_; }
// Owning script of the action chain currently executing (nullptr when none); used to attribute
// blocking warnings for deferred work to the script that scheduled it.
void set_current_source(const LogString *source) { this->current_source_ = source; }
const LogString *get_current_source() { return this->current_source_; }
// Entity register methods (generated from entity_types.h).
// Each entity type gets two overloads:
// - register_<entity>(obj) — bare push_back
@@ -393,6 +397,7 @@ class Application {
protected:
friend Component;
friend class Scheduler;
friend class LoopBlockingGuard;
#ifdef USE_RUNTIME_STATS
friend class runtime_stats::RuntimeStatsCollector;
#endif
@@ -402,6 +407,14 @@ class Application {
/// Freshen the cached loop component start time. Called by Scheduler before each dispatch.
void set_loop_component_start_time_(uint32_t now) { this->loop_component_start_time_ = now; }
// Publish the running unit's identity (component + source) and dispatch time together, so a
// dispatch site can't set one without the others. Friend-only (Scheduler).
void set_current_execution_context_(Component *component, const LogString *source, uint32_t now) {
this->current_component_ = component;
this->current_source_ = source;
this->set_loop_component_start_time_(now);
}
/// Walk all registered components looking for any whose component_state_
/// has the given flag set. Used by Component::status_clear_*_slow_path_()
/// (which is a friend) to decide whether to clear the corresponding bit on
@@ -482,6 +495,7 @@ class Application {
// Pointer-sized members first
Component *current_component_{nullptr};
const LogString *current_source_{nullptr};
// std::vector (3 pointers each: begin, end, capacity)
// Partitioned vector design for looping components
@@ -554,6 +568,76 @@ class Application {
/// Global storage of Application pointer - only one Application can exist.
extern Application App; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
/// RAII guard that publishes a current source (e.g. a script name) for a scope and restores the
/// previous value on exit, attributing deferred work scheduled inside to that source.
class ScopedSourceGuard {
public:
explicit ScopedSourceGuard(const LogString *source) : prev_(App.get_current_source()) {
App.set_current_source(source);
}
~ScopedSourceGuard() { App.set_current_source(this->prev_); }
ScopedSourceGuard(const ScopedSourceGuard &) = delete;
ScopedSourceGuard &operator=(const ScopedSourceGuard &) = delete;
private:
const LogString *prev_;
};
// Times one unit of work (a component loop() or a scheduled callback) and warns if it blocks the
// main loop too long. The constructor publishes the unit's identity + dispatch time to App;
// finish()/the cold warning path read them back, so the guard stores no copy.
//
// Guards must not nest: the constructor publishes to App but never restores on destruction, so a
// nested guard would clobber the outer's context. Safe because the two dispatch sites (component
// loop phase, execute_item_) run strictly sequentially and aren't re-entered from a timed callback.
class LoopBlockingGuard {
public:
// Publish the unit's identity + dispatch time, then start timing. The millis start lives in App,
// so only the runtime-stats micros stamp is kept here.
LoopBlockingGuard(Component *component, const LogString *source, uint32_t now) {
App.set_current_execution_context_(component, source, now);
#ifdef USE_RUNTIME_STATS
this->started_us_ = micros();
#endif
}
// Finish the timing operation and return the current time (millis)
// Inlined: the fast path is just millis() + subtract + compare
inline uint32_t HOT finish() {
#ifdef USE_RUNTIME_STATS
uint32_t elapsed_us = micros() - this->started_us_;
// Delays have no component; accumulate into the global counter so loop() can subtract them.
Component *component = App.get_current_component();
if (component != nullptr) {
component->runtime_stats_.record_time(elapsed_us);
} else {
ComponentRuntimeStats::global_recorded_us += elapsed_us;
}
#endif
uint32_t curr_time = MillisInternal::get();
#ifndef USE_BENCHMARK
// Fast path: compare against constant threshold in ms (computed at compile time from centiseconds)
static constexpr uint32_t WARN_IF_BLOCKING_OVER_MS = static_cast<uint32_t>(WARN_IF_BLOCKING_OVER_CS) * 10U;
uint32_t blocking_time = curr_time - App.get_loop_component_start_time();
if (blocking_time > WARN_IF_BLOCKING_OVER_MS) [[unlikely]] {
warn_blocking(blocking_time);
}
#endif
return curr_time;
}
~LoopBlockingGuard() = default;
#ifdef USE_RUNTIME_STATS
protected:
uint32_t started_us_;
#endif
private:
// Cold path; defined in component.cpp. Reads the current component/source from App to name the culprit.
static void __attribute__((noinline, cold)) warn_blocking(uint32_t blocking_time);
};
// Phase A: drain wake notifications and run the scheduler. Invoked on every
// Application::loop() tick regardless of whether a component phase runs, so
// scheduler items fire at their requested cadence even when the caller has
@@ -607,7 +691,7 @@ inline void ESPHOME_ALWAYS_INLINE Application::loop() {
// before/tail splits recorded below.
uint32_t loop_active_start_us = micros();
// Snapshot the cumulative component-recorded time so we can subtract the
// slice that the scheduler spends inside its own WarnIfComponentBlockingGuard
// slice that the scheduler spends inside its own LoopBlockingGuard
// (scheduler.cpp) — that time is already counted in per-component stats,
// so charging it again to "before" would double-count.
uint64_t loop_recorded_snap = ComponentRuntimeStats::global_recorded_us;
@@ -660,12 +744,9 @@ inline void ESPHOME_ALWAYS_INLINE Application::loop() {
this->current_loop_index_++) {
Component *component = this->looping_components_[this->current_loop_index_];
// Update the cached time before each component runs
this->loop_component_start_time_ = last_op_end_time;
{
this->set_current_component(component);
WarnIfComponentBlockingGuard guard{component, last_op_end_time};
// Guard publishes this component (no script source) + dispatch time, then times loop().
LoopBlockingGuard guard{component, nullptr, last_op_end_time};
component->loop();
// Use the finish method to get the current time as the end time
last_op_end_time = guard.finish();

View File

@@ -201,7 +201,10 @@ template<typename... Ts> class DelayAction : public Action<Ts...> {
/* component= */ nullptr, Scheduler::SchedulerItem::TIMEOUT, Scheduler::NameType::SELF_POINTER,
/* static_name= */ reinterpret_cast<const char *>(this), /* hash_or_id= */ 0, this->delay_.value(),
[this]() { this->play_next_(); },
/* is_retry= */ false, /* skip_cancel= */ this->num_running_ > 1);
/* is_retry= */ false, /* skip_cancel= */ this->num_running_ > 1,
// Record the owning script (if any) so the blocking warning can name it; propagates across
// chained delays via the scheduler.
/* source= */ App.get_current_source());
} else {
// For delays with arguments, capture by value to preserve argument values
// Arguments must be copied because original references may be invalid after delay
@@ -212,7 +215,9 @@ template<typename... Ts> class DelayAction : public Action<Ts...> {
/* component= */ nullptr, Scheduler::SchedulerItem::TIMEOUT, Scheduler::NameType::SELF_POINTER,
/* static_name= */ reinterpret_cast<const char *>(this), /* hash_or_id= */ 0, this->delay_.value(x...),
std::move(f),
/* is_retry= */ false, /* skip_cancel= */ this->num_running_ > 1);
/* is_retry= */ false, /* skip_cancel= */ this->num_running_ > 1,
// See the no-argument branch above: record the owning script for log attribution.
/* source= */ App.get_current_source());
}
}

View File

@@ -258,9 +258,11 @@ void Component::call() {
break;
}
}
bool Component::should_warn_of_blocking(uint32_t blocking_time) {
bool Component::should_warn_of_blocking(uint32_t blocking_time, uint32_t &threshold_ms_out) {
// Convert centisecond threshold to milliseconds for comparison
uint32_t threshold_ms = static_cast<uint32_t>(this->warn_if_blocking_over_) * 10U;
// Report the threshold that was exceeded (before any ratcheting below) so the warning is accurate.
threshold_ms_out = threshold_ms;
if (blocking_time > threshold_ms) {
// Set new threshold: blocking_time + increment, converted back to centiseconds
uint32_t new_threshold_ms = blocking_time + WARN_IF_BLOCKING_INCREMENT_MS;
@@ -491,19 +493,25 @@ uint32_t PollingComponent::get_update_interval() const { return this->update_int
uint64_t ComponentRuntimeStats::global_recorded_us = 0; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
#endif
void __attribute__((noinline, cold))
WarnIfComponentBlockingGuard::warn_blocking(Component *component, uint32_t blocking_time) {
bool should_warn;
void __attribute__((noinline, cold)) LoopBlockingGuard::warn_blocking(uint32_t blocking_time) {
// Identity is published on App by the caller before the guard is built; read it back here.
Component *component = App.get_current_component();
// Component-less path always warns (the caller already checked the constant threshold).
uint32_t threshold_ms = WARN_IF_BLOCKING_OVER_MS;
if (component != nullptr && !component->should_warn_of_blocking(blocking_time, threshold_ms)) {
return; // Component's (possibly ratcheted) threshold not exceeded yet
}
// Component name if any, else the published source (owning script), else a generic label.
const LogString *name;
if (component != nullptr) {
should_warn = component->should_warn_of_blocking(blocking_time);
name = component->get_component_log_str();
} else {
should_warn = true; // Already checked > WARN_IF_BLOCKING_OVER_MS in caller
}
if (should_warn) {
ESP_LOGW(TAG, "%s took a long time for an operation (%" PRIu32 " ms), max is 30 ms",
component == nullptr ? LOG_STR_LITERAL("<null>") : LOG_STR_ARG(component->get_component_log_str()),
blocking_time);
name = App.get_current_source();
if (name == nullptr)
name = LOG_STR("a scheduled task");
}
ESP_LOGW(TAG, "%s took a long time for an operation (%" PRIu32 " ms), max is %" PRIu32 " ms", LOG_STR_ARG(name),
blocking_time, threshold_ms);
}
#ifdef USE_SETUP_PRIORITY_OVERRIDE

View File

@@ -118,7 +118,7 @@ struct ComponentRuntimeStats {
// Cumulative sum of every record_time() duration since boot, across all
// components. Used by Application::loop() to snapshot time spent inside
// WarnIfComponentBlockingGuard (including guards constructed by the
// LoopBlockingGuard (including guards constructed by the
// scheduler at scheduler.cpp) so main-loop overhead accounting can
// subtract scheduled-callback time from the before_loop_tasks_ wall time.
static uint64_t global_recorded_us; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
@@ -326,7 +326,7 @@ class Component {
return component_source_lookup(this->component_source_index_);
}
bool should_warn_of_blocking(uint32_t blocking_time);
bool should_warn_of_blocking(uint32_t blocking_time, uint32_t &threshold_ms_out);
protected:
friend class Application;
@@ -571,7 +571,7 @@ class Component {
volatile bool pending_enable_loop_{false}; ///< ISR-safe flag for enable_loop_soon_any_context
#ifdef USE_RUNTIME_STATS
friend class runtime_stats::RuntimeStatsCollector;
friend class WarnIfComponentBlockingGuard;
friend class LoopBlockingGuard;
ComponentRuntimeStats runtime_stats_;
#endif
};
@@ -619,59 +619,7 @@ class PollingComponent : public Component {
uint32_t update_interval_;
};
// millis() and micros() are available via hal.h
class WarnIfComponentBlockingGuard {
public:
WarnIfComponentBlockingGuard(Component *component, uint32_t start_time)
: started_(start_time),
component_(component)
#ifdef USE_RUNTIME_STATS
,
started_us_(micros())
#endif
{
}
// Finish the timing operation and return the current time (millis)
// Inlined: the fast path is just millis() + subtract + compare
inline uint32_t HOT finish() {
#ifdef USE_RUNTIME_STATS
uint32_t elapsed_us = micros() - this->started_us_;
// component_ is nullptr for self-keyed scheduler items (set_timeout/set_interval(self, ...))
if (this->component_ != nullptr) {
this->component_->runtime_stats_.record_time(elapsed_us);
} else {
// Still accumulate into the global counter so Application::loop() can subtract
// this time from before_loop_tasks_ wall time.
ComponentRuntimeStats::global_recorded_us += elapsed_us;
}
#endif
uint32_t curr_time = MillisInternal::get();
#ifndef USE_BENCHMARK
// Fast path: compare against constant threshold in ms (computed at compile time from centiseconds)
static constexpr uint32_t WARN_IF_BLOCKING_OVER_MS = static_cast<uint32_t>(WARN_IF_BLOCKING_OVER_CS) * 10U;
uint32_t blocking_time = curr_time - this->started_;
if (blocking_time > WARN_IF_BLOCKING_OVER_MS) [[unlikely]] {
warn_blocking(this->component_, blocking_time);
}
#endif
return curr_time;
}
~WarnIfComponentBlockingGuard() = default;
protected:
uint32_t started_;
Component *component_;
#ifdef USE_RUNTIME_STATS
uint32_t started_us_;
#endif
private:
// Cold path for blocking warning - defined in component.cpp
static void __attribute__((noinline, cold)) warn_blocking(Component *component, uint32_t blocking_time);
};
// LoopBlockingGuard lives in application.h because it reads its state from App.
// Function to clear setup priority overrides after all components are set up
// Only has an implementation when USE_SETUP_PRIORITY_OVERRIDE is defined

View File

@@ -16,7 +16,7 @@ namespace esphome {
// Friend-gated accessor for a fast millis() variant intended only for
// known task-context callers on the main loop hot path (Application::loop()
// and WarnIfComponentBlockingGuard::finish()). It skips the ISR-context
// and LoopBlockingGuard::finish()). It skips the ISR-context
// dispatch that the public esphome::millis() pays on ESP32 and libretiny.
//
// MUST NOT be called from ISR context: on ESP32 and libretiny it calls the
@@ -50,7 +50,7 @@ class MillisInternal {
#endif
}
friend class Application;
friend class WarnIfComponentBlockingGuard;
friend class LoopBlockingGuard;
};
} // namespace esphome

View File

@@ -131,7 +131,8 @@ bool Scheduler::is_retry_cancelled_locked_(Component *component, NameType name_t
// name_type determines storage type: STATIC_STRING uses static_name, others use hash_or_id
void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type type, NameType name_type,
const char *static_name, uint32_t hash_or_id, uint32_t delay,
std::function<void()> &&func, bool is_retry, bool skip_cancel) {
std::function<void()> &&func, bool is_retry, bool skip_cancel,
const LogString *source) {
if (delay == SCHEDULER_DONT_RUN) {
// Still need to cancel existing timer if we have a name/id
if (!skip_cancel) {
@@ -174,7 +175,12 @@ void HOT Scheduler::set_timer_common_(Component *component, SchedulerItem::Type
// Create and populate the scheduler item
SchedulerItem *item = this->get_item_from_pool_locked_();
// SELF_POINTER items store the source name (owning script) in the union slot instead of a component.
if (name_type == NameType::SELF_POINTER) {
item->source_name = source;
} else {
item->component = component;
}
item->set_name(name_type, static_name, hash_or_id);
item->type = type;
// Use destroy + placement-new instead of move-assignment.
@@ -642,8 +648,8 @@ uint32_t HOT Scheduler::call(uint32_t now) {
// Not reached timeout yet, done for this call
break;
}
// Don't run on failed components
if (item->component != nullptr && item->component->is_failed()) {
// Don't run on failed components (is_item_failed_ exempts SELF_POINTER delays).
if (this->is_item_failed_(item)) {
LockGuard guard{this->lock_};
this->recycle_item_main_loop_(this->pop_raw_locked_());
continue;
@@ -790,10 +796,21 @@ Scheduler::SchedulerItem *HOT Scheduler::pop_raw_locked_() {
// Helper to execute a scheduler item
uint32_t HOT Scheduler::execute_item_(SchedulerItem *item, uint32_t now) {
App.set_current_component(item->component);
// Freshen so callbacks reading App.get_loop_component_start_time() see this item's dispatch time.
App.set_loop_component_start_time_(now);
WarnIfComponentBlockingGuard guard{item->component, now};
// Resolve the component and (for SELF_POINTER/deferred items) the source name from the shared
// union slot with a single name-type check. Self-keyed items have no owning component; their slot
// holds the source name (e.g. the owning script), published so deferred work chained inside the
// callback re-captures it and the blocking warning can name the script instead of "<null>".
Component *component;
const LogString *source;
if (item->get_name_type() == NameType::SELF_POINTER) {
component = nullptr;
source = item->source_name;
} else {
component = item->component;
source = nullptr;
}
// Guard publishes the item's identity + dispatch time, then times the callback.
LoopBlockingGuard guard{component, source, now};
item->callback();
uint32_t end = guard.finish();
// Feed the watchdog after each scheduled item (both main heap and defer

View File

@@ -183,11 +183,12 @@ class Scheduler {
protected:
struct SchedulerItem {
// Ordered by size to minimize padding.
// `component` while live; `next_free` while in scheduler_item_pool_head_ (mutually exclusive).
// Ordered by size to minimize padding. Mutually exclusive by state; read the component via
// get_component() so SELF_POINTER items read as component-less.
union {
Component *component;
SchedulerItem *next_free;
Component *component; // live, non-SELF_POINTER: owning component
const LogString *source_name; // live SELF_POINTER: owning script name (log attribution)
SchedulerItem *next_free; // while pooled
};
// Optimized name storage using tagged union - zero heap allocation
union {
@@ -302,14 +303,23 @@ class Scheduler {
next_execution_high_ = static_cast<uint16_t>(value >> 32);
}
constexpr const char *get_type_str() const { return (type == TIMEOUT) ? "timeout" : "interval"; }
const LogString *get_source() const { return component ? component->get_component_log_str() : LOG_STR("unknown"); }
// The owning component, or nullptr for SELF_POINTER items (whose slot holds source_name instead).
// All component access goes through this so SELF_POINTER items read as component-less.
Component *get_component() const { return name_type_ == NameType::SELF_POINTER ? nullptr : component; }
const LogString *get_source() const {
// Same no-source label as warn_blocking, for consistent log vocabulary.
if (name_type_ == NameType::SELF_POINTER)
return source_name != nullptr ? source_name : LOG_STR("a scheduled task");
return component != nullptr ? component->get_component_log_str() : LOG_STR("unknown");
}
};
// Common implementation for both timeout and interval
// name_type determines storage type: STATIC_STRING uses static_name, others use hash_or_id
// `source` is stored (in the union slot) only for SELF_POINTER items; ignored otherwise.
void set_timer_common_(Component *component, SchedulerItem::Type type, NameType name_type, const char *static_name,
uint32_t hash_or_id, uint32_t delay, std::function<void()> &&func, bool is_retry = false,
bool skip_cancel = false);
bool skip_cancel = false, const LogString *source = nullptr);
// Common implementation for retry - Remove before 2026.8.0
// name_type determines storage type: STATIC_STRING uses static_name, others use hash_or_id
@@ -402,8 +412,10 @@ class Scheduler {
// Fixes: https://github.com/esphome/esphome/issues/11940
if (item == nullptr)
return false;
if (item->component != component || item->type != type || (skip_removed && this->is_item_removed_locked_(item)) ||
(match_retry && !item->is_retry)) {
// get_component() is nullptr for SELF_POINTER items (their cancels pass nullptr too), so they
// match by the `this` key alone.
if (item->get_component() != component || item->type != type ||
(skip_removed && this->is_item_removed_locked_(item)) || (match_retry && !item->is_retry)) {
return false;
}
// Name type must match
@@ -423,11 +435,16 @@ class Scheduler {
// Helper to execute a scheduler item
uint32_t execute_item_(SchedulerItem *item, uint32_t now);
// Helper to check if item should be skipped
bool should_skip_item_(SchedulerItem *item) const {
return is_item_removed_(item) || (item->component != nullptr && item->component->is_failed());
// True if the item's component is failed (so it must not run). SELF_POINTER delays have no
// component (get_component() == nullptr) and always fire.
bool is_item_failed_(SchedulerItem *item) const {
Component *component = item->get_component();
return component != nullptr && component->is_failed();
}
// Helper to check if item should be skipped
bool should_skip_item_(SchedulerItem *item) const { return is_item_removed_(item) || this->is_item_failed_(item); }
// Helper to recycle a SchedulerItem back to the pool.
// Takes a raw pointer — caller transfers ownership. The item is either added to the
// pool or deleted if the pool is full.

View File

@@ -0,0 +1,22 @@
esphome:
name: scheduler-blocking-warning
on_boot:
then:
- script.execute: blocking_script
host:
api:
logger:
level: DEBUG
# The busy-block runs in the second delay's continuation; the warning must name the script. Two
# delays verify the source survives chained delays (the scheduler republishes it each continuation).
script:
- id: blocking_script
then:
- delay: 10ms
- delay: 10ms
- lambda: |-
const uint32_t start = millis();
while (millis() - start < 80) {
}

View File

@@ -0,0 +1,30 @@
esphome:
name: scheduler-blocking-generic
host:
api:
logger:
level: DEBUG
globals:
- id: done
type: bool
restore_value: false
initial_value: "false"
# A delay in a plain (non-script) automation has no owning script, so the block must log the
# generic "a scheduled task" label, not a script name.
interval:
- interval: 100ms
id: gen_interval
then:
- if:
condition:
lambda: "return !id(done);"
then:
- lambda: "id(done) = true;"
- delay: 10ms
- lambda: |-
const uint32_t start = millis();
while (millis() - start < 80) {
}

View File

@@ -0,0 +1,29 @@
esphome:
name: scheduler-delay-failed
host:
api:
logger:
level: DEBUG
globals:
- id: started
type: bool
restore_value: false
initial_value: "false"
# The interval marks itself failed, then schedules a delay. The delay must still fire: a failed
# component must not drop it, since the SELF_POINTER scheduler item has no owning component.
interval:
- interval: 100ms
id: host_interval
then:
- if:
condition:
lambda: "return !id(started);"
then:
- lambda: |-
id(started) = true;
id(host_interval)->mark_failed();
- delay: 200ms
- logger.log: "DELAY_FIRED_AFTER_FAIL"

View File

@@ -0,0 +1,120 @@
"""Integration tests for blocking-warning source attribution.
A blocking operation that runs inside a deferred scheduler continuation (e.g. after a ``delay``
in a script) used to be reported as ``<null> took a long time for an operation (NN ms),
max is 30 ms`` because the continuation carries no component. The warning should instead name
the owning script and report the real threshold (50 ms).
"""
from __future__ import annotations
import asyncio
import re
import pytest
from .types import APIClientConnectedFactory, RunCompiledFunction
# Matches: "<source> took a long time for an operation (NN ms), max is NN ms"
WARN_PATTERN = re.compile(
r"(\S+) took a long time for an operation \((\d+) ms\), max is (\d+) ms"
)
@pytest.mark.asyncio
async def test_scheduler_blocking_warning(
yaml_config: str,
run_compiled: RunCompiledFunction,
api_client_connected: APIClientConnectedFactory,
) -> None:
"""Deferred blocking work inside a script is attributed to the script, not "<null>"."""
loop = asyncio.get_running_loop()
warning_future: asyncio.Future[str] = loop.create_future()
def check_output(line: str) -> None:
if WARN_PATTERN.search(line) and not warning_future.done():
warning_future.set_result(line)
async with (
run_compiled(yaml_config, line_callback=check_output),
api_client_connected() as client,
):
device_info = await client.device_info()
assert device_info is not None
# on_boot runs the script, which defers via delay then busy-blocks > 50 ms in the
# continuation, tripping the blocking warning.
warning_line = await asyncio.wait_for(warning_future, timeout=10.0)
# Must name the owning script, not "<null>" and not the generic fallback.
assert "<null>" not in warning_line, (
f"Warning should name the script, got: {warning_line}"
)
assert "a scheduled task" not in warning_line, (
f"Warning should name the script, got: {warning_line}"
)
match = WARN_PATTERN.search(warning_line)
assert match is not None
assert match.group(1) == "blocking_script", (
f"Warning should name 'blocking_script', got: {warning_line}"
)
# The reported threshold must be the real default (50 ms), not the stale "30 ms".
assert match.group(3) == "50", f"Expected 'max is 50 ms', got: {warning_line}"
@pytest.mark.asyncio
async def test_scheduler_blocking_warning_generic_source(
yaml_config: str,
run_compiled: RunCompiledFunction,
api_client_connected: APIClientConnectedFactory,
) -> None:
"""A delay in a plain (non-script) automation logs the generic label, not a script name."""
loop = asyncio.get_running_loop()
warning_future: asyncio.Future[str] = loop.create_future()
def check_output(line: str) -> None:
if WARN_PATTERN.search(line) and not warning_future.done():
warning_future.set_result(line)
async with (
run_compiled(yaml_config, line_callback=check_output),
api_client_connected() as client,
):
assert await client.device_info() is not None
warning_line = await asyncio.wait_for(warning_future, timeout=10.0)
assert "a scheduled task took a long time" in warning_line, (
f"Non-script deferred work should log the generic label, got: {warning_line}"
)
assert "<null>" not in warning_line
match = WARN_PATTERN.search(warning_line)
assert match is not None and match.group(3) == "50", (
f"Expected 'max is 50 ms', got: {warning_line}"
)
@pytest.mark.asyncio
async def test_scheduler_delay_runs_on_failed_component(
yaml_config: str,
run_compiled: RunCompiledFunction,
api_client_connected: APIClientConnectedFactory,
) -> None:
"""A delay must still fire even when its context component is marked failed.
Deferred (SELF_POINTER) scheduler items have no owning component, so the scheduler's
failed-component skip must not drop them.
"""
loop = asyncio.get_running_loop()
fired: asyncio.Future[bool] = loop.create_future()
def check_output(line: str) -> None:
if "DELAY_FIRED_AFTER_FAIL" in line and not fired.done():
fired.set_result(True)
async with (
run_compiled(yaml_config, line_callback=check_output),
api_client_connected() as client,
):
assert await client.device_info() is not None
# If the failed host component wrongly dropped the delay, this times out.
await asyncio.wait_for(fired, timeout=10.0)