Compare commits

...

2 Commits

Author SHA1 Message Date
J. Nick Koston
3a4f67def8 [core] Fix delay on failed component being dropped; DRY the is_failed check
The is_failed() skip exists in two execution paths: the heap loop in call()
and should_skip_item_() (defer queue / delay:0). The previous commit only
exempted SELF_POINTER items from the heap-path check, so on multi-threaded
platforms a delay:0 continuation whose host component had failed would still
be silently dropped.

Extract a single is_item_failed_() helper (with the SELF_POINTER exemption)
and use it from both paths so they cannot drift again.

Add an integration test that schedules a delay from a component that marks
itself failed and asserts the continuation still fires (verified to fail
without the exemption).
2026-06-02 13:54:17 -05:00
J. Nick Koston
5b728f19c3 [core] Attribute "took a long time" blocking warning to its source
A blocking operation that runs inside a deferred scheduler continuation
(e.g. after a delay in a script/automation) was reported as:

    <null> took a long time for an operation (83 ms), max is 30 ms

Two problems:

* The DelayAction continuation carries no component (since #16129 dropped
  Component inheritance), so the warning had nothing to name and printed
  "<null>". Telling the user an anonymous delay action is blocking is not
  useful; naming the component that hosts the automation is.
* The threshold was hardcoded to "30 ms" but the real default is 50 ms
  (WARN_IF_BLOCKING_OVER_CS) and is adaptive per component.

DelayAction now records App.get_current_component() on the scheduler item,
so the warning names the component whose automation chain hit the delay
(falling back to "a scheduled task" when there is genuinely no current
component). This propagates across chained delays because the scheduler
restores the item's component as the current component before each callback.

For SELF_POINTER items the stored component is log-attribution only: the
key (the caller's `this`) is globally unique, so matches_item_locked_
ignores the component when matching and the is_failed() skip is bypassed.
This keeps delay cancellation (restart/parallel/stop) and always-fire
semantics unchanged.

The warning now reports the real (pre-ratchet) threshold instead of the
stale "30 ms".

Adds an integration test reproducing the deferred-block path via an
interval + delay + busy lambda and asserting the warning names a component
and reports "max is 50 ms".
2026-06-02 13:29:27 -05:00
8 changed files with 188 additions and 23 deletions

View File

@@ -198,7 +198,12 @@ template<typename... Ts> class DelayAction : public Action<Ts...> {
// to avoid overhead from capturing arguments by value
if constexpr (sizeof...(Ts) == 0) {
App.scheduler.set_timer_common_(
/* component= */ nullptr, Scheduler::SchedulerItem::TIMEOUT, Scheduler::NameType::SELF_POINTER,
// The component is stored for blocking-warning log attribution only (SELF_POINTER items
// match by `this`, so it does not affect cancellation). Capturing the current component
// lets the warning name the source instead of "<null>"; it propagates across chained
// delays because the scheduler restores it as the current component before each callback.
/* component= */ App.get_current_component(), Scheduler::SchedulerItem::TIMEOUT,
Scheduler::NameType::SELF_POINTER,
/* static_name= */ reinterpret_cast<const char *>(this), /* hash_or_id= */ 0, this->delay_.value(),
[this]() { this->play_next_(); },
/* is_retry= */ false, /* skip_cancel= */ this->num_running_ > 1);
@@ -209,7 +214,9 @@ template<typename... Ts> class DelayAction : public Action<Ts...> {
// are passed as non-const lvalues to play_next_(const Ts&...) where Ts may be `T&`
auto f = [this, x...]() mutable { this->play_next_(x...); };
App.scheduler.set_timer_common_(
/* component= */ nullptr, Scheduler::SchedulerItem::TIMEOUT, Scheduler::NameType::SELF_POINTER,
// See the no-argument branch above: component is captured for log attribution only.
/* component= */ App.get_current_component(), Scheduler::SchedulerItem::TIMEOUT,
Scheduler::NameType::SELF_POINTER,
/* static_name= */ reinterpret_cast<const char *>(this), /* hash_or_id= */ 0, this->delay_.value(x...),
std::move(f),
/* is_retry= */ false, /* skip_cancel= */ this->num_running_ > 1);

View File

@@ -258,9 +258,11 @@ void Component::call() {
break;
}
}
bool Component::should_warn_of_blocking(uint32_t blocking_time) {
bool Component::should_warn_of_blocking(uint32_t blocking_time, uint32_t &threshold_ms_out) {
// Convert centisecond threshold to milliseconds for comparison
uint32_t threshold_ms = static_cast<uint32_t>(this->warn_if_blocking_over_) * 10U;
// Report the threshold that was exceeded (before any ratcheting below) so the warning is accurate.
threshold_ms_out = threshold_ms;
if (blocking_time > threshold_ms) {
// Set new threshold: blocking_time + increment, converted back to centiseconds
uint32_t new_threshold_ms = blocking_time + WARN_IF_BLOCKING_INCREMENT_MS;
@@ -493,17 +495,17 @@ uint64_t ComponentRuntimeStats::global_recorded_us = 0; // NOLINT(cppcoreguidel
void __attribute__((noinline, cold))
WarnIfComponentBlockingGuard::warn_blocking(Component *component, uint32_t blocking_time) {
bool should_warn;
if (component != nullptr) {
should_warn = component->should_warn_of_blocking(blocking_time);
} else {
should_warn = true; // Already checked > WARN_IF_BLOCKING_OVER_MS in caller
}
if (should_warn) {
ESP_LOGW(TAG, "%s took a long time for an operation (%" PRIu32 " ms), max is 30 ms",
component == nullptr ? LOG_STR_LITERAL("<null>") : LOG_STR_ARG(component->get_component_log_str()),
blocking_time);
// Default threshold for the null path (no component to consult); the caller already checked
// blocking_time > WARN_IF_BLOCKING_OVER_MS, so always warn in that case.
uint32_t threshold_ms = WARN_IF_BLOCKING_OVER_MS;
if (component != nullptr && !component->should_warn_of_blocking(blocking_time, threshold_ms)) {
return; // Component's (possibly ratcheted) threshold not exceeded yet
}
// A null component means the work ran in a scheduler continuation with no associated component
// (e.g. a delay inside a script/automation), so report it as a scheduled task rather than "<null>".
ESP_LOGW(TAG, "%s took a long time for an operation (%" PRIu32 " ms), max is %" PRIu32 " ms",
component == nullptr ? LOG_STR_LITERAL("a scheduled task") : LOG_STR_ARG(component->get_component_log_str()),
blocking_time, threshold_ms);
}
#ifdef USE_SETUP_PRIORITY_OVERRIDE

View File

@@ -326,7 +326,7 @@ class Component {
return component_source_lookup(this->component_source_index_);
}
bool should_warn_of_blocking(uint32_t blocking_time);
bool should_warn_of_blocking(uint32_t blocking_time, uint32_t &threshold_ms_out);
protected:
friend class Application;

View File

@@ -642,8 +642,8 @@ uint32_t HOT Scheduler::call(uint32_t now) {
// Not reached timeout yet, done for this call
break;
}
// Don't run on failed components
if (item->component != nullptr && item->component->is_failed()) {
// Don't run on failed components (see is_item_failed_ for the SELF_POINTER exception).
if (this->is_item_failed_(item)) {
LockGuard guard{this->lock_};
this->recycle_item_main_loop_(this->pop_raw_locked_());
continue;

View File

@@ -402,7 +402,7 @@ class Scheduler {
// Fixes: https://github.com/esphome/esphome/issues/11940
if (item == nullptr)
return false;
if (item->component != component || item->type != type || (skip_removed && this->is_item_removed_locked_(item)) ||
if (item->type != type || (skip_removed && this->is_item_removed_locked_(item)) ||
(match_retry && !item->is_retry)) {
return false;
}
@@ -411,23 +411,35 @@ class Scheduler {
return false;
// STATIC_STRING: compare string content. SELF_POINTER: raw pointer equality (no strcmp).
// Other types: compare hash/ID value.
if (name_type == NameType::SELF_POINTER) {
// SELF_POINTER keys are globally unique (the caller's `this`), so the stored component is
// log-attribution only (e.g. DelayAction records the current component for blocking
// warnings) and must NOT participate in matching. Match by pointer equality alone.
return item->name_.static_name == static_name;
}
// All other name types must also match on component identity.
if (item->component != component)
return false;
if (name_type == NameType::STATIC_STRING) {
return this->names_match_static_(item->get_name(), static_name);
}
if (name_type == NameType::SELF_POINTER) {
return item->name_.static_name == static_name;
}
return item->get_name_hash_or_id() == hash_or_id;
}
// Helper to execute a scheduler item
uint32_t execute_item_(SchedulerItem *item, uint32_t now);
// Helper to check if item should be skipped
bool should_skip_item_(SchedulerItem *item) const {
return is_item_removed_(item) || (item->component != nullptr && item->component->is_failed());
// True if the item belongs to a failed component and should therefore not run.
// SELF_POINTER items (e.g. DelayAction) store the component for log attribution only, so they
// must always fire regardless of that component's failed state and are never skipped here.
bool is_item_failed_(SchedulerItem *item) const {
return item->component != nullptr && item->get_name_type() != NameType::SELF_POINTER &&
item->component->is_failed();
}
// Helper to check if item should be skipped
bool should_skip_item_(SchedulerItem *item) const { return is_item_removed_(item) || this->is_item_failed_(item); }
// Helper to recycle a SchedulerItem back to the pool.
// Takes a raw pointer — caller transfers ownership. The item is either added to the
// pool or deleted if the pool is full.

View File

@@ -0,0 +1,21 @@
esphome:
name: scheduler-blocking-warning
host:
api:
logger:
level: DEBUG
# An interval fires via the scheduler (so the current component is the interval),
# defers through a delay, then busy-blocks well over the 50 ms warn threshold inside
# the deferred continuation. The blocking warning must be attributed to the interval
# component (captured at schedule time) instead of "<null>".
interval:
- interval: 500ms
then:
- delay: 10ms
- lambda: |-
const uint32_t start = millis();
// Spin for longer than WARN_IF_BLOCKING_OVER_MS (50 ms) to trip the warning.
while (millis() - start < 80) {
}

View File

@@ -0,0 +1,32 @@
esphome:
name: scheduler-delay-failed
host:
api:
logger:
level: DEBUG
globals:
- id: started
type: bool
restore_value: false
initial_value: "false"
# The interval fires with itself as the current component, schedules a delay (which
# captures that component for log attribution), then marks itself failed. The delay
# continuation must still fire: a failed component must not drop an already-scheduled
# delay, because the SELF_POINTER scheduler item stores the component for attribution
# only.
interval:
- interval: 100ms
id: host_interval
then:
- if:
condition:
lambda: "return !id(started);"
then:
- lambda: |-
id(started) = true;
id(host_interval)->mark_failed();
- delay: 200ms
- logger.log: "DELAY_FIRED_AFTER_FAIL"

View File

@@ -0,0 +1,91 @@
"""Integration test for blocking-warning source attribution.
A blocking operation that runs inside a deferred scheduler continuation (e.g. after
a ``delay`` in a script/automation) used to be reported as
``<null> took a long time for an operation (NN ms), max is 30 ms`` because the
continuation carries no component. The warning should instead name the component that
was current when the delay was scheduled and report the real threshold (50 ms).
"""
from __future__ import annotations
import asyncio
import re
import pytest
from .types import APIClientConnectedFactory, RunCompiledFunction
# Matches: "<source> took a long time for an operation (NN ms), max is NN ms"
WARN_PATTERN = re.compile(
r"took a long time for an operation \((\d+) ms\), max is (\d+) ms"
)
@pytest.mark.asyncio
async def test_scheduler_blocking_warning(
yaml_config: str,
run_compiled: RunCompiledFunction,
api_client_connected: APIClientConnectedFactory,
) -> None:
"""Deferred blocking work is attributed to a real component, not "<null>"."""
loop = asyncio.get_running_loop()
warning_future: asyncio.Future[str] = loop.create_future()
def check_output(line: str) -> None:
if WARN_PATTERN.search(line) and not warning_future.done():
warning_future.set_result(line)
async with (
run_compiled(yaml_config, line_callback=check_output),
api_client_connected() as client,
):
device_info = await client.device_info()
assert device_info is not None
# The interval fires, defers via delay, then busy-blocks > 50 ms in the
# continuation, which should trip the blocking warning.
warning_line = await asyncio.wait_for(warning_future, timeout=10.0)
# The deferred block must be attributed to a real component, not "<null>".
assert "<null>" not in warning_line, (
f"Warning should name a component, got: {warning_line}"
)
# The delay was scheduled from a known component (the interval), so the warning
# must name it rather than falling back to the generic scheduled-task label.
assert "a scheduled task" not in warning_line, (
f"Warning should name the interval component, got: {warning_line}"
)
# The reported threshold must be the real default (50 ms), not the stale "30 ms".
match = WARN_PATTERN.search(warning_line)
assert match is not None
assert match.group(2) == "50", f"Expected 'max is 50 ms', got: {warning_line}"
@pytest.mark.asyncio
async def test_scheduler_delay_runs_on_failed_component(
yaml_config: str,
run_compiled: RunCompiledFunction,
api_client_connected: APIClientConnectedFactory,
) -> None:
"""A delay must still fire even when its host component is marked failed.
DelayAction records the current component on its scheduler item purely for log
attribution. That component must not gate execution: the scheduler skips items
belonging to failed components, but SELF_POINTER items (delays) are exempt. This
guards the is_item_failed_() exception on both the heap and defer-queue paths.
"""
loop = asyncio.get_running_loop()
fired: asyncio.Future[bool] = loop.create_future()
def check_output(line: str) -> None:
if "DELAY_FIRED_AFTER_FAIL" in line and not fired.done():
fired.set_result(True)
async with (
run_compiled(yaml_config, line_callback=check_output),
api_client_connected() as client,
):
assert await client.device_info() is not None
# If the failed host component wrongly dropped the delay, this times out.
await asyncio.wait_for(fired, timeout=10.0)