mirror of
https://github.com/esphome/esphome.git
synced 2026-06-24 12:33:10 +00:00
[core] Fix delay on failed component being dropped; DRY the is_failed check
The is_failed() skip exists in two execution paths: the heap loop in call() and should_skip_item_() (defer queue / delay:0). The previous commit only exempted SELF_POINTER items from the heap-path check, so on multi-threaded platforms a delay:0 continuation whose host component had failed would still be silently dropped. Extract a single is_item_failed_() helper (with the SELF_POINTER exemption) and use it from both paths so they cannot drift again. Add an integration test that schedules a delay from a component that marks itself failed and asserts the continuation still fires (verified to fail without the exemption).
This commit is contained in:
@@ -642,10 +642,8 @@ uint32_t HOT Scheduler::call(uint32_t now) {
|
|||||||
// Not reached timeout yet, done for this call
|
// Not reached timeout yet, done for this call
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// Don't run on failed components.
|
// Don't run on failed components (see is_item_failed_ for the SELF_POINTER exception).
|
||||||
// SELF_POINTER items (e.g. DelayAction) store the component for log attribution only and
|
if (this->is_item_failed_(item)) {
|
||||||
// must always fire regardless of that component's failed state, so skip the check for them.
|
|
||||||
if (item->component != nullptr && item->get_name_type() != NameType::SELF_POINTER && item->component->is_failed()) {
|
|
||||||
LockGuard guard{this->lock_};
|
LockGuard guard{this->lock_};
|
||||||
this->recycle_item_main_loop_(this->pop_raw_locked_());
|
this->recycle_item_main_loop_(this->pop_raw_locked_());
|
||||||
continue;
|
continue;
|
||||||
|
|||||||
@@ -429,11 +429,17 @@ class Scheduler {
|
|||||||
// Helper to execute a scheduler item
|
// Helper to execute a scheduler item
|
||||||
uint32_t execute_item_(SchedulerItem *item, uint32_t now);
|
uint32_t execute_item_(SchedulerItem *item, uint32_t now);
|
||||||
|
|
||||||
// Helper to check if item should be skipped
|
// True if the item belongs to a failed component and should therefore not run.
|
||||||
bool should_skip_item_(SchedulerItem *item) const {
|
// SELF_POINTER items (e.g. DelayAction) store the component for log attribution only, so they
|
||||||
return is_item_removed_(item) || (item->component != nullptr && item->component->is_failed());
|
// must always fire regardless of that component's failed state and are never skipped here.
|
||||||
|
bool is_item_failed_(SchedulerItem *item) const {
|
||||||
|
return item->component != nullptr && item->get_name_type() != NameType::SELF_POINTER &&
|
||||||
|
item->component->is_failed();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Helper to check if item should be skipped
|
||||||
|
bool should_skip_item_(SchedulerItem *item) const { return is_item_removed_(item) || this->is_item_failed_(item); }
|
||||||
|
|
||||||
// Helper to recycle a SchedulerItem back to the pool.
|
// Helper to recycle a SchedulerItem back to the pool.
|
||||||
// Takes a raw pointer — caller transfers ownership. The item is either added to the
|
// Takes a raw pointer — caller transfers ownership. The item is either added to the
|
||||||
// pool or deleted if the pool is full.
|
// pool or deleted if the pool is full.
|
||||||
|
|||||||
@@ -0,0 +1,32 @@
|
|||||||
|
esphome:
|
||||||
|
name: scheduler-delay-failed
|
||||||
|
|
||||||
|
host:
|
||||||
|
api:
|
||||||
|
logger:
|
||||||
|
level: DEBUG
|
||||||
|
|
||||||
|
globals:
|
||||||
|
- id: started
|
||||||
|
type: bool
|
||||||
|
restore_value: false
|
||||||
|
initial_value: "false"
|
||||||
|
|
||||||
|
# The interval fires with itself as the current component, schedules a delay (which
|
||||||
|
# captures that component for log attribution), then marks itself failed. The delay
|
||||||
|
# continuation must still fire: a failed component must not drop an already-scheduled
|
||||||
|
# delay, because the SELF_POINTER scheduler item stores the component for attribution
|
||||||
|
# only.
|
||||||
|
interval:
|
||||||
|
- interval: 100ms
|
||||||
|
id: host_interval
|
||||||
|
then:
|
||||||
|
- if:
|
||||||
|
condition:
|
||||||
|
lambda: "return !id(started);"
|
||||||
|
then:
|
||||||
|
- lambda: |-
|
||||||
|
id(started) = true;
|
||||||
|
id(host_interval)->mark_failed();
|
||||||
|
- delay: 200ms
|
||||||
|
- logger.log: "DELAY_FIRED_AFTER_FAIL"
|
||||||
@@ -60,3 +60,32 @@ async def test_scheduler_blocking_warning(
|
|||||||
match = WARN_PATTERN.search(warning_line)
|
match = WARN_PATTERN.search(warning_line)
|
||||||
assert match is not None
|
assert match is not None
|
||||||
assert match.group(2) == "50", f"Expected 'max is 50 ms', got: {warning_line}"
|
assert match.group(2) == "50", f"Expected 'max is 50 ms', got: {warning_line}"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_scheduler_delay_runs_on_failed_component(
|
||||||
|
yaml_config: str,
|
||||||
|
run_compiled: RunCompiledFunction,
|
||||||
|
api_client_connected: APIClientConnectedFactory,
|
||||||
|
) -> None:
|
||||||
|
"""A delay must still fire even when its host component is marked failed.
|
||||||
|
|
||||||
|
DelayAction records the current component on its scheduler item purely for log
|
||||||
|
attribution. That component must not gate execution: the scheduler skips items
|
||||||
|
belonging to failed components, but SELF_POINTER items (delays) are exempt. This
|
||||||
|
guards the is_item_failed_() exception on both the heap and defer-queue paths.
|
||||||
|
"""
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
fired: asyncio.Future[bool] = loop.create_future()
|
||||||
|
|
||||||
|
def check_output(line: str) -> None:
|
||||||
|
if "DELAY_FIRED_AFTER_FAIL" in line and not fired.done():
|
||||||
|
fired.set_result(True)
|
||||||
|
|
||||||
|
async with (
|
||||||
|
run_compiled(yaml_config, line_callback=check_output),
|
||||||
|
api_client_connected() as client,
|
||||||
|
):
|
||||||
|
assert await client.device_info() is not None
|
||||||
|
# If the failed host component wrongly dropped the delay, this times out.
|
||||||
|
await asyncio.wait_for(fired, timeout=10.0)
|
||||||
|
|||||||
Reference in New Issue
Block a user