[bk72xx] Add crash-on-previous-boot detection

Brings BK72XX to parity with esp32/esp8266/rp2040: register snapshot +
stack-scanned backtrace are captured during the SDK's exception traps
(undefined instruction / prefetch abort / data abort), persisted across
the watchdog reset in a .noinit RAM region, then logged on the next
boot via the logger and API log subscription paths.

The capture path uses linker --wrap on bk_trap_udef/pabt/dabt — the
trap symbols are defined in LibreTiny's intc.c fixup but called from
the closed-source Beken SDK across translation units, so --wrap takes
effect cleanly. The original SDK behavior (UART register dump +
bk_cpu_shutdown → watchdog reset) is preserved by tail-calling the
real handlers after capture.

A small extra_script (patch_bk72xx_noinit.py) injects a .noinit
section between .bss and _empty_ram in the generated linker script
so the crash record is not zeroed by the C runtime startup and the
heap shrinks to fit rather than overlapping it.
This commit is contained in:
J. Nick Koston
2026-04-30 12:50:16 -05:00
parent 61261b4a59
commit 8fc2e1e542
8 changed files with 358 additions and 5 deletions

View File

@@ -23,6 +23,9 @@
#ifdef USE_ESP8266_CRASH_HANDLER
#include "esphome/components/esp8266/crash_handler.h"
#endif
#ifdef USE_BK72XX_CRASH_HANDLER
#include "esphome/components/bk72xx/crash_handler.h"
#endif
#include "esphome/core/entity_base.h"
#include "esphome/core/string_ref.h"
@@ -283,6 +286,9 @@ class APIConnection final : public APIServerConnectionBase {
#endif
#ifdef USE_ESP8266_CRASH_HANDLER
esp8266::crash_handler_log();
#endif
#ifdef USE_BK72XX_CRASH_HANDLER
bk72xx::crash_handler_log();
#endif
}
#ifdef USE_API_HOMEASSISTANT_SERVICES

View File

@@ -0,0 +1,214 @@
#ifdef USE_BK72XX
#include "esphome/core/defines.h"
#ifdef USE_BK72XX_CRASH_HANDLER
#include "crash_handler.h"
#include "esphome/core/log.h"
#include <cinttypes>
#include <cstdint>
#include <cstring>
// ARM968E-S register snapshot passed to the SDK trap handlers (matches the
// layout used by bk_show_register() in LibreTiny's intc.c fixup).
extern "C" struct arm_registers {
uint32_t r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10;
uint32_t fp;
uint32_t ip;
uint32_t sp;
uint32_t lr;
uint32_t pc;
uint32_t spsr;
uint32_t cpsr;
};
static constexpr uint32_t CRASH_MAGIC = 0xDEADBEEF;
static constexpr uint32_t CRASH_DATA_VERSION = 1;
static constexpr size_t MAX_BACKTRACE = 16;
// BK72XX exception types (matches the order in which we wrap the SDK traps).
enum CrashException : uint8_t {
CRASH_EXC_NONE = 0,
CRASH_EXC_UNDEF = 1, // Undefined instruction
CRASH_EXC_PABT = 2, // Prefetch abort
CRASH_EXC_DABT = 3, // Data abort
};
// Persistent crash record. Lives in .noinit so it survives the SDK's watchdog
// reset that follows bk_cpu_shutdown(). Validated by magic + version on the
// next boot.
struct CrashData {
uint32_t magic;
uint32_t version;
uint32_t pc;
uint32_t lr;
uint32_t sp;
uint32_t cpsr;
uint8_t exception;
uint8_t backtrace_count;
uint16_t reserved;
uint32_t backtrace[MAX_BACKTRACE];
};
// Placed in .noinit so it survives the watchdog reset that follows
// bk_cpu_shutdown(). The libretiny linker fragment (patch_bk72xx_noinit.py)
// inserts a .noinit section between .bss and _empty_ram so this region is
// not zeroed by the C runtime startup.
static CrashData s_raw_crash_data __attribute__((section(".noinit"), used));
// Whether crash data was found and validated this boot. Lives in .bss
// (zero-initialized at startup) — set by crash_handler_read_and_clear().
static bool s_crash_data_valid = false; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)
// BK72XX flash code is mapped at the chip-specific BKOFFSET_APP. Real code
// addresses always live above 0x00000000 in the flash region; SRAM ends at
// 0x00440000. We accept any address in the flash window (broad bound) and
// reject obvious non-code values.
static constexpr uint32_t BK72XX_FLASH_START = 0x00010000; // BKRBL header end (~min app offset)
static constexpr uint32_t BK72XX_FLASH_END = 0x00200000; // 2MB cap (largest typical flash)
static inline bool is_code_addr(uint32_t addr) {
// ARM968 instructions are 4-byte aligned; reject obviously bogus values.
if ((addr & 0x3) != 0)
return false;
return addr >= BK72XX_FLASH_START && addr < BK72XX_FLASH_END;
}
// SRAM bounds for stack-scan validity. RAM = 0x00400100 to 0x00440000 (256KB).
static constexpr uint32_t BK72XX_RAM_START = 0x00400100;
static constexpr uint32_t BK72XX_RAM_END = 0x00440000;
static inline bool is_valid_stack_ptr(uint32_t sp) {
return (sp & 0x3) == 0 && sp >= BK72XX_RAM_START && sp < BK72XX_RAM_END;
}
// Walk the stack starting at `sp` and capture up to `max` code-looking
// 32-bit values into `out`. Skips entries equal to the fault PC so it isn't
// reported twice. Returns the number of entries captured.
static uint8_t scan_backtrace(uint32_t sp, uint32_t pc, uint32_t *out, uint8_t max) {
if (!is_valid_stack_ptr(sp))
return 0;
uint8_t count = 0;
// Limit the scan to 256 words (1KB) — covers typical nested call frames
// without dredging up too many stale stack values.
const auto *scan = reinterpret_cast<const uint32_t *>(sp);
const auto *end = reinterpret_cast<const uint32_t *>(BK72XX_RAM_END);
const uint32_t *limit = scan + 256;
if (limit > end)
limit = end;
for (; scan < limit && count < max; scan++) {
uint32_t val = *scan;
if (is_code_addr(val) && val != pc) {
out[count++] = val;
}
}
return count;
}
namespace esphome::bk72xx {
static const char *const TAG = "bk72xx.crash";
void crash_handler_read_and_clear() {
if (s_raw_crash_data.magic == CRASH_MAGIC && s_raw_crash_data.version == CRASH_DATA_VERSION) {
s_crash_data_valid = true;
if (s_raw_crash_data.backtrace_count > MAX_BACKTRACE)
s_raw_crash_data.backtrace_count = MAX_BACKTRACE;
if (s_raw_crash_data.exception > CRASH_EXC_DABT)
s_raw_crash_data.exception = CRASH_EXC_NONE;
}
// Clear magic so the next boot doesn't replay this crash. s_crash_data_valid
// remains true so additional API clients connecting in this boot session
// still see the log via crash_handler_log().
s_raw_crash_data.magic = 0;
}
bool crash_handler_has_data() { return s_crash_data_valid; }
static const char *get_exception_name(uint8_t exc) {
switch (exc) {
case CRASH_EXC_UNDEF:
return "Undefined instruction";
case CRASH_EXC_PABT:
return "Prefetch abort";
case CRASH_EXC_DABT:
return "Data abort";
default:
return "Unknown";
}
}
// Intentionally uses separate ESP_LOGE calls per line instead of combining into
// one multi-line log message. This ensures each address appears as its own line
// on the serial console, making it possible to see partial output if the device
// crashes again during boot, and allowing the CLI's process_stacktrace to match
// and decode each address individually.
void crash_handler_log() {
if (!s_crash_data_valid)
return;
ESP_LOGE(TAG, "*** CRASH DETECTED ON PREVIOUS BOOT ***");
ESP_LOGE(TAG, " Reason: %s", get_exception_name(s_raw_crash_data.exception));
ESP_LOGE(TAG, " PC: 0x%08" PRIX32 " (fault location)", s_raw_crash_data.pc);
ESP_LOGE(TAG, " LR: 0x%08" PRIX32 " (return address)", s_raw_crash_data.lr);
ESP_LOGE(TAG, " SP: 0x%08" PRIX32, s_raw_crash_data.sp);
ESP_LOGE(TAG, " CPSR:0x%08" PRIX32, s_raw_crash_data.cpsr);
for (uint8_t i = 0; i < s_raw_crash_data.backtrace_count; i++) {
ESP_LOGE(TAG, " BT%d: 0x%08" PRIX32 " (stack scan)", i, s_raw_crash_data.backtrace[i]);
}
// Build addr2line hint with all captured addresses for easy copy-paste.
char hint[160];
int pos = snprintf(hint, sizeof(hint), "Use: addr2line -pfiaC -e firmware.elf 0x%08" PRIX32 " 0x%08" PRIX32,
s_raw_crash_data.pc, s_raw_crash_data.lr);
for (uint8_t i = 0; i < s_raw_crash_data.backtrace_count && pos < (int) sizeof(hint) - 12; i++) {
pos += snprintf(hint + pos, sizeof(hint) - pos, " 0x%08" PRIX32, s_raw_crash_data.backtrace[i]);
}
ESP_LOGE(TAG, "%s", hint);
}
} // namespace esphome::bk72xx
// --- BK72XX trap-handler wrappers ---
// LibreTiny's fixup intc.c defines bk_trap_udef/pabt/dabt; the closed-source
// Beken SDK calls them from do_undefined/pabort/dabort after pushing a full
// arm_registers frame. We intercept the call via -Wl,--wrap so the original
// behavior (register dump on UART + bk_cpu_shutdown → watchdog reset) still
// runs after we've snapshotted the registers and a stack-scan backtrace.
extern "C" {
// NOLINTBEGIN(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp,readability-identifier-naming)
extern void __real_bk_trap_udef(struct arm_registers *regs);
extern void __real_bk_trap_pabt(struct arm_registers *regs);
extern void __real_bk_trap_dabt(struct arm_registers *regs);
static void capture_crash(uint8_t exception, struct arm_registers *regs) {
s_raw_crash_data.pc = regs->pc;
s_raw_crash_data.lr = regs->lr;
s_raw_crash_data.sp = regs->sp;
s_raw_crash_data.cpsr = regs->cpsr;
s_raw_crash_data.exception = exception;
s_raw_crash_data.backtrace_count = scan_backtrace(regs->sp, regs->pc, s_raw_crash_data.backtrace, MAX_BACKTRACE);
// Write version + magic last so a partial write is invalidated.
s_raw_crash_data.version = CRASH_DATA_VERSION;
s_raw_crash_data.magic = CRASH_MAGIC;
}
void __wrap_bk_trap_udef(struct arm_registers *regs) {
capture_crash(CRASH_EXC_UNDEF, regs);
__real_bk_trap_udef(regs);
}
void __wrap_bk_trap_pabt(struct arm_registers *regs) {
capture_crash(CRASH_EXC_PABT, regs);
__real_bk_trap_pabt(regs);
}
void __wrap_bk_trap_dabt(struct arm_registers *regs) {
capture_crash(CRASH_EXC_DABT, regs);
__real_bk_trap_dabt(regs);
}
// NOLINTEND(bugprone-reserved-identifier,cert-dcl37-c,cert-dcl51-cpp,readability-identifier-naming)
} // extern "C"
#endif // USE_BK72XX_CRASH_HANDLER
#endif // USE_BK72XX

View File

@@ -0,0 +1,26 @@
#pragma once
#ifdef USE_BK72XX
#include "esphome/core/defines.h"
#ifdef USE_BK72XX_CRASH_HANDLER
namespace esphome::bk72xx {
/// Read crash data from .noinit memory and clear the magic marker so the
/// next boot doesn't re-report it. Crash data is copied into a static and
/// remains accessible via crash_handler_log() / crash_handler_has_data() for
/// the rest of this boot session.
void crash_handler_read_and_clear();
/// Log crash data if a crash was detected on previous boot.
void crash_handler_log();
/// Returns true if crash data was found this boot.
bool crash_handler_has_data();
} // namespace esphome::bk72xx
#endif // USE_BK72XX_CRASH_HANDLER
#endif // USE_BK72XX

View File

@@ -479,6 +479,16 @@ async def component_to_code(config):
# RAM-executable output section and prints a post-link placement summary.
if FAMILY_COMPONENT[config[CONF_FAMILY]] != COMPONENT_BK72XX:
cg.add_platformio_option("extra_scripts", ["pre:patch_linker.py"])
else:
# BK72XX: wrap the SDK's exception trap handlers (defined in LibreTiny's
# intc.c fixup, called from the closed-source SDK's do_undefined/pabort/
# dabort) so we can capture register state + a stack-scan backtrace into
# a .noinit region before the SDK's own register dump + bk_cpu_shutdown
# chain runs. The .noinit region is injected into the generated linker
# script by patch_bk72xx_noinit.py so it survives the watchdog reset.
for sym in ("bk_trap_udef", "bk_trap_pabt", "bk_trap_dabt"):
cg.add_build_flag(f"-Wl,--wrap={sym}")
cg.add_platformio_option("extra_scripts", ["pre:patch_bk72xx_noinit.py"])
# dummy version code
cg.add_define("USE_ARDUINO_VERSION_CODE", cg.RawExpression("VERSION_CODE(0, 0, 0)"))
# decrease web server stack size (16k words -> 4k words)
@@ -568,8 +578,11 @@ async def component_to_code(config):
# Called by writer.py
def copy_files() -> None:
script_dir = Path(__file__).parent
patch_linker_file = script_dir / "patch_linker.py.script"
copy_file_if_changed(
patch_linker_file,
CORE.relative_build_path("patch_linker.py"),
)
for src_name, dst_name in (
("patch_linker.py.script", "patch_linker.py"),
("patch_bk72xx_noinit.py.script", "patch_bk72xx_noinit.py"),
):
copy_file_if_changed(
script_dir / src_name,
CORE.relative_build_path(dst_name),
)

View File

@@ -4,6 +4,10 @@
#include "esphome/core/hal.h"
#include "preferences.h"
#ifdef USE_BK72XX_CRASH_HANDLER
#include "esphome/components/bk72xx/crash_handler.h"
#endif
#include <FreeRTOS.h>
#include <task.h>
@@ -19,6 +23,11 @@ namespace esphome {
// inlined in core/hal/hal_libretiny.h.
void arch_init() {
#ifdef USE_BK72XX_CRASH_HANDLER
// Snapshot any crash data left in .noinit by the previous boot before the
// watchdog or anything else can perturb it.
bk72xx::crash_handler_read_and_clear();
#endif
libretiny::setup_preferences();
lt_wdt_enable(10000L);
#ifdef USE_BK72XX

View File

@@ -0,0 +1,74 @@
# pylint: disable=E0602
Import("env") # noqa
import os
import re
# BK72XX crash handler needs a region of RAM that is not zero-initialized
# by the C runtime startup (boot_handlers_bk7231u.S _sysboot_zi_init zeros
# from _bss_start to _bss_end). The stock linker script has no .noinit
# section, so we inject one between .bss and _empty_ram in the generated
# linker script: that places the section after _bss_end (skipping the
# zero-init range) and before the heap start, so the heap shrinks to fit
# rather than overlapping our crash record.
_MARKER = "/* esphome .noinit */"
# Insert a NOLOAD .noinit section right after the .bss block. NOLOAD prevents
# the linker from emitting any LMA copy data for it.
_NOINIT_BLOCK = (
"\n"
"\t.noinit ALIGN(8) (NOLOAD) :\n"
"\t{\n"
"\t\t_noinit_start = .;\n"
"\t\t*(.noinit*)\n"
"\t\t. = ALIGN(8);\n"
"\t\t_noinit_end = .;\n"
"\t} > ram\n"
"\n"
+ "\t" + _MARKER + "\n"
)
# Match the closing brace + memory region of the .bss section, capturing
# everything up through "} > ram" so we can append our block after it.
_BSS_END = re.compile(
r"(\.bss\s+ALIGN\(8\)\s*:\s*\{[^}]*\}\s*>\s*ram\b[^\n]*\n)",
re.DOTALL,
)
def _patch(content):
if _MARKER in content:
return content
return _BSS_END.sub(r"\1" + _NOINIT_BLOCK, content, count=1)
def _pre_link(target, source, env):
build_dir = env.subst("$BUILD_DIR")
if not os.path.isdir(build_dir):
return
patched = 0
for name in os.listdir(build_dir):
if not name.endswith(".ld"):
continue
path = os.path.join(build_dir, name)
with open(path, "r", encoding="utf-8") as fh:
original = fh.read()
if _MARKER in original:
patched += 1
continue
content = _patch(original)
if content != original:
with open(path, "w", encoding="utf-8") as fh:
fh.write(content)
print("ESPHome: injected .noinit into {} for crash handler".format(name))
patched += 1
if not patched:
raise RuntimeError(
"ESPHome: could not inject .noinit into any BK72XX linker script. "
"The crash handler relies on a non-zeroed RAM region; without it, "
"previous-boot crash data would be wiped at startup. Update "
"patch_bk72xx_noinit.py.script — the .bss matcher likely no longer "
"matches the current LibreTiny linker template."
)
env.AddPreAction("$BUILD_DIR/${PROGNAME}.elf", _pre_link)

View File

@@ -1,5 +1,9 @@
#ifdef USE_LIBRETINY
#include "logger.h"
#include "esphome/core/defines.h"
#ifdef USE_BK72XX_CRASH_HANDLER
#include "esphome/components/bk72xx/crash_handler.h"
#endif
namespace esphome::logger {
@@ -47,6 +51,9 @@ void Logger::pre_setup() {
global_logger = this;
ESP_LOGI(TAG, "Log initialized");
#ifdef USE_BK72XX_CRASH_HANDLER
bk72xx::crash_handler_log();
#endif
}
const LogString *Logger::get_uart_selection_() {

View File

@@ -420,6 +420,10 @@
#define ESPHOME_TASK_LOG_BUFFER_SIZE 768
#endif
#ifdef USE_BK72XX
#define USE_BK72XX_CRASH_HANDLER
#endif
#ifdef USE_HOST
#define USE_HTTP_REQUEST_RESPONSE
#define USE_SOCKET_IMPL_BSD_SOCKETS