Compare commits

...

4 Commits

Author SHA1 Message Date
J. Nick Koston
5e881738da [api] Add speed_optimized proto option for hot encode paths
Add a new (speed_optimized) message option that emits
__attribute__((optimize("O2"))) on the generated encode() and
calculate_size() methods. Under -Os, GCC does not inline the small
ProtoEncode helpers (write_raw_byte, encode_varint, etc.) into the
generated methods, causing significant overhead on hot paths.

Apply to SensorStateResponse and BluetoothLERawAdvertisementsResponse
which are the highest-frequency encode paths.
2026-04-12 19:12:31 -10:00
J. Nick Koston
5a250cc74f [api] Compile noise-c and libsodium with -O2 for speed
Crypto libraries are CPU-bound and benefit significantly from speed
optimization over the default -Os. Add a post: extra_script that
appends -O2 to noise-c and libsodium build flags when API noise
encryption is enabled. GCC uses the last -O flag, so this overrides
the global -Os for these libraries only.
2026-04-12 19:03:21 -10:00
J. Nick Koston
02f828fcbf [benchmark] Use -Os to match firmware optimization level
CodSpeed benchmarks were building with -O2, while all firmware
targets (ESP8266, ESP32, LibreTiny) use -Os. This mismatch means
the benchmarks cannot detect inlining regressions that affect real
devices — GCC under -O2 inlines functions that -Os outlines due to
its size-conscious cost model.

Switch to -Os with -ffunction-sections/-fdata-sections for proper
dead-code stripping (needed because -Os preserves references that
-O2 optimizes away at compile time).
2026-04-12 18:37:50 -10:00
J. Nick Koston
ab64916c37 [benchmark] Use -Os to match firmware optimization level
CodSpeed benchmarks were building with -O2, while all firmware
targets (ESP8266, ESP32, LibreTiny) use -Os. This mismatch means
the benchmarks cannot detect inlining regressions that affect real
devices — GCC under -O2 inlines functions that -Os outlines due to
its size-conscious cost model.

Remove the -Os unflag and -O2 override so benchmarks use the
platform default -Os, matching what actually runs on devices.
2026-04-12 18:32:03 -10:00
7 changed files with 46 additions and 10 deletions

View File

@@ -1,5 +1,6 @@
import base64
import logging
import pathlib
from esphome import automation
from esphome.automation import Condition
@@ -458,6 +459,10 @@ async def to_code(config: ConfigType) -> None:
# Enable optimized memzero/memcmp in libsodium instead of volatile byte loops
cg.add_build_flag("-DHAVE_WEAK_SYMBOLS=1")
cg.add_build_flag("-DHAVE_INLINE_ASM=1")
# Compile crypto libraries with -O2 for speed instead of -Os.
# Crypto is CPU-bound and benefits significantly from speed optimization.
# GCC uses the last -O flag, so appending -O2 overrides the global -Os.
_write_crypto_optimize_script()
else:
cg.add_define("USE_API_PLAINTEXT")
@@ -465,6 +470,17 @@ async def to_code(config: ConfigType) -> None:
cg.add_global(api_ns.using)
_CRYPTO_OPTIMIZE_SCRIPT = "crypto_optimize.py"
def _write_crypto_optimize_script() -> None:
    """Stage the crypto optimisation script in the build dir and register it.

    The template lives next to this module under the name
    ``<_CRYPTO_OPTIMIZE_SCRIPT>.script``. It is copied to the project build
    path as ``_CRYPTO_OPTIMIZE_SCRIPT`` (presumably only rewritten when its
    content differs, going by the helper's name) and then added to
    PlatformIO's ``extra_scripts`` option as a ``post:`` script.
    """
    # Imported lazily, as in the original, to keep the helper local to this call.
    from esphome.helpers import copy_file_if_changed

    component_dir = pathlib.Path(__file__).parent
    template_path = component_dir / f"{_CRYPTO_OPTIMIZE_SCRIPT}.script"
    build_copy = CORE.relative_build_path(_CRYPTO_OPTIMIZE_SCRIPT)
    copy_file_if_changed(template_path, build_copy)

    post_script_entry = f"post:{_CRYPTO_OPTIMIZE_SCRIPT}"
    cg.add_platformio_option("extra_scripts", [post_script_entry])
KEY_VALUE_SCHEMA = cv.Schema({cv.string: cv.templatable(cv.string_strict)})

View File

@@ -671,6 +671,7 @@ message SensorStateResponse {
option (source) = SOURCE_SERVER;
option (ifdef) = "USE_SENSOR";
option (no_delay) = true;
option (speed_optimized) = true;
fixed32 key = 1 [(force) = true];
float state = 2;
@@ -1638,6 +1639,7 @@ message BluetoothLERawAdvertisementsResponse {
option (source) = SOURCE_SERVER;
option (ifdef) = "USE_BLUETOOTH_PROXY";
option (no_delay) = true;
option (speed_optimized) = true;
repeated BluetoothLERawAdvertisement advertisements = 1 [(fixed_array_with_length_define) = "BLUETOOTH_PROXY_ADVERTISEMENT_BATCH_SIZE"];
}

View File

@@ -23,6 +23,7 @@ extend google.protobuf.MessageOptions {
optional bool no_delay = 1040 [default=false];
optional string base_class = 1041;
optional bool inline_encode = 1042 [default=false];
optional bool speed_optimized = 1043 [default=false];
}
extend google.protobuf.FieldOptions {

View File

@@ -745,7 +745,8 @@ uint32_t ListEntitiesSensorResponse::calculate_size() const {
#endif
return size;
}
uint8_t *SensorStateResponse::encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const {
__attribute__((optimize("O2"))) uint8_t *SensorStateResponse::encode(
ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const {
uint8_t *__restrict__ pos = buffer.get_pos();
ProtoEncode::write_tag_and_fixed32(pos PROTO_ENCODE_DEBUG_ARG, 13, this->key);
ProtoEncode::encode_float(pos PROTO_ENCODE_DEBUG_ARG, 2, this->state);
@@ -755,7 +756,7 @@ uint8_t *SensorStateResponse::encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG
#endif
return pos;
}
uint32_t SensorStateResponse::calculate_size() const {
__attribute__((optimize("O2"))) uint32_t SensorStateResponse::calculate_size() const {
uint32_t size = 0;
size += 5;
size += ProtoSize::calc_float(1, this->state);
@@ -2328,7 +2329,8 @@ bool SubscribeBluetoothLEAdvertisementsRequest::decode_varint(uint32_t field_id,
}
return true;
}
uint8_t *BluetoothLERawAdvertisementsResponse::encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const {
__attribute__((optimize("O2"))) uint8_t *BluetoothLERawAdvertisementsResponse::encode(
ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const {
uint8_t *__restrict__ pos = buffer.get_pos();
for (uint16_t i = 0; i < this->advertisements_len; i++) {
auto &sub_msg = this->advertisements[i];
@@ -2350,7 +2352,7 @@ uint8_t *BluetoothLERawAdvertisementsResponse::encode(ProtoWriteBuffer &buffer P
}
return pos;
}
uint32_t BluetoothLERawAdvertisementsResponse::calculate_size() const {
__attribute__((optimize("O2"))) uint32_t BluetoothLERawAdvertisementsResponse::calculate_size() const {
uint32_t size = 0;
for (uint16_t i = 0; i < this->advertisements_len; i++) {
auto &sub_msg = this->advertisements[i];

View File

@@ -0,0 +1,9 @@
# Build the crypto libraries (noise-c, libsodium) with -O2 rather than the
# project-wide -Os: crypto is CPU-bound and gains a lot from speed
# optimisation. GCC honours the last -O flag on the command line, so
# appending -O2 to a library's own CCFLAGS overrides -Os for that library
# only and leaves every other build target untouched.
Import("env")

for builder in env.GetLibBuilders():
    # Skip every library except the two crypto ones.
    if builder.name not in ("noise-c", "libsodium"):
        continue
    builder.env.Append(CCFLAGS=["-O2"])

View File

@@ -2679,6 +2679,13 @@ def build_message_type(
and get_opt(desc, inline_opt, False)
)
# Check if this message wants speed-optimized encode/calculate_size.
# When set, __attribute__((optimize("O2"))) is added to the definitions
# so GCC inlines the small ProtoEncode helpers even under -Os.
speed_opt = getattr(pb, "speed_optimized", None)
is_speed_optimized = speed_opt is not None and get_opt(desc, speed_opt, False)
speed_attr = '__attribute__((optimize("O2"))) ' if is_speed_optimized else ""
# Only generate encode method if this message needs encoding and has fields
if needs_encode and encode and not is_inline_only:
# Add PROTO_ENCODE_DEBUG_ARG after pos in all proto_* calls
@@ -2688,7 +2695,7 @@ def build_message_type(
)
for line in encode
]
o = f"uint8_t *{desc.name}::encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const {{\n"
o = f"{speed_attr}uint8_t *{desc.name}::encode(ProtoWriteBuffer &buffer PROTO_ENCODE_DEBUG_PARAM) const {{\n"
o += " uint8_t *__restrict__ pos = buffer.get_pos();\n"
o += indent("\n".join(encode_debug)) + "\n"
o += " return pos;\n"
@@ -2702,7 +2709,7 @@ def build_message_type(
# Add calculate_size method only if this message needs encoding and has fields
if needs_encode and size_calc and not is_inline_only:
o = f"uint32_t {desc.name}::calculate_size() const {{\n"
o = f"{speed_attr}uint32_t {desc.name}::calculate_size() const {{\n"
o += " uint32_t size = 0;\n"
o += indent("\n".join(size_calc)) + "\n"
o += " return size;\n"

View File

@@ -26,12 +26,11 @@ CORE_BENCHMARKS_DIR: Path = Path(root_path) / "tests" / "benchmarks" / "core"
STUBS_DIR: Path = Path(root_path) / "tests" / "benchmarks" / "stubs"
PLATFORMIO_OPTIONS = {
"build_unflags": [
"-Os", # remove default size-opt
],
"build_flags": [
"-O2", # optimize for speed (CodSpeed recommends RelWithDebInfo)
"-Os", # match firmware optimization level (detects inlining regressions)
"-g", # debug symbols for profiling
"-ffunction-sections", # required for dead-code stripping with -Os
"-fdata-sections", # required for dead-code stripping with -Os
"-DUSE_BENCHMARK", # disable WarnIfComponentBlockingGuard in finish()
f"-I{STUBS_DIR}", # stub headers for ESP32-only components
],