diff --git a/esphome/core/application.h b/esphome/core/application.h
index 04e0f1138e..4a18714d0d 100644
--- a/esphome/core/application.h
+++ b/esphome/core/application.h
@@ -9,6 +9,10 @@
 #include <vector>
 #include "esphome/core/component.h"
 #include "esphome/core/defines.h"
+
+#if defined(USE_LWIP_FAST_SELECT) && defined(ESPHOME_THREAD_MULTI_ATOMICS)
+#include <atomic>  // for std::atomic_thread_fence in Application::loop()
+#endif
 #include "esphome/core/hal.h"
 #include "esphome/core/helpers.h"
 #include "esphome/core/preferences.h"
@@ -580,6 +584,15 @@ inline ESPHOME_ALWAYS_INLINE Application::ComponentPhaseGuard::ComponentPhaseGua
 }
 
 inline void ESPHOME_ALWAYS_INLINE Application::loop() {
+#if defined(USE_LWIP_FAST_SELECT) && defined(ESPHOME_THREAD_MULTI_ATOMICS)
+  // Pairs with the TCP/IP thread's SYS_ARCH_UNPROTECT release on rcvevent so
+  // subsequent Socket::ready() checks in this iteration observe the published
+  // state without a per-call memw. Wake-ups travel separately via
+  // xTaskNotifyGive/ulTaskNotifyTake, so none are lost. Skipped on
+  // MULTI_NO_ATOMICS (e.g. BK72xx) — that path keeps the `volatile` load in
+  // esphome_lwip_socket_has_data() instead.
+  std::atomic_thread_fence(std::memory_order_acquire);
+#endif
 #ifdef USE_RUNTIME_STATS
   // Capture the start of the active (non-sleeping) portion of this iteration.
   // Used to derive main-loop overhead = active time − Σ(component time) −
diff --git a/esphome/core/lwip_fast_select.h b/esphome/core/lwip_fast_select.h
index 3b5e449148..4ba2606d76 100644
--- a/esphome/core/lwip_fast_select.h
+++ b/esphome/core/lwip_fast_select.h
@@ -26,25 +26,23 @@ extern "C" {
 struct lwip_sock *esphome_lwip_get_sock(int fd);
 
 /// Check if a cached LwIP socket has data ready via unlocked hint read of rcvevent.
-/// This avoids lwIP core lock contention between the main loop (CPU0) and
-/// streaming/networking work (CPU1). Correctness is preserved because callers
-/// already handle EWOULDBLOCK on nonblocking sockets — a stale hint simply causes
-/// a harmless retry on the next loop iteration. In practice, stale reads have not
-/// been observed across multi-day testing, but the design does not depend on that.
-///
-/// The sock pointer must have been obtained from esphome_lwip_get_sock() and must
-/// remain valid (caller owns socket lifetime — no concurrent close).
-/// Hot path: inlined volatile 16-bit load — no function call overhead.
-/// Uses offset-based access because lwip/priv/sockets_priv.h conflicts with C++.
+/// The sock pointer must come from esphome_lwip_get_sock() and stay valid for the call (the caller owns the socket
+/// lifetime; no concurrent close). rcvevent is read by offset because lwip/priv/sockets_priv.h conflicts with C++.
+/// On ESPHOME_THREAD_MULTI_ATOMICS builds, call this only from the main loop task, after the per-iteration
+/// std::atomic_thread_fence(std::memory_order_acquire) at the top of Application::loop(). That fence is intended to
+/// pair with the TCP/IP thread's SYS_ARCH_UNPROTECT release, so a plain load is used here, avoiding the per-call
+/// `memw` that a volatile access emits on Xtensa under the default -mserialize-volatile.
+/// Without atomics (e.g. BK72xx) no fence is issued; the volatile load instead forces a fresh read on every call
+/// (volatile keeps the compiler from caching or reordering the access; it is not an inter-thread barrier).
+/// A stale read is harmless either way: callers already handle EWOULDBLOCK on nonblocking sockets, and the hooked
+/// event_callback calls xTaskNotifyGive on RCVPLUS, so ulTaskNotifyTake still wakes the loop to re-read the value.
 /// The offset and size are verified at compile time in lwip_fast_select.c.
 static inline bool esphome_lwip_socket_has_data(struct lwip_sock *sock) {
-  // Unlocked hint read — no lwIP core lock needed.
-  // volatile prevents the compiler from caching/reordering this cross-thread read.
-  // The write side (TCP/IP thread) commits via SYS_ARCH_UNPROTECT which releases a
-  // FreeRTOS mutex (ESP32) or resumes the scheduler (LibreTiny), ensuring the value
-  // is visible. Aligned 16-bit reads are single-instruction loads (L16SI/LH/LDRH) on
-  // Xtensa/RISC-V/ARM and cannot produce torn values.
+#ifdef ESPHOME_THREAD_MULTI_ATOMICS
+  return *(int16_t *) ((char *) sock + (int) ESPHOME_LWIP_SOCK_RCVEVENT_OFFSET) > 0;
+#else
   return *(volatile int16_t *) ((char *) sock + (int) ESPHOME_LWIP_SOCK_RCVEVENT_OFFSET) > 0;
+#endif
 }
 
 /// Hook a socket's netconn callback to notify the main loop task on receive events.