diff --git a/esphome/components/i2s_audio/__init__.py b/esphome/components/i2s_audio/__init__.py
index 951b8c0498..8e432695a1 100644
--- a/esphome/components/i2s_audio/__init__.py
+++ b/esphome/components/i2s_audio/__init__.py
@@ -170,7 +170,7 @@ def i2s_audio_component_schema(
                 min=1
             ),
             cv.Optional(CONF_BITS_PER_SAMPLE, default=default_bits_per_sample): cv.All(
-                _validate_bits, cv.one_of(*I2S_BITS_PER_SAMPLE)
+                _validate_bits, cv.int_, cv.one_of(*I2S_BITS_PER_SAMPLE)
             ),
             cv.Optional(CONF_I2S_MODE, default=CONF_PRIMARY): cv.one_of(
                 *I2S_MODE_OPTIONS, lower=True
diff --git a/esphome/components/i2s_audio/speaker/__init__.py b/esphome/components/i2s_audio/speaker/__init__.py
index 8215d8b518..5ba2f4b1a5 100644
--- a/esphome/components/i2s_audio/speaker/__init__.py
+++ b/esphome/components/i2s_audio/speaker/__init__.py
@@ -98,11 +98,19 @@ def _set_stream_limits(config):
             min_sample_rate=config.get(CONF_SAMPLE_RATE),
             max_sample_rate=config.get(CONF_SAMPLE_RATE),
         )(config)
-    elif config[CONF_I2S_MODE] == CONF_PRIMARY:
-        # Primary mode has modifiable stream settings
+        return config
+
+    # The original ESP32 cannot lay out sub-16-bit slots that match ESPHome's packed audio, so the smallest
+    # stream it accepts is 16-bit (see start_i2s_driver); the other variants handle 8-bit.
+    min_bits_per_sample = 16 if esp32.get_esp32_variant() == esp32.VARIANT_ESP32 else 8
+
+    if config[CONF_I2S_MODE] == CONF_PRIMARY:
+        # Primary mode can reconfigure the bus to the incoming sample rate and channel count, but the
+        # configured bits per sample is a hard ceiling: the speaker rejects any stream that exceeds the
+        # slot bit width it was set up with (see start_i2s_driver), so advertise that as the maximum.
         audio.set_stream_limits(
-            min_bits_per_sample=8,
-            max_bits_per_sample=32,
+            min_bits_per_sample=min_bits_per_sample,
+            max_bits_per_sample=config[CONF_BITS_PER_SAMPLE],
             min_channels=1,
             max_channels=2,
             min_sample_rate=16000,
@@ -111,13 +119,13 @@ def _set_stream_limits(config):
     else:
         # Secondary mode has unmodifiable max bits per sample and min/max sample rates
         audio.set_stream_limits(
-            min_bits_per_sample=8,
-            max_bits_per_sample=config.get(CONF_BITS_PER_SAMPLE),
+            min_bits_per_sample=min_bits_per_sample,
+            max_bits_per_sample=config[CONF_BITS_PER_SAMPLE],
             min_channels=1,
             max_channels=2,
             min_sample_rate=config.get(CONF_SAMPLE_RATE),
             max_sample_rate=config.get(CONF_SAMPLE_RATE),
-        )
+        )(config)
 
     return config
 
@@ -134,12 +142,11 @@ def _validate_esp32_variant(config):
     if config[CONF_DAC_TYPE] == "internal":
         if variant not in INTERNAL_DAC_VARIANTS:
             raise cv.Invalid(f"{variant} does not have an internal DAC")
-    elif (
-        variant == esp32.VARIANT_ESP32
-        and config.get(CONF_BITS_PER_SAMPLE) == 8
-        and config.get(CONF_CHANNEL) in (CONF_MONO, CONF_LEFT, CONF_RIGHT)
-    ):
-        raise cv.Invalid("8-bit mono mode is not supported on ESP32")
+    elif variant == esp32.VARIANT_ESP32 and config[CONF_BITS_PER_SAMPLE] == 8:
+        # The original ESP32 I2S peripheral packs each sample into a whole number of 16-bit words, so an
+        # 8-bit slot does not line up with ESPHome's tightly packed audio (see start_i2s_driver). Reject it
+        # at config time rather than emitting corrupted output at runtime.
+        raise cv.Invalid("8-bit audio is not supported on the original ESP32")
     return config
 
 
diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp b/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp
index ffe901504d..0afb67fb36 100644
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker_standard.cpp
@@ -3,6 +3,7 @@
 #ifdef USE_ESP32
 
 #include <driver/i2s_std.h>
+#include <hal/dma_types.h>
 
 #include "esphome/components/audio/audio.h"
 #include "esphome/components/audio/audio_transfer_buffer.h"
@@ -16,8 +17,16 @@ namespace esphome::i2s_audio {
 
 static const char *const TAG = "i2s_audio.speaker.std";
 
-static constexpr uint32_t DMA_BUFFER_DURATION_MS = 15;
-static constexpr size_t DMA_BUFFERS_COUNT = 4;
+static constexpr uint32_t DMA_BUFFER_DURATION_MS = 10;
+static constexpr size_t DMA_BUFFERS_COUNT = 5;
+// ESP-IDF clamps each DMA descriptor to this many bytes when allocating the channel (see i2s_get_buf_size in
+// the I2S driver). Mirror its target-dependent selection so the requested dma_frame_num stays in range; the
+// speaker task reads the size actually allocated back from the driver rather than relying on this value.
+#if SOC_CACHE_INTERNAL_MEM_VIA_L1CACHE
+static constexpr size_t I2S_DMA_BUFFER_MAX_SIZE = DMA_DESCRIPTOR_BUFFER_MAX_SIZE_64B_ALIGNED;
+#else
+static constexpr size_t I2S_DMA_BUFFER_MAX_SIZE = DMA_DESCRIPTOR_BUFFER_MAX_SIZE_4B_ALIGNED;
+#endif
 // Sized to comfortably absorb scheduling jitter: at most DMA_BUFFERS_COUNT events can be in flight,
 // doubled so that a transient backlog never overruns the queue (which would desync the lockstep
 // invariant between i2s_event_queue_ and write_records_queue_).
@@ -27,6 +36,17 @@ static constexpr size_t I2S_EVENT_QUEUE_COUNT = DMA_BUFFERS_COUNT * 2;
 // without masking real failures.
 static constexpr TickType_t WRITE_TIMEOUT_TICKS = pdMS_TO_TICKS(DMA_BUFFER_DURATION_MS * (DMA_BUFFERS_COUNT + 1));
 
+// Requested frames per DMA buffer for the given stream, clamped so the byte size stays within the ESP-IDF
+// maximum DMA descriptor size (avoids its "dma frame num is out of dma buffer size" warning). ESP-IDF may
+// still adjust the request (e.g. cache-line rounding), so the speaker task reads the allocated size back from
+// the driver. A zero-byte frame (malformed stream info) skips the clamp instead of dividing by zero.
+static uint32_t dma_buffer_frames(const audio::AudioStreamInfo &stream_info) {
+  const uint32_t frames_from_duration = stream_info.ms_to_frames(DMA_BUFFER_DURATION_MS);
+  const size_t frame_bytes = stream_info.frames_to_bytes(1);
+  const uint32_t max_frames = frame_bytes != 0 ? I2S_DMA_BUFFER_MAX_SIZE / frame_bytes : frames_from_duration;
+  return std::min(frames_from_duration, max_frames);
+}
+
 void I2SAudioSpeaker::dump_config() {
   I2SAudioSpeakerBase::dump_config();
   const char *fmt_str;
@@ -57,8 +77,21 @@ void I2SAudioSpeaker::run_speaker_task() {
   // avoids unnecessary single-frame splices.
   const size_t ring_buffer_size =
       (this->current_stream_info_.ms_to_bytes(ring_buffer_duration) / bytes_per_frame) * bytes_per_frame;
-  const uint32_t frames_per_dma_buffer = this->current_stream_info_.ms_to_frames(DMA_BUFFER_DURATION_MS);
-  const size_t dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(frames_per_dma_buffer);
+  // ESP-IDF may allocate smaller (or cache-line-rounded) DMA buffers than dma_buffer_frames() requested: it
+  // clamps each descriptor to the max DMA descriptor size and, on targets that route internal memory through
+  // the L1 cache (e.g. ESP32-P4), rounds the buffer to the cache line. Read the size the driver actually
+  // allocated so preload, silence padding, and the write/event lockstep all match it exactly. The channel is
+  // in the READY state here because start_i2s_driver() initialized it before this task was created.
+  size_t dma_buffer_bytes;
+  i2s_chan_info_t chan_info;
+  if (i2s_channel_get_info(this->tx_handle_, &chan_info) == ESP_OK && chan_info.total_dma_buf_size > 0) {
+    // total_dma_buf_size spans all DMA_BUFFERS_COUNT descriptors and is an exact multiple of the count.
+    dma_buffer_bytes = chan_info.total_dma_buf_size / DMA_BUFFERS_COUNT;
+  } else {
+    // Should not happen for a READY channel; fall back to the requested size.
+    dma_buffer_bytes = this->current_stream_info_.frames_to_bytes(dma_buffer_frames(this->current_stream_info_));
+  }
+  const uint32_t frames_per_dma_buffer = this->current_stream_info_.bytes_to_frames(dma_buffer_bytes);
 
   bool successful_setup = false;
 
@@ -308,12 +341,24 @@ esp_err_t I2SAudioSpeaker::start_i2s_driver(audio::AudioStreamInfo &audio_stream
     return ESP_ERR_NOT_SUPPORTED;
   }
 
+#ifdef USE_ESP32_VARIANT_ESP32
+  // The original ESP32 I2S peripheral stores each sample in a whole number of 16-bit words (a 24-bit sample
+  // occupies 4 bytes in the DMA buffer, an 8-bit sample 2 bytes), but ESPHome's audio pipeline packs samples
+  // tightly (3 bytes for 24-bit, 1 for 8-bit). The two layouts only line up when the bit depth is a multiple
+  // of 16, so reject anything else rather than emit corrupted audio.
+  if (audio_stream_info.get_bits_per_sample() % 16 != 0) {
+    ESP_LOGE(TAG, "ESP32 supports only 16- or 32-bit audio, got %u-bit",
+             (unsigned) audio_stream_info.get_bits_per_sample());
+    return ESP_ERR_NOT_SUPPORTED;
+  }
+#endif  // USE_ESP32_VARIANT_ESP32
+
   if (!this->parent_->try_lock()) {
     ESP_LOGE(TAG, "Parent bus is busy");
     return ESP_ERR_INVALID_STATE;
   }
 
-  uint32_t dma_buffer_length = audio_stream_info.ms_to_frames(DMA_BUFFER_DURATION_MS);
+  uint32_t dma_buffer_length = dma_buffer_frames(audio_stream_info);
 
   i2s_role_t i2s_role = this->i2s_role_;
   i2s_clock_src_t clk_src = I2S_CLK_SRC_DEFAULT;