Răsfoiți Sursa

Finish implementation of RP2040 SDIO support

(cherry picked from commit c756fa807939daa522f11bbefdf577d09fb1ea70)
Petteri Aimonen 3 ani în urmă
părinte
comite
8b86a20ded

+ 7 - 5
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform.cpp

@@ -67,11 +67,8 @@ void azplatform_init()
         azlog("ERROR: SCSI initiator mode is not implemented yet, turn DIP switch off for proper operation!");
     }
 
-    if (dbglog)
-    {
-        g_azlog_debug = true;
-    }
-
+    g_azlog_debug = dbglog;
+    
     if (termination)
     {
         azlog("SCSI termination is enabled");
@@ -137,6 +134,11 @@ void azplatform_init()
 
     // LED pin
     gpio_conf(LED_PIN,        GPIO_FUNC_SIO, false,false, true,  false, false);
+
+    // I2C pins
+    //        pin             function       pup   pdown  out    state fast
+    gpio_conf(GPIO_I2C_SCL,   GPIO_FUNC_I2C, true,false, false,  true, true);
+    gpio_conf(GPIO_I2C_SDA,   GPIO_FUNC_I2C, true,false, false,  true, true);
 }
 
 void azplatform_late_init()

+ 4 - 0
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform_gpio.h

@@ -58,6 +58,10 @@
 #define SD_SPI_MISO  20
 #define SD_SPI_CS    23
 
+// IO expander I2C
+#define GPIO_I2C_SDA 14
+#define GPIO_I2C_SCL 15
+
 // DIP switch pins
 #define DIP_INITIATOR 10
 #define DIP_DBGLOG 16

+ 280 - 174
lib/ZuluSCSI_platform_RP2040/rp2040_sdio.cpp

@@ -20,11 +20,12 @@
 #define SDIO_CMD_SM 0
 #define SDIO_DATA_SM 1
 #define SDIO_DMA_CH 1
+#define SDIO_DMA_CHB 2
 
 // Maximum number of 512 byte blocks to transfer in one request
 #define SDIO_MAX_BLOCKS 256
 
-enum sdio_transfer_state_t { SDIO_IDLE, SDIO_RX, SDIO_TX };
+enum sdio_transfer_state_t { SDIO_IDLE, SDIO_RX, SDIO_TX, SDIO_TX_WAIT_IDLE};
 
 static struct {
     uint32_t pio_cmd_clk_offset;
@@ -34,16 +35,33 @@ static struct {
     pio_sm_config pio_cfg_data_tx;
 
     sdio_transfer_state_t transfer_state;
-    bool inside_irq_handler; // True if we are inside crash handler code
     uint32_t transfer_start_time;
     uint32_t *data_buf;
     uint32_t blocks_done; // Number of blocks transferred so far
     uint32_t total_blocks; // Total number of blocks to transfer
     uint32_t blocks_checksumed; // Number of blocks that have had CRC calculated
     uint32_t checksum_errors; // Number of checksum errors detected
-    uint64_t block_checksums[SDIO_MAX_BLOCKS];
+
+    // Variables for block writes
+    uint64_t next_wr_block_checksum;
+    uint32_t end_token_buf[3]; // CRC and end token for write block
+    sdio_status_t wr_status;
+    uint32_t card_response;
+    
+    // Variables for block reads
+    // This is used to perform DMA into data buffers and checksum buffers separately.
+    struct {
+        void * write_addr;
+        uint32_t transfer_count;
+    } dma_blocks[SDIO_MAX_BLOCKS * 2];
+    struct {
+        uint32_t top;
+        uint32_t bottom;
+    } received_checksums[SDIO_MAX_BLOCKS];
 } g_sdio;
 
+void rp2040_sdio_dma_irq();
+
 /*******************************************************
  * Checksum algorithms
  *******************************************************/
@@ -323,71 +341,6 @@ sdio_status_t rp2040_sdio_command_R3(uint8_t command, uint32_t arg, uint32_t *re
  * Data reception from SD card
  *******************************************************/
 
-static void sdio_start_next_block_rx()
-{
-    assert (g_sdio.blocks_done < g_sdio.total_blocks);
-
-    // Disable and reset PIO from previous block
-    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, false);
-    pio_sm_restart(SDIO_PIO, SDIO_DATA_SM);
-    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_jmp(g_sdio.pio_data_rx_offset));
-
-    // Start new DMA transfer
-    dma_channel_transfer_to_buffer_now(SDIO_DMA_CH, g_sdio.data_buf + 128 * g_sdio.blocks_done, 128);
-
-    // Enable PIO
-    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, true);
-}
-
-// Check checksums for received blocks
-static void sdio_verify_rx_checksums(uint32_t maxcount)
-{
-    while (g_sdio.blocks_checksumed < g_sdio.blocks_done && maxcount-- > 0)
-    {
-        int blockidx = g_sdio.blocks_checksumed++;
-        uint64_t checksum = sdio_crc16_4bit_checksum(g_sdio.data_buf + blockidx * 128, 128);
-        uint64_t expected = g_sdio.block_checksums[blockidx];
-
-        if (checksum != expected)
-        {
-            g_sdio.checksum_errors++;
-            if (g_sdio.checksum_errors == 1)
-            {
-                azlog("SDIO checksum error in reception: calculated ", checksum, " expected ", expected);
-            }
-        }
-    }
-}
-
-static void rp2040_sdio_rx_irq()
-{
-    dma_hw->ints1 = 1 << SDIO_DMA_CH;
-
-    // Wait for CRC to be received
-    int maxwait = 1000;
-    while (pio_sm_get_rx_fifo_level(SDIO_PIO, SDIO_DATA_SM) < 2)
-    {
-        if (maxwait-- < 0)
-        {
-            azlog("rp2040_sdio_rx_irq(): timeout waiting for CRC reception");
-            break;
-        }
-    }
-    uint32_t crc0 = pio_sm_get(SDIO_PIO, SDIO_DATA_SM);
-    uint32_t crc1 = pio_sm_get(SDIO_PIO, SDIO_DATA_SM);
-    g_sdio.block_checksums[g_sdio.blocks_done] = ((uint64_t)crc0 << 32) | crc1;
-    g_sdio.blocks_done++;
-
-    if (g_sdio.blocks_done < g_sdio.total_blocks)
-    {
-        sdio_start_next_block_rx();
-    }
-    else
-    {
-        g_sdio.transfer_state = SDIO_IDLE;
-    }
-}
-
 sdio_status_t rp2040_sdio_rx_start(uint8_t *buffer, uint32_t num_blocks)
 {
     // Buffer must be aligned
@@ -401,39 +354,103 @@ sdio_status_t rp2040_sdio_rx_start(uint8_t *buffer, uint32_t num_blocks)
     g_sdio.blocks_checksumed = 0;
     g_sdio.checksum_errors = 0;
 
-    // Check if we are inside interrupt handler.
-    // This happens when saving crash log from hardfault.
-    // If true, must use polling mode instead of interrupts.
-    g_sdio.inside_irq_handler = (SCB->ICSR & SCB_ICSR_VECTACTIVE_Msk);
+    // Create DMA block descriptors to store each block of 512 bytes of data to buffer
+    // and then 8 bytes to g_sdio.received_checksums.
+    for (int i = 0; i < num_blocks; i++)
+    {
+        g_sdio.dma_blocks[i * 2].write_addr = buffer + i * SDIO_BLOCK_SIZE;
+        g_sdio.dma_blocks[i * 2].transfer_count = SDIO_BLOCK_SIZE / sizeof(uint32_t);
 
-    pio_sm_init(SDIO_PIO, SDIO_DATA_SM, g_sdio.pio_data_rx_offset, &g_sdio.pio_cfg_data_rx);
-    pio_sm_set_consecutive_pindirs(SDIO_PIO, SDIO_DATA_SM, SDIO_D0, 4, false);
+        g_sdio.dma_blocks[i * 2 + 1].write_addr = &g_sdio.received_checksums[i];
+        g_sdio.dma_blocks[i * 2 + 1].transfer_count = 2;
+    }
+    g_sdio.dma_blocks[num_blocks * 2].write_addr = 0;
+    g_sdio.dma_blocks[num_blocks * 2].transfer_count = 0;
 
-    // Configure DMA to receive the data block payload (512 bytes).
+    // Configure first DMA channel for reading from the PIO RX fifo
     dma_channel_config dmacfg = dma_channel_get_default_config(SDIO_DMA_CH);
     channel_config_set_transfer_data_size(&dmacfg, DMA_SIZE_32);
     channel_config_set_read_increment(&dmacfg, false);
     channel_config_set_write_increment(&dmacfg, true);
     channel_config_set_dreq(&dmacfg, pio_get_dreq(SDIO_PIO, SDIO_DATA_SM, false));
     channel_config_set_bswap(&dmacfg, true);
+    channel_config_set_chain_to(&dmacfg, SDIO_DMA_CHB);
     dma_channel_configure(SDIO_DMA_CH, &dmacfg, 0, &SDIO_PIO->rxf[SDIO_DATA_SM], 0, false);
 
-    sdio_start_next_block_rx();
+    // Configure second DMA channel for reconfiguring the first one
+    dmacfg = dma_channel_get_default_config(SDIO_DMA_CHB);
+    channel_config_set_transfer_data_size(&dmacfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&dmacfg, true);
+    channel_config_set_write_increment(&dmacfg, true);
+    channel_config_set_ring(&dmacfg, true, 3);
+    dma_channel_configure(SDIO_DMA_CHB, &dmacfg, &dma_hw->ch[SDIO_DMA_CH].al1_write_addr,
+        g_sdio.dma_blocks, 2, false);
+
+    // Initialize PIO state machine
+    pio_sm_init(SDIO_PIO, SDIO_DATA_SM, g_sdio.pio_data_rx_offset, &g_sdio.pio_cfg_data_rx);
+    pio_sm_set_consecutive_pindirs(SDIO_PIO, SDIO_DATA_SM, SDIO_D0, 4, false);
+
+    // Write number of nibbles to receive minus 1 (1024 data + 16 CRC nibbles) to Y register
+    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, SDIO_BLOCK_SIZE * 2 + 16 - 1);
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_out(pio_y, 32));
+
+    // Enable RX FIFO join because we don't need the TX FIFO during transfer.
+    // This gives more leeway for the DMA block switching
+    SDIO_PIO->sm[SDIO_DATA_SM].shiftctrl |= PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS;
+
+    // Start PIO and DMA
+    dma_channel_start(SDIO_DMA_CHB);
+    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, true);
 
     return SDIO_OK;
 }
 
+// Verify CRCs of received blocks not yet checked: handles at most maxcount blocks, counts mismatches in g_sdio.checksum_errors.
+static void sdio_verify_rx_checksums(uint32_t maxcount)
+{
+    while (g_sdio.blocks_checksumed < g_sdio.blocks_done && maxcount-- > 0) // Stop at the last completed block or when the maxcount budget is used up
+    {
+        // Calculate checksum from the received data of the next unverified block
+        int blockidx = g_sdio.blocks_checksumed++;
+        uint64_t checksum = sdio_crc16_4bit_checksum(g_sdio.data_buf + blockidx * SDIO_WORDS_PER_BLOCK,
+                                                     SDIO_WORDS_PER_BLOCK);
+
+        // Byte-swap each received 32-bit CRC word (conversion to little-endian format) and join them into one 64-bit value
+        uint32_t top = __builtin_bswap32(g_sdio.received_checksums[blockidx].top);
+        uint32_t bottom = __builtin_bswap32(g_sdio.received_checksums[blockidx].bottom);
+        uint64_t expected = ((uint64_t)top << 32) | bottom;
+
+        if (checksum != expected)
+        {
+            g_sdio.checksum_errors++;
+            if (g_sdio.checksum_errors == 1) // Log only the first mismatch since the counter was last reset
+            {
+                azlog("SDIO checksum error in reception: block ", blockidx,
+                      " calculated ", checksum, " expected ", expected);
+            }
+        }
+    }
+}
+
 sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
 {
-    if (g_sdio.inside_irq_handler && (dma_hw->ints0 & (1 << SDIO_DMA_CH)))
+    // Check how many DMA control blocks have been consumed
+    uint32_t dma_ctrl_block_count = (dma_hw->ch[SDIO_DMA_CHB].read_addr - (uint32_t)&g_sdio.dma_blocks);
+    dma_ctrl_block_count /= sizeof(g_sdio.dma_blocks[0]);
+
+    // Compute how many complete 512 byte SDIO blocks have been transferred
+    // When transfer ends, dma_ctrl_block_count == g_sdio.total_blocks * 2 + 1
+    g_sdio.blocks_done = (dma_ctrl_block_count - 1) / 2;
+
+    // Is it all done?
+    if (g_sdio.blocks_done >= g_sdio.total_blocks)
     {
-        // Make sure DMA interrupt handler gets called even from inside hardfault handler.
-        rp2040_sdio_rx_irq();
+        g_sdio.transfer_state = SDIO_IDLE;
     }
 
     if (bytes_complete)
     {
-        *bytes_complete = g_sdio.blocks_done * 512;
+        *bytes_complete = g_sdio.blocks_done * SDIO_BLOCK_SIZE;
     }
 
     if (g_sdio.transfer_state == SDIO_IDLE)
@@ -471,52 +488,60 @@ sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
 
 static void sdio_start_next_block_tx()
 {
-    assert (g_sdio.blocks_done < g_sdio.total_blocks && g_sdio.blocks_checksumed > g_sdio.blocks_done);
-
-    // Start new DMA transfer
-    dma_channel_transfer_from_buffer_now(SDIO_DMA_CH, g_sdio.data_buf + 128 * g_sdio.blocks_done, 128);
-}
+    // Initialize PIO
+    pio_sm_init(SDIO_PIO, SDIO_DATA_SM, g_sdio.pio_data_tx_offset, &g_sdio.pio_cfg_data_tx);
+    
+    // Configure DMA to send the data block payload (512 bytes)
+    dma_channel_config dmacfg = dma_channel_get_default_config(SDIO_DMA_CH);
+    channel_config_set_transfer_data_size(&dmacfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&dmacfg, true);
+    channel_config_set_write_increment(&dmacfg, false);
+    channel_config_set_dreq(&dmacfg, pio_get_dreq(SDIO_PIO, SDIO_DATA_SM, true));
+    channel_config_set_bswap(&dmacfg, true);
+    channel_config_set_chain_to(&dmacfg, SDIO_DMA_CHB);
+    dma_channel_configure(SDIO_DMA_CH, &dmacfg,
+        &SDIO_PIO->txf[SDIO_DATA_SM], g_sdio.data_buf + g_sdio.blocks_done * SDIO_WORDS_PER_BLOCK,
+        SDIO_WORDS_PER_BLOCK, false);
+
+    // Prepare second DMA channel to send the CRC and block end marker
+    uint64_t crc = g_sdio.next_wr_block_checksum;
+    g_sdio.end_token_buf[0] = (uint32_t)(crc >> 32);
+    g_sdio.end_token_buf[1] = (uint32_t)(crc >>  0);
+    g_sdio.end_token_buf[2] = 0xFFFFFFFF;
+    channel_config_set_bswap(&dmacfg, false);
+    dma_channel_configure(SDIO_DMA_CHB, &dmacfg,
+        &SDIO_PIO->txf[SDIO_DATA_SM], g_sdio.end_token_buf, 3, false);
+    
+    // Enable IRQ to trigger when block is done
+    dma_hw->ints1 = 1 << SDIO_DMA_CHB;
+    dma_set_irq1_channel_mask_enabled(1 << SDIO_DMA_CHB, 1);
+
+    // Initialize register X with nibble count minus 1 and register Y with response bit count minus 1
+    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, 1048);
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_out(pio_x, 32));
+    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, 31);
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_out(pio_y, 32));
+    
+    // Initialize pins to output and high
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_set(pio_pins, 15));
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_set(pio_pindirs, 15));
 
-static void sdio_compute_tx_checksums(uint32_t maxcount)
-{
-    while (g_sdio.blocks_checksumed < g_sdio.blocks_done && maxcount-- > 0)
-    {
-        int blockidx = g_sdio.blocks_checksumed++;
-        g_sdio.block_checksums[blockidx] = sdio_crc16_4bit_checksum(g_sdio.data_buf + blockidx * 128, 128);
-    }
+    // Write start token and start the DMA transfer.
+    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, 0xFFFFFFF0);
+    dma_channel_start(SDIO_DMA_CH);
+    
+    // Start state machine
+    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, true);
 }
 
-static void rp2040_sdio_tx_irq()
+static void sdio_compute_next_tx_checksum()
 {
-    // Wait for there to be enough space for checksum
-    int maxwait = 1000;
-    while (pio_sm_get_tx_fifo_level(SDIO_PIO, SDIO_DATA_SM) < 5)
-    {
-        if (maxwait-- < 0)
-        {
-            azlog("rp2040_sdio_tx_irq(): timeout waiting for space in TX buffer for CRC");
-            break;
-        }
-    }
-
-    // Send the checksum and block end marker
-    uint64_t crc = g_sdio.block_checksums[g_sdio.blocks_done];
-    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, (uint32_t)(crc >> 32));
-    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, (uint32_t)(crc >>  0));
-    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, 0xFFFFFFFF);
-
-    g_sdio.blocks_done++;
-    if (g_sdio.blocks_done < g_sdio.total_blocks)
-    {
-        sdio_start_next_block_tx();
-    }
-    else
-    {
-        g_sdio.transfer_state = SDIO_IDLE;
-    }
+    assert (g_sdio.blocks_done < g_sdio.total_blocks && g_sdio.blocks_checksumed < g_sdio.total_blocks);
+    int blockidx = g_sdio.blocks_checksumed++;
+    g_sdio.next_wr_block_checksum = sdio_crc16_4bit_checksum(g_sdio.data_buf + blockidx * SDIO_WORDS_PER_BLOCK,
+                                                             SDIO_WORDS_PER_BLOCK);
 }
 
-
 // Start transferring data from memory to SD card
 sdio_status_t rp2040_sdio_tx_start(const uint8_t *buffer, uint32_t num_blocks)
 {
@@ -531,61 +556,143 @@ sdio_status_t rp2040_sdio_tx_start(const uint8_t *buffer, uint32_t num_blocks)
     g_sdio.blocks_checksumed = 0;
     g_sdio.checksum_errors = 0;
 
-    // Check if we are inside interrupt handler.
-    // This happens when saving crash log from hardfault.
-    // If true, must use polling mode instead of interrupts.
-    g_sdio.inside_irq_handler = (SCB->ICSR & SCB_ICSR_VECTACTIVE_Msk);
-
     // Compute first block checksum
-    sdio_compute_tx_checksums(1);
-
-    // Initialize PIO
-    pio_sm_init(SDIO_PIO, SDIO_DATA_SM, g_sdio.pio_data_tx_offset, &g_sdio.pio_cfg_data_tx);
-    pio_sm_set_consecutive_pindirs(SDIO_PIO, SDIO_DATA_SM, SDIO_D0, 4, true);
-
-    // Configure DMA to send the data block payload (512 bytes)
-    dma_channel_config dmacfg = dma_channel_get_default_config(SDIO_DMA_CH);
-    channel_config_set_transfer_data_size(&dmacfg, DMA_SIZE_32);
-    channel_config_set_read_increment(&dmacfg, true);
-    channel_config_set_write_increment(&dmacfg, false);
-    channel_config_set_dreq(&dmacfg, pio_get_dreq(SDIO_PIO, SDIO_DATA_SM, false));
-    channel_config_set_bswap(&dmacfg, true);
-    dma_channel_configure(SDIO_DMA_CH, &dmacfg, 0, &SDIO_PIO->txf[SDIO_DATA_SM], 0, false);
+    sdio_compute_next_tx_checksum();
 
     // Start first DMA transfer and PIO
     sdio_start_next_block_tx();
-    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, true);
 
-    // Compute rest of the block checksums so that they are ready when needed
-    sdio_compute_tx_checksums(g_sdio.total_blocks);
+    if (g_sdio.blocks_checksumed < g_sdio.total_blocks)
+    {
+        // Precompute second block checksum
+        sdio_compute_next_tx_checksum();
+    }
 
     return SDIO_OK;
 }
 
+sdio_status_t check_sdio_write_response(uint32_t card_response) // Decode the card's write response word into an sdio_status_t
+{
+    // Shift out leading 1 bits so that the first 0 bit (the start bit) ends up in the top bit.
+    // The format of the response is poorly documented in the SDIO spec, but refer to e.g.
+    // http://my-cool-projects.blogspot.com/2013/02/the-mysterious-sd-card-crc-status.html
+    uint32_t resp = card_response;
+    if (!(~resp & 0xFFFF0000)) resp <<= 16; // Top 16 bits all ones
+    if (!(~resp & 0xFF000000)) resp <<= 8;  // Top 8 bits all ones
+    if (!(~resp & 0xF0000000)) resp <<= 4;  // Top 4 bits all ones
+    if (!(~resp & 0xC0000000)) resp <<= 2;  // Top 2 bits all ones
+    if (!(~resp & 0x80000000)) resp <<= 1;  // Top bit is one
+
+    uint32_t wr_status = (resp >> 28) & 7; // The three bits that follow the start bit
+
+    if (wr_status == 2) // 0b010: treated as a successful write
+    {
+        return SDIO_OK;
+    }
+    else if (wr_status == 5) // 0b101: reported as CRC error
+    {
+        azlog("SDIO card reports write CRC error, status ", card_response);
+        return SDIO_ERR_WRITE_CRC;    
+    }
+    else if (wr_status == 6) // 0b110: reported as write failure
+    {
+        azlog("SDIO card reports write failure, status ", card_response);
+        return SDIO_ERR_WRITE_FAIL;    
+    }
+    else // Any other value is logged and also mapped to SDIO_ERR_WRITE_FAIL
+    {
+        azlog("SDIO card reports unknown write status ", card_response);
+        return SDIO_ERR_WRITE_FAIL;    
+    }
+}
+
+// DMA IRQ handler for block writes: once a block has been sent it collects and checks the card response, then starts the next block or stops.
+static void rp2040_sdio_tx_irq()
+{
+    dma_hw->ints1 = 1 << SDIO_DMA_CHB; // Clear the channel B interrupt flag
+
+    if (g_sdio.transfer_state == SDIO_TX)
+    {
+        if (!dma_channel_is_busy(SDIO_DMA_CH) && !dma_channel_is_busy(SDIO_DMA_CHB)) // Neither the data channel nor channel B is still transferring
+        {
+            // Main data transfer is finished now.
+            // When card is ready, PIO will put card response on RX fifo
+            g_sdio.transfer_state = SDIO_TX_WAIT_IDLE;
+            if (!pio_sm_is_rx_fifo_empty(SDIO_PIO, SDIO_DATA_SM))
+            {
+                // Card is already idle, so the response word can be read directly
+                g_sdio.card_response = pio_sm_get(SDIO_PIO, SDIO_DATA_SM);
+            }
+            else
+            {
+                // Use DMA to wait for the response: one word from the PIO RX FIFO into g_sdio.card_response
+                dma_channel_config dmacfg = dma_channel_get_default_config(SDIO_DMA_CHB);
+                channel_config_set_transfer_data_size(&dmacfg, DMA_SIZE_32);
+                channel_config_set_read_increment(&dmacfg, false);
+                channel_config_set_write_increment(&dmacfg, false);
+                channel_config_set_dreq(&dmacfg, pio_get_dreq(SDIO_PIO, SDIO_DATA_SM, false));
+                dma_channel_configure(SDIO_DMA_CHB, &dmacfg,
+                    &g_sdio.card_response, &SDIO_PIO->rxf[SDIO_DATA_SM], 1, true);
+            }
+        }
+    }
+    
+    if (g_sdio.transfer_state == SDIO_TX_WAIT_IDLE) // Deliberately not an else-branch: also runs right after the state change above
+    {
+        if (!dma_channel_is_busy(SDIO_DMA_CHB)) // Skipped while the response DMA started above is still waiting for data
+        {
+            g_sdio.wr_status = check_sdio_write_response(g_sdio.card_response);
+
+            if (g_sdio.wr_status != SDIO_OK)
+            {
+                rp2040_sdio_stop(); // Abort the transfer; g_sdio.wr_status keeps the error code
+                return;
+            }
+
+            g_sdio.blocks_done++;
+            if (g_sdio.blocks_done < g_sdio.total_blocks)
+            {
+                sdio_start_next_block_tx();
+                g_sdio.transfer_state = SDIO_TX;
+
+                if (g_sdio.blocks_checksumed < g_sdio.total_blocks)
+                {
+                    // Precompute the CRC for next block so that it is ready when
+                    // we want to send it.
+                    sdio_compute_next_tx_checksum();
+                }
+            }
+            else
+            {
+                rp2040_sdio_stop(); // All blocks accepted by the card: stop DMA/PIO and return to SDIO_IDLE
+            }
+        }    
+    }
+}
+
 // Check if transmission is complete
 sdio_status_t rp2040_sdio_tx_poll(uint32_t *bytes_complete)
 {
-    if (g_sdio.inside_irq_handler && (dma_hw->ints0 & (1 << SDIO_DMA_CH)))
+    if (SCB->ICSR & SCB_ICSR_VECTACTIVE_Msk)
     {
-        // Make sure DMA interrupt handler gets called even from inside hardfault handler.
+        // Call the IRQ handler manually so that the transfer progresses even if we are in hardfault handler
         rp2040_sdio_tx_irq();
     }
 
     if (bytes_complete)
     {
-        *bytes_complete = g_sdio.blocks_done * 512;
+        *bytes_complete = g_sdio.blocks_done * SDIO_BLOCK_SIZE;
     }
 
     if (g_sdio.transfer_state == SDIO_IDLE)
     {
-        pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, false);
-        pio_sm_set_consecutive_pindirs(SDIO_PIO, SDIO_DATA_SM, SDIO_D0, 4, false);
-        return SDIO_OK;
+        rp2040_sdio_stop();
+        return g_sdio.wr_status;
     }
     else if ((uint32_t)(millis() - g_sdio.transfer_start_time) > 1000)
     {
         azdbg("rp2040_sdio_tx_poll() timeout, "
-            "PIO PC: ", (int)pio_sm_get_pc(SDIO_PIO, SDIO_DATA_SM) - (int)g_sdio.pio_data_rx_offset,
+            "PIO PC: ", (int)pio_sm_get_pc(SDIO_PIO, SDIO_DATA_SM) - (int)g_sdio.pio_data_tx_offset,
             " RXF: ", (int)pio_sm_get_rx_fifo_level(SDIO_PIO, SDIO_DATA_SM),
             " TXF: ", (int)pio_sm_get_tx_fifo_level(SDIO_PIO, SDIO_DATA_SM),
             " DMA CNT: ", dma_hw->ch[SDIO_DMA_CH].al2_transfer_count);
@@ -600,23 +707,15 @@ sdio_status_t rp2040_sdio_tx_poll(uint32_t *bytes_complete)
 sdio_status_t rp2040_sdio_stop() // Abort any ongoing data transfer and return the driver to SDIO_IDLE; always returns SDIO_OK
 {
     dma_channel_abort(SDIO_DMA_CH);
+    dma_channel_abort(SDIO_DMA_CHB);
+    dma_set_irq1_channel_mask_enabled(1 << SDIO_DMA_CHB, 0); // Disable the channel B interrupt on IRQ line 1
     pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, false); // Halt the data state machine
     pio_sm_set_consecutive_pindirs(SDIO_PIO, SDIO_DATA_SM, SDIO_D0, 4, false); // Set the four data pins starting at SDIO_D0 back to inputs
     g_sdio.transfer_state = SDIO_IDLE;
     return SDIO_OK;
 }
 
-void rp2040_sdio_dma_irq()
-{
-    dma_hw->ints1 = 1 << SDIO_DMA_CH;
-
-    if (g_sdio.transfer_state == SDIO_TX)
-        rp2040_sdio_tx_irq();
-    else if (g_sdio.transfer_state == SDIO_RX)
-        rp2040_sdio_rx_irq();
-}
-
-void rp2040_sdio_init()
+void rp2040_sdio_init(int clock_divider)
 {
     // Mark resources as being in use, unless it has been done already.
     static bool resources_claimed = false;
@@ -625,11 +724,17 @@ void rp2040_sdio_init()
         pio_sm_claim(SDIO_PIO, SDIO_CMD_SM);
         pio_sm_claim(SDIO_PIO, SDIO_DATA_SM);
         dma_channel_claim(SDIO_DMA_CH);
+        dma_channel_claim(SDIO_DMA_CHB);
         resources_claimed = true;
     }
 
     memset(&g_sdio, 0, sizeof(g_sdio));
 
+    dma_channel_abort(SDIO_DMA_CH);
+    dma_channel_abort(SDIO_DMA_CHB);
+    pio_sm_set_enabled(SDIO_PIO, SDIO_CMD_SM, false);
+    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, false);
+
     // Load PIO programs
     pio_clear_instruction_memory(SDIO_PIO);
 
@@ -643,7 +748,7 @@ void rp2040_sdio_init()
     sm_config_set_sideset_pins(&cfg, SDIO_CLK);
     sm_config_set_out_shift(&cfg, false, true, 32);
     sm_config_set_in_shift(&cfg, false, true, 32);
-    sm_config_set_clkdiv_int_frac(&cfg, 5, 0);
+    sm_config_set_clkdiv_int_frac(&cfg, clock_divider, 0);
     sm_config_set_mov_status(&cfg, STATUS_TX_LESSTHAN, 2);
 
     pio_sm_init(SDIO_PIO, SDIO_CMD_SM, g_sdio.pio_cmd_clk_offset, &cfg);
@@ -655,20 +760,25 @@ void rp2040_sdio_init()
     g_sdio.pio_cfg_data_rx = sdio_data_rx_program_get_default_config(g_sdio.pio_data_rx_offset);
     sm_config_set_in_pins(&g_sdio.pio_cfg_data_rx, SDIO_D0);
     sm_config_set_in_shift(&g_sdio.pio_cfg_data_rx, false, true, 32);
-    sm_config_set_fifo_join(&g_sdio.pio_cfg_data_rx, PIO_FIFO_JOIN_RX);
+    sm_config_set_out_shift(&g_sdio.pio_cfg_data_rx, false, true, 32);
+    sm_config_set_clkdiv_int_frac(&g_sdio.pio_cfg_data_rx, clock_divider, 0);
 
     // Data transmission program
     g_sdio.pio_data_tx_offset = pio_add_program(SDIO_PIO, &sdio_data_tx_program);
-    g_sdio.pio_cfg_data_tx = sdio_data_rx_program_get_default_config(g_sdio.pio_data_tx_offset);
+    g_sdio.pio_cfg_data_tx = sdio_data_tx_program_get_default_config(g_sdio.pio_data_tx_offset);
+    sm_config_set_in_pins(&g_sdio.pio_cfg_data_tx, SDIO_D0);
+    sm_config_set_set_pins(&g_sdio.pio_cfg_data_tx, SDIO_D0, 4);
     sm_config_set_out_pins(&g_sdio.pio_cfg_data_tx, SDIO_D0, 4);
+    sm_config_set_in_shift(&g_sdio.pio_cfg_data_tx, false, false, 32);
     sm_config_set_out_shift(&g_sdio.pio_cfg_data_tx, false, true, 32);
-    sm_config_set_fifo_join(&g_sdio.pio_cfg_data_tx, PIO_FIFO_JOIN_TX);
+    sm_config_set_clkdiv_int_frac(&g_sdio.pio_cfg_data_tx, clock_divider, 0);
 
-    // Disable CLK pin input synchronizer.
-    // This reduces delay from clk state machine to data state machine.
-    // Because the CLK pin is output and driven synchronously to CPU clock,
-    // there is no metastability problems.
-    SDIO_PIO->input_sync_bypass |= (1 << SDIO_CLK);
+    // Disable SDIO pins input synchronizer.
+    // This reduces input delay.
+    // Because the CLK is driven synchronously to CPU clock,
+    // there should be no metastability problems.
+    SDIO_PIO->input_sync_bypass |= (1 << SDIO_CLK) | (1 << SDIO_CMD)
+                                 | (1 << SDIO_D0) | (1 << SDIO_D1) | (1 << SDIO_D2) | (1 << SDIO_D3);
 
     // Redirect GPIOs to PIO
     gpio_set_function(SDIO_CMD, GPIO_FUNC_PIO1);
@@ -679,10 +789,6 @@ void rp2040_sdio_init()
     gpio_set_function(SDIO_D3, GPIO_FUNC_PIO1);
 
     // Set up IRQ handler when DMA completes.
-    // This is time-critical because the CRC must be written / read before PIO FIFO runs out.
-    dma_hw->ints1 = 1 << SDIO_DMA_CH;
-    dma_channel_set_irq1_enabled(SDIO_DMA_CH, true);
-    irq_set_exclusive_handler(DMA_IRQ_1, rp2040_sdio_dma_irq);
+    irq_set_exclusive_handler(DMA_IRQ_1, rp2040_sdio_tx_irq);
     irq_set_enabled(DMA_IRQ_1, true);
-    irq_set_priority(DMA_IRQ_1, 255);
 }

+ 6 - 1
lib/ZuluSCSI_platform_RP2040/rp2040_sdio.h

@@ -13,8 +13,13 @@ enum sdio_status_t {
     SDIO_ERR_RESPONSE_CODE = 4,    // Response command code does not match what was sent
     SDIO_ERR_DATA_TIMEOUT = 5,     // Timed out waiting for data block
     SDIO_ERR_DATA_CRC = 6,         // CRC for data packet is wrong
+    SDIO_ERR_WRITE_CRC = 7,        // Card reports bad CRC for write
+    SDIO_ERR_WRITE_FAIL = 8,       // Card reports write failure
 };
 
+#define SDIO_BLOCK_SIZE 512
+#define SDIO_WORDS_PER_BLOCK 128
+
 // Execute a command that has 48-bit reply (response types R1, R6, R7)
 // If response is NULL, does not wait for reply.
 sdio_status_t rp2040_sdio_command_R1(uint8_t command, uint32_t arg, uint32_t *response);
@@ -44,4 +49,4 @@ sdio_status_t rp2040_sdio_tx_poll(uint32_t *bytes_complete = nullptr);
 sdio_status_t rp2040_sdio_stop();
 
 // (Re)initialize the SDIO interface
-void rp2040_sdio_init();
+void rp2040_sdio_init(int clock_divider = 1);

+ 45 - 21
lib/ZuluSCSI_platform_RP2040/rp2040_sdio.pio

@@ -20,10 +20,10 @@
 ;
 ; Because data is written on the falling edge and read on the rising
 ; edge, it is preferable to have a long 0 state and short 1 state.
-;.define CLOCK_DIVIDER 3
-.define CLOCK_DIVIDER 25
-.define D1 (CLOCK_DIVIDER/2 - 1)
-.define D0 ((CLOCK_DIVIDER + 1) / 2 - 1)
+;.define CLKDIV 3
+.define CLKDIV 5
+.define D0 ((CLKDIV + 1) / 2 - 1)
+.define D1 (CLKDIV/2 - 1)
 .define SDIO_CLK_GPIO 18
 
 ; State machine 0 is used to:
@@ -95,27 +95,51 @@ resp_done:
 
 ; Data reception program
 ; This program will wait for initial start of block token and then
-; continuously receive data. The application can set limit of bytes
-; to receive by using DMA controller, and the final checksum will
-; fit in state machine RX FIFO.
+; receive a data block. The application must set number of nibbles
+; to receive minus 1 to Y register before running this program.
 .program sdio_data_rx
 
 wait_start:
-    wait 0 pin 0                ; Wait for zero state on D0
-    wait 1 gpio SDIO_CLK_GPIO   ; Wait for rising edge
+    mov X, Y                               ; Reinitialize number of nibbles to receive
+    wait 0 pin 0                           ; Wait for zero state on D0
+    wait 1 gpio SDIO_CLK_GPIO  [CLKDIV-1]  ; Wait for rising edge and then whole clock cycle
 
-.wrap_target
-    wait 0 gpio SDIO_CLK_GPIO
-    wait 1 gpio SDIO_CLK_GPIO   ; Wait for rising clock edge
-    in PINS, 4                  ; Read nibble
-.wrap
+rx_data:
+    in PINS, 4                 [CLKDIV-2]  ; Read nibble
+    jmp X--, rx_data
 
 ; Data transmission program
-; This program will simply send nibbles out to pins, synchronous
-; to the clock signal. The application should prepend 0xF0 to the data
-; for the start of block token, and append the checksum.
-; The data should be padded to full 32 bits by 0xFF bytes.
+;
+; Before running this program, pindirs should be set as output
+; and register X should be initialized with the number of nibbles
+; to send minus 1 (typically 8 + 1024 + 16 + 1 - 1 = 1048)
+; and register Y with the number of response bits minus 1 (typically 31).
+;
+; Words written to TX FIFO must be:
+; - Word 0: start token 0xFFFFFFF0
+; - Word 1-128: transmitted data (512 bytes)
+; - Word 129-130: CRC checksum
+; - Word 131: end token 0xFFFFFFFF
+;
+; After the card reports idle status, RX FIFO will get a word that
+; contains the D0 line response from card.
+
 .program sdio_data_tx
-    wait 1 gpio SDIO_CLK_GPIO
-    wait 0 gpio SDIO_CLK_GPIO   ; Wait for falling clock edge
-    out PINS, 4                 ; Write nibble
+    wait 0 gpio SDIO_CLK_GPIO  
+    wait 1 gpio SDIO_CLK_GPIO  [CLKDIV + D1 - 1]; Synchronize so that write occurs on falling edge
+
+tx_loop:
+    out PINS, 4                [D0]    ; Write nibble and wait for whole clock cycle
+    jmp X-- tx_loop            [D1]
+
+    set pindirs, 0x00          [D0]    ; Set data bus as input
+
+.wrap_target
+response_loop:
+    in PINS, 1                 [D1]    ; Read D0 on rising edge
+    jmp Y--, response_loop     [D0]
+
+wait_idle:
+    wait 1 pin 0               [D1]    ; Wait for card to indicate idle condition
+    push                       [D0]    ; Push the response token
+.wrap

+ 36 - 30
lib/ZuluSCSI_platform_RP2040/rp2040_sdio.pio.h

@@ -17,24 +17,24 @@
 
 static const uint16_t sdio_cmd_clk_program_instructions[] = {
             //     .wrap_target
-    0xbbe3, //  0: mov    osr, null       side 1 [11]
-    0xac4d, //  1: mov    y, !status      side 0 [12]
-    0x1b61, //  2: jmp    !y, 1           side 1 [11]
-    0x6c60, //  3: out    null, 32        side 0 [12]
-    0x7b28, //  4: out    x, 8            side 1 [11]
-    0xec01, //  5: set    pins, 1         side 0 [12]
-    0xfb81, //  6: set    pindirs, 1      side 1 [11]
-    0x6c01, //  7: out    pins, 1         side 0 [12]
-    0x1b47, //  8: jmp    x--, 7          side 1 [11]
-    0xec80, //  9: set    pindirs, 0      side 0 [12]
-    0x7b28, // 10: out    x, 8            side 1 [11]
-    0xac42, // 11: nop                    side 0 [12]
-    0x1b31, // 12: jmp    !x, 17          side 1 [11]
-    0xac42, // 13: nop                    side 0 [12]
-    0x1bcd, // 14: jmp    pin, 13         side 1 [11]
-    0x4c01, // 15: in     pins, 1         side 0 [12]
-    0x1b4f, // 16: jmp    x--, 15         side 1 [11]
-    0x8c20, // 17: push   block           side 0 [12]
+    0xb1e3, //  0: mov    osr, null       side 1 [1] 
+    0xa24d, //  1: mov    y, !status      side 0 [2] 
+    0x1161, //  2: jmp    !y, 1           side 1 [1] 
+    0x6260, //  3: out    null, 32        side 0 [2] 
+    0x7128, //  4: out    x, 8            side 1 [1] 
+    0xe201, //  5: set    pins, 1         side 0 [2] 
+    0xf181, //  6: set    pindirs, 1      side 1 [1] 
+    0x6201, //  7: out    pins, 1         side 0 [2] 
+    0x1147, //  8: jmp    x--, 7          side 1 [1] 
+    0xe280, //  9: set    pindirs, 0      side 0 [2] 
+    0x7128, // 10: out    x, 8            side 1 [1] 
+    0xa242, // 11: nop                    side 0 [2] 
+    0x1131, // 12: jmp    !x, 17          side 1 [1] 
+    0xa242, // 13: nop                    side 0 [2] 
+    0x11cd, // 14: jmp    pin, 13         side 1 [1] 
+    0x4201, // 15: in     pins, 1         side 0 [2] 
+    0x114f, // 16: jmp    x--, 15         side 1 [1] 
+    0x8220, // 17: push   block           side 0 [2] 
             //     .wrap
 };
 
@@ -57,16 +57,16 @@ static inline pio_sm_config sdio_cmd_clk_program_get_default_config(uint offset)
 // sdio_data_rx //
 // ------------ //
 
-#define sdio_data_rx_wrap_target 2
+#define sdio_data_rx_wrap_target 0
 #define sdio_data_rx_wrap 4
 
 static const uint16_t sdio_data_rx_program_instructions[] = {
-    0x2020, //  0: wait   0 pin, 0
-    0x2092, //  1: wait   1 gpio, 18
             //     .wrap_target
-    0x2012, //  2: wait   0 gpio, 18
-    0x2092, //  3: wait   1 gpio, 18
-    0x4004, //  4: in     pins, 4
+    0xa022, //  0: mov    x, y                       
+    0x2020, //  1: wait   0 pin, 0                   
+    0x2492, //  2: wait   1 gpio, 18             [4] 
+    0x4304, //  3: in     pins, 4                [3] 
+    0x0043, //  4: jmp    x--, 3                     
             //     .wrap
 };
 
@@ -88,21 +88,27 @@ static inline pio_sm_config sdio_data_rx_program_get_default_config(uint offset)
 // sdio_data_tx //
 // ------------ //
 
-#define sdio_data_tx_wrap_target 0
-#define sdio_data_tx_wrap 2
+#define sdio_data_tx_wrap_target 5
+#define sdio_data_tx_wrap 8
 
 static const uint16_t sdio_data_tx_program_instructions[] = {
+    0x2012, //  0: wait   0 gpio, 18                 
+    0x2592, //  1: wait   1 gpio, 18             [5] 
+    0x6204, //  2: out    pins, 4                [2] 
+    0x0142, //  3: jmp    x--, 2                 [1] 
+    0xe280, //  4: set    pindirs, 0             [2] 
             //     .wrap_target
-    0x2092, //  0: wait   1 gpio, 18
-    0x2012, //  1: wait   0 gpio, 18
-    0x6004, //  2: out    pins, 4
+    0x4101, //  5: in     pins, 1                [1] 
+    0x0285, //  6: jmp    y--, 5                 [2] 
+    0x21a0, //  7: wait   1 pin, 0               [1] 
+    0x8220, //  8: push   block                  [2] 
             //     .wrap
 };
 
 #if !PICO_NO_HARDWARE
 static const struct pio_program sdio_data_tx_program = {
     .instructions = sdio_data_tx_program_instructions,
-    .length = 3,
+    .length = 9,
     .origin = -1,
 };
 

+ 106 - 47
lib/ZuluSCSI_platform_RP2040/sd_card_sdio.cpp

@@ -15,6 +15,7 @@ static uint32_t g_sdio_rca; // Relative card address
 static cid_t g_sdio_cid;
 static int g_sdio_error_line;
 static sdio_status_t g_sdio_error;
+static uint32_t g_sdio_dma_buf[128];
 
 #define checkReturnOk(call) ((g_sdio_error = (call)) == SDIO_OK ? true : logSDError(__LINE__))
 static bool logSDError(int line)
@@ -24,12 +25,48 @@ static bool logSDError(int line)
     return false;
 }
 
+// Callback used by SCSI code for simultaneous processing
+static sd_callback_t m_stream_callback;
+static const uint8_t *m_stream_buffer;
+static uint32_t m_stream_count;
+static uint32_t m_stream_count_start;
+
+void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer)
+{
+    m_stream_callback = func;
+    m_stream_buffer = buffer;
+    m_stream_count = 0;
+    m_stream_count_start = 0;
+}
+
+static sd_callback_t get_stream_callback(const uint8_t *buf, uint32_t count)
+{
+    m_stream_count_start = m_stream_count;
+
+    if (m_stream_callback)
+    {
+        if (buf == m_stream_buffer + m_stream_count)
+        {
+            m_stream_count += count;
+            return m_stream_callback;
+        }
+        else
+        {
+            azdbg("Stream buffer mismatch: ", (uint32_t)buf, " vs. ", (uint32_t)(m_stream_buffer + m_stream_count));
+            return NULL;
+        }
+    }
+    
+    return NULL;
+}
+
 bool SdioCard::begin(SdioConfig sdioConfig)
 {
     uint32_t reply;
     sdio_status_t status;
     
-    rp2040_sdio_init();
+    // Initialize at 1 MHz clock speed (the argument appears to be a clock divider: 25 -> 1 MHz)
+    rp2040_sdio_init(25);
 
     // Establish initial connection with the card
     for (int retries = 0; retries < 5; retries++)
@@ -97,6 +134,9 @@ bool SdioCard::begin(SdioConfig sdioConfig)
         return false;
     }
 
+    // Card initialization succeeded: increase to 25 MHz clock rate (argument 1, apparently a clock divider)
+    rp2040_sdio_init(1);
+
     return true;
 }
 
@@ -194,6 +234,10 @@ bool SdioCard::stopTransmission(bool blocking)
         uint32_t end = millis() + 100;
         while (millis() < end && isBusy())
         {
+            if (m_stream_callback)
+            {
+                m_stream_callback(m_stream_count);
+            }
         }
         if (isBusy())
         {
@@ -246,43 +290,16 @@ bool SdioCard::erase(uint32_t firstSector, uint32_t lastSector)
 
 /* Writing and reading, with progress callback */
 
-static sd_callback_t m_stream_callback;
-static const uint8_t *m_stream_buffer;
-static uint32_t m_stream_count;
-static uint32_t m_stream_count_start;
-
-void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer)
-{
-    m_stream_callback = func;
-    m_stream_buffer = buffer;
-    m_stream_count = 0;
-    m_stream_count_start = 0;
-}
-
-static sd_callback_t get_stream_callback(const uint8_t *buf, uint32_t count)
+bool SdioCard::writeSector(uint32_t sector, const uint8_t* src)
 {
-    m_stream_count_start = m_stream_count;
-
-    if (m_stream_callback)
+    if (((uint32_t)src & 3) != 0)
     {
-        if (buf == m_stream_buffer + m_stream_count)
-        {
-            m_stream_count += count;
-            return m_stream_callback;
-        }
-        else
-        {
-            azdbg("Stream buffer mismatch: ", (uint32_t)buf, " vs. ", (uint32_t)(m_stream_buffer + m_stream_count));
-            return NULL;
-        }
+        // Buffer is not 4-byte aligned, so memcpy() the 512-byte sector to the word-aligned temporary buffer.
+        memcpy(g_sdio_dma_buf, src, sizeof(g_sdio_dma_buf));
+        src = (uint8_t*)g_sdio_dma_buf;
     }
-    
-    return NULL;
-}
 
-
-bool SdioCard::writeSector(uint32_t sector, const uint8_t* src)
-{
+    // If possible, report transfer status to application through callback.
     sd_callback_t callback = get_stream_callback(src, 512);
 
     uint32_t reply;
@@ -305,7 +322,7 @@ bool SdioCard::writeSector(uint32_t sector, const uint8_t* src)
 
     if (g_sdio_error != SDIO_OK)
     {
-        azdbg("SdioCard::writeSector(", sector, ") failed: ", (int)g_sdio_error);
+        azlog("SdioCard::writeSector(", sector, ") failed: ", (int)g_sdio_error);
     }
 
     return g_sdio_error == SDIO_OK;
@@ -313,7 +330,20 @@ bool SdioCard::writeSector(uint32_t sector, const uint8_t* src)
 
 bool SdioCard::writeSectors(uint32_t sector, const uint8_t* src, size_t n)
 {
-    sd_callback_t callback = get_stream_callback(src, 512);
+    if (((uint32_t)src & 3) != 0)
+    {
+        // Unaligned write, execute sector-by-sector
+        for (size_t i = 0; i < n; i++)
+        {
+            if (!writeSector(sector + i, src + 512 * i))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    sd_callback_t callback = get_stream_callback(src, n * 512);
 
     uint32_t reply;
     if (!checkReturnOk(rp2040_sdio_command_R1(16, 512, &reply)) || // SET_BLOCKLEN
@@ -335,18 +365,27 @@ bool SdioCard::writeSectors(uint32_t sector, const uint8_t* src, size_t n)
         }
     } while (g_sdio_error == SDIO_BUSY);
 
-    checkReturnOk(rp2040_sdio_command_R1(CMD12, 0, &reply)); // STOP_TRANSMISSION
-
     if (g_sdio_error != SDIO_OK)
     {
-        azdbg("SdioCard::writeSectors(", sector, ",...,", (int)n, ") failed: ", (int)g_sdio_error);
+        azlog("SdioCard::writeSectors(", sector, ",...,", (int)n, ") failed: ", (int)g_sdio_error);
+        stopTransmission(true);
+        return false;
+    }
+    else
+    {
+        return stopTransmission(true);
     }
-
-    return g_sdio_error == SDIO_OK;
 }
 
 bool SdioCard::readSector(uint32_t sector, uint8_t* dst)
 {
+    uint8_t *real_dst = dst;
+    if (((uint32_t)dst & 3) != 0)
+    {
+        // Buffer is not 4-byte aligned: read into the temporary buffer, then memcpy() to the caller's buffer afterwards.
+        dst = (uint8_t*)g_sdio_dma_buf;
+    }
+
     sd_callback_t callback = get_stream_callback(dst, 512);
 
     uint32_t reply;
@@ -369,7 +408,12 @@ bool SdioCard::readSector(uint32_t sector, uint8_t* dst)
 
     if (g_sdio_error != SDIO_OK)
     {
-        azdbg("SdioCard::readSector(", sector, ") failed: ", (int)g_sdio_error);
+        azlog("SdioCard::readSector(", sector, ") failed: ", (int)g_sdio_error);
+    }
+
+    if (dst != real_dst)
+    {
+        memcpy(real_dst, g_sdio_dma_buf, sizeof(g_sdio_dma_buf));
     }
 
     return g_sdio_error == SDIO_OK;
@@ -377,6 +421,19 @@ bool SdioCard::readSector(uint32_t sector, uint8_t* dst)
 
 bool SdioCard::readSectors(uint32_t sector, uint8_t* dst, size_t n)
 {
+    if (((uint32_t)dst & 3) != 0)
+    {
+        // Unaligned read, execute sector-by-sector
+        for (size_t i = 0; i < n; i++)
+        {
+            if (!readSector(sector + i, dst + 512 * i))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
     sd_callback_t callback = get_stream_callback(dst, n * 512);
 
     uint32_t reply;
@@ -397,14 +454,16 @@ bool SdioCard::readSectors(uint32_t sector, uint8_t* dst, size_t n)
         }
     } while (g_sdio_error == SDIO_BUSY);
 
-    checkReturnOk(rp2040_sdio_command_R1(CMD12, 0, &reply)); // STOP_TRANSMISSION
-
     if (g_sdio_error != SDIO_OK)
     {
-        azdbg("SdioCard::readSectors(", sector, ",...,", (int)n, ") failed: ", (int)g_sdio_error);
+        azlog("SdioCard::readSectors(", sector, ",...,", (int)n, ") failed: ", (int)g_sdio_error);
+        stopTransmission(true);
+        return false;
+    }
+    else
+    {
+        return stopTransmission(true);
     }
-
-    return g_sdio_error == SDIO_OK;
 }
 
 // These functions are not used for SDIO mode but are needed to avoid build error.