Przeglądaj źródła

Add SCSI acceleration using PIO peripheral on RP2040.

(cherry picked from commit 48ec5741c2e767b07661bd0bc3df7ab342db8d61)
Petteri Aimonen 3 lat temu
rodzic
commit
9b7d1bfa1c

+ 10 - 9
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform.cpp

@@ -57,7 +57,7 @@ void azplatform_init()
 
     /* Initialize logging to SWO pin (UART0) */
     gpio_conf(SWO_PIN,        GPIO_FUNC_UART,false,false, true,  false, true);
-    uart_init(uart0, 2000000);
+    uart_init(uart0, 1000000);
     mbed_set_error_hook(mbed_error_hook);
 
     azlog("DIP switch settings: initiator ", (int)initiator, ", debug log ", (int)dbglog, ", termination ", (int)termination);
@@ -85,7 +85,8 @@ void azplatform_init()
      * SCSI pins should be inactive / input at this point.
      */
 
-    // SCSI data bus
+    // SCSI data bus direction is switched by DATA_DIR signal.
+    // Pullups make sure that no glitches occur when switching direction.
     //        pin             function       pup   pdown  out    state fast
     gpio_conf(SCSI_IO_DB0,    GPIO_FUNC_SIO, true, false, false, true, true);
     gpio_conf(SCSI_IO_DB1,    GPIO_FUNC_SIO, true, false, false, true, true);
@@ -101,7 +102,9 @@ void azplatform_init()
     //        pin             function       pup   pdown  out    state fast
     gpio_conf(SCSI_OUT_IO,    GPIO_FUNC_SIO, false,false, true,  true, true);
     gpio_conf(SCSI_OUT_MSG,   GPIO_FUNC_SIO, false,false, true,  true, true);
-    gpio_conf(SCSI_OUT_REQ,   GPIO_FUNC_SIO, false,false, true,  true, true);
+
+    // REQ pin is switched between PIO and SIO, pull-up makes sure no glitches
+    gpio_conf(SCSI_OUT_REQ,   GPIO_FUNC_SIO, true ,false, true,  true, true);
 
     // Shared pins are changed to input / output depending on communication phase
     gpio_conf(SCSI_IN_SEL,    GPIO_FUNC_SIO, true, false, false, true, true);
@@ -230,9 +233,8 @@ void azplatform_reset_watchdog()
 /* Mapping from data bytes to GPIO BOP values */
 /**********************************************/
 
-/* A lookup table is the fastest way to calculate parity and convert the IO pin mapping for
- * data bus. The method below uses the BOP register of GD32, this is called BSRR on STM32.
- * If there are no other pins on the same port, you can also use direct writes to the GPIO.
+/* A lookup table is the fastest way to calculate parity and convert the IO pin mapping for data bus.
+ * For RP2040 we expect that the bits are consecutive and in order.
  */
 
 #define PARITY(n) ((1 ^ (n) ^ ((n)>>1) ^ ((n)>>2) ^ ((n)>>3) ^ ((n)>>4) ^ ((n)>>5) ^ ((n)>>6) ^ ((n)>>7)) & 1)
@@ -245,11 +247,10 @@ void azplatform_reset_watchdog()
     ((n & 0x20) ? 0 : (1 << SCSI_IO_DB5)) | \
     ((n & 0x40) ? 0 : (1 << SCSI_IO_DB6)) | \
     ((n & 0x80) ? 0 : (1 << SCSI_IO_DB7)) | \
-    (PARITY(n)  ? 0 : (1 << SCSI_IO_DBP)) | \
-    (1 << SCSI_OUT_REQ) \
+    (PARITY(n)  ? 0 : (1 << SCSI_IO_DBP)) \
 )
 
-const uint32_t g_scsi_out_byte_lookup[256] =
+const uint32_t g_scsi_parity_lookup[256] =
 {
     X(0x00), X(0x01), X(0x02), X(0x03), X(0x04), X(0x05), X(0x06), X(0x07), X(0x08), X(0x09), X(0x0a), X(0x0b), X(0x0c), X(0x0d), X(0x0e), X(0x0f),
     X(0x10), X(0x11), X(0x12), X(0x13), X(0x14), X(0x15), X(0x16), X(0x17), X(0x18), X(0x19), X(0x1a), X(0x1b), X(0x1c), X(0x1d), X(0x1e), X(0x1f),

+ 3 - 2
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform.h

@@ -73,9 +73,10 @@ void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer);
      sio_hw->gpio_oe_set = SCSI_IO_DATA_MASK)
 
 // Write SCSI data bus, also sets REQ to inactive.
-extern const uint32_t g_scsi_out_byte_lookup[256];
+extern const uint32_t g_scsi_parity_lookup[256];
 #define SCSI_OUT_DATA(data) \
-    gpio_put_masked(SCSI_IO_DATA_MASK | (1 << SCSI_OUT_REQ), g_scsi_out_byte_lookup[(uint8_t)(data)]), \
+    gpio_put_masked(SCSI_IO_DATA_MASK | (1 << SCSI_OUT_REQ), \
+                    g_scsi_parity_lookup[(uint8_t)(data)] | (1 << SCSI_OUT_REQ)), \
     SCSI_ENABLE_DATA_OUT()
 
 // Release SCSI data bus and REQ signal

+ 46 - 23
lib/ZuluSCSI_platform_RP2040/scsiPhy.cpp

@@ -6,6 +6,7 @@
 #include "ZuluSCSI_log.h"
 #include "ZuluSCSI_log_trace.h"
 #include "ZuluSCSI_config.h"
+#include "scsi_accel_rp2040.h"
 
 #include <scsi2sd.h>
 extern "C" {
@@ -128,6 +129,8 @@ extern "C" void scsiPhyReset(void)
     g_scsi_sts_selection = 0;
     g_scsi_ctrl_bsy = 0;
 
+    scsi_accel_rp2040_init();
+
     // Enable BSY and RST interrupts
     // Note: RP2040 library currently supports only one callback,
     // so it has to be same for both pins.
@@ -245,33 +248,39 @@ extern "C" void scsiWriteByte(uint8_t value)
 
 extern "C" void scsiWrite(const uint8_t* data, uint32_t count)
 {
-    scsiLogDataIn(data, count);
-    for (uint32_t i = 0; i < count; i++)
-    {
-        if (scsiDev.resetFlag) break;
-        scsiWriteOneByte(data[i]);
-    }
+    scsiStartWrite(data, count);
+    scsiFinishWrite();
 }
 
 extern "C" void scsiStartWrite(const uint8_t* data, uint32_t count)
 {
-    // If the platform supports DMA for either SD card access or for SCSI bus,
-    // this function can be used to execute SD card transfers in parallel with
-    // SCSI transfers. This usually doubles the transfer speed.
-    //
-    // For simplicity, this example only implements blocking writes.
-    scsiWrite(data, count);
+    scsiLogDataIn(data, count);
+
+    if ((count & 1) != 0)
+    {
+        // Unaligned write, do it byte-by-byte
+        scsiFinishWrite();
+        for (uint32_t i = 0; i < count; i++)
+        {
+            if (scsiDev.resetFlag) break;
+            scsiWriteOneByte(data[i]);
+        }
+    }
+    else
+    {
+        // Use accelerated routine
+        scsi_accel_rp2040_startWrite(data, count, &scsiDev.resetFlag);
+    }
 }
 
 extern "C" bool scsiIsWriteFinished(const uint8_t *data)
 {
-    // Asynchronous writes are not implemented in this example.
-    return true;
+    return scsi_accel_rp2040_isWriteFinished(data);
 }
 
 extern "C" void scsiFinishWrite()
 {
-    // Asynchronous writes are not implemented in this example.
+    scsi_accel_rp2040_finishWrite(&scsiDev.resetFlag);
 }
 
 /*********************/
@@ -279,21 +288,27 @@ extern "C" void scsiFinishWrite()
 /*********************/
 
 // Read one byte from SCSI host using the handshake mechanism.
-static inline uint8_t scsiReadOneByte(void)
+static inline uint8_t scsiReadOneByte(int* parityError)
 {
     SCSI_OUT(REQ, 1);
     SCSI_WAIT_ACTIVE(ACK);
     delay_100ns();
-    uint8_t r = SCSI_IN_DATA();
+    uint16_t r = SCSI_IN_DATA();
     SCSI_OUT(REQ, 0);
     SCSI_WAIT_INACTIVE(ACK);
 
-    return r;
+    if (parityError && r != (g_scsi_parity_lookup[r & 0xFF] ^ SCSI_IO_DATA_MASK))
+    {
+        azlog("Parity error in scsiReadOneByte(): ", (uint32_t)r);
+        *parityError = 1;
+    }
+
+    return (uint8_t)r;
 }
 
 extern "C" uint8_t scsiReadByte(void)
 {
-    uint8_t r = scsiReadOneByte();
+    uint8_t r = scsiReadOneByte(NULL);
     scsiLogDataOut(&r, 1);
     return r;
 }
@@ -302,11 +317,19 @@ extern "C" void scsiRead(uint8_t* data, uint32_t count, int* parityError)
 {
     *parityError = 0;
 
-    for (uint32_t i = 0; i < count; i++)
+    if ((count & 1) != 0)
     {
-        if (scsiDev.resetFlag) break;
-
-        data[i] = scsiReadOneByte();
+        // Unaligned transfer, do byte by byte
+        for (uint32_t i = 0; i < count; i++)
+        {
+            if (scsiDev.resetFlag) break;
+            data[i] = scsiReadOneByte(parityError);
+        }
+    }
+    else
+    {
+        // Use accelerated routine
+        scsi_accel_rp2040_read(data, count, parityError, &scsiDev.resetFlag);
     }
 
     scsiLogDataOut(data, count);

+ 46 - 0
lib/ZuluSCSI_platform_RP2040/scsi_accel.pio

@@ -0,0 +1,46 @@
+; RP2040 PIO program for accelerating SCSI communication
+; Run "pioasm scsi_accel.pio scsi_accel.pio.h" to regenerate the C header from this.
+; GPIO mapping:
+; - 0-7: DB0-DB7
+; -   8: DBP
+; Side set is REQ pin
+
+.define REQ 9
+.define ACK 10
+
+; Delay from data setup to REQ assertion.
+; deskew delay + cable skew delay = 55 ns minimum
+; One clock cycle is 8 ns => delay 7 clocks
+.define REQ_DLY 7
+
+; Write to SCSI bus using asynchronous handshake.
+; Data is written as 16-bit words that contain the 8 data bits + 1 parity bit.
+; 7 bits in each word are discarded.
+; Number of bytes to send must be multiple of 2.
+.program scsi_accel_async_write
+    .side_set 1
+
+    pull ifempty block          side 1  ; Get data from TX FIFO
+    out pins, 9                 side 1  ; Write data and parity bit
+    out null, 7 [REQ_DLY-2]     side 1  ; Discard unused bits, wait for data preset time
+    wait 1 gpio ACK             side 1  ; Wait for ACK to be inactive
+    wait 0 gpio ACK             side 0  ; Assert REQ, wait for ACK low
+
+; Read from SCSI bus using asynchronous handshake.
+; Data is returned as 16-bit words that contain the 8 data bits + 1 parity bit.
+; Number of bytes to receive minus 1 should be written to TX fifo.
+; Number of bytes to receive must be divisible by 2.
+.program scsi_accel_async_read
+    .side_set 1
+
+    pull block                  side 1  ; Get number of bytes to receive
+    mov x, osr                  side 1  ; Store to counter X
+
+start:
+    wait 1 gpio ACK             side 1  ; Wait for ACK high
+    wait 0 gpio ACK             side 0  ; Assert REQ, wait for ACK low
+    in pins, 9                  side 1  ; Deassert REQ, read GPIO
+    in null, 7                  side 1  ; Padding bits
+    push iffull block           side 1  ; Put data to RX FIFO
+    jmp x-- start               side 1  ; Decrement byte count and jump to start
+

+ 77 - 0
lib/ZuluSCSI_platform_RP2040/scsi_accel.pio.h

@@ -0,0 +1,77 @@
+// -------------------------------------------------- //
+// This file is autogenerated by pioasm; do not edit! //
+// -------------------------------------------------- //
+
+#pragma once
+
+#if !PICO_NO_HARDWARE
+#include "hardware/pio.h"
+#endif
+
+// ---------------------- //
+// scsi_accel_async_write //
+// ---------------------- //
+
+#define scsi_accel_async_write_wrap_target 0
+#define scsi_accel_async_write_wrap 4
+
+static const uint16_t scsi_accel_async_write_program_instructions[] = {
+            //     .wrap_target
+    0x90e0, //  0: pull   ifempty block   side 1     
+    0x7009, //  1: out    pins, 9         side 1     
+    0x7567, //  2: out    null, 7         side 1 [5] 
+    0x308a, //  3: wait   1 gpio, 10      side 1     
+    0x200a, //  4: wait   0 gpio, 10      side 0     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_accel_async_write_program = {
+    .instructions = scsi_accel_async_write_program_instructions,
+    .length = 5,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_accel_async_write_program_get_default_config(uint offset) {
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_accel_async_write_wrap_target, offset + scsi_accel_async_write_wrap);
+    sm_config_set_sideset(&c, 1, false, false);
+    return c;
+}
+#endif
+
+// --------------------- //
+// scsi_accel_async_read //
+// --------------------- //
+
+#define scsi_accel_async_read_wrap_target 0
+#define scsi_accel_async_read_wrap 7
+
+static const uint16_t scsi_accel_async_read_program_instructions[] = {
+            //     .wrap_target
+    0x90a0, //  0: pull   block           side 1     
+    0xb027, //  1: mov    x, osr          side 1     
+    0x308a, //  2: wait   1 gpio, 10      side 1     
+    0x200a, //  3: wait   0 gpio, 10      side 0     
+    0x5009, //  4: in     pins, 9         side 1     
+    0x5067, //  5: in     null, 7         side 1     
+    0x9060, //  6: push   iffull block    side 1     
+    0x1042, //  7: jmp    x--, 2          side 1     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_accel_async_read_program = {
+    .instructions = scsi_accel_async_read_program_instructions,
+    .length = 8,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_accel_async_read_program_get_default_config(uint offset) {
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_accel_async_read_wrap_target, offset + scsi_accel_async_read_wrap);
+    sm_config_set_sideset(&c, 1, false, false);
+    return c;
+}
+#endif
+

+ 455 - 0
lib/ZuluSCSI_platform_RP2040/scsi_accel_rp2040.cpp

@@ -0,0 +1,455 @@
+/* Data flow in SCSI acceleration:
+ *
+ * 1. Application provides a buffer of bytes to send.
+ * 2. Code in this module adds parity bit to the bytes and packs two bytes into 32 bit words.
+ * 3. DMA controller copies the words to PIO peripheral FIFO.
+ * 4. PIO peripheral handles low-level SCSI handshake and writes bytes and parity to GPIO.
+ */
+
+#include "ZuluSCSI_platform.h"
+#include "ZuluSCSI_log.h"
+#include "scsi_accel_rp2040.h"
+#include "scsi_accel.pio.h"
+#include <hardware/pio.h>
+#include <hardware/dma.h>
+#include <hardware/irq.h>
+#include <hardware/structs/iobank0.h>
+
+#define SCSI_DMA_PIO pio0
+#define SCSI_DMA_SM 0
+#define SCSI_DMA_CH 0
+
+enum scsidma_buf_sel_t { SCSIBUF_NONE = 0, SCSIBUF_A = 1, SCSIBUF_B = 2 };
+/* Shared state of the SCSI write accelerator. It is used both by the
+ * foreground functions in this file and by the DMA interrupt handler.
+ * DMA_BUF_SIZE is the capacity of each staging buffer, in 32-bit words. */
+#define DMA_BUF_SIZE 128
+static struct {
+    uint8_t *app_buf; // Buffer provided by application (transfer currently being converted)
+    uint32_t app_bytes; // Bytes available in application buffer
+    uint32_t dma_bytes; // Bytes that have been converted to DMA buffer so far
+    
+    uint8_t *next_app_buf; // Next buffer from application after current one finishes (0 if none queued)
+    uint32_t next_app_bytes; // Bytes in next buffer
+
+    // PIO configurations: program load offsets and prepared state machine configs
+    uint32_t pio_offset_async_write;
+    uint32_t pio_offset_async_read;
+    pio_sm_config pio_cfg_async_write;
+    pio_sm_config pio_cfg_async_read;
+
+    // DMA configurations: prepared channel config for memory -> PIO TX FIFO
+    dma_channel_config dma_write_config;
+
+    // We use two DMA buffers alternately: one is being transferred while the other is refilled.
+    // The buffer contains the data bytes with parity added, two bytes per 32-bit word.
+    scsidma_buf_sel_t dma_current_buf; // Buffer the DMA channel is reading now, or SCSIBUF_NONE
+    uint32_t dma_countA; // Words ready in dma_bufA (0 = empty)
+    uint32_t dma_countB; // Words ready in dma_bufB (0 = empty)
+    uint32_t dma_bufA[DMA_BUF_SIZE];
+    uint32_t dma_bufB[DMA_BUF_SIZE];
+} g_scsi_dma;
+
+enum scsidma_state_t { SCSIDMA_IDLE = 0,
+                       SCSIDMA_WRITE, SCSIDMA_WRITE_DONE,
+                       SCSIDMA_READ };
+static volatile scsidma_state_t g_scsi_dma_state;
+static bool g_channels_claimed = false;
+
+// Fill DMA buffer and return number of words ready to be transferred.
+//
+// Converts up to DMA_BUF_SIZE byte pairs from the current application buffer,
+// starting at offset g_scsi_dma.dma_bytes. Each output word holds the
+// g_scsi_parity_lookup value of the first byte in bits 0-15 and of the
+// second byte in bits 16-31. When the application buffer has been fully
+// converted, the queued "next" buffer (if any) becomes the current one.
+//
+// buf:     destination staging buffer, at least DMA_BUF_SIZE words long.
+// returns: number of 32-bit words written to buf, 0 when no data is left.
+static uint32_t refill_dmabuf(uint32_t *buf)
+{
+    uint32_t count = (g_scsi_dma.app_bytes - g_scsi_dma.dma_bytes) / 2;
+    if (count > DMA_BUF_SIZE) count = DMA_BUF_SIZE;
+
+    // Read the source byte by byte. The application passes a plain uint8_t
+    // pointer with no alignment contract, and a 16-bit load from an odd
+    // address faults on the Cortex-M0+ core. The packing below matches what
+    // a little-endian halfword load produced.
+    const uint8_t *src = &g_scsi_dma.app_buf[g_scsi_dma.dma_bytes];
+    const uint8_t *end = src + count * 2;
+    uint32_t *dst = buf;
+    while (src < end)
+    {
+        uint32_t first = g_scsi_parity_lookup[src[0]];
+        uint32_t second = g_scsi_parity_lookup[src[1]];
+        *dst++ = first | (second << 16);
+        src += 2;
+    }
+
+    g_scsi_dma.dma_bytes += count * 2;
+
+    // Check if this buffer has been fully processed
+    if (g_scsi_dma.dma_bytes >= g_scsi_dma.app_bytes)
+    {
+        assert(g_scsi_dma.dma_bytes == g_scsi_dma.app_bytes);
+        g_scsi_dma.dma_bytes = 0;
+        g_scsi_dma.app_buf = g_scsi_dma.next_app_buf;
+        g_scsi_dma.app_bytes = g_scsi_dma.next_app_bytes;
+        g_scsi_dma.next_app_buf = 0;
+        g_scsi_dma.next_app_bytes = 0;
+    }
+
+    return count;
+}
+
+// Route the nine data bus pins (DB0-DB7 and DBP) to the given pin function.
+static void scsidma_set_data_pin_func(uint32_t func)
+{
+    iobank0_hw->io[SCSI_IO_DB0].ctrl = func;
+    iobank0_hw->io[SCSI_IO_DB1].ctrl = func;
+    iobank0_hw->io[SCSI_IO_DB2].ctrl = func;
+    iobank0_hw->io[SCSI_IO_DB3].ctrl = func;
+    iobank0_hw->io[SCSI_IO_DB4].ctrl = func;
+    iobank0_hw->io[SCSI_IO_DB5].ctrl = func;
+    iobank0_hw->io[SCSI_IO_DB6].ctrl = func;
+    iobank0_hw->io[SCSI_IO_DB7].ctrl = func;
+    iobank0_hw->io[SCSI_IO_DBP].ctrl = func;
+}
+
+// Hand the data bus and REQ pins either to the PIO peripheral or back to
+// software controlled SIO, depending on the current g_scsi_dma_state:
+//   idle  - data bus and REQ on SIO
+//   write - data bus and REQ on PIO, all driven as outputs
+//   read  - data bus on SIO as inputs, only REQ on PIO
+// Any other state leaves the pin routing untouched.
+static void scsidma_config_gpio()
+{
+    if (g_scsi_dma_state == SCSIDMA_IDLE)
+    {
+        scsidma_set_data_pin_func(GPIO_FUNC_SIO);
+        iobank0_hw->io[SCSI_OUT_REQ].ctrl = GPIO_FUNC_SIO;
+        return;
+    }
+
+    if (g_scsi_dma_state == SCSIDMA_WRITE)
+    {
+        // Preset every PIO-driven pin high and as output before switching the mux
+        pio_sm_set_pins(SCSI_DMA_PIO, SCSI_DMA_SM, 0x3FF);
+        pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DMA_SM, 0, 10, true);
+
+        scsidma_set_data_pin_func(GPIO_FUNC_PIO0);
+        iobank0_hw->io[SCSI_OUT_REQ].ctrl = GPIO_FUNC_PIO0;
+        return;
+    }
+
+    if (g_scsi_dma_state == SCSIDMA_READ)
+    {
+        // Pins 0-8 (data bus) become inputs, pin 9 (REQ) stays an output
+        pio_sm_set_pins(SCSI_DMA_PIO, SCSI_DMA_SM, 0x3FF);
+        pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DMA_SM, 0, 9, false);
+        pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DMA_SM, 9, 1, true);
+
+        scsidma_set_data_pin_func(GPIO_FUNC_SIO);
+        iobank0_hw->io[SCSI_OUT_REQ].ctrl = GPIO_FUNC_PIO0;
+    }
+}
+/* Begin a DMA write of the current application buffer.
+ * Both staging buffers are filled first (A, then B, so B holds the data that
+ * follows A), then the DMA channel is configured and started to copy
+ * buffer A into the TX FIFO of the PIO state machine. */
+static void start_dma_write()
+{
+    // Prefill both DMA buffers
+    g_scsi_dma.dma_countA = refill_dmabuf(g_scsi_dma.dma_bufA);
+    g_scsi_dma.dma_countB = refill_dmabuf(g_scsi_dma.dma_bufB);
+    
+    // Start DMA from buffer A
+    g_scsi_dma.dma_current_buf = SCSIBUF_A;
+    dma_channel_configure(SCSI_DMA_CH,
+        &g_scsi_dma.dma_write_config,
+        &SCSI_DMA_PIO->txf[SCSI_DMA_SM],
+        g_scsi_dma.dma_bufA,
+        g_scsi_dma.dma_countA,
+        true
+    );
+}
+/* DMA interrupt handler for SCSI writes (installed on DMA_IRQ_0 by
+ * scsi_accel_rp2040_startWrite). It runs when the transfer of one staging
+ * buffer ends: the other buffer is started if it holds data, and the buffer
+ * that just finished is refilled. When neither buffer has data and a refill
+ * returns nothing, the state changes to SCSIDMA_WRITE_DONE. */
+static void scsi_dma_write_irq()
+{
+    dma_hw->ints0 = 1 << SCSI_DMA_CH; // Acknowledge the interrupt of our channel
+
+    if (g_scsi_dma.dma_current_buf == SCSIBUF_A)
+    {
+        // Transfer from buffer A finished
+        g_scsi_dma.dma_countA = 0;
+        g_scsi_dma.dma_current_buf = SCSIBUF_NONE;
+
+        if (g_scsi_dma.dma_countB != 0)
+        {
+            // Start transferring buffer B immediately
+            dma_channel_set_trans_count(SCSI_DMA_CH, g_scsi_dma.dma_countB, false);
+            dma_channel_set_read_addr(SCSI_DMA_CH, g_scsi_dma.dma_bufB, true);
+            g_scsi_dma.dma_current_buf = SCSIBUF_B;
+
+            // Refill buffer A for next time
+            g_scsi_dma.dma_countA = refill_dmabuf(g_scsi_dma.dma_bufA);
+        }
+    }
+    else
+    {
+        // Transfer from buffer B finished
+        g_scsi_dma.dma_countB = 0;
+        g_scsi_dma.dma_current_buf = SCSIBUF_NONE;
+
+        if (g_scsi_dma.dma_countA != 0)
+        {
+            // Start transferring buffer A immediately
+            dma_channel_set_trans_count(SCSI_DMA_CH, g_scsi_dma.dma_countA, false);
+            dma_channel_set_read_addr(SCSI_DMA_CH, g_scsi_dma.dma_bufA, true);
+            g_scsi_dma.dma_current_buf = SCSIBUF_A;
+
+            // Refill buffer B for next time
+            g_scsi_dma.dma_countB = refill_dmabuf(g_scsi_dma.dma_bufB);
+        }
+    }
+
+    if (g_scsi_dma.dma_current_buf == SCSIBUF_NONE)
+    {
+        // Both buffers are empty, check if we have more data
+        // (a request may have been queued or extended since the last refill)
+        g_scsi_dma.dma_countA = refill_dmabuf(g_scsi_dma.dma_bufA);
+
+        if (g_scsi_dma.dma_countA == 0)
+        {
+            // End of data for DMA, but PIO may still have bytes in its buffer
+            g_scsi_dma_state = SCSIDMA_WRITE_DONE;
+        }
+        else
+        {
+            // Start transfer from buffer A
+            dma_channel_set_trans_count(SCSI_DMA_CH, g_scsi_dma.dma_countA, false);
+            dma_channel_set_read_addr(SCSI_DMA_CH, g_scsi_dma.dma_bufA, true);
+            g_scsi_dma.dma_current_buf = SCSIBUF_A;
+
+            // Refill B for the next interrupt
+            g_scsi_dma.dma_countB = refill_dmabuf(g_scsi_dma.dma_bufB);
+        }
+    }
+}
+/* Start (or queue) an asynchronous write of `count` bytes from `data` to the
+ * SCSI bus. `count` must be even.
+ *
+ * While a write is already running, the request is merged into the current
+ * or the queued buffer when `data` directly follows it in memory, or stored
+ * as the queued buffer when that slot is free. Otherwise the function waits
+ * for the running write to finish and then starts a new transfer; it returns
+ * without starting one if *resetFlag becomes set during the wait. */
+void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile int *resetFlag)
+{
+    // Number of bytes should always be divisible by 2.
+    assert((count & 1) == 0);
+
+    // The queue fields are also modified by the DMA interrupt, so update them with interrupts off
+    __disable_irq();
+    if (g_scsi_dma_state == SCSIDMA_WRITE)
+    {
+        if (!g_scsi_dma.next_app_buf && data == g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
+        {
+            // Combine with currently running request
+            g_scsi_dma.app_bytes += count;
+            count = 0;
+        }
+        else if (data == g_scsi_dma.next_app_buf + g_scsi_dma.next_app_bytes)
+        {
+            // Combine with queued request
+            g_scsi_dma.next_app_bytes += count;
+            count = 0;
+        }
+        else if (!g_scsi_dma.next_app_buf)
+        {
+            // Add as queued request
+            g_scsi_dma.next_app_buf = (uint8_t*)data;
+            g_scsi_dma.next_app_bytes = count;
+            count = 0;
+        }
+    }
+    __enable_irq();
+
+    // Check if the request was combined
+    // (count is zeroed above whenever the request was merged or queued)
+    if (count == 0) return;
+
+    if (g_scsi_dma_state != SCSIDMA_IDLE && g_scsi_dma_state != SCSIDMA_WRITE_DONE)
+    {
+        // Wait for previous request to finish
+        scsi_accel_rp2040_finishWrite(resetFlag);
+        if (*resetFlag)
+        {
+            return;
+        }
+    }
+
+    // In SCSIDMA_WRITE_DONE the pins and PIO are still set up for writing, only idle needs a full setup
+    bool must_reconfig_gpio = (g_scsi_dma_state == SCSIDMA_IDLE);
+    g_scsi_dma_state = SCSIDMA_WRITE;
+    g_scsi_dma.app_buf = (uint8_t*)data;
+    g_scsi_dma.app_bytes = count;
+    g_scsi_dma.dma_bytes = 0;
+    g_scsi_dma.next_app_buf = 0;
+    g_scsi_dma.next_app_bytes = 0;
+    g_scsi_dma.dma_current_buf = SCSIBUF_NONE;
+    
+    if (must_reconfig_gpio)
+    {
+        SCSI_ENABLE_DATA_OUT();
+        pio_sm_init(SCSI_DMA_PIO, SCSI_DMA_SM, g_scsi_dma.pio_offset_async_write, &g_scsi_dma.pio_cfg_async_write);
+        scsidma_config_gpio();
+        pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DMA_SM, true);
+        
+        dma_channel_set_irq0_enabled(SCSI_DMA_CH, true);
+        irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_write_irq);
+        irq_set_enabled(DMA_IRQ_0, true);
+    }
+
+    start_dma_write();
+}
+/* Report whether the write accelerator is done with `data`.
+ * Returns true when no write is in progress. With data == NULL it returns
+ * false while any write is in progress. Otherwise it returns false only if
+ * `data` points into the not-yet-converted part of the current buffer or
+ * into the queued buffer. */
+bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data)
+{
+    // Check if everything has completed
+    if (g_scsi_dma_state == SCSIDMA_IDLE || g_scsi_dma_state == SCSIDMA_WRITE_DONE)
+    {
+        return true;
+    }
+
+    if (!data)
+        return false;
+    
+    // Check if this data item is still in queue.
+    // Interrupts are off so that the DMA handler cannot change the queue fields mid-check.
+    __disable_irq();
+    bool finished = true;
+    if (data >= g_scsi_dma.app_buf + g_scsi_dma.dma_bytes &&
+        data < g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
+    {
+        finished = false; // In current transfer
+    }
+    else if (data >= g_scsi_dma.next_app_buf &&
+             data < g_scsi_dma.next_app_buf + g_scsi_dma.next_app_bytes)
+    {
+        finished = false; // In queued transfer
+    }
+    __enable_irq();
+
+    return finished;
+}
+/* Shut down the write path: wait until the PIO TX FIFO is empty and ACK is
+ * released, then abort the DMA channel, return to SCSIDMA_IDLE, hand the
+ * pins back to SIO and disable the state machine. The wait ends early when
+ * *resetFlag is set; after 5000 ms it logs a timeout and sets *resetFlag. */
+void scsi_accel_rp2040_stopWrite(volatile int *resetFlag)
+{
+    // Wait for TX fifo to be empty and ACK to go high
+    uint32_t start = millis();
+    while ((!pio_sm_is_tx_fifo_empty(SCSI_DMA_PIO, SCSI_DMA_SM) || SCSI_IN(ACK)) && !*resetFlag)
+    {
+        if ((uint32_t)(millis() - start) > 5000)
+        {
+            azlog("scsi_accel_rp2040_stopWrite() timeout");
+            *resetFlag = 1;
+            break;
+        }
+    }
+
+    dma_channel_abort(SCSI_DMA_CH);
+    dma_channel_set_irq0_enabled(SCSI_DMA_CH, false);
+    g_scsi_dma_state = SCSIDMA_IDLE; // Must be set first: scsidma_config_gpio() acts on the state
+    SCSI_RELEASE_DATA_REQ();
+    scsidma_config_gpio();
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DMA_SM, false);
+}
+/* Block until the write accelerator is back in SCSIDMA_IDLE.
+ * Returns immediately when already idle. The loop ends early when
+ * *resetFlag is set; after 5000 ms it logs a timeout and sets *resetFlag. */
+void scsi_accel_rp2040_finishWrite(volatile int *resetFlag)
+{
+    uint32_t start = millis();
+    while (g_scsi_dma_state != SCSIDMA_IDLE && !*resetFlag)
+    {
+        if ((uint32_t)(millis() - start) > 5000)
+        {
+            azlog("scsi_accel_rp2040_finishWrite() timeout");
+            *resetFlag = 1;
+            break;
+        }
+
+        if (g_scsi_dma_state == SCSIDMA_WRITE_DONE)
+        {
+            // DMA done, wait for PIO to finish also and reconfig GPIO.
+            scsi_accel_rp2040_stopWrite(resetFlag);
+        }
+    }
+}
+
+// Read `count` bytes from the SCSI bus into `buf` using the PIO read program.
+//
+// buf:         destination, must have room for `count` bytes.
+// count:       number of bytes to read, must be even. Zero is a no-op.
+// parityError: set to 1 when the block parity check fails; never cleared here.
+// resetFlag:   polled while waiting for data; the read stops early when set.
+void scsi_accel_rp2040_read(uint8_t *buf, uint32_t count, int *parityError, volatile int *resetFlag)
+{
+    // The hardware would support DMA for reading from SCSI bus also, but currently
+    // the rest of the software architecture does not. There is not much benefit
+    // because there isn't much else to do before we get the data from the SCSI bus.
+    //
+    // Currently this method just reads from the PIO RX fifo directly in software loop.
+
+    // Number of bytes to read must be divisible by 2.
+    assert((count & 1) == 0);
+
+    // Nothing to transfer. The PIO program takes (count - 1) as its loop
+    // counter, so a zero count would wrap to 0xFFFFFFFF and let the state
+    // machine assert REQ although no byte was requested.
+    if (count == 0)
+    {
+        return;
+    }
+
+    g_scsi_dma_state = SCSIDMA_READ;
+    pio_sm_init(SCSI_DMA_PIO, SCSI_DMA_SM, g_scsi_dma.pio_offset_async_read, &g_scsi_dma.pio_cfg_async_read);
+    scsidma_config_gpio();
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DMA_SM, true);
+
+    // Tell the PIO program how many bytes to receive (it expects count - 1)
+    pio_sm_put(SCSI_DMA_PIO, SCSI_DMA_SM, count - 1);
+
+    // Read results from PIO RX FIFO
+    uint8_t *dst = buf;
+    uint8_t *end = buf + count;
+    uint32_t paritycheck = 0;
+    while (dst < end)
+    {
+        if (*resetFlag)
+        {
+            break;
+        }
+
+        uint32_t available = pio_sm_get_rx_fifo_level(SCSI_DMA_PIO, SCSI_DMA_SM);
+
+        while (available > 0)
+        {
+            available--;
+            uint32_t word = pio_sm_get(SCSI_DMA_PIO, SCSI_DMA_SM);
+            paritycheck ^= word;
+            // Each FIFO word holds two bytes as sampled from the pins; invert to get data values
+            word = ~word;
+            *dst++ = word & 0xFF;
+            *dst++ = word >> 16;
+        }
+    }
+
+    // Check parity errors in whole block
+    // This doesn't detect if there is even number of parity errors in block.
+    uint8_t byte0 = ~(paritycheck & 0xFF);
+    uint8_t byte1 = ~(paritycheck >> 16);
+    if (paritycheck != ((g_scsi_parity_lookup[byte1] << 16) | g_scsi_parity_lookup[byte0]))
+    {
+        azlog("Parity error in scsi_accel_rp2040_read(): ", paritycheck);
+        *parityError = 1;
+    }
+
+    g_scsi_dma_state = SCSIDMA_IDLE;
+    SCSI_RELEASE_DATA_REQ();
+    scsidma_config_gpio();
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DMA_SM, false);
+}
+
+// Initialize the SCSI accelerator: return the pins to SIO control, claim the
+// PIO state machine and DMA channel (first call only), load both PIO programs
+// and prepare the state machine and DMA configurations used by later
+// transfers. May be called repeatedly.
+void scsi_accel_rp2040_init()
+{
+    g_scsi_dma_state = SCSIDMA_IDLE;
+    scsidma_config_gpio();
+
+    // Mark channels as being in use, unless it has been done already
+    if (!g_channels_claimed)
+    {
+        pio_sm_claim(SCSI_DMA_PIO, SCSI_DMA_SM);
+        dma_channel_claim(SCSI_DMA_CH);
+        g_channels_claimed = true;
+    }
+
+    // Load PIO programs
+    // Instruction memory is cleared first so that repeated calls do not run out of program space
+    pio_clear_instruction_memory(SCSI_DMA_PIO);
+    
+    // Asynchronous SCSI write
+    g_scsi_dma.pio_offset_async_write = pio_add_program(SCSI_DMA_PIO, &scsi_accel_async_write_program);
+    g_scsi_dma.pio_cfg_async_write = scsi_accel_async_write_program_get_default_config(g_scsi_dma.pio_offset_async_write);
+    sm_config_set_out_pins(&g_scsi_dma.pio_cfg_async_write, SCSI_IO_DB0, 9);
+    sm_config_set_sideset_pins(&g_scsi_dma.pio_cfg_async_write, SCSI_OUT_REQ);
+    sm_config_set_fifo_join(&g_scsi_dma.pio_cfg_async_write, PIO_FIFO_JOIN_TX);
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_async_write, true, false, 32);
+
+    // Asynchronous SCSI read
+    g_scsi_dma.pio_offset_async_read = pio_add_program(SCSI_DMA_PIO, &scsi_accel_async_read_program);
+    g_scsi_dma.pio_cfg_async_read = scsi_accel_async_read_program_get_default_config(g_scsi_dma.pio_offset_async_read);
+    sm_config_set_in_pins(&g_scsi_dma.pio_cfg_async_read, SCSI_IO_DB0);
+    sm_config_set_sideset_pins(&g_scsi_dma.pio_cfg_async_read, SCSI_OUT_REQ);
+    // Shift settings belong to the read configuration here; the write
+    // configuration already received its own out-shift setup above.
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_async_read, true, false, 32);
+    sm_config_set_in_shift(&g_scsi_dma.pio_cfg_async_read, true, true, 32);
+
+    // Create DMA channel configuration so it can be applied quickly later
+    dma_channel_config cfg = dma_channel_get_default_config(SCSI_DMA_CH);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&cfg, true);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DMA_SM, true));
+    g_scsi_dma.dma_write_config = cfg;
+}

+ 17 - 0
lib/ZuluSCSI_platform_RP2040/scsi_accel_rp2040.h

@@ -0,0 +1,17 @@
+// Accelerated SCSI subroutines using RP2040 hardware PIO peripheral.
+
+#pragma once
+
+#include <stdint.h>
+
+void scsi_accel_rp2040_init();
+
+void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile int *resetFlag);
+void scsi_accel_rp2040_stopWrite(volatile int *resetFlag);
+void scsi_accel_rp2040_finishWrite(volatile int *resetFlag);
+
+// Query whether the data at pointer has already been read, i.e. buffer can be reused.
+// If data is NULL, checks if all writes have completed.
+bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data);
+
+void scsi_accel_rp2040_read(uint8_t *buf, uint32_t count, int *parityError, volatile int *resetFlag);

+ 24 - 0
lib/ZuluSCSI_platform_RP2040/sd_card_spi.cpp

@@ -44,20 +44,43 @@ public:
     uint8_t receive(uint8_t* buf, size_t count)
     {
         spi_read_blocking(SD_SPI, 0xFF, buf, count);
+
+        if (m_stream_callback && buf == m_stream_buffer + m_stream_count)
+        {
+            m_stream_count += count;
+            m_stream_callback(m_stream_count);
+        }
+
         return 0;
     }
 
     // Multiple byte send
     void send(const uint8_t* buf, size_t count) {
         spi_write_blocking(SD_SPI, buf, count);
+
+        if (m_stream_callback && buf == m_stream_buffer + m_stream_count)
+        {
+            m_stream_count += count;
+            m_stream_callback(m_stream_count);
+        }
     }
 
     void setSckSpeed(uint32_t maxSck) {
         m_sckfreq = maxSck;
     }
 
+    void set_sd_callback(sd_callback_t func, const uint8_t *buffer)
+    {
+        m_stream_buffer = buffer;
+        m_stream_count = 0;
+        m_stream_callback = func;
+    }
+
 private:
     uint32_t m_sckfreq;
+    const uint8_t *m_stream_buffer;
+    uint32_t m_stream_count;
+    sd_callback_t m_stream_callback;
 };
 
 void sdCsInit(SdCsPin_t pin)
@@ -77,6 +100,7 @@ SdSpiConfig g_sd_spi_config(0, DEDICATED_SPI, SD_SCK_MHZ(25), &g_sd_spi_port);
 
 void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer)
 {
+    g_sd_spi_port.set_sd_callback(func, buffer);
 }
 
 #endif

+ 87 - 0
utils/random_tester.py

@@ -0,0 +1,87 @@
+#!/usr/bin/python3
+
+'''This script executes random-sized reads and writes to one or more block devices to test them.
+It will destroy the contents of the block device.'''
+
+import sys
+import os
+import mmap
+import random
+import time
+
+class BlockDevice:
+    def __init__(self, path, sectorsize = 512):
+        self.path = path
+        self.dev = os.fdopen(os.open(path, os.O_RDWR | os.O_DIRECT | os.O_SYNC), "rb+", 0)
+        self.sectorsize = sectorsize
+
+    def write_block(self, first_sector, sector_count, seed):
+        rnd = random.Random(seed)
+        buffer = mmap.mmap(-1, sector_count * self.sectorsize)
+        buffer.write(rnd.randbytes(sector_count * self.sectorsize))
+        
+        start = time.time()
+        self.dev.seek(first_sector * self.sectorsize)
+        self.dev.write(buffer)
+        elapsed = time.time() - start
+        speed = sector_count * self.sectorsize / elapsed / 1e6
+
+        print("Wrote  %16s, %8d, %8d, %8d, %8.3f MB/s" % (self.path, first_sector, sector_count, seed, speed))
+
+    def verify_block(self, first_sector, sector_count, seed):
+        rnd = random.Random(seed)
+        buffer = mmap.mmap(-1, sector_count * self.sectorsize)
+
+        start = time.time()
+        self.dev.seek(first_sector * self.sectorsize)
+        self.dev.readinto(buffer)
+        elapsed = time.time() - start
+        speed = sector_count * self.sectorsize / elapsed / 1e6
+
+        print("Verify %16s, %8d, %8d, %8d, %8.3f MB/s" % (self.path, first_sector, sector_count, seed, speed))
+
+        buffer.seek(0)
+        actual = buffer.read(sector_count * self.sectorsize)
+        expected = rnd.randbytes(sector_count * self.sectorsize)
+        if expected != actual:
+            print("Compare error, device = %s, sectorsize = %d, first_sector = %d, sector_count = %d, seed = %d"
+                % (self.path, self.sectorsize, first_sector, sector_count, seed))
+            fname = "%d" % time.time()
+            open(fname + ".expected", "wb").write(expected)
+            open(fname + ".actual", "wb").write(actual)
+            print("Saved data to %s.expected/actual" % fname)
+            raise Exception("Compare error")
+
+if __name__ == "__main__":
+    blockdevs = []
+    for path in sys.argv[1:]:
+        sectorsize = 512
+        if ':' in path:
+            path, sectorsize = path.split(':')
+            sectorsize = int(sectorsize)
+        blockdevs.append(BlockDevice(path, sectorsize=sectorsize))
+    
+    maxsectors = 100000
+    rnd = random.Random()
+    while True:
+        blocks = []
+        start = 0
+        while start + 256 < maxsectors:
+            start = min(maxsectors, start + rnd.randint(0, 10000))
+            dev = rnd.choice(blockdevs)
+            count = rnd.randint(1, 256)
+            seed = rnd.randint(1, 10000000)
+            blocks.append((dev, start, count, seed))
+            start += count
+        
+        print("Write / verify set size: %d" % len(blocks))
+
+        random.shuffle(blocks)
+        for dev, start, count, seed in blocks:
+            dev.write_block(start, count, seed)
+        
+        random.shuffle(blocks)
+        for dev, start, count, seed in blocks:
+            dev.verify_block(start, count, seed)
+
+

+ 9 - 0
utils/run_gdb_rp2040.sh

@@ -0,0 +1,9 @@
+#!/bin/bash
+
+
+arm-none-eabi-gdb \
+       -iex 'target extended /dev/ttyACM0' \
+       -iex 'mon s' -iex 'att 1' \
+       -iex 'set mem inaccessible-by-default off' \
+       -iex 'source utils/rp2040_gdb_macros' \
+       .pio/build/ZuluSCSI_RP2040_v2_0/firmware.elf