Browse Source

ZuluSCSI firmware working with BlueSCSIv2 hw

Morio 2 years ago
parent
commit
1730c8e8cc

+ 654 - 0
lib/ZuluSCSI_platform_BS2/ZuluSCSI_platform.cpp

@@ -0,0 +1,654 @@
+#include "ZuluSCSI_platform.h"
+#include "ZuluSCSI_log.h"
+#include "ZuluSCSI_config.h"
+#include <SdFat.h>
+#include <scsi.h>
+#include <assert.h>
+#include <hardware/gpio.h>
+#include <hardware/uart.h>
+#include <hardware/spi.h>
+#include <hardware/structs/xip_ctrl.h>
+#include <platform/mbed_error.h>
+#include <multicore.h>
+
+extern "C" {
+
+// As of 2022-09-13, the platformio RP2040 core is missing cplusplus guard on flash.h
+// For that reason this has to be inside the extern "C" here.
+#include <hardware/flash.h>
+#include "rp2040_flash_do_cmd.h"
+
+const char *g_azplatform_name = PLATFORM_NAME;
+static bool g_scsi_initiator = false;
+static uint32_t g_flash_chip_size = 0;
+static bool g_uart_initialized = false;
+
+void mbed_error_hook(const mbed_error_ctx * error_context);
+
+/***************/
+/* GPIO init   */
+/***************/
+
+// Configure every aspect of one GPIO pin with a single call.
+//   gpio           pin number
+//   fn             peripheral function to route to the pin
+//   pullup/pulldown  internal pull resistors to enable
+//   output         true to drive the pin, false to leave it as an input
+//   initial_state  output level written before the direction is changed
+//   fast_slew      true to enable the fast slew rate on the pad
+static void gpio_conf(uint gpio, enum gpio_function fn, bool pullup, bool pulldown, bool output, bool initial_state, bool fast_slew)
+{
+    // The output level is written first, before direction and function are applied.
+    gpio_put(gpio, initial_state);
+    gpio_set_dir(gpio, output);
+    gpio_set_pulls(gpio, pullup, pulldown);
+    gpio_set_function(gpio, fn);
+
+    if (!fast_slew)
+    {
+        // Note: an already-set fast slew bit is left untouched in this case.
+        return;
+    }
+
+    padsbank0_hw->io[gpio] |= PADS_BANK0_GPIO0_SLEWFAST_BITS;
+}
+
+// Early platform initialization: stops core 1, puts the bus-buffer control pins
+// into a safe state, starts UART logging, reads the flash chip size and
+// configures the SD card and LED pins.
+void azplatform_init()
+{
+    // Make sure second core is stopped
+    multicore_reset_core1();
+
+    /* First configure the pins that affect external buffer directions.
+     * RP2040 defaults to pulldowns, while these pins have external pull-ups.
+     */
+    //        pin             function       pup   pdown  out    state fast
+    gpio_conf(SCSI_DATA_DIR,  GPIO_FUNC_SIO, false,false, true,  false, true);
+    gpio_conf(SCSI_OUT_BSY,   GPIO_FUNC_SIO, false,false, true,  true, false);
+    gpio_conf(SCSI_OUT_SEL,   GPIO_FUNC_SIO, false,false, true,  true, false);
+    gpio_conf(SCSI_IN_ACK,    GPIO_FUNC_SIO, false, false, false, false, false);
+    gpio_conf(SCSI_IN_ATN,    GPIO_FUNC_SIO, false, false, false, false, false);
+
+    delay(10); // 10 ms delay to let pull-ups do their work
+
+    /* Initialize logging to SWO pin (UART0) */
+    gpio_conf(SWO_PIN,        GPIO_FUNC_UART,false,false, true,  false, true);
+    uart_init(uart0, 1000000);
+    g_uart_initialized = true;
+    mbed_set_error_hook(mbed_error_hook);
+
+    azlog("Platform: ", g_azplatform_name);
+    azlog("FW Version: ", g_azlog_firmwareversion);
+
+    g_azlog_debug = false;
+
+    azlog("SCSI termination is handled by a hardware jumper");
+
+    // Get flash chip size.
+    // The last byte of the JEDEC ID response is used as log2 of the size in bytes.
+    uint8_t cmd_read_jedec_id[4] = {0x9f, 0, 0, 0};
+    uint8_t response_jedec[4] = {0};
+    flash_do_cmd(cmd_read_jedec_id, response_jedec, 4);
+
+    // A shift count of 32 or more is undefined behavior, so an implausible
+    // value (e.g. 0xFF from a failed read) is recorded as "size unknown" (0).
+    // azplatform_get_romdrive_maxsize() falls back to a default in that case.
+    const uint8_t size_log2 = response_jedec[3];
+    g_flash_chip_size = (size_log2 < 32) ? ((uint32_t)1 << size_log2) : 0;
+    azlog("Flash chip size: ", (int)(g_flash_chip_size / 1024), " kB");
+
+    // SD card pins
+    // Card is used in SDIO mode for main program, and in SPI mode for crash handler & bootloader.
+    //        pin             function       pup   pdown  out    state fast
+    gpio_conf(SD_SPI_SCK,     GPIO_FUNC_SPI, true, false, true,  true, true);
+    gpio_conf(SD_SPI_MOSI,    GPIO_FUNC_SPI, true, false, true,  true, true);
+    gpio_conf(SD_SPI_MISO,    GPIO_FUNC_SPI, true, false, false, true, true);
+    gpio_conf(SD_SPI_CS,      GPIO_FUNC_SIO, true, false, true,  true, true);
+    gpio_conf(SDIO_D1,        GPIO_FUNC_SIO, true, false, false, true, true);
+    gpio_conf(SDIO_D2,        GPIO_FUNC_SIO, true, false, false, true, true);
+
+    // LED pin
+    gpio_conf(LED_PIN,        GPIO_FUNC_SIO, false,false, true,  false, false);
+
+}
+
+// Reads the initiator DIP switch and returns true when initiator mode is selected
+// (the switch pin reads low).
+static bool read_initiator_dip_switch()
+{
+    /* Revision 2022d hardware has problems reading initiator DIP switch setting.
+     * The 74LVT245 hold current is keeping the GPIO_ACK state too strongly.
+     * The pin is therefore driven to each level in turn and read back through
+     * the opposite weak pull; if both reads agree, the switch overrides the hold.
+     */
+
+    // Drive high, then release the pin with only a pulldown enabled
+    //        pin             function       pup   pdown   out    state  fast
+    gpio_conf(DIP_INITIATOR,  GPIO_FUNC_SIO, false, false, true,  true,  false);
+    gpio_conf(DIP_INITIATOR,  GPIO_FUNC_SIO, false, true,  false, true,  false);
+    delay(1);
+    const bool level_after_pulldown = gpio_get(DIP_INITIATOR);
+
+    // Drive low, then release the pin with only a pullup enabled
+    //        pin             function       pup   pdown   out    state  fast
+    gpio_conf(DIP_INITIATOR,  GPIO_FUNC_SIO, false, false, true,  false, false);
+    gpio_conf(DIP_INITIATOR,  GPIO_FUNC_SIO, true,  false, false, false, false);
+    delay(1);
+    const bool level_after_pullup = gpio_get(DIP_INITIATOR);
+
+    if (level_after_pulldown != level_after_pullup)
+    {
+        // The pin simply kept whatever level it was last driven to.
+        // Enable OUT_BSY for a short time.
+        // If in target mode, this will force GPIO_ACK high.
+        gpio_put(SCSI_OUT_BSY, 0);
+        delay_100ns();
+        gpio_put(SCSI_OUT_BSY, 1);
+
+        return !gpio_get(DIP_INITIATOR);
+    }
+
+    // Both reads agree, so the state could be read directly
+    return !level_after_pulldown;
+}
+
+// Main-application initialization: selects initiator or target mode from the DIP
+// switch and configures the SCSI pins accordingly.
+// Not run by the bootloader, which does not need SCSI.
+void azplatform_late_init()
+{
+    g_scsi_initiator = read_initiator_dip_switch();
+    if (g_scsi_initiator)
+    {
+        azlog("SCSI initiator mode selected by DIP switch, expecting SCSI disks on the bus");
+    }
+    else
+    {
+        azlog("SCSI target/disk mode selected by DIP switch, acting as a SCSI disk");
+    }
+
+    /* Initialize SCSI pins to required modes.
+     * SCSI pins should be inactive / input at this point.
+     */
+
+    // SCSI data bus direction is switched by DATA_DIR signal.
+    // Pullups make sure that no glitches occur when switching direction.
+    static const uint data_bus_pins[] = {
+        SCSI_IO_DB0, SCSI_IO_DB1, SCSI_IO_DB2, SCSI_IO_DB3, SCSI_IO_DB4,
+        SCSI_IO_DB5, SCSI_IO_DB6, SCSI_IO_DB7, SCSI_IO_DBP
+    };
+    for (unsigned idx = 0; idx < sizeof(data_bus_pins) / sizeof(data_bus_pins[0]); idx++)
+    {
+        //        pin                 function       pup   pdown  out    state fast
+        gpio_conf(data_bus_pins[idx], GPIO_FUNC_SIO, true, false, false, true, true);
+    }
+
+    if (g_scsi_initiator)
+    {
+        // Act as SCSI initiator: bus control signals from the target are inputs,
+        // SEL / ACK / ATN are driven by us.
+
+        //        pin             function       pup   pdown  out    state fast
+        gpio_conf(SCSI_IN_IO,     GPIO_FUNC_SIO, true ,false, false, true, false);
+        gpio_conf(SCSI_IN_MSG,    GPIO_FUNC_SIO, true ,false, false, true, false);
+        gpio_conf(SCSI_IN_CD,     GPIO_FUNC_SIO, true ,false, false, true, false);
+        gpio_conf(SCSI_IN_REQ,    GPIO_FUNC_SIO, true ,false, false, true, false);
+        gpio_conf(SCSI_IN_BSY,    GPIO_FUNC_SIO, true, false, false, true, false);
+        gpio_conf(SCSI_IN_RST,    GPIO_FUNC_SIO, true, false, false, true, false);
+        gpio_conf(SCSI_OUT_SEL,   GPIO_FUNC_SIO, false,false, true,  true, true);
+        gpio_conf(SCSI_OUT_ACK,   GPIO_FUNC_SIO, false,false, true,  true, true);
+        gpio_conf(SCSI_OUT_ATN,   GPIO_FUNC_SIO, false,false, true,  true, true);
+        return;
+    }
+
+    // Act as SCSI device / target
+
+    // SCSI control outputs
+    //        pin             function       pup   pdown  out    state fast
+    gpio_conf(SCSI_OUT_IO,    GPIO_FUNC_SIO, false,false, true,  true, true);
+    gpio_conf(SCSI_OUT_MSG,   GPIO_FUNC_SIO, false,false, true,  true, true);
+
+    // REQ pin is switched between PIO and SIO, pull-up makes sure no glitches
+    gpio_conf(SCSI_OUT_REQ,   GPIO_FUNC_SIO, true ,false, true,  true, true);
+
+    // Shared pins are changed to input / output depending on communication phase
+    gpio_conf(SCSI_IN_SEL,    GPIO_FUNC_SIO, true, false, false, true, true);
+    if (SCSI_OUT_CD != SCSI_IN_SEL)
+    {
+        gpio_conf(SCSI_OUT_CD,    GPIO_FUNC_SIO, false,false, true,  true, true);
+    }
+
+    gpio_conf(SCSI_IN_BSY,    GPIO_FUNC_SIO, true, false, false, true, true);
+    if (SCSI_OUT_MSG != SCSI_IN_BSY)
+    {
+        gpio_conf(SCSI_OUT_MSG,    GPIO_FUNC_SIO, false,false, true,  true, true);
+    }
+
+    // SCSI control inputs
+    //        pin             function       pup   pdown  out    state fast
+    gpio_conf(SCSI_IN_ACK,    GPIO_FUNC_SIO, false, false, false, true, false);
+    gpio_conf(SCSI_IN_ATN,    GPIO_FUNC_SIO, false, false, false, true, false);
+    gpio_conf(SCSI_IN_RST,    GPIO_FUNC_SIO, true, false, false, true, false);
+}
+
+// Returns the mode flag stored by azplatform_late_init():
+// true for SCSI initiator mode, false for target mode.
+bool azplatform_is_initiator_mode_enabled(void)
+{
+    return g_scsi_initiator;
+}
+
+// Stops driving the status LED by reconfiguring its pin as an input
+// without pulls, then logs the change.
+void azplatform_disable_led(void)
+{
+    gpio_conf(LED_PIN, GPIO_FUNC_SIO,
+              /* pullup */ false, /* pulldown */ false,
+              /* output */ false, /* state */ false, /* fast */ false);
+    azlog("Disabling status LED");
+}
+
+/*****************************************/
+/* Crash handlers                        */
+/*****************************************/
+
+extern SdFs SD;
+extern uint32_t __StackTop;
+
+// Best-effort dump of the in-memory log buffer to CRASHFILE on the SD card.
+// Used by the crash and watchdog handlers. The card is (re)initialized with
+// SD_CONFIG_CRASH; if the file cannot be opened even after retries, the
+// function returns without writing anything.
+void azplatform_emergency_log_save()
+{
+    // Remove any registered SD callback before touching the card from here.
+    azplatform_set_sd_callback(NULL, NULL);
+
+    SD.begin(SD_CONFIG_CRASH);
+    FsFile crashfile = SD.open(CRASHFILE, O_WRONLY | O_CREAT | O_TRUNC);
+
+    if (!crashfile.isOpen())
+    {
+        // Try to reinitialize the card a limited number of times
+        for (int attempt = 0; attempt < 10; attempt++)
+        {
+            if (SD.begin(SD_CONFIG_CRASH))
+            {
+                break;
+            }
+        }
+
+        crashfile = SD.open(CRASHFILE, O_WRONLY | O_CREAT | O_TRUNC);
+    }
+
+    if (!crashfile.isOpen())
+    {
+        // Card is unusable; there is nothing to write to.
+        return;
+    }
+
+    // The log buffer is fetched in two pieces; startpos is advanced by each call.
+    uint32_t startpos = 0;
+    crashfile.write(azlog_get_buffer(&startpos));
+    crashfile.write(azlog_get_buffer(&startpos));
+    crashfile.flush();
+    crashfile.close();
+}
+
+void mbed_error_hook(const mbed_error_ctx * error_context)
+{
+    azlog("--------------");
+    azlog("CRASH!");
+    azlog("Platform: ", g_azplatform_name);
+    azlog("FW Version: ", g_azlog_firmwareversion);
+    azlog("error_status: ", (uint32_t)error_context->error_status);
+    azlog("error_address: ", error_context->error_address);
+    azlog("error_value: ", error_context->error_value);
+
+    uint32_t *p = (uint32_t*)((uint32_t)error_context->thread_current_sp & ~3);
+    for (int i = 0; i < 8; i++)
+    {
+        if (p == &__StackTop) break; // End of stack
+
+        azlog("STACK ", (uint32_t)p, ":    ", p[0], " ", p[1], " ", p[2], " ", p[3]);
+        p += 4;
+    }
+
+    azplatform_emergency_log_save();
+
+    while (1)
+    {
+        // Flash the crash address on the LED
+        // Short pulse means 0, long pulse means 1
+        int base_delay = 1000;
+        for (int i = 31; i >= 0; i--)
+        {
+            LED_OFF();
+            for (int j = 0; j < base_delay; j++) delay_ns(100000);
+
+            int delay = (error_context->error_address & (1 << i)) ? (3 * base_delay) : base_delay;
+            LED_ON();
+            for (int j = 0; j < delay; j++) delay_ns(100000);
+            LED_OFF();
+        }
+
+        for (int j = 0; j < base_delay * 10; j++) delay_ns(100000);
+    }
+}
+
+/*****************************************/
+/* Debug logging and watchdog            */
+/*****************************************/
+
+// Sink for every log message: forwards the text to UART0.
+// Messages produced before the UART has been initialized are not sent.
+void azplatform_log(const char *s)
+{
+    if (!g_uart_initialized)
+    {
+        return;
+    }
+
+    uart_puts(uart0, s);
+}
+
+static int g_watchdog_timeout;
+static bool g_watchdog_initialized;
+
+// Logs up to 8 rows of 4 words starting at the process stack pointer.
+static void watchdog_log_stack(void)
+{
+    uint32_t *p = (uint32_t*)__get_PSP();
+    for (int row = 0; row < 8; row++)
+    {
+        // Stop before a row would read past the end of the stack.
+        if (p + 4 > &__StackTop) break;
+
+        azlog("STACK ", (uint32_t)p, ":    ", p[0], " ", p[1], " ", p[2], " ", p[3]);
+        p += 4;
+    }
+}
+
+// Software watchdog tick, run from hardware alarm 3 once per second.
+// Counts g_watchdog_timeout down by 1000 per tick. Once the countdown has
+// dropped by WATCHDOG_BUS_RESET_TIMEOUT a SCSI bus reset is requested; when it
+// reaches zero the log is saved and the firmware is restarted.
+static void watchdog_callback(unsigned alarm_num)
+{
+    g_watchdog_timeout -= 1000;
+
+    if (g_watchdog_timeout <= WATCHDOG_CRASH_TIMEOUT - WATCHDOG_BUS_RESET_TIMEOUT)
+    {
+        // First stage: request a bus reset unless both flags are already set
+        if (!scsiDev.resetFlag || !g_scsiHostPhyReset)
+        {
+            azlog("--------------");
+            azlog("WATCHDOG TIMEOUT, attempting bus reset");
+            azlog("GPIO states: out ", sio_hw->gpio_out, " oe ", sio_hw->gpio_oe, " in ", sio_hw->gpio_in);
+
+            watchdog_log_stack();
+
+            scsiDev.resetFlag = 1;
+            g_scsiHostPhyReset = true;
+        }
+
+        // Second stage: countdown expired, save the log and restart
+        if (g_watchdog_timeout <= 0)
+        {
+            azlog("--------------");
+            azlog("WATCHDOG TIMEOUT!");
+            azlog("Platform: ", g_azplatform_name);
+            azlog("FW Version: ", g_azlog_firmwareversion);
+            azlog("GPIO states: out ", sio_hw->gpio_out, " oe ", sio_hw->gpio_oe, " in ", sio_hw->gpio_in);
+
+            watchdog_log_stack();
+
+            azplatform_emergency_log_save();
+
+            azplatform_boot_to_main_firmware();
+        }
+    }
+
+    // Re-arm the alarm for the next one-second tick
+    hardware_alarm_set_target(3, delayed_by_ms(get_absolute_time(), 1000));
+}
+
+// Restarts the software watchdog countdown; call periodically from the main loop.
+// The first call also claims hardware alarm 3 and starts the one-second tick
+// that runs watchdog_callback().
+void azplatform_reset_watchdog()
+{
+    g_watchdog_timeout = WATCHDOG_CRASH_TIMEOUT;
+
+    if (g_watchdog_initialized)
+    {
+        return;
+    }
+
+    hardware_alarm_claim(3);
+    hardware_alarm_set_callback(3, &watchdog_callback);
+    hardware_alarm_set_target(3, delayed_by_ms(get_absolute_time(), 1000));
+    g_watchdog_initialized = true;
+}
+
+/*****************************************/
+/* Flash reprogramming from bootloader   */
+/*****************************************/
+
+#ifdef AZPLATFORM_BOOTLOADER_SIZE
+
+extern uint32_t __real_vectors_start;
+extern uint32_t __StackTop;
+static volatile void *g_bootloader_exit_req;
+
+// Erases and reprograms one flash page, then verifies it by reading it back.
+//   offset  flash offset of the page; must be page-aligned and must not lie
+//           inside the bootloader area
+//   buffer  AZPLATFORM_FLASH_PAGE_SIZE bytes of new contents
+// Returns true on success. Returns false when the first page does not look
+// like a firmware image or when the read-back verification fails.
+// Interrupts are re-enabled on every return path.
+bool azplatform_rewrite_flash_page(uint32_t offset, uint8_t buffer[AZPLATFORM_FLASH_PAGE_SIZE])
+{
+    if (offset == AZPLATFORM_BOOTLOADER_SIZE)
+    {
+        // Sanity check on the first page: top bytes of its first two words
+        if (buffer[3] != 0x20 || buffer[7] != 0x10)
+        {
+            azlog("Invalid firmware file, starts with: ", bytearray(buffer, 16));
+            return false;
+        }
+    }
+
+    azdbg("Writing flash at offset ", offset, " data ", bytearray(buffer, 4));
+    assert(offset % AZPLATFORM_FLASH_PAGE_SIZE == 0);
+    assert(offset >= AZPLATFORM_BOOTLOADER_SIZE);
+
+    // Avoid any mbed timer interrupts triggering during the flashing.
+    __disable_irq();
+
+    // For some reason any code executed after flashing crashes
+    // unless we disable the XIP cache.
+    // Not sure why this happens, as flash_range_program() is flushing
+    // the cache correctly.
+    // The cache is now enabled from bootloader start until it starts
+    // flashing, and again after reset to main firmware.
+    xip_ctrl_hw->ctrl = 0;
+
+    flash_range_erase(offset, AZPLATFORM_FLASH_PAGE_SIZE);
+    flash_range_program(offset, buffer, AZPLATFORM_FLASH_PAGE_SIZE);
+
+    // Verify through the uncached XIP alias so the comparison reads the flash itself
+    const uint32_t *buf32 = (const uint32_t*)buffer;
+    const uint32_t num_words = AZPLATFORM_FLASH_PAGE_SIZE / 4;
+    for (uint32_t i = 0; i < num_words; i++)
+    {
+        uint32_t expected = buf32[i];
+        uint32_t actual = *(volatile uint32_t*)(XIP_NOCACHE_BASE + offset + i * 4);
+
+        if (actual != expected)
+        {
+            // Restore interrupts before reporting; previously this path
+            // returned with interrupts still disabled.
+            __enable_irq();
+            azlog("Flash verify failed at offset ", offset + i * 4, " got ", actual, " expected ", expected);
+            return false;
+        }
+    }
+
+    __enable_irq();
+
+    return true;
+}
+
+// Restarts the chip and asks the reset handler to start the main application.
+// Never returns.
+void azplatform_boot_to_main_firmware()
+{
+    // The request marker is the variable's own address; btldr_reset_handler()
+    // checks for this value after the reset.
+    g_bootloader_exit_req = &g_bootloader_exit_req;
+
+    // To ensure that the system state is reset properly, perform a
+    // SYSRESETREQ and jump straight from the reset vector to the main application.
+    SCB->AIRCR = 0x05FA0004;
+
+    // Wait here until the reset takes effect
+    for (;;)
+    {
+    }
+}
+
+// Reset entry point placed in btldr_vectors.
+// Chooses which vector table to start from: the main application's when
+// azplatform_boot_to_main_firmware() left its request marker, otherwise the
+// table at __real_vectors_start. It then loads that table's initial stack
+// pointer and branches to its reset handler.
+void btldr_reset_handler()
+{
+    const bool boot_main_app = (g_bootloader_exit_req == &g_bootloader_exit_req);
+
+    uint32_t *vector_table = boot_main_app
+        ? (uint32_t*)(XIP_BASE + AZPLATFORM_BOOTLOADER_SIZE)
+        : &__real_vectors_start;
+
+    SCB->VTOR = (uint32_t)vector_table;
+
+    // Word 0 of the table is the initial stack pointer, word 1 the entry point
+    __asm__(
+        "msr msp, %0\n\t"
+        "bx %1" : : "r" (vector_table[0]),
+                    "r" (vector_table[1]) : "memory");
+}
+
+// Minimal two-entry vector table for bootloader builds: initial stack pointer and btldr_reset_handler as the reset handler.
+// The rp2040_btldr.ld linker script places the real vector table at an offset (see __real_vectors_start).
+__attribute__((section(".btldr_vectors")))
+const void * btldr_vectors[2] = {&__StackTop, (void*)&btldr_reset_handler};
+
+#endif
+
+/************************************/
+/* ROM drive in extra flash space   */
+/************************************/
+
+#ifdef PLATFORM_HAS_ROM_DRIVE
+
+// Reserve up to 352 kB for firmware.
+#define ROMDRIVE_OFFSET (352 * 1024)
+
+// Returns the number of flash bytes available to the ROM drive, i.e. everything
+// above ROMDRIVE_OFFSET. If the detected chip size is smaller than the offset
+// (size detection failed), a 2 MB chip is assumed.
+uint32_t azplatform_get_romdrive_maxsize()
+{
+    uint32_t chip_size = g_flash_chip_size;
+
+    if (chip_size < ROMDRIVE_OFFSET)
+    {
+        // Failed to read flash chip size, default to 2 MB
+        chip_size = 2048 * 1024;
+    }
+
+    return chip_size - ROMDRIVE_OFFSET;
+}
+
+// Reads ROM drive contents from flash using the XIP streaming FIFO.
+//   dest   destination buffer, must be 4-byte aligned
+//   start  byte offset inside the ROM drive area
+//   count  number of bytes to read, must be a multiple of 4
+// Always returns true; invalid arguments are caught by assertions.
+bool azplatform_read_romdrive(uint8_t *dest, uint32_t start, uint32_t count)
+{
+    // Validate the request before touching the hardware.
+    // Transfer happens in multiples of 4 bytes
+    assert(start < azplatform_get_romdrive_maxsize());
+    assert((count & 3) == 0);
+    assert((((uint32_t)dest) & 3) == 0);
+
+    // Stop any stream in progress and discard words left in the FIFO
+    xip_ctrl_hw->stream_ctr = 0;
+
+    while (!(xip_ctrl_hw->stat & XIP_STAT_FIFO_EMPTY))
+    {
+        (void) xip_ctrl_hw->stream_fifo;
+    }
+
+    // Start streaming; the counter is in 32-bit words
+    uint32_t words_remain = count / 4;
+    xip_ctrl_hw->stream_addr = start + ROMDRIVE_OFFSET;
+    xip_ctrl_hw->stream_ctr = words_remain;
+
+    // Poll the FIFO and copy each word as it becomes available
+    uint32_t *dest32 = (uint32_t*)dest;
+    while (words_remain > 0)
+    {
+        if (!(xip_ctrl_hw->stat & XIP_STAT_FIFO_EMPTY))
+        {
+            *dest32++ = xip_ctrl_hw->stream_fifo;
+            words_remain--;
+        }
+    }
+
+    return true;
+}
+
+// Erases and programs part of the ROM drive area in flash.
+//   data   bytes to write
+//   start  byte offset inside the ROM drive area
+//   count  number of bytes, must be a multiple of AZPLATFORM_ROMDRIVE_PAGE_SIZE
+// Always returns true; invalid arguments are caught by assertions.
+bool azplatform_write_romdrive(const uint8_t *data, uint32_t start, uint32_t count)
+{
+    assert(start < azplatform_get_romdrive_maxsize());
+    assert((count % AZPLATFORM_ROMDRIVE_PAGE_SIZE) == 0);
+
+    const uint32_t flash_offset = start + ROMDRIVE_OFFSET;
+
+    // Interrupts stay off for the whole erase + program sequence
+    __disable_irq();
+    flash_range_erase(flash_offset, count);
+    flash_range_program(flash_offset, data, count);
+    __enable_irq();
+
+    return true;
+}
+
+#endif
+
+/**********************************************/
+/* Mapping from data bytes to GPIO BOP values */
+/**********************************************/
+
+/* Lookup table that converts a data byte into the GPIO pattern for the SCSI data bus,
+ * including the parity bit. Using a table is the fastest way to do both at once.
+ * For RP2040 we expect that the bits are consecutive and in order.
+ * The PIO-based parity scheme also requires that the lookup table is aligned to 512-byte increment.
+ * The parity table is placed into SRAM4 area to reduce bus contention.
+ *
+ * Signals are active low: a GPIO bit is set in the entry when the corresponding
+ * data (or parity) bit is 0. PARITY(n) yields the odd-parity bit of byte n.
+ * The 256 entries are generated in index order by the doubling macros below.
+ */
+
+#define PARITY(n) ((1 ^ (n) ^ ((n)>>1) ^ ((n)>>2) ^ ((n)>>3) ^ ((n)>>4) ^ ((n)>>5) ^ ((n)>>6) ^ ((n)>>7)) & 1)
+#define X(n) (\
+    (((n) & 0x01) ? 0 : (1 << SCSI_IO_DB0)) | \
+    (((n) & 0x02) ? 0 : (1 << SCSI_IO_DB1)) | \
+    (((n) & 0x04) ? 0 : (1 << SCSI_IO_DB2)) | \
+    (((n) & 0x08) ? 0 : (1 << SCSI_IO_DB3)) | \
+    (((n) & 0x10) ? 0 : (1 << SCSI_IO_DB4)) | \
+    (((n) & 0x20) ? 0 : (1 << SCSI_IO_DB5)) | \
+    (((n) & 0x40) ? 0 : (1 << SCSI_IO_DB6)) | \
+    (((n) & 0x80) ? 0 : (1 << SCSI_IO_DB7)) | \
+    (PARITY(n)    ? 0 : (1 << SCSI_IO_DBP)) \
+)
+#define X4(n)   X(n),   X((n) + 1),    X((n) + 2),     X((n) + 3)
+#define X16(n)  X4(n),  X4((n) + 4),   X4((n) + 8),    X4((n) + 12)
+#define X64(n)  X16(n), X16((n) + 16), X16((n) + 32),  X16((n) + 48)
+#define X256(n) X64(n), X64((n) + 64), X64((n) + 128), X64((n) + 192)
+
+const uint16_t g_scsi_parity_lookup[256] __attribute__((aligned(512), section(".scratch_x.parity"))) =
+{
+    X256(0)
+};
+
+#undef X256
+#undef X64
+#undef X16
+#undef X4
+#undef X
+
+/* Lookup table used to verify parity of received data.
+ * It is indexed by the 9 bits read from the SCSI bus (active low):
+ * 8 data bits in bits 0-7 and the parity bit in bit 8.
+ * Each entry holds the data byte inverted to active-high in bits 0-7, and in
+ * bit 8 a flag that is 1 when the received parity bit matches the data.
+ * The 512 entries are generated in index order by the doubling macros below.
+ */
+#define X(n) (\
+    (((n) & 0xFF) ^ 0xFF) | \
+    (((PARITY((n) & 0xFF) ^ ((n) >> 8)) & 1) << 8) \
+)
+#define X4(n)   X(n),    X((n) + 1),     X((n) + 2),     X((n) + 3)
+#define X16(n)  X4(n),   X4((n) + 4),    X4((n) + 8),    X4((n) + 12)
+#define X64(n)  X16(n),  X16((n) + 16),  X16((n) + 32),  X16((n) + 48)
+#define X256(n) X64(n),  X64((n) + 64),  X64((n) + 128), X64((n) + 192)
+#define X512(n) X256(n), X256((n) + 256)
+
+const uint16_t g_scsi_parity_check_lookup[512] __attribute__((aligned(1024), section(".scratch_x.parity"))) =
+{
+    X512(0)
+};
+
+#undef X512
+#undef X256
+#undef X64
+#undef X16
+#undef X4
+#undef X
+
+} /* extern "C" */
+
+/* Logging from mbed: a write-only FileHandle that forwards everything written
+ * to it into the firmware log via azlog_raw(). Reads return no data. */
+
+static class LogTarget: public mbed::FileHandle {
+public:
+    // Nothing can be read from the log console.
+    virtual ssize_t read(void *buffer, size_t size) { return 0; }
+
+    // Forwards `size` bytes to the log and reports all of them as written.
+    virtual ssize_t write(const void *buffer, size_t size)
+    {
+        const char *bytes = (const char*)buffer;
+
+        // A bit inefficient but mbed seems to write() one character
+        // at a time anyways. Each byte is passed as its own
+        // NUL-terminated string because the input is not terminated.
+        for (size_t i = 0; i < size; i++)
+        {
+            char buf[2] = {bytes[i], 0};
+            azlog_raw(buf);
+        }
+        return size;
+    }
+
+    // Seeking is meaningless here; the requested offset is echoed back.
+    virtual off_t seek(off_t offset, int whence = SEEK_SET) { return offset; }
+    virtual int close() { return 0; }
+    virtual off_t size() { return 0; }
+} g_LogTarget;
+
+// Console override for mbed: every file descriptor is routed to the
+// firmware log through g_LogTarget.
+mbed::FileHandle *mbed::mbed_override_console(int fd)
+{
+    (void)fd; // the same target is returned regardless of the descriptor
+    mbed::FileHandle *console = &g_LogTarget;
+    return console;
+}

+ 178 - 0
lib/ZuluSCSI_platform_BS2/ZuluSCSI_platform.h

@@ -0,0 +1,178 @@
+// Platform-specific definitions for ZuluSCSI firmware running on BlueSCSI v2 (BS2) RP2040 hardware.
+
+#pragma once
+
+#include <stdint.h>
+#include <Arduino.h>
+#include "ZuluSCSI_platform_gpio.h"
+#include "scsiHostPhy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Platform identity and capability switches. The name/revision are used in debug output and default SCSI strings */
+extern const char *g_azplatform_name; // Defined in ZuluSCSI_platform.cpp, initialized from PLATFORM_NAME
+#define PLATFORM_NAME "ZuluSCSI BS2"
+#define PLATFORM_REVISION "3.0"
+#define PLATFORM_MAX_SCSI_SPEED S2S_CFG_SPEED_SYNC_10 // See the NOTE below about faster modes
+#define PLATFORM_OPTIMAL_MIN_SD_WRITE_SIZE 32768 // Presumably bytes (32 * 1024) — confirm against the users of these macros
+#define PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE 65536 // Presumably bytes (64 * 1024)
+#define PLATFORM_OPTIMAL_LAST_SD_WRITE_SIZE 8192 // Presumably bytes (8 * 1024)
+#define SD_USE_SDIO 1 // Makes SD_CONFIG (end of this header) refer to g_sd_sdio_config instead of the SPI config
+#define PLATFORM_HAS_INITIATOR_MODE 1
+#define PLATFORM_HAS_PARITY_CHECK 1
+
+// NOTE: The driver supports synchronous speeds higher than 10MB/s, but this
+// has not been tested due to lack of a fast enough SCSI adapter.
+// #define PLATFORM_MAX_SCSI_SPEED S2S_CFG_SPEED_TURBO
+
+// Debug logging function, can be used to print to e.g. serial port.
+// May get called from interrupt handlers, so the implementation has to tolerate interrupt context.
+void azplatform_log(const char *s); // s: text to log (presumably NUL-terminated — confirm in the .cpp)
+void azplatform_emergency_log_save(); // NOTE(review): going by the name, persists the log in an emergency such as a crash — confirm
+
+// Timing and delay functions.
+// Arduino platform already provides these; they are only re-declared here.
+unsigned long millis(void); // Millisecond counter; timeouts elsewhere are computed as (millis() - start)
+void delay(unsigned long ms); // Wait for ms milliseconds
+
+// Short delays, can be called from interrupt mode.
+// The requested time is rounded UP to whole microseconds and handed to
+// delayMicroseconds(), so the real resolution is 1 us.
+static inline void delay_ns(unsigned long ns)
+{
+    // Round up without computing (ns + 999): that sum wraps around for ns
+    // values close to the maximum of unsigned long and would turn a very
+    // long delay into a near-zero one.
+    unsigned long us = ns / 1000;
+    if (ns % 1000 != 0)
+    {
+        us++;
+    }
+    delayMicroseconds(us);
+}
+
+// Approximate fast delay: executes 11 NOP instructions back to back (actual duration depends on the core clock, which is not set in this header)
+static inline void delay_100ns()
+{
+    asm volatile ("nop \n nop \n nop \n nop \n nop \n nop \n nop \n nop \n nop \n nop \n nop");
+}
+
+// Initialize SD card and GPIO configuration (the implementation also stops core 1 before touching the pins)
+void azplatform_init();
+
+// Initialization for main application, not used for bootloader
+void azplatform_late_init();
+
+// Disable the status LED
+void azplatform_disable_led(void);
+
+// Query whether initiator mode is enabled on targets with PLATFORM_HAS_INITIATOR_MODE (defined as 1 for this platform)
+bool azplatform_is_initiator_mode_enabled();
+
+// Setup soft watchdog if supported. NOTE(review): the name suggests this is also the periodic "feed" call — confirm in the .cpp
+void azplatform_reset_watchdog();
+
+// Set callback that will be called during data transfer to/from SD card.
+// This can be used to implement simultaneous transfer to SCSI bus.
+typedef void (*sd_callback_t)(uint32_t bytes_complete); // bytes_complete: progress count passed to the callback
+void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer); // presumably buffer identifies the transfer the callback applies to — confirm
+
+// Reprogram firmware in main program area. Compiled out when RP2040_DISABLE_BOOTLOADER is defined.
+#ifndef RP2040_DISABLE_BOOTLOADER
+#define AZPLATFORM_BOOTLOADER_SIZE (128 * 1024) // Same 128 kB that the linker scripts reserve for the bootloader
+#define AZPLATFORM_FLASH_TOTAL_SIZE (1024 * 1024)
+#define AZPLATFORM_FLASH_PAGE_SIZE 4096 // Size of the buffer handed to azplatform_rewrite_flash_page()
+bool azplatform_rewrite_flash_page(uint32_t offset, uint8_t buffer[AZPLATFORM_FLASH_PAGE_SIZE]); // returns bool; presumably true on success — confirm
+void azplatform_boot_to_main_firmware();
+#endif
+
+// ROM drive in the unused external flash area. Compiled out when RP2040_DISABLE_ROMDRIVE is defined.
+#ifndef RP2040_DISABLE_ROMDRIVE
+#define PLATFORM_HAS_ROM_DRIVE 1
+// Check maximum available space for ROM drive in bytes
+uint32_t azplatform_get_romdrive_maxsize();
+
+// Read ROM drive area: copy count bytes starting at offset start into dest (offset base presumably the ROM drive start — confirm)
+bool azplatform_read_romdrive(uint8_t *dest, uint32_t start, uint32_t count);
+
+// Reprogram ROM drive area (start/count are presumably expected in AZPLATFORM_ROMDRIVE_PAGE_SIZE units of bytes — confirm)
+#define AZPLATFORM_ROMDRIVE_PAGE_SIZE 4096
+bool azplatform_write_romdrive(const uint8_t *data, uint32_t start, uint32_t count);
+#endif
+
+// Parity lookup tables for write and read from SCSI bus.
+// These are used by macros below (SCSI_OUT_DATA indexes g_scsi_parity_lookup by data byte) and the code in scsi_accel_rp2040.cpp
+extern const uint16_t g_scsi_parity_lookup[256]; // Indexed by the 8-bit data byte; value is OR-ed onto the data GPIOs
+extern const uint16_t g_scsi_parity_check_lookup[512]; // 512 entries, i.e. indexable by a 9-bit value (presumably data + parity)
+
+// Below are GPIO access definitions that are used from scsiPhy.cpp.
+
+// Write a single SCSI pin.
+// A true `state` writes the pin's bit to gpio_clr (pin low = active), a false
+// `state` writes it to gpio_set (pin high = inactive).
+// Example use: SCSI_OUT(ATN, 1) sets SCSI_ATN to low (active) state.
+// The argument and the whole expansion are parenthesized so the macro
+// behaves like a single expression regardless of what is passed in.
+#define SCSI_OUT(pin, state) \
+    (*((state) ? &sio_hw->gpio_clr : &sio_hw->gpio_set) = 1 << (SCSI_OUT_ ## pin))
+
+// Read a single SCSI pin from gpio_in; the level is inverted so a low pin reads as 1.
+// Example use: SCSI_IN(ATN), returns 1 for active low state.
+#define SCSI_IN(pin) \
+    ((sio_hw->gpio_in & (1 << (SCSI_IN_ ## pin))) ? 0 : 1)
+
+// Set pin directions for initiator vs. target mode:
+// ACK and ATN become outputs, while IO, CD, MSG and REQ become inputs.
+// The two register writes are wrapped in one parenthesized comma
+// expression so the macro can be used wherever an expression is expected.
+#define SCSI_ENABLE_INITIATOR() \
+    ((sio_hw->gpio_oe_set = (1 << SCSI_OUT_ACK) | \
+                            (1 << SCSI_OUT_ATN)), \
+     (sio_hw->gpio_oe_clr = (1 << SCSI_IN_IO) | \
+                            (1 << SCSI_IN_CD) | \
+                            (1 << SCSI_IN_MSG) | \
+                            (1 << SCSI_IN_REQ)))
+
+// Enable driving of shared control pins: turns on the output drivers of CD and MSG
+#define SCSI_ENABLE_CONTROL_OUT() \
+    (sio_hw->gpio_oe_set = (1 << SCSI_OUT_CD) | \
+                           (1 << SCSI_OUT_MSG))
+
+// Set SCSI data bus to output: raise SCSI_DATA_DIR first, then enable the output drivers of all pins in SCSI_IO_DATA_MASK
+#define SCSI_ENABLE_DATA_OUT() \
+    (sio_hw->gpio_set = (1 << SCSI_DATA_DIR), \
+     sio_hw->gpio_oe_set = SCSI_IO_DATA_MASK)
+
+// Write SCSI data bus, also sets REQ to inactive.
+// The byte is translated through g_scsi_parity_lookup to get the value for
+// the data GPIOs, REQ is driven high in the same masked write, and the
+// data bus is then switched to output.
+// The expansion is one parenthesized comma expression so it cannot be split
+// apart by operators or argument lists around the call site.
+#define SCSI_OUT_DATA(data) \
+    (gpio_put_masked(SCSI_IO_DATA_MASK | (1 << SCSI_OUT_REQ), \
+                     g_scsi_parity_lookup[(uint8_t)(data)] | (1 << SCSI_OUT_REQ)), \
+     SCSI_ENABLE_DATA_OUT())
+
+// Release SCSI data bus and REQ signal: data pins back to input, SCSI_DATA_DIR low, REQ driven high (inactive)
+#define SCSI_RELEASE_DATA_REQ() \
+    (sio_hw->gpio_oe_clr = SCSI_IO_DATA_MASK, \
+     sio_hw->gpio_clr = (1 << SCSI_DATA_DIR), \
+     sio_hw->gpio_set = (1 << SCSI_OUT_REQ))
+
+// Release all SCSI outputs.
+// Sequence: release data bus and REQ, drive every control output high
+// (inactive), wait 1 ms via delay(1), then turn off the output drivers of
+// the shared CD and MSG pins.
+// The expansion is one parenthesized comma expression so the steps always
+// stay together, whatever surrounds the macro at the call site.
+#define SCSI_RELEASE_OUTPUTS() \
+    (SCSI_RELEASE_DATA_REQ(), \
+     sio_hw->gpio_set = (1 << SCSI_OUT_IO) | \
+                        (1 << SCSI_OUT_CD) | \
+                        (1 << SCSI_OUT_MSG) | \
+                        (1 << SCSI_OUT_RST) | \
+                        (1 << SCSI_OUT_BSY) | \
+                        (1 << SCSI_OUT_REQ) | \
+                        (1 << SCSI_OUT_SEL), \
+     delay(1), \
+     sio_hw->gpio_oe_clr = (1 << SCSI_OUT_CD) | \
+                           (1 << SCSI_OUT_MSG))
+
+// Read SCSI data bus.
+// The GPIO levels are inverted (low = 1), masked with SCSI_IO_DATA_MASK and
+// shifted down by SCSI_IO_SHIFT, so the result still carries the parity bit
+// above the eight data bits.
+// The whole expansion is parenthesized: without that, an expression such as
+// SCSI_IN_DATA() + 1 would be parsed as a shift by (SCSI_IO_SHIFT + 1).
+#define SCSI_IN_DATA() \
+    ((~sio_hw->gpio_in & SCSI_IO_DATA_MASK) >> SCSI_IO_SHIFT)
+
+#ifdef __cplusplus
+}
+
+// SD card driver configuration for SdFat: SD_CONFIG / SD_CONFIG_CRASH name the config object to use
+
+#ifdef SD_USE_SDIO // SD_USE_SDIO is defined to 1 earlier in this header, so this branch is the active one
+class SdioConfig;
+extern SdioConfig g_sd_sdio_config;
+#define SD_CONFIG g_sd_sdio_config
+#define SD_CONFIG_CRASH g_sd_sdio_config
+#else // SPI variant, only compiled if SD_USE_SDIO is not defined
+class SdSpiConfig;
+extern SdSpiConfig g_sd_spi_config;
+#define SD_CONFIG g_sd_spi_config
+#define SD_CONFIG_CRASH g_sd_spi_config
+#endif
+
+#endif

+ 75 - 0
lib/ZuluSCSI_platform_BS2/ZuluSCSI_platform_gpio.h

@@ -0,0 +1,75 @@
+// GPIO definitions for ZuluSCSI firmware running on BlueSCSI v2 (BS2) RP2040-based hardware
+
+#pragma once
+
+#include <hardware/gpio.h>
+
+// SCSI data input/output port.
+// The data bus uses an external bidirectional buffer, with
+// direction controlled by the DATA_DIR pin.
+#define SCSI_IO_DB0  0
+#define SCSI_IO_DB1  1
+#define SCSI_IO_DB2  2
+#define SCSI_IO_DB3  3
+#define SCSI_IO_DB4  4
+#define SCSI_IO_DB5  5
+#define SCSI_IO_DB6  6
+#define SCSI_IO_DB7  7
+#define SCSI_IO_DBP  8
+#define SCSI_IO_DATA_MASK 0x1FF // Bits 0..8: covers DB0-DB7 and DBP
+#define SCSI_IO_SHIFT 0 // Bit position of DB0 inside the GPIO word
+
+// Data direction control: set high by SCSI_ENABLE_DATA_OUT(), cleared by SCSI_RELEASE_DATA_REQ()
+#define SCSI_DATA_DIR 9
+
+// SCSI output status lines (GPIO numbers). NOTE(review): SCSI_OUT_IO and SCSI_OUT_RST are both 22 — confirm this is intended
+#define SCSI_OUT_IO   22
+#define SCSI_OUT_CD   18
+#define SCSI_OUT_MSG  20
+#define SCSI_OUT_RST  22
+#define SCSI_OUT_BSY  27
+#define SCSI_OUT_REQ  17
+#define SCSI_OUT_SEL  19
+
+// SCSI input status signals (GPIO numbers). SEL (18) and BSY (20) use the same GPIO numbers as SCSI_OUT_CD and SCSI_OUT_MSG
+#define SCSI_IN_SEL  18
+#define SCSI_IN_ACK  26
+#define SCSI_IN_ATN  28
+#define SCSI_IN_BSY  20
+#define SCSI_IN_RST  21
+
+// Status line outputs for initiator mode. NOTE(review): SCSI_OUT_ACK (10) has the same number as SDIO_CLK — confirm
+#define SCSI_OUT_ACK  10
+#define SCSI_OUT_ATN  29
+
+// Status line inputs for initiator mode. NOTE(review): 11/12/13 equal SDIO_CMD/D0/D1 and 9 equals SCSI_DATA_DIR — confirm
+#define SCSI_IN_IO    12
+#define SCSI_IN_CD    11
+#define SCSI_IN_MSG   13
+#define SCSI_IN_REQ   9
+
+// Status LED pins.
+// LED_ON() writes the LED bit to gpio_set (pin high), LED_OFF() writes it to
+// gpio_clr (pin low). Both expansions are parenthesized so they act as a
+// single expression at the call site.
+#define LED_PIN      25
+#define LED_ON()     (sio_hw->gpio_set = 1 << LED_PIN)
+#define LED_OFF()    (sio_hw->gpio_clr = 1 << LED_PIN)
+
+// SD card pins in SDIO mode (GPIO numbers)
+#define SDIO_CLK 10
+#define SDIO_CMD 11
+#define SDIO_D0  12
+#define SDIO_D1  13
+#define SDIO_D2  14
+#define SDIO_D3  15
+
+// SD card pins in SPI mode; these reuse GPIO 10/11/12/15 from the SDIO list above
+#define SD_SPI       spi0
+#define SD_SPI_SCK   10
+#define SD_SPI_MOSI  11
+#define SD_SPI_MISO  12
+#define SD_SPI_CS    15
+
+// Initiator setting PIN: aliased to the SCSI_OUT_ATN GPIO (29) rather than a dedicated pin
+#define DIP_INITIATOR SCSI_OUT_ATN
+
+// Other pins
+#define SWO_PIN 16

+ 5 - 0
lib/ZuluSCSI_platform_BS2/bsp.h

@@ -0,0 +1,5 @@
+// Dummy file for SCSI2SD: provides the bsp.h name that code expects, with no real content for this platform.
+
+#pragma once
+
+#define S2S_DMA_ALIGN // Defined empty, so every use of S2S_DMA_ALIGN expands to nothing

+ 196 - 0
lib/ZuluSCSI_platform_BS2/rp2040.ld

@@ -0,0 +1,196 @@
+MEMORY
+{
+    FLASH(rx) : ORIGIN = 0x10000000, LENGTH = 352k  /* Whole image: boot2, 128 kB bootloader area, then main firmware */
+    RAM(rwx) : ORIGIN = 0x20000000, LENGTH = 240k  /* Leave space for pico-debug */
+    SCRATCH_X(rwx) : ORIGIN = 0x20040000, LENGTH = 4k  /* Holds .scratch_x and .stack1_dummy */
+    SCRATCH_Y(rwx) : ORIGIN = 0x20041000, LENGTH = 4k  /* Holds .scratch_y */
+}
+ENTRY(_entry_point)
+SECTIONS
+{
+    .flash_begin : {
+        __flash_binary_start = .;  /* Marks the first address of the image in FLASH */
+    } > FLASH
+    .boot2 : {
+        __boot2_start__ = .;
+        KEEP (*(.boot2))  /* KEEP prevents the section from being discarded by garbage collection */
+        __boot2_end__ = .;
+    } > FLASH
+    ASSERT(__boot2_end__ - __boot2_start__ == 256,
+        "ERROR: Pico second stage bootloader must be 256 bytes in size")
+
+    /* If ZuluSCSI SD card bootloader is included, it goes in first 128 kB; the area is padded to 128 kB either way */
+    .text.bootloader : ALIGN(16) SUBALIGN(16)
+    {
+        KEEP(*(.text.btldr*))
+        . = ALIGN(131072);  /* Pad up to the next 128 kB boundary so the main firmware starts after the bootloader area */
+        CHECK_BOOTLOADER_SIZE = 1 / (. <= 131072);  /* Evaluates to 1 / 0 if the bootloader is too large, which is meant to fail the link */
+    } > FLASH
+
+    .text : {
+        __logical_binary_start = .;  /* Start of the main firmware; the binary info header must lie within 256 bytes of this (ASSERT at the end) */
+        __real_vectors_start = .;
+        KEEP (*(.vectors))
+        KEEP (*(.binary_info_header))
+        __binary_info_header_end = .;
+        KEEP (*(.reset))
+        KEEP (*(.init))
+        *(.fini)
+        *crtbegin.o(.ctors)
+        *crtbegin?.o(.ctors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+        *(SORT(.ctors.*))
+        *(.ctors)
+        *crtbegin.o(.dtors)
+        *crtbegin?.o(.dtors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+        *(SORT(.dtors.*))
+        *(.dtors)
+        *(.eh_frame*)
+        . = ALIGN(4);
+
+        /* Put only non-timecritical code in flash
+         * This includes e.g. floating point math routines.
+         * Code not matched by the patterns below is placed in RAM by the .data section.
+         */
+        *libm*:(.text .text*)
+        *libc*:(.text .text*)
+        *libgcc*:*df*(.text .text*)
+        *USB*(.text .text*)
+        *SPI*(.text .text*)
+        *Spi*(.text .text*)
+        *spi*(.text .text*)
+        *stdc*:(.text .text*)
+        *supc*:(.text .text*)
+        *nosys*:(.text .text*)
+        *libc*:*printf*(.text .text*)
+        *libc*:*toa*(.text .text*)
+        *libminIni.a:(.text .text*)
+
+        /* RP2040 breakpoints in RAM code don't always work very well
+         * because the boot routine tends to overwrite them.
+         * Uncommenting this line puts all code in flash.
+         */
+        /* *(.text .text*) */
+    } > FLASH
+    .rodata : {  /* Read-only data stays in FLASH */
+        . = ALIGN(4);
+        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.flashdata*)))
+        *(.rodata)
+        *(.rodata*)
+        . = ALIGN(4);
+    } > FLASH
+    .ARM.extab :
+    {
+        *(.ARM.extab* .gnu.linkonce.armextab.*)
+    } > FLASH
+    __exidx_start = .;  /* __exidx_start/__exidx_end bracket the .ARM.exidx section */
+    .ARM.exidx :
+    {
+        *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+    } > FLASH
+    __exidx_end = .;
+    . = ALIGN(4);
+    __binary_info_start = .;  /* __binary_info_start/__binary_info_end bracket the .binary_info entries */
+    .binary_info :
+    {
+        KEEP(*(.binary_info.keep.*))
+        *(.binary_info.*)
+    } > FLASH
+    __binary_info_end = .;
+    . = ALIGN(4);
+    __etext = .;  /* End of the sections placed directly in FLASH above */
+   .ram_vector_table (COPY): {  /* Placed first in RAM; no load address in FLASH is given for it */
+        *(.ram_vector_table)
+    } > RAM
+    .data : {  /* Runs from RAM, load image stored in FLASH (see "> RAM AT> FLASH" below) */
+        __data_start__ = .;
+        *(vtable)
+
+        /* Time critical code will go here to avoid external flash latency */
+        *(.time_critical*)
+        . = ALIGN(4);
+        *(.text)  /* All code that the .text output section did not already claim ends up in RAM */
+        *(.text*)
+
+        . = ALIGN(4);
+        *(.data*)
+        . = ALIGN(4);
+        *(.after_data.*)
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__mutex_array_start = .);
+        KEEP(*(SORT(.mutex_array.*)))
+        KEEP(*(.mutex_array))
+        PROVIDE_HIDDEN (__mutex_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__preinit_array_start = .);
+        KEEP(*(SORT(.preinit_array.*)))
+        KEEP(*(.preinit_array))
+        PROVIDE_HIDDEN (__preinit_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__init_array_start = .);
+        KEEP(*(SORT(.init_array.*)))
+        KEEP(*(.init_array))
+        PROVIDE_HIDDEN (__init_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__fini_array_start = .);
+        *(SORT(.fini_array.*))
+        *(.fini_array)
+        PROVIDE_HIDDEN (__fini_array_end = .);
+        *(.jcr)
+        . = ALIGN(4);
+        __data_end__ = .;  /* __data_start__..__data_end__ is the RAM range that holds this section */
+    } > RAM AT> FLASH
+    .uninitialized_data (COPY): {  /* RAM only; no load address in FLASH is given */
+        . = ALIGN(4);
+        *(.uninitialized_data*)
+    } > RAM
+    .scratch_x : {  /* Lives in SCRATCH_X, load image stored in FLASH */
+        __scratch_x_start__ = .;
+        *(.scratch_x.*)
+        . = ALIGN(4);
+        __scratch_x_end__ = .;
+    } > SCRATCH_X AT > FLASH
+    __scratch_x_source__ = LOADADDR(.scratch_x);  /* FLASH address of the .scratch_x load image */
+    .scratch_y : {  /* Lives in SCRATCH_Y, load image stored in FLASH */
+        __scratch_y_start__ = .;
+        *(.scratch_y.*)
+        . = ALIGN(4);
+        __scratch_y_end__ = .;
+    } > SCRATCH_Y AT > FLASH
+    __scratch_y_source__ = LOADADDR(.scratch_y);  /* FLASH address of the .scratch_y load image */
+    .bss : {  /* RAM only; __bss_start__..__bss_end__ bracket the section */
+        . = ALIGN(4);
+        __bss_start__ = .;
+        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
+        *(COMMON)
+        . = ALIGN(4);
+        __bss_end__ = .;
+    } > RAM
+    .heap (COPY):  /* Heap: from the end of .bss up to 0x400 bytes below the top of RAM */
+    {
+        __end__ = .;
+        PROVIDE(end = .);
+        *(.heap*)
+        . = ORIGIN(RAM) + LENGTH(RAM) - 0x400;  /* Leaves the top 0x400 bytes of RAM free; __StackLimit below uses the same 0x400 */
+        __HeapLimit = .;
+    } > RAM
+    .stack1_dummy (COPY):  /* Only its SIZEOF is used, to compute __StackOneBottom */
+    {
+        *(.stack1*)
+    } > SCRATCH_X
+    .stack_dummy (COPY):  /* Only its SIZEOF is used, to compute __StackBottom */
+    {
+        *(.stack*)
+    } > RAM
+    .flash_end : {
+        __flash_binary_end = .;  /* Marks the end of the image in FLASH */
+    } > FLASH
+    __StackTop = ORIGIN(RAM) + LENGTH(RAM);  /* Main stack starts at the very top of RAM */
+    __StackLimit = __StackTop - 0x400;  /* Equals __HeapLimit unless the heap section overflows, see ASSERT below */
+    __StackOneTop = ORIGIN(SCRATCH_X) + LENGTH(SCRATCH_X);  /* Second stack starts at the top of SCRATCH_X */
+    __StackOneBottom = __StackOneTop - SIZEOF(.stack1_dummy);
+    __StackBottom = __StackTop - SIZEOF(.stack_dummy);
+    PROVIDE(__stack = __StackTop);
+    ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed")
+    ASSERT( __binary_info_header_end - __logical_binary_start <= 256, "Binary info must be in first 256 bytes of the binary")
+}

+ 168 - 0
lib/ZuluSCSI_platform_BS2/rp2040_btldr.ld

@@ -0,0 +1,168 @@
+/*
+ *
+ * Customized linker script for building bootloader
+ *
+ */
+
+ MEMORY
+{
+    /* The bootloader is linked to begin at 0x10000100.
+     * First 256 bytes are reserved for RP2040 second stage bootloader,
+     * which comes as part of the main firmware.elf and is never overwritten.
+     */
+    FLASH(rx) : ORIGIN = 0x10000100, LENGTH = 128k-256
+    RAM(rwx) : ORIGIN = 0x20000000, LENGTH = 240k  /* Leave space for pico-debug */
+    SCRATCH_X(rwx) : ORIGIN = 0x20040000, LENGTH = 4k
+    SCRATCH_Y(rwx) : ORIGIN = 0x20041000, LENGTH = 4k
+}
+ENTRY(_entry_point)
+SECTIONS
+{
+    .flash_begin : {
+        __flash_binary_start = .;  /* First address of the bootloader image (right after the 256-byte boot2 area) */
+    } > FLASH
+
+    .text : {
+        __logical_binary_start = .;
+        KEEP (*(.btldr_vectors))  /* Bootloader-specific vectors come first, ahead of the regular .vectors */
+        KEEP (*(.binary_info_header))
+        __binary_info_header_end = .;
+        . = ALIGN(256);  /* The regular vector table starts on the next 256-byte boundary */
+        __real_vectors_start = .;
+        KEEP (*(.vectors))
+        KEEP (*(.reset))
+        KEEP (*(.init))
+        *(.fini)
+        *crtbegin.o(.ctors)
+        *crtbegin?.o(.ctors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+        *(SORT(.ctors.*))
+        *(.ctors)
+        *crtbegin.o(.dtors)
+        *crtbegin?.o(.dtors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+        *(SORT(.dtors.*))
+        *(.dtors)
+        *(.eh_frame*)
+        *(.text .text*)  /* Unlike the main firmware script, all bootloader code is placed in FLASH */
+        . = ALIGN(4);
+    } > FLASH
+    .rodata : {  /* Read-only data stays in FLASH */
+        . = ALIGN(4);
+        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.flashdata*)))
+        *(.rodata)
+        *(.rodata*)
+        . = ALIGN(4);
+    } > FLASH
+    .ARM.extab :
+    {
+        *(.ARM.extab* .gnu.linkonce.armextab.*)
+    } > FLASH
+    __exidx_start = .;  /* __exidx_start/__exidx_end bracket the .ARM.exidx section */
+    .ARM.exidx :
+    {
+        *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+    } > FLASH
+    __exidx_end = .;
+    . = ALIGN(4);
+    __binary_info_start = .;  /* __binary_info_start/__binary_info_end bracket the .binary_info entries */
+    .binary_info :
+    {
+        KEEP(*(.binary_info.keep.*))
+        *(.binary_info.*)
+    } > FLASH
+    __binary_info_end = .;
+    . = ALIGN(4);
+    __etext = .;  /* End of the sections placed directly in FLASH above */
+   .ram_vector_table (COPY): {  /* Placed first in RAM; no load address in FLASH is given for it */
+        *(.ram_vector_table)
+    } > RAM
+    .data : {  /* Runs from RAM, load image stored in FLASH (see "> RAM AT> FLASH" below) */
+        __data_start__ = .;
+        *(vtable)
+
+        /* Time critical code will go here to avoid external flash latency; ordinary .text is not pulled into RAM in this script */
+        *(.time_critical*)
+
+        . = ALIGN(4);
+        *(.data*)
+        . = ALIGN(4);
+        *(.after_data.*)
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__mutex_array_start = .);
+        KEEP(*(SORT(.mutex_array.*)))
+        KEEP(*(.mutex_array))
+        PROVIDE_HIDDEN (__mutex_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__preinit_array_start = .);
+        KEEP(*(SORT(.preinit_array.*)))
+        KEEP(*(.preinit_array))
+        PROVIDE_HIDDEN (__preinit_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__init_array_start = .);
+        KEEP(*(SORT(.init_array.*)))
+        KEEP(*(.init_array))
+        PROVIDE_HIDDEN (__init_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__fini_array_start = .);
+        *(SORT(.fini_array.*))
+        *(.fini_array)
+        PROVIDE_HIDDEN (__fini_array_end = .);
+        *(.jcr)
+        . = ALIGN(4);
+        __data_end__ = .;  /* __data_start__..__data_end__ is the RAM range that holds this section */
+    } > RAM AT> FLASH
+    .uninitialized_data (COPY): {  /* RAM only; no load address in FLASH is given */
+        . = ALIGN(4);
+        *(.uninitialized_data*)
+    } > RAM
+    .scratch_x : {  /* Lives in SCRATCH_X, load image stored in FLASH */
+        __scratch_x_start__ = .;
+        *(.scratch_x.*)
+        . = ALIGN(4);
+        __scratch_x_end__ = .;
+    } > SCRATCH_X AT > FLASH
+    __scratch_x_source__ = LOADADDR(.scratch_x);  /* FLASH address of the .scratch_x load image */
+    .scratch_y : {  /* Lives in SCRATCH_Y, load image stored in FLASH */
+        __scratch_y_start__ = .;
+        *(.scratch_y.*)
+        . = ALIGN(4);
+        __scratch_y_end__ = .;
+    } > SCRATCH_Y AT > FLASH
+    __scratch_y_source__ = LOADADDR(.scratch_y);  /* FLASH address of the .scratch_y load image */
+    .bss : {  /* RAM only; __bss_start__..__bss_end__ bracket the section */
+        . = ALIGN(4);
+        __bss_start__ = .;
+        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
+        *(COMMON)
+        . = ALIGN(4);
+        __bss_end__ = .;
+    } > RAM
+    .heap (COPY):  /* Heap: from the end of .bss up to 0x400 bytes below the top of RAM */
+    {
+        __end__ = .;
+        PROVIDE(end = .);
+        *(.heap*)
+        . = ORIGIN(RAM) + LENGTH(RAM) - 0x400;  /* Leaves the top 0x400 bytes of RAM free; __StackLimit below uses the same 0x400 */
+        __HeapLimit = .;
+    } > RAM
+    .stack1_dummy (COPY):  /* Only its SIZEOF is used, to compute __StackOneBottom */
+    {
+        *(.stack1*)
+    } > SCRATCH_X
+    .stack_dummy (COPY):  /* Only its SIZEOF is used, to compute __StackBottom */
+    {
+        *(.stack*)
+    } > RAM
+    .flash_end : {
+        __flash_binary_end = .;  /* Marks the end of the bootloader image in FLASH */
+    } > FLASH
+    __StackTop = ORIGIN(RAM) + LENGTH(RAM);  /* Main stack starts at the very top of RAM */
+    __StackLimit = __StackTop - 0x400;  /* Equals __HeapLimit unless the heap section overflows, see ASSERT below */
+    __StackOneTop = ORIGIN(SCRATCH_X) + LENGTH(SCRATCH_X);  /* Second stack starts at the top of SCRATCH_X */
+    __StackOneBottom = __StackOneTop - SIZEOF(.stack1_dummy);
+    __StackBottom = __StackTop - SIZEOF(.stack_dummy);
+    PROVIDE(__stack = __StackTop);
+    ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed")
+    ASSERT( __binary_info_header_end - __logical_binary_start <= 256, "Binary info must be in first 256 bytes of the binary")
+}

+ 78 - 0
lib/ZuluSCSI_platform_BS2/rp2040_flash_do_cmd.h

@@ -0,0 +1,78 @@
+// As of 2022-11-16, the raspberrypi platformio package ships with old version
+// of pico-sdk. This version lacks the flash_do_cmd() function in flash.h header.
+// This file backports the function from new versions.
+
+#pragma once
+
+#ifndef RP2040_HAS_FLASH_DO_CMD
+
+#include <hardware/flash.h>
+#include "pico/bootrom.h"
+#include "hardware/structs/ssi.h"
+#include "hardware/structs/ioqspi.h"
+
+#define BOOT2_SIZE_WORDS 64 // 64 x 32-bit words = 256 bytes
+
+static uint32_t boot2_copyout[BOOT2_SIZE_WORDS]; // Copy of the first BOOT2_SIZE_WORDS words at XIP_BASE, filled by flash_init_boot2_copyout()
+static bool boot2_copyout_valid = false; // Becomes true once boot2_copyout has been filled
+
+static void __no_inline_not_in_flash_func(flash_init_boot2_copyout)(void) { // One-time copy of the first 64 words of flash into boot2_copyout
+    if (boot2_copyout_valid)
+        return; // Already copied on an earlier call
+    for (int i = 0; i < BOOT2_SIZE_WORDS; ++i)
+        boot2_copyout[i] = ((uint32_t *)XIP_BASE)[i];
+    __asm__ volatile ("" : : : "memory"); // Compiler barrier: keeps the copy ahead of the flag update below
+    boot2_copyout_valid = true;
+}
+
+static void __no_inline_not_in_flash_func(flash_enable_xip_via_boot2)(void) { // Runs the copied boot2 code; flash_do_cmd() calls this last to get XIP back
+    ((void (*)(void))((char*)boot2_copyout+1))(); // Call the buffer as a function; the +1 sets bit 0 of the address (Thumb state on ARM)
+}
+
+// Bitbanging the chip select using IO overrides, in case RAM-resident IRQs
+// are still running, and the FIFO bottoms out. (the bootrom does the same)
+static void __no_inline_not_in_flash_func(flash_cs_force)(bool high) { // high: true forces the pin high, false forces it low
+    uint32_t field_val = high ?
+        IO_QSPI_GPIO_QSPI_SS_CTRL_OUTOVER_VALUE_HIGH :
+        IO_QSPI_GPIO_QSPI_SS_CTRL_OUTOVER_VALUE_LOW;
+    hw_write_masked(&ioqspi_hw->io[1].ctrl, // Only the OUTOVER field of the control register is rewritten
+        field_val << IO_QSPI_GPIO_QSPI_SS_CTRL_OUTOVER_LSB,
+        IO_QSPI_GPIO_QSPI_SS_CTRL_OUTOVER_BITS
+    );
+}
+
+static void __no_inline_not_in_flash_func(flash_do_cmd)(const uint8_t *txbuf, uint8_t *rxbuf, size_t count) { // Exchange count bytes with the flash chip: txbuf is sent, rxbuf receives the same number of bytes
+    void (*connect_internal_flash)(void) = (void(*)(void))rom_func_lookup(rom_table_code('I', 'F')); // Bootrom helpers are looked up by their two-character codes
+    void (*flash_exit_xip)(void) = (void(*)(void))rom_func_lookup(rom_table_code('E', 'X'));
+    void (*flash_flush_cache)(void) = (void(*)(void))rom_func_lookup(rom_table_code('F', 'C'));
+    assert(connect_internal_flash && flash_exit_xip && flash_flush_cache);
+    flash_init_boot2_copyout(); // Must happen while flash is still readable, before XIP is exited below
+    __asm__ volatile ("" : : : "memory"); // Compiler barrier
+    connect_internal_flash();
+    flash_exit_xip();
+
+    flash_cs_force(0); // Chip select forced low for the duration of the transfer
+    size_t tx_remaining = count;
+    size_t rx_remaining = count;
+    // We may be interrupted -- don't want FIFO to overflow if we're distracted.
+    const size_t max_in_flight = 16 - 2;
+    while (tx_remaining || rx_remaining) {
+        uint32_t flags = ssi_hw->sr;
+        bool can_put = !!(flags & SSI_SR_TFNF_BITS);
+        bool can_get = !!(flags & SSI_SR_RFNE_BITS);
+        if (can_put && tx_remaining && rx_remaining - tx_remaining < max_in_flight) { // rx_remaining - tx_remaining = bytes sent but not yet read back
+            ssi_hw->dr0 = *txbuf++;
+            --tx_remaining;
+        }
+        if (can_get && rx_remaining) {
+            *rxbuf++ = (uint8_t)ssi_hw->dr0;
+            --rx_remaining;
+        }
+    }
+    flash_cs_force(1); // Chip select forced high again: transfer finished
+
+    flash_flush_cache();
+    flash_enable_xip_via_boot2(); // Re-run the saved boot2 copy to restore XIP
+}
+
+#endif

+ 807 - 0
lib/ZuluSCSI_platform_BS2/rp2040_sdio.cpp

@@ -0,0 +1,807 @@
+// Implementation of SDIO communication for RP2040
+//
+// The RP2040 official work-in-progress code at
+// https://github.com/raspberrypi/pico-extras/tree/master/src/rp2_common/pico_sd_card
+// may be useful reference, but this is independent implementation.
+//
+// For official SDIO specifications, refer to:
+// https://www.sdcard.org/downloads/pls/
+// "SDIO Physical Layer Simplified Specification Version 8.00"
+
+#include "rp2040_sdio.h"
+#include "rp2040_sdio.pio.h"
+#include <hardware/pio.h>
+#include <hardware/dma.h>
+#include <hardware/gpio.h>
+#include <ZuluSCSI_platform.h>
+#include <ZuluSCSI_log.h>
+
+#define SDIO_PIO pio1 // PIO instance used for all SDIO state machines in this file
+#define SDIO_CMD_SM 0 // State machine index used for the command/response exchange
+#define SDIO_DATA_SM 1 // State machine index, presumably for the data lines (its use lies outside this excerpt)
+#define SDIO_DMA_CH 4 // DMA channel; rp2040_sdio_command_R2() uses it to receive the long response
+#define SDIO_DMA_CHB 5 // Second DMA channel (its use lies outside this excerpt)
+
+// Maximum number of 512 byte blocks to transfer in one request
+#define SDIO_MAX_BLOCKS 256 // Also sizes g_sdio.dma_blocks (twice this) and g_sdio.received_checksums
+
+enum sdio_transfer_state_t { SDIO_IDLE, SDIO_RX, SDIO_TX, SDIO_TX_WAIT_IDLE}; // Type of g_sdio.transfer_state; the transitions are implemented outside this excerpt
+
+static struct { // All driver state of the SDIO implementation, kept in the single instance g_sdio
+    uint32_t pio_cmd_clk_offset; // Used as jump target when resetting the command SM and as base when logging its PC
+    uint32_t pio_data_rx_offset; // Presumably the program offset of the data receive program — confirm
+    pio_sm_config pio_cfg_data_rx;
+    uint32_t pio_data_tx_offset; // Presumably the program offset of the data transmit program — confirm
+    pio_sm_config pio_cfg_data_tx;
+
+    sdio_transfer_state_t transfer_state;
+    uint32_t transfer_start_time;
+    uint32_t *data_buf;
+    uint32_t blocks_done; // Number of blocks transferred so far
+    uint32_t total_blocks; // Total number of blocks to transfer
+    uint32_t blocks_checksumed; // Number of blocks that have had CRC calculated
+    uint32_t checksum_errors; // Number of checksum errors detected
+
+    // Variables for block writes
+    uint64_t next_wr_block_checksum; // 64 bits wide, the same width sdio_crc16_4bit_checksum() returns
+    uint32_t end_token_buf[3]; // CRC and end token for write block
+    sdio_status_t wr_status;
+    uint32_t card_response;
+    
+    // Variables for block reads
+    // This is used to perform DMA into data buffers and checksum buffers separately.
+    struct {
+        void * write_addr;
+        uint32_t transfer_count;
+    } dma_blocks[SDIO_MAX_BLOCKS * 2]; // Two entries per block, matching the "data buffers and checksum buffers" note above
+    struct {
+        uint32_t top;
+        uint32_t bottom;
+    } received_checksums[SDIO_MAX_BLOCKS]; // One top/bottom pair per block
+} g_sdio;
+
+void rp2040_sdio_dma_irq(); // Forward declaration only; the definition is not part of this excerpt
+
+/*******************************************************
+ * Checksum algorithms
+ *******************************************************/
+
+// Table lookup for calculating CRC-7 checksum that is used in SDIO command packets.
+// Every entry is even: the 7-bit CRC sits in bits 7..1 and bit 0 is always zero. Usage:
+//    uint8_t crc = 0;
+//    crc = crc7_table[crc ^ byte];
+//    .. repeat for every byte, in the order the bytes are transmitted ..
+static const uint8_t crc7_table[256] = {
+	0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,	0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee,
+	0x32, 0x20, 0x16, 0x04, 0x7a, 0x68, 0x5e, 0x4c,	0xa2, 0xb0, 0x86, 0x94, 0xea, 0xf8, 0xce, 0xdc,
+	0x64, 0x76, 0x40, 0x52, 0x2c, 0x3e, 0x08, 0x1a,	0xf4, 0xe6, 0xd0, 0xc2, 0xbc, 0xae, 0x98, 0x8a,
+	0x56, 0x44, 0x72, 0x60, 0x1e, 0x0c, 0x3a, 0x28,	0xc6, 0xd4, 0xe2, 0xf0, 0x8e, 0x9c, 0xaa, 0xb8,
+	0xc8, 0xda, 0xec, 0xfe, 0x80, 0x92, 0xa4, 0xb6,	0x58, 0x4a, 0x7c, 0x6e, 0x10, 0x02, 0x34, 0x26,
+	0xfa, 0xe8, 0xde, 0xcc, 0xb2, 0xa0, 0x96, 0x84,	0x6a, 0x78, 0x4e, 0x5c, 0x22, 0x30, 0x06, 0x14,
+	0xac, 0xbe, 0x88, 0x9a, 0xe4, 0xf6, 0xc0, 0xd2,	0x3c, 0x2e, 0x18, 0x0a, 0x74, 0x66, 0x50, 0x42,
+	0x9e, 0x8c, 0xba, 0xa8, 0xd6, 0xc4, 0xf2, 0xe0,	0x0e, 0x1c, 0x2a, 0x38, 0x46, 0x54, 0x62, 0x70,
+	0x82, 0x90, 0xa6, 0xb4, 0xca, 0xd8, 0xee, 0xfc,	0x12, 0x00, 0x36, 0x24, 0x5a, 0x48, 0x7e, 0x6c,
+	0xb0, 0xa2, 0x94, 0x86, 0xf8, 0xea, 0xdc, 0xce,	0x20, 0x32, 0x04, 0x16, 0x68, 0x7a, 0x4c, 0x5e,
+	0xe6, 0xf4, 0xc2, 0xd0, 0xae, 0xbc, 0x8a, 0x98,	0x76, 0x64, 0x52, 0x40, 0x3e, 0x2c, 0x1a, 0x08,
+	0xd4, 0xc6, 0xf0, 0xe2, 0x9c, 0x8e, 0xb8, 0xaa,	0x44, 0x56, 0x60, 0x72, 0x0c, 0x1e, 0x28, 0x3a,
+	0x4a, 0x58, 0x6e, 0x7c, 0x02, 0x10, 0x26, 0x34,	0xda, 0xc8, 0xfe, 0xec, 0x92, 0x80, 0xb6, 0xa4,
+	0x78, 0x6a, 0x5c, 0x4e, 0x30, 0x22, 0x14, 0x06,	0xe8, 0xfa, 0xcc, 0xde, 0xa0, 0xb2, 0x84, 0x96,
+	0x2e, 0x3c, 0x0a, 0x18, 0x66, 0x74, 0x42, 0x50,	0xbe, 0xac, 0x9a, 0x88, 0xf6, 0xe4, 0xd2, 0xc0,
+	0x1c, 0x0e, 0x38, 0x2a, 0x54, 0x46, 0x70, 0x62,	0x8c, 0x9e, 0xa8, 0xba, 0xc4, 0xd6, 0xe0, 0xf2
+};
+
+// CRC16 over the four SDIO data lines at once.
+// In 4-bit bus mode every data line gets its own CRC16, so the result is
+// 4 x 16 = 64 bits wide.
+//
+// data:      words to checksum; each 32-bit word holds 8 bits per line
+// num_words: number of 32-bit words in data
+// returns:   the 64-bit combined checksum (0 for an empty or all-zero input)
+//
+// Words are consumed in groups of four. NOTE(review): num_words is presumably
+// always a multiple of 4 (a 512 byte block is 128 words) — confirm at callers.
+__attribute__((optimize("O3")))
+uint64_t sdio_crc16_4bit_checksum(uint32_t *data, uint32_t num_words)
+{
+    uint32_t *const stop = data + num_words;
+    uint64_t acc = 0;
+
+    for (uint32_t *cursor = data; cursor < stop; cursor += 4)
+    {
+        for (int k = 0; k < 4; k++)
+        {
+            // SDIO sends the most significant byte first, so swap the bytes
+            // of the word before feeding it in.
+            uint32_t incoming = __builtin_bswap32(cursor[k]);
+
+            // The top 32 bits leave the accumulator: 8 bits per line.
+            uint32_t outgoing = (uint32_t)(acc >> 32);
+            acc <<= 32;
+
+            // Fold the outgoing bits and the incoming bits onto themselves
+            // with a delay of 4 bits per line.
+            outgoing ^= (outgoing >> 16) ^ (incoming >> 16);
+
+            // Apply the result at each of the three taps.
+            uint64_t taps = outgoing ^ incoming;
+            acc ^= taps ^ (taps << (5 * 4)) ^ (taps << (12 * 4));
+        }
+    }
+
+    return acc;
+}
+
+/*******************************************************
+ * Basic SDIO command execution
+ *******************************************************/
+
+// Build one SDIO command packet and queue it to the command state machine.
+//
+// command:       command index
+// arg:           32-bit command argument
+// response_bits: length of the expected response in bits, 0 for none
+//
+// The packet is handed to the PIO program as two 32-bit words. The CRC-7 is
+// calculated here over the command byte and the four argument bytes.
+static void sdio_send_command(uint8_t command, uint32_t arg, uint8_t response_bits)
+{
+    // Argument bytes, most significant first (the order they are transmitted in).
+    const uint32_t arg_b3 = (arg >> 24) & 0xFF;
+    const uint32_t arg_b2 = (arg >> 16) & 0xFF;
+    const uint32_t arg_b1 = (arg >> 8) & 0xFF;
+    const uint32_t arg_b0 = (arg >> 0) & 0xFF;
+
+    // First word: bit count minus one, direction bit (host to card),
+    // command byte and the upper half of the argument.
+    uint32_t first_word = (47 << 24) | (1 << 22) | (command << 16) | (arg_b3 << 8) | (arg_b2 << 0);
+
+    // Second word: lower half of the argument and the end bit.
+    uint32_t second_word = (arg_b1 << 24) | (arg_b0 << 16) | (1 << 8);
+
+    // The low byte carries the response length minus one; it stays 0 when
+    // no response is expected.
+    if (response_bits)
+    {
+        second_word |= ((response_bits - 1) << 0);
+    }
+
+    // Checksum over the five payload bytes in transmit order.
+    const uint8_t payload[5] = {
+        (uint8_t)((first_word >> 16) & 0xFF),
+        (uint8_t)arg_b3,
+        (uint8_t)arg_b2,
+        (uint8_t)arg_b1,
+        (uint8_t)arg_b0
+    };
+    uint8_t checksum = 0;
+    for (int i = 0; i < 5; i++)
+    {
+        checksum = crc7_table[checksum ^ payload[i]];
+    }
+    second_word |= checksum << 8;
+
+    // Hand the packet to the state machine, starting from empty FIFOs.
+    pio_sm_clear_fifos(SDIO_PIO, SDIO_CMD_SM);
+    pio_sm_put(SDIO_PIO, SDIO_CMD_SM, first_word);
+    pio_sm_put(SDIO_PIO, SDIO_CMD_SM, second_word);
+}
+
+// Send a command and, if `response` is not NULL, receive and check a 48-bit
+// reply.
+//
+// command:  command index
+// arg:      32-bit command argument
+// response: receives the 32-bit payload of the reply; pass NULL when no
+//           reply is expected
+//
+// Returns SDIO_OK on success, SDIO_ERR_RESPONSE_TIMEOUT when nothing arrives
+// within about 2 ms, SDIO_ERR_RESPONSE_CRC on a checksum mismatch and
+// SDIO_ERR_RESPONSE_CODE when the reply names a different command (not
+// checked for command 41).
+sdio_status_t rp2040_sdio_command_R1(uint8_t command, uint32_t arg, uint32_t *response)
+{
+    sdio_send_command(command, arg, response ? 48 : 0);
+
+    // Two words are waited for when a reply was requested, otherwise a
+    // single marker word.
+    const uint32_t words_needed = response ? 2 : 1;
+    const uint32_t t_begin = millis();
+    while (pio_sm_get_rx_fifo_level(SDIO_PIO, SDIO_CMD_SM) < words_needed)
+    {
+        if ((uint32_t)(millis() - t_begin) <= 2)
+        {
+            continue;
+        }
+
+        if (command != 8) // Don't log for missing SD card
+        {
+            azdbg("Timeout waiting for response in rp2040_sdio_command_R1(", (int)command, "), ",
+                "PIO PC: ", (int)pio_sm_get_pc(SDIO_PIO, SDIO_CMD_SM) - (int)g_sdio.pio_cmd_clk_offset,
+                " RXF: ", (int)pio_sm_get_rx_fifo_level(SDIO_PIO, SDIO_CMD_SM),
+                " TXF: ", (int)pio_sm_get_tx_fifo_level(SDIO_PIO, SDIO_CMD_SM));
+        }
+
+        // Put the state machine back to the start of its program
+        pio_sm_clear_fifos(SDIO_PIO, SDIO_CMD_SM);
+        pio_sm_exec(SDIO_PIO, SDIO_CMD_SM, pio_encode_jmp(g_sdio.pio_cmd_clk_offset));
+        return SDIO_ERR_RESPONSE_TIMEOUT;
+    }
+
+    if (!response)
+    {
+        // No reply requested: just discard the dummy marker
+        pio_sm_get(SDIO_PIO, SDIO_CMD_SM);
+        return SDIO_OK;
+    }
+
+    // Fetch the two words of the reply packet
+    const uint32_t first_word = pio_sm_get(SDIO_PIO, SDIO_CMD_SM);
+    const uint32_t second_word = pio_sm_get(SDIO_PIO, SDIO_CMD_SM);
+
+    // The checksum covers the four bytes of the first word and bits 15..8
+    // of the second word.
+    const uint8_t covered[5] = {
+        (uint8_t)((first_word >> 24) & 0xFF),
+        (uint8_t)((first_word >> 16) & 0xFF),
+        (uint8_t)((first_word >>  8) & 0xFF),
+        (uint8_t)((first_word >>  0) & 0xFF),
+        (uint8_t)((second_word >> 8) & 0xFF)
+    };
+    uint8_t computed = 0;
+    for (int i = 0; i < 5; i++)
+    {
+        computed = crc7_table[computed ^ covered[i]];
+    }
+
+    uint8_t received = ((second_word >> 0) & 0xFE);
+    if (computed != received)
+    {
+        azdbg("rp2040_sdio_command_R1(", (int)command, "): CRC error, calculated ", computed, " packet has ", received);
+        return SDIO_ERR_RESPONSE_CRC;
+    }
+
+    // The first byte of the reply repeats the command index
+    uint8_t echoed_cmd = covered[0];
+    if (command != 41 && echoed_cmd != command)
+    {
+        azdbg("rp2040_sdio_command_R1(", (int)command, "): received reply for ", (int)echoed_cmd);
+        return SDIO_ERR_RESPONSE_CODE;
+    }
+
+    *response = ((first_word & 0xFFFFFF) << 8) | ((second_word >> 8) & 0xFF);
+    return SDIO_OK;
+}
+
+// Send a command and receive its 136-bit (R2) reply.
+// On success response[0..14] hold the payload bytes and response[15] the CRC byte.
+// Returns SDIO_ERR_RESPONSE_TIMEOUT, SDIO_ERR_RESPONSE_CRC or SDIO_ERR_RESPONSE_CODE on failure.
+sdio_status_t rp2040_sdio_command_R2(uint8_t command, uint32_t arg, uint8_t response[16])
+{
+    // The reply is too long to fit in the PIO FIFO, so let DMA drain the RX FIFO
+    // into a five-word buffer while the state machine receives it.
+    uint32_t rx_words[5];
+    pio_sm_clear_fifos(SDIO_PIO, SDIO_CMD_SM);
+
+    dma_channel_config rx_cfg = dma_channel_get_default_config(SDIO_DMA_CH);
+    channel_config_set_transfer_data_size(&rx_cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&rx_cfg, false);
+    channel_config_set_write_increment(&rx_cfg, true);
+    channel_config_set_dreq(&rx_cfg, pio_get_dreq(SDIO_PIO, SDIO_CMD_SM, false));
+    dma_channel_configure(SDIO_DMA_CH, &rx_cfg, &rx_words, &SDIO_PIO->rxf[SDIO_CMD_SM], 5, true);
+
+    sdio_send_command(command, arg, 136);
+
+    // Wait for the DMA to collect all five words, but give up after a couple of milliseconds
+    const uint32_t wait_start = millis();
+    while (dma_channel_is_busy(SDIO_DMA_CH))
+    {
+        if ((uint32_t)(millis() - wait_start) <= 2)
+        {
+            continue;
+        }
+
+        azdbg("Timeout waiting for response in rp2040_sdio_command_R2(", (int)command, "), ",
+              "PIO PC: ", (int)pio_sm_get_pc(SDIO_PIO, SDIO_CMD_SM) - (int)g_sdio.pio_cmd_clk_offset,
+              " RXF: ", (int)pio_sm_get_rx_fifo_level(SDIO_PIO, SDIO_CMD_SM),
+              " TXF: ", (int)pio_sm_get_tx_fifo_level(SDIO_PIO, SDIO_CMD_SM));
+
+        // Stop the DMA and send the state machine back to the start of its program
+        dma_channel_abort(SDIO_DMA_CH);
+        pio_sm_clear_fifos(SDIO_PIO, SDIO_CMD_SM);
+        pio_sm_exec(SDIO_PIO, SDIO_CMD_SM, pio_encode_jmp(g_sdio.pio_cmd_clk_offset));
+        return SDIO_ERR_RESPONSE_TIMEOUT;
+    }
+
+    dma_channel_abort(SDIO_DMA_CH);
+
+    // Packet byte k sits in word k/4, most significant byte first. Byte 0 is the
+    // header; bytes 1..15 are copied to response[0..14] and folded into the checksum.
+    uint8_t computed_crc = 0;
+    for (int i = 0; i < 15; i++)
+    {
+        const int packet_byte = i + 1;
+        const int shift = 24 - 8 * (packet_byte % 4);
+        response[i] = ((rx_words[packet_byte / 4] >> shift) & 0xFF);
+        computed_crc = crc7_table[computed_crc ^ response[i]];
+    }
+
+    // The final byte arrives alone in the last word; its bit 0 is masked off before comparing
+    response[15] = ((rx_words[4] >> 0) & 0xFF);
+    uint8_t packet_crc = response[15] & 0xFE;
+    if (computed_crc != packet_crc)
+    {
+        azdbg("rp2040_sdio_command_R2(", (int)command, "): CRC error, calculated ", computed_crc, " packet has ", packet_crc);
+        return SDIO_ERR_RESPONSE_CRC;
+    }
+
+    // The header byte of an R2 reply is expected to be 0x3F
+    uint8_t header_byte = ((rx_words[0] >> 24) & 0xFF);
+    if (header_byte != 0x3F)
+    {
+        azdbg("rp2040_sdio_command_R2(", (int)command, "): Expected reply code 0x3F");
+        return SDIO_ERR_RESPONSE_CODE;
+    }
+
+    return SDIO_OK;
+}
+
+
+// Send a command and receive its 48-bit (R3) reply.
+// Neither the checksum nor the command code of the reply is verified here.
+sdio_status_t rp2040_sdio_command_R3(uint8_t command, uint32_t arg, uint32_t *response)
+{
+    sdio_send_command(command, arg, 48);
+
+    // The reply arrives as two FIFO words; poll until both are present or 2 ms have passed
+    const uint32_t wait_start = millis();
+    while (pio_sm_get_rx_fifo_level(SDIO_PIO, SDIO_CMD_SM) < 2)
+    {
+        if ((uint32_t)(millis() - wait_start) <= 2)
+        {
+            continue;
+        }
+
+        azdbg("Timeout waiting for response in rp2040_sdio_command_R3(", (int)command, "), ",
+              "PIO PC: ", (int)pio_sm_get_pc(SDIO_PIO, SDIO_CMD_SM) - (int)g_sdio.pio_cmd_clk_offset,
+              " RXF: ", (int)pio_sm_get_rx_fifo_level(SDIO_PIO, SDIO_CMD_SM),
+              " TXF: ", (int)pio_sm_get_tx_fifo_level(SDIO_PIO, SDIO_CMD_SM));
+
+        // Send the state machine back to the start of its program
+        pio_sm_clear_fifos(SDIO_PIO, SDIO_CMD_SM);
+        pio_sm_exec(SDIO_PIO, SDIO_CMD_SM, pio_encode_jmp(g_sdio.pio_cmd_clk_offset));
+        return SDIO_ERR_RESPONSE_TIMEOUT;
+    }
+
+    // The 32-bit payload is the low 24 bits of the first word followed by
+    // bits 15..8 of the second word.
+    const uint32_t first_word = pio_sm_get(SDIO_PIO, SDIO_CMD_SM);
+    const uint32_t second_word = pio_sm_get(SDIO_PIO, SDIO_CMD_SM);
+    const uint32_t payload_high = (first_word & 0xFFFFFF) << 8;
+    const uint32_t payload_low = (second_word >> 8) & 0xFF;
+    *response = payload_high | payload_low;
+    // azdbg("SDIO R3 response: ", first_word, " ", second_word);
+
+    return SDIO_OK;
+}
+
+/*******************************************************
+ * Data reception from SD card
+ *******************************************************/
+
+// Begin receiving num_blocks blocks of 512 bytes from the card into buffer.
+// Only sets up PIO and DMA and returns at once; follow progress with rp2040_sdio_rx_poll().
+sdio_status_t rp2040_sdio_rx_start(uint8_t *buffer, uint32_t num_blocks)
+{
+    // DMA stores 32-bit words, so the buffer must be word aligned
+    assert(((uint32_t)buffer & 3) == 0 && num_blocks <= SDIO_MAX_BLOCKS);
+
+    g_sdio.transfer_state = SDIO_RX;
+    g_sdio.transfer_start_time = millis();
+    g_sdio.data_buf = (uint32_t*)buffer;
+    g_sdio.blocks_done = 0;
+    g_sdio.total_blocks = num_blocks;
+    g_sdio.blocks_checksumed = 0;
+    g_sdio.checksum_errors = 0;
+
+    // Two DMA descriptors per block: the first stores 512 bytes of data to the buffer,
+    // the second stores the 8 checksum bytes to g_sdio.received_checksums.
+    for (uint32_t blk = 0; blk < num_blocks; blk++)
+    {
+        const uint32_t data_desc = blk * 2;
+        const uint32_t crc_desc = data_desc + 1;
+
+        g_sdio.dma_blocks[data_desc].write_addr = buffer + blk * SDIO_BLOCK_SIZE;
+        g_sdio.dma_blocks[data_desc].transfer_count = SDIO_BLOCK_SIZE / sizeof(uint32_t);
+
+        g_sdio.dma_blocks[crc_desc].write_addr = &g_sdio.received_checksums[blk];
+        g_sdio.dma_blocks[crc_desc].transfer_count = 2;
+    }
+
+    // An all-zero descriptor terminates the list
+    const uint32_t end_desc = num_blocks * 2;
+    g_sdio.dma_blocks[end_desc].write_addr = 0;
+    g_sdio.dma_blocks[end_desc].transfer_count = 0;
+
+    // First channel: moves words from the PIO RX fifo to memory, then chains to the second.
+    // Its destination and count are left at zero here; the second channel fills them in.
+    dma_channel_config data_cfg = dma_channel_get_default_config(SDIO_DMA_CH);
+    channel_config_set_transfer_data_size(&data_cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&data_cfg, false);
+    channel_config_set_write_increment(&data_cfg, true);
+    channel_config_set_dreq(&data_cfg, pio_get_dreq(SDIO_PIO, SDIO_DATA_SM, false));
+    channel_config_set_bswap(&data_cfg, true);
+    channel_config_set_chain_to(&data_cfg, SDIO_DMA_CHB);
+    dma_channel_configure(SDIO_DMA_CH, &data_cfg, 0, &SDIO_PIO->rxf[SDIO_DATA_SM], 0, false);
+
+    // Second channel: copies one two-word descriptor at a time into the first
+    // channel's write address / transfer count registers.
+    dma_channel_config ctrl_cfg = dma_channel_get_default_config(SDIO_DMA_CHB);
+    channel_config_set_transfer_data_size(&ctrl_cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&ctrl_cfg, true);
+    channel_config_set_write_increment(&ctrl_cfg, true);
+    channel_config_set_ring(&ctrl_cfg, true, 3);
+    dma_channel_configure(SDIO_DMA_CHB, &ctrl_cfg, &dma_hw->ch[SDIO_DMA_CH].al1_write_addr,
+        g_sdio.dma_blocks, 2, false);
+
+    // Load the reception program and make the four data pins inputs
+    pio_sm_init(SDIO_PIO, SDIO_DATA_SM, g_sdio.pio_data_rx_offset, &g_sdio.pio_cfg_data_rx);
+    pio_sm_set_consecutive_pindirs(SDIO_PIO, SDIO_DATA_SM, SDIO_D0, 4, false);
+
+    // Y register = nibbles per block (data plus checksum) minus one
+    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, SDIO_BLOCK_SIZE * 2 + 16 - 1);
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_out(pio_y, 32));
+
+    // The TX FIFO is not needed during reception, so join it to the RX FIFO.
+    // This gives more leeway for the DMA block switching.
+    SDIO_PIO->sm[SDIO_DATA_SM].shiftctrl |= PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS;
+
+    // Kick off the descriptor channel first, then the state machine
+    dma_channel_start(SDIO_DMA_CHB);
+    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, true);
+
+    return SDIO_OK;
+}
+
+// Verify checksums of blocks that have been received but not checked yet,
+// handling at most maxcount blocks per call. Mismatches are counted in
+// g_sdio.checksum_errors and only the first one is logged.
+static void sdio_verify_rx_checksums(uint32_t maxcount)
+{
+    for (; g_sdio.blocks_checksumed < g_sdio.blocks_done && maxcount > 0; maxcount--)
+    {
+        // Checksum computed over the data that landed in the buffer
+        int blockidx = g_sdio.blocks_checksumed++;
+        uint64_t checksum = sdio_crc16_4bit_checksum(g_sdio.data_buf + blockidx * SDIO_WORDS_PER_BLOCK,
+                                                     SDIO_WORDS_PER_BLOCK);
+
+        // Checksum sent by the card: byte-swap both halves and join them, upper word first
+        uint32_t upper = __builtin_bswap32(g_sdio.received_checksums[blockidx].top);
+        uint32_t lower = __builtin_bswap32(g_sdio.received_checksums[blockidx].bottom);
+        uint64_t expected = ((uint64_t)upper << 32) | lower;
+
+        if (checksum == expected)
+        {
+            continue;
+        }
+
+        g_sdio.checksum_errors++;
+        if (g_sdio.checksum_errors == 1)
+        {
+            azlog("SDIO checksum error in reception: block ", blockidx,
+                  " calculated ", checksum, " expected ", expected);
+        }
+    }
+}
+
+// Check progress of a reception started with rp2040_sdio_rx_start().
+// If bytes_complete is non-null it receives the byte count of fully received blocks.
+// Returns SDIO_BUSY while the transfer is running, SDIO_OK or SDIO_ERR_DATA_CRC once
+// it has finished, and SDIO_ERR_DATA_TIMEOUT if it has taken longer than 1000 ms.
+sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
+{
+    // Was everything done when the previous rx_poll() finished?
+    if (g_sdio.blocks_done >= g_sdio.total_blocks)
+    {
+        g_sdio.transfer_state = SDIO_IDLE;
+    }
+    else
+    {
+        // Use the idle time to calculate checksums
+        sdio_verify_rx_checksums(4);
+
+        // Check how many DMA control blocks have been consumed
+        uint32_t dma_ctrl_block_count = (dma_hw->ch[SDIO_DMA_CHB].read_addr - (uint32_t)&g_sdio.dma_blocks);
+        dma_ctrl_block_count /= sizeof(g_sdio.dma_blocks[0]);
+
+        // Compute how many complete 512 byte SDIO blocks have been transferred
+        // When transfer ends, dma_ctrl_block_count == g_sdio.total_blocks * 2 + 1
+        // A count of zero means no control block has been fetched yet. Without this
+        // guard the unsigned subtraction would wrap around and report a huge
+        // blocks_done, making the transfer look complete.
+        g_sdio.blocks_done = (dma_ctrl_block_count > 0) ? (dma_ctrl_block_count - 1) / 2 : 0;
+
+        // NOTE: When all blocks are done, rx_poll() still returns SDIO_BUSY once.
+        // This provides a chance to start the SCSI transfer before the last checksums
+        // are computed. Any checksum failures can be indicated in SCSI status after
+        // the data transfer has finished.
+    }
+
+    if (bytes_complete)
+    {
+        *bytes_complete = g_sdio.blocks_done * SDIO_BLOCK_SIZE;
+    }
+
+    if (g_sdio.transfer_state == SDIO_IDLE)
+    {
+        // Verify all remaining checksums.
+        sdio_verify_rx_checksums(g_sdio.total_blocks);
+
+        if (g_sdio.checksum_errors == 0)
+            return SDIO_OK;
+        else
+            return SDIO_ERR_DATA_CRC;
+    }
+    else if ((uint32_t)(millis() - g_sdio.transfer_start_time) > 1000)
+    {
+        // Log the state machine and DMA state to help diagnose the stall, then abort
+        azdbg("rp2040_sdio_rx_poll() timeout, "
+            "PIO PC: ", (int)pio_sm_get_pc(SDIO_PIO, SDIO_DATA_SM) - (int)g_sdio.pio_data_rx_offset,
+            " RXF: ", (int)pio_sm_get_rx_fifo_level(SDIO_PIO, SDIO_DATA_SM),
+            " TXF: ", (int)pio_sm_get_tx_fifo_level(SDIO_PIO, SDIO_DATA_SM),
+            " DMA CNT: ", dma_hw->ch[SDIO_DMA_CH].al2_transfer_count);
+        rp2040_sdio_stop();
+        return SDIO_ERR_DATA_TIMEOUT;
+    }
+
+    return SDIO_BUSY;
+}
+
+
+/*******************************************************
+ * Data transmission to SD card
+ *******************************************************/
+
+// Set up the PIO state machine and both DMA channels to transmit block number
+// g_sdio.blocks_done of g_sdio.data_buf, followed by the checksum stored in
+// g_sdio.next_wr_block_checksum and an end token.
+static void sdio_start_next_block_tx()
+{
+    // Load the transmission program into the data state machine
+    pio_sm_init(SDIO_PIO, SDIO_DATA_SM, g_sdio.pio_data_tx_offset, &g_sdio.pio_cfg_data_tx);
+
+    // First channel: feeds the block payload (512 bytes) to the TX FIFO and then
+    // chains to the second channel.
+    dma_channel_config tx_cfg = dma_channel_get_default_config(SDIO_DMA_CH);
+    channel_config_set_transfer_data_size(&tx_cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&tx_cfg, true);
+    channel_config_set_write_increment(&tx_cfg, false);
+    channel_config_set_dreq(&tx_cfg, pio_get_dreq(SDIO_PIO, SDIO_DATA_SM, true));
+    channel_config_set_bswap(&tx_cfg, true);
+    channel_config_set_chain_to(&tx_cfg, SDIO_DMA_CHB);
+    dma_channel_configure(SDIO_DMA_CH, &tx_cfg,
+        &SDIO_PIO->txf[SDIO_DATA_SM], g_sdio.data_buf + g_sdio.blocks_done * SDIO_WORDS_PER_BLOCK,
+        SDIO_WORDS_PER_BLOCK, false);
+
+    // Second channel: sends the 64-bit checksum (upper word first) and the end token.
+    // It reuses the configuration above with byte swapping switched off.
+    const uint64_t block_crc = g_sdio.next_wr_block_checksum;
+    g_sdio.end_token_buf[0] = (uint32_t)(block_crc >> 32);
+    g_sdio.end_token_buf[1] = (uint32_t)(block_crc >>  0);
+    g_sdio.end_token_buf[2] = 0xFFFFFFFF;
+    channel_config_set_bswap(&tx_cfg, false);
+    dma_channel_configure(SDIO_DMA_CHB, &tx_cfg,
+        &SDIO_PIO->txf[SDIO_DATA_SM], g_sdio.end_token_buf, 3, false);
+
+    // Clear any stale flag for the second channel, then let it raise IRQ 1 when the block is done
+    const uint32_t chb_irq_mask = 1 << SDIO_DMA_CHB;
+    dma_hw->ints1 = chb_irq_mask;
+    dma_set_irq1_channel_mask_enabled(chb_irq_mask, 1);
+
+    // X register = nibble count minus one (8 + 1024 + 16 + 1 - 1 = 1048),
+    // Y register = response bit count minus one
+    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, 1048);
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_out(pio_x, 32));
+    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, 31);
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_out(pio_y, 32));
+
+    // Drive all four data pins high, as outputs
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_set(pio_pins, 15));
+    pio_sm_exec(SDIO_PIO, SDIO_DATA_SM, pio_encode_set(pio_pindirs, 15));
+
+    // Queue the start token ahead of the payload, then start the DMA transfer
+    pio_sm_put(SDIO_PIO, SDIO_DATA_SM, 0xFFFFFFF0);
+    dma_channel_start(SDIO_DMA_CH);
+
+    // Finally let the state machine run
+    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, true);
+}
+
+// Compute the checksum of the next not-yet-checksummed block of g_sdio.data_buf
+// and store it in g_sdio.next_wr_block_checksum.
+static void sdio_compute_next_tx_checksum()
+{
+    // Only valid while there are still blocks left to send and to checksum
+    assert (g_sdio.blocks_done < g_sdio.total_blocks && g_sdio.blocks_checksumed < g_sdio.total_blocks);
+
+    const int next_block = g_sdio.blocks_checksumed++;
+    const int word_offset = next_block * SDIO_WORDS_PER_BLOCK;
+    g_sdio.next_wr_block_checksum = sdio_crc16_4bit_checksum(g_sdio.data_buf + word_offset,
+                                                             SDIO_WORDS_PER_BLOCK);
+}
+
+// Begin writing num_blocks blocks from buffer to the SD card.
+// The first block is started here; later blocks are started from the DMA
+// interrupt handler. Follow progress with rp2040_sdio_tx_poll().
+sdio_status_t rp2040_sdio_tx_start(const uint8_t *buffer, uint32_t num_blocks)
+{
+    // DMA reads 32-bit words, so the buffer must be word aligned
+    assert(((uint32_t)buffer & 3) == 0 && num_blocks <= SDIO_MAX_BLOCKS);
+
+    g_sdio.transfer_state = SDIO_TX;
+    g_sdio.transfer_start_time = millis();
+    g_sdio.data_buf = (uint32_t*)buffer;
+    g_sdio.blocks_done = 0;
+    g_sdio.total_blocks = num_blocks;
+    g_sdio.blocks_checksumed = 0;
+    g_sdio.checksum_errors = 0;
+
+    // The checksum of the first block has to be ready before the block is sent
+    sdio_compute_next_tx_checksum();
+    sdio_start_next_block_tx();
+
+    // While the first block is going out, prepare the checksum of the second one
+    const bool more_to_checksum = g_sdio.blocks_checksumed < g_sdio.total_blocks;
+    if (more_to_checksum)
+    {
+        sdio_compute_next_tx_checksum();
+    }
+
+    return SDIO_OK;
+}
+
+// Decode the status token that the card sends after a data block write.
+// Returns SDIO_OK for status 2; otherwise logs the raw token and returns
+// SDIO_ERR_WRITE_CRC (status 5) or SDIO_ERR_WRITE_FAIL (status 6 or anything else).
+sdio_status_t check_sdio_write_response(uint32_t card_response)
+{
+    // Strip leading 1 bits so that the start bit (0) ends up in the top bit.
+    // The format of response is poorly documented in SDIO spec but refer to e.g.
+    // http://my-cool-projects.blogspot.com/2013/02/the-mysterious-sd-card-crc-status.html
+    static const struct { uint32_t mask; int shift; } steps[] = {
+        {0xFFFF0000, 16},
+        {0xFF000000,  8},
+        {0xF0000000,  4},
+        {0xC0000000,  2},
+        {0x80000000,  1},
+    };
+
+    uint32_t aligned = card_response;
+    for (unsigned i = 0; i < sizeof(steps) / sizeof(steps[0]); i++)
+    {
+        // Shift only when every bit under the mask is set
+        if ((aligned & steps[i].mask) == steps[i].mask)
+        {
+            aligned <<= steps[i].shift;
+        }
+    }
+
+    // The three status bits directly follow the start bit
+    const uint32_t wr_status = (aligned >> 28) & 7;
+    switch (wr_status)
+    {
+        case 2:
+            return SDIO_OK;
+
+        case 5:
+            azlog("SDIO card reports write CRC error, status ", card_response);
+            return SDIO_ERR_WRITE_CRC;
+
+        case 6:
+            azlog("SDIO card reports write failure, status ", card_response);
+            return SDIO_ERR_WRITE_FAIL;
+
+        default:
+            azlog("SDIO card reports unknown write status ", card_response);
+            return SDIO_ERR_WRITE_FAIL;
+    }
+}
+
+// DMA_IRQ_1 handler for block writes: once a block has left the DMA it collects
+// the card's response token, checks it, and starts the next block or ends the transfer.
+static void rp2040_sdio_tx_irq()
+{
+    // Clear the second channel's flag in INTS1 (write-1-to-clear per the RP2040 datasheet)
+    dma_hw->ints1 = 1 << SDIO_DMA_CHB;
+
+    const bool block_sent = g_sdio.transfer_state == SDIO_TX
+                         && !dma_channel_is_busy(SDIO_DMA_CH)
+                         && !dma_channel_is_busy(SDIO_DMA_CHB);
+    if (block_sent)
+    {
+        // Payload, checksum and end token have all been handed to the PIO.
+        // When card is ready, PIO will put card response on RX fifo
+        g_sdio.transfer_state = SDIO_TX_WAIT_IDLE;
+        if (pio_sm_is_rx_fifo_empty(SDIO_PIO, SDIO_DATA_SM))
+        {
+            // Nothing there yet: let DMA fetch the single response word when it arrives
+            dma_channel_config resp_cfg = dma_channel_get_default_config(SDIO_DMA_CHB);
+            channel_config_set_transfer_data_size(&resp_cfg, DMA_SIZE_32);
+            channel_config_set_read_increment(&resp_cfg, false);
+            channel_config_set_write_increment(&resp_cfg, false);
+            channel_config_set_dreq(&resp_cfg, pio_get_dreq(SDIO_PIO, SDIO_DATA_SM, false));
+            dma_channel_configure(SDIO_DMA_CHB, &resp_cfg,
+                &g_sdio.card_response, &SDIO_PIO->rxf[SDIO_DATA_SM], 1, true);
+        }
+        else
+        {
+            // Card is already idle, the response can be read directly
+            g_sdio.card_response = pio_sm_get(SDIO_PIO, SDIO_DATA_SM);
+        }
+    }
+
+    // Nothing more to do unless a response is awaited and has actually arrived
+    if (g_sdio.transfer_state != SDIO_TX_WAIT_IDLE || dma_channel_is_busy(SDIO_DMA_CHB))
+    {
+        return;
+    }
+
+    // A rejected block ends the whole transfer
+    g_sdio.wr_status = check_sdio_write_response(g_sdio.card_response);
+    if (g_sdio.wr_status != SDIO_OK)
+    {
+        rp2040_sdio_stop();
+        return;
+    }
+
+    g_sdio.blocks_done++;
+    if (g_sdio.blocks_done >= g_sdio.total_blocks)
+    {
+        // That was the last block
+        rp2040_sdio_stop();
+        return;
+    }
+
+    sdio_start_next_block_tx();
+    g_sdio.transfer_state = SDIO_TX;
+
+    if (g_sdio.blocks_checksumed < g_sdio.total_blocks)
+    {
+        // Precompute the CRC for next block so that it is ready when
+        // we want to send it.
+        sdio_compute_next_tx_checksum();
+    }
+}
+
+// Check progress of a write started with rp2040_sdio_tx_start().
+// If bytes_complete is non-null it receives the byte count of blocks written so far.
+// Returns SDIO_BUSY while the transfer is running, the stored write status once it
+// is idle, and SDIO_ERR_DATA_TIMEOUT if it has taken longer than 1000 ms.
+sdio_status_t rp2040_sdio_tx_poll(uint32_t *bytes_complete)
+{
+    // A non-zero VECTACTIVE field means we are running inside an exception handler
+    // (e.g. hardfault). Call the IRQ handler by hand so the transfer keeps advancing.
+    const bool in_exception = (SCB->ICSR & SCB_ICSR_VECTACTIVE_Msk) != 0;
+    if (in_exception)
+    {
+        rp2040_sdio_tx_irq();
+    }
+
+    if (bytes_complete)
+    {
+        *bytes_complete = g_sdio.blocks_done * SDIO_BLOCK_SIZE;
+    }
+
+    if (g_sdio.transfer_state == SDIO_IDLE)
+    {
+        rp2040_sdio_stop();
+        return g_sdio.wr_status;
+    }
+
+    const uint32_t elapsed_ms = (uint32_t)(millis() - g_sdio.transfer_start_time);
+    if (elapsed_ms > 1000)
+    {
+        azdbg("rp2040_sdio_tx_poll() timeout, "
+            "PIO PC: ", (int)pio_sm_get_pc(SDIO_PIO, SDIO_DATA_SM) - (int)g_sdio.pio_data_tx_offset,
+            " RXF: ", (int)pio_sm_get_rx_fifo_level(SDIO_PIO, SDIO_DATA_SM),
+            " TXF: ", (int)pio_sm_get_tx_fifo_level(SDIO_PIO, SDIO_DATA_SM),
+            " DMA CNT: ", dma_hw->ch[SDIO_DMA_CH].al2_transfer_count);
+        rp2040_sdio_stop();
+        return SDIO_ERR_DATA_TIMEOUT;
+    }
+
+    return SDIO_BUSY;
+}
+
+// Abort any data transfer in progress and return to idle state.
+// Always returns SDIO_OK.
+sdio_status_t rp2040_sdio_stop()
+{
+    // Stop both DMA channels and disable the block-done interrupt
+    dma_channel_abort(SDIO_DMA_CH);
+    dma_channel_abort(SDIO_DMA_CHB);
+    const uint32_t chb_irq_mask = 1 << SDIO_DMA_CHB;
+    dma_set_irq1_channel_mask_enabled(chb_irq_mask, 0);
+
+    // Halt the data state machine and turn the four data pins back into inputs
+    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, false);
+    pio_sm_set_consecutive_pindirs(SDIO_PIO, SDIO_DATA_SM, SDIO_D0, 4, false);
+
+    g_sdio.transfer_state = SDIO_IDLE;
+    return SDIO_OK;
+}
+
+// (Re)initialize the SDIO interface: reset driver state, load the three PIO programs,
+// start the command/clock state machine and hook up the DMA interrupt.
+// clock_divider is used as the integer PIO clock divider of all three programs.
+void rp2040_sdio_init(int clock_divider)
+{
+    // The state machines and DMA channels are claimed only on the first call
+    static bool resources_claimed = false;
+    if (!resources_claimed)
+    {
+        pio_sm_claim(SDIO_PIO, SDIO_CMD_SM);
+        pio_sm_claim(SDIO_PIO, SDIO_DATA_SM);
+        dma_channel_claim(SDIO_DMA_CH);
+        dma_channel_claim(SDIO_DMA_CHB);
+        resources_claimed = true;
+    }
+
+    // Start from a clean slate: zeroed state, idle DMA, stopped state machines
+    memset(&g_sdio, 0, sizeof(g_sdio));
+
+    dma_channel_abort(SDIO_DMA_CH);
+    dma_channel_abort(SDIO_DMA_CHB);
+    pio_sm_set_enabled(SDIO_PIO, SDIO_CMD_SM, false);
+    pio_sm_set_enabled(SDIO_PIO, SDIO_DATA_SM, false);
+
+    // All programs are (re)loaded below, so empty the instruction memory first
+    pio_clear_instruction_memory(SDIO_PIO);
+
+    // --- Command & clock program: CMD is the out/in/set/jmp pin, CLK the side-set pin ---
+    g_sdio.pio_cmd_clk_offset = pio_add_program(SDIO_PIO, &sdio_cmd_clk_program);
+    pio_sm_config cmd_cfg = sdio_cmd_clk_program_get_default_config(g_sdio.pio_cmd_clk_offset);
+    sm_config_set_out_pins(&cmd_cfg, SDIO_CMD, 1);
+    sm_config_set_in_pins(&cmd_cfg, SDIO_CMD);
+    sm_config_set_set_pins(&cmd_cfg, SDIO_CMD, 1);
+    sm_config_set_jmp_pin(&cmd_cfg, SDIO_CMD);
+    sm_config_set_sideset_pins(&cmd_cfg, SDIO_CLK);
+    sm_config_set_out_shift(&cmd_cfg, false, true, 32);
+    sm_config_set_in_shift(&cmd_cfg, false, true, 32);
+    sm_config_set_clkdiv_int_frac(&cmd_cfg, clock_divider, 0);
+    sm_config_set_mov_status(&cmd_cfg, STATUS_TX_LESSTHAN, 2);
+
+    // This state machine is started right away, with CLK as an output
+    pio_sm_init(SDIO_PIO, SDIO_CMD_SM, g_sdio.pio_cmd_clk_offset, &cmd_cfg);
+    pio_sm_set_consecutive_pindirs(SDIO_PIO, SDIO_CMD_SM, SDIO_CLK, 1, true);
+    pio_sm_set_enabled(SDIO_PIO, SDIO_CMD_SM, true);
+
+    // --- Data reception program: only loaded and configured here ---
+    g_sdio.pio_data_rx_offset = pio_add_program(SDIO_PIO, &sdio_data_rx_program);
+    g_sdio.pio_cfg_data_rx = sdio_data_rx_program_get_default_config(g_sdio.pio_data_rx_offset);
+    sm_config_set_in_pins(&g_sdio.pio_cfg_data_rx, SDIO_D0);
+    sm_config_set_in_shift(&g_sdio.pio_cfg_data_rx, false, true, 32);
+    sm_config_set_out_shift(&g_sdio.pio_cfg_data_rx, false, true, 32);
+    sm_config_set_clkdiv_int_frac(&g_sdio.pio_cfg_data_rx, clock_divider, 0);
+
+    // --- Data transmission program: only loaded and configured here ---
+    g_sdio.pio_data_tx_offset = pio_add_program(SDIO_PIO, &sdio_data_tx_program);
+    g_sdio.pio_cfg_data_tx = sdio_data_tx_program_get_default_config(g_sdio.pio_data_tx_offset);
+    sm_config_set_in_pins(&g_sdio.pio_cfg_data_tx, SDIO_D0);
+    sm_config_set_set_pins(&g_sdio.pio_cfg_data_tx, SDIO_D0, 4);
+    sm_config_set_out_pins(&g_sdio.pio_cfg_data_tx, SDIO_D0, 4);
+    sm_config_set_in_shift(&g_sdio.pio_cfg_data_tx, false, false, 32);
+    sm_config_set_out_shift(&g_sdio.pio_cfg_data_tx, false, true, 32);
+    sm_config_set_clkdiv_int_frac(&g_sdio.pio_cfg_data_tx, clock_divider, 0);
+
+    // Bypass the input synchronizer on all SDIO pins to reduce input delay.
+    // Because the CLK is driven synchronously to CPU clock,
+    // there should be no metastability problems.
+    const uint32_t sync_bypass_mask = (1 << SDIO_CLK) | (1 << SDIO_CMD)
+                                    | (1 << SDIO_D0) | (1 << SDIO_D1) | (1 << SDIO_D2) | (1 << SDIO_D3);
+    SDIO_PIO->input_sync_bypass |= sync_bypass_mask;
+
+    // Hand the GPIOs over to the PIO block.
+    // NOTE(review): GPIO_FUNC_PIO1 is hard-coded; this assumes SDIO_PIO is pio1 - confirm.
+    const uint sdio_pins[] = { SDIO_CMD, SDIO_CLK, SDIO_D0, SDIO_D1, SDIO_D2, SDIO_D3 };
+    for (uint pin : sdio_pins)
+    {
+        gpio_set_function(pin, GPIO_FUNC_PIO1);
+    }
+
+    // Route DMA IRQ 1 to the block write handler
+    irq_set_exclusive_handler(DMA_IRQ_1, rp2040_sdio_tx_irq);
+    irq_set_enabled(DMA_IRQ_1, true);
+}

+ 52 - 0
lib/ZuluSCSI_platform_BS2/rp2040_sdio.h

@@ -0,0 +1,52 @@
+// SD card access using SDIO for RP2040 platform.
+// This module contains the low-level SDIO bus implementation using
+// the PIO peripheral. The high-level commands are in sd_card_sdio.cpp.
+
+#pragma once
+#include <stdint.h>
+
+enum sdio_status_t { // Result codes returned by the rp2040_sdio_* functions
+    SDIO_OK = 0,                   // Operation completed without error
+    SDIO_BUSY = 1,                 // Data transfer still in progress, poll again
+    SDIO_ERR_RESPONSE_TIMEOUT = 2, // Timed out waiting for response from card
+    SDIO_ERR_RESPONSE_CRC = 3,     // Response CRC is wrong
+    SDIO_ERR_RESPONSE_CODE = 4,    // Response command code does not match what was sent
+    SDIO_ERR_DATA_TIMEOUT = 5,     // Timed out waiting for data block
+    SDIO_ERR_DATA_CRC = 6,         // CRC for data packet is wrong
+    SDIO_ERR_WRITE_CRC = 7,        // Card reports bad CRC for write
+    SDIO_ERR_WRITE_FAIL = 8,       // Card reports write failure (also used for unknown write status)
+};
+
+#define SDIO_BLOCK_SIZE 512
+#define SDIO_WORDS_PER_BLOCK 128
+
+// Execute a command that has 48-bit reply (response types R1, R6, R7); the reply CRC and command code are verified (code check is skipped for command 41).
+// If response is NULL, does not wait for reply.
+sdio_status_t rp2040_sdio_command_R1(uint8_t command, uint32_t arg, uint32_t *response);
+
+// Execute a command that has 136-bit reply (response type R2); the CRC and the 0x3F reply code are verified.
+// Response buffer should have space for 16 bytes: bytes 0..14 receive the payload and byte 15 the CRC byte.
+sdio_status_t rp2040_sdio_command_R2(uint8_t command, uint32_t arg, uint8_t *response);
+
+// Execute a command that has 48-bit reply but without CRC (response R3); neither CRC nor command code is checked.
+sdio_status_t rp2040_sdio_command_R3(uint8_t command, uint32_t arg, uint32_t *response);
+
+// Start transferring data from SD card to memory buffer; returns immediately, use rp2040_sdio_rx_poll() to wait.
+// Transfer block size is always 512 bytes. The buffer must be 4-byte aligned and num_blocks at most SDIO_MAX_BLOCKS (asserted).
+sdio_status_t rp2040_sdio_rx_start(uint8_t *buffer, uint32_t num_blocks);
+
+// Check if reception is complete. If bytes_complete is non-null, it receives the byte count of fully received blocks.
+// Returns SDIO_BUSY while transferring, SDIO_OK when done and error on failure (SDIO_ERR_DATA_CRC, or SDIO_ERR_DATA_TIMEOUT after 1000 ms).
+sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete = nullptr);
+
+// Start transferring data from memory to SD card. The buffer must be 4-byte aligned and num_blocks at most SDIO_MAX_BLOCKS (asserted).
+sdio_status_t rp2040_sdio_tx_start(const uint8_t *buffer, uint32_t num_blocks);
+
+// Check if transmission is complete. Returns SDIO_BUSY while transferring, the stored write status once idle, or SDIO_ERR_DATA_TIMEOUT after 1000 ms.
+sdio_status_t rp2040_sdio_tx_poll(uint32_t *bytes_complete = nullptr);
+
+// Force everything to idle state: aborts both DMA channels and stops the data state machine. Always returns SDIO_OK.
+sdio_status_t rp2040_sdio_stop();
+
+// (Re)initialize the SDIO interface. clock_divider is used as the integer PIO clock divider for all SDIO programs.
+void rp2040_sdio_init(int clock_divider = 1);

+ 145 - 0
lib/ZuluSCSI_platform_BS2/rp2040_sdio.pio

@@ -0,0 +1,145 @@
+; RP2040 PIO program for implementing SD card access in SDIO mode
+; Run "pioasm rp2040_sdio.pio rp2040_sdio.pio.h" to regenerate the C header from this.
+
+; The RP2040 official work-in-progress code at
+; https://github.com/raspberrypi/pico-extras/tree/master/src/rp2_common/pico_sd_card
+; may be useful reference, but this is independent implementation.
+;
+; For official SDIO specifications, refer to:
+; https://www.sdcard.org/downloads/pls/
+; "SDIO Physical Layer Simplified Specification Version 8.00"
+
+; Clock settings
+; For 3.3V communication the available speeds are:
+; - Default speed: max. 25 MHz clock
+; - High speed:    max. 50 MHz clock
+;
+; From the default RP2040 clock speed of 125 MHz, the closest dividers
+; are 3 for 41.7 MHz and 5 for 25 MHz. The CPU can apply further divider
+; through state machine registers for the initial handshake.
+;
+; Because data is written on the falling edge and read on the rising
+; edge, it is preferable to have a long 0 state and short 1 state.
+;.define CLKDIV 3
+.define CLKDIV 5
+.define D0 ((CLKDIV + 1) / 2 - 1)
+.define D1 (CLKDIV/2 - 1)
+.define SDIO_CLK_GPIO 10
+
+; State machine 0 is used to:
+; - generate continuous clock on SDIO_CLK
+; - send CMD packets
+; - receive response packets
+;
+; Pin mapping for this state machine:
+; - Sideset    : CLK
+; - IN/OUT/SET : CMD
+; - JMP_PIN    : CMD
+;
+; The commands to send are put on TX fifo and must have two words:
+; Word 0 bits 31-24: Number of bits in command minus one (usually 47)
+; Word 0 bits 23-00: First 24 bits of the command packet, shifted out MSB first
+; Word 1 bits 31-08: Last 24 bits of the command packet, shifted out MSB first
+; Word 1 bits 07-00: Number of bits in response minus one (usually 47), or 0 if no response
+;
+; The response is put on RX fifo, starting with the MSB.
+; Partial last word will be padded with zero bits at the top.
+;
+; The state machine EXECCTRL should be set so that STATUS indicates TX FIFO < 2
+; and that AUTOPULL and AUTOPUSH are enabled.
+
+.program sdio_cmd_clk
+    .side_set 1                        ; One side-set bit, used to drive CLK
+
+    mov OSR, NULL       side 1 [D1]    ; Make sure OSR is full of zeros to prevent autopull
+
+wait_cmd:
+    mov Y, !STATUS      side 0 [D0]    ; Check if TX FIFO has data
+    jmp !Y wait_cmd     side 1 [D1]
+
+load_cmd:
+    out NULL, 32        side 0 [D0]    ; Load first word (trigger autopull)
+    out X, 8            side 1 [D1]    ; Number of bits to send
+    set pins, 1         side 0 [D0]    ; Initial state of CMD is high
+    set pindirs, 1      side 1 [D1]    ; Set SDIO_CMD as output
+
+send_cmd:
+    out pins, 1         side 0 [D0]    ; Write output on falling edge of CLK
+    jmp X-- send_cmd    side 1 [D1]
+
+prep_resp:
+    set pindirs, 0      side 0 [D0]    ; Set SDIO_CMD as input
+    out X, 8            side 1 [D1]    ; Get number of bits in response
+    nop                 side 0 [D0]    ; For clock alignment
+    jmp !X resp_done    side 1 [D1]    ; Check if we expect a response
+
+wait_resp:
+    nop                  side 0 [D0]    ; CLK low phase while polling for the start bit
+    jmp PIN wait_resp    side 1 [D1]    ; Loop until SDIO_CMD = 0
+
+    ; Note: input bits are read at the same time as we write CLK=0.
+    ; Because the host controls the clock, the read happens before
+    ; the card sees the falling clock edge. This gives maximum time
+    ; for the data bit to settle.
+read_resp:
+    in PINS, 1          side 0 [D0]    ; Read input data bit
+    jmp X-- read_resp   side 1 [D1]    ; Loop to receive all data bits
+
+resp_done:
+    push                side 0 [D0]    ; Push the remaining part of response, then wrap back to the first instruction
+
+; State machine 1 is used to send and receive data blocks.
+; Pin mapping for this state machine:
+; - IN / OUT: SDIO_D0-D3
+; - GPIO defined at beginning of this file: SDIO_CLK
+
+; Data reception program
+; This program will wait for initial start of block token and then
+; receive a data block. The application must set number of nibbles
+; to receive minus 1 to Y register before running this program.
+.program sdio_data_rx
+
+wait_start:
+    mov X, Y                               ; Reinitialize number of nibbles to receive
+    wait 0 pin 0                           ; Wait for zero state on D0
+    wait 1 gpio SDIO_CLK_GPIO  [CLKDIV-1]  ; Wait for rising edge and then whole clock cycle
+
+rx_data:
+    in PINS, 4                 [CLKDIV-2]  ; Read nibble
+    jmp X--, rx_data                       ; Loop until all nibbles are received (presumably then wraps to wait_start for the next block)
+
+; Data transmission program
+;
+; Before running this program, pindirs should be set as output
+; and register X should be initialized with the number of nibbles
+; to send minus 1 (typically 8 + 1024 + 16 + 1 - 1 = 1048)
+; and register Y with the number of response bits minus 1 (typically 31).
+;
+; Words written to TX FIFO must be:
+; - Word 0: start token 0xFFFFFFF0
+; - Word 1-128: transmitted data (512 bytes)
+; - Word 129-130: CRC checksum
+; - Word 131: end token 0xFFFFFFFF
+;
+; After the card reports idle status, RX FIFO will get a word that
+; contains the D0 line response from card.
+
+.program sdio_data_tx
+    wait 0 gpio SDIO_CLK_GPIO  ; Wait until CLK is low
+    wait 1 gpio SDIO_CLK_GPIO  [CLKDIV + D1 - 1]; Synchronize so that write occurs on falling edge
+
+tx_loop:
+    out PINS, 4                [D0]    ; Write nibble and wait for whole clock cycle
+    jmp X-- tx_loop            [D1]    ; Loop until all nibbles are sent
+
+    set pindirs, 0x00          [D0]    ; Set data bus as input
+
+.wrap_target
+response_loop:
+    in PINS, 1                 [D1]    ; Read D0 on rising edge
+    jmp Y--, response_loop     [D0]    ; Loop until all response bits are read
+
+wait_idle:
+    wait 1 pin 0               [D1]    ; Wait for card to indicate idle condition
+    push                       [D0]    ; Push the response token
+.wrap

+ 121 - 0
lib/ZuluSCSI_platform_BS2/rp2040_sdio.pio.h

@@ -0,0 +1,121 @@
+// -------------------------------------------------- //
+// This file is autogenerated by pioasm; do not edit! //
+// -------------------------------------------------- //
+
+#pragma once
+
+#if !PICO_NO_HARDWARE
+#include "hardware/pio.h"
+#endif
+
+// ------------ //
+// sdio_cmd_clk //
+// ------------ //
+
+#define sdio_cmd_clk_wrap_target 0
+#define sdio_cmd_clk_wrap 17
+
+static const uint16_t sdio_cmd_clk_program_instructions[] = {
+            //     .wrap_target
+    0xb1e3, //  0: mov    osr, null       side 1 [1] 
+    0xa24d, //  1: mov    y, !status      side 0 [2] 
+    0x1161, //  2: jmp    !y, 1           side 1 [1] 
+    0x6260, //  3: out    null, 32        side 0 [2] 
+    0x7128, //  4: out    x, 8            side 1 [1] 
+    0xe201, //  5: set    pins, 1         side 0 [2] 
+    0xf181, //  6: set    pindirs, 1      side 1 [1] 
+    0x6201, //  7: out    pins, 1         side 0 [2] 
+    0x1147, //  8: jmp    x--, 7          side 1 [1] 
+    0xe280, //  9: set    pindirs, 0      side 0 [2] 
+    0x7128, // 10: out    x, 8            side 1 [1] 
+    0xa242, // 11: nop                    side 0 [2] 
+    0x1131, // 12: jmp    !x, 17          side 1 [1] 
+    0xa242, // 13: nop                    side 0 [2] 
+    0x11cd, // 14: jmp    pin, 13         side 1 [1] 
+    0x4201, // 15: in     pins, 1         side 0 [2] 
+    0x114f, // 16: jmp    x--, 15         side 1 [1] 
+    0x8220, // 17: push   block           side 0 [2] 
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program sdio_cmd_clk_program = {
+    .instructions = sdio_cmd_clk_program_instructions,
+    .length = 18,
+    .origin = -1,
+};
+
+static inline pio_sm_config sdio_cmd_clk_program_get_default_config(uint offset) {
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + sdio_cmd_clk_wrap_target, offset + sdio_cmd_clk_wrap);
+    sm_config_set_sideset(&c, 1, false, false);
+    return c;
+}
+#endif
+
+// ------------ //
+// sdio_data_rx //
+// ------------ //
+
+#define sdio_data_rx_wrap_target 0
+#define sdio_data_rx_wrap 4
+
+static const uint16_t sdio_data_rx_program_instructions[] = {
+            //     .wrap_target
+    0xa022, //  0: mov    x, y                       
+    0x2020, //  1: wait   0 pin, 0                   
+    0x248A, //  2: wait   1 gpio, 10             [4] 
+    0x4304, //  3: in     pins, 4                [3] 
+    0x0043, //  4: jmp    x--, 3                     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program sdio_data_rx_program = {
+    .instructions = sdio_data_rx_program_instructions,
+    .length = 5,
+    .origin = -1,
+};
+
+static inline pio_sm_config sdio_data_rx_program_get_default_config(uint offset) {
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + sdio_data_rx_wrap_target, offset + sdio_data_rx_wrap);
+    return c;
+}
+#endif
+
+// ------------ //
+// sdio_data_tx //
+// ------------ //
+
+#define sdio_data_tx_wrap_target 5
+#define sdio_data_tx_wrap 8
+
+static const uint16_t sdio_data_tx_program_instructions[] = {
+    0x200A, //  0: wait   0 gpio, 10                 
+    0x258A, //  1: wait   1 gpio, 10             [5] 
+    0x6204, //  2: out    pins, 4                [2] 
+    0x0142, //  3: jmp    x--, 2                 [1] 
+    0xe280, //  4: set    pindirs, 0             [2] 
+            //     .wrap_target
+    0x4101, //  5: in     pins, 1                [1] 
+    0x0285, //  6: jmp    y--, 5                 [2] 
+    0x21a0, //  7: wait   1 pin, 0               [1] 
+    0x8220, //  8: push   block                  [2] 
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program sdio_data_tx_program = {
+    .instructions = sdio_data_tx_program_instructions,
+    .length = 9,
+    .origin = -1,
+};
+
+static inline pio_sm_config sdio_data_tx_program_get_default_config(uint offset) {
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + sdio_data_tx_wrap_target, offset + sdio_data_tx_wrap);
+    return c;
+}
+#endif
+

+ 13 - 0
lib/ZuluSCSI_platform_BS2/scsi2sd_time.h

@@ -0,0 +1,13 @@
+// Timing functions for SCSI2SD.
+// This file is derived from time.h in SCSI2SD-V6.
+
+#pragma once
+
+#include <stdint.h>
+#include "ZuluSCSI_platform.h"
+
+// Current time in milliseconds, as reported by millis().
+#define s2s_getTime_ms() millis()
+// Milliseconds elapsed since the timestamp `since`, truncated to uint32_t.
+#define s2s_elapsedTime_ms(since) ((uint32_t)(millis() - (since)))
+// Delay helpers, all implemented on top of delay_ns().
+// The argument is parenthesized before scaling so that an expression such as
+// s2s_delay_us(a + b) is converted as a whole instead of only its last term.
+// NOTE(review): the product is computed in the argument's type; large
+// millisecond values may overflow a 32-bit nanosecond count - confirm the
+// parameter type of delay_ns() before passing multi-second delays.
+#define s2s_delay_ms(x) delay_ns((x) * 1000000)
+#define s2s_delay_us(x) delay_ns((x) * 1000)
+#define s2s_delay_ns(x) delay_ns(x)

+ 266 - 0
lib/ZuluSCSI_platform_BS2/scsiHostPhy.cpp

@@ -0,0 +1,266 @@
+#include "scsiHostPhy.h"
+#include "ZuluSCSI_platform.h"
+#include "ZuluSCSI_log.h"
+#include "ZuluSCSI_log_trace.h"
+#include "scsi_accel_host.h"
+#include <assert.h>
+
+#include <scsi2sd.h>
+extern "C" {
+#include <scsi.h>
+}
+
+volatile int g_scsiHostPhyReset;
+
+// Release bus, initialize PHY to host mode, pulse RST signal and clear the pending reset request flag.
+void scsiHostPhyReset(void)
+{
+    SCSI_RELEASE_OUTPUTS();
+    SCSI_ENABLE_INITIATOR();
+
+    scsi_accel_host_init();
+
+    SCSI_OUT(RST, 1); // Assert RST
+    delay(2); // Keep RST asserted; unit is presumably milliseconds (Arduino-style delay) - TODO confirm
+    SCSI_OUT(RST, 0); // Release RST
+    delay(250); // Pause after the reset pulse, presumably to let devices on the bus recover - TODO confirm
+    g_scsiHostPhyReset = false; // The requested reset has now been carried out
+}
+
+// Select a device, id 0-7. The id is used as a bit position on the data bus and is not range checked here.
+// Returns true if the target answers to selection request, false if the bus is busy or nobody answers.
+bool scsiHostPhySelect(int target_id)
+{
+    SCSI_RELEASE_OUTPUTS();
+
+    // We can't write individual data bus bits, so use a slightly modified
+    // arbitration scheme. We always yield to any other initiator on
+    // the bus.
+    scsiLogInitiatorPhaseChange(BUS_BUSY);
+    SCSI_OUT(BSY, 1);
+    for (int wait = 0; wait < 10; wait++) // Watch the data bus for 10 x 1 us
+    {
+        delayMicroseconds(1);
+
+        if (SCSI_IN_DATA() != 0) // Any data bit set is taken as another device using the bus
+        {
+            azdbg("scsiHostPhySelect: bus is busy");
+            scsiLogInitiatorPhaseChange(BUS_FREE);
+            SCSI_RELEASE_OUTPUTS();
+            return false;
+        }
+    }
+
+    // Selection phase
+    scsiLogInitiatorPhaseChange(SELECTION);
+    azdbg("------ SELECTING ", target_id);
+    SCSI_OUT(SEL, 1);
+    delayMicroseconds(5);
+    SCSI_OUT_DATA(1 << target_id); // Only the bit of the target id is driven
+    delayMicroseconds(5);
+    SCSI_OUT(BSY, 0);
+
+    // Wait for target to respond
+    for (int wait = 0; wait < 2500; wait++) // 2500 x 100 us = about 250 ms selection timeout
+    {
+        delayMicroseconds(100);
+        if (SCSI_IN(BSY))
+        {
+            break;
+        }
+    }
+
+    if (!SCSI_IN(BSY))
+    {
+        // No response
+        SCSI_RELEASE_OUTPUTS();
+        return false;
+    }
+
+    // We need to assert OUT_BSY to enable IO buffer U105 to read status signals.
+    SCSI_RELEASE_DATA_REQ();
+    SCSI_OUT(BSY, 1);
+    SCSI_OUT(SEL, 0);
+    return true;
+}
+
+// Read the current communication phase as signaled by the target. Returns BUS_FREE, BUS_BUSY or the CD/IO/MSG phase bits.
+int scsiHostPhyGetPhase()
+{
+    static absolute_time_t last_online_time; // Last time the target was seen to be on the bus
+
+    if (g_scsiHostPhyReset)
+    {
+        // Reset request from watchdog timer
+        scsiHostPhyRelease();
+        return BUS_FREE;
+    }
+
+    int phase = 0;
+    bool req_in = SCSI_IN(REQ); // Sampled before the phase lines; used below to decide whether to report the phase
+    if (SCSI_IN(CD)) phase |= __scsiphase_cd;
+    if (SCSI_IN(IO)) phase |= __scsiphase_io;
+    if (SCSI_IN(MSG)) phase |= __scsiphase_msg;
+
+    if (phase == 0 && absolute_time_diff_us(last_online_time, get_absolute_time()) > 100) // At most one presence check per 100 us
+    {
+        // Disable OUT_BSY for a short time to see if the target is still on line
+        SCSI_OUT(BSY, 0);
+        delayMicroseconds(1);
+
+        if (!SCSI_IN(BSY))
+        {
+            scsiLogInitiatorPhaseChange(BUS_FREE); // Target has left the bus; OUT_BSY is left deasserted
+            return BUS_FREE;
+        }
+
+        // Still online, re-enable OUT_BSY to enable IO buffers
+        SCSI_OUT(BSY, 1);
+        last_online_time = get_absolute_time();
+    }
+    else if (phase != 0)
+    {
+        last_online_time = get_absolute_time(); // A non-zero phase counts as proof that the target is online
+    }
+
+    if (!req_in)
+    {
+        // Don't act on phase changes until target asserts request signal.
+        // This filters out any spurious changes on control signals.
+        return BUS_BUSY;
+    }
+    else
+    {
+        scsiLogInitiatorPhaseChange(phase);
+        return phase;
+    }
+}
+
+bool scsiHostRequestWaiting() // Reports whether the REQ input currently reads as asserted
+{
+    return SCSI_IN(REQ);
+}
+
+// Blocking data transfer
+#define SCSIHOST_WAIT_ACTIVE(pin) \
+  if (!SCSI_IN(pin)) { \
+    if (!SCSI_IN(pin)) { \
+      while(!SCSI_IN(pin) && !g_scsiHostPhyReset); \
+    } \
+  }
+
+#define SCSIHOST_WAIT_INACTIVE(pin) \
+  if (SCSI_IN(pin)) { \
+    if (SCSI_IN(pin)) { \
+      while(SCSI_IN(pin) && !g_scsiHostPhyReset); \
+    } \
+  }
+
+// Write one byte to SCSI target using the REQ/ACK handshake mechanism
+static inline void scsiHostWriteOneByte(uint8_t value)
+{
+    SCSIHOST_WAIT_ACTIVE(REQ); // Wait for the target to request a byte (gives up if a reset is requested)
+    SCSI_OUT_DATA(value);
+    delay_100ns(); // DB setup time before ACK
+    SCSI_OUT(ACK, 1);
+    SCSIHOST_WAIT_INACTIVE(REQ); // Wait for the target to release REQ (gives up if a reset is requested)
+    SCSI_RELEASE_DATA_REQ();
+    SCSI_OUT(ACK, 0);
+}
+
+// Read one byte from SCSI target using the handshake mechanism.
+//
+// parityError: optional. When non-NULL, the raw value read from the data lines
+//              is compared against g_scsi_parity_lookup and *parityError is set
+//              to 1 on mismatch. The flag is never cleared here, so it
+//              accumulates over multiple calls.
+// Returns the low 8 bits of the value read from the data lines.
+static inline uint8_t scsiHostReadOneByte(int* parityError)
+{
+    SCSIHOST_WAIT_ACTIVE(REQ);
+    uint16_t r = SCSI_IN_DATA();
+    SCSI_OUT(ACK, 1);
+    SCSIHOST_WAIT_INACTIVE(REQ);
+    SCSI_OUT(ACK, 0);
+
+    if (parityError && r != (g_scsi_parity_lookup[r & 0xFF] ^ SCSI_IO_DATA_MASK))
+    {
+        // Name this function in the log so that initiator-side parity errors can
+        // be told apart from the target-side scsiReadOneByte() in scsiPhy.cpp.
+        azlog("Parity error in scsiHostReadOneByte(): ", (uint32_t)r);
+        *parityError = 1;
+    }
+
+    return (uint8_t)r;
+}
+
+uint32_t scsiHostWrite(const uint8_t *data, uint32_t count) // Sends up to count bytes; returns the number of bytes actually sent
+{
+    scsiLogDataOut(data, count);
+
+    int cd_start = SCSI_IN(CD); // Remember the CD and MSG states at entry to detect a phase change later
+    int msg_start = SCSI_IN(MSG);
+
+    for (uint32_t i = 0; i < count; i++)
+    {
+        while (!SCSI_IN(REQ)) // Poll until the target requests the next byte
+        {
+            if (g_scsiHostPhyReset || SCSI_IN(IO) || SCSI_IN(CD) != cd_start || SCSI_IN(MSG) != msg_start)
+            {
+                // Target switched out of DATA_OUT mode
+                azlog("scsiHostWrite: sent ", (int)i, " bytes, expected ", (int)count);
+                return i;
+            }
+        }
+
+        scsiHostWriteOneByte(data[i]);
+    }
+
+    return count;
+}
+
+// Read up to `count` bytes from the target into `data`.
+//
+// Uses the accelerated routine when both the byte count and the buffer address
+// are even, otherwise falls back to byte-by-byte handshaking.
+//
+// Returns the number of bytes received, which is less than `count` if the
+// target leaves the data-in phase early. Returns 0 if a reset was requested
+// through g_scsiHostPhyReset or if a parity error was detected.
+uint32_t scsiHostRead(uint8_t *data, uint32_t count)
+{
+    int parityError = 0;
+    uint32_t fullcount = count;
+
+    // Remember the CD and MSG states at entry to detect a phase change later
+    int cd_start = SCSI_IN(CD);
+    int msg_start = SCSI_IN(MSG);
+
+    if ((count & 1) == 0 && ((uint32_t)data & 1) == 0)
+    {
+        // Even number of bytes at an even address, use accelerated routine
+        count = scsi_accel_host_read(data, count, &parityError, &g_scsiHostPhyReset);
+    }
+    else
+    {
+        for (uint32_t i = 0; i < count; i++)
+        {
+            bool phase_ended = false;
+
+            while (!SCSI_IN(REQ))
+            {
+                if (g_scsiHostPhyReset || !SCSI_IN(IO) || SCSI_IN(CD) != cd_start || SCSI_IN(MSG) != msg_start)
+                {
+                    // Target switched out of DATA_IN mode
+                    phase_ended = true;
+                    break;
+                }
+            }
+
+            if (phase_ended)
+            {
+                // Stop here instead of continuing to wait for REQ: otherwise a
+                // pending reset request with REQ never asserted would spin
+                // forever, and a REQ belonging to the next phase would be
+                // acknowledged and its byte consumed as data.
+                count = i;
+                break;
+            }
+
+            data[i] = scsiHostReadOneByte(&parityError);
+        }
+    }
+
+    scsiLogDataIn(data, count);
+
+    if (g_scsiHostPhyReset || parityError)
+    {
+        return 0;
+    }
+    else
+    {
+        if (count < fullcount)
+        {
+            azlog("scsiHostRead: received ", (int)count, " bytes, expected ", (int)fullcount);
+        }
+
+        return count;
+    }
+}
+
+// Release all bus signals and log the transition to BUS_FREE
+void scsiHostPhyRelease()
+{
+    scsiLogInitiatorPhaseChange(BUS_FREE);
+    SCSI_RELEASE_OUTPUTS();
+}

+ 32 - 0
lib/ZuluSCSI_platform_BS2/scsiHostPhy.h

@@ -0,0 +1,32 @@
+// Host side SCSI physical interface.
+// Used in initiator to interface to an SCSI drive.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+
+// Request to stop activity and reset the bus
+extern volatile int g_scsiHostPhyReset;
+
+// Release bus and pulse RST signal, initialize PHY to host mode.
+void scsiHostPhyReset(void);
+
+// Select a device, id 0-7.
+// Returns true if the target answers to selection request.
+bool scsiHostPhySelect(int target_id);
+
+// Read the current communication phase as signaled by the target
+// Matches SCSI_PHASE enumeration from scsi.h.
+int scsiHostPhyGetPhase();
+
+// Returns true if the device has asserted REQ signal, i.e. data waiting
+bool scsiHostRequestWaiting();
+
+// Blocking data transfer
+// These return the actual number of bytes transferred; scsiHostRead() returns 0 instead if a reset was requested or a parity error was detected.
+uint32_t scsiHostWrite(const uint8_t *data, uint32_t count);
+uint32_t scsiHostRead(uint8_t *data, uint32_t count);
+
+// Release all bus signals
+void scsiHostPhyRelease();

+ 384 - 0
lib/ZuluSCSI_platform_BS2/scsiPhy.cpp

@@ -0,0 +1,384 @@
+// Implements the low level interface to SCSI bus
+// Partially derived from scsiPhy.c from SCSI2SD-V6
+
+#include "scsiPhy.h"
+#include "ZuluSCSI_platform.h"
+#include "ZuluSCSI_log.h"
+#include "ZuluSCSI_log_trace.h"
+#include "ZuluSCSI_config.h"
+#include "scsi_accel_rp2040.h"
+#include "hardware/structs/iobank0.h"
+
+#include <scsi2sd.h>
+extern "C" {
+#include <scsi.h>
+#include <scsi2sd_time.h>
+}
+
+/***********************/
+/* SCSI status signals */
+/***********************/
+
+extern "C" bool scsiStatusATN() // Current state of the ATN input as read by SCSI_IN()
+{
+    return SCSI_IN(ATN);
+}
+
+extern "C" bool scsiStatusBSY() // Current state of the BSY input as read by SCSI_IN()
+{
+    return SCSI_IN(BSY);
+}
+
+/************************/
+/* SCSI selection logic */
+/************************/
+
+volatile uint8_t g_scsi_sts_selection;
+volatile uint8_t g_scsi_ctrl_bsy;
+
+// Selection detection, called from the GPIO interrupt handler.
+// When SEL is asserted while BSY is not, the data bus is compared against the
+// ids of the configured targets and the result is latched into
+// g_scsi_sts_selection and scsiDev.selFlag.
+void scsi_bsy_deassert_interrupt()
+{
+    // Nothing to do unless SEL is asserted and BSY is released
+    if (!SCSI_IN(SEL) || SCSI_IN(BSY))
+    {
+        return;
+    }
+
+    // Look for the first simulated target whose id bit is set on the data bus
+    uint8_t data_lines = SCSI_IN_DATA();
+    int selected = -1;
+    for (int idx = 0; idx < S2S_MAX_TARGETS; idx++)
+    {
+        // Skip slots with an id outside 0-7 or without configuration
+        if (scsiDev.targets[idx].targetId > 7 || !scsiDev.targets[idx].cfg)
+        {
+            continue;
+        }
+
+        if (data_lines & (1 << scsiDev.targets[idx].targetId))
+        {
+            selected = scsiDev.targets[idx].targetId;
+            break;
+        }
+    }
+
+    if (selected >= 0)
+    {
+        // The ATN flag is set unconditionally at this point; the real value is
+        // only known after OUT_BSY is enabled in scsiStatusSEL().
+        g_scsi_sts_selection = SCSI_STS_SELECTION_SUCCEEDED | SCSI_STS_SELECTION_ATN | selected;
+    }
+
+    // Latching selFlag is required for Philips P2000C, which releases SEL after
+    // 600ns without waiting for BSY, and for some early Mac Plus roms.
+    scsiDev.selFlag = *SCSI_STS_SELECTED;
+}
+
+extern "C" bool scsiStatusSEL() // Returns the SEL input state; also applies a pending BSY assert request from g_scsi_ctrl_bsy
+{
+    if (g_scsi_ctrl_bsy)
+    {
+        // We don't have direct register access to BSY bit like SCSI2SD scsi.c expects.
+        // Instead update the state here.
+        // Releasing happens with bus release.
+        g_scsi_ctrl_bsy = 0;
+        // @TODO See if needed
+        SCSI_OUT(CD, 0);
+        SCSI_OUT(MSG, 0);
+        SCSI_ENABLE_CONTROL_OUT();
+        // @TODO end
+        SCSI_OUT(BSY, 1);
+
+        // On RP2040 hardware the ATN signal is only available after OUT_BSY enables
+        // the IO buffer U105, so check the signal status here.
+        delay_100ns();
+        if (!scsiStatusATN())
+        {
+            // ATN is not asserted: treat this as a SCSI1 host that does not send IDENTIFY message
+            scsiDev.atnFlag = 0;
+            scsiDev.target->unitAttention = 0;
+            scsiDev.compatMode = COMPAT_SCSI1;
+        }
+    }
+
+    return SCSI_IN(SEL);
+}
+
+/************************/
+/* SCSI bus reset logic */
+/************************/
+
+// RST interrupt handler: flags a bus reset in scsiDev.resetFlag when the RST
+// input reads asserted on two samples taken 500 ns apart.
+static void scsi_rst_assert_interrupt()
+{
+    // Glitch filtering: sample the line twice with a short pause in between
+    const bool first_sample = SCSI_IN(RST);
+    delay_ns(500);
+    const bool second_sample = SCSI_IN(RST);
+
+    if (!first_sample || !second_sample)
+    {
+        // Line was not stably asserted, ignore the event
+        return;
+    }
+
+    azdbg("BUS RESET");
+    scsiDev.resetFlag = 1;
+}
+
+// Shared GPIO interrupt callback for the BSY, SEL and RST input pins.
+// gpio:   number of the pin that triggered the interrupt
+// events: event mask passed by the GPIO library (not used)
+static void scsiPhyIRQ(uint gpio, uint32_t events)
+{
+    const bool is_selection_pin = (gpio == SCSI_IN_BSY) || (gpio == SCSI_IN_SEL);
+
+    if (is_selection_pin)
+    {
+        // Handle BSY / SEL interrupts only when the OUT_BSY output bit is set,
+        // i.e. when we are not driving OUT_BSY low ourselves.
+        // The BSY input pin may be shared with other signals.
+        const bool out_bsy_bit_set = (sio_hw->gpio_out & (1 << SCSI_OUT_BSY)) != 0;
+        if (out_bsy_bit_set)
+        {
+            scsi_bsy_deassert_interrupt();
+        }
+        return;
+    }
+
+    if (gpio == SCSI_IN_RST)
+    {
+        scsi_rst_assert_interrupt();
+    }
+}
+
+// This function is called to initialize the phy code.
+// It is called after power-on and after SCSI bus reset.
+extern "C" void scsiPhyReset(void)
+{
+    SCSI_RELEASE_OUTPUTS();
+    g_scsi_sts_selection = 0; // Forget any latched selection state
+    g_scsi_ctrl_bsy = 0;
+
+    scsi_accel_rp2040_init();
+
+    // Enable BSY, RST and SEL interrupts
+    // Note: RP2040 library currently supports only one callback,
+    // so the same scsiPhyIRQ handler is used for all of these pins.
+    gpio_set_irq_enabled_with_callback(SCSI_IN_BSY, GPIO_IRQ_EDGE_RISE, true, scsiPhyIRQ);
+    gpio_set_irq_enabled(SCSI_IN_RST, GPIO_IRQ_EDGE_FALL, true);
+
+    // Check BSY line status when SEL goes active.
+    // This is needed to handle SCSI-1 hosts that use the single initiator mode.
+    // The host will just assert the SEL directly, without asserting BSY first.
+    gpio_set_irq_enabled(SCSI_IN_SEL, GPIO_IRQ_EDGE_FALL, true);
+}
+
+/************************/
+/* SCSI bus phase logic */
+/************************/
+
+static SCSI_PHASE g_scsi_phase;
+
+// Switch the bus to the given phase and block for the settle time that
+// scsiEnterPhaseImmediate() reports, if any.
+extern "C" void scsiEnterPhase(int phase)
+{
+    const int settle_ns = scsiEnterPhaseImmediate(phase);
+    if (settle_ns <= 0)
+    {
+        // No phase change took place or no waiting is required
+        return;
+    }
+
+    s2s_delay_ns(settle_ns);
+}
+
+// Change state and return nanosecond delay to wait; returns 0 if the phase is unchanged or negative
+extern "C" uint32_t scsiEnterPhaseImmediate(int phase)
+{
+    if (phase != g_scsi_phase)
+    {
+        // ANSI INCITS 362-2002 SPI-3 10.7.1:
+        // Phase changes are not allowed while REQ or ACK is asserted.
+        while (likely(!scsiDev.resetFlag) && SCSI_IN(ACK)) {}
+
+        if (scsiDev.compatMode < COMPAT_SCSI2 && (phase == DATA_IN || phase == DATA_OUT))
+        {
+            // Akai S1000/S3000 seems to need extra delay before changing to data phase
+            // after a command. The code in ZuluSCSI_disk.cpp tries to do this while waiting
+            // for SD card, to avoid any extra latency.
+            s2s_delay_ns(400000);
+        }
+
+        int oldphase = g_scsi_phase; // Needed below to detect a change of the IO signal
+        g_scsi_phase = (SCSI_PHASE)phase;
+        scsiLogPhaseChange(phase);
+
+        // Select between synchronous vs. asynchronous SCSI writes
+        if (scsiDev.target->syncOffset > 0 && (g_scsi_phase == DATA_IN || g_scsi_phase == DATA_OUT))
+        {
+            scsi_accel_rp2040_setSyncMode(scsiDev.target->syncOffset, scsiDev.target->syncPeriod);
+        }
+        else
+        {
+            scsi_accel_rp2040_setSyncMode(0, 0);
+        }
+
+        if (phase < 0)
+        {
+            // Other communication on bus or reset state
+            SCSI_RELEASE_OUTPUTS();
+            return 0;
+        }
+        else
+        {
+            // The phase control signals should be changed close to simultaneously.
+            // The SCSI spec allows 400 ns for this, but some hosts do not seem to be that
+            // tolerant. The Cortex-M0 is also quite slow in bit twiddling.
+            //
+            // To avoid unnecessary delays, precalculate an XOR mask and then apply it
+            // simultaneously to all three signals.
+            uint32_t gpio_new = 0; // Wanted output bits: a bit is set when the corresponding phase flag is NOT set
+            if (!(phase & __scsiphase_msg)) { gpio_new |= (1 << SCSI_OUT_MSG); }
+            if (!(phase & __scsiphase_cd)) { gpio_new |= (1 << SCSI_OUT_CD); }
+            if (!(phase & __scsiphase_io)) { gpio_new |= (1 << SCSI_OUT_IO); }
+
+            uint32_t mask = (1 << SCSI_OUT_MSG) | (1 << SCSI_OUT_CD) | (1 << SCSI_OUT_IO);
+            uint32_t gpio_xor = (sio_hw->gpio_out ^ gpio_new) & mask; // Bits that differ from the current output state
+            sio_hw->gpio_togl = gpio_xor; // Toggle only the differing bits in a single register write
+            SCSI_ENABLE_CONTROL_OUT();
+
+            int delayNs = 400; // Bus settle delay
+            if ((oldphase & __scsiphase_io) != (phase & __scsiphase_io))
+            {
+                delayNs += 400; // Data release delay
+            }
+
+            if (scsiDev.compatMode < COMPAT_SCSI2)
+            {
+                // EMU EMAX needs 100uS ! 10uS is not enough.
+                delayNs += 100000;
+            }
+
+            return delayNs;
+        }
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+// Release all signals and reset the phase, selection and command state
+void scsiEnterBusFree(void)
+{
+    g_scsi_phase = BUS_FREE;
+    g_scsi_sts_selection = 0; // Forget the latched selection
+    g_scsi_ctrl_bsy = 0; // Drop any pending BSY assert request
+    scsiDev.cdbLen = 0;
+
+    SCSI_RELEASE_OUTPUTS();
+}
+
+/********************/
+/* Transmit to host */
+/********************/
+
+#define SCSI_WAIT_ACTIVE(pin) \
+  if (!SCSI_IN(pin)) { \
+    if (!SCSI_IN(pin)) { \
+      while(!SCSI_IN(pin) && !scsiDev.resetFlag); \
+    } \
+  }
+
+// In synchronous mode the ACK pulse can be very short, so use edge IRQ to detect it.
+#define CHECK_EDGE(pin) \
+    ((iobank0_hw->intr[pin / 8] >> (4 * (pin % 8))) & GPIO_IRQ_EDGE_FALL)
+
+#define SCSI_WAIT_ACTIVE_EDGE(pin) \
+  if (!CHECK_EDGE(SCSI_IN_ ## pin)) { \
+    while(!SCSI_IN(pin) && !CHECK_EDGE(SCSI_IN_ ## pin) && !scsiDev.resetFlag); \
+  }
+
+#define SCSI_WAIT_INACTIVE(pin) \
+  if (SCSI_IN(pin)) { \
+    if (SCSI_IN(pin)) { \
+      while(SCSI_IN(pin) && !scsiDev.resetFlag); \
+    } \
+  }
+
+// Write one byte to SCSI host using the REQ/ACK handshake mechanism
+// This is suitable for both asynchronous and synchronous communication.
+static inline void scsiWriteOneByte(uint8_t value)
+{
+    SCSI_OUT_DATA(value);
+    delay_100ns(); // DB setup time before REQ
+    gpio_acknowledge_irq(SCSI_IN_ACK, GPIO_IRQ_EDGE_FALL); // Clear a stale edge flag so only a new ACK edge is detected
+    SCSI_OUT(REQ, 1);
+    SCSI_WAIT_ACTIVE_EDGE(ACK); // Wait for ACK level or latched edge (gives up on bus reset)
+    SCSI_RELEASE_DATA_REQ();
+    SCSI_WAIT_INACTIVE(ACK); // Wait for the host to release ACK (gives up on bus reset)
+}
+
+extern "C" void scsiWriteByte(uint8_t value) // Log and send a single byte to the host, blocking
+{
+    scsiLogDataIn(&value, 1);
+    scsiWriteOneByte(value);
+}
+
+extern "C" void scsiWrite(const uint8_t* data, uint32_t count) // Blocking write: start the transfer and wait for it to finish
+{
+    scsiStartWrite(data, count);
+    scsiFinishWrite();
+}
+
+extern "C" void scsiStartWrite(const uint8_t* data, uint32_t count) // Log the data and hand the transfer to the RP2040 accelerator
+{
+    scsiLogDataIn(data, count);
+    scsi_accel_rp2040_startWrite(data, count, &scsiDev.resetFlag);
+}
+
+extern "C" bool scsiIsWriteFinished(const uint8_t *data) // Forwards the query to the RP2040 accelerator
+{
+    return scsi_accel_rp2040_isWriteFinished(data);
+}
+
+extern "C" void scsiFinishWrite() // Completes the write started by scsiStartWrite(); the accelerator is given the reset flag
+{
+    scsi_accel_rp2040_finishWrite(&scsiDev.resetFlag);
+}
+
+/*********************/
+/* Receive from host */
+/*********************/
+
+// Read one byte from SCSI host using the handshake mechanism. parityError may be NULL to skip the parity check.
+static inline uint8_t scsiReadOneByte(int* parityError)
+{
+    SCSI_OUT(REQ, 1);
+    SCSI_WAIT_ACTIVE(ACK); // Wait for the host to acknowledge (gives up on bus reset)
+    delay_100ns();
+    uint16_t r = SCSI_IN_DATA(); // Raw value of the data lines, checked against the parity table below
+    SCSI_OUT(REQ, 0);
+    SCSI_WAIT_INACTIVE(ACK);
+
+    if (parityError && r != (g_scsi_parity_lookup[r & 0xFF] ^ SCSI_IO_DATA_MASK))
+    {
+        azlog("Parity error in scsiReadOneByte(): ", (uint32_t)r);
+        *parityError = 1; // Only ever set here, never cleared
+    }
+
+    return (uint8_t)r;
+}
+
+extern "C" uint8_t scsiReadByte(void) // Read and log a single byte from the host, without parity checking
+{
+    uint8_t r = scsiReadOneByte(NULL);
+    scsiLogDataOut(&r, 1);
+    return r;
+}
+
+// Blocking read of `count` bytes from the host into `data`.
+//
+// parityError: optional. When non-NULL it is cleared first and is then passed
+//              on to the start / finish functions, unless parity checking is
+//              disabled in the board configuration, in which case it is left
+//              at 0. NULL is accepted, as in the other read functions of this
+//              file.
+extern "C" void scsiRead(uint8_t* data, uint32_t count, int* parityError)
+{
+    if (parityError)
+    {
+        *parityError = 0;
+    }
+
+    if (!(scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY)) { parityError = NULL; }
+
+    scsiStartRead(data, count, parityError);
+    scsiFinishRead(data, count, parityError);
+}
+
+extern "C" void scsiStartRead(uint8_t* data, uint32_t count, int *parityError) // Start a read through the RP2040 accelerator
+{
+    if (!(scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY)) { parityError = NULL; } // Parity disabled in config: pass NULL instead
+    scsi_accel_rp2040_startRead(data, count, parityError, &scsiDev.resetFlag);
+}
+
+extern "C" void scsiFinishRead(uint8_t* data, uint32_t count, int *parityError) // Finish a started read and log the received data
+{
+    if (!(scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY)) { parityError = NULL; } // Parity disabled in config: pass NULL instead
+    scsi_accel_rp2040_finishRead(data, count, parityError, &scsiDev.resetFlag);
+    scsiLogDataOut(data, count);
+}
+
+extern "C" bool scsiIsReadFinished(const uint8_t *data) // Forwards the query to the RP2040 accelerator
+{
+    return scsi_accel_rp2040_isReadFinished(data);
+}

+ 74 - 0
lib/ZuluSCSI_platform_BS2/scsiPhy.h

@@ -0,0 +1,74 @@
+// Interface to SCSI physical interface.
+// This file is derived from scsiPhy.h in SCSI2SD-V6.
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Read SCSI status signals
+bool scsiStatusATN();
+bool scsiStatusBSY();
+bool scsiStatusSEL();
+
+// Parity not yet implemented
+#define scsiParityError() 0
+
+// Get SCSI selection status.
+// This is latched by interrupt when BSY is deasserted while SEL is asserted.
+// Lowest 3 bits are the selected target id.
+// Highest bits are status information.
+#define SCSI_STS_SELECTION_SUCCEEDED 0x40
+#define SCSI_STS_SELECTION_ATN 0x80
+extern volatile uint8_t g_scsi_sts_selection;
+#define SCSI_STS_SELECTED (&g_scsi_sts_selection)
+extern volatile uint8_t g_scsi_ctrl_bsy;
+#define SCSI_CTRL_BSY (&g_scsi_ctrl_bsy)
+
+// Called when SCSI RST signal has been asserted, should release bus.
+void scsiPhyReset(void);
+
+// Change MSG / CD / IO signal states and wait for necessary transition time.
+// Phase argument is one of SCSI_PHASE enum values.
+void scsiEnterPhase(int phase);
+
+// Change state and return nanosecond delay to wait
+uint32_t scsiEnterPhaseImmediate(int phase);
+
+// Release all signals
+void scsiEnterBusFree(void);
+
+// Blocking data transfer
+void scsiWrite(const uint8_t* data, uint32_t count);
+void scsiRead(uint8_t* data, uint32_t count, int* parityError);
+void scsiWriteByte(uint8_t value);
+uint8_t scsiReadByte(void);
+
+// Non-blocking data transfer.
+// Depending on platform support the start() function may block.
+// The start function can be called multiple times, it may internally
+// either combine transfers or block until previous transfer completes.
+void scsiStartWrite(const uint8_t* data, uint32_t count);
+void scsiFinishWrite();
+void scsiStartRead(uint8_t* data, uint32_t count, int *parityError);
+void scsiFinishRead(uint8_t* data, uint32_t count, int *parityError);
+
+// Query whether the data at pointer has already been read, i.e. buffer can be reused.
+// If data is NULL, checks if all writes have completed.
+bool scsiIsWriteFinished(const uint8_t *data);
+
+// Query whether the data at pointer has already been written, i.e. can be processed.
+// If data is NULL, checks if all reads have completed.
+bool scsiIsReadFinished(const uint8_t *data);
+
+#define PLATFORM_SCSIPHY_HAS_NONBLOCKING_READ 1
+
+#define s2s_getScsiRateKBs() 0
+
+#ifdef __cplusplus
+}
+#endif

+ 104 - 0
lib/ZuluSCSI_platform_BS2/scsi_accel.pio

@@ -0,0 +1,104 @@
+; RP2040 PIO program for accelerating SCSI communication
+; Run "pioasm scsi_accel.pio scsi_accel.pio.h" to regenerate the C header from this.
+; GPIO mapping:
+; - 0-7: DB0-DB7
+; -   8: DBP
+; Side set is REQ pin
+
+.define REQ 17
+.define ACK 26
+
+; Delay from data setup to REQ assertion.
+; deskew delay + cable skew delay = 55 ns minimum
+; One clock cycle is 8 ns => delay 7 clocks
+.define REQ_DLY 7
+
+; Adds parity to data that is to be written to SCSI
+; This works by generating addresses for DMA to fetch data from.
+; Register X should be initialized to the base address of the lookup table (only 23 bits of X are used).
+.program scsi_parity
+    pull block       ; Wait for the next data word from TX FIFO
+    in NULL, 1       ; Shift in one zero bit
+    in OSR, 8        ; Shift in 8 bits of the data word
+    in X, 23         ; Shift in 23 bits of X, completing a 32-bit word (1 + 8 + 23)
+
+; Write to SCSI bus using asynchronous handshake.
+; Data is written as 32-bit words that contain the 8 data bits + 1 parity bit.
+; The remaining 23 bits in each word are discarded.
+; Number of bytes to send must be multiple of 2.
+.program scsi_accel_async_write
+    .side_set 1
+
+    pull ifempty block          side 1  ; Get data from TX FIFO
+    out pins, 9                 side 1  ; Write data and parity bit
+    out null, 23 [REQ_DLY-2]    side 1  ; Discard unused bits, wait for data preset time
+    wait 1 gpio ACK             side 1  ; Wait for ACK to be inactive (high)
+    wait 0 gpio ACK             side 0  ; Assert REQ, wait for ACK low
+
+; Read from SCSI bus using sync or async handshake.
+; Data is returned as 32-bit words:
+; - bit  0: always zero
+; - bits 1-8: data byte
+; - bit  9: parity bit
+; - bits 10-31: lookup table address
+; Lookup table address should be loaded into register Y.
+; One dummy word should be written to TX fifo for every byte to receive.
+.program scsi_accel_read
+    .side_set 1
+
+    pull block                  side 1  ; Pull from TX fifo for counting bytes and pacing sync mode
+    wait 1 gpio ACK             side 1  ; Wait for ACK high
+    in null, 1                  side 0  ; Zero bit because lookup table entries are 16-bit
+    wait 0 gpio ACK             side 0  ; Assert REQ, wait for ACK low
+    in pins, 9                  side 1  ; Deassert REQ, read GPIO
+    in y, 22                    side 1  ; Copy parity lookup table address, completing the 32-bit word (1 + 9 + 22)
+
+; Data state machine for synchronous writes.
+; Takes the lowest 9 bits of each 32 bit word and writes them to bus with REQ pulse.
+; The delay times (written as [0] below) will be rewritten by C code to match the negotiated SCSI sync speed.
+;
+; Shifts one bit to ISR per every byte transmitted. This is used to control the transfer
+; pace, the RX fifo acts as a counter to keep track of unacknowledged bytes. The C code
+; can set the syncOffset by changing autopush threshold, e.g. threshold 3 = 12 bytes offset.
+.program scsi_sync_write
+    .side_set 1
+
+    out pins, 9      [0]        side 1  ; Write data and parity bit, wait for deskew delay
+    out null, 23     [0]        side 0  ; Assert REQ, wait for assert time
+    in null, 1       [0]        side 1  ; Deassert REQ, wait for transfer period, wait for space in ACK buffer
+
+; Data pacing state machine for synchronous writes.
+; Consumes one bit from its output shift register (OSR) on every falling edge of ACK.
+; The C code should set autopull threshold to match scsi_sync_write autopush threshold.
+; System DMA will then move words from scsi_sync_write RX fifo to scsi_sync_write_pacer TX fifo.
+.program scsi_sync_write_pacer
+    wait 1 gpio ACK   ; Wait for ACK to be high first
+    wait 0 gpio ACK   ; Wait for falling edge on ACK
+    out null, 1       ; Let scsi_sync_write send one more byte
+
+; Data pacing state machine for synchronous reads.
+; The delay times (written as [0] below) will be rewritten by C code to match the negotiated SCSI sync speed.
+; Number of bytes to receive minus one should be loaded into register X.
+; In synchronous mode this generates the REQ pulses and dummy words.
+; In asynchronous mode it just generates dummy words to feed to scsi_accel_read.
+.program scsi_sync_read_pacer
+    .side_set 1
+
+start:
+    push block      [0]      side 1  ; Send dummy word to scsi_accel_read, wait for transfer period
+    jmp x-- start   [0]      side 0  ; Assert REQ, wait for assert time
+
+finish:
+    jmp finish      [0]      side 1  ; All bytes requested: stay here with REQ deasserted
+
+; Parity checker for reads from SCSI bus.
+; Receives 16-bit words from g_scsi_parity_check_lookup
+; Bottom 8 bits are the data byte, which is passed to output FIFO
+; The 9th bit is parity valid bit, which is 1 for valid and 0 for parity error.
+.program scsi_read_parity
+parity_valid:
+    out isr, 8                ; Take the 8 data bits for passing to RX fifo
+    push block                ; Push the data to RX fifo
+    out x, 24                 ; Take the parity valid bit, and the rest of 32-bit word
+    jmp x-- parity_valid      ; If X is non-zero (parity valid bit is 1), repeat from start
+    irq set 0                 ; Parity error, set interrupt flag

+ 225 - 0
lib/ZuluSCSI_platform_BS2/scsi_accel.pio.h

@@ -0,0 +1,225 @@
+// -------------------------------------------------- //
+// This file is autogenerated by pioasm; do not edit! //
+// -------------------------------------------------- //
+
+#pragma once
+
+#if !PICO_NO_HARDWARE
+#include "hardware/pio.h"
+#endif
+
+// ----------- //
+// scsi_parity //
+// ----------- //
+
+#define scsi_parity_wrap_target 0
+#define scsi_parity_wrap 3
+
+static const uint16_t scsi_parity_program_instructions[] = {
+            //     .wrap_target
+    0x80a0, //  0: pull   block                      
+    0x4061, //  1: in     null, 1                    
+    0x40e8, //  2: in     osr, 8                     
+    0x4037, //  3: in     x, 23                      
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_parity_program = {
+    .instructions = scsi_parity_program_instructions,
+    .length = 4,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_parity_program_get_default_config(uint offset) { // Default SM config for scsi_parity loaded at instruction address 'offset'
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_parity_wrap_target, offset + scsi_parity_wrap); // Wrap addresses are program-relative, so rebase them by the load offset
+    return c;
+}
+#endif
+
+// ---------------------- //
+// scsi_accel_async_write //
+// ---------------------- //
+
+#define scsi_accel_async_write_wrap_target 0
+#define scsi_accel_async_write_wrap 4
+
+static const uint16_t scsi_accel_async_write_program_instructions[] = {
+            //     .wrap_target
+    0x90e0, //  0: pull   ifempty block   side 1     
+    0x7009, //  1: out    pins, 9         side 1     
+    0x7577, //  2: out    null, 23        side 1 [5] 
+    0x309a, //  3: wait   1 gpio, 26      side 1     
+    0x201a, //  4: wait   0 gpio, 26      side 0     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_accel_async_write_program = {
+    .instructions = scsi_accel_async_write_program_instructions,
+    .length = 5,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_accel_async_write_program_get_default_config(uint offset) { // Default SM config for scsi_accel_async_write loaded at instruction address 'offset'
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_accel_async_write_wrap_target, offset + scsi_accel_async_write_wrap); // Wrap addresses are program-relative, so rebase them by the load offset
+    sm_config_set_sideset(&c, 1, false, false); // One mandatory side-set bit that drives a pin value (not a pin direction)
+    return c;
+}
+#endif
+
+// --------------- //
+// scsi_accel_read //
+// --------------- //
+
+#define scsi_accel_read_wrap_target 0
+#define scsi_accel_read_wrap 5
+
+static const uint16_t scsi_accel_read_program_instructions[] = {
+            //     .wrap_target
+    0x90a0, //  0: pull   block           side 1     
+    0x309a, //  1: wait   1 gpio, 26      side 1     
+    0x4061, //  2: in     null, 1         side 0     
+    0x201a, //  3: wait   0 gpio, 26      side 0     
+    0x5009, //  4: in     pins, 9         side 1     
+    0x5056, //  5: in     y, 22           side 1     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_accel_read_program = {
+    .instructions = scsi_accel_read_program_instructions,
+    .length = 6,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_accel_read_program_get_default_config(uint offset) { // Default SM config for scsi_accel_read loaded at instruction address 'offset'
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_accel_read_wrap_target, offset + scsi_accel_read_wrap); // Wrap addresses are program-relative, so rebase them by the load offset
+    sm_config_set_sideset(&c, 1, false, false); // One mandatory side-set bit that drives a pin value (not a pin direction)
+    return c;
+}
+#endif
+
+// --------------- //
+// scsi_sync_write //
+// --------------- //
+
+#define scsi_sync_write_wrap_target 0
+#define scsi_sync_write_wrap 2
+
+static const uint16_t scsi_sync_write_program_instructions[] = {
+            //     .wrap_target
+    0x7009, //  0: out    pins, 9         side 1     
+    0x6077, //  1: out    null, 23        side 0     
+    0x5061, //  2: in     null, 1         side 1     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_sync_write_program = {
+    .instructions = scsi_sync_write_program_instructions,
+    .length = 3,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_sync_write_program_get_default_config(uint offset) { // Default SM config for scsi_sync_write loaded at instruction address 'offset'
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_sync_write_wrap_target, offset + scsi_sync_write_wrap); // Wrap addresses are program-relative, so rebase them by the load offset
+    sm_config_set_sideset(&c, 1, false, false); // One mandatory side-set bit that drives a pin value (not a pin direction)
+    return c;
+}
+#endif
+
+// --------------------- //
+// scsi_sync_write_pacer //
+// --------------------- //
+
+#define scsi_sync_write_pacer_wrap_target 0
+#define scsi_sync_write_pacer_wrap 2
+
+static const uint16_t scsi_sync_write_pacer_program_instructions[] = {
+            //     .wrap_target
+    0x209a, //  0: wait   1 gpio, 26                 
+    0x201a, //  1: wait   0 gpio, 26                 
+    0x6061, //  2: out    null, 1                    
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_sync_write_pacer_program = {
+    .instructions = scsi_sync_write_pacer_program_instructions,
+    .length = 3,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_sync_write_pacer_program_get_default_config(uint offset) { // Default SM config for scsi_sync_write_pacer loaded at instruction address 'offset'
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_sync_write_pacer_wrap_target, offset + scsi_sync_write_pacer_wrap); // Wrap addresses are program-relative, so rebase them by the load offset
+    return c;
+}
+#endif
+
+// -------------------- //
+// scsi_sync_read_pacer //
+// -------------------- //
+
+#define scsi_sync_read_pacer_wrap_target 0
+#define scsi_sync_read_pacer_wrap 2
+
+static const uint16_t scsi_sync_read_pacer_program_instructions[] = {
+            //     .wrap_target
+    0x9020, //  0: push   block           side 1     
+    0x0040, //  1: jmp    x--, 0          side 0     
+    0x1002, //  2: jmp    2               side 1     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_sync_read_pacer_program = {
+    .instructions = scsi_sync_read_pacer_program_instructions,
+    .length = 3,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_sync_read_pacer_program_get_default_config(uint offset) { // Default SM config for scsi_sync_read_pacer loaded at instruction address 'offset'
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_sync_read_pacer_wrap_target, offset + scsi_sync_read_pacer_wrap); // Wrap addresses are program-relative, so rebase them by the load offset
+    sm_config_set_sideset(&c, 1, false, false); // One mandatory side-set bit that drives a pin value (not a pin direction)
+    return c;
+}
+#endif
+
+// ---------------- //
+// scsi_read_parity //
+// ---------------- //
+
+#define scsi_read_parity_wrap_target 0
+#define scsi_read_parity_wrap 4
+
+static const uint16_t scsi_read_parity_program_instructions[] = {
+            //     .wrap_target
+    0x60c8, //  0: out    isr, 8                     
+    0x8020, //  1: push   block                      
+    0x6038, //  2: out    x, 24                      
+    0x0040, //  3: jmp    x--, 0                     
+    0xc000, //  4: irq    nowait 0                   
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_read_parity_program = {
+    .instructions = scsi_read_parity_program_instructions,
+    .length = 5,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_read_parity_program_get_default_config(uint offset) { // Default SM config for scsi_read_parity loaded at instruction address 'offset'
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_read_parity_wrap_target, offset + scsi_read_parity_wrap); // Wrap addresses are program-relative, so rebase them by the load offset
+    return c;
+}
+#endif
+

+ 141 - 0
lib/ZuluSCSI_platform_BS2/scsi_accel_host.cpp

@@ -0,0 +1,141 @@
+// Accelerated SCSI subroutines for SCSI initiator/host side communication
+
+#include "scsi_accel_host.h"
+#include "ZuluSCSI_platform.h"
+#include "ZuluSCSI_log.h"
+#include "scsi_accel_host.pio.h"
+#include <hardware/pio.h>
+#include <hardware/dma.h>
+#include <hardware/irq.h>
+#include <hardware/structs/iobank0.h>
+#include <hardware/sync.h>
+
+#define SCSI_PIO pio0
+#define SCSI_SM 0
+
+static struct {
+    // PIO configuration, filled in by scsi_accel_host_init()
+    uint32_t pio_offset_async_read; // Load address of scsi_host_async_read in PIO instruction memory
+    pio_sm_config pio_cfg_async_read; // State machine configuration used to run scsi_host_async_read
+} g_scsi_host;
+
+// State of the initiator-side accelerator.
+// The type has its own name (scsihost_state_t): scsi_accel_rp2040.cpp already defines
+// an enum called scsidma_state_t with different enumerators, and two different
+// definitions of the same enum name in one program violate the One Definition Rule.
+enum scsihost_state_t { SCSIHOST_IDLE = 0, // No transfer active, all pins under software (SIO) control
+                        SCSIHOST_READ };   // scsi_accel_host_read() in progress, ACK driven by PIO
+static volatile scsihost_state_t g_scsi_host_state;
+
+// Select the GPIO function of the SCSI data bus and ACK pins to match g_scsi_host_state.
+// The data bus and parity pins are set to SIO in both states; only ACK differs:
+// SIO while idle, PIO0 while a read is running.
+static void scsi_accel_host_config_gpio()
+{
+    static const uint32_t data_bus_pins[] = {
+        SCSI_IO_DB0, SCSI_IO_DB1, SCSI_IO_DB2, SCSI_IO_DB3,
+        SCSI_IO_DB4, SCSI_IO_DB5, SCSI_IO_DB6, SCSI_IO_DB7,
+        SCSI_IO_DBP
+    };
+
+    const bool reading = (g_scsi_host_state == SCSIHOST_READ);
+    if (!reading && g_scsi_host_state != SCSIHOST_IDLE)
+    {
+        // Unknown state: leave the pins as they are
+        return;
+    }
+
+    if (reading)
+    {
+        // Data bus and REQ as input, ACK pin as output
+        pio_sm_set_pins(SCSI_PIO, SCSI_SM, 0x7FF);
+        pio_sm_set_consecutive_pindirs(SCSI_PIO, SCSI_SM, 0, 10, false);
+        pio_sm_set_consecutive_pindirs(SCSI_PIO, SCSI_SM, 10, 1, true);
+    }
+
+    for (uint32_t pin : data_bus_pins)
+    {
+        iobank0_hw->io[pin].ctrl = GPIO_FUNC_SIO;
+    }
+
+    // ACK is the only pin whose owner depends on the state
+    if (reading)
+    {
+        iobank0_hw->io[SCSI_OUT_ACK].ctrl = GPIO_FUNC_PIO0;
+    }
+    else
+    {
+        iobank0_hw->io[SCSI_OUT_ACK].ctrl = GPIO_FUNC_SIO;
+    }
+}
+
+// Read 'count' bytes from the SCSI bus into 'buf' using the scsi_host_async_read PIO program.
+//
+// buf          Destination buffer, at least 'count' bytes.
+// count        Number of bytes to read, must be divisible by 2. Zero is allowed and reads nothing.
+// parityError  Set to 1 if the parity check over the whole block fails; not written otherwise.
+// resetFlag    Polled while waiting for data; a nonzero value aborts the transfer.
+//
+// Returns the number of bytes stored in 'buf'. This is less than 'count' when the
+// transfer was aborted by *resetFlag or because the target changed bus phase.
+uint32_t scsi_accel_host_read(uint8_t *buf, uint32_t count, int *parityError, volatile int *resetFlag)
+{
+    // Nothing to transfer. This has to be caught before the PIO is started:
+    // the state machine is given (count - 1), which would wrap to 0xFFFFFFFF
+    // and let it acknowledge bytes that the caller never asked for.
+    if (count == 0)
+    {
+        return 0;
+    }
+
+    // The PIO program packs two bytes into every 32-bit FIFO word,
+    // so the number of bytes to read must be divisible by 2.
+    assert((count & 1) == 0);
+
+    // Currently this method just reads from the PIO RX fifo directly in software loop.
+    // The SD card access is parallelized using DMA, so there is limited benefit from using DMA here.
+    g_scsi_host_state = SCSIHOST_READ;
+
+    // Remember the bus phase so that a phase change by the target can be detected
+    int cd_start = SCSI_IN(CD);
+    int msg_start = SCSI_IN(MSG);
+
+    pio_sm_init(SCSI_PIO, SCSI_SM, g_scsi_host.pio_offset_async_read, &g_scsi_host.pio_cfg_async_read);
+    scsi_accel_host_config_gpio();
+    pio_sm_set_enabled(SCSI_PIO, SCSI_SM, true);
+
+    // Set the number of bytes to read (the PIO program expects count minus one)
+    pio_sm_put(SCSI_PIO, SCSI_SM, count - 1);
+
+    // Read results from PIO RX FIFO
+    uint8_t *dst = buf;
+    uint8_t *end = buf + count;
+    uint32_t paritycheck = 0;
+    while (dst < end)
+    {
+        uint32_t available = pio_sm_get_rx_fifo_level(SCSI_PIO, SCSI_SM);
+
+        if (available == 0)
+        {
+            if (*resetFlag || !SCSI_IN(IO) || SCSI_IN(CD) != cd_start || SCSI_IN(MSG) != msg_start)
+            {
+                // Target switched out of DATA_IN mode
+                count = dst - buf;
+                break;
+            }
+        }
+
+        while (available > 0)
+        {
+            available--;
+            uint32_t word = pio_sm_get(SCSI_PIO, SCSI_SM);
+            paritycheck ^= word; // Accumulate the raw bus values for the block-level parity check
+            word = ~word;        // Invert the bus levels before storing the data
+            *dst++ = word & 0xFF; // First byte of the pair is in bits 0-7
+            *dst++ = word >> 16;  // Second byte is in bits 16-23
+        }
+    }
+
+    // Check parity errors in whole block
+    // This doesn't detect if there is even number of parity errors in block.
+    uint8_t byte0 = ~(paritycheck & 0xFF);
+    uint8_t byte1 = ~(paritycheck >> 16);
+    if (paritycheck != ((g_scsi_parity_lookup[byte1] << 16) | g_scsi_parity_lookup[byte0]))
+    {
+        azlog("Parity error in scsi_accel_host_read(): ", paritycheck);
+        *parityError = 1;
+    }
+
+    // Hand the pins back to software control before stopping the state machine
+    g_scsi_host_state = SCSIHOST_IDLE;
+    SCSI_RELEASE_DATA_REQ();
+    scsi_accel_host_config_gpio();
+    pio_sm_set_enabled(SCSI_PIO, SCSI_SM, false);
+
+    return count;
+}
+
+
+void scsi_accel_host_init() // Load the initiator-mode PIO program and prepare its state machine config; leaves the accelerator idle
+{
+    g_scsi_host_state = SCSIHOST_IDLE;
+    scsi_accel_host_config_gpio();
+
+    // Load PIO programs. This first wipes everything previously loaded into SCSI_PIO instruction memory.
+    pio_clear_instruction_memory(SCSI_PIO);
+
+    // Asynchronous SCSI read (the only program loaded by this module)
+    g_scsi_host.pio_offset_async_read = pio_add_program(SCSI_PIO, &scsi_host_async_read_program);
+    g_scsi_host.pio_cfg_async_read = scsi_host_async_read_program_get_default_config(g_scsi_host.pio_offset_async_read);
+    sm_config_set_in_pins(&g_scsi_host.pio_cfg_async_read, SCSI_IO_DB0); // 'in pins' samples starting from DB0
+    sm_config_set_sideset_pins(&g_scsi_host.pio_cfg_async_read, SCSI_OUT_ACK); // The side-set bit drives ACK
+    sm_config_set_out_shift(&g_scsi_host.pio_cfg_async_read, true, false, 32); // OSR shifts right, no autopull
+    sm_config_set_in_shift(&g_scsi_host.pio_cfg_async_read, true, true, 32); // ISR shifts right, autopush after 32 bits (two 16-bit samples per FIFO word)
+}

+ 11 - 0
lib/ZuluSCSI_platform_BS2/scsi_accel_host.h

@@ -0,0 +1,11 @@
+// Accelerated SCSI subroutines for SCSI initiator/host side communication
+
+#pragma once
+
+#include <stdint.h>
+
+void scsi_accel_host_init(); // Load the PIO program and set the accelerator to idle; call before scsi_accel_host_read()
+
+// Read data from SCSI bus. Number of bytes to read must be divisible by two.
+/* Returns the number of bytes stored in buf. This is less than count if
+ * *resetFlag becomes nonzero or the target leaves the DATA IN phase before
+ * all bytes have arrived. *parityError is set to 1 when the received block
+ * fails the parity check and is not written otherwise.
+ */
+uint32_t scsi_accel_host_read(uint8_t *buf, uint32_t count, int *parityError, volatile int *resetFlag);

+ 26 - 0
lib/ZuluSCSI_platform_BS2/scsi_accel_host.pio

@@ -0,0 +1,26 @@
+; RP2040 PIO program for accelerating SCSI initiator / host function
+; Run "pioasm scsi_accel_host.pio scsi_accel_host.pio.h" to regenerate the C header from this.
+; GPIO mapping:
+; - 0-7: DB0-DB7
+; -   8: DBP
+; Side set is ACK pin
+
+.define REQ 9
+.define ACK 10
+
+; Read from SCSI bus using asynchronous handshake.
+; Data is returned as 16-bit words that contain the 8 data bits + 1 parity bit.
+; Number of bytes to receive minus 1 should be written to TX fifo.
+; Number of bytes to receive must be divisible by 2.
+.program scsi_host_async_read
+    .side_set 1
+
+    pull block                  side 1  ; Get number of bytes to receive
+    mov x, osr                  side 1  ; Store to counter X
+
+start:
+    wait 0 gpio REQ             side 1  ; Wait for REQ low
+    in pins, 9                  side 0  ; Assert ACK, read GPIO
+    in null, 7                  side 0  ; Padding bits, so that every byte takes up 16 bits of the ISR
+    wait 1 gpio REQ             side 0  ; Wait for REQ high
+    jmp x-- start               side 1  ; Deassert ACK, decrement byte count and jump to start; once the count runs out, wrap to 'pull block'

+ 44 - 0
lib/ZuluSCSI_platform_BS2/scsi_accel_host.pio.h

@@ -0,0 +1,44 @@
+// -------------------------------------------------- //
+// This file is autogenerated by pioasm; do not edit! //
+// -------------------------------------------------- //
+
+#pragma once
+
+#if !PICO_NO_HARDWARE
+#include "hardware/pio.h"
+#endif
+
+// -------------------- //
+// scsi_host_async_read //
+// -------------------- //
+
+#define scsi_host_async_read_wrap_target 0
+#define scsi_host_async_read_wrap 6
+
+static const uint16_t scsi_host_async_read_program_instructions[] = {
+            //     .wrap_target
+    0x90a0, //  0: pull   block           side 1     
+    0xb027, //  1: mov    x, osr          side 1     
+    0x3009, //  2: wait   0 gpio, 9       side 1     
+    0x4009, //  3: in     pins, 9         side 0     
+    0x4067, //  4: in     null, 7         side 0     
+    0x2089, //  5: wait   1 gpio, 9       side 0     
+    0x1042, //  6: jmp    x--, 2          side 1     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_host_async_read_program = {
+    .instructions = scsi_host_async_read_program_instructions,
+    .length = 7,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_host_async_read_program_get_default_config(uint offset) { // Default SM config for scsi_host_async_read loaded at instruction address 'offset'
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_host_async_read_wrap_target, offset + scsi_host_async_read_wrap); // Wrap addresses are program-relative, so rebase them by the load offset
+    sm_config_set_sideset(&c, 1, false, false); // One mandatory side-set bit that drives a pin value (not a pin direction)
+    return c;
+}
+#endif
+

+ 1028 - 0
lib/ZuluSCSI_platform_BS2/scsi_accel_rp2040.cpp

@@ -0,0 +1,1028 @@
+/* Data flow in SCSI acceleration:
+ *
+ * 1. Application provides a buffer of bytes to send.
+ * 2. Code in this module adds parity bit to the bytes and packs two bytes into 32 bit words.
+ * 3. DMA controller copies the words to PIO peripheral FIFO.
+ * 4. PIO peripheral handles low-level SCSI handshake and writes bytes and parity to GPIO.
+ */
+
+#include "ZuluSCSI_platform.h"
+#include "ZuluSCSI_log.h"
+#include "scsi_accel_rp2040.h"
+#include "scsi_accel.pio.h"
+#include <hardware/pio.h>
+#include <hardware/dma.h>
+#include <hardware/irq.h>
+#include <hardware/structs/iobank0.h>
+#include <hardware/sync.h>
+#include <multicore.h>
+
+// SCSI bus write acceleration uses up to 3 PIO state machines:
+// SM0: Convert data bytes to lookup addresses to add parity
+// SM1: Write data to SCSI bus
+// SM2: For synchronous mode only, count ACK pulses
+#define SCSI_DMA_PIO pio0
+#define SCSI_PARITY_SM 0
+#define SCSI_DATA_SM 1
+#define SCSI_SYNC_SM 2
+
+// SCSI bus write acceleration uses 3 or 4 DMA channels (data flow A->B->C->D):
+// A: Bytes from RAM to scsi_parity PIO
+// B: Addresses from scsi_parity PIO to lookup DMA READ_ADDR register
+// C: Lookup from g_scsi_parity_lookup and copy to scsi_accel_async_write or scsi_sync_write PIO
+// D: For sync transfers, scsi_sync_write to scsi_sync_write_pacer PIO
+//
+// SCSI bus read acceleration uses 4 DMA channels (data flow D->C->B->A):
+// A: Bytes from scsi_read_parity PIO to memory buffer
+// B: Lookup from g_scsi_parity_check_lookup and copy to scsi_read_parity PIO
+// C: Addresses from scsi_accel_read PIO to lookup DMA READ_ADDR register
+// D: From pacer to data state machine to trigger transfers
+#define SCSI_DMA_CH_A 0
+#define SCSI_DMA_CH_B 1
+#define SCSI_DMA_CH_C 2
+#define SCSI_DMA_CH_D 3
+
+static struct {
+    uint8_t *app_buf; // Buffer provided by application
+    uint32_t app_bytes; // Bytes available in application buffer
+    uint32_t dma_bytes; // Bytes that have been scheduled for DMA so far
+    
+    uint8_t *next_app_buf; // Next buffer from application after current one finishes, 0 when nothing is queued
+    uint32_t next_app_bytes; // Bytes in next buffer
+
+    // Synchronous mode? syncOffset == 0 selects asynchronous transfers, syncOffset > 0 synchronous ones.
+    int syncOffset;
+    int syncPeriod;
+    int syncOffsetDivider; // Autopush/autopull threshold for the write pacer state machine
+    int syncOffsetPreload; // Number of items to preload in the RX fifo of scsi_sync_write
+
+    // PIO configurations: for each program, its load address in PIO instruction memory and its state machine config
+    uint32_t pio_offset_parity;
+    uint32_t pio_offset_async_write;
+    uint32_t pio_offset_sync_write_pacer;
+    uint32_t pio_offset_sync_write;
+    uint32_t pio_offset_read;
+    uint32_t pio_offset_read_parity;
+    uint32_t pio_offset_sync_read_pacer;
+    pio_sm_config pio_cfg_parity;
+    pio_sm_config pio_cfg_async_write;
+    pio_sm_config pio_cfg_sync_write_pacer;
+    pio_sm_config pio_cfg_sync_write;
+    pio_sm_config pio_cfg_read;
+    pio_sm_config pio_cfg_read_parity;
+    pio_sm_config pio_cfg_sync_read_pacer;
+    
+    // DMA configurations for write
+    dma_channel_config dmacfg_write_chA; // Data from RAM to scsi_parity PIO
+    dma_channel_config dmacfg_write_chB; // Addresses from scsi_parity PIO to lookup DMA
+    dma_channel_config dmacfg_write_chC; // Data from g_scsi_parity_lookup to scsi write PIO
+    dma_channel_config dmacfg_write_chD; // In synchronous mode only, transfer between state machines
+
+    // DMA configurations for read
+    dma_channel_config dmacfg_read_chA; // Data to destination memory buffer
+    dma_channel_config dmacfg_read_chB; // From lookup table to scsi_read_parity PIO
+    dma_channel_config dmacfg_read_chC; // From scsi_accel_read to channel B READ_ADDR
+    dma_channel_config dmacfg_read_chD; // From pacer to data state machine
+} g_scsi_dma;
+
+enum scsidma_state_t { SCSIDMA_IDLE = 0, // No transfer set up: state machines disabled and DMA channels stopped
+                       SCSIDMA_WRITE, SCSIDMA_WRITE_DONE, // WRITE: data still being fed to DMA; WRITE_DONE: no bytes left to schedule, PIO may still be draining
+                       SCSIDMA_READ, SCSIDMA_READ_DONE };
+static volatile scsidma_state_t g_scsi_dma_state; // NOTE(review): volatile presumably because the DMA interrupt handler also updates it - confirm
+static bool g_channels_claimed = false;
+static void scsidma_config_gpio(); // Defined later in this file
+
+
+/****************************************/
+/* Accelerated writes to SCSI bus       */
+/****************************************/
+
+// Load the SCSI parity state machine with the address of the parity lookup table.
+// Also sets up DMA channels B and C. The state machines themselves are enabled by the caller.
+static void config_parity_sm_for_write()
+{
+    // Load base address to state machine register X. The table must be 512-byte aligned, because only the bits above bit 8 are stored in X.
+    uint32_t addrbase = (uint32_t)&g_scsi_parity_lookup[0];
+    assert((addrbase & 0x1FF) == 0);
+    pio_sm_init(SCSI_DMA_PIO, SCSI_PARITY_SM, g_scsi_dma.pio_offset_parity, &g_scsi_dma.pio_cfg_parity);
+    pio_sm_put(SCSI_DMA_PIO, SCSI_PARITY_SM, addrbase >> 9);
+    pio_sm_exec(SCSI_DMA_PIO, SCSI_PARITY_SM, pio_encode_pull(false, false)); // Executed immediately: TX FIFO -> OSR
+    pio_sm_exec(SCSI_DMA_PIO, SCSI_PARITY_SM, pio_encode_mov(pio_x, pio_osr)); // Executed immediately: OSR -> X
+    
+    // DMA channel B will copy addresses from parity PIO to DMA channel C read address register.
+    // It is triggered by the parity SM RX FIFO request. It is started right away (last argument is true).
+    dma_channel_configure(SCSI_DMA_CH_B,
+        &g_scsi_dma.dmacfg_write_chB,
+        &dma_hw->ch[SCSI_DMA_CH_C].al3_read_addr_trig,
+        &SCSI_DMA_PIO->rxf[SCSI_PARITY_SM],
+        1, true);
+    
+    // DMA channel C will read g_scsi_parity_lookup to copy data + parity to SCSI write state machine.
+    // It is triggered by SCSI write machine TX FIFO request and chains to re-enable channel B.
+    dma_channel_configure(SCSI_DMA_CH_C,
+        &g_scsi_dma.dmacfg_write_chC,
+        &SCSI_DMA_PIO->txf[SCSI_DATA_SM],
+        NULL, // Read address is supplied by channel B for every transfer
+        1, false);
+}
+
+// Schedule the next chunk of application data for DMA channel A.
+// When the current buffer has been scheduled completely, the queued buffer (if any)
+// becomes the current one. When no bytes are left at all, the state changes to
+// SCSIDMA_WRITE_DONE and no DMA transfer is started.
+static void start_dma_write()
+{
+    const bool current_exhausted = (g_scsi_dma.dma_bytes >= g_scsi_dma.app_bytes);
+    if (current_exhausted)
+    {
+        // Promote the queued buffer to be the current one and empty the queue slot
+        g_scsi_dma.dma_bytes = 0;
+        g_scsi_dma.app_buf = g_scsi_dma.next_app_buf;
+        g_scsi_dma.app_bytes = g_scsi_dma.next_app_bytes;
+        g_scsi_dma.next_app_buf = 0;
+        g_scsi_dma.next_app_bytes = 0;
+    }
+
+    const uint32_t remaining = g_scsi_dma.app_bytes - g_scsi_dma.dma_bytes;
+    if (remaining != 0)
+    {
+        uint8_t *chunk = g_scsi_dma.app_buf + g_scsi_dma.dma_bytes;
+        g_scsi_dma.dma_bytes += remaining;
+
+        // Start DMA from current buffer to parity generator
+        dma_channel_configure(SCSI_DMA_CH_A,
+            &g_scsi_dma.dmacfg_write_chA,
+            &SCSI_DMA_PIO->txf[SCSI_PARITY_SM],
+            chunk,
+            remaining,
+            true
+        );
+        return;
+    }
+
+    // All done. From SCSIDMA_WRITE_DONE the state either goes to IDLE in stopWrite()
+    // or back to WRITE in startWrite().
+    g_scsi_dma_state = SCSIDMA_WRITE_DONE;
+}
+
+void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile int *resetFlag) // Start or queue a write of 'count' bytes from 'data'; DMA reads the buffer directly, so it must stay valid until the write has finished
+{
+    // Any read requests should be matched with a stopRead()
+    assert(g_scsi_dma_state != SCSIDMA_READ && g_scsi_dma_state != SCSIDMA_READ_DONE);
+
+    __disable_irq(); // The queue fields below are examined and updated with interrupts masked
+    if (g_scsi_dma_state == SCSIDMA_WRITE)
+    {
+        if (!g_scsi_dma.next_app_buf && data == g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
+        {
+            // Combine with currently running request
+            g_scsi_dma.app_bytes += count;
+            count = 0;
+        }
+        else if (data == g_scsi_dma.next_app_buf + g_scsi_dma.next_app_bytes)
+        {
+            // Combine with queued request
+            g_scsi_dma.next_app_bytes += count;
+            count = 0;
+        }
+        else if (!g_scsi_dma.next_app_buf)
+        {
+            // Add as queued request
+            g_scsi_dma.next_app_buf = (uint8_t*)data;
+            g_scsi_dma.next_app_bytes = count;
+            count = 0;
+        }
+    }
+    __enable_irq();
+
+    // Check if the request was combined or queued (this is also taken when the caller passed count == 0)
+    if (count == 0) return;
+
+    if (g_scsi_dma_state != SCSIDMA_IDLE && g_scsi_dma_state != SCSIDMA_WRITE_DONE)
+    {
+        // Wait for previous request to finish
+        scsi_accel_rp2040_finishWrite(resetFlag);
+        if (*resetFlag)
+        {
+            return;
+        }
+    }
+
+    bool must_reconfig_gpio = (g_scsi_dma_state == SCSIDMA_IDLE); // In WRITE_DONE the write setup has not been torn down yet, so it is reused
+    g_scsi_dma_state = SCSIDMA_WRITE;
+    g_scsi_dma.app_buf = (uint8_t*)data;
+    g_scsi_dma.app_bytes = count;
+    g_scsi_dma.dma_bytes = 0;
+    g_scsi_dma.next_app_buf = 0;
+    g_scsi_dma.next_app_bytes = 0;
+    
+    if (must_reconfig_gpio)
+    {
+        SCSI_ENABLE_DATA_OUT();
+
+        if (g_scsi_dma.syncOffset == 0)
+        {
+            // Asynchronous write
+            config_parity_sm_for_write();
+            pio_sm_init(SCSI_DMA_PIO, SCSI_DATA_SM, g_scsi_dma.pio_offset_async_write, &g_scsi_dma.pio_cfg_async_write);
+            scsidma_config_gpio();
+
+            pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DATA_SM, true);
+            pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_PARITY_SM, true);
+        }
+        else
+        {
+            // Synchronous write
+            // Data state machine writes data to SCSI bus and dummy bits to its RX fifo.
+            // Sync state machine empties the dummy bits every time ACK is received, to control the transmit pace.
+            config_parity_sm_for_write();
+            pio_sm_init(SCSI_DMA_PIO, SCSI_DATA_SM, g_scsi_dma.pio_offset_sync_write, &g_scsi_dma.pio_cfg_sync_write);
+            pio_sm_init(SCSI_DMA_PIO, SCSI_SYNC_SM, g_scsi_dma.pio_offset_sync_write_pacer, &g_scsi_dma.pio_cfg_sync_write_pacer);
+            scsidma_config_gpio();
+
+            // Prefill RX fifo to set the syncOffset
+            for (int i = 0; i < g_scsi_dma.syncOffsetPreload; i++)
+            {
+                pio_sm_exec(SCSI_DMA_PIO, SCSI_DATA_SM,
+                    pio_encode_push(false, false) | pio_encode_sideset(1, 1)); // Side-set 1 keeps REQ deasserted while pushing
+            }
+
+            // Fill the pacer TX fifo
+            // DMA should start transferring only after ACK pulses are received
+            for (int i = 0; i < 4; i++) // NOTE(review): 4 is presumably the TX FIFO depth - confirm
+            {
+                pio_sm_put(SCSI_DMA_PIO, SCSI_SYNC_SM, 0);
+            }
+
+            // Fill the pacer OSR
+            pio_sm_exec(SCSI_DMA_PIO, SCSI_SYNC_SM,
+                pio_encode_mov(pio_osr, pio_null));
+
+            // Start DMA transfer to move dummy bits to write pacer
+            dma_channel_configure(SCSI_DMA_CH_D,
+                &g_scsi_dma.dmacfg_write_chD,
+                &SCSI_DMA_PIO->txf[SCSI_SYNC_SM],
+                &SCSI_DMA_PIO->rxf[SCSI_DATA_SM],
+                0xFFFFFFFF, // Effectively unlimited; the channel is aborted in scsi_accel_rp2040_stopWrite()
+                true
+            );
+
+            // Enable state machines
+            pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_SYNC_SM, true);
+            pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DATA_SM, true);
+            pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_PARITY_SM, true);
+        }
+        
+        dma_channel_set_irq0_enabled(SCSI_DMA_CH_A, true);
+    }
+
+    start_dma_write();
+}
+
+// Check whether a write has finished.
+// With data == NULL this reports whether all writes have completed.
+// Otherwise it reports whether the byte at 'data' has already been picked up by DMA,
+// i.e. it is neither ahead of the DMA read pointer in the current buffer
+// nor inside the queued buffer.
+bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data)
+{
+    // Check if everything has completed
+    if (g_scsi_dma_state == SCSIDMA_IDLE)
+    {
+        return true;
+    }
+    if (g_scsi_dma_state == SCSIDMA_WRITE_DONE)
+    {
+        return true;
+    }
+
+    if (data == NULL)
+    {
+        return false;
+    }
+
+    // Check if this data item is still in queue.
+    __disable_irq();
+    const uint8_t *current_begin = g_scsi_dma.app_buf;
+    const uint8_t *current_end = current_begin + g_scsi_dma.app_bytes;
+    const uint8_t *queued_begin = g_scsi_dma.next_app_buf;
+    const uint8_t *queued_end = queued_begin + g_scsi_dma.next_app_bytes;
+
+    bool pending;
+    if (data >= current_begin && data < current_end &&
+        (uint32_t)data >= dma_hw->ch[SCSI_DMA_CH_A].al1_read_addr)
+    {
+        pending = true; // In current transfer
+    }
+    else
+    {
+        pending = (data >= queued_begin && data < queued_end); // In queued transfer
+    }
+    __enable_irq();
+
+    return !pending;
+}
+
+// Called after DMA has handed over all data: reports whether the PIO side has
+// finished as well, i.e. the FIFOs are drained, the data state machine is done
+// with its last word and ACK of the final byte has been released.
+static bool scsi_accel_rp2040_isWriteDone()
+{
+    // Check if data is still waiting in PIO FIFO
+    const bool fifos_drained =
+        pio_sm_is_tx_fifo_empty(SCSI_DMA_PIO, SCSI_PARITY_SM) &&
+        pio_sm_is_rx_fifo_empty(SCSI_DMA_PIO, SCSI_PARITY_SM) &&
+        pio_sm_is_tx_fifo_empty(SCSI_DMA_PIO, SCSI_DATA_SM);
+    if (!fifos_drained)
+    {
+        return false;
+    }
+
+    bool data_sm_finished;
+    if (g_scsi_dma.syncOffset > 0)
+    {
+        // Check if all bytes of synchronous write have been acknowledged
+        data_sm_finished = (pio_sm_get_rx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM) <= g_scsi_dma.syncOffsetPreload);
+    }
+    else
+    {
+        // Check if state machine has written out its OSR
+        data_sm_finished = (pio_sm_get_pc(SCSI_DMA_PIO, SCSI_DATA_SM) == g_scsi_dma.pio_offset_async_write);
+    }
+    if (!data_sm_finished)
+    {
+        return false;
+    }
+
+    // Check if ACK of the final byte has finished
+    const bool ack_active = SCSI_IN(ACK);
+    return !ack_active;
+}
+
+static void scsi_accel_rp2040_stopWrite(volatile int *resetFlag) // Wait for the PIO to drain, then tear down the write setup and go to SCSIDMA_IDLE
+{
+    // Wait for TX fifo to be empty and ACK to go high
+    // For synchronous writes wait for all ACKs to be received also
+    uint32_t start = millis();
+    while (!scsi_accel_rp2040_isWriteDone() && !*resetFlag)
+    {
+        if ((uint32_t)(millis() - start) > 5000) // Give up after 5 seconds and report it through *resetFlag
+        {
+            azlog("scsi_accel_rp2040_stopWrite() timeout, FIFO levels ",
+                (int)pio_sm_get_tx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM), " ",
+                (int)pio_sm_get_rx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM), " PC ",
+                (int)pio_sm_get_pc(SCSI_DMA_PIO, SCSI_DATA_SM));
+            *resetFlag = 1;
+            break;
+        }
+    }
+
+    dma_channel_abort(SCSI_DMA_CH_A); // The teardown below runs also after a timeout or reset
+    dma_channel_abort(SCSI_DMA_CH_B);
+    dma_channel_abort(SCSI_DMA_CH_C);
+    dma_channel_abort(SCSI_DMA_CH_D);
+    dma_channel_set_irq0_enabled(SCSI_DMA_CH_A, false);
+    g_scsi_dma_state = SCSIDMA_IDLE; // Must be set before scsidma_config_gpio() is called below
+    SCSI_RELEASE_DATA_REQ();
+    scsidma_config_gpio();
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_PARITY_SM, false);
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DATA_SM, false);
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_SYNC_SM, false);
+}
+
+void scsi_accel_rp2040_finishWrite(volatile int *resetFlag) // Block until all started and queued writes have completed and the state is SCSIDMA_IDLE, or *resetFlag is set
+{
+    uint32_t start = millis();
+    while (g_scsi_dma_state != SCSIDMA_IDLE && !*resetFlag)
+    {
+        if ((uint32_t)(millis() - start) > 5000) // Give up after 5 seconds, log the transfer state and report it through *resetFlag
+        {
+            azlog("scsi_accel_rp2040_finishWrite() timeout,"
+             " state: ", (int)g_scsi_dma_state, " ", (int)g_scsi_dma.dma_bytes, "/", (int)g_scsi_dma.app_bytes, ", ", (int)g_scsi_dma.next_app_bytes,
+             " PIO PC: ", (int)pio_sm_get_pc(SCSI_DMA_PIO, SCSI_DATA_SM), " ", (int)pio_sm_get_pc(SCSI_DMA_PIO, SCSI_SYNC_SM),
+             " PIO FIFO: ", (int)pio_sm_get_tx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM), " ", (int)pio_sm_get_tx_fifo_level(SCSI_DMA_PIO, SCSI_SYNC_SM),
+             " DMA counts: ", dma_hw->ch[SCSI_DMA_CH_A].transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_B].transfer_count,
+                         " ", dma_hw->ch[SCSI_DMA_CH_C].transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_D].transfer_count);
+            *resetFlag = 1;
+            break;
+        }
+
+        if (g_scsi_dma_state == SCSIDMA_WRITE_DONE)
+        {
+            // DMA done, wait for PIO to finish also and reconfig GPIO. This sets the state to SCSIDMA_IDLE and so ends the loop.
+            scsi_accel_rp2040_stopWrite(resetFlag);
+        }
+    }
+}
+
+/****************************************/
+/* Accelerated reads from SCSI bus      */
+/****************************************/
+
+// Load the SCSI read state machine with the address of the parity check lookup table.
+// Also sets up DMA channels B, C and D. Only channel C is started here; no state machine is enabled.
+static void config_parity_sm_for_read()
+{
+    // Configure parity check state machine
+    pio_sm_init(SCSI_DMA_PIO, SCSI_PARITY_SM, g_scsi_dma.pio_offset_read_parity, &g_scsi_dma.pio_cfg_read_parity);
+
+    // Load base address to state machine register Y. The table must be 1024-byte aligned, because only the bits above bit 9 are stored in Y.
+    uint32_t addrbase = (uint32_t)&g_scsi_parity_check_lookup[0];
+    assert((addrbase & 0x3FF) == 0);
+    pio_sm_init(SCSI_DMA_PIO, SCSI_DATA_SM, g_scsi_dma.pio_offset_read, &g_scsi_dma.pio_cfg_read);
+    pio_sm_put(SCSI_DMA_PIO, SCSI_DATA_SM, addrbase >> 10);
+    pio_sm_exec(SCSI_DMA_PIO, SCSI_DATA_SM, pio_encode_pull(false, false) | pio_encode_sideset(1, 1)); // Executed immediately: TX FIFO -> OSR
+    pio_sm_exec(SCSI_DMA_PIO, SCSI_DATA_SM, pio_encode_mov(pio_y, pio_osr) | pio_encode_sideset(1, 1)); // Executed immediately: OSR -> Y
+    
+    // For synchronous mode, the REQ pin is driven by SCSI_SYNC_SM, so disable it in SCSI_DATA_SM
+    if (g_scsi_dma.syncOffset > 0)
+    {
+        pio_sm_set_sideset_pins(SCSI_DMA_PIO, SCSI_DATA_SM, 0);
+    }
+
+    // DMA channel B will read g_scsi_parity_check_lookup and write to scsi_read_parity PIO.
+    dma_channel_configure(SCSI_DMA_CH_B,
+        &g_scsi_dma.dmacfg_read_chB,
+        &SCSI_DMA_PIO->txf[SCSI_PARITY_SM],
+        NULL, // Read address is supplied by channel C for every transfer
+        1, false);
+    
+    // DMA channel C will copy addresses from data PIO to DMA channel B read address register.
+    // It is triggered by the data SM RX FIFO request.
+    // This triggers channel B by writing to READ_ADDR_TRIG
+    // Channel B chaining re-enables this channel.
+    dma_channel_configure(SCSI_DMA_CH_C,
+        &g_scsi_dma.dmacfg_read_chC,
+        &dma_hw->ch[SCSI_DMA_CH_B].al3_read_addr_trig,
+        &SCSI_DMA_PIO->rxf[SCSI_DATA_SM],
+        1, true);
+
+    if (g_scsi_dma.syncOffset == 0)
+    {
+        // DMA channel D will copy dummy words to scsi_accel_read PIO to set the number
+        // of bytes to transfer.
+        static const uint32_t dummy = 0;
+        dma_channel_configure(SCSI_DMA_CH_D,
+            &g_scsi_dma.dmacfg_read_chD,
+            &SCSI_DMA_PIO->txf[SCSI_DATA_SM],
+            &dummy,
+            0, false); // Transfer count 0 and not started here; NOTE(review): presumably set up by start_dma_read() - confirm
+    }
+    else
+    {
+        pio_sm_init(SCSI_DMA_PIO, SCSI_SYNC_SM, g_scsi_dma.pio_offset_sync_read_pacer, &g_scsi_dma.pio_cfg_sync_read_pacer);
+
+        // DMA channel D will copy words from scsi_sync_read_pacer to scsi_accel_read PIO
+        // to control the offset between REQ pulses sent and ACK pulses received.
+        dma_channel_configure(SCSI_DMA_CH_D,
+            &g_scsi_dma.dmacfg_read_chD,
+            &SCSI_DMA_PIO->txf[SCSI_DATA_SM],
+            &SCSI_DMA_PIO->rxf[SCSI_SYNC_SM],
+            0, false); // Transfer count 0 and not started here; NOTE(review): presumably set up by start_dma_read() - confirm
+    }
+
+    // Clear PIO IRQ flag that is used to detect parity error (flag 0, cleared by writing 1 to its bit)
+    SCSI_DMA_PIO->irq = 1;
+}
+
+// Start DMA for the remaining part of the current read request, switching to
+// the queued buffer when the current one is exhausted. Sets the state to
+// SCSIDMA_READ_DONE when nothing is left to read.
+// Called by scsi_accel_rp2040_startRead() and by the DMA interrupt handler.
+static void start_dma_read()
+{
+    // Stop the parity and data state machines and flush their FIFOs
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_PARITY_SM, false);
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DATA_SM, false);
+    pio_sm_clear_fifos(SCSI_DMA_PIO, SCSI_PARITY_SM);
+    pio_sm_clear_fifos(SCSI_DMA_PIO, SCSI_DATA_SM);
+
+    // Current buffer fully handed to DMA: promote the queued buffer
+    if (g_scsi_dma.app_bytes <= g_scsi_dma.dma_bytes)
+    {
+        g_scsi_dma.dma_bytes = 0;
+        g_scsi_dma.app_buf = g_scsi_dma.next_app_buf;
+        g_scsi_dma.app_bytes = g_scsi_dma.next_app_bytes;
+        g_scsi_dma.next_app_buf = 0;
+        g_scsi_dma.next_app_bytes = 0;
+    }
+
+    // Nothing left means the request is complete.
+    // From SCSIDMA_READ_DONE the state moves either to IDLE in stopRead()
+    // or back to READ in startRead().
+    uint32_t remaining = g_scsi_dma.app_bytes - g_scsi_dma.dma_bytes;
+    if (remaining == 0)
+    {
+        g_scsi_dma_state = SCSIDMA_READ_DONE;
+        return;
+    }
+
+    if (g_scsi_dma.syncOffset != 0)
+    {
+        // Synchronous mode: load the byte count to register X of the
+        // scsi_sync_read_pacer state machine. The RX FIFO join is switched off
+        // while the TX FIFO is used to pass the value in.
+        pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_SYNC_SM, false);
+        hw_clear_bits(&SCSI_DMA_PIO->sm[SCSI_SYNC_SM].shiftctrl, PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS);
+        pio_sm_put(SCSI_DMA_PIO, SCSI_SYNC_SM, remaining - 1);
+        pio_sm_exec(SCSI_DMA_PIO, SCSI_SYNC_SM, pio_encode_pull(false, false) | pio_encode_sideset(1, 1));
+        pio_sm_exec(SCSI_DMA_PIO, SCSI_SYNC_SM, pio_encode_mov(pio_x, pio_osr) | pio_encode_sideset(1, 1));
+        hw_set_bits(&SCSI_DMA_PIO->sm[SCSI_SYNC_SM].shiftctrl, PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS);
+
+        // Number of FIFO words to prefill to get the correct syncOffset.
+        // At least 1 word is always used to avoid a race between REQ and ACK pulses.
+        int prefill_total = 12 - g_scsi_dma.syncOffset;
+        if (prefill_total < 1)
+        {
+            prefill_total = 1;
+        }
+
+        // The first words (up to 4) go to the SCSI_DATA_SM TX FIFO,
+        // the rest (up to 8) to the SCSI_SYNC_SM RX FIFO.
+        int data_fifo_words = (prefill_total < 4) ? prefill_total : 4;
+        int sync_fifo_words = prefill_total - data_fifo_words;
+        if (sync_fifo_words > 8)
+        {
+            sync_fifo_words = 8;
+        }
+
+        for (int n = 0; n < data_fifo_words; n++)
+        {
+            pio_sm_put(SCSI_DMA_PIO, SCSI_DATA_SM, 0);
+        }
+
+        for (int n = 0; n < sync_fifo_words; n++)
+        {
+            pio_sm_exec(SCSI_DMA_PIO, SCSI_SYNC_SM, pio_encode_push(false, false) | pio_encode_sideset(1, 1));
+        }
+
+        // Put the pacer back to the start of its program
+        pio_sm_exec(SCSI_DMA_PIO, SCSI_SYNC_SM, pio_encode_jmp(g_scsi_dma.pio_offset_sync_read_pacer) | pio_encode_sideset(1, 1));
+    }
+
+    // Channel D transfers one word per byte to the scsi_accel_read state machine:
+    // dummy words in asynchronous mode, pacer words in synchronous mode.
+    dma_channel_set_trans_count(SCSI_DMA_CH_D, remaining, true);
+
+    // Channel A moves the checked bytes to the destination buffer
+    uint8_t *write_ptr = &g_scsi_dma.app_buf[g_scsi_dma.dma_bytes];
+    g_scsi_dma.dma_bytes += remaining;
+    dma_channel_configure(SCSI_DMA_CH_A,
+        &g_scsi_dma.dmacfg_read_chA,
+        write_ptr,
+        &SCSI_DMA_PIO->rxf[SCSI_PARITY_SM],
+        remaining,
+        true
+    );
+
+    // Everything is armed, let the parity and data state machines run
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_PARITY_SM, true);
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DATA_SM, true);
+
+    // In synchronous mode the pacer starts last and begins sending REQ pulses
+    if (g_scsi_dma.syncOffset > 0)
+    {
+        pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_SYNC_SM, true);
+    }
+}
+
+// Queue a read of 'count' bytes from the SCSI bus into 'data'.
+// If a read is already running, the buffer is appended to it or stored as the
+// single queued buffer. Otherwise the previous request is waited for (when
+// needed) and a new transfer is started.
+// parityError and resetFlag are passed on to scsi_accel_rp2040_finishRead()
+// when waiting is required; the function returns early if *resetFlag gets set.
+void scsi_accel_rp2040_startRead(uint8_t *data, uint32_t count, int *parityError, volatile int *resetFlag)
+{
+    // A write request has to be closed with stopWrite() before reading
+    assert(g_scsi_dma_state != SCSIDMA_WRITE && g_scsi_dma_state != SCSIDMA_WRITE_DONE);
+
+    // The DMA interrupt modifies the same bookkeeping, so keep it out while
+    // trying to attach this buffer to the running transfer.
+    bool absorbed = false;
+    __disable_irq();
+    if (g_scsi_dma_state == SCSIDMA_READ)
+    {
+        if (!g_scsi_dma.next_app_buf && data == g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
+        {
+            // Directly continues the buffer being transferred: extend it
+            g_scsi_dma.app_bytes += count;
+            absorbed = true;
+        }
+        else if (data == g_scsi_dma.next_app_buf + g_scsi_dma.next_app_bytes)
+        {
+            // Directly continues the queued buffer: extend that one
+            g_scsi_dma.next_app_bytes += count;
+            absorbed = true;
+        }
+        else if (!g_scsi_dma.next_app_buf)
+        {
+            // Queue slot is free: store as the next buffer
+            g_scsi_dma.next_app_buf = data;
+            g_scsi_dma.next_app_bytes = count;
+            absorbed = true;
+        }
+    }
+    __enable_irq();
+
+    // Done if the request was taken over above, or if there is nothing to read
+    if (absorbed || count == 0)
+    {
+        return;
+    }
+
+    // A transfer that could not take this buffer has to complete first
+    if (g_scsi_dma_state != SCSIDMA_IDLE && g_scsi_dma_state != SCSIDMA_READ_DONE)
+    {
+        scsi_accel_rp2040_finishRead(NULL, 0, parityError, resetFlag);
+        if (*resetFlag)
+        {
+            return;
+        }
+    }
+
+    // Coming from IDLE the pins and state machines still need to be set up;
+    // from READ_DONE the previous configuration is still in place.
+    const bool starting_from_idle = (g_scsi_dma_state == SCSIDMA_IDLE);
+
+    g_scsi_dma_state = SCSIDMA_READ;
+    g_scsi_dma.app_buf = data;
+    g_scsi_dma.app_bytes = count;
+    g_scsi_dma.dma_bytes = 0;
+    g_scsi_dma.next_app_buf = 0;
+    g_scsi_dma.next_app_bytes = 0;
+
+    if (starting_from_idle)
+    {
+        config_parity_sm_for_read();
+        scsidma_config_gpio();
+        dma_channel_set_irq0_enabled(SCSI_DMA_CH_A, true);
+    }
+
+    start_dma_read();
+}
+
+// Check whether the byte at 'data' has been received already.
+// Returns true when no read transfer is active. With data == NULL returns
+// false as long as a transfer is active.
+bool scsi_accel_rp2040_isReadFinished(const uint8_t* data)
+{
+    // No active transfer: all requests have completed
+    const bool transfer_active = (g_scsi_dma_state != SCSIDMA_IDLE && g_scsi_dma_state != SCSIDMA_READ_DONE);
+    if (!transfer_active)
+    {
+        return true;
+    }
+
+    // Without an address the question is whether everything is done, and it is not
+    if (!data)
+    {
+        return false;
+    }
+
+    // Look the address up in the current and the queued buffer.
+    // Interrupts are disabled so that the buffers are not swapped meanwhile.
+    __disable_irq();
+
+    // Inside the current buffer and not yet passed by the DMA write pointer
+    const bool pending_in_current =
+        data >= g_scsi_dma.app_buf &&
+        data < g_scsi_dma.app_buf + g_scsi_dma.app_bytes &&
+        (uint32_t)data >= dma_hw->ch[SCSI_DMA_CH_A].write_addr;
+
+    // Inside the queued buffer, which has not been started at all
+    const bool pending_in_queued =
+        !pending_in_current &&
+        data >= g_scsi_dma.next_app_buf &&
+        data < g_scsi_dma.next_app_buf + g_scsi_dma.next_app_bytes;
+
+    __enable_irq();
+
+    return !(pending_in_current || pending_in_queued);
+}
+
+// End the read operation: abort DMA, return to SCSIDMA_IDLE, hand the pins
+// back to software control and stop the PIO state machines.
+static void scsi_accel_rp2040_stopRead()
+{
+    const unsigned int dma_channels[] = {SCSI_DMA_CH_A, SCSI_DMA_CH_B, SCSI_DMA_CH_C, SCSI_DMA_CH_D};
+    const unsigned int state_machines[] = {SCSI_PARITY_SM, SCSI_DATA_SM, SCSI_SYNC_SM};
+
+    // Abort whatever is still running on the four DMA channels
+    for (unsigned int i = 0; i < sizeof(dma_channels) / sizeof(dma_channels[0]); i++)
+    {
+        dma_channel_abort(dma_channels[i]);
+    }
+    dma_channel_set_irq0_enabled(SCSI_DMA_CH_A, false);
+
+    // scsidma_config_gpio() selects the pin functions based on the state,
+    // so the state is set to IDLE before calling it.
+    g_scsi_dma_state = SCSIDMA_IDLE;
+    SCSI_RELEASE_DATA_REQ();
+    scsidma_config_gpio();
+
+    // Finally stop all three state machines
+    for (unsigned int i = 0; i < sizeof(state_machines) / sizeof(state_machines[0]); i++)
+    {
+        pio_sm_set_enabled(SCSI_DMA_PIO, state_machines[i], false);
+    }
+}
+
+// Wait until the requested part of the read has been received.
+// data, count: region to wait for; with data == NULL waits for all read requests.
+// parityError: if not NULL, set to true when the PIO parity error flag is up.
+// resetFlag: waiting ends when it is non-zero; it is set to 1 here if the
+//            wait takes longer than 5 seconds.
+// The bus is released when the request has completed or the reset flag is set.
+void scsi_accel_rp2040_finishRead(const uint8_t *data, uint32_t count, int *parityError, volatile int *resetFlag)
+{
+    uint32_t wait_started = millis();
+
+    // Only the last byte of the region is polled
+    const uint8_t *last_byte = NULL;
+    if (data)
+    {
+        last_byte = data + count - 1;
+    }
+
+    for (;;)
+    {
+        if (scsi_accel_rp2040_isReadFinished(last_byte) || *resetFlag)
+        {
+            break;
+        }
+
+        if ((uint32_t)(millis() - wait_started) > 5000)
+        {
+            // Dump the transfer state to the log and request a reset
+            azlog("scsi_accel_rp2040_finishRead timeout,"
+             " state: ", (int)g_scsi_dma_state, " ", (int)g_scsi_dma.dma_bytes, "/", (int)g_scsi_dma.app_bytes, ", ", (int)g_scsi_dma.next_app_bytes,
+             " PIO PC: ", (int)pio_sm_get_pc(SCSI_DMA_PIO, SCSI_DATA_SM), " ", (int)pio_sm_get_pc(SCSI_DMA_PIO, SCSI_SYNC_SM),
+             " PIO FIFO: ", (int)pio_sm_get_rx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM), " ", (int)pio_sm_get_tx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM),
+             " DMA counts: ", dma_hw->ch[SCSI_DMA_CH_A].transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_B].transfer_count,
+                         " ", dma_hw->ch[SCSI_DMA_CH_C].transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_D].transfer_count);
+            *resetFlag = 1;
+            break;
+        }
+    }
+
+    // Last buffer received, or the transfer is being abandoned: release the bus
+    if (g_scsi_dma_state == SCSIDMA_READ_DONE || *resetFlag)
+    {
+        scsi_accel_rp2040_stopRead();
+    }
+
+    // Report parity errors detected so far during the transfer
+    if (parityError != NULL)
+    {
+        if (SCSI_DMA_PIO->irq & 1)
+        {
+            azdbg("scsi_accel_rp2040_finishRead(", bytearray(data, count), ") detected parity error");
+            *parityError = true;
+        }
+    }
+}
+
+/*******************************************************/
+/* Initialization functions common to read/write       */
+/*******************************************************/
+
+// DMA_IRQ_0 handler. Acknowledges the channel A interrupt and continues the
+// transfer that is in progress.
+static void scsi_dma_irq()
+{
+    // Clear the interrupt flag of channel A
+    dma_hw->ints0 = (1 << SCSI_DMA_CH_A);
+
+    scsidma_state_t current = g_scsi_dma_state;
+    switch (current)
+    {
+        case SCSIDMA_WRITE:
+            // Continue with the next write buffer, or go to SCSIDMA_WRITE_DONE
+            start_dma_write();
+            break;
+
+        case SCSIDMA_READ:
+            // Continue with the next read buffer, or go to SCSIDMA_READ_DONE
+            start_dma_read();
+            break;
+
+        default:
+            // No transfer to continue in other states
+            break;
+    }
+}
+
+// Route the SCSI data and REQ pins either to the PIO peripheral or to
+// software controlled SIO, depending on g_scsi_dma_state.
+// States other than IDLE, WRITE and READ leave the pins untouched.
+static void scsidma_config_gpio()
+{
+    // Data bus pins in the order their function select registers are written
+    static const unsigned int data_pins[] = {
+        SCSI_IO_DB0, SCSI_IO_DB1, SCSI_IO_DB2, SCSI_IO_DB3,
+        SCSI_IO_DB4, SCSI_IO_DB5, SCSI_IO_DB6, SCSI_IO_DB7,
+        SCSI_IO_DBP
+    };
+
+    uint32_t data_function;
+    uint32_t req_function;
+
+    if (g_scsi_dma_state == SCSIDMA_IDLE)
+    {
+        // Everything under software control
+        data_function = GPIO_FUNC_SIO;
+        req_function = GPIO_FUNC_SIO;
+    }
+    else if (g_scsi_dma_state == SCSIDMA_WRITE)
+    {
+        // All pins start as outputs in high state
+        pio_sm_set_pins(SCSI_DMA_PIO, SCSI_DATA_SM, 0x201FF);
+        pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 0, 9, true);
+        pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 17, 1, true);
+
+        data_function = GPIO_FUNC_PIO0;
+        req_function = GPIO_FUNC_PIO0;
+    }
+    else if (g_scsi_dma_state == SCSIDMA_READ)
+    {
+        // Data bus is input in both read modes. The REQ output belongs to
+        // SCSI_DATA_SM in asynchronous mode and to SCSI_SYNC_SM in synchronous mode.
+        if (g_scsi_dma.syncOffset == 0)
+        {
+            pio_sm_set_pins(SCSI_DMA_PIO, SCSI_DATA_SM, 0x201FF);
+            pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 0, 9, false);
+            pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 17, 1, true);
+        }
+        else
+        {
+            pio_sm_set_pins(SCSI_DMA_PIO, SCSI_SYNC_SM, 0x201FF);
+            pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 0, 9, false);
+            pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_SYNC_SM, 17, 1, true);
+        }
+
+        // Data pins are selected to SIO, only REQ is selected to PIO
+        data_function = GPIO_FUNC_SIO;
+        req_function = GPIO_FUNC_PIO0;
+    }
+    else
+    {
+        return;
+    }
+
+    // Apply the function selection: data pins first, REQ last
+    for (unsigned int i = 0; i < sizeof(data_pins) / sizeof(data_pins[0]); i++)
+    {
+        iobank0_hw->io[data_pins[i]].ctrl = data_function;
+    }
+    iobank0_hw->io[SCSI_OUT_REQ].ctrl = req_function;
+}
+
+void scsi_accel_rp2040_init() // Put the accelerator to idle state, load all PIO programs and precompute the PIO and DMA configurations
+{
+    g_scsi_dma_state = SCSIDMA_IDLE;
+    scsidma_config_gpio();
+
+    // Claim the state machines and DMA channels on the first call only, g_channels_claimed guards repeated calls
+    if (!g_channels_claimed)
+    {
+        pio_sm_claim(SCSI_DMA_PIO, SCSI_PARITY_SM);
+        pio_sm_claim(SCSI_DMA_PIO, SCSI_DATA_SM);
+        pio_sm_claim(SCSI_DMA_PIO, SCSI_SYNC_SM);
+        dma_channel_claim(SCSI_DMA_CH_A);
+        dma_channel_claim(SCSI_DMA_CH_B);
+        dma_channel_claim(SCSI_DMA_CH_C);
+        dma_channel_claim(SCSI_DMA_CH_D);
+        g_channels_claimed = true;
+    }
+
+    // Load PIO programs to cleared instruction memory. The offset and config of each program is stored in g_scsi_dma for later use.
+    pio_clear_instruction_memory(SCSI_DMA_PIO);
+    
+    // Parity lookup generator
+    g_scsi_dma.pio_offset_parity = pio_add_program(SCSI_DMA_PIO, &scsi_parity_program);
+    g_scsi_dma.pio_cfg_parity = scsi_parity_program_get_default_config(g_scsi_dma.pio_offset_parity);
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_parity, true, false, 32);
+    sm_config_set_in_shift(&g_scsi_dma.pio_cfg_parity, true, true, 32);
+
+    // Asynchronous SCSI write
+    g_scsi_dma.pio_offset_async_write = pio_add_program(SCSI_DMA_PIO, &scsi_accel_async_write_program);
+    g_scsi_dma.pio_cfg_async_write = scsi_accel_async_write_program_get_default_config(g_scsi_dma.pio_offset_async_write);
+    sm_config_set_out_pins(&g_scsi_dma.pio_cfg_async_write, SCSI_IO_DB0, 9);
+    sm_config_set_sideset_pins(&g_scsi_dma.pio_cfg_async_write, SCSI_OUT_REQ);
+    sm_config_set_fifo_join(&g_scsi_dma.pio_cfg_async_write, PIO_FIFO_JOIN_TX);
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_async_write, true, false, 32);
+
+    // Synchronous SCSI write pacer / ACK handler
+    g_scsi_dma.pio_offset_sync_write_pacer = pio_add_program(SCSI_DMA_PIO, &scsi_sync_write_pacer_program);
+    g_scsi_dma.pio_cfg_sync_write_pacer = scsi_sync_write_pacer_program_get_default_config(g_scsi_dma.pio_offset_sync_write_pacer);
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_sync_write_pacer, true, true, 1);
+
+    // Synchronous SCSI data writer
+    g_scsi_dma.pio_offset_sync_write = pio_add_program(SCSI_DMA_PIO, &scsi_sync_write_program);
+    g_scsi_dma.pio_cfg_sync_write = scsi_sync_write_program_get_default_config(g_scsi_dma.pio_offset_sync_write);
+    sm_config_set_out_pins(&g_scsi_dma.pio_cfg_sync_write, SCSI_IO_DB0, 9);
+    sm_config_set_sideset_pins(&g_scsi_dma.pio_cfg_sync_write, SCSI_OUT_REQ);
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_sync_write, true, true, 32);
+    sm_config_set_in_shift(&g_scsi_dma.pio_cfg_sync_write, true, true, 1);
+
+    // Asynchronous / synchronous SCSI read
+    g_scsi_dma.pio_offset_read = pio_add_program(SCSI_DMA_PIO, &scsi_accel_read_program);
+    g_scsi_dma.pio_cfg_read = scsi_accel_read_program_get_default_config(g_scsi_dma.pio_offset_read);
+    sm_config_set_in_pins(&g_scsi_dma.pio_cfg_read, SCSI_IO_DB0);
+    sm_config_set_sideset_pins(&g_scsi_dma.pio_cfg_read, SCSI_OUT_REQ);
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_read, true, false, 32);
+    sm_config_set_in_shift(&g_scsi_dma.pio_cfg_read, true, true, 32);
+
+    // Synchronous SCSI read pacer
+    g_scsi_dma.pio_offset_sync_read_pacer = pio_add_program(SCSI_DMA_PIO, &scsi_sync_read_pacer_program);
+    g_scsi_dma.pio_cfg_sync_read_pacer = scsi_sync_read_pacer_program_get_default_config(g_scsi_dma.pio_offset_sync_read_pacer);
+    sm_config_set_sideset_pins(&g_scsi_dma.pio_cfg_sync_read_pacer, SCSI_OUT_REQ);
+
+    // Read parity check
+    g_scsi_dma.pio_offset_read_parity = pio_add_program(SCSI_DMA_PIO, &scsi_read_parity_program);
+    g_scsi_dma.pio_cfg_read_parity = scsi_read_parity_program_get_default_config(g_scsi_dma.pio_offset_read_parity);
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_read_parity, true, true, 32);
+    sm_config_set_in_shift(&g_scsi_dma.pio_cfg_read_parity, true, false, 32);
+
+    // Create DMA channel configurations so they can be applied quickly later
+    
+    // For write to SCSI BUS:
+    // Channel A: Bytes from RAM to scsi_parity PIO
+    dma_channel_config cfg = dma_channel_get_default_config(SCSI_DMA_CH_A);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_8);
+    channel_config_set_read_increment(&cfg, true);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_PARITY_SM, true));
+    g_scsi_dma.dmacfg_write_chA = cfg;
+
+    // Channel B: Addresses from scsi_parity PIO to lookup DMA READ_ADDR register
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_B);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_PARITY_SM, false));
+    g_scsi_dma.dmacfg_write_chB = cfg;
+
+    // Channel C: Lookup from g_scsi_parity_lookup and copy to scsi_accel_async_write or scsi_sync_write PIO
+    // When done, chain to channel B
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_C);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_16);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DATA_SM, true));
+    channel_config_set_chain_to(&cfg, SCSI_DMA_CH_B);
+    g_scsi_dma.dmacfg_write_chC = cfg;
+
+    // Channel D: In synchronous mode a second DMA channel is used to transfer dummy bits
+    // from first state machine to second one.
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_D);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_SYNC_SM, true));
+    g_scsi_dma.dmacfg_write_chD = cfg;
+
+    // For read from SCSI BUS:
+    // Channel A: Bytes from scsi_read_parity PIO to destination memory buffer
+    // This takes the bottom 8 bits which is the data without parity bit.
+    // Triggered by scsi_read_parity RX FIFO.
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_A);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_8);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, true);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_PARITY_SM, false));
+    g_scsi_dma.dmacfg_read_chA = cfg;
+
+    // Channel B: Lookup from g_scsi_parity_check_lookup and copy to scsi_read_parity PIO
+    // Triggered by channel C writing to READ_ADDR_TRIG
+    // Re-enables channel C by chaining after done.
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_B);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_16);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, DREQ_FORCE);
+    channel_config_set_chain_to(&cfg, SCSI_DMA_CH_C);
+    cfg.ctrl |= DMA_CH0_CTRL_TRIG_HIGH_PRIORITY_BITS;
+    g_scsi_dma.dmacfg_read_chB = cfg;
+
+    // Channel C: Addresses from scsi_read PIO to channel B READ_ADDR register
+    // A single transfer starts when PIO RX FIFO has data.
+    // The DMA channel is re-enabled by channel B chaining.
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_C);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DATA_SM, false));
+    g_scsi_dma.dmacfg_read_chC = cfg;
+
+    // Channel D: In synchronous mode a second DMA channel is used to transfer dummy words
+    // from first state machine to second one to control the pace of data transfer.
+    // In asynchronous mode this just transfers words to control the number of bytes.
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_D);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DATA_SM, true));
+    g_scsi_dma.dmacfg_read_chD = cfg;
+    
+    // Interrupts are used for data buffer swapping: scsi_dma_irq() continues the transfer with the next buffer
+    irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_irq);
+    irq_set_enabled(DMA_IRQ_0, true);
+}
+
+// Select asynchronous (syncOffset == 0) or synchronous (syncOffset > 0) transfer mode.
+// For synchronous mode this computes the FIFO preload and divider for the
+// given offset and patches the timing delays for the given period into the
+// scsi_sync_write and scsi_sync_read_pacer programs in PIO instruction memory.
+// Must be called in SCSIDMA_IDLE state. Does nothing if the mode is unchanged.
+void scsi_accel_rp2040_setSyncMode(int syncOffset, int syncPeriod)
+{
+    assert(g_scsi_dma_state == SCSIDMA_IDLE);
+
+    // Requested mode is already in effect
+    if (syncOffset == g_scsi_dma.syncOffset && syncPeriod == g_scsi_dma.syncPeriod)
+    {
+        return;
+    }
+
+    g_scsi_dma.syncOffset = syncOffset;
+    g_scsi_dma.syncPeriod = syncPeriod;
+
+    // Asynchronous mode needs no further setup
+    if (syncOffset <= 0)
+    {
+        return;
+    }
+
+    // Offset handling: the RX fifo of scsi_sync_write has 4 slots. It is
+    // preloaded with 0-3 items and the autopush threshold acts as a divider,
+    // so that larger offsets can be covered. Dividers 1, 2 and 4 are used here.
+    // SCSI2SD code currently only uses offsets up to 15.
+    int divider;
+    int preload;
+    if (syncOffset <= 4)
+    {
+        divider = 1;
+        preload = 5 - syncOffset;
+    }
+    else if (syncOffset <= 8)
+    {
+        divider = 2;
+        preload = 5 - syncOffset / 2;
+    }
+    else if (syncOffset <= 16)
+    {
+        divider = 4;
+        preload = 5 - syncOffset / 4;
+    }
+    else
+    {
+        divider = 4;
+        preload = 0;
+    }
+
+    // One FIFO slot has to stay vacant in order to detect
+    // when all bytes have been ACKed.
+    if (preload > 3)
+    {
+        preload = 3;
+    }
+
+    g_scsi_dma.syncOffsetDivider = divider;
+    g_scsi_dma.syncOffsetPreload = preload;
+
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_sync_write_pacer, true, true, g_scsi_dma.syncOffsetDivider);
+    sm_config_set_in_shift(&g_scsi_dma.pio_cfg_sync_write, true, true, g_scsi_dma.syncOffsetDivider);
+
+    // Timing of the three instructions of the scsi_sync_write PIO program.
+    // Delays are in clock cycles of 8 ns each.
+    //   delay0: from data write to REQ assertion
+    //   delay1: from REQ assert to REQ deassert
+    //   delay2: from REQ deassert to data write
+    int totalDelay = syncPeriod * 4 / 8;
+    int delay0;
+    int delay1;
+
+    if (syncPeriod <= 25)
+    {
+        // Fast SCSI timing: 30 ns assertion period, 25 ns skew delay.
+        // Hardware rise and fall times need some extra delay, these values
+        // have been tuned based on oscilloscope measurements.
+        delay0 = 3;
+        delay1 = 5;
+    }
+    else
+    {
+        // Slow SCSI timing: 90 ns assertion period, 55 ns skew delay
+        delay0 = 6;
+        delay1 = 12;
+    }
+
+    // The rest of the period goes to delay2, limited to the range 0..15
+    int delay2 = totalDelay - delay0 - delay1 - 3;
+    if (delay2 < 0)
+    {
+        delay2 = 0;
+    }
+    if (delay2 > 15)
+    {
+        delay2 = 15;
+    }
+
+    // Write the instructions with delays to the loaded scsi_sync_write program.
+    // This requires that the code in scsi_accel.pio has the delays set to 0.
+    uint16_t instr0 = scsi_sync_write_program_instructions[0] | pio_encode_delay(delay0);
+    uint16_t instr1 = scsi_sync_write_program_instructions[1] | pio_encode_delay(delay1);
+    uint16_t instr2 = scsi_sync_write_program_instructions[2] | pio_encode_delay(delay2);
+    SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_write + 0] = instr0;
+    SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_write + 1] = instr1;
+    SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_write + 2] = instr2;
+
+    // Same kind of patching for scsi_sync_read_pacer, its first delay is limited to 5..15.
+    // The program offset is added to the second instruction before it is written.
+    int rdelay2 = totalDelay - delay1 - 2;
+    if (rdelay2 > 15)
+    {
+        rdelay2 = 15;
+    }
+    if (rdelay2 < 5)
+    {
+        rdelay2 = 5;
+    }
+    uint16_t rinstr0 = scsi_sync_read_pacer_program_instructions[0] | pio_encode_delay(rdelay2);
+    uint16_t rinstr1 = (scsi_sync_read_pacer_program_instructions[1] + g_scsi_dma.pio_offset_sync_read_pacer) | pio_encode_delay(delay1);
+    SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_read_pacer + 0] = rinstr0;
+    SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_read_pacer + 1] = rinstr1;
+}

+ 44 - 0
lib/ZuluSCSI_platform_BS2/scsi_accel_rp2040.h

@@ -0,0 +1,44 @@
+// Accelerated SCSI subroutines using RP2040 hardware PIO peripheral.
+
+#pragma once
+
+#include <stdint.h>
+
+void scsi_accel_rp2040_init();
+
+// Set SCSI access mode for synchronous transfers
+// Setting syncOffset = 0 enables asynchronous SCSI.
+// Setting syncOffset > 0 enables synchronous SCSI.
+void scsi_accel_rp2040_setSyncMode(int syncOffset, int syncPeriod);
+
+// Queue a request to write data from the buffer to SCSI bus.
+// This function typically returns immediately and the request will complete in background.
+// If there are too many queued requests, this function will block until previous request finishes.
+void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile int *resetFlag);
+
+// Query whether the data at pointer has already been read, i.e. buffer can be reused.
+// If data is NULL, checks if all writes have completed.
+bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data);
+
+// Wait for all write requests to finish and release the bus.
+// If resetFlag is non-zero, aborts write immediately.
+void scsi_accel_rp2040_finishWrite(volatile int *resetFlag);
+
+// Queue a request to read data from SCSI bus to the buffer.
+// This function typically returns immediately and the request will complete in background.
+// If there are too many queued requests, this function will block until previous request finishes.
+void scsi_accel_rp2040_startRead(uint8_t *data, uint32_t count, int *parityError, volatile int *resetFlag);
+
+// Query whether data at address is part of a queued read request.
+// Returns true if there is no outstanding request.
+// If data is NULL, checks if all reads have completed.
+bool scsi_accel_rp2040_isReadFinished(const uint8_t* data);
+
+// Wait for a read request to complete.
+// If data is not NULL, waits only until the data at data[0] .. data[count-1] is valid.
+// If data is NULL, waits for all read requests to complete.
+// If there are no further read requests, releases the bus.
+// If resetFlag is non-zero, aborts read immediately.
+// If a parity error has been noticed in any buffer since starting the read, parityError is set to 1.
+void scsi_accel_rp2040_finishRead(const uint8_t *data, uint32_t count, int *parityError, volatile int *resetFlag);
+

+ 497 - 0
lib/ZuluSCSI_platform_BS2/sd_card_sdio.cpp

@@ -0,0 +1,497 @@
+// Driver for accessing SD card in SDIO mode on RP2040.
+
+#include "ZuluSCSI_platform.h"
+
+#ifdef SD_USE_SDIO
+
+#include "ZuluSCSI_log.h"
+#include "rp2040_sdio.h"
+#include <hardware/gpio.h>
+#include <SdFat.h>
+#include <SdCard/SdCardInfo.h>
+
+static uint32_t g_sdio_ocr; // Operating condition register from card
+static uint32_t g_sdio_rca; // Relative card address
+static cid_t g_sdio_cid;
+static csd_t g_sdio_csd;
+static int g_sdio_error_line;
+static sdio_status_t g_sdio_error;
+static uint32_t g_sdio_dma_buf[128];
+static uint32_t g_sdio_sector_count;
+
+// Evaluates an SDIO call, stores its status in g_sdio_error and yields true when
+// the status is SDIO_OK. Otherwise it yields the result of logSDError(__LINE__),
+// i.e. false, after the failing source line has been recorded and logged.
+#define checkReturnOk(call) ((g_sdio_error = (call)) == SDIO_OK ? true : logSDError(__LINE__))
+// Records the source line of a failed SDIO call in g_sdio_error_line, logs it
+// together with the current g_sdio_error code, and always returns false.
+static bool logSDError(int line)
+{
+    g_sdio_error_line = line;
+    azlog("SDIO SD card error on line ", line, ", error code ", (int)g_sdio_error);
+    return false;
+}
+
+// Callback used by SCSI code for simultaneous processing
+static sd_callback_t m_stream_callback;
+static const uint8_t *m_stream_buffer;
+static uint32_t m_stream_count;
+static uint32_t m_stream_count_start;
+
+// Registers the progress callback and the buffer it is associated with,
+// and resets both streaming byte counters to zero.
+// The read/write functions only hand the callback out when their transfer
+// buffer continues at m_stream_buffer + m_stream_count (see get_stream_callback()).
+void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer)
+{
+    m_stream_callback = func;
+    m_stream_buffer = buffer;
+    m_stream_count = 0;
+    m_stream_count_start = 0;
+}
+
+// Decide whether the registered progress callback applies to a transfer.
+//
+// buf/count describe the transfer about to start; accesstype and sector are
+// only used for the debug message. The callback is returned (and the stream
+// position advanced by count) only when buf is exactly the next expected
+// position in the registered stream buffer. Otherwise NULL is returned.
+// m_stream_count_start is always set to the stream position before this transfer.
+static sd_callback_t get_stream_callback(const uint8_t *buf, uint32_t count, const char *accesstype, uint32_t sector)
+{
+    m_stream_count_start = m_stream_count;
+
+    // No callback registered: nothing to report to.
+    if (!m_stream_callback)
+    {
+        return NULL;
+    }
+
+    const uint8_t *expected = m_stream_buffer + m_stream_count;
+    if (buf != expected)
+    {
+        azdbg("SD card ", accesstype, "(", (int)sector,
+              ") slow transfer, buffer", (uint32_t)buf, " vs. ", (uint32_t)expected);
+        return NULL;
+    }
+
+    m_stream_count += count;
+    return m_stream_callback;
+}
+
+// Bring up the SD card over SDIO: reset and interface-condition check (CMD0/CMD8),
+// ACMD41 initialization, CID / RCA / CSD readout, card select and switch to
+// 4-bit bus width, then raise the clock.
+//
+// Returns true on success. A card that does not answer CMD8 makes the function
+// return false without logging; later failures are recorded in g_sdio_error /
+// g_sdio_error_line through checkReturnOk(). The sdioConfig argument is not used.
+bool SdioCard::begin(SdioConfig sdioConfig)
+{
+    uint32_t reply;
+    sdio_status_t status;
+
+    // Initialize at 1 MHz clock speed.
+    // NOTE(review): the argument looks like a clock divider (25 here, 1 at the
+    // end of this function) - confirm against rp2040_sdio.h.
+    rp2040_sdio_init(25);
+
+    // Establish initial connection with the card
+    for (int retries = 0; retries < 5; retries++)
+    {
+        delayMicroseconds(1000);
+        reply = 0;
+        rp2040_sdio_command_R1(CMD0, 0, NULL); // GO_IDLE_STATE
+        status = rp2040_sdio_command_R1(CMD8, 0x1AA, &reply); // SEND_IF_COND
+
+        if (status == SDIO_OK && reply == 0x1AA)
+        {
+            break;
+        }
+    }
+
+    if (reply != 0x1AA || status != SDIO_OK)
+    {
+        // azdbg("SDIO not responding to CMD8 SEND_IF_COND, status ", (int)status, " reply ", reply);
+        return false;
+    }
+
+    // Send ACMD41 to begin card initialization and wait for it to complete
+    uint32_t start = millis();
+    do {
+        if (!checkReturnOk(rp2040_sdio_command_R1(CMD55, 0, &reply)) || // APP_CMD
+            !checkReturnOk(rp2040_sdio_command_R3(ACMD41, 0xD0040000, &g_sdio_ocr))) // 3.0V voltage
+            // !checkReturnOk(rp2040_sdio_command_R1(ACMD41, 0xC0100000, &g_sdio_ocr)))
+        {
+            return false;
+        }
+
+        // Unsigned subtraction keeps the 1 second timeout correct across millis() wraparound.
+        if ((uint32_t)(millis() - start) > 1000)
+        {
+            azlog("SDIO card initialization timeout");
+            return false;
+        }
+    // Repeat until OCR bit 31 is set. The mask is built from an unsigned
+    // constant so that the shift never reaches the sign bit of a signed int.
+    } while (!(g_sdio_ocr & (1UL << 31)));
+
+    // Get CID
+    if (!checkReturnOk(rp2040_sdio_command_R2(CMD2, 0, (uint8_t*)&g_sdio_cid)))
+    {
+        azdbg("SDIO failed to read CID");
+        return false;
+    }
+
+    // Get relative card address
+    if (!checkReturnOk(rp2040_sdio_command_R1(CMD3, 0, &g_sdio_rca)))
+    {
+        azdbg("SDIO failed to get RCA");
+        return false;
+    }
+
+    // Get CSD
+    if (!checkReturnOk(rp2040_sdio_command_R2(CMD9, g_sdio_rca, (uint8_t*)&g_sdio_csd)))
+    {
+        azdbg("SDIO failed to read CSD");
+        return false;
+    }
+
+    // Cache the card size; readSectors() uses it to detect end-of-drive reads.
+    g_sdio_sector_count = sectorCount();
+
+    // Select card
+    if (!checkReturnOk(rp2040_sdio_command_R1(CMD7, g_sdio_rca, &reply)))
+    {
+        azdbg("SDIO failed to select card");
+        return false;
+    }
+
+    // Set 4-bit bus mode
+    if (!checkReturnOk(rp2040_sdio_command_R1(CMD55, g_sdio_rca, &reply)) ||
+        !checkReturnOk(rp2040_sdio_command_R1(ACMD6, 2, &reply)))
+    {
+        azdbg("SDIO failed to set bus width");
+        return false;
+    }
+
+    // Increase to 25 MHz clock rate
+    rp2040_sdio_init(1);
+
+    return true;
+}
+
+// Returns the status stored by the most recent SDIO operation (g_sdio_error),
+// converted to uint8_t.
+uint8_t SdioCard::errorCode() const
+{
+    return g_sdio_error;
+}
+
+// No additional error data is tracked by this driver; always returns 0.
+uint32_t SdioCard::errorData() const
+{
+    return 0;
+}
+
+// Returns the source line recorded by logSDError() for the last failed
+// checkReturnOk() call.
+uint32_t SdioCard::errorLine() const
+{
+    return g_sdio_error_line;
+}
+
+// Reports the card as busy while the SDIO_D0 input pin reads low.
+bool SdioCard::isBusy() 
+{
+    return (sio_hw->gpio_in & (1 << SDIO_D0)) == 0;
+}
+
+// The SD clock rate is not reported by this driver; always returns 0.
+uint32_t SdioCard::kHzSdClk()
+{
+    return 0;
+}
+
+// Copies the CID that begin() stored in g_sdio_cid into *cid.
+// Does not talk to the card; always returns true.
+bool SdioCard::readCID(cid_t* cid)
+{
+    *cid = g_sdio_cid;
+    return true;
+}
+
+// Copies the CSD that begin() stored in g_sdio_csd into *csd.
+// Does not talk to the card; always returns true.
+bool SdioCard::readCSD(csd_t* csd)
+{
+    *csd = g_sdio_csd;
+    return true;
+}
+
+// Despite its name, this stores the CMD13 reply (card status) in *ocr rather
+// than the OCR register. Returns true when the command succeeded.
+bool SdioCard::readOCR(uint32_t* ocr)
+{
+    // SDIO mode does not have CMD58, but main program uses this to
+    // poll for card presence. Return status register instead.
+    return checkReturnOk(rp2040_sdio_command_R1(CMD13, g_sdio_rca, ocr));
+}
+
+// Not implemented in this driver: logs a message and returns false.
+bool SdioCard::readData(uint8_t* dst)
+{
+    azlog("SdioCard::readData() called but not implemented!");
+    return false;
+}
+
+// Not implemented in this driver: logs a message and returns false.
+bool SdioCard::readStart(uint32_t sector)
+{
+    azlog("SdioCard::readStart() called but not implemented!");
+    return false;
+}
+
+// Not implemented in this driver: logs a message and returns false.
+bool SdioCard::readStop()
+{
+    azlog("SdioCard::readStop() called but not implemented!");
+    return false;
+}
+
+// Returns the card size as computed by csd_t::capacity() from the CSD that
+// begin() stored in g_sdio_csd.
+uint32_t SdioCard::sectorCount()
+{
+    return g_sdio_csd.capacity();
+}
+
+// Query the card status with CMD13.
+// Returns the reply word on success, or 0 when the command failed
+// (the failure is recorded through checkReturnOk()).
+uint32_t SdioCard::status()
+{
+    uint32_t card_status;
+    const bool ok = checkReturnOk(rp2040_sdio_command_R1(CMD13, g_sdio_rca, &card_status));
+    if (!ok)
+    {
+        return 0;
+    }
+    return card_status;
+}
+
+// Send CMD12 to end a multi-block transfer.
+//
+// blocking == false: returns as soon as the command has been accepted.
+// blocking == true:  additionally polls isBusy() for up to 100 ms, invoking the
+//                    stream callback (if any) with the current stream position
+//                    while waiting. Returns false and logs a message if the
+//                    card is still busy afterwards.
+// Returns false immediately if CMD12 itself fails.
+bool SdioCard::stopTransmission(bool blocking)
+{
+    uint32_t reply;
+    if (!checkReturnOk(rp2040_sdio_command_R1(CMD12, 0, &reply)))
+    {
+        return false;
+    }
+
+    if (!blocking)
+    {
+        return true;
+    }
+
+    // Measure elapsed time with unsigned subtraction instead of comparing
+    // against an absolute deadline: "millis() < start + 100" gives a wrong
+    // result when the millisecond counter (or the deadline) wraps around.
+    const uint32_t start = millis();
+    while ((uint32_t)(millis() - start) < 100 && isBusy())
+    {
+        if (m_stream_callback)
+        {
+            m_stream_callback(m_stream_count);
+        }
+    }
+
+    if (isBusy())
+    {
+        azlog("SdioCard::stopTransmission() timeout");
+        return false;
+    }
+
+    return true;
+}
+
+// Performs no work in this driver; always returns true.
+bool SdioCard::syncDevice()
+{
+    return true;
+}
+
+// Card type derived from the OCR value read during begin():
+// SD_CARD_TYPE_SDHC when OCR bit 30 is set, SD_CARD_TYPE_SD2 otherwise.
+uint8_t SdioCard::type() const
+{
+    const bool bit30_set = (g_sdio_ocr & (1 << 30)) != 0;
+    return bit30_set ? SD_CARD_TYPE_SDHC : SD_CARD_TYPE_SD2;
+}
+
+// Not implemented in this driver: logs a message and returns false.
+bool SdioCard::writeData(const uint8_t* src)
+{
+    azlog("SdioCard::writeData() called but not implemented!");
+    return false;
+}
+
+// Not implemented in this driver: logs a message and returns false.
+bool SdioCard::writeStart(uint32_t sector)
+{
+    azlog("SdioCard::writeStart() called but not implemented!");
+    return false;
+}
+
+// Not implemented in this driver: logs a message and returns false.
+bool SdioCard::writeStop()
+{
+    azlog("SdioCard::writeStop() called but not implemented!");
+    return false;
+}
+
+// Not implemented in this driver: logs a message and returns false.
+bool SdioCard::erase(uint32_t firstSector, uint32_t lastSector)
+{
+    azlog("SdioCard::erase() not implemented");
+    return false;
+}
+
+// Not implemented in this driver: logs a message and returns false.
+bool SdioCard::cardCMD6(uint32_t arg, uint8_t* status) {
+    azlog("SdioCard::cardCMD6() not implemented");
+    return false;
+}
+
+// Not implemented in this driver: logs a message and returns false.
+bool SdioCard::readSCR(scr_t* scr) {
+    azlog("SdioCard::readSCR() not implemented");
+    return false;
+}
+
+/* Writing and reading, with progress callback */
+
+// Write one 512-byte sector to the card.
+//
+// A source buffer that is not 4-byte aligned is first copied into the aligned
+// bounce buffer g_sdio_dma_buf. While the transfer is polled, the stream
+// callback (if one applies to this buffer) is told how many bytes are done.
+// Returns true when the transfer ended with SDIO_OK; failures are logged.
+bool SdioCard::writeSector(uint32_t sector, const uint8_t* src)
+{
+    const bool misaligned = ((uint32_t)src & 3) != 0;
+    if (misaligned)
+    {
+        // Transfer from the aligned bounce buffer instead of the caller's buffer.
+        memcpy(g_sdio_dma_buf, src, sizeof(g_sdio_dma_buf));
+        src = (uint8_t*)g_sdio_dma_buf;
+    }
+
+    // Progress reporting is only possible when src continues the registered stream buffer.
+    sd_callback_t progress_cb = get_stream_callback(src, 512, "writeSector", sector);
+
+    uint32_t reply;
+    if (!checkReturnOk(rp2040_sdio_command_R1(16, 512, &reply))) // SET_BLOCKLEN
+    {
+        return false;
+    }
+    if (!checkReturnOk(rp2040_sdio_command_R1(CMD24, sector, &reply))) // WRITE_BLOCK
+    {
+        return false;
+    }
+    if (!checkReturnOk(rp2040_sdio_tx_start(src, 1))) // Start transmission
+    {
+        return false;
+    }
+
+    // Poll until the transfer leaves the busy state, reporting progress on every pass.
+    for (;;)
+    {
+        uint32_t bytes_done;
+        g_sdio_error = rp2040_sdio_tx_poll(&bytes_done);
+
+        if (progress_cb)
+        {
+            progress_cb(m_stream_count_start + bytes_done);
+        }
+
+        if (g_sdio_error != SDIO_BUSY)
+        {
+            break;
+        }
+    }
+
+    if (g_sdio_error == SDIO_OK)
+    {
+        return true;
+    }
+
+    azlog("SdioCard::writeSector(", sector, ") failed: ", (int)g_sdio_error);
+    return false;
+}
+
+// Write n consecutive 512-byte sectors starting at sector.
+//
+// A source buffer that is not 4-byte aligned is handled one sector at a time
+// through writeSector(). Otherwise a single multi-block write is issued,
+// progress is reported through the stream callback (if one applies), and the
+// transfer is always terminated with a blocking stopTransmission().
+// Returns true only when both the transfer and the stop succeeded.
+bool SdioCard::writeSectors(uint32_t sector, const uint8_t* src, size_t n)
+{
+    const bool misaligned = ((uint32_t)src & 3) != 0;
+    if (misaligned)
+    {
+        // Unaligned write, execute sector-by-sector
+        size_t idx = 0;
+        while (idx < n)
+        {
+            if (!writeSector(sector + idx, src + 512 * idx))
+            {
+                return false;
+            }
+            idx++;
+        }
+        return true;
+    }
+
+    sd_callback_t progress_cb = get_stream_callback(src, n * 512, "writeSectors", sector);
+
+    uint32_t reply;
+    if (!checkReturnOk(rp2040_sdio_command_R1(16, 512, &reply))) // SET_BLOCKLEN
+    {
+        return false;
+    }
+    if (!checkReturnOk(rp2040_sdio_command_R1(CMD55, g_sdio_rca, &reply))) // APP_CMD
+    {
+        return false;
+    }
+    if (!checkReturnOk(rp2040_sdio_command_R1(ACMD23, n, &reply))) // SET_WR_CLK_ERASE_COUNT
+    {
+        return false;
+    }
+    if (!checkReturnOk(rp2040_sdio_command_R1(CMD25, sector, &reply))) // WRITE_MULTIPLE_BLOCK
+    {
+        return false;
+    }
+    if (!checkReturnOk(rp2040_sdio_tx_start(src, n))) // Start transmission
+    {
+        return false;
+    }
+
+    // Poll until the transfer leaves the busy state, reporting progress on every pass.
+    for (;;)
+    {
+        uint32_t bytes_done;
+        g_sdio_error = rp2040_sdio_tx_poll(&bytes_done);
+
+        if (progress_cb)
+        {
+            progress_cb(m_stream_count_start + bytes_done);
+        }
+
+        if (g_sdio_error != SDIO_BUSY)
+        {
+            break;
+        }
+    }
+
+    // Capture the outcome now: stopTransmission() overwrites g_sdio_error.
+    const bool transfer_ok = (g_sdio_error == SDIO_OK);
+    if (!transfer_ok)
+    {
+        azlog("SdioCard::writeSectors(", sector, ",...,", (int)n, ") failed: ", (int)g_sdio_error);
+    }
+
+    const bool stopped = stopTransmission(true);
+    return transfer_ok && stopped;
+}
+
+// Read one 512-byte sector from the card into dst.
+//
+// When dst is not 4-byte aligned the data is received into the aligned bounce
+// buffer g_sdio_dma_buf and copied to dst after polling finishes (the copy
+// happens regardless of the transfer result). Progress is reported through the
+// stream callback when one applies to the receive buffer.
+// Returns true when the transfer ended with SDIO_OK; failures are logged.
+bool SdioCard::readSector(uint32_t sector, uint8_t* dst)
+{
+    const bool use_bounce = ((uint32_t)dst & 3) != 0;
+    uint8_t *rx_buf = use_bounce ? (uint8_t*)g_sdio_dma_buf : dst;
+
+    sd_callback_t progress_cb = get_stream_callback(rx_buf, 512, "readSector", sector);
+
+    uint32_t reply;
+    if (!checkReturnOk(rp2040_sdio_command_R1(16, 512, &reply))) // SET_BLOCKLEN
+    {
+        return false;
+    }
+    // Reception is armed before the read command is sent.
+    if (!checkReturnOk(rp2040_sdio_rx_start(rx_buf, 1))) // Prepare for reception
+    {
+        return false;
+    }
+    if (!checkReturnOk(rp2040_sdio_command_R1(CMD17, sector, &reply))) // READ_SINGLE_BLOCK
+    {
+        return false;
+    }
+
+    // Poll until the transfer leaves the busy state, reporting progress on every pass.
+    for (;;)
+    {
+        uint32_t bytes_done;
+        g_sdio_error = rp2040_sdio_rx_poll(&bytes_done);
+
+        if (progress_cb)
+        {
+            progress_cb(m_stream_count_start + bytes_done);
+        }
+
+        if (g_sdio_error != SDIO_BUSY)
+        {
+            break;
+        }
+    }
+
+    const bool ok = (g_sdio_error == SDIO_OK);
+    if (!ok)
+    {
+        azlog("SdioCard::readSector(", sector, ") failed: ", (int)g_sdio_error);
+    }
+
+    if (use_bounce)
+    {
+        // Hand the received sector over to the caller's unaligned buffer.
+        memcpy(dst, g_sdio_dma_buf, sizeof(g_sdio_dma_buf));
+    }
+
+    return ok;
+}
+
+// Read n consecutive 512-byte sectors starting at sector into dst.
+//
+// Reads into a buffer that is not 4-byte aligned, and reads that reach
+// g_sdio_sector_count, are executed one sector at a time through readSector().
+// Otherwise a single multi-block read is issued, progress is reported through
+// the stream callback (if one applies), and the transfer is always terminated
+// with a blocking stopTransmission().
+// Returns true only when both the transfer and the stop succeeded.
+bool SdioCard::readSectors(uint32_t sector, uint8_t* dst, size_t n)
+{
+    const bool misaligned = ((uint32_t)dst & 3) != 0;
+    const bool reaches_end = sector + n >= g_sdio_sector_count;
+    if (misaligned || reaches_end)
+    {
+        // Unaligned read or end-of-drive read, execute sector-by-sector
+        size_t idx = 0;
+        while (idx < n)
+        {
+            if (!readSector(sector + idx, dst + 512 * idx))
+            {
+                return false;
+            }
+            idx++;
+        }
+        return true;
+    }
+
+    sd_callback_t progress_cb = get_stream_callback(dst, n * 512, "readSectors", sector);
+
+    uint32_t reply;
+    if (!checkReturnOk(rp2040_sdio_command_R1(16, 512, &reply))) // SET_BLOCKLEN
+    {
+        return false;
+    }
+    // Reception is armed before the read command is sent.
+    if (!checkReturnOk(rp2040_sdio_rx_start(dst, n))) // Prepare for reception
+    {
+        return false;
+    }
+    if (!checkReturnOk(rp2040_sdio_command_R1(CMD18, sector, &reply))) // READ_MULTIPLE_BLOCK
+    {
+        return false;
+    }
+
+    // Poll until the transfer leaves the busy state, reporting progress on every pass.
+    for (;;)
+    {
+        uint32_t bytes_done;
+        g_sdio_error = rp2040_sdio_rx_poll(&bytes_done);
+
+        if (progress_cb)
+        {
+            progress_cb(m_stream_count_start + bytes_done);
+        }
+
+        if (g_sdio_error != SDIO_BUSY)
+        {
+            break;
+        }
+    }
+
+    // Capture the outcome now: stopTransmission() overwrites g_sdio_error.
+    const bool transfer_ok = (g_sdio_error == SDIO_OK);
+    if (!transfer_ok)
+    {
+        azlog("SdioCard::readSectors(", sector, ",...,", (int)n, ") failed: ", (int)g_sdio_error);
+    }
+
+    const bool stopped = stopTransmission(true);
+    return transfer_ok && stopped;
+}
+
+// Chip-select hooks with empty bodies.
+// These functions are not used for SDIO mode but are needed to avoid build error.
+void sdCsInit(SdCsPin_t pin) {}
+void sdCsWrite(SdCsPin_t pin, bool level) {}
+
+// SDIO configuration for main program
+// Constructed with the DMA_SDIO option; SdioCard::begin() in this file does not read it.
+SdioConfig g_sd_sdio_config(DMA_SDIO);
+
+#endif

+ 82 - 0
lib/ZuluSCSI_platform_BS2/sd_card_spi.cpp

@@ -0,0 +1,82 @@
+// Driver and interface for accessing SD card in SPI mode
+
+#include "ZuluSCSI_platform.h"
+#include "ZuluSCSI_log.h"
+#include <hardware/spi.h>
+#include <SdFat.h>
+
+#ifndef SD_USE_SDIO
+
+// SdFat SPI driver implemented on top of the RP2040 hardware SPI peripheral SD_SPI.
+class RP2040SPIDriver : public SdSpiBaseClass
+{
+public:
+    // Nothing to do here; the peripheral is configured in activate().
+    void begin(SdSpiConfig config) {
+    }
+
+    // Initialize the peripheral at the stored clock rate and select
+    // 8-bit frames, CPOL 0 / CPHA 0, MSB first.
+    void activate() {
+        _spi_init(SD_SPI, m_sckfreq);
+        spi_set_format(SD_SPI, 8, SPI_CPOL_0, SPI_CPHA_0, SPI_MSB_FIRST);
+    }
+
+    // Nothing to release.
+    void deactivate() {
+    }
+
+    // Block until the TX-FIFO-empty flag is set and the busy flag is cleared.
+    void wait_idle() {
+        while ((spi_get_hw(SD_SPI)->sr & SPI_SSPSR_TFE_BITS) == 0) { }
+        while ((spi_get_hw(SD_SPI)->sr & SPI_SSPSR_BSY_BITS) != 0) { }
+    }
+
+    // Receive one byte while clocking out 0xFF.
+    uint8_t receive() {
+        uint8_t fill = 0xFF;
+        uint8_t value;
+        spi_write_read_blocking(SD_SPI, &fill, &value, 1);
+        return value;
+    }
+
+    // Send one byte and wait until the bus is idle again.
+    void send(uint8_t data) {
+        spi_write_blocking(SD_SPI, &data, 1);
+        wait_idle();
+    }
+
+    // Receive count bytes into buf while clocking out 0xFF; always returns 0.
+    uint8_t receive(uint8_t* buf, size_t count)
+    {
+        spi_read_blocking(SD_SPI, 0xFF, buf, count);
+        return 0;
+    }
+
+    // Send count bytes from buf.
+    void send(const uint8_t* buf, size_t count) {
+        spi_write_blocking(SD_SPI, buf, count);
+    }
+
+    // Remember the clock rate to apply on the next activate().
+    void setSckSpeed(uint32_t maxSck) {
+        m_sckfreq = maxSck;
+    }
+
+private:
+    // Clock rate handed to _spi_init(); set through setSckSpeed().
+    uint32_t m_sckfreq;
+};
+
+// Chip-select initialization hook; intentionally empty in this file.
+void sdCsInit(SdCsPin_t pin)
+{
+}
+
+// Drive the SD card chip-select line: level == true sets the SD_SPI_CS GPIO,
+// level == false clears it. The pin argument is ignored.
+void sdCsWrite(SdCsPin_t pin, bool level)
+{
+    if (!level)
+    {
+        sio_hw->gpio_clr = (1 << SD_SPI_CS);
+    }
+    else
+    {
+        sio_hw->gpio_set = (1 << SD_SPI_CS);
+    }
+}
+
+// Driver instance and SdFat SPI configuration that references it.
+// NOTE(review): the first argument (0) is presumably the chip-select pin, which
+// sdCsWrite() ignores in favour of SD_SPI_CS - confirm against SdSpiConfig.
+RP2040SPIDriver g_sd_spi_port;
+SdSpiConfig g_sd_spi_config(0, DEDICATED_SPI, SD_SCK_MHZ(25), &g_sd_spi_port);
+
+// SPI-mode counterpart of the SDIO progress-callback hook: empty body,
+// both arguments are ignored.
+void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer)
+{
+}
+
+#endif

+ 25 - 1
platformio.ini

@@ -1,7 +1,7 @@
 ; PlatformIO Project Configuration File https://docs.platformio.org/page/projectconf.html
 ; PlatformIO Project Configuration File https://docs.platformio.org/page/projectconf.html
 
 
 [platformio]
 [platformio]
-default_envs = ZuluSCSIv1_0, ZuluSCSIv1_0_mini, ZuluSCSIv1_1, ZuluSCSI_RP2040
+default_envs = ZuluSCSIv1_0, ZuluSCSIv1_0_mini, ZuluSCSIv1_1, ZuluSCSI_RP2040, ZuluSCSI_BS2
 
 
 ; Example platform to serve as a base for porting efforts
 ; Example platform to serve as a base for porting efforts
 [env:template]
 [env:template]
@@ -96,3 +96,27 @@ build_flags =
     -DHAS_SDIO_CLASS
     -DHAS_SDIO_CLASS
     -DUSE_ARDUINO=1
     -DUSE_ARDUINO=1
     -DZULUSCSI_V2_0
     -DZULUSCSI_V2_0
+
+
+; ZuluSCSI firmware for BlueSCSI v2 hardware, based on the Raspberry Pi foundation RP2040 microcontroller
+[env:ZuluSCSI_BS2]
+platform = raspberrypi
+framework = arduino
+board = ZuluSCSI_RP2040
+extra_scripts = src/build_bootloader.py
+board_build.ldscript = lib/ZuluSCSI_platform_RP2040/rp2040.ld
+ldscript_bootloader = lib/ZuluSCSI_platform_RP2040/rp2040_btldr.ld
+lib_deps =
+    SdFat=https://github.com/rabbitholecomputing/SdFat#2.2.0-gpt
+    minIni
+    ZuluSCSI_platform_BS2
+    SCSI2SD
+build_flags =
+    -O2 -Isrc -ggdb -g3
+    -Wall -Wno-sign-compare -Wno-ignored-qualifiers
+    -DSPI_DRIVER_SELECT=3
+    -DSD_CHIP_SELECT_MODE=2
+    -DENABLE_DEDICATED_SPI=1
+    -DHAS_SDIO_CLASS
+    -DUSE_ARDUINO=1
+    -DZULUSCSI_V3_0