浏览代码

Merge pull request #60 from ZuluSCSI/rp2040_performance

RP2040 performance & bootloader improvements
Alex Perez 3 年之前
父节点
当前提交
42fef806ef

+ 5 - 1
README.md

@@ -59,7 +59,11 @@ When successful, the bootloader removes the update file and continues to main fi
 On failure, `Zuluerr.txt` is written on the SD card.
 On failure, `Zuluerr.txt` is written on the SD card.
 
 
 Alternatively, the board can be programmed using USB connection in DFU mode by setting DIP switch 4.
 Alternatively, the board can be programmed using USB connection in DFU mode by setting DIP switch 4.
-The necessary programmer utility for Windows can be downloaded from [GD32 website](http://www.gd32mcu.com/en/download?kw=dfu&lan=en). On Linux and MacOS, the standard 'dfu-util' can be used. It can be installed via your package manager under Linux. On MacOS, it is available through MacPorts and Brew as a package
+The necessary programmer utility for Windows can be downloaded from [GD32 website](http://www.gd32mcu.com/en/download?kw=dfu&lan=en). On Linux and MacOS, the standard 'dfu-util' can be used. It can be installed via your package manager under Linux. On MacOS, it is available through MacPorts and Brew as a package.
+
+    dfu-util --alt 0 --dfuse-address 0x08000000 --download ZuluSCSIv1_1_XXXXXX.bin
+
+For RP2040-based boards, USB programming uses a `.uf2` file, which is copied to the USB drive that shows up when the board is in bootloader mode.
 
 
 DIP switches
 DIP switches
 ------------
 ------------

+ 114 - 0
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform.cpp

@@ -11,6 +11,10 @@
 
 
 extern "C" {
 extern "C" {
 
 
+// As of 2022-09-13, the platformio RP2040 core is missing the __cplusplus guard in flash.h.
+// For that reason the include has to be inside this extern "C" block.
+#include <hardware/flash.h>
+
 const char *g_azplatform_name = PLATFORM_NAME;
 const char *g_azplatform_name = PLATFORM_NAME;
 
 
 void mbed_error_hook(const mbed_error_ctx * error_context);
 void mbed_error_hook(const mbed_error_ctx * error_context);
@@ -229,12 +233,122 @@ void azplatform_log(const char *s)
     uart_puts(uart0, s);
     uart_puts(uart0, s);
 }
 }
 
 
+static int g_watchdog_timeout; // Countdown reloaded with WATCHDOG_CRASH_TIMEOUT by azplatform_reset_watchdog()
+static bool g_watchdog_initialized; // True once azplatform_reset_watchdog() has claimed and armed alarm 3
+
+static void watchdog_callback(unsigned alarm_num) // Alarm callback: counts the watchdog down and escalates; alarm_num is not used
+{
+    g_watchdog_timeout -= 1000; // Same step as the 1000 ms re-arm interval at the end of this function
+
+    if (g_watchdog_timeout <= WATCHDOG_CRASH_TIMEOUT - WATCHDOG_BUS_RESET_TIMEOUT) // At least WATCHDOG_BUS_RESET_TIMEOUT has been counted off since the last reload
+    {
+        if (!scsiDev.resetFlag) // Log and raise the reset flag only if it is not already set
+        {
+            azlog("WATCHDOG TIMEOUT, attempting bus reset");
+            scsiDev.resetFlag = 1;
+        }
+
+        if (g_watchdog_timeout <= 0) // Whole WATCHDOG_CRASH_TIMEOUT budget has been used up
+        {
+            assert(false); // NOTE(review): presumably meant to reach the platform crash handling - confirm assert is not compiled out
+        }
+    }
+
+    hardware_alarm_set_target(3, delayed_by_ms(get_absolute_time(), 1000)); // Schedule the next tick on alarm 3
+}
+
 // This function can be used to periodically reset watchdog timer for crash handling.
 // This function can be used to periodically reset watchdog timer for crash handling.
 // It can also be left empty if the platform does not use a watchdog timer.
 // It can also be left empty if the platform does not use a watchdog timer.
 void azplatform_reset_watchdog()
 void azplatform_reset_watchdog()
 {
 {
+    g_watchdog_timeout = WATCHDOG_CRASH_TIMEOUT;
+
+    if (!g_watchdog_initialized)
+    {
+        hardware_alarm_claim(3);
+        hardware_alarm_set_callback(3, &watchdog_callback);
+        hardware_alarm_set_target(3, delayed_by_ms(get_absolute_time(), 1000));
+        g_watchdog_initialized = true;
+    }
+}
+
+/*****************************************/
+/* Flash reprogramming from bootloader   */
+/*****************************************/
+
+#ifdef AZPLATFORM_BOOTLOADER_SIZE
+
+extern uint32_t __real_vectors_start;
+extern uint32_t __StackTop;
+static volatile void *g_bootloader_exit_req;
+
+bool azplatform_rewrite_flash_page(uint32_t offset, uint8_t buffer[AZPLATFORM_FLASH_PAGE_SIZE]) // Erases and programs one flash page at 'offset' from 'buffer', then reads it back; returns true only if every word matches
+{
+    if (offset == AZPLATFORM_BOOTLOADER_SIZE) // Extra sanity check only for the page written directly after the bootloader area
+    {
+        if (buffer[3] != 0x20 || buffer[7] != 0x10) // Top bytes of the first two words; presumably initial stack pointer in RAM and reset vector in flash - confirm
+        {
+            azlog("Invalid firmware file, starts with: ", bytearray(buffer, 16));
+            return false; // Nothing has been erased or written at this point
+        }
+    }
+
+    azdbg("Writing flash at offset ", offset, " data ", bytearray(buffer, 4));
+    assert(offset % AZPLATFORM_FLASH_PAGE_SIZE == 0); // Offset must be page aligned
+    assert(offset >= AZPLATFORM_BOOTLOADER_SIZE); // Never touch the bootloader's own area
+
+    __disable_irq(); // Interrupts stay off for the whole erase + program sequence
+    flash_range_erase(offset, AZPLATFORM_FLASH_PAGE_SIZE);
+    flash_range_program(offset, buffer, AZPLATFORM_FLASH_PAGE_SIZE);
+    __enable_irq();
+
+    uint32_t *buf32 = (uint32_t*)buffer; // Word-wise view of the source data; NOTE(review): relies on the caller passing a word-aligned buffer
+    uint32_t num_words = AZPLATFORM_FLASH_PAGE_SIZE / 4;
+    for (int i = 0; i < num_words; i++) // Verify pass: compare each programmed word against the source buffer
+    {
+        uint32_t expected = buf32[i];
+        uint32_t actual = *(volatile uint32_t*)(XIP_NOCACHE_BASE + offset + i * 4); // Read back via the XIP_NOCACHE_BASE alias; presumably bypasses the XIP cache - confirm
+        if (actual != expected)
+        {
+            azlog("Flash verify failed at offset ", offset + i * 4, " got ", actual, " expected ", expected);
+            return false; // Stop at the first mismatching word
+        }
+    }
+    return true;
+}
+
+void azplatform_boot_to_main_firmware() // Leaves the bootloader: sets a marker for btldr_reset_handler() and requests a reset; does not return
+{
+    // To ensure that the system state is reset properly, we perform
+    // a SYSRESETREQ and jump straight from the reset vector to main application.
+    g_bootloader_exit_req = &g_bootloader_exit_req; // Marker value is the variable's own address; btldr_reset_handler() compares against it
+    SCB->AIRCR = 0x05FA0004; // NOTE(review): presumably write key 0x05FA plus the SYSRESETREQ bit - confirm against the ARM documentation
+    while(1); // Spin here; the code after the reset continues in btldr_reset_handler()
+}
+
+void btldr_reset_handler() // Reset entry installed through btldr_vectors: picks a vector table and jumps to its reset handler
+{
+    uint32_t* application_base = &__real_vectors_start; // Default: the vector table at the __real_vectors_start symbol
+    if (g_bootloader_exit_req == &g_bootloader_exit_req) // Marker written by azplatform_boot_to_main_firmware(); NOTE(review): assumes RAM content survives the reset - confirm
+    {
+        // Boot to main application
+        application_base = (uint32_t*)(XIP_BASE + AZPLATFORM_BOOTLOADER_SIZE); // Main firmware starts right after the bootloader area in flash
+    }
+
+    SCB->VTOR = (uint32_t)application_base; // Make the selected table the active vector table
+    __asm__(
+        "msr msp, %0\n\t" // Load the main stack pointer from word 0 of the table
+        "bx %1" : : "r" (application_base[0]), // Branch to the address stored in word 1 of the table
+                    "r" (application_base[1]) : "memory");
 }
 }
 
 
+// Replace the reset handler when building the bootloader
+// The rp2040_btldr.ld places real vector table at an offset.
+__attribute__((section(".btldr_vectors"))) // Section name matches the KEEP (*(.btldr_vectors)) entry in rp2040_btldr.ld
+const void * btldr_vectors[2] = {&__StackTop, (void*)&btldr_reset_handler}; // Entry 0: stack top symbol, entry 1: bootloader reset handler
+
+#endif
+
 /**********************************************/
 /**********************************************/
 /* Mapping from data bytes to GPIO BOP values */
 /* Mapping from data bytes to GPIO BOP values */
 /**********************************************/
 /**********************************************/

+ 13 - 0
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform.h

@@ -15,6 +15,9 @@ extern const char *g_azplatform_name;
 #define PLATFORM_NAME "ZuluSCSI RP2040"
 #define PLATFORM_NAME "ZuluSCSI RP2040"
 #define PLATFORM_REVISION "2.0"
 #define PLATFORM_REVISION "2.0"
 #define PLATFORM_MAX_SCSI_SPEED S2S_CFG_SPEED_SYNC_10
 #define PLATFORM_MAX_SCSI_SPEED S2S_CFG_SPEED_SYNC_10
+#define PLATFORM_OPTIMAL_MIN_SD_WRITE_SIZE 4096
+#define PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE 65536
+#define PLATFORM_OPTIMAL_LAST_SD_WRITE_SIZE 8192
 #define SD_USE_SDIO 1
 #define SD_USE_SDIO 1
 
 
 // NOTE: The driver supports synchronous speeds higher than 10MB/s, but this
 // NOTE: The driver supports synchronous speeds higher than 10MB/s, but this
@@ -24,6 +27,7 @@ extern const char *g_azplatform_name;
 // Debug logging function, can be used to print to e.g. serial port.
 // Debug logging function, can be used to print to e.g. serial port.
 // May get called from interrupt handlers.
 // May get called from interrupt handlers.
 void azplatform_log(const char *s);
 void azplatform_log(const char *s);
+void azplatform_emergency_log_save();
 
 
 // Timing and delay functions.
 // Timing and delay functions.
 // Arduino platform already provides these
 // Arduino platform already provides these
@@ -56,6 +60,15 @@ void azplatform_reset_watchdog();
 typedef void (*sd_callback_t)(uint32_t bytes_complete);
 typedef void (*sd_callback_t)(uint32_t bytes_complete);
 void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer);
 void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer);
 
 
+// Reprogram firmware in main program area.
+#ifndef RP2040_DISABLE_BOOTLOADER
+#define AZPLATFORM_BOOTLOADER_SIZE (128 * 1024)
+#define AZPLATFORM_FLASH_TOTAL_SIZE (1024 * 1024)
+#define AZPLATFORM_FLASH_PAGE_SIZE 4096
+bool azplatform_rewrite_flash_page(uint32_t offset, uint8_t buffer[AZPLATFORM_FLASH_PAGE_SIZE]);
+void azplatform_boot_to_main_firmware();
+#endif
+
 // Below are GPIO access definitions that are used from scsiPhy.cpp.
 // Below are GPIO access definitions that are used from scsiPhy.cpp.
 
 
 // Write a single SCSI pin.
 // Write a single SCSI pin.

+ 33 - 2
lib/ZuluSCSI_platform_RP2040/rp2040.ld

@@ -18,8 +18,18 @@ SECTIONS
     } > FLASH
     } > FLASH
     ASSERT(__boot2_end__ - __boot2_start__ == 256,
     ASSERT(__boot2_end__ - __boot2_start__ == 256,
         "ERROR: Pico second stage bootloader must be 256 bytes in size")
         "ERROR: Pico second stage bootloader must be 256 bytes in size")
+
+    /* If ZuluSCSI SD card bootloader is included, it goes in first 128 kB */
+    .text.bootloader : ALIGN(16) SUBALIGN(16)
+    {
+        KEEP(*(.text.btldr*))
+        . = ALIGN(131072);
+        CHECK_BOOTLOADER_SIZE = 1 / (. <= 131072);
+    } > FLASH
+
     .text : {
     .text : {
         __logical_binary_start = .;
         __logical_binary_start = .;
+        __real_vectors_start = .;
         KEEP (*(.vectors))
         KEEP (*(.vectors))
         KEEP (*(.binary_info_header))
         KEEP (*(.binary_info_header))
         __binary_info_header_end = .;
         __binary_info_header_end = .;
@@ -38,8 +48,23 @@ SECTIONS
         *(.dtors)
         *(.dtors)
         *(.eh_frame*)
         *(.eh_frame*)
         . = ALIGN(4);
         . = ALIGN(4);
-        *(.text)
-        *(.text*)
+
+        /* Put only non-timecritical code in flash
+         * This includes e.g. floating point math routines.
+         */
+        *libm*:(.text .text*)
+        *libc*:(.text .text*)
+        *libgcc*:*df*(.text .text*)
+        *USB*(.text .text*)
+        *SPI*(.text .text*)
+        *Spi*(.text .text*)
+        *spi*(.text .text*)
+        *stdc*:(.text .text*)
+        *supc*:(.text .text*)
+        *nosys*:(.text .text*)
+        *libc*:*printf*(.text .text*)
+        *libc*:*toa*(.text .text*)
+        *libminIni.a:(.text .text*)
     } > FLASH
     } > FLASH
     .rodata : {
     .rodata : {
         . = ALIGN(4);
         . = ALIGN(4);
@@ -74,7 +99,13 @@ SECTIONS
     .data : {
     .data : {
         __data_start__ = .;
         __data_start__ = .;
         *(vtable)
         *(vtable)
+
+        /* Time critical code will go here to avoid external flash latency */
         *(.time_critical*)
         *(.time_critical*)
+        . = ALIGN(4);
+        *(.text)
+        *(.text*)
+
         . = ALIGN(4);
         . = ALIGN(4);
         *(.data*)
         *(.data*)
         . = ALIGN(4);
         . = ALIGN(4);

+ 170 - 0
lib/ZuluSCSI_platform_RP2040/rp2040_btldr.ld

@@ -0,0 +1,170 @@
+/*
+ *
+ * Customized linker script for building bootloader
+ *
+ */
+
+ MEMORY
+{
+    /* The bootloader is linked to begin at 0x12000100.
+     * First 256 bytes are reserved for RP2040 second stage bootloader,
+     * which comes as part of the main firmware.elf and is never overwritten.
+     * The bootloader also runs without XIP cache because that seemed to cause
+     * problems when writing flash.
+     */
+    FLASH(rx) : ORIGIN = 0x12000100, LENGTH = 128k-256
+    RAM(rwx) : ORIGIN = 0x20000000, LENGTH = 240k  /* Leave space for pico-debug */
+    SCRATCH_X(rwx) : ORIGIN = 0x20040000, LENGTH = 4k
+    SCRATCH_Y(rwx) : ORIGIN = 0x20041000, LENGTH = 4k
+}
+ENTRY(_entry_point)
+SECTIONS
+{
+    .flash_begin : {
+        __flash_binary_start = .;
+    } > FLASH
+
+    .text : {
+        __logical_binary_start = .;
+        KEEP (*(.btldr_vectors))
+        KEEP (*(.binary_info_header))
+        __binary_info_header_end = .;
+        . = ALIGN(256);
+        __real_vectors_start = .;
+        KEEP (*(.vectors))
+        KEEP (*(.reset))
+        KEEP (*(.init))
+        *(.fini)
+        *crtbegin.o(.ctors)
+        *crtbegin?.o(.ctors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+        *(SORT(.ctors.*))
+        *(.ctors)
+        *crtbegin.o(.dtors)
+        *crtbegin?.o(.dtors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+        *(SORT(.dtors.*))
+        *(.dtors)
+        *(.eh_frame*)
+        *(.text .text*)
+        . = ALIGN(4);
+    } > FLASH
+    .rodata : {
+        . = ALIGN(4);
+        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.flashdata*)))
+        *(.rodata)
+        *(.rodata*)
+        . = ALIGN(4);
+    } > FLASH
+    .ARM.extab :
+    {
+        *(.ARM.extab* .gnu.linkonce.armextab.*)
+    } > FLASH
+    __exidx_start = .;
+    .ARM.exidx :
+    {
+        *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+    } > FLASH
+    __exidx_end = .;
+    . = ALIGN(4);
+    __binary_info_start = .;
+    .binary_info :
+    {
+        KEEP(*(.binary_info.keep.*))
+        *(.binary_info.*)
+    } > FLASH
+    __binary_info_end = .;
+    . = ALIGN(4);
+    __etext = .;
+   .ram_vector_table (COPY): {
+        *(.ram_vector_table)
+    } > RAM
+    .data : {
+        __data_start__ = .;
+        *(vtable)
+
+        /* Time critical code will go here to avoid external flash latency */
+        *(.time_critical*)
+
+        . = ALIGN(4);
+        *(.data*)
+        . = ALIGN(4);
+        *(.after_data.*)
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__mutex_array_start = .);
+        KEEP(*(SORT(.mutex_array.*)))
+        KEEP(*(.mutex_array))
+        PROVIDE_HIDDEN (__mutex_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__preinit_array_start = .);
+        KEEP(*(SORT(.preinit_array.*)))
+        KEEP(*(.preinit_array))
+        PROVIDE_HIDDEN (__preinit_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__init_array_start = .);
+        KEEP(*(SORT(.init_array.*)))
+        KEEP(*(.init_array))
+        PROVIDE_HIDDEN (__init_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__fini_array_start = .);
+        *(SORT(.fini_array.*))
+        *(.fini_array)
+        PROVIDE_HIDDEN (__fini_array_end = .);
+        *(.jcr)
+        . = ALIGN(4);
+        __data_end__ = .;
+    } > RAM AT> FLASH
+    .uninitialized_data (COPY): {
+        . = ALIGN(4);
+        *(.uninitialized_data*)
+    } > RAM
+    .scratch_x : {
+        __scratch_x_start__ = .;
+        *(.scratch_x.*)
+        . = ALIGN(4);
+        __scratch_x_end__ = .;
+    } > SCRATCH_X AT > FLASH
+    __scratch_x_source__ = LOADADDR(.scratch_x);
+    .scratch_y : {
+        __scratch_y_start__ = .;
+        *(.scratch_y.*)
+        . = ALIGN(4);
+        __scratch_y_end__ = .;
+    } > SCRATCH_Y AT > FLASH
+    __scratch_y_source__ = LOADADDR(.scratch_y);
+    .bss : {
+        . = ALIGN(4);
+        __bss_start__ = .;
+        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
+        *(COMMON)
+        . = ALIGN(4);
+        __bss_end__ = .;
+    } > RAM
+    .heap (COPY):
+    {
+        __end__ = .;
+        PROVIDE(end = .);
+        *(.heap*)
+        . = ORIGIN(RAM) + LENGTH(RAM) - 0x400;
+        __HeapLimit = .;
+    } > RAM
+    .stack1_dummy (COPY):
+    {
+        *(.stack1*)
+    } > SCRATCH_X
+    .stack_dummy (COPY):
+    {
+        *(.stack*)
+    } > RAM
+    .flash_end : {
+        __flash_binary_end = .;
+    } > FLASH
+    __StackTop = ORIGIN(RAM) + LENGTH(RAM);
+    __StackLimit = __StackTop - 0x400;
+    __StackOneTop = ORIGIN(SCRATCH_X) + LENGTH(SCRATCH_X);
+    __StackOneBottom = __StackOneTop - SIZEOF(.stack1_dummy);
+    __StackBottom = __StackTop - SIZEOF(.stack_dummy);
+    PROVIDE(__stack = __StackTop);
+    ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed")
+    ASSERT( __binary_info_header_end - __logical_binary_start <= 256, "Binary info must be in first 256 bytes of the binary")
+}

+ 39 - 29
lib/ZuluSCSI_platform_RP2040/rp2040_sdio.cpp

@@ -94,31 +94,35 @@ static const uint8_t crc7_table[256] = {
 // When the SDIO bus operates in 4-bit mode, the CRC16 algorithm
 // When the SDIO bus operates in 4-bit mode, the CRC16 algorithm
 // is applied to each line separately and generates total of
 // is applied to each line separately and generates total of
 // 4 x 16 = 64 bits of checksum.
 // 4 x 16 = 64 bits of checksum.
+__attribute__((optimize("O3")))
 uint64_t sdio_crc16_4bit_checksum(uint32_t *data, uint32_t num_words)
 uint64_t sdio_crc16_4bit_checksum(uint32_t *data, uint32_t num_words)
 {
 {
     uint64_t crc = 0;
     uint64_t crc = 0;
     uint32_t *end = data + num_words;
     uint32_t *end = data + num_words;
     while (data < end)
     while (data < end)
     {
     {
-        // Each 32-bit word contains 8 bits per line.
-        // Reverse the bytes because SDIO protocol is big-endian.
-        uint32_t data_in = __builtin_bswap32(*data++);
+        for (int unroll = 0; unroll < 4; unroll++)
+        {
+            // Each 32-bit word contains 8 bits per line.
+            // Reverse the bytes because SDIO protocol is big-endian.
+            uint32_t data_in = __builtin_bswap32(*data++);
 
 
-        // Shift out 8 bits for each line
-        uint32_t data_out = crc >> 32;
-        crc <<= 32;
+            // Shift out 8 bits for each line
+            uint32_t data_out = crc >> 32;
+            crc <<= 32;
 
 
-        // XOR outgoing data to itself with 4 bit delay
-        data_out ^= (data_out >> 16);
+            // XOR outgoing data to itself with 4 bit delay
+            data_out ^= (data_out >> 16);
 
 
-        // XOR incoming data to outgoing data with 4 bit delay
-        data_out ^= (data_in >> 16);
+            // XOR incoming data to outgoing data with 4 bit delay
+            data_out ^= (data_in >> 16);
 
 
-        // XOR outgoing and incoming data to accumulator at each tap
-        uint64_t xorred = data_out ^ data_in;
-        crc ^= xorred;
-        crc ^= xorred << (5 * 4);
-        crc ^= xorred << (12 * 4);
+            // XOR outgoing and incoming data to accumulator at each tap
+            uint64_t xorred = data_out ^ data_in;
+            crc ^= xorred;
+            crc ^= xorred << (5 * 4);
+            crc ^= xorred << (12 * 4);
+        }
     }
     }
 
 
     return crc;
     return crc;
@@ -434,19 +438,29 @@ static void sdio_verify_rx_checksums(uint32_t maxcount)
 
 
 sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
 sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
 {
 {
-    // Check how many DMA control blocks have been consumed
-    uint32_t dma_ctrl_block_count = (dma_hw->ch[SDIO_DMA_CHB].read_addr - (uint32_t)&g_sdio.dma_blocks);
-    dma_ctrl_block_count /= sizeof(g_sdio.dma_blocks[0]);
-
-    // Compute how many complete 512 byte SDIO blocks have been transferred
-    // When transfer ends, dma_ctrl_block_count == g_sdio.total_blocks * 2 + 1
-    g_sdio.blocks_done = (dma_ctrl_block_count - 1) / 2;
-
-    // Is it all done?
+    // Was everything done when the previous rx_poll() finished?
     if (g_sdio.blocks_done >= g_sdio.total_blocks)
     if (g_sdio.blocks_done >= g_sdio.total_blocks)
     {
     {
         g_sdio.transfer_state = SDIO_IDLE;
         g_sdio.transfer_state = SDIO_IDLE;
     }
     }
+    else
+    {
+        // Use the idle time to calculate checksums
+        sdio_verify_rx_checksums(4);
+
+        // Check how many DMA control blocks have been consumed
+        uint32_t dma_ctrl_block_count = (dma_hw->ch[SDIO_DMA_CHB].read_addr - (uint32_t)&g_sdio.dma_blocks);
+        dma_ctrl_block_count /= sizeof(g_sdio.dma_blocks[0]);
+
+        // Compute how many complete 512 byte SDIO blocks have been transferred
+        // When transfer ends, dma_ctrl_block_count == g_sdio.total_blocks * 2 + 1
+        g_sdio.blocks_done = (dma_ctrl_block_count - 1) / 2;
+
+        // NOTE: When all blocks are done, rx_poll() still returns SDIO_BUSY once.
+        // This provides a chance to start the SCSI transfer before the last checksums
+        // are computed. Any checksum failures can be indicated in SCSI status after
+        // the data transfer has finished.
+    }
 
 
     if (bytes_complete)
     if (bytes_complete)
     {
     {
@@ -455,6 +469,7 @@ sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
 
 
     if (g_sdio.transfer_state == SDIO_IDLE)
     if (g_sdio.transfer_state == SDIO_IDLE)
     {
     {
+        // Verify all remaining checksums.
         sdio_verify_rx_checksums(g_sdio.total_blocks);
         sdio_verify_rx_checksums(g_sdio.total_blocks);
 
 
         if (g_sdio.checksum_errors == 0)
         if (g_sdio.checksum_errors == 0)
@@ -472,11 +487,6 @@ sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
         rp2040_sdio_stop();
         rp2040_sdio_stop();
         return SDIO_ERR_DATA_TIMEOUT;
         return SDIO_ERR_DATA_TIMEOUT;
     }
     }
-    else
-    {
-        // Use the idle time to calculate checksums
-        sdio_verify_rx_checksums(1);
-    }
 
 
     return SDIO_BUSY;
     return SDIO_BUSY;
 }
 }

+ 51 - 6
lib/ZuluSCSI_platform_RP2040/scsi_accel_rp2040.cpp

@@ -14,6 +14,8 @@
 #include <hardware/dma.h>
 #include <hardware/dma.h>
 #include <hardware/irq.h>
 #include <hardware/irq.h>
 #include <hardware/structs/iobank0.h>
 #include <hardware/structs/iobank0.h>
+#include <hardware/sync.h>
+#include <multicore.h>
 
 
 #define SCSI_DMA_PIO pio0
 #define SCSI_DMA_PIO pio0
 #define SCSI_DMA_SM 0
 #define SCSI_DMA_SM 0
@@ -59,6 +61,10 @@ static struct {
     uint32_t dma_countB;
     uint32_t dma_countB;
     uint32_t dma_bufA[DMA_BUF_SIZE];
     uint32_t dma_bufA[DMA_BUF_SIZE];
     uint32_t dma_bufB[DMA_BUF_SIZE];
     uint32_t dma_bufB[DMA_BUF_SIZE];
+
+    // Try to offload SCSI DMA interrupts to second core if possible
+    volatile bool core1_active;
+    mutex_t mutex;
 } g_scsi_dma;
 } g_scsi_dma;
 
 
 enum scsidma_state_t { SCSIDMA_IDLE = 0,
 enum scsidma_state_t { SCSIDMA_IDLE = 0,
@@ -217,6 +223,8 @@ static void scsi_dma_write_irq()
 {
 {
     dma_hw->ints0 = 1 << SCSI_DMA_CH;
     dma_hw->ints0 = 1 << SCSI_DMA_CH;
 
 
+    mutex_enter_blocking(&g_scsi_dma.mutex);
+
     if (g_scsi_dma.dma_current_buf == SCSIBUF_A)
     if (g_scsi_dma.dma_current_buf == SCSIBUF_A)
     {
     {
         // Transfer from buffer A finished
         // Transfer from buffer A finished
@@ -273,6 +281,30 @@ static void scsi_dma_write_irq()
             g_scsi_dma.dma_countB = refill_dmabuf(g_scsi_dma.dma_bufB);
             g_scsi_dma.dma_countB = refill_dmabuf(g_scsi_dma.dma_bufB);
         }
         }
     }
     }
+
+    mutex_exit(&g_scsi_dma.mutex);
+}
+
+// SCSI DMA interrupts are offloaded to the second core if possible
+static void enable_irq_second_core() // Entry function handed to multicore_launch_core1() by scsi_accel_rp2040_init()
+{
+    irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_write_irq); // Same handler and IRQ that the first-core fallback in scsi_accel_rp2040_init() registers
+    irq_set_enabled(DMA_IRQ_0, true);
+    g_scsi_dma.core1_active = true; // Flag checked by scsi_accel_rp2040_init() to see whether the launch worked
+}
+
+// Block the SCSI DMA interrupt from executing on either core.
+// Used during setting of the buffer pointers.
+static void scsi_dma_block_irqs() // Must be paired with scsi_dma_unblock_irqs()
+{
+    __disable_irq(); // Replaces the plain __disable_irq() that callers used before this change
+    mutex_enter_blocking(&g_scsi_dma.mutex); // Same mutex that scsi_dma_write_irq() holds while it runs
+}
+
+static void scsi_dma_unblock_irqs() // Undoes scsi_dma_block_irqs() in reverse order
+{
+    mutex_exit(&g_scsi_dma.mutex); // Release the mutex first so scsi_dma_write_irq() can take it again
+    __enable_irq(); // Then re-enable interrupts
 }
 }
 
 
 void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile int *resetFlag)
 void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile int *resetFlag)
@@ -280,7 +312,7 @@ void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile
     // Number of bytes should always be divisible by 2.
     // Number of bytes should always be divisible by 2.
     assert((count & 1) == 0);
     assert((count & 1) == 0);
 
 
-    __disable_irq();
+    scsi_dma_block_irqs();
     if (g_scsi_dma_state == SCSIDMA_WRITE)
     if (g_scsi_dma_state == SCSIDMA_WRITE)
     {
     {
         if (!g_scsi_dma.next_app_buf && data == g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
         if (!g_scsi_dma.next_app_buf && data == g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
@@ -303,7 +335,7 @@ void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile
             count = 0;
             count = 0;
         }
         }
     }
     }
-    __enable_irq();
+    scsi_dma_unblock_irqs();
 
 
     // Check if the request was combined
     // Check if the request was combined
     if (count == 0) return;
     if (count == 0) return;
@@ -366,8 +398,6 @@ void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile
         }
         }
         
         
         dma_channel_set_irq0_enabled(SCSI_DMA_CH, true);
         dma_channel_set_irq0_enabled(SCSI_DMA_CH, true);
-        irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_write_irq);
-        irq_set_enabled(DMA_IRQ_0, true);
     }
     }
 
 
     start_dma_write();
     start_dma_write();
@@ -385,8 +415,8 @@ bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data)
         return false;
         return false;
     
     
     // Check if this data item is still in queue.
     // Check if this data item is still in queue.
-    __disable_irq();
     bool finished = true;
     bool finished = true;
+    scsi_dma_block_irqs();
     if (data >= g_scsi_dma.app_buf + g_scsi_dma.dma_bytes &&
     if (data >= g_scsi_dma.app_buf + g_scsi_dma.dma_bytes &&
         data < g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
         data < g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
     {
     {
@@ -397,7 +427,7 @@ bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data)
     {
     {
         finished = false; // In queued transfer
         finished = false; // In queued transfer
     }
     }
-    __enable_irq();
+    scsi_dma_unblock_irqs();
 
 
     return finished;
     return finished;
 }
 }
@@ -522,6 +552,7 @@ void scsi_accel_rp2040_init()
     {
     {
         pio_sm_claim(SCSI_DMA_PIO, SCSI_DMA_SM);
         pio_sm_claim(SCSI_DMA_PIO, SCSI_DMA_SM);
         dma_channel_claim(SCSI_DMA_CH);
         dma_channel_claim(SCSI_DMA_CH);
+        mutex_init(&g_scsi_dma.mutex);
         g_channels_claimed = true;
         g_channels_claimed = true;
     }
     }
 
 
@@ -573,6 +604,20 @@ void scsi_accel_rp2040_init()
     channel_config_set_write_increment(&cfg, false);
     channel_config_set_write_increment(&cfg, false);
     channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DMA_SYNC_SM, true));
     channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DMA_SYNC_SM, true));
     g_scsi_dma.dma_write_pacer_config = cfg;
     g_scsi_dma.dma_write_pacer_config = cfg;
+
+    // Try to enable interrupt handling on second core
+    irq_set_enabled(DMA_IRQ_0, false);
+    g_scsi_dma.core1_active = false;
+    multicore_reset_core1();
+    multicore_launch_core1(&enable_irq_second_core);
+    delay(1);
+
+    if (!g_scsi_dma.core1_active)
+    {
+        azlog("Failed to offload SCSI DMA interrupts to second core, using first core");
+        irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_write_irq);
+        irq_set_enabled(DMA_IRQ_0, true);
+    }
 }
 }
 
 
 void scsi_accel_rp2040_setWriteMode(int syncOffset, int syncPeriod)
 void scsi_accel_rp2040_setWriteMode(int syncOffset, int syncPeriod)

+ 3 - 1
platformio.ini

@@ -67,14 +67,16 @@ build_flags =
 platform = raspberrypi
 platform = raspberrypi
 framework = arduino
 framework = arduino
 board = ZuluSCSI_RP2040
 board = ZuluSCSI_RP2040
+extra_scripts = src/build_bootloader.py
 board_build.ldscript = lib/ZuluSCSI_platform_RP2040/rp2040.ld
 board_build.ldscript = lib/ZuluSCSI_platform_RP2040/rp2040.ld
+ldscript_bootloader = lib/ZuluSCSI_platform_RP2040/rp2040_btldr.ld
 lib_deps =
 lib_deps =
     SdFat=https://github.com/greiman/SdFat#2.1.2
     SdFat=https://github.com/greiman/SdFat#2.1.2
     minIni
     minIni
     ZuluSCSI_platform_RP2040
     ZuluSCSI_platform_RP2040
     SCSI2SD
     SCSI2SD
 build_flags =
 build_flags =
-    -Os -Isrc -ggdb -g3
+    -O2 -Isrc -ggdb -g3
     -Wall -Wno-sign-compare -Wno-ignored-qualifiers
     -Wall -Wno-sign-compare -Wno-ignored-qualifiers
     -DSPI_DRIVER_SELECT=3
     -DSPI_DRIVER_SELECT=3
     -DSD_CHIP_SELECT_MODE=2
     -DSD_CHIP_SELECT_MODE=2

+ 5 - 1
src/ZuluSCSI_bootloader.cpp

@@ -1,6 +1,7 @@
 // Simple bootloader that loads new firmware from SD card.
 // Simple bootloader that loads new firmware from SD card.
 
 
 #include <ZuluSCSI_platform.h>
 #include <ZuluSCSI_platform.h>
+#include "ZuluSCSI_config.h"
 #include "ZuluSCSI_log.h"
 #include "ZuluSCSI_log.h"
 #include <SdFat.h>
 #include <SdFat.h>
 #include <string.h>
 #include <string.h>
@@ -41,7 +42,10 @@ bool program_firmware(FsFile &file)
 {
 {
     uint32_t fwsize = file.size() - AZPLATFORM_BOOTLOADER_SIZE;
     uint32_t fwsize = file.size() - AZPLATFORM_BOOTLOADER_SIZE;
     uint32_t num_pages = (fwsize + AZPLATFORM_FLASH_PAGE_SIZE - 1) / AZPLATFORM_FLASH_PAGE_SIZE;
     uint32_t num_pages = (fwsize + AZPLATFORM_FLASH_PAGE_SIZE - 1) / AZPLATFORM_FLASH_PAGE_SIZE;
-    static uint8_t buffer[AZPLATFORM_FLASH_PAGE_SIZE];
+
+    // Make sure the buffer is aligned to word boundary
+    static uint32_t buffer32[AZPLATFORM_FLASH_PAGE_SIZE / 4];
+    uint8_t *buffer = (uint8_t*)buffer32;
 
 
     if (fwsize > AZPLATFORM_FLASH_TOTAL_SIZE)
     if (fwsize > AZPLATFORM_FLASH_TOTAL_SIZE)
     {
     {

+ 12 - 0
src/ZuluSCSI_disk.cpp

@@ -93,6 +93,18 @@ public:
         else
         else
         {
         {
             m_fsfile = SD.open(filename, O_RDWR);
             m_fsfile = SD.open(filename, O_RDWR);
+
+            uint32_t begin = 0, end = 0;
+            if (m_fsfile.contiguousRange(&begin, &end))
+            {
+                // Convert to raw mapping, this avoids some unnecessary
+                // access overhead in SdFat library.
+                m_israw = true;
+                m_blockdev = SD.card();
+                m_bgnsector = begin;
+                m_endsector = end;
+                m_fsfile.close();
+            }
         }
         }
     }
     }