소스 검색

Merge pull request #60 from ZuluSCSI/rp2040_performance

RP2040 performance & bootloader improvements
Alex Perez 3 년 전
부모
커밋
42fef806ef

+ 5 - 1
README.md

@@ -59,7 +59,11 @@ When successful, the bootloader removes the update file and continues to main fi
 On failure, `Zuluerr.txt` is written on the SD card.
 
 Alternatively, the board can be programmed using USB connection in DFU mode by setting DIP switch 4.
-The necessary programmer utility for Windows can be downloaded from [GD32 website](http://www.gd32mcu.com/en/download?kw=dfu&lan=en). On Linux and MacOS, the standard 'dfu-util' can be used. It can be installed via your package manager under Linux. On MacOS, it is available through MacPorts and Brew as a package
+The necessary programmer utility for Windows can be downloaded from [GD32 website](http://www.gd32mcu.com/en/download?kw=dfu&lan=en). On Linux and MacOS, the standard 'dfu-util' can be used. It can be installed via your package manager under Linux. On MacOS, it is available through MacPorts and Brew as a package.
+
+    dfu-util --alt 0 --dfuse-address 0x08000000 --download ZuluSCSIv1_1_XXXXXX.bin
+
+For RP2040-based boards, USB programming uses a `.uf2` file, which can be copied to the USB drive that shows up in bootloader mode.
 
 DIP switches
 ------------

+ 114 - 0
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform.cpp

@@ -11,6 +11,10 @@
 
 extern "C" {
 
+// As of 2022-09-13, the PlatformIO RP2040 core is missing the __cplusplus guard in flash.h.
+// For that reason, the include has to be inside this extern "C" block.
+#include <hardware/flash.h>
+
 const char *g_azplatform_name = PLATFORM_NAME;
 
 void mbed_error_hook(const mbed_error_ctx * error_context);
@@ -229,12 +233,122 @@ void azplatform_log(const char *s)
     uart_puts(uart0, s);
 }
 
+static int g_watchdog_timeout; // Countdown reloaded by azplatform_reset_watchdog(); presumably milliseconds, given the 1000 ms tick
+static bool g_watchdog_initialized; // Set once hardware alarm 3 has been claimed and armed
+
+static void watchdog_callback(unsigned alarm_num) // Periodic alarm tick: first requests a SCSI bus reset, then forces a crash if the countdown runs out
+{
+    g_watchdog_timeout -= 1000; // Matches the 1000 ms re-arm interval at the end of this function
+
+    if (g_watchdog_timeout <= WATCHDOG_CRASH_TIMEOUT - WATCHDOG_BUS_RESET_TIMEOUT) // At least WATCHDOG_BUS_RESET_TIMEOUT has passed since the last azplatform_reset_watchdog()
+    {
+        if (!scsiDev.resetFlag) // Skip the log and the request if a reset is already flagged
+        {
+            azlog("WATCHDOG TIMEOUT, attempting bus reset");
+            scsiDev.resetFlag = 1;
+        }
+
+        if (g_watchdog_timeout <= 0) // The full WATCHDOG_CRASH_TIMEOUT has elapsed without a watchdog reset
+        {
+            assert(false); // NOTE(review): presumably ends up in the platform crash handler - confirm this still fires in release builds
+        }
+    }
+
+    hardware_alarm_set_target(3, delayed_by_ms(get_absolute_time(), 1000)); // Schedule the next tick; alarm number 3 is hard-coded here rather than taken from alarm_num
+}
+
 // This function can be used to periodically reset watchdog timer for crash handling.
 // It can also be left empty if the platform does not use a watchdog timer.
 void azplatform_reset_watchdog()
 {
+    g_watchdog_timeout = WATCHDOG_CRASH_TIMEOUT; // Restart the countdown that watchdog_callback() decrements
+
+    if (!g_watchdog_initialized) // First call only: set up the periodic alarm
+    {
+        hardware_alarm_claim(3); // Reserve hardware alarm 3 for the watchdog
+        hardware_alarm_set_callback(3, &watchdog_callback);
+        hardware_alarm_set_target(3, delayed_by_ms(get_absolute_time(), 1000)); // First tick in 1000 ms; the callback re-arms itself afterwards
+        g_watchdog_initialized = true;
+    }
+}
+
+/*****************************************/
+/* Flash reprogramming from bootloader   */
+/*****************************************/
+
+#ifdef AZPLATFORM_BOOTLOADER_SIZE
+
+extern uint32_t __real_vectors_start;
+extern uint32_t __StackTop;
+static volatile void *g_bootloader_exit_req;
+
+bool azplatform_rewrite_flash_page(uint32_t offset, uint8_t buffer[AZPLATFORM_FLASH_PAGE_SIZE]) // Erases, programs and verifies one flash page at `offset`; returns false on a rejected image or a verify mismatch
+{
+    if (offset == AZPLATFORM_BOOTLOADER_SIZE) // First page after the bootloader area: sanity-check the image before writing anything
+    {
+        if (buffer[3] != 0x20 || buffer[7] != 0x10) // NOTE(review): presumably the top bytes of the initial stack pointer (RAM, 0x20......) and reset vector (flash, 0x10......) - confirm
+        {
+            azlog("Invalid firmware file, starts with: ", bytearray(buffer, 16));
+            return false;
+        }
+    }
+
+    azdbg("Writing flash at offset ", offset, " data ", bytearray(buffer, 4));
+    assert(offset % AZPLATFORM_FLASH_PAGE_SIZE == 0); // Offset must be page-aligned
+    assert(offset >= AZPLATFORM_BOOTLOADER_SIZE); // Refuse to touch the bootloader's own region
+
+    __disable_irq(); // Interrupts stay masked for the whole erase + program sequence
+    flash_range_erase(offset, AZPLATFORM_FLASH_PAGE_SIZE);
+    flash_range_program(offset, buffer, AZPLATFORM_FLASH_PAGE_SIZE);
+    __enable_irq();
+
+    uint32_t *buf32 = (uint32_t*)buffer; // Word-wise view of the source data for the read-back comparison below
+    uint32_t num_words = AZPLATFORM_FLASH_PAGE_SIZE / 4;
+    for (int i = 0; i < num_words; i++)
+    {
+        uint32_t expected = buf32[i];
+        uint32_t actual = *(volatile uint32_t*)(XIP_NOCACHE_BASE + offset + i * 4); // Read the programmed word back through the XIP_NOCACHE_BASE alias
+        if (actual != expected)
+        {
+            azlog("Flash verify failed at offset ", offset + i * 4, " got ", actual, " expected ", expected);
+            return false; // Stop at the first mismatching word
+        }
+    }
+    return true;
+}
+
+void azplatform_boot_to_main_firmware() // Leaves a marker for btldr_reset_handler() and requests a system reset; never returns
+{
+    // To ensure that the system state is reset properly, we perform
+    // a SYSRESETREQ and jump straight from the reset vector to main application.
+    g_bootloader_exit_req = &g_bootloader_exit_req; // Marker value is the variable's own address; btldr_reset_handler() compares against it
+    SCB->AIRCR = 0x05FA0004; // 0x05FA write key in the upper half-word plus the SYSRESETREQ bit
+    while(1); // Spin until the reset takes effect
+}
+
+void btldr_reset_handler() // Reset entry for the bootloader build: selects a vector table, then loads its stack pointer and jumps to its reset handler
+{
+    uint32_t* application_base = &__real_vectors_start; // Default: the bootloader's own vector table (symbol defined in rp2040_btldr.ld)
+    if (g_bootloader_exit_req == &g_bootloader_exit_req) // Marker written by azplatform_boot_to_main_firmware() before it requested the reset
+    {
+        // Boot to main application
+        application_base = (uint32_t*)(XIP_BASE + AZPLATFORM_BOOTLOADER_SIZE);
+    }
+
+    SCB->VTOR = (uint32_t)application_base; // Point the vector table at the selected image
+    __asm__(
+        "msr msp, %0\n\t" // Entry 0 of the vector table becomes the main stack pointer
+        "bx %1" : : "r" (application_base[0]), // Entry 1 is the reset handler that is branched to
+                    "r" (application_base[1]) : "memory");
 }
 
+// Replace the reset handler when building the bootloader
+// The rp2040_btldr.ld places real vector table at an offset.
+__attribute__((section(".btldr_vectors")))
+const void * btldr_vectors[2] = {&__StackTop, (void*)&btldr_reset_handler}; // [0] initial stack pointer, [1] reset entry point
+
+#endif
+
 /**********************************************/
 /* Mapping from data bytes to GPIO BOP values */
 /**********************************************/

+ 13 - 0
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform.h

@@ -15,6 +15,9 @@ extern const char *g_azplatform_name;
 #define PLATFORM_NAME "ZuluSCSI RP2040"
 #define PLATFORM_REVISION "2.0"
 #define PLATFORM_MAX_SCSI_SPEED S2S_CFG_SPEED_SYNC_10
+#define PLATFORM_OPTIMAL_MIN_SD_WRITE_SIZE 4096
+#define PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE 65536
+#define PLATFORM_OPTIMAL_LAST_SD_WRITE_SIZE 8192
 #define SD_USE_SDIO 1
 
 // NOTE: The driver supports synchronous speeds higher than 10MB/s, but this
@@ -24,6 +27,7 @@ extern const char *g_azplatform_name;
 // Debug logging function, can be used to print to e.g. serial port.
 // May get called from interrupt handlers.
 void azplatform_log(const char *s);
+void azplatform_emergency_log_save();
 
 // Timing and delay functions.
 // Arduino platform already provides these
@@ -56,6 +60,15 @@ void azplatform_reset_watchdog();
 typedef void (*sd_callback_t)(uint32_t bytes_complete);
 void azplatform_set_sd_callback(sd_callback_t func, const uint8_t *buffer);
 
+// Reprogram firmware in main program area.
+#ifndef RP2040_DISABLE_BOOTLOADER
+#define AZPLATFORM_BOOTLOADER_SIZE (128 * 1024)
+#define AZPLATFORM_FLASH_TOTAL_SIZE (1024 * 1024)
+#define AZPLATFORM_FLASH_PAGE_SIZE 4096
+bool azplatform_rewrite_flash_page(uint32_t offset, uint8_t buffer[AZPLATFORM_FLASH_PAGE_SIZE]);
+void azplatform_boot_to_main_firmware();
+#endif
+
 // Below are GPIO access definitions that are used from scsiPhy.cpp.
 
 // Write a single SCSI pin.

+ 33 - 2
lib/ZuluSCSI_platform_RP2040/rp2040.ld

@@ -18,8 +18,18 @@ SECTIONS
     } > FLASH
     ASSERT(__boot2_end__ - __boot2_start__ == 256,
         "ERROR: Pico second stage bootloader must be 256 bytes in size")
+
+    /* If ZuluSCSI SD card bootloader is included, it goes in first 128 kB */
+    .text.bootloader : ALIGN(16) SUBALIGN(16)
+    {
+        KEEP(*(.text.btldr*))
+        . = ALIGN(131072);
+        CHECK_BOOTLOADER_SIZE = 1 / (. <= 131072);
+    } > FLASH
+
     .text : {
         __logical_binary_start = .;
+        __real_vectors_start = .;
         KEEP (*(.vectors))
         KEEP (*(.binary_info_header))
         __binary_info_header_end = .;
@@ -38,8 +48,23 @@ SECTIONS
         *(.dtors)
         *(.eh_frame*)
         . = ALIGN(4);
-        *(.text)
-        *(.text*)
+
+        /* Put only non-time-critical code in flash.
+         * This includes e.g. floating point math routines.
+         */
+        *libm*:(.text .text*)
+        *libc*:(.text .text*)
+        *libgcc*:*df*(.text .text*)
+        *USB*(.text .text*)
+        *SPI*(.text .text*)
+        *Spi*(.text .text*)
+        *spi*(.text .text*)
+        *stdc*:(.text .text*)
+        *supc*:(.text .text*)
+        *nosys*:(.text .text*)
+        *libc*:*printf*(.text .text*)
+        *libc*:*toa*(.text .text*)
+        *libminIni.a:(.text .text*)
     } > FLASH
     .rodata : {
         . = ALIGN(4);
@@ -74,7 +99,13 @@ SECTIONS
     .data : {
         __data_start__ = .;
         *(vtable)
+
+        /* Time critical code will go here to avoid external flash latency */
         *(.time_critical*)
+        . = ALIGN(4);
+        *(.text)
+        *(.text*)
+
         . = ALIGN(4);
         *(.data*)
         . = ALIGN(4);

+ 170 - 0
lib/ZuluSCSI_platform_RP2040/rp2040_btldr.ld

@@ -0,0 +1,170 @@
+/*
+ *
+ * Customized linker script for building bootloader
+ *
+ */
+
+ MEMORY
+{
+    /* The bootloader is linked to begin at 0x12000100.
+     * First 256 bytes are reserved for RP2040 second stage bootloader,
+     * which comes as part of the main firmware.elf and is never overwritten.
+     * The bootloader also runs without XIP cache because that seemed to cause
+     * problems when writing flash.
+     */
+    FLASH(rx) : ORIGIN = 0x12000100, LENGTH = 128k-256
+    RAM(rwx) : ORIGIN = 0x20000000, LENGTH = 240k  /* Leave space for pico-debug */
+    SCRATCH_X(rwx) : ORIGIN = 0x20040000, LENGTH = 4k
+    SCRATCH_Y(rwx) : ORIGIN = 0x20041000, LENGTH = 4k
+}
+ENTRY(_entry_point)
+SECTIONS
+{
+    .flash_begin : {
+        __flash_binary_start = .;
+    } > FLASH
+
+    .text : {
+        __logical_binary_start = .;
+        KEEP (*(.btldr_vectors))
+        KEEP (*(.binary_info_header))
+        __binary_info_header_end = .;
+        . = ALIGN(256);
+        __real_vectors_start = .;
+        KEEP (*(.vectors))
+        KEEP (*(.reset))
+        KEEP (*(.init))
+        *(.fini)
+        *crtbegin.o(.ctors)
+        *crtbegin?.o(.ctors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
+        *(SORT(.ctors.*))
+        *(.ctors)
+        *crtbegin.o(.dtors)
+        *crtbegin?.o(.dtors)
+        *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
+        *(SORT(.dtors.*))
+        *(.dtors)
+        *(.eh_frame*)
+        *(.text .text*)
+        . = ALIGN(4);
+    } > FLASH
+    .rodata : {
+        . = ALIGN(4);
+        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.flashdata*)))
+        *(.rodata)
+        *(.rodata*)
+        . = ALIGN(4);
+    } > FLASH
+    .ARM.extab :
+    {
+        *(.ARM.extab* .gnu.linkonce.armextab.*)
+    } > FLASH
+    __exidx_start = .;
+    .ARM.exidx :
+    {
+        *(.ARM.exidx* .gnu.linkonce.armexidx.*)
+    } > FLASH
+    __exidx_end = .;
+    . = ALIGN(4);
+    __binary_info_start = .;
+    .binary_info :
+    {
+        KEEP(*(.binary_info.keep.*))
+        *(.binary_info.*)
+    } > FLASH
+    __binary_info_end = .;
+    . = ALIGN(4);
+    __etext = .;
+   .ram_vector_table (COPY): {
+        *(.ram_vector_table)
+    } > RAM
+    .data : {
+        __data_start__ = .;
+        *(vtable)
+
+        /* Time critical code will go here to avoid external flash latency */
+        *(.time_critical*)
+
+        . = ALIGN(4);
+        *(.data*)
+        . = ALIGN(4);
+        *(.after_data.*)
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__mutex_array_start = .);
+        KEEP(*(SORT(.mutex_array.*)))
+        KEEP(*(.mutex_array))
+        PROVIDE_HIDDEN (__mutex_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__preinit_array_start = .);
+        KEEP(*(SORT(.preinit_array.*)))
+        KEEP(*(.preinit_array))
+        PROVIDE_HIDDEN (__preinit_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__init_array_start = .);
+        KEEP(*(SORT(.init_array.*)))
+        KEEP(*(.init_array))
+        PROVIDE_HIDDEN (__init_array_end = .);
+        . = ALIGN(4);
+        PROVIDE_HIDDEN (__fini_array_start = .);
+        *(SORT(.fini_array.*))
+        *(.fini_array)
+        PROVIDE_HIDDEN (__fini_array_end = .);
+        *(.jcr)
+        . = ALIGN(4);
+        __data_end__ = .;
+    } > RAM AT> FLASH
+    .uninitialized_data (COPY): {
+        . = ALIGN(4);
+        *(.uninitialized_data*)
+    } > RAM
+    .scratch_x : {
+        __scratch_x_start__ = .;
+        *(.scratch_x.*)
+        . = ALIGN(4);
+        __scratch_x_end__ = .;
+    } > SCRATCH_X AT > FLASH
+    __scratch_x_source__ = LOADADDR(.scratch_x);
+    .scratch_y : {
+        __scratch_y_start__ = .;
+        *(.scratch_y.*)
+        . = ALIGN(4);
+        __scratch_y_end__ = .;
+    } > SCRATCH_Y AT > FLASH
+    __scratch_y_source__ = LOADADDR(.scratch_y);
+    .bss : {
+        . = ALIGN(4);
+        __bss_start__ = .;
+        *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
+        *(COMMON)
+        . = ALIGN(4);
+        __bss_end__ = .;
+    } > RAM
+    .heap (COPY):
+    {
+        __end__ = .;
+        PROVIDE(end = .);
+        *(.heap*)
+        . = ORIGIN(RAM) + LENGTH(RAM) - 0x400;
+        __HeapLimit = .;
+    } > RAM
+    .stack1_dummy (COPY):
+    {
+        *(.stack1*)
+    } > SCRATCH_X
+    .stack_dummy (COPY):
+    {
+        *(.stack*)
+    } > RAM
+    .flash_end : {
+        __flash_binary_end = .;
+    } > FLASH
+    __StackTop = ORIGIN(RAM) + LENGTH(RAM);
+    __StackLimit = __StackTop - 0x400;
+    __StackOneTop = ORIGIN(SCRATCH_X) + LENGTH(SCRATCH_X);
+    __StackOneBottom = __StackOneTop - SIZEOF(.stack1_dummy);
+    __StackBottom = __StackTop - SIZEOF(.stack_dummy);
+    PROVIDE(__stack = __StackTop);
+    ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed")
+    ASSERT( __binary_info_header_end - __logical_binary_start <= 256, "Binary info must be in first 256 bytes of the binary")
+}

+ 39 - 29
lib/ZuluSCSI_platform_RP2040/rp2040_sdio.cpp

@@ -94,31 +94,35 @@ static const uint8_t crc7_table[256] = {
 // When the SDIO bus operates in 4-bit mode, the CRC16 algorithm
 // is applied to each line separately and generates total of
 // 4 x 16 = 64 bits of checksum.
+__attribute__((optimize("O3")))
 uint64_t sdio_crc16_4bit_checksum(uint32_t *data, uint32_t num_words)
 {
     uint64_t crc = 0;
     uint32_t *end = data + num_words;
     while (data < end)
     {
-        // Each 32-bit word contains 8 bits per line.
-        // Reverse the bytes because SDIO protocol is big-endian.
-        uint32_t data_in = __builtin_bswap32(*data++);
+        for (int unroll = 0; unroll < 4; unroll++)
+        {
+            // Each 32-bit word contains 8 bits per line.
+            // Reverse the bytes because SDIO protocol is big-endian.
+            uint32_t data_in = __builtin_bswap32(*data++);
 
-        // Shift out 8 bits for each line
-        uint32_t data_out = crc >> 32;
-        crc <<= 32;
+            // Shift out 8 bits for each line
+            uint32_t data_out = crc >> 32;
+            crc <<= 32;
 
-        // XOR outgoing data to itself with 4 bit delay
-        data_out ^= (data_out >> 16);
+            // XOR outgoing data to itself with 4 bit delay
+            data_out ^= (data_out >> 16);
 
-        // XOR incoming data to outgoing data with 4 bit delay
-        data_out ^= (data_in >> 16);
+            // XOR incoming data to outgoing data with 4 bit delay
+            data_out ^= (data_in >> 16);
 
-        // XOR outgoing and incoming data to accumulator at each tap
-        uint64_t xorred = data_out ^ data_in;
-        crc ^= xorred;
-        crc ^= xorred << (5 * 4);
-        crc ^= xorred << (12 * 4);
+            // XOR outgoing and incoming data to accumulator at each tap
+            uint64_t xorred = data_out ^ data_in;
+            crc ^= xorred;
+            crc ^= xorred << (5 * 4);
+            crc ^= xorred << (12 * 4);
+        }
     }
 
     return crc;
@@ -434,19 +438,29 @@ static void sdio_verify_rx_checksums(uint32_t maxcount)
 
 sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
 {
-    // Check how many DMA control blocks have been consumed
-    uint32_t dma_ctrl_block_count = (dma_hw->ch[SDIO_DMA_CHB].read_addr - (uint32_t)&g_sdio.dma_blocks);
-    dma_ctrl_block_count /= sizeof(g_sdio.dma_blocks[0]);
-
-    // Compute how many complete 512 byte SDIO blocks have been transferred
-    // When transfer ends, dma_ctrl_block_count == g_sdio.total_blocks * 2 + 1
-    g_sdio.blocks_done = (dma_ctrl_block_count - 1) / 2;
-
-    // Is it all done?
+    // Was everything done when the previous rx_poll() finished?
     if (g_sdio.blocks_done >= g_sdio.total_blocks)
     {
         g_sdio.transfer_state = SDIO_IDLE;
     }
+    else
+    {
+        // Use the idle time to calculate checksums
+        sdio_verify_rx_checksums(4);
+
+        // Check how many DMA control blocks have been consumed
+        uint32_t dma_ctrl_block_count = (dma_hw->ch[SDIO_DMA_CHB].read_addr - (uint32_t)&g_sdio.dma_blocks);
+        dma_ctrl_block_count /= sizeof(g_sdio.dma_blocks[0]);
+
+        // Compute how many complete 512 byte SDIO blocks have been transferred
+        // When transfer ends, dma_ctrl_block_count == g_sdio.total_blocks * 2 + 1
+        g_sdio.blocks_done = (dma_ctrl_block_count - 1) / 2;
+
+        // NOTE: When all blocks are done, rx_poll() still returns SDIO_BUSY once.
+        // This provides a chance to start the SCSI transfer before the last checksums
+        // are computed. Any checksum failures can be indicated in SCSI status after
+        // the data transfer has finished.
+    }
 
     if (bytes_complete)
     {
@@ -455,6 +469,7 @@ sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
 
     if (g_sdio.transfer_state == SDIO_IDLE)
     {
+        // Verify all remaining checksums.
         sdio_verify_rx_checksums(g_sdio.total_blocks);
 
         if (g_sdio.checksum_errors == 0)
@@ -472,11 +487,6 @@ sdio_status_t rp2040_sdio_rx_poll(uint32_t *bytes_complete)
         rp2040_sdio_stop();
         return SDIO_ERR_DATA_TIMEOUT;
     }
-    else
-    {
-        // Use the idle time to calculate checksums
-        sdio_verify_rx_checksums(1);
-    }
 
     return SDIO_BUSY;
 }

+ 51 - 6
lib/ZuluSCSI_platform_RP2040/scsi_accel_rp2040.cpp

@@ -14,6 +14,8 @@
 #include <hardware/dma.h>
 #include <hardware/irq.h>
 #include <hardware/structs/iobank0.h>
+#include <hardware/sync.h>
+#include <multicore.h>
 
 #define SCSI_DMA_PIO pio0
 #define SCSI_DMA_SM 0
@@ -59,6 +61,10 @@ static struct {
     uint32_t dma_countB;
     uint32_t dma_bufA[DMA_BUF_SIZE];
     uint32_t dma_bufB[DMA_BUF_SIZE];
+
+    // Try to offload SCSI DMA interrupts to second core if possible
+    volatile bool core1_active;
+    mutex_t mutex;
 } g_scsi_dma;
 
 enum scsidma_state_t { SCSIDMA_IDLE = 0,
@@ -217,6 +223,8 @@ static void scsi_dma_write_irq()
 {
     dma_hw->ints0 = 1 << SCSI_DMA_CH;
 
+    mutex_enter_blocking(&g_scsi_dma.mutex);
+
     if (g_scsi_dma.dma_current_buf == SCSIBUF_A)
     {
         // Transfer from buffer A finished
@@ -273,6 +281,30 @@ static void scsi_dma_write_irq()
             g_scsi_dma.dma_countB = refill_dmabuf(g_scsi_dma.dma_bufB);
         }
     }
+
+    mutex_exit(&g_scsi_dma.mutex);
+}
+
+// Entry point launched on core 1 by scsi_accel_rp2040_init(): SCSI DMA interrupts are offloaded to the second core if possible
+static void enable_irq_second_core()
+{
+    irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_write_irq);
+    irq_set_enabled(DMA_IRQ_0, true);
+    g_scsi_dma.core1_active = true; // Checked by scsi_accel_rp2040_init() after a short delay to see whether core 1 started
+}
+
+// Block the SCSI DMA interrupt from executing on either core.
+// Used during setting of the buffer pointers.
+static void scsi_dma_block_irqs()
+{
+    __disable_irq(); // Masks interrupts on the calling core
+    mutex_enter_blocking(&g_scsi_dma.mutex); // scsi_dma_write_irq() takes the same mutex, so a handler running on the other core is held off
+}
+
+static void scsi_dma_unblock_irqs() // Counterpart of scsi_dma_block_irqs(); releases in the reverse order of acquisition
+{
+    mutex_exit(&g_scsi_dma.mutex); // Let scsi_dma_write_irq() proceed again
+    __enable_irq(); // Unmask interrupts on the calling core
 }
 
 void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile int *resetFlag)
@@ -280,7 +312,7 @@ void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile
     // Number of bytes should always be divisible by 2.
     assert((count & 1) == 0);
 
-    __disable_irq();
+    scsi_dma_block_irqs();
     if (g_scsi_dma_state == SCSIDMA_WRITE)
     {
         if (!g_scsi_dma.next_app_buf && data == g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
@@ -303,7 +335,7 @@ void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile
             count = 0;
         }
     }
-    __enable_irq();
+    scsi_dma_unblock_irqs();
 
     // Check if the request was combined
     if (count == 0) return;
@@ -366,8 +398,6 @@ void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile
         }
         
         dma_channel_set_irq0_enabled(SCSI_DMA_CH, true);
-        irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_write_irq);
-        irq_set_enabled(DMA_IRQ_0, true);
     }
 
     start_dma_write();
@@ -385,8 +415,8 @@ bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data)
         return false;
     
     // Check if this data item is still in queue.
-    __disable_irq();
     bool finished = true;
+    scsi_dma_block_irqs();
     if (data >= g_scsi_dma.app_buf + g_scsi_dma.dma_bytes &&
         data < g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
     {
@@ -397,7 +427,7 @@ bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data)
     {
         finished = false; // In queued transfer
     }
-    __enable_irq();
+    scsi_dma_unblock_irqs();
 
     return finished;
 }
@@ -522,6 +552,7 @@ void scsi_accel_rp2040_init()
     {
         pio_sm_claim(SCSI_DMA_PIO, SCSI_DMA_SM);
         dma_channel_claim(SCSI_DMA_CH);
+        mutex_init(&g_scsi_dma.mutex);
         g_channels_claimed = true;
     }
 
@@ -573,6 +604,20 @@ void scsi_accel_rp2040_init()
     channel_config_set_write_increment(&cfg, false);
     channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DMA_SYNC_SM, true));
     g_scsi_dma.dma_write_pacer_config = cfg;
+
+    // Try to enable interrupt handling on second core
+    irq_set_enabled(DMA_IRQ_0, false);
+    g_scsi_dma.core1_active = false;
+    multicore_reset_core1();
+    multicore_launch_core1(&enable_irq_second_core);
+    delay(1);
+
+    if (!g_scsi_dma.core1_active)
+    {
+        azlog("Failed to offload SCSI DMA interrupts to second core, using first core");
+        irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_write_irq);
+        irq_set_enabled(DMA_IRQ_0, true);
+    }
 }
 
 void scsi_accel_rp2040_setWriteMode(int syncOffset, int syncPeriod)

+ 3 - 1
platformio.ini

@@ -67,14 +67,16 @@ build_flags =
 platform = raspberrypi
 framework = arduino
 board = ZuluSCSI_RP2040
+extra_scripts = src/build_bootloader.py
 board_build.ldscript = lib/ZuluSCSI_platform_RP2040/rp2040.ld
+ldscript_bootloader = lib/ZuluSCSI_platform_RP2040/rp2040_btldr.ld
 lib_deps =
     SdFat=https://github.com/greiman/SdFat#2.1.2
     minIni
     ZuluSCSI_platform_RP2040
     SCSI2SD
 build_flags =
-    -Os -Isrc -ggdb -g3
+    -O2 -Isrc -ggdb -g3
     -Wall -Wno-sign-compare -Wno-ignored-qualifiers
     -DSPI_DRIVER_SELECT=3
     -DSD_CHIP_SELECT_MODE=2

+ 5 - 1
src/ZuluSCSI_bootloader.cpp

@@ -1,6 +1,7 @@
 // Simple bootloader that loads new firmware from SD card.
 
 #include <ZuluSCSI_platform.h>
+#include "ZuluSCSI_config.h"
 #include "ZuluSCSI_log.h"
 #include <SdFat.h>
 #include <string.h>
@@ -41,7 +42,10 @@ bool program_firmware(FsFile &file)
 {
     uint32_t fwsize = file.size() - AZPLATFORM_BOOTLOADER_SIZE;
     uint32_t num_pages = (fwsize + AZPLATFORM_FLASH_PAGE_SIZE - 1) / AZPLATFORM_FLASH_PAGE_SIZE;
-    static uint8_t buffer[AZPLATFORM_FLASH_PAGE_SIZE];
+
+    // Make sure the buffer is aligned to word boundary
+    static uint32_t buffer32[AZPLATFORM_FLASH_PAGE_SIZE / 4];
+    uint8_t *buffer = (uint8_t*)buffer32;
 
     if (fwsize > AZPLATFORM_FLASH_TOTAL_SIZE)
     {

+ 12 - 0
src/ZuluSCSI_disk.cpp

@@ -93,6 +93,18 @@ public:
         else
         {
             m_fsfile = SD.open(filename, O_RDWR);
+
+            uint32_t begin = 0, end = 0;
+            if (m_fsfile.contiguousRange(&begin, &end))
+            {
+                // Convert to raw mapping, this avoids some unnecessary
+                // access overhead in SdFat library.
+                m_israw = true;
+                m_blockdev = SD.card();
+                m_bgnsector = begin;
+                m_endsector = end;
+                m_fsfile.close();
+            }
         }
     }