Browse Source

RP2040: Implement non-blocking read from SCSI bus

This allows SCSI write commands to transfer data continuously on RP2040
while SD card writes proceed.

Now achieves 8 MB/s read & 6 MB/s write in 10 MHz sync mode.
Petteri Aimonen 2 years ago
parent
commit
3d1ac5bb8a

+ 47 - 0
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform.cpp

@@ -590,6 +590,53 @@ const uint16_t g_scsi_parity_lookup[256] __attribute__((aligned(512), section(".
 
 #undef X
 
+/* Similarly, another lookup table is used to verify parity of received data.
+ * This table is indexed by the 8 data bits + 1 parity bit from SCSI bus (active low).
+ * Each word contains the data byte (inverted to active-high) in bits 0-7 and a parity-valid flag in bit 8 (1 = valid, 0 = parity error).
+ */
+#define X(n) (\
+    (((n) & 0xFF) ^ 0xFF) | /* bits 0-7: received data byte, inverted to active-high */ \
+    (((PARITY((n) & 0xFF) ^ ((n) >> 8)) & 1) << 8) /* bit 8: PARITY() of the raw data bits XOR the received parity bit */ \
+)
+
+const uint16_t g_scsi_parity_check_lookup[512] __attribute__((aligned(1024), section(".scratch_x.parity"))) =
+{
+    X(0x000), X(0x001), X(0x002), X(0x003), X(0x004), X(0x005), X(0x006), X(0x007), X(0x008), X(0x009), X(0x00a), X(0x00b), X(0x00c), X(0x00d), X(0x00e), X(0x00f),
+    X(0x010), X(0x011), X(0x012), X(0x013), X(0x014), X(0x015), X(0x016), X(0x017), X(0x018), X(0x019), X(0x01a), X(0x01b), X(0x01c), X(0x01d), X(0x01e), X(0x01f),
+    X(0x020), X(0x021), X(0x022), X(0x023), X(0x024), X(0x025), X(0x026), X(0x027), X(0x028), X(0x029), X(0x02a), X(0x02b), X(0x02c), X(0x02d), X(0x02e), X(0x02f),
+    X(0x030), X(0x031), X(0x032), X(0x033), X(0x034), X(0x035), X(0x036), X(0x037), X(0x038), X(0x039), X(0x03a), X(0x03b), X(0x03c), X(0x03d), X(0x03e), X(0x03f),
+    X(0x040), X(0x041), X(0x042), X(0x043), X(0x044), X(0x045), X(0x046), X(0x047), X(0x048), X(0x049), X(0x04a), X(0x04b), X(0x04c), X(0x04d), X(0x04e), X(0x04f),
+    X(0x050), X(0x051), X(0x052), X(0x053), X(0x054), X(0x055), X(0x056), X(0x057), X(0x058), X(0x059), X(0x05a), X(0x05b), X(0x05c), X(0x05d), X(0x05e), X(0x05f),
+    X(0x060), X(0x061), X(0x062), X(0x063), X(0x064), X(0x065), X(0x066), X(0x067), X(0x068), X(0x069), X(0x06a), X(0x06b), X(0x06c), X(0x06d), X(0x06e), X(0x06f),
+    X(0x070), X(0x071), X(0x072), X(0x073), X(0x074), X(0x075), X(0x076), X(0x077), X(0x078), X(0x079), X(0x07a), X(0x07b), X(0x07c), X(0x07d), X(0x07e), X(0x07f),
+    X(0x080), X(0x081), X(0x082), X(0x083), X(0x084), X(0x085), X(0x086), X(0x087), X(0x088), X(0x089), X(0x08a), X(0x08b), X(0x08c), X(0x08d), X(0x08e), X(0x08f),
+    X(0x090), X(0x091), X(0x092), X(0x093), X(0x094), X(0x095), X(0x096), X(0x097), X(0x098), X(0x099), X(0x09a), X(0x09b), X(0x09c), X(0x09d), X(0x09e), X(0x09f),
+    X(0x0a0), X(0x0a1), X(0x0a2), X(0x0a3), X(0x0a4), X(0x0a5), X(0x0a6), X(0x0a7), X(0x0a8), X(0x0a9), X(0x0aa), X(0x0ab), X(0x0ac), X(0x0ad), X(0x0ae), X(0x0af),
+    X(0x0b0), X(0x0b1), X(0x0b2), X(0x0b3), X(0x0b4), X(0x0b5), X(0x0b6), X(0x0b7), X(0x0b8), X(0x0b9), X(0x0ba), X(0x0bb), X(0x0bc), X(0x0bd), X(0x0be), X(0x0bf),
+    X(0x0c0), X(0x0c1), X(0x0c2), X(0x0c3), X(0x0c4), X(0x0c5), X(0x0c6), X(0x0c7), X(0x0c8), X(0x0c9), X(0x0ca), X(0x0cb), X(0x0cc), X(0x0cd), X(0x0ce), X(0x0cf),
+    X(0x0d0), X(0x0d1), X(0x0d2), X(0x0d3), X(0x0d4), X(0x0d5), X(0x0d6), X(0x0d7), X(0x0d8), X(0x0d9), X(0x0da), X(0x0db), X(0x0dc), X(0x0dd), X(0x0de), X(0x0df),
+    X(0x0e0), X(0x0e1), X(0x0e2), X(0x0e3), X(0x0e4), X(0x0e5), X(0x0e6), X(0x0e7), X(0x0e8), X(0x0e9), X(0x0ea), X(0x0eb), X(0x0ec), X(0x0ed), X(0x0ee), X(0x0ef),
+    X(0x0f0), X(0x0f1), X(0x0f2), X(0x0f3), X(0x0f4), X(0x0f5), X(0x0f6), X(0x0f7), X(0x0f8), X(0x0f9), X(0x0fa), X(0x0fb), X(0x0fc), X(0x0fd), X(0x0fe), X(0x0ff),
+    X(0x100), X(0x101), X(0x102), X(0x103), X(0x104), X(0x105), X(0x106), X(0x107), X(0x108), X(0x109), X(0x10a), X(0x10b), X(0x10c), X(0x10d), X(0x10e), X(0x10f),
+    X(0x110), X(0x111), X(0x112), X(0x113), X(0x114), X(0x115), X(0x116), X(0x117), X(0x118), X(0x119), X(0x11a), X(0x11b), X(0x11c), X(0x11d), X(0x11e), X(0x11f),
+    X(0x120), X(0x121), X(0x122), X(0x123), X(0x124), X(0x125), X(0x126), X(0x127), X(0x128), X(0x129), X(0x12a), X(0x12b), X(0x12c), X(0x12d), X(0x12e), X(0x12f),
+    X(0x130), X(0x131), X(0x132), X(0x133), X(0x134), X(0x135), X(0x136), X(0x137), X(0x138), X(0x139), X(0x13a), X(0x13b), X(0x13c), X(0x13d), X(0x13e), X(0x13f),
+    X(0x140), X(0x141), X(0x142), X(0x143), X(0x144), X(0x145), X(0x146), X(0x147), X(0x148), X(0x149), X(0x14a), X(0x14b), X(0x14c), X(0x14d), X(0x14e), X(0x14f),
+    X(0x150), X(0x151), X(0x152), X(0x153), X(0x154), X(0x155), X(0x156), X(0x157), X(0x158), X(0x159), X(0x15a), X(0x15b), X(0x15c), X(0x15d), X(0x15e), X(0x15f),
+    X(0x160), X(0x161), X(0x162), X(0x163), X(0x164), X(0x165), X(0x166), X(0x167), X(0x168), X(0x169), X(0x16a), X(0x16b), X(0x16c), X(0x16d), X(0x16e), X(0x16f),
+    X(0x170), X(0x171), X(0x172), X(0x173), X(0x174), X(0x175), X(0x176), X(0x177), X(0x178), X(0x179), X(0x17a), X(0x17b), X(0x17c), X(0x17d), X(0x17e), X(0x17f),
+    X(0x180), X(0x181), X(0x182), X(0x183), X(0x184), X(0x185), X(0x186), X(0x187), X(0x188), X(0x189), X(0x18a), X(0x18b), X(0x18c), X(0x18d), X(0x18e), X(0x18f),
+    X(0x190), X(0x191), X(0x192), X(0x193), X(0x194), X(0x195), X(0x196), X(0x197), X(0x198), X(0x199), X(0x19a), X(0x19b), X(0x19c), X(0x19d), X(0x19e), X(0x19f),
+    X(0x1a0), X(0x1a1), X(0x1a2), X(0x1a3), X(0x1a4), X(0x1a5), X(0x1a6), X(0x1a7), X(0x1a8), X(0x1a9), X(0x1aa), X(0x1ab), X(0x1ac), X(0x1ad), X(0x1ae), X(0x1af),
+    X(0x1b0), X(0x1b1), X(0x1b2), X(0x1b3), X(0x1b4), X(0x1b5), X(0x1b6), X(0x1b7), X(0x1b8), X(0x1b9), X(0x1ba), X(0x1bb), X(0x1bc), X(0x1bd), X(0x1be), X(0x1bf),
+    X(0x1c0), X(0x1c1), X(0x1c2), X(0x1c3), X(0x1c4), X(0x1c5), X(0x1c6), X(0x1c7), X(0x1c8), X(0x1c9), X(0x1ca), X(0x1cb), X(0x1cc), X(0x1cd), X(0x1ce), X(0x1cf),
+    X(0x1d0), X(0x1d1), X(0x1d2), X(0x1d3), X(0x1d4), X(0x1d5), X(0x1d6), X(0x1d7), X(0x1d8), X(0x1d9), X(0x1da), X(0x1db), X(0x1dc), X(0x1dd), X(0x1de), X(0x1df),
+    X(0x1e0), X(0x1e1), X(0x1e2), X(0x1e3), X(0x1e4), X(0x1e5), X(0x1e6), X(0x1e7), X(0x1e8), X(0x1e9), X(0x1ea), X(0x1eb), X(0x1ec), X(0x1ed), X(0x1ee), X(0x1ef),
+    X(0x1f0), X(0x1f1), X(0x1f2), X(0x1f3), X(0x1f4), X(0x1f5), X(0x1f6), X(0x1f7), X(0x1f8), X(0x1f9), X(0x1fa), X(0x1fb), X(0x1fc), X(0x1fd), X(0x1fe), X(0x1ff),
+};
+
+#undef X
+
 } /* extern "C" */
 
 /* Logging from mbed */

+ 7 - 3
lib/ZuluSCSI_platform_RP2040/ZuluSCSI_platform.h

@@ -16,8 +16,8 @@ extern const char *g_azplatform_name;
 #define PLATFORM_NAME "ZuluSCSI RP2040"
 #define PLATFORM_REVISION "2.0"
 #define PLATFORM_MAX_SCSI_SPEED S2S_CFG_SPEED_SYNC_10
-#define PLATFORM_OPTIMAL_MIN_SD_WRITE_SIZE 4096
-#define PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE 32768
+#define PLATFORM_OPTIMAL_MIN_SD_WRITE_SIZE 32768
+#define PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE 65536
 #define PLATFORM_OPTIMAL_LAST_SD_WRITE_SIZE 8192
 #define SD_USE_SDIO 1
 #define PLATFORM_HAS_INITIATOR_MODE 1
@@ -88,6 +88,11 @@ bool azplatform_read_romdrive(uint8_t *dest, uint32_t start, uint32_t count);
 bool azplatform_write_romdrive(const uint8_t *data, uint32_t start, uint32_t count);
 #endif
 
+// Parity lookup tables for writes to and reads from the SCSI bus.
+// These are used by the macros below and by the code in scsi_accel_rp2040.cpp.
+extern const uint16_t g_scsi_parity_lookup[256];
+extern const uint16_t g_scsi_parity_check_lookup[512];
+
 // Below are GPIO access definitions that are used from scsiPhy.cpp.
 
 // Write a single SCSI pin.
@@ -120,7 +125,6 @@ bool azplatform_write_romdrive(const uint8_t *data, uint32_t start, uint32_t cou
      sio_hw->gpio_oe_set = SCSI_IO_DATA_MASK)
 
 // Write SCSI data bus, also sets REQ to inactive.
-extern const uint16_t g_scsi_parity_lookup[256];
 #define SCSI_OUT_DATA(data) \
     gpio_put_masked(SCSI_IO_DATA_MASK | (1 << SCSI_OUT_REQ), \
                     g_scsi_parity_lookup[(uint8_t)(data)] | (1 << SCSI_OUT_REQ)), \

+ 20 - 4
lib/ZuluSCSI_platform_RP2040/scsiPhy.cpp

@@ -188,13 +188,13 @@ extern "C" uint32_t scsiEnterPhaseImmediate(int phase)
         scsiLogPhaseChange(phase);
 
         // Select between synchronous vs. asynchronous SCSI data transfers (both DATA_IN and DATA_OUT)
-        if (g_scsi_phase == DATA_IN && scsiDev.target->syncOffset > 0)
+        if (scsiDev.target->syncOffset > 0 && (g_scsi_phase == DATA_IN || g_scsi_phase == DATA_OUT))
         {
-            scsi_accel_rp2040_setWriteMode(scsiDev.target->syncOffset, scsiDev.target->syncPeriod);
+            scsi_accel_rp2040_setSyncMode(scsiDev.target->syncOffset, scsiDev.target->syncPeriod);
         }
         else
         {
-            scsi_accel_rp2040_setWriteMode(0, 0);
+            scsi_accel_rp2040_setSyncMode(0, 0);
         }
 
         if (phase < 0)
@@ -343,6 +343,22 @@ extern "C" uint8_t scsiReadByte(void)
 extern "C" void scsiRead(uint8_t* data, uint32_t count, int* parityError)
 {
     *parityError = 0;
-    scsi_accel_rp2040_read(data, count, parityError, &scsiDev.resetFlag);
+    scsiStartRead(data, count, parityError);
+    scsiFinishRead(data, count, parityError);
+}
+
+extern "C" void scsiStartRead(uint8_t* data, uint32_t count, int *parityError)
+{
+    scsi_accel_rp2040_startRead(data, count, parityError, &scsiDev.resetFlag);
+}
+
+extern "C" void scsiFinishRead(uint8_t* data, uint32_t count, int *parityError)
+{
+    scsi_accel_rp2040_finishRead(data, count, parityError, &scsiDev.resetFlag);
     scsiLogDataOut(data, count);
 }
+
+extern "C" bool scsiIsReadFinished(const uint8_t *data)
+{
+    return scsi_accel_rp2040_isReadFinished(data);
+}

+ 7 - 0
lib/ZuluSCSI_platform_RP2040/scsiPhy.h

@@ -54,11 +54,18 @@ uint8_t scsiReadByte(void);
 // either combine transfers or block until previous transfer completes.
 void scsiStartWrite(const uint8_t* data, uint32_t count);
 void scsiFinishWrite();
+void scsiStartRead(uint8_t* data, uint32_t count, int *parityError);
+void scsiFinishRead(uint8_t* data, uint32_t count, int *parityError);
 
 // Query whether the data at pointer has already been read, i.e. buffer can be reused.
 // If data is NULL, checks if all writes have completed.
 bool scsiIsWriteFinished(const uint8_t *data);
 
+// Query whether the data at pointer has already been received from the SCSI bus into the buffer, i.e. can be processed.
+// If data is NULL, checks if all reads have completed.
+bool scsiIsReadFinished(const uint8_t *data);
+
+#define PLATFORM_SCSIPHY_HAS_NONBLOCKING_READ 1
 
 #define s2s_getScsiRateKBs() 0
 

+ 41 - 14
lib/ZuluSCSI_platform_RP2040/scsi_accel.pio

@@ -35,23 +35,23 @@
     wait 1 gpio ACK             side 1  ; Wait for ACK to be inactive
     wait 0 gpio ACK             side 0  ; Assert REQ, wait for ACK low
 
-; Read from SCSI bus using asynchronous handshake.
-; Also works for synchronous mode down to 50 ns transfer period.
-; Data is returned as 32-bit words that contain the 8 data bits + 1 parity bit.
-; Number of bytes to receive minus 1 should be written to TX fifo.
-; Number of bytes to receive must be divisible by 2.
-.program scsi_accel_async_read
+; Read from SCSI bus using sync or async handshake.
+; Data is returned as 32-bit words:
+; - bit  0: always zero
+; - bits 1-8: data byte
+; - bit  9: parity bit
+; - bits 10-31: lookup table address
+; Lookup table address should be loaded into register Y.
+; One dummy word should be written to TX fifo for every byte to receive.
+.program scsi_accel_read
     .side_set 1
 
-    pull block                  side 1  ; Get number of bytes to receive
-    mov x, osr                  side 1  ; Store to counter X
-
-start:
+    pull block                  side 1  ; Pull from TX fifo for counting bytes and pacing sync mode
     wait 1 gpio ACK             side 1  ; Wait for ACK high
+    in null, 1                  side 0  ; Zero bit because lookup table entries are 16-bit
     wait 0 gpio ACK             side 0  ; Assert REQ, wait for ACK low
     in pins, 9                  side 1  ; Deassert REQ, read GPIO
-    in null, 23                 side 1  ; Padding bits
-    jmp x-- start               side 1  ; Decrement byte count and jump to start
+    in y, 22                    side 1  ; Copy parity lookup table address
 
 ; Data state machine for synchronous writes.
 ; Takes the lowest 9 bits of each 32 bit word and writes them to bus with REQ pulse.
@@ -63,9 +63,9 @@ start:
 .program scsi_sync_write
     .side_set 1
 
-    out pins, 9     [0]         side 1  ; Write data and parity bit, wait for deskew delay
+    out pins, 9      [0]        side 1  ; Write data and parity bit, wait for deskew delay
     out null, 23     [0]        side 0  ; Assert REQ, wait for assert time
-    in null, 1      [0]         side 1  ; Deassert REQ, wait for transfer period, wait for space in ACK buffer
+    in null, 1       [0]        side 1  ; Deassert REQ, wait for transfer period, wait for space in ACK buffer
 
 ; Data pacing state machine for synchronous writes.
 ; Takes one bit from ISR on every falling edge of ACK.
@@ -75,3 +75,30 @@ start:
     wait 1 gpio ACK
     wait 0 gpio ACK   ; Wait for falling edge on ACK
     out null, 1       ; Let scsi_sync_write send one more byte
+
+; Data pacing state machine for synchronous reads.
+; The delay times will be rewritten by C code to match the negotiated SCSI sync speed.
+; Number of bytes to receive minus one should be loaded into register X.
+; In synchronous mode this generates the REQ pulses and dummy words.
+; In asynchronous mode it just generates dummy words to feed to scsi_accel_read.
+.program scsi_sync_read_pacer
+    .side_set 1
+
+start:
+    push block      [0]      side 1  ; Send dummy word to scsi_accel_read, wait for transfer period
+    jmp x-- start   [0]      side 0  ; Assert REQ, wait for assert time
+
+finish:
+    jmp finish      [0]      side 1  ; X was zero: all dummy words sent, spin here with the side-set pin held at 1 (REQ inactive)
+
+; Parity checker for reads from SCSI bus.
+; Receives 16-bit words from g_scsi_parity_check_lookup
+; Bottom 8 bits are the data byte, which is passed to output FIFO
+; The 9th bit is parity valid bit, which is 1 for valid and 0 for parity error.
+.program scsi_read_parity
+parity_valid:
+    out isr, 8                ; Take the 8 data bits for passing to RX fifo
+    push block                ; Push the data to RX fifo
+    out x, 24                 ; Take the parity valid bit, and the rest of 32-bit word
+    jmp x-- parity_valid      ; If parity valid bit is 1, repeat from start
+    irq set 0                 ; Parity error, set interrupt flag

+ 75 - 15
lib/ZuluSCSI_platform_RP2040/scsi_accel.pio.h

@@ -70,35 +70,34 @@ static inline pio_sm_config scsi_accel_async_write_program_get_default_config(ui
 }
 #endif
 
-// --------------------- //
-// scsi_accel_async_read //
-// --------------------- //
+// --------------- //
+// scsi_accel_read //
+// --------------- //
 
-#define scsi_accel_async_read_wrap_target 0
-#define scsi_accel_async_read_wrap 6
+#define scsi_accel_read_wrap_target 0
+#define scsi_accel_read_wrap 5
 
-static const uint16_t scsi_accel_async_read_program_instructions[] = {
+static const uint16_t scsi_accel_read_program_instructions[] = {
             //     .wrap_target
     0x90a0, //  0: pull   block           side 1     
-    0xb027, //  1: mov    x, osr          side 1     
-    0x308a, //  2: wait   1 gpio, 10      side 1     
+    0x308a, //  1: wait   1 gpio, 10      side 1     
+    0x4061, //  2: in     null, 1         side 0     
     0x200a, //  3: wait   0 gpio, 10      side 0     
     0x5009, //  4: in     pins, 9         side 1     
-    0x5077, //  5: in     null, 23        side 1     
-    0x1042, //  6: jmp    x--, 2          side 1     
+    0x5056, //  5: in     y, 22           side 1     
             //     .wrap
 };
 
 #if !PICO_NO_HARDWARE
-static const struct pio_program scsi_accel_async_read_program = {
-    .instructions = scsi_accel_async_read_program_instructions,
-    .length = 7,
+static const struct pio_program scsi_accel_read_program = {
+    .instructions = scsi_accel_read_program_instructions,
+    .length = 6,
     .origin = -1,
 };
 
-static inline pio_sm_config scsi_accel_async_read_program_get_default_config(uint offset) {
+static inline pio_sm_config scsi_accel_read_program_get_default_config(uint offset) {
     pio_sm_config c = pio_get_default_sm_config();
-    sm_config_set_wrap(&c, offset + scsi_accel_async_read_wrap_target, offset + scsi_accel_async_read_wrap);
+    sm_config_set_wrap(&c, offset + scsi_accel_read_wrap_target, offset + scsi_accel_read_wrap);
     sm_config_set_sideset(&c, 1, false, false);
     return c;
 }
@@ -163,3 +162,64 @@ static inline pio_sm_config scsi_sync_write_pacer_program_get_default_config(uin
 }
 #endif
 
+// -------------------- //
+// scsi_sync_read_pacer //
+// -------------------- //
+
+#define scsi_sync_read_pacer_wrap_target 0
+#define scsi_sync_read_pacer_wrap 2
+
+static const uint16_t scsi_sync_read_pacer_program_instructions[] = {
+            //     .wrap_target
+    0x9020, //  0: push   block           side 1     
+    0x0040, //  1: jmp    x--, 0          side 0     
+    0x1002, //  2: jmp    2               side 1     
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_sync_read_pacer_program = {
+    .instructions = scsi_sync_read_pacer_program_instructions,
+    .length = 3,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_sync_read_pacer_program_get_default_config(uint offset) {
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_sync_read_pacer_wrap_target, offset + scsi_sync_read_pacer_wrap);
+    sm_config_set_sideset(&c, 1, false, false);
+    return c;
+}
+#endif
+
+// ---------------- //
+// scsi_read_parity //
+// ---------------- //
+
+#define scsi_read_parity_wrap_target 0
+#define scsi_read_parity_wrap 4
+
+static const uint16_t scsi_read_parity_program_instructions[] = {
+            //     .wrap_target
+    0x60c8, //  0: out    isr, 8                     
+    0x8020, //  1: push   block                      
+    0x6038, //  2: out    x, 24                      
+    0x0040, //  3: jmp    x--, 0                     
+    0xc000, //  4: irq    nowait 0                   
+            //     .wrap
+};
+
+#if !PICO_NO_HARDWARE
+static const struct pio_program scsi_read_parity_program = {
+    .instructions = scsi_read_parity_program_instructions,
+    .length = 5,
+    .origin = -1,
+};
+
+static inline pio_sm_config scsi_read_parity_program_get_default_config(uint offset) {
+    pio_sm_config c = pio_get_default_sm_config();
+    sm_config_set_wrap(&c, offset + scsi_read_parity_wrap_target, offset + scsi_read_parity_wrap);
+    return c;
+}
+#endif
+

+ 489 - 126
lib/ZuluSCSI_platform_RP2040/scsi_accel_rp2040.cpp

@@ -26,11 +26,17 @@
 #define SCSI_DATA_SM 1
 #define SCSI_SYNC_SM 2
 
-// SCSI bus write acceleration uses up to 4 DMA channels:
+// SCSI bus write acceleration uses 3 or 4 DMA channels (data flow A->B->C->D):
 // A: Bytes from RAM to scsi_parity PIO
 // B: Addresses from scsi_parity PIO to lookup DMA READ_ADDR register
 // C: Lookup from g_scsi_parity_lookup and copy to scsi_accel_async_write or scsi_sync_write PIO
 // D: For sync transfers, scsi_sync_write to scsi_sync_write_pacer PIO
+//
+// SCSI bus read acceleration uses 4 DMA channels (data flow D->C->B->A):
+// A: Bytes from scsi_read_parity PIO to memory buffer
+// B: Lookup from g_scsi_parity_check_lookup and copy to scsi_read_parity PIO
+// C: Addresses from scsi_accel_read PIO to lookup DMA READ_ADDR register
+// D: From pacer to data state machine to trigger transfers
 #define SCSI_DMA_CH_A 0
 #define SCSI_DMA_CH_B 1
 #define SCSI_DMA_CH_C 2
@@ -53,80 +59,43 @@ static struct {
     // PIO configurations
     uint32_t pio_offset_parity;
     uint32_t pio_offset_async_write;
-    uint32_t pio_offset_async_read;
     uint32_t pio_offset_sync_write_pacer;
     uint32_t pio_offset_sync_write;
+    uint32_t pio_offset_read;
+    uint32_t pio_offset_read_parity;
+    uint32_t pio_offset_sync_read_pacer;
     pio_sm_config pio_cfg_parity;
     pio_sm_config pio_cfg_async_write;
-    pio_sm_config pio_cfg_async_read;
     pio_sm_config pio_cfg_sync_write_pacer;
     pio_sm_config pio_cfg_sync_write;
-
-    // DMA configurations
-    dma_channel_config dma_parity_config; // Data from RAM to scsi_parity PIO
-    dma_channel_config dma_address_config; // Addresses from scsi_parity PIO to lookup DMA
-    dma_channel_config dma_lookup_config; // Data from g_scsi_parity_lookup to scsi write PIO
-    dma_channel_config dma_write_pacer_config; // In synchronous mode only, transfer between state machines
+    pio_sm_config pio_cfg_read;
+    pio_sm_config pio_cfg_read_parity;
+    pio_sm_config pio_cfg_sync_read_pacer;
+    
+    // DMA configurations for write
+    dma_channel_config dmacfg_write_chA; // Data from RAM to scsi_parity PIO
+    dma_channel_config dmacfg_write_chB; // Addresses from scsi_parity PIO to lookup DMA
+    dma_channel_config dmacfg_write_chC; // Data from g_scsi_parity_lookup to scsi write PIO
+    dma_channel_config dmacfg_write_chD; // In synchronous mode only, transfer between state machines
+
+    // DMA configurations for read
+    dma_channel_config dmacfg_read_chA; // Data to destination memory buffer
+    dma_channel_config dmacfg_read_chB; // From lookup table to scsi_read_parity PIO
+    dma_channel_config dmacfg_read_chC; // From scsi_accel_read to channel B READ_ADDR
+    dma_channel_config dmacfg_read_chD; // From pacer to data state machine
 } g_scsi_dma;
 
 enum scsidma_state_t { SCSIDMA_IDLE = 0,
                        SCSIDMA_WRITE, SCSIDMA_WRITE_DONE,
-                       SCSIDMA_READ };
+                       SCSIDMA_READ, SCSIDMA_READ_DONE };
 static volatile scsidma_state_t g_scsi_dma_state;
 static bool g_channels_claimed = false;
+static void scsidma_config_gpio();
 
-// Select GPIO from PIO peripheral or from software controlled SIO
-static void scsidma_config_gpio()
-{
-    if (g_scsi_dma_state == SCSIDMA_IDLE)
-    {
-        iobank0_hw->io[SCSI_IO_DB0].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB1].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB2].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB3].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB4].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB5].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB6].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB7].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DBP].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_OUT_REQ].ctrl = GPIO_FUNC_SIO;
-    }
-    else if (g_scsi_dma_state == SCSIDMA_WRITE)
-    {
-        // Make sure the initial state of all pins is high and output
-        pio_sm_set_pins(SCSI_DMA_PIO, SCSI_DATA_SM, 0x3FF);
-        pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 0, 10, true);
 
-        iobank0_hw->io[SCSI_IO_DB0].ctrl  = GPIO_FUNC_PIO0;
-        iobank0_hw->io[SCSI_IO_DB1].ctrl  = GPIO_FUNC_PIO0;
-        iobank0_hw->io[SCSI_IO_DB2].ctrl  = GPIO_FUNC_PIO0;
-        iobank0_hw->io[SCSI_IO_DB3].ctrl  = GPIO_FUNC_PIO0;
-        iobank0_hw->io[SCSI_IO_DB4].ctrl  = GPIO_FUNC_PIO0;
-        iobank0_hw->io[SCSI_IO_DB5].ctrl  = GPIO_FUNC_PIO0;
-        iobank0_hw->io[SCSI_IO_DB6].ctrl  = GPIO_FUNC_PIO0;
-        iobank0_hw->io[SCSI_IO_DB7].ctrl  = GPIO_FUNC_PIO0;
-        iobank0_hw->io[SCSI_IO_DBP].ctrl  = GPIO_FUNC_PIO0;
-        iobank0_hw->io[SCSI_OUT_REQ].ctrl = GPIO_FUNC_PIO0;
-    }
-    else if (g_scsi_dma_state == SCSIDMA_READ)
-    {
-        // Data bus as input, REQ pin as output
-        pio_sm_set_pins(SCSI_DMA_PIO, SCSI_DATA_SM, 0x3FF);
-        pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 0, 9, false);
-        pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 9, 1, true);
-
-        iobank0_hw->io[SCSI_IO_DB0].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB1].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB2].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB3].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB4].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB5].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB6].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DB7].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_IO_DBP].ctrl  = GPIO_FUNC_SIO;
-        iobank0_hw->io[SCSI_OUT_REQ].ctrl = GPIO_FUNC_PIO0;
-    }
-}
+/****************************************/
+/* Accelerated writes to SCSI bus       */
+/****************************************/
 
 // Load the SCSI parity state machine with the address of the parity lookup table.
 // Also sets up DMA channels B and C
@@ -143,7 +112,7 @@ static void config_parity_sm_for_write()
     // DMA channel B will copy addresses from parity PIO to DMA channel C read address register.
     // It is triggered by the parity SM RX FIFO request
     dma_channel_configure(SCSI_DMA_CH_B,
-        &g_scsi_dma.dma_address_config,
+        &g_scsi_dma.dmacfg_write_chB,
         &dma_hw->ch[SCSI_DMA_CH_C].al3_read_addr_trig,
         &SCSI_DMA_PIO->rxf[SCSI_PARITY_SM],
         1, true);
@@ -151,7 +120,7 @@ static void config_parity_sm_for_write()
     // DMA channel C will read g_scsi_parity_lookup to copy data + parity to SCSI write state machine.
     // It is triggered by SCSI write machine TX FIFO request and chains to re-enable channel B.
     dma_channel_configure(SCSI_DMA_CH_C,
-        &g_scsi_dma.dma_lookup_config,
+        &g_scsi_dma.dmacfg_write_chC,
         &SCSI_DMA_PIO->txf[SCSI_DATA_SM],
         NULL,
         1, false);
@@ -178,28 +147,25 @@ static void start_dma_write()
         g_scsi_dma_state = SCSIDMA_WRITE_DONE;
         return;
     }
+
+    uint8_t *src_buf = &g_scsi_dma.app_buf[g_scsi_dma.dma_bytes];
+    g_scsi_dma.dma_bytes += bytes_to_send;
     
     // Start DMA from current buffer to parity generator
     dma_channel_configure(SCSI_DMA_CH_A,
-        &g_scsi_dma.dma_parity_config,
+        &g_scsi_dma.dmacfg_write_chA,
         &SCSI_DMA_PIO->txf[SCSI_PARITY_SM],
-        &g_scsi_dma.app_buf[g_scsi_dma.dma_bytes],
+        src_buf,
         bytes_to_send,
         true
     );
-    g_scsi_dma.dma_bytes += bytes_to_send;
-}
-
-static void scsi_dma_write_irq()
-{
-    dma_hw->ints0 = 1 << SCSI_DMA_CH_A;
-
-    // Start writing from next buffer, if any, or set state to SCSIDMA_WRITE_DONE
-    start_dma_write();
 }
 
 void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile int *resetFlag)
 {
+    // Any read requests should be matched with a stopRead()
+    assert(g_scsi_dma_state != SCSIDMA_READ && g_scsi_dma_state != SCSIDMA_READ_DONE);
+
     __disable_irq();
     if (g_scsi_dma_state == SCSIDMA_WRITE)
     {
@@ -290,7 +256,7 @@ void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile
 
             // Start DMA transfer to move dummy bits to write pacer
             dma_channel_configure(SCSI_DMA_CH_D,
-                &g_scsi_dma.dma_write_pacer_config,
+                &g_scsi_dma.dmacfg_write_chD,
                 &SCSI_DMA_PIO->txf[SCSI_SYNC_SM],
                 &SCSI_DMA_PIO->rxf[SCSI_DATA_SM],
                 0xFFFFFFFF,
@@ -339,6 +305,7 @@ bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data)
     return finished;
 }
 
+// Once DMA has finished, check if all PIO queues have been drained
 static bool scsi_accel_rp2040_isWriteDone()
 {
     // Check if data is still waiting in PIO FIFO
@@ -369,7 +336,7 @@ static bool scsi_accel_rp2040_isWriteDone()
     return true;
 }
 
-void scsi_accel_rp2040_stopWrite(volatile int *resetFlag)
+static void scsi_accel_rp2040_stopWrite(volatile int *resetFlag)
 {
     // Wait for TX fifo to be empty and ACK to go high
     // For synchronous writes wait for all ACKs to be received also
@@ -411,8 +378,8 @@ void scsi_accel_rp2040_finishWrite(volatile int *resetFlag)
              " state: ", (int)g_scsi_dma_state, " ", (int)g_scsi_dma.dma_bytes, "/", (int)g_scsi_dma.app_bytes, ", ", (int)g_scsi_dma.next_app_bytes,
              " PIO PC: ", (int)pio_sm_get_pc(SCSI_DMA_PIO, SCSI_DATA_SM), " ", (int)pio_sm_get_pc(SCSI_DMA_PIO, SCSI_SYNC_SM),
              " PIO FIFO: ", (int)pio_sm_get_tx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM), " ", (int)pio_sm_get_tx_fifo_level(SCSI_DMA_PIO, SCSI_SYNC_SM),
-             " DMA counts: ", dma_hw->ch[SCSI_DMA_CH_A].al2_transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_B].al2_transfer_count,
-                         " ", dma_hw->ch[SCSI_DMA_CH_C].al2_transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_D].al2_transfer_count);
+             " DMA counts: ", dma_hw->ch[SCSI_DMA_CH_A].transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_B].transfer_count,
+                         " ", dma_hw->ch[SCSI_DMA_CH_C].transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_D].transfer_count);
             *resetFlag = 1;
             break;
         }
@@ -425,58 +392,388 @@ void scsi_accel_rp2040_finishWrite(volatile int *resetFlag)
     }
 }
 
-void scsi_accel_rp2040_read(uint8_t *buf, uint32_t count, int *parityError, volatile int *resetFlag)
+/****************************************/
+/* Accelerated reads from SCSI bus      */
+/****************************************/
+
+// Load the SCSI read state machine with the address of the parity lookup table.
+// Also sets up DMA channels B, C and D
+static void config_parity_sm_for_read()
 {
-    // The hardware would support DMA for reading from SCSI bus also, but currently
-    // the rest of the software architecture does not. There is not much benefit
-    // because there isn't much else to do before we get the data from the SCSI bus.
-    //
-    // Currently this method just reads from the PIO RX fifo directly in software loop.
+    // Configure parity check state machine
+    pio_sm_init(SCSI_DMA_PIO, SCSI_PARITY_SM, g_scsi_dma.pio_offset_read_parity, &g_scsi_dma.pio_cfg_read_parity);
+
+    // Load the lookup table base address (addrbase >> 10) to state machine register Y
+    uint32_t addrbase = (uint32_t)&g_scsi_parity_check_lookup[0];
+    assert((addrbase & 0x3FF) == 0);
+    pio_sm_init(SCSI_DMA_PIO, SCSI_DATA_SM, g_scsi_dma.pio_offset_read, &g_scsi_dma.pio_cfg_read);
+    pio_sm_put(SCSI_DMA_PIO, SCSI_DATA_SM, addrbase >> 10);
+    pio_sm_exec(SCSI_DMA_PIO, SCSI_DATA_SM, pio_encode_pull(false, false) | pio_encode_sideset(1, 1));
+    pio_sm_exec(SCSI_DMA_PIO, SCSI_DATA_SM, pio_encode_mov(pio_y, pio_osr) | pio_encode_sideset(1, 1));
     
-    g_scsi_dma_state = SCSIDMA_READ;
-    pio_sm_init(SCSI_DMA_PIO, SCSI_DATA_SM, g_scsi_dma.pio_offset_async_read, &g_scsi_dma.pio_cfg_async_read);
-    scsidma_config_gpio();
+    // For synchronous mode, the REQ pin is driven by SCSI_SYNC_SM, so disable it in SCSI_DATA_SM
+    if (g_scsi_dma.syncOffset > 0)
+    {
+        pio_sm_set_sideset_pins(SCSI_DMA_PIO, SCSI_DATA_SM, 0);
+    }
+
+    // DMA channel B will read g_scsi_parity_check_lookup and write to scsi_read_parity PIO.
+    dma_channel_configure(SCSI_DMA_CH_B,
+        &g_scsi_dma.dmacfg_read_chB,
+        &SCSI_DMA_PIO->txf[SCSI_PARITY_SM],
+        NULL,
+        1, false);
+    
+    // DMA channel C will copy addresses from data PIO to DMA channel B read address register.
+    // It is triggered by the data SM RX FIFO request.
+    // This triggers channel B by writing to READ_ADDR_TRIG
+    // Channel B chaining re-enables this channel.
+    dma_channel_configure(SCSI_DMA_CH_C,
+        &g_scsi_dma.dmacfg_read_chC,
+        &dma_hw->ch[SCSI_DMA_CH_B].al3_read_addr_trig,
+        &SCSI_DMA_PIO->rxf[SCSI_DATA_SM],
+        1, true);
+
+    if (g_scsi_dma.syncOffset == 0)
+    {
+        // DMA channel D will copy dummy words to scsi_accel_read PIO to set the number
+        // of bytes to transfer.
+        static const uint32_t dummy = 0;
+        dma_channel_configure(SCSI_DMA_CH_D,
+            &g_scsi_dma.dmacfg_read_chD,
+            &SCSI_DMA_PIO->txf[SCSI_DATA_SM],
+            &dummy,
+            0, false);
+    }
+    else
+    {
+        pio_sm_init(SCSI_DMA_PIO, SCSI_SYNC_SM, g_scsi_dma.pio_offset_sync_read_pacer, &g_scsi_dma.pio_cfg_sync_read_pacer);
+
+        // DMA channel D will copy words from scsi_sync_read_pacer to scsi_accel_read PIO
+        // to control the offset between REQ pulses sent and ACK pulses received.
+        dma_channel_configure(SCSI_DMA_CH_D,
+            &g_scsi_dma.dmacfg_read_chD,
+            &SCSI_DMA_PIO->txf[SCSI_DATA_SM],
+            &SCSI_DMA_PIO->rxf[SCSI_SYNC_SM],
+            0, false);
+    }
+
+    // Clear PIO IRQ flag that is used to detect parity error
+    SCSI_DMA_PIO->irq = 1;
+}
+
+static void start_dma_read()
+{
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_PARITY_SM, false);
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DATA_SM, false);
+    pio_sm_clear_fifos(SCSI_DMA_PIO, SCSI_PARITY_SM);
+    pio_sm_clear_fifos(SCSI_DMA_PIO, SCSI_DATA_SM);
+    
+    if (g_scsi_dma.app_bytes <= g_scsi_dma.dma_bytes)
+    {
+        // Buffer has been fully processed, swap it
+        g_scsi_dma.dma_bytes = 0;
+        g_scsi_dma.app_buf = g_scsi_dma.next_app_buf;
+        g_scsi_dma.app_bytes = g_scsi_dma.next_app_bytes;
+        g_scsi_dma.next_app_buf = 0;
+        g_scsi_dma.next_app_bytes = 0;
+    }
+    
+    // Check if we are all done.
+    // From SCSIDMA_READ_DONE state we can either go to IDLE in stopRead()
+    // or back to READ in startRead().
+    uint32_t bytes_to_read = g_scsi_dma.app_bytes - g_scsi_dma.dma_bytes;
+    if (bytes_to_read == 0)
+    {
+        g_scsi_dma_state = SCSIDMA_READ_DONE;
+        return;
+    }
+
+    if (g_scsi_dma.syncOffset == 0)
+    {
+        // Start sending dummy words to scsi_accel_read state machine
+        dma_channel_set_trans_count(SCSI_DMA_CH_D, bytes_to_read, true);
+    }
+    else
+    {
+        // Set number of bytes to receive to the scsi_sync_read_pacer state machine register X
+        pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_SYNC_SM, false);
+        hw_clear_bits(&SCSI_DMA_PIO->sm[SCSI_SYNC_SM].shiftctrl, PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS);
+        pio_sm_put(SCSI_DMA_PIO, SCSI_SYNC_SM, bytes_to_read - 1);
+        pio_sm_exec(SCSI_DMA_PIO, SCSI_SYNC_SM, pio_encode_pull(false, false) | pio_encode_sideset(1, 1));
+        pio_sm_exec(SCSI_DMA_PIO, SCSI_SYNC_SM, pio_encode_mov(pio_x, pio_osr) | pio_encode_sideset(1, 1));
+        hw_set_bits(&SCSI_DMA_PIO->sm[SCSI_SYNC_SM].shiftctrl, PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS);
+        
+        // Prefill FIFOs to get correct syncOffset
+        int prefill = 12 - g_scsi_dma.syncOffset;
+        
+        // Always at least 1 word to avoid race condition between REQ and ACK pulses
+        if (prefill < 1) prefill = 1;
+
+        // Up to 4 words in SCSI_DATA_SM TX fifo
+        for (int i = 0; i < 4 && prefill > 0; i++)
+        {
+            pio_sm_put(SCSI_DMA_PIO, SCSI_DATA_SM, 0);
+            prefill--;
+        }
+
+        // Up to 8 words in SCSI_SYNC_SM RX fifo
+        for (int i = 0; i < 8 && prefill > 0; i++)
+        {
+            pio_sm_exec(SCSI_DMA_PIO, SCSI_SYNC_SM, pio_encode_push(false, false) | pio_encode_sideset(1, 1));
+            prefill--;
+        }
+        
+        pio_sm_exec(SCSI_DMA_PIO, SCSI_SYNC_SM, pio_encode_jmp(g_scsi_dma.pio_offset_sync_read_pacer) | pio_encode_sideset(1, 1));
+
+        // Start transfers
+        dma_channel_set_trans_count(SCSI_DMA_CH_D, bytes_to_read, true);
+    }
+
+    // Start DMA to fill the destination buffer
+    uint8_t *dest_buf = &g_scsi_dma.app_buf[g_scsi_dma.dma_bytes];
+    g_scsi_dma.dma_bytes += bytes_to_read;
+    dma_channel_configure(SCSI_DMA_CH_A,
+        &g_scsi_dma.dmacfg_read_chA,
+        dest_buf,
+        &SCSI_DMA_PIO->rxf[SCSI_PARITY_SM],
+        bytes_to_read,
+        true
+    );
+
+    // Ready to start the data and parity check state machines
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_PARITY_SM, true);
     pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DATA_SM, true);
 
-    // Set the number of bytes to read
-    pio_sm_put(SCSI_DMA_PIO, SCSI_DATA_SM, count - 1);
+    if (g_scsi_dma.syncOffset > 0)
+    {
+        // Start sending REQ pulses
+        pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_SYNC_SM, true);
+    }
+}
+
+void scsi_accel_rp2040_startRead(uint8_t *data, uint32_t count, int *parityError, volatile int *resetFlag)
+{
+    // Any write requests should be matched with a stopWrite()
+    assert(g_scsi_dma_state != SCSIDMA_WRITE && g_scsi_dma_state != SCSIDMA_WRITE_DONE);
 
-    // Read results from PIO RX FIFO
-    uint8_t *dst = buf;
-    uint8_t *end = buf + count;
-    uint32_t paritycheck = 0;
-    while (dst < end)
+    __disable_irq();
+    if (g_scsi_dma_state == SCSIDMA_READ)
     {
-        if (*resetFlag)
+        if (!g_scsi_dma.next_app_buf && data == g_scsi_dma.app_buf + g_scsi_dma.app_bytes)
         {
-            break;
+            // Combine with currently running request
+            g_scsi_dma.app_bytes += count;
+            count = 0;
         }
+        else if (data == g_scsi_dma.next_app_buf + g_scsi_dma.next_app_bytes)
+        {
+            // Combine with queued request
+            g_scsi_dma.next_app_bytes += count;
+            count = 0;
+        }
+        else if (!g_scsi_dma.next_app_buf)
+        {
+            // Add as queued request
+            g_scsi_dma.next_app_buf = (uint8_t*)data;
+            g_scsi_dma.next_app_bytes = count;
+            count = 0;
+        }
+    }
+    __enable_irq();
 
-        uint32_t available = pio_sm_get_rx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM);
+    // Check if the request was combined
+    if (count == 0) return;
 
-        while (available > 0)
+    if (g_scsi_dma_state != SCSIDMA_IDLE && g_scsi_dma_state != SCSIDMA_READ_DONE)
+    {
+        // Wait for previous request to finish
+        scsi_accel_rp2040_finishRead(NULL, 0, parityError, resetFlag);
+        if (*resetFlag)
         {
-            available--;
-            uint32_t word = pio_sm_get(SCSI_DMA_PIO, SCSI_DATA_SM);
-            paritycheck ^= word;
-            word = ~word;
-            *dst++ = word & 0xFF;
+            return;
         }
     }
 
-    // Check parity errors in whole block
-    // This doesn't detect if there is even number of parity errors in block.
-    uint8_t byte0 = ~(paritycheck & 0xFF);
-    if (paritycheck != g_scsi_parity_lookup[byte0])
+    bool must_reconfig_gpio = (g_scsi_dma_state == SCSIDMA_IDLE);
+    g_scsi_dma_state = SCSIDMA_READ;
+    g_scsi_dma.app_buf = (uint8_t*)data;
+    g_scsi_dma.app_bytes = count;
+    g_scsi_dma.dma_bytes = 0;
+    g_scsi_dma.next_app_buf = 0;
+    g_scsi_dma.next_app_bytes = 0;
+
+    if (must_reconfig_gpio)
+    {
+        config_parity_sm_for_read();
+        scsidma_config_gpio();
+        dma_channel_set_irq0_enabled(SCSI_DMA_CH_A, true);
+    }
+
+    start_dma_read();
+}
+
+bool scsi_accel_rp2040_isReadFinished(const uint8_t* data)
+{
+    // Check if everything has completed
+    if (g_scsi_dma_state == SCSIDMA_IDLE || g_scsi_dma_state == SCSIDMA_READ_DONE)
+    {
+        return true;
+    }
+
+    if (!data)
+        return false;
+
+    // Check if this data item is still in queue.
+    bool finished = true;
+    __disable_irq();
+    if (data >= g_scsi_dma.app_buf &&
+        data < g_scsi_dma.app_buf + g_scsi_dma.app_bytes &&
+        (uint32_t)data >= dma_hw->ch[SCSI_DMA_CH_A].write_addr)
+    {
+        finished = false; // In current transfer
+    }
+    else if (data >= g_scsi_dma.next_app_buf &&
+             data < g_scsi_dma.next_app_buf + g_scsi_dma.next_app_bytes)
     {
-        azdbg("Parity error in scsi_accel_rp2040_read(): ", paritycheck);
-        *parityError = 1;
+        finished = false; // In queued transfer
     }
+    __enable_irq();
 
+    return finished;
+}
+
+static void scsi_accel_rp2040_stopRead()
+{
+    dma_channel_abort(SCSI_DMA_CH_A);
+    dma_channel_abort(SCSI_DMA_CH_B);
+    dma_channel_abort(SCSI_DMA_CH_C);
+    dma_channel_abort(SCSI_DMA_CH_D);
+    dma_channel_set_irq0_enabled(SCSI_DMA_CH_A, false);
     g_scsi_dma_state = SCSIDMA_IDLE;
     SCSI_RELEASE_DATA_REQ();
     scsidma_config_gpio();
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_PARITY_SM, false);
     pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_DATA_SM, false);
+    pio_sm_set_enabled(SCSI_DMA_PIO, SCSI_SYNC_SM, false);
+}
+
+void scsi_accel_rp2040_finishRead(const uint8_t *data, uint32_t count, int *parityError, volatile int *resetFlag)
+{
+    uint32_t start = millis();
+    const uint8_t *query_addr = (data ? (data + count - 1) : NULL);
+    while (!scsi_accel_rp2040_isReadFinished(query_addr) && !*resetFlag)
+    {
+        if ((uint32_t)(millis() - start) > 5000)
+        {
+            azlog("scsi_accel_rp2040_finishRead timeout,"
+             " state: ", (int)g_scsi_dma_state, " ", (int)g_scsi_dma.dma_bytes, "/", (int)g_scsi_dma.app_bytes, ", ", (int)g_scsi_dma.next_app_bytes,
+             " PIO PC: ", (int)pio_sm_get_pc(SCSI_DMA_PIO, SCSI_DATA_SM), " ", (int)pio_sm_get_pc(SCSI_DMA_PIO, SCSI_SYNC_SM),
+             " PIO FIFO: ", (int)pio_sm_get_rx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM), " ", (int)pio_sm_get_tx_fifo_level(SCSI_DMA_PIO, SCSI_DATA_SM),
+             " DMA counts: ", dma_hw->ch[SCSI_DMA_CH_A].transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_B].transfer_count,
+                         " ", dma_hw->ch[SCSI_DMA_CH_C].transfer_count, " ", dma_hw->ch[SCSI_DMA_CH_D].transfer_count);
+            *resetFlag = 1;
+            break;
+        }
+    }
+    
+    if (g_scsi_dma_state == SCSIDMA_READ_DONE || *resetFlag)
+    {
+        // This was last buffer, release bus
+        scsi_accel_rp2040_stopRead();
+    }
+    
+    // Check if any parity errors have been detected during the transfer so far
+    if (SCSI_DMA_PIO->irq & 1)
+    {
+        azdbg("scsi_accel_rp2040_finishRead(", bytearray(data, count), ") detected parity error");
+        *parityError = true;
+    }
+}
+
+/*******************************************************/
+/* Initialization functions common to read/write       */
+/*******************************************************/
+
+static void scsi_dma_irq()
+{
+    dma_hw->ints0 = (1 << SCSI_DMA_CH_A);
+
+    scsidma_state_t state = g_scsi_dma_state;
+    if (state == SCSIDMA_WRITE)
+    {
+        // Start writing from next buffer, if any, or set state to SCSIDMA_WRITE_DONE
+        start_dma_write();
+    }
+    else if (state == SCSIDMA_READ)
+    {
+        // Start reading into next buffer, if any, or set state to SCSIDMA_READ_DONE
+        start_dma_read();
+    }
+}
+
+// Select GPIO from PIO peripheral or from software controlled SIO
+static void scsidma_config_gpio()
+{
+    if (g_scsi_dma_state == SCSIDMA_IDLE)
+    {
+        iobank0_hw->io[SCSI_IO_DB0].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB1].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB2].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB3].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB4].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB5].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB6].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB7].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DBP].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_OUT_REQ].ctrl = GPIO_FUNC_SIO;
+    }
+    else if (g_scsi_dma_state == SCSIDMA_WRITE)
+    {
+        // Make sure the initial state of all pins is high and output
+        pio_sm_set_pins(SCSI_DMA_PIO, SCSI_DATA_SM, 0x3FF);
+        pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 0, 10, true);
+
+        iobank0_hw->io[SCSI_IO_DB0].ctrl  = GPIO_FUNC_PIO0;
+        iobank0_hw->io[SCSI_IO_DB1].ctrl  = GPIO_FUNC_PIO0;
+        iobank0_hw->io[SCSI_IO_DB2].ctrl  = GPIO_FUNC_PIO0;
+        iobank0_hw->io[SCSI_IO_DB3].ctrl  = GPIO_FUNC_PIO0;
+        iobank0_hw->io[SCSI_IO_DB4].ctrl  = GPIO_FUNC_PIO0;
+        iobank0_hw->io[SCSI_IO_DB5].ctrl  = GPIO_FUNC_PIO0;
+        iobank0_hw->io[SCSI_IO_DB6].ctrl  = GPIO_FUNC_PIO0;
+        iobank0_hw->io[SCSI_IO_DB7].ctrl  = GPIO_FUNC_PIO0;
+        iobank0_hw->io[SCSI_IO_DBP].ctrl  = GPIO_FUNC_PIO0;
+        iobank0_hw->io[SCSI_OUT_REQ].ctrl = GPIO_FUNC_PIO0;
+    }
+    else if (g_scsi_dma_state == SCSIDMA_READ)
+    {
+        if (g_scsi_dma.syncOffset == 0)
+        {
+            // Asynchronous read
+            // Data bus as input, REQ pin as output
+            pio_sm_set_pins(SCSI_DMA_PIO, SCSI_DATA_SM, 0x3FF);
+            pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 0, 9, false);
+            pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 9, 1, true);
+        }
+        else
+        {
+            // Synchronous read, REQ pin is written by SYNC_SM
+            pio_sm_set_pins(SCSI_DMA_PIO, SCSI_SYNC_SM, 0x3FF);
+            pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_DATA_SM, 0, 10, false);
+            pio_sm_set_consecutive_pindirs(SCSI_DMA_PIO, SCSI_SYNC_SM, 9, 1, true);
+        }
+
+        iobank0_hw->io[SCSI_IO_DB0].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB1].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB2].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB3].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB4].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB5].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB6].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DB7].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_IO_DBP].ctrl  = GPIO_FUNC_SIO;
+        iobank0_hw->io[SCSI_OUT_REQ].ctrl = GPIO_FUNC_PIO0;
+    }
 }
 
 void scsi_accel_rp2040_init()
@@ -514,14 +811,6 @@ void scsi_accel_rp2040_init()
     sm_config_set_fifo_join(&g_scsi_dma.pio_cfg_async_write, PIO_FIFO_JOIN_TX);
     sm_config_set_out_shift(&g_scsi_dma.pio_cfg_async_write, true, false, 32);
 
-    // Asynchronous / synchronous SCSI read
-    g_scsi_dma.pio_offset_async_read = pio_add_program(SCSI_DMA_PIO, &scsi_accel_async_read_program);
-    g_scsi_dma.pio_cfg_async_read = scsi_accel_async_read_program_get_default_config(g_scsi_dma.pio_offset_async_read);
-    sm_config_set_in_pins(&g_scsi_dma.pio_cfg_async_read, SCSI_IO_DB0);
-    sm_config_set_sideset_pins(&g_scsi_dma.pio_cfg_async_read, SCSI_OUT_REQ);
-    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_async_read, true, false, 32);
-    sm_config_set_in_shift(&g_scsi_dma.pio_cfg_async_read, true, true, 32);
-
     // Synchronous SCSI write pacer / ACK handler
     g_scsi_dma.pio_offset_sync_write_pacer = pio_add_program(SCSI_DMA_PIO, &scsi_sync_write_pacer_program);
     g_scsi_dma.pio_cfg_sync_write_pacer = scsi_sync_write_pacer_program_get_default_config(g_scsi_dma.pio_offset_sync_write_pacer);
@@ -535,15 +824,35 @@ void scsi_accel_rp2040_init()
     sm_config_set_out_shift(&g_scsi_dma.pio_cfg_sync_write, true, true, 32);
     sm_config_set_in_shift(&g_scsi_dma.pio_cfg_sync_write, true, true, 1);
 
-    // Create DMA channel configurations so they can be applied quickly later
+    // Asynchronous / synchronous SCSI read
+    g_scsi_dma.pio_offset_read = pio_add_program(SCSI_DMA_PIO, &scsi_accel_read_program);
+    g_scsi_dma.pio_cfg_read = scsi_accel_read_program_get_default_config(g_scsi_dma.pio_offset_read);
+    sm_config_set_in_pins(&g_scsi_dma.pio_cfg_read, SCSI_IO_DB0);
+    sm_config_set_sideset_pins(&g_scsi_dma.pio_cfg_read, SCSI_OUT_REQ);
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_read, true, false, 32);
+    sm_config_set_in_shift(&g_scsi_dma.pio_cfg_read, true, true, 32);
+
+    // Synchronous SCSI read pacer
+    g_scsi_dma.pio_offset_sync_read_pacer = pio_add_program(SCSI_DMA_PIO, &scsi_sync_read_pacer_program);
+    g_scsi_dma.pio_cfg_sync_read_pacer = scsi_sync_read_pacer_program_get_default_config(g_scsi_dma.pio_offset_sync_read_pacer);
+    sm_config_set_sideset_pins(&g_scsi_dma.pio_cfg_sync_read_pacer, SCSI_OUT_REQ);
+
+    // Read parity check
+    g_scsi_dma.pio_offset_read_parity = pio_add_program(SCSI_DMA_PIO, &scsi_read_parity_program);
+    g_scsi_dma.pio_cfg_read_parity = scsi_read_parity_program_get_default_config(g_scsi_dma.pio_offset_read_parity);
+    sm_config_set_out_shift(&g_scsi_dma.pio_cfg_read_parity, true, true, 32);
+    sm_config_set_in_shift(&g_scsi_dma.pio_cfg_read_parity, true, false, 32);
 
+    // Create DMA channel configurations so they can be applied quickly later
+    
+    // For write to SCSI BUS:
     // Channel A: Bytes from RAM to scsi_parity PIO
     dma_channel_config cfg = dma_channel_get_default_config(SCSI_DMA_CH_A);
     channel_config_set_transfer_data_size(&cfg, DMA_SIZE_8);
     channel_config_set_read_increment(&cfg, true);
     channel_config_set_write_increment(&cfg, false);
     channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_PARITY_SM, true));
-    g_scsi_dma.dma_parity_config = cfg;
+    g_scsi_dma.dmacfg_write_chA = cfg;
 
     // Channel B: Addresses from scsi_parity PIO to lookup DMA READ_ADDR register
     cfg = dma_channel_get_default_config(SCSI_DMA_CH_B);
@@ -551,7 +860,7 @@ void scsi_accel_rp2040_init()
     channel_config_set_read_increment(&cfg, false);
     channel_config_set_write_increment(&cfg, false);
     channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_PARITY_SM, false));
-    g_scsi_dma.dma_address_config = cfg;
+    g_scsi_dma.dmacfg_write_chB = cfg;
 
     // Channel C: Lookup from g_scsi_parity_lookup and copy to scsi_accel_async_write or scsi_sync_write PIO
     // When done, chain to channel B
@@ -561,7 +870,7 @@ void scsi_accel_rp2040_init()
     channel_config_set_write_increment(&cfg, false);
     channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DATA_SM, true));
     channel_config_set_chain_to(&cfg, SCSI_DMA_CH_B);
-    g_scsi_dma.dma_lookup_config = cfg;
+    g_scsi_dma.dmacfg_write_chC = cfg;
 
     // Channel D: In synchronous mode a second DMA channel is used to transfer dummy bits
     // from first state machine to second one.
@@ -570,14 +879,60 @@ void scsi_accel_rp2040_init()
     channel_config_set_read_increment(&cfg, false);
     channel_config_set_write_increment(&cfg, false);
     channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_SYNC_SM, true));
-    g_scsi_dma.dma_write_pacer_config = cfg;
+    g_scsi_dma.dmacfg_write_chD = cfg;
 
-    irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_write_irq);
+    // For read from SCSI BUS:
+    // Channel A: Bytes from scsi_read_parity PIO to destination memory buffer
+    // This takes the bottom 8 bits which is the data without parity bit.
+    // Triggered by scsi_read_parity RX FIFO.
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_A);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_8);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, true);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_PARITY_SM, false));
+    g_scsi_dma.dmacfg_read_chA = cfg;
+
+    // Channel B: Lookup from g_scsi_parity_check_lookup and copy to scsi_read_parity PIO
+    // Triggered by channel C writing to READ_ADDR_TRIG
+    // Re-enables channel C by chaining after done.
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_B);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_16);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, DREQ_FORCE);
+    channel_config_set_chain_to(&cfg, SCSI_DMA_CH_C);
+    cfg.ctrl |= DMA_CH0_CTRL_TRIG_HIGH_PRIORITY_BITS;
+    g_scsi_dma.dmacfg_read_chB = cfg;
+
+    // Channel C: Addresses from scsi_read PIO to channel B READ_ADDR register
+    // A single transfer starts when PIO RX FIFO has data.
+    // The DMA channel is re-enabled by channel B chaining.
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_C);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DATA_SM, false));
+    g_scsi_dma.dmacfg_read_chC = cfg;
+
+    // Channel D: In synchronous mode a second DMA channel is used to transfer dummy words
+    // from first state machine to second one to control the pace of data transfer.
+    // In asynchronous mode this just transfers words to control the number of bytes.
+    cfg = dma_channel_get_default_config(SCSI_DMA_CH_D);
+    channel_config_set_transfer_data_size(&cfg, DMA_SIZE_32);
+    channel_config_set_read_increment(&cfg, false);
+    channel_config_set_write_increment(&cfg, false);
+    channel_config_set_dreq(&cfg, pio_get_dreq(SCSI_DMA_PIO, SCSI_DATA_SM, true));
+    g_scsi_dma.dmacfg_read_chD = cfg;
+    
+    // Interrupts are used for data buffer swapping
+    irq_set_exclusive_handler(DMA_IRQ_0, scsi_dma_irq);
     irq_set_enabled(DMA_IRQ_0, true);
 }
 
-void scsi_accel_rp2040_setWriteMode(int syncOffset, int syncPeriod)
+void scsi_accel_rp2040_setSyncMode(int syncOffset, int syncPeriod)
 {
+    assert(g_scsi_dma_state == SCSIDMA_IDLE);
+
     if (syncOffset != g_scsi_dma.syncOffset || syncPeriod != g_scsi_dma.syncPeriod)
     {
         g_scsi_dma.syncOffset = syncOffset;
@@ -649,15 +1004,23 @@ void scsi_accel_rp2040_setWriteMode(int syncOffset, int syncPeriod)
                 if (delay2 > 15) delay2 = 15;
             }
 
-            // Patch the delay values into the instructions.
+            // Patch the delay values into the instructions in scsi_sync_write.
             // The code in scsi_accel.pio must have delay set to 0 for this to work correctly.
             uint16_t instr0 = scsi_sync_write_program_instructions[0] | pio_encode_delay(delay0);
             uint16_t instr1 = scsi_sync_write_program_instructions[1] | pio_encode_delay(delay1);
             uint16_t instr2 = scsi_sync_write_program_instructions[2] | pio_encode_delay(delay2);
-
             SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_write + 0] = instr0;
             SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_write + 1] = instr1;
             SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_write + 2] = instr2;
+
+            // And similar patching for scsi_sync_read_pacer
+            int rdelay2 = totalDelay - delay1 - 2;
+            if (rdelay2 > 15) rdelay2 = 15;
+            if (rdelay2 < 5) rdelay2 = 5;
+            uint16_t rinstr0 = scsi_sync_read_pacer_program_instructions[0] | pio_encode_delay(rdelay2);
+            uint16_t rinstr1 = (scsi_sync_read_pacer_program_instructions[1] + g_scsi_dma.pio_offset_sync_read_pacer) | pio_encode_delay(delay1);
+            SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_read_pacer + 0] = rinstr0;
+            SCSI_DMA_PIO->instr_mem[g_scsi_dma.pio_offset_sync_read_pacer + 1] = rinstr1;
         }
     }
 

+ 27 - 7
lib/ZuluSCSI_platform_RP2040/scsi_accel_rp2040.h

@@ -6,19 +6,39 @@
 
 void scsi_accel_rp2040_init();
 
-// Set SCSI access mode for write requests.
+// Set SCSI access mode for synchronous transfers
 // Setting syncOffset = 0 enables asynchronous SCSI.
 // Setting syncOffset > 0 enables synchronous SCSI.
-void scsi_accel_rp2040_setWriteMode(int syncOffset, int syncPeriod);
+void scsi_accel_rp2040_setSyncMode(int syncOffset, int syncPeriod);
 
+// Queue a request to write data from the buffer to SCSI bus.
+// This function typically returns immediately and the request will complete in background.
+// If there are too many queued requests, this function will block until previous request finishes.
 void scsi_accel_rp2040_startWrite(const uint8_t* data, uint32_t count, volatile int *resetFlag);
-void scsi_accel_rp2040_stopWrite(volatile int *resetFlag);
-void scsi_accel_rp2040_finishWrite(volatile int *resetFlag);
 
 // Query whether the data at pointer has already been read, i.e. buffer can be reused.
 // If data is NULL, checks if all writes have completed.
 bool scsi_accel_rp2040_isWriteFinished(const uint8_t* data);
 
-// Read data from SCSI bus.
-// Works for both asynchronous and synchronous modes.
-void scsi_accel_rp2040_read(uint8_t *buf, uint32_t count, int *parityError, volatile int *resetFlag);
+// Wait for all write requests to finish and release the bus.
+// If resetFlag is non-zero, aborts write immediately.
+void scsi_accel_rp2040_finishWrite(volatile int *resetFlag);
+
+// Queue a request to read data from SCSI bus to the buffer.
+// This function typically returns immediately and the request will complete in background.
+// If there are too many queued requests, this function will block until previous request finishes.
+void scsi_accel_rp2040_startRead(uint8_t *data, uint32_t count, int *parityError, volatile int *resetFlag);
+
+// Query whether the data at address has been received, i.e. is no longer part of a running or queued read request.
+// Returns true if there is no outstanding request.
+// If data is NULL, checks if all reads have completed.
+bool scsi_accel_rp2040_isReadFinished(const uint8_t* data);
+
+// Wait for a read request to complete.
+// If data is not NULL, waits only until the data at data[0] .. data[count-1] is valid.
+// If data is NULL, waits for all read requests to complete.
+// If there are no further read requests, releases the bus.
+// If resetFlag is non-zero, aborts read immediately.
+// If a parity error has been noticed in any buffer since starting the read, parityError is set to 1.
+void scsi_accel_rp2040_finishRead(const uint8_t *data, uint32_t count, int *parityError, volatile int *resetFlag);
+

+ 130 - 58
src/ZuluSCSI_disk.cpp

@@ -44,6 +44,17 @@ extern "C" {
 #define PLATFORM_OPTIMAL_LAST_SD_WRITE_SIZE 512
 #endif
 
+// Optimal size for read block from SCSI bus
+// For platforms with nonblocking transfer, this can be large.
+// For Akai MPC60 compatibility this has to be at least 5120
+#ifndef PLATFORM_OPTIMAL_SCSI_READ_BLOCK_SIZE
+#ifdef PLATFORM_SCSIPHY_HAS_NONBLOCKING_READ
+#define PLATFORM_OPTIMAL_SCSI_READ_BLOCK_SIZE 65536
+#else
+#define PLATFORM_OPTIMAL_SCSI_READ_BLOCK_SIZE 8192
+#endif
+#endif
+
 #ifndef PLATFORM_HAS_ROM_DRIVE
 // Dummy defines for platforms without ROM drive support
 #define AZPLATFORM_ROMDRIVE_PAGE_SIZE 1024
@@ -52,6 +63,22 @@ bool azplatform_read_romdrive(uint8_t *dest, uint32_t start, uint32_t count) { r
 bool azplatform_write_romdrive(const uint8_t *data, uint32_t start, uint32_t count) { return false; }
 #endif
 
+#ifndef PLATFORM_SCSIPHY_HAS_NONBLOCKING_READ
+// For platforms that do not have non-blocking read from SCSI bus
+void scsiStartRead(uint8_t* data, uint32_t count, int *parityError)
+{
+    scsiRead(data, count, parityError);
+}
+void scsiFinishRead(uint8_t* data, uint32_t count, int *parityError)
+{
+    
+}
+bool scsiIsReadFinished(const uint8_t *data)
+{
+    return true;
+}
+#endif
+
 // SD card sector size is always 512 bytes
 #define SD_SECTOR_SIZE 512
 
@@ -1268,8 +1295,9 @@ static struct {
     uint32_t bytes_sd; // Number of bytes that have been scheduled for transfer on SD card side
     uint32_t bytes_scsi; // Number of bytes that have been scheduled for transfer on SCSI side
 
-    uint32_t bytes_scsi_done;
+    uint32_t bytes_scsi_started;
     uint32_t sd_transfer_start;
+    int parityError;
 } g_disk_transfer;
 
 #ifdef PREFETCH_BUFFER_SIZE
@@ -1355,42 +1383,33 @@ void diskDataOut_callback(uint32_t bytes_complete)
     // For best performance, do SCSI reads in blocks of 4 or more bytes
     bytes_complete &= ~3;
 
-    if (g_disk_transfer.bytes_scsi_done < g_disk_transfer.bytes_scsi)
+    if (g_disk_transfer.bytes_scsi_started < g_disk_transfer.bytes_scsi)
     {
         // How many bytes remaining in the transfer?
-        uint32_t remain = g_disk_transfer.bytes_scsi - g_disk_transfer.bytes_scsi_done;
+        uint32_t remain = g_disk_transfer.bytes_scsi - g_disk_transfer.bytes_scsi_started;
         uint32_t len = remain;
         
-        // Limit maximum amount of data transferred at one go, to give enough callbacks to SD driver.
-        // Select the limit based on total bytes in the transfer.
-        // Transfer size is reduced towards the end of transfer to reduce the dead time between
-        // end of SCSI transfer and the SD write completing.
-        uint32_t limit = g_disk_transfer.bytes_scsi / 8;
-        uint32_t bytesPerSector = scsiDev.target->liveCfg.bytesPerSector;
-        if (limit < PLATFORM_OPTIMAL_MIN_SD_WRITE_SIZE) limit = PLATFORM_OPTIMAL_MIN_SD_WRITE_SIZE;
-        if (limit > PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE) limit = PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE;
-        if (limit > len) limit = PLATFORM_OPTIMAL_LAST_SD_WRITE_SIZE;
-        if (limit < bytesPerSector) limit = bytesPerSector;
-
-        if (len > limit)
-        {
-            len = limit;
-        }
-
         // Split read so that it doesn't wrap around buffer edge
         uint32_t bufsize = sizeof(scsiDev.data);
-        uint32_t start = (g_disk_transfer.bytes_scsi_done % bufsize);
+        uint32_t start = (g_disk_transfer.bytes_scsi_started % bufsize);
         if (start + len > bufsize)
             len = bufsize - start;
 
+        // Apply platform-specific optimized transfer sizes
+        if (len > PLATFORM_OPTIMAL_SCSI_READ_BLOCK_SIZE)
+        {
+            len = PLATFORM_OPTIMAL_SCSI_READ_BLOCK_SIZE;
+        }
+
         // Don't overwrite data that has not yet been written to SD card
         uint32_t sd_ready_cnt = g_disk_transfer.bytes_sd + bytes_complete;
-        if (g_disk_transfer.bytes_scsi_done + len > sd_ready_cnt + bufsize)
-            len = sd_ready_cnt + bufsize - g_disk_transfer.bytes_scsi_done;
+        if (g_disk_transfer.bytes_scsi_started + len > sd_ready_cnt + bufsize)
+            len = sd_ready_cnt + bufsize - g_disk_transfer.bytes_scsi_started;
 
         // Keep transfers a multiple of sector size.
         // Macintosh SCSI driver seems to get confused if we have a delay
         // in middle of a sector.
+        uint32_t bytesPerSector = scsiDev.target->liveCfg.bytesPerSector;
         if (remain >= bytesPerSector && len % bytesPerSector != 0)
         {
             len -= len % bytesPerSector;
@@ -1400,17 +1419,8 @@ void diskDataOut_callback(uint32_t bytes_complete)
             return;
 
         // azdbg("SCSI read ", (int)start, " + ", (int)len);
-        int parityError = 0;
-        scsiRead(&scsiDev.data[start], len, &parityError);
-        g_disk_transfer.bytes_scsi_done += len;
-
-        if (parityError)
-        {
-            scsiDev.status = CHECK_CONDITION;
-            scsiDev.target->sense.code = ABORTED_COMMAND;
-            scsiDev.target->sense.asc = SCSI_PARITY_ERROR;
-            scsiDev.phase = STATUS;
-        }
+        scsiStartRead(&scsiDev.data[start], len, &g_disk_transfer.parityError);
+        g_disk_transfer.bytes_scsi_started += len;
     }
 }
 
@@ -1424,46 +1434,108 @@ void diskDataOut()
     g_disk_transfer.buffer = scsiDev.data;
     g_disk_transfer.bytes_scsi = blockcount * bytesPerSector;
     g_disk_transfer.bytes_sd = 0;
-    g_disk_transfer.bytes_scsi_done = 0;
+    g_disk_transfer.bytes_scsi_started = 0;
     g_disk_transfer.sd_transfer_start = 0;
+    g_disk_transfer.parityError = 0;
 
     while (g_disk_transfer.bytes_sd < g_disk_transfer.bytes_scsi
            && scsiDev.phase == DATA_OUT
            && !scsiDev.resetFlag)
     {
-        // Read next block from SCSI bus
-        if (g_disk_transfer.bytes_sd == g_disk_transfer.bytes_scsi_done)
+        // Figure out how many contiguous bytes are available for writing to SD card.
+        uint32_t bufsize = sizeof(scsiDev.data);
+        uint32_t start = g_disk_transfer.bytes_sd % bufsize;
+        uint32_t len = 0;
+
+        // How much data until buffer edge wrap?
+        uint32_t available = g_disk_transfer.bytes_scsi_started - g_disk_transfer.bytes_sd;
+        if (start + available > bufsize)
+            available = bufsize - start;
+
+        // Count number of finished sectors
+        if (scsiIsReadFinished(&scsiDev.data[start + available - 1]))
         {
-            diskDataOut_callback(0);
+            len = available;
+        }
+        else
+        {
+            while (len < available && scsiIsReadFinished(&scsiDev.data[start + len + SD_SECTOR_SIZE - 1]))
+            {
+                len += SD_SECTOR_SIZE;
+            }
         }
 
-        // Figure out longest continuous block in buffer
-        uint32_t bufsize = sizeof(scsiDev.data);
-        uint32_t start = g_disk_transfer.bytes_sd % bufsize;
-        uint32_t len = g_disk_transfer.bytes_scsi_done - g_disk_transfer.bytes_sd;
-        if (start + len > bufsize) len = bufsize - start;
+        // In case the last sector is partial (256 byte SCSI sectors)
+        if (len > available)
+        {
+            len = available;
+        }
 
-        // Try to do writes in multiple of 512 bytes
-        // This allows better performance for SD card access.
-        if (len >= 512) len &= ~511;
+        // Limit a single SD write to the platform-specific optimal maximum size
+        if (len > PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE)
+        {
+            len = PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE;
+        }
 
-        // Start writing to SD card and simultaneously reading more from SCSI bus
-        uint8_t *buf = &scsiDev.data[start];
-        g_disk_transfer.sd_transfer_start = start;
-        // azdbg("SD write ", (int)start, " + ", (int)len);
-        azplatform_set_sd_callback(&diskDataOut_callback, buf);
-        if (img.file.write(buf, len) != len)
+        uint32_t remain_in_transfer = g_disk_transfer.bytes_scsi - g_disk_transfer.bytes_sd;
+        if (len < bufsize - start && len < remain_in_transfer)
         {
-            azlog("SD card write failed: ", SD.sdErrorCode());
-            scsiDev.status = CHECK_CONDITION;
-            scsiDev.target->sense.code = MEDIUM_ERROR;
-            scsiDev.target->sense.asc = WRITE_ERROR_AUTO_REALLOCATION_FAILED;
-            scsiDev.phase = STATUS;
+            // Use large write blocks in middle of transfer and smaller at the end of transfer.
+            // This improves performance for large writes and reduces latency at end of request.
+            uint32_t min_write_size = PLATFORM_OPTIMAL_MIN_SD_WRITE_SIZE;
+            if (remain_in_transfer <= PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE)
+            {
+                min_write_size = PLATFORM_OPTIMAL_LAST_SD_WRITE_SIZE;
+            }
+
+            if (len < min_write_size)
+            {                
+                len = 0;
+            }
+        }
+
+        if (len == 0)
+        {
+            // Nothing ready to transfer, check if we can read more from SCSI bus
+            diskDataOut_callback(0);
+        }
+        else
+        {
+            // Finalize transfer on SCSI side
+            scsiFinishRead(&scsiDev.data[start], len, &g_disk_transfer.parityError);
+
+            // Check parity error status before writing to SD card
+            if (g_disk_transfer.parityError)
+            {
+                scsiDev.status = CHECK_CONDITION;
+                scsiDev.target->sense.code = ABORTED_COMMAND;
+                scsiDev.target->sense.asc = SCSI_PARITY_ERROR;
+                scsiDev.phase = STATUS;
+                break;
+            }
+
+            // Start writing to SD card and simultaneously start new SCSI transfers
+            // when buffer space is freed.
+            uint8_t *buf = &scsiDev.data[start];
+            g_disk_transfer.sd_transfer_start = start;
+            // azdbg("SD write ", (int)start, " + ", (int)len, " ", bytearray(buf, len));
+            azplatform_set_sd_callback(&diskDataOut_callback, buf);
+            if (img.file.write(buf, len) != len)
+            {
+                azlog("SD card write failed: ", SD.sdErrorCode());
+                scsiDev.status = CHECK_CONDITION;
+                scsiDev.target->sense.code = MEDIUM_ERROR;
+                scsiDev.target->sense.asc = WRITE_ERROR_AUTO_REALLOCATION_FAILED;
+                scsiDev.phase = STATUS;
+            }
+            azplatform_set_sd_callback(NULL, NULL);
+            g_disk_transfer.bytes_sd += len;
         }
-        g_disk_transfer.bytes_sd += len;
     }
 
-    azplatform_set_sd_callback(NULL, NULL);
+    // Release SCSI bus
+    scsiFinishRead(NULL, 0, &g_disk_transfer.parityError);
+
     transfer.currentBlock += blockcount;
     scsiDev.dataPtr = scsiDev.dataLen = 0;