Browse Source

Performance improvements for 2021 boards

Michael McMaster 3 years ago
parent
commit
982c24efae

+ 2 - 0
lib/SCSI2SD/CHANGELOG

@@ -1,6 +1,8 @@
 2022xxxx        6.4.14
     - Fix firmware version displaying as "0.0" in scsi2sd-util when there is no
     SD card inserted.
+    - Reduce some delays for slight performance improvements.
+    - Use SD High-Speed mode on V6 2021 hardware.
 
 20220121        6.4.13
     - Fix SCSI writes with sector sizes larger than 512.

+ 102 - 96
lib/SCSI2SD/src/firmware/bsp_driver_sd.c

@@ -35,11 +35,14 @@
 /* Includes ------------------------------------------------------------------*/
 #include "bsp_driver_sd.h"
 #include "sd.h"
+#include "time.h"
 
 /* Extern variables ---------------------------------------------------------*/ 
   
 extern SD_HandleTypeDef hsd;
 
+static uint8_t HighSpeedSwitch();
+
 /**
   * @brief  Initializes the SD card device.
   * @param  None
@@ -47,111 +50,114 @@ extern SD_HandleTypeDef hsd;
   */
 uint8_t BSP_SD_Init(void)
 {
-  uint8_t SD_state = MSD_OK;
-  /* Check if the SD card is plugged in the slot */
-  if (BSP_SD_IsDetected() != SD_PRESENT)
-  {
-    return MSD_ERROR;
-  }
-  SD_state = HAL_SD_Init(&hsd);
-#ifdef BUS_4BITS
-  if (SD_state == HAL_OK)
-  {
-    if (HAL_SD_ConfigWideBusOperation(&hsd, SDIO_BUS_WIDE_4B) != HAL_OK)
+    uint8_t SD_state = MSD_OK;
+    /* Check if the SD card is plugged in the slot */
+    if (BSP_SD_IsDetected() != SD_PRESENT)
     {
-      SD_state = MSD_ERROR;
+        return MSD_ERROR;
     }
-    else
+    SD_state = HAL_SD_Init(&hsd);
+#ifdef BUS_4BITS
+    if (SD_state == HAL_OK)
     {
-      SD_state = MSD_OK;
+        if (HAL_SD_ConfigWideBusOperation(&hsd, SDIO_BUS_WIDE_4B) != HAL_OK)
+        {
+            SD_state = MSD_ERROR;
+        }
+        else
+        {
+            // Save the wide mode setting for when we call SDIO_Init again
+            // for high speed mode.
+            hsd.Init.BusWide = SDIO_BUS_WIDE_4B;
+            SD_state = MSD_OK;
 
 // Clock bypass mode is broken on STM32F205
-// #ifdef STM32F4xx
-#if 0
-      uint8_t SD_hs[64]  = {0};
-      //uint32_t SD_scr[2] = {0, 0};
-      //uint32_t SD_SPEC   = 0 ;
-      uint32_t count = 0;
-      uint32_t *tempbuff = (uint32_t *)SD_hs;
-
-      // Prepare to read 64 bytes status data
-      SDIO_DataInitTypeDef config;
-      config.DataTimeOut   = SDMMC_DATATIMEOUT;
-      config.DataLength    = 64;
-      config.DataBlockSize = SDIO_DATABLOCK_SIZE_64B;
-      config.TransferDir   = SDIO_TRANSFER_DIR_TO_SDIO;
-      config.TransferMode  = SDIO_TRANSFER_MODE_BLOCK;
-      config.DPSM          = SDIO_DPSM_ENABLE;
-      (void)SDIO_ConfigData(hsd.Instance, &config);
-
-      // High speed switch.
-      // SDR25 (25MB/s) mode 0x80FFFF01
-      // Which is the max without going to 1.8v
-      uint32_t errorstate = SDMMC_CmdSwitch(hsd.Instance, 0x80FFFF01);
-
-      // Now we read some status data
-
-      if (errorstate == HAL_SD_ERROR_NONE)
-      {
-          while(!__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXOVERR | SDIO_FLAG_DCRCFAIL | SDIO_FLAG_DTIMEOUT | SDIO_FLAG_DATAEND/* | SDIO_FLAG_STBITERR*/))
-          {
-              if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXFIFOHF))
-              {
-                  for (count = 0; count < 8; count++)
-                  {
-                      *(tempbuff + count) = SDIO_ReadFIFO(hsd.Instance);
-                  }
-
-                  tempbuff += 8;
-              }
-          }
-
-          if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_DTIMEOUT))
-          {
-              __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_DTIMEOUT);
-              SD_state = MSD_ERROR;
-          }
-          else if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_DCRCFAIL))
-          {
-              __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_DCRCFAIL);
-              SD_state = MSD_ERROR;
-          }
-          else if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXOVERR))
-          {
-              __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_RXOVERR);
-              SD_state = MSD_ERROR;
-          }
-          /*else if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_STBITERR))
+#ifdef STM32F4xx
+            if (hsd.SdCard.CardType == CARD_SDHC_SDXC)
             {
-            __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_STBITERR);
-            SD_state = MSD_ERROR;
-            }*/
-          else
-          {
-              count = SD_DATATIMEOUT;
-
-              while ((__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXDAVL)) && (count > 0))
-              {
-                  *tempbuff = SDIO_ReadFIFO(hsd.Instance);
-                  tempbuff++;
-                  count--;
-              }
-
-              /* Clear all the static flags */
-              __HAL_SD_CLEAR_FLAG(&hsd, SDIO_STATIC_FLAGS);
-
-              // After 8 "SD" clocks we can change speed
-              // Low-level init for the bypass. Changes registers only
-              hsd.Init.ClockBypass = SDIO_CLOCK_BYPASS_ENABLE;
-              SDIO_Init(hsd.Instance, hsd.Init); 
-
-          }
-      }
+                HighSpeedSwitch();
+            }
 #endif
+        }
     }
-  }
 #endif
-  return SD_state;
+    return SD_state;
+}
+
+static uint8_t HighSpeedSwitch()
+{
+    uint8_t SD_state = MSD_OK;
+
+    // Prepare to read 64 bytes status data
+    SDIO_DataInitTypeDef config;
+    config.DataTimeOut   = SDMMC_DATATIMEOUT;
+    config.DataLength    = 64;
+    config.DataBlockSize = SDIO_DATABLOCK_SIZE_64B;
+    config.TransferDir   = SDIO_TRANSFER_DIR_TO_SDIO;
+    config.TransferMode  = SDIO_TRANSFER_MODE_BLOCK;
+    config.DPSM          = SDIO_DPSM_ENABLE;
+    (void)SDIO_ConfigData(hsd.Instance, &config);
+
+    // High speed switch.
+    // SDR25 (25MB/s) mode 0x80FFFFF1
+    // Which is the max without going to 1.8v
+    uint32_t errorstate = SDMMC_CmdSwitch(hsd.Instance, 0x80FFFFF1);
+
+    // Now we read some status data
+
+    if (errorstate == HAL_SD_ERROR_NONE)
+    {
+        uint32_t statusByteCount = 0;
+        while(!__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXOVERR | SDIO_FLAG_DCRCFAIL | SDIO_FLAG_DTIMEOUT | SDIO_FLAG_DBCKEND))
+        {
+            if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXFIFOHF) && statusByteCount < 64)
+            {
+                for ( uint32_t i = 0; i < 8; i++, statusByteCount += 4)
+                {
+                    SDIO_ReadFIFO(hsd.Instance);
+                }
+            }
+        }
+
+        if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_DTIMEOUT))
+        {
+            __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_DTIMEOUT);
+            SD_state = MSD_ERROR;
+        }
+        else if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_DCRCFAIL))
+        {
+            __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_DCRCFAIL);
+            SD_state = MSD_ERROR;
+        }
+        else if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXOVERR))
+        {
+            __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_RXOVERR);
+            SD_state = MSD_ERROR;
+        }
+        else
+        {
+            // Read remaining data, could be the CRC bytes.
+            uint32_t count = SD_DATATIMEOUT;
+            while ((__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXDAVL)) && (count > 0))
+            {
+                SDIO_ReadFIFO(hsd.Instance);
+                count--;
+            }
+
+            /* Clear all the static flags */
+            __HAL_SD_CLEAR_FLAG(&hsd, SDIO_STATIC_FLAGS);
+
+            // After 8 "SD" clocks we can change speed
+            // Low-level init for the bypass. Changes registers only
+            hsd.Init.ClockBypass = SDIO_CLOCK_BYPASS_ENABLE;
+            SDIO_Init(hsd.Instance, hsd.Init); 
+
+            // 8 clocks is 160ns at 50MHz
+            s2s_delay_ns(200);
+        }
+    }
+
+    return SD_state;
 }
 
 /**

+ 544 - 328
lib/SCSI2SD/src/firmware/disk.c

@@ -38,6 +38,8 @@
 #include "time.h"
 #include "bsp.h"
 
+#include "led.h"
+
 #include <string.h>
 
 // Global
@@ -216,15 +218,6 @@ static void doWrite(uint32_t lba, uint32_t blocks)
         // No need for single-block writes atm.  Overhead of the
         // multi-block write is minimal.
         transfer.multiBlock = 1;
-
-
-        // TODO uint32_t sdLBA =
-// TODO             SCSISector2SD(
-    // TODO             scsiDev.target->cfg->sdSectorStart,
-        // TODO         bytesPerSector,
-            // TODO     lba);
-        // TODO uint32_t sdBlocks = blocks * SDSectorsPerSCSISector(bytesPerSector);
-        // TODO sdWriteMultiSectorPrep(sdLBA, sdBlocks);
     }
 }
 
@@ -303,7 +296,15 @@ static void doSeek(uint32_t lba)
     }
     else
     {
-        s2s_delay_ms(10);
+        if (unlikely(scsiDev.target->cfg->deviceType == S2S_CFG_FLOPPY_14MB) ||
+            scsiDev.compatMode < COMPAT_SCSI2)
+        {
+            s2s_delay_ms(10);
+        }
+        else
+        {
+            s2s_delay_ms(1);
+        }
     }
 }
 
@@ -558,411 +559,626 @@ int scsiDiskCommand()
     return commandHandled;
 }
 
-void scsiDiskPoll()
+static void diskDataInBuffered(int totalSDSectors, uint32_t sdLBA, int useSlowDataCount, uint32_t* phaseChangeDelayNs)
 {
     uint32_t bytesPerSector = scsiDev.target->liveCfg.bytesPerSector;
 
-    if (scsiDev.phase == DATA_IN &&
-        transfer.currentBlock != transfer.blocks)
-    {
-        // Take responsibility for waiting for the phase delays
-        uint32_t phaseChangeDelayUs = scsiEnterPhaseImmediate(DATA_IN);
-
-        int totalSDSectors =
-            transfer.blocks * SDSectorsPerSCSISector(bytesPerSector);
-        uint32_t sdLBA =
-            SCSISector2SD(
-                scsiDev.target->cfg->sdSectorStart,
-                bytesPerSector,
-                transfer.lba);
+    const int sdPerScsi = SDSectorsPerSCSISector(bytesPerSector);
+    const int buffers = sizeof(scsiDev.data) / SD_SECTOR_SIZE;
+    int prep = 0;
+    int i = 0;
+    int scsiActive __attribute__((unused)) = 0; // unused if DMA disabled
+    int sdActive = 0;
 
-        const int sdPerScsi = SDSectorsPerSCSISector(bytesPerSector);
-        const int buffers = sizeof(scsiDev.data) / SD_SECTOR_SIZE;
-        int prep = 0;
-        int i = 0;
-        int scsiActive __attribute__((unused)) = 0; // unused if DMA disabled
-        int sdActive = 0;
+    int gotHalf = 0;
+    int sentHalf = 0;
 
-        // It's highly unlikely that someone is going to use huge transfers
-        // per scsi command, but if they do it'll be slower than usual.
-        uint32_t totalScsiBytes = transfer.blocks * bytesPerSector;
-        int useSlowDataCount = totalScsiBytes >= SCSI_XFER_MAX;
-        if (!useSlowDataCount)
+    while ((i < totalSDSectors) &&
+        likely(scsiDev.phase == DATA_IN) &&
+        likely(!scsiDev.resetFlag))
+    {
+        int completedDmaSectors;
+        if (sdActive && (completedDmaSectors = sdReadDMAPoll(sdActive)))
         {
-            scsiSetDataCount(totalScsiBytes);
+            prep += completedDmaSectors;
+            sdActive -= completedDmaSectors;
+            gotHalf = 0;
         }
-
-        while ((i < totalSDSectors) &&
-            likely(scsiDev.phase == DATA_IN) &&
-            likely(!scsiDev.resetFlag))
+        else if (sdActive > 1)
         {
-            int completedDmaSectors;
-            if (sdActive && (completedDmaSectors = sdReadDMAPoll(sdActive)))
+            if ((scsiDev.data[SD_SECTOR_SIZE * (prep % buffers) + 510] != 0xAA) ||
+                (scsiDev.data[SD_SECTOR_SIZE * (prep % buffers) + 511] != 0x33))
             {
-                prep += completedDmaSectors;
-                sdActive -= completedDmaSectors;
-            } else if (sdActive > 1)
+                prep += 1;
+                sdActive -= 1;
+                gotHalf = 0;
+            }
+            else if (scsiDev.data[SD_SECTOR_SIZE * (prep % buffers) + 127] != 0xAA)
             {
-                if ((scsiDev.data[SD_SECTOR_SIZE * (prep % buffers) + 510] != 0xAA) ||
-                    (scsiDev.data[SD_SECTOR_SIZE * (prep % buffers) + 511] != 0x33))
-                {
-                    prep += 1;
-                    sdActive -= 1;
-                }
+                // Half-block
+                gotHalf = 1;
             }
+        }
 
-            if (!sdActive &&
-                (prep - i < buffers) &&
-                (prep < totalSDSectors) &&
-                ((totalSDSectors - prep) >= sdPerScsi) &&
-                (likely(!useSlowDataCount) || scsiPhyComplete()) &&
-                (HAL_SD_GetState(&hsd) != HAL_SD_STATE_BUSY)) // rx complete but IRQ not fired yet.
-            {
-                // Start an SD transfer if we have space.
-                uint32_t startBuffer = prep % buffers;
-                uint32_t sectors = totalSDSectors - prep;
-                uint32_t freeBuffers = buffers - (prep - i);
+        if (!sdActive &&
+            (prep - i < buffers) &&
+            (prep < totalSDSectors) &&
+            ((totalSDSectors - prep) >= sdPerScsi) &&
+            (likely(!useSlowDataCount) || scsiPhyComplete()) &&
+            (HAL_SD_GetState(&hsd) != HAL_SD_STATE_BUSY)) // rx complete but IRQ not fired yet.
+        {
+            // Start an SD transfer if we have space.
+            uint32_t startBuffer = prep % buffers;
+            uint32_t sectors = totalSDSectors - prep;
+            uint32_t freeBuffers = buffers - (prep - i);
 
-                uint32_t contiguousBuffers = buffers - startBuffer;
-                freeBuffers = freeBuffers < contiguousBuffers
-                    ? freeBuffers : contiguousBuffers;
-                sectors = sectors < freeBuffers ? sectors : freeBuffers;
+            uint32_t contiguousBuffers = buffers - startBuffer;
+            freeBuffers = freeBuffers < contiguousBuffers
+                ? freeBuffers : contiguousBuffers;
+            sectors = sectors < freeBuffers ? sectors : freeBuffers;
 
-                if (sectors > 128) sectors = 128; // 65536 DMA limit !!
+            if (sectors > 128) sectors = 128; // 65536 DMA limit !!
 
-                // Round-down when we have odd sector sizes.
-                if (sdPerScsi != 1)
-                {
-                    sectors = (sectors / sdPerScsi) * sdPerScsi;
-                }
+            // Round-down when we have odd sector sizes.
+            if (sdPerScsi != 1)
+            {
+                sectors = (sectors / sdPerScsi) * sdPerScsi;
+            }
 
-                for (int dodgy = 0; dodgy < sectors; dodgy++)
-                {
-                    scsiDev.data[SD_SECTOR_SIZE * (startBuffer + dodgy) + 510] = 0xAA;
-                    scsiDev.data[SD_SECTOR_SIZE * (startBuffer + dodgy) + 511] = 0x33;
-                }
+            for (int dodgy = 0; dodgy < sectors; dodgy++)
+            {
+                scsiDev.data[SD_SECTOR_SIZE * (startBuffer + dodgy) + 127] = 0xAA;
 
-                sdReadDMA(sdLBA + prep, sectors, &scsiDev.data[SD_SECTOR_SIZE * startBuffer]);
+                scsiDev.data[SD_SECTOR_SIZE * (startBuffer + dodgy) + 510] = 0xAA;
+                scsiDev.data[SD_SECTOR_SIZE * (startBuffer + dodgy) + 511] = 0x33;
+            }
 
-                sdActive = sectors;
+            sdReadDMA(sdLBA + prep, sectors, &scsiDev.data[SD_SECTOR_SIZE * startBuffer]);
 
-                if (useSlowDataCount)
-                {
-                    scsiSetDataCount((sectors / sdPerScsi) * bytesPerSector);
-                }
+            sdActive = sectors;
 
-                // Wait now that the SD card is busy
-                // Chances are we've probably already waited sufficient time,
-                // but it's hard to measure microseconds cheaply. So just wait
-                // extra just-in-case. Hopefully it's in parallel with dma.
-                if (phaseChangeDelayUs > 0)
-                {
-                    s2s_delay_us(phaseChangeDelayUs);
-                    phaseChangeDelayUs = 0;
-                }
+            if (useSlowDataCount)
+            {
+                scsiSetDataCount((sectors / sdPerScsi) * bytesPerSector);
             }
 
-            if (((prep - i) > 0) &&
-                scsiFifoReady())
+            // Wait now that the SD card is busy
+            // Chances are we've probably already waited sufficient time,
+            // but it's hard to measure microseconds cheaply. So just wait
+            // extra just-in-case. Hopefully it's in parallel with dma.
+            if (*phaseChangeDelayNs > 0)
             {
-                int dmaBytes = SD_SECTOR_SIZE;
-                if ((i % sdPerScsi) == (sdPerScsi - 1))
-                {
-                    dmaBytes = bytesPerSector % SD_SECTOR_SIZE;
-                    if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE;
-                }
+                s2s_delay_ns(*phaseChangeDelayNs);
+                *phaseChangeDelayNs = 0;
+            }
+        }
+
+        int fifoReady = scsiFifoReady();
+        if (((prep - i) > 0) && fifoReady)
+        {
+            int dmaBytes = SD_SECTOR_SIZE;
+            if ((i % sdPerScsi) == (sdPerScsi - 1))
+            {
+                dmaBytes = bytesPerSector % SD_SECTOR_SIZE;
+                if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE;
+            }
 
-                uint8_t* scsiDmaData = &(scsiDev.data[SD_SECTOR_SIZE * (i % buffers)]);
-                scsiWritePIO(scsiDmaData, dmaBytes);
+            uint8_t* scsiDmaData = &(scsiDev.data[SD_SECTOR_SIZE * (i % buffers)]);
 
-                ++i;
+            if (sentHalf)
+            {
+                scsiDmaData += SD_SECTOR_SIZE / 2;
+                dmaBytes -= (SD_SECTOR_SIZE / 2);
             }
-        }
+            scsiWritePIO(scsiDmaData, dmaBytes);
 
-        if (phaseChangeDelayUs > 0 && !scsiDev.resetFlag) // zero bytes ?
+            ++i;
+            sentHalf = 0;
+            gotHalf = 0;
+        }
+        else if (gotHalf && !sentHalf && fifoReady && bytesPerSector == SD_SECTOR_SIZE)
         {
-            s2s_delay_us(phaseChangeDelayUs);
-            phaseChangeDelayUs = 0;
+            uint8_t* scsiDmaData = &(scsiDev.data[SD_SECTOR_SIZE * (i % buffers)]);
+            scsiWritePIO(scsiDmaData, SD_SECTOR_SIZE / 2);
+            sentHalf = 1;
         }
+    }
+}
 
-        if (scsiDev.resetFlag)
+// Transfer from the SD card straight to the SCSI Fifo without storing in memory first for lower latency
+// This requires hardware flow control on the SD device (broken on stm32f205)
+// Only functional for 512 byte sectors.
+static void diskDataInDirect(uint32_t totalSDSectors, uint32_t sdLBA, int useSlowDataCount, uint32_t* phaseChangeDelayNs)
+{
+    sdReadPIO(sdLBA, totalSDSectors);
+
+    // Wait while the SD card starts buffering data
+    if (*phaseChangeDelayNs > 0)
+    {
+        s2s_delay_ns(*phaseChangeDelayNs);
+        *phaseChangeDelayNs = 0;
+    }
+
+    for (int i = 0; i < totalSDSectors && !scsiDev.resetFlag; ++i)
+    {
+        // TODO if i %128 == 0, and not in an error state, then do another read.
+
+        if (useSlowDataCount)
         {
-            HAL_SD_Abort(&hsd);
+            scsiSetDataCount(SD_SECTOR_SIZE);
         }
-        else
+
+        // The SCSI fifo is a full sector so we only need to check once.
+        while (!scsiFifoReady() && !scsiDev.resetFlag)
+        {}
+
+        int byteCount = 0;
+        while(byteCount < SD_SECTOR_SIZE &&
+            likely(!scsiDev.resetFlag) &&
+            likely(scsiDev.phase == DATA_IN) &&
+            !__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXOVERR | SDIO_FLAG_DCRCFAIL | SDIO_FLAG_DTIMEOUT))
         {
-            // Wait for the SD transfer to complete before we disable IRQs.
-            // (Otherwise some cards will cause an error if we don't sent the
-            // stop transfer command via the DMA complete handler in time)
-            while (HAL_SD_GetState(&hsd) == HAL_SD_STATE_BUSY)
+            if(__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXFIFOHF))
             {
-                // Wait while keeping BSY.
+                // The SDIO fifo is 32 x 32bits. As we're using the "half full" flag we must
+                // always read half the FIFO.
+
+                for (int j = 0; j < 4; ++j)
+                {
+                    uint32_t data[4];
+                    data[0] = SDIO_ReadFIFO(hsd.Instance);
+                    data[1] = SDIO_ReadFIFO(hsd.Instance);
+                    data[2] = SDIO_ReadFIFO(hsd.Instance);
+                    data[3] = SDIO_ReadFIFO(hsd.Instance);
+
+                    *((volatile uint32_t*)SCSI_FIFO_DATA) = data[0];
+                    *((volatile uint32_t*)SCSI_FIFO_DATA) = data[1];
+                    *((volatile uint32_t*)SCSI_FIFO_DATA) = data[2];
+                    *((volatile uint32_t*)SCSI_FIFO_DATA) = data[3];
+
+                    /*
+                    scsiPhyTx32(data[0] & 0xFFFF, data[0] >> 16);
+                    scsiPhyTx32(data[1] & 0xFFFF, data[1] >> 16);
+                    scsiPhyTx32(data[2] & 0xFFFF, data[2] >> 16);
+                    scsiPhyTx32(data[3] & 0xFFFF, data[3] >> 16);
+                    */
+                }
+
+                byteCount += 64;
             }
         }
 
-        HAL_SD_CardStateTypeDef cardState = HAL_SD_GetCardState(&hsd);
-        while (cardState == HAL_SD_CARD_PROGRAMMING || cardState == HAL_SD_CARD_SENDING) 
+        int error = 0;
+        if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_DTIMEOUT))
         {
-            cardState = HAL_SD_GetCardState(&hsd);
-         }
+            __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_DTIMEOUT);
+            error = 1;
+        }
+        else if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_DCRCFAIL))
+        {
+            __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_DCRCFAIL);
+            error = 1;
+        }
+        else if (__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXOVERR))
+        {
+            __HAL_SD_CLEAR_FLAG(&hsd, SDIO_FLAG_RXOVERR);
+            error = 1;
+        }
 
-        // We've finished transferring the data to the FPGA, now wait until it's
-        // written to he SCSI bus.
-        while (!scsiPhyComplete() &&
-            likely(scsiDev.phase == DATA_IN) &&
+        if (error && scsiDev.phase == DATA_IN)
+        {
+            __HAL_SD_CLEAR_FLAG(&hsd, SDIO_STATIC_FLAGS);
+
+            scsiDiskReset();
+
+            scsiDev.status = CHECK_CONDITION;
+            scsiDev.target->sense.code = HARDWARE_ERROR;
+            scsiDev.target->sense.asc = LOGICAL_UNIT_COMMUNICATION_FAILURE;
+            scsiDev.phase = STATUS;
+        }
+
+        // We need the SCSI FIFO count to complete even after the SD read has failed
+        while (byteCount < SD_SECTOR_SIZE &&
             likely(!scsiDev.resetFlag))
         {
-            __disable_irq();
-            if (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
-            {
-                __WFI();
-            }
-            __enable_irq();
+            scsiPhyTx32(0, 0);
+            byteCount += 4;
         }
 
-        if (scsiDev.phase == DATA_IN)
+        while (useSlowDataCount && !scsiDev.resetFlag && !scsiPhyComplete())
         {
-            scsiDev.phase = STATUS;
         }
-        scsiDiskReset();
     }
-    else if (scsiDev.phase == DATA_OUT &&
-        transfer.currentBlock != transfer.blocks)
+
+//while(1) { s2s_ledOn(); s2s_delay_ms(1000); s2s_ledOff(); s2s_delay_ms(1000); }
+
+    /* Send stop transmission command in case of multiblock read */
+    if(totalSDSectors > 1U)
     {
-        scsiEnterPhase(DATA_OUT);
+        SDMMC_CmdStopTransfer(hsd.Instance);
+    }
 
-        const int sdPerScsi = SDSectorsPerSCSISector(bytesPerSector);
-        int totalSDSectors = transfer.blocks * sdPerScsi;
-        uint32_t sdLBA =
-            SCSISector2SD(
-                scsiDev.target->cfg->sdSectorStart,
-                bytesPerSector,
-                transfer.lba);
-        int i = 0;
-        int clearBSY = 0;
-        int disconnected = 0;
+    // Read remaining data
+    uint32_t extraCount = SD_DATATIMEOUT;
+    while ((__HAL_SD_GET_FLAG(&hsd, SDIO_FLAG_RXDAVL)) && (extraCount > 0))
+    {
+        SDIO_ReadFIFO(hsd.Instance);
+        extraCount--;
+    }
 
-        int parityError = 0;
-        int enableParity = scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY;
+    __HAL_SD_CLEAR_FLAG(&hsd, SDIO_STATIC_DATA_FLAGS);
+    hsd.State = HAL_SD_STATE_READY;
+    
+    sdCompleteTransfer(); // Probably overkill
+}
 
-        uint32_t maxSectors = sizeof(scsiDev.data) / SD_SECTOR_SIZE;
+static void diskDataIn()
+{
+    uint32_t bytesPerSector = scsiDev.target->liveCfg.bytesPerSector;
+
+    // Take responsibility for waiting for the phase delays
+    uint32_t phaseChangeDelayNs = scsiEnterPhaseImmediate(DATA_IN);
+
+    int totalSDSectors =
+        transfer.blocks * SDSectorsPerSCSISector(bytesPerSector);
+    uint32_t sdLBA =
+        SCSISector2SD(
+            scsiDev.target->cfg->sdSectorStart,
+            bytesPerSector,
+            transfer.lba);
+
+    // It's highly unlikely that someone is going to use huge transfers
+    // per scsi command, but if they do it'll be slower than usual.
+    uint32_t totalScsiBytes = transfer.blocks * bytesPerSector;
+    int useSlowDataCount = totalScsiBytes >= SCSI_XFER_MAX;
+    if (!useSlowDataCount)
+    {
+        scsiSetDataCount(totalScsiBytes);
+    }
 
-        static_assert(SCSI_XFER_MAX >= sizeof(scsiDev.data), "Assumes SCSI_XFER_MAX >= sizeof(scsiDev.data)");
+#ifdef STM32F4xx
+    // Direct mode requires hardware flow control to be working on the SD peripheral
+    if (bytesPerSector == SD_SECTOR_SIZE && totalSDSectors < 128)
+    {
+        diskDataInDirect(totalSDSectors, sdLBA, useSlowDataCount, &phaseChangeDelayNs);
+    }
+    else
+#endif 
+    {
+        diskDataInBuffered(totalSDSectors, sdLBA, useSlowDataCount, &phaseChangeDelayNs);
+    }
+
+    if (phaseChangeDelayNs > 0 && !scsiDev.resetFlag) // zero bytes ?
+    {
+        s2s_delay_ns(phaseChangeDelayNs);
+        phaseChangeDelayNs = 0;
+    }
 
-        // Start reading and filling fifos as soon as possible.
-        // It's highly unlikely that someone is going to use huge transfers
-        // per scsi command, but if they do it'll be slower than usual.
-        // Note: Happens in Macintosh FWB HDD Toolkit benchmarks which default
-        // to 768kb
-        uint32_t totalTransferBytes = transfer.blocks * bytesPerSector;
-        int useSlowDataCount = totalTransferBytes >= SCSI_XFER_MAX;
-        if (!useSlowDataCount)
+    if (scsiDev.resetFlag)
+    {
+        HAL_SD_Abort(&hsd);
+    }
+    else
+    {
+        // Wait for the SD transfer to complete before we disable IRQs.
+        // (Otherwise some cards will cause an error if we don't send the
+        // stop transfer command via the DMA complete handler in time)
+        while (HAL_SD_GetState(&hsd) == HAL_SD_STATE_BUSY)
         {
-            DWT->CYCCNT = 0; // Start counting cycles
-            scsiSetDataCount(totalTransferBytes);
+            // Wait while keeping BSY.
         }
+    }
 
-        int lastWriteSize = 0;
+    HAL_SD_CardStateTypeDef cardState = HAL_SD_GetCardState(&hsd);
+    while (cardState == HAL_SD_CARD_PROGRAMMING || cardState == HAL_SD_CARD_SENDING) 
+    {
+        cardState = HAL_SD_GetCardState(&hsd);
+    }
 
-        while ((i < totalSDSectors) &&
-            likely(scsiDev.phase == DATA_OUT) &&
-            likely(!scsiDev.resetFlag))
-            // KEEP GOING to ensure FIFOs are in a good state.
-            // likely(!parityError || !enableParity))
+    // We've finished transferring the data to the FPGA, now wait until it's
+    // written to the SCSI bus.
+    while (!scsiPhyComplete() &&
+        likely(scsiDev.phase == DATA_IN) &&
+        likely(!scsiDev.resetFlag))
+    {
+        __disable_irq();
+        if (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
         {
-            if (bytesPerSector == SD_SECTOR_SIZE)
-            {
-                uint32_t maxXferSectors = SCSI_XFER_MAX / SD_SECTOR_SIZE;
-                uint32_t rem = totalSDSectors - i;
-                uint32_t sectors = rem < maxXferSectors ? rem : maxXferSectors;
+            __WFI();
+        }
+        __enable_irq();
+    }
 
-                uint32_t totalBytes = sectors * SD_SECTOR_SIZE;
+    if (scsiDev.phase == DATA_IN)
+    {
+        scsiDev.phase = STATUS;
+    }
+    scsiDiskReset();
+}
 
-                if (useSlowDataCount)
-                {
-                    scsiSetDataCount(totalBytes);
-                }
+void diskDataOut_512(int totalSDSectors, uint32_t sdLBA, int useSlowDataCount, int* clearBSY, int* parityError)
+{
+    int i = 0;
+    int disconnected = 0;
 
-                lastWriteSize = sectors;
-                HAL_SD_WriteBlocks_DMA(&hsd, i + sdLBA, sectors);
-                int j = 0;
-                int prep = 0;
-                int sdActive = 0;
-                uint32_t dmaFinishTime = 0;
-                while (j < sectors && !scsiDev.resetFlag)
-                {
-                    if (sdActive &&
-                        HAL_SD_GetState(&hsd) != HAL_SD_STATE_BUSY &&
-                        !sdIsBusy())
-                    {
-                        j += sdActive;
-                        sdActive = 0;
-                    }
-                    if (!sdActive && ((prep - j) > 0))
-                    {
-                        // Start an SD transfer if we have space.
-                        HAL_SD_WriteBlocks_Data(&hsd, &scsiDev.data[SD_SECTOR_SIZE * (j % maxSectors)]);
-
-                        sdActive = 1;
-                    }
-
-                    if (((prep - j) < maxSectors) &&
-                        (prep < sectors) &&
-                        scsiFifoReady())
-                    {
-                        scsiReadPIO(
-                            &scsiDev.data[(prep % maxSectors) * SD_SECTOR_SIZE],
-                            SD_SECTOR_SIZE,
-                            &parityError);
-                        prep++;
-                        if (prep == sectors)
-                        {
-                            dmaFinishTime = s2s_getTime_ms();
-                        }
-                    }
-                
-                    if (i + prep >= totalSDSectors &&
-                        !disconnected &&
-                        (!parityError || !enableParity) &&
-                        s2s_elapsedTime_ms(dmaFinishTime) >= 180)
-                    {
-                        // We're transferring over the SCSI bus faster than the SD card
-                        // can write.  All data is buffered, and we're just waiting for
-                        // the SD card to complete. The host won't let us disconnect.
-                        // Some drivers set a 250ms timeout on transfers to complete.
-                        // SD card writes are supposed to complete
-                        // within 200ms, but sometimes they don't.
-                        // Just pretend we're finished.
-                        process_Status();
-                        clearBSY = process_MessageIn(0); // Will go to BUS_FREE state but keep BSY asserted.
-                        disconnected = 1;
-                    }
-                }
+    int enableParity = scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY;
 
-                if (scsiDev.resetFlag)
-                {
-                    HAL_SD_Abort(&hsd);
-                }
-                else
-                {
-                    while (HAL_SD_GetState(&hsd) == HAL_SD_STATE_BUSY) {} // Waits for DMA to complete
-                    if (lastWriteSize > 1)
-                    {
-                        SDMMC_CmdStopTransfer(hsd.Instance);
-                    }
-                }
+    uint32_t maxSectors = sizeof(scsiDev.data) / SD_SECTOR_SIZE;
 
-                while (sdIsBusy() &&
-                    s2s_elapsedTime_ms(dmaFinishTime) < 180)
-                {
-                    // Wait while the SD card is writing buffer to flash
-                    // The card may remain in the RECEIVING state (even though it's programming) if
-                    // it has buffer space to receive more data available.
-                }
+    int lastWriteSize = 0;
 
-                if (!disconnected && 
-                    i + sectors >= totalSDSectors &&
-                    (!parityError || !enableParity))
-                {
-                    // We're transferring over the SCSI bus faster than the SD card
-                    // can write.  All data is buffered, and we're just waiting for
-                    // the SD card to complete. The host won't let us disconnect.
-                    // Some drivers set a 250ms timeout on transfers to complete.
-                    // SD card writes are supposed to complete
-                    // within 200ms, but sometimes they don't.
-                    // Just pretend we're finished.
-                    process_Status();
-                    clearBSY = process_MessageIn(0); // Will go to BUS_FREE state but keep BSY asserted.
-                }
+    while ((i < totalSDSectors) &&
+        likely(scsiDev.phase == DATA_OUT) &&
+        likely(!scsiDev.resetFlag))
+        // KEEP GOING to ensure FIFOs are in a good state.
+        // likely(!parityError || !enableParity))
+    {
 
-                // Wait while the SD card is writing buffer to flash
-                // The card may remain in the RECEIVING state (even though it's programming) if
-                // it has buffer space to receive more data available.
-                while (sdIsBusy()) {}
-                HAL_SD_CardStateTypeDef cardState = HAL_SD_GetCardState(&hsd);
-                while (cardState == HAL_SD_CARD_PROGRAMMING || cardState == HAL_SD_CARD_RECEIVING) 
-                {
-                    // Wait while the SD card is writing buffer to flash
-                    // The card may remain in the RECEIVING state (even though it's programming) if
-                    // it has buffer space to receive more data available.
-                    cardState = HAL_SD_GetCardState(&hsd);
-                }
+        uint32_t maxXferSectors = SCSI_XFER_MAX / SD_SECTOR_SIZE;
+        uint32_t rem = totalSDSectors - i;
+        uint32_t sectors = rem < maxXferSectors ? rem : maxXferSectors;
+
+        uint32_t totalBytes = sectors * SD_SECTOR_SIZE;
+
+        if (useSlowDataCount)
+        {
+            scsiSetDataCount(totalBytes);
+        }
 
-                i += sectors;
+        lastWriteSize = sectors;
+        HAL_SD_WriteBlocks_DMA(&hsd, i + sdLBA, sectors);
+        int j = 0;
+        int prep = 0;
+        int sdActive = 0;
+        uint32_t dmaFinishTime = 0;
+        while (j < sectors && !scsiDev.resetFlag)
+        {
+            if (sdActive &&
+                HAL_SD_GetState(&hsd) != HAL_SD_STATE_BUSY &&
+                !sdIsBusy())
+            {
+                j += sdActive;
+                sdActive = 0;
             }
-            else
+            if (!sdActive && ((prep - j) > 0))
             {
-                // Well, until we have some proper non-blocking SD code, we must
-                // do this in a half-duplex fashion. We need to write as much as
-                // possible in each SD card transaction.
-                // use sg_dd from sg_utils3 tools to test.
-
-                uint32_t rem = totalSDSectors - i;
-                uint32_t sectors;
-                if (rem <= maxSectors)
-                {
-                    sectors = rem;
-                }
-                else
-                {
-                    sectors = maxSectors;
-                    while (sectors % sdPerScsi) sectors--;
-                }
-                
+                // Start an SD transfer if we have space.
+                HAL_SD_WriteBlocks_Data(&hsd, &scsiDev.data[SD_SECTOR_SIZE * (j % maxSectors)]);
 
-                if (useSlowDataCount)
-                {
-                    scsiSetDataCount((sectors / sdPerScsi) * bytesPerSector);
-                }
+                sdActive = 1;
+            }
 
-                for (int scsiSector = i; scsiSector < i + sectors; ++scsiSector)
-                {
-                    int dmaBytes = SD_SECTOR_SIZE;
-                    if ((scsiSector % sdPerScsi) == (sdPerScsi - 1))
-                    {
-                        dmaBytes = bytesPerSector % SD_SECTOR_SIZE;
-                        if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE;
-                    }
-
-                    scsiReadPIO(&scsiDev.data[SD_SECTOR_SIZE * (scsiSector - i)], dmaBytes, &parityError);
-                }
-                if (!parityError || !enableParity)
+            if (((prep - j) < maxSectors) &&
+                (prep < sectors) &&
+                scsiFifoReady())
+            {
+                scsiReadPIO(
+                    &scsiDev.data[(prep % maxSectors) * SD_SECTOR_SIZE],
+                    SD_SECTOR_SIZE,
+                    parityError);
+                prep++;
+                if (prep == sectors)
                 {
-                    BSP_SD_WriteBlocks_DMA(&scsiDev.data[0], i + sdLBA, sectors);
+                    dmaFinishTime = s2s_getTime_ms();
                 }
-                i += sectors;
+            }
+        
+            if (i + prep >= totalSDSectors &&
+                !disconnected &&
+                (!(*parityError) || !enableParity) &&
+                s2s_elapsedTime_ms(dmaFinishTime) >= 180)
+            {
+                // We're transferring over the SCSI bus faster than the SD card
+                // can write.  All data is buffered, and we're just waiting for
+                // the SD card to complete. The host won't let us disconnect.
+                // Some drivers set a 250ms timeout on transfers to complete.
+                // SD card writes are supposed to complete
+                // within 200ms, but sometimes they don't.
+                // Just pretend we're finished.
+                process_Status();
+                *clearBSY = process_MessageIn(0); // Will go to BUS_FREE state but keep BSY asserted.
+                disconnected = 1;
             }
         }
 
-        // Should already be complete here as we've ready the FIFOs
-        // by now. Check anyway.
-        __disable_irq();
-        while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
+        if (scsiDev.resetFlag)
         {
-            __WFI();
+            HAL_SD_Abort(&hsd);
         }
-        __enable_irq();
+        else
+        {
+            while (HAL_SD_GetState(&hsd) == HAL_SD_STATE_BUSY) {} // Waits for DMA to complete
+            if (lastWriteSize > 1)
+            {
+                SDMMC_CmdStopTransfer(hsd.Instance);
+            }
+        }
+
+        while (sdIsBusy() &&
+            s2s_elapsedTime_ms(dmaFinishTime) < 180)
+        {
+            // Wait while the SD card is writing buffer to flash
+            // The card may remain in the RECEIVING state (even though it's programming) if
+            // it has buffer space to receive more data available.
+        }
+
+        if (!disconnected && 
+            i + sectors >= totalSDSectors &&
+            (!parityError || !enableParity))
+        {
+            // We're transferring over the SCSI bus faster than the SD card
+            // can write.  All data is buffered, and we're just waiting for
+            // the SD card to complete. The host won't let us disconnect.
+            // Some drivers set a 250ms timeout on transfers to complete.
+            // SD card writes are supposed to complete
+            // within 200ms, but sometimes they don't.
+            // Just pretend we're finished.
+            process_Status();
+            *clearBSY = process_MessageIn(0); // Will go to BUS_FREE state but keep BSY asserted.
+        }
+
+        // Wait while the SD card is writing buffer to flash
+        // The card may remain in the RECEIVING state (even though it's programming) if
+        // it has buffer space to receive more data available.
+        while (sdIsBusy()) {}
+        HAL_SD_CardStateTypeDef cardState = HAL_SD_GetCardState(&hsd);
+        while (cardState == HAL_SD_CARD_PROGRAMMING || cardState == HAL_SD_CARD_RECEIVING) 
+        {
+            // Wait while the SD card is writing buffer to flash
+            // The card may remain in the RECEIVING state (even though it's programming) if
+            // it has buffer space to receive more data available.
+            cardState = HAL_SD_GetCardState(&hsd);
+        }
+
+        i += sectors;
+   
+    }
+}
+/**
+ * Receives DATA_OUT data from the SCSI host and writes it to the SD card in
+ * half-duplex fashion: a batch is first read from the SCSI bus into
+ * scsiDev.data, then written to the card, and only then is the next batch
+ * read. diskDataOut() calls this when the configured bytesPerSector differs
+ * from SD_SECTOR_SIZE.
+ *
+ * sdPerScsi         Number of SD sectors occupied by one SCSI sector.
+ * totalSDSectors    Total number of SD sectors to write for this transfer.
+ * sdLBA             SD sector number at which the first sector is written.
+ * useSlowDataCount  When non-zero, scsiSetDataCount() is called for every
+ *                   batch inside the loop.
+ * parityError       Passed through to scsiReadPIO() and read back afterwards
+ *                   (presumably set non-zero on a SCSI parity error - confirm
+ *                   against scsiReadPIO). A batch is not written to the card
+ *                   when it is non-zero and parity checking is enabled.
+ *
+ * The loop stops early if the SCSI phase leaves DATA_OUT or a reset is
+ * flagged.
+ */
+void diskDataOut_variableSectorSize(int sdPerScsi, int totalSDSectors, uint32_t sdLBA, int useSlowDataCount, int* parityError)
+{
+    uint32_t bytesPerSector = scsiDev.target->liveCfg.bytesPerSector;
 
-        if (clearBSY)
+    int i = 0; // SD sectors handled so far
+
+    int enableParity = scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY;
+
+    uint32_t maxSectors = sizeof(scsiDev.data) / SD_SECTOR_SIZE; // capacity of scsiDev.data, in SD sectors
+
+    while ((i < totalSDSectors) &&
+        likely(scsiDev.phase == DATA_OUT) &&
+        likely(!scsiDev.resetFlag))
+        // KEEP GOING to ensure FIFOs are in a good state.
+        // likely(!parityError || !enableParity))
+    {
+        // Well, until we have some proper non-blocking SD code, we must
+        // do this in a half-duplex fashion. We need to write as much as
+        // possible in each SD card transaction.
+        // use sg_dd from sg_utils3 tools to test.
+
+        uint32_t rem = totalSDSectors - i;
+        uint32_t sectors;
+        if (rem <= maxSectors)
         {
-            enter_BusFree();
+            sectors = rem;
+        }
+        else
+        {
+            sectors = maxSectors;
+            while (sectors % sdPerScsi) sectors--; // round down to a multiple of sdPerScsi
+        }
+        
+
+        if (useSlowDataCount)
+        {
+            scsiSetDataCount((sectors / sdPerScsi) * bytesPerSector);
         }
 
-        if (scsiDev.phase == DATA_OUT)
+        for (int scsiSector = i; scsiSector < i + sectors; ++scsiSector)
         {
-            if (parityError &&
-                (scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY))
+            int dmaBytes = SD_SECTOR_SIZE;
+            if ((scsiSector % sdPerScsi) == (sdPerScsi - 1)) // last SD sector of a SCSI sector: may be only partly filled
             {
-                scsiDev.target->sense.code = ABORTED_COMMAND;
-                scsiDev.target->sense.asc = SCSI_PARITY_ERROR;
-                scsiDev.status = CHECK_CONDITION;;
+                dmaBytes = bytesPerSector % SD_SECTOR_SIZE;
+                if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE; // an exact multiple fills the whole SD sector
             }
-            scsiDev.phase = STATUS;
+
+            scsiReadPIO(&scsiDev.data[SD_SECTOR_SIZE * (scsiSector - i)], dmaBytes, parityError); // each SD sector gets its own SD_SECTOR_SIZE slot in the buffer
         }
-        scsiDiskReset();
+        if (!(*parityError) || !enableParity) // skip the SD write when parity is enforced and an error was flagged
+        {
+            BSP_SD_WriteBlocks_DMA(&scsiDev.data[0], i + sdLBA, sectors);
+        }
+        i += sectors; // advance even if the write was skipped
     }
 }
 
+/**
+ * Runs the DATA_OUT phase of a disk write: accepts transfer.blocks sectors
+ * from the SCSI host and stores them on the SD card starting at transfer.lba.
+ *
+ * The data movement itself is delegated to diskDataOut_512() when the
+ * configured sector size equals SD_SECTOR_SIZE and to
+ * diskDataOut_variableSectorSize() otherwise. Afterwards this function waits
+ * for the SCSI PHY to complete, enters bus free if the helper requested it,
+ * and - if the phase is still DATA_OUT - reports any parity error (when parity
+ * checking is enabled) as CHECK_CONDITION before switching to STATUS.
+ * scsiDiskReset() is always called last.
+ */
+void diskDataOut()
+{
+    const uint32_t sectorBytes = scsiDev.target->liveCfg.bytesPerSector;
+
+    scsiEnterPhase(DATA_OUT);
+
+    const int sdPerScsi = SDSectorsPerSCSISector(sectorBytes);
+    int sdSectorCount = transfer.blocks * sdPerScsi;
+    uint32_t firstSdSector = SCSISector2SD(
+        scsiDev.target->cfg->sdSectorStart,
+        sectorBytes,
+        transfer.lba);
+
+    int releaseBSY = 0;
+    int parityFailed = 0;
+
+    static_assert(SCSI_XFER_MAX >= sizeof(scsiDev.data), "Assumes SCSI_XFER_MAX >= sizeof(scsiDev.data)");
+
+    // Program the data count for the whole transfer up front whenever it
+    // fits, so the FIFOs start filling as early as possible. Very large
+    // transfers per SCSI command are rare (the Macintosh FWB HDD Toolkit
+    // benchmarks default to 768kb) and take the slower per-chunk path.
+    const uint32_t wholeTransferBytes = transfer.blocks * sectorBytes;
+    const int useSlowDataCount = wholeTransferBytes >= SCSI_XFER_MAX;
+    if (useSlowDataCount == 0)
+    {
+        DWT->CYCCNT = 0; // Start counting cycles
+        scsiSetDataCount(wholeTransferBytes);
+    }
+
+    if (sectorBytes != SD_SECTOR_SIZE)
+    {
+        diskDataOut_variableSectorSize(sdPerScsi, sdSectorCount, firstSdSector, useSlowDataCount, &parityFailed);
+    }
+    else
+    {
+        diskDataOut_512(sdSectorCount, firstSdSector, useSlowDataCount, &releaseBSY, &parityFailed);
+    }
+
+    // The FIFOs have been read by now, so the PHY should already be done.
+    // Check anyway, sleeping between interrupts.
+    __disable_irq();
+    while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
+    {
+        __WFI();
+    }
+    __enable_irq();
+
+    if (releaseBSY)
+    {
+        enter_BusFree();
+    }
+
+    if (scsiDev.phase == DATA_OUT)
+    {
+        if (parityFailed)
+        {
+            if (scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY)
+            {
+                scsiDev.target->sense.code = ABORTED_COMMAND;
+                scsiDev.target->sense.asc = SCSI_PARITY_ERROR;
+                scsiDev.status = CHECK_CONDITION;
+            }
+        }
+        scsiDev.phase = STATUS;
+    }
+
+    scsiDiskReset();
+}
+
+
+/**
+ * Polls for pending disk data transfers. Does nothing once
+ * transfer.currentBlock has reached transfer.blocks; otherwise runs
+ * diskDataIn() in the DATA_IN phase or diskDataOut() in the DATA_OUT phase.
+ */
+void scsiDiskPoll()
+{
+    // Nothing left to move for the current command.
+    if (transfer.currentBlock == transfer.blocks)
+    {
+        return;
+    }
+
+    if (scsiDev.phase == DATA_IN)
+    {
+        diskDataIn();
+    }
+    else if (scsiDev.phase == DATA_OUT)
+    {
+        diskDataOut();
+    }
+}
+
+
 void scsiDiskReset()
 {
     scsiDev.dataPtr = 0;

+ 4 - 4
lib/SCSI2SD/src/firmware/scsi.c

@@ -56,21 +56,21 @@ void enter_BusFree()
 		s2s_delay_us(2);
 	}
 
-//#if 0
+#if 0
 	if (scsiDev.status != GOOD)// && isDebugEnabled())
 	{
 		// We want to capture debug information for failure cases.
 		s2s_delay_ms(80);
 	}
-//#endif
+#endif
 
 
 	scsiEnterBusFree();
 
 	// Wait for the initiator to cease driving signals
 	// Bus settle delay + bus clear delay = 1200ns
-	s2s_delay_us(2);
-
+    // Just waiting the clear delay is sufficient.
+	s2s_delay_ns(800);
 
 	s2s_ledOff();
 	scsiDev.phase = BUS_FREE;

+ 11 - 6
lib/SCSI2SD/src/firmware/scsiPhy.c

@@ -503,7 +503,7 @@ static inline void busSettleDelay(void)
 {
     // Data Release time (switching IO) = 400ns
     // + Bus Settle time (switching phase) = 400ns.
-    s2s_delay_us(1); // Close enough.
+    s2s_delay_ns(800);
 }
 
 void scsiEnterBusFree()
@@ -541,7 +541,7 @@ void scsiEnterPhase(int newPhase)
     uint32_t delay = scsiEnterPhaseImmediate(newPhase);
     if (delay > 0)
     {
-        s2s_delay_us(delay);
+        s2s_delay_ns(delay);
     }
 }
 
@@ -631,16 +631,21 @@ uint32_t scsiEnterPhaseImmediate(int newPhase)
                 asyncTiming[3]);
         }
 
-        uint32_t delayUs = 0;
+        uint32_t delayNs = 0;
         if (newPhase >= 0)
         {
             *SCSI_CTRL_PHASE = newPhase;
-            delayUs += 1; // busSettleDelay
+            delayNs += 400; // busSettleDelay
+
+            if ((oldPhase & __scsiphase_io) != (newPhase & __scsiphase_io))
+            {
+                delayNs += 400; // Data release delay
+            }
 
             if (scsiDev.compatMode < COMPAT_SCSI2)
             {
                 // EMU EMAX needs 100uS ! 10uS is not enough.
-                delayUs += 100;
+                delayNs += 100000;
             }
         }
         else
@@ -648,7 +653,7 @@ uint32_t scsiEnterPhaseImmediate(int newPhase)
             *SCSI_CTRL_PHASE = 0;
         }
 
-        return delayUs;
+        return delayNs;
     }
 
     return 0; // No change

+ 72 - 0
lib/SCSI2SD/src/firmware/sd.c

@@ -83,6 +83,78 @@ void sdReadDMA(uint32_t lba, uint32_t sectors, uint8_t* outputBuffer)
 	}
 }
 
+/**
+ * Starts a block read of `sectors` 512-byte sectors beginning at `lba`,
+ * without setting up DMA: the SDIO data path is configured and the read
+ * command is issued, but no data is taken out of the FIFO here.
+ *
+ * On success sdCmdActive is set to 1. If any SD command fails, the SDIO
+ * static flags are cleared, scsiDiskReset() is called and the SCSI state is
+ * set to CHECK_CONDITION / HARDWARE_ERROR /
+ * LOGICAL_UNIT_COMMUNICATION_FAILURE with the phase moved to STATUS.
+ */
+void sdReadPIO(uint32_t lba, uint32_t sectors)
+{
+	uint32_t cmdResult = HAL_SD_ERROR_NONE;
+
+	hsd.ErrorCode = HAL_SD_ERROR_NONE;
+	hsd.State = HAL_SD_STATE_BUSY;
+
+	/* Initialize data control register */
+	hsd.Instance->DCTRL = 0U;
+
+	// The IRQ handler clears flags which we need to read the fifo data,
+	// so keep these interrupts masked.
+	uint32_t maskedIrqs = SDIO_IT_DCRCFAIL | SDIO_IT_DTIMEOUT | SDIO_IT_RXOVERR | SDIO_IT_DATAEND | SDIO_FLAG_RXFIFOHF;
+#if defined(SDIO_STA_STBITERR)
+	maskedIrqs |= SDIO_IT_STBITERR;
+#endif
+	__HAL_SD_DISABLE_IT(&hsd, maskedIrqs);
+
+	if (hsd.SdCard.CardType != CARD_SDHC_SDXC)
+	{
+		// These cards get the address scaled by 512 and an explicit
+		// block length command.
+		lba *= 512U;
+		cmdResult = SDMMC_CmdBlockLength(hsd.Instance, 512u);
+	}
+
+	if (cmdResult == HAL_SD_ERROR_NONE)
+	{
+		SDIO_DataInitTypeDef dataCfg;
+		dataCfg.DataTimeOut   = SDMMC_DATATIMEOUT;
+		dataCfg.DataLength    = sectors * 512u;
+		dataCfg.DataBlockSize = SDIO_DATABLOCK_SIZE_512B;
+		dataCfg.TransferDir   = SDIO_TRANSFER_DIR_TO_SDIO;
+		dataCfg.TransferMode  = SDIO_TRANSFER_MODE_BLOCK;
+		dataCfg.DPSM          = SDIO_DPSM_ENABLE;
+		SDIO_ConfigData(hsd.Instance, &dataCfg);
+
+		if (sectors <= 1U)
+		{
+			hsd.Context = SD_CONTEXT_READ_SINGLE_BLOCK;
+			cmdResult = SDMMC_CmdReadSingleBlock(hsd.Instance, lba);
+		}
+		else
+		{
+			hsd.Context = SD_CONTEXT_READ_MULTIPLE_BLOCK;
+			cmdResult = SDMMC_CmdReadMultiBlock(hsd.Instance, lba);
+		}
+	}
+
+	if (cmdResult != HAL_SD_ERROR_NONE)
+	{
+		// Shared failure path for both the block-length and read commands.
+		__HAL_SD_CLEAR_FLAG(&hsd, SDIO_STATIC_FLAGS);
+		scsiDiskReset();
+
+		scsiDev.status = CHECK_CONDITION;
+		scsiDev.target->sense.code = HARDWARE_ERROR;
+		scsiDev.target->sense.asc = LOGICAL_UNIT_COMMUNICATION_FAILURE;
+		scsiDev.phase = STATUS;
+		return;
+	}
+
+	sdCmdActive = 1;
+}
+
+
 void sdCompleteTransfer()
 {
 	if (sdCmdActive)

+ 3 - 0
lib/SCSI2SD/src/firmware/sd.h

@@ -36,6 +36,9 @@ int sdInit(void);
 
 void sdReadDMA(uint32_t lba, uint32_t sectors, uint8_t* outputBuffer);
 int sdReadDMAPoll(uint32_t remainingSectors);
+
+void sdReadPIO(uint32_t lba, uint32_t sectors);
+
 void sdCompleteTransfer();
 void sdKeepAlive();
 

+ 1 - 0
lib/SCSI2SD/src/firmware/time.h

@@ -33,6 +33,7 @@ uint32_t s2s_elapsedTime_ms(uint32_t since);
 
 #define s2s_delay_ms(delay) s2s_delay_clocks((delay) * (s2s_cpu_freq / 1000))
 #define s2s_delay_us(delay) s2s_delay_clocks((delay) * (s2s_cpu_freq / 1000000))
+#define s2s_delay_ns(delay) s2s_delay_clocks(((delay) * ((s2s_cpu_freq * 64LL + 500000000) / 1000000000)) / 64)
 void s2s_delay_clocks(uint32_t delay);
 
 #endif