Pārlūkot izejas kodu

Slight improvements to data throughput, which may assist SCSI hosts with short timeouts

Michael McMaster 6 gadi atpakaļ
vecāks
revīzija
a1cd1e1c91

+ 4 - 0
lib/SCSI2SD/CHANGELOG

@@ -1,3 +1,7 @@
+20191009		6.2.7
+	- Slight improvements to data throughput, which may assist SCSI hosts with
+	short timeouts.
+
 20190529		6.2.5
 	- Add scsi mode page 0 support
 	- Fix SD card hotswap bug when the SCSI host is constantly polling

+ 1 - 0
lib/SCSI2SD/STM32CubeMX/SCSI2SD-V6/Inc/stm32f2xx_it.h

@@ -46,6 +46,7 @@
 /* Exported functions ------------------------------------------------------- */
 
 void SysTick_Handler(void);
+void EXTI3_IRQHandler(void);
 void EXTI4_IRQHandler(void);
 void SDIO_IRQHandler(void);
 void DMA2_Stream3_IRQHandler(void);

+ 7 - 1
lib/SCSI2SD/STM32CubeMX/SCSI2SD-V6/Src/gpio.c

@@ -69,11 +69,17 @@ void MX_GPIO_Init(void)
   __GPIOD_CLK_ENABLE();
 
   /*Configure GPIO pins : PEPin PEPin PEPin PEPin */
-  GPIO_InitStruct.Pin = FPGA_GPIO2_Pin|FPGA_GPIO3_Pin|UNUSED_PE5_Pin|UNUSED_PE6_Pin;
+  GPIO_InitStruct.Pin = FPGA_GPIO2_Pin|UNUSED_PE5_Pin|UNUSED_PE6_Pin;
   GPIO_InitStruct.Mode = GPIO_MODE_INPUT;
   GPIO_InitStruct.Pull = GPIO_PULLDOWN;
   HAL_GPIO_Init(GPIOE, &GPIO_InitStruct);
 
+  /*Configure GPIO pin : PE3 */
+  GPIO_InitStruct.Pin = FPGA_GPIO3_Pin;
+  GPIO_InitStruct.Mode = GPIO_MODE_INPUT;
+  GPIO_InitStruct.Pull = GPIO_NOPULL;
+  HAL_GPIO_Init(GPIOE, &GPIO_InitStruct);
+
   /*Configure GPIO pin : PE4 */
   GPIO_InitStruct.Pin = GPIO_PIN_4;
   GPIO_InitStruct.Mode = GPIO_MODE_IT_RISING;

+ 1 - 0
lib/SCSI2SD/STM32CubeMX/SCSI2SD-V6/Src/stm32f2xx_it.c

@@ -72,6 +72,7 @@ void SysTick_Handler(void)
 /* please refer to the startup file (startup_stm32f2xx.s).                    */
 /******************************************************************************/
 
+
 /**
 * @brief This function handles EXTI line4 interrupt.
 */

BIN
lib/SCSI2SD/rtl/fpga_bitmap.o


+ 1 - 1
lib/SCSI2SD/src/firmware/config.c

@@ -37,7 +37,7 @@
 
 #include <string.h>
 
-static const uint16_t FIRMWARE_VERSION = 0x0625;
+static const uint16_t FIRMWARE_VERSION = 0x0627;
 
 // 1 flash row
 static const uint8_t DEFAULT_CONFIG[128] =

+ 105 - 127
lib/SCSI2SD/src/firmware/disk.c

@@ -18,6 +18,8 @@
 
 #include "stm32f2xx.h"
 
+#include <assert.h>
+
 // For SD write direct routines
 #include "sdio.h"
 #include "bsp_driver_sd.h"
@@ -561,14 +563,17 @@ void scsiDiskPoll()
 		int scsiActive __attribute__((unused)) = 0; // unused if DMA disabled
 		int sdActive = 0;
 
-		uint32_t partialScsiChunk = 0;
-
-		// Start reading from the SD card FIRST, because we change state and
-		// wait for SCSI signals
-		int dataInStarted = 0;
+		// It's highly unlikely that someone is going to use huge transfers
+		// per scsi command, but if they do it'll be slower than usual.
+		uint32_t totalScsiBytes = transfer.blocks * bytesPerSector;
+		int useSlowDataCount = totalScsiBytes >= SCSI_XFER_MAX;
+		if (!useSlowDataCount)
+		{
+			scsiSetDataCount(totalScsiBytes);
+		}
 
 		while ((i < totalSDSectors) &&
-			(!dataInStarted || likely(scsiDev.phase == DATA_IN)) &&
+			likely(scsiDev.phase == DATA_IN) &&
 			likely(!scsiDev.resetFlag))
 		{
 			int completedDmaSectors;
@@ -588,12 +593,13 @@ void scsiDiskPoll()
 
 			if (!sdActive &&
 				(prep - i < buffers) &&
-				(prep < totalSDSectors))
+				(prep < totalSDSectors) &&
+				((totalSDSectors - prep) >= sdPerScsi) &&
+				(likely(!useSlowDataCount) || scsiPhyComplete()))
 			{
 				// Start an SD transfer if we have space.
 				uint32_t startBuffer = prep % buffers;
 				uint32_t sectors = totalSDSectors - prep;
-
 				uint32_t freeBuffers = buffers - (prep - i);
 
 				uint32_t contiguousBuffers = buffers - startBuffer;
@@ -603,6 +609,12 @@ void scsiDiskPoll()
 
 				if (sectors > 128) sectors = 128; // 65536 DMA limit !!
 
+				// Round-down when we have odd sector sizes.
+				if (sdPerScsi != 1)
+				{
+					sectors = (sectors / sdPerScsi) * sdPerScsi;
+				}
+
 				for (int dodgy = 0; dodgy < sectors; dodgy++)
 				{
 					scsiDev.data[SD_SECTOR_SIZE * (startBuffer + dodgy) + 510] = 0xAA;
@@ -613,6 +625,11 @@ void scsiDiskPoll()
 
 				sdActive = sectors;
 
+				if (useSlowDataCount)
+				{
+					scsiSetDataCount((sectors / sdPerScsi) * bytesPerSector);
+				}
+
 				// Wait now that the SD card is busy
 				// Chances are we've probably already waited sufficient time,
 				// but it's hard to measure microseconds cheaply. So just wait
@@ -624,26 +641,6 @@ void scsiDiskPoll()
 				}
 			}
 
-#ifdef SCSI_FSMC_DMA
-			#error this code not updated for 256 max bytes in scsi fifo
-			if (scsiActive && scsiPhyComplete() && scsiWriteDMAPoll())
-			{
-				scsiActive = 0;
-				i++;
-				scsiPhyFifoFlip();
-			}
-			if (!scsiActive && ((prep - i) > 0))
-			{
-				int dmaBytes = SD_SECTOR_SIZE;
-				if ((i % sdPerScsi) == (sdPerScsi - 1))
-				{
-					dmaBytes = bytesPerSector % SD_SECTOR_SIZE;
-					if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE;
-				}
-				scsiWriteDMA(&scsiDev.data[SD_SECTOR_SIZE * (i % buffers)], dmaBytes);
-				scsiActive = 1;
-			}
-#else
 			if ((prep - i) > 0)
 			{
 				int dmaBytes = SD_SECTOR_SIZE;
@@ -653,42 +650,11 @@ void scsiDiskPoll()
 					if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE;
 				}
 
-				// Manually unrolled loop for performance.
-				// -Os won't unroll this for us automatically,
-				// especially since scsiPhyTx does volatile stuff.
-				// Reduces bus utilisation by making the fsmc split
-				// 32bits into 2 16 bit writes.
-
-				uint16_t* scsiDmaData = (uint16_t*) &(scsiDev.data[SD_SECTOR_SIZE * (i % buffers) + partialScsiChunk]);
-
-				uint32_t chunk = ((dmaBytes - partialScsiChunk) > SCSI_FIFO_DEPTH)
-					? SCSI_FIFO_DEPTH : (dmaBytes - partialScsiChunk);
-
-				int k = 0;
-				for (; k + 4 < (chunk + 1) / 2; k += 4)
-				{
-					scsiPhyTx32(scsiDmaData[k], scsiDmaData[k+1]);
-					scsiPhyTx32(scsiDmaData[k+2], scsiDmaData[k+3]);
-				}
-				for (; k < (chunk + 1) / 2; ++k)
-				{
-					scsiPhyTx(scsiDmaData[k]);
-				}
-				while (!scsiPhyComplete() && !scsiDev.resetFlag)
-				{
-					__WFE(); // Wait for event
-				}
-				scsiPhyFifoFlip();
-				scsiSetDataCount(chunk);
+				uint8_t* scsiDmaData = &(scsiDev.data[SD_SECTOR_SIZE * (i % buffers)]);
+				scsiWritePIO(scsiDmaData, dmaBytes);
 
-				partialScsiChunk += chunk;
-				if (partialScsiChunk == dmaBytes)
-				{
-					partialScsiChunk = 0;
-					++i;
-				}
+				++i;
 			}
-#endif
 		}
 
 		if (phaseChangeDelayUs > 0 && !scsiDev.resetFlag) // zero bytes ?
@@ -699,13 +665,14 @@ void scsiDiskPoll()
 
 		// We've finished transferring the data to the FPGA, now wait until it's
 		// written to the SCSI bus.
+		__disable_irq();
 		while (!scsiPhyComplete() &&
 			likely(scsiDev.phase == DATA_IN) &&
 			likely(!scsiDev.resetFlag))
 		{
-			__WFE(); // Wait for event
+			__WFI();
 		}
-
+		__enable_irq();
 
 		if (scsiDev.phase == DATA_IN)
 		{
@@ -727,22 +694,28 @@ void scsiDiskPoll()
 				transfer.lba);
 		int i = 0;
 		int clearBSY = 0;
+		int extraSectors = 0;
 
 		int parityError = 0;
 		int enableParity = scsiDev.boardCfg.flags & S2S_CFG_ENABLE_PARITY;
 
+		uint32_t scsiSpeed = s2s_getScsiRateMBs();
+
+		uint32_t maxSectors = sizeof(scsiDev.data) / SD_SECTOR_SIZE;
+
+		static_assert(SCSI_XFER_MAX >= sizeof(scsiDev.data), "Assumes SCSI_XFER_MAX >= sizeof(scsiDev.data)");
+
+		// Start reading and filling fifos as soon as possible.
+		scsiSetDataCount(transfer.blocks * bytesPerSector);
+
 		while ((i < totalSDSectors) &&
 			likely(scsiDev.phase == DATA_OUT) &&
-			likely(!scsiDev.resetFlag) &&
-			likely(!parityError || !enableParity))
+			likely(!scsiDev.resetFlag))
+			// KEEP GOING to ensure FIFOs are in a good state.
+			// likely(!parityError || !enableParity))
 		{
-			// Well, until we have some proper non-blocking SD code, we must
-			// do this in a half-duplex fashion. We need to write as much as
-			// possible in each SD card transaction.
-			uint32_t maxSectors = sizeof(scsiDev.data) / SD_SECTOR_SIZE;
 			uint32_t rem = totalSDSectors - i;
-			uint32_t sectors =
-				rem < maxSectors ? rem : maxSectors;
+			uint32_t sectors = rem < maxSectors ? rem : maxSectors;
 
 			if (bytesPerSector == SD_SECTOR_SIZE)
 			{
@@ -750,19 +723,20 @@ void scsiDiskPoll()
 				// no flow control. This can be handled if a) the scsi interface
 				// doesn't block and b) we read enough SCSI sectors first so that
 				// the SD interface cannot catch up.
+				int prevExtraSectors = extraSectors;
 				uint32_t totalBytes = sectors * SD_SECTOR_SIZE;
-				uint32_t readAheadBytes = sectors * SD_SECTOR_SIZE;
+				extraSectors = 0;
+
+				int32_t readAheadBytes = totalBytes;
 				uint32_t sdSpeed = s2s_getSdRateMBs() + (scsiDev.sdUnderrunCount / 2);
-				uint32_t scsiSpeed = s2s_getScsiRateMBs();
 				// if (have blind writes)
 				if (scsiSpeed > 0 && scsiDev.sdUnderrunCount < 16)
 				{
 					// readAhead = sectors * (sd / scsi - 1 + 0.1);
-					readAheadBytes = totalBytes * sdSpeed / scsiSpeed - totalBytes + SCSI_FIFO_DEPTH;
-					if (readAheadBytes < SCSI_FIFO_DEPTH)
-					{
-						readAheadBytes = SCSI_FIFO_DEPTH;
-					}
+					readAheadBytes = totalBytes * sdSpeed / scsiSpeed - totalBytes;
+
+					// Round up to nearest FIFO size.
+					readAheadBytes = ((readAheadBytes / SCSI_FIFO_DEPTH) + 1) * SCSI_FIFO_DEPTH;
 
 					if (readAheadBytes > totalBytes)
 					{
@@ -770,60 +744,58 @@ void scsiDiskPoll()
 					}
 				}
 
-				uint32_t chunk = (readAheadBytes > SCSI_FIFO_DEPTH) ? SCSI_FIFO_DEPTH : readAheadBytes;
-				scsiSetDataCount(chunk);
+				uint32_t prevExtraBytes = prevExtraSectors * SD_SECTOR_SIZE;
+				uint32_t scsiBytesRead = prevExtraBytes;
+				readAheadBytes -= prevExtraBytes; // Must be signed!
 
-				uint32_t scsiBytesRead = 0;
-				while (scsiBytesRead < readAheadBytes)
+				if (readAheadBytes > 0)
 				{
-					while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
-					{
-						__WFE(); // Wait for event
-					}
-					parityError |= scsiParityError();
-					scsiPhyFifoFlip();
-					uint32_t nextChunk = ((totalBytes - scsiBytesRead - chunk) > SCSI_FIFO_DEPTH)
-						? SCSI_FIFO_DEPTH : (totalBytes - scsiBytesRead - chunk);
-
-					if (nextChunk > 0) scsiSetDataCount(nextChunk);
-					scsiReadPIO(&scsiDev.data[scsiBytesRead], chunk);
-					scsiBytesRead += chunk;
-					chunk = nextChunk;
+					scsiReadPIO(
+						&scsiDev.data[scsiBytesRead],
+						readAheadBytes,
+						&parityError);
+					scsiBytesRead += readAheadBytes;
 				}
 
 				HAL_SD_WriteBlocks_DMA(&hsd, (uint32_t*) (&scsiDev.data[0]), (i + sdLBA) * 512ll, SD_SECTOR_SIZE, sectors);
 
-				while (scsiBytesRead < totalBytes)
+				int underrun = 0;
+				if (scsiBytesRead < totalBytes && !scsiDev.resetFlag)
 				{
-					while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
-					{
-						__WFE(); // Wait for event
-					}
-					parityError |= scsiParityError();
-					scsiPhyFifoFlip();
-					uint32_t nextChunk = ((totalBytes - scsiBytesRead - chunk) > SCSI_FIFO_DEPTH)
-						? SCSI_FIFO_DEPTH : (totalBytes - scsiBytesRead - chunk);
-
-					if (nextChunk > 0) scsiSetDataCount(nextChunk);
-					scsiReadPIO(&scsiDev.data[scsiBytesRead], chunk);
-					scsiBytesRead += chunk;
-					chunk = nextChunk;
+					scsiReadPIO(
+						&scsiDev.data[scsiBytesRead],
+						totalBytes - readAheadBytes,
+						&parityError);
+
+					// Oh dear, SD finished first.
+					underrun = hsd.DmaTransferCplt;
+
+					scsiBytesRead += (totalBytes - readAheadBytes);
 				}
 
-				// Oh dear, SD finished first.
-				int underrun = totalBytes > readAheadBytes && hsd.DmaTransferCplt;
+				if (!underrun && rem > sectors)
+				{
+					// We probably have some time to waste reading more here.
+					// While noting this is going to drop us down into
+					// half-duplex operation (hence why we read max / 4 only)
+
+					extraSectors = rem - sectors > (maxSectors / 4)
+						? (maxSectors / 4)
+						: rem - sectors;
+
+					scsiReadPIO(
+						&scsiDev.data[0],
+						extraSectors * SD_SECTOR_SIZE,
+						&parityError);
+				}
 
 				uint32_t dmaFinishTime = s2s_getTime_ms();
-				while (!hsd.SdTransferCplt &&
+				while ((!hsd.SdTransferCplt ||
+						__HAL_SD_SDIO_GET_FLAG(&hsd, SDIO_FLAG_TXACT)) &&
 					s2s_elapsedTime_ms(dmaFinishTime) < 180)
 				{
 					// Wait while keeping BSY.
 				}
-				while((__HAL_SD_SDIO_GET_FLAG(&hsd, SDIO_FLAG_TXACT)) &&
-					s2s_elapsedTime_ms(dmaFinishTime) < 180)
-				{
-					// Wait for SD card while keeping BSY.
-				}
 
 				if (i + sectors >= totalSDSectors &&
 					!underrun &&
@@ -842,14 +814,14 @@ void scsiDiskPoll()
 
 				HAL_SD_CheckWriteOperation(&hsd, (uint32_t)SD_DATATIMEOUT);
 
-				if (underrun)
+				if (underrun && (!parityError || !enableParity))
 				{
 					// Try again. Data is still in memory.
 					sdTmpWrite(&scsiDev.data[0], i + sdLBA, sectors);
 					scsiDev.sdUnderrunCount++;
 				}
-				i += sectors;
 
+				i += sectors;
 			}
 			else
 			{
@@ -857,11 +829,7 @@ void scsiDiskPoll()
 				// do this in a half-duplex fashion. We need to write as much as
 				// possible in each SD card transaction.
 				// use sg_dd from sg_utils3 tools to test.
-				uint32_t maxSectors = sizeof(scsiDev.data) / SD_SECTOR_SIZE;
-				uint32_t rem = totalSDSectors - i;
-				uint32_t sectors = rem < maxSectors ? rem : maxSectors;
-				int scsiSector;
-				for (scsiSector = i; scsiSector < i + sectors; ++scsiSector)
+				for (int scsiSector = i; scsiSector < i + sectors; ++scsiSector)
 				{
 					int dmaBytes = SD_SECTOR_SIZE;
 					if ((scsiSector % sdPerScsi) == (sdPerScsi - 1))
@@ -869,9 +837,10 @@ void scsiDiskPoll()
 						dmaBytes = bytesPerSector % SD_SECTOR_SIZE;
 						if (dmaBytes == 0) dmaBytes = SD_SECTOR_SIZE;
 					}
-					scsiRead(&scsiDev.data[SD_SECTOR_SIZE * (scsiSector - i)], dmaBytes, &parityError);
+
+					scsiReadPIO(&scsiDev.data[SD_SECTOR_SIZE * (scsiSector - i)], dmaBytes, &parityError);
 				}
-				if (!parityError)
+				if (!parityError || !enableParity)
 				{
 					sdTmpWrite(&scsiDev.data[0], i + sdLBA, sectors);
 				}
@@ -879,6 +848,15 @@ void scsiDiskPoll()
 			}
 		}
 
+		// Should already be complete here as we've read the FIFOs
+		// by now. Check anyway.
+		__disable_irq();
+		while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
+		{
+			__WFI();
+		}
+		__enable_irq();
+
 		if (clearBSY)
 		{
 			enter_BusFree();

+ 0 - 1
lib/SCSI2SD/src/firmware/scsi.c

@@ -303,7 +303,6 @@ static void process_Command()
 	{
 		scsiRead(scsiDev.cdb + 6, scsiDev.cdbLen - 6, &parityError);
 	}
-
 	command = scsiDev.cdb[0];
 
 	// Prefer LUN's set by IDENTIFY messages for newer hosts.

+ 3 - 1
lib/SCSI2SD/src/firmware/scsi.h

@@ -106,7 +106,9 @@ typedef struct
 typedef struct
 {
 	// TODO reduce this buffer size and add a proper cache
-	uint8_t data[MAX_SECTOR_SIZE * 8]; // Must be aligned for DMA
+	// Must be aligned for DMA
+	// 65536 bytes is the DMA limit
+	uint8_t data[MAX_SECTOR_SIZE * 8];
 
 	TargetState targets[S2S_MAX_TARGETS];
 	TargetState* target;

+ 334 - 227
lib/SCSI2SD/src/firmware/scsiPhy.c

@@ -30,7 +30,8 @@
 static uint8_t asyncTimings[][4] =
 {
 /* Speed,    Assert,    Deskew,    Hold,    Glitch */
-{/*1.5MB/s*/ 28,        18,        13,      15},
+{/*1.5MB/s*/ 28,        18,        7,      15},
+//{/*1.5MB/s*/ 63,        31,        7,      15},
 {/*3.3MB/s*/ 13,        6,         6,       13},
 {/*5MB/s*/   9,         6,         6,       6}, // 80ns
 {/*safe*/    3,         6,         6,       6}, // Probably safe
@@ -106,8 +107,6 @@ static DMA_HandleTypeDef fsmcToMem;
 volatile uint8_t scsiRxDMAComplete;
 volatile uint8_t scsiTxDMAComplete;
 
-uint8_t scsiPhyFifoSel = 0; // global
-
 // scsi IRQ handler is initialised by the STM32 HAL. Connected to
 // PE4
 // Note: naming is important to ensure this function is listed in the
@@ -120,15 +119,18 @@ void EXTI4_IRQHandler()
 		// Clear interrupt flag
 		__HAL_GPIO_EXTI_CLEAR_IT(GPIO_PIN_4);
 
-		scsiDev.resetFlag = scsiDev.resetFlag || scsiStatusRST();
+		uint8_t statusFlags = *SCSI_STS_SCSI;
+
+		scsiDev.resetFlag = scsiDev.resetFlag || (statusFlags & 0x04);
 
 		// selFlag is required for Philips P2000C which releases it after 600ns
 		// without waiting for BSY.
 		// Also required for some early Mac Plus roms
-		scsiDev.selFlag = *SCSI_STS_SELECTED;
+		if (statusFlags & 0x08) // Check SEL flag
+		{
+			scsiDev.selFlag = *SCSI_STS_SELECTED;
+		}
 	}
-
-	__SEV(); // Set event. See corresponding __WFE() calls.
 }
 
 static void assertFail()
@@ -145,92 +147,215 @@ static void assertFail()
 void
 scsiSetDataCount(uint32_t count)
 {
-	*SCSI_DATA_CNT_HI = count >> 8;
+	*SCSI_DATA_CNT_HI = (count >> 16) & 0xff;
+	*SCSI_DATA_CNT_MID = (count >> 8) & 0xff;
 	*SCSI_DATA_CNT_LO = count & 0xff;
 	*SCSI_DATA_CNT_SET = 1;
 }
 
+int scsiFifoReady(void)
+{
+	__NOP();
+	HAL_GPIO_ReadPin(GPIOE, FPGA_GPIO3_Pin);
+	__NOP();
+	return HAL_GPIO_ReadPin(GPIOE, FPGA_GPIO3_Pin) != 0;
+}
+
 uint8_t
 scsiReadByte(void)
 {
-#if FIFODEBUG
-	if (!scsiPhyFifoAltEmpty()) {
-		// Force a lock-up.
-		assertFail();
-	}
-#endif
 	scsiSetDataCount(1);
 
+	// Ready immediately. setDataCount resets fifos
+
 	while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
 	{
-		__WFE(); // Wait for event
+		__WFI(); // Wait for interrupt
 	}
-	scsiPhyFifoFlip();
+	__enable_irq();
+
 	uint8_t val = scsiPhyRx();
 	// TODO scsiDev.parityError = scsiDev.parityError || SCSI_Parity_Error_Read();
 
-#if FIFODEBUG
-	if (!scsiPhyFifoEmpty()) {
-		int j = 0;
-		uint8_t k __attribute((unused));
-		while (!scsiPhyFifoEmpty()) { k = scsiPhyRx(); ++j; }
-
-		// Force a lock-up.
-		assertFail();
-	}
-#endif
 	return val;
 }
 
 
 void
-scsiReadPIO(uint8_t* data, uint32_t count)
+scsiReadPIO(uint8_t* data, uint32_t count, int* parityError)
 {
 	uint16_t* fifoData = (uint16_t*)data;
+	uint32_t count16 = (count + 1) / 2;
 
-	for (int i = 0; i < (count + 1) / 2; ++i)
+	int i = 0;
+	while ((i  < count16) && likely(!scsiDev.resetFlag))
 	{
-		fifoData[i] = scsiPhyRx(); // TODO ASSUMES LITTLE ENDIAN
-	}
-}
-
-void
-scsiReadDMA(uint8_t* data, uint32_t count)
-{
-	// Prepare DMA transfer
-	dmaInProgress = 1;
-
-	scsiTxDMAComplete = 1; // TODO not used much
-	scsiRxDMAComplete = 0; // TODO not used much
+		// Wait until FIFO is full (or complete)
+		while (!scsiFifoReady() && likely(!scsiDev.resetFlag))
+		{
+			// spin
+		}
 
-	HAL_DMA_Start(
-		&fsmcToMem,
-		(uint32_t) SCSI_FIFO_DATA,
-		(uint32_t) data,
-		(count + 1) / 2);
-}
+		if (count16 - i >= SCSI_FIFO_DEPTH16)
+		{
+			uint32_t chunk16 = SCSI_FIFO_DEPTH16;
 
-int
-scsiReadDMAPoll()
-{
-	int complete = __HAL_DMA_GET_COUNTER(&fsmcToMem) == 0;
-	complete = complete && (HAL_DMA_PollForTransfer(&fsmcToMem, HAL_DMA_FULL_TRANSFER, 0xffffffff) == HAL_OK);
-	if (complete)
-	{
-		scsiTxDMAComplete = 1; // TODO MM FIX IRQ
-		scsiRxDMAComplete = 1;
+			// Let gcc unroll the loop as much as possible.
+			for (uint32_t k = 0; k + 128 <= chunk16; k += 128)
+			{
+				fifoData[i + k] = scsiPhyRx();
+				fifoData[i + k + 1] = scsiPhyRx();
+				fifoData[i + k + 2] = scsiPhyRx();
+				fifoData[i + k + 3] = scsiPhyRx();
+				fifoData[i + k + 4] = scsiPhyRx();
+				fifoData[i + k + 5] = scsiPhyRx();
+				fifoData[i + k + 6] = scsiPhyRx();
+				fifoData[i + k + 7] = scsiPhyRx();
+				fifoData[i + k + 8] = scsiPhyRx();
+				fifoData[i + k + 9] = scsiPhyRx();
+				fifoData[i + k + 10] = scsiPhyRx();
+				fifoData[i + k + 11] = scsiPhyRx();
+				fifoData[i + k + 12] = scsiPhyRx();
+				fifoData[i + k + 13] = scsiPhyRx();
+				fifoData[i + k + 14] = scsiPhyRx();
+				fifoData[i + k + 15] = scsiPhyRx();
+				fifoData[i + k + 16] = scsiPhyRx();
+				fifoData[i + k + 17] = scsiPhyRx();
+				fifoData[i + k + 18] = scsiPhyRx();
+				fifoData[i + k + 19] = scsiPhyRx();
+				fifoData[i + k + 20] = scsiPhyRx();
+				fifoData[i + k + 21] = scsiPhyRx();
+				fifoData[i + k + 22] = scsiPhyRx();
+				fifoData[i + k + 23] = scsiPhyRx();
+				fifoData[i + k + 24] = scsiPhyRx();
+				fifoData[i + k + 25] = scsiPhyRx();
+				fifoData[i + k + 26] = scsiPhyRx();
+				fifoData[i + k + 27] = scsiPhyRx();
+				fifoData[i + k + 28] = scsiPhyRx();
+				fifoData[i + k + 29] = scsiPhyRx();
+				fifoData[i + k + 30] = scsiPhyRx();
+				fifoData[i + k + 31] = scsiPhyRx();
+				fifoData[i + k + 32] = scsiPhyRx();
+				fifoData[i + k + 33] = scsiPhyRx();
+				fifoData[i + k + 34] = scsiPhyRx();
+				fifoData[i + k + 35] = scsiPhyRx();
+				fifoData[i + k + 36] = scsiPhyRx();
+				fifoData[i + k + 37] = scsiPhyRx();
+				fifoData[i + k + 38] = scsiPhyRx();
+				fifoData[i + k + 39] = scsiPhyRx();
+				fifoData[i + k + 40] = scsiPhyRx();
+				fifoData[i + k + 41] = scsiPhyRx();
+				fifoData[i + k + 42] = scsiPhyRx();
+				fifoData[i + k + 43] = scsiPhyRx();
+				fifoData[i + k + 44] = scsiPhyRx();
+				fifoData[i + k + 45] = scsiPhyRx();
+				fifoData[i + k + 46] = scsiPhyRx();
+				fifoData[i + k + 47] = scsiPhyRx();
+				fifoData[i + k + 48] = scsiPhyRx();
+				fifoData[i + k + 49] = scsiPhyRx();
+				fifoData[i + k + 50] = scsiPhyRx();
+				fifoData[i + k + 51] = scsiPhyRx();
+				fifoData[i + k + 52] = scsiPhyRx();
+				fifoData[i + k + 53] = scsiPhyRx();
+				fifoData[i + k + 54] = scsiPhyRx();
+				fifoData[i + k + 55] = scsiPhyRx();
+				fifoData[i + k + 56] = scsiPhyRx();
+				fifoData[i + k + 57] = scsiPhyRx();
+				fifoData[i + k + 58] = scsiPhyRx();
+				fifoData[i + k + 59] = scsiPhyRx();
+				fifoData[i + k + 60] = scsiPhyRx();
+				fifoData[i + k + 61] = scsiPhyRx();
+				fifoData[i + k + 62] = scsiPhyRx();
+				fifoData[i + k + 63] = scsiPhyRx();
+				fifoData[i + k + 64] = scsiPhyRx();
+				fifoData[i + k + 65] = scsiPhyRx();
+				fifoData[i + k + 66] = scsiPhyRx();
+				fifoData[i + k + 67] = scsiPhyRx();
+				fifoData[i + k + 68] = scsiPhyRx();
+				fifoData[i + k + 69] = scsiPhyRx();
+				fifoData[i + k + 70] = scsiPhyRx();
+				fifoData[i + k + 71] = scsiPhyRx();
+				fifoData[i + k + 72] = scsiPhyRx();
+				fifoData[i + k + 73] = scsiPhyRx();
+				fifoData[i + k + 74] = scsiPhyRx();
+				fifoData[i + k + 75] = scsiPhyRx();
+				fifoData[i + k + 76] = scsiPhyRx();
+				fifoData[i + k + 77] = scsiPhyRx();
+				fifoData[i + k + 78] = scsiPhyRx();
+				fifoData[i + k + 79] = scsiPhyRx();
+				fifoData[i + k + 80] = scsiPhyRx();
+				fifoData[i + k + 81] = scsiPhyRx();
+				fifoData[i + k + 82] = scsiPhyRx();
+				fifoData[i + k + 83] = scsiPhyRx();
+				fifoData[i + k + 84] = scsiPhyRx();
+				fifoData[i + k + 85] = scsiPhyRx();
+				fifoData[i + k + 86] = scsiPhyRx();
+				fifoData[i + k + 87] = scsiPhyRx();
+				fifoData[i + k + 88] = scsiPhyRx();
+				fifoData[i + k + 89] = scsiPhyRx();
+				fifoData[i + k + 90] = scsiPhyRx();
+				fifoData[i + k + 91] = scsiPhyRx();
+				fifoData[i + k + 92] = scsiPhyRx();
+				fifoData[i + k + 93] = scsiPhyRx();
+				fifoData[i + k + 94] = scsiPhyRx();
+				fifoData[i + k + 95] = scsiPhyRx();
+				fifoData[i + k + 96] = scsiPhyRx();
+				fifoData[i + k + 97] = scsiPhyRx();
+				fifoData[i + k + 98] = scsiPhyRx();
+				fifoData[i + k + 99] = scsiPhyRx();
+				fifoData[i + k + 100] = scsiPhyRx();
+				fifoData[i + k + 101] = scsiPhyRx();
+				fifoData[i + k + 102] = scsiPhyRx();
+				fifoData[i + k + 103] = scsiPhyRx();
+				fifoData[i + k + 104] = scsiPhyRx();
+				fifoData[i + k + 105] = scsiPhyRx();
+				fifoData[i + k + 106] = scsiPhyRx();
+				fifoData[i + k + 107] = scsiPhyRx();
+				fifoData[i + k + 108] = scsiPhyRx();
+				fifoData[i + k + 109] = scsiPhyRx();
+				fifoData[i + k + 110] = scsiPhyRx();
+				fifoData[i + k + 111] = scsiPhyRx();
+				fifoData[i + k + 112] = scsiPhyRx();
+				fifoData[i + k + 113] = scsiPhyRx();
+				fifoData[i + k + 114] = scsiPhyRx();
+				fifoData[i + k + 115] = scsiPhyRx();
+				fifoData[i + k + 116] = scsiPhyRx();
+				fifoData[i + k + 117] = scsiPhyRx();
+				fifoData[i + k + 118] = scsiPhyRx();
+				fifoData[i + k + 119] = scsiPhyRx();
+				fifoData[i + k + 120] = scsiPhyRx();
+				fifoData[i + k + 121] = scsiPhyRx();
+				fifoData[i + k + 122] = scsiPhyRx();
+				fifoData[i + k + 123] = scsiPhyRx();
+				fifoData[i + k + 124] = scsiPhyRx();
+				fifoData[i + k + 125] = scsiPhyRx();
+				fifoData[i + k + 126] = scsiPhyRx();
+				fifoData[i + k + 127] = scsiPhyRx();
+			}
 
-		dmaInProgress = 0;
-#if 0
-		// TODO MM scsiDev.parityError = scsiDev.parityError || SCSI_Parity_Error_Read();
-#endif
-		return 1;
+			i += chunk16;
+		}
+		else
+		{
+			uint32_t chunk16 = count16 - i;
 
+			uint32_t k = 0;
+			for (; k + 4 <= chunk16; k += 4)
+			{
+				fifoData[i + k] = scsiPhyRx();
+				fifoData[i + 1 + k] = scsiPhyRx();
+				fifoData[i + 2 + k] = scsiPhyRx();
+				fifoData[i + 3 + k] = scsiPhyRx();
+			}
+			for (; k < chunk16; ++k)
+			{
+				fifoData[i + k] = scsiPhyRx();
+			}
+			i += chunk16;
+		}
 	}
-	else
-	{
-		return 0;
-	}
+
+	*parityError |= scsiParityError();
 }
 
 void
@@ -239,208 +364,173 @@ scsiRead(uint8_t* data, uint32_t count, int* parityError)
 	int i = 0;
 	*parityError = 0;
 
-
-	uint32_t chunk = ((count - i) > SCSI_FIFO_DEPTH)
-		? SCSI_FIFO_DEPTH : (count - i);
-#ifdef SCSI_FSMC_DMA
-	if (chunk >= 16)
-	{
-		// DMA is doing 32bit transfers.
-		chunk = chunk & 0xFFFFFFF8;
-	}
-#endif
-	scsiSetDataCount(chunk);
-
 	while (i < count && likely(!scsiDev.resetFlag))
 	{
-		while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
-		{
-			__WFE(); // Wait for event
-		}
-		*parityError |= scsiParityError();
-		scsiPhyFifoFlip();
+		uint32_t chunk = ((count - i) > SCSI_XFER_MAX)
+			? SCSI_XFER_MAX : (count - i);
+		scsiSetDataCount(chunk);
 
-		uint32_t nextChunk = ((count - i - chunk) > SCSI_FIFO_DEPTH)
-			? SCSI_FIFO_DEPTH : (count - i - chunk);
-#ifdef SCSI_FSMC_DMA
-		if (nextChunk >= 16)
-		{
-			nextChunk = nextChunk & 0xFFFFFFF8;
-		}
-#endif
-		if (nextChunk > 0)
-		{
-			scsiSetDataCount(nextChunk);
-		}
+		scsiReadPIO(data + i, chunk, parityError);
 
-#ifdef SCSI_FSMC_DMA
-		if (chunk < 16)
-#endif
-		{
-			scsiReadPIO(data + i, chunk);
-		}
-#ifdef SCSI_FSMC_DMA
-		else
+		__disable_irq();
+		while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
 		{
-			scsiReadDMA(data + i, chunk);
-
-			while (!scsiReadDMAPoll() && likely(!scsiDev.resetFlag))
-			{
-			};
+			__WFI();
 		}
-#endif
-
+		__enable_irq();
 
 		i += chunk;
-		chunk = nextChunk;
 	}
-#if FIFODEBUG
-		if (!scsiPhyFifoEmpty() || !scsiPhyFifoAltEmpty()) {
-			int j = 0;
-			while (!scsiPhyFifoEmpty()) { scsiPhyRx(); ++j; }
-			scsiPhyFifoFlip();
-			int k = 0;
-			while (!scsiPhyFifoEmpty()) { scsiPhyRx(); ++k; }
-			// Force a lock-up.
-			assertFail();
-		}
-#endif
 }
 
 void
 scsiWriteByte(uint8_t value)
 {
-#if FIFODEBUG
-	if (!scsiPhyFifoEmpty()) {
-		// Force a lock-up.
-		assertFail();
-	}
-#endif
-	scsiPhyTx(value);
-	scsiPhyFifoFlip();
-
 	scsiSetDataCount(1);
+	scsiPhyTx(value);
 
+	__disable_irq();
 	while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
 	{
-		__WFE(); // Wait for event
-	}
-
-#if FIFODEBUG
-	if (!scsiPhyFifoAltEmpty()) {
-		// Force a lock-up.
-		assertFail();
+		__WFI();
 	}
-#endif
+	__enable_irq();
 }
 
-static void
+void
 scsiWritePIO(const uint8_t* data, uint32_t count)
 {
 	uint16_t* fifoData = (uint16_t*)data;
-	for (int i = 0; i < (count + 1) / 2; ++i)
+	uint32_t count16 = (count + 1) / 2;
+
+	int i = 0;
+	while ((i  < count16) && likely(!scsiDev.resetFlag))
 	{
-		scsiPhyTx(fifoData[i]);
-	}
-}
+		while (!scsiFifoReady() && likely(!scsiDev.resetFlag))
+		{
+			// Spin
+		}
 
-void
-scsiWriteDMA(const uint8_t* data, uint32_t count)
-{
-	// Prepare DMA transfer
-	dmaInProgress = 1;
+		if (count16 - i >= SCSI_FIFO_DEPTH16)
+		{
+			uint32_t chunk16 = SCSI_FIFO_DEPTH16;
 
-	scsiTxDMAComplete = 0;
-	scsiRxDMAComplete = 1;
+			// Let gcc unroll the loop as much as possible.
+			for (uint32_t k = 0; k + 128 <= chunk16; k += 128)
+			{
+				scsiPhyTx32(fifoData[i + k], fifoData[i + k + 1]);
+				scsiPhyTx32(fifoData[i + 2 + k], fifoData[i + k + 3]);
+				scsiPhyTx32(fifoData[i + 4 + k], fifoData[i + k + 5]);
+				scsiPhyTx32(fifoData[i + 6 + k], fifoData[i + k + 7]);
+				scsiPhyTx32(fifoData[i + 8 + k], fifoData[i + k + 9]);
+				scsiPhyTx32(fifoData[i + 10 + k], fifoData[i + k + 11]);
+				scsiPhyTx32(fifoData[i + 12 + k], fifoData[i + k + 13]);
+				scsiPhyTx32(fifoData[i + 14 + k], fifoData[i + k + 15]);
+				scsiPhyTx32(fifoData[i + 16 + k], fifoData[i + k + 17]);
+				scsiPhyTx32(fifoData[i + 18 + k], fifoData[i + k + 19]);
+				scsiPhyTx32(fifoData[i + 20 + k], fifoData[i + k + 21]);
+				scsiPhyTx32(fifoData[i + 22 + k], fifoData[i + k + 23]);
+				scsiPhyTx32(fifoData[i + 24 + k], fifoData[i + k + 25]);
+				scsiPhyTx32(fifoData[i + 26 + k], fifoData[i + k + 27]);
+				scsiPhyTx32(fifoData[i + 28 + k], fifoData[i + k + 29]);
+				scsiPhyTx32(fifoData[i + 30 + k], fifoData[i + k + 31]);
+
+				scsiPhyTx32(fifoData[i + 32 + k], fifoData[i + k + 33]);
+				scsiPhyTx32(fifoData[i + 34 + k], fifoData[i + k + 35]);
+				scsiPhyTx32(fifoData[i + 36 + k], fifoData[i + k + 37]);
+				scsiPhyTx32(fifoData[i + 38 + k], fifoData[i + k + 39]);
+				scsiPhyTx32(fifoData[i + 40 + k], fifoData[i + k + 41]);
+				scsiPhyTx32(fifoData[i + 42 + k], fifoData[i + k + 43]);
+				scsiPhyTx32(fifoData[i + 44 + k], fifoData[i + k + 45]);
+				scsiPhyTx32(fifoData[i + 46 + k], fifoData[i + k + 47]);
+				scsiPhyTx32(fifoData[i + 48 + k], fifoData[i + k + 49]);
+				scsiPhyTx32(fifoData[i + 50 + k], fifoData[i + k + 51]);
+				scsiPhyTx32(fifoData[i + 52 + k], fifoData[i + k + 53]);
+				scsiPhyTx32(fifoData[i + 54 + k], fifoData[i + k + 55]);
+				scsiPhyTx32(fifoData[i + 56 + k], fifoData[i + k + 57]);
+				scsiPhyTx32(fifoData[i + 58 + k], fifoData[i + k + 59]);
+				scsiPhyTx32(fifoData[i + 60 + k], fifoData[i + k + 61]);
+				scsiPhyTx32(fifoData[i + 62 + k], fifoData[i + k + 63]);
+
+				scsiPhyTx32(fifoData[i + 64 + k], fifoData[i + k + 65]);
+				scsiPhyTx32(fifoData[i + 66 + k], fifoData[i + k + 67]);
+				scsiPhyTx32(fifoData[i + 68 + k], fifoData[i + k + 69]);
+				scsiPhyTx32(fifoData[i + 70 + k], fifoData[i + k + 71]);
+				scsiPhyTx32(fifoData[i + 72 + k], fifoData[i + k + 73]);
+				scsiPhyTx32(fifoData[i + 74 + k], fifoData[i + k + 75]);
+				scsiPhyTx32(fifoData[i + 76 + k], fifoData[i + k + 77]);
+				scsiPhyTx32(fifoData[i + 78 + k], fifoData[i + k + 79]);
+				scsiPhyTx32(fifoData[i + 80 + k], fifoData[i + k + 81]);
+				scsiPhyTx32(fifoData[i + 82 + k], fifoData[i + k + 83]);
+				scsiPhyTx32(fifoData[i + 84 + k], fifoData[i + k + 85]);
+				scsiPhyTx32(fifoData[i + 86 + k], fifoData[i + k + 87]);
+				scsiPhyTx32(fifoData[i + 88 + k], fifoData[i + k + 89]);
+				scsiPhyTx32(fifoData[i + 90 + k], fifoData[i + k + 91]);
+				scsiPhyTx32(fifoData[i + 92 + k], fifoData[i + k + 93]);
+				scsiPhyTx32(fifoData[i + 94 + k], fifoData[i + k + 95]);
+
+				scsiPhyTx32(fifoData[i + 96 + k], fifoData[i + k + 97]);
+				scsiPhyTx32(fifoData[i + 98 + k], fifoData[i + k + 99]);
+				scsiPhyTx32(fifoData[i + 100 + k], fifoData[i + k + 101]);
+				scsiPhyTx32(fifoData[i + 102 + k], fifoData[i + k + 103]);
+				scsiPhyTx32(fifoData[i + 104 + k], fifoData[i + k + 105]);
+				scsiPhyTx32(fifoData[i + 106 + k], fifoData[i + k + 107]);
+				scsiPhyTx32(fifoData[i + 108 + k], fifoData[i + k + 109]);
+				scsiPhyTx32(fifoData[i + 110 + k], fifoData[i + k + 111]);
+				scsiPhyTx32(fifoData[i + 112 + k], fifoData[i + k + 113]);
+				scsiPhyTx32(fifoData[i + 114 + k], fifoData[i + k + 115]);
+				scsiPhyTx32(fifoData[i + 116 + k], fifoData[i + k + 117]);
+				scsiPhyTx32(fifoData[i + 118 + k], fifoData[i + k + 119]);
+				scsiPhyTx32(fifoData[i + 120 + k], fifoData[i + k + 121]);
+				scsiPhyTx32(fifoData[i + 122 + k], fifoData[i + k + 123]);
+				scsiPhyTx32(fifoData[i + 124 + k], fifoData[i + k + 125]);
+				scsiPhyTx32(fifoData[i + 126 + k], fifoData[i + k + 127]);
 
-	HAL_DMA_Start(
-		&memToFSMC,
-		(uint32_t) data,
-		(uint32_t) SCSI_FIFO_DATA,
-		count / 4);
-}
+			}
 
-int
-scsiWriteDMAPoll()
-{
-	int complete = __HAL_DMA_GET_COUNTER(&memToFSMC) == 0;
-	complete = complete && (HAL_DMA_PollForTransfer(&memToFSMC, HAL_DMA_FULL_TRANSFER, 0xffffffff) == HAL_OK);
-	if (complete)
-	{
-		scsiTxDMAComplete = 1; // TODO MM FIX IRQ
-		scsiRxDMAComplete = 1;
+			i += chunk16;
+		}
+		else
+		{
+			uint32_t chunk16 = count16 - i;
 
-		dmaInProgress = 0;
-		return 1;
-	}
-	else
-	{
-		return 0;
+			uint32_t k = 0;
+			for (; k + 4 <= chunk16; k += 4)
+			{
+				scsiPhyTx32(fifoData[i + k], fifoData[i + k + 1]);
+				scsiPhyTx32(fifoData[i + k + 2], fifoData[i + k + 3]);
+			}
+			for (; k < chunk16; ++k)
+			{
+				scsiPhyTx(fifoData[i + k]);
+			}
+			i += chunk16;
+		}
 	}
 }
 
+
 void
 scsiWrite(const uint8_t* data, uint32_t count)
 {
 	int i = 0;
 	while (i < count && likely(!scsiDev.resetFlag))
 	{
-		uint32_t chunk = ((count - i) > SCSI_FIFO_DEPTH)
-			? SCSI_FIFO_DEPTH : (count - i);
-
-#if FIFODEBUG
-		if (!scsiPhyFifoEmpty()) {
-			// Force a lock-up.
-			assertFail();
-		}
-#endif
-
-#ifdef SCSI_FSMC_DMA
-		if (chunk < 16)
-#endif
-		{
-			scsiWritePIO(data + i, chunk);
-		}
-#ifdef SCSI_FSMC_DMA
-		else
-		{
-			// DMA is doing 32bit transfers.
-			chunk = chunk & 0xFFFFFFF8;
-			scsiWriteDMA(data + i, chunk);
+		uint32_t chunk = ((count - i) > SCSI_XFER_MAX)
+			? SCSI_XFER_MAX : (count - i);
+		scsiSetDataCount(chunk);
 
-			while (!scsiWriteDMAPoll() && likely(!scsiDev.resetFlag))
-			{
-			}
-		}
-#endif
+		scsiWritePIO(data + i, chunk);
 
+		__disable_irq();
 		while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
 		{
-			__WFE(); // Wait for event
+			__WFI();
 		}
+		__enable_irq();
 
-#if FIFODEBUG
-		if (!scsiPhyFifoAltEmpty()) {
-			// Force a lock-up.
-			assertFail();
-		}
-#endif
-
-		scsiPhyFifoFlip();
-		scsiSetDataCount(chunk);
 		i += chunk;
 	}
-	while (!scsiPhyComplete() && likely(!scsiDev.resetFlag))
-	{
-		__WFE(); // Wait for event
-	}
-
-#if FIFODEBUG
-	if (!scsiPhyFifoAltEmpty()) {
-		// Force a lock-up.
-		assertFail();
-	}
-#endif
 }
 
 static inline void busSettleDelay(void)
@@ -498,10 +588,6 @@ uint32_t scsiEnterPhaseImmediate(int newPhase)
 
 	int oldPhase = *SCSI_CTRL_PHASE;
 
-	if (!scsiDev.resetFlag && (!scsiPhyFifoEmpty() || !scsiPhyFifoAltEmpty())) {
-		// Force a lock-up.
-		assertFail();
-	}
 	if (newPhase != oldPhase)
 	{
 		if ((newPhase == DATA_IN || newPhase == DATA_OUT) &&
@@ -639,8 +725,6 @@ void scsiPhyReset()
 
 	*SCSI_CTRL_PHASE = 0x00;
 	*SCSI_CTRL_BSY = 0x00;
-	scsiPhyFifoSel = 0;
-	*SCSI_FIFO_SEL = 0;
 	*SCSI_CTRL_DBX = 0;
 
 	*SCSI_CTRL_SYNC_OFFSET = 0;
@@ -674,6 +758,31 @@ void scsiPhyReset()
 	}
 	#endif
 
+	// PIO Benchmark code
+	// Currently 16.7MB/s.
+	//#define PIO_BENCHMARK 1
+	#ifdef PIO_BENCHMARK
+	while(1)
+	{
+		s2s_ledOn();
+
+		scsiEnterPhase(DATA_IN); // Need IO flag set for fifo ready flag
+
+		// 100MB
+		for (int i = 0; i < (100LL * 1024 * 1024 / SCSI_FIFO_DEPTH); ++i)
+		{
+			scsiSetDataCount(1); // Resets fifos.
+
+			// Shouldn't block
+			scsiDev.resetFlag = 0;
+			scsiWritePIO(&scsiDev.data[0], SCSI_FIFO_DEPTH);
+		}
+		s2s_ledOff();
+
+		for(int i = 0; i < 10; ++i) s2s_delay_ms(1000);
+	}
+	#endif
+
 	#ifdef SCSI_FREQ_TEST
 	while(1)
 	{
@@ -749,8 +858,6 @@ void scsiPhyInit()
 	*SCSI_CTRL_IDMASK = 0x00; // Reset in scsiPhyConfig
 	*SCSI_CTRL_PHASE = 0x00;
 	*SCSI_CTRL_BSY = 0x00;
-	scsiPhyFifoSel = 0;
-	*SCSI_FIFO_SEL = 0;
 	*SCSI_CTRL_DBX = 0;
 
 	*SCSI_CTRL_SYNC_OFFSET = 0;

+ 20 - 21
lib/SCSI2SD/src/firmware/scsiPhy.h

@@ -20,8 +20,8 @@
 #define SCSI_CTRL_IDMASK ((volatile uint8_t*)0x60000000)
 #define SCSI_CTRL_PHASE ((volatile uint8_t*)0x60000002)
 #define SCSI_CTRL_BSY ((volatile uint8_t*)0x60000004)
-#define SCSI_FIFO_SEL ((volatile uint8_t*)0x60000006)
-#define SCSI_DATA_CNT_HI ((volatile uint8_t*)0x60000008)
+#define SCSI_DATA_CNT_HI ((volatile uint8_t*)0x60000006)
+#define SCSI_DATA_CNT_MID ((volatile uint8_t*)0x60000008)
 #define SCSI_DATA_CNT_LO ((volatile uint8_t*)0x6000000A)
 #define SCSI_DATA_CNT_SET ((volatile uint8_t*)0x6000000C)
 #define SCSI_CTRL_DBX ((volatile uint8_t*)0x6000000E)
@@ -35,7 +35,7 @@
 #define SCSI_CTRL_SEL_TIMING ((volatile uint8_t*)0x60000018)
 
 #define SCSI_STS_FIFO ((volatile uint8_t*)0x60000020)
-#define SCSI_STS_ALTFIFO ((volatile uint8_t*)0x60000022)
+// Obsolete #define SCSI_STS_ALTFIFO ((volatile uint8_t*)0x60000022)
 #define SCSI_STS_FIFO_COMPLETE ((volatile uint8_t*)0x60000024)
 #define SCSI_STS_SELECTED ((volatile uint8_t*)0x60000026)
 #define SCSI_STS_SCSI ((volatile uint8_t*)0x60000028)
@@ -47,18 +47,17 @@
 #define SCSI_STS_PARITY_ERR ((volatile uint8_t*)0x6000002C)
 
 #define SCSI_FIFO_DATA ((volatile uint16_t*)0x60000040)
-#define SCSI_FIFO_DEPTH 256
 
+#define SCSI_FIFO_DEPTH 512
+#define SCSI_FIFO_DEPTH16 (SCSI_FIFO_DEPTH / 2)
+#define SCSI_XFER_MAX 524288
 
-#define scsiPhyFifoFull() ((*SCSI_STS_FIFO & 0x01) == 0x01)
-#define scsiPhyFifoEmpty() ((*SCSI_STS_FIFO & 0x02) == 0x02)
-#define scsiPhyFifoAltEmpty() ((*SCSI_STS_ALTFIFO & 0x02) == 0x02)
+// Check if FIFO is empty or full.
+// Replaced with a function, scsiFifoReady(), due to delays
+// #define scsiFifoReady() (HAL_GPIO_ReadPin(GPIOE, FPGA_GPIO3_Pin) != 0)
 
-#define scsiPhyFifoFlip() \
-{\
-	scsiPhyFifoSel ^= 1; \
-	*SCSI_FIFO_SEL = scsiPhyFifoSel; \
-}
+#define scsiPhyFifoFull() ((*SCSI_STS_FIFO & 0x01) != 0)
+#define scsiPhyFifoEmpty() ((*SCSI_STS_FIFO & 0x02) != 0)
 
 #define scsiPhyTx(val) *SCSI_FIFO_DATA = (val)
 
@@ -69,24 +68,23 @@
 #define scsiPhyRx() *SCSI_FIFO_DATA
 #define scsiPhyComplete() ((*SCSI_STS_FIFO_COMPLETE & 0x01) == 0x01)
 
-#define scsiStatusATN() ((*SCSI_STS_SCSI & 0x01) == 0x01)
-#define scsiStatusBSY() ((*SCSI_STS_SCSI & 0x02) == 0x02)
-#define scsiStatusRST() ((*SCSI_STS_SCSI & 0x04) == 0x04)
-#define scsiStatusSEL() ((*SCSI_STS_SCSI & 0x08) == 0x08)
-#define scsiStatusACK() ((*SCSI_STS_SCSI & 0x10) == 0x10)
+#define scsiStatusATN() ((*SCSI_STS_SCSI & 0x01) != 0)
+#define scsiStatusBSY() ((*SCSI_STS_SCSI & 0x02) != 0)
+#define scsiStatusRST() ((*SCSI_STS_SCSI & 0x04) != 0)
+#define scsiStatusSEL() ((*SCSI_STS_SCSI & 0x08) != 0)
+#define scsiStatusACK() ((*SCSI_STS_SCSI & 0x10) != 0)
 
-#define scsiParityError() ((*SCSI_STS_PARITY_ERR & 0x1) == 0x1)
+#define scsiParityError() ((*SCSI_STS_PARITY_ERR & 0x1) != 0)
 
 // Disable DMA due to errata with the STM32F205 DMA2 controller when
 // concurrently transferring FSMC (with FIFO) and APB (ie. sdio)
 // peripherals.
 #undef SCSI_FSMC_DMA
 
-extern uint8_t scsiPhyFifoSel;
-
 void scsiPhyInit(void);
 void scsiPhyConfig(void);
 void scsiPhyReset(void);
+int scsiFifoReady(void);
 
 void scsiEnterPhase(int phase);
 uint32_t scsiEnterPhaseImmediate(int phase);
@@ -111,7 +109,8 @@ void scsiReadDMA(uint8_t* data, uint32_t count);
 int scsiReadDMAPoll();
 
 // Low-level.
-void scsiReadPIO(uint8_t* data, uint32_t count);
+void scsiReadPIO(uint8_t* data, uint32_t count, int* parityError);
+void scsiWritePIO(const uint8_t* data, uint32_t count);
 
 void scsiWriteDMA(const uint8_t* data, uint32_t count);
 int scsiWriteDMAPoll();

+ 0 - 5
lib/SCSI2SD/src/firmware/sd.h

@@ -34,14 +34,9 @@ extern SdDevice sdDev;
 
 int sdInit(void);
 
-void sdWriteMultiSectorPrep(uint32_t sdLBA, uint32_t sdSectors);
-void sdWriteMultiSectorDMA(uint8_t* outputBuffer);
-int sdWriteSectorDMAPoll();
-
 void sdReadDMA(uint32_t lba, uint32_t sectors, uint8_t* outputBuffer);
 int sdReadDMAPoll(uint32_t remainingSectors);
 void sdCompleteTransfer();
 
-void sdPoll();
 
 #endif