Parcourir la source

Optimize synchronous mode in assembler

Petteri Aimonen il y a 3 ans
Parent
commit
acf9ad1de4

+ 1 - 0
azulscsi.ini

@@ -12,6 +12,7 @@ EnableUnitAttention = 0 # Post UNIT_ATTENTION status on power-on or SD card hotp
 EnableSCSI2 = 1 # Enable faster speeds of SCSI2
 EnableSelLatch = 0 # For Philips P2000C and other devices that release SEL signal before BSY
 MapLunsToIDs = 0 # For Philips P2000C simulate multiple LUNs
+MaxSyncSpeed = 10 # Set to 5 or 10 to enable synchronous SCSI mode, 0 to disable
 
 # Settings that can be specified either per-device or for all devices.
 #Vendor = "QUANTUM"

+ 1 - 1
lib/AzulSCSI_platform_GD32F205/AzulSCSI_platform.h

@@ -22,7 +22,7 @@ extern const char *g_azplatform_name;
 #elif defined(AZULSCSI_V1_1)
 #   define PLATFORM_NAME "AzulSCSI v1.1"
 #   define PLATFORM_REVISION "1.1"
-#   define PLATFORM_MAX_SCSI_SPEED S2S_CFG_SPEED_SYNC_5
+#   define PLATFORM_MAX_SCSI_SPEED S2S_CFG_SPEED_SYNC_10
 #   define PLATFORM_OPTIMAL_MIN_SD_WRITE_SIZE 4096
 #   define PLATFORM_OPTIMAL_MAX_SD_WRITE_SIZE 65536
 #   define PLATFORM_OPTIMAL_LAST_SD_WRITE_SIZE 8192

+ 2 - 0
lib/AzulSCSI_platform_GD32F205/AzulSCSI_v1_1_gpio.h

@@ -105,6 +105,8 @@
 #define SCSI_EXMC_DMA DMA0
 #define SCSI_EXMC_DMA_RCU RCU_DMA0
 #define SCSI_EXMC_DMACH DMA_CH0
+#define SCSI_SYNC_TIMER TIMER1
+#define SCSI_SYNC_TIMER_RCU RCU_TIMER1
 
 // BSY pin uses EXTI interrupt
 #define SCSI_BSY_PORT GPIOB

+ 4 - 1
lib/AzulSCSI_platform_GD32F205/scsiPhy.cpp

@@ -422,7 +422,10 @@ extern "C" bool scsiIsWriteFinished(const uint8_t *data)
     {
         // Process the transfer piece-by-piece while waiting
         // for SD card to react.
-        processPollingWrite(256);
+        int max_count = g_scsi_writereq.count / 8;
+        max_count &= ~255;
+        if (max_count < 256) max_count = 256;
+        processPollingWrite(max_count);
         return isPollingWriteFinished(data);
     }
     

+ 325 - 10
lib/AzulSCSI_platform_GD32F205/scsi_accel_sync.cpp

@@ -18,6 +18,7 @@
 #include "scsi_accel_sync.h"
 #include <AzulSCSI_log.h>
 #include <gd32f20x_exmc.h>
+#include <scsi.h>
 
 #ifndef SCSI_SYNC_MODE_AVAILABLE
 
@@ -38,8 +39,8 @@ static uint32_t g_sync_dma_buf[SYNC_DMA_BUFSIZE];
 void scsi_accel_sync_init()
 {
     rcu_periph_clock_enable(RCU_EXMC);
-    rcu_periph_clock_enable(SCSI_TIMER_RCU);
     rcu_periph_clock_enable(SCSI_EXMC_DMA_RCU);
+    rcu_periph_clock_enable(SCSI_SYNC_TIMER_RCU);
 
     exmc_norsram_timing_parameter_struct timing_param = {
         .asyn_access_mode = EXMC_ACCESS_MODE_A,
@@ -90,6 +91,14 @@ void scsi_accel_sync_init()
     dma_memory_to_memory_enable(SCSI_EXMC_DMA, SCSI_EXMC_DMACH);
 
     gpio_init(SCSI_IN_ACK_EXMC_NWAIT_PORT, GPIO_MODE_IN_FLOATING, 0, SCSI_IN_ACK_EXMC_NWAIT_PIN);
+    gpio_init(SCSI_TIMER_IN_PORT, GPIO_MODE_IN_FLOATING, 0, SCSI_TIMER_IN_PIN);
+
+    // TIMER1 is used to count ACK pulses
+    TIMER_CTL0(SCSI_SYNC_TIMER) = 0;
+    TIMER_SMCFG(SCSI_SYNC_TIMER) = TIMER_SLAVE_MODE_EXTERNAL0 | TIMER_SMCFG_TRGSEL_CI0FE0;
+    TIMER_CAR(SCSI_SYNC_TIMER) = 65535;
+    TIMER_PSC(SCSI_SYNC_TIMER) = 0;
+    TIMER_CHCTL0(SCSI_SYNC_TIMER) = 0x0001; // CH0 as input
 }
 
 void scsi_accel_sync_recv(uint8_t *data, uint32_t count, int* parityError, volatile int *resetFlag)
@@ -132,6 +141,7 @@ void scsi_accel_sync_recv(uint8_t *data, uint32_t count, int* parityError, volat
         }
 
         DMA_CHCTL(SCSI_EXMC_DMA, SCSI_EXMC_DMACH) &= ~DMA_CHXCTL_CHEN;
+        data = end;
     }
 
     GPIO_CTL0(SCSI_OUT_REQ_EXMC_NOE_PORT) = oldmode;
@@ -142,19 +152,324 @@ void scsi_accel_sync_recv(uint8_t *data, uint32_t count, int* parityError, volat
 /* Transfer from device to host */
 /********************************/
 
+// Simple delay, about 10 ns: a dummy load from the reset flag address into tmp2.
+// This is less likely to get optimized away by CPU pipeline than nop
+#define ASM_DELAY()  \
+"   ldr     %[tmp2], [%[reset_flag]] \n"
+
+// Take 8 bits from the %[data] operand, starting at bit offset b (a string literal),
+// and replace them in tmp1 with the matching word from byte_lookup, ready for writing.
+#define ASM_LOAD_DATA(b) \
+"        ubfx    %[tmp1], %[data], #" b ", #8 \n" \
+"        ldr     %[tmp1], [%[byte_lookup], %[tmp1], lsl #2] \n"
+
+// Write data to SCSI port and set REQ high (stores tmp1 prepared by ASM_LOAD_DATA)
+#define ASM_SEND_DATA() \
+"        str     %[tmp1], [%[out_port_bop]] \n"
+
+// Set REQ low (overwrites tmp2)
+#define ASM_SET_REQ_LOW() \
+"        mov     %[tmp2], %[bop_req_low] \n" \
+"        str     %[tmp2], [%[out_port_bop]] \n"
+
+// Wait until ACK_TIMER - n <= num_bytes (signed compare); branches to all_done if the reset flag is nonzero
+#define ASM_WAIT_ACK_TIMER(n) \
+    "wait_acks_" n "_%=: \n" \
+        "   ldr     %[tmp2], [%[ack_timer]] \n" \
+        "   sub     %[tmp2], # " n " \n" \
+        "   cmp     %[tmp2], %[num_bytes] \n" \
+        "   ble     got_acks_" n "_%= \n" \
+        "   ldr     %[tmp2], [%[reset_flag]] \n" \
+        "   cmp     %[tmp2], #0 \n" \
+        "   bne     all_done_%= \n" \
+        "   b       wait_acks_" n "_%= \n" \
+    "got_acks_" n "_%=: \n"
+
+// Send the 4 bytes held in %[data], lowest byte first.
+// For each byte the data is written with REQ high, then REQ is set low.
+// ASM_DELAY1() and ASM_DELAY2() must be defined by the routine using this macro.
+#define ASM_SEND_4BYTES() \
+ASM_LOAD_DATA("0") \
+ASM_SEND_DATA() \
+ASM_DELAY1() \
+ASM_SET_REQ_LOW() \
+ASM_DELAY2() \
+ASM_LOAD_DATA("8") \
+ASM_SEND_DATA() \
+ASM_DELAY1() \
+ASM_SET_REQ_LOW() \
+ASM_DELAY2() \
+ASM_LOAD_DATA("16") \
+ASM_SEND_DATA() \
+ASM_DELAY1() \
+ASM_SET_REQ_LOW() \
+ASM_DELAY2() \
+ASM_LOAD_DATA("24") \
+ASM_SEND_DATA() \
+ASM_DELAY1() \
+ASM_SET_REQ_LOW()
+// Send 1 byte, wait until ACK_TIMER - n <= num_bytes, and send 3 bytes more.
+// The timer is sampled and the second byte is already placed on the bus before
+// entering the wait loop; this interleaving minimizes the delay caused by
+// ASM_WAIT_ACK_TIMER. If the first comparison already passes, the wait loop is
+// skipped by branching straight to the got_acks label it defines.
+// ASM_DELAY1() and ASM_DELAY2() must be defined by the routine using this macro.
+// Note: the last line deliberately has no line continuation, so the macro
+// cannot swallow whatever line follows it.
+#define ASM_SEND_4BYTES_WAIT(n) \
+ASM_LOAD_DATA("0") \
+ASM_SEND_DATA() \
+ASM_DELAY2() \
+ASM_LOAD_DATA("8") \
+ASM_SET_REQ_LOW() \
+ASM_DELAY2() \
+"   ldr     %[tmp2], [%[ack_timer]] \n" \
+"   sub     %[tmp2], # " n " \n" \
+ASM_SEND_DATA() \
+"   cmp     %[tmp2], %[num_bytes] \n" \
+"   ble     got_acks_" n "_%= \n" \
+ASM_WAIT_ACK_TIMER(n) \
+ASM_DELAY2() \
+ASM_SET_REQ_LOW() \
+ASM_DELAY2() \
+ASM_LOAD_DATA("16") \
+ASM_SEND_DATA() \
+ASM_DELAY1() \
+ASM_SET_REQ_LOW() \
+ASM_DELAY2() \
+ASM_LOAD_DATA("24") \
+ASM_SEND_DATA() \
+ASM_DELAY1() \
+ASM_SET_REQ_LOW()
+
+// Specialized routine for settings:
+// <=100 ns period, >=15 outstanding REQs
+//
+// Sends num_bytes bytes from buf to the SCSI bus. The main loop handles 16 bytes
+// per iteration (four 32-bit loads); any remainder is sent one byte at a time.
+// The ACK timer is polled so that REQs do not run too far ahead of the ACKs,
+// and the wait loops abort the transfer when *resetFlag becomes nonzero.
+// Data and REQ are released before returning.
+static void sync_send_100ns_15off(const uint8_t *buf, uint32_t num_bytes, volatile int *resetFlag)
+{
+    volatile uint32_t *out_port_bop = (volatile uint32_t*)&GPIO_BOP(SCSI_OUT_PORT);
+    volatile uint32_t *ack_timer = &TIMER_CNT(SCSI_SYNC_TIMER);
+    const uint32_t *byte_lookup = g_scsi_out_byte_to_bop;
+    register uint32_t tmp1 = 0;
+    register uint32_t tmp2 = 0;
+    register uint32_t data = 0;
+
+// Per-routine delays inserted by the ASM_SEND_* macros: none after the data
+// write, one delay step after REQ goes low.
+#define ASM_DELAY1()
+#define ASM_DELAY2() ASM_DELAY()
+
+    asm volatile (
+    "main_loop_%=: \n"
+        /* num_bytes is decremented up front, so inside the loop it holds the
+         * count that will remain after this 16-byte group. */
+        "   subs  %[num_bytes], %[num_bytes], #16 \n"
+        "   bmi     last_bytes_%= \n"
+
+        /* At each point make sure there is at most 15 bytes in flight */
+        "   ldr   %[data], [%[buf]], #4 \n"
+        ASM_SEND_4BYTES_WAIT("22")
+        ASM_DELAY2()
+        "   ldr   %[data], [%[buf]], #4 \n"
+        ASM_SEND_4BYTES()
+        ASM_DELAY2()
+        "   ldr   %[data], [%[buf]], #4 \n"
+        ASM_SEND_4BYTES_WAIT("14")
+        ASM_DELAY2()
+        "   ldr   %[data], [%[buf]], #4 \n"
+        ASM_SEND_4BYTES()
+
+        "   cbz   %[num_bytes], all_done_%= \n"
+        "   b     main_loop_%= \n"
+
+    "last_bytes_%=: \n"
+        /* Fewer than 16 bytes left: undo the subtraction and send them singly */
+        "   add  %[num_bytes], %[num_bytes], #16 \n"
+    "last_bytes_loop_%=: \n"
+        "   ldrb    %[data], [%[buf]], #1 \n"
+        ASM_LOAD_DATA("0")
+
+        /* NOTE(review): this proceeds once ACK_TIMER - 15 <= num_bytes, where
+         * num_bytes still includes the byte about to be sent - confirm this
+         * cannot leave 16 REQs outstanding. */
+        ASM_WAIT_ACK_TIMER("15")
+        ASM_SEND_DATA()
+        ASM_DELAY1()
+        ASM_SET_REQ_LOW()
+        ASM_DELAY2()
+
+        "   subs %[num_bytes], %[num_bytes], #1 \n"
+        "   bne  last_bytes_loop_%= \n"
+    "all_done_%=: \n"
+        ASM_DELAY1()
+
+    : /* Output */ [tmp1] "+l" (tmp1), [tmp2] "+l" (tmp2), [data] "+r" (data),
+                   [buf] "+r" (buf), [num_bytes] "+r" (num_bytes)
+    : /* Input */ [ack_timer] "r" (ack_timer),
+                  [bop_req_low] "I" (SCSI_OUT_REQ << 16),
+                  [out_port_bop] "r"(out_port_bop),
+                  [byte_lookup] "r" (byte_lookup),
+                  [reset_flag] "r" (resetFlag)
+    : /* Clobber */
+      "cc",      /* subs/cmp modify the condition flags */
+      "memory"); /* reads buf and the lookup table, accesses registers via pointers */
+
+#undef ASM_DELAY1
+#undef ASM_DELAY2
+
+    SCSI_RELEASE_DATA_REQ();
+}
+
+// Specialized routine for settings:
+// <=200 ns period, >=15 outstanding REQs
+//
+// Sends num_bytes bytes from buf to the SCSI bus. The main loop handles 16 bytes
+// per iteration (four 32-bit loads); any remainder is sent one byte at a time.
+// The ACK timer is polled so that REQs do not run too far ahead of the ACKs,
+// and the wait loops abort the transfer when *resetFlag becomes nonzero.
+// Data and REQ are released before returning.
+static void sync_send_200ns_15off(const uint8_t *buf, uint32_t num_bytes, volatile int *resetFlag)
+{
+    volatile uint32_t *out_port_bop = (volatile uint32_t*)&GPIO_BOP(SCSI_OUT_PORT);
+    volatile uint32_t *ack_timer = &TIMER_CNT(SCSI_SYNC_TIMER);
+    const uint32_t *byte_lookup = g_scsi_out_byte_to_bop;
+    register uint32_t tmp1 = 0;
+    register uint32_t tmp2 = 0;
+    register uint32_t data = 0;
+
+// Per-routine delays inserted by the ASM_SEND_* macros: three delay steps
+// after the data write, four after REQ goes low.
+#define ASM_DELAY1() ASM_DELAY() ASM_DELAY() ASM_DELAY()
+#define ASM_DELAY2() ASM_DELAY() ASM_DELAY() ASM_DELAY() ASM_DELAY()
+
+    asm volatile (
+    "main_loop_%=: \n"
+        /* num_bytes is decremented up front, so inside the loop it holds the
+         * count that will remain after this 16-byte group. */
+        "   subs  %[num_bytes], %[num_bytes], #16 \n"
+        "   bmi     last_bytes_%= \n"
+
+        /* At each point make sure there is at most 15 bytes in flight */
+        "   ldr   %[data], [%[buf]], #4 \n"
+        ASM_SEND_4BYTES_WAIT("22")
+        ASM_DELAY2()
+        "   ldr   %[data], [%[buf]], #4 \n"
+        ASM_SEND_4BYTES()
+        ASM_DELAY2()
+        "   ldr   %[data], [%[buf]], #4 \n"
+        ASM_SEND_4BYTES_WAIT("14")
+        ASM_DELAY2()
+        "   ldr   %[data], [%[buf]], #4 \n"
+        ASM_SEND_4BYTES()
+
+        "   cbz   %[num_bytes], all_done_%= \n"
+        "   b     main_loop_%= \n"
+
+    "last_bytes_%=: \n"
+        /* Fewer than 16 bytes left: undo the subtraction and send them singly */
+        "   add  %[num_bytes], %[num_bytes], #16 \n"
+    "last_bytes_loop_%=: \n"
+        "   ldrb    %[data], [%[buf]], #1 \n"
+        ASM_LOAD_DATA("0")
+
+        /* NOTE(review): this proceeds once ACK_TIMER - 15 <= num_bytes, where
+         * num_bytes still includes the byte about to be sent - confirm this
+         * cannot leave 16 REQs outstanding. */
+        ASM_WAIT_ACK_TIMER("15")
+        ASM_SEND_DATA()
+        ASM_DELAY1()
+        ASM_SET_REQ_LOW()
+        ASM_DELAY2()
+
+        "   subs %[num_bytes], %[num_bytes], #1 \n"
+        "   bne  last_bytes_loop_%= \n"
+    "all_done_%=: \n"
+        ASM_DELAY1()
+
+    : /* Output */ [tmp1] "+l" (tmp1), [tmp2] "+l" (tmp2), [data] "+r" (data),
+                   [buf] "+r" (buf), [num_bytes] "+r" (num_bytes)
+    : /* Input */ [ack_timer] "r" (ack_timer),
+                  [bop_req_low] "I" (SCSI_OUT_REQ << 16),
+                  [out_port_bop] "r"(out_port_bop),
+                  [byte_lookup] "r" (byte_lookup),
+                  [reset_flag] "r" (resetFlag)
+    : /* Clobber */
+      "cc",      /* subs/cmp modify the condition flags */
+      "memory"); /* reads buf and the lookup table, accesses registers via pointers */
+
+#undef ASM_DELAY1
+#undef ASM_DELAY2
+
+    SCSI_RELEASE_DATA_REQ();
+}
+
+// Specialized routine for settings:
+// <=260 ns period, >=7 outstanding REQs
+//
+// Sends num_bytes bytes from buf to the SCSI bus. The main loop handles 4 bytes
+// per iteration (one 32-bit load); any remainder is sent one byte at a time.
+// The ACK timer is polled so that REQs do not run too far ahead of the ACKs,
+// and the wait loops abort the transfer when *resetFlag becomes nonzero.
+// Data and REQ are released before returning.
+static void sync_send_260ns_7off(const uint8_t *buf, uint32_t num_bytes, volatile int *resetFlag)
+{
+    volatile uint32_t *out_port_bop = (volatile uint32_t*)&GPIO_BOP(SCSI_OUT_PORT);
+    volatile uint32_t *ack_timer = &TIMER_CNT(SCSI_SYNC_TIMER);
+    const uint32_t *byte_lookup = g_scsi_out_byte_to_bop;
+    register uint32_t tmp1 = 0;
+    register uint32_t tmp2 = 0;
+    register uint32_t data = 0;
+
+// Per-routine delays inserted by the ASM_SEND_* macros: six delay steps
+// after the data write, eight after REQ goes low.
+#define ASM_DELAY1() ASM_DELAY() ASM_DELAY() ASM_DELAY() ASM_DELAY() \
+                     ASM_DELAY() ASM_DELAY()
+#define ASM_DELAY2() ASM_DELAY() ASM_DELAY() ASM_DELAY() ASM_DELAY() \
+                     ASM_DELAY() ASM_DELAY() ASM_DELAY() ASM_DELAY()
+
+    asm volatile (
+    "main_loop_%=: \n"
+        /* num_bytes is decremented up front, so inside the loop it holds the
+         * count that will remain after this 4-byte group. */
+        "   subs  %[num_bytes], %[num_bytes], #4 \n"
+        "   bmi     last_bytes_%= \n"
+
+        /* Wait until ACK_TIMER - 7 <= num_bytes, i.e. at most 7 REQs will be
+         * outstanding once this 4-byte group has been sent (at most 3 bytes
+         * in flight before it). */
+        "   ldr   %[data], [%[buf]], #4 \n"
+        ASM_SEND_4BYTES_WAIT("7")
+
+        "   cbz   %[num_bytes], all_done_%= \n"
+        "   b     main_loop_%= \n"
+
+    "last_bytes_%=: \n"
+        /* Fewer than 4 bytes left: undo the subtraction and send them singly */
+        "   add  %[num_bytes], %[num_bytes], #4 \n"
+    "last_bytes_loop_%=: \n"
+        "   ldrb    %[data], [%[buf]], #1 \n"
+        ASM_LOAD_DATA("0")
+
+        ASM_WAIT_ACK_TIMER("5")
+        ASM_SEND_DATA()
+        ASM_DELAY1()
+        ASM_SET_REQ_LOW()
+        ASM_DELAY2()
+
+        "   subs %[num_bytes], %[num_bytes], #1 \n"
+        "   bne  last_bytes_loop_%= \n"
+    "all_done_%=: \n"
+        ASM_DELAY1()
+
+    : /* Output */ [tmp1] "+l" (tmp1), [tmp2] "+l" (tmp2), [data] "+r" (data),
+                   [buf] "+r" (buf), [num_bytes] "+r" (num_bytes)
+    : /* Input */ [ack_timer] "r" (ack_timer),
+                  [bop_req_low] "I" (SCSI_OUT_REQ << 16),
+                  [out_port_bop] "r"(out_port_bop),
+                  [byte_lookup] "r" (byte_lookup),
+                  [reset_flag] "r" (resetFlag)
+    : /* Clobber */
+      "cc",      /* subs/cmp modify the condition flags */
+      "memory"); /* reads buf and the lookup table, accesses registers via pointers */
+
+#undef ASM_DELAY1
+#undef ASM_DELAY2
+
+    SCSI_RELEASE_DATA_REQ();
+}
+
 void scsi_accel_sync_send(const uint8_t* data, uint32_t count, volatile int *resetFlag)
 {
-    for (int i = 0; i < count; i++)
+    // Send count bytes from data to the host in synchronous mode.
+    // Picks a hand-tuned assembler routine based on the negotiated sync
+    // parameters of the current target, or a generic C loop if none fits,
+    // then waits until the ACK timer has counted down to zero.
+    // All waits are abandoned when *resetFlag becomes nonzero.
+    //
+    // Timer counts down from the initial number of bytes.
+    TIMER_CNT(SCSI_SYNC_TIMER) = count;
+    TIMER_CTL0(SCSI_SYNC_TIMER) = TIMER_CTL0_CEN | TIMER_CTL0_DIR;
+
+    int syncOffset = scsiDev.target->syncOffset;
+    int syncPeriod = scsiDev.target->syncPeriod;
+
+    // NOTE(review): syncPeriod appears to be in units of 4 ns (25 -> 100 ns,
+    // 50 -> 200 ns, 65 -> 260 ns, matching the routine names) - confirm.
+    if (syncOffset >= 15 && syncPeriod <= 25)
     {
-        SCSI_OUT_DATA(data[i]);
-        delay_100ns();
-        SCSI_OUT(REQ, 1);
-        delay_ns(200);
-        SCSI_OUT(REQ, 0);
-        delay_ns(500);
+        sync_send_100ns_15off(data, count, resetFlag);
     }
-    SCSI_RELEASE_DATA_REQ();
+    else if (syncOffset >= 15 && syncPeriod <= 50)
+    {
+        sync_send_200ns_15off(data, count, resetFlag);
+    }
+    else if (syncOffset >= 7 && syncPeriod <= 65)
+    {
+        sync_send_260ns_7off(data, count, resetFlag);
+    }
+    else
+    {
+        azdbg("No optimized routine for syncOffset=", syncOffset, " syncPeriod=", syncPeriod, ", using fallback");
+        while (count-- > 0)
+        {
+            // count now excludes the byte about to be sent; hold off while the
+            // timer value exceeds count + syncOffset.
+            while (TIMER_CNT(SCSI_SYNC_TIMER) > count + syncOffset && !*resetFlag);
+
+            SCSI_OUT_DATA(*data++);
+            delay_ns(syncPeriod * 2);
+            SCSI_OUT(REQ, 0);
+            delay_ns(syncPeriod * 2);
+        }
+        delay_ns(syncPeriod * 2);
+        SCSI_RELEASE_DATA_REQ();
+    }
+
+    // Wait for the timer to reach zero before stopping it.
+    while (TIMER_CNT(SCSI_SYNC_TIMER) > 0 && !*resetFlag);
+
+    TIMER_CTL0(SCSI_SYNC_TIMER) = 0;
 }
 
 
-#endif
+#endif

+ 6 - 0
src/AzulSCSI_disk.cpp

@@ -334,6 +334,12 @@ void s2s_configInit(S2S_BoardCfg* config)
     config->selectionDelay = ini_getl("SCSI", "SelectionDelay", 255, CONFIGFILE);
     config->flags6 = 0;
     config->scsiSpeed = PLATFORM_MAX_SCSI_SPEED;
+
+    int maxSyncSpeed = ini_getl("SCSI", "MaxSyncSpeed", 10, CONFIGFILE);
+    if (maxSyncSpeed < 5 && config->scsiSpeed > S2S_CFG_SPEED_ASYNC_50)
+        config->scsiSpeed = S2S_CFG_SPEED_ASYNC_50;
+    else if (maxSyncSpeed < 10 && config->scsiSpeed > S2S_CFG_SPEED_SYNC_5)
+        config->scsiSpeed = S2S_CFG_SPEED_SYNC_5;
     
     azlog("-- SelectionDelay: ", (int)config->selectionDelay);
 

+ 10 - 2
src/AzulSCSI_log_trace.cpp

@@ -97,11 +97,19 @@ static void printNewPhase(int phase)
             break;
         
         case DATA_IN:
-            azdbg("---- DATA_IN");
+            if (scsiDev.target->syncOffset > 0)
+                azdbg("---- DATA_IN, syncOffset ", (int)scsiDev.target->syncOffset,
+                                   " syncPeriod ", (int)scsiDev.target->syncPeriod);
+            else
+                azdbg("---- DATA_IN");
             break;
         
         case DATA_OUT:
-            azdbg("---- DATA_OUT");
+            if (scsiDev.target->syncOffset > 0)
+                azdbg("---- DATA_OUT, syncOffset ", (int)scsiDev.target->syncOffset,
+                                    " syncPeriod ", (int)scsiDev.target->syncPeriod);
+            else
+                azdbg("---- DATA_OUT");
             break;
         
         case MESSAGE_IN: