Browse Source

Revert "Revert "Optimized ASM bitbang routine""

This reverts commit d7722e9e7e572db62ab98043a4f0220d8afddef3.
Alex Perez 3 years ago
parent
commit
d70c9a4a9c

+ 138 - 59
lib/AzulSCSI_platform_GD32F205/AzulSCSI_platform.cpp

@@ -25,26 +25,12 @@ void delay(unsigned long ms)
 
 void delay_ns(unsigned long ns)
 {
+    uint32_t CNT_start = DWT->CYCCNT;
     if (ns <= 100) return; // Approximate call overhead
     ns -= 100;
 
-    uint32_t VAL_start = SysTick->VAL;
-
-    if (ns > 1000000)
-    {
-        int ms = ns / 1000000;
-        ns = ns - ms * 1000000;
-        delay(ms);
-    }
-
-    int cycles = ((uint64_t)ns * g_ns_to_cycles) >> 32;
-    int end = (int)VAL_start - cycles;
-    if (end <= 0)
-    {
-        end += SysTick->LOAD;
-        while (SysTick->VAL < end);
-    }
-    while (SysTick->VAL > end);
+    uint32_t cycles = ((uint64_t)ns * g_ns_to_cycles) >> 32;
+    while ((uint32_t)(DWT->CYCCNT - CNT_start) < cycles);
 }
 
 void SysTick_Handler(void)
@@ -71,10 +57,14 @@ void azplatform_init()
 
     // Enable SysTick to drive millis()
     g_millisecond_counter = 0;
-    g_ns_to_cycles = ((uint64_t)SystemCoreClock << 32) / 1000000000;
     SysTick_Config(SystemCoreClock / 1000U);
     NVIC_SetPriority(SysTick_IRQn, 0x00U);
 
+    // Enable DWT counter to drive delay_ns()
+    g_ns_to_cycles = ((uint64_t)SystemCoreClock << 32) / 1000000000;
+    CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
+    DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;
+
     // Enable debug output on SWO pin
     DBG_CTL |= DBG_CTL_TRACE_IOEN;
     CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
@@ -294,9 +284,98 @@ extern volatile bool g_busreset;
     } \
   }
 
-#define SD_SPI SPI0
-#define SD_SPI_RX_DMA_CHANNEL DMA_CH1
-#define SD_SPI_TX_DMA_CHANNEL DMA_CH2
+// Optimized ASM blocks for the SCSI communication subroutine
+
+// Take 8 bits from d and format them for writing
+// d is name of data operand, b is bit offset, x is unique label
+#define ASM_LOAD_DATA(d, b, x) \
+"    load_data1_" x "_%=: \n" \
+"        ubfx    %[tmp1], %[" d "], #" b ", #8 \n" \
+"        ldr     %[tmp1], [%[byte_lookup], %[tmp1], lsl #2] \n"
+
+// Write data to SCSI port and set REQ high
+#define ASM_SEND_DATA(x) \
+"    send_data" x "_%=: \n" \
+"        str     %[tmp1], [%[out_port_bop]] \n"
+
+// Wait for ACK to be high, set REQ low, wait ACK low
+#define ASM_HANDSHAKE(x) \
+"        ldr     %[tmp2], [%[ack_pin_bb]] \n" \
+"        str     %[tmp2], [%[req_pin_bb]] \n" \
+"        cbnz    %[tmp2], req_is_low_now" x "_%= \n" \
+"        ldr     %[tmp2], [%[ack_pin_bb]] \n" \
+"        str     %[tmp2], [%[req_pin_bb]] \n" \
+"        cbnz    %[tmp2], req_is_low_now" x "_%= \n" \
+"        ldr     %[tmp2], [%[ack_pin_bb]] \n" \
+"        str     %[tmp2], [%[req_pin_bb]] \n" \
+"        cbnz    %[tmp2], req_is_low_now" x "_%= \n" \
+"    wait_ack_inactive" x "_%=: \n" \
+"        ldr     %[tmp2], [%[ack_pin_bb]] \n" \
+"        str     %[tmp2], [%[req_pin_bb]] \n" \
+"        cbnz    %[tmp2], req_is_low_now" x "_%= \n" \
+"        b.n     wait_ack_inactive" x "_%= \n" \
+"    req_is_low_now" x "_%=: \n" \
+"        ldr     %[tmp2], [%[ack_pin_bb]] \n" \
+"        cbz     %[tmp2], over_ack_active" x "_%= \n" \
+"        ldr     %[tmp2], [%[ack_pin_bb]] \n" \
+"        cbz     %[tmp2], over_ack_active" x "_%= \n" \
+"        ldr     %[tmp2], [%[ack_pin_bb]] \n" \
+"        cbz     %[tmp2], over_ack_active" x "_%= \n" \
+"        ldr     %[tmp2], [%[ack_pin_bb]] \n" \
+"        cbz     %[tmp2], over_ack_active" x "_%= \n" \
+"    wait_ack_active" x "_%=: \n" \
+"        ldr     %[tmp2], [%[ack_pin_bb]] \n" \
+"        cbz     %[tmp2], over_ack_active" x "_%= \n" \
+"        b.n     wait_ack_active" x "_%= \n" \
+"    over_ack_active" x "_%=: \n" \
+
+// Send bytes to SCSI bus using the asynchronous handshake mechanism
+// Takes 4 bytes at a time for sending from buf.
+// Returns the next buffer pointer.
+static inline uint32_t *scsi_send_words_async(uint32_t *buf, uint32_t num_words)
+{
+    volatile uint32_t *out_port_bop = (volatile uint32_t*)&GPIO_BOP(SCSI_OUT_PORT);
+    const uint32_t *byte_lookup = g_scsi_out_byte_to_bop;
+    uint32_t ack_pin_bb = PERIPH_BB_BASE + (((uint32_t)&GPIO_ISTAT(SCSI_ACK_PORT)) - APB1_BUS_BASE) * 32 + 12 * 4;
+    uint32_t req_pin_bb = PERIPH_BB_BASE + (((uint32_t)out_port_bop) - APB1_BUS_BASE) * 32 + (9 + 16) * 4;
+    register uint32_t tmp1 = 0;
+    register uint32_t tmp2 = 0;
+    register uint32_t data = 0;
+
+    asm volatile (
+    "   ldr      %[data], [%[buf]], #4 \n" \
+        ASM_LOAD_DATA("data", "0", "first")
+
+    "inner_loop_%=: \n" \
+        ASM_SEND_DATA("0")
+        ASM_LOAD_DATA("data", "8", "8")
+        ASM_HANDSHAKE("0")
+        
+        ASM_SEND_DATA("8")
+        ASM_LOAD_DATA("data", "16", "16")
+        ASM_HANDSHAKE("8")
+
+        ASM_SEND_DATA("16")
+        ASM_LOAD_DATA("data", "24", "24")
+        ASM_HANDSHAKE("16")
+
+        ASM_SEND_DATA("24")
+    "   ldr      %[data], [%[buf]], #4 \n" \
+        ASM_LOAD_DATA("data", "0", "0")
+        ASM_HANDSHAKE("24")
+
+    "   subs     %[num_words], %[num_words], #1 \n" \
+    "   bne     inner_loop_%= \n"
+    : /* Output */ [tmp1] "+l" (tmp1), [tmp2] "+l" (tmp2), [data] "+r" (data),
+                   [buf] "+r" (buf), [num_words] "+r" (num_words)
+    : /* Input */ [ack_pin_bb] "r" (ack_pin_bb),
+                  [req_pin_bb] "r" (req_pin_bb),
+                  [out_port_bop] "r"(out_port_bop),
+                  [byte_lookup] "r" (byte_lookup)
+    : /* Clobber */ );
+
+    return buf - 1;
+}
 
 class GD32SPIDriver : public SdSpiBaseClass
 {
@@ -346,7 +425,7 @@ public:
         };
 
         // Select closest available divider based on system frequency
-        int divider = SystemCoreClock / m_sckfreq;
+        int divider = (SystemCoreClock + m_sckfreq / 2) / m_sckfreq;
         if (divider <= 2)
             config.prescale = SPI_PSC_2;
         else if (divider <= 4)
@@ -438,48 +517,48 @@ public:
     }
 
     // Stream data directly to SCSI bus
-    uint8_t stream_receive(size_t count)
-    {         
-        // Handle first byte
-        SPI_DATA(SD_SPI) = 0xFF;
-        while (!(SPI_STAT(SD_SPI) & SPI_STAT_RBNE));
-        uint8_t data = SPI_DATA(SD_SPI);
-        SCSI_OUT_DATA(data);
-        SPI_DATA(SD_SPI) = 0xFF;
-        SCSI_WAIT_INACTIVE(ACK);
-        SCSI_OUT(REQ, 1);
+    uint8_t stream_receive(uint8_t *buf, size_t count)
+    {
+        uint8_t tx_data = 0xFF;
+        DMA_INTC(DMA0) = DMA_FLAG_ADD(DMA_FLAG_FTF | DMA_FLAG_ERR, SD_SPI_RX_DMA_CHANNEL);
+        DMA_INTC(DMA0) = DMA_FLAG_ADD(DMA_FLAG_FTF | DMA_FLAG_ERR, SD_SPI_TX_DMA_CHANNEL);
+        DMA_CHMADDR(DMA0, SD_SPI_RX_DMA_CHANNEL) = (uint32_t)buf;
+        DMA_CHMADDR(DMA0, SD_SPI_TX_DMA_CHANNEL) = (uint32_t)&tx_data;
+        DMA_CHCTL(DMA0, SD_SPI_TX_DMA_CHANNEL) &= ~DMA_CHXCTL_MNAGA; // No memory increment for TX
+        DMA_CHCNT(DMA0, SD_SPI_RX_DMA_CHANNEL) = count;
+        DMA_CHCNT(DMA0, SD_SPI_TX_DMA_CHANNEL) = count;
+        DMA_CHCTL(DMA0, SD_SPI_RX_DMA_CHANNEL) |= DMA_CHXCTL_CHEN;
+        DMA_CHCTL(DMA0, SD_SPI_TX_DMA_CHANNEL) |= DMA_CHXCTL_CHEN;
 
-        // Handle main payload
-        for (size_t i = 1; i < count - 1; i++)
+        SPI_CTL1(SD_SPI) |= SPI_CTL1_DMAREN | SPI_CTL1_DMATEN;
+        
+        // DMA transfer is now running, we can start sending received bytes to SCSI
+        uint32_t *word_ptr = (uint32_t*)buf;
+        uint32_t *end_ptr = word_ptr + (count / 4);
+        while (word_ptr < end_ptr)
         {
-            // Wait that host confirms previous reception
-            SCSI_WAIT_ACTIVE(ACK);
-            SCSI_OUT(REQ, 0);
-
-            // Wait for received byte
-            while (!(SPI_STAT(SD_SPI) & SPI_STAT_RBNE));
-            data = SPI_DATA(SD_SPI);
+            uint32_t words_available = (count - DMA_CHCNT(DMA0, SD_SPI_RX_DMA_CHANNEL)) / 4;
+            if (words_available > 0)
+            {
+                if (word_ptr + words_available > end_ptr)
+                {
+                    words_available = end_ptr - word_ptr;
+                }
 
-            // Stream byte to SCSI
-            SCSI_OUT_DATA(data);
-            
-            // Start SPI transfer for next byte
-            SPI_DATA(SD_SPI) = 0xFF;
+                word_ptr = scsi_send_words_async(word_ptr, words_available);
+            }
+        }
+        
+        SCSI_RELEASE_DATA_REQ();
 
-            SCSI_WAIT_INACTIVE(ACK); // This takes long enough to fullfill the 100 ns setup time.
-            SCSI_OUT(REQ, 1);
+        if (DMA_INTF(DMA0) & DMA_FLAG_ADD(DMA_FLAG_ERR, SD_SPI_RX_DMA_CHANNEL))
+        {
+            azlog("ERROR: SPI DMA receive set DMA_FLAG_ERR");
         }
 
-        // Handle last byte
-        while (!(SPI_STAT(SD_SPI) & SPI_STAT_RBNE));
-        data = SPI_DATA(SD_SPI);
-        SCSI_OUT_DATA(data);
-        delay_100ns(); // DB hold time before REQ (DTC-510B)
-        SCSI_WAIT_INACTIVE(ACK);
-        SCSI_OUT(REQ, 1);
-        SCSI_WAIT_ACTIVE(ACK);
-        SCSI_RELEASE_DATA_REQ();
-        SCSI_WAIT_INACTIVE(ACK);
+        SPI_CTL1(SD_SPI) &= ~(SPI_CTL1_DMAREN | SPI_CTL1_DMATEN);
+        DMA_CHCTL(DMA0, SD_SPI_RX_DMA_CHANNEL) &= ~DMA_CHXCTL_CHEN;
+        DMA_CHCTL(DMA0, SD_SPI_TX_DMA_CHANNEL) &= ~DMA_CHXCTL_CHEN;
 
         m_stream_status += count;
         return 0;
@@ -561,7 +640,7 @@ void sdCsWrite(SdCsPin_t pin, bool level)
 }
 
 GD32SPIDriver g_sd_spi_port;
-SdSpiConfig g_sd_spi_config(0, DEDICATED_SPI, SD_SCK_MHZ(25), &g_sd_spi_port);
+SdSpiConfig g_sd_spi_config(0, DEDICATED_SPI, SD_SCK_MHZ(30), &g_sd_spi_port);
 
 void azplatform_prepare_stream(uint8_t *buffer)
 {

+ 4 - 1
lib/AzulSCSI_platform_GD32F205/AzulSCSI_platform.h

@@ -93,6 +93,9 @@ extern const char *g_azplatform_name;
 #define SD_CLK_PIN   GPIO_PIN_5
 #define SD_MISO_PIN  GPIO_PIN_6
 #define SD_MOSI_PIN  GPIO_PIN_7
+#define SD_SPI       SPI0
+#define SD_SPI_RX_DMA_CHANNEL DMA_CH1
+#define SD_SPI_TX_DMA_CHANNEL DMA_CH2
 
 // DIP switches
 #define DIP_PORT     GPIOB
@@ -131,7 +134,7 @@ static inline void delay_us(unsigned long us)
 // Approximate fast delay
 static inline void delay_100ns()
 {
-    asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop"); asm("nop");
+    asm volatile ("nop \n nop \n nop \n nop \n nop");
 }
 
 // Initialize SPI and GPIO configuration