Forráskód Böngészése

Pin down alignment for writeDataLoop and readDataLoop, and perform some additional timing adjustment.

Glenn Anderson 3 éve
szülő
commit
dd87235fc0
1 módosított fájl, 45 hozzáadás és 20 törlés
  1. 45 20
      src/BlueSCSI.cpp

+ 45 - 20
src/BlueSCSI.cpp

@@ -743,27 +743,35 @@ void writeDataPhase(int len, const byte* p)
 }
 
 #if READ_SPEED_OPTIMIZE
+#pragma GCC push_options
+#pragma GCC optimize ("-Os")
 /*
  * This loop is tuned to repeat the following pattern:
  * 1) Set REQ
- * 2) 5-6 cycles of work/delay
+ * 2) 5 cycles of work/delay
  * 3) Wait for ACK
  * Cycle time tunings are for 72MHz STM32F103
+ * Alignment matters. For the 3-instruction wait loops, it looks like crossing
+ * an 8-byte prefetch buffer boundary can add 2 wait cycles on every taken branch.
  */
+void writeDataLoop(uint32_t blocksize) __attribute__ ((aligned(8)));
 void writeDataLoop(uint32_t blocksize)
 {
-#define REQ_ON() (*db_dst = BITMASK(vREQ)<<16);
+#define REQ_ON() (port_b->BRR = req_bit);
 #define FETCH_BSRR_DB() (bsrr_val = bsrr_tbl[*srcptr++])
-#define REQ_OFF_DB_SET(BSRR_VAL) *db_dst = BSRR_VAL;
-#define WAIT_ACK_ACTIVE()   while(!SCSI_IN(vACK))
-#define WAIT_ACK_INACTIVE() while(SCSI_IN(vACK))
+#define REQ_OFF_DB_SET(BSRR_VAL) port_b->BSRR = BSRR_VAL;
+#define WAIT_ACK_ACTIVE()   while((*port_a_idr>>(vACK&15)&1))
+#define WAIT_ACK_INACTIVE() while(!(*port_a_idr>>(vACK&15)&1))
 
   register byte *srcptr= m_buf;                 // Source buffer
   register byte *endptr= m_buf + blocksize;     // End pointer
 
   register const uint32_t *bsrr_tbl = db_bsrr;  // Table to convert to BSRR
   register uint32_t bsrr_val;                   // BSRR value to output (DB, DBP, REQ = ACTIVE)
-  register volatile uint32_t *db_dst = &(GPIOB->regs->BSRR); // Output port
+
+  register uint32_t req_bit = BITMASK(vREQ);
+  register gpio_reg_map *port_b = PBREG;
+  register volatile uint32_t *port_a_idr = &(GPIOA->regs->IDR);
 
   // Start the first bus cycle.
   FETCH_BSRR_DB();
@@ -772,19 +780,19 @@ void writeDataLoop(uint32_t blocksize)
   FETCH_BSRR_DB();
   WAIT_ACK_ACTIVE();
   REQ_OFF_DB_SET(bsrr_val);
+  // Align the starts of the do/while and WAIT loops to an 8-byte prefetch boundary.
+  asm("nop.w;nop");
   do{
     WAIT_ACK_INACTIVE();
     REQ_ON();
-    // 6 cycle delay before reading ACK.
-    // Store plus 2 loads is 6 cycles.
-    REQ_ON();
+    // 4 cycles of work
     FETCH_BSRR_DB();
+    // Extra 1 cycle delay while keeping the loop within an 8 byte prefetch.
+    asm("nop");
     WAIT_ACK_ACTIVE();
     REQ_OFF_DB_SET(bsrr_val);
-    // 5 cycle delay before reading ACK.
-    // Branch taken is 2-4, seems to be taking 3. A second write is 2 more cycles.
-    // cmp is being pipelined in to a store so doesn't add any time.
-    REQ_OFF_DB_SET(bsrr_val);
+    // Extra 1 cycle delay, plus 4 cycles for the branch taken with prefetch.
+    asm("nop");
   }while(srcptr < endptr);
   WAIT_ACK_INACTIVE();
   // Finish the last bus cycle, byte is already on DB.
@@ -793,6 +801,7 @@ void writeDataLoop(uint32_t blocksize)
   REQ_OFF_DB_SET(bsrr_val);
   WAIT_ACK_INACTIVE();
 }
+#pragma GCC pop_options
 #endif
 
 /* 
@@ -841,32 +850,48 @@ void readDataPhase(int len, byte* p)
     p[i] = readHandshake();
 }
 
+#if WRITE_SPEED_OPTIMIZE
+#pragma GCC push_options
+#pragma GCC optimize ("-Os")
+    
+/*
+ * See writeDataLoop for optimization info.
+ */
+void readDataLoop(uint32_t blockSize) __attribute__ ((aligned(8)));
 void readDataLoop(uint32_t blockSize)
 {
   register byte *dstptr= m_buf;
   register byte *endptr= m_buf + blockSize - 1;
 
-#define REQ_ON() (port_b->BSRR = BITMASK(vREQ)<<16);
-#define REQ_OFF() (port_b->BSRR = BITMASK(vREQ));
-#define WAIT_ACK_ACTIVE()   while((*ack_src>>(vACK&15)&1))
-#define WAIT_ACK_INACTIVE() while(!(*ack_src>>(vACK&15)&1))
+#define REQ_ON() (port_b->BRR = req_bit);
+#define REQ_OFF() (port_b->BSRR = req_bit);
+#define WAIT_ACK_ACTIVE()   while((*port_a_idr>>(vACK&15)&1))
+#define WAIT_ACK_INACTIVE() while(!(*port_a_idr>>(vACK&15)&1))
+  register uint32_t req_bit = BITMASK(vREQ);
   register gpio_reg_map *port_b = PBREG;
-  register volatile uint32_t *ack_src = &(GPIOA->regs->IDR);
+  register volatile uint32_t *port_a_idr = &(GPIOA->regs->IDR);
   REQ_ON();
+  // Start of the do/while and WAIT are already aligned to 8 bytes.
   do {
     WAIT_ACK_ACTIVE();
-    uint32_t ret = GPIOB->regs->IDR;
+    uint32_t ret = port_b->IDR;
     REQ_OFF();
     *dstptr++ = ~(ret >> 8);
+    // Move the wait loop into a single 8-byte prefetch buffer
+    asm("nop.w");
     WAIT_ACK_INACTIVE();
     REQ_ON();
+    // Extra 1 cycle delay
+    asm("nop");
   } while(dstptr<endptr);
   WAIT_ACK_ACTIVE();
   uint32_t ret = GPIOB->regs->IDR;
   REQ_OFF();
-  *dstptr++ = ~(ret >> 8);
+  *dstptr = ~(ret >> 8);
   WAIT_ACK_INACTIVE();
 }
+#pragma GCC pop_options
+#endif
 
 /*
  * Data out phase.