Browse Source

riscv32: add the ctz instruction for better interrupt latency

Add the ctz instruction, and use it in the interrupt entry path to
reduce interrupt latency.

Restructure some of the code around the interrupt handling.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
H. Peter Anvin 2 years ago
parent
commit
bf9aa45886

BIN
esp32/output/max80.ino.bin


+ 2 - 2
fpga/max80.qpf

@@ -19,12 +19,12 @@
 #
 # Quartus Prime
 # Version 21.1.0 Build 842 10/21/2021 SJ Lite Edition
-# Date created = 15:49:53  December 22, 2022
+# Date created = 22:41:13  January 22, 2023
 #
 # -------------------------------------------------------------------------- #
 
 QUARTUS_VERSION = "21.1"
-DATE = "15:49:53  December 22, 2022"
+DATE = "22:41:13  January 22, 2023"
 
 # Revisions
 

BIN
fpga/output/bypass.jic


BIN
fpga/output/bypass.rbf.gz


BIN
fpga/output/bypass.rpd.gz


BIN
fpga/output/bypass.sof


BIN
fpga/output/bypass.svf.gz


BIN
fpga/output/bypass.xsvf.gz


BIN
fpga/output/max80.fw


BIN
fpga/output/v1.fw


BIN
fpga/output/v1.jic


BIN
fpga/output/v1.rbf.gz


BIN
fpga/output/v1.rpd.gz


BIN
fpga/output/v1.sof


BIN
fpga/output/v1.svf.gz


BIN
fpga/output/v1.xsvf.gz


BIN
fpga/output/v2.fw


BIN
fpga/output/v2.jic


BIN
fpga/output/v2.rbf.gz


BIN
fpga/output/v2.rpd.gz


BIN
fpga/output/v2.sof


BIN
fpga/output/v2.svf.gz


BIN
fpga/output/v2.xsvf.gz


+ 30 - 1
fpga/picorv32.v

@@ -72,6 +72,17 @@
 `define PICORV32_V
 
 
+// Count trailing zeros: returns the index of the lowest set bit of rs1,
+// or 32 when rs1 is all zeros.
+//
+// The function is declared automatic so that the counter n is
+// re-initialized to zero on every call.  In a function with static
+// lifetime a declaration initializer is executed only once, and the
+// language requires an explicit lifetime keyword on such a declaration.
+function automatic logic [31:0] do_ctz(logic [31:0] rs1);
+   logic [31:0] n = 32'd0;
+   for (int i = 0; i < 32; i++)
+     begin
+	if (rs1[i])
+	  break;		// lowest set bit found; n == i at this point
+	n++;
+     end
+   do_ctz = n;
+endfunction // do_ctz
+
 /***************************************************************
  * picorv32
  ***************************************************************/
@@ -685,6 +696,7 @@ module picorv32 #(
 	reg instr_add, instr_sub, instr_sll, instr_slt, instr_sltu, instr_xor, instr_srl, instr_sra, instr_or, instr_and;
 	reg instr_csrr, instr_ecall_ebreak;
 	reg instr_addqxi, instr_addxqi, instr_retirq, instr_maskirq, instr_waitirq, instr_timer;
+	reg instr_ctz;
         reg [2:0] instr_funct2;
 
 	wire instr_trap;
@@ -718,7 +730,7 @@ module picorv32 #(
 			instr_lb, instr_lh, instr_lw, instr_lbu, instr_lhu, instr_sb, instr_sh, instr_sw,
 			instr_addi, instr_slti, instr_sltiu, instr_xori, instr_ori, instr_andi, instr_slli, instr_srli, instr_srai,
 			instr_add, instr_sub, instr_sll, instr_slt, instr_sltu, instr_xor, instr_srl, instr_sra, instr_or, instr_and,
-			instr_csrr, instr_addqxi, instr_retirq, instr_maskirq, instr_waitirq, instr_timer};
+			instr_csrr, instr_addqxi, instr_retirq, instr_maskirq, instr_waitirq, instr_timer, instr_ctz};
 
 	reg [63:0] new_ascii_instr;
 	`FORMAL_KEEP reg [63:0] dbg_ascii_instr;
@@ -777,6 +789,7 @@ module picorv32 #(
 		if (instr_and)      new_ascii_instr = "and";
 
 		if (instr_csrr)     new_ascii_instr = "csrr";
+		if (instr_ctz)	    new_ascii_instr = "ctz";
 
 	        if (instr_addqxi)   new_ascii_instr = "addqxi";
 	        if (instr_addxqi)   new_ascii_instr = "addxqi";
@@ -1124,6 +1137,9 @@ module picorv32 #(
 			instr_or    <= is_alu_reg_reg && mem_rdata_q[14:12] == 3'b110 && mem_rdata_q[31:25] == 7'b0000000;
 			instr_and   <= is_alu_reg_reg && mem_rdata_q[14:12] == 3'b111 && mem_rdata_q[31:25] == 7'b0000000;
 
+			instr_ctz   <= is_alu_reg_imm && mem_rdata_q[14:12] == 3'b001 && mem_rdata_q[31:25] == 7'h30 &&
+				       mem_rdata_q[24:20] == 5'h01;
+
 		        instr_csrr     <= (mem_rdata_q[6:0] == 7'b1110011 && mem_rdata_q[13:12] != 2'b00);
 
 			instr_ecall_ebreak <= ((mem_rdata_q[6:0] == 7'b1110011 && !mem_rdata_q[13:12]) ||
@@ -1206,7 +1222,18 @@ module picorv32 #(
 			instr_sra    <= 0;
 			instr_or     <= 0;
 			instr_and    <= 0;
+
+		        instr_ctz    <= 0;
+		        instr_csrr   <= 0;
+
 		        instr_addqxi <= 0;
+		        instr_addxqi <= 0;
+		        instr_maskirq <= 0;
+		        instr_waitirq <= 0;
+		        instr_timer   <= 0;
+
+		        instr_ecall_ebreak <= 0;
+
 		end
 	end
 
@@ -1321,6 +1348,8 @@ module picorv32 #(
 				alu_out = reg_op1 | reg_op2;
 			instr_andi || instr_and:
 				alu_out = reg_op1 & reg_op2;
+		        instr_ctz:
+			        alu_out = do_ctz(reg_op1);
 			BARREL_SHIFTER && (instr_sll || instr_slli):
 				alu_out = alu_shl;
 			BARREL_SHIFTER && (instr_srl || instr_srli || instr_sra || instr_srai):

+ 1 - 2
rv32/.gitignore

@@ -7,8 +7,7 @@
 *.ver
 *.hex
 *.map
-*.build/
-tools/gnu/
+*.lst
 iodevs.h
 ioregsa.S
 irqtable.h

+ 6 - 5
rv32/Makefile

@@ -46,8 +46,9 @@ LIBS    = max80.a fatfs.a zlib.a
 ROMS    := $(wildcard roms/*.rom)
 ROMOBJS  = $(ROMS:.rom=.o)
 
-LIBOBJ   = head.o dummy.o die.o system.o \
-	   ioregsa.o irqasm.o irqtable.o spurious_irq.o \
+FORCEOBJ = head.o dummy.o die.o system.o killed.o
+
+LIBOBJ   = debug.o ioregsa.o irqasm.o irqtable.o spurious_irq.o \
 	   console.o rtc.o romcopy.o spiflash.o esp.o matchver.o \
 	   sdcard.o \
 	   abcmem.o abcio.o abcdisk.o abcrtc.o abcpun80.o \
@@ -114,7 +115,7 @@ max80.elf: diskcache.o
 
 jtagupd.elf: sbrk.o
 
-%.elf: %.ild %.o $(LIBS)
+%.elf: %.ild $(FORCEOBJ) %.o $(LIBS)
 	$(CC) $(LDFLAGS) -Wl,-T,$< -o $@ \
 		-Wl,--start-group $(filter-out $<,$^) -Wl,--end-group
 
@@ -128,10 +129,10 @@ jtagupd.elf: sbrk.o
 	$(CC) $(CFLAGS) $(CFLAGS_$<) $(gendeps) -E -o $@ $<
 
 %.o: %.S | $(genhdrs)
-	$(CC) $(SFLAGS) $(SFLAGS_$<) $(gendeps) -c -o $@ $<
+	$(CC) $(SFLAGS) $(SFLAGS_$<) -Wa,-ahlsm=$*.lst $(gendeps) -c -o $@ $<
 
 %.s: %.S | $(genhdrs)
-	$(CC) $(SFLAGS) $(SFLAGS_$<) $(gendeps) -E -o $@ $<
+	$(CC) $(SFLAGS) $(SFLAGS_$<) -Wa,-ahlsm=$*.lst $(gendeps) -E -o $@ $<
 
 ioregsa.S: ioregs.h ioregsa.pl | $(genhdrs)
 	$(PERL) ioregsa.pl $< $@

+ 1 - 1
rv32/checksum.h

@@ -1,4 +1,4 @@
 #ifndef CHECKSUM_H
 #define CHECKSUM_H
-#define SDRAM_SUM 0x6138ff71
+#define SDRAM_SUM 0x08d13422
 #endif

+ 10 - 0
rv32/common.h

@@ -51,6 +51,16 @@ static inline __constfunc size_t _align_up(size_t addr, size_t align)
 }
 #define align_up(a,l)   ((__typeof__(a))_align_up((size_t)(a),(l)))
 
+/*
+ * Return mask with its lowest set bit cleared (0 stays 0).
+ * Defined static inline, like _align_up() in this header, so that
+ * translation units which do not use it get neither an unused-function
+ * warning nor a private out-of-line copy.
+ */
+static inline uint32_t mask_lowest_set_bit(uint32_t mask)
+{
+    return mask & (mask-1);
+}
+
+/*
+ * Return only the lowest set bit of mask (0 if mask is 0):
+ * the difference between mask and mask with that bit cleared.
+ */
+static inline uint32_t lowest_set_bit(uint32_t mask)
+{
+    return mask - mask_lowest_set_bit(mask);
+}
+
 struct esplink_head;
 extern struct esplink_head esplink_head;
 extern uint32_t esplink[];

+ 47 - 0
rv32/debug.S

@@ -0,0 +1,47 @@
+#include "compiler.h"
+#include "sections.h"
+#include "picorv32.h"
+#include "ioregs.h"
+
+	// Debug functions to read and write x-registers from interrupt
+	// mode by register number
+	.pushsection ".text.hot.rdxreg","ax"
+	.globl	rdxreg
+	.balign 4
+rdxreg:
+	la a3,_xreg_smc		// a3 -> patchable stub (nop, ret, then two template words)
+	lw a2,8(a3)		// a2 = template word at stub+8: addqxi a0,zero,0
+	andi a0,a0,31		// keep only the low 5 bits of the register number in a0
+	slli a0,a0,15		// move the number to bit 15, the rs1 field
+	or a2,a2,a0		// a2 = template with the requested register as rs1
+	sw a2,(a3)		// store the patched word over the first stub word
+	jr a3			// run the patched word; the ret after it returns to our caller
+	.type rdxreg, @function
+	.size rdxreg, . - rdxreg
+	.popsection
+
+	.pushsection ".text.hot.wrxreg","ax"
+	.globl	wrxreg
+	.balign 4
+wrxreg:
+	la a3,_xreg_smc		// a3 -> patchable stub (nop, ret, then two template words)
+	lw a2,12(a3)		// a2 = template word at stub+12: addxqi zero,a1,0
+	andi a0,a0,31		// keep only the low 5 bits of the register number in a0
+	slli a0,a0,7		// move the number to bit 7, the rd field
+	or a2,a2,a0		// a2 = template with the requested register as rd
+	sw a2,(a3)		// store the patched word over the first stub word
+	jr a3			// run the patched word (source operand is a1); the ret after it returns to our caller
+	.type wrxreg, @function
+	.size wrxreg, . - wrxreg
+	.popsection
+
+	__rwtext
+	.balign 4
+	.option norvc		// rdxreg/wrxreg use offsets 8 and 12 from _xreg_smc, so no compressed encodings here
+_xreg_smc:
+	nop			// +0: overwritten by rdxreg/wrxreg with the patched instruction
+	ret			// +4: executed right after the patched word
+	addqxi a0,zero,0	// +8: template loaded by rdxreg
+	addxqi zero,a1,0	// +12: template loaded by wrxreg
+	.type _xreg_smc, @function
+	.size _xreg_smc, . - _xreg_smc

+ 1 - 1
rv32/head.S

@@ -44,7 +44,7 @@ _reset:
 	.type _reset, @function
 	.size _reset, . - _reset
 
-	.section ".init","ax"
+	.section ".text.hot","ax"
 	.globl __start
 __start:
 	not t0,zero

+ 7 - 3
rv32/irq.h

@@ -5,17 +5,18 @@
 #include "picorv32.h"
 #include "iodevs.h"		/* For _IRQ constants */
 
-typedef void (*irqhandler_t)(unsigned int vector, size_t pc);
+typedef void (*irqhandler_t)(unsigned int);
 extern irqhandler_t __irq_handler_table[];
 
 #define IRQHANDLER_DECL(x,n)						\
-    void irqhandler_##x##_##n (unsigned int vector __unused,		\
-			       size_t pc __unused)
+    void irqhandler_##x##_##n (unsigned int vector __unused)
 
 #define IRQHANDLER(x,n)				\
     IRQHANDLER_DECL(x,n);			\
     __hot __text_hot IRQHANDLER_DECL(x,n)
 
+IRQHANDLER_DECL(spurious,0);
+
 typedef unsigned int irqmask_t;
 
 /* Disable IRQs except fatal system errors (to facilitate debugging) */
@@ -69,4 +70,7 @@ static inline irqmask_t wait_for_irq(void)
 size_t rdxreg(unsigned int reg);
 void wrxreg(unsigned int reg, size_t val);
 
+#define IRQ_PC_REGISTER			"s10"
+#define IRQ_VECTOR_MASK_REGISTER	"s11"
+
 #endif /* IRQ_H */

+ 13 - 97
rv32/irqasm.S

@@ -12,115 +12,31 @@
 	.balign 4
 	.globl _irq
 	.option push
-	.option norvc	// Alignment matters more here
+	.option norvc		// Just messes up alignment
+	.option arch, +zbb	// Enable the ctz instruction
 _irq:
 	addqxi sp,sp,0
 	// s10 contains the IRQ return address, s11 the mask of
 	// IRQs to be handled.
 
 	// Send EOI for all interrupts (previously done in hardware)
+	// -> move back to hardware?
 	sw s11,SYS_EOI(zero)
 
-	// Fast dispatch for the ABC-bus interrupt handler
-	andi s0,s11,1 << ABC_IRQ
-	beqz s0,1f
-	jal irqhandler_abc_0
-	sub s11,s11,s0
-	beqz s11,.L_done
-1:
-
-	li s1, 0
 .Lirq_loop:
-	// ctz would make this more efficient...
-#if IRQ_VECTORS > 16
-	slli t0,s11,16
-	bnez t0,1f
-	srli s11,s11,16
-	addi s1,s1,16*4
-1:
-#endif
-#if IRQ_VECTORS > 8
-	zext.b t0,s11
-	bnez t0,2f
-	srli s11,s11,8
-	addi s1,s1,8*4
-2:
-#endif
-#if IRQ_VECTORS > 4
-	andi t0,s11,15
-	bnez t0,3f
-	srli s11,s11,4
-	addi s1,s1,4*4
-3:
-#endif
-#if IRQ_VECTORS > 2
-	andi t0,s11,3
-	bnez t0,4f
-	srli s11,s11,2
-	addi s1,s1,2*4
-4:
-#endif
-#if IRQ_VECTORS > 1
-	andi t0,s11,1
-	bnez t0,5f
-	srli s11,s11,1
-	addi s1,s1,1*4
-5:
-#endif
-	srli a0,s1,2	// Vector number
-	mv   a1,s10	// PC (including the rvc flag)
-	.option norelax	// ld will mess up trying to relax this instruction
-	jalr s1,%lo(__irq_handler_table) // Must be in zero page
+	ctz a0,s11	// Vector number: index of the lowest set bit in s11, left in a0 for the handler
+	slli t1,a0,2	// Table index: vector number times 4, the byte offset into the table
+	.option norelax	// Relaxing this instruction will do bad things
+	lw ra,%lo(__irq_handler_table)(t1)	// ra = table entry for this vector (only the %lo part of the table address is used)
 	.option relax
-	srli s11,s11,1
-	addi s1,s1,4*1
-	bnez s11,.Lirq_loop
+	jalr ra		// Call the handler whose address was just loaded
 
-.L_done:
+	// Strip the lowest set bit of s11
+	addi t1,s11,-1	// t1 = s11 - 1
+	and  s11,s11,t1	// s11 &= s11 - 1
+	bnez s11,.Lirq_loop	// Bits still set in s11: dispatch the next vector
 	mret
+
 	.type _irq, @function
 	.size _irq, . - _irq
 	.option pop
-
-	// Debug functions to read and write x-registers from interrupt
-	// mode by register number
-	.pushsection ".text.hot.rdxreg","ax"
-	.globl	rdxreg
-	.balign 4
-rdxreg:
-	la a3,_xreg_smc
-	lw a2,8(a3)		// addqxi a0,zero,0
-	andi a0,a0,31
-	slli a0,a0,15		// rs1
-	or a2,a2,a0
-	sw a2,(a3)
-	jr a3
-	.type rdxreg, @function
-	.size rdxreg, . - rdxreg
-	.popsection
-
-	.pushsection ".text.hot.wrxreg","ax"
-	.globl	wrxreg
-	.balign 4
-wrxreg:
-	la a3,_xreg_smc
-	lw a2,12(a3)		// addxqi zero,a1,0
-	andi a0,a0,31
-	slli a0,a0,7		// rd
-	or a2,a2,a0
-	sw a2,(a3)
-	jr a3
-	.type wrxreg, @function
-	.size wrxreg, . - wrxreg
-	.popsection
-
-	__rwtext
-	.balign 4
-	.option norvc
-_xreg_smc:
-	nop
-	ret
-	addqxi a0,zero,0
-	addxqi zero,a1,0
-	.type _xreg_smc, @function
-	.size _xreg_smc, . - _xreg_smc

+ 3 - 4
rv32/irqtable.S

@@ -3,16 +3,15 @@
 #include "picorv32.h"
 #include "iodevs.h"
 
-	__rwtext
+	__sdata
 	.balign	4
-	.option	norvc
 	.globl	__irq_handler_table
 __irq_handler_table:
 
 #define IRQENTRY(name,irqbase,irqn,irqcount)	\
-	j	irqhandler_ ## name ## _ ## irqn
+	.long	irqhandler_ ## name ## _ ## irqn
 
 #include "irqtable.h"
 
 	.size __irq_handler_table, . - __irq_handler_table
-	.type __irq_handler_table, @function
+	.type __irq_handler_table, @object

+ 2 - 5
rv32/jtagupd.c

@@ -6,16 +6,13 @@
 
 #define VJTAG_FLASH_CMD	0xabc80046
 
-/* IRQ handler not needed in this system */
-IRQHANDLER_DECL(spurious,0);
-
 IRQHANDLER(abc,0)
 {
-    irqhandler_spurious_0(vector, pc);
+    irqhandler_spurious_0(vector);
 }
 IRQHANDLER(tty,1)
 {
-    irqhandler_spurious_0(vector, pc);
+    irqhandler_spurious_0(vector);
 }
 
 /*

+ 54 - 0
rv32/killed.c

@@ -0,0 +1,54 @@
+#include "common.h"
+#include "io.h"
+#include "sys.h"
+#include "console.h"
+#include "esp.h"
+#include "irq.h"
+
+/*
+ * Print a fatal error report on the console, then soft-reset the system.
+ *
+ * how - text describing the error, printed after "ERROR: "
+ * pc  - program counter reported by the interrupt entry; bit 0 is
+ *       treated as a flag here (presumably "compressed instruction" -
+ *       TODO confirm) and selects a step back of 3 instead of 4 bytes
+ *
+ * The report contains the adjusted pc, the 32 bits of code found there,
+ * the mtval CSR, and all 32 x-registers (read with rdxreg(), 8 per line).
+ *
+ * Don't mark no_return or gcc moves it to SDRAM
+ */
+static void __hot __text_hot killed(const char *how, size_t pc)
+{
+    /* Cannot use con_printf() here */
+    const uint16_t *pcp;
+    size_t mtval;
+
+    asm volatile("csrr %0,mtval" : "=r" (mtval));
+
+    /* Try to move back to the previous instruction (if not a jump...) */
+    pc += -4 + (pc & 1);
+    pcp = (const uint16_t *)pc;
+
+    con_puts(hotstr("ERROR: "));
+    con_puts(how);
+    con_puts(hotstr(" at 0x"));
+    con_print_hex(pc);
+    con_puts(hotstr(" (0x"));
+    /*
+     * Read the instruction as two halfwords since pc may be only 2-byte
+     * aligned.  Widen before shifting: a bare pcp[1] is promoted to a
+     * signed int, and shifting a value with bit 15 set left by 16 would
+     * overflow it, which is undefined behavior.
+     */
+    con_print_hex(((uint32_t)pcp[1] << 16) + pcp[0]);
+    con_puts(hotstr(")\nBad address: 0x"));
+    con_print_hex(mtval);
+    con_putc('\n');
+
+    /* Dump x0..x31, eight registers per output line */
+    for (int i = 0; i < 32; i += 8) {
+	for (int j = 0; j < 8; j++) {
+	    uint32_t v = rdxreg(i+j);
+	    con_print_hex(v);
+	    con_putc((j == 7) ? '\n' : ' ');
+	}
+    }
+
+    /* Let the output drain and stay visible for a while before resetting */
+    con_flush();
+    udelay(5000000);
+    reset(SYS_RESET_SOFT);
+}
+
+register size_t _pc asm(IRQ_PC_REGISTER);	/* global register variable bound to the register named by IRQ_PC_REGISTER */
+
+IRQHANDLER(buserr,0)	/* reports "misaligned" together with the value of _pc */
+{
+    killed(hotstr("misaligned"), _pc);
+}
+
+IRQHANDLER(ebreak,0)	/* reports "invalid instruction" together with the value of _pc */
+{
+    killed(hotstr("invalid instruction"), _pc);
+}

+ 2 - 0
rv32/spurious_irq.c

@@ -4,6 +4,8 @@
 #include "irq.h"
 
 /* Spurious interrupt; just mask it */
+register irqmask_t irq_vector_mask asm(IRQ_VECTOR_MASK_REGISTER);
+
 IRQHANDLER(spurious,0)
 {
     mask_irq(vector);

+ 0 - 46
rv32/system.c

@@ -19,52 +19,6 @@ void __hot con_print_hex(unsigned int n)
     }
 }
 
-/* Don't mark no_return or gcc moves it to SDRAM */
-static void __hot __text_hot killed(const char *how, size_t pc)
-{
-    /* Cannot use con_printf() here */
-    const uint16_t *pcp;
-    size_t mtval;
-
-    asm volatile("csrr %0,mtval" : "=r" (mtval));
-
-    /* Try to move back to the previous instruction (if not a jump...) */
-    pc += -4 + (pc & 1);
-    pcp = (const uint16_t *)pc;
-
-    con_puts(hotstr("ERROR: "));
-    con_puts(how);
-    con_puts(hotstr(" at 0x"));
-    con_print_hex(pc);
-    con_puts(hotstr(" (0x"));
-    con_print_hex((pcp[1] << 16) + pcp[0]);
-    con_puts(hotstr(")\nBad address: 0x"));
-    con_print_hex(mtval);
-    con_putc('\n');
-
-    for (int i = 0; i < 32; i += 8) {
-	for (int j = 0; j < 8; j++) {
-	    uint32_t v = rdxreg(i+j);
-	    con_print_hex(v);
-	    con_putc((j == 7) ? '\n' : ' ');
-	}
-    }
-
-    con_flush();
-    udelay(5000000);
-    reset(SYS_RESET_SOFT);
-}
-
-IRQHANDLER(buserr,0)
-{
-    killed(hotstr("misaligned"), pc);
-}
-
-IRQHANDLER(ebreak,0)
-{
-    killed(hotstr("invalid instruction"), pc);
-}
-
 volatile __sbss uint32_t timer_irq_count;
 IRQHANDLER(sysclock,0)
 {