Sfoglia il codice sorgente

fw: optimized memcpy() for the aligned xword case

H. Peter Anvin 3 anni fa
parent
commit
7df391b813
8 file modificati con 4898 aggiunte e 4806 eliminazioni
  1. BIN
      fpga/output_files/max80.jbc
  2. BIN
      fpga/output_files/max80.jic
  3. BIN
      fpga/output_files/max80.pof
  4. BIN
      fpga/output_files/max80.sof
  5. 5 2
      fw/Makefile
  6. 4804 4804
      fw/boot.mif
  7. 2 0
      fw/fw.h
  8. 87 0
      fw/memcpy.S

BIN
fpga/output_files/max80.jbc


BIN
fpga/output_files/max80.jic


BIN
fpga/output_files/max80.pof


BIN
fpga/output_files/max80.sof


+ 5 - 2
fw/Makefile

@@ -37,8 +37,11 @@ boot_width  := 32
 boot_stride := 1
 
 max80.elf: head.o die.o dummy.o irqtable.o irqasm.o sbrk.o hello.o \
-	  console.o sdcard.o rtc.o memset.o \
-	  abcdrive.o testdata.o fatfs.a
+	  console.o sdcard.o rtc.o \
+	  abcdrive.o \
+	  memset.o memcpy.o \
+	  testdata.o \
+	  fatfs.a
 
 FATFS_C = $(wildcard fatfs/source/*.c)
 FATFS_O = $(FATFS_C:.c=.o)

File diff suppressed because it is too large
+ 4804 - 4804
fw/boot.mif


+ 2 - 0
fw/fw.h

@@ -12,6 +12,8 @@
 #define memset(s,c,n)	__builtin_memset(s,c,n)
 #define memcpy(d,s,n)	__builtin_memcpy(d,s,n)
 #define memmove(d,s,n)	__builtin_memmove(d,s,n)
+extern void *
+__memcpy_aligned(void * __restrict, const void * __restrict, size_t);
 
 #define likely(x)	__builtin_expect(!!(x), 1)
 #define unlikely(x)	__builtin_expect(!!(x), 0)

+ 87 - 0
fw/memcpy.S

@@ -0,0 +1,87 @@
+	.section ".text.memcpy","ax"
+
+	.balign 4			// memcpy entry stub: a0 = dest, a1 = src, a2 = byte count
+	.globl	memcpy			// a0 is not written here, so dest stays in the return register
+memcpy:
+	or	a5, a0, a1		// merge the bits of dest and src ...
+	or	a5, a5, a2		// ... and of the byte count
+	andi	a5, a5, 3		// nonzero if any of the three is not a multiple of 4
+	.option	norvc			// assemble the branch as a full 32-bit instruction (presumably to keep the fall-through 4-byte aligned)
+	bnez	a5, __memcpy_misaligned	// something is unaligned: take the byte-by-byte copy
+	.option rvc			// otherwise execution falls through into __memcpy_aligned
+
+	.type memcpy, @function
+	.size memcpy, . - memcpy
+	
+
+	// void *__memcpy_aligned(void *dest, const void *src, size_t n)
+	//
+	// Word-at-a-time copy for the case where dest (a0), src (a1) and
+	// n (a2) are all multiples of 4.  memcpy falls through to here once
+	// it has checked that.  a0 is never written, so dest is returned.
+	//
+	// The leading (n & 28) bytes are copied first by jumping into the
+	// middle of the unrolled 8-word block below; everything after that
+	// is copied 32 bytes per loop iteration.
+	//
+	// Registers: a3 = destination cursor, a4 = end of destination,
+	//            a2 = size of the chunk being copied, a5 = scratch.
+	.balign 4
+	.globl __memcpy_aligned
+__memcpy_aligned:
+	add	a4, a0, a2		// a4 = dest + n
+	mv	a3, a0			// a3 = running destination pointer
+
+	andi	a2, a2, 7*4		// a2 = bytes in the leading partial block
+	// Every .L_caseN stanza is one c.lw + c.sw pair, i.e. 4 bytes of
+	// code per 4 bytes copied, so (.L_case0 - a2) is the entry point
+	// that copies exactly a2 bytes.
+	// norelax: the %hi/%lo pair is split by the sub, so the linker is
+	// presumably not allowed to rewrite it.
+	.option	norelax
+	lui	a5, %hi(.L_case0)
+	sub	a5, a5, a2
+	// Plain indirect jump with rd = zero.  The two-operand
+	// "jalr rs, imm" spelling links into ra, which would destroy the
+	// caller's return address and make the final ret jump back here.
+	jalr	zero, a5, %lo(.L_case0)
+	.option	relax
+
+	.balign	4
+.L_aligned_loop:
+	add	a1, a1, a2		// advance src past the previous chunk
+	li	a2, 32			// all later chunks are a full 8 words
+	c.lw	a5, 28(a1)
+	c.sw	a5, 28(a3)
+.L_case7:
+	c.lw	a5, 24(a1)
+	c.sw	a5, 24(a3)
+.L_case6:
+	c.lw	a5, 20(a1)
+	c.sw	a5, 20(a3)
+.L_case5:
+	c.lw	a5, 16(a1)
+	c.sw	a5, 16(a3)
+.L_case4:
+	c.lw	a5, 12(a1)
+	c.sw	a5, 12(a3)
+.L_case3:
+	c.lw	a5,  8(a1)
+	c.sw	a5,  8(a3)
+.L_case2:
+	c.lw	a5,  4(a1)
+	c.sw	a5,  4(a3)
+.L_case1:
+	c.lw	a5,  0(a1)
+	c.sw	a5,  0(a3)
+.L_case0:
+	add	a3, a3, a2		// advance dest past the chunk just copied
+	bltu	a3, a4, .L_aligned_loop	// loop while dest cursor < end
+
+.L_empty:
+	ret
+
+	.type	__memcpy_aligned, @function
+	.size	__memcpy_aligned, . - __memcpy_aligned
+
+	// Byte-at-a-time fallback that memcpy branches to when dest, src
+	// or n is not a multiple of 4.
+	// In:  a0 = dest, a1 = src, a2 = byte count
+	// Out: a0 = dest (never written here)
+	// Uses a3 (destination cursor), a4 (end of destination), a5 (data).
+	.balign 4
+__memcpy_misaligned:
+	.option norvc
+	add	a4, a0, a2		// a4 = dest + n
+	mv	a3, a0			// a3 = running destination pointer
+	// The loop below tests its condition at the bottom, so a zero
+	// length has to be filtered out here or one byte would be copied.
+	beqz	a2, .L_misaligned_done
+	.option	rvc
+
+	// This could be optimized if it ever matters...
+	.balign 4
+.L_misaligned_loop:
+	lbu	a5, 0(a1)
+	sb	a5, 0(a3)
+	addi	a1, a1, 1
+	addi	a3, a3, 1
+	bltu	a3, a4, .L_misaligned_loop
+
+.L_misaligned_done:
+	ret
+
+	.type	__memcpy_misaligned, @function
+	.size	__memcpy_misaligned, . - __memcpy_misaligned
+	

Some files were not shown because too many files changed in this diff