kernel: unroll MIPS r4k cache blast function

Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.

On ar71xx, I measured a routing throughput increase of ~8%.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Rosen Penev <rosenp@gmail.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Felix Fietkau 2017-12-04 22:44:33 +01:00
parent 916277a033
commit 4e8f1e9f4c
3 changed files with 180 additions and 7 deletions
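
The change replaces the one-line-at-a-time walk in the blast_xxx_range macros with an unrolled 8/4/2/1 walk over whole cache lines, which lets the compiler schedule the cache ops back to back. A minimal standalone C sketch of the generated pattern (cache_line_op and blast_range_unrolled are illustrative stand-ins for the macro-generated kernel code, not real kernel identifiers):

/* Illustrative stand-in for one MIPS "cache" instruction on one line. */
static inline void cache_line_op(unsigned long addr)
{
	/* real code: prot##cache_op(hitop, addr) */
	(void)addr;
}

/* Unrolled blast over [start, end): 8 lines per iteration, then a
 * 4/2/1 tail for the remainder, mirroring the patch in this commit. */
static void blast_range_unrolled(unsigned long start, unsigned long end,
				 unsigned long lsize)
{
	unsigned long addr = start & ~(lsize - 1);
	unsigned long aend = (end + lsize - 1) & ~(lsize - 1);
	int lines = (aend - addr) / lsize;

	while (lines >= 8) {
		cache_line_op(addr);
		cache_line_op(addr + lsize);
		cache_line_op(addr + 2 * lsize);
		cache_line_op(addr + 3 * lsize);
		cache_line_op(addr + 4 * lsize);
		cache_line_op(addr + 5 * lsize);
		cache_line_op(addr + 6 * lsize);
		cache_line_op(addr + 7 * lsize);
		addr += 8 * lsize;
		lines -= 8;
	}
	if (lines & 4) {
		cache_line_op(addr);
		cache_line_op(addr + lsize);
		cache_line_op(addr + 2 * lsize);
		cache_line_op(addr + 3 * lsize);
		addr += 4 * lsize;
	}
	if (lines & 2) {
		cache_line_op(addr);
		cache_line_op(addr + lsize);
		addr += 2 * lsize;
	}
	if (lines & 1)
		cache_line_op(addr);
}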

Changed file 1 of 3:

@@ -204,7 +204,7 @@
 #define __BUILD_BLAST_USER_CACHE(pfx, desc, indexop, hitop, lsize) \
 static inline void blast_##pfx##cache##lsize##_user_page(unsigned long page) \
-@@ -660,17 +744,19 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
+@@ -660,53 +744,23 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
 __BUILD_BLAST_USER_CACHE(i, icache, Index_Invalidate_I, Hit_Invalidate_I, 64)
 /* build blast_xxx_range, protected_blast_xxx_range */
@@ -214,18 +214,59 @@
  	unsigned long end) \
  { \
  	unsigned long lsize = cpu_##desc##_line_size(); \
+-	unsigned long lsize_2 = lsize * 2; \
+-	unsigned long lsize_3 = lsize * 3; \
+-	unsigned long lsize_4 = lsize * 4; \
+-	unsigned long lsize_5 = lsize * 5; \
+-	unsigned long lsize_6 = lsize * 6; \
+-	unsigned long lsize_7 = lsize * 7; \
+-	unsigned long lsize_8 = lsize * 8; \
  	unsigned long addr = start & ~(lsize - 1); \
- 	unsigned long aend = (end - 1) & ~(lsize - 1); \
+-	unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+-	int lines = (aend - addr) / lsize; \
++	unsigned long aend = (end - 1) & ~(lsize - 1); \
 +	war \
  \
  	__##pfx##flush_prologue \
  \
- 	while (1) { \
+-	while (lines >= 8) { \
+-		prot##cache_op(hitop, addr); \
+-		prot##cache_op(hitop, addr + lsize); \
+-		prot##cache_op(hitop, addr + lsize_2); \
+-		prot##cache_op(hitop, addr + lsize_3); \
+-		prot##cache_op(hitop, addr + lsize_4); \
+-		prot##cache_op(hitop, addr + lsize_5); \
+-		prot##cache_op(hitop, addr + lsize_6); \
+-		prot##cache_op(hitop, addr + lsize_7); \
+-		addr += lsize_8; \
+-		lines -= 8; \
+-	} \
+-	\
+-	if (lines & 0x4) { \
+-		prot##cache_op(hitop, addr); \
+-		prot##cache_op(hitop, addr + lsize); \
+-		prot##cache_op(hitop, addr + lsize_2); \
+-		prot##cache_op(hitop, addr + lsize_3); \
+-		addr += lsize_4; \
+-	} \
+-	\
+-	if (lines & 0x2) { \
+-		prot##cache_op(hitop, addr); \
+-		prot##cache_op(hitop, addr + lsize); \
+-		addr += lsize_2; \
+-	} \
+-	\
+-	if (lines & 0x1) { \
++	while (1) { \
 +		war2 \
  		prot##cache_op(hitop, addr); \
- 		if (addr == aend) \
- 			break; \
- 		addr += lsize; \
++		if (addr == aend) \
++			break; \
++		addr += lsize; \
  	} \
  \
  	__##pfx##flush_epilogue \
-@@ -682,8 +768,8 @@ static inline void prot##extra##blast_##
+@@ -714,8 +768,8 @@ static inline void prot##extra##blast_##
 #ifndef CONFIG_EVA
@@ -236,7 +277,7 @@
 #else
-@@ -720,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
+@@ -752,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
 __BUILD_PROT_BLAST_CACHE_RANGE(i, icache, Hit_Invalidate_I)
 #endif
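
This first file is an existing target patch that carries the war/war2 DMA workaround hooks. After this refresh it keeps the compact walk for its hunk, since the unrolling now lives in the new generic patch files added below. A minimal sketch of that compact form, with the hook points shown as comments (cache_line_op is the same illustrative stub as in the sketch above):

static void blast_range_compact(unsigned long start, unsigned long end,
				unsigned long lsize)
{
	unsigned long addr = start & ~(lsize - 1);
	unsigned long aend = (end - 1) & ~(lsize - 1);	/* last line, inclusive */

	/* "war" hook expands here, once per call */
	while (1) {
		/* "war2" hook expands here, once per line */
		cache_line_op(addr);	/* stands in for prot##cache_op(hitop, addr) */
		if (addr == aend)
			break;
		addr += lsize;
	}
}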

Changed file 2 of 3:

@@ -0,0 +1,66 @@
From: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Date: Fri, 7 Jun 2013 18:35:22 -0500
Subject: MIPS: r4k_cache: use more efficient cache blast

Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -682,16 +682,48 @@ static inline void prot##extra##blast_##
unsigned long end) \
{ \
unsigned long lsize = cpu_##desc##_line_size(); \
+ unsigned long lsize_2 = lsize * 2; \
+ unsigned long lsize_3 = lsize * 3; \
+ unsigned long lsize_4 = lsize * 4; \
+ unsigned long lsize_5 = lsize * 5; \
+ unsigned long lsize_6 = lsize * 6; \
+ unsigned long lsize_7 = lsize * 7; \
+ unsigned long lsize_8 = lsize * 8; \
unsigned long addr = start & ~(lsize - 1); \
- unsigned long aend = (end - 1) & ~(lsize - 1); \
+ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+ int lines = (aend - addr) / lsize; \
\
__##pfx##flush_prologue \
\
- while (1) { \
+ while (lines >= 8) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ prot##cache_op(hitop, addr + lsize_4); \
+ prot##cache_op(hitop, addr + lsize_5); \
+ prot##cache_op(hitop, addr + lsize_6); \
+ prot##cache_op(hitop, addr + lsize_7); \
+ addr += lsize_8; \
+ lines -= 8; \
+ } \
+ \
+ if (lines & 0x4) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ addr += lsize_4; \
+ } \
+ \
+ if (lines & 0x2) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ addr += lsize_2; \
+ } \
+ \
+ if (lines & 0x1) { \
prot##cache_op(hitop, addr); \
- if (addr == aend) \
- break; \
- addr += lsize; \
} \
\
__##pfx##flush_epilogue \
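
One subtlety in the patch above: the old bound, aend = (end - 1) & ~(lsize - 1), is the address of the last line, inclusive, paired with the addr == aend exit; the new bound rounds end up to a line boundary, exclusive, so the number of lines can be precomputed. Both forms cover exactly the same lines for any non-empty range. A small worked example (all values chosen arbitrarily):

#include <stdio.h>

int main(void)
{
	unsigned long lsize = 32;	/* example: 32-byte cache lines */
	unsigned long start = 0x1005, end = 0x10e3;

	unsigned long addr = start & ~(lsize - 1);		/* 0x1000 */
	/* old bound: last line to touch, inclusive */
	unsigned long aend_old = (end - 1) & ~(lsize - 1);	/* 0x10e0 */
	/* new bound: end rounded up to a line boundary, exclusive */
	unsigned long aend_new = (end + lsize - 1) & ~(lsize - 1); /* 0x1100 */
	int lines = (aend_new - addr) / lsize;			/* 8 */

	printf("new count: %d lines, old walk: %lu lines\n",
	       lines, (aend_old - addr) / lsize + 1);
	return 0;
}

Both computations report 8 lines here, which is why the unrolled version can switch to a counted loop without changing which lines are blasted.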

Changed file 3 of 3:

@@ -0,0 +1,66 @@
From: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Date: Fri, 7 Jun 2013 18:35:22 -0500
Subject: MIPS: r4k_cache: use more efficient cache blast

Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -665,16 +665,48 @@ static inline void prot##extra##blast_##pfx##cache##_range(unsigned long start,
unsigned long end) \
{ \
unsigned long lsize = cpu_##desc##_line_size(); \
+ unsigned long lsize_2 = lsize * 2; \
+ unsigned long lsize_3 = lsize * 3; \
+ unsigned long lsize_4 = lsize * 4; \
+ unsigned long lsize_5 = lsize * 5; \
+ unsigned long lsize_6 = lsize * 6; \
+ unsigned long lsize_7 = lsize * 7; \
+ unsigned long lsize_8 = lsize * 8; \
unsigned long addr = start & ~(lsize - 1); \
- unsigned long aend = (end - 1) & ~(lsize - 1); \
+ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+ int lines = (aend - addr) / lsize; \
\
__##pfx##flush_prologue \
\
- while (1) { \
+ while (lines >= 8) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ prot##cache_op(hitop, addr + lsize_4); \
+ prot##cache_op(hitop, addr + lsize_5); \
+ prot##cache_op(hitop, addr + lsize_6); \
+ prot##cache_op(hitop, addr + lsize_7); \
+ addr += lsize_8; \
+ lines -= 8; \
+ } \
+ \
+ if (lines & 0x4) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ addr += lsize_4; \
+ } \
+ \
+ if (lines & 0x2) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ addr += lsize_2; \
+ } \
+ \
+ if (lines & 0x1) { \
prot##cache_op(hitop, addr); \
- if (addr == aend) \
- break; \
- addr += lsize; \
} \
\
__##pfx##flush_epilogue \