kernel: unroll MIPS r4k cache blast function

Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.

On ar71xx, I measured a routing throughput increase of ~8%.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Rosen Penev <rosenp@gmail.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Felix Fietkau 2017-12-04 22:44:33 +01:00
parent 916277a033
commit 4e8f1e9f4c
3 changed files with 180 additions and 7 deletions
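
The change replaces the one-line-at-a-time walk in the blast_xxx_range macros with an unrolled 8/4/2/1 walk over whole cache lines, which lets the compiler schedule the cache ops back to back. A minimal standalone C sketch of the generated pattern (cache_line_op and blast_range_unrolled are illustrative stand-ins for the macro-generated kernel code, not real kernel identifiers):

/* Illustrative stand-in for one MIPS "cache" instruction on one line. */
static inline void cache_line_op(unsigned long addr)
{
	/* real code: prot##cache_op(hitop, addr) */
	(void)addr;
}

/* Unrolled blast over [start, end): 8 lines per iteration, then a
 * 4/2/1 tail for the remainder, mirroring the patch in this commit. */
static void blast_range_unrolled(unsigned long start, unsigned long end,
				 unsigned long lsize)
{
	unsigned long addr = start & ~(lsize - 1);
	unsigned long aend = (end + lsize - 1) & ~(lsize - 1);
	int lines = (aend - addr) / lsize;

	while (lines >= 8) {
		cache_line_op(addr);
		cache_line_op(addr + lsize);
		cache_line_op(addr + 2 * lsize);
		cache_line_op(addr + 3 * lsize);
		cache_line_op(addr + 4 * lsize);
		cache_line_op(addr + 5 * lsize);
		cache_line_op(addr + 6 * lsize);
		cache_line_op(addr + 7 * lsize);
		addr += 8 * lsize;
		lines -= 8;
	}
	if (lines & 4) {
		cache_line_op(addr);
		cache_line_op(addr + lsize);
		cache_line_op(addr + 2 * lsize);
		cache_line_op(addr + 3 * lsize);
		addr += 4 * lsize;
	}
	if (lines & 2) {
		cache_line_op(addr);
		cache_line_op(addr + lsize);
		addr += 2 * lsize;
	}
	if (lines & 1)
		cache_line_op(addr);
}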

Changed file 1 of 3:

@@ -204,7 +204,7 @@
 #define __BUILD_BLAST_USER_CACHE(pfx, desc, indexop, hitop, lsize) \
 static inline void blast_##pfx##cache##lsize##_user_page(unsigned long page) \
-@@ -660,17 +744,19 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
+@@ -660,53 +744,23 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
 __BUILD_BLAST_USER_CACHE(i, icache, Index_Invalidate_I, Hit_Invalidate_I, 64)
 /* build blast_xxx_range, protected_blast_xxx_range */
@@ -214,18 +214,59 @@
  	unsigned long end) \
  { \
  	unsigned long lsize = cpu_##desc##_line_size(); \
+-	unsigned long lsize_2 = lsize * 2; \
+-	unsigned long lsize_3 = lsize * 3; \
+-	unsigned long lsize_4 = lsize * 4; \
+-	unsigned long lsize_5 = lsize * 5; \
+-	unsigned long lsize_6 = lsize * 6; \
+-	unsigned long lsize_7 = lsize * 7; \
+-	unsigned long lsize_8 = lsize * 8; \
  	unsigned long addr = start & ~(lsize - 1); \
- 	unsigned long aend = (end - 1) & ~(lsize - 1); \
+-	unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+-	int lines = (aend - addr) / lsize; \
++	unsigned long aend = (end - 1) & ~(lsize - 1); \
 +	war \
  \
  	__##pfx##flush_prologue \
  \
- 	while (1) { \
+-	while (lines >= 8) { \
+-		prot##cache_op(hitop, addr); \
+-		prot##cache_op(hitop, addr + lsize); \
+-		prot##cache_op(hitop, addr + lsize_2); \
+-		prot##cache_op(hitop, addr + lsize_3); \
+-		prot##cache_op(hitop, addr + lsize_4); \
+-		prot##cache_op(hitop, addr + lsize_5); \
+-		prot##cache_op(hitop, addr + lsize_6); \
+-		prot##cache_op(hitop, addr + lsize_7); \
+-		addr += lsize_8; \
+-		lines -= 8; \
+-	} \
+-	\
+-	if (lines & 0x4) { \
+-		prot##cache_op(hitop, addr); \
+-		prot##cache_op(hitop, addr + lsize); \
+-		prot##cache_op(hitop, addr + lsize_2); \
+-		prot##cache_op(hitop, addr + lsize_3); \
+-		addr += lsize_4; \
+-	} \
+-	\
+-	if (lines & 0x2) { \
+-		prot##cache_op(hitop, addr); \
+-		prot##cache_op(hitop, addr + lsize); \
+-		addr += lsize_2; \
+-	} \
+-	\
+-	if (lines & 0x1) { \
++	while (1) { \
 +		war2 \
  		prot##cache_op(hitop, addr); \
- 		if (addr == aend) \
- 			break; \
- 		addr += lsize; \
++		if (addr == aend) \
++			break; \
++		addr += lsize; \
  	} \
  \
  	__##pfx##flush_epilogue \
-@@ -682,8 +768,8 @@ static inline void prot##extra##blast_##
+@@ -714,8 +768,8 @@ static inline void prot##extra##blast_##
 #ifndef CONFIG_EVA
@@ -236,7 +277,7 @@
 #else
-@@ -720,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
+@@ -752,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
 __BUILD_PROT_BLAST_CACHE_RANGE(i, icache, Hit_Invalidate_I)
 #endif
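
This first file is an existing target patch that carries the war/war2 DMA workaround hooks. After this refresh it keeps the compact walk for its hunk, since the unrolling now lives in the new generic patch files added below. A minimal sketch of that compact form, with the hook points shown as comments (cache_line_op is the same illustrative stub as in the sketch above):

static void blast_range_compact(unsigned long start, unsigned long end,
				unsigned long lsize)
{
	unsigned long addr = start & ~(lsize - 1);
	unsigned long aend = (end - 1) & ~(lsize - 1);	/* last line, inclusive */

	/* "war" hook expands here, once per call */
	while (1) {
		/* "war2" hook expands here, once per line */
		cache_line_op(addr);	/* stands in for prot##cache_op(hitop, addr) */
		if (addr == aend)
			break;
		addr += lsize;
	}
}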

Changed file 2 of 3:

@@ -0,0 +1,66 @@
From: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Date: Fri, 7 Jun 2013 18:35:22 -0500
Subject: MIPS: r4k_cache: use more efficient cache blast

Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -682,16 +682,48 @@ static inline void prot##extra##blast_##
unsigned long end) \
{ \
unsigned long lsize = cpu_##desc##_line_size(); \
+ unsigned long lsize_2 = lsize * 2; \
+ unsigned long lsize_3 = lsize * 3; \
+ unsigned long lsize_4 = lsize * 4; \
+ unsigned long lsize_5 = lsize * 5; \
+ unsigned long lsize_6 = lsize * 6; \
+ unsigned long lsize_7 = lsize * 7; \
+ unsigned long lsize_8 = lsize * 8; \
unsigned long addr = start & ~(lsize - 1); \
- unsigned long aend = (end - 1) & ~(lsize - 1); \
+ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+ int lines = (aend - addr) / lsize; \
\
__##pfx##flush_prologue \
\
- while (1) { \
+ while (lines >= 8) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ prot##cache_op(hitop, addr + lsize_4); \
+ prot##cache_op(hitop, addr + lsize_5); \
+ prot##cache_op(hitop, addr + lsize_6); \
+ prot##cache_op(hitop, addr + lsize_7); \
+ addr += lsize_8; \
+ lines -= 8; \
+ } \
+ \
+ if (lines & 0x4) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ addr += lsize_4; \
+ } \
+ \
+ if (lines & 0x2) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ addr += lsize_2; \
+ } \
+ \
+ if (lines & 0x1) { \
prot##cache_op(hitop, addr); \
- if (addr == aend) \
- break; \
- addr += lsize; \
} \
\
__##pfx##flush_epilogue \
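
One subtlety in the patch above: the old bound, aend = (end - 1) & ~(lsize - 1), is the address of the last line, inclusive, paired with the addr == aend exit; the new bound rounds end up to a line boundary, exclusive, so the number of lines can be precomputed. Both forms cover exactly the same lines for any non-empty range. A small worked example (all values chosen arbitrarily):

#include <stdio.h>

int main(void)
{
	unsigned long lsize = 32;	/* example: 32-byte cache lines */
	unsigned long start = 0x1005, end = 0x10e3;

	unsigned long addr = start & ~(lsize - 1);		/* 0x1000 */
	/* old bound: last line to touch, inclusive */
	unsigned long aend_old = (end - 1) & ~(lsize - 1);	/* 0x10e0 */
	/* new bound: end rounded up to a line boundary, exclusive */
	unsigned long aend_new = (end + lsize - 1) & ~(lsize - 1); /* 0x1100 */
	int lines = (aend_new - addr) / lsize;			/* 8 */

	printf("new count: %d lines, old walk: %lu lines\n",
	       lines, (aend_old - addr) / lsize + 1);
	return 0;
}

Both computations report 8 lines here, which is why the unrolled version can switch to a counted loop without changing which lines are blasted.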

Changed file 3 of 3:

@@ -0,0 +1,66 @@
From: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Date: Fri, 7 Jun 2013 18:35:22 -0500
Subject: MIPS: r4k_cache: use more efficient cache blast

Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -665,16 +665,48 @@ static inline void prot##extra##blast_##pfx##cache##_range(unsigned long start,
unsigned long end) \
{ \
unsigned long lsize = cpu_##desc##_line_size(); \
+ unsigned long lsize_2 = lsize * 2; \
+ unsigned long lsize_3 = lsize * 3; \
+ unsigned long lsize_4 = lsize * 4; \
+ unsigned long lsize_5 = lsize * 5; \
+ unsigned long lsize_6 = lsize * 6; \
+ unsigned long lsize_7 = lsize * 7; \
+ unsigned long lsize_8 = lsize * 8; \
unsigned long addr = start & ~(lsize - 1); \
- unsigned long aend = (end - 1) & ~(lsize - 1); \
+ unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+ int lines = (aend - addr) / lsize; \
\
__##pfx##flush_prologue \
\
- while (1) { \
+ while (lines >= 8) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ prot##cache_op(hitop, addr + lsize_4); \
+ prot##cache_op(hitop, addr + lsize_5); \
+ prot##cache_op(hitop, addr + lsize_6); \
+ prot##cache_op(hitop, addr + lsize_7); \
+ addr += lsize_8; \
+ lines -= 8; \
+ } \
+ \
+ if (lines & 0x4) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ prot##cache_op(hitop, addr + lsize_2); \
+ prot##cache_op(hitop, addr + lsize_3); \
+ addr += lsize_4; \
+ } \
+ \
+ if (lines & 0x2) { \
+ prot##cache_op(hitop, addr); \
+ prot##cache_op(hitop, addr + lsize); \
+ addr += lsize_2; \
+ } \
+ \
+ if (lines & 0x1) { \
prot##cache_op(hitop, addr); \
- if (addr == aend) \
- break; \
- addr += lsize; \
} \
\
__##pfx##flush_epilogue \