kernel: unroll MIPS r4k cache blast function
Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking. On ar71xx, I measured a routing
throughput increase of ~8%.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Rosen Penev <rosenp@gmail.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
parent 916277a033
commit 4e8f1e9f4c
3 changed files with 180 additions and 7 deletions
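For orientation, here is a minimal standalone sketch of the unrolled blast-range pattern the new patch introduces. The names (blast_range_unrolled, cache_line_op) and the fixed 32-byte line size are illustrative assumptions for this sketch only; the actual change is the r4kcache.h macro diff shown below.

/*
 * Sketch of the unrolled cache-blast loop. cache_line_op() stands in
 * for the MIPS cache instruction issued per line by the real macro
 * (prot##cache_op(hitop, addr)); it is not a kernel API.
 */
#include <stdio.h>

#define LINE_SIZE 32UL	/* assumed cache line size for the example */

static void cache_line_op(unsigned long addr)
{
	/* placeholder for the per-line cache operation */
	printf("cache op @ 0x%lx\n", addr);
}

static void blast_range_unrolled(unsigned long start, unsigned long end)
{
	unsigned long lsize = LINE_SIZE;
	unsigned long addr = start & ~(lsize - 1);
	unsigned long aend = (end + lsize - 1) & ~(lsize - 1);
	long lines = (aend - addr) / lsize;

	/* main loop: eight cache lines per iteration */
	while (lines >= 8) {
		cache_line_op(addr);
		cache_line_op(addr + lsize);
		cache_line_op(addr + lsize * 2);
		cache_line_op(addr + lsize * 3);
		cache_line_op(addr + lsize * 4);
		cache_line_op(addr + lsize * 5);
		cache_line_op(addr + lsize * 6);
		cache_line_op(addr + lsize * 7);
		addr += lsize * 8;
		lines -= 8;
	}

	/* remainder: at most 4 + 2 + 1 lines, handled without a loop */
	if (lines & 0x4) {
		cache_line_op(addr);
		cache_line_op(addr + lsize);
		cache_line_op(addr + lsize * 2);
		cache_line_op(addr + lsize * 3);
		addr += lsize * 4;
	}
	if (lines & 0x2) {
		cache_line_op(addr);
		cache_line_op(addr + lsize);
		addr += lsize * 2;
	}
	if (lines & 0x1)
		cache_line_op(addr);
}

int main(void)
{
	/* e.g. blast the lines backing a 1500-byte DMA buffer */
	blast_range_unrolled(0x80100004UL, 0x80100004UL + 1500);
	return 0;
}

Handling eight lines per iteration and mopping up the remainder with the 4/2/1 bit tests removes most of the per-line loop overhead and gives the compiler a straight run of cache operations to schedule, which is what the "optimize the compiler output" wording in the message refers to.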
@@ -204,7 +204,7 @@
 #define __BUILD_BLAST_USER_CACHE(pfx, desc, indexop, hitop, lsize) \
 static inline void blast_##pfx##cache##lsize##_user_page(unsigned long page) \
-@@ -660,17 +744,19 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
+@@ -660,53 +744,23 @@ __BUILD_BLAST_USER_CACHE(d, dcache, Inde
 __BUILD_BLAST_USER_CACHE(i, icache, Index_Invalidate_I, Hit_Invalidate_I, 64)
 
 /* build blast_xxx_range, protected_blast_xxx_range */
@@ -214,18 +214,59 @@
 						    unsigned long end) \
 { \
 	unsigned long lsize = cpu_##desc##_line_size(); \
+-	unsigned long lsize_2 = lsize * 2; \
+-	unsigned long lsize_3 = lsize * 3; \
+-	unsigned long lsize_4 = lsize * 4; \
+-	unsigned long lsize_5 = lsize * 5; \
+-	unsigned long lsize_6 = lsize * 6; \
+-	unsigned long lsize_7 = lsize * 7; \
+-	unsigned long lsize_8 = lsize * 8; \
 	unsigned long addr = start & ~(lsize - 1); \
-	unsigned long aend = (end - 1) & ~(lsize - 1); \
+-	unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+-	int lines = (aend - addr) / lsize; \
++	unsigned long aend = (end - 1) & ~(lsize - 1); \
 +	war \
 	\
 	__##pfx##flush_prologue \
 	\
-	while (1) { \
+-	while (lines >= 8) { \
+-		prot##cache_op(hitop, addr); \
+-		prot##cache_op(hitop, addr + lsize); \
+-		prot##cache_op(hitop, addr + lsize_2); \
+-		prot##cache_op(hitop, addr + lsize_3); \
+-		prot##cache_op(hitop, addr + lsize_4); \
+-		prot##cache_op(hitop, addr + lsize_5); \
+-		prot##cache_op(hitop, addr + lsize_6); \
+-		prot##cache_op(hitop, addr + lsize_7); \
+-		addr += lsize_8; \
+-		lines -= 8; \
+-	} \
+-	\
+-	if (lines & 0x4) { \
+-		prot##cache_op(hitop, addr); \
+-		prot##cache_op(hitop, addr + lsize); \
+-		prot##cache_op(hitop, addr + lsize_2); \
+-		prot##cache_op(hitop, addr + lsize_3); \
+-		addr += lsize_4; \
+-	} \
+-	\
+-	if (lines & 0x2) { \
+-		prot##cache_op(hitop, addr); \
+-		prot##cache_op(hitop, addr + lsize); \
+-		addr += lsize_2; \
+-	} \
+-	\
+-	if (lines & 0x1) { \
++	while (1) { \
 +	war2 \
 		prot##cache_op(hitop, addr); \
-		if (addr == aend) \
-			break; \
-@@ -682,8 +768,8 @@ static inline void prot##extra##blast_##
++		if (addr == aend) \
++			break; \
++		addr += lsize; \
+	} \
+	\
+	__##pfx##flush_epilogue \
+@@ -714,8 +768,8 @@ static inline void prot##extra##blast_##
 #ifndef CONFIG_EVA
@@ -236,7 +277,7 @@
 #else
-@@ -720,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
+@@ -752,14 +806,14 @@ __BUILD_PROT_BLAST_CACHE_RANGE(d, dcache
 __BUILD_PROT_BLAST_CACHE_RANGE(i, icache, Hit_Invalidate_I)
 
 #endif
@@ -0,0 +1,66 @@
From: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Date: Fri, 7 Jun 2013 18:35:22 -0500
Subject: MIPS: r4k_cache: use more efficient cache blast

Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -682,16 +682,48 @@ static inline void prot##extra##blast_##
 						    unsigned long end) \
 { \
 	unsigned long lsize = cpu_##desc##_line_size(); \
+	unsigned long lsize_2 = lsize * 2; \
+	unsigned long lsize_3 = lsize * 3; \
+	unsigned long lsize_4 = lsize * 4; \
+	unsigned long lsize_5 = lsize * 5; \
+	unsigned long lsize_6 = lsize * 6; \
+	unsigned long lsize_7 = lsize * 7; \
+	unsigned long lsize_8 = lsize * 8; \
 	unsigned long addr = start & ~(lsize - 1); \
-	unsigned long aend = (end - 1) & ~(lsize - 1); \
+	unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+	int lines = (aend - addr) / lsize; \
 	\
 	__##pfx##flush_prologue \
 	\
-	while (1) { \
+	while (lines >= 8) { \
+		prot##cache_op(hitop, addr); \
+		prot##cache_op(hitop, addr + lsize); \
+		prot##cache_op(hitop, addr + lsize_2); \
+		prot##cache_op(hitop, addr + lsize_3); \
+		prot##cache_op(hitop, addr + lsize_4); \
+		prot##cache_op(hitop, addr + lsize_5); \
+		prot##cache_op(hitop, addr + lsize_6); \
+		prot##cache_op(hitop, addr + lsize_7); \
+		addr += lsize_8; \
+		lines -= 8; \
+	} \
+	\
+	if (lines & 0x4) { \
+		prot##cache_op(hitop, addr); \
+		prot##cache_op(hitop, addr + lsize); \
+		prot##cache_op(hitop, addr + lsize_2); \
+		prot##cache_op(hitop, addr + lsize_3); \
+		addr += lsize_4; \
+	} \
+	\
+	if (lines & 0x2) { \
+		prot##cache_op(hitop, addr); \
+		prot##cache_op(hitop, addr + lsize); \
+		addr += lsize_2; \
+	} \
+	\
+	if (lines & 0x1) { \
 		prot##cache_op(hitop, addr); \
-		if (addr == aend) \
-			break; \
-		addr += lsize; \
 	} \
 	\
 	__##pfx##flush_epilogue \
@@ -0,0 +1,66 @@
From: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Date: Fri, 7 Jun 2013 18:35:22 -0500
Subject: MIPS: r4k_cache: use more efficient cache blast

Optimize the compiler output for larger cache blast cases that are
common for DMA-based networking.

Signed-off-by: Ben Menchaca <ben.menchaca@qca.qualcomm.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -665,16 +665,48 @@ static inline void prot##extra##blast_##pfx##cache##_range(unsigned long start,
 						    unsigned long end) \
 { \
 	unsigned long lsize = cpu_##desc##_line_size(); \
+	unsigned long lsize_2 = lsize * 2; \
+	unsigned long lsize_3 = lsize * 3; \
+	unsigned long lsize_4 = lsize * 4; \
+	unsigned long lsize_5 = lsize * 5; \
+	unsigned long lsize_6 = lsize * 6; \
+	unsigned long lsize_7 = lsize * 7; \
+	unsigned long lsize_8 = lsize * 8; \
 	unsigned long addr = start & ~(lsize - 1); \
-	unsigned long aend = (end - 1) & ~(lsize - 1); \
+	unsigned long aend = (end + lsize - 1) & ~(lsize - 1); \
+	int lines = (aend - addr) / lsize; \
 	\
 	__##pfx##flush_prologue \
 	\
-	while (1) { \
+	while (lines >= 8) { \
+		prot##cache_op(hitop, addr); \
+		prot##cache_op(hitop, addr + lsize); \
+		prot##cache_op(hitop, addr + lsize_2); \
+		prot##cache_op(hitop, addr + lsize_3); \
+		prot##cache_op(hitop, addr + lsize_4); \
+		prot##cache_op(hitop, addr + lsize_5); \
+		prot##cache_op(hitop, addr + lsize_6); \
+		prot##cache_op(hitop, addr + lsize_7); \
+		addr += lsize_8; \
+		lines -= 8; \
+	} \
+	\
+	if (lines & 0x4) { \
+		prot##cache_op(hitop, addr); \
+		prot##cache_op(hitop, addr + lsize); \
+		prot##cache_op(hitop, addr + lsize_2); \
+		prot##cache_op(hitop, addr + lsize_3); \
+		addr += lsize_4; \
+	} \
+	\
+	if (lines & 0x2) { \
+		prot##cache_op(hitop, addr); \
+		prot##cache_op(hitop, addr + lsize); \
+		addr += lsize_2; \
+	} \
+	\
+	if (lines & 0x1) { \
 		prot##cache_op(hitop, addr); \
-		if (addr == aend) \
-			break; \
-		addr += lsize; \
 	} \
 	\
 	__##pfx##flush_epilogue \