210 lines
6.5 KiB
Diff
210 lines
6.5 KiB
Diff
|
From 42b4a831d77ee33f67142882b2b68e4138cbf13e Mon Sep 17 00:00:00 2001
|
||
|
From: Harm Hanemaaijer <fgenfb@yahoo.com>
|
||
|
Date: Thu, 20 Jun 2013 20:21:39 +0200
|
||
|
Subject: [PATCH 068/174] Speed up console framebuffer imageblit function
|
||
|
|
||
|
Especially on platforms with a slower CPU but a relatively high
|
||
|
framebuffer fill bandwidth, like current ARM devices, the existing
|
||
|
console monochrome imageblit function used to draw console text is
|
||
|
suboptimal for common pixel depths such as 16bpp and 32bpp. The existing
|
||
|
code is quite general and can deal with several pixel depths. By creating
|
||
|
special case functions for 16bpp and 32bpp, by far the most common pixel
|
||
|
formats used on modern systems, a significant speed-up is attained
|
||
|
which can be readily felt on ARM-based devices like the Raspberry Pi
|
||
|
and the Allwinner platform, but should help any platform using the
|
||
|
fb layer.
|
||
|
|
||
|
The special case functions allow constant folding, eliminating a number
|
||
|
of instructions including divide operations, and allow the use of an
|
||
|
unrolled loop, eliminating instructions with a variable shift size,
|
||
|
reducing source memory access instructions, and eliminating excessive
|
||
|
branching. These unrolled loops also allow much better code optimization
|
||
|
by the C compiler. The code that selects which optimized variant is used
|
||
|
is also simplified, eliminating integer divide instructions.
|
||
|
|
||
|
The speed-up, measured by timing 'cat file.txt' in the console, varies
|
||
|
between 40% and 70%, when testing on the Raspberry Pi and Allwinner
|
||
|
ARM-based platforms, depending on font size and the pixel depth, with
|
||
|
the greater benefit for 32bpp.
|
||
|
|
||
|
Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com>
|
||
|
---
|
||
|
drivers/video/cfbimgblt.c | 152 ++++++++++++++++++++++++++++++++++++++++++++--
|
||
|
1 file changed, 147 insertions(+), 5 deletions(-)
|
||
|
|
||
|
--- a/drivers/video/cfbimgblt.c
|
||
|
+++ b/drivers/video/cfbimgblt.c
|
||
|
@@ -28,6 +28,11 @@
|
||
|
*
|
||
|
* Also need to add code to deal with cards endians that are different than
|
||
|
* the native cpu endians. I also need to deal with MSB position in the word.
|
||
|
+ * Modified by Harm Hanemaaijer (fgenfb@yahoo.com) 2013:
|
||
|
+ * - Provide optimized versions of fast_imageblit for 16 and 32bpp that are
|
||
|
+ * significantly faster than the previous implementation.
|
||
|
+ * - Simplify the fast/slow_imageblit selection code, avoiding integer
|
||
|
+ * divides.
|
||
|
*/
|
||
|
#include <linux/module.h>
|
||
|
#include <linux/string.h>
|
||
|
@@ -262,6 +267,133 @@ static inline void fast_imageblit(const
|
||
|
}
|
||
|
}
|
||
|
|
||
|
+/*
|
||
|
+ * Optimized fast_imageblit for bpp == 16 (ppw = 2, bit_mask = 3 folded in):
|
||
|
+ * expands a 1bpp image, one source byte (4 dst words) per unrolled iteration.
|
||
|
+ */
|
||
|
+
|
||
|
+static inline void fast_imageblit16(const struct fb_image *image,
|
||
|
+ struct fb_info *p, u8 __iomem * dst1,
|
||
|
+ u32 fgcolor, u32 bgcolor)
|
||
|
+{
|
||
|
+ u32 fgx = fgcolor, bgx = bgcolor;
|
||
|
+ u32 spitch = (image->width + 7) / 8; /* source bytes per image row (1 bit per pixel) */
|
||
|
+ u32 end_mask, eorx;
|
||
|
+ const char *s = image->data, *src; /* s: start of current source row, src: read cursor */
|
||
|
+ u32 __iomem *dst;
|
||
|
+ const u32 *tab = NULL;
|
||
|
+ int i, j, k;
|
||
|
+
|
||
|
+ tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le; /* table indexed by a 2-bit pixel pair; tables are defined elsewhere */
|
||
|
+
|
||
|
+ fgx <<= 16; /* duplicate the color into both 16-bit halves (assumes it fits in 16 bits) */
|
||
|
+ bgx <<= 16;
|
||
|
+ fgx |= fgcolor;
|
||
|
+ bgx |= bgcolor;
|
||
|
+
|
||
|
+ eorx = fgx ^ bgx; /* (mask & eorx) ^ bgx gives fgx bits where mask is set, bgx bits elsewhere */
|
||
|
+ k = image->width / 2; /* 32-bit words written per row: two pixels per word */
|
||
|
+
|
||
|
+ for (i = image->height; i--;) { /* one iteration per image row */
|
||
|
+ dst = (u32 __iomem *) dst1;
|
||
|
+ src = s;
|
||
|
+
|
||
|
+ j = k; /* words still to write in this row */
|
||
|
+ while (j >= 4) { /* whole source byte: four words, highest bit pair first */
|
||
|
+ u8 bits = *src;
|
||
|
+ end_mask = tab[(bits >> 6) & 3];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[(bits >> 4) & 3];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[(bits >> 2) & 3];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[bits & 3];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ src++;
|
||
|
+ j -= 4;
|
||
|
+ }
|
||
|
+ if (j != 0) { /* 1..3 words left: use the high bit pairs of one more source byte */
|
||
|
+ u8 bits = *src;
|
||
|
+ end_mask = tab[(bits >> 6) & 3];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ if (j >= 2) {
|
||
|
+ end_mask = tab[(bits >> 4) & 3];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ if (j == 3) {
|
||
|
+ end_mask = tab[(bits >> 2) & 3];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst); /* last word of the row: dst is reset on the next row */
|
||
|
+ }
|
||
|
+ }
|
||
|
+ }
|
||
|
+ dst1 += p->fix.line_length; /* advance to the next destination row */
|
||
|
+ s += spitch; /* advance to the next source row */
|
||
|
+ }
|
||
|
+}
|
||
|
+
|
||
|
+/*
|
||
|
+ * Optimized fast_imageblit for bpp == 32 (ppw = 1, bit_mask = 1 folded in):
|
||
|
+ * expands a 1bpp image, one source byte (8 dst words) per unrolled iteration.
|
||
|
+ */
|
||
|
+
|
||
|
+static inline void fast_imageblit32(const struct fb_image *image,
|
||
|
+ struct fb_info *p, u8 __iomem * dst1,
|
||
|
+ u32 fgcolor, u32 bgcolor)
|
||
|
+{
|
||
|
+ u32 fgx = fgcolor, bgx = bgcolor;
|
||
|
+ u32 spitch = (image->width + 7) / 8; /* source bytes per image row (1 bit per pixel) */
|
||
|
+ u32 end_mask, eorx;
|
||
|
+ const char *s = image->data, *src; /* s: start of current source row, src: read cursor */
|
||
|
+ u32 __iomem *dst;
|
||
|
+ const u32 *tab = NULL;
|
||
|
+ int i, j, k;
|
||
|
+
|
||
|
+ tab = cfb_tab32; /* table indexed by a single source bit; defined elsewhere */
|
||
|
+
|
||
|
+ eorx = fgx ^ bgx; /* (mask & eorx) ^ bgx gives fgx bits where mask is set, bgx bits elsewhere */
|
||
|
+ k = image->width; /* 32-bit words written per row: one pixel per word */
|
||
|
+
|
||
|
+ for (i = image->height; i--;) { /* one iteration per image row */
|
||
|
+ dst = (u32 __iomem *) dst1;
|
||
|
+ src = s;
|
||
|
+
|
||
|
+ j = k; /* pixels still to write in this row */
|
||
|
+ while (j >= 8) { /* whole source byte: eight pixels, bit 7 first */
|
||
|
+ u8 bits = *src;
|
||
|
+ end_mask = tab[(bits >> 7) & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[(bits >> 6) & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[(bits >> 5) & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[(bits >> 4) & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[(bits >> 3) & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[(bits >> 2) & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[(bits >> 1) & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ end_mask = tab[bits & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ src++;
|
||
|
+ j -= 8;
|
||
|
+ }
|
||
|
+ if (j != 0) { /* 1..7 pixels left: taken from the high bits of one more source byte */
|
||
|
+ u32 bits = (u32) * src; /* only bit 7 is tested below, so bits above it never matter */
|
||
|
+ while (j > 1) { /* all but the last pixel: test bit 7, then shift the next bit up */
|
||
|
+ end_mask = tab[(bits >> 7) & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
|
||
|
+ bits <<= 1;
|
||
|
+ j--;
|
||
|
+ }
|
||
|
+ end_mask = tab[(bits >> 7) & 1];
|
||
|
+ FB_WRITEL((end_mask & eorx) ^ bgx, dst); /* last pixel of the row: dst is reset on the next row */
|
||
|
+ }
|
||
|
+ dst1 += p->fix.line_length; /* advance to the next destination row */
|
||
|
+ s += spitch; /* advance to the next source row */
|
||
|
+ }
|
||
|
+}
|
||
|
+
|
||
|
void cfb_imageblit(struct fb_info *p, const struct fb_image *image)
|
||
|
{
|
||
|
u32 fgcolor, bgcolor, start_index, bitstart, pitch_index = 0;
|
||
|
@@ -294,11 +426,21 @@ void cfb_imageblit(struct fb_info *p, co
|
||
|
bgcolor = image->bg_color;
|
||
|
}
|
||
|
|
||
|
- if (32 % bpp == 0 && !start_index && !pitch_index &&
|
||
|
- ((width & (32/bpp-1)) == 0) &&
|
||
|
- bpp >= 8 && bpp <= 32)
|
||
|
- fast_imageblit(image, p, dst1, fgcolor, bgcolor);
|
||
|
- else
|
||
|
+ if (!start_index && !pitch_index) {
|
||
|
+ if (bpp == 32)
|
||
|
+ fast_imageblit32(image, p, dst1, fgcolor,
|
||
|
+ bgcolor);
|
||
|
+ else if (bpp == 16 && (width & 1) == 0)
|
||
|
+ fast_imageblit16(image, p, dst1, fgcolor,
|
||
|
+ bgcolor);
|
||
|
+ else if (bpp == 8 && (width & 3) == 0)
|
||
|
+ fast_imageblit(image, p, dst1, fgcolor,
|
||
|
+ bgcolor);
|
||
|
+ else
|
||
|
+ slow_imageblit(image, p, dst1, fgcolor,
|
||
|
+ bgcolor,
|
||
|
+ start_index, pitch_index);
|
||
|
+ } else
|
||
|
slow_imageblit(image, p, dst1, fgcolor, bgcolor,
|
||
|
start_index, pitch_index);
|
||
|
} else
|