From 370c8243ec8e7f3abd8171b7d2dde170f4c5e63a Mon Sep 17 00:00:00 2001 From: Siarhei Siamashka Date: Mon, 17 Jun 2013 16:00:25 +0300 Subject: [PATCH 070/196] bcm2708_fb: DMA acceleration for fb_copyarea Based on http://www.raspberrypi.org/phpBB3/viewtopic.php?p=62425#p62425 Also used Simon's dmaer_master module as a reference for tweaking DMA settings for better performance. For now busylooping only. IRQ support might be added later. With non-overclocked Raspberry Pi, the performance is ~360 MB/s for simple copy or ~260 MB/s for two-pass copy (used when dragging windows to the right). In the case of using DMA channel 0, the performance improves to ~440 MB/s. For comparison, VFP optimized CPU copy can only do ~114 MB/s in the same conditions (hindered by reading uncached source buffer). Signed-off-by: Siarhei Siamashka --- drivers/video/bcm2708_fb.c | 162 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 3 deletions(-) diff --git a/drivers/video/bcm2708_fb.c b/drivers/video/bcm2708_fb.c index 08d9238..c10c5ee 100644 --- a/drivers/video/bcm2708_fb.c +++ b/drivers/video/bcm2708_fb.c @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -63,6 +64,11 @@ struct bcm2708_fb { struct fbinfo_s *info; dma_addr_t dma; u32 cmap[16]; + int dma_chan; + int dma_irq; + void __iomem *dma_chan_base; + void *cb_base; /* DMA control blocks */ + dma_addr_t cb_handle; }; #define to_bcm2708(info) container_of(info, struct bcm2708_fb, fb) @@ -312,11 +318,133 @@ static void bcm2708_fb_fillrect(struct fb_info *info, cfb_fillrect(info, rect); } +/* A helper function for configuring dma control block */ +static void set_dma_cb(struct bcm2708_dma_cb *cb, + int burst_size, + dma_addr_t dst, + int dst_stride, + dma_addr_t src, + int src_stride, + int w, + int h) +{ + cb->info = BCM2708_DMA_BURST(burst_size) | BCM2708_DMA_S_WIDTH | + BCM2708_DMA_S_INC | BCM2708_DMA_D_WIDTH | + BCM2708_DMA_D_INC | BCM2708_DMA_TDMODE; + cb->dst = dst; + cb->src = src; + /* + * This is not really obvious from the DMA documentation, + * but the top 16 bits must be programmmed to "height -1" + * and not "height" in 2D mode. + */ + cb->length = ((h - 1) << 16) | w; + cb->stride = ((dst_stride - w) << 16) | (u16)(src_stride - w); + cb->pad[0] = 0; + cb->pad[1] = 0; +} + static void bcm2708_fb_copyarea(struct fb_info *info, const struct fb_copyarea *region) { - /*print_debug("bcm2708_fb_copyarea\n"); */ - cfb_copyarea(info, region); + struct bcm2708_fb *fb = to_bcm2708(info); + struct bcm2708_dma_cb *cb = fb->cb_base; + int bytes_per_pixel = (info->var.bits_per_pixel + 7) >> 3; + /* Channel 0 supports larger bursts and is a bit faster */ + int burst_size = (fb->dma_chan == 0) ? 8 : 2; + + /* Fallback to cfb_copyarea() if we don't like something */ + if (bytes_per_pixel > 4 || + info->var.xres > 1920 || info->var.yres > 1200 || + region->width <= 0 || region->width > info->var.xres || + region->height <= 0 || region->height > info->var.yres || + region->sx < 0 || region->sx >= info->var.xres || + region->sy < 0 || region->sy >= info->var.yres || + region->dx < 0 || region->dx >= info->var.xres || + region->dy < 0 || region->dy >= info->var.yres || + region->sx + region->width > info->var.xres || + region->dx + region->width > info->var.xres || + region->sy + region->height > info->var.yres || + region->dy + region->height > info->var.yres) { + cfb_copyarea(info, region); + return; + } + + if (region->dy == region->sy && region->dx > region->sx) { + /* + * A difficult case of overlapped copy. Because DMA can't + * copy individual scanlines in backwards direction, we need + * two-pass processing. We do it by programming a chain of dma + * control blocks in the first 16K part of the buffer and use + * the remaining 48K as the intermediate temporary scratch + * buffer. The buffer size is sufficient to handle up to + * 1920x1200 resolution at 32bpp pixel depth. + */ + int y; + dma_addr_t control_block_pa = fb->cb_handle; + dma_addr_t scratchbuf = fb->cb_handle + 16 * 1024; + int scanline_size = bytes_per_pixel * region->width; + int scanlines_per_cb = (64 * 1024 - 16 * 1024) / scanline_size; + + for (y = 0; y < region->height; y += scanlines_per_cb) { + dma_addr_t src = + fb->fb.fix.smem_start + + bytes_per_pixel * region->sx + + (region->sy + y) * fb->fb.fix.line_length; + dma_addr_t dst = + fb->fb.fix.smem_start + + bytes_per_pixel * region->dx + + (region->dy + y) * fb->fb.fix.line_length; + + if (region->height - y < scanlines_per_cb) + scanlines_per_cb = region->height - y; + + set_dma_cb(cb, burst_size, scratchbuf, scanline_size, + src, fb->fb.fix.line_length, + scanline_size, scanlines_per_cb); + control_block_pa += sizeof(struct bcm2708_dma_cb); + cb->next = control_block_pa; + cb++; + + set_dma_cb(cb, burst_size, dst, fb->fb.fix.line_length, + scratchbuf, scanline_size, + scanline_size, scanlines_per_cb); + control_block_pa += sizeof(struct bcm2708_dma_cb); + cb->next = control_block_pa; + cb++; + } + /* move the pointer back to the last dma control block */ + cb--; + } else { + /* A single dma control block is enough. */ + int sy, dy, stride; + if (region->dy <= region->sy) { + /* processing from top to bottom */ + dy = region->dy; + sy = region->sy; + stride = fb->fb.fix.line_length; + } else { + /* processing from bottom to top */ + dy = region->dy + region->height - 1; + sy = region->sy + region->height - 1; + stride = -fb->fb.fix.line_length; + } + set_dma_cb(cb, burst_size, + fb->fb.fix.smem_start + dy * fb->fb.fix.line_length + + bytes_per_pixel * region->dx, + stride, + fb->fb.fix.smem_start + sy * fb->fb.fix.line_length + + bytes_per_pixel * region->sx, + stride, + region->width * bytes_per_pixel, + region->height); + } + + /* end of dma control blocks chain */ + cb->next = 0; + + bcm_dma_start(fb->dma_chan_base, fb->cb_handle); + bcm_dma_wait_idle(fb->dma_chan_base); } static void bcm2708_fb_imageblit(struct fb_info *info, @@ -359,7 +487,7 @@ static int bcm2708_fb_register(struct bcm2708_fb *fb) fb->dma = dma; } fb->fb.fbops = &bcm2708_fb_ops; - fb->fb.flags = FBINFO_FLAG_DEFAULT; + fb->fb.flags = FBINFO_FLAG_DEFAULT | FBINFO_HWACCEL_COPYAREA; fb->fb.pseudo_palette = fb->cmap; strncpy(fb->fb.fix.id, bcm2708_name, sizeof(fb->fb.fix.id)); @@ -424,6 +552,28 @@ static int bcm2708_fb_probe(struct platform_device *dev) } memset(fb, 0, sizeof(struct bcm2708_fb)); + fb->cb_base = dma_alloc_writecombine(&dev->dev, SZ_64K, + &fb->cb_handle, GFP_KERNEL); + if (!fb->cb_base) { + dev_err(&dev->dev, "cannot allocate DMA CBs\n"); + ret = -ENOMEM; + goto free_fb; + } + + pr_info("BCM2708FB: allocated DMA memory %08x\n", + fb->cb_handle); + + ret = bcm_dma_chan_alloc(BCM_DMA_FEATURE_BULK, + &fb->dma_chan_base, &fb->dma_irq); + if (ret < 0) { + dev_err(&dev->dev, "couldn't allocate a DMA channel\n"); + goto free_cb; + } + fb->dma_chan = ret; + + pr_info("BCM2708FB: allocated DMA channel %d @ %p\n", + fb->dma_chan, fb->dma_chan_base); + fb->dev = dev; ret = bcm2708_fb_register(fb); @@ -432,6 +582,9 @@ static int bcm2708_fb_probe(struct platform_device *dev) goto out; } +free_cb: + dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle); +free_fb: kfree(fb); free_region: dev_err(&dev->dev, "probe failed, err %d\n", ret); @@ -449,6 +602,9 @@ static int bcm2708_fb_remove(struct platform_device *dev) iounmap(fb->fb.screen_base); unregister_framebuffer(&fb->fb); + dma_free_writecombine(&dev->dev, SZ_64K, fb->cb_base, fb->cb_handle); + bcm_dma_chan_free(fb->dma_chan); + dma_free_coherent(NULL, PAGE_ALIGN(sizeof(*fb->info)), (void *)fb->info, fb->dma); kfree(fb); -- 1.9.1