430 lines
13 KiB
Diff
430 lines
13 KiB
Diff
|
From a2c21b04f340f594c16f9c7235ec7b7f78a96a1f Mon Sep 17 00:00:00 2001
|
||
|
From: Varad Gautam <varadgautam@gmail.com>
|
||
|
Date: Wed, 17 Feb 2016 19:08:21 +0530
|
||
|
Subject: [PATCH 294/304] drm/vc4: improve throughput by pipelining binning and
|
||
|
rendering jobs
|
||
|
|
||
|
The hardware provides us with separate threads for binning and
|
||
|
rendering, and the existing model waits for them both to complete
|
||
|
before submitting the next job.
|
||
|
|
||
|
Splitting the binning and rendering submissions reduces idle time and
|
||
|
gives us approx 20-30% speedup with some x11perf tests such as -line10
|
||
|
and -tilerect1. Improves openarena performance by 1.01897% +/-
|
||
|
0.247857% (n=16).
|
||
|
|
||
|
Thanks to anholt for suggesting this.
|
||
|
|
||
|
v2: Rebase on the spurious resets fix (change by anholt).
|
||
|
|
||
|
Signed-off-by: Varad Gautam <varadgautam@gmail.com>
|
||
|
Reviewed-by: Eric Anholt <eric@anholt.net>
|
||
|
Signed-off-by: Eric Anholt <eric@anholt.net>
|
||
|
(cherry picked from commit ca26d28bbaa39f31d5e7e4812603b015c8d54207)
|
||
|
---
|
||
|
drivers/gpu/drm/vc4/vc4_drv.h | 37 +++++++++----
|
||
|
drivers/gpu/drm/vc4/vc4_gem.c | 123 ++++++++++++++++++++++++++++++------------
|
||
|
drivers/gpu/drm/vc4/vc4_irq.c | 58 ++++++++++++++++----
|
||
|
3 files changed, 166 insertions(+), 52 deletions(-)
|
||
|
|
||
|
--- a/drivers/gpu/drm/vc4/vc4_drv.h
|
||
|
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
|
||
|
@@ -53,7 +53,7 @@ struct vc4_dev {
|
||
|
/* Protects bo_cache and the BO stats. */
|
||
|
struct mutex bo_lock;
|
||
|
|
||
|
- /* Sequence number for the last job queued in job_list.
|
||
|
+ /* Sequence number for the last job queued in bin_job_list.
|
||
|
* Starts at 0 (no jobs emitted).
|
||
|
*/
|
||
|
uint64_t emit_seqno;
|
||
|
@@ -63,11 +63,19 @@ struct vc4_dev {
|
||
|
*/
|
||
|
uint64_t finished_seqno;
|
||
|
|
||
|
- /* List of all struct vc4_exec_info for jobs to be executed.
|
||
|
- * The first job in the list is the one currently programmed
|
||
|
- * into ct0ca/ct1ca for execution.
|
||
|
+ /* List of all struct vc4_exec_info for jobs to be executed in
|
||
|
+ * the binner. The first job in the list is the one currently
|
||
|
+ * programmed into ct0ca for execution.
|
||
|
+ */
|
||
|
+ struct list_head bin_job_list;
|
||
|
+
|
||
|
+ /* List of all struct vc4_exec_info for jobs that have
|
||
|
+ * completed binning and are ready for rendering. The first
|
||
|
+ * job in the list is the one currently programmed into ct1ca
|
||
|
+ * for execution.
|
||
|
*/
|
||
|
- struct list_head job_list;
|
||
|
+ struct list_head render_job_list;
|
||
|
+
|
||
|
/* List of the finished vc4_exec_infos waiting to be freed by
|
||
|
* job_done_work.
|
||
|
*/
|
||
|
@@ -291,11 +299,20 @@ struct vc4_exec_info {
|
||
|
};
|
||
|
|
||
|
static inline struct vc4_exec_info *
|
||
|
-vc4_first_job(struct vc4_dev *vc4)
|
||
|
+vc4_first_bin_job(struct vc4_dev *vc4)
|
||
|
+{
|
||
|
+ if (list_empty(&vc4->bin_job_list))
|
||
|
+ return NULL;
|
||
|
+ return list_first_entry(&vc4->bin_job_list, struct vc4_exec_info, head);
|
||
|
+}
|
||
|
+
|
||
|
+static inline struct vc4_exec_info *
|
||
|
+vc4_first_render_job(struct vc4_dev *vc4)
|
||
|
{
|
||
|
- if (list_empty(&vc4->job_list))
|
||
|
+ if (list_empty(&vc4->render_job_list))
|
||
|
return NULL;
|
||
|
- return list_first_entry(&vc4->job_list, struct vc4_exec_info, head);
|
||
|
+ return list_first_entry(&vc4->render_job_list,
|
||
|
+ struct vc4_exec_info, head);
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
@@ -410,7 +427,9 @@ int vc4_wait_seqno_ioctl(struct drm_devi
|
||
|
struct drm_file *file_priv);
|
||
|
int vc4_wait_bo_ioctl(struct drm_device *dev, void *data,
|
||
|
struct drm_file *file_priv);
|
||
|
-void vc4_submit_next_job(struct drm_device *dev);
|
||
|
+void vc4_submit_next_bin_job(struct drm_device *dev);
|
||
|
+void vc4_submit_next_render_job(struct drm_device *dev);
|
||
|
+void vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec);
|
||
|
int vc4_wait_for_seqno(struct drm_device *dev, uint64_t seqno,
|
||
|
uint64_t timeout_ns, bool interruptible);
|
||
|
void vc4_job_handle_completed(struct vc4_dev *vc4);
|
||
|
--- a/drivers/gpu/drm/vc4/vc4_gem.c
|
||
|
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
|
||
|
@@ -154,10 +154,10 @@ vc4_save_hang_state(struct drm_device *d
|
||
|
struct vc4_dev *vc4 = to_vc4_dev(dev);
|
||
|
struct drm_vc4_get_hang_state *state;
|
||
|
struct vc4_hang_state *kernel_state;
|
||
|
- struct vc4_exec_info *exec;
|
||
|
+ struct vc4_exec_info *exec[2];
|
||
|
struct vc4_bo *bo;
|
||
|
unsigned long irqflags;
|
||
|
- unsigned int i, unref_list_count;
|
||
|
+ unsigned int i, j, unref_list_count, prev_idx;
|
||
|
|
||
|
kernel_state = kcalloc(1, sizeof(*kernel_state), GFP_KERNEL);
|
||
|
if (!kernel_state)
|
||
|
@@ -166,37 +166,55 @@ vc4_save_hang_state(struct drm_device *d
|
||
|
state = &kernel_state->user_state;
|
||
|
|
||
|
spin_lock_irqsave(&vc4->job_lock, irqflags);
|
||
|
- exec = vc4_first_job(vc4);
|
||
|
- if (!exec) {
|
||
|
+ exec[0] = vc4_first_bin_job(vc4);
|
||
|
+ exec[1] = vc4_first_render_job(vc4);
|
||
|
+ if (!exec[0] && !exec[1]) {
|
||
|
spin_unlock_irqrestore(&vc4->job_lock, irqflags);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
- unref_list_count = 0;
|
||
|
- list_for_each_entry(bo, &exec->unref_list, unref_head)
|
||
|
- unref_list_count++;
|
||
|
-
|
||
|
- state->bo_count = exec->bo_count + unref_list_count;
|
||
|
- kernel_state->bo = kcalloc(state->bo_count, sizeof(*kernel_state->bo),
|
||
|
- GFP_ATOMIC);
|
||
|
+ /* Get the bos from both binner and renderer into hang state. */
|
||
|
+ state->bo_count = 0;
|
||
|
+ for (i = 0; i < 2; i++) {
|
||
|
+ if (!exec[i])
|
||
|
+ continue;
|
||
|
+
|
||
|
+ unref_list_count = 0;
|
||
|
+ list_for_each_entry(bo, &exec[i]->unref_list, unref_head)
|
||
|
+ unref_list_count++;
|
||
|
+ state->bo_count += exec[i]->bo_count + unref_list_count;
|
||
|
+ }
|
||
|
+
|
||
|
+ kernel_state->bo = kcalloc(state->bo_count,
|
||
|
+ sizeof(*kernel_state->bo), GFP_ATOMIC);
|
||
|
+
|
||
|
if (!kernel_state->bo) {
|
||
|
spin_unlock_irqrestore(&vc4->job_lock, irqflags);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
- for (i = 0; i < exec->bo_count; i++) {
|
||
|
- drm_gem_object_reference(&exec->bo[i]->base);
|
||
|
- kernel_state->bo[i] = &exec->bo[i]->base;
|
||
|
- }
|
||
|
+ prev_idx = 0;
|
||
|
+ for (i = 0; i < 2; i++) {
|
||
|
+ if (!exec[i])
|
||
|
+ continue;
|
||
|
+
|
||
|
+ for (j = 0; j < exec[i]->bo_count; j++) {
|
||
|
+ drm_gem_object_reference(&exec[i]->bo[j]->base);
|
||
|
+ kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
|
||
|
+ }
|
||
|
|
||
|
- list_for_each_entry(bo, &exec->unref_list, unref_head) {
|
||
|
- drm_gem_object_reference(&bo->base.base);
|
||
|
- kernel_state->bo[i] = &bo->base.base;
|
||
|
- i++;
|
||
|
+ list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
|
||
|
+ drm_gem_object_reference(&bo->base.base);
|
||
|
+ kernel_state->bo[j + prev_idx] = &bo->base.base;
|
||
|
+ j++;
|
||
|
+ }
|
||
|
+ prev_idx += j; /* next free slot: j BOs were just stored for this job */
|
||
|
}
|
||
|
|
||
|
- state->start_bin = exec->ct0ca;
|
||
|
- state->start_render = exec->ct1ca;
|
||
|
+ if (exec[0])
|
||
|
+ state->start_bin = exec[0]->ct0ca;
|
||
|
+ if (exec[1])
|
||
|
+ state->start_render = exec[1]->ct1ca;
|
||
|
|
||
|
spin_unlock_irqrestore(&vc4->job_lock, irqflags);
|
||
|
|
||
|
@@ -272,13 +290,15 @@ vc4_hangcheck_elapsed(unsigned long data
|
||
|
struct vc4_dev *vc4 = to_vc4_dev(dev);
|
||
|
uint32_t ct0ca, ct1ca;
|
||
|
unsigned long irqflags;
|
||
|
- struct vc4_exec_info *exec;
|
||
|
+ struct vc4_exec_info *bin_exec, *render_exec;
|
||
|
|
||
|
spin_lock_irqsave(&vc4->job_lock, irqflags);
|
||
|
- exec = vc4_first_job(vc4);
|
||
|
+
|
||
|
+ bin_exec = vc4_first_bin_job(vc4);
|
||
|
+ render_exec = vc4_first_render_job(vc4);
|
||
|
|
||
|
/* If idle, we can stop watching for hangs. */
|
||
|
- if (!exec) {
|
||
|
+ if (!bin_exec && !render_exec) {
|
||
|
spin_unlock_irqrestore(&vc4->job_lock, irqflags);
|
||
|
return;
|
||
|
}
|
||
|
@@ -289,9 +309,12 @@ vc4_hangcheck_elapsed(unsigned long data
|
||
|
/* If we've made any progress in execution, rearm the timer
|
||
|
* and wait.
|
||
|
*/
|
||
|
- if (ct0ca != exec->last_ct0ca || ct1ca != exec->last_ct1ca) {
|
||
|
- exec->last_ct0ca = ct0ca;
|
||
|
- exec->last_ct1ca = ct1ca;
|
||
|
+ if ((bin_exec && ct0ca != bin_exec->last_ct0ca) ||
|
||
|
+ (render_exec && ct1ca != render_exec->last_ct1ca)) {
|
||
|
+ if (bin_exec)
|
||
|
+ bin_exec->last_ct0ca = ct0ca;
|
||
|
+ if (render_exec)
|
||
|
+ render_exec->last_ct1ca = ct1ca;
|
||
|
spin_unlock_irqrestore(&vc4->job_lock, irqflags);
|
||
|
vc4_queue_hangcheck(dev);
|
||
|
return;
|
||
|
@@ -391,11 +414,13 @@ vc4_flush_caches(struct drm_device *dev)
|
||
|
* The job_lock should be held during this.
|
||
|
*/
|
||
|
void
|
||
|
-vc4_submit_next_job(struct drm_device *dev)
|
||
|
+vc4_submit_next_bin_job(struct drm_device *dev)
|
||
|
{
|
||
|
struct vc4_dev *vc4 = to_vc4_dev(dev);
|
||
|
- struct vc4_exec_info *exec = vc4_first_job(vc4);
|
||
|
+ struct vc4_exec_info *exec;
|
||
|
|
||
|
+again:
|
||
|
+ exec = vc4_first_bin_job(vc4);
|
||
|
if (!exec)
|
||
|
return;
|
||
|
|
||
|
@@ -405,11 +430,40 @@ vc4_submit_next_job(struct drm_device *d
|
||
|
V3D_WRITE(V3D_BPOA, 0);
|
||
|
V3D_WRITE(V3D_BPOS, 0);
|
||
|
|
||
|
- if (exec->ct0ca != exec->ct0ea)
|
||
|
+ /* Either put the job in the binner if it uses the binner, or
|
||
|
+ * immediately move it to the to-be-rendered queue.
|
||
|
+ */
|
||
|
+ if (exec->ct0ca != exec->ct0ea) {
|
||
|
submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
|
||
|
+ } else {
|
||
|
+ vc4_move_job_to_render(dev, exec);
|
||
|
+ goto again;
|
||
|
+ }
|
||
|
+}
|
||
|
+
|
||
|
+void
|
||
|
+vc4_submit_next_render_job(struct drm_device *dev)
|
||
|
+{
|
||
|
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
|
||
|
+ struct vc4_exec_info *exec = vc4_first_render_job(vc4);
|
||
|
+
|
||
|
+ if (!exec)
|
||
|
+ return;
|
||
|
+
|
||
|
submit_cl(dev, 1, exec->ct1ca, exec->ct1ea);
|
||
|
}
|
||
|
|
||
|
+void
|
||
|
+vc4_move_job_to_render(struct drm_device *dev, struct vc4_exec_info *exec)
|
||
|
+{
|
||
|
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
|
||
|
+ bool was_empty = list_empty(&vc4->render_job_list);
|
||
|
+
|
||
|
+ list_move_tail(&exec->head, &vc4->render_job_list);
|
||
|
+ if (was_empty)
|
||
|
+ vc4_submit_next_render_job(dev);
|
||
|
+}
|
||
|
+
|
||
|
static void
|
||
|
vc4_update_bo_seqnos(struct vc4_exec_info *exec, uint64_t seqno)
|
||
|
{
|
||
|
@@ -448,14 +502,14 @@ vc4_queue_submit(struct drm_device *dev,
|
||
|
exec->seqno = seqno;
|
||
|
vc4_update_bo_seqnos(exec, seqno);
|
||
|
|
||
|
- list_add_tail(&exec->head, &vc4->job_list);
|
||
|
+ list_add_tail(&exec->head, &vc4->bin_job_list);
|
||
|
|
||
|
/* If no job was executing, kick ours off. Otherwise, it'll
|
||
|
- * get started when the previous job's frame done interrupt
|
||
|
+ * get started when the previous job's flush done interrupt
|
||
|
* occurs.
|
||
|
*/
|
||
|
- if (vc4_first_job(vc4) == exec) {
|
||
|
- vc4_submit_next_job(dev);
|
||
|
+ if (vc4_first_bin_job(vc4) == exec) {
|
||
|
+ vc4_submit_next_bin_job(dev);
|
||
|
vc4_queue_hangcheck(dev);
|
||
|
}
|
||
|
|
||
|
@@ -849,7 +903,8 @@ vc4_gem_init(struct drm_device *dev)
|
||
|
{
|
||
|
struct vc4_dev *vc4 = to_vc4_dev(dev);
|
||
|
|
||
|
- INIT_LIST_HEAD(&vc4->job_list);
|
||
|
+ INIT_LIST_HEAD(&vc4->bin_job_list);
|
||
|
+ INIT_LIST_HEAD(&vc4->render_job_list);
|
||
|
INIT_LIST_HEAD(&vc4->job_done_list);
|
||
|
INIT_LIST_HEAD(&vc4->seqno_cb_list);
|
||
|
spin_lock_init(&vc4->job_lock);
|
||
|
--- a/drivers/gpu/drm/vc4/vc4_irq.c
|
||
|
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
|
||
|
@@ -30,6 +30,10 @@
|
||
|
* disables that specific interrupt, and 0s written are ignored
|
||
|
* (reading either one returns the set of enabled interrupts).
|
||
|
*
|
||
|
+ * When we take a binning flush done interrupt, we need to submit the
|
||
|
+ * next frame for binning and move the finished frame to the render
|
||
|
+ * thread.
|
||
|
+ *
|
||
|
* When we take a render frame interrupt, we need to wake the
|
||
|
* processes waiting for some frame to be done, and get the next frame
|
||
|
* submitted ASAP (so the hardware doesn't sit idle when there's work
|
||
|
@@ -44,6 +48,7 @@
|
||
|
#include "vc4_regs.h"
|
||
|
|
||
|
#define V3D_DRIVER_IRQS (V3D_INT_OUTOMEM | \
|
||
|
+ V3D_INT_FLDONE | \
|
||
|
V3D_INT_FRDONE)
|
||
|
|
||
|
DECLARE_WAIT_QUEUE_HEAD(render_wait);
|
||
|
@@ -77,7 +82,7 @@ vc4_overflow_mem_work(struct work_struct
|
||
|
unsigned long irqflags;
|
||
|
|
||
|
spin_lock_irqsave(&vc4->job_lock, irqflags);
|
||
|
- current_exec = vc4_first_job(vc4);
|
||
|
+ current_exec = vc4_first_bin_job(vc4);
|
||
|
if (current_exec) {
|
||
|
vc4->overflow_mem->seqno = vc4->finished_seqno + 1;
|
||
|
list_add_tail(&vc4->overflow_mem->unref_head,
|
||
|
@@ -98,17 +103,43 @@ vc4_overflow_mem_work(struct work_struct
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
-vc4_irq_finish_job(struct drm_device *dev)
|
||
|
+vc4_irq_finish_bin_job(struct drm_device *dev)
|
||
|
+{
|
||
|
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
|
||
|
+ struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
|
||
|
+
|
||
|
+ if (!exec)
|
||
|
+ return;
|
||
|
+
|
||
|
+ vc4_move_job_to_render(dev, exec);
|
||
|
+ vc4_submit_next_bin_job(dev);
|
||
|
+}
|
||
|
+
|
||
|
+static void
|
||
|
+vc4_cancel_bin_job(struct drm_device *dev)
|
||
|
+{
|
||
|
+ struct vc4_dev *vc4 = to_vc4_dev(dev);
|
||
|
+ struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
|
||
|
+
|
||
|
+ if (!exec)
|
||
|
+ return;
|
||
|
+
|
||
|
+ list_move_tail(&exec->head, &vc4->bin_job_list);
|
||
|
+ vc4_submit_next_bin_job(dev);
|
||
|
+}
|
||
|
+
|
||
|
+static void
|
||
|
+vc4_irq_finish_render_job(struct drm_device *dev)
|
||
|
{
|
||
|
struct vc4_dev *vc4 = to_vc4_dev(dev);
|
||
|
- struct vc4_exec_info *exec = vc4_first_job(vc4);
|
||
|
+ struct vc4_exec_info *exec = vc4_first_render_job(vc4);
|
||
|
|
||
|
if (!exec)
|
||
|
return;
|
||
|
|
||
|
vc4->finished_seqno++;
|
||
|
list_move_tail(&exec->head, &vc4->job_done_list);
|
||
|
- vc4_submit_next_job(dev);
|
||
|
+ vc4_submit_next_render_job(dev);
|
||
|
|
||
|
wake_up_all(&vc4->job_wait_queue);
|
||
|
schedule_work(&vc4->job_done_work);
|
||
|
@@ -125,9 +156,10 @@ vc4_irq(int irq, void *arg)
|
||
|
barrier();
|
||
|
intctl = V3D_READ(V3D_INTCTL);
|
||
|
|
||
|
- /* Acknowledge the interrupts we're handling here. The render
|
||
|
- * frame done interrupt will be cleared, while OUTOMEM will
|
||
|
- * stay high until the underlying cause is cleared.
|
||
|
+ /* Acknowledge the interrupts we're handling here. The binner
|
||
|
+ * last flush / render frame done interrupt will be cleared,
|
||
|
+ * while OUTOMEM will stay high until the underlying cause is
|
||
|
+ * cleared.
|
||
|
*/
|
||
|
V3D_WRITE(V3D_INTCTL, intctl);
|
||
|
|
||
|
@@ -138,9 +170,16 @@ vc4_irq(int irq, void *arg)
|
||
|
status = IRQ_HANDLED;
|
||
|
}
|
||
|
|
||
|
+ if (intctl & V3D_INT_FLDONE) {
|
||
|
+ spin_lock(&vc4->job_lock);
|
||
|
+ vc4_irq_finish_bin_job(dev);
|
||
|
+ spin_unlock(&vc4->job_lock);
|
||
|
+ status = IRQ_HANDLED;
|
||
|
+ }
|
||
|
+
|
||
|
if (intctl & V3D_INT_FRDONE) {
|
||
|
spin_lock(&vc4->job_lock);
|
||
|
- vc4_irq_finish_job(dev);
|
||
|
+ vc4_irq_finish_render_job(dev);
|
||
|
spin_unlock(&vc4->job_lock);
|
||
|
status = IRQ_HANDLED;
|
||
|
}
|
||
|
@@ -205,6 +244,7 @@ void vc4_irq_reset(struct drm_device *de
|
||
|
V3D_WRITE(V3D_INTENA, V3D_DRIVER_IRQS);
|
||
|
|
||
|
spin_lock_irqsave(&vc4->job_lock, irqflags);
|
||
|
- vc4_irq_finish_job(dev);
|
||
|
+ vc4_cancel_bin_job(dev);
|
||
|
+ vc4_irq_finish_render_job(dev);
|
||
|
spin_unlock_irqrestore(&vc4->job_lock, irqflags);
|
||
|
}
|