[v2,21/37] panfrost: Allocate tiler and scratchpad BOs per-batch

Submitted by Boris Brezillon on Sept. 16, 2019, 9:36 a.m.

Details

Message ID 20190916093715.32203-22-boris.brezillon@collabora.com
State New
Headers show
Series "panfrost: Support batch pipelining" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Boris Brezillon Sept. 16, 2019, 9:36 a.m.
If we want to execute several batches in parallel they need to have
their own tiler and scratchpad BOs. Let move those objects to
panfrost_batch and allocate them on a per-batch basis.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
Note to Alyssa: I tried removing the dummy_tiler BO replacing it by a
dummy value (tried both 0xdeafbeef and 0x0) and unfortunately it
crashed, so I decided to keep this dummy allocation for now.
---
 src/gallium/drivers/panfrost/pan_context.c | 43 +++++++---------------
 src/gallium/drivers/panfrost/pan_context.h |  4 --
 src/gallium/drivers/panfrost/pan_job.c     | 41 +++++++++++++++++++--
 src/gallium/drivers/panfrost/pan_job.h     | 18 +++++++++
 4 files changed, 68 insertions(+), 38 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 34bc6e41218d..6cb6cdd1a686 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -57,7 +57,6 @@ 
 static struct midgard_tiler_descriptor
 panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count)
 {
-        struct panfrost_context *ctx = batch->ctx;
         struct midgard_tiler_descriptor t = {};
         unsigned height = batch->key.height;
         unsigned width = batch->key.width;
@@ -76,21 +75,28 @@  panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count)
         /* Sanity check */
 
         if (t.hierarchy_mask) {
+                struct panfrost_bo *tiler_heap;
+
+                tiler_heap = panfrost_batch_get_tiler_heap(batch);
                 t.polygon_list = panfrost_batch_get_polygon_list(batch,
                                                                  header_size +
                                                                  t.polygon_list_size);
 
 
                 /* Allow the entire tiler heap */
-                t.heap_start = ctx->tiler_heap->gpu;
-                t.heap_end = ctx->tiler_heap->gpu + ctx->tiler_heap->size;
+                t.heap_start = tiler_heap->gpu;
+                t.heap_end = tiler_heap->gpu + tiler_heap->size;
         } else {
+                struct panfrost_bo *tiler_dummy;
+
+                tiler_dummy = panfrost_batch_get_tiler_dummy(batch);
+
                 /* The tiler is disabled, so don't allow the tiler heap */
-                t.heap_start = ctx->tiler_heap->gpu;
+                t.heap_start = tiler_dummy->gpu;
                 t.heap_end = t.heap_start;
 
                 /* Use a dummy polygon list */
-                t.polygon_list = ctx->tiler_dummy->gpu;
+                t.polygon_list = tiler_dummy->gpu;
 
                 /* Disable the tiler */
                 t.hierarchy_mask |= MALI_TILER_DISABLED;
@@ -105,7 +111,6 @@  panfrost_emit_midg_tiler(struct panfrost_batch *batch, unsigned vertex_count)
 struct mali_single_framebuffer
 panfrost_emit_sfbd(struct panfrost_batch *batch, unsigned vertex_count)
 {
-        struct panfrost_context *ctx = batch->ctx;
         unsigned width = batch->key.width;
         unsigned height = batch->key.height;
 
@@ -115,7 +120,7 @@  panfrost_emit_sfbd(struct panfrost_batch *batch, unsigned vertex_count)
                 .unknown2 = 0x1f,
                 .format = 0x30000000,
                 .clear_flags = 0x1000,
-                .unknown_address_0 = ctx->scratchpad->gpu,
+                .unknown_address_0 = panfrost_batch_get_scratchpad(batch)->gpu,
                 .tiler = panfrost_emit_midg_tiler(batch, vertex_count),
         };
 
@@ -125,7 +130,6 @@  panfrost_emit_sfbd(struct panfrost_batch *batch, unsigned vertex_count)
 struct bifrost_framebuffer
 panfrost_emit_mfbd(struct panfrost_batch *batch, unsigned vertex_count)
 {
-        struct panfrost_context *ctx = batch->ctx;
         unsigned width = batch->key.width;
         unsigned height = batch->key.height;
 
@@ -143,7 +147,7 @@  panfrost_emit_mfbd(struct panfrost_batch *batch, unsigned vertex_count)
 
                 .unknown2 = 0x1f,
 
-                .scratchpad = ctx->scratchpad->gpu,
+                .scratchpad = panfrost_batch_get_scratchpad(batch)->gpu,
                 .tiler = panfrost_emit_midg_tiler(batch, vertex_count)
         };
 
@@ -2427,10 +2431,6 @@  panfrost_destroy(struct pipe_context *pipe)
         if (panfrost->blitter_wallpaper)
                 util_blitter_destroy(panfrost->blitter_wallpaper);
 
-        panfrost_bo_unreference(panfrost->scratchpad);
-        panfrost_bo_unreference(panfrost->tiler_heap);
-        panfrost_bo_unreference(panfrost->tiler_dummy);
-
         ralloc_free(pipe);
 }
 
@@ -2607,21 +2607,6 @@  panfrost_set_stream_output_targets(struct pipe_context *pctx,
         so->num_targets = num_targets;
 }
 
-static void
-panfrost_setup_hardware(struct panfrost_context *ctx)
-{
-        struct pipe_context *gallium = (struct pipe_context *) ctx;
-        struct panfrost_screen *screen = pan_screen(gallium->screen);
-
-        ctx->scratchpad = panfrost_bo_create(screen, 64 * 4 * 4096, 0);
-        ctx->tiler_heap = panfrost_bo_create(screen, 4096 * 4096,
-                                                 PAN_BO_INVISIBLE |
-                                                 PAN_BO_GROWABLE);
-        ctx->tiler_dummy = panfrost_bo_create(screen, 4096,
-                                                  PAN_BO_INVISIBLE);
-        assert(ctx->scratchpad && ctx->tiler_heap && ctx->tiler_dummy);
-}
-
 /* New context creation, which also does hardware initialisation since I don't
  * know the better way to structure this :smirk: */
 
@@ -2706,8 +2691,6 @@  panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
                                &ctx->out_sync);
         assert(!ret);
 
-        panfrost_setup_hardware(ctx);
-
         /* XXX: leaks */
         gallium->stream_uploader = u_upload_create_default(gallium);
         gallium->const_uploader = gallium->stream_uploader;
diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
index 4f5c1374720f..c145d589757e 100644
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -123,10 +123,6 @@  struct panfrost_context {
         struct pipe_framebuffer_state pipe_framebuffer;
         struct panfrost_streamout streamout;
 
-        struct panfrost_bo *scratchpad;
-        struct panfrost_bo *tiler_heap;
-        struct panfrost_bo *tiler_dummy;
-
         bool active_queries;
         uint64_t prims_generated;
         uint64_t tf_prims_generated;
diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c
index 5b9a51325c3b..69047ca62a6c 100644
--- a/src/gallium/drivers/panfrost/pan_job.c
+++ b/src/gallium/drivers/panfrost/pan_job.c
@@ -195,6 +195,43 @@  panfrost_batch_get_polygon_list(struct panfrost_batch *batch, unsigned size)
         return batch->polygon_list->gpu;
 }
 
+struct panfrost_bo *
+panfrost_batch_get_scratchpad(struct panfrost_batch *batch)
+{
+        if (batch->scratchpad)
+                return batch->scratchpad;
+
+        batch->scratchpad = panfrost_batch_create_bo(batch, 64 * 4 * 4096,
+                                                     PAN_BO_INVISIBLE);
+        assert(batch->scratchpad);
+        return batch->scratchpad;
+}
+
+struct panfrost_bo *
+panfrost_batch_get_tiler_heap(struct panfrost_batch *batch)
+{
+        if (batch->tiler_heap)
+                return batch->tiler_heap;
+
+        batch->tiler_heap = panfrost_batch_create_bo(batch, 4096 * 4096,
+                                                     PAN_BO_INVISIBLE |
+                                                     PAN_BO_GROWABLE);
+        assert(batch->tiler_heap);
+        return batch->tiler_heap;
+}
+
+struct panfrost_bo *
+panfrost_batch_get_tiler_dummy(struct panfrost_batch *batch)
+{
+        if (batch->tiler_dummy)
+                return batch->tiler_dummy;
+
+        batch->tiler_dummy = panfrost_batch_create_bo(batch, 4096,
+                                                      PAN_BO_INVISIBLE);
+        assert(batch->tiler_dummy);
+        return batch->tiler_dummy;
+}
+
 static void
 panfrost_batch_draw_wallpaper(struct panfrost_batch *batch)
 {
@@ -345,13 +382,9 @@  panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
 static int
 panfrost_batch_submit_jobs(struct panfrost_batch *batch)
 {
-        struct panfrost_context *ctx = batch->ctx;
         bool has_draws = batch->first_job.gpu;
         int ret = 0;
 
-        panfrost_batch_add_bo(batch, ctx->scratchpad);
-        panfrost_batch_add_bo(batch, ctx->tiler_heap);
-
         if (has_draws) {
                 ret = panfrost_batch_submit_ioctl(batch, batch->first_job.gpu, 0);
                 assert(!ret);
diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h
index 3474a102f5a4..e5e0e323f2fd 100644
--- a/src/gallium/drivers/panfrost/pan_job.h
+++ b/src/gallium/drivers/panfrost/pan_job.h
@@ -109,6 +109,15 @@  struct panfrost_batch {
         /* Polygon list bound to the batch, or NULL if none bound yet */
         struct panfrost_bo *polygon_list;
 
+        /* Scratchpath BO bound to the batch, or NULL if none bound yet */
+        struct panfrost_bo *scratchpad;
+
+        /* Tiler heap BO bound to the batch, or NULL if none bound yet */
+        struct panfrost_bo *tiler_heap;
+
+        /* Dummy tiler BO bound to the batch, or NULL if none bound yet */
+        struct panfrost_bo *tiler_dummy;
+
         /* Framebuffer descriptor. */
         mali_ptr framebuffer;
 };
@@ -139,6 +148,15 @@  panfrost_batch_set_requirements(struct panfrost_batch *batch);
 mali_ptr
 panfrost_batch_get_polygon_list(struct panfrost_batch *batch, unsigned size);
 
+struct panfrost_bo *
+panfrost_batch_get_scratchpad(struct panfrost_batch *batch);
+
+struct panfrost_bo *
+panfrost_batch_get_tiler_heap(struct panfrost_batch *batch);
+
+struct panfrost_bo *
+panfrost_batch_get_tiler_dummy(struct panfrost_batch *batch);
+
 void
 panfrost_batch_clear(struct panfrost_batch *batch,
                      unsigned buffers,

Comments

> Note to Alyssa: I tried removing the dummy_tiler BO replacing it by a
> dummy value (tried both 0xdeafbeef and 0x0) and unfortunately it
> crashed, so I decided to keep this dummy allocation for now.

Very well. That'd be a neat but very low prio RE task, so *shrug*

> -static void
> -panfrost_setup_hardware(struct panfrost_context *ctx)
> -{
> -        struct pipe_context *gallium = (struct pipe_context *) ctx;
> -        struct panfrost_screen *screen = pan_screen(gallium->screen);
> -
> -        ctx->scratchpad = panfrost_bo_create(screen, 64 * 4 * 4096, 0);
> -        ctx->tiler_heap = panfrost_bo_create(screen, 4096 * 4096,
> -                                                 PAN_BO_INVISIBLE |
> -                                                 PAN_BO_GROWABLE);
> -        ctx->tiler_dummy = panfrost_bo_create(screen, 4096,
> -                                                  PAN_BO_INVISIBLE);
> -        assert(ctx->scratchpad && ctx->tiler_heap && ctx->tiler_dummy);
> -}
> -
>  /* New context creation, which also does hardware initialisation since I don't
>   * know the better way to structure this :smirk: */

Glad to see that function gone. Could you remove the comment as well?

> +struct panfrost_bo *
> +panfrost_batch_get_scratchpad(struct panfrost_batch *batch)
> +{
> +        if (batch->scratchpad)
> +                return batch->scratchpad;
> +
> +        batch->scratchpad = panfrost_batch_create_bo(batch, 64 * 4 * 4096,
> +                                                     PAN_BO_INVISIBLE);
> +        assert(batch->scratchpad);
> +        return batch->scratchpad;
> +}

FWIW, the size of the scratchpad is computed from the amount of stack
memory used by a given shader (known at shader compile-time), summed
across all the shaders using stack memory in the batch, times a thread
constant we get from the kernel parameter.

We don't actually do this computation yet; we probably should. But it
should give you some idea of what will be needed here.

---

Conditional on removing the :smirk: comment, R-b