[2/5] panfrost: Allocate shaders in their own BOs

Submitted by Tomeu Vizoso on Aug. 5, 2019, 3:18 p.m.

Details

Message ID 20190805151836.12293-2-tomeu.vizoso@collabora.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Tomeu Vizoso Aug. 5, 2019, 3:18 p.m.
Instead of all shaders being stored in a single BO, have each shader in
its own.

This removes the need for a 16MB allocation per context, and allows us
to place transient blend shaders in BOs marked as executable (before
they were allocated in the transient pool, which shouldn't be
executable).

Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
---
 src/gallium/drivers/panfrost/pan_allocate.h   |  2 +
 src/gallium/drivers/panfrost/pan_assemble.c   |  5 ++-
 src/gallium/drivers/panfrost/pan_blend.h      | 11 ++++--
 src/gallium/drivers/panfrost/pan_blend_cso.c  | 39 ++++++++++++++-----
 .../drivers/panfrost/pan_blend_shaders.c      |  7 +++-
 src/gallium/drivers/panfrost/pan_bo_cache.c   |  5 +--
 src/gallium/drivers/panfrost/pan_context.c    | 15 +++++--
 src/gallium/drivers/panfrost/pan_context.h    |  3 +-
 src/gallium/drivers/panfrost/pan_drm.c        |  5 ++-
 9 files changed, 68 insertions(+), 24 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/panfrost/pan_allocate.h b/src/gallium/drivers/panfrost/pan_allocate.h
index 8d925ee38a4c..0e06567d2069 100644
--- a/src/gallium/drivers/panfrost/pan_allocate.h
+++ b/src/gallium/drivers/panfrost/pan_allocate.h
@@ -59,6 +59,8 @@  struct panfrost_bo {
         size_t size;
 
         int gem_handle;
+
+        uint32_t flags;
 };
 
 struct panfrost_memory {
diff --git a/src/gallium/drivers/panfrost/pan_assemble.c b/src/gallium/drivers/panfrost/pan_assemble.c
index 15f77585a25c..dd877056d3b5 100644
--- a/src/gallium/drivers/panfrost/pan_assemble.c
+++ b/src/gallium/drivers/panfrost/pan_assemble.c
@@ -43,6 +43,7 @@  panfrost_shader_compile(
                 gl_shader_stage stage,
                 struct panfrost_shader_state *state)
 {
+        struct panfrost_screen *screen = pan_screen(ctx->base.screen);
         uint8_t *dst;
 
         nir_shader *s;
@@ -80,7 +81,9 @@  panfrost_shader_compile(
          * I bet someone just thought that would be a cute pun. At least,
          * that's how I'd do it. */
 
-        meta->shader = panfrost_upload(&ctx->shaders, dst, size) | program.first_tag;
+        state->bo = panfrost_drm_create_bo(screen, size, PAN_ALLOCATE_EXECUTE);
+        memcpy(state->bo->cpu, dst, size);
+        meta->shader = state->bo->gpu | program.first_tag;
 
         util_dynarray_fini(&program.compiled);
 
diff --git a/src/gallium/drivers/panfrost/pan_blend.h b/src/gallium/drivers/panfrost/pan_blend.h
index a881e0945f48..1269a9b0aa08 100644
--- a/src/gallium/drivers/panfrost/pan_blend.h
+++ b/src/gallium/drivers/panfrost/pan_blend.h
@@ -33,8 +33,10 @@ 
 /* An internal blend shader descriptor, from the compiler */
 
 struct panfrost_blend_shader {
+        struct panfrost_context *ctx;
+
         /* The compiled shader in GPU memory */
-        struct panfrost_transfer shader;
+        struct panfrost_bo *bo;
 
         /* Byte count of the shader */
         unsigned size;
@@ -53,8 +55,11 @@  struct panfrost_blend_shader {
 /* A blend shader descriptor ready for actual use */
 
 struct panfrost_blend_shader_final {
-        /* The upload, possibly to transient memory */
-        mali_ptr gpu;
+        /* The compiled shader in GPU memory, possibly patched */
+        struct panfrost_bo *bo;
+
+        /* First instruction tag (for tagging the pointer) */
+        unsigned first_tag;
 
         /* Same meaning as panfrost_blend_shader */
         unsigned work_count;
diff --git a/src/gallium/drivers/panfrost/pan_blend_cso.c b/src/gallium/drivers/panfrost/pan_blend_cso.c
index f685b25b41b3..92b94326b346 100644
--- a/src/gallium/drivers/panfrost/pan_blend_cso.c
+++ b/src/gallium/drivers/panfrost/pan_blend_cso.c
@@ -151,11 +151,23 @@  panfrost_bind_blend_state(struct pipe_context *pipe,
         ctx->dirty |= PAN_DIRTY_FS;
 }
 
+static void
+panfrost_delete_blend_shader(struct hash_entry *entry)
+{
+        struct panfrost_blend_shader *shader = (struct panfrost_blend_shader *)entry->data;
+        panfrost_bo_unreference(shader->ctx->base.screen, shader->bo);
+}
+
 static void
 panfrost_delete_blend_state(struct pipe_context *pipe,
-                            void *blend)
+                            void *cso)
 {
-        /* TODO: Free shader binary? */
+        struct panfrost_blend_state *blend = (struct panfrost_blend_state *) cso;
+
+        for (unsigned c = 0; c < 4; ++c) {
+                struct panfrost_blend_rt *rt = &blend->rt[c];
+                _mesa_hash_table_u64_clear(rt->shaders, panfrost_delete_blend_shader);
+        }
         ralloc_free(blend);
 }
 
@@ -208,6 +220,9 @@  panfrost_blend_constant(float *out, float *in, unsigned mask)
 struct panfrost_blend_final
 panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti)
 {
+        struct panfrost_screen *screen = pan_screen(ctx->base.screen);
+        struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
+
         /* Grab the format, falling back gracefully if called invalidly (which
          * has to happen for no-color-attachment FBOs, for instance)  */
         struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
@@ -241,23 +256,29 @@  panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti)
         struct panfrost_blend_shader *shader = panfrost_get_blend_shader(ctx, blend, fmt, rti);
         final.is_shader = true;
         final.shader.work_count = shader->work_count;
+        final.shader.first_tag = shader->first_tag;
 
         if (shader->patch_index) {
                 /* We have to specialize the blend shader to use constants, so
-                 * patch in the current constants and upload to transient
-                 * memory */
+                 * copy the shader and patch in the current constants */
 
-                float *patch = (float *) (shader->shader.cpu + shader->patch_index);
+                final.shader.bo = panfrost_drm_create_bo(screen, shader->size, PAN_ALLOCATE_EXECUTE);
+                memcpy(final.shader.bo->cpu, shader->bo->cpu, shader->size);
+
+                float *patch = (float *) (final.shader.bo->cpu + shader->patch_index);
                 memcpy(patch, ctx->blend_color.color, sizeof(float) * 4);
 
-                final.shader.gpu = panfrost_upload_transient(
-                                           ctx, shader->shader.cpu, shader->size);
+                /* Pass ownership to job */
+                panfrost_job_add_bo(job, final.shader.bo);
+                panfrost_bo_unreference(ctx->base.screen, final.shader.bo);
         } else {
                 /* No need to specialize further, use the preuploaded */
-                final.shader.gpu = shader->shader.gpu;
+                final.shader.bo = shader->bo;
+
+                /* Hold a ref to the BO for the duration of this job */
+                panfrost_job_add_bo(job, final.shader.bo);
         }
 
-        final.shader.gpu |= shader->first_tag;
         return final;
 }
 
diff --git a/src/gallium/drivers/panfrost/pan_blend_shaders.c b/src/gallium/drivers/panfrost/pan_blend_shaders.c
index 0063290d7e77..2cf8c89008f7 100644
--- a/src/gallium/drivers/panfrost/pan_blend_shaders.c
+++ b/src/gallium/drivers/panfrost/pan_blend_shaders.c
@@ -132,6 +132,9 @@  panfrost_compile_blend_shader(
         enum pipe_format format)
 {
         struct panfrost_blend_shader res;
+        struct panfrost_screen *screen = pan_screen(ctx->base.screen);
+
+        res.ctx = ctx;
 
         /* Build the shader */
 
@@ -177,8 +180,8 @@  panfrost_compile_blend_shader(
         int size = program.compiled.size;
         uint8_t *dst = program.compiled.data;
 
-        res.shader.cpu = mem_dup(dst, size);
-        res.shader.gpu = panfrost_upload(&ctx->shaders, dst, size);
+        res.bo = panfrost_drm_create_bo(screen, size, PAN_ALLOCATE_EXECUTE);
+        memcpy(res.bo->cpu, dst, size);
 
         /* At least two work registers are needed due to an encoding quirk */
         res.work_count = MAX2(program.work_register_count, 2);
diff --git a/src/gallium/drivers/panfrost/pan_bo_cache.c b/src/gallium/drivers/panfrost/pan_bo_cache.c
index fba495c1dd69..7378d0a8abea 100644
--- a/src/gallium/drivers/panfrost/pan_bo_cache.c
+++ b/src/gallium/drivers/panfrost/pan_bo_cache.c
@@ -84,11 +84,10 @@  panfrost_bo_cache_fetch(
 {
         struct list_head *bucket = pan_bucket(screen, size);
 
-        /* TODO: Honour flags? */
-
         /* Iterate the bucket looking for something suitable */
         list_for_each_entry_safe(struct panfrost_bo, entry, bucket, link) {
-                if (entry->size >= size) {
+                if (entry->size >= size &&
+                    entry->flags == flags) {
                         /* This one works, splice it out of the cache */
                         list_del(&entry->link);
 
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index e9d18605dd02..94c2e76eff80 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -1140,7 +1140,7 @@  panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
 
                 if (blend.is_shader) {
                         ctx->fragment_shader_core.blend.shader =
-                                blend.shader.gpu;
+                                blend.shader.bo->gpu | blend.shader.first_tag;
                 } else {
                         ctx->fragment_shader_core.blend.shader = 0;
                 }
@@ -1215,7 +1215,7 @@  panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                                 assert(!(is_srgb && blend.is_shader));
 
                                 if (blend.is_shader) {
-                                        rts[i].blend.shader = blend.shader.gpu;
+                                        rts[i].blend.shader = blend.shader.bo->gpu | blend.shader.first_tag;
                                 } else {
                                         rts[i].blend.equation = *blend.equation.equation;
                                         rts[i].blend.constant = blend.equation.constant;
@@ -1904,6 +1904,12 @@  panfrost_delete_shader_state(
                 DBG("Deleting TGSI shader leaks duplicated tokens\n");
         }
 
+        for (unsigned i = 0; i < cso->variant_count; ++i) {
+                struct panfrost_shader_state *shader_state = &cso->variants[i];
+                panfrost_bo_unreference(pctx->screen, shader_state->bo);
+                shader_state->bo = NULL;
+        }
+
         free(so);
 }
 
@@ -2035,6 +2041,7 @@  panfrost_bind_shader_state(
         enum pipe_shader_type type)
 {
         struct panfrost_context *ctx = pan_context(pctx);
+        struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
 
         ctx->shader[type] = hwcso;
 
@@ -2098,6 +2105,8 @@  panfrost_bind_shader_state(
 
                 shader_state->compiled = true;
         }
+
+        panfrost_job_add_bo(job, shader_state->bo);
 }
 
 static void
@@ -2528,7 +2537,6 @@  panfrost_destroy(struct pipe_context *pipe)
                 util_blitter_destroy(panfrost->blitter_wallpaper);
 
         panfrost_drm_free_slab(screen, &panfrost->scratchpad);
-        panfrost_drm_free_slab(screen, &panfrost->shaders);
         panfrost_drm_free_slab(screen, &panfrost->tiler_heap);
         panfrost_drm_free_slab(screen, &panfrost->tiler_dummy);
 
@@ -2673,7 +2681,6 @@  panfrost_setup_hardware(struct panfrost_context *ctx)
         struct panfrost_screen *screen = pan_screen(gallium->screen);
 
         panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0);
-        panfrost_drm_allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_dummy, 1, false, PAN_ALLOCATE_INVISIBLE, 0, 0);
 }
diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
index e8d2417e0cb2..9a34e243b74b 100644
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -108,7 +108,6 @@  struct panfrost_context {
         struct pipe_framebuffer_state pipe_framebuffer;
 
         struct panfrost_memory cmdstream_persistent;
-        struct panfrost_memory shaders;
         struct panfrost_memory scratchpad;
         struct panfrost_memory tiler_heap;
         struct panfrost_memory tiler_dummy;
@@ -224,6 +223,8 @@  struct panfrost_shader_state {
 
         /* Should we enable helper invocations */
         bool helper_invocations;
+
+        struct panfrost_bo *bo;
 };
 
 /* A collection of varyings (the CSO) */
diff --git a/src/gallium/drivers/panfrost/pan_drm.c b/src/gallium/drivers/panfrost/pan_drm.c
index 42cf17503344..8ae541ae11b6 100644
--- a/src/gallium/drivers/panfrost/pan_drm.c
+++ b/src/gallium/drivers/panfrost/pan_drm.c
@@ -88,6 +88,9 @@  panfrost_drm_create_bo(struct panfrost_screen *screen, size_t size,
         /* Kernel will fail (confusingly) with EPERM otherwise */
         assert(size > 0);
 
+        /* To maximize BO cache usage, don't allocate tiny BOs */
+        size = MAX2(size, 4096);
+
         unsigned translated_flags = 0;
 
         /* TODO: translate flags to kernel flags, if the kernel supports */
@@ -120,6 +123,7 @@  panfrost_drm_create_bo(struct panfrost_screen *screen, size_t size,
                 bo->size = create_bo.size;
                 bo->gpu = create_bo.offset;
                 bo->gem_handle = create_bo.handle;
+                bo->flags = flags;
         }
 
         /* Only mmap now if we know we need to. For CPU-invisible buffers, we
@@ -285,7 +289,6 @@  panfrost_drm_submit_vs_fs_job(struct panfrost_context *ctx, bool has_draws, bool
         struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
 
         /* TODO: Add here the transient pools */
-        panfrost_job_add_bo(job, ctx->shaders.bo);
         panfrost_job_add_bo(job, ctx->scratchpad.bo);
         panfrost_job_add_bo(job, ctx->tiler_heap.bo);
         panfrost_job_add_bo(job, job->polygon_list);

Comments

> +        for (unsigned c = 0; c < 4; ++c) {
> +                struct panfrost_blend_rt *rt = &blend->rt[c];
> +                _mesa_hash_table_u64_clear(rt->shaders, panfrost_delete_blend_shader);
> +        }

What's the implication  of the clear call? Does that recursively free
the blend CSOs due to ralloc infrastructure?

> +                memcpy(final.shader.bo->cpu, shader->bo->cpu, shader->size);

I'm not totally comfortable with reading off CPU-writeable GPU-mapped
memory. We do it for scoreboarding too (which is probably a bug). But as
discussed, this may have performance and/or security implications that
we don't really understand. Maybe we might be better off having a purely
CPU copy of shaders we'll need to patch?

> +                    entry->flags == flags) {

This is definitely necessary, +1

I'm wondering if we should be optimizing this somehow but we can sail
that boat when we get there, I guess.

> @@ -2035,6 +2041,7 @@ panfrost_bind_shader_state(
>          enum pipe_shader_type type)
>  {
>          struct panfrost_context *ctx = pan_context(pctx);
> +        struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
>  
>          ctx->shader[type] = hwcso;
>  
> @@ -2098,6 +2105,8 @@ panfrost_bind_shader_state(
>  
>                  shader_state->compiled = true;
>          }
> +
> +        panfrost_job_add_bo(job, shader_state->bo);

This doesn't strike me as quite right. Conceptually, a shader state
could be bound on one frame and then just reused for 2000 frames in a
row, no? I guess that's not something you'll hit in CTS / normal apps,
but it still strikes me as something we need to pay attention to.

I think we want the add_bo() call to be part of emit_for_draw().

> +        /* To maximize BO cache usage, don't allocate tiny BOs */
> +        size = MAX2(size, 4096);

FWIW I think we round up to 4k anyway elsewhere? Also, how does this
interact with shaders -- if you have a bunch of tiny shaders of 128
bytes each, you'll be wasting 3968 bytes per shader, no? At least the
old strategy  (with the 16MB slab) allowed subdivision with byte
granularity.
On Mon, 5 Aug 2019 at 19:01, Alyssa Rosenzweig <alyssa@rosenzweig.io> wrote:
>
> > +        for (unsigned c = 0; c < 4; ++c) {
> > +                struct panfrost_blend_rt *rt = &blend->rt[c];
> > +                _mesa_hash_table_u64_clear(rt->shaders, panfrost_delete_blend_shader);
> > +        }
>
> What's the implication  of the clear call? Does that recursively free
> the blend CSOs due to ralloc infrastructure?

struct panfrost_blend_shader is allocated here with memdup, so I was
indeed missing a free(). Thanks!

> > +                memcpy(final.shader.bo->cpu, shader->bo->cpu, shader->size);
>
> I'm not totally comfortable with reading off CPU-writeable GPU-mapped
> memory. We do it for scoreboarding too (which is probably a bug). But as
> discussed, this may have performance and/or security implications that
> we don't really understand. Maybe we might be better off having a purely
> CPU copy of shaders we'll need to patch?

Hmm, guess would be better to play safe in this case.

> > +                    entry->flags == flags) {
>
> This is definitely necessary, +1
>
> I'm wondering if we should be optimizing this somehow but we can sail
> that boat when we get there, I guess.

Yep.

> > @@ -2035,6 +2041,7 @@ panfrost_bind_shader_state(
> >          enum pipe_shader_type type)
> >  {
> >          struct panfrost_context *ctx = pan_context(pctx);
> > +        struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
> >
> >          ctx->shader[type] = hwcso;
> >
> > @@ -2098,6 +2105,8 @@ panfrost_bind_shader_state(
> >
> >                  shader_state->compiled = true;
> >          }
> > +
> > +        panfrost_job_add_bo(job, shader_state->bo);
>
> This doesn't strike me as quite right. Conceptually, a shader state
> could be bound on one frame and then just reused for 2000 frames in a
> row, no? I guess that's not something you'll hit in CTS / normal apps,
> but it still strikes me as something we need to pay attention to.
>
> I think we want the add_bo() call to be part of emit_for_draw().

Makes sense.

> > +        /* To maximize BO cache usage, don't allocate tiny BOs */
> > +        size = MAX2(size, 4096);
>
> FWIW I think we round up to 4k anyway elsewhere?

I think that happens only when going though panfrost_drm_allocate_slab.

> Also, how does this
> interact with shaders -- if you have a bunch of tiny shaders of 128
> bytes each, you'll be wasting 3968 bytes per shader, no? At least the
> old strategy  (with the 16MB slab) allowed subdivision with byte
> granularity.

Yeah, there's the wastage you describe, but we are currently wasting
several megs so it's a vast improvement in this regard.

For more fine-grained performance and memory usage improvements, I
think we should track these kind of regressions in CI, or we'll
struggle to consistently improve.

Thanks,

Tomeu
> Yeah, there's the wastage you describe, but we are currently wasting
> several megs so it's a vast improvement in this regard.

Ah, touche.

> For more fine-grained performance and memory usage improvements, I
> think we should track these kind of regressions in CI, or we'll
> struggle to consistently improve.

I'm not sure how performance/memory CI would work exactly, but it could
be interesting. (I don't think a build should 'fail' if perf regressed
but functionality improved..)