[v2,2/5] panfrost: Allocate shaders in their own BOs

Submitted by Tomeu Vizoso on Aug. 7, 2019, 8:36 a.m.

Details

Message ID 20190807083657.40757-2-tomeu.vizoso@collabora.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Tomeu Vizoso Aug. 7, 2019, 8:36 a.m.
Instead of all shaders being stored in a single BO, have each shader in
its own.

This removes the need for a 16MB allocation per context, and allows us
to place transient blend shaders in BOs marked as executable (before
they were allocated in the transient pool, which shouldn't be
executable).

v2: - Store compiled blend shaders in a malloc'ed buffer, to avoid
      reading from GPU-accessible memory when patching (Alyssa).
    - Free struct panfrost_blend_shader (Alyssa).
    - Give the job a reference to regular shaders when emitting
      (Alyssa).

Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
---
 src/gallium/drivers/panfrost/pan_allocate.h   |  2 +
 src/gallium/drivers/panfrost/pan_assemble.c   |  5 ++-
 src/gallium/drivers/panfrost/pan_blend.h      | 13 ++++--
 src/gallium/drivers/panfrost/pan_blend_cso.c  | 41 +++++++++++++------
 .../drivers/panfrost/pan_blend_shaders.c      | 13 ++----
 src/gallium/drivers/panfrost/pan_bo_cache.c   |  5 +--
 src/gallium/drivers/panfrost/pan_context.c    | 14 +++++--
 src/gallium/drivers/panfrost/pan_context.h    |  3 +-
 src/gallium/drivers/panfrost/pan_drm.c        |  5 ++-
 9 files changed, 66 insertions(+), 35 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/panfrost/pan_allocate.h b/src/gallium/drivers/panfrost/pan_allocate.h
index 8d925ee38a4c..0e06567d2069 100644
--- a/src/gallium/drivers/panfrost/pan_allocate.h
+++ b/src/gallium/drivers/panfrost/pan_allocate.h
@@ -59,6 +59,8 @@  struct panfrost_bo {
         size_t size;
 
         int gem_handle;
+
+        uint32_t flags;
 };
 
 struct panfrost_memory {
diff --git a/src/gallium/drivers/panfrost/pan_assemble.c b/src/gallium/drivers/panfrost/pan_assemble.c
index 15f77585a25c..dd877056d3b5 100644
--- a/src/gallium/drivers/panfrost/pan_assemble.c
+++ b/src/gallium/drivers/panfrost/pan_assemble.c
@@ -43,6 +43,7 @@  panfrost_shader_compile(
                 gl_shader_stage stage,
                 struct panfrost_shader_state *state)
 {
+        struct panfrost_screen *screen = pan_screen(ctx->base.screen);
         uint8_t *dst;
 
         nir_shader *s;
@@ -80,7 +81,9 @@  panfrost_shader_compile(
          * I bet someone just thought that would be a cute pun. At least,
          * that's how I'd do it. */
 
-        meta->shader = panfrost_upload(&ctx->shaders, dst, size) | program.first_tag;
+        state->bo = panfrost_drm_create_bo(screen, size, PAN_ALLOCATE_EXECUTE);
+        memcpy(state->bo->cpu, dst, size);
+        meta->shader = state->bo->gpu | program.first_tag;
 
         util_dynarray_fini(&program.compiled);
 
diff --git a/src/gallium/drivers/panfrost/pan_blend.h b/src/gallium/drivers/panfrost/pan_blend.h
index a881e0945f48..a29353ff0014 100644
--- a/src/gallium/drivers/panfrost/pan_blend.h
+++ b/src/gallium/drivers/panfrost/pan_blend.h
@@ -33,8 +33,10 @@ 
 /* An internal blend shader descriptor, from the compiler */
 
 struct panfrost_blend_shader {
-        /* The compiled shader in GPU memory */
-        struct panfrost_transfer shader;
+        struct panfrost_context *ctx;
+
+        /* The compiled shader */
+        void *buffer;
 
         /* Byte count of the shader */
         unsigned size;
@@ -53,8 +55,11 @@  struct panfrost_blend_shader {
 /* A blend shader descriptor ready for actual use */
 
 struct panfrost_blend_shader_final {
-        /* The upload, possibly to transient memory */
-        mali_ptr gpu;
+        /* The compiled shader in GPU memory, possibly patched */
+        struct panfrost_bo *bo;
+
+        /* First instruction tag (for tagging the pointer) */
+        unsigned first_tag;
 
         /* Same meaning as panfrost_blend_shader */
         unsigned work_count;
diff --git a/src/gallium/drivers/panfrost/pan_blend_cso.c b/src/gallium/drivers/panfrost/pan_blend_cso.c
index f685b25b41b3..08a86980ca88 100644
--- a/src/gallium/drivers/panfrost/pan_blend_cso.c
+++ b/src/gallium/drivers/panfrost/pan_blend_cso.c
@@ -151,11 +151,24 @@  panfrost_bind_blend_state(struct pipe_context *pipe,
         ctx->dirty |= PAN_DIRTY_FS;
 }
 
+static void
+panfrost_delete_blend_shader(struct hash_entry *entry)
+{
+        struct panfrost_blend_shader *shader = (struct panfrost_blend_shader *)entry->data;
+        free(shader->buffer);
+        free(shader);
+}
+
 static void
 panfrost_delete_blend_state(struct pipe_context *pipe,
-                            void *blend)
+                            void *cso)
 {
-        /* TODO: Free shader binary? */
+        struct panfrost_blend_state *blend = (struct panfrost_blend_state *) cso;
+
+        for (unsigned c = 0; c < 4; ++c) {
+                struct panfrost_blend_rt *rt = &blend->rt[c];
+                _mesa_hash_table_u64_clear(rt->shaders, panfrost_delete_blend_shader);
+        }
         ralloc_free(blend);
 }
 
@@ -208,6 +221,9 @@  panfrost_blend_constant(float *out, float *in, unsigned mask)
 struct panfrost_blend_final
 panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti)
 {
+        struct panfrost_screen *screen = pan_screen(ctx->base.screen);
+        struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
+
         /* Grab the format, falling back gracefully if called invalidly (which
          * has to happen for no-color-attachment FBOs, for instance)  */
         struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
@@ -241,23 +257,24 @@  panfrost_get_blend_for_context(struct panfrost_context *ctx, unsigned rti)
         struct panfrost_blend_shader *shader = panfrost_get_blend_shader(ctx, blend, fmt, rti);
         final.is_shader = true;
         final.shader.work_count = shader->work_count;
+        final.shader.first_tag = shader->first_tag;
+
+        /* Upload the shader */
+        final.shader.bo = panfrost_drm_create_bo(screen, shader->size, PAN_ALLOCATE_EXECUTE);
+        memcpy(final.shader.bo->cpu, shader->buffer, shader->size);
+
+        /* Pass BO ownership to job */
+        panfrost_job_add_bo(job, final.shader.bo);
+        panfrost_bo_unreference(ctx->base.screen, final.shader.bo);
 
         if (shader->patch_index) {
                 /* We have to specialize the blend shader to use constants, so
-                 * patch in the current constants and upload to transient
-                 * memory */
+                 * patch in the current constants */
 
-                float *patch = (float *) (shader->shader.cpu + shader->patch_index);
+                float *patch = (float *) (final.shader.bo->cpu + shader->patch_index);
                 memcpy(patch, ctx->blend_color.color, sizeof(float) * 4);
-
-                final.shader.gpu = panfrost_upload_transient(
-                                           ctx, shader->shader.cpu, shader->size);
-        } else {
-                /* No need to specialize further, use the preuploaded */
-                final.shader.gpu = shader->shader.gpu;
         }
 
-        final.shader.gpu |= shader->first_tag;
         return final;
 }
 
diff --git a/src/gallium/drivers/panfrost/pan_blend_shaders.c b/src/gallium/drivers/panfrost/pan_blend_shaders.c
index 0063290d7e77..2ee86b4e7db3 100644
--- a/src/gallium/drivers/panfrost/pan_blend_shaders.c
+++ b/src/gallium/drivers/panfrost/pan_blend_shaders.c
@@ -133,6 +133,8 @@  panfrost_compile_blend_shader(
 {
         struct panfrost_blend_shader res;
 
+        res.ctx = ctx;
+
         /* Build the shader */
 
         nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_FRAGMENT, &midgard_nir_options, NULL);
@@ -172,21 +174,14 @@  panfrost_compile_blend_shader(
         midgard_program program;
         midgard_compile_shader_nir(&ctx->compiler, shader, &program, true);
 
-        /* Upload the shader */
-
-        int size = program.compiled.size;
-        uint8_t *dst = program.compiled.data;
-
-        res.shader.cpu = mem_dup(dst, size);
-        res.shader.gpu = panfrost_upload(&ctx->shaders, dst, size);
-
         /* At least two work registers are needed due to an encoding quirk */
         res.work_count = MAX2(program.work_register_count, 2);
 
         /* Allow us to patch later */
         res.patch_index = program.blend_patch_offset;
         res.first_tag = program.first_tag;
-        res.size = size;
+        res.size = program.compiled.size;
+        res.buffer = program.compiled.data;
 
         return res;
 }
diff --git a/src/gallium/drivers/panfrost/pan_bo_cache.c b/src/gallium/drivers/panfrost/pan_bo_cache.c
index fba495c1dd69..7378d0a8abea 100644
--- a/src/gallium/drivers/panfrost/pan_bo_cache.c
+++ b/src/gallium/drivers/panfrost/pan_bo_cache.c
@@ -84,11 +84,10 @@  panfrost_bo_cache_fetch(
 {
         struct list_head *bucket = pan_bucket(screen, size);
 
-        /* TODO: Honour flags? */
-
         /* Iterate the bucket looking for something suitable */
         list_for_each_entry_safe(struct panfrost_bo, entry, bucket, link) {
-                if (entry->size >= size) {
+                if (entry->size >= size &&
+                    entry->flags == flags) {
                         /* This one works, splice it out of the cache */
                         list_del(&entry->link);
 
diff --git a/src/gallium/drivers/panfrost/pan_context.c b/src/gallium/drivers/panfrost/pan_context.c
index 284c246677df..2fc5ac361979 100644
--- a/src/gallium/drivers/panfrost/pan_context.c
+++ b/src/gallium/drivers/panfrost/pan_context.c
@@ -1062,6 +1062,8 @@  panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
 
                 panfrost_patch_shader_state(ctx, variant, PIPE_SHADER_FRAGMENT, false);
 
+                panfrost_job_add_bo(job, variant->bo);
+
 #define COPY(name) ctx->fragment_shader_core.name = variant->tripipe->name
 
                 COPY(shader);
@@ -1140,7 +1142,7 @@  panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
 
                 if (blend.is_shader) {
                         ctx->fragment_shader_core.blend.shader =
-                                blend.shader.gpu;
+                                blend.shader.bo->gpu | blend.shader.first_tag;
                 } else {
                         ctx->fragment_shader_core.blend.shader = 0;
                 }
@@ -1215,7 +1217,7 @@  panfrost_emit_for_draw(struct panfrost_context *ctx, bool with_vertex_data)
                                 assert(!(is_srgb && blend.is_shader));
 
                                 if (blend.is_shader) {
-                                        rts[i].blend.shader = blend.shader.gpu;
+                                        rts[i].blend.shader = blend.shader.bo->gpu | blend.shader.first_tag;
                                 } else {
                                         rts[i].blend.equation = *blend.equation.equation;
                                         rts[i].blend.constant = blend.equation.constant;
@@ -1904,6 +1906,12 @@  panfrost_delete_shader_state(
                 DBG("Deleting TGSI shader leaks duplicated tokens\n");
         }
 
+        for (unsigned i = 0; i < cso->variant_count; ++i) {
+                struct panfrost_shader_state *shader_state = &cso->variants[i];
+                panfrost_bo_unreference(pctx->screen, shader_state->bo);
+                shader_state->bo = NULL;
+        }
+
         free(so);
 }
 
@@ -2528,7 +2536,6 @@  panfrost_destroy(struct pipe_context *pipe)
                 util_blitter_destroy(panfrost->blitter_wallpaper);
 
         panfrost_drm_free_slab(screen, &panfrost->scratchpad);
-        panfrost_drm_free_slab(screen, &panfrost->shaders);
         panfrost_drm_free_slab(screen, &panfrost->tiler_heap);
         panfrost_drm_free_slab(screen, &panfrost->tiler_dummy);
 
@@ -2673,7 +2680,6 @@  panfrost_setup_hardware(struct panfrost_context *ctx)
         struct panfrost_screen *screen = pan_screen(gallium->screen);
 
         panfrost_drm_allocate_slab(screen, &ctx->scratchpad, 64*4, false, 0, 0, 0);
-        panfrost_drm_allocate_slab(screen, &ctx->shaders, 4096, true, PAN_ALLOCATE_EXECUTE, 0, 0);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_heap, 4096, false, PAN_ALLOCATE_INVISIBLE | PAN_ALLOCATE_GROWABLE, 1, 128);
         panfrost_drm_allocate_slab(screen, &ctx->tiler_dummy, 1, false, PAN_ALLOCATE_INVISIBLE, 0, 0);
 }
diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
index e8d2417e0cb2..9a34e243b74b 100644
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -108,7 +108,6 @@  struct panfrost_context {
         struct pipe_framebuffer_state pipe_framebuffer;
 
         struct panfrost_memory cmdstream_persistent;
-        struct panfrost_memory shaders;
         struct panfrost_memory scratchpad;
         struct panfrost_memory tiler_heap;
         struct panfrost_memory tiler_dummy;
@@ -224,6 +223,8 @@  struct panfrost_shader_state {
 
         /* Should we enable helper invocations */
         bool helper_invocations;
+
+        struct panfrost_bo *bo;
 };
 
 /* A collection of varyings (the CSO) */
diff --git a/src/gallium/drivers/panfrost/pan_drm.c b/src/gallium/drivers/panfrost/pan_drm.c
index 42cf17503344..8ae541ae11b6 100644
--- a/src/gallium/drivers/panfrost/pan_drm.c
+++ b/src/gallium/drivers/panfrost/pan_drm.c
@@ -88,6 +88,9 @@  panfrost_drm_create_bo(struct panfrost_screen *screen, size_t size,
         /* Kernel will fail (confusingly) with EPERM otherwise */
         assert(size > 0);
 
+        /* To maximize BO cache usage, don't allocate tiny BOs */
+        size = MAX2(size, 4096);
+
         unsigned translated_flags = 0;
 
         /* TODO: translate flags to kernel flags, if the kernel supports */
@@ -120,6 +123,7 @@  panfrost_drm_create_bo(struct panfrost_screen *screen, size_t size,
                 bo->size = create_bo.size;
                 bo->gpu = create_bo.offset;
                 bo->gem_handle = create_bo.handle;
+                bo->flags = flags;
         }
 
         /* Only mmap now if we know we need to. For CPU-invisible buffers, we
@@ -285,7 +289,6 @@  panfrost_drm_submit_vs_fs_job(struct panfrost_context *ctx, bool has_draws, bool
         struct panfrost_job *job = panfrost_get_job_for_fbo(ctx);
 
         /* TODO: Add here the transient pools */
-        panfrost_job_add_bo(job, ctx->shaders.bo);
         panfrost_job_add_bo(job, ctx->scratchpad.bo);
         panfrost_job_add_bo(job, ctx->tiler_heap.bo);
         panfrost_job_add_bo(job, job->polygon_list);

Comments


On Wed, Aug 7, 2019 at 2:37 AM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
>
> Instead of all shaders being stored in a single BO, have each shader in
> its own.
>
> This removes the need for a 16MB allocation per context, and allows us
> to place transient blend shaders in BOs marked as executable (before
> they were allocated in the transient pool, which shouldn't be
> executable).
>
> v2: - Store compiled blend shaders in a malloc'ed buffer, to avoid
>       reading from GPU-accessible memory when patching (Alyssa).
>     - Free struct panfrost_blend_shader (Alyssa).
>     - Give the job a reference to regular shaders when emitting
>       (Alyssa).
>
> Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>


> diff --git a/src/gallium/drivers/panfrost/pan_bo_cache.c b/src/gallium/drivers/panfrost/pan_bo_cache.c
> index fba495c1dd69..7378d0a8abea 100644
> --- a/src/gallium/drivers/panfrost/pan_bo_cache.c
> +++ b/src/gallium/drivers/panfrost/pan_bo_cache.c
> @@ -84,11 +84,10 @@ panfrost_bo_cache_fetch(
>  {
>          struct list_head *bucket = pan_bucket(screen, size);
>
> -        /* TODO: Honour flags? */
> -
>          /* Iterate the bucket looking for something suitable */
>          list_for_each_entry_safe(struct panfrost_bo, entry, bucket, link) {
> -                if (entry->size >= size) {
> +                if (entry->size >= size &&
> +                    entry->flags == flags) {

This change probably warrants its own patch IMO. This is using the
untranslated flags, but I think it should be the 'translated_flags' as
those are the ones changing the allocation. For example, I don't think
there's any reason for DELAYED_MMAP to be used as a match criteria
(BTW, I'm also not sure if we can reclaim BOs if they remain mmap'ed).

Another problem I see, if we have a 100MB buffer in the cache, would
we really want to hit on a 4KB allocation? Perhaps a 'entry->size * 2
< size' check.

>                          /* This one works, splice it out of the cache */
>                          list_del(&entry->link);
>

On Thu, 8 Aug 2019 at 00:47, Rob Herring <robh@kernel.org> wrote:
>
> On Wed, Aug 7, 2019 at 2:37 AM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
> >
> > Instead of all shaders being stored in a single BO, have each shader in
> > its own.
> >
> > This removes the need for a 16MB allocation per context, and allows us
> > to place transient blend shaders in BOs marked as executable (before
> > they were allocated in the transient pool, which shouldn't be
> > executable).
> >
> > v2: - Store compiled blend shaders in a malloc'ed buffer, to avoid
> >       reading from GPU-accessible memory when patching (Alyssa).
> >     - Free struct panfrost_blend_shader (Alyssa).
> >     - Give the job a reference to regular shaders when emitting
> >       (Alyssa).
> >
> > Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
>
>
> > diff --git a/src/gallium/drivers/panfrost/pan_bo_cache.c b/src/gallium/drivers/panfrost/pan_bo_cache.c
> > index fba495c1dd69..7378d0a8abea 100644
> > --- a/src/gallium/drivers/panfrost/pan_bo_cache.c
> > +++ b/src/gallium/drivers/panfrost/pan_bo_cache.c
> > @@ -84,11 +84,10 @@ panfrost_bo_cache_fetch(
> >  {
> >          struct list_head *bucket = pan_bucket(screen, size);
> >
> > -        /* TODO: Honour flags? */
> > -
> >          /* Iterate the bucket looking for something suitable */
> >          list_for_each_entry_safe(struct panfrost_bo, entry, bucket, link) {
> > -                if (entry->size >= size) {
> > +                if (entry->size >= size &&
> > +                    entry->flags == flags) {
>
> This change probably warrants its own patch IMO.

Agreed.

> This is using the
> untranslated flags, but I think it should be the 'translated_flags' as
> those are the ones changing the allocation. For example, I don't think
> there's any reason for DELAYED_MMAP to be used as a match criteria
> (BTW, I'm also not sure if we can reclaim BOs if they remain mmap'ed).
>
> Another problem I see, if we have a 100MB buffer in the cache, would
> we really want to hit on a 4KB allocation? Perhaps a 'entry->size * 2
> < size' check.

Yeah, as mentioned in the v1 discussion, we have plenty of room for
improvements here, but the goal now is just to stop doing memory
allocation so greedily that we reach OOM after launching a few GL
clients.

For 19.3 we could start tracking performance and other metrics such as
peak memory usage, etc.

Cheers,

Tomeu

>
> >                          /* This one works, splice it out of the cache */
> >                          list_del(&entry->link);
> >
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
On Wed, Aug 7, 2019 at 11:23 PM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
>
> On Thu, 8 Aug 2019 at 00:47, Rob Herring <robh@kernel.org> wrote:
> >
> > On Wed, Aug 7, 2019 at 2:37 AM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
> > >
> > > Instead of all shaders being stored in a single BO, have each shader in
> > > its own.
> > >
> > > This removes the need for a 16MB allocation per context, and allows us
> > > to place transient blend shaders in BOs marked as executable (before
> > > they were allocated in the transient pool, which shouldn't be
> > > executable).
> > >
> > > v2: - Store compiled blend shaders in a malloc'ed buffer, to avoid
> > >       reading from GPU-accessible memory when patching (Alyssa).
> > >     - Free struct panfrost_blend_shader (Alyssa).
> > >     - Give the job a reference to regular shaders when emitting
> > >       (Alyssa).
> > >
> > > Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
> >
> >
> > > diff --git a/src/gallium/drivers/panfrost/pan_bo_cache.c b/src/gallium/drivers/panfrost/pan_bo_cache.c
> > > index fba495c1dd69..7378d0a8abea 100644
> > > --- a/src/gallium/drivers/panfrost/pan_bo_cache.c
> > > +++ b/src/gallium/drivers/panfrost/pan_bo_cache.c
> > > @@ -84,11 +84,10 @@ panfrost_bo_cache_fetch(
> > >  {
> > >          struct list_head *bucket = pan_bucket(screen, size);
> > >
> > > -        /* TODO: Honour flags? */
> > > -
> > >          /* Iterate the bucket looking for something suitable */
> > >          list_for_each_entry_safe(struct panfrost_bo, entry, bucket, link) {
> > > -                if (entry->size >= size) {
> > > +                if (entry->size >= size &&
> > > +                    entry->flags == flags) {
> >
> > This change probably warrants its own patch IMO.
>
> Agreed.
>
> > This is using the
> > untranslated flags, but I think it should be the 'translated_flags' as
> > those are the ones changing the allocation. For example, I don't think
> > there's any reason for DELAYED_MMAP to be used as a match criteria
> > (BTW, I'm also not sure if we can reclaim BOs if they remain mmap'ed).
> >
> > Another problem I see, if we have a 100MB buffer in the cache, would
> > we really want to hit on a 4KB allocation? Perhaps a 'entry->size * 2
> > < size' check.
>
> Yeah, as mentioned in the v1 discussion, we have plenty of room for
> improvements here, but the goal now is just to stop doing memory
> allocation so greedily that we reach OOM after launching a few GL
> clients.

Sure. IMO, committing the BO cache without madvise was a mistake.
Without madvise, 2 instances of glmark will OOM. I should be able to
send out the patch for it today. I think it's going to need to disable
caching when madvise is not supported.

Rob
On Wed, Aug 7, 2019 at 5:47 PM Alyssa Rosenzweig
<alyssa.rosenzweig@collabora.com> wrote:
>
> > This is using the
> > untranslated flags, but I think it should be the 'translated_flags' as
> > those are the ones changing the allocation.
>
> It's a little more complex than that. There some hypothetical
> untranslated flags that I would want to match on. For instance, future
> CPU read-only/write-only modifiers -- those affect the mmap (and need to
> be accounted for in the BO cache) but aren't specified as
> translated_flags to the kernel.

I'll still argue that we shouldn't leave cached BOs mmap'ed so that
example would be mute.

The more bits we have to match on, the less effective the BO cache
will be. Either we should use translated_flags or we should filter the
untranslated flags to the ones we care about. The latter would be more
flexible I guess.

Rob
On Thu, 8 Aug 2019 at 16:19, Rob Herring <robh@kernel.org> wrote:
>
> On Wed, Aug 7, 2019 at 11:23 PM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
> >
> > On Thu, 8 Aug 2019 at 00:47, Rob Herring <robh@kernel.org> wrote:
> > >
> > > On Wed, Aug 7, 2019 at 2:37 AM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
> > > >
> > > > Instead of all shaders being stored in a single BO, have each shader in
> > > > its own.
> > > >
> > > > This removes the need for a 16MB allocation per context, and allows us
> > > > to place transient blend shaders in BOs marked as executable (before
> > > > they were allocated in the transient pool, which shouldn't be
> > > > executable).
> > > >
> > > > v2: - Store compiled blend shaders in a malloc'ed buffer, to avoid
> > > >       reading from GPU-accessible memory when patching (Alyssa).
> > > >     - Free struct panfrost_blend_shader (Alyssa).
> > > >     - Give the job a reference to regular shaders when emitting
> > > >       (Alyssa).
> > > >
> > > > Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
> > >
> > >
> > > > diff --git a/src/gallium/drivers/panfrost/pan_bo_cache.c b/src/gallium/drivers/panfrost/pan_bo_cache.c
> > > > index fba495c1dd69..7378d0a8abea 100644
> > > > --- a/src/gallium/drivers/panfrost/pan_bo_cache.c
> > > > +++ b/src/gallium/drivers/panfrost/pan_bo_cache.c
> > > > @@ -84,11 +84,10 @@ panfrost_bo_cache_fetch(
> > > >  {
> > > >          struct list_head *bucket = pan_bucket(screen, size);
> > > >
> > > > -        /* TODO: Honour flags? */
> > > > -
> > > >          /* Iterate the bucket looking for something suitable */
> > > >          list_for_each_entry_safe(struct panfrost_bo, entry, bucket, link) {
> > > > -                if (entry->size >= size) {
> > > > +                if (entry->size >= size &&
> > > > +                    entry->flags == flags) {
> > >
> > > This change probably warrants its own patch IMO.
> >
> > Agreed.
> >
> > > This is using the
> > > untranslated flags, but I think it should be the 'translated_flags' as
> > > those are the ones changing the allocation. For example, I don't think
> > > there's any reason for DELAYED_MMAP to be used as a match criteria
> > > (BTW, I'm also not sure if we can reclaim BOs if they remain mmap'ed).
> > >
> > > Another problem I see, if we have a 100MB buffer in the cache, would
> > > we really want to hit on a 4KB allocation? Perhaps a 'entry->size * 2
> > > < size' check.
> >
> > Yeah, as mentioned in the v1 discussion, we have plenty of room for
> > improvements here, but the goal now is just to stop doing memory
> > allocation so greedily that we reach OOM after launching a few GL
> > clients.
>
> Sure. IMO, committing the BO cache without madvise was a mistake.
> Without madvise, 2 instances of glmark will OOM.

How can I test that? I just checked here and I'm running 10 instances
of it within gnome-shell with 1GB still free (from a total of 2GB).
This is with HEAP support, without it we'd be still allocating one
16MB buffer per context, but it's still not that bad.

It used to be pretty bad when we were allocating gigantic buffers on
context creation, just to be safe. But Mesa master now is much more
careful with that and I think .

> I should be able to
> send out the patch for it today. I think it's going to need to disable
> caching when madvise is not supported.

Can you check if that would be still needed, please?

Thanks,

Tomeu

> Rob
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
On Thu, 8 Aug 2019 at 16:50, Rob Herring <robh@kernel.org> wrote:
>
> On Wed, Aug 7, 2019 at 5:47 PM Alyssa Rosenzweig
> <alyssa.rosenzweig@collabora.com> wrote:
> >
> > > This is using the
> > > untranslated flags, but I think it should be the 'translated_flags' as
> > > those are the ones changing the allocation.
> >
> > It's a little more complex than that. There some hypothetical
> > untranslated flags that I would want to match on. For instance, future
> > CPU read-only/write-only modifiers -- those affect the mmap (and need to
> > be accounted for in the BO cache) but aren't specified as
> > translated_flags to the kernel.
>
> I'll still argue that we shouldn't leave cached BOs mmap'ed so that
> example would be mute.
>
> The more bits we have to match on, the less effective the BO cache
> will be. Either we should use translated_flags or we should filter the
> untranslated flags to the ones we care about. The latter would be more
> flexible I guess.

Yeah, there's lots to optimize still, fortunately :)

Cheers,

Tomeu
On Fri, Aug 9, 2019 at 3:31 AM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
>
> On Thu, 8 Aug 2019 at 16:19, Rob Herring <robh@kernel.org> wrote:
> >
> > On Wed, Aug 7, 2019 at 11:23 PM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
> > >
> > > On Thu, 8 Aug 2019 at 00:47, Rob Herring <robh@kernel.org> wrote:
> > > >
> > > > On Wed, Aug 7, 2019 at 2:37 AM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
> > > > >
> > > > > Instead of all shaders being stored in a single BO, have each shader in
> > > > > its own.
> > > > >
> > > > > This removes the need for a 16MB allocation per context, and allows us
> > > > > to place transient blend shaders in BOs marked as executable (before
> > > > > they were allocated in the transient pool, which shouldn't be
> > > > > executable).
> > > > >
> > > > > v2: - Store compiled blend shaders in a malloc'ed buffer, to avoid
> > > > >       reading from GPU-accessible memory when patching (Alyssa).
> > > > >     - Free struct panfrost_blend_shader (Alyssa).
> > > > >     - Give the job a reference to regular shaders when emitting
> > > > >       (Alyssa).
> > > > >
> > > > > Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
> > > >
> > > >
> > > > > diff --git a/src/gallium/drivers/panfrost/pan_bo_cache.c b/src/gallium/drivers/panfrost/pan_bo_cache.c
> > > > > index fba495c1dd69..7378d0a8abea 100644
> > > > > --- a/src/gallium/drivers/panfrost/pan_bo_cache.c
> > > > > +++ b/src/gallium/drivers/panfrost/pan_bo_cache.c
> > > > > @@ -84,11 +84,10 @@ panfrost_bo_cache_fetch(
> > > > >  {
> > > > >          struct list_head *bucket = pan_bucket(screen, size);
> > > > >
> > > > > -        /* TODO: Honour flags? */
> > > > > -
> > > > >          /* Iterate the bucket looking for something suitable */
> > > > >          list_for_each_entry_safe(struct panfrost_bo, entry, bucket, link) {
> > > > > -                if (entry->size >= size) {
> > > > > +                if (entry->size >= size &&
> > > > > +                    entry->flags == flags) {
> > > >
> > > > This change probably warrants its own patch IMO.
> > >
> > > Agreed.
> > >
> > > > This is using the
> > > > untranslated flags, but I think it should be the 'translated_flags' as
> > > > those are the ones changing the allocation. For example, I don't think
> > > > there's any reason for DELAYED_MMAP to be used as a match criteria
> > > > (BTW, I'm also not sure if we can reclaim BOs if they remain mmap'ed).
> > > >
> > > > Another problem I see, if we have a 100MB buffer in the cache, would
> > > > we really want to hit on a 4KB allocation? Perhaps a 'entry->size * 2
> > > > < size' check.
> > >
> > > Yeah, as mentioned in the v1 discussion, we have plenty of room for
> > > improvements here, but the goal now is just to stop doing memory
> > > allocation so greedily that we reach OOM after launching a few GL
> > > clients.
> >
> > Sure. IMO, committing the BO cache without madvise was a mistake.
> > Without madvise, 2 instances of glmark will OOM.
>
> How can I test that? I just checked here and I'm running 10 instances
> of it within gnome-shell with 1GB still free (from a total of 2GB).
> This is with HEAP support, without it we'd be still allocating one
> 16MB buffer per context, but it's still not that bad.
>
> It used to be pretty bad when we were allocating gigantic buffers on
> context creation, just to be safe. But Mesa master now is much more
> careful with that and I think .
>
> > I should be able to
> > send out the patch for it today. I think it's going to need to disable
> > caching when madvise is not supported.
>
> Can you check if that would be still needed, please?

It definitely seems better now.

I think before I also had lots of debug dmsg going to disk and filling
the page cache. I tried reproducing that, but still seems okay now.

Rob