[v3,06/17] panfrost: Start tracking inter-batch dependencies

Submitted by Boris Brezillon on Sept. 18, 2019, 1:24 p.m.

Details

Message ID 20190918132439.27705-7-boris.brezillon@collabora.com
State New
Headers show
Series "panfrost: Support batch pipelining" ( rev: 2 ) in Mesa

Not browsing as part of any series.

Commit Message

Boris Brezillon Sept. 18, 2019, 1:24 p.m.
The idea is to track which BO are being accessed and the type of access
to determine when a dependency exists. Thanks to that we can build a
dependency graph that will allow us to flush batches in the correct
order.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
Changes in v3:
* Fix coding style issues
* Do not check for batch presence in the reader array when updating
  a BO access (we already have this information)
* Add more comments to explain what we're doing and why we're doing
  it like that
---
 src/gallium/drivers/panfrost/pan_context.h |   3 +
 src/gallium/drivers/panfrost/pan_job.c     | 355 ++++++++++++++++++++-
 src/gallium/drivers/panfrost/pan_job.h     |   3 +
 3 files changed, 356 insertions(+), 5 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
index ce3e0c899a4f..3b09952345cf 100644
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -114,6 +114,9 @@  struct panfrost_context {
         struct panfrost_batch *batch;
         struct hash_table *batches;
 
+        /* panfrost_bo -> panfrost_bo_access */
+        struct hash_table *accessed_bos;
+
         /* Within a launch_grid call.. */
         const struct pipe_grid_info *compute_grid;
 
diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c
index 872c846207bf..b0494af3482f 100644
--- a/src/gallium/drivers/panfrost/pan_job.c
+++ b/src/gallium/drivers/panfrost/pan_job.c
@@ -36,6 +36,29 @@ 
 #include "pan_util.h"
 #include "pandecode/decode.h"
 
+/* panfrost_bo_access is here to help us keep track of batch accesses to BOs
+ * and build a proper dependency graph such that batches can be pipelined for
+ * better GPU utilization.
+ *
+ * Each accessed BO has a corresponding entry in the ->accessed_bos hash table.
+ * A BO is either being written or read at any time, that's what the type field
+ * encodes.
+ * When the last access is a write, the batch writing the BO might have read
+ * dependencies (readers that have not been executed yet and want to read the
+ * previous BO content), and when the last access is a read, all readers might
+ * depend on another batch to push its results to memory. That's what the
+ * readers/writers keep track off.
+ * There can only be one writer at any given time, if a new batch wants to
+ * write to the same BO, a dependency will be added between the new writer and
+ * the old writer (at the batch level), and panfrost_bo_access->writer will be
+ * updated to point to the new writer.
+ */
+struct panfrost_bo_access {
+        uint32_t type;
+        struct util_dynarray readers;
+        struct panfrost_batch_fence *writer;
+};
+
 static struct panfrost_batch_fence *
 panfrost_create_batch_fence(struct panfrost_batch *batch)
 {
@@ -92,6 +115,7 @@  panfrost_create_batch(struct panfrost_context *ctx,
 
         util_dynarray_init(&batch->headers, batch);
         util_dynarray_init(&batch->gpu_headers, batch);
+        util_dynarray_init(&batch->dependencies, batch);
         batch->out_sync = panfrost_create_batch_fence(batch);
         util_copy_framebuffer_state(&batch->key, key);
 
@@ -151,6 +175,11 @@  panfrost_free_batch(struct panfrost_batch *batch)
         hash_table_foreach(batch->bos, entry)
                 panfrost_bo_unreference((struct panfrost_bo *)entry->key);
 
+        util_dynarray_foreach(&batch->dependencies,
+                              struct panfrost_batch_fence *, dep) {
+                panfrost_batch_fence_unreference(*dep);
+        }
+
         /* The out_sync fence lifetime is different from the the batch one
          * since other batches might want to wait on a fence of already
          * submitted/signaled batch. All we need to do here is make sure the
@@ -164,6 +193,56 @@  panfrost_free_batch(struct panfrost_batch *batch)
         ralloc_free(batch);
 }
 
+#ifndef NDEBUG
+static bool
+panfrost_dep_graph_contains_batch(struct panfrost_batch *root,
+                                  struct panfrost_batch *batch)
+{
+        if (!root)
+                return false;
+
+        util_dynarray_foreach(&root->dependencies,
+                              struct panfrost_batch_fence *, dep) {
+                if ((*dep)->batch == batch ||
+                    panfrost_dep_graph_contains_batch((*dep)->batch, batch))
+                        return true;
+        }
+
+        return false;
+}
+#endif
+
+static void
+panfrost_batch_add_dep(struct panfrost_batch *batch,
+                       struct panfrost_batch_fence *newdep)
+{
+        if (batch == newdep->batch)
+                return;
+
+        /* We might want to turn ->dependencies into a set if the number of
+         * deps turns out to be big enough to make this 'is dep already there'
+         * search inefficient.
+         */
+        util_dynarray_foreach(&batch->dependencies,
+                              struct panfrost_batch_fence *, dep) {
+                if (*dep == newdep)
+                        return;
+        }
+
+        /* Make sure the dependency graph is acyclic. */
+        assert(!panfrost_dep_graph_contains_batch(newdep->batch, batch));
+
+        panfrost_batch_fence_reference(newdep);
+        util_dynarray_append(&batch->dependencies,
+                             struct panfrost_batch_fence *, newdep);
+
+        /* We now have a batch depending on us, let's make sure new draw/clear
+         * calls targeting the same FBO use a new batch object.
+         */
+        if (newdep->batch)
+                panfrost_freeze_batch(newdep->batch);
+}
+
 static struct panfrost_batch *
 panfrost_get_batch(struct panfrost_context *ctx,
                    const struct pipe_framebuffer_state *key)
@@ -214,6 +293,216 @@  panfrost_get_batch_for_fbo(struct panfrost_context *ctx)
         return batch;
 }
 
+static bool
+panfrost_batch_fence_is_signaled(struct panfrost_batch_fence *fence)
+{
+        if (fence->signaled)
+                return true;
+
+        /* Batch has not been submitted yet. */
+        if (fence->batch)
+                return false;
+
+        int ret = drmSyncobjWait(pan_screen(fence->ctx->base.screen)->fd,
+                                 &fence->syncobj, 1, 0, 0, NULL);
+
+        /* Cache whether the fence was signaled */
+        fence->signaled = ret >= 0;
+        return fence->signaled;
+}
+
+static void
+panfrost_bo_access_gc_fences(struct panfrost_context *ctx,
+                             struct panfrost_bo_access *access,
+			     const struct panfrost_bo *bo)
+{
+        if (access->writer && panfrost_batch_fence_is_signaled(access->writer)) {
+                panfrost_batch_fence_unreference(access->writer);
+                access->writer = NULL;
+        }
+
+        unsigned nreaders = 0;
+        util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *,
+                              reader) {
+                if (!(*reader))
+                        continue;
+
+                if (panfrost_batch_fence_is_signaled(*reader)) {
+                        panfrost_batch_fence_unreference(*reader);
+                        *reader = NULL;
+                } else {
+                        nreaders++;
+                }
+        }
+
+        if (!nreaders)
+                util_dynarray_clear(&access->readers);
+}
+
+/* Collect signaled fences to keep the kernel-side syncobj-map small. The
+ * idea is to collect those signaled fences at the end of each flush_all
+ * call. This function is likely to collect only fences from previous
+ * batch flushes not the one that have just have just been submitted and
+ * are probably still in flight when we trigger the garbage collection.
+ * Anyway, we need to do this garbage collection at some point if we don't
+ * want the BO access map to keep invalid entries around and retain
+ * syncobjs forever.
+ */
+static void
+panfrost_gc_fences(struct panfrost_context *ctx)
+{
+        hash_table_foreach(ctx->accessed_bos, entry) {
+                struct panfrost_bo_access *access = entry->data;
+
+                assert(access);
+                panfrost_bo_access_gc_fences(ctx, access, entry->key);
+                if (!util_dynarray_num_elements(&access->readers,
+                                                struct panfrost_batch_fence *) &&
+                    !access->writer)
+                        _mesa_hash_table_remove(ctx->accessed_bos, entry);
+        }
+}
+
+#ifndef NDEBUG
+static bool
+panfrost_batch_in_readers(struct panfrost_batch *batch,
+                          struct panfrost_bo_access *access)
+{
+        util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *,
+                              reader) {
+                if (*reader && (*reader)->batch == batch)
+                        return true;
+        }
+
+        return false;
+}
+#endif
+
+static void
+panfrost_batch_update_bo_access(struct panfrost_batch *batch,
+                                struct panfrost_bo *bo, uint32_t access_type,
+                                bool already_accessed)
+{
+        struct panfrost_context *ctx = batch->ctx;
+        struct panfrost_bo_access *access;
+        uint32_t old_access_type;
+        struct hash_entry *entry;
+
+        assert(access_type == PAN_BO_ACCESS_WRITE ||
+               access_type == PAN_BO_ACCESS_READ);
+
+        entry = _mesa_hash_table_search(ctx->accessed_bos, bo);
+        access = entry ? entry->data : NULL;
+        if (access) {
+                old_access_type = access->type;
+        } else {
+                access = rzalloc(ctx, struct panfrost_bo_access);
+                util_dynarray_init(&access->readers, access);
+                _mesa_hash_table_insert(ctx->accessed_bos, bo, access);
+                /* We are the first to access this BO, let's initialize
+                 * old_access_type to our own access type in that case.
+                 */
+                old_access_type = access_type;
+                access->type = access_type;
+        }
+
+        assert(access);
+
+        if (access_type == PAN_BO_ACCESS_WRITE &&
+            old_access_type == PAN_BO_ACCESS_READ) {
+                /* Previous access was a read and we want to write this BO.
+                 * We first need to add explicit deps between our batch and
+                 * the previous readers.
+                 */
+                util_dynarray_foreach(&access->readers,
+                                      struct panfrost_batch_fence *, reader) {
+                        /* We were already reading the BO, no need to add a dep
+                         * on ourself (the acyclic check would complain about
+                         * that).
+                         */
+                        if (!(*reader) || (*reader)->batch == batch)
+                                continue;
+
+                        panfrost_batch_add_dep(batch, *reader);
+                }
+                panfrost_batch_fence_reference(batch->out_sync);
+
+                /* We now are the new writer. */
+                access->writer = batch->out_sync;
+                access->type = access_type;
+
+                /* Release the previous readers and reset the readers array. */
+                util_dynarray_foreach(&access->readers,
+                                      struct panfrost_batch_fence *,
+                                      reader) {
+                        if (!*reader)
+                                continue;
+                        panfrost_batch_fence_unreference(*reader);
+                }
+
+                util_dynarray_clear(&access->readers);
+        } else if (access_type == PAN_BO_ACCESS_WRITE &&
+                   old_access_type == PAN_BO_ACCESS_WRITE) {
+                /* Previous access was a write and we want to write this BO.
+                 * First check if we were the previous writer, in that case
+                 * there's nothing to do. Otherwise we need to add a
+                 * dependency between the new writer and the old one.
+                 */
+		if (access->writer != batch->out_sync) {
+                        if (access->writer) {
+                                panfrost_batch_add_dep(batch, access->writer);
+                                panfrost_batch_fence_unreference(access->writer);
+                        }
+                        panfrost_batch_fence_reference(batch->out_sync);
+                        access->writer = batch->out_sync;
+                }
+        } else if (access_type == PAN_BO_ACCESS_READ &&
+                   old_access_type == PAN_BO_ACCESS_WRITE) {
+                /* Previous access was a write and we want to read this BO.
+                 * First check if we were the previous writer, in that case
+                 * we want to keep the access type unchanged, as a write is
+                 * more constraining than a read.
+                 */
+                if (access->writer != batch->out_sync) {
+                        /* Add a dependency on the previous writer. */
+                        panfrost_batch_add_dep(batch, access->writer);
+
+                        /* The previous access was a write, there's no reason
+                         * to have entries in the readers array.
+                         */
+                        assert(!util_dynarray_num_elements(&access->readers,
+                                                           struct panfrost_batch_fence *));
+
+                        /* Add ourselves to the readers array. */
+                        panfrost_batch_fence_reference(batch->out_sync);
+                        util_dynarray_append(&access->readers,
+                                             struct panfrost_batch_fence *,
+                                             batch->out_sync);
+                        access->type = PAN_BO_ACCESS_READ;
+                }
+        } else {
+                /* We already accessed this BO before, so we should already be
+                 * in the reader array.
+                 */
+                if (already_accessed) {
+                        assert(panfrost_batch_in_readers(batch, access));
+                        return;
+                }
+
+                /* Previous access was a read and we want to read this BO.
+                 * Add ourselves to the readers array and add a dependency on
+                 * the previous writer if any.
+                 */
+                panfrost_batch_fence_reference(batch->out_sync);
+                util_dynarray_append(&access->readers,
+                                     struct panfrost_batch_fence *,
+                                     batch->out_sync);
+
+                if (access->writer)
+                        panfrost_batch_add_dep(batch, access->writer);
+        }
+}
+
 void
 panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo,
                       uint32_t flags)
@@ -231,6 +520,10 @@  panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo,
                 panfrost_bo_reference(bo);
 	} else {
                 old_flags = (uintptr_t)entry->data;
+
+                /* All batches have to agree on the shared flag. */
+                assert((old_flags & PAN_BO_ACCESS_SHARED) ==
+                       (flags & PAN_BO_ACCESS_SHARED));
         }
 
         assert(entry);
@@ -240,6 +533,25 @@  panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo,
 
         flags |= old_flags;
         entry->data = (void *)(uintptr_t)flags;
+
+        /* If this is not a shared BO, we don't really care about dependency
+         * tracking.
+         */
+        if (!(flags & PAN_BO_ACCESS_SHARED))
+                return;
+
+        /* All dependencies should have been flushed before we execute the
+         * wallpaper draw, so it should be harmless to skip the
+         * update_bo_access() call.
+         */
+        if (batch == batch->ctx->wallpaper_batch)
+                return;
+
+        /* Only pass R/W flags to the dep tracking logic. */
+        assert(flags & PAN_BO_ACCESS_RW);
+        flags = (flags & PAN_BO_ACCESS_WRITE) ?
+                PAN_BO_ACCESS_WRITE : PAN_BO_ACCESS_READ;
+        panfrost_batch_update_bo_access(batch, bo, flags, old_flags != 0);
 }
 
 void panfrost_batch_add_fbo_bos(struct panfrost_batch *batch)
@@ -459,15 +771,36 @@  panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
         struct pipe_context *gallium = (struct pipe_context *) ctx;
         struct panfrost_screen *screen = pan_screen(gallium->screen);
         struct drm_panfrost_submit submit = {0,};
-        uint32_t *bo_handles;
+        uint32_t *bo_handles, *in_syncs = NULL;
+        bool is_fragment_shader;
         int ret;
 
-
-        if (ctx->last_out_sync) {
+        is_fragment_shader = (reqs & PANFROST_JD_REQ_FS) && batch->first_job.gpu;
+        if (is_fragment_shader)
                 submit.in_sync_count = 1;
-                submit.in_syncs = (uintptr_t)&ctx->last_out_sync->syncobj;
+        else
+                submit.in_sync_count = util_dynarray_num_elements(&batch->dependencies,
+                                                                  struct panfrost_batch_fence *);
+
+        if (submit.in_sync_count) {
+                in_syncs = calloc(submit.in_sync_count, sizeof(*in_syncs));
+                assert(in_syncs);
         }
 
+        /* The fragment job always depends on the vertex/tiler job if there's
+         * one
+         */
+        if (is_fragment_shader) {
+                in_syncs[0] = batch->out_sync->syncobj;
+        } else {
+                unsigned int i = 0;
+
+                util_dynarray_foreach(&batch->dependencies,
+                                      struct panfrost_batch_fence *, dep)
+                        in_syncs[i++] = (*dep)->syncobj;
+        }
+
+        submit.in_syncs = (uintptr_t)in_syncs;
         submit.out_sync = batch->out_sync->syncobj;
         submit.jc = first_job_desc;
         submit.requirements = reqs;
@@ -484,6 +817,7 @@  panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
         submit.bo_handles = (u64) (uintptr_t) bo_handles;
         ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit);
         free(bo_handles);
+        free(in_syncs);
 
         /* Release the last batch fence if any, and retain the new one */
         if (ctx->last_out_sync)
@@ -534,6 +868,13 @@  panfrost_batch_submit(struct panfrost_batch *batch)
 {
         assert(batch);
 
+        /* Submit the dependencies first. */
+        util_dynarray_foreach(&batch->dependencies,
+                              struct panfrost_batch_fence *, dep) {
+                if ((*dep)->batch)
+                        panfrost_batch_submit((*dep)->batch);
+        }
+
         struct panfrost_context *ctx = batch->ctx;
         int ret;
 
@@ -567,7 +908,6 @@  panfrost_batch_submit(struct panfrost_batch *batch)
 
 out:
         panfrost_freeze_batch(batch);
-        assert(!ctx->batch || batch == ctx->batch);
 
         /* We always stall the pipeline for correct results since pipelined
          * rendering is quite broken right now (to be fixed by the panfrost_job
@@ -579,6 +919,9 @@  out:
                                NULL);
 
         panfrost_free_batch(batch);
+
+        /* Collect batch fences before returning */
+        panfrost_gc_fences(ctx);
 }
 
 void
@@ -785,4 +1128,6 @@  panfrost_batch_init(struct panfrost_context *ctx)
         ctx->batches = _mesa_hash_table_create(ctx,
                                                panfrost_batch_hash,
                                                panfrost_batch_compare);
+        ctx->accessed_bos = _mesa_hash_table_create(ctx, _mesa_hash_pointer,
+                                                    _mesa_key_pointer_equal);
 }
diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h
index 88f1e4620fd0..63813dff652d 100644
--- a/src/gallium/drivers/panfrost/pan_job.h
+++ b/src/gallium/drivers/panfrost/pan_job.h
@@ -153,6 +153,9 @@  struct panfrost_batch {
 
         /* Output sync object. Only valid when submitted is true. */
         struct panfrost_batch_fence *out_sync;
+
+        /* Batch dependencies */
+        struct util_dynarray dependencies;
 };
 
 /* Functions for managing the above */

Comments

R-b. nice work!

On Wed, Sep 18, 2019 at 03:24:28PM +0200, Boris Brezillon wrote:
> The idea is to track which BO are being accessed and the type of access
> to determine when a dependency exists. Thanks to that we can build a
> dependency graph that will allow us to flush batches in the correct
> order.
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> ---
> Changes in v3:
> * Fix coding style issues
> * Do not check for batch presence in the reader array when updating
>   a BO access (we already have this information)
> * Add more comments to explain what we're doing and why we're doing
>   it like that
> ---
>  src/gallium/drivers/panfrost/pan_context.h |   3 +
>  src/gallium/drivers/panfrost/pan_job.c     | 355 ++++++++++++++++++++-
>  src/gallium/drivers/panfrost/pan_job.h     |   3 +
>  3 files changed, 356 insertions(+), 5 deletions(-)
> 
> diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
> index ce3e0c899a4f..3b09952345cf 100644
> --- a/src/gallium/drivers/panfrost/pan_context.h
> +++ b/src/gallium/drivers/panfrost/pan_context.h
> @@ -114,6 +114,9 @@ struct panfrost_context {
>          struct panfrost_batch *batch;
>          struct hash_table *batches;
>  
> +        /* panfrost_bo -> panfrost_bo_access */
> +        struct hash_table *accessed_bos;
> +
>          /* Within a launch_grid call.. */
>          const struct pipe_grid_info *compute_grid;
>  
> diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c
> index 872c846207bf..b0494af3482f 100644
> --- a/src/gallium/drivers/panfrost/pan_job.c
> +++ b/src/gallium/drivers/panfrost/pan_job.c
> @@ -36,6 +36,29 @@
>  #include "pan_util.h"
>  #include "pandecode/decode.h"
>  
> +/* panfrost_bo_access is here to help us keep track of batch accesses to BOs
> + * and build a proper dependency graph such that batches can be pipelined for
> + * better GPU utilization.
> + *
> + * Each accessed BO has a corresponding entry in the ->accessed_bos hash table.
> + * A BO is either being written or read at any time, that's what the type field
> + * encodes.
> + * When the last access is a write, the batch writing the BO might have read
> + * dependencies (readers that have not been executed yet and want to read the
> + * previous BO content), and when the last access is a read, all readers might
> + * depend on another batch to push its results to memory. That's what the
> + * readers/writers keep track off.
> + * There can only be one writer at any given time, if a new batch wants to
> + * write to the same BO, a dependency will be added between the new writer and
> + * the old writer (at the batch level), and panfrost_bo_access->writer will be
> + * updated to point to the new writer.
> + */
> +struct panfrost_bo_access {
> +        uint32_t type;
> +        struct util_dynarray readers;
> +        struct panfrost_batch_fence *writer;
> +};
> +
>  static struct panfrost_batch_fence *
>  panfrost_create_batch_fence(struct panfrost_batch *batch)
>  {
> @@ -92,6 +115,7 @@ panfrost_create_batch(struct panfrost_context *ctx,
>  
>          util_dynarray_init(&batch->headers, batch);
>          util_dynarray_init(&batch->gpu_headers, batch);
> +        util_dynarray_init(&batch->dependencies, batch);
>          batch->out_sync = panfrost_create_batch_fence(batch);
>          util_copy_framebuffer_state(&batch->key, key);
>  
> @@ -151,6 +175,11 @@ panfrost_free_batch(struct panfrost_batch *batch)
>          hash_table_foreach(batch->bos, entry)
>                  panfrost_bo_unreference((struct panfrost_bo *)entry->key);
>  
> +        util_dynarray_foreach(&batch->dependencies,
> +                              struct panfrost_batch_fence *, dep) {
> +                panfrost_batch_fence_unreference(*dep);
> +        }
> +
>          /* The out_sync fence lifetime is different from the the batch one
>           * since other batches might want to wait on a fence of already
>           * submitted/signaled batch. All we need to do here is make sure the
> @@ -164,6 +193,56 @@ panfrost_free_batch(struct panfrost_batch *batch)
>          ralloc_free(batch);
>  }
>  
> +#ifndef NDEBUG
> +static bool
> +panfrost_dep_graph_contains_batch(struct panfrost_batch *root,
> +                                  struct panfrost_batch *batch)
> +{
> +        if (!root)
> +                return false;
> +
> +        util_dynarray_foreach(&root->dependencies,
> +                              struct panfrost_batch_fence *, dep) {
> +                if ((*dep)->batch == batch ||
> +                    panfrost_dep_graph_contains_batch((*dep)->batch, batch))
> +                        return true;
> +        }
> +
> +        return false;
> +}
> +#endif
> +
> +static void
> +panfrost_batch_add_dep(struct panfrost_batch *batch,
> +                       struct panfrost_batch_fence *newdep)
> +{
> +        if (batch == newdep->batch)
> +                return;
> +
> +        /* We might want to turn ->dependencies into a set if the number of
> +         * deps turns out to be big enough to make this 'is dep already there'
> +         * search inefficient.
> +         */
> +        util_dynarray_foreach(&batch->dependencies,
> +                              struct panfrost_batch_fence *, dep) {
> +                if (*dep == newdep)
> +                        return;
> +        }
> +
> +        /* Make sure the dependency graph is acyclic. */
> +        assert(!panfrost_dep_graph_contains_batch(newdep->batch, batch));
> +
> +        panfrost_batch_fence_reference(newdep);
> +        util_dynarray_append(&batch->dependencies,
> +                             struct panfrost_batch_fence *, newdep);
> +
> +        /* We now have a batch depending on us, let's make sure new draw/clear
> +         * calls targeting the same FBO use a new batch object.
> +         */
> +        if (newdep->batch)
> +                panfrost_freeze_batch(newdep->batch);
> +}
> +
>  static struct panfrost_batch *
>  panfrost_get_batch(struct panfrost_context *ctx,
>                     const struct pipe_framebuffer_state *key)
> @@ -214,6 +293,216 @@ panfrost_get_batch_for_fbo(struct panfrost_context *ctx)
>          return batch;
>  }
>  
> +static bool
> +panfrost_batch_fence_is_signaled(struct panfrost_batch_fence *fence)
> +{
> +        if (fence->signaled)
> +                return true;
> +
> +        /* Batch has not been submitted yet. */
> +        if (fence->batch)
> +                return false;
> +
> +        int ret = drmSyncobjWait(pan_screen(fence->ctx->base.screen)->fd,
> +                                 &fence->syncobj, 1, 0, 0, NULL);
> +
> +        /* Cache whether the fence was signaled */
> +        fence->signaled = ret >= 0;
> +        return fence->signaled;
> +}
> +
> +static void
> +panfrost_bo_access_gc_fences(struct panfrost_context *ctx,
> +                             struct panfrost_bo_access *access,
> +			     const struct panfrost_bo *bo)
> +{
> +        if (access->writer && panfrost_batch_fence_is_signaled(access->writer)) {
> +                panfrost_batch_fence_unreference(access->writer);
> +                access->writer = NULL;
> +        }
> +
> +        unsigned nreaders = 0;
> +        util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *,
> +                              reader) {
> +                if (!(*reader))
> +                        continue;
> +
> +                if (panfrost_batch_fence_is_signaled(*reader)) {
> +                        panfrost_batch_fence_unreference(*reader);
> +                        *reader = NULL;
> +                } else {
> +                        nreaders++;
> +                }
> +        }
> +
> +        if (!nreaders)
> +                util_dynarray_clear(&access->readers);
> +}
> +
> +/* Collect signaled fences to keep the kernel-side syncobj-map small. The
> + * idea is to collect those signaled fences at the end of each flush_all
> + * call. This function is likely to collect only fences from previous
> + * batch flushes not the one that have just have just been submitted and
> + * are probably still in flight when we trigger the garbage collection.
> + * Anyway, we need to do this garbage collection at some point if we don't
> + * want the BO access map to keep invalid entries around and retain
> + * syncobjs forever.
> + */
> +static void
> +panfrost_gc_fences(struct panfrost_context *ctx)
> +{
> +        hash_table_foreach(ctx->accessed_bos, entry) {
> +                struct panfrost_bo_access *access = entry->data;
> +
> +                assert(access);
> +                panfrost_bo_access_gc_fences(ctx, access, entry->key);
> +                if (!util_dynarray_num_elements(&access->readers,
> +                                                struct panfrost_batch_fence *) &&
> +                    !access->writer)
> +                        _mesa_hash_table_remove(ctx->accessed_bos, entry);
> +        }
> +}
> +
> +#ifndef NDEBUG
> +static bool
> +panfrost_batch_in_readers(struct panfrost_batch *batch,
> +                          struct panfrost_bo_access *access)
> +{
> +        util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *,
> +                              reader) {
> +                if (*reader && (*reader)->batch == batch)
> +                        return true;
> +        }
> +
> +        return false;
> +}
> +#endif
> +
> +static void
> +panfrost_batch_update_bo_access(struct panfrost_batch *batch,
> +                                struct panfrost_bo *bo, uint32_t access_type,
> +                                bool already_accessed)
> +{
> +        struct panfrost_context *ctx = batch->ctx;
> +        struct panfrost_bo_access *access;
> +        uint32_t old_access_type;
> +        struct hash_entry *entry;
> +
> +        assert(access_type == PAN_BO_ACCESS_WRITE ||
> +               access_type == PAN_BO_ACCESS_READ);
> +
> +        entry = _mesa_hash_table_search(ctx->accessed_bos, bo);
> +        access = entry ? entry->data : NULL;
> +        if (access) {
> +                old_access_type = access->type;
> +        } else {
> +                access = rzalloc(ctx, struct panfrost_bo_access);
> +                util_dynarray_init(&access->readers, access);
> +                _mesa_hash_table_insert(ctx->accessed_bos, bo, access);
> +                /* We are the first to access this BO, let's initialize
> +                 * old_access_type to our own access type in that case.
> +                 */
> +                old_access_type = access_type;
> +                access->type = access_type;
> +        }
> +
> +        assert(access);
> +
> +        if (access_type == PAN_BO_ACCESS_WRITE &&
> +            old_access_type == PAN_BO_ACCESS_READ) {
> +                /* Previous access was a read and we want to write this BO.
> +                 * We first need to add explicit deps between our batch and
> +                 * the previous readers.
> +                 */
> +                util_dynarray_foreach(&access->readers,
> +                                      struct panfrost_batch_fence *, reader) {
> +                        /* We were already reading the BO, no need to add a dep
> +                         * on ourself (the acyclic check would complain about
> +                         * that).
> +                         */
> +                        if (!(*reader) || (*reader)->batch == batch)
> +                                continue;
> +
> +                        panfrost_batch_add_dep(batch, *reader);
> +                }
> +                panfrost_batch_fence_reference(batch->out_sync);
> +
> +                /* We now are the new writer. */
> +                access->writer = batch->out_sync;
> +                access->type = access_type;
> +
> +                /* Release the previous readers and reset the readers array. */
> +                util_dynarray_foreach(&access->readers,
> +                                      struct panfrost_batch_fence *,
> +                                      reader) {
> +                        if (!*reader)
> +                                continue;
> +                        panfrost_batch_fence_unreference(*reader);
> +                }
> +
> +                util_dynarray_clear(&access->readers);
> +        } else if (access_type == PAN_BO_ACCESS_WRITE &&
> +                   old_access_type == PAN_BO_ACCESS_WRITE) {
> +                /* Previous access was a write and we want to write this BO.
> +                 * First check if we were the previous writer, in that case
> +                 * there's nothing to do. Otherwise we need to add a
> +                 * dependency between the new writer and the old one.
> +                 */
> +		if (access->writer != batch->out_sync) {
> +                        if (access->writer) {
> +                                panfrost_batch_add_dep(batch, access->writer);
> +                                panfrost_batch_fence_unreference(access->writer);
> +                        }
> +                        panfrost_batch_fence_reference(batch->out_sync);
> +                        access->writer = batch->out_sync;
> +                }
> +        } else if (access_type == PAN_BO_ACCESS_READ &&
> +                   old_access_type == PAN_BO_ACCESS_WRITE) {
> +                /* Previous access was a write and we want to read this BO.
> +                 * First check if we were the previous writer, in that case
> +                 * we want to keep the access type unchanged, as a write is
> +                 * more constraining than a read.
> +                 */
> +                if (access->writer != batch->out_sync) {
> +                        /* Add a dependency on the previous writer. */
> +                        panfrost_batch_add_dep(batch, access->writer);
> +
> +                        /* The previous access was a write, there's no reason
> +                         * to have entries in the readers array.
> +                         */
> +                        assert(!util_dynarray_num_elements(&access->readers,
> +                                                           struct panfrost_batch_fence *));
> +
> +                        /* Add ourselves to the readers array. */
> +                        panfrost_batch_fence_reference(batch->out_sync);
> +                        util_dynarray_append(&access->readers,
> +                                             struct panfrost_batch_fence *,
> +                                             batch->out_sync);
> +                        access->type = PAN_BO_ACCESS_READ;
> +                }
> +        } else {
> +                /* We already accessed this BO before, so we should already be
> +                 * in the reader array.
> +                 */
> +                if (already_accessed) {
> +                        assert(panfrost_batch_in_readers(batch, access));
> +                        return;
> +                }
> +
> +                /* Previous access was a read and we want to read this BO.
> +                 * Add ourselves to the readers array and add a dependency on
> +                 * the previous writer if any.
> +                 */
> +                panfrost_batch_fence_reference(batch->out_sync);
> +                util_dynarray_append(&access->readers,
> +                                     struct panfrost_batch_fence *,
> +                                     batch->out_sync);
> +
> +                if (access->writer)
> +                        panfrost_batch_add_dep(batch, access->writer);
> +        }
> +}
> +
>  void
>  panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo,
>                        uint32_t flags)
> @@ -231,6 +520,10 @@ panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo,
>                  panfrost_bo_reference(bo);
>  	} else {
>                  old_flags = (uintptr_t)entry->data;
> +
> +                /* All batches have to agree on the shared flag. */
> +                assert((old_flags & PAN_BO_ACCESS_SHARED) ==
> +                       (flags & PAN_BO_ACCESS_SHARED));
>          }
>  
>          assert(entry);
> @@ -240,6 +533,25 @@ panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo,
>  
>          flags |= old_flags;
>          entry->data = (void *)(uintptr_t)flags;
> +
> +        /* If this is not a shared BO, we don't really care about dependency
> +         * tracking.
> +         */
> +        if (!(flags & PAN_BO_ACCESS_SHARED))
> +                return;
> +
> +        /* All dependencies should have been flushed before we execute the
> +         * wallpaper draw, so it should be harmless to skip the
> +         * update_bo_access() call.
> +         */
> +        if (batch == batch->ctx->wallpaper_batch)
> +                return;
> +
> +        /* Only pass R/W flags to the dep tracking logic. */
> +        assert(flags & PAN_BO_ACCESS_RW);
> +        flags = (flags & PAN_BO_ACCESS_WRITE) ?
> +                PAN_BO_ACCESS_WRITE : PAN_BO_ACCESS_READ;
> +        panfrost_batch_update_bo_access(batch, bo, flags, old_flags != 0);
>  }
>  
>  void panfrost_batch_add_fbo_bos(struct panfrost_batch *batch)
> @@ -459,15 +771,36 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
>          struct pipe_context *gallium = (struct pipe_context *) ctx;
>          struct panfrost_screen *screen = pan_screen(gallium->screen);
>          struct drm_panfrost_submit submit = {0,};
> -        uint32_t *bo_handles;
> +        uint32_t *bo_handles, *in_syncs = NULL;
> +        bool is_fragment_shader;
>          int ret;
>  
> -
> -        if (ctx->last_out_sync) {
> +        is_fragment_shader = (reqs & PANFROST_JD_REQ_FS) && batch->first_job.gpu;
> +        if (is_fragment_shader)
>                  submit.in_sync_count = 1;
> -                submit.in_syncs = (uintptr_t)&ctx->last_out_sync->syncobj;
> +        else
> +                submit.in_sync_count = util_dynarray_num_elements(&batch->dependencies,
> +                                                                  struct panfrost_batch_fence *);
> +
> +        if (submit.in_sync_count) {
> +                in_syncs = calloc(submit.in_sync_count, sizeof(*in_syncs));
> +                assert(in_syncs);
>          }
>  
> +        /* The fragment job always depends on the vertex/tiler job if there's
> +         * one
> +         */
> +        if (is_fragment_shader) {
> +                in_syncs[0] = batch->out_sync->syncobj;
> +        } else {
> +                unsigned int i = 0;
> +
> +                util_dynarray_foreach(&batch->dependencies,
> +                                      struct panfrost_batch_fence *, dep)
> +                        in_syncs[i++] = (*dep)->syncobj;
> +        }
> +
> +        submit.in_syncs = (uintptr_t)in_syncs;
>          submit.out_sync = batch->out_sync->syncobj;
>          submit.jc = first_job_desc;
>          submit.requirements = reqs;
> @@ -484,6 +817,7 @@ panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
>          submit.bo_handles = (u64) (uintptr_t) bo_handles;
>          ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit);
>          free(bo_handles);
> +        free(in_syncs);
>  
>          /* Release the last batch fence if any, and retain the new one */
>          if (ctx->last_out_sync)
> @@ -534,6 +868,13 @@ panfrost_batch_submit(struct panfrost_batch *batch)
>  {
>          assert(batch);
>  
> +        /* Submit the dependencies first. */
> +        util_dynarray_foreach(&batch->dependencies,
> +                              struct panfrost_batch_fence *, dep) {
> +                if ((*dep)->batch)
> +                        panfrost_batch_submit((*dep)->batch);
> +        }
> +
>          struct panfrost_context *ctx = batch->ctx;
>          int ret;
>  
> @@ -567,7 +908,6 @@ panfrost_batch_submit(struct panfrost_batch *batch)
>  
>  out:
>          panfrost_freeze_batch(batch);
> -        assert(!ctx->batch || batch == ctx->batch);
>  
>          /* We always stall the pipeline for correct results since pipelined
>           * rendering is quite broken right now (to be fixed by the panfrost_job
> @@ -579,6 +919,9 @@ out:
>                                 NULL);
>  
>          panfrost_free_batch(batch);
> +
> +        /* Collect batch fences before returning */
> +        panfrost_gc_fences(ctx);
>  }
>  
>  void
> @@ -785,4 +1128,6 @@ panfrost_batch_init(struct panfrost_context *ctx)
>          ctx->batches = _mesa_hash_table_create(ctx,
>                                                 panfrost_batch_hash,
>                                                 panfrost_batch_compare);
> +        ctx->accessed_bos = _mesa_hash_table_create(ctx, _mesa_hash_pointer,
> +                                                    _mesa_key_pointer_equal);
>  }
> diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h
> index 88f1e4620fd0..63813dff652d 100644
> --- a/src/gallium/drivers/panfrost/pan_job.h
> +++ b/src/gallium/drivers/panfrost/pan_job.h
> @@ -153,6 +153,9 @@ struct panfrost_batch {
>  
>          /* Output sync object. Only valid when submitted is true. */
>          struct panfrost_batch_fence *out_sync;
> +
> +        /* Batch dependencies */
> +        struct util_dynarray dependencies;
>  };
>  
>  /* Functions for managing the above */
> -- 
> 2.21.0