[v2,28/37] panfrost: Start tracking inter-batch dependencies

Submitted by Boris Brezillon on Sept. 16, 2019, 9:37 a.m.

Details

Message ID 20190916093715.32203-29-boris.brezillon@collabora.com
State New
Series "panfrost: Support batch pipelining" (rev 1) in Mesa

Commit Message

Boris Brezillon Sept. 16, 2019, 9:37 a.m.
The idea is to track which BOs are being accessed and the type of access,
so we can determine when a dependency exists. Thanks to that, we can build
a dependency graph that allows us to flush batches in the correct order.

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
---
 src/gallium/drivers/panfrost/pan_context.h |   3 +
 src/gallium/drivers/panfrost/pan_job.c     | 324 ++++++++++++++++++++-
 src/gallium/drivers/panfrost/pan_job.h     |   3 +
 3 files changed, 325 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/panfrost/pan_context.h b/src/gallium/drivers/panfrost/pan_context.h
index ce3e0c899a4f..3b09952345cf 100644
--- a/src/gallium/drivers/panfrost/pan_context.h
+++ b/src/gallium/drivers/panfrost/pan_context.h
@@ -114,6 +114,9 @@  struct panfrost_context {
         struct panfrost_batch *batch;
         struct hash_table *batches;
 
+        /* panfrost_bo -> panfrost_bo_access */
+        struct hash_table *accessed_bos;
+
         /* Within a launch_grid call.. */
         const struct pipe_grid_info *compute_grid;
 
diff --git a/src/gallium/drivers/panfrost/pan_job.c b/src/gallium/drivers/panfrost/pan_job.c
index ec397b855a69..04f4f22dab74 100644
--- a/src/gallium/drivers/panfrost/pan_job.c
+++ b/src/gallium/drivers/panfrost/pan_job.c
@@ -36,6 +36,29 @@ 
 #include "pan_util.h"
 #include "pandecode/decode.h"
 
+/* panfrost_bo_access is here to help us keep track of batch accesses to BOs
+ * and build a proper dependency graph such that batches can be pipelined for
+ * better GPU utilization.
+ *
+ * Each accessed BO has a corresponding entry in the ->accessed_bos hash table.
+ * A BO is either being written or read at any time, that's what the type field
+ * encodes.
+ * When the last access is a write, the batch writing the BO might have read
+ * dependencies (readers that have not been executed yet and want to read the
+ * previous BO content), and when the last access is a read, all readers might
+ * depend on another batch to push its results to memory. That's what the
+ * readers/writers keep track of.
+ * There can only be one writer at any given time, if a new batch wants to
+ * write to the same BO, a dependency will be added between the new writer and
+ * the old writer (at the batch level), and panfrost_bo_access->writer will be
+ * updated to point to the new writer.
+ */
+struct panfrost_bo_access {
+        uint32_t type;
+        struct util_dynarray readers;
+        struct panfrost_batch_fence *writer;
+};
+
 static struct panfrost_batch_fence *
 panfrost_create_batch_fence(struct panfrost_batch *batch)
 {
@@ -94,6 +117,7 @@  panfrost_create_batch(struct panfrost_context *ctx,
 
         util_dynarray_init(&batch->headers, batch);
         util_dynarray_init(&batch->gpu_headers, batch);
+        util_dynarray_init(&batch->dependencies, batch);
         batch->out_sync = panfrost_create_batch_fence(batch);
         util_copy_framebuffer_state(&batch->key, key);
 
@@ -153,6 +177,10 @@  panfrost_free_batch(struct panfrost_batch *batch)
         hash_table_foreach(batch->bos, entry)
                 panfrost_bo_unreference((struct panfrost_bo *)entry->key);
 
+        util_dynarray_foreach(&batch->dependencies,
+                              struct panfrost_batch_fence *, dep)
+                panfrost_batch_fence_unreference(*dep);
+
         /* The out_sync fence lifetime is different from the batch one
          * since other batches might want to wait on a fence of already
          * submitted/signaled batch. All we need to do here is make sure the
@@ -166,6 +194,52 @@  panfrost_free_batch(struct panfrost_batch *batch)
         ralloc_free(batch);
 }
 
+#ifndef NDEBUG
+static bool
+panfrost_dep_graph_contains_batch(struct panfrost_batch *root,
+                                  struct panfrost_batch *batch)
+{
+        if (!root)
+                return false;
+
+        util_dynarray_foreach(&root->dependencies,
+                              struct panfrost_batch_fence *, dep) {
+                if ((*dep)->batch == batch ||
+                    panfrost_dep_graph_contains_batch((*dep)->batch, batch))
+                        return true;
+        }
+
+        return false;
+}
+#endif
+
+static void
+panfrost_batch_add_dep(struct panfrost_batch *batch,
+                       struct panfrost_batch_fence *newdep)
+{
+        if (batch == newdep->batch)
+                return;
+
+        util_dynarray_foreach(&batch->dependencies,
+                              struct panfrost_batch_fence *, dep) {
+                if (*dep == newdep)
+                        return;
+        }
+
+        /* Make sure the dependency graph is acyclic. */
+        assert(!panfrost_dep_graph_contains_batch(newdep->batch, batch));
+
+        panfrost_batch_fence_reference(newdep);
+        util_dynarray_append(&batch->dependencies,
+                             struct panfrost_batch_fence *, newdep);
+
+        /* We now have a batch depending on us, let's make sure new draw/clear
+         * calls targeting the same FBO use a new batch object.
+         */
+        if (newdep->batch)
+                panfrost_freeze_batch(newdep->batch);
+}
+
 static struct panfrost_batch *
 panfrost_get_batch(struct panfrost_context *ctx,
                    const struct pipe_framebuffer_state *key)
@@ -216,6 +290,192 @@  panfrost_get_batch_for_fbo(struct panfrost_context *ctx)
         return batch;
 }
 
+static bool
+panfrost_batch_fence_is_signaled(struct panfrost_batch_fence *fence)
+{
+        if (fence->signaled)
+                return true;
+
+        /* Batch has not been submitted yet. */
+        if (fence->batch)
+                return false;
+
+        int ret = drmSyncobjWait(pan_screen(fence->ctx->base.screen)->fd,
+                                 &fence->syncobj, 1, 0, 0, NULL);
+
+        fence->signaled = ret >= 0;
+        return fence->signaled;
+}
+
+static void
+panfrost_bo_access_gc_fences(struct panfrost_context *ctx,
+                             struct panfrost_bo_access *access,
+			     const struct panfrost_bo *bo)
+{
+        if (access->writer && panfrost_batch_fence_is_signaled(access->writer)) {
+                panfrost_batch_fence_unreference(access->writer);
+                access->writer = NULL;
+	}
+
+        unsigned nreaders = 0;
+        util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *,
+                              reader) {
+                if (!*reader)
+                        continue;
+
+                if (panfrost_batch_fence_is_signaled(*reader)) {
+                        panfrost_batch_fence_unreference(*reader);
+                        *reader = NULL;
+                } else {
+                        nreaders++;
+                }
+        }
+
+        if (!nreaders)
+                util_dynarray_clear(&access->readers);
+}
+
+static void
+panfrost_gc_fences(struct panfrost_context *ctx)
+{
+        hash_table_foreach(ctx->accessed_bos, entry) {
+                struct panfrost_bo_access *access = entry->data;
+
+                assert(access);
+                panfrost_bo_access_gc_fences(ctx, access, entry->key);
+                if (!util_dynarray_num_elements(&access->readers,
+                                                struct panfrost_batch_fence *) &&
+                    !access->writer)
+                        _mesa_hash_table_remove(ctx->accessed_bos, entry);
+        }
+}
+
+static void
+panfrost_batch_update_bo_access(struct panfrost_batch *batch,
+                                struct panfrost_bo *bo, uint32_t access_type)
+{
+        struct panfrost_context *ctx = batch->ctx;
+        struct panfrost_bo_access *access;
+        uint32_t old_access_type;
+        struct hash_entry *entry;
+
+        assert(access_type == PAN_BO_GPU_ACCESS_WRITE ||
+               access_type == PAN_BO_GPU_ACCESS_READ);
+
+        entry = _mesa_hash_table_search(ctx->accessed_bos, bo);
+        access = entry ? entry->data : NULL;
+        if (access) {
+                old_access_type = access->type;
+        } else {
+                access = rzalloc(ctx, struct panfrost_bo_access);
+                util_dynarray_init(&access->readers, access);
+                _mesa_hash_table_insert(ctx->accessed_bos, bo, access);
+                /* We are the first to access this BO, let's initialize
+                 * old_access_type to our own access type in that case.
+                 */
+                old_access_type = access_type;
+                access->type = access_type;
+        }
+
+        assert(access);
+
+        if (access_type == PAN_BO_GPU_ACCESS_WRITE &&
+            old_access_type == PAN_BO_GPU_ACCESS_READ) {
+                /* Previous access was a read and we want to write this BO.
+                 * We first need to add explicit deps between our batch and
+                 * the previous readers.
+                 */
+                util_dynarray_foreach(&access->readers,
+                                      struct panfrost_batch_fence *, reader) {
+                        /* We were already reading the BO, no need to add a dep
+                         * on ourself (the acyclic check would complain about
+                         * that).
+                         */
+                        if (!*reader || (*reader)->batch == batch)
+                                continue;
+
+                        panfrost_batch_add_dep(batch, *reader);
+                }
+                panfrost_batch_fence_reference(batch->out_sync);
+
+                /* We now are the new writer. */
+                access->writer = batch->out_sync;
+                access->type = access_type;
+
+                /* Release the previous readers and reset the readers array. */
+                util_dynarray_foreach(&access->readers,
+                                      struct panfrost_batch_fence *,
+                                      reader) {
+                        if (!*reader)
+                                continue;
+                        panfrost_batch_fence_unreference(*reader);
+                }
+                util_dynarray_clear(&access->readers);
+        } else if (access_type == PAN_BO_GPU_ACCESS_WRITE &&
+                   old_access_type == PAN_BO_GPU_ACCESS_WRITE) {
+                /* Previous access was a write and we want to write this BO.
+                 * First check if we were the previous writer, in that case
+                 * there's nothing to do. Otherwise we need to add a
+                 * dependency between the new writer and the old one.
+                 */
+		if (access->writer != batch->out_sync) {
+                        if (access->writer) {
+                                panfrost_batch_add_dep(batch, access->writer);
+                                panfrost_batch_fence_unreference(access->writer);
+                        }
+                        panfrost_batch_fence_reference(batch->out_sync);
+                        access->writer = batch->out_sync;
+                }
+        } else if (access_type == PAN_BO_GPU_ACCESS_READ &&
+                   old_access_type == PAN_BO_GPU_ACCESS_WRITE) {
+                /* Previous access was a write and we want to read this BO.
+                 * First check if we were the previous writer, in that case
+                 * we want to keep the access type unchanged, as a write is
+                 * more constraining than a read.
+                 */
+                if (access->writer != batch->out_sync) {
+                        /* Add a dependency on the previous writer. */
+                        panfrost_batch_add_dep(batch, access->writer);
+
+                        /* The previous access was a write, there's no reason
+                         * to have entries in the readers array.
+                         */
+                        assert(!util_dynarray_num_elements(&access->readers,
+                                                           struct panfrost_batch_fence *));
+
+                        /* Add ourselves to the readers array. */
+                        panfrost_batch_fence_reference(batch->out_sync);
+                        util_dynarray_append(&access->readers,
+                                             struct panfrost_batch_fence *,
+                                             batch->out_sync);
+                        access->type = access_type;
+                }
+        } else {
+                /* Previous access was a read and we want to read this BO.
+                 * Add ourselves to the readers array if we're not already
+                 * there and add a dependency on the previous writer (if any).
+                 */
+	        bool already_there = false;
+
+                util_dynarray_foreach(&access->readers,
+                                      struct panfrost_batch_fence *, reader) {
+                        if (*reader && (*reader)->batch == batch) {
+                                already_there = true;
+                                break;
+                        }
+                }
+
+                if (!already_there) {
+                        panfrost_batch_fence_reference(batch->out_sync);
+                        util_dynarray_append(&access->readers,
+                                             struct panfrost_batch_fence *,
+                                             batch->out_sync);
+                        if (access->writer)
+                                panfrost_batch_add_dep(batch, access->writer);
+                }
+        }
+}
+
 void
 panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo,
                       uint32_t flags)
@@ -233,6 +493,10 @@  panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo,
                 panfrost_bo_reference(bo);
 	} else {
                 old_flags = (uintptr_t)entry->data;
+
+                /* All batches have to agree on the shared flag. */
+                assert((old_flags & PAN_BO_GPU_ACCESS_SHARED) ==
+                       (flags & PAN_BO_GPU_ACCESS_SHARED));
         }
 
         assert(entry);
@@ -242,6 +506,25 @@  panfrost_batch_add_bo(struct panfrost_batch *batch, struct panfrost_bo *bo,
 
         flags |= old_flags;
         entry->data = (void *)(uintptr_t)flags;
+
+        /* If this is not a shared BO, we don't really care about dependency
+         * tracking.
+         */
+        if (!(flags & PAN_BO_GPU_ACCESS_SHARED))
+                return;
+
+        /* All dependencies should have been flushed before we execute the
+         * wallpaper draw, so it should be harmless to skip the
+         * update_bo_access() call.
+         */
+        if (batch == batch->ctx->wallpaper_batch)
+                return;
+
+        /* Only pass R/W flags to the dep tracking logic. */
+        assert(flags & PAN_BO_GPU_ACCESS_RW);
+        flags = (flags & PAN_BO_GPU_ACCESS_WRITE) ?
+                PAN_BO_GPU_ACCESS_WRITE : PAN_BO_GPU_ACCESS_READ;
+        panfrost_batch_update_bo_access(batch, bo, flags);
 }
 
 void panfrost_batch_add_fbo_bos(struct panfrost_batch *batch)
@@ -461,15 +744,34 @@  panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
         struct pipe_context *gallium = (struct pipe_context *) ctx;
         struct panfrost_screen *screen = pan_screen(gallium->screen);
         struct drm_panfrost_submit submit = {0,};
-        uint32_t *bo_handles;
+        uint32_t *bo_handles, *in_syncs = NULL;
         int ret;
 
-
-        if (ctx->last_out_sync) {
+        if (reqs & PANFROST_JD_REQ_FS && batch->first_job.gpu)
                 submit.in_sync_count = 1;
-                submit.in_syncs = (uintptr_t)&ctx->last_out_sync->syncobj;
+        else
+                submit.in_sync_count = util_dynarray_num_elements(&batch->dependencies,
+                                                                  struct panfrost_batch_fence *);
+
+        if (submit.in_sync_count) {
+                in_syncs = calloc(submit.in_sync_count, sizeof(*in_syncs));
+                assert(in_syncs);
         }
 
+        /* The fragment job always depends on the vertex/tiler job if there's
+         * one
+         */
+        if (reqs & PANFROST_JD_REQ_FS && batch->first_job.gpu) {
+                in_syncs[0] = batch->out_sync->syncobj;
+        } else {
+                unsigned int i = 0;
+
+                util_dynarray_foreach(&batch->dependencies,
+                                      struct panfrost_batch_fence *, dep)
+                        in_syncs[i++] = (*dep)->syncobj;
+        }
+
+        submit.in_syncs = (uintptr_t)in_syncs;
         submit.out_sync = batch->out_sync->syncobj;
         submit.jc = first_job_desc;
         submit.requirements = reqs;
@@ -493,6 +795,7 @@  panfrost_batch_submit_ioctl(struct panfrost_batch *batch,
         submit.bo_handles = (u64) (uintptr_t) bo_handles;
         ret = drmIoctl(screen->fd, DRM_IOCTL_PANFROST_SUBMIT, &submit);
         free(bo_handles);
+        free(in_syncs);
 
         /* Release the last batch fence if any, and retain the new one */
         if (ctx->last_out_sync)
@@ -543,6 +846,13 @@  panfrost_batch_submit(struct panfrost_batch *batch)
 {
         assert(batch);
 
+        /* Submit the dependencies first. */
+        util_dynarray_foreach(&batch->dependencies,
+                              struct panfrost_batch_fence *, dep) {
+                if ((*dep)->batch)
+                        panfrost_batch_submit((*dep)->batch);
+        }
+
         struct panfrost_context *ctx = batch->ctx;
         int ret;
 
@@ -562,7 +872,6 @@  panfrost_batch_submit(struct panfrost_batch *batch)
 
 out:
         panfrost_freeze_batch(batch);
-        assert(!ctx->batch || batch == ctx->batch);
 
         /* We always stall the pipeline for correct results since pipelined
          * rendering is quite broken right now (to be fixed by the panfrost_job
@@ -571,6 +880,9 @@  out:
         drmSyncobjWait(pan_screen(ctx->base.screen)->fd,
                        &batch->out_sync->syncobj, 1, INT64_MAX, 0, NULL);
         panfrost_free_batch(batch);
+
+        /* Collect batch fences before returning */
+        panfrost_gc_fences(ctx);
 }
 
 void
@@ -777,4 +1089,6 @@  panfrost_batch_init(struct panfrost_context *ctx)
         ctx->batches = _mesa_hash_table_create(ctx,
                                                panfrost_batch_hash,
                                                panfrost_batch_compare);
+        ctx->accessed_bos = _mesa_hash_table_create(ctx, _mesa_hash_pointer,
+                                                    _mesa_key_pointer_equal);
 }
diff --git a/src/gallium/drivers/panfrost/pan_job.h b/src/gallium/drivers/panfrost/pan_job.h
index 32bfc1fe3388..faf8cb06eebc 100644
--- a/src/gallium/drivers/panfrost/pan_job.h
+++ b/src/gallium/drivers/panfrost/pan_job.h
@@ -153,6 +153,9 @@  struct panfrost_batch {
 
         /* Output sync object. Only valid when submitted is true. */
         struct panfrost_batch_fence *out_sync;
+
+        /* Batch dependencies */
+        struct util_dynarray dependencies;
 };
 
 /* Functions for managing the above */

Comments

> + * A BO is either being written or read at any time, that's what the type field
> + * encodes.

Might this be inferred from (writer != NULL) and (readers->length > 0)?

> +        util_dynarray_foreach(&batch->dependencies,
> +                              struct panfrost_batch_fence *, dep)
> +                panfrost_batch_fence_unreference(*dep);
> +

Nit: Wrap in { braces } for 2+ lines for clarity

> +static void
> +panfrost_batch_add_dep(struct panfrost_batch *batch,
> +                       struct panfrost_batch_fence *newdep)
> +{
> +        if (batch == newdep->batch)
> +                return;
> +
> +        util_dynarray_foreach(&batch->dependencies,
> +                              struct panfrost_batch_fence *, dep) {
> +                if (*dep == newdep)
> +                        return;
> +        }
> +
> +        /* Make sure the dependency graph is acyclic. */
> +        assert(!panfrost_dep_graph_contains_batch(newdep->batch, batch));
> +
> +        panfrost_batch_fence_reference(newdep);
> +        util_dynarray_append(&batch->dependencies,
> +                             struct panfrost_batch_fence *, newdep);
> +
> +        /* We now have a batch depending on us, let's make sure new draw/clear
> +         * calls targeting the same FBO use a new batch object.
> +         */
> +        if (newdep->batch)
> +                panfrost_freeze_batch(newdep->batch);
> +}

I'm wondering if batch->dependencies should be expressed as a set,
rather than a dynarray, such that testing whether a batch has a
given dependency is ideally O(1), not O(N).

In practice I don't know if the asymptotic complexity matters, esp. for
small numbers of batches, and in practice hash table iteration is slow
enough in mesa* that maybe it would be counterproductive.

Just something I thought I might throw out there.

* Set iteration ought to be no slower than array iteration, but constant
  factors are a thing.
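
For illustration, here is a minimal sketch of what a set-backed dependency
list could look like using Mesa's util/set helpers. The dep_set field and
the function below are hypothetical, not part of this patch:

#include "util/set.h"

/* Hypothetical field replacing the dependencies dynarray, created with
 * batch->dep_set = _mesa_pointer_set_create(batch);
 */
static void
panfrost_batch_add_dep_set(struct panfrost_batch *batch,
                           struct panfrost_batch_fence *newdep)
{
        if (batch == newdep->batch)
                return;

        /* Constant-time duplicate check instead of walking a dynarray.
         * (The acyclic-graph assert from the patch is omitted for brevity.)
         */
        if (_mesa_set_search(batch->dep_set, newdep))
                return;

        panfrost_batch_fence_reference(newdep);
        _mesa_set_add(batch->dep_set, newdep);

        /* Same freeze logic as in the patch. */
        if (newdep->batch)
                panfrost_freeze_batch(newdep->batch);
}

Iteration would then go through set_foreach(batch->dep_set, entry) instead
of util_dynarray_foreach(), with entry->key cast back to a
struct panfrost_batch_fence pointer.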

> +        int ret = drmSyncobjWait(pan_screen(fence->ctx->base.screen)->fd,
> +                                 &fence->syncobj, 1, 0, 0, NULL);
> +
> +        fence->signaled = ret >= 0;
> +        return fence->signaled;
> +}

Nit: Add a "/* Cache whether the fence was signaled */" comment?

> +static void
> +panfrost_bo_access_gc_fences(struct panfrost_context *ctx,
> +                             struct panfrost_bo_access *access,
> +			     const struct panfrost_bo *bo)

Could you remind me what gc abbreviates? Sorry.

I'm a little unclear on what the function's purpose is based on the
name.

> +{
> +        if (access->writer && panfrost_batch_fence_is_signaled(access->writer)) {
> +                panfrost_batch_fence_unreference(access->writer);
> +                access->writer = NULL;
> +	}

Spacing.

> +
> +        unsigned nreaders = 0;
> +        util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *,
> +                              reader) {
> +                if (!*reader)
> +                        continue;

Please add parens (!(*reader)) for clarity.

> +static void
> +panfrost_gc_fences(struct panfrost_context *ctx)
> +{
> +        hash_table_foreach(ctx->accessed_bos, entry) {
> +                struct panfrost_bo_access *access = entry->data;
> +
> +                assert(access);
> +                panfrost_bo_access_gc_fences(ctx, access, entry->key);
> +                if (!util_dynarray_num_elements(&access->readers,
> +                                                struct panfrost_batch_fence *) &&
> +                    !access->writer)
> +                        _mesa_hash_table_remove(ctx->accessed_bos, entry);
> +        }
> +}

Question: is it safe to remove entries while iterating the table?
(Not a review comment, I don't know the details of mesa's
implementation)

Also not clear what panfrost_gc_fences is for.

> +static void
> +panfrost_batch_update_bo_access(struct panfrost_batch *batch,
> +                                struct panfrost_bo *bo, uint32_t access_type)
> +{
> +        struct panfrost_context *ctx = batch->ctx;
> +        struct panfrost_bo_access *access;
> +        uint32_t old_access_type;
> +        struct hash_entry *entry;
> +
> +        assert(access_type == PAN_BO_GPU_ACCESS_WRITE ||
> +               access_type == PAN_BO_GPU_ACCESS_READ);
> +
> +        entry = _mesa_hash_table_search(ctx->accessed_bos, bo);
> +        access = entry ? entry->data : NULL;
> +        if (access) {
> +                old_access_type = access->type;
> +        } else {
> +                access = rzalloc(ctx, struct panfrost_bo_access);
> +                util_dynarray_init(&access->readers, access);
> +                _mesa_hash_table_insert(ctx->accessed_bos, bo, access);
> +                /* We are the first to access this BO, let's initialize
> +                 * old_access_type to our own access type in that case.
> +                 */
> +                old_access_type = access_type;
> +                access->type = access_type;
> +        }
> +
> +        assert(access);
> +
> +        if (access_type == PAN_BO_GPU_ACCESS_WRITE &&
> +            old_access_type == PAN_BO_GPU_ACCESS_READ) {
> +                /* Previous access was a read and we want to write this BO.
> +                 * We first need to add explicit deps between our batch and
> +                 * the previous readers.
> +                 */
> +                util_dynarray_foreach(&access->readers,
> +                                      struct panfrost_batch_fence *, reader) {
> +                        /* We were already reading the BO, no need to add a dep
> +                         * on ourself (the acyclic check would complain about
> +                         * that).
> +                         */
> +                        if (!*reader || (*reader)->batch == batch)

!(*reader)

> +                                continue;
> +
> +                        panfrost_batch_add_dep(batch, *reader);
> +                }
> +                panfrost_batch_fence_reference(batch->out_sync);

Nit: Add a blank line between
> +        } else if (access_type == PAN_BO_GPU_ACCESS_READ &&
> +                   old_access_type == PAN_BO_GPU_ACCESS_WRITE) {
> +                /* Previous access was a write and we want to read this BO.
> +                 * First check if we were the previous writer, in that case
> +                 * we want to keep the access type unchanged, as a write is
> +                 * more constraining than a read.
> +                 */
> +                if (access->writer != batch->out_sync) {
> +                        /* Add a dependency on the previous writer. */
> +                        panfrost_batch_add_dep(batch, access->writer);
> +
> +                        /* The previous access was a write, there's no reason
> +                         * to have entries in the readers array.
> +                         */
> +                        assert(!util_dynarray_num_elements(&access->readers,
> +                                                           struct panfrost_batch_fence *));
> +
> +                        /* Add ourselves to the readers array. */
> +                        panfrost_batch_fence_reference(batch->out_sync);
> +                        util_dynarray_append(&access->readers,
> +                                             struct panfrost_batch_fence *,
> +                                             batch->out_sync);
> +                        access->type = access_type;

I would prefer an explicit `access->type = PAN_BO_GPU_ACCESS_READ`, so
the parallelism with `readers` is visually clear without needing to
glance back up at the condition.

> +                }
> +        } else {
> +                /* Previous access was a read and we want to read this BO.
> +                 * Add ourselves to the readers array if we're not already
> +                 * there and add a dependency on the previous writer (if any).
> +                 */
> +	        bool already_there = false;

Spacing.

> +                util_dynarray_foreach(&access->readers,
> +                                      struct panfrost_batch_fence *, reader) {
> +                        if (*reader && (*reader)->batch == batch) {
> +                                already_there = true;
> +                                break;
> +                        }
> +                }

This raises the point that access->readers should probably also be a
set, not an array, but again, not sure how much it matters.

> -        if (ctx->last_out_sync) {
> +        if (reqs & PANFROST_JD_REQ_FS && batch->first_job.gpu)

Could we add a `bool is_fragment` with this condition and then make this
as well as the if below simple if (is_fragment)?

> +        /* Submit the dependencies first. */
> +        util_dynarray_foreach(&batch->dependencies,
> +                              struct panfrost_batch_fence *, dep) {
> +                if ((*dep)->batch)
> +                        panfrost_batch_submit((*dep)->batch);
> +        }

I assume this logic will be refined later in the series...?
On Mon, 16 Sep 2019 16:15:14 -0400
Alyssa Rosenzweig <alyssa@rosenzweig.io> wrote:

> > + * A BO is either being written or read at any time, that's what the type field
> > + * encodes.  
> 
> Might this be inferred from (writer != NULL) and (readers->length > 0)?

No, I need to keep the old writer around when the new access is a
read because all subsequent reads will need to have the reader -> writer
dep defined. 

> 
> > +        util_dynarray_foreach(&batch->dependencies,
> > +                              struct panfrost_batch_fence *, dep)
> > +                panfrost_batch_fence_unreference(*dep);
> > +  
> 
> Nit: Wrap in { braces } for 2+ lines for clarity
> 
> > +static void
> > +panfrost_batch_add_dep(struct panfrost_batch *batch,
> > +                       struct panfrost_batch_fence *newdep)
> > +{
> > +        if (batch == newdep->batch)
> > +                return;
> > +
> > +        util_dynarray_foreach(&batch->dependencies,
> > +                              struct panfrost_batch_fence *, dep) {
> > +                if (*dep == newdep)
> > +                        return;
> > +        }
> > +
> > +        /* Make sure the dependency graph is acyclic. */
> > +        assert(!panfrost_dep_graph_contains_batch(newdep->batch, batch));
> > +
> > +        panfrost_batch_fence_reference(newdep);
> > +        util_dynarray_append(&batch->dependencies,
> > +                             struct panfrost_batch_fence *, newdep);
> > +
> > +        /* We now have a batch depending on us, let's make sure new draw/clear
> > +         * calls targeting the same FBO use a new batch object.
> > +         */
> > +        if (newdep->batch)
> > +                panfrost_freeze_batch(newdep->batch);
> > +}  
> 
> I'm wondering if batch->dependencies should be expressed as a set,
> rather than a dynarray, such that testing whether a batch has a
> given dependency is ideally O(1), not O(N).
> 
> In practice I don't know if the asymptotic complexity matters, esp. for
> small numbers of batches, and in practice hash table iteration is slow
> enough in mesa* that maybe it would be counterproductive.
> 
> Just something I thought I might throw out there.
> 
> * Set iteration ought to be no slower than array iteration, but constant
>   factors are a thing.

I thought the number of deps would be small enough to not justify the
use of a set, but maybe I'm wrong.

> 
> > +        int ret = drmSyncobjWait(pan_screen(fence->ctx->base.screen)->fd,
> > +                                 &fence->syncobj, 1, 0, 0, NULL);
> > +
> > +        fence->signaled = ret >= 0;
> > +        return fence->signaled;
> > +}  
> 
> Nit: Add a "/* Cache whether the fence was signaled */" comment?

Sure.

> 
> > +static void
> > +panfrost_bo_access_gc_fences(struct panfrost_context *ctx,
> > +                             struct panfrost_bo_access *access,
> > +			     const struct panfrost_bo *bo)  
> 
> Could you remind me what gc abbreviates? Sorry.

Garbage collect.

> 
> I'm a little unclear on what the function's purpose is based on the
> name.

Collect signaled fences to keep the kernel-side syncobj-map small. The
idea is to collect those signaled fences at the end of each flush_all
call. This function is likely to collect only fences from previous
batch flushes, not the ones that have just been submitted and are
probably still in flight when we trigger the garbage collection.
Anyway, we need to do this garbage collection at some point if we don't
want the BO access map to keep invalid entries around and retain
syncobjs forever.

> 
> > +{
> > +        if (access->writer && panfrost_batch_fence_is_signaled(access->writer)) {
> > +                panfrost_batch_fence_unreference(access->writer);
> > +                access->writer = NULL;
> > +	}  
> 
> Spacing.
> 
> > +
> > +        unsigned nreaders = 0;
> > +        util_dynarray_foreach(&access->readers, struct panfrost_batch_fence *,
> > +                              reader) {
> > +                if (!*reader)
> > +                        continue;  
> 
> Please add parens (!(*reader)) for clarity.
> 
> > +static void
> > +panfrost_gc_fences(struct panfrost_context *ctx)
> > +{
> > +        hash_table_foreach(ctx->accessed_bos, entry) {
> > +                struct panfrost_bo_access *access = entry->data;
> > +
> > +                assert(access);
> > +                panfrost_bo_access_gc_fences(ctx, access, entry->key);
> > +                if (!util_dynarray_num_elements(&access->readers,
> > +                                                struct panfrost_batch_fence *) &&
> > +                    !access->writer)
> > +                        _mesa_hash_table_remove(ctx->accessed_bos, entry);
> > +        }
> > +}  
> 
> Question: is it safe to remove entries while iterating the table?
> (Not a review comment, I don't know the details of mesa's
> implementation)

According to the doc, it is.

> 
> Also not clear what panfrost_gc_fences is for.

See above.

> 
> > +static void
> > +panfrost_batch_update_bo_access(struct panfrost_batch *batch,
> > +                                struct panfrost_bo *bo, uint32_t access_type)
> > +{
> > +        struct panfrost_context *ctx = batch->ctx;
> > +        struct panfrost_bo_access *access;
> > +        uint32_t old_access_type;
> > +        struct hash_entry *entry;
> > +
> > +        assert(access_type == PAN_BO_GPU_ACCESS_WRITE ||
> > +               access_type == PAN_BO_GPU_ACCESS_READ);
> > +
> > +        entry = _mesa_hash_table_search(ctx->accessed_bos, bo);
> > +        access = entry ? entry->data : NULL;
> > +        if (access) {
> > +                old_access_type = access->type;
> > +        } else {
> > +                access = rzalloc(ctx, struct panfrost_bo_access);
> > +                util_dynarray_init(&access->readers, access);
> > +                _mesa_hash_table_insert(ctx->accessed_bos, bo, access);
> > +                /* We are the first to access this BO, let's initialize
> > +                 * old_access_type to our own access type in that case.
> > +                 */
> > +                old_access_type = access_type;
> > +                access->type = access_type;
> > +        }
> > +
> > +        assert(access);
> > +
> > +        if (access_type == PAN_BO_GPU_ACCESS_WRITE &&
> > +            old_access_type == PAN_BO_GPU_ACCESS_READ) {
> > +                /* Previous access was a read and we want to write this BO.
> > +                 * We first need to add explicit deps between our batch and
> > +                 * the previous readers.
> > +                 */
> > +                util_dynarray_foreach(&access->readers,
> > +                                      struct panfrost_batch_fence *, reader) {
> > +                        /* We were already reading the BO, no need to add a dep
> > +                         * on ourself (the acyclic check would complain about
> > +                         * that).
> > +                         */
> > +                        if (!*reader || (*reader)->batch == batch)  
> 
> !(*reader)
> 
> > +                                continue;
> > +
> > +                        panfrost_batch_add_dep(batch, *reader);
> > +                }
> > +                panfrost_batch_fence_reference(batch->out_sync);  
> 
> Nit: Add a blank line between
> > +        } else if (access_type == PAN_BO_GPU_ACCESS_READ &&
> > +                   old_access_type == PAN_BO_GPU_ACCESS_WRITE) {
> > +                /* Previous access was a write and we want to read this BO.
> > +                 * First check if we were the previous writer, in that case
> > +                 * we want to keep the access type unchanged, as a write is
> > +                 * more constraining than a read.
> > +                 */
> > +                if (access->writer != batch->out_sync) {
> > +                        /* Add a dependency on the previous writer. */
> > +                        panfrost_batch_add_dep(batch, access->writer);
> > +
> > +                        /* The previous access was a write, there's no reason
> > +                         * to have entries in the readers array.
> > +                         */
> > +                        assert(!util_dynarray_num_elements(&access->readers,
> > +                                                           struct panfrost_batch_fence *));
> > +
> > +                        /* Add ourselves to the readers array. */
> > +                        panfrost_batch_fence_reference(batch->out_sync);
> > +                        util_dynarray_append(&access->readers,
> > +                                             struct panfrost_batch_fence *,
> > +                                             batch->out_sync);
> > +                        access->type = access_type;  
> 
> I would prefer an explicit `access->type = PAN_BO_GPU_ACCESS_READ`, so
> the parallelism with `readers` is visually clear without needing to
> glance back up at the condition.

I can do that.

> 
> > +                }
> > +        } else {
> > +                /* Previous access was a read and we want to read this BO.
> > +                 * Add ourselves to the readers array if we're not already
> > +                 * there and add a dependency on the previous writer (if any).
> > +                 */
> > +	        bool already_there = false;  
> 
> Spacing.
> 
> > +                util_dynarray_foreach(&access->readers,
> > +                                      struct panfrost_batch_fence *, reader) {
> > +                        if (*reader && (*reader)->batch == batch) {
> > +                                already_there = true;
> > +                                break;
> > +                        }
> > +                }  
> 
> This raises the point that access->readers should probably also be a
> set, not an array, but again, not sure how much it matters.

Yep, same reply: I was expecting the number of entries to be rather
small, but that's something we can easily change.

> 
> > -        if (ctx->last_out_sync) {
> > +        if (reqs & PANFROST_JD_REQ_FS && batch->first_job.gpu)  
> 
> Could we add a `bool is_fragment` with this condition and then make this
> as well as the if below simple if (is_fragment)?

Hm, the condition is actually is_fragment_and_has_draws, so I'm not
sure is_fragment is a good name. Maybe depends_on_vertex_tiler.

> 
> > +        /* Submit the dependencies first. */
> > +        util_dynarray_foreach(&batch->dependencies,
> > +                              struct panfrost_batch_fence *, dep) {
> > +                if ((*dep)->batch)
> > +                        panfrost_batch_submit((*dep)->batch);
> > +        }  
> 
> I assume this logic will be refined later in the series...?

It's not, actually. Why do you think it should?
> > I'm wondering if batch->dependencies should be expressed as a set,
> > rather than a dynarray, such that testing whether a batch has a
> > given dependency is ideally O(1), not O(N).
> > 
> > In practice I don't know if the asymptotic complexity matters, esp. for
> > small numbers of batches, and in practice hash table iteration is slow
> > enough in mesa* that maybe it would be counterproductive.
> > 
> > Just something I thought I might throw out there.
> > 
> > * Set iteration ought to be no slower than array iteration, but constant
> >   factors are a thing.
> 
> I thought the number of deps would be small enough to not justify the
> use of a set, but maybe I'm wrong.

Mm, after all this lands we can profile and revisit, there are bigger
fish to fry.

> > > +static void
> > > +panfrost_bo_access_gc_fences(struct panfrost_context *ctx,
> > > +                             struct panfrost_bo_access *access,
> > > +			     const struct panfrost_bo *bo)  
> > 
> > Could you remind me what gc abbreviates? Sorry.
> 
> Garbage collect.
> 
> > 
> > I'm a little unclear on what the function's purpose is based on the
> > name.
> 
> Collect signaled fences to keep the kernel-side syncobj-map small. The
> idea is to collect those signaled fences at the end of each flush_all
> call. This function is likely to collect only fences from previous
> batch flushes, not the ones that have just been submitted and are
> probably still in flight when we trigger the garbage collection.
> Anyway, we need to do this garbage collection at some point if we don't
> want the BO access map to keep invalid entries around and retain
> syncobjs forever.

This definitely needs to be documented somewhere, then. Maybe paste the
"Collect...forever" paragraph into a comment above access_gc_fences?

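For reference, the suggested comment could read roughly like this above the
helper, paraphrasing the paragraph above (placement and wording are only a
suggestion, not part of the patch as posted):

/* Garbage collect fences: drop references to fences that have already
 * signaled, so the kernel-side syncobj map stays small and the BO access
 * map does not keep stale entries (and the syncobjs they retain) around
 * forever. Called at the end of each flush_all; it will mostly collect
 * fences from previous flushes, since batches that were just submitted
 * are likely still in flight.
 */
static void
panfrost_bo_access_gc_fences(struct panfrost_context *ctx,
                             struct panfrost_bo_access *access,
                             const struct panfrost_bo *bo)
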
> > Question: is it safe to remove entries while iterating the table?
> > (Not a review comment, I don't know the details of mesa's
> > implementation)
> 
> According to the doc, it is.

Good to know, thank you.

> > > -        if (ctx->last_out_sync) {
> > > +        if (reqs & PANFROST_JD_REQ_FS && batch->first_job.gpu)  
> > 
> > Could we add a `bool is_fragment` with this condition and then make this
> > as well as the if below simple if (is_fragment)?
> 
> Hm, the condition is actually is_fragment_and_has_draws, so I'm not
> sure is_fragment is a good name. Maybe depends_on_vertex_tiler.

is_fragment_shader?
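
For concreteness, one possible shape of that refactor inside
panfrost_batch_submit_ioctl, using the depends_on_vertex_tiler name floated
above (purely illustrative, not what the patch currently does):

bool depends_on_vertex_tiler = (reqs & PANFROST_JD_REQ_FS) &&
                               batch->first_job.gpu;

if (depends_on_vertex_tiler)
        submit.in_sync_count = 1;
else
        submit.in_sync_count = util_dynarray_num_elements(&batch->dependencies,
                                                          struct panfrost_batch_fence *);

/* ... and later, when filling in_syncs ... */
if (depends_on_vertex_tiler) {
        in_syncs[0] = batch->out_sync->syncobj;
} else {
        unsigned int i = 0;

        util_dynarray_foreach(&batch->dependencies,
                              struct panfrost_batch_fence *, dep)
                in_syncs[i++] = (*dep)->syncobj;
}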

> > > +        /* Submit the dependencies first. */
> > > +        util_dynarray_foreach(&batch->dependencies,
> > > +                              struct panfrost_batch_fence *, dep) {
> > > +                if ((*dep)->batch)
> > > +                        panfrost_batch_submit((*dep)->batch);
> > > +        }  
> > 
> > I assume this logic will be refined later in the series...?
> 
> It's not actually. Why do you think it should?

Oh, I was just assuming the dependency graph stuff would be more
explicit, I guess.