[8/8] panfrost: Add backend targeting the DRM driver

Submitted by Tomeu Vizoso on March 4, 2019, 4:11 p.m.

Details

Message ID 20190304161144.52910-9-tomeu.vizoso@collabora.com
State New
Series "Series without cover letter" ( rev: 1 ) in Mesa


Commit Message

Tomeu Vizoso March 4, 2019, 4:11 p.m.
This backend interacts with the new DRM driver for Midgard GPUs, which is
currently in development.

When using this backend, Panfrost has functionality roughly on par with
the non-DRM driver from Arm.

Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
---
 include/drm-uapi/panfrost_drm.h        | 128 +++++++++
 src/gallium/drivers/panfrost/pan_drm.c | 362 +++++++++++++++++++++++++
 2 files changed, 490 insertions(+)
 create mode 100644 include/drm-uapi/panfrost_drm.h


diff --git a/include/drm-uapi/panfrost_drm.h b/include/drm-uapi/panfrost_drm.h
new file mode 100644
index 000000000000..d4d271e37206
--- /dev/null
+++ b/include/drm-uapi/panfrost_drm.h
@@ -0,0 +1,128 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright 2018, Linaro, Ltd., Rob Herring <robh@kernel.org> */
+
+#ifndef __PANFROST_DRM_H__
+#define __PANFROST_DRM_H__
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* timeouts are specified in clock-monotonic absolute times (to simplify
+ * restarting interrupted ioctls).  The following struct is logically the
+ * same as 'struct timespec' but 32/64b ABI safe.
+ */
+struct drm_panfrost_timespec {
+	__s64 tv_sec;          /* seconds */
+	__s64 tv_nsec;         /* nanoseconds */
+};
+
+#define PANFROST_PARAM_GPU_ID		0x01
+
+struct drm_panfrost_get_param {
+	__u32 param;	/* in */
+	__u32 pad;
+	__u64 value;	/* out */
+};
+
+struct drm_panfrost_gem_new {
+	__u64 size;           /* in */
+	__u32 flags;          /* in, mask of ETNA_BO_x */
+	__u32 handle;         /* out */
+	/**
+	 * Returned offset for the BO in the GPU address space.  This offset
+	 * is private to the DRM fd and is valid for the lifetime of the GEM
+	 * handle.
+	 *
+	 * This offset value will always be nonzero, since various HW
+	 * units treat 0 specially.
+	 */
+	__u64 offset;
+};
+struct drm_panfrost_gem_info {
+	__u32 handle;         /* in */
+	__u32 pad;
+	__u64 offset;         /* out, offset to pass to mmap() */
+};
+
+/**
+ * Returns the offset for the BO in the GPU address space for this DRM fd.
+ * This is the same value returned by drm_panfrost_gem_new, if that was called
+ * from this DRM fd.
+ */
+struct drm_panfrost_get_bo_offset {
+	__u32 handle;
+	__u32 pad;
+	__u64 offset;
+};
+
+
+#define ETNA_PREP_READ        0x01
+#define ETNA_PREP_WRITE       0x02
+#define ETNA_PREP_NOSYNC      0x04
+
+struct drm_panfrost_gem_cpu_prep {
+	__u32 handle;         /* in */
+	__u32 op;             /* in, mask of ETNA_PREP_x */
+	struct drm_panfrost_timespec timeout;   /* in */
+};
+
+struct drm_panfrost_gem_cpu_fini {
+	__u32 handle;         /* in */
+	__u32 flags;          /* in, placeholder for now, no defined values */
+};
+
+/*
+ * Cmdstream Submission:
+ */
+
+#define PANFROST_JD_REQ_FS (1 << 0)
+
+#define PANFROST_DEP_TYPE_ORDER	0x01
+#define PANFROST_DEP_TYPE_DATA	0x02
+
+struct drm_panfrost_gem_submit_atom_dep {
+	__u32 atom_nr;	/* job ID of dependency */
+	__u32 type;	/* one of PANFROST_DEP_TYPE_* */
+};
+
+struct drm_panfrost_gem_submit_atom {
+	__u64 jc;           /* in, address to GPU mapping of job descriptor */
+	__u32 atom_nr;      /* in, job ID */
+	__u32 requirements; /* in, a combination of PANFROST_JD_REQ_* */
+	__u64 bo_handles;
+	__u32 bo_handle_count;
+	struct drm_panfrost_gem_submit_atom_dep deps[2];
+};
+
+struct drm_panfrost_gem_submit {
+	__u32 nr_atoms;         /* in, number of submit_atom */
+	__u32 pad;
+	__u64 atoms;            /* in, ptr to array of submit_atom */
+};
+
+
+
+#define DRM_PANFROST_GET_PARAM		0x00
+#define DRM_PANFROST_GEM_NEW		0x01
+#define DRM_PANFROST_GEM_INFO		0x02
+#define DRM_PANFROST_GEM_CPU_PREP	0x03
+#define DRM_PANFROST_GEM_CPU_FINI	0x04
+#define DRM_PANFROST_GEM_SUBMIT		0x05
+#define DRM_PANFROST_GET_BO_OFFSET	0x06
+
+#define DRM_IOCTL_PANFROST_GET_PARAM	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_PARAM, struct drm_panfrost_get_param)
+#define DRM_IOCTL_PANFROST_GEM_NEW	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GEM_NEW, struct drm_panfrost_gem_new)
+#define DRM_IOCTL_PANFROST_GEM_INFO	DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GEM_INFO, struct drm_panfrost_gem_info)
+#define DRM_IOCTL_PANFROST_GEM_CPU_PREP	DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_GEM_CPU_PREP, struct drm_panfrost_gem_cpu_prep)
+#define DRM_IOCTL_PANFROST_GEM_CPU_FINI	DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_GEM_CPU_FINI, struct drm_panfrost_gem_cpu_fini)
+#define DRM_IOCTL_PANFROST_GEM_SUBMIT	DRM_IOW(DRM_COMMAND_BASE + DRM_PANFROST_GEM_SUBMIT, struct drm_panfrost_gem_submit)
+#define DRM_IOCTL_PANFROST_GET_BO_OFFSET DRM_IOWR(DRM_COMMAND_BASE + DRM_PANFROST_GET_BO_OFFSET, struct drm_panfrost_get_bo_offset)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __PANFROST_DRM_H__ */
diff --git a/src/gallium/drivers/panfrost/pan_drm.c b/src/gallium/drivers/panfrost/pan_drm.c
index cb01cce634f7..1ae775c5f763 100644
--- a/src/gallium/drivers/panfrost/pan_drm.c
+++ b/src/gallium/drivers/panfrost/pan_drm.c
@@ -21,16 +21,364 @@ 
  * SOFTWARE.
  *
  */
+
+#include <fcntl.h>
+#include <xf86drm.h>
+
+#include "drm-uapi/panfrost_drm.h"
+
 #include "util/u_memory.h"
+#include "util/os_time.h"
 
 #include "pan_screen.h"
+#include "pan_resource.h"
+#include "pan_context.h"
 #include "pan_drm.h"
+#include "pan_trace.h"
 
 struct panfrost_drm {
 	struct panfrost_driver base;
 	int fd;
 };
 
+static void
+panfrost_drm_allocate_slab(struct panfrost_screen *screen,
+		           struct panfrost_memory *mem,
+		           size_t pages,
+		           bool same_va,
+		           int extra_flags,
+		           int commit_count,
+		           int extent)
+{
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+	struct drm_panfrost_gem_new gem_new = {
+		        .size = pages * 4096,
+		        .flags = 0,  // TODO figure out proper flags..
+	};
+	struct drm_panfrost_gem_info gem_info = {0,};
+	int ret;
+
+	// TODO cache allocations
+	// TODO properly handle errors
+	// TODO take into account extra_flags
+
+	ret = drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GEM_NEW, &gem_new);
+	if (ret) {
+                fprintf(stderr, "DRM_IOCTL_PANFROST_GEM_NEW failed: %d\n", ret);
+		assert(0);
+	}
+
+	mem->gpu = gem_new.offset;
+	mem->gem_handle = gem_new.handle;
+        mem->stack_bottom = 0;
+        mem->size = gem_new.size;
+
+	// TODO map and unmap on demand?
+	gem_info.handle = gem_new.handle;
+	ret = drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GEM_INFO, &gem_info);
+	if (ret) {
+                fprintf(stderr, "DRM_IOCTL_PANFROST_GEM_INFO failed: %d\n", ret);
+		assert(0);
+	}
+
+        mem->cpu = mmap(NULL, mem->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                       drm->fd, gem_info.offset);
+        if (mem->cpu == MAP_FAILED) {
+                fprintf(stderr, "mmap failed: %p\n", mem->cpu);
+		assert(0);
+	}
+
+        /* Record the mmap if we're tracing */
+        if (!(extra_flags & PAN_ALLOCATE_GROWABLE))
+                pantrace_mmap(mem->gpu, mem->cpu, mem->size, NULL);
+}
+
+static void
+panfrost_drm_free_slab(struct panfrost_screen *screen, struct panfrost_memory *mem)
+{
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+	struct drm_gem_close gem_close = {
+		.handle = mem->gem_handle,
+	};
+	int ret;
+
+        if (munmap((void *) (uintptr_t) mem->cpu, mem->size)) {
+                perror("munmap");
+                abort();
+        }
+
+	mem->cpu = NULL;
+
+	ret = drmIoctl(drm->fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+	if (ret) {
+                fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %d\n", ret);
+		assert(0);
+	}
+
+	mem->gem_handle = -1;
+}
+
+static struct panfrost_bo *
+panfrost_drm_import_bo(struct panfrost_screen *screen, struct winsys_handle *whandle)
+{
+	struct panfrost_bo *bo = CALLOC_STRUCT(panfrost_bo);
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+        struct drm_panfrost_get_bo_offset get_bo_offset = {0,};
+        int ret;
+        unsigned gem_handle;
+
+	ret = drmPrimeFDToHandle(drm->fd, whandle->handle, &gem_handle);
+	assert(!ret);
+
+	get_bo_offset.handle = gem_handle;
+        ret = drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GET_BO_OFFSET, &get_bo_offset);
+        assert(!ret);
+
+	bo->gem_handle = gem_handle;
+        bo->gpu[0] = (mali_ptr) get_bo_offset.offset;
+
+	/* Why would we need to map the dumb buffer now? */
+        bo->cpu[0] = NULL;
+
+        return bo;
+}
+
+static int
+panfrost_drm_export_bo(struct panfrost_screen *screen, int gem_handle, struct winsys_handle *whandle)
+{
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+        struct drm_prime_handle args = {
+                .handle = gem_handle,
+                .flags = DRM_CLOEXEC,
+        };
+
+        int ret = drmIoctl(drm->fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &args);
+        if (ret == -1)
+                return FALSE;
+
+        whandle->handle = args.fd;
+
+        return TRUE;
+}
+
+static void
+panfrost_drm_free_imported_bo(struct panfrost_screen *screen, struct panfrost_bo *bo) 
+{
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+	struct drm_gem_close gem_close = {
+		.handle = bo->gem_handle,
+	};
+	int ret;
+
+	ret = drmIoctl(drm->fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
+	if (ret) {
+                fprintf(stderr, "DRM_IOCTL_GEM_CLOSE failed: %d\n", ret);
+		assert(0);
+	}
+
+	bo->gem_handle = -1;
+	bo->gpu[0] = (mali_ptr)NULL;
+}
+
+static uint8_t
+allocate_atom()
+{
+	/* Used to allocate atom numbers for jobs. We probably want to overhaul this in kernel space at some point. */
+	static uint8_t atom_counter = 0;
+
+        atom_counter++;
+
+        /* Work around a quirk where atoms must be strictly positive */
+
+        if (atom_counter == 0)
+                atom_counter++;
+
+        return atom_counter;
+}
+
+static int
+panfrost_drm_submit_vs_fs_job(struct panfrost_context *ctx, bool has_draws, bool is_scanout)
+{
+        struct pipe_context *gallium = (struct pipe_context *) ctx;
+        struct panfrost_screen *screen = pan_screen(gallium->screen);
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+        struct drm_panfrost_gem_submit_atom atoms[2];
+        struct pipe_surface *surf = ctx->pipe_framebuffer.cbufs[0];
+        int idx = 0;
+
+	memset(atoms, 0, sizeof(atoms));
+
+        atoms[idx].deps[0].atom_nr = screen->last_fragment_id;
+        atoms[idx].deps[0].type = PANFROST_DEP_TYPE_ORDER;
+
+        if (has_draws) {
+		atoms[idx].jc = ctx->set_value_job;
+		atoms[idx].atom_nr = allocate_atom();
+		idx++;
+		atoms[idx].deps[0].atom_nr = atoms[idx - 1].atom_nr;
+		atoms[idx].deps[0].type = PANFROST_DEP_TYPE_DATA;
+	}
+
+	atoms[idx].jc = panfrost_fragment_job(ctx);
+	atoms[idx].atom_nr = allocate_atom();
+	atoms[idx].requirements = PANFROST_JD_REQ_FS;
+	if (surf) {
+		struct panfrost_resource *res = pan_resource(surf->texture);
+		atoms[idx].bo_handles = (__u64) &res->bo->gem_handle;
+		atoms[idx].bo_handle_count = 1;
+	}
+
+        struct drm_panfrost_gem_submit submit = {
+                .atoms = (__u64)atoms,
+                .nr_atoms = idx + 1,
+        };
+
+        /* Dump memory _before_ submitting so we're not corrupted with actual GPU results */
+        pantrace_dump_memory();
+
+        if (drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GEM_SUBMIT, &submit)) {
+                printf("Error submitting\n");
+                assert(0);
+        }
+
+        /* Trace the job if we're doing that and do a memory dump. We may
+         * want to adjust this logic once we're ready to trace FBOs */
+        for (unsigned i = (has_draws ? 0 : 1); i < 2; ++i) {
+                pantrace_submit_job(atoms[i].jc, atoms[i].requirements, FALSE);
+        }
+
+        /* Return fragment ID */
+        return atoms[idx].atom_nr;
+}
+
+static struct panfrost_fence *
+panfrost_fence_create(struct panfrost_context *ctx)
+{
+        struct pipe_context *gallium = (struct pipe_context *) ctx;
+        struct panfrost_screen *screen = pan_screen(gallium->screen);
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+        struct panfrost_fence *f = calloc(1, sizeof(*f));
+        if (!f)
+                return NULL;
+
+        /* Snapshot the last Panfrost's rendering's out fence.  We'd rather have
+         * another syncobj instead of a sync file, but this is all we get.
+         * (HandleToFD/FDToHandle just gives you another syncobj ID for the
+         * same syncobj).
+         */
+        drmSyncobjExportSyncFile(drm->fd, ctx->out_sync, &f->fd);
+        if (f->fd == -1) {
+                fprintf(stderr, "export failed\n");
+                free(f);
+                return NULL;
+        }
+
+        pipe_reference_init(&f->reference, 1);
+
+        return f;
+}
+
+static void
+panfrost_drm_force_flush_fragment(struct panfrost_context *ctx,
+				  struct pipe_fence_handle **fence)
+{
+        struct pipe_context *gallium = (struct pipe_context *) ctx;
+
+        if (fence) {
+                struct panfrost_fence *f = panfrost_fence_create(ctx);
+                gallium->screen->fence_reference(gallium->screen, fence, NULL);
+                *fence = (struct pipe_fence_handle *)f;
+        }
+}
+
+static void
+panfrost_drm_enable_counters(struct panfrost_screen *screen)
+{
+	fprintf(stderr, "unimplemented: %s\n", __func__);
+}
+
+static void
+panfrost_drm_dump_counters(struct panfrost_screen *screen)
+{
+	fprintf(stderr, "unimplemented: %s\n", __func__);
+}
+
+static unsigned
+panfrost_drm_query_gpu_version(struct panfrost_screen *screen)
+{
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+        struct drm_panfrost_get_param get_param = {0,};
+        int ret;
+
+	get_param.param = PANFROST_PARAM_GPU_ID;
+        ret = drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GET_PARAM, &get_param);
+        assert(!ret);
+
+	return get_param.value;
+}
+
+static int
+panfrost_drm_init_context(struct panfrost_context *ctx)
+{
+        struct pipe_context *gallium = (struct pipe_context *) ctx;
+        struct panfrost_screen *screen = pan_screen(gallium->screen);
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+
+        return drmSyncobjCreate(drm->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
+                                &ctx->out_sync);
+}
+
+static void
+panfrost_drm_fence_reference(struct pipe_screen *screen,
+                         struct pipe_fence_handle **ptr,
+                         struct pipe_fence_handle *fence)
+{
+        struct panfrost_fence **p = (struct panfrost_fence **)ptr;
+        struct panfrost_fence *f = (struct panfrost_fence *)fence;
+        struct panfrost_fence *old = *p;
+
+        if (pipe_reference(&(*p)->reference, &f->reference)) {
+                close(old->fd);
+                free(old);
+        }
+        *p = f;
+}
+
+static boolean
+panfrost_drm_fence_finish(struct pipe_screen *pscreen,
+                      struct pipe_context *ctx,
+                      struct pipe_fence_handle *fence,
+                      uint64_t timeout)
+{
+        struct panfrost_screen *screen = pan_screen(pscreen);
+	struct panfrost_drm *drm = (struct panfrost_drm *)screen->driver;
+        struct panfrost_fence *f = (struct panfrost_fence *)fence;
+        int ret;
+
+        unsigned syncobj;
+        ret = drmSyncobjCreate(drm->fd, 0, &syncobj);
+        if (ret) {
+                fprintf(stderr, "Failed to create syncobj to wait on: %m\n");
+                return false;
+        }
+
+        ret = drmSyncobjImportSyncFile(drm->fd, syncobj, f->fd);
+        if (ret) {
+                fprintf(stderr, "Failed to import fence to syncobj: %m\n");
+                return false;
+        }
+
+        uint64_t abs_timeout = os_time_get_absolute_timeout(timeout);
+        if (abs_timeout == OS_TIMEOUT_INFINITE)
+                abs_timeout = INT64_MAX;
+
+        ret = drmSyncobjWait(drm->fd, &syncobj, 1, abs_timeout, 0, NULL);
+
+        drmSyncobjDestroy(drm->fd, syncobj);
+
+        return ret >= 0;
+}
+
 struct panfrost_driver *
 panfrost_create_drm_driver(int fd)
 {
@@ -38,5 +386,19 @@  panfrost_create_drm_driver(int fd)
 
 	driver->fd = fd;
 
+	driver->base.import_bo = panfrost_drm_import_bo;
+	driver->base.export_bo = panfrost_drm_export_bo;
+	driver->base.free_imported_bo = panfrost_drm_free_imported_bo;
+	driver->base.submit_vs_fs_job = panfrost_drm_submit_vs_fs_job;
+	driver->base.force_flush_fragment = panfrost_drm_force_flush_fragment;
+	driver->base.allocate_slab = panfrost_drm_allocate_slab;
+	driver->base.free_slab = panfrost_drm_free_slab;
+	driver->base.enable_counters = panfrost_drm_enable_counters;
+	driver->base.query_gpu_version = panfrost_drm_query_gpu_version;
+	driver->base.init_context = panfrost_drm_init_context;
+	driver->base.fence_reference = panfrost_drm_fence_reference;
+	driver->base.fence_finish = panfrost_drm_fence_finish;
+	driver->base.dump_counters = panfrost_drm_dump_counters;
+
         return &driver->base;
 }

Comments

On Mon, Mar 4, 2019 at 10:12 AM Tomeu Vizoso <tomeu.vizoso@collabora.com> wrote:
>
> This backend interacts with the new DRM driver for Midgard GPUs, which is
> currently in development.
>
> When using this backend, Panfrost has functionality roughly on par with
> the non-DRM driver from Arm.
>
> Signed-off-by: Tomeu Vizoso <tomeu.vizoso@collabora.com>
> ---
>  include/drm-uapi/panfrost_drm.h        | 128 +++++++++
>  src/gallium/drivers/panfrost/pan_drm.c | 362 +++++++++++++++++++++++++
>  2 files changed, 490 insertions(+)
>  create mode 100644 include/drm-uapi/panfrost_drm.h
>
> diff --git a/include/drm-uapi/panfrost_drm.h b/include/drm-uapi/panfrost_drm.h
> new file mode 100644
> index 000000000000..d4d271e37206
> --- /dev/null
> +++ b/include/drm-uapi/panfrost_drm.h
> @@ -0,0 +1,128 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +/* Copyright 2018, Linaro, Ltd., Rob Herring <robh@kernel.org> */
> +
> +#ifndef __PANFROST_DRM_H__
> +#define __PANFROST_DRM_H__
> +
> +#include "drm.h"
> +
> +#if defined(__cplusplus)
> +extern "C" {
> +#endif
> +
> +/* timeouts are specified in clock-monotonic absolute times (to simplify
> + * restarting interrupted ioctls).  The following struct is logically the
> + * same as 'struct timespec' but 32/64b ABI safe.
> + */
> +struct drm_panfrost_timespec {
> +       __s64 tv_sec;          /* seconds */
> +       __s64 tv_nsec;         /* nanoseconds */
> +};
> +
> +#define PANFROST_PARAM_GPU_ID          0x01
> +
> +struct drm_panfrost_get_param {
> +       __u32 param;    /* in */
> +       __u32 pad;
> +       __u64 value;    /* out */
> +};
> +
> +struct drm_panfrost_gem_new {
> +       __u64 size;           /* in */
> +       __u32 flags;          /* in, mask of ETNA_BO_x */
> +       __u32 handle;         /* out */
> +       /**
> +        * Returned offset for the BO in the GPU address space.  This offset
> +        * is private to the DRM fd and is valid for the lifetime of the GEM
> +        * handle.
> +        *
> +        * This offset value will always be nonzero, since various HW
> +        * units treat 0 specially.
> +        */
> +       __u64 offset;
> +};
> +struct drm_panfrost_gem_info {
> +       __u32 handle;         /* in */
> +       __u32 pad;
> +       __u64 offset;         /* out, offset to pass to mmap() */
> +};
> +
> +/**
> + * Returns the offset for the BO in the GPU address space for this DRM fd.
> + * This is the same value returned by drm_panfrost_gem_new, if that was called
> + * from this DRM fd.
> + */
> +struct drm_panfrost_get_bo_offset {
> +       __u32 handle;
> +       __u32 pad;
> +       __u64 offset;
> +};
> +
> +
> +#define ETNA_PREP_READ        0x01
> +#define ETNA_PREP_WRITE       0x02
> +#define ETNA_PREP_NOSYNC      0x04

Guess we need to rename or delete these and a few other spots with
'ETNA'. I'd hoped to keep some alignment with other drivers, but I
don't think that's going to happen.

> [...]

Wooo!!!!!!!

Seeing this patch in my inbox has been some of the best news all day!

Without further ado, my (solicited?) comments:

> +/* Copyright 2018, Linaro, Ltd., Rob Herring <robh@kernel.org> */

Please triple check upstream that this license is okay; the other files
in include/drm-uapi are BSDish.

> +/* timeouts are specified in clock-monotonic absolute times (to simplify
> + * restarting interrupted ioctls).  The following struct is logically the
> + * same as 'struct timespec' but 32/64b ABI safe.
> + */
> +struct drm_panfrost_timespec {
> +	__s64 tv_sec;          /* seconds */
> +	__s64 tv_nsec;         /* nanoseconds */
> +};

What's this for -- is there not a shared struct for this if it's
necessary? (I'm assuming this was copied from etna..?)

> +	__u32 flags;          /* in, mask of ETNA_BO_x */

As Rob said, s/ETNA/PAN/g

> +	struct drm_panfrost_timespec timeout;   /* in */

(Presumably we just want a timeout in one of nanoseconds / milliseconds
/ second, not the full timespec... 64-bit time gives a pretty wide range
even for high-precision timeouts, which I don't know that we need. Also,
it's signed for some reason...?)

> +	struct drm_panfrost_gem_submit_atom_dep deps[2];

I'm concerned about hardcoding deps to 2 here. I know the Arm driver
does it, but I'm super uncomfortable with them doing it too. Keep in mind,
when they have complex dependencies they insert dummy "dependency-only"
jobs into the graph, though I concede I haven't seen this yet.

I'm not at all sure what the right number is, especially since the new
kernel interface seems to support waiting on BOs? Or am I
misinterpreting this?

Regardless, this will start to matter when we implement support for more
complex FBOs (where various FBOs samples from various other FBOs), which
I think can have arbitrarily complicated dependency graphs. So
hardcoding to [2] is inappropriate if we want to use deps for waiting on
FBOs. On the other hand, if we use a kernel-side BO wait or fences or
something to resolve dependencies, do we even need two deps, or just
one?

> +	// TODO cache allocations
> +	// TODO properly handle errors
> +	// TODO take into account extra_flags

Not a blocker at all for merge, but these should be taken care of before
we drop support for the non-DRM stuff (since at least the
lack of GROWABLE support represents a regression compared to the Arm
driver).

> +	// TODO map and unmap on demand?

I don't know if this matters too much...? Usually if we're allocating a
slab, that means we want it mapped immediately, unless we set the
INVISIBLE flag which means we don't want it mapped at all.

Maybe we'll have fancier scenarios in the future, but at the moment I
think we can safely cross off at least the first half of the TODO.

As you know I'm not a believer in unmapping/freeing resources ever, so I
don't get an opinion there ;)

> +	gem_info.handle = gem_new.handle;
> +	ret = drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GEM_INFO, &gem_info);
> +	if (ret) {
> +                fprintf(stderr, "DRM_IOCTL_PANFROST_GEM_INFO failed: %d\n", ret);
> +		assert(0);
> +	}

Maybe a question for Rob, but why isn't this info passed along with the
allocate in the first place? I guess the extra roundtrip to the kernel
isn't a huge deal, but it seems a little odd...?

> +static uint8_t
> +allocate_atom()
> +{
> +	/* Used to allocate atom numbers for jobs. We probably want to overhaul this in kernel space at some point. */
> +	static uint8_t atom_counter = 0;
> +
> +        atom_counter++;
> +
> +        /* Work around a quirk where atoms must be strictly positive */
> +
> +        if (atom_counter == 0)
> +                atom_counter++;
> +
> +        return atom_counter;
> +}

So, this routine (which is copied straight from the non-DRM code) is
specifically to work around a quirk in the Arm driver where atom numbers
must be non-zero u8's. I'm skeptical the DRM interface needs this.

> +		idx++;

Nice one, no idea why I didn't think of doing it this way :)

> +panfrost_fence_create(struct panfrost_context *ctx)

I'd appreciate a link to documentation about mainline fences, since I
have no idea what's happening here :)

> +static void
> +panfrost_drm_enable_counters(struct panfrost_screen *screen)
> +{
> +	fprintf(stderr, "unimplemented: %s\n", __func__);
> +}

If nobody else is taking it, would anyone mind if I play with adding
performance counters to the kernel? I want to at least dip my feet into
the other side of the stack, and that seems like a nice low-hanging
fruit (especially since I actually grok the performance counters, more
or less) :)

> +        return drmSyncobjCreate(drm->fd, DRM_SYNCOBJ_CREATE_SIGNALED,

(See fence question)
On Mon, Mar 4, 2019 at 4:43 PM Alyssa Rosenzweig <alyssa@rosenzweig.io> wrote:
>
> Wooo!!!!!!!
>
> Seeing this patch in my inbox has been some of the best news all day!
>
> Without further ado, my (solicited?) comments:
>
> > +/* Copyright 2018, Linaro, Ltd., Rob Herring <robh@kernel.org> */
>
> Please triple check upstream that this license is okay; the other files
> in include/drm-uapi are BSDish.
>
> > +/* timeouts are specified in clock-monotonic absolute times (to simplify
> > + * restarting interrupted ioctls).  The following struct is logically the
> > + * same as 'struct timespec' but 32/64b ABI safe.
> > + */
> > +struct drm_panfrost_timespec {
> > +     __s64 tv_sec;          /* seconds */
> > +     __s64 tv_nsec;         /* nanoseconds */
> > +};
>
> What's this for -- is there not a shared struct for this if it's
> necessary? (I'm assuming this was copied from etna..?)
>
> > +     __u32 flags;          /* in, mask of ETNA_BO_x */
>
> As Rob said, s/ETNA/PAN/g
>
> > +     struct drm_panfrost_timespec timeout;   /* in */
>
> (Presumably we just want a timeout in one of nanoseconds / milliseconds
> / second, not the full timespec... 64-bit time gives a pretty wide range
> even for high-precision timeouts, which I don't know that we need. Also,
> it's signed for some reason...?)
>
> > +     struct drm_panfrost_gem_submit_atom_dep deps[2];
>
> I'm concerned about hardcoding deps to 2 here. I know the Arm driver
> does it, but I'm super uncomfortable with them doing it too. Keep in mind,
> when they have complex dependencies they insert dummy "dependency-only"
> jobs into the graph, though I concede I haven't seen this yet.

Why aren't we using regular dma-buf fences here? The submit ioctl
should be able to take a number of in fences to wait on and return an
out fence if requested. See I915_EXEC_FENCE_OUT and
I915_EXEC_FENCE_ARRAY in
https://cgit.freedesktop.org/~airlied/linux/tree/include/uapi/drm/i915_drm.h?h=drm-next
or MSM_SUBMIT_FENCE_FD_IN/OUT in
https://cgit.freedesktop.org/~airlied/linux/tree/include/uapi/drm/msm_drm.h?h=drm-next
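
For concreteness, a sketch of what that could look like for the Panfrost
submit struct (the flags and fence_fd fields are hypothetical, not part of
this patch):

struct drm_panfrost_gem_submit {
        __u32 nr_atoms;     /* in, number of submit_atom */
        __u32 flags;        /* in, hypothetical PANFROST_SUBMIT_FENCE_FD_IN/OUT bits */
        __u64 atoms;        /* in, ptr to array of submit_atom */
        __s32 fence_fd_in;  /* in, sync file to wait on before scheduling (if flagged) */
        __s32 fence_fd_out; /* out, sync file signalled when the atoms complete */
};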

> [...]

> Why aren't we using regular dma-buf fences here? The submit ioctl
> should be able to take a number of in fences to wait on and return an
> out fence if requested. 

Ah-ha, that sounds like the "proper" approach for mainline. Much of this
was (incorrectly) inherited from the Arm driver. Thank you for the
pointer.
On Mon, Mar 4, 2019 at 6:11 PM Alyssa Rosenzweig <alyssa@rosenzweig.io> wrote:
>
> > Why aren't we using regular dma-buf fences here? The submit ioctl
> > should be able to take a number of in fences to wait on and return an
> > out fence if requested.
>
> Ah-ha, that sounds like the "proper" approach for mainline. Much of this
> was (incorrectly) inherited from the Arm driver. Thank you for the
> pointer.

I'm not sure - I mean, the submit should take in/out fences, but the
atom mechanism here sounds more like it's for declaring the
dependencies between multiple batches in a renderpass/frame to allow
the kernel to schedule them? The sync fd may be a little too heavy-handed
for that, and if you want to express that kind of dependency to
allow the kernel to reschedule, maybe we need both?
On Tue, 5 Mar 2019 at 12:20, Kristian Høgsberg <hoegsberg@gmail.com> wrote:
>
> [...]
>
> I'm not sure - I mean, the submit should take in/out fences, but the
> atom mechanism here sounds more like it's for declaring the
> dependencies between multiple batches in a renderpass/frame to allow
> the kernel to schedule them? The sync fd may be a little too heavy-handed
> for that, and if you want to express that kind of dependency to
> allow the kernel to reschedule, maybe we need both?

You should more likely be using syncobjects, not fences.

You can convert syncobjs to fences, but fences consume an fd which you
only really want if inter-device.

Dave.
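
The syncobj/sync_file round trip Dave describes maps onto existing libdrm
helpers; a minimal sketch, assuming an already-open DRM fd drm_fd and with
error handling elided:

uint32_t syncobj;
int sync_fd;

drmSyncobjCreate(drm_fd, 0, &syncobj);                /* driver-local handle, no fd cost */
/* ... submit work that signals the syncobj ... */
drmSyncobjExportSyncFile(drm_fd, syncobj, &sync_fd);  /* pay for an fd only when needed */
/* sync_fd can now cross process boundaries, e.g. for explicit sync protocol */
drmSyncobjImportSyncFile(drm_fd, syncobj, sync_fd);   /* ...and convert back again */
drmSyncobjDestroy(drm_fd, syncobj);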
On Mon, Mar 4, 2019 at 6:29 PM Dave Airlie <airlied@gmail.com> wrote:
>
> [...]
>
> You should more likely be using syncobjects, not fences.
>
> You can convert syncobjs to fences, but fences consume an fd which you
> only really want if inter-device.

Fence fd's are also required for passing through protocol for explicit
synchronization.

> You should more likely be using syncobjects, not fences.

...I still don't know what either of them actually are...?
On 3/5/19 3:29 AM, Dave Airlie wrote:
> [...]
> You should more likely be using syncobjects, not fences.

Yeah, so the dependency is currently expressed by the ID of the atom it 
depends on. This is needed in the current approach because at submit time 
we cannot have a fence yet for the dependency if both atoms are in the 
same submit.

Alyssa: do you see any problem if we change to submit only one atom per 
ioctl?

Then we would get a syncobj for the first atom that we could pass as an 
in-fence for any dependencies, in separate ioctls.
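
A sketch of how that flow could look, where out_sync, in_syncs and
in_sync_count are hypothetical fields the submit UAPI would need to grow:

/* Submit the vertex/tiler atom on its own, with a syncobj to be signalled
 * on completion. */
struct drm_panfrost_gem_submit first = {
        .nr_atoms = 1,
        .atoms = (__u64)(uintptr_t)&set_value_atom,
        .out_sync = vertex_done,                     /* hypothetical out-fence syncobj */
};
drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GEM_SUBMIT, &first);

/* Submit the fragment atom in a second ioctl, waiting on the first. */
struct drm_panfrost_gem_submit second = {
        .nr_atoms = 1,
        .atoms = (__u64)(uintptr_t)&fragment_atom,
        .in_syncs = (__u64)(uintptr_t)&vertex_done,  /* hypothetical in-fence array */
        .in_sync_count = 1,
};
drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GEM_SUBMIT, &second);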

> You can convert syncobjs to fences, but fences consume an fd which you
> only really want if inter-device.

Guess syncobj refs are akin to GEM handles and fences to dmabuf buffers 
from the userspace POV?

Thanks,

Tomeu
On Mon, Mar 4, 2019 at 6:43 PM Alyssa Rosenzweig <alyssa@rosenzweig.io> wrote:
>
> Wooo!!!!!!!
>
> Seeing this patch in my inbox has been some of the best news all day!
>
> Without further ado, my (solicited?) comments:
>
> > +/* Copyright 2018, Linaro, Ltd., Rob Herring <robh@kernel.org> */
>
> Please triple check upstream that this license is okay; the other files
> in include/drm-uapi are BSDish.

There's a mixture of MIT and 'GPL-2.0 WITH Linux-syscall-note'. These
are the ones with GPL:

include/uapi/drm/armada_drm.h:/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
include/uapi/drm/etnaviv_drm.h:/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
include/uapi/drm/exynos_drm.h:/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
include/uapi/drm/i810_drm.h:/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
include/uapi/drm/omap_drm.h:/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */

I don't think it matters, but I imagine all corporate entities would prefer MIT.

>
> > +/* timeouts are specified in clock-monotonic absolute times (to simplify
> > + * restarting interrupted ioctls).  The following struct is logically the
> > + * same as 'struct timespec' but 32/64b ABI safe.
> > + */
> > +struct drm_panfrost_timespec {
> > +     __s64 tv_sec;          /* seconds */
> > +     __s64 tv_nsec;         /* nanoseconds */
> > +};
>
> What's this for -- is there not a shared struct for this if it's
> necessary? (I'm assuming this was copied from etna..?)

I couldn't find any common struct.

>
> > +     __u32 flags;          /* in, mask of ETNA_BO_x */
>
> As Rob said, s/ETNA/PAN/g
>
> > +     struct drm_panfrost_timespec timeout;   /* in */
>
> (Presumably we just want a timeout in one of nanoseconds / milliseconds
> / second, not the full timespec... 64-bit time gives a pretty wide range
> even for high-precision timeouts, which I don't know that we need. Also,
> it's signed for some reason...?)

Yes, but I was keeping it aligned with etnaviv. Changing to just u64
ns should be fine as I think rollover in 500 years is acceptable.

Arnd confirmed with me that a plain u64 nanoseconds is preferred.
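
Applied to the prep ioctl, the struct would shrink to something like this
(sketch only; the PAN_PREP rename from the earlier ETNA comments is assumed
as well):

struct drm_panfrost_gem_cpu_prep {
        __u32 handle;      /* in */
        __u32 op;          /* in, mask of PAN_PREP_x */
        __u64 timeout_ns;  /* in, absolute CLOCK_MONOTONIC timeout in nanoseconds */
};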

> > +     struct drm_panfrost_gem_submit_atom_dep deps[2];
>
> I'm concerned about hardcoding deps to 2 here. I know the Arm driver
> does it, but I'm super uncomfortable with them doing it too. Keep in mind,
> when they have complex dependencies they insert dummy "dependency-only"
> jobs into the graph, though I concede I haven't seen this yet.
>
> I'm not at all sure what the right number is, especially since the new
> kernel interface seems to support waiting on BOs? Or am I
> misinterpreting this?
>
> Regardless, this will start to matter when we implement support for more
> complex FBOs (where various FBOs sample from various other FBOs), which
> I think can have arbitrarily complicated dependency graphs. So
> hardcoding to [2] is inappropriate if we want to use deps for waiting on
> FBOs. On the other hand, if we use a kernel-side BO wait or fences or
> something to resolve dependencies, do we even need two deps, or just
> one?
>
> > +     // TODO cache allocations
> > +     // TODO properly handle errors
> > +     // TODO take into account extra_flags
>
> Not a blocker at all for merge, but these should be taken care of before
> we drop support for the non-DRM stuff (since at least the
> lack of GROWABLE support represents a regression compared to the Arm
> driver).

Need to research how other drivers deal with this. Presumably other
drivers should need some heap as well? I found one case that has a
specific ioctl call to allocate it (I don't remember which one
offhand).

>
> > +     // TODO map and unmap on demand?
>
> I don't know if this matters too much...? Usually if we're allocating a
> slab, that means we want it mapped immediately, unless we set the
> INVISIBLE flag which means we don't want it mapped at all.

Map to what? The GPU or CPU?

> Maybe we'll have fancier scenarios in the future, but at the moment I
> think we can safely cross off at least the first half of the TODO.
>
> As you know I'm not a believer in unmapping/freeing resources ever, so I
> don't get an opinion there ;)
>
> > +     gem_info.handle = gem_new.handle;
> > +     ret = drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GEM_INFO, &gem_info);
> > +     if (ret) {
> > +                fprintf(stderr, "DRM_IOCTL_PANFROST_GEM_INFO failed: %d\n", ret);
> > +             assert(0);
> > +     }
>
> Maybe a question for Rob, but why isn't this info passed along with the
> allocate in the first place? I guess the extra roundtrip to the kernel
> isn't a huge deal, but it seems a little odd...?

It is. GEM_INFO should only be needed when you import a dmabuf.
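
In that case the slab allocation path could drop the extra round trip; a
sketch, assuming GEM_NEW also returned the mmap offset (the mmap_offset
field below is hypothetical):

struct drm_panfrost_gem_new gem_new = {
        .size = pages * 4096,
};

drmIoctl(drm->fd, DRM_IOCTL_PANFROST_GEM_NEW, &gem_new);
mem->gpu = gem_new.offset;
mem->cpu = mmap(NULL, gem_new.size, PROT_READ | PROT_WRITE, MAP_SHARED,
                drm->fd, gem_new.mmap_offset);  /* hypothetical out field */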

>
> > +static uint8_t
> > +allocate_atom()
> > +{
> > +     /* Used to allocate atom numbers for jobs. We probably want to overhaul this in kernel space at some point. */
> > +     static uint8_t atom_counter = 0;
> > +
> > +        atom_counter++;
> > +
> > +        /* Work around a quirk where atoms must be strictly positive */
> > +
> > +        if (atom_counter == 0)
> > +                atom_counter++;
> > +
> > +        return atom_counter;
> > +}
>
> So, this routine (which is copied straight from the non-DRM code) is
> specifically to workaround a quirk in the Arm driver where atom numbers
> must be non-zero u8's. I'm skeptical the DRM interface needs this.

And why u8? A sequence number is pretty common and we should just copy
what other drivers do rather than think about it.

Rob
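
If the non-zero-u8 quirk really is an Arm-driver artifact, the conventional
replacement is just a plain monotonic sequence number; a sketch:

static uint32_t
allocate_seqno(void)
{
        /* Wide enough that wraparound is a non-issue in practice, so no
         * special-casing of zero is needed. */
        static uint32_t seqno_counter;

        return ++seqno_counter;
}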
> Alyssa: do you see any problem if we change to submit only one atom per
> ioctl?

I don't think so; aesthetically I don't like the extra kernel traffic,
but that's not a real concern, and it sounds like that's the correct
approach anyway.

A big reason we submit together on non-DRM is so we can get the kernel
to deal with dependency tracking; if we have proper syncobjs/fences,
that doesn't matter, I don't think.

> Guess syncobj refs are akin to GEM handles and fences to dmabuf buffers from
> the userspace POV?

*tries to figure out what the difference between GEM handles and dmabuf
buffers is*

As in, syncobjs are within the driver, whereas fences can be shared
across processes/parts of the chip and imported/exported?