[RFC] radeonsi: set a per-buffer flag that disables inter-process sharing (v2)

Submitted by Marek Olšák on July 18, 2017, 8:08 p.m.

Details

Message ID 1500408524-925-1-git-send-email-maraeo@gmail.com
State New
Series "radeonsi: set a per-buffer flag that disables inter-process sharing" (rev 2) in AMD X.Org drivers

Commit Message

Marek Olšák July 18, 2017, 8:08 p.m.
From: Marek Olšák <marek.olsak@amd.com>

For lower overhead in the CS ioctl.
Winsys allocators are not used with interprocess-sharable resources.

v2: It shouldn't crash anymore, but the kernel will reject the new flag.
---
 src/gallium/drivers/radeon/r600_buffer_common.c |  7 +++++
 src/gallium/drivers/radeon/radeon_winsys.h      | 20 +++++++++++---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 36 ++++++++++++++++---------
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 27 +++++++++++--------
 4 files changed, 62 insertions(+), 28 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index dd1c209..2747ac4 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -160,20 +160,27 @@  void r600_init_resource_fields(struct r600_common_screen *rscreen,
 	}
 
 	/* Tiled textures are unmappable. Always put them in VRAM. */
 	if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
 	    res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) {
 		res->domains = RADEON_DOMAIN_VRAM;
 		res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
 			 RADEON_FLAG_GTT_WC;
 	}
 
+	/* Only displayable single-sample textures can be shared between
+	 * processes. */
+	if (res->b.b.target == PIPE_BUFFER ||
+	    res->b.b.nr_samples >= 2 ||
+	    rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY)
+		res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
 	/* If VRAM is just stolen system memory, allow both VRAM and
 	 * GTT, whichever has free space. If a buffer is evicted from
 	 * VRAM to GTT, it will stay there.
 	 *
 	 * DRM 3.6.0 has good BO move throttling, so we can allow VRAM-only
 	 * placements even with a low amount of stolen VRAM.
 	 */
 	if (!rscreen->info.has_dedicated_vram &&
 	    (rscreen->info.drm_major < 3 || rscreen->info.drm_minor < 6) &&
 	    res->domains == RADEON_DOMAIN_VRAM) {
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 351edcd..0abcb56 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -47,20 +47,21 @@  enum radeon_bo_domain { /* bitfield */
     RADEON_DOMAIN_GTT  = 2,
     RADEON_DOMAIN_VRAM = 4,
     RADEON_DOMAIN_VRAM_GTT = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GTT
 };
 
 enum radeon_bo_flag { /* bitfield */
     RADEON_FLAG_GTT_WC =        (1 << 0),
     RADEON_FLAG_NO_CPU_ACCESS = (1 << 1),
     RADEON_FLAG_NO_SUBALLOC =   (1 << 2),
     RADEON_FLAG_SPARSE =        (1 << 3),
+    RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 4),
 };
 
 enum radeon_bo_usage { /* bitfield */
     RADEON_USAGE_READ = 2,
     RADEON_USAGE_WRITE = 4,
     RADEON_USAGE_READWRITE = RADEON_USAGE_READ | RADEON_USAGE_WRITE,
 
     /* The winsys ensures that the CS submission will be scheduled after
      * previously flushed CSs referencing this BO in a conflicting way.
      */
@@ -685,28 +686,33 @@  static inline enum radeon_bo_domain radeon_domain_from_heap(enum radeon_heap hea
     default:
         assert(0);
         return (enum radeon_bo_domain)0;
     }
 }
 
 static inline unsigned radeon_flags_from_heap(enum radeon_heap heap)
 {
     switch (heap) {
     case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
-        return RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS;
+        return RADEON_FLAG_GTT_WC |
+               RADEON_FLAG_NO_CPU_ACCESS |
+               RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
     case RADEON_HEAP_VRAM:
     case RADEON_HEAP_VRAM_GTT:
     case RADEON_HEAP_GTT_WC:
-        return RADEON_FLAG_GTT_WC;
+        return RADEON_FLAG_GTT_WC |
+               RADEON_FLAG_NO_INTERPROCESS_SHARING;
+
     case RADEON_HEAP_GTT:
     default:
-        return 0;
+        return RADEON_FLAG_NO_INTERPROCESS_SHARING;
     }
 }
 
 /* The pb cache bucket is chosen to minimize pb_cache misses.
  * It must be between 0 and 3 inclusive.
  */
 static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
 {
     switch (heap) {
     case RADEON_HEAP_VRAM_NO_CPU_ACCESS:
@@ -724,22 +730,28 @@  static inline unsigned radeon_get_pb_cache_bucket_index(enum radeon_heap heap)
 
 /* Return the heap index for winsys allocators, or -1 on failure. */
 static inline int radeon_get_heap_index(enum radeon_bo_domain domain,
                                         enum radeon_bo_flag flags)
 {
     /* VRAM implies WC (write combining) */
     assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
     /* NO_CPU_ACCESS implies VRAM only. */
     assert(!(flags & RADEON_FLAG_NO_CPU_ACCESS) || domain == RADEON_DOMAIN_VRAM);
 
+    /* Resources with interprocess sharing don't use any winsys allocators. */
+    if (!(flags & RADEON_FLAG_NO_INTERPROCESS_SHARING))
+        return -1;
+
     /* Unsupported flags: NO_SUBALLOC, SPARSE. */
-    if (flags & ~(RADEON_FLAG_GTT_WC | RADEON_FLAG_NO_CPU_ACCESS))
+    if (flags & ~(RADEON_FLAG_GTT_WC |
+                  RADEON_FLAG_NO_CPU_ACCESS |
+                  RADEON_FLAG_NO_INTERPROCESS_SHARING))
         return -1;
 
     switch (domain) {
     case RADEON_DOMAIN_VRAM:
         if (flags & RADEON_FLAG_NO_CPU_ACCESS)
             return RADEON_HEAP_VRAM_NO_CPU_ACCESS;
         else
             return RADEON_HEAP_VRAM;
     case RADEON_DOMAIN_VRAM_GTT:
         return RADEON_HEAP_VRAM_GTT;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 97bbe23..06b8198 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -31,20 +31,24 @@ 
 
 #include "amdgpu_cs.h"
 
 #include "os/os_time.h"
 #include "state_tracker/drm_driver.h"
 #include <amdgpu_drm.h>
 #include <xf86drm.h>
 #include <stdio.h>
 #include <inttypes.h>
 
+#ifndef AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING
+#define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)
+#endif
+
 /* Set to 1 for verbose output showing committed sparse buffer ranges. */
 #define DEBUG_SPARSE_COMMITS 0
 
 struct amdgpu_sparse_backing_chunk {
    uint32_t begin, end;
 };
 
 static struct pb_buffer *
 amdgpu_bo_create(struct radeon_winsys *rws,
                  uint64_t size,
@@ -395,20 +399,22 @@  static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
 
    if (initial_domain & RADEON_DOMAIN_VRAM)
       request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
    if (initial_domain & RADEON_DOMAIN_GTT)
       request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
 
    if (flags & RADEON_FLAG_NO_CPU_ACCESS)
       request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
    if (flags & RADEON_FLAG_GTT_WC)
       request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+   if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
+      request.flags |= AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING;
 
    r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
    if (r) {
       fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
       fprintf(stderr, "amdgpu:    size      : %"PRIu64" bytes\n", size);
       fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
       fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
       goto error_bo_alloc;
    }
 
@@ -1127,21 +1133,21 @@  static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
 
 static struct pb_buffer *
 amdgpu_bo_create(struct radeon_winsys *rws,
                  uint64_t size,
                  unsigned alignment,
                  enum radeon_bo_domain domain,
                  enum radeon_bo_flag flags)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_winsys_bo *bo;
-   unsigned usage = 0, pb_cache_bucket;
+   unsigned usage = 0, pb_cache_bucket = 0;
 
    /* VRAM implies WC. This is not optional. */
    assert(!(domain & RADEON_DOMAIN_VRAM) || flags & RADEON_FLAG_GTT_WC);
 
    /* NO_CPU_ACCESS is valid with VRAM only. */
    assert(domain == RADEON_DOMAIN_VRAM || !(flags & RADEON_FLAG_NO_CPU_ACCESS));
 
    /* Sub-allocate small buffers from slabs. */
    if (!(flags & (RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_SPARSE)) &&
        size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
@@ -1182,48 +1188,52 @@  no_slab:
    /* This flag is irrelevant for the cache. */
    flags &= ~RADEON_FLAG_NO_SUBALLOC;
 
    /* Align size to page size. This is the minimum alignment for normal
     * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
     * like constant/uniform buffers, can benefit from better and more reuse.
     */
    size = align64(size, ws->info.gart_page_size);
    alignment = align(alignment, ws->info.gart_page_size);
 
-   int heap = radeon_get_heap_index(domain, flags);
-   assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
-   usage = 1 << heap; /* Only set one usage bit for each heap. */
+   bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
 
-   pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
-   assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+   if (use_reusable_pool) {
+       int heap = radeon_get_heap_index(domain, flags);
+       assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
+       usage = 1 << heap; /* Only set one usage bit for each heap. */
 
-   /* Get a buffer from the cache. */
-   bo = (struct amdgpu_winsys_bo*)
-        pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
-                                pb_cache_bucket);
-   if (bo)
-      return &bo->base;
+       pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
+       assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+
+       /* Get a buffer from the cache. */
+       bo = (struct amdgpu_winsys_bo*)
+            pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage,
+                                    pb_cache_bucket);
+       if (bo)
+          return &bo->base;
+   }
 
    /* Create a new one. */
    bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
                          pb_cache_bucket);
    if (!bo) {
       /* Clear the cache and try again. */
       pb_slabs_reclaim(&ws->bo_slabs);
       pb_cache_release_all_buffers(&ws->bo_cache);
       bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags,
                             pb_cache_bucket);
       if (!bo)
          return NULL;
    }
 
-   bo->u.real.use_reusable_pool = true;
+   bo->u.real.use_reusable_pool = use_reusable_pool;
    return &bo->base;
 }
 
 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
                                                struct winsys_handle *whandle,
                                                unsigned *stride,
                                                unsigned *offset)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_winsys_bo *bo;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 8027a5f..15e9d38 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -907,21 +907,21 @@  static void radeon_bo_set_metadata(struct pb_buffer *_buf,
 
 static struct pb_buffer *
 radeon_winsys_bo_create(struct radeon_winsys *rws,
                         uint64_t size,
                         unsigned alignment,
                         enum radeon_bo_domain domain,
                         enum radeon_bo_flag flags)
 {
     struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
     struct radeon_bo *bo;
-    unsigned usage = 0, pb_cache_bucket;
+    unsigned usage = 0, pb_cache_bucket = 0;
 
     assert(!(flags & RADEON_FLAG_SPARSE)); /* not supported */
 
     /* Only 32-bit sizes are supported. */
     if (size > UINT_MAX)
         return NULL;
 
     /* VRAM implies WC. This is not optional. */
     if (domain & RADEON_DOMAIN_VRAM)
         flags |= RADEON_FLAG_GTT_WC;
@@ -962,46 +962,51 @@  no_slab:
     /* This flag is irrelevant for the cache. */
     flags &= ~RADEON_FLAG_NO_SUBALLOC;
 
     /* Align size to page size. This is the minimum alignment for normal
      * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
      * like constant/uniform buffers, can benefit from better and more reuse.
      */
     size = align(size, ws->info.gart_page_size);
     alignment = align(alignment, ws->info.gart_page_size);
 
-    int heap = radeon_get_heap_index(domain, flags);
-    assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
-    usage = 1 << heap; /* Only set one usage bit for each heap. */
+    bool use_reusable_pool = flags & RADEON_FLAG_NO_INTERPROCESS_SHARING;
 
-    pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
-    assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+    /* Shared resources don't use cached heaps. */
+    if (use_reusable_pool) {
+        int heap = radeon_get_heap_index(domain, flags);
+        assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
+        usage = 1 << heap; /* Only set one usage bit for each heap. */
 
-    bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
-                                           usage, pb_cache_bucket));
-    if (bo)
-        return &bo->base;
+        pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
+        assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
+
+        bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment,
+                                               usage, pb_cache_bucket));
+        if (bo)
+            return &bo->base;
+    }
 
     bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
                           pb_cache_bucket);
     if (!bo) {
         /* Clear the cache and try again. */
         if (ws->info.has_virtual_memory)
             pb_slabs_reclaim(&ws->bo_slabs);
         pb_cache_release_all_buffers(&ws->bo_cache);
         bo = radeon_create_bo(ws, size, alignment, usage, domain, flags,
                               pb_cache_bucket);
         if (!bo)
             return NULL;
     }
 
-    bo->u.real.use_reusable_pool = true;
+    bo->u.real.use_reusable_pool = use_reusable_pool;
 
     mtx_lock(&ws->bo_handles_mutex);
     util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo);
     mtx_unlock(&ws->bo_handles_mutex);
 
     return &bo->base;
 }
 
 static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
                                                    void *pointer, uint64_t size)
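
To make the effect of the radeon_winsys.h change concrete, here is a
small worked example of the new radeon_get_heap_index() behavior. The
call sites are invented for illustration; the results follow directly
from the patched function above:

    /* A sharable VRAM texture gets no heap index, so the winsys
     * bypasses the pb_cache/slab allocators and allocates directly. */
    int heap = radeon_get_heap_index(RADEON_DOMAIN_VRAM,
                                     RADEON_FLAG_GTT_WC);
    assert(heap == -1);

    /* The same request marked non-sharable is eligible for the cached
     * VRAM heap, and therefore for buffer reuse. */
    heap = radeon_get_heap_index(RADEON_DOMAIN_VRAM,
                                 RADEON_FLAG_GTT_WC |
                                 RADEON_FLAG_NO_INTERPROCESS_SHARING);
    assert(heap == RADEON_HEAP_VRAM);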

Comments

On 18/07/17 04:08 PM, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak@amd.com>
> 
> For lower overhead in the CS ioctl.
> Winsys allocators are not used with interprocess-sharable resources.
> 
> v2: It shouldn't crash anymore, but the kernel will reject the new flag.
> ---
>   src/gallium/drivers/radeon/r600_buffer_common.c |  7 +++++
>   src/gallium/drivers/radeon/radeon_winsys.h      | 20 +++++++++++---
>   src/gallium/winsys/amdgpu/drm/amdgpu_bo.c       | 36 ++++++++++++++++---------
>   src/gallium/winsys/radeon/drm/radeon_drm_bo.c   | 27 +++++++++++--------
>   4 files changed, 62 insertions(+), 28 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
> index dd1c209..2747ac4 100644
> --- a/src/gallium/drivers/radeon/r600_buffer_common.c
> +++ b/src/gallium/drivers/radeon/r600_buffer_common.c
> @@ -160,20 +160,27 @@ void r600_init_resource_fields(struct r600_common_screen *rscreen,
>   	}
>   
>   	/* Tiled textures are unmappable. Always put them in VRAM. */
>   	if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
>   	    res->flags & R600_RESOURCE_FLAG_UNMAPPABLE) {
>   		res->domains = RADEON_DOMAIN_VRAM;
>   		res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
>   			 RADEON_FLAG_GTT_WC;
>   	}
>   
> +	/* Only displayable single-sample textures can be shared between
> +	 * processes. */
> +	if (res->b.b.target == PIPE_BUFFER ||
> +	    res->b.b.nr_samples >= 2 ||
> +	    rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY)
> +		res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;

We might want to share non-displayable textures at some point for 
performance, right? Will we still be able to flag some textures as 
non-shareable in that case?


> diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> index 97bbe23..06b8198 100644
> --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
> @@ -31,20 +31,24 @@
>   
>   #include "amdgpu_cs.h"
>   
>   #include "os/os_time.h"
>   #include "state_tracker/drm_driver.h"
>   #include <amdgpu_drm.h>
>   #include <xf86drm.h>
>   #include <stdio.h>
>   #include <inttypes.h>
>   
> +#ifndef AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING
> +#define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)
> +#endif

I advise against the #ifndef check for this kind of thing, because it 
prevents the preprocessor from warning about conflicting definitions.
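
A minimal sketch of the difference (the macro value matches the patch;
the conflicting-header scenario is hypothetical):

    /* Unguarded: if a future <amdgpu_drm.h> ever defines this macro
     * with a different value, the preprocessor reports a
     * redefinition. */
    #include <amdgpu_drm.h>
    #define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)

    /* Guarded: the local fallback is silently skipped whenever the
     * header provides the macro, so a conflicting header value is
     * never reported. */
    #ifndef AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING
    #define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)
    #endif
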
On Tue, Jul 18, 2017 at 5:11 PM, Michel Dänzer <michel@daenzer.net> wrote:
> On 18/07/17 04:08 PM, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak@amd.com>
>>
>> For lower overhead in the CS ioctl.
>> Winsys allocators are not used with interprocess-sharable resources.
>>
>> v2: It shouldn't crash anymore, but the kernel will reject the new flag.
>> [diff snipped down to the hunk in question]
>>
>> +	/* Only displayable single-sample textures can be shared between
>> +	 * processes. */
>> +	if (res->b.b.target == PIPE_BUFFER ||
>> +	    res->b.b.nr_samples >= 2 ||
>> +	    rtex->surface.micro_tile_mode != RADEON_MICRO_MODE_DISPLAY)
>> +		res->flags |= RADEON_FLAG_NO_INTERPROCESS_SHARING;
>
>
> We might want to share non-displayable textures at some point for
> performance, right? Will we still be able to flag some textures as
> non-shareable in that case?

Yes, if we weren't CPU-bound in 2D acceleration on VI. As long as we
are CPU-bound, GPU optimizations probably won't make much difference.

Marek
On July 19, 2017 04:08, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak@amd.com>
>
> For lower overhead in the CS ioctl.
> Winsys allocators are not used with interprocess-sharable resources.
Hi Marek,

Could you explain how this approach reduces overhead in the CS ioctl?
By reusing BOs to shorten the BO list?

Thanks,
David Zhou
On Jul 19, 2017 3:36 AM, "zhoucm1" <david1.zhou@amd.com> wrote:

> On July 19, 2017 04:08, Marek Olšák wrote:
>
>> From: Marek Olšák <marek.olsak@amd.com>
>>
>> For lower overhead in the CS ioctl.
>> Winsys allocators are not used with interprocess-sharable resources.
>
> Hi Marek,
>
> Could you explain how this approach reduces overhead in the CS ioctl?
> By reusing BOs to shorten the BO list?

The kernel part of the work hasn't been done yet. The idea is that
nonsharable buffers don't have to be revalidated by TTM, so it should
remove a lot of kernel overhead and the BO list remains the same.

Marek
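
For illustration, a minimal self-contained model of that idea. The
kernel change did not exist yet when this was written, so the types,
fields, and the revalidate() stand-in below are invented:

    #include <stdbool.h>
    #include <stddef.h>

    struct bo {
        bool no_interprocess_sharing; /* mirrors the new GEM create flag */
        bool evicted;                 /* set if memory pressure moved the BO */
    };

    static void revalidate(struct bo *bo)
    {
        /* Stands in for the expensive TTM revalidation path. */
        (void)bo;
    }

    static void validate_bo_list(struct bo **bos, size_t count)
    {
        for (size_t i = 0; i < count; i++) {
            /* A BO that can never be exported can only be moved by its
             * owner, so if it has not been evicted, the placement from
             * the previous submission is still valid. */
            if (bos[i]->no_interprocess_sharing && !bos[i]->evicted)
                continue;

            /* Sharable (or evicted) BOs still take the full path. */
            revalidate(bos[i]);
        }
    }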



On July 19, 2017 23:34, Marek Olšák wrote:
> On Jul 19, 2017 3:36 AM, "zhoucm1" <david1.zhou@amd.com> wrote:
>
>> Could you explain how this approach reduces overhead in the CS
>> ioctl? By reusing BOs to shorten the BO list?
>
> The kernel part of the work hasn't been done yet. The idea is that
> nonsharable buffers don't have to be revalidated by TTM,
OK. Maybe I can only see the whole picture of this idea once you
complete the kernel part. Out of curiosity, how can non-sharable
buffers avoid revalidation by TTM without exposing something like an
amdgpu_bo_make_resident API?

As mentioned in another thread, if we can expose a make_resident API,
we can remove the BO list, and we can even remove the reservation
operation in the CS ioctl. And by now I think our BO list is a very
bad design: first, the UMD must create a BO list for every command
submission, which is extra CPU overhead compared with the traditional
way; second, the kernel also has to iterate the list, and when the BO
list is very long, as with OpenCL programs that routinely throw
several thousand BOs into it, reservation must keep those thousands of
ww_mutexes safe, so the CPU overhead is too big.

So I strongly suggest we expose a make_resident API to user space. If
we cannot, I want to know the specific reason, to see whether we can
solve it.


Regards,
David Zhou
> so it should remove a lot of kernel overhead and the BO list remains 
> the same.
>
> Marek
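
For reference, a sketch of what the suggested interface might look
like. No such call exists in libdrm at the time of this thread; the
name and semantics are only an assumption based on David's
description:

    #include <stdbool.h>
    #include <amdgpu.h> /* libdrm_amdgpu: amdgpu_device_handle, amdgpu_bo_handle */

    /* Hypothetical: pin or unpin a BO in the GPU address space. A BO
     * made resident would no longer need to appear in any per-submit
     * BO list, and the CS ioctl would not need to reserve it before
     * each submission. */
    int amdgpu_bo_make_resident(amdgpu_device_handle dev,
                                amdgpu_bo_handle bo,
                                bool resident);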
>           #include <inttypes.h>
>           +#ifndef AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING
>         +#define AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING (1 << 6)
>         +#endif
>         +
>           /* Set to 1 for verbose output showing committed sparse
>         buffer ranges. */
>           #define DEBUG_SPARSE_COMMITS 0
>             struct amdgpu_sparse_backing_chunk {
>              uint32_t begin, end;
>           };
>             static struct pb_buffer *
>           amdgpu_bo_create(struct radeon_winsys *rws,
>                            uint64_t size,
>         @@ -395,20 +399,22 @@ static struct amdgpu_winsys_bo
>         *amdgpu_create_bo(struct amdgpu_winsys *ws,
>                if (initial_domain & RADEON_DOMAIN_VRAM)
>                 request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM;
>              if (initial_domain & RADEON_DOMAIN_GTT)
>                 request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT;
>                if (flags & RADEON_FLAG_NO_CPU_ACCESS)
>                 request.flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
>              if (flags & RADEON_FLAG_GTT_WC)
>                 request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC;
>         +   if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING)
>         +      request.flags |= AMDGPU_GEM_CREATE_NO_INTERPROCESS_SHARING;
>                r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
>              if (r) {
>                 fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
>                 fprintf(stderr, "amdgpu:    size      : %"PRIu64"
>         bytes\n", size);
>                 fprintf(stderr, "amdgpu:    alignment : %u bytes\n",
>         alignment);
>                 fprintf(stderr, "amdgpu:    domains   : %u\n",
>         initial_domain);
>                 goto error_bo_alloc;
>              }
>           @@ -1127,21 +1133,21 @@ static void
>         amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
>             static struct pb_buffer *
>           amdgpu_bo_create(struct radeon_winsys *rws,
>                            uint64_t size,
>                            unsigned alignment,
>                            enum radeon_bo_domain domain,
>                            enum radeon_bo_flag flags)
>           {
>              struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>              struct amdgpu_winsys_bo *bo;
>         -   unsigned usage = 0, pb_cache_bucket;
>         +   unsigned usage = 0, pb_cache_bucket = 0;
>                /* VRAM implies WC. This is not optional. */
>              assert(!(domain & RADEON_DOMAIN_VRAM) || flags &
>         RADEON_FLAG_GTT_WC);
>                /* NO_CPU_ACCESS is valid with VRAM only. */
>              assert(domain == RADEON_DOMAIN_VRAM || !(flags &
>         RADEON_FLAG_NO_CPU_ACCESS));
>                /* Sub-allocate small buffers from slabs. */
>              if (!(flags & (RADEON_FLAG_NO_SUBALLOC |
>         RADEON_FLAG_SPARSE)) &&
>                  size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
>         @@ -1182,48 +1188,52 @@ no_slab:
>              /* This flag is irrelevant for the cache. */
>              flags &= ~RADEON_FLAG_NO_SUBALLOC;
>                /* Align size to page size. This is the minimum
>         alignment for normal
>               * BOs. Aligning this here helps the cached bufmgr.
>         Especially small BOs,
>               * like constant/uniform buffers, can benefit from better
>         and more reuse.
>               */
>              size = align64(size, ws->info.gart_page_size);
>              alignment = align(alignment, ws->info.gart_page_size);
>           -   int heap = radeon_get_heap_index(domain, flags);
>         -   assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>         -   usage = 1 << heap; /* Only set one usage bit for each heap. */
>         +   bool use_reusable_pool = flags &
>         RADEON_FLAG_NO_INTERPROCESS_SHARING;
>           -   pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>         -   assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>         +   if (use_reusable_pool) {
>         +       int heap = radeon_get_heap_index(domain, flags);
>         +       assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>         +       usage = 1 << heap; /* Only set one usage bit for each
>         heap. */
>           -   /* Get a buffer from the cache. */
>         -   bo = (struct amdgpu_winsys_bo*)
>         -        pb_cache_reclaim_buffer(&ws->bo_cache, size,
>         alignment, usage,
>         -                                pb_cache_bucket);
>         -   if (bo)
>         -      return &bo->base;
>         +       pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>         +       assert(pb_cache_bucket <
>         ARRAY_SIZE(ws->bo_cache.buckets));
>         +
>         +       /* Get a buffer from the cache. */
>         +       bo = (struct amdgpu_winsys_bo*)
>         +            pb_cache_reclaim_buffer(&ws->bo_cache, size,
>         alignment, usage,
>         + pb_cache_bucket);
>         +       if (bo)
>         +          return &bo->base;
>         +   }
>                /* Create a new one. */
>              bo = amdgpu_create_bo(ws, size, alignment, usage, domain,
>         flags,
>                                    pb_cache_bucket);
>              if (!bo) {
>                 /* Clear the cache and try again. */
>                 pb_slabs_reclaim(&ws->bo_slabs);
>                 pb_cache_release_all_buffers(&ws->bo_cache);
>                 bo = amdgpu_create_bo(ws, size, alignment, usage,
>         domain, flags,
>                                       pb_cache_bucket);
>                 if (!bo)
>                    return NULL;
>              }
>           -   bo->u.real.use_reusable_pool = true;
>         +   bo->u.real.use_reusable_pool = use_reusable_pool;
>              return &bo->base;
>           }
>             static struct pb_buffer *amdgpu_bo_from_handle(struct
>         radeon_winsys *rws,
>          struct winsys_handle *whandle,
>          unsigned *stride,
>          unsigned *offset)
>           {
>              struct amdgpu_winsys *ws = amdgpu_winsys(rws);
>              struct amdgpu_winsys_bo *bo;
>         diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>         b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>         index 8027a5f..15e9d38 100644
>         --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>         +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
>         @@ -907,21 +907,21 @@ static void
>         radeon_bo_set_metadata(struct pb_buffer *_buf,
>             static struct pb_buffer *
>           radeon_winsys_bo_create(struct radeon_winsys *rws,
>                                   uint64_t size,
>                                   unsigned alignment,
>                                   enum radeon_bo_domain domain,
>                                   enum radeon_bo_flag flags)
>           {
>               struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
>               struct radeon_bo *bo;
>         -    unsigned usage = 0, pb_cache_bucket;
>         +    unsigned usage = 0, pb_cache_bucket = 0;
>                 assert(!(flags & RADEON_FLAG_SPARSE)); /* not supported */
>                 /* Only 32-bit sizes are supported. */
>               if (size > UINT_MAX)
>                   return NULL;
>                 /* VRAM implies WC. This is not optional. */
>               if (domain & RADEON_DOMAIN_VRAM)
>                   flags |= RADEON_FLAG_GTT_WC;
>         @@ -962,46 +962,51 @@ no_slab:
>               /* This flag is irrelevant for the cache. */
>               flags &= ~RADEON_FLAG_NO_SUBALLOC;
>                 /* Align size to page size. This is the minimum
>         alignment for normal
>                * BOs. Aligning this here helps the cached bufmgr.
>         Especially small BOs,
>                * like constant/uniform buffers, can benefit from
>         better and more reuse.
>                */
>               size = align(size, ws->info.gart_page_size);
>               alignment = align(alignment, ws->info.gart_page_size);
>           -    int heap = radeon_get_heap_index(domain, flags);
>         -    assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>         -    usage = 1 << heap; /* Only set one usage bit for each
>         heap. */
>         +    bool use_reusable_pool = flags &
>         RADEON_FLAG_NO_INTERPROCESS_SHARING;
>           -    pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>         -    assert(pb_cache_bucket < ARRAY_SIZE(ws->bo_cache.buckets));
>         +    /* Shared resources don't use cached heaps. */
>         +    if (use_reusable_pool) {
>         +        int heap = radeon_get_heap_index(domain, flags);
>         +        assert(heap >= 0 && heap < RADEON_MAX_CACHED_HEAPS);
>         +        usage = 1 << heap; /* Only set one usage bit for each
>         heap. */
>           -    bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache,
>         size, alignment,
>         -                                           usage,
>         pb_cache_bucket));
>         -    if (bo)
>         -        return &bo->base;
>         +        pb_cache_bucket = radeon_get_pb_cache_bucket_index(heap);
>         +        assert(pb_cache_bucket <
>         ARRAY_SIZE(ws->bo_cache.buckets));
>         +
>         +        bo = radeon_bo(pb_cache_reclaim_buffer(&ws->bo_cache,
>         size, alignment,
>         +  usage, pb_cache_bucket));
>         +        if (bo)
>         +            return &bo->base;
>         +    }
>                 bo = radeon_create_bo(ws, size, alignment, usage,
>         domain, flags,
>                                     pb_cache_bucket);
>               if (!bo) {
>                   /* Clear the cache and try again. */
>                   if (ws->info.has_virtual_memory)
>                       pb_slabs_reclaim(&ws->bo_slabs);
>                   pb_cache_release_all_buffers(&ws->bo_cache);
>                   bo = radeon_create_bo(ws, size, alignment, usage,
>         domain, flags,
>                                         pb_cache_bucket);
>                   if (!bo)
>                       return NULL;
>               }
>           -    bo->u.real.use_reusable_pool = true;
>         +    bo->u.real.use_reusable_pool = use_reusable_pool;
>                 mtx_lock(&ws->bo_handles_mutex);
>               util_hash_table_set(ws->bo_handles,
>         (void*)(uintptr_t)bo->handle, bo);
>               mtx_unlock(&ws->bo_handles_mutex);
>                 return &bo->base;
>           }
>             static struct pb_buffer *radeon_winsys_bo_from_ptr(struct
>         radeon_winsys *rws,
>          void *pointer, uint64_t size)
>
>
>
On 07/20/2017 04:20 AM, zhoucm1 wrote:
> 
> 
> On 07/19/2017 11:34 PM, Marek Olšák wrote:
>>
>>
>> On Jul 19, 2017 3:36 AM, "zhoucm1" <david1.zhou@amd.com> wrote:
>>
>>
>>
>>     On 07/19/2017 04:08 AM, Marek Olšák wrote:
>>
>>         From: Marek Olšák <marek.olsak@amd.com>
>>
>>         For lower overhead in the CS ioctl.
>>         Winsys allocators are not used with interprocess-sharable
>>         resources.
>>
>>     Hi Marek,
>>
>>     Could you explain how this approach reduces overhead in the CS
>>     ioctl? By reusing BOs to shorten the BO list?
>>
>>
>> The kernel part of the work hasn't been done yet. The idea is that 
>> nonsharable buffers don't have to be revalidated by TTM,
> OK, maybe I can only see the whole picture of this idea once you
> complete the kernel part.
> Out of curiosity, why/how can nonsharable buffers avoid revalidation by
> TTM without exposing something like an amdgpu_bo_make_resident API?
> 
> As mentioned in another thread, if we can expose a make_resident API, we
> can remove the BO list; we could even remove the reservation operation in
> the CS ioctl.
> And now I think our BO list is a very bad design:
> first, the UMD must create a BO list for every command submission, which
> is extra CPU overhead compared with the traditional way;
> second, the kernel also has to iterate the list. When the BO list is very
> long, as with OpenCL programs that routinely throw several thousand BOs
> into it, reservation must keep those thousands of ww_mutexes safe, and
> the CPU overhead is too big.
>
> So I strongly suggest we expose a make_resident API to user space. If we
> cannot, I want to know the specific reasons, to see whether we can solve
> them.

Introducing a make_resident API will also help ARB_bindless_texture a
lot, because currently, when a texture is marked as resident, we have to
re-validate the related buffers for every new CS, just like traditional
buffers. With a resident BO list, that whole mechanism could be skipped.
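
To make that concrete, below is a minimal sketch of what such an interface
could look like from the driver's side. It is purely hypothetical:
hypo_bo_make_resident() and the surrounding types are invented for
illustration and exist in neither libdrm nor the kernel.

/* Hypothetical sketch only: hypo_bo_make_resident() and these types are
 * invented for illustration; they exist in neither libdrm nor the kernel. */
#include <stdbool.h>
#include <stdint.h>

struct hypo_bo {
   uint32_t handle;
   bool resident;   /* on the kernel's per-fd resident list? */
};

/* Assumed interface: pin-less residency. The BO stays evictable, but the
 * CS ioctl no longer needs it in a per-submit BO list. */
static int hypo_bo_make_resident(int fd, struct hypo_bo *bo, bool resident)
{
   (void)fd;                /* a real implementation would issue an ioctl */
   bo->resident = resident;
   return 0;
}

/* For ARB_bindless_texture: mark the texture's backing BO once, instead
 * of re-validating it in every command submission. */
static void hypo_texture_make_resident(int fd, struct hypo_bo *tex)
{
   if (!tex->resident)
      hypo_bo_make_resident(fd, tex, true);
}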

> 
> 
> Regards,
> David Zhou
>> so it should remove a lot of kernel overhead and the BO list remains 
>> the same.
>>
>> Marek
>>
>>
>>
>>     Thanks,
>>     David Zhou
On Jul 19, 2017 10:21 PM, "zhoucm1" <david1.zhou@amd.com> wrote:



On 07/19/2017 11:34 PM, Marek Olšák wrote:



On Jul 19, 2017 3:36 AM, "zhoucm1" <david1.zhou@amd.com> wrote:



On 07/19/2017 04:08 AM, Marek Olšák wrote:

> From: Marek Olšák <marek.olsak@amd.com>
>
> For lower overhead in the CS ioctl.
> Winsys allocators are not used with interprocess-sharable resources.
>
Hi Marek,

Could you explain how this approach reduces overhead in the CS ioctl? By
reusing BOs to shorten the BO list?


The kernel part of the work hasn't been done yet. The idea is that
nonsharable buffers don't have to be revalidated by TTM,

OK, maybe I can only see the whole picture of this idea once you complete
the kernel part.
Out of curiosity, why/how can nonsharable buffers avoid revalidation by TTM
without exposing something like an amdgpu_bo_make_resident API?


I think the idea is that all nonsharable buffers will be backed by the same
reservation object, so TTM can skip buffer validation if no buffer has been
moved. It's just an optimization for the current design.
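
As a rough illustration of that shared-reservation idea, here is a sketch
with invented names (this is not actual TTM or amdgpu code):

#include <stdint.h>

/* One reservation object per DRM file descriptor; every non-sharable BO
 * of the process points at it. */
struct hypo_resv {
   uint64_t move_seq;   /* bumped whenever any backing BO is moved */
};

struct hypo_bo {
   struct hypo_resv *resv;   /* shared object for non-sharable BOs */
};

/* Returns nonzero when per-BO validation can be skipped for this submit:
 * if nothing behind the shared reservation object moved since the last
 * CS, all placements are still valid. */
static int hypo_can_skip_validation(const struct hypo_resv *resv,
                                    uint64_t last_validated_seq)
{
   return resv->move_seq == last_validated_seq;
}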


As mentioned in another thread, if we can expose a make_resident API, we
can remove the BO list; we could even remove the reservation operation in
the CS ioctl.
And now I think our BO list is a very bad design:
first, the UMD must create a BO list for every command submission, which is
extra CPU overhead compared with the traditional way;
second, the kernel also has to iterate the list. When the BO list is very
long, as with OpenCL programs that routinely throw several thousand BOs
into it, reservation must keep those thousands of ww_mutexes safe, and the
CPU overhead is too big.

So I strongly suggest we expose a make_resident API to user space. If we
cannot, I want to know the specific reasons, to see whether we can solve
them.


Yeah, I think the BO list idea is likely to die sooner or later. It made
sense for GL before bindless was a thing. Nowadays I don't see much value
in it.

MesaGL will keep tracking the BO list because it's a requirement for good
GL performance (it determines whether to flush IBs before BO
synchronization, it allows tracking fences for each BO, which are used to
determine dependencies between IBs, and that all allows async SDMA and
async compute for GL, which doesn't have separate queues).
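
The per-BO fence tracking mentioned here can be sketched as follows
(invented names, not the actual winsys code):

#include <stdbool.h>
#include <stdint.h>

struct hypo_fence {
   uint64_t seq;
   bool signalled;
};

struct hypo_bo {
   struct hypo_fence *last_write;   /* fence of the last IB writing this BO */
};

/* Before reusing a BO in a new IB, check whether the previous writer has
 * completed; if not, the new IB depends on it (or the current IB must be
 * flushed first). */
static bool hypo_needs_dependency(const struct hypo_bo *bo)
{
   return bo->last_write && !bo->last_write->signalled;
}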

However, we don't need any BO list at the libdrm level and lower. I think a
BO_CREATE flag that causes the buffer to be added to a kernel-side per-fd
BO list would be sufficient. How the kernel manages its BO list should be
its own implementation detail. Initially we can just move the current BO
list management into the kernel.

Marek
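
Below is a sketch of how such a BO_CREATE flag might look from userspace.
The flag bit, request struct, and ioctl are invented for illustration;
none of them are part of amdgpu_drm.h:

#include <stdint.h>
#include <string.h>

#define HYPO_GEM_CREATE_ALWAYS_RESIDENT (1u << 7)   /* made-up flag bit */

struct hypo_gem_create {
   uint64_t size;
   uint64_t flags;
   uint32_t handle;   /* out: BO handle */
};

/* On creation the kernel would append the BO to its per-fd list, so the
 * CS ioctl no longer needs an explicit bo_list from userspace. */
static int hypo_gem_create_tracked(int fd, uint64_t size, uint32_t *handle)
{
   struct hypo_gem_create req;

   memset(&req, 0, sizeof(req));
   req.size = size;
   req.flags = HYPO_GEM_CREATE_ALWAYS_RESIDENT;
   /* A real implementation would do something like:
    *    drmCommandWriteRead(fd, HYPO_GEM_CREATE, &req, sizeof(req));
    */
   (void)fd;
   *handle = req.handle;
   return 0;
}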





Regards,
David Zhou

so it should remove a lot of kernel overhead and the BO list remains the
same.

Marek



Thanks,
David Zhou


On 07/20/2017 04:59 PM, Marek Olšák wrote:
>
>
> On Jul 19, 2017 10:21 PM, "zhoucm1" <david1.zhou@amd.com> wrote:
>
>
>
>     On 07/19/2017 11:34 PM, Marek Olšák wrote:
>>
>>
>>     On Jul 19, 2017 3:36 AM, "zhoucm1" <david1.zhou@amd.com> wrote:
>>
>>
>>
>>         On 07/19/2017 04:08 AM, Marek Olšák wrote:
>>
>>             From: Marek Olšák <marek.olsak@amd.com>
>>
>>             For lower overhead in the CS ioctl.
>>             Winsys allocators are not used with interprocess-sharable
>>             resources.
>>
>>         Hi Marek,
>>
>>         Could you explain how this approach reduces overhead in the CS
>>         ioctl? By reusing BOs to shorten the BO list?
>>
>>
>>     The kernel part of the work hasn't been done yet. The idea is
>>     that nonsharable buffers don't have to be revalidated by TTM,
>     OK, maybe I can only see the whole picture of this idea once you
>     complete the kernel part.
>     Out of curiosity, why/how can nonsharable buffers avoid revalidation
>     by TTM without exposing something like an amdgpu_bo_make_resident
>     API?
>
>
> I think the idea is that all nonsharable buffers will be backed by the 
> same reservation object, so TTM can skip buffer validation if no 
> buffer has been moved. It's just an optimization for the current design.
>
>
>     As mentioned in another thread, if we can expose a make_resident
>     API, we can remove the BO list; we could even remove the reservation
>     operation in the CS ioctl.
>

Actually that is NOT a resident API. In other words, BOs marked like this
can still be swapped out like other BOs.

It's just that on command submission, in addition to the BOs that come
from the BO list, we also have the BOs associated with the process.
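
In pseudo-kernel C, that distinction might look like this (a sketch with
invented names, not actual amdgpu code):

struct hypo_bo;   /* opaque buffer object */

struct hypo_bo_set {
   struct hypo_bo **bos;
   unsigned count;
};

static void hypo_add_bos(struct hypo_bo_set *dst,
                         struct hypo_bo *const *bos, unsigned count)
{
   unsigned i;

   for (i = 0; i < count; i++)
      dst->bos[dst->count++] = bos[i];   /* assumes dst has capacity */
}

/* Both sets stay evictable; per-process BOs simply never travel through
 * a userspace BO list. */
static void hypo_build_submit_set(struct hypo_bo_set *out,
                                  const struct hypo_bo_set *bo_list,
                                  const struct hypo_bo_set *process_bos)
{
   hypo_add_bos(out, bo_list->bos, bo_list->count);           /* explicit */
   hypo_add_bos(out, process_bos->bos, process_bos->count);   /* implicit */
}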

>     And now I think our BO list is a very bad design:
>     first, the UMD must create a BO list for every command submission,
>     which is extra CPU overhead compared with the traditional way;
>     second, the kernel also has to iterate the list. When the BO list is
>     very long, as with OpenCL programs that routinely throw several
>     thousand BOs into it, reservation must keep those thousands of
>     ww_mutexes safe, and the CPU overhead is too big.
>
>     So I strongly suggest we expose a make_resident API to user space.
>     If we cannot, I want to know the specific reasons, to see whether we
>     can solve them.
>
>
> Yeah, I think the BO list idea is likely to die sooner or later. It 
> made sense for GL before bindless was a thing. Nowadays I don't see 
> much value in it.

Completely agree. The BO list API was not a good idea in the first place.

Regards,
Christian.

>
> MesaGL will keep tracking the BO list because it's a requirement for 
> good GL performance (it determines whether to flush IBs before BO 
> synchronization, it allows tracking fences for each BO, which are used 
> to determine dependencies between IBs, and that all allows async SDMA 
> and async compute for GL, which doesn't have separate queues).
>
> However, we don't need any BO list at the libdrm level and lower. I
> think a BO_CREATE flag that causes the buffer to be added to a
> kernel-side per-fd BO list would be sufficient. How the kernel manages
> its BO list should be its own implementation detail. Initially we can
> just move the current BO list management into the kernel.
>
> Marek
>
>
>
>
>
>     Regards,
>     David Zhou
>
>>     so it should remove a lot of kernel overhead and the BO list
>>     remains the same.
>>
>>     Marek
>>
>>             [remainder of quoted message and patch snipped]

On 2017-07-20 22:59, Marek Olšák wrote:
>
>
> [earlier exchange quoted in full above; snipped]
>
> Yeah, I think the BO list idea is likely to die sooner or later. It 
> made sense for GL before bindless was a thing. Nowadays I don't see 
> much value in it.
>
> MesaGL will keep tracking the BO list because it's a requirement for 
> good GL performance (it determines whether to flush IBs before BO 
> synchronization, it allows tracking fences for each BO, which are used 
> to determine dependencies between IBs, and all of that enables async 
> SDMA and async compute for GL, which doesn't have separate queues).
>
> However, we don't need any BO list at the libdrm level and lower. I 
> think a BO_CREATE flag that causes the buffer to be added to a 
> kernel-side per-fd BO list would be sufficient. How the kernel manages 
> its BO list should be its own implementation detail. Initially we can 
> just move the current BO list management into the kernel.
I guess this idea will just make the BO list situation worse: it 
decreases UMD effort but increases kernel driver complexity.

First, from your and Christian's comments, we agree that the BO list 
design is not a good approach.
My proposal of exposing amdgpu_bo_make_resident is meant to replace the 
BO list. If we can make all needed BOs resident, then we don't need to 
validate them again in the CS ioctl, and we no longer need their 
reservation locks. After the job is pushed to the scheduler, we can make 
the BOs non-resident again.
We could even do it for VM BOs; then we wouldn't need to check for VM 
updates again, since that is already done in the VA map ioctl.
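
As an API sketch of this proposal (entirely hypothetical -- neither
entry point exists in libdrm_amdgpu today):

/* Hypothetical residency API; not part of libdrm_amdgpu. */
int amdgpu_bo_make_resident(amdgpu_device_handle dev, amdgpu_bo_handle bo);
int amdgpu_bo_make_unresident(amdgpu_device_handle dev, amdgpu_bo_handle bo);

/* Usage sketch: mark the BO resident once, right after allocation... */
amdgpu_bo_alloc(dev, &request, &bo);
amdgpu_bo_make_resident(dev, bo);

/* ...then submit as often as needed with no BO list and no per-BO
 * reservation in the CS ioctl (cs_request.resources left NULL)... */
amdgpu_cs_submit(ctx, 0, &cs_request, 1);

/* ...and drop residency when the buffer is no longer needed; the
 * kernel could also do this internally once a job leaves the
 * scheduler. */
amdgpu_bo_make_unresident(dev, bo);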

If this gets done (and eviction is improved further), I cannot see any 
obvious performance gap.

What do you think of this proposal of exposing an amdgpu_bo_make_resident 
API to user space? Or is there any other idea we can discuss?

If you all agree, I can volunteer to try it with the UMD guys.

Regards,
David Zhou

> [remainder of quoted message and patch snipped]