[1/2] intel: align reuse buffer's size on page size instead

Submitted by James Xiong on Feb. 20, 2018, 5:48 p.m.

Details

Message ID 1519148883-18999-1-git-send-email-james.xiong@intel.com
State New
Series "Series without cover letter"

Commit Message

James Xiong Feb. 20, 2018, 5:48 p.m.
From: "Xiong, James" <james.xiong@intel.com>

With gem_reuse enabled, when a buffer size is different from
the sizes of the buckets, it is rounded up to the next bucket's size,
which means about 25% more memory than requested is allocated
in the worst scenario. For example:

Original size   Actual
32KB+1Byte      40KB
.
.
.
8MB+1Byte       10MB
.
.
.
96MB+1Byte      112MB

This is very memory expensive and makes the reuse feature less
attractive than it deserves to be.

This commit aligns the reuse buffer size to the page size instead:
a buffer whose size falls between bucket[n] and bucket[n+1] is
put in bucket[n] when it is freed, and when searching for a cached
buffer to reuse, the cached-buffer list in that bucket is walked
until a buffer whose size is larger than or equal to the requested
size is found.
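
To make the trade-off concrete, here is a minimal standalone sketch (not part
of the patch) comparing the old bucket rounding with the proposed page-size
alignment. The bucket ladder and PAGE_SIZE below are illustrative values, not
the exact table libdrm builds at runtime.

#include <stddef.h>
#include <stdio.h>

/* Illustrative bucket ladder around 32KB; the real cache in
 * intel_bufmgr_gem.c uses a similar progression up to 112MB.
 */
static const unsigned long buckets[] = { 32768, 40960, 49152, 65536 };
#define NBUCKETS (sizeof(buckets) / sizeof(buckets[0]))
#define PAGE_SIZE 4096UL   /* assume 4KB pages; libdrm queries getpagesize() */

/* old behaviour: round the request up to the first bucket that fits */
static unsigned long round_to_bucket(unsigned long size)
{
	size_t i;

	for (i = 0; i < NBUCKETS; i++)
		if (buckets[i] >= size)
			return buckets[i];
	return size; /* no bucket large enough: allocate the exact size */
}

/* new behaviour: only round up to a whole number of pages */
static unsigned long round_to_page(unsigned long size)
{
	return (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long request = 32768 + 1; /* 32KB + 1 byte, as in the table */

	printf("request %lu -> bucket rounding %lu, page alignment %lu\n",
	       request, round_to_bucket(request), round_to_page(request));
	/* prints: request 32769 -> bucket rounding 40960, page alignment 36864 */
	return 0;
}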

Signed-off-by: Xiong, James <james.xiong@intel.com>
---
 intel/intel_bufmgr_gem.c | 180 +++++++++++++++++++++++++----------------------
 libdrm_lists.h           |   6 ++
 2 files changed, 101 insertions(+), 85 deletions(-)


diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 386da30..5b2d0d0 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -402,11 +402,10 @@  drm_intel_gem_bo_bucket_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
 {
 	int i;
 
-	for (i = 0; i < bufmgr_gem->num_buckets; i++) {
-		struct drm_intel_gem_bo_bucket *bucket =
-		    &bufmgr_gem->cache_bucket[i];
-		if (bucket->size >= size) {
-			return bucket;
+	for (i = 0; i < bufmgr_gem->num_buckets - 1; i++) {
+		if (size >= bufmgr_gem->cache_bucket[i].size &&
+		    size < bufmgr_gem->cache_bucket[i+1].size) {
+			return &bufmgr_gem->cache_bucket[i];
 		}
 	}
 
@@ -685,25 +684,95 @@  drm_intel_gem_bo_madvise(drm_intel_bo *bo, int madv)
 		 madv);
 }
 
-/* drop the oldest entries that have been purged by the kernel */
+/* drop the entries that are older than the given time */
 static void
 drm_intel_gem_bo_cache_purge_bucket(drm_intel_bufmgr_gem *bufmgr_gem,
-				    struct drm_intel_gem_bo_bucket *bucket)
+				    struct drm_intel_gem_bo_bucket *bucket,
+				    time_t time)
 {
-	while (!DRMLISTEMPTY(&bucket->head)) {
-		drm_intel_bo_gem *bo_gem;
-
-		bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-				      bucket->head.next, head);
-		if (drm_intel_gem_bo_madvise_internal
-		    (bufmgr_gem, bo_gem, I915_MADV_DONTNEED))
-			break;
-
-		DRMLISTDEL(&bo_gem->head);
-		drm_intel_gem_bo_free(&bo_gem->bo);
+	drm_intel_bo_gem *bo_gem, *temp;
+	DRMLISTFOREACHENTRYSAFE(bo_gem, temp, &bucket->head, head) {
+		if (bo_gem->free_time >= time) {
+			drm_intel_gem_bo_madvise_internal
+				(bufmgr_gem, bo_gem, I915_MADV_DONTNEED);
+			DRMLISTDEL(&bo_gem->head);
+			drm_intel_gem_bo_free(&bo_gem->bo);
+		}
 	}
 }
 
+static drm_intel_bo_gem *
+drm_intel_gem_bo_cached_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
+                                 unsigned long size,
+                                 uint32_t tiling_mode,
+                                 unsigned long stride,
+                                 unsigned long alignment,
+                                 bool for_render)
+{
+	struct drm_intel_gem_bo_bucket *bucket =
+		drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
+
+	if(bucket != NULL) {
+		drm_intel_bo_gem *bo_gem, *temp_bo_gem;
+retry:
+		bo_gem = NULL;
+		if (for_render) {
+			/* Allocate new render-target BOs from the tail (MRU)
+			 * of the list, as it will likely be hot in the GPU
+			 * cache and in the aperture for us.
+			 */
+			DRMLISTFOREACHENTRYREVERSE(temp_bo_gem, &bucket->head, head) {
+				if (temp_bo_gem->bo.size >= size) {
+					bo_gem = temp_bo_gem;
+					DRMLISTDEL(&bo_gem->head);
+					bo_gem->bo.align = alignment;
+					break;
+				}
+			}
+		} else {
+			assert(alignment == 0);
+			/* For non-render-target BOs (where we're probably
+			 * going to map it first thing in order to fill it
+			 * with data), check if the last BO in the cache is
+			 * unbusy, and only reuse in that case. Otherwise,
+			 * allocating a new buffer is probably faster than
+			 * waiting for the GPU to finish.
+			 */
+			DRMLISTFOREACHENTRY(temp_bo_gem, &bucket->head, head) {
+				if (temp_bo_gem->bo.size >= size &&
+				    !drm_intel_gem_bo_busy(&temp_bo_gem->bo)) {
+					bo_gem = temp_bo_gem;
+					DRMLISTDEL(&bo_gem->head);
+					break;
+				}
+			}
+		}
+
+		if(bo_gem) {
+			if (!drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
+							       I915_MADV_WILLNEED)) {
+				/* failed to reuse the cached buffer.
+				 * the buffers older cannot be reused and can be purged
+				 * from the bucket.
+				 */
+				drm_intel_gem_bo_cache_purge_bucket(bufmgr_gem, bucket,
+								    bo_gem->free_time);
+				drm_intel_gem_bo_free(&bo_gem->bo);
+				return NULL;
+			}
+
+			if (drm_intel_gem_bo_set_tiling_internal(&bo_gem->bo,
+								 tiling_mode,
+								 stride)) {
+				drm_intel_gem_bo_free(&bo_gem->bo);
+				goto retry;
+			}
+		}
+		return bo_gem;
+	}
+	return NULL;
+}
+
 static drm_intel_bo *
 drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
 				const char *name,
@@ -715,81 +784,22 @@  drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
 	drm_intel_bo_gem *bo_gem;
-	unsigned int page_size = getpagesize();
 	int ret;
-	struct drm_intel_gem_bo_bucket *bucket;
-	bool alloc_from_cache;
-	unsigned long bo_size;
 	bool for_render = false;
 
 	if (flags & BO_ALLOC_FOR_RENDER)
 		for_render = true;
 
-	/* Round the allocated size up to a power of two number of pages. */
-	bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
-
-	/* If we don't have caching at this size, don't actually round the
-	 * allocation up.
-	 */
-	if (bucket == NULL) {
-		bo_size = size;
-		if (bo_size < page_size)
-			bo_size = page_size;
-	} else {
-		bo_size = bucket->size;
-	}
+	/* first align the size on page boundary */
+	size = ALIGN(size, getpagesize());
 
 	pthread_mutex_lock(&bufmgr_gem->lock);
+
 	/* Get a buffer out of the cache if available */
-retry:
-	alloc_from_cache = false;
-	if (bucket != NULL && !DRMLISTEMPTY(&bucket->head)) {
-		if (for_render) {
-			/* Allocate new render-target BOs from the tail (MRU)
-			 * of the list, as it will likely be hot in the GPU
-			 * cache and in the aperture for us.
-			 */
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.prev, head);
-			DRMLISTDEL(&bo_gem->head);
-			alloc_from_cache = true;
-			bo_gem->bo.align = alignment;
-		} else {
-			assert(alignment == 0);
-			/* For non-render-target BOs (where we're probably
-			 * going to map it first thing in order to fill it
-			 * with data), check if the last BO in the cache is
-			 * unbusy, and only reuse in that case. Otherwise,
-			 * allocating a new buffer is probably faster than
-			 * waiting for the GPU to finish.
-			 */
-			bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
-					      bucket->head.next, head);
-			if (!drm_intel_gem_bo_busy(&bo_gem->bo)) {
-				alloc_from_cache = true;
-				DRMLISTDEL(&bo_gem->head);
-			}
-		}
+	bo_gem = drm_intel_gem_bo_cached_for_size(bufmgr_gem, size, tiling_mode,
+						  stride, alignment, for_render);
 
-		if (alloc_from_cache) {
-			if (!drm_intel_gem_bo_madvise_internal
-			    (bufmgr_gem, bo_gem, I915_MADV_WILLNEED)) {
-				drm_intel_gem_bo_free(&bo_gem->bo);
-				drm_intel_gem_bo_cache_purge_bucket(bufmgr_gem,
-								    bucket);
-				goto retry;
-			}
-
-			if (drm_intel_gem_bo_set_tiling_internal(&bo_gem->bo,
-								 tiling_mode,
-								 stride)) {
-				drm_intel_gem_bo_free(&bo_gem->bo);
-				goto retry;
-			}
-		}
-	}
-
-	if (!alloc_from_cache) {
+	if (bo_gem == NULL) {
 		struct drm_i915_gem_create create;
 
 		bo_gem = calloc(1, sizeof(*bo_gem));
@@ -800,10 +810,10 @@  retry:
 		   list (vma_list), so better set the list head here */
 		DRMINITLISTHEAD(&bo_gem->vma_list);
 
-		bo_gem->bo.size = bo_size;
+		bo_gem->bo.size = size;
 
 		memclear(create);
-		create.size = bo_size;
+		create.size = size;
 
 		ret = drmIoctl(bufmgr_gem->fd,
 			       DRM_IOCTL_I915_GEM_CREATE,
diff --git a/libdrm_lists.h b/libdrm_lists.h
index 8926d8d..400c731 100644
--- a/libdrm_lists.h
+++ b/libdrm_lists.h
@@ -101,6 +101,12 @@  typedef struct _drmMMListHead
 	     (__item) = DRMLISTENTRY(typeof(*__item),                          \
 				     (__item)->__head.next, __head))
 
+#define DRMLISTFOREACHENTRYREVERSE(__item, __list, __head)                     \
+	for ((__item) = DRMLISTENTRY(typeof(*__item), (__list)->prev, __head); \
+	     &(__item)->__head != (__list);                                    \
+	     (__item) = DRMLISTENTRY(typeof(*__item),                          \
+				     (__item)->__head.prev, __head))
+
 #define DRMLISTFOREACHENTRYSAFE(__item, __temp, __list, __head)                \
 	for ((__item) = DRMLISTENTRY(typeof(*__item), (__list)->next, __head), \
 	     (__temp) = DRMLISTENTRY(typeof(*__item),                          \
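
For readers unfamiliar with the libdrm list macros, the following
self-contained sketch shows how the DRMLISTFOREACHENTRYREVERSE macro added
above walks a bucket MRU-first. The drmMMListHead helpers here are simplified
stand-ins for the real libdrm_lists.h definitions, and struct fake_bo is an
invented stand-in for drm_intel_bo_gem; only the reverse-iteration macro is
copied verbatim from the patch (it relies on GCC/clang typeof()).

#include <stddef.h>
#include <stdio.h>

typedef struct _drmMMListHead {
	struct _drmMMListHead *prev, *next;
} drmMMListHead;

/* simplified stand-ins for the libdrm_lists.h helpers */
#define DRMINITLISTHEAD(__l)  do { (__l)->prev = (__l); (__l)->next = (__l); } while (0)
#define DRMLISTADDTAIL(__item, __list)              \
	do {                                        \
		(__item)->next = (__list);          \
		(__item)->prev = (__list)->prev;    \
		(__list)->prev->next = (__item);    \
		(__list)->prev = (__item);          \
	} while (0)
#define DRMLISTENTRY(__type, __item, __field) \
	((__type *)((char *)(__item) - offsetof(__type, __field)))

/* copied from the patch */
#define DRMLISTFOREACHENTRYREVERSE(__item, __list, __head)                     \
	for ((__item) = DRMLISTENTRY(typeof(*__item), (__list)->prev, __head); \
	     &(__item)->__head != (__list);                                    \
	     (__item) = DRMLISTENTRY(typeof(*__item),                          \
				     (__item)->__head.prev, __head))

struct fake_bo {               /* invented stand-in for drm_intel_bo_gem */
	unsigned long size;
	drmMMListHead head;
};

int main(void)
{
	drmMMListHead bucket;          /* plays the role of bucket->head */
	struct fake_bo bos[3] = { { 4096 }, { 8192 }, { 12288 } };
	struct fake_bo *bo;
	int i;

	DRMINITLISTHEAD(&bucket);
	for (i = 0; i < 3; i++)
		DRMLISTADDTAIL(&bos[i].head, &bucket); /* tail == most recently freed */

	/* Walk MRU-first, as the for_render path in the patch does, and stop
	 * at the first cached bo that is large enough.
	 */
	DRMLISTFOREACHENTRYREVERSE(bo, &bucket, head)
		if (bo->size >= 8000) {
			printf("reusing bo of size %lu\n", bo->size); /* 12288 */
			break;
		}
	return 0;
}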

Comments

Chris Wilson Feb. 21, 2018, 9:43 p.m.
Quoting James Xiong (2018-02-20 17:48:03)
> From: "Xiong, James" <james.xiong@intel.com>
> 
> With gem_reuse enabled, when a buffer size is different from
> the sizes of the buckets, it is rounded up to the next bucket's size,
> which means about 25% more memory than requested is allocated
> in the worst scenario. For example:
> 
> Original size   Actual
> 32KB+1Byte      40KB
> .
> .
> .
> 8MB+1Byte       10MB
> .
> .
> .
> 96MB+1Byte      112MB
> 
> This is very memory expensive and makes the reuse feature less
> attractive than it deserves to be.

The reuse feature also misses one important source: reusing temporaries
within a batch.
 
> This commit aligns the reuse buffer size to the page size instead:
> a buffer whose size falls between bucket[n] and bucket[n+1] is
> put in bucket[n] when it is freed, and when searching for a cached
> buffer to reuse, the cached-buffer list in that bucket is walked
> until a buffer whose size is larger than or equal to the requested
> size is found.

So how many times do you have to allocate a new buffer because you
refused to hand back a slightly larger buffer? Have you checked the
impact on !llc? With mmaps? On how wide a workload?

> Signed-off-by: Xiong, James <james.xiong@intel.com>
> ---
>  intel/intel_bufmgr_gem.c | 180 +++++++++++++++++++++++++----------------------
>  libdrm_lists.h           |   6 ++
>  2 files changed, 101 insertions(+), 85 deletions(-)
> 
> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> index 386da30..5b2d0d0 100644
> --- a/intel/intel_bufmgr_gem.c
> +++ b/intel/intel_bufmgr_gem.c
> @@ -402,11 +402,10 @@ drm_intel_gem_bo_bucket_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
>  {
>         int i;
>  
> -       for (i = 0; i < bufmgr_gem->num_buckets; i++) {
> -               struct drm_intel_gem_bo_bucket *bucket =
> -                   &bufmgr_gem->cache_bucket[i];
> -               if (bucket->size >= size) {
> -                       return bucket;
> +       for (i = 0; i < bufmgr_gem->num_buckets - 1; i++) {
> +               if (size >= bufmgr_gem->cache_bucket[i].size &&
> +                   size < bufmgr_gem->cache_bucket[i+1].size) {
> +                       return &bufmgr_gem->cache_bucket[i];

So you never return the last bucket?

Given the ordered set of buckets, the test remains correct even when
every bo within the bucket is not the full size (each bo is still at
least bigger than the previous bucket).

>                 }
>         }
>  
> @@ -685,25 +684,95 @@ drm_intel_gem_bo_madvise(drm_intel_bo *bo, int madv)
>                  madv);
>  }
>  
> -/* drop the oldest entries that have been purged by the kernel */
> +/* drop the entries that are older than the given time */
>  static void
>  drm_intel_gem_bo_cache_purge_bucket(drm_intel_bufmgr_gem *bufmgr_gem,
> -                                   struct drm_intel_gem_bo_bucket *bucket)
> +                                   struct drm_intel_gem_bo_bucket *bucket,
> +                                   time_t time)
>  {
> -       while (!DRMLISTEMPTY(&bucket->head)) {
> -               drm_intel_bo_gem *bo_gem;
> -
> -               bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
> -                                     bucket->head.next, head);
> -               if (drm_intel_gem_bo_madvise_internal
> -                   (bufmgr_gem, bo_gem, I915_MADV_DONTNEED))
> -                       break;
> -
> -               DRMLISTDEL(&bo_gem->head);
> -               drm_intel_gem_bo_free(&bo_gem->bo);
> +       drm_intel_bo_gem *bo_gem, *temp;
> +       DRMLISTFOREACHENTRYSAFE(bo_gem, temp, &bucket->head, head) {
> +               if (bo_gem->free_time >= time) {
> +                       drm_intel_gem_bo_madvise_internal
> +                               (bufmgr_gem, bo_gem, I915_MADV_DONTNEED);
> +                       DRMLISTDEL(&bo_gem->head);
> +                       drm_intel_gem_bo_free(&bo_gem->bo);
> +               }

This function is called after the kernel reports that it purged a
buffer, and the intent here is that we find all the buffers that the
kernel purged and evict them. It is not about discarding old buffers,
just throwing out the empty.

Honestly, it's pointless. The cost of having the empty purged bo around
is insignificant (we reclaim a little bit of mmap virtual space
quicker).

>         }
>  }
>  
> +static drm_intel_bo_gem *
> +drm_intel_gem_bo_cached_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
> +                                 unsigned long size,
> +                                 uint32_t tiling_mode,
> +                                 unsigned long stride,
> +                                 unsigned long alignment,
> +                                 bool for_render)
> +{
> +       struct drm_intel_gem_bo_bucket *bucket =
> +               drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
> +
> +       if(bucket != NULL) {
> +               drm_intel_bo_gem *bo_gem, *temp_bo_gem;
> +retry:
> +               bo_gem = NULL;
> +               if (for_render) {
> +                       /* Allocate new render-target BOs from the tail (MRU)
> +                        * of the list, as it will likely be hot in the GPU
> +                        * cache and in the aperture for us.
> +                        */
> +                       DRMLISTFOREACHENTRYREVERSE(temp_bo_gem, &bucket->head, head) {
> +                               if (temp_bo_gem->bo.size >= size) {
> +                                       bo_gem = temp_bo_gem;
> +                                       DRMLISTDEL(&bo_gem->head);
> +                                       bo_gem->bo.align = alignment;
> +                                       break;
> +                               }
> +                       }
> +               } else {
> +                       assert(alignment == 0);

This is random.

> +                       /* For non-render-target BOs (where we're probably
> +                        * going to map it first thing in order to fill it
> +                        * with data), check if the last BO in the cache is
> +                        * unbusy, and only reuse in that case. Otherwise,
> +                        * allocating a new buffer is probably faster than
> +                        * waiting for the GPU to finish.
> +                        */
> +                       DRMLISTFOREACHENTRY(temp_bo_gem, &bucket->head, head) {
> +                               if (temp_bo_gem->bo.size >= size &&
> +                                   !drm_intel_gem_bo_busy(&temp_bo_gem->bo)) {
> +                                       bo_gem = temp_bo_gem;
> +                                       DRMLISTDEL(&bo_gem->head);
> +                                       break;
> +                               }
> +                       }
> +               }
> +
> +               if(bo_gem) {
> +                       if (!drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
> +                                                              I915_MADV_WILLNEED)) {
> +                               /* failed to reuse the cached buffer.
> +                                * the buffers older cannot be reused and can be purged
> +                                * from the bucket.
> +                                */
> +                               drm_intel_gem_bo_cache_purge_bucket(bufmgr_gem, bucket,
> +                                                                   bo_gem->free_time);
> +                               drm_intel_gem_bo_free(&bo_gem->bo);
> +                               return NULL;
> +                       }
> +
> +                       if (drm_intel_gem_bo_set_tiling_internal(&bo_gem->bo,
> +                                                                tiling_mode,
> +                                                                stride)) {
> +                               drm_intel_gem_bo_free(&bo_gem->bo);
> +                               goto retry;
> +                       }
> +               }
> +               return bo_gem;
> +       }
> +       return NULL;
> +}
> +
>  static drm_intel_bo *
>  drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
>                                 const char *name,
> @@ -715,81 +784,22 @@ drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
>  {
>         drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
>         drm_intel_bo_gem *bo_gem;
> -       unsigned int page_size = getpagesize();
>         int ret;
> -       struct drm_intel_gem_bo_bucket *bucket;
> -       bool alloc_from_cache;
> -       unsigned long bo_size;
>         bool for_render = false;
>  
>         if (flags & BO_ALLOC_FOR_RENDER)
>                 for_render = true;
>  
> -       /* Round the allocated size up to a power of two number of pages. */
> -       bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
> -
> -       /* If we don't have caching at this size, don't actually round the
> -        * allocation up.
> -        */
> -       if (bucket == NULL) {
> -               bo_size = size;
> -               if (bo_size < page_size)
> -                       bo_size = page_size;
> -       } else {
> -               bo_size = bucket->size;
> -       }
> +       /* first align the size on page boundary */
> +       size = ALIGN(size, getpagesize());

Ime, you want tight (exact) fitting for long lived objects and loose
fitting for temporaries. Do you want to pass in a flag?
James Xiong Feb. 21, 2018, 11:54 p.m.
On Wed, Feb 21, 2018 at 09:43:55PM +0000, Chris Wilson wrote:
> Quoting James Xiong (2018-02-20 17:48:03)
> > From: "Xiong, James" <james.xiong@intel.com>
> > 
> > With gem_reuse enabled, when a buffer size is different from
> > the sizes of the buckets, it is rounded up to the next bucket's size,
> > which means about 25% more memory than requested is allocated
> > in the worst scenario. For example:
> > 
> > Original size   Actual
> > 32KB+1Byte      40KB
> > .
> > .
> > .
> > 8MB+1Byte       10MB
> > .
> > .
> > .
> > 96MB+1Byte      112MB
> > 
> > This is very memory expensive and makes the reuse feature less
> > attractive than it deserves to be.
> 
> The reuse feature also misses one important source: reusing temporaries
> within a batch.
>  
> > This commit aligns the reuse buffer size to the page size instead:
> > a buffer whose size falls between bucket[n] and bucket[n+1] is
> > put in bucket[n] when it is freed, and when searching for a cached
> > buffer to reuse, the cached-buffer list in that bucket is walked
> > until a buffer whose size is larger than or equal to the requested
> > size is found.
> 
> So how many times do you have to allocate a new buffer because you
> refused to hand back a slightly larger buffer? Have you checked the
> impact on !llc? With mmaps? On how wide a workload?

bucket[n] contains a list of buffers with sizes between bucket[n].size
and bucket[n+1].size - 1, so a larger cached buffer can still be reused
if one is available.
I managed to run some performance tests on GEN9 without noticeable
performance impact, but I haven't had a chance to test wider coverage yet.
> 
> > Signed-off-by: Xiong, James <james.xiong@intel.com>
> > ---
> >  intel/intel_bufmgr_gem.c | 180 +++++++++++++++++++++++++----------------------
> >  libdrm_lists.h           |   6 ++
> >  2 files changed, 101 insertions(+), 85 deletions(-)
> > 
> > diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> > index 386da30..5b2d0d0 100644
> > --- a/intel/intel_bufmgr_gem.c
> > +++ b/intel/intel_bufmgr_gem.c
> > @@ -402,11 +402,10 @@ drm_intel_gem_bo_bucket_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
> >  {
> >         int i;
> >  
> > -       for (i = 0; i < bufmgr_gem->num_buckets; i++) {
> > -               struct drm_intel_gem_bo_bucket *bucket =
> > -                   &bufmgr_gem->cache_bucket[i];
> > -               if (bucket->size >= size) {
> > -                       return bucket;
> > +       for (i = 0; i < bufmgr_gem->num_buckets - 1; i++) {
> > +               if (size >= bufmgr_gem->cache_bucket[i].size &&
> > +                   size < bufmgr_gem->cache_bucket[i+1].size) {
> > +                       return &bufmgr_gem->cache_bucket[i];
> 
> So you never return the last bucket?

The last bucket's size is 112MB; I can add a 128MB bucket at the end to
cache and reuse allocations in [112MB, 128MB-1] if you think it's
important.
> 
> Given the ordered set of buckets, the test remains correct even when
> every bo within the bucket is not the full size (each bo is still at
> least bigger than the previous bucket).

The function returns a bucket according to the requested buffer size
at allocation time:
if (buffer_size in [bucket[n].size, bucket[n+1].size))
    return &bucket[n];
It is also called at free time. In both cases, the correct bucket is returned.
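
As a standalone restatement of this lookup rule (with illustrative sizes, not
the exact libdrm bucket table), a request of size s lands in bucket n when
bucket[n].size <= s < bucket[n+1].size, and anything not covered is simply
left uncached:

#include <stddef.h>
#include <stdio.h>

static const unsigned long bucket_size[] = { 4096, 8192, 12288, 16384 };
#define NBUCKETS (sizeof(bucket_size) / sizeof(bucket_size[0]))

/* Return n such that bucket_size[n] <= size < bucket_size[n+1], or -1 if the
 * size is not covered (note that sizes at or beyond the last bucket are never
 * matched, which is the behaviour Chris asks about above).
 */
static int bucket_index_for_size(unsigned long size)
{
	size_t i;

	for (i = 0; i + 1 < NBUCKETS; i++)
		if (size >= bucket_size[i] && size < bucket_size[i + 1])
			return (int)i;
	return -1;
}

int main(void)
{
	printf("%d\n", bucket_index_for_size(9000));   /* 1: 8192 <= 9000 < 12288 */
	printf("%d\n", bucket_index_for_size(16384));  /* -1: last bucket never returned */
	return 0;
}
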
> 
> >                 }
> >         }
> >  
> > @@ -685,25 +684,95 @@ drm_intel_gem_bo_madvise(drm_intel_bo *bo, int madv)
> >                  madv);
> >  }
> >  
> > -/* drop the oldest entries that have been purged by the kernel */
> > +/* drop the entries that are older than the given time */
> >  static void
> >  drm_intel_gem_bo_cache_purge_bucket(drm_intel_bufmgr_gem *bufmgr_gem,
> > -                                   struct drm_intel_gem_bo_bucket *bucket)
> > +                                   struct drm_intel_gem_bo_bucket *bucket,
> > +                                   time_t time)
> >  {
> > -       while (!DRMLISTEMPTY(&bucket->head)) {
> > -               drm_intel_bo_gem *bo_gem;
> > -
> > -               bo_gem = DRMLISTENTRY(drm_intel_bo_gem,
> > -                                     bucket->head.next, head);
> > -               if (drm_intel_gem_bo_madvise_internal
> > -                   (bufmgr_gem, bo_gem, I915_MADV_DONTNEED))
> > -                       break;
> > -
> > -               DRMLISTDEL(&bo_gem->head);
> > -               drm_intel_gem_bo_free(&bo_gem->bo);
> > +       drm_intel_bo_gem *bo_gem, *temp;
> > +       DRMLISTFOREACHENTRYSAFE(bo_gem, temp, &bucket->head, head) {
> > +               if (bo_gem->free_time >= time) {
> > +                       drm_intel_gem_bo_madvise_internal
> > +                               (bufmgr_gem, bo_gem, I915_MADV_DONTNEED);
> > +                       DRMLISTDEL(&bo_gem->head);
> > +                       drm_intel_gem_bo_free(&bo_gem->bo);
> > +               }
> 
> This function is called after the kernel reports that it purged a
> buffer, and the intent here is that we find all the buffers that the
> kernel purged and evict them. It is not about discarding old buffers,
> just throwing out the empty.

Yes, and previously every cached buffer in a bucket had the same size:
the MRU buffer was taken, and if it failed to be reused (purged by the kernel),
every buffer in the bucket was older than the MRU one and got evicted.
The new code follows the same logic: find the MRU cached buffer that is large
enough; if it has been purged by the kernel, the buffers that are older than
this one are evicted. The buffer's free_time is passed in, and we go through
the list and evict the older buffers.
> 
> Honestly, it's pointless. The cost of having the empty purged bo around
> is insignificant (we reclaim a little bit of mmap virtual space
> quicker).
> 
> >         }
> >  }
> >  
> > +static drm_intel_bo_gem *
> > +drm_intel_gem_bo_cached_for_size(drm_intel_bufmgr_gem *bufmgr_gem,
> > +                                 unsigned long size,
> > +                                 uint32_t tiling_mode,
> > +                                 unsigned long stride,
> > +                                 unsigned long alignment,
> > +                                 bool for_render)
> > +{
> > +       struct drm_intel_gem_bo_bucket *bucket =
> > +               drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
> > +
> > +       if(bucket != NULL) {
> > +               drm_intel_bo_gem *bo_gem, *temp_bo_gem;
> > +retry:
> > +               bo_gem = NULL;
> > +               if (for_render) {
> > +                       /* Allocate new render-target BOs from the tail (MRU)
> > +                        * of the list, as it will likely be hot in the GPU
> > +                        * cache and in the aperture for us.
> > +                        */
> > +                       DRMLISTFOREACHENTRYREVERSE(temp_bo_gem, &bucket->head, head) {
> > +                               if (temp_bo_gem->bo.size >= size) {
> > +                                       bo_gem = temp_bo_gem;
> > +                                       DRMLISTDEL(&bo_gem->head);
> > +                                       bo_gem->bo.align = alignment;
> > +                                       break;
> > +                               }
> > +                       }
> > +               } else {
> > +                       assert(alignment == 0);
> 
> This is random.

My understanding is that the alignment for non-render buffers has to be 0;
this assert is in the original code and was copied over.
> 
> > +                       /* For non-render-target BOs (where we're probably
> > +                        * going to map it first thing in order to fill it
> > +                        * with data), check if the last BO in the cache is
> > +                        * unbusy, and only reuse in that case. Otherwise,
> > +                        * allocating a new buffer is probably faster than
> > +                        * waiting for the GPU to finish.
> > +                        */
> > +                       DRMLISTFOREACHENTRY(temp_bo_gem, &bucket->head, head) {
> > +                               if (temp_bo_gem->bo.size >= size &&
> > +                                   !drm_intel_gem_bo_busy(&temp_bo_gem->bo)) {
> > +                                       bo_gem = temp_bo_gem;
> > +                                       DRMLISTDEL(&bo_gem->head);
> > +                                       break;
> > +                               }
> > +                       }
> > +               }
> > +
> > +               if(bo_gem) {
> > +                       if (!drm_intel_gem_bo_madvise_internal(bufmgr_gem, bo_gem,
> > +                                                              I915_MADV_WILLNEED)) {
> > +                               /* failed to reuse the cached buffer.
> > +                                * the buffers older cannot be reused and can be purged
> > +                                * from the bucket.
> > +                                */
> > +                               drm_intel_gem_bo_cache_purge_bucket(bufmgr_gem, bucket,
> > +                                                                   bo_gem->free_time);
> > +                               drm_intel_gem_bo_free(&bo_gem->bo);
> > +                               return NULL;
> > +                       }
> > +
> > +                       if (drm_intel_gem_bo_set_tiling_internal(&bo_gem->bo,
> > +                                                                tiling_mode,
> > +                                                                stride)) {
> > +                               drm_intel_gem_bo_free(&bo_gem->bo);
> > +                               goto retry;
> > +                       }
> > +               }
> > +               return bo_gem;
> > +       }
> > +       return NULL;
> > +}
> > +
> >  static drm_intel_bo *
> >  drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
> >                                 const char *name,
> > @@ -715,81 +784,22 @@ drm_intel_gem_bo_alloc_internal(drm_intel_bufmgr *bufmgr,
> >  {
> >         drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bufmgr;
> >         drm_intel_bo_gem *bo_gem;
> > -       unsigned int page_size = getpagesize();
> >         int ret;
> > -       struct drm_intel_gem_bo_bucket *bucket;
> > -       bool alloc_from_cache;
> > -       unsigned long bo_size;
> >         bool for_render = false;
> >  
> >         if (flags & BO_ALLOC_FOR_RENDER)
> >                 for_render = true;
> >  
> > -       /* Round the allocated size up to a power of two number of pages. */
> > -       bucket = drm_intel_gem_bo_bucket_for_size(bufmgr_gem, size);
> > -
> > -       /* If we don't have caching at this size, don't actually round the
> > -        * allocation up.
> > -        */
> > -       if (bucket == NULL) {
> > -               bo_size = size;
> > -               if (bo_size < page_size)
> > -                       bo_size = page_size;
> > -       } else {
> > -               bo_size = bucket->size;
> > -       }
> > +       /* first align the size on page boundary */
> > +       size = ALIGN(size, getpagesize());
> 
> Ime, you want tight (exact) fitting for long lived objects and loose
> fitting for temporaries. Do you want to pass in a flag?

I'm not sure I understand you correctly. Yes, we could pass in a flag to indicate
whether a buffer should be cached, i.e. use a loose/big alignment, but many times
we don't know and have no control over how long a buffer lasts, and we would also
have to go through the code and look for these known cases.