[14/41] drm/i915: Use a radixtree for random access to the object's backing storage

Submitted by Chris Wilson on Oct. 14, 2016, 12:18 p.m.

Details

Message ID 20161014121833.439-15-chris@chris-wilson.co.uk
State New
Series "Series without cover letter" (rev 1) in Intel GFX

Commit Message

Chris Wilson Oct. 14, 2016, 12:18 p.m.
A while ago we switched from a contiguous array of pages into an sglist,
for that was both more convenient for mapping to hardware and avoided
the requirement for a vmalloc array of pages on every object. However,
certain GEM API calls (like pwrite, pread as well as performing
relocations) do desire access to individual struct pages. A quick hack
was to introduce a cache of the last access such that finding the
following page was quick - this works so long as the caller desired
sequential access. Walking backwards, or multiple callers, still hits a
slow linear search for each page. One solution is to store each
successful lookup in a radix tree.

v2: Rewrite building the radixtree for clarity, hopefully.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h         |  69 +++++-------
 drivers/gpu/drm/i915/i915_gem.c         | 179 +++++++++++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_gem_stolen.c  |   4 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c |   4 +-
 4 files changed, 193 insertions(+), 63 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 38467dde1efe..53cf4b0e5359 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2273,9 +2273,12 @@  struct drm_i915_gem_object {
 
 	struct sg_table *pages;
 	int pages_pin_count;
-	struct get_page {
-		struct scatterlist *sg;
-		int last;
+	struct i915_gem_object_page_iter {
+		struct scatterlist *sg_pos;
+		unsigned long sg_idx;
+
+		struct radix_tree_root radix;
+		spinlock_t lock;
 	} get_page;
 	void *mapping;
 
@@ -2478,6 +2481,14 @@  static __always_inline struct sgt_iter {
 	return s;
 }
 
+static inline struct scatterlist *____sg_next(struct scatterlist *sg)
+{
+	++sg;
+	if (unlikely(sg_is_chain(sg)))
+		sg = sg_chain_ptr(sg);
+	return sg;
+}
+
 /**
  * __sg_next - return the next scatterlist entry in a list
  * @sg:		The current sg entry
@@ -2492,9 +2503,7 @@  static inline struct scatterlist *__sg_next(struct scatterlist *sg)
 #ifdef CONFIG_DEBUG_SG
 	BUG_ON(sg->sg_magic != SG_MAGIC);
 #endif
-	return sg_is_last(sg) ? NULL :
-		likely(!sg_is_chain(++sg)) ? sg :
-		sg_chain_ptr(sg);
+	return sg_is_last(sg) ? NULL : ____sg_next(sg);
 }
 
 /**
@@ -3170,45 +3179,21 @@  static inline int __sg_page_count(struct scatterlist *sg)
 	return sg->length >> PAGE_SHIFT;
 }
 
-struct page *
-i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj, int n);
-
-static inline dma_addr_t
-i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj, int n)
-{
-	if (n < obj->get_page.last) {
-		obj->get_page.sg = obj->pages->sgl;
-		obj->get_page.last = 0;
-	}
-
-	while (obj->get_page.last + __sg_page_count(obj->get_page.sg) <= n) {
-		obj->get_page.last += __sg_page_count(obj->get_page.sg++);
-		if (unlikely(sg_is_chain(obj->get_page.sg)))
-			obj->get_page.sg = sg_chain_ptr(obj->get_page.sg);
-	}
-
-	return sg_dma_address(obj->get_page.sg) + ((n - obj->get_page.last) << PAGE_SHIFT);
-}
-
-static inline struct page *
-i915_gem_object_get_page(struct drm_i915_gem_object *obj, int n)
-{
-	if (WARN_ON(n >= obj->base.size >> PAGE_SHIFT))
-		return NULL;
+struct scatterlist *
+i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
+		       unsigned long n, unsigned int *offset);
 
-	if (n < obj->get_page.last) {
-		obj->get_page.sg = obj->pages->sgl;
-		obj->get_page.last = 0;
-	}
+struct page *
+i915_gem_object_get_page(struct drm_i915_gem_object *obj,
+			 unsigned long n);
 
-	while (obj->get_page.last + __sg_page_count(obj->get_page.sg) <= n) {
-		obj->get_page.last += __sg_page_count(obj->get_page.sg++);
-		if (unlikely(sg_is_chain(obj->get_page.sg)))
-			obj->get_page.sg = sg_chain_ptr(obj->get_page.sg);
-	}
+struct page *
+i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
+			       unsigned long n);
 
-	return nth_page(sg_page(obj->get_page.sg), n - obj->get_page.last);
-}
+dma_addr_t
+i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
+				unsigned long n);
 
 static inline void i915_gem_object_pin_pages(struct drm_i915_gem_object *obj)
 {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 95781c63f605..d736beb4d57f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2333,6 +2333,15 @@  i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
 	kfree(obj->pages);
 }
 
+static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
+{
+	struct radix_tree_iter iter;
+	void **slot;
+
+	radix_tree_for_each_slot(slot, &obj->get_page.radix, &iter, 0)
+		radix_tree_delete(&obj->get_page.radix, iter.index);
+}
+
 int
 i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
 {
@@ -2365,6 +2374,8 @@  i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
 		obj->mapping = NULL;
 	}
 
+	__i915_gem_object_reset_page_iter(obj);
+
 	ops->put_pages(obj);
 	obj->pages = NULL;
 
@@ -2534,8 +2545,8 @@  i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
 
 	list_add_tail(&obj->global_list, &dev_priv->mm.unbound_list);
 
-	obj->get_page.sg = obj->pages->sgl;
-	obj->get_page.last = 0;
+	obj->get_page.sg_pos = obj->pages->sgl;
+	obj->get_page.sg_idx = 0;
 
 	return 0;
 }
@@ -4338,6 +4349,8 @@  void i915_gem_object_init(struct drm_i915_gem_object *obj,
 
 	obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
 	obj->madv = I915_MADV_WILLNEED;
+	INIT_RADIX_TREE(&obj->get_page.radix, GFP_ATOMIC | __GFP_NOWARN);
+	spin_lock_init(&obj->get_page.lock);
 
 	i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
 }
@@ -5016,21 +5029,6 @@  void i915_gem_track_fb(struct drm_i915_gem_object *old,
 	}
 }
 
-/* Like i915_gem_object_get_page(), but mark the returned page dirty */
-struct page *
-i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj, int n)
-{
-	struct page *page;
-
-	/* Only default objects have per-page dirty tracking */
-	if (WARN_ON(!i915_gem_object_has_struct_page(obj)))
-		return NULL;
-
-	page = i915_gem_object_get_page(obj, n);
-	set_page_dirty(page);
-	return page;
-}
-
 /* Allocate a new GEM object and fill it with the supplied data */
 struct drm_i915_gem_object *
 i915_gem_object_create_from_data(struct drm_device *dev,
@@ -5071,3 +5069,150 @@  fail:
 	i915_gem_object_put(obj);
 	return ERR_PTR(ret);
 }
+
+struct scatterlist *
+i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
+		       unsigned long n,
+		       unsigned int *offset)
+{
+	struct i915_gem_object_page_iter *iter = &obj->get_page;
+	struct scatterlist *sg;
+	unsigned long idx, count;
+
+	GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
+	GEM_BUG_ON(obj->pages_pin_count == 0);
+
+	/* As we iterate forward through the sg, we record each entry in a
+	 * radixtree for quick repeated (backwards) lookups. If we have seen
+	 * this index previously, we will have an entry for it.
+	 *
+	 * Initial lookup is O(N), but this is amortized to O(1) for
+	 * sequential page access (where each new request is consecutive
+	 * to the previous one). Repeated lookups are O(lg(obj->base.size)),
+	 * i.e. O(1) with a large constant!
+	 */
+	if (n < READ_ONCE(iter->sg_idx))
+		goto lookup;
+
+	spin_lock(&iter->lock);
+
+	sg = iter->sg_pos;
+	idx = iter->sg_idx;
+	count = __sg_page_count(sg);
+
+	while (idx + count < n) {
+		unsigned long exception, i;
+		int ret;
+
+		/* If we cannot allocate and insert this entry, or the
+		 * individual pages from this range, cancel updating the
+		 * sg_idx so that on this lookup we are forced to linearly
+		 * scan onwards, but on future lookups we will try the
+		 * insertion again (in which case we need to be careful of
+		 * the error return reporting that we have already inserted
+		 * this index).
+		 */
+		ret = radix_tree_insert(&iter->radix, idx, sg);
+		if (ret && ret != -EEXIST)
+			goto scan;
+
+		exception =
+			RADIX_TREE_EXCEPTIONAL_ENTRY |
+			idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
+		for (i = 1; i < count; i++) {
+			ret = radix_tree_insert(&iter->radix, idx + i,
+						(void *)exception);
+			if (ret && ret != -EEXIST)
+				goto scan;
+		}
+
+		idx += count;
+		sg = ____sg_next(sg);
+		count = __sg_page_count(sg);
+	}
+
+scan:
+	iter->sg_pos = sg;
+	iter->sg_idx = idx;
+
+	spin_unlock(&iter->lock);
+
+	if (unlikely(n < idx)) /* insertion completed by another thread */
+		goto lookup;
+
+	/* In case we failed to insert the entry into the radixtree, we need
+	 * to look beyond the current sg.
+	 */
+	while (idx + count <= n) {
+		idx += count;
+		sg = ____sg_next(sg);
+		count = __sg_page_count(sg);
+	}
+
+	*offset = n - idx;
+	return sg;
+
+lookup:
+	rcu_read_lock();
+
+	sg = radix_tree_lookup(&iter->radix, n);
+	GEM_BUG_ON(!sg);
+
+	/* If this index is in the middle of multi-page sg entry,
+	 * the radixtree will contain an exceptional entry that points
+	 * to the start of that range. We will return the pointer to
+	 * the base page and the offset of this page within the
+	 * sg entry's range.
+	 */
+	*offset = 0;
+	if (unlikely(radix_tree_exception(sg))) {
+		unsigned long base =
+			(unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
+
+		sg = radix_tree_lookup(&iter->radix, base);
+		GEM_BUG_ON(!sg);
+
+		*offset = n - base;
+	}
+
+	rcu_read_unlock();
+
+	return sg;
+}
+
+struct page *
+i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned long n)
+{
+	struct scatterlist *sg;
+	unsigned int offset;
+
+	GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
+
+	sg = i915_gem_object_get_sg(obj, n, &offset);
+	return nth_page(sg_page(sg), offset);
+}
+
+/* Like i915_gem_object_get_page(), but mark the returned page dirty */
+struct page *
+i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
+			       unsigned long n)
+{
+	struct page *page;
+
+	page = i915_gem_object_get_page(obj, n);
+	if (!obj->dirty)
+		set_page_dirty(page);
+
+	return page;
+}
+
+dma_addr_t
+i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
+				unsigned long n)
+{
+	struct scatterlist *sg;
+	unsigned int offset;
+
+	sg = i915_gem_object_get_sg(obj, n, &offset);
+	return sg_dma_address(sg) + (offset << PAGE_SHIFT);
+}
diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
index f4f6d3a48b05..70e61bc35c60 100644
--- a/drivers/gpu/drm/i915/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
@@ -595,8 +595,8 @@  _i915_gem_object_create_stolen(struct drm_device *dev,
 	if (obj->pages == NULL)
 		goto cleanup;
 
-	obj->get_page.sg = obj->pages->sgl;
-	obj->get_page.last = 0;
+	obj->get_page.sg_pos = obj->pages->sgl;
+	obj->get_page.sg_idx = 0;
 
 	i915_gem_object_pin_pages(obj);
 	obj->stolen = stolen;
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 1c891b92ac80..cb95789da76e 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -526,8 +526,8 @@  __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
 			if (ret == 0) {
 				list_add_tail(&obj->global_list,
 					      &to_i915(dev)->mm.unbound_list);
-				obj->get_page.sg = obj->pages->sgl;
-				obj->get_page.last = 0;
+				obj->get_page.sg_pos = obj->pages->sgl;
+				obj->get_page.sg_idx = 0;
 				pinned = 0;
 			}
 		}
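
As a reference for callers of the new API, here is a minimal sketch (not
part of the patch) of how the helpers might be used; the wrapper function
and the pr_debug() reporting are purely illustrative, and the object's
backing storage is assumed to be already populated and pinned:

/* Illustrative only: random access into an object's backing store via the
 * helpers added by this patch. The first forward walk populates the
 * radixtree, so later (or backwards) lookups avoid the linear sg scan.
 */
static void example_page_lookup(struct drm_i915_gem_object *obj,
				unsigned long n)
{
	struct page *page;
	dma_addr_t addr;

	/* The caller must hold a pages_pin_count reference. */
	page = i915_gem_object_get_page(obj, n);
	addr = i915_gem_object_get_dma_address(obj, n);

	pr_debug("page %lu: pfn %lx, dma %pad\n",
		 n, page_to_pfn(page), &addr);
}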

Comments

On 14/10/2016 13:18, Chris Wilson wrote:
> A while ago we switched from a contiguous array of pages into an sglist,
> for that was both more convenient for mapping to hardware and avoided
> the requirement for a vmalloc array of pages on every object. However,
> certain GEM API calls (like pwrite, pread as well as performing
> relocations) do desire access to individual struct pages. A quick hack
> was to introduce a cache of the last access such that finding the
> following page was quick - this works so long as the caller desired
> sequential access. Walking backwards, or multiple callers, still hits a
> slow linear search for each page. One solution is to store each
> successful lookup in a radix tree.
>
> v2: Rewrite building the radixtree for clarity, hopefully.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.h         |  69 +++++-------
>   drivers/gpu/drm/i915/i915_gem.c         | 179 +++++++++++++++++++++++++++++---
>   drivers/gpu/drm/i915/i915_gem_stolen.c  |   4 +-
>   drivers/gpu/drm/i915/i915_gem_userptr.c |   4 +-
>   4 files changed, 193 insertions(+), 63 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 38467dde1efe..53cf4b0e5359 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2273,9 +2273,12 @@ struct drm_i915_gem_object {
>   
>   	struct sg_table *pages;
>   	int pages_pin_count;
> -	struct get_page {
> -		struct scatterlist *sg;
> -		int last;
> +	struct i915_gem_object_page_iter {
> +		struct scatterlist *sg_pos;
> +		unsigned long sg_idx;

We are not consistent in the code with the type used for the number of
pages in an object. sg_alloc_table even takes an unsigned int for it, so
as long as we build the tables the way we do, unsigned long is a waste.

> +
> +		struct radix_tree_root radix;
> +		spinlock_t lock;
>   	} get_page;
>   	void *mapping;
>   
> @@ -2478,6 +2481,14 @@ static __always_inline struct sgt_iter {
>   	return s;
>   }
>   
> +static inline struct scatterlist *____sg_next(struct scatterlist *sg)
> +{
> +	++sg;
> +	if (unlikely(sg_is_chain(sg)))
> +		sg = sg_chain_ptr(sg);
> +	return sg;
> +}
> +
>   /**
>    * __sg_next - return the next scatterlist entry in a list
>    * @sg:		The current sg entry
> @@ -2492,9 +2503,7 @@ static inline struct scatterlist *__sg_next(struct scatterlist *sg)
>   #ifdef CONFIG_DEBUG_SG
>   	BUG_ON(sg->sg_magic != SG_MAGIC);
>   #endif
> -	return sg_is_last(sg) ? NULL :
> -		likely(!sg_is_chain(++sg)) ? sg :
> -		sg_chain_ptr(sg);
> +	return sg_is_last(sg) ? NULL : ____sg_next(sg);
>   }
>   
>   /**
> @@ -3170,45 +3179,21 @@ static inline int __sg_page_count(struct scatterlist *sg)
>   	return sg->length >> PAGE_SHIFT;
>   }
>   
> -struct page *
> -i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj, int n);
> -
> -static inline dma_addr_t
> -i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj, int n)
> -{
> -	if (n < obj->get_page.last) {
> -		obj->get_page.sg = obj->pages->sgl;
> -		obj->get_page.last = 0;
> -	}
> -
> -	while (obj->get_page.last + __sg_page_count(obj->get_page.sg) <= n) {
> -		obj->get_page.last += __sg_page_count(obj->get_page.sg++);
> -		if (unlikely(sg_is_chain(obj->get_page.sg)))
> -			obj->get_page.sg = sg_chain_ptr(obj->get_page.sg);
> -	}
> -
> -	return sg_dma_address(obj->get_page.sg) + ((n - obj->get_page.last) << PAGE_SHIFT);
> -}
> -
> -static inline struct page *
> -i915_gem_object_get_page(struct drm_i915_gem_object *obj, int n)
> -{
> -	if (WARN_ON(n >= obj->base.size >> PAGE_SHIFT))
> -		return NULL;
> +struct scatterlist *
> +i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
> +		       unsigned long n, unsigned int *offset);
>   
> -	if (n < obj->get_page.last) {
> -		obj->get_page.sg = obj->pages->sgl;
> -		obj->get_page.last = 0;
> -	}
> +struct page *
> +i915_gem_object_get_page(struct drm_i915_gem_object *obj,
> +			 unsigned long n);
>   
> -	while (obj->get_page.last + __sg_page_count(obj->get_page.sg) <= n) {
> -		obj->get_page.last += __sg_page_count(obj->get_page.sg++);
> -		if (unlikely(sg_is_chain(obj->get_page.sg)))
> -			obj->get_page.sg = sg_chain_ptr(obj->get_page.sg);
> -	}
> +struct page *
> +i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
> +			       unsigned long n);
>   
> -	return nth_page(sg_page(obj->get_page.sg), n - obj->get_page.last);
> -}
> +dma_addr_t
> +i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
> +				unsigned long n);
>   
>   static inline void i915_gem_object_pin_pages(struct drm_i915_gem_object *obj)
>   {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 95781c63f605..d736beb4d57f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2333,6 +2333,15 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
>   	kfree(obj->pages);
>   }
>   
> +static void __i915_gem_object_reset_page_iter(struct drm_i915_gem_object *obj)
> +{
> +	struct radix_tree_iter iter;
> +	void **slot;
> +
> +	radix_tree_for_each_slot(slot, &obj->get_page.radix, &iter, 0)
> +		radix_tree_delete(&obj->get_page.radix, iter.index);
> +}
> +
>   int
>   i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
>   {
> @@ -2365,6 +2374,8 @@ i915_gem_object_put_pages(struct drm_i915_gem_object *obj)
>   		obj->mapping = NULL;
>   	}
>   
> +	__i915_gem_object_reset_page_iter(obj);
> +
>   	ops->put_pages(obj);
>   	obj->pages = NULL;
>   
> @@ -2534,8 +2545,8 @@ i915_gem_object_get_pages(struct drm_i915_gem_object *obj)
>   
>   	list_add_tail(&obj->global_list, &dev_priv->mm.unbound_list);
>   
> -	obj->get_page.sg = obj->pages->sgl;
> -	obj->get_page.last = 0;
> +	obj->get_page.sg_pos = obj->pages->sgl;
> +	obj->get_page.sg_idx = 0;
>   
>   	return 0;
>   }
> @@ -4338,6 +4349,8 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
>   
>   	obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
>   	obj->madv = I915_MADV_WILLNEED;
> +	INIT_RADIX_TREE(&obj->get_page.radix, GFP_ATOMIC | __GFP_NOWARN);

Pros & cons of GFP_ATOMIC here? Versus perhaps going with the mutex? I
don't know how much data the radix tree allocates with this, per node,
but we can have a lot of pages. Would this create extra pressure on slab
shrinking, and in turn on our objects?

> +	spin_lock_init(&obj->get_page.lock);
>   
>   	i915_gem_info_add_obj(to_i915(obj->base.dev), obj->base.size);
>   }
> @@ -5016,21 +5029,6 @@ void i915_gem_track_fb(struct drm_i915_gem_object *old,
>   	}
>   }
>   
> -/* Like i915_gem_object_get_page(), but mark the returned page dirty */
> -struct page *
> -i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj, int n)
> -{
> -	struct page *page;
> -
> -	/* Only default objects have per-page dirty tracking */
> -	if (WARN_ON(!i915_gem_object_has_struct_page(obj)))
> -		return NULL;
> -
> -	page = i915_gem_object_get_page(obj, n);
> -	set_page_dirty(page);
> -	return page;
> -}
> -
>   /* Allocate a new GEM object and fill it with the supplied data */
>   struct drm_i915_gem_object *
>   i915_gem_object_create_from_data(struct drm_device *dev,
> @@ -5071,3 +5069,150 @@ fail:
>   	i915_gem_object_put(obj);
>   	return ERR_PTR(ret);
>   }
> +
> +struct scatterlist *
> +i915_gem_object_get_sg(struct drm_i915_gem_object *obj,
> +		       unsigned long n,
Same unsigned long vs unsigned int as above.

> +		       unsigned int *offset)
> +{
> +	struct i915_gem_object_page_iter *iter = &obj->get_page;
> +	struct scatterlist *sg;
> +	unsigned long idx, count;
> +
> +	GEM_BUG_ON(n >= obj->base.size >> PAGE_SHIFT);
> +	GEM_BUG_ON(obj->pages_pin_count == 0);
> +
> +	/* As we iterate forward through the sg, we record each entry in a
> +	 * radixtree for quick repeated (backwards) lookups. If we have seen
> +	 * this index previously, we will have an entry for it.
> +	 *
> +	 * Initial lookup is O(N), but this is amortized to O(1) for
> +	 * sequential page access (where each new request is consecutive
> +	 * to the previous one). Repeated lookups are O(lg(obj->base.size)),
> +	 * i.e. O(1) with a large constant!
> +	 */
> +	if (n < READ_ONCE(iter->sg_idx))
> +		goto lookup;
> +
> +	spin_lock(&iter->lock);
> +
> +	sg = iter->sg_pos;
> +	idx = iter->sg_idx;
> +	count = __sg_page_count(sg);
> +
> +	while (idx + count < n) {
> +		unsigned long exception, i;
> +		int ret;
> +
> +		/* If we cannot allocate and insert this entry, or the
> +		 * individual pages from this range, cancel updating the
> +		 * sg_idx so that on this lookup we are forced to linearly
> +		 * scan onwards, but on future lookups we will try the
> +		 * insertion again (in which case we need to be careful of
> +		 * the error return reporting that we have already inserted
> +		 * this index).
> +		 */
> +		ret = radix_tree_insert(&iter->radix, idx, sg);
> +		if (ret && ret != -EEXIST)
> +			goto scan;

What other error can we get here? Internal allocation failure?

> +
> +		exception =
> +			RADIX_TREE_EXCEPTIONAL_ENTRY |
> +			idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
> +		for (i = 1; i < count; i++) {
> +			ret = radix_tree_insert(&iter->radix, idx + i,
> +						(void *)exception);
> +			if (ret && ret != -EEXIST)
> +				goto scan;
> +		}
> +
> +		idx += count;
> +		sg = ____sg_next(sg);
> +		count = __sg_page_count(sg);
> +	}
> +
> +scan:
> +	iter->sg_pos = sg;
> +	iter->sg_idx = idx;
> +
> +	spin_unlock(&iter->lock);
> +
> +	if (unlikely(n < idx)) /* insertion completed by another thread */
> +		goto lookup;
> +
> +	/* In case we failed to insert the entry into the radixtree, we need
> +	 * to look beyond the current sg.
> +	 */
> +	while (idx + count <= n) {
> +		idx += count;
> +		sg = ____sg_next(sg);
> +		count = __sg_page_count(sg);
> +	}
> +

Hmm... I assume failures happen then, since you added this fallback. Due
to GFP_ATOMIC?

> +	*offset = n - idx;
> +	return sg;
> +
> +lookup:
> +	rcu_read_lock();
> +

Property of the radix tree implementation that the RCU lock is sufficient?

> +	sg = radix_tree_lookup(&iter->radix, n);
> +	GEM_BUG_ON(!sg);
> +
> +	/* If this index is in the middle of multi-page sg entry,
> +	 * the radixtree will contain an exceptional entry that points
> +	 * to the start of that range. We will return the pointer to
> +	 * the base page and the offset of this page within the
> +	 * sg entry's range.
> +	 */
> +	*offset = 0;
> +	if (unlikely(radix_tree_exception(sg))) {
> +		unsigned long base =
> +			(unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
> +
> +		sg = radix_tree_lookup(&iter->radix, base);
> +		GEM_BUG_ON(!sg);
> +
> +		*offset = n - base;
> +	}
> +
> +	rcu_read_unlock();
> +
> +	return sg;
> +}
> +
> +struct page *
> +i915_gem_object_get_page(struct drm_i915_gem_object *obj, unsigned long n)
> +{
> +	struct scatterlist *sg;
> +	unsigned int offset;
> +
> +	GEM_BUG_ON(!i915_gem_object_has_struct_page(obj));
> +
> +	sg = i915_gem_object_get_sg(obj, n, &offset);
> +	return nth_page(sg_page(sg), offset);
> +}
> +
> +/* Like i915_gem_object_get_page(), but mark the returned page dirty */
> +struct page *
> +i915_gem_object_get_dirty_page(struct drm_i915_gem_object *obj,
> +			       unsigned long n)
> +{
> +	struct page *page;
> +
> +	page = i915_gem_object_get_page(obj, n);
> +	if (!obj->dirty)
> +		set_page_dirty(page);
> +
> +	return page;
> +}
> +
> +dma_addr_t
> +i915_gem_object_get_dma_address(struct drm_i915_gem_object *obj,
> +				unsigned long n)
> +{
> +	struct scatterlist *sg;
> +	unsigned int offset;
> +
> +	sg = i915_gem_object_get_sg(obj, n, &offset);
> +	return sg_dma_address(sg) + (offset << PAGE_SHIFT);
> +}
> diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
> index f4f6d3a48b05..70e61bc35c60 100644
> --- a/drivers/gpu/drm/i915/i915_gem_stolen.c
> +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
> @@ -595,8 +595,8 @@ _i915_gem_object_create_stolen(struct drm_device *dev,
>   	if (obj->pages == NULL)
>   		goto cleanup;
>   
> -	obj->get_page.sg = obj->pages->sgl;
> -	obj->get_page.last = 0;
> +	obj->get_page.sg_pos = obj->pages->sgl;
> +	obj->get_page.sg_idx = 0;
>   
>   	i915_gem_object_pin_pages(obj);
>   	obj->stolen = stolen;
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 1c891b92ac80..cb95789da76e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -526,8 +526,8 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
>   			if (ret == 0) {
>   				list_add_tail(&obj->global_list,
>   					      &to_i915(dev)->mm.unbound_list);
> -				obj->get_page.sg = obj->pages->sgl;
> -				obj->get_page.last = 0;
> +				obj->get_page.sg_pos = obj->pages->sgl;
> +				obj->get_page.sg_idx = 0;

Almost like these ones would be better in a helper like 
i915_gem_object_init_page_lookup or something.

>   				pinned = 0;
>   			}
>   		}

Regards,

Tvrtko

On Fri, Oct 14, 2016 at 02:32:03PM +0100, Tvrtko Ursulin wrote:
> 
> On 14/10/2016 13:18, Chris Wilson wrote:
> >A while ago we switched from a contiguous array of pages into an sglist,
> >for that was both more convenient for mapping to hardware and avoided
> >the requirement for a vmalloc array of pages on every object. However,
> >certain GEM API calls (like pwrite, pread as well as performing
> >relocations) do desire access to individual struct pages. A quick hack
> >was to introduce a cache of the last access such that finding the
> >following page was quick - this works so long as the caller desired
> >sequential access. Walking backwards, or multiple callers, still hits a
> >slow linear search for each page. One solution is to store each
> >successful lookup in a radix tree.
> >
> >v2: Rewrite building the radixtree for clarity, hopefully.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/i915_drv.h         |  69 +++++-------
> >  drivers/gpu/drm/i915/i915_gem.c         | 179 +++++++++++++++++++++++++++++---
> >  drivers/gpu/drm/i915/i915_gem_stolen.c  |   4 +-
> >  drivers/gpu/drm/i915/i915_gem_userptr.c |   4 +-
> >  4 files changed, 193 insertions(+), 63 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >index 38467dde1efe..53cf4b0e5359 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.h
> >+++ b/drivers/gpu/drm/i915/i915_drv.h
> >@@ -2273,9 +2273,12 @@ struct drm_i915_gem_object {
> >  	struct sg_table *pages;
> >  	int pages_pin_count;
> >-	struct get_page {
> >-		struct scatterlist *sg;
> >-		int last;
> >+	struct i915_gem_object_page_iter {
> >+		struct scatterlist *sg_pos;
> >+		unsigned long sg_idx;
> 
> We are not consistent in the code with type used for number of pages
> in an object. sg_alloc_table even takes an unsigned int for it, so
> for as long as we build them as we do, unsigned long is a waste.

I know. It's worrying, today there is a possibility that we overflow a
32bit size. If this was counting in pages, we would have a few more
years of grace. All I can say is that we are fortunate that memory
remains expensive in the exabyte range.

> >@@ -4338,6 +4349,8 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
> >  	obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
> >  	obj->madv = I915_MADV_WILLNEED;
> >+	INIT_RADIX_TREE(&obj->get_page.radix, GFP_ATOMIC | __GFP_NOWARN);
> 
> Pros & cons of GFP_ATOMIC here? Versus perhaps going with the mutex?
> I don't know how much data radix tree allocates with this, per node,
> but we can have a lot of pages. Would this create extra pressure on
> slab shrinking, and in turn out objects?

The problem is that we require sg lookup on a !pagefault path, hence
mutexes and GFP_KERNEL turn out to be illegal. :|

> >+		/* If we cannot allocate and insert this entry, or the
> >+		 * individual pages from this range, cancel updating the
> >+		 * sg_idx so that on this lookup we are forced to linearly
> >+		 * scan onwards, but on future lookups we will try the
> >+		 * insertion again (in which case we need to be careful of
> >+		 * the error return reporting that we have already inserted
> >+		 * this index).
> >+		 */
> >+		ret = radix_tree_insert(&iter->radix, idx, sg);
> >+		if (ret && ret != -EEXIST)
> >+			goto scan;
> 
> What other error can we get here? Internal allocation failure?

Yes. ENOMEM is the only other error.
> 
> >+
> >+		exception =
> >+			RADIX_TREE_EXCEPTIONAL_ENTRY |
> >+			idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
> >+		for (i = 1; i < count; i++) {
> >+			ret = radix_tree_insert(&iter->radix, idx + i,
> >+						(void *)exception);
> >+			if (ret && ret != -EEXIST)
> >+				goto scan;
> >+		}
> >+
> >+		idx += count;
> >+		sg = ____sg_next(sg);
> >+		count = __sg_page_count(sg);
> >+	}
> >+
> >+scan:
> >+	iter->sg_pos = sg;
> >+	iter->sg_idx = idx;
> >+
> >+	spin_unlock(&iter->lock);
> >+
> >+	if (unlikely(n < idx)) /* insertion completed by another thread */
> >+		goto lookup;
> >+
> >+	/* In case we failed to insert the entry into the radixtree, we need
> >+	 * to look beyond the current sg.
> >+	 */
> >+	while (idx + count <= n) {
> >+		idx += count;
> >+		sg = ____sg_next(sg);
> >+		count = __sg_page_count(sg);
> >+	}
> >+
> 
> Hmm... I assume failures happen then since you added this fallback.
> Due GFP_ATOMIC?

No, this was always in the code, because malloc failures happen. Quite
often if you run igt ;)
 
> >+	*offset = n - idx;
> >+	return sg;
> >+
> >+lookup:
> >+	rcu_read_lock();
> >+
> 
> Property of the radix tree implementation that the RCU lock is sufficient?

Yes. Lookups are RCU safe (the slots are freed via RCU). Writers must be
serialised by the caller.
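
For reference, a reduced sketch of the locking scheme described above (the
field names are those from the patch; the two wrapper functions are
illustrative only, and exceptional-entry handling is omitted):

/* Readers only take rcu_read_lock(): radixtree slots are freed via RCU,
 * and the scatterlist itself is kept alive by the pinned pages.
 */
static struct scatterlist *
cached_lookup(struct i915_gem_object_page_iter *iter, unsigned long n)
{
	struct scatterlist *sg;

	rcu_read_lock();
	sg = radix_tree_lookup(&iter->radix, n);
	rcu_read_unlock();

	return sg;
}

/* Writers (filling the cache) are serialised by the iterator's spinlock. */
static int cache_insert(struct i915_gem_object_page_iter *iter,
			unsigned long idx, struct scatterlist *sg)
{
	int ret;

	spin_lock(&iter->lock);
	ret = radix_tree_insert(&iter->radix, idx, sg);
	spin_unlock(&iter->lock);

	return ret; /* 0, -EEXIST or -ENOMEM */
}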

> >diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
> >index f4f6d3a48b05..70e61bc35c60 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_stolen.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
> >@@ -595,8 +595,8 @@ _i915_gem_object_create_stolen(struct drm_device *dev,
> >  	if (obj->pages == NULL)
> >  		goto cleanup;
> >-	obj->get_page.sg = obj->pages->sgl;
> >-	obj->get_page.last = 0;
> >+	obj->get_page.sg_pos = obj->pages->sgl;
> >+	obj->get_page.sg_idx = 0;
> >  	i915_gem_object_pin_pages(obj);
> >  	obj->stolen = stolen;
> >diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> >index 1c891b92ac80..cb95789da76e 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> >@@ -526,8 +526,8 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
> >  			if (ret == 0) {
> >  				list_add_tail(&obj->global_list,
> >  					      &to_i915(dev)->mm.unbound_list);
> >-				obj->get_page.sg = obj->pages->sgl;
> >-				obj->get_page.last = 0;
> >+				obj->get_page.sg_pos = obj->pages->sgl;
> >+				obj->get_page.sg_idx = 0;
> 
> Almost like these ones would be better in a helper like
> i915_gem_object_init_page_lookup or something.

Yup.  I think I have a patch for that, with your r-b on it :)
-Chris

On 14/10/2016 15:07, Chris Wilson wrote:
> On Fri, Oct 14, 2016 at 02:32:03PM +0100, Tvrtko Ursulin wrote:
>> On 14/10/2016 13:18, Chris Wilson wrote:
>>> A while ago we switched from a contiguous array of pages into an sglist,
>>> for that was both more convenient for mapping to hardware and avoided
>>> the requirement for a vmalloc array of pages on every object. However,
>>> certain GEM API calls (like pwrite, pread as well as performing
>>> relocations) do desire access to individual struct pages. A quick hack
>>> was to introduce a cache of the last access such that finding the
>>> following page was quick - this works so long as the caller desired
>>> sequential access. Walking backwards, or multiple callers, still hits a
>>> slow linear search for each page. One solution is to store each
>>> successful lookup in a radix tree.
>>>
>>> v2: Rewrite building the radixtree for clarity, hopefully.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>   drivers/gpu/drm/i915/i915_drv.h         |  69 +++++-------
>>>   drivers/gpu/drm/i915/i915_gem.c         | 179 +++++++++++++++++++++++++++++---
>>>   drivers/gpu/drm/i915/i915_gem_stolen.c  |   4 +-
>>>   drivers/gpu/drm/i915/i915_gem_userptr.c |   4 +-
>>>   4 files changed, 193 insertions(+), 63 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>> index 38467dde1efe..53cf4b0e5359 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -2273,9 +2273,12 @@ struct drm_i915_gem_object {
>>>   	struct sg_table *pages;
>>>   	int pages_pin_count;
>>> -	struct get_page {
>>> -		struct scatterlist *sg;
>>> -		int last;
>>> +	struct i915_gem_object_page_iter {
>>> +		struct scatterlist *sg_pos;
>>> +		unsigned long sg_idx;
>> We are not consistent in the code with type used for number of pages
>> in an object. sg_alloc_table even takes an unsigned int for it, so
>> for as long as we build them as we do, unsigned long is a waste.
> I know. It's worrying, today there is a possibility that we overflow a
> 32bit size. If this was counting in pages, we would have a few more
> years of grace. All I can say is that we are fortunate that memory
> remains expensive in the exabyte range.
>
>>> @@ -4338,6 +4349,8 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
>>>   	obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
>>>   	obj->madv = I915_MADV_WILLNEED;
>>> +	INIT_RADIX_TREE(&obj->get_page.radix, GFP_ATOMIC | __GFP_NOWARN);
>> Pros & cons of GFP_ATOMIC here? Versus perhaps going with the mutex?
>> I don't know how much data radix tree allocates with this, per node,
>> but we can have a lot of pages. Would this create extra pressure on
>> slab shrinking, and in turn out objects?
> The problem is that we require sg lookup on a !pagefault path, hence
> mutexes and GFP_KERNEL turn out to be illegal. :|

Bummer. I don't know enough about the atomic reserve to judge how bad
this might be, because after this work userspace could easily drain it
just by pread/pwrite on large objects.

>>> +		/* If we cannot allocate and insert this entry, or the
>>> +		 * individual pages from this range, cancel updating the
>>> +		 * sg_idx so that on this lookup we are forced to linearly
>>> +		 * scan onwards, but on future lookups we will try the
>>> +		 * insertion again (in which case we need to be careful of
>>> +		 * the error return reporting that we have already inserted
>>> +		 * this index).
>>> +		 */
>>> +		ret = radix_tree_insert(&iter->radix, idx, sg);
>>> +		if (ret && ret != -EEXIST)
>>> +			goto scan;
>> What other error can we get here? Internal allocation failure?
> Yes. ENOMEM is the only other error.
>>> +
>>> +		exception =
>>> +			RADIX_TREE_EXCEPTIONAL_ENTRY |
>>> +			idx << RADIX_TREE_EXCEPTIONAL_SHIFT;
>>> +		for (i = 1; i < count; i++) {
>>> +			ret = radix_tree_insert(&iter->radix, idx + i,
>>> +						(void *)exception);
>>> +			if (ret && ret != -EEXIST)
>>> +				goto scan;
>>> +		}
>>> +
>>> +		idx += count;
>>> +		sg = ____sg_next(sg);
>>> +		count = __sg_page_count(sg);
>>> +	}
>>> +
>>> +scan:
>>> +	iter->sg_pos = sg;
>>> +	iter->sg_idx = idx;
>>> +
>>> +	spin_unlock(&iter->lock);
>>> +
>>> +	if (unlikely(n < idx)) /* insertion completed by another thread */
>>> +		goto lookup;
>>> +
>>> +	/* In case we failed to insert the entry into the radixtree, we need
>>> +	 * to look beyond the current sg.
>>> +	 */
>>> +	while (idx + count <= n) {
>>> +		idx += count;
>>> +		sg = ____sg_next(sg);
>>> +		count = __sg_page_count(sg);
>>> +	}
>>> +
>> Hmm... I assume failures happen then since you added this fallback.
>> Due GFP_ATOMIC?
> No, this was always in the code, because malloc failures happen. Quite
> often if you run igt ;)
>   

But GFP_ATOMIC failures primarily, yes?

It is a bit unfortunate that with this fancy lookup we can easily and 
silently fall back to linear lookup and lose the performance benefit. Do 
you know how often this happens? Should we have some stats collected and 
exported via debugfs to evaluate on realistically busy/loaded systems?

>>> +	*offset = n - idx;
>>> +	return sg;
>>> +
>>> +lookup:
>>> +	rcu_read_lock();
>>> +
>> Property of the radix tree implementation that the RCU lock is sufficient?
> Yes. Lookups are RCU safe (the slots are freed via RCU). Writers must be
> serialised by the caller.
>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
>>> index f4f6d3a48b05..70e61bc35c60 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_stolen.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
>>> @@ -595,8 +595,8 @@ _i915_gem_object_create_stolen(struct drm_device *dev,
>>>   	if (obj->pages == NULL)
>>>   		goto cleanup;
>>> -	obj->get_page.sg = obj->pages->sgl;
>>> -	obj->get_page.last = 0;
>>> +	obj->get_page.sg_pos = obj->pages->sgl;
>>> +	obj->get_page.sg_idx = 0;
>>>   	i915_gem_object_pin_pages(obj);
>>>   	obj->stolen = stolen;
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
>>> index 1c891b92ac80..cb95789da76e 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
>>> @@ -526,8 +526,8 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
>>>   			if (ret == 0) {
>>>   				list_add_tail(&obj->global_list,
>>>   					      &to_i915(dev)->mm.unbound_list);
>>> -				obj->get_page.sg = obj->pages->sgl;
>>> -				obj->get_page.last = 0;
>>> +				obj->get_page.sg_pos = obj->pages->sgl;
>>> +				obj->get_page.sg_idx = 0;
>> Almost like these ones would be better in a helper like
>> i915_gem_object_init_page_lookup or something.
> Yup.  I think I have a patch for that, with your r-b on it :)

Ok then. :)

Regards,

Tvrtko

On Mon, Oct 17, 2016 at 10:56:27AM +0100, Tvrtko Ursulin wrote:
> 
> On 14/10/2016 15:07, Chris Wilson wrote:
> >On Fri, Oct 14, 2016 at 02:32:03PM +0100, Tvrtko Ursulin wrote:
> >>On 14/10/2016 13:18, Chris Wilson wrote:
> >>>A while ago we switched from a contiguous array of pages into an sglist,
> >>>for that was both more convenient for mapping to hardware and avoided
> >>>the requirement for a vmalloc array of pages on every object. However,
> >>>certain GEM API calls (like pwrite, pread as well as performing
> >>>relocations) do desire access to individual struct pages. A quick hack
> >>>was to introduce a cache of the last access such that finding the
> >>>following page was quick - this works so long as the caller desired
> >>>sequential access. Walking backwards, or multiple callers, still hits a
> >>>slow linear search for each page. One solution is to store each
> >>>successful lookup in a radix tree.
> >>>
> >>>v2: Rewrite building the radixtree for clarity, hopefully.
> >>>
> >>>Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>---
> >>>  drivers/gpu/drm/i915/i915_drv.h         |  69 +++++-------
> >>>  drivers/gpu/drm/i915/i915_gem.c         | 179 +++++++++++++++++++++++++++++---
> >>>  drivers/gpu/drm/i915/i915_gem_stolen.c  |   4 +-
> >>>  drivers/gpu/drm/i915/i915_gem_userptr.c |   4 +-
> >>>  4 files changed, 193 insertions(+), 63 deletions(-)
> >>>
> >>>diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >>>index 38467dde1efe..53cf4b0e5359 100644
> >>>--- a/drivers/gpu/drm/i915/i915_drv.h
> >>>+++ b/drivers/gpu/drm/i915/i915_drv.h
> >>>@@ -2273,9 +2273,12 @@ struct drm_i915_gem_object {
> >>>  	struct sg_table *pages;
> >>>  	int pages_pin_count;
> >>>-	struct get_page {
> >>>-		struct scatterlist *sg;
> >>>-		int last;
> >>>+	struct i915_gem_object_page_iter {
> >>>+		struct scatterlist *sg_pos;
> >>>+		unsigned long sg_idx;
> >>We are not consistent in the code with type used for number of pages
> >>in an object. sg_alloc_table even takes an unsigned int for it, so
> >>for as long as we build them as we do, unsigned long is a waste.
> >I know. It's worrying, today there is a possibility that we overflow a
> >32bit size. If this was counting in pages, we would have a few more
> >years of grace. All I can say is that we are fortunate that memory
> >remains expensive in the exabyte range.
> >
> >>>@@ -4338,6 +4349,8 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
> >>>  	obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
> >>>  	obj->madv = I915_MADV_WILLNEED;
> >>>+	INIT_RADIX_TREE(&obj->get_page.radix, GFP_ATOMIC | __GFP_NOWARN);
> >>Pros & cons of GFP_ATOMIC here? Versus perhaps going with the mutex?
> >>I don't know how much data radix tree allocates with this, per node,
> >>but we can have a lot of pages. Would this create extra pressure on
> >>slab shrinking, and in turn out objects?
> >The problem is that we require sg lookup on a !pagefault path, hence
> >mutexes and GFP_KERNEL turn out to be illegal. :|
> 
> Bummer.  I don't know enough about the atomic reserve to judge how
> bad this might be then. Because userspace could drain it easily
> after this work by just pread/pwrite on large objects.

Yes. The pathological case of touching the last page (and only the last
page) will cause a large number of allocations (one allocation for every
512 pages in the object, and so on recursively - a 64MiB object will
require 32 + 1 atomic page allocations, aiui). Not that bad really.
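
Spelling that estimate out as a quick back-of-envelope check (assuming
4 KiB pages and the roughly one-allocation-per-512-pages figure above):

	64 MiB / 4 KiB per page           = 16384 pages
	16384 pages / 512 pages per node  =    32 leaf allocations
	plus ~1 more allocation for the interior levels  =>  32 + 1 total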

> >>Hmm... I assume failures happen then since you added this fallback.
> >>Due GFP_ATOMIC?
> >No, this was always in the code, because malloc failures happen. Quite
> >often if you run igt ;)
> 
> But GFP_ATOMIC failures primarily, yes?

GFP_ATOMIC certainly increases the likelihood of failure (by removing
the direct reclaim and sleep paths), but on the other hand we do get a
reserve pool.
 
> It is a bit unfortunate that with this fancy lookup we can easily
> and silently fall back to linear lookup and lose the performance
> benefit. Do you know how often this happens? Should we have some
> stats collected and exported via debugfs to evaluate on
> realistically busy/loaded systems?

Hmm, in my head I had it wrongly sketched out as basically falling back
to the old behaviour after allocation failure. Ok, if I store the last
successful allocation (currently sg_idx-1) and the current scan position
separately, we can keep the old-style scheme in place on allocation
failure and so not degrade performance in the worst case.

I wasn't planning on seeing failures outside of igt/gem_shrink (with the
exception of people running firefox or chrome!)
-Chris