[v2,17/18] drm/i915: Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset

Submitted by Michel Thierry on June 10, 2015, 4:46 p.m.

Details

Message ID 1433954816-13787-18-git-send-email-michel.thierry@intel.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Michel Thierry June 10, 2015, 4:46 p.m.
There are some allocations that must be only referenced by 32bit
offsets. To limit the chances of having the first 4GB already full,
objects not requiring this workaround use DRM_MM_SEARCH_BELOW/
DRM_MM_CREATE_TOP flags

User must pass I915_EXEC_SUPPORTS_48BADDRESS flag to indicate it can
be allocated above the 32b address range.

The flag is ignored in 32b PPGTT.

v2: Changed flag logic from neeeds_32b, to supports_48b.

Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h            |  1 +
 drivers/gpu/drm/i915/i915_gem.c            | 19 ++++++++++++++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 36 +++++++++++++++++++++---------
 include/uapi/drm/i915_drm.h                |  4 +++-
 4 files changed, 47 insertions(+), 13 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 95f59d3..d73dddb 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2714,6 +2714,7 @@  void i915_gem_vma_destroy(struct i915_vma *vma);
 #define PIN_OFFSET_BIAS	(1<<3)
 #define PIN_USER	(1<<4)
 #define PIN_UPDATE	(1<<5)
+#define PIN_FULL_RANGE	(1<<6)
 #define PIN_OFFSET_MASK (~4095)
 int __must_check
 i915_gem_object_pin(struct drm_i915_gem_object *obj,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 35690ef..e6325a4a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3672,6 +3672,8 @@  i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 fence_alignment, unfenced_alignment;
 	u64 size, fence_size;
+	u32 search_flag = DRM_MM_SEARCH_DEFAULT;
+	u32 alloc_flag = DRM_MM_CREATE_DEFAULT;
 	u64 start =
 		flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0;
 	u64 end =
@@ -3713,6 +3715,19 @@  i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
 						   obj->tiling_mode,
 						   false);
 		size = flags & PIN_MAPPABLE ? fence_size : obj->base.size;
+
+		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
+		 * limit address to 4GB-1 for objects requiring this wa; for
+		 * others, set alloc flag to TOP.
+		 */
+		if (USES_FULL_48BIT_PPGTT(dev)) {
+			if (flags & PIN_FULL_RANGE) {
+				search_flag = DRM_MM_SEARCH_BELOW;
+				alloc_flag = DRM_MM_CREATE_TOP;
+			} else {
+				end = ((4ULL << GEN8_PDPE_SHIFT) - 1);
+			}
+		}
 	}
 
 	if (alignment == 0)
@@ -3755,8 +3770,8 @@  search_free:
 						  size, alignment,
 						  obj->cache_level,
 						  start, end,
-						  DRM_MM_SEARCH_DEFAULT,
-						  DRM_MM_CREATE_DEFAULT);
+						  search_flag,
+						  alloc_flag);
 	if (ret) {
 		ret = i915_gem_evict_something(dev, vm, size, alignment,
 					       obj->cache_level,
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index bd0e4bd..04af62b 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -577,7 +577,8 @@  static bool only_mappable_for_reloc(unsigned int flags)
 static int
 i915_gem_execbuffer_reserve_vma(struct i915_vma *vma,
 				struct intel_engine_cs *ring,
-				bool *need_reloc)
+				bool *need_reloc,
+				bool support_48baddr)
 {
 	struct drm_i915_gem_object *obj = vma->obj;
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
@@ -588,6 +589,9 @@  i915_gem_execbuffer_reserve_vma(struct i915_vma *vma,
 	if (entry->flags & EXEC_OBJECT_NEEDS_GTT)
 		flags |= PIN_GLOBAL;
 
+	if (support_48baddr)
+		flags |= PIN_FULL_RANGE;
+
 	if (!drm_mm_node_allocated(&vma->node)) {
 		if (entry->flags & __EXEC_OBJECT_NEEDS_MAP)
 			flags |= PIN_GLOBAL | PIN_MAPPABLE;
@@ -650,7 +654,7 @@  need_reloc_mappable(struct i915_vma *vma)
 }
 
 static bool
-eb_vma_misplaced(struct i915_vma *vma)
+eb_vma_misplaced(struct i915_vma *vma, bool support_48baddr)
 {
 	struct drm_i915_gem_exec_object2 *entry = vma->exec_entry;
 	struct drm_i915_gem_object *obj = vma->obj;
@@ -670,13 +674,18 @@  eb_vma_misplaced(struct i915_vma *vma)
 	if (entry->flags & __EXEC_OBJECT_NEEDS_MAP && !obj->map_and_fenceable)
 		return !only_mappable_for_reloc(entry->flags);
 
+	if (!support_48baddr &&
+	    vma->node.start >= (1ULL << 32))
+		return true;
+
 	return false;
 }
 
 static int
 i915_gem_execbuffer_reserve(struct intel_engine_cs *ring,
 			    struct list_head *vmas,
-			    bool *need_relocs)
+			    bool *need_relocs,
+			    bool support_48baddr)
 {
 	struct drm_i915_gem_object *obj;
 	struct i915_vma *vma;
@@ -737,10 +746,11 @@  i915_gem_execbuffer_reserve(struct intel_engine_cs *ring,
 			if (!drm_mm_node_allocated(&vma->node))
 				continue;
 
-			if (eb_vma_misplaced(vma))
+			if (eb_vma_misplaced(vma, support_48baddr))
 				ret = i915_vma_unbind(vma);
 			else
-				ret = i915_gem_execbuffer_reserve_vma(vma, ring, need_relocs);
+				ret = i915_gem_execbuffer_reserve_vma(vma, ring, need_relocs,
+								      support_48baddr);
 			if (ret)
 				goto err;
 		}
@@ -750,7 +760,9 @@  i915_gem_execbuffer_reserve(struct intel_engine_cs *ring,
 			if (drm_mm_node_allocated(&vma->node))
 				continue;
 
-			ret = i915_gem_execbuffer_reserve_vma(vma, ring, need_relocs);
+			ret = i915_gem_execbuffer_reserve_vma(vma, ring,
+							      need_relocs,
+							      support_48baddr);
 			if (ret)
 				goto err;
 		}
@@ -780,7 +792,7 @@  i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 	struct drm_i915_gem_relocation_entry *reloc;
 	struct i915_address_space *vm;
 	struct i915_vma *vma;
-	bool need_relocs;
+	bool need_relocs, support_48baddr;
 	int *reloc_offset;
 	int i, total, ret;
 	unsigned count = args->buffer_count;
@@ -861,7 +873,9 @@  i915_gem_execbuffer_relocate_slow(struct drm_device *dev,
 		goto err;
 
 	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
-	ret = i915_gem_execbuffer_reserve(ring, &eb->vmas, &need_relocs);
+	support_48baddr = (args->flags & I915_EXEC_SUPPORT_48BADDRESS);
+	ret = i915_gem_execbuffer_reserve(ring, &eb->vmas, &need_relocs,
+					  support_48baddr);
 	if (ret)
 		goto err;
 
@@ -1411,7 +1425,7 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	u64 exec_start = args->batch_start_offset;
 	u32 dispatch_flags;
 	int ret;
-	bool need_relocs;
+	bool need_relocs, support_48baddr;
 
 	if (!i915_gem_check_execbuffer(args))
 		return -EINVAL;
@@ -1519,7 +1533,9 @@  i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 
 	/* Move the objects en-masse into the GTT, evicting if necessary. */
 	need_relocs = (args->flags & I915_EXEC_NO_RELOC) == 0;
-	ret = i915_gem_execbuffer_reserve(ring, &eb->vmas, &need_relocs);
+	support_48baddr = (args->flags & I915_EXEC_SUPPORT_48BADDRESS);
+	ret = i915_gem_execbuffer_reserve(ring, &eb->vmas, &need_relocs,
+					  support_48baddr);
 	if (ret)
 		goto err;
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 4851d66..18df34a 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -760,7 +760,9 @@  struct drm_i915_gem_execbuffer2 {
 #define I915_EXEC_BSD_RING1		(1<<13)
 #define I915_EXEC_BSD_RING2		(2<<13)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(1<<15)
+#define I915_EXEC_SUPPORT_48BADDRESS	(1<<15)
+
+#define __I915_EXEC_UNKNOWN_FLAGS -(1<<16)
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \

Comments

On Wed, Jun 10, 2015 at 05:46:54PM +0100, Michel Thierry wrote:
> There are some allocations that must be only referenced by 32bit
> offsets. To limit the chances of having the first 4GB already full,
> objects not requiring this workaround use DRM_MM_SEARCH_BELOW/
> DRM_MM_CREATE_TOP flags
> 
> User must pass I915_EXEC_SUPPORTS_48BADDRESS flag to indicate it can
> be allocated above the 32b address range.

This should be a per-object flag not per-execbuffer.
 
> The flag is ignored in 32b PPGTT.
> 
> v2: Changed flag logic from neeeds_32b, to supports_48b.
> 
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h            |  1 +
>  drivers/gpu/drm/i915/i915_gem.c            | 19 ++++++++++++++--
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 36 +++++++++++++++++++++---------
>  include/uapi/drm/i915_drm.h                |  4 +++-
>  4 files changed, 47 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 95f59d3..d73dddb 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2714,6 +2714,7 @@ void i915_gem_vma_destroy(struct i915_vma *vma);
>  #define PIN_OFFSET_BIAS	(1<<3)
>  #define PIN_USER	(1<<4)
>  #define PIN_UPDATE	(1<<5)
> +#define PIN_FULL_RANGE	(1<<6)

Halfway through our flags. We should get around to putting a reminder
that 1<<11 is the last flag.

>  #define PIN_OFFSET_MASK (~4095)
>  int __must_check
>  i915_gem_object_pin(struct drm_i915_gem_object *obj,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 35690ef..e6325a4a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3672,6 +3672,8 @@ i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
>  	struct drm_i915_private *dev_priv = dev->dev_private;
>  	u32 fence_alignment, unfenced_alignment;
>  	u64 size, fence_size;
> +	u32 search_flag = DRM_MM_SEARCH_DEFAULT;
> +	u32 alloc_flag = DRM_MM_CREATE_DEFAULT;
>  	u64 start =
>  		flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0;
>  	u64 end =
> @@ -3713,6 +3715,19 @@ i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
>  						   obj->tiling_mode,
>  						   false);
>  		size = flags & PIN_MAPPABLE ? fence_size : obj->base.size;
> +
> +		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
> +		 * limit address to 4GB-1 for objects requiring this wa; for
> +		 * others, set alloc flag to TOP.
> +		 */
> +		if (USES_FULL_48BIT_PPGTT(dev)) {
> +			if (flags & PIN_FULL_RANGE) {
> +				search_flag = DRM_MM_SEARCH_BELOW;
> +				alloc_flag = DRM_MM_CREATE_TOP;
> +			} else {
> +				end = ((4ULL << GEN8_PDPE_SHIFT) - 1);

Looking at this, I think this is better as two flags. I have used
SEARCH_BELOW in the past to try and keep objects out of the mappable
aperture. Having that as separate flag is quite useful in its own right.

Then the second flag is PIN_BELOW_4G which we can set by default (and
cleared when the user specifies EXEC_OBJECT_48BIT). Not so sure, but I
think with the right combination of flags you can avoid having device
and w/a specific logic here. (This should be mechanism, push the policy
out to the boundary, preferrably into userspace.)

All the i915_gem_execbuffer changes are wrong due to the flag not being
on the object.
-Chris
On Wed, Jun 10, 2015 at 07:09:03PM +0100, Chris Wilson wrote:
> On Wed, Jun 10, 2015 at 05:46:54PM +0100, Michel Thierry wrote:
> > There are some allocations that must be only referenced by 32bit
> > offsets. To limit the chances of having the first 4GB already full,
> > objects not requiring this workaround use DRM_MM_SEARCH_BELOW/
> > DRM_MM_CREATE_TOP flags
> > 
> > User must pass I915_EXEC_SUPPORTS_48BADDRESS flag to indicate it can
> > be allocated above the 32b address range.
> 
> This should be a per-object flag not per-execbuffer.

We need both. This one to opt into the large address space, the per-object
one to apply the w/a. Also libdrm/mesa patches for this are still missing.
-Daniel

>  
> > The flag is ignored in 32b PPGTT.
> > 
> > v2: Changed flag logic from neeeds_32b, to supports_48b.
> > 
> > Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h            |  1 +
> >  drivers/gpu/drm/i915/i915_gem.c            | 19 ++++++++++++++--
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 36 +++++++++++++++++++++---------
> >  include/uapi/drm/i915_drm.h                |  4 +++-
> >  4 files changed, 47 insertions(+), 13 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 95f59d3..d73dddb 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2714,6 +2714,7 @@ void i915_gem_vma_destroy(struct i915_vma *vma);
> >  #define PIN_OFFSET_BIAS	(1<<3)
> >  #define PIN_USER	(1<<4)
> >  #define PIN_UPDATE	(1<<5)
> > +#define PIN_FULL_RANGE	(1<<6)
> 
> Halfway through our flags. We should get around to putting a reminder
> that 1<<11 is the last flag.
> 
> >  #define PIN_OFFSET_MASK (~4095)
> >  int __must_check
> >  i915_gem_object_pin(struct drm_i915_gem_object *obj,
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 35690ef..e6325a4a 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -3672,6 +3672,8 @@ i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
> >  	struct drm_i915_private *dev_priv = dev->dev_private;
> >  	u32 fence_alignment, unfenced_alignment;
> >  	u64 size, fence_size;
> > +	u32 search_flag = DRM_MM_SEARCH_DEFAULT;
> > +	u32 alloc_flag = DRM_MM_CREATE_DEFAULT;
> >  	u64 start =
> >  		flags & PIN_OFFSET_BIAS ? flags & PIN_OFFSET_MASK : 0;
> >  	u64 end =
> > @@ -3713,6 +3715,19 @@ i915_gem_object_bind_to_vm(struct drm_i915_gem_object *obj,
> >  						   obj->tiling_mode,
> >  						   false);
> >  		size = flags & PIN_MAPPABLE ? fence_size : obj->base.size;
> > +
> > +		/* Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset,
> > +		 * limit address to 4GB-1 for objects requiring this wa; for
> > +		 * others, set alloc flag to TOP.
> > +		 */
> > +		if (USES_FULL_48BIT_PPGTT(dev)) {
> > +			if (flags & PIN_FULL_RANGE) {
> > +				search_flag = DRM_MM_SEARCH_BELOW;
> > +				alloc_flag = DRM_MM_CREATE_TOP;
> > +			} else {
> > +				end = ((4ULL << GEN8_PDPE_SHIFT) - 1);
> 
> Looking at this, I think this is better as two flags. I have used
> SEARCH_BELOW in the past to try and keep objects out of the mappable
> aperture. Having that as separate flag is quite useful in its own right.
> 
> Then the second flag is PIN_BELOW_4G which we can set by default (and
> cleared when the user specifies EXEC_OBJECT_48BIT). Not so sure, but I
> think with the right combination of flags you can avoid having device
> and w/a specific logic here. (This should be mechanism, push the policy
> out to the boundary, preferrably into userspace.)
> 
> All the i915_gem_execbuffer changes are wrong due to the flag not being
> on the object.
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
On Wed, Jun 17, 2015 at 02:49:47PM +0200, Daniel Vetter wrote:
> On Wed, Jun 10, 2015 at 07:09:03PM +0100, Chris Wilson wrote:
> > On Wed, Jun 10, 2015 at 05:46:54PM +0100, Michel Thierry wrote:
> > > There are some allocations that must be only referenced by 32bit
> > > offsets. To limit the chances of having the first 4GB already full,
> > > objects not requiring this workaround use DRM_MM_SEARCH_BELOW/
> > > DRM_MM_CREATE_TOP flags
> > > 
> > > User must pass I915_EXEC_SUPPORTS_48BADDRESS flag to indicate it can
> > > be allocated above the 32b address range.
> > 
> > This should be a per-object flag not per-execbuffer.
> 
> We need both. This one to opt into the large address space, the per-object
> one to apply the w/a. Also libdrm/mesa patches for this are still missing.

Do we need the opt in on the context? The 48bit vm is lazily
constructed, if no object asks to use the high range, it will never be
populated. Or is there a cost with preparing a 48bit vm?
-Chris
On Wed, Jun 17, 2015 at 01:53:17PM +0100, Chris Wilson wrote:
> On Wed, Jun 17, 2015 at 02:49:47PM +0200, Daniel Vetter wrote:
> > On Wed, Jun 10, 2015 at 07:09:03PM +0100, Chris Wilson wrote:
> > > On Wed, Jun 10, 2015 at 05:46:54PM +0100, Michel Thierry wrote:
> > > > There are some allocations that must be only referenced by 32bit
> > > > offsets. To limit the chances of having the first 4GB already full,
> > > > objects not requiring this workaround use DRM_MM_SEARCH_BELOW/
> > > > DRM_MM_CREATE_TOP flags
> > > > 
> > > > User must pass I915_EXEC_SUPPORTS_48BADDRESS flag to indicate it can
> > > > be allocated above the 32b address range.
> > > 
> > > This should be a per-object flag not per-execbuffer.
> > 
> > We need both. This one to opt into the large address space, the per-object
> > one to apply the w/a. Also libdrm/mesa patches for this are still missing.
> 
> Do we need the opt in on the context? The 48bit vm is lazily
> constructed, if no object asks to use the high range, it will never be
> populated. Or is there a cost with preparing a 48bit vm?

If we restrict to 4G we'll evict objects if we run out, and will stay
correct even when processing fairly large workloads. With just lazily
eating into 48b that won't be the case. A bit far-fetched, but if we go
to the trouble of implementing this might as well do it right.
-Daniel
On Wed, Jun 17, 2015 at 05:03:19PM +0200, Daniel Vetter wrote:
> On Wed, Jun 17, 2015 at 01:53:17PM +0100, Chris Wilson wrote:
> > On Wed, Jun 17, 2015 at 02:49:47PM +0200, Daniel Vetter wrote:
> > > On Wed, Jun 10, 2015 at 07:09:03PM +0100, Chris Wilson wrote:
> > > > On Wed, Jun 10, 2015 at 05:46:54PM +0100, Michel Thierry wrote:
> > > > > There are some allocations that must be only referenced by 32bit
> > > > > offsets. To limit the chances of having the first 4GB already full,
> > > > > objects not requiring this workaround use DRM_MM_SEARCH_BELOW/
> > > > > DRM_MM_CREATE_TOP flags
> > > > > 
> > > > > User must pass I915_EXEC_SUPPORTS_48BADDRESS flag to indicate it can
> > > > > be allocated above the 32b address range.
> > > > 
> > > > This should be a per-object flag not per-execbuffer.
> > > 
> > > We need both. This one to opt into the large address space, the per-object
> > > one to apply the w/a. Also libdrm/mesa patches for this are still missing.
> > 
> > Do we need the opt in on the context? The 48bit vm is lazily
> > constructed, if no object asks to use the high range, it will never be
> > populated. Or is there a cost with preparing a 48bit vm?
> 
> If we restrict to 4G we'll evict objects if we run out, and will stay
> correct even when processing fairly large workloads. With just lazily
> eating into 48b that won't be the case. A bit far-fetched, but if we go
> to the trouble of implementing this might as well do it right.

i915_evict_something runs between the range requested for pinning. If we
run out of 4G space and the desired pin does not opt into 48bit, we will
evict from the lower 4G.

I obviously missed your concern. Care to elaborate?
-Chris
On Wed, Jun 17, 2015 at 06:37:03PM +0100, Chris Wilson wrote:
> On Wed, Jun 17, 2015 at 05:03:19PM +0200, Daniel Vetter wrote:
> > On Wed, Jun 17, 2015 at 01:53:17PM +0100, Chris Wilson wrote:
> > > On Wed, Jun 17, 2015 at 02:49:47PM +0200, Daniel Vetter wrote:
> > > > On Wed, Jun 10, 2015 at 07:09:03PM +0100, Chris Wilson wrote:
> > > > > On Wed, Jun 10, 2015 at 05:46:54PM +0100, Michel Thierry wrote:
> > > > > > There are some allocations that must be only referenced by 32bit
> > > > > > offsets. To limit the chances of having the first 4GB already full,
> > > > > > objects not requiring this workaround use DRM_MM_SEARCH_BELOW/
> > > > > > DRM_MM_CREATE_TOP flags
> > > > > > 
> > > > > > User must pass I915_EXEC_SUPPORTS_48BADDRESS flag to indicate it can
> > > > > > be allocated above the 32b address range.
> > > > > 
> > > > > This should be a per-object flag not per-execbuffer.
> > > > 
> > > > We need both. This one to opt into the large address space, the per-object
> > > > one to apply the w/a. Also libdrm/mesa patches for this are still missing.
> > > 
> > > Do we need the opt in on the context? The 48bit vm is lazily
> > > constructed, if no object asks to use the high range, it will never be
> > > populated. Or is there a cost with preparing a 48bit vm?
> > 
> > If we restrict to 4G we'll evict objects if we run out, and will stay
> > correct even when processing fairly large workloads. With just lazily
> > eating into 48b that won't be the case. A bit far-fetched, but if we go
> > to the trouble of implementing this might as well do it right.
> 
> i915_evict_something runs between the range requested for pinning. If we
> run out of 4G space and the desired pin does not opt into 48bit, we will
> evict from the lower 4G.
> 
> I obviously missed your concern. Care to elaborate?

Current situation: You always get an address below 4G for all objects,
even if you use more than 4G of textures - the evict code will make space.

New situation with 48b address space enabled but existing userspace and a
total BO set bigger than 4G: The kernel will eventually hand out ppgtt
addresses > 4G, which means if we get such an address potentially even for
an object where this wa needs to apply. This would be a regression. But if
we make 48b strictly opt-in the kernel will restrict _all_ objects to
below 4G, creating no regression.

Ofc new userspace on 48b would set both the execbuf opt-in (or context
flag, we have those now) plus the per-obj "I need this below 4G" flag for
the objects that need this wa.
-Daniel
On Thu, Jun 18, 2015 at 08:45:50AM +0200, Daniel Vetter wrote:
> On Wed, Jun 17, 2015 at 06:37:03PM +0100, Chris Wilson wrote:
> > On Wed, Jun 17, 2015 at 05:03:19PM +0200, Daniel Vetter wrote:
> > > On Wed, Jun 17, 2015 at 01:53:17PM +0100, Chris Wilson wrote:
> > > > On Wed, Jun 17, 2015 at 02:49:47PM +0200, Daniel Vetter wrote:
> > > > > On Wed, Jun 10, 2015 at 07:09:03PM +0100, Chris Wilson wrote:
> > > > > > On Wed, Jun 10, 2015 at 05:46:54PM +0100, Michel Thierry wrote:
> > > > > > > There are some allocations that must be only referenced by 32bit
> > > > > > > offsets. To limit the chances of having the first 4GB already full,
> > > > > > > objects not requiring this workaround use DRM_MM_SEARCH_BELOW/
> > > > > > > DRM_MM_CREATE_TOP flags
> > > > > > > 
> > > > > > > User must pass I915_EXEC_SUPPORTS_48BADDRESS flag to indicate it can
> > > > > > > be allocated above the 32b address range.
> > > > > > 
> > > > > > This should be a per-object flag not per-execbuffer.
> > > > > 
> > > > > We need both. This one to opt into the large address space, the per-object
> > > > > one to apply the w/a. Also libdrm/mesa patches for this are still missing.
> > > > 
> > > > Do we need the opt in on the context? The 48bit vm is lazily
> > > > constructed, if no object asks to use the high range, it will never be
> > > > populated. Or is there a cost with preparing a 48bit vm?
> > > 
> > > If we restrict to 4G we'll evict objects if we run out, and will stay
> > > correct even when processing fairly large workloads. With just lazily
> > > eating into 48b that won't be the case. A bit far-fetched, but if we go
> > > to the trouble of implementing this might as well do it right.
> > 
> > i915_evict_something runs between the range requested for pinning. If we
> > run out of 4G space and the desired pin does not opt into 48bit, we will
> > evict from the lower 4G.
> > 
> > I obviously missed your concern. Care to elaborate?
> 
> Current situation: You always get an address below 4G for all objects,
> even if you use more than 4G of textures - the evict code will make space.
> 
> New situation with 48b address space enabled but existing userspace and a
> total BO set bigger than 4G: The kernel will eventually hand out ppgtt
> addresses > 4G, which means if we get such an address potentially even for
> an object where this wa needs to apply. This would be a regression. But if
> we make 48b strictly opt-in the kernel will restrict _all_ objects to
> below 4G, creating no regression.

How? The pin code requires PIN_48BIT to be set to hand out higher
addresses. That is only set by execbuffer if execobject->flags is also set.
 
> Ofc new userspace on 48b would set both the execbuf opt-in (or context
> flag, we have those now) plus the per-obj "I need this below 4G" flag for
> the objects that need this wa.

I don't see why we need another flag beyond the per-object flag. If you
are thinking validation, we have to validate per-object flags anyway.
-Chris
On Thu, Jun 18, 2015 at 08:03:26AM +0100, Chris Wilson wrote:
> On Thu, Jun 18, 2015 at 08:45:50AM +0200, Daniel Vetter wrote:
> > On Wed, Jun 17, 2015 at 06:37:03PM +0100, Chris Wilson wrote:
> > > On Wed, Jun 17, 2015 at 05:03:19PM +0200, Daniel Vetter wrote:
> > > > On Wed, Jun 17, 2015 at 01:53:17PM +0100, Chris Wilson wrote:
> > > > > On Wed, Jun 17, 2015 at 02:49:47PM +0200, Daniel Vetter wrote:
> > > > > > On Wed, Jun 10, 2015 at 07:09:03PM +0100, Chris Wilson wrote:
> > > > > > > On Wed, Jun 10, 2015 at 05:46:54PM +0100, Michel Thierry wrote:
> > > > > > > > There are some allocations that must be only referenced by 32bit
> > > > > > > > offsets. To limit the chances of having the first 4GB already full,
> > > > > > > > objects not requiring this workaround use DRM_MM_SEARCH_BELOW/
> > > > > > > > DRM_MM_CREATE_TOP flags
> > > > > > > > 
> > > > > > > > User must pass I915_EXEC_SUPPORTS_48BADDRESS flag to indicate it can
> > > > > > > > be allocated above the 32b address range.
> > > > > > > 
> > > > > > > This should be a per-object flag not per-execbuffer.
> > > > > > 
> > > > > > We need both. This one to opt into the large address space, the per-object
> > > > > > one to apply the w/a. Also libdrm/mesa patches for this are still missing.
> > > > > 
> > > > > Do we need the opt in on the context? The 48bit vm is lazily
> > > > > constructed, if no object asks to use the high range, it will never be
> > > > > populated. Or is there a cost with preparing a 48bit vm?
> > > > 
> > > > If we restrict to 4G we'll evict objects if we run out, and will stay
> > > > correct even when processing fairly large workloads. With just lazily
> > > > eating into 48b that won't be the case. A bit far-fetched, but if we go
> > > > to the trouble of implementing this might as well do it right.
> > > 
> > > i915_evict_something runs between the range requested for pinning. If we
> > > run out of 4G space and the desired pin does not opt into 48bit, we will
> > > evict from the lower 4G.
> > > 
> > > I obviously missed your concern. Care to elaborate?
> > 
> > Current situation: You always get an address below 4G for all objects,
> > even if you use more than 4G of textures - the evict code will make space.
> > 
> > New situation with 48b address space enabled but existing userspace and a
> > total BO set bigger than 4G: The kernel will eventually hand out ppgtt
> > addresses > 4G, which means if we get such an address potentially even for
> > an object where this wa needs to apply. This would be a regression. But if
> > we make 48b strictly opt-in the kernel will restrict _all_ objects to
> > below 4G, creating no regression.
> 
> How? The pin code requires PIN_48BIT to be set to hand out higher
> addresses. That is only set by execbuffer if execobject->flags is also set.

I've been dense, somehow I thought we need the execbuf opt-in with the
object opt-out. But opt-in at the object level is indeed all we need.
-Daniel
On Thu, Jun 18, 2015 at 09:11:46AM +0200, Daniel Vetter wrote:
> I've been dense, somehow I thought we need the execbuf opt-in with the
> object opt-out. But opt-in at the object level is indeed all we need.

To be fair and recap our discussion on irc, the other side of the coin
is that at some point we want to use 48bit by default (gen9, gen10,
whenever it is robust!) Daniel's argument is that with an high level
enable bit + opt-out, there is less work in userspace to dtrt.

Imo, having changed userspace to opt-in when possible with gen8, having
userspace opt-in for all objects is then trivial (plus it is then easier
for userspace to disable it again). Having a flag at the execbuf level
would be nice, but given the changes we need in userspace today (to
support either the opt-in/opt-out model), I think a second flag is of no
practical value.
-Chris