[06/10] drm/ttm: fix busy memory to fail other user v10

Submitted by Christian König on May 22, 2019, 12:59 p.m.

Details

Message ID 20190522125947.4592-6-christian.koenig@amd.com
State New
Headers show
Series "Series without cover letter" ( rev: 2 1 ) in DRI devel

Not browsing as part of any series.

Commit Message

Christian König May 22, 2019, 12:59 p.m.
BOs on the LRU might be blocked during command submission
and cause OOM situations.

Avoid this by blocking for the first busy BO not locked by
the same ticket as the BO we are searching space for.

v10: completely start over with the patch since we didn't
     handle a whole bunch of corner cases.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 11 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 4c6389d849ed..861facac33d4 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -771,32 +771,72 @@  EXPORT_SYMBOL(ttm_bo_eviction_valuable);
  * b. Otherwise, trylock it.
  */
 static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
-			struct ttm_operation_ctx *ctx, bool *locked)
+			struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
 {
 	bool ret = false;
 
-	*locked = false;
 	if (bo->resv == ctx->resv) {
 		reservation_object_assert_held(bo->resv);
 		if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
 		    || !list_empty(&bo->ddestroy))
 			ret = true;
+		*locked = false;
+		if (busy)
+			*busy = false;
 	} else {
-		*locked = reservation_object_trylock(bo->resv);
-		ret = *locked;
+		ret = reservation_object_trylock(bo->resv);
+		*locked = ret;
+		if (busy)
+			*busy = !ret;
 	}
 
 	return ret;
 }
 
+/**
+ * ttm_mem_evict_wait_busy - wait for a busy BO to become available
+ *
+ * @busy_bo: BO which couldn't be locked with trylock
+ * @ctx: operation context
+ * @ticket: acquire ticket
+ *
+ * Try to lock a busy buffer object to avoid failing eviction.
+ */
+static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
+				   struct ttm_operation_ctx *ctx,
+				   struct ww_acquire_ctx *ticket)
+{
+	int r;
+
+	if (!busy_bo || !ticket)
+		return -EBUSY;
+
+	if (ctx->interruptible)
+		r = reservation_object_lock_interruptible(busy_bo->resv,
+							  ticket);
+	else
+		r = reservation_object_lock(busy_bo->resv, ticket);
+
+	/*
+	 * TODO: It would be better to keep the BO locked until allocation is at
+	 * least tried one more time, but that would mean a much larger rework
+	 * of TTM.
+	 */
+	if (!r)
+		reservation_object_unlock(busy_bo->resv);
+
+	return r == -EDEADLK ? -EAGAIN : r;
+}
+
 static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 			       uint32_t mem_type,
 			       const struct ttm_place *place,
-			       struct ttm_operation_ctx *ctx)
+			       struct ttm_operation_ctx *ctx,
+			       struct ww_acquire_ctx *ticket)
 {
+	struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
 	struct ttm_bo_global *glob = bdev->glob;
 	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
-	struct ttm_buffer_object *bo = NULL;
 	bool locked = false;
 	unsigned i;
 	int ret;
@@ -804,8 +844,15 @@  static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &man->lru[i], lru) {
-			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
+			bool busy;
+
+			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							    &busy)) {
+				if (busy && !busy_bo &&
+				    bo->resv->lock.ctx != ticket)
+					busy_bo = bo;
 				continue;
+			}
 
 			if (place && !bdev->driver->eviction_valuable(bo,
 								      place)) {
@@ -824,8 +871,13 @@  static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
 	}
 
 	if (!bo) {
+		if (busy_bo)
+			ttm_bo_get(busy_bo);
 		spin_unlock(&glob->lru_lock);
-		return -EBUSY;
+		ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
+		if (busy_bo)
+			ttm_bo_put(busy_bo);
+		return ret;
 	}
 
 	kref_get(&bo->list_kref);
@@ -911,7 +963,8 @@  static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
 			return ret;
 		if (mem->mm_node)
 			break;
-		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
+		ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
+					  bo->resv->lock.ctx);
 		if (unlikely(ret != 0))
 			return ret;
 	} while (1);
@@ -1426,7 +1479,8 @@  static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		while (!list_empty(&man->lru[i])) {
 			spin_unlock(&glob->lru_lock);
-			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
+			ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
+						  NULL);
 			if (ret)
 				return ret;
 			spin_lock(&glob->lru_lock);
@@ -1797,7 +1851,8 @@  int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
 	spin_lock(&glob->lru_lock);
 	for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
 		list_for_each_entry(bo, &glob->swap_lru[i], swap) {
-			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
+			if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
+							   NULL)) {
 				ret = 0;
 				break;
 			}

Comments

On 2019年05月22日 20:59, Christian König wrote:
> [CAUTION: External Email]
>
> BOs on the LRU might be blocked during command submission
> and cause OOM situations.
>
> Avoid this by blocking for the first busy BO not locked by
> the same ticket as the BO we are searching space for.
>
> v10: completely start over with the patch since we didn't
>       handled a whole bunch of corner cases.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>   1 file changed, 66 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index 4c6389d849ed..861facac33d4 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>    * b. Otherwise, trylock it.
>    */
>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo,
> -                       struct ttm_operation_ctx *ctx, bool *locked)
> +                       struct ttm_operation_ctx *ctx, bool *locked, bool *busy)
>   {
>          bool ret = false;
>
> -       *locked = false;
>          if (bo->resv == ctx->resv) {
>                  reservation_object_assert_held(bo->resv);
>                  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>                      || !list_empty(&bo->ddestroy))
>                          ret = true;
> +               *locked = false;
> +               if (busy)
> +                       *busy = false;
>          } else {
> -               *locked = reservation_object_trylock(bo->resv);
> -               ret = *locked;
> +               ret = reservation_object_trylock(bo->resv);
> +               *locked = ret;
> +               if (busy)
> +                       *busy = !ret;
>          }
>
>          return ret;
>   }
>
> +/**
> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
> + *
> + * @busy_bo: BO which couldn't be locked with trylock
> + * @ctx: operation context
> + * @ticket: acquire ticket
> + *
> + * Try to lock a busy buffer object to avoid failing eviction.
> + */
> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
> +                                  struct ttm_operation_ctx *ctx,
> +                                  struct ww_acquire_ctx *ticket)
> +{
> +       int r;
> +
> +       if (!busy_bo || !ticket)
> +               return -EBUSY;
> +
> +       if (ctx->interruptible)
> +               r = reservation_object_lock_interruptible(busy_bo->resv,
> +                                                         ticket);
> +       else
> +               r = reservation_object_lock(busy_bo->resv, ticket);
> +
> +       /*
> +        * TODO: It would be better to keep the BO locked until allocation is at
> +        * least tried one more time, but that would mean a much larger rework
> +        * of TTM.
> +        */
> +       if (!r)
> +               reservation_object_unlock(busy_bo->resv);
> +
> +       return r == -EDEADLK ? -EAGAIN : r;
> +}
> +
>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>                                 uint32_t mem_type,
>                                 const struct ttm_place *place,
> -                              struct ttm_operation_ctx *ctx)
> +                              struct ttm_operation_ctx *ctx,
> +                              struct ww_acquire_ctx *ticket)
>   {
> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>          struct ttm_bo_global *glob = bdev->glob;
>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];
> -       struct ttm_buffer_object *bo = NULL;
>          bool locked = false;
>          unsigned i;
>          int ret;
> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>          spin_lock(&glob->lru_lock);
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                  list_for_each_entry(bo, &man->lru[i], lru) {
> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked))
> +                       bool busy;
> +
> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +                                                           &busy)) {
> +                               if (busy && !busy_bo &&
> +                                   bo->resv->lock.ctx != ticket)
> +                                       busy_bo = bo;
>                                  continue;
> +                       }
>
>                          if (place && !bdev->driver->eviction_valuable(bo,
>                                                                        place)) {
> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>          }
>
>          if (!bo) {
> +               if (busy_bo)
> +                       ttm_bo_get(busy_bo);
>                  spin_unlock(&glob->lru_lock);
> -               return -EBUSY;
> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
If you rely on EAGAIN, why do you still try to lock busy_bo? Any
negative effect if we directly return EAGAIN without trying the lock?

-David
> +               if (busy_bo)
> +                       ttm_bo_put(busy_bo);
> +               return ret;
>          }
>
>          kref_get(&bo->list_kref);
> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
>                          return ret;
>                  if (mem->mm_node)
>                          break;
> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx);
> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, ctx,
> +                                         bo->resv->lock.ctx);
>                  if (unlikely(ret != 0))
>                          return ret;
>          } while (1);
> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                  while (!list_empty(&man->lru[i])) {
>                          spin_unlock(&glob->lru_lock);
> -                       ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx);
> +                       ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx,
> +                                                 NULL);
>                          if (ret)
>                                  return ret;
>                          spin_lock(&glob->lru_lock);
> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct ttm_operation_ctx *ctx)
>          spin_lock(&glob->lru_lock);
>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>                  list_for_each_entry(bo, &glob->swap_lru[i], swap) {
> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked)) {
> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
> +                                                          NULL)) {
>                                  ret = 0;
>                                  break;
>                          }
> --
> 2.17.1
>
Am 23.05.19 um 12:24 schrieb zhoucm1:
>
>
> On 2019年05月22日 20:59, Christian König wrote:
>> [CAUTION: External Email]
>>
>> BOs on the LRU might be blocked during command submission
>> and cause OOM situations.
>>
>> Avoid this by blocking for the first busy BO not locked by
>> the same ticket as the BO we are searching space for.
>>
>> v10: completely start over with the patch since we didn't
>>       handled a whole bunch of corner cases.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>>   1 file changed, 66 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> index 4c6389d849ed..861facac33d4 100644
>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>    * b. Otherwise, trylock it.
>>    */
>>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object 
>> *bo,
>> -                       struct ttm_operation_ctx *ctx, bool *locked)
>> +                       struct ttm_operation_ctx *ctx, bool *locked, 
>> bool *busy)
>>   {
>>          bool ret = false;
>>
>> -       *locked = false;
>>          if (bo->resv == ctx->resv) {
>>                  reservation_object_assert_held(bo->resv);
>>                  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>                      || !list_empty(&bo->ddestroy))
>>                          ret = true;
>> +               *locked = false;
>> +               if (busy)
>> +                       *busy = false;
>>          } else {
>> -               *locked = reservation_object_trylock(bo->resv);
>> -               ret = *locked;
>> +               ret = reservation_object_trylock(bo->resv);
>> +               *locked = ret;
>> +               if (busy)
>> +                       *busy = !ret;
>>          }
>>
>>          return ret;
>>   }
>>
>> +/**
>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>> + *
>> + * @busy_bo: BO which couldn't be locked with trylock
>> + * @ctx: operation context
>> + * @ticket: acquire ticket
>> + *
>> + * Try to lock a busy buffer object to avoid failing eviction.
>> + */
>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>> +                                  struct ttm_operation_ctx *ctx,
>> +                                  struct ww_acquire_ctx *ticket)
>> +{
>> +       int r;
>> +
>> +       if (!busy_bo || !ticket)
>> +               return -EBUSY;
>> +
>> +       if (ctx->interruptible)
>> +               r = reservation_object_lock_interruptible(busy_bo->resv,
>> + ticket);
>> +       else
>> +               r = reservation_object_lock(busy_bo->resv, ticket);
>> +
>> +       /*
>> +        * TODO: It would be better to keep the BO locked until 
>> allocation is at
>> +        * least tried one more time, but that would mean a much 
>> larger rework
>> +        * of TTM.
>> +        */
>> +       if (!r)
>> +               reservation_object_unlock(busy_bo->resv);
>> +
>> +       return r == -EDEADLK ? -EAGAIN : r;
>> +}
>> +
>>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>                                 uint32_t mem_type,
>>                                 const struct ttm_place *place,
>> -                              struct ttm_operation_ctx *ctx)
>> +                              struct ttm_operation_ctx *ctx,
>> +                              struct ww_acquire_ctx *ticket)
>>   {
>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>          struct ttm_bo_global *glob = bdev->glob;
>>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>> -       struct ttm_buffer_object *bo = NULL;
>>          bool locked = false;
>>          unsigned i;
>>          int ret;
>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct 
>> ttm_bo_device *bdev,
>>          spin_lock(&glob->lru_lock);
>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                  list_for_each_entry(bo, &man->lru[i], lru) {
>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, 
>> &locked))
>> +                       bool busy;
>> +
>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx, 
>> &locked,
>> + &busy)) {
>> +                               if (busy && !busy_bo &&
>> +                                   bo->resv->lock.ctx != ticket)
>> +                                       busy_bo = bo;
>>                                  continue;
>> +                       }
>>
>>                          if (place && 
>> !bdev->driver->eviction_valuable(bo,
>> place)) {
>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct 
>> ttm_bo_device *bdev,
>>          }
>>
>>          if (!bo) {
>> +               if (busy_bo)
>> +                       ttm_bo_get(busy_bo);
>>                  spin_unlock(&glob->lru_lock);
>> -               return -EBUSY;
>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
> If you rely on EAGAIN, why do you still try to lock busy_bo? any 
> negative effect if directly return EAGAIN without tring lock?

Yeah, that would burn a lot of CPU cycles because we would essentially 
busy wait for the BO to become unlocked.

When we only return in case of a deadlock the other thread can continue 
with its eviction while we reacquire all locks during EAGAIN handling.

Even directly unlocking the BO as I do here is a bit questionable. But I 
couldn't get the original logic with finding a new BO to evict to work 
correctly, that's why I have the TODO comment in the function itself as 
well.

Christian.

>
> -David
>> +               if (busy_bo)
>> +                       ttm_bo_put(busy_bo);
>> +               return ret;
>>          }
>>
>>          kref_get(&bo->list_kref);
>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct 
>> ttm_buffer_object *bo,
>>                          return ret;
>>                  if (mem->mm_node)
>>                          break;
>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, 
>> ctx);
>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place, 
>> ctx,
>> + bo->resv->lock.ctx);
>>                  if (unlikely(ret != 0))
>>                          return ret;
>>          } while (1);
>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct 
>> ttm_bo_device *bdev,
>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                  while (!list_empty(&man->lru[i])) {
>>                          spin_unlock(&glob->lru_lock);
>> -                       ret = ttm_mem_evict_first(bdev, mem_type, 
>> NULL, &ctx);
>> +                       ret = ttm_mem_evict_first(bdev, mem_type, 
>> NULL, &ctx,
>> +                                                 NULL);
>>                          if (ret)
>>                                  return ret;
>>                          spin_lock(&glob->lru_lock);
>> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, 
>> struct ttm_operation_ctx *ctx)
>>          spin_lock(&glob->lru_lock);
>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>                  list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx, 
>> &locked)) {
>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx, 
>> &locked,
>> + NULL)) {
>>                                  ret = 0;
>>                                  break;
>>                          }
>> -- 
>> 2.17.1
>>
>
在 2019/5/23 19:03, Christian König 写道:
> [CAUTION: External Email]

>

> Am 23.05.19 um 12:24 schrieb zhoucm1:

>>

>>

>> On 2019年05月22日 20:59, Christian König wrote:

>>> [CAUTION: External Email]

>>>

>>> BOs on the LRU might be blocked during command submission

>>> and cause OOM situations.

>>>

>>> Avoid this by blocking for the first busy BO not locked by

>>> the same ticket as the BO we are searching space for.

>>>

>>> v10: completely start over with the patch since we didn't

>>>       handled a whole bunch of corner cases.

>>>

>>> Signed-off-by: Christian König <christian.koenig@amd.com>

>>> ---

>>>   drivers/gpu/drm/ttm/ttm_bo.c | 77 

>>> ++++++++++++++++++++++++++++++------

>>>   1 file changed, 66 insertions(+), 11 deletions(-)

>>>

>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 

>>> b/drivers/gpu/drm/ttm/ttm_bo.c

>>> index 4c6389d849ed..861facac33d4 100644

>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c

>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c

>>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);

>>>    * b. Otherwise, trylock it.

>>>    */

>>>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object

>>> *bo,

>>> -                       struct ttm_operation_ctx *ctx, bool *locked)

>>> +                       struct ttm_operation_ctx *ctx, bool *locked,

>>> bool *busy)

>>>   {

>>>          bool ret = false;

>>>

>>> -       *locked = false;

>>>          if (bo->resv == ctx->resv) {

>>>                  reservation_object_assert_held(bo->resv);

>>>                  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT

>>>                      || !list_empty(&bo->ddestroy))

>>>                          ret = true;

>>> +               *locked = false;

>>> +               if (busy)

>>> +                       *busy = false;

>>>          } else {

>>> -               *locked = reservation_object_trylock(bo->resv);

>>> -               ret = *locked;

>>> +               ret = reservation_object_trylock(bo->resv);

>>> +               *locked = ret;

>>> +               if (busy)

>>> +                       *busy = !ret;

>>>          }

>>>

>>>          return ret;

>>>   }

>>>

>>> +/**

>>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available

>>> + *

>>> + * @busy_bo: BO which couldn't be locked with trylock

>>> + * @ctx: operation context

>>> + * @ticket: acquire ticket

>>> + *

>>> + * Try to lock a busy buffer object to avoid failing eviction.

>>> + */

>>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,

>>> +                                  struct ttm_operation_ctx *ctx,

>>> +                                  struct ww_acquire_ctx *ticket)

>>> +{

>>> +       int r;

>>> +

>>> +       if (!busy_bo || !ticket)

>>> +               return -EBUSY;

>>> +

>>> +       if (ctx->interruptible)

>>> +               r = 

>>> reservation_object_lock_interruptible(busy_bo->resv,

>>> + ticket);

>>> +       else

>>> +               r = reservation_object_lock(busy_bo->resv, ticket);

>>> +

>>> +       /*

>>> +        * TODO: It would be better to keep the BO locked until

>>> allocation is at

>>> +        * least tried one more time, but that would mean a much

>>> larger rework

>>> +        * of TTM.

>>> +        */

>>> +       if (!r)

>>> +               reservation_object_unlock(busy_bo->resv);

>>> +

>>> +       return r == -EDEADLK ? -EAGAIN : r;

>>> +}

>>> +

>>>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,

>>>                                 uint32_t mem_type,

>>>                                 const struct ttm_place *place,

>>> -                              struct ttm_operation_ctx *ctx)

>>> +                              struct ttm_operation_ctx *ctx,

>>> +                              struct ww_acquire_ctx *ticket)

>>>   {

>>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;

>>>          struct ttm_bo_global *glob = bdev->glob;

>>>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];

>>> -       struct ttm_buffer_object *bo = NULL;

>>>          bool locked = false;

>>>          unsigned i;

>>>          int ret;

>>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct

>>> ttm_bo_device *bdev,

>>>          spin_lock(&glob->lru_lock);

>>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

>>>                  list_for_each_entry(bo, &man->lru[i], lru) {

>>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,

>>> &locked))

>>> +                       bool busy;

>>> +

>>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,

>>> &locked,

>>> + &busy)) {

>>> +                               if (busy && !busy_bo &&

>>> +                                   bo->resv->lock.ctx != ticket)

>>> +                                       busy_bo = bo;

>>>                                  continue;

>>> +                       }

>>>

>>>                          if (place &&

>>> !bdev->driver->eviction_valuable(bo,

>>> place)) {

>>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct

>>> ttm_bo_device *bdev,

>>>          }

>>>

>>>          if (!bo) {

>>> +               if (busy_bo)

>>> +                       ttm_bo_get(busy_bo);

>>>                  spin_unlock(&glob->lru_lock);

>>> -               return -EBUSY;

>>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);

>> If you rely on EAGAIN, why do you still try to lock busy_bo? any

>> negative effect if directly return EAGAIN without tring lock?

>

> Yeah, that would burn a lot of CPU cycles because we would essentially

> busy wait for the BO to become unlocked.

>

> When we only return in case of a deadlock the other thread can continue

> with its eviction while we reacquire all looks during EAGAIN handling.

>

> Even directly unlocking the BO as I do here is a bit questionable. But I

> couldn't get the original logic with finding a new BO to evict to work

> correctly, that's why I have the TODO comment in the function itself as

> well.


Yes, it looks very weird.

The original logic should already work, as verified by Prike. Frankly, you
need to go through the LRU lookup loop again, judge whether eviction is
allowable, check whether busy_bo->resv and the required BO's resv are the
same, and mark eviction as allowable in ctx for the same lock of busy_bo
before looping again.

Or at least, if busy_bo is evict-allowable, you can directly evict
busy_bo after locking it successfully.

-David

>

> Christian.

>

>>

>> -David

>>> +               if (busy_bo)

>>> +                       ttm_bo_put(busy_bo);

>>> +               return ret;

>>>          }

>>>

>>>          kref_get(&bo->list_kref);

>>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct

>>> ttm_buffer_object *bo,

>>>                          return ret;

>>>                  if (mem->mm_node)

>>>                          break;

>>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,

>>> ctx);

>>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,

>>> ctx,

>>> + bo->resv->lock.ctx);

>>>                  if (unlikely(ret != 0))

>>>                          return ret;

>>>          } while (1);

>>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct

>>> ttm_bo_device *bdev,

>>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

>>>                  while (!list_empty(&man->lru[i])) {

>>>                          spin_unlock(&glob->lru_lock);

>>> -                       ret = ttm_mem_evict_first(bdev, mem_type,

>>> NULL, &ctx);

>>> +                       ret = ttm_mem_evict_first(bdev, mem_type,

>>> NULL, &ctx,

>>> +                                                 NULL);

>>>                          if (ret)

>>>                                  return ret;

>>>                          spin_lock(&glob->lru_lock);

>>> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,

>>> struct ttm_operation_ctx *ctx)

>>>          spin_lock(&glob->lru_lock);

>>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

>>>                  list_for_each_entry(bo, &glob->swap_lru[i], swap) {

>>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx,

>>> &locked)) {

>>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx,

>>> &locked,

>>> + NULL)) {

>>>                                  ret = 0;

>>>                                  break;

>>>                          }

>>> -- 

>>> 2.17.1

>>>

>>

>
Am 23.05.19 um 13:50 schrieb Zhou, David(ChunMing):
> 在 2019/5/23 19:03, Christian König 写道:

>> [CAUTION: External Email]

>>

>> Am 23.05.19 um 12:24 schrieb zhoucm1:

>>>

>>> On 2019年05月22日 20:59, Christian König wrote:

>>>> [CAUTION: External Email]

>>>>

>>>> BOs on the LRU might be blocked during command submission

>>>> and cause OOM situations.

>>>>

>>>> Avoid this by blocking for the first busy BO not locked by

>>>> the same ticket as the BO we are searching space for.

>>>>

>>>> v10: completely start over with the patch since we didn't

>>>>        handled a whole bunch of corner cases.

>>>>

>>>> Signed-off-by: Christian König <christian.koenig@amd.com>

>>>> ---

>>>>    drivers/gpu/drm/ttm/ttm_bo.c | 77

>>>> ++++++++++++++++++++++++++++++------

>>>>    1 file changed, 66 insertions(+), 11 deletions(-)

>>>>

>>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c

>>>> b/drivers/gpu/drm/ttm/ttm_bo.c

>>>> index 4c6389d849ed..861facac33d4 100644

>>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c

>>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c

>>>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);

>>>>     * b. Otherwise, trylock it.

>>>>     */

>>>>    static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object

>>>> *bo,

>>>> -                       struct ttm_operation_ctx *ctx, bool *locked)

>>>> +                       struct ttm_operation_ctx *ctx, bool *locked,

>>>> bool *busy)

>>>>    {

>>>>           bool ret = false;

>>>>

>>>> -       *locked = false;

>>>>           if (bo->resv == ctx->resv) {

>>>>                   reservation_object_assert_held(bo->resv);

>>>>                   if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT

>>>>                       || !list_empty(&bo->ddestroy))

>>>>                           ret = true;

>>>> +               *locked = false;

>>>> +               if (busy)

>>>> +                       *busy = false;

>>>>           } else {

>>>> -               *locked = reservation_object_trylock(bo->resv);

>>>> -               ret = *locked;

>>>> +               ret = reservation_object_trylock(bo->resv);

>>>> +               *locked = ret;

>>>> +               if (busy)

>>>> +                       *busy = !ret;

>>>>           }

>>>>

>>>>           return ret;

>>>>    }

>>>>

>>>> +/**

>>>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available

>>>> + *

>>>> + * @busy_bo: BO which couldn't be locked with trylock

>>>> + * @ctx: operation context

>>>> + * @ticket: acquire ticket

>>>> + *

>>>> + * Try to lock a busy buffer object to avoid failing eviction.

>>>> + */

>>>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,

>>>> +                                  struct ttm_operation_ctx *ctx,

>>>> +                                  struct ww_acquire_ctx *ticket)

>>>> +{

>>>> +       int r;

>>>> +

>>>> +       if (!busy_bo || !ticket)

>>>> +               return -EBUSY;

>>>> +

>>>> +       if (ctx->interruptible)

>>>> +               r =

>>>> reservation_object_lock_interruptible(busy_bo->resv,

>>>> + ticket);

>>>> +       else

>>>> +               r = reservation_object_lock(busy_bo->resv, ticket);

>>>> +

>>>> +       /*

>>>> +        * TODO: It would be better to keep the BO locked until

>>>> allocation is at

>>>> +        * least tried one more time, but that would mean a much

>>>> larger rework

>>>> +        * of TTM.

>>>> +        */

>>>> +       if (!r)

>>>> +               reservation_object_unlock(busy_bo->resv);

>>>> +

>>>> +       return r == -EDEADLK ? -EAGAIN : r;

>>>> +}

>>>> +

>>>>    static int ttm_mem_evict_first(struct ttm_bo_device *bdev,

>>>>                                  uint32_t mem_type,

>>>>                                  const struct ttm_place *place,

>>>> -                              struct ttm_operation_ctx *ctx)

>>>> +                              struct ttm_operation_ctx *ctx,

>>>> +                              struct ww_acquire_ctx *ticket)

>>>>    {

>>>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;

>>>>           struct ttm_bo_global *glob = bdev->glob;

>>>>           struct ttm_mem_type_manager *man = &bdev->man[mem_type];

>>>> -       struct ttm_buffer_object *bo = NULL;

>>>>           bool locked = false;

>>>>           unsigned i;

>>>>           int ret;

>>>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct

>>>> ttm_bo_device *bdev,

>>>>           spin_lock(&glob->lru_lock);

>>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

>>>>                   list_for_each_entry(bo, &man->lru[i], lru) {

>>>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,

>>>> &locked))

>>>> +                       bool busy;

>>>> +

>>>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,

>>>> &locked,

>>>> + &busy)) {

>>>> +                               if (busy && !busy_bo &&

>>>> +                                   bo->resv->lock.ctx != ticket)

>>>> +                                       busy_bo = bo;

>>>>                                   continue;

>>>> +                       }

>>>>

>>>>                           if (place &&

>>>> !bdev->driver->eviction_valuable(bo,

>>>> place)) {

>>>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct

>>>> ttm_bo_device *bdev,

>>>>           }

>>>>

>>>>           if (!bo) {

>>>> +               if (busy_bo)

>>>> +                       ttm_bo_get(busy_bo);

>>>>                   spin_unlock(&glob->lru_lock);

>>>> -               return -EBUSY;

>>>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);

>>> If you rely on EAGAIN, why do you still try to lock busy_bo? any

>>> negative effect if directly return EAGAIN without tring lock?

>> Yeah, that would burn a lot of CPU cycles because we would essentially

>> busy wait for the BO to become unlocked.

>>

>> When we only return in case of a deadlock the other thread can continue

>> with its eviction while we reacquire all looks during EAGAIN handling.

>>

>> Even directly unlocking the BO as I do here is a bit questionable. But I

>> couldn't get the original logic with finding a new BO to evict to work

>> correctly, that's why I have the TODO comment in the function itself as

>> well.

> Yes, it looks very wired.

>

> original logic should already work verified by Prike.


Unfortunately that didn't work out correctly. Marek came up with a GDS-related
test case which showed that we corrupted the locking object somehow.

> Friendly, you

> need go through lookup lru loop again, judge allowable, and whether

> busy_bo->resv and requried_bo->resv are same, and make evict_allowable

> in ctx for same lock of busy_bo before loop again.


Actually we don't necessarily need to do this. See, Marek's test case was 
that you not only have two tasks fighting for resources, but three.

So what happens is that task A is doing a command submission swapping 
things in while tasks B and C are waiting. When A unlocked, B got the lock 
and swapped things out again. When B then unlocked, C got the lock and 
also tried to evict something and failed miserably.

In this failing path of C we somehow overwrote the memory of the lock 
(some pointer must have gone banana).

Anyway, the solution we have here now seems to work for both Prike's and 
Marek's test cases, but it is definitely not ideal. E.g. we rely on either 
being able to evict something sooner or later, or running out of BOs to 
wait on to become idle.

Apart from that Prike now reported an OOM in the VM code, but that is 
completely unrelated to this problem.

Regards,
Christian.

> Or at least, if busy_bo is evict allowable, you  can direclty evict

> busy_bo after locked successfully.

>

> -David

>

>> Christian.

>>

>>> -David

>>>> +               if (busy_bo)

>>>> +                       ttm_bo_put(busy_bo);

>>>> +               return ret;

>>>>           }

>>>>

>>>>           kref_get(&bo->list_kref);

>>>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct

>>>> ttm_buffer_object *bo,

>>>>                           return ret;

>>>>                   if (mem->mm_node)

>>>>                           break;

>>>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,

>>>> ctx);

>>>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,

>>>> ctx,

>>>> + bo->resv->lock.ctx);

>>>>                   if (unlikely(ret != 0))

>>>>                           return ret;

>>>>           } while (1);

>>>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct

>>>> ttm_bo_device *bdev,

>>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

>>>>                   while (!list_empty(&man->lru[i])) {

>>>>                           spin_unlock(&glob->lru_lock);

>>>> -                       ret = ttm_mem_evict_first(bdev, mem_type,

>>>> NULL, &ctx);

>>>> +                       ret = ttm_mem_evict_first(bdev, mem_type,

>>>> NULL, &ctx,

>>>> +                                                 NULL);

>>>>                           if (ret)

>>>>                                   return ret;

>>>>                           spin_lock(&glob->lru_lock);

>>>> @@ -1797,7 +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob,

>>>> struct ttm_operation_ctx *ctx)

>>>>           spin_lock(&glob->lru_lock);

>>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

>>>>                   list_for_each_entry(bo, &glob->swap_lru[i], swap) {

>>>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx,

>>>> &locked)) {

>>>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx,

>>>> &locked,

>>>> + NULL)) {

>>>>                                   ret = 0;

>>>>                                   break;

>>>>                           }

>>>> -- 

>>>> 2.17.1

>>>>
Using Abaqus to torture the amdgpu driver repeatedly, we run into a deadlock when locking the first busy BO. The caller will
then return EAGAIN, and eventually dm_plane_helper_prepare_fb pops up a "pin failed" message. For this case, in patch #7
can we treat EAGAIN like ERESTARTSYS, which would filter out the annoying error message?

Thanks,
Prike
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 

Sent: Thursday, May 23, 2019 7:04 PM
To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; Olsak, Marek <Marek.Olsak@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; Liang, Prike <Prike.Liang@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10

[CAUTION: External Email]

Am 23.05.19 um 12:24 schrieb zhoucm1:
>

>

> On 2019年05月22日 20:59, Christian König wrote:

>> [CAUTION: External Email]

>>

>> BOs on the LRU might be blocked during command submission and cause 

>> OOM situations.

>>

>> Avoid this by blocking for the first busy BO not locked by the same 

>> ticket as the BO we are searching space for.

>>

>> v10: completely start over with the patch since we didn't

>>       handled a whole bunch of corner cases.

>>

>> Signed-off-by: Christian König <christian.koenig@amd.com>

>> ---

>>   drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------

>>   1 file changed, 66 insertions(+), 11 deletions(-)

>>

>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c 

>> b/drivers/gpu/drm/ttm/ttm_bo.c index 4c6389d849ed..861facac33d4 

>> 100644

>> --- a/drivers/gpu/drm/ttm/ttm_bo.c

>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c

>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);

>>    * b. Otherwise, trylock it.

>>    */

>>   static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object 

>> *bo,

>> -                       struct ttm_operation_ctx *ctx, bool *locked)

>> +                       struct ttm_operation_ctx *ctx, bool *locked,

>> bool *busy)

>>   {

>>          bool ret = false;

>>

>> -       *locked = false;

>>          if (bo->resv == ctx->resv) {

>>                  reservation_object_assert_held(bo->resv);

>>                  if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT

>>                      || !list_empty(&bo->ddestroy))

>>                          ret = true;

>> +               *locked = false;

>> +               if (busy)

>> +                       *busy = false;

>>          } else {

>> -               *locked = reservation_object_trylock(bo->resv);

>> -               ret = *locked;

>> +               ret = reservation_object_trylock(bo->resv);

>> +               *locked = ret;

>> +               if (busy)

>> +                       *busy = !ret;

>>          }

>>

>>          return ret;

>>   }

>>

>> +/**

>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available

>> + *

>> + * @busy_bo: BO which couldn't be locked with trylock

>> + * @ctx: operation context

>> + * @ticket: acquire ticket

>> + *

>> + * Try to lock a busy buffer object to avoid failing eviction.

>> + */

>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,

>> +                                  struct ttm_operation_ctx *ctx,

>> +                                  struct ww_acquire_ctx *ticket) {

>> +       int r;

>> +

>> +       if (!busy_bo || !ticket)

>> +               return -EBUSY;

>> +

>> +       if (ctx->interruptible)

>> +               r = 

>> + reservation_object_lock_interruptible(busy_bo->resv,

>> + ticket);

>> +       else

>> +               r = reservation_object_lock(busy_bo->resv, ticket);

>> +

>> +       /*

>> +        * TODO: It would be better to keep the BO locked until

>> allocation is at

>> +        * least tried one more time, but that would mean a much

>> larger rework

>> +        * of TTM.

>> +        */

>> +       if (!r)

>> +               reservation_object_unlock(busy_bo->resv);

>> +

>> +       return r == -EDEADLK ? -EAGAIN : r; }

>> +

>>   static int ttm_mem_evict_first(struct ttm_bo_device *bdev,

>>                                 uint32_t mem_type,

>>                                 const struct ttm_place *place,

>> -                              struct ttm_operation_ctx *ctx)

>> +                              struct ttm_operation_ctx *ctx,

>> +                              struct ww_acquire_ctx *ticket)

>>   {

>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;

>>          struct ttm_bo_global *glob = bdev->glob;

>>          struct ttm_mem_type_manager *man = &bdev->man[mem_type];

>> -       struct ttm_buffer_object *bo = NULL;

>>          bool locked = false;

>>          unsigned i;

>>          int ret;

>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct 

>> ttm_bo_device *bdev,

>>          spin_lock(&glob->lru_lock);

>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

>>                  list_for_each_entry(bo, &man->lru[i], lru) {

>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,

>> &locked))

>> +                       bool busy;

>> +

>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,

>> &locked,

>> + &busy)) {

>> +                               if (busy && !busy_bo &&

>> +                                   bo->resv->lock.ctx != ticket)

>> +                                       busy_bo = bo;

>>                                  continue;

>> +                       }

>>

>>                          if (place && 

>> !bdev->driver->eviction_valuable(bo,

>> place)) {

>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct 

>> ttm_bo_device *bdev,

>>          }

>>

>>          if (!bo) {

>> +               if (busy_bo)

>> +                       ttm_bo_get(busy_bo);

>>                  spin_unlock(&glob->lru_lock);

>> -               return -EBUSY;

>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);

> If you rely on EAGAIN, why do you still try to lock busy_bo? any 

> negative effect if directly return EAGAIN without tring lock?


Yeah, that would burn a lot of CPU cycles because we would essentially busy wait for the BO to become unlocked.

When we only return in case of a deadlock the other thread can continue with its eviction while we reacquire all locks during EAGAIN handling.

Even directly unlocking the BO as I do here is a bit questionable. But I couldn't get the original logic with finding a new BO to evict to work correctly, that's why I have the TODO comment in the function itself as well.

Christian.

>

> -David

>> +               if (busy_bo)

>> +                       ttm_bo_put(busy_bo);

>> +               return ret;

>>          }

>>

>>          kref_get(&bo->list_kref);

>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct 

>> ttm_buffer_object *bo,

>>                          return ret;

>>                  if (mem->mm_node)

>>                          break;

>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,

>> ctx);

>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,

>> ctx,

>> + bo->resv->lock.ctx);

>>                  if (unlikely(ret != 0))

>>                          return ret;

>>          } while (1);

>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct 

>> ttm_bo_device *bdev,

>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

>>                  while (!list_empty(&man->lru[i])) {

>>                          spin_unlock(&glob->lru_lock);

>> -                       ret = ttm_mem_evict_first(bdev, mem_type,

>> NULL, &ctx);

>> +                       ret = ttm_mem_evict_first(bdev, mem_type,

>> NULL, &ctx,

>> +                                                 NULL);

>>                          if (ret)

>>                                  return ret;

>>                          spin_lock(&glob->lru_lock); @@ -1797,7 

>> +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct 

>> ttm_operation_ctx *ctx)

>>          spin_lock(&glob->lru_lock);

>>          for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {

>>                  list_for_each_entry(bo, &glob->swap_lru[i], swap) {

>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx,

>> &locked)) {

>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx,

>> &locked,

>> + NULL)) {

>>                                  ret = 0;

>>                                  break;

>>                          }

>> --

>> 2.17.1

>>

>
Yeah, that shouldn't be a problem. We just need to make sure we don't 
busy wait for the BOs to become available.

Christian.

Am 24.05.19 um 07:35 schrieb Liang, Prike:
> Use Abaqus torturing the amdgpu driver more times will running into locking first busy BO deadlock .Then the caller will
> return EAGAIN and eventually dm_plane_helper_prepare_fb popups out pinned failed message .For this case, the patch#7
>   can we add EAGAIN as ERESTARTSYS which filter out the annoying error message .
>
> Thanks,
> Prike
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Thursday, May 23, 2019 7:04 PM
> To: Zhou, David(ChunMing) <David1.Zhou@amd.com>; Olsak, Marek <Marek.Olsak@amd.com>; Zhou, David(ChunMing) <David1.Zhou@amd.com>; Liang, Prike <Prike.Liang@amd.com>; dri-devel@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH 06/10] drm/ttm: fix busy memory to fail other user v10
>
> [CAUTION: External Email]
>
> Am 23.05.19 um 12:24 schrieb zhoucm1:
>>
>> On 2019年05月22日 20:59, Christian König wrote:
>>> [CAUTION: External Email]
>>>
>>> BOs on the LRU might be blocked during command submission and cause
>>> OOM situations.
>>>
>>> Avoid this by blocking for the first busy BO not locked by the same
>>> ticket as the BO we are searching space for.
>>>
>>> v10: completely start over with the patch since we didn't
>>>        handled a whole bunch of corner cases.
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> ---
>>>    drivers/gpu/drm/ttm/ttm_bo.c | 77 ++++++++++++++++++++++++++++++------
>>>    1 file changed, 66 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c
>>> b/drivers/gpu/drm/ttm/ttm_bo.c index 4c6389d849ed..861facac33d4
>>> 100644
>>> --- a/drivers/gpu/drm/ttm/ttm_bo.c
>>> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>>> @@ -771,32 +771,72 @@ EXPORT_SYMBOL(ttm_bo_eviction_valuable);
>>>     * b. Otherwise, trylock it.
>>>     */
>>>    static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object
>>> *bo,
>>> -                       struct ttm_operation_ctx *ctx, bool *locked)
>>> +                       struct ttm_operation_ctx *ctx, bool *locked,
>>> bool *busy)
>>>    {
>>>           bool ret = false;
>>>
>>> -       *locked = false;
>>>           if (bo->resv == ctx->resv) {
>>>                   reservation_object_assert_held(bo->resv);
>>>                   if (ctx->flags & TTM_OPT_FLAG_ALLOW_RES_EVICT
>>>                       || !list_empty(&bo->ddestroy))
>>>                           ret = true;
>>> +               *locked = false;
>>> +               if (busy)
>>> +                       *busy = false;
>>>           } else {
>>> -               *locked = reservation_object_trylock(bo->resv);
>>> -               ret = *locked;
>>> +               ret = reservation_object_trylock(bo->resv);
>>> +               *locked = ret;
>>> +               if (busy)
>>> +                       *busy = !ret;
>>>           }
>>>
>>>           return ret;
>>>    }
>>>
>>> +/**
>>> + * ttm_mem_evict_wait_busy - wait for a busy BO to become available
>>> + *
>>> + * @busy_bo: BO which couldn't be locked with trylock
>>> + * @ctx: operation context
>>> + * @ticket: acquire ticket
>>> + *
>>> + * Try to lock a busy buffer object to avoid failing eviction.
>>> + */
>>> +static int ttm_mem_evict_wait_busy(struct ttm_buffer_object *busy_bo,
>>> +                                  struct ttm_operation_ctx *ctx,
>>> +                                  struct ww_acquire_ctx *ticket) {
>>> +       int r;
>>> +
>>> +       if (!busy_bo || !ticket)
>>> +               return -EBUSY;
>>> +
>>> +       if (ctx->interruptible)
>>> +               r =
>>> + reservation_object_lock_interruptible(busy_bo->resv,
>>> + ticket);
>>> +       else
>>> +               r = reservation_object_lock(busy_bo->resv, ticket);
>>> +
>>> +       /*
>>> +        * TODO: It would be better to keep the BO locked until
>>> allocation is at
>>> +        * least tried one more time, but that would mean a much
>>> larger rework
>>> +        * of TTM.
>>> +        */
>>> +       if (!r)
>>> +               reservation_object_unlock(busy_bo->resv);
>>> +
>>> +       return r == -EDEADLK ? -EAGAIN : r; }
>>> +
>>>    static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
>>>                                  uint32_t mem_type,
>>>                                  const struct ttm_place *place,
>>> -                              struct ttm_operation_ctx *ctx)
>>> +                              struct ttm_operation_ctx *ctx,
>>> +                              struct ww_acquire_ctx *ticket)
>>>    {
>>> +       struct ttm_buffer_object *bo = NULL, *busy_bo = NULL;
>>>           struct ttm_bo_global *glob = bdev->glob;
>>>           struct ttm_mem_type_manager *man = &bdev->man[mem_type];
>>> -       struct ttm_buffer_object *bo = NULL;
>>>           bool locked = false;
>>>           unsigned i;
>>>           int ret;
>>> @@ -804,8 +844,15 @@ static int ttm_mem_evict_first(struct
>>> ttm_bo_device *bdev,
>>>           spin_lock(&glob->lru_lock);
>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>                   list_for_each_entry(bo, &man->lru[i], lru) {
>>> -                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked))
>>> +                       bool busy;
>>> +
>>> +                       if (!ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked,
>>> + &busy)) {
>>> +                               if (busy && !busy_bo &&
>>> +                                   bo->resv->lock.ctx != ticket)
>>> +                                       busy_bo = bo;
>>>                                   continue;
>>> +                       }
>>>
>>>                           if (place &&
>>> !bdev->driver->eviction_valuable(bo,
>>> place)) {
>>> @@ -824,8 +871,13 @@ static int ttm_mem_evict_first(struct
>>> ttm_bo_device *bdev,
>>>           }
>>>
>>>           if (!bo) {
>>> +               if (busy_bo)
>>> +                       ttm_bo_get(busy_bo);
>>>                   spin_unlock(&glob->lru_lock);
>>> -               return -EBUSY;
>>> +               ret = ttm_mem_evict_wait_busy(busy_bo, ctx, ticket);
>> If you rely on EAGAIN, why do you still try to lock busy_bo? any
>> negative effect if directly return EAGAIN without tring lock?
> Yeah, that would burn a lot of CPU cycles because we would essentially busy wait for the BO to become unlocked.
>
> When we only return in case of a deadlock the other thread can continue with its eviction while we reacquire all looks during EAGAIN handling.
>
> Even directly unlocking the BO as I do here is a bit questionable. But I couldn't get the original logic with finding a new BO to evict to work correctly, that's why I have the TODO comment in the function itself as well.
>
> Christian.
>
>> -David
>>> +               if (busy_bo)
>>> +                       ttm_bo_put(busy_bo);
>>> +               return ret;
>>>           }
>>>
>>>           kref_get(&bo->list_kref);
>>> @@ -911,7 +963,8 @@ static int ttm_bo_mem_force_space(struct
>>> ttm_buffer_object *bo,
>>>                           return ret;
>>>                   if (mem->mm_node)
>>>                           break;
>>> -               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>> ctx);
>>> +               ret = ttm_mem_evict_first(bdev, mem->mem_type, place,
>>> ctx,
>>> + bo->resv->lock.ctx);
>>>                   if (unlikely(ret != 0))
>>>                           return ret;
>>>           } while (1);
>>> @@ -1426,7 +1479,8 @@ static int ttm_bo_force_list_clean(struct
>>> ttm_bo_device *bdev,
>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>                   while (!list_empty(&man->lru[i])) {
>>>                           spin_unlock(&glob->lru_lock);
>>> -                       ret = ttm_mem_evict_first(bdev, mem_type,
>>> NULL, &ctx);
>>> +                       ret = ttm_mem_evict_first(bdev, mem_type,
>>> NULL, &ctx,
>>> +                                                 NULL);
>>>                           if (ret)
>>>                                   return ret;
>>>                           spin_lock(&glob->lru_lock); @@ -1797,7
>>> +1851,8 @@ int ttm_bo_swapout(struct ttm_bo_global *glob, struct
>>> ttm_operation_ctx *ctx)
>>>           spin_lock(&glob->lru_lock);
>>>           for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
>>>                   list_for_each_entry(bo, &glob->swap_lru[i], swap) {
>>> -                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked)) {
>>> +                       if (ttm_bo_evict_swapout_allowable(bo, ctx,
>>> &locked,
>>> + NULL)) {
>>>                                   ret = 0;
>>>                                   break;
>>>                           }
>>> --
>>> 2.17.1
>>>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx