[v2] drm/amdgpu: Fix the dead lock issue.

Submitted by Deng, Emily on Sept. 10, 2018, 9:34 a.m.

Details

Message ID 1536572057-22749-1-git-send-email-Emily.Deng@amd.com
State New
Headers show
Series "drm/amdgpu: Fix the dead lock issue." ( rev: 2 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Deng, Emily Sept. 10, 2018, 9:34 a.m.
It will ramdomly have the dead lock issue when test TDR:
1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock
2. amdgpu_bo_create locked the bo's resv lock
3. amdgpu_bo_create_shadow is waiting for the shadow_list_lock
4. amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
lock.

v2:
   Make a local copy of the list

Signed-off-by: Emily Deng <Emily.Deng@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +
 2 files changed, 12 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index acfc63e..2b9f597 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3006,6 +3006,9 @@  static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
 	long r = 1;
 	int i = 0;
 	long tmo;
+	struct list_head local_shadow_list;
+
+	INIT_LIST_HEAD(&local_shadow_list);
 
 	if (amdgpu_sriov_runtime(adev))
 		tmo = msecs_to_jiffies(8000);
@@ -3013,8 +3016,15 @@  static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
 		tmo = msecs_to_jiffies(100);
 
 	DRM_INFO("recover vram bo from shadow start\n");
+
 	mutex_lock(&adev->shadow_list_lock);
 	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+		amdgpu_bo_ref(bo);
+		list_add_tail(&bo->copy_shadow_list, &local_shadow_list);
+	}
+	mutex_unlock(&adev->shadow_list_lock);
+
+	list_for_each_entry_safe(bo, tmp, &local_shadow_list, copy_shadow_list) {
 		next = NULL;
 		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
 		if (fence) {
@@ -3033,8 +3043,8 @@  static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
 
 		dma_fence_put(fence);
 		fence = next;
+		amdgpu_bo_unref(&bo);
 	}
-	mutex_unlock(&adev->shadow_list_lock);
 
 	if (fence) {
 		r = dma_fence_wait_timeout(fence, false, tmo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 907fdf4..cfee16c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -103,6 +103,7 @@  struct amdgpu_bo {
 		struct list_head	mn_list;
 		struct list_head	shadow_list;
 	};
+        struct list_head                copy_shadow_list;
 
 	struct kgd_mem                  *kfd_bo;
 };

Comments

Am 10.09.2018 um 11:34 schrieb Emily Deng:
> It will ramdomly have the dead lock issue when test TDR:
> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock
> 2. amdgpu_bo_create locked the bo's resv lock
> 3. amdgpu_bo_create_shadow is waiting for the shadow_list_lock
> 4. amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
> lock.
>
> v2:
>     Make a local copy of the list
>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +
>   2 files changed, 12 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index acfc63e..2b9f597 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3006,6 +3006,9 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>   	long r = 1;
>   	int i = 0;
>   	long tmo;
> +	struct list_head local_shadow_list;
> +
> +	INIT_LIST_HEAD(&local_shadow_list);
>   
>   	if (amdgpu_sriov_runtime(adev))
>   		tmo = msecs_to_jiffies(8000);
> @@ -3013,8 +3016,15 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>   		tmo = msecs_to_jiffies(100);
>   
>   	DRM_INFO("recover vram bo from shadow start\n");
> +
>   	mutex_lock(&adev->shadow_list_lock);
>   	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
> +		amdgpu_bo_ref(bo);
> +		list_add_tail(&bo->copy_shadow_list, &local_shadow_list);
> +	}

Please don't add an extra copy_shadow_list field to amdgpu_bo.

Instead just use bo->shadow list for this. When you hold a reference to 
the BO it should not be removed from the shadow list.

Additional to that you can just use list_splice_init() to move the whole 
shadow list to your local list.

Christian.

> +	mutex_unlock(&adev->shadow_list_lock);
> +
> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list, copy_shadow_list) {
>   		next = NULL;
>   		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
>   		if (fence) {
> @@ -3033,8 +3043,8 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>   
>   		dma_fence_put(fence);
>   		fence = next;
> +		amdgpu_bo_unref(&bo);
>   	}
> -	mutex_unlock(&adev->shadow_list_lock);
>   
>   	if (fence) {
>   		r = dma_fence_wait_timeout(fence, false, tmo);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 907fdf4..cfee16c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -103,6 +103,7 @@ struct amdgpu_bo {
>   		struct list_head	mn_list;
>   		struct list_head	shadow_list;
>   	};
> +        struct list_head                copy_shadow_list;
>   
>   	struct kgd_mem                  *kfd_bo;
>   };
>-----Original Message-----

>From: Christian König <ckoenig.leichtzumerken@gmail.com>

>Sent: Monday, September 10, 2018 5:41 PM

>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>

>Am 10.09.2018 um 11:34 schrieb Emily Deng:

>> It will ramdomly have the dead lock issue when test TDR:

>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.

>> amdgpu_bo_create locked the bo's resv lock 3. amdgpu_bo_create_shadow

>> is waiting for the shadow_list_lock 4.

>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv

>> lock.

>>

>> v2:

>>     Make a local copy of the list

>>

>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>> ---

>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-

>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +

>>   2 files changed, 12 insertions(+), 1 deletion(-)

>>

>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> index acfc63e..2b9f597 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> @@ -3006,6 +3006,9 @@ static int

>amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>   	long r = 1;

>>   	int i = 0;

>>   	long tmo;

>> +	struct list_head local_shadow_list;

>> +

>> +	INIT_LIST_HEAD(&local_shadow_list);

>>

>>   	if (amdgpu_sriov_runtime(adev))

>>   		tmo = msecs_to_jiffies(8000);

>> @@ -3013,8 +3016,15 @@ static int

>amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>   		tmo = msecs_to_jiffies(100);

>>

>>   	DRM_INFO("recover vram bo from shadow start\n");

>> +

>>   	mutex_lock(&adev->shadow_list_lock);

>>   	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list)

>> {

>> +		amdgpu_bo_ref(bo);

>> +		list_add_tail(&bo->copy_shadow_list, &local_shadow_list);

>> +	}

>

>Please don't add an extra copy_shadow_list field to amdgpu_bo.

If don't use an extra variable, the local shadow list will change according the adev->shadow_list, 
both for adding or deleting, it is not we want.

>Instead just use bo->shadow list for this. When you hold a reference to the BO

>it should not be removed from the shadow list.

>

>Additional to that you can just use list_splice_init() to move the whole shadow

>list to your local list.

>

>Christian.

>

>> +	mutex_unlock(&adev->shadow_list_lock);

>> +

>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list,

>> +copy_shadow_list) {

>>   		next = NULL;

>>   		amdgpu_device_recover_vram_from_shadow(adev, ring, bo,

>&next);

>>   		if (fence) {

>> @@ -3033,8 +3043,8 @@ static int

>amdgpu_device_handle_vram_lost(struct

>> amdgpu_device *adev)

>>

>>   		dma_fence_put(fence);

>>   		fence = next;

>> +		amdgpu_bo_unref(&bo);

>>   	}

>> -	mutex_unlock(&adev->shadow_list_lock);

>>

>>   	if (fence) {

>>   		r = dma_fence_wait_timeout(fence, false, tmo); diff --git

>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>> index 907fdf4..cfee16c 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>> @@ -103,6 +103,7 @@ struct amdgpu_bo {

>>   		struct list_head	mn_list;

>>   		struct list_head	shadow_list;

>>   	};

>> +        struct list_head                copy_shadow_list;

>>

>>   	struct kgd_mem                  *kfd_bo;

>>   };
Am 10.09.2018 um 11:47 schrieb Deng, Emily:
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Monday, September 10, 2018 5:41 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.
>>
>> Am 10.09.2018 um 11:34 schrieb Emily Deng:
>>> It will ramdomly have the dead lock issue when test TDR:
>>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.
>>> amdgpu_bo_create locked the bo's resv lock 3. amdgpu_bo_create_shadow
>>> is waiting for the shadow_list_lock 4.
>>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
>>> lock.
>>>
>>> v2:
>>>      Make a local copy of the list
>>>
>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +
>>>    2 files changed, 12 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index acfc63e..2b9f597 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3006,6 +3006,9 @@ static int
>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>    	long r = 1;
>>>    	int i = 0;
>>>    	long tmo;
>>> +	struct list_head local_shadow_list;
>>> +
>>> +	INIT_LIST_HEAD(&local_shadow_list);
>>>
>>>    	if (amdgpu_sriov_runtime(adev))
>>>    		tmo = msecs_to_jiffies(8000);
>>> @@ -3013,8 +3016,15 @@ static int
>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>    		tmo = msecs_to_jiffies(100);
>>>
>>>    	DRM_INFO("recover vram bo from shadow start\n");
>>> +
>>>    	mutex_lock(&adev->shadow_list_lock);
>>>    	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list)
>>> {
>>> +		amdgpu_bo_ref(bo);
>>> +		list_add_tail(&bo->copy_shadow_list, &local_shadow_list);
>>> +	}
>> Please don't add an extra copy_shadow_list field to amdgpu_bo.
> If don't use an extra variable, the local shadow list will change according the adev->shadow_list,
> both for adding or deleting, it is not we want.

That is not correct, see amdgpu_bo_destroy:
>         if (!list_empty(&bo->shadow_list)) {
>                 mutex_lock(&adev->shadow_list_lock);
>                 list_del_init(&bo->shadow_list);
>                 mutex_unlock(&adev->shadow_list_lock);
>         }

The BO is only removed from the list when it is destroyed, since we 
grabbed a local reference it can't be destroyed. So we are safe here.

Regards,
Christian.

>
>> Instead just use bo->shadow list for this. When you hold a reference to the BO
>> it should not be removed from the shadow list.
>>
>> Additional to that you can just use list_splice_init() to move the whole shadow
>> list to your local list.
>>
>> Christian.
>>
>>> +	mutex_unlock(&adev->shadow_list_lock);
>>> +
>>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list,
>>> +copy_shadow_list) {
>>>    		next = NULL;
>>>    		amdgpu_device_recover_vram_from_shadow(adev, ring, bo,
>> &next);
>>>    		if (fence) {
>>> @@ -3033,8 +3043,8 @@ static int
>> amdgpu_device_handle_vram_lost(struct
>>> amdgpu_device *adev)
>>>
>>>    		dma_fence_put(fence);
>>>    		fence = next;
>>> +		amdgpu_bo_unref(&bo);
>>>    	}
>>> -	mutex_unlock(&adev->shadow_list_lock);
>>>
>>>    	if (fence) {
>>>    		r = dma_fence_wait_timeout(fence, false, tmo); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> index 907fdf4..cfee16c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> @@ -103,6 +103,7 @@ struct amdgpu_bo {
>>>    		struct list_head	mn_list;
>>>    		struct list_head	shadow_list;
>>>    	};
>>> +        struct list_head                copy_shadow_list;
>>>
>>>    	struct kgd_mem                  *kfd_bo;
>>>    };
>-----Original Message-----

>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of

>Christian König

>Sent: Monday, September 10, 2018 5:49 PM

>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>

>Am 10.09.2018 um 11:47 schrieb Deng, Emily:

>>> -----Original Message-----

>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>>> Sent: Monday, September 10, 2018 5:41 PM

>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>>>

>>> Am 10.09.2018 um 11:34 schrieb Emily Deng:

>>>> It will ramdomly have the dead lock issue when test TDR:

>>>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.

>>>> amdgpu_bo_create locked the bo's resv lock 3.

>>>> amdgpu_bo_create_shadow is waiting for the shadow_list_lock 4.

>>>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv

>>>> lock.

>>>>

>>>> v2:

>>>>      Make a local copy of the list

>>>>

>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>>>> ---

>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-

>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +

>>>>    2 files changed, 12 insertions(+), 1 deletion(-)

>>>>

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>> index acfc63e..2b9f597 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>> @@ -3006,6 +3006,9 @@ static int

>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>>>    	long r = 1;

>>>>    	int i = 0;

>>>>    	long tmo;

>>>> +	struct list_head local_shadow_list;

>>>> +

>>>> +	INIT_LIST_HEAD(&local_shadow_list);

>>>>

>>>>    	if (amdgpu_sriov_runtime(adev))

>>>>    		tmo = msecs_to_jiffies(8000);

>>>> @@ -3013,8 +3016,15 @@ static int

>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>>>    		tmo = msecs_to_jiffies(100);

>>>>

>>>>    	DRM_INFO("recover vram bo from shadow start\n");

>>>> +

>>>>    	mutex_lock(&adev->shadow_list_lock);

>>>>    	list_for_each_entry_safe(bo, tmp, &adev->shadow_list,

>>>> shadow_list) {

>>>> +		amdgpu_bo_ref(bo);

>>>> +		list_add_tail(&bo->copy_shadow_list, &local_shadow_list);

>>>> +	}

>>> Please don't add an extra copy_shadow_list field to amdgpu_bo.

>> If don't use an extra variable, the local shadow list will change

>> according the adev->shadow_list, both for adding or deleting, it is not we

>want.

>

>That is not correct, see amdgpu_bo_destroy:

>>         if (!list_empty(&bo->shadow_list)) {

>>                 mutex_lock(&adev->shadow_list_lock);

>>                 list_del_init(&bo->shadow_list);

>>                 mutex_unlock(&adev->shadow_list_lock);

>>         }

>

>The BO is only removed from the list when it is destroyed, since we grabbed a

>local reference it can't be destroyed. So we are safe here.

Sorry I am not meaning the delete, what about the adding.
>Regards,

>Christian.

>

>>

>>> Instead just use bo->shadow list for this. When you hold a reference

>>> to the BO it should not be removed from the shadow list.

>>>

>>> Additional to that you can just use list_splice_init() to move the

>>> whole shadow list to your local list.

>>>

>>> Christian.

>>>

>>>> +	mutex_unlock(&adev->shadow_list_lock);

>>>> +

>>>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list,

>>>> +copy_shadow_list) {

>>>>    		next = NULL;

>>>>    		amdgpu_device_recover_vram_from_shadow(adev, ring, bo,

>>> &next);

>>>>    		if (fence) {

>>>> @@ -3033,8 +3043,8 @@ static int

>>> amdgpu_device_handle_vram_lost(struct

>>>> amdgpu_device *adev)

>>>>

>>>>    		dma_fence_put(fence);

>>>>    		fence = next;

>>>> +		amdgpu_bo_unref(&bo);

>>>>    	}

>>>> -	mutex_unlock(&adev->shadow_list_lock);

>>>>

>>>>    	if (fence) {

>>>>    		r = dma_fence_wait_timeout(fence, false, tmo); diff --git

>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>> index 907fdf4..cfee16c 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>> @@ -103,6 +103,7 @@ struct amdgpu_bo {

>>>>    		struct list_head	mn_list;

>>>>    		struct list_head	shadow_list;

>>>>    	};

>>>> +        struct list_head                copy_shadow_list;

>>>>

>>>>    	struct kgd_mem                  *kfd_bo;

>>>>    };

>

>_______________________________________________

>amd-gfx mailing list

>amd-gfx@lists.freedesktop.org

>https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Am 10.09.2018 um 11:55 schrieb Deng, Emily:
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Christian König
>> Sent: Monday, September 10, 2018 5:49 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.
>>
>> Am 10.09.2018 um 11:47 schrieb Deng, Emily:
>>>> -----Original Message-----
>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> Sent: Monday, September 10, 2018 5:41 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.
>>>>
>>>> Am 10.09.2018 um 11:34 schrieb Emily Deng:
>>>>> It will ramdomly have the dead lock issue when test TDR:
>>>>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.
>>>>> amdgpu_bo_create locked the bo's resv lock 3.
>>>>> amdgpu_bo_create_shadow is waiting for the shadow_list_lock 4.
>>>>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
>>>>> lock.
>>>>>
>>>>> v2:
>>>>>       Make a local copy of the list
>>>>>
>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +
>>>>>     2 files changed, 12 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index acfc63e..2b9f597 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -3006,6 +3006,9 @@ static int
>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>>>     	long r = 1;
>>>>>     	int i = 0;
>>>>>     	long tmo;
>>>>> +	struct list_head local_shadow_list;
>>>>> +
>>>>> +	INIT_LIST_HEAD(&local_shadow_list);
>>>>>
>>>>>     	if (amdgpu_sriov_runtime(adev))
>>>>>     		tmo = msecs_to_jiffies(8000);
>>>>> @@ -3013,8 +3016,15 @@ static int
>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>>>     		tmo = msecs_to_jiffies(100);
>>>>>
>>>>>     	DRM_INFO("recover vram bo from shadow start\n");
>>>>> +
>>>>>     	mutex_lock(&adev->shadow_list_lock);
>>>>>     	list_for_each_entry_safe(bo, tmp, &adev->shadow_list,
>>>>> shadow_list) {
>>>>> +		amdgpu_bo_ref(bo);
>>>>> +		list_add_tail(&bo->copy_shadow_list, &local_shadow_list);
>>>>> +	}
>>>> Please don't add an extra copy_shadow_list field to amdgpu_bo.
>>> If don't use an extra variable, the local shadow list will change
>>> according the adev->shadow_list, both for adding or deleting, it is not we
>> want.
>>
>> That is not correct, see amdgpu_bo_destroy:
>>>          if (!list_empty(&bo->shadow_list)) {
>>>                  mutex_lock(&adev->shadow_list_lock);
>>>                  list_del_init(&bo->shadow_list);
>>>                  mutex_unlock(&adev->shadow_list_lock);
>>>          }
>> The BO is only removed from the list when it is destroyed, since we grabbed a
>> local reference it can't be destroyed. So we are safe here.
> Sorry I am not meaning the delete, what about the adding.

That will still go to adev->shadow_list and not affect our local list in 
any way.

We are not interested in any newly allocated shadow BOs, so that should 
be unproblematic.

Regards,
Christian.

>> Regards,
>> Christian.
>>
>>>> Instead just use bo->shadow list for this. When you hold a reference
>>>> to the BO it should not be removed from the shadow list.
>>>>
>>>> Additional to that you can just use list_splice_init() to move the
>>>> whole shadow list to your local list.
>>>>
>>>> Christian.
>>>>
>>>>> +	mutex_unlock(&adev->shadow_list_lock);
>>>>> +
>>>>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list,
>>>>> +copy_shadow_list) {
>>>>>     		next = NULL;
>>>>>     		amdgpu_device_recover_vram_from_shadow(adev, ring, bo,
>>>> &next);
>>>>>     		if (fence) {
>>>>> @@ -3033,8 +3043,8 @@ static int
>>>> amdgpu_device_handle_vram_lost(struct
>>>>> amdgpu_device *adev)
>>>>>
>>>>>     		dma_fence_put(fence);
>>>>>     		fence = next;
>>>>> +		amdgpu_bo_unref(&bo);
>>>>>     	}
>>>>> -	mutex_unlock(&adev->shadow_list_lock);
>>>>>
>>>>>     	if (fence) {
>>>>>     		r = dma_fence_wait_timeout(fence, false, tmo); diff --git
>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>> index 907fdf4..cfee16c 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>> @@ -103,6 +103,7 @@ struct amdgpu_bo {
>>>>>     		struct list_head	mn_list;
>>>>>     		struct list_head	shadow_list;
>>>>>     	};
>>>>> +        struct list_head                copy_shadow_list;
>>>>>
>>>>>     	struct kgd_mem                  *kfd_bo;
>>>>>     };
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>-----Original Message-----

>From: Koenig, Christian

>Sent: Monday, September 10, 2018 6:02 PM

>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>

>Am 10.09.2018 um 11:55 schrieb Deng, Emily:

>>> -----Original Message-----

>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of

>>> Christian König

>>> Sent: Monday, September 10, 2018 5:49 PM

>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>>>

>>> Am 10.09.2018 um 11:47 schrieb Deng, Emily:

>>>>> -----Original Message-----

>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>>>>> Sent: Monday, September 10, 2018 5:41 PM

>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>>>>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>>>>>

>>>>> Am 10.09.2018 um 11:34 schrieb Emily Deng:

>>>>>> It will ramdomly have the dead lock issue when test TDR:

>>>>>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.

>>>>>> amdgpu_bo_create locked the bo's resv lock 3.

>>>>>> amdgpu_bo_create_shadow is waiting for the shadow_list_lock 4.

>>>>>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's

>>>>>> resv lock.

>>>>>>

>>>>>> v2:

>>>>>>       Make a local copy of the list

>>>>>>

>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>>>>>> ---

>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++++-

>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +

>>>>>>     2 files changed, 12 insertions(+), 1 deletion(-)

>>>>>>

>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>>>> index acfc63e..2b9f597 100644

>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>>>> @@ -3006,6 +3006,9 @@ static int

>>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>>>>>     	long r = 1;

>>>>>>     	int i = 0;

>>>>>>     	long tmo;

>>>>>> +	struct list_head local_shadow_list;

>>>>>> +

>>>>>> +	INIT_LIST_HEAD(&local_shadow_list);

>>>>>>

>>>>>>     	if (amdgpu_sriov_runtime(adev))

>>>>>>     		tmo = msecs_to_jiffies(8000); @@ -3013,8 +3016,15

>@@ static

>>>>>> int

>>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>>>>>     		tmo = msecs_to_jiffies(100);

>>>>>>

>>>>>>     	DRM_INFO("recover vram bo from shadow start\n");

>>>>>> +

>>>>>>     	mutex_lock(&adev->shadow_list_lock);

>>>>>>     	list_for_each_entry_safe(bo, tmp, &adev->shadow_list,

>>>>>> shadow_list) {

>>>>>> +		amdgpu_bo_ref(bo);

>>>>>> +		list_add_tail(&bo->copy_shadow_list, &local_shadow_list);

>>>>>> +	}

>>>>> Please don't add an extra copy_shadow_list field to amdgpu_bo.

>>>> If don't use an extra variable, the local shadow list will change

>>>> according the adev->shadow_list, both for adding or deleting, it is

>>>> not we

>>> want.

>>>

>>> That is not correct, see amdgpu_bo_destroy:

>>>>          if (!list_empty(&bo->shadow_list)) {

>>>>                  mutex_lock(&adev->shadow_list_lock);

>>>>                  list_del_init(&bo->shadow_list);

>>>>                  mutex_unlock(&adev->shadow_list_lock);

>>>>          }

>>> The BO is only removed from the list when it is destroyed, since we

>>> grabbed a local reference it can't be destroyed. So we are safe here.

>> Sorry I am not meaning the delete, what about the adding.

>

>That will still go to adev->shadow_list and not affect our local list in any way.


>We are not interested in any newly allocated shadow BOs, so that should be

>unproblematic.

Understand, will send a patch later.

Best wishes
Emily Deng
>Regards,

>Christian.

>

>>> Regards,

>>> Christian.

>>>

>>>>> Instead just use bo->shadow list for this. When you hold a

>>>>> reference to the BO it should not be removed from the shadow list.

>>>>>

>>>>> Additional to that you can just use list_splice_init() to move the

>>>>> whole shadow list to your local list.

>>>>>

>>>>> Christian.

>>>>>

>>>>>> +	mutex_unlock(&adev->shadow_list_lock);

>>>>>> +

>>>>>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list,

>>>>>> +copy_shadow_list) {

>>>>>>     		next = NULL;

>>>>>>     		amdgpu_device_recover_vram_from_shadow(adev,

>ring, bo,

>>>>> &next);

>>>>>>     		if (fence) {

>>>>>> @@ -3033,8 +3043,8 @@ static int

>>>>> amdgpu_device_handle_vram_lost(struct

>>>>>> amdgpu_device *adev)

>>>>>>

>>>>>>     		dma_fence_put(fence);

>>>>>>     		fence = next;

>>>>>> +		amdgpu_bo_unref(&bo);

>>>>>>     	}

>>>>>> -	mutex_unlock(&adev->shadow_list_lock);

>>>>>>

>>>>>>     	if (fence) {

>>>>>>     		r = dma_fence_wait_timeout(fence, false, tmo); diff --

>git

>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>>>> index 907fdf4..cfee16c 100644

>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>>>> @@ -103,6 +103,7 @@ struct amdgpu_bo {

>>>>>>     		struct list_head	mn_list;

>>>>>>     		struct list_head	shadow_list;

>>>>>>     	};

>>>>>> +        struct list_head                copy_shadow_list;

>>>>>>

>>>>>>     	struct kgd_mem                  *kfd_bo;

>>>>>>     };

>>> _______________________________________________

>>> amd-gfx mailing list

>>> amd-gfx@lists.freedesktop.org

>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>-----Original Message-----

>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Deng,

>Emily

>Sent: Monday, September 10, 2018 6:33 PM

>To: Koenig, Christian <Christian.Koenig@amd.com>; amd-

>gfx@lists.freedesktop.org

>Subject: RE: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>

>>-----Original Message-----

>>From: Koenig, Christian

>>Sent: Monday, September 10, 2018 6:02 PM

>>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>>Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>>

>>Am 10.09.2018 um 11:55 schrieb Deng, Emily:

>>>> -----Original Message-----

>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of

>>>> Christian König

>>>> Sent: Monday, September 10, 2018 5:49 PM

>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>>>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>>>>

>>>> Am 10.09.2018 um 11:47 schrieb Deng, Emily:

>>>>>> -----Original Message-----

>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>>>>>> Sent: Monday, September 10, 2018 5:41 PM

>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;

>>>>>> amd-gfx@lists.freedesktop.org

>>>>>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.

>>>>>>

>>>>>> Am 10.09.2018 um 11:34 schrieb Emily Deng:

>>>>>>> It will ramdomly have the dead lock issue when test TDR:

>>>>>>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.

>>>>>>> amdgpu_bo_create locked the bo's resv lock 3.

>>>>>>> amdgpu_bo_create_shadow is waiting for the shadow_list_lock 4.

>>>>>>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's

>>>>>>> resv lock.

>>>>>>>

>>>>>>> v2:

>>>>>>>       Make a local copy of the list

>>>>>>>

>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>>>>>>> ---

>>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12

>+++++++++++-

>>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +

>>>>>>>     2 files changed, 12 insertions(+), 1 deletion(-)

>>>>>>>

>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>>>>> index acfc63e..2b9f597 100644

>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>>>>> @@ -3006,6 +3006,9 @@ static int

>>>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>>>>>>     	long r = 1;

>>>>>>>     	int i = 0;

>>>>>>>     	long tmo;

>>>>>>> +	struct list_head local_shadow_list;

>>>>>>> +

>>>>>>> +	INIT_LIST_HEAD(&local_shadow_list);

>>>>>>>

>>>>>>>     	if (amdgpu_sriov_runtime(adev))

>>>>>>>     		tmo = msecs_to_jiffies(8000); @@ -3013,8 +3016,15

>>@@ static

>>>>>>> int

>>>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>>>>>>     		tmo = msecs_to_jiffies(100);

>>>>>>>

>>>>>>>     	DRM_INFO("recover vram bo from shadow start\n");

>>>>>>> +

>>>>>>>     	mutex_lock(&adev->shadow_list_lock);

>>>>>>>     	list_for_each_entry_safe(bo, tmp, &adev->shadow_list,

>>>>>>> shadow_list) {

>>>>>>> +		amdgpu_bo_ref(bo);

>>>>>>> +		list_add_tail(&bo->copy_shadow_list,

>&local_shadow_list);

>>>>>>> +	}

>>>>>> Please don't add an extra copy_shadow_list field to amdgpu_bo.

>>>>> If don't use an extra variable, the local shadow list will change

>>>>> according the adev->shadow_list, both for adding or deleting, it is

>>>>> not we

>>>> want.

>>>>

>>>> That is not correct, see amdgpu_bo_destroy:

>>>>>          if (!list_empty(&bo->shadow_list)) {

>>>>>                  mutex_lock(&adev->shadow_list_lock);

>>>>>                  list_del_init(&bo->shadow_list);

>>>>>                  mutex_unlock(&adev->shadow_list_lock);

>>>>>          }

>>>> The BO is only removed from the list when it is destroyed, since we

>>>> grabbed a local reference it can't be destroyed. So we are safe here.

The amdgpu_bo_ref could only avoid deleting the bo, but
when it is already in amdgpu_bo_destroy, the shadow_list seems still will be delete.
>>> Sorry I am not meaning the delete, what about the adding.

>>

>>That will still go to adev->shadow_list and not affect our local list in any way.

>

>>We are not interested in any newly allocated shadow BOs, so that should

>>be unproblematic.

>Understand, will send a patch later.

>

>Best wishes

>Emily Deng

>>Regards,

>>Christian.

>>

>>>> Regards,

>>>> Christian.

>>>>

>>>>>> Instead just use bo->shadow list for this. When you hold a

>>>>>> reference to the BO it should not be removed from the shadow list.

>>>>>>

>>>>>> Additional to that you can just use list_splice_init() to move the

>>>>>> whole shadow list to your local list.

>>>>>>

>>>>>> Christian.

>>>>>>

>>>>>>> +	mutex_unlock(&adev->shadow_list_lock);

>>>>>>> +

>>>>>>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list,

>>>>>>> +copy_shadow_list) {

>>>>>>>     		next = NULL;

>>>>>>>     		amdgpu_device_recover_vram_from_shadow(adev,

>>ring, bo,

>>>>>> &next);

>>>>>>>     		if (fence) {

>>>>>>> @@ -3033,8 +3043,8 @@ static int

>>>>>> amdgpu_device_handle_vram_lost(struct

>>>>>>> amdgpu_device *adev)

>>>>>>>

>>>>>>>     		dma_fence_put(fence);

>>>>>>>     		fence = next;

>>>>>>> +		amdgpu_bo_unref(&bo);

>>>>>>>     	}

>>>>>>> -	mutex_unlock(&adev->shadow_list_lock);

>>>>>>>

>>>>>>>     	if (fence) {

>>>>>>>     		r = dma_fence_wait_timeout(fence, false, tmo); diff --

>>git

>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>>>>> index 907fdf4..cfee16c 100644

>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

>>>>>>> @@ -103,6 +103,7 @@ struct amdgpu_bo {

>>>>>>>     		struct list_head	mn_list;

>>>>>>>     		struct list_head	shadow_list;

>>>>>>>     	};

>>>>>>> +        struct list_head                copy_shadow_list;

>>>>>>>

>>>>>>>     	struct kgd_mem                  *kfd_bo;

>>>>>>>     };

>>>> _______________________________________________

>>>> amd-gfx mailing list

>>>> amd-gfx@lists.freedesktop.org

>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

>

>_______________________________________________

>amd-gfx mailing list

>amd-gfx@lists.freedesktop.org

>https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Am 11.09.2018 um 04:16 schrieb Deng, Emily:
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Deng,
>> Emily
>> Sent: Monday, September 10, 2018 6:33 PM
>> To: Koenig, Christian <Christian.Koenig@amd.com>; amd-
>> gfx@lists.freedesktop.org
>> Subject: RE: [PATCH v2] drm/amdgpu: Fix the dead lock issue.
>>
>>> -----Original Message-----
>>> From: Koenig, Christian
>>> Sent: Monday, September 10, 2018 6:02 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.
>>>
>>> Am 10.09.2018 um 11:55 schrieb Deng, Emily:
>>>>> -----Original Message-----
>>>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>>>>> Christian König
>>>>> Sent: Monday, September 10, 2018 5:49 PM
>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.
>>>>>
>>>>> Am 10.09.2018 um 11:47 schrieb Deng, Emily:
>>>>>>> -----Original Message-----
>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>>> Sent: Monday, September 10, 2018 5:41 PM
>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;
>>>>>>> amd-gfx@lists.freedesktop.org
>>>>>>> Subject: Re: [PATCH v2] drm/amdgpu: Fix the dead lock issue.
>>>>>>>
>>>>>>> Am 10.09.2018 um 11:34 schrieb Emily Deng:
>>>>>>>> It will ramdomly have the dead lock issue when test TDR:
>>>>>>>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.
>>>>>>>> amdgpu_bo_create locked the bo's resv lock 3.
>>>>>>>> amdgpu_bo_create_shadow is waiting for the shadow_list_lock 4.
>>>>>>>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's
>>>>>>>> resv lock.
>>>>>>>>
>>>>>>>> v2:
>>>>>>>>        Make a local copy of the list
>>>>>>>>
>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12
>> +++++++++++-
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  1 +
>>>>>>>>      2 files changed, 12 insertions(+), 1 deletion(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> index acfc63e..2b9f597 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> @@ -3006,6 +3006,9 @@ static int
>>>>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>>>>>>      	long r = 1;
>>>>>>>>      	int i = 0;
>>>>>>>>      	long tmo;
>>>>>>>> +	struct list_head local_shadow_list;
>>>>>>>> +
>>>>>>>> +	INIT_LIST_HEAD(&local_shadow_list);
>>>>>>>>
>>>>>>>>      	if (amdgpu_sriov_runtime(adev))
>>>>>>>>      		tmo = msecs_to_jiffies(8000); @@ -3013,8 +3016,15
>>> @@ static
>>>>>>>> int
>>>>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>>>>>>      		tmo = msecs_to_jiffies(100);
>>>>>>>>
>>>>>>>>      	DRM_INFO("recover vram bo from shadow start\n");
>>>>>>>> +
>>>>>>>>      	mutex_lock(&adev->shadow_list_lock);
>>>>>>>>      	list_for_each_entry_safe(bo, tmp, &adev->shadow_list,
>>>>>>>> shadow_list) {
>>>>>>>> +		amdgpu_bo_ref(bo);
>>>>>>>> +		list_add_tail(&bo->copy_shadow_list,
>> &local_shadow_list);
>>>>>>>> +	}
>>>>>>> Please don't add an extra copy_shadow_list field to amdgpu_bo.
>>>>>> If don't use an extra variable, the local shadow list will change
>>>>>> according the adev->shadow_list, both for adding or deleting, it is
>>>>>> not we
>>>>> want.
>>>>>
>>>>> That is not correct, see amdgpu_bo_destroy:
>>>>>>           if (!list_empty(&bo->shadow_list)) {
>>>>>>                   mutex_lock(&adev->shadow_list_lock);
>>>>>>                   list_del_init(&bo->shadow_list);
>>>>>>                   mutex_unlock(&adev->shadow_list_lock);
>>>>>>           }
>>>>> The BO is only removed from the list when it is destroyed, since we
>>>>> grabbed a local reference it can't be destroyed. So we are safe here.
> The amdgpu_bo_ref could only avoid deleting the bo, but
> when it is already in amdgpu_bo_destroy, the shadow_list seems still will be delete.

Yeah, had the same thought this night. To make it even worse the 
reservation object will already be destroyed and so we actually can't 
reserve the BOs here.

Christian.

>>>> Sorry I am not meaning the delete, what about the adding.
>>> That will still go to adev->shadow_list and not affect our local list in any way.
>>> We are not interested in any newly allocated shadow BOs, so that should
>>> be unproblematic.
>> Understand, will send a patch later.
>>
>> Best wishes
>> Emily Deng
>>> Regards,
>>> Christian.
>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>> Instead just use bo->shadow list for this. When you hold a
>>>>>>> reference to the BO it should not be removed from the shadow list.
>>>>>>>
>>>>>>> Additional to that you can just use list_splice_init() to move the
>>>>>>> whole shadow list to your local list.
>>>>>>>
>>>>>>> Christian.
>>>>>>>
>>>>>>>> +	mutex_unlock(&adev->shadow_list_lock);
>>>>>>>> +
>>>>>>>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list,
>>>>>>>> +copy_shadow_list) {
>>>>>>>>      		next = NULL;
>>>>>>>>      		amdgpu_device_recover_vram_from_shadow(adev,
>>> ring, bo,
>>>>>>> &next);
>>>>>>>>      		if (fence) {
>>>>>>>> @@ -3033,8 +3043,8 @@ static int
>>>>>>> amdgpu_device_handle_vram_lost(struct
>>>>>>>> amdgpu_device *adev)
>>>>>>>>
>>>>>>>>      		dma_fence_put(fence);
>>>>>>>>      		fence = next;
>>>>>>>> +		amdgpu_bo_unref(&bo);
>>>>>>>>      	}
>>>>>>>> -	mutex_unlock(&adev->shadow_list_lock);
>>>>>>>>
>>>>>>>>      	if (fence) {
>>>>>>>>      		r = dma_fence_wait_timeout(fence, false, tmo); diff --
>>> git
>>>>>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>>>>> index 907fdf4..cfee16c 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>>>>>> @@ -103,6 +103,7 @@ struct amdgpu_bo {
>>>>>>>>      		struct list_head	mn_list;
>>>>>>>>      		struct list_head	shadow_list;
>>>>>>>>      	};
>>>>>>>> +        struct list_head                copy_shadow_list;
>>>>>>>>
>>>>>>>>      	struct kgd_mem                  *kfd_bo;
>>>>>>>>      };
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx