drm/amdgpu: Fix the dead lock issue.

Submitted by Deng, Emily on Sept. 11, 2018, 2:51 a.m.

Details

Message ID 1536634293-26099-1-git-send-email-Emily.Deng@amd.com
State New
Headers show
Series "drm/amdgpu: Fix the dead lock issue." ( rev: 3 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Deng, Emily Sept. 11, 2018, 2:51 a.m.
It will ramdomly have the dead lock issue when test TDR:
1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock
2. amdgpu_bo_create locked the bo's resv lock
3. amdgpu_bo_create_shadow is waiting for the shadow_list_lock
4. amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
lock.

v2:
   Make a local copy of the list

Signed-off-by: Emily Deng <Emily.Deng@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2a21267..8c81404 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3105,6 +3105,9 @@  static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
 	long r = 1;
 	int i = 0;
 	long tmo;
+	struct list_head local_shadow_list;
+
+	INIT_LIST_HEAD(&local_shadow_list);
 
 	if (amdgpu_sriov_runtime(adev))
 		tmo = msecs_to_jiffies(8000);
@@ -3112,8 +3115,19 @@  static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
 		tmo = msecs_to_jiffies(100);
 
 	DRM_INFO("recover vram bo from shadow start\n");
+
+	mutex_lock(&adev->shadow_list_lock);
+	list_splice_init(&adev->shadow_list, &local_shadow_list);
+	mutex_unlock(&adev->shadow_list_lock);
+
+
 	mutex_lock(&adev->shadow_list_lock);
-	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+	list_for_each_entry_safe(bo, tmp, &local_shadow_list, shadow_list) {
+		mutex_unlock(&adev->shadow_list_lock);
+
+		if (!bo)
+			continue;
+
 		next = NULL;
 		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
 		if (fence) {
@@ -3132,9 +3146,14 @@  static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
 
 		dma_fence_put(fence);
 		fence = next;
+		mutex_lock(&adev->shadow_list_lock);
 	}
 	mutex_unlock(&adev->shadow_list_lock);
 
+	mutex_lock(&adev->shadow_list_lock);
+	list_splice_init(&local_shadow_list, &adev->shadow_list);
+	mutex_unlock(&adev->shadow_list_lock);
+
 	if (fence) {
 		r = dma_fence_wait_timeout(fence, false, tmo);
 		if (r == 0)

Comments

On 09/11/2018 10:51 AM, Emily Deng wrote:
> It will ramdomly have the dead lock issue when test TDR:
> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock
> 2. amdgpu_bo_create locked the bo's resv lock
> 3. amdgpu_bo_create_shadow is waiting for the shadow_list_lock
> 4. amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
> lock.
>
> v2:
>     Make a local copy of the list
>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++++++++++++++++++-
>   1 file changed, 20 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2a21267..8c81404 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3105,6 +3105,9 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>   	long r = 1;
>   	int i = 0;
>   	long tmo;
> +	struct list_head local_shadow_list;
> +
> +	INIT_LIST_HEAD(&local_shadow_list);
>
>   	if (amdgpu_sriov_runtime(adev))
>   		tmo = msecs_to_jiffies(8000);
> @@ -3112,8 +3115,19 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>   		tmo = msecs_to_jiffies(100);
>
>   	DRM_INFO("recover vram bo from shadow start\n");
> +
> +	mutex_lock(&adev->shadow_list_lock);
> +	list_splice_init(&adev->shadow_list, &local_shadow_list);
> +	mutex_unlock(&adev->shadow_list_lock);
> +
> +
>   	mutex_lock(&adev->shadow_list_lock);
> -	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list, shadow_list) {
> +		mutex_unlock(&adev->shadow_list_lock);

We may not to use shadow_list_lock when traverse the local shadow list.

Regards,
Jerry

> +
> +		if (!bo)
> +			continue;
> +
>   		next = NULL;
>   		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
>   		if (fence) {
> @@ -3132,9 +3146,14 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>
>   		dma_fence_put(fence);
>   		fence = next;
> +		mutex_lock(&adev->shadow_list_lock);
>   	}
>   	mutex_unlock(&adev->shadow_list_lock);
>
> +	mutex_lock(&adev->shadow_list_lock);
> +	list_splice_init(&local_shadow_list, &adev->shadow_list);
> +	mutex_unlock(&adev->shadow_list_lock);
> +
>   	if (fence) {
>   		r = dma_fence_wait_timeout(fence, false, tmo);
>   		if (r == 0)
>
On 2018年09月11日 10:51, Emily Deng wrote:
> It will ramdomly have the dead lock issue when test TDR:
> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock
> 2. amdgpu_bo_create locked the bo's resv lock
> 3. amdgpu_bo_create_shadow is waiting for the shadow_list_lock
> 4. amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
> lock.
>
> v2:
>     Make a local copy of the list
>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 ++++++++++++++++++++-
>   1 file changed, 20 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2a21267..8c81404 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3105,6 +3105,9 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>   	long r = 1;
>   	int i = 0;
>   	long tmo;
> +	struct list_head local_shadow_list;
> +
> +	INIT_LIST_HEAD(&local_shadow_list);
>   
>   	if (amdgpu_sriov_runtime(adev))
>   		tmo = msecs_to_jiffies(8000);
> @@ -3112,8 +3115,19 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>   		tmo = msecs_to_jiffies(100);
>   
>   	DRM_INFO("recover vram bo from shadow start\n");
> +
> +	mutex_lock(&adev->shadow_list_lock);
> +	list_splice_init(&adev->shadow_list, &local_shadow_list);
> +	mutex_unlock(&adev->shadow_list_lock);
> +
> +
>   	mutex_lock(&adev->shadow_list_lock);
local_shadow_list is a local variable, I think it doesn't need lock at 
all, no one change it. Otherwise looks good to me.

Thanks,
David Zhou
> -	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list, shadow_list) {
> +		mutex_unlock(&adev->shadow_list_lock);
> +
> +		if (!bo)
> +			continue;
> +
>   		next = NULL;
>   		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
>   		if (fence) {
> @@ -3132,9 +3146,14 @@ static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>   
>   		dma_fence_put(fence);
>   		fence = next;
> +		mutex_lock(&adev->shadow_list_lock);
>   	}
>   	mutex_unlock(&adev->shadow_list_lock);
>   
> +	mutex_lock(&adev->shadow_list_lock);
> +	list_splice_init(&local_shadow_list, &adev->shadow_list);
> +	mutex_unlock(&adev->shadow_list_lock);
> +
>   	if (fence) {
>   		r = dma_fence_wait_timeout(fence, false, tmo);
>   		if (r == 0)
>-----Original Message-----

>From: Zhou, David(ChunMing)

>Sent: Tuesday, September 11, 2018 11:03 AM

>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>Subject: Re: [PATCH] drm/amdgpu: Fix the dead lock issue.

>

>

>

>On 2018年09月11日 10:51, Emily Deng wrote:

>> It will ramdomly have the dead lock issue when test TDR:

>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.

>> amdgpu_bo_create locked the bo's resv lock 3. amdgpu_bo_create_shadow

>> is waiting for the shadow_list_lock 4.

>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv

>> lock.

>>

>> v2:

>>     Make a local copy of the list

>>

>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>> ---

>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21

>++++++++++++++++++++-

>>   1 file changed, 20 insertions(+), 1 deletion(-)

>>

>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> index 2a21267..8c81404 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> @@ -3105,6 +3105,9 @@ static int

>amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>   	long r = 1;

>>   	int i = 0;

>>   	long tmo;

>> +	struct list_head local_shadow_list;

>> +

>> +	INIT_LIST_HEAD(&local_shadow_list);

>>

>>   	if (amdgpu_sriov_runtime(adev))

>>   		tmo = msecs_to_jiffies(8000);

>> @@ -3112,8 +3115,19 @@ static int

>amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>   		tmo = msecs_to_jiffies(100);

>>

>>   	DRM_INFO("recover vram bo from shadow start\n");

>> +

>> +	mutex_lock(&adev->shadow_list_lock);

>> +	list_splice_init(&adev->shadow_list, &local_shadow_list);

>> +	mutex_unlock(&adev->shadow_list_lock);

>> +

>> +

>>   	mutex_lock(&adev->shadow_list_lock);

>local_shadow_list is a local variable, I think it doesn't need lock at all, no one

>change it. Otherwise looks good to me.

The bo->shadow_list which now is in local_shadow_list maybe destroy in case that it already in amdgpu_bo_destroy, then it will
change local_shadow_list, so need lock the shadow_list_lock.
Best wishes
Emily Deng
>Thanks,

>David Zhou

>> -	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {

>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list, shadow_list) {

>> +		mutex_unlock(&adev->shadow_list_lock);

>> +

>> +		if (!bo)

>> +			continue;

>> +

>>   		next = NULL;

>>   		amdgpu_device_recover_vram_from_shadow(adev, ring, bo,

>&next);

>>   		if (fence) {

>> @@ -3132,9 +3146,14 @@ static int

>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>

>>   		dma_fence_put(fence);

>>   		fence = next;

>> +		mutex_lock(&adev->shadow_list_lock);

>>   	}

>>   	mutex_unlock(&adev->shadow_list_lock);

>>

>> +	mutex_lock(&adev->shadow_list_lock);

>> +	list_splice_init(&local_shadow_list, &adev->shadow_list);

>> +	mutex_unlock(&adev->shadow_list_lock);

>> +

>>   	if (fence) {

>>   		r = dma_fence_wait_timeout(fence, false, tmo);

>>   		if (r == 0)
On 2018年09月11日 11:23, Deng, Emily wrote:
>> -----Original Message-----
>> From: Zhou, David(ChunMing)
>> Sent: Tuesday, September 11, 2018 11:03 AM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the dead lock issue.
>>
>>
>>
>> On 2018年09月11日 10:51, Emily Deng wrote:
>>> It will ramdomly have the dead lock issue when test TDR:
>>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.
>>> amdgpu_bo_create locked the bo's resv lock 3. amdgpu_bo_create_shadow
>>> is waiting for the shadow_list_lock 4.
>>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
>>> lock.
>>>
>>> v2:
>>>      Make a local copy of the list
>>>
>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21
>> ++++++++++++++++++++-
>>>    1 file changed, 20 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 2a21267..8c81404 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3105,6 +3105,9 @@ static int
>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>    	long r = 1;
>>>    	int i = 0;
>>>    	long tmo;
>>> +	struct list_head local_shadow_list;
>>> +
>>> +	INIT_LIST_HEAD(&local_shadow_list);
>>>
>>>    	if (amdgpu_sriov_runtime(adev))
>>>    		tmo = msecs_to_jiffies(8000);
>>> @@ -3112,8 +3115,19 @@ static int
>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>    		tmo = msecs_to_jiffies(100);
>>>
>>>    	DRM_INFO("recover vram bo from shadow start\n");
>>> +
>>> +	mutex_lock(&adev->shadow_list_lock);
>>> +	list_splice_init(&adev->shadow_list, &local_shadow_list);
>>> +	mutex_unlock(&adev->shadow_list_lock);
>>> +
>>> +
>>>    	mutex_lock(&adev->shadow_list_lock);
>> local_shadow_list is a local variable, I think it doesn't need lock at all, no one
>> change it. Otherwise looks good to me.
> The bo->shadow_list which now is in local_shadow_list maybe destroy in case that it already in amdgpu_bo_destroy, then it will
> change local_shadow_list, so need lock the shadow_list_lock.
Ah, sorry for noise, I forget you don't reference these BOs.

Thanks,
David Zhou
> Best wishes
> Emily Deng
>> Thanks,
>> David Zhou
>>> -	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
>>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list, shadow_list) {
>>> +		mutex_unlock(&adev->shadow_list_lock);
>>> +
>>> +		if (!bo)
>>> +			continue;
>>> +
>>>    		next = NULL;
>>>    		amdgpu_device_recover_vram_from_shadow(adev, ring, bo,
>> &next);
>>>    		if (fence) {
>>> @@ -3132,9 +3146,14 @@ static int
>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>
>>>    		dma_fence_put(fence);
>>>    		fence = next;
>>> +		mutex_lock(&adev->shadow_list_lock);
>>>    	}
>>>    	mutex_unlock(&adev->shadow_list_lock);
>>>
>>> +	mutex_lock(&adev->shadow_list_lock);
>>> +	list_splice_init(&local_shadow_list, &adev->shadow_list);
>>> +	mutex_unlock(&adev->shadow_list_lock);
>>> +
>>>    	if (fence) {
>>>    		r = dma_fence_wait_timeout(fence, false, tmo);
>>>    		if (r == 0)
>-----Original Message-----

>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of

>zhoucm1

>Sent: Tuesday, September 11, 2018 11:28 AM

>To: Deng, Emily <Emily.Deng@amd.com>; Zhou, David(ChunMing)

><David1.Zhou@amd.com>; amd-gfx@lists.freedesktop.org

>Subject: Re: [PATCH] drm/amdgpu: Fix the dead lock issue.

>

>

>

>On 2018年09月11日 11:23, Deng, Emily wrote:

>>> -----Original Message-----

>>> From: Zhou, David(ChunMing)

>>> Sent: Tuesday, September 11, 2018 11:03 AM

>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>>> Subject: Re: [PATCH] drm/amdgpu: Fix the dead lock issue.

>>>

>>>

>>>

>>> On 2018年09月11日 10:51, Emily Deng wrote:

>>>> It will ramdomly have the dead lock issue when test TDR:

>>>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.

>>>> amdgpu_bo_create locked the bo's resv lock 3.

>>>> amdgpu_bo_create_shadow is waiting for the shadow_list_lock 4.

>>>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv

>>>> lock.

>>>>

>>>> v2:

>>>>      Make a local copy of the list

>>>>

>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>>>> ---

>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21

>>> ++++++++++++++++++++-

>>>>    1 file changed, 20 insertions(+), 1 deletion(-)

>>>>

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>> index 2a21267..8c81404 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>>> @@ -3105,6 +3105,9 @@ static int

>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>>>    	long r = 1;

>>>>    	int i = 0;

>>>>    	long tmo;

>>>> +	struct list_head local_shadow_list;

>>>> +

>>>> +	INIT_LIST_HEAD(&local_shadow_list);

>>>>

>>>>    	if (amdgpu_sriov_runtime(adev))

>>>>    		tmo = msecs_to_jiffies(8000);

>>>> @@ -3112,8 +3115,19 @@ static int

>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>>>    		tmo = msecs_to_jiffies(100);

>>>>

>>>>    	DRM_INFO("recover vram bo from shadow start\n");

>>>> +

>>>> +	mutex_lock(&adev->shadow_list_lock);

>>>> +	list_splice_init(&adev->shadow_list, &local_shadow_list);

>>>> +	mutex_unlock(&adev->shadow_list_lock);

>>>> +

>>>> +

>>>>    	mutex_lock(&adev->shadow_list_lock);

>>> local_shadow_list is a local variable, I think it doesn't need lock

>>> at all, no one change it. Otherwise looks good to me.

>> The bo->shadow_list which now is in local_shadow_list maybe destroy in

>> case that it already in amdgpu_bo_destroy, then it will change

>local_shadow_list, so need lock the shadow_list_lock.

>Ah, sorry for noise, I forget you don't reference these BOs.

Yes, I don't reference these Bos, as I found even reference these Bos, it still couldn't avoid the case that another process is already
in amdgpu_bo_destroy.
>

>Thanks,

>David Zhou

>> Best wishes

>> Emily Deng

>>> Thanks,

>>> David Zhou

>>>> -	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {

>>>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list, shadow_list) {

>>>> +		mutex_unlock(&adev->shadow_list_lock);

>>>> +

>>>> +		if (!bo)

>>>> +			continue;

>>>> +

>>>>    		next = NULL;

>>>>    		amdgpu_device_recover_vram_from_shadow(adev, ring, bo,

>>> &next);

>>>>    		if (fence) {

>>>> @@ -3132,9 +3146,14 @@ static int

>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)

>>>>

>>>>    		dma_fence_put(fence);

>>>>    		fence = next;

>>>> +		mutex_lock(&adev->shadow_list_lock);

>>>>    	}

>>>>    	mutex_unlock(&adev->shadow_list_lock);

>>>>

>>>> +	mutex_lock(&adev->shadow_list_lock);

>>>> +	list_splice_init(&local_shadow_list, &adev->shadow_list);

>>>> +	mutex_unlock(&adev->shadow_list_lock);

>>>> +

>>>>    	if (fence) {

>>>>    		r = dma_fence_wait_timeout(fence, false, tmo);

>>>>    		if (r == 0)

>

>_______________________________________________

>amd-gfx mailing list

>amd-gfx@lists.freedesktop.org

>https://lists.freedesktop.org/mailman/listinfo/amd-gfx
On 2018年09月11日 11:32, Deng, Emily wrote:
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> zhoucm1
>> Sent: Tuesday, September 11, 2018 11:28 AM
>> To: Deng, Emily <Emily.Deng@amd.com>; Zhou, David(ChunMing)
>> <David1.Zhou@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: Fix the dead lock issue.
>>
>>
>>
>> On 2018年09月11日 11:23, Deng, Emily wrote:
>>>> -----Original Message-----
>>>> From: Zhou, David(ChunMing)
>>>> Sent: Tuesday, September 11, 2018 11:03 AM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu: Fix the dead lock issue.
>>>>
>>>>
>>>>
>>>> On 2018年09月11日 10:51, Emily Deng wrote:
>>>>> It will ramdomly have the dead lock issue when test TDR:
>>>>> 1. amdgpu_device_handle_vram_lost gets the lock shadow_list_lock 2.
>>>>> amdgpu_bo_create locked the bo's resv lock 3.
>>>>> amdgpu_bo_create_shadow is waiting for the shadow_list_lock 4.
>>>>> amdgpu_device_recover_vram_from_shadow is waiting for the bo's resv
>>>>> lock.
>>>>>
>>>>> v2:
>>>>>       Make a local copy of the list
>>>>>
>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21
>>>> ++++++++++++++++++++-
>>>>>     1 file changed, 20 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 2a21267..8c81404 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -3105,6 +3105,9 @@ static int
>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>>>     	long r = 1;
>>>>>     	int i = 0;
>>>>>     	long tmo;
>>>>> +	struct list_head local_shadow_list;
>>>>> +
>>>>> +	INIT_LIST_HEAD(&local_shadow_list);
>>>>>
>>>>>     	if (amdgpu_sriov_runtime(adev))
>>>>>     		tmo = msecs_to_jiffies(8000);
>>>>> @@ -3112,8 +3115,19 @@ static int
>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>>>     		tmo = msecs_to_jiffies(100);
>>>>>
>>>>>     	DRM_INFO("recover vram bo from shadow start\n");
>>>>> +
>>>>> +	mutex_lock(&adev->shadow_list_lock);
>>>>> +	list_splice_init(&adev->shadow_list, &local_shadow_list);
>>>>> +	mutex_unlock(&adev->shadow_list_lock);
>>>>> +
>>>>> +
>>>>>     	mutex_lock(&adev->shadow_list_lock);
>>>> local_shadow_list is a local variable, I think it doesn't need lock
>>>> at all, no one change it. Otherwise looks good to me.
>>> The bo->shadow_list which now is in local_shadow_list maybe destroy in
>>> case that it already in amdgpu_bo_destroy, then it will change
>> local_shadow_list, so need lock the shadow_list_lock.
>> Ah, sorry for noise, I forget you don't reference these BOs.
> Yes, I don't reference these Bos, as I found even reference these Bos, it still couldn't avoid the case that another process is already
> in amdgpu_bo_destroy.
??? that shouldn't happen, the reference is belonged to list. But back 
to here, we don't need reference them.
And since no shadow BO is added to local after splice, we'd better to 
use list_next_entry to iterate the local shadow list instead of 
list_for_each_entry_safe.

Thanks,
David Zhou
>> Thanks,
>> David Zhou
>>> Best wishes
>>> Emily Deng
>>>> Thanks,
>>>> David Zhou
>>>>> -	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
>>>>> +	list_for_each_entry_safe(bo, tmp, &local_shadow_list, shadow_list) {
>>>>> +		mutex_unlock(&adev->shadow_list_lock);
>>>>> +
>>>>> +		if (!bo)
>>>>> +			continue;
>>>>> +
>>>>>     		next = NULL;
>>>>>     		amdgpu_device_recover_vram_from_shadow(adev, ring, bo,
>>>> &next);
>>>>>     		if (fence) {
>>>>> @@ -3132,9 +3146,14 @@ static int
>>>>> amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
>>>>>
>>>>>     		dma_fence_put(fence);
>>>>>     		fence = next;
>>>>> +		mutex_lock(&adev->shadow_list_lock);
>>>>>     	}
>>>>>     	mutex_unlock(&adev->shadow_list_lock);
>>>>>
>>>>> +	mutex_lock(&adev->shadow_list_lock);
>>>>> +	list_splice_init(&local_shadow_list, &adev->shadow_list);
>>>>> +	mutex_unlock(&adev->shadow_list_lock);
>>>>> +
>>>>>     	if (fence) {
>>>>>     		r = dma_fence_wait_timeout(fence, false, tmo);
>>>>>     		if (r == 0)
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx