drm/amdgpu: Add more page fault info printing for GFX10

Submitted by Zhao, Yong on Aug. 12, 2019, 7:05 p.m.

Details

Message ID 20190812190536.22744-1-Yong.Zhao@amd.com
State New
Headers show
Series "drm/amdgpu: Add more page fault info printing for GFX10" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Zhao, Yong Aug. 12, 2019, 7:05 p.m.
The printing we did for GFX9 was not propogated to GFX10 somehow, so fix
it now.

Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca
Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++++++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++-
 2 files changed, 32 insertions(+), 6 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 4e3ac1084a94..f23be98e9897 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -140,17 +140,40 @@  static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 	}
 
 	if (printk_ratelimit()) {
+		struct amdgpu_task_info task_info;
+
+		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
+		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
+
 		dev_err(adev->dev,
-			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
+			"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, "
+			"for process:%s pid:%d thread:%s pid:%d)\n",
 			entry->vmid_src ? "mmhub" : "gfxhub",
 			entry->src_id, entry->ring_id, entry->vmid,
-			entry->pasid);
-		dev_err(adev->dev, "  at page 0x%016llx from %d\n",
+			entry->pasid, task_info.process_name, task_info.tgid,
+			task_info.task_name, task_info.pid);
+		dev_err(adev->dev, "  in page starting at address 0x%016llx from client %d\n",
 			addr, entry->client_id);
-		if (!amdgpu_sriov_vf(adev))
+		if (!amdgpu_sriov_vf(adev)) {
 			dev_err(adev->dev,
-				"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
+				"GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
 				status);
+			dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n",
+				REG_GET_FIELD(status,
+				GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS));
+			dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n",
+				REG_GET_FIELD(status,
+				GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR));
+			dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n",
+				REG_GET_FIELD(status,
+				GCVM_L2_PROTECTION_FAULT_STATUS, PERMISSION_FAULTS));
+			dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
+				REG_GET_FIELD(status,
+				GCVM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
+			dev_err(adev->dev, "\t RW: 0x%lx\n",
+				REG_GET_FIELD(status,
+				GCVM_L2_PROTECTION_FAULT_STATUS, RW));
+		}
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 296e2d982578..34c4c2d08550 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -364,7 +364,7 @@  static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 
 		dev_err(adev->dev,
 			"[%s] %s page fault (src_id:%u ring:%u vmid:%u "
-			"pasid:%u, for process %s pid %d thread %s pid %d)\n",
+			"pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n",
 			hub_name, retry_fault ? "retry" : "no-retry",
 			entry->src_id, entry->ring_id, entry->vmid,
 			entry->pasid, task_info.process_name, task_info.tgid,
@@ -387,6 +387,9 @@  static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
 				REG_GET_FIELD(status,
 				VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
+			dev_err(adev->dev, "\t RW: 0x%lx\n",
+				REG_GET_FIELD(status,
+				VM_L2_PROTECTION_FAULT_STATUS, RW));
 
 		}
 	}

Comments

Am 12.08.19 um 21:05 schrieb Zhao, Yong:
> The printing we did for GFX9 was not propogated to GFX10 somehow, so fix
> it now.
>
> Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca
> Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++++++++++++++++++++++----
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++-
>   2 files changed, 32 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 4e3ac1084a94..f23be98e9897 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
>   	}
>   
>   	if (printk_ratelimit()) {
> +		struct amdgpu_task_info task_info;
> +
> +		memset(&task_info, 0, sizeof(struct amdgpu_task_info));
> +		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
> +
>   		dev_err(adev->dev,
> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
> +			"[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, "
> +			"for process:%s pid:%d thread:%s pid:%d)\n",
>   			entry->vmid_src ? "mmhub" : "gfxhub",
>   			entry->src_id, entry->ring_id, entry->vmid,
> -			entry->pasid);
> -		dev_err(adev->dev, "  at page 0x%016llx from %d\n",
> +			entry->pasid, task_info.process_name, task_info.tgid,
> +			task_info.task_name, task_info.pid);
> +		dev_err(adev->dev, "  in page starting at address 0x%016llx from client %d\n",
>   			addr, entry->client_id);
> -		if (!amdgpu_sriov_vf(adev))
> +		if (!amdgpu_sriov_vf(adev)) {
>   			dev_err(adev->dev,
> -				"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
> +				"GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
>   				status);
> +			dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n",
> +				REG_GET_FIELD(status,
> +				GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS));
> +			dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n",
> +				REG_GET_FIELD(status,
> +				GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR));
> +			dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n",
> +				REG_GET_FIELD(status,
> +				GCVM_L2_PROTECTION_FAULT_STATUS, PERMISSION_FAULTS));
> +			dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
> +				REG_GET_FIELD(status,
> +				GCVM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
> +			dev_err(adev->dev, "\t RW: 0x%lx\n",
> +				REG_GET_FIELD(status,
> +				GCVM_L2_PROTECTION_FAULT_STATUS, RW));
> +		}
>   	}
>   
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 296e2d982578..34c4c2d08550 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   
>   		dev_err(adev->dev,
>   			"[%s] %s page fault (src_id:%u ring:%u vmid:%u "
> -			"pasid:%u, for process %s pid %d thread %s pid %d)\n",
> +			"pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n",

I think the text actually looks better without the ":".

>   			hub_name, retry_fault ? "retry" : "no-retry",
>   			entry->src_id, entry->ring_id, entry->vmid,
>   			entry->pasid, task_info.process_name, task_info.tgid,
> @@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   			dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
>   				REG_GET_FIELD(status,
>   				VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
> +			dev_err(adev->dev, "\t RW: 0x%lx\n",
> +				REG_GET_FIELD(status,
> +				VM_L2_PROTECTION_FAULT_STATUS, RW));

That should probably be a separate patch since it is fixing gfx9.

Apart from that the patch looks good to me,
Christian.

>   
>   		}
>   	}
Hi Christian,

I feel with ":" it is better, because without it I found it is not easy 
to interpret the printing. Moreover, it continues the format of the 
former part.

It looks like this:

[  190.686668] amdgpu 0000:03:00.0: [gfxhub0] retry page fault (src_id:0 
ring:0 vmid:8 pasid:32771, for process:kfdtest pid:3273 thread:kfdtest 
pid:3273)

vs without ":"

[  190.686668] amdgpu 0000:03:00.0: [gfxhub0] retry page fault (src_id:0 
ring:0 vmid:8 pasid:32771, for process kfdtest pid 3273 thread kfdtest 
pid 3273)


If you are not convinced, I can change it to without ":".


Regards,

Yong

On 2019-08-12 3:12 p.m., Christian König wrote:
> Am 12.08.19 um 21:05 schrieb Zhao, Yong:

>> The printing we did for GFX9 was not propogated to GFX10 somehow, so fix

>> it now.

>>

>> Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca

>> Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>

>> ---

>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++++++++++++++++++++++----

>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++-

>>   2 files changed, 32 insertions(+), 6 deletions(-)

>>

>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 

>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

>> index 4e3ac1084a94..f23be98e9897 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

>> @@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct 

>> amdgpu_device *adev,

>>       }

>>         if (printk_ratelimit()) {

>> +        struct amdgpu_task_info task_info;

>> +

>> +        memset(&task_info, 0, sizeof(struct amdgpu_task_info));

>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);

>> +

>>           dev_err(adev->dev,

>> -            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u 

>> pasid:%u)\n",

>> +            "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, "

>> +            "for process:%s pid:%d thread:%s pid:%d)\n",

>>               entry->vmid_src ? "mmhub" : "gfxhub",

>>               entry->src_id, entry->ring_id, entry->vmid,

>> -            entry->pasid);

>> -        dev_err(adev->dev, "  at page 0x%016llx from %d\n",

>> +            entry->pasid, task_info.process_name, task_info.tgid,

>> +            task_info.task_name, task_info.pid);

>> +        dev_err(adev->dev, "  in page starting at address 0x%016llx 

>> from client %d\n",

>>               addr, entry->client_id);

>> -        if (!amdgpu_sriov_vf(adev))

>> +        if (!amdgpu_sriov_vf(adev)) {

>>               dev_err(adev->dev,

>> -                "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",

>> +                "GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",

>>                   status);

>> +            dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n",

>> +                REG_GET_FIELD(status,

>> +                GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS));

>> +            dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n",

>> +                REG_GET_FIELD(status,

>> +                GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR));

>> +            dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n",

>> +                REG_GET_FIELD(status,

>> +                GCVM_L2_PROTECTION_FAULT_STATUS, PERMISSION_FAULTS));

>> +            dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",

>> +                REG_GET_FIELD(status,

>> +                GCVM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));

>> +            dev_err(adev->dev, "\t RW: 0x%lx\n",

>> +                REG_GET_FIELD(status,

>> +                GCVM_L2_PROTECTION_FAULT_STATUS, RW));

>> +        }

>>       }

>>         return 0;

>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 

>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>> index 296e2d982578..34c4c2d08550 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>> @@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct 

>> amdgpu_device *adev,

>>             dev_err(adev->dev,

>>               "[%s] %s page fault (src_id:%u ring:%u vmid:%u "

>> -            "pasid:%u, for process %s pid %d thread %s pid %d)\n",

>> +            "pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n",

>

> I think the text actually looks better without the ":".

>

>>               hub_name, retry_fault ? "retry" : "no-retry",

>>               entry->src_id, entry->ring_id, entry->vmid,

>>               entry->pasid, task_info.process_name, task_info.tgid,

>> @@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct 

>> amdgpu_device *adev,

>>               dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",

>>                   REG_GET_FIELD(status,

>>                   VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));

>> +            dev_err(adev->dev, "\t RW: 0x%lx\n",

>> +                REG_GET_FIELD(status,

>> +                VM_L2_PROTECTION_FAULT_STATUS, RW));

>

> That should probably be a separate patch since it is fixing gfx9.

>

> Apart from that the patch looks good to me,

> Christian.

>

>>             }

>>       }

>
Hi Yong,

that is an intentional differentiation.

The values behind the ":" are hardware values from the page faults 
interrupt vector.

All values printed without the ":" are additional things we get from the 
kernel task information based on the pasid.

Please keep that,
Christian.

Am 12.08.19 um 21:20 schrieb Zhao, Yong:
> Hi Christian,
>
> I feel with ":" it is better, because without it I found it is not easy
> to interpret the printing. Moreover, it continues the format of the
> former part.
>
> It looks like this:
>
> [  190.686668] amdgpu 0000:03:00.0: [gfxhub0] retry page fault (src_id:0
> ring:0 vmid:8 pasid:32771, for process:kfdtest pid:3273 thread:kfdtest
> pid:3273)
>
> vs without ":"
>
> [  190.686668] amdgpu 0000:03:00.0: [gfxhub0] retry page fault (src_id:0
> ring:0 vmid:8 pasid:32771, for process kfdtest pid 3273 thread kfdtest
> pid 3273)
>
>
> If you are not convinced, I can change it to without ":".
>
>
> Regards,
>
> Yong
>
> On 2019-08-12 3:12 p.m., Christian König wrote:
>> Am 12.08.19 um 21:05 schrieb Zhao, Yong:
>>> The printing we did for GFX9 was not propogated to GFX10 somehow, so fix
>>> it now.
>>>
>>> Change-Id: Ic0b8381134340b83cd69c3fe186ac7a8a97b1bca
>>> Signed-off-by: Yong Zhao <Yong.Zhao@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 33 ++++++++++++++++++++++----
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  5 +++-
>>>    2 files changed, 32 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 4e3ac1084a94..f23be98e9897 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -140,17 +140,40 @@ static int gmc_v10_0_process_interrupt(struct
>>> amdgpu_device *adev,
>>>        }
>>>          if (printk_ratelimit()) {
>>> +        struct amdgpu_task_info task_info;
>>> +
>>> +        memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>> +        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>> +
>>>            dev_err(adev->dev,
>>> -            "[%s] VMC page fault (src_id:%u ring:%u vmid:%u
>>> pasid:%u)\n",
>>> +            "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, "
>>> +            "for process:%s pid:%d thread:%s pid:%d)\n",
>>>                entry->vmid_src ? "mmhub" : "gfxhub",
>>>                entry->src_id, entry->ring_id, entry->vmid,
>>> -            entry->pasid);
>>> -        dev_err(adev->dev, "  at page 0x%016llx from %d\n",
>>> +            entry->pasid, task_info.process_name, task_info.tgid,
>>> +            task_info.task_name, task_info.pid);
>>> +        dev_err(adev->dev, "  in page starting at address 0x%016llx
>>> from client %d\n",
>>>                addr, entry->client_id);
>>> -        if (!amdgpu_sriov_vf(adev))
>>> +        if (!amdgpu_sriov_vf(adev)) {
>>>                dev_err(adev->dev,
>>> -                "VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
>>> +                "GCVM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",
>>>                    status);
>>> +            dev_err(adev->dev, "\t MORE_FAULTS: 0x%lx\n",
>>> +                REG_GET_FIELD(status,
>>> +                GCVM_L2_PROTECTION_FAULT_STATUS, MORE_FAULTS));
>>> +            dev_err(adev->dev, "\t WALKER_ERROR: 0x%lx\n",
>>> +                REG_GET_FIELD(status,
>>> +                GCVM_L2_PROTECTION_FAULT_STATUS, WALKER_ERROR));
>>> +            dev_err(adev->dev, "\t PERMISSION_FAULTS: 0x%lx\n",
>>> +                REG_GET_FIELD(status,
>>> +                GCVM_L2_PROTECTION_FAULT_STATUS, PERMISSION_FAULTS));
>>> +            dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
>>> +                REG_GET_FIELD(status,
>>> +                GCVM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
>>> +            dev_err(adev->dev, "\t RW: 0x%lx\n",
>>> +                REG_GET_FIELD(status,
>>> +                GCVM_L2_PROTECTION_FAULT_STATUS, RW));
>>> +        }
>>>        }
>>>          return 0;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 296e2d982578..34c4c2d08550 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -364,7 +364,7 @@ static int gmc_v9_0_process_interrupt(struct
>>> amdgpu_device *adev,
>>>              dev_err(adev->dev,
>>>                "[%s] %s page fault (src_id:%u ring:%u vmid:%u "
>>> -            "pasid:%u, for process %s pid %d thread %s pid %d)\n",
>>> +            "pasid:%u, for process:%s pid:%d thread:%s pid:%d)\n",
>> I think the text actually looks better without the ":".
>>
>>>                hub_name, retry_fault ? "retry" : "no-retry",
>>>                entry->src_id, entry->ring_id, entry->vmid,
>>>                entry->pasid, task_info.process_name, task_info.tgid,
>>> @@ -387,6 +387,9 @@ static int gmc_v9_0_process_interrupt(struct
>>> amdgpu_device *adev,
>>>                dev_err(adev->dev, "\t MAPPING_ERROR: 0x%lx\n",
>>>                    REG_GET_FIELD(status,
>>>                    VM_L2_PROTECTION_FAULT_STATUS, MAPPING_ERROR));
>>> +            dev_err(adev->dev, "\t RW: 0x%lx\n",
>>> +                REG_GET_FIELD(status,
>>> +                VM_L2_PROTECTION_FAULT_STATUS, RW));
>> That should probably be a separate patch since it is fixing gfx9.
>>
>> Apart from that the patch looks good to me,
>> Christian.
>>
>>>              }
>>>        }
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx