drm/amdgpu/sriov: For sriov runtime, use kiq to do invalidate tlb

Submitted by Deng, Emily on Aug. 14, 2018, 7:46 a.m.

Details

Message ID 1534232783-30577-1-git-send-email-Emily.Deng@amd.com
State New
Headers show
Series "drm/amdgpu/sriov: For sriov runtime, use kiq to do invalidate tlb" ( rev: 3 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Deng, Emily Aug. 14, 2018, 7:46 a.m.
To avoid the tlb flush not interrupted by world switch, use kiq and one
command to do tlb invalidate.

SWDEV-161497

Signed-off-by: Emily Deng <Emily.Deng@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 50 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  5 ++++
 3 files changed, 57 insertions(+)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 21adb1b6..aa6ddcc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -233,6 +233,56 @@  void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 	pr_err("failed to write reg:%x\n", reg);
 }
 
+void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev, struct amdgpu_vmhub *hub,
+		unsigned eng, u32 req, uint32_t vmid)
+{
+	signed long r, cnt = 0;
+	unsigned long flags;
+	uint32_t seq;
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
+	struct amdgpu_ring *ring = &kiq->ring;
+
+	BUG_ON(!ring->funcs->emit_reg_write_reg_wait);
+
+	spin_lock_irqsave(&kiq->ring_lock, flags);
+	amdgpu_ring_alloc(ring, 32);
+	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
+					    hub->vm_inv_eng0_ack + eng,
+					    req, 1 << vmid);
+	amdgpu_fence_emit_polling(ring, &seq);
+	amdgpu_ring_commit(ring);
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+
+	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
+
+	/* don't wait anymore for gpu reset case because this way may
+	 * block gpu_recover() routine forever, e.g. this virt_kiq_rreg
+	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will
+	 * never return if we keep waiting in virt_kiq_rreg, which cause
+	 * gpu_recover() hang there.
+	 *
+	 * also don't wait anymore for IRQ context
+	 * */
+	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
+		goto failed_kiq;
+
+	if (in_interrupt())
+		might_sleep();
+
+	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {
+		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);
+		r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
+	}
+
+	if (cnt > MAX_KIQ_REG_TRY)
+		goto failed_kiq;
+
+	return;
+
+failed_kiq:
+	pr_err("failed to invalidate tlb with kiq\n");
+}
+
 /**
  * amdgpu_virt_request_full_gpu() - request full gpu access
  * @amdgpu:	amdgpu device.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
index 880ac11..a2e3c78 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
@@ -288,6 +288,8 @@  void amdgpu_free_static_csa(struct amdgpu_device *adev);
 void amdgpu_virt_init_setting(struct amdgpu_device *adev);
 uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
 void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
+void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev, struct amdgpu_vmhub *hub,
+		unsigned eng, u32 req, uint32_t vmid);
 int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
 int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index ed467de..6a886d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -339,6 +339,11 @@  static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
 		struct amdgpu_vmhub *hub = &adev->vmhub[i];
 		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
 
+		if (amdgpu_sriov_vf(adev) && amdgpu_sriov_runtime(adev)) {
+			amdgpu_virt_kiq_invalidate_tlb(adev, hub, eng, tmp, vmid);
+			continue;
+		}
+
 		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
 
 		/* Busy wait for ACK.*/

Comments

Am 14.08.2018 um 09:46 schrieb Emily Deng:
> To avoid the tlb flush not interrupted by world switch, use kiq and one
> command to do tlb invalidate.

Well NAK, this just duplicates the TLB handling and moves it outside of 
the GMC code.

Instead just lower the timeout and suppress the warning when SRIOV is 
active.

Christian.

>
> SWDEV-161497
>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 50 ++++++++++++++++++++++++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  5 ++++
>   3 files changed, 57 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> index 21adb1b6..aa6ddcc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> @@ -233,6 +233,56 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>   	pr_err("failed to write reg:%x\n", reg);
>   }
>   
> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev, struct amdgpu_vmhub *hub,
> +		unsigned eng, u32 req, uint32_t vmid)
> +{
> +	signed long r, cnt = 0;
> +	unsigned long flags;
> +	uint32_t seq;
> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
> +	struct amdgpu_ring *ring = &kiq->ring;
> +
> +	BUG_ON(!ring->funcs->emit_reg_write_reg_wait);
> +
> +	spin_lock_irqsave(&kiq->ring_lock, flags);
> +	amdgpu_ring_alloc(ring, 32);
> +	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
> +					    hub->vm_inv_eng0_ack + eng,
> +					    req, 1 << vmid);
> +	amdgpu_fence_emit_polling(ring, &seq);
> +	amdgpu_ring_commit(ring);
> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);
> +
> +	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
> +
> +	/* don't wait anymore for gpu reset case because this way may
> +	 * block gpu_recover() routine forever, e.g. this virt_kiq_rreg
> +	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will
> +	 * never return if we keep waiting in virt_kiq_rreg, which cause
> +	 * gpu_recover() hang there.
> +	 *
> +	 * also don't wait anymore for IRQ context
> +	 * */
> +	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
> +		goto failed_kiq;
> +
> +	if (in_interrupt())
> +		might_sleep();
> +
> +	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {
> +		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);
> +		r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
> +	}
> +
> +	if (cnt > MAX_KIQ_REG_TRY)
> +		goto failed_kiq;
> +
> +	return;
> +
> +failed_kiq:
> +	pr_err("failed to invalidate tlb with kiq\n");
> +}
> +
>   /**
>    * amdgpu_virt_request_full_gpu() - request full gpu access
>    * @amdgpu:	amdgpu device.
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index 880ac11..a2e3c78 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -288,6 +288,8 @@ void amdgpu_free_static_csa(struct amdgpu_device *adev);
>   void amdgpu_virt_init_setting(struct amdgpu_device *adev);
>   uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>   void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev, struct amdgpu_vmhub *hub,
> +		unsigned eng, u32 req, uint32_t vmid);
>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index ed467de..6a886d9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -339,6 +339,11 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>   		struct amdgpu_vmhub *hub = &adev->vmhub[i];
>   		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>   
> +		if (amdgpu_sriov_vf(adev) && amdgpu_sriov_runtime(adev)) {
> +			amdgpu_virt_kiq_invalidate_tlb(adev, hub, eng, tmp, vmid);
> +			continue;
> +		}
> +
>   		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>   
>   		/* Busy wait for ACK.*/
>-----Original Message-----

>From: Christian König <ckoenig.leichtzumerken@gmail.com>

>Sent: Tuesday, August 14, 2018 3:54 PM

>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to do

>invalidate tlb

>

>Am 14.08.2018 um 09:46 schrieb Emily Deng:

>> To avoid the tlb flush not interrupted by world switch, use kiq and

>> one command to do tlb invalidate.

>

>Well NAK, this just duplicates the TLB handling and moves it outside of the GMC

>code.

No, it not duplicates the TLB handling, it only send one command, and not duplicate. With kiq,
it only use one command to do the invalidate tlb and wait ack, and won't be interrupted by world switch.
>Instead just lower the timeout and suppress the warning when SRIOV is active.

With the kiq to do tlb flush, no warning issue expected.

Best wishes
Emily Deng
>Christian.

>

>>

>> SWDEV-161497

>>

>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>> ---

>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 50

>++++++++++++++++++++++++++++++++

>>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++

>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  5 ++++

>>   3 files changed, 57 insertions(+)

>>

>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>> index 21adb1b6..aa6ddcc 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>> @@ -233,6 +233,56 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device

>*adev, uint32_t reg, uint32_t v)

>>   	pr_err("failed to write reg:%x\n", reg);

>>   }

>>

>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev, struct

>amdgpu_vmhub *hub,

>> +		unsigned eng, u32 req, uint32_t vmid) {

>> +	signed long r, cnt = 0;

>> +	unsigned long flags;

>> +	uint32_t seq;

>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;

>> +	struct amdgpu_ring *ring = &kiq->ring;

>> +

>> +	BUG_ON(!ring->funcs->emit_reg_write_reg_wait);

>> +

>> +	spin_lock_irqsave(&kiq->ring_lock, flags);

>> +	amdgpu_ring_alloc(ring, 32);

>> +	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req +

>eng,

>> +					    hub->vm_inv_eng0_ack + eng,

>> +					    req, 1 << vmid);

>> +	amdgpu_fence_emit_polling(ring, &seq);

>> +	amdgpu_ring_commit(ring);

>> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);

>> +

>> +	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);

>> +

>> +	/* don't wait anymore for gpu reset case because this way may

>> +	 * block gpu_recover() routine forever, e.g. this virt_kiq_rreg

>> +	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will

>> +	 * never return if we keep waiting in virt_kiq_rreg, which cause

>> +	 * gpu_recover() hang there.

>> +	 *

>> +	 * also don't wait anymore for IRQ context

>> +	 * */

>> +	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))

>> +		goto failed_kiq;

>> +

>> +	if (in_interrupt())

>> +		might_sleep();

>> +

>> +	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {

>> +		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);

>> +		r = amdgpu_fence_wait_polling(ring, seq,

>MAX_KIQ_REG_WAIT);

>> +	}

>> +

>> +	if (cnt > MAX_KIQ_REG_TRY)

>> +		goto failed_kiq;

>> +

>> +	return;

>> +

>> +failed_kiq:

>> +	pr_err("failed to invalidate tlb with kiq\n"); }

>> +

>>   /**

>>    * amdgpu_virt_request_full_gpu() - request full gpu access

>>    * @amdgpu:	amdgpu device.

>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>> index 880ac11..a2e3c78 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>> @@ -288,6 +288,8 @@ void amdgpu_free_static_csa(struct amdgpu_device

>*adev);

>>   void amdgpu_virt_init_setting(struct amdgpu_device *adev);

>>   uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);

>>   void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg,

>> uint32_t v);

>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev, struct

>amdgpu_vmhub *hub,

>> +		unsigned eng, u32 req, uint32_t vmid);

>>   int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);

>>   int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);

>>   int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git

>> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>> index ed467de..6a886d9 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>> @@ -339,6 +339,11 @@ static void gmc_v9_0_flush_gpu_tlb(struct

>amdgpu_device *adev,

>>   		struct amdgpu_vmhub *hub = &adev->vmhub[i];

>>   		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);

>>

>> +		if (amdgpu_sriov_vf(adev) && amdgpu_sriov_runtime(adev)) {

>> +			amdgpu_virt_kiq_invalidate_tlb(adev, hub, eng, tmp,

>vmid);

>> +			continue;

>> +		}

>> +

>>   		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);

>>

>>   		/* Busy wait for ACK.*/
Am 14.08.2018 um 10:13 schrieb Deng, Emily:
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Tuesday, August 14, 2018 3:54 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to do
>> invalidate tlb
>>
>> Am 14.08.2018 um 09:46 schrieb Emily Deng:
>>> To avoid the tlb flush not interrupted by world switch, use kiq and
>>> one command to do tlb invalidate.
>> Well NAK, this just duplicates the TLB handling and moves it outside of the GMC
>> code.
> No, it not duplicates the TLB handling, it only send one command, and not duplicate. With kiq,
> it only use one command to do the invalidate tlb and wait ack, and won't be interrupted by world switch.

That is effectively duplicating the TLB handling.

>> Instead just lower the timeout and suppress the warning when SRIOV is active.
> With the kiq to do tlb flush, no warning issue expected.

Either fix that issue thoughtfully or let us live with the warning message.

But please no more SRIOV workaround we can't test on bare metal and 
break things with older firmware.

Regards,
Christian.

>
> Best wishes
> Emily Deng
>> Christian.
>>
>>> SWDEV-161497
>>>
>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 50
>> ++++++++++++++++++++++++++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  5 ++++
>>>    3 files changed, 57 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> index 21adb1b6..aa6ddcc 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>> @@ -233,6 +233,56 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device
>> *adev, uint32_t reg, uint32_t v)
>>>    	pr_err("failed to write reg:%x\n", reg);
>>>    }
>>>
>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev, struct
>> amdgpu_vmhub *hub,
>>> +		unsigned eng, u32 req, uint32_t vmid) {
>>> +	signed long r, cnt = 0;
>>> +	unsigned long flags;
>>> +	uint32_t seq;
>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>> +	struct amdgpu_ring *ring = &kiq->ring;
>>> +
>>> +	BUG_ON(!ring->funcs->emit_reg_write_reg_wait);
>>> +
>>> +	spin_lock_irqsave(&kiq->ring_lock, flags);
>>> +	amdgpu_ring_alloc(ring, 32);
>>> +	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req +
>> eng,
>>> +					    hub->vm_inv_eng0_ack + eng,
>>> +					    req, 1 << vmid);
>>> +	amdgpu_fence_emit_polling(ring, &seq);
>>> +	amdgpu_ring_commit(ring);
>>> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>> +
>>> +	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>> +
>>> +	/* don't wait anymore for gpu reset case because this way may
>>> +	 * block gpu_recover() routine forever, e.g. this virt_kiq_rreg
>>> +	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will
>>> +	 * never return if we keep waiting in virt_kiq_rreg, which cause
>>> +	 * gpu_recover() hang there.
>>> +	 *
>>> +	 * also don't wait anymore for IRQ context
>>> +	 * */
>>> +	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
>>> +		goto failed_kiq;
>>> +
>>> +	if (in_interrupt())
>>> +		might_sleep();
>>> +
>>> +	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {
>>> +		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);
>>> +		r = amdgpu_fence_wait_polling(ring, seq,
>> MAX_KIQ_REG_WAIT);
>>> +	}
>>> +
>>> +	if (cnt > MAX_KIQ_REG_TRY)
>>> +		goto failed_kiq;
>>> +
>>> +	return;
>>> +
>>> +failed_kiq:
>>> +	pr_err("failed to invalidate tlb with kiq\n"); }
>>> +
>>>    /**
>>>     * amdgpu_virt_request_full_gpu() - request full gpu access
>>>     * @amdgpu:	amdgpu device.
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> index 880ac11..a2e3c78 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>> @@ -288,6 +288,8 @@ void amdgpu_free_static_csa(struct amdgpu_device
>> *adev);
>>>    void amdgpu_virt_init_setting(struct amdgpu_device *adev);
>>>    uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg);
>>>    void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg,
>>> uint32_t v);
>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev, struct
>> amdgpu_vmhub *hub,
>>> +		unsigned eng, u32 req, uint32_t vmid);
>>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>>>    int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>>>    int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index ed467de..6a886d9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -339,6 +339,11 @@ static void gmc_v9_0_flush_gpu_tlb(struct
>> amdgpu_device *adev,
>>>    		struct amdgpu_vmhub *hub = &adev->vmhub[i];
>>>    		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>>>
>>> +		if (amdgpu_sriov_vf(adev) && amdgpu_sriov_runtime(adev)) {
>>> +			amdgpu_virt_kiq_invalidate_tlb(adev, hub, eng, tmp,
>> vmid);
>>> +			continue;
>>> +		}
>>> +
>>>    		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>>>
>>>    		/* Busy wait for ACK.*/
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>-----Original Message-----

>From: Christian König <ckoenig.leichtzumerken@gmail.com>

>Sent: Tuesday, August 14, 2018 4:20 PM

>To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian

><Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org

>Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to do

>invalidate tlb

>

>Am 14.08.2018 um 10:13 schrieb Deng, Emily:

>>> -----Original Message-----

>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>>> Sent: Tuesday, August 14, 2018 3:54 PM

>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to

>>> do invalidate tlb

>>>

>>> Am 14.08.2018 um 09:46 schrieb Emily Deng:

>>>> To avoid the tlb flush not interrupted by world switch, use kiq and

>>>> one command to do tlb invalidate.

>>> Well NAK, this just duplicates the TLB handling and moves it outside

>>> of the GMC code.

>> No, it not duplicates the TLB handling, it only send one command, and

>> not duplicate. With kiq, it only use one command to do the invalidate tlb and

>wait ack, and won't be interrupted by world switch.

>

>That is effectively duplicating the TLB handling.

>

>>> Instead just lower the timeout and suppress the warning when SRIOV is

>active.

>> With the kiq to do tlb flush, no warning issue expected.

>

>Either fix that issue thoughtfully or let us live with the warning message.

>

Sorry, I don't know your mean, with one command, it will have no any warning. It will wait the ack in
the same command.
>But please no more SRIOV workaround we can't test on bare metal and break

>things with older firmware.

It seems a workaround here, but it truly is not a workaround. I think it is doing the right thing. 
As the new firmware support the one command, why we need use two commands to do the invalidate 
and wait ack, and we don't know when it will sent the ack back?
>

>Regards,

>Christian.

>

>>

>> Best wishes

>> Emily Deng

>>> Christian.

>>>

>>>> SWDEV-161497

>>>>

>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>>>> ---

>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 50

>>> ++++++++++++++++++++++++++++++++

>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++

>>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  5 ++++

>>>>    3 files changed, 57 insertions(+)

>>>>

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>> index 21adb1b6..aa6ddcc 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>> @@ -233,6 +233,56 @@ void amdgpu_virt_kiq_wreg(struct

>amdgpu_device

>>> *adev, uint32_t reg, uint32_t v)

>>>>    	pr_err("failed to write reg:%x\n", reg);

>>>>    }

>>>>

>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,

>>>> +struct

>>> amdgpu_vmhub *hub,

>>>> +		unsigned eng, u32 req, uint32_t vmid) {

>>>> +	signed long r, cnt = 0;

>>>> +	unsigned long flags;

>>>> +	uint32_t seq;

>>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;

>>>> +	struct amdgpu_ring *ring = &kiq->ring;

>>>> +

>>>> +	BUG_ON(!ring->funcs->emit_reg_write_reg_wait);

>>>> +

>>>> +	spin_lock_irqsave(&kiq->ring_lock, flags);

>>>> +	amdgpu_ring_alloc(ring, 32);

>>>> +	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req +

>>> eng,

>>>> +					    hub->vm_inv_eng0_ack + eng,

>>>> +					    req, 1 << vmid);

>>>> +	amdgpu_fence_emit_polling(ring, &seq);

>>>> +	amdgpu_ring_commit(ring);

>>>> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);

>>>> +

>>>> +	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);

>>>> +

>>>> +	/* don't wait anymore for gpu reset case because this way may

>>>> +	 * block gpu_recover() routine forever, e.g. this virt_kiq_rreg

>>>> +	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will

>>>> +	 * never return if we keep waiting in virt_kiq_rreg, which cause

>>>> +	 * gpu_recover() hang there.

>>>> +	 *

>>>> +	 * also don't wait anymore for IRQ context

>>>> +	 * */

>>>> +	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))

>>>> +		goto failed_kiq;

>>>> +

>>>> +	if (in_interrupt())

>>>> +		might_sleep();

>>>> +

>>>> +	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {

>>>> +		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);

>>>> +		r = amdgpu_fence_wait_polling(ring, seq,

>>> MAX_KIQ_REG_WAIT);

>>>> +	}

>>>> +

>>>> +	if (cnt > MAX_KIQ_REG_TRY)

>>>> +		goto failed_kiq;

>>>> +

>>>> +	return;

>>>> +

>>>> +failed_kiq:

>>>> +	pr_err("failed to invalidate tlb with kiq\n"); }

>>>> +

>>>>    /**

>>>>     * amdgpu_virt_request_full_gpu() - request full gpu access

>>>>     * @amdgpu:	amdgpu device.

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>> index 880ac11..a2e3c78 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>> @@ -288,6 +288,8 @@ void amdgpu_free_static_csa(struct

>amdgpu_device

>>> *adev);

>>>>    void amdgpu_virt_init_setting(struct amdgpu_device *adev);

>>>>    uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t

>reg);

>>>>    void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t

>>>> reg, uint32_t v);

>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,

>>>> +struct

>>> amdgpu_vmhub *hub,

>>>> +		unsigned eng, u32 req, uint32_t vmid);

>>>>    int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);

>>>>    int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);

>>>>    int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git

>>>> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>> index ed467de..6a886d9 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>> @@ -339,6 +339,11 @@ static void gmc_v9_0_flush_gpu_tlb(struct

>>> amdgpu_device *adev,

>>>>    		struct amdgpu_vmhub *hub = &adev->vmhub[i];

>>>>    		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);

>>>>

>>>> +		if (amdgpu_sriov_vf(adev) && amdgpu_sriov_runtime(adev)) {

>>>> +			amdgpu_virt_kiq_invalidate_tlb(adev, hub, eng, tmp,

>>> vmid);

>>>> +			continue;

>>>> +		}

>>>> +

>>>>    		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);

>>>>

>>>>    		/* Busy wait for ACK.*/

>> _______________________________________________

>> amd-gfx mailing list

>> amd-gfx@lists.freedesktop.org

>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Am 14.08.2018 um 10:26 schrieb Deng, Emily:
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Tuesday, August 14, 2018 4:20 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to do
>> invalidate tlb
>>
>> Am 14.08.2018 um 10:13 schrieb Deng, Emily:
>>>> -----Original Message-----
>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> Sent: Tuesday, August 14, 2018 3:54 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to
>>>> do invalidate tlb
>>>>
>>>> Am 14.08.2018 um 09:46 schrieb Emily Deng:
>>>>> To avoid the tlb flush not interrupted by world switch, use kiq and
>>>>> one command to do tlb invalidate.
>>>> Well NAK, this just duplicates the TLB handling and moves it outside
>>>> of the GMC code.
>>> No, it not duplicates the TLB handling, it only send one command, and
>>> not duplicate. With kiq, it only use one command to do the invalidate tlb and
>> wait ack, and won't be interrupted by world switch.
>>
>> That is effectively duplicating the TLB handling.
>>
>>>> Instead just lower the timeout and suppress the warning when SRIOV is
>> active.
>>> With the kiq to do tlb flush, no warning issue expected.
>> Either fix that issue thoughtfully or let us live with the warning message.
>>
> Sorry, I don't know your mean, with one command, it will have no any warning. It will wait the ack in
> the same command.
>> But please no more SRIOV workaround we can't test on bare metal and break
>> things with older firmware.
> It seems a workaround here, but it truly is not a workaround. I think it is doing the right thing.
> As the new firmware support the one command, why we need use two commands to do the invalidate
> and wait ack, and we don't know when it will sent the ack back?

The problem is that you depend on new firmware here.

See firmware and driver are distributed separately and we got into quite 
a bunch of problems because the ring_emit_reg_write_reg_wait() isn't 
correctly implemented.

See gfx_v9_0_ring_emit_reg_write_reg_wait as well:
> static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring 
> *ring,
>                                                   uint32_t reg0, 
> uint32_t reg1,
>                                                   uint32_t ref, 
> uint32_t mask)
> {
>         int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
>
>         if (amdgpu_sriov_vf(ring->adev))
>                 gfx_v9_0_wait_reg_mem(ring, usepfp, 0, 1, reg0, reg1,
>                                       ref, mask, 0x20);
>         else
>                 amdgpu_ring_emit_reg_write_reg_wait_helper(ring, reg0, 
> reg1,
>                                                            ref, mask);
> }

Before we add any more broken workarounds like that please fix the 
existing code to check for the for the firmware version and NOT if SRIOV 
is present or not.

Regards,
Christian.

>> Regards,
>> Christian.
>>
>>> Best wishes
>>> Emily Deng
>>>> Christian.
>>>>
>>>>> SWDEV-161497
>>>>>
>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>> ---
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 50
>>>> ++++++++++++++++++++++++++++++++
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++
>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  5 ++++
>>>>>     3 files changed, 57 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>>> index 21adb1b6..aa6ddcc 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>>> @@ -233,6 +233,56 @@ void amdgpu_virt_kiq_wreg(struct
>> amdgpu_device
>>>> *adev, uint32_t reg, uint32_t v)
>>>>>     	pr_err("failed to write reg:%x\n", reg);
>>>>>     }
>>>>>
>>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,
>>>>> +struct
>>>> amdgpu_vmhub *hub,
>>>>> +		unsigned eng, u32 req, uint32_t vmid) {
>>>>> +	signed long r, cnt = 0;
>>>>> +	unsigned long flags;
>>>>> +	uint32_t seq;
>>>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>> +	struct amdgpu_ring *ring = &kiq->ring;
>>>>> +
>>>>> +	BUG_ON(!ring->funcs->emit_reg_write_reg_wait);
>>>>> +
>>>>> +	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>> +	amdgpu_ring_alloc(ring, 32);
>>>>> +	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req +
>>>> eng,
>>>>> +					    hub->vm_inv_eng0_ack + eng,
>>>>> +					    req, 1 << vmid);
>>>>> +	amdgpu_fence_emit_polling(ring, &seq);
>>>>> +	amdgpu_ring_commit(ring);
>>>>> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>>> +
>>>>> +	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>> +
>>>>> +	/* don't wait anymore for gpu reset case because this way may
>>>>> +	 * block gpu_recover() routine forever, e.g. this virt_kiq_rreg
>>>>> +	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will
>>>>> +	 * never return if we keep waiting in virt_kiq_rreg, which cause
>>>>> +	 * gpu_recover() hang there.
>>>>> +	 *
>>>>> +	 * also don't wait anymore for IRQ context
>>>>> +	 * */
>>>>> +	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
>>>>> +		goto failed_kiq;
>>>>> +
>>>>> +	if (in_interrupt())
>>>>> +		might_sleep();
>>>>> +
>>>>> +	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {
>>>>> +		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);
>>>>> +		r = amdgpu_fence_wait_polling(ring, seq,
>>>> MAX_KIQ_REG_WAIT);
>>>>> +	}
>>>>> +
>>>>> +	if (cnt > MAX_KIQ_REG_TRY)
>>>>> +		goto failed_kiq;
>>>>> +
>>>>> +	return;
>>>>> +
>>>>> +failed_kiq:
>>>>> +	pr_err("failed to invalidate tlb with kiq\n"); }
>>>>> +
>>>>>     /**
>>>>>      * amdgpu_virt_request_full_gpu() - request full gpu access
>>>>>      * @amdgpu:	amdgpu device.
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>> index 880ac11..a2e3c78 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>> @@ -288,6 +288,8 @@ void amdgpu_free_static_csa(struct
>> amdgpu_device
>>>> *adev);
>>>>>     void amdgpu_virt_init_setting(struct amdgpu_device *adev);
>>>>>     uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t
>> reg);
>>>>>     void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t
>>>>> reg, uint32_t v);
>>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,
>>>>> +struct
>>>> amdgpu_vmhub *hub,
>>>>> +		unsigned eng, u32 req, uint32_t vmid);
>>>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
>>>>>     int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
>>>>>     int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff --git
>>>>> a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> index ed467de..6a886d9 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> @@ -339,6 +339,11 @@ static void gmc_v9_0_flush_gpu_tlb(struct
>>>> amdgpu_device *adev,
>>>>>     		struct amdgpu_vmhub *hub = &adev->vmhub[i];
>>>>>     		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>>>>>
>>>>> +		if (amdgpu_sriov_vf(adev) && amdgpu_sriov_runtime(adev)) {
>>>>> +			amdgpu_virt_kiq_invalidate_tlb(adev, hub, eng, tmp,
>>>> vmid);
>>>>> +			continue;
>>>>> +		}
>>>>> +
>>>>>     		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>>>>>
>>>>>     		/* Busy wait for ACK.*/
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>-----Original Message-----

>From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of

>Christian König

>Sent: Tuesday, August 14, 2018 4:32 PM

>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to do

>invalidate tlb

>

>Am 14.08.2018 um 10:26 schrieb Deng, Emily:

>>> -----Original Message-----

>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>>> Sent: Tuesday, August 14, 2018 4:20 PM

>>> To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian

>>> <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org

>>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to

>>> do invalidate tlb

>>>

>>> Am 14.08.2018 um 10:13 schrieb Deng, Emily:

>>>>> -----Original Message-----

>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>>>>> Sent: Tuesday, August 14, 2018 3:54 PM

>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>>>>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq

>>>>> to do invalidate tlb

>>>>>

>>>>> Am 14.08.2018 um 09:46 schrieb Emily Deng:

>>>>>> To avoid the tlb flush not interrupted by world switch, use kiq

>>>>>> and one command to do tlb invalidate.

>>>>> Well NAK, this just duplicates the TLB handling and moves it

>>>>> outside of the GMC code.

>>>> No, it not duplicates the TLB handling, it only send one command,

>>>> and not duplicate. With kiq, it only use one command to do the

>>>> invalidate tlb and

>>> wait ack, and won't be interrupted by world switch.

>>>

>>> That is effectively duplicating the TLB handling.

>>>

>>>>> Instead just lower the timeout and suppress the warning when SRIOV

>>>>> is

>>> active.

>>>> With the kiq to do tlb flush, no warning issue expected.

>>> Either fix that issue thoughtfully or let us live with the warning message.

>>>

>> Sorry, I don't know your mean, with one command, it will have no any

>> warning. It will wait the ack in the same command.

>>> But please no more SRIOV workaround we can't test on bare metal and

>>> break things with older firmware.

>> It seems a workaround here, but it truly is not a workaround. I think it is doing

>the right thing.

>> As the new firmware support the one command, why we need use two

>> commands to do the invalidate and wait ack, and we don't know when it will

>sent the ack back?

>

>The problem is that you depend on new firmware here.

>See firmware and driver are distributed separately and we got into quite a

>bunch of problems because the ring_emit_reg_write_reg_wait() isn't correctly

>implemented.

So it uses sriov to distinguish this, as sriov always use the tip firmware. Do you think need to add the 
firmware version check here? And also add to the below code?
Best wishes
Emily Deng

>See gfx_v9_0_ring_emit_reg_write_reg_wait as well:

>> static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring

>> *ring,

>>                                                   uint32_t reg0,

>> uint32_t reg1,

>>                                                   uint32_t ref,

>> uint32_t mask) {

>>         int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);

>>

>>         if (amdgpu_sriov_vf(ring->adev))

>>                 gfx_v9_0_wait_reg_mem(ring, usepfp, 0, 1, reg0, reg1,

>>                                       ref, mask, 0x20);

>>         else

>>                 amdgpu_ring_emit_reg_write_reg_wait_helper(ring, reg0,

>> reg1,

>>                                                            ref, mask);

>> }

>

>Before we add any more broken workarounds like that please fix the existing

>code to check for the for the firmware version and NOT if SRIOV is present or

>not.

>

>Regards,

>Christian.

>

>>> Regards,

>>> Christian.

>>>

>>>> Best wishes

>>>> Emily Deng

>>>>> Christian.

>>>>>

>>>>>> SWDEV-161497

>>>>>>

>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>>>>>> ---

>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 50

>>>>> ++++++++++++++++++++++++++++++++

>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++

>>>>>>     drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  5 ++++

>>>>>>     3 files changed, 57 insertions(+)

>>>>>>

>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>>>> index 21adb1b6..aa6ddcc 100644

>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>>>> @@ -233,6 +233,56 @@ void amdgpu_virt_kiq_wreg(struct

>>> amdgpu_device

>>>>> *adev, uint32_t reg, uint32_t v)

>>>>>>     	pr_err("failed to write reg:%x\n", reg);

>>>>>>     }

>>>>>>

>>>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,

>>>>>> +struct

>>>>> amdgpu_vmhub *hub,

>>>>>> +		unsigned eng, u32 req, uint32_t vmid) {

>>>>>> +	signed long r, cnt = 0;

>>>>>> +	unsigned long flags;

>>>>>> +	uint32_t seq;

>>>>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;

>>>>>> +	struct amdgpu_ring *ring = &kiq->ring;

>>>>>> +

>>>>>> +	BUG_ON(!ring->funcs->emit_reg_write_reg_wait);

>>>>>> +

>>>>>> +	spin_lock_irqsave(&kiq->ring_lock, flags);

>>>>>> +	amdgpu_ring_alloc(ring, 32);

>>>>>> +	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req +

>>>>> eng,

>>>>>> +					    hub->vm_inv_eng0_ack + eng,

>>>>>> +					    req, 1 << vmid);

>>>>>> +	amdgpu_fence_emit_polling(ring, &seq);

>>>>>> +	amdgpu_ring_commit(ring);

>>>>>> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);

>>>>>> +

>>>>>> +	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);

>>>>>> +

>>>>>> +	/* don't wait anymore for gpu reset case because this way may

>>>>>> +	 * block gpu_recover() routine forever, e.g. this virt_kiq_rreg

>>>>>> +	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will

>>>>>> +	 * never return if we keep waiting in virt_kiq_rreg, which cause

>>>>>> +	 * gpu_recover() hang there.

>>>>>> +	 *

>>>>>> +	 * also don't wait anymore for IRQ context

>>>>>> +	 * */

>>>>>> +	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))

>>>>>> +		goto failed_kiq;

>>>>>> +

>>>>>> +	if (in_interrupt())

>>>>>> +		might_sleep();

>>>>>> +

>>>>>> +	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {

>>>>>> +		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);

>>>>>> +		r = amdgpu_fence_wait_polling(ring, seq,

>>>>> MAX_KIQ_REG_WAIT);

>>>>>> +	}

>>>>>> +

>>>>>> +	if (cnt > MAX_KIQ_REG_TRY)

>>>>>> +		goto failed_kiq;

>>>>>> +

>>>>>> +	return;

>>>>>> +

>>>>>> +failed_kiq:

>>>>>> +	pr_err("failed to invalidate tlb with kiq\n"); }

>>>>>> +

>>>>>>     /**

>>>>>>      * amdgpu_virt_request_full_gpu() - request full gpu access

>>>>>>      * @amdgpu:	amdgpu device.

>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>>>> index 880ac11..a2e3c78 100644

>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>>>> @@ -288,6 +288,8 @@ void amdgpu_free_static_csa(struct

>>> amdgpu_device

>>>>> *adev);

>>>>>>     void amdgpu_virt_init_setting(struct amdgpu_device *adev);

>>>>>>     uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev,

>>>>>> uint32_t

>>> reg);

>>>>>>     void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t

>>>>>> reg, uint32_t v);

>>>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,

>>>>>> +struct

>>>>> amdgpu_vmhub *hub,

>>>>>> +		unsigned eng, u32 req, uint32_t vmid);

>>>>>>     int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool

>init);

>>>>>>     int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool

>init);

>>>>>>     int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff

>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>>>> index ed467de..6a886d9 100644

>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>>>> @@ -339,6 +339,11 @@ static void gmc_v9_0_flush_gpu_tlb(struct

>>>>> amdgpu_device *adev,

>>>>>>     		struct amdgpu_vmhub *hub = &adev->vmhub[i];

>>>>>>     		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);

>>>>>>

>>>>>> +		if (amdgpu_sriov_vf(adev) && amdgpu_sriov_runtime(adev)) {

>>>>>> +			amdgpu_virt_kiq_invalidate_tlb(adev, hub, eng, tmp,

>>>>> vmid);

>>>>>> +			continue;

>>>>>> +		}

>>>>>> +

>>>>>>     		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);

>>>>>>

>>>>>>     		/* Busy wait for ACK.*/

>>>> _______________________________________________

>>>> amd-gfx mailing list

>>>> amd-gfx@lists.freedesktop.org

>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

>

>_______________________________________________

>amd-gfx mailing list

>amd-gfx@lists.freedesktop.org

>https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Am 14.08.2018 um 10:35 schrieb Deng, Emily:
>> -----Original Message-----
>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
>> Christian König
>> Sent: Tuesday, August 14, 2018 4:32 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to do
>> invalidate tlb
>>
>> Am 14.08.2018 um 10:26 schrieb Deng, Emily:
>>>> -----Original Message-----
>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> Sent: Tuesday, August 14, 2018 4:20 PM
>>>> To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian
>>>> <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to
>>>> do invalidate tlb
>>>>
>>>> Am 14.08.2018 um 10:13 schrieb Deng, Emily:
>>>>>> -----Original Message-----
>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>>>> Sent: Tuesday, August 14, 2018 3:54 PM
>>>>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>>>>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq
>>>>>> to do invalidate tlb
>>>>>>
>>>>>> Am 14.08.2018 um 09:46 schrieb Emily Deng:
>>>>>>> To avoid the tlb flush not interrupted by world switch, use kiq
>>>>>>> and one command to do tlb invalidate.
>>>>>> Well NAK, this just duplicates the TLB handling and moves it
>>>>>> outside of the GMC code.
>>>>> No, it not duplicates the TLB handling, it only send one command,
>>>>> and not duplicate. With kiq, it only use one command to do the
>>>>> invalidate tlb and
>>>> wait ack, and won't be interrupted by world switch.
>>>>
>>>> That is effectively duplicating the TLB handling.
>>>>
>>>>>> Instead just lower the timeout and suppress the warning when SRIOV
>>>>>> is
>>>> active.
>>>>> With the kiq to do tlb flush, no warning issue expected.
>>>> Either fix that issue thoughtfully or let us live with the warning message.
>>>>
>>> Sorry, I don't know your mean, with one command, it will have no any
>>> warning. It will wait the ack in the same command.
>>>> But please no more SRIOV workaround we can't test on bare metal and
>>>> break things with older firmware.
>>> It seems a workaround here, but it truly is not a workaround. I think it is doing
>> the right thing.
>>> As the new firmware support the one command, why we need use two
>>> commands to do the invalidate and wait ack, and we don't know when it will
>> sent the ack back?
>>
>> The problem is that you depend on new firmware here.
>> See firmware and driver are distributed separately and we got into quite a
>> bunch of problems because the ring_emit_reg_write_reg_wait() isn't correctly
>> implemented.
> So it uses sriov to distinguish this, as sriov always use the tip firmware.

That assumption is not correct. You SRIOV users should use the tip 
firmware, but there is no guarantee that they are actually doing it.

> Do you think need to add the firmware version check here?

Yes, please do so.

> And also add to the below code?

No, probably best approach would be to use the same code path for both 
bare metal and SRIOV.

Regards,
Christian.

> Best wishes
> Emily Deng
>
>> See gfx_v9_0_ring_emit_reg_write_reg_wait as well:
>>> static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring
>>> *ring,
>>>                                                    uint32_t reg0,
>>> uint32_t reg1,
>>>                                                    uint32_t ref,
>>> uint32_t mask) {
>>>          int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
>>>
>>>          if (amdgpu_sriov_vf(ring->adev))
>>>                  gfx_v9_0_wait_reg_mem(ring, usepfp, 0, 1, reg0, reg1,
>>>                                        ref, mask, 0x20);
>>>          else
>>>                  amdgpu_ring_emit_reg_write_reg_wait_helper(ring, reg0,
>>> reg1,
>>>                                                             ref, mask);
>>> }
>> Before we add any more broken workarounds like that please fix the existing
>> code to check for the for the firmware version and NOT if SRIOV is present or
>> not.
>>
>> Regards,
>> Christian.
>>
>>>> Regards,
>>>> Christian.
>>>>
>>>>> Best wishes
>>>>> Emily Deng
>>>>>> Christian.
>>>>>>
>>>>>>> SWDEV-161497
>>>>>>>
>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>>>> ---
>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 50
>>>>>> ++++++++++++++++++++++++++++++++
>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++
>>>>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  5 ++++
>>>>>>>      3 files changed, 57 insertions(+)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>>>>> index 21adb1b6..aa6ddcc 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
>>>>>>> @@ -233,6 +233,56 @@ void amdgpu_virt_kiq_wreg(struct
>>>> amdgpu_device
>>>>>> *adev, uint32_t reg, uint32_t v)
>>>>>>>      	pr_err("failed to write reg:%x\n", reg);
>>>>>>>      }
>>>>>>>
>>>>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,
>>>>>>> +struct
>>>>>> amdgpu_vmhub *hub,
>>>>>>> +		unsigned eng, u32 req, uint32_t vmid) {
>>>>>>> +	signed long r, cnt = 0;
>>>>>>> +	unsigned long flags;
>>>>>>> +	uint32_t seq;
>>>>>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>>>>>> +	struct amdgpu_ring *ring = &kiq->ring;
>>>>>>> +
>>>>>>> +	BUG_ON(!ring->funcs->emit_reg_write_reg_wait);
>>>>>>> +
>>>>>>> +	spin_lock_irqsave(&kiq->ring_lock, flags);
>>>>>>> +	amdgpu_ring_alloc(ring, 32);
>>>>>>> +	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req +
>>>>>> eng,
>>>>>>> +					    hub->vm_inv_eng0_ack + eng,
>>>>>>> +					    req, 1 << vmid);
>>>>>>> +	amdgpu_fence_emit_polling(ring, &seq);
>>>>>>> +	amdgpu_ring_commit(ring);
>>>>>>> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);
>>>>>>> +
>>>>>>> +	r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT);
>>>>>>> +
>>>>>>> +	/* don't wait anymore for gpu reset case because this way may
>>>>>>> +	 * block gpu_recover() routine forever, e.g. this virt_kiq_rreg
>>>>>>> +	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will
>>>>>>> +	 * never return if we keep waiting in virt_kiq_rreg, which cause
>>>>>>> +	 * gpu_recover() hang there.
>>>>>>> +	 *
>>>>>>> +	 * also don't wait anymore for IRQ context
>>>>>>> +	 * */
>>>>>>> +	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))
>>>>>>> +		goto failed_kiq;
>>>>>>> +
>>>>>>> +	if (in_interrupt())
>>>>>>> +		might_sleep();
>>>>>>> +
>>>>>>> +	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {
>>>>>>> +		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);
>>>>>>> +		r = amdgpu_fence_wait_polling(ring, seq,
>>>>>> MAX_KIQ_REG_WAIT);
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	if (cnt > MAX_KIQ_REG_TRY)
>>>>>>> +		goto failed_kiq;
>>>>>>> +
>>>>>>> +	return;
>>>>>>> +
>>>>>>> +failed_kiq:
>>>>>>> +	pr_err("failed to invalidate tlb with kiq\n"); }
>>>>>>> +
>>>>>>>      /**
>>>>>>>       * amdgpu_virt_request_full_gpu() - request full gpu access
>>>>>>>       * @amdgpu:	amdgpu device.
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>> index 880ac11..a2e3c78 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
>>>>>>> @@ -288,6 +288,8 @@ void amdgpu_free_static_csa(struct
>>>> amdgpu_device
>>>>>> *adev);
>>>>>>>      void amdgpu_virt_init_setting(struct amdgpu_device *adev);
>>>>>>>      uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev,
>>>>>>> uint32_t
>>>> reg);
>>>>>>>      void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t
>>>>>>> reg, uint32_t v);
>>>>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,
>>>>>>> +struct
>>>>>> amdgpu_vmhub *hub,
>>>>>>> +		unsigned eng, u32 req, uint32_t vmid);
>>>>>>>      int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool
>> init);
>>>>>>>      int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool
>> init);
>>>>>>>      int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff
>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> index ed467de..6a886d9 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> @@ -339,6 +339,11 @@ static void gmc_v9_0_flush_gpu_tlb(struct
>>>>>> amdgpu_device *adev,
>>>>>>>      		struct amdgpu_vmhub *hub = &adev->vmhub[i];
>>>>>>>      		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>>>>>>>
>>>>>>> +		if (amdgpu_sriov_vf(adev) && amdgpu_sriov_runtime(adev)) {
>>>>>>> +			amdgpu_virt_kiq_invalidate_tlb(adev, hub, eng, tmp,
>>>>>> vmid);
>>>>>>> +			continue;
>>>>>>> +		}
>>>>>>> +
>>>>>>>      		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
>>>>>>>
>>>>>>>      		/* Busy wait for ACK.*/
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx@lists.freedesktop.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>-----Original Message-----

>From: Koenig, Christian

>Sent: Tuesday, August 14, 2018 4:38 PM

>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to do

>invalidate tlb

>

>Am 14.08.2018 um 10:35 schrieb Deng, Emily:

>>> -----Original Message-----

>>> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of

>>> Christian König

>>> Sent: Tuesday, August 14, 2018 4:32 PM

>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq to

>>> do invalidate tlb

>>>

>>> Am 14.08.2018 um 10:26 schrieb Deng, Emily:

>>>>> -----Original Message-----

>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>>>>> Sent: Tuesday, August 14, 2018 4:20 PM

>>>>> To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian

>>>>> <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org

>>>>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq

>>>>> to do invalidate tlb

>>>>>

>>>>> Am 14.08.2018 um 10:13 schrieb Deng, Emily:

>>>>>>> -----Original Message-----

>>>>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>>>>>>> Sent: Tuesday, August 14, 2018 3:54 PM

>>>>>>> To: Deng, Emily <Emily.Deng@amd.com>;

>>>>>>> amd-gfx@lists.freedesktop.org

>>>>>>> Subject: Re: [PATCH] drm/amdgpu/sriov: For sriov runtime, use kiq

>>>>>>> to do invalidate tlb

>>>>>>>

>>>>>>> Am 14.08.2018 um 09:46 schrieb Emily Deng:

>>>>>>>> To avoid the tlb flush not interrupted by world switch, use kiq

>>>>>>>> and one command to do tlb invalidate.

>>>>>>> Well NAK, this just duplicates the TLB handling and moves it

>>>>>>> outside of the GMC code.

>>>>>> No, it not duplicates the TLB handling, it only send one command,

>>>>>> and not duplicate. With kiq, it only use one command to do the

>>>>>> invalidate tlb and

>>>>> wait ack, and won't be interrupted by world switch.

>>>>>

>>>>> That is effectively duplicating the TLB handling.

>>>>>

>>>>>>> Instead just lower the timeout and suppress the warning when

>>>>>>> SRIOV is

>>>>> active.

>>>>>> With the kiq to do tlb flush, no warning issue expected.

>>>>> Either fix that issue thoughtfully or let us live with the warning message.

>>>>>

>>>> Sorry, I don't know your mean, with one command, it will have no any

>>>> warning. It will wait the ack in the same command.

>>>>> But please no more SRIOV workaround we can't test on bare metal and

>>>>> break things with older firmware.

>>>> It seems a workaround here, but it truly is not a workaround. I

>>>> think it is doing

>>> the right thing.

>>>> As the new firmware support the one command, why we need use two

>>>> commands to do the invalidate and wait ack, and we don't know when

>>>> it will

>>> sent the ack back?

>>>

>>> The problem is that you depend on new firmware here.

>>> See firmware and driver are distributed separately and we got into

>>> quite a bunch of problems because the ring_emit_reg_write_reg_wait()

>>> isn't correctly implemented.

>> So it uses sriov to distinguish this, as sriov always use the tip firmware.

>

>That assumption is not correct. You SRIOV users should use the tip firmware,

>but there is no guarantee that they are actually doing it.

>

>> Do you think need to add the firmware version check here?

>

>Yes, please do so.

Ok, I will send a v2 patch
>> And also add to the below code?

>

>No, probably best approach would be to use the same code path for both bare

>metal and SRIOV.

But the bare metal doesn't have world switch, the SRIOV has, and don't know when it will
be switched back to this VF. So I think for SRIOV, it will be better to use one command.
And it have another issue that I think you already know what I am mentioning.
>

>Regards,

>Christian.

>

>> Best wishes

>> Emily Deng

>>

>>> See gfx_v9_0_ring_emit_reg_write_reg_wait as well:

>>>> static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring

>>>> *ring,

>>>>                                                    uint32_t reg0,

>>>> uint32_t reg1,

>>>>                                                    uint32_t ref,

>>>> uint32_t mask) {

>>>>          int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);

>>>>

>>>>          if (amdgpu_sriov_vf(ring->adev))

>>>>                  gfx_v9_0_wait_reg_mem(ring, usepfp, 0, 1, reg0,

>>>> reg1,

>>>>                                        ref, mask, 0x20);

>>>>          else

>>>>                  amdgpu_ring_emit_reg_write_reg_wait_helper(ring,

>>>> reg0, reg1,

>>>>                                                             ref,

>>>> mask); }

>>> Before we add any more broken workarounds like that please fix the

>>> existing code to check for the for the firmware version and NOT if

>>> SRIOV is present or not.

>>>

>>> Regards,

>>> Christian.

>>>

>>>>> Regards,

>>>>> Christian.

>>>>>

>>>>>> Best wishes

>>>>>> Emily Deng

>>>>>>> Christian.

>>>>>>>

>>>>>>>> SWDEV-161497

>>>>>>>>

>>>>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

>>>>>>>> ---

>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 50

>>>>>>> ++++++++++++++++++++++++++++++++

>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h |  2 ++

>>>>>>>>      drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    |  5 ++++

>>>>>>>>      3 files changed, 57 insertions(+)

>>>>>>>>

>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>>>>>> index 21adb1b6..aa6ddcc 100644

>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

>>>>>>>> @@ -233,6 +233,56 @@ void amdgpu_virt_kiq_wreg(struct

>>>>> amdgpu_device

>>>>>>> *adev, uint32_t reg, uint32_t v)

>>>>>>>>      	pr_err("failed to write reg:%x\n", reg);

>>>>>>>>      }

>>>>>>>>

>>>>>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,

>>>>>>>> +struct

>>>>>>> amdgpu_vmhub *hub,

>>>>>>>> +		unsigned eng, u32 req, uint32_t vmid) {

>>>>>>>> +	signed long r, cnt = 0;

>>>>>>>> +	unsigned long flags;

>>>>>>>> +	uint32_t seq;

>>>>>>>> +	struct amdgpu_kiq *kiq = &adev->gfx.kiq;

>>>>>>>> +	struct amdgpu_ring *ring = &kiq->ring;

>>>>>>>> +

>>>>>>>> +	BUG_ON(!ring->funcs->emit_reg_write_reg_wait);

>>>>>>>> +

>>>>>>>> +	spin_lock_irqsave(&kiq->ring_lock, flags);

>>>>>>>> +	amdgpu_ring_alloc(ring, 32);

>>>>>>>> +	amdgpu_ring_emit_reg_write_reg_wait(ring, hub-

>>vm_inv_eng0_req

>>>>>>>> ++

>>>>>>> eng,

>>>>>>>> +					    hub->vm_inv_eng0_ack + eng,

>>>>>>>> +					    req, 1 << vmid);

>>>>>>>> +	amdgpu_fence_emit_polling(ring, &seq);

>>>>>>>> +	amdgpu_ring_commit(ring);

>>>>>>>> +	spin_unlock_irqrestore(&kiq->ring_lock, flags);

>>>>>>>> +

>>>>>>>> +	r = amdgpu_fence_wait_polling(ring, seq,

>MAX_KIQ_REG_WAIT);

>>>>>>>> +

>>>>>>>> +	/* don't wait anymore for gpu reset case because this way may

>>>>>>>> +	 * block gpu_recover() routine forever, e.g. this virt_kiq_rreg

>>>>>>>> +	 * is triggered in TTM and ttm_bo_lock_delayed_workqueue()

>will

>>>>>>>> +	 * never return if we keep waiting in virt_kiq_rreg, which cause

>>>>>>>> +	 * gpu_recover() hang there.

>>>>>>>> +	 *

>>>>>>>> +	 * also don't wait anymore for IRQ context

>>>>>>>> +	 * */

>>>>>>>> +	if (r < 1 && (adev->in_gpu_reset || in_interrupt()))

>>>>>>>> +		goto failed_kiq;

>>>>>>>> +

>>>>>>>> +	if (in_interrupt())

>>>>>>>> +		might_sleep();

>>>>>>>> +

>>>>>>>> +	while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) {

>>>>>>>> +		msleep(MAX_KIQ_REG_BAILOUT_INTERVAL);

>>>>>>>> +		r = amdgpu_fence_wait_polling(ring, seq,

>>>>>>> MAX_KIQ_REG_WAIT);

>>>>>>>> +	}

>>>>>>>> +

>>>>>>>> +	if (cnt > MAX_KIQ_REG_TRY)

>>>>>>>> +		goto failed_kiq;

>>>>>>>> +

>>>>>>>> +	return;

>>>>>>>> +

>>>>>>>> +failed_kiq:

>>>>>>>> +	pr_err("failed to invalidate tlb with kiq\n"); }

>>>>>>>> +

>>>>>>>>      /**

>>>>>>>>       * amdgpu_virt_request_full_gpu() - request full gpu access

>>>>>>>>       * @amdgpu:	amdgpu device.

>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>>>>>> index 880ac11..a2e3c78 100644

>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

>>>>>>>> @@ -288,6 +288,8 @@ void amdgpu_free_static_csa(struct

>>>>> amdgpu_device

>>>>>>> *adev);

>>>>>>>>      void amdgpu_virt_init_setting(struct amdgpu_device *adev);

>>>>>>>>      uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev,

>>>>>>>> uint32_t

>>>>> reg);

>>>>>>>>      void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev,

>>>>>>>> uint32_t reg, uint32_t v);

>>>>>>>> +void amdgpu_virt_kiq_invalidate_tlb(struct amdgpu_device *adev,

>>>>>>>> +struct

>>>>>>> amdgpu_vmhub *hub,

>>>>>>>> +		unsigned eng, u32 req, uint32_t vmid);

>>>>>>>>      int amdgpu_virt_request_full_gpu(struct amdgpu_device

>>>>>>>> *adev, bool

>>> init);

>>>>>>>>      int amdgpu_virt_release_full_gpu(struct amdgpu_device

>>>>>>>> *adev, bool

>>> init);

>>>>>>>>      int amdgpu_virt_reset_gpu(struct amdgpu_device *adev); diff

>>>>>>>> --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>>>>>> index ed467de..6a886d9 100644

>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

>>>>>>>> @@ -339,6 +339,11 @@ static void gmc_v9_0_flush_gpu_tlb(struct

>>>>>>> amdgpu_device *adev,

>>>>>>>>      		struct amdgpu_vmhub *hub = &adev->vmhub[i];

>>>>>>>>      		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);

>>>>>>>>

>>>>>>>> +		if (amdgpu_sriov_vf(adev) &&

>amdgpu_sriov_runtime(adev)) {

>>>>>>>> +			amdgpu_virt_kiq_invalidate_tlb(adev, hub, eng,

>tmp,

>>>>>>> vmid);

>>>>>>>> +			continue;

>>>>>>>> +		}

>>>>>>>> +

>>>>>>>>      		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);

>>>>>>>>

>>>>>>>>      		/* Busy wait for ACK.*/

>>>>>> _______________________________________________

>>>>>> amd-gfx mailing list

>>>>>> amd-gfx@lists.freedesktop.org

>>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx

>>> _______________________________________________

>>> amd-gfx mailing list

>>> amd-gfx@lists.freedesktop.org

>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx