drm/amdgpu: introduce vram lost paramter for reset function

Submitted by Liu, Monk on Aug. 23, 2019, 3:34 a.m.

Details

Message ID 1566531249-1396-1-git-send-email-Monk.Liu@amd.com
State New
Headers show
Series "drm/amdgpu: introduce vram lost paramter for reset function" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Liu, Monk Aug. 23, 2019, 3:34 a.m.
for SOC15/vega10 the BACO reset would introduce vram lost in
the high end address range and current kmd's vram lost
checking cannot catch it since it only check visible frame buffer

TODO:
to confirm if mode1/2 reset would introduce vram lost

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----
 drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-
 drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---
 drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-
 drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-
 drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-
 7 files changed, 22 insertions(+), 14 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f6ae565..1fe3756 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -552,7 +552,7 @@  struct amdgpu_asic_funcs {
 	int (*read_register)(struct amdgpu_device *adev, u32 se_num,
 			     u32 sh_num, u32 reg_offset, u32 *value);
 	void (*set_vga_state)(struct amdgpu_device *adev, bool state);
-	int (*reset)(struct amdgpu_device *adev);
+	int (*reset)(struct amdgpu_device *adev, bool *lost);
 	enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);
 	/* get the reference clock */
 	u32 (*get_xclk)(struct amdgpu_device *adev);
@@ -1136,7 +1136,7 @@  int emu_soc_asic_init(struct amdgpu_device *adev);
  * ASICs macro.
  */
 #define amdgpu_asic_set_vga_state(adev, state) (adev)->asic_funcs->set_vga_state((adev), (state))
-#define amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))
+#define amdgpu_asic_reset(adev, lost) (adev)->asic_funcs->reset((adev), (lost))
 #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev))
 #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))
 #define amdgpu_asic_set_uvd_clocks(adev, v, d) (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 02b3e7d..8668cb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2546,7 +2546,7 @@  static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
 
-	adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);
 	if (adev->asic_reset_res)
 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
 			 adev->asic_reset_res, adev->ddev->unique);
@@ -2751,7 +2751,7 @@  int amdgpu_device_init(struct amdgpu_device *adev,
 	 *  E.g., driver was not cleanly unloaded previously, etc.
 	 */
 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-		r = amdgpu_asic_reset(adev);
+		r = amdgpu_asic_reset(adev, NULL);
 		if (r) {
 			dev_err(adev->dev, "asic reset on init failed\n");
 			goto failed;
@@ -3084,7 +3084,7 @@  int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
 		pci_disable_device(dev->pdev);
 		pci_set_power_state(dev->pdev, PCI_D3hot);
 	} else {
-		r = amdgpu_asic_reset(adev);
+		r = amdgpu_asic_reset(adev, NULL);
 		if (r)
 			DRM_ERROR("amdgpu asic reset failed\n");
 	}
@@ -3604,7 +3604,7 @@  static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
 					r = -EALREADY;
 			} else
-				r = amdgpu_asic_reset(tmp_adev);
+				r = amdgpu_asic_reset(tmp_adev, &vram_lost);
 
 			if (r) {
 				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
@@ -3645,7 +3645,9 @@  static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 				if (r)
 					goto out;
 
-				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+				if (!vram_lost)
+					vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+
 				if (vram_lost) {
 					DRM_INFO("VRAM is lost due to GPU reset!\n");
 					atomic_inc(&tmp_adev->vram_lost_counter);
diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c b/drivers/gpu/drm/amd/amdgpu/cik.c
index 7b63d7a..0f25b82 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik.c
@@ -1277,7 +1277,7 @@  static int cik_gpu_pci_config_reset(struct amdgpu_device *adev)
  * to reset them.
  * Returns 0 for success.
  */
-static int cik_asic_reset(struct amdgpu_device *adev)
+static int cik_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 	int r;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index a3d99f2..53de7a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -301,7 +301,7 @@  nv_asic_reset_method(struct amdgpu_device *adev)
 		return AMD_RESET_METHOD_MODE1;
 }
 
-static int nv_asic_reset(struct amdgpu_device *adev)
+static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 
 	/* FIXME: it doesn't work since vega10 */
@@ -315,10 +315,14 @@  static int nv_asic_reset(struct amdgpu_device *adev)
 	int ret = 0;
 	struct smu_context *smu = &adev->smu;
 
-	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
+	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+		if (vramlost)
+			*vramlost = true;
 		ret = smu_baco_reset(smu);
-	else
+	}
+	else {
 		ret = nv_asic_mode1_reset(adev);
+	}
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/si.c b/drivers/gpu/drm/amd/amdgpu/si.c
index 9043614..f324099 100644
--- a/drivers/gpu/drm/amd/amdgpu/si.c
+++ b/drivers/gpu/drm/amd/amdgpu/si.c
@@ -1180,7 +1180,7 @@  static bool si_read_bios_from_rom(struct amdgpu_device *adev,
 }
 
 //xxx: not implemented
-static int si_asic_reset(struct amdgpu_device *adev)
+static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index fe2212df..12b2966 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -553,10 +553,12 @@  soc15_asic_reset_method(struct amdgpu_device *adev)
 		return AMD_RESET_METHOD_MODE1;
 }
 
-static int soc15_asic_reset(struct amdgpu_device *adev)
+static int soc15_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 	switch (soc15_asic_reset_method(adev)) {
 		case AMD_RESET_METHOD_BACO:
+			if (vramlost)
+				*vramlost = true;
 			return soc15_asic_baco_reset(adev);
 		case AMD_RESET_METHOD_MODE2:
 			return soc15_mode2_reset(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index 56c882b..8eceb00 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -696,7 +696,7 @@  static int vi_gpu_pci_config_reset(struct amdgpu_device *adev)
  * to reset them.
  * Returns 0 for success.
  */
-static int vi_asic_reset(struct amdgpu_device *adev)
+static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)
 {
 	int r;
 

Comments

Am 23.08.19 um 05:34 schrieb Monk Liu:
> for SOC15/vega10 the BACO reset would introduce vram lost in
> the high end address range and current kmd's vram lost
> checking cannot catch it since it only check visible frame buffer
>
> TODO:
> to confirm if mode1/2 reset would introduce vram lost

Looks good in general, but I would make the value mandatory or maybe use 
a special return code instead.

On the other hand wouldn't it be simpler to just increment 
vram_lost_counter?

Christian.

>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----
>   drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-
>   drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---
>   drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-
>   drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-
>   drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-
>   7 files changed, 22 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f6ae565..1fe3756 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {
>   	int (*read_register)(struct amdgpu_device *adev, u32 se_num,
>   			     u32 sh_num, u32 reg_offset, u32 *value);
>   	void (*set_vga_state)(struct amdgpu_device *adev, bool state);
> -	int (*reset)(struct amdgpu_device *adev);
> +	int (*reset)(struct amdgpu_device *adev, bool *lost);
>   	enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);
>   	/* get the reference clock */
>   	u32 (*get_xclk)(struct amdgpu_device *adev);
> @@ -1136,7 +1136,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
>    * ASICs macro.
>    */
>   #define amdgpu_asic_set_vga_state(adev, state) (adev)->asic_funcs->set_vga_state((adev), (state))
> -#define amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))
> +#define amdgpu_asic_reset(adev, lost) (adev)->asic_funcs->reset((adev), (lost))
>   #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev))
>   #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))
>   #define amdgpu_asic_set_uvd_clocks(adev, v, d) (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 02b3e7d..8668cb8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>   	struct amdgpu_device *adev =
>   		container_of(__work, struct amdgpu_device, xgmi_reset_work);
>   
> -	adev->asic_reset_res =  amdgpu_asic_reset(adev);
> +	adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);
>   	if (adev->asic_reset_res)
>   		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>   			 adev->asic_reset_res, adev->ddev->unique);
> @@ -2751,7 +2751,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   	 *  E.g., driver was not cleanly unloaded previously, etc.
>   	 */
>   	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
> -		r = amdgpu_asic_reset(adev);
> +		r = amdgpu_asic_reset(adev, NULL);
>   		if (r) {
>   			dev_err(adev->dev, "asic reset on init failed\n");
>   			goto failed;
> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
>   		pci_disable_device(dev->pdev);
>   		pci_set_power_state(dev->pdev, PCI_D3hot);
>   	} else {
> -		r = amdgpu_asic_reset(adev);
> +		r = amdgpu_asic_reset(adev, NULL);
>   		if (r)
>   			DRM_ERROR("amdgpu asic reset failed\n");
>   	}
> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
>   					r = -EALREADY;
>   			} else
> -				r = amdgpu_asic_reset(tmp_adev);
> +				r = amdgpu_asic_reset(tmp_adev, &vram_lost);
>   
>   			if (r) {
>   				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
> @@ -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>   				if (r)
>   					goto out;
>   
> -				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
> +				if (!vram_lost)
> +					vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
> +
>   				if (vram_lost) {
>   					DRM_INFO("VRAM is lost due to GPU reset!\n");
>   					atomic_inc(&tmp_adev->vram_lost_counter);
> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c b/drivers/gpu/drm/amd/amdgpu/cik.c
> index 7b63d7a..0f25b82 100644
> --- a/drivers/gpu/drm/amd/amdgpu/cik.c
> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct amdgpu_device *adev)
>    * to reset them.
>    * Returns 0 for success.
>    */
> -static int cik_asic_reset(struct amdgpu_device *adev)
> +static int cik_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>   {
>   	int r;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
> index a3d99f2..53de7a6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)
>   		return AMD_RESET_METHOD_MODE1;
>   }
>   
> -static int nv_asic_reset(struct amdgpu_device *adev)
> +static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>   {
>   
>   	/* FIXME: it doesn't work since vega10 */
> @@ -315,10 +315,14 @@ static int nv_asic_reset(struct amdgpu_device *adev)
>   	int ret = 0;
>   	struct smu_context *smu = &adev->smu;
>   
> -	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
> +	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
> +		if (vramlost)
> +			*vramlost = true;
>   		ret = smu_baco_reset(smu);
> -	else
> +	}
> +	else {
>   		ret = nv_asic_mode1_reset(adev);
> +	}
>   
>   	return ret;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c b/drivers/gpu/drm/amd/amdgpu/si.c
> index 9043614..f324099 100644
> --- a/drivers/gpu/drm/amd/amdgpu/si.c
> +++ b/drivers/gpu/drm/amd/amdgpu/si.c
> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device *adev,
>   }
>   
>   //xxx: not implemented
> -static int si_asic_reset(struct amdgpu_device *adev)
> +static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>   {
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index fe2212df..12b2966 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
>   		return AMD_RESET_METHOD_MODE1;
>   }
>   
> -static int soc15_asic_reset(struct amdgpu_device *adev)
> +static int soc15_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>   {
>   	switch (soc15_asic_reset_method(adev)) {
>   		case AMD_RESET_METHOD_BACO:
> +			if (vramlost)
> +				*vramlost = true;
>   			return soc15_asic_baco_reset(adev);
>   		case AMD_RESET_METHOD_MODE2:
>   			return soc15_mode2_reset(adev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
> index 56c882b..8eceb00 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device *adev)
>    * to reset them.
>    * Returns 0 for success.
>    */
> -static int vi_asic_reset(struct amdgpu_device *adev)
> +static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)
>   {
>   	int r;
>
>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?

In where ? if you mean in amdgpu_device_recover routine I won't do that since the reset() can do any kind of reset like:
1) PF FLR
2) mode1/2 reset
3) magic reset through config space
4) BACO reset

PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO reset is definitely a vram lost reset 

If you increase the counter in general function that will be not accurate 
_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 

Sent: Friday, August 23, 2019 4:27 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset function

Am 23.08.19 um 05:34 schrieb Monk Liu:
> for SOC15/vega10 the BACO reset would introduce vram lost in the high 

> end address range and current kmd's vram lost checking cannot catch it 

> since it only check visible frame buffer

>

> TODO:

> to confirm if mode1/2 reset would introduce vram lost


Looks good in general, but I would make the value mandatory or maybe use a special return code instead.

On the other hand wouldn't it be simpler to just increment vram_lost_counter?

Christian.

>

> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--

>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----

>   drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-

>   drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---

>   drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-

>   drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-

>   drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-

>   7 files changed, 22 insertions(+), 14 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 

> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> index f6ae565..1fe3756 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {

>   	int (*read_register)(struct amdgpu_device *adev, u32 se_num,

>   			     u32 sh_num, u32 reg_offset, u32 *value);

>   	void (*set_vga_state)(struct amdgpu_device *adev, bool state);

> -	int (*reset)(struct amdgpu_device *adev);

> +	int (*reset)(struct amdgpu_device *adev, bool *lost);

>   	enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);

>   	/* get the reference clock */

>   	u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7 @@ 

> int emu_soc_asic_init(struct amdgpu_device *adev);

>    * ASICs macro.

>    */

>   #define amdgpu_asic_set_vga_state(adev, state) 

> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define 

> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))

> +#define amdgpu_asic_reset(adev, lost) 

> +(adev)->asic_funcs->reset((adev), (lost))

>   #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev))

>   #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))

>   #define amdgpu_asic_set_uvd_clocks(adev, v, d) 

> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git 

> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> index 02b3e7d..8668cb8 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>   	struct amdgpu_device *adev =

>   		container_of(__work, struct amdgpu_device, xgmi_reset_work);

>   

> -	adev->asic_reset_res =  amdgpu_asic_reset(adev);

> +	adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);

>   	if (adev->asic_reset_res)

>   		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>   			 adev->asic_reset_res, adev->ddev->unique); @@ -2751,7 +2751,7 @@ 

> int amdgpu_device_init(struct amdgpu_device *adev,

>   	 *  E.g., driver was not cleanly unloaded previously, etc.

>   	 */

>   	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {

> -		r = amdgpu_asic_reset(adev);

> +		r = amdgpu_asic_reset(adev, NULL);

>   		if (r) {

>   			dev_err(adev->dev, "asic reset on init failed\n");

>   			goto failed;

> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)

>   		pci_disable_device(dev->pdev);

>   		pci_set_power_state(dev->pdev, PCI_D3hot);

>   	} else {

> -		r = amdgpu_asic_reset(adev);

> +		r = amdgpu_asic_reset(adev, NULL);

>   		if (r)

>   			DRM_ERROR("amdgpu asic reset failed\n");

>   	}

> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>   				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))

>   					r = -EALREADY;

>   			} else

> -				r = amdgpu_asic_reset(tmp_adev);

> +				r = amdgpu_asic_reset(tmp_adev, &vram_lost);

>   

>   			if (r) {

>   				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", @@ 

> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>   				if (r)

>   					goto out;

>   

> -				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

> +				if (!vram_lost)

> +					vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

> +

>   				if (vram_lost) {

>   					DRM_INFO("VRAM is lost due to GPU reset!\n");

>   					atomic_inc(&tmp_adev->vram_lost_counter);

> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c 

> b/drivers/gpu/drm/amd/amdgpu/cik.c

> index 7b63d7a..0f25b82 100644

> --- a/drivers/gpu/drm/amd/amdgpu/cik.c

> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c

> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct amdgpu_device *adev)

>    * to reset them.

>    * Returns 0 for success.

>    */

> -static int cik_asic_reset(struct amdgpu_device *adev)

> +static int cik_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>   {

>   	int r;

>   

> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c 

> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/nv.c

> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c

> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)

>   		return AMD_RESET_METHOD_MODE1;

>   }

>   

> -static int nv_asic_reset(struct amdgpu_device *adev)

> +static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>   {

>   

>   	/* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@ 

> static int nv_asic_reset(struct amdgpu_device *adev)

>   	int ret = 0;

>   	struct smu_context *smu = &adev->smu;

>   

> -	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

> +	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

> +		if (vramlost)

> +			*vramlost = true;

>   		ret = smu_baco_reset(smu);

> -	else

> +	}

> +	else {

>   		ret = nv_asic_mode1_reset(adev);

> +	}

>   

>   	return ret;

>   }

> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c 

> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644

> --- a/drivers/gpu/drm/amd/amdgpu/si.c

> +++ b/drivers/gpu/drm/amd/amdgpu/si.c

> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device *adev,

>   }

>   

>   //xxx: not implemented

> -static int si_asic_reset(struct amdgpu_device *adev)

> +static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>   {

>   	return 0;

>   }

> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 

> b/drivers/gpu/drm/amd/amdgpu/soc15.c

> index fe2212df..12b2966 100644

> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c

> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c

> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)

>   		return AMD_RESET_METHOD_MODE1;

>   }

>   

> -static int soc15_asic_reset(struct amdgpu_device *adev)

> +static int soc15_asic_reset(struct amdgpu_device *adev, bool 

> +*vramlost)

>   {

>   	switch (soc15_asic_reset_method(adev)) {

>   		case AMD_RESET_METHOD_BACO:

> +			if (vramlost)

> +				*vramlost = true;

>   			return soc15_asic_baco_reset(adev);

>   		case AMD_RESET_METHOD_MODE2:

>   			return soc15_mode2_reset(adev);

> diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c 

> b/drivers/gpu/drm/amd/amdgpu/vi.c index 56c882b..8eceb00 100644

> --- a/drivers/gpu/drm/amd/amdgpu/vi.c

> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c

> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device *adev)

>    * to reset them.

>    * Returns 0 for success.

>    */

> -static int vi_asic_reset(struct amdgpu_device *adev)

> +static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>   {

>   	int r;

>
I thought in the BACO reset function.

The top level reset function doesn't do much more than increment the 
vram_lost_counter either.

Christian.

Am 23.08.19 um 10:32 schrieb Liu, Monk:
>>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?

> In where ? if you mean in amdgpu_device_recover routine I won't do that since the reset() can do any kind of reset like:

> 1) PF FLR

> 2) mode1/2 reset

> 3) magic reset through config space

> 4) BACO reset

>

> PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO reset is definitely a vram lost reset

>

> If you increase the counter in general function that will be not accurate

> _____________________________________

> Monk Liu|GPU Virtualization Team |AMD

>

>

> -----Original Message-----

> From: Christian König <ckoenig.leichtzumerken@gmail.com>

> Sent: Friday, August 23, 2019 4:27 PM

> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org

> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset function

>

> Am 23.08.19 um 05:34 schrieb Monk Liu:

>> for SOC15/vega10 the BACO reset would introduce vram lost in the high

>> end address range and current kmd's vram lost checking cannot catch it

>> since it only check visible frame buffer

>>

>> TODO:

>> to confirm if mode1/2 reset would introduce vram lost

> Looks good in general, but I would make the value mandatory or maybe use a special return code instead.

>

> On the other hand wouldn't it be simpler to just increment vram_lost_counter?

>

> Christian.

>

>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

>> ---

>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--

>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----

>>    drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-

>>    drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---

>>    drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-

>>    drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-

>>    drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-

>>    7 files changed, 22 insertions(+), 14 deletions(-)

>>

>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> index f6ae565..1fe3756 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {

>>    	int (*read_register)(struct amdgpu_device *adev, u32 se_num,

>>    			     u32 sh_num, u32 reg_offset, u32 *value);

>>    	void (*set_vga_state)(struct amdgpu_device *adev, bool state);

>> -	int (*reset)(struct amdgpu_device *adev);

>> +	int (*reset)(struct amdgpu_device *adev, bool *lost);

>>    	enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);

>>    	/* get the reference clock */

>>    	u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7 @@

>> int emu_soc_asic_init(struct amdgpu_device *adev);

>>     * ASICs macro.

>>     */

>>    #define amdgpu_asic_set_vga_state(adev, state)

>> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define

>> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))

>> +#define amdgpu_asic_reset(adev, lost)

>> +(adev)->asic_funcs->reset((adev), (lost))

>>    #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev))

>>    #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))

>>    #define amdgpu_asic_set_uvd_clocks(adev, v, d)

>> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git

>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> index 02b3e7d..8668cb8 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>>    	struct amdgpu_device *adev =

>>    		container_of(__work, struct amdgpu_device, xgmi_reset_work);

>>    

>> -	adev->asic_reset_res =  amdgpu_asic_reset(adev);

>> +	adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);

>>    	if (adev->asic_reset_res)

>>    		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>>    			 adev->asic_reset_res, adev->ddev->unique); @@ -2751,7 +2751,7 @@

>> int amdgpu_device_init(struct amdgpu_device *adev,

>>    	 *  E.g., driver was not cleanly unloaded previously, etc.

>>    	 */

>>    	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {

>> -		r = amdgpu_asic_reset(adev);

>> +		r = amdgpu_asic_reset(adev, NULL);

>>    		if (r) {

>>    			dev_err(adev->dev, "asic reset on init failed\n");

>>    			goto failed;

>> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)

>>    		pci_disable_device(dev->pdev);

>>    		pci_set_power_state(dev->pdev, PCI_D3hot);

>>    	} else {

>> -		r = amdgpu_asic_reset(adev);

>> +		r = amdgpu_asic_reset(adev, NULL);

>>    		if (r)

>>    			DRM_ERROR("amdgpu asic reset failed\n");

>>    	}

>> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>>    				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))

>>    					r = -EALREADY;

>>    			} else

>> -				r = amdgpu_asic_reset(tmp_adev);

>> +				r = amdgpu_asic_reset(tmp_adev, &vram_lost);

>>    

>>    			if (r) {

>>    				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", @@

>> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>>    				if (r)

>>    					goto out;

>>    

>> -				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>> +				if (!vram_lost)

>> +					vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>> +

>>    				if (vram_lost) {

>>    					DRM_INFO("VRAM is lost due to GPU reset!\n");

>>    					atomic_inc(&tmp_adev->vram_lost_counter);

>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c

>> b/drivers/gpu/drm/amd/amdgpu/cik.c

>> index 7b63d7a..0f25b82 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/cik.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c

>> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct amdgpu_device *adev)

>>     * to reset them.

>>     * Returns 0 for success.

>>     */

>> -static int cik_asic_reset(struct amdgpu_device *adev)

>> +static int cik_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>    {

>>    	int r;

>>    

>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c

>> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c

>> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)

>>    		return AMD_RESET_METHOD_MODE1;

>>    }

>>    

>> -static int nv_asic_reset(struct amdgpu_device *adev)

>> +static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>    {

>>    

>>    	/* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@

>> static int nv_asic_reset(struct amdgpu_device *adev)

>>    	int ret = 0;

>>    	struct smu_context *smu = &adev->smu;

>>    

>> -	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

>> +	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

>> +		if (vramlost)

>> +			*vramlost = true;

>>    		ret = smu_baco_reset(smu);

>> -	else

>> +	}

>> +	else {

>>    		ret = nv_asic_mode1_reset(adev);

>> +	}

>>    

>>    	return ret;

>>    }

>> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c

>> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/si.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/si.c

>> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device *adev,

>>    }

>>    

>>    //xxx: not implemented

>> -static int si_asic_reset(struct amdgpu_device *adev)

>> +static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>    {

>>    	return 0;

>>    }

>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c

>> b/drivers/gpu/drm/amd/amdgpu/soc15.c

>> index fe2212df..12b2966 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c

>> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)

>>    		return AMD_RESET_METHOD_MODE1;

>>    }

>>    

>> -static int soc15_asic_reset(struct amdgpu_device *adev)

>> +static int soc15_asic_reset(struct amdgpu_device *adev, bool

>> +*vramlost)

>>    {

>>    	switch (soc15_asic_reset_method(adev)) {

>>    		case AMD_RESET_METHOD_BACO:

>> +			if (vramlost)

>> +				*vramlost = true;

>>    			return soc15_asic_baco_reset(adev);

>>    		case AMD_RESET_METHOD_MODE2:

>>    			return soc15_mode2_reset(adev);

>> diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c

>> b/drivers/gpu/drm/amd/amdgpu/vi.c index 56c882b..8eceb00 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/vi.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c

>> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device *adev)

>>     * to reset them.

>>     * Returns 0 for success.

>>     */

>> -static int vi_asic_reset(struct amdgpu_device *adev)

>> +static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>    {

>>    	int r;

>>
>> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>>    				if (vram_lost) {

>>    					DRM_INFO("VRAM is lost due to GPU reset!\n");

>>    					atomic_inc(&tmp_adev->vram_lost_counter);


Above is the original logic, if we increment the counter in BACO reset routine, we would potentially 
Have another counter increasement if by coincidence the "amdgpu_device_check_vram_lost" successfully detected the vram lost (although right now it didn't ..)

Do you mean we remove the amdgpu_device_check_vram_lost(tmp_adev) in device_recovery() routine ?
_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 

Sent: Friday, August 23, 2019 4:34 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset function

I thought in the BACO reset function.

The top level reset function doesn't do much more than increment the vram_lost_counter either.

Christian.

Am 23.08.19 um 10:32 schrieb Liu, Monk:
>>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?

> In where ? if you mean in amdgpu_device_recover routine I won't do that since the reset() can do any kind of reset like:

> 1) PF FLR

> 2) mode1/2 reset

> 3) magic reset through config space

> 4) BACO reset

>

> PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO 

> reset is definitely a vram lost reset

>

> If you increase the counter in general function that will be not 

> accurate _____________________________________

> Monk Liu|GPU Virtualization Team |AMD

>

>

> -----Original Message-----

> From: Christian König <ckoenig.leichtzumerken@gmail.com>

> Sent: Friday, August 23, 2019 4:27 PM

> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org

> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for 

> reset function

>

> Am 23.08.19 um 05:34 schrieb Monk Liu:

>> for SOC15/vega10 the BACO reset would introduce vram lost in the high 

>> end address range and current kmd's vram lost checking cannot catch 

>> it since it only check visible frame buffer

>>

>> TODO:

>> to confirm if mode1/2 reset would introduce vram lost

> Looks good in general, but I would make the value mandatory or maybe use a special return code instead.

>

> On the other hand wouldn't it be simpler to just increment vram_lost_counter?

>

> Christian.

>

>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

>> ---

>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--

>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----

>>    drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-

>>    drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---

>>    drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-

>>    drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-

>>    drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-

>>    7 files changed, 22 insertions(+), 14 deletions(-)

>>

>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> index f6ae565..1fe3756 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {

>>    	int (*read_register)(struct amdgpu_device *adev, u32 se_num,

>>    			     u32 sh_num, u32 reg_offset, u32 *value);

>>    	void (*set_vga_state)(struct amdgpu_device *adev, bool state);

>> -	int (*reset)(struct amdgpu_device *adev);

>> +	int (*reset)(struct amdgpu_device *adev, bool *lost);

>>    	enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);

>>    	/* get the reference clock */

>>    	u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7 

>> @@ int emu_soc_asic_init(struct amdgpu_device *adev);

>>     * ASICs macro.

>>     */

>>    #define amdgpu_asic_set_vga_state(adev, state) 

>> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define

>> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))

>> +#define amdgpu_asic_reset(adev, lost) 

>> +(adev)->asic_funcs->reset((adev), (lost))

>>    #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev))

>>    #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))

>>    #define amdgpu_asic_set_uvd_clocks(adev, v, d) 

>> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git 

>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> index 02b3e7d..8668cb8 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>>    	struct amdgpu_device *adev =

>>    		container_of(__work, struct amdgpu_device, xgmi_reset_work);

>>    

>> -	adev->asic_reset_res =  amdgpu_asic_reset(adev);

>> +	adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);

>>    	if (adev->asic_reset_res)

>>    		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>>    			 adev->asic_reset_res, adev->ddev->unique); @@ -2751,7 +2751,7 

>> @@ int amdgpu_device_init(struct amdgpu_device *adev,

>>    	 *  E.g., driver was not cleanly unloaded previously, etc.

>>    	 */

>>    	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {

>> -		r = amdgpu_asic_reset(adev);

>> +		r = amdgpu_asic_reset(adev, NULL);

>>    		if (r) {

>>    			dev_err(adev->dev, "asic reset on init failed\n");

>>    			goto failed;

>> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)

>>    		pci_disable_device(dev->pdev);

>>    		pci_set_power_state(dev->pdev, PCI_D3hot);

>>    	} else {

>> -		r = amdgpu_asic_reset(adev);

>> +		r = amdgpu_asic_reset(adev, NULL);

>>    		if (r)

>>    			DRM_ERROR("amdgpu asic reset failed\n");

>>    	}

>> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>>    				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))

>>    					r = -EALREADY;

>>    			} else

>> -				r = amdgpu_asic_reset(tmp_adev);

>> +				r = amdgpu_asic_reset(tmp_adev, &vram_lost);

>>    

>>    			if (r) {

>>    				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", 

>> @@

>> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>>    				if (r)

>>    					goto out;

>>    

>> -				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>> +				if (!vram_lost)

>> +					vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>> +

>>    				if (vram_lost) {

>>    					DRM_INFO("VRAM is lost due to GPU reset!\n");

>>    					atomic_inc(&tmp_adev->vram_lost_counter);

>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c

>> b/drivers/gpu/drm/amd/amdgpu/cik.c

>> index 7b63d7a..0f25b82 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/cik.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c

>> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct amdgpu_device *adev)

>>     * to reset them.

>>     * Returns 0 for success.

>>     */

>> -static int cik_asic_reset(struct amdgpu_device *adev)

>> +static int cik_asic_reset(struct amdgpu_device *adev, bool 

>> +*vramlost)

>>    {

>>    	int r;

>>    

>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c 

>> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c

>> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)

>>    		return AMD_RESET_METHOD_MODE1;

>>    }

>>    

>> -static int nv_asic_reset(struct amdgpu_device *adev)

>> +static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>    {

>>    

>>    	/* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@ 

>> static int nv_asic_reset(struct amdgpu_device *adev)

>>    	int ret = 0;

>>    	struct smu_context *smu = &adev->smu;

>>    

>> -	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

>> +	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

>> +		if (vramlost)

>> +			*vramlost = true;

>>    		ret = smu_baco_reset(smu);

>> -	else

>> +	}

>> +	else {

>>    		ret = nv_asic_mode1_reset(adev);

>> +	}

>>    

>>    	return ret;

>>    }

>> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c 

>> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/si.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/si.c

>> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device *adev,

>>    }

>>    

>>    //xxx: not implemented

>> -static int si_asic_reset(struct amdgpu_device *adev)

>> +static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>    {

>>    	return 0;

>>    }

>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c

>> b/drivers/gpu/drm/amd/amdgpu/soc15.c

>> index fe2212df..12b2966 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c

>> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)

>>    		return AMD_RESET_METHOD_MODE1;

>>    }

>>    

>> -static int soc15_asic_reset(struct amdgpu_device *adev)

>> +static int soc15_asic_reset(struct amdgpu_device *adev, bool

>> +*vramlost)

>>    {

>>    	switch (soc15_asic_reset_method(adev)) {

>>    		case AMD_RESET_METHOD_BACO:

>> +			if (vramlost)

>> +				*vramlost = true;

>>    			return soc15_asic_baco_reset(adev);

>>    		case AMD_RESET_METHOD_MODE2:

>>    			return soc15_mode2_reset(adev); diff --git 

>> a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c 

>> index 56c882b..8eceb00 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/vi.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c

>> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device *adev)

>>     * to reset them.

>>     * Returns 0 for success.

>>     */

>> -static int vi_asic_reset(struct amdgpu_device *adev)

>> +static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>    {

>>    	int r;

>>
Am 23.08.19 um 10:57 schrieb Liu, Monk:
>>> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>>>     				if (vram_lost) {

>>>     					DRM_INFO("VRAM is lost due to GPU reset!\n");

>>>     					atomic_inc(&tmp_adev->vram_lost_counter);

> Above is the original logic, if we increment the counter in BACO reset routine, we would potentially

> Have another counter increasement if by coincidence the "amdgpu_device_check_vram_lost" successfully detected the vram lost (although right now it didn't ..)


Yeah, but would increment it twice be a problem? I don't think so.

> Do you mean we remove the amdgpu_device_check_vram_lost(tmp_adev) in device_recovery() routine ?


Please no, that thing certainly proved to be useful. Maybe we could 
investigate why it failed to auto detect the lost VRAM.

Christian.

> _____________________________________

> Monk Liu|GPU Virtualization Team |AMD

>

>

> -----Original Message-----

> From: Koenig, Christian <Christian.Koenig@amd.com>

> Sent: Friday, August 23, 2019 4:34 PM

> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org

> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset function

>

> I thought in the BACO reset function.

>

> The top level reset function doesn't do much more than increment the vram_lost_counter either.

>

> Christian.

>

> Am 23.08.19 um 10:32 schrieb Liu, Monk:

>>>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?

>> In where ? if you mean in amdgpu_device_recover routine I won't do that since the reset() can do any kind of reset like:

>> 1) PF FLR

>> 2) mode1/2 reset

>> 3) magic reset through config space

>> 4) BACO reset

>>

>> PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO

>> reset is definitely a vram lost reset

>>

>> If you increase the counter in general function that will be not

>> accurate _____________________________________

>> Monk Liu|GPU Virtualization Team |AMD

>>

>>

>> -----Original Message-----

>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>> Sent: Friday, August 23, 2019 4:27 PM

>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org

>> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for

>> reset function

>>

>> Am 23.08.19 um 05:34 schrieb Monk Liu:

>>> for SOC15/vega10 the BACO reset would introduce vram lost in the high

>>> end address range and current kmd's vram lost checking cannot catch

>>> it since it only check visible frame buffer

>>>

>>> TODO:

>>> to confirm if mode1/2 reset would introduce vram lost

>> Looks good in general, but I would make the value mandatory or maybe use a special return code instead.

>>

>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?

>>

>> Christian.

>>

>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

>>> ---

>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--

>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----

>>>     drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-

>>>     drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---

>>>     drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-

>>>     drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-

>>>     drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-

>>>     7 files changed, 22 insertions(+), 14 deletions(-)

>>>

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> index f6ae565..1fe3756 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {

>>>     	int (*read_register)(struct amdgpu_device *adev, u32 se_num,

>>>     			     u32 sh_num, u32 reg_offset, u32 *value);

>>>     	void (*set_vga_state)(struct amdgpu_device *adev, bool state);

>>> -	int (*reset)(struct amdgpu_device *adev);

>>> +	int (*reset)(struct amdgpu_device *adev, bool *lost);

>>>     	enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);

>>>     	/* get the reference clock */

>>>     	u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7

>>> @@ int emu_soc_asic_init(struct amdgpu_device *adev);

>>>      * ASICs macro.

>>>      */

>>>     #define amdgpu_asic_set_vga_state(adev, state)

>>> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define

>>> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))

>>> +#define amdgpu_asic_reset(adev, lost)

>>> +(adev)->asic_funcs->reset((adev), (lost))

>>>     #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev))

>>>     #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))

>>>     #define amdgpu_asic_set_uvd_clocks(adev, v, d)

>>> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git

>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>> index 02b3e7d..8668cb8 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>>>     	struct amdgpu_device *adev =

>>>     		container_of(__work, struct amdgpu_device, xgmi_reset_work);

>>>     

>>> -	adev->asic_reset_res =  amdgpu_asic_reset(adev);

>>> +	adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);

>>>     	if (adev->asic_reset_res)

>>>     		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>>>     			 adev->asic_reset_res, adev->ddev->unique); @@ -2751,7 +2751,7

>>> @@ int amdgpu_device_init(struct amdgpu_device *adev,

>>>     	 *  E.g., driver was not cleanly unloaded previously, etc.

>>>     	 */

>>>     	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {

>>> -		r = amdgpu_asic_reset(adev);

>>> +		r = amdgpu_asic_reset(adev, NULL);

>>>     		if (r) {

>>>     			dev_err(adev->dev, "asic reset on init failed\n");

>>>     			goto failed;

>>> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)

>>>     		pci_disable_device(dev->pdev);

>>>     		pci_set_power_state(dev->pdev, PCI_D3hot);

>>>     	} else {

>>> -		r = amdgpu_asic_reset(adev);

>>> +		r = amdgpu_asic_reset(adev, NULL);

>>>     		if (r)

>>>     			DRM_ERROR("amdgpu asic reset failed\n");

>>>     	}

>>> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>>>     				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))

>>>     					r = -EALREADY;

>>>     			} else

>>> -				r = amdgpu_asic_reset(tmp_adev);

>>> +				r = amdgpu_asic_reset(tmp_adev, &vram_lost);

>>>     

>>>     			if (r) {

>>>     				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",

>>> @@

>>> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>>>     				if (r)

>>>     					goto out;

>>>     

>>> -				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>>> +				if (!vram_lost)

>>> +					vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>>> +

>>>     				if (vram_lost) {

>>>     					DRM_INFO("VRAM is lost due to GPU reset!\n");

>>>     					atomic_inc(&tmp_adev->vram_lost_counter);

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c

>>> b/drivers/gpu/drm/amd/amdgpu/cik.c

>>> index 7b63d7a..0f25b82 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/cik.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c

>>> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct amdgpu_device *adev)

>>>      * to reset them.

>>>      * Returns 0 for success.

>>>      */

>>> -static int cik_asic_reset(struct amdgpu_device *adev)

>>> +static int cik_asic_reset(struct amdgpu_device *adev, bool

>>> +*vramlost)

>>>     {

>>>     	int r;

>>>     

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c

>>> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c

>>> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)

>>>     		return AMD_RESET_METHOD_MODE1;

>>>     }

>>>     

>>> -static int nv_asic_reset(struct amdgpu_device *adev)

>>> +static int nv_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>>     {

>>>     

>>>     	/* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@

>>> static int nv_asic_reset(struct amdgpu_device *adev)

>>>     	int ret = 0;

>>>     	struct smu_context *smu = &adev->smu;

>>>     

>>> -	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

>>> +	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

>>> +		if (vramlost)

>>> +			*vramlost = true;

>>>     		ret = smu_baco_reset(smu);

>>> -	else

>>> +	}

>>> +	else {

>>>     		ret = nv_asic_mode1_reset(adev);

>>> +	}

>>>     

>>>     	return ret;

>>>     }

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c

>>> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/si.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/si.c

>>> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device *adev,

>>>     }

>>>     

>>>     //xxx: not implemented

>>> -static int si_asic_reset(struct amdgpu_device *adev)

>>> +static int si_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>>     {

>>>     	return 0;

>>>     }

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c

>>> b/drivers/gpu/drm/amd/amdgpu/soc15.c

>>> index fe2212df..12b2966 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c

>>> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)

>>>     		return AMD_RESET_METHOD_MODE1;

>>>     }

>>>     

>>> -static int soc15_asic_reset(struct amdgpu_device *adev)

>>> +static int soc15_asic_reset(struct amdgpu_device *adev, bool

>>> +*vramlost)

>>>     {

>>>     	switch (soc15_asic_reset_method(adev)) {

>>>     		case AMD_RESET_METHOD_BACO:

>>> +			if (vramlost)

>>> +				*vramlost = true;

>>>     			return soc15_asic_baco_reset(adev);

>>>     		case AMD_RESET_METHOD_MODE2:

>>>     			return soc15_mode2_reset(adev); diff --git

>>> a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c

>>> index 56c882b..8eceb00 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/vi.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c

>>> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device *adev)

>>>      * to reset them.

>>>      * Returns 0 for success.

>>>      */

>>> -static int vi_asic_reset(struct amdgpu_device *adev)

>>> +static int vi_asic_reset(struct amdgpu_device *adev, bool *vramlost)

>>>     {

>>>     	int r;

>>>

>> Please no, that thing certainly proved to be useful. Maybe we could investigate why it failed to auto detect the lost VRAM.


The reason is with BACO reset I found VRAM lost in high address e.g. 15~16 G (for 16 G vega10),  amdgpu_device_check_vram_lost only checks the very ahead visible part  

>> Yeah, but would increment it twice be a problem? I don't think so.


So your suggestion is we increase the counter in BACO reset , and no need to introduce the new "bool *" parameter, right ?
_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Koenig, Christian <Christian.Koenig@amd.com> 

Sent: Friday, August 23, 2019 8:47 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset function

Am 23.08.19 um 10:57 schrieb Liu, Monk:
>>> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>>>     				if (vram_lost) {

>>>     					DRM_INFO("VRAM is lost due to GPU reset!\n");

>>>     					atomic_inc(&tmp_adev->vram_lost_counter);

> Above is the original logic, if we increment the counter in BACO reset 

> routine, we would potentially Have another counter increasement if by 

> coincidence the "amdgpu_device_check_vram_lost" successfully detected 

> the vram lost (although right now it didn't ..)


Yeah, but would increment it twice be a problem? I don't think so.

> Do you mean we remove the amdgpu_device_check_vram_lost(tmp_adev) in device_recovery() routine ?


Please no, that thing certainly proved to be useful. Maybe we could investigate why it failed to auto detect the lost VRAM.

Christian.

> _____________________________________

> Monk Liu|GPU Virtualization Team |AMD

>

>

> -----Original Message-----

> From: Koenig, Christian <Christian.Koenig@amd.com>

> Sent: Friday, August 23, 2019 4:34 PM

> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org

> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for 

> reset function

>

> I thought in the BACO reset function.

>

> The top level reset function doesn't do much more than increment the vram_lost_counter either.

>

> Christian.

>

> Am 23.08.19 um 10:32 schrieb Liu, Monk:

>>>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?

>> In where ? if you mean in amdgpu_device_recover routine I won't do that since the reset() can do any kind of reset like:

>> 1) PF FLR

>> 2) mode1/2 reset

>> 3) magic reset through config space

>> 4) BACO reset

>>

>> PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO 

>> reset is definitely a vram lost reset

>>

>> If you increase the counter in general function that will be not 

>> accurate _____________________________________

>> Monk Liu|GPU Virtualization Team |AMD

>>

>>

>> -----Original Message-----

>> From: Christian König <ckoenig.leichtzumerken@gmail.com>

>> Sent: Friday, August 23, 2019 4:27 PM

>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org

>> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for 

>> reset function

>>

>> Am 23.08.19 um 05:34 schrieb Monk Liu:

>>> for SOC15/vega10 the BACO reset would introduce vram lost in the 

>>> high end address range and current kmd's vram lost checking cannot 

>>> catch it since it only check visible frame buffer

>>>

>>> TODO:

>>> to confirm if mode1/2 reset would introduce vram lost

>> Looks good in general, but I would make the value mandatory or maybe use a special return code instead.

>>

>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?

>>

>> Christian.

>>

>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

>>> ---

>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--

>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----

>>>     drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-

>>>     drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---

>>>     drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-

>>>     drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-

>>>     drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-

>>>     7 files changed, 22 insertions(+), 14 deletions(-)

>>>

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> index f6ae565..1fe3756 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {

>>>     	int (*read_register)(struct amdgpu_device *adev, u32 se_num,

>>>     			     u32 sh_num, u32 reg_offset, u32 *value);

>>>     	void (*set_vga_state)(struct amdgpu_device *adev, bool state);

>>> -	int (*reset)(struct amdgpu_device *adev);

>>> +	int (*reset)(struct amdgpu_device *adev, bool *lost);

>>>     	enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);

>>>     	/* get the reference clock */

>>>     	u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7 

>>> @@ int emu_soc_asic_init(struct amdgpu_device *adev);

>>>      * ASICs macro.

>>>      */

>>>     #define amdgpu_asic_set_vga_state(adev, state) 

>>> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define

>>> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))

>>> +#define amdgpu_asic_reset(adev, lost) 

>>> +(adev)->asic_funcs->reset((adev), (lost))

>>>     #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev))

>>>     #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))

>>>     #define amdgpu_asic_set_uvd_clocks(adev, v, d) 

>>> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git 

>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>> index 02b3e7d..8668cb8 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

>>> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)

>>>     	struct amdgpu_device *adev =

>>>     		container_of(__work, struct amdgpu_device, xgmi_reset_work);

>>>     

>>> -	adev->asic_reset_res =  amdgpu_asic_reset(adev);

>>> +	adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);

>>>     	if (adev->asic_reset_res)

>>>     		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",

>>>     			 adev->asic_reset_res, adev->ddev->unique); @@ -2751,7 

>>> +2751,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,

>>>     	 *  E.g., driver was not cleanly unloaded previously, etc.

>>>     	 */

>>>     	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {

>>> -		r = amdgpu_asic_reset(adev);

>>> +		r = amdgpu_asic_reset(adev, NULL);

>>>     		if (r) {

>>>     			dev_err(adev->dev, "asic reset on init failed\n");

>>>     			goto failed;

>>> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)

>>>     		pci_disable_device(dev->pdev);

>>>     		pci_set_power_state(dev->pdev, PCI_D3hot);

>>>     	} else {

>>> -		r = amdgpu_asic_reset(adev);

>>> +		r = amdgpu_asic_reset(adev, NULL);

>>>     		if (r)

>>>     			DRM_ERROR("amdgpu asic reset failed\n");

>>>     	}

>>> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>>>     				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))

>>>     					r = -EALREADY;

>>>     			} else

>>> -				r = amdgpu_asic_reset(tmp_adev);

>>> +				r = amdgpu_asic_reset(tmp_adev, &vram_lost);

>>>     

>>>     			if (r) {

>>>     				DRM_ERROR("ASIC reset failed with error, %d for drm dev, 

>>> %s", @@

>>> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>>>     				if (r)

>>>     					goto out;

>>>     

>>> -				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>>> +				if (!vram_lost)

>>> +					vram_lost = amdgpu_device_check_vram_lost(tmp_adev);

>>> +

>>>     				if (vram_lost) {

>>>     					DRM_INFO("VRAM is lost due to GPU reset!\n");

>>>     					atomic_inc(&tmp_adev->vram_lost_counter);

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c

>>> b/drivers/gpu/drm/amd/amdgpu/cik.c

>>> index 7b63d7a..0f25b82 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/cik.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c

>>> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct amdgpu_device *adev)

>>>      * to reset them.

>>>      * Returns 0 for success.

>>>      */

>>> -static int cik_asic_reset(struct amdgpu_device *adev)

>>> +static int cik_asic_reset(struct amdgpu_device *adev, bool

>>> +*vramlost)

>>>     {

>>>     	int r;

>>>     

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c 

>>> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c

>>> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)

>>>     		return AMD_RESET_METHOD_MODE1;

>>>     }

>>>     

>>> -static int nv_asic_reset(struct amdgpu_device *adev)

>>> +static int nv_asic_reset(struct amdgpu_device *adev, bool 

>>> +*vramlost)

>>>     {

>>>     

>>>     	/* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@ 

>>> static int nv_asic_reset(struct amdgpu_device *adev)

>>>     	int ret = 0;

>>>     	struct smu_context *smu = &adev->smu;

>>>     

>>> -	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)

>>> +	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {

>>> +		if (vramlost)

>>> +			*vramlost = true;

>>>     		ret = smu_baco_reset(smu);

>>> -	else

>>> +	}

>>> +	else {

>>>     		ret = nv_asic_mode1_reset(adev);

>>> +	}

>>>     

>>>     	return ret;

>>>     }

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c 

>>> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/si.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/si.c

>>> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device *adev,

>>>     }

>>>     

>>>     //xxx: not implemented

>>> -static int si_asic_reset(struct amdgpu_device *adev)

>>> +static int si_asic_reset(struct amdgpu_device *adev, bool 

>>> +*vramlost)

>>>     {

>>>     	return 0;

>>>     }

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c

>>> b/drivers/gpu/drm/amd/amdgpu/soc15.c

>>> index fe2212df..12b2966 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c

>>> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)

>>>     		return AMD_RESET_METHOD_MODE1;

>>>     }

>>>     

>>> -static int soc15_asic_reset(struct amdgpu_device *adev)

>>> +static int soc15_asic_reset(struct amdgpu_device *adev, bool

>>> +*vramlost)

>>>     {

>>>     	switch (soc15_asic_reset_method(adev)) {

>>>     		case AMD_RESET_METHOD_BACO:

>>> +			if (vramlost)

>>> +				*vramlost = true;

>>>     			return soc15_asic_baco_reset(adev);

>>>     		case AMD_RESET_METHOD_MODE2:

>>>     			return soc15_mode2_reset(adev); diff --git 

>>> a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c 

>>> index 56c882b..8eceb00 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/vi.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c

>>> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device *adev)

>>>      * to reset them.

>>>      * Returns 0 for success.

>>>      */

>>> -static int vi_asic_reset(struct amdgpu_device *adev)

>>> +static int vi_asic_reset(struct amdgpu_device *adev, bool 

>>> +*vramlost)

>>>     {

>>>     	int r;

>>>

> So your suggestion is we increase the counter in BACO reset , and no need to introduce the new "bool *" parameter, right ?
Correct, yes.

Cleanest way looks like adding some helper function like 
amdgpu_device_vram_lost(adev) which is called by the BACO reset code.

Christian.

Am 23.08.19 um 16:35 schrieb Liu, Monk:
>>> Please no, that thing certainly proved to be useful. Maybe we could investigate why it failed to auto detect the lost VRAM.
> The reason is with BACO reset I found VRAM lost in high address e.g. 15~16 G (for 16 G vega10),  amdgpu_device_check_vram_lost only checks the very ahead visible part
>
>>> Yeah, but would increment it twice be a problem? I don't think so.
> So your suggestion is we increase the counter in BACO reset , and no need to introduce the new "bool *" parameter, right ?
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Koenig, Christian <Christian.Koenig@amd.com>
> Sent: Friday, August 23, 2019 8:47 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for reset function
>
> Am 23.08.19 um 10:57 schrieb Liu, Monk:
>>>> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>>>>      				if (vram_lost) {
>>>>      					DRM_INFO("VRAM is lost due to GPU reset!\n");
>>>>      					atomic_inc(&tmp_adev->vram_lost_counter);
>> Above is the original logic, if we increment the counter in BACO reset
>> routine, we would potentially Have another counter increasement if by
>> coincidence the "amdgpu_device_check_vram_lost" successfully detected
>> the vram lost (although right now it didn't ..)
> Yeah, but would increment it twice be a problem? I don't think so.
>
>> Do you mean we remove the amdgpu_device_check_vram_lost(tmp_adev) in device_recovery() routine ?
> Please no, that thing certainly proved to be useful. Maybe we could investigate why it failed to auto detect the lost VRAM.
>
> Christian.
>
>> _____________________________________
>> Monk Liu|GPU Virtualization Team |AMD
>>
>>
>> -----Original Message-----
>> From: Koenig, Christian <Christian.Koenig@amd.com>
>> Sent: Friday, August 23, 2019 4:34 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for
>> reset function
>>
>> I thought in the BACO reset function.
>>
>> The top level reset function doesn't do much more than increment the vram_lost_counter either.
>>
>> Christian.
>>
>> Am 23.08.19 um 10:32 schrieb Liu, Monk:
>>>>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?
>>> In where ? if you mean in amdgpu_device_recover routine I won't do that since the reset() can do any kind of reset like:
>>> 1) PF FLR
>>> 2) mode1/2 reset
>>> 3) magic reset through config space
>>> 4) BACO reset
>>>
>>> PF FLR won't cause VRAM lost, mode_1/2 is not clear to me, only BACO
>>> reset is definitely a vram lost reset
>>>
>>> If you increase the counter in general function that will be not
>>> accurate _____________________________________
>>> Monk Liu|GPU Virtualization Team |AMD
>>>
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Friday, August 23, 2019 4:27 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/amdgpu: introduce vram lost paramter for
>>> reset function
>>>
>>> Am 23.08.19 um 05:34 schrieb Monk Liu:
>>>> for SOC15/vega10 the BACO reset would introduce vram lost in the
>>>> high end address range and current kmd's vram lost checking cannot
>>>> catch it since it only check visible frame buffer
>>>>
>>>> TODO:
>>>> to confirm if mode1/2 reset would introduce vram lost
>>> Looks good in general, but I would make the value mandatory or maybe use a special return code instead.
>>>
>>> On the other hand wouldn't it be simpler to just increment vram_lost_counter?
>>>
>>> Christian.
>>>
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>> ---
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++--
>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++-----
>>>>      drivers/gpu/drm/amd/amdgpu/cik.c           |  2 +-
>>>>      drivers/gpu/drm/amd/amdgpu/nv.c            | 10 +++++++---
>>>>      drivers/gpu/drm/amd/amdgpu/si.c            |  2 +-
>>>>      drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +++-
>>>>      drivers/gpu/drm/amd/amdgpu/vi.c            |  2 +-
>>>>      7 files changed, 22 insertions(+), 14 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index f6ae565..1fe3756 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -552,7 +552,7 @@ struct amdgpu_asic_funcs {
>>>>      	int (*read_register)(struct amdgpu_device *adev, u32 se_num,
>>>>      			     u32 sh_num, u32 reg_offset, u32 *value);
>>>>      	void (*set_vga_state)(struct amdgpu_device *adev, bool state);
>>>> -	int (*reset)(struct amdgpu_device *adev);
>>>> +	int (*reset)(struct amdgpu_device *adev, bool *lost);
>>>>      	enum amd_reset_method (*reset_method)(struct amdgpu_device *adev);
>>>>      	/* get the reference clock */
>>>>      	u32 (*get_xclk)(struct amdgpu_device *adev); @@ -1136,7 +1136,7
>>>> @@ int emu_soc_asic_init(struct amdgpu_device *adev);
>>>>       * ASICs macro.
>>>>       */
>>>>      #define amdgpu_asic_set_vga_state(adev, state)
>>>> (adev)->asic_funcs->set_vga_state((adev), (state)) -#define
>>>> amdgpu_asic_reset(adev) (adev)->asic_funcs->reset((adev))
>>>> +#define amdgpu_asic_reset(adev, lost)
>>>> +(adev)->asic_funcs->reset((adev), (lost))
>>>>      #define amdgpu_asic_reset_method(adev) (adev)->asic_funcs->reset_method((adev))
>>>>      #define amdgpu_asic_get_xclk(adev) (adev)->asic_funcs->get_xclk((adev))
>>>>      #define amdgpu_asic_set_uvd_clocks(adev, v, d)
>>>> (adev)->asic_funcs->set_uvd_clocks((adev), (v), (d)) diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 02b3e7d..8668cb8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -2546,7 +2546,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
>>>>      	struct amdgpu_device *adev =
>>>>      		container_of(__work, struct amdgpu_device, xgmi_reset_work);
>>>>      
>>>> -	adev->asic_reset_res =  amdgpu_asic_reset(adev);
>>>> +	adev->asic_reset_res =  amdgpu_asic_reset(adev, NULL);
>>>>      	if (adev->asic_reset_res)
>>>>      		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
>>>>      			 adev->asic_reset_res, adev->ddev->unique); @@ -2751,7
>>>> +2751,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>>      	 *  E.g., driver was not cleanly unloaded previously, etc.
>>>>      	 */
>>>>      	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
>>>> -		r = amdgpu_asic_reset(adev);
>>>> +		r = amdgpu_asic_reset(adev, NULL);
>>>>      		if (r) {
>>>>      			dev_err(adev->dev, "asic reset on init failed\n");
>>>>      			goto failed;
>>>> @@ -3084,7 +3084,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
>>>>      		pci_disable_device(dev->pdev);
>>>>      		pci_set_power_state(dev->pdev, PCI_D3hot);
>>>>      	} else {
>>>> -		r = amdgpu_asic_reset(adev);
>>>> +		r = amdgpu_asic_reset(adev, NULL);
>>>>      		if (r)
>>>>      			DRM_ERROR("amdgpu asic reset failed\n");
>>>>      	}
>>>> @@ -3604,7 +3604,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
>>>>      					r = -EALREADY;
>>>>      			} else
>>>> -				r = amdgpu_asic_reset(tmp_adev);
>>>> +				r = amdgpu_asic_reset(tmp_adev, &vram_lost);
>>>>      
>>>>      			if (r) {
>>>>      				DRM_ERROR("ASIC reset failed with error, %d for drm dev,
>>>> %s", @@
>>>> -3645,7 +3645,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>>      				if (r)
>>>>      					goto out;
>>>>      
>>>> -				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>>>> +				if (!vram_lost)
>>>> +					vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>>>> +
>>>>      				if (vram_lost) {
>>>>      					DRM_INFO("VRAM is lost due to GPU reset!\n");
>>>>      					atomic_inc(&tmp_adev->vram_lost_counter);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c
>>>> b/drivers/gpu/drm/amd/amdgpu/cik.c
>>>> index 7b63d7a..0f25b82 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/cik.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik.c
>>>> @@ -1277,7 +1277,7 @@ static int cik_gpu_pci_config_reset(struct amdgpu_device *adev)
>>>>       * to reset them.
>>>>       * Returns 0 for success.
>>>>       */
>>>> -static int cik_asic_reset(struct amdgpu_device *adev)
>>>> +static int cik_asic_reset(struct amdgpu_device *adev, bool
>>>> +*vramlost)
>>>>      {
>>>>      	int r;
>>>>      
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c
>>>> b/drivers/gpu/drm/amd/amdgpu/nv.c index a3d99f2..53de7a6 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
>>>> @@ -301,7 +301,7 @@ nv_asic_reset_method(struct amdgpu_device *adev)
>>>>      		return AMD_RESET_METHOD_MODE1;
>>>>      }
>>>>      
>>>> -static int nv_asic_reset(struct amdgpu_device *adev)
>>>> +static int nv_asic_reset(struct amdgpu_device *adev, bool
>>>> +*vramlost)
>>>>      {
>>>>      
>>>>      	/* FIXME: it doesn't work since vega10 */ @@ -315,10 +315,14 @@
>>>> static int nv_asic_reset(struct amdgpu_device *adev)
>>>>      	int ret = 0;
>>>>      	struct smu_context *smu = &adev->smu;
>>>>      
>>>> -	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
>>>> +	if (nv_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
>>>> +		if (vramlost)
>>>> +			*vramlost = true;
>>>>      		ret = smu_baco_reset(smu);
>>>> -	else
>>>> +	}
>>>> +	else {
>>>>      		ret = nv_asic_mode1_reset(adev);
>>>> +	}
>>>>      
>>>>      	return ret;
>>>>      }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/si.c
>>>> b/drivers/gpu/drm/amd/amdgpu/si.c index 9043614..f324099 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/si.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/si.c
>>>> @@ -1180,7 +1180,7 @@ static bool si_read_bios_from_rom(struct amdgpu_device *adev,
>>>>      }
>>>>      
>>>>      //xxx: not implemented
>>>> -static int si_asic_reset(struct amdgpu_device *adev)
>>>> +static int si_asic_reset(struct amdgpu_device *adev, bool
>>>> +*vramlost)
>>>>      {
>>>>      	return 0;
>>>>      }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
>>>> b/drivers/gpu/drm/amd/amdgpu/soc15.c
>>>> index fe2212df..12b2966 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
>>>> @@ -553,10 +553,12 @@ soc15_asic_reset_method(struct amdgpu_device *adev)
>>>>      		return AMD_RESET_METHOD_MODE1;
>>>>      }
>>>>      
>>>> -static int soc15_asic_reset(struct amdgpu_device *adev)
>>>> +static int soc15_asic_reset(struct amdgpu_device *adev, bool
>>>> +*vramlost)
>>>>      {
>>>>      	switch (soc15_asic_reset_method(adev)) {
>>>>      		case AMD_RESET_METHOD_BACO:
>>>> +			if (vramlost)
>>>> +				*vramlost = true;
>>>>      			return soc15_asic_baco_reset(adev);
>>>>      		case AMD_RESET_METHOD_MODE2:
>>>>      			return soc15_mode2_reset(adev); diff --git
>>>> a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
>>>> index 56c882b..8eceb00 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/vi.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
>>>> @@ -696,7 +696,7 @@ static int vi_gpu_pci_config_reset(struct amdgpu_device *adev)
>>>>       * to reset them.
>>>>       * Returns 0 for success.
>>>>       */
>>>> -static int vi_asic_reset(struct amdgpu_device *adev)
>>>> +static int vi_asic_reset(struct amdgpu_device *adev, bool
>>>> +*vramlost)
>>>>      {
>>>>      	int r;
>>>>      
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx