drm/amdgpu: reserve GDS resources on the gfx ring for gfx10

Submitted by Marek Olšák on July 17, 2019, 12:06 a.m.

Details

Message ID 20190717000606.7181-1-maraeo@gmail.com
State New
Headers show
Series "drm/amdgpu: reserve GDS resources on the gfx ring for gfx10" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Marek Olšák July 17, 2019, 12:06 a.m.
From: Marek Olšák <marek.olsak@amd.com>

Hopefully we'll only use 1 gfx ring, because otherwise we'd have to have
separate GDS buffers for each gfx ring.

This is a workaround to ensure stability of transform feedback. Shaders hang
waiting for a GDS instruction (ds_sub, not ds_ordered_count).

The disadvantage is that compute IBs might get a different VMID,
because now gfx always has GDS and compute doesn't.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 10 ++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  6 ++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 20 ++++++++++++++++++++
 4 files changed, 37 insertions(+)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4b514a44184c..cbd55d061b72 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -456,20 +456,21 @@  struct amdgpu_cs_parser {
 	struct drm_file		*filp;
 	struct amdgpu_ctx	*ctx;
 
 	/* chunks */
 	unsigned		nchunks;
 	struct amdgpu_cs_chunk	*chunks;
 
 	/* scheduler job object */
 	struct amdgpu_job	*job;
 	struct drm_sched_entity	*entity;
+	unsigned		hw_ip;
 
 	/* buffer objects */
 	struct ww_acquire_ctx		ticket;
 	struct amdgpu_bo_list		*bo_list;
 	struct amdgpu_mn		*mn;
 	struct amdgpu_bo_list_entry	vm_pd;
 	struct list_head		validated;
 	struct dma_fence		*fence;
 	uint64_t			bytes_moved_threshold;
 	uint64_t			bytes_moved_vis_threshold;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index c691df6f7a57..9125cd69a124 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -678,20 +678,28 @@  static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	if (r)
 		goto error_validate;
 
 	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
 				     p->bytes_moved_vis);
 
 	gds = p->bo_list->gds_obj;
 	gws = p->bo_list->gws_obj;
 	oa = p->bo_list->oa_obj;
 
+	if (p->hw_ip == AMDGPU_HW_IP_GFX) {
+		/* Only gfx10 allocates these. */
+		if (!gds)
+			gds = p->adev->gds.gds_gfx_bo;
+		if (!oa)
+			oa = p->adev->gds.oa_gfx_bo;
+	}
+
 	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
 		struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
 
 		/* Make sure we use the exclusive slot for shared BOs */
 		if (bo->prime_shared_count)
 			e->tv.num_shared = 0;
 		e->bo_va = amdgpu_vm_bo_find(vm, bo);
 	}
 
 	if (gds) {
@@ -954,20 +962,22 @@  static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 		struct drm_amdgpu_cs_chunk_ib *chunk_ib;
 		struct drm_sched_entity *entity;
 
 		chunk = &parser->chunks[i];
 		ib = &parser->job->ibs[j];
 		chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;
 
 		if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
 			continue;
 
+		parser->hw_ip = chunk_ib->ip_type;
+
 		if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
 		    (amdgpu_mcbp || amdgpu_sriov_vf(adev))) {
 			if (chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
 				if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
 					ce_preempt++;
 				else
 					de_preempt++;
 			}
 
 			/* each GFX command submit allows 0 or 1 IB preemptible for CE & DE */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
index df8a23554831..0943b8e1d97e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
@@ -26,20 +26,26 @@ 
 
 struct amdgpu_ring;
 struct amdgpu_bo;
 
 struct amdgpu_gds {
 	uint32_t gds_size;
 	uint32_t gws_size;
 	uint32_t oa_size;
 	uint32_t gds_compute_max_wave_id;
 	uint32_t vgt_gs_max_wave_id;
+
+	/* Reserved OA for the gfx ring. (gfx10) */
+	uint32_t gds_gfx_reservation_size;
+	uint32_t oa_gfx_reservation_size;
+	struct amdgpu_bo *gds_gfx_bo;
+	struct amdgpu_bo *oa_gfx_bo;
 };
 
 struct amdgpu_gds_reg_offset {
 	uint32_t	mem_base;
 	uint32_t	mem_size;
 	uint32_t	gws;
 	uint32_t	oa;
 };
 
 #endif /* __AMDGPU_GDS_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 618291df659b..3952754c04ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -1314,20 +1314,33 @@  static int gfx_v10_0_sw_init(void *handle)
 
 	kiq = &adev->gfx.kiq;
 	r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
 	if (r)
 		return r;
 
 	r = amdgpu_gfx_mqd_sw_init(adev, sizeof(struct v10_compute_mqd));
 	if (r)
 		return r;
 
+	/* allocate reserved GDS resources for transform feedback */
+	r = amdgpu_bo_create_kernel(adev, adev->gds.gds_gfx_reservation_size,
+				    4, AMDGPU_GEM_DOMAIN_GDS,
+				    &adev->gds.gds_gfx_bo, NULL, NULL);
+	if (r)
+		return r;
+
+	r = amdgpu_bo_create_kernel(adev, adev->gds.oa_gfx_reservation_size,
+				    1, AMDGPU_GEM_DOMAIN_OA,
+				    &adev->gds.oa_gfx_bo, NULL, NULL);
+	if (r)
+		return r;
+
 	/* allocate visible FB for rlc auto-loading fw */
 	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) {
 		r = gfx_v10_0_rlc_backdoor_autoload_buffer_init(adev);
 		if (r)
 			return r;
 	}
 
 	adev->gfx.ce_ram_size = F32_CE_PROGRAM_RAM_SIZE;
 
 	gfx_v10_0_gpu_early_init(adev);
@@ -1354,20 +1367,23 @@  static void gfx_v10_0_me_fini(struct amdgpu_device *adev)
 	amdgpu_bo_free_kernel(&adev->gfx.me.me_fw_obj,
 			      &adev->gfx.me.me_fw_gpu_addr,
 			      (void **)&adev->gfx.me.me_fw_ptr);
 }
 
 static int gfx_v10_0_sw_fini(void *handle)
 {
 	int i;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+	amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
+	amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
+
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
 	for (i = 0; i < adev->gfx.num_compute_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
 
 	amdgpu_gfx_mqd_sw_fini(adev);
 	amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
 	amdgpu_gfx_kiq_fini(adev);
 
 	gfx_v10_0_pfp_fini(adev);
@@ -5181,20 +5197,24 @@  static void gfx_v10_0_set_gds_init(struct amdgpu_device *adev)
 	case CHIP_NAVI10:
 	default:
 		adev->gds.gds_size = 0x10000;
 		adev->gds.gds_compute_max_wave_id = 0x4ff;
 		adev->gds.vgt_gs_max_wave_id = 0x3ff;
 		break;
 	}
 
 	adev->gds.gws_size = 64;
 	adev->gds.oa_size = 16;
+
+	/* Reserved for transform feedback. */
+	adev->gds.gds_gfx_reservation_size = 256;
+	adev->gds.oa_gfx_reservation_size = 4;
 }
 
 static void gfx_v10_0_set_user_wgp_inactive_bitmap_per_sh(struct amdgpu_device *adev,
 							  u32 bitmap)
 {
 	u32 data;
 
 	if (!bitmap)
 		return;
 

Comments

Am 17.07.19 um 02:06 schrieb Marek Olšák:
> From: Marek Olšák <marek.olsak@amd.com>
>
> Hopefully we'll only use 1 gfx ring, because otherwise we'd have to have
> separate GDS buffers for each gfx ring.
>
> This is a workaround to ensure stability of transform feedback. Shaders hang
> waiting for a GDS instruction (ds_sub, not ds_ordered_count).
>
> The disadvantage is that compute IBs might get a different VMID,
> because now gfx always has GDS and compute doesn't.

So far compute is only using GWS, but I don't think that those 
reservations are a good idea in general.

How severe is the ENOMEM problem you see with using an explicit GWS 
allocation?

Regards,
Christian.

>
> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 10 ++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  6 ++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 20 ++++++++++++++++++++
>   4 files changed, 37 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 4b514a44184c..cbd55d061b72 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -456,20 +456,21 @@ struct amdgpu_cs_parser {
>   	struct drm_file		*filp;
>   	struct amdgpu_ctx	*ctx;
>   
>   	/* chunks */
>   	unsigned		nchunks;
>   	struct amdgpu_cs_chunk	*chunks;
>   
>   	/* scheduler job object */
>   	struct amdgpu_job	*job;
>   	struct drm_sched_entity	*entity;
> +	unsigned		hw_ip;
>   
>   	/* buffer objects */
>   	struct ww_acquire_ctx		ticket;
>   	struct amdgpu_bo_list		*bo_list;
>   	struct amdgpu_mn		*mn;
>   	struct amdgpu_bo_list_entry	vm_pd;
>   	struct list_head		validated;
>   	struct dma_fence		*fence;
>   	uint64_t			bytes_moved_threshold;
>   	uint64_t			bytes_moved_vis_threshold;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index c691df6f7a57..9125cd69a124 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -678,20 +678,28 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   	if (r)
>   		goto error_validate;
>   
>   	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
>   				     p->bytes_moved_vis);
>   
>   	gds = p->bo_list->gds_obj;
>   	gws = p->bo_list->gws_obj;
>   	oa = p->bo_list->oa_obj;
>   
> +	if (p->hw_ip == AMDGPU_HW_IP_GFX) {
> +		/* Only gfx10 allocates these. */
> +		if (!gds)
> +			gds = p->adev->gds.gds_gfx_bo;
> +		if (!oa)
> +			oa = p->adev->gds.oa_gfx_bo;
> +	}
> +
>   	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
>   		struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>   
>   		/* Make sure we use the exclusive slot for shared BOs */
>   		if (bo->prime_shared_count)
>   			e->tv.num_shared = 0;
>   		e->bo_va = amdgpu_vm_bo_find(vm, bo);
>   	}
>   
>   	if (gds) {
> @@ -954,20 +962,22 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>   		struct drm_amdgpu_cs_chunk_ib *chunk_ib;
>   		struct drm_sched_entity *entity;
>   
>   		chunk = &parser->chunks[i];
>   		ib = &parser->job->ibs[j];
>   		chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;
>   
>   		if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
>   			continue;
>   
> +		parser->hw_ip = chunk_ib->ip_type;
> +
>   		if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
>   		    (amdgpu_mcbp || amdgpu_sriov_vf(adev))) {
>   			if (chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
>   				if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
>   					ce_preempt++;
>   				else
>   					de_preempt++;
>   			}
>   
>   			/* each GFX command submit allows 0 or 1 IB preemptible for CE & DE */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> index df8a23554831..0943b8e1d97e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> @@ -26,20 +26,26 @@
>   
>   struct amdgpu_ring;
>   struct amdgpu_bo;
>   
>   struct amdgpu_gds {
>   	uint32_t gds_size;
>   	uint32_t gws_size;
>   	uint32_t oa_size;
>   	uint32_t gds_compute_max_wave_id;
>   	uint32_t vgt_gs_max_wave_id;
> +
> +	/* Reserved OA for the gfx ring. (gfx10) */
> +	uint32_t gds_gfx_reservation_size;
> +	uint32_t oa_gfx_reservation_size;
> +	struct amdgpu_bo *gds_gfx_bo;
> +	struct amdgpu_bo *oa_gfx_bo;
>   };
>   
>   struct amdgpu_gds_reg_offset {
>   	uint32_t	mem_base;
>   	uint32_t	mem_size;
>   	uint32_t	gws;
>   	uint32_t	oa;
>   };
>   
>   #endif /* __AMDGPU_GDS_H__ */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 618291df659b..3952754c04ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -1314,20 +1314,33 @@ static int gfx_v10_0_sw_init(void *handle)
>   
>   	kiq = &adev->gfx.kiq;
>   	r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
>   	if (r)
>   		return r;
>   
>   	r = amdgpu_gfx_mqd_sw_init(adev, sizeof(struct v10_compute_mqd));
>   	if (r)
>   		return r;
>   
> +	/* allocate reserved GDS resources for transform feedback */
> +	r = amdgpu_bo_create_kernel(adev, adev->gds.gds_gfx_reservation_size,
> +				    4, AMDGPU_GEM_DOMAIN_GDS,
> +				    &adev->gds.gds_gfx_bo, NULL, NULL);
> +	if (r)
> +		return r;
> +
> +	r = amdgpu_bo_create_kernel(adev, adev->gds.oa_gfx_reservation_size,
> +				    1, AMDGPU_GEM_DOMAIN_OA,
> +				    &adev->gds.oa_gfx_bo, NULL, NULL);
> +	if (r)
> +		return r;
> +
>   	/* allocate visible FB for rlc auto-loading fw */
>   	if (adev->firmware.load_type == AMDGPU_FW_LOAD_RLC_BACKDOOR_AUTO) {
>   		r = gfx_v10_0_rlc_backdoor_autoload_buffer_init(adev);
>   		if (r)
>   			return r;
>   	}
>   
>   	adev->gfx.ce_ram_size = F32_CE_PROGRAM_RAM_SIZE;
>   
>   	gfx_v10_0_gpu_early_init(adev);
> @@ -1354,20 +1367,23 @@ static void gfx_v10_0_me_fini(struct amdgpu_device *adev)
>   	amdgpu_bo_free_kernel(&adev->gfx.me.me_fw_obj,
>   			      &adev->gfx.me.me_fw_gpu_addr,
>   			      (void **)&adev->gfx.me.me_fw_ptr);
>   }
>   
>   static int gfx_v10_0_sw_fini(void *handle)
>   {
>   	int i;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> +	amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
> +	amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
> +
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>   	for (i = 0; i < adev->gfx.num_compute_rings; i++)
>   		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>   
>   	amdgpu_gfx_mqd_sw_fini(adev);
>   	amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
>   	amdgpu_gfx_kiq_fini(adev);
>   
>   	gfx_v10_0_pfp_fini(adev);
> @@ -5181,20 +5197,24 @@ static void gfx_v10_0_set_gds_init(struct amdgpu_device *adev)
>   	case CHIP_NAVI10:
>   	default:
>   		adev->gds.gds_size = 0x10000;
>   		adev->gds.gds_compute_max_wave_id = 0x4ff;
>   		adev->gds.vgt_gs_max_wave_id = 0x3ff;
>   		break;
>   	}
>   
>   	adev->gds.gws_size = 64;
>   	adev->gds.oa_size = 16;
> +
> +	/* Reserved for transform feedback. */
> +	adev->gds.gds_gfx_reservation_size = 256;
> +	adev->gds.oa_gfx_reservation_size = 4;
>   }
>   
>   static void gfx_v10_0_set_user_wgp_inactive_bitmap_per_sh(struct amdgpu_device *adev,
>   							  u32 bitmap)
>   {
>   	u32 data;
>   
>   	if (!bitmap)
>   		return;
>