drm/amdgpu: reserve GDS resources statically

Submitted by Marek Olšák on Sept. 12, 2018, 8:33 p.m.

Details

Message ID 20180912203358.2172-1-maraeo@gmail.com
State New
Headers show
Series "drm/amdgpu: reserve GDS resources statically" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Marek Olšák Sept. 12, 2018, 8:33 p.m.
From: Marek Olšák <marek.olsak@amd.com>

I've chosen to do it like this because it's easy and allows an arbitrary
number of processes.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  10 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |   3 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      |  20 ----
 drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h     |  19 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c     |  24 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c     |   6 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h     |   7 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |   3 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  14 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  |  21 ----
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |   6 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |   5 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     |  61 ----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |   8 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  34 +-----
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c       | 125 +++++---------------
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c       | 123 +++++--------------
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c       | 124 ++++++-------------
 include/uapi/drm/amdgpu_drm.h               |  15 +--
 19 files changed, 109 insertions(+), 519 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
index b80243d3972e..7264a4930b88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
@@ -71,23 +71,20 @@  int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
 				/ sizeof(struct amdgpu_bo_list_entry))
 		return -EINVAL;
 
 	size = sizeof(struct amdgpu_bo_list);
 	size += num_entries * sizeof(struct amdgpu_bo_list_entry);
 	list = kvmalloc(size, GFP_KERNEL);
 	if (!list)
 		return -ENOMEM;
 
 	kref_init(&list->refcount);
-	list->gds_obj = adev->gds.gds_gfx_bo;
-	list->gws_obj = adev->gds.gws_gfx_bo;
-	list->oa_obj = adev->gds.oa_gfx_bo;
 
 	array = amdgpu_bo_list_array_entry(list, 0);
 	memset(array, 0, num_entries * sizeof(struct amdgpu_bo_list_entry));
 
 	for (i = 0; i < num_entries; ++i) {
 		struct amdgpu_bo_list_entry *entry;
 		struct drm_gem_object *gobj;
 		struct amdgpu_bo *bo;
 		struct mm_struct *usermm;
 
@@ -111,27 +108,20 @@  int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
 		} else {
 			entry = &array[last_entry++];
 		}
 
 		entry->robj = bo;
 		entry->priority = min(info[i].bo_priority,
 				      AMDGPU_BO_LIST_MAX_PRIORITY);
 		entry->tv.bo = &entry->robj->tbo;
 		entry->tv.shared = !entry->robj->prime_shared_count;
 
-		if (entry->robj->preferred_domains == AMDGPU_GEM_DOMAIN_GDS)
-			list->gds_obj = entry->robj;
-		if (entry->robj->preferred_domains == AMDGPU_GEM_DOMAIN_GWS)
-			list->gws_obj = entry->robj;
-		if (entry->robj->preferred_domains == AMDGPU_GEM_DOMAIN_OA)
-			list->oa_obj = entry->robj;
-
 		total_size += amdgpu_bo_size(entry->robj);
 		trace_amdgpu_bo_list_set(list, entry->robj);
 	}
 
 	list->first_userptr = first_userptr;
 	list->num_entries = num_entries;
 
 	trace_amdgpu_cs_bo_status(list->num_entries, total_size);
 
 	*result = list;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
index 61b089768e1c..30f12a60aa28 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
@@ -36,23 +36,20 @@  struct amdgpu_bo_list_entry {
 	struct ttm_validate_buffer	tv;
 	struct amdgpu_bo_va		*bo_va;
 	uint32_t			priority;
 	struct page			**user_pages;
 	int				user_invalidated;
 };
 
 struct amdgpu_bo_list {
 	struct rcu_head rhead;
 	struct kref refcount;
-	struct amdgpu_bo *gds_obj;
-	struct amdgpu_bo *gws_obj;
-	struct amdgpu_bo *oa_obj;
 	unsigned first_userptr;
 	unsigned num_entries;
 };
 
 int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
 		       struct amdgpu_bo_list **result);
 void amdgpu_bo_list_get_list(struct amdgpu_bo_list *list,
 			     struct list_head *validated);
 void amdgpu_bo_list_put(struct amdgpu_bo_list *list);
 int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 1081fd00b059..88b58facf29e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -565,23 +565,20 @@  static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p,
 	return 0;
 }
 
 static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 				union drm_amdgpu_cs *cs)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
 	struct amdgpu_vm *vm = &fpriv->vm;
 	struct amdgpu_bo_list_entry *e;
 	struct list_head duplicates;
-	struct amdgpu_bo *gds;
-	struct amdgpu_bo *gws;
-	struct amdgpu_bo *oa;
 	unsigned tries = 10;
 	int r;
 
 	INIT_LIST_HEAD(&p->validated);
 
 	/* p->bo_list could already be assigned if AMDGPU_CHUNK_ID_BO_HANDLES is present */
 	if (cs->in.bo_list_handle) {
 		if (p->bo_list)
 			return -EINVAL;
 
@@ -705,40 +702,23 @@  static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 
 	r = amdgpu_cs_list_validate(p, &p->validated);
 	if (r) {
 		DRM_ERROR("amdgpu_cs_list_validate(validated) failed.\n");
 		goto error_validate;
 	}
 
 	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
 				     p->bytes_moved_vis);
 
-	gds = p->bo_list->gds_obj;
-	gws = p->bo_list->gws_obj;
-	oa = p->bo_list->oa_obj;
-
 	amdgpu_bo_list_for_each_entry(e, p->bo_list)
 		e->bo_va = amdgpu_vm_bo_find(vm, e->robj);
 
-	if (gds) {
-		p->job->gds_base = amdgpu_bo_gpu_offset(gds);
-		p->job->gds_size = amdgpu_bo_size(gds);
-	}
-	if (gws) {
-		p->job->gws_base = amdgpu_bo_gpu_offset(gws);
-		p->job->gws_size = amdgpu_bo_size(gws);
-	}
-	if (oa) {
-		p->job->oa_base = amdgpu_bo_gpu_offset(oa);
-		p->job->oa_size = amdgpu_bo_size(oa);
-	}
-
 	if (!r && p->uf_entry.robj) {
 		struct amdgpu_bo *uf = p->uf_entry.robj;
 
 		r = amdgpu_ttm_alloc_gart(&uf->tbo);
 		p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
 	}
 
 error_validate:
 	if (r)
 		ttm_eu_backoff_reservation(&p->ticket, &p->validated);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
index e73728d90388..69ba25c2e921 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
@@ -17,48 +17,33 @@ 
  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  * OTHER DEALINGS IN THE SOFTWARE.
  *
  */
 
 #ifndef __AMDGPU_GDS_H__
 #define __AMDGPU_GDS_H__
 
-/* Because TTM request that alloacted buffer should be PAGE_SIZE aligned,
- * we should report GDS/GWS/OA size as PAGE_SIZE aligned
- * */
-#define AMDGPU_GDS_SHIFT	2
-#define AMDGPU_GWS_SHIFT	PAGE_SHIFT
-#define AMDGPU_OA_SHIFT		PAGE_SHIFT
-
 struct amdgpu_ring;
 struct amdgpu_bo;
 
 struct amdgpu_gds_asic_info {
 	uint32_t	total_size;
-	uint32_t	gfx_partition_size;
-	uint32_t	cs_partition_size;
+	uint32_t	gfx_size_per_vmid;
+	uint32_t	kfd_size_per_vmid;
 };
 
 struct amdgpu_gds {
 	struct amdgpu_gds_asic_info	mem;
 	struct amdgpu_gds_asic_info	gws;
 	struct amdgpu_gds_asic_info	oa;
-	/* At present, GDS, GWS and OA resources for gfx (graphics)
-	 * is always pre-allocated and available for graphics operation.
-	 * Such resource is shared between all gfx clients.
-	 * TODO: move this operation to user space
-	 * */
-	struct amdgpu_bo*		gds_gfx_bo;
-	struct amdgpu_bo*		gws_gfx_bo;
-	struct amdgpu_bo*		oa_gfx_bo;
 };
 
 struct amdgpu_gds_reg_offset {
 	uint32_t	mem_base;
 	uint32_t	mem_size;
 	uint32_t	gws;
 	uint32_t	oa;
 };
 
 #endif /* __AMDGPU_GDS_H__ */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index d30a0838851b..c87ad4b4d0b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -223,43 +223,25 @@  int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data,
 	if (flags & ~(AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
 		      AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
 		      AMDGPU_GEM_CREATE_CPU_GTT_USWC |
 		      AMDGPU_GEM_CREATE_VRAM_CLEARED |
 		      AMDGPU_GEM_CREATE_VM_ALWAYS_VALID |
 		      AMDGPU_GEM_CREATE_EXPLICIT_SYNC))
 
 		return -EINVAL;
 
 	/* reject invalid gem domains */
-	if (args->in.domains & ~AMDGPU_GEM_DOMAIN_MASK)
+	if (args->in.domains & ~(AMDGPU_GEM_DOMAIN_CPU |
+				 AMDGPU_GEM_DOMAIN_GTT |
+				 AMDGPU_GEM_DOMAIN_VRAM))
 		return -EINVAL;
 
-	/* create a gem object to contain this object in */
-	if (args->in.domains & (AMDGPU_GEM_DOMAIN_GDS |
-	    AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA)) {
-		if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
-			/* if gds bo is created from user space, it must be
-			 * passed to bo list
-			 */
-			DRM_ERROR("GDS bo cannot be per-vm-bo\n");
-			return -EINVAL;
-		}
-		flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
-		if (args->in.domains == AMDGPU_GEM_DOMAIN_GDS)
-			size = size << AMDGPU_GDS_SHIFT;
-		else if (args->in.domains == AMDGPU_GEM_DOMAIN_GWS)
-			size = size << AMDGPU_GWS_SHIFT;
-		else if (args->in.domains == AMDGPU_GEM_DOMAIN_OA)
-			size = size << AMDGPU_OA_SHIFT;
-		else
-			return -EINVAL;
-	}
 	size = roundup(size, PAGE_SIZE);
 
 	if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
 		r = amdgpu_bo_reserve(vm->root.base.bo, false);
 		if (r)
 			return r;
 
 		resv = vm->root.base.bo->tbo.resv;
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
index 3a072a7a39f0..c2e6a1a11d7f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
@@ -516,26 +516,20 @@  void amdgpu_vmid_free_reserved(struct amdgpu_device *adev,
  * Reset saved GDW, GWS and OA to force switch on next flush.
  */
 void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
 		       unsigned vmid)
 {
 	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
 	struct amdgpu_vmid *id = &id_mgr->ids[vmid];
 
 	mutex_lock(&id_mgr->lock);
 	id->owner = 0;
-	id->gds_base = 0;
-	id->gds_size = 0;
-	id->gws_base = 0;
-	id->gws_size = 0;
-	id->oa_base = 0;
-	id->oa_size = 0;
 	mutex_unlock(&id_mgr->lock);
 }
 
 /**
  * amdgpu_vmid_reset_all - reset VMID to zero
  *
  * @adev: amdgpu device structure
  *
  * Reset VMID to force flush on next use
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
index 7625419f0fc2..06078e665532 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
@@ -44,27 +44,20 @@  struct amdgpu_vmid {
 	struct amdgpu_sync	active;
 	struct dma_fence	*last_flush;
 	uint64_t		owner;
 
 	uint64_t		pd_gpu_addr;
 	/* last flushed PD/PT update */
 	struct dma_fence	*flushed_updates;
 
 	uint32_t                current_gpu_reset_count;
 
-	uint32_t		gds_base;
-	uint32_t		gds_size;
-	uint32_t		gws_base;
-	uint32_t		gws_size;
-	uint32_t		oa_base;
-	uint32_t		oa_size;
-
 	unsigned		pasid;
 	struct dma_fence	*pasid_mapping;
 };
 
 struct amdgpu_vmid_mgr {
 	struct mutex		lock;
 	unsigned		num_ids;
 	struct list_head	ids_lru;
 	struct amdgpu_vmid	ids[AMDGPU_NUM_VMID];
 	atomic_t		reserved_vmid_num;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 57cfe78a262b..3db553f6ad01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -42,23 +42,20 @@  struct amdgpu_job {
 	struct amdgpu_sync	sched_sync;
 	struct amdgpu_ib	*ibs;
 	struct dma_fence	*fence; /* the hw fence */
 	uint32_t		preamble_status;
 	uint32_t		num_ibs;
 	void			*owner;
 	bool                    vm_needs_flush;
 	uint64_t		vm_pd_addr;
 	unsigned		vmid;
 	unsigned		pasid;
-	uint32_t		gds_base, gds_size;
-	uint32_t		gws_base, gws_size;
-	uint32_t		oa_base, oa_size;
 	uint32_t		vram_lost_counter;
 
 	/* user fence handling */
 	uint64_t		uf_addr;
 	uint64_t		uf_sequence;
 
 };
 
 int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
 		     struct amdgpu_job **job, struct amdgpu_vm *vm);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 29ac3873eeb0..209954290954 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -517,27 +517,27 @@  static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
 	case AMDGPU_INFO_VIS_VRAM_USAGE:
 		ui64 = amdgpu_vram_mgr_vis_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
 		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
 	case AMDGPU_INFO_GTT_USAGE:
 		ui64 = amdgpu_gtt_mgr_usage(&adev->mman.bdev.man[TTM_PL_TT]);
 		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
 	case AMDGPU_INFO_GDS_CONFIG: {
 		struct drm_amdgpu_info_gds gds_info;
 
 		memset(&gds_info, 0, sizeof(gds_info));
-		gds_info.gds_gfx_partition_size = adev->gds.mem.gfx_partition_size >> AMDGPU_GDS_SHIFT;
-		gds_info.compute_partition_size = adev->gds.mem.cs_partition_size >> AMDGPU_GDS_SHIFT;
-		gds_info.gds_total_size = adev->gds.mem.total_size >> AMDGPU_GDS_SHIFT;
-		gds_info.gws_per_gfx_partition = adev->gds.gws.gfx_partition_size >> AMDGPU_GWS_SHIFT;
-		gds_info.gws_per_compute_partition = adev->gds.gws.cs_partition_size >> AMDGPU_GWS_SHIFT;
-		gds_info.oa_per_gfx_partition = adev->gds.oa.gfx_partition_size >> AMDGPU_OA_SHIFT;
-		gds_info.oa_per_compute_partition = adev->gds.oa.cs_partition_size >> AMDGPU_OA_SHIFT;
+		gds_info.gds_gfx_partition_size = adev->gds.mem.gfx_size_per_vmid;
+		gds_info.compute_partition_size = adev->gds.mem.kfd_size_per_vmid;
+		gds_info.gds_total_size = adev->gds.mem.total_size;
+		gds_info.gws_per_gfx_partition = adev->gds.gws.gfx_size_per_vmid;
+		gds_info.gws_per_compute_partition = adev->gds.gws.kfd_size_per_vmid;
+		gds_info.oa_per_gfx_partition = adev->gds.oa.gfx_size_per_vmid;
+		gds_info.oa_per_compute_partition = adev->gds.oa.kfd_size_per_vmid;
 		return copy_to_user(out, &gds_info,
 				    min((size_t)size, sizeof(gds_info))) ? -EFAULT : 0;
 	}
 	case AMDGPU_INFO_VRAM_GTT: {
 		struct drm_amdgpu_info_vram_gtt vram_gtt;
 
 		vram_gtt.vram_size = adev->gmc.real_vram_size -
 			atomic64_read(&adev->vram_pin_size);
 		vram_gtt.vram_cpu_accessible_size = adev->gmc.visible_vram_size -
 			atomic64_read(&adev->visible_pin_size);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index de990bdcdd6c..76770a8c29a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -178,41 +178,20 @@  void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
 		places[c].lpfn = 0;
 		places[c].flags = TTM_PL_FLAG_SYSTEM;
 		if (flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
 			places[c].flags |= TTM_PL_FLAG_WC |
 				TTM_PL_FLAG_UNCACHED;
 		else
 			places[c].flags |= TTM_PL_FLAG_CACHED;
 		c++;
 	}
 
-	if (domain & AMDGPU_GEM_DOMAIN_GDS) {
-		places[c].fpfn = 0;
-		places[c].lpfn = 0;
-		places[c].flags = TTM_PL_FLAG_UNCACHED | AMDGPU_PL_FLAG_GDS;
-		c++;
-	}
-
-	if (domain & AMDGPU_GEM_DOMAIN_GWS) {
-		places[c].fpfn = 0;
-		places[c].lpfn = 0;
-		places[c].flags = TTM_PL_FLAG_UNCACHED | AMDGPU_PL_FLAG_GWS;
-		c++;
-	}
-
-	if (domain & AMDGPU_GEM_DOMAIN_OA) {
-		places[c].fpfn = 0;
-		places[c].lpfn = 0;
-		places[c].flags = TTM_PL_FLAG_UNCACHED | AMDGPU_PL_FLAG_OA;
-		c++;
-	}
-
 	if (!c) {
 		places[c].fpfn = 0;
 		places[c].lpfn = 0;
 		places[c].flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_SYSTEM;
 		c++;
 	}
 
 	BUG_ON(c >= AMDGPU_BO_MAX_PLACEMENTS);
 
 	placement->num_placement = c;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 907fdf46d895..e089964cbcb7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -120,26 +120,20 @@  static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
  */
 static inline unsigned amdgpu_mem_type_to_domain(u32 mem_type)
 {
 	switch (mem_type) {
 	case TTM_PL_VRAM:
 		return AMDGPU_GEM_DOMAIN_VRAM;
 	case TTM_PL_TT:
 		return AMDGPU_GEM_DOMAIN_GTT;
 	case TTM_PL_SYSTEM:
 		return AMDGPU_GEM_DOMAIN_CPU;
-	case AMDGPU_PL_GDS:
-		return AMDGPU_GEM_DOMAIN_GDS;
-	case AMDGPU_PL_GWS:
-		return AMDGPU_GEM_DOMAIN_GWS;
-	case AMDGPU_PL_OA:
-		return AMDGPU_GEM_DOMAIN_OA;
 	default:
 		break;
 	}
 	return 0;
 }
 
 /**
  * amdgpu_bo_reserve - reserve bo
  * @bo:		bo structure
  * @no_intr:	don't return -ERESTARTSYS on pending signal
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 9cc239968e40..f6ea9604e611 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -130,24 +130,20 @@  struct amdgpu_ring_funcs {
 	/* command emit functions */
 	void (*emit_ib)(struct amdgpu_ring *ring,
 			struct amdgpu_ib *ib,
 			unsigned vmid, bool ctx_switch);
 	void (*emit_fence)(struct amdgpu_ring *ring, uint64_t addr,
 			   uint64_t seq, unsigned flags);
 	void (*emit_pipeline_sync)(struct amdgpu_ring *ring);
 	void (*emit_vm_flush)(struct amdgpu_ring *ring, unsigned vmid,
 			      uint64_t pd_addr);
 	void (*emit_hdp_flush)(struct amdgpu_ring *ring);
-	void (*emit_gds_switch)(struct amdgpu_ring *ring, uint32_t vmid,
-				uint32_t gds_base, uint32_t gds_size,
-				uint32_t gws_base, uint32_t gws_size,
-				uint32_t oa_base, uint32_t oa_size);
 	/* testing functions */
 	int (*test_ring)(struct amdgpu_ring *ring);
 	int (*test_ib)(struct amdgpu_ring *ring, long timeout);
 	/* insert NOP packets */
 	void (*insert_nop)(struct amdgpu_ring *ring, uint32_t count);
 	void (*insert_start)(struct amdgpu_ring *ring);
 	void (*insert_end)(struct amdgpu_ring *ring);
 	/* pad the indirect buffer to the necessary number of dw */
 	void (*pad_ib)(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
 	unsigned (*init_cond_exec)(struct amdgpu_ring *ring);
@@ -226,21 +222,20 @@  struct amdgpu_ring {
 #define amdgpu_ring_patch_cs_in_place(r, p, ib) ((r)->funcs->patch_cs_in_place((p), (ib)))
 #define amdgpu_ring_test_ring(r) (r)->funcs->test_ring((r))
 #define amdgpu_ring_test_ib(r, t) (r)->funcs->test_ib((r), (t))
 #define amdgpu_ring_get_rptr(r) (r)->funcs->get_rptr((r))
 #define amdgpu_ring_get_wptr(r) (r)->funcs->get_wptr((r))
 #define amdgpu_ring_set_wptr(r) (r)->funcs->set_wptr((r))
 #define amdgpu_ring_emit_ib(r, ib, vmid, c) (r)->funcs->emit_ib((r), (ib), (vmid), (c))
 #define amdgpu_ring_emit_pipeline_sync(r) (r)->funcs->emit_pipeline_sync((r))
 #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr))
 #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags))
-#define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
 #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
 #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
 #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
 #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
 #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
 #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
 #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
 #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
 #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 8a158ee922f7..2cc62b0e7ea8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -195,30 +195,20 @@  static int amdgpu_init_mem_type(struct ttm_bo_device *bdev, uint32_t type,
 		break;
 	case TTM_PL_VRAM:
 		/* "On-card" video ram */
 		man->func = &amdgpu_vram_mgr_func;
 		man->gpu_offset = adev->gmc.vram_start;
 		man->flags = TTM_MEMTYPE_FLAG_FIXED |
 			     TTM_MEMTYPE_FLAG_MAPPABLE;
 		man->available_caching = TTM_PL_FLAG_UNCACHED | TTM_PL_FLAG_WC;
 		man->default_caching = TTM_PL_FLAG_WC;
 		break;
-	case AMDGPU_PL_GDS:
-	case AMDGPU_PL_GWS:
-	case AMDGPU_PL_OA:
-		/* On-chip GDS memory*/
-		man->func = &ttm_bo_manager_func;
-		man->gpu_offset = 0;
-		man->flags = TTM_MEMTYPE_FLAG_FIXED | TTM_MEMTYPE_FLAG_CMA;
-		man->available_caching = TTM_PL_FLAG_UNCACHED;
-		man->default_caching = TTM_PL_FLAG_UNCACHED;
-		break;
 	default:
 		DRM_ERROR("Unsupported memory type %u\n", (unsigned)type);
 		return -EINVAL;
 	}
 	return 0;
 }
 
 /**
  * amdgpu_evict_flags - Compute placement flags
  *
@@ -1039,25 +1029,20 @@  static int amdgpu_ttm_backend_bind(struct ttm_tt *ttm,
 		if (r) {
 			DRM_ERROR("failed to pin userptr\n");
 			return r;
 		}
 	}
 	if (!ttm->num_pages) {
 		WARN(1, "nothing to bind %lu pages for mreg %p back %p!\n",
 		     ttm->num_pages, bo_mem, ttm);
 	}
 
-	if (bo_mem->mem_type == AMDGPU_PL_GDS ||
-	    bo_mem->mem_type == AMDGPU_PL_GWS ||
-	    bo_mem->mem_type == AMDGPU_PL_OA)
-		return -EINVAL;
-
 	if (!amdgpu_gtt_mgr_has_gart_addr(bo_mem)) {
 		gtt->offset = AMDGPU_BO_INVALID_OFFSET;
 		return 0;
 	}
 
 	/* compute PTE flags relevant to this BO memory */
 	flags = amdgpu_ttm_tt_pte_flags(adev, ttm, bo_mem);
 
 	/* bind pages into GART page tables */
 	gtt->offset = ((u64)bo_mem->start << PAGE_SHIFT) - adev->gmc.gart_start;
@@ -1818,60 +1803,20 @@  int amdgpu_ttm_init(struct amdgpu_device *adev)
 
 	/* Initialize GTT memory pool */
 	r = ttm_bo_init_mm(&adev->mman.bdev, TTM_PL_TT, gtt_size >> PAGE_SHIFT);
 	if (r) {
 		DRM_ERROR("Failed initializing GTT heap.\n");
 		return r;
 	}
 	DRM_INFO("amdgpu: %uM of GTT memory ready.\n",
 		 (unsigned)(gtt_size / (1024 * 1024)));
 
-	/* Initialize various on-chip memory pools */
-	adev->gds.mem.total_size = adev->gds.mem.total_size << AMDGPU_GDS_SHIFT;
-	adev->gds.mem.gfx_partition_size = adev->gds.mem.gfx_partition_size << AMDGPU_GDS_SHIFT;
-	adev->gds.mem.cs_partition_size = adev->gds.mem.cs_partition_size << AMDGPU_GDS_SHIFT;
-	adev->gds.gws.total_size = adev->gds.gws.total_size << AMDGPU_GWS_SHIFT;
-	adev->gds.gws.gfx_partition_size = adev->gds.gws.gfx_partition_size << AMDGPU_GWS_SHIFT;
-	adev->gds.gws.cs_partition_size = adev->gds.gws.cs_partition_size << AMDGPU_GWS_SHIFT;
-	adev->gds.oa.total_size = adev->gds.oa.total_size << AMDGPU_OA_SHIFT;
-	adev->gds.oa.gfx_partition_size = adev->gds.oa.gfx_partition_size << AMDGPU_OA_SHIFT;
-	adev->gds.oa.cs_partition_size = adev->gds.oa.cs_partition_size << AMDGPU_OA_SHIFT;
-	/* GDS Memory */
-	if (adev->gds.mem.total_size) {
-		r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GDS,
-				   adev->gds.mem.total_size >> PAGE_SHIFT);
-		if (r) {
-			DRM_ERROR("Failed initializing GDS heap.\n");
-			return r;
-		}
-	}
-
-	/* GWS */
-	if (adev->gds.gws.total_size) {
-		r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GWS,
-				   adev->gds.gws.total_size >> PAGE_SHIFT);
-		if (r) {
-			DRM_ERROR("Failed initializing gws heap.\n");
-			return r;
-		}
-	}
-
-	/* OA */
-	if (adev->gds.oa.total_size) {
-		r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_OA,
-				   adev->gds.oa.total_size >> PAGE_SHIFT);
-		if (r) {
-			DRM_ERROR("Failed initializing oa heap.\n");
-			return r;
-		}
-	}
-
 	/* Register debugfs entries for amdgpu_ttm */
 	r = amdgpu_ttm_debugfs_init(adev);
 	if (r) {
 		DRM_ERROR("Failed to init debugfs\n");
 		return r;
 	}
 	return 0;
 }
 
 /**
@@ -1892,26 +1837,20 @@  void amdgpu_ttm_fini(struct amdgpu_device *adev)
 		return;
 
 	amdgpu_ttm_debugfs_fini(adev);
 	amdgpu_ttm_fw_reserve_vram_fini(adev);
 	if (adev->mman.aper_base_kaddr)
 		iounmap(adev->mman.aper_base_kaddr);
 	adev->mman.aper_base_kaddr = NULL;
 
 	ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_VRAM);
 	ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_TT);
-	if (adev->gds.mem.total_size)
-		ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GDS);
-	if (adev->gds.gws.total_size)
-		ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GWS);
-	if (adev->gds.oa.total_size)
-		ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_OA);
 	ttm_bo_device_release(&adev->mman.bdev);
 	amdgpu_ttm_global_fini(adev);
 	adev->mman.initialized = false;
 	DRM_INFO("amdgpu: ttm finalized\n");
 }
 
 /**
  * amdgpu_ttm_set_buffer_funcs_status - enable/disable use of buffer functions
  *
  * @adev: amdgpu_device pointer
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index fe8f276e9811..04557a382b19 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -20,28 +20,20 @@ 
  * OTHER DEALINGS IN THE SOFTWARE.
  *
  */
 
 #ifndef __AMDGPU_TTM_H__
 #define __AMDGPU_TTM_H__
 
 #include "amdgpu.h"
 #include <drm/gpu_scheduler.h>
 
-#define AMDGPU_PL_GDS		(TTM_PL_PRIV + 0)
-#define AMDGPU_PL_GWS		(TTM_PL_PRIV + 1)
-#define AMDGPU_PL_OA		(TTM_PL_PRIV + 2)
-
-#define AMDGPU_PL_FLAG_GDS		(TTM_PL_FLAG_PRIV << 0)
-#define AMDGPU_PL_FLAG_GWS		(TTM_PL_FLAG_PRIV << 1)
-#define AMDGPU_PL_FLAG_OA		(TTM_PL_FLAG_PRIV << 2)
-
 #define AMDGPU_GTT_MAX_TRANSFER_SIZE	512
 #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS	2
 
 struct amdgpu_mman {
 	struct ttm_bo_global_ref        bo_global_ref;
 	struct drm_global_reference	mem_global_ref;
 	struct ttm_bo_device		bdev;
 	bool				mem_global_referenced;
 	bool				initialized;
 	void __iomem			*aper_base_kaddr;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index be1659fedf94..c66f1c6f0ba8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -803,86 +803,69 @@  void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
  * Returns:
  * True if sync is needed.
  */
 bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
 				  struct amdgpu_job *job)
 {
 	struct amdgpu_device *adev = ring->adev;
 	unsigned vmhub = ring->funcs->vmhub;
 	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
 	struct amdgpu_vmid *id;
-	bool gds_switch_needed;
 	bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug;
 
 	if (job->vmid == 0)
 		return false;
 	id = &id_mgr->ids[job->vmid];
-	gds_switch_needed = ring->funcs->emit_gds_switch && (
-		id->gds_base != job->gds_base ||
-		id->gds_size != job->gds_size ||
-		id->gws_base != job->gws_base ||
-		id->gws_size != job->gws_size ||
-		id->oa_base != job->oa_base ||
-		id->oa_size != job->oa_size);
 
 	if (amdgpu_vmid_had_gpu_reset(adev, id))
 		return true;
 
-	return vm_flush_needed || gds_switch_needed;
+	return vm_flush_needed;
 }
 
 /**
  * amdgpu_vm_flush - hardware flush the vm
  *
  * @ring: ring to use for flush
  * @job:  related job
  * @need_pipe_sync: is pipe sync needed
  *
  * Emit a VM flush when it is necessary.
  *
  * Returns:
  * 0 on success, errno otherwise.
  */
 int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync)
 {
 	struct amdgpu_device *adev = ring->adev;
 	unsigned vmhub = ring->funcs->vmhub;
 	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
 	struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
-	bool gds_switch_needed = ring->funcs->emit_gds_switch && (
-		id->gds_base != job->gds_base ||
-		id->gds_size != job->gds_size ||
-		id->gws_base != job->gws_base ||
-		id->gws_size != job->gws_size ||
-		id->oa_base != job->oa_base ||
-		id->oa_size != job->oa_size);
 	bool vm_flush_needed = job->vm_needs_flush;
 	bool pasid_mapping_needed = id->pasid != job->pasid ||
 		!id->pasid_mapping ||
 		!dma_fence_is_signaled(id->pasid_mapping);
 	struct dma_fence *fence = NULL;
 	unsigned patch_offset = 0;
 	int r;
 
 	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
-		gds_switch_needed = true;
 		vm_flush_needed = true;
 		pasid_mapping_needed = true;
 	}
 
-	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
 	vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
 			job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
 	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
 		ring->funcs->emit_wreg;
 
-	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
+	if (!vm_flush_needed && !need_pipe_sync)
 		return 0;
 
 	if (ring->funcs->init_cond_exec)
 		patch_offset = amdgpu_ring_init_cond_exec(ring);
 
 	if (need_pipe_sync)
 		amdgpu_ring_emit_pipeline_sync(ring);
 
 	if (vm_flush_needed) {
 		trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
@@ -907,33 +890,20 @@  int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
 		mutex_unlock(&id_mgr->lock);
 	}
 
 	if (pasid_mapping_needed) {
 		id->pasid = job->pasid;
 		dma_fence_put(id->pasid_mapping);
 		id->pasid_mapping = dma_fence_get(fence);
 	}
 	dma_fence_put(fence);
 
-	if (ring->funcs->emit_gds_switch && gds_switch_needed) {
-		id->gds_base = job->gds_base;
-		id->gds_size = job->gds_size;
-		id->gws_base = job->gws_base;
-		id->gws_size = job->gws_size;
-		id->oa_base = job->oa_base;
-		id->oa_size = job->oa_size;
-		amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
-					    job->gds_size, job->gws_base,
-					    job->gws_size, job->oa_base,
-					    job->oa_size);
-	}
-
 	if (ring->funcs->patch_cond_exec)
 		amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
 	/* the double SWITCH_BUFFER here *cannot* be skipped by COND_EXEC */
 	if (ring->funcs->emit_switch_buffer) {
 		amdgpu_ring_emit_switch_buffer(ring);
 		amdgpu_ring_emit_switch_buffer(ring);
 	}
 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index a15d9c0f233b..f5228e169c3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -1890,21 +1890,21 @@  static void gfx_v7_0_config_init(struct amdgpu_device *adev)
  *
  * @adev: amdgpu_device pointer
  *
  * Configures the 3D engine and tiling configuration
  * registers so that the 3D engine is usable.
  */
 static void gfx_v7_0_gpu_init(struct amdgpu_device *adev)
 {
 	u32 sh_mem_cfg, sh_static_mem_cfg, sh_mem_base;
 	u32 tmp;
-	int i;
+	int i, vmid;
 
 	WREG32(mmGRBM_CNTL, (0xff << GRBM_CNTL__READ_TIMEOUT__SHIFT));
 
 	WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
 	WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
 	WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
 
 	gfx_v7_0_tiling_mode_table_init(adev);
 
 	gfx_v7_0_setup_rb(adev);
@@ -2014,20 +2014,42 @@  static void gfx_v7_0_gpu_init(struct amdgpu_device *adev)
 
 	tmp = RREG32(mmSPI_ARB_PRIORITY);
 	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
 	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
 	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
 	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
 	WREG32(mmSPI_ARB_PRIORITY, tmp);
 
 	mutex_unlock(&adev->grbm_idx_mutex);
 
+	for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
+		unsigned gds_size, gws_size, oa_size;
+
+		if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids) {
+			gds_size = adev->gds.mem.gfx_size_per_vmid;
+			gws_size = adev->gds.gws.gfx_size_per_vmid;
+			oa_size = adev->gds.oa.gfx_size_per_vmid;
+		} else {
+			gds_size = adev->gds.mem.kfd_size_per_vmid;
+			gws_size = adev->gds.gws.kfd_size_per_vmid;
+			oa_size = adev->gds.oa.kfd_size_per_vmid;
+		}
+
+		WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid * gds_size);
+		WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
+		WREG32(amdgpu_gds_reg_offset[vmid].gws,
+		       (vmid * gws_size) |
+		       (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
+		WREG32(amdgpu_gds_reg_offset[vmid].oa,
+		       ((1 << oa_size) - 1) << (vmid * oa_size));
+	}
+
 	udelay(50);
 }
 
 /*
  * GPU scratch registers helpers function.
  */
 /**
  * gfx_v7_0_scratch_init - setup driver info for CP scratch regs
  *
  * @adev: amdgpu_device pointer
@@ -4157,68 +4179,20 @@  static uint64_t gfx_v7_0_get_gpu_clock_counter(struct amdgpu_device *adev)
 	uint64_t clock;
 
 	mutex_lock(&adev->gfx.gpu_clock_mutex);
 	WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
 	clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
 		((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
 	mutex_unlock(&adev->gfx.gpu_clock_mutex);
 	return clock;
 }
 
-static void gfx_v7_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
-					  uint32_t vmid,
-					  uint32_t gds_base, uint32_t gds_size,
-					  uint32_t gws_base, uint32_t gws_size,
-					  uint32_t oa_base, uint32_t oa_size)
-{
-	gds_base = gds_base >> AMDGPU_GDS_SHIFT;
-	gds_size = gds_size >> AMDGPU_GDS_SHIFT;
-
-	gws_base = gws_base >> AMDGPU_GWS_SHIFT;
-	gws_size = gws_size >> AMDGPU_GWS_SHIFT;
-
-	oa_base = oa_base >> AMDGPU_OA_SHIFT;
-	oa_size = oa_size >> AMDGPU_OA_SHIFT;
-
-	/* GDS Base */
-	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
-	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-				WRITE_DATA_DST_SEL(0)));
-	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_base);
-	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, gds_base);
-
-	/* GDS Size */
-	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
-	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-				WRITE_DATA_DST_SEL(0)));
-	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_size);
-	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, gds_size);
-
-	/* GWS */
-	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
-	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-				WRITE_DATA_DST_SEL(0)));
-	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].gws);
-	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, gws_size << GDS_GWS_VMID0__SIZE__SHIFT | gws_base);
-
-	/* OA */
-	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
-	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-				WRITE_DATA_DST_SEL(0)));
-	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
-	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
-}
-
 static void gfx_v7_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
 {
 	struct amdgpu_device *adev = ring->adev;
 	uint32_t value = 0;
 
 	value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
 	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
 	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
 	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
 	WREG32(mmSQ_CMD, value);
@@ -4584,55 +4558,32 @@  static int gfx_v7_0_sw_init(void *handle)
 								ring_id,
 								i, k, j);
 				if (r)
 					return r;
 
 				ring_id++;
 			}
 		}
 	}
 
-	/* reserve GDS, GWS and OA resource for gfx */
-	r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
-				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
-				    &adev->gds.gds_gfx_bo, NULL, NULL);
-	if (r)
-		return r;
-
-	r = amdgpu_bo_create_kernel(adev, adev->gds.gws.gfx_partition_size,
-				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
-				    &adev->gds.gws_gfx_bo, NULL, NULL);
-	if (r)
-		return r;
-
-	r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
-				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
-				    &adev->gds.oa_gfx_bo, NULL, NULL);
-	if (r)
-		return r;
-
 	adev->gfx.ce_ram_size = 0x8000;
 
 	gfx_v7_0_gpu_early_init(adev);
 
 	return r;
 }
 
 static int gfx_v7_0_sw_fini(void *handle)
 {
 	int i;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
-	amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
-	amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
-
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
 	for (i = 0; i < adev->gfx.num_compute_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
 
 	gfx_v7_0_cp_compute_fini(adev);
 	gfx_v7_0_rlc_fini(adev);
 	gfx_v7_0_mec_fini(adev);
 	amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
 				&adev->gfx.rlc.clear_state_gpu_addr,
@@ -5073,64 +5024,60 @@  static const struct amd_ip_funcs gfx_v7_0_ip_funcs = {
 
 static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
 	.type = AMDGPU_RING_TYPE_GFX,
 	.align_mask = 0xff,
 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
 	.support_64bit_ptrs = false,
 	.get_rptr = gfx_v7_0_ring_get_rptr,
 	.get_wptr = gfx_v7_0_ring_get_wptr_gfx,
 	.set_wptr = gfx_v7_0_ring_set_wptr_gfx,
 	.emit_frame_size =
-		20 + /* gfx_v7_0_ring_emit_gds_switch */
 		7 + /* gfx_v7_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
 		12 + 12 + 12 + /* gfx_v7_0_ring_emit_fence_gfx x3 for user fence, vm fence */
 		7 + 4 + /* gfx_v7_0_ring_emit_pipeline_sync */
 		CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + 6 + /* gfx_v7_0_ring_emit_vm_flush */
 		3 + 4, /* gfx_v7_ring_emit_cntxcntl including vgt flush*/
 	.emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_gfx */
 	.emit_ib = gfx_v7_0_ring_emit_ib_gfx,
 	.emit_fence = gfx_v7_0_ring_emit_fence_gfx,
 	.emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
 	.emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
-	.emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
 	.emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v7_0_ring_test_ring,
 	.test_ib = gfx_v7_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
 	.emit_wreg = gfx_v7_0_ring_emit_wreg,
 	.soft_recovery = gfx_v7_0_ring_soft_recovery,
 };
 
 static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
 	.type = AMDGPU_RING_TYPE_COMPUTE,
 	.align_mask = 0xff,
 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
 	.support_64bit_ptrs = false,
 	.get_rptr = gfx_v7_0_ring_get_rptr,
 	.get_wptr = gfx_v7_0_ring_get_wptr_compute,
 	.set_wptr = gfx_v7_0_ring_set_wptr_compute,
 	.emit_frame_size =
-		20 + /* gfx_v7_0_ring_emit_gds_switch */
 		7 + /* gfx_v7_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
 		7 + /* gfx_v7_0_ring_emit_pipeline_sync */
 		CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
 	.emit_ib_size =	4, /* gfx_v7_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v7_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v7_0_ring_emit_fence_compute,
 	.emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
 	.emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
-	.emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
 	.emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v7_0_ring_test_ring,
 	.test_ib = gfx_v7_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_wreg = gfx_v7_0_ring_emit_wreg,
 };
 
 static void gfx_v7_0_set_ring_funcs(struct amdgpu_device *adev)
 {
@@ -5169,42 +5116,28 @@  static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev)
 	adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
 }
 
 static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
 {
 	/* init asci gds info */
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
 
-	if (adev->gds.mem.total_size == 64 * 1024) {
-		adev->gds.mem.gfx_partition_size = 4096;
-		adev->gds.mem.cs_partition_size = 4096;
-
-		adev->gds.gws.gfx_partition_size = 4;
-		adev->gds.gws.cs_partition_size = 4;
-
-		adev->gds.oa.gfx_partition_size = 4;
-		adev->gds.oa.cs_partition_size = 1;
-	} else {
-		adev->gds.mem.gfx_partition_size = 1024;
-		adev->gds.mem.cs_partition_size = 1024;
-
-		adev->gds.gws.gfx_partition_size = 16;
-		adev->gds.gws.cs_partition_size = 16;
-
-		adev->gds.oa.gfx_partition_size = 4;
-		adev->gds.oa.cs_partition_size = 4;
-	}
+	adev->gds.mem.gfx_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
+	adev->gds.mem.kfd_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
+	adev->gds.gws.gfx_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
+	adev->gds.gws.kfd_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
+	adev->gds.oa.gfx_size_per_vmid = adev->gds.oa.total_size / 8; /* gfx only */
+	adev->gds.oa.kfd_size_per_vmid = 0;
 }
 
-
 static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev)
 {
 	int i, j, k, counter, active_cu_number = 0;
 	u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0;
 	struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
 	unsigned disable_masks[4 * 2];
 	u32 ao_cu_num;
 
 	if (adev->flags & AMD_IS_APU)
 		ao_cu_num = 2;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 3882689b2d8f..b11a54bd0668 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -2154,57 +2154,34 @@  static int gfx_v8_0_sw_init(void *handle)
 	kiq = &adev->gfx.kiq;
 	r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
 	if (r)
 		return r;
 
 	/* create MQD for all compute queues as well as KIQ for SRIOV case */
 	r = amdgpu_gfx_compute_mqd_sw_init(adev, sizeof(struct vi_mqd_allocation));
 	if (r)
 		return r;
 
-	/* reserve GDS, GWS and OA resource for gfx */
-	r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
-				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
-				    &adev->gds.gds_gfx_bo, NULL, NULL);
-	if (r)
-		return r;
-
-	r = amdgpu_bo_create_kernel(adev, adev->gds.gws.gfx_partition_size,
-				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
-				    &adev->gds.gws_gfx_bo, NULL, NULL);
-	if (r)
-		return r;
-
-	r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
-				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
-				    &adev->gds.oa_gfx_bo, NULL, NULL);
-	if (r)
-		return r;
-
 	adev->gfx.ce_ram_size = 0x8000;
 
 	r = gfx_v8_0_gpu_early_init(adev);
 	if (r)
 		return r;
 
 	return 0;
 }
 
 static int gfx_v8_0_sw_fini(void *handle)
 {
 	int i;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
-	amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
-	amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
-
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
 	for (i = 0; i < adev->gfx.num_compute_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
 
 	amdgpu_gfx_compute_mqd_sw_fini(adev);
 	amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
 	amdgpu_gfx_kiq_fini(adev);
 
 	gfx_v8_0_mec_fini(adev);
@@ -3850,21 +3827,21 @@  static void gfx_v8_0_config_init(struct amdgpu_device *adev)
 	case CHIP_CARRIZO:
 	case CHIP_STONEY:
 		adev->gfx.config.double_offchip_lds_buf = 0;
 		break;
 	}
 }
 
 static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
 {
 	u32 tmp, sh_static_mem_cfg;
-	int i;
+	int i, vmid;
 
 	WREG32_FIELD(GRBM_CNTL, READ_TIMEOUT, 0xFF);
 	WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
 	WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
 	WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
 
 	gfx_v8_0_tiling_mode_table_init(adev);
 	gfx_v8_0_setup_rb(adev);
 	gfx_v8_0_get_cu_info(adev);
 	gfx_v8_0_config_init(adev);
@@ -3927,20 +3904,41 @@  static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
 
 	tmp = RREG32(mmSPI_ARB_PRIORITY);
 	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
 	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
 	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
 	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
 	WREG32(mmSPI_ARB_PRIORITY, tmp);
 
 	mutex_unlock(&adev->grbm_idx_mutex);
 
+	for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
+		unsigned gds_size, gws_size, oa_size;
+
+		if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids) {
+			gds_size = adev->gds.mem.gfx_size_per_vmid;
+			gws_size = adev->gds.gws.gfx_size_per_vmid;
+			oa_size = adev->gds.oa.gfx_size_per_vmid;
+		} else {
+			gds_size = adev->gds.mem.kfd_size_per_vmid;
+			gws_size = adev->gds.gws.kfd_size_per_vmid;
+			oa_size = adev->gds.oa.kfd_size_per_vmid;
+		}
+
+		WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid * gds_size);
+		WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
+		WREG32(amdgpu_gds_reg_offset[vmid].gws,
+		       (vmid * gws_size) |
+		       (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
+		WREG32(amdgpu_gds_reg_offset[vmid].oa,
+		       ((1 << oa_size) - 1) << (vmid * oa_size));
+	}
 }
 
 static void gfx_v8_0_wait_for_rlc_serdes(struct amdgpu_device *adev)
 {
 	u32 i, j, k;
 	u32 mask;
 
 	mutex_lock(&adev->grbm_idx_mutex);
 	for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
 		for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
@@ -5383,68 +5381,20 @@  static uint64_t gfx_v8_0_get_gpu_clock_counter(struct amdgpu_device *adev)
 	uint64_t clock;
 
 	mutex_lock(&adev->gfx.gpu_clock_mutex);
 	WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
 	clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
 		((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
 	mutex_unlock(&adev->gfx.gpu_clock_mutex);
 	return clock;
 }
 
-static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
-					  uint32_t vmid,
-					  uint32_t gds_base, uint32_t gds_size,
-					  uint32_t gws_base, uint32_t gws_size,
-					  uint32_t oa_base, uint32_t oa_size)
-{
-	gds_base = gds_base >> AMDGPU_GDS_SHIFT;
-	gds_size = gds_size >> AMDGPU_GDS_SHIFT;
-
-	gws_base = gws_base >> AMDGPU_GWS_SHIFT;
-	gws_size = gws_size >> AMDGPU_GWS_SHIFT;
-
-	oa_base = oa_base >> AMDGPU_OA_SHIFT;
-	oa_size = oa_size >> AMDGPU_OA_SHIFT;
-
-	/* GDS Base */
-	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
-	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-				WRITE_DATA_DST_SEL(0)));
-	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_base);
-	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, gds_base);
-
-	/* GDS Size */
-	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
-	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-				WRITE_DATA_DST_SEL(0)));
-	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_size);
-	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, gds_size);
-
-	/* GWS */
-	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
-	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-				WRITE_DATA_DST_SEL(0)));
-	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].gws);
-	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, gws_size << GDS_GWS_VMID0__SIZE__SHIFT | gws_base);
-
-	/* OA */
-	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
-	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
-				WRITE_DATA_DST_SEL(0)));
-	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
-	amdgpu_ring_write(ring, 0);
-	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
-}
-
 static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
 {
 	WREG32(mmSQ_IND_INDEX,
 		(wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
 		(simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
 		(address << SQ_IND_INDEX__INDEX__SHIFT) |
 		(SQ_IND_INDEX__FORCE_READ_MASK));
 	return RREG32(mmSQ_IND_DATA);
 }
 
@@ -7132,21 +7082,20 @@  static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 		31 + /*	DE_META */
 		3 + /* CNTX_CTRL */
 		5 + /* HDP_INVL */
 		8 + 8 + /* FENCE x2 */
 		2, /* SWITCH_BUFFER */
 	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_gfx */
 	.emit_ib = gfx_v8_0_ring_emit_ib_gfx,
 	.emit_fence = gfx_v8_0_ring_emit_fence_gfx,
 	.emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
 	.emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
-	.emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
 	.emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v8_0_ring_test_ring,
 	.test_ib = gfx_v8_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_switch_buffer = gfx_v8_ring_emit_sb,
 	.emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
 	.init_cond_exec = gfx_v8_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v8_0_ring_emit_patch_cond_exec,
 	.emit_wreg = gfx_v8_0_ring_emit_wreg,
@@ -7155,51 +7104,48 @@  static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
 	.type = AMDGPU_RING_TYPE_COMPUTE,
 	.align_mask = 0xff,
 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
 	.support_64bit_ptrs = false,
 	.get_rptr = gfx_v8_0_ring_get_rptr,
 	.get_wptr = gfx_v8_0_ring_get_wptr_compute,
 	.set_wptr = gfx_v8_0_ring_set_wptr_compute,
 	.emit_frame_size =
-		20 + /* gfx_v8_0_ring_emit_gds_switch */
 		7 + /* gfx_v8_0_ring_emit_hdp_flush */
 		5 + /* hdp_invalidate */
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
 	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v8_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v8_0_ring_emit_fence_compute,
 	.emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
 	.emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
-	.emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
 	.emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v8_0_ring_test_ring,
 	.test_ib = gfx_v8_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.set_priority = gfx_v8_0_ring_set_priority_compute,
 	.emit_wreg = gfx_v8_0_ring_emit_wreg,
 };
 
 static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
 	.type = AMDGPU_RING_TYPE_KIQ,
 	.align_mask = 0xff,
 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
 	.support_64bit_ptrs = false,
 	.get_rptr = gfx_v8_0_ring_get_rptr,
 	.get_wptr = gfx_v8_0_ring_get_wptr_compute,
 	.set_wptr = gfx_v8_0_ring_set_wptr_compute,
 	.emit_frame_size =
-		20 + /* gfx_v8_0_ring_emit_gds_switch */
 		7 + /* gfx_v8_0_ring_emit_hdp_flush */
 		5 + /* hdp_invalidate */
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		17 + /* gfx_v8_0_ring_emit_vm_flush */
 		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
 	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v8_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v8_0_ring_emit_fence_kiq,
 	.test_ring = gfx_v8_0_ring_test_ring,
 	.test_ib = gfx_v8_0_ring_test_ib,
@@ -7278,39 +7224,26 @@  static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev)
 	adev->gfx.rlc.funcs = &iceland_rlc_funcs;
 }
 
 static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
 {
 	/* init asci gds info */
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
 
-	if (adev->gds.mem.total_size == 64 * 1024) {
-		adev->gds.mem.gfx_partition_size = 4096;
-		adev->gds.mem.cs_partition_size = 4096;
-
-		adev->gds.gws.gfx_partition_size = 4;
-		adev->gds.gws.cs_partition_size = 4;
-
-		adev->gds.oa.gfx_partition_size = 4;
-		adev->gds.oa.cs_partition_size = 1;
-	} else {
-		adev->gds.mem.gfx_partition_size = 1024;
-		adev->gds.mem.cs_partition_size = 1024;
-
-		adev->gds.gws.gfx_partition_size = 16;
-		adev->gds.gws.cs_partition_size = 16;
-
-		adev->gds.oa.gfx_partition_size = 4;
-		adev->gds.oa.cs_partition_size = 4;
-	}
+	adev->gds.mem.gfx_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
+	adev->gds.mem.kfd_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
+	adev->gds.gws.gfx_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
+	adev->gds.gws.kfd_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
+	adev->gds.oa.gfx_size_per_vmid = adev->gds.oa.total_size / 8; /* gfx only */
+	adev->gds.oa.kfd_size_per_vmid = 0;
 }
 
 static void gfx_v8_0_set_user_cu_inactive_bitmap(struct amdgpu_device *adev,
 						 u32 bitmap)
 {
 	u32 data;
 
 	if (!bitmap)
 		return;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 3594704a6f9b..48a7e25514f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1351,31 +1351,32 @@  static int gfx_v9_0_ngg_fini(struct amdgpu_device *adev)
 				      NULL);
 
 	memset(&adev->gfx.ngg.buf[0], 0,
 			sizeof(struct amdgpu_ngg_buf) * NGG_BUF_MAX);
 
 	adev->gfx.ngg.init = false;
 
 	return 0;
 }
 
+/* TODO: remove the NGG GDS carve-out once per-VMID static GDS partitioning fully covers it */
 static int gfx_v9_0_ngg_init(struct amdgpu_device *adev)
 {
 	int r;
 
 	if (!amdgpu_ngg || adev->gfx.ngg.init == true)
 		return 0;
 
 	/* GDS reserve memory: 64 bytes alignment */
 	adev->gfx.ngg.gds_reserve_size = ALIGN(5 * 4, 0x40);
 	adev->gds.mem.total_size -= adev->gfx.ngg.gds_reserve_size;
-	adev->gds.mem.gfx_partition_size -= adev->gfx.ngg.gds_reserve_size;
+	adev->gds.mem.gfx_size_per_vmid -= adev->gfx.ngg.gds_reserve_size;
 	adev->gfx.ngg.gds_reserve_addr = RREG32_SOC15(GC, 0, mmGDS_VMID0_BASE);
 	adev->gfx.ngg.gds_reserve_addr += RREG32_SOC15(GC, 0, mmGDS_VMID0_SIZE);
 
 	/* Primitive Buffer */
 	r = gfx_v9_0_ngg_create_buf(adev, &adev->gfx.ngg.buf[NGG_PRIM],
 				    amdgpu_prim_buf_per_se,
 				    64 * 1024);
 	if (r) {
 		dev_err(adev->dev, "Failed to create Primitive Buffer\n");
 		goto err;
@@ -1412,20 +1413,21 @@  static int gfx_v9_0_ngg_init(struct amdgpu_device *adev)
 	}
 
 out:
 	adev->gfx.ngg.init = true;
 	return 0;
 err:
 	gfx_v9_0_ngg_fini(adev);
 	return r;
 }
 
+/* TODO: remove NGG enable path along with gfx_v9_0_ngg_init when NGG support is dropped */
 static int gfx_v9_0_ngg_en(struct amdgpu_device *adev)
 {
 	struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
 	int r;
 	u32 data, base;
 
 	if (!amdgpu_ngg)
 		return 0;
 
 	/* Program buffer size */
@@ -1469,23 +1471,22 @@  static int gfx_v9_0_ngg_en(struct amdgpu_device *adev)
 	/* Clear GDS reserved memory */
 	r = amdgpu_ring_alloc(ring, 17);
 	if (r) {
 		DRM_ERROR("amdgpu: NGG failed to lock ring %d (%d).\n",
 			  ring->idx, r);
 		return r;
 	}
 
 	gfx_v9_0_write_data_to_reg(ring, 0, false,
 				   SOC15_REG_OFFSET(GC, 0, mmGDS_VMID0_SIZE),
-			           (adev->gds.mem.total_size +
-				    adev->gfx.ngg.gds_reserve_size) >>
-				   AMDGPU_GDS_SHIFT);
+			           adev->gds.mem.total_size +
+				   adev->gfx.ngg.gds_reserve_size);
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_DMA_DATA, 5));
 	amdgpu_ring_write(ring, (PACKET3_DMA_DATA_CP_SYNC |
 				PACKET3_DMA_DATA_DST_SEL(1) |
 				PACKET3_DMA_DATA_SRC_SEL(2)));
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, adev->gfx.ngg.gds_reserve_addr);
 	amdgpu_ring_write(ring, 0);
 	amdgpu_ring_write(ring, PACKET3_DMA_DATA_CMD_RAW_WAIT |
@@ -1644,62 +1645,39 @@  static int gfx_v9_0_sw_init(void *handle)
 	kiq = &adev->gfx.kiq;
 	r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
 	if (r)
 		return r;
 
 	/* create MQD for all compute queues as wel as KIQ for SRIOV case */
 	r = amdgpu_gfx_compute_mqd_sw_init(adev, sizeof(struct v9_mqd_allocation));
 	if (r)
 		return r;
 
-	/* reserve GDS, GWS and OA resource for gfx */
-	r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
-				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
-				    &adev->gds.gds_gfx_bo, NULL, NULL);
-	if (r)
-		return r;
-
-	r = amdgpu_bo_create_kernel(adev, adev->gds.gws.gfx_partition_size,
-				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
-				    &adev->gds.gws_gfx_bo, NULL, NULL);
-	if (r)
-		return r;
-
-	r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
-				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
-				    &adev->gds.oa_gfx_bo, NULL, NULL);
-	if (r)
-		return r;
-
 	adev->gfx.ce_ram_size = 0x8000;
 
 	r = gfx_v9_0_gpu_early_init(adev);
 	if (r)
 		return r;
 
 	r = gfx_v9_0_ngg_init(adev);
 	if (r)
 		return r;
 
 	return 0;
 }
 
 
 static int gfx_v9_0_sw_fini(void *handle)
 {
 	int i;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-	amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
-	amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
-	amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
-
 	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
 	for (i = 0; i < adev->gfx.num_compute_rings; i++)
 		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
 
 	amdgpu_gfx_compute_mqd_sw_fini(adev);
 	amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
 	amdgpu_gfx_kiq_fini(adev);
 
 	gfx_v9_0_mec_fini(adev);
@@ -1813,21 +1791,21 @@  static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
 		WREG32_SOC15(GC, 0, mmSH_MEM_CONFIG, sh_mem_config);
 		WREG32_SOC15(GC, 0, mmSH_MEM_BASES, sh_mem_bases);
 	}
 	soc15_grbm_select(adev, 0, 0, 0, 0);
 	mutex_unlock(&adev->srbm_mutex);
 }
 
 static void gfx_v9_0_gpu_init(struct amdgpu_device *adev)
 {
 	u32 tmp;
-	int i;
+	int i, vmid;
 
 	WREG32_FIELD15(GC, 0, GRBM_CNTL, READ_TIMEOUT, 0xff);
 
 	gfx_v9_0_tiling_mode_table_init(adev);
 
 	gfx_v9_0_setup_rb(adev);
 	gfx_v9_0_get_cu_info(adev, &adev->gfx.cu_info);
 	adev->gfx.config.db_debug2 = RREG32_SOC15(GC, 0, mmDB_DEBUG2);
 
 	/* XXX SH_MEM regs */
@@ -1869,20 +1847,43 @@  static void gfx_v9_0_gpu_init(struct amdgpu_device *adev)
 		   (adev->gfx.config.sc_prim_fifo_size_frontend <<
 			PA_SC_FIFO_SIZE__SC_FRONTEND_PRIM_FIFO_SIZE__SHIFT) |
 		   (adev->gfx.config.sc_prim_fifo_size_backend <<
 			PA_SC_FIFO_SIZE__SC_BACKEND_PRIM_FIFO_SIZE__SHIFT) |
 		   (adev->gfx.config.sc_hiz_tile_fifo_size <<
 			PA_SC_FIFO_SIZE__SC_HIZ_TILE_FIFO_SIZE__SHIFT) |
 		   (adev->gfx.config.sc_earlyz_tile_fifo_size <<
 			PA_SC_FIFO_SIZE__SC_EARLYZ_TILE_FIFO_SIZE__SHIFT));
 	mutex_unlock(&adev->grbm_idx_mutex);
 
+	for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
+		unsigned gds_size, gws_size, oa_size;
+
+		if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids) {
+			gds_size = adev->gds.mem.gfx_size_per_vmid;
+			gws_size = adev->gds.gws.gfx_size_per_vmid;
+			oa_size = adev->gds.oa.gfx_size_per_vmid;
+		} else {
+			gds_size = adev->gds.mem.kfd_size_per_vmid;
+			gws_size = adev->gds.gws.kfd_size_per_vmid;
+			oa_size = adev->gds.oa.kfd_size_per_vmid;
+		}
+
+		WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_BASE, 2 * vmid,
+				    vmid * gds_size);
+		WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_SIZE, 2 * vmid,
+				    gds_size);
+		WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, vmid,
+				    (vmid * gws_size) |
+				    (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
+		WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, vmid,
+				    ((1 << oa_size) - 1) << (vmid * oa_size));
+	}
 }
 
 static void gfx_v9_0_wait_for_rlc_serdes(struct amdgpu_device *adev)
 {
 	u32 i, j, k;
 	u32 mask;
 
 	mutex_lock(&adev->grbm_idx_mutex);
 	for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
 		for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
@@ -3407,58 +3408,20 @@  static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev)
 	uint64_t clock;
 
 	mutex_lock(&adev->gfx.gpu_clock_mutex);
 	WREG32_SOC15(GC, 0, mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
 	clock = (uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_LSB) |
 		((uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
 	mutex_unlock(&adev->gfx.gpu_clock_mutex);
 	return clock;
 }
 
-static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
-					  uint32_t vmid,
-					  uint32_t gds_base, uint32_t gds_size,
-					  uint32_t gws_base, uint32_t gws_size,
-					  uint32_t oa_base, uint32_t oa_size)
-{
-	struct amdgpu_device *adev = ring->adev;
-
-	gds_base = gds_base >> AMDGPU_GDS_SHIFT;
-	gds_size = gds_size >> AMDGPU_GDS_SHIFT;
-
-	gws_base = gws_base >> AMDGPU_GWS_SHIFT;
-	gws_size = gws_size >> AMDGPU_GWS_SHIFT;
-
-	oa_base = oa_base >> AMDGPU_OA_SHIFT;
-	oa_size = oa_size >> AMDGPU_OA_SHIFT;
-
-	/* GDS Base */
-	gfx_v9_0_write_data_to_reg(ring, 0, false,
-				   SOC15_REG_OFFSET(GC, 0, mmGDS_VMID0_BASE) + 2 * vmid,
-				   gds_base);
-
-	/* GDS Size */
-	gfx_v9_0_write_data_to_reg(ring, 0, false,
-				   SOC15_REG_OFFSET(GC, 0, mmGDS_VMID0_SIZE) + 2 * vmid,
-				   gds_size);
-
-	/* GWS */
-	gfx_v9_0_write_data_to_reg(ring, 0, false,
-				   SOC15_REG_OFFSET(GC, 0, mmGDS_GWS_VMID0) + vmid,
-				   gws_size << GDS_GWS_VMID0__SIZE__SHIFT | gws_base);
-
-	/* OA */
-	gfx_v9_0_write_data_to_reg(ring, 0, false,
-				   SOC15_REG_OFFSET(GC, 0, mmGDS_OA_VMID0) + vmid,
-				   (1 << (oa_size + oa_base)) - (1 << oa_base));
-}
-
 static int gfx_v9_0_early_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
 	adev->gfx.num_gfx_rings = GFX9_NUM_GFX_RINGS;
 	adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
 	gfx_v9_0_set_ring_funcs(adev);
 	gfx_v9_0_set_irq_funcs(adev);
 	gfx_v9_0_set_gds_init(adev);
 	gfx_v9_0_set_rlc_funcs(adev);
@@ -4695,21 +4658,20 @@  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 		31 + /*	DE_META */
 		3 + /* CNTX_CTRL */
 		5 + /* HDP_INVL */
 		8 + 8 + /* FENCE x2 */
 		2, /* SWITCH_BUFFER */
 	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_gfx */
 	.emit_ib = gfx_v9_0_ring_emit_ib_gfx,
 	.emit_fence = gfx_v9_0_ring_emit_fence,
 	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
 	.emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
-	.emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
 	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v9_0_ring_test_ring,
 	.test_ib = gfx_v9_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.emit_switch_buffer = gfx_v9_ring_emit_sb,
 	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
 	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
 	.emit_tmz = gfx_v9_0_ring_emit_tmz,
@@ -4722,34 +4684,32 @@  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.type = AMDGPU_RING_TYPE_COMPUTE,
 	.align_mask = 0xff,
 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
 	.support_64bit_ptrs = true,
 	.vmhub = AMDGPU_GFXHUB,
 	.get_rptr = gfx_v9_0_ring_get_rptr_compute,
 	.get_wptr = gfx_v9_0_ring_get_wptr_compute,
 	.set_wptr = gfx_v9_0_ring_set_wptr_compute,
 	.emit_frame_size =
-		20 + /* gfx_v9_0_ring_emit_gds_switch */
 		7 + /* gfx_v9_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
 	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v9_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v9_0_ring_emit_fence,
 	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
 	.emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
-	.emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
 	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
 	.test_ring = gfx_v9_0_ring_test_ring,
 	.test_ib = gfx_v9_0_ring_test_ib,
 	.insert_nop = amdgpu_ring_insert_nop,
 	.pad_ib = amdgpu_ring_generic_pad_ib,
 	.set_priority = gfx_v9_0_ring_set_priority_compute,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
 	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
 };
@@ -4757,21 +4717,20 @@  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
 	.type = AMDGPU_RING_TYPE_KIQ,
 	.align_mask = 0xff,
 	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
 	.support_64bit_ptrs = true,
 	.vmhub = AMDGPU_GFXHUB,
 	.get_rptr = gfx_v9_0_ring_get_rptr_compute,
 	.get_wptr = gfx_v9_0_ring_get_wptr_compute,
 	.set_wptr = gfx_v9_0_ring_set_wptr_compute,
 	.emit_frame_size =
-		20 + /* gfx_v9_0_ring_emit_gds_switch */
 		7 + /* gfx_v9_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
 		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
 	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
 	.emit_ib = gfx_v9_0_ring_emit_ib_compute,
 	.emit_fence = gfx_v9_0_ring_emit_fence_kiq,
@@ -4847,39 +4806,26 @@  static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev)
 	}
 }
 
 static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
 {
 	/* init asci gds info */
 	adev->gds.mem.total_size = RREG32_SOC15(GC, 0, mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
 
-	if (adev->gds.mem.total_size == 64 * 1024) {
-		adev->gds.mem.gfx_partition_size = 4096;
-		adev->gds.mem.cs_partition_size = 4096;
-
-		adev->gds.gws.gfx_partition_size = 4;
-		adev->gds.gws.cs_partition_size = 4;
-
-		adev->gds.oa.gfx_partition_size = 4;
-		adev->gds.oa.cs_partition_size = 1;
-	} else {
-		adev->gds.mem.gfx_partition_size = 1024;
-		adev->gds.mem.cs_partition_size = 1024;
-
-		adev->gds.gws.gfx_partition_size = 16;
-		adev->gds.gws.cs_partition_size = 16;
-
-		adev->gds.oa.gfx_partition_size = 4;
-		adev->gds.oa.cs_partition_size = 4;
-	}
+	adev->gds.mem.gfx_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
+	adev->gds.mem.kfd_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
+	adev->gds.gws.gfx_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
+	adev->gds.gws.kfd_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
+	adev->gds.oa.gfx_size_per_vmid = adev->gds.oa.total_size / 8; /* gfx only */
+	adev->gds.oa.kfd_size_per_vmid = 0;
 }
 
 static void gfx_v9_0_set_user_cu_inactive_bitmap(struct amdgpu_device *adev,
 						 u32 bitmap)
 {
 	u32 data;
 
 	if (!bitmap)
 		return;
 
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 94444eeba55b..9b9512b14cae 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -81,36 +81,27 @@  extern "C" {
  * %AMDGPU_GEM_DOMAIN_CPU	System memory that is not GPU accessible.
  * Memory in this pool could be swapped out to disk if there is pressure.
  *
  * %AMDGPU_GEM_DOMAIN_GTT	GPU accessible system memory, mapped into the
  * GPU's virtual address space via gart. Gart memory linearizes non-contiguous
  * pages of system memory, allows GPU access system memory in a linezrized
  * fashion.
  *
  * %AMDGPU_GEM_DOMAIN_VRAM	Local video memory. For APUs, it is memory
  * carved out by the BIOS.
- *
- * %AMDGPU_GEM_DOMAIN_GDS	Global on-chip data storage used to share data
- * across shader threads.
- *
- * %AMDGPU_GEM_DOMAIN_GWS	Global wave sync, used to synchronize the
- * execution of all the waves on a device.
- *
- * %AMDGPU_GEM_DOMAIN_OA	Ordered append, used by 3D or Compute engines
- * for appending data.
  */
 #define AMDGPU_GEM_DOMAIN_CPU		0x1
 #define AMDGPU_GEM_DOMAIN_GTT		0x2
 #define AMDGPU_GEM_DOMAIN_VRAM		0x4
-#define AMDGPU_GEM_DOMAIN_GDS		0x8
-#define AMDGPU_GEM_DOMAIN_GWS		0x10
-#define AMDGPU_GEM_DOMAIN_OA		0x20
+#define AMDGPU_GEM_DOMAIN_GDS		0x8 /* non-functional */
+#define AMDGPU_GEM_DOMAIN_GWS		0x10 /* non-functional */
+#define AMDGPU_GEM_DOMAIN_OA		0x20 /* non-functional */
 #define AMDGPU_GEM_DOMAIN_MASK		(AMDGPU_GEM_DOMAIN_CPU | \
 					 AMDGPU_GEM_DOMAIN_GTT | \
 					 AMDGPU_GEM_DOMAIN_VRAM | \
 					 AMDGPU_GEM_DOMAIN_GDS | \
 					 AMDGPU_GEM_DOMAIN_GWS | \
 					 AMDGPU_GEM_DOMAIN_OA)
 
 /* Flag that CPU access will be required for the case of VRAM domain */
 #define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED	(1 << 0)
 /* Flag that CPU access will not work, this VRAM domain is invisible */

Comments

As discussed internally that doesn't work because threads don't 
necessarily get the same VMID assigned.

Christian.

Am 12.09.2018 um 22:33 schrieb Marek Olšák:
> From: Marek Olšák <marek.olsak@amd.com>
>
> I've chosen to do it like this because it's easy and allows an arbitrary
> number of processes.
>
> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  10 --
>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |   3 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      |  20 ----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h     |  19 +--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c     |  24 +---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c     |   6 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h     |   7 --
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |   3 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  14 +--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  |  21 ----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |   6 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |   5 -
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     |  61 ----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |   8 --
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  34 +-----
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c       | 125 +++++---------------
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c       | 123 +++++--------------
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c       | 124 ++++++-------------
>   include/uapi/drm/amdgpu_drm.h               |  15 +--
>   19 files changed, 109 insertions(+), 519 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
> index b80243d3972e..7264a4930b88 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
> @@ -71,23 +71,20 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
>   				/ sizeof(struct amdgpu_bo_list_entry))
>   		return -EINVAL;
>   
>   	size = sizeof(struct amdgpu_bo_list);
>   	size += num_entries * sizeof(struct amdgpu_bo_list_entry);
>   	list = kvmalloc(size, GFP_KERNEL);
>   	if (!list)
>   		return -ENOMEM;
>   
>   	kref_init(&list->refcount);
> -	list->gds_obj = adev->gds.gds_gfx_bo;
> -	list->gws_obj = adev->gds.gws_gfx_bo;
> -	list->oa_obj = adev->gds.oa_gfx_bo;
>   
>   	array = amdgpu_bo_list_array_entry(list, 0);
>   	memset(array, 0, num_entries * sizeof(struct amdgpu_bo_list_entry));
>   
>   	for (i = 0; i < num_entries; ++i) {
>   		struct amdgpu_bo_list_entry *entry;
>   		struct drm_gem_object *gobj;
>   		struct amdgpu_bo *bo;
>   		struct mm_struct *usermm;
>   
> @@ -111,27 +108,20 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev, struct drm_file *filp,
>   		} else {
>   			entry = &array[last_entry++];
>   		}
>   
>   		entry->robj = bo;
>   		entry->priority = min(info[i].bo_priority,
>   				      AMDGPU_BO_LIST_MAX_PRIORITY);
>   		entry->tv.bo = &entry->robj->tbo;
>   		entry->tv.shared = !entry->robj->prime_shared_count;
>   
> -		if (entry->robj->preferred_domains == AMDGPU_GEM_DOMAIN_GDS)
> -			list->gds_obj = entry->robj;
> -		if (entry->robj->preferred_domains == AMDGPU_GEM_DOMAIN_GWS)
> -			list->gws_obj = entry->robj;
> -		if (entry->robj->preferred_domains == AMDGPU_GEM_DOMAIN_OA)
> -			list->oa_obj = entry->robj;
> -
>   		total_size += amdgpu_bo_size(entry->robj);
>   		trace_amdgpu_bo_list_set(list, entry->robj);
>   	}
>   
>   	list->first_userptr = first_userptr;
>   	list->num_entries = num_entries;
>   
>   	trace_amdgpu_cs_bo_status(list->num_entries, total_size);
>   
>   	*result = list;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
> index 61b089768e1c..30f12a60aa28 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
> @@ -36,23 +36,20 @@ struct amdgpu_bo_list_entry {
>   	struct ttm_validate_buffer	tv;
>   	struct amdgpu_bo_va		*bo_va;
>   	uint32_t			priority;
>   	struct page			**user_pages;
>   	int				user_invalidated;
>   };
>   
>   struct amdgpu_bo_list {
>   	struct rcu_head rhead;
>   	struct kref refcount;
> -	struct amdgpu_bo *gds_obj;
> -	struct amdgpu_bo *gws_obj;
> -	struct amdgpu_bo *oa_obj;
>   	unsigned first_userptr;
>   	unsigned num_entries;
>   };
>   
>   int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
>   		       struct amdgpu_bo_list **result);
>   void amdgpu_bo_list_get_list(struct amdgpu_bo_list *list,
>   			     struct list_head *validated);
>   void amdgpu_bo_list_put(struct amdgpu_bo_list *list);
>   int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 1081fd00b059..88b58facf29e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -565,23 +565,20 @@ static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p,
>   	return 0;
>   }
>   
>   static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   				union drm_amdgpu_cs *cs)
>   {
>   	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>   	struct amdgpu_vm *vm = &fpriv->vm;
>   	struct amdgpu_bo_list_entry *e;
>   	struct list_head duplicates;
> -	struct amdgpu_bo *gds;
> -	struct amdgpu_bo *gws;
> -	struct amdgpu_bo *oa;
>   	unsigned tries = 10;
>   	int r;
>   
>   	INIT_LIST_HEAD(&p->validated);
>   
>   	/* p->bo_list could already be assigned if AMDGPU_CHUNK_ID_BO_HANDLES is present */
>   	if (cs->in.bo_list_handle) {
>   		if (p->bo_list)
>   			return -EINVAL;
>   
> @@ -705,40 +702,23 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   
>   	r = amdgpu_cs_list_validate(p, &p->validated);
>   	if (r) {
>   		DRM_ERROR("amdgpu_cs_list_validate(validated) failed.\n");
>   		goto error_validate;
>   	}
>   
>   	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
>   				     p->bytes_moved_vis);
>   
> -	gds = p->bo_list->gds_obj;
> -	gws = p->bo_list->gws_obj;
> -	oa = p->bo_list->oa_obj;
> -
>   	amdgpu_bo_list_for_each_entry(e, p->bo_list)
>   		e->bo_va = amdgpu_vm_bo_find(vm, e->robj);
>   
> -	if (gds) {
> -		p->job->gds_base = amdgpu_bo_gpu_offset(gds);
> -		p->job->gds_size = amdgpu_bo_size(gds);
> -	}
> -	if (gws) {
> -		p->job->gws_base = amdgpu_bo_gpu_offset(gws);
> -		p->job->gws_size = amdgpu_bo_size(gws);
> -	}
> -	if (oa) {
> -		p->job->oa_base = amdgpu_bo_gpu_offset(oa);
> -		p->job->oa_size = amdgpu_bo_size(oa);
> -	}
> -
>   	if (!r && p->uf_entry.robj) {
>   		struct amdgpu_bo *uf = p->uf_entry.robj;
>   
>   		r = amdgpu_ttm_alloc_gart(&uf->tbo);
>   		p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
>   	}
>   
>   error_validate:
>   	if (r)
>   		ttm_eu_backoff_reservation(&p->ticket, &p->validated);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> index e73728d90388..69ba25c2e921 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> @@ -17,48 +17,33 @@
>    * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
>    * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>    * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>    * OTHER DEALINGS IN THE SOFTWARE.
>    *
>    */
>   
>   #ifndef __AMDGPU_GDS_H__
>   #define __AMDGPU_GDS_H__
>   
> -/* Because TTM request that alloacted buffer should be PAGE_SIZE aligned,
> - * we should report GDS/GWS/OA size as PAGE_SIZE aligned
> - * */
> -#define AMDGPU_GDS_SHIFT	2
> -#define AMDGPU_GWS_SHIFT	PAGE_SHIFT
> -#define AMDGPU_OA_SHIFT		PAGE_SHIFT
> -
>   struct amdgpu_ring;
>   struct amdgpu_bo;
>   
>   struct amdgpu_gds_asic_info {
>   	uint32_t	total_size;
> -	uint32_t	gfx_partition_size;
> -	uint32_t	cs_partition_size;
> +	uint32_t	gfx_size_per_vmid;
> +	uint32_t	kfd_size_per_vmid;
>   };
>   
>   struct amdgpu_gds {
>   	struct amdgpu_gds_asic_info	mem;
>   	struct amdgpu_gds_asic_info	gws;
>   	struct amdgpu_gds_asic_info	oa;
> -	/* At present, GDS, GWS and OA resources for gfx (graphics)
> -	 * is always pre-allocated and available for graphics operation.
> -	 * Such resource is shared between all gfx clients.
> -	 * TODO: move this operation to user space
> -	 * */
> -	struct amdgpu_bo*		gds_gfx_bo;
> -	struct amdgpu_bo*		gws_gfx_bo;
> -	struct amdgpu_bo*		oa_gfx_bo;
>   };
>   
>   struct amdgpu_gds_reg_offset {
>   	uint32_t	mem_base;
>   	uint32_t	mem_size;
>   	uint32_t	gws;
>   	uint32_t	oa;
>   };
>   
>   #endif /* __AMDGPU_GDS_H__ */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index d30a0838851b..c87ad4b4d0b6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -223,43 +223,25 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data,
>   	if (flags & ~(AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>   		      AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
>   		      AMDGPU_GEM_CREATE_CPU_GTT_USWC |
>   		      AMDGPU_GEM_CREATE_VRAM_CLEARED |
>   		      AMDGPU_GEM_CREATE_VM_ALWAYS_VALID |
>   		      AMDGPU_GEM_CREATE_EXPLICIT_SYNC))
>   
>   		return -EINVAL;
>   
>   	/* reject invalid gem domains */
> -	if (args->in.domains & ~AMDGPU_GEM_DOMAIN_MASK)
> +	if (args->in.domains & ~(AMDGPU_GEM_DOMAIN_CPU |
> +				 AMDGPU_GEM_DOMAIN_GTT |
> +				 AMDGPU_GEM_DOMAIN_VRAM))
>   		return -EINVAL;
>   
> -	/* create a gem object to contain this object in */
> -	if (args->in.domains & (AMDGPU_GEM_DOMAIN_GDS |
> -	    AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA)) {
> -		if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
> -			/* if gds bo is created from user space, it must be
> -			 * passed to bo list
> -			 */
> -			DRM_ERROR("GDS bo cannot be per-vm-bo\n");
> -			return -EINVAL;
> -		}
> -		flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
> -		if (args->in.domains == AMDGPU_GEM_DOMAIN_GDS)
> -			size = size << AMDGPU_GDS_SHIFT;
> -		else if (args->in.domains == AMDGPU_GEM_DOMAIN_GWS)
> -			size = size << AMDGPU_GWS_SHIFT;
> -		else if (args->in.domains == AMDGPU_GEM_DOMAIN_OA)
> -			size = size << AMDGPU_OA_SHIFT;
> -		else
> -			return -EINVAL;
> -	}
>   	size = roundup(size, PAGE_SIZE);
>   
>   	if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
>   		r = amdgpu_bo_reserve(vm->root.base.bo, false);
>   		if (r)
>   			return r;
>   
>   		resv = vm->root.base.bo->tbo.resv;
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> index 3a072a7a39f0..c2e6a1a11d7f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
> @@ -516,26 +516,20 @@ void amdgpu_vmid_free_reserved(struct amdgpu_device *adev,
>    * Reset saved GDW, GWS and OA to force switch on next flush.
>    */
>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>   		       unsigned vmid)
>   {
>   	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>   	struct amdgpu_vmid *id = &id_mgr->ids[vmid];
>   
>   	mutex_lock(&id_mgr->lock);
>   	id->owner = 0;
> -	id->gds_base = 0;
> -	id->gds_size = 0;
> -	id->gws_base = 0;
> -	id->gws_size = 0;
> -	id->oa_base = 0;
> -	id->oa_size = 0;
>   	mutex_unlock(&id_mgr->lock);
>   }
>   
>   /**
>    * amdgpu_vmid_reset_all - reset VMID to zero
>    *
>    * @adev: amdgpu device structure
>    *
>    * Reset VMID to force flush on next use
>    */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> index 7625419f0fc2..06078e665532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
> @@ -44,27 +44,20 @@ struct amdgpu_vmid {
>   	struct amdgpu_sync	active;
>   	struct dma_fence	*last_flush;
>   	uint64_t		owner;
>   
>   	uint64_t		pd_gpu_addr;
>   	/* last flushed PD/PT update */
>   	struct dma_fence	*flushed_updates;
>   
>   	uint32_t                current_gpu_reset_count;
>   
> -	uint32_t		gds_base;
> -	uint32_t		gds_size;
> -	uint32_t		gws_base;
> -	uint32_t		gws_size;
> -	uint32_t		oa_base;
> -	uint32_t		oa_size;
> -
>   	unsigned		pasid;
>   	struct dma_fence	*pasid_mapping;
>   };
>   
>   struct amdgpu_vmid_mgr {
>   	struct mutex		lock;
>   	unsigned		num_ids;
>   	struct list_head	ids_lru;
>   	struct amdgpu_vmid	ids[AMDGPU_NUM_VMID];
>   	atomic_t		reserved_vmid_num;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 57cfe78a262b..3db553f6ad01 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -42,23 +42,20 @@ struct amdgpu_job {
>   	struct amdgpu_sync	sched_sync;
>   	struct amdgpu_ib	*ibs;
>   	struct dma_fence	*fence; /* the hw fence */
>   	uint32_t		preamble_status;
>   	uint32_t		num_ibs;
>   	void			*owner;
>   	bool                    vm_needs_flush;
>   	uint64_t		vm_pd_addr;
>   	unsigned		vmid;
>   	unsigned		pasid;
> -	uint32_t		gds_base, gds_size;
> -	uint32_t		gws_base, gws_size;
> -	uint32_t		oa_base, oa_size;
>   	uint32_t		vram_lost_counter;
>   
>   	/* user fence handling */
>   	uint64_t		uf_addr;
>   	uint64_t		uf_sequence;
>   
>   };
>   
>   int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>   		     struct amdgpu_job **job, struct amdgpu_vm *vm);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 29ac3873eeb0..209954290954 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -517,27 +517,27 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
>   	case AMDGPU_INFO_VIS_VRAM_USAGE:
>   		ui64 = amdgpu_vram_mgr_vis_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
>   		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>   	case AMDGPU_INFO_GTT_USAGE:
>   		ui64 = amdgpu_gtt_mgr_usage(&adev->mman.bdev.man[TTM_PL_TT]);
>   		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>   	case AMDGPU_INFO_GDS_CONFIG: {
>   		struct drm_amdgpu_info_gds gds_info;
>   
>   		memset(&gds_info, 0, sizeof(gds_info));
> -		gds_info.gds_gfx_partition_size = adev->gds.mem.gfx_partition_size >> AMDGPU_GDS_SHIFT;
> -		gds_info.compute_partition_size = adev->gds.mem.cs_partition_size >> AMDGPU_GDS_SHIFT;
> -		gds_info.gds_total_size = adev->gds.mem.total_size >> AMDGPU_GDS_SHIFT;
> -		gds_info.gws_per_gfx_partition = adev->gds.gws.gfx_partition_size >> AMDGPU_GWS_SHIFT;
> -		gds_info.gws_per_compute_partition = adev->gds.gws.cs_partition_size >> AMDGPU_GWS_SHIFT;
> -		gds_info.oa_per_gfx_partition = adev->gds.oa.gfx_partition_size >> AMDGPU_OA_SHIFT;
> -		gds_info.oa_per_compute_partition = adev->gds.oa.cs_partition_size >> AMDGPU_OA_SHIFT;
> +		gds_info.gds_gfx_partition_size = adev->gds.mem.gfx_size_per_vmid;
> +		gds_info.compute_partition_size = adev->gds.mem.kfd_size_per_vmid;
> +		gds_info.gds_total_size = adev->gds.mem.total_size;
> +		gds_info.gws_per_gfx_partition = adev->gds.gws.gfx_size_per_vmid;
> +		gds_info.gws_per_compute_partition = adev->gds.gws.kfd_size_per_vmid;
> +		gds_info.oa_per_gfx_partition = adev->gds.oa.gfx_size_per_vmid;
> +		gds_info.oa_per_compute_partition = adev->gds.oa.kfd_size_per_vmid;
>   		return copy_to_user(out, &gds_info,
>   				    min((size_t)size, sizeof(gds_info))) ? -EFAULT : 0;
>   	}
>   	case AMDGPU_INFO_VRAM_GTT: {
>   		struct drm_amdgpu_info_vram_gtt vram_gtt;
>   
>   		vram_gtt.vram_size = adev->gmc.real_vram_size -
>   			atomic64_read(&adev->vram_pin_size);
>   		vram_gtt.vram_cpu_accessible_size = adev->gmc.visible_vram_size -
>   			atomic64_read(&adev->visible_pin_size);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index de990bdcdd6c..76770a8c29a5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -178,41 +178,20 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
>   		places[c].lpfn = 0;
>   		places[c].flags = TTM_PL_FLAG_SYSTEM;
>   		if (flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
>   			places[c].flags |= TTM_PL_FLAG_WC |
>   				TTM_PL_FLAG_UNCACHED;
>   		else
>   			places[c].flags |= TTM_PL_FLAG_CACHED;
>   		c++;
>   	}
>   
> -	if (domain & AMDGPU_GEM_DOMAIN_GDS) {
> -		places[c].fpfn = 0;
> -		places[c].lpfn = 0;
> -		places[c].flags = TTM_PL_FLAG_UNCACHED | AMDGPU_PL_FLAG_GDS;
> -		c++;
> -	}
> -
> -	if (domain & AMDGPU_GEM_DOMAIN_GWS) {
> -		places[c].fpfn = 0;
> -		places[c].lpfn = 0;
> -		places[c].flags = TTM_PL_FLAG_UNCACHED | AMDGPU_PL_FLAG_GWS;
> -		c++;
> -	}
> -
> -	if (domain & AMDGPU_GEM_DOMAIN_OA) {
> -		places[c].fpfn = 0;
> -		places[c].lpfn = 0;
> -		places[c].flags = TTM_PL_FLAG_UNCACHED | AMDGPU_PL_FLAG_OA;
> -		c++;
> -	}
> -
>   	if (!c) {
>   		places[c].fpfn = 0;
>   		places[c].lpfn = 0;
>   		places[c].flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_SYSTEM;
>   		c++;
>   	}
>   
>   	BUG_ON(c >= AMDGPU_BO_MAX_PLACEMENTS);
>   
>   	placement->num_placement = c;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> index 907fdf46d895..e089964cbcb7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
> @@ -120,26 +120,20 @@ static inline struct amdgpu_bo *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
>    */
>   static inline unsigned amdgpu_mem_type_to_domain(u32 mem_type)
>   {
>   	switch (mem_type) {
>   	case TTM_PL_VRAM:
>   		return AMDGPU_GEM_DOMAIN_VRAM;
>   	case TTM_PL_TT:
>   		return AMDGPU_GEM_DOMAIN_GTT;
>   	case TTM_PL_SYSTEM:
>   		return AMDGPU_GEM_DOMAIN_CPU;
> -	case AMDGPU_PL_GDS:
> -		return AMDGPU_GEM_DOMAIN_GDS;
> -	case AMDGPU_PL_GWS:
> -		return AMDGPU_GEM_DOMAIN_GWS;
> -	case AMDGPU_PL_OA:
> -		return AMDGPU_GEM_DOMAIN_OA;
>   	default:
>   		break;
>   	}
>   	return 0;
>   }
>   
>   /**
>    * amdgpu_bo_reserve - reserve bo
>    * @bo:		bo structure
>    * @no_intr:	don't return -ERESTARTSYS on pending signal
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 9cc239968e40..f6ea9604e611 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -130,24 +130,20 @@ struct amdgpu_ring_funcs {
>   	/* command emit functions */
>   	void (*emit_ib)(struct amdgpu_ring *ring,
>   			struct amdgpu_ib *ib,
>   			unsigned vmid, bool ctx_switch);
>   	void (*emit_fence)(struct amdgpu_ring *ring, uint64_t addr,
>   			   uint64_t seq, unsigned flags);
>   	void (*emit_pipeline_sync)(struct amdgpu_ring *ring);
>   	void (*emit_vm_flush)(struct amdgpu_ring *ring, unsigned vmid,
>   			      uint64_t pd_addr);
>   	void (*emit_hdp_flush)(struct amdgpu_ring *ring);
> -	void (*emit_gds_switch)(struct amdgpu_ring *ring, uint32_t vmid,
> -				uint32_t gds_base, uint32_t gds_size,
> -				uint32_t gws_base, uint32_t gws_size,
> -				uint32_t oa_base, uint32_t oa_size);
>   	/* testing functions */
>   	int (*test_ring)(struct amdgpu_ring *ring);
>   	int (*test_ib)(struct amdgpu_ring *ring, long timeout);
>   	/* insert NOP packets */
>   	void (*insert_nop)(struct amdgpu_ring *ring, uint32_t count);
>   	void (*insert_start)(struct amdgpu_ring *ring);
>   	void (*insert_end)(struct amdgpu_ring *ring);
>   	/* pad the indirect buffer to the necessary number of dw */
>   	void (*pad_ib)(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>   	unsigned (*init_cond_exec)(struct amdgpu_ring *ring);
> @@ -226,21 +222,20 @@ struct amdgpu_ring {
>   #define amdgpu_ring_patch_cs_in_place(r, p, ib) ((r)->funcs->patch_cs_in_place((p), (ib)))
>   #define amdgpu_ring_test_ring(r) (r)->funcs->test_ring((r))
>   #define amdgpu_ring_test_ib(r, t) (r)->funcs->test_ib((r), (t))
>   #define amdgpu_ring_get_rptr(r) (r)->funcs->get_rptr((r))
>   #define amdgpu_ring_get_wptr(r) (r)->funcs->get_wptr((r))
>   #define amdgpu_ring_set_wptr(r) (r)->funcs->set_wptr((r))
>   #define amdgpu_ring_emit_ib(r, ib, vmid, c) (r)->funcs->emit_ib((r), (ib), (vmid), (c))
>   #define amdgpu_ring_emit_pipeline_sync(r) (r)->funcs->emit_pipeline_sync((r))
>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr) (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>   #define amdgpu_ring_emit_fence(r, addr, seq, flags) (r)->funcs->emit_fence((r), (addr), (seq), (flags))
> -#define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>   #define amdgpu_ring_emit_switch_buffer(r) (r)->funcs->emit_switch_buffer((r))
>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r), (d))
>   #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>   #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>   #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m) (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
>   #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 8a158ee922f7..2cc62b0e7ea8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -195,30 +195,20 @@ static int amdgpu_init_mem_type(struct ttm_bo_device *bdev, uint32_t type,
>   		break;
>   	case TTM_PL_VRAM:
>   		/* "On-card" video ram */
>   		man->func = &amdgpu_vram_mgr_func;
>   		man->gpu_offset = adev->gmc.vram_start;
>   		man->flags = TTM_MEMTYPE_FLAG_FIXED |
>   			     TTM_MEMTYPE_FLAG_MAPPABLE;
>   		man->available_caching = TTM_PL_FLAG_UNCACHED | TTM_PL_FLAG_WC;
>   		man->default_caching = TTM_PL_FLAG_WC;
>   		break;
> -	case AMDGPU_PL_GDS:
> -	case AMDGPU_PL_GWS:
> -	case AMDGPU_PL_OA:
> -		/* On-chip GDS memory*/
> -		man->func = &ttm_bo_manager_func;
> -		man->gpu_offset = 0;
> -		man->flags = TTM_MEMTYPE_FLAG_FIXED | TTM_MEMTYPE_FLAG_CMA;
> -		man->available_caching = TTM_PL_FLAG_UNCACHED;
> -		man->default_caching = TTM_PL_FLAG_UNCACHED;
> -		break;
>   	default:
>   		DRM_ERROR("Unsupported memory type %u\n", (unsigned)type);
>   		return -EINVAL;
>   	}
>   	return 0;
>   }
>   
>   /**
>    * amdgpu_evict_flags - Compute placement flags
>    *
> @@ -1039,25 +1029,20 @@ static int amdgpu_ttm_backend_bind(struct ttm_tt *ttm,
>   		if (r) {
>   			DRM_ERROR("failed to pin userptr\n");
>   			return r;
>   		}
>   	}
>   	if (!ttm->num_pages) {
>   		WARN(1, "nothing to bind %lu pages for mreg %p back %p!\n",
>   		     ttm->num_pages, bo_mem, ttm);
>   	}
>   
> -	if (bo_mem->mem_type == AMDGPU_PL_GDS ||
> -	    bo_mem->mem_type == AMDGPU_PL_GWS ||
> -	    bo_mem->mem_type == AMDGPU_PL_OA)
> -		return -EINVAL;
> -
>   	if (!amdgpu_gtt_mgr_has_gart_addr(bo_mem)) {
>   		gtt->offset = AMDGPU_BO_INVALID_OFFSET;
>   		return 0;
>   	}
>   
>   	/* compute PTE flags relevant to this BO memory */
>   	flags = amdgpu_ttm_tt_pte_flags(adev, ttm, bo_mem);
>   
>   	/* bind pages into GART page tables */
>   	gtt->offset = ((u64)bo_mem->start << PAGE_SHIFT) - adev->gmc.gart_start;
> @@ -1818,60 +1803,20 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>   
>   	/* Initialize GTT memory pool */
>   	r = ttm_bo_init_mm(&adev->mman.bdev, TTM_PL_TT, gtt_size >> PAGE_SHIFT);
>   	if (r) {
>   		DRM_ERROR("Failed initializing GTT heap.\n");
>   		return r;
>   	}
>   	DRM_INFO("amdgpu: %uM of GTT memory ready.\n",
>   		 (unsigned)(gtt_size / (1024 * 1024)));
>   
> -	/* Initialize various on-chip memory pools */
> -	adev->gds.mem.total_size = adev->gds.mem.total_size << AMDGPU_GDS_SHIFT;
> -	adev->gds.mem.gfx_partition_size = adev->gds.mem.gfx_partition_size << AMDGPU_GDS_SHIFT;
> -	adev->gds.mem.cs_partition_size = adev->gds.mem.cs_partition_size << AMDGPU_GDS_SHIFT;
> -	adev->gds.gws.total_size = adev->gds.gws.total_size << AMDGPU_GWS_SHIFT;
> -	adev->gds.gws.gfx_partition_size = adev->gds.gws.gfx_partition_size << AMDGPU_GWS_SHIFT;
> -	adev->gds.gws.cs_partition_size = adev->gds.gws.cs_partition_size << AMDGPU_GWS_SHIFT;
> -	adev->gds.oa.total_size = adev->gds.oa.total_size << AMDGPU_OA_SHIFT;
> -	adev->gds.oa.gfx_partition_size = adev->gds.oa.gfx_partition_size << AMDGPU_OA_SHIFT;
> -	adev->gds.oa.cs_partition_size = adev->gds.oa.cs_partition_size << AMDGPU_OA_SHIFT;
> -	/* GDS Memory */
> -	if (adev->gds.mem.total_size) {
> -		r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GDS,
> -				   adev->gds.mem.total_size >> PAGE_SHIFT);
> -		if (r) {
> -			DRM_ERROR("Failed initializing GDS heap.\n");
> -			return r;
> -		}
> -	}
> -
> -	/* GWS */
> -	if (adev->gds.gws.total_size) {
> -		r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GWS,
> -				   adev->gds.gws.total_size >> PAGE_SHIFT);
> -		if (r) {
> -			DRM_ERROR("Failed initializing gws heap.\n");
> -			return r;
> -		}
> -	}
> -
> -	/* OA */
> -	if (adev->gds.oa.total_size) {
> -		r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_OA,
> -				   adev->gds.oa.total_size >> PAGE_SHIFT);
> -		if (r) {
> -			DRM_ERROR("Failed initializing oa heap.\n");
> -			return r;
> -		}
> -	}
> -
>   	/* Register debugfs entries for amdgpu_ttm */
>   	r = amdgpu_ttm_debugfs_init(adev);
>   	if (r) {
>   		DRM_ERROR("Failed to init debugfs\n");
>   		return r;
>   	}
>   	return 0;
>   }
>   
>   /**
> @@ -1892,26 +1837,20 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
>   		return;
>   
>   	amdgpu_ttm_debugfs_fini(adev);
>   	amdgpu_ttm_fw_reserve_vram_fini(adev);
>   	if (adev->mman.aper_base_kaddr)
>   		iounmap(adev->mman.aper_base_kaddr);
>   	adev->mman.aper_base_kaddr = NULL;
>   
>   	ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_VRAM);
>   	ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_TT);
> -	if (adev->gds.mem.total_size)
> -		ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GDS);
> -	if (adev->gds.gws.total_size)
> -		ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GWS);
> -	if (adev->gds.oa.total_size)
> -		ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_OA);
>   	ttm_bo_device_release(&adev->mman.bdev);
>   	amdgpu_ttm_global_fini(adev);
>   	adev->mman.initialized = false;
>   	DRM_INFO("amdgpu: ttm finalized\n");
>   }
>   
>   /**
>    * amdgpu_ttm_set_buffer_funcs_status - enable/disable use of buffer functions
>    *
>    * @adev: amdgpu_device pointer
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index fe8f276e9811..04557a382b19 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -20,28 +20,20 @@
>    * OTHER DEALINGS IN THE SOFTWARE.
>    *
>    */
>   
>   #ifndef __AMDGPU_TTM_H__
>   #define __AMDGPU_TTM_H__
>   
>   #include "amdgpu.h"
>   #include <drm/gpu_scheduler.h>
>   
> -#define AMDGPU_PL_GDS		(TTM_PL_PRIV + 0)
> -#define AMDGPU_PL_GWS		(TTM_PL_PRIV + 1)
> -#define AMDGPU_PL_OA		(TTM_PL_PRIV + 2)
> -
> -#define AMDGPU_PL_FLAG_GDS		(TTM_PL_FLAG_PRIV << 0)
> -#define AMDGPU_PL_FLAG_GWS		(TTM_PL_FLAG_PRIV << 1)
> -#define AMDGPU_PL_FLAG_OA		(TTM_PL_FLAG_PRIV << 2)
> -
>   #define AMDGPU_GTT_MAX_TRANSFER_SIZE	512
>   #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS	2
>   
>   struct amdgpu_mman {
>   	struct ttm_bo_global_ref        bo_global_ref;
>   	struct drm_global_reference	mem_global_ref;
>   	struct ttm_bo_device		bdev;
>   	bool				mem_global_referenced;
>   	bool				initialized;
>   	void __iomem			*aper_base_kaddr;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index be1659fedf94..c66f1c6f0ba8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -803,86 +803,69 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
>    * Returns:
>    * True if sync is needed.
>    */
>   bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>   				  struct amdgpu_job *job)
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	unsigned vmhub = ring->funcs->vmhub;
>   	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>   	struct amdgpu_vmid *id;
> -	bool gds_switch_needed;
>   	bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug;
>   
>   	if (job->vmid == 0)
>   		return false;
>   	id = &id_mgr->ids[job->vmid];
> -	gds_switch_needed = ring->funcs->emit_gds_switch && (
> -		id->gds_base != job->gds_base ||
> -		id->gds_size != job->gds_size ||
> -		id->gws_base != job->gws_base ||
> -		id->gws_size != job->gws_size ||
> -		id->oa_base != job->oa_base ||
> -		id->oa_size != job->oa_size);
>   
>   	if (amdgpu_vmid_had_gpu_reset(adev, id))
>   		return true;
>   
> -	return vm_flush_needed || gds_switch_needed;
> +	return vm_flush_needed;
>   }
>   
>   /**
>    * amdgpu_vm_flush - hardware flush the vm
>    *
>    * @ring: ring to use for flush
>    * @job:  related job
>    * @need_pipe_sync: is pipe sync needed
>    *
>    * Emit a VM flush when it is necessary.
>    *
>    * Returns:
>    * 0 on success, errno otherwise.
>    */
>   int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync)
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	unsigned vmhub = ring->funcs->vmhub;
>   	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>   	struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
> -	bool gds_switch_needed = ring->funcs->emit_gds_switch && (
> -		id->gds_base != job->gds_base ||
> -		id->gds_size != job->gds_size ||
> -		id->gws_base != job->gws_base ||
> -		id->gws_size != job->gws_size ||
> -		id->oa_base != job->oa_base ||
> -		id->oa_size != job->oa_size);
>   	bool vm_flush_needed = job->vm_needs_flush;
>   	bool pasid_mapping_needed = id->pasid != job->pasid ||
>   		!id->pasid_mapping ||
>   		!dma_fence_is_signaled(id->pasid_mapping);
>   	struct dma_fence *fence = NULL;
>   	unsigned patch_offset = 0;
>   	int r;
>   
>   	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
> -		gds_switch_needed = true;
>   		vm_flush_needed = true;
>   		pasid_mapping_needed = true;
>   	}
>   
> -	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>   	vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
>   			job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>   	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
>   		ring->funcs->emit_wreg;
>   
> -	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
> +	if (!vm_flush_needed && !need_pipe_sync)
>   		return 0;
>   
>   	if (ring->funcs->init_cond_exec)
>   		patch_offset = amdgpu_ring_init_cond_exec(ring);
>   
>   	if (need_pipe_sync)
>   		amdgpu_ring_emit_pipeline_sync(ring);
>   
>   	if (vm_flush_needed) {
>   		trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
> @@ -907,33 +890,20 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
>   		mutex_unlock(&id_mgr->lock);
>   	}
>   
>   	if (pasid_mapping_needed) {
>   		id->pasid = job->pasid;
>   		dma_fence_put(id->pasid_mapping);
>   		id->pasid_mapping = dma_fence_get(fence);
>   	}
>   	dma_fence_put(fence);
>   
> -	if (ring->funcs->emit_gds_switch && gds_switch_needed) {
> -		id->gds_base = job->gds_base;
> -		id->gds_size = job->gds_size;
> -		id->gws_base = job->gws_base;
> -		id->gws_size = job->gws_size;
> -		id->oa_base = job->oa_base;
> -		id->oa_size = job->oa_size;
> -		amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
> -					    job->gds_size, job->gws_base,
> -					    job->gws_size, job->oa_base,
> -					    job->oa_size);
> -	}
> -
>   	if (ring->funcs->patch_cond_exec)
>   		amdgpu_ring_patch_cond_exec(ring, patch_offset);
>   
>   	/* the double SWITCH_BUFFER here *cannot* be skipped by COND_EXEC */
>   	if (ring->funcs->emit_switch_buffer) {
>   		amdgpu_ring_emit_switch_buffer(ring);
>   		amdgpu_ring_emit_switch_buffer(ring);
>   	}
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index a15d9c0f233b..f5228e169c3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -1890,21 +1890,21 @@ static void gfx_v7_0_config_init(struct amdgpu_device *adev)
>    *
>    * @adev: amdgpu_device pointer
>    *
>    * Configures the 3D engine and tiling configuration
>    * registers so that the 3D engine is usable.
>    */
>   static void gfx_v7_0_gpu_init(struct amdgpu_device *adev)
>   {
>   	u32 sh_mem_cfg, sh_static_mem_cfg, sh_mem_base;
>   	u32 tmp;
> -	int i;
> +	int i, vmid;
>   
>   	WREG32(mmGRBM_CNTL, (0xff << GRBM_CNTL__READ_TIMEOUT__SHIFT));
>   
>   	WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>   	WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>   	WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
>   
>   	gfx_v7_0_tiling_mode_table_init(adev);
>   
>   	gfx_v7_0_setup_rb(adev);
> @@ -2014,20 +2014,42 @@ static void gfx_v7_0_gpu_init(struct amdgpu_device *adev)
>   
>   	tmp = RREG32(mmSPI_ARB_PRIORITY);
>   	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
>   	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
>   	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
>   	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
>   	WREG32(mmSPI_ARB_PRIORITY, tmp);
>   
>   	mutex_unlock(&adev->grbm_idx_mutex);
>   
> +	for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
> +		unsigned gds_size, gws_size, oa_size;
> +
> +		if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids) {
> +			gds_size = adev->gds.mem.gfx_size_per_vmid;
> +			gws_size = adev->gds.gws.gfx_size_per_vmid;
> +			oa_size = adev->gds.oa.gfx_size_per_vmid;
> +		} else {
> +			gds_size = adev->gds.mem.kfd_size_per_vmid;
> +			gws_size = adev->gds.gws.kfd_size_per_vmid;
> +			oa_size = adev->gds.oa.kfd_size_per_vmid;
> +		}
> +
> +		WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid * gds_size);
> +		WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
> +		WREG32(amdgpu_gds_reg_offset[vmid].gws,
> +		       (vmid * gws_size) |
> +		       (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
> +		WREG32(amdgpu_gds_reg_offset[vmid].oa,
> +		       ((1 << oa_size) - 1) << (vmid * oa_size));
> +	}
> +
>   	udelay(50);
>   }
>   
>   /*
>    * GPU scratch registers helpers function.
>    */
>   /**
>    * gfx_v7_0_scratch_init - setup driver info for CP scratch regs
>    *
>    * @adev: amdgpu_device pointer
> @@ -4157,68 +4179,20 @@ static uint64_t gfx_v7_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>   	uint64_t clock;
>   
>   	mutex_lock(&adev->gfx.gpu_clock_mutex);
>   	WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
>   	clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
>   		((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
>   	mutex_unlock(&adev->gfx.gpu_clock_mutex);
>   	return clock;
>   }
>   
> -static void gfx_v7_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
> -					  uint32_t vmid,
> -					  uint32_t gds_base, uint32_t gds_size,
> -					  uint32_t gws_base, uint32_t gws_size,
> -					  uint32_t oa_base, uint32_t oa_size)
> -{
> -	gds_base = gds_base >> AMDGPU_GDS_SHIFT;
> -	gds_size = gds_size >> AMDGPU_GDS_SHIFT;
> -
> -	gws_base = gws_base >> AMDGPU_GWS_SHIFT;
> -	gws_size = gws_size >> AMDGPU_GWS_SHIFT;
> -
> -	oa_base = oa_base >> AMDGPU_OA_SHIFT;
> -	oa_size = oa_size >> AMDGPU_OA_SHIFT;
> -
> -	/* GDS Base */
> -	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
> -	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -				WRITE_DATA_DST_SEL(0)));
> -	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_base);
> -	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, gds_base);
> -
> -	/* GDS Size */
> -	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
> -	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -				WRITE_DATA_DST_SEL(0)));
> -	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_size);
> -	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, gds_size);
> -
> -	/* GWS */
> -	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
> -	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -				WRITE_DATA_DST_SEL(0)));
> -	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].gws);
> -	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, gws_size << GDS_GWS_VMID0__SIZE__SHIFT | gws_base);
> -
> -	/* OA */
> -	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
> -	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -				WRITE_DATA_DST_SEL(0)));
> -	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
> -	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
> -}
> -
>   static void gfx_v7_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
>   {
>   	struct amdgpu_device *adev = ring->adev;
>   	uint32_t value = 0;
>   
>   	value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
>   	value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
>   	value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
>   	value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
>   	WREG32(mmSQ_CMD, value);
> @@ -4584,55 +4558,32 @@ static int gfx_v7_0_sw_init(void *handle)
>   								ring_id,
>   								i, k, j);
>   				if (r)
>   					return r;
>   
>   				ring_id++;
>   			}
>   		}
>   	}
>   
> -	/* reserve GDS, GWS and OA resource for gfx */
> -	r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
> -				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
> -				    &adev->gds.gds_gfx_bo, NULL, NULL);
> -	if (r)
> -		return r;
> -
> -	r = amdgpu_bo_create_kernel(adev, adev->gds.gws.gfx_partition_size,
> -				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
> -				    &adev->gds.gws_gfx_bo, NULL, NULL);
> -	if (r)
> -		return r;
> -
> -	r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
> -				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
> -				    &adev->gds.oa_gfx_bo, NULL, NULL);
> -	if (r)
> -		return r;
> -
>   	adev->gfx.ce_ram_size = 0x8000;
>   
>   	gfx_v7_0_gpu_early_init(adev);
>   
>   	return r;
>   }
>   
>   static int gfx_v7_0_sw_fini(void *handle)
>   {
>   	int i;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> -	amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
> -	amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
> -	amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
> -
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>   	for (i = 0; i < adev->gfx.num_compute_rings; i++)
>   		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>   
>   	gfx_v7_0_cp_compute_fini(adev);
>   	gfx_v7_0_rlc_fini(adev);
>   	gfx_v7_0_mec_fini(adev);
>   	amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
>   				&adev->gfx.rlc.clear_state_gpu_addr,
> @@ -5073,64 +5024,60 @@ static const struct amd_ip_funcs gfx_v7_0_ip_funcs = {
>   
>   static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>   	.type = AMDGPU_RING_TYPE_GFX,
>   	.align_mask = 0xff,
>   	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
>   	.support_64bit_ptrs = false,
>   	.get_rptr = gfx_v7_0_ring_get_rptr,
>   	.get_wptr = gfx_v7_0_ring_get_wptr_gfx,
>   	.set_wptr = gfx_v7_0_ring_set_wptr_gfx,
>   	.emit_frame_size =
> -		20 + /* gfx_v7_0_ring_emit_gds_switch */
>   		7 + /* gfx_v7_0_ring_emit_hdp_flush */
>   		5 + /* hdp invalidate */
>   		12 + 12 + 12 + /* gfx_v7_0_ring_emit_fence_gfx x3 for user fence, vm fence */
>   		7 + 4 + /* gfx_v7_0_ring_emit_pipeline_sync */
>   		CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + 6 + /* gfx_v7_0_ring_emit_vm_flush */
>   		3 + 4, /* gfx_v7_ring_emit_cntxcntl including vgt flush*/
>   	.emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_gfx */
>   	.emit_ib = gfx_v7_0_ring_emit_ib_gfx,
>   	.emit_fence = gfx_v7_0_ring_emit_fence_gfx,
>   	.emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>   	.emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
> -	.emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>   	.emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>   	.test_ring = gfx_v7_0_ring_test_ring,
>   	.test_ib = gfx_v7_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>   	.emit_wreg = gfx_v7_0_ring_emit_wreg,
>   	.soft_recovery = gfx_v7_0_ring_soft_recovery,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
>   	.type = AMDGPU_RING_TYPE_COMPUTE,
>   	.align_mask = 0xff,
>   	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
>   	.support_64bit_ptrs = false,
>   	.get_rptr = gfx_v7_0_ring_get_rptr,
>   	.get_wptr = gfx_v7_0_ring_get_wptr_compute,
>   	.set_wptr = gfx_v7_0_ring_set_wptr_compute,
>   	.emit_frame_size =
> -		20 + /* gfx_v7_0_ring_emit_gds_switch */
>   		7 + /* gfx_v7_0_ring_emit_hdp_flush */
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v7_0_ring_emit_pipeline_sync */
>   		CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v7_0_ring_emit_vm_flush */
>   		7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user fence, vm fence */
>   	.emit_ib_size =	4, /* gfx_v7_0_ring_emit_ib_compute */
>   	.emit_ib = gfx_v7_0_ring_emit_ib_compute,
>   	.emit_fence = gfx_v7_0_ring_emit_fence_compute,
>   	.emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>   	.emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
> -	.emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>   	.emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>   	.test_ring = gfx_v7_0_ring_test_ring,
>   	.test_ib = gfx_v7_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_wreg = gfx_v7_0_ring_emit_wreg,
>   };
>   
>   static void gfx_v7_0_set_ring_funcs(struct amdgpu_device *adev)
>   {
> @@ -5169,42 +5116,28 @@ static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev)
>   	adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
>   }
>   
>   static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
>   {
>   	/* init asci gds info */
>   	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>   	adev->gds.gws.total_size = 64;
>   	adev->gds.oa.total_size = 16;
>   
> -	if (adev->gds.mem.total_size == 64 * 1024) {
> -		adev->gds.mem.gfx_partition_size = 4096;
> -		adev->gds.mem.cs_partition_size = 4096;
> -
> -		adev->gds.gws.gfx_partition_size = 4;
> -		adev->gds.gws.cs_partition_size = 4;
> -
> -		adev->gds.oa.gfx_partition_size = 4;
> -		adev->gds.oa.cs_partition_size = 1;
> -	} else {
> -		adev->gds.mem.gfx_partition_size = 1024;
> -		adev->gds.mem.cs_partition_size = 1024;
> -
> -		adev->gds.gws.gfx_partition_size = 16;
> -		adev->gds.gws.cs_partition_size = 16;
> -
> -		adev->gds.oa.gfx_partition_size = 4;
> -		adev->gds.oa.cs_partition_size = 4;
> -	}
> +	adev->gds.mem.gfx_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.mem.kfd_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.gws.gfx_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.gws.kfd_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.oa.gfx_size_per_vmid = adev->gds.oa.total_size / 8; /* gfx only */
> +	adev->gds.oa.kfd_size_per_vmid = 0;
>   }
>   
> -
>   static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev)
>   {
>   	int i, j, k, counter, active_cu_number = 0;
>   	u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0;
>   	struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
>   	unsigned disable_masks[4 * 2];
>   	u32 ao_cu_num;
>   
>   	if (adev->flags & AMD_IS_APU)
>   		ao_cu_num = 2;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 3882689b2d8f..b11a54bd0668 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -2154,57 +2154,34 @@ static int gfx_v8_0_sw_init(void *handle)
>   	kiq = &adev->gfx.kiq;
>   	r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
>   	if (r)
>   		return r;
>   
>   	/* create MQD for all compute queues as well as KIQ for SRIOV case */
>   	r = amdgpu_gfx_compute_mqd_sw_init(adev, sizeof(struct vi_mqd_allocation));
>   	if (r)
>   		return r;
>   
> -	/* reserve GDS, GWS and OA resource for gfx */
> -	r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
> -				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
> -				    &adev->gds.gds_gfx_bo, NULL, NULL);
> -	if (r)
> -		return r;
> -
> -	r = amdgpu_bo_create_kernel(adev, adev->gds.gws.gfx_partition_size,
> -				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
> -				    &adev->gds.gws_gfx_bo, NULL, NULL);
> -	if (r)
> -		return r;
> -
> -	r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
> -				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
> -				    &adev->gds.oa_gfx_bo, NULL, NULL);
> -	if (r)
> -		return r;
> -
>   	adev->gfx.ce_ram_size = 0x8000;
>   
>   	r = gfx_v8_0_gpu_early_init(adev);
>   	if (r)
>   		return r;
>   
>   	return 0;
>   }
>   
>   static int gfx_v8_0_sw_fini(void *handle)
>   {
>   	int i;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> -	amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
> -	amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
> -	amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
> -
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>   	for (i = 0; i < adev->gfx.num_compute_rings; i++)
>   		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>   
>   	amdgpu_gfx_compute_mqd_sw_fini(adev);
>   	amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
>   	amdgpu_gfx_kiq_fini(adev);
>   
>   	gfx_v8_0_mec_fini(adev);
> @@ -3850,21 +3827,21 @@ static void gfx_v8_0_config_init(struct amdgpu_device *adev)
>   	case CHIP_CARRIZO:
>   	case CHIP_STONEY:
>   		adev->gfx.config.double_offchip_lds_buf = 0;
>   		break;
>   	}
>   }
>   
>   static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
>   {
>   	u32 tmp, sh_static_mem_cfg;
> -	int i;
> +	int i, vmid;
>   
>   	WREG32_FIELD(GRBM_CNTL, READ_TIMEOUT, 0xFF);
>   	WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>   	WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>   	WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
>   
>   	gfx_v8_0_tiling_mode_table_init(adev);
>   	gfx_v8_0_setup_rb(adev);
>   	gfx_v8_0_get_cu_info(adev);
>   	gfx_v8_0_config_init(adev);
> @@ -3927,20 +3904,41 @@ static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
>   
>   	tmp = RREG32(mmSPI_ARB_PRIORITY);
>   	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
>   	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
>   	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
>   	tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
>   	WREG32(mmSPI_ARB_PRIORITY, tmp);
>   
>   	mutex_unlock(&adev->grbm_idx_mutex);
>   
> +	for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
> +		unsigned gds_size, gws_size, oa_size;
> +
> +		if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids) {
> +			gds_size = adev->gds.mem.gfx_size_per_vmid;
> +			gws_size = adev->gds.gws.gfx_size_per_vmid;
> +			oa_size = adev->gds.oa.gfx_size_per_vmid;
> +		} else {
> +			gds_size = adev->gds.mem.kfd_size_per_vmid;
> +			gws_size = adev->gds.gws.kfd_size_per_vmid;
> +			oa_size = adev->gds.oa.kfd_size_per_vmid;
> +		}
> +
> +		WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid * gds_size);
> +		WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
> +		WREG32(amdgpu_gds_reg_offset[vmid].gws,
> +		       (vmid * gws_size) |
> +		       (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
> +		WREG32(amdgpu_gds_reg_offset[vmid].oa,
> +		       ((1 << oa_size) - 1) << (vmid * oa_size));
> +	}
>   }
>   
>   static void gfx_v8_0_wait_for_rlc_serdes(struct amdgpu_device *adev)
>   {
>   	u32 i, j, k;
>   	u32 mask;
>   
>   	mutex_lock(&adev->grbm_idx_mutex);
>   	for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
>   		for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
> @@ -5383,68 +5381,20 @@ static uint64_t gfx_v8_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>   	uint64_t clock;
>   
>   	mutex_lock(&adev->gfx.gpu_clock_mutex);
>   	WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
>   	clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
>   		((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
>   	mutex_unlock(&adev->gfx.gpu_clock_mutex);
>   	return clock;
>   }
>   
> -static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
> -					  uint32_t vmid,
> -					  uint32_t gds_base, uint32_t gds_size,
> -					  uint32_t gws_base, uint32_t gws_size,
> -					  uint32_t oa_base, uint32_t oa_size)
> -{
> -	gds_base = gds_base >> AMDGPU_GDS_SHIFT;
> -	gds_size = gds_size >> AMDGPU_GDS_SHIFT;
> -
> -	gws_base = gws_base >> AMDGPU_GWS_SHIFT;
> -	gws_size = gws_size >> AMDGPU_GWS_SHIFT;
> -
> -	oa_base = oa_base >> AMDGPU_OA_SHIFT;
> -	oa_size = oa_size >> AMDGPU_OA_SHIFT;
> -
> -	/* GDS Base */
> -	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
> -	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -				WRITE_DATA_DST_SEL(0)));
> -	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_base);
> -	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, gds_base);
> -
> -	/* GDS Size */
> -	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
> -	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -				WRITE_DATA_DST_SEL(0)));
> -	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_size);
> -	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, gds_size);
> -
> -	/* GWS */
> -	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
> -	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -				WRITE_DATA_DST_SEL(0)));
> -	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].gws);
> -	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, gws_size << GDS_GWS_VMID0__SIZE__SHIFT | gws_base);
> -
> -	/* OA */
> -	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
> -	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
> -				WRITE_DATA_DST_SEL(0)));
> -	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
> -	amdgpu_ring_write(ring, 0);
> -	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
> -}
> -
>   static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
>   {
>   	WREG32(mmSQ_IND_INDEX,
>   		(wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
>   		(simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
>   		(address << SQ_IND_INDEX__INDEX__SHIFT) |
>   		(SQ_IND_INDEX__FORCE_READ_MASK));
>   	return RREG32(mmSQ_IND_DATA);
>   }
>   
> @@ -7132,21 +7082,20 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>   		31 + /*	DE_META */
>   		3 + /* CNTX_CTRL */
>   		5 + /* HDP_INVL */
>   		8 + 8 + /* FENCE x2 */
>   		2, /* SWITCH_BUFFER */
>   	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_gfx */
>   	.emit_ib = gfx_v8_0_ring_emit_ib_gfx,
>   	.emit_fence = gfx_v8_0_ring_emit_fence_gfx,
>   	.emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
>   	.emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
> -	.emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
>   	.emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
>   	.test_ring = gfx_v8_0_ring_test_ring,
>   	.test_ib = gfx_v8_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_switch_buffer = gfx_v8_ring_emit_sb,
>   	.emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>   	.init_cond_exec = gfx_v8_0_ring_emit_init_cond_exec,
>   	.patch_cond_exec = gfx_v8_0_ring_emit_patch_cond_exec,
>   	.emit_wreg = gfx_v8_0_ring_emit_wreg,
> @@ -7155,51 +7104,48 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>   
>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
>   	.type = AMDGPU_RING_TYPE_COMPUTE,
>   	.align_mask = 0xff,
>   	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
>   	.support_64bit_ptrs = false,
>   	.get_rptr = gfx_v8_0_ring_get_rptr,
>   	.get_wptr = gfx_v8_0_ring_get_wptr_compute,
>   	.set_wptr = gfx_v8_0_ring_set_wptr_compute,
>   	.emit_frame_size =
> -		20 + /* gfx_v8_0_ring_emit_gds_switch */
>   		7 + /* gfx_v8_0_ring_emit_hdp_flush */
>   		5 + /* hdp_invalidate */
>   		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
>   		VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /* gfx_v8_0_ring_emit_vm_flush */
>   		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */
>   	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
>   	.emit_ib = gfx_v8_0_ring_emit_ib_compute,
>   	.emit_fence = gfx_v8_0_ring_emit_fence_compute,
>   	.emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
>   	.emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
> -	.emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
>   	.emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
>   	.test_ring = gfx_v8_0_ring_test_ring,
>   	.test_ib = gfx_v8_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.set_priority = gfx_v8_0_ring_set_priority_compute,
>   	.emit_wreg = gfx_v8_0_ring_emit_wreg,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = {
>   	.type = AMDGPU_RING_TYPE_KIQ,
>   	.align_mask = 0xff,
>   	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
>   	.support_64bit_ptrs = false,
>   	.get_rptr = gfx_v8_0_ring_get_rptr,
>   	.get_wptr = gfx_v8_0_ring_get_wptr_compute,
>   	.set_wptr = gfx_v8_0_ring_set_wptr_compute,
>   	.emit_frame_size =
> -		20 + /* gfx_v8_0_ring_emit_gds_switch */
>   		7 + /* gfx_v8_0_ring_emit_hdp_flush */
>   		5 + /* hdp_invalidate */
>   		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
>   		17 + /* gfx_v8_0_ring_emit_vm_flush */
>   		7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */
>   	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_compute */
>   	.emit_ib = gfx_v8_0_ring_emit_ib_compute,
>   	.emit_fence = gfx_v8_0_ring_emit_fence_kiq,
>   	.test_ring = gfx_v8_0_ring_test_ring,
>   	.test_ib = gfx_v8_0_ring_test_ib,
> @@ -7278,39 +7224,26 @@ static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev)
>   	adev->gfx.rlc.funcs = &iceland_rlc_funcs;
>   }
>   
>   static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
>   {
>   	/* init asci gds info */
>   	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>   	adev->gds.gws.total_size = 64;
>   	adev->gds.oa.total_size = 16;
>   
> -	if (adev->gds.mem.total_size == 64 * 1024) {
> -		adev->gds.mem.gfx_partition_size = 4096;
> -		adev->gds.mem.cs_partition_size = 4096;
> -
> -		adev->gds.gws.gfx_partition_size = 4;
> -		adev->gds.gws.cs_partition_size = 4;
> -
> -		adev->gds.oa.gfx_partition_size = 4;
> -		adev->gds.oa.cs_partition_size = 1;
> -	} else {
> -		adev->gds.mem.gfx_partition_size = 1024;
> -		adev->gds.mem.cs_partition_size = 1024;
> -
> -		adev->gds.gws.gfx_partition_size = 16;
> -		adev->gds.gws.cs_partition_size = 16;
> -
> -		adev->gds.oa.gfx_partition_size = 4;
> -		adev->gds.oa.cs_partition_size = 4;
> -	}
> +	adev->gds.mem.gfx_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.mem.kfd_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.gws.gfx_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.gws.kfd_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.oa.gfx_size_per_vmid = adev->gds.oa.total_size / 8; /* gfx only */
> +	adev->gds.oa.kfd_size_per_vmid = 0;
>   }
>   
>   static void gfx_v8_0_set_user_cu_inactive_bitmap(struct amdgpu_device *adev,
>   						 u32 bitmap)
>   {
>   	u32 data;
>   
>   	if (!bitmap)
>   		return;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 3594704a6f9b..48a7e25514f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -1351,31 +1351,32 @@ static int gfx_v9_0_ngg_fini(struct amdgpu_device *adev)
>   				      NULL);
>   
>   	memset(&adev->gfx.ngg.buf[0], 0,
>   			sizeof(struct amdgpu_ngg_buf) * NGG_BUF_MAX);
>   
>   	adev->gfx.ngg.init = false;
>   
>   	return 0;
>   }
>   
> +/* TODO: remove */
>   static int gfx_v9_0_ngg_init(struct amdgpu_device *adev)
>   {
>   	int r;
>   
>   	if (!amdgpu_ngg || adev->gfx.ngg.init == true)
>   		return 0;
>   
>   	/* GDS reserve memory: 64 bytes alignment */
>   	adev->gfx.ngg.gds_reserve_size = ALIGN(5 * 4, 0x40);
>   	adev->gds.mem.total_size -= adev->gfx.ngg.gds_reserve_size;
> -	adev->gds.mem.gfx_partition_size -= adev->gfx.ngg.gds_reserve_size;
> +	adev->gds.mem.gfx_size_per_vmid -= adev->gfx.ngg.gds_reserve_size;
>   	adev->gfx.ngg.gds_reserve_addr = RREG32_SOC15(GC, 0, mmGDS_VMID0_BASE);
>   	adev->gfx.ngg.gds_reserve_addr += RREG32_SOC15(GC, 0, mmGDS_VMID0_SIZE);
>   
>   	/* Primitive Buffer */
>   	r = gfx_v9_0_ngg_create_buf(adev, &adev->gfx.ngg.buf[NGG_PRIM],
>   				    amdgpu_prim_buf_per_se,
>   				    64 * 1024);
>   	if (r) {
>   		dev_err(adev->dev, "Failed to create Primitive Buffer\n");
>   		goto err;
> @@ -1412,20 +1413,21 @@ static int gfx_v9_0_ngg_init(struct amdgpu_device *adev)
>   	}
>   
>   out:
>   	adev->gfx.ngg.init = true;
>   	return 0;
>   err:
>   	gfx_v9_0_ngg_fini(adev);
>   	return r;
>   }
>   
> +/* TODO: remove */
>   static int gfx_v9_0_ngg_en(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_ring *ring = &adev->gfx.gfx_ring[0];
>   	int r;
>   	u32 data, base;
>   
>   	if (!amdgpu_ngg)
>   		return 0;
>   
>   	/* Program buffer size */
> @@ -1469,23 +1471,22 @@ static int gfx_v9_0_ngg_en(struct amdgpu_device *adev)
>   	/* Clear GDS reserved memory */
>   	r = amdgpu_ring_alloc(ring, 17);
>   	if (r) {
>   		DRM_ERROR("amdgpu: NGG failed to lock ring %d (%d).\n",
>   			  ring->idx, r);
>   		return r;
>   	}
>   
>   	gfx_v9_0_write_data_to_reg(ring, 0, false,
>   				   SOC15_REG_OFFSET(GC, 0, mmGDS_VMID0_SIZE),
> -			           (adev->gds.mem.total_size +
> -				    adev->gfx.ngg.gds_reserve_size) >>
> -				   AMDGPU_GDS_SHIFT);
> +			           adev->gds.mem.total_size +
> +				   adev->gfx.ngg.gds_reserve_size);
>   
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_DMA_DATA, 5));
>   	amdgpu_ring_write(ring, (PACKET3_DMA_DATA_CP_SYNC |
>   				PACKET3_DMA_DATA_DST_SEL(1) |
>   				PACKET3_DMA_DATA_SRC_SEL(2)));
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, adev->gfx.ngg.gds_reserve_addr);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, PACKET3_DMA_DATA_CMD_RAW_WAIT |
> @@ -1644,62 +1645,39 @@ static int gfx_v9_0_sw_init(void *handle)
>   	kiq = &adev->gfx.kiq;
>   	r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
>   	if (r)
>   		return r;
>   
>   	/* create MQD for all compute queues as wel as KIQ for SRIOV case */
>   	r = amdgpu_gfx_compute_mqd_sw_init(adev, sizeof(struct v9_mqd_allocation));
>   	if (r)
>   		return r;
>   
> -	/* reserve GDS, GWS and OA resource for gfx */
> -	r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
> -				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
> -				    &adev->gds.gds_gfx_bo, NULL, NULL);
> -	if (r)
> -		return r;
> -
> -	r = amdgpu_bo_create_kernel(adev, adev->gds.gws.gfx_partition_size,
> -				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
> -				    &adev->gds.gws_gfx_bo, NULL, NULL);
> -	if (r)
> -		return r;
> -
> -	r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
> -				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
> -				    &adev->gds.oa_gfx_bo, NULL, NULL);
> -	if (r)
> -		return r;
> -
>   	adev->gfx.ce_ram_size = 0x8000;
>   
>   	r = gfx_v9_0_gpu_early_init(adev);
>   	if (r)
>   		return r;
>   
>   	r = gfx_v9_0_ngg_init(adev);
>   	if (r)
>   		return r;
>   
>   	return 0;
>   }
>   
>   
>   static int gfx_v9_0_sw_fini(void *handle)
>   {
>   	int i;
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> -	amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
> -	amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
> -	amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
> -
>   	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>   		amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>   	for (i = 0; i < adev->gfx.num_compute_rings; i++)
>   		amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>   
>   	amdgpu_gfx_compute_mqd_sw_fini(adev);
>   	amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
>   	amdgpu_gfx_kiq_fini(adev);
>   
>   	gfx_v9_0_mec_fini(adev);
> @@ -1813,21 +1791,21 @@ static void gfx_v9_0_init_compute_vmid(struct amdgpu_device *adev)
>   		WREG32_SOC15(GC, 0, mmSH_MEM_CONFIG, sh_mem_config);
>   		WREG32_SOC15(GC, 0, mmSH_MEM_BASES, sh_mem_bases);
>   	}
>   	soc15_grbm_select(adev, 0, 0, 0, 0);
>   	mutex_unlock(&adev->srbm_mutex);
>   }
>   
>   static void gfx_v9_0_gpu_init(struct amdgpu_device *adev)
>   {
>   	u32 tmp;
> -	int i;
> +	int i, vmid;
>   
>   	WREG32_FIELD15(GC, 0, GRBM_CNTL, READ_TIMEOUT, 0xff);
>   
>   	gfx_v9_0_tiling_mode_table_init(adev);
>   
>   	gfx_v9_0_setup_rb(adev);
>   	gfx_v9_0_get_cu_info(adev, &adev->gfx.cu_info);
>   	adev->gfx.config.db_debug2 = RREG32_SOC15(GC, 0, mmDB_DEBUG2);
>   
>   	/* XXX SH_MEM regs */
> @@ -1869,20 +1847,43 @@ static void gfx_v9_0_gpu_init(struct amdgpu_device *adev)
>   		   (adev->gfx.config.sc_prim_fifo_size_frontend <<
>   			PA_SC_FIFO_SIZE__SC_FRONTEND_PRIM_FIFO_SIZE__SHIFT) |
>   		   (adev->gfx.config.sc_prim_fifo_size_backend <<
>   			PA_SC_FIFO_SIZE__SC_BACKEND_PRIM_FIFO_SIZE__SHIFT) |
>   		   (adev->gfx.config.sc_hiz_tile_fifo_size <<
>   			PA_SC_FIFO_SIZE__SC_HIZ_TILE_FIFO_SIZE__SHIFT) |
>   		   (adev->gfx.config.sc_earlyz_tile_fifo_size <<
>   			PA_SC_FIFO_SIZE__SC_EARLYZ_TILE_FIFO_SIZE__SHIFT));
>   	mutex_unlock(&adev->grbm_idx_mutex);
>   
> +	for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
> +		unsigned gds_size, gws_size, oa_size;
> +
> +		if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids) {
> +			gds_size = adev->gds.mem.gfx_size_per_vmid;
> +			gws_size = adev->gds.gws.gfx_size_per_vmid;
> +			oa_size = adev->gds.oa.gfx_size_per_vmid;
> +		} else {
> +			gds_size = adev->gds.mem.kfd_size_per_vmid;
> +			gws_size = adev->gds.gws.kfd_size_per_vmid;
> +			oa_size = adev->gds.oa.kfd_size_per_vmid;
> +		}
> +
> +		WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_BASE, 2 * vmid,
> +				    vmid * gds_size);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_SIZE, 2 * vmid,
> +				    gds_size);
> +		WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, vmid,
> +				    (vmid * gws_size) |
> +				    (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
> +		WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, vmid,
> +				    ((1 << oa_size) - 1) << (vmid * oa_size));
> +	}
>   }
>   
>   static void gfx_v9_0_wait_for_rlc_serdes(struct amdgpu_device *adev)
>   {
>   	u32 i, j, k;
>   	u32 mask;
>   
>   	mutex_lock(&adev->grbm_idx_mutex);
>   	for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
>   		for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
> @@ -3407,58 +3408,20 @@ static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>   	uint64_t clock;
>   
>   	mutex_lock(&adev->gfx.gpu_clock_mutex);
>   	WREG32_SOC15(GC, 0, mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
>   	clock = (uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_LSB) |
>   		((uint64_t)RREG32_SOC15(GC, 0, mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
>   	mutex_unlock(&adev->gfx.gpu_clock_mutex);
>   	return clock;
>   }
>   
> -static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
> -					  uint32_t vmid,
> -					  uint32_t gds_base, uint32_t gds_size,
> -					  uint32_t gws_base, uint32_t gws_size,
> -					  uint32_t oa_base, uint32_t oa_size)
> -{
> -	struct amdgpu_device *adev = ring->adev;
> -
> -	gds_base = gds_base >> AMDGPU_GDS_SHIFT;
> -	gds_size = gds_size >> AMDGPU_GDS_SHIFT;
> -
> -	gws_base = gws_base >> AMDGPU_GWS_SHIFT;
> -	gws_size = gws_size >> AMDGPU_GWS_SHIFT;
> -
> -	oa_base = oa_base >> AMDGPU_OA_SHIFT;
> -	oa_size = oa_size >> AMDGPU_OA_SHIFT;
> -
> -	/* GDS Base */
> -	gfx_v9_0_write_data_to_reg(ring, 0, false,
> -				   SOC15_REG_OFFSET(GC, 0, mmGDS_VMID0_BASE) + 2 * vmid,
> -				   gds_base);
> -
> -	/* GDS Size */
> -	gfx_v9_0_write_data_to_reg(ring, 0, false,
> -				   SOC15_REG_OFFSET(GC, 0, mmGDS_VMID0_SIZE) + 2 * vmid,
> -				   gds_size);
> -
> -	/* GWS */
> -	gfx_v9_0_write_data_to_reg(ring, 0, false,
> -				   SOC15_REG_OFFSET(GC, 0, mmGDS_GWS_VMID0) + vmid,
> -				   gws_size << GDS_GWS_VMID0__SIZE__SHIFT | gws_base);
> -
> -	/* OA */
> -	gfx_v9_0_write_data_to_reg(ring, 0, false,
> -				   SOC15_REG_OFFSET(GC, 0, mmGDS_OA_VMID0) + vmid,
> -				   (1 << (oa_size + oa_base)) - (1 << oa_base));
> -}
> -
>   static int gfx_v9_0_early_init(void *handle)
>   {
>   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   	adev->gfx.num_gfx_rings = GFX9_NUM_GFX_RINGS;
>   	adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
>   	gfx_v9_0_set_ring_funcs(adev);
>   	gfx_v9_0_set_irq_funcs(adev);
>   	gfx_v9_0_set_gds_init(adev);
>   	gfx_v9_0_set_rlc_funcs(adev);
> @@ -4695,21 +4658,20 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   		31 + /*	DE_META */
>   		3 + /* CNTX_CTRL */
>   		5 + /* HDP_INVL */
>   		8 + 8 + /* FENCE x2 */
>   		2, /* SWITCH_BUFFER */
>   	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_gfx */
>   	.emit_ib = gfx_v9_0_ring_emit_ib_gfx,
>   	.emit_fence = gfx_v9_0_ring_emit_fence,
>   	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
>   	.emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
> -	.emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
>   	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
>   	.test_ring = gfx_v9_0_ring_test_ring,
>   	.test_ib = gfx_v9_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.emit_switch_buffer = gfx_v9_ring_emit_sb,
>   	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
>   	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
>   	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
>   	.emit_tmz = gfx_v9_0_ring_emit_tmz,
> @@ -4722,34 +4684,32 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   	.type = AMDGPU_RING_TYPE_COMPUTE,
>   	.align_mask = 0xff,
>   	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
>   	.support_64bit_ptrs = true,
>   	.vmhub = AMDGPU_GFXHUB,
>   	.get_rptr = gfx_v9_0_ring_get_rptr_compute,
>   	.get_wptr = gfx_v9_0_ring_get_wptr_compute,
>   	.set_wptr = gfx_v9_0_ring_set_wptr_compute,
>   	.emit_frame_size =
> -		20 + /* gfx_v9_0_ring_emit_gds_switch */
>   		7 + /* gfx_v9_0_ring_emit_hdp_flush */
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>   		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>   		2 + /* gfx_v9_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
>   	.emit_ib = gfx_v9_0_ring_emit_ib_compute,
>   	.emit_fence = gfx_v9_0_ring_emit_fence,
>   	.emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
>   	.emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
> -	.emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
>   	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
>   	.test_ring = gfx_v9_0_ring_test_ring,
>   	.test_ib = gfx_v9_0_ring_test_ib,
>   	.insert_nop = amdgpu_ring_insert_nop,
>   	.pad_ib = amdgpu_ring_generic_pad_ib,
>   	.set_priority = gfx_v9_0_ring_set_priority_compute,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>   	.emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
>   };
> @@ -4757,21 +4717,20 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>   	.type = AMDGPU_RING_TYPE_KIQ,
>   	.align_mask = 0xff,
>   	.nop = PACKET3(PACKET3_NOP, 0x3FFF),
>   	.support_64bit_ptrs = true,
>   	.vmhub = AMDGPU_GFXHUB,
>   	.get_rptr = gfx_v9_0_ring_get_rptr_compute,
>   	.get_wptr = gfx_v9_0_ring_get_wptr_compute,
>   	.set_wptr = gfx_v9_0_ring_set_wptr_compute,
>   	.emit_frame_size =
> -		20 + /* gfx_v9_0_ring_emit_gds_switch */
>   		7 + /* gfx_v9_0_ring_emit_hdp_flush */
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>   		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>   		2 + /* gfx_v9_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
>   	.emit_ib_size =	4, /* gfx_v9_0_ring_emit_ib_compute */
>   	.emit_ib = gfx_v9_0_ring_emit_ib_compute,
>   	.emit_fence = gfx_v9_0_ring_emit_fence_kiq,
> @@ -4847,39 +4806,26 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev)
>   	}
>   }
>   
>   static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
>   {
>   	/* init asci gds info */
>   	adev->gds.mem.total_size = RREG32_SOC15(GC, 0, mmGDS_VMID0_SIZE);
>   	adev->gds.gws.total_size = 64;
>   	adev->gds.oa.total_size = 16;
>   
> -	if (adev->gds.mem.total_size == 64 * 1024) {
> -		adev->gds.mem.gfx_partition_size = 4096;
> -		adev->gds.mem.cs_partition_size = 4096;
> -
> -		adev->gds.gws.gfx_partition_size = 4;
> -		adev->gds.gws.cs_partition_size = 4;
> -
> -		adev->gds.oa.gfx_partition_size = 4;
> -		adev->gds.oa.cs_partition_size = 1;
> -	} else {
> -		adev->gds.mem.gfx_partition_size = 1024;
> -		adev->gds.mem.cs_partition_size = 1024;
> -
> -		adev->gds.gws.gfx_partition_size = 16;
> -		adev->gds.gws.cs_partition_size = 16;
> -
> -		adev->gds.oa.gfx_partition_size = 4;
> -		adev->gds.oa.cs_partition_size = 4;
> -	}
> +	adev->gds.mem.gfx_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.mem.kfd_size_per_vmid = adev->gds.mem.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.gws.gfx_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.gws.kfd_size_per_vmid = adev->gds.gws.total_size / AMDGPU_NUM_VMID;
> +	adev->gds.oa.gfx_size_per_vmid = adev->gds.oa.total_size / 8; /* gfx only */
> +	adev->gds.oa.kfd_size_per_vmid = 0;
>   }
>   
>   static void gfx_v9_0_set_user_cu_inactive_bitmap(struct amdgpu_device *adev,
>   						 u32 bitmap)
>   {
>   	u32 data;
>   
>   	if (!bitmap)
>   		return;
>   
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 94444eeba55b..9b9512b14cae 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -81,36 +81,27 @@ extern "C" {
>    * %AMDGPU_GEM_DOMAIN_CPU	System memory that is not GPU accessible.
>    * Memory in this pool could be swapped out to disk if there is pressure.
>    *
>    * %AMDGPU_GEM_DOMAIN_GTT	GPU accessible system memory, mapped into the
>    * GPU's virtual address space via gart. Gart memory linearizes non-contiguous
>    * pages of system memory, allows GPU access system memory in a linezrized
>    * fashion.
>    *
>    * %AMDGPU_GEM_DOMAIN_VRAM	Local video memory. For APUs, it is memory
>    * carved out by the BIOS.
> - *
> - * %AMDGPU_GEM_DOMAIN_GDS	Global on-chip data storage used to share data
> - * across shader threads.
> - *
> - * %AMDGPU_GEM_DOMAIN_GWS	Global wave sync, used to synchronize the
> - * execution of all the waves on a device.
> - *
> - * %AMDGPU_GEM_DOMAIN_OA	Ordered append, used by 3D or Compute engines
> - * for appending data.
>    */
>   #define AMDGPU_GEM_DOMAIN_CPU		0x1
>   #define AMDGPU_GEM_DOMAIN_GTT		0x2
>   #define AMDGPU_GEM_DOMAIN_VRAM		0x4
> -#define AMDGPU_GEM_DOMAIN_GDS		0x8
> -#define AMDGPU_GEM_DOMAIN_GWS		0x10
> -#define AMDGPU_GEM_DOMAIN_OA		0x20
> +#define AMDGPU_GEM_DOMAIN_GDS		0x8 /* non-functional */
> +#define AMDGPU_GEM_DOMAIN_GWS		0x10 /* non-functional */
> +#define AMDGPU_GEM_DOMAIN_OA		0x20 /* non-functional */
>   #define AMDGPU_GEM_DOMAIN_MASK		(AMDGPU_GEM_DOMAIN_CPU | \
>   					 AMDGPU_GEM_DOMAIN_GTT | \
>   					 AMDGPU_GEM_DOMAIN_VRAM | \
>   					 AMDGPU_GEM_DOMAIN_GDS | \
>   					 AMDGPU_GEM_DOMAIN_GWS | \
>   					 AMDGPU_GEM_DOMAIN_OA)
>   
>   /* Flag that CPU access will be required for the case of VRAM domain */
>   #define AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED	(1 << 0)
>   /* Flag that CPU access will not work, this VRAM domain is invisible */
That's OK. We don't need IBs to get the same VMID.

Marek

On Thu, Sep 13, 2018 at 4:40 AM, Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
> As discussed internally that doesn't work because threads don't necessarily
> get the same VMID assigned.
>
> Christian.
>
> Am 12.09.2018 um 22:33 schrieb Marek Olšák:
>>
>> From: Marek Olšák <marek.olsak@amd.com>
>>
>> I've chosen to do it like this because it's easy and allows an arbitrary
>> number of processes.
>>
>> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  10 --
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |   3 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      |  20 ----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h     |  19 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c     |  24 +---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c     |   6 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h     |   7 --
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |   3 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  14 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  |  21 ----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |   6 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |   5 -
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     |  61 ----------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |   8 --
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  34 +-----
>>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c       | 125 +++++---------------
>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c       | 123 +++++--------------
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c       | 124 ++++++-------------
>>   include/uapi/drm/amdgpu_drm.h               |  15 +--
>>   19 files changed, 109 insertions(+), 519 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> index b80243d3972e..7264a4930b88 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>> @@ -71,23 +71,20 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev,
>> struct drm_file *filp,
>>                                 / sizeof(struct amdgpu_bo_list_entry))
>>                 return -EINVAL;
>>         size = sizeof(struct amdgpu_bo_list);
>>         size += num_entries * sizeof(struct amdgpu_bo_list_entry);
>>         list = kvmalloc(size, GFP_KERNEL);
>>         if (!list)
>>                 return -ENOMEM;
>>         kref_init(&list->refcount);
>> -       list->gds_obj = adev->gds.gds_gfx_bo;
>> -       list->gws_obj = adev->gds.gws_gfx_bo;
>> -       list->oa_obj = adev->gds.oa_gfx_bo;
>>         array = amdgpu_bo_list_array_entry(list, 0);
>>         memset(array, 0, num_entries * sizeof(struct
>> amdgpu_bo_list_entry));
>>         for (i = 0; i < num_entries; ++i) {
>>                 struct amdgpu_bo_list_entry *entry;
>>                 struct drm_gem_object *gobj;
>>                 struct amdgpu_bo *bo;
>>                 struct mm_struct *usermm;
>>   @@ -111,27 +108,20 @@ int amdgpu_bo_list_create(struct amdgpu_device
>> *adev, struct drm_file *filp,
>>                 } else {
>>                         entry = &array[last_entry++];
>>                 }
>>                 entry->robj = bo;
>>                 entry->priority = min(info[i].bo_priority,
>>                                       AMDGPU_BO_LIST_MAX_PRIORITY);
>>                 entry->tv.bo = &entry->robj->tbo;
>>                 entry->tv.shared = !entry->robj->prime_shared_count;
>>   -             if (entry->robj->preferred_domains ==
>> AMDGPU_GEM_DOMAIN_GDS)
>> -                       list->gds_obj = entry->robj;
>> -               if (entry->robj->preferred_domains ==
>> AMDGPU_GEM_DOMAIN_GWS)
>> -                       list->gws_obj = entry->robj;
>> -               if (entry->robj->preferred_domains ==
>> AMDGPU_GEM_DOMAIN_OA)
>> -                       list->oa_obj = entry->robj;
>> -
>>                 total_size += amdgpu_bo_size(entry->robj);
>>                 trace_amdgpu_bo_list_set(list, entry->robj);
>>         }
>>         list->first_userptr = first_userptr;
>>         list->num_entries = num_entries;
>>         trace_amdgpu_cs_bo_status(list->num_entries, total_size);
>>         *result = list;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> index 61b089768e1c..30f12a60aa28 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>> @@ -36,23 +36,20 @@ struct amdgpu_bo_list_entry {
>>         struct ttm_validate_buffer      tv;
>>         struct amdgpu_bo_va             *bo_va;
>>         uint32_t                        priority;
>>         struct page                     **user_pages;
>>         int                             user_invalidated;
>>   };
>>     struct amdgpu_bo_list {
>>         struct rcu_head rhead;
>>         struct kref refcount;
>> -       struct amdgpu_bo *gds_obj;
>> -       struct amdgpu_bo *gws_obj;
>> -       struct amdgpu_bo *oa_obj;
>>         unsigned first_userptr;
>>         unsigned num_entries;
>>   };
>>     int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
>>                        struct amdgpu_bo_list **result);
>>   void amdgpu_bo_list_get_list(struct amdgpu_bo_list *list,
>>                              struct list_head *validated);
>>   void amdgpu_bo_list_put(struct amdgpu_bo_list *list);
>>   int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 1081fd00b059..88b58facf29e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -565,23 +565,20 @@ static int amdgpu_cs_list_validate(struct
>> amdgpu_cs_parser *p,
>>         return 0;
>>   }
>>     static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>>                                 union drm_amdgpu_cs *cs)
>>   {
>>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>>         struct amdgpu_vm *vm = &fpriv->vm;
>>         struct amdgpu_bo_list_entry *e;
>>         struct list_head duplicates;
>> -       struct amdgpu_bo *gds;
>> -       struct amdgpu_bo *gws;
>> -       struct amdgpu_bo *oa;
>>         unsigned tries = 10;
>>         int r;
>>         INIT_LIST_HEAD(&p->validated);
>>         /* p->bo_list could already be assigned if
>> AMDGPU_CHUNK_ID_BO_HANDLES is present */
>>         if (cs->in.bo_list_handle) {
>>                 if (p->bo_list)
>>                         return -EINVAL;
>>   @@ -705,40 +702,23 @@ static int amdgpu_cs_parser_bos(struct
>> amdgpu_cs_parser *p,
>>         r = amdgpu_cs_list_validate(p, &p->validated);
>>         if (r) {
>>                 DRM_ERROR("amdgpu_cs_list_validate(validated) failed.\n");
>>                 goto error_validate;
>>         }
>>         amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
>>                                      p->bytes_moved_vis);
>>   -     gds = p->bo_list->gds_obj;
>> -       gws = p->bo_list->gws_obj;
>> -       oa = p->bo_list->oa_obj;
>> -
>>         amdgpu_bo_list_for_each_entry(e, p->bo_list)
>>                 e->bo_va = amdgpu_vm_bo_find(vm, e->robj);
>>   -     if (gds) {
>> -               p->job->gds_base = amdgpu_bo_gpu_offset(gds);
>> -               p->job->gds_size = amdgpu_bo_size(gds);
>> -       }
>> -       if (gws) {
>> -               p->job->gws_base = amdgpu_bo_gpu_offset(gws);
>> -               p->job->gws_size = amdgpu_bo_size(gws);
>> -       }
>> -       if (oa) {
>> -               p->job->oa_base = amdgpu_bo_gpu_offset(oa);
>> -               p->job->oa_size = amdgpu_bo_size(oa);
>> -       }
>> -
>>         if (!r && p->uf_entry.robj) {
>>                 struct amdgpu_bo *uf = p->uf_entry.robj;
>>                 r = amdgpu_ttm_alloc_gart(&uf->tbo);
>>                 p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
>>         }
>>     error_validate:
>>         if (r)
>>                 ttm_eu_backoff_reservation(&p->ticket, &p->validated);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>> index e73728d90388..69ba25c2e921 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>> @@ -17,48 +17,33 @@
>>    * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES
>> OR
>>    * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>    * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>    * OTHER DEALINGS IN THE SOFTWARE.
>>    *
>>    */
>>     #ifndef __AMDGPU_GDS_H__
>>   #define __AMDGPU_GDS_H__
>>   -/* Because TTM request that alloacted buffer should be PAGE_SIZE
>> aligned,
>> - * we should report GDS/GWS/OA size as PAGE_SIZE aligned
>> - * */
>> -#define AMDGPU_GDS_SHIFT       2
>> -#define AMDGPU_GWS_SHIFT       PAGE_SHIFT
>> -#define AMDGPU_OA_SHIFT                PAGE_SHIFT
>> -
>>   struct amdgpu_ring;
>>   struct amdgpu_bo;
>>     struct amdgpu_gds_asic_info {
>>         uint32_t        total_size;
>> -       uint32_t        gfx_partition_size;
>> -       uint32_t        cs_partition_size;
>> +       uint32_t        gfx_size_per_vmid;
>> +       uint32_t        kfd_size_per_vmid;
>>   };
>>     struct amdgpu_gds {
>>         struct amdgpu_gds_asic_info     mem;
>>         struct amdgpu_gds_asic_info     gws;
>>         struct amdgpu_gds_asic_info     oa;
>> -       /* At present, GDS, GWS and OA resources for gfx (graphics)
>> -        * is always pre-allocated and available for graphics operation.
>> -        * Such resource is shared between all gfx clients.
>> -        * TODO: move this operation to user space
>> -        * */
>> -       struct amdgpu_bo*               gds_gfx_bo;
>> -       struct amdgpu_bo*               gws_gfx_bo;
>> -       struct amdgpu_bo*               oa_gfx_bo;
>>   };
>>     struct amdgpu_gds_reg_offset {
>>         uint32_t        mem_base;
>>         uint32_t        mem_size;
>>         uint32_t        gws;
>>         uint32_t        oa;
>>   };
>>     #endif /* __AMDGPU_GDS_H__ */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> index d30a0838851b..c87ad4b4d0b6 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>> @@ -223,43 +223,25 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev,
>> void *data,
>>         if (flags & ~(AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>                       AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
>>                       AMDGPU_GEM_CREATE_CPU_GTT_USWC |
>>                       AMDGPU_GEM_CREATE_VRAM_CLEARED |
>>                       AMDGPU_GEM_CREATE_VM_ALWAYS_VALID |
>>                       AMDGPU_GEM_CREATE_EXPLICIT_SYNC))
>>                 return -EINVAL;
>>         /* reject invalid gem domains */
>> -       if (args->in.domains & ~AMDGPU_GEM_DOMAIN_MASK)
>> +       if (args->in.domains & ~(AMDGPU_GEM_DOMAIN_CPU |
>> +                                AMDGPU_GEM_DOMAIN_GTT |
>> +                                AMDGPU_GEM_DOMAIN_VRAM))
>>                 return -EINVAL;
>>   -     /* create a gem object to contain this object in */
>> -       if (args->in.domains & (AMDGPU_GEM_DOMAIN_GDS |
>> -           AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA)) {
>> -               if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
>> -                       /* if gds bo is created from user space, it must
>> be
>> -                        * passed to bo list
>> -                        */
>> -                       DRM_ERROR("GDS bo cannot be per-vm-bo\n");
>> -                       return -EINVAL;
>> -               }
>> -               flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
>> -               if (args->in.domains == AMDGPU_GEM_DOMAIN_GDS)
>> -                       size = size << AMDGPU_GDS_SHIFT;
>> -               else if (args->in.domains == AMDGPU_GEM_DOMAIN_GWS)
>> -                       size = size << AMDGPU_GWS_SHIFT;
>> -               else if (args->in.domains == AMDGPU_GEM_DOMAIN_OA)
>> -                       size = size << AMDGPU_OA_SHIFT;
>> -               else
>> -                       return -EINVAL;
>> -       }
>>         size = roundup(size, PAGE_SIZE);
>>         if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
>>                 r = amdgpu_bo_reserve(vm->root.base.bo, false);
>>                 if (r)
>>                         return r;
>>                 resv = vm->root.base.bo->tbo.resv;
>>         }
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>> index 3a072a7a39f0..c2e6a1a11d7f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>> @@ -516,26 +516,20 @@ void amdgpu_vmid_free_reserved(struct amdgpu_device
>> *adev,
>>    * Reset saved GDW, GWS and OA to force switch on next flush.
>>    */
>>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>>                        unsigned vmid)
>>   {
>>         struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>>         struct amdgpu_vmid *id = &id_mgr->ids[vmid];
>>         mutex_lock(&id_mgr->lock);
>>         id->owner = 0;
>> -       id->gds_base = 0;
>> -       id->gds_size = 0;
>> -       id->gws_base = 0;
>> -       id->gws_size = 0;
>> -       id->oa_base = 0;
>> -       id->oa_size = 0;
>>         mutex_unlock(&id_mgr->lock);
>>   }
>>     /**
>>    * amdgpu_vmid_reset_all - reset VMID to zero
>>    *
>>    * @adev: amdgpu device structure
>>    *
>>    * Reset VMID to force flush on next use
>>    */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>> index 7625419f0fc2..06078e665532 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>> @@ -44,27 +44,20 @@ struct amdgpu_vmid {
>>         struct amdgpu_sync      active;
>>         struct dma_fence        *last_flush;
>>         uint64_t                owner;
>>         uint64_t                pd_gpu_addr;
>>         /* last flushed PD/PT update */
>>         struct dma_fence        *flushed_updates;
>>         uint32_t                current_gpu_reset_count;
>>   -     uint32_t                gds_base;
>> -       uint32_t                gds_size;
>> -       uint32_t                gws_base;
>> -       uint32_t                gws_size;
>> -       uint32_t                oa_base;
>> -       uint32_t                oa_size;
>> -
>>         unsigned                pasid;
>>         struct dma_fence        *pasid_mapping;
>>   };
>>     struct amdgpu_vmid_mgr {
>>         struct mutex            lock;
>>         unsigned                num_ids;
>>         struct list_head        ids_lru;
>>         struct amdgpu_vmid      ids[AMDGPU_NUM_VMID];
>>         atomic_t                reserved_vmid_num;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index 57cfe78a262b..3db553f6ad01 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -42,23 +42,20 @@ struct amdgpu_job {
>>         struct amdgpu_sync      sched_sync;
>>         struct amdgpu_ib        *ibs;
>>         struct dma_fence        *fence; /* the hw fence */
>>         uint32_t                preamble_status;
>>         uint32_t                num_ibs;
>>         void                    *owner;
>>         bool                    vm_needs_flush;
>>         uint64_t                vm_pd_addr;
>>         unsigned                vmid;
>>         unsigned                pasid;
>> -       uint32_t                gds_base, gds_size;
>> -       uint32_t                gws_base, gws_size;
>> -       uint32_t                oa_base, oa_size;
>>         uint32_t                vram_lost_counter;
>>         /* user fence handling */
>>         uint64_t                uf_addr;
>>         uint64_t                uf_sequence;
>>     };
>>     int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>                      struct amdgpu_job **job, struct amdgpu_vm *vm);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 29ac3873eeb0..209954290954 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -517,27 +517,27 @@ static int amdgpu_info_ioctl(struct drm_device *dev,
>> void *data, struct drm_file
>>         case AMDGPU_INFO_VIS_VRAM_USAGE:
>>                 ui64 =
>> amdgpu_vram_mgr_vis_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
>>                 return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT :
>> 0;
>>         case AMDGPU_INFO_GTT_USAGE:
>>                 ui64 =
>> amdgpu_gtt_mgr_usage(&adev->mman.bdev.man[TTM_PL_TT]);
>>                 return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT :
>> 0;
>>         case AMDGPU_INFO_GDS_CONFIG: {
>>                 struct drm_amdgpu_info_gds gds_info;
>>                 memset(&gds_info, 0, sizeof(gds_info));
>> -               gds_info.gds_gfx_partition_size =
>> adev->gds.mem.gfx_partition_size >> AMDGPU_GDS_SHIFT;
>> -               gds_info.compute_partition_size =
>> adev->gds.mem.cs_partition_size >> AMDGPU_GDS_SHIFT;
>> -               gds_info.gds_total_size = adev->gds.mem.total_size >>
>> AMDGPU_GDS_SHIFT;
>> -               gds_info.gws_per_gfx_partition =
>> adev->gds.gws.gfx_partition_size >> AMDGPU_GWS_SHIFT;
>> -               gds_info.gws_per_compute_partition =
>> adev->gds.gws.cs_partition_size >> AMDGPU_GWS_SHIFT;
>> -               gds_info.oa_per_gfx_partition =
>> adev->gds.oa.gfx_partition_size >> AMDGPU_OA_SHIFT;
>> -               gds_info.oa_per_compute_partition =
>> adev->gds.oa.cs_partition_size >> AMDGPU_OA_SHIFT;
>> +               gds_info.gds_gfx_partition_size =
>> adev->gds.mem.gfx_size_per_vmid;
>> +               gds_info.compute_partition_size =
>> adev->gds.mem.kfd_size_per_vmid;
>> +               gds_info.gds_total_size = adev->gds.mem.total_size;
>> +               gds_info.gws_per_gfx_partition =
>> adev->gds.gws.gfx_size_per_vmid;
>> +               gds_info.gws_per_compute_partition =
>> adev->gds.gws.kfd_size_per_vmid;
>> +               gds_info.oa_per_gfx_partition =
>> adev->gds.oa.gfx_size_per_vmid;
>> +               gds_info.oa_per_compute_partition =
>> adev->gds.oa.kfd_size_per_vmid;
>>                 return copy_to_user(out, &gds_info,
>>                                     min((size_t)size, sizeof(gds_info))) ?
>> -EFAULT : 0;
>>         }
>>         case AMDGPU_INFO_VRAM_GTT: {
>>                 struct drm_amdgpu_info_vram_gtt vram_gtt;
>>                 vram_gtt.vram_size = adev->gmc.real_vram_size -
>>                         atomic64_read(&adev->vram_pin_size);
>>                 vram_gtt.vram_cpu_accessible_size =
>> adev->gmc.visible_vram_size -
>>                         atomic64_read(&adev->visible_pin_size);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> index de990bdcdd6c..76770a8c29a5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>> @@ -178,41 +178,20 @@ void amdgpu_bo_placement_from_domain(struct
>> amdgpu_bo *abo, u32 domain)
>>                 places[c].lpfn = 0;
>>                 places[c].flags = TTM_PL_FLAG_SYSTEM;
>>                 if (flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
>>                         places[c].flags |= TTM_PL_FLAG_WC |
>>                                 TTM_PL_FLAG_UNCACHED;
>>                 else
>>                         places[c].flags |= TTM_PL_FLAG_CACHED;
>>                 c++;
>>         }
>>   -     if (domain & AMDGPU_GEM_DOMAIN_GDS) {
>> -               places[c].fpfn = 0;
>> -               places[c].lpfn = 0;
>> -               places[c].flags = TTM_PL_FLAG_UNCACHED |
>> AMDGPU_PL_FLAG_GDS;
>> -               c++;
>> -       }
>> -
>> -       if (domain & AMDGPU_GEM_DOMAIN_GWS) {
>> -               places[c].fpfn = 0;
>> -               places[c].lpfn = 0;
>> -               places[c].flags = TTM_PL_FLAG_UNCACHED |
>> AMDGPU_PL_FLAG_GWS;
>> -               c++;
>> -       }
>> -
>> -       if (domain & AMDGPU_GEM_DOMAIN_OA) {
>> -               places[c].fpfn = 0;
>> -               places[c].lpfn = 0;
>> -               places[c].flags = TTM_PL_FLAG_UNCACHED |
>> AMDGPU_PL_FLAG_OA;
>> -               c++;
>> -       }
>> -
>>         if (!c) {
>>                 places[c].fpfn = 0;
>>                 places[c].lpfn = 0;
>>                 places[c].flags = TTM_PL_MASK_CACHING |
>> TTM_PL_FLAG_SYSTEM;
>>                 c++;
>>         }
>>         BUG_ON(c >= AMDGPU_BO_MAX_PLACEMENTS);
>>         placement->num_placement = c;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> index 907fdf46d895..e089964cbcb7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>> @@ -120,26 +120,20 @@ static inline struct amdgpu_bo
>> *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
>>    */
>>   static inline unsigned amdgpu_mem_type_to_domain(u32 mem_type)
>>   {
>>         switch (mem_type) {
>>         case TTM_PL_VRAM:
>>                 return AMDGPU_GEM_DOMAIN_VRAM;
>>         case TTM_PL_TT:
>>                 return AMDGPU_GEM_DOMAIN_GTT;
>>         case TTM_PL_SYSTEM:
>>                 return AMDGPU_GEM_DOMAIN_CPU;
>> -       case AMDGPU_PL_GDS:
>> -               return AMDGPU_GEM_DOMAIN_GDS;
>> -       case AMDGPU_PL_GWS:
>> -               return AMDGPU_GEM_DOMAIN_GWS;
>> -       case AMDGPU_PL_OA:
>> -               return AMDGPU_GEM_DOMAIN_OA;
>>         default:
>>                 break;
>>         }
>>         return 0;
>>   }
>>     /**
>>    * amdgpu_bo_reserve - reserve bo
>>    * @bo:               bo structure
>>    * @no_intr:  don't return -ERESTARTSYS on pending signal
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 9cc239968e40..f6ea9604e611 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -130,24 +130,20 @@ struct amdgpu_ring_funcs {
>>         /* command emit functions */
>>         void (*emit_ib)(struct amdgpu_ring *ring,
>>                         struct amdgpu_ib *ib,
>>                         unsigned vmid, bool ctx_switch);
>>         void (*emit_fence)(struct amdgpu_ring *ring, uint64_t addr,
>>                            uint64_t seq, unsigned flags);
>>         void (*emit_pipeline_sync)(struct amdgpu_ring *ring);
>>         void (*emit_vm_flush)(struct amdgpu_ring *ring, unsigned vmid,
>>                               uint64_t pd_addr);
>>         void (*emit_hdp_flush)(struct amdgpu_ring *ring);
>> -       void (*emit_gds_switch)(struct amdgpu_ring *ring, uint32_t vmid,
>> -                               uint32_t gds_base, uint32_t gds_size,
>> -                               uint32_t gws_base, uint32_t gws_size,
>> -                               uint32_t oa_base, uint32_t oa_size);
>>         /* testing functions */
>>         int (*test_ring)(struct amdgpu_ring *ring);
>>         int (*test_ib)(struct amdgpu_ring *ring, long timeout);
>>         /* insert NOP packets */
>>         void (*insert_nop)(struct amdgpu_ring *ring, uint32_t count);
>>         void (*insert_start)(struct amdgpu_ring *ring);
>>         void (*insert_end)(struct amdgpu_ring *ring);
>>         /* pad the indirect buffer to the necessary number of dw */
>>         void (*pad_ib)(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>>         unsigned (*init_cond_exec)(struct amdgpu_ring *ring);
>> @@ -226,21 +222,20 @@ struct amdgpu_ring {
>>   #define amdgpu_ring_patch_cs_in_place(r, p, ib)
>> ((r)->funcs->patch_cs_in_place((p), (ib)))
>>   #define amdgpu_ring_test_ring(r) (r)->funcs->test_ring((r))
>>   #define amdgpu_ring_test_ib(r, t) (r)->funcs->test_ib((r), (t))
>>   #define amdgpu_ring_get_rptr(r) (r)->funcs->get_rptr((r))
>>   #define amdgpu_ring_get_wptr(r) (r)->funcs->get_wptr((r))
>>   #define amdgpu_ring_set_wptr(r) (r)->funcs->set_wptr((r))
>>   #define amdgpu_ring_emit_ib(r, ib, vmid, c) (r)->funcs->emit_ib((r),
>> (ib), (vmid), (c))
>>   #define amdgpu_ring_emit_pipeline_sync(r)
>> (r)->funcs->emit_pipeline_sync((r))
>>   #define amdgpu_ring_emit_vm_flush(r, vmid, addr)
>> (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>>   #define amdgpu_ring_emit_fence(r, addr, seq, flags)
>> (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>> -#define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as)
>> (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
>>   #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>   #define amdgpu_ring_emit_switch_buffer(r)
>> (r)->funcs->emit_switch_buffer((r))
>>   #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r),
>> (d))
>>   #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d),
>> (v))
>>   #define amdgpu_ring_emit_reg_wait(r, d, v, m)
>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>   #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m)
>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
>>   #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index 8a158ee922f7..2cc62b0e7ea8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -195,30 +195,20 @@ static int amdgpu_init_mem_type(struct ttm_bo_device
>> *bdev, uint32_t type,
>>                 break;
>>         case TTM_PL_VRAM:
>>                 /* "On-card" video ram */
>>                 man->func = &amdgpu_vram_mgr_func;
>>                 man->gpu_offset = adev->gmc.vram_start;
>>                 man->flags = TTM_MEMTYPE_FLAG_FIXED |
>>                              TTM_MEMTYPE_FLAG_MAPPABLE;
>>                 man->available_caching = TTM_PL_FLAG_UNCACHED |
>> TTM_PL_FLAG_WC;
>>                 man->default_caching = TTM_PL_FLAG_WC;
>>                 break;
>> -       case AMDGPU_PL_GDS:
>> -       case AMDGPU_PL_GWS:
>> -       case AMDGPU_PL_OA:
>> -               /* On-chip GDS memory*/
>> -               man->func = &ttm_bo_manager_func;
>> -               man->gpu_offset = 0;
>> -               man->flags = TTM_MEMTYPE_FLAG_FIXED |
>> TTM_MEMTYPE_FLAG_CMA;
>> -               man->available_caching = TTM_PL_FLAG_UNCACHED;
>> -               man->default_caching = TTM_PL_FLAG_UNCACHED;
>> -               break;
>>         default:
>>                 DRM_ERROR("Unsupported memory type %u\n", (unsigned)type);
>>                 return -EINVAL;
>>         }
>>         return 0;
>>   }
>>     /**
>>    * amdgpu_evict_flags - Compute placement flags
>>    *
>> @@ -1039,25 +1029,20 @@ static int amdgpu_ttm_backend_bind(struct ttm_tt
>> *ttm,
>>                 if (r) {
>>                         DRM_ERROR("failed to pin userptr\n");
>>                         return r;
>>                 }
>>         }
>>         if (!ttm->num_pages) {
>>                 WARN(1, "nothing to bind %lu pages for mreg %p back
>> %p!\n",
>>                      ttm->num_pages, bo_mem, ttm);
>>         }
>>   -     if (bo_mem->mem_type == AMDGPU_PL_GDS ||
>> -           bo_mem->mem_type == AMDGPU_PL_GWS ||
>> -           bo_mem->mem_type == AMDGPU_PL_OA)
>> -               return -EINVAL;
>> -
>>         if (!amdgpu_gtt_mgr_has_gart_addr(bo_mem)) {
>>                 gtt->offset = AMDGPU_BO_INVALID_OFFSET;
>>                 return 0;
>>         }
>>         /* compute PTE flags relevant to this BO memory */
>>         flags = amdgpu_ttm_tt_pte_flags(adev, ttm, bo_mem);
>>         /* bind pages into GART page tables */
>>         gtt->offset = ((u64)bo_mem->start << PAGE_SHIFT) -
>> adev->gmc.gart_start;
>> @@ -1818,60 +1803,20 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>>         /* Initialize GTT memory pool */
>>         r = ttm_bo_init_mm(&adev->mman.bdev, TTM_PL_TT, gtt_size >>
>> PAGE_SHIFT);
>>         if (r) {
>>                 DRM_ERROR("Failed initializing GTT heap.\n");
>>                 return r;
>>         }
>>         DRM_INFO("amdgpu: %uM of GTT memory ready.\n",
>>                  (unsigned)(gtt_size / (1024 * 1024)));
>>   -     /* Initialize various on-chip memory pools */
>> -       adev->gds.mem.total_size = adev->gds.mem.total_size <<
>> AMDGPU_GDS_SHIFT;
>> -       adev->gds.mem.gfx_partition_size =
>> adev->gds.mem.gfx_partition_size << AMDGPU_GDS_SHIFT;
>> -       adev->gds.mem.cs_partition_size = adev->gds.mem.cs_partition_size
>> << AMDGPU_GDS_SHIFT;
>> -       adev->gds.gws.total_size = adev->gds.gws.total_size <<
>> AMDGPU_GWS_SHIFT;
>> -       adev->gds.gws.gfx_partition_size =
>> adev->gds.gws.gfx_partition_size << AMDGPU_GWS_SHIFT;
>> -       adev->gds.gws.cs_partition_size = adev->gds.gws.cs_partition_size
>> << AMDGPU_GWS_SHIFT;
>> -       adev->gds.oa.total_size = adev->gds.oa.total_size <<
>> AMDGPU_OA_SHIFT;
>> -       adev->gds.oa.gfx_partition_size = adev->gds.oa.gfx_partition_size
>> << AMDGPU_OA_SHIFT;
>> -       adev->gds.oa.cs_partition_size = adev->gds.oa.cs_partition_size <<
>> AMDGPU_OA_SHIFT;
>> -       /* GDS Memory */
>> -       if (adev->gds.mem.total_size) {
>> -               r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GDS,
>> -                                  adev->gds.mem.total_size >>
>> PAGE_SHIFT);
>> -               if (r) {
>> -                       DRM_ERROR("Failed initializing GDS heap.\n");
>> -                       return r;
>> -               }
>> -       }
>> -
>> -       /* GWS */
>> -       if (adev->gds.gws.total_size) {
>> -               r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GWS,
>> -                                  adev->gds.gws.total_size >>
>> PAGE_SHIFT);
>> -               if (r) {
>> -                       DRM_ERROR("Failed initializing gws heap.\n");
>> -                       return r;
>> -               }
>> -       }
>> -
>> -       /* OA */
>> -       if (adev->gds.oa.total_size) {
>> -               r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_OA,
>> -                                  adev->gds.oa.total_size >> PAGE_SHIFT);
>> -               if (r) {
>> -                       DRM_ERROR("Failed initializing oa heap.\n");
>> -                       return r;
>> -               }
>> -       }
>> -
>>         /* Register debugfs entries for amdgpu_ttm */
>>         r = amdgpu_ttm_debugfs_init(adev);
>>         if (r) {
>>                 DRM_ERROR("Failed to init debugfs\n");
>>                 return r;
>>         }
>>         return 0;
>>   }
>>     /**
>> @@ -1892,26 +1837,20 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
>>                 return;
>>         amdgpu_ttm_debugfs_fini(adev);
>>         amdgpu_ttm_fw_reserve_vram_fini(adev);
>>         if (adev->mman.aper_base_kaddr)
>>                 iounmap(adev->mman.aper_base_kaddr);
>>         adev->mman.aper_base_kaddr = NULL;
>>         ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_VRAM);
>>         ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_TT);
>> -       if (adev->gds.mem.total_size)
>> -               ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GDS);
>> -       if (adev->gds.gws.total_size)
>> -               ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GWS);
>> -       if (adev->gds.oa.total_size)
>> -               ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_OA);
>>         ttm_bo_device_release(&adev->mman.bdev);
>>         amdgpu_ttm_global_fini(adev);
>>         adev->mman.initialized = false;
>>         DRM_INFO("amdgpu: ttm finalized\n");
>>   }
>>     /**
>>    * amdgpu_ttm_set_buffer_funcs_status - enable/disable use of buffer
>> functions
>>    *
>>    * @adev: amdgpu_device pointer
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> index fe8f276e9811..04557a382b19 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> @@ -20,28 +20,20 @@
>>    * OTHER DEALINGS IN THE SOFTWARE.
>>    *
>>    */
>>     #ifndef __AMDGPU_TTM_H__
>>   #define __AMDGPU_TTM_H__
>>     #include "amdgpu.h"
>>   #include <drm/gpu_scheduler.h>
>>   -#define AMDGPU_PL_GDS                (TTM_PL_PRIV + 0)
>> -#define AMDGPU_PL_GWS          (TTM_PL_PRIV + 1)
>> -#define AMDGPU_PL_OA           (TTM_PL_PRIV + 2)
>> -
>> -#define AMDGPU_PL_FLAG_GDS             (TTM_PL_FLAG_PRIV << 0)
>> -#define AMDGPU_PL_FLAG_GWS             (TTM_PL_FLAG_PRIV << 1)
>> -#define AMDGPU_PL_FLAG_OA              (TTM_PL_FLAG_PRIV << 2)
>> -
>>   #define AMDGPU_GTT_MAX_TRANSFER_SIZE  512
>>   #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS       2
>>     struct amdgpu_mman {
>>         struct ttm_bo_global_ref        bo_global_ref;
>>         struct drm_global_reference     mem_global_ref;
>>         struct ttm_bo_device            bdev;
>>         bool                            mem_global_referenced;
>>         bool                            initialized;
>>         void __iomem                    *aper_base_kaddr;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index be1659fedf94..c66f1c6f0ba8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -803,86 +803,69 @@ void amdgpu_vm_check_compute_bug(struct
>> amdgpu_device *adev)
>>    * Returns:
>>    * True if sync is needed.
>>    */
>>   bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>>                                   struct amdgpu_job *job)
>>   {
>>         struct amdgpu_device *adev = ring->adev;
>>         unsigned vmhub = ring->funcs->vmhub;
>>         struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>>         struct amdgpu_vmid *id;
>> -       bool gds_switch_needed;
>>         bool vm_flush_needed = job->vm_needs_flush ||
>> ring->has_compute_vm_bug;
>>         if (job->vmid == 0)
>>                 return false;
>>         id = &id_mgr->ids[job->vmid];
>> -       gds_switch_needed = ring->funcs->emit_gds_switch && (
>> -               id->gds_base != job->gds_base ||
>> -               id->gds_size != job->gds_size ||
>> -               id->gws_base != job->gws_base ||
>> -               id->gws_size != job->gws_size ||
>> -               id->oa_base != job->oa_base ||
>> -               id->oa_size != job->oa_size);
>>         if (amdgpu_vmid_had_gpu_reset(adev, id))
>>                 return true;
>>   -     return vm_flush_needed || gds_switch_needed;
>> +       return vm_flush_needed;
>>   }
>>     /**
>>    * amdgpu_vm_flush - hardware flush the vm
>>    *
>>    * @ring: ring to use for flush
>>    * @job:  related job
>>    * @need_pipe_sync: is pipe sync needed
>>    *
>>    * Emit a VM flush when it is necessary.
>>    *
>>    * Returns:
>>    * 0 on success, errno otherwise.
>>    */
>>   int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>> bool need_pipe_sync)
>>   {
>>         struct amdgpu_device *adev = ring->adev;
>>         unsigned vmhub = ring->funcs->vmhub;
>>         struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>>         struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>> -       bool gds_switch_needed = ring->funcs->emit_gds_switch && (
>> -               id->gds_base != job->gds_base ||
>> -               id->gds_size != job->gds_size ||
>> -               id->gws_base != job->gws_base ||
>> -               id->gws_size != job->gws_size ||
>> -               id->oa_base != job->oa_base ||
>> -               id->oa_size != job->oa_size);
>>         bool vm_flush_needed = job->vm_needs_flush;
>>         bool pasid_mapping_needed = id->pasid != job->pasid ||
>>                 !id->pasid_mapping ||
>>                 !dma_fence_is_signaled(id->pasid_mapping);
>>         struct dma_fence *fence = NULL;
>>         unsigned patch_offset = 0;
>>         int r;
>>         if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>> -               gds_switch_needed = true;
>>                 vm_flush_needed = true;
>>                 pasid_mapping_needed = true;
>>         }
>>   -     gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>>         vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
>>                         job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>>         pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
>>                 ring->funcs->emit_wreg;
>>   -     if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>> +       if (!vm_flush_needed && !need_pipe_sync)
>>                 return 0;
>>         if (ring->funcs->init_cond_exec)
>>                 patch_offset = amdgpu_ring_init_cond_exec(ring);
>>         if (need_pipe_sync)
>>                 amdgpu_ring_emit_pipeline_sync(ring);
>>         if (vm_flush_needed) {
>>                 trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
>> @@ -907,33 +890,20 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct
>> amdgpu_job *job, bool need_
>>                 mutex_unlock(&id_mgr->lock);
>>         }
>>         if (pasid_mapping_needed) {
>>                 id->pasid = job->pasid;
>>                 dma_fence_put(id->pasid_mapping);
>>                 id->pasid_mapping = dma_fence_get(fence);
>>         }
>>         dma_fence_put(fence);
>>   -     if (ring->funcs->emit_gds_switch && gds_switch_needed) {
>> -               id->gds_base = job->gds_base;
>> -               id->gds_size = job->gds_size;
>> -               id->gws_base = job->gws_base;
>> -               id->gws_size = job->gws_size;
>> -               id->oa_base = job->oa_base;
>> -               id->oa_size = job->oa_size;
>> -               amdgpu_ring_emit_gds_switch(ring, job->vmid,
>> job->gds_base,
>> -                                           job->gds_size, job->gws_base,
>> -                                           job->gws_size, job->oa_base,
>> -                                           job->oa_size);
>> -       }
>> -
>>         if (ring->funcs->patch_cond_exec)
>>                 amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>         /* the double SWITCH_BUFFER here *cannot* be skipped by COND_EXEC
>> */
>>         if (ring->funcs->emit_switch_buffer) {
>>                 amdgpu_ring_emit_switch_buffer(ring);
>>                 amdgpu_ring_emit_switch_buffer(ring);
>>         }
>>         return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> index a15d9c0f233b..f5228e169c3a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> @@ -1890,21 +1890,21 @@ static void gfx_v7_0_config_init(struct
>> amdgpu_device *adev)
>>    *
>>    * @adev: amdgpu_device pointer
>>    *
>>    * Configures the 3D engine and tiling configuration
>>    * registers so that the 3D engine is usable.
>>    */
>>   static void gfx_v7_0_gpu_init(struct amdgpu_device *adev)
>>   {
>>         u32 sh_mem_cfg, sh_static_mem_cfg, sh_mem_base;
>>         u32 tmp;
>> -       int i;
>> +       int i, vmid;
>>         WREG32(mmGRBM_CNTL, (0xff << GRBM_CNTL__READ_TIMEOUT__SHIFT));
>>         WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>         WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>         WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
>>         gfx_v7_0_tiling_mode_table_init(adev);
>>         gfx_v7_0_setup_rb(adev);
>> @@ -2014,20 +2014,42 @@ static void gfx_v7_0_gpu_init(struct amdgpu_device
>> *adev)
>>         tmp = RREG32(mmSPI_ARB_PRIORITY);
>>         tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
>>         tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
>>         tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
>>         tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
>>         WREG32(mmSPI_ARB_PRIORITY, tmp);
>>         mutex_unlock(&adev->grbm_idx_mutex);
>>   +     for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
>> +               unsigned gds_size, gws_size, oa_size;
>> +
>> +               if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids)
>> {
>> +                       gds_size = adev->gds.mem.gfx_size_per_vmid;
>> +                       gws_size = adev->gds.gws.gfx_size_per_vmid;
>> +                       oa_size = adev->gds.oa.gfx_size_per_vmid;
>> +               } else {
>> +                       gds_size = adev->gds.mem.kfd_size_per_vmid;
>> +                       gws_size = adev->gds.gws.kfd_size_per_vmid;
>> +                       oa_size = adev->gds.oa.kfd_size_per_vmid;
>> +               }
>> +
>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid *
>> gds_size);
>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
>> +               WREG32(amdgpu_gds_reg_offset[vmid].gws,
>> +                      (vmid * gws_size) |
>> +                      (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
>> +               WREG32(amdgpu_gds_reg_offset[vmid].oa,
>> +                      ((1 << oa_size) - 1) << (vmid * oa_size));
>> +       }
>> +
>>         udelay(50);
>>   }
>>     /*
>>    * GPU scratch registers helpers function.
>>    */
>>   /**
>>    * gfx_v7_0_scratch_init - setup driver info for CP scratch regs
>>    *
>>    * @adev: amdgpu_device pointer
>> @@ -4157,68 +4179,20 @@ static uint64_t
>> gfx_v7_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>>         uint64_t clock;
>>         mutex_lock(&adev->gfx.gpu_clock_mutex);
>>         WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
>>         clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
>>                 ((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
>>         mutex_unlock(&adev->gfx.gpu_clock_mutex);
>>         return clock;
>>   }
>>   -static void gfx_v7_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>> -                                         uint32_t vmid,
>> -                                         uint32_t gds_base, uint32_t
>> gds_size,
>> -                                         uint32_t gws_base, uint32_t
>> gws_size,
>> -                                         uint32_t oa_base, uint32_t
>> oa_size)
>> -{
>> -       gds_base = gds_base >> AMDGPU_GDS_SHIFT;
>> -       gds_size = gds_size >> AMDGPU_GDS_SHIFT;
>> -
>> -       gws_base = gws_base >> AMDGPU_GWS_SHIFT;
>> -       gws_size = gws_size >> AMDGPU_GWS_SHIFT;
>> -
>> -       oa_base = oa_base >> AMDGPU_OA_SHIFT;
>> -       oa_size = oa_size >> AMDGPU_OA_SHIFT;
>> -
>> -       /* GDS Base */
>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>> -                               WRITE_DATA_DST_SEL(0)));
>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_base);
>> -       amdgpu_ring_write(ring, 0);
>> -       amdgpu_ring_write(ring, gds_base);
>> -
>> -       /* GDS Size */
>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>> -                               WRITE_DATA_DST_SEL(0)));
>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_size);
>> -       amdgpu_ring_write(ring, 0);
>> -       amdgpu_ring_write(ring, gds_size);
>> -
>> -       /* GWS */
>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>> -                               WRITE_DATA_DST_SEL(0)));
>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].gws);
>> -       amdgpu_ring_write(ring, 0);
>> -       amdgpu_ring_write(ring, gws_size << GDS_GWS_VMID0__SIZE__SHIFT |
>> gws_base);
>> -
>> -       /* OA */
>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>> -                               WRITE_DATA_DST_SEL(0)));
>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
>> -       amdgpu_ring_write(ring, 0);
>> -       amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 <<
>> oa_base));
>> -}
>> -
>>   static void gfx_v7_0_ring_soft_recovery(struct amdgpu_ring *ring,
>> unsigned vmid)
>>   {
>>         struct amdgpu_device *adev = ring->adev;
>>         uint32_t value = 0;
>>         value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
>>         value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
>>         value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
>>         value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
>>         WREG32(mmSQ_CMD, value);
>> @@ -4584,55 +4558,32 @@ static int gfx_v7_0_sw_init(void *handle)
>>                                                                 ring_id,
>>                                                                 i, k, j);
>>                                 if (r)
>>                                         return r;
>>                                 ring_id++;
>>                         }
>>                 }
>>         }
>>   -     /* reserve GDS, GWS and OA resource for gfx */
>> -       r = amdgpu_bo_create_kernel(adev,
>> adev->gds.mem.gfx_partition_size,
>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
>> -                                   &adev->gds.gds_gfx_bo, NULL, NULL);
>> -       if (r)
>> -               return r;
>> -
>> -       r = amdgpu_bo_create_kernel(adev,
>> adev->gds.gws.gfx_partition_size,
>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
>> -                                   &adev->gds.gws_gfx_bo, NULL, NULL);
>> -       if (r)
>> -               return r;
>> -
>> -       r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
>> -                                   &adev->gds.oa_gfx_bo, NULL, NULL);
>> -       if (r)
>> -               return r;
>> -
>>         adev->gfx.ce_ram_size = 0x8000;
>>         gfx_v7_0_gpu_early_init(adev);
>>         return r;
>>   }
>>     static int gfx_v7_0_sw_fini(void *handle)
>>   {
>>         int i;
>>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   -     amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
>> -       amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
>> -       amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
>> -
>>         for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>>                 amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>>         for (i = 0; i < adev->gfx.num_compute_rings; i++)
>>                 amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>>         gfx_v7_0_cp_compute_fini(adev);
>>         gfx_v7_0_rlc_fini(adev);
>>         gfx_v7_0_mec_fini(adev);
>>         amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
>>                                 &adev->gfx.rlc.clear_state_gpu_addr,
>> @@ -5073,64 +5024,60 @@ static const struct amd_ip_funcs gfx_v7_0_ip_funcs
>> = {
>>     static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>         .type = AMDGPU_RING_TYPE_GFX,
>>         .align_mask = 0xff,
>>         .nop = PACKET3(PACKET3_NOP, 0x3FFF),
>>         .support_64bit_ptrs = false,
>>         .get_rptr = gfx_v7_0_ring_get_rptr,
>>         .get_wptr = gfx_v7_0_ring_get_wptr_gfx,
>>         .set_wptr = gfx_v7_0_ring_set_wptr_gfx,
>>         .emit_frame_size =
>> -               20 + /* gfx_v7_0_ring_emit_gds_switch */
>>                 7 + /* gfx_v7_0_ring_emit_hdp_flush */
>>                 5 + /* hdp invalidate */
>>                 12 + 12 + 12 + /* gfx_v7_0_ring_emit_fence_gfx x3 for user
>> fence, vm fence */
>>                 7 + 4 + /* gfx_v7_0_ring_emit_pipeline_sync */
>>                 CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + 6 + /*
>> gfx_v7_0_ring_emit_vm_flush */
>>                 3 + 4, /* gfx_v7_ring_emit_cntxcntl including vgt flush*/
>>         .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_gfx */
>>         .emit_ib = gfx_v7_0_ring_emit_ib_gfx,
>>         .emit_fence = gfx_v7_0_ring_emit_fence_gfx,
>>         .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>>         .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
>> -       .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>>         .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>>         .test_ring = gfx_v7_0_ring_test_ring,
>>         .test_ib = gfx_v7_0_ring_test_ib,
>>         .insert_nop = amdgpu_ring_insert_nop,
>>         .pad_ib = amdgpu_ring_generic_pad_ib,
>>         .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>         .emit_wreg = gfx_v7_0_ring_emit_wreg,
>>         .soft_recovery = gfx_v7_0_ring_soft_recovery,
>>   };
>>     static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
>>         .type = AMDGPU_RING_TYPE_COMPUTE,
>>         .align_mask = 0xff,
>>         .nop = PACKET3(PACKET3_NOP, 0x3FFF),
>>         .support_64bit_ptrs = false,
>>         .get_rptr = gfx_v7_0_ring_get_rptr,
>>         .get_wptr = gfx_v7_0_ring_get_wptr_compute,
>>         .set_wptr = gfx_v7_0_ring_set_wptr_compute,
>>         .emit_frame_size =
>> -               20 + /* gfx_v7_0_ring_emit_gds_switch */
>>                 7 + /* gfx_v7_0_ring_emit_hdp_flush */
>>                 5 + /* hdp invalidate */
>>                 7 + /* gfx_v7_0_ring_emit_pipeline_sync */
>>                 CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /*
>> gfx_v7_0_ring_emit_vm_flush */
>>                 7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user
>> fence, vm fence */
>>         .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */
>>         .emit_ib = gfx_v7_0_ring_emit_ib_compute,
>>         .emit_fence = gfx_v7_0_ring_emit_fence_compute,
>>         .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>>         .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
>> -       .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>>         .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>>         .test_ring = gfx_v7_0_ring_test_ring,
>>         .test_ib = gfx_v7_0_ring_test_ib,
>>         .insert_nop = amdgpu_ring_insert_nop,
>>         .pad_ib = amdgpu_ring_generic_pad_ib,
>>         .emit_wreg = gfx_v7_0_ring_emit_wreg,
>>   };
>>     static void gfx_v7_0_set_ring_funcs(struct amdgpu_device *adev)
>>   {
>> @@ -5169,42 +5116,28 @@ static void gfx_v7_0_set_irq_funcs(struct
>> amdgpu_device *adev)
>>         adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
>>   }
>>     static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
>>   {
>>         /* init asic gds info */
>>         adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>>         adev->gds.gws.total_size = 64;
>>         adev->gds.oa.total_size = 16;
>>   -     if (adev->gds.mem.total_size == 64 * 1024) {
>> -               adev->gds.mem.gfx_partition_size = 4096;
>> -               adev->gds.mem.cs_partition_size = 4096;
>> -
>> -               adev->gds.gws.gfx_partition_size = 4;
>> -               adev->gds.gws.cs_partition_size = 4;
>> -
>> -               adev->gds.oa.gfx_partition_size = 4;
>> -               adev->gds.oa.cs_partition_size = 1;
>> -       } else {
>> -               adev->gds.mem.gfx_partition_size = 1024;
>> -               adev->gds.mem.cs_partition_size = 1024;
>> -
>> -               adev->gds.gws.gfx_partition_size = 16;
>> -               adev->gds.gws.cs_partition_size = 16;
>> -
>> -               adev->gds.oa.gfx_partition_size = 4;
>> -               adev->gds.oa.cs_partition_size = 4;
>> -       }
>> +       adev->gds.mem.gfx_size_per_vmid = adev->gds.mem.total_size /
>> AMDGPU_NUM_VMID;
>> +       adev->gds.mem.kfd_size_per_vmid = adev->gds.mem.total_size /
>> AMDGPU_NUM_VMID;
>> +       adev->gds.gws.gfx_size_per_vmid = adev->gds.gws.total_size /
>> AMDGPU_NUM_VMID;
>> +       adev->gds.gws.kfd_size_per_vmid = adev->gds.gws.total_size /
>> AMDGPU_NUM_VMID;
>> +       adev->gds.oa.gfx_size_per_vmid = adev->gds.oa.total_size / 8; /*
>> gfx only */
>> +       adev->gds.oa.kfd_size_per_vmid = 0;
>>   }
>>   -
>>   static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev)
>>   {
>>         int i, j, k, counter, active_cu_number = 0;
>>         u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0;
>>         struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
>>         unsigned disable_masks[4 * 2];
>>         u32 ao_cu_num;
>>         if (adev->flags & AMD_IS_APU)
>>                 ao_cu_num = 2;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index 3882689b2d8f..b11a54bd0668 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -2154,57 +2154,34 @@ static int gfx_v8_0_sw_init(void *handle)
>>         kiq = &adev->gfx.kiq;
>>         r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
>>         if (r)
>>                 return r;
>>         /* create MQD for all compute queues as well as KIQ for SRIOV case
>> */
>>         r = amdgpu_gfx_compute_mqd_sw_init(adev, sizeof(struct
>> vi_mqd_allocation));
>>         if (r)
>>                 return r;
>>   -     /* reserve GDS, GWS and OA resource for gfx */
>> -       r = amdgpu_bo_create_kernel(adev,
>> adev->gds.mem.gfx_partition_size,
>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
>> -                                   &adev->gds.gds_gfx_bo, NULL, NULL);
>> -       if (r)
>> -               return r;
>> -
>> -       r = amdgpu_bo_create_kernel(adev,
>> adev->gds.gws.gfx_partition_size,
>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
>> -                                   &adev->gds.gws_gfx_bo, NULL, NULL);
>> -       if (r)
>> -               return r;
>> -
>> -       r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
>> -                                   &adev->gds.oa_gfx_bo, NULL, NULL);
>> -       if (r)
>> -               return r;
>> -
>>         adev->gfx.ce_ram_size = 0x8000;
>>         r = gfx_v8_0_gpu_early_init(adev);
>>         if (r)
>>                 return r;
>>         return 0;
>>   }
>>     static int gfx_v8_0_sw_fini(void *handle)
>>   {
>>         int i;
>>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>   -     amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
>> -       amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
>> -       amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
>> -
>>         for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>>                 amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>>         for (i = 0; i < adev->gfx.num_compute_rings; i++)
>>                 amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>>         amdgpu_gfx_compute_mqd_sw_fini(adev);
>>         amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
>>         amdgpu_gfx_kiq_fini(adev);
>>         gfx_v8_0_mec_fini(adev);
>> @@ -3850,21 +3827,21 @@ static void gfx_v8_0_config_init(struct
>> amdgpu_device *adev)
>>         case CHIP_CARRIZO:
>>         case CHIP_STONEY:
>>                 adev->gfx.config.double_offchip_lds_buf = 0;
>>                 break;
>>         }
>>   }
>>     static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
>>   {
>>         u32 tmp, sh_static_mem_cfg;
>> -       int i;
>> +       int i, vmid;
>>         WREG32_FIELD(GRBM_CNTL, READ_TIMEOUT, 0xFF);
>>         WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>         WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>         WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
>>         gfx_v8_0_tiling_mode_table_init(adev);
>>         gfx_v8_0_setup_rb(adev);
>>         gfx_v8_0_get_cu_info(adev);
>>         gfx_v8_0_config_init(adev);
>> @@ -3927,20 +3904,41 @@ static void gfx_v8_0_gpu_init(struct amdgpu_device
>> *adev)
>>         tmp = RREG32(mmSPI_ARB_PRIORITY);
>>         tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
>>         tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
>>         tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
>>         tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
>>         WREG32(mmSPI_ARB_PRIORITY, tmp);
>>         mutex_unlock(&adev->grbm_idx_mutex);
>>   +     for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
>> +               unsigned gds_size, gws_size, oa_size;
>> +
>> +               if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids)
>> {
>> +                       gds_size = adev->gds.mem.gfx_size_per_vmid;
>> +                       gws_size = adev->gds.gws.gfx_size_per_vmid;
>> +                       oa_size = adev->gds.oa.gfx_size_per_vmid;
>> +               } else {
>> +                       gds_size = adev->gds.mem.kfd_size_per_vmid;
>> +                       gws_size = adev->gds.gws.kfd_size_per_vmid;
>> +                       oa_size = adev->gds.oa.kfd_size_per_vmid;
>> +               }
>> +
>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid *
>> gds_size);
>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
>> +               WREG32(amdgpu_gds_reg_offset[vmid].gws,
>> +                      (vmid * gws_size) |
>> +                      (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
>> +               WREG32(amdgpu_gds_reg_offset[vmid].oa,
>> +                      ((1 << oa_size) - 1) << (vmid * oa_size));
>> +       }
>>   }
>>     static void gfx_v8_0_wait_for_rlc_serdes(struct amdgpu_device *adev)
>>   {
>>         u32 i, j, k;
>>         u32 mask;
>>         mutex_lock(&adev->grbm_idx_mutex);
>>         for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
>>                 for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
>> @@ -5383,68 +5381,20 @@ static uint64_t
>> gfx_v8_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>>         uint64_t clock;
>>         mutex_lock(&adev->gfx.gpu_clock_mutex);
>>         WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
>>         clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
>>                 ((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
>>         mutex_unlock(&adev->gfx.gpu_clock_mutex);
>>         return clock;
>>   }
>>   -static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>> -                                         uint32_t vmid,
>> -                                         uint32_t gds_base, uint32_t
>> gds_size,
>> -                                         uint32_t gws_base, uint32_t
>> gws_size,
>> -                                         uint32_t oa_base, uint32_t
>> oa_size)
>> -{
>> -       gds_base = gds_base >> AMDGPU_GDS_SHIFT;
>> -       gds_size = gds_size >> AMDGPU_GDS_SHIFT;
>> -
>> -       gws_base = gws_base >> AMDGPU_GWS_SHIFT;
>> -       gws_size = gws_size >> AMDGPU_GWS_SHIFT;
>> -
>> -       oa_base = oa_base >> AMDGPU_OA_SHIFT;
>> -       oa_size = oa_size >> AMDGPU_OA_SHIFT;
>> -
>> -       /* GDS Base */
>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>> -                               WRITE_DATA_DST_SEL(0)));
>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_base);
>> -       amdgpu_ring_write(ring, 0);
>> -       amdgpu_ring_write(ring, gds_base);
>> -
>> -       /* GDS Size */
>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>> -                               WRITE_DATA_DST_SEL(0)));
>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_size);
>> -       amdgpu_ring_write(ring, 0);
>> -       amdgpu_ring_write(ring, gds_size);
>> -
>> -       /* GWS */
>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>> -                               WRITE_DATA_DST_SEL(0)));
>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].gws);
>> -       amdgpu_ring_write(ring, 0);
>> -       amdgpu_ring_write(ring, gws_size << GDS_GWS_VMID0__SIZE__SHIFT |
>> gws_base);
>> -
>> -       /* OA */
>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>> -                               WRITE_DATA_DST_SEL(0)));
>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
>> -       amdgpu_ring_write(ring, 0);
>> -       amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 <<
>> oa_base));
>> -}
>> -
>>   static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd,
>> uint32_t wave, uint32_t address)
>>   {
>>         WREG32(mmSQ_IND_INDEX,
>>                 (wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
>>                 (simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
>>                 (address << SQ_IND_INDEX__INDEX__SHIFT) |
>>                 (SQ_IND_INDEX__FORCE_READ_MASK));
>>         return RREG32(mmSQ_IND_DATA);
>>   }
>>   @@ -7132,21 +7082,20 @@ static const struct amdgpu_ring_funcs
>> gfx_v8_0_ring_funcs_gfx = {
>>                 31 + /* DE_META */
>>                 3 + /* CNTX_CTRL */
>>                 5 + /* HDP_INVL */
>>                 8 + 8 + /* FENCE x2 */
>>                 2, /* SWITCH_BUFFER */
>>         .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_gfx */
>>         .emit_ib = gfx_v8_0_ring_emit_ib_gfx,
>>         .emit_fence = gfx_v8_0_ring_emit_fence_gfx,
>>         .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
>>         .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
>> -       .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
>>         .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
>>         .test_ring = gfx_v8_0_ring_test_ring,
>>         .test_ib = gfx_v8_0_ring_test_ib,
>>         .insert_nop = amdgpu_ring_insert_nop,
>>         .pad_ib = amdgpu_ring_generic_pad_ib,
>>         .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>         .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>         .init_cond_exec = gfx_v8_0_ring_emit_init_cond_exec,
>>         .patch_cond_exec = gfx_v8_0_ring_emit_patch_cond_exec,
>>         .emit_wreg = gfx_v8_0_ring_emit_wreg,
>> @@ -7155,51 +7104,48 @@ static const struct amdgpu_ring_funcs
>> gfx_v8_0_ring_funcs_gfx = {
>>     static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
>>         .type = AMDGPU_RING_TYPE_COMPUTE,
>>         .align_mask = 0xff,
>>         .nop = PACKET3(PACKET3_NOP, 0x3FFF),
>>         .support_64bit_ptrs = false,
>>         .get_rptr = gfx_v8_0_ring_get_rptr,
Are you sure of that? I mean it is rather pointless to have a Global 
Data Share when it can't be used to share anything?

On the other hand I'm not opposed to getting rid of all that stuff if we 
really don't need it.

Christian.

Am 13.09.2018 um 17:27 schrieb Marek Olšák:
> That's OK. We don't need IBs to get the same VMID.
>
> Marek
>
> On Thu, Sep 13, 2018 at 4:40 AM, Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
> >> As discussed internally that doesn't work because threads don't necessarily
> >> get the same VMID assigned.
>>
>> Christian.
>>
>> Am 12.09.2018 um 22:33 schrieb Marek Olšák:
>>> From: Marek Olšák <marek.olsak@amd.com>
>>>
>>> I've chosen to do it like this because it's easy and allows an arbitrary
>>> number of processes.
>>>
>>> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  10 --
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |   3 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      |  20 ----
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h     |  19 +--
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c     |  24 +---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c     |   6 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h     |   7 --
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |   3 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  14 +--
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  |  21 ----
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |   6 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |   5 -
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     |  61 ----------
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |   8 --
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  34 +-----
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c       | 125 +++++---------------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c       | 123 +++++--------------
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c       | 124 ++++++-------------
>>>    include/uapi/drm/amdgpu_drm.h               |  15 +--
>>>    19 files changed, 109 insertions(+), 519 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> index b80243d3972e..7264a4930b88 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>> @@ -71,23 +71,20 @@ int amdgpu_bo_list_create(struct amdgpu_device *adev,
>>> struct drm_file *filp,
>>>                                  / sizeof(struct amdgpu_bo_list_entry))
>>>                  return -EINVAL;
>>>          size = sizeof(struct amdgpu_bo_list);
>>>          size += num_entries * sizeof(struct amdgpu_bo_list_entry);
>>>          list = kvmalloc(size, GFP_KERNEL);
>>>          if (!list)
>>>                  return -ENOMEM;
>>>          kref_init(&list->refcount);
>>> -       list->gds_obj = adev->gds.gds_gfx_bo;
>>> -       list->gws_obj = adev->gds.gws_gfx_bo;
>>> -       list->oa_obj = adev->gds.oa_gfx_bo;
>>>          array = amdgpu_bo_list_array_entry(list, 0);
>>>          memset(array, 0, num_entries * sizeof(struct
>>> amdgpu_bo_list_entry));
>>>          for (i = 0; i < num_entries; ++i) {
>>>                  struct amdgpu_bo_list_entry *entry;
>>>                  struct drm_gem_object *gobj;
>>>                  struct amdgpu_bo *bo;
>>>                  struct mm_struct *usermm;
>>>    @@ -111,27 +108,20 @@ int amdgpu_bo_list_create(struct amdgpu_device
>>> *adev, struct drm_file *filp,
>>>                  } else {
>>>                          entry = &array[last_entry++];
>>>                  }
>>>                  entry->robj = bo;
>>>                  entry->priority = min(info[i].bo_priority,
>>>                                        AMDGPU_BO_LIST_MAX_PRIORITY);
>>>                  entry->tv.bo = &entry->robj->tbo;
>>>                  entry->tv.shared = !entry->robj->prime_shared_count;
>>>    -             if (entry->robj->preferred_domains ==
>>> AMDGPU_GEM_DOMAIN_GDS)
>>> -                       list->gds_obj = entry->robj;
>>> -               if (entry->robj->preferred_domains ==
>>> AMDGPU_GEM_DOMAIN_GWS)
>>> -                       list->gws_obj = entry->robj;
>>> -               if (entry->robj->preferred_domains ==
>>> AMDGPU_GEM_DOMAIN_OA)
>>> -                       list->oa_obj = entry->robj;
>>> -
>>>                  total_size += amdgpu_bo_size(entry->robj);
>>>                  trace_amdgpu_bo_list_set(list, entry->robj);
>>>          }
>>>          list->first_userptr = first_userptr;
>>>          list->num_entries = num_entries;
>>>          trace_amdgpu_cs_bo_status(list->num_entries, total_size);
>>>          *result = list;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> index 61b089768e1c..30f12a60aa28 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>> @@ -36,23 +36,20 @@ struct amdgpu_bo_list_entry {
>>>          struct ttm_validate_buffer      tv;
>>>          struct amdgpu_bo_va             *bo_va;
>>>          uint32_t                        priority;
>>>          struct page                     **user_pages;
>>>          int                             user_invalidated;
>>>    };
>>>      struct amdgpu_bo_list {
>>>          struct rcu_head rhead;
>>>          struct kref refcount;
>>> -       struct amdgpu_bo *gds_obj;
>>> -       struct amdgpu_bo *gws_obj;
>>> -       struct amdgpu_bo *oa_obj;
>>>          unsigned first_userptr;
>>>          unsigned num_entries;
>>>    };
>>>      int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
>>>                         struct amdgpu_bo_list **result);
>>>    void amdgpu_bo_list_get_list(struct amdgpu_bo_list *list,
>>>                               struct list_head *validated);
>>>    void amdgpu_bo_list_put(struct amdgpu_bo_list *list);
>>>    int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 1081fd00b059..88b58facf29e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -565,23 +565,20 @@ static int amdgpu_cs_list_validate(struct
>>> amdgpu_cs_parser *p,
>>>          return 0;
>>>    }
>>>      static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>>>                                  union drm_amdgpu_cs *cs)
>>>    {
>>>          struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>>>          struct amdgpu_vm *vm = &fpriv->vm;
>>>          struct amdgpu_bo_list_entry *e;
>>>          struct list_head duplicates;
>>> -       struct amdgpu_bo *gds;
>>> -       struct amdgpu_bo *gws;
>>> -       struct amdgpu_bo *oa;
>>>          unsigned tries = 10;
>>>          int r;
>>>          INIT_LIST_HEAD(&p->validated);
>>>          /* p->bo_list could already be assigned if
>>> AMDGPU_CHUNK_ID_BO_HANDLES is present */
>>>          if (cs->in.bo_list_handle) {
>>>                  if (p->bo_list)
>>>                          return -EINVAL;
>>>    @@ -705,40 +702,23 @@ static int amdgpu_cs_parser_bos(struct
>>> amdgpu_cs_parser *p,
>>>          r = amdgpu_cs_list_validate(p, &p->validated);
>>>          if (r) {
>>>                  DRM_ERROR("amdgpu_cs_list_validate(validated) failed.\n");
>>>                  goto error_validate;
>>>          }
>>>          amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
>>>                                       p->bytes_moved_vis);
>>>    -     gds = p->bo_list->gds_obj;
>>> -       gws = p->bo_list->gws_obj;
>>> -       oa = p->bo_list->oa_obj;
>>> -
>>>          amdgpu_bo_list_for_each_entry(e, p->bo_list)
>>>                  e->bo_va = amdgpu_vm_bo_find(vm, e->robj);
>>>    -     if (gds) {
>>> -               p->job->gds_base = amdgpu_bo_gpu_offset(gds);
>>> -               p->job->gds_size = amdgpu_bo_size(gds);
>>> -       }
>>> -       if (gws) {
>>> -               p->job->gws_base = amdgpu_bo_gpu_offset(gws);
>>> -               p->job->gws_size = amdgpu_bo_size(gws);
>>> -       }
>>> -       if (oa) {
>>> -               p->job->oa_base = amdgpu_bo_gpu_offset(oa);
>>> -               p->job->oa_size = amdgpu_bo_size(oa);
>>> -       }
>>> -
>>>          if (!r && p->uf_entry.robj) {
>>>                  struct amdgpu_bo *uf = p->uf_entry.robj;
>>>                  r = amdgpu_ttm_alloc_gart(&uf->tbo);
>>>                  p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
>>>          }
>>>      error_validate:
>>>          if (r)
>>>                  ttm_eu_backoff_reservation(&p->ticket, &p->validated);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>>> index e73728d90388..69ba25c2e921 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>>> @@ -17,48 +17,33 @@
>>>     * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES
>>> OR
>>>     * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>>     * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>>     * OTHER DEALINGS IN THE SOFTWARE.
>>>     *
>>>     */
>>>      #ifndef __AMDGPU_GDS_H__
>>>    #define __AMDGPU_GDS_H__
>>>    -/* Because TTM request that alloacted buffer should be PAGE_SIZE
>>> aligned,
>>> - * we should report GDS/GWS/OA size as PAGE_SIZE aligned
>>> - * */
>>> -#define AMDGPU_GDS_SHIFT       2
>>> -#define AMDGPU_GWS_SHIFT       PAGE_SHIFT
>>> -#define AMDGPU_OA_SHIFT                PAGE_SHIFT
>>> -
>>>    struct amdgpu_ring;
>>>    struct amdgpu_bo;
>>>      struct amdgpu_gds_asic_info {
>>>          uint32_t        total_size;
>>> -       uint32_t        gfx_partition_size;
>>> -       uint32_t        cs_partition_size;
>>> +       uint32_t        gfx_size_per_vmid;
>>> +       uint32_t        kfd_size_per_vmid;
>>>    };
>>>      struct amdgpu_gds {
>>>          struct amdgpu_gds_asic_info     mem;
>>>          struct amdgpu_gds_asic_info     gws;
>>>          struct amdgpu_gds_asic_info     oa;
>>> -       /* At present, GDS, GWS and OA resources for gfx (graphics)
>>> -        * is always pre-allocated and available for graphics operation.
>>> -        * Such resource is shared between all gfx clients.
>>> -        * TODO: move this operation to user space
>>> -        * */
>>> -       struct amdgpu_bo*               gds_gfx_bo;
>>> -       struct amdgpu_bo*               gws_gfx_bo;
>>> -       struct amdgpu_bo*               oa_gfx_bo;
>>>    };
>>>      struct amdgpu_gds_reg_offset {
>>>          uint32_t        mem_base;
>>>          uint32_t        mem_size;
>>>          uint32_t        gws;
>>>          uint32_t        oa;
>>>    };
>>>      #endif /* __AMDGPU_GDS_H__ */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> index d30a0838851b..c87ad4b4d0b6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>> @@ -223,43 +223,25 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev,
>>> void *data,
>>>          if (flags & ~(AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>                        AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
>>>                        AMDGPU_GEM_CREATE_CPU_GTT_USWC |
>>>                        AMDGPU_GEM_CREATE_VRAM_CLEARED |
>>>                        AMDGPU_GEM_CREATE_VM_ALWAYS_VALID |
>>>                        AMDGPU_GEM_CREATE_EXPLICIT_SYNC))
>>>                  return -EINVAL;
>>>          /* reject invalid gem domains */
>>> -       if (args->in.domains & ~AMDGPU_GEM_DOMAIN_MASK)
>>> +       if (args->in.domains & ~(AMDGPU_GEM_DOMAIN_CPU |
>>> +                                AMDGPU_GEM_DOMAIN_GTT |
>>> +                                AMDGPU_GEM_DOMAIN_VRAM))
>>>                  return -EINVAL;
>>>    -     /* create a gem object to contain this object in */
>>> -       if (args->in.domains & (AMDGPU_GEM_DOMAIN_GDS |
>>> -           AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA)) {
>>> -               if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
>>> -                       /* if gds bo is created from user space, it must
>>> be
>>> -                        * passed to bo list
>>> -                        */
>>> -                       DRM_ERROR("GDS bo cannot be per-vm-bo\n");
>>> -                       return -EINVAL;
>>> -               }
>>> -               flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
>>> -               if (args->in.domains == AMDGPU_GEM_DOMAIN_GDS)
>>> -                       size = size << AMDGPU_GDS_SHIFT;
>>> -               else if (args->in.domains == AMDGPU_GEM_DOMAIN_GWS)
>>> -                       size = size << AMDGPU_GWS_SHIFT;
>>> -               else if (args->in.domains == AMDGPU_GEM_DOMAIN_OA)
>>> -                       size = size << AMDGPU_OA_SHIFT;
>>> -               else
>>> -                       return -EINVAL;
>>> -       }
>>>          size = roundup(size, PAGE_SIZE);
>>>          if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
>>>                  r = amdgpu_bo_reserve(vm->root.base.bo, false);
>>>                  if (r)
>>>                          return r;
>>>                  resv = vm->root.base.bo->tbo.resv;
>>>          }
>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> index 3a072a7a39f0..c2e6a1a11d7f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> @@ -516,26 +516,20 @@ void amdgpu_vmid_free_reserved(struct amdgpu_device
>>> *adev,
>>>     * Reset saved GDW, GWS and OA to force switch on next flush.
>>>     */
>>>    void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>>>                         unsigned vmid)
>>>    {
>>>          struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>>>          struct amdgpu_vmid *id = &id_mgr->ids[vmid];
>>>          mutex_lock(&id_mgr->lock);
>>>          id->owner = 0;
>>> -       id->gds_base = 0;
>>> -       id->gds_size = 0;
>>> -       id->gws_base = 0;
>>> -       id->gws_size = 0;
>>> -       id->oa_base = 0;
>>> -       id->oa_size = 0;
>>>          mutex_unlock(&id_mgr->lock);
>>>    }
>>>      /**
>>>     * amdgpu_vmid_reset_all - reset VMID to zero
>>>     *
>>>     * @adev: amdgpu device structure
>>>     *
>>>     * Reset VMID to force flush on next use
>>>     */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> index 7625419f0fc2..06078e665532 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>> @@ -44,27 +44,20 @@ struct amdgpu_vmid {
>>>          struct amdgpu_sync      active;
>>>          struct dma_fence        *last_flush;
>>>          uint64_t                owner;
>>>          uint64_t                pd_gpu_addr;
>>>          /* last flushed PD/PT update */
>>>          struct dma_fence        *flushed_updates;
>>>          uint32_t                current_gpu_reset_count;
>>>    -     uint32_t                gds_base;
>>> -       uint32_t                gds_size;
>>> -       uint32_t                gws_base;
>>> -       uint32_t                gws_size;
>>> -       uint32_t                oa_base;
>>> -       uint32_t                oa_size;
>>> -
>>>          unsigned                pasid;
>>>          struct dma_fence        *pasid_mapping;
>>>    };
>>>      struct amdgpu_vmid_mgr {
>>>          struct mutex            lock;
>>>          unsigned                num_ids;
>>>          struct list_head        ids_lru;
>>>          struct amdgpu_vmid      ids[AMDGPU_NUM_VMID];
>>>          atomic_t                reserved_vmid_num;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index 57cfe78a262b..3db553f6ad01 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -42,23 +42,20 @@ struct amdgpu_job {
>>>          struct amdgpu_sync      sched_sync;
>>>          struct amdgpu_ib        *ibs;
>>>          struct dma_fence        *fence; /* the hw fence */
>>>          uint32_t                preamble_status;
>>>          uint32_t                num_ibs;
>>>          void                    *owner;
>>>          bool                    vm_needs_flush;
>>>          uint64_t                vm_pd_addr;
>>>          unsigned                vmid;
>>>          unsigned                pasid;
>>> -       uint32_t                gds_base, gds_size;
>>> -       uint32_t                gws_base, gws_size;
>>> -       uint32_t                oa_base, oa_size;
>>>          uint32_t                vram_lost_counter;
>>>          /* user fence handling */
>>>          uint64_t                uf_addr;
>>>          uint64_t                uf_sequence;
>>>      };
>>>      int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>                       struct amdgpu_job **job, struct amdgpu_vm *vm);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> index 29ac3873eeb0..209954290954 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> @@ -517,27 +517,27 @@ static int amdgpu_info_ioctl(struct drm_device *dev,
>>> void *data, struct drm_file
>>>          case AMDGPU_INFO_VIS_VRAM_USAGE:
>>>                  ui64 =
>>> amdgpu_vram_mgr_vis_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
>>>                  return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT :
>>> 0;
>>>          case AMDGPU_INFO_GTT_USAGE:
>>>                  ui64 =
>>> amdgpu_gtt_mgr_usage(&adev->mman.bdev.man[TTM_PL_TT]);
>>>                  return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT :
>>> 0;
>>>          case AMDGPU_INFO_GDS_CONFIG: {
>>>                  struct drm_amdgpu_info_gds gds_info;
>>>                  memset(&gds_info, 0, sizeof(gds_info));
>>> -               gds_info.gds_gfx_partition_size =
>>> adev->gds.mem.gfx_partition_size >> AMDGPU_GDS_SHIFT;
>>> -               gds_info.compute_partition_size =
>>> adev->gds.mem.cs_partition_size >> AMDGPU_GDS_SHIFT;
>>> -               gds_info.gds_total_size = adev->gds.mem.total_size >>
>>> AMDGPU_GDS_SHIFT;
>>> -               gds_info.gws_per_gfx_partition =
>>> adev->gds.gws.gfx_partition_size >> AMDGPU_GWS_SHIFT;
>>> -               gds_info.gws_per_compute_partition =
>>> adev->gds.gws.cs_partition_size >> AMDGPU_GWS_SHIFT;
>>> -               gds_info.oa_per_gfx_partition =
>>> adev->gds.oa.gfx_partition_size >> AMDGPU_OA_SHIFT;
>>> -               gds_info.oa_per_compute_partition =
>>> adev->gds.oa.cs_partition_size >> AMDGPU_OA_SHIFT;
>>> +               gds_info.gds_gfx_partition_size =
>>> adev->gds.mem.gfx_size_per_vmid;
>>> +               gds_info.compute_partition_size =
>>> adev->gds.mem.kfd_size_per_vmid;
>>> +               gds_info.gds_total_size = adev->gds.mem.total_size;
>>> +               gds_info.gws_per_gfx_partition =
>>> adev->gds.gws.gfx_size_per_vmid;
>>> +               gds_info.gws_per_compute_partition =
>>> adev->gds.gws.kfd_size_per_vmid;
>>> +               gds_info.oa_per_gfx_partition =
>>> adev->gds.oa.gfx_size_per_vmid;
>>> +               gds_info.oa_per_compute_partition =
>>> adev->gds.oa.kfd_size_per_vmid;
>>>                  return copy_to_user(out, &gds_info,
>>>                                      min((size_t)size, sizeof(gds_info))) ?
>>> -EFAULT : 0;
>>>          }
>>>          case AMDGPU_INFO_VRAM_GTT: {
>>>                  struct drm_amdgpu_info_vram_gtt vram_gtt;
>>>                  vram_gtt.vram_size = adev->gmc.real_vram_size -
>>>                          atomic64_read(&adev->vram_pin_size);
>>>                  vram_gtt.vram_cpu_accessible_size =
>>> adev->gmc.visible_vram_size -
>>>                          atomic64_read(&adev->visible_pin_size);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> index de990bdcdd6c..76770a8c29a5 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>> @@ -178,41 +178,20 @@ void amdgpu_bo_placement_from_domain(struct
>>> amdgpu_bo *abo, u32 domain)
>>>                  places[c].lpfn = 0;
>>>                  places[c].flags = TTM_PL_FLAG_SYSTEM;
>>>                  if (flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
>>>                          places[c].flags |= TTM_PL_FLAG_WC |
>>>                                  TTM_PL_FLAG_UNCACHED;
>>>                  else
>>>                          places[c].flags |= TTM_PL_FLAG_CACHED;
>>>                  c++;
>>>          }
>>>    -     if (domain & AMDGPU_GEM_DOMAIN_GDS) {
>>> -               places[c].fpfn = 0;
>>> -               places[c].lpfn = 0;
>>> -               places[c].flags = TTM_PL_FLAG_UNCACHED |
>>> AMDGPU_PL_FLAG_GDS;
>>> -               c++;
>>> -       }
>>> -
>>> -       if (domain & AMDGPU_GEM_DOMAIN_GWS) {
>>> -               places[c].fpfn = 0;
>>> -               places[c].lpfn = 0;
>>> -               places[c].flags = TTM_PL_FLAG_UNCACHED |
>>> AMDGPU_PL_FLAG_GWS;
>>> -               c++;
>>> -       }
>>> -
>>> -       if (domain & AMDGPU_GEM_DOMAIN_OA) {
>>> -               places[c].fpfn = 0;
>>> -               places[c].lpfn = 0;
>>> -               places[c].flags = TTM_PL_FLAG_UNCACHED |
>>> AMDGPU_PL_FLAG_OA;
>>> -               c++;
>>> -       }
>>> -
>>>          if (!c) {
>>>                  places[c].fpfn = 0;
>>>                  places[c].lpfn = 0;
>>>                  places[c].flags = TTM_PL_MASK_CACHING |
>>> TTM_PL_FLAG_SYSTEM;
>>>                  c++;
>>>          }
>>>          BUG_ON(c >= AMDGPU_BO_MAX_PLACEMENTS);
>>>          placement->num_placement = c;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> index 907fdf46d895..e089964cbcb7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>> @@ -120,26 +120,20 @@ static inline struct amdgpu_bo
>>> *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
>>>     */
>>>    static inline unsigned amdgpu_mem_type_to_domain(u32 mem_type)
>>>    {
>>>          switch (mem_type) {
>>>          case TTM_PL_VRAM:
>>>                  return AMDGPU_GEM_DOMAIN_VRAM;
>>>          case TTM_PL_TT:
>>>                  return AMDGPU_GEM_DOMAIN_GTT;
>>>          case TTM_PL_SYSTEM:
>>>                  return AMDGPU_GEM_DOMAIN_CPU;
>>> -       case AMDGPU_PL_GDS:
>>> -               return AMDGPU_GEM_DOMAIN_GDS;
>>> -       case AMDGPU_PL_GWS:
>>> -               return AMDGPU_GEM_DOMAIN_GWS;
>>> -       case AMDGPU_PL_OA:
>>> -               return AMDGPU_GEM_DOMAIN_OA;
>>>          default:
>>>                  break;
>>>          }
>>>          return 0;
>>>    }
>>>      /**
>>>     * amdgpu_bo_reserve - reserve bo
>>>     * @bo:               bo structure
>>>     * @no_intr:  don't return -ERESTARTSYS on pending signal
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 9cc239968e40..f6ea9604e611 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -130,24 +130,20 @@ struct amdgpu_ring_funcs {
>>>          /* command emit functions */
>>>          void (*emit_ib)(struct amdgpu_ring *ring,
>>>                          struct amdgpu_ib *ib,
>>>                          unsigned vmid, bool ctx_switch);
>>>          void (*emit_fence)(struct amdgpu_ring *ring, uint64_t addr,
>>>                             uint64_t seq, unsigned flags);
>>>          void (*emit_pipeline_sync)(struct amdgpu_ring *ring);
>>>          void (*emit_vm_flush)(struct amdgpu_ring *ring, unsigned vmid,
>>>                                uint64_t pd_addr);
>>>          void (*emit_hdp_flush)(struct amdgpu_ring *ring);
>>> -       void (*emit_gds_switch)(struct amdgpu_ring *ring, uint32_t vmid,
>>> -                               uint32_t gds_base, uint32_t gds_size,
>>> -                               uint32_t gws_base, uint32_t gws_size,
>>> -                               uint32_t oa_base, uint32_t oa_size);
>>>          /* testing functions */
>>>          int (*test_ring)(struct amdgpu_ring *ring);
>>>          int (*test_ib)(struct amdgpu_ring *ring, long timeout);
>>>          /* insert NOP packets */
>>>          void (*insert_nop)(struct amdgpu_ring *ring, uint32_t count);
>>>          void (*insert_start)(struct amdgpu_ring *ring);
>>>          void (*insert_end)(struct amdgpu_ring *ring);
>>>          /* pad the indirect buffer to the necessary number of dw */
>>>          void (*pad_ib)(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>>>          unsigned (*init_cond_exec)(struct amdgpu_ring *ring);
>>> @@ -226,21 +222,20 @@ struct amdgpu_ring {
>>>    #define amdgpu_ring_patch_cs_in_place(r, p, ib)
>>> ((r)->funcs->patch_cs_in_place((p), (ib)))
>>>    #define amdgpu_ring_test_ring(r) (r)->funcs->test_ring((r))
>>>    #define amdgpu_ring_test_ib(r, t) (r)->funcs->test_ib((r), (t))
>>>    #define amdgpu_ring_get_rptr(r) (r)->funcs->get_rptr((r))
>>>    #define amdgpu_ring_get_wptr(r) (r)->funcs->get_wptr((r))
>>>    #define amdgpu_ring_set_wptr(r) (r)->funcs->set_wptr((r))
>>>    #define amdgpu_ring_emit_ib(r, ib, vmid, c) (r)->funcs->emit_ib((r),
>>> (ib), (vmid), (c))
>>>    #define amdgpu_ring_emit_pipeline_sync(r)
>>> (r)->funcs->emit_pipeline_sync((r))
>>>    #define amdgpu_ring_emit_vm_flush(r, vmid, addr)
>>> (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>>>    #define amdgpu_ring_emit_fence(r, addr, seq, flags)
>>> (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>>> -#define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as)
>>> (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>    #define amdgpu_ring_emit_switch_buffer(r)
>>> (r)->funcs->emit_switch_buffer((r))
>>>    #define amdgpu_ring_emit_cntxcntl(r, d) (r)->funcs->emit_cntxcntl((r),
>>> (d))
>>>    #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d),
>>> (v))
>>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m)
>>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>    #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m)
>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
>>>    #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>    #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> index 8a158ee922f7..2cc62b0e7ea8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>> @@ -195,30 +195,20 @@ static int amdgpu_init_mem_type(struct ttm_bo_device
>>> *bdev, uint32_t type,
>>>                  break;
>>>          case TTM_PL_VRAM:
>>>                  /* "On-card" video ram */
>>>                  man->func = &amdgpu_vram_mgr_func;
>>>                  man->gpu_offset = adev->gmc.vram_start;
>>>                  man->flags = TTM_MEMTYPE_FLAG_FIXED |
>>>                               TTM_MEMTYPE_FLAG_MAPPABLE;
>>>                  man->available_caching = TTM_PL_FLAG_UNCACHED |
>>> TTM_PL_FLAG_WC;
>>>                  man->default_caching = TTM_PL_FLAG_WC;
>>>                  break;
>>> -       case AMDGPU_PL_GDS:
>>> -       case AMDGPU_PL_GWS:
>>> -       case AMDGPU_PL_OA:
>>> -               /* On-chip GDS memory*/
>>> -               man->func = &ttm_bo_manager_func;
>>> -               man->gpu_offset = 0;
>>> -               man->flags = TTM_MEMTYPE_FLAG_FIXED |
>>> TTM_MEMTYPE_FLAG_CMA;
>>> -               man->available_caching = TTM_PL_FLAG_UNCACHED;
>>> -               man->default_caching = TTM_PL_FLAG_UNCACHED;
>>> -               break;
>>>          default:
>>>                  DRM_ERROR("Unsupported memory type %u\n", (unsigned)type);
>>>                  return -EINVAL;
>>>          }
>>>          return 0;
>>>    }
>>>      /**
>>>     * amdgpu_evict_flags - Compute placement flags
>>>     *
>>> @@ -1039,25 +1029,20 @@ static int amdgpu_ttm_backend_bind(struct ttm_tt
>>> *ttm,
>>>                  if (r) {
>>>                          DRM_ERROR("failed to pin userptr\n");
>>>                          return r;
>>>                  }
>>>          }
>>>          if (!ttm->num_pages) {
>>>                  WARN(1, "nothing to bind %lu pages for mreg %p back
>>> %p!\n",
>>>                       ttm->num_pages, bo_mem, ttm);
>>>          }
>>>    -     if (bo_mem->mem_type == AMDGPU_PL_GDS ||
>>> -           bo_mem->mem_type == AMDGPU_PL_GWS ||
>>> -           bo_mem->mem_type == AMDGPU_PL_OA)
>>> -               return -EINVAL;
>>> -
>>>          if (!amdgpu_gtt_mgr_has_gart_addr(bo_mem)) {
>>>                  gtt->offset = AMDGPU_BO_INVALID_OFFSET;
>>>                  return 0;
>>>          }
>>>          /* compute PTE flags relevant to this BO memory */
>>>          flags = amdgpu_ttm_tt_pte_flags(adev, ttm, bo_mem);
>>>          /* bind pages into GART page tables */
>>>          gtt->offset = ((u64)bo_mem->start << PAGE_SHIFT) -
>>> adev->gmc.gart_start;
>>> @@ -1818,60 +1803,20 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>>>          /* Initialize GTT memory pool */
>>>          r = ttm_bo_init_mm(&adev->mman.bdev, TTM_PL_TT, gtt_size >>
>>> PAGE_SHIFT);
>>>          if (r) {
>>>                  DRM_ERROR("Failed initializing GTT heap.\n");
>>>                  return r;
>>>          }
>>>          DRM_INFO("amdgpu: %uM of GTT memory ready.\n",
>>>                   (unsigned)(gtt_size / (1024 * 1024)));
>>>    -     /* Initialize various on-chip memory pools */
>>> -       adev->gds.mem.total_size = adev->gds.mem.total_size <<
>>> AMDGPU_GDS_SHIFT;
>>> -       adev->gds.mem.gfx_partition_size =
>>> adev->gds.mem.gfx_partition_size << AMDGPU_GDS_SHIFT;
>>> -       adev->gds.mem.cs_partition_size = adev->gds.mem.cs_partition_size
>>> << AMDGPU_GDS_SHIFT;
>>> -       adev->gds.gws.total_size = adev->gds.gws.total_size <<
>>> AMDGPU_GWS_SHIFT;
>>> -       adev->gds.gws.gfx_partition_size =
>>> adev->gds.gws.gfx_partition_size << AMDGPU_GWS_SHIFT;
>>> -       adev->gds.gws.cs_partition_size = adev->gds.gws.cs_partition_size
>>> << AMDGPU_GWS_SHIFT;
>>> -       adev->gds.oa.total_size = adev->gds.oa.total_size <<
>>> AMDGPU_OA_SHIFT;
>>> -       adev->gds.oa.gfx_partition_size = adev->gds.oa.gfx_partition_size
>>> << AMDGPU_OA_SHIFT;
>>> -       adev->gds.oa.cs_partition_size = adev->gds.oa.cs_partition_size <<
>>> AMDGPU_OA_SHIFT;
>>> -       /* GDS Memory */
>>> -       if (adev->gds.mem.total_size) {
>>> -               r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GDS,
>>> -                                  adev->gds.mem.total_size >>
>>> PAGE_SHIFT);
>>> -               if (r) {
>>> -                       DRM_ERROR("Failed initializing GDS heap.\n");
>>> -                       return r;
>>> -               }
>>> -       }
>>> -
>>> -       /* GWS */
>>> -       if (adev->gds.gws.total_size) {
>>> -               r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GWS,
>>> -                                  adev->gds.gws.total_size >>
>>> PAGE_SHIFT);
>>> -               if (r) {
>>> -                       DRM_ERROR("Failed initializing gws heap.\n");
>>> -                       return r;
>>> -               }
>>> -       }
>>> -
>>> -       /* OA */
>>> -       if (adev->gds.oa.total_size) {
>>> -               r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_OA,
>>> -                                  adev->gds.oa.total_size >> PAGE_SHIFT);
>>> -               if (r) {
>>> -                       DRM_ERROR("Failed initializing oa heap.\n");
>>> -                       return r;
>>> -               }
>>> -       }
>>> -
>>>          /* Register debugfs entries for amdgpu_ttm */
>>>          r = amdgpu_ttm_debugfs_init(adev);
>>>          if (r) {
>>>                  DRM_ERROR("Failed to init debugfs\n");
>>>                  return r;
>>>          }
>>>          return 0;
>>>    }
>>>      /**
>>> @@ -1892,26 +1837,20 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
>>>                  return;
>>>          amdgpu_ttm_debugfs_fini(adev);
>>>          amdgpu_ttm_fw_reserve_vram_fini(adev);
>>>          if (adev->mman.aper_base_kaddr)
>>>                  iounmap(adev->mman.aper_base_kaddr);
>>>          adev->mman.aper_base_kaddr = NULL;
>>>          ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_VRAM);
>>>          ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_TT);
>>> -       if (adev->gds.mem.total_size)
>>> -               ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GDS);
>>> -       if (adev->gds.gws.total_size)
>>> -               ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GWS);
>>> -       if (adev->gds.oa.total_size)
>>> -               ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_OA);
>>>          ttm_bo_device_release(&adev->mman.bdev);
>>>          amdgpu_ttm_global_fini(adev);
>>>          adev->mman.initialized = false;
>>>          DRM_INFO("amdgpu: ttm finalized\n");
>>>    }
>>>      /**
>>>     * amdgpu_ttm_set_buffer_funcs_status - enable/disable use of buffer
>>> functions
>>>     *
>>>     * @adev: amdgpu_device pointer
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> index fe8f276e9811..04557a382b19 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> @@ -20,28 +20,20 @@
>>>     * OTHER DEALINGS IN THE SOFTWARE.
>>>     *
>>>     */
>>>      #ifndef __AMDGPU_TTM_H__
>>>    #define __AMDGPU_TTM_H__
>>>      #include "amdgpu.h"
>>>    #include <drm/gpu_scheduler.h>
>>>    -#define AMDGPU_PL_GDS                (TTM_PL_PRIV + 0)
>>> -#define AMDGPU_PL_GWS          (TTM_PL_PRIV + 1)
>>> -#define AMDGPU_PL_OA           (TTM_PL_PRIV + 2)
>>> -
>>> -#define AMDGPU_PL_FLAG_GDS             (TTM_PL_FLAG_PRIV << 0)
>>> -#define AMDGPU_PL_FLAG_GWS             (TTM_PL_FLAG_PRIV << 1)
>>> -#define AMDGPU_PL_FLAG_OA              (TTM_PL_FLAG_PRIV << 2)
>>> -
>>>    #define AMDGPU_GTT_MAX_TRANSFER_SIZE  512
>>>    #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS       2
>>>      struct amdgpu_mman {
>>>          struct ttm_bo_global_ref        bo_global_ref;
>>>          struct drm_global_reference     mem_global_ref;
>>>          struct ttm_bo_device            bdev;
>>>          bool                            mem_global_referenced;
>>>          bool                            initialized;
>>>          void __iomem                    *aper_base_kaddr;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index be1659fedf94..c66f1c6f0ba8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -803,86 +803,69 @@ void amdgpu_vm_check_compute_bug(struct
>>> amdgpu_device *adev)
>>>     * Returns:
>>>     * True if sync is needed.
>>>     */
>>>    bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>>>                                    struct amdgpu_job *job)
>>>    {
>>>          struct amdgpu_device *adev = ring->adev;
>>>          unsigned vmhub = ring->funcs->vmhub;
>>>          struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>>>          struct amdgpu_vmid *id;
>>> -       bool gds_switch_needed;
>>>          bool vm_flush_needed = job->vm_needs_flush ||
>>> ring->has_compute_vm_bug;
>>>          if (job->vmid == 0)
>>>                  return false;
>>>          id = &id_mgr->ids[job->vmid];
>>> -       gds_switch_needed = ring->funcs->emit_gds_switch && (
>>> -               id->gds_base != job->gds_base ||
>>> -               id->gds_size != job->gds_size ||
>>> -               id->gws_base != job->gws_base ||
>>> -               id->gws_size != job->gws_size ||
>>> -               id->oa_base != job->oa_base ||
>>> -               id->oa_size != job->oa_size);
>>>          if (amdgpu_vmid_had_gpu_reset(adev, id))
>>>                  return true;
>>>    -     return vm_flush_needed || gds_switch_needed;
>>> +       return vm_flush_needed;
>>>    }
>>>      /**
>>>     * amdgpu_vm_flush - hardware flush the vm
>>>     *
>>>     * @ring: ring to use for flush
>>>     * @job:  related job
>>>     * @need_pipe_sync: is pipe sync needed
>>>     *
>>>     * Emit a VM flush when it is necessary.
>>>     *
>>>     * Returns:
>>>     * 0 on success, errno otherwise.
>>>     */
>>>    int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>> bool need_pipe_sync)
>>>    {
>>>          struct amdgpu_device *adev = ring->adev;
>>>          unsigned vmhub = ring->funcs->vmhub;
>>>          struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
>>>          struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>>> -       bool gds_switch_needed = ring->funcs->emit_gds_switch && (
>>> -               id->gds_base != job->gds_base ||
>>> -               id->gds_size != job->gds_size ||
>>> -               id->gws_base != job->gws_base ||
>>> -               id->gws_size != job->gws_size ||
>>> -               id->oa_base != job->oa_base ||
>>> -               id->oa_size != job->oa_size);
>>>          bool vm_flush_needed = job->vm_needs_flush;
>>>          bool pasid_mapping_needed = id->pasid != job->pasid ||
>>>                  !id->pasid_mapping ||
>>>                  !dma_fence_is_signaled(id->pasid_mapping);
>>>          struct dma_fence *fence = NULL;
>>>          unsigned patch_offset = 0;
>>>          int r;
>>>          if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>>> -               gds_switch_needed = true;
>>>                  vm_flush_needed = true;
>>>                  pasid_mapping_needed = true;
>>>          }
>>>    -     gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>>>          vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
>>>                          job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>>>          pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
>>>                  ring->funcs->emit_wreg;
>>>    -     if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>> +       if (!vm_flush_needed && !need_pipe_sync)
>>>                  return 0;
>>>          if (ring->funcs->init_cond_exec)
>>>                  patch_offset = amdgpu_ring_init_cond_exec(ring);
>>>          if (need_pipe_sync)
>>>                  amdgpu_ring_emit_pipeline_sync(ring);
>>>          if (vm_flush_needed) {
>>>                  trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
>>> @@ -907,33 +890,20 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct
>>> amdgpu_job *job, bool need_
>>>                  mutex_unlock(&id_mgr->lock);
>>>          }
>>>          if (pasid_mapping_needed) {
>>>                  id->pasid = job->pasid;
>>>                  dma_fence_put(id->pasid_mapping);
>>>                  id->pasid_mapping = dma_fence_get(fence);
>>>          }
>>>          dma_fence_put(fence);
>>>    -     if (ring->funcs->emit_gds_switch && gds_switch_needed) {
>>> -               id->gds_base = job->gds_base;
>>> -               id->gds_size = job->gds_size;
>>> -               id->gws_base = job->gws_base;
>>> -               id->gws_size = job->gws_size;
>>> -               id->oa_base = job->oa_base;
>>> -               id->oa_size = job->oa_size;
>>> -               amdgpu_ring_emit_gds_switch(ring, job->vmid,
>>> job->gds_base,
>>> -                                           job->gds_size, job->gws_base,
>>> -                                           job->gws_size, job->oa_base,
>>> -                                           job->oa_size);
>>> -       }
>>> -
>>>          if (ring->funcs->patch_cond_exec)
>>>                  amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>>          /* the double SWITCH_BUFFER here *cannot* be skipped by COND_EXEC
>>> */
>>>          if (ring->funcs->emit_switch_buffer) {
>>>                  amdgpu_ring_emit_switch_buffer(ring);
>>>                  amdgpu_ring_emit_switch_buffer(ring);
>>>          }
>>>          return 0;
>>>    }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> index a15d9c0f233b..f5228e169c3a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>> @@ -1890,21 +1890,21 @@ static void gfx_v7_0_config_init(struct
>>> amdgpu_device *adev)
>>>     *
>>>     * @adev: amdgpu_device pointer
>>>     *
>>>     * Configures the 3D engine and tiling configuration
>>>     * registers so that the 3D engine is usable.
>>>     */
>>>    static void gfx_v7_0_gpu_init(struct amdgpu_device *adev)
>>>    {
>>>          u32 sh_mem_cfg, sh_static_mem_cfg, sh_mem_base;
>>>          u32 tmp;
>>> -       int i;
>>> +       int i, vmid;
>>>          WREG32(mmGRBM_CNTL, (0xff << GRBM_CNTL__READ_TIMEOUT__SHIFT));
>>>          WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>>          WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>>          WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
>>>          gfx_v7_0_tiling_mode_table_init(adev);
>>>          gfx_v7_0_setup_rb(adev);
>>> @@ -2014,20 +2014,42 @@ static void gfx_v7_0_gpu_init(struct amdgpu_device
>>> *adev)
>>>          tmp = RREG32(mmSPI_ARB_PRIORITY);
>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
>>>          WREG32(mmSPI_ARB_PRIORITY, tmp);
>>>          mutex_unlock(&adev->grbm_idx_mutex);
>>>    +     for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
>>> +               unsigned gds_size, gws_size, oa_size;
>>> +
>>> +               if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids)
>>> {
>>> +                       gds_size = adev->gds.mem.gfx_size_per_vmid;
>>> +                       gws_size = adev->gds.gws.gfx_size_per_vmid;
>>> +                       oa_size = adev->gds.oa.gfx_size_per_vmid;
>>> +               } else {
>>> +                       gds_size = adev->gds.mem.kfd_size_per_vmid;
>>> +                       gws_size = adev->gds.gws.kfd_size_per_vmid;
>>> +                       oa_size = adev->gds.oa.kfd_size_per_vmid;
>>> +               }
>>> +
>>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid *
>>> gds_size);
>>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
>>> +               WREG32(amdgpu_gds_reg_offset[vmid].gws,
>>> +                      (vmid * gws_size) |
>>> +                      (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
>>> +               WREG32(amdgpu_gds_reg_offset[vmid].oa,
>>> +                      ((1 << oa_size) - 1) << (vmid * oa_size));
>>> +       }
>>> +
>>>          udelay(50);
>>>    }
>>>      /*
>>>     * GPU scratch registers helpers function.
>>>     */
>>>    /**
>>>     * gfx_v7_0_scratch_init - setup driver info for CP scratch regs
>>>     *
>>>     * @adev: amdgpu_device pointer
>>> @@ -4157,68 +4179,20 @@ static uint64_t
>>> gfx_v7_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>>>          uint64_t clock;
>>>          mutex_lock(&adev->gfx.gpu_clock_mutex);
>>>          WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
>>>          clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
>>>                  ((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
>>>          mutex_unlock(&adev->gfx.gpu_clock_mutex);
>>>          return clock;
>>>    }
>>>    -static void gfx_v7_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>> -                                         uint32_t vmid,
>>> -                                         uint32_t gds_base, uint32_t
>>> gds_size,
>>> -                                         uint32_t gws_base, uint32_t
>>> gws_size,
>>> -                                         uint32_t oa_base, uint32_t
>>> oa_size)
>>> -{
>>> -       gds_base = gds_base >> AMDGPU_GDS_SHIFT;
>>> -       gds_size = gds_size >> AMDGPU_GDS_SHIFT;
>>> -
>>> -       gws_base = gws_base >> AMDGPU_GWS_SHIFT;
>>> -       gws_size = gws_size >> AMDGPU_GWS_SHIFT;
>>> -
>>> -       oa_base = oa_base >> AMDGPU_OA_SHIFT;
>>> -       oa_size = oa_size >> AMDGPU_OA_SHIFT;
>>> -
>>> -       /* GDS Base */
>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>> -                               WRITE_DATA_DST_SEL(0)));
>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_base);
>>> -       amdgpu_ring_write(ring, 0);
>>> -       amdgpu_ring_write(ring, gds_base);
>>> -
>>> -       /* GDS Size */
>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>> -                               WRITE_DATA_DST_SEL(0)));
>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_size);
>>> -       amdgpu_ring_write(ring, 0);
>>> -       amdgpu_ring_write(ring, gds_size);
>>> -
>>> -       /* GWS */
>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>> -                               WRITE_DATA_DST_SEL(0)));
>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].gws);
>>> -       amdgpu_ring_write(ring, 0);
>>> -       amdgpu_ring_write(ring, gws_size << GDS_GWS_VMID0__SIZE__SHIFT |
>>> gws_base);
>>> -
>>> -       /* OA */
>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>> -                               WRITE_DATA_DST_SEL(0)));
>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
>>> -       amdgpu_ring_write(ring, 0);
>>> -       amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 <<
>>> oa_base));
>>> -}
>>> -
>>>    static void gfx_v7_0_ring_soft_recovery(struct amdgpu_ring *ring,
>>> unsigned vmid)
>>>    {
>>>          struct amdgpu_device *adev = ring->adev;
>>>          uint32_t value = 0;
>>>          value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
>>>          value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
>>>          value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
>>>          value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
>>>          WREG32(mmSQ_CMD, value);
>>> @@ -4584,55 +4558,32 @@ static int gfx_v7_0_sw_init(void *handle)
>>>                                                                  ring_id,
>>>                                                                  i, k, j);
>>>                                  if (r)
>>>                                          return r;
>>>                                  ring_id++;
>>>                          }
>>>                  }
>>>          }
>>>    -     /* reserve GDS, GWS and OA resource for gfx */
>>> -       r = amdgpu_bo_create_kernel(adev,
>>> adev->gds.mem.gfx_partition_size,
>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
>>> -                                   &adev->gds.gds_gfx_bo, NULL, NULL);
>>> -       if (r)
>>> -               return r;
>>> -
>>> -       r = amdgpu_bo_create_kernel(adev,
>>> adev->gds.gws.gfx_partition_size,
>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
>>> -                                   &adev->gds.gws_gfx_bo, NULL, NULL);
>>> -       if (r)
>>> -               return r;
>>> -
>>> -       r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
>>> -                                   &adev->gds.oa_gfx_bo, NULL, NULL);
>>> -       if (r)
>>> -               return r;
>>> -
>>>          adev->gfx.ce_ram_size = 0x8000;
>>>          gfx_v7_0_gpu_early_init(adev);
>>>          return r;
>>>    }
>>>      static int gfx_v7_0_sw_fini(void *handle)
>>>    {
>>>          int i;
>>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>    -     amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
>>> -       amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
>>> -       amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
>>> -
>>>          for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>>>                  amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>>>          for (i = 0; i < adev->gfx.num_compute_rings; i++)
>>>                  amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>>>          gfx_v7_0_cp_compute_fini(adev);
>>>          gfx_v7_0_rlc_fini(adev);
>>>          gfx_v7_0_mec_fini(adev);
>>>          amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
>>>                                  &adev->gfx.rlc.clear_state_gpu_addr,
>>> @@ -5073,64 +5024,60 @@ static const struct amd_ip_funcs gfx_v7_0_ip_funcs
>>> = {
>>>      static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>          .type = AMDGPU_RING_TYPE_GFX,
>>>          .align_mask = 0xff,
>>>          .nop = PACKET3(PACKET3_NOP, 0x3FFF),
>>>          .support_64bit_ptrs = false,
>>>          .get_rptr = gfx_v7_0_ring_get_rptr,
>>>          .get_wptr = gfx_v7_0_ring_get_wptr_gfx,
>>>          .set_wptr = gfx_v7_0_ring_set_wptr_gfx,
>>>          .emit_frame_size =
>>> -               20 + /* gfx_v7_0_ring_emit_gds_switch */
>>>                  7 + /* gfx_v7_0_ring_emit_hdp_flush */
>>>                  5 + /* hdp invalidate */
>>>                  12 + 12 + 12 + /* gfx_v7_0_ring_emit_fence_gfx x3 for user
>>> fence, vm fence */
>>>                  7 + 4 + /* gfx_v7_0_ring_emit_pipeline_sync */
>>>                  CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + 6 + /*
>>> gfx_v7_0_ring_emit_vm_flush */
>>>                  3 + 4, /* gfx_v7_ring_emit_cntxcntl including vgt flush*/
>>>          .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_gfx */
>>>          .emit_ib = gfx_v7_0_ring_emit_ib_gfx,
>>>          .emit_fence = gfx_v7_0_ring_emit_fence_gfx,
>>>          .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>>>          .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
>>> -       .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>>>          .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>>>          .test_ring = gfx_v7_0_ring_test_ring,
>>>          .test_ib = gfx_v7_0_ring_test_ib,
>>>          .insert_nop = amdgpu_ring_insert_nop,
>>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>>          .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>          .emit_wreg = gfx_v7_0_ring_emit_wreg,
>>>          .soft_recovery = gfx_v7_0_ring_soft_recovery,
>>>    };
>>>      static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute = {
>>>          .type = AMDGPU_RING_TYPE_COMPUTE,
>>>          .align_mask = 0xff,
>>>          .nop = PACKET3(PACKET3_NOP, 0x3FFF),
>>>          .support_64bit_ptrs = false,
>>>          .get_rptr = gfx_v7_0_ring_get_rptr,
>>>          .get_wptr = gfx_v7_0_ring_get_wptr_compute,
>>>          .set_wptr = gfx_v7_0_ring_set_wptr_compute,
>>>          .emit_frame_size =
>>> -               20 + /* gfx_v7_0_ring_emit_gds_switch */
>>>                  7 + /* gfx_v7_0_ring_emit_hdp_flush */
>>>                  5 + /* hdp invalidate */
>>>                  7 + /* gfx_v7_0_ring_emit_pipeline_sync */
>>>                  CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /*
>>> gfx_v7_0_ring_emit_vm_flush */
>>>                  7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for user
>>> fence, vm fence */
>>>          .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */
>>>          .emit_ib = gfx_v7_0_ring_emit_ib_compute,
>>>          .emit_fence = gfx_v7_0_ring_emit_fence_compute,
>>>          .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>>>          .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
>>> -       .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>>>          .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>>>          .test_ring = gfx_v7_0_ring_test_ring,
>>>          .test_ib = gfx_v7_0_ring_test_ib,
>>>          .insert_nop = amdgpu_ring_insert_nop,
>>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>>          .emit_wreg = gfx_v7_0_ring_emit_wreg,
>>>    };
>>>      static void gfx_v7_0_set_ring_funcs(struct amdgpu_device *adev)
>>>    {
>>> @@ -5169,42 +5116,28 @@ static void gfx_v7_0_set_irq_funcs(struct
>>> amdgpu_device *adev)
>>>          adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
>>>    }
>>>      static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
>>>    {
>>>          /* init asci gds info */
>>>          adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>>>          adev->gds.gws.total_size = 64;
>>>          adev->gds.oa.total_size = 16;
>>>    -     if (adev->gds.mem.total_size == 64 * 1024) {
>>> -               adev->gds.mem.gfx_partition_size = 4096;
>>> -               adev->gds.mem.cs_partition_size = 4096;
>>> -
>>> -               adev->gds.gws.gfx_partition_size = 4;
>>> -               adev->gds.gws.cs_partition_size = 4;
>>> -
>>> -               adev->gds.oa.gfx_partition_size = 4;
>>> -               adev->gds.oa.cs_partition_size = 1;
>>> -       } else {
>>> -               adev->gds.mem.gfx_partition_size = 1024;
>>> -               adev->gds.mem.cs_partition_size = 1024;
>>> -
>>> -               adev->gds.gws.gfx_partition_size = 16;
>>> -               adev->gds.gws.cs_partition_size = 16;
>>> -
>>> -               adev->gds.oa.gfx_partition_size = 4;
>>> -               adev->gds.oa.cs_partition_size = 4;
>>> -       }
>>> +       adev->gds.mem.gfx_size_per_vmid = adev->gds.mem.total_size /
>>> AMDGPU_NUM_VMID;
>>> +       adev->gds.mem.kfd_size_per_vmid = adev->gds.mem.total_size /
>>> AMDGPU_NUM_VMID;
>>> +       adev->gds.gws.gfx_size_per_vmid = adev->gds.gws.total_size /
>>> AMDGPU_NUM_VMID;
>>> +       adev->gds.gws.kfd_size_per_vmid = adev->gds.gws.total_size /
>>> AMDGPU_NUM_VMID;
>>> +       adev->gds.oa.gfx_size_per_vmid = adev->gds.oa.total_size / 8; /*
>>> gfx only */
>>> +       adev->gds.oa.kfd_size_per_vmid = 0;
>>>    }
>>>    -
>>>    static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev)
>>>    {
>>>          int i, j, k, counter, active_cu_number = 0;
>>>          u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0;
>>>          struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
>>>          unsigned disable_masks[4 * 2];
>>>          u32 ao_cu_num;
>>>          if (adev->flags & AMD_IS_APU)
>>>                  ao_cu_num = 2;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> index 3882689b2d8f..b11a54bd0668 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>> @@ -2154,57 +2154,34 @@ static int gfx_v8_0_sw_init(void *handle)
>>>          kiq = &adev->gfx.kiq;
>>>          r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
>>>          if (r)
>>>                  return r;
>>>          /* create MQD for all compute queues as well as KIQ for SRIOV case
>>> */
>>>          r = amdgpu_gfx_compute_mqd_sw_init(adev, sizeof(struct
>>> vi_mqd_allocation));
>>>          if (r)
>>>                  return r;
>>>    -     /* reserve GDS, GWS and OA resource for gfx */
>>> -       r = amdgpu_bo_create_kernel(adev,
>>> adev->gds.mem.gfx_partition_size,
>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
>>> -                                   &adev->gds.gds_gfx_bo, NULL, NULL);
>>> -       if (r)
>>> -               return r;
>>> -
>>> -       r = amdgpu_bo_create_kernel(adev,
>>> adev->gds.gws.gfx_partition_size,
>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
>>> -                                   &adev->gds.gws_gfx_bo, NULL, NULL);
>>> -       if (r)
>>> -               return r;
>>> -
>>> -       r = amdgpu_bo_create_kernel(adev, adev->gds.oa.gfx_partition_size,
>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
>>> -                                   &adev->gds.oa_gfx_bo, NULL, NULL);
>>> -       if (r)
>>> -               return r;
>>> -
>>>          adev->gfx.ce_ram_size = 0x8000;
>>>          r = gfx_v8_0_gpu_early_init(adev);
>>>          if (r)
>>>                  return r;
>>>          return 0;
>>>    }
>>>      static int gfx_v8_0_sw_fini(void *handle)
>>>    {
>>>          int i;
>>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>    -     amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
>>> -       amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
>>> -       amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
>>> -
>>>          for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>>>                  amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>>>          for (i = 0; i < adev->gfx.num_compute_rings; i++)
>>>                  amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>>>          amdgpu_gfx_compute_mqd_sw_fini(adev);
>>>          amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
>>>          amdgpu_gfx_kiq_fini(adev);
>>>          gfx_v8_0_mec_fini(adev);
>>> @@ -3850,21 +3827,21 @@ static void gfx_v8_0_config_init(struct
>>> amdgpu_device *adev)
>>>          case CHIP_CARRIZO:
>>>          case CHIP_STONEY:
>>>                  adev->gfx.config.double_offchip_lds_buf = 0;
>>>                  break;
>>>          }
>>>    }
>>>      static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
>>>    {
>>>          u32 tmp, sh_static_mem_cfg;
>>> -       int i;
>>> +       int i, vmid;
>>>          WREG32_FIELD(GRBM_CNTL, READ_TIMEOUT, 0xFF);
>>>          WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>>          WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>>          WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
>>>          gfx_v8_0_tiling_mode_table_init(adev);
>>>          gfx_v8_0_setup_rb(adev);
>>>          gfx_v8_0_get_cu_info(adev);
>>>          gfx_v8_0_config_init(adev);
>>> @@ -3927,20 +3904,41 @@ static void gfx_v8_0_gpu_init(struct amdgpu_device
>>> *adev)
>>>          tmp = RREG32(mmSPI_ARB_PRIORITY);
>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
>>>          WREG32(mmSPI_ARB_PRIORITY, tmp);
>>>          mutex_unlock(&adev->grbm_idx_mutex);
>>>    +     for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
>>> +               unsigned gds_size, gws_size, oa_size;
>>> +
>>> +               if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids)
>>> {
>>> +                       gds_size = adev->gds.mem.gfx_size_per_vmid;
>>> +                       gws_size = adev->gds.gws.gfx_size_per_vmid;
>>> +                       oa_size = adev->gds.oa.gfx_size_per_vmid;
>>> +               } else {
>>> +                       gds_size = adev->gds.mem.kfd_size_per_vmid;
>>> +                       gws_size = adev->gds.gws.kfd_size_per_vmid;
>>> +                       oa_size = adev->gds.oa.kfd_size_per_vmid;
>>> +               }
>>> +
>>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid *
>>> gds_size);
>>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
>>> +               WREG32(amdgpu_gds_reg_offset[vmid].gws,
>>> +                      (vmid * gws_size) |
>>> +                      (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
>>> +               WREG32(amdgpu_gds_reg_offset[vmid].oa,
>>> +                      ((1 << oa_size) - 1) << (vmid * oa_size));
>>> +       }
>>>    }
>>>      static void gfx_v8_0_wait_for_rlc_serdes(struct amdgpu_device *adev)
>>>    {
>>>          u32 i, j, k;
>>>          u32 mask;
>>>          mutex_lock(&adev->grbm_idx_mutex);
>>>          for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
>>>                  for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
>>> @@ -5383,68 +5381,20 @@ static uint64_t
>>> gfx_v8_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>>>          uint64_t clock;
>>>          mutex_lock(&adev->gfx.gpu_clock_mutex);
>>>          WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
>>>          clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
>>>                  ((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
>>>          mutex_unlock(&adev->gfx.gpu_clock_mutex);
>>>          return clock;
>>>    }
>>>    -static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>> -                                         uint32_t vmid,
>>> -                                         uint32_t gds_base, uint32_t
>>> gds_size,
>>> -                                         uint32_t gws_base, uint32_t
>>> gws_size,
>>> -                                         uint32_t oa_base, uint32_t
>>> oa_size)
>>> -{
>>> -       gds_base = gds_base >> AMDGPU_GDS_SHIFT;
>>> -       gds_size = gds_size >> AMDGPU_GDS_SHIFT;
>>> -
>>> -       gws_base = gws_base >> AMDGPU_GWS_SHIFT;
>>> -       gws_size = gws_size >> AMDGPU_GWS_SHIFT;
>>> -
>>> -       oa_base = oa_base >> AMDGPU_OA_SHIFT;
>>> -       oa_size = oa_size >> AMDGPU_OA_SHIFT;
>>> -
>>> -       /* GDS Base */
>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>> -                               WRITE_DATA_DST_SEL(0)));
>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_base);
>>> -       amdgpu_ring_write(ring, 0);
>>> -       amdgpu_ring_write(ring, gds_base);
>>> -
>>> -       /* GDS Size */
>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>> -                               WRITE_DATA_DST_SEL(0)));
>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_size);
>>> -       amdgpu_ring_write(ring, 0);
>>> -       amdgpu_ring_write(ring, gds_size);
>>> -
>>> -       /* GWS */
>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>> -                               WRITE_DATA_DST_SEL(0)));
>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].gws);
>>> -       amdgpu_ring_write(ring, 0);
>>> -       amdgpu_ring_write(ring, gws_size << GDS_GWS_VMID0__SIZE__SHIFT |
>>> gws_base);
>>> -
>>> -       /* OA */
>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>> -                               WRITE_DATA_DST_SEL(0)));
>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
>>> -       amdgpu_ring_write(ring, 0);
>>> -       amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 <<
>>> oa_base));
>>> -}
>>> -
>>>    static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd,
>>> uint32_t wave, uint32_t address)
>>>    {
>>>          WREG32(mmSQ_IND_INDEX,
>>>                  (wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
>>>                  (simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
>>>                  (address << SQ_IND_INDEX__INDEX__SHIFT) |
>>>                  (SQ_IND_INDEX__FORCE_READ_MASK));
>>>          return RREG32(mmSQ_IND_DATA);
>>>    }
>>>    @@ -7132,21 +7082,20 @@ static const struct amdgpu_ring_funcs
>>> gfx_v8_0_ring_funcs_gfx = {
>>>                  31 + /* DE_META */
>>>                  3 + /* CNTX_CTRL */
>>>                  5 + /* HDP_INVL */
>>>                  8 + 8 + /* FENCE x2 */
>>>                  2, /* SWITCH_BUFFER */
>>>          .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_gfx */
>>>          .emit_ib = gfx_v8_0_ring_emit_ib_gfx,
>>>          .emit_fence = gfx_v8_0_ring_emit_fence_gfx,
>>>          .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
>>>          .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
>>> -       .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
>>>          .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
>>>          .test_ring = gfx_v8_0_ring_test_ring,
>>>          .test_ib = gfx_v8_0_ring_test_ib,
>>>          .insert_nop = amdgpu_ring_insert_nop,
>>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>>          .emit_switch_buffer = gfx_v8_ring_emit_sb,
>>>          .emit_cntxcntl = gfx_v8_ring_emit_cntxcntl,
>>>          .init_cond_exec = gfx_v8_0_ring_emit_init_cond_exec,
>>>          .patch_cond_exec = gfx_v8_0_ring_emit_patch_cond_exec,
>>>          .emit_wreg = gfx_v8_0_ring_emit_wreg,
>>> @@ -7155,51 +7104,48 @@ static const struct amdgpu_ring_funcs
>>> gfx_v8_0_ring_funcs_gfx = {
>>>      static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = {
>>>          .type = AMDGPU_RING_TYPE_COMPUTE,
>>>          .align_mask = 0xff,
>>>          .nop = PACKET3(PACKET3_NOP, 0x3FFF),
>>>          .support_64bit_ptrs = false,
>>>          .get_rptr = gfx_v8_0_ring_get_rptr,
GDS is temporary memory. Its purpose depends on the job, but most of
the time the idea is:
- beginning of IB
- initialize GDS variables
- dispatch compute that works with GDS variables
- when done, copy GDS variables to memory
- repeat ...
- end of IB

GDS is like a pool of global shader GPRs.

GDS is too small for persistent data.

Marek

On Thu, Sep 13, 2018 at 1:26 PM, Christian König
<christian.koenig@amd.com> wrote:
> Are you sure of that? I mean it is rather pointless to have a Global Data
> Share when it can't be used to share anything?
>
> On the other hand I'm not opposed to get rid of all that stuff if we really
> don't need it.
>
> Christian.
>
> Am 13.09.2018 um 17:27 schrieb Marek Olšák:
>>
>> That's OK. We don't need IBs to get the same VMID.
>>
>> Marek
>>
>> On Thu, Sep 13, 2018 at 4:40 AM, Christian König
>> <ckoenig.leichtzumerken@gmail.com> wrote:
>>>
>>> As discussed internally that doesn't work because threads don't necessary
>>> get the same VMID assigned.
>>>
>>> Christian.
>>>
>>> Am 12.09.2018 um 22:33 schrieb Marek Olšák:
>>>>
>>>> From: Marek Olšák <marek.olsak@amd.com>
>>>>
>>>> I've chosen to do it like this because it's easy and allows an arbitrary
>>>> number of processes.
>>>>
>>>> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
>>>> ---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  10 --
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |   3 -
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c      |  20 ----
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h     |  19 +--
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c     |  24 +---
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c     |   6 -
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h     |   7 --
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.h     |   3 -
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  14 +--
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  |  21 ----
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |   6 -
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |   5 -
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c     |  61 ----------
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |   8 --
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  34 +-----
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c       | 125
>>>> +++++---------------
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c       | 123 +++++--------------
>>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c       | 124 ++++++-------------
>>>>    include/uapi/drm/amdgpu_drm.h               |  15 +--
>>>>    19 files changed, 109 insertions(+), 519 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>>> index b80243d3972e..7264a4930b88 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c
>>>> @@ -71,23 +71,20 @@ int amdgpu_bo_list_create(struct amdgpu_device
>>>> *adev,
>>>> struct drm_file *filp,
>>>>                                  / sizeof(struct amdgpu_bo_list_entry))
>>>>                  return -EINVAL;
>>>>          size = sizeof(struct amdgpu_bo_list);
>>>>          size += num_entries * sizeof(struct amdgpu_bo_list_entry);
>>>>          list = kvmalloc(size, GFP_KERNEL);
>>>>          if (!list)
>>>>                  return -ENOMEM;
>>>>          kref_init(&list->refcount);
>>>> -       list->gds_obj = adev->gds.gds_gfx_bo;
>>>> -       list->gws_obj = adev->gds.gws_gfx_bo;
>>>> -       list->oa_obj = adev->gds.oa_gfx_bo;
>>>>          array = amdgpu_bo_list_array_entry(list, 0);
>>>>          memset(array, 0, num_entries * sizeof(struct
>>>> amdgpu_bo_list_entry));
>>>>          for (i = 0; i < num_entries; ++i) {
>>>>                  struct amdgpu_bo_list_entry *entry;
>>>>                  struct drm_gem_object *gobj;
>>>>                  struct amdgpu_bo *bo;
>>>>                  struct mm_struct *usermm;
>>>>    @@ -111,27 +108,20 @@ int amdgpu_bo_list_create(struct amdgpu_device
>>>> *adev, struct drm_file *filp,
>>>>                  } else {
>>>>                          entry = &array[last_entry++];
>>>>                  }
>>>>                  entry->robj = bo;
>>>>                  entry->priority = min(info[i].bo_priority,
>>>>                                        AMDGPU_BO_LIST_MAX_PRIORITY);
>>>>                  entry->tv.bo = &entry->robj->tbo;
>>>>                  entry->tv.shared = !entry->robj->prime_shared_count;
>>>>    -             if (entry->robj->preferred_domains ==
>>>> AMDGPU_GEM_DOMAIN_GDS)
>>>> -                       list->gds_obj = entry->robj;
>>>> -               if (entry->robj->preferred_domains ==
>>>> AMDGPU_GEM_DOMAIN_GWS)
>>>> -                       list->gws_obj = entry->robj;
>>>> -               if (entry->robj->preferred_domains ==
>>>> AMDGPU_GEM_DOMAIN_OA)
>>>> -                       list->oa_obj = entry->robj;
>>>> -
>>>>                  total_size += amdgpu_bo_size(entry->robj);
>>>>                  trace_amdgpu_bo_list_set(list, entry->robj);
>>>>          }
>>>>          list->first_userptr = first_userptr;
>>>>          list->num_entries = num_entries;
>>>>          trace_amdgpu_cs_bo_status(list->num_entries, total_size);
>>>>          *result = list;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>>> index 61b089768e1c..30f12a60aa28 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
>>>> @@ -36,23 +36,20 @@ struct amdgpu_bo_list_entry {
>>>>          struct ttm_validate_buffer      tv;
>>>>          struct amdgpu_bo_va             *bo_va;
>>>>          uint32_t                        priority;
>>>>          struct page                     **user_pages;
>>>>          int                             user_invalidated;
>>>>    };
>>>>      struct amdgpu_bo_list {
>>>>          struct rcu_head rhead;
>>>>          struct kref refcount;
>>>> -       struct amdgpu_bo *gds_obj;
>>>> -       struct amdgpu_bo *gws_obj;
>>>> -       struct amdgpu_bo *oa_obj;
>>>>          unsigned first_userptr;
>>>>          unsigned num_entries;
>>>>    };
>>>>      int amdgpu_bo_list_get(struct amdgpu_fpriv *fpriv, int id,
>>>>                         struct amdgpu_bo_list **result);
>>>>    void amdgpu_bo_list_get_list(struct amdgpu_bo_list *list,
>>>>                               struct list_head *validated);
>>>>    void amdgpu_bo_list_put(struct amdgpu_bo_list *list);
>>>>    int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in
>>>> *in,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> index 1081fd00b059..88b58facf29e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>> @@ -565,23 +565,20 @@ static int amdgpu_cs_list_validate(struct
>>>> amdgpu_cs_parser *p,
>>>>          return 0;
>>>>    }
>>>>      static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>>>>                                  union drm_amdgpu_cs *cs)
>>>>    {
>>>>          struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>>>>          struct amdgpu_vm *vm = &fpriv->vm;
>>>>          struct amdgpu_bo_list_entry *e;
>>>>          struct list_head duplicates;
>>>> -       struct amdgpu_bo *gds;
>>>> -       struct amdgpu_bo *gws;
>>>> -       struct amdgpu_bo *oa;
>>>>          unsigned tries = 10;
>>>>          int r;
>>>>          INIT_LIST_HEAD(&p->validated);
>>>>          /* p->bo_list could already be assigned if
>>>> AMDGPU_CHUNK_ID_BO_HANDLES is present */
>>>>          if (cs->in.bo_list_handle) {
>>>>                  if (p->bo_list)
>>>>                          return -EINVAL;
>>>>    @@ -705,40 +702,23 @@ static int amdgpu_cs_parser_bos(struct
>>>> amdgpu_cs_parser *p,
>>>>          r = amdgpu_cs_list_validate(p, &p->validated);
>>>>          if (r) {
>>>>                  DRM_ERROR("amdgpu_cs_list_validate(validated)
>>>> failed.\n");
>>>>                  goto error_validate;
>>>>          }
>>>>          amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
>>>>                                       p->bytes_moved_vis);
>>>>    -     gds = p->bo_list->gds_obj;
>>>> -       gws = p->bo_list->gws_obj;
>>>> -       oa = p->bo_list->oa_obj;
>>>> -
>>>>          amdgpu_bo_list_for_each_entry(e, p->bo_list)
>>>>                  e->bo_va = amdgpu_vm_bo_find(vm, e->robj);
>>>>    -     if (gds) {
>>>> -               p->job->gds_base = amdgpu_bo_gpu_offset(gds);
>>>> -               p->job->gds_size = amdgpu_bo_size(gds);
>>>> -       }
>>>> -       if (gws) {
>>>> -               p->job->gws_base = amdgpu_bo_gpu_offset(gws);
>>>> -               p->job->gws_size = amdgpu_bo_size(gws);
>>>> -       }
>>>> -       if (oa) {
>>>> -               p->job->oa_base = amdgpu_bo_gpu_offset(oa);
>>>> -               p->job->oa_size = amdgpu_bo_size(oa);
>>>> -       }
>>>> -
>>>>          if (!r && p->uf_entry.robj) {
>>>>                  struct amdgpu_bo *uf = p->uf_entry.robj;
>>>>                  r = amdgpu_ttm_alloc_gart(&uf->tbo);
>>>>                  p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
>>>>          }
>>>>      error_validate:
>>>>          if (r)
>>>>                  ttm_eu_backoff_reservation(&p->ticket, &p->validated);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>>>> index e73728d90388..69ba25c2e921 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
>>>> @@ -17,48 +17,33 @@
>>>>     * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM,
>>>> DAMAGES
>>>> OR
>>>>     * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
>>>> OTHERWISE,
>>>>     * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
>>>> OR
>>>>     * OTHER DEALINGS IN THE SOFTWARE.
>>>>     *
>>>>     */
>>>>      #ifndef __AMDGPU_GDS_H__
>>>>    #define __AMDGPU_GDS_H__
>>>>    -/* Because TTM request that alloacted buffer should be PAGE_SIZE
>>>> aligned,
>>>> - * we should report GDS/GWS/OA size as PAGE_SIZE aligned
>>>> - * */
>>>> -#define AMDGPU_GDS_SHIFT       2
>>>> -#define AMDGPU_GWS_SHIFT       PAGE_SHIFT
>>>> -#define AMDGPU_OA_SHIFT                PAGE_SHIFT
>>>> -
>>>>    struct amdgpu_ring;
>>>>    struct amdgpu_bo;
>>>>      struct amdgpu_gds_asic_info {
>>>>          uint32_t        total_size;
>>>> -       uint32_t        gfx_partition_size;
>>>> -       uint32_t        cs_partition_size;
>>>> +       uint32_t        gfx_size_per_vmid;
>>>> +       uint32_t        kfd_size_per_vmid;
>>>>    };
>>>>      struct amdgpu_gds {
>>>>          struct amdgpu_gds_asic_info     mem;
>>>>          struct amdgpu_gds_asic_info     gws;
>>>>          struct amdgpu_gds_asic_info     oa;
>>>> -       /* At present, GDS, GWS and OA resources for gfx (graphics)
>>>> -        * is always pre-allocated and available for graphics operation.
>>>> -        * Such resource is shared between all gfx clients.
>>>> -        * TODO: move this operation to user space
>>>> -        * */
>>>> -       struct amdgpu_bo*               gds_gfx_bo;
>>>> -       struct amdgpu_bo*               gws_gfx_bo;
>>>> -       struct amdgpu_bo*               oa_gfx_bo;
>>>>    };
>>>>      struct amdgpu_gds_reg_offset {
>>>>          uint32_t        mem_base;
>>>>          uint32_t        mem_size;
>>>>          uint32_t        gws;
>>>>          uint32_t        oa;
>>>>    };
>>>>      #endif /* __AMDGPU_GDS_H__ */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>>> index d30a0838851b..c87ad4b4d0b6 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
>>>> @@ -223,43 +223,25 @@ int amdgpu_gem_create_ioctl(struct drm_device
>>>> *dev,
>>>> void *data,
>>>>          if (flags & ~(AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>                        AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
>>>>                        AMDGPU_GEM_CREATE_CPU_GTT_USWC |
>>>>                        AMDGPU_GEM_CREATE_VRAM_CLEARED |
>>>>                        AMDGPU_GEM_CREATE_VM_ALWAYS_VALID |
>>>>                        AMDGPU_GEM_CREATE_EXPLICIT_SYNC))
>>>>                  return -EINVAL;
>>>>          /* reject invalid gem domains */
>>>> -       if (args->in.domains & ~AMDGPU_GEM_DOMAIN_MASK)
>>>> +       if (args->in.domains & ~(AMDGPU_GEM_DOMAIN_CPU |
>>>> +                                AMDGPU_GEM_DOMAIN_GTT |
>>>> +                                AMDGPU_GEM_DOMAIN_VRAM))
>>>>                  return -EINVAL;
>>>>    -     /* create a gem object to contain this object in */
>>>> -       if (args->in.domains & (AMDGPU_GEM_DOMAIN_GDS |
>>>> -           AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA)) {
>>>> -               if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
>>>> -                       /* if gds bo is created from user space, it must
>>>> be
>>>> -                        * passed to bo list
>>>> -                        */
>>>> -                       DRM_ERROR("GDS bo cannot be per-vm-bo\n");
>>>> -                       return -EINVAL;
>>>> -               }
>>>> -               flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
>>>> -               if (args->in.domains == AMDGPU_GEM_DOMAIN_GDS)
>>>> -                       size = size << AMDGPU_GDS_SHIFT;
>>>> -               else if (args->in.domains == AMDGPU_GEM_DOMAIN_GWS)
>>>> -                       size = size << AMDGPU_GWS_SHIFT;
>>>> -               else if (args->in.domains == AMDGPU_GEM_DOMAIN_OA)
>>>> -                       size = size << AMDGPU_OA_SHIFT;
>>>> -               else
>>>> -                       return -EINVAL;
>>>> -       }
>>>>          size = roundup(size, PAGE_SIZE);
>>>>          if (flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID) {
>>>>                  r = amdgpu_bo_reserve(vm->root.base.bo, false);
>>>>                  if (r)
>>>>                          return r;
>>>>                  resv = vm->root.base.bo->tbo.resv;
>>>>          }
>>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> index 3a072a7a39f0..c2e6a1a11d7f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> @@ -516,26 +516,20 @@ void amdgpu_vmid_free_reserved(struct
>>>> amdgpu_device
>>>> *adev,
>>>>     * Reset saved GDW, GWS and OA to force switch on next flush.
>>>>     */
>>>>    void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>>>>                         unsigned vmid)
>>>>    {
>>>>          struct amdgpu_vmid_mgr *id_mgr =
>>>> &adev->vm_manager.id_mgr[vmhub];
>>>>          struct amdgpu_vmid *id = &id_mgr->ids[vmid];
>>>>          mutex_lock(&id_mgr->lock);
>>>>          id->owner = 0;
>>>> -       id->gds_base = 0;
>>>> -       id->gds_size = 0;
>>>> -       id->gws_base = 0;
>>>> -       id->gws_size = 0;
>>>> -       id->oa_base = 0;
>>>> -       id->oa_size = 0;
>>>>          mutex_unlock(&id_mgr->lock);
>>>>    }
>>>>      /**
>>>>     * amdgpu_vmid_reset_all - reset VMID to zero
>>>>     *
>>>>     * @adev: amdgpu device structure
>>>>     *
>>>>     * Reset VMID to force flush on next use
>>>>     */
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> index 7625419f0fc2..06078e665532 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h
>>>> @@ -44,27 +44,20 @@ struct amdgpu_vmid {
>>>>          struct amdgpu_sync      active;
>>>>          struct dma_fence        *last_flush;
>>>>          uint64_t                owner;
>>>>          uint64_t                pd_gpu_addr;
>>>>          /* last flushed PD/PT update */
>>>>          struct dma_fence        *flushed_updates;
>>>>          uint32_t                current_gpu_reset_count;
>>>>    -     uint32_t                gds_base;
>>>> -       uint32_t                gds_size;
>>>> -       uint32_t                gws_base;
>>>> -       uint32_t                gws_size;
>>>> -       uint32_t                oa_base;
>>>> -       uint32_t                oa_size;
>>>> -
>>>>          unsigned                pasid;
>>>>          struct dma_fence        *pasid_mapping;
>>>>    };
>>>>      struct amdgpu_vmid_mgr {
>>>>          struct mutex            lock;
>>>>          unsigned                num_ids;
>>>>          struct list_head        ids_lru;
>>>>          struct amdgpu_vmid      ids[AMDGPU_NUM_VMID];
>>>>          atomic_t                reserved_vmid_num;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index 57cfe78a262b..3db553f6ad01 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -42,23 +42,20 @@ struct amdgpu_job {
>>>>          struct amdgpu_sync      sched_sync;
>>>>          struct amdgpu_ib        *ibs;
>>>>          struct dma_fence        *fence; /* the hw fence */
>>>>          uint32_t                preamble_status;
>>>>          uint32_t                num_ibs;
>>>>          void                    *owner;
>>>>          bool                    vm_needs_flush;
>>>>          uint64_t                vm_pd_addr;
>>>>          unsigned                vmid;
>>>>          unsigned                pasid;
>>>> -       uint32_t                gds_base, gds_size;
>>>> -       uint32_t                gws_base, gws_size;
>>>> -       uint32_t                oa_base, oa_size;
>>>>          uint32_t                vram_lost_counter;
>>>>          /* user fence handling */
>>>>          uint64_t                uf_addr;
>>>>          uint64_t                uf_sequence;
>>>>      };
>>>>      int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>>>>                       struct amdgpu_job **job, struct amdgpu_vm *vm);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>> index 29ac3873eeb0..209954290954 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>> @@ -517,27 +517,27 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
>>>>          case AMDGPU_INFO_VIS_VRAM_USAGE:
>>>>                  ui64 = amdgpu_vram_mgr_vis_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
>>>>                  return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>>>>          case AMDGPU_INFO_GTT_USAGE:
>>>>                  ui64 = amdgpu_gtt_mgr_usage(&adev->mman.bdev.man[TTM_PL_TT]);
>>>>                  return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>>>>          case AMDGPU_INFO_GDS_CONFIG: {
>>>>                  struct drm_amdgpu_info_gds gds_info;
>>>>                  memset(&gds_info, 0, sizeof(gds_info));
>>>> -               gds_info.gds_gfx_partition_size = adev->gds.mem.gfx_partition_size >> AMDGPU_GDS_SHIFT;
>>>> -               gds_info.compute_partition_size = adev->gds.mem.cs_partition_size >> AMDGPU_GDS_SHIFT;
>>>> -               gds_info.gds_total_size = adev->gds.mem.total_size >> AMDGPU_GDS_SHIFT;
>>>> -               gds_info.gws_per_gfx_partition = adev->gds.gws.gfx_partition_size >> AMDGPU_GWS_SHIFT;
>>>> -               gds_info.gws_per_compute_partition = adev->gds.gws.cs_partition_size >> AMDGPU_GWS_SHIFT;
>>>> -               gds_info.oa_per_gfx_partition = adev->gds.oa.gfx_partition_size >> AMDGPU_OA_SHIFT;
>>>> -               gds_info.oa_per_compute_partition = adev->gds.oa.cs_partition_size >> AMDGPU_OA_SHIFT;
>>>> +               gds_info.gds_gfx_partition_size = adev->gds.mem.gfx_size_per_vmid;
>>>> +               gds_info.compute_partition_size = adev->gds.mem.kfd_size_per_vmid;
>>>> +               gds_info.gds_total_size = adev->gds.mem.total_size;
>>>> +               gds_info.gws_per_gfx_partition = adev->gds.gws.gfx_size_per_vmid;
>>>> +               gds_info.gws_per_compute_partition = adev->gds.gws.kfd_size_per_vmid;
>>>> +               gds_info.oa_per_gfx_partition = adev->gds.oa.gfx_size_per_vmid;
>>>> +               gds_info.oa_per_compute_partition = adev->gds.oa.kfd_size_per_vmid;
>>>>                  return copy_to_user(out, &gds_info,
>>>>                                      min((size_t)size, sizeof(gds_info))) ? -EFAULT : 0;
>>>>          }
>>>>          case AMDGPU_INFO_VRAM_GTT: {
>>>>                  struct drm_amdgpu_info_vram_gtt vram_gtt;
>>>>                  vram_gtt.vram_size = adev->gmc.real_vram_size -
>>>>                          atomic64_read(&adev->vram_pin_size);
>>>>                  vram_gtt.vram_cpu_accessible_size = adev->gmc.visible_vram_size -
>>>>                          atomic64_read(&adev->visible_pin_size);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> index de990bdcdd6c..76770a8c29a5 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
>>>> @@ -178,41 +178,20 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)
>>>>                  places[c].lpfn = 0;
>>>>                  places[c].flags = TTM_PL_FLAG_SYSTEM;
>>>>                  if (flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC)
>>>>                          places[c].flags |= TTM_PL_FLAG_WC |
>>>>                                  TTM_PL_FLAG_UNCACHED;
>>>>                  else
>>>>                          places[c].flags |= TTM_PL_FLAG_CACHED;
>>>>                  c++;
>>>>          }
>>>> -       if (domain & AMDGPU_GEM_DOMAIN_GDS) {
>>>> -               places[c].fpfn = 0;
>>>> -               places[c].lpfn = 0;
>>>> -               places[c].flags = TTM_PL_FLAG_UNCACHED | AMDGPU_PL_FLAG_GDS;
>>>> -               c++;
>>>> -       }
>>>> -
>>>> -       if (domain & AMDGPU_GEM_DOMAIN_GWS) {
>>>> -               places[c].fpfn = 0;
>>>> -               places[c].lpfn = 0;
>>>> -               places[c].flags = TTM_PL_FLAG_UNCACHED | AMDGPU_PL_FLAG_GWS;
>>>> -               c++;
>>>> -       }
>>>> -
>>>> -       if (domain & AMDGPU_GEM_DOMAIN_OA) {
>>>> -               places[c].fpfn = 0;
>>>> -               places[c].lpfn = 0;
>>>> -               places[c].flags = TTM_PL_FLAG_UNCACHED | AMDGPU_PL_FLAG_OA;
>>>> -               c++;
>>>> -       }
>>>> -
>>>>          if (!c) {
>>>>                  places[c].fpfn = 0;
>>>>                  places[c].lpfn = 0;
>>>>                  places[c].flags = TTM_PL_MASK_CACHING |
>>>> TTM_PL_FLAG_SYSTEM;
>>>>                  c++;
>>>>          }
>>>>          BUG_ON(c >= AMDGPU_BO_MAX_PLACEMENTS);
>>>>          placement->num_placement = c;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> index 907fdf46d895..e089964cbcb7 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
>>>> @@ -120,26 +120,20 @@ static inline struct amdgpu_bo
>>>> *ttm_to_amdgpu_bo(struct ttm_buffer_object *tbo)
>>>>     */
>>>>    static inline unsigned amdgpu_mem_type_to_domain(u32 mem_type)
>>>>    {
>>>>          switch (mem_type) {
>>>>          case TTM_PL_VRAM:
>>>>                  return AMDGPU_GEM_DOMAIN_VRAM;
>>>>          case TTM_PL_TT:
>>>>                  return AMDGPU_GEM_DOMAIN_GTT;
>>>>          case TTM_PL_SYSTEM:
>>>>                  return AMDGPU_GEM_DOMAIN_CPU;
>>>> -       case AMDGPU_PL_GDS:
>>>> -               return AMDGPU_GEM_DOMAIN_GDS;
>>>> -       case AMDGPU_PL_GWS:
>>>> -               return AMDGPU_GEM_DOMAIN_GWS;
>>>> -       case AMDGPU_PL_OA:
>>>> -               return AMDGPU_GEM_DOMAIN_OA;
>>>>          default:
>>>>                  break;
>>>>          }
>>>>          return 0;
>>>>    }
>>>>      /**
>>>>     * amdgpu_bo_reserve - reserve bo
>>>>     * @bo:               bo structure
>>>>     * @no_intr:  don't return -ERESTARTSYS on pending signal
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index 9cc239968e40..f6ea9604e611 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -130,24 +130,20 @@ struct amdgpu_ring_funcs {
>>>>          /* command emit functions */
>>>>          void (*emit_ib)(struct amdgpu_ring *ring,
>>>>                          struct amdgpu_ib *ib,
>>>>                          unsigned vmid, bool ctx_switch);
>>>>          void (*emit_fence)(struct amdgpu_ring *ring, uint64_t addr,
>>>>                             uint64_t seq, unsigned flags);
>>>>          void (*emit_pipeline_sync)(struct amdgpu_ring *ring);
>>>>          void (*emit_vm_flush)(struct amdgpu_ring *ring, unsigned vmid,
>>>>                                uint64_t pd_addr);
>>>>          void (*emit_hdp_flush)(struct amdgpu_ring *ring);
>>>> -       void (*emit_gds_switch)(struct amdgpu_ring *ring, uint32_t vmid,
>>>> -                               uint32_t gds_base, uint32_t gds_size,
>>>> -                               uint32_t gws_base, uint32_t gws_size,
>>>> -                               uint32_t oa_base, uint32_t oa_size);
>>>>          /* testing functions */
>>>>          int (*test_ring)(struct amdgpu_ring *ring);
>>>>          int (*test_ib)(struct amdgpu_ring *ring, long timeout);
>>>>          /* insert NOP packets */
>>>>          void (*insert_nop)(struct amdgpu_ring *ring, uint32_t count);
>>>>          void (*insert_start)(struct amdgpu_ring *ring);
>>>>          void (*insert_end)(struct amdgpu_ring *ring);
>>>>          /* pad the indirect buffer to the necessary number of dw */
>>>>          void (*pad_ib)(struct amdgpu_ring *ring, struct amdgpu_ib *ib);
>>>>          unsigned (*init_cond_exec)(struct amdgpu_ring *ring);
>>>> @@ -226,21 +222,20 @@ struct amdgpu_ring {
>>>>    #define amdgpu_ring_patch_cs_in_place(r, p, ib)
>>>> ((r)->funcs->patch_cs_in_place((p), (ib)))
>>>>    #define amdgpu_ring_test_ring(r) (r)->funcs->test_ring((r))
>>>>    #define amdgpu_ring_test_ib(r, t) (r)->funcs->test_ib((r), (t))
>>>>    #define amdgpu_ring_get_rptr(r) (r)->funcs->get_rptr((r))
>>>>    #define amdgpu_ring_get_wptr(r) (r)->funcs->get_wptr((r))
>>>>    #define amdgpu_ring_set_wptr(r) (r)->funcs->set_wptr((r))
>>>>    #define amdgpu_ring_emit_ib(r, ib, vmid, c) (r)->funcs->emit_ib((r),
>>>> (ib), (vmid), (c))
>>>>    #define amdgpu_ring_emit_pipeline_sync(r)
>>>> (r)->funcs->emit_pipeline_sync((r))
>>>>    #define amdgpu_ring_emit_vm_flush(r, vmid, addr)
>>>> (r)->funcs->emit_vm_flush((r), (vmid), (addr))
>>>>    #define amdgpu_ring_emit_fence(r, addr, seq, flags)
>>>> (r)->funcs->emit_fence((r), (addr), (seq), (flags))
>>>> -#define amdgpu_ring_emit_gds_switch(r, v, db, ds, wb, ws, ab, as) (r)->funcs->emit_gds_switch((r), (v), (db), (ds), (wb), (ws), (ab), (as))
>>>>    #define amdgpu_ring_emit_hdp_flush(r) (r)->funcs->emit_hdp_flush((r))
>>>>    #define amdgpu_ring_emit_switch_buffer(r)
>>>> (r)->funcs->emit_switch_buffer((r))
>>>>    #define amdgpu_ring_emit_cntxcntl(r, d)
>>>> (r)->funcs->emit_cntxcntl((r),
>>>> (d))
>>>>    #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>>>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r),
>>>> (d),
>>>> (v))
>>>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m)
>>>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>>    #define amdgpu_ring_emit_reg_write_reg_wait(r, d0, d1, v, m)
>>>> (r)->funcs->emit_reg_write_reg_wait((r), (d0), (d1), (v), (m))
>>>>    #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>>    #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>> index 8a158ee922f7..2cc62b0e7ea8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>>>> @@ -195,30 +195,20 @@ static int amdgpu_init_mem_type(struct
>>>> ttm_bo_device
>>>> *bdev, uint32_t type,
>>>>                  break;
>>>>          case TTM_PL_VRAM:
>>>>                  /* "On-card" video ram */
>>>>                  man->func = &amdgpu_vram_mgr_func;
>>>>                  man->gpu_offset = adev->gmc.vram_start;
>>>>                  man->flags = TTM_MEMTYPE_FLAG_FIXED |
>>>>                               TTM_MEMTYPE_FLAG_MAPPABLE;
>>>>                  man->available_caching = TTM_PL_FLAG_UNCACHED |
>>>> TTM_PL_FLAG_WC;
>>>>                  man->default_caching = TTM_PL_FLAG_WC;
>>>>                  break;
>>>> -       case AMDGPU_PL_GDS:
>>>> -       case AMDGPU_PL_GWS:
>>>> -       case AMDGPU_PL_OA:
>>>> -               /* On-chip GDS memory*/
>>>> -               man->func = &ttm_bo_manager_func;
>>>> -               man->gpu_offset = 0;
>>>> -               man->flags = TTM_MEMTYPE_FLAG_FIXED |
>>>> TTM_MEMTYPE_FLAG_CMA;
>>>> -               man->available_caching = TTM_PL_FLAG_UNCACHED;
>>>> -               man->default_caching = TTM_PL_FLAG_UNCACHED;
>>>> -               break;
>>>>          default:
>>>>                  DRM_ERROR("Unsupported memory type %u\n",
>>>> (unsigned)type);
>>>>                  return -EINVAL;
>>>>          }
>>>>          return 0;
>>>>    }
>>>>      /**
>>>>     * amdgpu_evict_flags - Compute placement flags
>>>>     *
>>>> @@ -1039,25 +1029,20 @@ static int amdgpu_ttm_backend_bind(struct ttm_tt
>>>> *ttm,
>>>>                  if (r) {
>>>>                          DRM_ERROR("failed to pin userptr\n");
>>>>                          return r;
>>>>                  }
>>>>          }
>>>>          if (!ttm->num_pages) {
>>>>                  WARN(1, "nothing to bind %lu pages for mreg %p back
>>>> %p!\n",
>>>>                       ttm->num_pages, bo_mem, ttm);
>>>>          }
>>>>    -     if (bo_mem->mem_type == AMDGPU_PL_GDS ||
>>>> -           bo_mem->mem_type == AMDGPU_PL_GWS ||
>>>> -           bo_mem->mem_type == AMDGPU_PL_OA)
>>>> -               return -EINVAL;
>>>> -
>>>>          if (!amdgpu_gtt_mgr_has_gart_addr(bo_mem)) {
>>>>                  gtt->offset = AMDGPU_BO_INVALID_OFFSET;
>>>>                  return 0;
>>>>          }
>>>>          /* compute PTE flags relevant to this BO memory */
>>>>          flags = amdgpu_ttm_tt_pte_flags(adev, ttm, bo_mem);
>>>>          /* bind pages into GART page tables */
>>>>          gtt->offset = ((u64)bo_mem->start << PAGE_SHIFT) -
>>>> adev->gmc.gart_start;
>>>> @@ -1818,60 +1803,20 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>>>>          /* Initialize GTT memory pool */
>>>>          r = ttm_bo_init_mm(&adev->mman.bdev, TTM_PL_TT, gtt_size >>
>>>> PAGE_SHIFT);
>>>>          if (r) {
>>>>                  DRM_ERROR("Failed initializing GTT heap.\n");
>>>>                  return r;
>>>>          }
>>>>          DRM_INFO("amdgpu: %uM of GTT memory ready.\n",
>>>>                   (unsigned)(gtt_size / (1024 * 1024)));
>>>> -       /* Initialize various on-chip memory pools */
>>>> -       adev->gds.mem.total_size = adev->gds.mem.total_size << AMDGPU_GDS_SHIFT;
>>>> -       adev->gds.mem.gfx_partition_size = adev->gds.mem.gfx_partition_size << AMDGPU_GDS_SHIFT;
>>>> -       adev->gds.mem.cs_partition_size = adev->gds.mem.cs_partition_size << AMDGPU_GDS_SHIFT;
>>>> -       adev->gds.gws.total_size = adev->gds.gws.total_size << AMDGPU_GWS_SHIFT;
>>>> -       adev->gds.gws.gfx_partition_size = adev->gds.gws.gfx_partition_size << AMDGPU_GWS_SHIFT;
>>>> -       adev->gds.gws.cs_partition_size = adev->gds.gws.cs_partition_size << AMDGPU_GWS_SHIFT;
>>>> -       adev->gds.oa.total_size = adev->gds.oa.total_size << AMDGPU_OA_SHIFT;
>>>> -       adev->gds.oa.gfx_partition_size = adev->gds.oa.gfx_partition_size << AMDGPU_OA_SHIFT;
>>>> -       adev->gds.oa.cs_partition_size = adev->gds.oa.cs_partition_size << AMDGPU_OA_SHIFT;
>>>> -       /* GDS Memory */
>>>> -       if (adev->gds.mem.total_size) {
>>>> -               r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GDS,
>>>> -                                  adev->gds.mem.total_size >>
>>>> PAGE_SHIFT);
>>>> -               if (r) {
>>>> -                       DRM_ERROR("Failed initializing GDS heap.\n");
>>>> -                       return r;
>>>> -               }
>>>> -       }
>>>> -
>>>> -       /* GWS */
>>>> -       if (adev->gds.gws.total_size) {
>>>> -               r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_GWS,
>>>> -                                  adev->gds.gws.total_size >>
>>>> PAGE_SHIFT);
>>>> -               if (r) {
>>>> -                       DRM_ERROR("Failed initializing gws heap.\n");
>>>> -                       return r;
>>>> -               }
>>>> -       }
>>>> -
>>>> -       /* OA */
>>>> -       if (adev->gds.oa.total_size) {
>>>> -               r = ttm_bo_init_mm(&adev->mman.bdev, AMDGPU_PL_OA,
>>>> -                                  adev->gds.oa.total_size >>
>>>> PAGE_SHIFT);
>>>> -               if (r) {
>>>> -                       DRM_ERROR("Failed initializing oa heap.\n");
>>>> -                       return r;
>>>> -               }
>>>> -       }
>>>> -
>>>>          /* Register debugfs entries for amdgpu_ttm */
>>>>          r = amdgpu_ttm_debugfs_init(adev);
>>>>          if (r) {
>>>>                  DRM_ERROR("Failed to init debugfs\n");
>>>>                  return r;
>>>>          }
>>>>          return 0;
>>>>    }
>>>>      /**
>>>> @@ -1892,26 +1837,20 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
>>>>                  return;
>>>>          amdgpu_ttm_debugfs_fini(adev);
>>>>          amdgpu_ttm_fw_reserve_vram_fini(adev);
>>>>          if (adev->mman.aper_base_kaddr)
>>>>                  iounmap(adev->mman.aper_base_kaddr);
>>>>          adev->mman.aper_base_kaddr = NULL;
>>>>          ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_VRAM);
>>>>          ttm_bo_clean_mm(&adev->mman.bdev, TTM_PL_TT);
>>>> -       if (adev->gds.mem.total_size)
>>>> -               ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GDS);
>>>> -       if (adev->gds.gws.total_size)
>>>> -               ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_GWS);
>>>> -       if (adev->gds.oa.total_size)
>>>> -               ttm_bo_clean_mm(&adev->mman.bdev, AMDGPU_PL_OA);
>>>>          ttm_bo_device_release(&adev->mman.bdev);
>>>>          amdgpu_ttm_global_fini(adev);
>>>>          adev->mman.initialized = false;
>>>>          DRM_INFO("amdgpu: ttm finalized\n");
>>>>    }
>>>>      /**
>>>>     * amdgpu_ttm_set_buffer_funcs_status - enable/disable use of buffer
>>>> functions
>>>>     *
>>>>     * @adev: amdgpu_device pointer
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>> index fe8f276e9811..04557a382b19 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>> @@ -20,28 +20,20 @@
>>>>     * OTHER DEALINGS IN THE SOFTWARE.
>>>>     *
>>>>     */
>>>>      #ifndef __AMDGPU_TTM_H__
>>>>    #define __AMDGPU_TTM_H__
>>>>      #include "amdgpu.h"
>>>>    #include <drm/gpu_scheduler.h>
>>>>    -#define AMDGPU_PL_GDS                (TTM_PL_PRIV + 0)
>>>> -#define AMDGPU_PL_GWS          (TTM_PL_PRIV + 1)
>>>> -#define AMDGPU_PL_OA           (TTM_PL_PRIV + 2)
>>>> -
>>>> -#define AMDGPU_PL_FLAG_GDS             (TTM_PL_FLAG_PRIV << 0)
>>>> -#define AMDGPU_PL_FLAG_GWS             (TTM_PL_FLAG_PRIV << 1)
>>>> -#define AMDGPU_PL_FLAG_OA              (TTM_PL_FLAG_PRIV << 2)
>>>> -
>>>>    #define AMDGPU_GTT_MAX_TRANSFER_SIZE  512
>>>>    #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS       2
>>>>      struct amdgpu_mman {
>>>>          struct ttm_bo_global_ref        bo_global_ref;
>>>>          struct drm_global_reference     mem_global_ref;
>>>>          struct ttm_bo_device            bdev;
>>>>          bool                            mem_global_referenced;
>>>>          bool                            initialized;
>>>>          void __iomem                    *aper_base_kaddr;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index be1659fedf94..c66f1c6f0ba8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -803,86 +803,69 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
>>>>     * Returns:
>>>>     * True if sync is needed.
>>>>     */
>>>>    bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
>>>>                                    struct amdgpu_job *job)
>>>>    {
>>>>          struct amdgpu_device *adev = ring->adev;
>>>>          unsigned vmhub = ring->funcs->vmhub;
>>>>          struct amdgpu_vmid_mgr *id_mgr =
>>>> &adev->vm_manager.id_mgr[vmhub];
>>>>          struct amdgpu_vmid *id;
>>>> -       bool gds_switch_needed;
>>>>          bool vm_flush_needed = job->vm_needs_flush ||
>>>> ring->has_compute_vm_bug;
>>>>          if (job->vmid == 0)
>>>>                  return false;
>>>>          id = &id_mgr->ids[job->vmid];
>>>> -       gds_switch_needed = ring->funcs->emit_gds_switch && (
>>>> -               id->gds_base != job->gds_base ||
>>>> -               id->gds_size != job->gds_size ||
>>>> -               id->gws_base != job->gws_base ||
>>>> -               id->gws_size != job->gws_size ||
>>>> -               id->oa_base != job->oa_base ||
>>>> -               id->oa_size != job->oa_size);
>>>>          if (amdgpu_vmid_had_gpu_reset(adev, id))
>>>>                  return true;
>>>>    -     return vm_flush_needed || gds_switch_needed;
>>>> +       return vm_flush_needed;
>>>>    }
>>>>      /**
>>>>     * amdgpu_vm_flush - hardware flush the vm
>>>>     *
>>>>     * @ring: ring to use for flush
>>>>     * @job:  related job
>>>>     * @need_pipe_sync: is pipe sync needed
>>>>     *
>>>>     * Emit a VM flush when it is necessary.
>>>>     *
>>>>     * Returns:
>>>>     * 0 on success, errno otherwise.
>>>>     */
>>>>    int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
>>>> bool need_pipe_sync)
>>>>    {
>>>>          struct amdgpu_device *adev = ring->adev;
>>>>          unsigned vmhub = ring->funcs->vmhub;
>>>>          struct amdgpu_vmid_mgr *id_mgr =
>>>> &adev->vm_manager.id_mgr[vmhub];
>>>>          struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>>>> -       bool gds_switch_needed = ring->funcs->emit_gds_switch && (
>>>> -               id->gds_base != job->gds_base ||
>>>> -               id->gds_size != job->gds_size ||
>>>> -               id->gws_base != job->gws_base ||
>>>> -               id->gws_size != job->gws_size ||
>>>> -               id->oa_base != job->oa_base ||
>>>> -               id->oa_size != job->oa_size);
>>>>          bool vm_flush_needed = job->vm_needs_flush;
>>>>          bool pasid_mapping_needed = id->pasid != job->pasid ||
>>>>                  !id->pasid_mapping ||
>>>>                  !dma_fence_is_signaled(id->pasid_mapping);
>>>>          struct dma_fence *fence = NULL;
>>>>          unsigned patch_offset = 0;
>>>>          int r;
>>>>          if (amdgpu_vmid_had_gpu_reset(adev, id)) {
>>>> -               gds_switch_needed = true;
>>>>                  vm_flush_needed = true;
>>>>                  pasid_mapping_needed = true;
>>>>          }
>>>>    -     gds_switch_needed &= !!ring->funcs->emit_gds_switch;
>>>>          vm_flush_needed &= !!ring->funcs->emit_vm_flush  &&
>>>>                          job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
>>>>          pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping
>>>> &&
>>>>                  ring->funcs->emit_wreg;
>>>>    -     if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
>>>> +       if (!vm_flush_needed && !need_pipe_sync)
>>>>                  return 0;
>>>>          if (ring->funcs->init_cond_exec)
>>>>                  patch_offset = amdgpu_ring_init_cond_exec(ring);
>>>>          if (need_pipe_sync)
>>>>                  amdgpu_ring_emit_pipeline_sync(ring);
>>>>          if (vm_flush_needed) {
>>>>                  trace_amdgpu_vm_flush(ring, job->vmid,
>>>> job->vm_pd_addr);
>>>> @@ -907,33 +890,20 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
>>>> struct
>>>> amdgpu_job *job, bool need_
>>>>                  mutex_unlock(&id_mgr->lock);
>>>>          }
>>>>          if (pasid_mapping_needed) {
>>>>                  id->pasid = job->pasid;
>>>>                  dma_fence_put(id->pasid_mapping);
>>>>                  id->pasid_mapping = dma_fence_get(fence);
>>>>          }
>>>>          dma_fence_put(fence);
>>>>    -     if (ring->funcs->emit_gds_switch && gds_switch_needed) {
>>>> -               id->gds_base = job->gds_base;
>>>> -               id->gds_size = job->gds_size;
>>>> -               id->gws_base = job->gws_base;
>>>> -               id->gws_size = job->gws_size;
>>>> -               id->oa_base = job->oa_base;
>>>> -               id->oa_size = job->oa_size;
>>>> -               amdgpu_ring_emit_gds_switch(ring, job->vmid,
>>>> job->gds_base,
>>>> -                                           job->gds_size,
>>>> job->gws_base,
>>>> -                                           job->gws_size, job->oa_base,
>>>> -                                           job->oa_size);
>>>> -       }
>>>> -
>>>>          if (ring->funcs->patch_cond_exec)
>>>>                  amdgpu_ring_patch_cond_exec(ring, patch_offset);
>>>>          /* the double SWITCH_BUFFER here *cannot* be skipped by
>>>> COND_EXEC
>>>> */
>>>>          if (ring->funcs->emit_switch_buffer) {
>>>>                  amdgpu_ring_emit_switch_buffer(ring);
>>>>                  amdgpu_ring_emit_switch_buffer(ring);
>>>>          }
>>>>          return 0;
>>>>    }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> index a15d9c0f233b..f5228e169c3a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>>>> @@ -1890,21 +1890,21 @@ static void gfx_v7_0_config_init(struct
>>>> amdgpu_device *adev)
>>>>     *
>>>>     * @adev: amdgpu_device pointer
>>>>     *
>>>>     * Configures the 3D engine and tiling configuration
>>>>     * registers so that the 3D engine is usable.
>>>>     */
>>>>    static void gfx_v7_0_gpu_init(struct amdgpu_device *adev)
>>>>    {
>>>>          u32 sh_mem_cfg, sh_static_mem_cfg, sh_mem_base;
>>>>          u32 tmp;
>>>> -       int i;
>>>> +       int i, vmid;
>>>>          WREG32(mmGRBM_CNTL, (0xff << GRBM_CNTL__READ_TIMEOUT__SHIFT));
>>>>          WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>>>          WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>>>          WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
>>>>          gfx_v7_0_tiling_mode_table_init(adev);
>>>>          gfx_v7_0_setup_rb(adev);
>>>> @@ -2014,20 +2014,42 @@ static void gfx_v7_0_gpu_init(struct
>>>> amdgpu_device
>>>> *adev)
>>>>          tmp = RREG32(mmSPI_ARB_PRIORITY);
>>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
>>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
>>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
>>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
>>>>          WREG32(mmSPI_ARB_PRIORITY, tmp);
>>>>          mutex_unlock(&adev->grbm_idx_mutex);
>>>> +       for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
>>>> +               unsigned gds_size, gws_size, oa_size;
>>>> +
>>>> +               if (vmid < adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids) {
>>>> +                       gds_size = adev->gds.mem.gfx_size_per_vmid;
>>>> +                       gws_size = adev->gds.gws.gfx_size_per_vmid;
>>>> +                       oa_size = adev->gds.oa.gfx_size_per_vmid;
>>>> +               } else {
>>>> +                       gds_size = adev->gds.mem.kfd_size_per_vmid;
>>>> +                       gws_size = adev->gds.gws.kfd_size_per_vmid;
>>>> +                       oa_size = adev->gds.oa.kfd_size_per_vmid;
>>>> +               }
>>>> +
>>>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid * gds_size);
>>>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
>>>> +               WREG32(amdgpu_gds_reg_offset[vmid].gws,
>>>> +                      (vmid * gws_size) |
>>>> +                      (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
>>>> +               WREG32(amdgpu_gds_reg_offset[vmid].oa,
>>>> +                      ((1 << oa_size) - 1) << (vmid * oa_size));
>>>> +       }
>>>> +
>>>>          udelay(50);
>>>>    }
>>>>      /*
>>>>     * GPU scratch registers helpers function.
>>>>     */
>>>>    /**
>>>>     * gfx_v7_0_scratch_init - setup driver info for CP scratch regs
>>>>     *
>>>>     * @adev: amdgpu_device pointer
>>>> @@ -4157,68 +4179,20 @@ static uint64_t gfx_v7_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>>>>          uint64_t clock;
>>>>          mutex_lock(&adev->gfx.gpu_clock_mutex);
>>>>          WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
>>>>          clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
>>>>                  ((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
>>>>          mutex_unlock(&adev->gfx.gpu_clock_mutex);
>>>>          return clock;
>>>>    }
>>>> -static void gfx_v7_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>> -                                         uint32_t vmid,
>>>> -                                         uint32_t gds_base, uint32_t gds_size,
>>>> -                                         uint32_t gws_base, uint32_t gws_size,
>>>> -                                         uint32_t oa_base, uint32_t oa_size)
>>>> -{
>>>> -       gds_base = gds_base >> AMDGPU_GDS_SHIFT;
>>>> -       gds_size = gds_size >> AMDGPU_GDS_SHIFT;
>>>> -
>>>> -       gws_base = gws_base >> AMDGPU_GWS_SHIFT;
>>>> -       gws_size = gws_size >> AMDGPU_GWS_SHIFT;
>>>> -
>>>> -       oa_base = oa_base >> AMDGPU_OA_SHIFT;
>>>> -       oa_size = oa_size >> AMDGPU_OA_SHIFT;
>>>> -
>>>> -       /* GDS Base */
>>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>>> -                               WRITE_DATA_DST_SEL(0)));
>>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_base);
>>>> -       amdgpu_ring_write(ring, 0);
>>>> -       amdgpu_ring_write(ring, gds_base);
>>>> -
>>>> -       /* GDS Size */
>>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>>> -                               WRITE_DATA_DST_SEL(0)));
>>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].mem_size);
>>>> -       amdgpu_ring_write(ring, 0);
>>>> -       amdgpu_ring_write(ring, gds_size);
>>>> -
>>>> -       /* GWS */
>>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>>> -                               WRITE_DATA_DST_SEL(0)));
>>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].gws);
>>>> -       amdgpu_ring_write(ring, 0);
>>>> -       amdgpu_ring_write(ring, gws_size << GDS_GWS_VMID0__SIZE__SHIFT | gws_base);
>>>> -
>>>> -       /* OA */
>>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
>>>> -                               WRITE_DATA_DST_SEL(0)));
>>>> -       amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
>>>> -       amdgpu_ring_write(ring, 0);
>>>> -       amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
>>>> -}
>>>> -
>>>>    static void gfx_v7_0_ring_soft_recovery(struct amdgpu_ring *ring,
>>>> unsigned vmid)
>>>>    {
>>>>          struct amdgpu_device *adev = ring->adev;
>>>>          uint32_t value = 0;
>>>>          value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
>>>>          value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
>>>>          value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
>>>>          value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
>>>>          WREG32(mmSQ_CMD, value);
>>>> @@ -4584,55 +4558,32 @@ static int gfx_v7_0_sw_init(void *handle)
>>>>
>>>> ring_id,
>>>>                                                                  i, k,
>>>> j);
>>>>                                  if (r)
>>>>                                          return r;
>>>>                                  ring_id++;
>>>>                          }
>>>>                  }
>>>>          }
>>>>    -     /* reserve GDS, GWS and OA resource for gfx */
>>>> -       r = amdgpu_bo_create_kernel(adev,
>>>> adev->gds.mem.gfx_partition_size,
>>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
>>>> -                                   &adev->gds.gds_gfx_bo, NULL, NULL);
>>>> -       if (r)
>>>> -               return r;
>>>> -
>>>> -       r = amdgpu_bo_create_kernel(adev,
>>>> adev->gds.gws.gfx_partition_size,
>>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
>>>> -                                   &adev->gds.gws_gfx_bo, NULL, NULL);
>>>> -       if (r)
>>>> -               return r;
>>>> -
>>>> -       r = amdgpu_bo_create_kernel(adev,
>>>> adev->gds.oa.gfx_partition_size,
>>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
>>>> -                                   &adev->gds.oa_gfx_bo, NULL, NULL);
>>>> -       if (r)
>>>> -               return r;
>>>> -
>>>>          adev->gfx.ce_ram_size = 0x8000;
>>>>          gfx_v7_0_gpu_early_init(adev);
>>>>          return r;
>>>>    }
>>>>      static int gfx_v7_0_sw_fini(void *handle)
>>>>    {
>>>>          int i;
>>>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>    -     amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
>>>> -       amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
>>>> -       amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
>>>> -
>>>>          for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>>>>                  amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>>>>          for (i = 0; i < adev->gfx.num_compute_rings; i++)
>>>>                  amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>>>>          gfx_v7_0_cp_compute_fini(adev);
>>>>          gfx_v7_0_rlc_fini(adev);
>>>>          gfx_v7_0_mec_fini(adev);
>>>>          amdgpu_bo_free_kernel(&adev->gfx.rlc.clear_state_obj,
>>>>                                  &adev->gfx.rlc.clear_state_gpu_addr,
>>>> @@ -5073,64 +5024,60 @@ static const struct amd_ip_funcs
>>>> gfx_v7_0_ip_funcs
>>>> = {
>>>>      static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = {
>>>>          .type = AMDGPU_RING_TYPE_GFX,
>>>>          .align_mask = 0xff,
>>>>          .nop = PACKET3(PACKET3_NOP, 0x3FFF),
>>>>          .support_64bit_ptrs = false,
>>>>          .get_rptr = gfx_v7_0_ring_get_rptr,
>>>>          .get_wptr = gfx_v7_0_ring_get_wptr_gfx,
>>>>          .set_wptr = gfx_v7_0_ring_set_wptr_gfx,
>>>>          .emit_frame_size =
>>>> -               20 + /* gfx_v7_0_ring_emit_gds_switch */
>>>>                  7 + /* gfx_v7_0_ring_emit_hdp_flush */
>>>>                  5 + /* hdp invalidate */
>>>>                  12 + 12 + 12 + /* gfx_v7_0_ring_emit_fence_gfx x3 for
>>>> user
>>>> fence, vm fence */
>>>>                  7 + 4 + /* gfx_v7_0_ring_emit_pipeline_sync */
>>>>                  CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + 6 + /*
>>>> gfx_v7_0_ring_emit_vm_flush */
>>>>                  3 + 4, /* gfx_v7_ring_emit_cntxcntl including vgt
>>>> flush*/
>>>>          .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_gfx */
>>>>          .emit_ib = gfx_v7_0_ring_emit_ib_gfx,
>>>>          .emit_fence = gfx_v7_0_ring_emit_fence_gfx,
>>>>          .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>>>>          .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
>>>> -       .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>>>>          .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>>>>          .test_ring = gfx_v7_0_ring_test_ring,
>>>>          .test_ib = gfx_v7_0_ring_test_ib,
>>>>          .insert_nop = amdgpu_ring_insert_nop,
>>>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>>>          .emit_cntxcntl = gfx_v7_ring_emit_cntxcntl,
>>>>          .emit_wreg = gfx_v7_0_ring_emit_wreg,
>>>>          .soft_recovery = gfx_v7_0_ring_soft_recovery,
>>>>    };
>>>>      static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_compute =
>>>> {
>>>>          .type = AMDGPU_RING_TYPE_COMPUTE,
>>>>          .align_mask = 0xff,
>>>>          .nop = PACKET3(PACKET3_NOP, 0x3FFF),
>>>>          .support_64bit_ptrs = false,
>>>>          .get_rptr = gfx_v7_0_ring_get_rptr,
>>>>          .get_wptr = gfx_v7_0_ring_get_wptr_compute,
>>>>          .set_wptr = gfx_v7_0_ring_set_wptr_compute,
>>>>          .emit_frame_size =
>>>> -               20 + /* gfx_v7_0_ring_emit_gds_switch */
>>>>                  7 + /* gfx_v7_0_ring_emit_hdp_flush */
>>>>                  5 + /* hdp invalidate */
>>>>                  7 + /* gfx_v7_0_ring_emit_pipeline_sync */
>>>>                  CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /*
>>>> gfx_v7_0_ring_emit_vm_flush */
>>>>                  7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3 for
>>>> user
>>>> fence, vm fence */
>>>>          .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */
>>>>          .emit_ib = gfx_v7_0_ring_emit_ib_compute,
>>>>          .emit_fence = gfx_v7_0_ring_emit_fence_compute,
>>>>          .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
>>>>          .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
>>>> -       .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
>>>>          .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
>>>>          .test_ring = gfx_v7_0_ring_test_ring,
>>>>          .test_ib = gfx_v7_0_ring_test_ib,
>>>>          .insert_nop = amdgpu_ring_insert_nop,
>>>>          .pad_ib = amdgpu_ring_generic_pad_ib,
>>>>          .emit_wreg = gfx_v7_0_ring_emit_wreg,
>>>>    };
>>>>      static void gfx_v7_0_set_ring_funcs(struct amdgpu_device *adev)
>>>>    {
>>>> @@ -5169,42 +5116,28 @@ static void gfx_v7_0_set_irq_funcs(struct
>>>> amdgpu_device *adev)
>>>>          adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
>>>>    }
>>>>      static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
>>>>    {
>>>>          /* init asci gds info */
>>>>          adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>>>>          adev->gds.gws.total_size = 64;
>>>>          adev->gds.oa.total_size = 16;
>>>>    -     if (adev->gds.mem.total_size == 64 * 1024) {
>>>> -               adev->gds.mem.gfx_partition_size = 4096;
>>>> -               adev->gds.mem.cs_partition_size = 4096;
>>>> -
>>>> -               adev->gds.gws.gfx_partition_size = 4;
>>>> -               adev->gds.gws.cs_partition_size = 4;
>>>> -
>>>> -               adev->gds.oa.gfx_partition_size = 4;
>>>> -               adev->gds.oa.cs_partition_size = 1;
>>>> -       } else {
>>>> -               adev->gds.mem.gfx_partition_size = 1024;
>>>> -               adev->gds.mem.cs_partition_size = 1024;
>>>> -
>>>> -               adev->gds.gws.gfx_partition_size = 16;
>>>> -               adev->gds.gws.cs_partition_size = 16;
>>>> -
>>>> -               adev->gds.oa.gfx_partition_size = 4;
>>>> -               adev->gds.oa.cs_partition_size = 4;
>>>> -       }
>>>> +       adev->gds.mem.gfx_size_per_vmid = adev->gds.mem.total_size /
>>>> AMDGPU_NUM_VMID;
>>>> +       adev->gds.mem.kfd_size_per_vmid = adev->gds.mem.total_size /
>>>> AMDGPU_NUM_VMID;
>>>> +       adev->gds.gws.gfx_size_per_vmid = adev->gds.gws.total_size /
>>>> AMDGPU_NUM_VMID;
>>>> +       adev->gds.gws.kfd_size_per_vmid = adev->gds.gws.total_size /
>>>> AMDGPU_NUM_VMID;
>>>> +       adev->gds.oa.gfx_size_per_vmid = adev->gds.oa.total_size / 8; /*
>>>> gfx only */
>>>> +       adev->gds.oa.kfd_size_per_vmid = 0;
>>>>    }
>>>>    -
>>>>    static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev)
>>>>    {
>>>>          int i, j, k, counter, active_cu_number = 0;
>>>>          u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0;
>>>>          struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
>>>>          unsigned disable_masks[4 * 2];
>>>>          u32 ao_cu_num;
>>>>          if (adev->flags & AMD_IS_APU)
>>>>                  ao_cu_num = 2;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> index 3882689b2d8f..b11a54bd0668 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>>>> @@ -2154,57 +2154,34 @@ static int gfx_v8_0_sw_init(void *handle)
>>>>          kiq = &adev->gfx.kiq;
>>>>          r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
>>>>          if (r)
>>>>                  return r;
>>>>          /* create MQD for all compute queues as well as KIQ for SRIOV
>>>> case
>>>> */
>>>>          r = amdgpu_gfx_compute_mqd_sw_init(adev, sizeof(struct
>>>> vi_mqd_allocation));
>>>>          if (r)
>>>>                  return r;
>>>>    -     /* reserve GDS, GWS and OA resource for gfx */
>>>> -       r = amdgpu_bo_create_kernel(adev,
>>>> adev->gds.mem.gfx_partition_size,
>>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
>>>> -                                   &adev->gds.gds_gfx_bo, NULL, NULL);
>>>> -       if (r)
>>>> -               return r;
>>>> -
>>>> -       r = amdgpu_bo_create_kernel(adev,
>>>> adev->gds.gws.gfx_partition_size,
>>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_GWS,
>>>> -                                   &adev->gds.gws_gfx_bo, NULL, NULL);
>>>> -       if (r)
>>>> -               return r;
>>>> -
>>>> -       r = amdgpu_bo_create_kernel(adev,
>>>> adev->gds.oa.gfx_partition_size,
>>>> -                                   PAGE_SIZE, AMDGPU_GEM_DOMAIN_OA,
>>>> -                                   &adev->gds.oa_gfx_bo, NULL, NULL);
>>>> -       if (r)
>>>> -               return r;
>>>> -
>>>>          adev->gfx.ce_ram_size = 0x8000;
>>>>          r = gfx_v8_0_gpu_early_init(adev);
>>>>          if (r)
>>>>                  return r;
>>>>          return 0;
>>>>    }
>>>>      static int gfx_v8_0_sw_fini(void *handle)
>>>>    {
>>>>          int i;
>>>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>>>    -     amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
>>>> -       amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
>>>> -       amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
>>>> -
>>>>          for (i = 0; i < adev->gfx.num_gfx_rings; i++)
>>>>                  amdgpu_ring_fini(&adev->gfx.gfx_ring[i]);
>>>>          for (i = 0; i < adev->gfx.num_compute_rings; i++)
>>>>                  amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
>>>>          amdgpu_gfx_compute_mqd_sw_fini(adev);
>>>>          amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring,
>>>> &adev->gfx.kiq.irq);
>>>>          amdgpu_gfx_kiq_fini(adev);
>>>>          gfx_v8_0_mec_fini(adev);
>>>> @@ -3850,21 +3827,21 @@ static void gfx_v8_0_config_init(struct
>>>> amdgpu_device *adev)
>>>>          case CHIP_CARRIZO:
>>>>          case CHIP_STONEY:
>>>>                  adev->gfx.config.double_offchip_lds_buf = 0;
>>>>                  break;
>>>>          }
>>>>    }
>>>>      static void gfx_v8_0_gpu_init(struct amdgpu_device *adev)
>>>>    {
>>>>          u32 tmp, sh_static_mem_cfg;
>>>> -       int i;
>>>> +       int i, vmid;
>>>>          WREG32_FIELD(GRBM_CNTL, READ_TIMEOUT, 0xFF);
>>>>          WREG32(mmGB_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>>>          WREG32(mmHDP_ADDR_CONFIG, adev->gfx.config.gb_addr_config);
>>>>          WREG32(mmDMIF_ADDR_CALC, adev->gfx.config.gb_addr_config);
>>>>          gfx_v8_0_tiling_mode_table_init(adev);
>>>>          gfx_v8_0_setup_rb(adev);
>>>>          gfx_v8_0_get_cu_info(adev);
>>>>          gfx_v8_0_config_init(adev);
>>>> @@ -3927,20 +3904,41 @@ static void gfx_v8_0_gpu_init(struct
>>>> amdgpu_device
>>>> *adev)
>>>>          tmp = RREG32(mmSPI_ARB_PRIORITY);
>>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS0, 2);
>>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS1, 2);
>>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS2, 2);
>>>>          tmp = REG_SET_FIELD(tmp, SPI_ARB_PRIORITY, PIPE_ORDER_TS3, 2);
>>>>          WREG32(mmSPI_ARB_PRIORITY, tmp);
>>>>          mutex_unlock(&adev->grbm_idx_mutex);
>>>>    +     for (vmid = 0; vmid < AMDGPU_NUM_VMID; vmid++) {
>>>> +               unsigned gds_size, gws_size, oa_size;
>>>> +
>>>> +               if (vmid <
>>>> adev->vm_manager.id_mgr[AMDGPU_GFXHUB].num_ids)
>>>> {
>>>> +                       gds_size = adev->gds.mem.gfx_size_per_vmid;
>>>> +                       gws_size = adev->gds.gws.gfx_size_per_vmid;
>>>> +                       oa_size = adev->gds.oa.gfx_size_per_vmid;
>>>> +               } else {
>>>> +                       gds_size = adev->gds.mem.kfd_size_per_vmid;
>>>> +                       gws_size = adev->gds.gws.kfd_size_per_vmid;
>>>> +                       oa_size = adev->gds.oa.kfd_size_per_vmid;
>>>> +               }
>>>> +
>>>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_base, vmid *
>>>> gds_size);
>>>> +               WREG32(amdgpu_gds_reg_offset[vmid].mem_size, gds_size);
>>>> +               WREG32(amdgpu_gds_reg_offset[vmid].gws,
>>>> +                      (vmid * gws_size) |
>>>> +                      (gws_size << GDS_GWS_VMID0__SIZE__SHIFT));
>>>> +               WREG32(amdgpu_gds_reg_offset[vmid].oa,
>>>> +                      ((1 << oa_size) - 1) << (vmid * oa_size));
>>>> +       }
>>>>    }
>>>>      static void gfx_v8_0_wait_for_rlc_serdes(struct amdgpu_device
>>>> *adev)
>>>>    {
>>>>          u32 i, j, k;
>>>>          u32 mask;
>>>>          mutex_lock(&adev->grbm_idx_mutex);
>>>>          for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
>>>>                  for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
>>>> @@ -5383,68 +5381,20 @@ static uint64_t
>>>> gfx_v8_0_get_gpu_clock_counter(struct amdgpu_device *adev)
>>>>          uint64_t clock;
>>>>          mutex_lock(&adev->gfx.gpu_clock_mutex);
>>>>          WREG32(mmRLC_CAPTURE_GPU_CLOCK_COUNT, 1);
>>>>          clock = (uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_LSB) |
>>>>                  ((uint64_t)RREG32(mmRLC_GPU_CLOCK_COUNT_MSB) << 32ULL);
>>>>          mutex_unlock(&adev->gfx.gpu_clock_mutex);
>>>>          return clock;
>>>>    }
>>>>    -static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>>> -                                         uint32_t vmid,
>>>> -                                         uint32_t gds_base, uint32_t
>>>> gds_size,
>>>> -                                         uint32_t gws_base, uint32_t
>>>> gws_size,
>>>> -                                         uint32_t oa_base, uint32_t
>>>> oa_size)
>>>> -{
>>>> -       gds_base = gds_base >> AMDGPU_GDS_SHIFT;
>>>> -       gds_size = gds_size >> AMDGPU_GDS_SHIFT;
>>>> -
>>>> -       gws_base = gws_base >> AMDGPU_GWS_SHIFT;
>>>> -       gws_size = gws_size >> AMDGPU_GWS_SHIFT;
>>>> -
>>>> -       oa_base = oa_base >> AMDGPU_OA_SHIFT;
>>>> -       oa_size = oa_size >> AMDGPU_OA_SHIFT;
>>>> -
>>>> -       /* GDS Base */
>>>> -       amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
>>>> -       amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(0) |
Why don't we just fix up the current GDS code so it works the same as vram and then we can add a new CS or context flag to ignore the current static allocation for gfx.  We can ignore data persistence if it's too much trouble.  Assume you always have to init the memory before you use it.  That's already the case.


Alex
I was thinking about that too, but it would be too much trouble for
something we don't need.

Marek

On Thu, Sep 13, 2018 at 2:57 PM, Deucher, Alexander
<Alexander.Deucher@amd.com> wrote:
> Why don't we just fix up the current GDS code so it works the same as vram
> and then we can add a new CS or context flag to ignore the current static
> allocation for gfx.  We can ignore data persistence if it's too much
> trouble.  Assume you always have to init the memory before you use it.
> That's already the case.
>
>
> Alex
To be fair, since we have only 7 user VMIDs and 8 chunks of GDS, we
can make the 8th GDS chunk global and allocatable and use it based on
a CS flag. It would need more work and a lot of testing though. I
don't think we can do the testing part now because of the complexity
of interactions between per-VMID GDS and global GDS, but it's
certainly something that people could add in the future.

Marek

On Thu, Sep 13, 2018 at 3:04 PM, Marek Olšák <maraeo@gmail.com> wrote:
> I was thinking about that too, but it would be too much trouble for
> something we don't need.
>
> Marek
>
> On Thu, Sep 13, 2018 at 2:57 PM, Deucher, Alexander
> <Alexander.Deucher@amd.com> wrote:
>> Why don't we just fix up the current GDS code so it works the same as vram
>> and then we can add a new CS or context flag to ignore the current static
>> allocation for gfx.  We can ignore data persistence if it's too much
>> trouble.  Assume you always have to init the memory before you use it.
>> That's already the case.
>>
>>
>> Alex
Well as long as we don't need to save any content it should be trivial 
to implement resource management with the existing code.

I will take a look why allocating GDS BOs fail at the moment, if it is 
something trivial we could still fix it.

Christian.

Am 13.09.2018 um 23:01 schrieb Marek Olšák:
> To be fair, since we have only 7 user VMIDs and 8 chunks of GDS, we
> can make the 8th GDS chunk global and allocatable and use it based on
> a CS flag. It would need more work and a lot of testing though. I
> don't think we can do the testing part now because of the complexity
> of interactions between per-VMID GDS and global GDS, but it's
> certainly something that people could add in the future.
>
> Marek
>
> On Thu, Sep 13, 2018 at 3:04 PM, Marek Olšák <maraeo@gmail.com> wrote:
>> I was thinking about that too, but it would be too much trouble for
>> something we don't need.
>>
>> Marek
>>
>> On Thu, Sep 13, 2018 at 2:57 PM, Deucher, Alexander
>> <Alexander.Deucher@amd.com> wrote:
>>> Why don't we just fix up the current GDS code so it works the same as vram
>>> and then we can add a new CS or context flag to ignore the current static
>>> allocation for gfx.  We can ignore data persistence if it's too much
>>> trouble.  Assume you always have to init the memory before you use it.
>>> That's already the case.
>>>
>>>
>>> Alex
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx