drm/amdgpu: Optimize recursion in amdgpu_vm_update_level

Submitted by Kuehling, Felix on July 12, 2017, 3:56 a.m.

Details

Message ID 1499831769-3764-1-git-send-email-Felix.Kuehling@amd.com
State New
Headers show
Series "drm/amdgpu: Optimize recursion in amdgpu_vm_update_level" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Kuehling, Felix July 12, 2017, 3:56 a.m.
When lots of virtual address spaces are used, there can be thousands
of page table BOs. amdgpu_vm_update_level iterates over all of them
recursively. In many cases only a few or none at all need to be
updated. Minimize unnecessary code execution and memory usage in
those cases.

This speeds up memory mapping in a synthetic KFD memory mapping
benchmark by roughly a factor two.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
 1 file changed, 55 insertions(+), 54 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ff5de3a..23b899b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1025,7 +1025,7 @@  static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 {
 	struct amdgpu_bo *shadow;
 	struct amdgpu_ring *ring = NULL;
-	uint64_t pd_addr, shadow_addr = 0;
+	uint64_t pd_addr = 0, shadow_addr = 0;
 	uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
 	uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
 	unsigned count = 0, pt_idx, ndw = 0;
@@ -1044,48 +1044,19 @@  static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 	WARN_ON(vm->use_cpu_for_update && shadow);
 	if (vm->use_cpu_for_update && !shadow) {
-		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
-		if (r)
-			return r;
-		r = amdgpu_vm_bo_wait(adev, parent->bo);
-		if (unlikely(r)) {
-			amdgpu_bo_kunmap(parent->bo);
-			return r;
-		}
+		/* Defer kmapping until it's actually needed. Some
+		 * PDBs may need no update at all
+		 */
 		params.func = amdgpu_vm_cpu_set_ptes;
+		params.ib = (void *)(long)-1;
 	} else {
-		if (shadow) {
-			r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
-			if (r)
-				return r;
-		}
-		ring = container_of(vm->entity.sched, struct amdgpu_ring,
-				    sched);
-
-		/* padding, etc. */
-		ndw = 64;
-
-		/* assume the worst case */
-		ndw += parent->last_entry_used * 6;
-
-		pd_addr = amdgpu_bo_gpu_offset(parent->bo);
-
-		if (shadow) {
-			shadow_addr = amdgpu_bo_gpu_offset(shadow);
-			ndw *= 2;
-		} else {
-			shadow_addr = 0;
-		}
-
-		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
-		if (r)
-			return r;
-
-		params.ib = &job->ibs[0];
+		/* Defer IB allocation until it's actually
+		 * needed. Some PDBs may need no update at all
+		 */
+		params.ib = NULL;
 		params.func = amdgpu_vm_do_set_ptes;
 	}
 
-
 	/* walk over the address space and update the directory */
 	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
 		struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
@@ -1094,22 +1065,53 @@  static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 		if (bo == NULL)
 			continue;
 
-		if (bo->shadow) {
-			struct amdgpu_bo *pt_shadow = bo->shadow;
-
-			r = amdgpu_ttm_bind(&pt_shadow->tbo,
-					    &pt_shadow->tbo.mem);
-			if (r)
-				return r;
-		}
-
-		pt = amdgpu_bo_gpu_offset(bo);
-		pt = amdgpu_gart_get_vm_pde(adev, pt);
+		pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
 		if (parent->entries[pt_idx].addr == pt)
 			continue;
 
 		parent->entries[pt_idx].addr = pt;
 
+		if (!params.ib) {
+			if (shadow) {
+				r = amdgpu_ttm_bind(&shadow->tbo,
+						    &shadow->tbo.mem);
+				if (r)
+					return r;
+			}
+
+			ring = container_of(vm->entity.sched,
+					    struct amdgpu_ring, sched);
+
+			/* padding, etc. */
+			ndw = 64;
+
+			/* assume the worst case */
+			ndw += (parent->last_entry_used - pt_idx) * 6;
+
+			pd_addr = parent->bo->tbo.offset;
+
+			if (shadow) {
+				shadow_addr = shadow->tbo.offset;
+				ndw *= 2;
+			} else {
+				shadow_addr = 0;
+			}
+			r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
+			if (r)
+				return r;
+
+			params.ib = &job->ibs[0];
+		} else if (!pd_addr) {
+			r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
+			if (r)
+				return r;
+			r = amdgpu_vm_bo_wait(adev, parent->bo);
+			if (unlikely(r)) {
+				amdgpu_bo_kunmap(parent->bo);
+				return r;
+			}
+		}
+
 		pde = pd_addr + pt_idx * 8;
 		if (((last_pde + 8 * count) != pde) ||
 		    ((last_pt + incr * count) != pt) ||
@@ -1148,9 +1150,9 @@  static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 	if (params.func == amdgpu_vm_cpu_set_ptes)
 		amdgpu_bo_kunmap(parent->bo);
-	else if (params.ib->length_dw == 0) {
+	else if (params.ib && params.ib->length_dw == 0) {
 		amdgpu_job_free(job);
-	} else {
+	} else if (params.ib) {
 		amdgpu_ring_pad_ib(ring, params.ib);
 		amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
 				 AMDGPU_FENCE_OWNER_VM);
@@ -1166,8 +1168,7 @@  static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 		amdgpu_bo_fence(parent->bo, fence, true);
 		dma_fence_put(vm->last_dir_update);
-		vm->last_dir_update = dma_fence_get(fence);
-		dma_fence_put(fence);
+		vm->last_dir_update = fence;
 	}
 	/*
 	 * Recurse into the subdirectories. This recursion is harmless because
@@ -1176,7 +1177,7 @@  static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
 		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
 
-		if (!entry->bo)
+		if (!entry->bo || !entry->entries)
 			continue;
 
 		r = amdgpu_vm_update_level(adev, vm, entry, level + 1);

Comments

Nice improvement, just some nitpicks inline; otherwise the patch is
Reviewed-by: Chunming Zhou <david1.zhou@amd.com>.

On 2017年07月12日 11:56, Felix Kuehling wrote:
> When lots of virtual address spaces is used, there can be thousands
> of page table BOs. amdgpu_vm_update_level iterates over all of them
> recursively. In many cases only a few or none at all need to be
> updated. Minimize unnecessary code execution and memory usage in
> those cases.
>
> This speeds up memory mapping in a synthetic KFD memory mapping
> benchmark by roughly a factor two.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
>   1 file changed, 55 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ff5de3a..23b899b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_bo *shadow;
>   	struct amdgpu_ring *ring = NULL;
> -	uint64_t pd_addr, shadow_addr = 0;
> +	uint64_t pd_addr = 0, shadow_addr = 0;
>   	uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>   	uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>   	unsigned count = 0, pt_idx, ndw = 0;
> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   
>   	WARN_ON(vm->use_cpu_for_update && shadow);
>   	if (vm->use_cpu_for_update && !shadow) {
> -		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> -		if (r)
> -			return r;
> -		r = amdgpu_vm_bo_wait(adev, parent->bo);
> -		if (unlikely(r)) {
> -			amdgpu_bo_kunmap(parent->bo);
> -			return r;
> -		}
> +		/* Defer kmapping until it's actually needed. Some
> +		 * PDBs may need no update at all
> +		 */
>   		params.func = amdgpu_vm_cpu_set_ptes;
> +		params.ib = (void *)(long)-1;
>   	} else {
> -		if (shadow) {
> -			r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
> -			if (r)
> -				return r;
> -		}
> -		ring = container_of(vm->entity.sched, struct amdgpu_ring,
> -				    sched);
> -
> -		/* padding, etc. */
> -		ndw = 64;
> -
> -		/* assume the worst case */
> -		ndw += parent->last_entry_used * 6;
> -
> -		pd_addr = amdgpu_bo_gpu_offset(parent->bo);
> -
> -		if (shadow) {
> -			shadow_addr = amdgpu_bo_gpu_offset(shadow);
> -			ndw *= 2;
> -		} else {
> -			shadow_addr = 0;
> -		}
> -
> -		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> -		if (r)
> -			return r;
> -
> -		params.ib = &job->ibs[0];
> +		/* Defer IB allocation until it's actually
> +		 * needed. Some PDBs may need no update at all
> +		 */
> +		params.ib = NULL;
>   		params.func = amdgpu_vm_do_set_ptes;
>   	}
>   
> -
>   	/* walk over the address space and update the directory */
>   	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>   		struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   		if (bo == NULL)
>   			continue;
>   
> -		if (bo->shadow) {
> -			struct amdgpu_bo *pt_shadow = bo->shadow;
> -
> -			r = amdgpu_ttm_bind(&pt_shadow->tbo,
> -					    &pt_shadow->tbo.mem);
> -			if (r)
> -				return r;
> -		}
> -
> -		pt = amdgpu_bo_gpu_offset(bo);
> -		pt = amdgpu_gart_get_vm_pde(adev, pt);
> +		pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
Getting offset from amdgpu_bo_gpu_offset(bo)  is better I think, 
although we can assume pt bo is a known domain.

>   		if (parent->entries[pt_idx].addr == pt)
>   			continue;
>   
>   		parent->entries[pt_idx].addr = pt;
>   
> +		if (!params.ib) {
> +			if (shadow) {
> +				r = amdgpu_ttm_bind(&shadow->tbo,
> +						    &shadow->tbo.mem);
> +				if (r)
> +					return r;
> +			}
> +
> +			ring = container_of(vm->entity.sched,
> +					    struct amdgpu_ring, sched);
> +
> +			/* padding, etc. */
> +			ndw = 64;
> +
> +			/* assume the worst case */
> +			ndw += (parent->last_entry_used - pt_idx) * 6;
> +
> +			pd_addr = parent->bo->tbo.offset;
> +
> +			if (shadow) {
> +				shadow_addr = shadow->tbo.offset;
> +				ndw *= 2;
> +			} else {
> +				shadow_addr = 0;
> +			}
> +			r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> +			if (r)
> +				return r;
> +
> +			params.ib = &job->ibs[0];
> +		} else if (!pd_addr) {
> +			r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> +			if (r)
> +				return r;
> +			r = amdgpu_vm_bo_wait(adev, parent->bo);
> +			if (unlikely(r)) {
> +				amdgpu_bo_kunmap(parent->bo);
> +				return r;
> +			}
> +		}
> +
>   		pde = pd_addr + pt_idx * 8;
>   		if (((last_pde + 8 * count) != pde) ||
>   		    ((last_pt + incr * count) != pt) ||
> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   
>   	if (params.func == amdgpu_vm_cpu_set_ptes)
>   		amdgpu_bo_kunmap(parent->bo);
> -	else if (params.ib->length_dw == 0) {
> +	else if (params.ib && params.ib->length_dw == 0) {
>   		amdgpu_job_free(job);
> -	} else {
> +	} else if (params.ib) {
>   		amdgpu_ring_pad_ib(ring, params.ib);
>   		amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>   				 AMDGPU_FENCE_OWNER_VM);
> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   
>   		amdgpu_bo_fence(parent->bo, fence, true);
>   		dma_fence_put(vm->last_dir_update);
> -		vm->last_dir_update = dma_fence_get(fence);
> -		dma_fence_put(fence);
> +		vm->last_dir_update = fence;
Here I prefer to keep the previous code; the logic is clearer. :)
>   	}
>   	/*
>   	 * Recurse into the subdirectories. This recursion is harmless because
> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>   		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>   
> -		if (!entry->bo)
> +		if (!entry->bo || !entry->entries)
In fact, !entry->entries would be checked at the beginning of this function
in the next loop, but either way is OK to me here.

Regards,
David Zhou
>   			continue;
>   
>   		r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
Hi David,

Responses inline ...

On 17-07-12 12:30 AM, zhoucm1 wrote:
> nice improvement, just some nitpick inline, otherwise the patch is
> Reviewed-by: Chunming Zhou <david1.zhou@amd.com>.
>
> On 2017年07月12日 11:56, Felix Kuehling wrote:
>> When lots of virtual address spaces is used, there can be thousands
>> of page table BOs. amdgpu_vm_update_level iterates over all of them
>> recursively. In many cases only a few or none at all need to be
>> updated. Minimize unnecessary code execution and memory usage in
>> those cases.
>>
>> This speeds up memory mapping in a synthetic KFD memory mapping
>> benchmark by roughly a factor two.
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109
>> +++++++++++++++++----------------
>>   1 file changed, 55 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index ff5de3a..23b899b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>   {
>>       struct amdgpu_bo *shadow;
>>       struct amdgpu_ring *ring = NULL;
>> -    uint64_t pd_addr, shadow_addr = 0;
>> +    uint64_t pd_addr = 0, shadow_addr = 0;
>>       uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>       uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>       unsigned count = 0, pt_idx, ndw = 0;
>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>         WARN_ON(vm->use_cpu_for_update && shadow);
>>       if (vm->use_cpu_for_update && !shadow) {
>> -        r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> -        if (r)
>> -            return r;
>> -        r = amdgpu_vm_bo_wait(adev, parent->bo);
>> -        if (unlikely(r)) {
>> -            amdgpu_bo_kunmap(parent->bo);
>> -            return r;
>> -        }
>> +        /* Defer kmapping until it's actually needed. Some
>> +         * PDBs may need no update at all
>> +         */
>>           params.func = amdgpu_vm_cpu_set_ptes;
>> +        params.ib = (void *)(long)-1;
>>       } else {
>> -        if (shadow) {
>> -            r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>> -            if (r)
>> -                return r;
>> -        }
>> -        ring = container_of(vm->entity.sched, struct amdgpu_ring,
>> -                    sched);
>> -
>> -        /* padding, etc. */
>> -        ndw = 64;
>> -
>> -        /* assume the worst case */
>> -        ndw += parent->last_entry_used * 6;
>> -
>> -        pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>> -
>> -        if (shadow) {
>> -            shadow_addr = amdgpu_bo_gpu_offset(shadow);
>> -            ndw *= 2;
>> -        } else {
>> -            shadow_addr = 0;
>> -        }
>> -
>> -        r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> -        if (r)
>> -            return r;
>> -
>> -        params.ib = &job->ibs[0];
>> +        /* Defer IB allocation until it's actually
>> +         * needed. Some PDBs may need no update at all
>> +         */
>> +        params.ib = NULL;
>>           params.func = amdgpu_vm_do_set_ptes;
>>       }
>>   -
>>       /* walk over the address space and update the directory */
>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>           struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>           if (bo == NULL)
>>               continue;
>>   -        if (bo->shadow) {
>> -            struct amdgpu_bo *pt_shadow = bo->shadow;
>> -
>> -            r = amdgpu_ttm_bind(&pt_shadow->tbo,
>> -                        &pt_shadow->tbo.mem);
>> -            if (r)
>> -                return r;
>> -        }
>> -
>> -        pt = amdgpu_bo_gpu_offset(bo);
>> -        pt = amdgpu_gart_get_vm_pde(adev, pt);
>> +        pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
> Getting offset from amdgpu_bo_gpu_offset(bo)  is better I think,
> although we can assume pt bo is a known domain.

amdgpu_bo_gpu_offset does a bunch of sanity checks (WARN_ON_ONCE). They
are redundant here, because we can assume that the page table BOs have
been properly created and validated by kernel code we can trust more
than user mode. I saw amdgpu_bo_gpu_offset near the top with "perf top"
when running my memory mapping benchmark, so I think it makes sense to
eliminate the function call and redundant sanity checks in this code path.

>
>>           if (parent->entries[pt_idx].addr == pt)
>>               continue;
>>             parent->entries[pt_idx].addr = pt;
>>   +        if (!params.ib) {
>> +            if (shadow) {
>> +                r = amdgpu_ttm_bind(&shadow->tbo,
>> +                            &shadow->tbo.mem);
>> +                if (r)
>> +                    return r;
>> +            }
>> +
>> +            ring = container_of(vm->entity.sched,
>> +                        struct amdgpu_ring, sched);
>> +
>> +            /* padding, etc. */
>> +            ndw = 64;
>> +
>> +            /* assume the worst case */
>> +            ndw += (parent->last_entry_used - pt_idx) * 6;
>> +
>> +            pd_addr = parent->bo->tbo.offset;
>> +
>> +            if (shadow) {
>> +                shadow_addr = shadow->tbo.offset;
>> +                ndw *= 2;
>> +            } else {
>> +                shadow_addr = 0;
>> +            }
>> +            r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> +            if (r)
>> +                return r;
>> +
>> +            params.ib = &job->ibs[0];
>> +        } else if (!pd_addr) {
>> +            r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> +            if (r)
>> +                return r;
>> +            r = amdgpu_vm_bo_wait(adev, parent->bo);
>> +            if (unlikely(r)) {
>> +                amdgpu_bo_kunmap(parent->bo);
>> +                return r;
>> +            }
>> +        }
>> +
>>           pde = pd_addr + pt_idx * 8;
>>           if (((last_pde + 8 * count) != pde) ||
>>               ((last_pt + incr * count) != pt) ||
>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>         if (params.func == amdgpu_vm_cpu_set_ptes)
>>           amdgpu_bo_kunmap(parent->bo);
>> -    else if (params.ib->length_dw == 0) {
>> +    else if (params.ib && params.ib->length_dw == 0) {
>>           amdgpu_job_free(job);
>> -    } else {
>> +    } else if (params.ib) {
>>           amdgpu_ring_pad_ib(ring, params.ib);
>>           amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>                    AMDGPU_FENCE_OWNER_VM);
>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>             amdgpu_bo_fence(parent->bo, fence, true);
>>           dma_fence_put(vm->last_dir_update);
>> -        vm->last_dir_update = dma_fence_get(fence);
>> -        dma_fence_put(fence);
>> +        vm->last_dir_update = fence;
> Here I prefer to keep previous code, the logic is more clear.:)

Again, this is a code path that can be called very frequently. So I'd
like to eliminate redundant atomic operations here.

>>       }
>>       /*
>>        * Recurse into the subdirectories. This recursion is harmless
>> because
>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>           struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>>   -        if (!entry->bo)
>> +        if (!entry->bo || !entry->entries)
> In fact, !entry->entries would be checked at the begin of this
> function in next loop, but either way is ok to me here.

Doing the check here eliminates the lowest level of recursive function
calls. That's where it counts the most. Each successive level of the
recursion runs 512 times as often as the level above it. So completely
eliminating the function call overhead of the lowest level is a good thing.

Regards,
  Felix

>
> Regards,
> David Zhou
>>               continue;
>>             r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
On 2017年07月12日 12:56, Felix Kuehling wrote:
> Hi David,
>
> Responses inline ...
Yeah, I see what you mean; you mainly want to save some CPU overhead. If
others have no objection, feel free to add my RB then.

Regards,
David Zhou
>
> On 17-07-12 12:30 AM, zhoucm1 wrote:
>> nice improvement, just some nitpick inline, otherwise the patch is
>> Reviewed-by: Chunming Zhou <david1.zhou@amd.com>.
>>
>> On 2017年07月12日 11:56, Felix Kuehling wrote:
>>> When lots of virtual address spaces is used, there can be thousands
>>> of page table BOs. amdgpu_vm_update_level iterates over all of them
>>> recursively. In many cases only a few or none at all need to be
>>> updated. Minimize unnecessary code execution and memory usage in
>>> those cases.
>>>
>>> This speeds up memory mapping in a synthetic KFD memory mapping
>>> benchmark by roughly a factor two.
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109
>>> +++++++++++++++++----------------
>>>    1 file changed, 55 insertions(+), 54 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index ff5de3a..23b899b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>    {
>>>        struct amdgpu_bo *shadow;
>>>        struct amdgpu_ring *ring = NULL;
>>> -    uint64_t pd_addr, shadow_addr = 0;
>>> +    uint64_t pd_addr = 0, shadow_addr = 0;
>>>        uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>>        uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>>        unsigned count = 0, pt_idx, ndw = 0;
>>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>          WARN_ON(vm->use_cpu_for_update && shadow);
>>>        if (vm->use_cpu_for_update && !shadow) {
>>> -        r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> -        if (r)
>>> -            return r;
>>> -        r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> -        if (unlikely(r)) {
>>> -            amdgpu_bo_kunmap(parent->bo);
>>> -            return r;
>>> -        }
>>> +        /* Defer kmapping until it's actually needed. Some
>>> +         * PDBs may need no update at all
>>> +         */
>>>            params.func = amdgpu_vm_cpu_set_ptes;
>>> +        params.ib = (void *)(long)-1;
>>>        } else {
>>> -        if (shadow) {
>>> -            r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>>> -            if (r)
>>> -                return r;
>>> -        }
>>> -        ring = container_of(vm->entity.sched, struct amdgpu_ring,
>>> -                    sched);
>>> -
>>> -        /* padding, etc. */
>>> -        ndw = 64;
>>> -
>>> -        /* assume the worst case */
>>> -        ndw += parent->last_entry_used * 6;
>>> -
>>> -        pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>>> -
>>> -        if (shadow) {
>>> -            shadow_addr = amdgpu_bo_gpu_offset(shadow);
>>> -            ndw *= 2;
>>> -        } else {
>>> -            shadow_addr = 0;
>>> -        }
>>> -
>>> -        r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> -        if (r)
>>> -            return r;
>>> -
>>> -        params.ib = &job->ibs[0];
>>> +        /* Defer IB allocation until it's actually
>>> +         * needed. Some PDBs may need no update at all
>>> +         */
>>> +        params.ib = NULL;
>>>            params.func = amdgpu_vm_do_set_ptes;
>>>        }
>>>    -
>>>        /* walk over the address space and update the directory */
>>>        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>            struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>            if (bo == NULL)
>>>                continue;
>>>    -        if (bo->shadow) {
>>> -            struct amdgpu_bo *pt_shadow = bo->shadow;
>>> -
>>> -            r = amdgpu_ttm_bind(&pt_shadow->tbo,
>>> -                        &pt_shadow->tbo.mem);
>>> -            if (r)
>>> -                return r;
>>> -        }
>>> -
>>> -        pt = amdgpu_bo_gpu_offset(bo);
>>> -        pt = amdgpu_gart_get_vm_pde(adev, pt);
>>> +        pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>> Getting offset from amdgpu_bo_gpu_offset(bo)  is better I think,
>> although we can assume pt bo is a known domain.
> amdgpu_bo_gpu_offset does a bunch of sanity checks (WARN_ON_ONCE). They
> are redundant here, because we can assume that the page table BOs have
> been properly created and validated by kernel code we can trust more
> than user mode. I saw amdgpu_bo_gpu_offset near the top with "perf top"
> when running my memory mapping benchmark, so I think it makes sense to
> eliminate the function call and redundant sanity checks in this code path.
>
>>>            if (parent->entries[pt_idx].addr == pt)
>>>                continue;
>>>              parent->entries[pt_idx].addr = pt;
>>>    +        if (!params.ib) {
>>> +            if (shadow) {
>>> +                r = amdgpu_ttm_bind(&shadow->tbo,
>>> +                            &shadow->tbo.mem);
>>> +                if (r)
>>> +                    return r;
>>> +            }
>>> +
>>> +            ring = container_of(vm->entity.sched,
>>> +                        struct amdgpu_ring, sched);
>>> +
>>> +            /* padding, etc. */
>>> +            ndw = 64;
>>> +
>>> +            /* assume the worst case */
>>> +            ndw += (parent->last_entry_used - pt_idx) * 6;
>>> +
>>> +            pd_addr = parent->bo->tbo.offset;
>>> +
>>> +            if (shadow) {
>>> +                shadow_addr = shadow->tbo.offset;
>>> +                ndw *= 2;
>>> +            } else {
>>> +                shadow_addr = 0;
>>> +            }
>>> +            r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> +            if (r)
>>> +                return r;
>>> +
>>> +            params.ib = &job->ibs[0];
>>> +        } else if (!pd_addr) {
>>> +            r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> +            if (r)
>>> +                return r;
>>> +            r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> +            if (unlikely(r)) {
>>> +                amdgpu_bo_kunmap(parent->bo);
>>> +                return r;
>>> +            }
>>> +        }
>>> +
>>>            pde = pd_addr + pt_idx * 8;
>>>            if (((last_pde + 8 * count) != pde) ||
>>>                ((last_pt + incr * count) != pt) ||
>>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>          if (params.func == amdgpu_vm_cpu_set_ptes)
>>>            amdgpu_bo_kunmap(parent->bo);
>>> -    else if (params.ib->length_dw == 0) {
>>> +    else if (params.ib && params.ib->length_dw == 0) {
>>>            amdgpu_job_free(job);
>>> -    } else {
>>> +    } else if (params.ib) {
>>>            amdgpu_ring_pad_ib(ring, params.ib);
>>>            amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>>                     AMDGPU_FENCE_OWNER_VM);
>>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>              amdgpu_bo_fence(parent->bo, fence, true);
>>>            dma_fence_put(vm->last_dir_update);
>>> -        vm->last_dir_update = dma_fence_get(fence);
>>> -        dma_fence_put(fence);
>>> +        vm->last_dir_update = fence;
>> Here I prefer to keep previous code, the logic is more clear.:)
> Again, this is a code path that can be called very frequently. So I'd
> like to eliminate redundant atomic operations here.
>
>>>        }
>>>        /*
>>>         * Recurse into the subdirectories. This recursion is harmless
>>> because
>>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>            struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>>>    -        if (!entry->bo)
>>> +        if (!entry->bo || !entry->entries)
>> In fact, !entry->entries would be checked at the begin of this
>> function in next loop, but either way is ok to me here.
> Doing the check here eliminates the lowest level of recursive function
> calls. That's where it counts the most. Each successive level of the
> recursion runs 512 times as often as the level above it. So completely
> eliminating the function call overhead of the lowest level is a good thing.
>
> Regards,
>    Felix
>
>> Regards,
>> David Zhou
>>>                continue;
>>>              r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Am 12.07.2017 um 05:56 schrieb Felix Kuehling:
> When lots of virtual address spaces is used, there can be thousands
> of page table BOs. amdgpu_vm_update_level iterates over all of them
> recursively. In many cases only a few or none at all need to be
> updated. Minimize unnecessary code execution and memory usage in
> those cases.
>
> This speeds up memory mapping in a synthetic KFD memory mapping
> benchmark by roughly a factor two.
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

That optimization is unnecessary.

I have a patch in the pipeline that makes the VM BO permanently CPU 
mapped and also fixes a couple of bugs in those code path.

Give me a moment to clean that up and send it out.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109 +++++++++++++++++----------------
>   1 file changed, 55 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index ff5de3a..23b899b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   {
>   	struct amdgpu_bo *shadow;
>   	struct amdgpu_ring *ring = NULL;
> -	uint64_t pd_addr, shadow_addr = 0;
> +	uint64_t pd_addr = 0, shadow_addr = 0;
>   	uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>   	uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>   	unsigned count = 0, pt_idx, ndw = 0;
> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   
>   	WARN_ON(vm->use_cpu_for_update && shadow);
>   	if (vm->use_cpu_for_update && !shadow) {
> -		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> -		if (r)
> -			return r;
> -		r = amdgpu_vm_bo_wait(adev, parent->bo);
> -		if (unlikely(r)) {
> -			amdgpu_bo_kunmap(parent->bo);
> -			return r;
> -		}
> +		/* Defer kmapping until it's actually needed. Some
> +		 * PDBs may need no update at all
> +		 */
>   		params.func = amdgpu_vm_cpu_set_ptes;
> +		params.ib = (void *)(long)-1;
>   	} else {
> -		if (shadow) {
> -			r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
> -			if (r)
> -				return r;
> -		}
> -		ring = container_of(vm->entity.sched, struct amdgpu_ring,
> -				    sched);
> -
> -		/* padding, etc. */
> -		ndw = 64;
> -
> -		/* assume the worst case */
> -		ndw += parent->last_entry_used * 6;
> -
> -		pd_addr = amdgpu_bo_gpu_offset(parent->bo);
> -
> -		if (shadow) {
> -			shadow_addr = amdgpu_bo_gpu_offset(shadow);
> -			ndw *= 2;
> -		} else {
> -			shadow_addr = 0;
> -		}
> -
> -		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> -		if (r)
> -			return r;
> -
> -		params.ib = &job->ibs[0];
> +		/* Defer IB allocation until it's actually
> +		 * needed. Some PDBs may need no update at all
> +		 */
> +		params.ib = NULL;
>   		params.func = amdgpu_vm_do_set_ptes;
>   	}
>   
> -
>   	/* walk over the address space and update the directory */
>   	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>   		struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   		if (bo == NULL)
>   			continue;
>   
> -		if (bo->shadow) {
> -			struct amdgpu_bo *pt_shadow = bo->shadow;
> -
> -			r = amdgpu_ttm_bind(&pt_shadow->tbo,
> -					    &pt_shadow->tbo.mem);
> -			if (r)
> -				return r;
> -		}
> -
> -		pt = amdgpu_bo_gpu_offset(bo);
> -		pt = amdgpu_gart_get_vm_pde(adev, pt);
> +		pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>   		if (parent->entries[pt_idx].addr == pt)
>   			continue;
>   
>   		parent->entries[pt_idx].addr = pt;
>   
> +		if (!params.ib) {
> +			if (shadow) {
> +				r = amdgpu_ttm_bind(&shadow->tbo,
> +						    &shadow->tbo.mem);
> +				if (r)
> +					return r;
> +			}
> +
> +			ring = container_of(vm->entity.sched,
> +					    struct amdgpu_ring, sched);
> +
> +			/* padding, etc. */
> +			ndw = 64;
> +
> +			/* assume the worst case */
> +			ndw += (parent->last_entry_used - pt_idx) * 6;
> +
> +			pd_addr = parent->bo->tbo.offset;
> +
> +			if (shadow) {
> +				shadow_addr = shadow->tbo.offset;
> +				ndw *= 2;
> +			} else {
> +				shadow_addr = 0;
> +			}
> +			r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
> +			if (r)
> +				return r;
> +
> +			params.ib = &job->ibs[0];
> +		} else if (!pd_addr) {
> +			r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
> +			if (r)
> +				return r;
> +			r = amdgpu_vm_bo_wait(adev, parent->bo);
> +			if (unlikely(r)) {
> +				amdgpu_bo_kunmap(parent->bo);
> +				return r;
> +			}
> +		}
> +
>   		pde = pd_addr + pt_idx * 8;
>   		if (((last_pde + 8 * count) != pde) ||
>   		    ((last_pt + incr * count) != pt) ||
> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   
>   	if (params.func == amdgpu_vm_cpu_set_ptes)
>   		amdgpu_bo_kunmap(parent->bo);
> -	else if (params.ib->length_dw == 0) {
> +	else if (params.ib && params.ib->length_dw == 0) {
>   		amdgpu_job_free(job);
> -	} else {
> +	} else if (params.ib) {
>   		amdgpu_ring_pad_ib(ring, params.ib);
>   		amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>   				 AMDGPU_FENCE_OWNER_VM);
> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   
>   		amdgpu_bo_fence(parent->bo, fence, true);
>   		dma_fence_put(vm->last_dir_update);
> -		vm->last_dir_update = dma_fence_get(fence);
> -		dma_fence_put(fence);
> +		vm->last_dir_update = fence;
>   	}
>   	/*
>   	 * Recurse into the subdirectories. This recursion is harmless because
> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>   		struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>   
> -		if (!entry->bo)
> +		if (!entry->bo || !entry->entries)
>   			continue;
>   
>   		r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
On 17-07-12 03:50 AM, Christian König wrote:
>  
> That optimization is unnecessary.
>
> I have a patch in the pipeline that makes the VM BO permanently CPU
> mapped and also fixes a couple of bugs in those code path.

You mean just for the CPU-update path? I think deferring the SDMA IB
allocation still makes sense. In fact that's the code path I tested.

>
> Give me a moment to clean that up and send it out.

Sure. BTW, I found that the most expensive operation turned out to be
amdgpu_vm_move_pt_bos_in_lru. It has to recurse through all PT BOs and
does lots of redundant list pointer manipulations for two lists (LRU and
swap), and kref get/put.

For KFD VMs I'm removing the call to it. I think it's useless for us.

amdgpu_vm_validate_pt_bos is also expensive, but not as bad because it's
only done if there was an eviction.

I was thinking about ways to optimize this. It should be possible to
allocate multiple PTBs and PDBs in a single BO in amdgpu_vm_alloc_pts.
That would potentially save lots of time in vm_move_pt_bos and
vm_validate. What do you think?

Regards,
  Felix

>
> Regards,
> Christian.
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109
>> +++++++++++++++++----------------
>>   1 file changed, 55 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index ff5de3a..23b899b 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>   {
>>       struct amdgpu_bo *shadow;
>>       struct amdgpu_ring *ring = NULL;
>> -    uint64_t pd_addr, shadow_addr = 0;
>> +    uint64_t pd_addr = 0, shadow_addr = 0;
>>       uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>       uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>       unsigned count = 0, pt_idx, ndw = 0;
>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>         WARN_ON(vm->use_cpu_for_update && shadow);
>>       if (vm->use_cpu_for_update && !shadow) {
>> -        r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> -        if (r)
>> -            return r;
>> -        r = amdgpu_vm_bo_wait(adev, parent->bo);
>> -        if (unlikely(r)) {
>> -            amdgpu_bo_kunmap(parent->bo);
>> -            return r;
>> -        }
>> +        /* Defer kmapping until it's actually needed. Some
>> +         * PDBs may need no update at all
>> +         */
>>           params.func = amdgpu_vm_cpu_set_ptes;
>> +        params.ib = (void *)(long)-1;
>>       } else {
>> -        if (shadow) {
>> -            r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>> -            if (r)
>> -                return r;
>> -        }
>> -        ring = container_of(vm->entity.sched, struct amdgpu_ring,
>> -                    sched);
>> -
>> -        /* padding, etc. */
>> -        ndw = 64;
>> -
>> -        /* assume the worst case */
>> -        ndw += parent->last_entry_used * 6;
>> -
>> -        pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>> -
>> -        if (shadow) {
>> -            shadow_addr = amdgpu_bo_gpu_offset(shadow);
>> -            ndw *= 2;
>> -        } else {
>> -            shadow_addr = 0;
>> -        }
>> -
>> -        r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> -        if (r)
>> -            return r;
>> -
>> -        params.ib = &job->ibs[0];
>> +        /* Defer IB allocation until it's actually
>> +         * needed. Some PDBs may need no update at all
>> +         */
>> +        params.ib = NULL;
>>           params.func = amdgpu_vm_do_set_ptes;
>>       }
>>   -
>>       /* walk over the address space and update the directory */
>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>           struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>           if (bo == NULL)
>>               continue;
>>   -        if (bo->shadow) {
>> -            struct amdgpu_bo *pt_shadow = bo->shadow;
>> -
>> -            r = amdgpu_ttm_bind(&pt_shadow->tbo,
>> -                        &pt_shadow->tbo.mem);
>> -            if (r)
>> -                return r;
>> -        }
>> -
>> -        pt = amdgpu_bo_gpu_offset(bo);
>> -        pt = amdgpu_gart_get_vm_pde(adev, pt);
>> +        pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>>           if (parent->entries[pt_idx].addr == pt)
>>               continue;
>>             parent->entries[pt_idx].addr = pt;
>>   +        if (!params.ib) {
>> +            if (shadow) {
>> +                r = amdgpu_ttm_bind(&shadow->tbo,
>> +                            &shadow->tbo.mem);
>> +                if (r)
>> +                    return r;
>> +            }
>> +
>> +            ring = container_of(vm->entity.sched,
>> +                        struct amdgpu_ring, sched);
>> +
>> +            /* padding, etc. */
>> +            ndw = 64;
>> +
>> +            /* assume the worst case */
>> +            ndw += (parent->last_entry_used - pt_idx) * 6;
>> +
>> +            pd_addr = parent->bo->tbo.offset;
>> +
>> +            if (shadow) {
>> +                shadow_addr = shadow->tbo.offset;
>> +                ndw *= 2;
>> +            } else {
>> +                shadow_addr = 0;
>> +            }
>> +            r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>> +            if (r)
>> +                return r;
>> +
>> +            params.ib = &job->ibs[0];
>> +        } else if (!pd_addr) {
>> +            r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>> +            if (r)
>> +                return r;
>> +            r = amdgpu_vm_bo_wait(adev, parent->bo);
>> +            if (unlikely(r)) {
>> +                amdgpu_bo_kunmap(parent->bo);
>> +                return r;
>> +            }
>> +        }
>> +
>>           pde = pd_addr + pt_idx * 8;
>>           if (((last_pde + 8 * count) != pde) ||
>>               ((last_pt + incr * count) != pt) ||
>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>         if (params.func == amdgpu_vm_cpu_set_ptes)
>>           amdgpu_bo_kunmap(parent->bo);
>> -    else if (params.ib->length_dw == 0) {
>> +    else if (params.ib && params.ib->length_dw == 0) {
>>           amdgpu_job_free(job);
>> -    } else {
>> +    } else if (params.ib) {
>>           amdgpu_ring_pad_ib(ring, params.ib);
>>           amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>                    AMDGPU_FENCE_OWNER_VM);
>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>             amdgpu_bo_fence(parent->bo, fence, true);
>>           dma_fence_put(vm->last_dir_update);
>> -        vm->last_dir_update = dma_fence_get(fence);
>> -        dma_fence_put(fence);
>> +        vm->last_dir_update = fence;
>>       }
>>       /*
>>        * Recurse into the subdirectories. This recursion is harmless
>> because
>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>       for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>           struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>>   -        if (!entry->bo)
>> +        if (!entry->bo || !entry->entries)
>>               continue;
>>             r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>
>
Am 12.07.2017 um 21:38 schrieb Felix Kuehling:
> On 17-07-12 03:50 AM, Christian König wrote:
>>   
>> That optimization is unnecessary.
>>
>> I have a patch in the pipeline that makes the VM BO permanently CPU
>> mapped and also fixes a couple of bugs in those code path.
> You mean just for the CPU-update path? I think deferring the SDMA IB
> allocation still makes sense. In fact that's the code path I tested.

I've added tracking of whether anything needs to be updated to all 
paths, not just the CPU update path.

>> Give me a moment to clean that up and send it out.
> Sure. BTW, I found that the most expensive operation turned out to be
> amdgpu_vm_move_pt_bos_in_lru. It has to recurse through all PT BOs and
> does lots of redundant list pointer manipulations for two lists (LRU and
> swap), and kref get/put.

Yeah, that's why I've stopped using the separate remove/add functions 
and wanted to optimize just moving everything to the end.

But that turned out to be more problematic than I thought.

> For KFD VMs I'm removing the call to it. I think it's useless for us.
>
> amdgpu_vm_validate_pt_bos is also expensive, but not as bad because it's
> only done if there was an eviction.
>
> I was thinking about ways to optimize this. It should be possible to
> allocate multiple PTBs and PDBs in a single BO in amdgpu_vm_alloc_pts.
> That would potentially save lots of time in vm_move_pt_bos and
> vm_validate. What do you think?

I think allocating separate BOs is still the right approach; a single BO 
is tricky to allocate and probably isn't worth it.

Instead I move the LRU handling after the validation. So BOs are only 
moved in the LRU when there actually was some eviction.

Since we don't care what kind of eviction it was, this basically turns 
LRU handling on/off based on memory pressure (e.g. evictions).

Regards,
Christian.

>
> Regards,
>    Felix
>
>> Regards,
>> Christian.
>>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 109
>>> +++++++++++++++++----------------
>>>    1 file changed, 55 insertions(+), 54 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> index ff5de3a..23b899b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>> @@ -1025,7 +1025,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>    {
>>>        struct amdgpu_bo *shadow;
>>>        struct amdgpu_ring *ring = NULL;
>>> -    uint64_t pd_addr, shadow_addr = 0;
>>> +    uint64_t pd_addr = 0, shadow_addr = 0;
>>>        uint32_t incr = amdgpu_vm_bo_size(adev, level + 1);
>>>        uint64_t last_pde = ~0, last_pt = ~0, last_shadow = ~0;
>>>        unsigned count = 0, pt_idx, ndw = 0;
>>> @@ -1044,48 +1044,19 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>          WARN_ON(vm->use_cpu_for_update && shadow);
>>>        if (vm->use_cpu_for_update && !shadow) {
>>> -        r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> -        if (r)
>>> -            return r;
>>> -        r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> -        if (unlikely(r)) {
>>> -            amdgpu_bo_kunmap(parent->bo);
>>> -            return r;
>>> -        }
>>> +        /* Defer kmapping until it's actually needed. Some
>>> +         * PDBs may need no update at all
>>> +         */
>>>            params.func = amdgpu_vm_cpu_set_ptes;
>>> +        params.ib = (void *)(long)-1;
>>>        } else {
>>> -        if (shadow) {
>>> -            r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
>>> -            if (r)
>>> -                return r;
>>> -        }
>>> -        ring = container_of(vm->entity.sched, struct amdgpu_ring,
>>> -                    sched);
>>> -
>>> -        /* padding, etc. */
>>> -        ndw = 64;
>>> -
>>> -        /* assume the worst case */
>>> -        ndw += parent->last_entry_used * 6;
>>> -
>>> -        pd_addr = amdgpu_bo_gpu_offset(parent->bo);
>>> -
>>> -        if (shadow) {
>>> -            shadow_addr = amdgpu_bo_gpu_offset(shadow);
>>> -            ndw *= 2;
>>> -        } else {
>>> -            shadow_addr = 0;
>>> -        }
>>> -
>>> -        r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> -        if (r)
>>> -            return r;
>>> -
>>> -        params.ib = &job->ibs[0];
>>> +        /* Defer IB allocation until it's actually
>>> +         * needed. Some PDBs may need no update at all
>>> +         */
>>> +        params.ib = NULL;
>>>            params.func = amdgpu_vm_do_set_ptes;
>>>        }
>>>    -
>>>        /* walk over the address space and update the directory */
>>>        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>            struct amdgpu_bo *bo = parent->entries[pt_idx].bo;
>>> @@ -1094,22 +1065,53 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>            if (bo == NULL)
>>>                continue;
>>>    -        if (bo->shadow) {
>>> -            struct amdgpu_bo *pt_shadow = bo->shadow;
>>> -
>>> -            r = amdgpu_ttm_bind(&pt_shadow->tbo,
>>> -                        &pt_shadow->tbo.mem);
>>> -            if (r)
>>> -                return r;
>>> -        }
>>> -
>>> -        pt = amdgpu_bo_gpu_offset(bo);
>>> -        pt = amdgpu_gart_get_vm_pde(adev, pt);
>>> +        pt = amdgpu_gart_get_vm_pde(adev, bo->tbo.offset);
>>>            if (parent->entries[pt_idx].addr == pt)
>>>                continue;
>>>              parent->entries[pt_idx].addr = pt;
>>>    +        if (!params.ib) {
>>> +            if (shadow) {
>>> +                r = amdgpu_ttm_bind(&shadow->tbo,
>>> +                            &shadow->tbo.mem);
>>> +                if (r)
>>> +                    return r;
>>> +            }
>>> +
>>> +            ring = container_of(vm->entity.sched,
>>> +                        struct amdgpu_ring, sched);
>>> +
>>> +            /* padding, etc. */
>>> +            ndw = 64;
>>> +
>>> +            /* assume the worst case */
>>> +            ndw += (parent->last_entry_used - pt_idx) * 6;
>>> +
>>> +            pd_addr = parent->bo->tbo.offset;
>>> +
>>> +            if (shadow) {
>>> +                shadow_addr = shadow->tbo.offset;
>>> +                ndw *= 2;
>>> +            } else {
>>> +                shadow_addr = 0;
>>> +            }
>>> +            r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
>>> +            if (r)
>>> +                return r;
>>> +
>>> +            params.ib = &job->ibs[0];
>>> +        } else if (!pd_addr) {
>>> +            r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
>>> +            if (r)
>>> +                return r;
>>> +            r = amdgpu_vm_bo_wait(adev, parent->bo);
>>> +            if (unlikely(r)) {
>>> +                amdgpu_bo_kunmap(parent->bo);
>>> +                return r;
>>> +            }
>>> +        }
>>> +
>>>            pde = pd_addr + pt_idx * 8;
>>>            if (((last_pde + 8 * count) != pde) ||
>>>                ((last_pt + incr * count) != pt) ||
>>> @@ -1148,9 +1150,9 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>          if (params.func == amdgpu_vm_cpu_set_ptes)
>>>            amdgpu_bo_kunmap(parent->bo);
>>> -    else if (params.ib->length_dw == 0) {
>>> +    else if (params.ib && params.ib->length_dw == 0) {
>>>            amdgpu_job_free(job);
>>> -    } else {
>>> +    } else if (params.ib) {
>>>            amdgpu_ring_pad_ib(ring, params.ib);
>>>            amdgpu_sync_resv(adev, &job->sync, parent->bo->tbo.resv,
>>>                     AMDGPU_FENCE_OWNER_VM);
>>> @@ -1166,8 +1168,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>              amdgpu_bo_fence(parent->bo, fence, true);
>>>            dma_fence_put(vm->last_dir_update);
>>> -        vm->last_dir_update = dma_fence_get(fence);
>>> -        dma_fence_put(fence);
>>> +        vm->last_dir_update = fence;
>>>        }
>>>        /*
>>>         * Recurse into the subdirectories. This recursion is harmless
>>> because
>>> @@ -1176,7 +1177,7 @@ static int amdgpu_vm_update_level(struct
>>> amdgpu_device *adev,
>>>        for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
>>>            struct amdgpu_vm_pt *entry = &parent->entries[pt_idx];
>>>    -        if (!entry->bo)
>>> +        if (!entry->bo || !entry->entries)
>>>                continue;
>>>              r = amdgpu_vm_update_level(adev, vm, entry, level + 1);
>>