drm/amdgpu: fix VM sync with always valid BOs

Submitted by Christian König on Sept. 8, 2017, 12:18 p.m.

Details

Message ID 1504873115-7651-1-git-send-email-deathsimple@vodafone.de
State New
Headers show
Series "drm/amdgpu: fix VM sync with always valid BOs" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Christian König Sept. 8, 2017, 12:18 p.m.
From: Christian König <christian.koenig@amd.com>

All users of a VM must always wait for updates with always
valid BOs to be completed.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 14 ++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 ++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
 3 files changed, 21 insertions(+), 10 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 8aa37e0..d6e66b7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -752,10 +752,6 @@  static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
 	if (r)
 		return r;
 
-	r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_dir_update);
-	if (r)
-		return r;
-
 	r = amdgpu_vm_clear_freed(adev, vm, NULL);
 	if (r)
 		return r;
@@ -797,6 +793,10 @@  static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
 			if (bo_va == NULL)
 				continue;
 
+			if (bo_va->base.bo->flags &
+			    AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
+				continue;
+
 			r = amdgpu_vm_bo_update(adev, bo_va, false);
 			if (r)
 				return r;
@@ -810,6 +810,12 @@  static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
 	}
 
 	r = amdgpu_vm_handle_moved(adev, vm, &p->job->sync);
+	if (r)
+		return r;
+
+	r = amdgpu_sync_fence(adev, &p->job->sync, vm->mandatory_update);
+	if (r)
+		return r;
 
 	if (amdgpu_vm_debug && p->bo_list) {
 		/* Invalidate all BOs to test for userspace bugs */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 55f1ecb..12c8a4c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1140,9 +1140,8 @@  static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 				goto error_free;
 
 			amdgpu_bo_fence(parent->base.bo, fence, true);
-			dma_fence_put(vm->last_dir_update);
-			vm->last_dir_update = dma_fence_get(fence);
-			dma_fence_put(fence);
+			dma_fence_put(vm->mandatory_update);
+			vm->mandatory_update = fence;
 		}
 	}
 
@@ -1803,6 +1802,12 @@  int amdgpu_vm_bo_update(struct amdgpu_device *adev,
 			trace_amdgpu_vm_bo_mapping(mapping);
 	}
 
+	if (bo_va->base.bo &&
+	    bo_va->base.bo->tbo.resv == vm->root.base.bo->tbo.resv) {
+		dma_fence_put(vm->mandatory_update);
+		vm->mandatory_update = dma_fence_get(bo_va->last_pt_update);
+	}
+
 	return 0;
 }
 
@@ -2586,7 +2591,7 @@  int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 			 vm->use_cpu_for_update ? "CPU" : "SDMA");
 	WARN_ONCE((vm->use_cpu_for_update & !amdgpu_vm_is_large_bar(adev)),
 		  "CPU update of VM recommended only for large BAR system\n");
-	vm->last_dir_update = NULL;
+	vm->mandatory_update = NULL;
 
 	flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
 			AMDGPU_GEM_CREATE_VRAM_CLEARED;
@@ -2692,7 +2697,7 @@  void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 	}
 
 	amdgpu_vm_free_levels(&vm->root);
-	dma_fence_put(vm->last_dir_update);
+	dma_fence_put(vm->mandatory_update);
 	for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
 		amdgpu_vm_free_reserved_vmid(adev, vm, i);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index c1accd1..63fa2e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -140,7 +140,7 @@  struct amdgpu_vm {
 
 	/* contains the page directory */
 	struct amdgpu_vm_pt     root;
-	struct dma_fence	*last_dir_update;
+	struct dma_fence	*mandatory_update;
 
 	/* protecting freed */
 	spinlock_t		freed_lock;

Comments

Marek this one will most likely fix your issues with always valid BOs on 
Raven.

Please give it a try when you have time.

Thanks,
Christian.

Am 08.09.2017 um 14:18 schrieb Christian König:
> From: Christian König <christian.koenig@amd.com>
>
> All users of a VM must always wait for updates with always
> valid BOs to be completed.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 14 ++++++++++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 ++++++++++-----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
>   3 files changed, 21 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 8aa37e0..d6e66b7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -752,10 +752,6 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>   	if (r)
>   		return r;
>   
> -	r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_dir_update);
> -	if (r)
> -		return r;
> -
>   	r = amdgpu_vm_clear_freed(adev, vm, NULL);
>   	if (r)
>   		return r;
> @@ -797,6 +793,10 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>   			if (bo_va == NULL)
>   				continue;
>   
> +			if (bo_va->base.bo->flags &
> +			    AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
> +				continue;
> +
>   			r = amdgpu_vm_bo_update(adev, bo_va, false);
>   			if (r)
>   				return r;
> @@ -810,6 +810,12 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>   	}
>   
>   	r = amdgpu_vm_handle_moved(adev, vm, &p->job->sync);
> +	if (r)
> +		return r;
> +
> +	r = amdgpu_sync_fence(adev, &p->job->sync, vm->mandatory_update);
> +	if (r)
> +		return r;
>   
>   	if (amdgpu_vm_debug && p->bo_list) {
>   		/* Invalidate all BOs to test for userspace bugs */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 55f1ecb..12c8a4c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1140,9 +1140,8 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>   				goto error_free;
>   
>   			amdgpu_bo_fence(parent->base.bo, fence, true);
> -			dma_fence_put(vm->last_dir_update);
> -			vm->last_dir_update = dma_fence_get(fence);
> -			dma_fence_put(fence);
> +			dma_fence_put(vm->mandatory_update);
> +			vm->mandatory_update = fence;
>   		}
>   	}
>   
> @@ -1803,6 +1802,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>   			trace_amdgpu_vm_bo_mapping(mapping);
>   	}
>   
> +	if (bo_va->base.bo &&
> +	    bo_va->base.bo->tbo.resv == vm->root.base.bo->tbo.resv) {
> +		dma_fence_put(vm->mandatory_update);
> +		vm->mandatory_update = dma_fence_get(bo_va->last_pt_update);
> +	}
> +
>   	return 0;
>   }
>   
> @@ -2586,7 +2591,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   			 vm->use_cpu_for_update ? "CPU" : "SDMA");
>   	WARN_ONCE((vm->use_cpu_for_update & !amdgpu_vm_is_large_bar(adev)),
>   		  "CPU update of VM recommended only for large BAR system\n");
> -	vm->last_dir_update = NULL;
> +	vm->mandatory_update = NULL;
>   
>   	flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
>   			AMDGPU_GEM_CREATE_VRAM_CLEARED;
> @@ -2692,7 +2697,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   	}
>   
>   	amdgpu_vm_free_levels(&vm->root);
> -	dma_fence_put(vm->last_dir_update);
> +	dma_fence_put(vm->mandatory_update);
>   	for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
>   		amdgpu_vm_free_reserved_vmid(adev, vm, i);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index c1accd1..63fa2e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -140,7 +140,7 @@ struct amdgpu_vm {
>   
>   	/* contains the page directory */
>   	struct amdgpu_vm_pt     root;
> -	struct dma_fence	*last_dir_update;
> +	struct dma_fence	*mandatory_update;
>   
>   	/* protecting freed */
>   	spinlock_t		freed_lock;
The hang seems to be gone with this patch.

Marek

On Fri, Sep 8, 2017 at 2:26 PM, Christian König <deathsimple@vodafone.de> wrote:
> Marek this one will most likely fix your issues with always valid BOs on
> Raven.
>
> Please give it a try when you have time.
>
> Thanks,
> Christian.
>
>
> Am 08.09.2017 um 14:18 schrieb Christian König:
>>
>> From: Christian König <christian.koenig@amd.com>
>>
>> All users of a VM must always wait for updates with always
>> valid BOs to be completed.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 14 ++++++++++----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 ++++++++++-----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
>>   3 files changed, 21 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 8aa37e0..d6e66b7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -752,10 +752,6 @@ static int amdgpu_bo_vm_update_pte(struct
>> amdgpu_cs_parser *p)
>>         if (r)
>>                 return r;
>>   -     r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_dir_update);
>> -       if (r)
>> -               return r;
>> -
>>         r = amdgpu_vm_clear_freed(adev, vm, NULL);
>>         if (r)
>>                 return r;
>> @@ -797,6 +793,10 @@ static int amdgpu_bo_vm_update_pte(struct
>> amdgpu_cs_parser *p)
>>                         if (bo_va == NULL)
>>                                 continue;
>>   +                     if (bo_va->base.bo->flags &
>> +                           AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
>> +                               continue;
>> +
>>                         r = amdgpu_vm_bo_update(adev, bo_va, false);
>>                         if (r)
>>                                 return r;
>> @@ -810,6 +810,12 @@ static int amdgpu_bo_vm_update_pte(struct
>> amdgpu_cs_parser *p)
>>         }
>>         r = amdgpu_vm_handle_moved(adev, vm, &p->job->sync);
>> +       if (r)
>> +               return r;
>> +
>> +       r = amdgpu_sync_fence(adev, &p->job->sync, vm->mandatory_update);
>> +       if (r)
>> +               return r;
>>         if (amdgpu_vm_debug && p->bo_list) {
>>                 /* Invalidate all BOs to test for userspace bugs */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 55f1ecb..12c8a4c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1140,9 +1140,8 @@ static int amdgpu_vm_update_level(struct
>> amdgpu_device *adev,
>>                                 goto error_free;
>>                         amdgpu_bo_fence(parent->base.bo, fence, true);
>> -                       dma_fence_put(vm->last_dir_update);
>> -                       vm->last_dir_update = dma_fence_get(fence);
>> -                       dma_fence_put(fence);
>> +                       dma_fence_put(vm->mandatory_update);
>> +                       vm->mandatory_update = fence;
>>                 }
>>         }
>>   @@ -1803,6 +1802,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device
>> *adev,
>>                         trace_amdgpu_vm_bo_mapping(mapping);
>>         }
>>   +     if (bo_va->base.bo &&
>> +           bo_va->base.bo->tbo.resv == vm->root.base.bo->tbo.resv) {
>> +               dma_fence_put(vm->mandatory_update);
>> +               vm->mandatory_update =
>> dma_fence_get(bo_va->last_pt_update);
>> +       }
>> +
>>         return 0;
>>   }
>>   @@ -2586,7 +2591,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm,
>>                          vm->use_cpu_for_update ? "CPU" : "SDMA");
>>         WARN_ONCE((vm->use_cpu_for_update &
>> !amdgpu_vm_is_large_bar(adev)),
>>                   "CPU update of VM recommended only for large BAR
>> system\n");
>> -       vm->last_dir_update = NULL;
>> +       vm->mandatory_update = NULL;
>>         flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
>>                         AMDGPU_GEM_CREATE_VRAM_CLEARED;
>> @@ -2692,7 +2697,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm)
>>         }
>>         amdgpu_vm_free_levels(&vm->root);
>> -       dma_fence_put(vm->last_dir_update);
>> +       dma_fence_put(vm->mandatory_update);
>>         for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
>>                 amdgpu_vm_free_reserved_vmid(adev, vm, i);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index c1accd1..63fa2e5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -140,7 +140,7 @@ struct amdgpu_vm {
>>         /* contains the page directory */
>>         struct amdgpu_vm_pt     root;
>> -       struct dma_fence        *last_dir_update;
>> +       struct dma_fence        *mandatory_update;
>>         /* protecting freed */
>>         spinlock_t              freed_lock;
>
>
>
I think this change could use a little more explanation. I also have
some ideas for simplifying the code and making it easier to understand.
See comments inline.

But it looks good to me otherwise. Reviewed-by: Felix Kuehling
<Felix.Kuehling@amd.com>

Regards,
  Felix


On 2017-09-08 08:18 AM, Christian König wrote:
> From: Christian König <christian.koenig@amd.com>
>
> All users of a VM must always wait for updates with always
> valid BOs to be completed.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 14 ++++++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 ++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
>  3 files changed, 21 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 8aa37e0..d6e66b7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -752,10 +752,6 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>  	if (r)
>  		return r;
>  
> -	r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_dir_update);
> -	if (r)
> -		return r;
> -
>  	r = amdgpu_vm_clear_freed(adev, vm, NULL);
>  	if (r)
>  		return r;
> @@ -797,6 +793,10 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>  			if (bo_va == NULL)
>  				continue;
>  
> +			if (bo_va->base.bo->flags &
> +			    AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
> +				continue;

This looks like somewhat unrelated. If you don't make it a separate
patch, at least mention it in the change description. Something like:
Always-valid BOs' page table entries are updated by
amdgpu_vm_handle_moved. Skip them in amdgpu_bo_vm_update_pte.

> +
>  			r = amdgpu_vm_bo_update(adev, bo_va, false);
>  			if (r)
>  				return r;
> @@ -810,6 +810,12 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>  	}
>  
>  	r = amdgpu_vm_handle_moved(adev, vm, &p->job->sync);
> +	if (r)
> +		return r;
> +
> +	r = amdgpu_sync_fence(adev, &p->job->sync, vm->mandatory_update);
> +	if (r)
> +		return r;
>  
>  	if (amdgpu_vm_debug && p->bo_list) {
>  		/* Invalidate all BOs to test for userspace bugs */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 55f1ecb..12c8a4c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1140,9 +1140,8 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>  				goto error_free;
>  
>  			amdgpu_bo_fence(parent->base.bo, fence, true);
> -			dma_fence_put(vm->last_dir_update);
> -			vm->last_dir_update = dma_fence_get(fence);
> -			dma_fence_put(fence);
> +			dma_fence_put(vm->mandatory_update);
> +			vm->mandatory_update = fence;
>  		}
>  	}
>  
> @@ -1803,6 +1802,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>  			trace_amdgpu_vm_bo_mapping(mapping);
>  	}
>  
> +	if (bo_va->base.bo &&
> +	    bo_va->base.bo->tbo.resv == vm->root.base.bo->tbo.resv) {
> +		dma_fence_put(vm->mandatory_update);
> +		vm->mandatory_update = dma_fence_get(bo_va->last_pt_update);
> +	}
> +

A suggestion: This could be handled with fewer cryptic conditions
littered throughout the code by passing a struct dma_fence **fence
parameter to amdgpu_vm_bo_update. When this is called for updating VM
BOs (page tables or always valid) in handle_moved, pass
vm->mandatory_update as the fence pointer.

>  	return 0;
>  }
>  
> @@ -2586,7 +2591,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  			 vm->use_cpu_for_update ? "CPU" : "SDMA");
>  	WARN_ONCE((vm->use_cpu_for_update & !amdgpu_vm_is_large_bar(adev)),
>  		  "CPU update of VM recommended only for large BAR system\n");
> -	vm->last_dir_update = NULL;
> +	vm->mandatory_update = NULL;
>  
>  	flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
>  			AMDGPU_GEM_CREATE_VRAM_CLEARED;
> @@ -2692,7 +2697,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>  	}
>  
>  	amdgpu_vm_free_levels(&vm->root);
> -	dma_fence_put(vm->last_dir_update);
> +	dma_fence_put(vm->mandatory_update);
>  	for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
>  		amdgpu_vm_free_reserved_vmid(adev, vm, i);
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index c1accd1..63fa2e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -140,7 +140,7 @@ struct amdgpu_vm {
>  
>  	/* contains the page directory */
>  	struct amdgpu_vm_pt     root;
> -	struct dma_fence	*last_dir_update;
> +	struct dma_fence	*mandatory_update;

I'm not sure about this renaming. I don't find the new name clearer than
the old one. What's mandatory? The update, or syncing with the fence? I
think the name should reflect whatever completes when the fence signals.
I can see that last_dir_update is no longer accurate because it also
includes other VM-local (always valid) BOs. Maybe last_vm_update?

I'm thinking with the fence pointer idea above, maybe you could get rid
of this completely. Just return a fence from each amdgpu_vm_bo_update
call and also return that fence from amdgpu_vm_handle_moved. Then
amdgpu_bo_vm_update_pte can treat the fences from regular BOs, always
valid BOs and page tables all in the same way.

>  
>  	/* protecting freed */
>  	spinlock_t		freed_lock;
+Roger,

Roger, this change could also help GuoKai's issue(he needs to pass bo in 
first CS when enabling Per-VM-BOs).


Regards,

David Zhou


On 2017年09月08日 20:26, Christian König wrote:
> Marek this one will most likely fix your issues with always valid BOs 
> on Raven.
>
> Please give it a try when you have time.
>
> Thanks,
> Christian.
>
> Am 08.09.2017 um 14:18 schrieb Christian König:
>> From: Christian König <christian.koenig@amd.com>
>>
>> All users of a VM must always wait for updates with always
>> valid BOs to be completed.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 14 ++++++++++----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 ++++++++++-----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
>>   3 files changed, 21 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 8aa37e0..d6e66b7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -752,10 +752,6 @@ static int amdgpu_bo_vm_update_pte(struct 
>> amdgpu_cs_parser *p)
>>       if (r)
>>           return r;
>>   -    r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_dir_update);
>> -    if (r)
>> -        return r;
>> -
>>       r = amdgpu_vm_clear_freed(adev, vm, NULL);
>>       if (r)
>>           return r;
>> @@ -797,6 +793,10 @@ static int amdgpu_bo_vm_update_pte(struct 
>> amdgpu_cs_parser *p)
>>               if (bo_va == NULL)
>>                   continue;
>>   +            if (bo_va->base.bo->flags &
>> +                AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
>> +                continue;
>> +
>>               r = amdgpu_vm_bo_update(adev, bo_va, false);
>>               if (r)
>>                   return r;
>> @@ -810,6 +810,12 @@ static int amdgpu_bo_vm_update_pte(struct 
>> amdgpu_cs_parser *p)
>>       }
>>         r = amdgpu_vm_handle_moved(adev, vm, &p->job->sync);
>> +    if (r)
>> +        return r;
>> +
>> +    r = amdgpu_sync_fence(adev, &p->job->sync, vm->mandatory_update);
>> +    if (r)
>> +        return r;
>>         if (amdgpu_vm_debug && p->bo_list) {
>>           /* Invalidate all BOs to test for userspace bugs */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 55f1ecb..12c8a4c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1140,9 +1140,8 @@ static int amdgpu_vm_update_level(struct 
>> amdgpu_device *adev,
>>                   goto error_free;
>>                 amdgpu_bo_fence(parent->base.bo, fence, true);
>> -            dma_fence_put(vm->last_dir_update);
>> -            vm->last_dir_update = dma_fence_get(fence);
>> -            dma_fence_put(fence);
>> +            dma_fence_put(vm->mandatory_update);
>> +            vm->mandatory_update = fence;
>>           }
>>       }
>>   @@ -1803,6 +1802,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device 
>> *adev,
>>               trace_amdgpu_vm_bo_mapping(mapping);
>>       }
>>   +    if (bo_va->base.bo &&
>> +        bo_va->base.bo->tbo.resv == vm->root.base.bo->tbo.resv) {
>> +        dma_fence_put(vm->mandatory_update);
>> +        vm->mandatory_update = dma_fence_get(bo_va->last_pt_update);
>> +    }
>> +
>>       return 0;
>>   }
>>   @@ -2586,7 +2591,7 @@ int amdgpu_vm_init(struct amdgpu_device 
>> *adev, struct amdgpu_vm *vm,
>>                vm->use_cpu_for_update ? "CPU" : "SDMA");
>>       WARN_ONCE((vm->use_cpu_for_update & 
>> !amdgpu_vm_is_large_bar(adev)),
>>             "CPU update of VM recommended only for large BAR system\n");
>> -    vm->last_dir_update = NULL;
>> +    vm->mandatory_update = NULL;
>>         flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
>>               AMDGPU_GEM_CREATE_VRAM_CLEARED;
>> @@ -2692,7 +2697,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, 
>> struct amdgpu_vm *vm)
>>       }
>>         amdgpu_vm_free_levels(&vm->root);
>> -    dma_fence_put(vm->last_dir_update);
>> +    dma_fence_put(vm->mandatory_update);
>>       for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
>>           amdgpu_vm_free_reserved_vmid(adev, vm, i);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index c1accd1..63fa2e5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -140,7 +140,7 @@ struct amdgpu_vm {
>>         /* contains the page directory */
>>       struct amdgpu_vm_pt     root;
>> -    struct dma_fence    *last_dir_update;
>> +    struct dma_fence    *mandatory_update;
>>         /* protecting freed */
>>       spinlock_t        freed_lock;
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Thanks for the review, please see some comments inline.

Am 08.09.2017 um 23:12 schrieb Felix Kuehling:
> I think this change could use a little more explanation. I also have
> some ideas for simplifying the code and making it easier to understand.
> See comments inline.
>
> But it looks good to me otherwise. Reviewed-by: Felix Kuehling
> <Felix.Kuehling@amd.com>
>
> Regards,
>    Felix
>
>
> On 2017-09-08 08:18 AM, Christian König wrote:
>> From: Christian König <christian.koenig@amd.com>
>>
>> All users of a VM must always wait for updates with always
>> valid BOs to be completed.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 14 ++++++++++----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 ++++++++++-----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
>>   3 files changed, 21 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 8aa37e0..d6e66b7 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -752,10 +752,6 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>>   	if (r)
>>   		return r;
>>   
>> -	r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_dir_update);
>> -	if (r)
>> -		return r;
>> -
>>   	r = amdgpu_vm_clear_freed(adev, vm, NULL);
>>   	if (r)
>>   		return r;
>> @@ -797,6 +793,10 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>>   			if (bo_va == NULL)
>>   				continue;
>>   
>> +			if (bo_va->base.bo->flags &
>> +			    AMDGPU_GEM_CREATE_VM_ALWAYS_VALID)
>> +				continue;
> This looks like somewhat unrelated. If you don't make it a separate
> patch, at least mention it in the change description. Something like:
> Always-valid BOs' page table entries are updated by
> amdgpu_vm_handle_moved. Skip them in amdgpu_bo_vm_update_pte.

Sorry that was just a leftover from debugging and not intended to be 
comitted. Chunk is removed in V2.

>
>> +
>>   			r = amdgpu_vm_bo_update(adev, bo_va, false);
>>   			if (r)
>>   				return r;
>> @@ -810,6 +810,12 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>>   	}
>>   
>>   	r = amdgpu_vm_handle_moved(adev, vm, &p->job->sync);
>> +	if (r)
>> +		return r;
>> +
>> +	r = amdgpu_sync_fence(adev, &p->job->sync, vm->mandatory_update);
>> +	if (r)
>> +		return r;
>>   
>>   	if (amdgpu_vm_debug && p->bo_list) {
>>   		/* Invalidate all BOs to test for userspace bugs */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 55f1ecb..12c8a4c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -1140,9 +1140,8 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
>>   				goto error_free;
>>   
>>   			amdgpu_bo_fence(parent->base.bo, fence, true);
>> -			dma_fence_put(vm->last_dir_update);
>> -			vm->last_dir_update = dma_fence_get(fence);
>> -			dma_fence_put(fence);
>> +			dma_fence_put(vm->mandatory_update);
>> +			vm->mandatory_update = fence;
>>   		}
>>   	}
>>   
>> @@ -1803,6 +1802,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>>   			trace_amdgpu_vm_bo_mapping(mapping);
>>   	}
>>   
>> +	if (bo_va->base.bo &&
>> +	    bo_va->base.bo->tbo.resv == vm->root.base.bo->tbo.resv) {
>> +		dma_fence_put(vm->mandatory_update);
>> +		vm->mandatory_update = dma_fence_get(bo_va->last_pt_update);
>> +	}
>> +
> A suggestion: This could be handled with fewer cryptic conditions
> littered throughout the code by passing a struct dma_fence **fence
> parameter to amdgpu_vm_bo_update. When this is called for updating VM
> BOs (page tables or always valid) in handle_moved, pass
> vm->mandatory_update as the fence pointer.

That won't work. We need the fence both in vm->mandatory_update (or 
whatever we are calling it now) and bo_va->last_pt_update.

>
>>   	return 0;
>>   }
>>   
>> @@ -2586,7 +2591,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>   			 vm->use_cpu_for_update ? "CPU" : "SDMA");
>>   	WARN_ONCE((vm->use_cpu_for_update & !amdgpu_vm_is_large_bar(adev)),
>>   		  "CPU update of VM recommended only for large BAR system\n");
>> -	vm->last_dir_update = NULL;
>> +	vm->mandatory_update = NULL;
>>   
>>   	flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
>>   			AMDGPU_GEM_CREATE_VRAM_CLEARED;
>> @@ -2692,7 +2697,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>>   	}
>>   
>>   	amdgpu_vm_free_levels(&vm->root);
>> -	dma_fence_put(vm->last_dir_update);
>> +	dma_fence_put(vm->mandatory_update);
>>   	for (i = 0; i < AMDGPU_MAX_VMHUBS; i++)
>>   		amdgpu_vm_free_reserved_vmid(adev, vm, i);
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index c1accd1..63fa2e5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -140,7 +140,7 @@ struct amdgpu_vm {
>>   
>>   	/* contains the page directory */
>>   	struct amdgpu_vm_pt     root;
>> -	struct dma_fence	*last_dir_update;
>> +	struct dma_fence	*mandatory_update;
> I'm not sure about this renaming. I don't find the new name clearer than
> the old one. What's mandatory? The update, or syncing with the fence? I
> think the name should reflect whatever completes when the fence signals.
> I can see that last_dir_update is no longer accurate because it also
> includes other VM-local (always valid) BOs. Maybe last_vm_update?

I've just named it last_update, cause that it is an VM update should be 
obvious.

> I'm thinking with the fence pointer idea above, maybe you could get rid
> of this completely. Just return a fence from each amdgpu_vm_bo_update
> call and also return that fence from amdgpu_vm_handle_moved. Then
> amdgpu_bo_vm_update_pte can treat the fences from regular BOs, always
> valid BOs and page tables all in the same way.

That won't work at all.

See the problem is that we not only need to sync to those updates in the 
current submission, but also all future submissions.

That's why we should probably getting rid of passing the fences as 
parameters instead.

Please review V2 as well.

Thanks,
Christian.

>
>>   
>>   	/* protecting freed */
>>   	spinlock_t		freed_lock;