[6/6] drm/amdgpu: keep the MMU lock until the update ends v2

Submitted by Christian König on Sept. 6, 2017, 12:34 p.m.

Details

Message ID 1504701283-3317-6-git-send-email-deathsimple@vodafone.de
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Christian König Sept. 6, 2017, 12:34 p.m.
From: Christian König <christian.koenig@amd.com>

This is quite controversial because it adds another lock which is held during
page table updates, but I don't see much other option.

v2: allow multiple updates to be in flight at the same time

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> (v1)
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 108 +++++++++++++++++++++++++++++++--
 1 file changed, 104 insertions(+), 4 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 459f02e..333521e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -52,6 +52,10 @@  struct amdgpu_mn {
 	/* objects protected by lock */
 	struct rw_semaphore	lock;
 	struct rb_root		objects;
+
+	/* protected by task_lock */
+	spinlock_t		task_lock;
+	DECLARE_HASHTABLE(task_hash, 7);
 };
 
 struct amdgpu_mn_node {
@@ -59,6 +63,12 @@  struct amdgpu_mn_node {
 	struct list_head		bos;
 };
 
+struct amdgpu_mn_task {
+	struct hlist_node	node;
+	struct task_struct	*task;
+	unsigned		recursion;
+};
+
 /**
  * amdgpu_mn_destroy - destroy the rmn
  *
@@ -107,6 +117,75 @@  static void amdgpu_mn_release(struct mmu_notifier *mn,
 }
 
 /**
+ * amdgpu_mn_read_lock - take the rmn read lock
+ *
+ * @rmn: our notifier
+ *
+ * Take the rmn read side lock.
+ */
+static void amdgpu_mn_read_lock(struct amdgpu_mn *rmn)
+{
+	struct amdgpu_mn_task *task;
+
+	spin_lock(&rmn->task_lock);
+	hash_for_each_possible(rmn->task_hash, task, node,
+			       (unsigned long)current) {
+		if (task->task != current)
+			continue;
+
+		spin_unlock(&rmn->task_lock);
+		++task->recursion;
+		return;
+	}
+	spin_unlock(&rmn->task_lock);
+
+	task = kmalloc(sizeof(*task), GFP_KERNEL);
+	task->task = current;
+	task->recursion = 1;
+
+	down_read(&rmn->lock);
+
+	spin_lock(&rmn->task_lock);
+	hash_add(rmn->task_hash, &task->node, (unsigned long)current);
+	spin_unlock(&rmn->task_lock);
+}
+
+/**
+ * amdgpu_mn_read_unlock - drop the rmn read lock
+ *
+ * @rmn: our notifier
+ *
+ * Drop the rmn read side lock.
+ */
+static void amdgpu_mn_read_unlock(struct amdgpu_mn *rmn)
+{
+	struct amdgpu_mn_task *task;
+
+	spin_lock(&rmn->task_lock);
+	hash_for_each_possible(rmn->task_hash, task, node,
+			       (unsigned long)current) {
+		if (task->task != current)
+			continue;
+
+		spin_unlock(&rmn->task_lock);
+		if (--task->recursion) {
+			return;
+		}
+
+		spin_lock(&rmn->task_lock);
+		hash_del(&task->node);
+		spin_unlock(&rmn->task_lock);
+		kfree(task);
+
+		up_read(&rmn->lock);
+		return;
+	}
+	spin_unlock(&rmn->task_lock);
+
+	BUG();
+}
+
+/**
  * amdgpu_mn_invalidate_node - unmap all BOs of a node
  *
  * @node: the node with the BOs to unmap
@@ -152,7 +231,7 @@  static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn,
 	struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
 	struct interval_tree_node *it;
 
-	down_read(&rmn->lock);
+	amdgpu_mn_read_lock(rmn);
 
 	it = interval_tree_iter_first(&rmn->objects, address, address);
 	if (it) {
@@ -162,7 +241,7 @@  static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn,
 		amdgpu_mn_invalidate_node(node, address, address);
 	}
 
-	up_read(&rmn->lock);
+	amdgpu_mn_read_unlock(rmn);
 }
 
 /**
@@ -187,7 +266,7 @@  static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
 	/* notification is exclusive, but interval is inclusive */
 	end -= 1;
 
-	down_read(&rmn->lock);
+	amdgpu_mn_read_lock(rmn);
 
 	it = interval_tree_iter_first(&rmn->objects, start, end);
 	while (it) {
@@ -198,14 +277,33 @@  static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
 
 		amdgpu_mn_invalidate_node(node, start, end);
 	}
+}
+
+/**
+ * amdgpu_mn_invalidate_range_end - callback to notify about mm change
+ *
+ * @mn: our notifier
+ * @mm: the mm this callback is about
+ * @start: start of updated range
+ * @end: end of updated range
+ *
+ * Release the lock again to allow new command submissions.
+ */
+static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn,
+					   struct mm_struct *mm,
+					   unsigned long start,
+					   unsigned long end)
+{
+	struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
 
-	up_read(&rmn->lock);
+	amdgpu_mn_read_unlock(rmn);
 }
 
 static const struct mmu_notifier_ops amdgpu_mn_ops = {
 	.release = amdgpu_mn_release,
 	.invalidate_page = amdgpu_mn_invalidate_page,
 	.invalidate_range_start = amdgpu_mn_invalidate_range_start,
+	.invalidate_range_end = amdgpu_mn_invalidate_range_end,
 };
 
 /**
@@ -242,6 +340,8 @@  struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
 	rmn->mn.ops = &amdgpu_mn_ops;
 	init_rwsem(&rmn->lock);
 	rmn->objects = RB_ROOT;
+	spin_lock_init(&rmn->task_lock);
+	hash_init(rmn->task_hash);
 
 	r = __mmu_notifier_register(&rmn->mn, mm);
 	if (r)

Comments

I think all this does (compared to v1) is, avoid taking the same read
lock twice, but keep track of how many readers there are outside the
rw_semaphore to avoid recursive lock warnings.

I don't really understand why you need to count readers per task (using
the task hashtable in the amdgpu_mn struct). The amdgpu_mn structure is
already per process. Each instance should only be used by one process
(as identified by its mm_struct). So the counting of readers could be
done with a simple atomic counter in the amdgpu_mn struct:

lock:

    if (atomic_inc_return(&mn->read_count) == 1)
        down_read(&mn->lock);

unlock:

    if (atomic_dec_return(&mn->read_count) == 0)
        up_read(&mn->lock);

Am I missing something? Even if there are multiple threads in the
process, it shouldn't matter because they all need to use the same read
lock anyway and as long as any thread holds the read lock, any other
thread is safe to use it. Maybe there is a concern about locking a
semaphore in one thread and unlocking it on a different thread. I know
that mutexes have quite strict semantics in this regard. But I think
semaphores are less strict.

Regards,
  Felix


On 2017-09-06 08:34 AM, Christian König wrote:
> From: Christian König <christian.koenig@amd.com>
>
> This is quite controversial because it adds another lock which is held during
> page table updates, but I don't see much other option.
>
> v2: allow multiple updates to be in flight at the same time
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> (v1)
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 108 +++++++++++++++++++++++++++++++--
>  1 file changed, 104 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
> index 459f02e..333521e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
> @@ -52,6 +52,10 @@ struct amdgpu_mn {
>  	/* objects protected by lock */
>  	struct rw_semaphore	lock;
>  	struct rb_root		objects;
> +
> +	/* protected by task_lock */
> +	spinlock_t		task_lock;
> +	DECLARE_HASHTABLE(task_hash, 7);
>  };
>  
>  struct amdgpu_mn_node {
> @@ -59,6 +63,12 @@ struct amdgpu_mn_node {
>  	struct list_head		bos;
>  };
>  
> +struct amdgpu_mn_task {
> +	struct hlist_node	node;
> +	struct task_struct	*task;
> +	unsigned		recursion;
> +};
> +
>  /**
>   * amdgpu_mn_destroy - destroy the rmn
>   *
> @@ -107,6 +117,75 @@ static void amdgpu_mn_release(struct mmu_notifier *mn,
>  }
>  
>  /**
> + * amdgpu_mn_read_lock - take the rmn read lock
> + *
> + * @rmn: our notifier
> + *
> + * Take the rmn read side lock.
> + */
> +static void amdgpu_mn_read_lock(struct amdgpu_mn *rmn)
> +{
> +	struct amdgpu_mn_task *task;
> +
> +	spin_lock(&rmn->task_lock);
> +	hash_for_each_possible(rmn->task_hash, task, node,
> +			       (unsigned long)current) {
> +		if (task->task != current)
> +			continue;
> +
> +		spin_unlock(&rmn->task_lock);
> +		++task->recursion;
> +		return;
> +	}
> +	spin_unlock(&rmn->task_lock);
> +
> +	task = kmalloc(sizeof(*task), GFP_KERNEL);
> +	task->task = current;
> +	task->recursion = 1;
> +
> +	down_read(&rmn->lock);
> +
> +	spin_lock(&rmn->task_lock);
> +	hash_add(rmn->task_hash, &task->node, (unsigned long)current);
> +	spin_unlock(&rmn->task_lock);
> +}
> +
> +/**
> + * amdgpu_mn_read_unlock - drop the rmn read lock
> + *
> + * @rmn: our notifier
> + *
> + * Drop the rmn read side lock.
> + */
> +static void amdgpu_mn_read_unlock(struct amdgpu_mn *rmn)
> +{
> +	struct amdgpu_mn_task *task;
> +
> +	spin_lock(&rmn->task_lock);
> +	hash_for_each_possible(rmn->task_hash, task, node,
> +			       (unsigned long)current) {
> +		if (task->task != current)
> +			continue;
> +
> +		spin_unlock(&rmn->task_lock);
> +		if (--task->recursion) {
> +			return;
> +		}
> +
> +		spin_lock(&rmn->task_lock);
> +		hash_del(&task->node);
> +		spin_unlock(&rmn->task_lock);
> +		kfree(task);
> +
> +		up_read(&rmn->lock);
> +		return;
> +	}
> +	spin_unlock(&rmn->task_lock);
> +
> +	BUG();
> +}
> +
> +/**
>   * amdgpu_mn_invalidate_node - unmap all BOs of a node
>   *
>   * @node: the node with the BOs to unmap
> @@ -152,7 +231,7 @@ static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn,
>  	struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
>  	struct interval_tree_node *it;
>  
> -	down_read(&rmn->lock);
> +	amdgpu_mn_read_lock(rmn);
>  
>  	it = interval_tree_iter_first(&rmn->objects, address, address);
>  	if (it) {
> @@ -162,7 +241,7 @@ static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn,
>  		amdgpu_mn_invalidate_node(node, address, address);
>  	}
>  
> -	up_read(&rmn->lock);
> +	amdgpu_mn_read_unlock(rmn);
>  }
>  
>  /**
> @@ -187,7 +266,7 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
>  	/* notification is exclusive, but interval is inclusive */
>  	end -= 1;
>  
> -	down_read(&rmn->lock);
> +	amdgpu_mn_read_lock(rmn);
>  
>  	it = interval_tree_iter_first(&rmn->objects, start, end);
>  	while (it) {
> @@ -198,14 +277,33 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
>  
>  		amdgpu_mn_invalidate_node(node, start, end);
>  	}
> +}
> +
> +/**
> + * amdgpu_mn_invalidate_range_end - callback to notify about mm change
> + *
> + * @mn: our notifier
> + * @mn: the mm this callback is about
> + * @start: start of updated range
> + * @end: end of updated range
> + *
> + * Release the lock again to allow new command submissions.
> + */
> +static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn,
> +					   struct mm_struct *mm,
> +					   unsigned long start,
> +					   unsigned long end)
> +{
> +	struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
>  
> -	up_read(&rmn->lock);
> +	amdgpu_mn_read_unlock(rmn);
>  }
>  
>  static const struct mmu_notifier_ops amdgpu_mn_ops = {
>  	.release = amdgpu_mn_release,
>  	.invalidate_page = amdgpu_mn_invalidate_page,
>  	.invalidate_range_start = amdgpu_mn_invalidate_range_start,
> +	.invalidate_range_end = amdgpu_mn_invalidate_range_end,
>  };
>  
>  /**
> @@ -242,6 +340,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
>  	rmn->mn.ops = &amdgpu_mn_ops;
>  	init_rwsem(&rmn->lock);
>  	rmn->objects = RB_ROOT;
> +	spin_lock_init(&rmn->task_lock);
> +	hash_init(rmn->task_hash);
>  
>  	r = __mmu_notifier_register(&rmn->mn, mm);
>  	if (r)
> I think all this does (compared to v1) is, avoid taking the same read
> lock twice, but keep track of how many readers there are outside the
> rw_semaphore to avoid recursive lock warnings.
Well the primary problem is that taking a read side twice in the same 
thread can cause a deadlock and is forbidden. That's what lockdep is 
complaining about.

> Am I missing something?
When the first thread is waiting for the lock to be acquired other 
threads would just continue without waiting for the write side to end.

The alternative would be to use recursion counter + another lock + non 
owner release.

> Maybe there is a concern about locking a semaphore in one thread and unlocking it on a different thread.
It is discouraged, but we could use 
down_read_non_owner()/up_read_non_owner() for this.

I'm going to give that a try,
Christian.

Am 06.09.2017 um 18:00 schrieb Felix Kuehling:
> I think all this does (compared to v1) is, avoid taking the same read
> lock twice, but keep track of how many readers there are outside the
> rw_semaphore to avoid recursive lock warnings.
>
> I don't really understand why you need to count readers per task (using
> the task hashtable in the amdgpu_mn struct). The amdgpu_mn structure is
> already per process. Each instance should only be used by one process
> (as identified by its mm_struct). So the counting of readers could be
> done with a simple atomic counter in the amdgpu_mn struct:
>
> lock:
>
>      if (atomic_inc_return(mn->read_count) == 1)
>          down_read(&mn->lock);
>
> unlock:
>
>      if (atomic_dec_return(mn->read_count) == 0)
>          up_read(&mn->lock);
>
> Am I missing something? Even if there are multiple threads in the
> process, it shouldn't matter because they all need to use the same read
> lock anyway and as long as any thread holds the read lock, any other
> thread is safe to use it. Maybe there is a concern about locking a
> semaphore in one thread and unlocking it on a different thread. I know
> that mutexes have quite strict semantics in this regard. But I think
> semaphores are less strict.
>
> Regards,
>    Felix
>
>
> On 2017-09-06 08:34 AM, Christian König wrote:
>> From: Christian König <christian.koenig@amd.com>
>>
>> This is quite controversial because it adds another lock which is held during
>> page table updates, but I don't see much other option.
>>
>> v2: allow multiple updates to be in flight at the same time
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> (v1)
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 108 +++++++++++++++++++++++++++++++--
>>   1 file changed, 104 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
>> index 459f02e..333521e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
>> @@ -52,6 +52,10 @@ struct amdgpu_mn {
>>   	/* objects protected by lock */
>>   	struct rw_semaphore	lock;
>>   	struct rb_root		objects;
>> +
>> +	/* protected by task_lock */
>> +	spinlock_t		task_lock;
>> +	DECLARE_HASHTABLE(task_hash, 7);
>>   };
>>   
>>   struct amdgpu_mn_node {
>> @@ -59,6 +63,12 @@ struct amdgpu_mn_node {
>>   	struct list_head		bos;
>>   };
>>   
>> +struct amdgpu_mn_task {
>> +	struct hlist_node	node;
>> +	struct task_struct	*task;
>> +	unsigned		recursion;
>> +};
>> +
>>   /**
>>    * amdgpu_mn_destroy - destroy the rmn
>>    *
>> @@ -107,6 +117,75 @@ static void amdgpu_mn_release(struct mmu_notifier *mn,
>>   }
>>   
>>   /**
>> + * amdgpu_mn_read_lock - take the rmn read lock
>> + *
>> + * @rmn: our notifier
>> + *
>> + * Take the rmn read side lock.
>> + */
>> +static void amdgpu_mn_read_lock(struct amdgpu_mn *rmn)
>> +{
>> +	struct amdgpu_mn_task *task;
>> +
>> +	spin_lock(&rmn->task_lock);
>> +	hash_for_each_possible(rmn->task_hash, task, node,
>> +			       (unsigned long)current) {
>> +		if (task->task != current)
>> +			continue;
>> +
>> +		spin_unlock(&rmn->task_lock);
>> +		++task->recursion;
>> +		return;
>> +	}
>> +	spin_unlock(&rmn->task_lock);
>> +
>> +	task = kmalloc(sizeof(*task), GFP_KERNEL);
>> +	task->task = current;
>> +	task->recursion = 1;
>> +
>> +	down_read(&rmn->lock);
>> +
>> +	spin_lock(&rmn->task_lock);
>> +	hash_add(rmn->task_hash, &task->node, (unsigned long)current);
>> +	spin_unlock(&rmn->task_lock);
>> +}
>> +
>> +/**
>> + * amdgpu_mn_read_unlock - drop the rmn read lock
>> + *
>> + * @rmn: our notifier
>> + *
>> + * Drop the rmn read side lock.
>> + */
>> +static void amdgpu_mn_read_unlock(struct amdgpu_mn *rmn)
>> +{
>> +	struct amdgpu_mn_task *task;
>> +
>> +	spin_lock(&rmn->task_lock);
>> +	hash_for_each_possible(rmn->task_hash, task, node,
>> +			       (unsigned long)current) {
>> +		if (task->task != current)
>> +			continue;
>> +
>> +		spin_unlock(&rmn->task_lock);
>> +		if (--task->recursion) {
>> +			return;
>> +		}
>> +
>> +		spin_lock(&rmn->task_lock);
>> +		hash_del(&task->node);
>> +		spin_unlock(&rmn->task_lock);
>> +		kfree(task);
>> +
>> +		up_read(&rmn->lock);
>> +		return;
>> +	}
>> +	spin_unlock(&rmn->task_lock);
>> +
>> +	BUG();
>> +}
>> +
>> +/**
>>    * amdgpu_mn_invalidate_node - unmap all BOs of a node
>>    *
>>    * @node: the node with the BOs to unmap
>> @@ -152,7 +231,7 @@ static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn,
>>   	struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
>>   	struct interval_tree_node *it;
>>   
>> -	down_read(&rmn->lock);
>> +	amdgpu_mn_read_lock(rmn);
>>   
>>   	it = interval_tree_iter_first(&rmn->objects, address, address);
>>   	if (it) {
>> @@ -162,7 +241,7 @@ static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn,
>>   		amdgpu_mn_invalidate_node(node, address, address);
>>   	}
>>   
>> -	up_read(&rmn->lock);
>> +	amdgpu_mn_read_unlock(rmn);
>>   }
>>   
>>   /**
>> @@ -187,7 +266,7 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
>>   	/* notification is exclusive, but interval is inclusive */
>>   	end -= 1;
>>   
>> -	down_read(&rmn->lock);
>> +	amdgpu_mn_read_lock(rmn);
>>   
>>   	it = interval_tree_iter_first(&rmn->objects, start, end);
>>   	while (it) {
>> @@ -198,14 +277,33 @@ static void amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
>>   
>>   		amdgpu_mn_invalidate_node(node, start, end);
>>   	}
>> +}
>> +
>> +/**
>> + * amdgpu_mn_invalidate_range_end - callback to notify about mm change
>> + *
>> + * @mn: our notifier
>> + * @mn: the mm this callback is about
>> + * @start: start of updated range
>> + * @end: end of updated range
>> + *
>> + * Release the lock again to allow new command submissions.
>> + */
>> +static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn,
>> +					   struct mm_struct *mm,
>> +					   unsigned long start,
>> +					   unsigned long end)
>> +{
>> +	struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
>>   
>> -	up_read(&rmn->lock);
>> +	amdgpu_mn_read_unlock(rmn);
>>   }
>>   
>>   static const struct mmu_notifier_ops amdgpu_mn_ops = {
>>   	.release = amdgpu_mn_release,
>>   	.invalidate_page = amdgpu_mn_invalidate_page,
>>   	.invalidate_range_start = amdgpu_mn_invalidate_range_start,
>> +	.invalidate_range_end = amdgpu_mn_invalidate_range_end,
>>   };
>>   
>>   /**
>> @@ -242,6 +340,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev)
>>   	rmn->mn.ops = &amdgpu_mn_ops;
>>   	init_rwsem(&rmn->lock);
>>   	rmn->objects = RB_ROOT;
>> +	spin_lock_init(&rmn->task_lock);
>> +	hash_init(rmn->task_hash);
>>   
>>   	r = __mmu_notifier_register(&rmn->mn, mm);
>>   	if (r)
On 2017-09-06 02:42 PM, Christian König wrote:
>> I think all this does (compared to v1) is, avoid taking the same read
>> lock twice, but keep track of how many readers there are outside the
>> rw_semaphore to avoid recursive lock warnings.
> Well the primary problem is that taking a read side twice in the same
> thread can cause a deadlock and is forbidden. That's what lockdep is
> complaining about.
>
>> Am I missing something?
> When the first thread is waiting for the lock to be acquired other
> threads would just continue without waiting for the write side to end.

Yeah, OK, that was pretty obvious. So we still need a spinlock ...

>
> The alternative would be to use recursion counter + another lock + non
> owner release.
>
>> Maybe there is a concern about locking a semaphore in one thread and
>> unlocking it on a different thread.
> It is discouraged, but we could use
> down_read_non_owner()/up_read_non_owner() for this.

Sounds good.

rwsem.h has a comment that the proper abstraction to use here would be a
completion. If you want to avoid a hack, a completion could do the job.
Instead of holding the rwsem until the range_end notifier, we could
signal a completion in range_end (if it's the last range_end).

The writers would have to wait for the completion after taking the write
side lock before proceeding.

I'm not quite sure about how to do that without race conditions or
deadlocks between the lock and the completion, though.

Regards,
  Felix

>
> I'm going to give that a try,
> Christian.
>
> Am 06.09.2017 um 18:00 schrieb Felix Kuehling:
>> I think all this does (compared to v1) is, avoid taking the same read
>> lock twice, but keep track of how many readers there are outside the
>> rw_semaphore to avoid recursive lock warnings.
>>
>> I don't really understand why you need to count readers per task (using
>> the task hashtable in the amdgpu_mn struct). The amdgpu_mn structure is
>> already per process. Each instance should only be used by one process
>> (as identified by its mm_struct). So the counting of readers could be
>> done with a simple atomic counter in the amdgpu_mn struct:
>>
>> lock:
>>
>>      if (atomic_inc_return(mn->read_count) == 1)
>>          down_read(&mn->lock);
>>
>> unlock:
>>
>>      if (atomic_dec_return(mn->read_count) == 0)
>>          up_read(&mn->lock);
>>
>> Am I missing something? Even if there are multiple threads in the
>> process, it shouldn't matter because they all need to use the same read
>> lock anyway and as long as any thread holds the read lock, any other
>> thread is safe to use it. Maybe there is a concern about locking a
>> semaphore in one thread and unlocking it on a different thread. I know
>> that mutexes have quite strict semantics in this regard. But I think
>> semaphores are less strict.
>>
>> Regards,
>>    Felix
>>
>>
>> On 2017-09-06 08:34 AM, Christian König wrote:
>>> From: Christian König <christian.koenig@amd.com>
>>>
>>> This is quite controversial because it adds another lock which is
>>> held during
>>> page table updates, but I don't see much other option.
>>>
>>> v2: allow multiple updates to be in flight at the same time
>>>
>>> Signed-off-by: Christian König <christian.koenig@amd.com>
>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> (v1)
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 108
>>> +++++++++++++++++++++++++++++++--
>>>   1 file changed, 104 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
>>> index 459f02e..333521e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
>>> @@ -52,6 +52,10 @@ struct amdgpu_mn {
>>>       /* objects protected by lock */
>>>       struct rw_semaphore    lock;
>>>       struct rb_root        objects;
>>> +
>>> +    /* protected by task_lock */
>>> +    spinlock_t        task_lock;
>>> +    DECLARE_HASHTABLE(task_hash, 7);
>>>   };
>>>     struct amdgpu_mn_node {
>>> @@ -59,6 +63,12 @@ struct amdgpu_mn_node {
>>>       struct list_head        bos;
>>>   };
>>>   +struct amdgpu_mn_task {
>>> +    struct hlist_node    node;
>>> +    struct task_struct    *task;
>>> +    unsigned        recursion;
>>> +};
>>> +
>>>   /**
>>>    * amdgpu_mn_destroy - destroy the rmn
>>>    *
>>> @@ -107,6 +117,75 @@ static void amdgpu_mn_release(struct
>>> mmu_notifier *mn,
>>>   }
>>>     /**
>>> + * amdgpu_mn_read_lock - take the rmn read lock
>>> + *
>>> + * @rmn: our notifier
>>> + *
>>> + * Take the rmn read side lock.
>>> + */
>>> +static void amdgpu_mn_read_lock(struct amdgpu_mn *rmn)
>>> +{
>>> +    struct amdgpu_mn_task *task;
>>> +
>>> +    spin_lock(&rmn->task_lock);
>>> +    hash_for_each_possible(rmn->task_hash, task, node,
>>> +                   (unsigned long)current) {
>>> +        if (task->task != current)
>>> +            continue;
>>> +
>>> +        spin_unlock(&rmn->task_lock);
>>> +        ++task->recursion;
>>> +        return;
>>> +    }
>>> +    spin_unlock(&rmn->task_lock);
>>> +
>>> +    task = kmalloc(sizeof(*task), GFP_KERNEL);
>>> +    task->task = current;
>>> +    task->recursion = 1;
>>> +
>>> +    down_read(&rmn->lock);
>>> +
>>> +    spin_lock(&rmn->task_lock);
>>> +    hash_add(rmn->task_hash, &task->node, (unsigned long)current);
>>> +    spin_unlock(&rmn->task_lock);
>>> +}
>>> +
>>> +/**
>>> + * amdgpu_mn_read_unlock - drop the rmn read lock
>>> + *
>>> + * @rmn: our notifier
>>> + *
>>> + * Drop the rmn read side lock.
>>> + */
>>> +static void amdgpu_mn_read_unlock(struct amdgpu_mn *rmn)
>>> +{
>>> +    struct amdgpu_mn_task *task;
>>> +
>>> +    spin_lock(&rmn->task_lock);
>>> +    hash_for_each_possible(rmn->task_hash, task, node,
>>> +                   (unsigned long)current) {
>>> +        if (task->task != current)
>>> +            continue;
>>> +
>>> +        spin_unlock(&rmn->task_lock);
>>> +        if (--task->recursion) {
>>> +            return;
>>> +        }
>>> +
>>> +        spin_lock(&rmn->task_lock);
>>> +        hash_del(&task->node);
>>> +        spin_unlock(&rmn->task_lock);
>>> +        kfree(task);
>>> +
>>> +        up_read(&rmn->lock);
>>> +        return;
>>> +    }
>>> +    spin_unlock(&rmn->task_lock);
>>> +
>>> +    BUG();
>>> +}
>>> +
>>> +/**
>>>    * amdgpu_mn_invalidate_node - unmap all BOs of a node
>>>    *
>>>    * @node: the node with the BOs to unmap
>>> @@ -152,7 +231,7 @@ static void amdgpu_mn_invalidate_page(struct
>>> mmu_notifier *mn,
>>>       struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
>>>       struct interval_tree_node *it;
>>>   -    down_read(&rmn->lock);
>>> +    amdgpu_mn_read_lock(rmn);
>>>         it = interval_tree_iter_first(&rmn->objects, address, address);
>>>       if (it) {
>>> @@ -162,7 +241,7 @@ static void amdgpu_mn_invalidate_page(struct
>>> mmu_notifier *mn,
>>>           amdgpu_mn_invalidate_node(node, address, address);
>>>       }
>>>   -    up_read(&rmn->lock);
>>> +    amdgpu_mn_read_unlock(rmn);
>>>   }
>>>     /**
>>> @@ -187,7 +266,7 @@ static void
>>> amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
>>>       /* notification is exclusive, but interval is inclusive */
>>>       end -= 1;
>>>   -    down_read(&rmn->lock);
>>> +    amdgpu_mn_read_lock(rmn);
>>>         it = interval_tree_iter_first(&rmn->objects, start, end);
>>>       while (it) {
>>> @@ -198,14 +277,33 @@ static void
>>> amdgpu_mn_invalidate_range_start(struct mmu_notifier *mn,
>>>             amdgpu_mn_invalidate_node(node, start, end);
>>>       }
>>> +}
>>> +
>>> +/**
>>> + * amdgpu_mn_invalidate_range_end - callback to notify about mm change
>>> + *
>>> + * @mn: our notifier
>>> + * @mn: the mm this callback is about
>>> + * @start: start of updated range
>>> + * @end: end of updated range
>>> + *
>>> + * Release the lock again to allow new command submissions.
>>> + */
>>> +static void amdgpu_mn_invalidate_range_end(struct mmu_notifier *mn,
>>> +                       struct mm_struct *mm,
>>> +                       unsigned long start,
>>> +                       unsigned long end)
>>> +{
>>> +    struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
>>>   -    up_read(&rmn->lock);
>>> +    amdgpu_mn_read_unlock(rmn);
>>>   }
>>>     static const struct mmu_notifier_ops amdgpu_mn_ops = {
>>>       .release = amdgpu_mn_release,
>>>       .invalidate_page = amdgpu_mn_invalidate_page,
>>>       .invalidate_range_start = amdgpu_mn_invalidate_range_start,
>>> +    .invalidate_range_end = amdgpu_mn_invalidate_range_end,
>>>   };
>>>     /**
>>> @@ -242,6 +340,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct
>>> amdgpu_device *adev)
>>>       rmn->mn.ops = &amdgpu_mn_ops;
>>>       init_rwsem(&rmn->lock);
>>>       rmn->objects = RB_ROOT;
>>> +    spin_lock_init(&rmn->task_lock);
>>> +    hash_init(rmn->task_hash);
>>>         r = __mmu_notifier_register(&rmn->mn, mm);
>>>       if (r)
>
>