drm/amdgpu: optimize out a spin lock (v2)

Submitted by Xie, AlexBin on June 22, 2017, 2:45 a.m.

Details

Message ID 1498099554-31861-1-git-send-email-AlexBin.Xie@amd.com
State New
Headers show
Series "drm/amdgpu: optimize out a spin lock (v2)" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Xie, AlexBin June 22, 2017, 2:45 a.m.
Use atomic instead of spin lock.
v2: Adjust commit message

Signed-off-by: Alex Xie <AlexBin.Xie@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 110 +++++++++++++++++++----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   1 -
 3 files changed, 76 insertions(+), 40 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 7caf514..21d318b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1588,9 +1588,8 @@  struct amdgpu_device {
 
 	/* data for buffer migration throttling */
 	struct {
-		spinlock_t		lock;
-		s64			last_update_us;
-		s64			accum_us; /* accumulated microseconds */
+		atomic64_t		last_update_us;
+		atomic64_t		accum_us; /* accumulated microseconds */
 		u32			log2_max_MBps;
 	} mm_stats;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 82131d7..7b6f42e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -225,6 +225,9 @@  static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 	s64 time_us, increment_us;
 	u64 max_bytes;
 	u64 free_vram, total_vram, used_vram;
+	s64 old_update_us, head_time_us;
+	s64 accum_us;
+	s64 old_accum_us, head_accum_us;
 
 	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 	 * throttling.
@@ -242,47 +245,83 @@  static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 	used_vram = atomic64_read(&adev->vram_usage);
 	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
 
-	spin_lock(&adev->mm_stats.lock);
-
 	/* Increase the amount of accumulated us. */
-	time_us = ktime_to_us(ktime_get());
-	increment_us = time_us - adev->mm_stats.last_update_us;
-	adev->mm_stats.last_update_us = time_us;
-	adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
-                                      us_upper_bound);
-
-	/* This prevents the short period of low performance when the VRAM
-	 * usage is low and the driver is in debt or doesn't have enough
-	 * accumulated us to fill VRAM quickly.
-	 *
-	 * The situation can occur in these cases:
-	 * - a lot of VRAM is freed by userspace
-	 * - the presence of a big buffer causes a lot of evictions
-	 *   (solution: split buffers into smaller ones)
-	 *
-	 * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
-	 * accum_us to a positive number.
-	 */
-	if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
-		s64 min_us;
-
-		/* Be more aggresive on dGPUs. Try to fill a portion of free
-		 * VRAM now.
-		 */
-		if (!(adev->flags & AMD_IS_APU))
-			min_us = bytes_to_us(adev, free_vram / 4);
+	old_update_us = atomic64_read(&adev->mm_stats.last_update_us);
+	for (;;) {
+		time_us = ktime_to_us(ktime_get());
+		head_time_us = atomic64_cmpxchg(&adev->mm_stats.last_update_us,
+						old_update_us, time_us);
+
+		if (likely(head_time_us == old_update_us))
+			/*
+			 * No other task modified adev->mm_stats.last_update_us.
+			 * Update was successful.
+			 */
+			break;
 		else
-			min_us = 0; /* Reset accum_us on APUs. */
+			/* Another task modified the value after we read it.
+			 * A rare contention happens, let us retry.
+			 * In most case, one retry can do the job.
+			 * See function atomic64_add_unless as a similar idea.
+			 */
+			old_update_us = head_time_us;
+	}
+	increment_us = time_us - old_update_us;
+
+	old_accum_us = atomic64_read(&adev->mm_stats.accum_us);
+
+	for (;;) {
+		accum_us = min(old_accum_us + increment_us, us_upper_bound);
+
+		/* This prevents the short period of low performance when the
+		 * VRAM usage is low and the driver is in debt or doesn't have
+		 * enough accumulated us to fill VRAM quickly.
+		 *
+		 * The situation can occur in these cases:
+		 * - a lot of VRAM is freed by userspace
+		 * - the presence of a big buffer causes a lot of evictions
+		 *   (solution: split buffers into smaller ones)
+		 *
+		 * If 128 MB or 1/8th of VRAM is free, start filling it now by
+		 * setting accum_us to a positive number.
+		 */
+		if (free_vram >= 128 * 1024 * 1024 ||
+			free_vram >= total_vram / 8) {
+			s64 min_us;
+
+			/* Be more aggresive on dGPUs. Try to fill a portion of
+			 * free VRAM now.
+			 */
+			if (!(adev->flags & AMD_IS_APU))
+				min_us = bytes_to_us(adev, free_vram / 4);
+			else
+				min_us = 0; /* Reset accum_us on APUs. */
+
+			accum_us = max(min_us, accum_us);
+		}
+
+		head_accum_us = atomic64_cmpxchg(&adev->mm_stats.accum_us,
+							old_accum_us, accum_us);
 
-		adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+		if (likely(head_accum_us == old_accum_us))
+			/*
+			 * No other task modified adev->mm_stats.accum_us.
+			 * Update was successful.
+			 */
+			break;
+		else
+			/* Another task modified the value after we read it.
+			 * A rare contention happens, let us retry.
+			 * In most case, one retry can do the job.
+			 * See function atomic64_add_unless as a similar idea.
+			 */
+			old_accum_us = head_accum_us;
 	}
 
 	/* This returns 0 if the driver is in debt to disallow (optional)
 	 * buffer moves.
 	 */
-	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
-
-	spin_unlock(&adev->mm_stats.lock);
+	max_bytes = us_to_bytes(adev, accum_us);
 	return max_bytes;
 }
 
@@ -292,9 +331,8 @@  static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
  */
 void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes)
 {
-	spin_lock(&adev->mm_stats.lock);
-	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
-	spin_unlock(&adev->mm_stats.lock);
+	s64 i = bytes_to_us(adev, num_bytes);
+	atomic64_sub(i, &adev->mm_stats.accum_us);
 }
 
 static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ff90f78..9e9d592 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2117,7 +2117,6 @@  int amdgpu_device_init(struct amdgpu_device *adev,
 	spin_lock_init(&adev->didt_idx_lock);
 	spin_lock_init(&adev->gc_cac_idx_lock);
 	spin_lock_init(&adev->audio_endpt_idx_lock);
-	spin_lock_init(&adev->mm_stats.lock);
 
 	INIT_LIST_HEAD(&adev->shadow_list);
 	mutex_init(&adev->shadow_list_lock);

Comments

On 22/06/17 11:45 AM, Alex Xie wrote:
> Use atomic instead of spin lock.
> v2: Adjust commit message
> 
> Signed-off-by: Alex Xie <AlexBin.Xie@amd.com>

The shortlog should be more specific, e.g. something like "drm/amdgpu:
Drop spinlock from mm_stats".

It's important for the Git commit shortlog to be as specific as possible
because in many cases only the shortlogs of commits are visible.


I'll leave it to others to judge whether the conversion from spinlock to
atomics is safe / an overall win.


>  	/* This returns 0 if the driver is in debt to disallow (optional)
>  	 * buffer moves.
>  	 */
> -	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
> -
> -	spin_unlock(&adev->mm_stats.lock);
> +	max_bytes = us_to_bytes(adev, accum_us);
>  	return max_bytes;
>  }

You can just make this

	return us_to_bytes(adev, accum_us);

and remove the max_bytes local.