[4/4] drm/msm: Optimize adreno_show_object()

Submitted by Sharat Masetty on Nov. 20, 2018, 11:37 a.m.

Details

Message ID 1542713851-14375-4-git-send-email-smasetty@codeaurora.org
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Freedreno

Not browsing as part of any series.

Commit Message

Sharat Masetty Nov. 20, 2018, 11:37 a.m.
When the userspace tries to read the crashstate dump, the read side
implementation in the driver currently ascii85 encodes all the binary
buffers and it does this each time the read system call is called.
A userspace tool like cat typically does a page by page read and the
number of read calls depends on the size of the data captured by the
driver. This is certainly not desirable and does not scale well with
large captures.

This patch encodes the buffer only once in the read path. With this there
is an immediate >10X speed improvement in crashstate save time.

Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>
---
Changes from v1:
	Addressed comments from Jordan Crouse

 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 80 ++++++++++++++++++++++++---------
 drivers/gpu/drm/msm/msm_gpu.h           |  1 +
 2 files changed, 60 insertions(+), 21 deletions(-)

--
1.9.1

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index bbf8d3e..7749967 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -441,11 +441,15 @@  void adreno_gpu_state_destroy(struct msm_gpu_state *state)
 {
 	int i;

-	for (i = 0; i < ARRAY_SIZE(state->ring); i++)
+	for (i = 0; i < ARRAY_SIZE(state->ring); i++) {
 		kvfree(state->ring[i].bo.data);
+		kvfree(state->ring[i].bo.encoded);
+	}

-	for (i = 0; state->bos && i < state->nr_bos; i++)
+	for (i = 0; state->bos && i < state->nr_bos; i++) {
 		kvfree(state->bos[i].data);
+		kvfree(state->bos[i].encoded);
+	}

 	kfree(state->bos);
 	kfree(state->comm);
@@ -472,34 +476,70 @@  int adreno_gpu_state_put(struct msm_gpu_state *state)

 #if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)

-static void adreno_show_object(struct drm_printer *p, u32 *ptr, int len)
+static char *adreno_gpu_ascii85_encode(u32 *src, size_t len)
 {
-	char out[ASCII85_BUFSZ];
-	long l, datalen, i;
+	void *buf;
+	size_t buf_itr = 0;
+	long i, l;

-	if (!ptr || !len)
-		return;
+	if (!len)
+		return NULL;
+
+	l = ascii85_encode_len(len);

 	/*
-	 * Only dump the non-zero part of the buffer - rarely will any data
-	 * completely fill the entire allocated size of the buffer
+	 * ascii85 outputs either a 5 byte string or a 1 byte string. So we
+	 * account for the worst case of 5 bytes per dword plus the 1 for '\0'
 	 */
-	for (datalen = 0, i = 0; i < len >> 2; i++) {
-		if (ptr[i])
-			datalen = (i << 2) + 1;
+	buf = kvmalloc((l * 5) + 1, GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	for (i = 0; i < l; i++) {
+		ascii85_encode(src[i], buf + buf_itr);
+
+		if (src[i] == 0)
+			buf_itr += 1;
+		else
+			buf_itr += 5;
 	}

-	/* Skip printing the object if it is empty */
-	if (datalen == 0)
+	return buf;
+}
+
+static void adreno_show_object(struct drm_printer *p,
+		struct msm_gpu_state_bo *bo)
+{
+	if ((!bo->data && !bo->encoded) || !bo->size)
 		return;

-	l = ascii85_encode_len(datalen);
+	if (!bo->encoded) {
+		long datalen, i;
+		u32 *buf = bo->data;
+
+		/*
+		 * Only dump the non-zero part of the buffer - rarely will
+		 * any data completely fill the entire allocated size of
+		 * the buffer.
+		 */
+		for (datalen = 0, i = 0; i < (bo->size) >> 2; i++) {
+			if (buf[i])
+				datalen = ((i + 1) << 2);
+		}
+
+		bo->encoded = adreno_gpu_ascii85_encode(buf, datalen);
+
+		kvfree(bo->data);
+		bo->data = NULL;
+
+		if (!bo->encoded)
+			return;
+	}

 	drm_puts(p, "    data: !!ascii85 |\n");
 	drm_puts(p, "     ");

-	for (i = 0; i < l; i++)
-		drm_puts(p, ascii85_encode(ptr[i], out));
+	drm_puts(p, bo->encoded);

 	drm_puts(p, "\n");
 }
@@ -531,8 +571,7 @@  void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
 		drm_printf(p, "    wptr: %d\n", state->ring[i].wptr);
 		drm_printf(p, "    size: %d\n", MSM_GPU_RINGBUFFER_SZ);

-		adreno_show_object(p, state->ring[i].bo.data,
-			state->ring[i].bo.size);
+		adreno_show_object(p, &(state->ring[i].bo));
 	}

 	if (state->bos) {
@@ -543,8 +582,7 @@  void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
 				state->bos[i].iova);
 			drm_printf(p, "    size: %zd\n", state->bos[i].size);

-			adreno_show_object(p, state->bos[i].data,
-				state->bos[i].size);
+			adreno_show_object(p, &(state->bos[i]));
 		}
 	}

diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index a3a6371..737bf45 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -191,6 +191,7 @@  struct msm_gpu_state_bo {
 	u64 iova;
 	size_t size;
 	void *data;
+	char *encoded;
 };

 struct msm_gpu_state {

Comments

On Tue, Nov 20, 2018 at 05:07:31PM +0530, Sharat Masetty wrote:
> When the userspace tries to read the crashstate dump, the read side
> implementation in the driver currently ascii85 encodes all the binary
> buffers and it does this each time the read system call is called.
> A userspace tool like cat typically does a page by page read and the
> number of read calls depends on the size of the data captured by the
> driver. This is certainly not desirable and does not scale well with
> large captures.
> 
> This patch encodes the buffer only once in the read path. With this there
> is an immediate >10X speed improvement in crashstate save time.
> 
> Signed-off-by: Sharat Masetty <smasetty@codeaurora.org>

Reviewed-by: Jordan Crouse <jcrouse@codeaurora.org>

I like this a lot.

> ---
> Changes from v1:
> 	Addressed comments from Jordan Crouse
> 
>  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 80 ++++++++++++++++++++++++---------
>  drivers/gpu/drm/msm/msm_gpu.h           |  1 +
>  2 files changed, 60 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> index bbf8d3e..7749967 100644
> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
> @@ -441,11 +441,15 @@ void adreno_gpu_state_destroy(struct msm_gpu_state *state)
>  {
>  	int i;
> 
> -	for (i = 0; i < ARRAY_SIZE(state->ring); i++)
> +	for (i = 0; i < ARRAY_SIZE(state->ring); i++) {
>  		kvfree(state->ring[i].bo.data);
> +		kvfree(state->ring[i].bo.encoded);
> +	}
> 
> -	for (i = 0; state->bos && i < state->nr_bos; i++)
> +	for (i = 0; state->bos && i < state->nr_bos; i++) {
>  		kvfree(state->bos[i].data);
> +		kvfree(state->bos[i].encoded);
> +	}
> 
>  	kfree(state->bos);
>  	kfree(state->comm);
> @@ -472,34 +476,70 @@ int adreno_gpu_state_put(struct msm_gpu_state *state)
> 
>  #if defined(CONFIG_DEBUG_FS) || defined(CONFIG_DEV_COREDUMP)
> 
> -static void adreno_show_object(struct drm_printer *p, u32 *ptr, int len)
> +static char *adreno_gpu_ascii85_encode(u32 *src, size_t len)
>  {
> -	char out[ASCII85_BUFSZ];
> -	long l, datalen, i;
> +	void *buf;
> +	size_t buf_itr = 0;
> +	long i, l;
> 
> -	if (!ptr || !len)
> -		return;
> +	if (!len)
> +		return NULL;
> +
> +	l = ascii85_encode_len(len);
> 
>  	/*
> -	 * Only dump the non-zero part of the buffer - rarely will any data
> -	 * completely fill the entire allocated size of the buffer
> +	 * ascii85 outputs either a 5 byte string or a 1 byte string. So we
> +	 * account for the worst case of 5 bytes per dword plus the 1 for '\0'
>  	 */
> -	for (datalen = 0, i = 0; i < len >> 2; i++) {
> -		if (ptr[i])
> -			datalen = (i << 2) + 1;
> +	buf = kvmalloc((l * 5) + 1, GFP_KERNEL);
> +	if (!buf)
> +		return NULL;
> +
> +	for (i = 0; i < l; i++) {
> +		ascii85_encode(src[i], buf + buf_itr);
> +
> +		if (src[i] == 0)
> +			buf_itr += 1;
> +		else
> +			buf_itr += 5;
>  	}
> 
> -	/* Skip printing the object if it is empty */
> -	if (datalen == 0)
> +	return buf;
> +}
> +
> +static void adreno_show_object(struct drm_printer *p,
> +		struct msm_gpu_state_bo *bo)
> +{
> +	if ((!bo->data && !bo->encoded) || !bo->size)
>  		return;
> 
> -	l = ascii85_encode_len(datalen);
> +	if (!bo->encoded) {
> +		long datalen, i;
> +		u32 *buf = bo->data;
> +
> +		/*
> +		 * Only dump the non-zero part of the buffer - rarely will
> +		 * any data completely fill the entire allocated size of
> +		 * the buffer.
> +		 */
> +		for (datalen = 0, i = 0; i < (bo->size) >> 2; i++) {
> +			if (buf[i])
> +				datalen = ((i + 1) << 2);
> +		}
> +
> +		bo->encoded = adreno_gpu_ascii85_encode(buf, datalen);
> +
> +		kvfree(bo->data);
> +		bo->data = NULL;
> +
> +		if (!bo->encoded)
> +			return;
> +	}
> 
>  	drm_puts(p, "    data: !!ascii85 |\n");
>  	drm_puts(p, "     ");
> 
> -	for (i = 0; i < l; i++)
> -		drm_puts(p, ascii85_encode(ptr[i], out));
> +	drm_puts(p, bo->encoded);
> 
>  	drm_puts(p, "\n");
>  }
> @@ -531,8 +571,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>  		drm_printf(p, "    wptr: %d\n", state->ring[i].wptr);
>  		drm_printf(p, "    size: %d\n", MSM_GPU_RINGBUFFER_SZ);
> 
> -		adreno_show_object(p, state->ring[i].bo.data,
> -			state->ring[i].bo.size);
> +		adreno_show_object(p, &(state->ring[i].bo));
>  	}
> 
>  	if (state->bos) {
> @@ -543,8 +582,7 @@ void adreno_show(struct msm_gpu *gpu, struct msm_gpu_state *state,
>  				state->bos[i].iova);
>  			drm_printf(p, "    size: %zd\n", state->bos[i].size);
> 
> -			adreno_show_object(p, state->bos[i].data,
> -				state->bos[i].size);
> +			adreno_show_object(p, &(state->bos[i]));
>  		}
>  	}
> 
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index a3a6371..737bf45 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -191,6 +191,7 @@ struct msm_gpu_state_bo {
>  	u64 iova;
>  	size_t size;
>  	void *data;
> +	char *encoded;
>  };
> 
>  struct msm_gpu_state {
> --
> 1.9.1
>