[Mesa-dev] i965: Add GPU BLIT of texture image to PBO in Intel driver

Submitted by Jon Ashburn on Feb. 28, 2014, 4:08 p.m.

Details

Message ID 1393603734-22599-2-git-send-email-jon@lunarg.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Jon Ashburn Feb. 28, 2014, 4:08 p.m.
Add Intel driver hook for glGetTexImage to accelerate the case of reading
texture image into a PBO.  This case gets huge performance gains by using
GPU BLIT directly to PBO rather than GPU BLIT to temporary texture followed
by memcpy.

No regressions on Piglit tests  with Intel driver.
Performance gain (1280 x 800 FBO, Ivybridge):
glGetTexImage + glMapBufferRange  with patch 1.45 msec
glGetTexImage + glMapBufferRange without patch 4.68 msec
---
 src/mesa/drivers/dri/i965/intel_tex_image.c | 114 ++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

Patch hide | download patch | download mbox

diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index ee02e68..1d0d72d 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -15,6 +15,8 @@ 
 #include "main/teximage.h"
 #include "main/texstore.h"
 
+#include "drivers/common/meta.h"
+
 #include "intel_mipmap_tree.h"
 #include "intel_buffer_objects.h"
 #include "intel_batchbuffer.h"
@@ -415,10 +417,122 @@  intel_image_target_texture_2d(struct gl_context *ctx, GLenum target,
                                   image->tile_x, image->tile_y);
 }
 
+/**
+ * Try to satisfy glGetTexImage() into the bound pack PBO with one BLT copy of
+ * a single Width x Height slice.  "pixels" is the offset into that buffer.
+ * Returns true if the blit was issued, false if the caller must fall back.
+ */
+static bool
+IntelBlitTexToPbo(struct gl_context * ctx,
+                   GLenum format, GLenum type,
+                   GLvoid * pixels, struct gl_texture_image *texImage)
+{
+   struct intel_texture_image *intelImage = intel_texture_image(texImage);
+   struct brw_context *brw = brw_context(ctx);
+   const struct gl_pixelstore_attrib *pack = &(ctx->Pack);
+   struct intel_buffer_object *dst = intel_buffer_object(pack->BufferObj);
+   GLuint dst_offset;
+   drm_intel_bo *dst_buffer;
+   GLenum target = texImage->TexObject->Target;
+
+   DBG("%s\n", __FUNCTION__);
+
+   /* A GPU blit is only usable when the hardware texture format already
+    * matches the user's format/type.
+    * Note that GL's pixel transfer ops don't apply to glGetTexImage()
+    */
+   if (!_mesa_format_matches_format_and_type(
+           intelImage->mt->format, format, type, false) ||
+         (_mesa_is_format_compressed(texImage->TexFormat) &&
+          _mesa_get_format_datatype(texImage->TexFormat) == GL_UNSIGNED_NORMALIZED)) {
+      DBG("%s - bad format for blit to PBO\n", __FUNCTION__);
+      return false;
+   }
+
+   if (ctx->_ImageTransferState) {
+      DBG("%s - bad transfer state for blit to PBO\n", __FUNCTION__);
+      return false;
+   }
+
+   if (pack->SkipPixels  || pack->SkipRows) {
+      DBG("%s - bad skip params for blit to PBO\n", __FUNCTION__);
+      return false;
+   }
+
+   if (pack->SwapBytes || pack->LsbFirst) {
+      DBG("%s: bad packing params\n", __FUNCTION__);
+      return false;
+   }
+
+   /* Only one 2D slice is blitted below, so layered targets must fall back. */
+   if (target == GL_TEXTURE_1D_ARRAY || target == GL_TEXTURE_2D_ARRAY ||
+       target == GL_TEXTURE_CUBE_MAP_ARRAY || target == GL_TEXTURE_3D) {
+      DBG("%s: no support for array or 3D textures\n", __FUNCTION__);
+      return false;
+   }
+
+   int dst_stride = _mesa_image_row_stride(pack, texImage->Width, format, type);
+   bool dst_flip = false;
+   /* Mesa flips the dst_stride for ctx->Pack.Invert, our mt must have a
+    * normal dst_stride.
+    */
+   struct gl_pixelstore_attrib uninverted_pack = *pack;
+   if (ctx->Pack.Invert) {
+      dst_stride = -dst_stride;
+      dst_flip = true;
+      uninverted_pack.Invert = false;
+   }
+   dst_offset = (GLintptr) pixels;
+   dst_offset += _mesa_image_offset(2, &uninverted_pack, texImage->Width,
+                                    texImage->Height, format, type, 0, 0, 0);
+   dst_buffer = intel_bufferobj_buffer(brw, dst, dst_offset,
+                                       texImage->Height * dst_stride);
+
+   struct intel_mipmap_tree *pbo_mt =
+      intel_miptree_create_for_bo(brw, dst_buffer, intelImage->mt->format,
+                                  dst_offset, texImage->Width, texImage->Height,
+                                  dst_stride, I915_TILING_NONE);
+
+   if (!pbo_mt)
+      return false;
+
+   bool blitted = intel_miptree_blit(brw, intelImage->mt, texImage->Level,
+                                     texImage->Face, 0, 0, false,
+                                     pbo_mt, 0, 0, 0, 0, dst_flip,
+                                     texImage->Width, texImage->Height,
+                                     GL_COPY);
+   /* pbo_mt only wraps dst_buffer for this blit; release it on every path. */
+   intel_miptree_release(&pbo_mt);
+   if (!blitted)
+      return false;
+
+   DBG("%s - DONE\n", __FUNCTION__);
+   return true;
+}
+
+/* GetTexImage driver hook: prefer a BLT into the pack PBO, else use meta. */
+static void
+intel_get_tex_image(struct gl_context *ctx,
+                       GLenum format, GLenum type, GLvoid *pixels,
+                       struct gl_texture_image *texImage) {
+   struct brw_context *brw = brw_context(ctx);
+   DBG("%s\n", __FUNCTION__);
+   const bool to_pbo = _mesa_is_bufferobj(ctx->Pack.BufferObj);
+
+   if (to_pbo && IntelBlitTexToPbo(ctx, format, type, pixels, texImage))
+      return;
+   if (to_pbo) {
+      perf_debug("%s: fallback to CPU mapping in PBO case\n", __FUNCTION__);
+   }
+
+   _mesa_meta_GetTexImage(ctx, format, type, pixels, texImage);
+}
+
 void
 intelInitTextureImageFuncs(struct dd_function_table *functions)
 {
    functions->TexImage = intelTexImage;
    functions->EGLImageTargetTexture2D = intel_image_target_texture_2d;
    functions->BindRenderbufferTexImage = intel_bind_renderbuffer_tex_image;
+   functions->GetTexImage = intel_get_tex_image;
 }

Comments

On 02/28/2014 08:08 AM, Jon Ashburn wrote:
> Add Intel driver hook for glGetTexImage to accelerate the case of reading
> texture image into a PBO.  This case gets huge performance gains by using
> GPU BLIT directly to PBO rather than GPU BLIT to temporary texture followed
> by memcpy.
>
> No regressions on Piglit tests  with Intel driver.
> Performance gain (1280 x 800 FBO, Ivybridge):
> glGetTexImage + glMapBufferRange  with patch 1.45 msec
> glGetTexImage + glMapBufferRange without patch 4.68 msec

Depending on the exact paths they're hitting, it may be worth 
investigating methods that will use the 3D engine.  We have some 
evidence that for larger than very small size, the 3D engine is much 
faster than the blit engine.  Maybe add a driver hook to use a BO as the 
backing store for a texture.  Texture-from-buffer-object, if you will. 
Then use that texture with meta's CopyTexImage path or something.  That 
might also avoid the fallbacks for array textures.

> ---
>   src/mesa/drivers/dri/i965/intel_tex_image.c | 114 ++++++++++++++++++++++++++++
>   1 file changed, 114 insertions(+)
>
> diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
> index ee02e68..1d0d72d 100644
> --- a/src/mesa/drivers/dri/i965/intel_tex_image.c
> +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
> @@ -15,6 +15,8 @@
>   #include "main/teximage.h"
>   #include "main/texstore.h"
>
> +#include "drivers/common/meta.h"
> +
>   #include "intel_mipmap_tree.h"
>   #include "intel_buffer_objects.h"
>   #include "intel_batchbuffer.h"
> @@ -415,10 +417,122 @@ intel_image_target_texture_2d(struct gl_context *ctx, GLenum target,
>                                     image->tile_x, image->tile_y);
>   }
>
> +static bool
> +IntelBlitTexToPbo(struct gl_context * ctx,
> +                   GLenum format, GLenum type,
> +                   GLvoid * pixels, struct gl_texture_image *texImage)
> +{
> +   struct intel_texture_image *intelImage = intel_texture_image(texImage);
> +   struct brw_context *brw = brw_context(ctx);
> +   const struct gl_pixelstore_attrib *pack = &(ctx->Pack);
> +   struct intel_buffer_object *dst = intel_buffer_object(pack->BufferObj);
> +   GLuint dst_offset;
> +   drm_intel_bo *dst_buffer;
> +   GLenum target = texImage->TexObject->Target;
> +
> +   DBG("%s\n", __FUNCTION__);
> +
> +   /*
> +    * Check if we can use GPU blit to copy from the hardware texture
> +    * format to the user's format/type.
> +    * Note that GL's pixel transfer ops don't apply to glGetTexImage()
> +    */
> +
> +   if (!_mesa_format_matches_format_and_type(
> +           intelImage->mt->format, format, type, false) ||
> +         (_mesa_is_format_compressed(texImage->TexFormat) &&
> +          _mesa_get_format_datatype(texImage->TexFormat) == GL_UNSIGNED_NORMALIZED)) {

I'm confused about this check.  You can't get the compressed data using 
glGetTexImage.  You have to use glGetCompressedTexImage.  The blitter 
can't do decompression, so shouldn't this check just be

    if (!_mesa_format_matches_format_and_type(
            intelImage->mt->format, format, type, false)) {
       ...
    }

This will let us hit the fast path for ETC textures that we internally 
store as uncompressed RGB.

> +      DBG("%s - bad format for blit to PBO\n", __FUNCTION__);

I think these should be perf_debug.  For most of the performance 
warnings we want to tell the application developer why they didn't get 
the fast path.  That gives them some indication how to change their code 
to get back on the fast path.

> +      return false;
> +   }
> +
> +   if (ctx->_ImageTransferState) {
> +      DBG("%s - bad transfer state for blit to PBO\n", __FUNCTION__);
> +      return false;
> +   }
> +
> +   if (pack->SkipPixels  || pack->SkipRows) {
> +      DBG("%s - bad skip params for blit to PBO\n", __FUNCTION__);
> +      return false;
> +   }

Does pack->RowLength need to be checked?

> +   if (pack->SwapBytes || pack->LsbFirst) {
> +      DBG("%s: bad packing params\n", __FUNCTION__);
> +      return false;
> +   }
> +
> +   if (target == GL_TEXTURE_1D_ARRAY || target == GL_TEXTURE_CUBE_MAP_ARRAY ||
> +       target == GL_TEXTURE_2D_ARRAY) {
> +      DBG("%s: no support for array textures\n", __FUNCTION__);

Do regular cubemaps actually work?  3D textures?  I don't know whether 
we have piglit tests that would hit that path...

> +      return false;
> +   }
> +
> +   int dst_stride = _mesa_image_row_stride(pack, texImage->Width, format, type);
> +   bool dst_flip = false;
> +   /* Mesa flips the dst_stride for ctx->Pack.Invert, our mt must have a
> +    * normal dst_stride.
> +    */
> +   struct gl_pixelstore_attrib uninverted_pack = *pack;
> +   if (ctx->Pack.Invert) {
> +      dst_stride = -dst_stride;
> +      dst_flip = true;
> +      uninverted_pack.Invert = false;
> +   }
> +   dst_offset = (GLintptr) pixels;
> +   dst_offset += _mesa_image_offset(2, &uninverted_pack, texImage->Width,
> +                                    texImage->Height, format, type, 0, 0, 0);
> +   dst_buffer = intel_bufferobj_buffer(brw, dst, dst_offset,
> +                                       texImage->Height * dst_stride);
> +
> +   struct intel_mipmap_tree *pbo_mt =
> +            intel_miptree_create_for_bo(brw,
> +                                        dst_buffer,
> +                                        intelImage->mt->format,
> +                                        dst_offset,
> +                                        texImage->Width, texImage->Height,
> +                                        dst_stride, I915_TILING_NONE);
> +
> +   if (!pbo_mt)
> +      return false;
> +
> +   if (!intel_miptree_blit(brw,
> +                           intelImage->mt, texImage->Level, texImage->Face,
> +                           0, 0, false,
> +                           pbo_mt, 0, 0,
> +                           0, 0, dst_flip,
> +                           texImage->Width, texImage->Height, GL_COPY))
> +      return false;
> +
> +   intel_miptree_release(&pbo_mt);
> +
> +   DBG("%s - DONE\n", __FUNCTION__);
> +
> +   return true;
> +}
> +
> +static void
> +intel_get_tex_image(struct gl_context *ctx,
> +                       GLenum format, GLenum type, GLvoid *pixels,
> +                       struct gl_texture_image *texImage) {
> +   struct brw_context *brw = brw_context(ctx);
> +   DBG("%s\n", __FUNCTION__);
> +
> +   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
> +      /* Using PBOs, so try the BLT based path. */
> +      if (IntelBlitTexToPbo(ctx, format, type, pixels, texImage))
> +         return;
> +
> +      perf_debug("%s: fallback to CPU mapping in PBO case\n", __FUNCTION__);

If the DBG messages in IntelBlitTexToPbo are changed to perf_debug 
messages, this one should get removed.

> +   }
> +
> +   _mesa_meta_GetTexImage(ctx, format, type, pixels, texImage);
> +}
> +
>   void
>   intelInitTextureImageFuncs(struct dd_function_table *functions)
>   {
>      functions->TexImage = intelTexImage;
>      functions->EGLImageTargetTexture2D = intel_image_target_texture_2d;
>      functions->BindRenderbufferTexImage = intel_bind_renderbuffer_tex_image;
> +   functions->GetTexImage = intel_get_tex_image;
>   }
>
On 02/28/2014 12:56 PM, Ian Romanick wrote:
> On 02/28/2014 08:08 AM, Jon Ashburn wrote:
>> Add Intel driver hook for glGetTexImage to accelerate the case of 
>> reading
>> texture image into a PBO.  This case gets huge performance gains by 
>> using
>> GPU BLIT directly to PBO rather than GPU BLIT to temporary texture 
>> followed
>> by memcpy.
>>
>> No regressions on Piglit tests  with Intel driver.
>> Performance gain (1280 x 800 FBO, Ivybridge):
>> glGetTexImage + glMapBufferRange  with patch 1.45 msec
>> glGetTexImage + glMapBufferRange without patch 4.68 msec
>
> Depending on the exact paths they're hitting, it may be worth 
> investigating methods that will use the 3D engine.  We have some 
> evidence that for larger than very small size, the 3D engine is much 
> faster than the blit engine.  Maybe add a driver hook to use a BO as 
> the backing store for a texture. Texture-from-buffer-object, if you 
> will. Then use that texture with meta's CopyTexImage path or 
> something.  That might also avoid the fallbacks for array textures.
>
Using the 3D engine (brw_blorp_blit_miptrees) is a little slower on some 
benchmarks in virtualized environments.  The 3D engine is faster in my 
simple test on native Linux, but that test has a very small GPU workload.  
Thus, it appears the BLIT engine is the way to go for real-world workloads.

>> ---
>>   src/mesa/drivers/dri/i965/intel_tex_image.c | 114 
>> ++++++++++++++++++++++++++++
>>   1 file changed, 114 insertions(+)
>>
>> diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c 
>> b/src/mesa/drivers/dri/i965/intel_tex_image.c
>> index ee02e68..1d0d72d 100644
>> --- a/src/mesa/drivers/dri/i965/intel_tex_image.c
>> +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
>> @@ -15,6 +15,8 @@
>>   #include "main/teximage.h"
>>   #include "main/texstore.h"
>>
>> +#include "drivers/common/meta.h"
>> +
>>   #include "intel_mipmap_tree.h"
>>   #include "intel_buffer_objects.h"
>>   #include "intel_batchbuffer.h"
>> @@ -415,10 +417,122 @@ intel_image_target_texture_2d(struct 
>> gl_context *ctx, GLenum target,
>>                                     image->tile_x, image->tile_y);
>>   }
>>
>> +static bool
>> +IntelBlitTexToPbo(struct gl_context * ctx,
>> +                   GLenum format, GLenum type,
>> +                   GLvoid * pixels, struct gl_texture_image *texImage)
>> +{
>> +   struct intel_texture_image *intelImage = 
>> intel_texture_image(texImage);
>> +   struct brw_context *brw = brw_context(ctx);
>> +   const struct gl_pixelstore_attrib *pack = &(ctx->Pack);
>> +   struct intel_buffer_object *dst = 
>> intel_buffer_object(pack->BufferObj);
>> +   GLuint dst_offset;
>> +   drm_intel_bo *dst_buffer;
>> +   GLenum target = texImage->TexObject->Target;
>> +
>> +   DBG("%s\n", __FUNCTION__);
>> +
>> +   /*
>> +    * Check if we can use GPU blit to copy from the hardware texture
>> +    * format to the user's format/type.
>> +    * Note that GL's pixel transfer ops don't apply to glGetTexImage()
>> +    */
>> +
>> +   if (!_mesa_format_matches_format_and_type(
>> +           intelImage->mt->format, format, type, false) ||
>> +         (_mesa_is_format_compressed(texImage->TexFormat) &&
>> +          _mesa_get_format_datatype(texImage->TexFormat) == 
>> GL_UNSIGNED_NORMALIZED)) {
>
> I'm confused about this check.  You can't get the compressed data 
> using glGetTexImage.  You have to use glGetCompressedTexImage. The 
> blitter can't do decompression, so shouldn't this check just be
>
>    if (!_mesa_format_matches_format_and_type(
>            intelImage->mt->format, format, type, false)) {
>       ...
>    }
>
Yes you are right.
> This will let us hit the fast path for ETC textures that we internally 
> store as uncompressed RGB.
>
>> +      DBG("%s - bad format for blit to PBO\n", __FUNCTION__);
>
> I think these should be perf_debug.  For most of the performance 
> warnings we want to tell the application developer why they didn't get 
> the fast path.  That gives them some indication how to change their 
> code to get back on the fast path.
okay
>
>> +      return false;
>> +   }
>> +
>> +   if (ctx->_ImageTransferState) {
>> +      DBG("%s - bad transfer state for blit to PBO\n", __FUNCTION__);
>> +      return false;
>> +   }
>> +
>> +   if (pack->SkipPixels  || pack->SkipRows) {
>> +      DBG("%s - bad skip params for blit to PBO\n", __FUNCTION__);
>> +      return false;
>> +   }
>
> Does pack->RowLength need to be checked?
No, since _mesa_image_row_stride() is used, which takes RowLength into 
consideration. And after further testing, the SkipPixels and SkipRows 
checks actually aren't needed either.
>
>> +   if (pack->SwapBytes || pack->LsbFirst) {
>> +      DBG("%s: bad packing params\n", __FUNCTION__);
>> +      return false;
>> +   }
>> +
>> +   if (target == GL_TEXTURE_1D_ARRAY || target == 
>> GL_TEXTURE_CUBE_MAP_ARRAY ||
>> +       target == GL_TEXTURE_2D_ARRAY) {
>> +      DBG("%s: no support for array textures\n", __FUNCTION__);
>
> Do regular cubemaps actually work?  3D textures?  I don't know whether 
> we have piglit tests that would hit that path...
>
Regular cubemaps work (see piglit cubemap-getteximage-pbo). See my new 
Piglit tests for 3D texture, and Array texture targets, which all work.
>> +      return false;
>> +   }
>> +
>> +   int dst_stride = _mesa_image_row_stride(pack, texImage->Width, 
>> format, type);
>> +   bool dst_flip = false;
>> +   /* Mesa flips the dst_stride for ctx->Pack.Invert, our mt must 
>> have a
>> +    * normal dst_stride.
>> +    */
>> +   struct gl_pixelstore_attrib uninverted_pack = *pack;
>> +   if (ctx->Pack.Invert) {
>> +      dst_stride = -dst_stride;
>> +      dst_flip = true;
>> +      uninverted_pack.Invert = false;
>> +   }
>> +   dst_offset = (GLintptr) pixels;
>> +   dst_offset += _mesa_image_offset(2, &uninverted_pack, 
>> texImage->Width,
>> +                                    texImage->Height, format, type, 
>> 0, 0, 0);
>> +   dst_buffer = intel_bufferobj_buffer(brw, dst, dst_offset,
>> +                                       texImage->Height * dst_stride);
>> +
>> +   struct intel_mipmap_tree *pbo_mt =
>> +            intel_miptree_create_for_bo(brw,
>> +                                        dst_buffer,
>> + intelImage->mt->format,
>> +                                        dst_offset,
>> +                                        texImage->Width, 
>> texImage->Height,
>> +                                        dst_stride, I915_TILING_NONE);
>> +
>> +   if (!pbo_mt)
>> +      return false;
>> +
>> +   if (!intel_miptree_blit(brw,
>> +                           intelImage->mt, texImage->Level, 
>> texImage->Face,
>> +                           0, 0, false,
>> +                           pbo_mt, 0, 0,
>> +                           0, 0, dst_flip,
>> +                           texImage->Width, texImage->Height, GL_COPY))
>> +      return false;
>> +
>> +   intel_miptree_release(&pbo_mt);
>> +
>> +   DBG("%s - DONE\n", __FUNCTION__);
>> +
>> +   return true;
>> +}
>> +
>> +static void
>> +intel_get_tex_image(struct gl_context *ctx,
>> +                       GLenum format, GLenum type, GLvoid *pixels,
>> +                       struct gl_texture_image *texImage) {
>> +   struct brw_context *brw = brw_context(ctx);
>> +   DBG("%s\n", __FUNCTION__);
>> +
>> +   if (_mesa_is_bufferobj(ctx->Pack.BufferObj)) {
>> +      /* Using PBOs, so try the BLT based path. */
>> +      if (IntelBlitTexToPbo(ctx, format, type, pixels, texImage))
>> +         return;
>> +
>> +      perf_debug("%s: fallback to CPU mapping in PBO case\n", 
>> __FUNCTION__);
>
> If the DBG messages in IntelBlitTexToPbo are changed to perf_debug 
> messages, this one should get removed.
okay
>
>> +   }
>> +
>> +   _mesa_meta_GetTexImage(ctx, format, type, pixels, texImage);
>> +}
>> +
>>   void
>>   intelInitTextureImageFuncs(struct dd_function_table *functions)
>>   {
>>      functions->TexImage = intelTexImage;
>>      functions->EGLImageTargetTexture2D = intel_image_target_texture_2d;
>>      functions->BindRenderbufferTexImage = 
>> intel_bind_renderbuffer_tex_image;
>> +   functions->GetTexImage = intel_get_tex_image;
>>   }
>>
>