Workgroup reduce add optimization

Submitted by Grigore Lupescu on Dec. 23, 2015, 3:32 p.m.

Details

Message ID 1450884739-5438-1-git-send-email-grigore.lupescu@intel.com
State New
Headers show
Series "Workgroup reduce add optimization" ( rev: 2 ) in Beignet

Not browsing as part of any series.

Commit Message

Grigore Lupescu Dec. 23, 2015, 3:32 p.m.
Signed-off-by: Grigore Lupescu <grigore.lupescu@intel.com>
---
 backend/src/backend/gen_context.cpp | 48 ++++++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 16 deletions(-)

Patch hide | download patch | download mbox

diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index a2e11a4..52e988e 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2943,21 +2943,38 @@  namespace gbe
           }
         }
       }
-    } else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD) {
-      GBE_ASSERT(tmp.type == theVal.type);
-      GenRegister v = GenRegister::toUniform(tmp, theVal.type);
-      for (uint32_t i = 0; i < simd; i++) {
-        p->ADD(threadData, threadData, v);
-        v.subnr += typeSize(theVal.type);
-        if (v.subnr == 32) {
-          v.subnr = 0;
-          v.nr++;
-        }
-      }
-    }
-
-    p->pop();
-  }
+    } else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD){
+
+		tmp.hstride = GEN_HORIZONTAL_STRIDE_1;
+		tmp.vstride = GEN_VERTICAL_STRIDE_4;
+		tmp.width = GEN_WIDTH_4;
+
+		GBE_ASSERT(tmp.type == theVal.type);
+		GenRegister partialSum = tmp;
+
+		/* adjust offset, compute add with ADD4/ADD */
+		for (uint32_t i = 1; i < simd/4; i++){
+			tmp = tmp.suboffset(tmp, 4);
+			GenNativeInstruction* insnQ1 = p->next(GEN_OPCODE_ADD);
+			p->setHeader(insnQ1);
+			p->setDst(insnQ1, partialSum);
+			p->setSrc0(insnQ1, partialSum);
+			p->setSrc1(insnQ1, tmp);
+			insnQ1->header.execution_size = GEN_WIDTH_4;
+		}
+
+		partialSum = GenRegister::toUniform(partialSum, theVal.type);
+		for (uint32_t i = 0; i < 4; i++){
+			p->ADD(threadData, threadData, partialSum);
+			partialSum.subnr += typeSize(theVal.type);
+			if (partialSum.subnr == 32) {
+				partialSum.subnr = 0;
+				partialSum.nr++;
+			}
+		}
+	}
+	p->pop();
+}
 
 #define SEND_RESULT_MSG() \
 do { \
@@ -3123,7 +3140,6 @@  do { \
         p->curr.predicate = GEN_PREDICATE_NONE;
         p->WAIT(2);
         p->patchJMPI(jip, (p->n_instruction() - jip), 0);
-    
         /* Do something when get the msg. */
         p->curr.execWidth = simd;
         p->MOV(dst, msgData);

Comments

I think the basic idea is OK and the result is also OK.
Please pay attention to the code format; we prefer spaces rather
than tabs at the beginning of a line.

And some comments below.

On Wed, Dec 23, 2015 at 05:32:19PM +0200, Grigore Lupescu wrote:
> Date: Wed, 23 Dec 2015 17:32:19 +0200
> From: Grigore Lupescu <grigore.lupescu@intel.com>
> To: beignet@lists.freedesktop.org
> Subject: [Beignet]  [PATCH] Workgroup reduce add optimization
> X-Mailer: git-send-email 2.1.4
> 
> Signed-off-by: Grigore Lupescu <grigore.lupescu@intel.com>
> ---
>  backend/src/backend/gen_context.cpp | 48 ++++++++++++++++++++++++-------------
>  1 file changed, 32 insertions(+), 16 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
> index a2e11a4..52e988e 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -2943,21 +2943,38 @@ namespace gbe
>            }
>          }
>        }
> -    } else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD) {
> -      GBE_ASSERT(tmp.type == theVal.type);
> -      GenRegister v = GenRegister::toUniform(tmp, theVal.type);
> -      for (uint32_t i = 0; i < simd; i++) {
> -        p->ADD(threadData, threadData, v);
> -        v.subnr += typeSize(theVal.type);
> -        if (v.subnr == 32) {
> -          v.subnr = 0;
> -          v.nr++;
> -        }
> -      }
> -    }
> -
> -    p->pop();
> -  }
> +    } else if (wg_op == ir::WORKGROUP_OP_REDUCE_ADD){
> +
> +		tmp.hstride = GEN_HORIZONTAL_STRIDE_1;
> +		tmp.vstride = GEN_VERTICAL_STRIDE_4;
> +		tmp.width = GEN_WIDTH_4;
> +
> +		GBE_ASSERT(tmp.type == theVal.type);
> +		GenRegister partialSum = tmp;
> +
> +		/* adjust offset, compute add with ADD4/ADD */
> +		for (uint32_t i = 1; i < simd/4; i++){
> +			tmp = tmp.suboffset(tmp, 4);
> +			GenNativeInstruction* insnQ1 = p->next(GEN_OPCODE_ADD);
> +			p->setHeader(insnQ1);
> +			p->setDst(insnQ1, partialSum);
> +			p->setSrc0(insnQ1, partialSum);
> +			p->setSrc1(insnQ1, tmp);
> +			insnQ1->header.execution_size = GEN_WIDTH_4;
> +		}
I think it is not good to generate the instruction directly here.
Maybe you can set simd=4 and call p->ADD.
We want to put all the instructions generation to gen_encoder.cpp

> +
> +		partialSum = GenRegister::toUniform(partialSum, theVal.type);
> +		for (uint32_t i = 0; i < 4; i++){
> +			p->ADD(threadData, threadData, partialSum);
> +			partialSum.subnr += typeSize(theVal.type);
> +			if (partialSum.subnr == 32) {
> +				partialSum.subnr = 0;
> +				partialSum.nr++;
> +			}
I think you can also use suboffset here.
> +		}
> +	}
> +	p->pop();
> +}
>  
>  #define SEND_RESULT_MSG() \
>  do { \
> @@ -3123,7 +3140,6 @@ do { \
>          p->curr.predicate = GEN_PREDICATE_NONE;
>          p->WAIT(2);
>          p->patchJMPI(jip, (p->n_instruction() - jip), 0);
> -    
>          /* Do something when get the msg. */
>          p->curr.execWidth = simd;
>          p->MOV(dst, msgData);
> -- 
> 2.1.4
> 
> _______________________________________________
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet