[V2,1/2] Use NP2 stack size to avoid cache line conflict.

Submitted by Yang, Rong R on June 24, 2015, 1:58 a.m.

Details

Message ID 1435111106-10385-1-git-send-email-rong.r.yang@intel.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Yang, Rong R June 24, 2015, 1:58 a.m.
The L3 cache line size is 64B, so calculate the stack size starting from 64B and multiply it by 3 at each step.
Gen only supports D * W multiplication before GEN8, so the multiplication needs special care when calculating the per-lane stack address.

V2: calculate the stack size starting from 128B, because long16 needs 128B alignment.
Signed-off-by: Yang Rong <rong.r.yang@intel.com>
---
 backend/src/backend/context.cpp       |  4 ++--
 backend/src/backend/gen75_context.cpp | 13 ++++++-------
 backend/src/backend/gen_context.cpp   | 13 ++++++-------
 3 files changed, 14 insertions(+), 16 deletions(-)

Patch hide | download patch | download mbox

diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index 0dc60b7..b8dfa8c 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -400,9 +400,9 @@  namespace gbe
       return;
     // Be sure that the stack pointer is set
     // GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER, 0) >= 0);
-    uint32_t stackSize = 1*KB;
+    uint32_t stackSize = 128;
     while (stackSize < fn.getStackSize()) {
-      stackSize <<= 1;
+      stackSize *= 3;
       GBE_ASSERT(stackSize <= 64*KB);
     }
     this->kernel->stackSize = stackSize;
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index caf7043..b9dfb18 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -74,12 +74,7 @@  namespace gbe
     const uint32_t perLaneSize = kernel->getStackSize();
     const uint32_t perThreadSize = perLaneSize * this->simdWidth;
     GBE_ASSERT(perLaneSize > 0);
-    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
-    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
 
-    // Use shifts rather than muls which are limited to 32x16 bit sources
-    const uint32_t perLaneShift = logi2(perLaneSize);
-    const uint32_t perThreadShift = logi2(perThreadSize);
     const GenRegister selStatckPtr = this->simdWidth == 8 ?
       GenRegister::ud8grf(ir::ocl::stackptr) :
       GenRegister::ud16grf(ir::ocl::stackptr);
@@ -95,11 +90,15 @@  namespace gbe
       p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180));
       p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7));
       p->curr.execWidth = this->simdWidth;
-      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+      p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize));  //perLaneSize < 64K
       p->curr.execWidth = 1;
       p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2));
       p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4));
-      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+      if(perThreadSize > 0xffff) {
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize < 64K
+      } else
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
       p->curr.execWidth = this->simdWidth;
       p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
     p->pop();
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 43d14d2..db27377 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -182,12 +182,7 @@  namespace gbe
     const uint32_t perLaneSize = kernel->getStackSize();
     const uint32_t perThreadSize = perLaneSize * this->simdWidth;
     GBE_ASSERT(perLaneSize > 0);
-    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);
-    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);
 
-    // Use shifts rather than muls which are limited to 32x16 bit sources
-    const uint32_t perLaneShift = logi2(perLaneSize);
-    const uint32_t perThreadShift = logi2(perThreadSize);
     const GenRegister selStatckPtr = this->simdWidth == 8 ?
       GenRegister::ud8grf(ir::ocl::stackptr) :
       GenRegister::ud16grf(ir::ocl::stackptr);
@@ -201,9 +196,13 @@  namespace gbe
       p->curr.predicate = GEN_PREDICATE_NONE;
       p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff));
       p->curr.execWidth = this->simdWidth;
-      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));
+      p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize));  //perLaneSize < 64K
       p->curr.execWidth = 1;
-      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift));
+      if(perThreadSize > 0xffff) {
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perLaneSize));
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize < 64K
+      } else
+        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immuw(perThreadSize));
       p->curr.execWidth = this->simdWidth;
       p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));
     p->pop();

Comments

The patchset LGTM. Thanks!

> -----Original Message-----

> From: Beignet [mailto:beignet-bounces@lists.freedesktop.org] On Behalf Of

> Yang Rong

> Sent: Wednesday, June 24, 2015 9:58 AM

> To: beignet@lists.freedesktop.org

> Cc: Yang, Rong R

> Subject: [Beignet] [Patch V2 1/2] Use NP2 stack size to avoid cache line

> conflict.

> 

> The L3 cacheline size 64B, so calc the stack size from 64, and mul 3 per step.

> Gen only support D * W before GEN8. So when calculate per lane stack

> address, need take care of the mul.

> 

> V2: calc the stack size from 128B, because long16 need 128B alignment.

> Signed-off-by: Yang Rong <rong.r.yang@intel.com>

> ---

>  backend/src/backend/context.cpp       |  4 ++--

>  backend/src/backend/gen75_context.cpp | 13 ++++++-------

>  backend/src/backend/gen_context.cpp   | 13 ++++++-------

>  3 files changed, 14 insertions(+), 16 deletions(-)

> 

> diff --git a/backend/src/backend/context.cpp

> b/backend/src/backend/context.cpp index 0dc60b7..b8dfa8c 100644

> --- a/backend/src/backend/context.cpp

> +++ b/backend/src/backend/context.cpp

> @@ -400,9 +400,9 @@ namespace gbe

>        return;

>      // Be sure that the stack pointer is set

>      //

> GBE_ASSERT(this->kernel->getCurbeOffset(GBE_CURBE_STACK_POINTER,

> 0) >= 0);

> -    uint32_t stackSize = 1*KB;

> +    uint32_t stackSize = 128;

>      while (stackSize < fn.getStackSize()) {

> -      stackSize <<= 1;

> +      stackSize *= 3;

>        GBE_ASSERT(stackSize <= 64*KB);

>      }

>      this->kernel->stackSize = stackSize; diff --git

> a/backend/src/backend/gen75_context.cpp

> b/backend/src/backend/gen75_context.cpp

> index caf7043..b9dfb18 100644

> --- a/backend/src/backend/gen75_context.cpp

> +++ b/backend/src/backend/gen75_context.cpp

> @@ -74,12 +74,7 @@ namespace gbe

>      const uint32_t perLaneSize = kernel->getStackSize();

>      const uint32_t perThreadSize = perLaneSize * this->simdWidth;

>      GBE_ASSERT(perLaneSize > 0);

> -    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);

> -    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);

> 

> -    // Use shifts rather than muls which are limited to 32x16 bit sources

> -    const uint32_t perLaneShift = logi2(perLaneSize);

> -    const uint32_t perThreadShift = logi2(perThreadSize);

>      const GenRegister selStatckPtr = this->simdWidth == 8 ?

>        GenRegister::ud8grf(ir::ocl::stackptr) :

>        GenRegister::ud16grf(ir::ocl::stackptr);

> @@ -95,11 +90,15 @@ namespace gbe

>        p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5),

> GenRegister::immud(0x180));

>        p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4),

> GenRegister::immud(7));

>        p->curr.execWidth = this->simdWidth;

> -      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));

> +      p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize));

> + //perLaneSize < 64K

>        p->curr.execWidth = 1;

>        p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> GenRegister::immud(2));

>        p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> GenRegister::ud1grf(126, 4));

> -      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> GenRegister::immud(perThreadShift));

> +      if(perThreadSize > 0xffff) {

> +        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> GenRegister::immuw(perLaneSize));

> +        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize

> < 64K

> +      } else

> +        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> + GenRegister::immuw(perThreadSize));

>        p->curr.execWidth = this->simdWidth;

>        p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));

>      p->pop();

> diff --git a/backend/src/backend/gen_context.cpp

> b/backend/src/backend/gen_context.cpp

> index 43d14d2..db27377 100644

> --- a/backend/src/backend/gen_context.cpp

> +++ b/backend/src/backend/gen_context.cpp

> @@ -182,12 +182,7 @@ namespace gbe

>      const uint32_t perLaneSize = kernel->getStackSize();

>      const uint32_t perThreadSize = perLaneSize * this->simdWidth;

>      GBE_ASSERT(perLaneSize > 0);

> -    GBE_ASSERT(isPowerOf<2>(perLaneSize) == true);

> -    GBE_ASSERT(isPowerOf<2>(perThreadSize) == true);

> 

> -    // Use shifts rather than muls which are limited to 32x16 bit sources

> -    const uint32_t perLaneShift = logi2(perLaneSize);

> -    const uint32_t perThreadShift = logi2(perThreadSize);

>      const GenRegister selStatckPtr = this->simdWidth == 8 ?

>        GenRegister::ud8grf(ir::ocl::stackptr) :

>        GenRegister::ud16grf(ir::ocl::stackptr);

> @@ -201,9 +196,13 @@ namespace gbe

>        p->curr.predicate = GEN_PREDICATE_NONE;

>        p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5),

> GenRegister::immud(0x1ff));

>        p->curr.execWidth = this->simdWidth;

> -      p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift));

> +      p->MUL(stackptr, stackptr, GenRegister::immuw(perLaneSize));

> + //perLaneSize < 64K

>        p->curr.execWidth = 1;

> -      p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> GenRegister::immud(perThreadShift));

> +      if(perThreadSize > 0xffff) {

> +        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> GenRegister::immuw(perLaneSize));

> +        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> GenRegister::immuw(this->simdWidth));  //Only support W * D, perLaneSize

> < 64K

> +      } else

> +        p->MUL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0),

> + GenRegister::immuw(perThreadSize));

>        p->curr.execWidth = this->simdWidth;

>        p->ADD(stackptr, stackptr, GenRegister::ud1grf(126,0));

>      p->pop();

> --

> 1.8.3.2

> 

> _______________________________________________

> Beignet mailing list

> Beignet@lists.freedesktop.org

> http://lists.freedesktop.org/mailman/listinfo/beignet