[v2] nv110/exa: update sched codes

Submitted by Aaryaman Vasishta on June 3, 2017, 2:16 p.m.

Details

Message ID 20170603141608.17771-1-jem456.vasishta@gmail.com
State New
Headers show
Series "nv110/exa: update sched codes" ( rev: 2 ) in Nouveau

Not browsing as part of any series.

Commit Message

Aaryaman Vasishta June 3, 2017, 2:16 p.m.
v2: Add missing delays

This patch adds proper delays to maxwell exa shaders. rendercheck tests
seem consistent with/without this patch. I haven't extensively tested
them though.

Trello:
https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays

Signed-off-by: Aaryaman Vasishta <jem456.vasishta@gmail.com>
---
 src/shader/exac8nv110.fp  | 10 +++++-----
 src/shader/exac8nv110.fpc | 18 +++++++++---------
 src/shader/exacanv110.fp  | 10 +++++-----
 src/shader/exacanv110.fpc | 18 +++++++++---------
 src/shader/exacmnv110.fp  | 10 +++++-----
 src/shader/exacmnv110.fpc | 18 +++++++++---------
 src/shader/exas8nv110.fp  |  6 +++---
 src/shader/exas8nv110.fpc | 12 ++++++------
 src/shader/exasanv110.fp  | 10 +++++-----
 src/shader/exasanv110.fpc | 18 +++++++++---------
 src/shader/exascnv110.fp  |  6 +++---
 src/shader/exascnv110.fpc | 10 +++++-----
 src/shader/videonv110.fp  | 14 +++++++-------
 src/shader/videonv110.fpc | 26 +++++++++++++-------------
 14 files changed, 93 insertions(+), 93 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/shader/exac8nv110.fp b/src/shader/exac8nv110.fp
index ce78036..1c4a4f1 100644
--- a/src/shader/exac8nv110.fp
+++ b/src/shader/exac8nv110.fp
@@ -25,23 +25,23 @@  NV110FP_Composite_A8[] = {
 };
 #else
 
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
 ipa pass $r0 a[0x7c] 0x0 0x0 0x1
 mufu rcp $r0 $r0
 ipa $r3 a[0x94] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x3) (st 0xf wr 0x1 wt 0x2)
 ipa $r2 a[0x90] $r0 0x0 0x1
 tex nodep $r1 $r2 0x0 0x1 t2d 0x8
 ipa $r3 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf)
 ipa $r2 a[0x80] $r0 0x0 0x1
 tex nodep $r0 $r2 0x0 0x0 t2d 0x8
 depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6 wt 0x3) (st 0x6) (st 0x1)
 fmul ftz $r3 $r0 $r1
 mov $r2 $r3 0xf
 mov $r1 $r3 0xf
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6) (st 0xf) (st 0x0)
 mov $r0 $r3 0xf
 exit
 #endif
diff --git a/src/shader/exac8nv110.fpc b/src/shader/exac8nv110.fpc
index 4aa1368..46943b7 100644
--- a/src/shader/exac8nv110.fpc
+++ b/src/shader/exac8nv110.fpc
@@ -1,36 +1,36 @@ 
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
 0xcff7ff00,
 0xe003ff87,
 0x00470000,
 0x50800000,
 0x4007ff03,
 0xe043ff89,
-0xfc0007e0,
-0x001f8000,
+0x21e0072f,
+0x005cbc03,
 0x0007ff02,
 0xe043ff89,
 0x2ff70201,
 0xc03a0014,
 0x4007ff03,
 0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe5e0074f,
+0x001fbc06,
 0x0007ff02,
 0xe043ff88,
 0x2ff70200,
 0xc03a0004,
 0x34070000,
 0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfcc01fe6,
+0x001f8400,
 0x00170003,
 0x5c681000,
 0x00370002,
 0x5c980780,
 0x00370001,
 0x5c980780,
-0xfc0007e0,
+0xfde007e6,
 0x001f8000,
 0x00370000,
 0x5c980780,
diff --git a/src/shader/exacanv110.fp b/src/shader/exacanv110.fp
index a70d5c5..d7c2867 100644
--- a/src/shader/exacanv110.fp
+++ b/src/shader/exacanv110.fp
@@ -25,23 +25,23 @@  NV110FP_CAComposite[] = {
 };
 #else
 
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
 ipa pass $r0 a[0x7c] 0x0 0x0 0x1
 mufu rcp $r0 $r0
 ipa $r3 a[0x94] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
 ipa $r2 a[0x90] $r0 0x0 0x1
 tex nodep $r4 $r2 0x0 0x1 t2d 0xf
 ipa $r1 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
 ipa $r0 a[0x80] $r0 0x0 0x1
 tex nodep $r0 $r0 0x0 0x0 t2d 0xf
 depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
 fmul ftz $r3 $r3 $r7
 fmul ftz $r2 $r2 $r6
 fmul ftz $r1 $r1 $r5
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x3) (st 0xf) (st 0x0)
 fmul ftz $r0 $r0 $r4
 exit
 #endif
diff --git a/src/shader/exacanv110.fpc b/src/shader/exacanv110.fpc
index 7c0ca5e..9cad139 100644
--- a/src/shader/exacanv110.fpc
+++ b/src/shader/exacanv110.fpc
@@ -1,36 +1,36 @@ 
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
 0xcff7ff00,
 0xe003ff87,
 0x00470000,
 0x50800000,
 0x4007ff03,
 0xe043ff89,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x0008bc03,
 0x0007ff02,
 0xe043ff89,
 0xaff70204,
 0xc03a0017,
 0x4007ff01,
 0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe5e0274f,
+0x001fbc06,
 0x0007ff00,
 0xe043ff88,
 0xaff70000,
 0xc03a0007,
 0x34070000,
 0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc21ffe1,
+0x001f8400,
 0x00770303,
 0x5c681000,
 0x00670202,
 0x5c681000,
 0x00570101,
 0x5c681000,
-0xfc0007e0,
+0xfde01fe1,
 0x001f8000,
 0x00470000,
 0x5c681000,
diff --git a/src/shader/exacmnv110.fp b/src/shader/exacmnv110.fp
index fe5c294..d717138 100644
--- a/src/shader/exacmnv110.fp
+++ b/src/shader/exacmnv110.fp
@@ -25,23 +25,23 @@  NV110FP_Composite[] = {
 };
 #else
 
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
 ipa pass $r0 a[0x7c] 0x0 0x0 0x1
 mufu rcp $r0 $r0
 ipa $r3 a[0x94] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
 ipa $r2 a[0x90] $r0 0x0 0x1
 tex nodep $r4 $r2 0x0 0x1 t2d 0x8
 ipa $r1 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
 ipa $r0 a[0x80] $r0 0x0 0x1
 tex nodep $r0 $r0 0x0 0x0 t2d 0xf
 depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
 fmul ftz $r3 $r3 $r4
 fmul ftz $r2 $r2 $r4
 fmul ftz $r1 $r1 $r4
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6 wt 0x2) (st 0xf) (st 0x0)
 fmul ftz $r0 $r0 $r4
 exit
 #endif
diff --git a/src/shader/exacmnv110.fpc b/src/shader/exacmnv110.fpc
index 9d62c1a..c150875 100644
--- a/src/shader/exacmnv110.fpc
+++ b/src/shader/exacmnv110.fpc
@@ -1,36 +1,36 @@ 
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
 0xcff7ff00,
 0xe003ff87,
 0x00470000,
 0x50800000,
 0x4007ff03,
 0xe043ff89,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x0008bc03,
 0x0007ff02,
 0xe043ff89,
 0x2ff70204,
 0xc03a0014,
 0x4007ff01,
 0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe5e0274f,
+0x001fbc06,
 0x0007ff00,
 0xe043ff88,
 0xaff70000,
 0xc03a0007,
 0x34070000,
 0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc21ffe1,
+0x001f8400,
 0x00470303,
 0x5c681000,
 0x00470202,
 0x5c681000,
 0x00470101,
 0x5c681000,
-0xfc0007e0,
+0xfde017e6,
 0x001f8000,
 0x00470000,
 0x5c681000,
diff --git a/src/shader/exas8nv110.fp b/src/shader/exas8nv110.fp
index 4fe2e19..a555beb 100644
--- a/src/shader/exas8nv110.fp
+++ b/src/shader/exas8nv110.fp
@@ -25,15 +25,15 @@  NV110FP_Source_A8[] = {
 };
 #else
 
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
 ipa pass $r0 a[0x7c] 0x0 0x0 0x1
 mufu rcp $r0 $r0
 ipa $r1 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf)
 ipa $r0 a[0x80] $r0 0x0 0x1
 tex nodep $r0 $r0 0x0 0x0 t2d 0x8
 depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x1) (st 0x1) (st 0x1)
 mov $r3 $r0 0xf
 mov $r2 $r0 0xf
 mov $r1 $r0 0xf
diff --git a/src/shader/exas8nv110.fpc b/src/shader/exas8nv110.fpc
index 1181c41..e58d168 100644
--- a/src/shader/exas8nv110.fpc
+++ b/src/shader/exas8nv110.fpc
@@ -1,21 +1,21 @@ 
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
 0xcff7ff00,
 0xe003ff87,
 0x00470000,
 0x50800000,
 0x4007ff01,
 0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x001fbc03,
 0x0007ff00,
 0xe043ff88,
 0x2ff70000,
 0xc03a0004,
 0x34070000,
 0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc200fe1,
+0x001f8400,
 0x00070003,
 0x5c980780,
 0x00070002,
diff --git a/src/shader/exasanv110.fp b/src/shader/exasanv110.fp
index 61374a6..ad7ca36 100644
--- a/src/shader/exasanv110.fp
+++ b/src/shader/exasanv110.fp
@@ -25,23 +25,23 @@  NV110FP_CACompositeSrcAlpha[] = {
 };
 #else
 
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
 ipa pass $r0 a[0x7c] 0x0 0x0 0x1
 mufu rcp $r0 $r0
 ipa $r3 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
 ipa $r2 a[0x80] $r0 0x0 0x1
 tex nodep $r4 $r2 0x0 0x0 t2d 0x8
 ipa $r1 a[0x94] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
 ipa $r0 a[0x90] $r0 0x0 0x1
 tex nodep $r0 $r0 0x0 0x1 t2d 0xf
 depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
 fmul ftz $r3 $r3 $r4
 fmul ftz $r2 $r2 $r4
 fmul ftz $r1 $r1 $r4
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x2) (st 0xf) (st 0x0)
 fmul ftz $r0 $r0 $r4
 exit
 #endif
diff --git a/src/shader/exasanv110.fpc b/src/shader/exasanv110.fpc
index 5516a03..1485f11 100644
--- a/src/shader/exasanv110.fpc
+++ b/src/shader/exasanv110.fpc
@@ -1,36 +1,36 @@ 
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
 0xcff7ff00,
 0xe003ff87,
 0x00470000,
 0x50800000,
 0x4007ff03,
 0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x0008bc03,
 0x0007ff02,
 0xe043ff88,
 0x2ff70204,
 0xc03a0004,
 0x4007ff01,
 0xe043ff89,
-0xfc0007e0,
-0x001f8000,
+0xe5e0274f,
+0x001fbc06,
 0x0007ff00,
 0xe043ff89,
 0xaff70000,
 0xc03a0017,
 0x34070000,
 0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc21ffe1,
+0x001f8400,
 0x00470303,
 0x5c681000,
 0x00470202,
 0x5c681000,
 0x00470101,
 0x5c681000,
-0xfc0007e0,
+0xfde017e1,
 0x001f8000,
 0x00470000,
 0x5c681000,
diff --git a/src/shader/exascnv110.fp b/src/shader/exascnv110.fp
index 90bbb55..86e14e8 100644
--- a/src/shader/exascnv110.fp
+++ b/src/shader/exascnv110.fp
@@ -25,14 +25,14 @@  NV110FP_Source[] = {
 };
 #else
 
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
 ipa pass $r0 a[0x7c] 0x0 0x0 0x1
 mufu rcp $r0 $r0
 ipa $r1 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
 ipa $r0 a[0x80] $r0 0x0 0x1
 tex nodep $r0 $r0 0x0 0x0 t2d 0xf
 depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf) (st 0x0) (st 0x0)
 exit
 #endif
diff --git a/src/shader/exascnv110.fpc b/src/shader/exascnv110.fpc
index 2dba15d..1fef5d2 100644
--- a/src/shader/exascnv110.fpc
+++ b/src/shader/exascnv110.fpc
@@ -1,20 +1,20 @@ 
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
 0xcff7ff00,
 0xe003ff87,
 0x00470000,
 0x50800000,
 0x4007ff01,
 0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xfde0072f,
+0x001fbc03,
 0x0007ff00,
 0xe043ff88,
 0xaff70000,
 0xc03a0007,
 0x34070000,
 0xf0f00000,
-0xfc0007e0,
+0xfc0007ef,
 0x001f8000,
 0x0007000f,
 0xe3000000,
diff --git a/src/shader/videonv110.fp b/src/shader/videonv110.fp
index 2728311..dd3816c 100644
--- a/src/shader/videonv110.fp
+++ b/src/shader/videonv110.fp
@@ -25,30 +25,30 @@  NV110FP_NV12[] = {
 };
 #else
 
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
 ipa pass $r2 a[0x7c] 0x0 0x0 0x1
 mufu rcp $r2 $r2
 ipa $r0 a[0x80] $r2 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1)
 ipa $r1 a[0x84] $r2 0x0 0x1
 tex nodep $r4 $r0 0x0 0x0 t2d 0x8
 tex nodep $r0 $r0 0x0 0x1 t2d 0xc
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf) (st 0x6 wt 0x1) (st 0x6)
 depbar le 0x5 0x1 0x1
 fmul ftz $r5 $r4 c0[0x0]
 fadd ftz $r3 $r5 c0[0x4]
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6) (st 0x6) (st 0xf)
 fadd ftz $r4 $r5 c0[0x8]
 fadd ftz $r5 $r5 c0[0xc]
 depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6 wt 0x2) (st 0x1) (st 0x1)
 ffma ftz $r3 $r0 c0[0x10] $r3
 ffma ftz $r4 $r0 c0[0x14] $r4
 ffma ftz $r5 $r0 c0[0x18] $r5
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1) (st 0x1) (st 0x6)
 ffma ftz $r0 $r1 c0[0x1c] $r3
 ffma ftz $r2 $r1 c0[0x24] $r5
 ffma ftz $r1 $r1 c0[0x20] $r4
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf) (st 0x0) (st 0x0)
 exit
 #endif
diff --git a/src/shader/videonv110.fpc b/src/shader/videonv110.fpc
index 31d745a..8fbc246 100644
--- a/src/shader/videonv110.fpc
+++ b/src/shader/videonv110.fpc
@@ -1,52 +1,52 @@ 
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
 0xcff7ff02,
 0xe003ff87,
 0x00470202,
 0x50800000,
 0x0027ff00,
 0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x001cbc03,
 0x4027ff01,
 0xe043ff88,
 0x2ff70004,
 0xc03a0004,
 0x2ff70000,
 0xc03a0016,
-0xfc0007e0,
-0x001f8000,
+0xfcc007ef,
+0x001f9801,
 0x34170001,
 0xf0f00000,
 0x00070405,
 0x4c681000,
 0x00170503,
 0x4c581000,
-0xfc0007e0,
-0x001f8000,
+0xfcc007e6,
+0x001fbc00,
 0x00270504,
 0x4c581000,
 0x00370505,
 0x4c581000,
 0x34070000,
 0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc2017e6,
+0x001f8400,
 0x00470003,
 0x49a00180,
 0x00570004,
 0x49a00200,
 0x00670005,
 0x49a00280,
-0xfc0007e0,
-0x001f8000,
+0xfc2007e1,
+0x001f9800,
 0x00770100,
 0x49a00180,
 0x00970102,
 0x49a00280,
 0x00870101,
 0x49a00200,
-0xfc0007e0,
+0xfc0007ef,
 0x001f8000,
 0x0007000f,
 0xe3000000,

Comments

Nice work!

See my comments below, and double-check if some of them can be applied 
to the shaders I didn't review yet.

I recommend that you test your work, because if even one sched code is wrong, 
you are likely going to kill your card and have to reboot your box. :-)

On 06/03/2017 04:16 PM, Aaryaman Vasishta wrote:
> v2: Add missing delays
> 
> This patch adds proper delays to maxwell exa shaders. rendercheck tests
> seem consistent with/without this patch. I haven't extensively tested
> them though.
> 
> Trello:
> https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays
> 
> Signed-off-by: Aaryaman Vasishta <jem456.vasishta@gmail.com>
> ---
>   src/shader/exac8nv110.fp  | 10 +++++-----
>   src/shader/exac8nv110.fpc | 18 +++++++++---------
>   src/shader/exacanv110.fp  | 10 +++++-----
>   src/shader/exacanv110.fpc | 18 +++++++++---------
>   src/shader/exacmnv110.fp  | 10 +++++-----
>   src/shader/exacmnv110.fpc | 18 +++++++++---------
>   src/shader/exas8nv110.fp  |  6 +++---
>   src/shader/exas8nv110.fpc | 12 ++++++------
>   src/shader/exasanv110.fp  | 10 +++++-----
>   src/shader/exasanv110.fpc | 18 +++++++++---------
>   src/shader/exascnv110.fp  |  6 +++---
>   src/shader/exascnv110.fpc | 10 +++++-----
>   src/shader/videonv110.fp  | 14 +++++++-------
>   src/shader/videonv110.fpc | 26 +++++++++++++-------------
>   14 files changed, 93 insertions(+), 93 deletions(-)
> 
> diff --git a/src/shader/exac8nv110.fp b/src/shader/exac8nv110.fp
> index ce78036..1c4a4f1 100644
> --- a/src/shader/exac8nv110.fp
> +++ b/src/shader/exac8nv110.fp
> @@ -25,23 +25,23 @@ NV110FP_Composite_A8[] = {
>   };
>   #else
>   
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>   mufu rcp $r0 $r0
>   ipa $r3 a[0x94] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x3) (st 0xf wr 0x1 wt 0x2)
>   ipa $r2 a[0x90] $r0 0x0 0x1
>   tex nodep $r1 $r2 0x0 0x1 t2d 0x8
>   ipa $r3 a[0x84] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>   ipa $r2 a[0x80] $r0 0x0 0x1
>   tex nodep $r0 $r2 0x0 0x0 t2d 0x8

Out of curiosity, why didn't you add a read-dep-bar on $r2:$r3 here?

>   depbar le 0x5 0x0 0x0
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x6 wt 0x3) (st 0x6) (st 0x1)
>   fmul ftz $r3 $r0 $r1
>   mov $r2 $r3 0xf

You can stall for only one cycle here, but the 6 cycles on fmul are needed.

>   mov $r1 $r3 0xf
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x6) (st 0xf) (st 0x0)
>   mov $r0 $r3 0xf

Same here.

>   exit
>   #endif
> diff --git a/src/shader/exac8nv110.fpc b/src/shader/exac8nv110.fpc
> index 4aa1368..46943b7 100644
> --- a/src/shader/exac8nv110.fpc
> +++ b/src/shader/exac8nv110.fpc
> @@ -1,36 +1,36 @@
> -0xfc0007e0,
> -0x001f8000,
> +0xe1a0070f,
> +0x003c3c01,
>   0xcff7ff00,
>   0xe003ff87,
>   0x00470000,
>   0x50800000,
>   0x4007ff03,
>   0xe043ff89,
> -0xfc0007e0,
> -0x001f8000,
> +0x21e0072f,
> +0x005cbc03,
>   0x0007ff02,
>   0xe043ff89,
>   0x2ff70201,
>   0xc03a0014,
>   0x4007ff03,
>   0xe043ff88,
> -0xfc0007e0,
> -0x001f8000,
> +0xe5e0074f,
> +0x001fbc06,
>   0x0007ff02,
>   0xe043ff88,
>   0x2ff70200,
>   0xc03a0004,
>   0x34070000,
>   0xf0f00000,
> -0xfc0007e0,
> -0x001f8000,
> +0xfcc01fe6,
> +0x001f8400,
>   0x00170003,
>   0x5c681000,
>   0x00370002,
>   0x5c980780,
>   0x00370001,
>   0x5c980780,
> -0xfc0007e0,
> +0xfde007e6,
>   0x001f8000,
>   0x00370000,
>   0x5c980780,
> diff --git a/src/shader/exacanv110.fp b/src/shader/exacanv110.fp
> index a70d5c5..d7c2867 100644
> --- a/src/shader/exacanv110.fp
> +++ b/src/shader/exacanv110.fp
> @@ -25,23 +25,23 @@ NV110FP_CAComposite[] = {
>   };
>   #else
>   
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>   mufu rcp $r0 $r0
>   ipa $r3 a[0x94] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
>   ipa $r2 a[0x90] $r0 0x0 0x1
>   tex nodep $r4 $r2 0x0 0x1 t2d 0xf

Please add a read-dep-bar here and wait on it at the first fmul, because 
$r2:$r3 are re-used before $r4. That should be safer.

>   ipa $r1 a[0x84] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>   ipa $r0 a[0x80] $r0 0x0 0x1
>   tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>   depbar le 0x5 0x0 0x0
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>   fmul ftz $r3 $r3 $r7

Why are you waiting on all barriers? Only $r3 is needed here.

>   fmul ftz $r2 $r2 $r6
>   fmul ftz $r1 $r1 $r5
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x1 wt 0x3) (st 0xf) (st 0x0)
>   fmul ftz $r0 $r0 $r4
>   exit
>   #endif
> diff --git a/src/shader/exacanv110.fpc b/src/shader/exacanv110.fpc
> index 7c0ca5e..9cad139 100644
> --- a/src/shader/exacanv110.fpc
> +++ b/src/shader/exacanv110.fpc
> @@ -1,36 +1,36 @@
> -0xfc0007e0,
> -0x001f8000,
> +0xe1a0070f,
> +0x003c3c01,
>   0xcff7ff00,
>   0xe003ff87,
>   0x00470000,
>   0x50800000,
>   0x4007ff03,
>   0xe043ff89,
> -0xfc0007e0,
> -0x001f8000,
> +0xe1e0072f,
> +0x0008bc03,
>   0x0007ff02,
>   0xe043ff89,
>   0xaff70204,
>   0xc03a0017,
>   0x4007ff01,
>   0xe043ff88,
> -0xfc0007e0,
> -0x001f8000,
> +0xe5e0274f,
> +0x001fbc06,
>   0x0007ff00,
>   0xe043ff88,
>   0xaff70000,
>   0xc03a0007,
>   0x34070000,
>   0xf0f00000,
> -0xfc0007e0,
> -0x001f8000,
> +0xfc21ffe1,
> +0x001f8400,
>   0x00770303,
>   0x5c681000,
>   0x00670202,
>   0x5c681000,
>   0x00570101,
>   0x5c681000,
> -0xfc0007e0,
> +0xfde01fe1,
>   0x001f8000,
>   0x00470000,
>   0x5c681000,
> diff --git a/src/shader/exacmnv110.fp b/src/shader/exacmnv110.fp
> index fe5c294..d717138 100644
> --- a/src/shader/exacmnv110.fp
> +++ b/src/shader/exacmnv110.fp
> @@ -25,23 +25,23 @@ NV110FP_Composite[] = {
>   };
>   #else
>   
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>   mufu rcp $r0 $r0
>   ipa $r3 a[0x94] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
>   ipa $r2 a[0x90] $r0 0x0 0x1
>   tex nodep $r4 $r2 0x0 0x1 t2d 0x8
>   ipa $r1 a[0x84] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>   ipa $r0 a[0x80] $r0 0x0 0x1
>   tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>   depbar le 0x5 0x0 0x0
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>   fmul ftz $r3 $r3 $r4
>   fmul ftz $r2 $r2 $r4
>   fmul ftz $r1 $r1 $r4
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x6 wt 0x2) (st 0xf) (st 0x0)
>   fmul ftz $r0 $r0 $r4
>   exit
>   #endif
> diff --git a/src/shader/exacmnv110.fpc b/src/shader/exacmnv110.fpc
> index 9d62c1a..c150875 100644
> --- a/src/shader/exacmnv110.fpc
> +++ b/src/shader/exacmnv110.fpc
> @@ -1,36 +1,36 @@
> -0xfc0007e0,
> -0x001f8000,
> +0xe1a0070f,
> +0x003c3c01,
>   0xcff7ff00,
>   0xe003ff87,
>   0x00470000,
>   0x50800000,
>   0x4007ff03,
>   0xe043ff89,
> -0xfc0007e0,
> -0x001f8000,
> +0xe1e0072f,
> +0x0008bc03,
>   0x0007ff02,
>   0xe043ff89,
>   0x2ff70204,
>   0xc03a0014,
>   0x4007ff01,
>   0xe043ff88,
> -0xfc0007e0,
> -0x001f8000,
> +0xe5e0274f,
> +0x001fbc06,
>   0x0007ff00,
>   0xe043ff88,
>   0xaff70000,
>   0xc03a0007,
>   0x34070000,
>   0xf0f00000,
> -0xfc0007e0,
> -0x001f8000,
> +0xfc21ffe1,
> +0x001f8400,
>   0x00470303,
>   0x5c681000,
>   0x00470202,
>   0x5c681000,
>   0x00470101,
>   0x5c681000,
> -0xfc0007e0,
> +0xfde017e6,
>   0x001f8000,
>   0x00470000,
>   0x5c681000,
> diff --git a/src/shader/exas8nv110.fp b/src/shader/exas8nv110.fp
> index 4fe2e19..a555beb 100644
> --- a/src/shader/exas8nv110.fp
> +++ b/src/shader/exas8nv110.fp
> @@ -25,15 +25,15 @@ NV110FP_Source_A8[] = {
>   };
>   #else
>   
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>   mufu rcp $r0 $r0
>   ipa $r1 a[0x84] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf)
>   ipa $r0 a[0x80] $r0 0x0 0x1
>   tex nodep $r0 $r0 0x0 0x0 t2d 0x8
>   depbar le 0x5 0x0 0x0
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x1 wt 0x1) (st 0x1) (st 0x1)
>   mov $r3 $r0 0xf
>   mov $r2 $r0 0xf
>   mov $r1 $r0 0xf

This one looks good!

> diff --git a/src/shader/exas8nv110.fpc b/src/shader/exas8nv110.fpc
> index 1181c41..e58d168 100644
> --- a/src/shader/exas8nv110.fpc
> +++ b/src/shader/exas8nv110.fpc
> @@ -1,21 +1,21 @@
> -0xfc0007e0,
> -0x001f8000,
> +0xe1a0070f,
> +0x003c3c01,
>   0xcff7ff00,
>   0xe003ff87,
>   0x00470000,
>   0x50800000,
>   0x4007ff01,
>   0xe043ff88,
> -0xfc0007e0,
> -0x001f8000,
> +0xe1e0072f,
> +0x001fbc03,
>   0x0007ff00,
>   0xe043ff88,
>   0x2ff70000,
>   0xc03a0004,
>   0x34070000,
>   0xf0f00000,
> -0xfc0007e0,
> -0x001f8000,
> +0xfc200fe1,
> +0x001f8400,
>   0x00070003,
>   0x5c980780,
>   0x00070002,
> diff --git a/src/shader/exasanv110.fp b/src/shader/exasanv110.fp
> index 61374a6..ad7ca36 100644
> --- a/src/shader/exasanv110.fp
> +++ b/src/shader/exasanv110.fp
> @@ -25,23 +25,23 @@ NV110FP_CACompositeSrcAlpha[] = {
>   };
>   #else
>   
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>   mufu rcp $r0 $r0
>   ipa $r3 a[0x84] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
>   ipa $r2 a[0x80] $r0 0x0 0x1
>   tex nodep $r4 $r2 0x0 0x0 t2d 0x8
>   ipa $r1 a[0x94] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>   ipa $r0 a[0x90] $r0 0x0 0x1
>   tex nodep $r0 $r0 0x0 0x1 t2d 0xf
>   depbar le 0x5 0x0 0x0
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>   fmul ftz $r3 $r3 $r4
>   fmul ftz $r2 $r2 $r4
>   fmul ftz $r1 $r1 $r4
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x1 wt 0x2) (st 0xf) (st 0x0)
>   fmul ftz $r0 $r0 $r4
>   exit
>   #endif
> diff --git a/src/shader/exasanv110.fpc b/src/shader/exasanv110.fpc
> index 5516a03..1485f11 100644
> --- a/src/shader/exasanv110.fpc
> +++ b/src/shader/exasanv110.fpc
> @@ -1,36 +1,36 @@
> -0xfc0007e0,
> -0x001f8000,
> +0xe1a0070f,
> +0x003c3c01,
>   0xcff7ff00,
>   0xe003ff87,
>   0x00470000,
>   0x50800000,
>   0x4007ff03,
>   0xe043ff88,
> -0xfc0007e0,
> -0x001f8000,
> +0xe1e0072f,
> +0x0008bc03,
>   0x0007ff02,
>   0xe043ff88,
>   0x2ff70204,
>   0xc03a0004,
>   0x4007ff01,
>   0xe043ff89,
> -0xfc0007e0,
> -0x001f8000,
> +0xe5e0274f,
> +0x001fbc06,
>   0x0007ff00,
>   0xe043ff89,
>   0xaff70000,
>   0xc03a0017,
>   0x34070000,
>   0xf0f00000,
> -0xfc0007e0,
> -0x001f8000,
> +0xfc21ffe1,
> +0x001f8400,
>   0x00470303,
>   0x5c681000,
>   0x00470202,
>   0x5c681000,
>   0x00470101,
>   0x5c681000,
> -0xfc0007e0,
> +0xfde017e1,
>   0x001f8000,
>   0x00470000,
>   0x5c681000,
> diff --git a/src/shader/exascnv110.fp b/src/shader/exascnv110.fp
> index 90bbb55..86e14e8 100644
> --- a/src/shader/exascnv110.fp
> +++ b/src/shader/exascnv110.fp
> @@ -25,14 +25,14 @@ NV110FP_Source[] = {
>   };
>   #else
>   
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>   mufu rcp $r0 $r0
>   ipa $r1 a[0x84] $r0 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
>   ipa $r0 a[0x80] $r0 0x0 0x1
>   tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>   depbar le 0x5 0x0 0x0
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf) (st 0x0) (st 0x0)

Looks good.

>   exit
>   #endif
> diff --git a/src/shader/exascnv110.fpc b/src/shader/exascnv110.fpc
> index 2dba15d..1fef5d2 100644
> --- a/src/shader/exascnv110.fpc
> +++ b/src/shader/exascnv110.fpc
> @@ -1,20 +1,20 @@
> -0xfc0007e0,
> -0x001f8000,
> +0xe1a0070f,
> +0x003c3c01,
>   0xcff7ff00,
>   0xe003ff87,
>   0x00470000,
>   0x50800000,
>   0x4007ff01,
>   0xe043ff88,
> -0xfc0007e0,
> -0x001f8000,
> +0xfde0072f,
> +0x001fbc03,
>   0x0007ff00,
>   0xe043ff88,
>   0xaff70000,
>   0xc03a0007,
>   0x34070000,
>   0xf0f00000,
> -0xfc0007e0,
> +0xfc0007ef,
>   0x001f8000,
>   0x0007000f,
>   0xe3000000,
> diff --git a/src/shader/videonv110.fp b/src/shader/videonv110.fp
> index 2728311..dd3816c 100644
> --- a/src/shader/videonv110.fp
> +++ b/src/shader/videonv110.fp
> @@ -25,30 +25,30 @@ NV110FP_NV12[] = {
>   };
>   #else
>   
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>   ipa pass $r2 a[0x7c] 0x0 0x0 0x1
>   mufu rcp $r2 $r2
>   ipa $r0 a[0x80] $r2 0x0 0x1
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1)
>   ipa $r1 a[0x84] $r2 0x0 0x1
>   tex nodep $r4 $r0 0x0 0x0 t2d 0x8
>   tex nodep $r0 $r0 0x0 0x1 t2d 0xc
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf) (st 0x6 wt 0x1) (st 0x6)
>   depbar le 0x5 0x1 0x1
>   fmul ftz $r5 $r4 c0[0x0]
>   fadd ftz $r3 $r5 c0[0x4]
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x6) (st 0x6) (st 0xf)
>   fadd ftz $r4 $r5 c0[0x8]
>   fadd ftz $r5 $r5 c0[0xc]
>   depbar le 0x5 0x0 0x0
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x6 wt 0x2) (st 0x1) (st 0x1)
>   ffma ftz $r3 $r0 c0[0x10] $r3
>   ffma ftz $r4 $r0 c0[0x14] $r4
>   ffma ftz $r5 $r0 c0[0x18] $r5
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0x1) (st 0x1) (st 0x6)
>   ffma ftz $r0 $r1 c0[0x1c] $r3
>   ffma ftz $r2 $r1 c0[0x24] $r5
>   ffma ftz $r1 $r1 c0[0x20] $r4
> -sched (st 0x0) (st 0x0) (st 0x0)
> +sched (st 0xf) (st 0x0) (st 0x0)
>   exit
>   #endif
> diff --git a/src/shader/videonv110.fpc b/src/shader/videonv110.fpc
> index 31d745a..8fbc246 100644
> --- a/src/shader/videonv110.fpc
> +++ b/src/shader/videonv110.fpc
> @@ -1,52 +1,52 @@
> -0xfc0007e0,
> -0x001f8000,
> +0xe1a0070f,
> +0x003c3c01,
>   0xcff7ff02,
>   0xe003ff87,
>   0x00470202,
>   0x50800000,
>   0x0027ff00,
>   0xe043ff88,
> -0xfc0007e0,
> -0x001f8000,
> +0xe1e0072f,
> +0x001cbc03,
>   0x4027ff01,
>   0xe043ff88,
>   0x2ff70004,
>   0xc03a0004,
>   0x2ff70000,
>   0xc03a0016,
> -0xfc0007e0,
> -0x001f8000,
> +0xfcc007ef,
> +0x001f9801,
>   0x34170001,
>   0xf0f00000,
>   0x00070405,
>   0x4c681000,
>   0x00170503,
>   0x4c581000,
> -0xfc0007e0,
> -0x001f8000,
> +0xfcc007e6,
> +0x001fbc00,
>   0x00270504,
>   0x4c581000,
>   0x00370505,
>   0x4c581000,
>   0x34070000,
>   0xf0f00000,
> -0xfc0007e0,
> -0x001f8000,
> +0xfc2017e6,
> +0x001f8400,
>   0x00470003,
>   0x49a00180,
>   0x00570004,
>   0x49a00200,
>   0x00670005,
>   0x49a00280,
> -0xfc0007e0,
> -0x001f8000,
> +0xfc2007e1,
> +0x001f9800,
>   0x00770100,
>   0x49a00180,
>   0x00970102,
>   0x49a00280,
>   0x00870101,
>   0x49a00200,
> -0xfc0007e0,
> +0xfc0007ef,
>   0x001f8000,
>   0x0007000f,
>   0xe3000000,
>
On Tue, Jun 6, 2017 at 7:15 AM, Samuel Pitoiset <samuel.pitoiset@gmail.com>
wrote:

> Nice work!
>
> See my comments below, and double-check if some of them can be applied to
> the shaders I didn't review yet.
>
> I recommend you to test your work because if one sched code is wrong, you
> are likely going to kill your card and reboot your box. :-)
>
>
> On 06/03/2017 04:16 PM, Aaryaman Vasishta wrote:
>
>> v2: Add missing delays
>>
>> This patch adds proper delays to maxwell exa shaders. rendercheck tests
>> seem consistent with/without this patch. I haven't extensively tested
>> them though.
>>
>> Trello:
>> https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays
>>
>> Signed-off-by: Aaryaman Vasishta <jem456.vasishta@gmail.com>
>> ---
>>   src/shader/exac8nv110.fp  | 10 +++++-----
>>   src/shader/exac8nv110.fpc | 18 +++++++++---------
>>   src/shader/exacanv110.fp  | 10 +++++-----
>>   src/shader/exacanv110.fpc | 18 +++++++++---------
>>   src/shader/exacmnv110.fp  | 10 +++++-----
>>   src/shader/exacmnv110.fpc | 18 +++++++++---------
>>   src/shader/exas8nv110.fp  |  6 +++---
>>   src/shader/exas8nv110.fpc | 12 ++++++------
>>   src/shader/exasanv110.fp  | 10 +++++-----
>>   src/shader/exasanv110.fpc | 18 +++++++++---------
>>   src/shader/exascnv110.fp  |  6 +++---
>>   src/shader/exascnv110.fpc | 10 +++++-----
>>   src/shader/videonv110.fp  | 14 +++++++-------
>>   src/shader/videonv110.fpc | 26 +++++++++++++-------------
>>   14 files changed, 93 insertions(+), 93 deletions(-)
>>
>> diff --git a/src/shader/exac8nv110.fp b/src/shader/exac8nv110.fp
>> index ce78036..1c4a4f1 100644
>> --- a/src/shader/exac8nv110.fp
>> +++ b/src/shader/exac8nv110.fp
>> @@ -25,23 +25,23 @@ NV110FP_Composite_A8[] = {
>>   };
>>   #else
>>   -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>   mufu rcp $r0 $r0
>>   ipa $r3 a[0x94] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x3) (st 0xf wr 0x1 wt
>> 0x2)
>>   ipa $r2 a[0x90] $r0 0x0 0x1
>>   tex nodep $r1 $r2 0x0 0x1 t2d 0x8
>>   ipa $r3 a[0x84] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>>   ipa $r2 a[0x80] $r0 0x0 0x1
>>   tex nodep $r0 $r2 0x0 0x0 t2d 0x8
>>
>
> Out of curiosity, what didn't you add a read-dep-bar on $r2:$r3 here?

Missed it, thanks for pointing it out.

>
>
>   depbar le 0x5 0x0 0x0
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x6 wt 0x3) (st 0x6) (st 0x1)
>>   fmul ftz $r3 $r0 $r1
>>   mov $r2 $r3 0xf
>>
>
> You can stall for only one cycle here, but the 6 cycles on fmul is needed.
>
>   mov $r1 $r3 0xf
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x6) (st 0xf) (st 0x0)
>>   mov $r0 $r3 0xf
>>
>
> Same here.


>
>   exit
>>   #endif
>> diff --git a/src/shader/exac8nv110.fpc b/src/shader/exac8nv110.fpc
>> index 4aa1368..46943b7 100644
>> --- a/src/shader/exac8nv110.fpc
>> +++ b/src/shader/exac8nv110.fpc
>> @@ -1,36 +1,36 @@
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1a0070f,
>> +0x003c3c01,
>>   0xcff7ff00,
>>   0xe003ff87,
>>   0x00470000,
>>   0x50800000,
>>   0x4007ff03,
>>   0xe043ff89,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0x21e0072f,
>> +0x005cbc03,
>>   0x0007ff02,
>>   0xe043ff89,
>>   0x2ff70201,
>>   0xc03a0014,
>>   0x4007ff03,
>>   0xe043ff88,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe5e0074f,
>> +0x001fbc06,
>>   0x0007ff02,
>>   0xe043ff88,
>>   0x2ff70200,
>>   0xc03a0004,
>>   0x34070000,
>>   0xf0f00000,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfcc01fe6,
>> +0x001f8400,
>>   0x00170003,
>>   0x5c681000,
>>   0x00370002,
>>   0x5c980780,
>>   0x00370001,
>>   0x5c980780,
>> -0xfc0007e0,
>> +0xfde007e6,
>>   0x001f8000,
>>   0x00370000,
>>   0x5c980780,
>> diff --git a/src/shader/exacanv110.fp b/src/shader/exacanv110.fp
>> index a70d5c5..d7c2867 100644
>> --- a/src/shader/exacanv110.fp
>> +++ b/src/shader/exacanv110.fp
>> @@ -25,23 +25,23 @@ NV110FP_CAComposite[] = {
>>   };
>>   #else
>>   -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>   mufu rcp $r0 $r0
>>   ipa $r3 a[0x94] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
>>   ipa $r2 a[0x90] $r0 0x0 0x1
>>   tex nodep $r4 $r2 0x0 0x1 t2d 0xf
>>
>
> Please add a read-dep-bar and wait for it on the first fmul because $r2:$r3
> are re-used before $r4. Should be safer.


>
>   ipa $r1 a[0x84] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>>   ipa $r0 a[0x80] $r0 0x0 0x1
>>   tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>>   depbar le 0x5 0x0 0x0
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>>   fmul ftz $r3 $r3 $r7
>>
>
> Why are you waiting on all barriers? Only $r3 is needed here.

After adding a read-dep-bar and waiting on that over here, I wasn't able to
pass the same number of `rendercheck -f a8r8g8b8` tests as before this
patch.  After a little trial and error I discovered that wt 0xc fixes it,
which means that bar 3 and 4 were being used in this fmul somehow (assuming
bars start from 1), which is odd because this fmul only uses $r3 and $r7,
and I think it should wait on the read-dep-bar set on "tex nodep $r4 $r2
0x0 0x1 t2d 0xf" (I could be wrong though). I'm kinda stumped on what's
going on within this fmul that's causing this behavior.

>
>
>   fmul ftz $r2 $r2 $r6
>>   fmul ftz $r1 $r1 $r5
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x1 wt 0x3) (st 0xf) (st 0x0)
>>   fmul ftz $r0 $r0 $r4
>>   exit
>>   #endif
>> diff --git a/src/shader/exacanv110.fpc b/src/shader/exacanv110.fpc
>> index 7c0ca5e..9cad139 100644
>> --- a/src/shader/exacanv110.fpc
>> +++ b/src/shader/exacanv110.fpc
>> @@ -1,36 +1,36 @@
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1a0070f,
>> +0x003c3c01,
>>   0xcff7ff00,
>>   0xe003ff87,
>>   0x00470000,
>>   0x50800000,
>>   0x4007ff03,
>>   0xe043ff89,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1e0072f,
>> +0x0008bc03,
>>   0x0007ff02,
>>   0xe043ff89,
>>   0xaff70204,
>>   0xc03a0017,
>>   0x4007ff01,
>>   0xe043ff88,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe5e0274f,
>> +0x001fbc06,
>>   0x0007ff00,
>>   0xe043ff88,
>>   0xaff70000,
>>   0xc03a0007,
>>   0x34070000,
>>   0xf0f00000,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfc21ffe1,
>> +0x001f8400,
>>   0x00770303,
>>   0x5c681000,
>>   0x00670202,
>>   0x5c681000,
>>   0x00570101,
>>   0x5c681000,
>> -0xfc0007e0,
>> +0xfde01fe1,
>>   0x001f8000,
>>   0x00470000,
>>   0x5c681000,
>> diff --git a/src/shader/exacmnv110.fp b/src/shader/exacmnv110.fp
>> index fe5c294..d717138 100644
>> --- a/src/shader/exacmnv110.fp
>> +++ b/src/shader/exacmnv110.fp
>> @@ -25,23 +25,23 @@ NV110FP_Composite[] = {
>>   };
>>   #else
>>   -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>   mufu rcp $r0 $r0
>>   ipa $r3 a[0x94] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
>>   ipa $r2 a[0x90] $r0 0x0 0x1
>>   tex nodep $r4 $r2 0x0 0x1 t2d 0x8
>>   ipa $r1 a[0x84] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>>   ipa $r0 a[0x80] $r0 0x0 0x1
>>   tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>>   depbar le 0x5 0x0 0x0
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>>   fmul ftz $r3 $r3 $r4
>>   fmul ftz $r2 $r2 $r4
>>   fmul ftz $r1 $r1 $r4
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x6 wt 0x2) (st 0xf) (st 0x0)
>>   fmul ftz $r0 $r0 $r4
>>   exit
>>   #endif
>> diff --git a/src/shader/exacmnv110.fpc b/src/shader/exacmnv110.fpc
>> index 9d62c1a..c150875 100644
>> --- a/src/shader/exacmnv110.fpc
>> +++ b/src/shader/exacmnv110.fpc
>> @@ -1,36 +1,36 @@
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1a0070f,
>> +0x003c3c01,
>>   0xcff7ff00,
>>   0xe003ff87,
>>   0x00470000,
>>   0x50800000,
>>   0x4007ff03,
>>   0xe043ff89,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1e0072f,
>> +0x0008bc03,
>>   0x0007ff02,
>>   0xe043ff89,
>>   0x2ff70204,
>>   0xc03a0014,
>>   0x4007ff01,
>>   0xe043ff88,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe5e0274f,
>> +0x001fbc06,
>>   0x0007ff00,
>>   0xe043ff88,
>>   0xaff70000,
>>   0xc03a0007,
>>   0x34070000,
>>   0xf0f00000,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfc21ffe1,
>> +0x001f8400,
>>   0x00470303,
>>   0x5c681000,
>>   0x00470202,
>>   0x5c681000,
>>   0x00470101,
>>   0x5c681000,
>> -0xfc0007e0,
>> +0xfde017e6,
>>   0x001f8000,
>>   0x00470000,
>>   0x5c681000,
>> diff --git a/src/shader/exas8nv110.fp b/src/shader/exas8nv110.fp
>> index 4fe2e19..a555beb 100644
>> --- a/src/shader/exas8nv110.fp
>> +++ b/src/shader/exas8nv110.fp
>> @@ -25,15 +25,15 @@ NV110FP_Source_A8[] = {
>>   };
>>   #else
>>   -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>   mufu rcp $r0 $r0
>>   ipa $r1 a[0x84] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf)
>>   ipa $r0 a[0x80] $r0 0x0 0x1
>>   tex nodep $r0 $r0 0x0 0x0 t2d 0x8
>>   depbar le 0x5 0x0 0x0
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x1 wt 0x1) (st 0x1) (st 0x1)
>>   mov $r3 $r0 0xf
>>   mov $r2 $r0 0xf
>>   mov $r1 $r0 0xf
>>
>
> This one looks good!
>
>
> diff --git a/src/shader/exas8nv110.fpc b/src/shader/exas8nv110.fpc
>> index 1181c41..e58d168 100644
>> --- a/src/shader/exas8nv110.fpc
>> +++ b/src/shader/exas8nv110.fpc
>> @@ -1,21 +1,21 @@
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1a0070f,
>> +0x003c3c01,
>>   0xcff7ff00,
>>   0xe003ff87,
>>   0x00470000,
>>   0x50800000,
>>   0x4007ff01,
>>   0xe043ff88,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1e0072f,
>> +0x001fbc03,
>>   0x0007ff00,
>>   0xe043ff88,
>>   0x2ff70000,
>>   0xc03a0004,
>>   0x34070000,
>>   0xf0f00000,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfc200fe1,
>> +0x001f8400,
>>   0x00070003,
>>   0x5c980780,
>>   0x00070002,
>> diff --git a/src/shader/exasanv110.fp b/src/shader/exasanv110.fp
>> index 61374a6..ad7ca36 100644
>> --- a/src/shader/exasanv110.fp
>> +++ b/src/shader/exasanv110.fp
>> @@ -25,23 +25,23 @@ NV110FP_CACompositeSrcAlpha[] = {
>>   };
>>   #else
>>   -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>   mufu rcp $r0 $r0
>>   ipa $r3 a[0x84] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
>>   ipa $r2 a[0x80] $r0 0x0 0x1
>>   tex nodep $r4 $r2 0x0 0x0 t2d 0x8
>>   ipa $r1 a[0x94] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>>   ipa $r0 a[0x90] $r0 0x0 0x1
>>   tex nodep $r0 $r0 0x0 0x1 t2d 0xf
>>   depbar le 0x5 0x0 0x0
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>>   fmul ftz $r3 $r3 $r4
>>   fmul ftz $r2 $r2 $r4
>>   fmul ftz $r1 $r1 $r4
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x1 wt 0x2) (st 0xf) (st 0x0)
>>   fmul ftz $r0 $r0 $r4
>>   exit
>>   #endif
>> diff --git a/src/shader/exasanv110.fpc b/src/shader/exasanv110.fpc
>> index 5516a03..1485f11 100644
>> --- a/src/shader/exasanv110.fpc
>> +++ b/src/shader/exasanv110.fpc
>> @@ -1,36 +1,36 @@
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1a0070f,
>> +0x003c3c01,
>>   0xcff7ff00,
>>   0xe003ff87,
>>   0x00470000,
>>   0x50800000,
>>   0x4007ff03,
>>   0xe043ff88,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1e0072f,
>> +0x0008bc03,
>>   0x0007ff02,
>>   0xe043ff88,
>>   0x2ff70204,
>>   0xc03a0004,
>>   0x4007ff01,
>>   0xe043ff89,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe5e0274f,
>> +0x001fbc06,
>>   0x0007ff00,
>>   0xe043ff89,
>>   0xaff70000,
>>   0xc03a0017,
>>   0x34070000,
>>   0xf0f00000,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfc21ffe1,
>> +0x001f8400,
>>   0x00470303,
>>   0x5c681000,
>>   0x00470202,
>>   0x5c681000,
>>   0x00470101,
>>   0x5c681000,
>> -0xfc0007e0,
>> +0xfde017e1,
>>   0x001f8000,
>>   0x00470000,
>>   0x5c681000,
>> diff --git a/src/shader/exascnv110.fp b/src/shader/exascnv110.fp
>> index 90bbb55..86e14e8 100644
>> --- a/src/shader/exascnv110.fp
>> +++ b/src/shader/exascnv110.fp
>> @@ -25,14 +25,14 @@ NV110FP_Source[] = {
>>   };
>>   #else
>>   -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>>   ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>   mufu rcp $r0 $r0
>>   ipa $r1 a[0x84] $r0 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
>>   ipa $r0 a[0x80] $r0 0x0 0x1
>>   tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>>   depbar le 0x5 0x0 0x0
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf) (st 0x0) (st 0x0)
>>
>
> Looks good.
>
>
>   exit
>>   #endif
>> diff --git a/src/shader/exascnv110.fpc b/src/shader/exascnv110.fpc
>> index 2dba15d..1fef5d2 100644
>> --- a/src/shader/exascnv110.fpc
>> +++ b/src/shader/exascnv110.fpc
>> @@ -1,20 +1,20 @@
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1a0070f,
>> +0x003c3c01,
>>   0xcff7ff00,
>>   0xe003ff87,
>>   0x00470000,
>>   0x50800000,
>>   0x4007ff01,
>>   0xe043ff88,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfde0072f,
>> +0x001fbc03,
>>   0x0007ff00,
>>   0xe043ff88,
>>   0xaff70000,
>>   0xc03a0007,
>>   0x34070000,
>>   0xf0f00000,
>> -0xfc0007e0,
>> +0xfc0007ef,
>>   0x001f8000,
>>   0x0007000f,
>>   0xe3000000,
>> diff --git a/src/shader/videonv110.fp b/src/shader/videonv110.fp
>> index 2728311..dd3816c 100644
>> --- a/src/shader/videonv110.fp
>> +++ b/src/shader/videonv110.fp
>> @@ -25,30 +25,30 @@ NV110FP_NV12[] = {
>>   };
>>   #else
>>   -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>>   ipa pass $r2 a[0x7c] 0x0 0x0 0x1
>>   mufu rcp $r2 $r2
>>   ipa $r0 a[0x80] $r2 0x0 0x1
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1)
>>   ipa $r1 a[0x84] $r2 0x0 0x1
>>   tex nodep $r4 $r0 0x0 0x0 t2d 0x8
>>   tex nodep $r0 $r0 0x0 0x1 t2d 0xc
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf) (st 0x6 wt 0x1) (st 0x6)
>>   depbar le 0x5 0x1 0x1
>>   fmul ftz $r5 $r4 c0[0x0]
>>   fadd ftz $r3 $r5 c0[0x4]
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x6) (st 0x6) (st 0xf)
>>   fadd ftz $r4 $r5 c0[0x8]
>>   fadd ftz $r5 $r5 c0[0xc]
>>   depbar le 0x5 0x0 0x0
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x6 wt 0x2) (st 0x1) (st 0x1)
>>   ffma ftz $r3 $r0 c0[0x10] $r3
>>   ffma ftz $r4 $r0 c0[0x14] $r4
>>   ffma ftz $r5 $r0 c0[0x18] $r5
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0x1) (st 0x1) (st 0x6)
>>   ffma ftz $r0 $r1 c0[0x1c] $r3
>>   ffma ftz $r2 $r1 c0[0x24] $r5
>>   ffma ftz $r1 $r1 c0[0x20] $r4
>> -sched (st 0x0) (st 0x0) (st 0x0)
>> +sched (st 0xf) (st 0x0) (st 0x0)
>>   exit
>>   #endif
>> diff --git a/src/shader/videonv110.fpc b/src/shader/videonv110.fpc
>> index 31d745a..8fbc246 100644
>> --- a/src/shader/videonv110.fpc
>> +++ b/src/shader/videonv110.fpc
>> @@ -1,52 +1,52 @@
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1a0070f,
>> +0x003c3c01,
>>   0xcff7ff02,
>>   0xe003ff87,
>>   0x00470202,
>>   0x50800000,
>>   0x0027ff00,
>>   0xe043ff88,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xe1e0072f,
>> +0x001cbc03,
>>   0x4027ff01,
>>   0xe043ff88,
>>   0x2ff70004,
>>   0xc03a0004,
>>   0x2ff70000,
>>   0xc03a0016,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfcc007ef,
>> +0x001f9801,
>>   0x34170001,
>>   0xf0f00000,
>>   0x00070405,
>>   0x4c681000,
>>   0x00170503,
>>   0x4c581000,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfcc007e6,
>> +0x001fbc00,
>>   0x00270504,
>>   0x4c581000,
>>   0x00370505,
>>   0x4c581000,
>>   0x34070000,
>>   0xf0f00000,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfc2017e6,
>> +0x001f8400,
>>   0x00470003,
>>   0x49a00180,
>>   0x00570004,
>>   0x49a00200,
>>   0x00670005,
>>   0x49a00280,
>> -0xfc0007e0,
>> -0x001f8000,
>> +0xfc2007e1,
>> +0x001f9800,
>>   0x00770100,
>>   0x49a00180,
>>   0x00970102,
>>   0x49a00280,
>>   0x00870101,
>>   0x49a00200,
>> -0xfc0007e0,
>> +0xfc0007ef,
>>   0x001f8000,
>>   0x0007000f,
>>   0xe3000000,
>>
>
As for your other comments, I have made the suggested changes.

Thanks for your review!

Cheers,
Aaryaman
On 06/07/2017 06:58 PM, Aaryaman Vasishta wrote:
> 
> 
> On Tue, Jun 6, 2017 at 7:15 AM, Samuel Pitoiset 
> <samuel.pitoiset@gmail.com <mailto:samuel.pitoiset@gmail.com>> wrote:
> 
>     Nice work!
> 
>     See my comments below, and double-check if some of them can be
>     applied to the shaders I didn't review yet.
> 
>     I recommend that you test your work, because if one sched code is
>     wrong, you are likely to kill your card and have to reboot your box. :-)
> 
> 
>     On 06/03/2017 04:16 PM, Aaryaman Vasishta wrote:
> 
>         v2: Add missing delays
> 
>         This patch adds proper delays to maxwell exa shaders.
>         rendercheck tests
>         seem consistent with/without this patch. I haven't extensively
>         tested
>         them though.
> 
>         Trello:
>         https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays
>         <https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays>
> 
>         Signed-off-by: Aaryaman Vasishta <jem456.vasishta@gmail.com
>         <mailto:jem456.vasishta@gmail.com>>
>         ---
>            src/shader/exac8nv110.fp  | 10 +++++-----
>            src/shader/exac8nv110.fpc | 18 +++++++++---------
>            src/shader/exacanv110.fp  | 10 +++++-----
>            src/shader/exacanv110.fpc | 18 +++++++++---------
>            src/shader/exacmnv110.fp  | 10 +++++-----
>            src/shader/exacmnv110.fpc | 18 +++++++++---------
>            src/shader/exas8nv110.fp  |  6 +++---
>            src/shader/exas8nv110.fpc | 12 ++++++------
>            src/shader/exasanv110.fp  | 10 +++++-----
>            src/shader/exasanv110.fpc | 18 +++++++++---------
>            src/shader/exascnv110.fp  |  6 +++---
>            src/shader/exascnv110.fpc | 10 +++++-----
>            src/shader/videonv110.fp  | 14 +++++++-------
>            src/shader/videonv110.fpc | 26 +++++++++++++-------------
>            14 files changed, 93 insertions(+), 93 deletions(-)
> 
>         diff --git a/src/shader/exac8nv110.fp b/src/shader/exac8nv110.fp
>         index ce78036..1c4a4f1 100644
>         --- a/src/shader/exac8nv110.fp
>         +++ b/src/shader/exac8nv110.fp
>         @@ -25,23 +25,23 @@ NV110FP_Composite_A8[] = {
>            };
>            #else
>            -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>            mufu rcp $r0 $r0
>            ipa $r3 a[0x94] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x3) (st 0xf wr
>         0x1 wt 0x2)
>            ipa $r2 a[0x90] $r0 0x0 0x1
>            tex nodep $r1 $r2 0x0 0x1 t2d 0x8
>            ipa $r3 a[0x84] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>            ipa $r2 a[0x80] $r0 0x0 0x1
>            tex nodep $r0 $r2 0x0 0x0 t2d 0x8
> 
> 
>     Out of curiosity, why didn't you add a read-dep-bar on $r2:$r3 here?
> 
> Missed it, thanks for pointing it out.

You don't have to. 'tex' reads two sources ($r2:$r3) and writes into 
$r0, but as $r2:$r3 are NOT re-used before $r0 is read, you can assume 
that $r0 will be ready, so you don't need any read-dep-bar.

> 
> 
> 
>            depbar le 0x5 0x0 0x0
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x6 wt 0x3) (st 0x6) (st 0x1)
>            fmul ftz $r3 $r0 $r1
>            mov $r2 $r3 0xf
> 
> 
>     You can stall for only one cycle here, but the 6 cycles on fmul are
>     needed.
> 
>            mov $r1 $r3 0xf
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x6) (st 0xf) (st 0x0)
>            mov $r0 $r3 0xf
> 
> 
>     Same here. 
> 
> 
> 
>            exit
>            #endif
>         diff --git a/src/shader/exac8nv110.fpc b/src/shader/exac8nv110.fpc
>         index 4aa1368..46943b7 100644
>         --- a/src/shader/exac8nv110.fpc
>         +++ b/src/shader/exac8nv110.fpc
>         @@ -1,36 +1,36 @@
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1a0070f,
>         +0x003c3c01,
>            0xcff7ff00,
>            0xe003ff87,
>            0x00470000,
>            0x50800000,
>            0x4007ff03,
>            0xe043ff89,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0x21e0072f,
>         +0x005cbc03,
>            0x0007ff02,
>            0xe043ff89,
>            0x2ff70201,
>            0xc03a0014,
>            0x4007ff03,
>            0xe043ff88,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe5e0074f,
>         +0x001fbc06,
>            0x0007ff02,
>            0xe043ff88,
>            0x2ff70200,
>            0xc03a0004,
>            0x34070000,
>            0xf0f00000,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfcc01fe6,
>         +0x001f8400,
>            0x00170003,
>            0x5c681000,
>            0x00370002,
>            0x5c980780,
>            0x00370001,
>            0x5c980780,
>         -0xfc0007e0,
>         +0xfde007e6,
>            0x001f8000,
>            0x00370000,
>            0x5c980780,
>         diff --git a/src/shader/exacanv110.fp b/src/shader/exacanv110.fp
>         index a70d5c5..d7c2867 100644
>         --- a/src/shader/exacanv110.fp
>         +++ b/src/shader/exacanv110.fp
>         @@ -25,23 +25,23 @@ NV110FP_CAComposite[] = {
>            };
>            #else
>            -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>            mufu rcp $r0 $r0
>            ipa $r3 a[0x94] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
>            ipa $r2 a[0x90] $r0 0x0 0x1
>            tex nodep $r4 $r2 0x0 0x1 t2d 0xf
> 
> 
>     Please add a read-dep-bar and wait for it on the first fmul because
>     $r2:$r3 are re-used before $r4. Should be safer.
> 
> 
> 
>            ipa $r1 a[0x84] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>            ipa $r0 a[0x80] $r0 0x0 0x1
>            tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>            depbar le 0x5 0x0 0x0
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>            fmul ftz $r3 $r3 $r7
> 
> 
>     Why are you waiting on all barriers? Only $r3 is needed here.
> 
> After adding a read-dep-bar and waiting on that over here, I wasn't able 
> to pass the same number of `rendercheck -f a8r8g8b8` tests as before 
> this patch.  After a little trial and error I discovered that wt 0xc 
> fixes it, which means that bar 3 and 4 were being used in this fmul 
> somehow (assuming bars start from 1), which is odd because this fmul 
> only uses $r3 and $r7, and I think it should wait on the read-dep-bar 
> set on "tex nodep $r4 $r2 0x0 0x1 t2d 0xf" (I could be wrong though). 
> I'm kinda stumped on what's going on within this fmul that's causing 
> this behavior.

Because you are missing a read-dep-bar on the first 'tex' in this 
shader. Presumably, if you add one, you no longer need to wait for all bars.

Samuel.
> 
> 
> 
>            fmul ftz $r2 $r2 $r6
>            fmul ftz $r1 $r1 $r5
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x1 wt 0x3) (st 0xf) (st 0x0)
>            fmul ftz $r0 $r0 $r4
>            exit
>            #endif
>         diff --git a/src/shader/exacanv110.fpc b/src/shader/exacanv110.fpc
>         index 7c0ca5e..9cad139 100644
>         --- a/src/shader/exacanv110.fpc
>         +++ b/src/shader/exacanv110.fpc
>         @@ -1,36 +1,36 @@
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1a0070f,
>         +0x003c3c01,
>            0xcff7ff00,
>            0xe003ff87,
>            0x00470000,
>            0x50800000,
>            0x4007ff03,
>            0xe043ff89,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1e0072f,
>         +0x0008bc03,
>            0x0007ff02,
>            0xe043ff89,
>            0xaff70204,
>            0xc03a0017,
>            0x4007ff01,
>            0xe043ff88,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe5e0274f,
>         +0x001fbc06,
>            0x0007ff00,
>            0xe043ff88,
>            0xaff70000,
>            0xc03a0007,
>            0x34070000,
>            0xf0f00000,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfc21ffe1,
>         +0x001f8400,
>            0x00770303,
>            0x5c681000,
>            0x00670202,
>            0x5c681000,
>            0x00570101,
>            0x5c681000,
>         -0xfc0007e0,
>         +0xfde01fe1,
>            0x001f8000,
>            0x00470000,
>            0x5c681000,
>         diff --git a/src/shader/exacmnv110.fp b/src/shader/exacmnv110.fp
>         index fe5c294..d717138 100644
>         --- a/src/shader/exacmnv110.fp
>         +++ b/src/shader/exacmnv110.fp
>         @@ -25,23 +25,23 @@ NV110FP_Composite[] = {
>            };
>            #else
>            -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>            mufu rcp $r0 $r0
>            ipa $r3 a[0x94] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
>            ipa $r2 a[0x90] $r0 0x0 0x1
>            tex nodep $r4 $r2 0x0 0x1 t2d 0x8
>            ipa $r1 a[0x84] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>            ipa $r0 a[0x80] $r0 0x0 0x1
>            tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>            depbar le 0x5 0x0 0x0
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>            fmul ftz $r3 $r3 $r4
>            fmul ftz $r2 $r2 $r4
>            fmul ftz $r1 $r1 $r4
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x6 wt 0x2) (st 0xf) (st 0x0)
>            fmul ftz $r0 $r0 $r4
>            exit
>            #endif
>         diff --git a/src/shader/exacmnv110.fpc b/src/shader/exacmnv110.fpc
>         index 9d62c1a..c150875 100644
>         --- a/src/shader/exacmnv110.fpc
>         +++ b/src/shader/exacmnv110.fpc
>         @@ -1,36 +1,36 @@
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1a0070f,
>         +0x003c3c01,
>            0xcff7ff00,
>            0xe003ff87,
>            0x00470000,
>            0x50800000,
>            0x4007ff03,
>            0xe043ff89,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1e0072f,
>         +0x0008bc03,
>            0x0007ff02,
>            0xe043ff89,
>            0x2ff70204,
>            0xc03a0014,
>            0x4007ff01,
>            0xe043ff88,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe5e0274f,
>         +0x001fbc06,
>            0x0007ff00,
>            0xe043ff88,
>            0xaff70000,
>            0xc03a0007,
>            0x34070000,
>            0xf0f00000,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfc21ffe1,
>         +0x001f8400,
>            0x00470303,
>            0x5c681000,
>            0x00470202,
>            0x5c681000,
>            0x00470101,
>            0x5c681000,
>         -0xfc0007e0,
>         +0xfde017e6,
>            0x001f8000,
>            0x00470000,
>            0x5c681000,
>         diff --git a/src/shader/exas8nv110.fp b/src/shader/exas8nv110.fp
>         index 4fe2e19..a555beb 100644
>         --- a/src/shader/exas8nv110.fp
>         +++ b/src/shader/exas8nv110.fp
>         @@ -25,15 +25,15 @@ NV110FP_Source_A8[] = {
>            };
>            #else
>            -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>            mufu rcp $r0 $r0
>            ipa $r1 a[0x84] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf)
>            ipa $r0 a[0x80] $r0 0x0 0x1
>            tex nodep $r0 $r0 0x0 0x0 t2d 0x8
>            depbar le 0x5 0x0 0x0
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x1 wt 0x1) (st 0x1) (st 0x1)
>            mov $r3 $r0 0xf
>            mov $r2 $r0 0xf
>            mov $r1 $r0 0xf
> 
> 
>     This one looks good!
> 
> 
>         diff --git a/src/shader/exas8nv110.fpc b/src/shader/exas8nv110.fpc
>         index 1181c41..e58d168 100644
>         --- a/src/shader/exas8nv110.fpc
>         +++ b/src/shader/exas8nv110.fpc
>         @@ -1,21 +1,21 @@
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1a0070f,
>         +0x003c3c01,
>            0xcff7ff00,
>            0xe003ff87,
>            0x00470000,
>            0x50800000,
>            0x4007ff01,
>            0xe043ff88,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1e0072f,
>         +0x001fbc03,
>            0x0007ff00,
>            0xe043ff88,
>            0x2ff70000,
>            0xc03a0004,
>            0x34070000,
>            0xf0f00000,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfc200fe1,
>         +0x001f8400,
>            0x00070003,
>            0x5c980780,
>            0x00070002,
>         diff --git a/src/shader/exasanv110.fp b/src/shader/exasanv110.fp
>         index 61374a6..ad7ca36 100644
>         --- a/src/shader/exasanv110.fp
>         +++ b/src/shader/exasanv110.fp
>         @@ -25,23 +25,23 @@ NV110FP_CACompositeSrcAlpha[] = {
>            };
>            #else
>            -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>            mufu rcp $r0 $r0
>            ipa $r3 a[0x84] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
>            ipa $r2 a[0x80] $r0 0x0 0x1
>            tex nodep $r4 $r2 0x0 0x0 t2d 0x8
>            ipa $r1 a[0x94] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>            ipa $r0 a[0x90] $r0 0x0 0x1
>            tex nodep $r0 $r0 0x0 0x1 t2d 0xf
>            depbar le 0x5 0x0 0x0
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>            fmul ftz $r3 $r3 $r4
>            fmul ftz $r2 $r2 $r4
>            fmul ftz $r1 $r1 $r4
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x1 wt 0x2) (st 0xf) (st 0x0)
>            fmul ftz $r0 $r0 $r4
>            exit
>            #endif
>         diff --git a/src/shader/exasanv110.fpc b/src/shader/exasanv110.fpc
>         index 5516a03..1485f11 100644
>         --- a/src/shader/exasanv110.fpc
>         +++ b/src/shader/exasanv110.fpc
>         @@ -1,36 +1,36 @@
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1a0070f,
>         +0x003c3c01,
>            0xcff7ff00,
>            0xe003ff87,
>            0x00470000,
>            0x50800000,
>            0x4007ff03,
>            0xe043ff88,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1e0072f,
>         +0x0008bc03,
>            0x0007ff02,
>            0xe043ff88,
>            0x2ff70204,
>            0xc03a0004,
>            0x4007ff01,
>            0xe043ff89,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe5e0274f,
>         +0x001fbc06,
>            0x0007ff00,
>            0xe043ff89,
>            0xaff70000,
>            0xc03a0017,
>            0x34070000,
>            0xf0f00000,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfc21ffe1,
>         +0x001f8400,
>            0x00470303,
>            0x5c681000,
>            0x00470202,
>            0x5c681000,
>            0x00470101,
>            0x5c681000,
>         -0xfc0007e0,
>         +0xfde017e1,
>            0x001f8000,
>            0x00470000,
>            0x5c681000,
>         diff --git a/src/shader/exascnv110.fp b/src/shader/exascnv110.fp
>         index 90bbb55..86e14e8 100644
>         --- a/src/shader/exascnv110.fp
>         +++ b/src/shader/exascnv110.fp
>         @@ -25,14 +25,14 @@ NV110FP_Source[] = {
>            };
>            #else
>            -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>            mufu rcp $r0 $r0
>            ipa $r1 a[0x84] $r0 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
>            ipa $r0 a[0x80] $r0 0x0 0x1
>            tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>            depbar le 0x5 0x0 0x0
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf) (st 0x0) (st 0x0)
> 
> 
>     Looks good.
> 
> 
>            exit
>            #endif
>         diff --git a/src/shader/exascnv110.fpc b/src/shader/exascnv110.fpc
>         index 2dba15d..1fef5d2 100644
>         --- a/src/shader/exascnv110.fpc
>         +++ b/src/shader/exascnv110.fpc
>         @@ -1,20 +1,20 @@
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1a0070f,
>         +0x003c3c01,
>            0xcff7ff00,
>            0xe003ff87,
>            0x00470000,
>            0x50800000,
>            0x4007ff01,
>            0xe043ff88,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfde0072f,
>         +0x001fbc03,
>            0x0007ff00,
>            0xe043ff88,
>            0xaff70000,
>            0xc03a0007,
>            0x34070000,
>            0xf0f00000,
>         -0xfc0007e0,
>         +0xfc0007ef,
>            0x001f8000,
>            0x0007000f,
>            0xe3000000,
>         diff --git a/src/shader/videonv110.fp b/src/shader/videonv110.fp
>         index 2728311..dd3816c 100644
>         --- a/src/shader/videonv110.fp
>         +++ b/src/shader/videonv110.fp
>         @@ -25,30 +25,30 @@ NV110FP_NV12[] = {
>            };
>            #else
>            -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
>            ipa pass $r2 a[0x7c] 0x0 0x0 0x1
>            mufu rcp $r2 $r2
>            ipa $r0 a[0x80] $r2 0x0 0x1
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1)
>            ipa $r1 a[0x84] $r2 0x0 0x1
>            tex nodep $r4 $r0 0x0 0x0 t2d 0x8
>            tex nodep $r0 $r0 0x0 0x1 t2d 0xc
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf) (st 0x6 wt 0x1) (st 0x6)
>            depbar le 0x5 0x1 0x1
>            fmul ftz $r5 $r4 c0[0x0]
>            fadd ftz $r3 $r5 c0[0x4]
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x6) (st 0x6) (st 0xf)
>            fadd ftz $r4 $r5 c0[0x8]
>            fadd ftz $r5 $r5 c0[0xc]
>            depbar le 0x5 0x0 0x0
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x6 wt 0x2) (st 0x1) (st 0x1)
>            ffma ftz $r3 $r0 c0[0x10] $r3
>            ffma ftz $r4 $r0 c0[0x14] $r4
>            ffma ftz $r5 $r0 c0[0x18] $r5
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0x1) (st 0x1) (st 0x6)
>            ffma ftz $r0 $r1 c0[0x1c] $r3
>            ffma ftz $r2 $r1 c0[0x24] $r5
>            ffma ftz $r1 $r1 c0[0x20] $r4
>         -sched (st 0x0) (st 0x0) (st 0x0)
>         +sched (st 0xf) (st 0x0) (st 0x0)
>            exit
>            #endif
>         diff --git a/src/shader/videonv110.fpc b/src/shader/videonv110.fpc
>         index 31d745a..8fbc246 100644
>         --- a/src/shader/videonv110.fpc
>         +++ b/src/shader/videonv110.fpc
>         @@ -1,52 +1,52 @@
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1a0070f,
>         +0x003c3c01,
>            0xcff7ff02,
>            0xe003ff87,
>            0x00470202,
>            0x50800000,
>            0x0027ff00,
>            0xe043ff88,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xe1e0072f,
>         +0x001cbc03,
>            0x4027ff01,
>            0xe043ff88,
>            0x2ff70004,
>            0xc03a0004,
>            0x2ff70000,
>            0xc03a0016,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfcc007ef,
>         +0x001f9801,
>            0x34170001,
>            0xf0f00000,
>            0x00070405,
>            0x4c681000,
>            0x00170503,
>            0x4c581000,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfcc007e6,
>         +0x001fbc00,
>            0x00270504,
>            0x4c581000,
>            0x00370505,
>            0x4c581000,
>            0x34070000,
>            0xf0f00000,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfc2017e6,
>         +0x001f8400,
>            0x00470003,
>            0x49a00180,
>            0x00570004,
>            0x49a00200,
>            0x00670005,
>            0x49a00280,
>         -0xfc0007e0,
>         -0x001f8000,
>         +0xfc2007e1,
>         +0x001f9800,
>            0x00770100,
>            0x49a00180,
>            0x00970102,
>            0x49a00280,
>            0x00870101,
>            0x49a00200,
>         -0xfc0007e0,
>         +0xfc0007ef,
>            0x001f8000,
>            0x0007000f,
>            0xe3000000,
> 
> As for your other comments, I have made the suggested changes.
> 
> Thanks for your review!
> 
> Cheers,
> Aaryaman
On Thu, Jun 8, 2017 at 5:01 AM, Samuel Pitoiset <samuel.pitoiset@gmail.com>
wrote:

>
>
> On 06/07/2017 06:58 PM, Aaryaman Vasishta wrote:
>
>>
>>
>> On Tue, Jun 6, 2017 at 7:15 AM, Samuel Pitoiset <
>> samuel.pitoiset@gmail.com <mailto:samuel.pitoiset@gmail.com>> wrote:
>>
>>     Nice work!
>>
>>     See my comments below, and double-check if some of them can be
>>     applied to the shaders I didn't review yet.
>>
>>     I recommend that you test your work, because if one sched code is
>>     wrong, you are likely to kill your card and have to reboot your box. :-)
>>
>>
>>     On 06/03/2017 04:16 PM, Aaryaman Vasishta wrote:
>>
>>         v2: Add missing delays
>>
>>         This patch adds proper delays to maxwell exa shaders.
>>         rendercheck tests
>>         seem consistent with/without this patch. I haven't extensively
>>         tested
>>         them though.
>>
>>         Trello:
>>         https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays
>>         <https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays>
>>
>>         Signed-off-by: Aaryaman Vasishta <jem456.vasishta@gmail.com
>>         <mailto:jem456.vasishta@gmail.com>>
>>
>>         ---
>>            src/shader/exac8nv110.fp  | 10 +++++-----
>>            src/shader/exac8nv110.fpc | 18 +++++++++---------
>>            src/shader/exacanv110.fp  | 10 +++++-----
>>            src/shader/exacanv110.fpc | 18 +++++++++---------
>>            src/shader/exacmnv110.fp  | 10 +++++-----
>>            src/shader/exacmnv110.fpc | 18 +++++++++---------
>>            src/shader/exas8nv110.fp  |  6 +++---
>>            src/shader/exas8nv110.fpc | 12 ++++++------
>>            src/shader/exasanv110.fp  | 10 +++++-----
>>            src/shader/exasanv110.fpc | 18 +++++++++---------
>>            src/shader/exascnv110.fp  |  6 +++---
>>            src/shader/exascnv110.fpc | 10 +++++-----
>>            src/shader/videonv110.fp  | 14 +++++++-------
>>            src/shader/videonv110.fpc | 26 +++++++++++++-------------
>>            14 files changed, 93 insertions(+), 93 deletions(-)
>>
>>         diff --git a/src/shader/exac8nv110.fp b/src/shader/exac8nv110.fp
>>         index ce78036..1c4a4f1 100644
>>         --- a/src/shader/exac8nv110.fp
>>         +++ b/src/shader/exac8nv110.fp
>>         @@ -25,23 +25,23 @@ NV110FP_Composite_A8[] = {
>>            };
>>            #else
>>            -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt
>> 0x1)
>>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>            mufu rcp $r0 $r0
>>            ipa $r3 a[0x94] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x3) (st 0xf wr
>>         0x1 wt 0x2)
>>            ipa $r2 a[0x90] $r0 0x0 0x1
>>            tex nodep $r1 $r2 0x0 0x1 t2d 0x8
>>            ipa $r3 a[0x84] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>>            ipa $r2 a[0x80] $r0 0x0 0x1
>>            tex nodep $r0 $r2 0x0 0x0 t2d 0x8
>>
>>
>>     Out of curiosity, why didn't you add a read-dep-bar on $r2:$r3 here?
>>
>> Missed it, thanks for pointing it out.
>>
>
> You don't have to. 'tex' reads two sources ($r2:$r3) and writes into $r0,
> but as $r2:$r3 are NOT re-used before $r0 is read, you can assume that $r0
> will be ready and don't need any read-dep-bar.

Ah, so $r2:$r3, which are written by the two 'ipa' instructions above it,
have already been waited on by this 'tex', and both of those 'ipa'
instructions read $r0, so we can safely assume that, since the two 'ipa'
instructions have already been waited on, $r0 will be ready?

>




>
>
>>
>>
>>            depbar le 0x5 0x0 0x0
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x6 wt 0x3) (st 0x6) (st 0x1)
>>            fmul ftz $r3 $r0 $r1
>>            mov $r2 $r3 0xf
>>
>>
>>     You can stall for only one cycle here, but the 6 cycles on fmul is
>>     needed.
>>
>>            mov $r1 $r3 0xf
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x6) (st 0xf) (st 0x0)
>>            mov $r0 $r3 0xf
>>
>>
>>     Same here.
>>
>>
>>            exit
>>            #endif
>>         diff --git a/src/shader/exac8nv110.fpc b/src/shader/exac8nv110.fpc
>>         index 4aa1368..46943b7 100644
>>         --- a/src/shader/exac8nv110.fpc
>>         +++ b/src/shader/exac8nv110.fpc
>>         @@ -1,36 +1,36 @@
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1a0070f,
>>         +0x003c3c01,
>>            0xcff7ff00,
>>            0xe003ff87,
>>            0x00470000,
>>            0x50800000,
>>            0x4007ff03,
>>            0xe043ff89,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0x21e0072f,
>>         +0x005cbc03,
>>            0x0007ff02,
>>            0xe043ff89,
>>            0x2ff70201,
>>            0xc03a0014,
>>            0x4007ff03,
>>            0xe043ff88,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe5e0074f,
>>         +0x001fbc06,
>>            0x0007ff02,
>>            0xe043ff88,
>>            0x2ff70200,
>>            0xc03a0004,
>>            0x34070000,
>>            0xf0f00000,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfcc01fe6,
>>         +0x001f8400,
>>            0x00170003,
>>            0x5c681000,
>>            0x00370002,
>>            0x5c980780,
>>            0x00370001,
>>            0x5c980780,
>>         -0xfc0007e0,
>>         +0xfde007e6,
>>            0x001f8000,
>>            0x00370000,
>>            0x5c980780,
>>         diff --git a/src/shader/exacanv110.fp b/src/shader/exacanv110.fp
>>         index a70d5c5..d7c2867 100644
>>         --- a/src/shader/exacanv110.fp
>>         +++ b/src/shader/exacanv110.fp
>>         @@ -25,23 +25,23 @@ NV110FP_CAComposite[] = {
>>            };
>>            #else
>>            -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt
>> 0x1)
>>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>            mufu rcp $r0 $r0
>>            ipa $r3 a[0x94] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd
>> 0x2)
>>            ipa $r2 a[0x90] $r0 0x0 0x1
>>            tex nodep $r4 $r2 0x0 0x1 t2d 0xf
>>
>>
>>     Please add a read-dep-bar and wait for it on the first fmul, because
>>     $r2:$r3 are re-used before $r4. Should be safer.
>>
>>
>>            ipa $r1 a[0x84] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>>            ipa $r0 a[0x80] $r0 0x0 0x1
>>            tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>>            depbar le 0x5 0x0 0x0
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>>            fmul ftz $r3 $r3 $r7
>>
>>
>>     Why are you waiting on all barriers? Only $r3 is needed here.
>>
>> After adding a read-dep-bar and waiting on that over here, I wasn't able
>> to pass the same number of `rendercheck -f a8r8g8b8` tests as before this
>> patch.  After a little trial and error I discovered that wt 0xc fixes it,
>> which means that bar 3 and 4 were being used in this fmul somehow (assuming
>> bars start from 1), which is odd because this fmul only uses $r3 and $r7,
>> and I think it should wait on the read-dep-bar set on "tex nodep $r4 $r2
>> 0x0 0x1 t2d 0xf" (I could be wrong though). I'm kinda stumped on what's
>> going on within this fmul that's causing this behavior.
>>
>
> Because you are missing a read-dep-bar on the first 'tex' in this shader.
> Presumably, if you add one, you no longer need to wait for all bars.

I made some changes, which you can check out here:
https://hastebin.com/cazirimeva.bash. Here are my comments on them:
In ' sched (st 0x1 wt 0x2) (st 0x1) (st 0x1 wt 0x4)' I'm facing the same
issue, even after waiting on the read-dep bar set on the first 'tex'. I
replaced 'wt 0x2' with 'wt 0x4' so that it waits on bar 3 instead of bar 2,
which was set as the read-dep bar, and the above issue goes away. This seems
odd to me because bar 3 is set for $r0 on the second 'tex', and that fmul
doesn't seem to use it anywhere.

Cheers,
Aaryaman

>
>
> Samuel.
>
>
>>
>>
>>            fmul ftz $r2 $r2 $r6
>>            fmul ftz $r1 $r1 $r5
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x1 wt 0x3) (st 0xf) (st 0x0)
>>            fmul ftz $r0 $r0 $r4
>>            exit
>>            #endif
>>         diff --git a/src/shader/exacanv110.fpc b/src/shader/exacanv110.fpc
>>         index 7c0ca5e..9cad139 100644
>>         --- a/src/shader/exacanv110.fpc
>>         +++ b/src/shader/exacanv110.fpc
>>         @@ -1,36 +1,36 @@
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1a0070f,
>>         +0x003c3c01,
>>            0xcff7ff00,
>>            0xe003ff87,
>>            0x00470000,
>>            0x50800000,
>>            0x4007ff03,
>>            0xe043ff89,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1e0072f,
>>         +0x0008bc03,
>>            0x0007ff02,
>>            0xe043ff89,
>>            0xaff70204,
>>            0xc03a0017,
>>            0x4007ff01,
>>            0xe043ff88,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe5e0274f,
>>         +0x001fbc06,
>>            0x0007ff00,
>>            0xe043ff88,
>>            0xaff70000,
>>            0xc03a0007,
>>            0x34070000,
>>            0xf0f00000,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfc21ffe1,
>>         +0x001f8400,
>>            0x00770303,
>>            0x5c681000,
>>            0x00670202,
>>            0x5c681000,
>>            0x00570101,
>>            0x5c681000,
>>         -0xfc0007e0,
>>         +0xfde01fe1,
>>            0x001f8000,
>>            0x00470000,
>>            0x5c681000,
>>         diff --git a/src/shader/exacmnv110.fp b/src/shader/exacmnv110.fp
>>         index fe5c294..d717138 100644
>>         --- a/src/shader/exacmnv110.fp
>>         +++ b/src/shader/exacmnv110.fp
>>         @@ -25,23 +25,23 @@ NV110FP_Composite[] = {
>>            };
>>            #else
>>            -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt
>> 0x1)
>>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>            mufu rcp $r0 $r0
>>            ipa $r3 a[0x94] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd
>> 0x2)
>>            ipa $r2 a[0x90] $r0 0x0 0x1
>>            tex nodep $r4 $r2 0x0 0x1 t2d 0x8
>>            ipa $r1 a[0x84] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>>            ipa $r0 a[0x80] $r0 0x0 0x1
>>            tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>>            depbar le 0x5 0x0 0x0
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>>            fmul ftz $r3 $r3 $r4
>>            fmul ftz $r2 $r2 $r4
>>            fmul ftz $r1 $r1 $r4
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x6 wt 0x2) (st 0xf) (st 0x0)
>>            fmul ftz $r0 $r0 $r4
>>            exit
>>            #endif
>>         diff --git a/src/shader/exacmnv110.fpc b/src/shader/exacmnv110.fpc
>>         index 9d62c1a..c150875 100644
>>         --- a/src/shader/exacmnv110.fpc
>>         +++ b/src/shader/exacmnv110.fpc
>>         @@ -1,36 +1,36 @@
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1a0070f,
>>         +0x003c3c01,
>>            0xcff7ff00,
>>            0xe003ff87,
>>            0x00470000,
>>            0x50800000,
>>            0x4007ff03,
>>            0xe043ff89,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1e0072f,
>>         +0x0008bc03,
>>            0x0007ff02,
>>            0xe043ff89,
>>            0x2ff70204,
>>            0xc03a0014,
>>            0x4007ff01,
>>            0xe043ff88,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe5e0274f,
>>         +0x001fbc06,
>>            0x0007ff00,
>>            0xe043ff88,
>>            0xaff70000,
>>            0xc03a0007,
>>            0x34070000,
>>            0xf0f00000,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfc21ffe1,
>>         +0x001f8400,
>>            0x00470303,
>>            0x5c681000,
>>            0x00470202,
>>            0x5c681000,
>>            0x00470101,
>>            0x5c681000,
>>         -0xfc0007e0,
>>         +0xfde017e6,
>>            0x001f8000,
>>            0x00470000,
>>            0x5c681000,
>>         diff --git a/src/shader/exas8nv110.fp b/src/shader/exas8nv110.fp
>>         index 4fe2e19..a555beb 100644
>>         --- a/src/shader/exas8nv110.fp
>>         +++ b/src/shader/exas8nv110.fp
>>         @@ -25,15 +25,15 @@ NV110FP_Source_A8[] = {
>>            };
>>            #else
>>            -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt
>> 0x1)
>>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>            mufu rcp $r0 $r0
>>            ipa $r1 a[0x84] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf)
>>            ipa $r0 a[0x80] $r0 0x0 0x1
>>            tex nodep $r0 $r0 0x0 0x0 t2d 0x8
>>            depbar le 0x5 0x0 0x0
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x1 wt 0x1) (st 0x1) (st 0x1)
>>            mov $r3 $r0 0xf
>>            mov $r2 $r0 0xf
>>            mov $r1 $r0 0xf
>>
>>
>>     This one looks good!
>>
>>
>>         diff --git a/src/shader/exas8nv110.fpc b/src/shader/exas8nv110.fpc
>>         index 1181c41..e58d168 100644
>>         --- a/src/shader/exas8nv110.fpc
>>         +++ b/src/shader/exas8nv110.fpc
>>         @@ -1,21 +1,21 @@
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1a0070f,
>>         +0x003c3c01,
>>            0xcff7ff00,
>>            0xe003ff87,
>>            0x00470000,
>>            0x50800000,
>>            0x4007ff01,
>>            0xe043ff88,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1e0072f,
>>         +0x001fbc03,
>>            0x0007ff00,
>>            0xe043ff88,
>>            0x2ff70000,
>>            0xc03a0004,
>>            0x34070000,
>>            0xf0f00000,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfc200fe1,
>>         +0x001f8400,
>>            0x00070003,
>>            0x5c980780,
>>            0x00070002,
>>         diff --git a/src/shader/exasanv110.fp b/src/shader/exasanv110.fp
>>         index 61374a6..ad7ca36 100644
>>         --- a/src/shader/exasanv110.fp
>>         +++ b/src/shader/exasanv110.fp
>>         @@ -25,23 +25,23 @@ NV110FP_CACompositeSrcAlpha[] = {
>>            };
>>            #else
>>            -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt
>> 0x1)
>>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>            mufu rcp $r0 $r0
>>            ipa $r3 a[0x84] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd
>> 0x2)
>>            ipa $r2 a[0x80] $r0 0x0 0x1
>>            tex nodep $r4 $r2 0x0 0x0 t2d 0x8
>>            ipa $r1 a[0x94] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>>            ipa $r0 a[0x90] $r0 0x0 0x1
>>            tex nodep $r0 $r0 0x0 0x1 t2d 0xf
>>            depbar le 0x5 0x0 0x0
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>>            fmul ftz $r3 $r3 $r4
>>            fmul ftz $r2 $r2 $r4
>>            fmul ftz $r1 $r1 $r4
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x1 wt 0x2) (st 0xf) (st 0x0)
>>            fmul ftz $r0 $r0 $r4
>>            exit
>>            #endif
>>         diff --git a/src/shader/exasanv110.fpc b/src/shader/exasanv110.fpc
>>         index 5516a03..1485f11 100644
>>         --- a/src/shader/exasanv110.fpc
>>         +++ b/src/shader/exasanv110.fpc
>>         @@ -1,36 +1,36 @@
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1a0070f,
>>         +0x003c3c01,
>>            0xcff7ff00,
>>            0xe003ff87,
>>            0x00470000,
>>            0x50800000,
>>            0x4007ff03,
>>            0xe043ff88,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1e0072f,
>>         +0x0008bc03,
>>            0x0007ff02,
>>            0xe043ff88,
>>            0x2ff70204,
>>            0xc03a0004,
>>            0x4007ff01,
>>            0xe043ff89,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe5e0274f,
>>         +0x001fbc06,
>>            0x0007ff00,
>>            0xe043ff89,
>>            0xaff70000,
>>            0xc03a0017,
>>            0x34070000,
>>            0xf0f00000,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfc21ffe1,
>>         +0x001f8400,
>>            0x00470303,
>>            0x5c681000,
>>            0x00470202,
>>            0x5c681000,
>>            0x00470101,
>>            0x5c681000,
>>         -0xfc0007e0,
>>         +0xfde017e1,
>>            0x001f8000,
>>            0x00470000,
>>            0x5c681000,
>>         diff --git a/src/shader/exascnv110.fp b/src/shader/exascnv110.fp
>>         index 90bbb55..86e14e8 100644
>>         --- a/src/shader/exascnv110.fp
>>         +++ b/src/shader/exascnv110.fp
>>         @@ -25,14 +25,14 @@ NV110FP_Source[] = {
>>            };
>>            #else
>>            -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt
>> 0x1)
>>            ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>>            mufu rcp $r0 $r0
>>            ipa $r1 a[0x84] $r0 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
>>            ipa $r0 a[0x80] $r0 0x0 0x1
>>            tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>>            depbar le 0x5 0x0 0x0
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf) (st 0x0) (st 0x0)
>>
>>
>>     Looks good.
>>
>>
>>            exit
>>            #endif
>>         diff --git a/src/shader/exascnv110.fpc b/src/shader/exascnv110.fpc
>>         index 2dba15d..1fef5d2 100644
>>         --- a/src/shader/exascnv110.fpc
>>         +++ b/src/shader/exascnv110.fpc
>>         @@ -1,20 +1,20 @@
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1a0070f,
>>         +0x003c3c01,
>>            0xcff7ff00,
>>            0xe003ff87,
>>            0x00470000,
>>            0x50800000,
>>            0x4007ff01,
>>            0xe043ff88,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfde0072f,
>>         +0x001fbc03,
>>            0x0007ff00,
>>            0xe043ff88,
>>            0xaff70000,
>>            0xc03a0007,
>>            0x34070000,
>>            0xf0f00000,
>>         -0xfc0007e0,
>>         +0xfc0007ef,
>>            0x001f8000,
>>            0x0007000f,
>>            0xe3000000,
>>         diff --git a/src/shader/videonv110.fp b/src/shader/videonv110.fp
>>         index 2728311..dd3816c 100644
>>         --- a/src/shader/videonv110.fp
>>         +++ b/src/shader/videonv110.fp
>>         @@ -25,30 +25,30 @@ NV110FP_NV12[] = {
>>            };
>>            #else
>>            -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt
>> 0x1)
>>            ipa pass $r2 a[0x7c] 0x0 0x0 0x1
>>            mufu rcp $r2 $r2
>>            ipa $r0 a[0x80] $r2 0x0 0x1
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1)
>>            ipa $r1 a[0x84] $r2 0x0 0x1
>>            tex nodep $r4 $r0 0x0 0x0 t2d 0x8
>>            tex nodep $r0 $r0 0x0 0x1 t2d 0xc
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf) (st 0x6 wt 0x1) (st 0x6)
>>            depbar le 0x5 0x1 0x1
>>            fmul ftz $r5 $r4 c0[0x0]
>>            fadd ftz $r3 $r5 c0[0x4]
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x6) (st 0x6) (st 0xf)
>>            fadd ftz $r4 $r5 c0[0x8]
>>            fadd ftz $r5 $r5 c0[0xc]
>>            depbar le 0x5 0x0 0x0
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x6 wt 0x2) (st 0x1) (st 0x1)
>>            ffma ftz $r3 $r0 c0[0x10] $r3
>>            ffma ftz $r4 $r0 c0[0x14] $r4
>>            ffma ftz $r5 $r0 c0[0x18] $r5
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0x1) (st 0x1) (st 0x6)
>>            ffma ftz $r0 $r1 c0[0x1c] $r3
>>            ffma ftz $r2 $r1 c0[0x24] $r5
>>            ffma ftz $r1 $r1 c0[0x20] $r4
>>         -sched (st 0x0) (st 0x0) (st 0x0)
>>         +sched (st 0xf) (st 0x0) (st 0x0)
>>            exit
>>            #endif
>>         diff --git a/src/shader/videonv110.fpc b/src/shader/videonv110.fpc
>>         index 31d745a..8fbc246 100644
>>         --- a/src/shader/videonv110.fpc
>>         +++ b/src/shader/videonv110.fpc
>>         @@ -1,52 +1,52 @@
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1a0070f,
>>         +0x003c3c01,
>>            0xcff7ff02,
>>            0xe003ff87,
>>            0x00470202,
>>            0x50800000,
>>            0x0027ff00,
>>            0xe043ff88,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xe1e0072f,
>>         +0x001cbc03,
>>            0x4027ff01,
>>            0xe043ff88,
>>            0x2ff70004,
>>            0xc03a0004,
>>            0x2ff70000,
>>            0xc03a0016,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfcc007ef,
>>         +0x001f9801,
>>            0x34170001,
>>            0xf0f00000,
>>            0x00070405,
>>            0x4c681000,
>>            0x00170503,
>>            0x4c581000,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfcc007e6,
>>         +0x001fbc00,
>>            0x00270504,
>>            0x4c581000,
>>            0x00370505,
>>            0x4c581000,
>>            0x34070000,
>>            0xf0f00000,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfc2017e6,
>>         +0x001f8400,
>>            0x00470003,
>>            0x49a00180,
>>            0x00570004,
>>            0x49a00200,
>>            0x00670005,
>>            0x49a00280,
>>         -0xfc0007e0,
>>         -0x001f8000,
>>         +0xfc2007e1,
>>         +0x001f9800,
>>            0x00770100,
>>            0x49a00180,
>>            0x00970102,
>>            0x49a00280,
>>            0x00870101,
>>            0x49a00200,
>>         -0xfc0007e0,
>>         +0xfc0007ef,
>>            0x001f8000,
>>            0x0007000f,
>>            0xe3000000,
>>
>> As for your other comments, I have made the suggested changes.
>>
>> Thanks for your review!
>>
>> Cheers,
>> Aaryaman
>>
>
On 06/08/2017 05:19 PM, Aaryaman Vasishta wrote:
> 
> 
> On Thu, Jun 8, 2017 at 5:01 AM, Samuel Pitoiset 
> <samuel.pitoiset@gmail.com <mailto:samuel.pitoiset@gmail.com>> wrote:
> 
> 
> 
>     On 06/07/2017 06:58 PM, Aaryaman Vasishta wrote:
> 
> 
> 
>         On Tue, Jun 6, 2017 at 7:15 AM, Samuel Pitoiset
>         <samuel.pitoiset@gmail.com <mailto:samuel.pitoiset@gmail.com>
>         <mailto:samuel.pitoiset@gmail.com
>         <mailto:samuel.pitoiset@gmail.com>>> wrote:
> 
>              Nice work!
> 
>              See my comments below, and double-check if some of them can be
>              applied to the shaders I didn't review yet.
> 
>              I recommend that you test your work, because if one sched
>              code is wrong, you are likely to kill your card and have to
>              reboot your box. :-)
> 
> 
>              On 06/03/2017 04:16 PM, Aaryaman Vasishta wrote:
> 
>                  v2: Add missing delays
> 
>                  This patch adds proper delays to maxwell exa shaders.
>                  rendercheck tests
>                  seem consistent with/without this patch. I haven't
>         extensively
>                  tested
>                  them though.
> 
>                  Trello:
>         https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays
>         <https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays>
>                 
>         <https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays
>         <https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays>>
> 
>                  Signed-off-by: Aaryaman Vasishta
>         <jem456.vasishta@gmail.com <mailto:jem456.vasishta@gmail.com>
>                  <mailto:jem456.vasishta@gmail.com
>         <mailto:jem456.vasishta@gmail.com>>>
> 
>                  ---
>                     src/shader/exac8nv110.fp  | 10 +++++-----
>                     src/shader/exac8nv110.fpc | 18 +++++++++---------
>                     src/shader/exacanv110.fp  | 10 +++++-----
>                     src/shader/exacanv110.fpc | 18 +++++++++---------
>                     src/shader/exacmnv110.fp  | 10 +++++-----
>                     src/shader/exacmnv110.fpc | 18 +++++++++---------
>                     src/shader/exas8nv110.fp  |  6 +++---
>                     src/shader/exas8nv110.fpc | 12 ++++++------
>                     src/shader/exasanv110.fp  | 10 +++++-----
>                     src/shader/exasanv110.fpc | 18 +++++++++---------
>                     src/shader/exascnv110.fp  |  6 +++---
>                     src/shader/exascnv110.fpc | 10 +++++-----
>                     src/shader/videonv110.fp  | 14 +++++++-------
>                     src/shader/videonv110.fpc | 26
>         +++++++++++++-------------
>                     14 files changed, 93 insertions(+), 93 deletions(-)
> 
>                  diff --git a/src/shader/exac8nv110.fp
>         b/src/shader/exac8nv110.fp
>                  index ce78036..1c4a4f1 100644
>                  --- a/src/shader/exac8nv110.fp
>                  +++ b/src/shader/exac8nv110.fp
>                  @@ -25,23 +25,23 @@ NV110FP_Composite_A8[] = {
>                     };
>                     #else
>                     -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf
>         wr 0x0 wt 0x1)
>                     ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>                     mufu rcp $r0 $r0
>                     ipa $r3 a[0x94] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x3)
>         (st 0xf wr
>                  0x1 wt 0x2)
>                     ipa $r2 a[0x90] $r0 0x0 0x1
>                     tex nodep $r1 $r2 0x0 0x1 t2d 0x8
>                     ipa $r3 a[0x84] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf)
>                     ipa $r2 a[0x80] $r0 0x0 0x1
>                     tex nodep $r0 $r2 0x0 0x0 t2d 0x8
> 
> 
>              Out of curiosity, why didn't you add a read-dep-bar on
>              $r2:$r3 here?
> 
>         Missed it, thanks for pointing it out.
> 
> 
>     You don't have to. 'tex' reads two sources ($r2:$r3) and writes into
>     $r0, but as $r2:$r3 are NOT re-used before $r0 is read, you can
>     assume that $r0 will be ready and don't need any read-dep-bar.
> 
> Ah, so r2:r3, which are written on by the two 'ipa' above it, have 
> already been waited on in this tex, and both of them read $r0 so we can 
> safely assume that since the two 'ipa' instructions are already waited 
> on, $r0 will be ready?

No.

It's because the next 'fmul' waits for $r0 (the output of 'tex'). So, if $r0 
is "ready", you can assume that $r2:$r3 can be re-used. This is a 
particular situation that doesn't require any read-dep-bars; you can add 
them if you want, but they would be useless.

> 
> 
> 
> 
> 
> 
>                     depbar le 0x5 0x0 0x0
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x6 wt 0x3) (st 0x6) (st 0x1)
>                     fmul ftz $r3 $r0 $r1
>                     mov $r2 $r3 0xf
> 
> 
>              You can stall for only one cycle here, but the 6 cycles on
>         fmul is
>              needed.
> 
>                     mov $r1 $r3 0xf
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x6) (st 0xf) (st 0x0)
>                     mov $r0 $r3 0xf
> 
> 
>              Same here.
> 
> 
>                     exit
>                     #endif
>                  diff --git a/src/shader/exac8nv110.fpc
>         b/src/shader/exac8nv110.fpc
>                  index 4aa1368..46943b7 100644
>                  --- a/src/shader/exac8nv110.fpc
>                  +++ b/src/shader/exac8nv110.fpc
>                  @@ -1,36 +1,36 @@
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1a0070f,
>                  +0x003c3c01,
>                     0xcff7ff00,
>                     0xe003ff87,
>                     0x00470000,
>                     0x50800000,
>                     0x4007ff03,
>                     0xe043ff89,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0x21e0072f,
>                  +0x005cbc03,
>                     0x0007ff02,
>                     0xe043ff89,
>                     0x2ff70201,
>                     0xc03a0014,
>                     0x4007ff03,
>                     0xe043ff88,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe5e0074f,
>                  +0x001fbc06,
>                     0x0007ff02,
>                     0xe043ff88,
>                     0x2ff70200,
>                     0xc03a0004,
>                     0x34070000,
>                     0xf0f00000,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfcc01fe6,
>                  +0x001f8400,
>                     0x00170003,
>                     0x5c681000,
>                     0x00370002,
>                     0x5c980780,
>                     0x00370001,
>                     0x5c980780,
>                  -0xfc0007e0,
>                  +0xfde007e6,
>                     0x001f8000,
>                     0x00370000,
>                     0x5c980780,
>                  diff --git a/src/shader/exacanv110.fp
>         b/src/shader/exacanv110.fp
>                  index a70d5c5..d7c2867 100644
>                  --- a/src/shader/exacanv110.fp
>                  +++ b/src/shader/exacanv110.fp
>                  @@ -25,23 +25,23 @@ NV110FP_CAComposite[] = {
>                     };
>                     #else
>                     -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf
>         wr 0x0 wt 0x1)
>                     ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>                     mufu rcp $r0 $r0
>                     ipa $r3 a[0x94] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf
>         wr 0x1 rd 0x2)
>                     ipa $r2 a[0x90] $r0 0x0 0x1
>                     tex nodep $r4 $r2 0x0 0x1 t2d 0xf
> 
> 
>              Please add a read-dep-bar and wait for it on the first fmul
>         because
>              $r2:$r3 are re-used before $r4. That should be safer.
> 
> 
>                     ipa $r1 a[0x84] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6)
>         (st 0xf)
>                     ipa $r0 a[0x80] $r0 0x0 0x1
>                     tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>                     depbar le 0x5 0x0 0x0
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>                     fmul ftz $r3 $r3 $r7
> 
> 
>              Why are you waiting on all barriers? Only $r3 is needed here.
> 
>         After adding a read-dep-bar and waiting on that over here, I
>         wasn't able to pass the same number of `rendercheck -f a8r8g8b8`
>         tests as before this patch.  After a little trial and error I
>         discovered that wt 0xc fixes it, which means that bar 3 and 4
>         were being used in this fmul somehow (assuming bars start from
>         1), which is odd because this fmul only uses $r3 and $r7, and I
>         think it should wait on the read-dep-bar set on "tex nodep $r4
>         $r2 0x0 0x1 t2d 0xf" (I could be wrong though). I'm kinda
>         stumped on what's going on within this fmul that's causing this
>         behavior.
> 
> 
>     Because you are missing a read-dep-bar on the first 'tex' in this
>     shader. Presumably, if you add one, you no longer need to wait for
>     all bars.
> 
> I made some changes which you can check out here 
> https://hastebin.com/cazirimeva.bash 
> <https://hastebin.com/cazirimeva.bash>. Here are my comments on the same:
> In ' sched (st 0x1 wt 0x2) (st 0x1) (st 0x1 wt 0x4)' I'm facing the same 
> issue, even after waiting on the read-dep bar set on the first 'tex'. I 
> replaced 'wt 0x2' with 'wt 0x4' so it waits on bar 3 instead of bar 2 
> which was set as the read-dep bar, and the above issue goes away. It's 
> kinda odd for me because bar 3 is set for $r0 on the second 'tex' and 
> that fmul doesn't seem to use it anywhere.

Mmmh, that looks weird. Anyway, it will be easier to debug with an updated 
version. :)

Waiting for the v3 now.

Thanks,
Samuel.

> 
> Cheers,
> Aaryaman
> 
> 
> 
>     Samuel.
> 
> 
> 
> 
>                     fmul ftz $r2 $r2 $r6
>                     fmul ftz $r1 $r1 $r5
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x1 wt 0x3) (st 0xf) (st 0x0)
>                     fmul ftz $r0 $r0 $r4
>                     exit
>                     #endif
>                  diff --git a/src/shader/exacanv110.fpc
>         b/src/shader/exacanv110.fpc
>                  index 7c0ca5e..9cad139 100644
>                  --- a/src/shader/exacanv110.fpc
>                  +++ b/src/shader/exacanv110.fpc
>                  @@ -1,36 +1,36 @@
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1a0070f,
>                  +0x003c3c01,
>                     0xcff7ff00,
>                     0xe003ff87,
>                     0x00470000,
>                     0x50800000,
>                     0x4007ff03,
>                     0xe043ff89,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1e0072f,
>                  +0x0008bc03,
>                     0x0007ff02,
>                     0xe043ff89,
>                     0xaff70204,
>                     0xc03a0017,
>                     0x4007ff01,
>                     0xe043ff88,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe5e0274f,
>                  +0x001fbc06,
>                     0x0007ff00,
>                     0xe043ff88,
>                     0xaff70000,
>                     0xc03a0007,
>                     0x34070000,
>                     0xf0f00000,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfc21ffe1,
>                  +0x001f8400,
>                     0x00770303,
>                     0x5c681000,
>                     0x00670202,
>                     0x5c681000,
>                     0x00570101,
>                     0x5c681000,
>                  -0xfc0007e0,
>                  +0xfde01fe1,
>                     0x001f8000,
>                     0x00470000,
>                     0x5c681000,
>                  diff --git a/src/shader/exacmnv110.fp
>         b/src/shader/exacmnv110.fp
>                  index fe5c294..d717138 100644
>                  --- a/src/shader/exacmnv110.fp
>                  +++ b/src/shader/exacmnv110.fp
>                  @@ -25,23 +25,23 @@ NV110FP_Composite[] = {
>                     };
>                     #else
>                     -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf
>         wr 0x0 wt 0x1)
>                     ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>                     mufu rcp $r0 $r0
>                     ipa $r3 a[0x94] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf
>         wr 0x1 rd 0x2)
>                     ipa $r2 a[0x90] $r0 0x0 0x1
>                     tex nodep $r4 $r2 0x0 0x1 t2d 0x8
>                     ipa $r1 a[0x84] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6)
>         (st 0xf)
>                     ipa $r0 a[0x80] $r0 0x0 0x1
>                     tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>                     depbar le 0x5 0x0 0x0
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>                     fmul ftz $r3 $r3 $r4
>                     fmul ftz $r2 $r2 $r4
>                     fmul ftz $r1 $r1 $r4
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x6 wt 0x2) (st 0xf) (st 0x0)
>                     fmul ftz $r0 $r0 $r4
>                     exit
>                     #endif
>                  diff --git a/src/shader/exacmnv110.fpc
>         b/src/shader/exacmnv110.fpc
>                  index 9d62c1a..c150875 100644
>                  --- a/src/shader/exacmnv110.fpc
>                  +++ b/src/shader/exacmnv110.fpc
>                  @@ -1,36 +1,36 @@
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1a0070f,
>                  +0x003c3c01,
>                     0xcff7ff00,
>                     0xe003ff87,
>                     0x00470000,
>                     0x50800000,
>                     0x4007ff03,
>                     0xe043ff89,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1e0072f,
>                  +0x0008bc03,
>                     0x0007ff02,
>                     0xe043ff89,
>                     0x2ff70204,
>                     0xc03a0014,
>                     0x4007ff01,
>                     0xe043ff88,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe5e0274f,
>                  +0x001fbc06,
>                     0x0007ff00,
>                     0xe043ff88,
>                     0xaff70000,
>                     0xc03a0007,
>                     0x34070000,
>                     0xf0f00000,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfc21ffe1,
>                  +0x001f8400,
>                     0x00470303,
>                     0x5c681000,
>                     0x00470202,
>                     0x5c681000,
>                     0x00470101,
>                     0x5c681000,
>                  -0xfc0007e0,
>                  +0xfde017e6,
>                     0x001f8000,
>                     0x00470000,
>                     0x5c681000,
>                  diff --git a/src/shader/exas8nv110.fp
>         b/src/shader/exas8nv110.fp
>                  index 4fe2e19..a555beb 100644
>                  --- a/src/shader/exas8nv110.fp
>                  +++ b/src/shader/exas8nv110.fp
>                  @@ -25,15 +25,15 @@ NV110FP_Source_A8[] = {
>                     };
>                     #else
>                     -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf
>         wr 0x0 wt 0x1)
>                     ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>                     mufu rcp $r0 $r0
>                     ipa $r1 a[0x84] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf)
>                     ipa $r0 a[0x80] $r0 0x0 0x1
>                     tex nodep $r0 $r0 0x0 0x0 t2d 0x8
>                     depbar le 0x5 0x0 0x0
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x1 wt 0x1) (st 0x1) (st 0x1)
>                     mov $r3 $r0 0xf
>                     mov $r2 $r0 0xf
>                     mov $r1 $r0 0xf
> 
> 
>              This one looks good!
> 
> 
>                  diff --git a/src/shader/exas8nv110.fpc
>         b/src/shader/exas8nv110.fpc
>                  index 1181c41..e58d168 100644
>                  --- a/src/shader/exas8nv110.fpc
>                  +++ b/src/shader/exas8nv110.fpc
>                  @@ -1,21 +1,21 @@
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1a0070f,
>                  +0x003c3c01,
>                     0xcff7ff00,
>                     0xe003ff87,
>                     0x00470000,
>                     0x50800000,
>                     0x4007ff01,
>                     0xe043ff88,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1e0072f,
>                  +0x001fbc03,
>                     0x0007ff00,
>                     0xe043ff88,
>                     0x2ff70000,
>                     0xc03a0004,
>                     0x34070000,
>                     0xf0f00000,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfc200fe1,
>                  +0x001f8400,
>                     0x00070003,
>                     0x5c980780,
>                     0x00070002,
>                  diff --git a/src/shader/exasanv110.fp
>         b/src/shader/exasanv110.fp
>                  index 61374a6..ad7ca36 100644
>                  --- a/src/shader/exasanv110.fp
>                  +++ b/src/shader/exasanv110.fp
>                  @@ -25,23 +25,23 @@ NV110FP_CACompositeSrcAlpha[] = {
>                     };
>                     #else
>                     -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf
>         wr 0x0 wt 0x1)
>                     ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>                     mufu rcp $r0 $r0
>                     ipa $r3 a[0x84] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf
>         wr 0x1 rd 0x2)
>                     ipa $r2 a[0x80] $r0 0x0 0x1
>                     tex nodep $r4 $r2 0x0 0x0 t2d 0x8
>                     ipa $r1 a[0x94] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6)
>         (st 0xf)
>                     ipa $r0 a[0x90] $r0 0x0 0x1
>                     tex nodep $r0 $r0 0x0 0x1 t2d 0xf
>                     depbar le 0x5 0x0 0x0
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
>                     fmul ftz $r3 $r3 $r4
>                     fmul ftz $r2 $r2 $r4
>                     fmul ftz $r1 $r1 $r4
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x1 wt 0x2) (st 0xf) (st 0x0)
>                     fmul ftz $r0 $r0 $r4
>                     exit
>                     #endif
>                  diff --git a/src/shader/exasanv110.fpc
>         b/src/shader/exasanv110.fpc
>                  index 5516a03..1485f11 100644
>                  --- a/src/shader/exasanv110.fpc
>                  +++ b/src/shader/exasanv110.fpc
>                  @@ -1,36 +1,36 @@
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1a0070f,
>                  +0x003c3c01,
>                     0xcff7ff00,
>                     0xe003ff87,
>                     0x00470000,
>                     0x50800000,
>                     0x4007ff03,
>                     0xe043ff88,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1e0072f,
>                  +0x0008bc03,
>                     0x0007ff02,
>                     0xe043ff88,
>                     0x2ff70204,
>                     0xc03a0004,
>                     0x4007ff01,
>                     0xe043ff89,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe5e0274f,
>                  +0x001fbc06,
>                     0x0007ff00,
>                     0xe043ff89,
>                     0xaff70000,
>                     0xc03a0017,
>                     0x34070000,
>                     0xf0f00000,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfc21ffe1,
>                  +0x001f8400,
>                     0x00470303,
>                     0x5c681000,
>                     0x00470202,
>                     0x5c681000,
>                     0x00470101,
>                     0x5c681000,
>                  -0xfc0007e0,
>                  +0xfde017e1,
>                     0x001f8000,
>                     0x00470000,
>                     0x5c681000,
>                  diff --git a/src/shader/exascnv110.fp
>         b/src/shader/exascnv110.fp
>                  index 90bbb55..86e14e8 100644
>                  --- a/src/shader/exascnv110.fp
>                  +++ b/src/shader/exascnv110.fp
>                  @@ -25,14 +25,14 @@ NV110FP_Source[] = {
>                     };
>                     #else
>                     -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf
>         wr 0x0 wt 0x1)
>                     ipa pass $r0 a[0x7c] 0x0 0x0 0x1
>                     mufu rcp $r0 $r0
>                     ipa $r1 a[0x84] $r0 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
>                     ipa $r0 a[0x80] $r0 0x0 0x1
>                     tex nodep $r0 $r0 0x0 0x0 t2d 0xf
>                     depbar le 0x5 0x0 0x0
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf) (st 0x0) (st 0x0)
> 
> 
>              Looks good.
> 
> 
>                     exit
>                     #endif
>                  diff --git a/src/shader/exascnv110.fpc
>         b/src/shader/exascnv110.fpc
>                  index 2dba15d..1fef5d2 100644
>                  --- a/src/shader/exascnv110.fpc
>                  +++ b/src/shader/exascnv110.fpc
>                  @@ -1,20 +1,20 @@
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1a0070f,
>                  +0x003c3c01,
>                     0xcff7ff00,
>                     0xe003ff87,
>                     0x00470000,
>                     0x50800000,
>                     0x4007ff01,
>                     0xe043ff88,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfde0072f,
>                  +0x001fbc03,
>                     0x0007ff00,
>                     0xe043ff88,
>                     0xaff70000,
>                     0xc03a0007,
>                     0x34070000,
>                     0xf0f00000,
>                  -0xfc0007e0,
>                  +0xfc0007ef,
>                     0x001f8000,
>                     0x0007000f,
>                     0xe3000000,
>                  diff --git a/src/shader/videonv110.fp
>         b/src/shader/videonv110.fp
>                  index 2728311..dd3816c 100644
>                  --- a/src/shader/videonv110.fp
>                  +++ b/src/shader/videonv110.fp
>                  @@ -25,30 +25,30 @@ NV110FP_NV12[] = {
>                     };
>                     #else
>                     -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf
>         wr 0x0 wt 0x1)
>                     ipa pass $r2 a[0x7c] 0x0 0x0 0x1
>                     mufu rcp $r2 $r2
>                     ipa $r0 a[0x80] $r2 0x0 0x1
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf
>         wr 0x1)
>                     ipa $r1 a[0x84] $r2 0x0 0x1
>                     tex nodep $r4 $r0 0x0 0x0 t2d 0x8
>                     tex nodep $r0 $r0 0x0 0x1 t2d 0xc
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf) (st 0x6 wt 0x1) (st 0x6)
>                     depbar le 0x5 0x1 0x1
>                     fmul ftz $r5 $r4 c0[0x0]
>                     fadd ftz $r3 $r5 c0[0x4]
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x6) (st 0x6) (st 0xf)
>                     fadd ftz $r4 $r5 c0[0x8]
>                     fadd ftz $r5 $r5 c0[0xc]
>                     depbar le 0x5 0x0 0x0
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x6 wt 0x2) (st 0x1) (st 0x1)
>                     ffma ftz $r3 $r0 c0[0x10] $r3
>                     ffma ftz $r4 $r0 c0[0x14] $r4
>                     ffma ftz $r5 $r0 c0[0x18] $r5
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0x1) (st 0x1) (st 0x6)
>                     ffma ftz $r0 $r1 c0[0x1c] $r3
>                     ffma ftz $r2 $r1 c0[0x24] $r5
>                     ffma ftz $r1 $r1 c0[0x20] $r4
>                  -sched (st 0x0) (st 0x0) (st 0x0)
>                  +sched (st 0xf) (st 0x0) (st 0x0)
>                     exit
>                     #endif
>                  diff --git a/src/shader/videonv110.fpc
>         b/src/shader/videonv110.fpc
>                  index 31d745a..8fbc246 100644
>                  --- a/src/shader/videonv110.fpc
>                  +++ b/src/shader/videonv110.fpc
>                  @@ -1,52 +1,52 @@
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1a0070f,
>                  +0x003c3c01,
>                     0xcff7ff02,
>                     0xe003ff87,
>                     0x00470202,
>                     0x50800000,
>                     0x0027ff00,
>                     0xe043ff88,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xe1e0072f,
>                  +0x001cbc03,
>                     0x4027ff01,
>                     0xe043ff88,
>                     0x2ff70004,
>                     0xc03a0004,
>                     0x2ff70000,
>                     0xc03a0016,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfcc007ef,
>                  +0x001f9801,
>                     0x34170001,
>                     0xf0f00000,
>                     0x00070405,
>                     0x4c681000,
>                     0x00170503,
>                     0x4c581000,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfcc007e6,
>                  +0x001fbc00,
>                     0x00270504,
>                     0x4c581000,
>                     0x00370505,
>                     0x4c581000,
>                     0x34070000,
>                     0xf0f00000,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfc2017e6,
>                  +0x001f8400,
>                     0x00470003,
>                     0x49a00180,
>                     0x00570004,
>                     0x49a00200,
>                     0x00670005,
>                     0x49a00280,
>                  -0xfc0007e0,
>                  -0x001f8000,
>                  +0xfc2007e1,
>                  +0x001f9800,
>                     0x00770100,
>                     0x49a00180,
>                     0x00970102,
>                     0x49a00280,
>                     0x00870101,
>                     0x49a00200,
>                  -0xfc0007e0,
>                  +0xfc0007ef,
>                     0x001f8000,
>                     0x0007000f,
>                     0xe3000000,
> 
>         As for your other comments, I have made the suggested changes.
> 
>         Thanks for your review!
> 
>         Cheers,
>         Aaryaman
> 
>