From 04bdfbcceeb1e368510734f2bb4e03f99ff3b9b7 Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Wed, 19 Nov 2014 16:38:43 -0800
Subject: [PATCH] Add %cmpi/s and %cmpi/u instructions for performance

These bypass the vec4 stack in some common cases, saving instructions
and vec4 manipulations.

Also, a minor improvement to the %flag_set/vec4 instruction: peek at the
top of the vec4 stack and pop it afterwards instead of copying the vector.

Kill a few warnings (e.g. the unused thr parameter of do_verylong_mod).
---
 tgt-vvp/eval_vec4.c |  17 ++++--
 vvp/codes.h         |   2 +
 vvp/compile.cc      |   2 +
 vvp/opcodes.txt     |   2 +
 vvp/vthread.cc      | 123 ++++++++++++++++++++++++++++++++------------
 5 files changed, 109 insertions(+), 37 deletions(-)
diff --git a/tgt-vvp/eval_vec4.c b/tgt-vvp/eval_vec4.c
index 0f29a9dc3..fdd0c893c 100644
--- a/tgt-vvp/eval_vec4.c
+++ b/tgt-vvp/eval_vec4.c
@@ -558,18 +558,27 @@ static void draw_binary_vec4_le(ivl_expr_t expr)
       draw_eval_vec4(le);
       resize_vec4_wid(le, use_wid);
 
-      draw_eval_vec4(re);
-      resize_vec4_wid(re, use_wid);
+      if (ivl_expr_width(re)==use_wid && test_immediate_vec4_ok(re)) {
+	      /* Special case: If the right operand can be handled as
+		 an immediate operand, then use that instead. */
+	    char opcode[8];
+	    snprintf(opcode, sizeof opcode, "%%cmpi/%c", s_flag);
+	    draw_immediate_vec4(re, opcode);
+
+      } else {
+	    draw_eval_vec4(re);
+	    resize_vec4_wid(re, use_wid);
+
+	    fprintf(vvp_out, "    %%cmp/%c;\n", s_flag);
+      }
 
       switch (use_opcode) {
 	  case 'L':
-	    fprintf(vvp_out, "    %%cmp/%c;\n", s_flag);
 	    fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
 	    fprintf(vvp_out, "    %%flag_get/vec4 5;\n");
 	    fprintf(vvp_out, "    %%or;\n");
 	    break;
 	  case '<':
-	    fprintf(vvp_out, "    %%cmp/%c;\n", s_flag);
 	    fprintf(vvp_out, "    %%flag_get/vec4 5;\n");
 	    break;
 	  default:
diff --git a/vvp/codes.h b/vvp/codes.h
index af6c99ba0..9489b92c7 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -62,8 +62,10 @@ extern bool of_CASSIGN_VEC4_OFF(vthread_t thr, vvp_code_t code);
 extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code);
 extern bool of_CAST2(vthread_t thr, vvp_code_t code);
 extern bool of_CMPS(vthread_t thr, vvp_code_t code);
+extern bool of_CMPIS(vthread_t thr, vvp_code_t code);
 extern bool of_CMPSTR(vthread_t thr, vvp_code_t code);
 extern bool of_CMPU(vthread_t thr, vvp_code_t code);
+extern bool of_CMPIU(vthread_t thr, vvp_code_t code);
 extern bool of_CMPWR(vthread_t thr, vvp_code_t code);
 extern bool of_CMPWS(vthread_t thr, vvp_code_t code);
 extern bool of_CMPWU(vthread_t thr, vvp_code_t code);
diff --git a/vvp/compile.cc b/vvp/compile.cc
index 933ef8563..a521b61ab 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -120,6 +120,8 @@ static const struct opcode_table_s opcode_table[] = {
       { "%cmp/wu", of_CMPWU,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
       { "%cmp/x",  of_CMPX,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%cmp/z",  of_CMPZ,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
+      { "%cmpi/s", of_CMPIS,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%cmpi/u", of_CMPIU,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%concat/str",  of_CONCAT_STR,  0,{OA_NONE,  OA_NONE,  OA_NONE} },
       { "%concat/vec4", of_CONCAT_VEC4, 0,{OA_NONE,  OA_NONE,  OA_NONE} },
       { "%concati/str", of_CONCATI_STR, 1,{OA_STRING,OA_NONE,  OA_NONE} },
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index 4813e60dd..1f5a342f9 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -252,6 +252,8 @@ vector2 (binary) value, and push the result.
 
 * %cmp/s
 * %cmp/u
+* %cmpi/s <vala>, <valb>, <wid>
+* %cmpi/u <vala>, <valb>, <wid>
 
 These instructions perform a generic comparison of two vectors of
 equal size. Two values are pulled from the top of the stack, and not
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 68837fbc8..ceab39474 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -1476,27 +1476,12 @@ bool of_CAST2(vthread_t thr, vvp_code_t)
       return true;
 }
 
-/*
- *  %cmp/s
- *
- * Pop the operands from the stack, and do not replace them. The
- * results are written to flag bits:
- *
- *	4: eq  (equal)
- *	5: lt  (less than)
- *	6: eeq (case equal)
- */
-bool of_CMPS(vthread_t thr, vvp_code_t)
+static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
 {
       vvp_bit4_t eq  = BIT4_1;
       vvp_bit4_t eeq = BIT4_1;
       vvp_bit4_t lt  = BIT4_0;
 
-	// We are going to pop these and push nothing in their
-	// place, but for now it is more efficient to use a constant
-	// reference. When we finish, pop the stack without copies.
-      const vvp_vector4_t&rval = thr->peek_vec4(0);
-      const vvp_vector4_t&lval = thr->peek_vec4(1);
 
       assert(rval.size() == lval.size());
 
@@ -1507,8 +1492,7 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
 	    thr->flags[4] = BIT4_X; // eq
 	    thr->flags[5] = BIT4_X; // lt
 	    thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
-	    thr->pop_vec4(2);
-	    return true;
+	    return;
       }
 
 	// Past this point, we know we are dealing only with fully
@@ -1550,11 +1534,55 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
       thr->flags[4] = eq;
       thr->flags[5] = lt;
       thr->flags[6] = eeq;
+}
+
+/*
+ *  %cmp/s
+ *
+ * Pop the operands from the stack, and do not replace them. The
+ * results are written to flag bits:
+ *
+ *	4: eq  (equal)
+ *	5: lt  (less than)
+ *	6: eeq (case equal)
+ */
+bool of_CMPS(vthread_t thr, vvp_code_t)
+{
+	// We are going to pop these and push nothing in their
+	// place, but for now it is more efficient to use a constant
+	// reference. When we finish, pop the stack without copies.
+      const vvp_vector4_t&rval = thr->peek_vec4(0);
+      const vvp_vector4_t&lval = thr->peek_vec4(1);
+
+      do_CMPS(thr, lval, rval);
 
       thr->pop_vec4(2);
       return true;
 }
 
+/*
+ * %cmpi/s <vala>, <valb>, <wid>
+ *
+ * Pop one operand from the vec4 stack, get the other operand from the arguments.
+ */
+bool of_CMPIS(vthread_t thr, vvp_code_t cp)
+{
+      unsigned wid = cp->number;
+
+      vvp_vector4_t&lval = thr->peek_vec4();
+
+	// I expect that most of the bits of an immediate value are
+	// going to be zero, so start the operand vector with all zero
+	// bits. Then we only need to replace the bits that are different.
+      vvp_vector4_t rval (wid, BIT4_0);
+      get_immediate_rval (cp, rval);
+
+      do_CMPS(thr, lval, rval);
+
+      thr->pop_vec4(1);
+      return true;
+}
+
 bool of_CMPSTR(vthread_t thr, vvp_code_t)
 {
       string re = thr->pop_str();
@@ -1582,8 +1610,9 @@ bool of_CMPSTR(vthread_t thr, vvp_code_t)
       return true;
 }
 
-bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid,
-			  const vvp_vector4_t&lval, const vvp_vector4_t&rval)
+static void of_CMPU_the_hard_way(vthread_t thr, unsigned wid,
+				 const vvp_vector4_t&lval,
+				 const vvp_vector4_t&rval)
 {
       vvp_bit4_t eq = BIT4_1;
       vvp_bit4_t eeq = BIT4_1;
@@ -1610,18 +1639,13 @@ bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid,
       thr->flags[4] = eq;
       thr->flags[5] = BIT4_X;
       thr->flags[6] = eeq;
-
-      return true;
 }
 
-bool of_CMPU(vthread_t thr, vvp_code_t cp)
+static void do_CMPU(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
 {
       vvp_bit4_t eq = BIT4_1;
       vvp_bit4_t lt = BIT4_0;
 
-      vvp_vector4_t rval = thr->pop_vec4();
-      vvp_vector4_t lval = thr->pop_vec4();
-
       if (rval.size() != lval.size()) {
 	    cerr << "VVP ERROR: %cmp/u operand width mismatch: lval=" << lval
 		 << ", rval=" << rval << endl;
@@ -1630,12 +1654,12 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
       unsigned wid = lval.size();
 
       unsigned long*larray = lval.subarray(0,wid);
-      if (larray == 0) return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
+      if (larray == 0) return of_CMPU_the_hard_way(thr, wid, lval, rval);
 
       unsigned long*rarray = rval.subarray(0,wid);
       if (rarray == 0) {
 	    delete[]larray;
-	    return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
+	    return of_CMPU_the_hard_way(thr, wid, lval, rval);
       }
 
       unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
@@ -1657,10 +1681,43 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
       thr->flags[4] = eq;
       thr->flags[5] = lt;
       thr->flags[6] = eq;
+}
+
+bool of_CMPU(vthread_t thr, vvp_code_t)
+{
+
+      vvp_vector4_t rval = thr->pop_vec4();
+      vvp_vector4_t lval = thr->pop_vec4();
+
+      do_CMPU(thr, lval, rval);
 
       return true;
 }
 
+/*
+ * %cmpi/u <vala>, <valb>, <wid>
+ *
+ * Pop one operand from the vec4 stack, get the other operand from the arguments.
+ */
+bool of_CMPIU(vthread_t thr, vvp_code_t cp)
+{
+      unsigned wid = cp->number;
+
+      vvp_vector4_t&lval = thr->peek_vec4();
+
+	// I expect that most of the bits of an immediate value are
+	// going to be zero, so start the operand vector with all zero
+	// bits. Then we only need to replace the bits that are different.
+      vvp_vector4_t rval (wid, BIT4_0);
+      get_immediate_rval (cp, rval);
+
+      do_CMPU(thr, lval, rval);
+
+      thr->pop_vec4(1);
+      return true;
+}
+
+
 /*
  * %cmp/x
  */
@@ -2632,8 +2689,9 @@ bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t cp)
       int flag = cp->number;
       assert(flag < vthread_s::FLAGS_COUNT);
 
-      vvp_vector4_t val = thr->pop_vec4();
+      const vvp_vector4_t&val = thr->peek_vec4();
       thr->flags[flag] = val.value(0);
+      thr->pop_vec4(1);
 
       return true;
 }
@@ -3397,8 +3455,7 @@ bool of_LOAD_VEC4A(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
-static void do_verylong_mod(vthread_t thr,
-			    vvp_vector4_t&vala, const vvp_vector4_t&valb,
+static void do_verylong_mod(vvp_vector4_t&vala, const vvp_vector4_t&valb,
 			    bool left_is_neg, bool right_is_neg)
 {
       bool out_is_neg = left_is_neg;
@@ -3579,7 +3636,7 @@ bool of_MOD(vthread_t thr, vvp_code_t)
 	    return true;
 
       } else {
-	    do_verylong_mod(thr, vala, valb, false, false);
+	    do_verylong_mod(vala, valb, false, false);
 	    return true;
       }
 
@@ -3642,7 +3699,7 @@ bool of_MOD_S(vthread_t thr, vvp_code_t)
 
 	    bool left_is_neg  = vala.value(vala.size()-1) == BIT4_1;
 	    bool right_is_neg = valb.value(valb.size()-1) == BIT4_1;
-	    do_verylong_mod(thr, vala, valb, left_is_neg, right_is_neg);
+	    do_verylong_mod(vala, valb, left_is_neg, right_is_neg);
 	    return true;
       }