Add %cmpi/s and %cmpi/u instructions for performance

These bypass the vec4 stack in some common cases, saving instructions and vec4 manipulations. Also, minor improvement to the %flag/set/vec4 statement. Kill a few warnings.
2014-11-19 16:38:43 -08:00 · 2014-11-19 16:38:43 -08:00 · 04bdfbccee
parent 2acc9fbdee
commit 04bdfbccee
5 changed files with 109 additions and 37 deletions
--- a/tgt-vvp/eval_vec4.c
+++ b/tgt-vvp/eval_vec4.c
@ -558,18 +558,27 @@ static void draw_binary_vec4_le(ivl_expr_t expr)
      draw_eval_vec4(le);
      resize_vec4_wid(le, use_wid);
      if (ivl_expr_width(re)==use_wid && test_immediate_vec4_ok(re)) {
 	      /* Special case: If the right operand can be handled as
 		 an immediate operand, then use that instead. */
 	    char opcode[8];
 	    snprintf(opcode, sizeof opcode, "%%cmpi/%c", s_flag);
 	    draw_immediate_vec4(re, opcode);
      } else {
 	    draw_eval_vec4(re);
 	    resize_vec4_wid(re, use_wid);
 	    fprintf(vvp_out, "    %%cmp/%c;\n", s_flag);
      }
      switch (use_opcode) {
 	  case 'L':
 	    fprintf(vvp_out, "    %%cmp/%c;\n", s_flag);
 	    fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
 	    fprintf(vvp_out, "    %%flag_get/vec4 5;\n");
 	    fprintf(vvp_out, "    %%or;\n");
 	    break;
 	  case '<':
 	    fprintf(vvp_out, "    %%cmp/%c;\n", s_flag);
 	    fprintf(vvp_out, "    %%flag_get/vec4 5;\n");
 	    break;
 	  default:
--- a/vvp/codes.h
+++ b/vvp/codes.h
@ -62,8 +62,10 @@ extern bool of_CASSIGN_VEC4_OFF(vthread_t thr, vvp_code_t code);
 extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code);
 extern bool of_CAST2(vthread_t thr, vvp_code_t code);
 extern bool of_CMPS(vthread_t thr, vvp_code_t code);
 extern bool of_CMPIS(vthread_t thr, vvp_code_t code);
 extern bool of_CMPSTR(vthread_t thr, vvp_code_t code);
 extern bool of_CMPU(vthread_t thr, vvp_code_t code);
 extern bool of_CMPIU(vthread_t thr, vvp_code_t code);
 extern bool of_CMPWR(vthread_t thr, vvp_code_t code);
 extern bool of_CMPWS(vthread_t thr, vvp_code_t code);
 extern bool of_CMPWU(vthread_t thr, vvp_code_t code);
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@ -120,6 +120,8 @@ static const struct opcode_table_s opcode_table[] = {
      { "%cmp/wu", of_CMPWU,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
      { "%cmp/x",  of_CMPX,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%cmp/z",  of_CMPZ,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%cmpi/s", of_CMPIS,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%cmpi/u", of_CMPIU,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%concat/str",  of_CONCAT_STR,  0,{OA_NONE,  OA_NONE,  OA_NONE} },
      { "%concat/vec4", of_CONCAT_VEC4, 0,{OA_NONE,  OA_NONE,  OA_NONE} },
      { "%concati/str", of_CONCATI_STR, 1,{OA_STRING,OA_NONE,  OA_NONE} },
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@ -252,6 +252,8 @@ vector2 (binary) value, and push the result.
 * %cmp/s
 * %cmp/u
 * %cmpi/s <vala>, <valb>, <wid>
 * %cmpi/u <vala>, <valb>, <wid>
 These instructions perform a generic comparison of two vectors of
 equal size. Two values are pulled from the top of the stack, and not
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@ -1476,27 +1476,12 @@ bool of_CAST2(vthread_t thr, vvp_code_t)
      return true;
 }
-/*
+static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
 *  %cmp/s
 *
 * Pop the operands from the stack, and do not replace them. The
 * results are written to flag bits:
 *
 *	4: eq  (equal)
 *	5: lt  (less than)
 *	6: eeq (case equal)
 */
 bool of_CMPS(vthread_t thr, vvp_code_t)
 {
      vvp_bit4_t eq  = BIT4_1;
      vvp_bit4_t eeq = BIT4_1;
      vvp_bit4_t lt  = BIT4_0;
 	// We are going to pop these and push nothing in their
 	// place, but for now it is more efficient to use a constant
 	// reference. When we finish, pop the stack without copies.
      const vvp_vector4_t&rval = thr->peek_vec4(0);
      const vvp_vector4_t&lval = thr->peek_vec4(1);
      assert(rval.size() == lval.size());
@ -1507,8 +1492,7 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
 	    thr->flags[4] = BIT4_X; // eq
 	    thr->flags[5] = BIT4_X; // lt
 	    thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
-	    thr->pop_vec4(2);
+	    return;
 	    return true;
      }
 	// Past this point, we know we are dealing only with fully
@ -1550,11 +1534,55 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
      thr->flags[4] = eq;
      thr->flags[5] = lt;
      thr->flags[6] = eeq;
 }
 /*
 *  %cmp/s
 *
 * Pop the operands from the stack, and do not replace them. The
 * results are written to flag bits:
 *
 *	4: eq  (equal)
 *	5: lt  (less than)
 *	6: eeq (case equal)
 */
 bool of_CMPS(vthread_t thr, vvp_code_t)
 {
 	// We are going to pop these and push nothing in their
 	// place, but for now it is more efficient to use a constant
 	// reference. When we finish, pop the stack without copies.
      const vvp_vector4_t&rval = thr->peek_vec4(0);
      const vvp_vector4_t&lval = thr->peek_vec4(1);
      do_CMPS(thr, lval, rval);
      thr->pop_vec4(2);
      return true;
 }
 /*
 * %cmpi/s <vala>, <valb>, <wid>
 *
 * Pop1 operand, get the other operand from the arguments.
 */
 bool of_CMPIS(vthread_t thr, vvp_code_t cp)
 {
      unsigned wid = cp->number;
      vvp_vector4_t&lval = thr->peek_vec4();
 	// I expect that most of the bits of an immediate value are
 	// going to be zero, so start the result vector with all zero
 	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t rval (wid, BIT4_0);
      get_immediate_rval (cp, rval);
      do_CMPS(thr, lval, rval);
      thr->pop_vec4(1);
      return true;
 }
 bool of_CMPSTR(vthread_t thr, vvp_code_t)
 {
      string re = thr->pop_str();
@ -1582,8 +1610,9 @@ bool of_CMPSTR(vthread_t thr, vvp_code_t)
      return true;
 }
-bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid,
+static void of_CMPU_the_hard_way(vthread_t thr, unsigned wid,
-			  const vvp_vector4_t&lval, const vvp_vector4_t&rval)
+				 const vvp_vector4_t&lval,
 				 const vvp_vector4_t&rval)
 {
      vvp_bit4_t eq = BIT4_1;
      vvp_bit4_t eeq = BIT4_1;
@ -1610,18 +1639,13 @@ bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid,
      thr->flags[4] = eq;
      thr->flags[5] = BIT4_X;
      thr->flags[6] = eeq;
      return true;
 }
-bool of_CMPU(vthread_t thr, vvp_code_t cp)
+static void do_CMPU(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
 {
      vvp_bit4_t eq = BIT4_1;
      vvp_bit4_t lt = BIT4_0;
      vvp_vector4_t rval = thr->pop_vec4();
      vvp_vector4_t lval = thr->pop_vec4();
      if (rval.size() != lval.size()) {
 	    cerr << "VVP ERROR: %cmp/u operand width mismatch: lval=" << lval
 		 << ", rval=" << rval << endl;
@ -1630,12 +1654,12 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
      unsigned wid = lval.size();
      unsigned long*larray = lval.subarray(0,wid);
-      if (larray == 0) return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
+      if (larray == 0) return of_CMPU_the_hard_way(thr, wid, lval, rval);
      unsigned long*rarray = rval.subarray(0,wid);
      if (rarray == 0) {
 	    delete[]larray;
-	    return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
+	    return of_CMPU_the_hard_way(thr, wid, lval, rval);
      }
      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
@ -1657,10 +1681,43 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
      thr->flags[4] = eq;
      thr->flags[5] = lt;
      thr->flags[6] = eq;
 }
 bool of_CMPU(vthread_t thr, vvp_code_t)
 {
      vvp_vector4_t rval = thr->pop_vec4();
      vvp_vector4_t lval = thr->pop_vec4();
      do_CMPU(thr, lval, rval);
      return true;
 }
 /*
 * %cmpi/u <vala>, <valb>, <wid>
 *
 * Pop1 operand, get the other operand from the arguments.
 */
 bool of_CMPIU(vthread_t thr, vvp_code_t cp)
 {
      unsigned wid = cp->number;
      vvp_vector4_t&lval = thr->peek_vec4();
 	// I expect that most of the bits of an immediate value are
 	// going to be zero, so start the result vector with all zero
 	// bits. Then we only need to replace the bits that are different.
      vvp_vector4_t rval (wid, BIT4_0);
      get_immediate_rval (cp, rval);
      do_CMPU(thr, lval, rval);
      thr->pop_vec4(1);
      return true;
 }
 /*
 * %cmp/x
 */
@ -2632,8 +2689,9 @@ bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t cp)
      int flag = cp->number;
      assert(flag < vthread_s::FLAGS_COUNT);
-      vvp_vector4_t val = thr->pop_vec4();
+      const vvp_vector4_t&val = thr->peek_vec4();
      thr->flags[flag] = val.value(0);
      thr->pop_vec4(1);
      return true;
 }
@ -3397,8 +3455,7 @@ bool of_LOAD_VEC4A(vthread_t thr, vvp_code_t cp)
      return true;
 }
-static void do_verylong_mod(vthread_t thr,
+static void do_verylong_mod(vvp_vector4_t&vala, const vvp_vector4_t&valb,
 			    vvp_vector4_t&vala, const vvp_vector4_t&valb,
 			    bool left_is_neg, bool right_is_neg)
 {
      bool out_is_neg = left_is_neg;
@ -3579,7 +3636,7 @@ bool of_MOD(vthread_t thr, vvp_code_t)
 	    return true;
      } else {
-	    do_verylong_mod(thr, vala, valb, false, false);
+	    do_verylong_mod(vala, valb, false, false);
 	    return true;
      }
@ -3642,7 +3699,7 @@ bool of_MOD_S(vthread_t thr, vvp_code_t)
 	    bool left_is_neg  = vala.value(vala.size()-1) == BIT4_1;
 	    bool right_is_neg = valb.value(valb.size()-1) == BIT4_1;
-	    do_verylong_mod(thr, vala, valb, left_is_neg, right_is_neg);
+	    do_verylong_mod(vala, valb, left_is_neg, right_is_neg);
 	    return true;
      }