From 85c7b07a9bd2888d7d7f4ba015e8d89e96d2c48b Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Wed, 3 Dec 2014 11:06:11 -0800
Subject: [PATCH] Implement %cmp/ne and %cmpi/ne

These pull in the inversion of the output flags so that they more
efficiently implement != and !==, without %flag_inv instructions.
---
 tgt-vvp/eval_condit.c | 17 +++++++++++------
 vvp/codes.h           |  2 ++
 vvp/compile.cc        |  2 ++
 vvp/opcodes.txt       |  9 ++++++++-
 vvp/vthread.cc        | 38 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 61 insertions(+), 7 deletions(-)

diff --git a/tgt-vvp/eval_condit.c b/tgt-vvp/eval_condit.c
index 2e78e503b..d4cd104a6 100644
--- a/tgt-vvp/eval_condit.c
+++ b/tgt-vvp/eval_condit.c
@@ -83,26 +83,31 @@ static int draw_condition_binary_compare(ivl_expr_t expr)
       draw_eval_vec4(le);
       resize_vec4_wid(le, use_wid);

+      char use_opcode = ivl_expr_opcode(expr);
+
+
       if (ivl_expr_width(re)==use_wid && test_immediate_vec4_ok(re)) {
             /* Special case: If the right operand can be handled as
                an immediate operand, then use that instead. */
-           draw_immediate_vec4(re, "%cmpi/e");
+           if (use_opcode=='n' || use_opcode=='N')
+                 draw_immediate_vec4(re, "%cmpi/ne");
+           else
+                 draw_immediate_vec4(re, "%cmpi/e");
       } else {
            draw_eval_vec4(re);
            resize_vec4_wid(re, use_wid);
-           fprintf(vvp_out, "    %%cmp/e;\n");
+           if (use_opcode=='n' || use_opcode=='N')
+                 fprintf(vvp_out, "    %%cmp/ne;\n");
+           else
+                 fprintf(vvp_out, "    %%cmp/e;\n");
       }

       switch (ivl_expr_opcode(expr)) {
           case 'n': /* != */
-           fprintf(vvp_out, "    %%flag_inv 4;\n");
-           ; /* fall through.. */
           case 'e': /* == */
            return 4;
            break;
           case 'N': /* !== */
-           fprintf(vvp_out, "    %%flag_inv 6;\n");
-           ; /* fall through.. */
           case 'E': /* === */
            return 6;
           default:
diff --git a/vvp/codes.h b/vvp/codes.h
index b8f5eb295..e31dd9cae 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -63,6 +63,8 @@ extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code);
 extern bool of_CAST2(vthread_t thr, vvp_code_t code);
 extern bool of_CMPE(vthread_t thr, vvp_code_t code);
 extern bool of_CMPIE(vthread_t thr, vvp_code_t code);
+extern bool of_CMPINE(vthread_t thr, vvp_code_t code);
+extern bool of_CMPNE(vthread_t thr, vvp_code_t code);
 extern bool of_CMPS(vthread_t thr, vvp_code_t code);
 extern bool of_CMPIS(vthread_t thr, vvp_code_t code);
 extern bool of_CMPSTR(vthread_t thr, vvp_code_t code);
diff --git a/vvp/compile.cc b/vvp/compile.cc
index 95a11a691..ea71009a8 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -113,6 +113,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%cassign/wr", of_CASSIGN_WR, 1,{OA_FUNC_PTR,OA_NONE, OA_NONE} },
       { "%cast2", of_CAST2, 0, {OA_NONE, OA_NONE, OA_NONE} },
       { "%cmp/e", of_CMPE, 0, {OA_NONE, OA_NONE, OA_NONE} },
+      { "%cmp/ne", of_CMPNE, 0, {OA_NONE, OA_NONE, OA_NONE} },
       { "%cmp/s", of_CMPS, 0, {OA_NONE, OA_NONE, OA_NONE} },
       { "%cmp/str",of_CMPSTR, 0, {OA_NONE, OA_NONE, OA_NONE} },
       { "%cmp/u", of_CMPU, 0, {OA_NONE, OA_NONE, OA_NONE} },
@@ -122,6 +123,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%cmp/x", of_CMPX, 0, {OA_NONE, OA_NONE, OA_NONE} },
       { "%cmp/z", of_CMPZ, 0, {OA_NONE, OA_NONE, OA_NONE} },
       { "%cmpi/e", of_CMPIE, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
+      { "%cmpi/ne",of_CMPINE, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
      { "%cmpi/s", of_CMPIS, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
       { "%cmpi/u", of_CMPIU, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
       { "%concat/str", of_CONCAT_STR, 0,{OA_NONE, OA_NONE, OA_NONE} },
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index 89353899c..b1fba8fc8 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -253,9 +253,11 @@ vector2 (binary) value, and push the result.
 * %cmp/s
 * %cmp/u
 * %cmp/e
+* %cmp/ne
 * %cmpi/s <vala>, <valb>, <wid>
 * %cmpi/u <vala>, <valb>, <wid>
 * %cmpi/e <vala>, <valb>, <wid>
+* %cmpi/ne <vala>, <valb>, <wid>

 These instructions perform a generic comparison of two vectors of
 equal size. Two values are pulled from the top of the stack, and not
@@ -284,9 +286,14 @@ The %cmp/u and %cmp/s differ only in the handling of the lt bit. The
 compare. In either case, if either operand contains x or z, then lt
 bit gets the x value.

-Thje %cmp/e and %cmpi/e variants are the same, but they do not bother
+The %cmp/e and %cmpi/e variants are the same, but they do not bother
 to calculate the lt flag. These are faster if the lt flag is not
 needed.
+The %cmp/ne and %cmpi/ne variants are the same as the %cmp/e and
+%cmpi/e variants, but the 4 and 6 flags are inverted in order to
+eliminate the need for a %flag_inv instruction to implement != and !==
+operations.
+
 * %cmp/wr

 Compare real values for equality and less-then. This opcode pops to
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index bb7d6116b..11061e2e6 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -1560,6 +1560,23 @@ bool of_CMPE(vthread_t thr, vvp_code_t)
       return true;
 }

+bool of_CMPNE(vthread_t thr, vvp_code_t)
+{
+      // We are going to pop these and push nothing in their
+      // place, but for now it is more efficient to use a constant
+      // reference. When we finish, pop the stack without copies.
+      const vvp_vector4_t&rval = thr->peek_vec4(0);
+      const vvp_vector4_t&lval = thr->peek_vec4(1);
+
+      do_CMPE(thr, lval, rval);
+
+      thr->flags[4] = ~thr->flags[4];
+      thr->flags[6] = ~thr->flags[6];
+
+      thr->pop_vec4(2);
+      return true;
+}
+
 /*
  * %cmpi/e <vala>, <valb>, <wid>
  *
@@ -1583,6 +1600,27 @@ bool of_CMPIE(vthread_t thr, vvp_code_t cp)
       return true;
 }

+bool of_CMPINE(vthread_t thr, vvp_code_t cp)
+{
+      unsigned wid = cp->number;
+
+      vvp_vector4_t&lval = thr->peek_vec4();
+
+      // I expect that most of the bits of an immediate value are
+      // going to be zero, so start the result vector with all zero
+      // bits. Then we only need to replace the bits that are different.
+      vvp_vector4_t rval (wid, BIT4_0);
+      get_immediate_rval (cp, rval);
+
+      do_CMPE(thr, lval, rval);
+
+      thr->flags[4] = ~thr->flags[4];
+      thr->flags[6] = ~thr->flags[6];
+
+      thr->pop_vec4(1);
+      return true;
+}
+
 static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval,
                     const vvp_vector4_t&rval)
 {
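
A note on what the new opcodes buy: with this patch, a Verilog "a != b"
condition compiles to a single "%cmp/ne" (the generated code then tests
flag 4), where the old code emitted "%cmp/e" followed by "%flag_inv 4".
The sketch below is a minimal standalone model of the flag semantics
involved, and is not taken from the vvp sources: bit4, not4, cmp_flags,
do_cmpe, and do_cmpne are simplified stand-ins invented for illustration
(the real code uses vvp_bit4_t, do_CMPE, and thr->flags). It demonstrates
the one subtlety that makes the fold safe: the inversion is a 4-state
NOT, so an x result stays x.

#include <cstdio>
#include <vector>

// Standalone sketch; illustrative only, not part of the patch.
// Simplified stand-in for vvp's 4-state bit values.
enum bit4 { B0, B1, BX, BZ };

static char bit4_chr(bit4 b) { return "01xz"[b]; }

// 4-state NOT, modeling how ~ behaves on a 4-state flag:
// 0<->1, and both x and z invert to x.
static bit4 not4(bit4 b)
{
      switch (b) {
          case B0: return B1;
          case B1: return B0;
          default: return BX;
      }
}

// Model of the flags that %cmp/e leaves behind: eq stands in for
// flag 4, the logical equality (x when an x/z bit hides the answer),
// and eeq for flag 6, the case equality (===), which compares x and
// z bits literally.
struct cmp_flags { bit4 eq, eeq; };

static cmp_flags do_cmpe(const std::vector<bit4>&lval,
                         const std::vector<bit4>&rval)
{
      cmp_flags f = { B1, B1 };
      for (size_t idx = 0 ; idx < lval.size() ; idx += 1) {
            if (lval[idx] != rval[idx])
                  f.eeq = B0;
            bool lxz = lval[idx]==BX || lval[idx]==BZ;
            bool rxz = rval[idx]==BX || rval[idx]==BZ;
            if (lxz || rxz) {
                  if (f.eq == B1) f.eq = BX; // unknown, unless already 0
            } else if (lval[idx] != rval[idx]) {
                  f.eq = B0;                 // a definite mismatch wins
            }
      }
      return f;
}

// %cmp/ne modeled as %cmp/e with flags 4 and 6 inverted, standing in
// for the separate %flag_inv instructions the old code emitted.
static cmp_flags do_cmpne(const std::vector<bit4>&lval,
                          const std::vector<bit4>&rval)
{
      cmp_flags f = do_cmpe(lval, rval);
      f.eq  = not4(f.eq);   // flag 4 now answers !=
      f.eeq = not4(f.eeq);  // flag 6 now answers !==
      return f;
}

int main()
{
      std::vector<bit4> a = { B1, B0, BX };
      std::vector<bit4> b = { B1, B0, B0 };
      cmp_flags f = do_cmpne(a, b);
      // a != b is x (the x bit hides the answer); a !== b is 1.
      std::printf("flag4=%c flag6=%c\n", bit4_chr(f.eq), bit4_chr(f.eeq));
      return 0;
}

The same reasoning covers !==: flag 6 carries the case-equality answer,
and of_CMPNE/of_CMPINE invert both flags unconditionally, so the code
generator can test either flag downstream, exactly as the 'n' and 'N'
cases in draw_condition_binary_compare now do.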