From e5eb7541508512ab9289819717154a934247e5be Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Sun, 5 Jan 2014 14:12:27 -0800
Subject: [PATCH] vec4 versions of a bunch of unary operators.

---
 tgt-vvp/eval_vec4.c |  81 +++++++++++++++++++++-
 vvp/compile.cc      |  14 ++--
 vvp/opcodes.txt     |  14 ++--
 vvp/vthread.cc      | 160 ++++++++++++++++++--------------------------
 4 files changed, 163 insertions(+), 106 deletions(-)

diff --git a/tgt-vvp/eval_vec4.c b/tgt-vvp/eval_vec4.c
index 54f4e1cda..086e42be3 100644
--- a/tgt-vvp/eval_vec4.c
+++ b/tgt-vvp/eval_vec4.c
@@ -609,6 +609,21 @@ static void draw_unary_vec4(ivl_expr_t expr, int stuff_ok_flag)
       ivl_expr_t sub = ivl_expr_oper1(expr);
 
       switch (ivl_expr_opcode(expr)) {
+	  case '&':
+	    draw_eval_vec4(sub, stuff_ok_flag);
+	    fprintf(vvp_out, "    %%and/r;\n");
+	    break;
+
+	  case '|':
+	    draw_eval_vec4(sub, stuff_ok_flag);
+	    fprintf(vvp_out, "    %%or/r;\n");
+	    break;
+
+	  case '^':
+	    draw_eval_vec4(sub, stuff_ok_flag);
+	    fprintf(vvp_out, "    %%xor/r;\n");
+	    break;
+
 	  case '~':
 	    draw_eval_vec4(sub, stuff_ok_flag);
 	    fprintf(vvp_out, "    %%inv;\n");
@@ -619,8 +634,72 @@ static void draw_unary_vec4(ivl_expr_t expr, int stuff_ok_flag)
 	    fprintf(vvp_out, "    %%nor/r;\n");
 	    break;
 
+	  case '-':
+	    draw_eval_vec4(sub, stuff_ok_flag);
+	    fprintf(vvp_out, "    %%inv;\n");
+	    fprintf(vvp_out, "    %%pushi/vec4 1, 0, %u;\n", ivl_expr_width(sub));
+	    fprintf(vvp_out, "    %%add;\n");
+	    break;
+
+	  case 'A': /* nand (~&) */
+	    draw_eval_vec4(sub, stuff_ok_flag);
+	    fprintf(vvp_out, "    %%nand/r;\n");
+	    break;
+
+	  case 'N': /* nor (~|) */
+	    draw_eval_vec4(sub, stuff_ok_flag);
+	    fprintf(vvp_out, "    %%nor/r;\n");
+	    break;
+
+	  case 'X': /* xnor (~^) */
+	    draw_eval_vec4(sub, stuff_ok_flag);
+	    fprintf(vvp_out, "    %%xnor/r;\n");
+	    break;
+
+	  case 'm': /* abs(m): only signed operands need the negate sequence */
+	    draw_eval_vec4(sub, stuff_ok_flag);
+	    if (! ivl_expr_signed(sub))
+		  break;
+
+	      /* Test if (m) < 0; a copy of (m) stays on the stack */
+	    fprintf(vvp_out, "    %%dup/vec4;\n");
+	    fprintf(vvp_out, "    %%pushi/vec4 0, 0, %u;\n", ivl_expr_width(sub));
+	    fprintf(vvp_out, "    %%cmp/s;\n");
+	    fprintf(vvp_out, "    %%jmp/0xz T_%u.%u, 5;\n", thread_count, local_count);
+	      /* If so, calculate -(m) as ~(m) + 1 */
+	    fprintf(vvp_out, "    %%inv;\n");
+	    fprintf(vvp_out, "    %%pushi/vec4 1, 0, %u;\n", ivl_expr_width(sub));
+	    fprintf(vvp_out, "    %%add;\n");
+	    fprintf(vvp_out, "T_%u.%u ;\n", thread_count, local_count++); /* retire this label id so the next local label is unique */
+	    break;
+
+	  case 'v': /* Cast real to vec4 */
+	    assert(ivl_expr_value(sub) == IVL_VT_REAL);
+	    draw_eval_real(sub);
+	    fprintf(vvp_out, "    %%cvt/vr %u;\n", ivl_expr_width(expr));
+	    break;
+
+	  case '2': /* Cast expression to bool */
+	    switch (ivl_expr_value(sub)) {
+		case IVL_VT_LOGIC: /* evaluate, then emit %cast2 */
+		  draw_eval_vec4(sub, STUFF_OK_XZ);
+		  fprintf(vvp_out, "    %%cast2;\n");
+		  break;
+		case IVL_VT_BOOL: /* no cast opcode is emitted for a bool operand */
+		  draw_eval_vec4(sub, 0);
+		  break;
+		case IVL_VT_REAL: /* %cvt/vr takes the result width as its operand */
+		  draw_eval_real(sub);
+		  fprintf(vvp_out, "    %%cvt/vr %u;\n", ivl_expr_width(expr));
+		  break;
+		default:
+		  assert(0);
+		  break;
+	    }
+	    break;
+
 	  default:
-	    fprintf(stderr, "XXXX Unary operator %c no implemented\n", ivl_expr_opcode(expr));
+	    fprintf(stderr, "XXXX Unary operator %c not implemented\n", ivl_expr_opcode(expr));
 	    break;
       }
 }
diff --git a/vvp/compile.cc b/vvp/compile.cc
index 8ebf6bb71..e34adeed8 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -90,7 +90,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%addi",   of_ADDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%alloc",  of_ALLOC,  1,  {OA_VPI_PTR,  OA_NONE,     OA_NONE} },
       { "%and",    of_AND,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%and/r",  of_ANDR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%and/r",  of_ANDR,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%andi",   of_ANDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%assign/ar",of_ASSIGN_AR,2,{OA_ARR_PTR,OA_BIT1,     OA_NONE} },
       { "%assign/ar/d",of_ASSIGN_ARD,2,{OA_ARR_PTR,OA_BIT1,  OA_NONE} },
@@ -117,7 +117,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%cassign/vec4",    of_CASSIGN_VEC4,    1,{OA_FUNC_PTR,OA_NONE,     OA_NONE} },
       { "%cassign/vec4/off",of_CASSIGN_VEC4_OFF,2,{OA_FUNC_PTR,OA_BIT1,     OA_NONE} },
       { "%cassign/wr",  of_CASSIGN_WR,  1,{OA_FUNC_PTR,OA_NONE,     OA_NONE} },
-      { "%cast2",  of_CAST2,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%cast2",  of_CAST2,  0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%cmp/s",  of_CMPS,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%cmp/str",of_CMPSTR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%cmp/u",  of_CMPU,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
@@ -137,7 +137,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%cvt/rv/s", of_CVT_RV_S,2, {OA_BIT1,    OA_BIT2,    OA_NONE} },
       { "%cvt/sr", of_CVT_SR, 1,  {OA_BIT1,     OA_NONE,     OA_NONE} },
       { "%cvt/ur", of_CVT_UR, 1,  {OA_BIT1,     OA_NONE,     OA_NONE} },
-      { "%cvt/vr", of_CVT_VR, 2,  {OA_BIT1,     OA_NUMBER,   OA_NONE} },
+      { "%cvt/vr", of_CVT_VR, 1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
       { "%deassign",of_DEASSIGN,3,{OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
       { "%deassign/wr",of_DEASSIGN_WR,1,{OA_FUNC_PTR, OA_NONE,     OA_NONE} },
       { "%debug/thr",  of_DEBUG_THR,  0,{OA_NONE,     OA_NONE,     OA_NONE} },
@@ -210,7 +210,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%mul/wr", of_MUL_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%muli",   of_MULI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%nand",   of_NAND,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%nand/r", of_NANDR,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%nand/r", of_NANDR,  0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%new/cobj",  of_NEW_COBJ,  1, {OA_VPI_PTR,OA_NONE,  OA_NONE} },
       { "%new/darray",of_NEW_DARRAY,2, {OA_BIT1,   OA_STRING,OA_NONE} },
       { "%noop",   of_NOOP,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
@@ -218,7 +218,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%nor/r",  of_NORR,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%null",   of_NULL,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%or",     of_OR,     0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%or/r",   of_ORR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%or/r",   of_ORR,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%pad/s",  of_PAD_S,  1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
       { "%pad/u",  of_PAD_U,  1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
       { "%part/s", of_PART_S, 1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
@@ -276,9 +276,9 @@ static const struct opcode_table_s opcode_table[] = {
       { "%wait",   of_WAIT,   1,  {OA_FUNC_PTR, OA_NONE,     OA_NONE} },
       { "%wait/fork",of_WAIT_FORK,0,{OA_NONE,   OA_NONE,     OA_NONE} },
       { "%xnor",   of_XNOR,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%xnor/r", of_XNORR,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%xnor/r", of_XNORR,  0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%xor",    of_XOR,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%xor/r",  of_XORR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%xor/r",  of_XORR,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { 0, of_NOOP, 0, {OA_NONE, OA_NONE, OA_NONE} }
 };
 
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index c1a1e995b..bb1dd3a43 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -95,6 +95,11 @@ bits. AND means the following:
 The input vectors must be the same width, and the output vector will
 be the width of the input.
 
+* %and/r
+
+Pop the top value from the vec4 stack, perform a reduction &, then
+push the single-bit result back onto the vec4 stack.
+
 * %assign/ar <array-label>, <delay>
 * %assign/ar/d <array-label>, <delayx>
 * %assign/ar/e <array-label>
@@ -414,11 +419,12 @@ value stack. Precision may be lost in the conversion.
 The %cvt/rv/s instruction is the same as %cvt/rv, but treats the thread
 vector as a signed value.
 
-* %cvt/vr <bit-l>, <wid>
+* %cvt/vr <wid>
 
-The %cvt/vr opcode converts a real word from the stack to a thread vector
-starting at <bit-l> and with the width <wid>. Non-integer precision is
-lost in the conversion, and the real value is popped from the stack.
+The %cvt/vr opcode converts a real word from the stack to a vec4 that
+is <wid> wide. Non-integer precision is lost in the conversion, and
+the real value is popped from the stack. The result is pushed to the
+vec4 stack.
 
 * %deassign <var-label>, <base>, <width>
 
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 71791c3cf..c8ffb6ba4 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -1742,36 +1742,25 @@ bool of_CASSIGN_WR(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
-
-bool of_CAST2(vthread_t thr, vvp_code_t cp)
+/*
+ * %cast2 -- pop a vec4, force every bit that is not 1 to 0, push the result
+ */
+bool of_CAST2(vthread_t thr, vvp_code_t)
 {
-#if 0
-      unsigned dst = cp->bit_idx[0];
-      unsigned src = cp->bit_idx[1];
-      unsigned wid = cp->number;
+      vvp_vector4_t val = thr->pop_vec4();
+      unsigned wid = val.size();
 
-      thr_check_addr(thr, dst+wid-1);
-      thr_check_addr(thr, src+wid-1);
-
-      vvp_vector4_t res;
-      switch (src) {
-	  case 0:
-	  case 2:
-	  case 3:
-	    res = vvp_vector4_t(wid, BIT4_0);
-	    break;
-	  case 1:
-	    res = vvp_vector4_t(wid, BIT4_1);
-	    break;
-	  default:
-	    res = vector2_to_vector4(vvp_vector2_t(vthread_bits_to_vector(thr, src, wid)), wid);
-	    break;
+      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
+	    switch (val.value(idx)) {
+		case BIT4_1:
+		  val.set_bit(idx, BIT4_1);
+		  break;
+		default: /* any bit that is not BIT4_1 becomes BIT4_0 */
+		  val.set_bit(idx, BIT4_0);
+		  break;
+	    }
       }
-
-      thr->bits4.set_vec(dst, res);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cast2 ...\n");
-#endif
+      thr->push_vec4(val);
       return true;
 }
 
@@ -2272,22 +2261,15 @@ bool of_CVT_UR(vthread_t thr, vvp_code_t cp)
 }
 
 /*
- * %cvt/vr <bit> <wid>
+ * %cvt/vr <wid> -- pop a real, convert it to a <wid>-bit vec4, push the result
  */
 bool of_CVT_VR(vthread_t thr, vvp_code_t cp)
 {
-#if 0
       double r = thr->pop_real();
-      unsigned base = cp->bit_idx[0];
       unsigned wid = cp->number;
-      vvp_vector4_t tmp(wid, r);
 
-	/* Make sure there is enough space for the new vector. */
-      thr_check_addr(thr, base+wid-1);
-      thr->bits4.set_vec(base, tmp);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cvt/vr ...\n");
-#endif
+      vvp_vector4_t tmp(wid, r);
+      thr->push_vec4(tmp);
       return true;
 }
 
@@ -4709,45 +4691,43 @@ bool of_NULL(vthread_t thr, vvp_code_t)
       return true;
 }
 
-
-bool of_ANDR(vthread_t thr, vvp_code_t cp)
+/*
+ * %and/r -- pop a vec4, push its 1-bit reduction AND (any 0 gives 0, otherwise any non-1 bit gives x)
+ */
+bool of_ANDR(vthread_t thr, vvp_code_t)
 {
-#if 0
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t val = thr->pop_vec4();
 
       vvp_bit4_t lb = BIT4_1;
-      unsigned idx2 = cp->bit_idx[1];
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2+idx);
+      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
+	    vvp_bit4_t rb = val.value(idx);
 	    if (rb == BIT4_0) {
 		  lb = BIT4_0;
 		  break;
 	    }
 
-	    if (rb != BIT4_1)
+	    if (rb != BIT4_1)
 		  lb = BIT4_X;
       }
 
-      thr_put_bit(thr, cp->bit_idx[0], lb);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%and/r ...\n");
-#endif
+      vvp_vector4_t res (1, lb);
+      thr->push_vec4(res);
+
       return true;
 }
 
-bool of_NANDR(vthread_t thr, vvp_code_t cp)
+/*
+ * %nand/r
+ */
+bool of_NANDR(vthread_t thr, vvp_code_t)
 {
-#if 0
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t val = thr->pop_vec4();
 
       vvp_bit4_t lb = BIT4_0;
-      unsigned idx2 = cp->bit_idx[1];
+      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2+idx);
+	    vvp_bit4_t rb = val.value(idx);
 	    if (rb == BIT4_0) {
 		  lb = BIT4_1;
 		  break;
@@ -4757,24 +4737,22 @@ bool of_NANDR(vthread_t thr, vvp_code_t cp)
 		  lb = BIT4_X;
       }
 
-      thr_put_bit(thr, cp->bit_idx[0], lb);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%nand/r ...\n");
-#endif
+      vvp_vector4_t res (1, lb);
+      thr->push_vec4(res);
+
       return true;
 }
 
-bool of_ORR(vthread_t thr, vvp_code_t cp)
+/*
+ * %or/r
+ */
+bool of_ORR(vthread_t thr, vvp_code_t)
 {
-#if 0
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t val = thr->pop_vec4();
 
       vvp_bit4_t lb = BIT4_0;
-      unsigned idx2 = cp->bit_idx[1];
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2+idx);
+      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
+	    vvp_bit4_t rb = val.value(idx);
 	    if (rb == BIT4_1) {
 		  lb = BIT4_1;
 		  break;
@@ -4784,24 +4762,22 @@ bool of_ORR(vthread_t thr, vvp_code_t cp)
 		  lb = BIT4_X;
       }
 
-      thr_put_bit(thr, cp->bit_idx[0], lb);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%orr ...\n");
-#endif
+      vvp_vector4_t res (1, lb);
+      thr->push_vec4(res);
       return true;
 }
 
-bool of_XORR(vthread_t thr, vvp_code_t cp)
+/*
+ * %xor/r
+ */
+bool of_XORR(vthread_t thr, vvp_code_t)
 {
-#if 0
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t val = thr->pop_vec4();
 
       vvp_bit4_t lb = BIT4_0;
-      unsigned idx2 = cp->bit_idx[1];
+      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2+idx);
+	    vvp_bit4_t rb = val.value(idx);
 	    if (rb == BIT4_1)
 		  lb = ~lb;
 	    else if (rb != BIT4_0) {
@@ -4810,24 +4786,22 @@ bool of_XORR(vthread_t thr, vvp_code_t cp)
 	    }
       }
 
-      thr_put_bit(thr, cp->bit_idx[0], lb);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%xorr ...\n");
-#endif
+      vvp_vector4_t res (1, lb);
+      thr->push_vec4(res);
       return true;
 }
 
+/*
+ * %xnor/r
+ */
 bool of_XNORR(vthread_t thr, vvp_code_t cp)
 {
-#if 0
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t val = thr->pop_vec4();
 
       vvp_bit4_t lb = BIT4_1;
-      unsigned idx2 = cp->bit_idx[1];
+      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2+idx);
+	    vvp_bit4_t rb = val.value(idx);
 	    if (rb == BIT4_1)
 		  lb = ~lb;
 	    else if (rb != BIT4_0) {
@@ -4836,10 +4810,8 @@ bool of_XNORR(vthread_t thr, vvp_code_t cp)
 	    }
       }
 
-      thr_put_bit(thr, cp->bit_idx[0], lb);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%xnorr...\n");
-#endif
+      vvp_vector4_t res (1, lb);
+      thr->push_vec4(res);
       return true;
 }