From 301edf69d3879ba38c2cee6afba6bbf5ad9bdc58 Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Tue, 18 Nov 2014 12:27:55 -0800
Subject: [PATCH] Add and use %concati/vec4 and %addi instructions.

Also, clean up some warnings, and optimize some existing opcodes.
---
 tgt-vvp/eval_vec4.c | 202 +++++++++++++++++++++++++++++++++-----------
 vvp/codes.h         |   2 +
 vvp/compile.cc      |   8 +-
 vvp/opcodes.txt     |  12 +++
 vvp/vthread.cc      | 176 ++++++++++++++++++++++++++++++++------
 5 files changed, 325 insertions(+), 75 deletions(-)

diff --git a/tgt-vvp/eval_vec4.c b/tgt-vvp/eval_vec4.c
index 4367986af..0f29a9dc3 100644
--- a/tgt-vvp/eval_vec4.c
+++ b/tgt-vvp/eval_vec4.c
@@ -38,6 +38,66 @@ void resize_vec4_wid(ivl_expr_t expr, unsigned wid)
 	    fprintf(vvp_out, "    %%pad/u %u;\n", wid);
 }
 
+/*
+ * Test if re is a NUMBER that draw_immediate_vec4 can emit (bits 32+ all '0').
+ */
+static int test_immediate_vec4_ok(ivl_expr_t re)
+{
+      const char*bits;
+      unsigned idx;
+
+      if (ivl_expr_type(re) != IVL_EX_NUMBER)
+	    return 0;
+
+      if (ivl_expr_width(re) <= 32)
+	    return 1;
+
+      bits = ivl_expr_bits(re);
+
+      for (idx = 32 ; idx < ivl_expr_width(re) ; idx += 1) {
+	    if (bits[idx] != '0')
+		  return 0;
+      }
+
+      return 1;
+}
+
+static void draw_immediate_vec4(ivl_expr_t re, const char*opcode)
+{
+      unsigned long val0 = 0;
+      unsigned long valx = 0;
+      unsigned wid = ivl_expr_width(re);
+      const char*bits = ivl_expr_bits(re);
+
+      unsigned idx;
+
+      for (idx = 0 ; idx < wid ; idx += 1) {
+	    assert( ((val0|valx)&0x80000000UL) == 0UL );
+	    val0 <<= 1;
+	    valx <<= 1;
+	    switch (bits[wid-idx-1]) {
+		case '0':
+		  break;
+		case '1':
+		  val0 |= 1;
+		  break;
+		case 'x':
+		  val0 |= 1;
+		  valx |= 1;
+		  break;
+		case 'z':
+		  val0 |= 0;
+		  valx |= 1;
+		  break;
+		default:
+		  assert(0);
+		  break;
+	    }
+      }
+
+      fprintf(vvp_out, "    %s %lu, %lu, %u;\n", opcode, val0, valx, wid);
+}
+
 static void draw_binary_vec4_arith(ivl_expr_t expr)
 {
       ivl_expr_t le = ivl_expr_oper1(expr);
@@ -58,6 +118,21 @@ static void draw_binary_vec4_arith(ivl_expr_t expr)
       if (lwid != ewid) {
 	    fprintf(vvp_out, "    %%pad/%c %u;\n", ivl_expr_signed(le)? 's' : 'u', ewid);
       }
+
+	/* Special case: If the re expression can be collected into an
+	   immediate operand, and the instruction supports it, then
+	   generate an immediate instruction instead of the generic
+	   version. */
+      if (rwid==ewid && test_immediate_vec4_ok(re)) {
+	    switch (ivl_expr_opcode(expr)) {
+		case '+':
+		  draw_immediate_vec4(re, "%addi");
+		  return;
+		default:
+		  break;
+	    }
+      }
+
       draw_eval_vec4(re);
       if (rwid != ewid) {
 	    fprintf(vvp_out, "    %%pad/%c %u;\n", ivl_expr_signed(re)? 's' : 'u', ewid);
@@ -618,40 +693,18 @@ static void draw_binary_vec4(ivl_expr_t expr)
       }
 }
 
-static void draw_concat_vec4(ivl_expr_t expr)
-{
-	/* Repeat the concatenation this many times to make a
-	   super-concatenation. */
-      unsigned repeat = ivl_expr_repeat(expr);
-	/* This is the number of expressions that go into the
-	   concatenation. */
-      unsigned num_sube = ivl_expr_parms(expr);
-      unsigned sub_idx;
-
-      assert(num_sube > 0);
-
-	/* Start with the least-significant bits. */
-      draw_eval_vec4(ivl_expr_parm(expr, 0));
-
-      for (sub_idx = 1 ; sub_idx < num_sube ; sub_idx += 1) {
-	      /* Concatenate progressively higher parts. */
-	    draw_eval_vec4(ivl_expr_parm(expr, sub_idx));
-	    fprintf(vvp_out, "    %%concat/vec4;\n");
-      }
-
-      if (repeat > 1) {
-	    fprintf(vvp_out, "    %%replicate %u;\n", repeat);
-      }
-}
-
 /*
- * Push a number into the vec4 stack using %pushi/vec4
- * instructions. The %pushi/vec4 instruction can only handle up to 32
- * non-zero bits, so if there are more than that, then generate
- * multiple %pushi/vec4 statements, and use %concat/vec4 statements to
- * concatenate the vectors into the desired result.
+ * This handles two special cases:
+ *   1) Making a large IVL_EX_NUMBER as an immediate value. In this
+ *   case, start with a %pushi/vec4 to get the stack started, then
+ *   continue with %concati/vec4 instructions to build that number
+ *   up.
+ *
+ *   2) Concatenating a large IVL_EX_NUMBER to the current top of the
+ *   stack. In this case, start with %concati/vec4 and continue
+ *   generating %concati/vec4 instructions to finish up the large number.
  */
-static void draw_number_vec4(ivl_expr_t expr)
+static void draw_concat_number_vec4(ivl_expr_t expr, int as_concati)
 {
       unsigned long val0 = 0;
       unsigned long valx = 0;
@@ -660,7 +713,7 @@ static void draw_number_vec4(ivl_expr_t expr)
 
       unsigned idx;
       int accum = 0;
-      int count_pushi = 0;
+      int count_pushi = as_concati? 1 : 0;
 
 	/* Scan the literal bits, MSB first. */
       for (idx = 0 ; idx < wid ; idx += 1) {
@@ -693,27 +746,82 @@ static void draw_number_vec4(ivl_expr_t expr)
 		 then write it out, generate a %concat/vec4, and set
 		 up to handle more bits. */
 	    if ( (val0|valx) & 0x80000000UL ) {
-		  fprintf(vvp_out, "    %%pushi/vec4 %lu, %lu, %d;\n", val0, valx, accum);
+		  if (count_pushi) {
+			fprintf(vvp_out, "    %%concati/vec4 %lu, %lu, %d;\n",
+				val0, valx, accum);
+
+		  } else {
+			fprintf(vvp_out, "    %%pushi/vec4 %lu, %lu, %d;\n",
+				val0, valx, accum);
+		  }
+
 		  accum = 0;
 		  val0 = 0;
 		  valx = 0;
-		    /* If there is already at least 1 pushi, then
-		       concatenate this result to what we've done
-		       already. */
-		  if (count_pushi)
-			fprintf(vvp_out, "    %%concat/vec4;\n");
+
 		  count_pushi += 1;
 	    }
       }
 
       if (accum) {
-	    fprintf(vvp_out, "    %%pushi/vec4 %lu, %lu, %u;\n", val0, valx, accum);
-	    if (count_pushi)
-		  fprintf(vvp_out, "    %%concat/vec4;\n");
-	    count_pushi += 1;
+	    if (count_pushi) {
+		  fprintf(vvp_out, "    %%concati/vec4 %lu, %lu, %u;\n",
+			  val0, valx, accum);
+	    } else {
+		  fprintf(vvp_out, "    %%pushi/vec4 %lu, %lu, %u;\n",
+			  val0, valx, accum);
+	    }
       }
 }
 
+static void draw_concat_vec4(ivl_expr_t expr)
+{
+	/* Repeat the concatenation this many times to make a
+	   super-concatenation. */
+      unsigned repeat = ivl_expr_repeat(expr);
+	/* This is the number of expressions that go into the
+	   concatenation. */
+      unsigned num_sube = ivl_expr_parms(expr);
+      unsigned sub_idx;
+
+      assert(num_sube > 0);
+
+	/* Start with the most-significant bits. */
+      draw_eval_vec4(ivl_expr_parm(expr, 0));
+
+      for (sub_idx = 1 ; sub_idx < num_sube ; sub_idx += 1) {
+	      /* Concatenate progressively lower parts. */
+	    ivl_expr_t sube = ivl_expr_parm(expr, sub_idx);
+
+	      /* Special case: The next expression is a NUMBER that
+		 can be concatenated using %concati/vec4
+		 instructions. */
+	    if (ivl_expr_type(sube) == IVL_EX_NUMBER) {
+		  draw_concat_number_vec4(sube, 1);
+		  continue;
+	    }
+
+	    draw_eval_vec4(sube);
+	    fprintf(vvp_out, "    %%concat/vec4; draw_concat_vec4\n");
+      }
+
+      if (repeat > 1) {
+	    fprintf(vvp_out, "    %%replicate %u;\n", repeat);
+      }
+}
+
+/*
+ * Push a number into the vec4 stack using %pushi/vec4
+ * instructions. The %pushi/vec4 instruction can only handle up to 32
+ * non-zero bits, so if there are more than that, then generate
+ * a %pushi/vec4 for the most significant bits, then %concati/vec4
+ * statements to append the remaining bits of the desired result.
+ */
+static void draw_number_vec4(ivl_expr_t expr)
+{
+      draw_concat_number_vec4(expr, 0);
+}
+
 static void draw_property_vec4(ivl_expr_t expr)
 {
       ivl_signal_t sig = ivl_expr_signal(expr);
@@ -873,20 +981,20 @@ static void draw_string_vec4(ivl_expr_t expr)
 	    p += 1;
 	    tmp_wid += 8;
 	    if (tmp_wid == 32) {
-		  fprintf(vvp_out, "    %%pushi/vec4 %lu, 0, 32;\n", tmp);
+		  fprintf(vvp_out, "    %%pushi/vec4 %lu, 0, 32; draw_string_vec4\n", tmp);
 		  tmp = 0;
 		  tmp_wid = 0;
 		  if (push_flag == 0)
 			push_flag += 1;
 		  else
-			fprintf(vvp_out, "    %%concat/vec4;\n");
+			fprintf(vvp_out, "    %%concat/vec4; draw_string_vec4\n");
 	    }
       }
 
       if (tmp_wid > 0) {
-	    fprintf(vvp_out, "    %%pushi/vec4 %lu, 0, %u;\n", tmp, tmp_wid);
+	    fprintf(vvp_out, "    %%pushi/vec4 %lu, 0, %u; draw_string_vec4\n", tmp, tmp_wid);
 	    if (push_flag != 0)
-		  fprintf(vvp_out, "    %%concat/vec4;\n");
+		  fprintf(vvp_out, "    %%concat/vec4; draw_string_vec4\n");
       }
 
       free(fp);
diff --git a/vvp/codes.h b/vvp/codes.h
index f3c0511ff..c89b05572 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -34,6 +34,7 @@ typedef bool (*vvp_code_fun)(vthread_t thr, vvp_code_t code);
 extern bool of_ABS_WR(vthread_t thr, vvp_code_t code);
 extern bool of_ADD(vthread_t thr, vvp_code_t code);
 extern bool of_ADD_WR(vthread_t thr, vvp_code_t code);
+extern bool of_ADDI(vthread_t thr, vvp_code_t code);
 extern bool of_ALLOC(vthread_t thr, vvp_code_t code);
 extern bool of_AND(vthread_t thr, vvp_code_t code);
 extern bool of_ANDR(vthread_t thr, vvp_code_t code);
@@ -71,6 +72,7 @@ extern bool of_CMPZ(vthread_t thr, vvp_code_t code);
 extern bool of_CONCAT_STR(vthread_t thr, vvp_code_t code);
 extern bool of_CONCATI_STR(vthread_t thr, vvp_code_t code);
 extern bool of_CONCAT_VEC4(vthread_t thr, vvp_code_t code);
+extern bool of_CONCATI_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_CVT_RS(vthread_t thr, vvp_code_t code);
 extern bool of_CVT_RU(vthread_t thr, vvp_code_t code);
 extern bool of_CVT_RV(vthread_t thr, vvp_code_t code);
diff --git a/vvp/compile.cc b/vvp/compile.cc
index 989a0f645..63e6f9f75 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -87,6 +87,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%abs/wr", of_ABS_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%add",    of_ADD,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%add/wr", of_ADD_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
+      { "%addi",   of_ADDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%alloc",  of_ALLOC,  1,  {OA_VPI_PTR,  OA_NONE,     OA_NONE} },
       { "%and",    of_AND,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%and/r",  of_ANDR,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
@@ -119,9 +120,10 @@ static const struct opcode_table_s opcode_table[] = {
       { "%cmp/wu", of_CMPWU,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
       { "%cmp/x",  of_CMPX,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%cmp/z",  of_CMPZ,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%concat/str", of_CONCAT_STR, 0,{OA_NONE,  OA_NONE,  OA_NONE} },
-      { "%concat/vec4",of_CONCAT_VEC4,0,{OA_NONE,  OA_NONE,  OA_NONE} },
-      { "%concati/str",of_CONCATI_STR,1,{OA_STRING,OA_NONE,  OA_NONE} },
+      { "%concat/str",  of_CONCAT_STR,  0,{OA_NONE,  OA_NONE,  OA_NONE} },
+      { "%concat/vec4", of_CONCAT_VEC4, 0,{OA_NONE,  OA_NONE,  OA_NONE} },
+      { "%concati/str", of_CONCATI_STR, 1,{OA_STRING,OA_NONE,  OA_NONE} },
+      { "%concati/vec4",of_CONCATI_VEC4,3,{OA_BIT1,  OA_BIT2,  OA_NUMBER} },
       { "%cvt/rs", of_CVT_RS, 1,  {OA_BIT1,     OA_NONE,     OA_NONE} },
       { "%cvt/ru", of_CVT_RU, 1,  {OA_BIT1,     OA_NONE,     OA_NONE} },
       { "%cvt/rv",   of_CVT_RV,  0, {OA_NONE,   OA_NONE,     OA_NONE} },
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index 1c64be7a5..a0b0d6666 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -54,11 +54,15 @@ sum.
 See also the %sub instruction.
 
 * %add
+* %addi <vala>, <valb>, <wid>
 
 This opcode pops and adds two vec4 values from the vec4 stack, adds
 them, and pushes the result back to the stack. The input values must
 have the same size, and the pushed result will have the same width.
 
+The %addi variant takes one operand from the stack; the other is an
+immediate value (see %pushi/vec4).
+
 See also the %sub instruction.
 
 * %add/wr <bit-l>, <bit-r>
@@ -319,12 +323,20 @@ of it as passing the tail, then the head, concatenating them, and
 pushing the result. The stack starts with two strings in the stack,
 and ends with one string in the stack.
 
+The %concati/str form pops only one value from the stack. The right
+part comes from the immediate value.
+
 * %concat/vec4
+* %concati/vec4 <vala>, <valb>, <wid>
 
 Pop two vec4 vectors, concatenate them, and push the combined
 result. The top of the vec4 stack is the LSB of the result, and the
 next in this stack is the MSB bits of the result.
 
+The %concati/vec4 form takes an immediate value and appends it (lsb)
+to the value on the top of the stack. See the %pushi/vec4 instruction
+for how to describe the immediate value.
+
 * %cvt/sr <bit-l>
 * %cvt/rs <bit-l>
 
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 6e2bc8f61..2caafcb21 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -334,7 +334,7 @@ static inline void thr_put_bit(struct vthread_s*thr,
 }
 #endif
 
-vvp_bit4_t vthread_get_bit(struct vthread_s*thr, unsigned addr)
+vvp_bit4_t vthread_get_bit(struct vthread_s* /*thr*/, unsigned addr)
 {
 #if 0
       if (vpi_mode_flag == VPI_MODE_COMPILETF) return BIT4_X;
@@ -345,7 +345,7 @@ vvp_bit4_t vthread_get_bit(struct vthread_s*thr, unsigned addr)
 #endif
 }
 
-void vthread_put_bit(struct vthread_s*thr, unsigned addr, vvp_bit4_t bit)
+void vthread_put_bit(struct vthread_s* /*thr*/, unsigned addr, vvp_bit4_t bit)
 {
 #if 0
       thr_put_bit(thr, addr, bit);
@@ -900,24 +900,47 @@ bool of_AND(vthread_t thr, vvp_code_t)
       return true;
 }
 
-/*
- * %add
- *
- * Pop r,
- * Pop l,
- * Push l+r
- *
- * Pop 2 and push 1 is the same as pop 1 and replace the remaining top
- * of the stack with a new value. That is what we will do.
- */
-bool of_ADD(vthread_t thr, vvp_code_t)
+static void get_immediate_rval(vvp_code_t cp, vvp_vector4_t&val)
 {
-      vvp_vector4_t r = thr->pop_vec4();
-	// Rather then pop l, use it directly from the stack. When we
-	// assign to 'l', that will edit the top of the stack, which
-	// replaces a pop and a pull.
-      vvp_vector4_t&l = thr->peek_vec4();
+      uint32_t vala = cp->bit_idx[0];
+      uint32_t valb = cp->bit_idx[1];
+      unsigned wid  = cp->number;
 
+	// The immediate value can be wider than 32 bits, but
+	// only if the high bits are zero. So at most we need to run
+	// through the loop below 32 times. Maybe less, if the target
+	// width is less. We don't have to do anything special on that
+	// because vala/valb bits will shift away so (vala|valb) will
+	// turn to zero at or before 32 shifts.
+
+      for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
+	    uint32_t ba = 0;
+	      // Convert the vala/valb bits to a ba number that can be
+	      // used to select what goes into the value.
+	    ba = (valb & 1) << 1;
+	    ba |= vala & 1;
+
+	    switch (ba) {
+		case 1:
+		  val.set_bit(idx, BIT4_1);
+		  break;
+		case 2:
+		  val.set_bit(idx, BIT4_Z);
+		  break;
+		case 3:
+		  val.set_bit(idx, BIT4_X);
+		  break;
+		default:
+		  break;
+	    }
+
+	    vala >>= 1;
+	    valb >>= 1;
+      }
+}
+
+static bool do_ADD(vvp_vector4_t&l, const vvp_vector4_t&r)
+{
       unsigned wid = l.size();
       assert(wid == r.size());
 
@@ -946,6 +969,48 @@ bool of_ADD(vthread_t thr, vvp_code_t)
       return true;
 }
 
+/*
+ * %add
+ *
+ * Pop r,
+ * Pop l,
+ * Push l+r
+ *
+ * Pop 2 and push 1 is the same as pop 1 and replace the remaining top
+ * of the stack with a new value. That is what we will do.
+ */
+bool of_ADD(vthread_t thr, vvp_code_t)
+{
+      vvp_vector4_t r = thr->pop_vec4();
+	// Rather than pop l, use it directly from the stack. When we
+	// assign to 'l', that will edit the top of the stack, which
+	// replaces a pop and a pull.
+      vvp_vector4_t&l = thr->peek_vec4();
+
+      return do_ADD(l, r);
+}
+
+/*
+ * %addi <vala>, <valb>, <wid>
+ *
+ * Take one operand from the top of the stack, get the other operand
+ * from the arguments, and replace the top of the stack with the sum.
+ */
+bool of_ADDI(vthread_t thr, vvp_code_t cp)
+{
+      unsigned wid = cp->number;
+
+      vvp_vector4_t&l = thr->peek_vec4();
+
+	// I expect that most of the bits of an immediate value are
+	// going to be zero, so start the result vector with all zero
+	// bits. Then we only need to replace the bits that are different.
+      vvp_vector4_t r (wid, BIT4_0);
+      get_immediate_rval (cp, r);
+
+      return do_ADD(l, r);
+}
+
 bool of_ADD_WR(vthread_t thr, vvp_code_t)
 {
       double r = thr->pop_real();
@@ -1057,7 +1122,7 @@ bool of_ASSIGN_VEC4_A_D(vthread_t thr, vvp_code_t cp)
 		  return true;
 
 	    int use_off = -off;
-	    assert(wid > use_off);
+	    assert(wid > (unsigned)use_off);
 	    unsigned use_wid = wid - use_off;
 	    val = val.subvalue(use_off, use_wid);
 	    off = 0;
@@ -1149,7 +1214,7 @@ bool of_ASSIGN_VEC4_OFF_D(vthread_t thr, vvp_code_t cp)
 		  return true;
 
 	    int use_off = -off;
-	    assert(wid > use_off);
+	    assert(wid > (unsigned)use_off);
 	    unsigned use_wid = wid - use_off;
 	    val = val.subvalue(use_off, use_wid);
 	    off = 0;
@@ -1488,20 +1553,20 @@ bool of_CASSIGN_WR(vthread_t thr, vvp_code_t cp)
  */
 bool of_CAST2(vthread_t thr, vvp_code_t)
 {
-      vvp_vector4_t val = thr->pop_vec4();
+      vvp_vector4_t&val = thr->peek_vec4();
       unsigned wid = val.size();
 
       for (unsigned idx = 0 ; idx < wid ; idx += 1) {
 	    switch (val.value(idx)) {
+		case BIT4_0:
 		case BIT4_1:
-		  val.set_bit(idx, BIT4_1);
 		  break;
 		default:
 		  val.set_bit(idx, BIT4_0);
 		  break;
 	    }
       }
-      thr->push_vec4(val);
+
       return true;
 }
 
@@ -1818,6 +1883,65 @@ bool of_CONCAT_VEC4(vthread_t thr, vvp_code_t)
       return true;
 }
 
+/*
+ * %concati/vec4 <vala>, <valb>, <wid>
+ *
+ * Concat the immediate value to the LOW bits of the concatenation.
+ * Get the HIGH bits from the top of the vec4 stack.
+ */
+bool of_CONCATI_VEC4(vthread_t thr, vvp_code_t cp)
+{
+      uint32_t vala = cp->bit_idx[0];
+      uint32_t valb = cp->bit_idx[1];
+      unsigned wid  = cp->number;
+
+      vvp_vector4_t&msb = thr->peek_vec4();
+
+	// I expect that most of the bits of an immediate value are
+	// going to be zero, so start the result vector with all zero
+	// bits. Then we only need to replace the bits that are different.
+      vvp_vector4_t lsb (wid, BIT4_0);
+
+	// The %concati/vec4 can create values bigger than 32 bits, but
+	// only if the high bits are zero. So at most we need to run
+	// through the loop below 32 times. Maybe less, if the target
+	// width is less. We don't have to do anything special on that
+	// because vala/valb bits will shift away so (vala|valb) will
+	// turn to zero at or before 32 shifts.
+
+      for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
+	    uint32_t ba = 0;
+	      // Convert the vala/valb bits to a ba number that can be
+	      // used to select what goes into the value.
+	    ba = (valb & 1) << 1;
+	    ba |= vala & 1;
+
+	    switch (ba) {
+		case 1:
+		  lsb.set_bit(idx, BIT4_1);
+		  break;
+		case 2:
+		  lsb.set_bit(idx, BIT4_Z);
+		  break;
+		case 3:
+		  lsb.set_bit(idx, BIT4_X);
+		  break;
+		default:
+		  break;
+	    }
+
+	    vala >>= 1;
+	    valb >>= 1;
+      }
+
+      vvp_vector4_t res (msb.size()+lsb.size(), BIT4_X);
+      res.set_vec(0, lsb);
+      res.set_vec(lsb.size(), msb);
+
+      msb = res;
+      return true;
+}
+
 bool of_CVT_RS(vthread_t thr, vvp_code_t cp)
 {
       int64_t r = thr->words[cp->bit_idx[0]].w_int;
@@ -4136,12 +4260,14 @@ bool of_XNORR(vthread_t thr, vvp_code_t)
       return true;
 }
 
+/*
+ * %or
+ */
 bool of_OR(vthread_t thr, vvp_code_t)
 {
-      vvp_vector4_t vala = thr->pop_vec4();
       vvp_vector4_t valb = thr->pop_vec4();
+      vvp_vector4_t&vala = thr->peek_vec4();
       vala |= valb;
-      thr->push_vec4(vala);
       return true;
 }