From 5ef077fdf6a27dbf729dadf789024b8d0148b054 Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Fri, 27 Dec 2013 17:04:42 +0200
Subject: [PATCH] Start work on converting vec4 expressions to use stack.

Instead of using a bit4 space to hold thread vectors, create a
vec4 stack--much like the real, string, and object stacks--to
hold intermediate values.
---
 tgt-vvp/Makefile.in   |    1 +
 tgt-vvp/eval_expr.c   |   46 +-
 tgt-vvp/eval_vec4.c   |  498 ++++++++++++++++++
 tgt-vvp/stmt_assign.c |   35 +-
 tgt-vvp/vvp.c         |   26 +
 tgt-vvp/vvp_priv.h    |   12 +
 tgt-vvp/vvp_process.c |  192 ++++---
 vvp/codes.h           |   29 +-
 vvp/compile.cc        |   45 +-
 vvp/opcodes.txt       |  210 ++++++--
 vvp/vthread.cc        | 1169 +++++++++++++++++++++++++++++++----------
 11 files changed, 1802 insertions(+), 461 deletions(-)
 create mode 100644 tgt-vvp/eval_vec4.c

diff --git a/tgt-vvp/Makefile.in b/tgt-vvp/Makefile.in
index 0aee3a915..0a793a29c 100644
--- a/tgt-vvp/Makefile.in
+++ b/tgt-vvp/Makefile.in
@@ -50,6 +50,7 @@ LDFLAGS = @LDFLAGS@
 O = vvp.o draw_class.o draw_enum.o draw_mux.o draw_net_input.o \
     draw_switch.o draw_ufunc.o draw_vpi.o \
     eval_bool.o eval_expr.o eval_object.o eval_real.o eval_string.o \
+    eval_vec4.o \
     modpath.o stmt_assign.o vector.o \
     vvp_process.o vvp_scope.o
 
diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c
index c315039bc..a9d15198f 100644
--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@@ -189,7 +189,7 @@ uint64_t get_number_immediate64(ivl_expr_t expr)
       return imm;
 }
 
-static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
+void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 {
       switch (ivl_expr_type(expr)) {
 
@@ -200,7 +200,7 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 		    if (number_is_unknown(expr)) {
 			    /* We are loading a 'bx so mimic %ix/get. */
 			  fprintf(vvp_out, "    %%ix/load %u, 0, 0;\n", ix);
-			  fprintf(vvp_out, "    %%mov 4, 1, 1;\n");
+			  fprintf(vvp_out, "    %%flag_set/imm 4, 1;\n");
 			  break;
 		    }
 		    long imm = get_number_immediate(expr);
@@ -210,11 +210,14 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 			  fprintf(vvp_out, "    %%ix/load %u, 0, 0; loading %ld\n", ix, imm);
 			  fprintf(vvp_out, "    %%ix/sub %u, %ld, 0;\n", ix, -imm);
 		    }
-		      /* This can not have have a X/Z value so clear bit 4. */
-		    fprintf(vvp_out, "    %%mov 4, 0, 1;\n");
+		      /* This can not have an X/Z value so clear flag 4. */
+		    fprintf(vvp_out, "    %%flag_set/imm 4, 0;\n");
 	      }
 	      break;
 
+		/* Special case: There is an %ix instruction for
+		   reading index values directly from variables. In
+		   this case, try to use that special instruction. */
 	  case IVL_EX_SIGNAL: {
 		ivl_signal_t sig = ivl_expr_signal(expr);
 
@@ -227,11 +230,8 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 			   variable array. In this case, the ix/getv
 			   will not work, so do it the hard way. */
 		      if (ivl_signal_type(sig) == IVL_SIT_REG) {
-			    struct vector_info rv;
-			    rv = draw_eval_expr(expr, 0);
-			    fprintf(vvp_out, "    %%ix/get%s %u, %u, %u;\n",
-				    type, ix, rv.base, rv.wid);
-			    clr_vector(rv);
+			    draw_eval_vec4(expr, 0);
+			    fprintf(vvp_out, "    %%ix/vec4%s %u;\n", type, ix);
 			    break;
 		      }
 
@@ -240,11 +240,8 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 		            assert(! number_is_unknown(ixe));
 		            word = get_number_immediate(ixe);
 		      } else {
-		            struct vector_info rv;
-		            rv = draw_eval_expr(expr, 0);
-		            fprintf(vvp_out, "    %%ix/get%s %u, %u, %u;\n",
-		                    type, ix, rv.base, rv.wid);
-		            clr_vector(rv);
+		            draw_eval_vec4(expr, 0);
+		            fprintf(vvp_out, "    %%ix/vec4%s %u;\n", type, ix);
 		            break;
 		      }
 		}
@@ -254,20 +251,15 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 		break;
 	  }
 
-	  default: {
-		  struct vector_info rv;
-		  rv = draw_eval_expr(expr, 0);
-		    /* Is this a signed expression? */
-		  if (ivl_expr_signed(expr)) {
-		      fprintf(vvp_out, "    %%ix/get/s %u, %u, %u;\n",
-		                       ix, rv.base, rv.wid);
-		  } else {
-		      fprintf(vvp_out, "    %%ix/get %u, %u, %u;\n",
-		                       ix, rv.base, rv.wid);
-		  }
-		  clr_vector(rv);
-		  break;
+	  default:
+	    draw_eval_vec4(expr, 0);
+	      /* Is this a signed expression? */
+	    if (ivl_expr_signed(expr)) {
+		  fprintf(vvp_out, "    %%ix/vec4/s %u;\n", ix);
+	    } else {
+		  fprintf(vvp_out, "    %%ix/vec4 %u;\n", ix);
 	    }
+	    break;
       }
 }
 
diff --git a/tgt-vvp/eval_vec4.c b/tgt-vvp/eval_vec4.c
new file mode 100644
index 000000000..7781625b4
--- /dev/null
+++ b/tgt-vvp/eval_vec4.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2013 Stephen Williams (steve@icarus.com)
+ *
+ *    This source code is free software; you can redistribute it
+ *    and/or modify it in source code form under the terms of the GNU
+ *    General Public License as published by the Free Software
+ *    Foundation; either version 2 of the License, or (at your option)
+ *    any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This file includes functions for evaluating VECTOR expressions.
+ */
+# include  "vvp_priv.h"
+# include  <string.h>
+# include  <stdlib.h>
+# include  <math.h>
+# include  <assert.h>
+# include  <stdbool.h>
+
+/* Binary arithmetic (+, -, *): push the left operand, push the right
+   operand, then emit the matching %add, %sub or %mul. */
+static void draw_binary_vec4_arith(ivl_expr_t expr, int stuff_ok_flag)
+{
+      char op;
+
+      draw_eval_vec4(ivl_expr_oper1(expr), stuff_ok_flag);
+      draw_eval_vec4(ivl_expr_oper2(expr), stuff_ok_flag);
+
+	/* The opcode is looked up only after both operands are drawn. */
+      op = ivl_expr_opcode(expr);
+      if (op == '+')
+	    fprintf(vvp_out, "    %%add;\n");
+      else if (op == '-')
+	    fprintf(vvp_out, "    %%sub;\n");
+      else if (op == '*')
+	    fprintf(vvp_out, "    %%mul;\n");
+      else
+	    assert(0); /* not an arithmetic opcode */
+}
+
+/* Bitwise & and |: both operands are pushed, the left one first,
+   followed by the %and or %or instruction. */
+static void draw_binary_vec4_bitwise(ivl_expr_t expr, int stuff_ok_flag)
+{
+      char op;
+
+      draw_eval_vec4(ivl_expr_oper1(expr), stuff_ok_flag);
+      draw_eval_vec4(ivl_expr_oper2(expr), stuff_ok_flag);
+
+      op = ivl_expr_opcode(expr);
+      if (op == '&')
+	    fprintf(vvp_out, "    %%and;\n");
+      else if (op == '|')
+	    fprintf(vvp_out, "    %%or;\n");
+      else
+	    assert(0); /* not a bitwise opcode */
+}
+
+/* == and != with real operands: evaluate both sides as reals, emit
+   %cmp/wr and read flag 4 with %flag_get/vec4; != appends %inv. */
+static void draw_binary_vec4_compare_real(ivl_expr_t expr)
+{
+      char op;
+
+      draw_eval_real(ivl_expr_oper1(expr));
+      draw_eval_real(ivl_expr_oper2(expr));
+
+      op = ivl_expr_opcode(expr);
+      if (op != 'e' && op != 'n') {
+	    assert(0);
+	    return;
+      }
+      fprintf(vvp_out, "    %%cmp/wr;\n");
+      fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
+      if (op == 'n') /* != is drawn as the inverse of == */
+	    fprintf(vvp_out, "    %%inv;\n");
+}
+
+/* ==, !=, === and !==. Real operands are handed to the real-valued
+   helper; otherwise both sides are pushed and compared with %cmp/u.
+   Flag 4 is read for ==/!= and flag 6 for ===/!==, and the negated
+   forms append %inv. */
+static void draw_binary_vec4_compare(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t lhs = ivl_expr_oper1(expr);
+      ivl_expr_t rhs = ivl_expr_oper2(expr);
+      int is_case_eq = 0, is_negated = 0;
+
+      if (ivl_expr_value(lhs) == IVL_VT_REAL
+	  || ivl_expr_value(rhs) == IVL_VT_REAL) {
+	    draw_binary_vec4_compare_real(expr);
+	    return;
+      }
+
+      draw_eval_vec4(lhs, stuff_ok_flag);
+      draw_eval_vec4(rhs, stuff_ok_flag);
+
+      switch (ivl_expr_opcode(expr)) {
+	  case 'e': is_case_eq = 0; is_negated = 0; break; /* == */
+	  case 'n': is_case_eq = 0; is_negated = 1; break; /* != */
+	  case 'E': is_case_eq = 1; is_negated = 0; break; /* === */
+	  case 'N': is_case_eq = 1; is_negated = 1; break; /* !== */
+	  default:
+	    assert(0);
+	    return;
+      }
+
+      fprintf(vvp_out, "    %%cmp/u;\n");
+      if (is_case_eq)
+	    fprintf(vvp_out, "    %%flag_get/vec4 6;\n");
+      else
+	    fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
+      if (is_negated)
+	    fprintf(vvp_out, "    %%inv;\n");
+}
+
+/* Ordering tests (<, <=, >, >=) on real operands. > and >= are drawn
+   as < and <= with the operands evaluated in the opposite order.
+   After %cmp/wr, flag 5 is read for the strict tests; the inclusive
+   tests read flags 4 and 5 and %or them together. */
+static void draw_binary_vec4_le_real(ivl_expr_t expr)
+{
+      ivl_expr_t first = ivl_expr_oper1(expr);
+      ivl_expr_t second = ivl_expr_oper2(expr);
+      int inclusive = 0, reversed = 0;
+
+      switch (ivl_expr_opcode(expr)) {
+	  case '<':
+	    break;
+	  case 'L': /* <= */
+	    inclusive = 1;
+	    break;
+	  case '>':
+	    reversed = 1;
+	    break;
+	  case 'G': /* >= */
+	    inclusive = 1;
+	    reversed = 1;
+	    break;
+	  default:
+	    assert(0);
+	    return;
+      }
+
+      if (reversed) {
+	    ivl_expr_t swap = first;
+	    first = second;
+	    second = swap;
+      }
+
+      draw_eval_real(first);
+      draw_eval_real(second);
+      fprintf(vvp_out, "    %%cmp/wr;\n");
+      if (inclusive)
+	    fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
+      fprintf(vvp_out, "    %%flag_get/vec4 5;\n");
+      if (inclusive)
+	    fprintf(vvp_out, "    %%or;\n");
+}
+
+/*
+ * Ordering tests (<, <=, >, >=) on vec4 operands; real operands are
+ * handed to the real-valued helper. > and >= are rewritten as < and
+ * <= by exchanging the operands. The compare is %cmp/s only when
+ * both operands are signed, otherwise %cmp/u. Flag 5 is read for <;
+ * flags 4 and 5 are read and %or'ed for <=.
+ */
+static void draw_binary_vec4_le(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t lhs = ivl_expr_oper1(expr);
+      ivl_expr_t rhs = ivl_expr_oper2(expr);
+
+      if (ivl_expr_value(lhs) == IVL_VT_REAL
+	  || ivl_expr_value(rhs) == IVL_VT_REAL) {
+	    draw_binary_vec4_le_real(expr);
+	    return;
+      }
+
+      char op = ivl_expr_opcode(expr);
+	/* A signed compare is used only if both sides are signed. */
+      char sign_suffix = 'u';
+      if (ivl_expr_signed(lhs) && ivl_expr_signed(rhs))
+	    sign_suffix = 's';
+
+	/* Turn the "greater" forms into their "less" mirror image. */
+      if (op == 'G' || op == '>') {
+	    ivl_expr_t swap = lhs;
+
+	    lhs = rhs;
+	    rhs = swap;
+	    op = (op == 'G') ? 'L' : '<';
+      }
+
+      draw_eval_vec4(lhs, stuff_ok_flag);
+      draw_eval_vec4(rhs, stuff_ok_flag);
+
+	/* Anything else is not an ordering operator. */
+      if (op != 'L' && op != '<') {
+	    assert(0);
+	    return;
+      }
+
+	/* Strict < reads flag 5 alone; <= reads flag 4 as well and
+	   combines the two bits. */
+      fprintf(vvp_out, "    %%cmp/%c;\n", sign_suffix);
+      if (op == 'L')
+	    fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
+      fprintf(vvp_out, "    %%flag_get/vec4 5;\n");
+      if (op == 'L')
+	    fprintf(vvp_out, "    %%or;\n");
+}
+
+static void draw_binary_vec4_lor(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t le = ivl_expr_oper1(expr);
+      ivl_expr_t re = ivl_expr_oper2(expr);
+	/* Logical or (||); stuff_ok_flag is not consulted. Push the
+	   left expression, reducing it to a single bit if it is
+	   wider than that. */
+      draw_eval_vec4(le, STUFF_OK_XZ);
+      if (ivl_expr_width(le) > 1)
+	    fprintf(vvp_out, "    %%or/r;\n");
+
+	/* Now push the right expression. Again, reduce to a single
+	   bit if necessary. */
+      draw_eval_vec4(re, STUFF_OK_XZ);
+      if (ivl_expr_width(re) > 1)
+	    fprintf(vvp_out, "    %%or/r;\n");
+
+      fprintf(vvp_out, "    %%or;\n"); /* combine the two reduced bits */
+
+      if (ivl_expr_width(expr) > 1) /* widen to the expression width */
+	    fprintf(vvp_out, "    %%pad/u %u;\n", ivl_expr_width(expr));
+}
+
+static void draw_binary_vec4_lrs(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t le = ivl_expr_oper1(expr);
+      ivl_expr_t re = ivl_expr_oper2(expr);
+
+	// Shifts (<<, >>, >>>): push the value to be shifted.
+      draw_eval_vec4(le, stuff_ok_flag);
+
+	// Calculate the shift amount into an index register.
+      int use_index_reg = allocate_word();
+      assert(use_index_reg >= 0);
+      draw_eval_expr_into_integer(re, use_index_reg);
+
+	// Emit the actual shift instruction. This will pop the top of
+	// the stack and replace it with the result of the shift.
+      switch (ivl_expr_opcode(expr)) {
+	  case 'l': /* << */
+	    fprintf(vvp_out, "    %%shiftl %d;\n", use_index_reg);
+	    break;
+	  case 'r': /* >> */
+	    fprintf(vvp_out, "    %%shiftr %d;\n", use_index_reg);
+	    break;
+	  case 'R': /* >>> */
+	    fprintf(vvp_out, "    %%shiftrs %d;\n", use_index_reg);
+	    break;
+	  default:
+	    assert(0);
+	    break;
+      }
+
+      clr_word(use_index_reg);
+}
+
+static void draw_binary_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      switch (ivl_expr_opcode(expr)) { /* route each opcode to its generator */
+	  case '+':
+	  case '-':
+	  case '*':
+	    draw_binary_vec4_arith(expr, stuff_ok_flag);
+	    break;
+
+	  case '&':
+	  case '|':
+	    draw_binary_vec4_bitwise(expr, stuff_ok_flag);
+	    break;
+
+	  case 'e': /* == */
+	  case 'E': /* === */
+	  case 'n': /* != */
+	  case 'N': /* !== */
+	    draw_binary_vec4_compare(expr, stuff_ok_flag);
+	    break;
+
+	  case 'G': /* >= */
+	  case 'L': /* <= */
+	  case '>':
+	  case '<':
+	    draw_binary_vec4_le(expr, stuff_ok_flag);
+	    break;
+
+	  case 'l': /* << */
+	  case 'r': /* >> */
+	  case 'R': /* >>> */
+	    draw_binary_vec4_lrs(expr, stuff_ok_flag);
+	    break;
+
+	  case 'o': /* || (logical or) */
+	    draw_binary_vec4_lor(expr, stuff_ok_flag);
+	    break;
+
+	  default: /* no vec4 generator for this operator */
+	    fprintf(stderr, "vvp.tgt error: unsupported binary (%c)\n",
+		    ivl_expr_opcode(expr));
+	    assert(0);
+      }
+}
+
+/* Push a literal with %pushi/vec4. Per bit, (val0,valx) is (0,0)
+   for 0, (1,0) for 1, (1,1) for x and (0,1) for z. */
+static void draw_number_vec4(ivl_expr_t expr)
+{
+      unsigned long val0 = 0;
+      unsigned long valx = 0;
+      unsigned wid = ivl_expr_width(expr);
+      const char*bits = ivl_expr_bits(expr);
+      unsigned idx;
+
+	/* unsigned long is not 64 bits on every host; bound by its size. */
+      assert(wid <= 8*sizeof(val0));
+      for (idx = 0 ; idx < wid ; idx += 1) {
+	    val0 <<= 1;
+	    valx <<= 1;
+	    switch (bits[wid-idx-1]) {
+		case '0':
+		  break;
+		case '1':
+		  val0 |= 1;
+		  break;
+		case 'x':
+		  val0 |= 1;
+		  valx |= 1;
+		  break;
+		case 'z':
+		  valx |= 1;
+		  break;
+		default:
+		  assert(0);
+		  break;
+	    }
+      }
+      fprintf(vvp_out, "    %%pushi/vec4 %lu, %lu, %u;\n", val0, valx, wid);
+}
+
+/* Part select: push the expression being selected from, push the
+   select base, then emit %part with the width of the result. */
+static void draw_select_vec4(ivl_expr_t expr)
+{
+      ivl_expr_t source = ivl_expr_oper1(expr);
+      ivl_expr_t offset = ivl_expr_oper2(expr);
+      unsigned part_wid = ivl_expr_width(expr);
+
+      draw_eval_vec4(source, 0);
+      draw_eval_vec4(offset, 0);
+
+      fprintf(vvp_out, "    %%part %u;\n", part_wid);
+}
+
+/*
+ * Pad or truncate: push the sub-expression, then resize it to the
+ * width of this expression with %pad/s (signed expression) or %pad/u.
+ * Nothing more is emitted when the widths already agree.
+ */
+static void draw_select_pad_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t inner = ivl_expr_oper1(expr);
+      unsigned target_wid = ivl_expr_width(expr);
+
+      draw_eval_vec4(inner, stuff_ok_flag);
+
+	/* Resize only if the widths differ. */
+      if (target_wid != ivl_expr_width(inner)) {
+	    if (ivl_expr_signed(expr))
+		  fprintf(vvp_out, "    %%pad/s %u;\n", target_wid);
+	    else
+		  fprintf(vvp_out, "    %%pad/u %u;\n", target_wid);
+      }
+}
+
+static void draw_signal_vec4(ivl_expr_t expr)
+{
+      ivl_signal_t sig = ivl_expr_signal(expr);
+	/* Emit a %load/vec4 of the signal; array signals are not handled. */
+      assert(ivl_signal_dimensions(sig) == 0);
+      fprintf(vvp_out, "    %%load/vec4 v%p_0;\n", sig);
+}
+
+static void draw_ternary_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t cond = ivl_expr_oper1(expr);
+      ivl_expr_t true_ex = ivl_expr_oper2(expr);
+      ivl_expr_t false_ex = ivl_expr_oper3(expr);
+
+      unsigned lab_true  = local_count++;
+      unsigned lab_out   = local_count++;
+      int use_flag = allocate_flag();
+      assert(use_flag >= 0); /* allocate_flag() gives -1 if none is free */
+
+	/* Evaluate the condition expression, including optionally
+	   reducing it to a single bit. Put the result into a flag bit
+	   for use by all the tests. */
+      draw_eval_vec4(cond, STUFF_OK_XZ);
+      if (ivl_expr_width(cond) > 1)
+	    fprintf(vvp_out, "    %%or/r;\n");
+      fprintf(vvp_out, "    %%flag_set/vec4 %d;\n", use_flag);
+
+      fprintf(vvp_out, "    %%jmp/0 T_%u.%u, %d;\n", thread_count, lab_true, use_flag);
+
+	/* If the condition is true or xz (not false), we need the true
+	   expression. If the condition is true, then we ONLY need the
+	   true expression. */
+      draw_eval_vec4(true_ex, stuff_ok_flag);
+      fprintf(vvp_out, "    %%jmp/1 T_%u.%u, %d;\n", thread_count, lab_out, use_flag);
+      fprintf(vvp_out, "T_%u.%u ; End of true expr.\n", thread_count, lab_true);
+
+	/* If the condition is false or xz (not true), we need the false
+	   expression. If the condition is false, then we ONLY need
+	   the false expression. */
+      draw_eval_vec4(false_ex, stuff_ok_flag);
+      fprintf(vvp_out, "    %%jmp/0 T_%u.%u, %d;\n", thread_count, lab_out, use_flag);
+      fprintf(vvp_out, " ; End of false expr.\n");
+
+	/* Here, the condition is not true or false, it is xz. Both
+	   the true and false expressions have been pushed onto the
+	   stack, we just need to blend the bits. */
+      fprintf(vvp_out, "    %%blend;\n");
+      fprintf(vvp_out, "T_%u.%u;\n", thread_count, lab_out);
+
+      clr_flag(use_flag);
+}
+
+static void draw_unary_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t sub = ivl_expr_oper1(expr);
+
+      switch (ivl_expr_opcode(expr)) {
+	  case '~': /* bitwise inversion: push the operand, then %inv */
+	    draw_eval_vec4(sub, stuff_ok_flag);
+	    fprintf(vvp_out, "    %%inv;\n");
+	    break;
+	  default: /* other operators only report; no code is emitted */
+	    fprintf(stderr, "XXXX Unary operator %c not implemented\n", ivl_expr_opcode(expr));
+	    break;
+      }
+}
+
+/* Dispatch on expression type; unhandled types only log an XXXX note. */
+void draw_eval_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      switch (ivl_expr_type(expr)) {
+	  case IVL_EX_BINARY:
+	    draw_binary_vec4(expr, stuff_ok_flag);
+	    break;
+
+	  case IVL_EX_NUMBER:
+	    draw_number_vec4(expr);
+	    break;
+
+	  case IVL_EX_SELECT:
+	    if (ivl_expr_oper2(expr) != 0)
+		  draw_select_vec4(expr);
+	    else
+		  draw_select_pad_vec4(expr, stuff_ok_flag);
+	    break;
+
+	  case IVL_EX_SIGNAL:
+	    draw_signal_vec4(expr);
+	    break;
+
+	  case IVL_EX_TERNARY:
+	    draw_ternary_vec4(expr, stuff_ok_flag);
+	    break;
+
+	  case IVL_EX_UNARY:
+	    draw_unary_vec4(expr, stuff_ok_flag);
+	    break;
+
+	  default:
+	    fprintf(stderr, "XXXX Evaluate VEC4 expression (%d)\n", ivl_expr_type(expr));
+	    fprintf(vvp_out, "; XXXX Evaluate VEC4 expression (%d)\n", ivl_expr_type(expr));
+	    break;
+      }
+}
diff --git a/tgt-vvp/stmt_assign.c b/tgt-vvp/stmt_assign.c
index 4201778aa..db4b08638 100644
--- a/tgt-vvp/stmt_assign.c
+++ b/tgt-vvp/stmt_assign.c
@@ -339,6 +339,7 @@ static ivl_type_t draw_lval_expr(ivl_lval_t lval)
       return ivl_type_prop_type(sub_type, ivl_lval_property_idx(lval));
 }
 
+#if 0
 static void set_vec_to_lval_slice_nest(ivl_lval_t lval, unsigned bit, unsigned wid)
 {
       ivl_lval_t lval_nest = ivl_lval_nest(lval);
@@ -349,7 +350,9 @@ static void set_vec_to_lval_slice_nest(ivl_lval_t lval, unsigned bit, unsigned w
 	      ivl_lval_property_idx(lval), bit, wid);
       fprintf(vvp_out, "    %%pop/obj 1, 0;\n");
 }
+#endif
 
+#if 0
 static void set_vec_to_lval_slice(ivl_lval_t lval, unsigned bit, unsigned wid)
 {
       ivl_signal_t sig  = ivl_lval_sig(lval);
@@ -507,8 +510,8 @@ static void set_vec_to_lval_slice(ivl_lval_t lval, unsigned bit, unsigned wid)
 
       }
 }
-
-
+#endif
+#if 0
 /*
  * This is a private function to generate %set code for the
  * statement. At this point, the r-value is evaluated and stored in
@@ -542,6 +545,24 @@ static void set_vec_to_lval(ivl_statement_t net, struct vector_info res)
 	    cur_rbit += bit_limit;
       }
 }
+#endif
+
+/*
+ * Store the vector on top of the vec4 stack to the statement l-value
+ * with %store/vec4. For now this handles exactly one l-value that
+ * covers the whole width of its signal; anything else asserts.
+ */
+static void store_vec4_to_lval(ivl_statement_t net)
+{
+      assert(ivl_stmt_lvals(net) == 1); /* no concatenated l-values yet */
+
+      ivl_lval_t lval = ivl_stmt_lval(net,0);
+      ivl_signal_t lsig = ivl_lval_sig(lval);
+
+      assert(ivl_lval_width(lval) == ivl_signal_width(lsig)); /* no part selects yet */
+
+      fprintf(vvp_out, "    %%store/vec4 v%p_0, %u;\n", lsig, ivl_signal_width(lsig));
+}
 
 static int show_stmt_assign_vector(ivl_statement_t net)
 {
@@ -554,7 +575,7 @@ static int show_stmt_assign_vector(ivl_statement_t net)
 	   of the l-value. We need these values as part of the r-value
 	   calculation. */
       if (ivl_stmt_opcode(net) != 0) {
-	    slices = calloc(ivl_stmt_lvals(net), sizeof(struct vec_slice_info));
+            slices = calloc(ivl_stmt_lvals(net), sizeof(struct vec_slice_info));
 	    lres = get_vec_from_lval(net, slices);
       }
 
@@ -563,7 +584,7 @@ static int show_stmt_assign_vector(ivl_statement_t net)
 	   result to a vector. Then store that vector into the
 	   l-value. */
       if (ivl_expr_value(rval) == IVL_VT_REAL) {
-	    draw_eval_real(rval);
+            draw_eval_real(rval);
 	      /* This is the accumulated with of the l-value of the
 		 assignment. */
 	    unsigned wid = ivl_stmt_lwidth(net);
@@ -582,12 +603,14 @@ static int show_stmt_assign_vector(ivl_statement_t net)
 	    fprintf(vvp_out, "    %%cvt/vr %u, %u;\n", res.base, res.wid);
 
       } else {
-	    res = draw_eval_expr(rval, 0);
+	    draw_eval_vec4(rval, 0);
+	    res.base = 0; // XXXX This is just to suppress the clr_vector below.
+	    res.wid = 0;
       }
 
       switch (ivl_stmt_opcode(net)) {
 	  case 0:
-	    set_vec_to_lval(net, res);
+	    store_vec4_to_lval(net);
 	    break;
 
 	  case '+':
diff --git a/tgt-vvp/vvp.c b/tgt-vvp/vvp.c
index c7af7e4ce..fd32dd9a5 100644
--- a/tgt-vvp/vvp.c
+++ b/tgt-vvp/vvp.c
@@ -48,6 +48,8 @@ FILE*vvp_out = 0;
 int vvp_errors = 0;
 unsigned show_file_line = 0;
 
+static uint32_t allocate_flag_mask = 0x00ff;
+
 __inline__ static void draw_execute_header(ivl_design_t des)
 {
       const char*cp = ivl_design_flag(des, "VVP_EXECUTABLE");
@@ -85,6 +87,30 @@ __inline__ static void draw_module_declarations(ivl_design_t des)
       }
 }
 
+/* Claim the lowest free flag bit and return its index, or -1 if none. */
+int allocate_flag(void)
+{
+      unsigned idx;
+      for (idx = 0 ; idx < 8*sizeof(allocate_flag_mask) ; idx += 1) {
+	      /* Shift an unsigned one: 1 << 31 overflows a signed int. */
+	    uint32_t mask = (uint32_t)1 << idx;
+	    if (allocate_flag_mask & mask)
+		  continue;
+	    allocate_flag_mask |= mask;
+	    return (int)idx;
+      }
+      return -1;
+}
+
+/* Release a flag bit obtained from allocate_flag(). */
+void clr_flag(int idx)
+{
+      assert(idx >= 0 && (unsigned)idx < 8*sizeof(allocate_flag_mask));
+      uint32_t mask = (uint32_t)1 << idx; /* 1 << 31 would overflow int */
+
+      assert(allocate_flag_mask & mask); /* must currently be in use */
+      allocate_flag_mask &= ~mask;
+}
 
 int target_design(ivl_design_t des)
 
diff --git a/tgt-vvp/vvp_priv.h b/tgt-vvp/vvp_priv.h
index 40d8d87dc..957288e16 100644
--- a/tgt-vvp/vvp_priv.h
+++ b/tgt-vvp/vvp_priv.h
@@ -306,6 +306,12 @@ extern int number_is_immediate(ivl_expr_t ex, unsigned lim_wid, int negative_is_
 extern long get_number_immediate(ivl_expr_t ex);
 extern uint64_t get_number_immediate64(ivl_expr_t ex);
 
+/*
+ * draw_eval_vec4 evaluates vec4 expressions. The result of the
+ * evaluation is the vec4 result in the top of the vec4 expression stack.
+ */
+extern void draw_eval_vec4(ivl_expr_t ex, int stuff_ok_flag);
+
 /*
  * draw_eval_real evaluates real value expressions. The result of the
  * evaluation is the real result in the top of the real expression stack.
@@ -342,6 +348,12 @@ extern void show_stmt_file_line(ivl_statement_t net, const char*desc);
 extern int allocate_word(void);
 extern void clr_word(int idx);
 
+/*
+ * These functions manage flag bit allocation.
+ */
+extern int allocate_flag(void);
+extern void clr_flag(int idx);
+
 /*
  * These are used to count labels as I generate code.
  */
diff --git a/tgt-vvp/vvp_process.c b/tgt-vvp/vvp_process.c
index f9aca0528..f4808f669 100644
--- a/tgt-vvp/vvp_process.c
+++ b/tgt-vvp/vvp_process.c
@@ -209,9 +209,9 @@ static void assign_to_array_word(ivl_signal_t lsig, ivl_expr_t word_ix,
       clear_expression_lookaside();
 }
 
-static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
+static void assign_to_lvector(ivl_lval_t lval,
 			      uint64_t delay, ivl_expr_t dexp,
-			      unsigned width, unsigned nevents)
+			      unsigned nevents)
 {
       ivl_signal_t sig = ivl_lval_sig(lval);
       ivl_expr_t part_off_ex = ivl_lval_part_off(lval);
@@ -221,9 +221,13 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
       const unsigned long use_word = 0;
 
       if (ivl_signal_dimensions(sig) > 0) {
+#if 0
 	    assert(word_ix);
 	    assign_to_array_word(sig, word_ix, bit, delay, dexp, part_off_ex,
 	                         width, nevents);
+#else
+	    fprintf(stderr, "XXXX %%assign to array word not supported yet.\n");
+#endif
 	    return;
       }
 
@@ -247,9 +251,13 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 		  draw_eval_expr_into_integer(part_off_ex, 1);
 		    /* If the index expression has XZ bits, skip the assign. */
 		  fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
+#if 0
 		  fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
 		  fprintf(vvp_out, "    %%assign/v0/x1/d v%p_%lu, %d, %u;\n",
 		          sig, use_word, delay_index, bit);
+#else
+		  assert(0); // XXXX
+#endif
 		  fprintf(vvp_out, "t_%u ;\n", skip_assign);
 		  clr_word(delay_index);
 	    } else if (nevents != 0) {
@@ -257,9 +265,13 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 		  draw_eval_expr_into_integer(part_off_ex, 1);
 		    /* If the index expression has XZ bits, skip the assign. */
 		  fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
+#if 0
 		  fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
 		  fprintf(vvp_out, "    %%assign/v0/x1/e v%p_%lu, %u;\n",
 		          sig, use_word, bit);
+#else
+		  assert(0); // XXXX
+#endif
 		  fprintf(vvp_out, "t_%u ;\n", skip_assign);
 		  fprintf(vvp_out, "    %%evctl/c;\n");
 	    } else {
@@ -267,6 +279,7 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 		  draw_eval_expr_into_integer(part_off_ex, 1);
 		    /* If the index expression has XZ bits, skip the assign. */
 		  fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
+#if 0
 		  fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
 		    /*
 		     * The %assign can only take a 32 bit delay. For a larger
@@ -285,10 +298,14 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 			        "    %%assign/v0/x1 v%p_%lu, %lu, %u;\n",
 			        sig, use_word, low_d, bit);
 		  }
+#else
+		  assert(0); // XXXX
+#endif
 		  fprintf(vvp_out, "t_%u ;\n", skip_assign);
 	    }
 
       } else if (part_off>0 || ivl_lval_width(lval)!=ivl_signal_width(sig)) {
+#if 0
 	      /* There is no mux expression, but a constant part
 		 offset. Load that into index x1 and generate a
 		 single-bit set instruction. */
@@ -331,23 +348,41 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 			        sig, use_word, low_d, bit);
 		  }
 	    }
+#else
+	    if (dexp != 0) {
+		  assert(0); // XXXX
+
+	    } else if (nevents != 0) {
+		  assert(0); // XXXX
+
+	    } else {
+		  int offset_index = allocate_word();
+		  int delay_index = allocate_word();
+		  fprintf(vvp_out, "    %%ix/load %d, %lu, 0;\n", offset_index, part_off);
+		  if (dexp)
+			draw_eval_expr_into_integer(dexp,delay_index);
+		  else
+			fprintf(vvp_out, "    %%ix/load %d, %lu, %lu;\n",
+				delay_index, low_d, hig_d);
+		  fprintf(vvp_out, "    %%assign/vec4/off/d v%p_%lu, %d, %d;\n",
+			  sig, use_word, offset_index, delay_index);
+		  clr_word(offset_index);
+		  clr_word(delay_index);
+	    }
+#endif
 
       } else if (dexp != 0) {
 	      /* Calculated delay... */
 	    int delay_index = allocate_word();
 	    draw_eval_expr_into_integer(dexp, delay_index);
-	    fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
-	    fprintf(vvp_out, "    %%assign/v0/d v%p_%lu, %d, %u;\n",
-		    sig, use_word, delay_index, bit);
+	    fprintf(vvp_out, "    %%assign/vec4/d v%p_%lu, %d;\n",
+		    sig, use_word, delay_index);
 	    clr_word(delay_index);
       } else if (nevents != 0) {
 	      /* Event control delay... */
-	    fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
-	    fprintf(vvp_out, "    %%assign/v0/e v%p_%lu, %u;\n",
-		    sig, use_word, bit);
+	    fprintf(vvp_out, "    %%assign/vec4/e v%p_%lu;\n",
+		    sig, use_word);
       } else {
-	      /* Constant delay... */
-	    fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
 	      /*
 	       * The %assign can only take a 32 bit delay. For a larger
 	       * delay we need to put it into an index register.
@@ -356,12 +391,12 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 		  int delay_index = allocate_word();
 		  fprintf(vvp_out, "    %%ix/load %d, %lu, %lu;\n",
 		          delay_index, low_d, hig_d);
-		  fprintf(vvp_out, "    %%assign/v0/d v%p_%lu, %d, %u;\n",
-		          sig, use_word, delay_index, bit);
+		  fprintf(vvp_out, "    %%assign/vec4/d v%p_%lu, %d;\n",
+		          sig, use_word, delay_index);
 		  clr_word(delay_index);
 	    } else {
-		  fprintf(vvp_out, "    %%assign/v0 v%p_%lu, %lu, %u;\n",
-		          sig, use_word, low_d, bit);
+		  fprintf(vvp_out, "    %%assign/vec4 v%p_%lu, %lu;\n",
+		          sig, use_word, low_d);
 	    }
       }
 }
@@ -546,7 +581,7 @@ static int show_stmt_assign_nb(ivl_statement_t net)
       }
 
 
-      { struct vector_info res;
+      { struct vector_info res = {0,0};
 	unsigned wid;
 	unsigned lidx;
 	unsigned cur_rbit = 0;
@@ -574,21 +609,29 @@ static int show_stmt_assign_nb(ivl_statement_t net)
 		      res.base, res.wid);
 
 	} else {
-	      res = draw_eval_expr(rval, 0);
-	      wid = res.wid;
+	      wid = ivl_stmt_lwidth(net);
+	      draw_eval_vec4(rval, 0);
+	      if (ivl_expr_width(rval) != wid) {
+		    if (ivl_expr_signed(rval))
+			  fprintf(vvp_out, "    %%pad/s %u;\n", wid);
+		    else
+			  fprintf(vvp_out, "    %%pad/u %u;\n", wid);
+	      }
 	}
 
+	  /* Spread the r-value vector over the bits of the l-value. */
 	for (lidx = 0 ;  lidx < ivl_stmt_lvals(net) ;  lidx += 1) {
 	      unsigned bit_limit = wid - cur_rbit;
-	      unsigned bidx;
 
 	      lval = ivl_stmt_lval(net, lidx);
 
 	      if (bit_limit > ivl_lval_width(lval))
 		    bit_limit = ivl_lval_width(lval);
 
-	      bidx = res.base < 4? res.base : (res.base+cur_rbit);
-	      assign_to_lvector(lval, bidx, delay, del, bit_limit, nevents);
+		/* XXXX For now, don't know how to actually split
+		   vectors */
+	      assert(lidx == 0);
+	      assign_to_lvector(lval, delay, del, nevents);
 
 	      cur_rbit += bit_limit;
 
@@ -655,7 +698,6 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)
 {
       int rc = 0;
       ivl_expr_t expr = ivl_stmt_cond_expr(net);
-      struct vector_info cond = draw_eval_expr(expr, 0);
       unsigned count = ivl_stmt_case_count(net);
 
       unsigned local_base = local_count;
@@ -666,6 +708,11 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)
 
       local_count += count + 1;
 
+	/* Evaluate the case condition to the top of the vec4
+	   stack. This expression will be compared multiple times to
+	   each case guard. */
+      draw_eval_vec4(expr,0);
+
 	/* First draw the branch table.  All the non-default cases
 	   generate a branch out of here, to the code that implements
 	   the case. The default will fall through all the tests. */
@@ -673,55 +720,34 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)
 
       for (idx = 0 ;  idx < count ;  idx += 1) {
 	    ivl_expr_t cex = ivl_stmt_case_expr(net, idx);
-	    struct vector_info cvec;
 
 	    if (cex == 0) {
 		  default_case = idx;
 		  continue;
 	    }
 
-	      /* Is the guard expression something I can pass to a
-		 %cmpi/u instruction? If so, use that instead. */
-
-	    if ((ivl_statement_type(net) == IVL_ST_CASE)
-		&& (ivl_expr_type(cex) == IVL_EX_NUMBER)
-		&& (! number_is_unknown(cex))
-		&& number_is_immediate(cex, 16, 0)) {
-
-		  unsigned long imm = get_number_immediate(cex);
-
-		  fprintf(vvp_out, "    %%cmpi/u %u, %lu, %u;\n",
-			  cond.base, imm, cond.wid);
-		  fprintf(vvp_out, "    %%jmp/1 T_%u.%u, 6;\n",
-			  thread_count, local_base+idx);
-
-		  continue;
-	    }
-
-	      /* Oh well, do this case the hard way. */
-
-	    cvec = draw_eval_expr_wid(cex, cond.wid, STUFF_OK_RO);
-	    assert(cvec.wid == cond.wid);
+	      /* Duplicate the case expression so that the cmp
+		 instructions below do not completely erase the
+		 value. Do this in front of each compare. */
+	    fprintf(vvp_out, "    %%dup/vec4;\n");
+	    draw_eval_vec4(cex, STUFF_OK_RO);
 
 	    switch (ivl_statement_type(net)) {
 
 		case IVL_ST_CASE:
-		  fprintf(vvp_out, "    %%cmp/u %u, %u, %u;\n",
-			  cond.base, cvec.base, cond.wid);
+		  fprintf(vvp_out, "    %%cmp/u;\n");
 		  fprintf(vvp_out, "    %%jmp/1 T_%u.%u, 6;\n",
 			  thread_count, local_base+idx);
 		  break;
 
 		case IVL_ST_CASEX:
-		  fprintf(vvp_out, "    %%cmp/x %u, %u, %u;\n",
-			  cond.base, cvec.base, cond.wid);
+		  fprintf(vvp_out, "    %%cmp/x;\n");
 		  fprintf(vvp_out, "    %%jmp/1 T_%u.%u, 4;\n",
 			  thread_count, local_base+idx);
 		  break;
 
 		case IVL_ST_CASEZ:
-		  fprintf(vvp_out, "    %%cmp/z %u, %u, %u;\n",
-			  cond.base, cvec.base, cond.wid);
+		  fprintf(vvp_out, "    %%cmp/z;\n");
 		  fprintf(vvp_out, "    %%jmp/1 T_%u.%u, 4;\n",
 			  thread_count, local_base+idx);
 		  break;
@@ -729,14 +755,8 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)
 		default:
 		  assert(0);
 	    }
-
-	      /* Done with the case expression */
-	    clr_vector(cvec);
       }
 
-	/* Done with the condition expression */
-      clr_vector(cond);
-
 	/* Emit code for the default case. */
       if (default_case < count) {
 	    ivl_statement_t cst = ivl_stmt_case_stmt(net, default_case);
@@ -757,6 +777,7 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)
 	    clear_expression_lookaside();
 	    rc += show_statement(cst, sscope);
 
+	      /* Statement is done, jump to the out of the case. */
 	    fprintf(vvp_out, "    %%jmp T_%u.%u;\n", thread_count,
 		    local_base+count);
 
@@ -765,6 +786,10 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)
 
 	/* The out of the case. */
       fprintf(vvp_out, "T_%u.%u ;\n",  thread_count, local_base+count);
+	/* The case tests will leave the case expression on the top of
+	   the stack, but we are done with it now. Pop it. */
+      fprintf(vvp_out, "    %%pop/vec4 1;\n");
+
       clear_expression_lookaside();
 
       return rc;
@@ -1238,23 +1263,20 @@ static int show_stmt_condit(ivl_statement_t net, ivl_scope_t sscope)
       int rc = 0;
       unsigned lab_false, lab_out;
       ivl_expr_t expr = ivl_stmt_cond_expr(net);
-      struct vector_info cond;
 
       show_stmt_file_line(net, "If statement.");
 
-      cond = draw_eval_expr(expr, STUFF_OK_XZ|STUFF_OK_47|STUFF_OK_RO);
-
-      assert(cond.wid == 1);
+      draw_eval_vec4(expr, STUFF_OK_XZ|STUFF_OK_47|STUFF_OK_RO);
 
       lab_false = local_count++;
       lab_out = local_count++;
 
-      fprintf(vvp_out, "    %%jmp/0xz  T_%u.%u, %u;\n",
-	      thread_count, lab_false, cond.base);
-
-	/* Done with the condition expression. */
-      if (cond.base >= 8)
-	    clr_vector(cond);
+      int use_flag = allocate_flag();
+	/* The %flag_set/vec4 pops the vec4 bit and puts it to the flag. */
+      fprintf(vvp_out, "    %%flag_set/vec4 %d;\n", use_flag);
+      fprintf(vvp_out, "    %%jmp/0xz  T_%u.%u, %d;\n",
+	      thread_count, lab_false, use_flag);
+      clr_flag(use_flag);
 
       if (ivl_stmt_cond_true(net))
 	    rc += show_statement(ivl_stmt_cond_true(net), sscope);
@@ -1320,20 +1342,19 @@ static int show_stmt_delayx(ivl_statement_t net, ivl_scope_t sscope)
 
       show_stmt_file_line(net, "Delay statement.");
 
+      int use_idx = allocate_word();
       switch (ivl_expr_value(expr)) {
 
 	  case IVL_VT_BOOL:
 	  case IVL_VT_LOGIC: {
-		struct vector_info del = draw_eval_expr(expr, 0);
-		fprintf(vvp_out, "    %%ix/get 0, %u, %u;\n",
-			del.base, del.wid);
-		clr_vector(del);
+		draw_eval_vec4(expr, 0);
+		fprintf(vvp_out, "    %%ix/vec4 %d;\n", use_idx);
 		break;
 	  }
 
 	  case IVL_VT_REAL: {
 		draw_eval_real(expr);
-		fprintf(vvp_out, "    %%cvt/ur 0;\n");
+		fprintf(vvp_out, "    %%cvt/ur %d;\n", use_idx);
 		break;
 	  }
 
@@ -1341,7 +1362,9 @@ static int show_stmt_delayx(ivl_statement_t net, ivl_scope_t sscope)
 	    assert(0);
       }
 
-      fprintf(vvp_out, "    %%delayx 0;\n");
+      fprintf(vvp_out, "    %%delayx %d;\n", use_idx);
+      clr_word(use_idx);
+
 	/* Lots of things can happen during a delay. */
       clear_expression_lookaside();
 
@@ -1755,7 +1778,6 @@ static int show_stmt_wait(ivl_statement_t net, ivl_scope_t sscope)
 static int show_stmt_while(ivl_statement_t net, ivl_scope_t sscope)
 {
       int rc = 0;
-      struct vector_info cvec;
 
       unsigned top_label = local_count++;
       unsigned out_label = local_count++;
@@ -1771,14 +1793,16 @@ static int show_stmt_while(ivl_statement_t net, ivl_scope_t sscope)
 	/* Draw the evaluation of the condition expression, and test
 	   the result. If the expression evaluates to false, then
 	   branch to the out label. */
-      cvec = draw_eval_expr(ivl_stmt_cond_expr(net), STUFF_OK_XZ|STUFF_OK_47);
-      if (cvec.wid > 1)
-	    cvec = reduction_or(cvec);
+      draw_eval_vec4(ivl_stmt_cond_expr(net), STUFF_OK_XZ|STUFF_OK_47);
+      if (ivl_expr_width(ivl_stmt_cond_expr(net)) > 1) {
+	    fprintf(vvp_out, "    %%or/r;\n");
+      }
 
+      int use_flag = allocate_flag();
+      fprintf(vvp_out, "    %%flag_set/vec4 %d;\n", use_flag);
       fprintf(vvp_out, "    %%jmp/0xz T_%u.%u, %u;\n",
-	      thread_count, out_label, cvec.base);
-      if (cvec.base >= 8)
-	    clr_vector(cvec);
+	      thread_count, out_label, use_flag);
+      clr_flag(use_flag);
 
 	/* Draw the body of the loop. */
       rc += show_statement(ivl_stmt_sub_stmt(net), sscope);
@@ -1966,7 +1990,7 @@ static unsigned is_repeat_event_assign(ivl_scope_t scope,
  */
 static unsigned is_wait(ivl_scope_t scope, ivl_statement_t stmt)
 {
-      ivl_statement_t while_wait, wait, wait_stmt;
+      ivl_statement_t while_wait, wait_x, wait_stmt;
       ivl_expr_t while_expr, expr;
       const char *bits;
 	/* We must have two block elements. */
@@ -1975,9 +1999,9 @@ static unsigned is_wait(ivl_scope_t scope, ivl_statement_t stmt)
       while_wait = ivl_stmt_block_stmt(stmt, 0);
       if (ivl_statement_type(while_wait) != IVL_ST_WHILE) return 0;
 	/* That has a wait with a NOOP statement. */
-      wait = ivl_stmt_sub_stmt(while_wait);
-      if (ivl_statement_type(wait) != IVL_ST_WAIT) return 0;
-      wait_stmt = ivl_stmt_sub_stmt(wait);
+      wait_x = ivl_stmt_sub_stmt(while_wait);
+      if (ivl_statement_type(wait_x) != IVL_ST_WAIT) return 0;
+      wait_stmt = ivl_stmt_sub_stmt(wait_x);
       if (ivl_statement_type(wait_stmt) != IVL_ST_NOOP) return 0;
 	/* Check that the while condition has the correct form. */
       while_expr = ivl_stmt_cond_expr(while_wait);
@@ -1994,7 +2018,7 @@ static unsigned is_wait(ivl_scope_t scope, ivl_statement_t stmt)
 	/* And finally the two statements that represent the wait must
 	 * have the same line number as the block. */
       if ((ivl_stmt_lineno(stmt) != ivl_stmt_lineno(while_wait)) ||
-          (ivl_stmt_lineno(stmt) != ivl_stmt_lineno(wait))) {
+          (ivl_stmt_lineno(stmt) != ivl_stmt_lineno(wait_x))) {
 	    return 0;
       }
 
diff --git a/vvp/codes.h b/vvp/codes.h
index 3cab77057..4921b3056 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -47,9 +47,10 @@ extern bool of_ASSIGN_AVD(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_AVE(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_D(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_MV(vthread_t thr, vvp_code_t code);
-extern bool of_ASSIGN_V0(vthread_t thr, vvp_code_t code);
-extern bool of_ASSIGN_V0D(vthread_t thr, vvp_code_t code);
-extern bool of_ASSIGN_V0E(vthread_t thr, vvp_code_t code);
+extern bool of_ASSIGN_VEC4(vthread_t thr, vvp_code_t code);
+extern bool of_ASSIGN_VEC4D(vthread_t thr, vvp_code_t code);
+extern bool of_ASSIGN_VEC4E(vthread_t thr, vvp_code_t code);
+extern bool of_ASSIGN_VEC4_OFF_D(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_V0X1(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_V0X1D(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_V0X1E(vthread_t thr, vvp_code_t code);
@@ -86,6 +87,7 @@ extern bool of_CVT_UR(vthread_t thr, vvp_code_t code);
 extern bool of_CVT_VR(vthread_t thr, vvp_code_t code);
 extern bool of_DEASSIGN(vthread_t thr, vvp_code_t code);
 extern bool of_DEASSIGN_WR(vthread_t thr, vvp_code_t code);
+extern bool of_DEBUG_THR(vthread_t thr, vvp_code_t code);
 extern bool of_DELAY(vthread_t thr, vvp_code_t code);
 extern bool of_DELAYX(vthread_t thr, vvp_code_t code);
 extern bool of_DELETE_OBJ(vthread_t thr, vvp_code_t code);
@@ -95,12 +97,16 @@ extern bool of_DIV(vthread_t thr, vvp_code_t code);
 extern bool of_DIV_S(vthread_t thr, vvp_code_t code);
 extern bool of_DIV_WR(vthread_t thr, vvp_code_t code);
 extern bool of_DUP_REAL(vthread_t thr, vvp_code_t code);
+extern bool of_DUP_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_END(vthread_t thr, vvp_code_t code);
 extern bool of_EVCTL(vthread_t thr, vvp_code_t code);
 extern bool of_EVCTLC(vthread_t thr, vvp_code_t code);
 extern bool of_EVCTLI(vthread_t thr, vvp_code_t code);
 extern bool of_EVCTLS(vthread_t thr, vvp_code_t code);
 extern bool of_FILE_LINE(vthread_t thr, vvp_code_t code);
+extern bool of_FLAG_GET_VEC4(vthread_t thr, vvp_code_t code);
+extern bool of_FLAG_SET_IMM(vthread_t thr, vvp_code_t code);
+extern bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_FORCE_LINK(vthread_t thr, vvp_code_t code);
 extern bool of_FORCE_V(vthread_t thr, vvp_code_t code);
 extern bool of_FORCE_WR(vthread_t thr, vvp_code_t code);
@@ -117,6 +123,8 @@ extern bool of_IX_LOAD(vthread_t thr, vvp_code_t code);
 extern bool of_IX_MOV(vthread_t thr, vvp_code_t code);
 extern bool of_IX_MUL(vthread_t thr, vvp_code_t code);
 extern bool of_IX_SUB(vthread_t thr, vvp_code_t code);
+extern bool of_IX_VEC4(vthread_t thr, vvp_code_t code);
+extern bool of_IX_VEC4_S(vthread_t thr, vvp_code_t code);
 extern bool of_JMP(vthread_t thr, vvp_code_t code);
 extern bool of_JMP0(vthread_t thr, vvp_code_t code);
 extern bool of_JMP0XZ(vthread_t thr, vvp_code_t code);
@@ -135,7 +143,7 @@ extern bool of_LOAD_DAR_STR(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_OBJ(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_STR(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_STRA(vthread_t thr, vvp_code_t code);
-extern bool of_LOAD_VEC(vthread_t thr, vvp_code_t code);
+extern bool of_LOAD_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VP0(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VP0_S(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_X1P(vthread_t thr, vvp_code_t code);
@@ -160,10 +168,13 @@ extern bool of_NORR(vthread_t thr, vvp_code_t code);
 extern bool of_NULL(vthread_t thr, vvp_code_t code);
 extern bool of_OR(vthread_t thr, vvp_code_t code);
 extern bool of_ORR(vthread_t thr, vvp_code_t code);
-extern bool of_PAD(vthread_t thr, vvp_code_t code);
+extern bool of_PAD_S(vthread_t thr, vvp_code_t code);
+extern bool of_PAD_U(vthread_t thr, vvp_code_t code);
+extern bool of_PART(vthread_t thr, vvp_code_t code);
 extern bool of_POP_OBJ(vthread_t thr, vvp_code_t code);
 extern bool of_POP_REAL(vthread_t thr, vvp_code_t code);
 extern bool of_POP_STR(vthread_t thr, vvp_code_t code);
+extern bool of_POP_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_POW(vthread_t thr, vvp_code_t code);
 extern bool of_POW_S(vthread_t thr, vvp_code_t code);
 extern bool of_POW_WR(vthread_t thr, vvp_code_t code);
@@ -173,6 +184,7 @@ extern bool of_PROP_STR(vthread_t thr, vvp_code_t code);
 extern bool of_PROP_V(vthread_t thr, vvp_code_t code);
 extern bool of_PUSHI_STR(vthread_t thr, vvp_code_t code);
 extern bool of_PUSHI_REAL(vthread_t thr, vvp_code_t code);
+extern bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_PUSHV_STR(vthread_t thr, vvp_code_t code);
 extern bool of_PUTC_STR_V(vthread_t thr, vvp_code_t code);
 extern bool of_RELEASE_NET(vthread_t thr, vvp_code_t code);
@@ -187,9 +199,9 @@ extern bool of_SET_DAR_OBJ_STR(vthread_t thr, vvp_code_t code);
 extern bool of_SET_VEC(vthread_t thr, vvp_code_t code);
 extern bool of_SET_X0(vthread_t thr, vvp_code_t code);
 extern bool of_SET_X0_X(vthread_t thr, vvp_code_t code);
-extern bool of_SHIFTL_I0(vthread_t thr, vvp_code_t code);
-extern bool of_SHIFTR_I0(vthread_t thr, vvp_code_t code);
-extern bool of_SHIFTR_S_I0(vthread_t thr, vvp_code_t code);
+extern bool of_SHIFTL(vthread_t thr, vvp_code_t code);
+extern bool of_SHIFTR(vthread_t thr, vvp_code_t code);
+extern bool of_SHIFTR_S(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_DAR_R(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_DAR_STR(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_OBJ(vthread_t thr, vvp_code_t code);
@@ -201,6 +213,7 @@ extern bool of_STORE_REAL(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_REALA(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_STR(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_STRA(vthread_t thr, vvp_code_t code);
+extern bool of_STORE_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_SUB(vthread_t thr, vvp_code_t code);
 extern bool of_SUB_WR(vthread_t thr, vvp_code_t code);
 extern bool of_SUBI(vthread_t thr, vvp_code_t code);
diff --git a/vvp/compile.cc b/vvp/compile.cc
index f79b5b883..85b551656 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -85,11 +85,11 @@ struct opcode_table_s {
 
 static const struct opcode_table_s opcode_table[] = {
       { "%abs/wr", of_ABS_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%add",    of_ADD,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%add",    of_ADD,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%add/wr", of_ADD_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%addi",   of_ADDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%alloc",  of_ALLOC,  1,  {OA_VPI_PTR,  OA_NONE,     OA_NONE} },
-      { "%and",    of_AND,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%and",    of_AND,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%and/r",  of_ANDR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%andi",   of_ANDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%assign/ar",of_ASSIGN_AR,2,{OA_ARR_PTR,OA_BIT1,     OA_NONE} },
@@ -98,17 +98,18 @@ static const struct opcode_table_s opcode_table[] = {
       { "%assign/av",of_ASSIGN_AV,3,{OA_ARR_PTR,OA_BIT1,     OA_BIT2} },
       { "%assign/av/d",of_ASSIGN_AVD,3,{OA_ARR_PTR,OA_BIT1,  OA_BIT2} },
       { "%assign/av/e",of_ASSIGN_AVE,2,{OA_ARR_PTR,OA_BIT1,  OA_NONE} },
-      { "%assign/v0",of_ASSIGN_V0,3,{OA_FUNC_PTR,OA_BIT1,    OA_BIT2} },
-      { "%assign/v0/d",of_ASSIGN_V0D,3,{OA_FUNC_PTR,OA_BIT1, OA_BIT2} },
-      { "%assign/v0/e",of_ASSIGN_V0E,2,{OA_FUNC_PTR,OA_BIT1, OA_NONE} },
       { "%assign/v0/x1",of_ASSIGN_V0X1,3,{OA_FUNC_PTR,OA_BIT1,OA_BIT2} },
       { "%assign/v0/x1/d",of_ASSIGN_V0X1D,3,{OA_FUNC_PTR,OA_BIT1,OA_BIT2} },
       { "%assign/v0/x1/e",of_ASSIGN_V0X1E,2,{OA_FUNC_PTR,OA_BIT1,OA_NONE} },
+      { "%assign/vec4",  of_ASSIGN_VEC4, 2,{OA_FUNC_PTR, OA_BIT1, OA_NONE} },
+      { "%assign/vec4/d",of_ASSIGN_VEC4D,2,{OA_FUNC_PTR, OA_BIT1, OA_NONE} },
+      { "%assign/vec4/e",of_ASSIGN_VEC4E,1,{OA_FUNC_PTR, OA_NONE, OA_NONE} },
+      { "%assign/vec4/off/d",of_ASSIGN_VEC4_OFF_D, 3,{OA_FUNC_PTR, OA_BIT1, OA_BIT2} },
       { "%assign/wr",  of_ASSIGN_WR, 2,{OA_VPI_PTR, OA_BIT1, OA_NONE} },
       { "%assign/wr/d",of_ASSIGN_WRD,2,{OA_VPI_PTR, OA_BIT1, OA_NONE} },
       { "%assign/wr/e",of_ASSIGN_WRE,1,{OA_VPI_PTR, OA_NONE, OA_NONE} },
       { "%assign/x0",of_ASSIGN_X0,3,{OA_FUNC_PTR,OA_BIT1,    OA_BIT2} },
-      { "%blend",    of_BLEND,   3,  {OA_BIT1,  OA_BIT2,     OA_NUMBER} },
+      { "%blend",    of_BLEND,   0,  {OA_NONE,  OA_NONE,     OA_NONE} },
       { "%blend/wr", of_BLEND_WR,0,  {OA_NONE,  OA_NONE,     OA_NONE} },
       { "%breakpoint", of_BREAKPOINT, 0,  {OA_NONE, OA_NONE, OA_NONE} },
       { "%cassign/link",of_CASSIGN_LINK,2,{OA_FUNC_PTR,OA_FUNC_PTR2,OA_NONE} },
@@ -116,9 +117,9 @@ static const struct opcode_table_s opcode_table[] = {
       { "%cassign/wr",of_CASSIGN_WR,1,{OA_FUNC_PTR,OA_NONE,  OA_NONE} },
       { "%cassign/x0",of_CASSIGN_X0,3,{OA_FUNC_PTR,OA_BIT1,  OA_BIT2} },
       { "%cast2",  of_CAST2,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%cmp/s",  of_CMPS,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%cmp/s",  of_CMPS,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%cmp/str",of_CMPSTR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%cmp/u",  of_CMPU,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%cmp/u",  of_CMPU,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%cmp/wr", of_CMPWR,  0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%cmp/ws", of_CMPWS,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
       { "%cmp/wu", of_CMPWU,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
@@ -137,6 +138,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%cvt/vr", of_CVT_VR, 2,  {OA_BIT1,     OA_NUMBER,   OA_NONE} },
       { "%deassign",of_DEASSIGN,3,{OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
       { "%deassign/wr",of_DEASSIGN_WR,1,{OA_FUNC_PTR, OA_NONE,     OA_NONE} },
+      { "%debug/thr",  of_DEBUG_THR,  0,{OA_NONE,     OA_NONE,     OA_NONE} },
       { "%delay",  of_DELAY,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
       { "%delayx", of_DELAYX, 1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
       { "%delete/obj",of_DELETE_OBJ,1,{OA_FUNC_PTR,OA_NONE,  OA_NONE} },
@@ -145,17 +147,21 @@ static const struct opcode_table_s opcode_table[] = {
       { "%div/s",  of_DIV_S,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%div/wr",   of_DIV_WR,  0, {OA_NONE,   OA_NONE,     OA_NONE} },
       { "%dup/real", of_DUP_REAL,0, {OA_NONE,   OA_NONE,     OA_NONE} },
+      { "%dup/vec4", of_DUP_VEC4,0, {OA_NONE,   OA_NONE,     OA_NONE} },
       { "%end",    of_END,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%evctl",  of_EVCTL,  2,  {OA_FUNC_PTR, OA_BIT1,     OA_NONE} },
       { "%evctl/c",of_EVCTLC, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%evctl/i",of_EVCTLI, 2,  {OA_FUNC_PTR, OA_BIT1,     OA_NONE} },
       { "%evctl/s",of_EVCTLS, 2,  {OA_FUNC_PTR, OA_BIT1,     OA_NONE} },
+      { "%flag_get/vec4", of_FLAG_GET_VEC4, 1, {OA_NUMBER, OA_NONE, OA_NONE} },
+      { "%flag_set/imm",  of_FLAG_SET_IMM,  2, {OA_NUMBER, OA_BIT1, OA_NONE} },
+      { "%flag_set/vec4", of_FLAG_SET_VEC4, 1, {OA_NUMBER, OA_NONE, OA_NONE} },
       { "%force/link",of_FORCE_LINK,2,{OA_FUNC_PTR,OA_FUNC_PTR2,OA_NONE} },
       { "%force/v",of_FORCE_V,3,  {OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
       { "%force/wr",of_FORCE_WR,1,{OA_FUNC_PTR, OA_NONE,     OA_NONE} },
       { "%force/x0",of_FORCE_X0,3,{OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
       { "%free",   of_FREE,   1,  {OA_VPI_PTR,  OA_NONE,     OA_NONE} },
-      { "%inv",    of_INV,    2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
+      { "%inv",    of_INV,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%ix/add", of_IX_ADD, 3,  {OA_NUMBER,   OA_BIT1,     OA_BIT2} },
       { "%ix/get", of_IX_GET, 3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%ix/get/s",of_IX_GET_S,3,{OA_BIT1,     OA_BIT2,     OA_NUMBER} },
@@ -165,6 +171,8 @@ static const struct opcode_table_s opcode_table[] = {
       { "%ix/mov", of_IX_MOV, 2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
       { "%ix/mul", of_IX_MUL, 3,  {OA_NUMBER,   OA_BIT1,     OA_BIT2} },
       { "%ix/sub", of_IX_SUB, 3,  {OA_NUMBER,   OA_BIT1,     OA_BIT2} },
+      { "%ix/vec4",  of_IX_VEC4,  1,  {OA_NUMBER,   OA_NONE, OA_NONE} },
+      { "%ix/vec4/s",of_IX_VEC4_S,1,  {OA_NUMBER,   OA_NONE, OA_NONE} },
       { "%jmp",    of_JMP,    1,  {OA_CODE_PTR, OA_NONE,     OA_NONE} },
       { "%jmp/0",  of_JMP0,   2,  {OA_CODE_PTR, OA_BIT1,     OA_NONE} },
       { "%jmp/0xz",of_JMP0XZ, 2,  {OA_CODE_PTR, OA_BIT1,     OA_NONE} },
@@ -183,7 +191,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%load/real", of_LOAD_REAL,1,{OA_VPI_PTR, OA_NONE,     OA_NONE} },
       { "%load/str",  of_LOAD_STR, 1,{OA_FUNC_PTR,OA_NONE,     OA_NONE} },
       { "%load/stra", of_LOAD_STRA,2,{OA_ARR_PTR, OA_BIT1,     OA_NONE} },
-      { "%load/v", of_LOAD_VEC,3,    {OA_BIT1,    OA_FUNC_PTR, OA_BIT2} },
+      { "%load/vec4", of_LOAD_VEC4,1,{OA_FUNC_PTR,OA_NONE,     OA_NONE} },
       { "%load/vp0",of_LOAD_VP0,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
       { "%load/vp0/s",of_LOAD_VP0_S,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
       { "%load/x1p",of_LOAD_X1P,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
@@ -206,12 +214,15 @@ static const struct opcode_table_s opcode_table[] = {
       { "%nor",    of_NOR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%nor/r",  of_NORR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%null",   of_NULL,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%or",     of_OR,     3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%or",     of_OR,     0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%or/r",   of_ORR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%pad",    of_PAD,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%pad/s",  of_PAD_S,  1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
+      { "%pad/u",  of_PAD_U,  1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
+      { "%part",   of_PART,   1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
       { "%pop/obj", of_POP_OBJ, 2, {OA_BIT1,    OA_BIT2,     OA_NONE} },
       { "%pop/real",of_POP_REAL,1, {OA_NUMBER,  OA_NONE,     OA_NONE} },
       { "%pop/str", of_POP_STR, 1, {OA_NUMBER,  OA_NONE,     OA_NONE} },
+      { "%pop/vec4",of_POP_VEC4,1, {OA_NUMBER,  OA_NONE,     OA_NONE} },
       { "%pow",    of_POW,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%pow/s",  of_POW_S,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%pow/wr", of_POW_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
@@ -221,6 +232,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%prop/v",  of_PROP_V,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%pushi/real",of_PUSHI_REAL,2,{OA_BIT1,   OA_BIT2,   OA_NONE} },
       { "%pushi/str", of_PUSHI_STR, 1,{OA_STRING, OA_NONE,   OA_NONE} },
+      { "%pushi/vec4",of_PUSHI_VEC4,3,{OA_BIT1,   OA_BIT2,   OA_NUMBER} },
       { "%pushv/str", of_PUSHV_STR, 2, {OA_BIT1,OA_BIT2,     OA_NONE} },
       { "%putc/str/v",of_PUTC_STR_V,3,{OA_FUNC_PTR,OA_BIT1,  OA_BIT2} },
       { "%release/net",of_RELEASE_NET,3,{OA_FUNC_PTR,OA_BIT1,OA_BIT2} },
@@ -234,9 +246,9 @@ static const struct opcode_table_s opcode_table[] = {
       { "%set/dar/obj/str", of_SET_DAR_OBJ_STR, 1,{OA_NUMBER,OA_NONE,OA_NONE} },
       { "%set/v",  of_SET_VEC,3,  {OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
       { "%set/x0", of_SET_X0, 3,  {OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
-      { "%shiftl/i0", of_SHIFTL_I0, 2, {OA_BIT1,OA_NUMBER,   OA_NONE} },
-      { "%shiftr/i0", of_SHIFTR_I0, 2, {OA_BIT1,OA_NUMBER,   OA_NONE} },
-      { "%shiftr/s/i0", of_SHIFTR_S_I0,2,{OA_BIT1,OA_NUMBER, OA_NONE} },
+      { "%shiftl",   of_SHIFTL,   1, {OA_NUMBER, OA_NONE,   OA_NONE} },
+      { "%shiftr",   of_SHIFTR,   1, {OA_NUMBER, OA_NONE,   OA_NONE} },
+      { "%shiftr/s", of_SHIFTR_S, 1, {OA_NUMBER, OA_NONE,   OA_NONE} },
       { "%store/dar/r",  of_STORE_DAR_R,   1, {OA_FUNC_PTR, OA_NONE, OA_NONE} },
       { "%store/dar/str",of_STORE_DAR_STR, 1, {OA_FUNC_PTR, OA_NONE, OA_NONE} },
       { "%store/obj",   of_STORE_OBJ,   1, {OA_FUNC_PTR,OA_NONE, OA_NONE} },
@@ -248,7 +260,8 @@ static const struct opcode_table_s opcode_table[] = {
       { "%store/reala", of_STORE_REALA, 2, {OA_ARR_PTR, OA_BIT1, OA_NONE} },
       { "%store/str",   of_STORE_STR,   1, {OA_FUNC_PTR,OA_NONE, OA_NONE} },
       { "%store/stra",  of_STORE_STRA,  2, {OA_ARR_PTR, OA_BIT1, OA_NONE} },
-      { "%sub",    of_SUB,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%store/vec4",  of_STORE_VEC4,  2, {OA_FUNC_PTR,OA_BIT1, OA_NONE} },
+      { "%sub",    of_SUB,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%sub/wr", of_SUB_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%subi",   of_SUBI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%substr",  of_SUBSTR,  2,{OA_BIT1,     OA_BIT2,     OA_NONE} },
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index d3e702356..cfcced0d9 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -32,12 +32,19 @@ experience of implementing it for strings, I'll want to change other
 types around to using this method as well. Keep this in mind whenever
 considering adding new instructions to vvp.
 
+FLAGS
+
+There are up to 16 bits in each thread that are available for
+flags. These are used as destinations for operations that return
+boolean values, for example comparisons. They are also used as inputs
+for test and branch opcodes.
+
 * %abs/wr <bit-o>, <bit-i>
 
 This instruction calculates the absolute value of a real value. It uses
 the fabs() function in the run-time to do the work.
 
-* %add <bit-l>, <bit-r>, <wid>
+* %add <bit-l>, <bit-r>, <wid> (XXXX Old version)
 
 This instruction adds the right vector into the left vector, the
 vectors having the width <wid>. If any of the bits of either vector
@@ -46,6 +53,13 @@ sum.
 
 See also the %sub instruction.
 
+* %add
+
+This opcode pops two vec4 values from the vec4 stack, adds
+them, and pushes the result back to the stack. The input values must
+have the same size, and the pushed result will have the same width.
+
+See also the %sub instruction.
 
 * %add/wr <bit-l>, <bit-r>
 
@@ -67,17 +81,20 @@ is zero extended to match any width.
 This instruction allocates the storage for a new instance of an
 automatically allocated scope.
 
-* %and <bit-l>, <bit-r>, <wid>
+* %and
 
-Perform the bitwise AND of the two vectors, and store the result in
-the left vector. Each bit is calculated independent of other bits. AND
-means the following:
+Perform the bitwise AND of the two vectors popped from the vec4 stack,
+and push the result. Each bit is calculated independent of other
+bits. AND means the following:
 
 	0 and ? --> 0
 	? and 0 --> 0
 	1 and 1 --> 1
 	otherwise   x
 
+The input vectors must be the same width, and the output vector will
+be the width of the input.
+
 * %assign/ar <array-label>, <delay>
 * %assign/ar/d <array-label>, <delayx>
 * %assign/ar/e <array-label>
@@ -123,9 +140,9 @@ The %assign/av/e variation uses the information in the thread
 event control registers to determine when to perform the assign.
 %evctl is used to set the event control information.
 
-* %assign/v0 <var-label>, <delay>, <bit>
-* %assign/v0/d <var-label>, <delayx>, <bit>
-* %assign/v0/e <var-label>, <bit>
+* %assign/v0 <var-label>, <delay>, <bit> (XXXX Old description)
+* %assign/v0/d <var-label>, <delayx>, <bit> (XXXX Old description)
+* %assign/v0/e <var-label>, <bit> (XXXX Old description)
 
 The %assign/v0 instruction is a vector version of non-blocking
 assignment. The <delay> is the number of clock ticks in the future
@@ -152,6 +169,27 @@ This is similar to the %assign/v0 instruction, but adds the index-1
 index register with the canonical index of the destination where the
 vector is to be written. This allows for part writes into the vector.
 
+* %assign/vec4 <var-label>, <delay>
+* %assign/vec4/d <var-label>, <delayx>
+* %assign/vec4/e <var-label>
+
+The %assign/vec4 instruction is a vec4 version of non-blocking
+assignment. The <delay> is the number of clock ticks in the future
+where the assignment should schedule, and the value to assign is
+pulled from the vec4 stack.
+
+The %assign/vec4/d instruction is the same, but gets its delay value
+from the index register <delayx> instead.
+
+* %assign/vec4/off/d <var-label>, <off-index>, <delay-index>
+
+This is for writing parts to the target variable. The <var-label> is
+the variable to write, as usual. The <off-index> selects an index
+register that holds the offset into the target variable, and the
+<delay-index> selects the index register that contains the delay. The
+offset is in canonical bits. The width that is written is taken from
+the width of the value on the stack.
+
 * %assign/wr <vpi-label>, <delay>
 * %assign/wr/d <vpi-label>, <delayx>
 * %assign/wr/e <vpi-label>
@@ -180,10 +218,12 @@ The <bit> is the address of the thread register that contains the bit
 value to assign.
 
 
-* %blend <bit-l>, <bit-r>, <wid>
+* %blend
 
-This instruction blends the bits of a vector into the destination in a
-manner like the expression (x ? <a> : <b>). The truth table is:
+This instruction blends the bits of two vectors into a result in a
+manner like the expression ('bx ? <a> : <b>). The two source vectors
+are popped from the vec4 stack (and must have the same width) and the
+result pushed in their place. The truth table for each bit is:
 
 	1  1 --> 1
 	0  0 --> 0
@@ -238,8 +278,8 @@ Convert the source vector, of type logic, to a bool vector by
 changing all the X and Z bits to 0. The source and destinations may
 overlap.
 
-* %cmp/u <bit-l>, <bit-r>, <wid>
-* %cmp/s <bit-l>, <bit-r>, <wid>
+* %cmp/u <bit-l>, <bit-r>, <wid> (XXXX Old meaning)
+* %cmp/s <bit-l>, <bit-r>, <wid> (XXXX Old meaning)
 
 These instructions perform a generic comparison of two vectors of equal
 size. The <bit-l> and <bit-r> numbers address the least-significant
@@ -268,6 +308,21 @@ The %cmp/u and %cmp/s differ only in the handling of the lt bit. The
 compare. In either case, if either operand contains x or z, then lt
 bit gets the x value.
 
+* %cmp/s
+* %cmp/u
+
+These instructions perform a generic comparison of two vectors of
+equal size. Two values are pulled from the top of the stack, and not
+replaced. The results are written into flag bits 4,5,6. The
+expressions (a<b), (a==b) and (a===b) are calculated, with (b) popped
+from the stack first, then (a).
+
+The results of the comparison go into flags 4, 5 and 6:
+
+	4: eq  (equal)
+	5: lt  (less than)
+	6: eeq (case equal)
+
 * %cmpi/s <bit-l>, <immr>, <wid>
 * %cmpi/u <bit-l>, <immr>, <wid>
 
@@ -424,6 +479,7 @@ right operand is 0, then the result is NaN.
 
 
 * dup/real
+* dup/vec4
 
 These opcodes duplicate the value on the top of the stack for the
 corresponding type.
@@ -458,6 +514,18 @@ the format of the output is:
 <description> is a string, if string is 0 then the following default
 message is used: "Procedural tracing.".
 
+* %flag_set/imm <flag>, <value>
+
+This instruction sets an immediate value into a flag bit. This is a
+single bit, and the value is 0==0, 1==1, 2==z, 3==x.
+
+* %flag_get/vec4 <flag>
+* %flag_set/vec4 <flag>
+
+These instructions provide a means for accessing flag bits. The
+%flag_get/vec4 loads the numbered flag as a vec4 on top of the vec4
+stack, and the %flag_set/vec4 pops the top of the vec4 stack and
+writes the LSB to the selected flag.
 
 * %force/v <label>, <bit>, <wid>
 
@@ -497,10 +565,10 @@ This instruction de-allocates the storage for a previously allocated
 instance of as automatically allocated scope.
 
 
-* %inv <bit>, <wid>
+* %inv
 
-Perform a bitwise invert of the vector starting at <bit>. The result
-replaces the input. Invert means the following, independently for each
+Perform a bitwise invert of the vector on top of the vec4 stack. The result
+replaces the input. Invert means the following, independently, for each
 bit:
 
 	0  --> 1
@@ -509,20 +577,20 @@ bit:
 	z  --> x
 
 
-* %ix/get <idx>, <bit>, <wid>
-* %ix/get/s <idx>, <bit>, <wid>
+* %ix/vec4 <idx>
+* %ix/vec4/s <idx>
 
-This instruction loads a thread vector starting at <bit>, size <wid>,
-into the index register <idx>. The <bit> is the LSB of the value in
-thread bit space, and <wid> is the width of the vector.
+This instruction loads a vec4 value from the vec4 stack, into the
+index register <idx>. The value is popped from the vec4 stack and
+written to the index register.
 
-The function converts the 4-value bits into a binary number, without
-sign extension. If any of the bits of the vector is x or z, then the
-index register gets the value 0. The %ix/get/s is the same, except
-that it assumes the source vector is sign extended to fit the index
-register.
+The %ix/vec4 instruction converts the 4-value bits into a binary
+number, without sign extension. If any of the bits of the vector is x
+or z, then the index register gets the value 0. The %ix/vec4/s
+instruction is the same, except that it assumes the source vector is
+sign extended to fit the index register.
 
-The function also writes into bit 4 a 1 if any of the bits of the
+The instruction also writes into flag 4 a 1 if any of the bits of the
 input vector are x or z. This is a flag that the 0 value written into
 the index register is really the result of calculating from unknown
 bits.
@@ -568,10 +636,10 @@ the index register <src>.
 The %jmp instruction performs an unconditional branch to a given
 location. The parameter is the label of the destination instruction.
 
-* %jmp/[01xz] <code-label>, <bit>
+* %jmp/[01xz] <code-label>, <flag>
 
 This is a conditional version of the %jmp instruction. In this case,
-a single bit (addressed by <bit>) is tested. If it is one of the
+a flag bit (addressed by <flag>) is tested. If it is one of the
 values in the part after the /, the jump is taken. For example:
 
 	%jmp/xz T_label, 8;
@@ -663,7 +731,7 @@ strings, and there is an index value in index register 3.
 (See also %store/dar/str)
 
 
-* %load/v <bit>, <functor-label>, <wid>
+* %load/v <bit>, <functor-label>, <wid> (XXXX Old implementation)
 
 This instruction loads a vector value from the given functor node into
 the specified thread register bit. The functor-label can refer to a
@@ -674,6 +742,11 @@ width at the functor. If the <wid> is less than the width at the
 functor, then the most significant bits are dropped. If the <wid> is
 more than the width at the functor, the value is padded with X bits.
 
+* %load/vec4 <var-label>
+
+This instruction loads a vector value from the given functor node and
+pushes it onto the vec4 stack. See also the %store/vec4 instruction.
+
 * %load/vp0 <bit>, <functor-label>, <wid>
 * %load/vp0/s <bit>, <functor-label>, <wid>
 
@@ -836,10 +909,11 @@ the vector.
 Push a null object and push it to the object stack. The null object
 can be used with any class or darray object, so it is not typed.
 
-* %or <dst>, <src>, <wid>
+* %or
 
-Perform the bitwise or of the vectors. Each bit in the <dst> is
-combined with the corresponding bit in the source, according to the
+Perform the bitwise or of two vectors. Pop two values from the vec4
+stack to get the input arguments. Each bit in the result is computed
+from the corresponding bits of the input arguments, according to the
 truth table:
 
 	1 or ? --> 1
@@ -847,6 +921,8 @@ truth table:
 	0 or 0 --> 0
 	otherwise  x
 
+The result is then pushed onto the vec4 stack. The inputs and the
+output are all the same width.
 
 * %or/r <dst>, <src>, <wid>
 
@@ -855,18 +931,33 @@ and the <dst> is a writable scalar. The <dst> gets the value of the
 or of all the bits of the src vector.
 
 
-* %pad <dst>, <src>, <wid>
+* %pad <dst>, <src>, <wid> (XXXX Old version)
 
 This instruction replicates a single bit in register space into a
 destination vector in register space. The destination may overlap
 the source bit. The <dst> may not be 0-3. This is useful for zero
 or sign extending a vector.
 
+* %pad/s <wid>
+* %pad/u <wid>
+
+These instructions change the size of the top item in the vec4 stack
+to <wid> bits. If the item is wider than <wid>, it is truncated. If
+narrower, it is extended. The /s variant sign extends, the /u variant
+zero extends.
+
+* %part <wid>
+
+This instruction implements a part select. It pops from the top of the
+vec4 stack the base value, then it pops the vector to select from. The
+width is the fixed number <wid>. The result is pushed back to the stack.
+
 * %pop/str <num>
 * %pop/real <num>
 * %pop/obj <num>, <skip>
+* %pop/vec4 <num>
 
-Pop <num> items from the string/real/object stack. This is the
+Pop <num> items from the string/real/object/vec4 stack. This is the
 opposite of the %pushX/str opcode which pushes a string to the
 stack. The %pop/str is not normally needed because the %store/str
 includes an implicit pop, but sometimes it is necessary to pop
@@ -917,6 +1008,21 @@ If <exp>==0x3fff and <mant> != 0, the value is NaN.
 
 Push a literal string to the string stack.
 
+* %pushi/vec4 <vala>, <valb>, <wid>
+
+This opcode loads an immediate value, vector4, into the vector
+stack. The <vala> is the boolean value bits, and the <valb> bits are
+modifiers to support z and x values. The a/b encodings for the 4
+possible logic values are:
+
+   a b  val
+   0 0   0
+   1 0   1
+   1 1   x
+   0 1   z
+
+This opcode is limited to 32bit numbers.
+
 * %pushv/str <src>, <wid>
 
 Convert a vector to a string and push the string to the string stack.
@@ -1013,7 +1119,7 @@ not assigned. Also, if the bits go beyond the end of the signal, those
 bits are not written anywhere.
 
 
-* %shiftl/i0 <bit>, <wid>
+* %shiftl/i0 <bit>, <wid> (XXXX Old implementation)
 
 This instruction shifts the vector left (towards more significant
 bits) by the amount in index register 0. The <bit> is the address of
@@ -1022,8 +1128,8 @@ done in place. Zero values are shifted in.
 
 For a negative shift the value is padded with 'bx.
 
-* %shiftr/i0 <bit>, <wid>
-* %shiftr/s/i0 <bit>, <wid>
+* %shiftr/i0 <bit>, <wid> (XXXX Old implementation)
+* %shiftr/s/i0 <bit>, <wid> (XXXX Old implementation)
 
 This instruction shifts the vector right (towards the less significant
 bits) by the amount in the index register 0. The <bit> is the address
@@ -1035,6 +1141,14 @@ top bits. %shiftr/s/i0 is a signed shift, so the value is sign-extended.
 
 For a negative shift %shiftr/i0 will pad the value with 'bx.
 
+* %shiftl <idx>
+* %shiftr <idx>
+* %shiftr/s <idx>
+
+These instructions shift the top value in the vec4 stack left (towards
+MSB) or right, possibly signed. The <idx> is the address of the index
+register that contains the amount to shift.
+
 * %store/obj <var-label>
 
 This pops the top of the object stack and writes it to the object
@@ -1077,7 +1191,14 @@ The %store/stra targets an array.
 The %store/dar/str is similar, but the target is a dynamic array of
 string string. The index is taken from signed index register 3.
 
-* %sub <bit-l>, <bit-r>, <wid>
+* %store/vec4 <var-label>, <wid>
+
+Store a logic vector into the variable. The value (and its width) is
+popped off the top of the stack and written to the variable. The value
+is then optionally truncated to <wid> bits and assigned to the
+variable. It is an error for the value to be fewer than <wid> bits.
+
+* %sub <bit-l>, <bit-r>, <wid> (XXXX Old version)
 
 This instruction arithmetically subtracts the right vector out of the
 left vector. It accomplishes this by adding to the left vector 1 plus
@@ -1088,6 +1209,14 @@ operand are x, then the entire result is x.
 
 See also the %add instruction.
 
+* %sub
+
+This instruction subtracts vec4 values. The right value is popped from
+the vec4 stack, then the left value is popped. The right is subtracted
+from the left, and the result pushed.
+
+See also the %add instruction.
+
 * %subi <bit-l>, <imm>, <wid>
 
 This instruction arithmetically subtracts the immediate value from the
@@ -1121,7 +1250,8 @@ values into the vector space. The string value is NOT popped.
 * %test_nul <var-label>
 
 This instruction tests the contents of the addressed variable to see
-if it is null. If it is, set bit 4 to 1. Otherwise, set bit 4 to 0.
+if it is null. If it is, set flag bit 4 to 1. Otherwise, set flag bit
+4 to 0.
 
 This is intended to implement the SystemVerilog expression
 (<var>==null), where <var> is a class variable.
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index dbf2c3a2d..951cd50dc 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -97,16 +97,48 @@ using namespace std;
 struct vthread_s {
       vthread_s();
 
+      void debug_dump(ostream&fd);
+
 	/* This is the program counter. */
       vvp_code_t pc;
 	/* These hold the private thread bits. */
-      vvp_vector4_t bits4;
+	//vvp_vector4_t bits4;
+      enum { FLAGS_COUNT = 16, WORDS_COUNT = 16 };
+      vvp_bit4_t flags[FLAGS_COUNT];
 
 	/* These are the word registers. */
       union {
 	    int64_t  w_int;
 	    uint64_t w_uint;
-      } words[16];
+      } words[WORDS_COUNT];
+
+    private:
+      vector<vvp_vector4_t>stack_vec4_;
+    public:
+      inline vvp_vector4_t pop_vec4(void) // Remove and return the top vec4 value.
+      {
+	    assert(! stack_vec4_.empty()); // Popping an empty stack is not allowed.
+	    vvp_vector4_t val = stack_vec4_.back(); // Copy out before the slot is removed.
+	    stack_vec4_.pop_back();
+	    return val;
+      }
+      inline void push_vec4(const vvp_vector4_t&val) // Push a copy of val; it becomes the new top.
+      {
+	    stack_vec4_.push_back(val);
+      }
+      inline const vvp_vector4_t& peek_vec4(unsigned depth) // Look at an entry; depth 0 is the top.
+      {
+	    assert(depth < stack_vec4_.size()); // depth must name an existing entry.
+	    unsigned use_index = stack_vec4_.size()-1-depth; // Count down from the back of the vector.
+	    return stack_vec4_[use_index]; // Reference into the stack; nothing is popped.
+      }
+      inline void pop_vec4(unsigned cnt) // Discard the top cnt values without returning them.
+      {
+	      // pop_back() on an empty vector is undefined, so reject underflow up front.
+	    assert(cnt <= stack_vec4_.size());
+	    for ( ; cnt > 0 ; cnt -= 1)
+		  stack_vec4_.pop_back();
+      }
 
     private:
       vector<double> stack_real_;
@@ -233,7 +265,7 @@ struct vthread_s {
 
       inline void cleanup()
       {
-	    bits4 = vvp_vector4_t();
+	    assert(stack_vec4_.empty());
 	    assert(stack_real_.empty());
 	    assert(stack_str_.empty());
 	    assert(stack_obj_size_ == 0);
@@ -245,6 +277,18 @@ inline vthread_s::vthread_s()
       stack_obj_size_ = 0;
 }
 
+void vthread_s::debug_dump(ostream&fd) // Write the flag bits and the vec4 stack to fd.
+{
+      fd << "**** Flags: ";
+      for (int idx = 0 ; idx < FLAGS_COUNT ; idx += 1) // Every flag, flag 0 first, on one line.
+	    fd << flags[idx];
+      fd << endl;
+      fd << "**** vec4 stack..." << endl;
+      for (size_t idx = stack_vec4_.size() ; idx > 0 ; idx -= 1) // Walk from the top of the stack down.
+	    fd << "    " << (stack_vec4_.size()-idx) << ": " << stack_vec4_[idx-1] << endl; // Label is the depth from the top.
+      fd << "**** Done ****" << endl;
+}
+
 static bool test_joinable(vthread_t thr, vthread_t child);
 static void do_join(vthread_t thr, vthread_t child);
 
@@ -259,6 +303,7 @@ struct vthread_s*running_thread = 0;
 // vvp_bit4_t bit values.
 static vvp_bit4_t thr_index_to_bit4[4] = { BIT4_0, BIT4_1, BIT4_X, BIT4_Z };
 
+#if 0
 static inline void thr_check_addr(struct vthread_s*thr, unsigned addr)
 {
       if (thr->bits4.size() <= addr)
@@ -277,22 +322,26 @@ static inline void thr_put_bit(struct vthread_s*thr,
       thr_check_addr(thr, addr);
       thr->bits4.set_bit(addr, val);
 }
-
-// REMOVE ME
-static inline void thr_clr_bit_(struct vthread_s*thr, unsigned addr)
-{
-      thr->bits4.set_bit(addr, BIT4_0);
-}
+#endif
 
 vvp_bit4_t vthread_get_bit(struct vthread_s*thr, unsigned addr)
 {
+#if 0
       if (vpi_mode_flag == VPI_MODE_COMPILETF) return BIT4_X;
       else return thr_get_bit(thr, addr);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: vthread_get_bit(..., %u)\n", addr);
+      return BIT4_X; // Stub result: every address reads as X while the old lookup is disabled.
+#endif
 }
 
 void vthread_put_bit(struct vthread_s*thr, unsigned addr, vvp_bit4_t bit)
 {
+#if 0
       thr_put_bit(thr, addr, bit);
+#else // Stub: the write is dropped and only reported on stderr.
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: vthread_put_bit(..., %u, %u)\n", addr, bit);
+#endif
 }
 
 void vthread_push_real(struct vthread_s*thr, double val)
@@ -336,7 +385,7 @@ template <class T> T coerce_to_width(const T&that, unsigned width)
 /* Explicitly define the vvp_vector4_t version of coerce_to_width(). */
 template vvp_vector4_t coerce_to_width(const vvp_vector4_t&that,
                                        unsigned width);
-
+#if 0
 static unsigned long* vector_to_array(struct vthread_s*thr,
 				      unsigned addr, unsigned wid)
 {
@@ -365,7 +414,9 @@ static unsigned long* vector_to_array(struct vthread_s*thr,
 
       return thr->bits4.subarray(addr, wid);
 }
+#endif
 
+#if 0
 /*
  * This function gets from the thread a vector of bits starting from
  * the addressed location and for the specified width.
@@ -382,6 +433,7 @@ static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr,
 	    return vvp_vector4_t(wid, thr_index_to_bit4[bit]);
       }
 }
+#endif
 
 /*
  * Some of the instructions do wide addition to arrays of long. They
@@ -540,7 +592,7 @@ vthread_t vthread_new(vvp_code_t pc, struct __vpiScope*scope)
 {
       vthread_t thr = new struct vthread_s;
       thr->pc     = pc;
-      thr->bits4  = vvp_vector4_t(32);
+	//thr->bits4  = vvp_vector4_t(32);
       thr->parent = 0;
       thr->parent_scope = scope;
       thr->wait_next = 0;
@@ -557,10 +609,12 @@ vthread_t vthread_new(vvp_code_t pc, struct __vpiScope*scope)
       thr->event  = 0;
       thr->ecount = 0;
 
-      thr_put_bit(thr, 0, BIT4_0);
-      thr_put_bit(thr, 1, BIT4_1);
-      thr_put_bit(thr, 2, BIT4_X);
-      thr_put_bit(thr, 3, BIT4_Z);
+      thr->flags[0] = BIT4_0;
+      thr->flags[1] = BIT4_1;
+      thr->flags[2] = BIT4_X;
+      thr->flags[3] = BIT4_Z;
+      for (int idx = 4 ; idx < 8 ; idx += 1)
+	    thr->flags[idx] = BIT4_X;
 
       scope->threads .insert(thr);
       return thr;
@@ -787,52 +841,20 @@ bool of_ALLOC(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
-static bool of_AND_wide(vthread_t thr, vvp_code_t cp)
+bool of_AND(vthread_t thr, vvp_code_t) // %and: pop two vec4 values, push their bitwise AND.
 {
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned wid = cp->number;
-
-      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
-      val &= vthread_bits_to_vector(thr, idx2, wid);
-      thr->bits4.set_vec(idx1, val);
-
+      vvp_vector4_t vala = thr->pop_vec4(); // Top of stack.
+      vvp_vector4_t valb = thr->pop_vec4(); // Next entry down.
+      assert(vala.size() == valb.size()); // Both operands must have the same width.
+      vala &= valb;
+      thr->push_vec4(vala); // Result has the operand width.
       return true;
 }
 
-static bool of_AND_narrow(vthread_t thr, vvp_code_t cp)
-{
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned wid = cp->number;
-
-      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-	    thr_put_bit(thr, idx1, lb&rb);
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
-      }
-
-      return true;
-}
-
-bool of_AND(vthread_t thr, vvp_code_t cp)
-{
-      assert(cp->bit_idx[0] >= 4);
-
-      if (cp->number <= 4)
-	    cp->opcode = &of_AND_narrow;
-      else
-	    cp->opcode = &of_AND_wide;
-
-      return cp->opcode(thr, cp);
-}
-
 
 bool of_ANDI(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned idx1 = cp->bit_idx[0];
       unsigned long imm = cp->bit_idx[1];
       unsigned wid = cp->number;
@@ -850,9 +872,13 @@ bool of_ANDI(vthread_t thr, vvp_code_t cp)
       val &= imv;
 
       thr->bits4.set_vec(idx1, val);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%andi ...\n");
+#endif
       return true;
 }
 
+#if 0
 bool of_ADD(vthread_t thr, vvp_code_t cp)
 {
       assert(cp->bit_idx[0] >= 4);
@@ -886,6 +912,42 @@ bool of_ADD(vthread_t thr, vvp_code_t cp)
 
       return true;
 }
+#endif
+
+bool of_ADD(vthread_t thr, vvp_code_t) // %add: pop r, then l, and push l+r (or all X).
+{
+      vvp_vector4_t r = thr->pop_vec4();
+      vvp_vector4_t l = thr->pop_vec4();
+
+      unsigned wid = l.size();
+      assert(wid == r.size()); // Must compare, not assign: operands need equal width.
+
+      unsigned long*lva = l.subarray(0,wid);
+      unsigned long*lvb = r.subarray(0,wid);
+      if (lva==0 || lvb==0) // No binary array available (presumably X/Z bits): result is all X.
+	    goto x_out;
+
+      unsigned long carry;
+      carry = 0;
+      for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < wid ; idx += 1) // Word-wise add, carry propagates upward.
+	    lva[idx] = add_with_carry(lva[idx], lvb[idx], carry);
+
+      l.setarray(0,wid,lva); // Write the sum back into l, limited to wid bits.
+
+      thr->push_vec4(l);
+
+      delete[]lva;
+      delete[]lvb;
+      return true;
+
+ x_out:
+      delete[]lva; // delete[] of a null pointer is a no-op, so no guard is needed.
+      delete[]lvb;
+
+      vvp_vector4_t tmp (wid, BIT4_X);
+      thr->push_vec4(tmp);
+      return true;
+}
 
 bool of_ADD_WR(vthread_t thr, vvp_code_t)
 {
@@ -903,6 +965,7 @@ bool of_ADD_WR(vthread_t thr, vvp_code_t)
  */
 bool of_ADDI(vthread_t thr, vvp_code_t cp)
 {
+#if 0
 	// Collect arguments
       unsigned bit_addr       = cp->bit_idx[0];
       unsigned long imm_value = cp->bit_idx[1];
@@ -938,7 +1001,9 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp)
 
       vvp_vector4_t tmp (bit_width, BIT4_X);
       thr->bits4.set_vec(bit_addr, tmp);
-
+#else
+      fprintf(stderr, "XXXX NOT IMLEMENTED: %%addi ...\n");
+#endif
       return true;
 }
 
@@ -1010,6 +1075,7 @@ bool of_ASSIGN_ARE(vthread_t thr, vvp_code_t cp)
  */
 bool of_ASSIGN_AV(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned wid = thr->words[0].w_int;
       long off = thr->words[1].w_int;
       long adr = thr->words[3].w_int;
@@ -1038,6 +1104,9 @@ bool of_ASSIGN_AV(vthread_t thr, vvp_code_t cp)
       vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);
 
       schedule_assign_array_word(cp->array, adr, off, value, delay);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%assign/av ...\n");
+#endif
       return true;
 }
 
@@ -1049,6 +1118,7 @@ bool of_ASSIGN_AV(vthread_t thr, vvp_code_t cp)
  */
 bool of_ASSIGN_AVD(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned wid = thr->words[0].w_int;
       long off = thr->words[1].w_int;
       long adr = thr->words[3].w_int;
@@ -1077,6 +1147,9 @@ bool of_ASSIGN_AVD(vthread_t thr, vvp_code_t cp)
       vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);
 
       schedule_assign_array_word(cp->array, adr, off, value, delay);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED %%assign/av/d ...\n");
+#endif
       return true;
 }
 
@@ -1106,6 +1179,7 @@ bool of_ASSIGN_AVE(vthread_t thr, vvp_code_t cp)
 
       assert(wid > 0);
 
+#if 0
       vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);
 	// If the count is zero then just put the value.
       if (thr->ecount == 0) {
@@ -1113,9 +1187,13 @@ bool of_ASSIGN_AVE(vthread_t thr, vvp_code_t cp)
       } else {
 	    schedule_evctl(cp->array, adr, value, off, thr->event, thr->ecount);
       }
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%assign/av/e ...\n");
+#endif
       return true;
 }
 
+#if 0
 /*
  * This is %assign/v0 <label>, <delay>, <bit>
  * Index register 0 contains a vector width.
@@ -1139,7 +1217,50 @@ bool of_ASSIGN_V0(vthread_t thr, vvp_code_t cp)
 
       return true;
 }
+#endif
 
+/*
+ * %assign/vec4 <var>, <delay> -- pop a vec4 and schedule it onto <var>.
+ */
+bool of_ASSIGN_VEC4(vthread_t thr, vvp_code_t cp)
+{
+      vvp_net_ptr_t ptr (cp->net, 0); // Port 0 of the destination net.
+      unsigned delay = cp->bit_idx[0]; // Immediate delay operand.
+      vvp_vector4_t val = thr->pop_vec4();
+
+      schedule_assign_plucked_vector(ptr, delay, val, 0, val.size()); // Whole value: offset 0, full width.
+      return true;
+}
+
+/*
+ * %assign/vec4/off/d <var>, <off>, <del> (<off>, <del> are index registers)
+ */
+bool of_ASSIGN_VEC4_OFF_D(vthread_t thr, vvp_code_t cp)
+{
+      vvp_net_ptr_t ptr (cp->net, 0);
+      unsigned off_index = cp->bit_idx[0];
+      unsigned del_index = cp->bit_idx[1];
+      vvp_vector4_t val = thr->pop_vec4(); // Popped even when the assign is skipped below.
+
+      int off = thr->words[off_index].w_int;
+      vvp_time64_t del = thr->words[del_index].w_uint; // Full 64-bit delay; an int would truncate it.
+
+      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
+      assert(sig);
+
+      if (off >= (long)sig->value_size()) // Entirely above the signal: nothing to assign.
+	    return true;
+      if (off < 0) {
+	    if ((unsigned)-off >= sig->value_size()) // This far below the signal: nothing to assign.
+		  return true;
+	    assert(0); // XXXX Not implemented yet.
+      }
+
+      schedule_assign_vector(ptr, off, sig->value_size(), val, del);
+      return true;
+}
+
+#if 0
 /*
  * This is %assign/v0/d <label>, <delay_idx>, <bit>
  * Index register 0 contains a vector width, and the named index
@@ -1154,7 +1275,6 @@ bool of_ASSIGN_V0D(vthread_t thr, vvp_code_t cp)
       unsigned bit = cp->bit_idx[1];
 
       vvp_net_ptr_t ptr (cp->net, 0);
-
       if (bit >= 4) {
 	    schedule_assign_plucked_vector(ptr, delay, thr->bits4, bit, wid);
       } else {
@@ -1164,7 +1284,8 @@ bool of_ASSIGN_V0D(vthread_t thr, vvp_code_t cp)
 
       return true;
 }
-
+#endif
+#if 0
 /*
  * This is %assign/v0/e <label>, <bit>
  * Index register 0 contains a vector width.
@@ -1191,6 +1312,19 @@ bool of_ASSIGN_V0E(vthread_t thr, vvp_code_t cp)
 
       return true;
 }
+#endif
+
+bool of_ASSIGN_VEC4D(vthread_t thr, vvp_code_t cp) // Stub for %assign/vec4/d.
+{
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%assign/vec4/d\n");
+      return true; // NOTE(review): nothing is popped from the vec4 stack here.
+}
+
+bool of_ASSIGN_VEC4E(vthread_t thr, vvp_code_t cp) // Stub for %assign/vec4/e.
+{
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%assign/vec4/e\n");
+      return true; // NOTE(review): nothing is popped from the vec4 stack here.
+}
 
 /*
  * This is %assign/v0/x1 <label>, <delay>, <bit>
@@ -1199,6 +1333,7 @@ bool of_ASSIGN_V0E(vthread_t thr, vvp_code_t cp)
  */
 bool of_ASSIGN_V0X1(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned wid = thr->words[0].w_int;
       long off = thr->words[1].w_int;
       unsigned delay = cp->bit_idx[0];
@@ -1224,7 +1359,9 @@ bool of_ASSIGN_V0X1(vthread_t thr, vvp_code_t cp)
 
       vvp_net_ptr_t ptr (cp->net, 0);
       schedule_assign_vector(ptr, off, sig->value_size(), value, delay);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%assign/v0/x1 ...\n");
+#endif
       return true;
 }
 
@@ -1235,6 +1372,7 @@ bool of_ASSIGN_V0X1(vthread_t thr, vvp_code_t cp)
  */
 bool of_ASSIGN_V0X1D(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned wid = thr->words[0].w_int;
       long off = thr->words[1].w_int;
       vvp_time64_t delay = thr->words[cp->bit_idx[0]].w_uint;
@@ -1260,7 +1398,9 @@ bool of_ASSIGN_V0X1D(vthread_t thr, vvp_code_t cp)
 
       vvp_net_ptr_t ptr (cp->net, 0);
       schedule_assign_vector(ptr, off, sig->value_size(), value, delay);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%assign/v0/x1/d ...\n");
+#endif
       return true;
 }
 
@@ -1271,6 +1411,7 @@ bool of_ASSIGN_V0X1D(vthread_t thr, vvp_code_t cp)
  */
 bool of_ASSIGN_V0X1E(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned wid = thr->words[0].w_int;
       long off = thr->words[1].w_int;
       unsigned bit = cp->bit_idx[0];
@@ -1311,7 +1452,9 @@ bool of_ASSIGN_V0X1E(vthread_t thr, vvp_code_t cp)
 
       thr->event = 0;
       thr->ecount = 0;
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%assign/v0/x1/e ...\n");
+#endif
       return true;
 }
 
@@ -1395,25 +1538,20 @@ bool of_ASSIGN_X0(vthread_t, vvp_code_t)
       return true;
 }
 
-bool of_BLEND(vthread_t thr, vvp_code_t cp)
+bool of_BLEND(vthread_t thr, vvp_code_t) // %blend: pop two vec4 values, push their blend.
 {
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t vala = thr->pop_vec4(); // Top of stack; also holds the result.
+      vvp_vector4_t valb = thr->pop_vec4();
+      assert(vala.size() == valb.size()); // Both operands must have the same width.
 
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
+      for (unsigned idx = 0 ; idx < vala.size() ; idx += 1) {
+	    if (vala.value(idx) == valb.value(idx)) // Matching bits are kept as they are.
+		  continue;
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
-	    if (lb != rb)
-		  thr_put_bit(thr, idx1, BIT4_X);
-
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
+	    vala.set_bit(idx, BIT4_X); // Differing bits become X.
       }
 
+      thr->push_vec4(vala);
       return true;
 }
 
@@ -1482,6 +1620,7 @@ bool of_CASSIGN_LINK(vthread_t, vvp_code_t cp)
  */
 bool of_CASSIGN_V(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       vvp_net_t*net  = cp->net;
       unsigned  base = cp->bit_idx[0];
       unsigned  wid  = cp->bit_idx[1];
@@ -1492,7 +1631,9 @@ bool of_CASSIGN_V(vthread_t thr, vvp_code_t cp)
 	/* set the value into port 1 of the destination. */
       vvp_net_ptr_t ptr (net, 1);
       vvp_send_vec4(ptr, value, 0);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cassign/v ...\n");
+#endif
       return true;
 }
 
@@ -1510,6 +1651,7 @@ bool of_CASSIGN_WR(vthread_t thr, vvp_code_t cp)
 
 bool of_CASSIGN_X0(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       vvp_net_t*net = cp->net;
       unsigned base = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
@@ -1538,12 +1680,15 @@ bool of_CASSIGN_X0(vthread_t thr, vvp_code_t cp)
 
       vvp_net_ptr_t ptr (net, 1);
       vvp_send_vec4_pv(ptr, vector, index, wid, sig->value_size(), 0);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cassign/x0 ...\n");
+#endif
       return true;
 }
 
 bool of_CAST2(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned dst = cp->bit_idx[0];
       unsigned src = cp->bit_idx[1];
       unsigned wid = cp->number;
@@ -1567,6 +1712,9 @@ bool of_CAST2(vthread_t thr, vvp_code_t cp)
       }
 
       thr->bits4.set_vec(dst, res);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cast2 ...\n");
+#endif
       return true;
 }
 
@@ -1576,23 +1724,18 @@ bool of_CMPS(vthread_t thr, vvp_code_t cp)
       vvp_bit4_t eeq = BIT4_1;
       vvp_bit4_t lt  = BIT4_0;
 
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
+      vvp_vector4_t rval = thr->pop_vec4();
+      vvp_vector4_t lval = thr->pop_vec4();
 
-      const unsigned end1 = (idx1 < 4)? idx1 : idx1 + cp->number - 1;
-      const unsigned end2 = (idx2 < 4)? idx2 : idx2 + cp->number - 1;
+      assert(rval.size() == lval.size());
+      unsigned wid = lval.size();
 
-      if (end1 > end2)
-	    thr_check_addr(thr, end1);
-      else
-	    thr_check_addr(thr, end2);
+      const vvp_bit4_t sig1 = lval.value(wid-1);
+      const vvp_bit4_t sig2 = rval.value(wid-1);
 
-      const vvp_bit4_t sig1 = thr_get_bit(thr, end1);
-      const vvp_bit4_t sig2 = thr_get_bit(thr, end2);
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lv = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rv = thr_get_bit(thr, idx2);
+      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
+	    vvp_bit4_t lv = lval.value(idx);
+	    vvp_bit4_t rv = rval.value(idx);
 
 	    if (lv > rv) {
 		  lt  = BIT4_0;
@@ -1609,9 +1752,6 @@ bool of_CMPS(vthread_t thr, vvp_code_t cp)
 		  if (bit4_is_xz(lv) || bit4_is_xz(rv))
 			eq = BIT4_X;
 	    }
-
-	    if (idx1 >= 4) idx1 += 1;
-	    if (idx2 >= 4) idx2 += 1;
       }
 
       if (eq == BIT4_X)
@@ -1634,9 +1774,9 @@ bool of_CMPS(vthread_t thr, vvp_code_t cp)
 		  lt = BIT4_0;
       }
 
-      thr_put_bit(thr, 4, eq);
-      thr_put_bit(thr, 5, lt);
-      thr_put_bit(thr, 6, eeq);
+      thr->flags[4] = eq;
+      thr->flags[5] = lt;
+      thr->flags[6] = eeq;
 
       return true;
 }
@@ -1662,8 +1802,8 @@ bool of_CMPSTR(vthread_t thr, vvp_code_t)
 	    lt = BIT4_0;
       }
 
-      thr_put_bit(thr, 4, eq);
-      thr_put_bit(thr, 5, lt);
+      thr->flags[4] = eq;
+      thr->flags[5] = lt;
 
       return true;
 }
@@ -1673,7 +1813,7 @@ bool of_CMPIS(vthread_t thr, vvp_code_t cp)
       vvp_bit4_t eq  = BIT4_1;
       vvp_bit4_t eeq = BIT4_1;
       vvp_bit4_t lt  = BIT4_0;
-
+#if 0
       unsigned idx1 = cp->bit_idx[0];
       unsigned imm  = cp->bit_idx[1];
 
@@ -1709,10 +1849,12 @@ bool of_CMPIS(vthread_t thr, vvp_code_t cp)
 	    lt = BIT4_X;
       else if (sig1 == BIT4_1)
 	    lt = BIT4_1;
-
-      thr_put_bit(thr, 4, eq);
-      thr_put_bit(thr, 5, lt);
-      thr_put_bit(thr, 6, eeq);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cmpi/s ...\n");
+#endif
+      thr->flags[4] = eq;
+      thr->flags[5] = lt;
+      thr->flags[6] = eeq;
 
       return true;
 }
@@ -1724,6 +1866,7 @@ bool of_CMPIS(vthread_t thr, vvp_code_t cp)
  * none in the imm value) so the eeq result must be false. Otherwise,
  * the eq result may be 0 or x, and the lt bit is x.
  */
+#if 0
 static bool of_CMPIU_the_hard_way(vthread_t thr, vvp_code_t cp)
 {
 
@@ -1759,9 +1902,10 @@ static bool of_CMPIU_the_hard_way(vthread_t thr, vvp_code_t cp)
 
       return true;
 }
-
+#endif
 bool of_CMPIU(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned addr = cp->bit_idx[0];
       unsigned long imm  = cp->bit_idx[1];
       unsigned wid  = cp->number;
@@ -1791,20 +1935,21 @@ bool of_CMPIU(vthread_t thr, vvp_code_t cp)
       thr_put_bit(thr, 4, eq);
       thr_put_bit(thr, 5, lt);
       thr_put_bit(thr, 6, eq);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cmpi/u ...\n");
+#endif
       return true;
 }
 
-bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t cp)
+bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t cp, unsigned wid,
+			  const vvp_vector4_t&lval, const vvp_vector4_t&rval)
 {
       vvp_bit4_t eq = BIT4_1;
       vvp_bit4_t eeq = BIT4_1;
 
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lv = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rv = thr_get_bit(thr, idx2);
+      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
+	    vvp_bit4_t lv = lval.value(idx);
+	    vvp_bit4_t rv = rval.value(idx);
 
 	    if (lv != rv)
 		  eeq = BIT4_0;
@@ -1819,14 +1964,11 @@ bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t cp)
 	    if (eq == BIT4_0)
 		  break;
 
-	    if (idx1 >= 4) idx1 += 1;
-	    if (idx2 >= 4) idx2 += 1;
-
       }
 
-      thr_put_bit(thr, 4, eq);
-      thr_put_bit(thr, 5, BIT4_X);
-      thr_put_bit(thr, 6, eeq);
+      thr->flags[4] = eq;
+      thr->flags[5] = BIT4_X;
+      thr->flags[6] = eeq;
 
       return true;
 }
@@ -1836,17 +1978,19 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
       vvp_bit4_t eq = BIT4_1;
       vvp_bit4_t lt = BIT4_0;
 
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned wid  = cp->number;
+      vvp_vector4_t rval = thr->pop_vec4();
+      vvp_vector4_t lval = thr->pop_vec4();
 
-      unsigned long*larray = vector_to_array(thr, idx1, wid);
-      if (larray == 0) return of_CMPU_the_hard_way(thr, cp);
+      assert(rval.size() == lval.size());
+      unsigned wid = lval.size();
 
-      unsigned long*rarray = vector_to_array(thr, idx2, wid);
+      unsigned long*larray = lval.subarray(0,wid);
+      if (larray == 0) return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
+
+      unsigned long*rarray = rval.subarray(0,wid);
       if (rarray == 0) {
 	    delete[]larray;
-	    return of_CMPU_the_hard_way(thr, cp);
+	    return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
       }
 
       unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
@@ -1865,9 +2009,9 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
       delete[]larray;
       delete[]rarray;
 
-      thr_put_bit(thr, 4, eq);
-      thr_put_bit(thr, 5, lt);
-      thr_put_bit(thr, 6, eq);
+      thr->flags[4] = eq;
+      thr->flags[5] = lt;
+      thr->flags[6] = eq;
 
       return true;
 }
@@ -1875,7 +2019,7 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
 bool of_CMPX(vthread_t thr, vvp_code_t cp)
 {
       vvp_bit4_t eq = BIT4_1;
-
+#if 0
       unsigned idx1 = cp->bit_idx[0];
       unsigned idx2 = cp->bit_idx[1];
 
@@ -1891,8 +2035,10 @@ bool of_CMPX(vthread_t thr, vvp_code_t cp)
 	    if (idx1 >= 4) idx1 += 1;
 	    if (idx2 >= 4) idx2 += 1;
       }
-
-      thr_put_bit(thr, 4, eq);
+#else
+      fprintf(stderr, "XXXX NOT IMLEMENTED: %%cmpx ...\n");
+#endif
+      thr->flags[4] = eq;
 
       return true;
 }
@@ -1905,8 +2051,8 @@ bool of_CMPWR(vthread_t thr, vvp_code_t)
       vvp_bit4_t eq = (l == r)? BIT4_1 : BIT4_0;
       vvp_bit4_t lt = (l <  r)? BIT4_1 : BIT4_0;
 
-      thr_put_bit(thr, 4, eq);
-      thr_put_bit(thr, 5, lt);
+      thr->flags[4] = eq;
+      thr->flags[5] = lt;
 
       return true;
 }
@@ -1919,8 +2065,8 @@ bool of_CMPWS(vthread_t thr, vvp_code_t cp)
       vvp_bit4_t eq = (l == r)? BIT4_1 : BIT4_0;
       vvp_bit4_t lt = (l <  r)? BIT4_1 : BIT4_0;
 
-      thr_put_bit(thr, 4, eq);
-      thr_put_bit(thr, 5, lt);
+      thr->flags[4] = eq;
+      thr->flags[5] = lt;
 
       return true;
 }
@@ -1933,8 +2079,8 @@ bool of_CMPWU(vthread_t thr, vvp_code_t cp)
       vvp_bit4_t eq = (l == r)? BIT4_1 : BIT4_0;
       vvp_bit4_t lt = (l <  r)? BIT4_1 : BIT4_0;
 
-      thr_put_bit(thr, 4, eq);
-      thr_put_bit(thr, 5, lt);
+      thr->flags[4] = eq;
+      thr->flags[5] = lt;
 
       return true;
 }
@@ -1942,7 +2088,7 @@ bool of_CMPWU(vthread_t thr, vvp_code_t cp)
 bool of_CMPZ(vthread_t thr, vvp_code_t cp)
 {
       vvp_bit4_t eq = BIT4_1;
-
+#if 0
       unsigned idx1 = cp->bit_idx[0];
       unsigned idx2 = cp->bit_idx[1];
 
@@ -1958,8 +2104,10 @@ bool of_CMPZ(vthread_t thr, vvp_code_t cp)
 	    if (idx1 >= 4) idx1 += 1;
 	    if (idx2 >= 4) idx2 += 1;
       }
-
-      thr_put_bit(thr, 4, eq);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cmpz ...\n");
+#endif
+      thr->flags[4] = eq;
 
       return true;
 }
@@ -2002,25 +2150,31 @@ bool of_CVT_RU(vthread_t thr, vvp_code_t cp)
 
 bool of_CVT_RV(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned base = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
       vvp_vector4_t vector = vthread_bits_to_vector(thr, base, wid);
       double val;
       vector4_to_value(vector, val, false);
       thr->push_real(val);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cvt/rv ...\n");
+#endif
       return true;
 }
 
 bool of_CVT_RV_S(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned base = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
       vvp_vector4_t vector = vthread_bits_to_vector(thr, base, wid);
       double val;
       vector4_to_value(vector, val, true);
       thr->push_real(val);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cvt/rv/s ...\n");
+#endif
       return true;
 }
 
@@ -2053,6 +2207,7 @@ bool of_CVT_UR(vthread_t thr, vvp_code_t cp)
  */
 bool of_CVT_VR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       double r = thr->pop_real();
       unsigned base = cp->bit_idx[0];
       unsigned wid = cp->number;
@@ -2061,7 +2216,9 @@ bool of_CVT_VR(vthread_t thr, vvp_code_t cp)
 	/* Make sure there is enough space for the new vector. */
       thr_check_addr(thr, base+wid-1);
       thr->bits4.set_vec(base, tmp);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cvt/vr ...\n");
+#endif
       return true;
 }
 
@@ -2129,6 +2286,14 @@ bool of_DEASSIGN_WR(vthread_t, vvp_code_t cp)
       return true;
 }
 
+/*
+ * %debug/thr -- call debug_dump() on the current thread, writing to cerr.
+ */
+bool of_DEBUG_THR(vthread_t thr, vvp_code_t)
+{
+      thr->debug_dump(cerr);
+      return true;
+}
 
 /*
  * The delay takes two 32bit numbers to make up a 64bit time.
@@ -2152,7 +2317,7 @@ bool of_DELAYX(vthread_t thr, vvp_code_t cp)
 {
       vvp_time64_t delay;
 
-      assert(cp->number < 4);
+      assert(cp->number < vthread_s::WORDS_COUNT);
       delay = thr->words[cp->number].w_uint;
       schedule_vthread(thr, delay);
       return false;
@@ -2405,6 +2570,7 @@ static unsigned long* divide_bits(unsigned long*ap, unsigned long*bp, unsigned w
 
 bool of_DIV(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned adra = cp->bit_idx[0];
       unsigned adrb = cp->bit_idx[1];
       unsigned wid = cp->number;
@@ -2457,6 +2623,9 @@ bool of_DIV(vthread_t thr, vvp_code_t cp)
       delete[]ap;
       delete[]bp;
       delete[]result;
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%div ...\n");
+#endif
       return true;
 }
 
@@ -2470,6 +2639,7 @@ static void negate_words(unsigned long*val, unsigned words)
 
 bool of_DIV_S(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned adra = cp->bit_idx[0];
       unsigned adrb = cp->bit_idx[1];
       unsigned wid = cp->number;
@@ -2553,6 +2723,9 @@ bool of_DIV_S(vthread_t thr, vvp_code_t cp)
       delete[]ap;
       delete[]bp;
       delete[]result;
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%div/s ...\n");
+#endif
       return true;
 }
 
@@ -2571,6 +2744,12 @@ bool of_DUP_REAL(vthread_t thr, vvp_code_t)
       return true;
 }
 
+bool of_DUP_VEC4(vthread_t thr, vvp_code_t)
+{
+      thr->push_vec4(thr->peek_vec4(0)); // copy of entry 0 (presumably the top)
+      return true;
+}
+
 /*
  * This terminates the current thread. If there is a parent who is
  * waiting for me to die, then I schedule it. At any rate, I mark
@@ -2682,6 +2861,41 @@ bool of_EVCTLS(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/* Push flag number <cp->number> of this thread as a 1-bit vec4 value. */
+bool of_FLAG_GET_VEC4(vthread_t thr, vvp_code_t cp)
+{
+      int flag_idx = cp->number;
+      assert(flag_idx < vthread_s::FLAGS_COUNT);
+
+      thr->push_vec4(vvp_vector4_t(1, thr->flags[flag_idx]));
+
+      return true;
+}
+
+/* Set flag <cp->number> to 0, 1, z or x, selected by bit_idx[0] = 0..3. */
+bool of_FLAG_SET_IMM(vthread_t thr, vvp_code_t cp)
+{
+      int flag = cp->number;
+      int vali = cp->bit_idx[0];
+      assert(flag < vthread_s::FLAGS_COUNT);
+      assert(vali >= 0 && vali < 4);
+
+      static const vvp_bit4_t map_bit[4] = {BIT4_0, BIT4_1, BIT4_Z, BIT4_X};
+      thr->flags[flag] = map_bit[vali];
+      return true;
+}
+
+/* Pop a vec4 and copy its bit 0 into flag number <cp->number>. */
+bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t cp)
+{
+      int flag_idx = cp->number;
+      assert(flag_idx < vthread_s::FLAGS_COUNT);
+
+      vvp_vector4_t popped = thr->pop_vec4();
+      thr->flags[flag_idx] = popped.value(0);
+      return true;
+}
+
 /*
  * the %force/link instruction connects a source node to a
  * destination node. The destination node must be a signal, as it is
@@ -2714,6 +2928,7 @@ bool of_FORCE_LINK(vthread_t, vvp_code_t cp)
  */
 bool of_FORCE_V(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       vvp_net_t*net  = cp->net;
       unsigned  base = cp->bit_idx[0];
       unsigned  wid  = cp->bit_idx[1];
@@ -2728,7 +2943,9 @@ bool of_FORCE_V(vthread_t thr, vvp_code_t cp)
 	    value = coerce_to_width(value, net->fil->filter_size());
 
       net->force_vec4(value, vvp_vector2_t(vvp_vector2_t::FILL1, net->fil->filter_size()));
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%force/v ...\n");
+#endif
       return true;
 }
 
@@ -2745,6 +2962,7 @@ bool of_FORCE_WR(vthread_t thr, vvp_code_t cp)
 
 bool of_FORCE_X0(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       vvp_net_t*net = cp->net;
       unsigned base = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
@@ -2781,7 +2999,9 @@ bool of_FORCE_X0(vthread_t thr, vvp_code_t cp)
       value.set_vec(index, vector);
 
       net->force_vec4(value, mask);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%force/x0 ...\n");
+#endif
       return true;
 }
 
@@ -2838,41 +3058,11 @@ bool of_FREE(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
-static bool of_INV_wide(vthread_t thr, vvp_code_t cp)
-{
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned wid = cp->bit_idx[1];
-
-      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
-      thr->bits4.set_vec(idx1, ~val);
-
-      return true;
-}
-
-static bool of_INV_narrow(vthread_t thr, vvp_code_t cp)
-{
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned wid = cp->bit_idx[1];
-
-      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    thr_put_bit(thr, idx1, ~lb);
-	    idx1 += 1;
-      }
-
-      return true;
-}
-
 bool of_INV(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
-
-      if (cp->number <= 4)
-	    cp->opcode = &of_INV_narrow;
-      else
-	    cp->opcode = &of_INV_wide;
-
-      return cp->opcode(thr, cp);
+      vvp_vector4_t val = thr->pop_vec4();
+      thr->push_vec4(~val);
+      return true;
 }
 
 
@@ -2943,7 +3133,7 @@ static uint64_t vector_to_index(vthread_t thr, unsigned base,
 {
       uint64_t v = 0;
       bool unknown_flag = false;
-
+#if 0
       vvp_bit4_t vv = BIT4_0;
       for (unsigned i = 0 ;  i < width ;  i += 1) {
 	    vv = thr_get_bit(thr, base);
@@ -2966,9 +3156,11 @@ static uint64_t vector_to_index(vthread_t thr, unsigned base,
 		  v |= pad << i;
 	    }
       }
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: vector_to_index(...)\n");
+#endif
 	/* Set bit 4 as a flag if the input is unknown. */
-      thr_put_bit(thr, 4, unknown_flag ? BIT4_1 : BIT4_0);
+      thr->flags[4] = unknown_flag? BIT4_1 : BIT4_0;
 
       return v;
 }
@@ -3017,7 +3209,7 @@ bool of_IX_GETV(vthread_t thr, vvp_code_t cp)
 	    thr->words[index].w_uint = 0;
 
 	/* Set bit 4 as a flag if the input is unknown. */
-      thr_put_bit(thr, 4, known_flag ? BIT4_0 : BIT4_1);
+      thr->flags[4] = known_flag ? BIT4_0 : BIT4_1;
 
       return true;
 }
@@ -3048,11 +3240,60 @@ bool of_IX_GETV_S(vthread_t thr, vvp_code_t cp)
 	    thr->words[index].w_int = 0;
 
 	/* Set bit 4 as a flag if the input is unknown. */
-      thr_put_bit(thr, 4, known_flag ? BIT4_0 : BIT4_1);
+      thr->flags[4] = known_flag? BIT4_0 : BIT4_1;
 
       return true;
 }
 
+/* Pop a vec4 and convert it to a 64bit index, sign extending if signed_flag.
+ * Any x/z bit yields 0 with flags[4] set to 1; otherwise flags[4] is 0. */
+static uint64_t vec4_to_index(vthread_t thr, bool signed_flag)
+{
+      vvp_vector4_t val = thr->pop_vec4();
+      uint64_t v = 0;
+      bool unknown_flag = false;
+
+      vvp_bit4_t vv = BIT4_0;
+      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
+	    vv = val.value(idx);
+	    if (bit4_is_xz(vv)) {
+		  v = 0UL;
+		  unknown_flag = true;
+		  break;
+	    }
+	      // A shift of 64 or more is undefined; drop bits that do not fit.
+	    if (idx < 8*sizeof(v)) v |= (uint64_t) vv << idx;
+      }
+
+      if (signed_flag && !unknown_flag) {
+	    uint64_t pad = vv;
+	    for (unsigned idx = val.size() ; idx < 8*sizeof(v) ; idx += 1)
+		  v |= pad << idx;
+      }
+      thr->flags[4] = unknown_flag? BIT4_1 : BIT4_0;
+      return v;
+}
+
+/* %ix/vec4 <idx>: pop a vec4 and store it, unsigned, in index word <idx>.
+ * vec4_to_index() sets flags[4] to 1 if the popped value has x/z bits. */
+bool of_IX_VEC4(vthread_t thr, vvp_code_t cp)
+{
+      unsigned use_idx = cp->number;
+      assert(use_idx < vthread_s::WORDS_COUNT);
+      thr->words[use_idx].w_uint = vec4_to_index(thr, false);
+      return true;
+}
+
+/* %ix/vec4/s <idx>: pop a vec4, sign extend it to 64 bits, and store the
+ * result (as w_uint) in index word <idx>. flags[4] is 1 on x/z input. */
+bool of_IX_VEC4_S(vthread_t thr, vvp_code_t cp)
+{
+      unsigned use_idx = cp->number;
+      assert(use_idx < vthread_s::WORDS_COUNT);
+      thr->words[use_idx].w_uint = vec4_to_index(thr, true);
+      return true;
+}
+
 /*
  * The various JMP instruction work simply by pulling the new program
  * counter from the instruction and resuming. If the jump is
@@ -3074,9 +3315,12 @@ bool of_JMP(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/*
+ * %jmp/0 <pc>, <flag>
+ */
 bool of_JMP0(vthread_t thr, vvp_code_t cp)
 {
-      if (thr_get_bit(thr, cp->bit_idx[0]) == 0)
+      if (thr->flags[cp->bit_idx[0]] == BIT4_0)
 	    thr->pc = cp->cptr;
 
 	/* Normally, this returns true so that the processor just
@@ -3091,9 +3335,12 @@ bool of_JMP0(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/*
+ * %jmp/0xz <pc>, <flag>
+ */
 bool of_JMP0XZ(vthread_t thr, vvp_code_t cp)
 {
-      if (thr_get_bit(thr, cp->bit_idx[0]) != BIT4_1)
+      if (thr->flags[cp->bit_idx[0]] != BIT4_1)
 	    thr->pc = cp->cptr;
 
 	/* Normally, this returns true so that the processor just
@@ -3108,9 +3355,12 @@ bool of_JMP0XZ(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/*
+ * %jmp/1 <pc>, <flag>
+ */
 bool of_JMP1(vthread_t thr, vvp_code_t cp)
 {
-      if (thr_get_bit(thr, cp->bit_idx[0]) == 1)
+      if (thr->flags[cp->bit_idx[0]] == BIT4_1)
 	    thr->pc = cp->cptr;
 
 	/* Normally, this returns true so that the processor just
@@ -3237,7 +3487,7 @@ bool of_LOAD_AR(vthread_t thr, vvp_code_t cp)
       double word;
 
 	/* The result is 0.0 if the address is undefined. */
-      if (thr_get_bit(thr, 4) == BIT4_1) {
+      if (thr->flags[4] == BIT4_1) {
 	    word = 0.0;
       } else {
 	    word = array_get_word_r(cp->array, adr);
@@ -3258,6 +3508,7 @@ bool of_LOAD_AR(vthread_t thr, vvp_code_t cp)
  */
 bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
       unsigned adr = thr->words[3].w_int;
@@ -3285,7 +3536,9 @@ bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
 	   with BIT4_X values. */
       for (unsigned idx = word.size() ; idx < wid ; idx += 1)
 	    thr->bits4.set_bit(bit+idx, BIT4_X);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%load/av ...\n");
+#endif
       return true;
 }
 
@@ -3294,6 +3547,7 @@ bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
 */
 bool of_LOAD_DAR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
       unsigned adr = thr->words[3].w_int;
@@ -3312,7 +3566,9 @@ bool of_LOAD_DAR(vthread_t thr, vvp_code_t cp)
 
       thr_check_addr(thr, bit+word.size());
       thr->bits4.set_vec(bit, word);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%load/dar ...\n");
+#endif
       return true;
 }
 
@@ -3367,6 +3623,7 @@ bool of_LOAD_DAR_STR(vthread_t thr, vvp_code_t cp)
 #endif
 static void load_vp0_common(vthread_t thr, vvp_code_t cp, const vvp_vector4_t&sig_value)
 {
+#if 0
       unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
       int64_t addend = thr->words[0].w_int;
@@ -3394,6 +3651,9 @@ static void load_vp0_common(vthread_t thr, vvp_code_t cp, const vvp_vector4_t&si
 	   directly to skip the excess calls to thr_check_addr. */
       thr->bits4.setarray(bit, wid, val);
       delete[]val;
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: load_vp0_common()\n");
+#endif
 }
 
 /*
@@ -3408,6 +3668,7 @@ static void load_vp0_common(vthread_t thr, vvp_code_t cp, const vvp_vector4_t&si
  */
 bool of_LOAD_AVP0(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned wid = cp->bit_idx[1];
       unsigned adr = thr->words[3].w_int;
 
@@ -3426,11 +3687,15 @@ bool of_LOAD_AVP0(vthread_t thr, vvp_code_t cp)
       sig_value.copy_bits(array_get_word(cp->array, adr));
 
       load_vp0_common(thr, cp, sig_value);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%load/avp0 ...\n");
+#endif
       return true;
 }
 
 bool of_LOAD_AVP0_S(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned wid = cp->bit_idx[1];
       unsigned adr = thr->words[3].w_int;
 
@@ -3451,11 +3716,14 @@ bool of_LOAD_AVP0_S(vthread_t thr, vvp_code_t cp)
       sig_value.copy_bits(tmp);
 
       load_vp0_common(thr, cp, sig_value);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%load/avp0/s ... \n");
+#endif
       return true;
 }
 
 /*
- * %load/avx.p <bit>, <array-label>, <idx> ;
+ * %load/avx.p <bit>, <array-label>, <wid> ;
  *
  * <bit> is the thread bit address for the result
  * <array-label> is the array to access, and
@@ -3465,6 +3733,7 @@ bool of_LOAD_AVP0_S(vthread_t thr, vvp_code_t cp)
  */
 bool of_LOAD_AVX_P(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned bit = cp->bit_idx[0];
       unsigned index = cp->bit_idx[1];
       unsigned adr = thr->words[3].w_int;
@@ -3486,7 +3755,9 @@ bool of_LOAD_AVX_P(vthread_t thr, vvp_code_t cp)
       }
 
       thr->words[index].w_int = use_index + 1;
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%load/avx.p ...\n");
+#endif
       return true;
 }
 
@@ -3542,8 +3813,7 @@ bool of_LOAD_STRA(vthread_t thr, vvp_code_t cp)
       unsigned adr = thr->words[idx].w_int;
       string word;
 
-	/* The result is 0.0 if the address is undefined. */
-      if (thr_get_bit(thr, 4) == BIT4_1) {
+      if (thr->flags[4] == BIT4_1) {
 	    word = "";
       } else {
 	    word = array_get_word_str(cp->array, adr);
@@ -3584,6 +3854,7 @@ static void load_base(vvp_code_t cp, vvp_vector4_t&dst)
       sig->vec4_value(dst);
 }
 
+#if 0
 bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
 {
       unsigned bit = cp->bit_idx[0];
@@ -3609,6 +3880,20 @@ bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
 
       return true;
 }
+#endif
+
+/*
+ * %load/vec4 <net> -- read the value of <net> and push it on the vec4 stack.
+ */
+bool of_LOAD_VEC4(vthread_t thr, vvp_code_t cp)
+{
+      vvp_vector4_t sig_value;
+      load_base(cp, sig_value);
+
+      thr->push_vec4(sig_value);
+
+      return true;
+}
 
 /*
  * This is like of_LOAD_VEC, but includes an add of an integer value from
@@ -3654,6 +3939,7 @@ bool of_LOAD_VP0_S(vthread_t thr, vvp_code_t cp)
  */
 bool of_LOAD_X1P(vthread_t thr, vvp_code_t cp)
 {
+#if 0
 	// <bit> is the thread bit to load
       assert(cp->bit_idx[0] >= 4);
       unsigned bit = cp->bit_idx[0];
@@ -3681,10 +3967,12 @@ bool of_LOAD_X1P(vthread_t thr, vvp_code_t cp)
 
 	    thr_put_bit(thr, bit+idx, val);
       }
-
+#else
+      fprintf(stderr, "XXXX NOT IMLEMENTED: %%load/x1p ...\n");
+#endif
       return true;
 }
-
+#if 0
 static void do_verylong_mod(vthread_t thr, vvp_code_t cp,
 			    bool left_is_neg, bool right_is_neg)
 {
@@ -3809,7 +4097,7 @@ static void do_verylong_mod(vthread_t thr, vvp_code_t cp,
 
       return;
 }
-
+#endif
 bool of_MAX_WR(vthread_t thr, vvp_code_t)
 {
       double r = thr->pop_real();
@@ -3842,6 +4130,7 @@ bool of_MIN_WR(vthread_t thr, vvp_code_t)
 
 bool of_MOD(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       if(cp->number <= 8*sizeof(unsigned long long)) {
@@ -3884,12 +4173,15 @@ bool of_MOD(vthread_t thr, vvp_code_t cp)
  x_out:
       for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
 	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%mod ...\n");
+#endif
       return true;
 }
 
 bool of_MOD_S(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
 	/* Handle the case that we can fit the bits into a long-long
@@ -3947,7 +4239,9 @@ bool of_MOD_S(vthread_t thr, vvp_code_t cp)
  x_out:
       for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
 	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%mod/s ...\n");
+#endif
       return true;
 }
 
@@ -3972,7 +4266,7 @@ bool of_MOD_WR(vthread_t thr, vvp_code_t)
  *   more directly does the job. All the of_MOV*_ functions are
  *   functions that of_MOV might use to replace itself.
  */
-
+#if 0
 static bool of_MOV1XZ_(vthread_t thr, vvp_code_t cp)
 {
       thr_check_addr(thr, cp->bit_idx[0]+cp->number-1);
@@ -3994,9 +4288,10 @@ static bool of_MOV_(vthread_t thr, vvp_code_t cp)
 
       return true;
 }
-
+#endif
 bool of_MOV(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       if (cp->bit_idx[1] >= 4) {
@@ -4007,23 +4302,104 @@ bool of_MOV(vthread_t thr, vvp_code_t cp)
 	    cp->opcode = &of_MOV1XZ_;
 	    return cp->opcode(thr, cp);
       }
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%mov ...\n");
+#endif
+      return true;
+}
+
+/*
+ * %pad/s <wid> -- pop a vec4, resize it to <wid> bits, push the result.
+ */
+bool of_PAD_S(vthread_t thr, vvp_code_t cp)
+{
+      unsigned wid = cp->number;
+
+      vvp_vector4_t val = thr->pop_vec4();
+      unsigned old_size = val.size();
+      val.resize(wid);
+
+	// Sign-extend: when widening, copy the old top bit into each new bit.
+      if (old_size < wid) {
+	    vvp_bit4_t sb = val.value(old_size-1);
+	    for (unsigned idx = old_size ; idx < wid ; idx += 1)
+		  val.set_bit(idx, sb);
+      }
+
+      thr->push_vec4(val);
 
       return true;
 }
 
-bool of_PAD(vthread_t thr, vvp_code_t cp)
+/*
+ * %pad/u <wid> -- pop a vec4, resize it to <wid> bits (new bits are 0), push.
+ */
+bool of_PAD_U(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
+      unsigned wid = cp->number;
 
-      vvp_bit4_t pad_bit;
-      if (cp->bit_idx[1] < 4)
-            pad_bit = thr_index_to_bit4[cp->bit_idx[1]];
-      else
-            pad_bit = thr->bits4.value(cp->bit_idx[1]);
+      vvp_vector4_t val = thr->pop_vec4();
+      unsigned old_size = val.size();
+      val.resize(wid);
+
+      if (old_size < wid) {
+	    vvp_bit4_t pad = BIT4_0;
+	    for (unsigned idx = old_size ; idx < wid ; idx += 1)
+		  val.set_bit(idx,pad);
+      }
+
+      thr->push_vec4(val);
+
+      return true;
+}
+
+/*
+ * %part <wid>
+ * Pop the canonical base index of the part select, then pop the
+ * value to select from. Push the <wid>-bit result, in which bits that
+ * fall outside the value (or all bits, if the index does not convert) are x.
+ */
+bool of_PART(vthread_t thr, vvp_code_t cp)
+{
+      unsigned wid = cp->number;
+
+      vvp_vector4_t base4 = thr->pop_vec4();
+      vvp_vector4_t value = thr->pop_vec4();
+
+      vvp_vector4_t res (wid, BIT4_X);
+
+	// NOTE: This is treating the vector as signed. Is that correct?
+      int32_t base;
+      bool value_ok = vector4_to_value(base4, base, true);
+      if (! value_ok) {
+	    thr->push_vec4(res);
+	    return true;
+      }
+	// The select starts at or above the top of the value: all x.
+      if (base >= (int32_t)value.size()) {
+	    thr->push_vec4(res);
+	    return true;
+      }
+	// The select ends at or below bit 0 of the value: all x.
+      if ((base+(int)wid) <= 0) {
+	    thr->push_vec4(res);
+	    return true;
+      }
+	// Skip any select bits below bit 0; they stay x in the result.
+      long vbase = 0;
+      if (base < 0) {
+	    vbase = -base;
+	    wid -= vbase;
+	    base = 0;
+      }
+	// Clip any select bits above the top of the value; they stay x.
+      if ((base+wid) > value.size()) {
+	    wid = value.size() - base;
+      }
+
+      res .set_vec(vbase, value.subvalue(base, wid));
+      thr->push_vec4(res);
 
-      thr_check_addr(thr, cp->bit_idx[0]+cp->number-1);
-      vvp_vector4_t tmp (cp->number, pad_bit);
-      thr->bits4.set_vec(cp->bit_idx[0], tmp);
       return true;
 }
 
@@ -4041,6 +4417,7 @@ bool of_MOV_WU(vthread_t thr, vvp_code_t cp)
 
 bool of_MOVI(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned dst = cp->bit_idx[0];
       static unsigned long val[8] = {0, 0, 0, 0, 0, 0, 0, 0};
       unsigned wid = cp->number;
@@ -4060,12 +4437,15 @@ bool of_MOVI(vthread_t thr, vvp_code_t cp)
 	    wid -= trans;
 	    dst += trans;
       }
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%movi ...\n");
+#endif
       return true;
 }
 
 bool of_MUL(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned adra = cp->bit_idx[0];
       unsigned adrb = cp->bit_idx[1];
       unsigned wid = cp->number;
@@ -4119,6 +4499,9 @@ bool of_MUL(vthread_t thr, vvp_code_t cp)
       delete[]ap;
       delete[]bp;
       delete[]res;
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%mul ...\n");
+#endif
       return true;
 }
 
@@ -4133,6 +4516,7 @@ bool of_MUL_WR(vthread_t thr, vvp_code_t)
 
 bool of_MULI(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned adr = cp->bit_idx[0];
       unsigned long imm = cp->bit_idx[1];
       unsigned wid = cp->number;
@@ -4163,9 +4547,12 @@ bool of_MULI(vthread_t thr, vvp_code_t cp)
       thr->bits4.setarray(adr, wid, res);
       delete[]val;
       delete[]res;
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%muli ...\n");
+#endif
       return true;
 }
-
+#if 0
 static bool of_NAND_wide(vthread_t thr, vvp_code_t cp)
 {
       unsigned idx1 = cp->bit_idx[0];
@@ -4196,9 +4583,10 @@ static bool of_NAND_narrow(vthread_t thr, vvp_code_t cp)
 
       return true;
 }
-
+#endif
 bool of_NAND(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       if (cp->number <= 4)
@@ -4207,6 +4595,10 @@ bool of_NAND(vthread_t thr, vvp_code_t cp)
 	    cp->opcode = &of_NAND_wide;
 
       return cp->opcode(thr, cp);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%nand ...\n");
+      return true;
+#endif
 }
 
 /*
@@ -4267,6 +4659,7 @@ bool of_NOOP(vthread_t, vvp_code_t)
 
 bool of_NORR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       vvp_bit4_t lb = BIT4_1;
@@ -4285,7 +4678,9 @@ bool of_NORR(vthread_t thr, vvp_code_t cp)
       }
 
       thr_put_bit(thr, cp->bit_idx[0], lb);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%nor/r ...\n");
+#endif
       return true;
 }
 
@@ -4302,6 +4697,7 @@ bool of_NULL(vthread_t thr, vvp_code_t)
 
 bool of_ANDR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       vvp_bit4_t lb = BIT4_1;
@@ -4320,12 +4716,15 @@ bool of_ANDR(vthread_t thr, vvp_code_t cp)
       }
 
       thr_put_bit(thr, cp->bit_idx[0], lb);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%and/r ...\n");
+#endif
       return true;
 }
 
 bool of_NANDR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       vvp_bit4_t lb = BIT4_0;
@@ -4344,12 +4743,15 @@ bool of_NANDR(vthread_t thr, vvp_code_t cp)
       }
 
       thr_put_bit(thr, cp->bit_idx[0], lb);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%nand/r ...\n");
+#endif
       return true;
 }
 
 bool of_ORR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       vvp_bit4_t lb = BIT4_0;
@@ -4368,12 +4770,15 @@ bool of_ORR(vthread_t thr, vvp_code_t cp)
       }
 
       thr_put_bit(thr, cp->bit_idx[0], lb);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%orr ...\n");
+#endif
       return true;
 }
 
 bool of_XORR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       vvp_bit4_t lb = BIT4_0;
@@ -4391,12 +4796,15 @@ bool of_XORR(vthread_t thr, vvp_code_t cp)
       }
 
       thr_put_bit(thr, cp->bit_idx[0], lb);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%xorr ...\n");
+#endif
       return true;
 }
 
 bool of_XNORR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       vvp_bit4_t lb = BIT4_1;
@@ -4414,53 +4822,22 @@ bool of_XNORR(vthread_t thr, vvp_code_t cp)
       }
 
       thr_put_bit(thr, cp->bit_idx[0], lb);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%xnorr...\n");
+#endif
       return true;
 }
 
-static bool of_OR_wide(vthread_t thr, vvp_code_t cp)
+bool of_OR(vthread_t thr, vvp_code_t)
 {
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned wid = cp->number;
-
-      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
-      val |= vthread_bits_to_vector(thr, idx2, wid);
-      thr->bits4.set_vec(idx1, val);
-
+      vvp_vector4_t vala = thr->pop_vec4();
+      vvp_vector4_t valb = thr->pop_vec4();
+      vala |= valb;
+      thr->push_vec4(vala);
       return true;
 }
 
-static bool of_OR_narrow(vthread_t thr, vvp_code_t cp)
-{
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned wid = cp->number;
-
-      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-	    thr_put_bit(thr, idx1, lb|rb);
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
-      }
-
-      return true;
-}
-
-bool of_OR(vthread_t thr, vvp_code_t cp)
-{
-      assert(cp->bit_idx[0] >= 4);
-
-      if (cp->number <= 4)
-	    cp->opcode = &of_OR_narrow;
-      else
-	    cp->opcode = &of_OR_wide;
-
-      return cp->opcode(thr, cp);
-}
-
+#if 0
 static bool of_NOR_wide(vthread_t thr, vvp_code_t cp)
 {
       assert(cp->bit_idx[0] >= 4);
@@ -4493,17 +4870,20 @@ static bool of_NOR_narrow(vthread_t thr, vvp_code_t cp)
 
       return true;
 }
-
+#endif
 bool of_NOR(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
-
+#if 0
       if (cp->number <= 4)
 	    cp->opcode = &of_NOR_narrow;
       else
 	    cp->opcode = &of_NOR_wide;
 
       return cp->opcode(thr, cp);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%nor ...\n");
+      return true;
+#endif
 }
 
 /*
@@ -4540,8 +4920,19 @@ bool of_POP_STR(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/*
+ *  %pop/vec4 <number> -- call pop_vec4(<number>) (presumably drops that many).
+ */
+bool of_POP_VEC4(vthread_t thr, vvp_code_t cp)
+{
+      unsigned cnt = cp->number;
+      thr->pop_vec4(cnt);
+      return true;
+}
+
 bool of_POW(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       unsigned idx = cp->bit_idx[0];
@@ -4574,12 +4965,15 @@ bool of_POW(vthread_t thr, vvp_code_t cp)
       for (unsigned jdx = 0;  jdx < wid;  jdx += 1)
 	    thr_put_bit(thr, cp->bit_idx[0]+jdx,
 	                result.value(jdx) ? BIT4_1 : BIT4_0);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%pow ...\n");
+#endif
       return true;
 }
 
 bool of_POW_S(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       unsigned idx = cp->bit_idx[0];
@@ -4607,7 +5001,9 @@ bool of_POW_S(vthread_t thr, vvp_code_t cp)
         /* Copy the result. */
       for (unsigned jdx = 0;  jdx < wid;  jdx += 1)
 	    thr_put_bit(thr, cp->bit_idx[0]+jdx, res.value(jdx));
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%pow/s ...\n");
+#endif
       return true;
 }
 
@@ -4686,6 +5082,7 @@ bool of_PROP_STR(vthread_t thr, vvp_code_t cp)
  */
 bool of_PROP_V(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned pid = cp->bit_idx[0];
       unsigned dst = cp->bit_idx[1];
       unsigned wid = cp->number;
@@ -4706,7 +5103,9 @@ bool of_PROP_V(vthread_t thr, vvp_code_t cp)
 	    for (unsigned idx = val.size() ; idx < wid ; idx += 1)
 		  thr->bits4.set_bit(dst+idx, BIT4_X);
       }
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%prop/v ...\n");
+#endif
       return true;
 }
 
@@ -4748,8 +5147,53 @@ bool of_PUSHI_STR(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/*
+ * %pushi/vec4 <vala>, <valb>, <wid>
+ *
+ * Push a <wid>-bit constant onto the vec4 stack. Bit i of the constant
+ * is selected by bit i of <valb> (high) and bit i of <vala> (low):
+ *
+ *     valb vala   result
+ *      0    0       0
+ *      0    1       1
+ *      1    0       z
+ *      1    1       x
+ *
+ * The immediates carry only 32 bits, so result bits 32 and up are 0.
+ */
+bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp)
+{
+      uint32_t lo_bits = cp->bit_idx[0];
+      uint32_t hi_bits = cp->bit_idx[1];
+      unsigned wid     = cp->number;
+
+	// Every bit starts out as 0, so only the 1, z and x bits
+	// need to be written below.
+      vvp_vector4_t res (wid, BIT4_0);
+
+      for (unsigned bit = 0 ; bit < wid ; bit += 1) {
+	      // Past the 32 immediate bits there is nothing left to
+	      // decode, and the rest of the result is already 0.
+	    if (bit >= 32)
+		  break;
+	    unsigned code = (((hi_bits >> bit) & 1) << 1)
+		  | ((lo_bits >> bit) & 1);
+
+	    if (code == 1)
+		  res.set_bit(bit, BIT4_1);
+	    else if (code == 2)
+		  res.set_bit(bit, BIT4_Z);
+	    else if (code == 3)
+		  res.set_bit(bit, BIT4_X);
+      }
+
+      thr->push_vec4(res);
+      return true;
+}
+
 bool of_PUSHV_STR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned src = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
 
@@ -4780,6 +5224,9 @@ bool of_PUSHV_STR(vthread_t thr, vvp_code_t cp)
       }
 
       thr->push_str(val);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%push/str ...\n");
+#endif
       return true;
 }
 
@@ -4788,6 +5235,7 @@ bool of_PUSHV_STR(vthread_t thr, vvp_code_t cp)
  */
 bool of_PUTC_STR_V(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned muxr = cp->bit_idx[0];
       unsigned base = cp->bit_idx[1];
 
@@ -4827,7 +5275,9 @@ bool of_PUTC_STR_V(vthread_t thr, vvp_code_t cp)
 	   variable so that the new value propagates. */
       val[mux] = tmp_val;
       vvp_send_string(vvp_net_ptr_t(cp->net, 0), val, thr->wt_context);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%putc/str/v ...\n");
+#endif
       return true;
 }
 
@@ -4912,6 +5362,7 @@ bool of_SCOPY(vthread_t thr, vvp_code_t)
  */
 bool of_SET_AV(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
       unsigned off = thr->words[1].w_int;
@@ -4921,6 +5372,9 @@ bool of_SET_AV(vthread_t thr, vvp_code_t cp)
       vvp_vector4_t value = vthread_bits_to_vector(thr, bit, wid);
 
       array_set_word(cp->array, adr, off, value);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%set/av ...\n");
+#endif
       return true;
 }
 
@@ -4929,6 +5383,7 @@ bool of_SET_AV(vthread_t thr, vvp_code_t cp)
  */
 bool of_SET_DAR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
       unsigned adr = thr->words[3].w_int;
@@ -4944,6 +5399,9 @@ bool of_SET_DAR(vthread_t thr, vvp_code_t cp)
       assert(darray);
 
       darray->set_word(adr, value);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%set/dar ...\n");
+#endif
       return true;
 }
 
@@ -4952,6 +5410,7 @@ bool of_SET_DAR(vthread_t thr, vvp_code_t cp)
  */
 bool of_SET_DAR_OBJ(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       unsigned adr = thr->words[cp->number].w_int;
       unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
@@ -4963,6 +5422,9 @@ bool of_SET_DAR_OBJ(vthread_t thr, vvp_code_t cp)
       assert(darray);
 
       darray->set_word(adr, value);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%set/dar/obj ...\n");
+#endif
       return true;
 }
 
@@ -5013,6 +5475,7 @@ bool of_SET_DAR_OBJ_STR(vthread_t thr, vvp_code_t cp)
  */
 bool of_SET_VEC(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[1] > 0);
       unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
@@ -5022,7 +5485,9 @@ bool of_SET_VEC(vthread_t thr, vvp_code_t cp)
 
       vvp_send_vec4(ptr, vthread_bits_to_vector(thr, bit, wid),
                     thr->wt_context);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%set/vec ...\n");
+#endif
       return true;
 }
 
@@ -5039,6 +5504,7 @@ bool of_SET_VEC(vthread_t thr, vvp_code_t cp)
  */
 bool of_SET_X0(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       vvp_net_t*net = cp->net;
       unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
@@ -5083,10 +5549,13 @@ bool of_SET_X0(vthread_t thr, vvp_code_t cp)
 
       vvp_net_ptr_t ptr (net, 0);
       vvp_send_vec4_pv(ptr, bit_vec, index, wid, sig->value_size(), thr->wt_context);
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%set/x0 ...\n");
+#endif
       return true;
 }
 
+#if 0
 bool of_SHIFTL_I0(vthread_t thr, vvp_code_t cp)
 {
       int base = cp->bit_idx[0];
@@ -5132,7 +5601,52 @@ bool of_SHIFTL_I0(vthread_t thr, vvp_code_t cp)
       }
       return true;
 }
+#endif
 
+/*
+ * %shiftl <idx>
+ * Pop a vec4 value, shift it left by the amount in index register
+ * <idx>, and push the same-width result. The result is all 'bx when
+ * flags[4] is 1. A negative amount shifts right, padding with 'bx.
+ */
+bool of_SHIFTL(vthread_t thr, vvp_code_t cp)
+{
+      int idx_reg = cp->number;
+      int amount = thr->words[idx_reg].w_int;
+
+      vvp_vector4_t vec = thr->pop_vec4();
+      int width = vec.size();
+
+      if (thr->flags[4] == BIT4_1) {
+	      // An undefined shift amount makes the whole result 'bx.
+	    vec = vvp_vector4_t(width, BIT4_X);
+
+      } else if (amount >= width) {
+	      // All bits are shifted out the top; the result is 0.
+	    vec = vvp_vector4_t(width, BIT4_0);
+
+      } else if (amount > 0) {
+	    vvp_vector4_t zero_fill (amount, BIT4_0);
+	    vec.mov(amount, 0, width-amount);
+	    vec.set_vec(0, zero_fill);
+
+      } else if (amount < -width) {
+	    vec = vvp_vector4_t(width, BIT4_X);
+
+      } else if (amount < 0) {
+	      // Negative left shift is a right shift padded with 'bx.
+	    int right_amount = -amount;
+	    vvp_vector4_t x_fill (right_amount, BIT4_X);
+	    vec.mov(0, right_amount, width-right_amount);
+	    vec.set_vec(width-right_amount, x_fill);
+      }
+
+      thr->push_vec4(vec);
+      return true;
+}
+
+
+#if 0
 /*
  * This is an unsigned right shift:
  *
@@ -5184,7 +5698,51 @@ bool of_SHIFTR_I0(vthread_t thr, vvp_code_t cp)
       }
       return true;
 }
+#endif
 
+/*
+ * %shiftr <idx>
+ * This is an unsigned right shift of the vec4 value popped from the
+ * stack; the same-width result is pushed back. <idx> selects the index
+ * register with the shift amount. Flag bit 4 set makes the result 'bx.
+ */
+bool of_SHIFTR(vthread_t thr, vvp_code_t cp)
+{
+      int use_index = cp->number;
+      int shift = thr->words[use_index].w_int;
+
+      vvp_vector4_t val = thr->pop_vec4();
+      int wid  = val.size();
+
+      if (thr->flags[4] == BIT4_1) {
+	    val = vvp_vector4_t(wid, BIT4_X);
+
+      } else if (shift >= wid) {
+	      // Every bit is shifted out (including shift == wid), so
+	      // write a constant 0 result.
+	    val = vvp_vector4_t(wid, BIT4_0);
+
+      } else if (shift > 0) {
+	    val.mov(0, shift, wid-shift);
+	    vvp_vector4_t tmp (shift, BIT4_0);
+	    val.set_vec(wid-shift, tmp);
+
+      } else if (shift < -wid) {
+	    val = vvp_vector4_t(wid, BIT4_X);
+
+      } else if (shift < 0) {
+	      // Negative right shift is a left shift, padded with 'bx.
+	    int use_shift = -shift;
+	    val.mov(use_shift, 0, wid-use_shift);
+	    vvp_vector4_t tmp (use_shift, BIT4_X);
+	    val.set_vec(0, tmp);
+      }
+
+      thr->push_vec4(val);
+      return true;
+}
+
+#if 0
 bool of_SHIFTR_S_I0(vthread_t thr, vvp_code_t cp)
 {
       int base = cp->bit_idx[0];
@@ -5226,6 +5784,14 @@ bool of_SHIFTR_S_I0(vthread_t thr, vvp_code_t cp)
       }
       return true;
 }
+#endif
+
+/* Placeholder: only reports itself as unimplemented; thread state and
+   the vec4 stack are left untouched, so both arguments are unused. */
+bool of_SHIFTR_S(vthread_t, vvp_code_t)
+{
+      fprintf(stderr, "XXXX of_SHIFTR_S not implemented\n");
+      return true;
+}
 
 bool of_STORE_DAR_R(vthread_t thr, vvp_code_t cp)
 {
@@ -5353,6 +5919,7 @@ bool of_STORE_PROP_STR(vthread_t thr, vvp_code_t cp)
  */
 bool of_STORE_PROP_V(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       size_t pid = cp->bit_idx[0];
       unsigned src = cp->bit_idx[1];
       unsigned wid = cp->number;
@@ -5364,6 +5931,9 @@ bool of_STORE_PROP_V(vthread_t thr, vvp_code_t cp)
       assert(cobj);
 
       cobj->set_vec4(pid, val);
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%store/prop/v ...\n");
+#endif
       return true;
 }
 
@@ -5416,27 +5986,51 @@ bool of_STORE_STRA(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/*
+ * %store/vec4 <var-label>, <wid>
+ * Pop a vec4 value, resize it down to <wid> bits when it is wider,
+ * and send it to port 0 of the <var-label> net.
+ */
+bool of_STORE_VEC4(vthread_t thr, vvp_code_t cp)
+{
+      const unsigned store_wid = cp->bit_idx[0];
+
+      vvp_vector4_t top = thr->pop_vec4();
+      assert(top.size() >= store_wid);
+      if (top.size() > store_wid)
+	    top.resize(store_wid);
+
+      vvp_net_ptr_t dst (cp->net, 0);
+      vvp_send_vec4(dst, top, thr->wt_context);
+      return true;
+}
 
 bool of_SUB(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t r = thr->pop_vec4();
+      vvp_vector4_t l = thr->pop_vec4();
 
-      unsigned long*lva = vector_to_array(thr, cp->bit_idx[0], cp->number);
-      unsigned long*lvb = vector_to_array(thr, cp->bit_idx[1], cp->number);
+      unsigned wid = l.size();
+      assert(wid == r.size());
+
+      unsigned long*lva = l.subarray(0,wid);
+      unsigned long*lvb = r.subarray(0,wid);
       if (lva == 0 || lvb == 0)
 	    goto x_out;
 
 
       unsigned long carry;
       carry = 1;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1)
+      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < wid ;  idx += 1)
 	    lva[idx] = add_with_carry(lva[idx], ~lvb[idx], carry);
 
 
 	/* We know from the vector_to_array that the address is valid
 	   in the thr->bitr4 vector, so just do the set bit. */
 
-      thr->bits4.setarray(cp->bit_idx[0], cp->number, lva);
+      l.setarray(0,wid,lva);
+      thr->push_vec4(l);
+
       delete[]lva;
       delete[]lvb;
 
@@ -5447,7 +6041,7 @@ bool of_SUB(vthread_t thr, vvp_code_t cp)
       delete[]lvb;
 
       vvp_vector4_t tmp(cp->number, BIT4_X);
-      thr->bits4.set_vec(cp->bit_idx[0], tmp);
+      thr->push_vec4(tmp);
 
       return true;
 }
@@ -5462,6 +6056,7 @@ bool of_SUB_WR(vthread_t thr, vvp_code_t)
 
 bool of_SUBI(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       unsigned word_count = (cp->number+CPU_WORD_BITS-1)/CPU_WORD_BITS;
@@ -5494,6 +6089,10 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp)
       thr->bits4.set_vec(cp->bit_idx[0], tmp);
 
       return true;
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%subi ...\n");
+      return true;
+#endif
 }
 
 /*
@@ -5522,11 +6121,11 @@ bool of_SUBSTR(vthread_t thr, vvp_code_t cp)
  */
 bool of_SUBSTR_V(vthread_t thr, vvp_code_t cp)
 {
-      string&val = thr->peek_str(0);
+	//string&val = thr->peek_str(0);
       uint32_t bitl = cp->bit_idx[0];
       uint32_t sel = cp->bit_idx[1];
       unsigned wid = cp->number;
-
+#if 0
       thr_check_addr(thr, bitl+wid);
       assert(bitl >= 4);
 
@@ -5547,7 +6146,9 @@ bool of_SUBSTR_V(vthread_t thr, vvp_code_t cp)
 	    bitl += 8;
 	    use_sel += 1;
       }
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED : %%substr/v %u, %u, %u\n", bitl, sel, wid);
+#endif
       return true;
 }
 
@@ -5565,6 +6166,8 @@ bool of_FILE_LINE(vthread_t, vvp_code_t cp)
 
 /*
  * %test_nul <var-label>;
+ * Test if the object at the specified variable is nil. If so, write
+ * "1" into flags[4], otherwise write "0" into flags[4].
  */
 bool of_TEST_NUL(vthread_t thr, vvp_code_t cp)
 {
@@ -5575,9 +6178,9 @@ bool of_TEST_NUL(vthread_t thr, vvp_code_t cp)
       assert(obj);
 
       if (obj->get_object().test_nil())
-	    thr_put_bit(thr, 4, BIT4_1);
+	    thr->flags[4] = BIT4_1;
       else
-	    thr_put_bit(thr, 4, BIT4_0);
+	    thr->flags[4] = BIT4_0;
 
       return true;
 }
@@ -5643,6 +6246,7 @@ bool of_WAIT_FORK(vthread_t thr, vvp_code_t)
 
 bool of_XNOR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       unsigned idx1 = cp->bit_idx[0];
@@ -5658,13 +6262,16 @@ bool of_XNOR(vthread_t thr, vvp_code_t cp)
 	    if (idx2 >= 4)
 		  idx2 += 1;
       }
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%xnor ...\n");
+#endif
       return true;
 }
 
 
 bool of_XOR(vthread_t thr, vvp_code_t cp)
 {
+#if 0
       assert(cp->bit_idx[0] >= 4);
 
       unsigned idx1 = cp->bit_idx[0];
@@ -5695,7 +6302,9 @@ bool of_XOR(vthread_t thr, vvp_code_t cp)
 	    if (idx2 >= 4)
 		  idx2 += 1;
       }
-
+#else
+      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%xor ...\n");
+#endif
       return true;
 }