Start work on converting vec4 expressions to use stack.

Instead of using a bit4 space to hold thread vectors, create a vec4 stack--much like the real, string, and object stacks--to hold intermediate values.
2013-12-27 17:04:42 +02:00 · 2013-12-27 17:04:42 +02:00 · 5ef077fdf6
parent 92e4ca3a92
commit 5ef077fdf6
11 changed files with 1802 additions and 461 deletions
--- a/tgt-vvp/Makefile.in
+++ b/tgt-vvp/Makefile.in
@ -50,6 +50,7 @@ LDFLAGS = @LDFLAGS@
 O = vvp.o draw_class.o draw_enum.o draw_mux.o draw_net_input.o \
    draw_switch.o draw_ufunc.o draw_vpi.o \
    eval_bool.o eval_expr.o eval_object.o eval_real.o eval_string.o \
+    eval_vec4.o \
    modpath.o stmt_assign.o vector.o \
    vvp_process.o vvp_scope.o

--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@ -189,7 +189,7 @@ uint64_t get_number_immediate64(ivl_expr_t expr)
      return imm;
 }

-static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
+void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 {
      switch (ivl_expr_type(expr)) {

@ -200,7 +200,7 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 		    if (number_is_unknown(expr)) {
 			    /* We are loading a 'bx so mimic %ix/get. */
 			  fprintf(vvp_out, "    %%ix/load %u, 0, 0;\n", ix);
-			  fprintf(vvp_out, "    %%mov 4, 1, 1;\n");
+			  fprintf(vvp_out, "    %%flag_set/imm 4, 1;\n");
 			  break;
 		    }
 		    long imm = get_number_immediate(expr);
@ -210,11 +210,14 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 			  fprintf(vvp_out, "    %%ix/load %u, 0, 0; loading %ld\n", ix, imm);
 			  fprintf(vvp_out, "    %%ix/sub %u, %ld, 0;\n", ix, -imm);
 		    }
-		      /* This can not have have a X/Z value so clear bit 4. */
-		    fprintf(vvp_out, "    %%mov 4, 0, 1;\n");
+		      /* This cannot have an X/Z value so clear flag 4. */
+		    fprintf(vvp_out, "    %%flag_set/imm 4, 0;\n");
 	      }
 	      break;

+		/* Special case: There is an %ix instruction for
+		   reading index values directly from variables. In
+		   this case, try to use that special instruction. */
 	  case IVL_EX_SIGNAL: {
 		ivl_signal_t sig = ivl_expr_signal(expr);

@ -227,11 +230,8 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 			   variable array. In this case, the ix/getv
 			   will not work, so do it the hard way. */
 		      if (ivl_signal_type(sig) == IVL_SIT_REG) {
-			    struct vector_info rv;
-			    rv = draw_eval_expr(expr, 0);
-			    fprintf(vvp_out, "    %%ix/get%s %u, %u, %u;\n",
-				    type, ix, rv.base, rv.wid);
-			    clr_vector(rv);
+			    draw_eval_vec4(expr, 0);
+			    fprintf(vvp_out, "    %%ix/vec4%s %u;\n", type, ix);
 			    break;
 		      }

@ -240,11 +240,8 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 		            assert(! number_is_unknown(ixe));
 		            word = get_number_immediate(ixe);
 		      } else {
-		            struct vector_info rv;
-		            rv = draw_eval_expr(expr, 0);
-		            fprintf(vvp_out, "    %%ix/get%s %u, %u, %u;\n",
-		                    type, ix, rv.base, rv.wid);
-		            clr_vector(rv);
+		            draw_eval_vec4(expr, 0);
+		            fprintf(vvp_out, "    %%ix/vec4%s %u;\n", type, ix);
 		            break;
 		      }
 		}
@ -254,20 +251,15 @@ static void eval_logic_into_integer(ivl_expr_t expr, unsigned ix)
 		break;
 	  }

-	  default: {
-		  struct vector_info rv;
-		  rv = draw_eval_expr(expr, 0);
-		    /* Is this a signed expression? */
-		  if (ivl_expr_signed(expr)) {
-		      fprintf(vvp_out, "    %%ix/get/s %u, %u, %u;\n",
-		                       ix, rv.base, rv.wid);
-		  } else {
-		      fprintf(vvp_out, "    %%ix/get %u, %u, %u;\n",
-		                       ix, rv.base, rv.wid);
-		  }
-		  clr_vector(rv);
-		  break;
+	  default:
+	    draw_eval_vec4(expr, 0);
+	      /* Is this a signed expression? */
+	    if (ivl_expr_signed(expr)) {
+		  fprintf(vvp_out, "    %%ix/vec4/s %u;\n", ix);
+	    } else {
+		  fprintf(vvp_out, "    %%ix/vec4 %u;\n", ix);
 	    }
+	    break;
      }
 }

--- a/tgt-vvp/eval_vec4.c
+++ b/tgt-vvp/eval_vec4.c
@ -0,0 +1,498 @@
+/*
+ * Copyright (c) 2013 Stephen Williams (steve@icarus.com)
+ *
+ *    This source code is free software; you can redistribute it
+ *    and/or modify it in source code form under the terms of the GNU
+ *    General Public License as published by the Free Software
+ *    Foundation; either version 2 of the License, or (at your option)
+ *    any later version.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+/*
+ * This file includes functions for evaluating VECTOR expressions.
+ */
+# include  "vvp_priv.h"
+# include  <string.h>
+# include  <stdlib.h>
+# include  <math.h>
+# include  <assert.h>
+# include  <stdbool.h>
+
+/*
+ * Emit code for the binary arithmetic operators '+', '-' and '*'.
+ * Both operands are evaluated with draw_eval_vec4 (left first, then
+ * right) and then the opcode matching the operator is written out.
+ */
+static void draw_binary_vec4_arith(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t lhs = ivl_expr_oper1(expr);
+      ivl_expr_t rhs = ivl_expr_oper2(expr);
+      char opcode;
+
+      draw_eval_vec4(lhs, stuff_ok_flag);
+      draw_eval_vec4(rhs, stuff_ok_flag);
+
+      opcode = ivl_expr_opcode(expr);
+      if (opcode == '+') {
+	    fprintf(vvp_out, "    %%add;\n");
+      } else if (opcode == '-') {
+	    fprintf(vvp_out, "    %%sub;\n");
+      } else if (opcode == '*') {
+	    fprintf(vvp_out, "    %%mul;\n");
+      } else {
+	      /* The dispatcher only sends the three operators above. */
+	    assert(0);
+      }
+}
+
+/*
+ * Emit code for the binary bitwise operators '&' and '|'. Both
+ * operands are evaluated with draw_eval_vec4 (left first, then
+ * right) and then the matching opcode is written out.
+ */
+static void draw_binary_vec4_bitwise(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t lhs = ivl_expr_oper1(expr);
+      ivl_expr_t rhs = ivl_expr_oper2(expr);
+      char opcode;
+
+      draw_eval_vec4(lhs, stuff_ok_flag);
+      draw_eval_vec4(rhs, stuff_ok_flag);
+
+      opcode = ivl_expr_opcode(expr);
+      if (opcode == '&') {
+	    fprintf(vvp_out, "    %%and;\n");
+      } else if (opcode == '|') {
+	    fprintf(vvp_out, "    %%or;\n");
+      } else {
+	      /* Only '&' and '|' are handled here. */
+	    assert(0);
+      }
+}
+
+/*
+ * Emit code for == ('e') and != ('n') where the operands are
+ * evaluated as real values. Both operands go through draw_eval_real,
+ * a %cmp/wr is emitted, and flag 4 is fetched as the vec4 result.
+ * For != the fetched bit is additionally inverted.
+ */
+static void draw_binary_vec4_compare_real(ivl_expr_t expr)
+{
+      char opcode;
+
+      draw_eval_real(ivl_expr_oper1(expr));
+      draw_eval_real(ivl_expr_oper2(expr));
+
+      opcode = ivl_expr_opcode(expr);
+      if (opcode != 'e' && opcode != 'n') {
+	    assert(0);
+	    return;
+      }
+
+	/* Both operators share the compare and the read of flag 4. */
+      fprintf(vvp_out, "    %%cmp/wr;\n");
+      fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
+
+	/* != is emitted as the inverse of ==. */
+      if (opcode == 'n')
+	    fprintf(vvp_out, "    %%inv;\n");
+}
+
+/*
+ * Emit code for the equality operators: == ('e'), != ('n'),
+ * === ('E') and !== ('N'). If either operand is real the work is
+ * handed to draw_binary_vec4_compare_real. Otherwise both operands
+ * are evaluated with draw_eval_vec4, a %cmp/u is emitted and the
+ * result is fetched from flag 4 (== and !=) or flag 6 (=== and !==).
+ * The negated forms add an %inv after the fetch.
+ */
+static void draw_binary_vec4_compare(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t lhs = ivl_expr_oper1(expr);
+      ivl_expr_t rhs = ivl_expr_oper2(expr);
+      int invert = 0;
+
+      if ((ivl_expr_value(lhs) == IVL_VT_REAL)
+	  || (ivl_expr_value(rhs) == IVL_VT_REAL)) {
+	    draw_binary_vec4_compare_real(expr);
+	    return;
+      }
+
+      draw_eval_vec4(lhs, stuff_ok_flag);
+      draw_eval_vec4(rhs, stuff_ok_flag);
+
+      switch (ivl_expr_opcode(expr)) {
+	  case 'n': /* != */
+	    invert = 1;
+	      /* fallthrough */
+	  case 'e': /* == */
+	    fprintf(vvp_out, "    %%cmp/u;\n");
+	    fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
+	    break;
+	  case 'N': /* !== */
+	    invert = 1;
+	      /* fallthrough */
+	  case 'E': /* === */
+	    fprintf(vvp_out, "    %%cmp/u;\n");
+	    fprintf(vvp_out, "    %%flag_get/vec4 6;\n");
+	    break;
+	  default:
+	    assert(0);
+	    break;
+      }
+
+      if (invert)
+	    fprintf(vvp_out, "    %%inv;\n");
+}
+
+/*
+ * Emit code for the ordering operators <, <= ('L'), > and >= ('G')
+ * where the operands are evaluated as real values. The > and >=
+ * forms are emitted as < and <= with the operands evaluated in the
+ * opposite order. After the %cmp/wr, flag 5 is fetched for the
+ * strict forms; for the or-equal forms flags 4 and 5 are both
+ * fetched and combined with %or.
+ */
+static void draw_binary_vec4_le_real(ivl_expr_t expr)
+{
+      ivl_expr_t first = ivl_expr_oper1(expr);
+      ivl_expr_t second = ivl_expr_oper2(expr);
+      int or_equal = 0;
+      int swap = 0;
+
+      switch (ivl_expr_opcode(expr)) {
+	  case '<':
+	    break;
+	  case 'L': /* <= */
+	    or_equal = 1;
+	    break;
+	  case '>':
+	    swap = 1;
+	    break;
+	  case 'G': /* >= */
+	    swap = 1;
+	    or_equal = 1;
+	    break;
+	  default:
+	    assert(0);
+	    return;
+      }
+
+      if (swap) {
+	    ivl_expr_t hold = first;
+	    first = second;
+	    second = hold;
+      }
+
+      draw_eval_real(first);
+      draw_eval_real(second);
+      fprintf(vvp_out, "    %%cmp/wr;\n");
+
+      if (or_equal)
+	    fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
+
+      fprintf(vvp_out, "    %%flag_get/vec4 5;\n");
+
+      if (or_equal)
+	    fprintf(vvp_out, "    %%or;\n");
+}
+
+/*
+ * Emit code for the ordering operators <, <= ('L'), > and >= ('G')
+ * on vec4 operands. If either operand is real the work is handed to
+ * draw_binary_vec4_le_real. The compare is signed (%cmp/s) only when
+ * both operands are signed, otherwise unsigned (%cmp/u). The > and
+ * >= forms are emitted as < and <= with the operands evaluated in
+ * the opposite order.
+ */
+static void draw_binary_vec4_le(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t lhs = ivl_expr_oper1(expr);
+      ivl_expr_t rhs = ivl_expr_oper2(expr);
+
+      if ((ivl_expr_value(lhs) == IVL_VT_REAL)
+	  || (ivl_expr_value(rhs) == IVL_VT_REAL)) {
+	    draw_binary_vec4_le_real(expr);
+	    return;
+      }
+
+      const char opcode = ivl_expr_opcode(expr);
+      const char sign_suffix
+	    = (ivl_expr_signed(lhs) && ivl_expr_signed(rhs)) ? 's' : 'u';
+
+      const int swapped  = (opcode == 'G') || (opcode == '>');
+      const int or_equal = (opcode == 'G') || (opcode == 'L');
+      const int strict   = (opcode == '>') || (opcode == '<');
+
+	/* Evaluate the operands, reversed for > and >=. */
+      draw_eval_vec4(swapped ? rhs : lhs, stuff_ok_flag);
+      draw_eval_vec4(swapped ? lhs : rhs, stuff_ok_flag);
+
+      if (!or_equal && !strict) {
+	    assert(0);
+	    return;
+      }
+
+      fprintf(vvp_out, "    %%cmp/%c;\n", sign_suffix);
+
+	/* The or-equal forms combine flag 4 with flag 5. */
+      if (or_equal)
+	    fprintf(vvp_out, "    %%flag_get/vec4 4;\n");
+
+      fprintf(vvp_out, "    %%flag_get/vec4 5;\n");
+
+      if (or_equal)
+	    fprintf(vvp_out, "    %%or;\n");
+}
+
+/*
+ * Emit code for the logical-or operator (||). Each operand is
+ * evaluated with STUFF_OK_XZ and, when wider than one bit, reduced
+ * with %or/r. The two results are then combined with %or, and the
+ * result is padded with %pad/u if the expression itself is wider
+ * than one bit. The stuff_ok_flag argument is not used here.
+ */
+static void draw_binary_vec4_lor(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t operands[2];
+      unsigned result_wid = ivl_expr_width(expr);
+      unsigned k;
+
+      operands[0] = ivl_expr_oper1(expr);
+      operands[1] = ivl_expr_oper2(expr);
+
+	/* Left operand first, then right; reduce each one to a
+	   single bit if necessary. */
+      for (k = 0 ; k < 2 ; k += 1) {
+	    draw_eval_vec4(operands[k], STUFF_OK_XZ);
+	    if (ivl_expr_width(operands[k]) > 1)
+		  fprintf(vvp_out, "    %%or/r;\n");
+      }
+
+      fprintf(vvp_out, "    %%or;\n");
+
+      if (result_wid > 1)
+	    fprintf(vvp_out, "    %%pad/u %u;\n", result_wid);
+}
+
+/*
+ * Emit code for the shift operators: << ('l'), >> ('r') and
+ * >>> ('R'). The left operand is evaluated with draw_eval_vec4, the
+ * shift amount is evaluated into a freshly allocated index register,
+ * and the shift opcode is emitted with that register number as its
+ * argument. The index register is released before returning.
+ */
+static void draw_binary_vec4_lrs(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t le = ivl_expr_oper1(expr);
+      ivl_expr_t re = ivl_expr_oper2(expr);
+
+	// Push the left expression onto the stack.
+      draw_eval_vec4(le, stuff_ok_flag);
+
+	// Calculate the shift amount into an index register.
+      int use_index_reg = allocate_word();
+      assert(use_index_reg >= 0);
+      draw_eval_expr_into_integer(re, use_index_reg);
+
+	// Emit the actual shift instruction. This will pop the top of
+	// the stack and replace it with the result of the shift.
+	// The register number is an int, so print it with %d.
+      switch (ivl_expr_opcode(expr)) {
+	  case 'l': /* << */
+	    fprintf(vvp_out, "    %%shiftl %d;\n", use_index_reg);
+	    break;
+	  case 'r': /* >> */
+	    fprintf(vvp_out, "    %%shiftr %d;\n", use_index_reg);
+	    break;
+	  case 'R': /* >>> */
+	    fprintf(vvp_out, "    %%shiftrs %d;\n", use_index_reg);
+	    break;
+	  default:
+	    assert(0);
+	    break;
+      }
+
+      clr_word(use_index_reg);
+}
+
+/*
+ * Dispatch a binary expression to the helper that handles its
+ * operator class: arithmetic, bitwise, equality, ordering, shift or
+ * logical-or. Any other opcode is reported on stderr and then trips
+ * an assertion.
+ */
+static void draw_binary_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      switch (ivl_expr_opcode(expr)) {
+	  case '+':
+	  case '-':
+	  case '*':
+	    draw_binary_vec4_arith(expr, stuff_ok_flag);
+	    break;
+
+	  case '&':
+	  case '|':
+	    draw_binary_vec4_bitwise(expr, stuff_ok_flag);
+	    break;
+
+	  case 'e': /* == */
+	  case 'E': /* === */
+	  case 'n': /* != */
+	  case 'N': /* !== */
+	    draw_binary_vec4_compare(expr, stuff_ok_flag);
+	    break;
+
+	  case 'G': /* >= */
+	  case 'L': /* <= */
+	  case '>':
+	  case '<':
+	    draw_binary_vec4_le(expr, stuff_ok_flag);
+	    break;
+
+	  case 'l': /* << */
+	  case 'r': /* >> */
+	  case 'R': /* >>> */
+	    draw_binary_vec4_lrs(expr, stuff_ok_flag);
+	    break;
+
+	  case 'o': /* || (logical or) */
+	    draw_binary_vec4_lor(expr, stuff_ok_flag);
+	    break;
+
+	  default:
+	    fprintf(stderr, "vvp.tgt error: unsupported binary (%c)\n",
+		    ivl_expr_opcode(expr));
+	    assert(0);
+      }
+}
+
+/*
+ * Emit a %pushi/vec4 for a literal number. The bits of the literal
+ * are packed, most significant first, into two immediates:
+ *
+ *     bit   val0  valx
+ *     '0'    0     0
+ *     '1'    1     0
+ *     'x'    1     1
+ *     'z'    0     1
+ *
+ * The literal must fit in the unsigned long accumulators; wider
+ * literals trip the assertion.
+ */
+static void draw_number_vec4(ivl_expr_t expr)
+{
+      unsigned long val0 = 0;
+      unsigned long valx = 0;
+      unsigned wid = ivl_expr_width(expr);
+      const char*bits = ivl_expr_bits(expr);
+
+      unsigned idx;
+
+	/* Bound the width by the real size of the accumulators, so
+	   that bits are never silently shifted out where unsigned
+	   long is narrower than 64 bits. */
+      assert(wid <= 8*sizeof(val0));
+
+      for (idx = 0 ; idx < wid ; idx += 1) {
+	    val0 <<= 1;
+	    valx <<= 1;
+	    switch (bits[wid-idx-1]) {
+		case '0':
+		  break;
+		case '1':
+		  val0 |= 1;
+		  break;
+		case 'x':
+		  val0 |= 1;
+		  valx |= 1;
+		  break;
+		case 'z':
+		    /* val0 stays 0 for a z bit. */
+		  valx |= 1;
+		  break;
+		default:
+		  assert(0);
+		  break;
+	    }
+      }
+      fprintf(vvp_out, "    %%pushi/vec4 %lu, %lu, %u;\n", val0, valx, wid);
+}
+
+/*
+ * Emit code for a part select that has a base expression. The
+ * selected-from expression and then the base expression are both
+ * evaluated with draw_eval_vec4, followed by a %part carrying the
+ * width of the select.
+ */
+static void draw_select_vec4(ivl_expr_t expr)
+{
+      ivl_expr_t source = ivl_expr_oper1(expr);
+      ivl_expr_t offset = ivl_expr_oper2(expr);
+      unsigned part_wid = ivl_expr_width(expr);
+
+	/* The value being selected from goes first, then the base. */
+      draw_eval_vec4(source, 0);
+      draw_eval_vec4(offset, 0);
+
+      fprintf(vvp_out, "    %%part %u;\n", part_wid);
+}
+
+/*
+ * Emit code for a select that has no base expression, which is used
+ * to pad or truncate its sub-expression to the width of the select.
+ * The sub-expression is evaluated with draw_eval_vec4; if its width
+ * differs from the target width a %pad/s (signed select) or %pad/u
+ * (unsigned select) follows.
+ */
+static void draw_select_pad_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t inner = ivl_expr_oper1(expr);
+      unsigned target_wid = ivl_expr_width(expr);
+
+      draw_eval_vec4(inner, stuff_ok_flag);
+
+	/* Only emit a pad when the widths actually differ. */
+      if (ivl_expr_width(inner) != target_wid) {
+	    if (ivl_expr_signed(expr)) {
+		  fprintf(vvp_out, "    %%pad/s %u;\n", target_wid);
+	    } else {
+		  fprintf(vvp_out, "    %%pad/u %u;\n", target_wid);
+	    }
+      }
+}
+
+/*
+ * Emit a %load/vec4 for a signal expression. Only signals without
+ * array dimensions are handled; anything else trips the assertion.
+ */
+static void draw_signal_vec4(ivl_expr_t expr)
+{
+      ivl_signal_t source_sig = ivl_expr_signal(expr);
+
+      assert(0 == ivl_signal_dimensions(source_sig));
+
+      fprintf(vvp_out, "    %%load/vec4 v%p_0;\n", source_sig);
+}
+
+/*
+ * Emit code for the ternary (?:) operator. The condition is
+ * evaluated, reduced to one bit if needed, and moved into a flag
+ * allocated for this expression. The true and false expressions are
+ * then emitted with conditional jumps around them, and a %blend is
+ * emitted for the path where neither jump is taken. The flag is
+ * released before returning.
+ */
+static void draw_ternary_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      ivl_expr_t cond = ivl_expr_oper1(expr);
+      ivl_expr_t true_ex = ivl_expr_oper2(expr);
+      ivl_expr_t false_ex = ivl_expr_oper3(expr);
+
+      unsigned lab_true  = local_count++;
+      unsigned lab_out   = local_count++;
+
+	/* allocate_flag returns -1 when no flag is free. Stop here
+	   rather than emit code that references flag -1. */
+      int use_flag = allocate_flag();
+      assert(use_flag >= 0);
+
+	/* Evaluate the condition expression, including optionally
+	   reducing it to a single bit. Put the result into a flag bit
+	   for use by all the tests. */
+      draw_eval_vec4(cond, STUFF_OK_XZ);
+      if (ivl_expr_width(cond) > 1)
+	    fprintf(vvp_out, "    %%or/r;\n");
+      fprintf(vvp_out, "    %%flag_set/vec4 %d;\n", use_flag);
+
+      fprintf(vvp_out, "    %%jmp/0 T_%u.%u, %d;\n", thread_count, lab_true, use_flag);
+
+	/* If the condition is true or xz (not false), we need the true
+	   expression. If the condition is true, then we ONLY need the
+	   true expression. Note that the lab_true label marks the END
+	   of the true expression code. */
+      draw_eval_vec4(true_ex, stuff_ok_flag);
+      fprintf(vvp_out, "    %%jmp/1 T_%u.%u, %d;\n", thread_count, lab_out, use_flag);
+      fprintf(vvp_out, "T_%u.%u ; End of true expr.\n", thread_count, lab_true);
+
+	/* If the condition is false or xz (not true), we need the false
+	   expression. If the condition is false, then we ONLY need
+	   the false expression. */
+      draw_eval_vec4(false_ex, stuff_ok_flag);
+      fprintf(vvp_out, "    %%jmp/0 T_%u.%u, %d;\n", thread_count, lab_out, use_flag);
+      fprintf(vvp_out, " ; End of false expr.\n");
+
+	/* Here, the condition is not true or false, it is xz. Both
+	   the true and false expressions have been pushed onto the
+	   stack, we just need to blend the bits. */
+      fprintf(vvp_out, "    %%blend;\n");
+      fprintf(vvp_out, "T_%u.%u;\n", thread_count, lab_out);
+
+      clr_flag(use_flag);
+}
+
+/*
+ * Emit code for a unary expression. Only bitwise invert ('~') is
+ * handled: the operand is evaluated with draw_eval_vec4 and an %inv
+ * follows. Any other operator is reported on stderr and no code is
+ * emitted for it.
+ */
+static void draw_unary_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      const char opcode = ivl_expr_opcode(expr);
+
+      if (opcode != '~') {
+	    fprintf(stderr, "XXXX Unary operator %c no implemented\n", opcode);
+	    return;
+      }
+
+      draw_eval_vec4(ivl_expr_oper1(expr), stuff_ok_flag);
+      fprintf(vvp_out, "    %%inv;\n");
+}
+
+/*
+ * Entry point for evaluating a vec4 expression. The expression type
+ * selects the helper that emits the code. A select with no second
+ * operand is treated as a pad/truncate, otherwise as a part select.
+ * Expression types without a helper are reported on stderr and
+ * marked with a comment line in the generated output.
+ */
+void draw_eval_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      switch (ivl_expr_type(expr)) {
+	  case IVL_EX_BINARY:
+	    draw_binary_vec4(expr, stuff_ok_flag);
+	    break;
+
+	  case IVL_EX_NUMBER:
+	    draw_number_vec4(expr);
+	    break;
+
+	  case IVL_EX_SELECT:
+	    if (ivl_expr_oper2(expr) != 0)
+		  draw_select_vec4(expr);
+	    else
+		  draw_select_pad_vec4(expr, stuff_ok_flag);
+	    break;
+
+	  case IVL_EX_SIGNAL:
+	    draw_signal_vec4(expr);
+	    break;
+
+	  case IVL_EX_TERNARY:
+	    draw_ternary_vec4(expr, stuff_ok_flag);
+	    break;
+
+	  case IVL_EX_UNARY:
+	    draw_unary_vec4(expr, stuff_ok_flag);
+	    break;
+
+	  default:
+	      /* No helper for this expression type yet. */
+	    fprintf(stderr, "XXXX Evaluate VEC4 expression (%d)\n", ivl_expr_type(expr));
+	    fprintf(vvp_out, "; XXXX Evaluate VEC4 expression (%d)\n", ivl_expr_type(expr));
+	    break;
+      }
+}
--- a/tgt-vvp/stmt_assign.c
+++ b/tgt-vvp/stmt_assign.c
@ -339,6 +339,7 @@ static ivl_type_t draw_lval_expr(ivl_lval_t lval)
      return ivl_type_prop_type(sub_type, ivl_lval_property_idx(lval));
 }

+#if 0
 static void set_vec_to_lval_slice_nest(ivl_lval_t lval, unsigned bit, unsigned wid)
 {
      ivl_lval_t lval_nest = ivl_lval_nest(lval);
@ -349,7 +350,9 @@ static void set_vec_to_lval_slice_nest(ivl_lval_t lval, unsigned bit, unsigned w
 	      ivl_lval_property_idx(lval), bit, wid);
      fprintf(vvp_out, "    %%pop/obj 1, 0;\n");
 }
+#endif

+#if 0
 static void set_vec_to_lval_slice(ivl_lval_t lval, unsigned bit, unsigned wid)
 {
      ivl_signal_t sig  = ivl_lval_sig(lval);
@ -507,8 +510,8 @@ static void set_vec_to_lval_slice(ivl_lval_t lval, unsigned bit, unsigned wid)

      }
 }
-
-
+#endif
+#if 0
 /*
 * This is a private function to generate %set code for the
 * statement. At this point, the r-value is evaluated and stored in
@ -542,6 +545,24 @@ static void set_vec_to_lval(ivl_statement_t net, struct vector_info res)
 	    cur_rbit += bit_limit;
      }
 }
+#endif
+
+/*
+ * Emit a %store/vec4 that writes the value on the top of the vec4
+ * stack to the l-value of the statement. The caller must already
+ * have evaluated the r-value. Only a single l-value that covers the
+ * whole width of its signal is handled; anything else trips an
+ * assertion.
+ */
+static void store_vec4_to_lval(ivl_statement_t net)
+{
+      ivl_lval_t target;
+      ivl_signal_t target_sig;
+      unsigned sig_wid;
+
+      assert(ivl_stmt_lvals(net) == 1);
+
+      target = ivl_stmt_lval(net, 0);
+      target_sig = ivl_lval_sig(target);
+      sig_wid = ivl_signal_width(target_sig);
+
+      assert(ivl_lval_width(target) == sig_wid);
+
+      fprintf(vvp_out, "    %%store/vec4 v%p_0, %u;\n", target_sig, sig_wid);
+}

 static int show_stmt_assign_vector(ivl_statement_t net)
 {
@ -554,7 +575,7 @@ static int show_stmt_assign_vector(ivl_statement_t net)
 	   of the l-value. We need these values as part of the r-value
 	   calculation. */
      if (ivl_stmt_opcode(net) != 0) {
-	    slices = calloc(ivl_stmt_lvals(net), sizeof(struct vec_slice_info));
+            slices = calloc(ivl_stmt_lvals(net), sizeof(struct vec_slice_info));
 	    lres = get_vec_from_lval(net, slices);
      }

@ -563,7 +584,7 @@ static int show_stmt_assign_vector(ivl_statement_t net)
 	   result to a vector. Then store that vector into the
 	   l-value. */
      if (ivl_expr_value(rval) == IVL_VT_REAL) {
-	    draw_eval_real(rval);
+            draw_eval_real(rval);
 	      /* This is the accumulated with of the l-value of the
 		 assignment. */
 	    unsigned wid = ivl_stmt_lwidth(net);
@ -582,12 +603,14 @@ static int show_stmt_assign_vector(ivl_statement_t net)
 	    fprintf(vvp_out, "    %%cvt/vr %u, %u;\n", res.base, res.wid);

      } else {
-	    res = draw_eval_expr(rval, 0);
+	    draw_eval_vec4(rval, 0);
+	    res.base = 0; // XXXX This is just to suppress the clr_vector below.
+	    res.wid = 0;
      }

      switch (ivl_stmt_opcode(net)) {
 	  case 0:
-	    set_vec_to_lval(net, res);
+	    store_vec4_to_lval(net);
 	    break;

 	  case '+':
--- a/tgt-vvp/vvp.c
+++ b/tgt-vvp/vvp.c
@ -48,6 +48,8 @@ FILE*vvp_out = 0;
 int vvp_errors = 0;
 unsigned show_file_line = 0;

+static uint32_t allocate_flag_mask = 0x00ff;
+
 __inline__ static void draw_execute_header(ivl_design_t des)
 {
      const char*cp = ivl_design_flag(des, "VVP_EXECUTABLE");
@ -85,6 +87,30 @@ __inline__ static void draw_module_declarations(ivl_design_t des)
      }
 }

+/*
+ * Allocate a free flag bit. The lowest-numbered bit that is clear
+ * in allocate_flag_mask is marked as in use and its index is
+ * returned. Returns -1 if every bit is already in use.
+ */
+int allocate_flag(void)
+{
+      unsigned idx;
+      for (idx = 0 ; idx < 8*sizeof(allocate_flag_mask) ; idx += 1) {
+	      /* Shift an unsigned 1 so that bit 31 is well defined. */
+	    uint32_t mask = (uint32_t)1 << idx;
+	    if (allocate_flag_mask & mask)
+		  continue;
+
+	    allocate_flag_mask |= mask;
+	    return (int)idx;
+      }
+
+      return -1;
+}
+
+/*
+ * Release a flag bit previously returned by allocate_flag. The
+ * index must be in range and the bit must currently be marked as
+ * in use; both conditions are asserted.
+ */
+void clr_flag(int idx)
+{
+	/* Reject negative values (such as a failed allocate_flag
+	   result) explicitly rather than by integer conversion. */
+      assert(idx >= 0 && (unsigned)idx < 8*sizeof(allocate_flag_mask));
+
+	/* Shift an unsigned 1 so that bit 31 is well defined. */
+      uint32_t mask = (uint32_t)1 << idx;
+
+      assert(allocate_flag_mask & mask);
+
+      allocate_flag_mask &= ~mask;
+}

 int target_design(ivl_design_t des)

--- a/tgt-vvp/vvp_priv.h
+++ b/tgt-vvp/vvp_priv.h
@ -306,6 +306,12 @@ extern int number_is_immediate(ivl_expr_t ex, unsigned lim_wid, int negative_is_
 extern long get_number_immediate(ivl_expr_t ex);
 extern uint64_t get_number_immediate64(ivl_expr_t ex);

+/*
+ * draw_eval_vec4 evaluates vec4 expressions. The result of the
+ * evaluation is the vec4 result in the top of the vec4 expression stack.
+ */
+extern void draw_eval_vec4(ivl_expr_t ex, int stuff_ok_flag);
+
 /*
 * draw_eval_real evaluates real value expressions. The result of the
 * evaluation is the real result in the top of the real expression stack.
@ -342,6 +348,12 @@ extern void show_stmt_file_line(ivl_statement_t net, const char*desc);
 extern int allocate_word(void);
 extern void clr_word(int idx);

+/*
+ * These functions manage flag bit allocation.
+ */
+extern int allocate_flag(void);
+extern void clr_flag(int idx);
+
 /*
 * These are used to count labels as I generate code.
 */
--- a/tgt-vvp/vvp_process.c
+++ b/tgt-vvp/vvp_process.c
@ -209,9 +209,9 @@ static void assign_to_array_word(ivl_signal_t lsig, ivl_expr_t word_ix,
      clear_expression_lookaside();
 }

-static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
+static void assign_to_lvector(ivl_lval_t lval,
 			      uint64_t delay, ivl_expr_t dexp,
-			      unsigned width, unsigned nevents)
+			      unsigned nevents)
 {
      ivl_signal_t sig = ivl_lval_sig(lval);
      ivl_expr_t part_off_ex = ivl_lval_part_off(lval);
@ -221,9 +221,13 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
      const unsigned long use_word = 0;

      if (ivl_signal_dimensions(sig) > 0) {
+#if 0
 	    assert(word_ix);
 	    assign_to_array_word(sig, word_ix, bit, delay, dexp, part_off_ex,
 	                         width, nevents);
+#else
+	    fprintf(stderr, "XXXX %%assign to array word not supported yet.\n");
+#endif
 	    return;
      }

@ -247,9 +251,13 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 		  draw_eval_expr_into_integer(part_off_ex, 1);
 		    /* If the index expression has XZ bits, skip the assign. */
 		  fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
+#if 0
 		  fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
 		  fprintf(vvp_out, "    %%assign/v0/x1/d v%p_%lu, %d, %u;\n",
 		          sig, use_word, delay_index, bit);
+#else
+		  assert(0); // XXXX
+#endif
 		  fprintf(vvp_out, "t_%u ;\n", skip_assign);
 		  clr_word(delay_index);
 	    } else if (nevents != 0) {
@ -257,9 +265,13 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 		  draw_eval_expr_into_integer(part_off_ex, 1);
 		    /* If the index expression has XZ bits, skip the assign. */
 		  fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
+#if 0
 		  fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
 		  fprintf(vvp_out, "    %%assign/v0/x1/e v%p_%lu, %u;\n",
 		          sig, use_word, bit);
+#else
+		  assert(0); // XXXX
+#endif
 		  fprintf(vvp_out, "t_%u ;\n", skip_assign);
 		  fprintf(vvp_out, "    %%evctl/c;\n");
 	    } else {
@ -267,6 +279,7 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 		  draw_eval_expr_into_integer(part_off_ex, 1);
 		    /* If the index expression has XZ bits, skip the assign. */
 		  fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
+#if 0
 		  fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
 		    /*
 		     * The %assign can only take a 32 bit delay. For a larger
@ -285,10 +298,14 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 			        "    %%assign/v0/x1 v%p_%lu, %lu, %u;\n",
 			        sig, use_word, low_d, bit);
 		  }
+#else
+		  assert(0); // XXXX
+#endif
 		  fprintf(vvp_out, "t_%u ;\n", skip_assign);
 	    }

      } else if (part_off>0 || ivl_lval_width(lval)!=ivl_signal_width(sig)) {
+#if 0
 	      /* There is no mux expression, but a constant part
 		 offset. Load that into index x1 and generate a
 		 single-bit set instruction. */
@ -331,23 +348,41 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 			        sig, use_word, low_d, bit);
 		  }
 	    }
+#else
+	    if (dexp != 0) {
+		  assert(0); // XXXX
+
+	    } else if (nevents != 0) {
+		  assert(0); // XXXX
+
+	    } else {
+		  int offset_index = allocate_word();
+		  int delay_index = allocate_word();
+		  fprintf(vvp_out, "    %%ix/load %d, %lu, 0;\n", offset_index, part_off);
+		  if (dexp)
+			draw_eval_expr_into_integer(dexp,delay_index);
+		  else
+			fprintf(vvp_out, "    %%ix/load %d, %lu, %lu;\n",
+				delay_index, low_d, hig_d);
+		  fprintf(vvp_out, "    %%assign/vec4/off/d v%p_%lu, %d, %d;\n",
+			  sig, use_word, offset_index, delay_index);
+		  clr_word(offset_index);
+		  clr_word(delay_index);
+	    }
+#endif

      } else if (dexp != 0) {
 	      /* Calculated delay... */
 	    int delay_index = allocate_word();
 	    draw_eval_expr_into_integer(dexp, delay_index);
-	    fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
-	    fprintf(vvp_out, "    %%assign/v0/d v%p_%lu, %d, %u;\n",
-		    sig, use_word, delay_index, bit);
+	    fprintf(vvp_out, "    %%assign/vec4/d v%p_%lu, %d;\n",
+		    sig, use_word, delay_index);
 	    clr_word(delay_index);
      } else if (nevents != 0) {
 	      /* Event control delay... */
-	    fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
-	    fprintf(vvp_out, "    %%assign/v0/e v%p_%lu, %u;\n",
-		    sig, use_word, bit);
+	    fprintf(vvp_out, "    %%assign/vec4/e v%p_%lu;\n",
+		    sig, use_word);
      } else {
-	      /* Constant delay... */
-	    fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
 	      /*
 	       * The %assign can only take a 32 bit delay. For a larger
 	       * delay we need to put it into an index register.
@ -356,12 +391,12 @@ static void assign_to_lvector(ivl_lval_t lval, unsigned bit,
 		  int delay_index = allocate_word();
 		  fprintf(vvp_out, "    %%ix/load %d, %lu, %lu;\n",
 		          delay_index, low_d, hig_d);
-		  fprintf(vvp_out, "    %%assign/v0/d v%p_%lu, %d, %u;\n",
-		          sig, use_word, delay_index, bit);
+		  fprintf(vvp_out, "    %%assign/vec4/d v%p_%lu, %d;\n",
+		          sig, use_word, delay_index);
 		  clr_word(delay_index);
 	    } else {
-		  fprintf(vvp_out, "    %%assign/v0 v%p_%lu, %lu, %u;\n",
-		          sig, use_word, low_d, bit);
+		  fprintf(vvp_out, "    %%assign/vec4 v%p_%lu, %lu;\n",
+		          sig, use_word, low_d);
 	    }
      }
 }
@ -546,7 +581,7 @@ static int show_stmt_assign_nb(ivl_statement_t net)
      }


-      { struct vector_info res;
+      { struct vector_info res = {0,0};
 	unsigned wid;
 	unsigned lidx;
 	unsigned cur_rbit = 0;
@ -574,21 +609,29 @@ static int show_stmt_assign_nb(ivl_statement_t net)
 		      res.base, res.wid);

 	} else {
-	      res = draw_eval_expr(rval, 0);
-	      wid = res.wid;
+	      wid = ivl_stmt_lwidth(net);
+	      draw_eval_vec4(rval, 0);
+	      if (ivl_expr_width(rval) != wid) {
+		    if (ivl_expr_signed(rval))
+			  fprintf(vvp_out, "    %%pad/s %u;\n", wid);
+		    else
+			  fprintf(vvp_out, "    %%pad/u %u;\n", wid);
+	      }
 	}

+	  /* Spread the r-value vector over the bits of the l-value. */
 	for (lidx = 0 ;  lidx < ivl_stmt_lvals(net) ;  lidx += 1) {
 	      unsigned bit_limit = wid - cur_rbit;
-	      unsigned bidx;

 	      lval = ivl_stmt_lval(net, lidx);

 	      if (bit_limit > ivl_lval_width(lval))
 		    bit_limit = ivl_lval_width(lval);

-	      bidx = res.base < 4? res.base : (res.base+cur_rbit);
-	      assign_to_lvector(lval, bidx, delay, del, bit_limit, nevents);
+		/* XXXX For now, don't know how to actually split
+		   vectors */
+	      assert(lidx == 0);
+	      assign_to_lvector(lval, delay, del, nevents);

 	      cur_rbit += bit_limit;

@ -655,7 +698,6 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)
 {
      int rc = 0;
      ivl_expr_t expr = ivl_stmt_cond_expr(net);
-      struct vector_info cond = draw_eval_expr(expr, 0);
      unsigned count = ivl_stmt_case_count(net);

      unsigned local_base = local_count;
@ -666,6 +708,11 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)

      local_count += count + 1;

+	/* Evaluate the case condition to the top of the vec4
+	   stack. This expression will be compared multiple times to
+	   each case guard. */
+      draw_eval_vec4(expr,0);
+
 	/* First draw the branch table.  All the non-default cases
 	   generate a branch out of here, to the code that implements
 	   the case. The default will fall through all the tests. */
@ -673,55 +720,34 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)

      for (idx = 0 ;  idx < count ;  idx += 1) {
 	    ivl_expr_t cex = ivl_stmt_case_expr(net, idx);
-	    struct vector_info cvec;

 	    if (cex == 0) {
 		  default_case = idx;
 		  continue;
 	    }

-	      /* Is the guard expression something I can pass to a
-		 %cmpi/u instruction? If so, use that instead. */
-
-	    if ((ivl_statement_type(net) == IVL_ST_CASE)
-		&& (ivl_expr_type(cex) == IVL_EX_NUMBER)
-		&& (! number_is_unknown(cex))
-		&& number_is_immediate(cex, 16, 0)) {
-
-		  unsigned long imm = get_number_immediate(cex);
-
-		  fprintf(vvp_out, "    %%cmpi/u %u, %lu, %u;\n",
-			  cond.base, imm, cond.wid);
-		  fprintf(vvp_out, "    %%jmp/1 T_%u.%u, 6;\n",
-			  thread_count, local_base+idx);
-
-		  continue;
-	    }
-
-	      /* Oh well, do this case the hard way. */
-
-	    cvec = draw_eval_expr_wid(cex, cond.wid, STUFF_OK_RO);
-	    assert(cvec.wid == cond.wid);
+	      /* Duplicate the case expression so that the cmp
+		 instructions below do not completely erase the
+		 value. Do this in front of each compare. */
+	    fprintf(vvp_out, "    %%dup/vec4;\n");
+	    draw_eval_vec4(cex, STUFF_OK_RO);

 	    switch (ivl_statement_type(net)) {

 		case IVL_ST_CASE:
-		  fprintf(vvp_out, "    %%cmp/u %u, %u, %u;\n",
-			  cond.base, cvec.base, cond.wid);
+		  fprintf(vvp_out, "    %%cmp/u;\n");
 		  fprintf(vvp_out, "    %%jmp/1 T_%u.%u, 6;\n",
 			  thread_count, local_base+idx);
 		  break;

 		case IVL_ST_CASEX:
-		  fprintf(vvp_out, "    %%cmp/x %u, %u, %u;\n",
-			  cond.base, cvec.base, cond.wid);
+		  fprintf(vvp_out, "    %%cmp/x;\n");
 		  fprintf(vvp_out, "    %%jmp/1 T_%u.%u, 4;\n",
 			  thread_count, local_base+idx);
 		  break;

 		case IVL_ST_CASEZ:
-		  fprintf(vvp_out, "    %%cmp/z %u, %u, %u;\n",
-			  cond.base, cvec.base, cond.wid);
+		  fprintf(vvp_out, "    %%cmp/z;\n");
 		  fprintf(vvp_out, "    %%jmp/1 T_%u.%u, 4;\n",
 			  thread_count, local_base+idx);
 		  break;
@ -729,14 +755,8 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)
 		default:
 		  assert(0);
 	    }
-
-	      /* Done with the case expression */
-	    clr_vector(cvec);
      }

-	/* Done with the condition expression */
-      clr_vector(cond);
-
 	/* Emit code for the default case. */
      if (default_case < count) {
 	    ivl_statement_t cst = ivl_stmt_case_stmt(net, default_case);
@ -757,6 +777,7 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)
 	    clear_expression_lookaside();
 	    rc += show_statement(cst, sscope);

+	      /* Statement is done, jump to the out of the case. */
 	    fprintf(vvp_out, "    %%jmp T_%u.%u;\n", thread_count,
 		    local_base+count);

@ -765,6 +786,10 @@ static int show_stmt_case(ivl_statement_t net, ivl_scope_t sscope)

 	/* The out of the case. */
      fprintf(vvp_out, "T_%u.%u ;\n",  thread_count, local_base+count);
+	/* The case tests will leave the case expression on the top of
+	   the stack, but we are done with it now. Pop it. */
+      fprintf(vvp_out, "    %%pop/vec4 1;\n");
+
      clear_expression_lookaside();

      return rc;
@ -1238,23 +1263,20 @@ static int show_stmt_condit(ivl_statement_t net, ivl_scope_t sscope)
      int rc = 0;
      unsigned lab_false, lab_out;
      ivl_expr_t expr = ivl_stmt_cond_expr(net);
-      struct vector_info cond;

      show_stmt_file_line(net, "If statement.");

-      cond = draw_eval_expr(expr, STUFF_OK_XZ|STUFF_OK_47|STUFF_OK_RO);
-
-      assert(cond.wid == 1);
+      draw_eval_vec4(expr, STUFF_OK_XZ|STUFF_OK_47|STUFF_OK_RO);

      lab_false = local_count++;
      lab_out = local_count++;

-      fprintf(vvp_out, "    %%jmp/0xz  T_%u.%u, %u;\n",
-	      thread_count, lab_false, cond.base);
-
-	/* Done with the condition expression. */
-      if (cond.base >= 8)
-	    clr_vector(cond);
+      int use_flag = allocate_flag();
+	/* The %flag_set/vec4 pops the vec4 bit and puts it to the flag. */
+      fprintf(vvp_out, "    %%flag_set/vec4 %d;\n", use_flag);
+      fprintf(vvp_out, "    %%jmp/0xz  T_%u.%u, %d;\n",
+	      thread_count, lab_false, use_flag);
+      clr_flag(use_flag);

      if (ivl_stmt_cond_true(net))
 	    rc += show_statement(ivl_stmt_cond_true(net), sscope);
@ -1320,20 +1342,19 @@ static int show_stmt_delayx(ivl_statement_t net, ivl_scope_t sscope)

      show_stmt_file_line(net, "Delay statement.");

+      int use_idx = allocate_word();
      switch (ivl_expr_value(expr)) {

 	  case IVL_VT_BOOL:
 	  case IVL_VT_LOGIC: {
-		struct vector_info del = draw_eval_expr(expr, 0);
-		fprintf(vvp_out, "    %%ix/get 0, %u, %u;\n",
-			del.base, del.wid);
-		clr_vector(del);
+		draw_eval_vec4(expr, 0);
+		fprintf(vvp_out, "    %%ix/vec4 %d;\n", use_idx);
 		break;
 	  }

 	  case IVL_VT_REAL: {
 		draw_eval_real(expr);
-		fprintf(vvp_out, "    %%cvt/ur 0;\n");
+		fprintf(vvp_out, "    %%cvt/ur %d;\n", use_idx);
 		break;
 	  }

@ -1341,7 +1362,9 @@ static int show_stmt_delayx(ivl_statement_t net, ivl_scope_t sscope)
 	    assert(0);
      }

-      fprintf(vvp_out, "    %%delayx 0;\n");
+      fprintf(vvp_out, "    %%delayx %d;\n", use_idx);
+      clr_word(use_idx);
+
 	/* Lots of things can happen during a delay. */
      clear_expression_lookaside();

@ -1755,7 +1778,6 @@ static int show_stmt_wait(ivl_statement_t net, ivl_scope_t sscope)
 static int show_stmt_while(ivl_statement_t net, ivl_scope_t sscope)
 {
      int rc = 0;
-      struct vector_info cvec;

      unsigned top_label = local_count++;
      unsigned out_label = local_count++;
@ -1771,14 +1793,16 @@ static int show_stmt_while(ivl_statement_t net, ivl_scope_t sscope)
 	/* Draw the evaluation of the condition expression, and test
 	   the result. If the expression evaluates to false, then
 	   branch to the out label. */
-      cvec = draw_eval_expr(ivl_stmt_cond_expr(net), STUFF_OK_XZ|STUFF_OK_47);
-      if (cvec.wid > 1)
-	    cvec = reduction_or(cvec);
+      draw_eval_vec4(ivl_stmt_cond_expr(net), STUFF_OK_XZ|STUFF_OK_47);
+      if (ivl_expr_width(ivl_stmt_cond_expr(net)) > 1) {
+	    fprintf(vvp_out, "    %%or/r;\n");
+      }

+      int use_flag = allocate_flag();
+      fprintf(vvp_out, "    %%flag_set/vec4 %d;\n", use_flag);
      fprintf(vvp_out, "    %%jmp/0xz T_%u.%u, %u;\n",
-	      thread_count, out_label, cvec.base);
-      if (cvec.base >= 8)
-	    clr_vector(cvec);
+	      thread_count, out_label, use_flag);
+      clr_flag(use_flag);

 	/* Draw the body of the loop. */
      rc += show_statement(ivl_stmt_sub_stmt(net), sscope);
@ -1966,7 +1990,7 @@ static unsigned is_repeat_event_assign(ivl_scope_t scope,
 */
 static unsigned is_wait(ivl_scope_t scope, ivl_statement_t stmt)
 {
-      ivl_statement_t while_wait, wait, wait_stmt;
+      ivl_statement_t while_wait, wait_x, wait_stmt;
      ivl_expr_t while_expr, expr;
      const char *bits;
 	/* We must have two block elements. */
@ -1975,9 +1999,9 @@ static unsigned is_wait(ivl_scope_t scope, ivl_statement_t stmt)
      while_wait = ivl_stmt_block_stmt(stmt, 0);
      if (ivl_statement_type(while_wait) != IVL_ST_WHILE) return 0;
 	/* That has a wait with a NOOP statement. */
-      wait = ivl_stmt_sub_stmt(while_wait);
-      if (ivl_statement_type(wait) != IVL_ST_WAIT) return 0;
-      wait_stmt = ivl_stmt_sub_stmt(wait);
+      wait_x = ivl_stmt_sub_stmt(while_wait);
+      if (ivl_statement_type(wait_x) != IVL_ST_WAIT) return 0;
+      wait_stmt = ivl_stmt_sub_stmt(wait_x);
      if (ivl_statement_type(wait_stmt) != IVL_ST_NOOP) return 0;
 	/* Check that the while condition has the correct form. */
      while_expr = ivl_stmt_cond_expr(while_wait);
@ -1994,7 +2018,7 @@ static unsigned is_wait(ivl_scope_t scope, ivl_statement_t stmt)
 	/* And finally the two statements that represent the wait must
 	 * have the same line number as the block. */
      if ((ivl_stmt_lineno(stmt) != ivl_stmt_lineno(while_wait)) ||
-          (ivl_stmt_lineno(stmt) != ivl_stmt_lineno(wait))) {
+          (ivl_stmt_lineno(stmt) != ivl_stmt_lineno(wait_x))) {
 	    return 0;
      }

--- a/vvp/codes.h
+++ b/vvp/codes.h
@ -47,9 +47,10 @@ extern bool of_ASSIGN_AVD(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_AVE(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_D(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_MV(vthread_t thr, vvp_code_t code);
-extern bool of_ASSIGN_V0(vthread_t thr, vvp_code_t code);
-extern bool of_ASSIGN_V0D(vthread_t thr, vvp_code_t code);
-extern bool of_ASSIGN_V0E(vthread_t thr, vvp_code_t code);
+extern bool of_ASSIGN_VEC4(vthread_t thr, vvp_code_t code);
+extern bool of_ASSIGN_VEC4D(vthread_t thr, vvp_code_t code);
+extern bool of_ASSIGN_VEC4E(vthread_t thr, vvp_code_t code);
+extern bool of_ASSIGN_VEC4_OFF_D(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_V0X1(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_V0X1D(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_V0X1E(vthread_t thr, vvp_code_t code);
@ -86,6 +87,7 @@ extern bool of_CVT_UR(vthread_t thr, vvp_code_t code);
 extern bool of_CVT_VR(vthread_t thr, vvp_code_t code);
 extern bool of_DEASSIGN(vthread_t thr, vvp_code_t code);
 extern bool of_DEASSIGN_WR(vthread_t thr, vvp_code_t code);
+extern bool of_DEBUG_THR(vthread_t thr, vvp_code_t code);
 extern bool of_DELAY(vthread_t thr, vvp_code_t code);
 extern bool of_DELAYX(vthread_t thr, vvp_code_t code);
 extern bool of_DELETE_OBJ(vthread_t thr, vvp_code_t code);
@ -95,12 +97,16 @@ extern bool of_DIV(vthread_t thr, vvp_code_t code);
 extern bool of_DIV_S(vthread_t thr, vvp_code_t code);
 extern bool of_DIV_WR(vthread_t thr, vvp_code_t code);
 extern bool of_DUP_REAL(vthread_t thr, vvp_code_t code);
+extern bool of_DUP_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_END(vthread_t thr, vvp_code_t code);
 extern bool of_EVCTL(vthread_t thr, vvp_code_t code);
 extern bool of_EVCTLC(vthread_t thr, vvp_code_t code);
 extern bool of_EVCTLI(vthread_t thr, vvp_code_t code);
 extern bool of_EVCTLS(vthread_t thr, vvp_code_t code);
 extern bool of_FILE_LINE(vthread_t thr, vvp_code_t code);
+extern bool of_FLAG_GET_VEC4(vthread_t thr, vvp_code_t code);
+extern bool of_FLAG_SET_IMM(vthread_t thr, vvp_code_t code);
+extern bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_FORCE_LINK(vthread_t thr, vvp_code_t code);
 extern bool of_FORCE_V(vthread_t thr, vvp_code_t code);
 extern bool of_FORCE_WR(vthread_t thr, vvp_code_t code);
@ -117,6 +123,8 @@ extern bool of_IX_LOAD(vthread_t thr, vvp_code_t code);
 extern bool of_IX_MOV(vthread_t thr, vvp_code_t code);
 extern bool of_IX_MUL(vthread_t thr, vvp_code_t code);
 extern bool of_IX_SUB(vthread_t thr, vvp_code_t code);
+extern bool of_IX_VEC4(vthread_t thr, vvp_code_t code);
+extern bool of_IX_VEC4_S(vthread_t thr, vvp_code_t code);
 extern bool of_JMP(vthread_t thr, vvp_code_t code);
 extern bool of_JMP0(vthread_t thr, vvp_code_t code);
 extern bool of_JMP0XZ(vthread_t thr, vvp_code_t code);
@ -135,7 +143,7 @@ extern bool of_LOAD_DAR_STR(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_OBJ(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_STR(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_STRA(vthread_t thr, vvp_code_t code);
-extern bool of_LOAD_VEC(vthread_t thr, vvp_code_t code);
+extern bool of_LOAD_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VP0(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VP0_S(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_X1P(vthread_t thr, vvp_code_t code);
@ -160,10 +168,13 @@ extern bool of_NORR(vthread_t thr, vvp_code_t code);
 extern bool of_NULL(vthread_t thr, vvp_code_t code);
 extern bool of_OR(vthread_t thr, vvp_code_t code);
 extern bool of_ORR(vthread_t thr, vvp_code_t code);
-extern bool of_PAD(vthread_t thr, vvp_code_t code);
+extern bool of_PAD_S(vthread_t thr, vvp_code_t code);
+extern bool of_PAD_U(vthread_t thr, vvp_code_t code);
+extern bool of_PART(vthread_t thr, vvp_code_t code);
 extern bool of_POP_OBJ(vthread_t thr, vvp_code_t code);
 extern bool of_POP_REAL(vthread_t thr, vvp_code_t code);
 extern bool of_POP_STR(vthread_t thr, vvp_code_t code);
+extern bool of_POP_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_POW(vthread_t thr, vvp_code_t code);
 extern bool of_POW_S(vthread_t thr, vvp_code_t code);
 extern bool of_POW_WR(vthread_t thr, vvp_code_t code);
@ -173,6 +184,7 @@ extern bool of_PROP_STR(vthread_t thr, vvp_code_t code);
 extern bool of_PROP_V(vthread_t thr, vvp_code_t code);
 extern bool of_PUSHI_STR(vthread_t thr, vvp_code_t code);
 extern bool of_PUSHI_REAL(vthread_t thr, vvp_code_t code);
+extern bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_PUSHV_STR(vthread_t thr, vvp_code_t code);
 extern bool of_PUTC_STR_V(vthread_t thr, vvp_code_t code);
 extern bool of_RELEASE_NET(vthread_t thr, vvp_code_t code);
@ -187,9 +199,9 @@ extern bool of_SET_DAR_OBJ_STR(vthread_t thr, vvp_code_t code);
 extern bool of_SET_VEC(vthread_t thr, vvp_code_t code);
 extern bool of_SET_X0(vthread_t thr, vvp_code_t code);
 extern bool of_SET_X0_X(vthread_t thr, vvp_code_t code);
-extern bool of_SHIFTL_I0(vthread_t thr, vvp_code_t code);
-extern bool of_SHIFTR_I0(vthread_t thr, vvp_code_t code);
-extern bool of_SHIFTR_S_I0(vthread_t thr, vvp_code_t code);
+extern bool of_SHIFTL(vthread_t thr, vvp_code_t code);
+extern bool of_SHIFTR(vthread_t thr, vvp_code_t code);
+extern bool of_SHIFTR_S(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_DAR_R(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_DAR_STR(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_OBJ(vthread_t thr, vvp_code_t code);
@ -201,6 +213,7 @@ extern bool of_STORE_REAL(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_REALA(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_STR(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_STRA(vthread_t thr, vvp_code_t code);
+extern bool of_STORE_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_SUB(vthread_t thr, vvp_code_t code);
 extern bool of_SUB_WR(vthread_t thr, vvp_code_t code);
 extern bool of_SUBI(vthread_t thr, vvp_code_t code);
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@ -85,11 +85,11 @@ struct opcode_table_s {

 static const struct opcode_table_s opcode_table[] = {
      { "%abs/wr", of_ABS_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%add",    of_ADD,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%add",    of_ADD,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%add/wr", of_ADD_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%addi",   of_ADDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%alloc",  of_ALLOC,  1,  {OA_VPI_PTR,  OA_NONE,     OA_NONE} },
-      { "%and",    of_AND,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%and",    of_AND,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%and/r",  of_ANDR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%andi",   of_ANDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%assign/ar",of_ASSIGN_AR,2,{OA_ARR_PTR,OA_BIT1,     OA_NONE} },
@ -98,17 +98,18 @@ static const struct opcode_table_s opcode_table[] = {
      { "%assign/av",of_ASSIGN_AV,3,{OA_ARR_PTR,OA_BIT1,     OA_BIT2} },
      { "%assign/av/d",of_ASSIGN_AVD,3,{OA_ARR_PTR,OA_BIT1,  OA_BIT2} },
      { "%assign/av/e",of_ASSIGN_AVE,2,{OA_ARR_PTR,OA_BIT1,  OA_NONE} },
-      { "%assign/v0",of_ASSIGN_V0,3,{OA_FUNC_PTR,OA_BIT1,    OA_BIT2} },
-      { "%assign/v0/d",of_ASSIGN_V0D,3,{OA_FUNC_PTR,OA_BIT1, OA_BIT2} },
-      { "%assign/v0/e",of_ASSIGN_V0E,2,{OA_FUNC_PTR,OA_BIT1, OA_NONE} },
      { "%assign/v0/x1",of_ASSIGN_V0X1,3,{OA_FUNC_PTR,OA_BIT1,OA_BIT2} },
      { "%assign/v0/x1/d",of_ASSIGN_V0X1D,3,{OA_FUNC_PTR,OA_BIT1,OA_BIT2} },
      { "%assign/v0/x1/e",of_ASSIGN_V0X1E,2,{OA_FUNC_PTR,OA_BIT1,OA_NONE} },
+      { "%assign/vec4",  of_ASSIGN_VEC4, 2,{OA_FUNC_PTR, OA_BIT1, OA_NONE} },
+      { "%assign/vec4/d",of_ASSIGN_VEC4D,2,{OA_FUNC_PTR, OA_BIT1, OA_NONE} },
+      { "%assign/vec4/e",of_ASSIGN_VEC4E,1,{OA_FUNC_PTR, OA_NONE, OA_NONE} },
+      { "%assign/vec4/off/d",of_ASSIGN_VEC4_OFF_D, 3,{OA_FUNC_PTR, OA_BIT1, OA_BIT2} },
      { "%assign/wr",  of_ASSIGN_WR, 2,{OA_VPI_PTR, OA_BIT1, OA_NONE} },
      { "%assign/wr/d",of_ASSIGN_WRD,2,{OA_VPI_PTR, OA_BIT1, OA_NONE} },
      { "%assign/wr/e",of_ASSIGN_WRE,1,{OA_VPI_PTR, OA_NONE, OA_NONE} },
      { "%assign/x0",of_ASSIGN_X0,3,{OA_FUNC_PTR,OA_BIT1,    OA_BIT2} },
-      { "%blend",    of_BLEND,   3,  {OA_BIT1,  OA_BIT2,     OA_NUMBER} },
+      { "%blend",    of_BLEND,   0,  {OA_NONE,  OA_NONE,     OA_NONE} },
      { "%blend/wr", of_BLEND_WR,0,  {OA_NONE,  OA_NONE,     OA_NONE} },
      { "%breakpoint", of_BREAKPOINT, 0,  {OA_NONE, OA_NONE, OA_NONE} },
      { "%cassign/link",of_CASSIGN_LINK,2,{OA_FUNC_PTR,OA_FUNC_PTR2,OA_NONE} },
@ -116,9 +117,9 @@ static const struct opcode_table_s opcode_table[] = {
      { "%cassign/wr",of_CASSIGN_WR,1,{OA_FUNC_PTR,OA_NONE,  OA_NONE} },
      { "%cassign/x0",of_CASSIGN_X0,3,{OA_FUNC_PTR,OA_BIT1,  OA_BIT2} },
      { "%cast2",  of_CAST2,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%cmp/s",  of_CMPS,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%cmp/s",  of_CMPS,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%cmp/str",of_CMPSTR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%cmp/u",  of_CMPU,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%cmp/u",  of_CMPU,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%cmp/wr", of_CMPWR,  0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%cmp/ws", of_CMPWS,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
      { "%cmp/wu", of_CMPWU,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
@ -137,6 +138,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%cvt/vr", of_CVT_VR, 2,  {OA_BIT1,     OA_NUMBER,   OA_NONE} },
      { "%deassign",of_DEASSIGN,3,{OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
      { "%deassign/wr",of_DEASSIGN_WR,1,{OA_FUNC_PTR, OA_NONE,     OA_NONE} },
+      { "%debug/thr",  of_DEBUG_THR,  0,{OA_NONE,     OA_NONE,     OA_NONE} },
      { "%delay",  of_DELAY,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
      { "%delayx", of_DELAYX, 1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
      { "%delete/obj",of_DELETE_OBJ,1,{OA_FUNC_PTR,OA_NONE,  OA_NONE} },
@ -145,17 +147,21 @@ static const struct opcode_table_s opcode_table[] = {
      { "%div/s",  of_DIV_S,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%div/wr",   of_DIV_WR,  0, {OA_NONE,   OA_NONE,     OA_NONE} },
      { "%dup/real", of_DUP_REAL,0, {OA_NONE,   OA_NONE,     OA_NONE} },
+      { "%dup/vec4", of_DUP_VEC4,0, {OA_NONE,   OA_NONE,     OA_NONE} },
      { "%end",    of_END,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%evctl",  of_EVCTL,  2,  {OA_FUNC_PTR, OA_BIT1,     OA_NONE} },
      { "%evctl/c",of_EVCTLC, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%evctl/i",of_EVCTLI, 2,  {OA_FUNC_PTR, OA_BIT1,     OA_NONE} },
      { "%evctl/s",of_EVCTLS, 2,  {OA_FUNC_PTR, OA_BIT1,     OA_NONE} },
+      { "%flag_get/vec4", of_FLAG_GET_VEC4, 1, {OA_NUMBER, OA_NONE, OA_NONE} },
+      { "%flag_set/imm",  of_FLAG_SET_IMM,  2, {OA_NUMBER, OA_BIT1, OA_NONE} },
+      { "%flag_set/vec4", of_FLAG_SET_VEC4, 1, {OA_NUMBER, OA_NONE, OA_NONE} },
      { "%force/link",of_FORCE_LINK,2,{OA_FUNC_PTR,OA_FUNC_PTR2,OA_NONE} },
      { "%force/v",of_FORCE_V,3,  {OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
      { "%force/wr",of_FORCE_WR,1,{OA_FUNC_PTR, OA_NONE,     OA_NONE} },
      { "%force/x0",of_FORCE_X0,3,{OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
      { "%free",   of_FREE,   1,  {OA_VPI_PTR,  OA_NONE,     OA_NONE} },
-      { "%inv",    of_INV,    2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
+      { "%inv",    of_INV,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%ix/add", of_IX_ADD, 3,  {OA_NUMBER,   OA_BIT1,     OA_BIT2} },
      { "%ix/get", of_IX_GET, 3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%ix/get/s",of_IX_GET_S,3,{OA_BIT1,     OA_BIT2,     OA_NUMBER} },
@ -165,6 +171,8 @@ static const struct opcode_table_s opcode_table[] = {
      { "%ix/mov", of_IX_MOV, 2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
      { "%ix/mul", of_IX_MUL, 3,  {OA_NUMBER,   OA_BIT1,     OA_BIT2} },
      { "%ix/sub", of_IX_SUB, 3,  {OA_NUMBER,   OA_BIT1,     OA_BIT2} },
+      { "%ix/vec4",  of_IX_VEC4,  1,  {OA_NUMBER,   OA_NONE, OA_NONE} },
+      { "%ix/vec4/s",of_IX_VEC4_S,1,  {OA_NUMBER,   OA_NONE, OA_NONE} },
      { "%jmp",    of_JMP,    1,  {OA_CODE_PTR, OA_NONE,     OA_NONE} },
      { "%jmp/0",  of_JMP0,   2,  {OA_CODE_PTR, OA_BIT1,     OA_NONE} },
      { "%jmp/0xz",of_JMP0XZ, 2,  {OA_CODE_PTR, OA_BIT1,     OA_NONE} },
@ -183,7 +191,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%load/real", of_LOAD_REAL,1,{OA_VPI_PTR, OA_NONE,     OA_NONE} },
      { "%load/str",  of_LOAD_STR, 1,{OA_FUNC_PTR,OA_NONE,     OA_NONE} },
      { "%load/stra", of_LOAD_STRA,2,{OA_ARR_PTR, OA_BIT1,     OA_NONE} },
-      { "%load/v", of_LOAD_VEC,3,    {OA_BIT1,    OA_FUNC_PTR, OA_BIT2} },
+      { "%load/vec4", of_LOAD_VEC4,1,{OA_FUNC_PTR,OA_NONE,     OA_NONE} },
      { "%load/vp0",of_LOAD_VP0,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
      { "%load/vp0/s",of_LOAD_VP0_S,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
      { "%load/x1p",of_LOAD_X1P,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
@ -206,12 +214,15 @@ static const struct opcode_table_s opcode_table[] = {
      { "%nor",    of_NOR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%nor/r",  of_NORR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%null",   of_NULL,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%or",     of_OR,     3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%or",     of_OR,     0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%or/r",   of_ORR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%pad",    of_PAD,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%pad/s",  of_PAD_S,  1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
+      { "%pad/u",  of_PAD_U,  1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
+      { "%part",   of_PART,   1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
      { "%pop/obj", of_POP_OBJ, 2, {OA_BIT1,    OA_BIT2,     OA_NONE} },
      { "%pop/real",of_POP_REAL,1, {OA_NUMBER,  OA_NONE,     OA_NONE} },
      { "%pop/str", of_POP_STR, 1, {OA_NUMBER,  OA_NONE,     OA_NONE} },
+      { "%pop/vec4",of_POP_VEC4,1, {OA_NUMBER,  OA_NONE,     OA_NONE} },
      { "%pow",    of_POW,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%pow/s",  of_POW_S,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%pow/wr", of_POW_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
@ -221,6 +232,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%prop/v",  of_PROP_V,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%pushi/real",of_PUSHI_REAL,2,{OA_BIT1,   OA_BIT2,   OA_NONE} },
      { "%pushi/str", of_PUSHI_STR, 1,{OA_STRING, OA_NONE,   OA_NONE} },
+      { "%pushi/vec4",of_PUSHI_VEC4,3,{OA_BIT1,   OA_BIT2,   OA_NUMBER} },
      { "%pushv/str", of_PUSHV_STR, 2, {OA_BIT1,OA_BIT2,     OA_NONE} },
      { "%putc/str/v",of_PUTC_STR_V,3,{OA_FUNC_PTR,OA_BIT1,  OA_BIT2} },
      { "%release/net",of_RELEASE_NET,3,{OA_FUNC_PTR,OA_BIT1,OA_BIT2} },
@ -234,9 +246,9 @@ static const struct opcode_table_s opcode_table[] = {
      { "%set/dar/obj/str", of_SET_DAR_OBJ_STR, 1,{OA_NUMBER,OA_NONE,OA_NONE} },
      { "%set/v",  of_SET_VEC,3,  {OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
      { "%set/x0", of_SET_X0, 3,  {OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
-      { "%shiftl/i0", of_SHIFTL_I0, 2, {OA_BIT1,OA_NUMBER,   OA_NONE} },
-      { "%shiftr/i0", of_SHIFTR_I0, 2, {OA_BIT1,OA_NUMBER,   OA_NONE} },
-      { "%shiftr/s/i0", of_SHIFTR_S_I0,2,{OA_BIT1,OA_NUMBER, OA_NONE} },
+      { "%shiftl",   of_SHIFTL,   1, {OA_NUMBER, OA_NONE,   OA_NONE} },
+      { "%shiftr",   of_SHIFTR,   1, {OA_NUMBER, OA_NONE,   OA_NONE} },
+      { "%shiftr/s", of_SHIFTR_S, 1, {OA_NUMBER, OA_NONE,   OA_NONE} },
      { "%store/dar/r",  of_STORE_DAR_R,   1, {OA_FUNC_PTR, OA_NONE, OA_NONE} },
      { "%store/dar/str",of_STORE_DAR_STR, 1, {OA_FUNC_PTR, OA_NONE, OA_NONE} },
      { "%store/obj",   of_STORE_OBJ,   1, {OA_FUNC_PTR,OA_NONE, OA_NONE} },
@ -248,7 +260,8 @@ static const struct opcode_table_s opcode_table[] = {
      { "%store/reala", of_STORE_REALA, 2, {OA_ARR_PTR, OA_BIT1, OA_NONE} },
      { "%store/str",   of_STORE_STR,   1, {OA_FUNC_PTR,OA_NONE, OA_NONE} },
      { "%store/stra",  of_STORE_STRA,  2, {OA_ARR_PTR, OA_BIT1, OA_NONE} },
-      { "%sub",    of_SUB,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%store/vec4",  of_STORE_VEC4,  2, {OA_FUNC_PTR,OA_BIT1, OA_NONE} },
+      { "%sub",    of_SUB,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%sub/wr", of_SUB_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%subi",   of_SUBI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%substr",  of_SUBSTR,  2,{OA_BIT1,     OA_BIT2,     OA_NONE} },
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@ -32,12 +32,19 @@ experience of implementing it for strings, I'll want to change other
 types around to using this method as well. Keep this in mind whenever
 considering adding new instructions to vvp.

+FLAGS
+
+There are up to 16 bits in each thread that are available for
+flags. These are used as destinations for operations that return
+boolean values, for example comparisons. They are also used as inputs
+for test and branch opcodes.
+
 * %abs/wr <bit-o>, <bit-i>

 This instruction calculates the absolute value of a real value. It uses
 the fabs() function in the run-time to do the work.

-* %add <bit-l>, <bit-r>, <wid>
+* %add <bit-l>, <bit-r>, <wid> (XXXX Old version)

 This instruction adds the right vector into the left vector, the
 vectors having the width <wid>. If any of the bits of either vector
@ -46,6 +53,13 @@ sum.

 See also the %sub instruction.

+* %add
+
+This opcode pops two vec4 values from the vec4 stack, adds them, and
+pushes the result back to the stack. The input values must have the
+same size, and the pushed result will have the same width.
+
+See also the %sub instruction.

 * %add/wr <bit-l>, <bit-r>

@ -67,17 +81,20 @@ is zero extended to match any width.
 This instruction allocates the storage for a new instance of an
 automatically allocated scope.

-* %and <bit-l>, <bit-r>, <wid>
+* %and

-Perform the bitwise AND of the two vectors, and store the result in
-the left vector. Each bit is calculated independent of other bits. AND
-means the following:
+Perform the bitwise AND of the two vectors popped from the vec4 stack,
+and push the result. Each bit is calculated independent of other
+bits. AND means the following:

 	0 and ? --> 0
 	? and 0 --> 0
 	1 and 1 --> 1
 	otherwise   x

+The input vectors must be the same width, and the output vector will
+be the width of the input.
+
 * %assign/ar <array-label>, <delay>
 * %assign/ar/d <array-label>, <delayx>
 * %assign/ar/e <array-label>
@ -123,9 +140,9 @@ The %assign/av/e variation uses the information in the thread
 event control registers to determine when to perform the assign.
 %evctl is used to set the event control information.

-* %assign/v0 <var-label>, <delay>, <bit>
-* %assign/v0/d <var-label>, <delayx>, <bit>
-* %assign/v0/e <var-label>, <bit>
+* %assign/v0 <var-label>, <delay>, <bit> (XXXX Old description)
+* %assign/v0/d <var-label>, <delayx>, <bit> (XXXX Old description)
+* %assign/v0/e <var-label>, <bit> (XXXX Old description)

 The %assign/v0 instruction is a vector version of non-blocking
 assignment. The <delay> is the number of clock ticks in the future
@ -152,6 +169,27 @@ This is similar to the %assign/v0 instruction, but adds the index-1
 index register with the canonical index of the destination where the
 vector is to be written. This allows for part writes into the vector.

+* %assign/vec4 <var-label>, <delay>
+* %assign/vec4/d <var-label>, <delayx>
+* %assign/vec4/e <var-label>
+
+The %assign/vec4 instruction is a vec4 version of non-blocking
+assignment. The <delay> is the number of clock ticks in the future
+where the assignment should schedule, and the value to assign is
+pulled from the vec4 stack.
+
+The %assign/vec4/d instruction is the same, but gets its delay value
+from the index register <delayx> instead.
+
+* %assign/vec4/off/d <var-label>, <off-index>, <delay-index>
+
+This is for writing parts to the target variable. The <var-label> is
+the variable to write, as usual. The <off-index> selects an index
+register that holds the offset into the target variable, and the
+<delay-index> selects the index register that contains the delay. The
+offset is in canonical bits. The width that is written is taken from
+the width of the value on the stack.
+
 * %assign/wr <vpi-label>, <delay>
 * %assign/wr/d <vpi-label>, <delayx>
 * %assign/wr/e <vpi-label>
@ -180,10 +218,12 @@ The <bit> is the address of the thread register that contains the bit
 value to assign.


-* %blend <bit-l>, <bit-r>, <wid>
+* %blend

-This instruction blends the bits of a vector into the destination in a
-manner like the expression (x ? <a> : <b>). The truth table is:
+This instruction blends the bits of two vectors into a result in a
+manner like the expression ('bx ? <a> : <b>). The two source vectors
+are popped from the vec4 stack (and must have the same width) and the
+result pushed in their place. The truth table for each bit is:

 	1  1 --> 1
 	0  0 --> 0
@ -238,8 +278,8 @@ Convert the source vector, of type logic, to a bool vector by
 changing all the X and Z bits to 0. The source and destinations may
 overlap.

-* %cmp/u <bit-l>, <bit-r>, <wid>
-* %cmp/s <bit-l>, <bit-r>, <wid>
+* %cmp/u <bit-l>, <bit-r>, <wid> (XXXX Old meaning)
+* %cmp/s <bit-l>, <bit-r>, <wid> (XXXX Old meaning)

 These instructions perform a generic comparison of two vectors of equal
 size. The <bit-l> and <bit-r> numbers address the least-significant
@ -268,6 +308,21 @@ The %cmp/u and %cmp/s differ only in the handling of the lt bit. The
 compare. In either case, if either operand contains x or z, then lt
 bit gets the x value.

+* %cmp/s
+* %cmp/u
+
+These instructions perform a generic comparison of two vectors of
+equal size. Two values are pulled from the top of the stack, and not
+replaced. The results are written into flag bits 4,5,6. The
+expressions (a<b), (a==b) and (a===b) are calculated, with (b) popped
+from the stack first, then (a).
+
+The results of the comparison go into flags 4, 5 and 6:
+
+	4: eq  (equal)
+	5: lt  (less than)
+	6: eeq (case equal)
+
 * %cmpi/s <bit-l>, <immr>, <wid>
 * %cmpi/u <bit-l>, <immr>, <wid>

@ -424,6 +479,7 @@ right operand is 0, then the result is NaN.


 * dup/real
+* dup/vec4

 These opcodes duplicate the value on the top of the stack for the
 corresponding type.
@ -458,6 +514,18 @@ the format of the output is:
 <description> is a string, if string is 0 then the following default
 message is used: "Procedural tracing.".

+* %flag_set/imm <flag>, <value>
+
+This instruction sets an immediate value into a flag bit. This is a
+single bit, and the value is 0==0, 1==1, 2==z, 3==x.
+
+* %flag_get/vec4 <flag>
+* %flag_set/vec4 <flag>
+
+These instructions provide a means for accessing flag bits. The
+%flag_get/vec4 loads the numbered flag as a vec4 on top of the vec4
+stack, and the %flag_set/vec4 pops the top of the vec4 stack and
+writes the LSB to the selected flag.

 * %force/v <label>, <bit>, <wid>

@ -497,10 +565,10 @@ This instruction de-allocates the storage for a previously allocated
 instance of as automatically allocated scope.


-* %inv <bit>, <wid>
+* %inv

-Perform a bitwise invert of the vector starting at <bit>. The result
-replaces the input. Invert means the following, independently for each
+Perform a bitwise invert of the vector on top of the vec4 stack. The result
+replaces the input. Invert means the following, independently, for each
 bit:

 	0  --> 1
@ -509,20 +577,20 @@ bit:
 	z  --> x


-* %ix/get <idx>, <bit>, <wid>
-* %ix/get/s <idx>, <bit>, <wid>
+* %ix/vec4 <idx>
+* %ix/vec4/s <idx>

-This instruction loads a thread vector starting at <bit>, size <wid>,
-into the index register <idx>. The <bit> is the LSB of the value in
-thread bit space, and <wid> is the width of the vector.
+This instruction loads a vec4 value from the vec4 stack, into the
+index register <idx>. The value is popped from the vec4 stack and
+written to the index register.

-The function converts the 4-value bits into a binary number, without
-sign extension. If any of the bits of the vector is x or z, then the
-index register gets the value 0. The %ix/get/s is the same, except
-that it assumes the source vector is sign extended to fit the index
-register.
+The %ix/vec4 instruction converts the 4-value bits into a binary
+number, without sign extension. If any of the bits of the vector is x
+or z, then the index register gets the value 0. The %ix/vec4/s
+instruction is the same, except that it assumes the source vector is
+sign extended to fit the index register.

-The function also writes into bit 4 a 1 if any of the bits of the
+The instruction also writes into flag 4 a 1 if any of the bits of the
 input vector are x or z. This is a flag that the 0 value written into
 the index register is really the result of calculating from unknown
 bits.
@ -568,10 +636,10 @@ the index register <src>.
 The %jmp instruction performs an unconditional branch to a given
 location. The parameter is the label of the destination instruction.

-* %jmp/[01xz] <code-label>, <bit>
+* %jmp/[01xz] <code-label>, <flag>

 This is a conditional version of the %jmp instruction. In this case,
-a single bit (addressed by <bit>) is tested. If it is one of the
+a flag bit (addressed by <flag>) is tested. If it is one of the
 values in the part after the /, the jump is taken. For example:

 	%jmp/xz T_label, 8;
@ -663,7 +731,7 @@ strings, and there is an index value in index register 3.
 (See also %store/dar/str)


-* %load/v <bit>, <functor-label>, <wid>
+* %load/v <bit>, <functor-label>, <wid> (XXXX Old implementation)

 This instruction loads a vector value from the given functor node into
 the specified thread register bit. The functor-label can refer to a
@ -674,6 +742,11 @@ width at the functor. If the <wid> is less than the width at the
 functor, then the most significant bits are dropped. If the <wid> is
 more than the width at the functor, the value is padded with X bits.

+* %load/vec4 <var-label>
+
+This instruction loads a vector value from the given functor node and
+pushes it onto the vec4 stack. See also the %store/vec4 instruction.
+
 * %load/vp0 <bit>, <functor-label>, <wid>
 * %load/vp0/s <bit>, <functor-label>, <wid>

@ -836,10 +909,11 @@ the vector.
 Push a null object and push it to the object stack. The null object
 can be used with any class or darray object, so it is not typed.

-* %or <dst>, <src>, <wid>
+* %or

-Perform the bitwise or of the vectors. Each bit in the <dst> is
-combined with the corresponding bit in the source, according to the
+Perform the bitwise or of two vectors. Pop two values from the vec4
+stack to get the input arguments. Each bit in the result is computed
+from the corresponding bits in the input arguments, according to the
 truth table:

 	1 or ? --> 1
@ -847,6 +921,8 @@ truth table:
 	0 or 0 --> 0
 	otherwise  x

+The result is then pushed onto the vec4 stack. The inputs and the
+output are all the same width.

 * %or/r <dst>, <src>, <wid>

@ -855,18 +931,33 @@ and the <dst> is a writable scalar. The <dst> gets the value of the
 or of all the bits of the src vector.


-* %pad <dst>, <src>, <wid>
+* %pad <dst>, <src>, <wid> (XXXX Old version)

 This instruction replicates a single bit in register space into a
 destination vector in register space. The destination may overlap
 the source bit. The <dst> may not be 0-3. This is useful for zero
 or sign extending a vector.

+* %pad/s <wid>
+* %pad/u <wid>
+
+These instructions change the size of the top item in the vec4
+stack to <wid> bits. If the item is wider than <wid>, it is truncated.
+If narrower, it is extended. The /s variant sign extends, the /u
+variant unsigned extends.
+
+* %part <wid>
+
+This instruction implements a part select. It pops from the top of the
+vec4 stack the base value, then it pops the vector to select from. The
+width is the fixed number <wid>. The result is pushed back to the stack.
+
 * %pop/str <num>
 * %pop/real <num>
 * %pop/obj <num>, <skip>
+* %pop/vec4 <num>

-Pop <num> items from the string/real/object stack. This is the
+Pop <num> items from the string/real/object/vec4 stack. This is the
 opposite of the %pushX/str opcode which pushes a string to the
 stack. The %pop/str is not normally needed because the %store/str
 includes an implicit pop, but sometimes it is necessary to pop
@ -917,6 +1008,21 @@ If <exp>==0x3fff and <mant> != 0, the value is NaN.

 Push a literal string to the string stack.

+* %pushi/vec4 <vala>, <valb>, <wid>
+
+This opcode loads an immediate value, vector4, into the vector
+stack. The <vala> is the boolean value bits, and the <valb> bits are
+modifiers to support z and x values. The a/b encodings for the 4
+possible logic values are:
+
+   a b  val
+   0 0   0
+   1 0   1
+   1 1   x
+   0 1   z
+
+This opcode is limited to 32-bit numbers.
+
 * %pushv/str <src>, <wid>

 Convert a vector to a string and push the string to the string stack.
@ -1013,7 +1119,7 @@ not assigned. Also, if the bits go beyond the end of the signal, those
 bits are not written anywhere.


-* %shiftl/i0 <bit>, <wid>
+* %shiftl/i0 <bit>, <wid> (XXXX Old implementation)

 This instruction shifts the vector left (towards more significant
 bits) by the amount in index register 0. The <bit> is the address of
@ -1022,8 +1128,8 @@ done in place. Zero values are shifted in.

 For a negative shift the value is padded with 'bx.

-* %shiftr/i0 <bit>, <wid>
-* %shiftr/s/i0 <bit>, <wid>
+* %shiftr/i0 <bit>, <wid> (XXXX Old implementation)
+* %shiftr/s/i0 <bit>, <wid> (XXXX Old implementation)

 This instruction shifts the vector right (towards the less significant
 bits) by the amount in the index register 0. The <bit> is the address
@ -1035,6 +1141,14 @@ top bits. %shiftr/s/i0 is a signed shift, so the value is sign-extended.

 For a negative shift %shiftr/i0 will pad the value with 'bx.

+* %shiftl <idx>
+* %shiftr <idx>
+* %shiftr/s <idx>
+
+These instructions shift the top value in the vec4 stack left (towards
+MSB) or right, possibly signed. The <idx> is the address of the index
+register that contains the amount to shift.
+
 * %store/obj <var-label>

 This pops the top of the object stack and writes it to the object
@ -1077,7 +1191,14 @@ The %store/stra targets an array.
 The %store/dar/str is similar, but the target is a dynamic array of
 string string. The index is taken from signed index register 3.

-* %sub <bit-l>, <bit-r>, <wid>
+* %store/vec4 <var-label>, <wid>
+
+Store a logic vector into the variable. The value (and its width) is
+popped off the top of the vec4 stack, optionally truncated to <wid>
+bits, and assigned to the variable. It is an error for the value to
+have fewer than <wid> bits.
+
+* %sub <bit-l>, <bit-r>, <wid> (XXXX Old version)

 This instruction arithmetically subtracts the right vector out of the
 left vector. It accomplishes this by adding to the left vector 1 plus
@ -1088,6 +1209,14 @@ operand are x, then the entire result is x.

 See also the %add instruction.

+* %sub
+
+This instruction subtracts vec4 values. The right value is popped from
+the vec4 stack, then the left value is popped. The right is subtracted
+from the left, and the result pushed.
+
+See also the %add instruction.
+
 * %subi <bit-l>, <imm>, <wid>

 This instruction arithmetically subtracts the immediate value from the
@ -1121,7 +1250,8 @@ values into the vector space. The string value is NOT popped.
 * %test_nul <var-label>

 This instruction tests the contents of the addressed variable to see
-if it is null. If it is, set bit 4 to 1. Otherwise, set bit 4 to 0.
+if it is null. If it is, set flag bit 4 to 1. Otherwise, set flag bit
+4 to 0.

 This is intended to implement the SystemVerilog expression
 (<var>==null), where <var> is a class variable.
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc