diff --git a/scripts/devel-stub.sh b/scripts/devel-stub.sh
index 3f26a56aa..7d12e59ba 100644
--- a/scripts/devel-stub.sh
+++ b/scripts/devel-stub.sh
@@ -9,6 +9,6 @@
 #
 # NOTE: DO NOT INSTALL THIS FILE.
 
-./ivl -v -Ctgt-stub/stub.conf -C./scripts/devel-stub.conf -Pa.pf -Na.net -fDLL=tgt-stub/stub.tgt foo.vl |& tee foo.log
+./ivl -v -Ctgt-stub/stub.conf -C./scripts/devel-stub.conf -Pa.pf -Na.net -fDLL=tgt-stub/stub.tgt foo.vl 2>&1 | tee foo.log
 
 echo "*** ivl command completed"
diff --git a/tgt-vvp/draw_vpi.c b/tgt-vvp/draw_vpi.c
index 2583256c3..4a2f652f9 100644
--- a/tgt-vvp/draw_vpi.c
+++ b/tgt-vvp/draw_vpi.c
@@ -29,13 +29,13 @@
 
 struct args_info {
       char*text;
-      int vec_flag; /* True if the vec must be released. */
-      struct vector_info vec;
+	/* True if this argument is a calculated vec4. */
+      char vec_flag;
 	/* True if this argument is a calculated string. */
       char str_flag;
 	/* True if this argument is a calculated real. */
       char real_flag;
-	/* Stack position if this argument is a calculated string. */
+	/* Stack position if this argument is a calculated value. */
       unsigned stack;
       struct args_info *child; /* Arguments can be nested. */
 };
@@ -156,6 +156,7 @@ static int get_vpi_taskfunc_signal_arg(struct args_info *result,
 			      return 0;
 			}
 		  } else if (word_ex) {
+#if 0
 			/* Fallback case: evaluate expression. */
 			struct vector_info av;
 			av = draw_eval_expr(word_ex, STUFF_OK_XZ);
@@ -164,6 +165,9 @@ static int get_vpi_taskfunc_signal_arg(struct args_info *result,
 			         (ivl_expr_signed(word_ex) ? "s" : "u"));
 			result->vec = av;
 			result->vec_flag = 1;
+#else
+			assert(0); // XXXX
+#endif
 		  } else {
 			assert(use_word_defined);
 			snprintf(buffer, sizeof buffer, "&A<v%p, %u>",
@@ -247,6 +251,7 @@ static int get_vpi_taskfunc_signal_arg(struct args_info *result,
 			return 0;
 		  }
 	    } else {
+#if 0
 		    /* Fallback case: evaluate the expression. */
 		  struct vector_info rv;
 		  rv = draw_eval_expr(bexpr, STUFF_OK_XZ);
@@ -258,6 +263,9 @@ static int get_vpi_taskfunc_signal_arg(struct args_info *result,
 		           ivl_expr_width(expr));
 		  result->vec = rv;
 		  result->vec_flag = 1;
+#else
+		  assert(0); // XXXX
+#endif
 	    }
 	    result->text = strdup(buffer);
 	    return 1;
@@ -286,6 +294,7 @@ static void draw_vpi_taskfunc_args(const char*call_string,
 	/* Keep track of how much string stack this function call is
 	   going to need. We'll need this for making stack references,
 	   and also to clean out the stack when done. */
+      unsigned vec4_stack_need = 0;
       unsigned str_stack_need = 0;
       unsigned real_stack_need = 0;
 
@@ -389,17 +398,17 @@ static void draw_vpi_taskfunc_args(const char*call_string,
 	    switch (ivl_expr_value(expr)) {
 		case IVL_VT_LOGIC:
 		case IVL_VT_BOOL:
+		  draw_eval_vec4(expr, 0);
 		  args[idx].vec_flag = 1;
-		  args[idx].vec = draw_eval_expr(expr, 0);
-		  snprintf(buffer, sizeof buffer,
-			   "T<%u,%u,%s>", args[idx].vec.base, args[idx].vec.wid,
-			   ivl_expr_signed(expr)? "s" : "u");
+		  args[idx].str_flag = 0;
+		  args[idx].real_flag = 0;
+		  args[idx].stack = vec4_stack_need;
+		  vec4_stack_need += 1;
+		  buffer[0] = 0;
 		  break;
 		case IVL_VT_REAL:
 		  draw_eval_real(expr);
 		  args[idx].vec_flag = 0;
-		  args[idx].vec.base = 0;
-		  args[idx].vec.wid  = 0;
 		  args[idx].str_flag = 0;
 		  args[idx].real_flag = 1;
 		  args[idx].stack = real_stack_need;
@@ -411,9 +420,8 @@ static void draw_vpi_taskfunc_args(const char*call_string,
 		       about the stack position. */
 		  draw_eval_string(expr);
 		  args[idx].vec_flag = 0;
-		  args[idx].vec.base = 0;
-		  args[idx].vec.wid = 0;
 		  args[idx].str_flag = 1;
+		  args[idx].real_flag = 0;
 		  args[idx].stack = str_stack_need;
 		  args[idx].real_flag = 0;
 		  str_stack_need += 1;
@@ -434,7 +442,7 @@ static void draw_vpi_taskfunc_args(const char*call_string,
 	    struct args_info*ptr;
 
 	    if (args[idx].str_flag) {
-		    /* If this is a string stack reference, then
+		    /* If this is a stack reference, then
 		       calculate the stack depth and use that to
 		       generate the completed string. */
 		  unsigned pos = str_stack_need - args[idx].stack - 1;
@@ -442,18 +450,14 @@ static void draw_vpi_taskfunc_args(const char*call_string,
 	    } else if (args[idx].real_flag) {
 		  unsigned pos = real_stack_need - args[idx].stack - 1;
 		  fprintf(vvp_out, ", W<%u,r>",pos);
+	    } else if (args[idx].vec_flag) {
+		  unsigned pos = vec4_stack_need - args[idx].stack - 1;
+		  fprintf(vvp_out, ", S<%u,vec4>",pos);
 	    } else {
 		  fprintf(vvp_out, ", %s", args[idx].text);
 	    }
 
 	    free(args[idx].text);
-	      /* Clear the nested children vectors. */
-	    for (ptr = &args[idx]; ptr != NULL; ptr = ptr->child) {
-		  if (ptr->vec_flag) {
-			if (ptr->vec.wid > 0) clr_vector(ptr->vec);
-			else clr_word(ptr->vec.base);
-		  }
-	    }
 	      /* Free the nested children. */
 	    ptr = args[idx].child;
 	    while (ptr != NULL) {
@@ -465,7 +469,7 @@ static void draw_vpi_taskfunc_args(const char*call_string,
 
       free(args);
 
-      fprintf(vvp_out, " {%u %u}", real_stack_need, str_stack_need);
+      fprintf(vvp_out, " {%u %u %u}", vec4_stack_need, real_stack_need, str_stack_need);
       fprintf(vvp_out, ";\n");
 }
 
@@ -487,7 +491,7 @@ void draw_vpi_task_call(ivl_statement_t tnet)
       }
 
       if (parm_count == 0) {
-            fprintf(vvp_out, "    %s %u %u \"%s\" {0 0};\n", command,
+            fprintf(vvp_out, "    %s %u %u \"%s\" {0 0 0};\n", command,
                     ivl_file_table_index(ivl_stmt_file(tnet)),
                     ivl_stmt_lineno(tnet), ivl_stmt_name(tnet));
       } else {
@@ -499,27 +503,16 @@ void draw_vpi_task_call(ivl_statement_t tnet)
       }
 }
 
-struct vector_info draw_vpi_func_call(ivl_expr_t fnet, unsigned wid)
+void draw_vpi_func_call(ivl_expr_t fnet)
 {
       char call_string[1024];
-      struct vector_info res;
 
-      res.base = allocate_vector(wid);
-      res.wid  = wid;
-      if (res.base == 0) {
-	    fprintf(stderr, "%s:%u: vvp.tgt error: "
-		    "Unable to allocate %u thread bits for system function result.\n",
-		    ivl_expr_file(fnet), ivl_expr_lineno(fnet), wid);
-	    vvp_errors += 1;
-      }
-
-      sprintf(call_string, "    %%vpi_func %u %u \"%s\", %u, %u",
+	/* The result width follows the name; bound the write to call_string. */
+      snprintf(call_string, sizeof call_string, "    %%vpi_func %u %u \"%s\" %u",
               ivl_file_table_index(ivl_expr_file(fnet)),
-	      ivl_expr_lineno(fnet), ivl_expr_name(fnet), res.base, res.wid);
+	      ivl_expr_lineno(fnet), ivl_expr_name(fnet), ivl_expr_width(fnet));
 
       draw_vpi_taskfunc_args(call_string, 0, fnet);
-
-      return res;
 }
 
 void draw_vpi_rfunc_call(ivl_expr_t fnet)
diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c
index a9d15198f..c6f50038d 100644
--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@@ -3123,6 +3123,7 @@ static struct vector_info draw_ternary_expr(ivl_expr_t expr, unsigned wid)
 
 static struct vector_info draw_sfunc_expr(ivl_expr_t expr, unsigned wid)
 {
+#if 0
       unsigned parm_count = ivl_expr_parms(expr);
       struct vector_info res;
 
@@ -3148,6 +3149,13 @@ static struct vector_info draw_sfunc_expr(ivl_expr_t expr, unsigned wid)
       clear_expression_lookaside();
 
       return res;
+#else
+      fprintf(stderr, "XXXX draw_sfunc_expr: Not implemented\n");
+      struct vector_info res;
+      res.base = 0;
+      res.wid = 0;
+      return res;
+#endif
 }
 
 static struct vector_info increment(ivl_expr_t e, unsigned wid, bool pre)
diff --git a/tgt-vvp/eval_vec4.c b/tgt-vvp/eval_vec4.c
index 634919489..fa83c88c5 100644
--- a/tgt-vvp/eval_vec4.c
+++ b/tgt-vvp/eval_vec4.c
@@ -29,8 +29,14 @@
 
 static void draw_binary_vec4_arith(ivl_expr_t expr, int stuff_ok_flag)
 {
-      draw_eval_vec4(ivl_expr_oper1(expr), stuff_ok_flag);
-      draw_eval_vec4(ivl_expr_oper2(expr), stuff_ok_flag);
+      ivl_expr_t le = ivl_expr_oper1(expr);
+      ivl_expr_t re = ivl_expr_oper2(expr);
+
+      int signed_flag = ivl_expr_signed(le) && ivl_expr_signed(re) ? 1 : 0;
+      const char*signed_string = signed_flag? "/s" : "";
+
+      draw_eval_vec4(le, stuff_ok_flag);
+      draw_eval_vec4(re, stuff_ok_flag);
 
       switch (ivl_expr_opcode(expr)) {
 	  case '+':
@@ -42,6 +48,21 @@ static void draw_binary_vec4_arith(ivl_expr_t expr, int stuff_ok_flag)
 	  case '*':
 	    fprintf(vvp_out, "    %%mul;\n");
 	    break;
+	  case '/':
+	    fprintf(vvp_out, "    %%div%s;\n", signed_string);
+	    break;
+	  case '%':
+	    fprintf(vvp_out, "    %%mod%s;\n", signed_string);
+	    break;
+	  case 'p':
+	      /* Note that the power operator is signed if EITHER of
+		 the operands is signed. This is different from other
+		 arithmetic operators. */
+	    if (ivl_expr_signed(le) || ivl_expr_signed(re))
+		  signed_string = "/s";
+	    fprintf(vvp_out, "    %%pow%s;\n", signed_string);
+	    break;
+
 	  default:
 	    assert(0);
 	    break;
@@ -282,6 +303,9 @@ static void draw_binary_vec4(ivl_expr_t expr, int stuff_ok_flag)
 	  case '+':
 	  case '-':
 	  case '*':
+	  case '/':
+	  case '%':
+	  case 'p': /* ** (power) */
 	    draw_binary_vec4_arith(expr, stuff_ok_flag);
 	    break;
 
@@ -355,9 +379,10 @@ static void draw_number_vec4(ivl_expr_t expr)
       const char*bits = ivl_expr_bits(expr);
 
       int idx;
+      int accum = 0;
+      int count_pushi = 0;
 
-      assert(wid <= 64);
-
+	/* Scan the literal bits, MSB first. */
       for (idx = 0 ; idx < wid ; idx += 1) {
 	    val0 <<= 1;
 	    valx <<= 1;
@@ -379,8 +404,27 @@ static void draw_number_vec4(ivl_expr_t expr)
 		  assert(0);
 		  break;
 	    }
+	    accum += 1;
+	    if (accum == 32) {
+		  fprintf(vvp_out, "    %%pushi/vec4 %lu, %lu, 32;\n", val0, valx);
+		  accum = 0;
+		  val0 = 0;
+		  valx = 0;
+		    /* If there is already at least 1 pushi, then
+		       concatenate this result to what we've done
+		       already. */
+		  if (count_pushi)
+			fprintf(vvp_out, "    %%concat/vec4;\n");
+		  count_pushi += 1;
+	    }
+      }
+
+      if (accum) {
+	    fprintf(vvp_out, "    %%pushi/vec4 %lu, %lu, %u;\n", val0, valx, accum);
+	    if (count_pushi)
+		  fprintf(vvp_out, "    %%concat/vec4;\n");
+	    count_pushi += 1;
       }
-      fprintf(vvp_out, "    %%pushi/vec4 %lu, %lu, %u;\n", val0, valx, wid);
 }
 
 static void draw_select_vec4(ivl_expr_t expr)
@@ -418,6 +462,26 @@ static void draw_select_pad_vec4(ivl_expr_t expr, int stuff_ok_flag)
 	    fprintf(vvp_out, "    %%pad/u %u;\n", wid);
 }
 
+static void draw_sfunc_vec4(ivl_expr_t expr, int stuff_ok_flag)
+{
+      unsigned parm_count = ivl_expr_parms(expr);
+
+	/* Draw a vec4-valued system function call. With no arguments
+	   the %vpi_func statement is drawn directly, with empty stacks. */
+      if (parm_count == 0) {
+	    assert(ivl_expr_value(expr)==IVL_VT_LOGIC
+		   || ivl_expr_value(expr)==IVL_VT_BOOL);
+
+	    fprintf(vvp_out, "    %%vpi_func %u %u \"%s\" %u {0 0 0};\n",
+		    ivl_file_table_index(ivl_expr_file(expr)),
+		    ivl_expr_lineno(expr), ivl_expr_name(expr),
+		    ivl_expr_width(expr));
+	    return;
+      }
+
+      draw_vpi_func_call(expr);
+}
+
 static void draw_signal_vec4(ivl_expr_t expr)
 {
       ivl_signal_t sig = ivl_expr_signal(expr);
@@ -479,6 +543,12 @@ static void draw_unary_vec4(ivl_expr_t expr, int stuff_ok_flag)
 	    draw_eval_vec4(sub, stuff_ok_flag);
 	    fprintf(vvp_out, "    %%inv;\n");
 	    break;
+
+	  case '!':
+	    draw_eval_vec4(sub, STUFF_OK_XZ);
+	    fprintf(vvp_out, "    %%nor/r;\n");
+	    break;
+
 	  default:
 	    fprintf(stderr, "XXXX Unary operator %c no implemented\n", ivl_expr_opcode(expr));
 	    break;
@@ -507,6 +577,10 @@ void draw_eval_vec4(ivl_expr_t expr, int stuff_ok_flag)
 		  draw_select_vec4(expr);
 	    return;
 
+	  case IVL_EX_SFUNC:
+	    draw_sfunc_vec4(expr, stuff_ok_flag);
+	    return;
+
 	  case IVL_EX_SIGNAL:
 	    draw_signal_vec4(expr);
 	    return;
diff --git a/tgt-vvp/vector.c b/tgt-vvp/vector.c
index 6dce423d0..feba2d85f 100644
--- a/tgt-vvp/vector.c
+++ b/tgt-vvp/vector.c
@@ -128,11 +128,16 @@ static unsigned allocate_vector_no_lookaside(unsigned wid, int skip_lookaside)
  */
 unsigned allocate_vector(unsigned wid)
 {
+#if 0
       unsigned base = allocate_vector_no_lookaside(wid, 1);
 
       if (base == 0)
 	    base = allocate_vector_no_lookaside(wid, 0);
       return base;
+#else
+      assert(0);
+      return 0;
+#endif
 }
 
 /*
diff --git a/tgt-vvp/vvp_priv.h b/tgt-vvp/vvp_priv.h
index 957288e16..d390b0b40 100644
--- a/tgt-vvp/vvp_priv.h
+++ b/tgt-vvp/vvp_priv.h
@@ -134,8 +134,7 @@ extern void cleanup_modpath(void);
  */
 extern void draw_vpi_task_call(ivl_statement_t net);
 
-extern struct vector_info draw_vpi_func_call(ivl_expr_t expr,
-					     unsigned wid);
+extern void draw_vpi_func_call(ivl_expr_t expr);
 extern void draw_vpi_rfunc_call(ivl_expr_t expr);
 
 extern void draw_class_in_scope(ivl_type_t classtype);
diff --git a/vvp/compile.cc b/vvp/compile.cc
index 0eee42f11..5f78294ec 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -199,12 +199,12 @@ static const struct opcode_table_s opcode_table[] = {
       { "%max/wr", of_MAX_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%min/wr", of_MIN_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%mod",    of_MOD,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%mod/s",  of_MOD_S,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%mod/s",  of_MOD_S,  0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%mod/wr", of_MOD_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%mov",    of_MOV,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%mov/wu", of_MOV_WU, 2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
       { "%movi",   of_MOVI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%mul",    of_MUL,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%mul",    of_MUL,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%mul/wr", of_MUL_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%muli",   of_MULI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%nand",   of_NAND,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
@@ -213,7 +213,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%new/darray",of_NEW_DARRAY,2, {OA_BIT1,   OA_STRING,OA_NONE} },
       { "%noop",   of_NOOP,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%nor",    of_NOR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%nor/r",  of_NORR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%nor/r",  of_NORR,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%null",   of_NULL,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%or",     of_OR,     0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%or/r",   of_ORR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
@@ -1831,7 +1831,7 @@ void compile_vpi_call(char*label, char*name,
                       bool func_as_task_err, bool func_as_task_warn,
                       long file_idx, long lineno,
                       unsigned argc, vpiHandle*argv,
-		      unsigned real_stack, unsigned string_stack)
+		      unsigned vec4_stack, unsigned real_stack, unsigned string_stack)
 {
       if (label)
 	    compile_codelabel(label);
@@ -1842,9 +1842,9 @@ void compile_vpi_call(char*label, char*name,
 
 	/* Create a vpiHandle that bundles the call information, and
 	   store that handle in the instruction. */
-      code->handle = vpip_build_vpi_call(name, 0, 0, 0,
-                                         func_as_task_err, func_as_task_warn,
-                                         argc, argv, real_stack, string_stack,
+      code->handle = vpip_build_vpi_call(name, 0, 0,
+					 0, func_as_task_err, func_as_task_warn,
+                                         argc, argv, vec4_stack, real_stack, string_stack,
 					 file_idx, lineno);
       if (code->handle == 0)
 	    compile_errors += 1;
@@ -1854,9 +1854,10 @@ void compile_vpi_call(char*label, char*name,
 }
 
 void compile_vpi_func_call(char*label, char*name,
-			   unsigned vbit, int vwid,
+			   int val_type, unsigned val_wid,
 			   long file_idx, long lineno,
 			   unsigned argc, vpiHandle*argv,
+			   unsigned vec4_stack,
 			   unsigned real_stack,
 			   unsigned string_stack)
 {
@@ -1869,8 +1870,9 @@ void compile_vpi_func_call(char*label, char*name,
 
 	/* Create a vpiHandle that bundles the call information, and
 	   store that handle in the instruction. */
-      code->handle = vpip_build_vpi_call(name, vbit, vwid, 0, true, false,
-                                         argc, argv, real_stack, string_stack,
+      code->handle = vpip_build_vpi_call(name, val_type, val_wid,
+					 0, true, false,
+                                         argc, argv, vec4_stack, real_stack, string_stack,
 					 file_idx, lineno);
       if (code->handle == 0)
 	    compile_errors += 1;
diff --git a/vvp/compile.h b/vvp/compile.h
index 6685cc2d1..c8af533df 100644
--- a/vvp/compile.h
+++ b/vvp/compile.h
@@ -422,6 +422,7 @@ extern void compile_vpi_call(char*label, char*name,
 			     bool func_as_task_err, bool func_as_task_warn,
 			     long file_idx, long lineno,
 			     unsigned argc, vpiHandle*argv,
+			     unsigned vec4_stack,
 			     unsigned real_stack,
 			     unsigned string_stack);
 
@@ -430,9 +431,10 @@ extern void compile_vpi_call(char*label, char*name,
    <0, the return type is -vpiRealConst or some other constant subtype
    code that represents the function type. */
 extern void compile_vpi_func_call(char*label, char*name,
-				  unsigned vbit, int vwid,
+				  int val_type, unsigned val_wid,
 				  long file_idx, long lineno,
 				  unsigned argc, vpiHandle*argv,
+				  unsigned vec4_stack,
 				  unsigned real_stack,
 				  unsigned string_stack);
 extern void print_vpi_call_errors();
diff --git a/vvp/lexor.lex b/vvp/lexor.lex
index a2d8cbf8f..c30570b99 100644
--- a/vvp/lexor.lex
+++ b/vvp/lexor.lex
@@ -269,6 +269,11 @@ static char* strdupnew(char const *str)
       assert(yylval.text);
       return T_SYMBOL; }
 
+"S<"[0-9]*",vec4>" {
+      yylval.text = strdup(yytext);
+      assert(yylval.text);
+      return T_SYMBOL; }
+
 "T<"[0-9]*","[0-9]*","[us]">" {
       yylval.text = strdup(yytext);
       assert(yylval.text);
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index 9d3d42150..56a29990f 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -808,13 +808,13 @@ This instruction pops the top two values from the real stack and
 pushes back the max(min) value. Avoid returning NaN by selecting the
 other if either is NaN.
 
-* %mod   <bit-l>, <bit-r>, <wid>
-* %mod/s <bit-l>, <bit-r>, <wid>
+* %mod
+* %mod/s
 
 This instruction calculates the modulus %r of the left operand, and
-replaces the left operand with the result. The <wid> gives the width
-of the left and the right vectors, and the left vector is completely
-replaced with the result.
+replaces the left operand with the result. The left and right vectors
+are popped from the vec4 stack and have identical width. The result is
+pushed onto the vec4 stack.
 
 The /s form does signed %.
 
@@ -836,12 +836,13 @@ The %movi variant moves a binary value, LSB first, into the
 destination vector. The immediate value is up to 32bits, padded with
 zeros to fill out the width.
 
-* %mul <bit-l>, <bit-r>, <wid>
+* %mul
 
 This instruction multiplies the left vector by the right vector, the
-vectors having the width <wid>. If any of the bits of either vector
-are x or z, the result is x. Otherwise, the result is the arithmetic
-product.
+vectors are popped from the vec4 stack and have the same width. If
+any of the bits of either vector are x or z, the result is
+x. Otherwise, the result is the arithmetic product. In any case, the
+result is pushed back on the vec4 stack.
 
 
 * %mul/wr
@@ -898,7 +899,7 @@ truth table:
 	otherwise  x
 
 
-* %nor/r <dst>, <src>, <wid>
+* %nor/r <dst>, <src>, <wid> (XXXX Old definition)
 
 The %nor/r instruction is a reduction nor. That is, the <src> is a
 vector with width, but the result is a single bit. The <src> vector is
@@ -909,6 +910,12 @@ it is valid to place the <dst> within the <src>.
 The actual operation performed is the inverted or of all the bits in
 the vector.
 
+* %nor/r
+
+The %nor/r instruction is a reduction nor. That is, a vec4 value is
+popped from the vec4 stack, the bits of the vector are or'ed together
+to a single bit, that bit is inverted and the resulting 1-bit vector
+pushed back to the vec4 stack. See also the "%or" instruction.
 
 * %null
 
@@ -1271,7 +1278,7 @@ if it is null. If it is, set flag bit 4 to 1. Otherwise, set flag bit
 This is intended to implement the SystemVerilog expression
 (<var>==null), where <var> is a class variable.
 
-* %vpi_call <name> [, ...] {<real> <str>}
+* %vpi_call <file> <line> <name> [, ...] {<vec4> <real> <str>}
 
 This instruction makes a call to a system task that was declared using
 VPI. The operands are compiled down to a vpiHandle for the call. The
@@ -1282,18 +1289,19 @@ The {...} part is stack information. This tells the run-time how many
 stack items the call uses so that it knows how many to pop off the
 stack when the call returns.
 
-* %vpi_func <name>, <dst>, <wid> [, ...] {<real> <str>}
+* %vpi_func <file> <line> <name> <wid> [, ...] {<vec4> <real> <str>}
+* %vpi_func/r <file> <line> <name> [, ...] {<vec4> <real> <str>}
 
 This instruction is similar to %vpi_call, except that it is for
-calling system functions. The difference here is the <dst> and <wid>
-parameters that specify where the return value is to go. The normal
+calling system functions. The difference here is the return value from
+the function call is pushed onto the appropriate stack. The normal
 means that the VPI code uses to write the return value causes those
 bits to go here.
 
 The {...} part is stack information. This tells the run-time how many
-stack items the call uses so that it knows how many to pop off the
-stack when the call returns. The function call will pop the real and
-string stacks, and will push any return value.
+stack items the call uses from each stack so that it knows how many to
+pop off the stack when the call returns. The function call will pop
+the vec4, real and string stacks, and will push any return value.
 
 
 * %wait <functor-label>
diff --git a/vvp/parse.y b/vvp/parse.y
index 8fff1c223..3be51483e 100644
--- a/vvp/parse.y
+++ b/vvp/parse.y
@@ -593,33 +593,33 @@ statement
 
   /* This version does not allow a function to be called as a task. */
   | label_opt K_vpi_call T_NUMBER T_NUMBER T_STRING
-    argument_opt '{' T_NUMBER T_NUMBER '}' ';'
+    argument_opt '{' T_NUMBER T_NUMBER T_NUMBER '}' ';'
       { compile_vpi_call($1, $5, true, false, $3, $4,
-			 $6.argc, $6.argv, $8, $9); }
+			 $6.argc, $6.argv, $8, $9, $10); }
 
   /* This version allows a function to be called as a task, but prints a
    * warning message. */
   | label_opt K_vpi_call_w T_NUMBER T_NUMBER T_STRING
-    argument_opt '{' T_NUMBER T_NUMBER '}' ';'
+    argument_opt '{' T_NUMBER T_NUMBER T_NUMBER '}' ';'
       { compile_vpi_call($1, $5, false, true, $3, $4,
-			 $6.argc, $6.argv, $8, $9); }
+			 $6.argc, $6.argv, $8, $9, $10); }
 
   /* This version allows a function to be called as a task and does not
    * print a message. */
   | label_opt K_vpi_call_i T_NUMBER T_NUMBER T_STRING
-    argument_opt '{' T_NUMBER T_NUMBER '}' ';'
+    argument_opt '{' T_NUMBER T_NUMBER T_NUMBER '}' ';'
       { compile_vpi_call($1, $5, false, false, $3, $4,
-			 $6.argc, $6.argv, $8, $9); }
+			 $6.argc, $6.argv, $8, $9, $10); }
 
-  | label_opt K_vpi_func T_NUMBER T_NUMBER T_STRING ','
-    T_NUMBER ',' T_NUMBER argument_opt  '{' T_NUMBER T_NUMBER '}' ';'
-      { compile_vpi_func_call($1, $5, $7, $9, $3, $4,
-			      $10.argc, $10.argv, $12, $13); }
+  | label_opt K_vpi_func T_NUMBER T_NUMBER T_STRING T_NUMBER
+    argument_opt  '{' T_NUMBER T_NUMBER T_NUMBER '}' ';'
+      { compile_vpi_func_call($1, $5, -vpiVectorVal, $6, $3, $4,
+			      $7.argc, $7.argv, $9, $10, $11); }
 
   | label_opt K_vpi_func_r T_NUMBER T_NUMBER T_STRING
-    argument_opt '{' T_NUMBER T_NUMBER '}' ';'
-      { compile_vpi_func_call($1, $5, 0, -vpiRealConst, $3, $4,
-			      $6.argc, $6.argv, $8, $9); }
+    argument_opt '{' T_NUMBER T_NUMBER T_NUMBER '}' ';'
+      { compile_vpi_func_call($1, $5, -vpiRealVal, 0, $3, $4,
+			      $6.argc, $6.argv, $8, $9, $10); }
 
   /* %disable statements are instructions that takes a scope reference
      as an operand. It therefore is parsed uniquely. */
diff --git a/vvp/sfunc.cc b/vvp/sfunc.cc
index 822083371..2fb678cf1 100644
--- a/vvp/sfunc.cc
+++ b/vvp/sfunc.cc
@@ -147,17 +147,27 @@ void compile_sfunc(char*label, char*name,  char*format_string,
 		   unsigned argc, struct symb_s*argv,
                    char*trigger_label)
 {
+      unsigned vec4_stack = 0;
       unsigned real_stack = 0;
       unsigned string_stack = 0;
       vpiHandle*vpi_argv = new vpiHandle[argc];
-      int width_code = make_vpi_argv(argc, vpi_argv, format_string);
+      int val_code = make_vpi_argv(argc, vpi_argv, format_string);
+      unsigned val_width = 0;
       delete[] format_string;
 
+	// The make_vpi_argv returns for the function return value a
+	// >0 value for the vector width if this is a vector. Convert
+	// it to the form that the vpip_build_vpi_call uses.
+      if (val_code > 0) {
+	    val_width = val_code;
+	    val_code = -vpiVectorVal;
+      }
+
       vvp_net_t*ptr = new vvp_net_t;
 
-      vpiHandle sys = vpip_build_vpi_call(name, 0, width_code, ptr,
+      vpiHandle sys = vpip_build_vpi_call(name, val_code, val_width, ptr,
                                           true, false, argc, vpi_argv,
-					  real_stack, string_stack,
+					  vec4_stack, real_stack, string_stack,
                                           file_idx, lineno);
       assert(sys);
 
diff --git a/vvp/stop.cc b/vvp/stop.cc
index 4f1e67503..767a000c0 100644
--- a/vvp/stop.cc
+++ b/vvp/stop.cc
@@ -174,7 +174,7 @@ static void cmd_call(unsigned argc, char*argv[])
 	    vpiHandle call_handle = vpip_build_vpi_call(argv[0], 0, 0, 0,
 	                                                true, false,
 	                                                vpi_argc, vpi_argv,
-							0, 0,
+							0, 0, 0,
 	                                                1, 0);
 	    if (call_handle == 0)
 		  goto out;
diff --git a/vvp/vpi_priv.h b/vvp/vpi_priv.h
index 0f6c0b693..7c1f66c53 100644
--- a/vvp/vpi_priv.h
+++ b/vvp/vpi_priv.h
@@ -574,6 +574,7 @@ struct __vpiSysTaskCall : public __vpiHandle {
       unsigned nargs;
       vpiHandle*args;
 	/* Stack consumed by this call */
+      unsigned vec4_stack;
       unsigned real_stack;
       unsigned string_stack;
 	/* Support for vpi_get_userdata. */
@@ -690,20 +691,22 @@ extern unsigned vpip_module_path_cnt;
  * call. However, the vpiSysTaskCall that is the returned handle,
  * holds a parameter argument list that is passed in here.
  *
- * The vbit and vwid fields are used if this turns out to be a system
- * function. In that case, the vbit and vwid are used to address the
- * vector in thread bit space where the result is supposed to go.
+ * The val_type and return_width fields are used if this turns out to
+ * be a system function. In that case, the val_type encodes the return
+ * type (-vpiRealVal, -vpiVectorVal) and if a vector the return_width
+ * has the vector width.
  *
  * Note that the argv array is saved in the handle, and should should
  * not be released by the caller.
  */
 extern vpiHandle vpip_build_vpi_call(const char*name,
-				     unsigned vbit, int vwid,
+				     int val_type, unsigned return_width,
 				     class vvp_net_t*fnet,
 				     bool func_as_task_err,
 				     bool func_as_task_warn,
 				     unsigned argc,
 				     vpiHandle*argv,
+				     unsigned vec4_stack,
 				     unsigned real_stack,
 				     unsigned string_stack,
 				     long file_idx,
diff --git a/vvp/vpi_tasks.cc b/vvp/vpi_tasks.cc
index 90310b38b..7d6efbb48 100644
--- a/vvp/vpi_tasks.cc
+++ b/vvp/vpi_tasks.cc
@@ -461,6 +461,54 @@ vpiHandle sysfunc_real::vpi_put_value(p_vpi_value vp, int)
       return 0;
 }
 
+class sysfunc_vec4 : public __vpiSysTaskCall {
+    public:
+      inline sysfunc_vec4(unsigned wid): return_value_(wid, BIT4_X) { }
+      int get_type_code(void) const { return vpiSysFuncCall; }
+      int vpi_get(int code)         { return sysfunc_get(code, this); }
+      char* vpi_get_str(int code)   { return systask_get_str(code, this); }
+      vpiHandle vpi_put_value(p_vpi_value val, int flags);
+      vpiHandle vpi_handle(int code)
+            { return systask_handle(code, this); }
+      vpiHandle vpi_iterate(int code)
+            { return systask_iter(code, this); }
+	// The vector that is pushed to the vec4 stack after the call.
+      inline const vvp_vector4_t& return_value() const { return return_value_; }
+	// Format specific helper used by vpi_put_value (vpiIntVal).
+    private:
+      vpiHandle put_value_int_(p_vpi_value vp);
+	// Function result; constructed from the width and BIT4_X.
+    private:
+      vvp_vector4_t return_value_;
+};
+
+vpiHandle sysfunc_vec4::put_value_int_(p_vpi_value vp)
+{
+	// Spread the integer over the return vector, LSB first.
+      const unsigned nbits = return_value_.size();
+      long bits = vp->value.integer;
+      for (unsigned pos = 0 ; pos < nbits ; pos += 1, bits >>= 1) {
+	    return_value_.set_bit(pos, (bits & 1) ? BIT4_1 : BIT4_0);
+      }
+
+      return 0;
+}
+
+vpiHandle sysfunc_vec4::vpi_put_value(p_vpi_value vp, int)
+{
+	// Note that the function did supply its return value.
+      put_value = true;
+
+	// Only the integer format is handled here.
+      if (vp->format == vpiIntVal)
+	    return put_value_int_(vp);
+
+      fprintf(stderr, "Unsupported format %d.\n", (int)vp->format);
+      assert(0);
+
+      return 0;
+}
+
 struct sysfunc_4net : public __vpiSysTaskCall {
       inline sysfunc_4net() { }
       int get_type_code(void) const { return vpiSysFuncCall; }
@@ -729,11 +777,11 @@ static void cleanup_vpi_call_args(unsigned argc, vpiHandle*argv)
  * non-zero value that represents the width or type of the result. The
  * vbit is also a non-zero value, the address in thread space of the result.
  */
-vpiHandle vpip_build_vpi_call(const char*name, unsigned vbit, int vwid,
+vpiHandle vpip_build_vpi_call(const char*name, int val_code, unsigned return_width,
 			      vvp_net_t*fnet,
 			      bool func_as_task_err, bool func_as_task_warn,
 			      unsigned argc, vpiHandle*argv,
-			      unsigned real_stack, unsigned string_stack,
+			      unsigned vec4_stack, unsigned real_stack, unsigned string_stack,
 			      long file_idx, long lineno)
 {
       assert(!(func_as_task_err && func_as_task_warn));
@@ -749,7 +797,7 @@ vpiHandle vpip_build_vpi_call(const char*name, unsigned vbit, int vwid,
 
       switch (defn->info.type) {
 	  case vpiSysTask:
-	    if (vwid != 0 || fnet != 0) {
+	    if (val_code != 0 || fnet != 0) {
 		  add_vpi_call_error(VPI_CALL_TASK_AS_FUNC, name, file_idx,
 		                     lineno);
 #ifdef CHECK_WITH_VALGRIND
@@ -757,11 +805,10 @@ vpiHandle vpip_build_vpi_call(const char*name, unsigned vbit, int vwid,
 #endif
 		  return 0;
 	    }
-	    assert(vbit == 0);
 	    break;
 
 	  case vpiSysFunc:
-	    if (vwid == 0 && fnet == 0) {
+	    if (val_code == 0 && fnet == 0) {
 		  if (func_as_task_err) {
 			add_vpi_call_error(VPI_CALL_FUNC_AS_TASK,
 			                   name, file_idx, lineno);
@@ -790,22 +837,26 @@ vpiHandle vpip_build_vpi_call(const char*name, unsigned vbit, int vwid,
 	    break;
 
 	  case vpiSysFunc:
-	    if (fnet && vwid == -vpiRealConst) {
+	    if (fnet && val_code == -vpiRealVal) {
 		  obj = new sysfunc_rnet;
 
-	    } else if (fnet && vwid > 0) {
+	    } else if (fnet && val_code > 0) { // XXXX What's this?
 		  obj = new sysfunc_4net;
 
-	    } else if (vwid == -vpiRealConst) {
+	    } else if (val_code == -vpiRealVal) {
 		  obj = new sysfunc_real;
 
-	    } else if (vwid > 0) {
+	    } else if (val_code == -vpiVectorVal) {
+		  obj = new sysfunc_vec4(return_width);
+
+	    } else if (val_code > 0) { // XXXX should not happen?
 		  obj = new sysfunc_def;
 
-           } else if (vwid == 0 && fnet == 0) {
+           } else if (val_code == 0 && fnet == 0) {
 		  obj = new sysfunc_no;
 
 	    } else {
+		  fprintf(stderr, "XXXX fnet=%p, val_code=%d\n", fnet, val_code);
 		  assert(0);
 	    }
 	    break;
@@ -815,10 +866,11 @@ vpiHandle vpip_build_vpi_call(const char*name, unsigned vbit, int vwid,
       obj->defn  = defn;
       obj->nargs = argc;
       obj->args  = argv;
+      obj->vec4_stack = vec4_stack;
       obj->real_stack = real_stack;
       obj->string_stack = string_stack;
-      obj->vbit  = vbit;
-      obj->vwid  = vwid;
+      obj->vbit  = 0;
+      obj->vwid  = 0;
       obj->fnet  = fnet;
       obj->file_idx  = (unsigned) file_idx;
       obj->lineno   = (unsigned) lineno;
@@ -902,7 +954,7 @@ void vpip_execute_vpi_call(vthread_t thr, vpiHandle ref)
 	    if (ref->get_type_code() == vpiSysFuncCall &&
 	        !vpip_cur_task->put_value) {
 		  s_vpi_value val;
-		  if (vpip_cur_task->vwid == -vpiRealConst) {
+		  if (vpip_cur_task->vwid == -vpiRealVal) {
 			val.format = vpiRealVal;
 			val.value.real = 0.0;
 		  } else {
@@ -912,6 +964,8 @@ void vpip_execute_vpi_call(vthread_t thr, vpiHandle ref)
 		  vpi_put_value(ref, &val, 0, vpiNoDelay);
 	    }
       }
+      if (vpip_cur_task->vec4_stack > 0)
+	    vthread_pop_vec4(thr, vpip_cur_task->vec4_stack);
       if (vpip_cur_task->real_stack > 0)
 	    vthread_pop_real(thr, vpip_cur_task->real_stack);
       if (vpip_cur_task->string_stack > 0)
@@ -922,6 +976,9 @@ void vpip_execute_vpi_call(vthread_t thr, vpiHandle ref)
       if (sysfunc_real*func_real = dynamic_cast<sysfunc_real*>(ref)) {
 	    vthread_push_real(thr, func_real->return_value_);
       }
+      if (sysfunc_vec4*func_vec4 = dynamic_cast<sysfunc_vec4*>(ref)) {
+	    vthread_push_vec4(thr, func_vec4->return_value());
+      }
 }
 
 /*
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index b44424210..7c0d0baa3 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -344,11 +344,21 @@ void vthread_put_bit(struct vthread_s*thr, unsigned addr, vvp_bit4_t bit)
 #endif
 }
 
+void vthread_push_vec4(struct vthread_s*thr, const vvp_vector4_t&val)
+{
+      thr->push_vec4(val);
+}
+
 void vthread_push_real(struct vthread_s*thr, double val)
 {
       thr->push_real(val);
 }
 
+void vthread_pop_vec4(struct vthread_s*thr, unsigned depth)
+{
+      thr->pop_vec4(depth);
+}
+
 void vthread_pop_real(struct vthread_s*thr, unsigned depth)
 {
       thr->pop_real(depth);
@@ -3988,12 +3998,13 @@ bool of_LOAD_X1P(vthread_t thr, vvp_code_t cp)
 #endif
       return true;
 }
-#if 0
-static void do_verylong_mod(vthread_t thr, vvp_code_t cp,
+
+static void do_verylong_mod(vthread_t thr,
+			    const vvp_vector4_t&vala, const vvp_vector4_t&valb,
 			    bool left_is_neg, bool right_is_neg)
 {
       bool out_is_neg = left_is_neg;
-      int len=cp->number;
+      const int len=vala.size();
       unsigned char *a, *z, *t;
       a = new unsigned char[len+1];
       z = new unsigned char[len+1];
@@ -4006,20 +4017,19 @@ static void do_verylong_mod(vthread_t thr, vvp_code_t cp,
       int i;
       int current, copylen;
 
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-
       unsigned lb_carry = left_is_neg? 1 : 0;
       unsigned rb_carry = right_is_neg? 1 : 0;
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    unsigned lb = thr_get_bit(thr, idx1);
-	    unsigned rb = thr_get_bit(thr, idx2);
+      for (int idx = 0 ;  idx < len ;  idx += 1) {
+	    unsigned lb = vala.value(idx);
+	    unsigned rb = valb.value(idx);
 
 	    if ((lb | rb) & 2) {
 		  delete []t;
 		  delete []z;
 		  delete []a;
-		  goto x_out;
+		  vvp_vector4_t tmp(len, BIT4_X);
+		  thr->push_vec4(tmp);
+		  return;
 	    }
 
 	    if (left_is_neg) {
@@ -4035,10 +4045,6 @@ static void do_verylong_mod(vthread_t thr, vvp_code_t cp,
 
 	    z[idx]=lb;
 	    a[idx]=1-rb;	// for 2s complement add..
-
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
       }
 
       z[len]=0;
@@ -4063,7 +4069,9 @@ static void do_verylong_mod(vthread_t thr, vvp_code_t cp,
 		  delete []t;
 		  delete []z;
 		  delete []a;
-		  goto x_out;
+		  vvp_vector4_t tmpx (len, BIT4_X);
+		  thr->push_vec4(tmpx);
+		  return;
 	    }
 
 	    goto tally;
@@ -4091,29 +4099,23 @@ static void do_verylong_mod(vthread_t thr, vvp_code_t cp,
 
  tally:
 
+      vvp_vector4_t tmp (len, BIT4_X);
       carry = out_is_neg? 1 : 0;
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
+      for (unsigned idx = 0 ;  idx < len ;  idx += 1) {
 	    unsigned ob = z[idx];
 	    if (out_is_neg) {
 		  ob = (1-ob) + carry;
 		  carry = (ob & ~1)? 1 : 0;
 		  ob = ob & 1;
 	    }
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, ob?BIT4_1:BIT4_0);
+	    tmp.set_bit(idx, ob?BIT4_1:BIT4_0);
       }
-
+      thr->push_vec4(tmp);
       delete []t;
       delete []z;
       delete []a;
-      return;
-
- x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
-
-      return;
 }
-#endif
+
 bool of_MAX_WR(vthread_t thr, vvp_code_t)
 {
       double r = thr->pop_real();
@@ -4195,69 +4197,65 @@ bool of_MOD(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
-bool of_MOD_S(vthread_t thr, vvp_code_t cp)
+/*
+ * %mod/s -- pop right then left vec4 operand; push signed left%right, or all X.
+ */
+bool of_MOD_S(vthread_t thr, vvp_code_t)
 {
-#if 0
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t valb = thr->pop_vec4();
+      vvp_vector4_t vala = thr->pop_vec4();
+
+      assert(vala.size()==valb.size());
+      unsigned wid = vala.size();
 
 	/* Handle the case that we can fit the bits into a long-long
-	   variable. We cause use native % to do the work. */
+	   variable. We can use native % to do the work. */
-      if(cp->number <= 8*sizeof(long long)) {
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned idx2 = cp->bit_idx[1];
+      if(wid <= 8*sizeof(long long)) {
 	    long long lv = 0, rv = 0;
 
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  long long lb = thr_get_bit(thr, idx1);
-		  long long rb = thr_get_bit(thr, idx2);
+	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
+		  long long lb = vala.value(idx);
+		  long long rb = valb.value(idx);
 
 		  if ((lb | rb) & 2)
 			goto x_out;
 
 		  lv |= (long long) lb << idx;
 		  rv |= (long long) rb << idx;
-
-		  idx1 += 1;
-		  if (idx2 >= 4)
-			idx2 += 1;
 	    }
 
 	    if (rv == 0)
 		  goto x_out;
 
 	      /* Sign extend the signed operands when needed. */
-	    if (cp->number < 8*sizeof(long long)) {
-		  if (lv & (1LL << (cp->number-1)))
-			lv |= -1LL << cp->number;
-		  if (rv & (1LL << (cp->number-1)))
-			rv |= -1LL << cp->number;
+	    if (wid < 8*sizeof(long long)) {
+		  if (lv & (1LL << (wid-1)))
+			lv |= -1LL << wid;
+		  if (rv & (1LL << (wid-1)))
+			rv |= -1LL << wid;
 	    }
 
-	    lv %= rv;
+	    lv = (rv == -1)? 0 : lv % rv; /* x % -1 is 0; LLONG_MIN % -1 would overflow */
 
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)?BIT4_1:BIT4_0);
+	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
+		  vala.set_bit(idx, (lv&1)? BIT4_1 : BIT4_0);
 		  lv >>= 1;
 	    }
+	    thr->push_vec4(vala);
 
 	    return true;
 
       } else {
 
-	    bool left_is_neg
-		  = thr_get_bit(thr,cp->bit_idx[0]+cp->number-1) == 1;
-	    bool right_is_neg
-		  = thr_get_bit(thr,cp->bit_idx[1]+cp->number-1) == 1;
-	    do_verylong_mod(thr, cp, left_is_neg, right_is_neg);
+	    bool left_is_neg  = vala.value(vala.size()-1) == BIT4_1;
+	    bool right_is_neg = valb.value(valb.size()-1) == BIT4_1;
+	    do_verylong_mod(thr, vala, valb, left_is_neg, right_is_neg);
 	    return true;
       }
 
  x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%mod/s ...\n");
-#endif
+      vvp_vector4_t tmp (wid, BIT4_X);
+      thr->push_vec4(tmp);
       return true;
 }
 
@@ -4459,34 +4457,36 @@ bool of_MOVI(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/*
+ * %mul
+ */
 bool of_MUL(vthread_t thr, vvp_code_t cp)
 {
-#if 0
-      unsigned adra = cp->bit_idx[0];
-      unsigned adrb = cp->bit_idx[1];
-      unsigned wid = cp->number;
+      vvp_vector4_t vala = thr->pop_vec4();
+      vvp_vector4_t valb = thr->pop_vec4();
+      assert(vala.size() == valb.size());
+      unsigned wid = vala.size();
 
-      assert(adra >= 4);
-
-      unsigned long*ap = vector_to_array(thr, adra, wid);
+      unsigned long*ap = vala.subarray(0, wid);
       if (ap == 0) {
 	    vvp_vector4_t tmp(wid, BIT4_X);
-	    thr->bits4.set_vec(adra, tmp);
+	    thr->push_vec4(tmp);
 	    return true;
       }
 
-      unsigned long*bp = vector_to_array(thr, adrb, wid);
+      unsigned long*bp = valb.subarray(0, wid);
       if (bp == 0) {
 	    delete[]ap;
 	    vvp_vector4_t tmp(wid, BIT4_X);
-	    thr->bits4.set_vec(adra, tmp);
+	    thr->push_vec4(tmp);
 	    return true;
       }
 
 	// If the value fits in a single CPU word, then do it the easy way.
       if (wid <= CPU_WORD_BITS) {
 	    ap[0] *= bp[0];
-	    thr->bits4.setarray(adra, wid, ap);
+	    vala.setarray(0, wid, ap);
+	    thr->push_vec4(vala);
 	    delete[]ap;
 	    delete[]bp;
 	    return true;
@@ -4511,13 +4511,12 @@ bool of_MUL(vthread_t thr, vvp_code_t cp)
 	    }
       }
 
-      thr->bits4.setarray(adra, wid, res);
+      vala.setarray(0, wid, res);
+      thr->push_vec4(vala);
       delete[]ap;
       delete[]bp;
       delete[]res;
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%mul ...\n");
-#endif
+
       return true;
 }
 
@@ -4673,17 +4672,18 @@ bool of_NOOP(vthread_t, vvp_code_t)
       return true;
 }
 
-bool of_NORR(vthread_t thr, vvp_code_t cp)
+/*
+ * %nor/r
+ */
+bool of_NORR(vthread_t thr, vvp_code_t)
 {
-#if 0
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t val = thr->pop_vec4();
 
       vvp_bit4_t lb = BIT4_1;
-      unsigned idx2 = cp->bit_idx[1];
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
+      for (unsigned idx = 0 ;  idx < val.size() ;  idx += 1) {
 
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2+idx);
+	    vvp_bit4_t rb = val.value(idx);
 	    if (rb == BIT4_1) {
 		  lb = BIT4_0;
 		  break;
@@ -4693,10 +4693,9 @@ bool of_NORR(vthread_t thr, vvp_code_t cp)
 		  lb = BIT4_X;
       }
 
-      thr_put_bit(thr, cp->bit_idx[0], lb);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%nor/r ...\n");
-#endif
+      vvp_vector4_t res (1, lb);
+      thr->push_vec4(res);
+
       return true;
 }
 
diff --git a/vvp/vthread.h b/vvp/vthread.h
index 196b78c38..c31f7f721 100644
--- a/vvp/vthread.h
+++ b/vvp/vthread.h
@@ -116,8 +116,10 @@ extern vvp_context_item_t vthread_get_rd_context_item(unsigned context_idx);
 extern vvp_bit4_t vthread_get_bit(struct vthread_s*thr, unsigned addr);
 extern void vthread_put_bit(struct vthread_s*thr, unsigned addr, vvp_bit4_t bit);
 
+extern void vthread_push_vec4(struct vthread_s*thr, const vvp_vector4_t&val);
 extern void vthread_push_real(struct vthread_s*thr, double val);
 
+extern void vthread_pop_vec4(struct vthread_s*thr, unsigned count);
 extern void vthread_pop_str(struct vthread_s*thr, unsigned count);
 extern void vthread_pop_real(struct vthread_s*thr, unsigned count);