diff --git a/Makefile.in b/Makefile.in
index 3df9bb21b..6bde10e07 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -188,7 +188,13 @@ iverilog-vpi.pdf: iverilog-vpi.ps
 .PHONY: version.h
 version.h:
 ifeq ($(GIT),none)
-	@echo '#define VERSION_TAG ""' > $@;
+	@if test -r $(srcdir)/$@; then \
+	    echo "Using $(srcdir)/$@ for VERSION_TAG"; \
+	    diff $(srcdir)/$@ $@ > /dev/null 2>&1 || cp $(srcdir)/$@ $@; \
+	else \
+	    echo "Using empty VERSION_TAG"; \
+	    echo '#define VERSION_TAG ""' > $@; \
+	fi
 else
 	@if test -d $(srcdir)/.git; then \
 	    echo "Using git-describe for VERSION_TAG"; \
diff --git a/elab_expr.cc b/elab_expr.cc
index 90c1937fc..d452b31e4 100644
--- a/elab_expr.cc
+++ b/elab_expr.cc
@@ -716,6 +716,15 @@ NetExpr* PEConcat::elaborate_expr(Design*des, NetScope*scope,
 		  des->errors += 1;
 	    }
 
+	    if (!rep->value().is_defined()) {
+		  cerr << get_fileline() << ": error: Concatenation repeat "
+		       << "may not be undefined (" << rep->value()
+		       << ")." << endl;
+		  des->errors += 1;
+		  concat_depth -= 1;
+		  return 0;
+	    }
+
 	    if (rep->value().is_negative()) {
 		  cerr << get_fileline() << ": error: Concatenation repeat "
 		       << "may not be negative (" << rep->value().as_long()
diff --git a/elab_net.cc b/elab_net.cc
index 97fcb76cd..2e9224483 100644
--- a/elab_net.cc
+++ b/elab_net.cc
@@ -1578,6 +1578,14 @@ NetNet* PEConcat::elaborate_net(Design*des, NetScope*scope,
 		  return 0;
 	    }
 
+	    if (!erep->value().is_defined()) {
+		  cerr << get_fileline() << ": error: Concatenation repeat "
+		       << "may not be undefined (" << erep->value()
+		       << ")." << endl;
+		  des->errors += 1;
+		  return 0;
+	    }
+
 	    if (erep->value().is_negative()) {
 		  cerr << get_fileline() << ": error: Concatenation repeat "
 		       << "may not be negative (" << erep->value().as_long()
diff --git a/ivl.def b/ivl.def
index 7d0751006..17dcc0c49 100644
--- a/ivl.def
+++ b/ivl.def
@@ -226,8 +226,6 @@ ivl_switch_file
 ivl_switch_lineno
 ivl_switch_scope
 ivl_switch_type
-ivl_switch_attr_cnt
-ivl_switch_attr_val
 
 ivl_udp_init
 ivl_udp_name
diff --git a/ivl_target.h b/ivl_target.h
index 7f09116ea..715fcd4c8 100644
--- a/ivl_target.h
+++ b/ivl_target.h
@@ -1854,9 +1854,6 @@ extern ivl_statement_t ivl_stmt_sub_stmt(ivl_statement_t net);
  * ivl_switch_basename
  *    This is the name given to the device in the source code.
  *
- * ivl_switch_scope
- *    The scope where the switch device appears.
- *
  * ivl_switch_a
  * ivl_switch_b
  *    The a and b ports are the two ports of the switch.
@@ -1871,17 +1868,17 @@ extern ivl_statement_t ivl_stmt_sub_stmt(ivl_statement_t net);
  */
 extern ivl_switch_type_t ivl_switch_type(ivl_switch_t net);
 extern const char*ivl_switch_basename(ivl_switch_t net);
-extern ivl_scope_t ivl_switch_scope(ivl_switch_t net);
 extern ivl_nexus_t ivl_switch_a(ivl_switch_t net);
 extern ivl_nexus_t ivl_switch_b(ivl_switch_t net);
 extern ivl_nexus_t ivl_switch_enable(ivl_switch_t net);
 
+/* Not implemented yet
 extern unsigned        ivl_switch_attr_cnt(ivl_switch_t net);
 extern ivl_attribute_t ivl_switch_attr_val(ivl_switch_t net, unsigned idx);
 
 extern const char* ivl_switch_file(ivl_switch_t net);
 extern unsigned ivl_switch_lineno(ivl_switch_t net);
-
+*** */
 #if defined(__MINGW32__) || defined (__CYGWIN32__)
 #  define DLLEXPORT __declspec(dllexport)
 #else
diff --git a/tgt-vvp/draw_vpi.c b/tgt-vvp/draw_vpi.c
index ab290867c..2930c6a21 100644
--- a/tgt-vvp/draw_vpi.c
+++ b/tgt-vvp/draw_vpi.c
@@ -29,6 +29,82 @@
 #define snprintf _snprintf
 #endif
 
+/*
+ * Return 1 if ex is an IVL_EX_NUMBER or IVL_EX_ULONG expression whose
+ * upper bits allow it to be represented by a long variable, else 0.
+ */
+static int is_constant_number(ivl_expr_t ex)
+{
+	/* Make sure this matches the return type of get_constant_number(). */
+      unsigned lim_wid = 8*sizeof(long);
+      const char*bits;
+      char pad_bit = '0';
+      unsigned idx;
+      unsigned nbits = ivl_expr_width(ex);
+
+      if (ivl_expr_type(ex) != IVL_EX_NUMBER
+          && ivl_expr_type(ex) != IVL_EX_ULONG)
+            return 0;
+
+      bits = ivl_expr_bits(ex);
+
+	/* For unsigned values the long's sign bit and above must be '0'. */
+      if (!ivl_expr_signed(ex)) lim_wid -= 1;
+
+	/* For negative (signed, MSB set) values the pad bit is '1'. */
+      if (ivl_expr_signed(ex) && bits[nbits-1]=='1') {
+            pad_bit = '1';
+      }
+
+	/* For the number to fit in the variable all the bits from lim_wid
+	 * upward must match the pad bit; lower bits are not inspected. */
+      for (idx = lim_wid ;  idx < nbits ;  idx += 1) {
+            if (bits[idx] != pad_bit) return 0;
+      }
+
+      return 1;
+}
+
+/*
+ * Convert a constant expression (IVL_EX_ULONG or IVL_EX_NUMBER) to a long.
+ */
+static long get_constant_number(ivl_expr_t ex)
+{
+      long rtn = 0;
+
+      switch (ivl_expr_type(ex)) {
+	  case IVL_EX_ULONG:
+	    rtn = (signed)ivl_expr_value(ex);
+	    break;
+	  case IVL_EX_NUMBER: {
+	    unsigned idx;
+	    const char*bits = ivl_expr_bits(ex);
+	    unsigned nbits = ivl_expr_width(ex);
+	    char pad_bit = bits[nbits-1]; /* NOTE(review): MSB pads even if ex is unsigned -- confirm */
+	      /* Fill every bit of the long, padding above nbits with the MSB. */
+	    for (idx = 0 ;  idx < 8*sizeof(long) ;  idx += 1) {
+		  char bit;
+		  if (idx < nbits) bit = bits[idx];
+		  else bit = pad_bit;
+		  switch (bit) {
+		      case '0':
+			break;
+		      case '1':
+			rtn |= (long)(1UL << idx); /* shift in unsigned long: idx may exceed the width of int */
+			break;
+		      default: /* x/z bits are not expected in a constant index */
+			assert(0);
+		  }
+	    }
+	    break;
+	  }
+	  default: /* callers must filter with is_constant_number() first */
+	    assert(0);
+      }
+
+      return rtn;
+}
+
 static const char* magic_sfuncs[] = {
       "$time",
       "$stime",
@@ -217,6 +293,39 @@ static void draw_vpi_taskfunc_args(const char*call_string,
 			continue;
 		  }
 
+		case IVL_EX_SELECT: {
+		  ivl_expr_t vexpr = ivl_expr_oper1(expr);
+                  assert(vexpr);
+
+		    /* This code is only for signals. */
+		  if (ivl_expr_type(vexpr) != IVL_EX_SIGNAL) break;
+
+		    /* The signal is part of an array. */
+		    /* Add &APV<> code here when it is finished. */
+		  if (ivl_expr_oper1(vexpr)) break;
+
+                  ivl_expr_t bexpr = ivl_expr_oper2(expr);
+                  assert(bexpr);
+
+		    /* This is a constant bit/part select. */
+                  if (is_constant_number(bexpr)) {
+			snprintf(buffer, sizeof buffer, "&PV<v%p_0, %ld, %u>",
+			         ivl_expr_signal(vexpr),
+			         get_constant_number(bexpr),
+			         ivl_expr_width(expr));
+		    /* This is an indexed bit/part select. */
+                  } else {
+			struct vector_info rv;
+			rv = draw_eval_expr(bexpr, STUFF_OK_XZ);
+			snprintf(buffer, sizeof buffer, "&PV<v%p_0, %u %u, %u>",
+			         ivl_expr_signal(vexpr),
+			         rv.base, rv.wid,
+			         ivl_expr_width(expr));
+                  }
+		  args[idx].text = strdup(buffer);
+		  continue;
+		}
+
 		    /* Everything else will need to be evaluated and
 		       passed as a constant to the vpi task. */
 		default:
diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c
index 08080fe7e..d4f341cd7 100644
--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@@ -92,7 +92,8 @@ unsigned long get_number_immediate(ivl_expr_t ex)
 		    case '0':
 		      break;
 		    case '1':
-		      imm |= 1 << idx;
+		      assert(idx < 8*sizeof(imm));
+		      imm |= 1UL << idx;
 		      break;
 		    default:
 		      assert(0);
@@ -909,12 +910,43 @@ static struct vector_info draw_binary_expr_le(ivl_expr_t exp,
       return lv;
 }
 
+static struct vector_info draw_logic_immediate(ivl_expr_t exp,
+					       ivl_expr_t le,
+					       ivl_expr_t re,
+					       unsigned wid)
+{ /* Emit "le OP immediate(re)" using an immediate-form opcode; only '&' (%andi) is handled. */
+      struct vector_info lv = draw_eval_expr_wid(le, wid, STUFF_OK_XZ);
+      unsigned long imm = get_number_immediate(re);
+
+      assert(lv.base >= 4); /* presumably bases 0-3 are the shared constant bits -- TODO confirm */
+
+      switch (ivl_expr_opcode(exp)) {
+
+	  case '&': /* the result overwrites lv in place */
+	    fprintf(vvp_out, "   %%andi %u, %lu, %u;\n", lv.base, imm, lv.wid);
+	    break;
+
+	  default: /* no other opcode is routed here by the visible caller */
+	    assert(0);
+	    break;
+      }
+
+      return lv;
+}
+
 static struct vector_info draw_binary_expr_logic(ivl_expr_t exp,
 						 unsigned wid)
 {
       ivl_expr_t le = ivl_expr_oper1(exp);
       ivl_expr_t re = ivl_expr_oper2(exp);
 
+      if (ivl_expr_opcode(exp) == '&') {
+	    if (number_is_immediate(re, IMM_WID) && !number_is_unknown(re))
+		  return draw_logic_immediate(exp, le, re, wid);
+	    if (number_is_immediate(le, IMM_WID) && !number_is_unknown(le))
+		  return draw_logic_immediate(exp, re, le, wid);
+      }
+
       struct vector_info lv;
       struct vector_info rv;
 
@@ -1166,23 +1198,50 @@ static struct vector_info draw_add_immediate(ivl_expr_t le,
 
       imm = get_number_immediate(re);
 
-	/* Now generate enough %addi instructions to add the entire
-	   immediate value to the destination. The adds are done 16
-	   bits at a time, but 17 bits are done to push the carry into
-	   the higher bits if needed. */
-      { unsigned base;
-        for (base = 0 ;  base < lv.wid ;  base += 16) {
-	      unsigned long tmp = imm & 0xffffUL;
-	      unsigned add_wid = lv.wid - base;
+	/* This shouldn't generally happen, because the elaborator
+	   should take care of simple constant propagation like this,
+	   but it doesn't have to and it is easy to catch here. */
+      if (imm == 0)
+	    return lv;
 
-	      imm >>= 16;
+      switch (lv.base) { /* bases 0-3 denote constant left operands */
+	  case 0: /* Left expression is 0. */
+	    lv.base = allocate_vector(wid);
+	    if (lv.base == 0) {
+		  fprintf(stderr, "%s:%u: vvp.tgt error: "
+			  "Unable to allocate %u thread bits "
+			  "for result of addition.\n",
+			  ivl_expr_file(re), ivl_expr_lineno(re), wid);
+		  vvp_errors += 1;
+	    }
+	    fprintf(vvp_out, "   %%movi %u, %lu, %u;\n", lv.base, imm, wid); /* operands are comma separated */
+	    break;
 
-	      fprintf(vvp_out, "    %%addi %u, %lu, %u;\n",
-		      lv.base+base, tmp, add_wid);
+	  case 1: /* Left expression is 1...1 (i.e. -1) */
+	    imm -= 1;
+	    if (imm == 0) {
+		  lv.base = 0;
+	    } else {
+		  lv.base = allocate_vector(wid);
+		  if (lv.base == 0) {
+			fprintf(stderr, "%s:%u: vvp.tgt error: "
+				"Unable to allocate %u thread bits "
+				"for result of addition.\n",
+				ivl_expr_file(re), ivl_expr_lineno(re), wid);
+			vvp_errors += 1;
+		  }
+		  fprintf(vvp_out, "   %%movi %u, %lu, %u;\n", lv.base, imm, wid); /* operands are comma separated */
+	    }
+	    break;
 
-	      if (imm == 0)
-		    break;
-	}
+	  case 2: /* Left expression is X or Z */
+	  case 3:
+	    lv.base = 2;
+	    break;
+
+	  default: /* The regular case. */
+	    fprintf(vvp_out, "   %%addi %u, %lu, %u;\n", lv.base, imm, wid);
+	    break;
       }
 
       return lv;
@@ -1203,7 +1262,8 @@ static struct vector_info draw_sub_immediate(ivl_expr_t le,
       assert(lv.wid == wid);
 
       imm = get_number_immediate(re);
-      assert( (imm & ~0xffff) == 0 );
+      if (imm == 0)
+	    return lv;
 
       switch (lv.base) {
 	  case 0:
@@ -1217,21 +1277,21 @@ static struct vector_info draw_sub_immediate(ivl_expr_t le,
 		  vvp_errors += 1;
 	    }
 
-	    fprintf(vvp_out, "    %%mov %u, %u, %u;\n", tmp, lv.base, wid);
+	    fprintf(vvp_out, "   %%mov %u, %u, %u;\n", tmp, lv.base, wid);
 	    lv.base = tmp;
-	    fprintf(vvp_out, "    %%subi %u, %lu, %u;\n", lv.base, imm, wid);
-	    return lv;
+	    fprintf(vvp_out, "   %%subi %u, %lu, %u;\n", lv.base, imm, wid);
+	    break;
 
 	  case 2:
 	  case 3:
 	    lv.base = 2;
-	    return lv;
+	    break;
 
 	  default:
-	    fprintf(vvp_out, "    %%subi %u, %lu, %u;\n", lv.base, imm, wid);
+	    fprintf(vvp_out, "   %%subi %u, %lu, %u;\n", lv.base, imm, wid);
+	    break;
       }
 
-
       return lv;
 }
 
@@ -1246,8 +1306,10 @@ static struct vector_info draw_mul_immediate(ivl_expr_t le,
       assert(lv.wid == wid);
 
       imm = get_number_immediate(re);
+	/* No shortcut for imm == 0: unlike add/sub, le * 0 is not le,
+	   so always emit %muli and let it compute the product. */
 
-      fprintf(vvp_out, "    %%muli %u, %lu, %u;\n", lv.base, imm, lv.wid);
+      fprintf(vvp_out, "   %%muli %u, %lu, %u;\n", lv.base, imm, lv.wid);
 
       return lv;
 }
@@ -1299,13 +1361,13 @@ static struct vector_info draw_binary_expr_arith(ivl_expr_t exp, unsigned wid)
       if ((ivl_expr_opcode(exp) == '-')
 	  && (ivl_expr_type(re) == IVL_EX_NUMBER)
 	  && (! number_is_unknown(re))
-	  && number_is_immediate(re, 16))
+	  && number_is_immediate(re, IMM_WID))
 	    return draw_sub_immediate(le, re, wid);
 
       if ((ivl_expr_opcode(exp) == '*')
 	  && (ivl_expr_type(re) == IVL_EX_NUMBER)
 	  && (! number_is_unknown(re))
-	  && number_is_immediate(re, 16))
+	  && number_is_immediate(re, IMM_WID))
 	    return draw_mul_immediate(le, re, wid);
 
       lv = draw_eval_expr_wid(le, wid, STUFF_OK_XZ);
@@ -1612,9 +1674,9 @@ static struct vector_info draw_number_expr(ivl_expr_t exp, unsigned wid)
 	    vvp_errors += 1;
       }
 
-      if ((!number_is_unknown(exp)) && number_is_immediate(exp, 16)) {
-	    int val = get_number_immediate(exp);
-	    fprintf(vvp_out, "    %%movi %u, %d, %u;\n", res.base, val, wid);
+      if ((!number_is_unknown(exp)) && number_is_immediate(exp, IMM_WID)) {
+	    unsigned long val = get_number_immediate(exp);
+	    fprintf(vvp_out, "   %%movi %u, %lu, %u;\n", res.base, val, wid);
 	    return res;
       }
 
@@ -1836,8 +1898,8 @@ static struct vector_info draw_string_expr(ivl_expr_t exp, unsigned wid)
       idx = 0;
       while (idx < nwid) {
 	    unsigned bits;
-	    unsigned trans = 16;
-	    if (nwid-idx < 16)
+	    unsigned trans = IMM_WID;
+	    if (nwid-idx < trans)
 		  trans = nwid-idx;
 
 	    bits = *p;
@@ -1845,6 +1907,14 @@ static struct vector_info draw_string_expr(ivl_expr_t exp, unsigned wid)
 	    if (trans > 8) {
 		  bits |= *p << 8;
 		  p -= 1;
+		  if (trans > 16) {
+			bits |= *p << 16;
+			p -= 1;
+			if (trans > 24) {
+			      bits |= *p << 24;
+			      p -= 1;
+			}
+		  }
 	    }
 	    fprintf(vvp_out, "  %%movi %u, %u, %u;\n", res.base+idx,bits,trans);
 
@@ -1881,8 +1951,14 @@ void pad_expr_in_place(ivl_expr_t exp, struct vector_info res, unsigned swid)
 			  res.base+idx, res.base+swid-1);
 
       } else {
-	    fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
-		    res.base+swid, res.wid-swid);
+	    unsigned base = res.base+swid;
+	    unsigned count = res.wid-swid;
+	      /* The %movi is faster for larger widths, but for very
+		 small counts, the %mov is faster. */
+	    if (count > 4)
+		  fprintf(vvp_out, "   %%movi %u, 0, %u;\n", base, count);
+	    else
+		  fprintf(vvp_out, "   %%mov %u, 0, %u;\n", base, count);
       }
 }
 
@@ -2086,7 +2162,7 @@ static struct vector_info draw_select_signal(ivl_expr_t sube,
 
       for (idx = 0 ;  idx < res.wid ;  idx += 1) {
 	    if (idx >= bit_wid) {
-		  fprintf(vvp_out, "   %%mov %u, 0, %u; Pad from %u to %u\n",
+		  fprintf(vvp_out, "   %%movi %u, 0, %u; Pad from %u to %u\n",
 			  res.base+idx, res.wid-idx,
 			  ivl_expr_width(sube), wid);
 		  break;
@@ -2410,7 +2486,7 @@ static struct vector_info draw_unary_expr(ivl_expr_t exp, unsigned wid)
 
 		  fprintf(vvp_out, "    %%mov %u, %u, %u;\n",
 			  tmp.base, res.base, res.wid);
-		  fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
+		  fprintf(vvp_out, "    %%movi %u, 0, %u;\n",
 			  tmp.base+res.wid, tmp.wid-res.wid);
 		  clr_vector(res);
 		  res = tmp;
@@ -2460,7 +2536,7 @@ static struct vector_info draw_unary_expr(ivl_expr_t exp, unsigned wid)
 		  assert(res.base);
 		  fprintf(vvp_out, "    %%mov %u, %u, %u;\n",
 			  tmp.base, res.base, res.wid);
-		  fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
+		  fprintf(vvp_out, "    %%movi %u, 0, %u;\n",
 			  tmp.base+res.wid, tmp.wid-res.wid);
 		  clr_vector(res);
 		  res = tmp;
diff --git a/tgt-vvp/vvp_priv.h b/tgt-vvp/vvp_priv.h
index ffb2f12d3..0ccf36343 100644
--- a/tgt-vvp/vvp_priv.h
+++ b/tgt-vvp/vvp_priv.h
@@ -39,6 +39,12 @@ struct vector_info {
       unsigned wid;
 };
 
+/*
+ * Convenient constants...
+ */
+  /* Width limit for typical immediate arguments. */
+# define IMM_WID 32
+
 /*
  * Mangle all non-symbol characters in an identifier, quotes in names
  */
diff --git a/vpi/sys_display.c b/vpi/sys_display.c
index c291d00e3..8b559fb15 100644
--- a/vpi/sys_display.c
+++ b/vpi/sys_display.c
@@ -827,6 +827,7 @@ static void do_display(unsigned int mcd, struct strobe_cb_info*info)
 		case vpiReg:
 		case vpiIntegerVar:
 		case vpiMemoryWord:
+		case vpiPartSelect:
 		  do_display_numeric(mcd, info, item);
 		  break;
 
@@ -1836,6 +1837,7 @@ static char *get_display(unsigned int *rtnsz, struct strobe_cb_info *info)
       case vpiReg:
       case vpiIntegerVar:
       case vpiMemoryWord:
+      case vpiPartSelect:
         width = get_numeric(&result, info, item);
         rtn = realloc(rtn, (size+width)*sizeof(char));
         memcpy(rtn+size-1, result, width);
diff --git a/vpi_user.h b/vpi_user.h
index 0087b52b5..6aba54f7f 100644
--- a/vpi_user.h
+++ b/vpi_user.h
@@ -279,6 +279,7 @@ typedef struct t_vpi_delay  {
 #define vpiNamedFork   35
 #define vpiNet         36
 #define vpiParameter   41
+#define vpiPartSelect  42
 #define vpiPathTerm    43
 #define vpiRealVar     47
 #define vpiReg         48
@@ -297,6 +298,7 @@ typedef struct t_vpi_delay  {
 #define vpiModPathIn     95
 #define vpiModPathOut    96 
 #define vpiVariables   100
+#define vpiExpr        102
 
 #define vpiCallback  1000
 
@@ -346,8 +348,8 @@ typedef struct t_vpi_delay  {
 #   define vpiSysFuncReal  vpiRealFunc
 #   define vpiSysFuncTime  vpiTimeFunc
 #   define vpiSysFuncSized vpiSizedFunc
-#define vpiSigned    65
-#define vpiExpr      102
+#define vpiConstantSelect 53
+#define vpiSigned         65
 /* IVL private properties */
 #define _vpiNexusId 0x1000000
 
diff --git a/vvp/codes.h b/vvp/codes.h
index 0e93f72de..0ac52e0c6 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -37,6 +37,7 @@ extern bool of_ADD(vthread_t thr, vvp_code_t code);
 extern bool of_ADD_WR(vthread_t thr, vvp_code_t code);
 extern bool of_ADDI(vthread_t thr, vvp_code_t code);
 extern bool of_AND(vthread_t thr, vvp_code_t code);
+extern bool of_ANDI(vthread_t thr, vvp_code_t code);
 extern bool of_ANDR(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_AV(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_AVD(vthread_t thr, vvp_code_t code);
@@ -167,7 +168,7 @@ struct vvp_code_s {
       };
 
       union {
-	    unsigned bit_idx[2];
+	    uint32_t    bit_idx[2];
 	    vvp_net_t   *net2;
 	    vvp_code_t   cptr2;
 	    struct ufunc_core*ufunc_core_ptr;
diff --git a/vvp/compile.cc b/vvp/compile.cc
index d2e1c96d8..4ad8bf17d 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -85,6 +85,7 @@ const static struct opcode_table_s opcode_table[] = {
       { "%addi",   of_ADDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%and",    of_AND,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%and/r",  of_ANDR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%andi",   of_ANDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%assign/av",of_ASSIGN_AV,3,{OA_ARR_PTR,OA_BIT1,     OA_BIT2} },
       { "%assign/av/d",of_ASSIGN_AVD,3,{OA_ARR_PTR,OA_BIT1,  OA_BIT2} },
       { "%assign/v0",of_ASSIGN_V0,3,{OA_FUNC_PTR,OA_BIT1,    OA_BIT2} },
diff --git a/vvp/lexor.lex b/vvp/lexor.lex
index c3488bb72..fb2ec4ade 100644
--- a/vvp/lexor.lex
+++ b/vvp/lexor.lex
@@ -173,22 +173,24 @@
 "%disable"  { return K_disable; }
 "%fork"     { return K_fork; }
 
+  /* Handle the specialized variable access functions. */
+
+"&A" { return K_A; }
+"&PV" { return K_PV; }
+
 "%"[.$_/a-zA-Z0-9]+ {
       yylval.text = strdup(yytext);
       assert(yylval.text);
       return T_INSTR; }
 
 [0-9][0-9]* {
-      yylval.numb = strtol(yytext, 0, 0);
+      yylval.numb = strtoul(yytext, 0, 0);
       return T_NUMBER; }
 
 "0x"[0-9a-fA-F]+ {
-      yylval.numb = strtol(yytext, 0, 0);
+      yylval.numb = strtoul(yytext, 0, 0);
       return T_NUMBER; }
 
-
-"&A" { return K_A; }
-
   /* Handle some specialized constant/literals as symbols. */
 
 "C4<"[01xz]*">" {
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index 84941815c..59bf80097 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -531,7 +531,8 @@ is one of the 4 constant bits, the effect is to replicate the value
 into the destination vector. This is useful for filling a vector.
 
 The %movi variant moves a binary value, LSB first, into the
-destination vector.
+destination vector. The immediate value is up to 32 bits, padded with
+zeros to fill out the width.
 
 * %mul <bit-l>, <bit-r>, <wid>
 
diff --git a/vvp/parse.y b/vvp/parse.y
index dd8a2b22d..fd12da226 100644
--- a/vvp/parse.y
+++ b/vvp/parse.y
@@ -47,7 +47,7 @@ static struct __vpiModPath*modpath_dst = 0;
 %union {
       char*text;
       char **table;
-      long numb;
+      unsigned long numb;
       bool flag;
 
       comp_operands_t opa;
@@ -77,7 +77,7 @@ static struct __vpiModPath*modpath_dst = 0;
 %token K_EVENT K_EVENT_OR K_EXTEND_S K_FUNCTOR K_MODPATH K_NET K_NET_S K_NET_R
 %token K_NET8 K_NET8_S
 %token K_PARAM_STR K_PARAM_L K_PARAM_REAL K_PART K_PART_PV
-%token K_PART_V K_REDUCE_AND K_REDUCE_OR K_REDUCE_XOR
+%token K_PART_V K_PV K_REDUCE_AND K_REDUCE_OR K_REDUCE_XOR
 %token K_REDUCE_NAND K_REDUCE_NOR K_REDUCE_XNOR K_REPEAT
 %token K_RESOLV K_SCOPE K_SFUNC K_SHIFTL K_SHIFTR K_SHIFTRS
 %token K_THREAD K_TIMESCALE K_UFUNC
@@ -790,9 +790,14 @@ argument
       }
   | K_A '<' T_SYMBOL ',' T_NUMBER '>'
       { $$ = vpip_make_vthr_A($3, $5); }
+  | K_PV '<' T_SYMBOL ',' T_NUMBER ',' T_NUMBER '>'
+      { $$ = vpip_make_PV($3, $5, $7); }
+  | K_PV '<' T_SYMBOL ',' '-' T_NUMBER ',' T_NUMBER '>'
+      { $$ = vpip_make_PV($3, -$6, $8); }
+  | K_PV '<' T_SYMBOL ',' T_NUMBER T_NUMBER ',' T_NUMBER '>'
+      { $$ = vpip_make_PV($3, $5, $6, $8); }
   ;
 
-
   /* functor operands can only be a list of symbols. */
 symbols
 	: symbol
diff --git a/vvp/vpi_priv.h b/vvp/vpi_priv.h
index fc43caeff..be3f42bef 100644
--- a/vvp/vpi_priv.h
+++ b/vvp/vpi_priv.h
@@ -220,6 +220,20 @@ extern vpiHandle vpip_make_reg(const char*name, int msb, int lsb,
 extern vpiHandle vpip_make_net(const char*name, int msb, int lsb,
 			       bool signed_flag, vvp_net_t*node);
 
+/*
+ * This is used by system calls to represent a bit/part select of
+ * a simple variable or constant array word (the &PV<> argument form).
+ */
+struct __vpiPV {
+      struct __vpiHandle base;
+      vpiHandle parent;
+      vvp_net_t*net;
+      int tbase; /* presumably the select base, or its thread location -- TODO confirm */
+      unsigned twid, width; /* presumably thread-vector width and select width -- TODO confirm */
+};
+extern vpiHandle vpip_make_PV(char*name, int base, int width); /* &PV<sym, base, width> */
+extern vpiHandle vpip_make_PV(char*name, int tbase, int twid, int width); /* &PV<sym, tbase twid, width> */
+
 /*
  * This function safely converts a vpiHandle back to a
  * __vpiSignal. Return a nil if the type is not appropriate.
@@ -368,6 +382,7 @@ struct __vpiSysTaskCall {
       class vvp_net_t*fnet;
       unsigned file_idx;
       unsigned lineno;
+      bool put_value;
 };
 
 extern struct __vpiSysTaskCall*vpip_cur_task;
diff --git a/vvp/vpi_signal.cc b/vvp/vpi_signal.cc
index 7f09be83f..f1efa7f36 100644
--- a/vvp/vpi_signal.cc
+++ b/vvp/vpi_signal.cc
@@ -48,7 +48,7 @@
  * draw_tt.c program.
  */
 extern const char hex_digits[256];
-extern const char oct_digits[256];
+extern const char oct_digits[64];
 
 /*
  * The string values need a result buf to hold the results. This
@@ -109,6 +109,385 @@ char *generic_get_str(int code, vpiHandle ref, const char *name, const char *ind
 	return res;
 }
 
+/*
+ * The standard formatting/conversion routines.
+ * They work with full or partial signals.
+ */
+
+static void format_vpiBinStrVal(vvp_fun_signal_vec*sig, int base, unsigned wid,
+                                s_vpi_value*vp)
+{ /* Store bits [base, base+wid) of sig in vp as a binary string, MSB first. */
+      char *rbuf = need_result_buf(wid+1, RBUF_VAL);
+      long offset = (long)wid - 1 + base; /* signed math: base may be negative */
+      long end = base + (signed)wid;
+      long ssize = (signed)sig->size();
+
+      for (long idx = base ;  idx < end ;  idx += 1) {
+	    if (idx < 0 || idx >= ssize) { /* bits outside the signal read as x */
+                  rbuf[offset-idx] = 'x';
+	    } else {
+                  rbuf[offset-idx] = vvp_bit4_to_ascii(sig->value(idx));
+	    }
+      }
+      rbuf[wid] = 0;
+
+      vp->value.str = rbuf;
+}
+
+static void format_vpiOctStrVal(vvp_fun_signal_vec*sig, int base, unsigned wid,
+                                s_vpi_value*vp)
+{ /* Store bits [base, base+wid) of sig in vp as an octal digit string. */
+      unsigned dwid = (wid + 2) / 3; /* number of octal digits, rounded up */
+      char *rbuf = need_result_buf(dwid+1, RBUF_VAL);
+      long end = base + (signed)wid;
+      long ssize = (signed)sig->size();
+      unsigned val = 0; /* 2-bit codes of up to three bits, used to index oct_digits */
+
+      rbuf[dwid] = 0;
+      for (long idx = base ;  idx < end ;  idx += 1) {
+	    unsigned bit = 0;
+	    if (idx < 0 || idx >= ssize) { /* bits outside the signal read as X */
+                  bit = 2; // BIT4_X
+	    } else {
+                  switch (sig->value(idx)) {
+		      case BIT4_0:
+			bit = 0;
+			break;
+		      case BIT4_1:
+			bit = 1;
+			break;
+		      case BIT4_X:
+			bit = 2;
+			break;
+		      case BIT4_Z:
+			bit = 3;
+			break;
+                  }
+	    }
+	    val |= bit << 2*((idx-base) % 3); /* each bit occupies two bits of val */
+
+	    if ((idx-base) % 3 == 2) { /* a full digit is collected; digits fill from the right */
+		dwid -= 1;
+		rbuf[dwid] = oct_digits[val];
+		val = 0;
+	    }
+      }
+
+	/* A partial top digit that is entirely X (or Z) is widened to
+	   the all-X (42) or all-Z (63) code of a full three bit digit. */
+      switch (wid % 3) {
+	  case 1:
+	    if (val == 2) val = 42;
+	    else if (val == 3) val = 63;
+	    break;
+	  case 2:
+	    if (val == 10) val = 42;
+	    else if (val == 15) val = 63;
+	    break;
+      }
+
+      if (dwid > 0) rbuf[0] = oct_digits[val]; /* emit the partial top digit, if any */
+
+      vp->value.str = rbuf;
+}
+
+static void format_vpiHexStrVal(vvp_fun_signal_vec*sig, int base, unsigned wid,
+                                s_vpi_value*vp)
+{ /* Store bits [base, base+wid) of sig in vp as a hexadecimal digit string. */
+      unsigned dwid = (wid + 3) / 4; /* number of hex digits, rounded up */
+      char *rbuf = need_result_buf(dwid+1, RBUF_VAL);
+      long end = base + (signed)wid;
+      long ssize = (signed)sig->size();
+      unsigned val = 0; /* 2-bit codes of up to four bits, used to index hex_digits */
+
+      rbuf[dwid] = 0;
+      for (long idx = base ;  idx < end ;  idx += 1) {
+	    unsigned bit = 0;
+	    if (idx < 0 || idx >= ssize) { /* bits outside the signal read as X */
+                  bit = 2; // BIT4_X
+	    } else {
+                  switch (sig->value(idx)) {
+		      case BIT4_0:
+			bit = 0;
+			break;
+		      case BIT4_1:
+			bit = 1;
+			break;
+		      case BIT4_X:
+			bit = 2;
+			break;
+		      case BIT4_Z:
+			bit = 3;
+			break;
+                  }
+	    }
+	    val |= bit << 2*((idx-base) % 4); /* each bit occupies two bits of val */
+
+	    if ((idx-base) % 4 == 3) { /* a full digit is collected; digits fill from the right */
+		dwid -= 1;
+		rbuf[dwid] = hex_digits[val];
+		val = 0;
+	    }
+      }
+
+	/* A partial top digit that is entirely X (or Z) is widened to
+	   the all-X (170) or all-Z (255) code of a full four bit digit. */
+      switch (wid % 4) {
+	  case 1:
+	    if (val == 2) val = 170;
+	    else if (val == 3) val = 255;
+	    break;
+	  case 2:
+	    if (val == 10) val = 170;
+	    else if (val == 15) val = 255;
+	    break;
+	  case 3:
+	    if (val == 42) val = 170;
+	    else if (val == 63) val = 255;
+	    break;
+      }
+
+      if (dwid > 0) rbuf[0] = hex_digits[val]; /* emit the partial top digit, if any */
+
+      vp->value.str = rbuf;
+}
+
+static void format_vpiDecStrVal(vvp_fun_signal_vec*sig, int base, unsigned wid,
+                                int signed_flag, s_vpi_value*vp)
+{ /* Store bits [base, base+wid) of sig in vp as a decimal string. */
+      unsigned hwid = (sig->size()+2) / 3 + 1; /* buffer is sized from the whole signal, not wid */
+      char *rbuf = need_result_buf(hwid, RBUF_VAL);
+      long ssize = (signed)sig->size();
+      long end = base + (signed)wid;
+
+	/* Does the select reach outside of the real signal vector? */
+      if (base < 0 || end > ssize) {
+	    bool all_x = true;
+	    if (end > ssize) end = ssize; /* clip to the bits that really exist */
+	    if (base < 0) base = 0;
+	    for (long idx = base ;  idx < end ;  idx += 1) {
+		  if (sig->value(idx) != BIT4_X) {
+			all_x = false;
+			break;
+		  }
+	    }
+
+	    if (all_x) { /* "x" when every in-range bit is X, otherwise "X" */
+		  rbuf[0] = 'x';
+	    } else {
+		  rbuf[0] = 'X';
+	    }
+	    rbuf[1] = 0;
+
+	    vp->value.str = rbuf;
+	    return;
+      }
+
+      vvp_vector4_t vec4;
+      if (base == 0 && end == ssize) { /* full vector: no sub-select needed */
+	    vec4 = sig->vec4_value();
+      } else {
+	    vec4 = sig->vec4_value().subvalue(base, wid);
+      }
+
+      vpip_vec4_to_dec_str(vec4, rbuf, hwid, signed_flag);
+
+      vp->value.str = rbuf;
+}
+
+static void format_vpiIntVal(vvp_fun_signal_vec*sig, int base, unsigned wid,
+                             s_vpi_value*vp)
+{ /* Store bits [base, base+wid) of sig in vp->value.integer; only 1 bits are set. */
+      unsigned iwid = 8 * sizeof(vp->value.integer);
+      long ssize = (signed)sig->size();
+
+      if (wid > iwid) wid = iwid; /* bits beyond the integer width are dropped */
+      long end = base + (signed)wid;
+      if (end > ssize) end = ssize;
+
+      vp->value.integer = 0;
+      for (long idx = (base < 0) ? 0 : base ;  idx < end ;  idx += 1) { /* only in-range bits are read */
+	    if (sig->value(idx) == BIT4_1) { /* 0, X, Z and out-of-range bits stay 0 */
+		  vp->value.integer |= 1<<(idx-base);
+	    }
+      }
+}
+
+static void format_vpiRealVal(vvp_fun_signal_vec*sig, int base, unsigned wid,
+                              int signed_flag, s_vpi_value*vp)
+{ /* Store bits [base, base+wid) of sig in vp->value.real via vector4_to_value(). */
+      vvp_vector4_t vec4(wid); /* bits outside the signal keep the constructor's initial value */
+      long ssize = (signed)sig->size();
+      long end = base + (signed)wid;
+      if (end > ssize) end = ssize;
+
+      for (long idx = (base < 0) ? 0 : base ;  idx < end ;  idx += 1) { /* copy the in-range bits */
+	    vec4.set_bit(idx-base, sig->value(idx));
+      }
+
+      vp->value.real = 0.0;
+      vector4_to_value(vec4, vp->value.real, signed_flag);
+}
+
+static void format_vpiStringVal(vvp_fun_signal_vec*sig, int base, unsigned wid,
+                                s_vpi_value*vp)
+{ /* Store bits [base, base+wid) of sig in vp as 8-bit characters, MSB first. */
+      /* The result will use a character for each 8 bits of the
+	 vector. Add one extra character for the highest bits that
+	 don't form an 8 bit group. */
+      char *rbuf = need_result_buf(wid/8 + ((wid&7)!=0) + 1, RBUF_VAL);
+      char *cp = rbuf;
+
+      char tmp = 0;
+      for (long idx = base+(signed)wid-1; idx >= base; idx -= 1) {
+	    tmp <<= 1;
+
+	    if (idx >=0 && idx < (signed)sig->size() && /* out-of-range, 0, X and Z bits count as 0 */
+	        sig->value(idx) == BIT4_1) {
+		   tmp |= 1;
+	    }
+
+	    if (((idx-base)&7)==0){ /* lowest bit of a character reached */
+		  /* Skip leading nulls. */
+		  if (tmp == 0 && cp == rbuf)
+			continue;
+
+		  /* Nulls in the middle get turned into spaces. */
+		  *cp++ = tmp ? tmp : ' ';
+		  tmp = 0;
+	    }
+      }
+      *cp++ = 0;
+
+      vp->value.str = rbuf;
+}
+
+static void format_vpiScalarVal(vvp_fun_signal_vec*sig, int base, 
+                                s_vpi_value*vp)
+{ /* Store the single bit of sig at index base in vp->value.scalar. */
+      if (base >= 0 && base < (signed)sig->size()) {
+	    switch (sig->value(base)) {
+		case BIT4_0:
+		  vp->value.scalar = vpi0;
+		  break;
+		case BIT4_1:
+		  vp->value.scalar = vpi1;
+		  break;
+		case BIT4_X: { /* X is reported as H or L when one strength equals 1 */
+		  vvp_scalar_t strn = sig->scalar_value(base);
+		  if (strn.strength0() == 1) vp->value.scalar = vpiH;
+		  else if (strn.strength1() == 1) vp->value.scalar = vpiL;
+		  else vp->value.scalar = vpiX;
+		  break;
+		}
+		case BIT4_Z:
+		  vp->value.scalar = vpiZ;
+		  break;
+	    }
+      } else { /* an index outside the signal reads as X */
+	    vp->value.scalar = vpiX;
+      }
+}
+
+static void format_vpiStrengthVal(vvp_fun_signal_vec*sig, int base,
+                                  unsigned wid, s_vpi_value*vp)
+{ /* Store one s_vpi_strengthval per bit of [base, base+wid) in vp->value.strength. */
+      long end = base + (signed)wid;
+      s_vpi_strengthval*op;
+
+      op = (s_vpi_strengthval*)
+	    need_result_buf(wid * sizeof(s_vpi_strengthval), RBUF_VAL);
+
+      for (long idx = base ;  idx < end ;  idx += 1) { /* entry idx-base describes signal bit idx */
+	    if (idx >=0 && idx < (signed)sig->size()) {
+		  vvp_scalar_t val = sig->scalar_value(idx);
+
+		  /* vvp_scalar_t strengths are 0-7, but the vpi strength
+		     is bit0-bit7. This gets the vpi form of the strengths
+		     from the vvp_scalar_t strengths. */
+		  unsigned s0 = 1 << val.strength0();
+		  unsigned s1 = 1 << val.strength1();
+
+		  switch (val.value()) {
+		      case BIT4_0: /* both strengths are reported on the 0 side */
+			op[idx-base].logic = vpi0;
+			op[idx-base].s0 = s0|s1;
+			op[idx-base].s1 = 0;
+			break;
+
+		      case BIT4_1: /* both strengths are reported on the 1 side */
+			op[idx-base].logic = vpi1;
+			op[idx-base].s0 = 0;
+			op[idx-base].s1 = s0|s1;
+			break;
+
+		      case BIT4_X:
+			op[idx-base].logic = vpiX;
+			op[idx-base].s0 = s0;
+			op[idx-base].s1 = s1;
+			break;
+
+		      case BIT4_Z:
+			op[idx-base].logic = vpiZ;
+			op[idx-base].s0 = vpiHiZ;
+			op[idx-base].s1 = vpiHiZ;
+			break;
+		  }
+	    } else { /* bits outside the signal are reported as strong-drive X */
+		  op[idx-base].logic = vpiX;
+		  op[idx-base].s0 = vpiStrongDrive;
+		  op[idx-base].s1 = vpiStrongDrive;
+	    }
+      }
+
+      vp->value.strength = op;
+}
+
+static void format_vpiVectorVal(vvp_fun_signal_vec*sig, int base, unsigned wid,
+                                s_vpi_value*vp)
+{ /* Store bits [base, base+wid) of sig in vp->value.vector as aval/bval words. */
+      long end = base + (signed)wid;
+      unsigned int obit = 0; /* bit position within the current 32 bit word */
+      unsigned hwid = (wid - 1)/32 + 1; /* number of s_vpi_vecval words needed */
+
+      s_vpi_vecval *op = (p_vpi_vecval)
+                         need_result_buf(hwid * sizeof(s_vpi_vecval), RBUF_VAL);
+      vp->value.vector = op;
+
+      op->aval = op->bval = 0;
+      for (long idx = base ;  idx < end ;  idx += 1) {
+	    if (idx >= 0 && idx < (signed)sig->size()) { /* test the bit being read, not the select base */
+		switch (sig->value(idx)) {
+		case BIT4_0: /* aval=0 bval=0 */
+		  op->aval &= ~(1 << obit);
+		  op->bval &= ~(1 << obit);
+		  break;
+		case BIT4_1: /* aval=1 bval=0 */
+		  op->aval |= (1 << obit);
+		  op->bval &= ~(1 << obit);
+		  break;
+		case BIT4_X: /* aval=1 bval=1 */
+		  op->aval |= (1 << obit);
+		  op->bval |= (1 << obit);
+		  break;
+		case BIT4_Z: /* aval=0 bval=1 */
+		  op->aval &= ~(1 << obit);
+		  op->bval |= (1 << obit);
+		  break;
+		}
+	    } else {  /* BIT4_X */
+		  op->aval |= (1 << obit);
+		  op->bval |= (1 << obit);
+	    }
+
+	    obit++;
+	    if (!(obit % 32)) { /* word is full: advance and clear the next one if it exists */
+		  op += 1;
+		  if ((op - vp->value.vector) < (ptrdiff_t)hwid)
+			op->aval = op->bval = 0;
+		  obit = 0;
+	    }
+      }
+}
+
 struct __vpiSignal* vpip_signal_from_handle(vpiHandle ref)
 {
       if ((ref->vpi_type->type_code != vpiNet)
@@ -245,65 +624,6 @@ static vpiHandle signal_iterate(int code, vpiHandle ref)
       return 0;
 }
 
-
-static char *signal_vpiDecStrVal(struct __vpiSignal*rfp, s_vpi_value*vp)
-{
-      vvp_fun_signal_vec*vsig = dynamic_cast<vvp_fun_signal_vec*>(rfp->node->fun);
-      assert(vsig);
-
-      unsigned hwid = (vsig->size()+2) / 3 + 1;
-      char *rbuf = need_result_buf(hwid, RBUF_VAL);
-
-      vpip_vec4_to_dec_str(vsig->vec4_value(), rbuf, hwid, rfp->signed_flag);
-
-      return rbuf;
-}
-
-
-static char *signal_vpiStringVal(struct __vpiSignal*rfp, s_vpi_value*vp)
-{
-      unsigned wid = (rfp->msb >= rfp->lsb)
-	    ? (rfp->msb - rfp->lsb + 1)
-	    : (rfp->lsb - rfp->msb + 1);
-
-      vvp_fun_signal*vsig = dynamic_cast<vvp_fun_signal*>(rfp->node->fun);
-
-      /* The result will use a character for each 8 bits of the
-	 vector. Add one extra character for the highest bits that
-	 don't form an 8 bit group. */
-      char *rbuf = need_result_buf(wid/8 + ((wid&7)!=0) + 1, RBUF_VAL);
-      char *cp = rbuf;
-
-      char tmp = 0;
-      int bitnr;
-      for(bitnr=wid-1; bitnr>=0; bitnr--){
-	  tmp <<= 1;
-
-	  switch (vsig->value(bitnr)) {
-	  case BIT4_0:
-	      break;
-	  case  BIT4_1:
-	      tmp |= 1;
-	      break;
-	  default:
-	      break;
-	  }
-
-	  if ((bitnr&7)==0){
-		  /* Skip leading nulls. */
-		if (tmp == 0 && cp == rbuf)
-		      continue;
-
-		  /* Nulls in the middle get turned into spaces. */
-		*cp++ = tmp? tmp : ' ';
-		tmp = 0;
-	  }
-      }
-      *cp++ = 0;
-
-      return rbuf;
-}
-
 static unsigned signal_width(const struct __vpiSignal*rfp)
 {
       unsigned wid = (rfp->msb >= rfp->lsb)
@@ -313,97 +633,6 @@ static unsigned signal_width(const struct __vpiSignal*rfp)
       return wid;
 }
 
-static void signal_get_IntVal(struct __vpiSignal*rfp, s_vpi_value*vp)
-{
-      unsigned wid = signal_width(rfp);
-      unsigned iwid = 8 * sizeof vp->value.integer;
-      vvp_fun_signal_vec*vsig = dynamic_cast<vvp_fun_signal_vec*>(rfp->node->fun);
-
-      if (wid > iwid) {
-            wid = iwid;
-      }
-      vp->value.integer = 0;
-
-      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
-	    switch (vsig->value(idx)) {
-		case BIT4_0:
-		  break;
-		case BIT4_1:
-		  vp->value.integer |= 1<<idx;
-		  break;
-		default:
-		    /* vpi_get_value of vpiIntVal treats x and z
-		       values as 0. */
-		  break;
-	    }
-      }
-}
-
-static void signal_get_ScalarVal(struct __vpiSignal*rfp, s_vpi_value*vp)
-{
-      vvp_fun_signal*vsig = dynamic_cast<vvp_fun_signal*>(rfp->node->fun);
-
-      switch (vsig->value(0)) {
-	  case BIT4_0:
-	    vp->value.scalar = vpi0;
-	    break;
-	  case BIT4_1:
-	    vp->value.scalar = vpi1;
-	    break;
-	  case BIT4_X:
-	    vp->value.scalar = vpiX;
-	    break;
-	  case BIT4_Z:
-	    vp->value.scalar = vpiZ;
-	    break;
-      }
-}
-
-static void signal_get_StrengthVal(struct __vpiSignal*rfp, s_vpi_value*vp)
-{
-      vvp_fun_signal_vec*vsig = dynamic_cast<vvp_fun_signal_vec*>(rfp->node->fun);
-      unsigned wid = signal_width(rfp);
-      s_vpi_strengthval*op;
-
-      op = (s_vpi_strengthval*)
-	    need_result_buf(wid * sizeof(s_vpi_strengthval), RBUF_VAL);
-
-      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
-	    vvp_scalar_t val = vsig->scalar_value(idx);
-
-	     /* vvp_scalar_t strengths are 0-7, but the vpi strength
-		is bit0-bit7. This gets the vpi form of the strengths
-		from the vvp_scalar_t strengths. */
-	    unsigned s0 = 1 << val.strength0();
-	    unsigned s1 = 1 << val.strength1();
-
-	    switch (val.value()) {
-	        case BIT4_0:
-		  op[idx].logic = vpi0;
-		  op[idx].s0 = s0|s1;
-		  op[idx].s1 = 0;
-		  break;
-	        case BIT4_1:
-		  op[idx].logic = vpi1;
-		  op[idx].s0 = 0;
-		  op[idx].s1 = s0|s1;
-		  break;
-	        case BIT4_X:
-		  op[idx].logic = vpiX;
-		  op[idx].s0 = s0;
-		  op[idx].s1 = s1;
-		  break;
-	        case BIT4_Z:
-		  op[idx].logic = vpiZ;
-		  op[idx].s0 = vpiHiZ;
-		  op[idx].s1 = vpiHiZ;
-		  break;
-	    }
-      }
-
-      vp->value.strength = op;
-}
-
 /*
  * The get_value method reads the values of the functors and returns
  * the vector to the caller. This causes no side-effect, and reads the
@@ -421,146 +650,48 @@ static void signal_get_value(vpiHandle ref, s_vpi_value*vp)
       vvp_fun_signal_vec*vsig = dynamic_cast<vvp_fun_signal_vec*>(rfp->node->fun);
       assert(vsig);
 
-      char *rbuf = 0;
-
       switch (vp->format) {
 
 	  case vpiIntVal:
-	    signal_get_IntVal(rfp, vp);
+	    format_vpiIntVal(vsig, 0, wid, vp);
 	    break;
 
 	  case vpiScalarVal:
-	    signal_get_ScalarVal(rfp, vp);
+	    format_vpiScalarVal(vsig, 0, vp);
 	    break;
 
 	  case vpiStrengthVal:
-	    signal_get_StrengthVal(rfp, vp);
+	    format_vpiStrengthVal(vsig, 0, wid, vp);
 	    break;
 
 	  case vpiBinStrVal:
-	    rbuf = need_result_buf(wid+1, RBUF_VAL);
-
-	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
-		  rbuf[wid-idx-1] = vvp_bit4_to_ascii(vsig->value(idx));
-	    }
-	    rbuf[wid] = 0;
-	    vp->value.str = rbuf;
+	    format_vpiBinStrVal(vsig, 0, wid, vp);
 	    break;
 
 	  case vpiHexStrVal: {
-		unsigned hwid = (wid + 3) / 4;
-
-		rbuf = need_result_buf(hwid+1, RBUF_VAL);
-		rbuf[hwid] = 0;
-
-		vpip_vec4_to_hex_str(vsig->vec4_value(), rbuf, hwid+1, false);
-		vp->value.str = rbuf;
-		break;
+	    format_vpiHexStrVal(vsig, 0, wid, vp);
+	    break;
 	  }
 
-	  case vpiOctStrVal: {
-		unsigned hval, hwid;
-		hwid = (wid + 2) / 3;
-
-		rbuf = need_result_buf(hwid+1, RBUF_VAL);
-		rbuf[hwid] = 0;
-		hval = 0;
-		for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
-		      unsigned tmp = 0;
-		      switch (vsig->value(idx)) {
-			  case BIT4_0:
-			    tmp = 0;
-			    break;
-			  case BIT4_1:
-			    tmp = 1;
-			    break;
-			  case BIT4_Z:
-			    tmp = 3;
-			    break;
-			  case BIT4_X:
-			    tmp = 2;
-			    break;
-		      }
-		      hval = hval | (tmp << 2*(idx % 3));
-
-		      if (idx%3 == 2) {
-			    hwid -= 1;
-			    rbuf[hwid] = oct_digits[hval];
-			    hval = 0;
-		      }
-		}
-
-		if (hwid > 0) {
-		      hwid -= 1;
-		      rbuf[hwid] = oct_digits[hval];
-		      unsigned padd = 0;
-		      switch(rbuf[hwid]) {
-			  case 'X': padd = 2; break;
-			  case 'Z': padd = 3; break;
-		      }
-		      if (padd) {
-			    for (unsigned idx = wid % 3; idx < 3; idx += 1) {
-				  hval = hval | padd << 2*idx;
-			    }
-			    rbuf[hwid] = oct_digits[hval];
-		      }
-		}
-		vp->value.str = rbuf;
-		break;
-	  }
+	  case vpiOctStrVal:
+	    format_vpiOctStrVal(vsig, 0, wid, vp);
+	    break;
 
 	  case vpiDecStrVal:
-	    vp->value.str = signal_vpiDecStrVal(rfp, vp);
+	    format_vpiDecStrVal(vsig, 0, wid, rfp->signed_flag, vp);
 	    break;
 
 	  case vpiStringVal:
-	    vp->value.str = signal_vpiStringVal(rfp, vp);
+	    format_vpiStringVal(vsig, 0, wid, vp);
 	    break;
 
-	  case vpiVectorVal: {
-	      unsigned int obit = 0;
-	      unsigned hwid = (wid - 1)/32 + 1;
-
-	      rbuf = need_result_buf(hwid * sizeof(s_vpi_vecval), RBUF_VAL);
-	      s_vpi_vecval *op = (p_vpi_vecval)rbuf;
-	      vp->value.vector = op;
-
-	      op->aval = op->bval = 0;
-	      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
-		switch (vsig->value(idx)) {
-		case BIT4_0:
-		  op->aval &= ~(1 << obit);
-		  op->bval &= ~(1 << obit);
-		  break;
-		case BIT4_1:
-		  op->aval |= (1 << obit);
-		  op->bval &= ~(1 << obit);
-		  break;
-		case BIT4_X:
-		  op->aval |= (1 << obit);
-		  op->bval |= (1 << obit);
-		  break;
-		case BIT4_Z:
-		  op->aval &= ~(1 << obit);
-		  op->bval |= (1 << obit);
-		  break;
-		}
-		obit++;
-		if (!(obit % 32)) {
-		      op += 1;
-		      if ((op - vp->value.vector) < (ptrdiff_t)hwid)
-			    op->aval = op->bval = 0;
-		      obit = 0;
-		}
-	      }
-	      break;
-	    }
+	  case vpiVectorVal:
+	    format_vpiVectorVal(vsig, 0, wid, vp);
+	    break;
 
 	  case vpiRealVal: {
-		bool flag = rfp->signed_flag;
-		vp->value.real = 0.0;
-		vector4_to_value(vsig->vec4_value(), vp->value.real, flag);
-		break;
+	    format_vpiRealVal(vsig, 0, wid, rfp->signed_flag, vp);
+	    break;
 	  }
 
 	  default:
@@ -812,3 +943,218 @@ vpiHandle vpip_make_net(const char*name, int msb, int lsb,
 
       return &obj->base;
 }
+
+static int PV_get_base(struct __vpiPV*rfp)  // compute the base bit index used by the PV_* accessors
+{
+      if (rfp->twid == 0) return rfp->tbase;  // twid == 0: tbase itself is the base
+
+      int tval = 0;
+      for (unsigned idx = 0 ;  idx < rfp->twid ;  idx += 1) {  // otherwise read twid bits of the current thread, starting at tbase
+	    vvp_bit4_t bit = vthread_get_bit(vpip_current_vthread,
+                                              rfp->tbase + idx);
+	    if (bit == BIT4_1) {  // only BIT4_1 sets a result bit; every other value leaves it 0
+		  tval |= 1<<idx;
+	    }
+      }
+
+      return tval;
+}
+
+static int PV_get(int code, vpiHandle ref)  // integer properties of a part select handle
+{
+      assert(ref->vpi_type->type_code == vpiPartSelect);
+      struct __vpiPV*rfp = (struct __vpiPV*)ref;
+
+      int rval = 0;
+      switch (code) {
+	case vpiLineNo:
+	    return 0;  // Not implemented for now!
+
+	case vpiSigned:
+	    return 0;  // A part/bit select is always unsigned!
+
+	case vpiSize:
+	    return rfp->width;
+
+	case vpiConstantSelect:
+	    return rfp->twid == 0;  // true when the base does not come from thread bits
+
+	case vpiLeftRange: rval += rfp->width - 1;  // left = right + width - 1; fall through
+	case vpiRightRange:
+	    rval += vpi_get(vpiRightRange, rfp->parent) + PV_get_base(rfp);
+	    return rval;
+
+	default:
+	    fprintf(stderr, "PV_get: property %d is unknown\n", code);
+      }
+
+      return 0;  // unknown properties report 0 after the diagnostic
+}
+
+static char* PV_get_str(int code, vpiHandle ref)  // string properties of a part select handle
+{
+      assert(ref->vpi_type->type_code == vpiPartSelect);
+      struct __vpiPV*rfp = (struct __vpiPV*)ref;
+
+      switch (code) {
+	case vpiFile:  // Not implemented for now!
+	    return simple_set_rbuf_str(file_names[0]);  // always reports file_names[0]
+
+	case vpiName:
+	case vpiFullName: {  // parent's name (same property code) followed by "[left:right]"
+	    const char*nm = vpi_get_str(code, rfp->parent);
+	    char full[1024+strlen(nm)];  // parent name plus 1024 bytes for the range suffix
+	    sprintf(full, "%s[%d:%d]", nm, vpi_get(vpiLeftRange, ref),
+	                                   vpi_get(vpiRightRange, ref));
+	    return simple_set_rbuf_str(full);
+	}
+
+	default:
+	    fprintf(stderr, "PV_get_str: property %d is unknown\n", code);
+      }
+
+      return 0;  // unknown properties return a null pointer after the diagnostic
+}
+
+static void PV_get_value(vpiHandle ref, p_vpi_value vp)  // read the selected bits in the format named by vp->format
+{
+      assert(ref->vpi_type->type_code == vpiPartSelect);
+      struct __vpiPV*rfp = (struct __vpiPV*)ref;
+
+      vvp_fun_signal_vec*sig = dynamic_cast<vvp_fun_signal_vec*>(rfp->net->fun);
+      assert(sig);  // the net's functor must be a vvp_fun_signal_vec
+
+      switch (vp->format) {  // each case hands (sig, base, width) to the matching format_vpi* helper
+
+	  case vpiIntVal:
+	    format_vpiIntVal(sig, PV_get_base(rfp), rfp->width, vp);
+	    break;
+
+	  case vpiBinStrVal:
+	    format_vpiBinStrVal(sig, PV_get_base(rfp), rfp->width, vp);
+	    break;
+
+	  case vpiOctStrVal:
+	    format_vpiOctStrVal(sig, PV_get_base(rfp), rfp->width, vp);
+	    break;
+
+	  case vpiHexStrVal:
+	    format_vpiHexStrVal(sig, PV_get_base(rfp), rfp->width, vp);
+	    break;
+
+	  case vpiDecStrVal:
+	    format_vpiDecStrVal(sig, PV_get_base(rfp), rfp->width, 0, vp);  // 0 sits where signal_get_value passes signed_flag
+	    break;
+
+	  case vpiStringVal:
+	    format_vpiStringVal(sig, PV_get_base(rfp), rfp->width, vp);
+	    break;
+
+	  case vpiScalarVal:
+	    format_vpiScalarVal(sig, PV_get_base(rfp), vp);  // takes no width argument
+	    break;
+
+	  case vpiStrengthVal:
+	    format_vpiStrengthVal(sig, PV_get_base(rfp), rfp->width, vp);
+	    break;
+
+	  case vpiVectorVal:
+	    format_vpiVectorVal(sig, PV_get_base(rfp), rfp->width, vp);
+	    break;
+
+	  case vpiRealVal:
+	    format_vpiRealVal(sig, PV_get_base(rfp), rfp->width, 0, vp);  // 0 sits where signal_get_value passes signed_flag
+	    break;
+
+	  default:
+	    fprintf(stderr, "vvp internal error: PV_get_value: "
+		    "value type %u not implemented. Signal is %s.\n",
+		    vp->format, vpi_get_str(vpiFullName, rfp->parent));
+	    assert(0);  // unsupported formats are treated as an internal error
+      }
+}
+
+static vpiHandle PV_put_value(vpiHandle ref, p_vpi_value vp, int)  // write an integer into the selected bits
+{
+      assert(ref->vpi_type->type_code == vpiPartSelect);
+      struct __vpiPV*rfp = (struct __vpiPV*)ref;
+      vvp_fun_signal_vec*sig = dynamic_cast<vvp_fun_signal_vec*>(rfp->net->fun);  // the functor, as in PV_get_value
+      assert(sig);
+
+      unsigned width = rfp->width;
+      int base = PV_get_base(rfp);
+      if (base >= (signed) sig->size() || base + (signed) width <= 0) return 0;  // select misses the signal entirely
+      if (base < 0) {  // clip the part that lies below bit 0
+	    width += base;
+	    base = 0;
+      }
+      if (base+width > sig->size()) width = sig->size() - base;  // clip the part above the top bit
+
+      bool full_sig = base == 0 && width == sig->size();
+
+      vvp_net_ptr_t ptr (rfp->net, 0);
+
+/* We only support integer values. */
+      assert(vp->format == vpiIntVal);
+      if (full_sig) {
+	    vvp_send_long(ptr, vp->value.integer);
+      } else {
+	    vvp_send_long_pv(ptr, vp->value.integer, base, width);
+      }
+
+      return 0;  // no handle is returned to the caller
+}
+
+static vpiHandle PV_get_handle(int code, vpiHandle ref)  // handle-valued properties of a part select
+{
+      assert(ref->vpi_type->type_code==vpiPartSelect);
+      struct __vpiPV*rfp = (struct __vpiPV*)ref;
+
+      switch (code) {
+
+	  case vpiParent:  // the only property handled here
+	    return rfp->parent;
+	    break;
+      }
+
+      return 0;  // every other code yields a null handle
+}
+
+static const struct __vpirt vpip_PV_rt = {  // run-time type record installed by vpip_make_PV
+      vpiPartSelect,  // type code that every PV_* function asserts on
+      PV_get,
+      PV_get_str,
+      PV_get_value,
+      PV_put_value,
+      PV_get_handle,
+      0  // NOTE(review): meaning of this last slot is not visible here -- confirm against struct __vpirt
+};
+
+vpiHandle vpip_make_PV(char*var, int base, int width)  // part select of var with a fixed base
+{
+
+      struct __vpiPV*obj = (struct __vpiPV*) malloc(sizeof(struct __vpiPV));
+      obj->base.vpi_type = &vpip_PV_rt;
+      obj->parent = vvp_lookup_handle(var);
+      obj->tbase = base;
+      obj->twid = 0;  // twid == 0 makes PV_get_base return tbase directly
+      obj->width = (unsigned) width;
+      obj->net = (vvp_net_t*) malloc(sizeof(vvp_net_t));  // NOTE(review): presumably overwritten by functor_ref_lookup, which would leak this -- confirm
+      functor_ref_lookup(&obj->net, var);
+
+      return &obj->base;
+}
+
+vpiHandle vpip_make_PV(char*var, int tbase, int twid, int width)  // part select of var whose base is read from thread bits
+{
+      struct __vpiPV*obj = (struct __vpiPV*) malloc(sizeof(struct __vpiPV));
+      obj->base.vpi_type = &vpip_PV_rt;
+      obj->parent = vvp_lookup_handle(var);
+      obj->tbase = tbase;  // first thread bit that PV_get_base reads
+      obj->twid = (unsigned) twid;  // number of thread bits that PV_get_base reads
+      obj->width = (unsigned) width;
+      obj->net = (vvp_net_t*) malloc(sizeof(vvp_net_t));  // NOTE(review): presumably overwritten by functor_ref_lookup, which would leak this -- confirm
+      functor_ref_lookup(&obj->net, var);
+
+      return &obj->base;
+}
diff --git a/vvp/vpi_tasks.cc b/vvp/vpi_tasks.cc
index f82a43c41..fdad4ffc3 100644
--- a/vvp/vpi_tasks.cc
+++ b/vvp/vpi_tasks.cc
@@ -167,6 +167,8 @@ static vpiHandle sysfunc_put_value(vpiHandle ref, p_vpi_value vp, int)
 
       struct __vpiSysTaskCall*rfp = (struct __vpiSysTaskCall*)ref;
 
+      rfp->put_value = true;
+
       assert(rfp->vbit >= 4);
 
       switch (vp->format) {
@@ -271,6 +273,8 @@ static vpiHandle sysfunc_put_real_value(vpiHandle ref, p_vpi_value vp, int)
 
       struct __vpiSysTaskCall*rfp = (struct __vpiSysTaskCall*)ref;
 
+      rfp->put_value = true;
+
 	/* Make sure this is a real valued function. */
       assert(rfp->vwid == -vpiRealConst);
 
@@ -297,6 +301,8 @@ static vpiHandle sysfunc_put_4net_value(vpiHandle ref, p_vpi_value vp, int)
 
       struct __vpiSysTaskCall*rfp = (struct __vpiSysTaskCall*)ref;
 
+      rfp->put_value = true;
+
       unsigned vwid = (unsigned) rfp->vwid;
       vvp_vector4_t val (vwid);
 
@@ -384,8 +390,10 @@ static vpiHandle sysfunc_put_rnet_value(vpiHandle ref, p_vpi_value vp, int)
       assert(ref->vpi_type->type_code == vpiSysFuncCall);
 
       struct __vpiSysTaskCall*rfp = (struct __vpiSysTaskCall*)ref;
-      double val;
 
+      rfp->put_value = true;
+
+      double val;
       switch (vp->format) {
 
 	  case vpiRealVal:
@@ -563,6 +571,7 @@ vpiHandle vpip_build_vpi_call(const char*name, unsigned vbit, int vwid,
       obj->file_idx  = (unsigned) file_idx;
       obj->lineno   = (unsigned) lineno;
       obj->userdata  = 0;
+      obj->put_value = false;
 
       compile_compiletf(obj);
 
@@ -590,8 +599,23 @@ void vpip_execute_vpi_call(vthread_t thr, vpiHandle ref)
       if (vpip_cur_task->defn->info.calltf) {
 	    assert(vpi_mode_flag == VPI_MODE_NONE);
 	    vpi_mode_flag = VPI_MODE_CALLTF;
+	    vpip_cur_task->put_value = false;
 	    vpip_cur_task->defn->info.calltf(vpip_cur_task->defn->info.user_data);
 	    vpi_mode_flag = VPI_MODE_NONE;
+	      /* If the function call did not set a value then put a
+	       * default value (0). */
+	    if (ref->vpi_type->type_code == vpiSysFuncCall &&
+	        !vpip_cur_task->put_value) {
+		  s_vpi_value val;
+		  if (vpip_cur_task->vwid == -vpiRealConst) {
+			val.format = vpiRealVal;
+			val.value.real = 0.0;
+		  } else {
+			val.format = vpiIntVal;
+			val.value.integer = 0;
+		  }
+		  vpi_put_value(ref, &val, 0, vpiNoDelay);
+	    }
       }
 }
 
diff --git a/vvp/vpi_vthr_vector.cc b/vvp/vpi_vthr_vector.cc
index 72a043fe5..85082b935 100644
--- a/vvp/vpi_vthr_vector.cc
+++ b/vvp/vpi_vthr_vector.cc
@@ -67,7 +67,7 @@ void set_bit(struct __vpiVThrVec *rfp, unsigned idx, vvp_bit4_t bit)
 
 extern const char hex_digits[256];
 
-extern const char oct_digits[256];
+extern const char oct_digits[64];
 
 /*
  *  vpi_get
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index a7cf69f05..0f00ac35f 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -214,6 +214,85 @@ static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr,
       }
 }
 
+/*
+ * Some of the instructions do wide addition to arrays of long. They
+ * use this add_with_carry function to help.
+ */
+static inline unsigned long add_with_carry(unsigned long a, unsigned long b,
+					   unsigned long&carry)
+{
+      unsigned long tmp = b + carry;  // carry is the carry-in on entry
+      unsigned long sum = a + tmp;
+      carry = 0;  // on exit carry is 0 or 1: the carry-out
+      if (tmp < b)  // b + carry wrapped around
+	    carry = 1;
+      if (sum < tmp)  // a + tmp wrapped around
+	    carry = 1;
+      if (sum < a)
+	    carry = 1;
+      return sum;
+}
+
+static unsigned long multiply_with_carry(unsigned long a, unsigned long b,
+					 unsigned long&carry)
+{     // Multiply a*b by half-words: returns the low word, stores the high word in carry.
+      const unsigned long mask = (1UL << (CPU_WORD_BITS/2)) - 1;  // selects the low half of a word
+      unsigned long a0 = a & mask;
+      unsigned long a1 = (a >> (CPU_WORD_BITS/2)) & mask;
+      unsigned long b0 = b & mask;
+      unsigned long b1 = (b >> (CPU_WORD_BITS/2)) & mask;
+
+      unsigned long tmp = a0 * b0;  // each partial product is split into low (r..) and high (c..) halves
+
+      unsigned long r00 = tmp & mask;
+      unsigned long c00 = (tmp >> (CPU_WORD_BITS/2)) & mask;
+
+      tmp = a0 * b1;
+
+      unsigned long r01 = tmp & mask;
+      unsigned long c01 = (tmp >> (CPU_WORD_BITS/2)) & mask;
+
+      tmp = a1 * b0;
+
+      unsigned long r10 = tmp & mask;
+      unsigned long c10 = (tmp >> (CPU_WORD_BITS/2)) & mask;
+
+      tmp = a1 * b1;
+
+      unsigned long r11 = tmp & mask;
+      unsigned long c11 = (tmp >> (CPU_WORD_BITS/2)) & mask;
+
+      unsigned long r1 = c00 + r01 + r10;  // second half-word column; its overflow moves into r2
+      unsigned long r2 = (r1 >> (CPU_WORD_BITS/2)) & mask;
+      r1 &= mask;
+      r2 += c01 + c10 + r11;  // third half-word column; its overflow moves into r3
+      unsigned long r3 = (r2 >> (CPU_WORD_BITS/2)) & mask;
+      r2 &= mask;
+      r3 += c11;
+      r3 &= mask;
+
+      carry = (r3 << (CPU_WORD_BITS/2)) + r2;  // high word = {r3, r2}
+      return (r1 << (CPU_WORD_BITS/2)) + r00;  // low word = {r1, r00}
+}
+
+static void multiply_array_imm(unsigned long*res, unsigned long*val,
+			       unsigned words, unsigned long imm)
+{     // res[0..words) = val[0..words) * imm, keeping only the low `words` words
+      for (unsigned idx = 0 ; idx < words ; idx += 1)
+	    res[idx] = 0;
+
+      for (unsigned mul_idx = 0 ; mul_idx < words ; mul_idx += 1) {
+	    unsigned long sum;  // receives the high word of val[mul_idx]*imm
+	    unsigned long tmp = multiply_with_carry(val[mul_idx], imm, sum);
+
+	    unsigned long carry = 0;
+	    res[mul_idx] = add_with_carry(res[mul_idx], tmp, carry);
+	    for (unsigned add_idx = mul_idx+1 ; add_idx < words ; add_idx += 1) {  // add the high word once, then ripple the carry upward
+		  res[add_idx] = add_with_carry(res[add_idx], sum, carry);
+		  sum = 0;
+	    }
+      }
+}
 
 /*
  * Create a new thread with the given start address.
@@ -454,6 +533,28 @@ bool of_AND(vthread_t thr, vvp_code_t cp)
 }
 
 
+bool of_ANDI(vthread_t thr, vvp_code_t cp)  // AND a thread vector with an immediate value, in place
+{
+      unsigned idx1 = cp->bit_idx[0];  // thread address of the vector operand and destination
+      unsigned long imm = cp->bit_idx[1];  // the immediate operand
+      unsigned wid = cp->number;
+
+      assert(idx1 >= 4);
+
+      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
+      vvp_vector4_t imv (wid, BIT4_0);  // immediate widened to wid bits; bits above one CPU word stay 0
+
+      unsigned trans = wid;
+      if (trans > CPU_WORD_BITS)  // at most one CPU word of imm is copied in
+	    trans = CPU_WORD_BITS;
+      imv.setarray(0, trans, &imm);
+
+      val &= imv;
+
+      thr->bits4.set_vec(idx1, val);  // result overwrites the operand
+      return true;
+}
+
 bool of_ADD(vthread_t thr, vvp_code_t cp)
 {
       assert(cp->bit_idx[0] >= 4);
@@ -465,19 +566,8 @@ bool of_ADD(vthread_t thr, vvp_code_t cp)
 
       unsigned long carry;
       carry = 0;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1) {
-
-	    unsigned long tmp = lvb[idx] + carry;
-	    unsigned long sum = lva[idx] + tmp;
-	    carry = 0;
-	    if (tmp < lvb[idx])
-		  carry = 1;
-	    if (sum < tmp)
-		  carry = 1;
-	    if (sum < lva[idx])
-		  carry = 1;
-	    lva[idx] = sum;
-      }
+      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1)
+	    lva[idx] = add_with_carry(lva[idx], lvb[idx], carry);
 
 	/* We know from the vector_to_array that the address is valid
 	   in the thr->bitr4 vector, so just do the set bit. */
@@ -525,30 +615,15 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp)
       unsigned word_count = (bit_width+CPU_WORD_BITS-1)/CPU_WORD_BITS;
 
       unsigned long*lva = vector_to_array(thr, bit_addr, bit_width);
-      unsigned long*lvb = 0;
       if (lva == 0)
 	    goto x_out;
 
-      lvb = new unsigned long[word_count];
-
-      lvb[0] = imm_value;
-      for (unsigned idx = 1 ;  idx < word_count ;  idx += 1)
-	    lvb[idx] = 0;
 
       unsigned long carry;
       carry = 0;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < bit_width ;  idx += 1) {
-
-	    unsigned long tmp = lvb[idx] + carry;
-	    unsigned long sum = lva[idx] + tmp;
-	    carry = 0;
-	    if (tmp < lvb[idx])
-		  carry = 1;
-	    if (sum < tmp)
-		  carry = 1;
-	    if (sum < lva[idx])
-		  carry = 1;
-	    lva[idx] = sum;
+      for (unsigned idx = 0 ;  idx < word_count ;  idx += 1) {
+	    lva[idx] = add_with_carry(lva[idx], imm_value, carry);
+	    imm_value = 0;
       }
 
 	/* We know from the vector_to_array that the address is valid
@@ -557,7 +632,6 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp)
       thr->bits4.setarray(bit_addr, bit_width, lva);
 
       delete[]lva;
-      delete[]lvb;
 
       return true;
 
@@ -1028,7 +1102,7 @@ static bool of_CMPIU_the_hard_way(vthread_t thr, vvp_code_t cp)
 {
 
       unsigned idx1 = cp->bit_idx[0];
-      unsigned imm  = cp->bit_idx[1];
+      unsigned long imm  = cp->bit_idx[1];
       unsigned wid  = cp->number;
       if (idx1 >= 4)
 	    thr_check_addr(thr, idx1+wid-1);
@@ -1042,8 +1116,8 @@ static bool of_CMPIU_the_hard_way(vthread_t thr, vvp_code_t cp)
 
       vvp_bit4_t eq  = BIT4_0;
       for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
-	    vvp_bit4_t rv = (imm & 1)? BIT4_1 : BIT4_0;
-	    imm >>= 1;
+	    vvp_bit4_t rv = (imm & 1UL)? BIT4_1 : BIT4_0;
+	    imm >>= 1UL;
 
 	    if (bit4_is_xz(lv)) {
 		  eq = BIT4_X;
@@ -1477,297 +1551,268 @@ bool of_DISABLE(vthread_t thr, vvp_code_t cp)
       return ! disabled_myself_flag;
 }
 
-static void divide_bits(unsigned len, unsigned char*lbits,
-			const unsigned char*rbits)
+/*
+ * This function divides a 2-word number {high, a} by a 1-word
+ * number. Assume that high < b.
+ */
+static unsigned long divide2words(unsigned long a, unsigned long b,
+				  unsigned long high)
 {
-      unsigned char *a, *b, *z, *t;
-      a = new unsigned char[len+1];
-      b = new unsigned char[len+1];
-      z = new unsigned char[len+1];
-      t = new unsigned char[len+1];
+      unsigned long result = 0;
+      while (high > 0) {
+	    unsigned long tmp_result = ULONG_MAX / b;
+	    unsigned long remain = ULONG_MAX % b;
 
-      unsigned char carry;
-      unsigned char temp;
-
-      int mxa = -1, mxz = -1;
-      int i;
-      int current, copylen;
-
-
-      for (unsigned idx = 0 ;  idx < len ;  idx += 1) {
-	    unsigned lb = lbits[idx];
-	    unsigned rb = rbits[idx];
-
-	    z[idx]=lb;
-	    a[idx]=1-rb;	// for 2s complement add..
-
-      }
-      z[len]=0;
-      a[len]=1;
-
-      for(i=0;i<(int)len+1;i++) {
-	    b[i]=0;
-      }
-
-      for(i=len-1;i>=0;i--) {
-	    if(!a[i]) {
-		  mxa=i;
-		  break;
-	    }
-      }
-
-      for(i=len-1;i>=0;i--) {
-	    if(z[i]) {
-		  mxz=i;
-		  break;
-	    }
-      }
-
-      if((mxa>mxz)||(mxa==-1)) {
-	    if(mxa==-1) {
-		  fprintf(stderr, "Division By Zero error, exiting.\n");
-		  exit(255);
+	    remain += 1;
+	    if (remain >= b) {
+		  remain -= b;
+		  tmp_result += 1;  // adjust the quotient of 0x1_0...0 / b, which is scaled by high below
 	    }
 
-	    goto tally;
+	      // Now 0x1_0...0 = b*tmp_result + remain
+	      // high*0x1_0...0 = high*(b*tmp_result + remain)
+	      // high*0x1_0...0 = high*b*tmp_result + high*remain
+
+	      // We know that high*0x1_0...0 >= high*b*tmp_result, and
+	      // we know that high*0x1_0...0 > high*remain. Use
+	      // high*remain as the remainder for another iteration,
+	      // and add tmp_result*high into the current estimate of
+	      // the result.
+	    result += tmp_result * high;
+
+	      // The new iteration starts with high*remain + a.
+	    remain = multiply_with_carry(high, remain, high);
+	    a += remain;
+	    if (a < remain) high += 1;  // carry out of the low word goes into the new high word
+	      // Now result*b + {high,a} == the input {high,a}. It is
+	      // possible that the new high >= 1. If so, it will
+	      // certainly be less than high from the previous
+	      // iteration. Do another iteration and it will shrink,
+	      // eventually to 0.
       }
 
-      copylen = mxa + 2;
-      current = mxz - mxa;
+	// high is now 0, so a is the remaining remainder, so we can
+	// finish off the integer divide with a simple a/b.
 
-      while(current > -1) {
-	    carry = 1;
-	    for(i=0;i<copylen;i++) {
-		  temp = z[i+current] + a[i] + carry;
-		  t[i] = (temp&1);
-		  carry = (temp>>1);
+      return result + a/b;
+}
+
+static unsigned long* divide_bits(unsigned long*ap, unsigned long*bp, unsigned wid)
+{     // Returns a new[] quotient of ap/bp (0 on divide by zero); ap is left holding the remainder.
+
+      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
+
+      unsigned btop = words-1;  // index of the most significant non-zero word of bp
+      while (btop > 0 && bp[btop] == 0)
+	    btop -= 1;
+
+	// Detect divide by 0, and exit.
+      if (btop==0 && bp[0]==0)
+	    return 0;
+
+      unsigned long*diff  = new unsigned long[words];
+      unsigned long*result= new unsigned long[words];
+      for (unsigned idx = 0 ; idx < words ; idx += 1)
+	    result[idx] = 0;
+
+      for (unsigned cur = words-btop ; cur > 0 ; cur -= 1) {
+	    unsigned cur_ptr = cur-1;
+	    unsigned long cur_res;
+	    if (ap[cur_ptr+btop] >= bp[btop]) {
+		  cur_res = ap[cur_ptr+btop] / bp[btop];
+
+	    } else if (cur_ptr+btop+1 >= words) {
+		  continue;
+
+	    } else if (ap[cur_ptr+btop+1] == 0) {
+		  continue;
+
+	    } else {
+		  cur_res = divide2words(ap[cur_ptr+btop], bp[btop],
+					 ap[cur_ptr+btop+1]);
 	    }
 
-	    if(carry) {
-		  for(i=0;i<copylen;i++) {
-			z[i+current] = t[i];
-		  }
-		  b[current] = 1;
+	      // cur_res is a guesstimate of the result this far. It
+	      // may be 1 too big. (But it will also be >0) Try it,
+	      // and if the difference comes out negative, then adjust
+	      // then.
+
+	    multiply_array_imm(diff+cur_ptr, bp, words-cur_ptr, cur_res);
+	    unsigned long carry = 1;
+	    for (unsigned idx = cur_ptr ; idx < words ; idx += 1)
+		  ap[idx] = add_with_carry(ap[idx], ~diff[idx], carry);
+
+	      // ap has the diff subtracted out of it. If cur_res was
+	      // too large, then ap will turn negative. (We easily
+	      // tell that ap turned negative by looking at
+	      // carry&1. If it is 0, then it is *negative*.) In that
+	      // case, we know that cur_res was too large by 1. Correct by
+	      // adding 1b back in and reducing cur_res.
+	    if ((carry&1) == 0) {  // parenthesized: == binds tighter than &
+		  cur_res -= 1;
+		  carry = 0;
+		  for (unsigned idx = cur_ptr ; idx < words ; idx += 1)
+			ap[idx] = add_with_carry(ap[idx], bp[idx-cur_ptr], carry);
+		    // The sign *must* have changed again.
+		  assert(carry == 1);
 	    }
 
-	    current--;
+	    result[cur_ptr] = cur_res;
       }
 
- tally:
-      for (unsigned idx = 0 ;  idx < len ;  idx += 1) {
-	      // n.b., z[] has the remainder...
-	    lbits[idx] = b[idx];
-      }
+	// Now ap contains the remainder and result contains the
+	// desired result. We should find that:
+	//  input-a = bp * result + ap;
 
-      delete []t;
-      delete []z;
-      delete []b;
-      delete []a;
+      delete[]diff;
+      return result;
 }
 
 bool of_DIV(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
+      unsigned adra = cp->bit_idx[0];  // thread address of the dividend; also receives the result
+      unsigned adrb = cp->bit_idx[1];  // thread address of the divisor
+      unsigned wid = cp->number;
 
-      if(cp->number <= 8*sizeof(unsigned long)) {
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned idx2 = cp->bit_idx[1];
-	    unsigned long lv = 0, rv = 0;
+      assert(adra >= 4);
 
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  vvp_bit4_t lb = thr_get_bit(thr, idx1);
-		  vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
-		  if (bit4_is_xz(lb) || bit4_is_xz(rb))
-			goto x_out;
-
-		  lv |= (unsigned long) lb << idx;
-		  rv |= (unsigned long) rb << idx;
-
-		  idx1 += 1;
-		  if (idx2 >= 4)
-			idx2 += 1;
-	    }
-
-	    if (rv == BIT4_0)
-		  goto x_out;
-
-	    lv /= rv;
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1) ? BIT4_1 : BIT4_0);
-		  lv >>= 1;
-	    }
-
-	    return true;
-
-      } else {
-
-	      /* Make a string of the bits of the numbers to be
-		 divided. Then divide them, and write the results into
-		 the thread. */
-	    unsigned char*lbits = new unsigned char[cp->number];
-	    unsigned char*rbits = new unsigned char[cp->number];
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned idx2 = cp->bit_idx[1];
-	    bool rval_is_zero = true;
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  lbits[idx] = thr_get_bit(thr, idx1);
-		  rbits[idx] = thr_get_bit(thr, idx2);
-		  if ((lbits[idx] | rbits[idx]) > 1) {
-			delete[]lbits;
-			delete[]rbits;
-			goto x_out;
-		  }
-
-		  if (rbits[idx] != 0)
-			rval_is_zero = false;
-
-		  idx1 += 1;
-		  if (idx2 >= 4)
-			idx2 += 1;
-	    }
-
-	      /* Notice the special case of divide by 0. */
-	    if (rval_is_zero) {
-		  delete[]lbits;
-		  delete[]rbits;
-		  goto x_out;
-	    }
-
-	    divide_bits(cp->number, lbits, rbits);
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, lbits[idx]?BIT4_1:BIT4_0);
-	    }
-
-	    delete[]lbits;
-	    delete[]rbits;
+      unsigned long*ap = vector_to_array(thr, adra, wid);
+      if (ap == 0) {  // no word array for the dividend (presumably x/z bits -- confirm): result is all X
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
 	    return true;
       }
 
- x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
+      unsigned long*bp = vector_to_array(thr, adrb, wid);
+      if (bp == 0) {  // no word array for the divisor: result is all X
+	    delete[]ap;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
+      }
 
+	// If the value fits in a single CPU word, then do it the easy way.
+      if (wid <= CPU_WORD_BITS) {
+	    if (bp[0] == 0) {  // divide by zero yields all X
+		  vvp_vector4_t tmp(wid, BIT4_X);
+		  thr->bits4.set_vec(adra, tmp);
+	    } else {
+		  ap[0] /= bp[0];
+		  thr->bits4.setarray(adra, wid, ap);
+	    }
+	    delete[]ap;
+	    delete[]bp;
+	    return true;
+      }
+
+      unsigned long*result = divide_bits(ap, bp, wid);
+      if (result == 0) {  // divide_bits returns 0 for a zero divisor: result is all X
+	    delete[]ap;
+	    delete[]bp;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
+      }
+
+	// Now ap contains the remainder and result contains the
+	// desired result. We should find that:
+	//  input-a = bp * result + ap;
+
+      thr->bits4.setarray(adra, wid, result);
+      delete[]ap;
+      delete[]bp;
+      delete[]result;
       return true;
 }
 
-static void negate_bits(unsigned len, unsigned char*bits)
+
+static void negate_words(unsigned long*val, unsigned words)  // in-place two's complement of a multi-word value
 {
-      unsigned char carry = 1;
-      for (unsigned idx = 0 ;  idx < len ;  idx += 1) {
-	    carry += bits[idx]? 0 : 1;
-	    bits[idx] = carry & 1;
-	    carry >>= 1;
-      }
+      unsigned long carry = 1;  // the "+1" of (~val + 1), rippled through the words
+      for (unsigned idx = 0 ; idx < words ; idx += 1)
+	    val[idx] = add_with_carry(0, ~val[idx], carry);
 }
 
 bool of_DIV_S(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
+      unsigned adra = cp->bit_idx[0];
+      unsigned adrb = cp->bit_idx[1];
+      unsigned wid = cp->number;
+      unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;
 
-      if(cp->number <= 8*sizeof(long)) {
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned idx2 = cp->bit_idx[1];
-	    long lv = 0, rv = 0;
+      assert(adra >= 4);
 
-	    unsigned lb = 0;
-	    unsigned rb = 0;
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  lb = thr_get_bit(thr, idx1);
-		  rb = thr_get_bit(thr, idx2);
-
-		  if ((lb | rb) & 2)
-			goto x_out;
-
-		  lv |= (long)lb << idx;
-		  rv |= (long)rb << idx;
-
-		  idx1 += 1;
-		  if (idx2 >= 4)
-			idx2 += 1;
-	    }
-
-	      /* Extend the sign to fill the native long. */
-	    for (unsigned idx = cp->number; idx < (8*sizeof lv); idx += 1) {
-		  lv |= (long)lb << idx;
-		  rv |= (long)rb << idx;
-	    }
-
-	    if (rv == 0)
-		  goto x_out;
-
-	    lv /= rv;
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)?BIT4_1:BIT4_0);
-		  lv >>= 1;
-	    }
-
-      } else {
-	    unsigned char*lbits = new unsigned char[cp->number];
-	    unsigned char*rbits = new unsigned char[cp->number];
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned idx2 = cp->bit_idx[1];
-	    bool rval_is_zero = true;
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  lbits[idx] = thr_get_bit(thr, idx1);
-		  rbits[idx] = thr_get_bit(thr, idx2);
-		  if ((lbits[idx] | rbits[idx]) > 1) {
-			delete[]lbits;
-			delete[]rbits;
-			goto x_out;
-		  }
-
-		  if (rbits[idx] != 0)
-			rval_is_zero = false;
-
-		  idx1 += 1;
-		  if (idx2 >= 4)
-			idx2 += 1;
-	    }
-
-	      /* Notice the special case of divide by 0. */
-	    if (rval_is_zero) {
-		  delete[]lbits;
-		  delete[]rbits;
-		  goto x_out;
-	    }
-
-	      /* Signed division is unsigned division on the absolute
-		 values of the operands, then corrected for the number
-		 of signs. */
-	    unsigned sign_flag = 0;
-	    if (lbits[cp->number-1]) {
-		  sign_flag += 1;
-		  negate_bits(cp->number, lbits);
-	    }
-	    if (rbits[cp->number-1]) {
-		  sign_flag += 1;
-		  negate_bits(cp->number, rbits);
-	    }
-
-	    divide_bits(cp->number, lbits, rbits);
-
-	    if (sign_flag & 1) {
-		  negate_bits(cp->number, lbits);
-	    }
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, lbits[idx]?BIT4_1:BIT4_0);
-	    }
-
-	    delete[]lbits;
-	    delete[]rbits;
+      unsigned long*ap = vector_to_array(thr, adra, wid);
+      if (ap == 0) {
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
       }
 
-      return true;
+      unsigned long*bp = vector_to_array(thr, adrb, wid);
+      if (bp == 0) {
+	    delete[]ap;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
+      }
 
- x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
+      unsigned long sign_mask = 0;
+      if (unsigned long sign_bits = (words*CPU_WORD_BITS) - wid) {
+	    sign_mask = -1UL << (CPU_WORD_BITS-sign_bits);
+	    if (ap[words-1] & (sign_mask>>1))
+		  ap[words-1] |= sign_mask;
+	    if (bp[words-1] & (sign_mask>>1))
+		  bp[words-1] |= sign_mask;
+      }
 
+      if (wid <= CPU_WORD_BITS) {
+	    if (bp[0] == 0) {
+		  vvp_vector4_t tmp(wid, BIT4_X);
+		  thr->bits4.set_vec(adra, tmp);
+	    } else {
+		  long tmpa = (long) ap[0], tmpb = (long) bp[0];
+		    // LONG_MIN / -1 overflows (undefined, traps on x86); negate.
+		  long res = (tmpb == -1)? (long)(0UL - ap[0]) : tmpa / tmpb;
+		  ap[0] = ((unsigned long)res) & ~sign_mask;
+		  thr->bits4.setarray(adra, wid, ap);
+	    }
+	    delete[]ap;
+	    delete[]bp;
+	    return true;
+      }
+
+	// We need to do the actual division on positive integers. Make
+	// them positive here, and remember the negations.
+      bool negate_flag = false;
+      if ( ((long) ap[words-1]) < 0 ) {
+	    negate_flag = true;
+	    negate_words(ap, words);
+      }
+      if ( ((long) bp[words-1]) < 0 ) {
+	    negate_flag ^= true;
+	    negate_words(bp, words);
+      }
+
+      unsigned long*result = divide_bits(ap, bp, wid);
+      if (result == 0) {
+	    delete[]ap;
+	    delete[]bp;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
+      }
+
+      if (negate_flag) {
+	    negate_words(result, words);
+      }
+
+      result[words-1] &= ~sign_mask;
+
+      thr->bits4.setarray(adra, wid, result);
+      delete[]ap;
+      delete[]bp;
+      delete[]result;
       return true;
 }
 
@@ -2442,7 +2487,7 @@ bool of_LOAD_NX(vthread_t thr, vvp_code_t cp)
  * The functor to read from is the vvp_net_t object pointed to by the
  * cp->net pointer.
  */
-vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
+static vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
 {
       assert(cp->bit_idx[0] >= 4);
       assert(cp->bit_idx[1] > 0);
@@ -2456,9 +2501,8 @@ vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
       if (sig == 0) {
 	    cerr << "%%load/v error: Net arg not a vector signal? "
 		 << typeid(*net->fun).name() << endl;
+	    assert(sig);
       }
-      assert(sig);
-
 
       vvp_vector4_t sig_value = sig->vec4_value();
       sig_value.resize(wid);
@@ -2498,15 +2542,35 @@ bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
       vvp_vector4_t sig_value(wid, BIT4_0);
       sig_value.copy_bits(load_base(thr, cp));
 
-	/* Add the addend value */
-      sig_value += addend;
-
 	/* Check the address once, before we scan the vector. */
       thr_check_addr(thr, bit+wid-1);
 
+      unsigned long*val = sig_value.subarray(0, wid);
+      if (val == 0) {
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(bit, tmp);
+	    return true;
+      }
+
+      unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;
+      unsigned long carry = 0;
+      unsigned long imm = addend;
+      if (addend >= 0) {
+	    for (unsigned idx = 0 ; idx < words ; idx += 1) {
+		  val[idx] = add_with_carry(val[idx], imm, carry);
+		  imm = 0UL;
+	    }
+      } else {
+	    for (unsigned idx = 0 ; idx < words ; idx += 1) {
+		  val[idx] = add_with_carry(val[idx], imm, carry);
+		  imm = -1UL;
+	    }
+      }
+
 	/* Copy the vector bits into the bits4 vector. Do the copy
 	   directly to skip the excess calls to thr_check_addr. */
-      thr->bits4.set_vec(bit, sig_value);
+      thr->bits4.setarray(bit, wid, val);
+      delete[]val;
 
       return true;
 }
@@ -2908,114 +2972,83 @@ bool of_MOV_WR(vthread_t thr, vvp_code_t cp)
 bool of_MOVI(vthread_t thr, vvp_code_t cp)
 {
       unsigned dst = cp->bit_idx[0];
-      unsigned val = cp->bit_idx[1];
+      static unsigned long val[8] = {0, 0, 0, 0, 0, 0, 0, 0};
       unsigned wid = cp->number;
 
       thr_check_addr(thr, dst+wid-1);
 
-      for (unsigned idx = 0 ;  idx < wid ;  idx += 1, val >>= 1)
-	    thr->bits4.set_bit(dst+idx, (val&1)? BIT4_1 : BIT4_0);
+      val[0] = cp->bit_idx[1];
+
+      while (wid > 0) {
+	    unsigned trans = wid;
+	    if (trans > 8*CPU_WORD_BITS)
+		  trans = 8*CPU_WORD_BITS;
+
+	    thr->bits4.setarray(dst, trans, val);
+
+	    val[0] = 0;
+	    wid -= trans;
+	    dst += trans;
+      }
 
       return true;
 }
 
 bool of_MUL(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
-      if(cp->number <= 8*sizeof(unsigned long)) {
+      unsigned adra = cp->bit_idx[0];
+      unsigned adrb = cp->bit_idx[1];
+      unsigned wid = cp->number;
 
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned long lv = 0, rv = 0;
+      assert(adra >= 4);
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
-	    if (bit4_is_xz(lb) || bit4_is_xz(rb))
-		  goto x_out;
-
-	    lv |= (unsigned long) lb << idx;
-	    rv |= (unsigned long) rb << idx;
-
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
+      unsigned long*ap = vector_to_array(thr, adra, wid);
+      if (ap == 0) {
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
       }
 
-      lv *= rv;
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1) ? BIT4_1 : BIT4_0);
-	    lv >>= 1;
+      unsigned long*bp = vector_to_array(thr, adrb, wid);
+      if (bp == 0) {
+	    delete[]ap;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
       }
 
-      return true;
-      } else {
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
+	// If the value fits in a single CPU word, then do it the easy way.
+      if (wid <= CPU_WORD_BITS) {
+	    ap[0] *= bp[0];
+	    thr->bits4.setarray(adra, wid, ap);
+	    delete[]ap;
+	    delete[]bp;
+	    return true;
+      }
 
-      unsigned char *a, *b, *sum;
-      a = new unsigned char[cp->number];
-      b = new unsigned char[cp->number];
-      sum = new unsigned char[cp->number];
+      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
+      unsigned long*res = new unsigned long[words];
+      for (unsigned idx = 0 ; idx < words ; idx += 1)
+	    res[idx] = 0;
 
-      int mxa = -1;
-      int mxb = -1;
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
-	    if (bit4_is_xz(lb) || bit4_is_xz(rb))
-		  {
-                  delete[]sum;
-                  delete[]b;
-                  delete[]a;
-		  goto x_out;
+      for (unsigned mul_a = 0 ; mul_a < words ; mul_a += 1) {
+	    for (unsigned mul_b = 0 ; mul_b < (words-mul_a) ; mul_b += 1) {
+		  unsigned long sum;
+		  unsigned long tmp = multiply_with_carry(ap[mul_a], bp[mul_b], sum);
+		  unsigned base = mul_a + mul_b;
+		  unsigned long carry = 0;
+		  res[base] = add_with_carry(res[base], tmp, carry);
+		  for (unsigned add_idx = base+1; add_idx < words; add_idx += 1) {
+			res[add_idx] = add_with_carry(res[add_idx], sum, carry);
+			sum = 0;
 		  }
-
-	    if((a[idx] = lb)) mxa=idx+1;
-	    if((b[idx] = rb)) mxb=idx;
-            sum[idx]=0;
-
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
+	    }
       }
 
-//    do "unsigned ZZ sum = a * b" the hard way..
-      for(int i=0;i<=mxb;i++)
-                {
-                if(b[i])
-                        {
-                        unsigned char carry=0;
-                        unsigned char temp;
-
-                        for(int j=0;j<=mxa;j++)
-                                {
-                                if(i+j>=(int)cp->number) break;
-                                temp=sum[i+j]+a[j]+carry;
-                                sum[i+j]=(temp&1);
-                                carry=(temp>>1);
-                                }
-                        }
-                }
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, sum[idx]?BIT4_1:BIT4_0);
-      }
-
-      delete[]sum;
-      delete[]b;
-      delete[]a;
-      return true;
-      }
-
- x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
-
+      thr->bits4.setarray(adra, wid, res);
+      delete[]ap;
+      delete[]bp;
+      delete[]res;
       return true;
 }
 
@@ -3030,101 +3063,36 @@ bool of_MUL_WR(vthread_t thr, vvp_code_t cp)
 
 bool of_MULI(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
+      unsigned adr = cp->bit_idx[0];
+      unsigned long imm = cp->bit_idx[1];
+      unsigned wid = cp->number;
 
-	/* If the value fits into a native unsigned long, then make an
-	   unsigned long variable with the numbers, to a native
-	   multiply, and work with that. */
-
-      if(cp->number <= 8*sizeof(unsigned long)) {
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned long lv = 0, rv = cp->bit_idx[1];
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  vvp_bit4_t lb = thr_get_bit(thr, idx1);
-
-		  if (bit4_is_xz(lb))
-			goto x_out;
-
-		  lv |= (unsigned long) lb << idx;
-
-		  idx1 += 1;
-	    }
-
-	    lv *= rv;
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)? BIT4_1 : BIT4_0);
-		  lv >>= 1;
-	    }
+      assert(adr >= 4);
 
+      unsigned long*val = vector_to_array(thr, adr, wid);
+	// If there are X bits in the value, then return X.
+      if (val == 0) {
+	    vvp_vector4_t tmp(cp->number, BIT4_X);
+	    thr->bits4.set_vec(cp->bit_idx[0], tmp);
 	    return true;
       }
 
-	/* number is too large for local long, so do bitwise
-	   multiply. */
-
-      unsigned idx1; idx1 = cp->bit_idx[0];
-      unsigned imm;  imm  = cp->bit_idx[1];
-
-      unsigned char *a, *b, *sum;
-      a = new unsigned char[cp->number];
-      b = new unsigned char[cp->number];
-      sum = new unsigned char[cp->number];
-
-      int mxa; mxa = -1;
-      int mxb; mxb = -1;
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = (imm & 1)? BIT4_1 : BIT4_0;
-
-	    imm >>= 1;
-
-	    if (bit4_is_xz(lb)) {
-                  delete[]sum;
-                  delete[]b;
-                  delete[]a;
-		  goto x_out;
-	    }
-
-	    if((a[idx] = lb)) mxa=idx+1;
-	    if((b[idx] = rb)) mxb=idx;
-            sum[idx]=0;
-
-	    idx1 += 1;
+	// If everything fits in a word, then do it the easy way.
+      if (wid <= CPU_WORD_BITS) {
+	    val[0] *= imm;
+	    thr->bits4.setarray(adr, wid, val);
+	    delete[]val;
+	    return true;
       }
 
-//    do "unsigned ZZ sum = a * b" the hard way..
-      for(int i=0;i<=mxb;i++) {
-	    if(b[i]) {
-		  unsigned char carry=0;
-		  unsigned char temp;
+      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
+      unsigned long*res = new unsigned long[words];
 
-		  for(int j=0;j<=mxa;j++) {
-			if(i+j>=(int)cp->number) break;
-			temp=sum[i+j]+a[j]+carry;
-			sum[i+j]=(temp&1);
-			carry=(temp>>1);
-		  }
-	    }
-      }
-
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, sum[idx]?BIT4_1:BIT4_0);
-      }
-
-      delete[]sum;
-      delete[]b;
-      delete[]a;
-
-      return true;
-
- x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
+      multiply_array_imm(res, val, words, imm);
 
+      thr->bits4.setarray(adr, wid, res);
+      delete[]val;
+      delete[]res;
       return true;
 }
 
@@ -3754,20 +3722,10 @@ bool of_SUB(vthread_t thr, vvp_code_t cp)
 	    goto x_out;
 
 
-      unsigned carry;
+      unsigned long carry;
       carry = 1;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1) {
-	    unsigned long tmp = ~lvb[idx] + carry;
-	    unsigned long sum = tmp + lva[idx];
-	    carry = 0;
-	    if (tmp < ~lvb[idx])
-		  carry = 1;
-	    if (sum < tmp)
-		  carry = 1;
-	    if (sum < lva[idx])
-		  carry = 1;
-	    lva[idx] = sum;
-      }
+      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1)
+	    lva[idx] = add_with_carry(lva[idx], ~lvb[idx], carry);
 
 
 	/* We know from the vector_to_array that the address is valid
@@ -3802,34 +3760,17 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp)
       assert(cp->bit_idx[0] >= 4);
 
       unsigned word_count = (cp->number+CPU_WORD_BITS-1)/CPU_WORD_BITS;
-
+      unsigned long imm = cp->bit_idx[1];
       unsigned long*lva = vector_to_array(thr, cp->bit_idx[0], cp->number);
-      unsigned long*lvb;
       if (lva == 0)
 	    goto x_out;
 
-      lvb = new unsigned long[word_count];
-
-
-      lvb[0] = cp->bit_idx[1];
-      lvb[0] = ~lvb[0];
-      for (unsigned idx = 1 ;  idx < word_count ;  idx += 1)
-	    lvb[idx] = ~0UL;
 
       unsigned long carry;
       carry = 1;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1) {
-
-	    unsigned long tmp = lvb[idx] + carry;
-	    unsigned long sum = lva[idx] + tmp;
-	    carry = 0UL;
-	    if (tmp < lvb[idx])
-		  carry = 1;
-	    if (sum < tmp)
-		  carry = 1;
-	    if (sum < lva[idx])
-		  carry = 1;
-	    lva[idx] = sum;
+      for (unsigned idx = 0 ;  idx < word_count ;  idx += 1) {
+	    lva[idx] = add_with_carry(lva[idx], ~imm, carry);
+	    imm = 0UL;
       }
 
 	/* We know from the vector_to_array that the address is valid
@@ -3838,7 +3779,6 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp)
       thr->bits4.setarray(cp->bit_idx[0], cp->number, lva);
 
       delete[]lva;
-      delete[]lvb;
 
       return true;
 
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index 9f9a50e55..dfc77fb40 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -61,28 +61,6 @@ vvp_bit4_t add_with_carry(vvp_bit4_t a, vvp_bit4_t b, vvp_bit4_t&c)
       }
 }
 
-vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b)
-{
-      if (a == BIT4_0)
-	    return BIT4_0;
-      if (b == BIT4_0)
-	    return BIT4_0;
-      if (bit4_is_xz(a))
-	    return BIT4_X;
-      if (bit4_is_xz(b))
-	    return BIT4_X;
-      return BIT4_1;
-}
-
-vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b)
-{
-      if (a == BIT4_1)
-	    return BIT4_1;
-      if (b == BIT4_1)
-	    return BIT4_1;
-      return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) );
-}
-
 vvp_bit4_t operator ^ (vvp_bit4_t a, vvp_bit4_t b)
 {
       if (bit4_is_xz(a))
diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h
index 3d4f6b881..18548c157 100644
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@@ -87,8 +87,21 @@ inline vvp_bit4_t bit4_z2x(vvp_bit4_t a)
 inline vvp_bit4_t operator ~ (vvp_bit4_t a)
 { return bit4_z2x((vvp_bit4_t) (((int)a) ^ 1)); }
 
-extern vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b);
-extern vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b);
+inline vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b)
+{
+      if (a==BIT4_1 || b==BIT4_1)
+	    return BIT4_1;
+      return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) );
+}
+
+inline vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b)
+{
+      if (a==BIT4_0 || b==BIT4_0)
+	    return BIT4_0;
+      return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) );
+}
+
+
 extern vvp_bit4_t operator ^ (vvp_bit4_t a, vvp_bit4_t b);
 extern ostream& operator<< (ostream&o, vvp_bit4_t a);
 
@@ -294,41 +307,47 @@ inline void vvp_vector4_t::set_bit(unsigned idx, vvp_bit4_t val)
       assert(idx < size_);
 
       unsigned long off = idx % BITS_PER_WORD;
-      unsigned long amask = 0, bmask = 0;
-      switch (val) {
-	  case BIT4_0:
-	    amask = 0;
-	    bmask = 0;
-	    break;
-	  case BIT4_1:
-	    amask = 1;
-	    bmask = 0;
-	    break;
-	  case BIT4_X:
-	    amask = 1;
-	    bmask = 1;
-	    break;
-	  case BIT4_Z:
-	    amask = 0;
-	    bmask = 1;
-	    break;
-      }
-
       unsigned long mask = 1UL << off;
-      amask <<= off;
-      bmask <<= off;
 
       if (size_ > BITS_PER_WORD) {
 	    unsigned wdx = idx / BITS_PER_WORD;
-	    abits_ptr_[wdx] &= ~mask;
-	    abits_ptr_[wdx] |= amask;
-	    bbits_ptr_[wdx] &= ~mask;
-	    bbits_ptr_[wdx] |= bmask;
+	    switch (val) {
+		case BIT4_0:
+		  abits_ptr_[wdx] &= ~mask;
+		  bbits_ptr_[wdx] &= ~mask;
+		  break;
+		case BIT4_1:
+		  abits_ptr_[wdx] |=  mask;
+		  bbits_ptr_[wdx] &= ~mask;
+		  break;
+		case BIT4_X:
+		  abits_ptr_[wdx] |=  mask;
+		  bbits_ptr_[wdx] |=  mask;
+		  break;
+		case BIT4_Z:
+		  abits_ptr_[wdx] &= ~mask;
+		  bbits_ptr_[wdx] |=  mask;
+		  break;
+	    }
       } else {
-	    abits_val_ &= ~mask;
-	    abits_val_ |= amask;
-	    bbits_val_ &= ~mask;
-	    bbits_val_ |= bmask;
+	    switch (val) {
+		case BIT4_0:
+		  abits_val_ &= ~mask;
+		  bbits_val_ &= ~mask;
+		  break;
+		case BIT4_1:
+		  abits_val_ |=  mask;
+		  bbits_val_ &= ~mask;
+		  break;
+		case BIT4_X:
+		  abits_val_ |=  mask;
+		  bbits_val_ |=  mask;
+		  break;
+		case BIT4_Z:
+		  abits_val_ &= ~mask;
+		  bbits_val_ |=  mask;
+		  break;
+	    }
       }
 }