Handle vec4 part selects / vec4 cassign / repeat statements

These features need to be adapted to the vec4 stack.
2014-01-04 22:11:07 +00:00 · 2014-01-04 22:11:07 +00:00 · e708a5b59d
parent d55e4c0552
commit e708a5b59d
10 changed files with 424 additions and 226 deletions
--- a/tgt-vvp/eval_vec4.c
+++ b/tgt-vvp/eval_vec4.c
@ -435,10 +435,12 @@ static void draw_select_vec4(ivl_expr_t expr)
      ivl_expr_t base = ivl_expr_oper2(expr);
 	// This is the part select width
      unsigned wid = ivl_expr_width(expr);
+	// Is the select base expression signed or unsigned?
+      char sign_suff = ivl_expr_signed(base)? 's' : 'u';

      draw_eval_vec4(subexpr, 0);
      draw_eval_vec4(base, 0);
-      fprintf(vvp_out, "    %%part %u;\n", wid);
+      fprintf(vvp_out, "    %%part/%c %u;\n", sign_suff, wid);
 }

 static void draw_select_pad_vec4(ivl_expr_t expr, int stuff_ok_flag)
@ -486,8 +488,19 @@ static void draw_signal_vec4(ivl_expr_t expr)
 {
      ivl_signal_t sig = ivl_expr_signal(expr);

-      assert(ivl_signal_dimensions(sig) == 0);
+	/* Handle the simple case, a signal expression that is a
+	   simple vector, no array dimensions. */
+      if (ivl_signal_dimensions(sig) == 0) {
 	    fprintf(vvp_out, "    %%load/vec4 v%p_0;\n", sig);
+	    return;
+      }
+
+	/* calculate the array index... */
+      int addr_index = allocate_word();
+      draw_eval_expr_into_integer(ivl_expr_oper1(expr), addr_index);
+
+      fprintf(vvp_out, "    %%load/vec4a v%p, %d;\n", sig, addr_index);
+      clr_word(addr_index);
 }

 static void draw_ternary_vec4(ivl_expr_t expr, int stuff_ok_flag)
--- a/tgt-vvp/stmt_assign.c
+++ b/tgt-vvp/stmt_assign.c
@ -551,17 +551,38 @@ static void set_vec_to_lval(ivl_statement_t net, struct vector_info res)
 * Store a vector from the vec4 stack to the statement l-values. This
 * all assumes that the value to be assigned is already on the top of
 * the stack.
+ *
+ * NOTE TO SELF: The %store/vec4 takes a width, but the %assign/vec4
+ * instructions do not, instead relying on the expression width. I
+ * think that is the proper way to do it, so soon I should change the
+ * %store/vec4 to not include the width operand.
 */
 static void store_vec4_to_lval(ivl_statement_t net)
 {
-      assert(ivl_stmt_lvals(net) == 1);
-
-      ivl_lval_t lval = ivl_stmt_lval(net,0);
+      for (unsigned lidx = 0 ; lidx < ivl_stmt_lvals(net) ; lidx += 1) {
+	    ivl_lval_t lval = ivl_stmt_lval(net,lidx);
 	    ivl_signal_t lsig = ivl_lval_sig(lval);
+	    unsigned lwid = ivl_lval_width(lval);

-      assert(ivl_lval_width(lval) == ivl_signal_width(lsig));
+	    ivl_expr_t part_off_ex = ivl_lval_part_off(lval);

-      fprintf(vvp_out, "    %%store/vec4 v%p_0, %u;\n", lsig, ivl_signal_width(lsig));
+	    if (lidx+1 < ivl_stmt_lvals(net))
+		  fprintf(vvp_out, "    %%split/vec4 %u;\n", lwid);
+
+	    if (part_off_ex) {
+		    /* Dynamically calculated part offset */
+		  int offset_index = allocate_word();
+		  draw_eval_expr_into_integer(part_off_ex, offset_index);
+		  fprintf(vvp_out, "    %%store/vec4/off v%p_0, %d, %u;\n",
+			  lsig, offset_index, lwid);
+		  clr_word(offset_index);
+
+	    } else {
+		    /* No offset expression, so use simpler store function. */
+		  assert(lwid == ivl_signal_width(lsig));
+		  fprintf(vvp_out, "    %%store/vec4 v%p_0, %u;\n", lsig, lwid);
+	    }
+      }
 }

 static int show_stmt_assign_vector(ivl_statement_t net)
--- a/tgt-vvp/vvp_process.c
+++ b/tgt-vvp/vvp_process.c
@ -111,8 +111,8 @@ static void assign_to_array_r_word(ivl_signal_t lsig, ivl_expr_t word_ix,
 }

 static void assign_to_array_word(ivl_signal_t lsig, ivl_expr_t word_ix,
-				 unsigned bit, uint64_t delay, ivl_expr_t dexp,
-				 ivl_expr_t part_off_ex, unsigned width,
+				 uint64_t delay, ivl_expr_t dexp,
+				 ivl_expr_t part_off_ex,
 				 unsigned nevents)
 {
      unsigned skip_assign = transient_id++;
@ -121,6 +121,10 @@ static void assign_to_array_word(ivl_signal_t lsig, ivl_expr_t word_ix,
      int delay_index;
      unsigned long part_off = 0;

+	/* Figure the constant part offset, if possible. If we can do
+	   so, then forget about the expression and use the calculated
+	   value. After this block, if part_off_ex!=0, then the offset
+	   expression is evaluated at run time; otherwise, use part_off. */
      if (part_off_ex == 0) {
 	    part_off = 0;
      } else if (number_is_immediate(part_off_ex, IMM_WID, 0) &&
@ -136,25 +140,16 @@ static void assign_to_array_word(ivl_signal_t lsig, ivl_expr_t word_ix,
 	    word_ix_reg = allocate_word();
      }

-	/* This code is common to all the different types of array delays. */
-      if (number_is_immediate(word_ix, IMM_WID, 0) &&
-	  !number_is_unknown(word_ix)) {
-	    fprintf(vvp_out, "    %%ix/load %d, %lu, 0; address\n", word_ix_reg,
-	                     get_number_immediate(word_ix));
-      } else {
 	/* Calculate array word index into word index register */
      draw_eval_expr_into_integer(word_ix, word_ix_reg);
 	/* Skip assignment if word expression is not defined. */
      fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
-      }
+
      if (part_off_ex) {
 	    draw_eval_expr_into_integer(part_off_ex, part_off_reg);
 	      /* If the index expression has XZ bits, skip the assign. */
 	    fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
-	    if (dexp == 0) {
-		  fprintf(vvp_out, "    %%ix/mov 3, %u;\n", word_ix_reg);
-		  clr_word(word_ix_reg);
-	    }
+
      } else {
 	      /* Store word part select into part_off_reg */
 	    fprintf(vvp_out, "    %%ix/load %d, %lu, 0; part off\n",
@ -165,42 +160,34 @@ static void assign_to_array_word(ivl_signal_t lsig, ivl_expr_t word_ix,
 	      /* Calculated delay... */
 	    delay_index = allocate_word();
 	    draw_eval_expr_into_integer(dexp, delay_index);
-      }
-
-	/* Store expression width into index word 0 */
-      fprintf(vvp_out, "    %%ix/load 0, %u, 0; word width\n", width);
-
-      if (dexp != 0) {
-	    fprintf(vvp_out, "    %%ix/mov 1, %u;\n", part_off_reg);
+	    if (word_ix_reg != 3) {
 		  fprintf(vvp_out, "    %%ix/mov 3, %u;\n", word_ix_reg);
-	    fprintf(vvp_out, "    %%assign/av/d v%p, %d, %u;\n", lsig,
-	                     delay_index, bit);
-	    clr_word(part_off_reg);
 		  clr_word(word_ix_reg);
+	    }
+	      /* Operand order is <off-index>, <delay-index>, as in the constant delay case. */
+	    fprintf(vvp_out, "    %%assign/vec4/a/d v%p, %d, %d;\n", lsig, part_off_reg, delay_index);
+	    clr_word(part_off_reg);
 	    clr_word(delay_index);
+
      } else if (nevents != 0) {
 	      /* Event control delay... */
-	    fprintf(vvp_out, "    %%assign/av/e v%p, %u;\n", lsig, bit);
+	    fprintf(vvp_out, "    %%assign/vec4/a/e v%p, 0;\n", lsig);
+
      } else {
 	      /* Constant delay... */
 	    unsigned long low_d = delay % UINT64_C(0x100000000);
 	    unsigned long hig_d = delay / UINT64_C(0x100000000);

-	      /*
-	       * The %assign can only take a 32 bit delay. For a larger
-	       * delay we need to put it into an index register.
-	       */
-	    if (hig_d != 0) {
 	    delay_index = allocate_word();
-		  fprintf(vvp_out, "    %%ix/load %d, %lu, %lu;\n",
+	    fprintf(vvp_out, "    %%ix/load %d, %lu, %lu; Constant delay\n",
 		    delay_index, low_d, hig_d);
-		  fprintf(vvp_out, "    %%assign/av/d v%p, %d, %u;\n", lsig,
-		  delay_index, bit);
-		  clr_word(delay_index);
-	    } else {
-		  fprintf(vvp_out, "    %%assign/av v%p, %lu, %u;\n",
-		          lsig, low_d, bit);
+	    if (word_ix_reg != 3) {
+		  fprintf(vvp_out, "    %%ix/mov 3, %u;\n", word_ix_reg);
+		  clr_word(word_ix_reg);
 	    }
+	    fprintf(vvp_out, "    %%assign/vec4/a/d v%p, %d, %u;\n", lsig,
+		    part_off_reg, delay_index);
+	    clr_word(delay_index);
      }

      fprintf(vvp_out, "t_%u ;\n", skip_assign);
@ -209,6 +196,11 @@ static void assign_to_array_word(ivl_signal_t lsig, ivl_expr_t word_ix,
      clear_expression_lookaside();
 }

+/*
+ * The code to generate here assumes that a vec4 vector of the right
+ * width is top of the vec4 stack. Arrange for it to be popped and
+ * assigned to the given l-value.
+ */
 static void assign_to_lvector(ivl_lval_t lval,
 			      uint64_t delay, ivl_expr_t dexp,
 			      unsigned nevents)
@ -217,17 +209,15 @@ static void assign_to_lvector(ivl_lval_t lval,
      ivl_expr_t part_off_ex = ivl_lval_part_off(lval);
      unsigned long part_off = 0;

-      ivl_expr_t word_ix = ivl_lval_idx(lval);
      const unsigned long use_word = 0;

+	// Detect the case that this is actually a non-blocking assign
+	// to an array word. In that case, run off somewhere else to
+	// deal with it.
      if (ivl_signal_dimensions(sig) > 0) {
-#if 0
+	    ivl_expr_t word_ix = ivl_lval_idx(lval);
 	    assert(word_ix);
-	    assign_to_array_word(sig, word_ix, bit, delay, dexp, part_off_ex,
-	                         width, nevents);
-#else
-	    fprintf(stderr, "XXXX %%assign to array word not supported yet.\n");
-#endif
+	    assign_to_array_word(sig, word_ix, delay, dexp, part_off_ex, nevents);
 	    return;
      }

@ -243,6 +233,10 @@ static void assign_to_lvector(ivl_lval_t lval,
      unsigned long hig_d = delay / UINT64_C(0x100000000);

      if (part_off_ex) {
+	      // The part select offset is calculated (not constant)
+	      // so in these cases we'll need to use
+	      // %assign/vec4/off/... variants.
+
 	    unsigned skip_assign = transient_id++;
 	    if (dexp != 0) {
 		    /* Calculated delay... */
@ -275,32 +269,20 @@ static void assign_to_lvector(ivl_lval_t lval,
 		  fprintf(vvp_out, "t_%u ;\n", skip_assign);
 		  fprintf(vvp_out, "    %%evctl/c;\n");
 	    } else {
-		    /* Constant delay... */
-		  draw_eval_expr_into_integer(part_off_ex, 1);
-		    /* If the index expression has XZ bits, skip the assign. */
-		  fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
-#if 0
-		  fprintf(vvp_out, "    %%ix/load 0, %u, 0;\n", width);
-		    /*
-		     * The %assign can only take a 32 bit delay. For a larger
-		     * delay we need to put it into an index register.
-		     */
-		  if (hig_d != 0) {
+		  int offset_index = allocate_word();
 		  int delay_index = allocate_word();
+
+		    /* Constant delay... */
 		  fprintf(vvp_out, "    %%ix/load %d, %lu, %lu;\n",
 			  delay_index, low_d, hig_d);
-			fprintf(vvp_out,
-			        "    %%assign/v0/x1/d v%p_%lu, %d, %u;\n",
-			        sig, use_word, delay_index, bit);
+		    /* Calculated part offset... */
+		  draw_eval_expr_into_integer(part_off_ex, offset_index);
+		    /* If the index expression has XZ bits, skip the assign. */
+		  fprintf(vvp_out, "    %%jmp/1 t_%u, 4;\n", skip_assign);
+		  fprintf(vvp_out, "    %%assign/vec4/off/d v%p_%lu, %d, %d;\n",
+			  sig, use_word, offset_index, delay_index);
+		  clr_word(offset_index);
 		  clr_word(delay_index);
-		  } else {
-			fprintf(vvp_out,
-			        "    %%assign/v0/x1 v%p_%lu, %lu, %u;\n",
-			        sig, use_word, low_d, bit);
-		  }
-#else
-		  assert(0); // XXXX
-#endif
 		  fprintf(vvp_out, "t_%u ;\n", skip_assign);
 	    }

@ -356,6 +338,9 @@ static void assign_to_lvector(ivl_lval_t lval,
 		  assert(0); // XXXX

 	    } else {
+		    // Constant part offset, non-constant (calculated)
+		    // assignment delay. Use the %assign/vec4/off/d
+		    // instruction to handle this case.
 		  int offset_index = allocate_word();
 		  int delay_index = allocate_word();
 		  fprintf(vvp_out, "    %%ix/load %d, %lu, 0;\n", offset_index, part_off);
@ -932,26 +917,21 @@ static void force_real_to_lval(ivl_statement_t net)
      fprintf(vvp_out, "    %s v%p_%lu;\n", command_name, lsig, use_word);
 }

-static void force_vector_to_lval(ivl_statement_t net, struct vector_info rvec)
+static void force_vector_to_lval(ivl_statement_t net)
 {
      unsigned lidx;
-      unsigned roff = 0;

      const char*command_name;
-      const char*command_name_x0;

      switch (ivl_statement_type(net)) {
 	  case IVL_ST_CASSIGN:
-	    command_name = "%cassign/v";
-	    command_name_x0 = "%cassign/x0";
+	    command_name = "%cassign/vec4";
 	    break;
 	  case IVL_ST_FORCE:
-	    command_name = "%force/v";
-	    command_name_x0 = "%force/x0";
+	    command_name = "%force/vec4";
 	    break;
 	  default:
 	    command_name = "ERROR";
-	    command_name_x0 = "ERROR";
 	    assert(0);
 	    break;
      }
@ -960,24 +940,9 @@ static void force_vector_to_lval(ivl_statement_t net, struct vector_info rvec)
 	    ivl_lval_t lval = ivl_stmt_lval(net, lidx);
 	    ivl_signal_t lsig = ivl_lval_sig(lval);

-	    unsigned use_wid = ivl_lval_width(lval);
-	    ivl_expr_t part_off_ex = ivl_lval_part_off(lval);
-	    unsigned long part_off;
 	    ivl_expr_t word_idx = ivl_lval_idx(lval);
 	    unsigned long use_word = 0;

-	    if (part_off_ex == 0) {
-		  part_off = 0;
-	    } else {
-		  assert(number_is_immediate(part_off_ex, IMM_WID, 0));
-		    /* An out-of-range or undefined offset will have been
-		       converted to a canonical offset of 1'bx. Skip the
-		       assignment in this case. */
-		  if (number_is_unknown(part_off_ex))
-			return;
-		  part_off = get_number_immediate(part_off_ex);
-	    }
-
 	    if (word_idx != 0) {
 		  assert(number_is_immediate(word_idx, IMM_WID, 0));
 		    /* An out-of-range or undefined index will have been
@ -1001,24 +966,13 @@ static void force_vector_to_lval(ivl_statement_t net, struct vector_info rvec)

 	      /* L-Value must be a signal: reg or wire */
 	    assert(lsig != 0);
-
-	    if (part_off != 0 || use_wid != ivl_signal_width(lsig)) {
-
-		  command_name = command_name_x0;
-		  fprintf(vvp_out, "    %%ix/load 0, %lu, 0;\n", part_off);
-
-	    } else {
 	      /* Do not support bit or part selects of l-values yet. */
 	    assert(ivl_lval_width(lval) == ivl_signal_width(lsig));
+	    assert(!ivl_lval_part_off(lval));

-		  assert((roff + use_wid) <= rvec.wid);
-	    }

-	    fprintf(vvp_out, "    %s v%p_%lu, %u, %u;\n", command_name,
-		    lsig, use_word, rvec.base+roff, use_wid);
+	    fprintf(vvp_out, "    %s v%p_%lu;\n", command_name, lsig, use_word);

-	    if (rvec.base >= 4)
-		  roff += use_wid;
      }
 }

@ -1170,13 +1124,11 @@ static int show_stmt_cassign(ivl_statement_t net)
 	    force_real_to_lval(net);

      } else {
-	    struct vector_info rvec;
-
-	    rvec = draw_eval_expr(rval, STUFF_OK_47);

+	    draw_eval_vec4(rval, STUFF_OK_47);
 	      /* Write out initial continuous assign instructions to assign
 	         the expression value to the l-value. */
-	    force_vector_to_lval(net, rvec);
+	    force_vector_to_lval(net);
      }

      force_link_rval(net, rval);
@ -1474,13 +1426,12 @@ static int show_stmt_force(ivl_statement_t net)
            force_real_to_lval(net);

      } else {
-            struct vector_info rvec;

-            rvec = draw_eval_expr(rval, STUFF_OK_47);
+            draw_eval_vec4(rval, STUFF_OK_47);

              /* Write out initial continuous assign instructions to assign
                 the expression value to the l-value. */
-            force_vector_to_lval(net, rvec);
+            force_vector_to_lval(net);
      }

      force_link_rval(net, rval);
@ -1686,28 +1637,27 @@ static int show_stmt_repeat(ivl_statement_t net, ivl_scope_t sscope)
      int rc = 0;
      unsigned lab_top = local_count++, lab_out = local_count++;
      ivl_expr_t expr = ivl_stmt_cond_expr(net);
-      struct vector_info cnt;
      const char *sign = ivl_expr_signed(expr) ? "s" : "u";

      show_stmt_file_line(net, "Repeat statement.");

-      cnt = draw_eval_expr(expr, 0);
+	/* Calculate the repeat count onto the top of the vec4 stack. */
+      draw_eval_vec4(expr, STUFF_OK_XZ);

 	/* Test that 0 < expr */
-      fprintf(vvp_out, "T_%u.%u %%cmp/%s 0, %u, %u;\n", thread_count,
-	      lab_top, sign, cnt.base, cnt.wid);
-      clear_expression_lookaside();
-      fprintf(vvp_out, "    %%jmp/0xz T_%u.%u, 5;\n", thread_count, lab_out);
+      fprintf(vvp_out, "T_%u.%u %%dup/vec4;\n", thread_count, lab_top);
+      fprintf(vvp_out, "    %%pushi/vec4 0, 0, %u;\n", ivl_expr_width(expr));
+      fprintf(vvp_out, "    %%cmp/%s;\n", sign);
+      fprintf(vvp_out, "    %%jmp/1xz T_%u.%u, 5;\n", thread_count, lab_out);
 	/* This adds -1 (all ones in 2's complement) to the count. */
-      fprintf(vvp_out, "    %%add %u, 1, %u;\n", cnt.base, cnt.wid);
+      fprintf(vvp_out, "    %%pushi/vec4 1, 0, %u;\n", ivl_expr_width(expr));
+      fprintf(vvp_out, "    %%sub;\n");

      rc += show_statement(ivl_stmt_sub_stmt(net), sscope);

      fprintf(vvp_out, "    %%jmp T_%u.%u;\n", thread_count, lab_top);
      fprintf(vvp_out, "T_%u.%u ;\n", thread_count, lab_out);
-      clear_expression_lookaside();
-
-      clr_vector(cnt);
+      fprintf(vvp_out, "    %%pop/vec4 1;\n");

      return rc;
 }
--- a/vvp/codes.h
+++ b/vvp/codes.h
@ -50,6 +50,7 @@ extern bool of_ASSIGN_MV(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_VEC4D(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_VEC4E(vthread_t thr, vvp_code_t code);
+extern bool of_ASSIGN_VEC4_A_D(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_VEC4_OFF_D(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_V0X1(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_V0X1D(vthread_t thr, vvp_code_t code);
@ -62,7 +63,7 @@ extern bool of_BLEND(vthread_t thr, vvp_code_t code);
 extern bool of_BLEND_WR(vthread_t thr, vvp_code_t code);
 extern bool of_BREAKPOINT(vthread_t thr, vvp_code_t code);
 extern bool of_CASSIGN_LINK(vthread_t thr, vvp_code_t code);
-extern bool of_CASSIGN_V(vthread_t thr, vvp_code_t code);
+extern bool of_CASSIGN_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code);
 extern bool of_CASSIGN_X0(vthread_t thr, vvp_code_t code);
 extern bool of_CAST2(vthread_t thr, vvp_code_t code);
@ -109,7 +110,7 @@ extern bool of_FLAG_GET_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_FLAG_SET_IMM(vthread_t thr, vvp_code_t code);
 extern bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_FORCE_LINK(vthread_t thr, vvp_code_t code);
-extern bool of_FORCE_V(vthread_t thr, vvp_code_t code);
+extern bool of_FORCE_VEC4(vthread_t thr, vvp_code_t code);
 extern bool of_FORCE_WR(vthread_t thr, vvp_code_t code);
 extern bool of_FORCE_X0(vthread_t thr, vvp_code_t code);
 extern bool of_FORK(vthread_t thr, vvp_code_t code);
@ -130,6 +131,7 @@ extern bool of_JMP(vthread_t thr, vvp_code_t code);
 extern bool of_JMP0(vthread_t thr, vvp_code_t code);
 extern bool of_JMP0XZ(vthread_t thr, vvp_code_t code);
 extern bool of_JMP1(vthread_t thr, vvp_code_t code);
+extern bool of_JMP1XZ(vthread_t thr, vvp_code_t code);
 extern bool of_JOIN(vthread_t thr, vvp_code_t code);
 extern bool of_JOIN_DETACH(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_AR(vthread_t thr, vvp_code_t code);
@ -145,6 +147,7 @@ extern bool of_LOAD_OBJ(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_STR(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_STRA(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VEC4(vthread_t thr, vvp_code_t code);
+extern bool of_LOAD_VEC4A(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VP0(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VP0_S(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_X1P(vthread_t thr, vvp_code_t code);
@ -171,7 +174,8 @@ extern bool of_OR(vthread_t thr, vvp_code_t code);
 extern bool of_ORR(vthread_t thr, vvp_code_t code);
 extern bool of_PAD_S(vthread_t thr, vvp_code_t code);
 extern bool of_PAD_U(vthread_t thr, vvp_code_t code);
-extern bool of_PART(vthread_t thr, vvp_code_t code);
+extern bool of_PART_S(vthread_t thr, vvp_code_t code);
+extern bool of_PART_U(vthread_t thr, vvp_code_t code);
 extern bool of_POP_OBJ(vthread_t thr, vvp_code_t code);
 extern bool of_POP_REAL(vthread_t thr, vvp_code_t code);
 extern bool of_POP_STR(vthread_t thr, vvp_code_t code);
@ -191,6 +195,7 @@ extern bool of_PUTC_STR_V(vthread_t thr, vvp_code_t code);
 extern bool of_RELEASE_NET(vthread_t thr, vvp_code_t code);
 extern bool of_RELEASE_REG(vthread_t thr, vvp_code_t code);
 extern bool of_RELEASE_WR(vthread_t thr, vvp_code_t code);
+extern bool of_REPLICATE(vthread_t thr, vvp_code_t code);
 extern bool of_SCOPY(vthread_t thr, vvp_code_t code);
 extern bool of_SET_AV(vthread_t thr, vvp_code_t code);
 extern bool of_SET_DAR(vthread_t thr, vvp_code_t code);
@ -216,6 +221,7 @@ extern bool of_STORE_REALA(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_STR(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_STRA(vthread_t thr, vvp_code_t code);
 extern bool of_STORE_VEC4(vthread_t thr, vvp_code_t code);
+extern bool of_STORE_VEC4_OFF(vthread_t thr, vvp_code_t code);
 extern bool of_SUB(vthread_t thr, vvp_code_t code);
 extern bool of_SUB_WR(vthread_t thr, vvp_code_t code);
 extern bool of_SUBI(vthread_t thr, vvp_code_t code);
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@ -102,6 +102,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%assign/v0/x1/d",of_ASSIGN_V0X1D,3,{OA_FUNC_PTR,OA_BIT1,OA_BIT2} },
      { "%assign/v0/x1/e",of_ASSIGN_V0X1E,2,{OA_FUNC_PTR,OA_BIT1,OA_NONE} },
      { "%assign/vec4",      of_ASSIGN_VEC4,       2, {OA_FUNC_PTR, OA_BIT1, OA_NONE} },
+      { "%assign/vec4/a/d",  of_ASSIGN_VEC4_A_D,   3, {OA_ARR_PTR,  OA_BIT1, OA_BIT2} },
      { "%assign/vec4/d",    of_ASSIGN_VEC4D,      2, {OA_FUNC_PTR, OA_BIT1, OA_NONE} },
      { "%assign/vec4/e",    of_ASSIGN_VEC4E,      1, {OA_FUNC_PTR, OA_NONE, OA_NONE} },
      { "%assign/vec4/off/d",of_ASSIGN_VEC4_OFF_D, 3, {OA_FUNC_PTR, OA_BIT1, OA_BIT2} },
@ -113,7 +114,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%blend/wr", of_BLEND_WR,0,  {OA_NONE,  OA_NONE,     OA_NONE} },
      { "%breakpoint", of_BREAKPOINT, 0,  {OA_NONE, OA_NONE, OA_NONE} },
      { "%cassign/link",of_CASSIGN_LINK,2,{OA_FUNC_PTR,OA_FUNC_PTR2,OA_NONE} },
-      { "%cassign/v",of_CASSIGN_V,3,{OA_FUNC_PTR,OA_BIT1,    OA_BIT2} },
+      { "%cassign/vec4",of_CASSIGN_VEC4,1,{OA_FUNC_PTR,OA_NONE ,    OA_NONE} },
      { "%cassign/wr",  of_CASSIGN_WR,  1,{OA_FUNC_PTR,OA_NONE,     OA_NONE} },
      { "%cassign/x0",of_CASSIGN_X0,3,{OA_FUNC_PTR,OA_BIT1,  OA_BIT2} },
      { "%cast2",  of_CAST2,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
@ -123,8 +124,8 @@ static const struct opcode_table_s opcode_table[] = {
      { "%cmp/wr", of_CMPWR,  0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%cmp/ws", of_CMPWS,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
      { "%cmp/wu", of_CMPWU,  2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
-      { "%cmp/x",  of_CMPX,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%cmp/z",  of_CMPZ,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%cmp/x",  of_CMPX,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
+      { "%cmp/z",  of_CMPZ,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%cmpi/s", of_CMPIS,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%cmpi/u", of_CMPIU,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%concat/str", of_CONCAT_STR, 0,{OA_NONE,  OA_NONE,  OA_NONE} },
@ -158,7 +159,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%flag_set/imm",  of_FLAG_SET_IMM,  2, {OA_NUMBER, OA_BIT1, OA_NONE} },
      { "%flag_set/vec4", of_FLAG_SET_VEC4, 1, {OA_NUMBER, OA_NONE, OA_NONE} },
      { "%force/link",of_FORCE_LINK,2,{OA_FUNC_PTR, OA_FUNC_PTR2, OA_NONE} },
-      { "%force/v",of_FORCE_V,3,  {OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
+      { "%force/vec4",of_FORCE_VEC4,1,{OA_FUNC_PTR, OA_NONE,      OA_NONE} },
      { "%force/wr",  of_FORCE_WR,  1,{OA_FUNC_PTR, OA_NONE,      OA_NONE} },
      { "%force/x0",of_FORCE_X0,3,{OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
      { "%free",   of_FREE,   1,  {OA_VPI_PTR,  OA_NONE,     OA_NONE} },
@ -178,6 +179,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%jmp/0",  of_JMP0,   2,  {OA_CODE_PTR, OA_BIT1,     OA_NONE} },
      { "%jmp/0xz",of_JMP0XZ, 2,  {OA_CODE_PTR, OA_BIT1,     OA_NONE} },
      { "%jmp/1",  of_JMP1,   2,  {OA_CODE_PTR, OA_BIT1,     OA_NONE} },
+      { "%jmp/1xz",of_JMP1XZ, 2,  {OA_CODE_PTR, OA_BIT1,     OA_NONE} },
      { "%join",   of_JOIN,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%join/detach",of_JOIN_DETACH,1,{OA_NUMBER,OA_NONE,  OA_NONE} },
      { "%load/ar",of_LOAD_AR,2,  {OA_ARR_PTR,  OA_BIT1,     OA_NONE} },
@ -193,6 +195,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%load/str",   of_LOAD_STR,  1,{OA_FUNC_PTR,OA_NONE, OA_NONE} },
      { "%load/stra",  of_LOAD_STRA, 2,{OA_ARR_PTR, OA_BIT1, OA_NONE} },
      { "%load/vec4",  of_LOAD_VEC4, 1,{OA_FUNC_PTR,OA_NONE,  OA_NONE} },
+      { "%load/vec4a", of_LOAD_VEC4A,2,{OA_ARR_PTR, OA_BIT1, OA_NONE} },
      { "%load/vp0",of_LOAD_VP0,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
      { "%load/vp0/s",of_LOAD_VP0_S,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
      { "%load/x1p",of_LOAD_X1P,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
@ -219,7 +222,8 @@ static const struct opcode_table_s opcode_table[] = {
      { "%or/r",   of_ORR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
      { "%pad/s",  of_PAD_S,  1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
      { "%pad/u",  of_PAD_U,  1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
-      { "%part",   of_PART,   1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
+      { "%part/s", of_PART_S, 1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
+      { "%part/u", of_PART_U, 1,  {OA_NUMBER,   OA_NONE,     OA_NONE} },
      { "%pop/obj", of_POP_OBJ, 2, {OA_BIT1,    OA_BIT2,     OA_NONE} },
      { "%pop/real",of_POP_REAL,1, {OA_NUMBER,  OA_NONE,     OA_NONE} },
      { "%pop/str", of_POP_STR, 1, {OA_NUMBER,  OA_NONE,     OA_NONE} },
@ -239,6 +243,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%release/net",of_RELEASE_NET,3,{OA_FUNC_PTR,OA_BIT1,OA_BIT2} },
      { "%release/reg",of_RELEASE_REG,3,{OA_FUNC_PTR,OA_BIT1,OA_BIT2} },
      { "%release/wr", of_RELEASE_WR, 2,{OA_FUNC_PTR,OA_BIT1,OA_NONE} },
+      { "%replicate", of_REPLICATE,   1,{OA_NUMBER,  OA_NONE,OA_NONE} },
      { "%scopy",  of_SCOPY,  0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%set/av", of_SET_AV, 3,  {OA_ARR_PTR,  OA_BIT1,     OA_BIT2} },
      { "%set/dar",of_SET_DAR,3,  {OA_FUNC_PTR, OA_BIT1,     OA_BIT2} },
@ -263,6 +268,7 @@ static const struct opcode_table_s opcode_table[] = {
      { "%store/str",     of_STORE_STR,     1, {OA_FUNC_PTR,OA_NONE, OA_NONE} },
      { "%store/stra",    of_STORE_STRA,    2, {OA_ARR_PTR, OA_BIT1, OA_NONE} },
      { "%store/vec4",    of_STORE_VEC4,    2, {OA_FUNC_PTR,OA_BIT1, OA_NONE} },
+      { "%store/vec4/off",of_STORE_VEC4_OFF,3, {OA_FUNC_PTR,OA_BIT1, OA_BIT2} },
      { "%sub",    of_SUB,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%sub/wr", of_SUB_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
      { "%subi",   of_SUBI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
@ -567,6 +573,12 @@ bool vpi_handle_resolv_list_s::resolve(bool mes)

 		  val.ptr = vpip_make_vthr_str_stack(base);
 		  sym_set_value(sym_vpi, label(), val);
+
+	    } else if (1 == sscanf(label(), "S<%u,vec4>%n", &base, &n)
+		       && n == strlen(label())) {
+
+		  val.ptr = vpip_make_vthr_vec4_stack(base);
+		  sym_set_value(sym_vpi, label(), val);
 	    }

      }
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@ -115,9 +115,9 @@ The %assign/ar/e variation uses the information in the thread
 event control registers to determine when to perform the assign.
 %evctl is used to set the event control information.

-* %assign/av <array-label>, <delay>, <bit>
-* %assign/av/d <array-label>, <delayx>, <bit>
-* %assign/av/e <array-label>, <bit>
+* %assign/av <array-label>, <delay>, <bit> (XXXX Old definition)
+* %assign/av/d <array-label>, <delayx>, <bit> (XXXX Old definition)
+* %assign/av/e <array-label>, <bit> (XXXX Old definition)

 The %assign/av instruction assigns a vector value to a word in the
 labeled array. The <delay> is the delay in simulation time to the
@ -181,6 +181,19 @@ pulled from the vec4 stack.
 The %assign/vec4/d instruction is the same, but gets its delay value
 from the index register <delayx> instead.

+* %assign/vec4/a/d <var-label>, <off-index>, <delay-index>
+* %assign/vec4/a/e <var-label>, <off-index>
+
+These instructions implement delayed assignment to an array word. The
+value is popped from the vec4 stack; the width is taken from the
+popped value. The <off-index> index register contains the canonical
+offset into the memory word for a part select, and the <delay-index>
+index register contains the delay for the assignment. Index register 3
+contains the word address.
+
+The <off-index> and <delay-index> index registers can be 0, which
+means a zero value instead of the contents of index register 0.
+
 * %assign/vec4/off/d <var-label>, <off-index>, <delay-index>

 This is for writing parts to the target variable. The <var-label> is
@ -250,7 +263,7 @@ debugger commands.
 This may not work on all platforms. If run-time debugging is compiled
 out, then this function is a no-op.

-* %cassign/v <var-label>, <bit>, <wid>
+* %cassign/vec4 <var-label>

 Perform a continuous assign of a constant value to the target
 variable. This is similar to %set, but it uses the cassign port
@ -342,8 +355,8 @@ popped from the stack first, then (a).

 [compare signed/unsigned integer words.]

-* %cmp/z <bit-l>, <bit-r>, <wid>
-* %cmp/x <bit-l>, <bit-r>, <wid>
+* %cmp/z
+* %cmp/x

 These instructions are for implementing the casez and casex
 comparisons. These work similar to the %cmp/u instructions, except
@ -427,6 +440,12 @@ will be deactivated. For a full deactivation the <base> is 0 and

 The same as %deassign above except this is used for real variables.

+* %debug/thr
+
+This opcode is an aid for debugging the vvp engine. The vvp code
+generator should not generate it, and it should not alter code
+flow, data contents, etc.
+
 * %delay <low>, <high>

 This opcode pauses the thread, and causes it to be rescheduled for a
@ -536,7 +555,7 @@ writes the LSB to the selected flag.
 * %force/v <label>, <bit>, <wid>

 Force a constant value to the target variable. This is similar to %set
-and %cassign/v, but it uses the force port (port-2) of the signal
+and %cassign/vec4, but it uses the force port (port-2) of the signal
 functor instead of the normal assign port (port-0), so the signal
 responds differently. See "VARIABLE STATEMENTS" and "NET STATEMENTS"
 in the README.txt file.
@ -753,6 +772,15 @@ more than the width at the functor, the value is padded with X bits.
 This instruction loads a vector value from the given functor node and
 pushes it onto the vec4 stack. See also the %store/vec4 instruction.

+* %load/vec4a <arr-label>, <addr-index>
+
+This instruction loads a vec4 value from the array and pushes the
+value onto the stack. The <addr-index> is the index register that
+holds the canonical array index.
+
+The load checks flag bit 4. If it is 1, then the load is cancelled and
+replaced with a load of all X bits. See %ix/vec4.
+
 * %load/vp0 <bit>, <functor-label>, <wid>
 * %load/vp0/s <bit>, <functor-label>, <wid>

@ -1070,6 +1098,12 @@ Release the force on the real signal that is represented by the functor
 statement. The <type> is 0 for nets and 1 for registers. See the other
 %release commands above.

+* %replicate <count>
+
+Pop the vec4 value, replicate it <count> times, then push the
+result. In other words, push the concatenation of <count> copies.
+See also the %concat instruction.
+
 * %set/dar <var-label>, <bit>, <wid>
 * %set/dar/obj <index>, <bit>, <wid>

@ -1214,12 +1248,18 @@ The %store/dar/str is similar, but the target is a dynamic array of
 string string. The index is taken from signed index register 3.

 * %store/vec4 <var-label>, <wid>
+* %store/vec4/off <var-label>, <offset>, <wid>

 Store a logic vector into the variable. The value (and its width) is
 popped off the top of the stack and written to the variable. The value
 is then optionally truncated to <wid> bits and assigned to the
 variable. It is an error for the value to be fewer then <wid> bits.

+The %store/vec4/off is similar, but it uses the index register
+<offset> to get a vector offset into the target vec4 variable.
+
+NOTE: The <wid> is not necessary, and should be removed.
+
 * %sub <bit-l>, <bit-r>, <wid> (XXXX Old version)

 This instruction arithmetically subtracts the right vector out of the
--- a/vvp/vpi_priv.h
+++ b/vvp/vpi_priv.h
@ -660,6 +660,7 @@ vpiHandle vpip_make_vthr_vector(unsigned base, unsigned wid, bool signed_flag);

 vpiHandle vpip_make_vthr_word(unsigned base, const char*type);
 vpiHandle vpip_make_vthr_str_stack(unsigned depth);
+vpiHandle vpip_make_vthr_vec4_stack(unsigned depth);

 vpiHandle vpip_make_vthr_A(char*label, unsigned index);
 vpiHandle vpip_make_vthr_A(char*label, char*symbol);
--- a/vvp/vpi_vthr_vector.cc
+++ b/vvp/vpi_vthr_vector.cc
@ -650,7 +650,6 @@ class __vpiVThrStrStack : public __vpiHandle {
      int vpi_get(int code);
      void vpi_get_value(p_vpi_value val);
    private:
-      const char* name;
      unsigned depth_;
 };

@ -703,6 +702,57 @@ void __vpiVThrStrStack::vpi_get_value(p_vpi_value vp)
      }
 }

+class __vpiVThrVec4Stack : public __vpiHandle {
+    public:
+      __vpiVThrVec4Stack(unsigned depth);
+      int get_type_code(void) const;
+      int vpi_get(int code);
+      void vpi_get_value(p_vpi_value val);
+    private:
+      unsigned depth_;
+};
+
+__vpiVThrVec4Stack::__vpiVThrVec4Stack(unsigned d)
+: depth_(d)
+{
+}
+
+int __vpiVThrVec4Stack::get_type_code(void) const
+{ return vpiConstant; }
+
+
+int __vpiVThrVec4Stack::vpi_get(int code)
+{
+      switch (code) {
+	  case vpiSigned:
+	    return 0;
+
+	  case vpiConstType:
+	    return vpiBinaryConst;
+
+#if defined(CHECK_WITH_VALGRIND) || defined(BR916_STOPGAP_FIX)
+	  case _vpiFromThr:
+	    return _vpiVThr;
+#endif
+
+	  default:
+	    return 0;
+      }
+}
+
+void __vpiVThrVec4Stack::vpi_get_value(p_vpi_value vp)
+{
+      assert(vpip_current_vthread);
+      vvp_vector4_t val = vthread_get_vec4_stack(vpip_current_vthread, depth_);
+
+      switch (vp->format) {
+	  default:
+	    fprintf(stderr, "internal error: vpi_get_value(<format=%d>)"
+		    " not implemented for __vpiVThrVec4Stack.\n", vp->format);
+	    assert(0);
+      }
+
+}

 vpiHandle vpip_make_vthr_str_stack(unsigned depth)
 {
@ -710,6 +760,12 @@ vpiHandle vpip_make_vthr_str_stack(unsigned depth)
      return obj;
 }

+vpiHandle vpip_make_vthr_vec4_stack(unsigned depth)
+{
+      class __vpiVThrVec4Stack*obj = new __vpiVThrVec4Stack(depth);
+      return obj;
+}
+
 #ifdef CHECK_WITH_VALGRIND
 static map<vpiHandle, bool> stack_map;

--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@ -379,6 +379,11 @@ double vthread_get_real_stack(struct vthread_s*thr, unsigned depth)
      return thr->peek_real(depth);
 }

+const vvp_vector4_t& vthread_get_vec4_stack(struct vthread_s*thr, unsigned depth)
+{
+      return thr->peek_vec4(depth);
+}
+
 template <class T> T coerce_to_width(const T&that, unsigned width)
 {
      if (that.size() == width)
@ -1242,6 +1247,25 @@ bool of_ASSIGN_VEC4(vthread_t thr, vvp_code_t cp)
      return true;
 }

+/*
+ * %assign/vec4/a/d <arr>, <offx>, <delx>
+ */
+bool of_ASSIGN_VEC4_A_D(vthread_t thr, vvp_code_t cp)
+{
+      int off_idx = cp->bit_idx[0];
+      int del_idx = cp->bit_idx[1];
+      int adr_idx = 3;
+
+      long     off = off_idx? thr->words[off_idx].w_int  : 0;
+      unsigned del = del_idx? thr->words[del_idx].w_uint : 0;
+      long     adr = thr->words[adr_idx].w_int;
+
+      vvp_vector4_t value = thr->pop_vec4();
+
+      schedule_assign_array_word(cp->array, adr, off, value, del);
+      return true;
+}
+
 /*
 * %assign/vec4/off/d <var>, <off>, <del>
 */
@ -1617,10 +1641,10 @@ bool of_CASSIGN_LINK(vthread_t, vvp_code_t cp)
 }

 /*
- * the %cassign/v instruction invokes a continuous assign of a
+ * the %cassign/vec4 instruction invokes a continuous assign of a
 * constant value to a signal. The instruction arguments are:
 *
- *     %cassign/v <net>, <base>, <wid> ;
+ *     %cassign/vec4 <net>;
 *
 * Where the <net> is the net label assembled into a vvp_net pointer,
 * and the <base> and <wid> are stashed in the bit_idx array.
@ -1628,22 +1652,15 @@ bool of_CASSIGN_LINK(vthread_t, vvp_code_t cp)
 * This instruction writes vvp_vector4_t values to port-1 of the
 * target signal.
 */
-bool of_CASSIGN_V(vthread_t thr, vvp_code_t cp)
+bool of_CASSIGN_VEC4(vthread_t thr, vvp_code_t cp)
 {
-#if 0
      vvp_net_t*net = cp->net;
-      unsigned  base = cp->bit_idx[0];
-      unsigned  wid  = cp->bit_idx[1];
-
-	/* Collect the thread bits into a vector4 item. */
-      vvp_vector4_t value = vthread_bits_to_vector(thr, base, wid);
+      vvp_vector4_t value = thr->pop_vec4();

 	/* set the value into port 1 of the destination. */
      vvp_net_ptr_t ptr (net, 1);
      vvp_send_vec4(ptr, value, 0);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cassign/v ...\n");
-#endif
+
      return true;
 }

@ -2026,30 +2043,28 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
      return true;
 }

+/*
+ * %cmp/x
+ */
 bool of_CMPX(vthread_t thr, vvp_code_t cp)
 {
      vvp_bit4_t eq = BIT4_1;
-#if 0
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
+      vvp_vector4_t rval = thr->pop_vec4();
+      vvp_vector4_t lval = thr->pop_vec4();

-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lv = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rv = thr_get_bit(thr, idx2);
+      assert(rval.size() == lval.size());
+      unsigned wid = lval.size();

+      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
+	    vvp_bit4_t lv = lval.value(idx);
+	    vvp_bit4_t rv = rval.value(idx);
 	    if ((lv != rv) && !bit4_is_xz(lv) && !bit4_is_xz(rv)) {
 		  eq = BIT4_0;
 		  break;
 	    }
-
-	    if (idx1 >= 4) idx1 += 1;
-	    if (idx2 >= 4) idx2 += 1;
      }
-#else
-      fprintf(stderr, "XXXX NOT IMLEMENTED: %%cmpx ...\n");
-#endif
-      thr->flags[4] = eq;

+      thr->flags[4] = eq;
      return true;
 }

@ -2095,30 +2110,28 @@ bool of_CMPWU(vthread_t thr, vvp_code_t cp)
      return true;
 }

+/*
+ * %cmp/z
+ */
 bool of_CMPZ(vthread_t thr, vvp_code_t cp)
 {
      vvp_bit4_t eq = BIT4_1;
-#if 0
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
+      vvp_vector4_t rval = thr->pop_vec4();
+      vvp_vector4_t lval = thr->pop_vec4();

-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lv = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rv = thr_get_bit(thr, idx2);
+      assert(rval.size() == lval.size());
+      unsigned wid = lval.size();

-	    if ((lv != BIT4_Z) && (rv != BIT4_Z) && (lv != rv)) {
+      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
+	    vvp_bit4_t lv = lval.value(idx);
+	    vvp_bit4_t rv = rval.value(idx);
+	    if ((lv != rv) && (rv != BIT4_Z) && (lv != BIT4_Z)) {
 		  eq = BIT4_0;
 		  break;
 	    }
-
-	    if (idx1 >= 4) idx1 += 1;
-	    if (idx2 >= 4) idx2 += 1;
      }
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%cmpz ...\n");
-#endif
-      thr->flags[4] = eq;

+      thr->flags[4] = eq;
      return true;
 }

@ -2941,26 +2954,22 @@ bool of_FORCE_LINK(vthread_t, vvp_code_t cp)
 }

 /*
- * The %force/v instruction invokes a force assign of a constant value
+ * The %force/vec4 instruction invokes a force assign of a constant value
 * to a signal. The instruction arguments are:
 *
- *     %force/v <net>, <base>, <wid> ;
+ *     %force/vec4 <net> ;
 *
 * where the <net> is the net label assembled into a vvp_net pointer,
- * and the <base> and <wid> are stashed in the bit_idx array.
+ * and the value to be forced is popped from the vec4 stack.
 *
 * The instruction writes a vvp_vector4_t value to port-2 of the
 * target signal.
 */
-bool of_FORCE_V(vthread_t thr, vvp_code_t cp)
+bool of_FORCE_VEC4(vthread_t thr, vvp_code_t cp)
 {
-#if 0
      vvp_net_t*net = cp->net;
-      unsigned  base = cp->bit_idx[0];
-      unsigned  wid  = cp->bit_idx[1];

-	/* Collect the thread bits into a vector4 item. */
-      vvp_vector4_t value = vthread_bits_to_vector(thr, base, wid);
+      vvp_vector4_t value = thr->pop_vec4();

 	/* Send the force value to the filter on the node. */

@ -2969,9 +2978,7 @@ bool of_FORCE_V(vthread_t thr, vvp_code_t cp)
 	    value = coerce_to_width(value, net->fil->filter_size());

      net->force_vec4(value, vvp_vector2_t(vvp_vector2_t::FILL1, net->fil->filter_size()));
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%force/v ...\n");
-#endif
+
      return true;
 }

@ -3401,6 +3408,26 @@ bool of_JMP1(vthread_t thr, vvp_code_t cp)
      return true;
 }

+/*
+ * %jmp/1xz <pc>, <flag>
+ */
+bool of_JMP1XZ(vthread_t thr, vvp_code_t cp)
+{
+      if (thr->flags[cp->bit_idx[0]] != BIT4_0)
+	    thr->pc = cp->cptr;
+
+	/* Normally, this returns true so that the processor just
+	   keeps going to the next instruction. However, if there was
+	   a $stop or vpiStop, returning false here can break the
+	   simulation out of a hung loop. */
+      if (schedule_stopped()) {
+	    schedule_vthread(thr, 0, false);
+	    return false;
+      }
+
+      return true;
+}
+
 /*
 * The %join instruction causes the thread to wait for one child
 * to die.  If a child is already dead (and a zombie) then I reap
@ -3921,6 +3948,29 @@ bool of_LOAD_VEC4(vthread_t thr, vvp_code_t cp)
      return true;
 }

+/*
+ * %load/vec4a <arr>, <adrx>
+ */
+bool of_LOAD_VEC4A(vthread_t thr, vvp_code_t cp)
+{
+      int adr_index = cp->bit_idx[0];
+
+      long adr = thr->words[adr_index].w_int;
+
+	// If flag[4] is set, then the calculation of the address
+	// failed, and this load should return X instead of the actual
+	// value.
+      if (thr->flags[4] == BIT4_1) {
+	    vvp_vector4_t tmp (get_array_word_size(cp->array), BIT4_X);
+	    thr->push_vec4(tmp);
+	    return true;
+      }
+
+      vvp_vector4_t tmp (array_get_word(cp->array, adr));
+      thr->push_vec4(tmp);
+      return true;
+}
+
 /*
 * This is like of_LOAD_VEC, but includes an add of an integer value from
 * index 0. The <wid> is the expected result width not the vector width.
@ -4368,12 +4418,13 @@ bool of_PAD_U(vthread_t thr, vvp_code_t cp)
 }

 /*
- * %part <wid>
+ * %part/s <wid>
+ * %part/u <wid>
 * Two values are popped from the stack. First, pop the canonical
 * index of the part select, and second is the value to be
 * selected. The result is pushed back to the stack.
 */
-bool of_PART(vthread_t thr, vvp_code_t cp)
+static bool of_PART_base(vthread_t thr, vvp_code_t cp, bool signed_flag)
 {
      unsigned wid = cp->number;

@ -4384,7 +4435,7 @@ bool of_PART(vthread_t thr, vvp_code_t cp)

 	// NOTE: This is treating the vector as signed. Is that correct?
      int32_t base;
-      bool value_ok = vector4_to_value(base4, base, true);
+      bool value_ok = vector4_to_value(base4, base, signed_flag);
      if (! value_ok) {
 	    thr->push_vec4(res);
 	    return true;
@ -4417,6 +4468,16 @@ bool of_PART(vthread_t thr, vvp_code_t cp)
      return true;
 }

+bool of_PART_S(vthread_t thr, vvp_code_t cp)
+{
+      return of_PART_base(thr, cp, true);
+}
+
+bool of_PART_U(vthread_t thr, vvp_code_t cp)
+{
+      return of_PART_base(thr, cp, false);
+}
+
 /*
 *  %mov/wu <dst>, <src>
 */
@ -5359,6 +5420,21 @@ bool of_RELEASE_WR(vthread_t, vvp_code_t cp)
      return true;
 }

+bool of_REPLICATE(vthread_t thr, vvp_code_t cp)
+{
+      int rept = cp->number;
+      vvp_vector4_t val = thr->pop_vec4();
+      vvp_vector4_t res (val.size() * rept, BIT4_X);
+
+      for (int idx = 0 ; idx < rept ; idx += 1) {
+	    res.set_vec(idx * val.size(), val);
+      }
+
+      thr->push_vec4(res);
+
+      return true;
+}
+
 bool of_SCOPY(vthread_t thr, vvp_code_t)
 {
      vvp_object_t tmp;
@ -6037,6 +6113,28 @@ bool of_STORE_VEC4(vthread_t thr, vvp_code_t cp)
      return true;
 }

+/*
+ * %store/vec4/off <var-label>, <offset>, <wid>
+ */
+bool of_STORE_VEC4_OFF(vthread_t thr, vvp_code_t cp)
+{
+      vvp_net_ptr_t ptr(cp->net, 0);
+      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
+      unsigned off_index = cp->bit_idx[0];
+      unsigned wid = cp->bit_idx[1];
+
+      int off = thr->words[off_index].w_int;
+
+      vvp_vector4_t val = thr->pop_vec4();
+      assert(val.size() >= wid);
+      if (val.size() > wid)
+	    val.resize(wid);
+
+      vvp_send_vec4_pv(ptr, val, off, wid, sig->value_size(), thr->wt_context);
+
+      return true;
+}
+
 bool of_SUB(vthread_t thr, vvp_code_t cp)
 {
      vvp_vector4_t r = thr->pop_vec4();
--- a/vvp/vthread.h
+++ b/vvp/vthread.h
@ -129,6 +129,7 @@ extern void vthread_pop_real(struct vthread_s*thr, unsigned count);
   depth==1, etc. */
 extern const std::string&vthread_get_str_stack(struct vthread_s*thr, unsigned depth);
 extern double vthread_get_real_stack(struct vthread_s*thr, unsigned depth);
+extern const vvp_vector4_t& vthread_get_vec4_stack(struct vthread_s*thr, unsigned depth);

 /* This is used to actually delete a thread once we are done with it. */
 extern void vthread_delete(vthread_t thr);