diff --git a/tgt-vvp/eval_vec4.c b/tgt-vvp/eval_vec4.c
index caba3bce3..154d66fa7 100644
--- a/tgt-vvp/eval_vec4.c
+++ b/tgt-vvp/eval_vec4.c
@@ -81,6 +81,18 @@ static void draw_binary_vec4_bitwise(ivl_expr_t expr, int stuff_ok_flag)
 	  case '|':
 	    fprintf(vvp_out, "    %%or;\n");
 	    break;
+	  case '^':
+	    fprintf(vvp_out, "    %%xor;\n");
+	    break;
+	  case 'A': /* ~& */
+	    fprintf(vvp_out, "    %%nand;\n");
+	    break;
+	  case 'O': /* ~| */
+	    fprintf(vvp_out, "    %%nor;\n");
+	    break;
+	  case 'X': /* ~^ */
+	    fprintf(vvp_out, "    %%xnor;\n");
+	    break;
 	  default:
 	    assert(0);
 	    break;
@@ -311,6 +323,10 @@ static void draw_binary_vec4(ivl_expr_t expr, int stuff_ok_flag)
 
 	  case '&':
 	  case '|':
+	  case '^':
+	  case 'A': /* NAND (~&) */
+	  case 'O': /* NOR  (~|) */
+	  case 'X': /* exclusive nor (~^) */
 	    draw_binary_vec4_bitwise(expr, stuff_ok_flag);
 	    break;
 
diff --git a/vvp/compile.cc b/vvp/compile.cc
index acdaa9473..c9dd60c31 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -210,12 +210,12 @@ static const struct opcode_table_s opcode_table[] = {
       { "%mul",    of_MUL,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%mul/wr", of_MUL_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%muli",   of_MULI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%nand",   of_NAND,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%nand",   of_NAND,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%nand/r", of_NANDR,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%new/cobj",  of_NEW_COBJ,  1, {OA_VPI_PTR,OA_NONE,  OA_NONE} },
       { "%new/darray",of_NEW_DARRAY,2, {OA_BIT1,   OA_STRING,OA_NONE} },
       { "%noop",   of_NOOP,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
-      { "%nor",    of_NOR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%nor",    of_NOR,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%nor/r",  of_NORR,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%null",   of_NULL,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%or",     of_OR,     0,  {OA_NONE,     OA_NONE,     OA_NONE} },
@@ -277,9 +277,9 @@ static const struct opcode_table_s opcode_table[] = {
       { "%test_nul",  of_TEST_NUL, 1,{OA_FUNC_PTR,OA_NONE,     OA_NONE} },
       { "%wait",   of_WAIT,   1,  {OA_FUNC_PTR, OA_NONE,     OA_NONE} },
       { "%wait/fork",of_WAIT_FORK,0,{OA_NONE,   OA_NONE,     OA_NONE} },
-      { "%xnor",   of_XNOR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%xnor",   of_XNOR,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%xnor/r", of_XNORR,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
-      { "%xor",    of_XOR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%xor",    of_XOR,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%xor/r",  of_XORR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { 0, of_NOOP, 0, {OA_NONE, OA_NONE, OA_NONE} }
 };
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index a19111d31..b5010c596 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -877,11 +877,10 @@ This instruction is the same as %mul, but the second operand is an
 immediate value that is padded to the width of the result.
 
 
-* %nand <dst>, <src>, <wid>
+* %nand
 
-Perform the bitwise NAND of the two vectors, and store the result in
-the left vector. Each bit is calculated independent of other bits. NAND
-means the following:
+Perform the bitwise NAND of two vec4 vectors, and push the result. Each
+bit is calculated independent of other bits. NAND means the following:
 
 	0 and ? --> 1
 	? and 0 --> 1
@@ -908,11 +907,11 @@ The supported types are:
 	 "r"        - real
 	 "S"        - SystemVerilog string
 
-* %nor <dst>, <src>, <wid>
+* %nor
 
-Perform the bitwise nor of the vectors. Each bit in the <dst> is
-combined with the corresponding bit in the source, according to the
-truth table:
+Perform the bitwise nor of two vec4 vectors, and push the result. Each
+bit in the source vectors is combined to make a result bit according to
+the truth table:
 
 	1 nor ? --> 0
 	? nor 1 --> 0
@@ -1351,10 +1350,11 @@ This instruction puts the current thread to sleep until all the detached
 children have finished executing. The last detached child is responsible
 for restarting the parent when it finishes.
 
-* %xnor <dst>, <src>, <wid>
+* %xnor
 
-This does a bitwise exclusive nor (~^) of the <src> and <dst> vector,
-and leaves the result in the <dst> vector. xnor is this:
+This instruction pops two vectors from the vec4 stack, does a bitwise
+exclusive nor (~^) of the vectors, and pushes the result. The truth
+table for the xnor is:
 
 	0 xnor 0 --> 1
 	0 xnor 1 --> 0
@@ -1363,16 +1363,17 @@ and leaves the result in the <dst> vector. xnor is this:
 	otherwise    x
 
 
-* %xor <dst>, <src>, <wid>
+* %xor
 
-This does a bitwise exclusive or (^) of the <src> and <dst> vector,
-and leaves the result in the <dst> vector. xor is this:
+This instruction pops two vectors from the vec4 stack, does a bitwise
+exclusive or (^) of the vectors, and pushes the result. The truth
+table for the xor is:
 
 	0 xor 0 --> 0
 	0 xor 1 --> 1
 	1 xor 0 --> 1
 	1 xor 1 --> 0
-	otherwise    x
+	otherwise   x
 
 
 /*
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 2ebbd4a11..459494944 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -4628,54 +4628,23 @@ bool of_MULI(vthread_t thr, vvp_code_t cp)
 #endif
       return true;
 }
-#if 0
-static bool of_NAND_wide(vthread_t thr, vvp_code_t cp)
+
+bool of_NAND(vthread_t thr, vvp_code_t)
 {
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned wid = cp->number;
-
-      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
-      val &= vthread_bits_to_vector(thr, idx2, wid);
-      thr->bits4.set_vec(idx1, ~val);
-
-      return true;
-}
-
-static bool of_NAND_narrow(vthread_t thr, vvp_code_t cp)
-{
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned wid = cp->number;
+      vvp_vector4_t valr = thr->pop_vec4();
+      vvp_vector4_t vall = thr->pop_vec4();
+      assert(vall.size() == valr.size());
+      unsigned wid = vall.size();
 
       for (unsigned idx = 0 ; idx < wid ; idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-	    thr_put_bit(thr, idx1, ~(lb&rb));
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
+	    vvp_bit4_t lb = vall.value(idx);
+	    vvp_bit4_t rb = valr.value(idx);
+	    vall.set_bit(idx, ~(lb&rb));
       }
 
+      thr->push_vec4(vall);
       return true;
 }
-#endif
-bool of_NAND(vthread_t thr, vvp_code_t cp)
-{
-#if 0
-      assert(cp->bit_idx[0] >= 4);
-
-      if (cp->number <= 4)
-	    cp->opcode = &of_NAND_narrow;
-      else
-	    cp->opcode = &of_NAND_wide;
-
-      return cp->opcode(thr, cp);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%nand ...\n");
-      return true;
-#endif
-}
 
 /*
  * %new/cobj <vpi_object>
@@ -4913,53 +4882,24 @@ bool of_OR(vthread_t thr, vvp_code_t)
       return true;
 }
 
-#if 0
-static bool of_NOR_wide(vthread_t thr, vvp_code_t cp)
-{
-      assert(cp->bit_idx[0] >= 4);
-
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned wid = cp->number;
-
-      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
-      val |= vthread_bits_to_vector(thr, idx2, wid);
-      thr->bits4.set_vec(idx1, ~val);
-
-      return true;
-}
-
-static bool of_NOR_narrow(vthread_t thr, vvp_code_t cp)
-{
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned wid = cp->number;
-
-      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-	    thr_put_bit(thr, idx1, ~(lb|rb));
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
-      }
-
-      return true;
-}
-#endif
+/*
+ * %nor
+ */
 bool of_NOR(vthread_t thr, vvp_code_t cp)
 {
-#if 0
-      if (cp->number <= 4)
-	    cp->opcode = &of_NOR_narrow;
-      else
-	    cp->opcode = &of_NOR_wide;
+      vvp_vector4_t valr = thr->pop_vec4();
+      vvp_vector4_t vall = thr->pop_vec4();
+      assert(vall.size() == valr.size());
+      unsigned wid = vall.size();
 
-      return cp->opcode(thr, cp);
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%nor ...\n");
+      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
+	    vvp_bit4_t lb = vall.value(idx);
+	    vvp_bit4_t rb = valr.value(idx);
+	    vall.set_bit(idx, ~(lb|rb));
+      }
+
+      thr->push_vec4(vall);
       return true;
-#endif
 }
 
 /*
@@ -6373,68 +6313,45 @@ bool of_WAIT_FORK(vthread_t thr, vvp_code_t)
       return false;
 }
 
-
-bool of_XNOR(vthread_t thr, vvp_code_t cp)
+/*
+ * %xnor
+ */
+bool of_XNOR(vthread_t thr, vvp_code_t)
 {
-#if 0
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t valr = thr->pop_vec4();
+      vvp_vector4_t vall = thr->pop_vec4();
+      assert(vall.size() == valr.size());
+      unsigned wid = vall.size();
 
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
+      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-	    thr_put_bit(thr, idx1, ~(lb ^ rb));
-
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
+	    vvp_bit4_t lb = vall.value(idx);
+	    vvp_bit4_t rb = valr.value(idx);
+	    vall.set_bit(idx, ~(lb ^ rb));
       }
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%xnor ...\n");
-#endif
+
+      thr->push_vec4(vall);
       return true;
 }
 
-
-bool of_XOR(vthread_t thr, vvp_code_t cp)
+/*
+ * %xor
+ */
+bool of_XOR(vthread_t thr, vvp_code_t)
 {
-#if 0
-      assert(cp->bit_idx[0] >= 4);
+      vvp_vector4_t valr = thr->pop_vec4();
+      vvp_vector4_t vall = thr->pop_vec4();
+      assert(vall.size() == valr.size());
+      unsigned wid = vall.size();
 
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
+      for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
-	    if ((lb == BIT4_1) && (rb == BIT4_1)) {
-		  thr_put_bit(thr, idx1, BIT4_0);
-
-	    } else if ((lb == BIT4_0) && (rb == BIT4_0)) {
-		  thr_put_bit(thr, idx1, BIT4_0);
-
-	    } else if ((lb == BIT4_1) && (rb == BIT4_0)) {
-		  thr_put_bit(thr, idx1, BIT4_1);
-
-	    } else if ((lb == BIT4_0) && (rb == BIT4_1)) {
-		  thr_put_bit(thr, idx1, BIT4_1);
-
-	    } else {
-		  thr_put_bit(thr, idx1, BIT4_X);
-	    }
-
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
+	    vvp_bit4_t lb = vall.value(idx);
+	    vvp_bit4_t rb = valr.value(idx);
+	    vall.set_bit(idx, lb ^ rb);
       }
-#else
-      fprintf(stderr, "XXXX NOT IMPLEMENTED: %%xor ...\n");
-#endif
+
+      thr->push_vec4(vall);
       return true;
 }