diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 1fb496ebe..330a90642 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -5461,41 +5461,9 @@ bool of_STORE_VEC4A(vthread_t thr, vvp_code_t cp)
 bool of_SUB(vthread_t thr, vvp_code_t)
 {
       vvp_vector4_t r = thr->pop_vec4();
-      vvp_vector4_t l = thr->pop_vec4();
-
-      unsigned wid = l.size();
-      assert(wid == r.size());
-
-      unsigned long*lva = l.subarray(0,wid);
-      unsigned long*lvb = r.subarray(0,wid);
-      if (lva == 0 || lvb == 0)
-	    goto x_out;
-
-
-      unsigned long carry;
-      carry = 1;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < wid ;  idx += 1)
-	    lva[idx] = add_with_carry(lva[idx], ~lvb[idx], carry);
-
-
-	/* We know from the vector_to_array that the address is valid
-	   in the thr->bitr4 vector, so just do the set bit. */
-
-      l.setarray(0,wid,lva);
-      thr->push_vec4(l);
-
-      delete[]lva;
-      delete[]lvb;
-
-      return true;
-
- x_out:
-      delete[]lva;
-      delete[]lvb;
-
-      vvp_vector4_t tmp(wid, BIT4_X);
-      thr->push_vec4(tmp);
+      vvp_vector4_t&l = thr->peek_vec4(); // left operand by reference; presumably still on the thread's vec4 stack (confirm peek_vec4)
 
+      l.sub(r); // l = l - r, computed in place through the reference, so nothing is pushed back
       return true;
 }
 
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index 1b0f9ce45..9ea45d8c6 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -1480,6 +1480,66 @@ void vvp_vector4_t::add(const vvp_vector4_t&that)
       }
 }
 
+void vvp_vector4_t::sub(const vvp_vector4_t&that)
+{
+      assert(size_ == that.size_); // both operands must have the same width
+	// In-place subtract (*this -= that). A set bbits bit in either operand (presumably an X/Z bit) poisons the whole result.
+      if (size_ < BITS_PER_WORD) { // narrower than one word: work on the inline abits_val_/bbits_val_
+	    unsigned long mask = ~(-1UL << size_); // ones in the low size_ bit positions
+	    if ((bbits_val_|that.bbits_val_) & mask) { // a bbits bit inside the vector, in either operand
+		  abits_val_ |= mask; // set both planes for every bit of the vector;
+		  bbits_val_ |= mask; // bits above size_ are left as they were
+		  return;
+	    }
+	    // No bbits set: plain unsigned subtract, then drop any borrow that spilled above size_.
+	    abits_val_ -= that.abits_val_;
+	    abits_val_ &= mask;
+	    return;
+      }
+	// Exactly one word wide: same idea, but the vector fills the word so no mask is needed.
+      if (size_ == BITS_PER_WORD) {
+	    if (bbits_val_ | that.bbits_val_) {
+		  abits_val_ = WORD_X_ABITS;
+		  bbits_val_ = WORD_X_BBITS;
+	    } else {
+		  abits_val_ -= that.abits_val_; // unsigned wrap-around gives the modulo-word result
+	    }
+	    return;
+      }
+	// Wider than one word: compute this + ~that + 1 word by word through abits_ptr_/bbits_ptr_.
+      int cnt = size_ / BITS_PER_WORD; // number of completely filled words
+      unsigned long carry = 1; // initial carry-in supplies the "+ 1" of a + ~b + 1
+      for (int idx = 0 ; idx < cnt ; idx += 1) {
+	    if (bbits_ptr_[idx] | that.bbits_ptr_[idx]) // any bbits in this word: abandon the arithmetic
+		  goto x_out;
+	    // add_with_carry presumably returns the word sum and updates carry for the next word -- TODO confirm.
+	    abits_ptr_[idx] = add_with_carry(abits_ptr_[idx], ~that.abits_ptr_[idx], carry);
+      }
+	// Leftover bits that do not fill a whole word are held in word cnt.
+      if (unsigned tail = size_ % BITS_PER_WORD) {
+	    unsigned long mask = ~( -1UL << tail ); // ones in the low tail bit positions
+	    if ((bbits_ptr_[cnt] | that.bbits_ptr_[cnt])&mask) // only bbits inside the vector count
+		  goto x_out;
+	    // The complement sets the bits above tail as well; the mask afterwards clears them again.
+	    abits_ptr_[cnt] = add_with_carry(abits_ptr_[cnt], ~that.abits_ptr_[cnt], carry);
+	    abits_ptr_[cnt] &= mask;
+      }
+
+      return;
+	// Some bbits were set: overwrite every word, including any already subtracted by the loop above.
+ x_out:
+      for (int idx = 0 ; idx < cnt ; idx += 1) {
+	    abits_ptr_[idx] = WORD_X_ABITS;
+	    bbits_ptr_[idx] = WORD_X_BBITS;
+      }
+      if (unsigned tail = size_%BITS_PER_WORD) {
+	    unsigned long mask = ~( -1UL << tail );
+	    abits_ptr_[cnt] = WORD_X_ABITS&mask; // partial last word: fill the low tail bits,
+	    bbits_ptr_[cnt] = WORD_X_BBITS&mask; // leaving the bits above them zero
+      }
+
+}
+
 void vvp_vector4_t::mov(unsigned dst, unsigned src, unsigned cnt)
 {
       assert(dst+cnt <= size_);
diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h
index a1292cf9b..d86f3a1ba 100644
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@@ -296,6 +296,9 @@ class vvp_vector4_t {
 	// Add that to this in the Verilog way.
       void add(const vvp_vector4_t&that);
 
+	// Subtract that from this in the Verilog way. Sizes must match; the result replaces this.
+      void sub(const vvp_vector4_t&that);
+
 	// Multiply this by that in the Verilog way.
       void mul(const vvp_vector4_t&that);