diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 21bb765ac..9dab1c384 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -132,6 +132,12 @@ struct vthread_s {
 	    unsigned use_index = stack_vec4_.size()-1-depth;
 	    return stack_vec4_[use_index];
       }
+      inline vvp_vector4_t& peek_vec4(void) // no-argument form: refers to the top entry of the vec4 stack
+      {
+	    assert(! stack_vec4_.empty()); // there must be an entry to refer to
+	    unsigned use_index = stack_vec4_.size()-1; // index of the last (top) element
+	    return stack_vec4_[use_index]; // returned by reference, so the caller can edit the entry in place
+      }
       inline void pop_vec4(unsigned cnt)
       {
 	    while (cnt > 0) {
@@ -894,10 +900,23 @@ bool of_AND(vthread_t thr, vvp_code_t)
       return true;
 }
 
+/*
+ * %add
+ *
+ * Pop r,
+ * Pop l,
+ * Push l+r
+ *
+ * Pop 2 and push 1 is the same as pop 1 and replace the remaining top
+ * of the stack with a new value. That is what we will do.
+ */
 bool of_ADD(vthread_t thr, vvp_code_t)
 {
       vvp_vector4_t r = thr->pop_vec4();
-      vvp_vector4_t l = thr->pop_vec4();
+	// Rather than pop l, use it directly from the stack. When we
+	// assign to 'l', that will edit the top of the stack, which
+	// replaces a pop and a push.
+      vvp_vector4_t&l = thr->peek_vec4();
 
       unsigned wid = l.size();
       assert(wid == r.size());
@@ -914,8 +933,6 @@ bool of_ADD(vthread_t thr, vvp_code_t)
 
       l.setarray(0,wid,lva);
 
-      thr->push_vec4(l);
-
       delete[]lva;
       delete[]lvb;
       return true;
@@ -925,7 +942,7 @@ bool of_ADD(vthread_t thr, vvp_code_t)
       delete[]lvb;
 
       vvp_vector4_t tmp (wid, BIT4_X);
-      thr->push_vec4(tmp);
+      l = tmp;
       return true;
 }
 
@@ -1504,8 +1521,11 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
       vvp_bit4_t eeq = BIT4_1;
       vvp_bit4_t lt  = BIT4_0;
 
-      vvp_vector4_t rval = thr->pop_vec4();
-      vvp_vector4_t lval = thr->pop_vec4();
+	// We are going to pop these and push nothing in their
+	// place, but for now it is more efficient to use a constant
+	// reference. When we finish, pop the stack without copies.
+      const vvp_vector4_t&rval = thr->peek_vec4(0);
+      const vvp_vector4_t&lval = thr->peek_vec4(1);
 
       assert(rval.size() == lval.size());
 
@@ -1516,6 +1536,7 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
 	    thr->flags[4] = BIT4_X; // eq
 	    thr->flags[5] = BIT4_X; // lt
 	    thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
+	    thr->pop_vec4(2);
 	    return true;
       }
 
@@ -1559,6 +1580,7 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
       thr->flags[5] = lt;
       thr->flags[6] = eeq;
 
+      thr->pop_vec4(2);
       return true;
 }
 
@@ -2736,10 +2758,17 @@ bool of_FREE(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/*
+ * %inv
+ *
+ * Logically, this pops a value, inverts it (Verilog style, with Z and
+ * X converted to X) and pushes the result. We can more efficiently
+ * just do the invert in place.
+ */
 bool of_INV(vthread_t thr, vvp_code_t)
 {
-      vvp_vector4_t val = thr->pop_vec4();
-      thr->push_vec4(~val);
+      vvp_vector4_t&val = thr->peek_vec4(); // reference to the top of the vec4 stack, not a copy
+      val.invert(); // modifies the stack entry itself, so no pop/push pair is needed
       return true;
 }
 
@@ -3419,7 +3448,7 @@ bool of_LOAD_VP0_S(vthread_t thr, vvp_code_t cp)
 }
 
 static void do_verylong_mod(vthread_t thr,
-			    const vvp_vector4_t&vala, const vvp_vector4_t&valb,
+			    vvp_vector4_t&vala, const vvp_vector4_t&valb,
 			    bool left_is_neg, bool right_is_neg)
 {
       bool out_is_neg = left_is_neg;
@@ -3447,7 +3476,7 @@ static void do_verylong_mod(vthread_t thr,
 		  delete []z;
 		  delete []a;
 		  vvp_vector4_t tmp(len, BIT4_X);
-		  thr->push_vec4(tmp);
+		  vala = tmp;
 		  return;
 	    }
 
@@ -3489,7 +3518,7 @@ static void do_verylong_mod(vthread_t thr,
 		  delete []z;
 		  delete []a;
 		  vvp_vector4_t tmpx (len, BIT4_X);
-		  thr->push_vec4(tmpx);
+		  vala = tmpx;
 		  return;
 	    }
 
@@ -3529,7 +3558,7 @@ static void do_verylong_mod(vthread_t thr,
 	    }
 	    tmp.set_bit(idx, ob?BIT4_1:BIT4_0);
       }
-      thr->push_vec4(tmp);
+      vala = tmp;
       delete []t;
       delete []z;
       delete []a;
@@ -3568,7 +3597,7 @@ bool of_MIN_WR(vthread_t thr, vvp_code_t)
 bool of_MOD(vthread_t thr, vvp_code_t)
 {
       vvp_vector4_t valb = thr->pop_vec4();
-      vvp_vector4_t vala = thr->pop_vec4();
+      vvp_vector4_t&vala = thr->peek_vec4();
 
       assert(vala.size()==valb.size());
       unsigned wid = vala.size();
@@ -3596,7 +3625,6 @@ bool of_MOD(vthread_t thr, vvp_code_t)
 		  vala.set_bit(idx, (lv&1)?BIT4_1 : BIT4_0);
 		  lv >>= 1;
 	    }
-	    thr->push_vec4(vala);
 
 	    return true;
 
@@ -3606,8 +3634,7 @@ bool of_MOD(vthread_t thr, vvp_code_t)
       }
 
  x_out:
-      vvp_vector4_t tmp (wid, BIT4_X);
-      thr->push_vec4(tmp);
+      vala = vvp_vector4_t(wid, BIT4_X);
       return true;
 }
 
@@ -3617,7 +3644,7 @@ bool of_MOD(vthread_t thr, vvp_code_t)
 bool of_MOD_S(vthread_t thr, vvp_code_t)
 {
       vvp_vector4_t valb = thr->pop_vec4();
-      vvp_vector4_t vala = thr->pop_vec4();
+      vvp_vector4_t&vala = thr->peek_vec4();
 
       assert(vala.size()==valb.size());
       unsigned wid = vala.size();
@@ -3655,7 +3682,9 @@ bool of_MOD_S(vthread_t thr, vvp_code_t)
 		  vala.set_bit(idx, (lv&1)? BIT4_1 : BIT4_0);
 		  lv >>= 1;
 	    }
-	    thr->push_vec4(vala);
+
+	      // vala is the top of the stack, edited in place, so we
+	      // do not need to push the result.
 
 	    return true;
 
@@ -3668,8 +3697,7 @@ bool of_MOD_S(vthread_t thr, vvp_code_t)
       }
 
  x_out:
-      vvp_vector4_t tmp (wid, BIT4_X);
-      thr->push_vec4(tmp);
+      vala = vvp_vector4_t(wid, BIT4_X);
       return true;
 }