diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 2690e3c9d..f81864dd0 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -210,12 +210,7 @@ static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr,
 	    return vvp_vector4_t(thr->bits4, bit, wid);
 
       } else {
-	    vvp_vector4_t value(wid);
-	    vvp_bit4_t bit_val = thr_index_to_bit4[bit];
-	    for (unsigned idx = 0; idx < wid; idx +=1) {
-		  value.set_bit(idx, bit_val);
-	    }
-	    return value;
+	    return vvp_vector4_t(wid, thr_index_to_bit4[bit]);
       }
 }
 
@@ -2282,13 +2277,15 @@ bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
       if (word.size() != wid) {
 	    fprintf(stderr, "internal error: array width=%u, word.size()=%u, wid=%u\n",
 		    0, word.size(), wid);
+	    assert(word.size() == wid);
       }
-      assert(word.size() == wid);
 
-      for (unsigned idx = 0 ;  idx < wid ;  idx += 1, bit += 1) {
-	    vvp_bit4_t val = word.value(idx);
-	    thr_put_bit(thr, bit, val);
-      }
+	/* Check the address once, before we scan the vector. */
+      thr_check_addr(thr, bit+wid-1);
+
+	/* Copy the vector bits into the bits4 vector. Do the copy
+	   directly to skip the excess calls to thr_check_addr. */
+      thr->bits4.set_vec(bit, word);
 
       return true;
 }
@@ -3526,16 +3523,7 @@ bool of_SET_VEC(vthread_t thr, vvp_code_t cp)
 	/* set the value into port 0 of the destination. */
       vvp_net_ptr_t ptr (cp->net, 0);
 
-      if (bit >= 4) {
-	    vvp_vector4_t value(thr->bits4,bit,wid);
-	    vvp_send_vec4(ptr, value);
-
-      } else {
-	      /* Make a vector of the desired width. */
-	    vvp_bit4_t bit_val = thr_index_to_bit4[bit];
-	    vvp_vector4_t value(wid, bit_val);
-	    vvp_send_vec4(ptr, value);
-      }
+      vvp_send_vec4(ptr, vthread_bits_to_vector(thr, bit, wid));
 
       return true;
 }
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index 83d60cc12..5a4bd3313 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -202,10 +202,84 @@ void vvp_send_long_pv(vvp_net_ptr_t ptr, long val,
 
 void vvp_vector4_t::copy_bits(const vvp_vector4_t&that)
 {
-      unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
 
-      for (unsigned idx = 0; idx < bits_to_copy; idx += 1)
-	    set_bit(idx, that.value(idx));
+	/* Copy the low min(size_, that.size_) bits of "that" into this
+	   vector, whole words at a time. Higher bits are not changed. */
+      if (size_ == that.size_) {
+	    if (size_ > BITS_PER_WORD) {
+		  unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;
+		  for (unsigned idx = 0 ;  idx < words ;  idx += 1)
+			abits_ptr_[idx] = that.abits_ptr_[idx];
+		  for (unsigned idx = 0 ;  idx < words ;  idx += 1)
+			bbits_ptr_[idx] = that.bbits_ptr_[idx];
+	    } else {
+		  abits_val_ = that.abits_val_;
+		  bbits_val_ = that.bbits_val_;
+	    }
+	    return;
+      }
+
+	/* Now we know that the sizes of this and that are definitely
+	   different, so when both are short the smaller size is less
+	   than BITS_PER_WORD and (1UL << size) cannot overflow. */
+
+	/* If source and destination are both short, then mask/copy
+	   the bit values. */
+      if (size_ <= BITS_PER_WORD && that.size_ <= BITS_PER_WORD) {
+	    unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
+	    unsigned long mask = (1UL << bits_to_copy) - 1UL;
+	    abits_val_ &= ~mask;
+	    bbits_val_ &= ~mask;
+	    abits_val_ |= that.abits_val_&mask;
+	    bbits_val_ |= that.bbits_val_&mask;
+	    return;
+      }
+
+	/* Now we know that either source or destination is long. If
+	   the destination is short, then mask/copy from the low word
+	   of the long source. */
+      if (size_ <= BITS_PER_WORD) {
+	    abits_val_ = that.abits_ptr_[0];
+	    bbits_val_ = that.bbits_ptr_[0];
+	    if (size_ < BITS_PER_WORD) {
+		  unsigned long mask = (1UL << size_) - 1UL;
+		  abits_val_ &= mask;
+		  bbits_val_ &= mask;
+	    }
+	    return;
+      }
+
+	/* Now we know that the destination must be long. If the
+	   source is short, then mask/copy from its value. */
+      if (that.size_ <= BITS_PER_WORD) {
+	    unsigned long mask = -1UL;
+	    if (that.size_ < BITS_PER_WORD)
+		  mask = (1UL << that.size_) - 1UL;
+	      /* Always clear the masked bits before merging. A source
+		 of exactly BITS_PER_WORD bits must replace the whole
+		 low word, not be OR-ed into its old contents. */
+	    abits_ptr_[0] = (abits_ptr_[0] & ~mask) | (that.abits_val_ & mask);
+	    bbits_ptr_[0] = (bbits_ptr_[0] & ~mask) | (that.bbits_val_ & mask);
+	    return;
+      }
+
+	/* Finally, we know that source and destination are long. Copy
+	   words until we get to the last. */
+      unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
+      unsigned word = 0;
+      while (bits_to_copy >= BITS_PER_WORD) {
+	    abits_ptr_[word] = that.abits_ptr_[word];
+	    bbits_ptr_[word] = that.bbits_ptr_[word];
+	    bits_to_copy -= BITS_PER_WORD;
+	    word += 1;
+      }
+      if (bits_to_copy > 0) {
+	    unsigned long mask = (1UL << bits_to_copy) - 1UL;
+	    abits_ptr_[word] &= ~mask;
+	    bbits_ptr_[word] &= ~mask;
+	    abits_ptr_[word] |= that.abits_ptr_[word] & mask;
+	    bbits_ptr_[word] |= that.bbits_ptr_[word] & mask;
+      }
 }
 
 void vvp_vector4_t::copy_from_(const vvp_vector4_t&that)
@@ -283,10 +357,58 @@ vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that,
 		  dst += 1;
 	    }
 
-      } else {
-	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
-		  set_bit(idx, that.value(adr+idx));
+      } else if (that.size_ > BITS_PER_WORD) {
+	      /* In this case, the subvector fits in a single word,
+		 but the source is large. */
+	    unsigned ptr = adr / BITS_PER_WORD;
+	    unsigned long off = adr % BITS_PER_WORD;
+	    unsigned trans = BITS_PER_WORD - off;
+	    if (trans > wid)
+		  trans = wid;
+
+	    if (trans == BITS_PER_WORD) {
+		    // Very special case: Copy exactly 1 perfectly
+		    // aligned word.
+		  abits_val_ = that.abits_ptr_[ptr];
+		  bbits_val_ = that.bbits_ptr_[ptr];
+
+	    } else {
+		    // lmask is the low bits of the destination,
+		    // masked into the source.
+		  unsigned long lmask = (1UL<<trans) - 1UL;
+		  lmask <<= off;
+
+		    // The low bits of the result.
+		  abits_val_ = (that.abits_ptr_[ptr] & lmask) >> off;
+		  bbits_val_ = (that.bbits_ptr_[ptr] & lmask) >> off;
+
+		  if (trans < wid) {
+			  // If there are more bits, then get them
+			  // from the bottom of the next word of the
+			  // source.
+			unsigned long hmask = (1UL << (wid-trans)) - 1UL;
+
+			  // The high bits of the result.
+			abits_val_ |= (that.abits_ptr_[ptr+1]&hmask) << trans;
+			bbits_val_ |= (that.bbits_ptr_[ptr+1]&hmask) << trans;
+		  }
 	    }
+
+      } else if (size_ == BITS_PER_WORD) {
+	      /* We know that source and destination are short. If the
+		 destination is a full word, then we know the copy is
+		 aligned and complete. */
+	    abits_val_ = that.abits_val_;
+	    bbits_val_ = that.bbits_val_;
+
+      } else {
+	      /* Finally, the source and destination vectors are both
+		 short, so there is a single mask/shift/copy. */
+	    unsigned long mask = (1UL << size_) - 1UL;
+	    mask <<= adr;
+
+	    abits_val_ = (that.abits_val_ & mask) >> adr;
+	    bbits_val_ = (that.bbits_val_ & mask) >> adr;
       }
 
 }