Optimize vvp_vector4 vector handling.
Improve the vvp_vector4_t methods copy_bits and the part-selecting constructor to make better use of vector words. Eliminate bit-by-bit processing in these methods to take advantage of host processor words. Improve vthread_bits_to_vector to use these improved methods, and update the %load/av and %set/v instructions to take advantage of these changes.
This commit is contained in:
parent
d2106a3d3a
commit
492b240304
|
|
@ -210,12 +210,7 @@ static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr,
|
|||
return vvp_vector4_t(thr->bits4, bit, wid);
|
||||
|
||||
} else {
|
||||
vvp_vector4_t value(wid);
|
||||
vvp_bit4_t bit_val = thr_index_to_bit4[bit];
|
||||
for (unsigned idx = 0; idx < wid; idx +=1) {
|
||||
value.set_bit(idx, bit_val);
|
||||
}
|
||||
return value;
|
||||
return vvp_vector4_t(wid, thr_index_to_bit4[bit]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2282,13 +2277,15 @@ bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
|
|||
if (word.size() != wid) {
|
||||
fprintf(stderr, "internal error: array width=%u, word.size()=%u, wid=%u\n",
|
||||
0, word.size(), wid);
|
||||
assert(word.size() == wid);
|
||||
}
|
||||
assert(word.size() == wid);
|
||||
|
||||
for (unsigned idx = 0 ; idx < wid ; idx += 1, bit += 1) {
|
||||
vvp_bit4_t val = word.value(idx);
|
||||
thr_put_bit(thr, bit, val);
|
||||
}
|
||||
/* Check the address once, before we scan the vector. */
|
||||
thr_check_addr(thr, bit+wid-1);
|
||||
|
||||
/* Copy the vector bits into the bits4 vector. Do the copy
|
||||
directly to skip the excess calls to thr_check_addr. */
|
||||
thr->bits4.set_vec(bit, word);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
@ -3526,16 +3523,7 @@ bool of_SET_VEC(vthread_t thr, vvp_code_t cp)
|
|||
/* set the value into port 0 of the destination. */
|
||||
vvp_net_ptr_t ptr (cp->net, 0);
|
||||
|
||||
if (bit >= 4) {
|
||||
vvp_vector4_t value(thr->bits4,bit,wid);
|
||||
vvp_send_vec4(ptr, value);
|
||||
|
||||
} else {
|
||||
/* Make a vector of the desired width. */
|
||||
vvp_bit4_t bit_val = thr_index_to_bit4[bit];
|
||||
vvp_vector4_t value(wid, bit_val);
|
||||
vvp_send_vec4(ptr, value);
|
||||
}
|
||||
vvp_send_vec4(ptr, vthread_bits_to_vector(thr, bit, wid));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
134
vvp/vvp_net.cc
134
vvp/vvp_net.cc
|
|
@ -202,10 +202,84 @@ void vvp_send_long_pv(vvp_net_ptr_t ptr, long val,
|
|||
|
||||
void vvp_vector4_t::copy_bits(const vvp_vector4_t&that)
|
||||
{
|
||||
unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
|
||||
|
||||
for (unsigned idx = 0; idx < bits_to_copy; idx += 1)
|
||||
set_bit(idx, that.value(idx));
|
||||
if (size_ == that.size_) {
|
||||
if (size_ > BITS_PER_WORD) {
|
||||
unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;
|
||||
for (unsigned idx = 0 ; idx < words ; idx += 1)
|
||||
abits_ptr_[idx] = that.abits_ptr_[idx];
|
||||
for (unsigned idx = 0 ; idx < words ; idx += 1)
|
||||
bbits_ptr_[idx] = that.bbits_ptr_[idx];
|
||||
} else {
|
||||
abits_val_ = that.abits_val_;
|
||||
bbits_val_ = that.bbits_val_;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Now we know that the sizes of this and that are definitely
|
||||
different. We can use that in code below. In any case, we
|
||||
need to copy only the smaller of the sizes. */
|
||||
|
||||
/* If source and destination are both short, then mask/copy
|
||||
the bit values. */
|
||||
if (size_ <= BITS_PER_WORD && that.size_ <= BITS_PER_WORD) {
|
||||
unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
|
||||
unsigned long mask = (1UL << bits_to_copy) - 1UL;
|
||||
abits_val_ &= ~mask;
|
||||
bbits_val_ &= ~mask;
|
||||
abits_val_ |= that.abits_val_&mask;
|
||||
bbits_val_ |= that.bbits_val_&mask;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Now we know that either source or destination are long. If
|
||||
the destination is short, then mask/copy from the low word
|
||||
of the long source. */
|
||||
if (size_ <= BITS_PER_WORD) {
|
||||
abits_val_ = that.abits_ptr_[0];
|
||||
bbits_val_ = that.bbits_ptr_[0];
|
||||
if (size_ < BITS_PER_WORD) {
|
||||
unsigned long mask = (1UL << size_) - 1UL;
|
||||
abits_val_ &= mask;
|
||||
bbits_val_ &= mask;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Now we know that the destination must be long. If the
|
||||
source is short, then mask/copy from its value. */
|
||||
if (that.size_ <= BITS_PER_WORD) {
|
||||
unsigned long mask;
|
||||
if (that.size_ < BITS_PER_WORD) {
|
||||
mask = (1UL << that.size_) - 1UL;
|
||||
abits_ptr_[0] &= ~mask;
|
||||
bbits_ptr_[0] &= ~mask;
|
||||
} else {
|
||||
mask = -1UL;
|
||||
}
|
||||
abits_ptr_[0] |= that.abits_val_&mask;
|
||||
bbits_ptr_[0] |= that.bbits_val_&mask;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Finally, we know that source and destination are long. copy
|
||||
words until we get to the last. */
|
||||
unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
|
||||
unsigned word = 0;
|
||||
while (bits_to_copy >= BITS_PER_WORD) {
|
||||
abits_ptr_[word] = that.abits_ptr_[word];
|
||||
bbits_ptr_[word] = that.bbits_ptr_[word];
|
||||
bits_to_copy -= BITS_PER_WORD;
|
||||
word += 1;
|
||||
}
|
||||
if (bits_to_copy > 0) {
|
||||
unsigned long mask = (1UL << bits_to_copy) - 1UL;
|
||||
abits_ptr_[word] &= ~mask;
|
||||
bbits_ptr_[word] &= ~mask;
|
||||
abits_ptr_[word] |= that.abits_ptr_[word] & mask;
|
||||
bbits_ptr_[word] |= that.bbits_ptr_[word] & mask;
|
||||
}
|
||||
}
|
||||
|
||||
void vvp_vector4_t::copy_from_(const vvp_vector4_t&that)
|
||||
|
|
@ -283,10 +357,58 @@ vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that,
|
|||
dst += 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
|
||||
set_bit(idx, that.value(adr+idx));
|
||||
} else if (that.size_ > BITS_PER_WORD) {
|
||||
/* In this case, the subvector fits in a single word,
|
||||
but the source is large. */
|
||||
unsigned ptr = adr / BITS_PER_WORD;
|
||||
unsigned long off = adr % BITS_PER_WORD;
|
||||
unsigned trans = BITS_PER_WORD - off;
|
||||
if (trans > wid)
|
||||
trans = wid;
|
||||
|
||||
if (trans == BITS_PER_WORD) {
|
||||
// Very special case: Copy exactly 1 perfectly
|
||||
// aligned word.
|
||||
abits_val_ = that.abits_ptr_[ptr];
|
||||
bbits_val_ = that.bbits_ptr_[ptr];
|
||||
|
||||
} else {
|
||||
// lmask is the low bits of the destination,
|
||||
// masked into the source.
|
||||
unsigned long lmask = (1UL<<trans) - 1UL;
|
||||
lmask <<= off;
|
||||
|
||||
// The low bits of the result.
|
||||
abits_val_ = (that.abits_ptr_[ptr] & lmask) >> off;
|
||||
bbits_val_ = (that.bbits_ptr_[ptr] & lmask) >> off;
|
||||
|
||||
if (trans < wid) {
|
||||
// If there are more bits, then get them
|
||||
// from the bottom of the next word of the
|
||||
// source.
|
||||
unsigned long hmask = (1UL << (wid-trans)) - 1UL;
|
||||
|
||||
// The high bits of the result.
|
||||
abits_val_ |= (that.abits_ptr_[ptr+1]&hmask) << trans;
|
||||
bbits_val_ |= (that.bbits_ptr_[ptr+1]&hmask) << trans;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (size_ == BITS_PER_WORD) {
|
||||
/* We know that source and destination are short. If the
|
||||
destination is a full word, then we know the copy is
|
||||
aligned and complete. */
|
||||
abits_val_ = that.abits_val_;
|
||||
bbits_val_ = that.bbits_val_;
|
||||
|
||||
} else {
|
||||
/* Finally, the source and destination vectors are both
|
||||
short, so there is a single mask/shift/copy. */
|
||||
unsigned long mask = (1UL << size_) - 1UL;
|
||||
mask <<= adr;
|
||||
|
||||
abits_val_ = (that.abits_val_ & mask) >> adr;
|
||||
bbits_val_ = (that.bbits_val_ & mask) >> adr;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue