From 0601b4e43bd9e9328e0bae7da4e691e9d12dce36 Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Fri, 14 Nov 2014 14:41:04 -0800
Subject: [PATCH] Optimizations for %pushi/vec4 and %load/vec4

- Have %pushi/vec4 handle some special cases optimally.
- Eliminate some duplicated method calls in %load/vec4.
- Optimize the vvp_vector4_t::copy_from_ method by inlining
some parts.
---
 vvp/vthread.cc | 114 ++++++++++++++++++++++++++-----------------------
 vvp/vvp_net.cc |  27 ++++++------
 vvp/vvp_net.h  |  13 +++++-
 3 files changed, 84 insertions(+), 70 deletions(-)
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index bb7356807..21bb765ac 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -3345,34 +3345,6 @@ static void load_base(vvp_code_t cp, vvp_vector4_t&dst)
       sig->vec4_value(dst);
 }
 
-#if 0
-bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
-{
-      unsigned bit = cp->bit_idx[0];
-      unsigned wid = cp->bit_idx[1];
-
-      vvp_vector4_t sig_value;
-      load_base(cp, sig_value);
-
-	/* Check the address once, before we scan the vector. */
-      thr_check_addr(thr, bit+wid-1);
-
-      if (sig_value.size() > wid)
-	    sig_value.resize(wid);
-
-	/* Copy the vector bits into the bits4 vector. Do the copy
-	   directly to skip the excess calls to thr_check_addr. */
-      thr->bits4.set_vec(bit, sig_value);
-
-	/* If the source is shorter than the desired width, then pad
-	   with BIT4_X values. */
-      for (unsigned idx = sig_value.size() ; idx < wid ; idx += 1)
-	    thr->bits4.set_bit(bit+idx, BIT4_X);
-
-      return true;
-}
-#endif
-
 /*
  * %load/vec4 <net>
  */
@@ -4423,35 +4395,57 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp)
       uint32_t valb = cp->bit_idx[1];
       unsigned wid  = cp->number;
 
+	// I expect that most of the bits of an immediate value are
+	// going to be zero, so start the result vector with all zero
+	// bits. Then we only need to replace the bits that are different.
       vvp_vector4_t val (wid, BIT4_0);
+
+	// Special case: Immediate zero is super easy.
+      if (vala==0 && valb==0) {
+	    thr->push_vec4(val);
+	    return true;
+      }
+
+	// Special case: If the value is defined (no X or Z) and fits
+	// in an unsigned long, then use the setarray method to write
+	// the value all in one shot.
+      if ((valb==0) && (wid <= 8*sizeof(unsigned long))) {
+	    unsigned long tmp = vala;
+	    val.setarray(0, wid, &tmp);
+	    thr->push_vec4(val);
+	    return true;
+      }
+
+	// The %pushi/vec4 can create values bigger than 32 bits, but
+	// only if the high bits are zero. So at most we need to run
+	// through the loop below 32 times. Maybe less, if the target
+	// width is less. We don't have to do anything special on that
+	// because vala/valb bits will shift away so (vala|valb) will
+	// turn to zero at or before 32 shifts.
+
       for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
 	    uint32_t ba = 0;
-	      // If the requested width is /32, then there are no
-	      // actual immediate bits, but we can pad with zero. So
-	      // here we test if we are still working on he LSB, and
-	      // process them if so.
-	    if (idx < 32) {
-		  ba = (valb & 1) << 1;
-		  ba |= vala & 1;
-	    }
-	    vala >>= 1;
-	    valb >>= 1;
-	    if (ba == 0) continue;
-	    vvp_bit4_t use_bit = BIT4_0;
+	      // Convert the vala/valb bits to a ba number that can be
+	      // used to select what goes into the value.
+	    ba = (valb & 1) << 1;
+	    ba |= vala & 1;
+
 	    switch (ba) {
 		case 1:
-		  use_bit = BIT4_1;
+		  val.set_bit(idx, BIT4_1);
 		  break;
 		case 2:
-		  use_bit = BIT4_Z;
+		  val.set_bit(idx, BIT4_Z);
 		  break;
 		case 3:
-		  use_bit = BIT4_X;
+		  val.set_bit(idx, BIT4_X);
 		  break;
 		default:
 		  break;
 	    }
-	    val.set_bit(idx, use_bit);
+
+	    vala >>= 1;
+	    valb >>= 1;
       }
 
       thr->push_vec4(val);
@@ -5281,6 +5275,17 @@ bool of_STORE_STRA(vthread_t thr, vvp_code_t cp)
 /*
  * %store/vec4 <var-label>, <offset>, <wid>
  *
+ * <offset> is the index register that contains the base offset into
+ * the destination. If zero, the offset of 0 is used instead of index
+ * register zero. The offset value is SIGNED, and can be negative.
+ *
+ * <wid> is the actual width, an unsigned number.
+ *
+ * This function tests flag bit 4. If that flag is set, and <offset>
+ * is an actual index register (not zero) then this assumes that the
+ * calculation of the <offset> contents failed, and the store is
+ * aborted.
+ *
  * NOTE: This instruction may loose the <wid> argument because it is
  * not consistent with the %store/vec4/<etc> instructions which have
  * no <wid>.
@@ -5290,18 +5295,19 @@ bool of_STORE_VEC4(vthread_t thr, vvp_code_t cp)
       vvp_net_ptr_t ptr(cp->net, 0);
       vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (cp->net->fil);
       unsigned off_index = cp->bit_idx[0];
-      unsigned wid = cp->bit_idx[1];
+      int wid = cp->bit_idx[1];
 
       int off = off_index? thr->words[off_index].w_int : 0;
+      const int sig_value_size = sig->value_size();
 
       vvp_vector4_t val = thr->pop_vec4();
 
-      if (val.size() < wid) {
+      if (val.size() < (unsigned)wid) {
 	    cerr << "XXXX Internal error: val.size()=" << val.size()
 		 << ", expecting >= " << wid << endl;
       }
-      assert(val.size() >= wid);
-      if (val.size() > wid)
+      assert(val.size() >= (unsigned)wid);
+      if (val.size() > (unsigned)wid)
 	    val.resize(wid);
 
 	// If there is a problem loading the index register, flags-4
@@ -5309,9 +5315,9 @@ bool of_STORE_VEC4(vthread_t thr, vvp_code_t cp)
       if (off_index!=0 && thr->flags[4] == BIT4_1)
 	    return true;
 
-      if (off <= -(int)wid)
+      if (off <= -wid)
 	    return true;
-      if (off >= (int)sig->value_size())
+      if (off >= sig_value_size)
 	    return true;
 
 	// If the index is below the vector, then only assign the high
@@ -5325,17 +5331,17 @@ bool of_STORE_VEC4(vthread_t thr, vvp_code_t cp)
 
 	// If the value is partly above the target, then only assign
 	// the bits that overlap.
-      if ((off+wid) > sig->value_size()) {
-	    wid = sig->value_size()-off;
+      if ((off+wid) > sig_value_size) {
+	    wid = sig_value_size - off;
 	    val = val.subvalue(0, wid);
 	    val.resize(wid);
       }
 
 
-      if (off==0 && val.size()==sig->value_size())
+      if (off==0 && val.size()==(unsigned)sig_value_size)
 	    vvp_send_vec4(ptr, val, thr->wt_context);
       else
-	    vvp_send_vec4_pv(ptr, val, off, wid, sig->value_size(), thr->wt_context);
+	    vvp_send_vec4_pv(ptr, val, off, wid, sig_value_size, thr->wt_context);
 
       return true;
 }
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index afa44efbe..5ae18616d 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -643,23 +643,20 @@ void vvp_vector4_t::copy_bits(const vvp_vector4_t&that)
       }
 }
 
-void vvp_vector4_t::copy_from_(const vvp_vector4_t&that)
+/*
+ * This function should ONLY BE CALLED FROM vvp_vector4_t::copy_from_,
+ * as it performs part of that function's tasks.
+ */
+void vvp_vector4_t::copy_from_big_(const vvp_vector4_t&that)
 {
-      size_ = that.size_;
-      if (size_ > BITS_PER_WORD) {
-	    unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;
-	    abits_ptr_ = new unsigned long[2*words];
-	    bbits_ptr_ = abits_ptr_ + words;
+      unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;
+      abits_ptr_ = new unsigned long[2*words];
+      bbits_ptr_ = abits_ptr_ + words;
 
-	    for (unsigned idx = 0 ;  idx < words ;  idx += 1)
-		  abits_ptr_[idx] = that.abits_ptr_[idx];
-	    for (unsigned idx = 0 ;  idx < words ;  idx += 1)
-		  bbits_ptr_[idx] = that.bbits_ptr_[idx];
-
-      } else {
-	    abits_val_ = that.abits_val_;
-	    bbits_val_ = that.bbits_val_;
-      }
+      for (unsigned idx = 0 ;  idx < words ;  idx += 1)
+	    abits_ptr_[idx] = that.abits_ptr_[idx];
+      for (unsigned idx = 0 ;  idx < words ;  idx += 1)
+	    bbits_ptr_[idx] = that.bbits_ptr_[idx];
 }
 
 /*
diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h
index e2a37d26a..b204dcd40 100644
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@@ -244,7 +244,7 @@ class vvp_vector4_t {
 
       ~vvp_vector4_t();
 
-      unsigned size() const { return size_; }
+      inline unsigned size() const { return size_; }
       void resize(unsigned new_size);
 
 	// Get the bit at the specified address
@@ -317,6 +317,7 @@ class vvp_vector4_t {
 	// Initialize and operator= use this private method to copy
 	// the data from that object into this object.
       void copy_from_(const vvp_vector4_t&that);
+      void copy_from_big_(const vvp_vector4_t&that);
       void copy_inverted_from_(const vvp_vector4_t&that);
 
       void allocate_words_(unsigned long inita, unsigned long initb);
@@ -396,6 +397,16 @@ inline vvp_vector4_t& vvp_vector4_t::operator= (const vvp_vector4_t&that)
       return *this;
 }
 
+inline void vvp_vector4_t::copy_from_(const vvp_vector4_t&that)
+{
+      size_ = that.size_;
+      if (size_ <= BITS_PER_WORD) {
+	    abits_val_ = that.abits_val_;
+	    bbits_val_ = that.bbits_val_;
+      } else {
+	    copy_from_big_(that);
+      }
+}
 
 inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const
 {