Optimize the %add and %addi instructions

Tightly integrate with the vvp_vector4_t class to get much
better add performance.
This commit is contained in:
Stephen Williams 2014-12-04 12:38:08 -08:00
parent 86139c855d
commit 46ce236cfb
3 changed files with 103 additions and 48 deletions

View File

@ -842,60 +842,20 @@ static void get_immediate_rval(vvp_code_t cp, vvp_vector4_t&val)
for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
uint32_t ba = 0;
// Convert the vala/valb bits to a ba number that can be
// used to select what goes into the value.
// Convert the vala/valb bits to a ba number that
// matches the encoding of the vvp_bit4_t enumeration.
ba = (valb & 1) << 1;
ba |= vala & 1;
switch (ba) {
case 1:
val.set_bit(idx, BIT4_1);
break;
case 2:
val.set_bit(idx, BIT4_Z);
break;
case 3:
val.set_bit(idx, BIT4_X);
break;
default:
break;
}
// Note that the val is already pre-filled with BIT4_0
// bits, so we only need to set non-zero bit values.
if (ba) val.set_bit(idx, (vvp_bit4_t)ba);
vala >>= 1;
valb >>= 1;
}
}
/*
 * Add the right operand into the left operand, Verilog style: if
 * either operand carries any X/Z bits (signalled by subarray()
 * returning nil), the whole result collapses to all-X. The return
 * value is always true; the bool is kept for the opcode interface.
 */
static bool do_ADD(vvp_vector4_t&l, const vvp_vector4_t&r)
{
      const unsigned wid = l.size();
      assert(wid == r.size());

      // subarray() hands back heap copies of the value words, or nil
      // if the operand contains X/Z bits.
      unsigned long*lval = l.subarray(0,wid);
      unsigned long*rval = r.subarray(0,wid);

      if (lval && rval) {
	    // Both operands are fully defined: do wide binary
	    // addition word by word, propagating the carry.
	    unsigned long carry = 0;
	    for (unsigned idx = 0 ; idx*CPU_WORD_BITS < wid ; idx += 1)
		  lval[idx] = add_with_carry(lval[idx], rval[idx], carry);
	    l.setarray(0,wid,lval);
      } else {
	    // X/Z somewhere in an operand: the entire result is X.
	    vvp_vector4_t xres (wid, BIT4_X);
	    l = xres;
      }

      delete[]lval;
      delete[]rval;
      return true;
}
/*
* %add
*
@ -914,7 +874,9 @@ bool of_ADD(vthread_t thr, vvp_code_t)
// replaces a pop and a pull.
vvp_vector4_t&l = thr->peek_vec4();
return do_ADD(l, r);
l.add(r);
return true;
}
/*
@ -935,7 +897,9 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp)
vvp_vector4_t r (wid, BIT4_0);
get_immediate_rval (cp, r);
return do_ADD(l, r);
l.add(r);
return true;
}
bool of_ADD_WR(vthread_t thr, vvp_code_t)
@ -1184,7 +1148,7 @@ bool of_ASSIGN_VEC4_OFF_E(vthread_t thr, vvp_code_t cp)
return true;
int use_off = -off;
assert(wid > use_off);
assert((int)wid > use_off);
unsigned use_wid = wid - use_off;
val = val.subvalue(use_off, use_wid);
off = 0;

View File

@ -510,6 +510,25 @@ int edge(vvp_bit4_t from, vvp_bit4_t to)
return 0;
}
/*
 * Helper for the wide-addition instructions that operate on arrays
 * of unsigned long: add two words plus an incoming carry, and leave
 * the outgoing carry (0 or 1) in the by-reference carry argument.
 */
static inline unsigned long add_with_carry(unsigned long a, unsigned long b,
                                           unsigned long&carry)
{
      // Fold the incoming carry into b first, then add a. Unsigned
      // arithmetic wraps, so each addition overflowed exactly when
      // its result compares less than one of its inputs.
      const unsigned long partial = b + carry;
      const unsigned long sum = a + partial;
      carry = ((partial < b) || (sum < a))? 1 : 0;
      return sum;
}
void vvp_send_vec8(vvp_net_ptr_t ptr, const vvp_vector8_t&val)
{
while (vvp_net_t*cur = ptr.ptr()) {
@ -1366,6 +1385,72 @@ bool vvp_vector4_t::set_vec(unsigned adr, const vvp_vector4_t&that)
return diff_flag;
}
/*
 * Add that vector to this vector. Do it in the Verilog way, which
 * means if we detect any X or Z bits, change the entire result to
 * all X. Any carry out of the most significant bit is dropped.
 *
 * Assume both vectors are the same size.
 *
 * This relies on the vvp_vector4_t bit encoding: each bit is an
 * (abit, bbit) pair and the bbit is set exactly for X/Z values, so a
 * nonzero bbits word is the X/Z test, and for fully-defined values
 * the abits words hold the plain binary value that can be added
 * directly with machine arithmetic.
 */
void vvp_vector4_t::add(const vvp_vector4_t&that)
{
assert(size_ == that.size_);
// Case 1: narrower than one word, value stored inline in
// abits_val_/bbits_val_ with only the low size_ bits meaningful.
if (size_ < BITS_PER_WORD) {
unsigned long mask = ~(-1UL << size_);
if ((bbits_val_|that.bbits_val_) & mask) {
// X/Z present: force every bit to X (abit=1, bbit=1).
abits_val_ |= mask;
bbits_val_ |= mask;
return;
}
// Plain binary add, then trim the carry-out/overflow bits.
abits_val_ += that.abits_val_;
abits_val_ &= mask;
return;
}
// Case 2: exactly one word, stored inline; no masking needed
// because the word is fully used.
if (size_ == BITS_PER_WORD) {
if (bbits_val_ | that.bbits_val_) {
abits_val_ = WORD_X_ABITS;
bbits_val_ = WORD_X_BBITS;
} else {
abits_val_ += that.abits_val_;
}
return;
}
// Case 3: wide value stored in heap arrays. Add the full words
// with carry propagation, checking each word pair for X/Z first.
int cnt = size_ / BITS_PER_WORD;
unsigned long carry = 0;
for (int idx = 0 ; idx < cnt ; idx += 1) {
if (bbits_ptr_[idx] | that.bbits_ptr_[idx])
goto x_out;
abits_ptr_[idx] = add_with_carry(abits_ptr_[idx], that.abits_ptr_[idx], carry);
}
// Partial trailing word, if the size is not a word multiple. Only
// the low "tail" bits count, for the X/Z test and the result alike.
if (unsigned tail = size_ % BITS_PER_WORD) {
unsigned long mask = ~( -1UL << tail );
if ((bbits_ptr_[cnt] | that.bbits_ptr_[cnt])&mask)
goto x_out;
abits_ptr_[cnt] = add_with_carry(abits_ptr_[cnt], that.abits_ptr_[cnt], carry);
abits_ptr_[cnt] &= mask;
}
return;
// X/Z was detected part way through: overwrite the whole value
// (including any words already summed above) with X bits.
x_out:
for (int idx = 0 ; idx < cnt ; idx += 1) {
abits_ptr_[idx] = WORD_X_ABITS;
bbits_ptr_[idx] = WORD_X_BBITS;
}
if (unsigned tail = size_%BITS_PER_WORD) {
unsigned long mask = ~( -1UL << tail );
abits_ptr_[cnt] = WORD_X_ABITS&mask;
bbits_ptr_[cnt] = WORD_X_BBITS&mask;
}
}
void vvp_vector4_t::mov(unsigned dst, unsigned src, unsigned cnt)
{
assert(dst+cnt <= size_);

View File

@ -130,6 +130,9 @@ struct automatic_hooks_s {
* values. The enumeration has fixed numeric values that can be
* expressed in 2 real bits, so that some of the internal classes can
* pack them tightly.
*
* WARNING: Many things rely on this encoding for the BIT4_* enumeration
* values, so accept that these values are cast in stone.
*/
enum vvp_bit4_t {
BIT4_0 = 0,
@ -268,6 +271,9 @@ class vvp_vector4_t {
// Move bits within this vector.
void mov(unsigned dst, unsigned src, unsigned cnt);
// Add that to this in the Verilog way.
void add(const vvp_vector4_t&that);
// Test that the vectors are exactly equal
bool eeq(const vvp_vector4_t&that) const;