From 0f740289e98bdf0be6eccbe9f30daee63fd491ee Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Thu, 4 Dec 2014 16:00:57 -0800 Subject: [PATCH] Optimize %mul instructions by integrating with vvp_vector4_t class. --- vvp/vthread.cc | 122 ++----------------------------------- vvp/vvp_net.cc | 159 ++++++++++++++++++++++++++++++++++++++++++++----- vvp/vvp_net.h | 25 ++++++++ 3 files changed, 172 insertions(+), 134 deletions(-) diff --git a/vvp/vthread.cc b/vvp/vthread.cc index 70f514b69..1fb496ebe 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -390,66 +390,6 @@ template T coerce_to_width(const T&that, unsigned width) template vvp_vector4_t coerce_to_width(const vvp_vector4_t&that, unsigned width); -/* - * Some of the instructions do wide addition to arrays of long. They - * use this add_with_carry function to help. - */ -static inline unsigned long add_with_carry(unsigned long a, unsigned long b, - unsigned long&carry) -{ - unsigned long tmp = b + carry; - unsigned long sum = a + tmp; - carry = 0; - if (tmp < b) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < a) - carry = 1; - return sum; -} - -static unsigned long multiply_with_carry(unsigned long a, unsigned long b, - unsigned long&carry) -{ - const unsigned long mask = (1UL << (CPU_WORD_BITS/2)) - 1; - unsigned long a0 = a & mask; - unsigned long a1 = (a >> (CPU_WORD_BITS/2)) & mask; - unsigned long b0 = b & mask; - unsigned long b1 = (b >> (CPU_WORD_BITS/2)) & mask; - - unsigned long tmp = a0 * b0; - - unsigned long r00 = tmp & mask; - unsigned long c00 = (tmp >> (CPU_WORD_BITS/2)) & mask; - - tmp = a0 * b1; - - unsigned long r01 = tmp & mask; - unsigned long c01 = (tmp >> (CPU_WORD_BITS/2)) & mask; - - tmp = a1 * b0; - - unsigned long r10 = tmp & mask; - unsigned long c10 = (tmp >> (CPU_WORD_BITS/2)) & mask; - - tmp = a1 * b1; - - unsigned long r11 = tmp & mask; - unsigned long c11 = (tmp >> (CPU_WORD_BITS/2)) & mask; - - unsigned long r1 = c00 + r01 + r10; - unsigned long r2 = (r1 >> 
(CPU_WORD_BITS/2)) & mask; - r1 &= mask; - r2 += c01 + c10 + r11; - unsigned long r3 = (r2 >> (CPU_WORD_BITS/2)) & mask; - r2 &= mask; - r3 += c11; - r3 &= mask; - - carry = (r3 << (CPU_WORD_BITS/2)) + r2; - return (r1 << (CPU_WORD_BITS/2)) + r00; -} static void multiply_array_imm(unsigned long*res, unsigned long*val, unsigned words, unsigned long imm) @@ -4021,62 +3961,6 @@ bool of_MOV_WU(vthread_t thr, vvp_code_t cp) return true; } -static bool do_MUL(vvp_vector4_t&vala, const vvp_vector4_t&valb) -{ - assert(vala.size() == valb.size()); - unsigned wid = vala.size(); - - unsigned long*ap = vala.subarray(0, wid); - if (ap == 0) { - vvp_vector4_t tmp(wid, BIT4_X); - vala = tmp; - return true; - } - - unsigned long*bp = valb.subarray(0, wid); - if (bp == 0) { - delete[]ap; - vvp_vector4_t tmp(wid, BIT4_X); - vala = tmp; - return true; - } - - // If the value fits in a single CPU word, then do it the easy way. - if (wid <= CPU_WORD_BITS) { - ap[0] *= bp[0]; - vala.setarray(0, wid, ap); - delete[]ap; - delete[]bp; - return true; - } - - unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; - unsigned long*res = new unsigned long[words]; - for (unsigned idx = 0 ; idx < words ; idx += 1) - res[idx] = 0; - - for (unsigned mul_a = 0 ; mul_a < words ; mul_a += 1) { - for (unsigned mul_b = 0 ; mul_b < (words-mul_a) ; mul_b += 1) { - unsigned long sum; - unsigned long tmp = multiply_with_carry(ap[mul_a], bp[mul_b], sum); - unsigned base = mul_a + mul_b; - unsigned long carry = 0; - res[base] = add_with_carry(res[base], tmp, carry); - for (unsigned add_idx = base+1; add_idx < words; add_idx += 1) { - res[add_idx] = add_with_carry(res[add_idx], sum, carry); - sum = 0; - } - } - } - - vala.setarray(0, wid, res); - delete[]ap; - delete[]bp; - delete[]res; - - return true; -} - /* * %mul */ @@ -4088,7 +3972,8 @@ bool of_MUL(vthread_t thr, vvp_code_t) // replaces a pop and a pull. 
/* This is the size of an unsigned long in bits. This is just a
   convenience macro. */
# define CPU_WORD_BITS (8*sizeof(unsigned long))
# define TOP_BIT (1UL << (CPU_WORD_BITS-1))

/*
 * Multiply the words a and b to form a double-width product. The
 * low word of the product is returned, and the high word is left in
 * "carry". This is portable long multiplication: each operand is
 * split into half-word digits so that no partial product can
 * overflow an unsigned long.
 */
unsigned long multiply_with_carry(unsigned long a, unsigned long b,
                                  unsigned long&carry)
{
      const unsigned long half = CPU_WORD_BITS/2;
      const unsigned long mask = (1UL << half) - 1;

        // Half-word digits of the two operands.
      unsigned long a_lo = a & mask;
      unsigned long a_hi = (a >> half) & mask;
      unsigned long b_lo = b & mask;
      unsigned long b_hi = (b >> half) & mask;

        // The four partial products, each itself split into its
        // low and high half-words.
      unsigned long p;

      p = a_lo * b_lo;
      unsigned long ll_lo = p & mask;
      unsigned long ll_hi = (p >> half) & mask;

      p = a_lo * b_hi;
      unsigned long lh_lo = p & mask;
      unsigned long lh_hi = (p >> half) & mask;

      p = a_hi * b_lo;
      unsigned long hl_lo = p & mask;
      unsigned long hl_hi = (p >> half) & mask;

      p = a_hi * b_hi;
      unsigned long hh_lo = p & mask;
      unsigned long hh_hi = (p >> half) & mask;

        // Sum the partial products into the four half-word digits
        // of the double-width result, rippling carries upwards.
      unsigned long d1 = ll_hi + lh_lo + hl_lo;
      unsigned long d2 = (d1 >> half) & mask;
      d1 &= mask;
      d2 += lh_hi + hl_hi + hh_lo;
      unsigned long d3 = (d2 >> half) & mask;
      d2 &= mask;
      d3 += hh_hi;
      d3 &= mask;

      carry = (d3 << half) + d2;
      return (d1 << half) + ll_lo;
}
/*
 * Multiply this vector by that vector, Verilog style. The two
 * operands must have the same width, and the product is truncated
 * to that width. If either operand contains any X/Z bits (non-zero
 * bbits) anywhere in the significant range, the entire result is
 * forced to X.
 */
void vvp_vector4_t::mul(const vvp_vector4_t&that)
{
      assert(size_ == that.size_);

        // Narrow case: the vector is smaller than one CPU word and
        // is stored directly in abits_val_/bbits_val_.
      if (size_ < BITS_PER_WORD) {
            unsigned long mask = ~(-1UL << size_);
              // Any X/Z in either operand makes the result all X.
              // NOTE(review): this assumes X is encoded with both
              // the a-bit and b-bit set, consistent with the
              // WORD_X_ABITS/WORD_X_BBITS use below -- confirm.
            if ((bbits_val_|that.bbits_val_) & mask) {
                  abits_val_ |= mask;
                  bbits_val_ |= mask;
                  return;
            }

              // Fully defined bits, so a plain unsigned multiply
              // truncated to the vector width does the job.
            abits_val_ *= that.abits_val_;
            abits_val_ &= mask;
            return;
      }

        // Exactly one CPU word: same idea, but no mask is needed.
      if (size_ == BITS_PER_WORD) {
            if (bbits_val_ || that.bbits_val_) {
                  abits_val_ = WORD_X_ABITS;
                  bbits_val_ = WORD_X_BBITS;
            } else {
                  abits_val_ *= that.abits_val_;
            }
            return;
      }

        // Wide case: the value lives in the abits_ptr_/bbits_ptr_
        // word arrays, cnt words long. Only the low "tail" bits of
        // the last word are significant, selected by "mask".
      const int cnt = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;

      unsigned long mask;
      if (unsigned tail = size_%BITS_PER_WORD) {
            mask = ~( -1UL << tail );
      } else {
            mask = ~0UL;
      }

        // Check for any XZ values ahead of time in a first pass. If
        // we find any, then force the entire result to be X and be
        // done.
      for (int idx = 0 ; idx < cnt ; idx += 1) {
            unsigned long lval = bbits_ptr_[idx];
            unsigned long rval = that.bbits_ptr_[idx];
            if (idx == (cnt-1)) {
                  lval &= mask;
                  rval &= mask;
            }
            if (lval || rval) {
                  for (int xdx = 0 ; xdx < cnt-1 ; xdx += 1) {
                        abits_ptr_[xdx] = WORD_X_ABITS;
                        bbits_ptr_[xdx] = WORD_X_BBITS;
                  }
                  abits_ptr_[cnt-1] = WORD_X_ABITS & mask;
                  bbits_ptr_[cnt-1] = WORD_X_BBITS & mask;
                  return;
            }
      }

        // Calculate the result into a res array. We need to keep it
        // separate from the "this" array because we are making
        // multiple passes.
      unsigned long*res = new unsigned long[cnt];
      for (int idx = 0 ; idx < cnt ; idx += 1)
            res[idx] = 0;

        // Classic long multiplication over the word arrays. A
        // partial product whose word position (mul_a+mul_b) is at
        // or beyond cnt would fall entirely outside the truncated
        // result, so the inner loop only runs to cnt-mul_a.
      for (int mul_a = 0 ; mul_a < cnt ; mul_a += 1) {
            unsigned long lval = abits_ptr_[mul_a];
            if (mul_a == (cnt-1))
                  lval &= mask;

            for (int mul_b = 0 ; mul_b < (cnt-mul_a) ; mul_b += 1) {
                  unsigned long rval = that.abits_ptr_[mul_b];
                  if (mul_b == (cnt-1))
                        rval &= mask;

                    // tmp is the low word of lval*rval and sum the
                    // high word. Add both into the accumulator,
                    // rippling the carry up through the remaining
                    // result words.
                  unsigned long sum;
                  unsigned long tmp = multiply_with_carry(lval, rval, sum);
                  int base = mul_a + mul_b;
                  unsigned long carry = 0;
                  res[base] = add_with_carry(res[base], tmp, carry);
                  for (int add_idx = base+1 ; add_idx < cnt ; add_idx += 1) {
                        res[add_idx] = add_with_carry(res[add_idx], sum, carry);
                        sum = 0;
                  }
            }
      }

        // Replace the "this" value with the calculated result. We
        // know a-priori that the bbits are zero and unchanged,
        // because the XZ pre-pass above already returned otherwise.
      res[cnt-1] &= mask;
      for (int idx = 0 ; idx < cnt ; idx += 1)
            abits_ptr_[idx] = res[idx];

      delete[]res;
      return;
}
      void add(const vvp_vector4_t&that);

      // Multiply this by that in the Verilog way. The operands
      // must have equal width; the product is truncated to that
      // width, and any X/Z bit in either operand yields an all-X
      // result (see vvp_vector4_t::mul in vvp_net.cc).
      void mul(const vvp_vector4_t&that);

      // Test that the vectors are exactly equal
      bool eeq(const vvp_vector4_t&that) const;