From 5853f7d8678544d20348cee2bce2cde0ca53c885 Mon Sep 17 00:00:00 2001
From: Martin Whitaker <icarus@martin-whitaker.me.uk>
Date: Sat, 15 Feb 2014 21:40:55 +0000
Subject: [PATCH] Fix for GitHub issue 9 part 1 : Efficiency of
 vvp_vector2_t::pow() function.

The vvp_vector2_t::pow() function is recursive, and performs a multiplication
operation on each step. The multiplication operator was expanding the result
vector to accommodate the maximum possible result value for the given operand
vectors, thus causing the execution time of the power operation to grow
exponentially with the exponent value. Both in this case and
in general, it is unnecessary for the multiplication result vector to be
expanded, as the compiler has already determined the required vector width
during elaboration, and sizes the operand vectors to match.
---
 vvp/vthread.cc | 11 -----------
 vvp/vvp_net.cc | 26 ++++++++++----------------
 2 files changed, 10 insertions(+), 27 deletions(-)

diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index b2b9fd8ce..73d9b41f5 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -4557,19 +4557,8 @@ bool of_POW(vthread_t thr, vvp_code_t cp)
 	    return true;
       }
 
-        /* To make the result more manageable trim off the extra bits. */
-      xv2.trim();
-      yv2.trim();
-
       vvp_vector2_t result = pow(xv2, yv2);
 
-        /* If the result is too small zero pad it. */
-      if (result.size() < wid) {
-	    for (unsigned jdx = wid-1;  jdx >= result.size();  jdx -= 1)
-		  thr_put_bit(thr, cp->bit_idx[0]+jdx, BIT4_0);
-	    wid = result.size();
-      }
-
         /* Copy only what we need of the result. */
       for (unsigned jdx = 0;  jdx < wid;  jdx += 1)
 	    thr_put_bit(thr, cp->bit_idx[0]+jdx,
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index 3e9f32695..c87e26ef9 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -2362,23 +2362,21 @@ bool vvp_vector2_t::is_zero() const
  */
 vvp_vector2_t pow(const vvp_vector2_t&x, vvp_vector2_t&y)
 {
-        /* If we have a zero exponent just return a 1 bit wide 1. */
+        /* If we have a zero exponent just return 1. */
       if (y == vvp_vector2_t(0L, 1)) {
-	    return vvp_vector2_t(1L, 1);
+ 	    return vvp_vector2_t(1L, x.size());
       }
 
         /* Is the value odd? */
       if (y.value(0) == 1) {
 	    y.set_bit(0, 0);  // A quick subtract by 1.
 	    vvp_vector2_t res = x * pow(x, y);
-	    res.trim();  // To keep the size under control trim extra zeros.
 	    return res;
       }
 
       y >>= 1;  // A fast divide by two. We know the LSB is zero.
       vvp_vector2_t z = pow(x, y);
       vvp_vector2_t res = z * z;
-      res.trim();  // To keep the size under control trim extra zeros.
       return res;
 }
 
@@ -2422,25 +2420,22 @@ static void multiply_long(unsigned long a, unsigned long b,
       low  = (res[1] << 4UL*sizeof(unsigned long)) | res[0];
 }
 
-/*
- * Multiplication of two vector2 vectors returns a product as wide as
- * the sum of the widths of the input vectors.
- */
 vvp_vector2_t operator * (const vvp_vector2_t&a, const vvp_vector2_t&b)
 {
       const unsigned bits_per_word = 8 * sizeof(a.vec_[0]);
-      vvp_vector2_t r (0, a.size() + b.size());
 
-      unsigned awords = (a.wid_ + bits_per_word - 1) / bits_per_word;
-      unsigned bwords = (b.wid_ + bits_per_word - 1) / bits_per_word;
-      unsigned rwords = (r.wid_ + bits_per_word - 1) / bits_per_word;
+	// The compiler ensures that the two operands are of equal size.
+      assert(a.size() == b.size());
+      vvp_vector2_t r (0, a.size());
 
-      for (unsigned bdx = 0 ;  bdx < bwords ;  bdx += 1) {
+      unsigned words = (r.wid_ + bits_per_word - 1) / bits_per_word;
+
+      for (unsigned bdx = 0 ;  bdx < words ;  bdx += 1) {
 	    unsigned long tmpb = b.vec_[bdx];
 	    if (tmpb == 0)
 		  continue;
 
-	    for (unsigned adx = 0 ;  adx < awords ;  adx += 1) {
+	    for (unsigned adx = 0 ;  adx < words ;  adx += 1) {
 		  unsigned long tmpa = a.vec_[adx];
 		  if (tmpa == 0)
 			continue;
@@ -2450,7 +2445,7 @@ vvp_vector2_t operator * (const vvp_vector2_t&a, const vvp_vector2_t&b)
 
 		  unsigned long carry = 0;
 		  for (unsigned sdx = 0
-			     ; (adx+bdx+sdx) < rwords
+			     ; (adx+bdx+sdx) < words
 			     ;  sdx += 1) {
 
 			r.vec_[adx+bdx+sdx] = add_carry(r.vec_[adx+bdx+sdx],
@@ -2461,7 +2456,6 @@ vvp_vector2_t operator * (const vvp_vector2_t&a, const vvp_vector2_t&b)
 	    }
       }
 
-
       return r;
 }