From 8aca66b109e5f4afb0f2373fecacf5ca1027f7c3 Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Fri, 14 Nov 2014 11:48:36 -0800 Subject: [PATCH] Performance tweaks Better performance for %cmp/s and the vvp_fun_part_sa node. --- vvp/opcodes.txt | 31 +++++------------- vvp/part.cc | 6 +--- vvp/vthread.cc | 84 ++++++++++++++++++++++++++++--------------------- vvp/vvp_net.cc | 23 ++++++++++++-- vvp/vvp_net.h | 15 ++++++--- 5 files changed, 88 insertions(+), 71 deletions(-) diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt index 95d005cb7..1c64be7a5 100644 --- a/vvp/opcodes.txt +++ b/vvp/opcodes.txt @@ -246,16 +246,16 @@ value stack. Pop a value from the vec4 stack, convert it using Verilog rules to a vector2 (binary) value, and push the result. -* %cmp/u , , (XXXX Old meaning) -* %cmp/s , , (XXXX Old meaning) +* %cmp/s +* %cmp/u -These instructions perform a generic comparison of two vectors of equal -size. The and numbers address the least-significant -bit of each vector, and is the width. If either operand is 0, -1, 2 or 3 then it is taken to be a constant replicated to the selected -width. +These instructions perform a generic comparison of two vectors of +equal size. Two values are pulled from the top of the stack, and not +replaced. The results are written into flag bits 4,5,6. The +expressions (apop_vec4(); assert(rval.size() == lval.size()); + + // If either value has XZ bits, then the eq and lt values are + // known already to be X. Just calculate the eeq result as a + // special case and short circuit the rest of the compare. + if (lval.has_xz() || rval.has_xz()) { + thr->flags[4] = BIT4_X; // eq + thr->flags[5] = BIT4_X; // lt + thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0; + return true; + } + + // Past this point, we know we are dealing only with fully + // defined values. 
unsigned wid = lval.size(); const vvp_bit4_t sig1 = lval.value(wid-1); const vvp_bit4_t sig2 = rval.value(wid-1); - for (unsigned idx = 0 ; idx < wid ; idx += 1) { + for (unsigned idx = 0 ; idx < (wid-1) ; idx += 1) { vvp_bit4_t lv = lval.value(idx); vvp_bit4_t rv = rval.value(idx); - if (lv > rv) { - lt = BIT4_0; - eeq = BIT4_0; - } else if (lv < rv) { - lt = BIT4_1; - eeq = BIT4_0; - } - if (eq != BIT4_X) { - if ((lv == BIT4_0) && (rv != BIT4_0)) - eq = BIT4_0; - if ((lv == BIT4_1) && (rv != BIT4_1)) - eq = BIT4_0; - if (bit4_is_xz(lv) || bit4_is_xz(rv)) - eq = BIT4_X; + if (lv==BIT4_0 && rv==BIT4_1) { + eeq = eq = BIT4_0; + lt = BIT4_1; + } else if (lv==BIT4_1 && rv==BIT4_0) { + eeq = eq = BIT4_0; + lt = BIT4_0; } } - if (eq == BIT4_X) - lt = BIT4_X; - else if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) - lt = BIT4_1; - else if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) - lt = BIT4_0; - /* Correct the lt bit to account for the sign of the parameters. */ - if (lt != BIT4_X) { - /* If the first is negative and the last positive, then - a < b for certain. */ - if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) - lt = BIT4_1; - /* If the first is positive and the last negative, then - a > b for certain. */ - if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) - lt = BIT4_0; + // If the first is negative and the last positive, then + // a < b for certain. + if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) { + lt = BIT4_1; + eeq = eq = BIT4_0; + } + + // If the first is positive and the last negative, then + // a > b for certain. + if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) { + lt = BIT4_0; + eeq = eq = BIT4_0; } thr->flags[4] = eq; @@ -4413,16 +4424,19 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp) unsigned wid = cp->number; vvp_vector4_t val (wid, BIT4_0); - for (unsigned idx = 0 ; idx < wid ; idx += 1) { + for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) { uint32_t ba = 0; // If the requested width is /32, then there are no // actual immediate bits, but we can pad with zero. 
So // here we test if we are still working on he LSB, and // process them if so. if (idx < 32) { - ba = ((valb >> idx) & 1) << 1; - ba |= (vala >> idx) & 1; + ba = (valb & 1) << 1; + ba |= vala & 1; } + vala >>= 1; + valb >>= 1; + if (ba == 0) continue; vvp_bit4_t use_bit = BIT4_0; switch (ba) { case 1: @@ -4437,8 +4451,6 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp) default: break; } - if (use_bit == BIT4_0) - continue; val.set_bit(idx, use_bit); } diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc index 15f6fa5b3..afa44efbe 100644 --- a/vvp/vvp_net.cc +++ b/vvp/vvp_net.cc @@ -793,11 +793,30 @@ vvp_vector4_t::vvp_vector4_t(unsigned size__, double val) vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that, unsigned adr, unsigned wid) { + // Set up and initialize the destination. size_ = wid; - assert((adr + wid) <= that.size_); - allocate_words_(WORD_X_ABITS, WORD_X_BBITS); + // Special case: selecting from far beyond the source vector, + // so the result is all X bits. We're done. + if (adr >= that.size_) + return; + + // Special case: The source is not quite big enough to supply + // all bits, so get the bits that we can. The remainder will + // be left at BIT4_X. + if ((adr + wid) > that.size_) { + unsigned use_wid = that.size_ - adr; + for (unsigned idx = 0 ; idx < use_wid ; idx += 1) + set_bit(idx, that.value(adr+idx)); + + return; + } + + // At this point, we know that the source part is entirely + // contained in the source vector. + // assert((adr + wid) <= that.size_); + if (wid > BITS_PER_WORD) { /* In this case, the subvector and the source vector are long. Do the transfer reasonably efficiently. */ diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h index 2a5e1a71f..e2a37d26a 100644 --- a/vvp/vvp_net.h +++ b/vvp/vvp_net.h @@ -230,7 +230,11 @@ class vvp_vector4_t { explicit vvp_vector4_t(unsigned size, double val); - // Construct a vector4 from the subvalue of another + // vector4. 
The width of the result is 'wid', and the bits are + // pulled from 'that' to implement the Verilog part select + // semantics. This means that part select beyond 'that' + // returns X bits. explicit vvp_vector4_t(const vvp_vector4_t&that, unsigned adr, unsigned wid); @@ -398,14 +402,16 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const if (idx >= size_) return BIT4_X; - unsigned wdx = idx / BITS_PER_WORD; - unsigned long off = idx % BITS_PER_WORD; + unsigned long off; unsigned long abits, bbits; if (size_ > BITS_PER_WORD) { + unsigned wdx = idx / BITS_PER_WORD; + off = idx % BITS_PER_WORD; abits = abits_ptr_[wdx]; bbits = bbits_ptr_[wdx]; } else { + off = idx; abits = abits_val_; bbits = bbits_val_; } @@ -420,8 +426,7 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const BIT4_X // bbit==1, abit==1 }; - /* Casting is evil, but this cast matches the un-cast done - when the vvp_bit4_t value is put into the vector. */ + // This map converts the bit-pattern to a vvp_bit4_t value. return bits_bit4_map[tmp]; }