Performance tweaks

Better performance for %cmp/s and the vvp_fun_part_sa node.
This commit is contained in:
Stephen Williams 2014-11-14 11:48:36 -08:00
parent 2aeb3871ed
commit 8aca66b109
5 changed files with 88 additions and 71 deletions

View File

@ -246,16 +246,16 @@ value stack.
Pop a value from the vec4 stack, convert it using Verilog rules to a
vector2 (binary) value, and push the result.
* %cmp/u <bit-l>, <bit-r>, <wid> (XXXX Old meaning)
* %cmp/s <bit-l>, <bit-r>, <wid> (XXXX Old meaning)
* %cmp/s
* %cmp/u
These instructions perform a generic comparison of two vectors of equal
size. The <bit-l> and <bit-r> numbers address the least-significant
bit of each vector, and <wid> is the width. If either operand is 0,
1, 2 or 3 then it is taken to be a constant replicated to the selected
width.
These instructions perform a generic comparison of two vectors of
equal size. Two values are pulled from the top of the stack, and not
replaced. The results are written into flag bits 4,5,6. The
expressions (a<b), (a==b) and (a===b) are calculated, with (b) popped
from the stack first, then (a).
The results of the comparison go into bits 4, 5, 6 and 7:
The results of the comparison go into flags 4, 5, 6 and 7:
4: eq (equal)
5: lt (less than)
@ -276,21 +276,6 @@ The %cmp/u and %cmp/s differ only in the handling of the lt bit. The
compare. In either case, if either operand contains x or z, then lt
bit gets the x value.
* %cmp/s
* %cmp/u
These instructions perform a generic comparison of two vectors of
equal size. Two values are pulled from the top of the stack, and not
replaced. The results are written into flag bits 4,5,6. The
expressions (a<b), (a==b) and (a===b) are calculated, with (b) popped
from the stack first, then (a).
The results of the comparison go into flags 4, 5, 6 and 7:
4: eq (equal)
5: lt (less than)
6: eeq (case equal)
* %cmp/wr
Compare real values for equality and less-than. This opcode pops to

View File

@ -57,11 +57,7 @@ void vvp_fun_part_sa::recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
{
assert(port.port() == 0);
vvp_vector4_t tmp (wid_, BIT4_X);
for (unsigned idx = 0 ; idx < wid_ ; idx += 1) {
if (idx + base_ < bit.size())
tmp.set_bit(idx, bit.value(base_+idx));
}
vvp_vector4_t tmp (bit, base_, wid_);
if (val_ .eeq( tmp ))
return;

View File

@ -1488,6 +1488,16 @@ bool of_CAST2(vthread_t thr, vvp_code_t)
return true;
}
/*
* %cmp/s
*
* Pop the operands from the stack, and do not replace them. The
* results are written to flag bits:
*
* 4: eq (equal)
* 5: lt (less than)
* 6: eeq (case equal)
*/
bool of_CMPS(vthread_t thr, vvp_code_t)
{
vvp_bit4_t eq = BIT4_1;
@ -1498,50 +1508,51 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
vvp_vector4_t lval = thr->pop_vec4();
assert(rval.size() == lval.size());
// If either value has XZ bits, then the eq and lt values are
// known already to be X. Just calculate the eeq result as a
// special case and short circuit the rest of the compare.
if (lval.has_xz() || rval.has_xz()) {
thr->flags[4] = BIT4_X; // eq
thr->flags[5] = BIT4_X; // lt
thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
return true;
}
// Past this point, we know we are dealing only with fully
// defined values.
unsigned wid = lval.size();
const vvp_bit4_t sig1 = lval.value(wid-1);
const vvp_bit4_t sig2 = rval.value(wid-1);
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
for (unsigned idx = 0 ; idx < (wid-1) ; idx += 1) {
vvp_bit4_t lv = lval.value(idx);
vvp_bit4_t rv = rval.value(idx);
if (lv > rv) {
lt = BIT4_0;
eeq = BIT4_0;
} else if (lv < rv) {
lt = BIT4_1;
eeq = BIT4_0;
}
if (eq != BIT4_X) {
if ((lv == BIT4_0) && (rv != BIT4_0))
eq = BIT4_0;
if ((lv == BIT4_1) && (rv != BIT4_1))
eq = BIT4_0;
if (bit4_is_xz(lv) || bit4_is_xz(rv))
eq = BIT4_X;
if (lv==BIT4_0 && rv==BIT4_1) {
eeq = eq = BIT4_0;
lt = BIT4_1;
} else if (lv==BIT4_1 && rv==BIT4_0) {
eeq = eq = BIT4_0;
lt = BIT4_0;
}
}
if (eq == BIT4_X)
lt = BIT4_X;
else if ((sig1 == BIT4_1) && (sig2 == BIT4_0))
lt = BIT4_1;
else if ((sig1 == BIT4_0) && (sig2 == BIT4_1))
lt = BIT4_0;
/* Correct the lt bit to account for the sign of the parameters. */
if (lt != BIT4_X) {
/* If the first is negative and the last positive, then
a < b for certain. */
if ((sig1 == BIT4_1) && (sig2 == BIT4_0))
lt = BIT4_1;
/* If the first is positive and the last negative, then
a > b for certain. */
if ((sig1 == BIT4_0) && (sig2 == BIT4_1))
lt = BIT4_0;
// If the first is negative and the last positive, then
// a < b for certain.
if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) {
lt = BIT4_1;
eeq = eq = BIT4_0;
}
// If the first is positive and the last negative, then
// a > b for certain.
if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) {
lt = BIT4_0;
eeq = eq = BIT4_0;
}
thr->flags[4] = eq;
@ -4413,16 +4424,19 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp)
unsigned wid = cp->number;
vvp_vector4_t val (wid, BIT4_0);
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
uint32_t ba = 0;
// If the requested width is >32, then there are no
// actual immediate bits, but we can pad with zero. So
// here we test if we are still working on the LSB, and
// process them if so.
if (idx < 32) {
ba = ((valb >> idx) & 1) << 1;
ba |= (vala >> idx) & 1;
ba = (valb & 1) << 1;
ba |= vala & 1;
}
vala >>= 1;
valb >>= 1;
if (ba == 0) continue;
vvp_bit4_t use_bit = BIT4_0;
switch (ba) {
case 1:
@ -4437,8 +4451,6 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp)
default:
break;
}
if (use_bit == BIT4_0)
continue;
val.set_bit(idx, use_bit);
}

View File

@ -793,11 +793,30 @@ vvp_vector4_t::vvp_vector4_t(unsigned size__, double val)
vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that,
unsigned adr, unsigned wid)
{
// Set up and initialize the destination.
size_ = wid;
assert((adr + wid) <= that.size_);
allocate_words_(WORD_X_ABITS, WORD_X_BBITS);
// Special case: selecting from far beyond the source vector,
// so the result is all X bits. We're done.
if (adr >= that.size_)
return;
// Special case: The source is not quite big enough to supply
// all bits, so get the bits that we can. The remainder will
// be left at BIT4_X.
if ((adr + wid) > that.size_) {
unsigned use_wid = that.size_ - adr;
for (unsigned idx = 0 ; idx < use_wid ; idx += 1)
set_bit(idx, that.value(adr+idx));
return;
}
// At this point, we know that the source part is entirely
// contained in the source vector.
// assert((adr + wid) <= that.size_);
if (wid > BITS_PER_WORD) {
/* In this case, the subvector and the source vector are
long. Do the transfer reasonably efficiently. */

View File

@ -230,7 +230,11 @@ class vvp_vector4_t {
explicit vvp_vector4_t(unsigned size, double val);
// Construct a vector4 from the subvalue of another vector4.
// Construct a vector4 from the subvalue of another
// vector4. The width of the result is 'wid', and the bits are
// pulled from 'that' to implement the Verilog part select
// semantics. This means that part select beyond 'that'
// returns X bits.
explicit vvp_vector4_t(const vvp_vector4_t&that,
unsigned adr, unsigned wid);
@ -398,14 +402,16 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const
if (idx >= size_)
return BIT4_X;
unsigned wdx = idx / BITS_PER_WORD;
unsigned long off = idx % BITS_PER_WORD;
unsigned long off;
unsigned long abits, bbits;
if (size_ > BITS_PER_WORD) {
unsigned wdx = idx / BITS_PER_WORD;
off = idx % BITS_PER_WORD;
abits = abits_ptr_[wdx];
bbits = bbits_ptr_[wdx];
} else {
off = idx;
abits = abits_val_;
bbits = bbits_val_;
}
@ -420,8 +426,7 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const
BIT4_X // bbit==1, abit==1
};
/* Casting is evil, but this cast matches the un-cast done
when the vvp_bit4_t value is put into the vector. */
// This map converts the bit-pattern to a vvp_bit4_t value.
return bits_bit4_map[tmp];
}