Optimize the vec4-stack %cmp/s and %cmpi/s instructions.

Magnitude comparison is executed very frequently, so it is worth putting some special effort into making it fast.
2014-12-04 10:42:48 -08:00 · 2014-12-04 10:42:48 -08:00 · 86139c855d
parent eb070b061b
commit 86139c855d
2 changed files with 39 additions and 41 deletions
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@ -1625,11 +1625,6 @@ bool of_CMPINE(vthread_t thr, vvp_code_t cp)
 static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
 {
      vvp_bit4_t eq  = BIT4_1;
      vvp_bit4_t eeq = BIT4_1;
      vvp_bit4_t lt  = BIT4_0;
      assert(rval.size() == lval.size());
 	// If either value has XZ bits, then the eq and lt values are
@ -1649,38 +1644,48 @@ static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t
      const vvp_bit4_t sig1 = lval.value(wid-1);
      const vvp_bit4_t sig2 = rval.value(wid-1);
-      for (unsigned idx = 0 ;  idx < (wid-1) ;  idx += 1) {
+	// If the lval is <0 and the rval is >=0, then we know the result.
 	    vvp_bit4_t lv = lval.value(idx);
 	    vvp_bit4_t rv = rval.value(idx);
 	    if (lv==BIT4_0 && rv==BIT4_1) {
 		  eeq = eq = BIT4_0;
 		  lt = BIT4_1;
 	    } else if (lv==BIT4_1 && rv==BIT4_0) {
 		  eeq = eq = BIT4_0;
 		  lt = BIT4_0;
 	    }
      }
 	/* Correct the lt bit to account for the sign of the parameters. */
 	// If the first is negative and the last positive, then
 	//    a < b for certain.
      if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) {
-	    lt = BIT4_1;
+	    thr->flags[4] = BIT4_0; // eq;
-	    eeq = eq = BIT4_0;
+	    thr->flags[5] = BIT4_1; // lt;
 	    thr->flags[6] = BIT4_0; // eeq
 	    return;
      }
-	// If the first is positive and the last negative, then
+	// If the lval is >=0 and the rval is <0, then we know the result.
 	//    a > b for certain.
      if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) {
-	    lt = BIT4_0;
+	    thr->flags[4] = BIT4_0; // eq;
-	    eeq = eq = BIT4_0;
+	    thr->flags[5] = BIT4_0; // lt;
 	    thr->flags[6] = BIT4_0; // eeq
 	    return;
      }
-      thr->flags[4] = eq;
+	// The values have the same sign, so we have to look at the
-      thr->flags[5] = lt;
+	// actual value. Scan from the MSB down. As soon as we find a
-      thr->flags[6] = eeq;
+	// bit that differs, we know the result.
      for (unsigned idx = 1 ;  idx < wid ;  idx += 1) {
 	    vvp_bit4_t lv = lval.value(wid-1-idx);
 	    vvp_bit4_t rv = rval.value(wid-1-idx);
 	    if (lv == rv)
 		  continue;
 	    thr->flags[4] = BIT4_0; // eq
 	    thr->flags[6] = BIT4_0; // eeq
 	    if (lv==BIT4_0) {
 		  thr->flags[5] = BIT4_1; // lt
 	    } else {
 		  thr->flags[5] = BIT4_0; // lt
 	    }
 	    return;
      }
 	// If we survive the loop above, then the values must be equal.
      thr->flags[4] = BIT4_1;
      thr->flags[5] = BIT4_0;
      thr->flags[6] = BIT4_1;
 }
 /*
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@ -413,7 +413,7 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const
      if (idx >= size_)
 	    return BIT4_X;
-      unsigned long off;
+      unsigned off;
      unsigned long abits, bbits;
      if (size_ > BITS_PER_WORD) {
@ -430,15 +430,8 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const
      abits >>= off;
      bbits >>= off;
      int tmp = ((bbits&1) << 1) + (abits&1);
-      static const vvp_bit4_t bits_bit4_map[4] = {
+	// This cast works since b==1,a==1 is X and b==1,a==0 is Z.
-	    BIT4_0, // bbit==0, abit==0
+      return (vvp_bit4_t)tmp;
 	    BIT4_1, // bbit==0, abit==1
 	    BIT4_Z, // bbit==1, abit==0
 	    BIT4_X  // bbit==1, abit==1
      };
 	// This map converts the bit-pattern to a vvp_bit4_t value.
      return bits_bit4_map[tmp];
 }
 inline vvp_vector4_t vvp_vector4_t::subvalue(unsigned adr, unsigned wid) const