From 8aca66b109e5f4afb0f2373fecacf5ca1027f7c3 Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Fri, 14 Nov 2014 11:48:36 -0800 Subject: [PATCH] Performance tweaks Better performance for %cmp/s and the vvp_fun_part_sa node. --- vvp/opcodes.txt | 31 +++++------------- vvp/part.cc | 6 +--- vvp/vthread.cc | 84 ++++++++++++++++++++++++++++--------------------- vvp/vvp_net.cc | 23 ++++++++++++-- vvp/vvp_net.h | 15 ++++++--- 5 files changed, 88 insertions(+), 71 deletions(-) diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt index 95d005cb7..1c64be7a5 100644 --- a/vvp/opcodes.txt +++ b/vvp/opcodes.txt @@ -246,16 +246,16 @@ value stack. Pop a value from the vec4 stack, convert it using Verilog rules to a vector2 (binary) value, and push the result. -* %cmp/u , , (XXXX Old meaning) -* %cmp/s , , (XXXX Old meaning) +* %cmp/s +* %cmp/u -These instructions perform a generic comparison of two vectors of equal -size. The and numbers address the least-significant -bit of each vector, and is the width. If either operand is 0, -1, 2 or 3 then it is taken to be a constant replicated to the selected -width. +These instructions perform a generic comparison of two vectors of +equal size. Two values are pulled from the top of the stack, and not +replaced. The results are written into flag bits 4,5,6. The +expressions (apop_vec4(); assert(rval.size() == lval.size()); + + // If either value has XZ bits, then the eq and lt values are + // known already to be X. Just calculate the eeq result as a + // special case and short circuit the rest of the compare. + if (lval.has_xz() || rval.has_xz()) { + thr->flags[4] = BIT4_X; // eq + thr->flags[5] = BIT4_X; // lt + thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0; + return true; + } + + // Past this point, we know we are dealing only with fully + // defined values. 
unsigned wid = lval.size(); const vvp_bit4_t sig1 = lval.value(wid-1); const vvp_bit4_t sig2 = rval.value(wid-1); - for (unsigned idx = 0 ; idx < wid ; idx += 1) { + for (unsigned idx = 0 ; idx < (wid-1) ; idx += 1) { vvp_bit4_t lv = lval.value(idx); vvp_bit4_t rv = rval.value(idx); - if (lv > rv) { - lt = BIT4_0; - eeq = BIT4_0; - } else if (lv < rv) { - lt = BIT4_1; - eeq = BIT4_0; - } - if (eq != BIT4_X) { - if ((lv == BIT4_0) && (rv != BIT4_0)) - eq = BIT4_0; - if ((lv == BIT4_1) && (rv != BIT4_1)) - eq = BIT4_0; - if (bit4_is_xz(lv) || bit4_is_xz(rv)) - eq = BIT4_X; + if (lv==BIT4_0 && rv==BIT4_1) { + eeq = eq = BIT4_0; + lt = BIT4_1; + } else if (lv==BIT4_1 && rv==BIT4_0) { + eeq = eq = BIT4_0; + lt = BIT4_0; } } - if (eq == BIT4_X) - lt = BIT4_X; - else if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) - lt = BIT4_1; - else if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) - lt = BIT4_0; - /* Correct the lt bit to account for the sign of the parameters. */ - if (lt != BIT4_X) { - /* If the first is negative and the last positive, then - a < b for certain. */ - if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) - lt = BIT4_1; - /* If the first is positive and the last negative, then - a > b for certain. */ - if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) - lt = BIT4_0; + // If the first is negative and the last positive, then + // a < b for certain. + if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) { + lt = BIT4_1; + eeq = eq = BIT4_0; + } + + // If the first is positive and the last negative, then + // a > b for certain. + if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) { + lt = BIT4_0; + eeq = eq = BIT4_0; } thr->flags[4] = eq; @@ -4413,16 +4424,19 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp) unsigned wid = cp->number; vvp_vector4_t val (wid, BIT4_0); - for (unsigned idx = 0 ; idx < wid ; idx += 1) { + for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) { uint32_t ba = 0; // If the requested width is /32, then there are no // actual immediate bits, but we can pad with zero. 
So // here we test if we are still working on he LSB, and // process them if so. if (idx < 32) { - ba = ((valb >> idx) & 1) << 1; - ba |= (vala >> idx) & 1; + ba = (valb & 1) << 1; + ba |= vala & 1; } + vala >>= 1; + valb >>= 1; + if (ba == 0) continue; vvp_bit4_t use_bit = BIT4_0; switch (ba) { case 1: @@ -4437,8 +4451,6 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp) default: break; } - if (use_bit == BIT4_0) - continue; val.set_bit(idx, use_bit); } diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc index 15f6fa5b3..afa44efbe 100644 --- a/vvp/vvp_net.cc +++ b/vvp/vvp_net.cc @@ -793,11 +793,30 @@ vvp_vector4_t::vvp_vector4_t(unsigned size__, double val) vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that, unsigned adr, unsigned wid) { + // Set up and initialize the destination. size_ = wid; - assert((adr + wid) <= that.size_); - allocate_words_(WORD_X_ABITS, WORD_X_BBITS); + // Special case: selecting from far beyond the source vector, + // so the result is all X bits. We're done. + if (adr >= that.size_) + return; + + // Special case: The source is not quite big enough to supply + // all bits, so get the bits that we can. The remainder will + // be left at BIT4_X. + if ((adr + wid) > that.size_) { + unsigned use_wid = that.size_ - adr; + for (unsigned idx = 0 ; idx < use_wid ; idx += 1) + set_bit(idx, that.value(adr+idx)); + + return; + } + + // At this point, we know that the source part is entirely + // contained in the source vector. + // assert((adr + wid) <= that.size_); + if (wid > BITS_PER_WORD) { /* In this case, the subvector and the source vector are long. Do the transfer reasonably efficiently. */ diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h index 2a5e1a71f..e2a37d26a 100644 --- a/vvp/vvp_net.h +++ b/vvp/vvp_net.h @@ -230,7 +230,11 @@ class vvp_vector4_t { explicit vvp_vector4_t(unsigned size, double val); - // Construct a vector4 from the subvalue of another + // vector4. 
The width of the result is 'wid', and the bits are + // pulled from 'that' to implement the Verilog part select + // semantics. This means that part select beyond 'that' + // returns X bits. explicit vvp_vector4_t(const vvp_vector4_t&that, unsigned adr, unsigned wid); @@ -398,14 +402,16 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const if (idx >= size_) return BIT4_X; - unsigned wdx = idx / BITS_PER_WORD; - unsigned long off = idx % BITS_PER_WORD; + unsigned long off; unsigned long abits, bbits; if (size_ > BITS_PER_WORD) { + unsigned wdx = idx / BITS_PER_WORD; + off = idx % BITS_PER_WORD; abits = abits_ptr_[wdx]; bbits = bbits_ptr_[wdx]; } else { + off = idx; abits = abits_val_; bbits = bbits_val_; } @@ -420,8 +426,7 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const BIT4_X // bbit==1, abit==1 }; - /* Casting is evil, but this cast matches the un-cast done - when the vvp_bit4_t value is put into the vector. */ + // This map converts the bit-pattern to a vvp_bit4_t value. return bits_bit4_map[tmp]; }