Performance tweaks

Better performance for %cmp/s and the vvp_fun_part_sa node.
This commit is contained in:
Stephen Williams 2014-11-14 11:48:36 -08:00
parent 2aeb3871ed
commit 8aca66b109
5 changed files with 88 additions and 71 deletions

View File

@ -246,16 +246,16 @@ value stack.
Pop a value from the vec4 stack, convert it using Verilog rules to a
vector2 (binary) value, and push the result.
* %cmp/u <bit-l>, <bit-r>, <wid> (XXXX Old meaning)
* %cmp/s <bit-l>, <bit-r>, <wid> (XXXX Old meaning)
* %cmp/s
* %cmp/u
These instructions perform a generic comparison of two vectors of equal
size. The <bit-l> and <bit-r> numbers address the least-significant
bit of each vector, and <wid> is the width. If either operand is 0,
1, 2 or 3 then it is taken to be a constant replicated to the selected
width.
These instructions perform a generic comparison of two vectors of
equal size. Two values are pulled from the top of the stack, and not
replaced. The results are written into flag bits 4,5,6. The
expressions (a<b), (a==b) and (a===b) are calculated, with (b) popped
from the stack first, then (a).
The results of the comparison go into bits 4, 5, 6 and 7:
The results of the comparison go into flags 4, 5, 6 and 7:
4: eq (equal)
5: lt (less than)
@ -276,21 +276,6 @@ The %cmp/u and %cmp/s differ only in the handling of the lt bit. The
compare. In either case, if either operand contains x or z, then lt
bit gets the x value.
* %cmp/s
* %cmp/u
These instructions perform a generic comparison of two vectors of
equal size. Two values are pulled from the top of the stack, and not
replaced. The results are written into flag bits 4,5,6. The
expressions (a<b), (a==b) and (a===b) are calculated, with (b) popped
from the stack first, then (a).
The results of the comparison go into flags 4, 5, 6 and 7:
4: eq (equal)
5: lt (less than)
6: eeq (case equal)
* %cmp/wr
Compare real values for equality and less-than. This opcode pops to

View File

@ -57,11 +57,7 @@ void vvp_fun_part_sa::recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
{
assert(port.port() == 0);
vvp_vector4_t tmp (wid_, BIT4_X);
for (unsigned idx = 0 ; idx < wid_ ; idx += 1) {
if (idx + base_ < bit.size())
tmp.set_bit(idx, bit.value(base_+idx));
}
vvp_vector4_t tmp (bit, base_, wid_);
if (val_ .eeq( tmp ))
return;

View File

@ -1488,6 +1488,16 @@ bool of_CAST2(vthread_t thr, vvp_code_t)
return true;
}
/*
* %cmp/s
*
* Pop the operands from the stack, and do not replace them. The
* results are written to flag bits:
*
* 4: eq (equal)
* 5: lt (less than)
* 6: eeq (case equal)
*/
bool of_CMPS(vthread_t thr, vvp_code_t)
{
vvp_bit4_t eq = BIT4_1;
@ -1498,50 +1508,51 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
vvp_vector4_t lval = thr->pop_vec4();
assert(rval.size() == lval.size());
// If either value has XZ bits, then the eq and lt values are
// known already to be X. Just calculate the eeq result as a
// special case and short circuit the rest of the compare.
if (lval.has_xz() || rval.has_xz()) {
thr->flags[4] = BIT4_X; // eq
thr->flags[5] = BIT4_X; // lt
thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
return true;
}
// Past this point, we know we are dealing only with fully
// defined values.
unsigned wid = lval.size();
const vvp_bit4_t sig1 = lval.value(wid-1);
const vvp_bit4_t sig2 = rval.value(wid-1);
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
for (unsigned idx = 0 ; idx < (wid-1) ; idx += 1) {
vvp_bit4_t lv = lval.value(idx);
vvp_bit4_t rv = rval.value(idx);
if (lv > rv) {
lt = BIT4_0;
eeq = BIT4_0;
} else if (lv < rv) {
lt = BIT4_1;
eeq = BIT4_0;
}
if (eq != BIT4_X) {
if ((lv == BIT4_0) && (rv != BIT4_0))
eq = BIT4_0;
if ((lv == BIT4_1) && (rv != BIT4_1))
eq = BIT4_0;
if (bit4_is_xz(lv) || bit4_is_xz(rv))
eq = BIT4_X;
if (lv==BIT4_0 && rv==BIT4_1) {
eeq = eq = BIT4_0;
lt = BIT4_1;
} else if (lv==BIT4_1 && rv==BIT4_0) {
eeq = eq = BIT4_0;
lt = BIT4_0;
}
}
if (eq == BIT4_X)
lt = BIT4_X;
else if ((sig1 == BIT4_1) && (sig2 == BIT4_0))
lt = BIT4_1;
else if ((sig1 == BIT4_0) && (sig2 == BIT4_1))
lt = BIT4_0;
/* Correct the lt bit to account for the sign of the parameters. */
if (lt != BIT4_X) {
/* If the first is negative and the last positive, then
a < b for certain. */
if ((sig1 == BIT4_1) && (sig2 == BIT4_0))
lt = BIT4_1;
/* If the first is positive and the last negative, then
a > b for certain. */
if ((sig1 == BIT4_0) && (sig2 == BIT4_1))
lt = BIT4_0;
// If the first is negative and the last positive, then
// a < b for certain.
if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) {
lt = BIT4_1;
eeq = eq = BIT4_0;
}
// If the first is positive and the last negative, then
// a > b for certain.
if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) {
lt = BIT4_0;
eeq = eq = BIT4_0;
}
thr->flags[4] = eq;
@ -4413,16 +4424,19 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp)
unsigned wid = cp->number;
vvp_vector4_t val (wid, BIT4_0);
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
uint32_t ba = 0;
// If the requested width is >32, then there are no
// actual immediate bits, but we can pad with zero. So
// here we test if we are still working on the LSB, and
// process them if so.
if (idx < 32) {
ba = ((valb >> idx) & 1) << 1;
ba |= (vala >> idx) & 1;
ba = (valb & 1) << 1;
ba |= vala & 1;
}
vala >>= 1;
valb >>= 1;
if (ba == 0) continue;
vvp_bit4_t use_bit = BIT4_0;
switch (ba) {
case 1:
@ -4437,8 +4451,6 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp)
default:
break;
}
if (use_bit == BIT4_0)
continue;
val.set_bit(idx, use_bit);
}

View File

@ -793,11 +793,30 @@ vvp_vector4_t::vvp_vector4_t(unsigned size__, double val)
vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that,
unsigned adr, unsigned wid)
{
// Set up and initialize the destination.
size_ = wid;
assert((adr + wid) <= that.size_);
allocate_words_(WORD_X_ABITS, WORD_X_BBITS);
// Special case: selecting from far beyond the source vector,
// so the result is all X bits. We're done.
if (adr >= that.size_)
return;
// Special case: The source is not quite big enough to supply
// all bits, so get the bits that we can. The remainder will
// be left at BIT4_X.
if ((adr + wid) > that.size_) {
unsigned use_wid = that.size_ - adr;
for (unsigned idx = 0 ; idx < use_wid ; idx += 1)
set_bit(idx, that.value(adr+idx));
return;
}
// At this point, we know that the source part is entirely
// contained in the source vector.
// assert((adr + wid) <= that.size_);
if (wid > BITS_PER_WORD) {
/* In this case, the subvector and the source vector are
long. Do the transfer reasonably efficiently. */

View File

@ -230,7 +230,11 @@ class vvp_vector4_t {
explicit vvp_vector4_t(unsigned size, double val);
// Construct a vector4 from the subvalue of another vector4.
// Construct a vector4 from the subvalue of another
// vector4. The width of the result is 'wid', and the bits are
// pulled from 'that' to implement the Verilog part select
// semantics. This means that part select beyond 'that'
// returns X bits.
explicit vvp_vector4_t(const vvp_vector4_t&that,
unsigned adr, unsigned wid);
@ -398,14 +402,16 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const
if (idx >= size_)
return BIT4_X;
unsigned wdx = idx / BITS_PER_WORD;
unsigned long off = idx % BITS_PER_WORD;
unsigned long off;
unsigned long abits, bbits;
if (size_ > BITS_PER_WORD) {
unsigned wdx = idx / BITS_PER_WORD;
off = idx % BITS_PER_WORD;
abits = abits_ptr_[wdx];
bbits = bbits_ptr_[wdx];
} else {
off = idx;
abits = abits_val_;
bbits = bbits_val_;
}
@ -420,8 +426,7 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const
BIT4_X // bbit==1, abit==1
};
/* Casting is evil, but this cast matches the un-cast done
when the vvp_bit4_t value is put into the vector. */
// This map converts the bit-pattern to a vvp_bit4_t value.
return bits_bit4_map[tmp];
}