Performance tweaks
Better performance for %cmp/s and the vvp_fun_part_sa node.
parent 2aeb3871ed
commit 8aca66b109
@@ -246,16 +246,16 @@ value stack.
 Pop a value from the vec4 stack, convert it using Verilog rules to a
 vector2 (binary) value, and push the result.

-* %cmp/u <bit-l>, <bit-r>, <wid> (XXXX Old meaning)
-* %cmp/s <bit-l>, <bit-r>, <wid> (XXXX Old meaning)
+* %cmp/s
+* %cmp/u

-These instructions perform a generic comparison of two vectors of equal
-size. The <bit-l> and <bit-r> numbers address the least-significant
-bit of each vector, and <wid> is the width. If either operand is 0,
-1, 2 or 3 then it is taken to be a constant replicated to the selected
-width.
+These instructions perform a generic comparison of two vectors of
+equal size. Two values are pulled from the top of the stack, and not
+replaced. The results are written into flag bits 4,5,6. The
+expressions (a<b), (a==b) and (a===b) are calculated, with (b) popped
+from the stack first, then (a).

-The results of the comparison go into bits 4, 5, 6 and 7:
+The results of the comparison go into flags 4, 5, 6 and 7:

    4: eq (equal)
    5: lt (less than)
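To make the new stack discipline concrete, here is a small standalone C++ sketch of the flag convention described above. It is not the vvp implementation; the stack of plain integers and the Bit4/Flags types are invented for the example, and only the fully defined (no x/z) case is modeled.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical 4-state scalar: 0, 1, X, Z. Only the fully-defined
// case matters for this sketch; X/Z would force eq and lt to X.
enum class Bit4 { B0, B1, BX, BZ };

struct Flags {
      Bit4 eq;   // flag 4
      Bit4 lt;   // flag 5
      Bit4 eeq;  // flag 6
};

// Model of the %cmp/s stack behaviour: (b) is popped first, then (a),
// and neither value is pushed back. Values are plain signed integers
// here instead of vec4 vectors, just to show the flag convention.
static Flags cmp_s(std::vector<int64_t>&stack)
{
      int64_t b = stack.back(); stack.pop_back();
      int64_t a = stack.back(); stack.pop_back();

      Flags f;
      f.eq  = (a == b) ? Bit4::B1 : Bit4::B0;
      f.lt  = (a <  b) ? Bit4::B1 : Bit4::B0;
      f.eeq = f.eq;  // with no X/Z bits, (a===b) degenerates to (a==b)
      return f;
}

int main()
{
      std::vector<int64_t> stack;
      stack.push_back(-5);   // a, pushed first
      stack.push_back(3);    // b, pushed last, so popped first

      Flags f = cmp_s(stack);
      assert(f.lt == Bit4::B1);   // -5 < 3
      assert(f.eq == Bit4::B0);
      assert(stack.empty());      // operands are not replaced
      return 0;
}

The important points are that (b) is popped first, (a) second, neither value is pushed back, and flags 4, 5 and 6 receive eq, lt and eeq.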
@@ -276,21 +276,6 @@ The %cmp/u and %cmp/s differ only in the handling of the lt bit. The
 compare. In either case, if either operand contains x or z, then lt
 bit gets the x value.

-* %cmp/s
-* %cmp/u
-
-These instructions perform a generic comparison of two vectors of
-equal size. Two values are pulled from the top of the stack, and not
-replaced. The results are written into flag bits 4,5,6. The
-expressions (a<b), (a==b) and (a===b) are calculated, with (b) popped
-from the stack first, then (a).
-
-The results of the comparison go into flags 4, 5, 6 and 7:
-
-   4: eq (equal)
-   5: lt (less than)
-   6: eeq (case equal)
-
 * %cmp/wr

 Compare real values for equality and less-then. This opcode pops to
@@ -57,11 +57,7 @@ void vvp_fun_part_sa::recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
 {
       assert(port.port() == 0);

-      vvp_vector4_t tmp (wid_, BIT4_X);
-      for (unsigned idx = 0 ; idx < wid_ ; idx += 1) {
-            if (idx + base_ < bit.size())
-                  tmp.set_bit(idx, bit.value(base_+idx));
-      }
+      vvp_vector4_t tmp (bit, base_, wid_);
       if (val_ .eeq( tmp ))
             return;
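The rewritten recv_vec4 relies on the new subvector constructor doing exactly what the removed loop did: take wid_ bits starting at base_, and leave any bits past the end of the source at X. A standalone sketch of that equivalence, using a plain std::vector of 4-state bits instead of vvp_vector4_t, might look like this:

#include <cassert>
#include <vector>

enum class Bit4 { B0, B1, BX, BZ };

// Old style: preset the destination to X, then copy only the bits
// that actually exist in the source.
static std::vector<Bit4> part_copy_loop(const std::vector<Bit4>&src,
                                        unsigned base, unsigned wid)
{
      std::vector<Bit4> tmp(wid, Bit4::BX);
      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
            if (base + idx < src.size())
                  tmp[idx] = src[base + idx];
      }
      return tmp;
}

// New style: a single "subvector" construction with the same
// semantics, analogous to vvp_vector4_t(that, adr, wid).
static std::vector<Bit4> part_select(const std::vector<Bit4>&src,
                                     unsigned base, unsigned wid)
{
      std::vector<Bit4> tmp(wid, Bit4::BX);
      unsigned avail = (base < src.size()) ? src.size() - base : 0;
      if (avail > wid) avail = wid;
      for (unsigned idx = 0 ; idx < avail ; idx += 1)
            tmp[idx] = src[base + idx];
      return tmp;
}

int main()
{
      std::vector<Bit4> src = { Bit4::B1, Bit4::B0, Bit4::B1, Bit4::B1 };
      // Select 4 bits starting at bit 2: two real bits, two X padding bits.
      assert(part_copy_loop(src, 2, 4) == part_select(src, 2, 4));
      return 0;
}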
@@ -1488,6 +1488,16 @@ bool of_CAST2(vthread_t thr, vvp_code_t)
       return true;
 }

+/*
+ * %cmp/s
+ *
+ * Pop the operands from the stack, and do not replace them. The
+ * results are written to flag bits:
+ *
+ *    4: eq  (equal)
+ *    5: lt  (less than)
+ *    6: eeq (case equal)
+ */
 bool of_CMPS(vthread_t thr, vvp_code_t)
 {
       vvp_bit4_t eq = BIT4_1;
@@ -1498,50 +1508,51 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
       vvp_vector4_t lval = thr->pop_vec4();

       assert(rval.size() == lval.size());

+        // If either value has XZ bits, then the eq and lt values are
+        // known already to be X. Just calculate the eeq result as a
+        // special case and short circuit the rest of the compare.
+      if (lval.has_xz() || rval.has_xz()) {
+            thr->flags[4] = BIT4_X; // eq
+            thr->flags[5] = BIT4_X; // lt
+            thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
+            return true;
+      }
+
+        // Past this point, we know we are dealing only with fully
+        // defined values.
       unsigned wid = lval.size();

       const vvp_bit4_t sig1 = lval.value(wid-1);
       const vvp_bit4_t sig2 = rval.value(wid-1);

-      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
+      for (unsigned idx = 0 ; idx < (wid-1) ; idx += 1) {
            vvp_bit4_t lv = lval.value(idx);
            vvp_bit4_t rv = rval.value(idx);

-            if (lv > rv) {
-                  lt = BIT4_0;
-                  eeq = BIT4_0;
-            } else if (lv < rv) {
-                  lt = BIT4_1;
-                  eeq = BIT4_0;
-            }
-            if (eq != BIT4_X) {
-                  if ((lv == BIT4_0) && (rv != BIT4_0))
-                        eq = BIT4_0;
-                  if ((lv == BIT4_1) && (rv != BIT4_1))
-                        eq = BIT4_0;
-                  if (bit4_is_xz(lv) || bit4_is_xz(rv))
-                        eq = BIT4_X;
-            }
+            if (lv==BIT4_0 && rv==BIT4_1) {
+                  eeq = eq = BIT4_0;
+                  lt = BIT4_1;
+            } else if (lv==BIT4_1 && rv==BIT4_0) {
+                  eeq = eq = BIT4_0;
+                  lt = BIT4_0;
+            }
       }

-      if (eq == BIT4_X)
-            lt = BIT4_X;
-      else if ((sig1 == BIT4_1) && (sig2 == BIT4_0))
-            lt = BIT4_1;
-      else if ((sig1 == BIT4_0) && (sig2 == BIT4_1))
-            lt = BIT4_0;
-
-        /* Correct the lt bit to account for the sign of the parameters. */
-      if (lt != BIT4_X) {
-              /* If the first is negative and the last positive, then
-                 a < b for certain. */
-            if ((sig1 == BIT4_1) && (sig2 == BIT4_0))
-                  lt = BIT4_1;
-
-              /* If the first is positive and the last negative, then
-                 a > b for certain. */
-            if ((sig1 == BIT4_0) && (sig2 == BIT4_1))
-                  lt = BIT4_0;
-      }
+        // If the first is negative and the last positive, then
+        // a < b for certain.
+      if ((sig1 == BIT4_1) && (sig2 == BIT4_0)) {
+            lt = BIT4_1;
+            eeq = eq = BIT4_0;
+      }
+
+        // If the first is positive and the last negative, then
+        // a > b for certain.
+      if ((sig1 == BIT4_0) && (sig2 == BIT4_1)) {
+            lt = BIT4_0;
+            eeq = eq = BIT4_0;
+      }

       thr->flags[4] = eq;
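The loop above implements the usual two's-complement trick: scan the magnitude bits from LSB to MSB so the most significant difference wins, then let differing sign bits override the result. A minimal sketch of the same idea on plain bools (fully defined values only, as guaranteed by the has_xz early return) follows; the function and test values are invented for illustration:

#include <cassert>
#include <vector>

// Sketch of the signed-compare idea: bits are stored LSB first, and
// the last bit is the sign bit.
static bool signed_less_than(const std::vector<bool>&a,
                             const std::vector<bool>&b)
{
      assert(a.size() == b.size() && a.size() > 0);
      unsigned wid = a.size();

      bool lt = false;
      // Scan the magnitude bits from LSB to MSB; the most significant
      // differing bit seen last decides the ordering so far.
      for (unsigned idx = 0 ; idx < wid - 1 ; idx += 1) {
            if (!a[idx] && b[idx]) lt = true;
            else if (a[idx] && !b[idx]) lt = false;
      }

      bool sig_a = a[wid-1];
      bool sig_b = b[wid-1];
      // Differing sign bits override everything: negative < positive.
      if (sig_a && !sig_b) lt = true;
      if (!sig_a && sig_b) lt = false;
      return lt;
}

int main()
{
      // 4-bit signed values, LSB first: -3 = 1101b -> {1,0,1,1}, 2 = 0010b -> {0,1,0,0}
      std::vector<bool> minus3 = { true, false, true, true };
      std::vector<bool> two    = { false, true, false, false };
      assert(signed_less_than(minus3, two));
      assert(!signed_less_than(two, minus3));
      return 0;
}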
@@ -4413,16 +4424,19 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp)
       unsigned wid = cp->number;

       vvp_vector4_t val (wid, BIT4_0);
-      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
+      for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
            uint32_t ba = 0;
              // If the requested width is >32, then there are no
              // actual immediate bits, but we can pad with zero. So
              // here we test if we are still working on the LSB, and
              // process them if so.
            if (idx < 32) {
-                  ba = ((valb >> idx) & 1) << 1;
-                  ba |= (vala >> idx) & 1;
+                  ba = (valb & 1) << 1;
+                  ba |= vala & 1;
            }
+            vala >>= 1;
+            valb >>= 1;
+            if (ba == 0) continue;
            vvp_bit4_t use_bit = BIT4_0;
            switch (ba) {
              case 1:
@@ -4437,8 +4451,6 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp)
              default:
                  break;
            }
-            if (use_bit == BIT4_0)
-                  continue;
            val.set_bit(idx, use_bit);
       }

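The two tweaks to of_PUSHI_VEC4 are the extra (vala|valb) loop condition, which stops as soon as no immediate bits remain, and the early continue for bit pairs that decode to 0, which are already covered by initializing the result to BIT4_0. A standalone sketch of that decode loop is below; the Bit4 type and the exact ba-to-bit mapping for 2 and 3 are assumptions for the example, since the switch cases are not fully visible in this hunk:

#include <cassert>
#include <cstdint>
#include <vector>

enum class Bit4 { B0, B1, BX, BZ };

// Sketch of the immediate-decode loop: vala/valb are the two bit
// planes of a 32-bit immediate. The result is preset to 0, so the
// loop can stop as soon as both planes are exhausted and can skip
// any bit pair that decodes to 0.
static std::vector<Bit4> decode_immediate(uint32_t vala, uint32_t valb,
                                          unsigned wid)
{
      std::vector<Bit4> val(wid, Bit4::B0);

      for (unsigned idx = 0 ; idx < wid && (vala | valb) ; idx += 1) {
            unsigned ba = ((valb & 1) << 1) | (vala & 1);
            vala >>= 1;
            valb >>= 1;
            if (ba == 0) continue;   // already BIT4_0 in the result

            switch (ba) {
                case 1: val[idx] = Bit4::B1; break;
                case 2: val[idx] = Bit4::BZ; break;  // assumed encoding
                case 3: val[idx] = Bit4::BX; break;  // assumed encoding
            }
      }
      return val;
}

int main()
{
      // Immediate value 0b101, width 8: only three loop iterations do
      // real work, the rest of the vector stays at the preset 0.
      std::vector<Bit4> v = decode_immediate(0x5, 0x0, 8);
      assert(v[0] == Bit4::B1 && v[1] == Bit4::B0 && v[2] == Bit4::B1);
      assert(v[7] == Bit4::B0);
      return 0;
}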
@@ -793,11 +793,30 @@ vvp_vector4_t::vvp_vector4_t(unsigned size__, double val)
 vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that,
                              unsigned adr, unsigned wid)
 {
+        // Set up and initialize the destination.
       size_ = wid;
-      assert((adr + wid) <= that.size_);

       allocate_words_(WORD_X_ABITS, WORD_X_BBITS);

+        // Special case: selecting from far beyond the source vector,
+        // so the result is all X bits. We're done.
+      if (adr >= that.size_)
+            return;
+
+        // Special case: The source is not quite big enough to supply
+        // all bits, so get the bits that we can. The remainder will
+        // be left at BIT4_X.
+      if ((adr + wid) > that.size_) {
+            unsigned use_wid = that.size_ - adr;
+            for (unsigned idx = 0 ; idx < use_wid ; idx += 1)
+                  set_bit(idx, that.value(adr+idx));
+
+            return;
+      }
+
+        // At this point, we know that the source part is entirely
+        // contained in the source vector.
+        // assert((adr + wid) <= that.size_);
+
       if (wid > BITS_PER_WORD) {
            /* In this case, the subvector and the source vector are
               long. Do the transfer reasonably efficiently. */
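The tail of this constructor (outside the hunk) handles the fully contained case, with a word-at-a-time path for wide vectors. As a rough idea of what such a transfer usually looks like, here is a generic shift/mask sketch for one bit plane; it is not the vvp code, and BITS_PER_WORD and copy_bit_range are names made up for the example:

#include <cassert>
#include <cstdint>
#include <vector>

static const unsigned BITS_PER_WORD = 64;

// Copy 'wid' bits starting at bit 'adr' of a packed word array into a
// new word array, one destination word at a time.
static std::vector<uint64_t> copy_bit_range(const std::vector<uint64_t>&src,
                                            unsigned adr, unsigned wid)
{
      std::vector<uint64_t> dst((wid + BITS_PER_WORD - 1) / BITS_PER_WORD, 0);

      unsigned word = adr / BITS_PER_WORD;
      unsigned shift = adr % BITS_PER_WORD;

      for (unsigned idx = 0 ; idx < dst.size() ; idx += 1) {
            uint64_t lo = src[word + idx] >> shift;
            uint64_t hi = (shift && word + idx + 1 < src.size())
                  ? src[word + idx + 1] << (BITS_PER_WORD - shift)
                  : 0;
            dst[idx] = lo | hi;
      }

      // Mask off any bits beyond the requested width in the last word.
      if (unsigned tail = wid % BITS_PER_WORD)
            dst.back() &= (uint64_t(1) << tail) - 1;

      return dst;
}

int main()
{
      std::vector<uint64_t> src = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
      std::vector<uint64_t> out = copy_bit_range(src, 4, 72);
      // The low word is src[0] shifted right by 4, with the low bits of
      // src[1] shifted into the top; the second word keeps only 8 bits.
      assert(out[0] == ((src[0] >> 4) | (src[1] << 60)));
      assert(out[1] == ((src[1] >> 4) & 0xff));
      return 0;
}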
@@ -230,7 +230,11 @@ class vvp_vector4_t {

       explicit vvp_vector4_t(unsigned size, double val);

-        // Construct a vector4 from the subvalue of another vector4.
+        // Construct a vector4 from the subvalue of another
+        // vector4. The width of the result is 'wid', and the bits are
+        // pulled from 'that' to implement the Verilog part select
+        // semantics. This means that part select beyond 'that'
+        // returns X bits.
       explicit vvp_vector4_t(const vvp_vector4_t&that,
                              unsigned adr, unsigned wid);

@@ -398,14 +402,16 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const
       if (idx >= size_)
            return BIT4_X;

-      unsigned wdx = idx / BITS_PER_WORD;
-      unsigned long off = idx % BITS_PER_WORD;
+      unsigned long off;

       unsigned long abits, bbits;
       if (size_ > BITS_PER_WORD) {
+            unsigned wdx = idx / BITS_PER_WORD;
+            off = idx % BITS_PER_WORD;
            abits = abits_ptr_[wdx];
            bbits = bbits_ptr_[wdx];
       } else {
+            off = idx;
            abits = abits_val_;
            bbits = bbits_val_;
       }
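The point of this change is that a vector small enough to fit in one word stores its bits inline, so the accessor only pays for the divide and modulo when it actually has to index the allocated word array. A self-contained sketch of that branch structure, with invented names (BitStore, inline_word, word_array), is:

#include <cassert>
#include <cstdint>

static const unsigned BITS_PER_WORD = 64;

struct BitStore {
      unsigned size;
      uint64_t inline_word;        // used when size <= BITS_PER_WORD
      const uint64_t*word_array;   // used when size >  BITS_PER_WORD

      bool value(unsigned idx) const
      {
            if (idx >= size)
                  return false;

            unsigned long off;
            uint64_t bits;
            if (size > BITS_PER_WORD) {
                  unsigned wdx = idx / BITS_PER_WORD;  // only pay for the
                  off = idx % BITS_PER_WORD;           // division here
                  bits = word_array[wdx];
            } else {
                  off = idx;                           // word is implicit
                  bits = inline_word;
            }
            return (bits >> off) & 1;
      }
};

int main()
{
      BitStore small = { 8, 0xa5, nullptr };
      assert(small.value(0) == true);   // 0xa5 = 1010'0101
      assert(small.value(1) == false);

      uint64_t words[2] = { 0, 1 };     // bit 64 set
      BitStore big = { 128, 0, words };
      assert(big.value(64) == true);
      assert(big.value(63) == false);
      return 0;
}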
@ -420,8 +426,7 @@ inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const
|
|||
BIT4_X // bbit==1, abit==1
|
||||
};
|
||||
|
||||
/* Casting is evil, but this cast matches the un-cast done
|
||||
when the vvp_bit4_t value is put into the vector. */
|
||||
// This map converts the bit-pattern to a vvp_bit4_t value.
|
||||
return bits_bit4_map[tmp];
|
||||
}
|
||||
|
||||
|
|
|
|||