Vectorize AND/OR/NAND/NOR/INV instructions when reasonable.

When these operations are applied to wide vectors, it pays to process
the operands as whole vectors instead of bit by bit, which improves
run-time performance. The run time now selects the vectorized or the
bit-serial implementation based on the vector width.
This commit is contained in:
Stephen Williams 2008-05-23 17:52:43 -07:00
parent 492b240304
commit 9af459f95b
3 changed files with 248 additions and 88 deletions

View File

@ -410,20 +410,29 @@ bool of_ABS_WR(vthread_t thr, vvp_code_t cp)
return true; return true;
} }
/*
 * Whole-vector implementation of the AND instruction. Both operands
 * are fetched with vthread_bits_to_vector(), combined with the
 * vvp_vector4_t &= operator, and the result is stored back over the
 * first operand with set_vec().
 *
 *   cp->bit_idx[0]  base of the first operand and of the result
 *   cp->bit_idx[1]  base of the second operand
 *   cp->number      operand width
 */
static bool of_AND_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst_base = cp->bit_idx[0];
      const unsigned src_base = cp->bit_idx[1];
      const unsigned width = cp->number;

      vvp_vector4_t result = vthread_bits_to_vector(thr, dst_base, width);
      result &= vthread_bits_to_vector(thr, src_base, width);
      thr->bits4.set_vec(dst_base, result);

      return true;
}
static bool of_AND_narrow(vthread_t thr, vvp_code_t cp)
{
unsigned idx1 = cp->bit_idx[0];
unsigned idx2 = cp->bit_idx[1];
unsigned wid = cp->number;
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
vvp_bit4_t lb = thr_get_bit(thr, idx1); vvp_bit4_t lb = thr_get_bit(thr, idx1);
vvp_bit4_t rb = thr_get_bit(thr, idx2); vvp_bit4_t rb = thr_get_bit(thr, idx2);
thr_put_bit(thr, idx1, lb&rb);
thr_put_bit(thr, idx1, lb & rb);
idx1 += 1; idx1 += 1;
if (idx2 >= 4) if (idx2 >= 4)
idx2 += 1; idx2 += 1;
@ -432,6 +441,18 @@ bool of_AND(vthread_t thr, vvp_code_t cp)
return true; return true;
} }
/*
 * Entry point for the AND instruction. It checks that the destination
 * base is at least 4, then overwrites cp->opcode with the narrow
 * (width <= 4) or wide implementation and runs that implementation.
 * Presumably the overwrite lets later executions of this instruction
 * go straight to the selected variant.
 */
bool of_AND(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const bool is_narrow = cp->number <= 4;
      cp->opcode = is_narrow ? &of_AND_narrow : &of_AND_wide;

      return cp->opcode(thr, cp);
}
bool of_ADD(vthread_t thr, vvp_code_t cp) bool of_ADD(vthread_t thr, vvp_code_t cp)
{ {
@ -2007,25 +2028,41 @@ bool of_FORK(vthread_t thr, vvp_code_t cp)
return true; return true;
} }
/*
 * Whole-vector implementation of the INV instruction. The operand is
 * fetched with vthread_bits_to_vector(), inverted with the
 * vvp_vector4_t ~ operator, and written back in place with set_vec().
 *
 *   cp->bit_idx[0]  base of the operand (also the destination)
 *   cp->bit_idx[1]  operand width
 */
static bool of_INV_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned base = cp->bit_idx[0];
      const unsigned width = cp->bit_idx[1];

      const vvp_vector4_t operand = vthread_bits_to_vector(thr, base, width);
      thr->bits4.set_vec(base, ~operand);

      return true;
}
/*
 * Bit-serial implementation of the INV instruction. Each thread bit
 * in [bit_idx[0], bit_idx[0] + bit_idx[1]) is read, inverted with the
 * vvp_bit4_t ~ operator, and written back to the same position.
 */
static bool of_INV_narrow(vthread_t thr, vvp_code_t cp)
{
      const unsigned base = cp->bit_idx[0];
      const unsigned width = cp->bit_idx[1];

      for (unsigned off = 0 ; off < width ; off += 1) {
            const unsigned pos = base + off;
            const vvp_bit4_t bit = thr_get_bit(thr, pos);
            thr_put_bit(thr, pos, ~bit);
      }

      return true;
}
/*
 * Entry point for the INV instruction. It checks that the operand base
 * is at least 4, then overwrites cp->opcode with the narrow
 * (width <= 4) or wide implementation and runs that implementation.
 * Presumably the overwrite lets later executions of this instruction
 * go straight to the selected variant.
 *
 * Unlike the two-operand logic instructions, INV carries its width in
 * cp->bit_idx[1] (that is what of_INV_narrow and of_INV_wide read), so
 * the narrow/wide decision must test that field and not cp->number.
 */
bool of_INV(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const unsigned wid = cp->bit_idx[1];
      if (wid <= 4)
            cp->opcode = &of_INV_narrow;
      else
            cp->opcode = &of_INV_wide;

      return cp->opcode(thr, cp);
}
@ -3091,28 +3128,29 @@ bool of_MULI(vthread_t thr, vvp_code_t cp)
return true; return true;
} }
/*
 * Whole-vector implementation of the NAND instruction. Both operands
 * are fetched with vthread_bits_to_vector() and combined with the
 * vvp_vector4_t &= operator; the inverted result is stored back over
 * the first operand with set_vec().
 *
 *   cp->bit_idx[0]  base of the first operand and of the result
 *   cp->bit_idx[1]  base of the second operand
 *   cp->number      operand width
 */
static bool of_NAND_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst_base = cp->bit_idx[0];
      const unsigned src_base = cp->bit_idx[1];
      const unsigned width = cp->number;

      vvp_vector4_t result = vthread_bits_to_vector(thr, dst_base, width);
      result &= vthread_bits_to_vector(thr, src_base, width);
      thr->bits4.set_vec(dst_base, ~result);

      return true;
}
static bool of_NAND_narrow(vthread_t thr, vvp_code_t cp)
{
unsigned idx1 = cp->bit_idx[0];
unsigned idx2 = cp->bit_idx[1];
unsigned wid = cp->number;
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
vvp_bit4_t lb = thr_get_bit(thr, idx1); vvp_bit4_t lb = thr_get_bit(thr, idx1);
vvp_bit4_t rb = thr_get_bit(thr, idx2); vvp_bit4_t rb = thr_get_bit(thr, idx2);
thr_put_bit(thr, idx1, ~(lb&rb));
if ((lb == BIT4_0) || (rb == BIT4_0)) {
thr_put_bit(thr, idx1, BIT4_1);
} else if ((lb == BIT4_1) && (rb == BIT4_1)) {
thr_put_bit(thr, idx1, BIT4_0);
} else {
thr_put_bit(thr, idx1, BIT4_X);
}
idx1 += 1; idx1 += 1;
if (idx2 >= 4) if (idx2 >= 4)
idx2 += 1; idx2 += 1;
@ -3121,6 +3159,18 @@ bool of_NAND(vthread_t thr, vvp_code_t cp)
return true; return true;
} }
/*
 * Entry point for the NAND instruction. It checks that the
 * destination base is at least 4, then overwrites cp->opcode with the
 * narrow (width <= 4) or wide implementation and runs that
 * implementation. Presumably the overwrite lets later executions of
 * this instruction go straight to the selected variant.
 */
bool of_NAND(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const bool is_narrow = cp->number <= 4;
      cp->opcode = is_narrow ? &of_NAND_narrow : &of_NAND_wide;

      return cp->opcode(thr, cp);
}
bool of_NOOP(vthread_t thr, vvp_code_t cp) bool of_NOOP(vthread_t thr, vvp_code_t cp)
{ {
@ -3269,28 +3319,74 @@ bool of_XNORR(vthread_t thr, vvp_code_t cp)
return true; return true;
} }
/*
 * Whole-vector implementation of the OR instruction. Both operands
 * are fetched with vthread_bits_to_vector(), combined with the
 * vvp_vector4_t |= operator, and the result is stored back over the
 * first operand with set_vec().
 *
 *   cp->bit_idx[0]  base of the first operand and of the result
 *   cp->bit_idx[1]  base of the second operand
 *   cp->number      operand width
 */
static bool of_OR_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst_base = cp->bit_idx[0];
      const unsigned src_base = cp->bit_idx[1];
      const unsigned width = cp->number;

      vvp_vector4_t result = vthread_bits_to_vector(thr, dst_base, width);
      result |= vthread_bits_to_vector(thr, src_base, width);
      thr->bits4.set_vec(dst_base, result);

      return true;
}
/*
 * Bit-serial implementation of the OR instruction. For each of the
 * cp->number bit positions, the destination bit and the source bit
 * are read, OR-ed with the vvp_bit4_t | operator, and the result is
 * written over the destination bit.
 */
static bool of_OR_narrow(vthread_t thr, vvp_code_t cp)
{
      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];
      const unsigned width = cp->number;

        // A source address below 4 is never advanced (presumably the
        // addresses 0-3 name constant bits). The test is loop
        // invariant: an address that starts at 4 or above only grows,
        // and one below 4 never changes.
      const unsigned src_step = (src >= 4) ? 1 : 0;

      for (unsigned cnt = 0 ; cnt < width ; cnt += 1) {
            const vvp_bit4_t lhs = thr_get_bit(thr, dst);
            const vvp_bit4_t rhs = thr_get_bit(thr, src);
            thr_put_bit(thr, dst, lhs | rhs);

            dst += 1;
            src += src_step;
      }

      return true;
}
/*
 * Entry point for the OR instruction. It checks that the destination
 * base is at least 4, then overwrites cp->opcode with the narrow
 * (width <= 4) or wide implementation and runs that implementation.
 * Presumably the overwrite lets later executions of this instruction
 * go straight to the selected variant.
 */
bool of_OR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      const bool is_narrow = cp->number <= 4;
      cp->opcode = is_narrow ? &of_OR_narrow : &of_OR_wide;

      return cp->opcode(thr, cp);
}
/*
 * Whole-vector implementation of the NOR instruction. Both operands
 * are fetched with vthread_bits_to_vector() and combined with the
 * vvp_vector4_t |= operator; the inverted result is stored back over
 * the first operand with set_vec().
 *
 *   cp->bit_idx[0]  base of the first operand and of the result
 *   cp->bit_idx[1]  base of the second operand
 *   cp->number      operand width
 */
static bool of_NOR_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst_base = cp->bit_idx[0];
      const unsigned src_base = cp->bit_idx[1];
      const unsigned width = cp->number;

      vvp_vector4_t result = vthread_bits_to_vector(thr, dst_base, width);
      result |= vthread_bits_to_vector(thr, src_base, width);
      thr->bits4.set_vec(dst_base, ~result);

      return true;
}
static bool of_NOR_narrow(vthread_t thr, vvp_code_t cp)
{
unsigned idx1 = cp->bit_idx[0];
unsigned idx2 = cp->bit_idx[1];
unsigned wid = cp->number;
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
vvp_bit4_t lb = thr_get_bit(thr, idx1); vvp_bit4_t lb = thr_get_bit(thr, idx1);
vvp_bit4_t rb = thr_get_bit(thr, idx2); vvp_bit4_t rb = thr_get_bit(thr, idx2);
thr_put_bit(thr, idx1, ~(lb|rb));
if ((lb == BIT4_1) || (rb == BIT4_1)) {
thr_put_bit(thr, idx1, BIT4_1);
} else if ((lb == BIT4_0) && (rb == BIT4_0)) {
thr_put_bit(thr, idx1, BIT4_0);
} else {
thr_put_bit(thr, idx1, BIT4_X);
}
idx1 += 1; idx1 += 1;
if (idx2 >= 4) if (idx2 >= 4)
idx2 += 1; idx2 += 1;
@ -3303,30 +3399,12 @@ bool of_NOR(vthread_t thr, vvp_code_t cp)
{ {
assert(cp->bit_idx[0] >= 4); assert(cp->bit_idx[0] >= 4);
unsigned idx1 = cp->bit_idx[0]; if (cp->number <= 4)
unsigned idx2 = cp->bit_idx[1]; cp->opcode = &of_NOR_narrow;
else
cp->opcode = &of_NOR_wide;
for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { return cp->opcode(thr, cp);
vvp_bit4_t lb = thr_get_bit(thr, idx1);
vvp_bit4_t rb = thr_get_bit(thr, idx2);
if ((lb == BIT4_1) || (rb == BIT4_1)) {
thr_put_bit(thr, idx1, BIT4_0);
} else if ((lb == BIT4_0) && (rb == BIT4_0)) {
thr_put_bit(thr, idx1, BIT4_1);
} else {
thr_put_bit(thr, idx1, BIT4_X);
}
idx1 += 1;
if (idx2 >= 4)
idx2 += 1;
}
return true;
} }
bool of_POW(vthread_t thr, vvp_code_t cp) bool of_POW(vthread_t thr, vvp_code_t cp)

View File

@ -874,6 +874,93 @@ char* vvp_vector4_t::as_string(char*buf, size_t buf_len)
return res; return res;
} }
/*
 * Invert the vector in place. The a-plane is complemented within the
 * valid width and then OR-ed with the b-plane, so every bit whose
 * b-bit is set ends up with both of its plane bits set (presumably
 * the X encoding). The b-plane itself is not modified.
 */
void vvp_vector4_t::invert()
{
      if (size_ <= BITS_PER_WORD) {
              // Single-word storage: keep the bits above size_ clear.
            const unsigned long valid = (size_ < BITS_PER_WORD)
                  ? (1UL << size_) - 1UL
                  : ~0UL;
            abits_val_ = (valid & ~abits_val_) | bbits_val_;
            return;
      }

      const unsigned full_words = size_ / BITS_PER_WORD;
      const unsigned tail_bits = size_ % BITS_PER_WORD;

        // Completely used words need no masking.
      for (unsigned wdx = 0 ; wdx < full_words ; wdx += 1)
            abits_ptr_[wdx] = ~abits_ptr_[wdx] | bbits_ptr_[wdx];

        // The last, partially used word is masked to its valid bits.
      if (tail_bits > 0) {
            const unsigned long valid = (1UL << tail_bits) - 1UL;
            abits_ptr_[full_words] = (valid & ~abits_ptr_[full_words])
                  | bbits_ptr_[full_words];
      }
}
/*
 * In-place 4-state AND of this vector with "that".
 *
 * Each bit is stored as a (b,a) pair of plane bits. In the table
 * below 00 is 0, 01 is 1 and 11 is X; a bit with only its b-plane bit
 * set is Z and is treated like X on either side:
 *
 *         00 01 11
 *      00 00 00 00
 *      01 00 01 11
 *      11 00 11 11
 *
 * The result is 0 if either operand is 0, 1 if both operands are 1,
 * and X otherwise. Nothing here checks that the two vectors have the
 * same size; the word loop is driven by this vector's size_.
 */
vvp_vector4_t& vvp_vector4_t::operator &= (const vvp_vector4_t&that)
{
        // Make sure that all Z bits are turned into X bits.
      change_z2x();

      if (size_ <= BITS_PER_WORD) {
              // Each tmp bit is true if the operand bit is 1, X or Z,
              // i.e. if the operand bit is not a definite 0.
            const unsigned long tmp1 = abits_val_ | bbits_val_;
            const unsigned long tmp2 = that.abits_val_ | that.bbits_val_;

              // The result is non-zero only where neither side is 0...
            abits_val_ = tmp1 & tmp2;
              // ...and of those bits, the ones with an unknown (X or
              // Z) operand are X. Using the b-plane of "that"
              // directly also covers a Z there, which has no a-bit.
            bbits_val_ = (bbits_val_ | that.bbits_val_) & abits_val_;

      } else {
            const unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
            for (unsigned idx = 0; idx < words ; idx += 1) {
                  const unsigned long tmp1 = abits_ptr_[idx] | bbits_ptr_[idx];
                  const unsigned long tmp2 = that.abits_ptr_[idx] | that.bbits_ptr_[idx];

                  abits_ptr_[idx] = tmp1 & tmp2;
                  bbits_ptr_[idx] = (bbits_ptr_[idx] | that.bbits_ptr_[idx])
                        & abits_ptr_[idx];
            }
      }

      return *this;
}
/*
 * In-place 4-state OR of this vector with "that".
 *
 * Each bit is stored as a (b,a) pair of plane bits. In the table
 * below 00 is 0, 01 is 1 and 11 is X; a bit with only its b-plane bit
 * set is Z and is treated like X on either side:
 *
 *         00 01 11
 *      00 00 01 11
 *      01 01 01 01
 *      11 11 01 11
 *
 * The OR is 1 if either operand is 1.
 * The OR is 0 if both operands are 0.
 * Otherwise, the OR is X.
 *
 * Nothing here checks that the two vectors have the same size; the
 * word loop is driven by this vector's size_.
 */
vvp_vector4_t& vvp_vector4_t::operator |= (const vvp_vector4_t&that)
{
        // Make sure that all Z bits are turned into X bits.
      change_z2x();

      if (size_ <= BITS_PER_WORD) {
              // Each tmp bit is true if the operand bit is 1, X or Z.
            const unsigned long tmp1 = abits_val_ | bbits_val_;
            const unsigned long tmp2 = that.abits_val_ | that.bbits_val_;
              // Bits where at least one operand is a definite 1
              // (a-bit set, b-bit clear). A Z has its a-bit clear, so
              // it can never be mistaken for a 1 here.
            const unsigned long one = (abits_val_ & ~bbits_val_)
                  | (that.abits_val_ & ~that.bbits_val_);

              // X wherever an operand is unknown and no operand
              // forces the result to 1.
            bbits_val_ = (bbits_val_ | that.bbits_val_) & ~one;
            abits_val_ = tmp1 | tmp2;

      } else {
            const unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
            for (unsigned idx = 0; idx < words ; idx += 1) {
                  const unsigned long tmp1 = abits_ptr_[idx] | bbits_ptr_[idx];
                  const unsigned long tmp2 = that.abits_ptr_[idx] | that.bbits_ptr_[idx];
                  const unsigned long one = (abits_ptr_[idx] & ~bbits_ptr_[idx])
                        | (that.abits_ptr_[idx] & ~that.bbits_ptr_[idx]);

                  bbits_ptr_[idx] = (bbits_ptr_[idx] | that.bbits_ptr_[idx]) & ~one;
                  abits_ptr_[idx] = tmp1 | tmp2;
            }
      }

      return *this;
}
/* /*
* Add an integer to the vvp_vector4_t in place, bit by bit so that * Add an integer to the vvp_vector4_t in place, bit by bit so that
* there is no size limitations. * there is no size limitations.
@ -2872,20 +2959,6 @@ vvp_bit4_t compare_gtge(const vvp_vector4_t&lef, const vvp_vector4_t&rig,
return out_if_equal; return out_if_equal;
} }
vvp_vector4_t operator ~ (const vvp_vector4_t&that)
{
vvp_vector4_t res = that;
if (res.size_ <= vvp_vector4_t::BITS_PER_WORD) {
res.abits_val_ = res.bbits_val_ | ~res.abits_val_;
} else {
unsigned cnt = (res.size_ + vvp_vector4_t::BITS_PER_WORD - 1) / vvp_vector4_t::BITS_PER_WORD;
for (unsigned idx = 0 ; idx < cnt ; idx += 1)
res.abits_ptr_[idx] = res.bbits_val_ | ~res.abits_val_;
}
return res;
}
vvp_bit4_t compare_gtge_signed(const vvp_vector4_t&a, vvp_bit4_t compare_gtge_signed(const vvp_vector4_t&a,
const vvp_vector4_t&b, const vvp_vector4_t&b,
vvp_bit4_t out_if_equal) vvp_bit4_t out_if_equal)

View File

@ -152,6 +152,9 @@ class vvp_vector4_t {
// Display the value into the buf as a string. // Display the value into the buf as a string.
char*as_string(char*buf, size_t buf_len); char*as_string(char*buf, size_t buf_len);
void invert();
vvp_vector4_t& operator &= (const vvp_vector4_t&that);
vvp_vector4_t& operator |= (const vvp_vector4_t&that);
vvp_vector4_t& operator += (int64_t); vvp_vector4_t& operator += (int64_t);
private: private:
@ -329,7 +332,13 @@ inline void vvp_vector4_t::set_bit(unsigned idx, vvp_bit4_t val)
} }
} }
extern vvp_vector4_t operator ~ (const vvp_vector4_t&that); inline vvp_vector4_t operator ~ (const vvp_vector4_t&that)
{
vvp_vector4_t res = that;
res.invert();
return res;
}
extern ostream& operator << (ostream&, const vvp_vector4_t&); extern ostream& operator << (ostream&, const vvp_vector4_t&);
extern vvp_bit4_t compare_gtge(const vvp_vector4_t&a, extern vvp_bit4_t compare_gtge(const vvp_vector4_t&a,