Vectorize AND/OR/NAND/NOR/INV instructions when reasonable.
When these operations are applied to wide vectors, it pays to process the operands as whole vectors rather than one bit at a time; this improves run-time performance. At run time, select the vectorized or the bit-at-a-time implementation based on the vector width.
This commit is contained in:
parent
492b240304
commit
9af459f95b
224
vvp/vthread.cc
224
vvp/vthread.cc
|
|
@ -410,20 +410,29 @@ bool of_ABS_WR(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
 * Wide-vector implementation of AND. Reads cp->number bits starting at
 * thread bit address cp->bit_idx[0], ANDs them with cp->number bits
 * starting at cp->bit_idx[1] using the vvp_vector4_t &= operator, and
 * writes the result back at cp->bit_idx[0].
 *
 * The whole operand is handled by a single vector operation, so there
 * is deliberately no per-bit loop here.
 *
 * Always returns true.
 */
static bool of_AND_wide(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];
      unsigned wid = cp->number;

      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
      val &= vthread_bits_to_vector(thr, idx2, wid);
      thr->bits4.set_vec(idx1, val);

      return true;
}
|
||||
|
||||
static bool of_AND_narrow(vthread_t thr, vvp_code_t cp)
|
||||
{
|
||||
unsigned idx1 = cp->bit_idx[0];
|
||||
unsigned idx2 = cp->bit_idx[1];
|
||||
unsigned wid = cp->number;
|
||||
|
||||
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
|
||||
vvp_bit4_t lb = thr_get_bit(thr, idx1);
|
||||
vvp_bit4_t rb = thr_get_bit(thr, idx2);
|
||||
|
||||
thr_put_bit(thr, idx1, lb & rb);
|
||||
|
||||
thr_put_bit(thr, idx1, lb&rb);
|
||||
idx1 += 1;
|
||||
if (idx2 >= 4)
|
||||
idx2 += 1;
|
||||
|
|
@ -432,6 +441,18 @@ bool of_AND(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
 * Entry point for AND. It does no bit work itself: it replaces
 * cp->opcode with either the bit-at-a-time handler (cp->number of 4 or
 * less) or the whole-vector handler (anything wider), and then runs the
 * handler it just installed on the same arguments.
 *
 * Returns whatever the selected handler returns.
 */
bool of_AND(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

	// Pick the implementation once and store it in the instruction.
      cp->opcode = (cp->number > 4) ? &of_AND_wide : &of_AND_narrow;

      return cp->opcode(thr, cp);
}
|
||||
|
||||
|
||||
bool of_ADD(vthread_t thr, vvp_code_t cp)
|
||||
{
|
||||
|
|
@ -2007,25 +2028,41 @@ bool of_FORK(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
 * Whole-vector implementation of INV. Fetches cp->bit_idx[1] bits
 * starting at thread bit address cp->bit_idx[0] as a vvp_vector4_t and
 * stores the ~ of that vector back at the same address.
 *
 * Always returns true.
 */
static bool of_INV_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned base = cp->bit_idx[0];
      const unsigned count = cp->bit_idx[1];

      vvp_vector4_t bits = vthread_bits_to_vector(thr, base, count);
      thr->bits4.set_vec(base, ~bits);

      return true;
}
|
||||
|
||||
/*
 * Bit-at-a-time implementation of INV. For each of the cp->bit_idx[1]
 * bits starting at thread bit address cp->bit_idx[0], read the bit,
 * apply ~ to it and write it back to the same address.
 *
 * Always returns true.
 */
static bool of_INV_narrow(vthread_t thr, vvp_code_t cp)
{
      const unsigned base = cp->bit_idx[0];
      const unsigned count = cp->bit_idx[1];

      for (unsigned off = 0 ; off < count ; off += 1) {
	    const unsigned pos = base + off;
	    vvp_bit4_t cur = thr_get_bit(thr, pos);
	    thr_put_bit(thr, pos, ~cur);
      }

      return true;
}
|
||||
|
||||
/*
 * Entry point for INV. It replaces cp->opcode with the bit-at-a-time
 * handler or the whole-vector handler and then runs the handler it just
 * installed on the same arguments.
 *
 * The operand width for this instruction is carried in cp->bit_idx[1]
 * (that is the value both of_INV_narrow and of_INV_wide use as the
 * width), so the selection is made on that field and not on cp->number.
 *
 * Returns whatever the selected handler returns.
 */
bool of_INV(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      if (cp->bit_idx[1] <= 4)
	    cp->opcode = &of_INV_narrow;
      else
	    cp->opcode = &of_INV_wide;

      return cp->opcode(thr, cp);
}
|
||||
|
||||
|
||||
|
|
@ -3091,28 +3128,29 @@ bool of_MULI(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
 * Wide-vector implementation of NAND. Reads cp->number bits starting at
 * thread bit address cp->bit_idx[0], ANDs them with cp->number bits
 * starting at cp->bit_idx[1], and writes the inverted result back at
 * cp->bit_idx[0].
 *
 * The whole operand is handled by a single vector operation. It must
 * not be wrapped in a per-bit loop: repeating "dst = ~(dst & src)" more
 * than once would change the result.
 *
 * Always returns true.
 */
static bool of_NAND_wide(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];
      unsigned wid = cp->number;

      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
      val &= vthread_bits_to_vector(thr, idx2, wid);
      thr->bits4.set_vec(idx1, ~val);

      return true;
}
|
||||
|
||||
static bool of_NAND_narrow(vthread_t thr, vvp_code_t cp)
|
||||
{
|
||||
unsigned idx1 = cp->bit_idx[0];
|
||||
unsigned idx2 = cp->bit_idx[1];
|
||||
unsigned wid = cp->number;
|
||||
|
||||
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
|
||||
vvp_bit4_t lb = thr_get_bit(thr, idx1);
|
||||
vvp_bit4_t rb = thr_get_bit(thr, idx2);
|
||||
|
||||
if ((lb == BIT4_0) || (rb == BIT4_0)) {
|
||||
thr_put_bit(thr, idx1, BIT4_1);
|
||||
|
||||
} else if ((lb == BIT4_1) && (rb == BIT4_1)) {
|
||||
thr_put_bit(thr, idx1, BIT4_0);
|
||||
|
||||
} else {
|
||||
thr_put_bit(thr, idx1, BIT4_X);
|
||||
}
|
||||
|
||||
thr_put_bit(thr, idx1, ~(lb&rb));
|
||||
idx1 += 1;
|
||||
if (idx2 >= 4)
|
||||
idx2 += 1;
|
||||
|
|
@ -3121,6 +3159,18 @@ bool of_NAND(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
 * Entry point for NAND. It replaces cp->opcode with either the
 * bit-at-a-time handler (cp->number of 4 or less) or the whole-vector
 * handler (anything wider), and then runs the handler it just installed
 * on the same arguments.
 *
 * Returns whatever the selected handler returns.
 */
bool of_NAND(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

	// Pick the implementation once and store it in the instruction.
      cp->opcode = (cp->number > 4) ? &of_NAND_wide : &of_NAND_narrow;

      return cp->opcode(thr, cp);
}
|
||||
|
||||
|
||||
bool of_NOOP(vthread_t thr, vvp_code_t cp)
|
||||
{
|
||||
|
|
@ -3269,28 +3319,74 @@ bool of_XNORR(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
 * Whole-vector implementation of OR. Fetches cp->number bits starting
 * at thread bit address cp->bit_idx[0], ORs in cp->number bits starting
 * at cp->bit_idx[1] with the vvp_vector4_t |= operator, and stores the
 * result back at cp->bit_idx[0].
 *
 * Always returns true.
 */
static bool of_OR_wide(vthread_t thr, vvp_code_t cp)
{
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];
      const unsigned count = cp->number;

      vvp_vector4_t acc = vthread_bits_to_vector(thr, dst, count);
      acc |= vthread_bits_to_vector(thr, src, count);
      thr->bits4.set_vec(dst, acc);

      return true;
}
|
||||
|
||||
/*
 * Bit-at-a-time implementation of OR. For each of cp->number bits, the
 * bit at the destination address (starting at cp->bit_idx[0]) is ORed
 * with the bit at the source address (starting at cp->bit_idx[1]) and
 * the result is written to the destination address.
 *
 * The destination address always advances. The source address only
 * advances when it is 4 or greater.
 * NOTE(review): presumably addresses 0-3 name fixed constant bits that
 * are meant to be re-read for every position -- confirm.
 *
 * Always returns true.
 */
static bool of_OR_narrow(vthread_t thr, vvp_code_t cp)
{
      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];

      for (unsigned remaining = cp->number ; remaining > 0 ; remaining -= 1) {
	    vvp_bit4_t left = thr_get_bit(thr, dst);
	    vvp_bit4_t right = thr_get_bit(thr, src);

	    thr_put_bit(thr, dst, left|right);

	    dst += 1;
	    if (src >= 4)
		  src += 1;
      }

      return true;
}
|
||||
|
||||
/*
 * Entry point for OR. It replaces cp->opcode with either the
 * bit-at-a-time handler (cp->number of 4 or less) or the whole-vector
 * handler (anything wider), and then runs the handler it just installed
 * on the same arguments.
 *
 * Returns whatever the selected handler returns.
 */
bool of_OR(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

	// Pick the implementation once and store it in the instruction.
      cp->opcode = (cp->number > 4) ? &of_OR_wide : &of_OR_narrow;

      return cp->opcode(thr, cp);
}
|
||||
|
||||
/*
 * Wide-vector implementation of NOR. Reads cp->number bits starting at
 * thread bit address cp->bit_idx[0], ORs them with cp->number bits
 * starting at cp->bit_idx[1], and writes the inverted result back at
 * cp->bit_idx[0].
 *
 * The whole operand is handled by a single vector operation. It must
 * not be wrapped in a per-bit loop: repeating "dst = ~(dst | src)" more
 * than once would change the result.
 *
 * Always returns true.
 */
static bool of_NOR_wide(vthread_t thr, vvp_code_t cp)
{
      assert(cp->bit_idx[0] >= 4);

      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];
      unsigned wid = cp->number;

      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
      val |= vthread_bits_to_vector(thr, idx2, wid);
      thr->bits4.set_vec(idx1, ~val);

      return true;
}
|
||||
|
||||
static bool of_NOR_narrow(vthread_t thr, vvp_code_t cp)
|
||||
{
|
||||
unsigned idx1 = cp->bit_idx[0];
|
||||
unsigned idx2 = cp->bit_idx[1];
|
||||
unsigned wid = cp->number;
|
||||
|
||||
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
|
||||
vvp_bit4_t lb = thr_get_bit(thr, idx1);
|
||||
vvp_bit4_t rb = thr_get_bit(thr, idx2);
|
||||
|
||||
if ((lb == BIT4_1) || (rb == BIT4_1)) {
|
||||
thr_put_bit(thr, idx1, BIT4_1);
|
||||
|
||||
} else if ((lb == BIT4_0) && (rb == BIT4_0)) {
|
||||
thr_put_bit(thr, idx1, BIT4_0);
|
||||
|
||||
} else {
|
||||
thr_put_bit(thr, idx1, BIT4_X);
|
||||
}
|
||||
|
||||
thr_put_bit(thr, idx1, ~(lb|rb));
|
||||
idx1 += 1;
|
||||
if (idx2 >= 4)
|
||||
idx2 += 1;
|
||||
|
|
@ -3303,30 +3399,12 @@ bool of_NOR(vthread_t thr, vvp_code_t cp)
|
|||
{
|
||||
assert(cp->bit_idx[0] >= 4);
|
||||
|
||||
unsigned idx1 = cp->bit_idx[0];
|
||||
unsigned idx2 = cp->bit_idx[1];
|
||||
if (cp->number <= 4)
|
||||
cp->opcode = &of_NOR_narrow;
|
||||
else
|
||||
cp->opcode = &of_NOR_wide;
|
||||
|
||||
for (unsigned idx = 0 ; idx < cp->number ; idx += 1) {
|
||||
|
||||
vvp_bit4_t lb = thr_get_bit(thr, idx1);
|
||||
vvp_bit4_t rb = thr_get_bit(thr, idx2);
|
||||
|
||||
if ((lb == BIT4_1) || (rb == BIT4_1)) {
|
||||
thr_put_bit(thr, idx1, BIT4_0);
|
||||
|
||||
} else if ((lb == BIT4_0) && (rb == BIT4_0)) {
|
||||
thr_put_bit(thr, idx1, BIT4_1);
|
||||
|
||||
} else {
|
||||
thr_put_bit(thr, idx1, BIT4_X);
|
||||
}
|
||||
|
||||
idx1 += 1;
|
||||
if (idx2 >= 4)
|
||||
idx2 += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
return cp->opcode(thr, cp);
|
||||
}
|
||||
|
||||
bool of_POW(vthread_t thr, vvp_code_t cp)
|
||||
|
|
|
|||
101
vvp/vvp_net.cc
101
vvp/vvp_net.cc
|
|
@ -874,6 +874,93 @@ char* vvp_vector4_t::as_string(char*buf, size_t buf_len)
|
|||
return res;
|
||||
}
|
||||
|
||||
void vvp_vector4_t::invert()
|
||||
{
|
||||
if (size_ <= BITS_PER_WORD) {
|
||||
unsigned long mask = (size_<BITS_PER_WORD)? (1UL<<size_)-1UL : -1UL;
|
||||
abits_val_ = mask & ~abits_val_;
|
||||
abits_val_ |= bbits_val_;
|
||||
} else {
|
||||
unsigned remaining = size_;
|
||||
unsigned idx = 0;
|
||||
while (remaining >= BITS_PER_WORD) {
|
||||
abits_ptr_[idx] = ~abits_ptr_[idx];
|
||||
abits_ptr_[idx] |= bbits_ptr_[idx];
|
||||
idx += 1;
|
||||
remaining -= BITS_PER_WORD;
|
||||
}
|
||||
if (remaining > 0) {
|
||||
unsigned long mask = (1UL<<remaining) - 1UL;
|
||||
abits_ptr_[idx] = mask & ~abits_ptr_[idx];
|
||||
abits_ptr_[idx] |= bbits_ptr_[idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * In-place 4-state bitwise AND of this vector with that. Returns *this.
 *
 * Each bit is held as a (b,a) pair. As used by the table below, 00 is
 * 0, 01 is 1, 11 is X and 10 is Z. The result is 0 if either operand
 * bit is 0, 1 if both are 1, and X otherwise. In particular a Z bit in
 * that is treated exactly like an X bit:
 *
 *          00 01 11 10   <- that
 *      00  00 00 00 00
 *      01  00 01 11 11
 *      11  00 11 11 11
 *      10  00 11 11 11
 *      ^
 *      this
 *
 * Both vectors are walked over the word count derived from size_; no
 * check is made here that that has the same size.
 */
vvp_vector4_t& vvp_vector4_t::operator &= (const vvp_vector4_t&that)
{
	// Make sure that all Z bits are turned into X bits.
      change_z2x();

      if (size_ <= BITS_PER_WORD) {
	      // tmp1/tmp2 bits are true where the operand is 1, X or Z,
	      // i.e. anything other than a definite 0.
	    unsigned long tmp1 = abits_val_ | bbits_val_;
	    unsigned long tmp2 = that.abits_val_ | that.bbits_val_;
	      // The result is unknown where one side is X/Z and the
	      // other side is not a definite 0.
	    bbits_val_ = (tmp1 & that.bbits_val_) | (tmp2 & bbits_val_);
	      // With the b-bit clear this gives 1 only for 1&1; with the
	      // b-bit set it gives the 11 (X) encoding, never 10 (Z).
	    abits_val_ = tmp1 & tmp2;

      } else {
	    unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
	    for (unsigned idx = 0; idx < words ; idx += 1) {
		  unsigned long tmp1 = abits_ptr_[idx] | bbits_ptr_[idx];
		  unsigned long tmp2 = that.abits_ptr_[idx] | that.bbits_ptr_[idx];
		  bbits_ptr_[idx] = (tmp1 & that.bbits_ptr_[idx]) | (tmp2 & bbits_ptr_[idx]);
		  abits_ptr_[idx] = tmp1 & tmp2;
	    }
      }

      return *this;
}
|
||||
|
||||
/*
 * In-place 4-state bitwise OR of this vector with that. Returns *this.
 *
 * Each bit is held as a (b,a) pair. As used by the table below, 00 is
 * 0, 01 is 1, 11 is X and 10 is Z.
 *
 * The OR is 1 if either operand is 1.
 * The OR is 0 if both operands are 0.
 * Otherwise, the OR is X. In particular a Z bit in that is treated
 * exactly like an X bit. The truth table is:
 *
 *          00 01 11 10   <- that
 *      00  00 01 11 11
 *      01  01 01 01 01
 *      11  11 01 11 11
 *      10  11 01 11 11
 *      ^
 *      this
 *
 * Both vectors are walked over the word count derived from size_; no
 * check is made here that that has the same size.
 */
vvp_vector4_t& vvp_vector4_t::operator |= (const vvp_vector4_t&that)
{
	// Make sure that all Z bits are turned into X bits.
      change_z2x();

      if (size_ <= BITS_PER_WORD) {
	      // tmp1/tmp2 bits are true where the operand is 1, X or Z,
	      // i.e. anything other than a definite 0.
	    unsigned long tmp1 = abits_val_ | bbits_val_;
	    unsigned long tmp2 = that.abits_val_ | that.bbits_val_;
	      // (~a | b) is true where an operand is NOT a definite 1.
	      // The result is unknown where one side is X/Z and the
	      // other side is not a definite 1.
	    bbits_val_ = ((~abits_val_ | bbits_val_) & that.bbits_val_)
		  | ((~that.abits_val_ | that.bbits_val_) & bbits_val_);
	      // Every non-zero result (1 or X) has its a-bit set.
	    abits_val_ = tmp1 | tmp2;

      } else {
	    unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
	    for (unsigned idx = 0; idx < words ; idx += 1) {
		  unsigned long tmp1 = abits_ptr_[idx] | bbits_ptr_[idx];
		  unsigned long tmp2 = that.abits_ptr_[idx] | that.bbits_ptr_[idx];
		  bbits_ptr_[idx] = ((~abits_ptr_[idx] | bbits_ptr_[idx]) & that.bbits_ptr_[idx])
			| ((~that.abits_ptr_[idx] | that.bbits_ptr_[idx]) & bbits_ptr_[idx]);
		  abits_ptr_[idx] = tmp1 | tmp2;
	    }
      }

      return *this;
}
|
||||
|
||||
/*
|
||||
* Add an integer to the vvp_vector4_t in place, bit by bit so that
|
||||
* there is no size limitations.
|
||||
|
|
@ -2872,20 +2959,6 @@ vvp_bit4_t compare_gtge(const vvp_vector4_t&lef, const vvp_vector4_t&rig,
|
|||
return out_if_equal;
|
||||
}
|
||||
|
||||
/*
 * Return an inverted copy of that. Only the a-bits of the copy are
 * rewritten: each a-bit becomes b OR (NOT a); the b-bits are copied
 * unchanged.
 *
 * Vectors of up to BITS_PER_WORD bits use the abits_val_/bbits_val_
 * members. Wider vectors are processed word by word through
 * abits_ptr_/bbits_ptr_, and each word must be computed from the
 * matching word of those arrays, not from the *_val_ members.
 */
vvp_vector4_t operator ~ (const vvp_vector4_t&that)
{
      vvp_vector4_t res = that;
      if (res.size_ <= vvp_vector4_t::BITS_PER_WORD) {
	    res.abits_val_ = res.bbits_val_ | ~res.abits_val_;
      } else {
	    unsigned cnt = (res.size_ + vvp_vector4_t::BITS_PER_WORD - 1) / vvp_vector4_t::BITS_PER_WORD;
	    for (unsigned idx = 0 ; idx < cnt ; idx += 1)
		  res.abits_ptr_[idx] = res.bbits_ptr_[idx] | ~res.abits_ptr_[idx];
      }

      return res;
}
|
||||
|
||||
vvp_bit4_t compare_gtge_signed(const vvp_vector4_t&a,
|
||||
const vvp_vector4_t&b,
|
||||
vvp_bit4_t out_if_equal)
|
||||
|
|
|
|||
|
|
@ -152,6 +152,9 @@ class vvp_vector4_t {
|
|||
// Display the value into the buf as a string.
|
||||
char*as_string(char*buf, size_t buf_len);
|
||||
|
||||
void invert();
|
||||
vvp_vector4_t& operator &= (const vvp_vector4_t&that);
|
||||
vvp_vector4_t& operator |= (const vvp_vector4_t&that);
|
||||
vvp_vector4_t& operator += (int64_t);
|
||||
|
||||
private:
|
||||
|
|
@ -329,7 +332,13 @@ inline void vvp_vector4_t::set_bit(unsigned idx, vvp_bit4_t val)
|
|||
}
|
||||
}
|
||||
|
||||
extern vvp_vector4_t operator ~ (const vvp_vector4_t&that);
|
||||
/*
 * Return an inverted copy of that: the argument is copied and the copy
 * is inverted in place with vvp_vector4_t::invert(). The argument
 * itself is not modified.
 */
inline vvp_vector4_t operator ~ (const vvp_vector4_t&that)
{
      vvp_vector4_t flipped (that);
      flipped.invert();
      return flipped;
}
|
||||
|
||||
extern ostream& operator << (ostream&, const vvp_vector4_t&);
|
||||
|
||||
extern vvp_bit4_t compare_gtge(const vvp_vector4_t&a,
|
||||
|
|
|
|||
Loading…
Reference in New Issue