Optimize the %add and %addi instructions

Tightly integrate with the vvp_vector4_t class to get much
better add performance.
This commit is contained in:
Stephen Williams 2014-12-04 12:38:08 -08:00
parent 86139c855d
commit 46ce236cfb
3 changed files with 103 additions and 48 deletions

View File

@ -842,60 +842,20 @@ static void get_immediate_rval(vvp_code_t cp, vvp_vector4_t&val)
for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
uint32_t ba = 0;
// Convert the vala/valb bits to a ba number that can be
// used to select what goes into the value.
// Convert the vala/valb bits to a ba number that
// matches the encoding of the vvp_bit4_t enumeration.
ba = (valb & 1) << 1;
ba |= vala & 1;
switch (ba) {
case 1:
val.set_bit(idx, BIT4_1);
break;
case 2:
val.set_bit(idx, BIT4_Z);
break;
case 3:
val.set_bit(idx, BIT4_X);
break;
default:
break;
}
// Note that the val is already pre-filled with BIT4_0
// bits, so we only need to set non-zero bit values.
if (ba) val.set_bit(idx, (vvp_bit4_t)ba);
vala >>= 1;
valb >>= 1;
}
}
/*
 * Add the right operand into the left operand, Verilog style: if
 * either operand carries any X/Z bits (signalled by subarray()
 * returning nil), the whole result collapses to all-X. The return
 * value is always true; the bool is kept for the opcode interface.
 */
static bool do_ADD(vvp_vector4_t&l, const vvp_vector4_t&r)
{
      const unsigned wid = l.size();
      assert(wid == r.size());

      // subarray() hands back heap copies of the value words, or nil
      // if the operand contains X/Z bits.
      unsigned long*lval = l.subarray(0,wid);
      unsigned long*rval = r.subarray(0,wid);

      if (lval && rval) {
	    // Both operands are fully defined: do wide binary
	    // addition word by word, propagating the carry.
	    unsigned long carry = 0;
	    for (unsigned idx = 0 ; idx*CPU_WORD_BITS < wid ; idx += 1)
		  lval[idx] = add_with_carry(lval[idx], rval[idx], carry);
	    l.setarray(0,wid,lval);
      } else {
	    // X/Z somewhere in an operand: the entire result is X.
	    vvp_vector4_t xres (wid, BIT4_X);
	    l = xres;
      }

      delete[]lval;
      delete[]rval;
      return true;
}
/*
* %add
*
@ -914,7 +874,9 @@ bool of_ADD(vthread_t thr, vvp_code_t)
// replaces a pop and a pull.
vvp_vector4_t&l = thr->peek_vec4();
return do_ADD(l, r);
l.add(r);
return true;
}
/*
@ -935,7 +897,9 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp)
vvp_vector4_t r (wid, BIT4_0);
get_immediate_rval (cp, r);
return do_ADD(l, r);
l.add(r);
return true;
}
bool of_ADD_WR(vthread_t thr, vvp_code_t)
@ -1184,7 +1148,7 @@ bool of_ASSIGN_VEC4_OFF_E(vthread_t thr, vvp_code_t cp)
return true;
int use_off = -off;
assert(wid > use_off);
assert((int)wid > use_off);
unsigned use_wid = wid - use_off;
val = val.subvalue(use_off, use_wid);
off = 0;

View File

@ -510,6 +510,25 @@ int edge(vvp_bit4_t from, vvp_bit4_t to)
return 0;
}
/*
 * Helper for the wide-addition instructions that operate on arrays
 * of unsigned long: add two words plus an incoming carry, and leave
 * the outgoing carry (0 or 1) in the by-reference carry argument.
 */
static inline unsigned long add_with_carry(unsigned long a, unsigned long b,
                                           unsigned long&carry)
{
      // Fold the incoming carry into b first, then add a. Unsigned
      // arithmetic wraps, so each addition overflowed exactly when
      // its result compares less than one of its inputs.
      const unsigned long partial = b + carry;
      const unsigned long sum = a + partial;
      carry = ((partial < b) || (sum < a))? 1 : 0;
      return sum;
}
void vvp_send_vec8(vvp_net_ptr_t ptr, const vvp_vector8_t&val)
{
while (vvp_net_t*cur = ptr.ptr()) {
@ -1366,6 +1385,72 @@ bool vvp_vector4_t::set_vec(unsigned adr, const vvp_vector4_t&that)
return diff_flag;
}
/*
 * Add that vector to this vector. Do it in the Verilog way, which
 * means if we detect any X or Z bits, change the entire result to
 * all X. Any carry out of the most significant bit is dropped.
 *
 * Assume both vectors are the same size.
 *
 * This relies on the vvp_vector4_t bit encoding: each bit is an
 * (abit, bbit) pair and the bbit is set exactly for X/Z values, so a
 * nonzero bbits word is the X/Z test, and for fully-defined values
 * the abits words hold the plain binary value that can be added
 * directly with machine arithmetic.
 */
void vvp_vector4_t::add(const vvp_vector4_t&that)
{
assert(size_ == that.size_);
// Case 1: narrower than one word, value stored inline in
// abits_val_/bbits_val_ with only the low size_ bits meaningful.
if (size_ < BITS_PER_WORD) {
unsigned long mask = ~(-1UL << size_);
if ((bbits_val_|that.bbits_val_) & mask) {
// X/Z present: force every bit to X (abit=1, bbit=1).
abits_val_ |= mask;
bbits_val_ |= mask;
return;
}
// Plain binary add, then trim the carry-out/overflow bits.
abits_val_ += that.abits_val_;
abits_val_ &= mask;
return;
}
// Case 2: exactly one word, stored inline; no masking needed
// because the word is fully used.
if (size_ == BITS_PER_WORD) {
if (bbits_val_ | that.bbits_val_) {
abits_val_ = WORD_X_ABITS;
bbits_val_ = WORD_X_BBITS;
} else {
abits_val_ += that.abits_val_;
}
return;
}
// Case 3: wide value stored in heap arrays. Add the full words
// with carry propagation, checking each word pair for X/Z first.
int cnt = size_ / BITS_PER_WORD;
unsigned long carry = 0;
for (int idx = 0 ; idx < cnt ; idx += 1) {
if (bbits_ptr_[idx] | that.bbits_ptr_[idx])
goto x_out;
abits_ptr_[idx] = add_with_carry(abits_ptr_[idx], that.abits_ptr_[idx], carry);
}
// Partial trailing word, if the size is not a word multiple. Only
// the low "tail" bits count, for the X/Z test and the result alike.
if (unsigned tail = size_ % BITS_PER_WORD) {
unsigned long mask = ~( -1UL << tail );
if ((bbits_ptr_[cnt] | that.bbits_ptr_[cnt])&mask)
goto x_out;
abits_ptr_[cnt] = add_with_carry(abits_ptr_[cnt], that.abits_ptr_[cnt], carry);
abits_ptr_[cnt] &= mask;
}
return;
// X/Z was detected part way through: overwrite the whole value
// (including any words already summed above) with X bits.
x_out:
for (int idx = 0 ; idx < cnt ; idx += 1) {
abits_ptr_[idx] = WORD_X_ABITS;
bbits_ptr_[idx] = WORD_X_BBITS;
}
if (unsigned tail = size_%BITS_PER_WORD) {
unsigned long mask = ~( -1UL << tail );
abits_ptr_[cnt] = WORD_X_ABITS&mask;
bbits_ptr_[cnt] = WORD_X_BBITS&mask;
}
}
void vvp_vector4_t::mov(unsigned dst, unsigned src, unsigned cnt)
{
assert(dst+cnt <= size_);

View File

@ -130,6 +130,9 @@ struct automatic_hooks_s {
* values. The enumeration has fixed numeric values that can be
* expressed in 2 real bits, so that some of the internal classes can
* pack them tightly.
*
* WARNING: Many things rely on this encoding for the BIT4_* enumeration
* values, so accept that these values are cast in stone.
*/
enum vvp_bit4_t {
BIT4_0 = 0,
@ -268,6 +271,9 @@ class vvp_vector4_t {
// Move bits within this vector.
void mov(unsigned dst, unsigned src, unsigned cnt);
// Add that to this in the Verilog way.
void add(const vvp_vector4_t&that);
// Test that the vectors are exactly equal
bool eeq(const vvp_vector4_t&that) const;