Optimize the %add and %addi instructions
Tightly integrate with the vvp_vector4_t class to get much better add performance.
This commit is contained in:
parent
86139c855d
commit
46ce236cfb
|
|
@ -842,60 +842,20 @@ static void get_immediate_rval(vvp_code_t cp, vvp_vector4_t&val)
|
|||
|
||||
for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) {
|
||||
uint32_t ba = 0;
|
||||
// Convert the vala/valb bits to a ba number that can be
|
||||
// used to select what goes into the value.
|
||||
// Convert the vala/valb bits to a ba number that
|
||||
// matches the encoding of the vvp_bit4_t enumeration.
|
||||
ba = (valb & 1) << 1;
|
||||
ba |= vala & 1;
|
||||
|
||||
switch (ba) {
|
||||
case 1:
|
||||
val.set_bit(idx, BIT4_1);
|
||||
break;
|
||||
case 2:
|
||||
val.set_bit(idx, BIT4_Z);
|
||||
break;
|
||||
case 3:
|
||||
val.set_bit(idx, BIT4_X);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
// Note that the val is already pre-filled with BIT4_0
|
||||
// bits, so we only need to set non-zero bit values.
|
||||
if (ba) val.set_bit(idx, (vvp_bit4_t)ba);
|
||||
|
||||
vala >>= 1;
|
||||
valb >>= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Add the vector r into the vector l, in place. Both vectors must be
 * the same width. The word arrays are obtained from subarray(); if
 * either call yields a null pointer, the whole of l is replaced with
 * BIT4_X bits (presumably subarray() returns null when the vector
 * holds X/Z bits -- confirm against vvp_vector4_t::subarray).
 *
 * Always returns true.
 */
static bool do_ADD(vvp_vector4_t&l, const vvp_vector4_t&r)
{
      unsigned wid = l.size();
      assert(wid == r.size());

      unsigned long*lhs_words = l.subarray(0,wid);
      unsigned long*rhs_words = r.subarray(0,wid);
      const bool have_words = (lhs_words != 0) && (rhs_words != 0);

      if (have_words) {
            // Ripple the carry through the words, least significant
            // word first, then write the sum back into l.
            unsigned long carry = 0;
            for (unsigned word = 0 ; (word*CPU_WORD_BITS) < wid ; word += 1)
                  lhs_words[word] = add_with_carry(lhs_words[word],
                                                   rhs_words[word], carry);

            l.setarray(0,wid,lhs_words);
      }

      // The arrays are released here in both cases; delete[] of a
      // null pointer is harmless.
      delete[]lhs_words;
      delete[]rhs_words;

      if (!have_words) {
            vvp_vector4_t tmp (wid, BIT4_X);
            l = tmp;
      }

      return true;
}
|
||||
|
||||
/*
|
||||
* %add
|
||||
*
|
||||
|
|
@ -914,7 +874,9 @@ bool of_ADD(vthread_t thr, vvp_code_t)
|
|||
// replaces a pop and a pull.
|
||||
vvp_vector4_t&l = thr->peek_vec4();
|
||||
|
||||
return do_ADD(l, r);
|
||||
l.add(r);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -935,7 +897,9 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp)
|
|||
vvp_vector4_t r (wid, BIT4_0);
|
||||
get_immediate_rval (cp, r);
|
||||
|
||||
return do_ADD(l, r);
|
||||
l.add(r);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool of_ADD_WR(vthread_t thr, vvp_code_t)
|
||||
|
|
@ -1184,7 +1148,7 @@ bool of_ASSIGN_VEC4_OFF_E(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
|
||||
int use_off = -off;
|
||||
assert(wid > use_off);
|
||||
assert((int)wid > use_off);
|
||||
unsigned use_wid = wid - use_off;
|
||||
val = val.subvalue(use_off, use_wid);
|
||||
off = 0;
|
||||
|
|
|
|||
|
|
@ -510,6 +510,25 @@ int edge(vvp_bit4_t from, vvp_bit4_t to)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Helper for wide addition over arrays of unsigned long. Returns the
 * low word of a + b + carry, and replaces carry with 1 if that sum
 * wrapped around the word size, or 0 if it did not.
 */
static inline unsigned long add_with_carry(unsigned long a, unsigned long b,
                                           unsigned long&carry)
{
      const unsigned long b_plus_carry = b + carry;
      const unsigned long result = a + b_plus_carry;

      // Unsigned arithmetic wraps, so a result smaller than one of
      // its operands means that step overflowed.
      const bool wrapped = (b_plus_carry < b)
            || (result < b_plus_carry)
            || (result < a);

      carry = wrapped ? 1 : 0;
      return result;
}
|
||||
|
||||
void vvp_send_vec8(vvp_net_ptr_t ptr, const vvp_vector8_t&val)
|
||||
{
|
||||
while (vvp_net_t*cur = ptr.ptr()) {
|
||||
|
|
@ -1366,6 +1385,72 @@ bool vvp_vector4_t::set_vec(unsigned adr, const vvp_vector4_t&that)
|
|||
return diff_flag;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add that vector to this vector. Do it in the Verilog way, which
|
||||
* means if we detect any X or Z bits, change the entire results to
|
||||
* all X.
|
||||
*
|
||||
* Assume both vectors are the same size.
|
||||
*/
|
||||
void vvp_vector4_t::add(const vvp_vector4_t&that)
|
||||
{
|
||||
assert(size_ == that.size_);
|
||||
|
||||
if (size_ < BITS_PER_WORD) {
|
||||
unsigned long mask = ~(-1UL << size_);
|
||||
if ((bbits_val_|that.bbits_val_) & mask) {
|
||||
abits_val_ |= mask;
|
||||
bbits_val_ |= mask;
|
||||
return;
|
||||
}
|
||||
|
||||
abits_val_ += that.abits_val_;
|
||||
abits_val_ &= mask;
|
||||
return;
|
||||
}
|
||||
|
||||
if (size_ == BITS_PER_WORD) {
|
||||
if (bbits_val_ | that.bbits_val_) {
|
||||
abits_val_ = WORD_X_ABITS;
|
||||
bbits_val_ = WORD_X_BBITS;
|
||||
} else {
|
||||
abits_val_ += that.abits_val_;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
int cnt = size_ / BITS_PER_WORD;
|
||||
unsigned long carry = 0;
|
||||
for (int idx = 0 ; idx < cnt ; idx += 1) {
|
||||
if (bbits_ptr_[idx] | that.bbits_ptr_[idx])
|
||||
goto x_out;
|
||||
|
||||
abits_ptr_[idx] = add_with_carry(abits_ptr_[idx], that.abits_ptr_[idx], carry);
|
||||
}
|
||||
|
||||
if (unsigned tail = size_ % BITS_PER_WORD) {
|
||||
unsigned long mask = ~( -1UL << tail );
|
||||
if ((bbits_ptr_[cnt] | that.bbits_ptr_[cnt])&mask)
|
||||
goto x_out;
|
||||
|
||||
abits_ptr_[cnt] = add_with_carry(abits_ptr_[cnt], that.abits_ptr_[cnt], carry);
|
||||
abits_ptr_[cnt] &= mask;
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
x_out:
|
||||
for (int idx = 0 ; idx < cnt ; idx += 1) {
|
||||
abits_ptr_[idx] = WORD_X_ABITS;
|
||||
bbits_ptr_[idx] = WORD_X_BBITS;
|
||||
}
|
||||
if (unsigned tail = size_%BITS_PER_WORD) {
|
||||
unsigned long mask = ~( -1UL << tail );
|
||||
abits_ptr_[cnt] = WORD_X_ABITS&mask;
|
||||
bbits_ptr_[cnt] = WORD_X_BBITS&mask;
|
||||
}
|
||||
}
|
||||
|
||||
void vvp_vector4_t::mov(unsigned dst, unsigned src, unsigned cnt)
|
||||
{
|
||||
assert(dst+cnt <= size_);
|
||||
|
|
|
|||
|
|
@ -130,6 +130,9 @@ struct automatic_hooks_s {
|
|||
* values. The enumeration has fixed numeric values that can be
|
||||
* expressed in 2 real bits, so that some of the internal classes can
|
||||
* pack them tightly.
|
||||
*
|
||||
* WARNING: Many things rely on this encoding for the BIT4_* enumeration
|
||||
* values, so accept that these values are cast in stone.
|
||||
*/
|
||||
enum vvp_bit4_t {
|
||||
BIT4_0 = 0,
|
||||
|
|
@ -268,6 +271,9 @@ class vvp_vector4_t {
|
|||
// Move bits within this vector.
|
||||
void mov(unsigned dst, unsigned src, unsigned cnt);
|
||||
|
||||
// Add that to this in the Verilog way.
|
||||
void add(const vvp_vector4_t&that);
|
||||
|
||||
// Test that the vectors are exactly equal
|
||||
bool eeq(const vvp_vector4_t&that) const;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue