Optimize %mul instructions by integrating with vvp_vector4_t class.
This commit is contained in:
parent
46ce236cfb
commit
0f740289e9
122
vvp/vthread.cc
122
vvp/vthread.cc
|
|
@ -390,66 +390,6 @@ template <class T> T coerce_to_width(const T&that, unsigned width)
|
|||
template vvp_vector4_t coerce_to_width(const vvp_vector4_t&that,
|
||||
unsigned width);
|
||||
|
||||
/*
 * Helper for wide (multi-word) arithmetic on arrays of unsigned long.
 * Returns the low word of a + b + carry, and overwrites carry with 1
 * if any step of that addition wrapped around, otherwise with 0.
 */
static inline unsigned long add_with_carry(unsigned long a, unsigned long b,
                                           unsigned long&carry)
{
      const unsigned long addend = b + carry;
      const unsigned long total = a + addend;

        // Unsigned arithmetic wraps, so a result that is smaller than
        // one of its operands means that step overflowed.
      const bool wrapped = (addend < b) || (total < addend) || (total < a);

      carry = wrapped ? 1 : 0;
      return total;
}
|
||||
|
||||
static unsigned long multiply_with_carry(unsigned long a, unsigned long b,
|
||||
unsigned long&carry)
|
||||
{
|
||||
const unsigned long mask = (1UL << (CPU_WORD_BITS/2)) - 1;
|
||||
unsigned long a0 = a & mask;
|
||||
unsigned long a1 = (a >> (CPU_WORD_BITS/2)) & mask;
|
||||
unsigned long b0 = b & mask;
|
||||
unsigned long b1 = (b >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
unsigned long tmp = a0 * b0;
|
||||
|
||||
unsigned long r00 = tmp & mask;
|
||||
unsigned long c00 = (tmp >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
tmp = a0 * b1;
|
||||
|
||||
unsigned long r01 = tmp & mask;
|
||||
unsigned long c01 = (tmp >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
tmp = a1 * b0;
|
||||
|
||||
unsigned long r10 = tmp & mask;
|
||||
unsigned long c10 = (tmp >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
tmp = a1 * b1;
|
||||
|
||||
unsigned long r11 = tmp & mask;
|
||||
unsigned long c11 = (tmp >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
unsigned long r1 = c00 + r01 + r10;
|
||||
unsigned long r2 = (r1 >> (CPU_WORD_BITS/2)) & mask;
|
||||
r1 &= mask;
|
||||
r2 += c01 + c10 + r11;
|
||||
unsigned long r3 = (r2 >> (CPU_WORD_BITS/2)) & mask;
|
||||
r2 &= mask;
|
||||
r3 += c11;
|
||||
r3 &= mask;
|
||||
|
||||
carry = (r3 << (CPU_WORD_BITS/2)) + r2;
|
||||
return (r1 << (CPU_WORD_BITS/2)) + r00;
|
||||
}
|
||||
|
||||
static void multiply_array_imm(unsigned long*res, unsigned long*val,
|
||||
unsigned words, unsigned long imm)
|
||||
|
|
@ -4021,62 +3961,6 @@ bool of_MOV_WU(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
 * Multiply vala by valb, leaving the product (truncated to the operand
 * width) in vala. Both operands must have the same size. Always
 * returns true.
 *
 * If subarray() yields a null pointer for either operand, the result
 * is instead set to a vector of vala's width filled with BIT4_X.
 */
static bool do_MUL(vvp_vector4_t&vala, const vvp_vector4_t&valb)
{
      assert(vala.size() == valb.size());
      const unsigned wid = vala.size();

        // Only ask for the second operand if the first one was usable.
      unsigned long*lhs = vala.subarray(0, wid);
      unsigned long*rhs = lhs ? valb.subarray(0, wid) : 0;

      if (lhs == 0 || rhs == 0) {
              // rhs can only be null here, so lhs is the only buffer
              // that may need to be released.
            delete[]lhs;
            vvp_vector4_t all_x(wid, BIT4_X);
            vala = all_x;
            return true;
      }

      if (wid <= CPU_WORD_BITS) {
              // The value fits in a single CPU word: multiply directly.
            lhs[0] *= rhs[0];
            vala.setarray(0, wid, lhs);

      } else {
            const unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;

              // Zero-filled accumulator for the truncated product.
            unsigned long*prod = new unsigned long[words]();

              // Schoolbook multiply. Word pairs whose product would
              // land entirely above the result width are skipped.
            for (unsigned ia = 0 ; ia < words ; ia += 1) {
                  const unsigned limit = words - ia;
                  for (unsigned ib = 0 ; ib < limit ; ib += 1) {
                        unsigned long hi;
                        const unsigned long lo =
                              multiply_with_carry(lhs[ia], rhs[ib], hi);

                        unsigned pos = ia + ib;
                        unsigned long carry = 0;
                        prod[pos] = add_with_carry(prod[pos], lo, carry);

                          // Add the high word once, then keep rippling
                          // the carry through the remaining words.
                        for (pos += 1 ; pos < words ; pos += 1) {
                              prod[pos] = add_with_carry(prod[pos], hi, carry);
                              hi = 0;
                        }
                  }
            }

            vala.setarray(0, wid, prod);
            delete[]prod;
      }

      delete[]lhs;
      delete[]rhs;
      return true;
}
|
||||
|
||||
/*
|
||||
* %mul
|
||||
*/
|
||||
|
|
@ -4088,7 +3972,8 @@ bool of_MUL(vthread_t thr, vvp_code_t)
|
|||
// replaces a pop and a pull.
|
||||
vvp_vector4_t&l = thr->peek_vec4();
|
||||
|
||||
return do_MUL(l, r);
|
||||
l.mul(r);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -4109,7 +3994,8 @@ bool of_MULI(vthread_t thr, vvp_code_t cp)
|
|||
vvp_vector4_t r (wid, BIT4_0);
|
||||
get_immediate_rval (cp, r);
|
||||
|
||||
return do_MUL(l, r);
|
||||
l.mul(r);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool of_MUL_WR(vthread_t thr, vvp_code_t)
|
||||
|
|
|
|||
159
vvp/vvp_net.cc
159
vvp/vvp_net.cc
|
|
@ -41,6 +41,11 @@
|
|||
# include "ivl_alloc.h"
|
||||
#endif
|
||||
|
||||
/* This is the size of an unsigned long in bits. This is just a
|
||||
convenience macro. */
|
||||
# define CPU_WORD_BITS (8*sizeof(unsigned long))
|
||||
# define TOP_BIT (1UL << (CPU_WORD_BITS-1))
|
||||
|
||||
permaheap vvp_net_fun_t::heap_;
|
||||
permaheap vvp_net_fil_t::heap_;
|
||||
|
||||
|
|
@ -510,25 +515,49 @@ int edge(vvp_bit4_t from, vvp_bit4_t to)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Some of the instructions do wide addition to arrays of long. They
|
||||
* use this add_with_carry function to help.
|
||||
*/
|
||||
static inline unsigned long add_with_carry(unsigned long a, unsigned long b,
|
||||
unsigned long&carry)
|
||||
unsigned long multiply_with_carry(unsigned long a, unsigned long b,
|
||||
unsigned long&carry)
|
||||
{
|
||||
unsigned long tmp = b + carry;
|
||||
unsigned long sum = a + tmp;
|
||||
carry = 0;
|
||||
if (tmp < b)
|
||||
carry = 1;
|
||||
if (sum < tmp)
|
||||
carry = 1;
|
||||
if (sum < a)
|
||||
carry = 1;
|
||||
return sum;
|
||||
const unsigned long mask = (1UL << (CPU_WORD_BITS/2)) - 1;
|
||||
unsigned long a0 = a & mask;
|
||||
unsigned long a1 = (a >> (CPU_WORD_BITS/2)) & mask;
|
||||
unsigned long b0 = b & mask;
|
||||
unsigned long b1 = (b >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
unsigned long tmp = a0 * b0;
|
||||
|
||||
unsigned long r00 = tmp & mask;
|
||||
unsigned long c00 = (tmp >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
tmp = a0 * b1;
|
||||
|
||||
unsigned long r01 = tmp & mask;
|
||||
unsigned long c01 = (tmp >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
tmp = a1 * b0;
|
||||
|
||||
unsigned long r10 = tmp & mask;
|
||||
unsigned long c10 = (tmp >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
tmp = a1 * b1;
|
||||
|
||||
unsigned long r11 = tmp & mask;
|
||||
unsigned long c11 = (tmp >> (CPU_WORD_BITS/2)) & mask;
|
||||
|
||||
unsigned long r1 = c00 + r01 + r10;
|
||||
unsigned long r2 = (r1 >> (CPU_WORD_BITS/2)) & mask;
|
||||
r1 &= mask;
|
||||
r2 += c01 + c10 + r11;
|
||||
unsigned long r3 = (r2 >> (CPU_WORD_BITS/2)) & mask;
|
||||
r2 &= mask;
|
||||
r3 += c11;
|
||||
r3 &= mask;
|
||||
|
||||
carry = (r3 << (CPU_WORD_BITS/2)) + r2;
|
||||
return (r1 << (CPU_WORD_BITS/2)) + r00;
|
||||
}
|
||||
|
||||
|
||||
void vvp_send_vec8(vvp_net_ptr_t ptr, const vvp_vector8_t&val)
|
||||
{
|
||||
while (vvp_net_t*cur = ptr.ptr()) {
|
||||
|
|
@ -1524,6 +1553,104 @@ void vvp_vector4_t::mov(unsigned dst, unsigned src, unsigned cnt)
|
|||
}
|
||||
}
|
||||
|
||||
void vvp_vector4_t::mul(const vvp_vector4_t&that)
|
||||
{
|
||||
assert(size_ == that.size_);
|
||||
|
||||
if (size_ < BITS_PER_WORD) {
|
||||
unsigned long mask = ~(-1UL << size_);
|
||||
if ((bbits_val_|that.bbits_val_) & mask) {
|
||||
abits_val_ |= mask;
|
||||
bbits_val_ |= mask;
|
||||
return;
|
||||
}
|
||||
|
||||
abits_val_ *= that.abits_val_;
|
||||
abits_val_ &= mask;
|
||||
return;
|
||||
}
|
||||
|
||||
if (size_ == BITS_PER_WORD) {
|
||||
if (bbits_val_ || that.bbits_val_) {
|
||||
abits_val_ = WORD_X_ABITS;
|
||||
bbits_val_ = WORD_X_BBITS;
|
||||
} else {
|
||||
abits_val_ *= that.abits_val_;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const int cnt = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;
|
||||
|
||||
unsigned long mask;
|
||||
if (unsigned tail = size_%BITS_PER_WORD) {
|
||||
mask = ~( -1UL << tail );
|
||||
} else {
|
||||
mask = ~0UL;
|
||||
}
|
||||
|
||||
// Check for any XZ values ahead of time in a first pass. If
|
||||
// we find any, then force the entire result to be X and be
|
||||
// done.
|
||||
for (int idx = 0 ; idx < cnt ; idx += 1) {
|
||||
unsigned long lval = bbits_ptr_[idx];
|
||||
unsigned long rval = that.bbits_ptr_[idx];
|
||||
if (idx == (cnt-1)) {
|
||||
lval &= mask;
|
||||
rval &= mask;
|
||||
}
|
||||
if (lval || rval) {
|
||||
for (int xdx = 0 ; xdx < cnt-1 ; xdx += 1) {
|
||||
abits_ptr_[xdx] = WORD_X_ABITS;
|
||||
bbits_ptr_[xdx] = WORD_X_BBITS;
|
||||
}
|
||||
abits_ptr_[cnt-1] = WORD_X_ABITS & mask;
|
||||
bbits_ptr_[cnt-1] = WORD_X_BBITS & mask;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate the result into a res array. We need to keep is
|
||||
// separate from the "this" array because we are making
|
||||
// multiple passes.
|
||||
unsigned long*res = new unsigned long[cnt];
|
||||
for (int idx = 0 ; idx < cnt ; idx += 1)
|
||||
res[idx] = 0;
|
||||
|
||||
for (int mul_a = 0 ; mul_a < cnt ; mul_a += 1) {
|
||||
unsigned long lval = abits_ptr_[mul_a];
|
||||
if (mul_a == (cnt-1))
|
||||
lval &= mask;
|
||||
|
||||
for (int mul_b = 0 ; mul_b < (cnt-mul_a) ; mul_b += 1) {
|
||||
unsigned long rval = that.abits_ptr_[mul_b];
|
||||
if (mul_b == (cnt-1))
|
||||
rval &= mask;
|
||||
|
||||
unsigned long sum;
|
||||
unsigned long tmp = multiply_with_carry(lval, rval, sum);
|
||||
int base = mul_a + mul_b;
|
||||
unsigned long carry = 0;
|
||||
res[base] = add_with_carry(res[base], tmp, carry);
|
||||
for (int add_idx = base+1 ; add_idx < cnt ; add_idx += 1) {
|
||||
res[add_idx] = add_with_carry(res[add_idx], sum, carry);
|
||||
sum = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Replace the "this" value with the calculated result. We
|
||||
// know a-priori that the bbits are zero and unchanged.
|
||||
res[cnt-1] &= mask;
|
||||
for (int idx = 0 ; idx < cnt ; idx += 1)
|
||||
abits_ptr_[idx] = res[idx];
|
||||
|
||||
delete[]res;
|
||||
return;
|
||||
|
||||
|
||||
}
|
||||
|
||||
bool vvp_vector4_t::eeq(const vvp_vector4_t&that) const
|
||||
{
|
||||
if (size_ != that.size_)
|
||||
|
|
|
|||
|
|
@ -209,6 +209,28 @@ inline void update_driver_counts(vvp_bit4_t bit, unsigned counts[3])
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Helper for wide (multi-word) arithmetic on arrays of unsigned long.
 * Returns the low word of a + b + carry, and overwrites carry with 1
 * if any step of that addition wrapped around, otherwise with 0.
 */
static inline unsigned long add_with_carry(unsigned long a, unsigned long b,
                                           unsigned long&carry)
{
      const unsigned long addend = b + carry;
      const unsigned long total = a + addend;

        // Unsigned arithmetic wraps, so a result that is smaller than
        // one of its operands means that step overflowed.
      const bool wrapped = (addend < b) || (total < addend) || (total < a);

      carry = wrapped ? 1 : 0;
      return total;
}
|
||||
|
||||
extern unsigned long multiply_with_carry(unsigned long a, unsigned long b,
|
||||
unsigned long&carry);
|
||||
|
||||
/*
|
||||
* This class represents scalar values collected into vectors. The
|
||||
* vector values can be accessed individually, or treated as a
|
||||
|
|
@ -274,6 +296,9 @@ class vvp_vector4_t {
|
|||
// Add that to this in the Verilog way.
|
||||
void add(const vvp_vector4_t&that);
|
||||
|
||||
// Multiply this by that in the Verilog way.
|
||||
void mul(const vvp_vector4_t&that);
|
||||
|
||||
// Test that the vectors are exactly equal
|
||||
bool eeq(const vvp_vector4_t&that) const;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue