Fix for GitHub issue 9 part 1: Efficiency of vvp_vector2_t::pow() function.
The vvp_vector2_t::pow() function is recursive, and performs a multiplication operation on each step. The multiplication operator was expanding the result vector to accommodate the maximum possible result value for the given operand vectors, thus causing the execution time of the power operation to grow exponentially with the exponent value. Both in this case and in general, it is unnecessary for the multiplication result vector to be expanded, as the compiler has already determined the required vector width during elaboration, and sizes the operand vectors to match.
This commit is contained in:
parent
68f8de28af
commit
5853f7d867
|
|
@ -4557,19 +4557,8 @@ bool of_POW(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
}
|
||||
|
||||
/* To make the result more manageable trim off the extra bits. */
|
||||
xv2.trim();
|
||||
yv2.trim();
|
||||
|
||||
vvp_vector2_t result = pow(xv2, yv2);
|
||||
|
||||
/* If the result is too small zero pad it. */
|
||||
if (result.size() < wid) {
|
||||
for (unsigned jdx = wid-1; jdx >= result.size(); jdx -= 1)
|
||||
thr_put_bit(thr, cp->bit_idx[0]+jdx, BIT4_0);
|
||||
wid = result.size();
|
||||
}
|
||||
|
||||
/* Copy only what we need of the result. */
|
||||
for (unsigned jdx = 0; jdx < wid; jdx += 1)
|
||||
thr_put_bit(thr, cp->bit_idx[0]+jdx,
|
||||
|
|
|
|||
|
|
@ -2362,23 +2362,21 @@ bool vvp_vector2_t::is_zero() const
|
|||
*/
|
||||
vvp_vector2_t pow(const vvp_vector2_t&x, vvp_vector2_t&y)
|
||||
{
|
||||
/* If we have a zero exponent just return a 1 bit wide 1. */
|
||||
/* If we have a zero exponent just return 1. */
|
||||
if (y == vvp_vector2_t(0L, 1)) {
|
||||
return vvp_vector2_t(1L, 1);
|
||||
return vvp_vector2_t(1L, x.size());
|
||||
}
|
||||
|
||||
/* Is the value odd? */
|
||||
if (y.value(0) == 1) {
|
||||
y.set_bit(0, 0); // A quick subtract by 1.
|
||||
vvp_vector2_t res = x * pow(x, y);
|
||||
res.trim(); // To keep the size under control trim extra zeros.
|
||||
return res;
|
||||
}
|
||||
|
||||
y >>= 1; // A fast divide by two. We know the LSB is zero.
|
||||
vvp_vector2_t z = pow(x, y);
|
||||
vvp_vector2_t res = z * z;
|
||||
res.trim(); // To keep the size under control trim extra zeros.
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
@ -2422,25 +2420,22 @@ static void multiply_long(unsigned long a, unsigned long b,
|
|||
low = (res[1] << 4UL*sizeof(unsigned long)) | res[0];
|
||||
}
|
||||
|
||||
/*
|
||||
* Multiplication of two vector2 vectors returns a product as wide as
|
||||
* the sum of the widths of the input vectors.
|
||||
*/
|
||||
vvp_vector2_t operator * (const vvp_vector2_t&a, const vvp_vector2_t&b)
|
||||
{
|
||||
const unsigned bits_per_word = 8 * sizeof(a.vec_[0]);
|
||||
vvp_vector2_t r (0, a.size() + b.size());
|
||||
|
||||
unsigned awords = (a.wid_ + bits_per_word - 1) / bits_per_word;
|
||||
unsigned bwords = (b.wid_ + bits_per_word - 1) / bits_per_word;
|
||||
unsigned rwords = (r.wid_ + bits_per_word - 1) / bits_per_word;
|
||||
// The compiler ensures that the two operands are of equal size.
|
||||
assert(a.size() == b.size());
|
||||
vvp_vector2_t r (0, a.size());
|
||||
|
||||
for (unsigned bdx = 0 ; bdx < bwords ; bdx += 1) {
|
||||
unsigned words = (r.wid_ + bits_per_word - 1) / bits_per_word;
|
||||
|
||||
for (unsigned bdx = 0 ; bdx < words ; bdx += 1) {
|
||||
unsigned long tmpb = b.vec_[bdx];
|
||||
if (tmpb == 0)
|
||||
continue;
|
||||
|
||||
for (unsigned adx = 0 ; adx < awords ; adx += 1) {
|
||||
for (unsigned adx = 0 ; adx < words ; adx += 1) {
|
||||
unsigned long tmpa = a.vec_[adx];
|
||||
if (tmpa == 0)
|
||||
continue;
|
||||
|
|
@ -2450,7 +2445,7 @@ vvp_vector2_t operator * (const vvp_vector2_t&a, const vvp_vector2_t&b)
|
|||
|
||||
unsigned long carry = 0;
|
||||
for (unsigned sdx = 0
|
||||
; (adx+bdx+sdx) < rwords
|
||||
; (adx+bdx+sdx) < words
|
||||
; sdx += 1) {
|
||||
|
||||
r.vec_[adx+bdx+sdx] = add_carry(r.vec_[adx+bdx+sdx],
|
||||
|
|
@ -2461,7 +2456,6 @@ vvp_vector2_t operator * (const vvp_vector2_t&a, const vvp_vector2_t&b)
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue