From 0f740289e98bdf0be6eccbe9f30daee63fd491ee Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Thu, 4 Dec 2014 16:00:57 -0800 Subject: [PATCH] Optimize %mul instructions by integrating with vvp_vector4_t class. --- vvp/vthread.cc | 122 ++----------------------------------- vvp/vvp_net.cc | 159 ++++++++++++++++++++++++++++++++++++++++++++----- vvp/vvp_net.h | 25 ++++++++ 3 files changed, 172 insertions(+), 134 deletions(-) diff --git a/vvp/vthread.cc b/vvp/vthread.cc index 70f514b69..1fb496ebe 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -390,66 +390,6 @@ template T coerce_to_width(const T&that, unsigned width) template vvp_vector4_t coerce_to_width(const vvp_vector4_t&that, unsigned width); -/* - * Some of the instructions do wide addition to arrays of long. They - * use this add_with_carry function to help. - */ -static inline unsigned long add_with_carry(unsigned long a, unsigned long b, - unsigned long&carry) -{ - unsigned long tmp = b + carry; - unsigned long sum = a + tmp; - carry = 0; - if (tmp < b) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < a) - carry = 1; - return sum; -} - -static unsigned long multiply_with_carry(unsigned long a, unsigned long b, - unsigned long&carry) -{ - const unsigned long mask = (1UL << (CPU_WORD_BITS/2)) - 1; - unsigned long a0 = a & mask; - unsigned long a1 = (a >> (CPU_WORD_BITS/2)) & mask; - unsigned long b0 = b & mask; - unsigned long b1 = (b >> (CPU_WORD_BITS/2)) & mask; - - unsigned long tmp = a0 * b0; - - unsigned long r00 = tmp & mask; - unsigned long c00 = (tmp >> (CPU_WORD_BITS/2)) & mask; - - tmp = a0 * b1; - - unsigned long r01 = tmp & mask; - unsigned long c01 = (tmp >> (CPU_WORD_BITS/2)) & mask; - - tmp = a1 * b0; - - unsigned long r10 = tmp & mask; - unsigned long c10 = (tmp >> (CPU_WORD_BITS/2)) & mask; - - tmp = a1 * b1; - - unsigned long r11 = tmp & mask; - unsigned long c11 = (tmp >> (CPU_WORD_BITS/2)) & mask; - - unsigned long r1 = c00 + r01 + r10; - unsigned long r2 = (r1 >> 
(CPU_WORD_BITS/2)) & mask; - r1 &= mask; - r2 += c01 + c10 + r11; - unsigned long r3 = (r2 >> (CPU_WORD_BITS/2)) & mask; - r2 &= mask; - r3 += c11; - r3 &= mask; - - carry = (r3 << (CPU_WORD_BITS/2)) + r2; - return (r1 << (CPU_WORD_BITS/2)) + r00; -} static void multiply_array_imm(unsigned long*res, unsigned long*val, unsigned words, unsigned long imm) @@ -4021,62 +3961,6 @@ bool of_MOV_WU(vthread_t thr, vvp_code_t cp) return true; } -static bool do_MUL(vvp_vector4_t&vala, const vvp_vector4_t&valb) -{ - assert(vala.size() == valb.size()); - unsigned wid = vala.size(); - - unsigned long*ap = vala.subarray(0, wid); - if (ap == 0) { - vvp_vector4_t tmp(wid, BIT4_X); - vala = tmp; - return true; - } - - unsigned long*bp = valb.subarray(0, wid); - if (bp == 0) { - delete[]ap; - vvp_vector4_t tmp(wid, BIT4_X); - vala = tmp; - return true; - } - - // If the value fits in a single CPU word, then do it the easy way. - if (wid <= CPU_WORD_BITS) { - ap[0] *= bp[0]; - vala.setarray(0, wid, ap); - delete[]ap; - delete[]bp; - return true; - } - - unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; - unsigned long*res = new unsigned long[words]; - for (unsigned idx = 0 ; idx < words ; idx += 1) - res[idx] = 0; - - for (unsigned mul_a = 0 ; mul_a < words ; mul_a += 1) { - for (unsigned mul_b = 0 ; mul_b < (words-mul_a) ; mul_b += 1) { - unsigned long sum; - unsigned long tmp = multiply_with_carry(ap[mul_a], bp[mul_b], sum); - unsigned base = mul_a + mul_b; - unsigned long carry = 0; - res[base] = add_with_carry(res[base], tmp, carry); - for (unsigned add_idx = base+1; add_idx < words; add_idx += 1) { - res[add_idx] = add_with_carry(res[add_idx], sum, carry); - sum = 0; - } - } - } - - vala.setarray(0, wid, res); - delete[]ap; - delete[]bp; - delete[]res; - - return true; -} - /* * %mul */ @@ -4088,7 +3972,8 @@ bool of_MUL(vthread_t thr, vvp_code_t) // replaces a pop and a pull. 
/* This is the size of an unsigned long in bits. This is just a
   convenience macro. */
# define CPU_WORD_BITS (8*sizeof(unsigned long))
# define TOP_BIT (1UL << (CPU_WORD_BITS-1))

/*
 * Multiply the words a and b to form a double-width product. The
 * low word of the product is returned, and the high word is left in
 * "carry". This is portable long multiplication: each operand is
 * split into half-word digits so that no partial product can
 * overflow an unsigned long.
 */
unsigned long multiply_with_carry(unsigned long a, unsigned long b,
                                  unsigned long&carry)
{
      const unsigned long half = CPU_WORD_BITS/2;
      const unsigned long mask = (1UL << half) - 1;

        // Half-word digits of the two operands.
      unsigned long a_lo = a & mask;
      unsigned long a_hi = (a >> half) & mask;
      unsigned long b_lo = b & mask;
      unsigned long b_hi = (b >> half) & mask;

        // The four partial products, each itself split into its
        // low and high half-words.
      unsigned long p;

      p = a_lo * b_lo;
      unsigned long ll_lo = p & mask;
      unsigned long ll_hi = (p >> half) & mask;

      p = a_lo * b_hi;
      unsigned long lh_lo = p & mask;
      unsigned long lh_hi = (p >> half) & mask;

      p = a_hi * b_lo;
      unsigned long hl_lo = p & mask;
      unsigned long hl_hi = (p >> half) & mask;

      p = a_hi * b_hi;
      unsigned long hh_lo = p & mask;
      unsigned long hh_hi = (p >> half) & mask;

        // Sum the partial products into the four half-word digits
        // of the double-width result, rippling carries upwards.
      unsigned long d1 = ll_hi + lh_lo + hl_lo;
      unsigned long d2 = (d1 >> half) & mask;
      d1 &= mask;
      d2 += lh_hi + hl_hi + hh_lo;
      unsigned long d3 = (d2 >> half) & mask;
      d2 &= mask;
      d3 += hh_hi;
      d3 &= mask;

      carry = (d3 << half) + d2;
      return (d1 << half) + ll_lo;
}
/*
 * Multiply this vector by that vector, Verilog style. The two
 * operands must have the same width, and the product is truncated
 * to that width. If either operand contains any X/Z bits (non-zero
 * bbits) anywhere in the significant range, the entire result is
 * forced to X.
 */
void vvp_vector4_t::mul(const vvp_vector4_t&that)
{
      assert(size_ == that.size_);

        // Narrow case: the vector is smaller than one CPU word and
        // is stored directly in abits_val_/bbits_val_.
      if (size_ < BITS_PER_WORD) {
            unsigned long mask = ~(-1UL << size_);
              // Any X/Z in either operand makes the result all X.
              // NOTE(review): this assumes X is encoded with both
              // the a-bit and b-bit set, consistent with the
              // WORD_X_ABITS/WORD_X_BBITS use below -- confirm.
            if ((bbits_val_|that.bbits_val_) & mask) {
                  abits_val_ |= mask;
                  bbits_val_ |= mask;
                  return;
            }

              // Fully defined bits, so a plain unsigned multiply
              // truncated to the vector width does the job.
            abits_val_ *= that.abits_val_;
            abits_val_ &= mask;
            return;
      }

        // Exactly one CPU word: same idea, but no mask is needed.
      if (size_ == BITS_PER_WORD) {
            if (bbits_val_ || that.bbits_val_) {
                  abits_val_ = WORD_X_ABITS;
                  bbits_val_ = WORD_X_BBITS;
            } else {
                  abits_val_ *= that.abits_val_;
            }
            return;
      }

        // Wide case: the value lives in the abits_ptr_/bbits_ptr_
        // word arrays, cnt words long. Only the low "tail" bits of
        // the last word are significant, selected by "mask".
      const int cnt = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;

      unsigned long mask;
      if (unsigned tail = size_%BITS_PER_WORD) {
            mask = ~( -1UL << tail );
      } else {
            mask = ~0UL;
      }

        // Check for any XZ values ahead of time in a first pass. If
        // we find any, then force the entire result to be X and be
        // done.
      for (int idx = 0 ; idx < cnt ; idx += 1) {
            unsigned long lval = bbits_ptr_[idx];
            unsigned long rval = that.bbits_ptr_[idx];
            if (idx == (cnt-1)) {
                  lval &= mask;
                  rval &= mask;
            }
            if (lval || rval) {
                  for (int xdx = 0 ; xdx < cnt-1 ; xdx += 1) {
                        abits_ptr_[xdx] = WORD_X_ABITS;
                        bbits_ptr_[xdx] = WORD_X_BBITS;
                  }
                  abits_ptr_[cnt-1] = WORD_X_ABITS & mask;
                  bbits_ptr_[cnt-1] = WORD_X_BBITS & mask;
                  return;
            }
      }

        // Calculate the result into a res array. We need to keep it
        // separate from the "this" array because we are making
        // multiple passes.
      unsigned long*res = new unsigned long[cnt];
      for (int idx = 0 ; idx < cnt ; idx += 1)
            res[idx] = 0;

        // Classic long multiplication over the word arrays. A
        // partial product whose word position (mul_a+mul_b) is at
        // or beyond cnt would fall entirely outside the truncated
        // result, so the inner loop only runs to cnt-mul_a.
      for (int mul_a = 0 ; mul_a < cnt ; mul_a += 1) {
            unsigned long lval = abits_ptr_[mul_a];
            if (mul_a == (cnt-1))
                  lval &= mask;

            for (int mul_b = 0 ; mul_b < (cnt-mul_a) ; mul_b += 1) {
                  unsigned long rval = that.abits_ptr_[mul_b];
                  if (mul_b == (cnt-1))
                        rval &= mask;

                    // tmp is the low word of lval*rval and sum the
                    // high word. Add both into the accumulator,
                    // rippling the carry up through the remaining
                    // result words.
                  unsigned long sum;
                  unsigned long tmp = multiply_with_carry(lval, rval, sum);
                  int base = mul_a + mul_b;
                  unsigned long carry = 0;
                  res[base] = add_with_carry(res[base], tmp, carry);
                  for (int add_idx = base+1 ; add_idx < cnt ; add_idx += 1) {
                        res[add_idx] = add_with_carry(res[add_idx], sum, carry);
                        sum = 0;
                  }
            }
      }

        // Replace the "this" value with the calculated result. We
        // know a-priori that the bbits are zero and unchanged,
        // because the XZ pre-pass above already returned otherwise.
      res[cnt-1] &= mask;
      for (int idx = 0 ; idx < cnt ; idx += 1)
            abits_ptr_[idx] = res[idx];

      delete[]res;
      return;
}
      void add(const vvp_vector4_t&that);

      // Multiply this by that in the Verilog way. The operands
      // must have equal width; the product is truncated to that
      // width, and any X/Z bit in either operand yields an all-X
      // result (see vvp_vector4_t::mul in vvp_net.cc).
      void mul(const vvp_vector4_t&that);

      // Test that the vectors are exactly equal
      bool eeq(const vvp_vector4_t&that) const;