From 0601b4e43bd9e9328e0bae7da4e691e9d12dce36 Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Fri, 14 Nov 2014 14:41:04 -0800 Subject: [PATCH] Optimizations for %pushi/vec4 and %load/vec4 - Have %pushi/vec4 handle some special cases optimally. - Eliminate some duplicated method calls in %load/vec4. - Optimize the vvp_vector4_t::copy_from_ method by inlining some parts. --- vvp/vthread.cc | 114 ++++++++++++++++++++++++++----------------------- vvp/vvp_net.cc | 27 ++++++------ vvp/vvp_net.h | 13 +++++- 3 files changed, 84 insertions(+), 70 deletions(-) diff --git a/vvp/vthread.cc b/vvp/vthread.cc index bb7356807..21bb765ac 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -3345,34 +3345,6 @@ static void load_base(vvp_code_t cp, vvp_vector4_t&dst) sig->vec4_value(dst); } -#if 0 -bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp) -{ - unsigned bit = cp->bit_idx[0]; - unsigned wid = cp->bit_idx[1]; - - vvp_vector4_t sig_value; - load_base(cp, sig_value); - - /* Check the address once, before we scan the vector. */ - thr_check_addr(thr, bit+wid-1); - - if (sig_value.size() > wid) - sig_value.resize(wid); - - /* Copy the vector bits into the bits4 vector. Do the copy - directly to skip the excess calls to thr_check_addr. */ - thr->bits4.set_vec(bit, sig_value); - - /* If the source is shorter than the desired width, then pad - with BIT4_X values. */ - for (unsigned idx = sig_value.size() ; idx < wid ; idx += 1) - thr->bits4.set_bit(bit+idx, BIT4_X); - - return true; -} -#endif - /* * %load/vec4 */ @@ -4423,35 +4395,57 @@ bool of_PUSHI_VEC4(vthread_t thr, vvp_code_t cp) uint32_t valb = cp->bit_idx[1]; unsigned wid = cp->number; + // I expect that most of the bits of an immediate value are + // going to be zero, so start the result vector with all zero + // bits. Then we only need to replace the bits that are different. vvp_vector4_t val (wid, BIT4_0); + + // Special case: Immediate zero is super easy. + if (vala==0 && valb==0) { + thr->push_vec4(val); + return true; + } + + // Special case: If the value is defined (no X or Z) and fits + // in an unsigned long, then use the setarray method to write + // the value all in one shot. + if ((valb==0) && (wid <= 8*sizeof(unsigned long))) { + unsigned long tmp = vala; + val.setarray(0, wid, &tmp); + thr->push_vec4(val); + return true; + } + + // The %pushi/vec4 can create values bigger then 32 bits, but + // only if the high bits are zero. So at most we need to run + // through the loop below 32 times. Maybe less, if the target + // width is less. We don't have to do anything special on that + // because vala/valb bits will shift away so (vala|valb) will + // turn to zero at or before 32 shifts. + for (unsigned idx = 0 ; idx < wid && (vala|valb) ; idx += 1) { uint32_t ba = 0; - // If the requested width is /32, then there are no - // actual immediate bits, but we can pad with zero. So - // here we test if we are still working on he LSB, and - // process them if so. - if (idx < 32) { - ba = (valb & 1) << 1; - ba |= vala & 1; - } - vala >>= 1; - valb >>= 1; - if (ba == 0) continue; - vvp_bit4_t use_bit = BIT4_0; + // Convert the vala/valb bits to a ba number that can be + // used to select what goes into the value. + ba = (valb & 1) << 1; + ba |= vala & 1; + switch (ba) { case 1: - use_bit = BIT4_1; + val.set_bit(idx, BIT4_1); break; case 2: - use_bit = BIT4_Z; + val.set_bit(idx, BIT4_Z); break; case 3: - use_bit = BIT4_X; + val.set_bit(idx, BIT4_X); break; default: break; } - val.set_bit(idx, use_bit); + + vala >>= 1; + valb >>= 1; } thr->push_vec4(val); @@ -5281,6 +5275,17 @@ bool of_STORE_STRA(vthread_t thr, vvp_code_t cp) /* * %store/vec4 , , * + * is the index register that contains the base offset into + * the destination. If zero, the offset of 0 is used instead of index + * register zero. The offset value is SIGNED, and can be negative. + * + * is the actual width, an unsigned number. + * + * This function tests flag bit 4. If that flag is set, and + * is an actual index register (not zero) then this assumes that the + * calculation of the contents failed, and the store is + * aborted. + * * NOTE: This instruction may loose the argument because it is * not consistent with the %store/vec4/ instructions which have * no . @@ -5290,18 +5295,19 @@ bool of_STORE_VEC4(vthread_t thr, vvp_code_t cp) vvp_net_ptr_t ptr(cp->net, 0); vvp_signal_value*sig = dynamic_cast (cp->net->fil); unsigned off_index = cp->bit_idx[0]; - unsigned wid = cp->bit_idx[1]; + int wid = cp->bit_idx[1]; int off = off_index? thr->words[off_index].w_int : 0; + const int sig_value_size = sig->value_size(); vvp_vector4_t val = thr->pop_vec4(); - if (val.size() < wid) { + if (val.size() < (unsigned)wid) { cerr << "XXXX Internal error: val.size()=" << val.size() << ", expecting >= " << wid << endl; } - assert(val.size() >= wid); - if (val.size() > wid) + assert(val.size() >= (unsigned)wid); + if (val.size() > (unsigned)wid) val.resize(wid); // If there is a problem loading the index register, flags-4 @@ -5309,9 +5315,9 @@ bool of_STORE_VEC4(vthread_t thr, vvp_code_t cp) if (off_index!=0 && thr->flags[4] == BIT4_1) return true; - if (off <= -(int)wid) + if (off <= -wid) return true; - if (off >= (int)sig->value_size()) + if (off >= sig_value_size) return true; // If the index is below the vector, then only assign the high @@ -5325,17 +5331,17 @@ bool of_STORE_VEC4(vthread_t thr, vvp_code_t cp) // If the value is partly above the target, then only assign // the bits that overlap. - if ((off+wid) > sig->value_size()) { - wid = sig->value_size()-off; + if ((off+wid) > sig_value_size) { + wid = sig_value_size - off; val = val.subvalue(0, wid); val.resize(wid); } - if (off==0 && val.size()==sig->value_size()) + if (off==0 && val.size()==(unsigned)sig_value_size) vvp_send_vec4(ptr, val, thr->wt_context); else - vvp_send_vec4_pv(ptr, val, off, wid, sig->value_size(), thr->wt_context); + vvp_send_vec4_pv(ptr, val, off, wid, sig_value_size, thr->wt_context); return true; } diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc index afa44efbe..5ae18616d 100644 --- a/vvp/vvp_net.cc +++ b/vvp/vvp_net.cc @@ -643,23 +643,20 @@ void vvp_vector4_t::copy_bits(const vvp_vector4_t&that) } } -void vvp_vector4_t::copy_from_(const vvp_vector4_t&that) +/* + * This function should ONLY BE CALLED FROM vvp_vector4_t::copy_from_, + * as it performs part of that functions tasks. + */ +void vvp_vector4_t::copy_from_big_(const vvp_vector4_t&that) { - size_ = that.size_; - if (size_ > BITS_PER_WORD) { - unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD; - abits_ptr_ = new unsigned long[2*words]; - bbits_ptr_ = abits_ptr_ + words; + unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD; + abits_ptr_ = new unsigned long[2*words]; + bbits_ptr_ = abits_ptr_ + words; - for (unsigned idx = 0 ; idx < words ; idx += 1) - abits_ptr_[idx] = that.abits_ptr_[idx]; - for (unsigned idx = 0 ; idx < words ; idx += 1) - bbits_ptr_[idx] = that.bbits_ptr_[idx]; - - } else { - abits_val_ = that.abits_val_; - bbits_val_ = that.bbits_val_; - } + for (unsigned idx = 0 ; idx < words ; idx += 1) + abits_ptr_[idx] = that.abits_ptr_[idx]; + for (unsigned idx = 0 ; idx < words ; idx += 1) + bbits_ptr_[idx] = that.bbits_ptr_[idx]; } /* diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h index e2a37d26a..b204dcd40 100644 --- a/vvp/vvp_net.h +++ b/vvp/vvp_net.h @@ -244,7 +244,7 @@ class vvp_vector4_t { ~vvp_vector4_t(); - unsigned size() const { return size_; } + inline unsigned size() const { return size_; } void resize(unsigned new_size); // Get the bit at the specified address @@ -317,6 +317,7 @@ class vvp_vector4_t { // Initialize and operator= use this private method to copy // the data from that object into this object. void copy_from_(const vvp_vector4_t&that); + void copy_from_big_(const vvp_vector4_t&that); void copy_inverted_from_(const vvp_vector4_t&that); void allocate_words_(unsigned long inita, unsigned long initb); @@ -396,6 +397,16 @@ inline vvp_vector4_t& vvp_vector4_t::operator= (const vvp_vector4_t&that) return *this; } +inline void vvp_vector4_t::copy_from_(const vvp_vector4_t&that) +{ + size_ = that.size_; + if (size_ <= BITS_PER_WORD) { + abits_val_ = that.abits_val_; + bbits_val_ = that.bbits_val_; + } else { + copy_from_big_(that); + } +} inline vvp_bit4_t vvp_vector4_t::value(unsigned idx) const {