From d0b1936fb50dc80bd2a795d3f2c89962104565bd Mon Sep 17 00:00:00 2001
From: Stephen Williams
Date: Mon, 11 Jan 2010 11:42:25 -0800
Subject: [PATCH] Optimize handling of invert gates / Tweak vvp_vector8_t performance

The process of inverting and copying can be collapsed into a single
operation that should run a little faster, and the invert operation
itself vectorizes readily. I've also reduced useless vvp_fun_not
iterations: the functor now compares its new input with eq_xz, so
inputs that differ only in x-vs-z bits (whose inverts are identical)
no longer schedule an output event.

Also include some minor tweaks to the vvp_vector8_t handling of
vector copies. This is mostly code cleanup, although it should
slightly improve performance as well.
---
 vvp/logic.cc      |  10 +---
 vvp/vvp_island.cc |   7 ++-
 vvp/vvp_net.cc    | 118 +++++++++++++++++++++++++++++++++++-----------
 vvp/vvp_net.h     |  63 +++++++++++++++----------
 4 files changed, 139 insertions(+), 59 deletions(-)

diff --git a/vvp/logic.cc b/vvp/logic.cc
index 2ed65d138..fc5e433c0 100644
--- a/vvp/logic.cc
+++ b/vvp/logic.cc
@@ -399,7 +399,7 @@ void vvp_fun_not::recv_vec4(vvp_net_ptr_t ptr, const vvp_vector4_t&bit,
       if (ptr.port() != 0)
             return;
 
-      if (input_ .eeq( bit ))
+      if (input_ .eq_xz( bit ))
             return;
 
       input_ = bit;
@@ -414,13 +414,7 @@ void vvp_fun_not::run_run()
       vvp_net_t*ptr = net_;
       net_ = 0;
 
-      vvp_vector4_t result (input_);
-
-      for (unsigned idx = 0 ; idx < result.size() ; idx += 1) {
-            vvp_bit4_t bitbit = ~ result.value(idx);
-            result.set_bit(idx, bitbit);
-      }
-
+      vvp_vector4_t result (input_, true /* invert */);
       ptr->send_vec4(result, 0);
 }
 
diff --git a/vvp/vvp_island.cc b/vvp/vvp_island.cc
index 17573cde4..d1fe652b6 100644
--- a/vvp/vvp_island.cc
+++ b/vvp/vvp_island.cc
@@ -173,7 +173,12 @@ vvp_island_port::~vvp_island_port()
 void vvp_island_port::recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
                                 vvp_context_t)
 {
-      recv_vec8(port, vvp_vector8_t(bit, 6, 6));
+      vvp_vector8_t tmp (bit, 6, 6);
+      if (invalue .eeq(tmp))
+            return;
+
+      invalue = tmp;
+      island_->flag_island();
 }
 
 void vvp_island_port::recv_vec4_pv(vvp_net_ptr_t port, const vvp_vector4_t&bit,
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index b9d1a1198..afa2d2fcc 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -583,6 +583,41 @@ void vvp_vector4_t::copy_from_(const vvp_vector4_t&that)
       }
 }
 
+/*
+ * The copy_inverted_from_ method is just like the copy_from_ method,
+ * except that it combines the copy with an invert. This allows the ~
+ * and the assignment to be blended in many common cases.
+ */
+void vvp_vector4_t::copy_inverted_from_(const vvp_vector4_t&that)
+{
+      size_ = that.size_;
+      if (size_ > BITS_PER_WORD) {
+            unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;
+            abits_ptr_ = new unsigned long[2*words];
+            bbits_ptr_ = abits_ptr_ + words;
+
+            unsigned remaining = size_;
+            unsigned idx = 0;
+            while (remaining >= BITS_PER_WORD) {
+                  abits_ptr_[idx] = that.bbits_ptr_[idx] | ~that.abits_ptr_[idx];
+                  idx += 1;
+                  remaining -= BITS_PER_WORD;
+            }
+            if (remaining > 0) {
+                  unsigned long mask = (1UL << remaining) - 1UL;
+                  abits_ptr_[idx] = mask & (that.bbits_ptr_[idx] | ~that.abits_ptr_[idx]);
+            }
+
+            memcpy(bbits_ptr_, that.bbits_ptr_, words * sizeof(unsigned long));
+
+      } else {
+            abits_val_ = that.bbits_val_ | ~that.abits_val_;
+            bbits_val_ = that.bbits_val_;
+      }
+}
+
       if (size_ > BITS_PER_WORD) {
@@ -1238,6 +1273,40 @@ bool vvp_vector4_t::eeq(const vvp_vector4_t&that) const
       return true;
 }
 
+bool vvp_vector4_t::eq_xz(const vvp_vector4_t&that) const
+{
+      if (size_ != that.size_)
+            return false;
+
+      if (size_ < BITS_PER_WORD) {
+            unsigned long mask = (1UL << size_) - 1;
+            return ((abits_val_|bbits_val_)&mask) == ((that.abits_val_|that.bbits_val_)&mask)
+                   && (bbits_val_&mask) == (that.bbits_val_&mask);
+      }
+
+      if (size_ == BITS_PER_WORD) {
+            return ((abits_val_|bbits_val_) == (that.abits_val_|that.bbits_val_))
+                   && (bbits_val_ == that.bbits_val_);
+      }
+
+      unsigned words = size_ / BITS_PER_WORD;
+      for (unsigned idx = 0 ; idx < words ; idx += 1) {
+            if ((abits_ptr_[idx]|bbits_ptr_[idx]) != (that.abits_ptr_[idx]|that.bbits_ptr_[idx]))
+                  return false;
+            if (bbits_ptr_[idx] != that.bbits_ptr_[idx])
+                  return false;
+      }
+
+      unsigned long mask = size_%BITS_PER_WORD;
+      if (mask > 0) {
+            mask = (1UL << mask) - 1;
+            return ((abits_ptr_[words]|bbits_ptr_[words])&mask) == ((that.abits_ptr_[words]|that.bbits_ptr_[words])&mask)
+                   && (bbits_ptr_[words]&mask) == (that.bbits_ptr_[words]&mask);
+      }
+
+      return true;
+}
+
 bool vvp_vector4_t::has_xz() const
 {
       if (size_ < BITS_PER_WORD) {
@@ -2557,12 +2626,11 @@ ostream& operator<< (ostream&out, const vvp_vector2_t&that)
 
 vvp_vector8_t::vvp_vector8_t(const vvp_vector8_t&that)
 {
       size_ = that.size_;
-      if (size_ <= PTR_THRESH) {
-            memcpy(val_, that.val_, sizeof(val_));
+      if (size_ <= sizeof val_) {
+            ptr_ = that.ptr_; // assigning ptr_ copies all the bytes of the val_ union
       } else {
-            ptr_ = new vvp_scalar_t[size_];
-            for (unsigned idx = 0 ; idx < size_ ; idx += 1)
-                  ptr_[idx] = that.ptr_[idx];
+            ptr_ = new unsigned char[size_];
+            memcpy(ptr_, that.ptr_, size_);
       }
 }
 
@@ -2573,15 +2641,15 @@ vvp_vector8_t::vvp_vector8_t(const vvp_vector4_t&that,
       if (size_ == 0)
             return;
 
-      vvp_scalar_t*tmp;
-      if (size_ <= PTR_THRESH)
-            tmp = new (val_) vvp_scalar_t[PTR_THRESH];
-      else
-            tmp = ptr_ = new vvp_scalar_t[size_];
-
-      for (unsigned idx = 0 ; idx < size_ ; idx += 1)
-            tmp[idx] = vvp_scalar_t (that.value(idx), str0, str1);
-
+      if (size_ <= sizeof val_) {
+            ptr_ = 0; // zero the val_ padding so that eeq() can compare the whole union
+            for (unsigned idx = 0 ; idx < size_ ; idx += 1)
+                  val_[idx] = vvp_scalar_t(that.value(idx), str0, str1).raw();
+      } else {
+            ptr_ = new unsigned char[size_];
+            for (unsigned idx = 0 ; idx < size_ ; idx += 1)
+                  ptr_[idx] = vvp_scalar_t(that.value(idx), str0, str1).raw();
+      }
 }
 
 const vvp_vector8_t vvp_vector8_t::nil;
@@ -2589,12 +2657,11 @@ const vvp_vector8_t vvp_vector8_t::nil;
 
 vvp_vector8_t& vvp_vector8_t::operator= (const vvp_vector8_t&that)
 {
       // Assign to self.
-      if (this == &that || (size_ > PTR_THRESH && that.size_ > PTR_THRESH &&
-                            ptr_ == that.ptr_))
+      if (this == &that)
             return *this;
 
       if (size_ != that.size_) {
-            if (size_ > PTR_THRESH)
+            if (size_ > sizeof val_)
                   delete[]ptr_;
             size_ = 0;
       }
 
@@ -2604,7 +2671,7 @@ vvp_vector8_t& vvp_vector8_t::operator= (const vvp_vector8_t&that)
             return *this;
       }
 
-      if (that.size_ <= PTR_THRESH) {
+      if (that.size_ <= sizeof val_) {
             size_ = that.size_;
             memcpy(val_, that.val_, sizeof(val_));
             return *this;
@@ -2612,11 +2679,10 @@ vvp_vector8_t& vvp_vector8_t::operator= (const vvp_vector8_t&that)
 
       if (size_ == 0) {
             size_ = that.size_;
-            ptr_ = new vvp_scalar_t[size_];
+            ptr_ = new unsigned char[size_];
       }
 
-      for (unsigned idx = 0 ; idx < size_ ; idx += 1)
-            ptr_[idx] = that.ptr_[idx];
+      memcpy(ptr_, that.ptr_, size_);
 
       return *this;
 }
@@ -2625,12 +2691,12 @@ vvp_vector8_t vvp_vector8_t::subvalue(unsigned base, unsigned wid) const
 {
       vvp_vector8_t tmp (wid);
 
-      vvp_scalar_t* tmp_ptr = tmp.size_<=PTR_THRESH? reinterpret_cast<vvp_scalar_t*>(tmp.val_) : tmp.ptr_;
-      const vvp_scalar_t* ptr = size_<=PTR_THRESH? reinterpret_cast<const vvp_scalar_t*>(val_) : ptr_;
+      unsigned char*tmp_ptr = tmp.size_ <= sizeof val_? tmp.val_ : tmp.ptr_;
+      const unsigned char*use_ptr = size_ <= sizeof val_? val_ : ptr_;
 
       unsigned idx = 0;
       while ((idx < wid) && (base+idx < size_)) {
-            tmp_ptr[idx] = ptr[base+idx];
+            tmp_ptr[idx] = use_ptr[base+idx];
             idx += 1;
       }
 
@@ -2649,8 +2715,8 @@ vvp_vector8_t part_expand(const vvp_vector8_t&that, unsigned wid, unsigned off)
       assert(off < wid);
       vvp_vector8_t tmp (wid);
 
-      vvp_scalar_t* tmp_ptr = tmp.size_<=vvp_vector8_t::PTR_THRESH? reinterpret_cast<vvp_scalar_t*>(tmp.val_) : tmp.ptr_;
-      const vvp_scalar_t* that_ptr = that.size_<=vvp_vector8_t::PTR_THRESH? reinterpret_cast<const vvp_scalar_t*>(that.val_) : that.ptr_;
+      unsigned char* tmp_ptr = tmp.size_<= sizeof tmp.val_? tmp.val_ : tmp.ptr_;
+      const unsigned char* that_ptr = that.size_<= sizeof that.val_? that.val_ : that.ptr_;
 
       unsigned idx = off;
 
diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h
index b264a0a26..f12e10f81 100644
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@@ -211,6 +211,7 @@ class vvp_vector4_t {
                     unsigned adr, unsigned wid);
 
       vvp_vector4_t(const vvp_vector4_t&that);
+      vvp_vector4_t(const vvp_vector4_t&that, bool invert_flag);
 
       vvp_vector4_t& operator= (const vvp_vector4_t&that);
 
       ~vvp_vector4_t();
@@ -240,6 +241,9 @@ class vvp_vector4_t {
       // Test that the vectors are exactly equal
       bool eeq(const vvp_vector4_t&that) const;
 
+      // Test that the vectors are equal, treating x and z bits as equal.
+      bool eq_xz(const vvp_vector4_t&that) const;
+
       // Return true if there is an X or Z anywhere in the vector.
       bool has_xz() const;
 
@@ -283,6 +287,7 @@ class vvp_vector4_t {
       // Initialize and operator= use this private method to copy
       // the data from that object into this object.
       void copy_from_(const vvp_vector4_t&that);
+      void copy_inverted_from_(const vvp_vector4_t&that);
 
       void allocate_words_(unsigned size, unsigned long inita, unsigned long initb);
 
@@ -313,6 +318,14 @@ inline vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that)
       copy_from_(that);
 }
 
+inline vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that, bool invert_flag)
+{
+      if (invert_flag)
+            copy_inverted_from_(that);
+      else
+            copy_from_(that);
+}
+
 inline vvp_vector4_t::vvp_vector4_t(unsigned size__, vvp_bit4_t val)
 : size_(size__)
 {
@@ -442,8 +455,7 @@ inline void vvp_vector4_t::set_bit(unsigned idx, vvp_bit4_t val)
 
 inline vvp_vector4_t operator ~ (const vvp_vector4_t&that)
 {
-      vvp_vector4_t res = that;
-      res.invert();
+      vvp_vector4_t res (that, true);
       return res;
 }
 
@@ -700,6 +712,14 @@ class vvp_scalar_t {
       bool eeq(vvp_scalar_t that) const { return value_ == that.value_; }
       bool is_hiz() const { return value_ == 0; }
 
+    private:
+      // This class and the vvp_vector8_t class are closely related,
+      // so give vvp_vector8_t access to the raw encoding so that it
+      // can pack vvp_scalar_t values into compact vectors.
+      friend class vvp_vector8_t;
+      explicit vvp_scalar_t(unsigned char raw) : value_(raw) { }
+      unsigned char raw() const { return value_; }
+
     private:
       unsigned char value_;
 };
@@ -813,11 +833,10 @@ class vvp_vector8_t {
-      // This is the number of vvp_scalar_t objects we can keep in
-      // the val_ buffer. If the vector8 is bigger then this, then
-      // resort to allocations to get a larger buffer.
-      enum { PTR_THRESH = 8 };
+      // Vectors up to sizeof(void*) bytes are stored directly in the
+      // val_ buffer; bigger vector8 objects resort to an allocated
+      // buffer instead.
       unsigned size_;
       union {
-            vvp_scalar_t*ptr_;
-            char val_[PTR_THRESH * sizeof(vvp_scalar_t)];
+            unsigned char*ptr_;
+            unsigned char val_[sizeof(void*)];
       };
 };
 
@@ -853,35 +872,36 @@ extern ostream& operator<< (ostream&, const vvp_vector8_t&);
 
 inline vvp_vector8_t::vvp_vector8_t(unsigned size__)
 : size_(size__)
 {
-      if (size_ <= PTR_THRESH) {
-            new (val_) vvp_scalar_t[PTR_THRESH];
+      if (size_ <= sizeof val_) {
+            ptr_ = 0; // also zeros every byte of the val_ union
       } else {
-            ptr_ = new vvp_scalar_t[size_];
+            ptr_ = new unsigned char[size_];
+            memset(ptr_, 0, size_);
       }
 }
 
 inline vvp_vector8_t::~vvp_vector8_t()
 {
-      if (size_ > PTR_THRESH)
+      if (size_ > sizeof val_)
             delete[]ptr_;
 }
 
 inline vvp_scalar_t vvp_vector8_t::value(unsigned idx) const
 {
       assert(idx < size_);
-      if (size_ <= PTR_THRESH)
-            return reinterpret_cast<const vvp_scalar_t*>(val_) [idx];
+      if (size_ <= sizeof val_)
+            return vvp_scalar_t(val_[idx]);
       else
-            return ptr_[idx];
+            return vvp_scalar_t(ptr_[idx]);
 }
 
 inline void vvp_vector8_t::set_bit(unsigned idx, vvp_scalar_t val)
 {
       assert(idx < size_);
-      if (size_ <= PTR_THRESH)
-            reinterpret_cast<vvp_scalar_t*>(val_) [idx] = val;
+      if (size_ <= sizeof val_)
+            val_[idx] = val.raw();
       else
-            ptr_[idx] = val;
+            ptr_[idx] = val.raw();
 }
 
 // Exactly-equal for vvp_vector8_t is common and should be as tight
 // as possible.
@@ -893,15 +913,10 @@ inline bool vvp_vector8_t::eeq(const vvp_vector8_t&that) const
       if (size_ != that.size_)
             return false;
       if (size_ == 0)
             return true;
 
-      if (size_ <= PTR_THRESH)
-            return 0 == memcmp(val_, that.val_, sizeof(val_));
-
-      for (unsigned idx = 0 ; idx < size_ ; idx += 1) {
-            if (! ptr_[idx] .eeq( that.ptr_[idx] ))
-                  return false;
-      }
-
-      return true;
+      if (size_ <= sizeof val_)
+            return ptr_ == that.ptr_; // compares all the val_ bytes at once
+      else
+            return memcmp(ptr_, that.ptr_, size_) == 0;
 }
 
 /*
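
Postscript: the word-at-a-time invert in copy_inverted_from_ leans on
the (a,b) bit-pair encoding that vvp_vector4_t uses for its four-state
values. Assuming the encoding 0=(a=0,b=0), 1=(a=1,b=0), x=(a=1,b=1),
z=(a=0,b=1), the blended invert is just a' = b | ~a with b' = b, which
maps 0 to 1, 1 to 0, and both x and z to x; it is also why eq_xz is the
right skip test for the not functor. Below is a minimal standalone
sketch that checks the identity under that assumed encoding. This is
illustrative test code, not part of the patch:

#include <cassert>
#include <cstdio>

int main()
{
      // One word holding the four values 0, 1, x, z in bit positions
      // 0..3 (lsb first): a bits are 0,1,1,0 and b bits are 0,0,1,1.
      unsigned long abits = 0x6;
      unsigned long bbits = 0xc;

      // The blended invert used by copy_inverted_from_.
      unsigned long inv_a = bbits | ~abits;
      unsigned long inv_b = bbits;

      // Expected: ~0=1 -> (1,0), ~1=0 -> (0,0), ~x=x -> (1,1),
      // ~z=x -> (1,1). Only four bit positions are in use.
      unsigned long mask = 0xf;
      assert((inv_a & mask) == 0xd); // a bits: 1,0,1,1
      assert((inv_b & mask) == 0xc); // b bits: 0,0,1,1

      printf("invert identity holds for 0, 1, x and z\n");
      return 0;
}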