From d0b1936fb50dc80bd2a795d3f2c89962104565bd Mon Sep 17 00:00:00 2001
From: Stephen Williams
Date: Mon, 11 Jan 2010 11:42:25 -0800
Subject: [PATCH] Optimize handling of invert gates / Tweak vvp_vector8_t performance

The process of inverting and copying can be collapsed into a single
operation that should run a little faster, and the invert operation
itself vectorizes readily. I've also reduced useless vvp_fun_not
iterations: the functor now compares its new input with eq_xz, so
inputs that differ only in x-vs-z bits (whose inverts are identical)
no longer schedule an output event.

Also include some minor tweaks to the vvp_vector8_t handling of
vector copies. This is mostly code cleanup, although it should
slightly improve performance as well.
---
 vvp/logic.cc      |  10 +---
 vvp/vvp_island.cc |   7 ++-
 vvp/vvp_net.cc    | 118 +++++++++++++++++++++++++++++++++++-----------
 vvp/vvp_net.h     |  63 +++++++++++++++----------
 4 files changed, 139 insertions(+), 59 deletions(-)

diff --git a/vvp/logic.cc b/vvp/logic.cc
index 2ed65d138..fc5e433c0 100644
--- a/vvp/logic.cc
+++ b/vvp/logic.cc
@@ -399,7 +399,7 @@ void vvp_fun_not::recv_vec4(vvp_net_ptr_t ptr, const vvp_vector4_t&bit,
       if (ptr.port() != 0)
             return;
 
-      if (input_ .eeq( bit ))
+      if (input_ .eq_xz( bit ))
             return;
 
       input_ = bit;
@@ -414,13 +414,7 @@ void vvp_fun_not::run_run()
       vvp_net_t*ptr = net_;
       net_ = 0;
 
-      vvp_vector4_t result (input_);
-
-      for (unsigned idx = 0 ; idx < result.size() ; idx += 1) {
-            vvp_bit4_t bitbit = ~ result.value(idx);
-            result.set_bit(idx, bitbit);
-      }
-
+      vvp_vector4_t result (input_, true /* invert */);
       ptr->send_vec4(result, 0);
 }
 
diff --git a/vvp/vvp_island.cc b/vvp/vvp_island.cc
index 17573cde4..d1fe652b6 100644
--- a/vvp/vvp_island.cc
+++ b/vvp/vvp_island.cc
@@ -173,7 +173,12 @@ vvp_island_port::~vvp_island_port()
 void vvp_island_port::recv_vec4(vvp_net_ptr_t port, const vvp_vector4_t&bit,
                                 vvp_context_t)
 {
-      recv_vec8(port, vvp_vector8_t(bit, 6, 6));
+      vvp_vector8_t tmp (bit, 6, 6);
+      if (invalue .eeq(tmp))
+            return;
+
+      invalue = tmp;
+      island_->flag_island();
 }
 
 void vvp_island_port::recv_vec4_pv(vvp_net_ptr_t port, const vvp_vector4_t&bit,
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index b9d1a1198..afa2d2fcc 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -583,6 +583,41 @@ void vvp_vector4_t::copy_from_(const vvp_vector4_t&that)
       }
 }
 
+/*
+ * The copy_inverted_from_ method is just like the copy_from_ method,
+ * except that it combines the copy with an invert. This allows the ~
+ * and the assignment to be blended in many common cases.
+ */
+void vvp_vector4_t::copy_inverted_from_(const vvp_vector4_t&that)
+{
+      size_ = that.size_;
+      if (size_ > BITS_PER_WORD) {
+            unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;
+            abits_ptr_ = new unsigned long[2*words];
+            bbits_ptr_ = abits_ptr_ + words;
+
+            unsigned remaining = size_;
+            unsigned idx = 0;
+            while (remaining >= BITS_PER_WORD) {
+                  abits_ptr_[idx] = that.bbits_ptr_[idx] | ~that.abits_ptr_[idx];
+                  idx += 1;
+                  remaining -= BITS_PER_WORD;
+            }
+            if (remaining > 0) {
+                  unsigned long mask = (1UL << remaining) - 1UL;
+                  abits_ptr_[idx] = mask & (that.bbits_ptr_[idx] | ~that.abits_ptr_[idx]);
+            }
+
+            memcpy(bbits_ptr_, that.bbits_ptr_, words * sizeof(unsigned long));
+
+      } else {
+            abits_val_ = that.bbits_val_ | ~that.abits_val_;
+            bbits_val_ = that.bbits_val_;
+      }
+}
+
       if (size_ > BITS_PER_WORD) {
@@ -1238,6 +1273,40 @@ bool vvp_vector4_t::eeq(const vvp_vector4_t&that) const
       return true;
 }
 
+bool vvp_vector4_t::eq_xz(const vvp_vector4_t&that) const
+{
+      if (size_ != that.size_)
+            return false;
+
+      if (size_ < BITS_PER_WORD) {
+            unsigned long mask = (1UL << size_) - 1;
+            return ((abits_val_|bbits_val_)&mask) == ((that.abits_val_|that.bbits_val_)&mask)
+                   && (bbits_val_&mask) == (that.bbits_val_&mask);
+      }
+
+      if (size_ == BITS_PER_WORD) {
+            return ((abits_val_|bbits_val_) == (that.abits_val_|that.bbits_val_))
+                   && (bbits_val_ == that.bbits_val_);
+      }
+
+      unsigned words = size_ / BITS_PER_WORD;
+      for (unsigned idx = 0 ; idx < words ; idx += 1) {
+            if ((abits_ptr_[idx]|bbits_ptr_[idx]) != (that.abits_ptr_[idx]|that.bbits_ptr_[idx]))
+                  return false;
+            if (bbits_ptr_[idx] != that.bbits_ptr_[idx])
+                  return false;
+      }
+
+      unsigned long mask = size_%BITS_PER_WORD;
+      if (mask > 0) {
+            mask = (1UL << mask) - 1;
+            return ((abits_ptr_[words]|bbits_ptr_[words])&mask) == ((that.abits_ptr_[words]|that.bbits_ptr_[words])&mask)
+                   && (bbits_ptr_[words]&mask) == (that.bbits_ptr_[words]&mask);
+      }
+
+      return true;
+}
+
 bool vvp_vector4_t::has_xz() const
 {
       if (size_ < BITS_PER_WORD) {
@@ -2557,12 +2626,11 @@ ostream& operator<< (ostream&out, const vvp_vector2_t&that)
 
 vvp_vector8_t::vvp_vector8_t(const vvp_vector8_t&that)
 {
       size_ = that.size_;
-      if (size_ <= PTR_THRESH) {
-            memcpy(val_, that.val_, sizeof(val_));
+      if (size_ <= sizeof val_) {
+            ptr_ = that.ptr_; // assigning ptr_ copies all the bytes of the val_ union
       } else {
-            ptr_ = new vvp_scalar_t[size_];
-            for (unsigned idx = 0 ; idx < size_ ; idx += 1)
-                  ptr_[idx] = that.ptr_[idx];
+            ptr_ = new unsigned char[size_];
+            memcpy(ptr_, that.ptr_, size_);
       }
 }
 
@@ -2573,15 +2641,15 @@ vvp_vector8_t::vvp_vector8_t(const vvp_vector4_t&that,
       if (size_ == 0)
             return;
 
-      vvp_scalar_t*tmp;
-      if (size_ <= PTR_THRESH)
-            tmp = new (val_) vvp_scalar_t[PTR_THRESH];
-      else
-            tmp = ptr_ = new vvp_scalar_t[size_];
-
-      for (unsigned idx = 0 ; idx < size_ ; idx += 1)
-            tmp[idx] = vvp_scalar_t (that.value(idx), str0, str1);
-
+      if (size_ <= sizeof val_) {
+            ptr_ = 0; // zero the val_ padding so that eeq() can compare the whole union
+            for (unsigned idx = 0 ; idx < size_ ; idx += 1)
+                  val_[idx] = vvp_scalar_t(that.value(idx), str0, str1).raw();
+      } else {
+            ptr_ = new unsigned char[size_];
+            for (unsigned idx = 0 ; idx < size_ ; idx += 1)
+                  ptr_[idx] = vvp_scalar_t(that.value(idx), str0, str1).raw();
+      }
 }
 
 const vvp_vector8_t vvp_vector8_t::nil;
@@ -2589,12 +2657,11 @@ const vvp_vector8_t vvp_vector8_t::nil;
 
 vvp_vector8_t& vvp_vector8_t::operator= (const vvp_vector8_t&that)
 {
       // Assign to self.
-      if (this == &that || (size_ > PTR_THRESH && that.size_ > PTR_THRESH &&
-                            ptr_ == that.ptr_))
+      if (this == &that)
             return *this;
 
       if (size_ != that.size_) {
-            if (size_ > PTR_THRESH)
+            if (size_ > sizeof val_)
                   delete[]ptr_;
             size_ = 0;
       }
 
@@ -2604,7 +2671,7 @@ vvp_vector8_t& vvp_vector8_t::operator= (const vvp_vector8_t&that)
             return *this;
       }
 
-      if (that.size_ <= PTR_THRESH) {
+      if (that.size_ <= sizeof val_) {
             size_ = that.size_;
             memcpy(val_, that.val_, sizeof(val_));
             return *this;
@@ -2612,11 +2679,10 @@ vvp_vector8_t& vvp_vector8_t::operator= (const vvp_vector8_t&that)
 
       if (size_ == 0) {
             size_ = that.size_;
-            ptr_ = new vvp_scalar_t[size_];
+            ptr_ = new unsigned char[size_];
       }
 
-      for (unsigned idx = 0 ; idx < size_ ; idx += 1)
-            ptr_[idx] = that.ptr_[idx];
+      memcpy(ptr_, that.ptr_, size_);
 
       return *this;
 }
@@ -2625,12 +2691,12 @@ vvp_vector8_t vvp_vector8_t::subvalue(unsigned base, unsigned wid) const
 {
       vvp_vector8_t tmp (wid);
 
-      vvp_scalar_t* tmp_ptr = tmp.size_<=PTR_THRESH? reinterpret_cast<vvp_scalar_t*>(tmp.val_) : tmp.ptr_;
-      const vvp_scalar_t* ptr = size_<=PTR_THRESH? reinterpret_cast<const vvp_scalar_t*>(val_) : ptr_;
+      unsigned char*tmp_ptr = tmp.size_ <= sizeof val_? tmp.val_ : tmp.ptr_;
+      const unsigned char*use_ptr = size_ <= sizeof val_? val_ : ptr_;
 
       unsigned idx = 0;
       while ((idx < wid) && (base+idx < size_)) {
-            tmp_ptr[idx] = ptr[base+idx];
+            tmp_ptr[idx] = use_ptr[base+idx];
             idx += 1;
       }
 
@@ -2649,8 +2715,8 @@ vvp_vector8_t part_expand(const vvp_vector8_t&that, unsigned wid, unsigned off)
       assert(off < wid);
       vvp_vector8_t tmp (wid);
 
-      vvp_scalar_t* tmp_ptr = tmp.size_<=vvp_vector8_t::PTR_THRESH? reinterpret_cast<vvp_scalar_t*>(tmp.val_) : tmp.ptr_;
-      const vvp_scalar_t* that_ptr = that.size_<=vvp_vector8_t::PTR_THRESH? reinterpret_cast<const vvp_scalar_t*>(that.val_) : that.ptr_;
+      unsigned char* tmp_ptr = tmp.size_<= sizeof tmp.val_? tmp.val_ : tmp.ptr_;
+      const unsigned char* that_ptr = that.size_<= sizeof that.val_? that.val_ : that.ptr_;
 
       unsigned idx = off;
 
diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h
index b264a0a26..f12e10f81 100644
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@@ -211,6 +211,7 @@ class vvp_vector4_t {
                     unsigned adr, unsigned wid);
 
       vvp_vector4_t(const vvp_vector4_t&that);
+      vvp_vector4_t(const vvp_vector4_t&that, bool invert_flag);
 
       vvp_vector4_t& operator= (const vvp_vector4_t&that);
 
       ~vvp_vector4_t();
@@ -240,6 +241,9 @@ class vvp_vector4_t {
       // Test that the vectors are exactly equal
       bool eeq(const vvp_vector4_t&that) const;
 
+      // Test that the vectors are equal, treating x and z bits as equal.
+      bool eq_xz(const vvp_vector4_t&that) const;
+
       // Return true if there is an X or Z anywhere in the vector.
       bool has_xz() const;
 
@@ -283,6 +287,7 @@ class vvp_vector4_t {
       // Initialize and operator= use this private method to copy
       // the data from that object into this object.
       void copy_from_(const vvp_vector4_t&that);
+      void copy_inverted_from_(const vvp_vector4_t&that);
 
       void allocate_words_(unsigned size, unsigned long inita, unsigned long initb);
 
@@ -313,6 +318,14 @@ inline vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that)
       copy_from_(that);
 }
 
+inline vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that, bool invert_flag)
+{
+      if (invert_flag)
+            copy_inverted_from_(that);
+      else
+            copy_from_(that);
+}
+
 inline vvp_vector4_t::vvp_vector4_t(unsigned size__, vvp_bit4_t val)
 : size_(size__)
 {
@@ -442,8 +455,7 @@ inline void vvp_vector4_t::set_bit(unsigned idx, vvp_bit4_t val)
 
 inline vvp_vector4_t operator ~ (const vvp_vector4_t&that)
 {
-      vvp_vector4_t res = that;
-      res.invert();
+      vvp_vector4_t res (that, true);
       return res;
 }
 
@@ -700,6 +712,14 @@ class vvp_scalar_t {
       bool eeq(vvp_scalar_t that) const { return value_ == that.value_; }
       bool is_hiz() const { return value_ == 0; }
 
+    private:
+      // This class and the vvp_vector8_t class are closely related,
+      // so give vvp_vector8_t access to the raw encoding so that it
+      // can pack vvp_scalar_t values into compact vectors.
+      friend class vvp_vector8_t;
+      explicit vvp_scalar_t(unsigned char raw) : value_(raw) { }
+      unsigned char raw() const { return value_; }
+
     private:
       unsigned char value_;
 };
@@ -813,11 +833,10 @@ class vvp_vector8_t {
-      // This is the number of vvp_scalar_t objects we can keep in
-      // the val_ buffer. If the vector8 is bigger then this, then
-      // resort to allocations to get a larger buffer.
-      enum { PTR_THRESH = 8 };
+      // Vectors up to sizeof(void*) bytes are stored directly in the
+      // val_ buffer; bigger vector8 objects resort to an allocated
+      // buffer instead.
       unsigned size_;
       union {
-            vvp_scalar_t*ptr_;
-            char val_[PTR_THRESH * sizeof(vvp_scalar_t)];
+            unsigned char*ptr_;
+            unsigned char val_[sizeof(void*)];
       };
 };
 
@@ -853,35 +872,36 @@ extern ostream& operator<< (ostream&, const vvp_vector8_t&);
 
 inline vvp_vector8_t::vvp_vector8_t(unsigned size__)
 : size_(size__)
 {
-      if (size_ <= PTR_THRESH) {
-            new (val_) vvp_scalar_t[PTR_THRESH];
+      if (size_ <= sizeof val_) {
+            ptr_ = 0; // also zeros every byte of the val_ union
       } else {
-            ptr_ = new vvp_scalar_t[size_];
+            ptr_ = new unsigned char[size_];
+            memset(ptr_, 0, size_);
       }
 }
 
 inline vvp_vector8_t::~vvp_vector8_t()
 {
-      if (size_ > PTR_THRESH)
+      if (size_ > sizeof val_)
             delete[]ptr_;
 }
 
 inline vvp_scalar_t vvp_vector8_t::value(unsigned idx) const
 {
       assert(idx < size_);
-      if (size_ <= PTR_THRESH)
-            return reinterpret_cast<const vvp_scalar_t*>(val_) [idx];
+      if (size_ <= sizeof val_)
+            return vvp_scalar_t(val_[idx]);
       else
-            return ptr_[idx];
+            return vvp_scalar_t(ptr_[idx]);
 }
 
 inline void vvp_vector8_t::set_bit(unsigned idx, vvp_scalar_t val)
 {
       assert(idx < size_);
-      if (size_ <= PTR_THRESH)
-            reinterpret_cast<vvp_scalar_t*>(val_) [idx] = val;
+      if (size_ <= sizeof val_)
+            val_[idx] = val.raw();
       else
-            ptr_[idx] = val;
+            ptr_[idx] = val.raw();
 }
 
 // Exactly-equal for vvp_vector8_t is common and should be as tight
 // as possible.
@@ -893,15 +913,10 @@ inline bool vvp_vector8_t::eeq(const vvp_vector8_t&that) const
       if (size_ != that.size_)
             return false;
       if (size_ == 0)
             return true;
 
-      if (size_ <= PTR_THRESH)
-            return 0 == memcmp(val_, that.val_, sizeof(val_));
-
-      for (unsigned idx = 0 ; idx < size_ ; idx += 1) {
-            if (! ptr_[idx] .eeq( that.ptr_[idx] ))
-                  return false;
-      }
-
-      return true;
+      if (size_ <= sizeof val_)
+            return ptr_ == that.ptr_; // compares all the val_ bytes at once
+      else
+            return memcmp(ptr_, that.ptr_, size_) == 0;
 }
 
 /*
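
Postscript: the word-at-a-time invert in copy_inverted_from_ leans on
the (a,b) bit-pair encoding that vvp_vector4_t uses for its four-state
values. Assuming the encoding 0=(a=0,b=0), 1=(a=1,b=0), x=(a=1,b=1),
z=(a=0,b=1), the blended invert is just a' = b | ~a with b' = b, which
maps 0 to 1, 1 to 0, and both x and z to x; it is also why eq_xz is the
right skip test for the not functor. Below is a minimal standalone
sketch that checks the identity under that assumed encoding. This is
illustrative test code, not part of the patch:

#include <cassert>
#include <cstdio>

int main()
{
      // One word holding the four values 0, 1, x, z in bit positions
      // 0..3 (lsb first): a bits are 0,1,1,0 and b bits are 0,0,1,1.
      unsigned long abits = 0x6;
      unsigned long bbits = 0xc;

      // The blended invert used by copy_inverted_from_.
      unsigned long inv_a = bbits | ~abits;
      unsigned long inv_b = bbits;

      // Expected: ~0=1 -> (1,0), ~1=0 -> (0,0), ~x=x -> (1,1),
      // ~z=x -> (1,1). Only four bit positions are in use.
      unsigned long mask = 0xf;
      assert((inv_a & mask) == 0xd); // a bits: 1,0,1,1
      assert((inv_b & mask) == 0xc); // b bits: 0,0,1,1

      printf("invert identity holds for 0, 1, x and z\n");
      return 0;
}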