Vectorize AND/OR/NAND/NOR/INV instructions when reasonable.

When these operations are applied to wide vectors, it pays to process the whole vector at once instead of one bit at a time. This improves run-time performance. Have the run time select the vectorized or the bit-at-a-time implementation based on the vector width.
2008-05-23 17:52:43 -07:00 · 2008-05-23 17:52:43 -07:00 · 9af459f95b
parent 492b240304
commit 9af459f95b
3 changed files with 248 additions and 88 deletions
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@ -410,20 +410,29 @@ bool of_ABS_WR(vthread_t thr, vvp_code_t cp)
      return true;
 }
-bool of_AND(vthread_t thr, vvp_code_t cp)
+static bool of_AND_wide(vthread_t thr, vvp_code_t cp)
 {
 	// Whole-vector AND. Reads wid thread bits starting at idx1 and
 	// at idx2 as vectors, combines them with vvp_vector4_t's
 	// operator &=, and stores the result back at idx1.
 	// Always returns true.
 	// The destination index must be at least 4.
      assert(cp->bit_idx[0] >= 4);
      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];
      unsigned wid = cp->number;
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
+      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
      val &= vthread_bits_to_vector(thr, idx2, wid);
      thr->bits4.set_vec(idx1, val);
      return true;
 }
 static bool of_AND_narrow(vthread_t thr, vvp_code_t cp)
 {
 	// Bit-at-a-time AND. For each of wid bits, reads one bit at
 	// idx1 and one at idx2 and stores lb&rb back at idx1.
 	// Always returns true.
      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];
      unsigned wid = cp->number;
      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
 	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
 	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
 	      // NOTE(review): the two stores below write the same
 	      // value; this looks like the old and new side of the
 	      // diff shown together -- confirm only one is live.
+	    thr_put_bit(thr, idx1, lb&rb);
 	    thr_put_bit(thr, idx1, lb & rb);
 	    idx1 += 1;
 	      // A source index below 4 is not advanced, so the same
 	      // source bit is reused for every destination bit
 	      // (presumably bits 0-3 are constant bits -- confirm).
 	    if (idx2 >= 4)
 		  idx2 += 1;
@ -432,6 +441,18 @@ bool of_AND(vthread_t thr, vvp_code_t cp)
      return true;
 }
 bool of_AND(vthread_t thr, vvp_code_t cp)
 {
 	// Width-based selector for AND. Vectors wider than 4 bits
 	// use the whole-vector implementation, anything narrower
 	// uses the bit-at-a-time one. The choice is stored in
 	// cp->opcode and then invoked through it.
      assert(cp->bit_idx[0] >= 4);
      cp->opcode = (cp->number > 4) ? &of_AND_wide : &of_AND_narrow;
      return cp->opcode(thr, cp);
 }
 bool of_ADD(vthread_t thr, vvp_code_t cp)
 {
@ -2007,25 +2028,41 @@ bool of_FORK(vthread_t thr, vvp_code_t cp)
      return true;
 }
 static bool of_INV_wide(vthread_t thr, vvp_code_t cp)
 {
 	// Whole-vector invert. bit_idx[0] is the base thread bit and
 	// bit_idx[1] is the width; the bits are read as one vector,
 	// inverted with operator ~, and written back in place.
      const unsigned base = cp->bit_idx[0];
      const unsigned width = cp->bit_idx[1];
      vvp_vector4_t bits = vthread_bits_to_vector(thr, base, width);
      thr->bits4.set_vec(base, ~bits);
      return true;
 }
 static bool of_INV_narrow(vthread_t thr, vvp_code_t cp)
 {
 	// Bit-at-a-time invert. bit_idx[0] is the base thread bit
 	// and bit_idx[1] is the number of bits to invert in place.
      const unsigned base = cp->bit_idx[0];
      const unsigned width = cp->bit_idx[1];
      for (unsigned offset = 0 ; offset < width ; offset += 1) {
 	    const unsigned pos = base + offset;
 	    vvp_bit4_t cur = thr_get_bit(thr, pos);
 	    thr_put_bit(thr, pos, ~cur);
      }
      return true;
 }
 bool of_INV(vthread_t thr, vvp_code_t cp)
 {
 	// Width-based selector for invert. The chosen implementation
 	// is stored in cp->opcode and then invoked through it.
      assert(cp->bit_idx[0] >= 4);
 	// The width of an invert lives in bit_idx[1]: that is what
 	// of_INV_narrow, of_INV_wide and the removed loop below all
 	// read. Testing cp->number here would select on a field this
 	// instruction does not use for its width.
-      for (unsigned idx = 0 ;  idx < cp->bit_idx[1] ;  idx += 1) {
+
-	    vvp_bit4_t val = thr_get_bit(thr, cp->bit_idx[0]+idx);
+      if (cp->bit_idx[1] <= 4)
-	    switch (val) {
+	    cp->opcode = &of_INV_narrow;
-		case BIT4_0:
+      else
-		  val = BIT4_1;
+	    cp->opcode = &of_INV_wide;
-		  break;
+
-		case BIT4_1:
+      return cp->opcode(thr, cp);
 		  val = BIT4_0;
 		  break;
 		default:
 		  val = BIT4_X;
 		  break;
 	    }
 	    thr_put_bit(thr, cp->bit_idx[0]+idx, val);
      }
      return true;
 }
@ -3091,28 +3128,29 @@ bool of_MULI(vthread_t thr, vvp_code_t cp)
      return true;
 }
-bool of_NAND(vthread_t thr, vvp_code_t cp)
+static bool of_NAND_wide(vthread_t thr, vvp_code_t cp)
 {
 	// Whole-vector NAND. ANDs the wid-bit vectors at idx1 and
 	// idx2 with operator &=, then stores the inverse (~val)
 	// back at idx1. Always returns true.
 	// The destination index must be at least 4.
      assert(cp->bit_idx[0] >= 4);
      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];
      unsigned wid = cp->number;
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
+      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
      val &= vthread_bits_to_vector(thr, idx2, wid);
      thr->bits4.set_vec(idx1, ~val);
      return true;
 }
 static bool of_NAND_narrow(vthread_t thr, vvp_code_t cp)
 {
 	// Bit-at-a-time NAND. For each of wid bits, reads one bit at
 	// idx1 and one at idx2 and stores ~(lb&rb) back at idx1.
 	// Always returns true.
      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];
      unsigned wid = cp->number;
      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
 	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
 	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
+	    thr_put_bit(thr, idx1, ~(lb&rb));
 	      // NOTE(review): this if/else chain spells out the NAND
 	      // truth table by hand and stores to the same bit as the
 	      // line above; it looks like the pre-change code shown
 	      // alongside its replacement -- confirm it is not live.
 	    if ((lb == BIT4_0) || (rb == BIT4_0)) {
 		  thr_put_bit(thr, idx1, BIT4_1);
 	    } else if ((lb == BIT4_1) && (rb == BIT4_1)) {
 		  thr_put_bit(thr, idx1, BIT4_0);
 	    } else {
 		  thr_put_bit(thr, idx1, BIT4_X);
 	    }
 	    idx1 += 1;
 	      // A source index below 4 is not advanced, so the same
 	      // source bit is reused for every destination bit
 	      // (presumably bits 0-3 are constant bits -- confirm).
 	    if (idx2 >= 4)
 		  idx2 += 1;
@ -3121,6 +3159,18 @@ bool of_NAND(vthread_t thr, vvp_code_t cp)
      return true;
 }
 bool of_NAND(vthread_t thr, vvp_code_t cp)
 {
 	// Width-based selector for NAND. Vectors wider than 4 bits
 	// use the whole-vector implementation, anything narrower
 	// uses the bit-at-a-time one. The choice is stored in
 	// cp->opcode and then invoked through it.
      assert(cp->bit_idx[0] >= 4);
      cp->opcode = (cp->number > 4) ? &of_NAND_wide : &of_NAND_narrow;
      return cp->opcode(thr, cp);
 }
 bool of_NOOP(vthread_t thr, vvp_code_t cp)
 {
@ -3269,28 +3319,74 @@ bool of_XNORR(vthread_t thr, vvp_code_t cp)
      return true;
 }
 static bool of_OR_wide(vthread_t thr, vvp_code_t cp)
 {
 	// Whole-vector OR. The destination (bit_idx[0]) and source
 	// (bit_idx[1]) ranges are read as vectors of cp->number bits,
 	// combined with operator |=, and the result is written back
 	// over the destination range.
      const unsigned dst = cp->bit_idx[0];
      const unsigned src = cp->bit_idx[1];
      const unsigned width = cp->number;
      vvp_vector4_t result = vthread_bits_to_vector(thr, dst, width);
      result |= vthread_bits_to_vector(thr, src, width);
      thr->bits4.set_vec(dst, result);
      return true;
 }
 static bool of_OR_narrow(vthread_t thr, vvp_code_t cp)
 {
 	// Bit-at-a-time OR. Each destination bit is replaced by the
 	// OR of itself and the matching source bit.
      unsigned dst = cp->bit_idx[0];
      unsigned src = cp->bit_idx[1];
      for (unsigned remaining = cp->number ; remaining > 0 ; remaining -= 1) {
 	    vvp_bit4_t lhs = thr_get_bit(thr, dst);
 	    vvp_bit4_t rhs = thr_get_bit(thr, src);
 	    thr_put_bit(thr, dst, lhs|rhs);
 	    dst += 1;
 	      // A source index below 4 stays put, so that one bit
 	      // is reused for every destination bit.
 	    if (src >= 4)
 		  src += 1;
      }
      return true;
 }
 bool of_OR(vthread_t thr, vvp_code_t cp)
 {
 	// Width-based selector for OR. Vectors wider than 4 bits
 	// use the whole-vector implementation, anything narrower
 	// uses the bit-at-a-time one. The choice is stored in
 	// cp->opcode and then invoked through it.
      assert(cp->bit_idx[0] >= 4);
      cp->opcode = (cp->number > 4) ? &of_OR_wide : &of_OR_narrow;
      return cp->opcode(thr, cp);
 }
 static bool of_NOR_wide(vthread_t thr, vvp_code_t cp)
 {
 	// Whole-vector NOR. ORs the wid-bit vectors at idx1 and idx2
 	// with operator |=, then stores the inverse (~val) back at
 	// idx1. Always returns true.
 	// The destination index must be at least 4.
      assert(cp->bit_idx[0] >= 4);
      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];
      unsigned wid = cp->number;
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
+      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
      val |= vthread_bits_to_vector(thr, idx2, wid);
      thr->bits4.set_vec(idx1, ~val);
      return true;
 }
 static bool of_NOR_narrow(vthread_t thr, vvp_code_t cp)
 {
 	// Bit-at-a-time NOR. For each of wid bits, reads one bit at
 	// idx1 and one at idx2 and stores ~(lb|rb) back at idx1.
      unsigned idx1 = cp->bit_idx[0];
      unsigned idx2 = cp->bit_idx[1];
      unsigned wid = cp->number;
      for (unsigned idx = 0 ; idx < wid ; idx += 1) {
 	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
 	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
+	    thr_put_bit(thr, idx1, ~(lb|rb));
 	      // NOTE(review): the if/else chain below stores BIT4_1
 	      // when either input is 1 and BIT4_0 when both are 0.
 	      // That is the OR truth table, not NOR, and it would
 	      // overwrite the store above. It looks like removed
 	      // lines from the old code shown without their diff
 	      // marker -- confirm it is not live.
 	    if ((lb == BIT4_1) || (rb == BIT4_1)) {
 		  thr_put_bit(thr, idx1, BIT4_1);
 	    } else if ((lb == BIT4_0) && (rb == BIT4_0)) {
 		  thr_put_bit(thr, idx1, BIT4_0);
 	    } else {
 		  thr_put_bit(thr, idx1, BIT4_X);
 	    }
 	    idx1 += 1;
 	      // A source index below 4 is not advanced, so the same
 	      // source bit is reused for every destination bit
 	      // (presumably bits 0-3 are constant bits -- confirm).
 	    if (idx2 >= 4)
 		  idx2 += 1;
@ -3303,30 +3399,12 @@ bool of_NOR(vthread_t thr, vvp_code_t cp)
 {
 	// Width-based selector for NOR. The added lines pick
 	// of_NOR_narrow when cp->number is at most 4 and of_NOR_wide
 	// otherwise, store the choice in cp->opcode, and call it.
 	// NOTE(review): the statements after the added return are
 	// the old bit-at-a-time NOR loop (they still use idx1/idx2,
 	// whose declarations are on removed lines) -- presumably all
 	// removed by this change; confirm.
      assert(cp->bit_idx[0] >= 4);
-      unsigned idx1 = cp->bit_idx[0];
+      if (cp->number <= 4)
-      unsigned idx2 = cp->bit_idx[1];
+	    cp->opcode = &of_NOR_narrow;
      else
 	    cp->opcode = &of_NOR_wide;
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
+      return cp->opcode(thr, cp);
 	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
 	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
 	    if ((lb == BIT4_1) || (rb == BIT4_1)) {
 		  thr_put_bit(thr, idx1, BIT4_0);
 	    } else if ((lb == BIT4_0) && (rb == BIT4_0)) {
 		  thr_put_bit(thr, idx1, BIT4_1);
 	    } else {
 		  thr_put_bit(thr, idx1, BIT4_X);
 	    }
 	    idx1 += 1;
 	    if (idx2 >= 4)
 		  idx2 += 1;
      }
      return true;
 }
 bool of_POW(vthread_t thr, vvp_code_t cp)
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@ -874,6 +874,93 @@ char* vvp_vector4_t::as_string(char*buf, size_t buf_len)
      return res;
 }
 void vvp_vector4_t::invert()
 {
 	// Invert the vector in place. The a-bits are complemented
 	// and then ORed with the b-bits, so any bit whose b-bit is
 	// set ends up with both bits set. Bits beyond size_ in the
 	// last (or only) word are masked off before the OR.
      if (size_ <= BITS_PER_WORD) {
 	      // Vectors of at most one word use the *_val_ members.
 	    const unsigned long keep = (size_<BITS_PER_WORD)? (1UL<<size_)-1UL : -1UL;
 	    abits_val_ = (~abits_val_ & keep) | bbits_val_;
 	    return;
      }

 	// Wider vectors use the *_ptr_ arrays: all the full words
 	// first, then the partial tail word if there is one.
      const unsigned full_words = size_ / BITS_PER_WORD;
      const unsigned tail_bits = size_ % BITS_PER_WORD;
      for (unsigned word = 0 ; word < full_words ; word += 1)
 	    abits_ptr_[word] = ~abits_ptr_[word] | bbits_ptr_[word];

      if (tail_bits > 0) {
 	    const unsigned long keep = (1UL<<tail_bits) - 1UL;
 	    abits_ptr_[full_words] = (~abits_ptr_[full_words] & keep) | bbits_ptr_[full_words];
      }
 }
 vvp_vector4_t& vvp_vector4_t::operator &= (const vvp_vector4_t&that)
 {
 	// In-place four-state bitwise AND of this vector with that.
 	// Returns *this. Only this vector's words are walked, so
 	// that is expected to be at least as wide.

 	// Make sure that all Z bits are turned into X bits.
      change_z2x();
 	// This is sneaky. The truth table is:
 	//     00 01 11
 	//  00 00 00 00
 	//  01 00 01 11
 	//  11 00 11 11
 	// Each pair is (b-bit, a-bit): 00 is 0, 01 is 1, 11 is X.
 	//
 	// The operand "that" is not normalized, so it may still
 	// carry Z bits (b-bit set, a-bit clear). The a-bits are
 	// therefore ANDed with tmp rather than with that.abits:
 	// tmp equals that.abits for 0, 1 and X, and treats Z as X.
 	// ANDing with the raw a-bits made 1&Z come out as 0 and X&Z
 	// come out as Z instead of X.
      if (size_ <= BITS_PER_WORD) {
 	      // Each tmp bit is true if that is 1, X or Z.
 	    unsigned long tmp = that.abits_val_ | that.bbits_val_;
 	    abits_val_ &= tmp;
 	    bbits_val_ = (bbits_val_ & tmp) | (abits_val_&that.bbits_val_);
      } else {
 	    unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
 	    for (unsigned idx = 0; idx < words ; idx += 1) {
 		    // Each tmp bit is true if that is 1, X or Z.
 		  unsigned long tmp = that.abits_ptr_[idx]|that.bbits_ptr_[idx];
 		  abits_ptr_[idx] &= tmp;
 		  bbits_ptr_[idx] = (bbits_ptr_[idx]&tmp) | (abits_ptr_[idx]&that.bbits_ptr_[idx]);
 	    }
      }
      return *this;
 }
 vvp_vector4_t& vvp_vector4_t::operator |= (const vvp_vector4_t&that)
 {
 	// In-place four-state bitwise OR of this vector with that.
 	// Returns *this. Only this vector's words are walked, so
 	// that is expected to be at least as wide.

 	// Make sure that all Z bits are turned into X bits.
      change_z2x();
 	// This is sneaky.
 	// The OR is 1 if either operand is 1.
 	// The OR is 0 if both operands are 0.
 	// Otherwise, the OR is X. The truth table is:
 	//
 	//     00 01 11
 	//  00 00 01 11
 	//  01 01 01 01
 	//  11 11 01 11
 	//
 	// Each pair is (b-bit, a-bit): 00 is 0, 01 is 1, 11 is X.
 	// A bit is a definite 1 when its a-bit is set and its b-bit
 	// is clear, so (~a | b) means "not a definite 1". The result
 	// b-bit is set when one side is X/Z and the other side is
 	// not a definite 1. Writing the test this way also covers Z
 	// bits in "that" (b-bit set, a-bit clear), which is not
 	// normalized here: X|Z used to come out as 1 instead of X.
      if (size_ <= BITS_PER_WORD) {
 	      // tmp1/tmp2 bits are true if the operand is 1, X or Z.
 	    unsigned long tmp1 = abits_val_ | bbits_val_;
 	    unsigned long tmp2 = that.abits_val_ | that.bbits_val_;
 	      // The b-bits must be computed from the old a-bits.
 	    bbits_val_ =  ((~abits_val_ | bbits_val_) & that.bbits_val_)
 		        | ((~that.abits_val_ | that.bbits_val_) & bbits_val_);
 	    abits_val_ = tmp1 | tmp2;
      } else {
 	    unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
 	    for (unsigned idx = 0; idx < words ; idx += 1) {
 		  unsigned long tmp1 = abits_ptr_[idx] | bbits_ptr_[idx];
 		  unsigned long tmp2 = that.abits_ptr_[idx] | that.bbits_ptr_[idx];
 		    // The b-bits must be computed from the old a-bits.
 		  bbits_ptr_[idx] =  ((~abits_ptr_[idx] | bbits_ptr_[idx]) & that.bbits_ptr_[idx])
 			        | ((~that.abits_ptr_[idx] | that.bbits_ptr_[idx]) & bbits_ptr_[idx]);
 		  abits_ptr_[idx] = tmp1 | tmp2;
 	    }
      }
      return *this;
 }
 /*
 * Add an integer to the vvp_vector4_t in place, bit by bit so that
 * there is no size limitations.
@ -2872,20 +2959,6 @@ vvp_bit4_t compare_gtge(const vvp_vector4_t&lef, const vvp_vector4_t&rig,
      return out_if_equal;
 }
 vvp_vector4_t operator ~ (const vvp_vector4_t&that)
 {
 	// Return the bitwise inverse of that as a new vector. The
 	// a-bits are complemented and ORed with the b-bits, so any
 	// bit whose b-bit is set ends up with both bits set.
 	// NOTE(review): the hunk header above suggests this
 	// out-of-line definition is removed in favor of the inline
 	// operator ~ in vvp_net.h -- confirm only one definition
 	// remains.
      vvp_vector4_t res = that;
      if (res.size_ <= vvp_vector4_t::BITS_PER_WORD) {
 	    res.abits_val_ = res.bbits_val_ | ~res.abits_val_;
      } else {
 	    unsigned cnt = (res.size_ + vvp_vector4_t::BITS_PER_WORD - 1) / vvp_vector4_t::BITS_PER_WORD;
 	      // Wide vectors live in the *_ptr_ arrays, so each word
 	      // must be inverted from its own array element. The
 	      // *_val_ members read here before do not vary with idx.
 	    for (unsigned idx = 0 ; idx < cnt ; idx += 1)
 		  res.abits_ptr_[idx] = res.bbits_ptr_[idx] | ~res.abits_ptr_[idx];
      }
      return res;
 }
 vvp_bit4_t compare_gtge_signed(const vvp_vector4_t&a,
 			       const vvp_vector4_t&b,
 			       vvp_bit4_t out_if_equal)
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@ -152,6 +152,9 @@ class vvp_vector4_t {
 	// Display the value into the buf as a string.
      char*as_string(char*buf, size_t buf_len);
      // Invert every bit of this vector in place.
      void invert();
      // In-place four-state bitwise AND / OR with that; both
      // return a reference to this vector.
      vvp_vector4_t& operator &= (const vvp_vector4_t&that);
      vvp_vector4_t& operator |= (const vvp_vector4_t&that);
      vvp_vector4_t& operator += (int64_t);
    private:
@ -329,7 +332,13 @@ inline void vvp_vector4_t::set_bit(unsigned idx, vvp_bit4_t val)
      }
 }
-extern vvp_vector4_t operator ~ (const vvp_vector4_t&that);
+inline vvp_vector4_t operator ~ (const vvp_vector4_t&that)
 {
 	// Bitwise inverse as a new value: copy the operand, then
 	// invert the copy in place with vvp_vector4_t::invert().
      vvp_vector4_t res = that;
      res.invert();
      return res;
 }
 extern ostream& operator << (ostream&, const vvp_vector4_t&);
 extern vvp_bit4_t compare_gtge(const vvp_vector4_t&a,