Merge pull request #1342 from larsclausen/vvp-reduce-speed-up

vvp: Improve reduction operator performance
2026-05-08 05:17:52 -07:00 · 2026-05-08 05:17:52 -07:00 · e0e4a2af48
parent e4c4247266 48242818b3
commit e0e4a2af48
4 changed files with 122 additions and 134 deletions
--- a/vvp/reduce.cc
+++ b/vvp/reduce.cc
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2005-2025 Stephen Williams (steve@icarus.com)
+ * Copyright (c) 2005-2026 Stephen Williams (steve@icarus.com)
 *
 *    This source code is free software; you can redistribute it
 *    and/or modify it in source code form under the terms of the GNU
@ -97,12 +97,7 @@ vvp_reduce_and::~vvp_reduce_and()

 vvp_bit4_t vvp_reduce_and::calculate_result() const
 {
-      vvp_bit4_t res =  BIT4_1;
-
-      for (unsigned idx = 0 ;  idx < bits_.size() ;  idx += 1)
-	    res = res & bits_.value(idx);
-
-      return res;
+	// Delegate the AND fold to bits_.reduce_and() rather than
+	// combining the bits one at a time here.
+      return bits_.reduce_and();
 }

 class vvp_reduce_or  : public vvp_reduce_base {
@ -123,12 +118,7 @@ vvp_reduce_or::~vvp_reduce_or()

 vvp_bit4_t vvp_reduce_or::calculate_result() const
 {
-      vvp_bit4_t res =  BIT4_0;
-
-      for (unsigned idx = 0 ;  idx < bits_.size() ;  idx += 1)
-	    res = res | bits_.value(idx);
-
-      return res;
+	// Delegate the OR fold to bits_.reduce_or() rather than
+	// combining the bits one at a time here.
+      return bits_.reduce_or();
 }

 class vvp_reduce_xor  : public vvp_reduce_base {
@ -149,12 +139,7 @@ vvp_reduce_xor::~vvp_reduce_xor()

 vvp_bit4_t vvp_reduce_xor::calculate_result() const
 {
-      vvp_bit4_t res =  BIT4_0;
-
-      for (unsigned idx = 0 ;  idx < bits_.size() ;  idx += 1)
-	    res = res ^ bits_.value(idx);
-
-      return res;
+	// Delegate the XOR fold to bits_.reduce_xor() rather than
+	// combining the bits one at a time here.
+      return bits_.reduce_xor();
 }

 class vvp_reduce_nand  : public vvp_reduce_base {
@ -175,12 +160,7 @@ vvp_reduce_nand::~vvp_reduce_nand()

 vvp_bit4_t vvp_reduce_nand::calculate_result() const
 {
-      vvp_bit4_t res =  BIT4_1;
-
-      for (unsigned idx = 0 ;  idx < bits_.size() ;  idx += 1)
-	    res = res & bits_.value(idx);
-
-      return ~res;
+	// NAND is the inverted result of bits_.reduce_and(); the
+	// inversion is applied to the single reduced bit.
+      return ~bits_.reduce_and();
 }

 class vvp_reduce_nor  : public vvp_reduce_base {
@ -201,12 +181,7 @@ vvp_reduce_nor::~vvp_reduce_nor()

 vvp_bit4_t vvp_reduce_nor::calculate_result() const
 {
-      vvp_bit4_t res =  BIT4_0;
-
-      for (unsigned idx = 0 ;  idx < bits_.size() ;  idx += 1)
-	    res = res | bits_.value(idx);
-
-      return ~res;
+	// NOR is the inverted result of bits_.reduce_or(); the
+	// inversion is applied to the single reduced bit.
+      return ~bits_.reduce_or();
 }

 class vvp_reduce_xnor  : public vvp_reduce_base {
@ -227,12 +202,7 @@ vvp_reduce_xnor::~vvp_reduce_xnor()

 vvp_bit4_t vvp_reduce_xnor::calculate_result() const
 {
-      vvp_bit4_t res =  BIT4_0;
-
-      for (unsigned idx = 0 ;  idx < bits_.size() ;  idx += 1)
-	    res = res ^ bits_.value(idx);
-
-      return ~res;
+	// XNOR is the inverted result of bits_.reduce_xor(); the
+	// inversion is applied to the single reduced bit.
+      return ~bits_.reduce_xor();
 }

 static void make_reduce(char*label, vvp_net_fun_t*red, const struct symb_s&arg)
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@ -4576,24 +4576,8 @@ bool of_NOOP(vthread_t, vvp_code_t)
 */
 bool of_NORR(vthread_t thr, vvp_code_t)
 {
-      vvp_vector4_t val = thr->pop_vec4();
-
-      vvp_bit4_t lb = BIT4_1;
-
-      for (unsigned idx = 0 ;  idx < val.size() ;  idx += 1) {
-
-	    vvp_bit4_t rb = val.value(idx);
-	    if (rb == BIT4_1) {
-		  lb = BIT4_0;
-		  break;
-	    }
-
-	    if (rb != BIT4_0)
-		  lb = BIT4_X;
-      }
-
-      vvp_vector4_t res (1, lb);
-      thr->push_vec4(res);
+	// Replace the vector returned by peek_vec4() in place with the
+	// 1-bit inverted OR reduction; the removed code popped the
+	// operand and pushed a new vector instead. The reduction is
+	// evaluated before the assignment overwrites val.
+      vvp_vector4_t&val = thr->peek_vec4();
+      val = vvp_vector4_t(1, ~val.reduce_or());

       return true;
 }
@ -4613,23 +4597,8 @@ bool of_NULL(vthread_t thr, vvp_code_t)
 */
 bool of_ANDR(vthread_t thr, vvp_code_t)
 {
-      vvp_vector4_t val = thr->pop_vec4();
-
-      vvp_bit4_t lb = BIT4_1;
-
-      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
-	    vvp_bit4_t rb = val.value(idx);
-	    if (rb == BIT4_0) {
-		  lb = BIT4_0;
-		  break;
-	    }
-
-	    if (rb != 1)
-		  lb = BIT4_X;
-      }
-
-      vvp_vector4_t res (1, lb);
-      thr->push_vec4(res);
+	// Replace the vector returned by peek_vec4() in place with the
+	// 1-bit AND reduction; the removed code popped the operand and
+	// pushed a new vector instead. The reduction is evaluated
+	// before the assignment overwrites val.
+      vvp_vector4_t&val = thr->peek_vec4();
+      val = vvp_vector4_t(1, val.reduce_and());

       return true;
 }
@ -4639,23 +4608,8 @@ bool of_ANDR(vthread_t thr, vvp_code_t)
 */
 bool of_NANDR(vthread_t thr, vvp_code_t)
 {
-      vvp_vector4_t val = thr->pop_vec4();
-
-      vvp_bit4_t lb = BIT4_0;
-      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
-
-	    vvp_bit4_t rb = val.value(idx);
-	    if (rb == BIT4_0) {
-		  lb = BIT4_1;
-		  break;
-	    }
-
-	    if (rb != BIT4_1)
-		  lb = BIT4_X;
-      }
-
-      vvp_vector4_t res (1, lb);
-      thr->push_vec4(res);
+	// Replace the vector returned by peek_vec4() in place with the
+	// 1-bit inverted AND reduction; the removed code popped the
+	// operand and pushed a new vector instead. The reduction is
+	// evaluated before the assignment overwrites val.
+      vvp_vector4_t&val = thr->peek_vec4();
+      val = vvp_vector4_t(1, ~val.reduce_and());

       return true;
 }
@ -4665,22 +4619,9 @@ bool of_NANDR(vthread_t thr, vvp_code_t)
 */
 bool of_ORR(vthread_t thr, vvp_code_t)
 {
-      vvp_vector4_t val = thr->pop_vec4();
+	// Replace the vector returned by peek_vec4() in place with the
+	// 1-bit OR reduction; the removed code popped the operand and
+	// pushed a new vector instead. The reduction is evaluated
+	// before the assignment overwrites val.
+      vvp_vector4_t&val = thr->peek_vec4();
+      val = vvp_vector4_t(1, val.reduce_or());

-      vvp_bit4_t lb = BIT4_0;
-      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
-	    vvp_bit4_t rb = val.value(idx);
-	    if (rb == BIT4_1) {
-		  lb = BIT4_1;
-		  break;
-	    }
-
-	    if (rb != BIT4_0)
-		  lb = BIT4_X;
-      }
-
-      vvp_vector4_t res (1, lb);
-      thr->push_vec4(res);
       return true;
 }

@ -4689,22 +4630,9 @@ bool of_ORR(vthread_t thr, vvp_code_t)
 */
 bool of_XORR(vthread_t thr, vvp_code_t)
 {
-      vvp_vector4_t val = thr->pop_vec4();
+	// Replace the vector returned by peek_vec4() in place with the
+	// 1-bit XOR reduction; the removed code popped the operand and
+	// pushed a new vector instead. The reduction is evaluated
+	// before the assignment overwrites val.
+      vvp_vector4_t&val = thr->peek_vec4();
+      val = vvp_vector4_t(1, val.reduce_xor());

-      vvp_bit4_t lb = BIT4_0;
-      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
-
-	    vvp_bit4_t rb = val.value(idx);
-	    if (rb == BIT4_1)
-		  lb = ~lb;
-	    else if (rb != BIT4_0) {
-		  lb = BIT4_X;
-		  break;
-	    }
-      }
-
-      vvp_vector4_t res (1, lb);
-      thr->push_vec4(res);
       return true;
 }

@ -4713,22 +4641,9 @@ bool of_XORR(vthread_t thr, vvp_code_t)
 */
 bool of_XNORR(vthread_t thr, vvp_code_t)
 {
-      vvp_vector4_t val = thr->pop_vec4();
+	// Replace the vector returned by peek_vec4() in place with the
+	// 1-bit inverted XOR reduction; the removed code popped the
+	// operand and pushed a new vector instead. The reduction is
+	// evaluated before the assignment overwrites val.
+      vvp_vector4_t&val = thr->peek_vec4();
+      val = vvp_vector4_t(1, ~val.reduce_xor());

-      vvp_bit4_t lb = BIT4_1;
-      for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
-
-	    vvp_bit4_t rb = val.value(idx);
-	    if (rb == BIT4_1)
-		  lb = ~lb;
-	    else if (rb != BIT4_0) {
-		  lb = BIT4_X;
-		  break;
-	    }
-      }
-
-      vvp_vector4_t res (1, lb);
-      thr->push_vec4(res);
       return true;
 }

--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@ -1905,6 +1905,105 @@ void vvp_vector4_t::invert()
      }
 }

+// Mask selecting the low n bits of an unsigned long word. n == 0 yields
+// an all-ones mask: the reduction methods pass (size_ % BITS_PER_WORD),
+// which is 0 when the last word of a non-empty vector is completely
+// used. The conditional also avoids a shift by the full word width.
+#define BIT_MASK(n) ((n) ? ((~0UL) >> (BITS_PER_WORD - (n))) : ~0UL)
+
+/*
+ * Reduction OR over all the bits of the vector, done a word at a time.
+ *
+ * Returns BIT4_1 if any bit is 1, otherwise BIT4_X if any bit has its
+ * b-bit set (X or Z), otherwise BIT4_0. A bit is 1 when its a-bit is
+ * set and its b-bit is clear. An empty vector reduces to BIT4_0.
+ */
+vvp_bit4_t vvp_vector4_t::reduce_or() const
+{
+	// An empty vector has no valid bits, but BIT_MASK(0) selects
+	// the whole word. Return the OR identity instead of testing
+	// storage that holds no valid bits.
+      if (size_ == 0)
+	    return BIT4_0;
+
+	// Valid bits of the last (or only) word.
+      unsigned long mask = BIT_MASK(size_ % BITS_PER_WORD);
+      vvp_bit4_t res = BIT4_0;
+
+      if (size_ <= BITS_PER_WORD) {
+	      // Short vectors keep their bits in the *_val_ members.
+	    if ((abits_val_ & ~bbits_val_ & mask) != 0UL)
+		  return BIT4_1;
+	    if ((bbits_val_ & mask) != 0UL)
+		  return BIT4_X;
+      } else {
+	    unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
+	    unsigned idx;
+	      // All words except the last are fully used, so no mask is
+	      // needed. A 1 ends the scan at once; an X/Z is only
+	      // remembered because a later 1 still wins.
+	    for (idx = 0; idx < words - 1; idx += 1) {
+		  if ((abits_ptr_[idx] & ~bbits_ptr_[idx]) != 0UL)
+			return BIT4_1;
+		  if (bbits_ptr_[idx] != 0UL)
+			res = BIT4_X;
+	    }
+	      // The last word may be partially used; mask off the rest.
+	    if ((abits_ptr_[idx] & ~bbits_ptr_[idx] & mask) != 0UL)
+		  return BIT4_1;
+	    if ((bbits_ptr_[idx] & mask) != 0UL)
+		  res = BIT4_X;
+      }
+
+      return res;
+}
+
+/*
+ * Reduction AND over all the bits of the vector, done a word at a time.
+ *
+ * Returns BIT4_0 if any bit is 0, otherwise BIT4_X if any bit has its
+ * b-bit set (X or Z), otherwise BIT4_1. A bit is 0 when both its a-bit
+ * and its b-bit are clear. An empty vector reduces to BIT4_1.
+ */
+vvp_bit4_t vvp_vector4_t::reduce_and() const
+{
+	// An empty vector has no valid bits, but BIT_MASK(0) selects
+	// the whole word. Return the AND identity instead of testing
+	// storage that holds no valid bits.
+      if (size_ == 0)
+	    return BIT4_1;
+
+	// Valid bits of the last (or only) word.
+      unsigned long mask = BIT_MASK(size_ % BITS_PER_WORD);
+      vvp_bit4_t res = BIT4_1;
+
+      if (size_ <= BITS_PER_WORD) {
+	      // Short vectors keep their bits in the *_val_ members.
+	      // OR-ing in ~mask makes the unused bits look non-zero so
+	      // they cannot trigger the "found a 0" test.
+	    if ((abits_val_ | bbits_val_ | ~mask) != ~0UL)
+		  return BIT4_0;
+	    if ((bbits_val_ & mask) != 0UL)
+		  return BIT4_X;
+      } else {
+	    unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
+	    unsigned idx;
+	      // All words except the last are fully used, so no mask is
+	      // needed. A 0 ends the scan at once; an X/Z is only
+	      // remembered because a later 0 still wins.
+	    for (idx = 0; idx < words - 1; idx += 1) {
+		  if ((abits_ptr_[idx] | bbits_ptr_[idx]) != ~0UL)
+			return BIT4_0;
+		  if (bbits_ptr_[idx] != 0UL)
+			res = BIT4_X;
+	    }
+	      // The last word may be partially used; mask off the rest.
+	    if ((abits_ptr_[idx] | bbits_ptr_[idx] | ~mask) != ~0UL)
+		  return BIT4_0;
+	    if ((bbits_ptr_[idx] & mask) != 0UL)
+		  res = BIT4_X;
+      }
+
+      return res;
+}
+
+/*
+ * Return 1 if val has an odd number of set bits and 0 otherwise.
+ *
+ * NOTE(review): the non-GCC fallback tests ULONG_MAX, which comes from
+ * <climits>. An undefined macro evaluates to 0 in #if, which would
+ * silently skip the 32-bit fold -- confirm the header is included.
+ */
+static unsigned long parity(unsigned long val)
+{
+#if defined(__GNUC__)
+      // The compiler builtin can use target-specific CPU instructions.
+      return __builtin_parityl(val);
+#else
+	// Fold the upper half onto the lower half with XOR until the
+	// parity of the whole word is the parity of the low 4 bits.
+#if ULONG_MAX > 0xffffffffUL
+	val ^= val >> 32;
+#endif
+	val ^= val >> 16;
+	val ^= val >> 8;
+	val ^= val >> 4;
+
+	// 0x6996 is a 16-entry table: bit i is the parity of the
+	// 4-bit value i.
+      return (0x6996 >> (val & 0xf)) & 1;
+#endif
+}
+
+/*
+ * Reduction XOR over all the bits of the vector, done a word at a time.
+ *
+ * Returns BIT4_X if any bit has its b-bit set (X or Z). Otherwise the
+ * result is BIT4_1 when the number of 1 bits is odd and BIT4_0 when it
+ * is even. An empty vector reduces to BIT4_0.
+ */
+vvp_bit4_t vvp_vector4_t::reduce_xor() const
+{
+	// An empty vector has no valid bits, but BIT_MASK(0) selects
+	// the whole word. Return the XOR identity instead of testing
+	// storage that holds no valid bits.
+      if (size_ == 0)
+	    return BIT4_0;
+
+	// Valid bits of the last (or only) word.
+      unsigned long mask = BIT_MASK(size_ % BITS_PER_WORD);
+
+      if (size_ <= BITS_PER_WORD) {
+	      // Short vectors keep their bits in the *_val_ members.
+	    if ((bbits_val_ & mask) != 0UL)
+		  return BIT4_X;
+	    return parity(abits_val_ & mask) ? BIT4_1 : BIT4_0;
+      } else {
+	    unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
+	    unsigned long val_a = 0UL;
+	    unsigned idx;
+	      // XOR the a-bit words together so that parity() only has
+	      // to run once on the combined word. Any X/Z bit makes the
+	      // whole result X, so stop as soon as one is seen.
+	    for (idx = 0; idx < words - 1; idx += 1) {
+		  if (bbits_ptr_[idx] != 0UL)
+			return BIT4_X;
+		  val_a ^= abits_ptr_[idx];
+	    }
+	      // The last word may be partially used; mask off the rest.
+	    if ((bbits_ptr_[idx] & mask) != 0UL)
+		  return BIT4_X;
+	    val_a ^= abits_ptr_[idx] & mask;
+	    return parity(val_a) ? BIT4_1 : BIT4_0;
+      }
+}
+
 vvp_vector4_t& vvp_vector4_t::operator &= (const vvp_vector4_t&that)
 {
 	// The truth table is:
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@ -326,6 +326,10 @@ class vvp_vector4_t {
      vvp_vector4_t& operator ^= (const vvp_vector4_t&that);
      vvp_vector4_t& operator += (int64_t);

+	// Reduce all the bits of the vector to a single vvp_bit4_t
+	// using OR, AND and XOR respectively.
+      vvp_bit4_t reduce_or() const;
+      vvp_bit4_t reduce_and() const;
+      vvp_bit4_t reduce_xor() const;
+
    private:
 	// Number of vvp_bit4_t bits that can be shoved into a word.
      enum { BITS_PER_WORD = 8*sizeof(unsigned long) };