Merge pull request #1342 from larsclausen/vvp-reduce-speed-up

vvp: Improve reduction operator performance
This commit is contained in:
Cary R. 2026-05-08 05:17:52 -07:00 committed by GitHub
commit e0e4a2af48
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 122 additions and 134 deletions

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005-2025 Stephen Williams (steve@icarus.com)
* Copyright (c) 2005-2026 Stephen Williams (steve@icarus.com)
*
* This source code is free software; you can redistribute it
* and/or modify it in source code form under the terms of the GNU
@ -97,12 +97,7 @@ vvp_reduce_and::~vvp_reduce_and()
// Compute the AND reduction over bits_.
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers
// were stripped. Presumably the per-bit loop ending in "return res;" is the
// removed implementation and "return bits_.reduce_and();" is its replacement;
// read literally, the second return is unreachable.
vvp_bit4_t vvp_reduce_and::calculate_result() const
{
vvp_bit4_t res = BIT4_1;
for (unsigned idx = 0 ; idx < bits_.size() ; idx += 1)
res = res & bits_.value(idx);
return res;
return bits_.reduce_and();
}
class vvp_reduce_or : public vvp_reduce_base {
@ -123,12 +118,7 @@ vvp_reduce_or::~vvp_reduce_or()
// Compute the OR reduction over bits_.
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers
// were stripped. Presumably the per-bit loop ending in "return res;" is the
// removed implementation and "return bits_.reduce_or();" is its replacement;
// read literally, the second return is unreachable.
vvp_bit4_t vvp_reduce_or::calculate_result() const
{
vvp_bit4_t res = BIT4_0;
for (unsigned idx = 0 ; idx < bits_.size() ; idx += 1)
res = res | bits_.value(idx);
return res;
return bits_.reduce_or();
}
class vvp_reduce_xor : public vvp_reduce_base {
@ -149,12 +139,7 @@ vvp_reduce_xor::~vvp_reduce_xor()
// Compute the XOR reduction over bits_.
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers
// were stripped. Presumably the per-bit loop ending in "return res;" is the
// removed implementation and "return bits_.reduce_xor();" is its replacement;
// read literally, the second return is unreachable.
vvp_bit4_t vvp_reduce_xor::calculate_result() const
{
vvp_bit4_t res = BIT4_0;
for (unsigned idx = 0 ; idx < bits_.size() ; idx += 1)
res = res ^ bits_.value(idx);
return res;
return bits_.reduce_xor();
}
class vvp_reduce_nand : public vvp_reduce_base {
@ -175,12 +160,7 @@ vvp_reduce_nand::~vvp_reduce_nand()
// Compute the NAND reduction over bits_: the AND reduction, inverted with ~.
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers
// were stripped. Presumably the per-bit loop ending in "return ~res;" is the
// removed implementation and "return ~bits_.reduce_and();" is its
// replacement; read literally, the second return is unreachable.
vvp_bit4_t vvp_reduce_nand::calculate_result() const
{
vvp_bit4_t res = BIT4_1;
for (unsigned idx = 0 ; idx < bits_.size() ; idx += 1)
res = res & bits_.value(idx);
return ~res;
return ~bits_.reduce_and();
}
class vvp_reduce_nor : public vvp_reduce_base {
@ -201,12 +181,7 @@ vvp_reduce_nor::~vvp_reduce_nor()
// Compute the NOR reduction over bits_: the OR reduction, inverted with ~.
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers
// were stripped. Presumably the per-bit loop ending in "return ~res;" is the
// removed implementation and "return ~bits_.reduce_or();" is its
// replacement; read literally, the second return is unreachable.
vvp_bit4_t vvp_reduce_nor::calculate_result() const
{
vvp_bit4_t res = BIT4_0;
for (unsigned idx = 0 ; idx < bits_.size() ; idx += 1)
res = res | bits_.value(idx);
return ~res;
return ~bits_.reduce_or();
}
class vvp_reduce_xnor : public vvp_reduce_base {
@ -227,12 +202,7 @@ vvp_reduce_xnor::~vvp_reduce_xnor()
// Compute the XNOR reduction over bits_: the XOR reduction, inverted with ~.
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers
// were stripped. Presumably the per-bit loop ending in "return ~res;" is the
// removed implementation and "return ~bits_.reduce_xor();" is its
// replacement; read literally, the second return is unreachable.
vvp_bit4_t vvp_reduce_xnor::calculate_result() const
{
vvp_bit4_t res = BIT4_0;
for (unsigned idx = 0 ; idx < bits_.size() ; idx += 1)
res = res ^ bits_.value(idx);
return ~res;
return ~bits_.reduce_xor();
}
static void make_reduce(char*label, vvp_net_fun_t*red, const struct symb_s&arg)

View File

@ -4576,24 +4576,8 @@ bool of_NOOP(vthread_t, vvp_code_t)
*/
bool of_NORR(vthread_t thr, vvp_code_t)
{
// Replace the vec4 on top of the thread's stack with its 1-bit NOR reduction.
// NOTE(review): `val` is declared twice in this scope (by value from
// pop_vec4(), then by reference from peek_vec4()), so the span cannot compile
// as shown. It appears to be a diff hunk with removed and added lines
// interleaved: presumably the pop/loop/push sequence is the old code and the
// peek_vec4()/reduce_or() pair is the new code.
vvp_vector4_t val = thr->pop_vec4();
vvp_bit4_t lb = BIT4_1;
for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
vvp_bit4_t rb = val.value(idx);
// A single 1 bit decides the NOR result; stop scanning.
if (rb == BIT4_1) {
lb = BIT4_0;
break;
}
// Any bit that is neither 0 nor 1 makes the result X unless a 1 is found later.
if (rb != BIT4_0)
lb = BIT4_X;
}
vvp_vector4_t res (1, lb);
thr->push_vec4(res);
// In-place form: overwrite the top-of-stack vector with the 1-bit result.
vvp_vector4_t&val = thr->peek_vec4();
val = vvp_vector4_t(1, ~val.reduce_or());
return true;
}
@ -4613,23 +4597,8 @@ bool of_NULL(vthread_t thr, vvp_code_t)
*/
bool of_ANDR(vthread_t thr, vvp_code_t)
{
// Replace the vec4 on top of the thread's stack with its 1-bit AND reduction.
// NOTE(review): `val` is declared twice in this scope (by value from
// pop_vec4(), then by reference from peek_vec4()), so the span cannot compile
// as shown. It appears to be a diff hunk with removed and added lines
// interleaved: presumably the pop/loop/push sequence is the old code and the
// peek_vec4()/reduce_and() pair is the new code.
vvp_vector4_t val = thr->pop_vec4();
vvp_bit4_t lb = BIT4_1;
for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
vvp_bit4_t rb = val.value(idx);
// A single 0 bit decides the AND result; stop scanning.
if (rb == BIT4_0) {
lb = BIT4_0;
break;
}
// NOTE(review): compares against the literal 1 rather than BIT4_1, unlike
// the sibling opcodes in this span.
if (rb != 1)
lb = BIT4_X;
}
vvp_vector4_t res (1, lb);
thr->push_vec4(res);
// In-place form: overwrite the top-of-stack vector with the 1-bit result.
vvp_vector4_t&val = thr->peek_vec4();
val = vvp_vector4_t(1, val.reduce_and());
return true;
}
@ -4639,23 +4608,8 @@ bool of_ANDR(vthread_t thr, vvp_code_t)
*/
bool of_NANDR(vthread_t thr, vvp_code_t)
{
// Replace the vec4 on top of the thread's stack with its 1-bit NAND reduction.
// NOTE(review): `val` is declared twice in this scope (by value from
// pop_vec4(), then by reference from peek_vec4()), so the span cannot compile
// as shown. It appears to be a diff hunk with removed and added lines
// interleaved: presumably the pop/loop/push sequence is the old code and the
// peek_vec4()/reduce_and() pair is the new code.
vvp_vector4_t val = thr->pop_vec4();
vvp_bit4_t lb = BIT4_0;
for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
vvp_bit4_t rb = val.value(idx);
// A single 0 bit decides the NAND result; stop scanning.
if (rb == BIT4_0) {
lb = BIT4_1;
break;
}
// Any bit that is neither 0 nor 1 makes the result X unless a 0 is found later.
if (rb != BIT4_1)
lb = BIT4_X;
}
vvp_vector4_t res (1, lb);
thr->push_vec4(res);
// In-place form: overwrite the top-of-stack vector with the 1-bit result.
vvp_vector4_t&val = thr->peek_vec4();
val = vvp_vector4_t(1, ~val.reduce_and());
return true;
}
@ -4665,22 +4619,9 @@ bool of_NANDR(vthread_t thr, vvp_code_t)
*/
bool of_ORR(vthread_t thr, vvp_code_t)
{
// Replace the vec4 on top of the thread's stack with its 1-bit OR reduction.
// NOTE(review): `val` is declared twice in this scope (by value from
// pop_vec4(), then by reference from peek_vec4()), so the span cannot compile
// as shown. It appears to be a diff hunk with removed and added lines
// interleaved: presumably the peek_vec4()/reduce_or() pair is the new code
// and the pop/loop/push sequence is the old code.
vvp_vector4_t val = thr->pop_vec4();
vvp_vector4_t&val = thr->peek_vec4();
val = vvp_vector4_t(1, val.reduce_or());
vvp_bit4_t lb = BIT4_0;
for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
vvp_bit4_t rb = val.value(idx);
// A single 1 bit decides the OR result; stop scanning.
if (rb == BIT4_1) {
lb = BIT4_1;
break;
}
// Any bit that is neither 0 nor 1 makes the result X unless a 1 is found later.
if (rb != BIT4_0)
lb = BIT4_X;
}
vvp_vector4_t res (1, lb);
thr->push_vec4(res);
return true;
}
@ -4689,22 +4630,9 @@ bool of_ORR(vthread_t thr, vvp_code_t)
*/
bool of_XORR(vthread_t thr, vvp_code_t)
{
// Replace the vec4 on top of the thread's stack with its 1-bit XOR reduction.
// NOTE(review): `val` is declared twice in this scope (by value from
// pop_vec4(), then by reference from peek_vec4()), so the span cannot compile
// as shown. It appears to be a diff hunk with removed and added lines
// interleaved: presumably the peek_vec4()/reduce_xor() pair is the new code
// and the pop/loop/push sequence is the old code.
vvp_vector4_t val = thr->pop_vec4();
vvp_vector4_t&val = thr->peek_vec4();
val = vvp_vector4_t(1, val.reduce_xor());
vvp_bit4_t lb = BIT4_0;
for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
vvp_bit4_t rb = val.value(idx);
// Each 1 bit toggles the running result.
if (rb == BIT4_1)
lb = ~lb;
// Any bit that is neither 0 nor 1 forces the result to X; stop scanning.
else if (rb != BIT4_0) {
lb = BIT4_X;
break;
}
}
vvp_vector4_t res (1, lb);
thr->push_vec4(res);
return true;
}
@ -4713,22 +4641,9 @@ bool of_XORR(vthread_t thr, vvp_code_t)
*/
bool of_XNORR(vthread_t thr, vvp_code_t)
{
// Replace the vec4 on top of the thread's stack with its 1-bit XNOR reduction.
// NOTE(review): `val` is declared twice in this scope (by value from
// pop_vec4(), then by reference from peek_vec4()), so the span cannot compile
// as shown. It appears to be a diff hunk with removed and added lines
// interleaved: presumably the peek_vec4()/reduce_xor() pair is the new code
// and the pop/loop/push sequence is the old code.
vvp_vector4_t val = thr->pop_vec4();
vvp_vector4_t&val = thr->peek_vec4();
val = vvp_vector4_t(1, ~val.reduce_xor());
vvp_bit4_t lb = BIT4_1;
for (unsigned idx = 0 ; idx < val.size() ; idx += 1) {
vvp_bit4_t rb = val.value(idx);
// Each 1 bit toggles the running result, which starts at 1 for XNOR.
if (rb == BIT4_1)
lb = ~lb;
// Any bit that is neither 0 nor 1 forces the result to X; stop scanning.
else if (rb != BIT4_0) {
lb = BIT4_X;
break;
}
}
vvp_vector4_t res (1, lb);
thr->push_vec4(res);
return true;
}

View File

@ -1905,6 +1905,105 @@ void vvp_vector4_t::invert()
}
}
// BIT_MASK(n): an unsigned long with the low n bits set, for 0 < n < BITS_PER_WORD.
// n == 0 yields an all-ones mask: the reduce_*() functions pass
// size_ % BITS_PER_WORD, where a remainder of 0 is treated as a completely
// filled last word.
#define BIT_MASK(n) ((n) ? ((~0UL) >> (BITS_PER_WORD - (n))) : ~0UL)
/*
 * OR-reduce every bit of the vector into a single vvp_bit4_t, working a
 * whole storage word at a time.
 *
 * Returns BIT4_1 as soon as a word holds an in-range bit whose A bit is set
 * and whose B bit is clear. Otherwise returns BIT4_X if any in-range B bit
 * is set, and BIT4_0 if none is. An empty vector reduces to BIT4_0, the
 * same value a per-bit loop starting from BIT4_0 produces for zero bits.
 */
vvp_bit4_t vvp_vector4_t::reduce_or() const
{
      // BIT_MASK(0) is all ones (meaning "last word completely filled"),
      // so without this guard an empty vector would be judged by whatever
      // the unused storage word happens to contain.
      if (size_ == 0)
            return BIT4_0;

      // Selects the valid bits of the last (possibly partial) word.
      const unsigned long mask = BIT_MASK(size_ % BITS_PER_WORD);

      // Small vectors keep their bits directly in abits_val_/bbits_val_.
      if (size_ <= BITS_PER_WORD) {
            if ((abits_val_ & ~bbits_val_ & mask) != 0UL)
                  return BIT4_1;
            if ((bbits_val_ & mask) != 0UL)
                  return BIT4_X;
            return BIT4_0;
      }

      const unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
      const unsigned last = words - 1;
      vvp_bit4_t res = BIT4_0;

      // All words before the last are fully populated; no mask needed.
      for (unsigned idx = 0; idx < last; idx += 1) {
            if ((abits_ptr_[idx] & ~bbits_ptr_[idx]) != 0UL)
                  return BIT4_1;
            // Keep scanning: a later 1 bit still overrides X.
            if (bbits_ptr_[idx] != 0UL)
                  res = BIT4_X;
      }

      // The last word may be partial, so ignore bits beyond size_.
      if ((abits_ptr_[last] & ~bbits_ptr_[last] & mask) != 0UL)
            return BIT4_1;
      if ((bbits_ptr_[last] & mask) != 0UL)
            res = BIT4_X;

      return res;
}
/*
 * AND-reduce every bit of the vector into a single vvp_bit4_t, working a
 * whole storage word at a time.
 *
 * Returns BIT4_0 as soon as a word holds an in-range bit with both its A
 * and B bits clear. Otherwise returns BIT4_X if any in-range B bit is set,
 * and BIT4_1 if none is. An empty vector reduces to BIT4_1, the same value
 * a per-bit loop starting from BIT4_1 produces for zero bits.
 */
vvp_bit4_t vvp_vector4_t::reduce_and() const
{
      // BIT_MASK(0) is all ones (meaning "last word completely filled"),
      // so without this guard an empty vector would be judged by whatever
      // the unused storage word happens to contain.
      if (size_ == 0)
            return BIT4_1;

      // Selects the valid bits of the last (possibly partial) word.
      const unsigned long mask = BIT_MASK(size_ % BITS_PER_WORD);

      // Small vectors keep their bits directly in abits_val_/bbits_val_.
      if (size_ <= BITS_PER_WORD) {
            // OR-ing in ~mask makes out-of-range bits look "not zero".
            if ((abits_val_ | bbits_val_ | ~mask) != ~0UL)
                  return BIT4_0;
            if ((bbits_val_ & mask) != 0UL)
                  return BIT4_X;
            return BIT4_1;
      }

      const unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
      const unsigned last = words - 1;
      vvp_bit4_t res = BIT4_1;

      // All words before the last are fully populated; no mask needed.
      for (unsigned idx = 0; idx < last; idx += 1) {
            if ((abits_ptr_[idx] | bbits_ptr_[idx]) != ~0UL)
                  return BIT4_0;
            // Keep scanning: a later 0 bit still overrides X.
            if (bbits_ptr_[idx] != 0UL)
                  res = BIT4_X;
      }

      // The last word may be partial, so ignore bits beyond size_.
      if ((abits_ptr_[last] | bbits_ptr_[last] | ~mask) != ~0UL)
            return BIT4_0;
      if ((bbits_ptr_[last] & mask) != 0UL)
            res = BIT4_X;

      return res;
}
/*
 * Return 1 if an odd number of bits are set in val, 0 otherwise.
 *
 * With GNU-compatible compilers this defers to __builtin_parityl. The
 * portable fallback XOR-folds the word in half repeatedly until the parity
 * is held in the low nibble, then looks that nibble up in the 16-entry
 * parity table 0x6996. The fold widths are derived from
 * sizeof(unsigned long) rather than from ULONG_MAX, so the fallback does
 * not silently skip the upper half of a 64-bit word when <climits> is not
 * in scope (an undefined macro evaluates to 0 inside #if).
 */
static unsigned long parity(unsigned long val)
{
#if defined(__GNUC__)
      // The compiler builtin can use target-specific CPU instructions.
      return __builtin_parityl(val);
#else
      // Fold by half the word width, then a quarter, ... down to 4 bits.
      for (unsigned shift = static_cast<unsigned>(8 * sizeof(unsigned long)) / 2;
           shift >= 4; shift /= 2)
            val ^= val >> shift;
      // Bit i of 0x6996 is the parity of the 4-bit value i.
      return (0x6996UL >> (val & 0xfUL)) & 1UL;
#endif
}
/*
 * XOR-reduce every bit of the vector into a single vvp_bit4_t, working a
 * whole storage word at a time.
 *
 * Returns BIT4_X as soon as any in-range B bit is set. Otherwise returns
 * BIT4_1 when the in-range A bits have odd parity and BIT4_0 when they have
 * even parity. An empty vector reduces to BIT4_0, the same value a per-bit
 * loop starting from BIT4_0 produces for zero bits.
 */
vvp_bit4_t vvp_vector4_t::reduce_xor() const
{
      // BIT_MASK(0) is all ones (meaning "last word completely filled"),
      // so without this guard an empty vector would be judged by whatever
      // the unused storage word happens to contain.
      if (size_ == 0)
            return BIT4_0;

      // Selects the valid bits of the last (possibly partial) word.
      const unsigned long mask = BIT_MASK(size_ % BITS_PER_WORD);

      // Small vectors keep their bits directly in abits_val_/bbits_val_.
      if (size_ <= BITS_PER_WORD) {
            if ((bbits_val_ & mask) != 0UL)
                  return BIT4_X;
            return parity(abits_val_ & mask) ? BIT4_1 : BIT4_0;
      }

      const unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
      const unsigned last = words - 1;

      // XOR the A words together; the parity of the folded word equals the
      // parity of all bits combined.
      unsigned long folded = 0UL;
      for (unsigned idx = 0; idx < last; idx += 1) {
            if (bbits_ptr_[idx] != 0UL)
                  return BIT4_X;
            folded ^= abits_ptr_[idx];
      }

      // The last word may be partial, so ignore bits beyond size_.
      if ((bbits_ptr_[last] & mask) != 0UL)
            return BIT4_X;
      folded ^= abits_ptr_[last] & mask;

      return parity(folded) ? BIT4_1 : BIT4_0;
}
vvp_vector4_t& vvp_vector4_t::operator &= (const vvp_vector4_t&that)
{
// The truth table is:

View File

@ -326,6 +326,10 @@ class vvp_vector4_t {
vvp_vector4_t& operator ^= (const vvp_vector4_t&that);
vvp_vector4_t& operator += (int64_t);
vvp_bit4_t reduce_or() const;
vvp_bit4_t reduce_and() const;
vvp_bit4_t reduce_xor() const;
private:
// Number of vvp_bit4_t bits that can be shoved into a word.
enum { BITS_PER_WORD = 8*sizeof(unsigned long) };