Merge pull request #1342 from larsclausen/vvp-reduce-speed-up
vvp: Improve reduction operator performance
This commit is contained in:
commit
e0e4a2af48
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2005-2025 Stephen Williams (steve@icarus.com)
|
||||
* Copyright (c) 2005-2026 Stephen Williams (steve@icarus.com)
|
||||
*
|
||||
* This source code is free software; you can redistribute it
|
||||
* and/or modify it in source code form under the terms of the GNU
|
||||
|
|
@ -97,12 +97,7 @@ vvp_reduce_and::~vvp_reduce_and()
|
|||
|
||||
/*
 * Compute the reduction AND of the stored input vector. The work is
 * delegated to bits_.reduce_and(), which replaces the earlier
 * bit-at-a-time loop over bits_.value(idx).
 */
vvp_bit4_t vvp_reduce_and::calculate_result() const
{
      return bits_.reduce_and();
}
|
||||
|
||||
class vvp_reduce_or : public vvp_reduce_base {
|
||||
|
|
@ -123,12 +118,7 @@ vvp_reduce_or::~vvp_reduce_or()
|
|||
|
||||
/*
 * Compute the reduction OR of the stored input vector. The work is
 * delegated to bits_.reduce_or(), which replaces the earlier
 * bit-at-a-time loop over bits_.value(idx).
 */
vvp_bit4_t vvp_reduce_or::calculate_result() const
{
      return bits_.reduce_or();
}
|
||||
|
||||
class vvp_reduce_xor : public vvp_reduce_base {
|
||||
|
|
@ -149,12 +139,7 @@ vvp_reduce_xor::~vvp_reduce_xor()
|
|||
|
||||
/*
 * Compute the reduction XOR of the stored input vector. The work is
 * delegated to bits_.reduce_xor(), which replaces the earlier
 * bit-at-a-time loop over bits_.value(idx).
 */
vvp_bit4_t vvp_reduce_xor::calculate_result() const
{
      return bits_.reduce_xor();
}
|
||||
|
||||
class vvp_reduce_nand : public vvp_reduce_base {
|
||||
|
|
@ -175,12 +160,7 @@ vvp_reduce_nand::~vvp_reduce_nand()
|
|||
|
||||
/*
 * Compute the reduction NAND of the stored input vector: the
 * 4-state inverse of bits_.reduce_and(). This replaces the earlier
 * bit-at-a-time loop over bits_.value(idx).
 */
vvp_bit4_t vvp_reduce_nand::calculate_result() const
{
      return ~bits_.reduce_and();
}
|
||||
|
||||
class vvp_reduce_nor : public vvp_reduce_base {
|
||||
|
|
@ -201,12 +181,7 @@ vvp_reduce_nor::~vvp_reduce_nor()
|
|||
|
||||
/*
 * Compute the reduction NOR of the stored input vector: the
 * 4-state inverse of bits_.reduce_or(). This replaces the earlier
 * bit-at-a-time loop over bits_.value(idx).
 */
vvp_bit4_t vvp_reduce_nor::calculate_result() const
{
      return ~bits_.reduce_or();
}
|
||||
|
||||
class vvp_reduce_xnor : public vvp_reduce_base {
|
||||
|
|
@ -227,12 +202,7 @@ vvp_reduce_xnor::~vvp_reduce_xnor()
|
|||
|
||||
/*
 * Compute the reduction XNOR of the stored input vector: the
 * 4-state inverse of bits_.reduce_xor(). This replaces the earlier
 * bit-at-a-time loop over bits_.value(idx).
 */
vvp_bit4_t vvp_reduce_xnor::calculate_result() const
{
      return ~bits_.reduce_xor();
}
|
||||
|
||||
static void make_reduce(char*label, vvp_net_fun_t*red, const struct symb_s&arg)
|
||||
|
|
|
|||
109
vvp/vthread.cc
109
vvp/vthread.cc
|
|
@ -4576,24 +4576,8 @@ bool of_NOOP(vthread_t, vvp_code_t)
|
|||
*/
|
||||
bool of_NORR(vthread_t thr, vvp_code_t)
{
	// Work on the vec4 stack entry in place rather than popping the
	// operand and pushing a freshly built result.
      vvp_vector4_t&val = thr->peek_vec4();
	// The right-hand side is fully evaluated into a temporary before
	// the assignment, so reading val while replacing it is safe. The
	// result is a 1-bit vector holding the inverted reduction OR.
      val = vvp_vector4_t(1, ~val.reduce_or());

      return true;
}
|
||||
|
|
@ -4613,23 +4597,8 @@ bool of_NULL(vthread_t thr, vvp_code_t)
|
|||
*/
|
||||
bool of_ANDR(vthread_t thr, vvp_code_t)
{
	// Work on the vec4 stack entry in place rather than popping the
	// operand and pushing a freshly built result.
      vvp_vector4_t&val = thr->peek_vec4();
	// The right-hand side is fully evaluated into a temporary before
	// the assignment, so reading val while replacing it is safe. The
	// result is a 1-bit vector holding the reduction AND.
      val = vvp_vector4_t(1, val.reduce_and());

      return true;
}
|
||||
|
|
@ -4639,23 +4608,8 @@ bool of_ANDR(vthread_t thr, vvp_code_t)
|
|||
*/
|
||||
bool of_NANDR(vthread_t thr, vvp_code_t)
{
	// Work on the vec4 stack entry in place rather than popping the
	// operand and pushing a freshly built result.
      vvp_vector4_t&val = thr->peek_vec4();
	// The right-hand side is fully evaluated into a temporary before
	// the assignment, so reading val while replacing it is safe. The
	// result is a 1-bit vector holding the inverted reduction AND.
      val = vvp_vector4_t(1, ~val.reduce_and());

      return true;
}
|
||||
|
|
@ -4665,22 +4619,9 @@ bool of_NANDR(vthread_t thr, vvp_code_t)
|
|||
*/
|
||||
bool of_ORR(vthread_t thr, vvp_code_t)
{
	// Work on the vec4 stack entry in place rather than popping the
	// operand and pushing a freshly built result.
      vvp_vector4_t&val = thr->peek_vec4();
	// The right-hand side is fully evaluated into a temporary before
	// the assignment, so reading val while replacing it is safe. The
	// result is a 1-bit vector holding the reduction OR.
      val = vvp_vector4_t(1, val.reduce_or());

      return true;
}
|
||||
|
||||
|
|
@ -4689,22 +4630,9 @@ bool of_ORR(vthread_t thr, vvp_code_t)
|
|||
*/
|
||||
bool of_XORR(vthread_t thr, vvp_code_t)
{
	// Work on the vec4 stack entry in place rather than popping the
	// operand and pushing a freshly built result.
      vvp_vector4_t&val = thr->peek_vec4();
	// The right-hand side is fully evaluated into a temporary before
	// the assignment, so reading val while replacing it is safe. The
	// result is a 1-bit vector holding the reduction XOR.
      val = vvp_vector4_t(1, val.reduce_xor());

      return true;
}
|
||||
|
||||
|
|
@ -4713,22 +4641,9 @@ bool of_XORR(vthread_t thr, vvp_code_t)
|
|||
*/
|
||||
bool of_XNORR(vthread_t thr, vvp_code_t)
{
	// Work on the vec4 stack entry in place rather than popping the
	// operand and pushing a freshly built result.
      vvp_vector4_t&val = thr->peek_vec4();
	// The right-hand side is fully evaluated into a temporary before
	// the assignment, so reading val while replacing it is safe. The
	// result is a 1-bit vector holding the inverted reduction XOR.
      val = vvp_vector4_t(1, ~val.reduce_xor());

      return true;
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1905,6 +1905,105 @@ void vvp_vector4_t::invert()
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Build an unsigned long mask with the low n bits set. An n of zero
 * yields an all-ones mask instead of shifting by BITS_PER_WORD (a
 * shift by the full word width is undefined). The reduce_*() methods
 * pass size_ % BITS_PER_WORD, so zero there means the last word is
 * completely used.
 */
#define BIT_MASK(n) ((n) ? ((~0UL) >> (BITS_PER_WORD - (n))) : ~0UL)
|
||||
|
||||
/*
 * Reduction OR across all the bits of the vector, a word at a time.
 *
 * Returns BIT4_1 as soon as any word has a bit with its abit set and
 * its bbit clear. Otherwise returns BIT4_X if any bbit is set
 * (presumably marking X/Z bits -- confirm against the bit encoding),
 * or BIT4_0 if no bits are set at all.
 */
vvp_bit4_t vvp_vector4_t::reduce_or() const
{
	// With no bits the mask below would be all ones and the answer
	// would depend on unused storage. Return the OR identity instead,
	// which is what a bit-by-bit loop starting at BIT4_0 gives.
      if (size_ == 0)
            return BIT4_0;

	// Mask selecting the valid bits of the last (or only) word.
      unsigned long mask = BIT_MASK(size_ % BITS_PER_WORD);
      vvp_bit4_t res = BIT4_0;

      if (size_ <= BITS_PER_WORD) {
	      // Small vector: the bits live directly in the *_val_ words.
            if ((abits_val_ & ~bbits_val_ & mask) != 0UL)
                  return BIT4_1;
            if ((bbits_val_ & mask) != 0UL)
                  return BIT4_X;
      } else {
            unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
            unsigned idx;
	      // Every word but the last is fully used, so no mask needed.
            for (idx = 0; idx < words - 1; idx += 1) {
                  if ((abits_ptr_[idx] & ~bbits_ptr_[idx]) != 0UL)
                        return BIT4_1;
		    // Keep scanning: a later 1 bit still wins over X.
                  if (bbits_ptr_[idx] != 0UL)
                        res = BIT4_X;
            }
	      // The last word may be partial, so apply the mask.
            if ((abits_ptr_[idx] & ~bbits_ptr_[idx] & mask) != 0UL)
                  return BIT4_1;
            if ((bbits_ptr_[idx] & mask) != 0UL)
                  res = BIT4_X;
      }

      return res;
}
|
||||
|
||||
/*
 * Reduction AND across all the bits of the vector, a word at a time.
 *
 * Returns BIT4_0 as soon as any valid bit has both its abit and its
 * bbit clear. Otherwise returns BIT4_X if any bbit is set (presumably
 * marking X/Z bits -- confirm against the bit encoding), or BIT4_1
 * if no bbit is set.
 */
vvp_bit4_t vvp_vector4_t::reduce_and() const
{
	// With no bits the mask below would be all ones and the answer
	// would depend on unused storage. Return the AND identity instead,
	// which is what a bit-by-bit loop starting at BIT4_1 gives.
      if (size_ == 0)
            return BIT4_1;

	// Mask selecting the valid bits of the last (or only) word.
      unsigned long mask = BIT_MASK(size_ % BITS_PER_WORD);
      vvp_bit4_t res = BIT4_1;

      if (size_ <= BITS_PER_WORD) {
	      // Small vector: the bits live directly in the *_val_ words.
	      // OR-ing in ~mask makes the unused bits look like ones so
	      // that they cannot force a 0 result.
            if ((abits_val_ | bbits_val_ | ~mask) != ~0UL)
                  return BIT4_0;
            if ((bbits_val_ & mask) != 0UL)
                  return BIT4_X;
      } else {
            unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
            unsigned idx;
	      // Every word but the last is fully used, so no mask needed.
            for (idx = 0; idx < words - 1; idx += 1) {
                  if ((abits_ptr_[idx] | bbits_ptr_[idx]) != ~0UL)
                        return BIT4_0;
		    // Keep scanning: a later 0 bit still wins over X.
                  if (bbits_ptr_[idx] != 0UL)
                        res = BIT4_X;
            }
	      // The last word may be partial, so apply the mask.
            if ((abits_ptr_[idx] | bbits_ptr_[idx] | ~mask) != ~0UL)
                  return BIT4_0;
            if ((bbits_ptr_[idx] & mask) != 0UL)
                  res = BIT4_X;
      }

      return res;
}
|
||||
|
||||
/*
 * Return 1 if an odd number of bits are set in val, and 0 otherwise.
 */
static unsigned long parity(unsigned long val)
{
#if defined(__GNUC__)
	// The compiler builtin can use target-specific CPU instructions.
      return __builtin_parityl(val);
#else
	// Fold the word in half repeatedly until the parity of the whole
	// word is held in the low 4 bits. The shift counts are derived
	// from sizeof(unsigned long), so this works for 32-bit and 64-bit
	// longs without relying on ULONG_MAX being defined, and the shift
	// is always less than the word width.
      for (unsigned shift = 4 * sizeof(unsigned long); shift >= 4; shift /= 2)
            val ^= val >> shift;

	// 0x6996 is a 16-entry lookup table: bit i is the parity of i.
      return (0x6996 >> (val & 0xf)) & 1;
#endif
}
|
||||
|
||||
/*
 * Reduction XOR across all the bits of the vector, a word at a time.
 *
 * Returns BIT4_X as soon as any valid bit has its bbit set
 * (presumably marking X/Z bits -- confirm against the bit encoding).
 * Otherwise returns BIT4_1 if an odd number of abits are set, and
 * BIT4_0 if the count is even.
 */
vvp_bit4_t vvp_vector4_t::reduce_xor() const
{
	// With no bits the mask below would be all ones and the answer
	// would depend on unused storage. Return the XOR identity instead,
	// which is what a bit-by-bit loop starting at BIT4_0 gives.
      if (size_ == 0)
            return BIT4_0;

	// Mask selecting the valid bits of the last (or only) word.
      unsigned long mask = BIT_MASK(size_ % BITS_PER_WORD);

      if (size_ <= BITS_PER_WORD) {
	      // Small vector: the bits live directly in the *_val_ words.
            if ((bbits_val_ & mask) != 0UL)
                  return BIT4_X;
            return parity(abits_val_ & mask) ? BIT4_1 : BIT4_0;
      } else {
            unsigned words = (size_ + BITS_PER_WORD - 1) / BITS_PER_WORD;
            unsigned long val_a = 0UL;
            unsigned idx;
	      // XOR the full words together; the parity of the folded
	      // word equals the parity of all the bits combined.
            for (idx = 0; idx < words - 1; idx += 1) {
                  if (bbits_ptr_[idx] != 0UL)
                        return BIT4_X;
                  val_a ^= abits_ptr_[idx];
            }
	      // The last word may be partial, so apply the mask.
            if ((bbits_ptr_[idx] & mask) != 0UL)
                  return BIT4_X;
            val_a ^= abits_ptr_[idx] & mask;
            return parity(val_a) ? BIT4_1 : BIT4_0;
      }
}
|
||||
|
||||
vvp_vector4_t& vvp_vector4_t::operator &= (const vvp_vector4_t&that)
|
||||
{
|
||||
// The truth table is:
|
||||
|
|
|
|||
|
|
@ -326,6 +326,10 @@ class vvp_vector4_t {
|
|||
vvp_vector4_t& operator ^= (const vvp_vector4_t&that);
|
||||
vvp_vector4_t& operator += (int64_t);
|
||||
|
||||
vvp_bit4_t reduce_or() const;
|
||||
vvp_bit4_t reduce_and() const;
|
||||
vvp_bit4_t reduce_xor() const;
|
||||
|
||||
private:
|
||||
// Number of vvp_bit4_t bits that can be shoved into a word.
|
||||
enum { BITS_PER_WORD = 8*sizeof(unsigned long) };
|
||||
|
|
|
|||
Loading…
Reference in New Issue