Optimize vvp_vector4 vector handling.
Improve the vvp_vector4_t methods copy_bits and the part-selecting constructor to make better use of vector words. Eliminate bit-by-bit processing in these methods to take advantage of host processor words. Improve vthread_bits_to_vector to use these improved methods, and update the %load/av and %set/v instructions to take advantage of these changes.
This commit is contained in:
parent
d2106a3d3a
commit
492b240304
|
|
@ -210,12 +210,7 @@ static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr,
|
|||
return vvp_vector4_t(thr->bits4, bit, wid);
|
||||
|
||||
} else {
|
||||
vvp_vector4_t value(wid);
|
||||
vvp_bit4_t bit_val = thr_index_to_bit4[bit];
|
||||
for (unsigned idx = 0; idx < wid; idx +=1) {
|
||||
value.set_bit(idx, bit_val);
|
||||
}
|
||||
return value;
|
||||
return vvp_vector4_t(wid, thr_index_to_bit4[bit]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2282,13 +2277,15 @@ bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
|
|||
if (word.size() != wid) {
|
||||
fprintf(stderr, "internal error: array width=%u, word.size()=%u, wid=%u\n",
|
||||
0, word.size(), wid);
|
||||
assert(word.size() == wid);
|
||||
}
|
||||
assert(word.size() == wid);
|
||||
|
||||
for (unsigned idx = 0 ; idx < wid ; idx += 1, bit += 1) {
|
||||
vvp_bit4_t val = word.value(idx);
|
||||
thr_put_bit(thr, bit, val);
|
||||
}
|
||||
/* Check the address once, before we scan the vector. */
|
||||
thr_check_addr(thr, bit+wid-1);
|
||||
|
||||
/* Copy the vector bits into the bits4 vector. Do the copy
|
||||
directly to skip the excess calls to thr_check_addr. */
|
||||
thr->bits4.set_vec(bit, word);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
@ -3526,16 +3523,7 @@ bool of_SET_VEC(vthread_t thr, vvp_code_t cp)
|
|||
/* set the value into port 0 of the destination. */
|
||||
vvp_net_ptr_t ptr (cp->net, 0);
|
||||
|
||||
if (bit >= 4) {
|
||||
vvp_vector4_t value(thr->bits4,bit,wid);
|
||||
vvp_send_vec4(ptr, value);
|
||||
|
||||
} else {
|
||||
/* Make a vector of the desired width. */
|
||||
vvp_bit4_t bit_val = thr_index_to_bit4[bit];
|
||||
vvp_vector4_t value(wid, bit_val);
|
||||
vvp_send_vec4(ptr, value);
|
||||
}
|
||||
vvp_send_vec4(ptr, vthread_bits_to_vector(thr, bit, wid));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
|||
134
vvp/vvp_net.cc
134
vvp/vvp_net.cc
|
|
@ -202,10 +202,84 @@ void vvp_send_long_pv(vvp_net_ptr_t ptr, long val,
|
|||
|
||||
void vvp_vector4_t::copy_bits(const vvp_vector4_t&that)
|
||||
{
|
||||
unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
|
||||
|
||||
for (unsigned idx = 0; idx < bits_to_copy; idx += 1)
|
||||
set_bit(idx, that.value(idx));
|
||||
if (size_ == that.size_) {
|
||||
if (size_ > BITS_PER_WORD) {
|
||||
unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;
|
||||
for (unsigned idx = 0 ; idx < words ; idx += 1)
|
||||
abits_ptr_[idx] = that.abits_ptr_[idx];
|
||||
for (unsigned idx = 0 ; idx < words ; idx += 1)
|
||||
bbits_ptr_[idx] = that.bbits_ptr_[idx];
|
||||
} else {
|
||||
abits_val_ = that.abits_val_;
|
||||
bbits_val_ = that.bbits_val_;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Now we know that the sizes of this and that are definitely
|
||||
different. We can use that in code below. In any case, we
|
||||
need to copy only the smaller of the sizes. */
|
||||
|
||||
/* If source and destination are both short, then mask/copy
|
||||
the bit values. */
|
||||
if (size_ <= BITS_PER_WORD && that.size_ <= BITS_PER_WORD) {
|
||||
unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
|
||||
unsigned long mask = (1UL << bits_to_copy) - 1UL;
|
||||
abits_val_ &= ~mask;
|
||||
bbits_val_ &= ~mask;
|
||||
abits_val_ |= that.abits_val_&mask;
|
||||
bbits_val_ |= that.bbits_val_&mask;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Now we know that either source or destination are long. If
|
||||
the destination is short, then mask/copy from the low word
|
||||
of the long source. */
|
||||
if (size_ <= BITS_PER_WORD) {
|
||||
abits_val_ = that.abits_ptr_[0];
|
||||
bbits_val_ = that.bbits_ptr_[0];
|
||||
if (size_ < BITS_PER_WORD) {
|
||||
unsigned long mask = (1UL << size_) - 1UL;
|
||||
abits_val_ &= mask;
|
||||
bbits_val_ &= mask;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Now we know that the destination must be long. If the
|
||||
source is short, then mask/copy from its value. */
|
||||
if (that.size_ <= BITS_PER_WORD) {
|
||||
unsigned long mask;
|
||||
if (that.size_ < BITS_PER_WORD) {
|
||||
mask = (1UL << that.size_) - 1UL;
|
||||
abits_ptr_[0] &= ~mask;
|
||||
bbits_ptr_[0] &= ~mask;
|
||||
} else {
|
||||
mask = -1UL;
|
||||
}
|
||||
abits_ptr_[0] |= that.abits_val_&mask;
|
||||
bbits_ptr_[0] |= that.bbits_val_&mask;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Finally, we know that source and destination are long. copy
|
||||
words until we get to the last. */
|
||||
unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
|
||||
unsigned word = 0;
|
||||
while (bits_to_copy >= BITS_PER_WORD) {
|
||||
abits_ptr_[word] = that.abits_ptr_[word];
|
||||
bbits_ptr_[word] = that.bbits_ptr_[word];
|
||||
bits_to_copy -= BITS_PER_WORD;
|
||||
word += 1;
|
||||
}
|
||||
if (bits_to_copy > 0) {
|
||||
unsigned long mask = (1UL << bits_to_copy) - 1UL;
|
||||
abits_ptr_[word] &= ~mask;
|
||||
bbits_ptr_[word] &= ~mask;
|
||||
abits_ptr_[word] |= that.abits_ptr_[word] & mask;
|
||||
bbits_ptr_[word] |= that.bbits_ptr_[word] & mask;
|
||||
}
|
||||
}
|
||||
|
||||
void vvp_vector4_t::copy_from_(const vvp_vector4_t&that)
|
||||
|
|
@ -283,10 +357,58 @@ vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that,
|
|||
dst += 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
|
||||
set_bit(idx, that.value(adr+idx));
|
||||
} else if (that.size_ > BITS_PER_WORD) {
|
||||
/* In this case, the subvector fits in a single word,
|
||||
but the source is large. */
|
||||
unsigned ptr = adr / BITS_PER_WORD;
|
||||
unsigned long off = adr % BITS_PER_WORD;
|
||||
unsigned trans = BITS_PER_WORD - off;
|
||||
if (trans > wid)
|
||||
trans = wid;
|
||||
|
||||
if (trans == BITS_PER_WORD) {
|
||||
// Very special case: Copy exactly 1 perfectly
|
||||
// aligned word.
|
||||
abits_val_ = that.abits_ptr_[ptr];
|
||||
bbits_val_ = that.bbits_ptr_[ptr];
|
||||
|
||||
} else {
|
||||
// lmask is the low bits of the destination,
|
||||
// masked into the source.
|
||||
unsigned long lmask = (1UL<<trans) - 1UL;
|
||||
lmask <<= off;
|
||||
|
||||
// The low bits of the result.
|
||||
abits_val_ = (that.abits_ptr_[ptr] & lmask) >> off;
|
||||
bbits_val_ = (that.bbits_ptr_[ptr] & lmask) >> off;
|
||||
|
||||
if (trans < wid) {
|
||||
// If there are more bits, then get them
|
||||
// from the bottom of the next word of the
|
||||
// source.
|
||||
unsigned long hmask = (1UL << (wid-trans)) - 1UL;
|
||||
|
||||
// The high bits of the result.
|
||||
abits_val_ |= (that.abits_ptr_[ptr+1]&hmask) << trans;
|
||||
bbits_val_ |= (that.bbits_ptr_[ptr+1]&hmask) << trans;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (size_ == BITS_PER_WORD) {
|
||||
/* We know that source and destination are short. If the
|
||||
destination is a full word, then we know the copy is
|
||||
aligned and complete. */
|
||||
abits_val_ = that.abits_val_;
|
||||
bbits_val_ = that.bbits_val_;
|
||||
|
||||
} else {
|
||||
/* Finally, the source and destination vectors are both
|
||||
short, so there is a single mask/shift/copy. */
|
||||
unsigned long mask = (1UL << size_) - 1UL;
|
||||
mask <<= adr;
|
||||
|
||||
abits_val_ = (that.abits_val_ & mask) >> adr;
|
||||
bbits_val_ = (that.bbits_val_ & mask) >> adr;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue