Optimize vvp_vector4 vector handling.

Improve vvp_vector4_t methods copy_bits and the part selecting constructor
to make better use of vector words. Eliminate bit-by-bit processing by
these methods to take advantage of host processor words.

Improve vthread_bits_to_vector to use these improved methods, and update
the %load/av and %set/v instructions to take advantage of these changes.
This commit is contained in:
Stephen Williams 2008-05-23 14:30:32 -07:00
parent d2106a3d3a
commit 492b240304
2 changed files with 137 additions and 27 deletions

View File

@ -210,12 +210,7 @@ static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr,
return vvp_vector4_t(thr->bits4, bit, wid);
} else {
vvp_vector4_t value(wid);
vvp_bit4_t bit_val = thr_index_to_bit4[bit];
for (unsigned idx = 0; idx < wid; idx +=1) {
value.set_bit(idx, bit_val);
}
return value;
return vvp_vector4_t(wid, thr_index_to_bit4[bit]);
}
}
@ -2282,13 +2277,15 @@ bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
if (word.size() != wid) {
fprintf(stderr, "internal error: array width=%u, word.size()=%u, wid=%u\n",
0, word.size(), wid);
assert(word.size() == wid);
}
assert(word.size() == wid);
for (unsigned idx = 0 ; idx < wid ; idx += 1, bit += 1) {
vvp_bit4_t val = word.value(idx);
thr_put_bit(thr, bit, val);
}
/* Check the address once, before we scan the vector. */
thr_check_addr(thr, bit+wid-1);
/* Copy the vector bits into the bits4 vector. Do the copy
directly to skip the excess calls to thr_check_addr. */
thr->bits4.set_vec(bit, word);
return true;
}
@ -3526,16 +3523,7 @@ bool of_SET_VEC(vthread_t thr, vvp_code_t cp)
/* set the value into port 0 of the destination. */
vvp_net_ptr_t ptr (cp->net, 0);
if (bit >= 4) {
vvp_vector4_t value(thr->bits4,bit,wid);
vvp_send_vec4(ptr, value);
} else {
/* Make a vector of the desired width. */
vvp_bit4_t bit_val = thr_index_to_bit4[bit];
vvp_vector4_t value(wid, bit_val);
vvp_send_vec4(ptr, value);
}
vvp_send_vec4(ptr, vthread_bits_to_vector(thr, bit, wid));
return true;
}

View File

@ -202,10 +202,84 @@ void vvp_send_long_pv(vvp_net_ptr_t ptr, long val,
/*
 * Copy the low bits of `that` into the low bits of this vector. The
 * number of bits copied is the smaller of the two sizes. The code
 * reads/writes the abits_val_/bbits_val_ members when a vector has at
 * most BITS_PER_WORD bits, and the abits_ptr_/bbits_ptr_ word arrays
 * when it has more, so each size combination gets its own case below.
 */
void vvp_vector4_t::copy_bits(const vvp_vector4_t&that)
{
/* NOTE(review): this listing is a diff whose +/- markers were
   stripped. The next three lines look like the removed bit-by-bit
   loop: bits_to_copy is declared again further down in the same
   scope, so both declarations cannot coexist -- confirm against
   the real file before relying on them. */
unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
for (unsigned idx = 0; idx < bits_to_copy; idx += 1)
set_bit(idx, that.value(idx));
/* Same size: copy every word of both bit planes (or the two
   single-word values) and finish. */
if (size_ == that.size_) {
if (size_ > BITS_PER_WORD) {
/* Round up so a partially filled last word is copied too. */
unsigned words = (size_+BITS_PER_WORD-1) / BITS_PER_WORD;
for (unsigned idx = 0 ; idx < words ; idx += 1)
abits_ptr_[idx] = that.abits_ptr_[idx];
for (unsigned idx = 0 ; idx < words ; idx += 1)
bbits_ptr_[idx] = that.bbits_ptr_[idx];
} else {
abits_val_ = that.abits_val_;
bbits_val_ = that.bbits_val_;
}
return;
}
/* Now we know that the sizes of this and that are definitely
   different. We can use that in code below. In any case, we
   need to copy only the smaller of the sizes. */
/* If source and destination are both short, then mask/copy
   the bit values. */
if (size_ <= BITS_PER_WORD && that.size_ <= BITS_PER_WORD) {
/* The sizes differ and neither exceeds BITS_PER_WORD, so the
   smaller one is below BITS_PER_WORD and the shift count stays
   under the word width. */
unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
unsigned long mask = (1UL << bits_to_copy) - 1UL;
/* Clear the low bits being replaced, keep the bits above them. */
abits_val_ &= ~mask;
bbits_val_ &= ~mask;
abits_val_ |= that.abits_val_&mask;
bbits_val_ |= that.bbits_val_&mask;
return;
}
/* Now we know that either source or destination are long. If
   the destination is short, then mask/copy from the low word
   of the long source. */
if (size_ <= BITS_PER_WORD) {
/* Every destination bit is overwritten here: take the whole low
   source word, then drop the bits at or above size_. */
abits_val_ = that.abits_ptr_[0];
bbits_val_ = that.bbits_ptr_[0];
if (size_ < BITS_PER_WORD) {
unsigned long mask = (1UL << size_) - 1UL;
abits_val_ &= mask;
bbits_val_ &= mask;
}
return;
}
/* Now we know that the destination must be long. If the
   source is short, then mask/copy from its value. */
if (that.size_ <= BITS_PER_WORD) {
unsigned long mask;
if (that.size_ < BITS_PER_WORD) {
mask = (1UL << that.size_) - 1UL;
abits_ptr_[0] &= ~mask;
bbits_ptr_[0] &= ~mask;
} else {
/* NOTE(review): when the source is exactly one full word, word 0
   of the destination is NOT cleared in this branch, so the |=
   below merges the source bits into whatever word 0 already
   holds instead of replacing it. This looks like it should be a
   plain assignment (or a clear first) -- confirm and fix. */
mask = -1UL;
}
abits_ptr_[0] |= that.abits_val_&mask;
bbits_ptr_[0] |= that.bbits_val_&mask;
return;
}
/* Finally, we know that source and destination are long. copy
   words until we get to the last. */
unsigned bits_to_copy = (that.size_ < size_) ? that.size_ : size_;
unsigned word = 0;
while (bits_to_copy >= BITS_PER_WORD) {
abits_ptr_[word] = that.abits_ptr_[word];
bbits_ptr_[word] = that.bbits_ptr_[word];
bits_to_copy -= BITS_PER_WORD;
word += 1;
}
/* Fewer than BITS_PER_WORD bits remain: replace only those low bits
   of the next word and leave the destination's higher bits alone. */
if (bits_to_copy > 0) {
unsigned long mask = (1UL << bits_to_copy) - 1UL;
abits_ptr_[word] &= ~mask;
bbits_ptr_[word] &= ~mask;
abits_ptr_[word] |= that.abits_ptr_[word] & mask;
bbits_ptr_[word] |= that.bbits_ptr_[word] & mask;
}
}
void vvp_vector4_t::copy_from_(const vvp_vector4_t&that)
@ -283,10 +357,58 @@ vvp_vector4_t::vvp_vector4_t(const vvp_vector4_t&that,
dst += 1;
}
} else {
for (unsigned idx = 0 ; idx < wid ; idx += 1) {
set_bit(idx, that.value(adr+idx));
} else if (that.size_ > BITS_PER_WORD) {
/* In this case, the subvector fits in a single word,
but the source is large. */
unsigned ptr = adr / BITS_PER_WORD;
unsigned long off = adr % BITS_PER_WORD;
unsigned trans = BITS_PER_WORD - off;
if (trans > wid)
trans = wid;
if (trans == BITS_PER_WORD) {
// Very special case: Copy exactly 1 perfectly
// aligned word.
abits_val_ = that.abits_ptr_[ptr];
bbits_val_ = that.bbits_ptr_[ptr];
} else {
// lmask is the low bits of the destination,
// masked into the source.
unsigned long lmask = (1UL<<trans) - 1UL;
lmask <<= off;
// The low bits of the result.
abits_val_ = (that.abits_ptr_[ptr] & lmask) >> off;
bbits_val_ = (that.bbits_ptr_[ptr] & lmask) >> off;
if (trans < wid) {
// If there are more bits, then get them
// from the bottom of the next word of the
// source.
unsigned long hmask = (1UL << (wid-trans)) - 1UL;
// The high bits of the result.
abits_val_ |= (that.abits_ptr_[ptr+1]&hmask) << trans;
bbits_val_ |= (that.bbits_ptr_[ptr+1]&hmask) << trans;
}
}
} else if (size_ == BITS_PER_WORD) {
/* We know that source and destination are short. If the
destination is a full word, then we know the copy is
aligned and complete. */
abits_val_ = that.abits_val_;
bbits_val_ = that.bbits_val_;
} else {
/* Finally, the source and destination vectors are both
short, so there is a single mask/shift/copy. */
unsigned long mask = (1UL << size_) - 1UL;
mask <<= adr;
abits_val_ = (that.abits_val_ & mask) >> adr;
bbits_val_ = (that.bbits_val_ & mask) >> adr;
}
}