Signed load-and-add for arrays.

The load-and-add for vectors, %load/vp0/s, can be combined with the
load-and-add for array words, and %load/avp0/s is added to round
out the combinations. This can result in fewer instructions when
words are padded in arithmetic expressions.
This commit is contained in:
Stephen Williams 2008-06-14 19:59:57 -07:00
parent 6521ceaf92
commit 9013dcb527
5 changed files with 72 additions and 60 deletions

View File

@ -1987,17 +1987,17 @@ static void draw_signal_dest(ivl_expr_t exp, struct vector_info res,
draw_eval_expr_into_integer(ix, 3);
if (add_index < 0) {
fprintf(vvp_out, " %%load/av %u, v%p, %u;\n",
fprintf(vvp_out, " %%load/av %u, v%p, %u;\n",
res.base, sig, swid);
pad_expr_in_place(exp, res, swid);
} else {
assert(add_index == 0);
const char*sign_flag = (add_index>0)? "/s" : "";
/* Add an immediate value to an array value. */
fprintf(vvp_out, " %%ix/load 0, %lu;\n", immediate);
fprintf(vvp_out, " %%load/avp0 %u, v%p, %u;\n",
res.base, sig, swid);
fprintf(vvp_out, " %%ix/load 0, %lu;\n", immediate);
fprintf(vvp_out, " %%load/avp0%s %u, v%p, %u;\n",
sign_flag, res.base, sig, res.wid);
}
pad_expr_in_place(exp, res, swid);
return;
}

View File

@ -97,6 +97,7 @@ extern bool of_JMP1(vthread_t thr, vvp_code_t code);
extern bool of_JOIN(vthread_t thr, vvp_code_t code);
extern bool of_LOAD_AV(vthread_t thr, vvp_code_t code);
extern bool of_LOAD_AVP0(vthread_t thr, vvp_code_t code);
extern bool of_LOAD_AVP0_S(vthread_t thr, vvp_code_t code);
extern bool of_LOAD_AVX_P(vthread_t thr, vvp_code_t code);
extern bool of_LOAD_VEC(vthread_t thr, vvp_code_t code);
extern bool of_LOAD_VP0(vthread_t thr, vvp_code_t code);

View File

@ -140,6 +140,7 @@ const static struct opcode_table_s opcode_table[] = {
{ "%join", of_JOIN, 0, {OA_NONE, OA_NONE, OA_NONE} },
{ "%load/av",of_LOAD_AV,3, {OA_BIT1, OA_ARR_PTR, OA_BIT2} },
{ "%load/avp0",of_LOAD_AVP0,3, {OA_BIT1, OA_ARR_PTR, OA_BIT2} },
{ "%load/avp0/s",of_LOAD_AVP0_S,3,{OA_BIT1, OA_ARR_PTR, OA_BIT2} },
{ "%load/avx.p",of_LOAD_AVX_P,3,{OA_BIT1, OA_ARR_PTR, OA_BIT2} },
{ "%load/v", of_LOAD_VEC,3, {OA_BIT1, OA_FUNC_PTR, OA_BIT2} },
{ "%load/vp0",of_LOAD_VP0,3,{OA_BIT1, OA_FUNC_PTR, OA_BIT2} },

View File

@ -418,10 +418,16 @@ address is in index register 3. The width should match the width of
the array word.
* %load/avp0 <bit>, <array-label>, <wid>
* %load/avp0/s <bit>, <array-label>, <wid>
This instruction is a mix of %load/av and %load/vp0. It loads an array
value like %load/av and then adds a value from index register 0 to the
result like %load/vp0.
result like %load/vp0. The loaded value is zero-extended to <wid>,
then added arithmetically to the signed index register 0. The result
is then stored in <bit>.
The %load/avp0/s instruction is the same, except that the loaded
vector is sign-extended (instead of zero-extended) before the addition.
* %load/avx.p <bit>, <array-label>, <index>

View File

@ -2381,6 +2381,46 @@ bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
return true;
}
/*
* %load/vp0, %load/vp0/s, %load/avp0 and %load/avp0/s share this function.
*
* Adds the signed value in thr->words[0].w_int to sig_value and writes
* the low <wid> bits of the sum into the thread bit space starting at
* <bit>. <bit> is cp->bit_idx[0] and <wid> is cp->bit_idx[1].
*
* The callers visible alongside this function build sig_value at <wid>
* bits before calling, so any zero- or sign-extension of the loaded
* value is the caller's job, not this function's.
*
* If sig_value cannot be converted to an array of machine words, the
* destination is filled with BIT4_X instead.
*/
static void load_vp0_common(vthread_t thr, vvp_code_t cp, const vvp_vector4_t&sig_value)
{
unsigned bit = cp->bit_idx[0];
unsigned wid = cp->bit_idx[1];
int64_t addend = thr->words[0].w_int;
/* Check the address once, before we scan the vector. */
thr_check_addr(thr, bit+wid-1);
/* Get the low <wid> bits as an array of unsigned long words. A null
result is treated as "no numeric value"; presumably that happens
when the vector holds x/z bits -- confirm against
vvp_vector4_t::subarray. */
unsigned long*val = sig_value.subarray(0, wid);
if (val == 0) {
vvp_vector4_t tmp(wid, BIT4_X);
thr->bits4.set_vec(bit, tmp);
return;
}
/* Number of machine words needed to hold <wid> bits, rounded up. */
unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;
unsigned long carry = 0;
/* NOTE(review): if unsigned long is narrower than 64 bits, this
conversion drops the upper bits of addend. */
unsigned long imm = addend;
/* The addend is applied as a multi-word value: the first word is
imm, and every later word is the sign fill (all zeros for a
non-negative addend, all ones for a negative one). The carry
variable is handed to add_with_carry on every word; presumably it
is updated by reference to chain the words -- confirm. */
if (addend >= 0) {
for (unsigned idx = 0 ; idx < words ; idx += 1) {
val[idx] = add_with_carry(val[idx], imm, carry);
imm = 0UL;
}
} else {
for (unsigned idx = 0 ; idx < words ; idx += 1) {
val[idx] = add_with_carry(val[idx], imm, carry);
imm = -1UL;
}
}
/* Copy the vector bits into the bits4 vector. Do the copy
directly to skip the excess calls to thr_check_addr. */
thr->bits4.setarray(bit, wid, val);
/* The word array from subarray() is released here with delete[]. */
delete[]val;
}
/*
* %load/avp0 <bit>, <array-label>, <wid> ;
*
@ -2393,30 +2433,31 @@ bool of_LOAD_AV(vthread_t thr, vvp_code_t cp)
*/
/*
* Handler for %load/avp0: fetch an array word, widen it to <wid> bits
* with zero fill, and hand it to load_vp0_common() to add index
* register 0 and store the result. Always returns true.
*
* NOTE(review): this listing appears to show removed and added patch
* lines together without +/- markers. As shown, `bit` and `addend` are
* declared but never used inside this function, and the array word is
* fetched twice (once into `word`, once into sig_value) -- confirm
* against the real post-patch source before relying on any of it.
*/
bool of_LOAD_AVP0(vthread_t thr, vvp_code_t cp)
{
unsigned bit = cp->bit_idx[0];
unsigned wid = cp->bit_idx[1];
int64_t addend = thr->words[0].w_int;
/* The array address comes from thread word 3. */
unsigned adr = thr->words[3].w_int;
vvp_vector4_t word = array_get_word(cp->array, adr);
/* We need a vector this wide to make the math work correctly.
* Copy the base bits into the vector, but keep the width. */
/* Starting from an all-BIT4_0 vector gives the zero-extension. */
vvp_vector4_t sig_value(wid, BIT4_0);
sig_value.copy_bits(array_get_word(cp->array, adr));
/* Print a diagnostic before the assert below fires on a width
mismatch. The first %u argument is a literal 0, not a real
array width. */
if (word.size() != wid) {
fprintf(stderr, "internal error: array width=%u, word.size()=%u, wid=%u\n",
0, word.size(), wid);
}
assert(word.size() == wid);
load_vp0_common(thr, cp, sig_value);
return true;
}
/* Add the addend value */
word += addend;
/*
* Handler for %load/avp0/s: like %load/avp0, but the array word is
* widened to <wid> bits using its own top bit as the fill value (sign
* extension) before load_vp0_common() adds index register 0 and stores
* the result. Always returns true.
*
* NOTE(review): this listing appears to show removed and added patch
* lines together without +/- markers. The thr_check_addr() and
* set_vec() statements below use `bit` and `word`, which are not
* declared anywhere in this function as shown -- confirm against the
* real post-patch source.
*/
bool of_LOAD_AVP0_S(vthread_t thr, vvp_code_t cp)
{
unsigned wid = cp->bit_idx[1];
/* The array address comes from thread word 3. */
unsigned adr = thr->words[3].w_int;
/* Check the address once, before we scan the vector. */
thr_check_addr(thr, bit+wid-1);
/* Copy the vector bits into the bits4 vector. Do the copy
directly to skip the excess calls to thr_check_addr. */
thr->bits4.set_vec(bit, word);
vvp_vector4_t tmp (array_get_word(cp->array, adr));
/* We need a vector this wide to make the math work correctly.
* Copy the base bits into the vector, but keep the width. */
/* The fill value is the most significant bit of the loaded word.
NOTE(review): tmp.size()-1 would wrap around if the word could
ever have size 0 -- presumably it cannot; confirm. */
vvp_vector4_t sig_value(wid, tmp.value(tmp.size()-1));
sig_value.copy_bits(tmp);
load_vp0_common(thr, cp, sig_value);
return true;
}
@ -2511,43 +2552,6 @@ bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
* index 0. The <wid> is the expected result width not the vector width.
*/
/*
* Shared load-and-add helper: adds the signed value in
* thr->words[0].w_int to sig_value and writes the low <wid> bits of the
* sum into the thread bit space starting at <bit>. <bit> is
* cp->bit_idx[0] and <wid> is cp->bit_idx[1]. If sig_value cannot be
* converted to an array of machine words, the destination is filled
* with BIT4_X instead.
*/
static void load_vp0_common(vthread_t thr, vvp_code_t cp, const vvp_vector4_t&sig_value)
{
unsigned bit = cp->bit_idx[0];
unsigned wid = cp->bit_idx[1];
int64_t addend = thr->words[0].w_int;
/* Check the address once, before we scan the vector. */
thr_check_addr(thr, bit+wid-1);
/* Get the low <wid> bits as an array of unsigned long words. A null
result is treated as "no numeric value"; presumably that happens
when the vector holds x/z bits -- confirm against
vvp_vector4_t::subarray. */
unsigned long*val = sig_value.subarray(0, wid);
if (val == 0) {
vvp_vector4_t tmp(wid, BIT4_X);
thr->bits4.set_vec(bit, tmp);
return;
}
/* Number of machine words needed to hold <wid> bits, rounded up. */
unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;
unsigned long carry = 0;
/* NOTE(review): if unsigned long is narrower than 64 bits, this
conversion drops the upper bits of addend. */
unsigned long imm = addend;
/* The addend is applied as a multi-word value: the first word is
imm, and every later word is the sign fill (all zeros for a
non-negative addend, all ones for a negative one). The carry
variable is handed to add_with_carry on every word; presumably it
is updated by reference to chain the words -- confirm. */
if (addend >= 0) {
for (unsigned idx = 0 ; idx < words ; idx += 1) {
val[idx] = add_with_carry(val[idx], imm, carry);
imm = 0UL;
}
} else {
for (unsigned idx = 0 ; idx < words ; idx += 1) {
val[idx] = add_with_carry(val[idx], imm, carry);
imm = -1UL;
}
}
/* Copy the vector bits into the bits4 vector. Do the copy
directly to skip the excess calls to thr_check_addr. */
thr->bits4.setarray(bit, wid, val);
/* The word array from subarray() is released here with delete[]. */
delete[]val;
}
bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
{
unsigned wid = cp->bit_idx[1];