Add %cmpi/s and %cmpi/u instructions for performance
These bypass the vec4 stack in some common cases, saving instructions and vec4 manipulations. Also, make a minor improvement to the %flag_set/vec4 instruction, and kill a few warnings.
This commit is contained in:
parent
2acc9fbdee
commit
04bdfbccee
|
|
@ -558,18 +558,27 @@ static void draw_binary_vec4_le(ivl_expr_t expr)
|
|||
draw_eval_vec4(le);
|
||||
resize_vec4_wid(le, use_wid);
|
||||
|
||||
draw_eval_vec4(re);
|
||||
resize_vec4_wid(re, use_wid);
|
||||
if (ivl_expr_width(re)==use_wid && test_immediate_vec4_ok(re)) {
|
||||
/* Special case: If the right operand can be handled as
|
||||
an immediate operand, then use that instead. */
|
||||
char opcode[8];
|
||||
snprintf(opcode, sizeof opcode, "%%cmpi/%c", s_flag);
|
||||
draw_immediate_vec4(re, opcode);
|
||||
|
||||
} else {
|
||||
draw_eval_vec4(re);
|
||||
resize_vec4_wid(re, use_wid);
|
||||
|
||||
fprintf(vvp_out, " %%cmp/%c;\n", s_flag);
|
||||
}
|
||||
|
||||
switch (use_opcode) {
|
||||
case 'L':
|
||||
fprintf(vvp_out, " %%cmp/%c;\n", s_flag);
|
||||
fprintf(vvp_out, " %%flag_get/vec4 4;\n");
|
||||
fprintf(vvp_out, " %%flag_get/vec4 5;\n");
|
||||
fprintf(vvp_out, " %%or;\n");
|
||||
break;
|
||||
case '<':
|
||||
fprintf(vvp_out, " %%cmp/%c;\n", s_flag);
|
||||
fprintf(vvp_out, " %%flag_get/vec4 5;\n");
|
||||
break;
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -62,8 +62,10 @@ extern bool of_CASSIGN_VEC4_OFF(vthread_t thr, vvp_code_t code);
|
|||
extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code);
|
||||
extern bool of_CAST2(vthread_t thr, vvp_code_t code);
|
||||
extern bool of_CMPS(vthread_t thr, vvp_code_t code);
|
||||
extern bool of_CMPIS(vthread_t thr, vvp_code_t code);
|
||||
extern bool of_CMPSTR(vthread_t thr, vvp_code_t code);
|
||||
extern bool of_CMPU(vthread_t thr, vvp_code_t code);
|
||||
extern bool of_CMPIU(vthread_t thr, vvp_code_t code);
|
||||
extern bool of_CMPWR(vthread_t thr, vvp_code_t code);
|
||||
extern bool of_CMPWS(vthread_t thr, vvp_code_t code);
|
||||
extern bool of_CMPWU(vthread_t thr, vvp_code_t code);
|
||||
|
|
|
|||
|
|
@ -120,6 +120,8 @@ static const struct opcode_table_s opcode_table[] = {
|
|||
{ "%cmp/wu", of_CMPWU, 2, {OA_BIT1, OA_BIT2, OA_NONE} },
|
||||
{ "%cmp/x", of_CMPX, 0, {OA_NONE, OA_NONE, OA_NONE} },
|
||||
{ "%cmp/z", of_CMPZ, 0, {OA_NONE, OA_NONE, OA_NONE} },
|
||||
{ "%cmpi/s", of_CMPIS, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
|
||||
{ "%cmpi/u", of_CMPIU, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
|
||||
{ "%concat/str", of_CONCAT_STR, 0,{OA_NONE, OA_NONE, OA_NONE} },
|
||||
{ "%concat/vec4", of_CONCAT_VEC4, 0,{OA_NONE, OA_NONE, OA_NONE} },
|
||||
{ "%concati/str", of_CONCATI_STR, 1,{OA_STRING,OA_NONE, OA_NONE} },
|
||||
|
|
|
|||
|
|
@ -252,6 +252,8 @@ vector2 (binary) value, and push the result.
|
|||
|
||||
* %cmp/s
|
||||
* %cmp/u
|
||||
* %cmpi/s <vala>, <valb>, <wid>
|
||||
* %cmpi/u <vala>, <valb>, <wid>
|
||||
|
||||
These instructions perform a generic comparison of two vectors of
|
||||
equal size. Two values are pulled from the top of the stack, and not
|
||||
|
|
|
|||
123
vvp/vthread.cc
123
vvp/vthread.cc
|
|
@ -1476,27 +1476,12 @@ bool of_CAST2(vthread_t thr, vvp_code_t)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* %cmp/s
|
||||
*
|
||||
* Pop the operands from the stack, and do not replace them. The
|
||||
* results are written to flag bits:
|
||||
*
|
||||
* 4: eq (equal)
|
||||
* 5: lt (less than)
|
||||
* 6: eeq (case equal)
|
||||
*/
|
||||
bool of_CMPS(vthread_t thr, vvp_code_t)
|
||||
static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
|
||||
{
|
||||
vvp_bit4_t eq = BIT4_1;
|
||||
vvp_bit4_t eeq = BIT4_1;
|
||||
vvp_bit4_t lt = BIT4_0;
|
||||
|
||||
// We are going to pop these and push nothing in their
|
||||
// place, but for now it is more efficient to use a constant
|
||||
// reference. When we finish, pop the stack without copies.
|
||||
const vvp_vector4_t&rval = thr->peek_vec4(0);
|
||||
const vvp_vector4_t&lval = thr->peek_vec4(1);
|
||||
|
||||
assert(rval.size() == lval.size());
|
||||
|
||||
|
|
@ -1507,8 +1492,7 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
|
|||
thr->flags[4] = BIT4_X; // eq
|
||||
thr->flags[5] = BIT4_X; // lt
|
||||
thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
|
||||
thr->pop_vec4(2);
|
||||
return true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Past this point, we know we are dealing only with fully
|
||||
|
|
@ -1550,11 +1534,55 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
|
|||
thr->flags[4] = eq;
|
||||
thr->flags[5] = lt;
|
||||
thr->flags[6] = eeq;
|
||||
}
|
||||
|
||||
/*
|
||||
* %cmp/s
|
||||
*
|
||||
* Pop the operands from the stack, and do not replace them. The
|
||||
* results are written to flag bits:
|
||||
*
|
||||
* 4: eq (equal)
|
||||
* 5: lt (less than)
|
||||
* 6: eeq (case equal)
|
||||
*/
|
||||
bool of_CMPS(vthread_t thr, vvp_code_t)
|
||||
{
|
||||
// We are going to pop these and push nothing in their
|
||||
// place, but for now it is more efficient to use a constant
|
||||
// reference. When we finish, pop the stack without copies.
|
||||
const vvp_vector4_t&rval = thr->peek_vec4(0);
|
||||
const vvp_vector4_t&lval = thr->peek_vec4(1);
|
||||
|
||||
do_CMPS(thr, lval, rval);
|
||||
|
||||
thr->pop_vec4(2);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* %cmpi/s <vala>, <valb>, <wid>
|
||||
*
|
||||
 * Pop 1 operand from the stack, and get the other operand from the arguments.
|
||||
*/
|
||||
bool of_CMPIS(vthread_t thr, vvp_code_t cp)
|
||||
{
|
||||
unsigned wid = cp->number;
|
||||
|
||||
vvp_vector4_t&lval = thr->peek_vec4();
|
||||
|
||||
// I expect that most of the bits of an immediate value are
|
||||
// going to be zero, so start the immediate operand with all zero
|
||||
// bits. Then we only need to replace the bits that are different.
|
||||
vvp_vector4_t rval (wid, BIT4_0);
|
||||
get_immediate_rval (cp, rval);
|
||||
|
||||
do_CMPS(thr, lval, rval);
|
||||
|
||||
thr->pop_vec4(1);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool of_CMPSTR(vthread_t thr, vvp_code_t)
|
||||
{
|
||||
string re = thr->pop_str();
|
||||
|
|
@ -1582,8 +1610,9 @@ bool of_CMPSTR(vthread_t thr, vvp_code_t)
|
|||
return true;
|
||||
}
|
||||
|
||||
bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid,
|
||||
const vvp_vector4_t&lval, const vvp_vector4_t&rval)
|
||||
static void of_CMPU_the_hard_way(vthread_t thr, unsigned wid,
|
||||
const vvp_vector4_t&lval,
|
||||
const vvp_vector4_t&rval)
|
||||
{
|
||||
vvp_bit4_t eq = BIT4_1;
|
||||
vvp_bit4_t eeq = BIT4_1;
|
||||
|
|
@ -1610,18 +1639,13 @@ bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid,
|
|||
thr->flags[4] = eq;
|
||||
thr->flags[5] = BIT4_X;
|
||||
thr->flags[6] = eeq;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool of_CMPU(vthread_t thr, vvp_code_t cp)
|
||||
static void do_CMPU(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
|
||||
{
|
||||
vvp_bit4_t eq = BIT4_1;
|
||||
vvp_bit4_t lt = BIT4_0;
|
||||
|
||||
vvp_vector4_t rval = thr->pop_vec4();
|
||||
vvp_vector4_t lval = thr->pop_vec4();
|
||||
|
||||
if (rval.size() != lval.size()) {
|
||||
cerr << "VVP ERROR: %cmp/u operand width mismatch: lval=" << lval
|
||||
<< ", rval=" << rval << endl;
|
||||
|
|
@ -1630,12 +1654,12 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
|
|||
unsigned wid = lval.size();
|
||||
|
||||
unsigned long*larray = lval.subarray(0,wid);
|
||||
if (larray == 0) return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
|
||||
if (larray == 0) return of_CMPU_the_hard_way(thr, wid, lval, rval);
|
||||
|
||||
unsigned long*rarray = rval.subarray(0,wid);
|
||||
if (rarray == 0) {
|
||||
delete[]larray;
|
||||
return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
|
||||
return of_CMPU_the_hard_way(thr, wid, lval, rval);
|
||||
}
|
||||
|
||||
unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
|
||||
|
|
@ -1657,10 +1681,43 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
|
|||
thr->flags[4] = eq;
|
||||
thr->flags[5] = lt;
|
||||
thr->flags[6] = eq;
|
||||
}
|
||||
|
||||
bool of_CMPU(vthread_t thr, vvp_code_t)
|
||||
{
|
||||
|
||||
vvp_vector4_t rval = thr->pop_vec4();
|
||||
vvp_vector4_t lval = thr->pop_vec4();
|
||||
|
||||
do_CMPU(thr, lval, rval);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* %cmpi/u <vala>, <valb>, <wid>
|
||||
*
|
||||
 * Pop 1 operand from the stack, and get the other operand from the arguments.
|
||||
*/
|
||||
bool of_CMPIU(vthread_t thr, vvp_code_t cp)
|
||||
{
|
||||
unsigned wid = cp->number;
|
||||
|
||||
vvp_vector4_t&lval = thr->peek_vec4();
|
||||
|
||||
// I expect that most of the bits of an immediate value are
|
||||
// going to be zero, so start the immediate operand with all zero
|
||||
// bits. Then we only need to replace the bits that are different.
|
||||
vvp_vector4_t rval (wid, BIT4_0);
|
||||
get_immediate_rval (cp, rval);
|
||||
|
||||
do_CMPU(thr, lval, rval);
|
||||
|
||||
thr->pop_vec4(1);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* %cmp/x
|
||||
*/
|
||||
|
|
@ -2632,8 +2689,9 @@ bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t cp)
|
|||
int flag = cp->number;
|
||||
assert(flag < vthread_s::FLAGS_COUNT);
|
||||
|
||||
vvp_vector4_t val = thr->pop_vec4();
|
||||
const vvp_vector4_t&val = thr->peek_vec4();
|
||||
thr->flags[flag] = val.value(0);
|
||||
thr->pop_vec4(1);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
@ -3397,8 +3455,7 @@ bool of_LOAD_VEC4A(vthread_t thr, vvp_code_t cp)
|
|||
return true;
|
||||
}
|
||||
|
||||
static void do_verylong_mod(vthread_t thr,
|
||||
vvp_vector4_t&vala, const vvp_vector4_t&valb,
|
||||
static void do_verylong_mod(vvp_vector4_t&vala, const vvp_vector4_t&valb,
|
||||
bool left_is_neg, bool right_is_neg)
|
||||
{
|
||||
bool out_is_neg = left_is_neg;
|
||||
|
|
@ -3579,7 +3636,7 @@ bool of_MOD(vthread_t thr, vvp_code_t)
|
|||
return true;
|
||||
|
||||
} else {
|
||||
do_verylong_mod(thr, vala, valb, false, false);
|
||||
do_verylong_mod(vala, valb, false, false);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@ -3642,7 +3699,7 @@ bool of_MOD_S(vthread_t thr, vvp_code_t)
|
|||
|
||||
bool left_is_neg = vala.value(vala.size()-1) == BIT4_1;
|
||||
bool right_is_neg = valb.value(valb.size()-1) == BIT4_1;
|
||||
do_verylong_mod(thr, vala, valb, left_is_neg, right_is_neg);
|
||||
do_verylong_mod(vala, valb, left_is_neg, right_is_neg);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue