Add %cmpi/s and %cmpi/u instructions for performance

These bypass the vec4 stack in some common cases, saving instructions
and vec4 manipulations.

Also, minor improvement to the %flag/set/vec4 statement.

Kill a few warnings.
This commit is contained in:
Stephen Williams 2014-11-19 16:38:43 -08:00
parent 2acc9fbdee
commit 04bdfbccee
5 changed files with 109 additions and 37 deletions

View File

@ -558,18 +558,27 @@ static void draw_binary_vec4_le(ivl_expr_t expr)
draw_eval_vec4(le); draw_eval_vec4(le);
resize_vec4_wid(le, use_wid); resize_vec4_wid(le, use_wid);
if (ivl_expr_width(re)==use_wid && test_immediate_vec4_ok(re)) {
/* Special case: If the right operand can be handled as
an immediate operand, then use that instead. */
char opcode[8];
snprintf(opcode, sizeof opcode, "%%cmpi/%c", s_flag);
draw_immediate_vec4(re, opcode);
} else {
draw_eval_vec4(re); draw_eval_vec4(re);
resize_vec4_wid(re, use_wid); resize_vec4_wid(re, use_wid);
fprintf(vvp_out, " %%cmp/%c;\n", s_flag);
}
switch (use_opcode) { switch (use_opcode) {
case 'L': case 'L':
fprintf(vvp_out, " %%cmp/%c;\n", s_flag);
fprintf(vvp_out, " %%flag_get/vec4 4;\n"); fprintf(vvp_out, " %%flag_get/vec4 4;\n");
fprintf(vvp_out, " %%flag_get/vec4 5;\n"); fprintf(vvp_out, " %%flag_get/vec4 5;\n");
fprintf(vvp_out, " %%or;\n"); fprintf(vvp_out, " %%or;\n");
break; break;
case '<': case '<':
fprintf(vvp_out, " %%cmp/%c;\n", s_flag);
fprintf(vvp_out, " %%flag_get/vec4 5;\n"); fprintf(vvp_out, " %%flag_get/vec4 5;\n");
break; break;
default: default:

View File

@ -62,8 +62,10 @@ extern bool of_CASSIGN_VEC4_OFF(vthread_t thr, vvp_code_t code);
extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code); extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code);
extern bool of_CAST2(vthread_t thr, vvp_code_t code); extern bool of_CAST2(vthread_t thr, vvp_code_t code);
extern bool of_CMPS(vthread_t thr, vvp_code_t code); extern bool of_CMPS(vthread_t thr, vvp_code_t code);
extern bool of_CMPIS(vthread_t thr, vvp_code_t code);
extern bool of_CMPSTR(vthread_t thr, vvp_code_t code); extern bool of_CMPSTR(vthread_t thr, vvp_code_t code);
extern bool of_CMPU(vthread_t thr, vvp_code_t code); extern bool of_CMPU(vthread_t thr, vvp_code_t code);
extern bool of_CMPIU(vthread_t thr, vvp_code_t code);
extern bool of_CMPWR(vthread_t thr, vvp_code_t code); extern bool of_CMPWR(vthread_t thr, vvp_code_t code);
extern bool of_CMPWS(vthread_t thr, vvp_code_t code); extern bool of_CMPWS(vthread_t thr, vvp_code_t code);
extern bool of_CMPWU(vthread_t thr, vvp_code_t code); extern bool of_CMPWU(vthread_t thr, vvp_code_t code);

View File

@ -120,6 +120,8 @@ static const struct opcode_table_s opcode_table[] = {
{ "%cmp/wu", of_CMPWU, 2, {OA_BIT1, OA_BIT2, OA_NONE} }, { "%cmp/wu", of_CMPWU, 2, {OA_BIT1, OA_BIT2, OA_NONE} },
{ "%cmp/x", of_CMPX, 0, {OA_NONE, OA_NONE, OA_NONE} }, { "%cmp/x", of_CMPX, 0, {OA_NONE, OA_NONE, OA_NONE} },
{ "%cmp/z", of_CMPZ, 0, {OA_NONE, OA_NONE, OA_NONE} }, { "%cmp/z", of_CMPZ, 0, {OA_NONE, OA_NONE, OA_NONE} },
{ "%cmpi/s", of_CMPIS, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
{ "%cmpi/u", of_CMPIU, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
{ "%concat/str", of_CONCAT_STR, 0,{OA_NONE, OA_NONE, OA_NONE} }, { "%concat/str", of_CONCAT_STR, 0,{OA_NONE, OA_NONE, OA_NONE} },
{ "%concat/vec4", of_CONCAT_VEC4, 0,{OA_NONE, OA_NONE, OA_NONE} }, { "%concat/vec4", of_CONCAT_VEC4, 0,{OA_NONE, OA_NONE, OA_NONE} },
{ "%concati/str", of_CONCATI_STR, 1,{OA_STRING,OA_NONE, OA_NONE} }, { "%concati/str", of_CONCATI_STR, 1,{OA_STRING,OA_NONE, OA_NONE} },

View File

@ -252,6 +252,8 @@ vector2 (binary) value, and push the result.
* %cmp/s * %cmp/s
* %cmp/u * %cmp/u
* %cmpi/s <vala>, <valb>, <wid>
* %cmpi/u <vala>, <valb>, <wid>
These instructions perform a generic comparison of two vectors of These instructions perform a generic comparison of two vectors of
equal size. Two values are pulled from the top of the stack, and not equal size. Two values are pulled from the top of the stack, and not

View File

@ -1476,27 +1476,12 @@ bool of_CAST2(vthread_t thr, vvp_code_t)
return true; return true;
} }
/* static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
* %cmp/s
*
* Pop the operands from the stack, and do not replace them. The
* results are written to flag bits:
*
* 4: eq (equal)
* 5: lt (less than)
* 6: eeq (case equal)
*/
bool of_CMPS(vthread_t thr, vvp_code_t)
{ {
vvp_bit4_t eq = BIT4_1; vvp_bit4_t eq = BIT4_1;
vvp_bit4_t eeq = BIT4_1; vvp_bit4_t eeq = BIT4_1;
vvp_bit4_t lt = BIT4_0; vvp_bit4_t lt = BIT4_0;
// We are going to pop these and push nothing in their
// place, but for now it is more efficient to use a constant
// reference. When we finish, pop the stack without copies.
const vvp_vector4_t&rval = thr->peek_vec4(0);
const vvp_vector4_t&lval = thr->peek_vec4(1);
assert(rval.size() == lval.size()); assert(rval.size() == lval.size());
@ -1507,8 +1492,7 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
thr->flags[4] = BIT4_X; // eq thr->flags[4] = BIT4_X; // eq
thr->flags[5] = BIT4_X; // lt thr->flags[5] = BIT4_X; // lt
thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0; thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
thr->pop_vec4(2); return;
return true;
} }
// Past this point, we know we are dealing only with fully // Past this point, we know we are dealing only with fully
@ -1550,11 +1534,55 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
thr->flags[4] = eq; thr->flags[4] = eq;
thr->flags[5] = lt; thr->flags[5] = lt;
thr->flags[6] = eeq; thr->flags[6] = eeq;
}
/*
* %cmp/s
*
* Pop the operands from the stack, and do not replace them. The
* results are written to flag bits:
*
* 4: eq (equal)
* 5: lt (less than)
* 6: eeq (case equal)
*/
bool of_CMPS(vthread_t thr, vvp_code_t)
{
// We are going to pop these and push nothing in their
// place, but for now it is more efficient to use a constant
// reference. When we finish, pop the stack without copies.
const vvp_vector4_t&rval = thr->peek_vec4(0);
const vvp_vector4_t&lval = thr->peek_vec4(1);
do_CMPS(thr, lval, rval);
thr->pop_vec4(2); thr->pop_vec4(2);
return true; return true;
} }
/*
 * %cmpi/s <vala>, <valb>, <wid>
 *
 * Compare the vec4 value on top of the stack against an immediate
 * operand carried in the instruction arguments. Exactly one operand
 * is popped and this function pushes nothing in its place. The
 * comparison itself is delegated to do_CMPS, which writes its
 * results to the thread flag bits.
 */
bool of_CMPIS(vthread_t thr, vvp_code_t cp)
{
      const unsigned imm_wid = cp->number;

      // Immediate values are expected to be mostly zero bits, so
      // begin with an all-zero vector of the requested width; that
      // way get_immediate_rval only has to replace the bits that
      // are different.
      vvp_vector4_t imm_val (imm_wid, BIT4_0);
      get_immediate_rval (cp, imm_val);

      // Examine the stacked operand in place instead of copying it.
      // It is discarded only after the compare has finished.
      const vvp_vector4_t&stack_val = thr->peek_vec4();
      do_CMPS(thr, stack_val, imm_val);

      thr->pop_vec4(1);
      return true;
}
bool of_CMPSTR(vthread_t thr, vvp_code_t) bool of_CMPSTR(vthread_t thr, vvp_code_t)
{ {
string re = thr->pop_str(); string re = thr->pop_str();
@ -1582,8 +1610,9 @@ bool of_CMPSTR(vthread_t thr, vvp_code_t)
return true; return true;
} }
bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid, static void of_CMPU_the_hard_way(vthread_t thr, unsigned wid,
const vvp_vector4_t&lval, const vvp_vector4_t&rval) const vvp_vector4_t&lval,
const vvp_vector4_t&rval)
{ {
vvp_bit4_t eq = BIT4_1; vvp_bit4_t eq = BIT4_1;
vvp_bit4_t eeq = BIT4_1; vvp_bit4_t eeq = BIT4_1;
@ -1610,18 +1639,13 @@ bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid,
thr->flags[4] = eq; thr->flags[4] = eq;
thr->flags[5] = BIT4_X; thr->flags[5] = BIT4_X;
thr->flags[6] = eeq; thr->flags[6] = eeq;
return true;
} }
bool of_CMPU(vthread_t thr, vvp_code_t cp) static void do_CMPU(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
{ {
vvp_bit4_t eq = BIT4_1; vvp_bit4_t eq = BIT4_1;
vvp_bit4_t lt = BIT4_0; vvp_bit4_t lt = BIT4_0;
vvp_vector4_t rval = thr->pop_vec4();
vvp_vector4_t lval = thr->pop_vec4();
if (rval.size() != lval.size()) { if (rval.size() != lval.size()) {
cerr << "VVP ERROR: %cmp/u operand width mismatch: lval=" << lval cerr << "VVP ERROR: %cmp/u operand width mismatch: lval=" << lval
<< ", rval=" << rval << endl; << ", rval=" << rval << endl;
@ -1630,12 +1654,12 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
unsigned wid = lval.size(); unsigned wid = lval.size();
unsigned long*larray = lval.subarray(0,wid); unsigned long*larray = lval.subarray(0,wid);
if (larray == 0) return of_CMPU_the_hard_way(thr, cp, wid, lval, rval); if (larray == 0) return of_CMPU_the_hard_way(thr, wid, lval, rval);
unsigned long*rarray = rval.subarray(0,wid); unsigned long*rarray = rval.subarray(0,wid);
if (rarray == 0) { if (rarray == 0) {
delete[]larray; delete[]larray;
return of_CMPU_the_hard_way(thr, cp, wid, lval, rval); return of_CMPU_the_hard_way(thr, wid, lval, rval);
} }
unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
@ -1657,10 +1681,43 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
thr->flags[4] = eq; thr->flags[4] = eq;
thr->flags[5] = lt; thr->flags[5] = lt;
thr->flags[6] = eq; thr->flags[6] = eq;
}
bool of_CMPU(vthread_t thr, vvp_code_t)
{
vvp_vector4_t rval = thr->pop_vec4();
vvp_vector4_t lval = thr->pop_vec4();
do_CMPU(thr, lval, rval);
return true; return true;
} }
/*
 * %cmpi/u <vala>, <valb>, <wid>
 *
 * Compare the vec4 value on top of the stack against an immediate
 * operand carried in the instruction arguments. Exactly one operand
 * is popped and this function pushes nothing in its place. The
 * comparison itself is delegated to do_CMPU, which writes its
 * results to the thread flag bits.
 */
bool of_CMPIU(vthread_t thr, vvp_code_t cp)
{
      const unsigned imm_wid = cp->number;

      // Immediate values are expected to be mostly zero bits, so
      // begin with an all-zero vector of the requested width; that
      // way get_immediate_rval only has to replace the bits that
      // are different.
      vvp_vector4_t imm_val (imm_wid, BIT4_0);
      get_immediate_rval (cp, imm_val);

      // Examine the stacked operand in place instead of copying it.
      // It is discarded only after the compare has finished.
      const vvp_vector4_t&stack_val = thr->peek_vec4();
      do_CMPU(thr, stack_val, imm_val);

      thr->pop_vec4(1);
      return true;
}
/* /*
* %cmp/x * %cmp/x
*/ */
@ -2632,8 +2689,9 @@ bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t cp)
int flag = cp->number; int flag = cp->number;
assert(flag < vthread_s::FLAGS_COUNT); assert(flag < vthread_s::FLAGS_COUNT);
vvp_vector4_t val = thr->pop_vec4(); const vvp_vector4_t&val = thr->peek_vec4();
thr->flags[flag] = val.value(0); thr->flags[flag] = val.value(0);
thr->pop_vec4(1);
return true; return true;
} }
@ -3397,8 +3455,7 @@ bool of_LOAD_VEC4A(vthread_t thr, vvp_code_t cp)
return true; return true;
} }
static void do_verylong_mod(vthread_t thr, static void do_verylong_mod(vvp_vector4_t&vala, const vvp_vector4_t&valb,
vvp_vector4_t&vala, const vvp_vector4_t&valb,
bool left_is_neg, bool right_is_neg) bool left_is_neg, bool right_is_neg)
{ {
bool out_is_neg = left_is_neg; bool out_is_neg = left_is_neg;
@ -3579,7 +3636,7 @@ bool of_MOD(vthread_t thr, vvp_code_t)
return true; return true;
} else { } else {
do_verylong_mod(thr, vala, valb, false, false); do_verylong_mod(vala, valb, false, false);
return true; return true;
} }
@ -3642,7 +3699,7 @@ bool of_MOD_S(vthread_t thr, vvp_code_t)
bool left_is_neg = vala.value(vala.size()-1) == BIT4_1; bool left_is_neg = vala.value(vala.size()-1) == BIT4_1;
bool right_is_neg = valb.value(valb.size()-1) == BIT4_1; bool right_is_neg = valb.value(valb.size()-1) == BIT4_1;
do_verylong_mod(thr, vala, valb, left_is_neg, right_is_neg); do_verylong_mod(vala, valb, left_is_neg, right_is_neg);
return true; return true;
} }