Add %cmpi/s and %cmpi/u instructions for performance
These bypass the vec4 stack in some common cases, saving instructions and vec4 manipulations. Also make a minor improvement to the %flag_set/vec4 instruction, and silence a few warnings.
This commit is contained in:
parent
2acc9fbdee
commit
04bdfbccee
|
|
@ -558,18 +558,27 @@ static void draw_binary_vec4_le(ivl_expr_t expr)
|
||||||
draw_eval_vec4(le);
|
draw_eval_vec4(le);
|
||||||
resize_vec4_wid(le, use_wid);
|
resize_vec4_wid(le, use_wid);
|
||||||
|
|
||||||
|
if (ivl_expr_width(re)==use_wid && test_immediate_vec4_ok(re)) {
|
||||||
|
/* Special case: If the right operand can be handled as
|
||||||
|
an immediate operand, then use that instead. */
|
||||||
|
char opcode[8];
|
||||||
|
snprintf(opcode, sizeof opcode, "%%cmpi/%c", s_flag);
|
||||||
|
draw_immediate_vec4(re, opcode);
|
||||||
|
|
||||||
|
} else {
|
||||||
draw_eval_vec4(re);
|
draw_eval_vec4(re);
|
||||||
resize_vec4_wid(re, use_wid);
|
resize_vec4_wid(re, use_wid);
|
||||||
|
|
||||||
|
fprintf(vvp_out, " %%cmp/%c;\n", s_flag);
|
||||||
|
}
|
||||||
|
|
||||||
switch (use_opcode) {
|
switch (use_opcode) {
|
||||||
case 'L':
|
case 'L':
|
||||||
fprintf(vvp_out, " %%cmp/%c;\n", s_flag);
|
|
||||||
fprintf(vvp_out, " %%flag_get/vec4 4;\n");
|
fprintf(vvp_out, " %%flag_get/vec4 4;\n");
|
||||||
fprintf(vvp_out, " %%flag_get/vec4 5;\n");
|
fprintf(vvp_out, " %%flag_get/vec4 5;\n");
|
||||||
fprintf(vvp_out, " %%or;\n");
|
fprintf(vvp_out, " %%or;\n");
|
||||||
break;
|
break;
|
||||||
case '<':
|
case '<':
|
||||||
fprintf(vvp_out, " %%cmp/%c;\n", s_flag);
|
|
||||||
fprintf(vvp_out, " %%flag_get/vec4 5;\n");
|
fprintf(vvp_out, " %%flag_get/vec4 5;\n");
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
|
|
||||||
|
|
@ -62,8 +62,10 @@ extern bool of_CASSIGN_VEC4_OFF(vthread_t thr, vvp_code_t code);
|
||||||
extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code);
|
extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code);
|
||||||
extern bool of_CAST2(vthread_t thr, vvp_code_t code);
|
extern bool of_CAST2(vthread_t thr, vvp_code_t code);
|
||||||
extern bool of_CMPS(vthread_t thr, vvp_code_t code);
|
extern bool of_CMPS(vthread_t thr, vvp_code_t code);
|
||||||
|
extern bool of_CMPIS(vthread_t thr, vvp_code_t code);
|
||||||
extern bool of_CMPSTR(vthread_t thr, vvp_code_t code);
|
extern bool of_CMPSTR(vthread_t thr, vvp_code_t code);
|
||||||
extern bool of_CMPU(vthread_t thr, vvp_code_t code);
|
extern bool of_CMPU(vthread_t thr, vvp_code_t code);
|
||||||
|
extern bool of_CMPIU(vthread_t thr, vvp_code_t code);
|
||||||
extern bool of_CMPWR(vthread_t thr, vvp_code_t code);
|
extern bool of_CMPWR(vthread_t thr, vvp_code_t code);
|
||||||
extern bool of_CMPWS(vthread_t thr, vvp_code_t code);
|
extern bool of_CMPWS(vthread_t thr, vvp_code_t code);
|
||||||
extern bool of_CMPWU(vthread_t thr, vvp_code_t code);
|
extern bool of_CMPWU(vthread_t thr, vvp_code_t code);
|
||||||
|
|
|
||||||
|
|
@ -120,6 +120,8 @@ static const struct opcode_table_s opcode_table[] = {
|
||||||
{ "%cmp/wu", of_CMPWU, 2, {OA_BIT1, OA_BIT2, OA_NONE} },
|
{ "%cmp/wu", of_CMPWU, 2, {OA_BIT1, OA_BIT2, OA_NONE} },
|
||||||
{ "%cmp/x", of_CMPX, 0, {OA_NONE, OA_NONE, OA_NONE} },
|
{ "%cmp/x", of_CMPX, 0, {OA_NONE, OA_NONE, OA_NONE} },
|
||||||
{ "%cmp/z", of_CMPZ, 0, {OA_NONE, OA_NONE, OA_NONE} },
|
{ "%cmp/z", of_CMPZ, 0, {OA_NONE, OA_NONE, OA_NONE} },
|
||||||
|
{ "%cmpi/s", of_CMPIS, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
|
||||||
|
{ "%cmpi/u", of_CMPIU, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} },
|
||||||
{ "%concat/str", of_CONCAT_STR, 0,{OA_NONE, OA_NONE, OA_NONE} },
|
{ "%concat/str", of_CONCAT_STR, 0,{OA_NONE, OA_NONE, OA_NONE} },
|
||||||
{ "%concat/vec4", of_CONCAT_VEC4, 0,{OA_NONE, OA_NONE, OA_NONE} },
|
{ "%concat/vec4", of_CONCAT_VEC4, 0,{OA_NONE, OA_NONE, OA_NONE} },
|
||||||
{ "%concati/str", of_CONCATI_STR, 1,{OA_STRING,OA_NONE, OA_NONE} },
|
{ "%concati/str", of_CONCATI_STR, 1,{OA_STRING,OA_NONE, OA_NONE} },
|
||||||
|
|
|
||||||
|
|
@ -252,6 +252,8 @@ vector2 (binary) value, and push the result.
|
||||||
|
|
||||||
* %cmp/s
|
* %cmp/s
|
||||||
* %cmp/u
|
* %cmp/u
|
||||||
|
* %cmpi/s <vala>, <valb>, <wid>
|
||||||
|
* %cmpi/u <vala>, <valb>, <wid>
|
||||||
|
|
||||||
These instructions perform a generic comparison of two vectors of
|
These instructions perform a generic comparison of two vectors of
|
||||||
equal size. Two values are pulled from the top of the stack, and not
|
equal size. Two values are pulled from the top of the stack, and not
|
||||||
|
|
|
||||||
123
vvp/vthread.cc
123
vvp/vthread.cc
|
|
@ -1476,27 +1476,12 @@ bool of_CAST2(vthread_t thr, vvp_code_t)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
|
||||||
* %cmp/s
|
|
||||||
*
|
|
||||||
* Pop the operands from the stack, and do not replace them. The
|
|
||||||
* results are written to flag bits:
|
|
||||||
*
|
|
||||||
* 4: eq (equal)
|
|
||||||
* 5: lt (less than)
|
|
||||||
* 6: eeq (case equal)
|
|
||||||
*/
|
|
||||||
bool of_CMPS(vthread_t thr, vvp_code_t)
|
|
||||||
{
|
{
|
||||||
vvp_bit4_t eq = BIT4_1;
|
vvp_bit4_t eq = BIT4_1;
|
||||||
vvp_bit4_t eeq = BIT4_1;
|
vvp_bit4_t eeq = BIT4_1;
|
||||||
vvp_bit4_t lt = BIT4_0;
|
vvp_bit4_t lt = BIT4_0;
|
||||||
|
|
||||||
// We are going to pop these and push nothing in their
|
|
||||||
// place, but for now it is more efficient to use a constant
|
|
||||||
// reference. When we finish, pop the stack without copies.
|
|
||||||
const vvp_vector4_t&rval = thr->peek_vec4(0);
|
|
||||||
const vvp_vector4_t&lval = thr->peek_vec4(1);
|
|
||||||
|
|
||||||
assert(rval.size() == lval.size());
|
assert(rval.size() == lval.size());
|
||||||
|
|
||||||
|
|
@ -1507,8 +1492,7 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
|
||||||
thr->flags[4] = BIT4_X; // eq
|
thr->flags[4] = BIT4_X; // eq
|
||||||
thr->flags[5] = BIT4_X; // lt
|
thr->flags[5] = BIT4_X; // lt
|
||||||
thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
|
thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0;
|
||||||
thr->pop_vec4(2);
|
return;
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Past this point, we know we are dealing only with fully
|
// Past this point, we know we are dealing only with fully
|
||||||
|
|
@ -1550,11 +1534,55 @@ bool of_CMPS(vthread_t thr, vvp_code_t)
|
||||||
thr->flags[4] = eq;
|
thr->flags[4] = eq;
|
||||||
thr->flags[5] = lt;
|
thr->flags[5] = lt;
|
||||||
thr->flags[6] = eeq;
|
thr->flags[6] = eeq;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* %cmp/s
|
||||||
|
*
|
||||||
|
* Pop the operands from the stack, and do not replace them. The
|
||||||
|
* results are written to flag bits:
|
||||||
|
*
|
||||||
|
* 4: eq (equal)
|
||||||
|
* 5: lt (less than)
|
||||||
|
* 6: eeq (case equal)
|
||||||
|
*/
|
||||||
|
bool of_CMPS(vthread_t thr, vvp_code_t)
|
||||||
|
{
|
||||||
|
// We are going to pop these and push nothing in their
|
||||||
|
// place, but for now it is more efficient to use a constant
|
||||||
|
// reference. When we finish, pop the stack without copies.
|
||||||
|
const vvp_vector4_t&rval = thr->peek_vec4(0);
|
||||||
|
const vvp_vector4_t&lval = thr->peek_vec4(1);
|
||||||
|
|
||||||
|
do_CMPS(thr, lval, rval);
|
||||||
|
|
||||||
thr->pop_vec4(2);
|
thr->pop_vec4(2);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* %cmpi/s <vala>, <valb>, <wid>
|
||||||
|
*
|
||||||
|
* Pop one operand from the vec4 stack; get the other operand from the instruction arguments.
|
||||||
|
*/
|
||||||
|
bool of_CMPIS(vthread_t thr, vvp_code_t cp)
|
||||||
|
{
|
||||||
|
unsigned wid = cp->number;
|
||||||
|
|
||||||
|
vvp_vector4_t&lval = thr->peek_vec4();
|
||||||
|
|
||||||
|
// I expect that most of the bits of an immediate value are
|
||||||
|
// going to be zero, so start the immediate operand vector with all zero
|
||||||
|
// bits. Then we only need to replace the bits that are different.
|
||||||
|
vvp_vector4_t rval (wid, BIT4_0);
|
||||||
|
get_immediate_rval (cp, rval);
|
||||||
|
|
||||||
|
do_CMPS(thr, lval, rval);
|
||||||
|
|
||||||
|
thr->pop_vec4(1);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool of_CMPSTR(vthread_t thr, vvp_code_t)
|
bool of_CMPSTR(vthread_t thr, vvp_code_t)
|
||||||
{
|
{
|
||||||
string re = thr->pop_str();
|
string re = thr->pop_str();
|
||||||
|
|
@ -1582,8 +1610,9 @@ bool of_CMPSTR(vthread_t thr, vvp_code_t)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid,
|
static void of_CMPU_the_hard_way(vthread_t thr, unsigned wid,
|
||||||
const vvp_vector4_t&lval, const vvp_vector4_t&rval)
|
const vvp_vector4_t&lval,
|
||||||
|
const vvp_vector4_t&rval)
|
||||||
{
|
{
|
||||||
vvp_bit4_t eq = BIT4_1;
|
vvp_bit4_t eq = BIT4_1;
|
||||||
vvp_bit4_t eeq = BIT4_1;
|
vvp_bit4_t eeq = BIT4_1;
|
||||||
|
|
@ -1610,18 +1639,13 @@ bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid,
|
||||||
thr->flags[4] = eq;
|
thr->flags[4] = eq;
|
||||||
thr->flags[5] = BIT4_X;
|
thr->flags[5] = BIT4_X;
|
||||||
thr->flags[6] = eeq;
|
thr->flags[6] = eeq;
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool of_CMPU(vthread_t thr, vvp_code_t cp)
|
static void do_CMPU(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval)
|
||||||
{
|
{
|
||||||
vvp_bit4_t eq = BIT4_1;
|
vvp_bit4_t eq = BIT4_1;
|
||||||
vvp_bit4_t lt = BIT4_0;
|
vvp_bit4_t lt = BIT4_0;
|
||||||
|
|
||||||
vvp_vector4_t rval = thr->pop_vec4();
|
|
||||||
vvp_vector4_t lval = thr->pop_vec4();
|
|
||||||
|
|
||||||
if (rval.size() != lval.size()) {
|
if (rval.size() != lval.size()) {
|
||||||
cerr << "VVP ERROR: %cmp/u operand width mismatch: lval=" << lval
|
cerr << "VVP ERROR: %cmp/u operand width mismatch: lval=" << lval
|
||||||
<< ", rval=" << rval << endl;
|
<< ", rval=" << rval << endl;
|
||||||
|
|
@ -1630,12 +1654,12 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
|
||||||
unsigned wid = lval.size();
|
unsigned wid = lval.size();
|
||||||
|
|
||||||
unsigned long*larray = lval.subarray(0,wid);
|
unsigned long*larray = lval.subarray(0,wid);
|
||||||
if (larray == 0) return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
|
if (larray == 0) return of_CMPU_the_hard_way(thr, wid, lval, rval);
|
||||||
|
|
||||||
unsigned long*rarray = rval.subarray(0,wid);
|
unsigned long*rarray = rval.subarray(0,wid);
|
||||||
if (rarray == 0) {
|
if (rarray == 0) {
|
||||||
delete[]larray;
|
delete[]larray;
|
||||||
return of_CMPU_the_hard_way(thr, cp, wid, lval, rval);
|
return of_CMPU_the_hard_way(thr, wid, lval, rval);
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
|
unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
|
||||||
|
|
@ -1657,10 +1681,43 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp)
|
||||||
thr->flags[4] = eq;
|
thr->flags[4] = eq;
|
||||||
thr->flags[5] = lt;
|
thr->flags[5] = lt;
|
||||||
thr->flags[6] = eq;
|
thr->flags[6] = eq;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool of_CMPU(vthread_t thr, vvp_code_t)
|
||||||
|
{
|
||||||
|
|
||||||
|
vvp_vector4_t rval = thr->pop_vec4();
|
||||||
|
vvp_vector4_t lval = thr->pop_vec4();
|
||||||
|
|
||||||
|
do_CMPU(thr, lval, rval);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* %cmpi/u <vala>, <valb>, <wid>
|
||||||
|
*
|
||||||
|
* Pop one operand from the vec4 stack; get the other operand from the instruction arguments.
|
||||||
|
*/
|
||||||
|
bool of_CMPIU(vthread_t thr, vvp_code_t cp)
|
||||||
|
{
|
||||||
|
unsigned wid = cp->number;
|
||||||
|
|
||||||
|
vvp_vector4_t&lval = thr->peek_vec4();
|
||||||
|
|
||||||
|
// I expect that most of the bits of an immediate value are
|
||||||
|
// going to be zero, so start the immediate operand vector with all zero
|
||||||
|
// bits. Then we only need to replace the bits that are different.
|
||||||
|
vvp_vector4_t rval (wid, BIT4_0);
|
||||||
|
get_immediate_rval (cp, rval);
|
||||||
|
|
||||||
|
do_CMPU(thr, lval, rval);
|
||||||
|
|
||||||
|
thr->pop_vec4(1);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* %cmp/x
|
* %cmp/x
|
||||||
*/
|
*/
|
||||||
|
|
@ -2632,8 +2689,9 @@ bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t cp)
|
||||||
int flag = cp->number;
|
int flag = cp->number;
|
||||||
assert(flag < vthread_s::FLAGS_COUNT);
|
assert(flag < vthread_s::FLAGS_COUNT);
|
||||||
|
|
||||||
vvp_vector4_t val = thr->pop_vec4();
|
const vvp_vector4_t&val = thr->peek_vec4();
|
||||||
thr->flags[flag] = val.value(0);
|
thr->flags[flag] = val.value(0);
|
||||||
|
thr->pop_vec4(1);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
@ -3397,8 +3455,7 @@ bool of_LOAD_VEC4A(vthread_t thr, vvp_code_t cp)
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void do_verylong_mod(vthread_t thr,
|
static void do_verylong_mod(vvp_vector4_t&vala, const vvp_vector4_t&valb,
|
||||||
vvp_vector4_t&vala, const vvp_vector4_t&valb,
|
|
||||||
bool left_is_neg, bool right_is_neg)
|
bool left_is_neg, bool right_is_neg)
|
||||||
{
|
{
|
||||||
bool out_is_neg = left_is_neg;
|
bool out_is_neg = left_is_neg;
|
||||||
|
|
@ -3579,7 +3636,7 @@ bool of_MOD(vthread_t thr, vvp_code_t)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
do_verylong_mod(thr, vala, valb, false, false);
|
do_verylong_mod(vala, valb, false, false);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -3642,7 +3699,7 @@ bool of_MOD_S(vthread_t thr, vvp_code_t)
|
||||||
|
|
||||||
bool left_is_neg = vala.value(vala.size()-1) == BIT4_1;
|
bool left_is_neg = vala.value(vala.size()-1) == BIT4_1;
|
||||||
bool right_is_neg = valb.value(valb.size()-1) == BIT4_1;
|
bool right_is_neg = valb.value(valb.size()-1) == BIT4_1;
|
||||||
do_verylong_mod(thr, vala, valb, left_is_neg, right_is_neg);
|
do_verylong_mod(vala, valb, left_is_neg, right_is_neg);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue