From 04bdfbcceeb1e368510734f2bb4e03f99ff3b9b7 Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Wed, 19 Nov 2014 16:38:43 -0800 Subject: [PATCH] Add %cmpi/s and %cmpi/u instructions for performance These bypass the vec4 stack in some common cases, saving instructions and vec4 manipulations. Also, minor improvement to the %flag/set/vec4 statement. Kill a few warnings. --- tgt-vvp/eval_vec4.c | 17 ++++-- vvp/codes.h | 2 + vvp/compile.cc | 2 + vvp/opcodes.txt | 2 + vvp/vthread.cc | 123 ++++++++++++++++++++++++++++++++------------ 5 files changed, 109 insertions(+), 37 deletions(-) diff --git a/tgt-vvp/eval_vec4.c b/tgt-vvp/eval_vec4.c index 0f29a9dc3..fdd0c893c 100644 --- a/tgt-vvp/eval_vec4.c +++ b/tgt-vvp/eval_vec4.c @@ -558,18 +558,27 @@ static void draw_binary_vec4_le(ivl_expr_t expr) draw_eval_vec4(le); resize_vec4_wid(le, use_wid); - draw_eval_vec4(re); - resize_vec4_wid(re, use_wid); + if (ivl_expr_width(re)==use_wid && test_immediate_vec4_ok(re)) { + /* Special case: If the right operand can be handled as + an immediate operand, then use that instead. 
*/ + char opcode[8]; + snprintf(opcode, sizeof opcode, "%%cmpi/%c", s_flag); + draw_immediate_vec4(re, opcode); + + } else { + draw_eval_vec4(re); + resize_vec4_wid(re, use_wid); + + fprintf(vvp_out, " %%cmp/%c;\n", s_flag); + } switch (use_opcode) { case 'L': - fprintf(vvp_out, " %%cmp/%c;\n", s_flag); fprintf(vvp_out, " %%flag_get/vec4 4;\n"); fprintf(vvp_out, " %%flag_get/vec4 5;\n"); fprintf(vvp_out, " %%or;\n"); break; case '<': - fprintf(vvp_out, " %%cmp/%c;\n", s_flag); fprintf(vvp_out, " %%flag_get/vec4 5;\n"); break; default: diff --git a/vvp/codes.h b/vvp/codes.h index af6c99ba0..9489b92c7 100644 --- a/vvp/codes.h +++ b/vvp/codes.h @@ -62,8 +62,10 @@ extern bool of_CASSIGN_VEC4_OFF(vthread_t thr, vvp_code_t code); extern bool of_CASSIGN_WR(vthread_t thr, vvp_code_t code); extern bool of_CAST2(vthread_t thr, vvp_code_t code); extern bool of_CMPS(vthread_t thr, vvp_code_t code); +extern bool of_CMPIS(vthread_t thr, vvp_code_t code); extern bool of_CMPSTR(vthread_t thr, vvp_code_t code); extern bool of_CMPU(vthread_t thr, vvp_code_t code); +extern bool of_CMPIU(vthread_t thr, vvp_code_t code); extern bool of_CMPWR(vthread_t thr, vvp_code_t code); extern bool of_CMPWS(vthread_t thr, vvp_code_t code); extern bool of_CMPWU(vthread_t thr, vvp_code_t code); diff --git a/vvp/compile.cc b/vvp/compile.cc index 933ef8563..a521b61ab 100644 --- a/vvp/compile.cc +++ b/vvp/compile.cc @@ -120,6 +120,8 @@ static const struct opcode_table_s opcode_table[] = { { "%cmp/wu", of_CMPWU, 2, {OA_BIT1, OA_BIT2, OA_NONE} }, { "%cmp/x", of_CMPX, 0, {OA_NONE, OA_NONE, OA_NONE} }, { "%cmp/z", of_CMPZ, 0, {OA_NONE, OA_NONE, OA_NONE} }, + { "%cmpi/s", of_CMPIS, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, + { "%cmpi/u", of_CMPIU, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, { "%concat/str", of_CONCAT_STR, 0,{OA_NONE, OA_NONE, OA_NONE} }, { "%concat/vec4", of_CONCAT_VEC4, 0,{OA_NONE, OA_NONE, OA_NONE} }, { "%concati/str", of_CONCATI_STR, 1,{OA_STRING,OA_NONE, OA_NONE} }, diff --git a/vvp/opcodes.txt 
b/vvp/opcodes.txt index 4813e60dd..1f5a342f9 100644 --- a/vvp/opcodes.txt +++ b/vvp/opcodes.txt @@ -252,6 +252,8 @@ vector2 (binary) value, and push the result. * %cmp/s * %cmp/u +* %cmpi/s <vala>, <valb>, <wid> +* %cmpi/u <vala>, <valb>, <wid> These instructions perform a generic comparison of two vectors of equal size. Two values are pulled from the top of the stack, and not diff --git a/vvp/vthread.cc b/vvp/vthread.cc index 68837fbc8..ceab39474 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -1476,27 +1476,12 @@ bool of_CAST2(vthread_t thr, vvp_code_t) return true; } -/* - * %cmp/s - * - * Pop the operands from the stack, and do not replace them. The - * results are written to flag bits: - * - * 4: eq (equal) - * 5: lt (less than) - * 6: eeq (case equal) - */ -bool of_CMPS(vthread_t thr, vvp_code_t) +static void do_CMPS(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval) { vvp_bit4_t eq = BIT4_1; vvp_bit4_t eeq = BIT4_1; vvp_bit4_t lt = BIT4_0; - // We are going to pop these and push nothing in their - // place, but for now it is more efficient to use a constant - // reference. When we finish, pop the stack without copies. - const vvp_vector4_t&rval = thr->peek_vec4(0); - const vvp_vector4_t&lval = thr->peek_vec4(1); assert(rval.size() == lval.size()); @@ -1507,8 +1492,7 @@ bool of_CMPS(vthread_t thr, vvp_code_t) thr->flags[4] = BIT4_X; // eq thr->flags[5] = BIT4_X; // lt thr->flags[6] = lval.eeq(rval)? BIT4_1 : BIT4_0; - thr->pop_vec4(2); - return true; + return; } // Past this point, we know we are dealing only with fully @@ -1550,11 +1534,55 @@ bool of_CMPS(vthread_t thr, vvp_code_t) thr->flags[4] = eq; thr->flags[5] = lt; thr->flags[6] = eeq; +} + +/* + * %cmp/s + * + * Pop the operands from the stack, and do not replace them.
The + * results are written to flag bits: + * + * 4: eq (equal) + * 5: lt (less than) + * 6: eeq (case equal) + */ +bool of_CMPS(vthread_t thr, vvp_code_t) +{ + // We are going to pop these and push nothing in their + // place, but for now it is more efficient to use a constant + // reference. When we finish, pop the stack without copies. + const vvp_vector4_t&rval = thr->peek_vec4(0); + const vvp_vector4_t&lval = thr->peek_vec4(1); + + do_CMPS(thr, lval, rval); thr->pop_vec4(2); return true; } +/* + * %cmpi/s <vala>, <valb>, <wid> + * + * Pop1 operand, get the other operand from the arguments. + */ +bool of_CMPIS(vthread_t thr, vvp_code_t cp) +{ + unsigned wid = cp->number; + + vvp_vector4_t&lval = thr->peek_vec4(); + + // I expect that most of the bits of an immediate value are + // going to be zero, so start the result vector with all zero + // bits. Then we only need to replace the bits that are different. + vvp_vector4_t rval (wid, BIT4_0); + get_immediate_rval (cp, rval); + + do_CMPS(thr, lval, rval); + + thr->pop_vec4(1); + return true; +} + bool of_CMPSTR(vthread_t thr, vvp_code_t) { string re = thr->pop_str(); @@ -1582,8 +1610,9 @@ bool of_CMPSTR(vthread_t thr, vvp_code_t) return true; } -bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid, - const vvp_vector4_t&lval, const vvp_vector4_t&rval) +static void of_CMPU_the_hard_way(vthread_t thr, unsigned wid, + const vvp_vector4_t&lval, + const vvp_vector4_t&rval) { vvp_bit4_t eq = BIT4_1; vvp_bit4_t eeq = BIT4_1; @@ -1610,18 +1639,13 @@ bool of_CMPU_the_hard_way(vthread_t thr, vvp_code_t, unsigned wid, thr->flags[4] = eq; thr->flags[5] = BIT4_X; thr->flags[6] = eeq; - - return true; } -bool of_CMPU(vthread_t thr, vvp_code_t cp) +static void do_CMPU(vthread_t thr, const vvp_vector4_t&lval, const vvp_vector4_t&rval) { vvp_bit4_t eq = BIT4_1; vvp_bit4_t lt = BIT4_0; - vvp_vector4_t rval = thr->pop_vec4(); - vvp_vector4_t lval = thr->pop_vec4(); - if (rval.size() != lval.size()) { cerr << "VVP ERROR: %cmp/u operand
width mismatch: lval=" << lval << ", rval=" << rval << endl; @@ -1630,12 +1654,12 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp) unsigned wid = lval.size(); unsigned long*larray = lval.subarray(0,wid); - if (larray == 0) return of_CMPU_the_hard_way(thr, cp, wid, lval, rval); + if (larray == 0) return of_CMPU_the_hard_way(thr, wid, lval, rval); unsigned long*rarray = rval.subarray(0,wid); if (rarray == 0) { delete[]larray; - return of_CMPU_the_hard_way(thr, cp, wid, lval, rval); + return of_CMPU_the_hard_way(thr, wid, lval, rval); } unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; @@ -1657,10 +1681,43 @@ bool of_CMPU(vthread_t thr, vvp_code_t cp) thr->flags[4] = eq; thr->flags[5] = lt; thr->flags[6] = eq; +} + +bool of_CMPU(vthread_t thr, vvp_code_t) +{ + + vvp_vector4_t rval = thr->pop_vec4(); + vvp_vector4_t lval = thr->pop_vec4(); + + do_CMPU(thr, lval, rval); return true; } +/* + * %cmpi/u <vala>, <valb>, <wid> + * + * Pop1 operand, get the other operand from the arguments. + */ +bool of_CMPIU(vthread_t thr, vvp_code_t cp) +{ + unsigned wid = cp->number; + + vvp_vector4_t&lval = thr->peek_vec4(); + + // I expect that most of the bits of an immediate value are + // going to be zero, so start the result vector with all zero + // bits. Then we only need to replace the bits that are different.
+ vvp_vector4_t rval (wid, BIT4_0); + get_immediate_rval (cp, rval); + + do_CMPU(thr, lval, rval); + + thr->pop_vec4(1); + return true; +} + + /* * %cmp/x */ @@ -2632,8 +2689,9 @@ bool of_FLAG_SET_VEC4(vthread_t thr, vvp_code_t cp) int flag = cp->number; assert(flag < vthread_s::FLAGS_COUNT); - vvp_vector4_t val = thr->pop_vec4(); + const vvp_vector4_t&val = thr->peek_vec4(); thr->flags[flag] = val.value(0); + thr->pop_vec4(1); return true; } @@ -3397,8 +3455,7 @@ bool of_LOAD_VEC4A(vthread_t thr, vvp_code_t cp) return true; } -static void do_verylong_mod(vthread_t thr, - vvp_vector4_t&vala, const vvp_vector4_t&valb, +static void do_verylong_mod(vvp_vector4_t&vala, const vvp_vector4_t&valb, bool left_is_neg, bool right_is_neg) { bool out_is_neg = left_is_neg; @@ -3579,7 +3636,7 @@ bool of_MOD(vthread_t thr, vvp_code_t) return true; } else { - do_verylong_mod(thr, vala, valb, false, false); + do_verylong_mod(vala, valb, false, false); return true; } @@ -3642,7 +3699,7 @@ bool of_MOD_S(vthread_t thr, vvp_code_t) bool left_is_neg = vala.value(vala.size()-1) == BIT4_1; bool right_is_neg = valb.value(valb.size()-1) == BIT4_1; - do_verylong_mod(thr, vala, valb, left_is_neg, right_is_neg); + do_verylong_mod(vala, valb, left_is_neg, right_is_neg); return true; }