diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c index 08080fe7e..e1fb2013c 100644 --- a/tgt-vvp/eval_expr.c +++ b/tgt-vvp/eval_expr.c @@ -92,7 +92,7 @@ unsigned long get_number_immediate(ivl_expr_t ex) case '0': break; case '1': - imm |= 1 << idx; + imm |= 1UL << idx; break; default: assert(0); @@ -909,12 +909,43 @@ static struct vector_info draw_binary_expr_le(ivl_expr_t exp, return lv; } +static struct vector_info draw_logic_immediate(ivl_expr_t exp, + ivl_expr_t le, + ivl_expr_t re, + unsigned wid) +{ + struct vector_info lv = draw_eval_expr_wid(le, wid, STUFF_OK_XZ); + unsigned long imm = get_number_immediate(re); + + assert(lv.base >= 4); + + switch (ivl_expr_opcode(exp)) { + + case '&': + fprintf(vvp_out, " %%andi %u, %lu, %u;\n", lv.base, imm, lv.wid); + break; + + default: + assert(0); + break; + } + + return lv; +} + static struct vector_info draw_binary_expr_logic(ivl_expr_t exp, unsigned wid) { ivl_expr_t le = ivl_expr_oper1(exp); ivl_expr_t re = ivl_expr_oper2(exp); + if (ivl_expr_opcode(exp) == '&') { + if (number_is_immediate(re, IMM_WID) && !number_is_unknown(re)) + return draw_logic_immediate(exp, le, re, wid); + if (number_is_immediate(le, IMM_WID) && !number_is_unknown(le)) + return draw_logic_immediate(exp, re, le, wid); + } + struct vector_info lv; struct vector_info rv; @@ -1167,15 +1198,15 @@ static struct vector_info draw_add_immediate(ivl_expr_t le, imm = get_number_immediate(re); /* Now generate enough %addi instructions to add the entire - immediate value to the destination. The adds are done 16 - bits at a time, but 17 bits are done to push the carry into + immediate value to the destination. The adds are done IMM_WID + bits at a time, but +1 bits are done to push the carry into the higher bits if needed. 
*/ { unsigned base; - for (base = 0 ; base < lv.wid ; base += 16) { - unsigned long tmp = imm & 0xffffUL; + for (base = 0 ; base < lv.wid ; base += IMM_WID) { + unsigned long tmp = imm & 0xffffffffUL; unsigned add_wid = lv.wid - base; - imm >>= 16; + imm >>= IMM_WID; fprintf(vvp_out, " %%addi %u, %lu, %u;\n", lv.base+base, tmp, add_wid); @@ -1203,7 +1234,7 @@ static struct vector_info draw_sub_immediate(ivl_expr_t le, assert(lv.wid == wid); imm = get_number_immediate(re); - assert( (imm & ~0xffff) == 0 ); + assert( (imm & ~0xffffffffUL) == 0 ); switch (lv.base) { case 0: @@ -1299,13 +1330,13 @@ static struct vector_info draw_binary_expr_arith(ivl_expr_t exp, unsigned wid) if ((ivl_expr_opcode(exp) == '-') && (ivl_expr_type(re) == IVL_EX_NUMBER) && (! number_is_unknown(re)) - && number_is_immediate(re, 16)) + && number_is_immediate(re, IMM_WID)) return draw_sub_immediate(le, re, wid); if ((ivl_expr_opcode(exp) == '*') && (ivl_expr_type(re) == IVL_EX_NUMBER) && (! number_is_unknown(re)) - && number_is_immediate(re, 16)) + && number_is_immediate(re, IMM_WID)) return draw_mul_immediate(le, re, wid); lv = draw_eval_expr_wid(le, wid, STUFF_OK_XZ); @@ -1612,9 +1643,9 @@ static struct vector_info draw_number_expr(ivl_expr_t exp, unsigned wid) vvp_errors += 1; } - if ((!number_is_unknown(exp)) && number_is_immediate(exp, 16)) { - int val = get_number_immediate(exp); - fprintf(vvp_out, " %%movi %u, %d, %u;\n", res.base, val, wid); + if ((!number_is_unknown(exp)) && number_is_immediate(exp, IMM_WID)) { + unsigned long val = get_number_immediate(exp); + fprintf(vvp_out, " %%movi %u, %lu, %u;\n", res.base, val, wid); return res; } @@ -1836,8 +1867,8 @@ static struct vector_info draw_string_expr(ivl_expr_t exp, unsigned wid) idx = 0; while (idx < nwid) { unsigned bits; - unsigned trans = 16; - if (nwid-idx < 16) + unsigned trans = IMM_WID; + if (nwid-idx < trans) trans = nwid-idx; bits = *p; @@ -1845,6 +1876,14 @@ static struct vector_info draw_string_expr(ivl_expr_t exp, 
unsigned wid) if (trans > 8) { bits |= *p << 8; p -= 1; + if (trans > 16) { + bits |= *p << 16; + p -= 1; + if (trans > 24) { + bits |= *p << 24; + p -= 1; + } + } } fprintf(vvp_out, " %%movi %u, %u, %u;\n", res.base+idx,bits,trans); @@ -1881,8 +1920,14 @@ void pad_expr_in_place(ivl_expr_t exp, struct vector_info res, unsigned swid) res.base+idx, res.base+swid-1); } else { - fprintf(vvp_out, " %%mov %u, 0, %u;\n", - res.base+swid, res.wid-swid); + unsigned base = res.base+swid; + unsigned count = res.wid-swid; + /* The %movi is faster for larger widths, but for very + small counts, the %mov is faster. */ + if (count > 4) + fprintf(vvp_out, " %%movi %u, 0, %u;\n", base, count); + else + fprintf(vvp_out, " %%mov %u, 0, %u;\n", base, count); } } @@ -2086,7 +2131,7 @@ static struct vector_info draw_select_signal(ivl_expr_t sube, for (idx = 0 ; idx < res.wid ; idx += 1) { if (idx >= bit_wid) { - fprintf(vvp_out, " %%mov %u, 0, %u; Pad from %u to %u\n", + fprintf(vvp_out, " %%movi %u, 0, %u; Pad from %u to %u\n", res.base+idx, res.wid-idx, ivl_expr_width(sube), wid); break; @@ -2410,7 +2455,7 @@ static struct vector_info draw_unary_expr(ivl_expr_t exp, unsigned wid) fprintf(vvp_out, " %%mov %u, %u, %u;\n", tmp.base, res.base, res.wid); - fprintf(vvp_out, " %%mov %u, 0, %u;\n", + fprintf(vvp_out, " %%movi %u, 0, %u;\n", tmp.base+res.wid, tmp.wid-res.wid); clr_vector(res); res = tmp; @@ -2460,7 +2505,7 @@ static struct vector_info draw_unary_expr(ivl_expr_t exp, unsigned wid) assert(res.base); fprintf(vvp_out, " %%mov %u, %u, %u;\n", tmp.base, res.base, res.wid); - fprintf(vvp_out, " %%mov %u, 0, %u;\n", + fprintf(vvp_out, " %%movi %u, 0, %u;\n", tmp.base+res.wid, tmp.wid-res.wid); clr_vector(res); res = tmp; diff --git a/tgt-vvp/vvp_priv.h b/tgt-vvp/vvp_priv.h index 53c31618e..15cf5d886 100644 --- a/tgt-vvp/vvp_priv.h +++ b/tgt-vvp/vvp_priv.h @@ -39,6 +39,12 @@ struct vector_info { unsigned wid; }; +/* + * Convenient constants... 
+ */ + /* Width limit for typical immediate arguments. */ +# define IMM_WID 32 + /* * Mangle all non-symbol characters in an identifier, quotes in names */ diff --git a/vvp/codes.h b/vvp/codes.h index 0e93f72de..0ac52e0c6 100644 --- a/vvp/codes.h +++ b/vvp/codes.h @@ -37,6 +37,7 @@ extern bool of_ADD(vthread_t thr, vvp_code_t code); extern bool of_ADD_WR(vthread_t thr, vvp_code_t code); extern bool of_ADDI(vthread_t thr, vvp_code_t code); extern bool of_AND(vthread_t thr, vvp_code_t code); +extern bool of_ANDI(vthread_t thr, vvp_code_t code); extern bool of_ANDR(vthread_t thr, vvp_code_t code); extern bool of_ASSIGN_AV(vthread_t thr, vvp_code_t code); extern bool of_ASSIGN_AVD(vthread_t thr, vvp_code_t code); @@ -167,7 +168,7 @@ struct vvp_code_s { }; union { - unsigned bit_idx[2]; + uint32_t bit_idx[2]; vvp_net_t *net2; vvp_code_t cptr2; struct ufunc_core*ufunc_core_ptr; diff --git a/vvp/compile.cc b/vvp/compile.cc index d2e1c96d8..4ad8bf17d 100644 --- a/vvp/compile.cc +++ b/vvp/compile.cc @@ -85,6 +85,7 @@ const static struct opcode_table_s opcode_table[] = { { "%addi", of_ADDI, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, { "%and", of_AND, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, { "%and/r", of_ANDR, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, + { "%andi", of_ANDI, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, { "%assign/av",of_ASSIGN_AV,3,{OA_ARR_PTR,OA_BIT1, OA_BIT2} }, { "%assign/av/d",of_ASSIGN_AVD,3,{OA_ARR_PTR,OA_BIT1, OA_BIT2} }, { "%assign/v0",of_ASSIGN_V0,3,{OA_FUNC_PTR,OA_BIT1, OA_BIT2} }, diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt index 84941815c..59bf80097 100644 --- a/vvp/opcodes.txt +++ b/vvp/opcodes.txt @@ -531,7 +531,8 @@ is one of the 4 constant bits, the effect is to replicate the value into the destination vector. This is useful for filling a vector. The %movi variant moves a binary value, LSB first, into the -destination vector. +destination vector. The immediate value is up to 32bits, padded with +zeros to fillout the width. 
* %mul , , diff --git a/vvp/vthread.cc b/vvp/vthread.cc index a7cf69f05..5f82644ba 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -214,6 +214,85 @@ static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr, } } +/* + * Some of the instructions do wide addition to arrays of long. They + * use this add_with_cary function to help. + */ +static inline unsigned long add_with_carry(unsigned long a, unsigned long b, + unsigned long&carry) +{ + unsigned long tmp = b + carry; + unsigned long sum = a + tmp; + carry = 0; + if (tmp < b) + carry = 1; + if (sum < tmp) + carry = 1; + if (sum < a) + carry = 1; + return sum; +} + +static unsigned long multiply_with_carry(unsigned long a, unsigned long b, + unsigned long&carry) +{ + const unsigned long mask = (1UL << (CPU_WORD_BITS/2)) - 1; + unsigned long a0 = a & mask; + unsigned long a1 = (a >> (CPU_WORD_BITS/2)) & mask; + unsigned long b0 = b & mask; + unsigned long b1 = (b >> (CPU_WORD_BITS/2)) & mask; + + unsigned long tmp = a0 * b0; + + unsigned long r00 = tmp & mask; + unsigned long c00 = (tmp >> (CPU_WORD_BITS/2)) & mask; + + tmp = a0 * b1; + + unsigned long r01 = tmp & mask; + unsigned long c01 = (tmp >> (CPU_WORD_BITS/2)) & mask; + + tmp = a1 * b0; + + unsigned long r10 = tmp & mask; + unsigned long c10 = (tmp >> (CPU_WORD_BITS/2)) & mask; + + tmp = a1 * b1; + + unsigned long r11 = tmp & mask; + unsigned long c11 = (tmp >> (CPU_WORD_BITS/2)) & mask; + + unsigned long r1 = c00 + r01 + r10; + unsigned long r2 = (r1 >> (CPU_WORD_BITS/2)) & mask; + r1 &= mask; + r2 += c01 + c10 + r11; + unsigned long r3 = (r2 >> (CPU_WORD_BITS/2)) & mask; + r2 &= mask; + r3 += c11; + r3 &= mask; + + carry = (r3 << (CPU_WORD_BITS/2)) + r2; + return (r1 << (CPU_WORD_BITS/2)) + r00; +} + +static void multiply_array_imm(unsigned long*res, unsigned long*val, + unsigned words, unsigned long imm) +{ + for (unsigned idx = 0 ; idx < words ; idx += 1) + res[idx] = 0; + + for (unsigned mul_idx = 0 ; mul_idx < words ; mul_idx += 1) { + 
unsigned long sum; + unsigned long tmp = multiply_with_carry(val[mul_idx], imm, sum); + + unsigned long carry = 0; + res[mul_idx] = add_with_carry(res[mul_idx], tmp, carry); + for (unsigned add_idx = mul_idx+1 ; add_idx < words ; add_idx += 1) { + res[add_idx] = add_with_carry(res[add_idx], sum, carry); + sum = 0; + } + } +} /* * Create a new thread with the given start address. @@ -454,6 +533,28 @@ bool of_AND(vthread_t thr, vvp_code_t cp) } +bool of_ANDI(vthread_t thr, vvp_code_t cp) +{ + unsigned idx1 = cp->bit_idx[0]; + unsigned long imm = cp->bit_idx[1]; + unsigned wid = cp->number; + + assert(idx1 >= 4); + + vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid); + vvp_vector4_t imv (wid, BIT4_0); + + unsigned trans = wid; + if (trans > CPU_WORD_BITS) + trans = CPU_WORD_BITS; + imv.setarray(0, trans, &imm); + + val &= imv; + + thr->bits4.set_vec(idx1, val); + return true; +} + bool of_ADD(vthread_t thr, vvp_code_t cp) { assert(cp->bit_idx[0] >= 4); @@ -465,19 +566,8 @@ bool of_ADD(vthread_t thr, vvp_code_t cp) unsigned long carry; carry = 0; - for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) { - - unsigned long tmp = lvb[idx] + carry; - unsigned long sum = lva[idx] + tmp; - carry = 0; - if (tmp < lvb[idx]) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < lva[idx]) - carry = 1; - lva[idx] = sum; - } + for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) + lva[idx] = add_with_carry(lva[idx], lvb[idx], carry); /* We know from the vector_to_array that the address is valid in the thr->bitr4 vector, so just do the set bit. 
*/ @@ -525,30 +615,15 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp) unsigned word_count = (bit_width+CPU_WORD_BITS-1)/CPU_WORD_BITS; unsigned long*lva = vector_to_array(thr, bit_addr, bit_width); - unsigned long*lvb = 0; if (lva == 0) goto x_out; - lvb = new unsigned long[word_count]; - - lvb[0] = imm_value; - for (unsigned idx = 1 ; idx < word_count ; idx += 1) - lvb[idx] = 0; unsigned long carry; carry = 0; - for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < bit_width ; idx += 1) { - - unsigned long tmp = lvb[idx] + carry; - unsigned long sum = lva[idx] + tmp; - carry = 0; - if (tmp < lvb[idx]) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < lva[idx]) - carry = 1; - lva[idx] = sum; + for (unsigned idx = 0 ; idx < word_count ; idx += 1) { + lva[idx] = add_with_carry(lva[idx], imm_value, carry); + imm_value = 0; } /* We know from the vector_to_array that the address is valid @@ -557,7 +632,6 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp) thr->bits4.setarray(bit_addr, bit_width, lva); delete[]lva; - delete[]lvb; return true; @@ -1477,297 +1551,268 @@ bool of_DISABLE(vthread_t thr, vvp_code_t cp) return ! disabled_myself_flag; } -static void divide_bits(unsigned len, unsigned char*lbits, - const unsigned char*rbits) +/* + * This function divides a 2-word number {high, a} by a 1-word + * number. Assume that high < b. 
+ */ +static unsigned long divide2words(unsigned long a, unsigned long b, + unsigned long high) { - unsigned char *a, *b, *z, *t; - a = new unsigned char[len+1]; - b = new unsigned char[len+1]; - z = new unsigned char[len+1]; - t = new unsigned char[len+1]; + unsigned long result = 0; + while (high > 0) { + unsigned long tmp_result = ULONG_MAX / b; + unsigned long remain = ULONG_MAX % b; - unsigned char carry; - unsigned char temp; - - int mxa = -1, mxz = -1; - int i; - int current, copylen; - - - for (unsigned idx = 0 ; idx < len ; idx += 1) { - unsigned lb = lbits[idx]; - unsigned rb = rbits[idx]; - - z[idx]=lb; - a[idx]=1-rb; // for 2s complement add.. - - } - z[len]=0; - a[len]=1; - - for(i=0;i<(int)len+1;i++) { - b[i]=0; - } - - for(i=len-1;i>=0;i--) { - if(!a[i]) { - mxa=i; - break; - } - } - - for(i=len-1;i>=0;i--) { - if(z[i]) { - mxz=i; - break; - } - } - - if((mxa>mxz)||(mxa==-1)) { - if(mxa==-1) { - fprintf(stderr, "Division By Zero error, exiting.\n"); - exit(255); + remain += 1; + if (remain >= b) { + remain -= b; + result += 1; } - goto tally; + // Now 0x1_0...0 = b*tmp_result + remain + // high*0x1_0...0 = high*(b*tmp_result + remain) + // high*0x1_0...0 = high*b*tmp_result + high*remain + + // We know that high*0x1_0...0 >= high*b*tmp_result, and + // we know that high*0x1_0...0 > high*remain. Use + // high*remain as the remainder for another iteration, + // and add tmp_result*high into the current estimate of + // the result. + result += tmp_result * high; + + // The new iteration starts with high*remain + a. + remain = multiply_with_carry(high, remain, high); + a = add_with_carry(a, remain, high); + + // Now result*b + {high,a} == the input {high,a}. It is + // possible that the new high >= 1. If so, it will + // certainly be less then high from the previous + // iteration. Do another iteration and it will shrink, + // eventually to 0. 
} copylen = mxa + 2; current = mxz - mxa; + // high is now 0, so a is the remaining remainder, so we can + // finish off the integer divide with a simple a/b. - while(current > -1) { - carry = 1; - for(i=0;i>1); + return result + a/b; +} + +static unsigned long* divide_bits(unsigned long*ap, unsigned long*bp, unsigned wid) +{ + + unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; + + unsigned btop = words-1; + while (btop > 0 && bp[btop] == 0) + btop -= 1; + + // Detect divide by 0, and exit. + if (btop==0 && bp[0]==0) + return 0; + + unsigned long*diff = new unsigned long[words]; + unsigned long*result= new unsigned long[words]; + for (unsigned idx = 0 ; idx < words ; idx += 1) + result[idx] = 0; + + for (unsigned cur = words-btop ; cur > 0 ; cur -= 1) { + unsigned cur_ptr = cur-1; + unsigned long cur_res; + if (ap[cur_ptr+btop] >= bp[btop]) { + cur_res = ap[cur_ptr+btop] / bp[btop]; + + } else if (cur_ptr+btop+1 >= words) { + continue; + + } else if (ap[cur_ptr+btop+1] == 0) { + continue; + + } else { + cur_res = divide2words(ap[cur_ptr+btop], bp[btop], + ap[cur_ptr+btop+1]); } - if(carry) { - for(i=0;i0) Try it, + // and if the difference comes out negative, then adjust + // then. + + multiply_array_imm(diff+cur_ptr, bp, words-cur_ptr, cur_res); + unsigned long carry = 1; + for (unsigned idx = cur_ptr ; idx < words ; idx += 1) + ap[idx] = add_with_carry(ap[idx], ~diff[idx], carry); + + // ap has the diff subtracted out of it. If cur_res was + // too large, then ap will turn negative. (We easily + // tell that ap turned negative by looking at + // carry&1. If it is 0, then it is *negative*.) In that + // case, we know that cur_res was too large by 1. Correct by + // adding 1b back in and reducing cur_res. + if ((carry&1) == 0) { + cur_res -= 1; + carry = 0; + for (unsigned idx = cur_ptr ; idx < words ; idx += 1) + ap[idx] = add_with_carry(ap[idx], bp[idx-cur_ptr], carry); + // The sign *must* have changed again. 
+ assert(carry == 1); } - current--; + result[cur_ptr] = cur_res; } - tally: - for (unsigned idx = 0 ; idx < len ; idx += 1) { - // n.b., z[] has the remainder... - lbits[idx] = b[idx]; - } + // Now ap contains the remainder and result contains the + // desired result. We should find that: + // input-a = bp * result + ap; - delete []t; - delete []z; - delete []b; - delete []a; + delete[]diff; + return result; } bool of_DIV(vthread_t thr, vvp_code_t cp) { - assert(cp->bit_idx[0] >= 4); + unsigned adra = cp->bit_idx[0]; + unsigned adrb = cp->bit_idx[1]; + unsigned wid = cp->number; - if(cp->number <= 8*sizeof(unsigned long)) { - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - unsigned long lv = 0, rv = 0; + assert(adra >= 4); - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - vvp_bit4_t rb = thr_get_bit(thr, idx2); - - if (bit4_is_xz(lb) || bit4_is_xz(rb)) - goto x_out; - - lv |= (unsigned long) lb << idx; - rv |= (unsigned long) rb << idx; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; - } - - if (rv == BIT4_0) - goto x_out; - - lv /= rv; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1) ? BIT4_1 : BIT4_0); - lv >>= 1; - } - - return true; - - } else { - - /* Make a string of the bits of the numbers to be - divided. Then divide them, and write the results into - the thread. */ - unsigned char*lbits = new unsigned char[cp->number]; - unsigned char*rbits = new unsigned char[cp->number]; - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - bool rval_is_zero = true; - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - lbits[idx] = thr_get_bit(thr, idx1); - rbits[idx] = thr_get_bit(thr, idx2); - if ((lbits[idx] | rbits[idx]) > 1) { - delete[]lbits; - delete[]rbits; - goto x_out; - } - - if (rbits[idx] != 0) - rval_is_zero = false; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; - } - - /* Notice the special case of divide by 0. 
*/ - if (rval_is_zero) { - delete[]lbits; - delete[]rbits; - goto x_out; - } - - divide_bits(cp->number, lbits, rbits); - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, lbits[idx]?BIT4_1:BIT4_0); - } - - delete[]lbits; - delete[]rbits; + unsigned long*ap = vector_to_array(thr, adra, wid); + if (ap == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); return true; } - x_out: - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) - thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X); + unsigned long*bp = vector_to_array(thr, adrb, wid); + if (bp == 0) { + delete[]ap; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; + } + // If the value fits in a single CPU word, then do it the easy way. + if (wid <= CPU_WORD_BITS) { + if (bp[0] == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + } else { + ap[0] /= bp[0]; + thr->bits4.setarray(adra, wid, ap); + } + delete[]ap; + delete[]bp; + return true; + } + + unsigned long*result = divide_bits(ap, bp, wid); + if (result == 0) { + delete[]ap; + delete[]bp; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; + } + + // Now ap contains the remainder and result contains the + // desired result. We should find that: + // input-a = bp * result + ap; + + thr->bits4.setarray(adra, wid, result); + delete[]ap; + delete[]bp; + delete[]result; return true; } -static void negate_bits(unsigned len, unsigned char*bits) + +static void negate_words(unsigned long*val, unsigned words) { - unsigned char carry = 1; - for (unsigned idx = 0 ; idx < len ; idx += 1) { - carry += bits[idx]? 
0 : 1; - bits[idx] = carry & 1; - carry >>= 1; - } + unsigned long carry = 1; + for (unsigned idx = 0 ; idx < words ; idx += 1) + val[idx] = add_with_carry(0, ~val[idx], carry); } bool of_DIV_S(vthread_t thr, vvp_code_t cp) { - assert(cp->bit_idx[0] >= 4); + unsigned adra = cp->bit_idx[0]; + unsigned adrb = cp->bit_idx[1]; + unsigned wid = cp->number; + unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS; - if(cp->number <= 8*sizeof(long)) { - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - long lv = 0, rv = 0; + assert(adra >= 4); - unsigned lb = 0; - unsigned rb = 0; - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - lb = thr_get_bit(thr, idx1); - rb = thr_get_bit(thr, idx2); - - if ((lb | rb) & 2) - goto x_out; - - lv |= (long)lb << idx; - rv |= (long)rb << idx; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; - } - - /* Extend the sign to fill the native long. */ - for (unsigned idx = cp->number; idx < (8*sizeof lv); idx += 1) { - lv |= (long)lb << idx; - rv |= (long)rb << idx; - } - - if (rv == 0) - goto x_out; - - lv /= rv; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)?BIT4_1:BIT4_0); - lv >>= 1; - } - - } else { - unsigned char*lbits = new unsigned char[cp->number]; - unsigned char*rbits = new unsigned char[cp->number]; - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - bool rval_is_zero = true; - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - lbits[idx] = thr_get_bit(thr, idx1); - rbits[idx] = thr_get_bit(thr, idx2); - if ((lbits[idx] | rbits[idx]) > 1) { - delete[]lbits; - delete[]rbits; - goto x_out; - } - - if (rbits[idx] != 0) - rval_is_zero = false; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; - } - - /* Notice the special case of divide by 0. 
*/ - if (rval_is_zero) { - delete[]lbits; - delete[]rbits; - goto x_out; - } - - /* Signed division is unsigned division on the absolute - values of the operands, then corrected for the number - of signs. */ - unsigned sign_flag = 0; - if (lbits[cp->number-1]) { - sign_flag += 1; - negate_bits(cp->number, lbits); - } - if (rbits[cp->number-1]) { - sign_flag += 1; - negate_bits(cp->number, rbits); - } - - divide_bits(cp->number, lbits, rbits); - - if (sign_flag & 1) { - negate_bits(cp->number, lbits); - } - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, lbits[idx]?BIT4_1:BIT4_0); - } - - delete[]lbits; - delete[]rbits; + unsigned long*ap = vector_to_array(thr, adra, wid); + if (ap == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; } - return true; + unsigned long*bp = vector_to_array(thr, adrb, wid); + if (bp == 0) { + delete[]ap; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; + } - x_out: - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) - thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X); + unsigned long sign_mask = 0; + if (unsigned long sign_bits = (words*CPU_WORD_BITS) - wid) { + sign_mask = -1UL << (CPU_WORD_BITS-sign_bits); + if (ap[words-1] & (sign_mask>>1)) + ap[words-1] |= sign_mask; + if (bp[words-1] & (sign_mask>>1)) + bp[words-1] |= sign_mask; + } + if (wid <= CPU_WORD_BITS) { + if (bp[0] == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + } else { + long tmpa = (long) ap[0]; + long tmpb = (long) bp[0]; + long res = tmpa / tmpb; + ap[0] = ((unsigned long)res) & ~sign_mask; + thr->bits4.setarray(adra, wid, ap); + } + delete[]ap; + delete[]bp; + return true; + } + + // We need to the actual division to positive integers. Make + // them positive here, and remember the negations. 
+ bool negate_flag = false; + if ( ((long) ap[words-1]) < 0 ) { + negate_flag = true; + negate_words(ap, words); + } + if ( ((long) bp[words-1]) < 0 ) { + negate_flag ^= true; + negate_words(bp, words); + } + + unsigned long*result = divide_bits(ap, bp, wid); + if (result == 0) { + delete[]ap; + delete[]bp; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; + } + + if (negate_flag) { + negate_words(result, words); + } + + result[words-1] &= ~sign_mask; + + thr->bits4.setarray(adra, wid, result); + delete[]ap; + delete[]bp; + delete[]result; return true; } @@ -2442,7 +2487,7 @@ bool of_LOAD_NX(vthread_t thr, vvp_code_t cp) * The functor to read from is the vvp_net_t object pointed to by the * cp->net pointer. */ -vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp) +static vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp) { assert(cp->bit_idx[0] >= 4); assert(cp->bit_idx[1] > 0); @@ -2456,9 +2501,8 @@ vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp) if (sig == 0) { cerr << "%%load/v error: Net arg not a vector signal? " << typeid(*net->fun).name() << endl; + assert(sig); } - assert(sig); - vvp_vector4_t sig_value = sig->vec4_value(); sig_value.resize(wid); @@ -2498,15 +2542,35 @@ bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp) vvp_vector4_t sig_value(wid, BIT4_0); sig_value.copy_bits(load_base(thr, cp)); - /* Add the addend value */ - sig_value += addend; - /* Check the address once, before we scan the vector. 
*/ thr_check_addr(thr, bit+wid-1); + unsigned long*val = sig_value.subarray(0, wid); + if (val == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(bit, tmp); + return true; + } + + unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS; + unsigned long carry = 0; + unsigned long imm = addend; + if (addend >= 0) { + for (unsigned idx = 0 ; idx < words ; idx += 1) { + val[idx] = add_with_carry(val[idx], imm, carry); + imm = 0UL; + } + } else { + for (unsigned idx = 0 ; idx < words ; idx += 1) { + val[idx] = add_with_carry(val[idx], imm, carry); + imm = -1UL; + } + } + /* Copy the vector bits into the bits4 vector. Do the copy directly to skip the excess calls to thr_check_addr. */ - thr->bits4.set_vec(bit, sig_value); + thr->bits4.setarray(bit, wid, val); + delete[]val; return true; } @@ -2908,114 +2972,83 @@ bool of_MOV_WR(vthread_t thr, vvp_code_t cp) bool of_MOVI(vthread_t thr, vvp_code_t cp) { unsigned dst = cp->bit_idx[0]; - unsigned val = cp->bit_idx[1]; + static unsigned long val[8] = {0, 0, 0, 0, 0, 0, 0, 0}; unsigned wid = cp->number; thr_check_addr(thr, dst+wid-1); - for (unsigned idx = 0 ; idx < wid ; idx += 1, val >>= 1) - thr->bits4.set_bit(dst+idx, (val&1)? 
BIT4_1 : BIT4_0); + val[0] = cp->bit_idx[1]; + + while (wid > 0) { + unsigned trans = wid; + if (trans > 8*CPU_WORD_BITS) + trans = 8*CPU_WORD_BITS; + + thr->bits4.setarray(dst, trans, val); + + val[0] = 0; + wid -= trans; + dst += trans; + } return true; } bool of_MUL(vthread_t thr, vvp_code_t cp) { - assert(cp->bit_idx[0] >= 4); - if(cp->number <= 8*sizeof(unsigned long)) { + unsigned adra = cp->bit_idx[0]; + unsigned adrb = cp->bit_idx[1]; + unsigned wid = cp->number; - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - unsigned long lv = 0, rv = 0; + assert(adra >= 4); - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - vvp_bit4_t rb = thr_get_bit(thr, idx2); - - if (bit4_is_xz(lb) || bit4_is_xz(rb)) - goto x_out; - - lv |= (unsigned long) lb << idx; - rv |= (unsigned long) rb << idx; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; + unsigned long*ap = vector_to_array(thr, adra, wid); + if (ap == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; } - lv *= rv; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1) ? BIT4_1 : BIT4_0); - lv >>= 1; + unsigned long*bp = vector_to_array(thr, adrb, wid); + if (bp == 0) { + delete[]ap; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; } - return true; - } else { - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; + // If the value fits in a single CPU word, then do it the easy way. 
+ if (wid <= CPU_WORD_BITS) { + ap[0] *= bp[0]; + thr->bits4.setarray(adra, wid, ap); + delete[]ap; + delete[]bp; + return true; + } - unsigned char *a, *b, *sum; - a = new unsigned char[cp->number]; - b = new unsigned char[cp->number]; - sum = new unsigned char[cp->number]; + unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; + unsigned long*res = new unsigned long[words]; + for (unsigned idx = 0 ; idx < words ; idx += 1) + res[idx] = 0; - int mxa = -1; - int mxb = -1; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - vvp_bit4_t rb = thr_get_bit(thr, idx2); - - if (bit4_is_xz(lb) || bit4_is_xz(rb)) - { - delete[]sum; - delete[]b; - delete[]a; - goto x_out; + for (unsigned mul_a = 0 ; mul_a < words ; mul_a += 1) { + for (unsigned mul_b = 0 ; mul_b < words ; mul_b += 1) { + unsigned long sum; + unsigned long tmp = multiply_with_carry(ap[mul_a], bp[mul_b], sum); + unsigned base = mul_a + mul_b; + unsigned long carry = 0; + res[base] = add_with_carry(res[base], tmp, carry); + for (unsigned add_idx = base+1; add_idx < words; add_idx += 1) { + res[add_idx] = add_with_carry(res[add_idx], sum, carry); + sum = 0; } - - if((a[idx] = lb)) mxa=idx+1; - if((b[idx] = rb)) mxb=idx; - sum[idx]=0; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; + } } -// do "unsigned ZZ sum = a * b" the hard way.. 
- for(int i=0;i<=mxb;i++) - { - if(b[i]) - { - unsigned char carry=0; - unsigned char temp; - - for(int j=0;j<=mxa;j++) - { - if(i+j>=(int)cp->number) break; - temp=sum[i+j]+a[j]+carry; - sum[i+j]=(temp&1); - carry=(temp>>1); - } - } - } - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, sum[idx]?BIT4_1:BIT4_0); - } - - delete[]sum; - delete[]b; - delete[]a; - return true; - } - - x_out: - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) - thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X); - + thr->bits4.setarray(adra, wid, res); + delete[]ap; + delete[]bp; + delete[]res; return true; } @@ -3030,101 +3063,36 @@ bool of_MUL_WR(vthread_t thr, vvp_code_t cp) bool of_MULI(vthread_t thr, vvp_code_t cp) { - assert(cp->bit_idx[0] >= 4); + unsigned adr = cp->bit_idx[0]; + unsigned long imm = cp->bit_idx[1]; + unsigned wid = cp->number; - /* If the value fits into a native unsigned long, then make an - unsigned long variable with the numbers, to a native - multiply, and work with that. */ - - if(cp->number <= 8*sizeof(unsigned long)) { - unsigned idx1 = cp->bit_idx[0]; - unsigned long lv = 0, rv = cp->bit_idx[1]; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - - if (bit4_is_xz(lb)) - goto x_out; - - lv |= (unsigned long) lb << idx; - - idx1 += 1; - } - - lv *= rv; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)? BIT4_1 : BIT4_0); - lv >>= 1; - } + assert(adr >= 4); + unsigned long*val = vector_to_array(thr, adr, wid); + // If there are X bits in the value, then return X. + if (val == 0) { + vvp_vector4_t tmp(cp->number, BIT4_X); + thr->bits4.set_vec(cp->bit_idx[0], tmp); return true; } - /* number is too large for local long, so do bitwise - multiply. 
*/ - - unsigned idx1; idx1 = cp->bit_idx[0]; - unsigned imm; imm = cp->bit_idx[1]; - - unsigned char *a, *b, *sum; - a = new unsigned char[cp->number]; - b = new unsigned char[cp->number]; - sum = new unsigned char[cp->number]; - - int mxa; mxa = -1; - int mxb; mxb = -1; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - vvp_bit4_t rb = (imm & 1)? BIT4_1 : BIT4_0; - - imm >>= 1; - - if (bit4_is_xz(lb)) { - delete[]sum; - delete[]b; - delete[]a; - goto x_out; - } - - if((a[idx] = lb)) mxa=idx+1; - if((b[idx] = rb)) mxb=idx; - sum[idx]=0; - - idx1 += 1; + // If everything fits in a word, then do it the easy way. + if (wid <= CPU_WORD_BITS) { + val[0] *= imm; + thr->bits4.setarray(adr, wid, val); + delete[]val; + return true; } -// do "unsigned ZZ sum = a * b" the hard way.. - for(int i=0;i<=mxb;i++) { - if(b[i]) { - unsigned char carry=0; - unsigned char temp; + unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; + unsigned long*res = new unsigned long[words]; - for(int j=0;j<=mxa;j++) { - if(i+j>=(int)cp->number) break; - temp=sum[i+j]+a[j]+carry; - sum[i+j]=(temp&1); - carry=(temp>>1); - } - } - } - - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, sum[idx]?BIT4_1:BIT4_0); - } - - delete[]sum; - delete[]b; - delete[]a; - - return true; - - x_out: - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) - thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X); + multiply_array_imm(res, val, words, imm); + thr->bits4.setarray(adr, wid, res); + delete[]val; + delete[]res; return true; } @@ -3754,20 +3722,10 @@ bool of_SUB(vthread_t thr, vvp_code_t cp) goto x_out; - unsigned carry; + unsigned long carry; carry = 1; - for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) { - unsigned long tmp = ~lvb[idx] + carry; - unsigned long sum = tmp + lva[idx]; - carry = 0; - if (tmp < ~lvb[idx]) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < lva[idx]) - carry = 1; - 
lva[idx] = sum; - } + for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) + lva[idx] = add_with_carry(lva[idx], ~lvb[idx], carry); /* We know from the vector_to_array that the address is valid @@ -3802,34 +3760,17 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp) assert(cp->bit_idx[0] >= 4); unsigned word_count = (cp->number+CPU_WORD_BITS-1)/CPU_WORD_BITS; - + unsigned long imm = cp->bit_idx[1]; unsigned long*lva = vector_to_array(thr, cp->bit_idx[0], cp->number); - unsigned long*lvb; if (lva == 0) goto x_out; - lvb = new unsigned long[word_count]; - - - lvb[0] = cp->bit_idx[1]; - lvb[0] = ~lvb[0]; - for (unsigned idx = 1 ; idx < word_count ; idx += 1) - lvb[idx] = ~0UL; unsigned long carry; carry = 1; - for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) { - - unsigned long tmp = lvb[idx] + carry; - unsigned long sum = lva[idx] + tmp; - carry = 0UL; - if (tmp < lvb[idx]) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < lva[idx]) - carry = 1; - lva[idx] = sum; + for (unsigned idx = 0 ; idx < word_count ; idx += 1) { + lva[idx] = add_with_carry(lva[idx], ~imm, carry); + imm = 0UL; } /* We know from the vector_to_array that the address is valid @@ -3838,7 +3779,6 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp) thr->bits4.setarray(cp->bit_idx[0], cp->number, lva); delete[]lva; - delete[]lvb; return true; diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc index 9f9a50e55..dfc77fb40 100644 --- a/vvp/vvp_net.cc +++ b/vvp/vvp_net.cc @@ -61,28 +61,6 @@ vvp_bit4_t add_with_carry(vvp_bit4_t a, vvp_bit4_t b, vvp_bit4_t&c) } } -vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b) -{ - if (a == BIT4_0) - return BIT4_0; - if (b == BIT4_0) - return BIT4_0; - if (bit4_is_xz(a)) - return BIT4_X; - if (bit4_is_xz(b)) - return BIT4_X; - return BIT4_1; -} - -vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b) -{ - if (a == BIT4_1) - return BIT4_1; - if (b == BIT4_1) - return BIT4_1; - return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) ); -} - vvp_bit4_t operator 
^ (vvp_bit4_t a, vvp_bit4_t b) { if (bit4_is_xz(a)) diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h index 3d4f6b881..18548c157 100644 --- a/vvp/vvp_net.h +++ b/vvp/vvp_net.h @@ -87,8 +87,21 @@ inline vvp_bit4_t bit4_z2x(vvp_bit4_t a) inline vvp_bit4_t operator ~ (vvp_bit4_t a) { return bit4_z2x((vvp_bit4_t) (((int)a) ^ 1)); } -extern vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b); -extern vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b); +inline vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b) +{ + if (a==BIT4_1 || b==BIT4_1) + return BIT4_1; + return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) ); +} + +inline vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b) +{ + if (a==BIT4_0 || b==BIT4_0) + return BIT4_0; + return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) ); +} + + extern vvp_bit4_t operator ^ (vvp_bit4_t a, vvp_bit4_t b); extern ostream& operator<< (ostream&o, vvp_bit4_t a); @@ -294,41 +307,47 @@ inline void vvp_vector4_t::set_bit(unsigned idx, vvp_bit4_t val) assert(idx < size_); unsigned long off = idx % BITS_PER_WORD; - unsigned long amask = 0, bmask = 0; - switch (val) { - case BIT4_0: - amask = 0; - bmask = 0; - break; - case BIT4_1: - amask = 1; - bmask = 0; - break; - case BIT4_X: - amask = 1; - bmask = 1; - break; - case BIT4_Z: - amask = 0; - bmask = 1; - break; - } - unsigned long mask = 1UL << off; - amask <<= off; - bmask <<= off; if (size_ > BITS_PER_WORD) { unsigned wdx = idx / BITS_PER_WORD; - abits_ptr_[wdx] &= ~mask; - abits_ptr_[wdx] |= amask; - bbits_ptr_[wdx] &= ~mask; - bbits_ptr_[wdx] |= bmask; + switch (val) { + case BIT4_0: + abits_ptr_[wdx] &= ~mask; + bbits_ptr_[wdx] &= ~mask; + break; + case BIT4_1: + abits_ptr_[wdx] |= mask; + bbits_ptr_[wdx] &= ~mask; + break; + case BIT4_X: + abits_ptr_[wdx] |= mask; + bbits_ptr_[wdx] |= mask; + break; + case BIT4_Z: + abits_ptr_[wdx] &= ~mask; + bbits_ptr_[wdx] |= mask; + break; + } } else { - abits_val_ &= ~mask; - abits_val_ |= amask; - bbits_val_ &= ~mask; - bbits_val_ |= bmask; + switch (val) 
{ + case BIT4_0: + abits_val_ &= ~mask; + bbits_val_ &= ~mask; + break; + case BIT4_1: + abits_val_ |= mask; + bbits_val_ &= ~mask; + break; + case BIT4_X: + abits_val_ |= mask; + bbits_val_ |= mask; + break; + case BIT4_Z: + abits_val_ &= ~mask; + bbits_val_ |= mask; + break; + } } }