Optimize load-add with load/add instruction

Where an expression is an immediate value added to a signal value, it is possible to optimize the pair into a single instruction that performs the load and the add at the same time.
2007-12-04 19:15:15 -08:00 · 2007-12-04 19:15:15 -08:00 · 8f519531f3
parent 68a9526fec
commit 8f519531f3
6 changed files with 163 additions and 4 deletions
--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@ -27,6 +27,8 @@

 static void draw_eval_expr_dest(ivl_expr_t exp, struct vector_info dest,
 				int ok_flags);
+static void draw_signal_dest(ivl_expr_t exp, struct vector_info res,
+			     int add_index);

 int number_is_unknown(ivl_expr_t ex)
 {
@ -998,6 +1000,28 @@ static struct vector_info draw_binary_expr_lrs(ivl_expr_t exp, unsigned wid)
      return lv;
 }

+/*
+ * Draw the sum of a signal and an immediate value. The immediate
+ * (re) is placed in index word 0, then the signal (le) is loaded
+ * into a freshly allocated vector of wid bits with the add_index
+ * argument set to 0, which asks draw_signal_dest for the load that
+ * adds word 0 to the value as it is loaded.
+ */
+static struct vector_info draw_load_add_immediate(ivl_expr_t le,
+						  ivl_expr_t re,
+						  unsigned wid)
+{
+      struct vector_info res;
+      unsigned long addend = get_number_immediate(re);
+
+	/* Put the immediate value into word register 0. */
+      fprintf(vvp_out, "  %%ix/load 0, %lu;\n", addend);
+
+      res.wid = wid;
+      res.base = allocate_vector(wid);
+
+	/* Load the signal, adding index word 0 on the way in. */
+      draw_signal_dest(le, res, 0);
+
+      return res;
+}
+
 static struct vector_info draw_add_immediate(ivl_expr_t le,
 					     ivl_expr_t re,
 					     unsigned wid)
@ -1098,6 +1122,26 @@ static struct vector_info draw_binary_expr_arith(ivl_expr_t exp, unsigned wid)

      const char*sign_string = ivl_expr_signed(exp)? "/s" : "";

+      if ((ivl_expr_opcode(exp) == '+')
+	  && (ivl_expr_type(le) == IVL_EX_SIGNAL)
+	  && (ivl_expr_type(re) == IVL_EX_ULONG))
+	    return draw_load_add_immediate(le, re, wid);
+
+      if ((ivl_expr_opcode(exp) == '+')
+	  && (ivl_expr_type(le) == IVL_EX_SIGNAL)
+	  && (ivl_expr_type(re) == IVL_EX_NUMBER))
+	    return draw_load_add_immediate(le, re, wid);
+
+      if ((ivl_expr_opcode(exp) == '+')
+	  && (ivl_expr_type(re) == IVL_EX_SIGNAL)
+	  && (ivl_expr_type(le) == IVL_EX_ULONG))
+	    return draw_load_add_immediate(re, le, wid);
+
+      if ((ivl_expr_opcode(exp) == '+')
+	  && (ivl_expr_type(re) == IVL_EX_SIGNAL)
+	  && (ivl_expr_type(le) == IVL_EX_NUMBER))
+	    return draw_load_add_immediate(re, le, wid);
+
      if ((ivl_expr_opcode(exp) == '+')
 	  && (ivl_expr_type(re) == IVL_EX_ULONG))
 	    return draw_add_immediate(le, re, wid);
@ -1663,8 +1707,12 @@ static void pad_expr_in_place(ivl_expr_t exp, struct vector_info res, unsigned s
 * into the thread bits. Remember to account for the part select by
 * offsetting the read from the lsi (least significant index) of the
 * signal.
+ *
+ * If the add_index is >=0, then generate a %load/vp0 to add the
+ * word0 value to the loaded value before storing it into the destination.
 */
-static void draw_signal_dest(ivl_expr_t exp, struct vector_info res)
+static void draw_signal_dest(ivl_expr_t exp, struct vector_info res,
+			     int add_index)
 {
      unsigned swid = ivl_expr_width(exp);
      ivl_signal_t sig = ivl_expr_signal(exp);
@ -1679,6 +1727,7 @@ static void draw_signal_dest(ivl_expr_t exp, struct vector_info res)
      if (ivl_signal_array_count(sig) > 1) {
 	    ivl_expr_t ix = ivl_expr_oper1(exp);
 	    if (!number_is_immediate(ix, 8*sizeof(unsigned long))) {
+		  assert(add_index < 0);
 		  draw_eval_expr_into_integer(ix, 3);
 		  fprintf(vvp_out, "   %%load/av %u, v%p, %u;\n",
 			  res.base, sig, swid);
@ -1694,11 +1743,20 @@ static void draw_signal_dest(ivl_expr_t exp, struct vector_info res)

      if (ivl_signal_data_type(sig) == IVL_VT_REAL) {

+	    assert(add_index < 0);
 	    int tmp = allocate_word();
 	    fprintf(vvp_out, " %%load/wr %d, v%p_%u;\n", tmp, sig, word);
 	    fprintf(vvp_out, " %%cvt/vr %u, %d, %u;\n", res.base, tmp, res.wid);
 	    clr_word(tmp);

+      } else if (add_index >= 0) {
+
+	    assert(add_index == 0);
+
+	      /* If this is a REG (a variable) then I can do a vector read. */
+	    fprintf(vvp_out, "    %%load/vp0 %u, v%p_%u, %u;\n",
+		    res.base, sig, word, swid);
+
      } else {

 	      /* If this is a REG (a variable) then I can do a vector read. */
@ -1730,7 +1788,7 @@ static struct vector_info draw_signal_expr(ivl_expr_t exp, unsigned wid,
      res.wid  = wid;
      save_expression_lookaside(res.base, exp, wid);

-      draw_signal_dest(exp, res);
+      draw_signal_dest(exp, res, -1);
      return res;
 }

@ -2232,7 +2290,7 @@ static void draw_eval_expr_dest(ivl_expr_t exp, struct vector_info dest,
      switch (ivl_expr_type(exp)) {

 	  case IVL_EX_SIGNAL:
-	    draw_signal_dest(exp, dest);
+	    draw_signal_dest(exp, dest, -1);
 	    return;

 	  default:
--- a/vvp/codes.h
+++ b/vvp/codes.h
@ -95,6 +95,7 @@ extern bool of_LOAD_AVX_P(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_MV(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_NX(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VEC(vthread_t thr, vvp_code_t code);
+extern bool of_LOAD_VP0(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_WR(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_X(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_XP(vthread_t thr, vvp_code_t code);
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@ -141,6 +141,7 @@ const static struct opcode_table_s opcode_table[] = {
      { "%load/mv",of_LOAD_MV,3,  {OA_BIT1,     OA_MEM_PTR,  OA_BIT2} },
      { "%load/nx",of_LOAD_NX,3,  {OA_BIT1,     OA_VPI_PTR,  OA_BIT2} },
      { "%load/v", of_LOAD_VEC,3, {OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
+      { "%load/vp0",of_LOAD_VP0,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
      { "%load/wr",of_LOAD_WR,2,  {OA_BIT1,     OA_VPI_PTR,  OA_BIT2} },
      { "%load/x", of_LOAD_X, 3,  {OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
      { "%load/x.p",of_LOAD_XP, 3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@ -2188,13 +2188,14 @@ bool of_LOAD_NX(vthread_t thr, vvp_code_t cp)
 * The functor to read from is the vvp_net_t object pointed to by the
 * cp->net pointer.
 */
-bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
+vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
 {
      assert(cp->bit_idx[0] >= 4);
      assert(cp->bit_idx[1] > 0);

      unsigned bit = cp->bit_idx[0];
      unsigned wid = cp->bit_idx[1];
+      int64_t addend = thr->words[0].w_int;
      vvp_net_t*net = cp->net;

 	/* For the %load to work, the functor must actually be a
@ -2209,6 +2210,40 @@ bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
      vvp_vector4_t sig_value = sig->vec4_value();
      sig_value.resize(wid);

+      return sig_value;
+}
+
+/*
+ * Fetch the value via load_base() and copy it into the thread bits,
+ * starting at the bit address given by the first operand and
+ * covering the width given by the second.
+ */
+bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
+{
+      const unsigned dst_base = cp->bit_idx[0];
+      const unsigned dst_wid  = cp->bit_idx[1];
+
+      vvp_vector4_t loaded = load_base(thr, cp);
+
+	/* One address check for the whole destination range, made
+	   up front instead of once per bit. */
+      const unsigned dst_last = dst_base + dst_wid - 1;
+      thr_check_addr(thr, dst_last);
+
+	/* Write the whole vector into bits4 in a single call, which
+	   skips the per-bit calls to thr_check_addr. */
+      thr->bits4.set_vec(dst_base, loaded);
+
+      return true;
+}
+
+/*
+* This is like of_LOAD_VEC, but includes an add of an integer value.
+*/
+bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
+{
+      unsigned bit = cp->bit_idx[0];
+      unsigned wid = cp->bit_idx[1];
+      int64_t addend = thr->words[0].w_int;
+
+      vvp_vector4_t sig_value = load_base(thr, cp);
+
+	/* Add the addend value */
+      sig_value += addend;
+
 	/* Check the address once, before we scan the vector. */
      thr_check_addr(thr, bit+wid-1);

--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@ -557,6 +557,31 @@ bool vvp_vector4_t::eeq(const vvp_vector4_t&that) const
      return true;
 }

+/*
+ * Return true if there is an X or Z anywhere in the vector.
+ *
+ * Each storage word is masked with WORD_X_BITS (shifted down to
+ * cover only the bits in use when the word is partially filled),
+ * and the vector has an X or Z if any masked bit is set.
+ */
+bool vvp_vector4_t::has_xz() const
+{
+	/* An empty vector has no bits, so it cannot hold an X or Z.
+	   Returning here also avoids shifting the mask by the full
+	   word width in the next test. */
+      if (size_ == 0)
+	    return false;
+
+      if (size_ < BITS_PER_WORD) {
+	    unsigned long mask = WORD_X_BITS >> 2*(BITS_PER_WORD - size_);
+	    return 0 != (bits_val_&mask);
+      }
+
+      if (size_ == BITS_PER_WORD) {
+	    return 0 != (bits_val_&WORD_X_BITS);
+      }
+
+	/* Wider than one word: check all the completely filled
+	   words first. */
+      unsigned words = size_ / BITS_PER_WORD;
+      for (unsigned idx = 0 ; idx < words ; idx += 1) {
+	    if (bits_ptr_[idx] & WORD_X_BITS)
+		  return true;
+      }
+
+	/* Then check the leftover bits in the last, partial word.
+	   The parentheses matter: != binds tighter than &, so
+	   without them the mask would be applied to the result of
+	   the comparison instead of to the word. */
+      unsigned long tail = size_%BITS_PER_WORD;
+      if (tail > 0) {
+	    unsigned long mask = WORD_X_BITS >> 2*(BITS_PER_WORD - tail);
+	    return 0 != (bits_ptr_[words]&mask);
+      }
+
+      return false;
+}

 void vvp_vector4_t::change_z2x()
 {
@ -602,6 +627,40 @@ char* vvp_vector4_t::as_string(char*buf, size_t buf_len)
      return res;
 }

+/*
+* Add the integer <that> to this vector in place. The add is done
+* one bit at a time, so there is no limit on the vector size. If
+* the vector has any X or Z bit, the entire result is set to X.
+*/
+vvp_vector4_t& vvp_vector4_t::operator += (int64_t that)
+{
+      if (has_xz()) {
+	    vvp_vector4_t all_x (size(), BIT4_X);
+	    *this = all_x;
+	    return *this;
+      }
+
+      vvp_bit4_t carry = BIT4_0;
+
+      for (unsigned pos = 0 ; pos < size() ; pos += 1) {
+	      /* Nothing left to add and no carry pending: done. */
+	    if (that == 0 && carry==BIT4_0)
+		  break;
+
+	      /* Peel the low bit off the addend. */
+	    vvp_bit4_t addend_bit = (that&1)? BIT4_1 : BIT4_0;
+	    that >>= 1;
+
+	      /* Adding a 0 with no carry leaves this bit alone. */
+	    if (addend_bit != BIT4_0 || carry != BIT4_0) {
+		  vvp_bit4_t cur = value(pos);
+		  cur = add_with_carry(cur, addend_bit, carry);
+		  set_bit(pos, cur);
+	    }
+      }
+
+      return *this;
+}
+
 ostream& operator<< (ostream&out, const vvp_vector4_t&that)
 {
      out << that.size() << "'b";
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@ -121,12 +121,17 @@ class vvp_vector4_t {
 	// Test that the vectors are exactly equal
      bool eeq(const vvp_vector4_t&that) const;

+	// Return true if there is an X or Z anywhere in the vector.
+      bool has_xz() const;
+
 	// Change all Z bits to X bits.
      void change_z2x();

 	// Display the value into the buf as a string.
      char*as_string(char*buf, size_t buf_len);

+      vvp_vector4_t& operator += (int64_t);
+
    private:
 	// Number of vvp_bit4_t bits that can be shoved into a word.
      enum { BITS_PER_WORD = 8*sizeof(unsigned long)/2 };