Performance optimizations

For the %mov instruction, implement a vvp_vector4_t::mov method to manipulate the thread vector directly. For the %load/v instruction, rework the vec4_value() methods to avoid creating vvp_vector4_t temporaries, and therefore reduce the copy overhead.
2009-11-20 17:54:48 -08:00 · 2009-11-20 17:54:48 -08:00 · 971179d617
parent 0fc136fad9
commit 971179d617
8 changed files with 121 additions and 42 deletions
--- a/vvp/array.cc
+++ b/vvp/array.cc
@ -931,7 +931,8 @@ vvp_vector4_t array_get_word(vvp_array_t arr, unsigned address)
      vvp_signal_value*sig = dynamic_cast<vvp_signal_value*> (vsig->node->fil);
      assert(sig);

-      vvp_vector4_t val = sig->vec4_value();
+      vvp_vector4_t val;
+      sig->vec4_value(val);
      return val;
 }

--- a/vvp/vpi_signal.cc
+++ b/vvp/vpi_signal.cc
@ -304,9 +304,11 @@ static void format_vpiDecStrVal(vvp_signal_value*sig, int base, unsigned wid,

      vvp_vector4_t vec4;
      if (base == 0 && end == ssize) {
-	    vec4 = sig->vec4_value();
+	    sig->vec4_value(vec4);
      } else {
-	    vec4 = sig->vec4_value().subvalue(base, wid);
+	    vvp_vector4_t tmp;
+	    sig->vec4_value(tmp);
+	    vec4 = tmp.subvalue(base, wid);
      }

      vpip_vec4_to_dec_str(vec4, rbuf, hwid, signed_flag);
@ -317,7 +319,9 @@ static void format_vpiDecStrVal(vvp_signal_value*sig, int base, unsigned wid,
 static void format_vpiIntVal(vvp_signal_value*sig, int base, unsigned wid,
                             int signed_flag, s_vpi_value*vp)
 {
-      vvp_vector4_t sub = sig->vec4_value().subvalue(base, wid);
+      vvp_vector4_t tmp;
+      sig->vec4_value(tmp);
+      vvp_vector4_t sub = tmp.subvalue(base, wid);
      long val = 0;
      vector4_to_value(sub, val, signed_flag, false);
      vp->value.integer = val;
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@ -2688,7 +2688,8 @@ bool of_IX_GETV(vthread_t thr, vvp_code_t cp)
      }
      assert(sig);

-      vvp_vector4_t vec = sig->vec4_value();
+      vvp_vector4_t vec;
+      sig->vec4_value(vec);
      unsigned long val;
      bool known_flag = vector4_to_value(vec, val);

@ -2717,7 +2718,8 @@ bool of_IX_GETVS(vthread_t thr, vvp_code_t cp)
      }
      assert(sig);

-      vvp_vector4_t vec = sig->vec4_value();
+      vvp_vector4_t vec;
+      sig->vec4_value(vec);
      long val;
      bool known_flag = vector4_to_value(vec, val, true, true);

@ -3055,7 +3057,7 @@ bool of_LOAD_AVX_P(vthread_t thr, vvp_code_t cp)
 * The functor to read from is the vvp_net_t object pointed to by the
 * cp->net pointer.
 */
-static vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
+static void load_base(vthread_t thr, vvp_code_t cp, vvp_vector4_t&dst)
 {
      vvp_net_t*net = cp->net;

@ -3068,7 +3070,7 @@ static vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
 	    assert(sig);
      }

-      return sig->vec4_value();
+      sig->vec4_value(dst);
 }

 bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
@ -3076,7 +3078,8 @@ bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
      unsigned bit = cp->bit_idx[0];
      unsigned wid = cp->bit_idx[1];

-      vvp_vector4_t sig_value = load_base(thr, cp);
+      vvp_vector4_t sig_value;
+      load_base(thr, cp, sig_value);

 	/* Check the address once, before we scan the vector. */
      thr_check_addr(thr, bit+wid-1);
@ -3108,7 +3111,10 @@ bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
        /* We need a vector this wide to make the math work correctly.
         * Copy the base bits into the vector, but keep the width. */
      vvp_vector4_t sig_value(wid, BIT4_0);
-      sig_value.copy_bits(load_base(thr, cp));
+
+      vvp_vector4_t tmp;
+      load_base(thr, cp, tmp);
+      sig_value.copy_bits(tmp);

      load_vp0_common(thr, cp, sig_value);
      return true;
@ -3118,7 +3124,8 @@ bool of_LOAD_VP0_S(vthread_t thr, vvp_code_t cp)
 {
      unsigned wid = cp->bit_idx[1];

-      vvp_vector4_t tmp (load_base(thr, cp));
+      vvp_vector4_t tmp;
+      load_base(thr, cp, tmp);

        /* We need a vector this wide to make the math work correctly.
         * Copy the base bits into the vector, but keep the width. */
@ -3483,10 +3490,8 @@ static bool of_MOV_(vthread_t thr, vvp_code_t cp)

      thr_check_addr(thr, cp->bit_idx[0]+cp->number-1);
      thr_check_addr(thr, cp->bit_idx[1]+cp->number-1);
-	// Read the source vector out
-      vvp_vector4_t tmp (thr->bits4, cp->bit_idx[1], cp->number);
-	// Write it in the new place.
-      thr->bits4.set_vec(cp->bit_idx[0], tmp);
+
+      thr->bits4.mov(cp->bit_idx[0], cp->bit_idx[1], cp->number);

      return true;
 }
@ -4262,15 +4267,17 @@ bool of_SHIFTR_I0(vthread_t thr, vvp_code_t cp)
 	    vvp_vector4_t tmp (wid, BIT4_X);
 	    thr->bits4.set_vec(base, tmp);

+      } else if (shift > wid) {
+	      // Shift so far that the entire vector is shifted out.
+	    vvp_vector4_t tmp (wid, BIT4_0);
+	    thr->bits4.set_vec(base, tmp);
+
      } else if (shift > 0) {
-	    unsigned idx;
-	    for (idx = 0 ;  (idx+shift) < wid ;  idx += 1) {
-		  unsigned src = base + idx + shift;
-		  unsigned dst = base + idx;
-		  thr_put_bit(thr, dst, thr_get_bit(thr, src));
-	    }
-	    for ( ;  idx < wid ;  idx += 1)
-		  thr_put_bit(thr, base+idx, BIT4_0);
+	      // The mov method should handle overlapped source/dest
+	    thr->bits4.mov(base, base+shift, wid-shift);
+
+	    vvp_vector4_t tmp (shift, BIT4_0);
+	    thr->bits4.set_vec(base+wid-shift, tmp);

      } else if (shift < -(long)wid) {
 	      // Negative shift is so far that all the value is shifted out.
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@ -1131,6 +1131,63 @@ void vvp_vector4_t::set_vec(unsigned adr, const vvp_vector4_t&that)

 }

+/*
+ * Copy cnt bits from bit position src to bit position dst within this
+ * same vector. The abits and bbits words are updated together, so the
+ * moved bits keep their 4-state encoding.
+ *
+ * Both [src, src+cnt) and [dst, dst+cnt) must lie inside the vector;
+ * this is checked with assert. The two ranges are allowed to overlap
+ * in either direction.
+ */
+void vvp_vector4_t::mov(unsigned dst, unsigned src, unsigned cnt)
+{
+      assert(dst+cnt <= size_);
+      assert(src+cnt <= size_);
+
+	// Nothing to move. Leaving early also guarantees that, below,
+	// src and dst are strictly less than the word size in the
+	// single word case, so the shifts by src/dst are well defined.
+      if (cnt == 0)
+	    return;
+
+      if (size_ <= BITS_PER_WORD) {
+	      // Shifting 1UL by the full word width is undefined, so the
+	      // all-ones mask is spelled out for that case.
+	    unsigned long vmask = (cnt < BITS_PER_WORD)
+		  ? (1UL << cnt) - 1UL
+		  : ~0UL;
+	    unsigned long tmp;
+
+	      // The source bits are read out completely before the
+	      // destination is written, so overlap is harmless here.
+	    tmp = (abits_val_ >> src) & vmask;
+	    abits_val_ &= ~ (vmask << dst);
+	    abits_val_ |= tmp << dst;
+
+	    tmp = (bbits_val_ >> src) & vmask;
+	    bbits_val_ &= ~ (vmask << dst);
+	    bbits_val_ |= tmp << dst;
+
+      } else if (dst > src && dst < src+cnt) {
+	      // The destination starts inside the source range. The
+	      // low-to-high loop below would overwrite source bits
+	      // before they are read, so go through a temporary copy
+	      // of the source bits instead.
+	    vvp_vector4_t tmp (*this, src, cnt);
+	    set_vec(dst, tmp);
+
+      } else {
+	      // Word index and bit offset of the current source and
+	      // destination positions.
+	    unsigned sptr = src / BITS_PER_WORD;
+	    unsigned dptr = dst / BITS_PER_WORD;
+	    unsigned soff = src % BITS_PER_WORD;
+	    unsigned doff = dst % BITS_PER_WORD;
+
+	    while (cnt > 0) {
+		    // Move as many bits as possible without crossing a
+		    // word boundary on either the source or the
+		    // destination side.
+		  unsigned trans = cnt;
+		  if ((soff+trans) > BITS_PER_WORD)
+			trans = BITS_PER_WORD - soff;
+
+		  if ((doff+trans) > BITS_PER_WORD)
+			trans = BITS_PER_WORD - doff;
+
+		    // trans is a full word when both offsets are zero
+		    // and at least a word remains; 1UL << trans would be
+		    // undefined then, so use the all-ones mask.
+		  unsigned long vmask = (trans < BITS_PER_WORD)
+			? (1UL << trans) - 1UL
+			: ~0UL;
+		  unsigned long tmp;
+
+		  tmp = (abits_ptr_[sptr] >> soff) & vmask;
+		  abits_ptr_[dptr] &= ~ (vmask << doff);
+		  abits_ptr_[dptr] |= tmp << doff;
+
+		  tmp = (bbits_ptr_[sptr] >> soff) & vmask;
+		  bbits_ptr_[dptr] &= ~ (vmask << doff);
+		  bbits_ptr_[dptr] |= tmp << doff;
+
+		  cnt -= trans;
+		  soff += trans;
+		  if (soff >= BITS_PER_WORD) {
+			soff = 0;
+			sptr += 1;
+		  }
+		  doff += trans;
+		  if (doff >= BITS_PER_WORD) {
+			doff = 0;
+			dptr += 1;
+		  }
+	    }
+      }
+}
+
 bool vvp_vector4_t::eeq(const vvp_vector4_t&that) const
 {
      if (size_ != that.size_)
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@ -234,6 +234,9 @@ class vvp_vector4_t {
        // Get the bits from another vector, but keep my size.
      void copy_bits(const vvp_vector4_t&that);

+	// Move bits within this vector.
+      void mov(unsigned dst, unsigned src, unsigned cnt);
+
 	// Test that the vectors are exactly equal
      bool eeq(const vvp_vector4_t&that) const;

--- a/vvp/vvp_net_sig.cc
+++ b/vvp/vvp_net_sig.cc
@ -380,17 +380,19 @@ vvp_scalar_t vvp_fun_signal4_aa::scalar_value(unsigned idx) const
      return vvp_scalar_t(bits4->value(idx), 6, 6);
 }

-vvp_vector4_t vvp_fun_signal4_aa::vec4_value() const
+void vvp_fun_signal4_aa::vec4_value(vvp_vector4_t&val) const
 {
      vvp_vector4_t*bits4 = static_cast<vvp_vector4_t*>
            (vthread_get_rd_context_item(context_idx_));

-      return *bits4;
+      val = *bits4;
 }

 vvp_vector4_t vvp_fun_signal4_aa::vec4_unfiltered_value() const
 {
-      return vec4_value();
+      vvp_vector4_t tmp;
+      vec4_value(tmp);
+      return tmp;
 }

 void vvp_fun_signal4_aa::operator delete(void*)
@ -597,7 +599,7 @@ vvp_scalar_t vvp_fun_signal_real_aa::scalar_value(unsigned idx) const
      assert(0);
 }

-vvp_vector4_t vvp_fun_signal_real_aa::vec4_value() const
+void vvp_fun_signal_real_aa::vec4_value(vvp_vector4_t&) const
 {
      assert(0);
 }
@ -787,12 +789,14 @@ vvp_scalar_t vvp_wire_vec4::scalar_value(unsigned idx) const
      return vvp_scalar_t(value(idx),6,6);
 }

-vvp_vector4_t vvp_wire_vec4::vec4_value() const
+void vvp_wire_vec4::vec4_value(vvp_vector4_t&val) const
 {
-      vvp_vector4_t tmp = bits4_;
+      val = bits4_;
+      if (test_force_mask_is_zero())
+	    return;
+
      for (unsigned idx = 0 ; idx < bits4_.size() ; idx += 1)
-	    tmp.set_bit(idx, filtered_value_(idx));
-      return tmp;
+	    val.set_bit(idx, filtered_value_(idx));
 }

 vvp_wire_vec8::vvp_wire_vec8(unsigned wid)
@ -930,9 +934,9 @@ vvp_vector8_t vvp_wire_vec8::vec8_value() const
      return tmp;
 }

-vvp_vector4_t vvp_wire_vec8::vec4_value() const
+void vvp_wire_vec8::vec4_value(vvp_vector4_t&val) const
 {
-      return reduce4(vec8_value());
+      val = reduce4(vec8_value());
 }

 vvp_wire_real::vvp_wire_real()
@ -1011,7 +1015,7 @@ vvp_scalar_t vvp_wire_real::scalar_value(unsigned idx) const
      assert(0);
 }

-vvp_vector4_t vvp_wire_real::vec4_value() const
+void vvp_wire_real::vec4_value(vvp_vector4_t&) const
 {
      assert(0);
 }
--- a/vvp/vvp_net_sig.h
+++ b/vvp/vvp_net_sig.h
@ -99,7 +99,7 @@ class vvp_signal_value {
      virtual unsigned value_size() const =0;
      virtual vvp_bit4_t value(unsigned idx) const =0;
      virtual vvp_scalar_t scalar_value(unsigned idx) const =0;
-      virtual vvp_vector4_t vec4_value() const =0;
+      virtual void vec4_value(vvp_vector4_t&) const =0;
      virtual double real_value() const;

      virtual void get_signal_value(struct t_vpi_value*vp);
@ -182,7 +182,7 @@ class vvp_fun_signal4_aa : public vvp_fun_signal_vec, public automatic_signal_ba
      unsigned   value_size() const;
      vvp_bit4_t value(unsigned idx) const;
      vvp_scalar_t scalar_value(unsigned idx) const;
-      vvp_vector4_t vec4_value() const;
+      void vec4_value(vvp_vector4_t&) const;
      vvp_vector4_t vec4_unfiltered_value() const;

    public: // These objects are only permallocated.
@ -268,7 +268,7 @@ class vvp_fun_signal_real_aa : public vvp_fun_signal_real, public automatic_sign
      unsigned   value_size() const;
      vvp_bit4_t value(unsigned idx) const;
      vvp_scalar_t scalar_value(unsigned idx) const;
-      vvp_vector4_t vec4_value() const;
+      void vec4_value(vvp_vector4_t&) const;
      double real_value() const;
      void get_signal_value(struct t_vpi_value*vp);

@ -322,7 +322,7 @@ class vvp_wire_vec4 : public vvp_wire_base {
      unsigned value_size() const;
      vvp_bit4_t value(unsigned idx) const;
      vvp_scalar_t scalar_value(unsigned idx) const;
-      vvp_vector4_t vec4_value() const;
+      void vec4_value(vvp_vector4_t&) const;

    private:
      vvp_bit4_t filtered_value_(unsigned idx) const;
@ -358,7 +358,7 @@ class vvp_wire_vec8 : public vvp_wire_base {
      unsigned value_size() const;
      vvp_bit4_t value(unsigned idx) const;
      vvp_scalar_t scalar_value(unsigned idx) const;
-      vvp_vector4_t vec4_value() const;
+      void vec4_value(vvp_vector4_t&) const;
 	// This is new to vvp_wire_vec8
      vvp_vector8_t vec8_value() const;

@ -393,7 +393,7 @@ class vvp_wire_real : public vvp_wire_base {
      unsigned value_size() const;
      vvp_bit4_t value(unsigned idx) const;
      vvp_scalar_t scalar_value(unsigned idx) const;
-      vvp_vector4_t vec4_value() const;
+      void vec4_value(vvp_vector4_t&) const;
      double real_value() const;

      void get_signal_value(struct t_vpi_value*vp);
--- a/vvp/words.cc
+++ b/vvp/words.cc
@ -118,8 +118,11 @@ static void __compile_var(char*label, char*name,
      if (name) {
 	    assert(!array);
 	    if (obj) vpip_attach_to_current_scope(obj);
-            if (!vpip_peek_current_scope()->is_automatic)
-	          schedule_init_vector(vvp_net_ptr_t(net,0), vfil->vec4_value());
+            if (!vpip_peek_current_scope()->is_automatic) {
+		  vvp_vector4_t tmp;
+		  vfil->vec4_value(tmp);
+	          schedule_init_vector(vvp_net_ptr_t(net,0), tmp);
+	    }
      }
 	// If this is an array word, then it does not have a name, and
 	// it is attached to the addressed array.