From 8190307dd37b02079c57c56e01032a6eaa4463ae Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Mon, 26 May 2008 11:09:33 -0700
Subject: [PATCH 1/7] Optimize/inline vvp_bit4_r AND, OR and vector set bit.

The AND and OR operators for vvp_bit4_t are slightly tweaked to be
lighter and inlinable.

The vvp_vector4_t::set_bit is optimized to do less silly mask fiddling.
---
 vvp/vvp_net.cc | 22 -------------
 vvp/vvp_net.h  | 83 +++++++++++++++++++++++++++++++-------------------
 2 files changed, 51 insertions(+), 54 deletions(-)

diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index 9f9a50e55..dfc77fb40 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -61,28 +61,6 @@ vvp_bit4_t add_with_carry(vvp_bit4_t a, vvp_bit4_t b, vvp_bit4_t&c)
       }
 }
 
-vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b)
-{
-      if (a == BIT4_0)
-	    return BIT4_0;
-      if (b == BIT4_0)
-	    return BIT4_0;
-      if (bit4_is_xz(a))
-	    return BIT4_X;
-      if (bit4_is_xz(b))
-	    return BIT4_X;
-      return BIT4_1;
-}
-
-vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b)
-{
-      if (a == BIT4_1)
-	    return BIT4_1;
-      if (b == BIT4_1)
-	    return BIT4_1;
-      return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) );
-}
-
 vvp_bit4_t operator ^ (vvp_bit4_t a, vvp_bit4_t b)
 {
       if (bit4_is_xz(a))
diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h
index 3d4f6b881..18548c157 100644
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@@ -87,8 +87,21 @@ inline vvp_bit4_t bit4_z2x(vvp_bit4_t a)
 inline vvp_bit4_t operator ~ (vvp_bit4_t a)
 { return bit4_z2x((vvp_bit4_t) (((int)a) ^ 1)); }
 
-extern vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b);
-extern vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b);
+inline vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b)
+{
+      if (a==BIT4_1 || b==BIT4_1)
+	    return BIT4_1;
+      return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) );
+}
+
+inline vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b)
+{
+      if (a==BIT4_0 || b==BIT4_0)
+	    return BIT4_0;
+      return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) );
+}
+
+
 extern vvp_bit4_t operator ^ (vvp_bit4_t a, vvp_bit4_t b);
 extern ostream& operator<< (ostream&o, vvp_bit4_t a);
 
@@ -294,41 +307,47 @@ inline void vvp_vector4_t::set_bit(unsigned idx, vvp_bit4_t val)
       assert(idx < size_);
 
       unsigned long off = idx % BITS_PER_WORD;
-      unsigned long amask = 0, bmask = 0;
-      switch (val) {
-	  case BIT4_0:
-	    amask = 0;
-	    bmask = 0;
-	    break;
-	  case BIT4_1:
-	    amask = 1;
-	    bmask = 0;
-	    break;
-	  case BIT4_X:
-	    amask = 1;
-	    bmask = 1;
-	    break;
-	  case BIT4_Z:
-	    amask = 0;
-	    bmask = 1;
-	    break;
-      }
-
       unsigned long mask = 1UL << off;
-      amask <<= off;
-      bmask <<= off;
 
       if (size_ > BITS_PER_WORD) {
 	    unsigned wdx = idx / BITS_PER_WORD;
-	    abits_ptr_[wdx] &= ~mask;
-	    abits_ptr_[wdx] |= amask;
-	    bbits_ptr_[wdx] &= ~mask;
-	    bbits_ptr_[wdx] |= bmask;
+	    switch (val) {
+		case BIT4_0:
+		  abits_ptr_[wdx] &= ~mask;
+		  bbits_ptr_[wdx] &= ~mask;
+		  break;
+		case BIT4_1:
+		  abits_ptr_[wdx] |=  mask;
+		  bbits_ptr_[wdx] &= ~mask;
+		  break;
+		case BIT4_X:
+		  abits_ptr_[wdx] |=  mask;
+		  bbits_ptr_[wdx] |=  mask;
+		  break;
+		case BIT4_Z:
+		  abits_ptr_[wdx] &= ~mask;
+		  bbits_ptr_[wdx] |=  mask;
+		  break;
+	    }
       } else {
-	    abits_val_ &= ~mask;
-	    abits_val_ |= amask;
-	    bbits_val_ &= ~mask;
-	    bbits_val_ |= bmask;
+	    switch (val) {
+		case BIT4_0:
+		  abits_val_ &= ~mask;
+		  bbits_val_ &= ~mask;
+		  break;
+		case BIT4_1:
+		  abits_val_ |=  mask;
+		  bbits_val_ &= ~mask;
+		  break;
+		case BIT4_X:
+		  abits_val_ |=  mask;
+		  bbits_val_ |=  mask;
+		  break;
+		case BIT4_Z:
+		  abits_val_ &= ~mask;
+		  bbits_val_ |=  mask;
+		  break;
+	    }
       }
 }
 

From 5cc376ebd49711498a2ef79895e9d97acc2c9945 Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Mon, 26 May 2008 16:00:16 -0700
Subject: [PATCH 2/7] Optimize ADD and MUL instructions

Make better use of the CPU word in ADD and MUL instructions.
---
 vvp/vthread.cc | 381 +++++++++++++++++++------------------------------
 1 file changed, 146 insertions(+), 235 deletions(-)

diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index a7cf69f05..43154bf47 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -214,6 +214,67 @@ static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr,
       }
 }
 
+/*
+ * Some of the instructions do wide addition to arrays of long. They
+ * use this add_with_cary function to help.
+ */
+static inline unsigned long add_with_carry(unsigned long a, unsigned long b,
+					   unsigned long&carry)
+{
+      unsigned long tmp = b + carry;
+      unsigned long sum = a + tmp;
+      carry = 0;
+      if (tmp < b)
+	    carry = 1;
+      if (sum < tmp)
+	    carry = 1;
+      if (sum < a)
+	    carry = 1;
+      return sum;
+}
+
+static unsigned long multiply_with_carry(unsigned long a, unsigned long b,
+					 unsigned long&carry)
+{
+      const unsigned long mask = (1UL << (CPU_WORD_BITS/2)) - 1;
+      unsigned long a0 = a & mask;
+      unsigned long a1 = (a >> (CPU_WORD_BITS/2)) & mask;
+      unsigned long b0 = b & mask;
+      unsigned long b1 = (b >> (CPU_WORD_BITS/2)) & mask;
+
+      unsigned long tmp = a0 * b0;
+
+      unsigned long r00 = tmp & mask;
+      unsigned long c00 = (tmp >> (CPU_WORD_BITS/2)) & mask;
+
+      tmp = a0 * b1;
+
+      unsigned long r01 = tmp & mask;
+      unsigned long c01 = (tmp >> (CPU_WORD_BITS/2)) & mask;
+
+      tmp = a1 * b0;
+
+      unsigned long r10 = tmp & mask;
+      unsigned long c10 = (tmp >> (CPU_WORD_BITS/2)) & mask;
+
+      tmp = a1 * b1;
+
+      unsigned long r11 = tmp & mask;
+      unsigned long c11 = (tmp >> (CPU_WORD_BITS/2)) & mask;
+
+      unsigned long r1 = c00 + r01 + r10;
+      unsigned long r2 = (r1 >> (CPU_WORD_BITS/2)) & mask;
+      r1 &= mask;
+      r2 += c01 + c10 + r11;
+      unsigned long r3 = (r2 >> (CPU_WORD_BITS/2)) & mask;
+      r2 &= mask;
+      r3 += c11;
+      r3 &= mask;
+
+      carry = (r3 << (CPU_WORD_BITS/2)) + r2;
+      return (r1 << (CPU_WORD_BITS/2)) + r00;
+}
+
 
 /*
  * Create a new thread with the given start address.
@@ -465,19 +526,8 @@ bool of_ADD(vthread_t thr, vvp_code_t cp)
 
       unsigned long carry;
       carry = 0;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1) {
-
-	    unsigned long tmp = lvb[idx] + carry;
-	    unsigned long sum = lva[idx] + tmp;
-	    carry = 0;
-	    if (tmp < lvb[idx])
-		  carry = 1;
-	    if (sum < tmp)
-		  carry = 1;
-	    if (sum < lva[idx])
-		  carry = 1;
-	    lva[idx] = sum;
-      }
+      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1)
+	    lva[idx] = add_with_carry(lva[idx], lvb[idx], carry);
 
 	/* We know from the vector_to_array that the address is valid
 	   in the thr->bitr4 vector, so just do the set bit. */
@@ -525,30 +575,15 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp)
       unsigned word_count = (bit_width+CPU_WORD_BITS-1)/CPU_WORD_BITS;
 
       unsigned long*lva = vector_to_array(thr, bit_addr, bit_width);
-      unsigned long*lvb = 0;
       if (lva == 0)
 	    goto x_out;
 
-      lvb = new unsigned long[word_count];
-
-      lvb[0] = imm_value;
-      for (unsigned idx = 1 ;  idx < word_count ;  idx += 1)
-	    lvb[idx] = 0;
 
       unsigned long carry;
       carry = 0;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < bit_width ;  idx += 1) {
-
-	    unsigned long tmp = lvb[idx] + carry;
-	    unsigned long sum = lva[idx] + tmp;
-	    carry = 0;
-	    if (tmp < lvb[idx])
-		  carry = 1;
-	    if (sum < tmp)
-		  carry = 1;
-	    if (sum < lva[idx])
-		  carry = 1;
-	    lva[idx] = sum;
+      for (unsigned idx = 0 ;  idx < word_count ;  idx += 1) {
+	    lva[idx] = add_with_carry(lva[idx], imm_value, carry);
+	    imm_value = 0;
       }
 
 	/* We know from the vector_to_array that the address is valid
@@ -557,7 +592,6 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp)
       thr->bits4.setarray(bit_addr, bit_width, lva);
 
       delete[]lva;
-      delete[]lvb;
 
       return true;
 
@@ -2921,101 +2955,59 @@ bool of_MOVI(vthread_t thr, vvp_code_t cp)
 
 bool of_MUL(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
-      if(cp->number <= 8*sizeof(unsigned long)) {
+      unsigned adra = cp->bit_idx[0];
+      unsigned adrb = cp->bit_idx[1];
+      unsigned wid = cp->number;
 
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
-      unsigned long lv = 0, rv = 0;
+      assert(adra >= 4);
 
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
-	    if (bit4_is_xz(lb) || bit4_is_xz(rb))
-		  goto x_out;
-
-	    lv |= (unsigned long) lb << idx;
-	    rv |= (unsigned long) rb << idx;
-
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
+      unsigned long*ap = vector_to_array(thr, adra, wid);
+      if (ap == 0) {
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
       }
 
-      lv *= rv;
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1) ? BIT4_1 : BIT4_0);
-	    lv >>= 1;
+      unsigned long*bp = vector_to_array(thr, adrb, wid);
+      if (bp == 0) {
+	    delete[]ap;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
       }
 
-      return true;
-      } else {
-      unsigned idx1 = cp->bit_idx[0];
-      unsigned idx2 = cp->bit_idx[1];
+	// If the value fits in a single CPU word, then do it the easy way.
+      if (wid <= CPU_WORD_BITS) {
+	    ap[0] *= bp[0];
+	    thr->bits4.setarray(adra, wid, ap);
+	    delete[]ap;
+	    delete[]bp;
+	    return true;
+      }
 
-      unsigned char *a, *b, *sum;
-      a = new unsigned char[cp->number];
-      b = new unsigned char[cp->number];
-      sum = new unsigned char[cp->number];
+      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
+      unsigned long*res = new unsigned long[words];
+      for (unsigned idx = 0 ; idx < words ; idx += 1)
+	    res[idx] = 0;
 
-      int mxa = -1;
-      int mxb = -1;
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
-	    if (bit4_is_xz(lb) || bit4_is_xz(rb))
-		  {
-                  delete[]sum;
-                  delete[]b;
-                  delete[]a;
-		  goto x_out;
+      for (unsigned mul_a = 0 ; mul_a < words ; mul_a += 1) {
+	    for (unsigned mul_b = 0 ; mul_b < words ; mul_b += 1) {
+		  unsigned long sum;
+		  unsigned long tmp = multiply_with_carry(ap[mul_a], bp[mul_b], sum);
+		  unsigned base = mul_a + mul_b;
+		  unsigned long carry = 0;
+		  res[base] = add_with_carry(res[base], tmp, carry);
+		  for (unsigned add_idx = base+1; add_idx < words; add_idx += 1) {
+			res[add_idx] = add_with_carry(res[add_idx], sum, carry);
+			sum = 0;
 		  }
-
-	    if((a[idx] = lb)) mxa=idx+1;
-	    if((b[idx] = rb)) mxb=idx;
-            sum[idx]=0;
-
-	    idx1 += 1;
-	    if (idx2 >= 4)
-		  idx2 += 1;
+	    }
       }
 
-//    do "unsigned ZZ sum = a * b" the hard way..
-      for(int i=0;i<=mxb;i++)
-                {
-                if(b[i])
-                        {
-                        unsigned char carry=0;
-                        unsigned char temp;
-
-                        for(int j=0;j<=mxa;j++)
-                                {
-                                if(i+j>=(int)cp->number) break;
-                                temp=sum[i+j]+a[j]+carry;
-                                sum[i+j]=(temp&1);
-                                carry=(temp>>1);
-                                }
-                        }
-                }
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, sum[idx]?BIT4_1:BIT4_0);
-      }
-
-      delete[]sum;
-      delete[]b;
-      delete[]a;
-      return true;
-      }
-
- x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
-
+      thr->bits4.setarray(adra, wid, res);
+      delete[]ap;
+      delete[]bp;
+      delete[]res;
       return true;
 }
 
@@ -3030,101 +3022,48 @@ bool of_MUL_WR(vthread_t thr, vvp_code_t cp)
 
 bool of_MULI(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
+      unsigned adr = cp->bit_idx[0];
+      unsigned long imm = cp->bit_idx[1];
+      unsigned wid = cp->number;
 
-	/* If the value fits into a native unsigned long, then make an
-	   unsigned long variable with the numbers, to a native
-	   multiply, and work with that. */
-
-      if(cp->number <= 8*sizeof(unsigned long)) {
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned long lv = 0, rv = cp->bit_idx[1];
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  vvp_bit4_t lb = thr_get_bit(thr, idx1);
-
-		  if (bit4_is_xz(lb))
-			goto x_out;
-
-		  lv |= (unsigned long) lb << idx;
-
-		  idx1 += 1;
-	    }
-
-	    lv *= rv;
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)? BIT4_1 : BIT4_0);
-		  lv >>= 1;
-	    }
+      assert(adr >= 4);
 
+      unsigned long*val = vector_to_array(thr, adr, wid);
+	// If there are X bits in the value, then return X.
+      if (val == 0) {
+	    vvp_vector4_t tmp(cp->number, BIT4_X);
+	    thr->bits4.set_vec(cp->bit_idx[0], tmp);
 	    return true;
       }
 
-	/* number is too large for local long, so do bitwise
-	   multiply. */
-
-      unsigned idx1; idx1 = cp->bit_idx[0];
-      unsigned imm;  imm  = cp->bit_idx[1];
-
-      unsigned char *a, *b, *sum;
-      a = new unsigned char[cp->number];
-      b = new unsigned char[cp->number];
-      sum = new unsigned char[cp->number];
-
-      int mxa; mxa = -1;
-      int mxb; mxb = -1;
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    vvp_bit4_t lb = thr_get_bit(thr, idx1);
-	    vvp_bit4_t rb = (imm & 1)? BIT4_1 : BIT4_0;
-
-	    imm >>= 1;
-
-	    if (bit4_is_xz(lb)) {
-                  delete[]sum;
-                  delete[]b;
-                  delete[]a;
-		  goto x_out;
-	    }
-
-	    if((a[idx] = lb)) mxa=idx+1;
-	    if((b[idx] = rb)) mxb=idx;
-            sum[idx]=0;
-
-	    idx1 += 1;
+	// If everything fits in a word, then do it the easy way.
+      if (wid <= CPU_WORD_BITS) {
+	    val[0] *= imm;
+	    thr->bits4.setarray(adr, wid, val);
+	    delete[]val;
+	    return true;
       }
 
-//    do "unsigned ZZ sum = a * b" the hard way..
-      for(int i=0;i<=mxb;i++) {
-	    if(b[i]) {
-		  unsigned char carry=0;
-		  unsigned char temp;
+      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
+      unsigned long*res = new unsigned long[words];
+      for (unsigned idx = 0 ; idx < words ; idx += 1)
+	    res[idx] = 0;
 
-		  for(int j=0;j<=mxa;j++) {
-			if(i+j>=(int)cp->number) break;
-			temp=sum[i+j]+a[j]+carry;
-			sum[i+j]=(temp&1);
-			carry=(temp>>1);
-		  }
+      for (unsigned mul_idx = 0 ; mul_idx < words ; mul_idx += 1) {
+	    unsigned long sum;
+	    unsigned long tmp = multiply_with_carry(val[mul_idx], imm, sum);
+
+	    unsigned long carry = 0;
+	    res[mul_idx] = add_with_carry(res[mul_idx], tmp, carry);
+	    for (unsigned add_idx = mul_idx+1 ; add_idx < words ; add_idx += 1) {
+		  res[add_idx] = add_with_carry(res[add_idx], sum, carry);
+		  sum = 0;
 	    }
       }
 
-
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, sum[idx]?BIT4_1:BIT4_0);
-      }
-
-      delete[]sum;
-      delete[]b;
-      delete[]a;
-
-      return true;
-
- x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
-
+      thr->bits4.setarray(adr, wid, res);
+      delete[]val;
+      delete[]res;
       return true;
 }
 
@@ -3754,20 +3693,10 @@ bool of_SUB(vthread_t thr, vvp_code_t cp)
 	    goto x_out;
 
 
-      unsigned carry;
+      unsigned long carry;
       carry = 1;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1) {
-	    unsigned long tmp = ~lvb[idx] + carry;
-	    unsigned long sum = tmp + lva[idx];
-	    carry = 0;
-	    if (tmp < ~lvb[idx])
-		  carry = 1;
-	    if (sum < tmp)
-		  carry = 1;
-	    if (sum < lva[idx])
-		  carry = 1;
-	    lva[idx] = sum;
-      }
+      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1)
+	    lva[idx] = add_with_carry(lva[idx], ~lvb[idx], carry);
 
 
 	/* We know from the vector_to_array that the address is valid
@@ -3802,34 +3731,17 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp)
       assert(cp->bit_idx[0] >= 4);
 
       unsigned word_count = (cp->number+CPU_WORD_BITS-1)/CPU_WORD_BITS;
-
+      unsigned long imm = cp->bit_idx[1];
       unsigned long*lva = vector_to_array(thr, cp->bit_idx[0], cp->number);
-      unsigned long*lvb;
       if (lva == 0)
 	    goto x_out;
 
-      lvb = new unsigned long[word_count];
-
-
-      lvb[0] = cp->bit_idx[1];
-      lvb[0] = ~lvb[0];
-      for (unsigned idx = 1 ;  idx < word_count ;  idx += 1)
-	    lvb[idx] = ~0UL;
 
       unsigned long carry;
       carry = 1;
-      for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1) {
-
-	    unsigned long tmp = lvb[idx] + carry;
-	    unsigned long sum = lva[idx] + tmp;
-	    carry = 0UL;
-	    if (tmp < lvb[idx])
-		  carry = 1;
-	    if (sum < tmp)
-		  carry = 1;
-	    if (sum < lva[idx])
-		  carry = 1;
-	    lva[idx] = sum;
+      for (unsigned idx = 0 ;  idx < word_count ;  idx += 1) {
+	    lva[idx] = add_with_carry(lva[idx], ~imm, carry);
+	    imm = ~0UL;
       }
 
 	/* We know from the vector_to_array that the address is valid
@@ -3838,7 +3750,6 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp)
       thr->bits4.setarray(cp->bit_idx[0], cp->number, lva);
 
       delete[]lva;
-      delete[]lvb;
 
       return true;
 

From 6987d16bd33bfcf6383bb10838630cda4669d976 Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Mon, 26 May 2008 16:44:58 -0700
Subject: [PATCH 3/7] Optimize the %load/vp0 to use subarrays.

This instruction adds an integer value to the value being loaded. This
optimization uses subarrays instead of the += operator. This is faster
because the value is best loaded into the vector as a subarray anyhow.
---
 vvp/vthread.cc | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 43154bf47..4076c1ccc 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -2476,7 +2476,7 @@ bool of_LOAD_NX(vthread_t thr, vvp_code_t cp)
  * The functor to read from is the vvp_net_t object pointed to by the
  * cp->net pointer.
  */
-vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
+static vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
 {
       assert(cp->bit_idx[0] >= 4);
       assert(cp->bit_idx[1] > 0);
@@ -2490,9 +2490,8 @@ vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
       if (sig == 0) {
 	    cerr << "%%load/v error: Net arg not a vector signal? "
 		 << typeid(*net->fun).name() << endl;
+	    assert(sig);
       }
-      assert(sig);
-
 
       vvp_vector4_t sig_value = sig->vec4_value();
       sig_value.resize(wid);
@@ -2532,15 +2531,35 @@ bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
       vvp_vector4_t sig_value(wid, BIT4_0);
       sig_value.copy_bits(load_base(thr, cp));
 
-	/* Add the addend value */
-      sig_value += addend;
-
 	/* Check the address once, before we scan the vector. */
       thr_check_addr(thr, bit+wid-1);
 
+      unsigned long*val = sig_value.subarray(0, wid);
+      if (val == 0) {
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(bit, tmp);
+	    return true;
+      }
+
+      unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;
+      unsigned long carry = 0;
+      unsigned long imm = addend;
+      if (addend >= 0) {
+	    for (unsigned idx = 0 ; idx < words ; idx += 1) {
+		  val[idx] = add_with_carry(val[idx], imm, carry);
+		  imm = 0UL;
+	    }
+      } else {
+	    for (unsigned idx = 0 ; idx < words ; idx += 1) {
+		  val[idx] = add_with_carry(val[idx], imm, carry);
+		  imm = -1UL;
+	    }
+      }
+
 	/* Copy the vector bits into the bits4 vector. Do the copy
 	   directly to skip the excess calls to thr_check_addr. */
-      thr->bits4.set_vec(bit, sig_value);
+      thr->bits4.setarray(bit, wid, val);
+      delete[]val;
 
       return true;
 }

From 0fa3099ded623d7e46a5496fb10cfad7d1f5ed3f Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Tue, 27 May 2008 11:54:39 -0700
Subject: [PATCH 4/7] Optimize %div and %div/s

Use high radix long division to take advantage of the divide hardware
of the host computer. It looks brute force at first glance, but since
it is using the optimized arithmetic of the host processor, it is much
faster then implementing "fast" algorithms the hard way.
---
 vvp/vthread.cc | 509 +++++++++++++++++++++++--------------------------
 1 file changed, 243 insertions(+), 266 deletions(-)

diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 4076c1ccc..049995d17 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -275,6 +275,24 @@ static unsigned long multiply_with_carry(unsigned long a, unsigned long b,
       return (r1 << (CPU_WORD_BITS/2)) + r00;
 }
 
+static void multiply_array_imm(unsigned long*res, unsigned long*val,
+			       unsigned words, unsigned long imm)
+{
+      for (unsigned idx = 0 ; idx < words ; idx += 1)
+	    res[idx] = 0;
+
+      for (unsigned mul_idx = 0 ; mul_idx < words ; mul_idx += 1) {
+	    unsigned long sum;
+	    unsigned long tmp = multiply_with_carry(val[mul_idx], imm, sum);
+
+	    unsigned long carry = 0;
+	    res[mul_idx] = add_with_carry(res[mul_idx], tmp, carry);
+	    for (unsigned add_idx = mul_idx+1 ; add_idx < words ; add_idx += 1) {
+		  res[add_idx] = add_with_carry(res[add_idx], sum, carry);
+		  sum = 0;
+	    }
+      }
+}
 
 /*
  * Create a new thread with the given start address.
@@ -1511,297 +1529,268 @@ bool of_DISABLE(vthread_t thr, vvp_code_t cp)
       return ! disabled_myself_flag;
 }
 
-static void divide_bits(unsigned len, unsigned char*lbits,
-			const unsigned char*rbits)
+/*
+ * This function divides a 2-word number {high, a} by a 1-word
+ * number. Assume that high < b.
+ */
+static unsigned long divide2words(unsigned long a, unsigned long b,
+				  unsigned long high)
 {
-      unsigned char *a, *b, *z, *t;
-      a = new unsigned char[len+1];
-      b = new unsigned char[len+1];
-      z = new unsigned char[len+1];
-      t = new unsigned char[len+1];
+      unsigned long result = 0;
+      while (high > 0) {
+	    unsigned long tmp_result = ULONG_MAX / b;
+	    unsigned long remain = ULONG_MAX % b;
 
-      unsigned char carry;
-      unsigned char temp;
-
-      int mxa = -1, mxz = -1;
-      int i;
-      int current, copylen;
-
-
-      for (unsigned idx = 0 ;  idx < len ;  idx += 1) {
-	    unsigned lb = lbits[idx];
-	    unsigned rb = rbits[idx];
-
-	    z[idx]=lb;
-	    a[idx]=1-rb;	// for 2s complement add..
-
-      }
-      z[len]=0;
-      a[len]=1;
-
-      for(i=0;i<(int)len+1;i++) {
-	    b[i]=0;
-      }
-
-      for(i=len-1;i>=0;i--) {
-	    if(!a[i]) {
-		  mxa=i;
-		  break;
-	    }
-      }
-
-      for(i=len-1;i>=0;i--) {
-	    if(z[i]) {
-		  mxz=i;
-		  break;
-	    }
-      }
-
-      if((mxa>mxz)||(mxa==-1)) {
-	    if(mxa==-1) {
-		  fprintf(stderr, "Division By Zero error, exiting.\n");
-		  exit(255);
+	    remain += 1;
+	    if (remain >= b) {
+		  remain -= b;
+		  result += 1;
 	    }
 
-	    goto tally;
+	      // Now 0x1_0...0 = b*tmp_result + remain
+	      // high*0x1_0...0 = high*(b*tmp_result + remain)
+	      // high*0x1_0...0 = high*b*tmp_result + high*remain
+
+	      // We know that high*0x1_0...0 >= high*b*tmp_result, and
+	      // we know that high*0x1_0...0 > high*remain. Use
+	      // high*remain as the remainder for another iteration,
+	      // and add tmp_result*high into the current estimate of
+	      // the result.
+	    result += tmp_result * high;
+
+	      // The new iteration starts with high*remain + a.
+	    remain = multiply_with_carry(high, remain, high);
+	    a = add_with_carry(a, remain, high);
+
+	      // Now result*b + {high,a} == the input {high,a}. It is
+	      // possible that the new high >= 1. If so, it will
+	      // certainly be less then high from the previous
+	      // iteration. Do another iteration and it will shrink,
+	      // eventually to 0.
       }
 
-      copylen = mxa + 2;
-      current = mxz - mxa;
+	// high is now 0, so a is the remaining remainder, so we can
+	// finish off the integer divide with a simple a/b.
 
-      while(current > -1) {
-	    carry = 1;
-	    for(i=0;i<copylen;i++) {
-		  temp = z[i+current] + a[i] + carry;
-		  t[i] = (temp&1);
-		  carry = (temp>>1);
+      return result + a/b;
+}
+
+static unsigned long* divide_bits(unsigned long*ap, unsigned long*bp, unsigned wid)
+{
+
+      unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
+
+      unsigned btop = words-1;
+      while (btop > 0 && bp[btop] == 0)
+	    btop -= 1;
+
+	// Detect divide by 0, and exit.
+      if (btop==0 && bp[0]==0)
+	    return 0;
+
+      unsigned long*diff  = new unsigned long[words];
+      unsigned long*result= new unsigned long[words];
+      for (unsigned idx = 0 ; idx < words ; idx += 1)
+	    result[idx] = 0;
+
+      for (unsigned cur = words-btop ; cur > 0 ; cur -= 1) {
+	    unsigned cur_ptr = cur-1;
+	    unsigned long cur_res;
+	    if (ap[cur_ptr+btop] >= bp[btop]) {
+		  cur_res = ap[cur_ptr+btop] / bp[btop];
+
+	    } else if (cur_ptr+btop+1 >= words) {
+		  continue;
+
+	    } else if (ap[cur_ptr+btop+1] == 0) {
+		  continue;
+
+	    } else {
+		  cur_res = divide2words(ap[cur_ptr+btop], bp[btop],
+					 ap[cur_ptr+btop+1]);
 	    }
 
-	    if(carry) {
-		  for(i=0;i<copylen;i++) {
-			z[i+current] = t[i];
-		  }
-		  b[current] = 1;
+	      // cur_res is a guestimate of the result this far. It
+	      // may be 1 too big. (But it will also be >0) Try it,
+	      // and if the difference comes out negative, then adjust
+	      // then.
+
+	    multiply_array_imm(diff+cur_ptr, bp, words-cur_ptr, cur_res);
+	    unsigned long carry = 1;
+	    for (unsigned idx = cur_ptr ; idx < words ; idx += 1)
+		  ap[idx] = add_with_carry(ap[idx], ~diff[idx], carry);
+
+	      // ap has the diff subtracted out of it. If cur_res was
+	      // too large, then ap will turn negative. (We easily
+	      // tell that ap turned negative by looking at
+	      // carry&1. If it is 0, then it is *negative*.) In that
+	      // case, we know that cur_res was too large by 1. Correct by
+	      // adding 1b back in and reducing cur_res.
+	    if (carry&1 == 0) {
+		  cur_res -= 1;
+		  carry = 0;
+		  for (unsigned idx = cur_ptr ; idx < words ; idx += 1)
+			ap[idx] = add_with_carry(ap[idx], bp[idx-cur_ptr], carry);
+		    // The sign *must* have changed again.
+		  assert(carry == 1);
 	    }
 
-	    current--;
+	    result[cur_ptr] = cur_res;
       }
 
- tally:
-      for (unsigned idx = 0 ;  idx < len ;  idx += 1) {
-	      // n.b., z[] has the remainder...
-	    lbits[idx] = b[idx];
-      }
+	// Now ap contains the remainder and result contains the
+	// desired result. We should find that:
+	//  input-a = bp * result + ap;
 
-      delete []t;
-      delete []z;
-      delete []b;
-      delete []a;
+      delete[]diff;
+      return result;
 }
 
 bool of_DIV(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
+      unsigned adra = cp->bit_idx[0];
+      unsigned adrb = cp->bit_idx[1];
+      unsigned wid = cp->number;
 
-      if(cp->number <= 8*sizeof(unsigned long)) {
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned idx2 = cp->bit_idx[1];
-	    unsigned long lv = 0, rv = 0;
+      assert(adra >= 4);
 
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  vvp_bit4_t lb = thr_get_bit(thr, idx1);
-		  vvp_bit4_t rb = thr_get_bit(thr, idx2);
-
-		  if (bit4_is_xz(lb) || bit4_is_xz(rb))
-			goto x_out;
-
-		  lv |= (unsigned long) lb << idx;
-		  rv |= (unsigned long) rb << idx;
-
-		  idx1 += 1;
-		  if (idx2 >= 4)
-			idx2 += 1;
-	    }
-
-	    if (rv == BIT4_0)
-		  goto x_out;
-
-	    lv /= rv;
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1) ? BIT4_1 : BIT4_0);
-		  lv >>= 1;
-	    }
-
-	    return true;
-
-      } else {
-
-	      /* Make a string of the bits of the numbers to be
-		 divided. Then divide them, and write the results into
-		 the thread. */
-	    unsigned char*lbits = new unsigned char[cp->number];
-	    unsigned char*rbits = new unsigned char[cp->number];
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned idx2 = cp->bit_idx[1];
-	    bool rval_is_zero = true;
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  lbits[idx] = thr_get_bit(thr, idx1);
-		  rbits[idx] = thr_get_bit(thr, idx2);
-		  if ((lbits[idx] | rbits[idx]) > 1) {
-			delete[]lbits;
-			delete[]rbits;
-			goto x_out;
-		  }
-
-		  if (rbits[idx] != 0)
-			rval_is_zero = false;
-
-		  idx1 += 1;
-		  if (idx2 >= 4)
-			idx2 += 1;
-	    }
-
-	      /* Notice the special case of divide by 0. */
-	    if (rval_is_zero) {
-		  delete[]lbits;
-		  delete[]rbits;
-		  goto x_out;
-	    }
-
-	    divide_bits(cp->number, lbits, rbits);
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, lbits[idx]?BIT4_1:BIT4_0);
-	    }
-
-	    delete[]lbits;
-	    delete[]rbits;
+      unsigned long*ap = vector_to_array(thr, adra, wid);
+      if (ap == 0) {
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
 	    return true;
       }
 
- x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
+      unsigned long*bp = vector_to_array(thr, adrb, wid);
+      if (bp == 0) {
+	    delete[]ap;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
+      }
 
+	// If the value fits in a single CPU word, then do it the easy way.
+      if (wid <= CPU_WORD_BITS) {
+	    if (bp[0] == 0) {
+		  vvp_vector4_t tmp(wid, BIT4_X);
+		  thr->bits4.set_vec(adra, tmp);
+	    } else {
+		  ap[0] /= bp[0];
+		  thr->bits4.setarray(adra, wid, ap);
+	    }
+	    delete[]ap;
+	    delete[]bp;
+	    return true;
+      }
+
+      unsigned long*result = divide_bits(ap, bp, wid);
+      if (result == 0) {
+	    delete[]ap;
+	    delete[]bp;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
+      }
+
+	// Now ap contains the remainder and result contains the
+	// desired result. We should find that:
+	//  input-a = bp * result + ap;
+
+      thr->bits4.setarray(adra, wid, result);
+      delete[]ap;
+      delete[]bp;
+      delete[]result;
       return true;
 }
 
-static void negate_bits(unsigned len, unsigned char*bits)
+
+static void negate_words(unsigned long*val, unsigned words)
 {
-      unsigned char carry = 1;
-      for (unsigned idx = 0 ;  idx < len ;  idx += 1) {
-	    carry += bits[idx]? 0 : 1;
-	    bits[idx] = carry & 1;
-	    carry >>= 1;
-      }
+      unsigned long carry = 1;
+      for (unsigned idx = 0 ; idx < words ; idx += 1)
+	    val[idx] = add_with_carry(0, ~val[idx], carry);
 }
 
 bool of_DIV_S(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
+      unsigned adra = cp->bit_idx[0];
+      unsigned adrb = cp->bit_idx[1];
+      unsigned wid = cp->number;
+      unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;
 
-      if(cp->number <= 8*sizeof(long)) {
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned idx2 = cp->bit_idx[1];
-	    long lv = 0, rv = 0;
+      assert(adra >= 4);
 
-	    unsigned lb = 0;
-	    unsigned rb = 0;
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  lb = thr_get_bit(thr, idx1);
-		  rb = thr_get_bit(thr, idx2);
-
-		  if ((lb | rb) & 2)
-			goto x_out;
-
-		  lv |= (long)lb << idx;
-		  rv |= (long)rb << idx;
-
-		  idx1 += 1;
-		  if (idx2 >= 4)
-			idx2 += 1;
-	    }
-
-	      /* Extend the sign to fill the native long. */
-	    for (unsigned idx = cp->number; idx < (8*sizeof lv); idx += 1) {
-		  lv |= (long)lb << idx;
-		  rv |= (long)rb << idx;
-	    }
-
-	    if (rv == 0)
-		  goto x_out;
-
-	    lv /= rv;
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)?BIT4_1:BIT4_0);
-		  lv >>= 1;
-	    }
-
-      } else {
-	    unsigned char*lbits = new unsigned char[cp->number];
-	    unsigned char*rbits = new unsigned char[cp->number];
-	    unsigned idx1 = cp->bit_idx[0];
-	    unsigned idx2 = cp->bit_idx[1];
-	    bool rval_is_zero = true;
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  lbits[idx] = thr_get_bit(thr, idx1);
-		  rbits[idx] = thr_get_bit(thr, idx2);
-		  if ((lbits[idx] | rbits[idx]) > 1) {
-			delete[]lbits;
-			delete[]rbits;
-			goto x_out;
-		  }
-
-		  if (rbits[idx] != 0)
-			rval_is_zero = false;
-
-		  idx1 += 1;
-		  if (idx2 >= 4)
-			idx2 += 1;
-	    }
-
-	      /* Notice the special case of divide by 0. */
-	    if (rval_is_zero) {
-		  delete[]lbits;
-		  delete[]rbits;
-		  goto x_out;
-	    }
-
-	      /* Signed division is unsigned division on the absolute
-		 values of the operands, then corrected for the number
-		 of signs. */
-	    unsigned sign_flag = 0;
-	    if (lbits[cp->number-1]) {
-		  sign_flag += 1;
-		  negate_bits(cp->number, lbits);
-	    }
-	    if (rbits[cp->number-1]) {
-		  sign_flag += 1;
-		  negate_bits(cp->number, rbits);
-	    }
-
-	    divide_bits(cp->number, lbits, rbits);
-
-	    if (sign_flag & 1) {
-		  negate_bits(cp->number, lbits);
-	    }
-
-	    for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1) {
-		  thr_put_bit(thr, cp->bit_idx[0]+idx, lbits[idx]?BIT4_1:BIT4_0);
-	    }
-
-	    delete[]lbits;
-	    delete[]rbits;
+      unsigned long*ap = vector_to_array(thr, adra, wid);
+      if (ap == 0) {
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
       }
 
-      return true;
+      unsigned long*bp = vector_to_array(thr, adrb, wid);
+      if (bp == 0) {
+	    delete[]ap;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
+      }
 
- x_out:
-      for (unsigned idx = 0 ;  idx < cp->number ;  idx += 1)
-	    thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X);
+      unsigned long sign_mask = 0;
+      if (unsigned long sign_bits = (words*CPU_WORD_BITS) - wid) {
+	    sign_mask = -1UL << (CPU_WORD_BITS-sign_bits);
+	    if (ap[words-1] & (sign_mask>>1))
+		  ap[words-1] |= sign_mask;
+	    if (bp[words-1] & (sign_mask>>1))
+		  bp[words-1] |= sign_mask;
+      }
 
+      if (wid <= CPU_WORD_BITS) {
+	    if (bp[0] == 0) {
+		  vvp_vector4_t tmp(wid, BIT4_X);
+		  thr->bits4.set_vec(adra, tmp);
+	    } else {
+		  long tmpa = (long) ap[0];
+		  long tmpb = (long) bp[0];
+		  long res = tmpa / tmpb;
+		  ap[0] = ((unsigned long)res) & ~sign_mask;
+		  thr->bits4.setarray(adra, wid, ap);
+	    }
+	    delete[]ap;
+	    delete[]bp;
+	    return true;
+      }
+
+	// We need to the actual division to positive integers. Make
+	// them positive here, and remember the negations.
+      bool negate_flag = false;
+      if ( ((long) ap[words-1]) < 0 ) {
+	    negate_flag = true;
+	    negate_words(ap, words);
+      }
+      if ( ((long) bp[words-1]) < 0 ) {
+	    negate_flag ^= true;
+	    negate_words(bp, words);
+      }
+
+      unsigned long*result = divide_bits(ap, bp, wid);
+      if (result == 0) {
+	    delete[]ap;
+	    delete[]bp;
+	    vvp_vector4_t tmp(wid, BIT4_X);
+	    thr->bits4.set_vec(adra, tmp);
+	    return true;
+      }
+
+      if (negate_flag) {
+	    negate_words(result, words);
+      }
+
+      result[words-1] &= ~sign_mask;
+
+      thr->bits4.setarray(adra, wid, result);
+      delete[]ap;
+      delete[]bp;
+      delete[]result;
       return true;
 }
 
@@ -3065,20 +3054,8 @@ bool of_MULI(vthread_t thr, vvp_code_t cp)
 
       unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS;
       unsigned long*res = new unsigned long[words];
-      for (unsigned idx = 0 ; idx < words ; idx += 1)
-	    res[idx] = 0;
 
-      for (unsigned mul_idx = 0 ; mul_idx < words ; mul_idx += 1) {
-	    unsigned long sum;
-	    unsigned long tmp = multiply_with_carry(val[mul_idx], imm, sum);
-
-	    unsigned long carry = 0;
-	    res[mul_idx] = add_with_carry(res[mul_idx], tmp, carry);
-	    for (unsigned add_idx = mul_idx+1 ; add_idx < words ; add_idx += 1) {
-		  res[add_idx] = add_with_carry(res[add_idx], sum, carry);
-		  sum = 0;
-	    }
-      }
+      multiply_array_imm(res, val, words, imm);
 
       thr->bits4.setarray(adr, wid, res);
       delete[]val;

From 5a0fe9ff8375f9e32d61f9d5e0d4649d8f93864e Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Tue, 27 May 2008 17:51:28 -0700
Subject: [PATCH 5/7] Better use of immediate operands.

Clarify that operands are typically 32bits, and have the code generator
make better use of this.

Also improve the %movi implementation to work well with marger vectors.

Add the %andi instruction to use immediate operands.
---
 tgt-vvp/eval_expr.c | 83 ++++++++++++++++++++++++++++++++++-----------
 tgt-vvp/vvp_priv.h  |  6 ++++
 vvp/codes.h         |  3 +-
 vvp/compile.cc      |  1 +
 vvp/opcodes.txt     |  3 +-
 vvp/vthread.cc      | 39 +++++++++++++++++++--
 6 files changed, 111 insertions(+), 24 deletions(-)

diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c
index 08080fe7e..4753bd0d5 100644
--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@@ -92,7 +92,7 @@ unsigned long get_number_immediate(ivl_expr_t ex)
 		    case '0':
 		      break;
 		    case '1':
-		      imm |= 1 << idx;
+		      imm |= 1UL << idx;
 		      break;
 		    default:
 		      assert(0);
@@ -909,12 +909,43 @@ static struct vector_info draw_binary_expr_le(ivl_expr_t exp,
       return lv;
 }
 
+static struct vector_info draw_logic_immediate(ivl_expr_t exp,
+					       ivl_expr_t le,
+					       ivl_expr_t re,
+					       unsigned wid)
+{
+      struct vector_info lv = draw_eval_expr_wid(le, wid, STUFF_OK_XZ);
+      unsigned long imm = get_number_immediate(re);
+
+      assert(lv.base >= 4);
+
+      switch (ivl_expr_opcode(exp)) {
+
+	  case '&':
+	    fprintf(vvp_out, "   %%andi %u, %lu, %u;\n", lv.base, imm, lv.wid);
+	    break;
+
+	  default:
+	    assert(0);
+	    break;
+      }
+
+      return lv;
+}
+
 static struct vector_info draw_binary_expr_logic(ivl_expr_t exp,
 						 unsigned wid)
 {
       ivl_expr_t le = ivl_expr_oper1(exp);
       ivl_expr_t re = ivl_expr_oper2(exp);
 
+      if (ivl_expr_opcode(exp) == '&') {
+	    if (number_is_immediate(re, IMM_WID))
+		  return draw_logic_immediate(exp, le, re, wid);
+	    if (number_is_immediate(le, IMM_WID))
+		  return draw_logic_immediate(exp, re, le, wid);
+      }
+
       struct vector_info lv;
       struct vector_info rv;
 
@@ -1167,15 +1198,15 @@ static struct vector_info draw_add_immediate(ivl_expr_t le,
       imm = get_number_immediate(re);
 
 	/* Now generate enough %addi instructions to add the entire
-	   immediate value to the destination. The adds are done 16
-	   bits at a time, but 17 bits are done to push the carry into
+	   immediate value to the destination. The adds are done IMM_WID
+	   bits at a time, but +1 bits are done to push the carry into
 	   the higher bits if needed. */
       { unsigned base;
-        for (base = 0 ;  base < lv.wid ;  base += 16) {
-	      unsigned long tmp = imm & 0xffffUL;
+        for (base = 0 ;  base < lv.wid ;  base += IMM_WID) {
+	      unsigned long tmp = imm & 0xffffffffUL;
 	      unsigned add_wid = lv.wid - base;
 
-	      imm >>= 16;
+	      imm >>= IMM_WID;
 
 	      fprintf(vvp_out, "    %%addi %u, %lu, %u;\n",
 		      lv.base+base, tmp, add_wid);
@@ -1203,7 +1234,7 @@ static struct vector_info draw_sub_immediate(ivl_expr_t le,
       assert(lv.wid == wid);
 
       imm = get_number_immediate(re);
-      assert( (imm & ~0xffff) == 0 );
+      assert( (imm & ~0xffffffffUL) == 0 );
 
       switch (lv.base) {
 	  case 0:
@@ -1299,13 +1330,13 @@ static struct vector_info draw_binary_expr_arith(ivl_expr_t exp, unsigned wid)
       if ((ivl_expr_opcode(exp) == '-')
 	  && (ivl_expr_type(re) == IVL_EX_NUMBER)
 	  && (! number_is_unknown(re))
-	  && number_is_immediate(re, 16))
+	  && number_is_immediate(re, IMM_WID))
 	    return draw_sub_immediate(le, re, wid);
 
       if ((ivl_expr_opcode(exp) == '*')
 	  && (ivl_expr_type(re) == IVL_EX_NUMBER)
 	  && (! number_is_unknown(re))
-	  && number_is_immediate(re, 16))
+	  && number_is_immediate(re, IMM_WID))
 	    return draw_mul_immediate(le, re, wid);
 
       lv = draw_eval_expr_wid(le, wid, STUFF_OK_XZ);
@@ -1612,9 +1643,9 @@ static struct vector_info draw_number_expr(ivl_expr_t exp, unsigned wid)
 	    vvp_errors += 1;
       }
 
-      if ((!number_is_unknown(exp)) && number_is_immediate(exp, 16)) {
-	    int val = get_number_immediate(exp);
-	    fprintf(vvp_out, "    %%movi %u, %d, %u;\n", res.base, val, wid);
+      if ((!number_is_unknown(exp)) && number_is_immediate(exp, IMM_WID)) {
+	    unsigned long val = get_number_immediate(exp);
+	    fprintf(vvp_out, "   %%movi %u, %lu, %u;\n", res.base, val, wid);
 	    return res;
       }
 
@@ -1836,8 +1867,8 @@ static struct vector_info draw_string_expr(ivl_expr_t exp, unsigned wid)
       idx = 0;
       while (idx < nwid) {
 	    unsigned bits;
-	    unsigned trans = 16;
-	    if (nwid-idx < 16)
+	    unsigned trans = IMM_WID;
+	    if (nwid-idx < trans)
 		  trans = nwid-idx;
 
 	    bits = *p;
@@ -1845,6 +1876,14 @@ static struct vector_info draw_string_expr(ivl_expr_t exp, unsigned wid)
 	    if (trans > 8) {
 		  bits |= *p << 8;
 		  p -= 1;
+		  if (trans > 16) {
+			bits |= *p << 16;
+			p -= 1;
+			if (trans > 24) {
+			      bits |= *p << 24;
+			      p -= 1;
+			}
+		  }
 	    }
 	    fprintf(vvp_out, "  %%movi %u, %u, %u;\n", res.base+idx,bits,trans);
 
@@ -1881,8 +1920,14 @@ void pad_expr_in_place(ivl_expr_t exp, struct vector_info res, unsigned swid)
 			  res.base+idx, res.base+swid-1);
 
       } else {
-	    fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
-		    res.base+swid, res.wid-swid);
+	    unsigned base = res.base+swid;
+	    unsigned count = res.wid-swid;
+	      /* The %movi is faster for larger widths, but for very
+		 small counts, the %mov is faster. */
+	    if (count > 4)
+		  fprintf(vvp_out, "   %%movi %u, 0, %u;\n", base, count);
+	    else
+		  fprintf(vvp_out, "   %%mov %u, 0, %u;\n", base, count);
       }
 }
 
@@ -2086,7 +2131,7 @@ static struct vector_info draw_select_signal(ivl_expr_t sube,
 
       for (idx = 0 ;  idx < res.wid ;  idx += 1) {
 	    if (idx >= bit_wid) {
-		  fprintf(vvp_out, "   %%mov %u, 0, %u; Pad from %u to %u\n",
+		  fprintf(vvp_out, "   %%movi %u, 0, %u; Pad from %u to %u\n",
 			  res.base+idx, res.wid-idx,
 			  ivl_expr_width(sube), wid);
 		  break;
@@ -2410,7 +2455,7 @@ static struct vector_info draw_unary_expr(ivl_expr_t exp, unsigned wid)
 
 		  fprintf(vvp_out, "    %%mov %u, %u, %u;\n",
 			  tmp.base, res.base, res.wid);
-		  fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
+		  fprintf(vvp_out, "    %%movi %u, 0, %u;\n",
 			  tmp.base+res.wid, tmp.wid-res.wid);
 		  clr_vector(res);
 		  res = tmp;
@@ -2460,7 +2505,7 @@ static struct vector_info draw_unary_expr(ivl_expr_t exp, unsigned wid)
 		  assert(res.base);
 		  fprintf(vvp_out, "    %%mov %u, %u, %u;\n",
 			  tmp.base, res.base, res.wid);
-		  fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
+		  fprintf(vvp_out, "    %%movi %u, 0, %u;\n",
 			  tmp.base+res.wid, tmp.wid-res.wid);
 		  clr_vector(res);
 		  res = tmp;
diff --git a/tgt-vvp/vvp_priv.h b/tgt-vvp/vvp_priv.h
index 53c31618e..15cf5d886 100644
--- a/tgt-vvp/vvp_priv.h
+++ b/tgt-vvp/vvp_priv.h
@@ -39,6 +39,12 @@ struct vector_info {
       unsigned wid;
 };
 
+/*
+ * Convenient constants...
+ */
+  /* Width limit for typical immediate arguments. */
+# define IMM_WID 32
+
 /*
  * Mangle all non-symbol characters in an identifier, quotes in names
  */
diff --git a/vvp/codes.h b/vvp/codes.h
index 0e93f72de..0ac52e0c6 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -37,6 +37,7 @@ extern bool of_ADD(vthread_t thr, vvp_code_t code);
 extern bool of_ADD_WR(vthread_t thr, vvp_code_t code);
 extern bool of_ADDI(vthread_t thr, vvp_code_t code);
 extern bool of_AND(vthread_t thr, vvp_code_t code);
+extern bool of_ANDI(vthread_t thr, vvp_code_t code);
 extern bool of_ANDR(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_AV(vthread_t thr, vvp_code_t code);
 extern bool of_ASSIGN_AVD(vthread_t thr, vvp_code_t code);
@@ -167,7 +168,7 @@ struct vvp_code_s {
       };
 
       union {
-	    unsigned bit_idx[2];
+	    uint32_t    bit_idx[2];
 	    vvp_net_t   *net2;
 	    vvp_code_t   cptr2;
 	    struct ufunc_core*ufunc_core_ptr;
diff --git a/vvp/compile.cc b/vvp/compile.cc
index d2e1c96d8..4ad8bf17d 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -85,6 +85,7 @@ const static struct opcode_table_s opcode_table[] = {
       { "%addi",   of_ADDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%and",    of_AND,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%and/r",  of_ANDR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%andi",   of_ANDI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%assign/av",of_ASSIGN_AV,3,{OA_ARR_PTR,OA_BIT1,     OA_BIT2} },
       { "%assign/av/d",of_ASSIGN_AVD,3,{OA_ARR_PTR,OA_BIT1,  OA_BIT2} },
       { "%assign/v0",of_ASSIGN_V0,3,{OA_FUNC_PTR,OA_BIT1,    OA_BIT2} },
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index 84941815c..59bf80097 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -531,7 +531,8 @@ is one of the 4 constant bits, the effect is to replicate the value
 into the destination vector. This is useful for filling a vector.
 
 The %movi variant moves a binary value, LSB first, into the
-destination vector.
+destination vector. The immediate value is up to 32bits, padded with
+zeros to fillout the width.
 
 * %mul <bit-l>, <bit-r>, <wid>
 
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 049995d17..442fc0be6 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -533,6 +533,28 @@ bool of_AND(vthread_t thr, vvp_code_t cp)
 }
 
 
+bool of_ANDI(vthread_t thr, vvp_code_t cp)
+{
+      unsigned idx1 = cp->bit_idx[0];
+      unsigned long imm = cp->bit_idx[1];
+      unsigned wid = cp->number;
+
+      assert(idx1 >= 4);
+
+      vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid);
+      vvp_vector4_t imv (wid, BIT4_0);
+
+      unsigned trans = wid;
+      if (trans > CPU_WORD_BITS)
+	    trans = CPU_WORD_BITS;
+      imv.setarray(0, trans, &imm);
+
+      val &= imv;
+
+      thr->bits4.set_vec(idx1, val);
+      return true;
+}
+
 bool of_ADD(vthread_t thr, vvp_code_t cp)
 {
       assert(cp->bit_idx[0] >= 4);
@@ -2950,13 +2972,24 @@ bool of_MOV_WR(vthread_t thr, vvp_code_t cp)
 bool of_MOVI(vthread_t thr, vvp_code_t cp)
 {
       unsigned dst = cp->bit_idx[0];
-      unsigned val = cp->bit_idx[1];
+      static unsigned long val[8] = {0, 0, 0, 0, 0, 0, 0, 0};
       unsigned wid = cp->number;
 
       thr_check_addr(thr, dst+wid-1);
 
-      for (unsigned idx = 0 ;  idx < wid ;  idx += 1, val >>= 1)
-	    thr->bits4.set_bit(dst+idx, (val&1)? BIT4_1 : BIT4_0);
+      val[0] = cp->bit_idx[1];
+
+      while (wid > 0) {
+	    unsigned trans = wid;
+	    if (trans > 8*CPU_WORD_BITS)
+		  trans = 8*CPU_WORD_BITS;
+
+	    thr->bits4.setarray(dst, trans, val);
+
+	    val[0] = 0;
+	    wid -= trans;
+	    dst += trans;
+      }
 
       return true;
 }

From b5e9e44e0709ac040feb6611dd692cf73ff31e0d Mon Sep 17 00:00:00 2001
From: Cary R <cygcary@yahoo.com>
Date: Tue, 27 May 2008 18:11:31 -0700
Subject: [PATCH 6/7] Fix error in of_SUBI with wide results.

This patch fixes an error in the recent rework of of_SUBI.
It was doing a double bit inversion.
---
 vvp/vthread.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 442fc0be6..5f82644ba 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -3770,7 +3770,7 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp)
       carry = 1;
       for (unsigned idx = 0 ;  idx < word_count ;  idx += 1) {
 	    lva[idx] = add_with_carry(lva[idx], ~imm, carry);
-	    imm = ~0UL;
+	    imm = 0UL;
       }
 
 	/* We know from the vector_to_array that the address is valid

From 2179797763556e2ad3b5c15a07aa5940a82a6a9a Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Tue, 27 May 2008 19:48:31 -0700
Subject: [PATCH 7/7] Do not allow unknows to be handled as logic immediate.

---
 tgt-vvp/eval_expr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c
index 4753bd0d5..e1fb2013c 100644
--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@@ -940,9 +940,9 @@ static struct vector_info draw_binary_expr_logic(ivl_expr_t exp,
       ivl_expr_t re = ivl_expr_oper2(exp);
 
       if (ivl_expr_opcode(exp) == '&') {
-	    if (number_is_immediate(re, IMM_WID))
+	    if (number_is_immediate(re, IMM_WID) && !number_is_unknown(re))
 		  return draw_logic_immediate(exp, le, re, wid);
-	    if (number_is_immediate(le, IMM_WID))
+	    if (number_is_immediate(le, IMM_WID) && !number_is_unknown(le))
 		  return draw_logic_immediate(exp, re, le, wid);
       }