From 8190307dd37b02079c57c56e01032a6eaa4463ae Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Mon, 26 May 2008 11:09:33 -0700 Subject: [PATCH 01/16] Optimize/inline vvp_bit4_r AND, OR and vector set bit. The AND and OR operators for vvp_bit4_t are slightly tweaked to be lighter and inlinable. The vvp_vector4_t::set_bit is optimized to do less silly mask fiddling. --- vvp/vvp_net.cc | 22 ------------- vvp/vvp_net.h | 83 +++++++++++++++++++++++++++++++------------------- 2 files changed, 51 insertions(+), 54 deletions(-) diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc index 9f9a50e55..dfc77fb40 100644 --- a/vvp/vvp_net.cc +++ b/vvp/vvp_net.cc @@ -61,28 +61,6 @@ vvp_bit4_t add_with_carry(vvp_bit4_t a, vvp_bit4_t b, vvp_bit4_t&c) } } -vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b) -{ - if (a == BIT4_0) - return BIT4_0; - if (b == BIT4_0) - return BIT4_0; - if (bit4_is_xz(a)) - return BIT4_X; - if (bit4_is_xz(b)) - return BIT4_X; - return BIT4_1; -} - -vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b) -{ - if (a == BIT4_1) - return BIT4_1; - if (b == BIT4_1) - return BIT4_1; - return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) ); -} - vvp_bit4_t operator ^ (vvp_bit4_t a, vvp_bit4_t b) { if (bit4_is_xz(a)) diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h index 3d4f6b881..18548c157 100644 --- a/vvp/vvp_net.h +++ b/vvp/vvp_net.h @@ -87,8 +87,21 @@ inline vvp_bit4_t bit4_z2x(vvp_bit4_t a) inline vvp_bit4_t operator ~ (vvp_bit4_t a) { return bit4_z2x((vvp_bit4_t) (((int)a) ^ 1)); } -extern vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b); -extern vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b); +inline vvp_bit4_t operator | (vvp_bit4_t a, vvp_bit4_t b) +{ + if (a==BIT4_1 || b==BIT4_1) + return BIT4_1; + return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) ); +} + +inline vvp_bit4_t operator & (vvp_bit4_t a, vvp_bit4_t b) +{ + if (a==BIT4_0 || b==BIT4_0) + return BIT4_0; + return bit4_z2x( (vvp_bit4_t) ((int)a | (int)b) ); +} + + extern vvp_bit4_t operator ^ (vvp_bit4_t a, vvp_bit4_t b); extern ostream& operator<< (ostream&o, vvp_bit4_t a); @@ -294,41 +307,47 @@ inline void vvp_vector4_t::set_bit(unsigned idx, vvp_bit4_t val) assert(idx < size_); unsigned long off = idx % BITS_PER_WORD; - unsigned long amask = 0, bmask = 0; - switch (val) { - case BIT4_0: - amask = 0; - bmask = 0; - break; - case BIT4_1: - amask = 1; - bmask = 0; - break; - case BIT4_X: - amask = 1; - bmask = 1; - break; - case BIT4_Z: - amask = 0; - bmask = 1; - break; - } - unsigned long mask = 1UL << off; - amask <<= off; - bmask <<= off; if (size_ > BITS_PER_WORD) { unsigned wdx = idx / BITS_PER_WORD; - abits_ptr_[wdx] &= ~mask; - abits_ptr_[wdx] |= amask; - bbits_ptr_[wdx] &= ~mask; - bbits_ptr_[wdx] |= bmask; + switch (val) { + case BIT4_0: + abits_ptr_[wdx] &= ~mask; + bbits_ptr_[wdx] &= ~mask; + break; + case BIT4_1: + abits_ptr_[wdx] |= mask; + bbits_ptr_[wdx] &= ~mask; + break; + case BIT4_X: + abits_ptr_[wdx] |= mask; + bbits_ptr_[wdx] |= mask; + break; + case BIT4_Z: + abits_ptr_[wdx] &= ~mask; + bbits_ptr_[wdx] |= mask; + break; + } } else { - abits_val_ &= ~mask; - abits_val_ |= amask; - bbits_val_ &= ~mask; - bbits_val_ |= bmask; + switch (val) { + case BIT4_0: + abits_val_ &= ~mask; + bbits_val_ &= ~mask; + break; + case BIT4_1: + abits_val_ |= mask; + bbits_val_ &= ~mask; + break; + case BIT4_X: + abits_val_ |= mask; + bbits_val_ |= mask; + break; + case BIT4_Z: + abits_val_ &= ~mask; + bbits_val_ |= mask; + break; + } } } From 5cc376ebd49711498a2ef79895e9d97acc2c9945 Mon Sep 17 00:00:00 2001 From: Stephen Williams 
Date: Mon, 26 May 2008 16:00:16 -0700 Subject: [PATCH 02/16] Optimize ADD and MUL instructions Make better use of the CPU word in ADD and MUL instructions. --- vvp/vthread.cc | 381 +++++++++++++++++++------------------------------ 1 file changed, 146 insertions(+), 235 deletions(-) diff --git a/vvp/vthread.cc b/vvp/vthread.cc index a7cf69f05..43154bf47 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -214,6 +214,67 @@ static vvp_vector4_t vthread_bits_to_vector(struct vthread_s*thr, } } +/* + * Some of the instructions do wide addition to arrays of long. They + * use this add_with_cary function to help. + */ +static inline unsigned long add_with_carry(unsigned long a, unsigned long b, + unsigned long&carry) +{ + unsigned long tmp = b + carry; + unsigned long sum = a + tmp; + carry = 0; + if (tmp < b) + carry = 1; + if (sum < tmp) + carry = 1; + if (sum < a) + carry = 1; + return sum; +} + +static unsigned long multiply_with_carry(unsigned long a, unsigned long b, + unsigned long&carry) +{ + const unsigned long mask = (1UL << (CPU_WORD_BITS/2)) - 1; + unsigned long a0 = a & mask; + unsigned long a1 = (a >> (CPU_WORD_BITS/2)) & mask; + unsigned long b0 = b & mask; + unsigned long b1 = (b >> (CPU_WORD_BITS/2)) & mask; + + unsigned long tmp = a0 * b0; + + unsigned long r00 = tmp & mask; + unsigned long c00 = (tmp >> (CPU_WORD_BITS/2)) & mask; + + tmp = a0 * b1; + + unsigned long r01 = tmp & mask; + unsigned long c01 = (tmp >> (CPU_WORD_BITS/2)) & mask; + + tmp = a1 * b0; + + unsigned long r10 = tmp & mask; + unsigned long c10 = (tmp >> (CPU_WORD_BITS/2)) & mask; + + tmp = a1 * b1; + + unsigned long r11 = tmp & mask; + unsigned long c11 = (tmp >> (CPU_WORD_BITS/2)) & mask; + + unsigned long r1 = c00 + r01 + r10; + unsigned long r2 = (r1 >> (CPU_WORD_BITS/2)) & mask; + r1 &= mask; + r2 += c01 + c10 + r11; + unsigned long r3 = (r2 >> (CPU_WORD_BITS/2)) & mask; + r2 &= mask; + r3 += c11; + r3 &= mask; + + carry = (r3 << (CPU_WORD_BITS/2)) + r2; + return (r1 << (CPU_WORD_BITS/2)) + r00; +} + /* * Create a new thread with the given start address. @@ -465,19 +526,8 @@ bool of_ADD(vthread_t thr, vvp_code_t cp) unsigned long carry; carry = 0; - for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) { - - unsigned long tmp = lvb[idx] + carry; - unsigned long sum = lva[idx] + tmp; - carry = 0; - if (tmp < lvb[idx]) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < lva[idx]) - carry = 1; - lva[idx] = sum; - } + for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) + lva[idx] = add_with_carry(lva[idx], lvb[idx], carry); /* We know from the vector_to_array that the address is valid in the thr->bitr4 vector, so just do the set bit. 
*/ @@ -525,30 +575,15 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp) unsigned word_count = (bit_width+CPU_WORD_BITS-1)/CPU_WORD_BITS; unsigned long*lva = vector_to_array(thr, bit_addr, bit_width); - unsigned long*lvb = 0; if (lva == 0) goto x_out; - lvb = new unsigned long[word_count]; - - lvb[0] = imm_value; - for (unsigned idx = 1 ; idx < word_count ; idx += 1) - lvb[idx] = 0; unsigned long carry; carry = 0; - for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < bit_width ; idx += 1) { - - unsigned long tmp = lvb[idx] + carry; - unsigned long sum = lva[idx] + tmp; - carry = 0; - if (tmp < lvb[idx]) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < lva[idx]) - carry = 1; - lva[idx] = sum; + for (unsigned idx = 0 ; idx < word_count ; idx += 1) { + lva[idx] = add_with_carry(lva[idx], imm_value, carry); + imm_value = 0; } /* We know from the vector_to_array that the address is valid @@ -557,7 +592,6 @@ bool of_ADDI(vthread_t thr, vvp_code_t cp) thr->bits4.setarray(bit_addr, bit_width, lva); delete[]lva; - delete[]lvb; return true; @@ -2921,101 +2955,59 @@ bool of_MOVI(vthread_t thr, vvp_code_t cp) bool of_MUL(vthread_t thr, vvp_code_t cp) { - assert(cp->bit_idx[0] >= 4); - if(cp->number <= 8*sizeof(unsigned long)) { + unsigned adra = cp->bit_idx[0]; + unsigned adrb = cp->bit_idx[1]; + unsigned wid = cp->number; - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - unsigned long lv = 0, rv = 0; + assert(adra >= 4); - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - vvp_bit4_t rb = thr_get_bit(thr, idx2); - - if (bit4_is_xz(lb) || bit4_is_xz(rb)) - goto x_out; - - lv |= (unsigned long) lb << idx; - rv |= (unsigned long) rb << idx; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; + unsigned long*ap = vector_to_array(thr, adra, wid); + if (ap == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; } - lv *= rv; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1) ? BIT4_1 : BIT4_0); - lv >>= 1; + unsigned long*bp = vector_to_array(thr, adrb, wid); + if (bp == 0) { + delete[]ap; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; } - return true; - } else { - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; + // If the value fits in a single CPU word, then do it the easy way. 
+ if (wid <= CPU_WORD_BITS) { + ap[0] *= bp[0]; + thr->bits4.setarray(adra, wid, ap); + delete[]ap; + delete[]bp; + return true; + } - unsigned char *a, *b, *sum; - a = new unsigned char[cp->number]; - b = new unsigned char[cp->number]; - sum = new unsigned char[cp->number]; + unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; + unsigned long*res = new unsigned long[words]; + for (unsigned idx = 0 ; idx < words ; idx += 1) + res[idx] = 0; - int mxa = -1; - int mxb = -1; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - vvp_bit4_t rb = thr_get_bit(thr, idx2); - - if (bit4_is_xz(lb) || bit4_is_xz(rb)) - { - delete[]sum; - delete[]b; - delete[]a; - goto x_out; + for (unsigned mul_a = 0 ; mul_a < words ; mul_a += 1) { + for (unsigned mul_b = 0 ; mul_b < words ; mul_b += 1) { + unsigned long sum; + unsigned long tmp = multiply_with_carry(ap[mul_a], bp[mul_b], sum); + unsigned base = mul_a + mul_b; + unsigned long carry = 0; + res[base] = add_with_carry(res[base], tmp, carry); + for (unsigned add_idx = base+1; add_idx < words; add_idx += 1) { + res[add_idx] = add_with_carry(res[add_idx], sum, carry); + sum = 0; } - - if((a[idx] = lb)) mxa=idx+1; - if((b[idx] = rb)) mxb=idx; - sum[idx]=0; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; + } } -// do "unsigned ZZ sum = a * b" the hard way.. - for(int i=0;i<=mxb;i++) - { - if(b[i]) - { - unsigned char carry=0; - unsigned char temp; - - for(int j=0;j<=mxa;j++) - { - if(i+j>=(int)cp->number) break; - temp=sum[i+j]+a[j]+carry; - sum[i+j]=(temp&1); - carry=(temp>>1); - } - } - } - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, sum[idx]?BIT4_1:BIT4_0); - } - - delete[]sum; - delete[]b; - delete[]a; - return true; - } - - x_out: - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) - thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X); - + thr->bits4.setarray(adra, wid, res); + delete[]ap; + delete[]bp; + delete[]res; return true; } @@ -3030,101 +3022,48 @@ bool of_MUL_WR(vthread_t thr, vvp_code_t cp) bool of_MULI(vthread_t thr, vvp_code_t cp) { - assert(cp->bit_idx[0] >= 4); + unsigned adr = cp->bit_idx[0]; + unsigned long imm = cp->bit_idx[1]; + unsigned wid = cp->number; - /* If the value fits into a native unsigned long, then make an - unsigned long variable with the numbers, to a native - multiply, and work with that. */ - - if(cp->number <= 8*sizeof(unsigned long)) { - unsigned idx1 = cp->bit_idx[0]; - unsigned long lv = 0, rv = cp->bit_idx[1]; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - - if (bit4_is_xz(lb)) - goto x_out; - - lv |= (unsigned long) lb << idx; - - idx1 += 1; - } - - lv *= rv; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)? BIT4_1 : BIT4_0); - lv >>= 1; - } + assert(adr >= 4); + unsigned long*val = vector_to_array(thr, adr, wid); + // If there are X bits in the value, then return X. + if (val == 0) { + vvp_vector4_t tmp(cp->number, BIT4_X); + thr->bits4.set_vec(cp->bit_idx[0], tmp); return true; } - /* number is too large for local long, so do bitwise - multiply. 
*/ - - unsigned idx1; idx1 = cp->bit_idx[0]; - unsigned imm; imm = cp->bit_idx[1]; - - unsigned char *a, *b, *sum; - a = new unsigned char[cp->number]; - b = new unsigned char[cp->number]; - sum = new unsigned char[cp->number]; - - int mxa; mxa = -1; - int mxb; mxb = -1; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - vvp_bit4_t rb = (imm & 1)? BIT4_1 : BIT4_0; - - imm >>= 1; - - if (bit4_is_xz(lb)) { - delete[]sum; - delete[]b; - delete[]a; - goto x_out; - } - - if((a[idx] = lb)) mxa=idx+1; - if((b[idx] = rb)) mxb=idx; - sum[idx]=0; - - idx1 += 1; + // If everything fits in a word, then do it the easy way. + if (wid <= CPU_WORD_BITS) { + val[0] *= imm; + thr->bits4.setarray(adr, wid, val); + delete[]val; + return true; } -// do "unsigned ZZ sum = a * b" the hard way.. - for(int i=0;i<=mxb;i++) { - if(b[i]) { - unsigned char carry=0; - unsigned char temp; + unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; + unsigned long*res = new unsigned long[words]; + for (unsigned idx = 0 ; idx < words ; idx += 1) + res[idx] = 0; - for(int j=0;j<=mxa;j++) { - if(i+j>=(int)cp->number) break; - temp=sum[i+j]+a[j]+carry; - sum[i+j]=(temp&1); - carry=(temp>>1); - } + for (unsigned mul_idx = 0 ; mul_idx < words ; mul_idx += 1) { + unsigned long sum; + unsigned long tmp = multiply_with_carry(val[mul_idx], imm, sum); + + unsigned long carry = 0; + res[mul_idx] = add_with_carry(res[mul_idx], tmp, carry); + for (unsigned add_idx = mul_idx+1 ; add_idx < words ; add_idx += 1) { + res[add_idx] = add_with_carry(res[add_idx], sum, carry); + sum = 0; } } - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, sum[idx]?BIT4_1:BIT4_0); - } - - delete[]sum; - delete[]b; - delete[]a; - - return true; - - x_out: - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) - thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X); - + thr->bits4.setarray(adr, wid, res); + delete[]val; + delete[]res; return true; } @@ -3754,20 +3693,10 @@ bool of_SUB(vthread_t thr, vvp_code_t cp) goto x_out; - unsigned carry; + unsigned long carry; carry = 1; - for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) { - unsigned long tmp = ~lvb[idx] + carry; - unsigned long sum = tmp + lva[idx]; - carry = 0; - if (tmp < ~lvb[idx]) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < lva[idx]) - carry = 1; - lva[idx] = sum; - } + for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) + lva[idx] = add_with_carry(lva[idx], ~lvb[idx], carry); /* We know from the vector_to_array that the address is valid @@ -3802,34 +3731,17 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp) assert(cp->bit_idx[0] >= 4); unsigned word_count = (cp->number+CPU_WORD_BITS-1)/CPU_WORD_BITS; - + unsigned long imm = cp->bit_idx[1]; unsigned long*lva = vector_to_array(thr, cp->bit_idx[0], cp->number); - unsigned long*lvb; if (lva == 0) goto x_out; - lvb = new unsigned long[word_count]; - - - lvb[0] = cp->bit_idx[1]; - lvb[0] = ~lvb[0]; - for (unsigned idx = 1 ; idx < word_count ; idx += 1) - lvb[idx] = ~0UL; unsigned long carry; carry = 1; - for (unsigned idx = 0 ; (idx*CPU_WORD_BITS) < cp->number ; idx += 1) { - - unsigned long tmp = lvb[idx] + carry; - unsigned long sum = lva[idx] + tmp; - carry = 0UL; - if (tmp < lvb[idx]) - carry = 1; - if (sum < tmp) - carry = 1; - if (sum < lva[idx]) - carry = 1; - lva[idx] = sum; + for (unsigned idx = 0 ; idx < word_count ; idx += 1) { + lva[idx] = add_with_carry(lva[idx], ~imm, carry); + imm = ~0UL; } /* We know from the 
vector_to_array that the address is valid @@ -3838,7 +3750,6 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp) thr->bits4.setarray(cp->bit_idx[0], cp->number, lva); delete[]lva; - delete[]lvb; return true; From 6987d16bd33bfcf6383bb10838630cda4669d976 Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Mon, 26 May 2008 16:44:58 -0700 Subject: [PATCH 03/16] Optimize the %load/vp0 to use subarrays. This instruction adds an integer value to the value being loaded. This optimization uses subarrays instead of the += operator. This is faster because the value is best loaded into the vector as a subarray anyhow. --- vvp/vthread.cc | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/vvp/vthread.cc b/vvp/vthread.cc index 43154bf47..4076c1ccc 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -2476,7 +2476,7 @@ bool of_LOAD_NX(vthread_t thr, vvp_code_t cp) * The functor to read from is the vvp_net_t object pointed to by the * cp->net pointer. */ -vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp) +static vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp) { assert(cp->bit_idx[0] >= 4); assert(cp->bit_idx[1] > 0); @@ -2490,9 +2490,8 @@ vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp) if (sig == 0) { cerr << "%%load/v error: Net arg not a vector signal? " << typeid(*net->fun).name() << endl; + assert(sig); } - assert(sig); - vvp_vector4_t sig_value = sig->vec4_value(); sig_value.resize(wid); @@ -2532,15 +2531,35 @@ bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp) vvp_vector4_t sig_value(wid, BIT4_0); sig_value.copy_bits(load_base(thr, cp)); - /* Add the addend value */ - sig_value += addend; - /* Check the address once, before we scan the vector. */ thr_check_addr(thr, bit+wid-1); + unsigned long*val = sig_value.subarray(0, wid); + if (val == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(bit, tmp); + return true; + } + + unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS; + unsigned long carry = 0; + unsigned long imm = addend; + if (addend >= 0) { + for (unsigned idx = 0 ; idx < words ; idx += 1) { + val[idx] = add_with_carry(val[idx], imm, carry); + imm = 0UL; + } + } else { + for (unsigned idx = 0 ; idx < words ; idx += 1) { + val[idx] = add_with_carry(val[idx], imm, carry); + imm = -1UL; + } + } + /* Copy the vector bits into the bits4 vector. Do the copy directly to skip the excess calls to thr_check_addr. */ - thr->bits4.set_vec(bit, sig_value); + thr->bits4.setarray(bit, wid, val); + delete[]val; return true; } From 0fa3099ded623d7e46a5496fb10cfad7d1f5ed3f Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Tue, 27 May 2008 11:54:39 -0700 Subject: [PATCH 04/16] Optimize %div and %div/s Use high radix long division to take advantage of the divide hardware of the host computer. It looks brute force at first glance, but since it is using the optimized arithmetic of the host processor, it is much faster then implementing "fast" algorithms the hard way. 
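[Editorial sketch, not part of the patch] The simplest form of the "high
radix" idea is division of a multi-word value by a single-word divisor:
each step hands a two-word partial remainder to the host's native divide,
so the algorithm works in radix 2^32 instead of bit by bit.  The helper
below is illustrative only (fixed 32-bit words are assumed for brevity);
the patch itself generalizes this to multi-word divisors with the
divide2words()/divide_bits() routines further down.

    #include <stdint.h>

    /* Divide the little-endian word array val[0..words-1] by div in
       place and return the remainder; one hardware divide per word. */
    static uint32_t divide_by_word(uint32_t *val, unsigned words,
                                   uint32_t div)
    {
          uint64_t rem = 0;
          for (unsigned idx = words ; idx > 0 ; idx -= 1) {
                uint64_t cur = (rem << 32) | val[idx-1];
                val[idx-1] = (uint32_t) (cur / div);
                rem = cur % div;
          }
          return (uint32_t) rem;
    }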
--- vvp/vthread.cc | 509 +++++++++++++++++++++++-------------------------- 1 file changed, 243 insertions(+), 266 deletions(-) diff --git a/vvp/vthread.cc b/vvp/vthread.cc index 4076c1ccc..049995d17 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -275,6 +275,24 @@ static unsigned long multiply_with_carry(unsigned long a, unsigned long b, return (r1 << (CPU_WORD_BITS/2)) + r00; } +static void multiply_array_imm(unsigned long*res, unsigned long*val, + unsigned words, unsigned long imm) +{ + for (unsigned idx = 0 ; idx < words ; idx += 1) + res[idx] = 0; + + for (unsigned mul_idx = 0 ; mul_idx < words ; mul_idx += 1) { + unsigned long sum; + unsigned long tmp = multiply_with_carry(val[mul_idx], imm, sum); + + unsigned long carry = 0; + res[mul_idx] = add_with_carry(res[mul_idx], tmp, carry); + for (unsigned add_idx = mul_idx+1 ; add_idx < words ; add_idx += 1) { + res[add_idx] = add_with_carry(res[add_idx], sum, carry); + sum = 0; + } + } +} /* * Create a new thread with the given start address. @@ -1511,297 +1529,268 @@ bool of_DISABLE(vthread_t thr, vvp_code_t cp) return ! disabled_myself_flag; } -static void divide_bits(unsigned len, unsigned char*lbits, - const unsigned char*rbits) +/* + * This function divides a 2-word number {high, a} by a 1-word + * number. Assume that high < b. + */ +static unsigned long divide2words(unsigned long a, unsigned long b, + unsigned long high) { - unsigned char *a, *b, *z, *t; - a = new unsigned char[len+1]; - b = new unsigned char[len+1]; - z = new unsigned char[len+1]; - t = new unsigned char[len+1]; + unsigned long result = 0; + while (high > 0) { + unsigned long tmp_result = ULONG_MAX / b; + unsigned long remain = ULONG_MAX % b; - unsigned char carry; - unsigned char temp; - - int mxa = -1, mxz = -1; - int i; - int current, copylen; - - - for (unsigned idx = 0 ; idx < len ; idx += 1) { - unsigned lb = lbits[idx]; - unsigned rb = rbits[idx]; - - z[idx]=lb; - a[idx]=1-rb; // for 2s complement add.. - - } - z[len]=0; - a[len]=1; - - for(i=0;i<(int)len+1;i++) { - b[i]=0; - } - - for(i=len-1;i>=0;i--) { - if(!a[i]) { - mxa=i; - break; - } - } - - for(i=len-1;i>=0;i--) { - if(z[i]) { - mxz=i; - break; - } - } - - if((mxa>mxz)||(mxa==-1)) { - if(mxa==-1) { - fprintf(stderr, "Division By Zero error, exiting.\n"); - exit(255); + remain += 1; + if (remain >= b) { + remain -= b; + result += 1; } - goto tally; + // Now 0x1_0...0 = b*tmp_result + remain + // high*0x1_0...0 = high*(b*tmp_result + remain) + // high*0x1_0...0 = high*b*tmp_result + high*remain + + // We know that high*0x1_0...0 >= high*b*tmp_result, and + // we know that high*0x1_0...0 > high*remain. Use + // high*remain as the remainder for another iteration, + // and add tmp_result*high into the current estimate of + // the result. + result += tmp_result * high; + + // The new iteration starts with high*remain + a. + remain = multiply_with_carry(high, remain, high); + a = add_with_carry(a, remain, high); + + // Now result*b + {high,a} == the input {high,a}. It is + // possible that the new high >= 1. If so, it will + // certainly be less then high from the previous + // iteration. Do another iteration and it will shrink, + // eventually to 0. } - copylen = mxa + 2; - current = mxz - mxa; + // high is now 0, so a is the remaining remainder, so we can + // finish off the integer divide with a simple a/b. 
- while(current > -1) { - carry = 1; - for(i=0;i>1); + return result + a/b; +} + +static unsigned long* divide_bits(unsigned long*ap, unsigned long*bp, unsigned wid) +{ + + unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; + + unsigned btop = words-1; + while (btop > 0 && bp[btop] == 0) + btop -= 1; + + // Detect divide by 0, and exit. + if (btop==0 && bp[0]==0) + return 0; + + unsigned long*diff = new unsigned long[words]; + unsigned long*result= new unsigned long[words]; + for (unsigned idx = 0 ; idx < words ; idx += 1) + result[idx] = 0; + + for (unsigned cur = words-btop ; cur > 0 ; cur -= 1) { + unsigned cur_ptr = cur-1; + unsigned long cur_res; + if (ap[cur_ptr+btop] >= bp[btop]) { + cur_res = ap[cur_ptr+btop] / bp[btop]; + + } else if (cur_ptr+btop+1 >= words) { + continue; + + } else if (ap[cur_ptr+btop+1] == 0) { + continue; + + } else { + cur_res = divide2words(ap[cur_ptr+btop], bp[btop], + ap[cur_ptr+btop+1]); } - if(carry) { - for(i=0;i0) Try it, + // and if the difference comes out negative, then adjust + // then. + + multiply_array_imm(diff+cur_ptr, bp, words-cur_ptr, cur_res); + unsigned long carry = 1; + for (unsigned idx = cur_ptr ; idx < words ; idx += 1) + ap[idx] = add_with_carry(ap[idx], ~diff[idx], carry); + + // ap has the diff subtracted out of it. If cur_res was + // too large, then ap will turn negative. (We easily + // tell that ap turned negative by looking at + // carry&1. If it is 0, then it is *negative*.) In that + // case, we know that cur_res was too large by 1. Correct by + // adding 1b back in and reducing cur_res. + if (carry&1 == 0) { + cur_res -= 1; + carry = 0; + for (unsigned idx = cur_ptr ; idx < words ; idx += 1) + ap[idx] = add_with_carry(ap[idx], bp[idx-cur_ptr], carry); + // The sign *must* have changed again. + assert(carry == 1); } - current--; + result[cur_ptr] = cur_res; } - tally: - for (unsigned idx = 0 ; idx < len ; idx += 1) { - // n.b., z[] has the remainder... - lbits[idx] = b[idx]; - } + // Now ap contains the remainder and result contains the + // desired result. We should find that: + // input-a = bp * result + ap; - delete []t; - delete []z; - delete []b; - delete []a; + delete[]diff; + return result; } bool of_DIV(vthread_t thr, vvp_code_t cp) { - assert(cp->bit_idx[0] >= 4); + unsigned adra = cp->bit_idx[0]; + unsigned adrb = cp->bit_idx[1]; + unsigned wid = cp->number; - if(cp->number <= 8*sizeof(unsigned long)) { - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - unsigned long lv = 0, rv = 0; + assert(adra >= 4); - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - vvp_bit4_t lb = thr_get_bit(thr, idx1); - vvp_bit4_t rb = thr_get_bit(thr, idx2); - - if (bit4_is_xz(lb) || bit4_is_xz(rb)) - goto x_out; - - lv |= (unsigned long) lb << idx; - rv |= (unsigned long) rb << idx; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; - } - - if (rv == BIT4_0) - goto x_out; - - lv /= rv; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1) ? BIT4_1 : BIT4_0); - lv >>= 1; - } - - return true; - - } else { - - /* Make a string of the bits of the numbers to be - divided. Then divide them, and write the results into - the thread. 
*/ - unsigned char*lbits = new unsigned char[cp->number]; - unsigned char*rbits = new unsigned char[cp->number]; - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - bool rval_is_zero = true; - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - lbits[idx] = thr_get_bit(thr, idx1); - rbits[idx] = thr_get_bit(thr, idx2); - if ((lbits[idx] | rbits[idx]) > 1) { - delete[]lbits; - delete[]rbits; - goto x_out; - } - - if (rbits[idx] != 0) - rval_is_zero = false; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; - } - - /* Notice the special case of divide by 0. */ - if (rval_is_zero) { - delete[]lbits; - delete[]rbits; - goto x_out; - } - - divide_bits(cp->number, lbits, rbits); - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, lbits[idx]?BIT4_1:BIT4_0); - } - - delete[]lbits; - delete[]rbits; + unsigned long*ap = vector_to_array(thr, adra, wid); + if (ap == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); return true; } - x_out: - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) - thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X); + unsigned long*bp = vector_to_array(thr, adrb, wid); + if (bp == 0) { + delete[]ap; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; + } + // If the value fits in a single CPU word, then do it the easy way. + if (wid <= CPU_WORD_BITS) { + if (bp[0] == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + } else { + ap[0] /= bp[0]; + thr->bits4.setarray(adra, wid, ap); + } + delete[]ap; + delete[]bp; + return true; + } + + unsigned long*result = divide_bits(ap, bp, wid); + if (result == 0) { + delete[]ap; + delete[]bp; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; + } + + // Now ap contains the remainder and result contains the + // desired result. We should find that: + // input-a = bp * result + ap; + + thr->bits4.setarray(adra, wid, result); + delete[]ap; + delete[]bp; + delete[]result; return true; } -static void negate_bits(unsigned len, unsigned char*bits) + +static void negate_words(unsigned long*val, unsigned words) { - unsigned char carry = 1; - for (unsigned idx = 0 ; idx < len ; idx += 1) { - carry += bits[idx]? 0 : 1; - bits[idx] = carry & 1; - carry >>= 1; - } + unsigned long carry = 1; + for (unsigned idx = 0 ; idx < words ; idx += 1) + val[idx] = add_with_carry(0, ~val[idx], carry); } bool of_DIV_S(vthread_t thr, vvp_code_t cp) { - assert(cp->bit_idx[0] >= 4); + unsigned adra = cp->bit_idx[0]; + unsigned adrb = cp->bit_idx[1]; + unsigned wid = cp->number; + unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS; - if(cp->number <= 8*sizeof(long)) { - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - long lv = 0, rv = 0; + assert(adra >= 4); - unsigned lb = 0; - unsigned rb = 0; - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - lb = thr_get_bit(thr, idx1); - rb = thr_get_bit(thr, idx2); - - if ((lb | rb) & 2) - goto x_out; - - lv |= (long)lb << idx; - rv |= (long)rb << idx; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; - } - - /* Extend the sign to fill the native long. 
*/ - for (unsigned idx = cp->number; idx < (8*sizeof lv); idx += 1) { - lv |= (long)lb << idx; - rv |= (long)rb << idx; - } - - if (rv == 0) - goto x_out; - - lv /= rv; - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, (lv&1)?BIT4_1:BIT4_0); - lv >>= 1; - } - - } else { - unsigned char*lbits = new unsigned char[cp->number]; - unsigned char*rbits = new unsigned char[cp->number]; - unsigned idx1 = cp->bit_idx[0]; - unsigned idx2 = cp->bit_idx[1]; - bool rval_is_zero = true; - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - lbits[idx] = thr_get_bit(thr, idx1); - rbits[idx] = thr_get_bit(thr, idx2); - if ((lbits[idx] | rbits[idx]) > 1) { - delete[]lbits; - delete[]rbits; - goto x_out; - } - - if (rbits[idx] != 0) - rval_is_zero = false; - - idx1 += 1; - if (idx2 >= 4) - idx2 += 1; - } - - /* Notice the special case of divide by 0. */ - if (rval_is_zero) { - delete[]lbits; - delete[]rbits; - goto x_out; - } - - /* Signed division is unsigned division on the absolute - values of the operands, then corrected for the number - of signs. */ - unsigned sign_flag = 0; - if (lbits[cp->number-1]) { - sign_flag += 1; - negate_bits(cp->number, lbits); - } - if (rbits[cp->number-1]) { - sign_flag += 1; - negate_bits(cp->number, rbits); - } - - divide_bits(cp->number, lbits, rbits); - - if (sign_flag & 1) { - negate_bits(cp->number, lbits); - } - - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) { - thr_put_bit(thr, cp->bit_idx[0]+idx, lbits[idx]?BIT4_1:BIT4_0); - } - - delete[]lbits; - delete[]rbits; + unsigned long*ap = vector_to_array(thr, adra, wid); + if (ap == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; } - return true; + unsigned long*bp = vector_to_array(thr, adrb, wid); + if (bp == 0) { + delete[]ap; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; + } - x_out: - for (unsigned idx = 0 ; idx < cp->number ; idx += 1) - thr_put_bit(thr, cp->bit_idx[0]+idx, BIT4_X); + unsigned long sign_mask = 0; + if (unsigned long sign_bits = (words*CPU_WORD_BITS) - wid) { + sign_mask = -1UL << (CPU_WORD_BITS-sign_bits); + if (ap[words-1] & (sign_mask>>1)) + ap[words-1] |= sign_mask; + if (bp[words-1] & (sign_mask>>1)) + bp[words-1] |= sign_mask; + } + if (wid <= CPU_WORD_BITS) { + if (bp[0] == 0) { + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + } else { + long tmpa = (long) ap[0]; + long tmpb = (long) bp[0]; + long res = tmpa / tmpb; + ap[0] = ((unsigned long)res) & ~sign_mask; + thr->bits4.setarray(adra, wid, ap); + } + delete[]ap; + delete[]bp; + return true; + } + + // We need to the actual division to positive integers. Make + // them positive here, and remember the negations. 
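      // [Editorial note, not part of the patch] This is the usual sign
      // rule for truncating division: divide the magnitudes, then negate
      // the quotient when exactly one operand was negative.  For example,
      // -7/3 -> 7/3 = 2, one negation, so the result is -2; -7/-3 -> 2
      // with no negation.  A one-word sketch of the same rule (ignoring
      // the LONG_MIN corner case):
      //
      //     bool neg = (a < 0) != (b < 0);
      //     long q = labs(a) / labs(b);
      //     return neg ? -q : q;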
+ bool negate_flag = false; + if ( ((long) ap[words-1]) < 0 ) { + negate_flag = true; + negate_words(ap, words); + } + if ( ((long) bp[words-1]) < 0 ) { + negate_flag ^= true; + negate_words(bp, words); + } + + unsigned long*result = divide_bits(ap, bp, wid); + if (result == 0) { + delete[]ap; + delete[]bp; + vvp_vector4_t tmp(wid, BIT4_X); + thr->bits4.set_vec(adra, tmp); + return true; + } + + if (negate_flag) { + negate_words(result, words); + } + + result[words-1] &= ~sign_mask; + + thr->bits4.setarray(adra, wid, result); + delete[]ap; + delete[]bp; + delete[]result; return true; } @@ -3065,20 +3054,8 @@ bool of_MULI(vthread_t thr, vvp_code_t cp) unsigned words = (wid+CPU_WORD_BITS-1) / CPU_WORD_BITS; unsigned long*res = new unsigned long[words]; - for (unsigned idx = 0 ; idx < words ; idx += 1) - res[idx] = 0; - for (unsigned mul_idx = 0 ; mul_idx < words ; mul_idx += 1) { - unsigned long sum; - unsigned long tmp = multiply_with_carry(val[mul_idx], imm, sum); - - unsigned long carry = 0; - res[mul_idx] = add_with_carry(res[mul_idx], tmp, carry); - for (unsigned add_idx = mul_idx+1 ; add_idx < words ; add_idx += 1) { - res[add_idx] = add_with_carry(res[add_idx], sum, carry); - sum = 0; - } - } + multiply_array_imm(res, val, words, imm); thr->bits4.setarray(adr, wid, res); delete[]val; From 5a0fe9ff8375f9e32d61f9d5e0d4649d8f93864e Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Tue, 27 May 2008 17:51:28 -0700 Subject: [PATCH 05/16] Better use of immediate operands. Clarify that operands are typically 32bits, and have the code generator make better use of this. Also improve the %movi implementation to work well with marger vectors. Add the %andi instruction to use immediate operands. --- tgt-vvp/eval_expr.c | 83 ++++++++++++++++++++++++++++++++++----------- tgt-vvp/vvp_priv.h | 6 ++++ vvp/codes.h | 3 +- vvp/compile.cc | 1 + vvp/opcodes.txt | 3 +- vvp/vthread.cc | 39 +++++++++++++++++++-- 6 files changed, 111 insertions(+), 24 deletions(-) diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c index 08080fe7e..4753bd0d5 100644 --- a/tgt-vvp/eval_expr.c +++ b/tgt-vvp/eval_expr.c @@ -92,7 +92,7 @@ unsigned long get_number_immediate(ivl_expr_t ex) case '0': break; case '1': - imm |= 1 << idx; + imm |= 1UL << idx; break; default: assert(0); @@ -909,12 +909,43 @@ static struct vector_info draw_binary_expr_le(ivl_expr_t exp, return lv; } +static struct vector_info draw_logic_immediate(ivl_expr_t exp, + ivl_expr_t le, + ivl_expr_t re, + unsigned wid) +{ + struct vector_info lv = draw_eval_expr_wid(le, wid, STUFF_OK_XZ); + unsigned long imm = get_number_immediate(re); + + assert(lv.base >= 4); + + switch (ivl_expr_opcode(exp)) { + + case '&': + fprintf(vvp_out, " %%andi %u, %lu, %u;\n", lv.base, imm, lv.wid); + break; + + default: + assert(0); + break; + } + + return lv; +} + static struct vector_info draw_binary_expr_logic(ivl_expr_t exp, unsigned wid) { ivl_expr_t le = ivl_expr_oper1(exp); ivl_expr_t re = ivl_expr_oper2(exp); + if (ivl_expr_opcode(exp) == '&') { + if (number_is_immediate(re, IMM_WID)) + return draw_logic_immediate(exp, le, re, wid); + if (number_is_immediate(le, IMM_WID)) + return draw_logic_immediate(exp, re, le, wid); + } + struct vector_info lv; struct vector_info rv; @@ -1167,15 +1198,15 @@ static struct vector_info draw_add_immediate(ivl_expr_t le, imm = get_number_immediate(re); /* Now generate enough %addi instructions to add the entire - immediate value to the destination. 
The adds are done 16 - bits at a time, but 17 bits are done to push the carry into + immediate value to the destination. The adds are done IMM_WID + bits at a time, but +1 bits are done to push the carry into the higher bits if needed. */ { unsigned base; - for (base = 0 ; base < lv.wid ; base += 16) { - unsigned long tmp = imm & 0xffffUL; + for (base = 0 ; base < lv.wid ; base += IMM_WID) { + unsigned long tmp = imm & 0xffffffffUL; unsigned add_wid = lv.wid - base; - imm >>= 16; + imm >>= IMM_WID; fprintf(vvp_out, " %%addi %u, %lu, %u;\n", lv.base+base, tmp, add_wid); @@ -1203,7 +1234,7 @@ static struct vector_info draw_sub_immediate(ivl_expr_t le, assert(lv.wid == wid); imm = get_number_immediate(re); - assert( (imm & ~0xffff) == 0 ); + assert( (imm & ~0xffffffffUL) == 0 ); switch (lv.base) { case 0: @@ -1299,13 +1330,13 @@ static struct vector_info draw_binary_expr_arith(ivl_expr_t exp, unsigned wid) if ((ivl_expr_opcode(exp) == '-') && (ivl_expr_type(re) == IVL_EX_NUMBER) && (! number_is_unknown(re)) - && number_is_immediate(re, 16)) + && number_is_immediate(re, IMM_WID)) return draw_sub_immediate(le, re, wid); if ((ivl_expr_opcode(exp) == '*') && (ivl_expr_type(re) == IVL_EX_NUMBER) && (! number_is_unknown(re)) - && number_is_immediate(re, 16)) + && number_is_immediate(re, IMM_WID)) return draw_mul_immediate(le, re, wid); lv = draw_eval_expr_wid(le, wid, STUFF_OK_XZ); @@ -1612,9 +1643,9 @@ static struct vector_info draw_number_expr(ivl_expr_t exp, unsigned wid) vvp_errors += 1; } - if ((!number_is_unknown(exp)) && number_is_immediate(exp, 16)) { - int val = get_number_immediate(exp); - fprintf(vvp_out, " %%movi %u, %d, %u;\n", res.base, val, wid); + if ((!number_is_unknown(exp)) && number_is_immediate(exp, IMM_WID)) { + unsigned long val = get_number_immediate(exp); + fprintf(vvp_out, " %%movi %u, %lu, %u;\n", res.base, val, wid); return res; } @@ -1836,8 +1867,8 @@ static struct vector_info draw_string_expr(ivl_expr_t exp, unsigned wid) idx = 0; while (idx < nwid) { unsigned bits; - unsigned trans = 16; - if (nwid-idx < 16) + unsigned trans = IMM_WID; + if (nwid-idx < trans) trans = nwid-idx; bits = *p; @@ -1845,6 +1876,14 @@ static struct vector_info draw_string_expr(ivl_expr_t exp, unsigned wid) if (trans > 8) { bits |= *p << 8; p -= 1; + if (trans > 16) { + bits |= *p << 16; + p -= 1; + if (trans > 24) { + bits |= *p << 24; + p -= 1; + } + } } fprintf(vvp_out, " %%movi %u, %u, %u;\n", res.base+idx,bits,trans); @@ -1881,8 +1920,14 @@ void pad_expr_in_place(ivl_expr_t exp, struct vector_info res, unsigned swid) res.base+idx, res.base+swid-1); } else { - fprintf(vvp_out, " %%mov %u, 0, %u;\n", - res.base+swid, res.wid-swid); + unsigned base = res.base+swid; + unsigned count = res.wid-swid; + /* The %movi is faster for larger widths, but for very + small counts, the %mov is faster. 
*/ + if (count > 4) + fprintf(vvp_out, " %%movi %u, 0, %u;\n", base, count); + else + fprintf(vvp_out, " %%mov %u, 0, %u;\n", base, count); } } @@ -2086,7 +2131,7 @@ static struct vector_info draw_select_signal(ivl_expr_t sube, for (idx = 0 ; idx < res.wid ; idx += 1) { if (idx >= bit_wid) { - fprintf(vvp_out, " %%mov %u, 0, %u; Pad from %u to %u\n", + fprintf(vvp_out, " %%movi %u, 0, %u; Pad from %u to %u\n", res.base+idx, res.wid-idx, ivl_expr_width(sube), wid); break; @@ -2410,7 +2455,7 @@ static struct vector_info draw_unary_expr(ivl_expr_t exp, unsigned wid) fprintf(vvp_out, " %%mov %u, %u, %u;\n", tmp.base, res.base, res.wid); - fprintf(vvp_out, " %%mov %u, 0, %u;\n", + fprintf(vvp_out, " %%movi %u, 0, %u;\n", tmp.base+res.wid, tmp.wid-res.wid); clr_vector(res); res = tmp; @@ -2460,7 +2505,7 @@ static struct vector_info draw_unary_expr(ivl_expr_t exp, unsigned wid) assert(res.base); fprintf(vvp_out, " %%mov %u, %u, %u;\n", tmp.base, res.base, res.wid); - fprintf(vvp_out, " %%mov %u, 0, %u;\n", + fprintf(vvp_out, " %%movi %u, 0, %u;\n", tmp.base+res.wid, tmp.wid-res.wid); clr_vector(res); res = tmp; diff --git a/tgt-vvp/vvp_priv.h b/tgt-vvp/vvp_priv.h index 53c31618e..15cf5d886 100644 --- a/tgt-vvp/vvp_priv.h +++ b/tgt-vvp/vvp_priv.h @@ -39,6 +39,12 @@ struct vector_info { unsigned wid; }; +/* + * Convenient constants... + */ + /* Width limit for typical immediate arguments. */ +# define IMM_WID 32 + /* * Mangle all non-symbol characters in an identifier, quotes in names */ diff --git a/vvp/codes.h b/vvp/codes.h index 0e93f72de..0ac52e0c6 100644 --- a/vvp/codes.h +++ b/vvp/codes.h @@ -37,6 +37,7 @@ extern bool of_ADD(vthread_t thr, vvp_code_t code); extern bool of_ADD_WR(vthread_t thr, vvp_code_t code); extern bool of_ADDI(vthread_t thr, vvp_code_t code); extern bool of_AND(vthread_t thr, vvp_code_t code); +extern bool of_ANDI(vthread_t thr, vvp_code_t code); extern bool of_ANDR(vthread_t thr, vvp_code_t code); extern bool of_ASSIGN_AV(vthread_t thr, vvp_code_t code); extern bool of_ASSIGN_AVD(vthread_t thr, vvp_code_t code); @@ -167,7 +168,7 @@ struct vvp_code_s { }; union { - unsigned bit_idx[2]; + uint32_t bit_idx[2]; vvp_net_t *net2; vvp_code_t cptr2; struct ufunc_core*ufunc_core_ptr; diff --git a/vvp/compile.cc b/vvp/compile.cc index d2e1c96d8..4ad8bf17d 100644 --- a/vvp/compile.cc +++ b/vvp/compile.cc @@ -85,6 +85,7 @@ const static struct opcode_table_s opcode_table[] = { { "%addi", of_ADDI, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, { "%and", of_AND, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, { "%and/r", of_ANDR, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, + { "%andi", of_ANDI, 3, {OA_BIT1, OA_BIT2, OA_NUMBER} }, { "%assign/av",of_ASSIGN_AV,3,{OA_ARR_PTR,OA_BIT1, OA_BIT2} }, { "%assign/av/d",of_ASSIGN_AVD,3,{OA_ARR_PTR,OA_BIT1, OA_BIT2} }, { "%assign/v0",of_ASSIGN_V0,3,{OA_FUNC_PTR,OA_BIT1, OA_BIT2} }, diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt index 84941815c..59bf80097 100644 --- a/vvp/opcodes.txt +++ b/vvp/opcodes.txt @@ -531,7 +531,8 @@ is one of the 4 constant bits, the effect is to replicate the value into the destination vector. This is useful for filling a vector. The %movi variant moves a binary value, LSB first, into the -destination vector. +destination vector. The immediate value is up to 32bits, padded with +zeros to fillout the width. 
* %mul , , diff --git a/vvp/vthread.cc b/vvp/vthread.cc index 049995d17..442fc0be6 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -533,6 +533,28 @@ bool of_AND(vthread_t thr, vvp_code_t cp) } +bool of_ANDI(vthread_t thr, vvp_code_t cp) +{ + unsigned idx1 = cp->bit_idx[0]; + unsigned long imm = cp->bit_idx[1]; + unsigned wid = cp->number; + + assert(idx1 >= 4); + + vvp_vector4_t val = vthread_bits_to_vector(thr, idx1, wid); + vvp_vector4_t imv (wid, BIT4_0); + + unsigned trans = wid; + if (trans > CPU_WORD_BITS) + trans = CPU_WORD_BITS; + imv.setarray(0, trans, &imm); + + val &= imv; + + thr->bits4.set_vec(idx1, val); + return true; +} + bool of_ADD(vthread_t thr, vvp_code_t cp) { assert(cp->bit_idx[0] >= 4); @@ -2950,13 +2972,24 @@ bool of_MOV_WR(vthread_t thr, vvp_code_t cp) bool of_MOVI(vthread_t thr, vvp_code_t cp) { unsigned dst = cp->bit_idx[0]; - unsigned val = cp->bit_idx[1]; + static unsigned long val[8] = {0, 0, 0, 0, 0, 0, 0, 0}; unsigned wid = cp->number; thr_check_addr(thr, dst+wid-1); - for (unsigned idx = 0 ; idx < wid ; idx += 1, val >>= 1) - thr->bits4.set_bit(dst+idx, (val&1)? BIT4_1 : BIT4_0); + val[0] = cp->bit_idx[1]; + + while (wid > 0) { + unsigned trans = wid; + if (trans > 8*CPU_WORD_BITS) + trans = 8*CPU_WORD_BITS; + + thr->bits4.setarray(dst, trans, val); + + val[0] = 0; + wid -= trans; + dst += trans; + } return true; } From b5e9e44e0709ac040feb6611dd692cf73ff31e0d Mon Sep 17 00:00:00 2001 From: Cary R Date: Tue, 27 May 2008 18:11:31 -0700 Subject: [PATCH 06/16] Fix error in of_SUBI with wide results. This patch fixes an error in the recent rework of of_SUBI. It was doing a double bit inversion. --- vvp/vthread.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vvp/vthread.cc b/vvp/vthread.cc index 442fc0be6..5f82644ba 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -3770,7 +3770,7 @@ bool of_SUBI(vthread_t thr, vvp_code_t cp) carry = 1; for (unsigned idx = 0 ; idx < word_count ; idx += 1) { lva[idx] = add_with_carry(lva[idx], ~imm, carry); - imm = ~0UL; + imm = 0UL; } /* We know from the vector_to_array that the address is valid From 2179797763556e2ad3b5c15a07aa5940a82a6a9a Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Tue, 27 May 2008 19:48:31 -0700 Subject: [PATCH 07/16] Do not allow unknows to be handled as logic immediate. --- tgt-vvp/eval_expr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c index 4753bd0d5..e1fb2013c 100644 --- a/tgt-vvp/eval_expr.c +++ b/tgt-vvp/eval_expr.c @@ -940,9 +940,9 @@ static struct vector_info draw_binary_expr_logic(ivl_expr_t exp, ivl_expr_t re = ivl_expr_oper2(exp); if (ivl_expr_opcode(exp) == '&') { - if (number_is_immediate(re, IMM_WID)) + if (number_is_immediate(re, IMM_WID) && !number_is_unknown(re)) return draw_logic_immediate(exp, le, re, wid); - if (number_is_immediate(le, IMM_WID)) + if (number_is_immediate(le, IMM_WID) && !number_is_unknown(le)) return draw_logic_immediate(exp, re, le, wid); } From 9aa610f489ae0e22bc87952d404ae334b1e68573 Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Wed, 28 May 2008 08:47:28 -0700 Subject: [PATCH 08/16] Fix handling of 32bit IMM in %addi on 32bit machines The handling of immediate add used to do 16bits at a time. When it went up to 32bits, the need to work in chunks vanished, but the chunk handling was still there, this time shifting by 32, which causes problems on 32bit machines. Simplify the %addi handling to avoid this. 
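[Editorial note, not part of the patch] The underlying hazard is the usual
C shift rule: shifting a value by its full width is undefined behaviour,
so the old chunk loop's "imm >>= IMM_WID" is not guaranteed to clear imm
once IMM_WID is 32 and unsigned long is a 32-bit type.  A minimal
illustration, assuming a 32-bit host:

    unsigned long imm = 0x12345678UL;
    /* Undefined when unsigned long is 32 bits wide (C99 6.5.7); on x86
       the shift count is taken mod 32, so imm is typically left
       unchanged rather than becoming 0, and the loop keeps emitting
       %addi instructions. */
    imm >>= 32;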
--- tgt-vvp/eval_expr.c | 74 +++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 22 deletions(-) diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c index e1fb2013c..3bcbf2717 100644 --- a/tgt-vvp/eval_expr.c +++ b/tgt-vvp/eval_expr.c @@ -1197,23 +1197,50 @@ static struct vector_info draw_add_immediate(ivl_expr_t le, imm = get_number_immediate(re); - /* Now generate enough %addi instructions to add the entire - immediate value to the destination. The adds are done IMM_WID - bits at a time, but +1 bits are done to push the carry into - the higher bits if needed. */ - { unsigned base; - for (base = 0 ; base < lv.wid ; base += IMM_WID) { - unsigned long tmp = imm & 0xffffffffUL; - unsigned add_wid = lv.wid - base; + /* This shouldn't generally happen, because the elaborator + should take care of simple constant propagation like this, + but it doesn't have to and it is easy to catch here. */ + if (imm == 0) + return lv; - imm >>= IMM_WID; + switch (lv.base) { + case 0: /* Left expression is 0. */ + lv.base = allocate_vector(wid); + if (lv.base == 0) { + fprintf(stderr, "%s:%u: vvp.tgt error: " + "Unable to allocate %u thread bits " + "for result of addition.\n", + ivl_expr_file(re), ivl_expr_lineno(re), wid); + vvp_errors += 1; + } + fprintf(vvp_out, " %%movi %u, %lu %u;\n", lv.base, imm, wid); + break; - fprintf(vvp_out, " %%addi %u, %lu, %u;\n", - lv.base+base, tmp, add_wid); + case 1: /* Left expression is 1...1 (i.e. -1) */ + imm -= 1; + if (imm == 0) { + lv.base = 0; + } else { + lv.base = allocate_vector(wid); + if (lv.base == 0) { + fprintf(stderr, "%s:%u: vvp.tgt error: " + "Unable to allocate %u thread bits " + "for result of addition.\n", + ivl_expr_file(re), ivl_expr_lineno(re), wid); + vvp_errors += 1; + } + fprintf(vvp_out, " %%movi %u, %lu %u;\n", lv.base, imm, wid); + } + break; - if (imm == 0) - break; - } + case 2: /* Left expression is X or Z */ + case 3: + lv.base = 2; + break; + + default: /* The regular case. */ + fprintf(vvp_out, " %%addi %u, %lu, %u;\n", lv.base, imm, wid); + break; } return lv; @@ -1234,7 +1261,8 @@ static struct vector_info draw_sub_immediate(ivl_expr_t le, assert(lv.wid == wid); imm = get_number_immediate(re); - assert( (imm & ~0xffffffffUL) == 0 ); + if (imm == 0) + return lv; switch (lv.base) { case 0: @@ -1248,21 +1276,21 @@ static struct vector_info draw_sub_immediate(ivl_expr_t le, vvp_errors += 1; } - fprintf(vvp_out, " %%mov %u, %u, %u;\n", tmp, lv.base, wid); + fprintf(vvp_out, " %%mov %u, %u, %u;\n", tmp, lv.base, wid); lv.base = tmp; - fprintf(vvp_out, " %%subi %u, %lu, %u;\n", lv.base, imm, wid); - return lv; + fprintf(vvp_out, " %%subi %u, %lu, %u;\n", lv.base, imm, wid); + break; case 2: case 3: lv.base = 2; - return lv; + break; default: - fprintf(vvp_out, " %%subi %u, %lu, %u;\n", lv.base, imm, wid); + fprintf(vvp_out, " %%subi %u, %lu, %u;\n", lv.base, imm, wid); + break; } - return lv; } @@ -1277,8 +1305,10 @@ static struct vector_info draw_mul_immediate(ivl_expr_t le, assert(lv.wid == wid); imm = get_number_immediate(re); + if (imm == 0) + return lv; - fprintf(vvp_out, " %%muli %u, %lu, %u;\n", lv.base, imm, lv.wid); + fprintf(vvp_out, " %%muli %u, %lu, %u;\n", lv.base, imm, lv.wid); return lv; } From 653e2661b2202a2acea91fb2f34256d2f938da72 Mon Sep 17 00:00:00 2001 From: Cary R Date: Fri, 23 May 2008 10:01:54 -0700 Subject: [PATCH 09/16] The no git code also check for an existing version.h My previous patch always used an empty file if git was not available. 
This patch extends this to use the existing version.h file if it exists (snapshot, etc.) --- Makefile.in | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Makefile.in b/Makefile.in index 3df9bb21b..6bde10e07 100644 --- a/Makefile.in +++ b/Makefile.in @@ -188,7 +188,13 @@ iverilog-vpi.pdf: iverilog-vpi.ps .PHONY: version.h version.h: ifeq ($(GIT),none) - @echo '#define VERSION_TAG ""' > $@; + @if test -r $(srcdir)/$@; then \ + echo "Using $(srcdir)/$@ for VERSION_TAG"; \ + diff $(srcdir)/$@ $@ > /dev/null 2>&1 || cp $(srcdir)/$@ $@; \ + else \ + echo "Using empty VERSION_TAG"; \ + echo '#define VERSION_TAG ""' > $@; \ + fi else @if test -d $(srcdir)/.git; then \ echo "Using git-describe for VERSION_TAG"; \ From b2bdce98abc16c6301cdfcd348dfa7a8711d49f1 Mon Sep 17 00:00:00 2001 From: Cary R Date: Mon, 26 May 2008 11:14:59 -0700 Subject: [PATCH 10/16] It is an error for the concatenation repeat to be undefined. Print an error message if the concatenation repeat expression is undefined. --- elab_expr.cc | 9 +++++++++ elab_net.cc | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/elab_expr.cc b/elab_expr.cc index 90c1937fc..d452b31e4 100644 --- a/elab_expr.cc +++ b/elab_expr.cc @@ -716,6 +716,15 @@ NetExpr* PEConcat::elaborate_expr(Design*des, NetScope*scope, des->errors += 1; } + if (!rep->value().is_defined()) { + cerr << get_fileline() << ": error: Concatenation repeat " + << "may not be undefined (" << rep->value() + << ")." << endl; + des->errors += 1; + concat_depth -= 1; + return 0; + } + if (rep->value().is_negative()) { cerr << get_fileline() << ": error: Concatenation repeat " << "may not be negative (" << rep->value().as_long() diff --git a/elab_net.cc b/elab_net.cc index 97fcb76cd..2e9224483 100644 --- a/elab_net.cc +++ b/elab_net.cc @@ -1578,6 +1578,14 @@ NetNet* PEConcat::elaborate_net(Design*des, NetScope*scope, return 0; } + if (!erep->value().is_defined()) { + cerr << get_fileline() << ": error: Concatenation repeat " + << "may not be undefined (" << erep->value() + << ")." << endl; + des->errors += 1; + return 0; + } + if (erep->value().is_negative()) { cerr << get_fileline() << ": error: Concatenation repeat " << "may not be negative (" << erep->value().as_long() From 7a1180868add9cae9328a4e2e933af9e155b68c7 Mon Sep 17 00:00:00 2001 From: Cary R Date: Mon, 26 May 2008 13:57:10 -0700 Subject: [PATCH 11/16] Remove definition for non-existent routines. ivl_switch_scope, ivl_switch_attr_cnt and ivl_switch_attr_val are non-existent routines and should not be in ivl.def. I also removed them from ivl_target.h. Cygwin expects that if a routine is listed in ivl.def that it will find a real implementation. --- ivl.def | 3 --- ivl_target.h | 7 ------- 2 files changed, 10 deletions(-) diff --git a/ivl.def b/ivl.def index 48b798910..16989795a 100644 --- a/ivl.def +++ b/ivl.def @@ -222,10 +222,7 @@ ivl_switch_a ivl_switch_b ivl_switch_basename ivl_switch_enable -ivl_switch_scope ivl_switch_type -ivl_switch_attr_cnt; -ivl_switch_attr_val; ivl_udp_init ivl_udp_name diff --git a/ivl_target.h b/ivl_target.h index a39366a18..13477eb62 100644 --- a/ivl_target.h +++ b/ivl_target.h @@ -1854,9 +1854,6 @@ extern ivl_statement_t ivl_stmt_sub_stmt(ivl_statement_t net); * ivl_switch_basename * This is the name given to the device in the source code. * - * ivl_switch_scope - * The scope where the switch device appears. - * * ivl_switch_a * ivl_switch_b * The a and b ports are the two ports of the switch. 
@@ -1871,14 +1868,10 @@ extern ivl_statement_t ivl_stmt_sub_stmt(ivl_statement_t net); */ extern ivl_switch_type_t ivl_switch_type(ivl_switch_t net); extern const char*ivl_switch_basename(ivl_switch_t net); -extern ivl_scope_t ivl_switch_scope(ivl_switch_t net); extern ivl_nexus_t ivl_switch_a(ivl_switch_t net); extern ivl_nexus_t ivl_switch_b(ivl_switch_t net); extern ivl_nexus_t ivl_switch_enable(ivl_switch_t net); -extern unsigned ivl_switch_attr_cnt(ivl_switch_t net); -extern ivl_attribute_t ivl_switch_attr_val(ivl_switch_t net, unsigned idx); - #if defined(__MINGW32__) || defined (__CYGWIN32__) # define DLLEXPORT __declspec(dllexport) #else From f04fb0fc450b460ad8a52beb0f00db7f16228fc1 Mon Sep 17 00:00:00 2001 From: Cary R Date: Tue, 27 May 2008 15:11:17 -0700 Subject: [PATCH 12/16] System functions have a default return value. If a system function does not call vpi_put_value it is supposed to have a default return value of 0. This patch adds this functionality. --- vvp/vpi_priv.h | 1 + vvp/vpi_tasks.cc | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/vvp/vpi_priv.h b/vvp/vpi_priv.h index fc43caeff..ab4e11658 100644 --- a/vvp/vpi_priv.h +++ b/vvp/vpi_priv.h @@ -368,6 +368,7 @@ struct __vpiSysTaskCall { class vvp_net_t*fnet; unsigned file_idx; unsigned lineno; + bool put_value; }; extern struct __vpiSysTaskCall*vpip_cur_task; diff --git a/vvp/vpi_tasks.cc b/vvp/vpi_tasks.cc index f82a43c41..fdad4ffc3 100644 --- a/vvp/vpi_tasks.cc +++ b/vvp/vpi_tasks.cc @@ -167,6 +167,8 @@ static vpiHandle sysfunc_put_value(vpiHandle ref, p_vpi_value vp, int) struct __vpiSysTaskCall*rfp = (struct __vpiSysTaskCall*)ref; + rfp->put_value = true; + assert(rfp->vbit >= 4); switch (vp->format) { @@ -271,6 +273,8 @@ static vpiHandle sysfunc_put_real_value(vpiHandle ref, p_vpi_value vp, int) struct __vpiSysTaskCall*rfp = (struct __vpiSysTaskCall*)ref; + rfp->put_value = true; + /* Make sure this is a real valued function. */ assert(rfp->vwid == -vpiRealConst); @@ -297,6 +301,8 @@ static vpiHandle sysfunc_put_4net_value(vpiHandle ref, p_vpi_value vp, int) struct __vpiSysTaskCall*rfp = (struct __vpiSysTaskCall*)ref; + rfp->put_value = true; + unsigned vwid = (unsigned) rfp->vwid; vvp_vector4_t val (vwid); @@ -384,8 +390,10 @@ static vpiHandle sysfunc_put_rnet_value(vpiHandle ref, p_vpi_value vp, int) assert(ref->vpi_type->type_code == vpiSysFuncCall); struct __vpiSysTaskCall*rfp = (struct __vpiSysTaskCall*)ref; - double val; + rfp->put_value = true; + + double val; switch (vp->format) { case vpiRealVal: @@ -563,6 +571,7 @@ vpiHandle vpip_build_vpi_call(const char*name, unsigned vbit, int vwid, obj->file_idx = (unsigned) file_idx; obj->lineno = (unsigned) lineno; obj->userdata = 0; + obj->put_value = false; compile_compiletf(obj); @@ -590,8 +599,23 @@ void vpip_execute_vpi_call(vthread_t thr, vpiHandle ref) if (vpip_cur_task->defn->info.calltf) { assert(vpi_mode_flag == VPI_MODE_NONE); vpi_mode_flag = VPI_MODE_CALLTF; + vpip_cur_task->put_value = false; vpip_cur_task->defn->info.calltf(vpip_cur_task->defn->info.user_data); vpi_mode_flag = VPI_MODE_NONE; + /* If the function call did not set a value then put a + * default value (0). 
*/ + if (ref->vpi_type->type_code == vpiSysFuncCall && + !vpip_cur_task->put_value) { + s_vpi_value val; + if (vpip_cur_task->vwid == -vpiRealConst) { + val.format = vpiRealVal; + val.value.real = 0.0; + } else { + val.format = vpiIntVal; + val.value.integer = 0; + } + vpi_put_value(ref, &val, 0, vpiNoDelay); + } } } From 2fab3159dda33872d9e306913cc7d5ad03b85bbe Mon Sep 17 00:00:00 2001 From: Cary R Date: Tue, 27 May 2008 14:29:08 -0700 Subject: [PATCH 13/16] Add smart part select for system functions &PV<>. This patch adds a smart part select that allows system functions to have full access to the real bits of the part select. --- tgt-vvp/draw_vpi.c | 109 +++++ vpi/sys_display.c | 2 + vpi_user.h | 6 +- vvp/lexor.lex | 8 +- vvp/parse.y | 9 +- vvp/vpi_priv.h | 14 + vvp/vpi_signal.cc | 876 ++++++++++++++++++++++++++++------------- vvp/vpi_vthr_vector.cc | 2 +- 8 files changed, 753 insertions(+), 273 deletions(-) diff --git a/tgt-vvp/draw_vpi.c b/tgt-vvp/draw_vpi.c index ab290867c..2930c6a21 100644 --- a/tgt-vvp/draw_vpi.c +++ b/tgt-vvp/draw_vpi.c @@ -29,6 +29,82 @@ #define snprintf _snprintf #endif +/* + * Check to see if the expression (number) can be correctly represented + * with a long variable. + */ +static int is_constant_number(ivl_expr_t ex) +{ + /* Make sure this matches the return type of constant_number(). */ + unsigned lim_wid = 8*sizeof(long); + const char*bits; + char pad_bit = '0'; + unsigned idx; + unsigned nbits = ivl_expr_width(ex); + + if (ivl_expr_type(ex) != IVL_EX_NUMBER + && ivl_expr_type(ex) != IVL_EX_ULONG) + return 0; + + bits = ivl_expr_bits(ex); + + /* For unsigned values the effective MSB and on must be '0'. */ + if (!ivl_expr_signed(ex)) lim_wid -= 1; + + /* For negative values the pad bit is '1'. */ + if (ivl_expr_signed(ex) && bits[nbits-1]=='1') { + pad_bit = '1'; + } + + /* For the number to fit in the variable all the upper bits must + * match the pad bits. */ + for (idx = lim_wid ; idx < nbits ; idx += 1) { + if (bits[idx] != pad_bit) return 0; + } + + return 1; +} + +/* + * Convert the expression (number) to a long value. + */ +static long get_constant_number(ivl_expr_t ex) +{ + long rtn = 0; + + switch (ivl_expr_type(ex)) { + case IVL_EX_ULONG: + rtn = (signed)ivl_expr_value(ex); + break; + case IVL_EX_NUMBER: { + unsigned idx; + const char*bits = ivl_expr_bits(ex); + unsigned nbits = ivl_expr_width(ex); + char pad_bit = bits[nbits-1]; + /* Define all the bits in the long (negative numbers). */ + for (idx = 0 ; idx < 8*sizeof(long) ; idx += 1) { + char bit; + if (idx < nbits) bit = bits[idx]; + else bit = pad_bit; + switch (bit) { + case '0': + break; + case '1': + rtn |= 1 << idx; + break; + default: + assert(0); + } + } + break; + } + default: + assert(0); + } + + return rtn; +} + static const char* magic_sfuncs[] = { "$time", "$stime", @@ -217,6 +293,39 @@ static void draw_vpi_taskfunc_args(const char*call_string, continue; } + case IVL_EX_SELECT: { + ivl_expr_t vexpr = ivl_expr_oper1(expr); + assert(vexpr); + + /* This code is only for signals. */ + if (ivl_expr_type(vexpr) != IVL_EX_SIGNAL) break; + + /* The signal is part of an array. */ + /* Add &APV<> code here when it is finished. */ + if (ivl_expr_oper1(vexpr)) break; + + ivl_expr_t bexpr = ivl_expr_oper2(expr); + assert(bexpr); + + /* This is a constant bit/part select. */ + if (is_constant_number(bexpr)) { + snprintf(buffer, sizeof buffer, "&PV", + ivl_expr_signal(vexpr), + get_constant_number(bexpr), + ivl_expr_width(expr)); + /* This is an indexed bit/part select. 
*/ + } else { + struct vector_info rv; + rv = draw_eval_expr(bexpr, STUFF_OK_XZ); + snprintf(buffer, sizeof buffer, "&PV", + ivl_expr_signal(vexpr), + rv.base, rv.wid, + ivl_expr_width(expr)); + } + args[idx].text = strdup(buffer); + continue; + } + /* Everything else will need to be evaluated and passed as a constant to the vpi task. */ default: diff --git a/vpi/sys_display.c b/vpi/sys_display.c index c291d00e3..8b559fb15 100644 --- a/vpi/sys_display.c +++ b/vpi/sys_display.c @@ -827,6 +827,7 @@ static void do_display(unsigned int mcd, struct strobe_cb_info*info) case vpiReg: case vpiIntegerVar: case vpiMemoryWord: + case vpiPartSelect: do_display_numeric(mcd, info, item); break; @@ -1836,6 +1837,7 @@ static char *get_display(unsigned int *rtnsz, struct strobe_cb_info *info) case vpiReg: case vpiIntegerVar: case vpiMemoryWord: + case vpiPartSelect: width = get_numeric(&result, info, item); rtn = realloc(rtn, (size+width)*sizeof(char)); memcpy(rtn+size-1, result, width); diff --git a/vpi_user.h b/vpi_user.h index 0087b52b5..6aba54f7f 100644 --- a/vpi_user.h +++ b/vpi_user.h @@ -279,6 +279,7 @@ typedef struct t_vpi_delay { #define vpiNamedFork 35 #define vpiNet 36 #define vpiParameter 41 +#define vpiPartSelect 42 #define vpiPathTerm 43 #define vpiRealVar 47 #define vpiReg 48 @@ -297,6 +298,7 @@ typedef struct t_vpi_delay { #define vpiModPathIn 95 #define vpiModPathOut 96 #define vpiVariables 100 +#define vpiExpr 102 #define vpiCallback 1000 @@ -346,8 +348,8 @@ typedef struct t_vpi_delay { # define vpiSysFuncReal vpiRealFunc # define vpiSysFuncTime vpiTimeFunc # define vpiSysFuncSized vpiSizedFunc -#define vpiSigned 65 -#define vpiExpr 102 +#define vpiConstantSelect 53 +#define vpiSigned 65 /* IVL private properties */ #define _vpiNexusId 0x1000000 diff --git a/vvp/lexor.lex b/vvp/lexor.lex index c3488bb72..0a266b389 100644 --- a/vvp/lexor.lex +++ b/vvp/lexor.lex @@ -173,6 +173,11 @@ "%disable" { return K_disable; } "%fork" { return K_fork; } + /* Handle the specialized variable access functions. */ + +"&A" { return K_A; } +"&PV" { return K_PV; } + "%"[.$_/a-zA-Z0-9]+ { yylval.text = strdup(yytext); assert(yylval.text); @@ -186,9 +191,6 @@ yylval.numb = strtol(yytext, 0, 0); return T_NUMBER; } - -"&A" { return K_A; } - /* Handle some specialized constant/literals as symbols. */ "C4<"[01xz]*">" { diff --git a/vvp/parse.y b/vvp/parse.y index dd8a2b22d..729a0158d 100644 --- a/vvp/parse.y +++ b/vvp/parse.y @@ -77,7 +77,7 @@ static struct __vpiModPath*modpath_dst = 0; %token K_EVENT K_EVENT_OR K_EXTEND_S K_FUNCTOR K_MODPATH K_NET K_NET_S K_NET_R %token K_NET8 K_NET8_S %token K_PARAM_STR K_PARAM_L K_PARAM_REAL K_PART K_PART_PV -%token K_PART_V K_REDUCE_AND K_REDUCE_OR K_REDUCE_XOR +%token K_PART_V K_PV K_REDUCE_AND K_REDUCE_OR K_REDUCE_XOR %token K_REDUCE_NAND K_REDUCE_NOR K_REDUCE_XNOR K_REPEAT %token K_RESOLV K_SCOPE K_SFUNC K_SHIFTL K_SHIFTR K_SHIFTRS %token K_THREAD K_TIMESCALE K_UFUNC @@ -790,9 +790,14 @@ argument } | K_A '<' T_SYMBOL ',' T_NUMBER '>' { $$ = vpip_make_vthr_A($3, $5); } + | K_PV '<' T_SYMBOL ',' T_NUMBER ',' T_NUMBER '>' + { $$ = vpip_make_PV($3, $5, $7); } + | K_PV '<' T_SYMBOL ',' '-' T_NUMBER ',' T_NUMBER '>' + { $$ = vpip_make_PV($3, -$6, $8); } + | K_PV '<' T_SYMBOL ',' T_NUMBER T_NUMBER ',' T_NUMBER '>' + { $$ = vpip_make_PV($3, $5, $6, $8); } ; - /* functor operands can only be a list of symbols. 
*/ symbols : symbol diff --git a/vvp/vpi_priv.h b/vvp/vpi_priv.h index ab4e11658..be3f42bef 100644 --- a/vvp/vpi_priv.h +++ b/vvp/vpi_priv.h @@ -220,6 +220,20 @@ extern vpiHandle vpip_make_reg(const char*name, int msb, int lsb, extern vpiHandle vpip_make_net(const char*name, int msb, int lsb, bool signed_flag, vvp_net_t*node); +/* + * This is used by system calls to represent a bit/part select of + * a simple variable or constant array word. + */ +struct __vpiPV { + struct __vpiHandle base; + vpiHandle parent; + vvp_net_t*net; + int tbase; + unsigned twid, width; +}; +extern vpiHandle vpip_make_PV(char*name, int base, int width); +extern vpiHandle vpip_make_PV(char*name, int tbase, int twid, int width); + /* * This function safely converts a vpiHandle back to a * __vpiSignal. Return a nil if the type is not appropriate. diff --git a/vvp/vpi_signal.cc b/vvp/vpi_signal.cc index 7f09be83f..f1efa7f36 100644 --- a/vvp/vpi_signal.cc +++ b/vvp/vpi_signal.cc @@ -48,7 +48,7 @@ * draw_tt.c program. */ extern const char hex_digits[256]; -extern const char oct_digits[256]; +extern const char oct_digits[64]; /* * The string values need a result buf to hold the results. This @@ -109,6 +109,385 @@ char *generic_get_str(int code, vpiHandle ref, const char *name, const char *ind return res; } +/* + * The standard formating/conversion routines. + * They work with full or partial signals. + */ + +static void format_vpiBinStrVal(vvp_fun_signal_vec*sig, int base, unsigned wid, + s_vpi_value*vp) +{ + char *rbuf = need_result_buf(wid+1, RBUF_VAL); + long offset = wid - 1 + base; + long end = base + (signed)wid; + long ssize = (signed)sig->size(); + + for (long idx = base ; idx < end ; idx += 1) { + if (idx < 0 || idx >= ssize) { + rbuf[offset-idx] = 'x'; + } else { + rbuf[offset-idx] = vvp_bit4_to_ascii(sig->value(idx)); + } + } + rbuf[wid] = 0; + + vp->value.str = rbuf; +} + +static void format_vpiOctStrVal(vvp_fun_signal_vec*sig, int base, unsigned wid, + s_vpi_value*vp) +{ + unsigned dwid = (wid + 2) / 3; + char *rbuf = need_result_buf(dwid+1, RBUF_VAL); + long end = base + (signed)wid; + long ssize = (signed)sig->size(); + unsigned val = 0; + + rbuf[dwid] = 0; + for (long idx = base ; idx < end ; idx += 1) { + unsigned bit = 0; + if (idx < 0 || idx >= ssize) { + bit = 2; // BIT4_X + } else { + switch (sig->value(idx)) { + case BIT4_0: + bit = 0; + break; + case BIT4_1: + bit = 1; + break; + case BIT4_X: + bit = 2; + break; + case BIT4_Z: + bit = 3; + break; + } + } + val |= bit << 2*((idx-base) % 3); + + if ((idx-base) % 3 == 2) { + dwid -= 1; + rbuf[dwid] = oct_digits[val]; + val = 0; + } + } + + /* Fill in X or Z if they are the only thing in the value. 
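+       * When the leftover top digit holds nothing but x bits, widen its code to 42 (x in every bit pair) so oct_digits[] prints 'x'; all z bits widen to 63 and print 'z'. A 4-bit all-x value, for example, prints as "xx".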
*/ + switch (wid % 3) { + case 1: + if (val == 2) val = 42; + else if (val == 3) val = 63; + break; + case 2: + if (val == 10) val = 42; + else if (val == 15) val = 63; + break; + } + + if (dwid > 0) rbuf[0] = oct_digits[val]; + + vp->value.str = rbuf; +} + +static void format_vpiHexStrVal(vvp_fun_signal_vec*sig, int base, unsigned wid, + s_vpi_value*vp) +{ + unsigned dwid = (wid + 3) / 4; + char *rbuf = need_result_buf(dwid+1, RBUF_VAL); + long end = base + (signed)wid; + long ssize = (signed)sig->size(); + unsigned val = 0; + + rbuf[dwid] = 0; + for (long idx = base ; idx < end ; idx += 1) { + unsigned bit = 0; + if (idx < 0 || idx >= ssize) { + bit = 2; // BIT4_X + } else { + switch (sig->value(idx)) { + case BIT4_0: + bit = 0; + break; + case BIT4_1: + bit = 1; + break; + case BIT4_X: + bit = 2; + break; + case BIT4_Z: + bit = 3; + break; + } + } + val |= bit << 2*((idx-base) % 4); + + if ((idx-base) % 4 == 3) { + dwid -= 1; + rbuf[dwid] = hex_digits[val]; + val = 0; + } + } + + /* Fill in X or Z if they are the only thing in the value. */ + switch (wid % 4) { + case 1: + if (val == 2) val = 170; + else if (val == 3) val = 255; + break; + case 2: + if (val == 10) val = 170; + else if (val == 15) val = 255; + break; + case 3: + if (val == 42) val = 170; + else if (val == 63) val = 255; + break; + } + + if (dwid > 0) rbuf[0] = hex_digits[val]; + + vp->value.str = rbuf; +} + +static void format_vpiDecStrVal(vvp_fun_signal_vec*sig, int base, unsigned wid, + int signed_flag, s_vpi_value*vp) +{ + unsigned hwid = (sig->size()+2) / 3 + 1; + char *rbuf = need_result_buf(hwid, RBUF_VAL); + long ssize = (signed)sig->size(); + long end = base + (signed)wid; + + /* Do we have an end outside of the real signal vector. */ + if (base < 0 || end > ssize) { + bool all_x = true; + if (end > ssize) end = ssize; + if (base < 0) base = 0; + for (long idx = base ; idx < end ; idx += 1) { + if (sig->value(idx) != BIT4_X) { + all_x = false; + break; + } + } + + if (all_x) { + rbuf[0] = 'x'; + } else { + rbuf[0] = 'X'; + } + rbuf[1] = 0; + + vp->value.str = rbuf; + return; + } + + vvp_vector4_t vec4; + if (base == 0 && end == ssize) { + vec4 = sig->vec4_value(); + } else { + vec4 = sig->vec4_value().subvalue(base, wid); + } + + vpip_vec4_to_dec_str(vec4, rbuf, hwid, signed_flag); + + vp->value.str = rbuf; +} + +static void format_vpiIntVal(vvp_fun_signal_vec*sig, int base, unsigned wid, + s_vpi_value*vp) +{ + unsigned iwid = 8 * sizeof(vp->value.integer); + long ssize = (signed)sig->size(); + + if (wid > iwid) wid = iwid; + long end = base + (signed)wid; + if (end > ssize) end = ssize; + + vp->value.integer = 0; + for (long idx = (base < 0) ? 0 : base ; idx < end ; idx += 1) { + if (sig->value(idx) == BIT4_1) { + vp->value.integer |= 1<<(idx-base); + } + } +} + +static void format_vpiRealVal(vvp_fun_signal_vec*sig, int base, unsigned wid, + int signed_flag, s_vpi_value*vp) +{ + vvp_vector4_t vec4(wid); + long ssize = (signed)sig->size(); + long end = base + (signed)wid; + if (end > ssize) end = ssize; + + for (long idx = (base < 0) ? 0 : base ; idx < end ; idx += 1) { + vec4.set_bit(idx-base, sig->value(idx)); + } + + vp->value.real = 0.0; + vector4_to_value(vec4, vp->value.real, signed_flag); +} + +static void format_vpiStringVal(vvp_fun_signal_vec*sig, int base, unsigned wid, + s_vpi_value*vp) +{ + /* The result will use a character for each 8 bits of the + vector. Add one extra character for the highest bits that + don't form an 8 bit group. 
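+         Bits outside the signal read as zero, so any out-of-range part of the select contributes only null characters, which are skipped at the front or turned into spaces below.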
*/ + char *rbuf = need_result_buf(wid/8 + ((wid&7)!=0) + 1, RBUF_VAL); + char *cp = rbuf; + + char tmp = 0; + for (long idx = base+(signed)wid-1; idx >= base; idx -= 1) { + tmp <<= 1; + + if (idx >=0 && idx < (signed)sig->size() && + sig->value(idx) == BIT4_1) { + tmp |= 1; + } + + if (((idx-base)&7)==0){ + /* Skip leading nulls. */ + if (tmp == 0 && cp == rbuf) + continue; + + /* Nulls in the middle get turned into spaces. */ + *cp++ = tmp ? tmp : ' '; + tmp = 0; + } + } + *cp++ = 0; + + vp->value.str = rbuf; +} + +static void format_vpiScalarVal(vvp_fun_signal_vec*sig, int base, + s_vpi_value*vp) +{ + if (base >= 0 && base < (signed)sig->size()) { + switch (sig->value(base)) { + case BIT4_0: + vp->value.scalar = vpi0; + break; + case BIT4_1: + vp->value.scalar = vpi1; + break; + case BIT4_X: { + vvp_scalar_t strn = sig->scalar_value(base); + if (strn.strength0() == 1) vp->value.scalar = vpiH; + else if (strn.strength1() == 1) vp->value.scalar = vpiL; + else vp->value.scalar = vpiX; + break; + } + case BIT4_Z: + vp->value.scalar = vpiZ; + break; + } + } else { + vp->value.scalar = vpiX; + } +} + +static void format_vpiStrengthVal(vvp_fun_signal_vec*sig, int base, + unsigned wid, s_vpi_value*vp) +{ + long end = base + (signed)wid; + s_vpi_strengthval*op; + + op = (s_vpi_strengthval*) + need_result_buf(wid * sizeof(s_vpi_strengthval), RBUF_VAL); + + for (long idx = base ; idx < end ; idx += 1) { + if (idx >=0 && idx < (signed)sig->size()) { + vvp_scalar_t val = sig->scalar_value(idx); + + /* vvp_scalar_t strengths are 0-7, but the vpi strength + is bit0-bit7. This gets the vpi form of the strengths + from the vvp_scalar_t strengths. */ + unsigned s0 = 1 << val.strength0(); + unsigned s1 = 1 << val.strength1(); + + switch (val.value()) { + case BIT4_0: + op[idx-base].logic = vpi0; + op[idx-base].s0 = s0|s1; + op[idx-base].s1 = 0; + break; + + case BIT4_1: + op[idx-base].logic = vpi1; + op[idx-base].s0 = 0; + op[idx-base].s1 = s0|s1; + break; + + case BIT4_X: + op[idx-base].logic = vpiX; + op[idx-base].s0 = s0; + op[idx-base].s1 = s1; + break; + + case BIT4_Z: + op[idx-base].logic = vpiZ; + op[idx-base].s0 = vpiHiZ; + op[idx-base].s1 = vpiHiZ; + break; + } + } else { + op[idx-base].logic = vpiX; + op[idx-base].s0 = vpiStrongDrive; + op[idx-base].s1 = vpiStrongDrive; + } + } + + vp->value.strength = op; +} + +static void format_vpiVectorVal(vvp_fun_signal_vec*sig, int base, unsigned wid, + s_vpi_value*vp) +{ + long end = base + (signed)wid; + unsigned int obit = 0; + unsigned hwid = (wid - 1)/32 + 1; + + s_vpi_vecval *op = (p_vpi_vecval) + need_result_buf(hwid * sizeof(s_vpi_vecval), RBUF_VAL); + vp->value.vector = op; + + op->aval = op->bval = 0; + for (long idx = base ; idx < end ; idx += 1) { + if (base >= 0 && base < (signed)sig->size()) { + switch (sig->value(idx)) { + case BIT4_0: + op->aval &= ~(1 << obit); + op->bval &= ~(1 << obit); + break; + case BIT4_1: + op->aval |= (1 << obit); + op->bval &= ~(1 << obit); + break; + case BIT4_X: + op->aval |= (1 << obit); + op->bval |= (1 << obit); + break; + case BIT4_Z: + op->aval &= ~(1 << obit); + op->bval |= (1 << obit); + break; + } + } else { /* BIT4_X */ + op->aval |= (1 << obit); + op->bval |= (1 << obit); + } + + obit++; + if (!(obit % 32)) { + op += 1; + if ((op - vp->value.vector) < (ptrdiff_t)hwid) + op->aval = op->bval = 0; + obit = 0; + } + } +} + struct __vpiSignal* vpip_signal_from_handle(vpiHandle ref) { if ((ref->vpi_type->type_code != vpiNet) @@ -245,65 +624,6 @@ static vpiHandle signal_iterate(int code, vpiHandle ref) return 
0; } - -static char *signal_vpiDecStrVal(struct __vpiSignal*rfp, s_vpi_value*vp) -{ - vvp_fun_signal_vec*vsig = dynamic_cast(rfp->node->fun); - assert(vsig); - - unsigned hwid = (vsig->size()+2) / 3 + 1; - char *rbuf = need_result_buf(hwid, RBUF_VAL); - - vpip_vec4_to_dec_str(vsig->vec4_value(), rbuf, hwid, rfp->signed_flag); - - return rbuf; -} - - -static char *signal_vpiStringVal(struct __vpiSignal*rfp, s_vpi_value*vp) -{ - unsigned wid = (rfp->msb >= rfp->lsb) - ? (rfp->msb - rfp->lsb + 1) - : (rfp->lsb - rfp->msb + 1); - - vvp_fun_signal*vsig = dynamic_cast(rfp->node->fun); - - /* The result will use a character for each 8 bits of the - vector. Add one extra character for the highest bits that - don't form an 8 bit group. */ - char *rbuf = need_result_buf(wid/8 + ((wid&7)!=0) + 1, RBUF_VAL); - char *cp = rbuf; - - char tmp = 0; - int bitnr; - for(bitnr=wid-1; bitnr>=0; bitnr--){ - tmp <<= 1; - - switch (vsig->value(bitnr)) { - case BIT4_0: - break; - case BIT4_1: - tmp |= 1; - break; - default: - break; - } - - if ((bitnr&7)==0){ - /* Skip leading nulls. */ - if (tmp == 0 && cp == rbuf) - continue; - - /* Nulls in the middle get turned into spaces. */ - *cp++ = tmp? tmp : ' '; - tmp = 0; - } - } - *cp++ = 0; - - return rbuf; -} - static unsigned signal_width(const struct __vpiSignal*rfp) { unsigned wid = (rfp->msb >= rfp->lsb) @@ -313,97 +633,6 @@ static unsigned signal_width(const struct __vpiSignal*rfp) return wid; } -static void signal_get_IntVal(struct __vpiSignal*rfp, s_vpi_value*vp) -{ - unsigned wid = signal_width(rfp); - unsigned iwid = 8 * sizeof vp->value.integer; - vvp_fun_signal_vec*vsig = dynamic_cast(rfp->node->fun); - - if (wid > iwid) { - wid = iwid; - } - vp->value.integer = 0; - - for (unsigned idx = 0 ; idx < wid ; idx += 1) { - switch (vsig->value(idx)) { - case BIT4_0: - break; - case BIT4_1: - vp->value.integer |= 1<(rfp->node->fun); - - switch (vsig->value(0)) { - case BIT4_0: - vp->value.scalar = vpi0; - break; - case BIT4_1: - vp->value.scalar = vpi1; - break; - case BIT4_X: - vp->value.scalar = vpiX; - break; - case BIT4_Z: - vp->value.scalar = vpiZ; - break; - } -} - -static void signal_get_StrengthVal(struct __vpiSignal*rfp, s_vpi_value*vp) -{ - vvp_fun_signal_vec*vsig = dynamic_cast(rfp->node->fun); - unsigned wid = signal_width(rfp); - s_vpi_strengthval*op; - - op = (s_vpi_strengthval*) - need_result_buf(wid * sizeof(s_vpi_strengthval), RBUF_VAL); - - for (unsigned idx = 0 ; idx < wid ; idx += 1) { - vvp_scalar_t val = vsig->scalar_value(idx); - - /* vvp_scalar_t strengths are 0-7, but the vpi strength - is bit0-bit7. This gets the vpi form of the strengths - from the vvp_scalar_t strengths. */ - unsigned s0 = 1 << val.strength0(); - unsigned s1 = 1 << val.strength1(); - - switch (val.value()) { - case BIT4_0: - op[idx].logic = vpi0; - op[idx].s0 = s0|s1; - op[idx].s1 = 0; - break; - case BIT4_1: - op[idx].logic = vpi1; - op[idx].s0 = 0; - op[idx].s1 = s0|s1; - break; - case BIT4_X: - op[idx].logic = vpiX; - op[idx].s0 = s0; - op[idx].s1 = s1; - break; - case BIT4_Z: - op[idx].logic = vpiZ; - op[idx].s0 = vpiHiZ; - op[idx].s1 = vpiHiZ; - break; - } - } - - vp->value.strength = op; -} - /* * The get_value method reads the values of the functors and returns * the vector to the caller. 
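 * All the format cases below delegate to the shared format_vpi*Val() helpers, passing base 0 and, where needed, the full signal width.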
This causes no side-effect, and reads the @@ -421,146 +650,48 @@ static void signal_get_value(vpiHandle ref, s_vpi_value*vp) vvp_fun_signal_vec*vsig = dynamic_cast(rfp->node->fun); assert(vsig); - char *rbuf = 0; - switch (vp->format) { case vpiIntVal: - signal_get_IntVal(rfp, vp); + format_vpiIntVal(vsig, 0, wid, vp); break; case vpiScalarVal: - signal_get_ScalarVal(rfp, vp); + format_vpiScalarVal(vsig, 0, vp); break; case vpiStrengthVal: - signal_get_StrengthVal(rfp, vp); + format_vpiStrengthVal(vsig, 0, wid, vp); break; case vpiBinStrVal: - rbuf = need_result_buf(wid+1, RBUF_VAL); - - for (unsigned idx = 0 ; idx < wid ; idx += 1) { - rbuf[wid-idx-1] = vvp_bit4_to_ascii(vsig->value(idx)); - } - rbuf[wid] = 0; - vp->value.str = rbuf; + format_vpiBinStrVal(vsig, 0, wid, vp); break; case vpiHexStrVal: { - unsigned hwid = (wid + 3) / 4; - - rbuf = need_result_buf(hwid+1, RBUF_VAL); - rbuf[hwid] = 0; - - vpip_vec4_to_hex_str(vsig->vec4_value(), rbuf, hwid+1, false); - vp->value.str = rbuf; - break; + format_vpiHexStrVal(vsig, 0, wid, vp); + break; } - case vpiOctStrVal: { - unsigned hval, hwid; - hwid = (wid + 2) / 3; - - rbuf = need_result_buf(hwid+1, RBUF_VAL); - rbuf[hwid] = 0; - hval = 0; - for (unsigned idx = 0 ; idx < wid ; idx += 1) { - unsigned tmp = 0; - switch (vsig->value(idx)) { - case BIT4_0: - tmp = 0; - break; - case BIT4_1: - tmp = 1; - break; - case BIT4_Z: - tmp = 3; - break; - case BIT4_X: - tmp = 2; - break; - } - hval = hval | (tmp << 2*(idx % 3)); - - if (idx%3 == 2) { - hwid -= 1; - rbuf[hwid] = oct_digits[hval]; - hval = 0; - } - } - - if (hwid > 0) { - hwid -= 1; - rbuf[hwid] = oct_digits[hval]; - unsigned padd = 0; - switch(rbuf[hwid]) { - case 'X': padd = 2; break; - case 'Z': padd = 3; break; - } - if (padd) { - for (unsigned idx = wid % 3; idx < 3; idx += 1) { - hval = hval | padd << 2*idx; - } - rbuf[hwid] = oct_digits[hval]; - } - } - vp->value.str = rbuf; - break; - } + case vpiOctStrVal: + format_vpiOctStrVal(vsig, 0, wid, vp); + break; case vpiDecStrVal: - vp->value.str = signal_vpiDecStrVal(rfp, vp); + format_vpiDecStrVal(vsig, 0, wid, rfp->signed_flag, vp); break; case vpiStringVal: - vp->value.str = signal_vpiStringVal(rfp, vp); + format_vpiStringVal(vsig, 0, wid, vp); break; - case vpiVectorVal: { - unsigned int obit = 0; - unsigned hwid = (wid - 1)/32 + 1; - - rbuf = need_result_buf(hwid * sizeof(s_vpi_vecval), RBUF_VAL); - s_vpi_vecval *op = (p_vpi_vecval)rbuf; - vp->value.vector = op; - - op->aval = op->bval = 0; - for (unsigned idx = 0 ; idx < wid ; idx += 1) { - switch (vsig->value(idx)) { - case BIT4_0: - op->aval &= ~(1 << obit); - op->bval &= ~(1 << obit); - break; - case BIT4_1: - op->aval |= (1 << obit); - op->bval &= ~(1 << obit); - break; - case BIT4_X: - op->aval |= (1 << obit); - op->bval |= (1 << obit); - break; - case BIT4_Z: - op->aval &= ~(1 << obit); - op->bval |= (1 << obit); - break; - } - obit++; - if (!(obit % 32)) { - op += 1; - if ((op - vp->value.vector) < (ptrdiff_t)hwid) - op->aval = op->bval = 0; - obit = 0; - } - } - break; - } + case vpiVectorVal: + format_vpiVectorVal(vsig, 0, wid, vp); + break; case vpiRealVal: { - bool flag = rfp->signed_flag; - vp->value.real = 0.0; - vector4_to_value(vsig->vec4_value(), vp->value.real, flag); - break; + format_vpiRealVal(vsig, 0, wid, rfp->signed_flag, vp); + break; } default: @@ -812,3 +943,218 @@ vpiHandle vpip_make_net(const char*name, int msb, int lsb, return &obj->base; } + +static int PV_get_base(struct __vpiPV*rfp) +{ + if (rfp->twid == 0) return rfp->tbase; + + int tval = 0; + 
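+      /* Assemble the base from the thread bits that hold the evaluated index expression, LSB first; x and z bits read as 0. */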
for (unsigned idx = 0 ; idx < rfp->twid ; idx += 1) { + vvp_bit4_t bit = vthread_get_bit(vpip_current_vthread, + rfp->tbase + idx); + if (bit == BIT4_1) { + tval |= 1<vpi_type->type_code == vpiPartSelect); + struct __vpiPV*rfp = (struct __vpiPV*)ref; + + int rval = 0; + switch (code) { + case vpiLineNo: + return 0; // Not implemented for now! + + case vpiSigned: + return 0; // A part/bit select is always unsigned! + + case vpiSize: + return rfp->width; + + case vpiConstantSelect: + return rfp->twid == 0; + + case vpiLeftRange: rval += rfp->width; + case vpiRightRange: + rval += vpi_get(vpiRightRange, rfp->parent) + PV_get_base(rfp); + return rval; + + default: + fprintf(stderr, "PV_get: property %d is unknown\n", code); + } + + return 0; +} + +static char* PV_get_str(int code, vpiHandle ref) +{ + assert(ref->vpi_type->type_code == vpiPartSelect); + struct __vpiPV*rfp = (struct __vpiPV*)ref; + + switch (code) { + case vpiFile: // Not implemented for now! + return simple_set_rbuf_str(file_names[0]); + + case vpiName: + case vpiFullName: { + const char*nm = vpi_get_str(code, rfp->parent); + char full[1024+strlen(nm)]; + sprintf(full, "%s[%d:%d]", nm, vpi_get(vpiLeftRange, ref), + vpi_get(vpiRightRange, ref)); + return simple_set_rbuf_str(full); + } + + default: + fprintf(stderr, "PV_get_str: property %d is unknown\n", code); + } + + return 0; +} + +static void PV_get_value(vpiHandle ref, p_vpi_value vp) +{ + assert(ref->vpi_type->type_code == vpiPartSelect); + struct __vpiPV*rfp = (struct __vpiPV*)ref; + + vvp_fun_signal_vec*sig = dynamic_cast(rfp->net->fun); + assert(sig); + + switch (vp->format) { + + case vpiIntVal: + format_vpiIntVal(sig, PV_get_base(rfp), rfp->width, vp); + break; + + case vpiBinStrVal: + format_vpiBinStrVal(sig, PV_get_base(rfp), rfp->width, vp); + break; + + case vpiOctStrVal: + format_vpiOctStrVal(sig, PV_get_base(rfp), rfp->width, vp); + break; + + case vpiHexStrVal: + format_vpiHexStrVal(sig, PV_get_base(rfp), rfp->width, vp); + break; + + case vpiDecStrVal: + format_vpiDecStrVal(sig, PV_get_base(rfp), rfp->width, 0, vp); + break; + + case vpiStringVal: + format_vpiStringVal(sig, PV_get_base(rfp), rfp->width, vp); + break; + + case vpiScalarVal: + format_vpiScalarVal(sig, PV_get_base(rfp), vp); + break; + + case vpiStrengthVal: + format_vpiStrengthVal(sig, PV_get_base(rfp), rfp->width, vp); + break; + + case vpiVectorVal: + format_vpiVectorVal(sig, PV_get_base(rfp), rfp->width, vp); + break; + + case vpiRealVal: + format_vpiRealVal(sig, PV_get_base(rfp), rfp->width, 0, vp); + break; + + default: + fprintf(stderr, "vvp internal error: PV_get_value: " + "value type %u not implemented. Signal is %s.\n", + vp->format, vpi_get_str(vpiFullName, rfp->parent)); + assert(0); + } +} + +static vpiHandle PV_put_value(vpiHandle ref, p_vpi_value vp, int) +{ + assert(ref->vpi_type->type_code == vpiPartSelect); + struct __vpiPV*rfp = (struct __vpiPV*)ref; + vvp_fun_signal_vec*sig = reinterpret_cast(rfp->net); + assert(sig); + + unsigned width = rfp->width; + int base = PV_get_base(rfp); + if (base >= (signed) sig->size()) return 0; + if (base < 0) { + width += base; + base = 0; + } + if (base+width > sig->size()) width = sig->size() - base; + + bool full_sig = base == 0 && width == sig->size(); + + vvp_net_ptr_t ptr (rfp->net, 0); + +/* We only support integer values. 
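+   Any other format trips the assertion below. The integer is sent to the net as a long, to the whole signal when the select spans it and as a part-select write otherwise.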
*/ + assert(vp->format == vpiIntVal); + if (full_sig) { + vvp_send_long(ptr, vp->value.integer); + } else { + vvp_send_long_pv(ptr, vp->value.integer, base, width); + } + + return 0; +} + +static vpiHandle PV_get_handle(int code, vpiHandle ref) +{ + assert(ref->vpi_type->type_code==vpiPartSelect); + struct __vpiPV*rfp = (struct __vpiPV*)ref; + + switch (code) { + + case vpiParent: + return rfp->parent; + break; + } + + return 0; +} + +static const struct __vpirt vpip_PV_rt = { + vpiPartSelect, + PV_get, + PV_get_str, + PV_get_value, + PV_put_value, + PV_get_handle, + 0 +}; + +vpiHandle vpip_make_PV(char*var, int base, int width) +{ + + struct __vpiPV*obj = (struct __vpiPV*) malloc(sizeof(struct __vpiPV)); + obj->base.vpi_type = &vpip_PV_rt; + obj->parent = vvp_lookup_handle(var); + obj->tbase = base; + obj->twid = 0; + obj->width = (unsigned) width; + obj->net = (vvp_net_t*) malloc(sizeof(vvp_net_t)); + functor_ref_lookup(&obj->net, var); + + return &obj->base; +} + +vpiHandle vpip_make_PV(char*var, int tbase, int twid, int width) +{ + struct __vpiPV*obj = (struct __vpiPV*) malloc(sizeof(struct __vpiPV)); + obj->base.vpi_type = &vpip_PV_rt; + obj->parent = vvp_lookup_handle(var); + obj->tbase = tbase; + obj->twid = (unsigned) twid; + obj->width = (unsigned) width; + obj->net = (vvp_net_t*) malloc(sizeof(vvp_net_t)); + functor_ref_lookup(&obj->net, var); + + return &obj->base; +} diff --git a/vvp/vpi_vthr_vector.cc b/vvp/vpi_vthr_vector.cc index 72a043fe5..85082b935 100644 --- a/vvp/vpi_vthr_vector.cc +++ b/vvp/vpi_vthr_vector.cc @@ -67,7 +67,7 @@ void set_bit(struct __vpiVThrVec *rfp, unsigned idx, vvp_bit4_t bit) extern const char hex_digits[256]; -extern const char oct_digits[256]; +extern const char oct_digits[64]; /* * vpi_get From 6f308131021fadb20bdda0284df2dada65b4e019 Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Thu, 29 May 2008 13:52:12 -0700 Subject: [PATCH 14/16] Prevent overflow when parsing 32bit values The source can carry 32bit numbers. Watch out that they are handled all the way through to the compiled results on 32bit systems. --- vvp/lexor.lex | 4 ++-- vvp/parse.y | 2 +- vvp/vthread.cc | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vvp/lexor.lex b/vvp/lexor.lex index 0a266b389..fb2ec4ade 100644 --- a/vvp/lexor.lex +++ b/vvp/lexor.lex @@ -184,11 +184,11 @@ return T_INSTR; } [0-9][0-9]* { - yylval.numb = strtol(yytext, 0, 0); + yylval.numb = strtoul(yytext, 0, 0); return T_NUMBER; } "0x"[0-9a-fA-F]+ { - yylval.numb = strtol(yytext, 0, 0); + yylval.numb = strtoul(yytext, 0, 0); return T_NUMBER; } /* Handle some specialized constant/literals as symbols. */ diff --git a/vvp/parse.y b/vvp/parse.y index 729a0158d..fd12da226 100644 --- a/vvp/parse.y +++ b/vvp/parse.y @@ -47,7 +47,7 @@ static struct __vpiModPath*modpath_dst = 0; %union { char*text; char **table; - long numb; + unsigned long numb; bool flag; comp_operands_t opa; diff --git a/vvp/vthread.cc b/vvp/vthread.cc index 5f82644ba..8b3b693fa 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -1102,7 +1102,7 @@ static bool of_CMPIU_the_hard_way(vthread_t thr, vvp_code_t cp) { unsigned idx1 = cp->bit_idx[0]; - unsigned imm = cp->bit_idx[1]; + unsigned long imm = cp->bit_idx[1]; unsigned wid = cp->number; if (idx1 >= 4) thr_check_addr(thr, idx1+wid-1); @@ -1116,8 +1116,8 @@ static bool of_CMPIU_the_hard_way(vthread_t thr, vvp_code_t cp) vvp_bit4_t eq = BIT4_0; for (unsigned idx = 0 ; idx < wid ; idx += 1) { - vvp_bit4_t rv = (imm & 1)? 
BIT4_1 : BIT4_0; - imm >>= 1; + vvp_bit4_t rv = (imm & 1UL)? BIT4_1 : BIT4_0; + imm >>= 1UL; if (bit4_is_xz(lv)) { eq = BIT4_X; From 6f0d98cf186ddae536e1e5c9952f334f0242984f Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Thu, 29 May 2008 14:00:03 -0700 Subject: [PATCH 15/16] Constrain multiply word to prevent overflow. The multiply does not need to do all the combinations of digit products, because the higher-order ones cannot contribute to the result. Fix the iteration to limit the scan. --- vvp/vthread.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vvp/vthread.cc b/vvp/vthread.cc index 5f82644ba..b170cb01f 100644 --- a/vvp/vthread.cc +++ b/vvp/vthread.cc @@ -3032,7 +3032,7 @@ bool of_MUL(vthread_t thr, vvp_code_t cp) res[idx] = 0; for (unsigned mul_a = 0 ; mul_a < words ; mul_a += 1) { - for (unsigned mul_b = 0 ; mul_b < words ; mul_b += 1) { + for (unsigned mul_b = 0 ; mul_b < (words-mul_a) ; mul_b += 1) { unsigned long sum; unsigned long tmp = multiply_with_carry(ap[mul_a], bp[mul_b], sum); unsigned base = mul_a + mul_b; From a8f492776aa38ff1358e8fc3cd13ace415dcee3b Mon Sep 17 00:00:00 2001 From: Stephen Williams Date: Thu, 29 May 2008 14:00:32 -0700 Subject: [PATCH 16/16] Check range of immediate value. --- tgt-vvp/eval_expr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c index 3bcbf2717..d4f341cd7 100644 --- a/tgt-vvp/eval_expr.c +++ b/tgt-vvp/eval_expr.c @@ -92,6 +92,7 @@ unsigned long get_number_immediate(ivl_expr_t ex) case '0': break; case '1': + assert(idx < 8*sizeof(imm)); imm |= 1UL << idx; break; default: