From e5381feb85b1d3a711b8205b5b2fff627b5a773d Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Sun, 2 Dec 2007 08:47:06 -0800
Subject: [PATCH 1/5] Clean up functor counters

The functor counters were left over from the v0.8 release. Rework
the counters to be relevant to the current state of vvp.

Signed-off-by: Stephen Williams <steve@icarus.com>
---
 vvp/logic.cc      | 16 +++++---
 vvp/main.cc       | 95 ++---------------------------------------------
 vvp/resolv.cc     | 12 +-----
 vvp/statistics.cc | 29 ++-------------
 vvp/statistics.h  | 27 ++------------
 vvp/vvp_net.cc    |  6 ++-
 6 files changed, 25 insertions(+), 160 deletions(-)

diff --git a/vvp/logic.cc b/vvp/logic.cc
index 5ffece35b..585ce8e52 100644
--- a/vvp/logic.cc
+++ b/vvp/logic.cc
@@ -43,7 +43,7 @@
 table_functor_s::table_functor_s(truth_t t)
 : table(t)
 {
-      count_functors_table += 1;
+      count_functors_logic += 1;
 }
 
 table_functor_s::~table_functor_s()
@@ -125,6 +125,7 @@ void vvp_fun_boolean_::recv_vec4_pv(vvp_net_ptr_t ptr, const vvp_vector4_t&bit,
 vvp_fun_and::vvp_fun_and(unsigned wid, bool invert)
 : vvp_fun_boolean_(wid), invert_(invert)
 {
+      count_functors_logic += 1;
 }
 
 vvp_fun_and::~vvp_fun_and()
@@ -160,6 +161,7 @@ void vvp_fun_and::run_run()
 vvp_fun_eeq::vvp_fun_eeq(unsigned wid, bool invert)
 : vvp_fun_boolean_(wid), invert_(invert)
 {
+      count_functors_logic += 1;
 }
 
 vvp_fun_eeq::~vvp_fun_eeq()
@@ -188,7 +190,7 @@ void vvp_fun_eeq::run_run()
 vvp_fun_buf::vvp_fun_buf()
 {
       net_ = 0;
-      count_functors_table += 1;
+      count_functors_logic += 1;
 }
 
 vvp_fun_buf::~vvp_fun_buf()
@@ -227,7 +229,7 @@ void vvp_fun_buf::run_run()
 
 vvp_fun_bufz::vvp_fun_bufz()
 {
-      count_functors_table += 1;
+      count_functors_logic += 1;
 }
 
 vvp_fun_bufz::~vvp_fun_bufz()
@@ -257,7 +259,7 @@ void vvp_fun_bufz::recv_real(vvp_net_ptr_t ptr, double bit)
 vvp_fun_muxr::vvp_fun_muxr()
 : a_(0.0), b_(0.0)
 {
-      count_functors_table += 1;
+      count_functors_logic += 1;
       select_ = 2;
 }
 
@@ -333,7 +335,7 @@ void vvp_fun_muxr::recv_real(vvp_net_ptr_t ptr, double bit)
 vvp_fun_muxz::vvp_fun_muxz(unsigned wid)
 : a_(wid), b_(wid)
 {
-      count_functors_table += 1;
+      count_functors_logic += 1;
       select_ = 2;
       for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
 	    a_.set_bit(idx, BIT4_X);
@@ -408,7 +410,7 @@ void vvp_fun_muxz::recv_vec4(vvp_net_ptr_t ptr, const vvp_vector4_t&bit)
 vvp_fun_not::vvp_fun_not()
 {
       net_ = 0;
-      count_functors_table += 1;
+      count_functors_logic += 1;
 }
 
 vvp_fun_not::~vvp_fun_not()
@@ -452,6 +454,7 @@ void vvp_fun_not::run_run()
 vvp_fun_or::vvp_fun_or(unsigned wid, bool invert)
 : vvp_fun_boolean_(wid), invert_(invert)
 {
+      count_functors_logic += 1;
 }
 
 vvp_fun_or::~vvp_fun_or()
@@ -487,6 +490,7 @@ void vvp_fun_or::run_run()
 vvp_fun_xor::vvp_fun_xor(unsigned wid, bool invert)
 : vvp_fun_boolean_(wid), invert_(invert)
 {
+      count_functors_logic += 1;
 }
 
 vvp_fun_xor::~vvp_fun_xor()
diff --git a/vvp/main.cc b/vvp/main.cc
index 5bf72a593..cecdd6990 100644
--- a/vvp/main.cc
+++ b/vvp/main.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001 Stephen Williams (steve@icarus.com)
+ * Copyright (c) 2001-2007 Stephen Williams (steve@icarus.com)
  *
  *    This source code is free software; you can redistribute it
  *    and/or modify it in source code form under the terms of the GNU
@@ -16,9 +16,6 @@
  *    along with this program; if not, write to the Free Software
  *    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
  */
-#ifdef HAVE_CVS_IDENT
-#ident "$Id: main.cc,v 1.44 2007/02/16 23:30:14 steve Exp $"
-#endif
 
 # include  "config.h"
 # include  "parse_misc.h"
@@ -267,10 +264,10 @@ int main(int argc, char*argv[])
 
       if (verbose_flag) {
 	    vpi_mcd_printf(1, " ... %8lu functors\n", count_functors);
-	    vpi_mcd_printf(1, "           %8lu table\n",  count_functors_table);
+	    vpi_mcd_printf(1, "           %8lu logic\n",  count_functors_logic);
 	    vpi_mcd_printf(1, "           %8lu bufif\n",  count_functors_bufif);
 	    vpi_mcd_printf(1, "           %8lu resolv\n",count_functors_resolv);
-	    vpi_mcd_printf(1, "           %8lu variable\n", count_functors_var);
+	    vpi_mcd_printf(1, "           %8lu signals\n", count_functors_sig);
 	    vpi_mcd_printf(1, " ... %8lu opcodes (%lu bytes)\n",
 		    count_opcodes, (unsigned long)size_opcodes);
 	    vpi_mcd_printf(1, " ... %8lu nets\n",     count_vpi_nets);
@@ -306,89 +303,3 @@ int main(int argc, char*argv[])
       return 0;
 }
 
-/*
- * $Log: main.cc,v $
- * Revision 1.44  2007/02/16 23:30:14  steve
- *  Get page size from sysconf.
- *
- * Revision 1.43  2006/04/28 15:44:37  steve
- *  Include math.h with lround implementation.
- *
- * Revision 1.42  2006/04/28 15:40:30  steve
- *  lround takes double, not float.
- *
- * Revision 1.41  2006/04/27 05:04:59  steve
- *  Detect missing lround function.
- *
- * Revision 1.40  2005/01/29 06:28:19  steve
- *  Add the -s flag to start up interactive.
- *
- * Revision 1.39  2004/10/04 01:10:59  steve
- *  Clean up spurious trailing white space.
- *
- * Revision 1.38  2003/06/25 04:04:19  steve
- *  Fix mingw portability problems.
- *
- * Revision 1.37  2003/06/13 19:51:08  steve
- *  Include verbose messages in log output.
- *
- * Revision 1.36  2003/05/15 16:51:09  steve
- *  Arrange for mcd id=00_00_00_01 to go to stdout
- *  as well as a user specified log file, set log
- *  file to buffer lines.
- *
- *  Add vpi_flush function, and clear up some cunfused
- *  return codes from other vpi functions.
- *
- *  Adjust $display and vcd/lxt messages to use the
- *  standard output/log file.
- *
- * Revision 1.35  2003/03/13 04:36:57  steve
- *  Remove the obsolete functor delete functions.
- *
- * Revision 1.34  2003/02/07 02:45:05  steve
- *  Mke getopt ignore options after the file name.
- *
- * Revision 1.33  2003/01/18 23:55:35  steve
- *  Add a means to clear the module search path.
- *
- * Revision 1.32  2003/01/06 23:57:26  steve
- *  Schedule wait lists of threads as a single event,
- *  to save on events. Also, improve efficiency of
- *  event_s allocation. Add some event statistics to
- *  get an idea where performance is really going.
- *
- * Revision 1.31  2002/09/18 03:34:07  steve
- *  printf size warning.
- *
- * Revision 1.30  2002/08/12 01:35:08  steve
- *  conditional ident string using autoconfig.
- *
- * Revision 1.29  2002/07/15 00:21:42  steve
- *  Fix initialization of symbol table string heap.
- *
- * Revision 1.28  2002/07/05 20:08:44  steve
- *  Count different types of functors.
- *
- * Revision 1.27  2002/07/05 17:14:15  steve
- *  Names of vpi objects allocated as vpip_strings.
- *
- * Revision 1.26  2002/07/05 03:47:06  steve
- *  Track opcode memory space.
- *
- * Revision 1.25  2002/07/05 02:50:58  steve
- *  Remove the vpi object symbol table after compile.
- *
- * Revision 1.24  2002/04/12 02:44:02  steve
- *  Formally define extended arguments to vvp.
- *
- * Revision 1.23  2002/03/01 05:43:14  steve
- *  Add cleanup to verbose messages.
- *
- * Revision 1.22  2002/01/09 03:15:23  steve
- *  Add vpi_get_vlog_info support.
- *
- * Revision 1.21  2001/10/20 01:03:42  steve
- *  Print memory usage information if requested (Stephan Boettcher)
- */
-
diff --git a/vvp/resolv.cc b/vvp/resolv.cc
index 5bf36ff28..e375b5c41 100644
--- a/vvp/resolv.cc
+++ b/vvp/resolv.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2001-2004 Stephen Williams (steve@icarus.com)
+ * Copyright (c) 2001-2007 Stephen Williams (steve@icarus.com)
  *
  *    This source code is free software; you can redistribute it
  *    and/or modify it in source code form under the terms of the GNU
@@ -16,9 +16,6 @@
  *    along with this program; if not, write to the Free Software
  *    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
  */
-#ifdef HAVE_CVS_IDENT
-#ident "$Id: resolv.cc,v 1.26 2005/06/22 18:30:12 steve Exp $"
-#endif
 
 # include  "resolv.h"
 # include  "schedule.h"
@@ -31,6 +28,7 @@
 resolv_functor::resolv_functor(vvp_scalar_t hiz_value, const char*debug_l)
 : hiz_(hiz_value), debug_label_(debug_l)
 {
+      count_functors_resolv += 1;
 }
 
 resolv_functor::~resolv_functor()
@@ -95,9 +93,3 @@ void resolv_functor::recv_vec8(vvp_net_ptr_t port, vvp_vector8_t bit)
 
       vvp_send_vec8(ptr->out, out);
 }
-
-
-/*
- * $Log: resolv.cc,v $
- */
-
diff --git a/vvp/statistics.cc b/vvp/statistics.cc
index 024599f3c..79bb19d24 100644
--- a/vvp/statistics.cc
+++ b/vvp/statistics.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002 Stephen Williams (steve@icarus.com)
+ * Copyright (c) 2002-2007 Stephen Williams (steve@icarus.com)
  *
  *    This source code is free software; you can redistribute it
  *    and/or modify it in source code form under the terms of the GNU
@@ -16,9 +16,6 @@
  *    along with this program; if not, write to the Free Software
  *    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
  */
-#ifdef HAVE_CVS_IDENT
-#ident "$Id: statistics.cc,v 1.5 2002/08/12 01:35:08 steve Exp $"
-#endif
 
 # include  "statistics.h"
 
@@ -28,10 +25,10 @@
 unsigned long count_opcodes = 0;
 
 unsigned long count_functors = 0;
-unsigned long count_functors_table = 0;
+unsigned long count_functors_logic = 0;
 unsigned long count_functors_bufif = 0;
 unsigned long count_functors_resolv= 0;
-unsigned long count_functors_var   = 0;
+unsigned long count_functors_sig   = 0;
 
 unsigned long count_vpi_nets = 0;
 
@@ -41,23 +38,3 @@ unsigned long count_vpi_scopes = 0;
 
 size_t size_opcodes = 0;
 
-
-/*
- * $Log: statistics.cc,v $
- * Revision 1.5  2002/08/12 01:35:08  steve
- *  conditional ident string using autoconfig.
- *
- * Revision 1.4  2002/07/05 20:08:44  steve
- *  Count different types of functors.
- *
- * Revision 1.3  2002/07/05 17:14:15  steve
- *  Names of vpi objects allocated as vpip_strings.
- *
- * Revision 1.2  2002/07/05 03:46:43  steve
- *  Track opcode memory space.
- *
- * Revision 1.1  2002/07/05 02:50:58  steve
- *  Remove the vpi object symbol table after compile.
- *
- */
-
diff --git a/vvp/statistics.h b/vvp/statistics.h
index 848b06699..1bd6e3bee 100644
--- a/vvp/statistics.h
+++ b/vvp/statistics.h
@@ -1,7 +1,7 @@
 #ifndef __statistics_H
 #define __statistics_H
 /*
- * Copyright (c) 2002 Stephen Williams (steve@icarus.com)
+ * Copyright (c) 2002-2007 Stephen Williams (steve@icarus.com)
  *
  *    This source code is free software; you can redistribute it
  *    and/or modify it in source code form under the terms of the GNU
@@ -18,40 +18,19 @@
  *    along with this program; if not, write to the Free Software
  *    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
  */
-#ifdef HAVE_CVS_IDENT
-#ident "$Id: statistics.h,v 1.5 2002/08/12 01:35:08 steve Exp $"
-#endif
 
 # include  <stddef.h>
 
 extern unsigned long count_opcodes;
 extern unsigned long count_functors;
-extern unsigned long count_functors_table;
+extern unsigned long count_functors_logic;
 extern unsigned long count_functors_bufif;
 extern unsigned long count_functors_resolv;
-extern unsigned long count_functors_var;
+extern unsigned long count_functors_sig;
 extern unsigned long count_vpi_nets;
 extern unsigned long count_vpi_scopes;
 extern unsigned long count_vpi_memories;
 
 extern size_t size_opcodes;
 
-/*
- * $Log: statistics.h,v $
- * Revision 1.5  2002/08/12 01:35:08  steve
- *  conditional ident string using autoconfig.
- *
- * Revision 1.4  2002/07/05 20:08:44  steve
- *  Count different types of functors.
- *
- * Revision 1.3  2002/07/05 17:14:15  steve
- *  Names of vpi objects allocated as vpip_strings.
- *
- * Revision 1.2  2002/07/05 03:46:43  steve
- *  Track opcode memory space.
- *
- * Revision 1.1  2002/07/05 02:50:58  steve
- *  Remove the vpi object symbol table after compile.
- *
- */
 #endif
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index 595f02443..e1eade218 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2005 Stephen Williams (steve@icarus.com)
+ * Copyright (c) 2004-2007 Stephen Williams (steve@icarus.com)
  *
  *    This source code is free software; you can redistribute it
  *    and/or modify it in source code form under the terms of the GNU
@@ -16,11 +16,11 @@
  *    along with this program; if not, write to the Free Software
  *    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
  */
-#ident "$Id: vvp_net.cc,v 1.64 2007/06/12 02:36:58 steve Exp $"
 
 # include  "config.h"
 # include  "vvp_net.h"
 # include  "schedule.h"
+# include  "statistics.h"
 # include  <stdio.h>
 # include  <iostream>
 # include  <typeinfo>
@@ -1406,6 +1406,7 @@ ostream& operator<<(ostream&out, const vvp_vector8_t&that)
 
 vvp_net_fun_t::vvp_net_fun_t()
 {
+      count_functors += 1;
 }
 
 vvp_net_fun_t::~vvp_net_fun_t()
@@ -1477,6 +1478,7 @@ vvp_fun_signal_base::vvp_fun_signal_base()
       continuous_assign_active_ = false;
       force_link = 0;
       cassign_link = 0;
+      count_functors_sig += 1;
 }
 
 void vvp_fun_signal_base::deassign()

From 68a9526fec3f3dc29dc40a5b79a9a5428efe159b Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Sun, 2 Dec 2007 19:00:12 -0800
Subject: [PATCH 2/5] Minor performance tweak of vector_to_array function.

---
 vvp/vthread.cc |  3 ++-
 vvp/vvp_net.cc | 12 +++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 878418e17..3fe8e53e6 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -170,16 +170,17 @@ void vthread_put_real(struct vthread_s*thr, unsigned addr, double val)
 static unsigned long* vector_to_array(struct vthread_s*thr,
 				      unsigned addr, unsigned wid)
 {
-      unsigned awid = (wid + CPU_WORD_BITS - 1) / (CPU_WORD_BITS);
 
 
       if (addr == 0) {
+	    unsigned awid = (wid + CPU_WORD_BITS - 1) / (CPU_WORD_BITS);
 	    unsigned long*val = new unsigned long[awid];
 	    for (unsigned idx = 0 ;  idx < awid ;  idx += 1)
 		  val[idx] = 0;
 	    return val;
       }
       if (addr == 1) {
+	    unsigned awid = (wid + CPU_WORD_BITS - 1) / (CPU_WORD_BITS);
 	    unsigned long*val = new unsigned long[awid];
 	    for (unsigned idx = 0 ;  idx < awid ;  idx += 1)
 		  val[idx] = -1UL;
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index e1eade218..aed907fe7 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -355,6 +355,9 @@ unsigned long* vvp_vector4_t::subarray(unsigned adr, unsigned wid) const
 	    unsigned long tmp = bits_ptr_[adr/BITS_PER_WORD];
 	    tmp >>= 2UL * (adr%BITS_PER_WORD);
 
+	    unsigned long mask1 = 1;
+	    const unsigned long mask1_last = 1UL << (BIT2_PER_WORD-1);
+	    unsigned long*val_ptr = val;
 	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
 		    /* Starting a new word? */
 		  if (adr%BITS_PER_WORD == 0)
@@ -363,10 +366,17 @@ unsigned long* vvp_vector4_t::subarray(unsigned adr, unsigned wid) const
 		  if (tmp&2)
 			goto x_out;
 		  if (tmp&1)
-			val[idx/BIT2_PER_WORD] |= 1UL << (idx % BIT2_PER_WORD);
+			*val_ptr |= mask1;
 
 		  adr += 1;
 		  tmp >>= 2UL;
+
+		  if (mask1 == mask1_last) {
+			val_ptr += 1;
+			mask1 = 1;
+		  } else {
+			mask1 <<= 1;
+		  }
 	    }
       }
 

From 8f519531f30fec420bffc1495318fa091ec68519 Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Tue, 4 Dec 2007 19:15:15 -0800
Subject: [PATCH 3/5] Optimize load-add with load/add instruction

Where an expression is an immediate value added to a signal value,
it is possible to optimize them to a single instruction that combines
the load with an add at the same time.
---
 tgt-vvp/eval_expr.c | 64 ++++++++++++++++++++++++++++++++++++++++++---
 vvp/codes.h         |  1 +
 vvp/compile.cc      |  1 +
 vvp/vthread.cc      | 37 +++++++++++++++++++++++++-
 vvp/vvp_net.cc      | 59 +++++++++++++++++++++++++++++++++++++++++
 vvp/vvp_net.h       |  5 ++++
 6 files changed, 163 insertions(+), 4 deletions(-)

diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c
index c9a406e16..e01fbb97d 100644
--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@@ -27,6 +27,8 @@
 
 static void draw_eval_expr_dest(ivl_expr_t exp, struct vector_info dest,
 				int ok_flags);
+static void draw_signal_dest(ivl_expr_t exp, struct vector_info res,
+			     int add_index);
 
 int number_is_unknown(ivl_expr_t ex)
 {
@@ -998,6 +1000,28 @@ static struct vector_info draw_binary_expr_lrs(ivl_expr_t exp, unsigned wid)
       return lv;
 }
 
+static struct vector_info draw_load_add_immediate(ivl_expr_t le,
+						  ivl_expr_t re,
+						  unsigned wid)
+{
+      struct vector_info lv;
+      unsigned long imm;
+	/* Emit "le + re", where le is a signal and re is a constant. */
+      imm = get_number_immediate(re);
+
+	/* Load the immediate value into word register 0 */
+      fprintf(vvp_out, "  %%ix/load 0, %lu;\n", imm);
+
+      lv.base = allocate_vector(wid);
+      lv.wid = wid;
+
+	/* Load the signal value with a %load/vp0, which adds word
+	   register 0 to the value being loaded. */
+      draw_signal_dest(le, lv, 0);
+
+      return lv;
+}
+
 static struct vector_info draw_add_immediate(ivl_expr_t le,
 					     ivl_expr_t re,
 					     unsigned wid)
@@ -1098,6 +1122,26 @@ static struct vector_info draw_binary_expr_arith(ivl_expr_t exp, unsigned wid)
 
       const char*sign_string = ivl_expr_signed(exp)? "/s" : "";
 
+      if ((ivl_expr_opcode(exp) == '+')
+	  && (ivl_expr_type(le) == IVL_EX_SIGNAL)
+	  && (ivl_expr_type(re) == IVL_EX_ULONG))
+	    return draw_load_add_immediate(le, re, wid);
+
+      if ((ivl_expr_opcode(exp) == '+')
+	  && (ivl_expr_type(le) == IVL_EX_SIGNAL)
+	  && (ivl_expr_type(re) == IVL_EX_NUMBER))
+	    return draw_load_add_immediate(le, re, wid);
+
+      if ((ivl_expr_opcode(exp) == '+')
+	  && (ivl_expr_type(re) == IVL_EX_SIGNAL)
+	  && (ivl_expr_type(le) == IVL_EX_ULONG))
+	    return draw_load_add_immediate(re, le, wid);
+
+      if ((ivl_expr_opcode(exp) == '+')
+	  && (ivl_expr_type(re) == IVL_EX_SIGNAL)
+	  && (ivl_expr_type(le) == IVL_EX_NUMBER))
+	    return draw_load_add_immediate(re, le, wid);
+
       if ((ivl_expr_opcode(exp) == '+')
 	  && (ivl_expr_type(re) == IVL_EX_ULONG))
 	    return draw_add_immediate(le, re, wid);
@@ -1663,8 +1707,12 @@ static void pad_expr_in_place(ivl_expr_t exp, struct vector_info res, unsigned s
  * into the thread bits. Remember to account for the part select by
  * offsetting the read from the lsi (least significant index) of the
  * signal.
+ *
+ * If add_index is >= 0, then generate a %load/vp0 to add the
+ * word0 value to the loaded value before storing it into the destination.
  */
-static void draw_signal_dest(ivl_expr_t exp, struct vector_info res)
+static void draw_signal_dest(ivl_expr_t exp, struct vector_info res,
+			     int add_index)
 {
       unsigned swid = ivl_expr_width(exp);
       ivl_signal_t sig = ivl_expr_signal(exp);
@@ -1679,6 +1727,7 @@ static void draw_signal_dest(ivl_expr_t exp, struct vector_info res)
       if (ivl_signal_array_count(sig) > 1) {
 	    ivl_expr_t ix = ivl_expr_oper1(exp);
 	    if (!number_is_immediate(ix, 8*sizeof(unsigned long))) {
+		  assert(add_index < 0);
 		  draw_eval_expr_into_integer(ix, 3);
 		  fprintf(vvp_out, "   %%load/av %u, v%p, %u;\n",
 			  res.base, sig, swid);
@@ -1694,11 +1743,20 @@ static void draw_signal_dest(ivl_expr_t exp, struct vector_info res)
 
       if (ivl_signal_data_type(sig) == IVL_VT_REAL) {
 
+	    assert(add_index < 0);
 	    int tmp = allocate_word();
 	    fprintf(vvp_out, " %%load/wr %d, v%p_%u;\n", tmp, sig, word);
 	    fprintf(vvp_out, " %%cvt/vr %u, %d, %u;\n", res.base, tmp, res.wid);
 	    clr_word(tmp);
 
+      } else if (add_index >= 0) {
+
+	    assert(add_index == 0);
+
+	      /* If this is a REG (a variable) then I can do a vector read. */
+	    fprintf(vvp_out, "    %%load/vp0 %u, v%p_%u, %u;\n",
+		    res.base, sig, word, swid);
+
       } else {
 
 	      /* If this is a REG (a variable) then I can do a vector read. */
@@ -1730,7 +1788,7 @@ static struct vector_info draw_signal_expr(ivl_expr_t exp, unsigned wid,
       res.wid  = wid;
       save_expression_lookaside(res.base, exp, wid);
 
-      draw_signal_dest(exp, res);
+      draw_signal_dest(exp, res, -1);
       return res;
 }
 
@@ -2232,7 +2290,7 @@ static void draw_eval_expr_dest(ivl_expr_t exp, struct vector_info dest,
       switch (ivl_expr_type(exp)) {
 
 	  case IVL_EX_SIGNAL:
-	    draw_signal_dest(exp, dest);
+	    draw_signal_dest(exp, dest, -1);
 	    return;
 
 	  default:
diff --git a/vvp/codes.h b/vvp/codes.h
index c70093494..3e8bd4335 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -95,6 +95,7 @@ extern bool of_LOAD_AVX_P(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_MV(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_NX(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VEC(vthread_t thr, vvp_code_t code);
+extern bool of_LOAD_VP0(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_WR(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_X(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_XP(vthread_t thr, vvp_code_t code);
diff --git a/vvp/compile.cc b/vvp/compile.cc
index 71ba609e0..6fae6ded4 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -141,6 +141,7 @@ const static struct opcode_table_s opcode_table[] = {
       { "%load/mv",of_LOAD_MV,3,  {OA_BIT1,     OA_MEM_PTR,  OA_BIT2} },
       { "%load/nx",of_LOAD_NX,3,  {OA_BIT1,     OA_VPI_PTR,  OA_BIT2} },
       { "%load/v", of_LOAD_VEC,3, {OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
+      { "%load/vp0",of_LOAD_VP0,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
       { "%load/wr",of_LOAD_WR,2,  {OA_BIT1,     OA_VPI_PTR,  OA_BIT2} },
       { "%load/x", of_LOAD_X, 3,  {OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
       { "%load/x.p",of_LOAD_XP, 3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 3fe8e53e6..6bfbbe328 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -2188,13 +2188,14 @@ bool of_LOAD_NX(vthread_t thr, vvp_code_t cp)
  * The functor to read from is the vvp_net_t object pointed to by the
  * cp->net pointer.
  */
-bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
+vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
 {
       assert(cp->bit_idx[0] >= 4);
       assert(cp->bit_idx[1] > 0);
 
       unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
+      int64_t addend = thr->words[0].w_int;
       vvp_net_t*net = cp->net;
 
 	/* For the %load to work, the functor must actually be a
@@ -2209,6 +2210,40 @@ bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
       vvp_vector4_t sig_value = sig->vec4_value();
       sig_value.resize(wid);
 
+      return sig_value;
+}
+
+bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
+{
+      unsigned bit = cp->bit_idx[0];  /* destination base in thr->bits4 */
+      unsigned wid = cp->bit_idx[1];  /* number of bits to load */
+	/* %load/v: read the signal at cp->net, resized to wid bits. */
+      vvp_vector4_t sig_value = load_base(thr, cp);
+
+	/* Check the address once, before we scan the vector. */
+      thr_check_addr(thr, bit+wid-1);
+
+	/* Copy the vector bits into the bits4 vector. Do the copy
+	   directly to skip the excess calls to thr_check_addr. */
+      thr->bits4.set_vec(bit, sig_value);
+
+      return true;
+}
+
+/*
+* This is like of_LOAD_VEC, but includes an add of an integer value.
+*/
+bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
+{
+      unsigned bit = cp->bit_idx[0];
+      unsigned wid = cp->bit_idx[1];
+      int64_t addend = thr->words[0].w_int;
+
+      vvp_vector4_t sig_value = load_base(thr, cp);
+
+	/* Add the addend value */
+      sig_value += addend;
+
 	/* Check the address once, before we scan the vector. */
       thr_check_addr(thr, bit+wid-1);
 
diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index aed907fe7..6be770b1b 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -557,6 +557,31 @@ bool vvp_vector4_t::eeq(const vvp_vector4_t&that) const
       return true;
 }
 
+/* Return true if there is an X or Z anywhere in the vector. */
+bool vvp_vector4_t::has_xz() const
+{
+	/* Up to one word of bits is held in bits_val_; test only size_ bits. */
+      if (size_ < BITS_PER_WORD) {
+	    unsigned long mask = WORD_X_BITS >> 2*(BITS_PER_WORD - size_);
+	    return 0 != (bits_val_&mask);
+      }
+      if (size_ == BITS_PER_WORD)
+	    return 0 != (bits_val_&WORD_X_BITS);
+
+	/* Longer vectors are held in bits_ptr_[]; test the full words first. */
+      unsigned words = size_ / BITS_PER_WORD;
+      for (unsigned idx = 0 ; idx < words ; idx += 1) {
+	    if (bits_ptr_[idx] & WORD_X_BITS)
+		  return true;
+      }
+	/* Then the partial last word. Note that != binds tighter than &. */
+      unsigned long mask = size_%BITS_PER_WORD;
+      if (mask > 0) {
+	    mask = WORD_X_BITS >> 2*(BITS_PER_WORD - mask);
+	    return 0 != (bits_ptr_[words]&mask);
+      }
+      return false;
+}
 
 void vvp_vector4_t::change_z2x()
 {
@@ -602,6 +627,40 @@ char* vvp_vector4_t::as_string(char*buf, size_t buf_len)
       return res;
 }
 
+/*
+* Add an integer to the vvp_vector4_t in place, bit by bit so that
+* there are no size limitations. Any X/Z bit makes the result all X.
+*/
+vvp_vector4_t& vvp_vector4_t::operator += (int64_t that)
+{
+      vvp_bit4_t carry = BIT4_0;
+      unsigned idx;
+	/* If any bit is X or Z, the whole result becomes X. */
+      if (has_xz()) {
+	    vvp_vector4_t xxx (size(), BIT4_X);
+	    *this = xxx;
+	    return *this;
+      }
+	/* Work up from bit 0, consuming one addend bit per position. */
+      for (idx = 0 ; idx < size() ; idx += 1) {
+	    if (that == 0 && carry==BIT4_0)
+		  break;  /* no addend bits and no carry left */
+
+	    vvp_bit4_t that_bit = (that&1)? BIT4_1 : BIT4_0;
+	    that >>= 1;
+	      /* Nothing to add at this position; leave the bit as is. */
+	    if (that_bit==BIT4_0 && carry==BIT4_0)
+		  continue;
+	      /* presumably add_with_carry also updates carry -- confirm */
+	    vvp_bit4_t bit = value(idx);
+	    bit = add_with_carry(bit, that_bit, carry);
+
+	    set_bit(idx, bit);
+      }
+
+      return *this;
+}
+
 ostream& operator<< (ostream&out, const vvp_vector4_t&that)
 {
       out << that.size() << "'b";
diff --git a/vvp/vvp_net.h b/vvp/vvp_net.h
index 50c272992..6bc809c5b 100644
--- a/vvp/vvp_net.h
+++ b/vvp/vvp_net.h
@@ -121,12 +121,17 @@ class vvp_vector4_t {
 	// Test that the vectors are exactly equal
       bool eeq(const vvp_vector4_t&that) const;
 
+	// Return true if there is an X or Z anywhere in the vector.
+      bool has_xz() const;
+
 	// Change all Z bits to X bits.
       void change_z2x();
 
 	// Display the value into the buf as a string.
       char*as_string(char*buf, size_t buf_len);
 
+      vvp_vector4_t& operator += (int64_t);
+
     private:
 	// Number of vvp_bit4_t bits that can be shoved into a word.
       enum { BITS_PER_WORD = 8*sizeof(unsigned long)/2 };

From 3b90a827e5a2a0d99b848b79e4770e0fe44af59c Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Tue, 4 Dec 2007 22:14:26 -0800
Subject: [PATCH 4/5] Fix simple compile warnings.

---
 vvp/vthread.cc | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 6bfbbe328..abfd580b7 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -170,8 +170,6 @@ void vthread_put_real(struct vthread_s*thr, unsigned addr, double val)
 static unsigned long* vector_to_array(struct vthread_s*thr,
 				      unsigned addr, unsigned wid)
 {
-
-
       if (addr == 0) {
 	    unsigned awid = (wid + CPU_WORD_BITS - 1) / (CPU_WORD_BITS);
 	    unsigned long*val = new unsigned long[awid];
@@ -431,7 +429,6 @@ bool of_ADD(vthread_t thr, vvp_code_t cp)
       if (lva == 0 || lvb == 0)
 	    goto x_out;
 
-
       unsigned long carry;
       carry = 0;
       for (unsigned idx = 0 ;  (idx*CPU_WORD_BITS) < cp->number ;  idx += 1) {
@@ -2193,9 +2190,7 @@ vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
       assert(cp->bit_idx[0] >= 4);
       assert(cp->bit_idx[1] > 0);
 
-      unsigned bit = cp->bit_idx[0];
       unsigned wid = cp->bit_idx[1];
-      int64_t addend = thr->words[0].w_int;
       vvp_net_t*net = cp->net;
 
 	/* For the %load to work, the functor must actually be a

From 19e8c057883c4e6f488983e9ad675e44ed1ea25e Mon Sep 17 00:00:00 2001
From: Stephen Williams <steve@icarus.com>
Date: Tue, 4 Dec 2007 22:16:31 -0800
Subject: [PATCH 5/5] Optimize X check in vector subarray

Optimize check for X bits while doing vector4 subarray. In particular,
do X checks a word at a time so that individual bits need not be tested.
---
 vvp/vvp_net.cc | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/vvp/vvp_net.cc b/vvp/vvp_net.cc
index 6be770b1b..cc66e6425 100644
--- a/vvp/vvp_net.cc
+++ b/vvp/vvp_net.cc
@@ -353,22 +353,42 @@ unsigned long* vvp_vector4_t::subarray(unsigned adr, unsigned wid) const
 	      /* Get the first word we are scanning. We may in fact be
 		 somewhere in the middle of that word. */
 	    unsigned long tmp = bits_ptr_[adr/BITS_PER_WORD];
-	    tmp >>= 2UL * (adr%BITS_PER_WORD);
+	    unsigned long off = adr%BITS_PER_WORD;
+	    tmp >>= 2UL * off;
 
+	      // Test for X bits but not beyond the desired wid.
+	    unsigned long xmask = WORD_X_BITS;
+	    if (wid < (BITS_PER_WORD-off))
+		  xmask &= ~(-1UL << 2*wid);
+	    if (tmp & xmask)
+		  goto x_out;
+
+	      // Where in the target array to write the next bit.
 	    unsigned long mask1 = 1;
 	    const unsigned long mask1_last = 1UL << (BIT2_PER_WORD-1);
 	    unsigned long*val_ptr = val;
+	      // Track where the source bit is in the source word.
+	    unsigned adr_bit = adr%BITS_PER_WORD;
+	      // Scan...
 	    for (unsigned idx = 0 ;  idx < wid ;  idx += 1) {
 		    /* Starting a new word? */
-		  if (adr%BITS_PER_WORD == 0)
+		  if (adr_bit == BITS_PER_WORD) {
 			tmp = bits_ptr_[adr/BITS_PER_WORD];
+			  // If this is the last word, then only test
+			  // for X in the valid bits.
+			xmask = WORD_X_BITS;
+			if ((wid-idx) < BITS_PER_WORD)
+			      xmask &= ~(WORD_Z_BITS<<2*(wid-idx));
+			if (tmp & xmask)
+			      goto x_out;
+			adr_bit = 0;
+		  }
 
-		  if (tmp&2)
-			goto x_out;
 		  if (tmp&1)
 			*val_ptr |= mask1;
 
 		  adr += 1;
+		  adr_bit += 1;
 		  tmp >>= 2UL;
 
 		  if (mask1 == mask1_last) {