diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c
index 9efebb366..37f6af26c 100644
--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@@ -28,7 +28,7 @@
 static void draw_eval_expr_dest(ivl_expr_t exp, struct vector_info dest,
 				int ok_flags);
 static void draw_signal_dest(ivl_expr_t exp, struct vector_info res,
-			     int add_index, unsigned long immediate);
+			     int add_index, long immediate);
 
 int number_is_unknown(ivl_expr_t ex)
 {
@@ -1158,12 +1158,11 @@ static struct vector_info draw_binary_expr_lrs(ivl_expr_t exp, unsigned wid)
 
 static struct vector_info draw_load_add_immediate(ivl_expr_t le,
 						  ivl_expr_t re,
-						  unsigned wid)
+						  unsigned wid,
+						  int signed_flag)
 {
       struct vector_info lv;
-      unsigned long imm;
-
-      imm = get_number_immediate(re);
+      long imm = get_number_immediate(re);
       lv.base = allocate_vector(wid);
       lv.wid = wid;
       if (lv.base == 0) {
@@ -1176,7 +1175,7 @@ static struct vector_info draw_load_add_immediate(ivl_expr_t le,
 
 	/* Load the signal value with a %load that adds the index
 	   register to the value being loaded. */
-      draw_signal_dest(le, lv, 0, imm);
+      draw_signal_dest(le, lv, signed_flag, imm);
 
       return lv;
 }
@@ -1319,25 +1318,27 @@ static struct vector_info draw_binary_expr_arith(ivl_expr_t exp, unsigned wid)
 
       const char*sign_string = ivl_expr_signed(le) && ivl_expr_signed(re)? "/s" : "";
 
+      int signed_flag = ivl_expr_signed(exp)? 1 : 0;
+
       if ((ivl_expr_opcode(exp) == '+')
 	  && (ivl_expr_type(le) == IVL_EX_SIGNAL)
 	  && (ivl_expr_type(re) == IVL_EX_ULONG))
-	    return draw_load_add_immediate(le, re, wid);
+	    return draw_load_add_immediate(le, re, wid, signed_flag);
 
       if ((ivl_expr_opcode(exp) == '+')
 	  && (ivl_expr_type(le) == IVL_EX_SIGNAL)
 	  && (ivl_expr_type(re) == IVL_EX_NUMBER))
-	    return draw_load_add_immediate(le, re, wid);
+	    return draw_load_add_immediate(le, re, wid, signed_flag);
 
       if ((ivl_expr_opcode(exp) == '+')
 	  && (ivl_expr_type(re) == IVL_EX_SIGNAL)
 	  && (ivl_expr_type(le) == IVL_EX_ULONG))
-	    return draw_load_add_immediate(re, le, wid);
+	    return draw_load_add_immediate(re, le, wid, signed_flag);
 
       if ((ivl_expr_opcode(exp) == '+')
 	  && (ivl_expr_type(re) == IVL_EX_SIGNAL)
 	  && (ivl_expr_type(le) == IVL_EX_NUMBER))
-	    return draw_load_add_immediate(re, le, wid);
+	    return draw_load_add_immediate(re, le, wid, signed_flag);
 
       if ((ivl_expr_opcode(exp) == '+')
 	  && (ivl_expr_type(re) == IVL_EX_ULONG))
@@ -1963,11 +1964,13 @@ void pad_expr_in_place(ivl_expr_t exp, struct vector_info res, unsigned swid)
  * offsetting the read from the lsi (least significant index) of the
  * signal.
  *
- * If the add_index is >=0, then generate a %load/vp0 to add the
- * word0 value to the loaded value before storing it into the destination.
+ * If the add_index is 0, then generate a %load/vp0 to add the
+ * word0 value to the loaded value before storing it into the
+ * destination. If the add_index is 1, then generate a %load/vp0/s to
+ * do a signed load.
  */
 static void draw_signal_dest(ivl_expr_t exp, struct vector_info res,
-			     int add_index, unsigned long immediate)
+			     int add_index, long immediate)
 {
       unsigned swid = ivl_expr_width(exp);
       ivl_signal_t sig = ivl_expr_signal(exp);
@@ -2009,13 +2012,17 @@ static void draw_signal_dest(ivl_expr_t exp, struct vector_info res,
 
       } else if (add_index >= 0) {
 
-	    assert(add_index == 0);
+	    const char*sign_flag = add_index==1? "/s" : "";
 
 	      /* If this is a REG (a variable) then I can do a vector read. */
-	    fprintf(vvp_out, "    %%ix/load 0, %lu;\n", immediate);
-	    fprintf(vvp_out, "    %%ix/load 2, %u;\n", res.wid);
-	    fprintf(vvp_out, "    %%load/vp0 %u, v%p_%u, %u;\n",
-		    res.base, sig, word, swid);
+	    if (immediate >= 0) {
+		  fprintf(vvp_out, "    %%ix/load 0, %lu;\n", immediate);
+	    } else {
+		  fprintf(vvp_out, "   %%ix/load 0, 0; immediate=%ld\n", immediate);
+		  fprintf(vvp_out, "   %%ix/sub 0, %ld;\n", -immediate);
+	    }
+	    fprintf(vvp_out, "    %%load/vp0%s %u, v%p_%u, %u;\n", sign_flag,
+		    res.base, sig,word, res.wid);
 	    swid = res.wid;
 
       } else {
diff --git a/vvp/codes.h b/vvp/codes.h
index 37341e6a1..58e1ee4be 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -100,6 +100,7 @@ extern bool of_LOAD_AVP0(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_AVX_P(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VEC(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_VP0(vthread_t thr, vvp_code_t code);
+extern bool of_LOAD_VP0_S(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_WR(vthread_t thr, vvp_code_t code);
 extern bool of_LOAD_X1P(vthread_t thr, vvp_code_t code);
 extern bool of_LOADI_WR(vthread_t thr, vvp_code_t code);
diff --git a/vvp/compile.cc b/vvp/compile.cc
index 0dce76fe5..1b426498d 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -143,6 +143,7 @@ const static struct opcode_table_s opcode_table[] = {
       { "%load/avx.p",of_LOAD_AVX_P,3,{OA_BIT1, OA_ARR_PTR,  OA_BIT2} },
       { "%load/v", of_LOAD_VEC,3, {OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
       { "%load/vp0",of_LOAD_VP0,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
+      { "%load/vp0/s",of_LOAD_VP0_S,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
       { "%load/wr",of_LOAD_WR,2,  {OA_BIT1,     OA_VPI_PTR,  OA_BIT2} },
       { "%load/x1p",of_LOAD_X1P,3,{OA_BIT1,     OA_FUNC_PTR, OA_BIT2} },
       { "%loadi/wr",of_LOADI_WR,3,{OA_BIT1,     OA_NUMBER,   OA_BIT2} },
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index f72fe2917..97fc6ec50 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -437,18 +437,21 @@ the specified thread register bit. The functor-label can refer to a
 from the least significant up to <wid> bits, is loaded starting at
 thread bit <bit>. It is an OK for the width to not match the vector
 width at the functor. If the <wid> is less than the width at the
-functor, then the most significant bits are dropped.
+functor, then the most significant bits are dropped. If the <wid> is
+more than the width at the functor, the value is padded with X bits.
 
 * %load/vp0 <bit>, <functor-label>, <wid>
+* %load/vp0/s <bit>, <functor-label>, <wid>
 
-This instruction is the same as %load/v above, except that it also
-adds the integer value is index register 0 into the loaded value. The
-addition is a Verilog-style add, which means that if any of the input
-bits are X or Z, the entire result is turned into a vector of X bits.
+This instruction is similar to %load/v above, except that it also
+adds the signed integer value in index register 0 into the loaded
+value. The addition is a Verilog-style add, which means that if any of
+the input bits are X or Z, the entire result is turned into a vector
+of X bits.
 
-Index register 2 contains the result width. The addition of the loaded
-value and the index are done at this width to avoid the problem of a
-small vector with a large immediate offset indexing an array.
+The <wid> is, like the %load/v, the result width. But unlike the
+%load/v, the vector is padded with 0s (%load/vp0) or sign extended
+(%load/vp0/s) to the desired width.
 
 * %load/wr <bit>, <vpi-label>
 
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index fa52151b4..839471ada 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -2467,9 +2467,6 @@ bool of_LOAD_AVX_P(vthread_t thr, vvp_code_t cp)
  */
 static vvp_vector4_t load_base(vthread_t thr, vvp_code_t cp)
 {
-      assert(cp->bit_idx[0] >= 4);
-      assert(cp->bit_idx[1] > 0);
-
       vvp_net_t*net = cp->net;
 
 	/* For the %load to work, the functor must actually be a
@@ -2501,6 +2498,8 @@ bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
 	   directly to skip the excess calls to thr_check_addr. */
       thr->bits4.set_vec(bit, sig_value);
 
+	/* If the source is shorter than the desired width, then pad
+	   with BIT4_X values. */
       for (unsigned idx = sig_value.size() ; idx < wid ; idx += 1)
 	    thr->bits4.set_bit(bit+idx, BIT4_X);
 
@@ -2511,16 +2510,12 @@ bool of_LOAD_VEC(vthread_t thr, vvp_code_t cp)
  * This is like of_LOAD_VEC, but includes an add of an integer value from
  * index 0. The <wid> is the expected result width not the vector width.
  */
-bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
+
+static void load_vp0_common(vthread_t thr, vvp_code_t cp, const vvp_vector4_t&sig_value)
 {
       unsigned bit = cp->bit_idx[0];
+      unsigned wid = cp->bit_idx[1];
       int64_t addend = thr->words[0].w_int;
-      unsigned wid = thr->words[2].w_int;
-
-        /* We need a vector this wide to make the math work correctly.
-         * Copy the base bits into the vector, but keep the width. */
-      vvp_vector4_t sig_value(wid, BIT4_0);
-      sig_value.copy_bits(load_base(thr, cp));
 
 	/* Check the address once, before we scan the vector. */
       thr_check_addr(thr, bit+wid-1);
@@ -2529,7 +2524,7 @@ bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
       if (val == 0) {
 	    vvp_vector4_t tmp(wid, BIT4_X);
 	    thr->bits4.set_vec(bit, tmp);
-	    return true;
+	    return;
       }
 
       unsigned words = (wid + CPU_WORD_BITS - 1) / CPU_WORD_BITS;
@@ -2551,7 +2546,33 @@ bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
 	   directly to skip the excess calls to thr_check_addr. */
       thr->bits4.setarray(bit, wid, val);
       delete[]val;
+}
 
+bool of_LOAD_VP0(vthread_t thr, vvp_code_t cp)
+{
+      unsigned wid = cp->bit_idx[1];
+
+        /* The result is <wid> bits wide, so start with a <wid>-bit
+         * vector of BIT4_0 and copy the loaded bits into it. */
+      vvp_vector4_t sig_value(wid, BIT4_0);
+      sig_value.copy_bits(load_base(thr, cp));
+
+      load_vp0_common(thr, cp, sig_value);
+      return true;
+}
+
+bool of_LOAD_VP0_S(vthread_t thr, vvp_code_t cp)
+{
+      unsigned wid = cp->bit_idx[1];
+
+      vvp_vector4_t tmp (load_base(thr, cp));
+
+        /* Pre-fill a <wid>-bit vector with the top bit of the loaded
+         * value, then copy the loaded bits in, to get sign extension. */
+      vvp_vector4_t sig_value(wid, tmp.value(tmp.size()-1));
+      sig_value.copy_bits(tmp);
+
+      load_vp0_common(thr, cp, sig_value);
       return true;
 }