diff --git a/tgt-vvp/eval_vec4.c b/tgt-vvp/eval_vec4.c
index 48d1377ba..bf7eb6d30 100644
--- a/tgt-vvp/eval_vec4.c
+++ b/tgt-vvp/eval_vec4.c
@@ -138,6 +138,9 @@ static void draw_binary_vec4_arith(ivl_expr_t expr)
 		case '+':
 		  draw_immediate_vec4(re, "%addi");
 		  return;
+		case '*':
+		  draw_immediate_vec4(re, "%muli");
+		  return;
 		default:
 		  break;
 	    }
diff --git a/vvp/codes.h b/vvp/codes.h
index b1a68eb88..b8f5eb295 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -151,6 +151,7 @@ extern bool of_MOD_S(vthread_t thr, vvp_code_t code);
 extern bool of_MOD_WR(vthread_t thr, vvp_code_t code);
 extern bool of_MOV_WU(vthread_t thr, vvp_code_t code);
 extern bool of_MUL(vthread_t thr, vvp_code_t code);
+extern bool of_MULI(vthread_t thr, vvp_code_t code);
 extern bool of_MUL_WR(vthread_t thr, vvp_code_t code);
 extern bool of_NAND(vthread_t thr, vvp_code_t code);
 extern bool of_NANDR(vthread_t thr, vvp_code_t code);
diff --git a/vvp/compile.cc b/vvp/compile.cc
index 830565362..95a11a691 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -200,6 +200,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%mov/wu", of_MOV_WU, 2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
       { "%mul",    of_MUL,    0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%mul/wr", of_MUL_WR, 0,  {OA_NONE,     OA_NONE,     OA_NONE} },
+      { "%muli",   of_MULI,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%nand",   of_NAND,   0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%nand/r", of_NANDR,  0,  {OA_NONE,     OA_NONE,     OA_NONE} },
       { "%new/cobj",  of_NEW_COBJ,  1, {OA_VPI_PTR,OA_NONE,  OA_NONE} },
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index 08c315643..89353899c 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -754,6 +754,7 @@ This opcode is the real-valued modulus of the two real values.
 * %mov/wu <dst>, <src>
 
 * %mul
+* %muli <vala>, <valb>, <wid>
 
 This instruction multiplies the left vector by the right vector, the
 vectors pare popped from the vec4 stack and have the same width. If
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index 5d408c874..bb7d6116b 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -3676,7 +3676,7 @@ static void do_verylong_mod(vvp_vector4_t&vala, const vvp_vector4_t&valb,
 
       vvp_vector4_t tmp (len, BIT4_X);
       carry = out_is_neg? 1 : 0;
-      for (unsigned idx = 0 ;  idx < len ;  idx += 1) {
+      for (int idx = 0 ;  idx < len ;  idx += 1) {
 	    unsigned ob = z[idx];
 	    if (out_is_neg) {
 		  ob = (1-ob) + carry;
@@ -4004,13 +4004,8 @@ bool of_MOV_WU(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
-/*
- * %mul
- */
-bool of_MUL(vthread_t thr, vvp_code_t)
+static bool do_MUL(vvp_vector4_t&vala, const vvp_vector4_t&valb)
 {
-      vvp_vector4_t valb = thr->pop_vec4();
-      vvp_vector4_t&vala = thr->peek_vec4();
       assert(vala.size() == valb.size());
       unsigned wid = vala.size();
 
@@ -4065,6 +4060,41 @@ bool of_MUL(vthread_t thr, vvp_code_t)
       return true;
 }
 
+/*
+ * %mul
+ */
+bool of_MUL(vthread_t thr, vvp_code_t)
+{
+      vvp_vector4_t r = thr->pop_vec4();
+	// Rather than pop l, use it directly from the stack. When do_MUL
+	// assigns to 'l', that will edit the top of the stack, which
+	// replaces a pop and a push.
+      vvp_vector4_t&l = thr->peek_vec4();
+
+      return do_MUL(l, r);
+}
+
+/*
+ * %muli <vala>, <valb>, <wid>
+ *
+ * Use the top of the vec4 stack in place (peek, no pop) as the left
+ * operand; the right operand is a <wid>-bit immediate. Calls do_MUL.
+ */
+bool of_MULI(vthread_t thr, vvp_code_t cp)
+{
+      unsigned wid = cp->number;
+
+      vvp_vector4_t&l = thr->peek_vec4();
+
+	// I expect that most of the bits of an immediate value are
+	// going to be zero, so start the immediate operand with all zero
+	// bits. Then we only need to replace the bits that are different.
+      vvp_vector4_t r (wid, BIT4_0);
+      get_immediate_rval (cp, r);
+
+      return do_MUL(l, r);
+}
+
 bool of_MUL_WR(vthread_t thr, vvp_code_t)
 {
       double r = thr->pop_real();