From b08120e2233cafe64b1f9f4306c7da0bfed3c6c7 Mon Sep 17 00:00:00 2001
From: Martin Whitaker <icarus@martin-whitaker.me.uk>
Date: Tue, 1 Feb 2011 22:44:01 +0000
Subject: [PATCH] Patch to improve sign extension efficiency in vvp.

Currently the vvp target emits multiple single-bit %mov instructions
to perform sign extension. This patch adds a new %pad instruction
that allows sign extension to be performed with just one instruction.
Padding with a constant bit (0, 1, x or z) now uses %pad as well.
---
 tgt-vvp/eval_expr.c | 76 +++++++++++++++++++--------------------------
 vvp/codes.h         |  1 +
 vvp/compile.cc      |  1 +
 vvp/opcodes.txt     |  8 +++++
 vvp/vthread.cc      | 18 +++++++++++
 5 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/tgt-vvp/eval_expr.c b/tgt-vvp/eval_expr.c
index 90709ec6c..9a20299b4 100644
--- a/tgt-vvp/eval_expr.c
+++ b/tgt-vvp/eval_expr.c
@@ -395,7 +395,7 @@ static struct vector_info draw_eq_immediate(ivl_expr_t expr, unsigned ewid,
 	    lv.base = base;
 	    lv.wid = ewid;
 	    if (ewid > 1)
-		  fprintf(vvp_out, "    %%mov %u, 0, %u;\n", base+1, ewid-1);
+		  fprintf(vvp_out, "    %%pad %u, 0, %u;\n", base+1, ewid-1);
 
       } else if (lv.wid < ewid) {
 	    unsigned base = allocate_vector(ewid);
@@ -411,7 +411,7 @@ static struct vector_info draw_eq_immediate(ivl_expr_t expr, unsigned ewid,
 		  clr_vector(lv);
 	    fprintf(vvp_out, "    %%mov %u, %u, %u;\n", base,
 		    lv.base, lv.wid);
-	    fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
+	    fprintf(vvp_out, "    %%pad %u, 0, %u;\n",
 		    base+lv.wid, ewid-lv.wid);
 	    lv.base = base;
 	    lv.wid = ewid;
@@ -576,7 +576,7 @@ static struct vector_info draw_binary_expr_eq(ivl_expr_t expr,
 	lv.base = base;
 	lv.wid = ewid;
 	if (ewid > 1)
-	      fprintf(vvp_out, "    %%mov %u, 0, %u;\n", base+1, ewid-1);
+	      fprintf(vvp_out, "    %%pad %u, 0, %u;\n", base+1, ewid-1);
       }
 
       return lv;
@@ -659,7 +659,7 @@ static struct vector_info draw_binary_expr_land(ivl_expr_t expr, unsigned wid)
 	clr_vector(lv);
 	lv.base = base;
 	lv.wid = wid;
-	fprintf(vvp_out, "    %%mov %u, 0, %u;\n", base+1, wid-1);
+	fprintf(vvp_out, "    %%pad %u, 0, %u;\n", base+1, wid-1);
       }
 
       return lv;
@@ -770,7 +770,7 @@ static struct vector_info draw_binary_expr_lor(ivl_expr_t expr, unsigned wid,
 	if (lv.base >= 8) clr_vector(lv);
 	lv.base = base;
 	lv.wid = wid;
-	fprintf(vvp_out, "    %%mov %u, 0, %u;\n", base+1, wid-1);
+	fprintf(vvp_out, "    %%pad %u, 0, %u;\n", base+1, wid-1);
       }
 
       return lv;
@@ -883,7 +883,7 @@ static struct vector_info draw_binary_expr_le_bool(ivl_expr_t expr,
 	tmp.base = base;
 	tmp.wid = wid;
 	if (wid > 1)
-	      fprintf(vvp_out, "    %%mov %u, 0, %u;\n", base+1, wid-1);
+	      fprintf(vvp_out, "    %%pad %u, 0, %u;\n", base+1, wid-1);
       }
 
       return tmp;
@@ -1018,7 +1018,7 @@ static struct vector_info draw_binary_expr_le(ivl_expr_t expr,
 	lv.base = base;
 	lv.wid = wid;
 	if (wid > 1)
-	      fprintf(vvp_out, "    %%mov %u, 0, %u;\n", base+1, wid-1);
+	      fprintf(vvp_out, "    %%pad %u, 0, %u;\n", base+1, wid-1);
       }
 
       return lv;
@@ -1762,16 +1762,15 @@ static struct vector_info draw_concat_expr(ivl_expr_t expr, unsigned wid,
 
 	      /* Pad the expression when needed. */
 	    if (wid > concat_wid) {
+                  unsigned base = res.base+concat_wid;
+                  unsigned count = wid-concat_wid;
 		    /* We can get a signed concatenation with $signed({...}). */
 		  if (ivl_expr_signed(expr)) {
-			unsigned base = res.base+concat_wid-1;
-			for (idx = 1; idx <= wid-concat_wid; idx += 1) {
-			      fprintf(vvp_out, "    %%mov %u, %u, 1;\n",
-			                       base+idx, base);
-			}
+			fprintf(vvp_out, "    %%pad %u, %u, %u;\n",
+			                 base, base-1, count);
 		  } else {
-			fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
-			                 res.base+concat_wid, wid-concat_wid);
+			fprintf(vvp_out, "    %%pad %u, 0, %u;\n",
+			                 base, count);
 		  }
 	    }
       } else {
@@ -1918,19 +1917,19 @@ static struct vector_info draw_number_expr(ivl_expr_t expr, unsigned wid)
 	/* Pad the number up to the expression width. */
       if (idx < wid) {
 	    if (ivl_expr_signed(expr) && bits[nwid-1] == '1')
-		  fprintf(vvp_out, "    %%mov %u, 1, %u;\n",
+		  fprintf(vvp_out, "    %%pad %u, 1, %u;\n",
 			  res.base+idx, wid-idx);
 
 	    else if (bits[nwid-1] == 'x')
-		  fprintf(vvp_out, "    %%mov %u, 2, %u;\n",
+		  fprintf(vvp_out, "    %%pad %u, 2, %u;\n",
 			  res.base+idx, wid-idx);
 
 	    else if (bits[nwid-1] == 'z')
-		  fprintf(vvp_out, "    %%mov %u, 3, %u;\n",
+		  fprintf(vvp_out, "    %%pad %u, 3, %u;\n",
 			  res.base+idx, wid-idx);
 
 	    else
-		  fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
+		  fprintf(vvp_out, "    %%pad %u, 0, %u;\n",
 			  res.base+idx, wid-idx);
       }
 
@@ -1948,14 +1947,14 @@ static struct vector_info draw_number_expr(ivl_expr_t expr, unsigned wid)
  */
 static void pad_in_place(struct vector_info dest, unsigned sub_width, int signed_flag)
 {
+      unsigned base = dest.base+sub_width;   /* first bit to be padded */
+      unsigned count = dest.wid-sub_width;   /* number of pad bits */
       if (signed_flag) {
-	    unsigned idx;
-	    for (idx = sub_width ;  idx < dest.wid ;  idx += 1)
-			fprintf(vvp_out, "    %%mov %u, %u, 1;\n",
-				dest.base+idx, dest.base+sub_width-1);
+	    fprintf(vvp_out, "    %%pad %u, %u, %u;\n",
+		    base, base-1, count);   /* replicate the bit just below base */
       } else {
-	    fprintf(vvp_out, "    %%mov %u, 0, %u;\n",
-		    dest.base+sub_width, dest.wid - sub_width);
+	    fprintf(vvp_out, "    %%pad %u, 0, %u;\n",
+		    base, count);   /* fill with the constant 0 bit */
       }
 }
 
@@ -2152,7 +2151,7 @@ static struct vector_info draw_string_expr(ivl_expr_t expr, unsigned wid)
 
 	/* Pad the number up to the expression width. */
       if (idx < wid)
-	    fprintf(vvp_out, "    %%mov %u, 0, %u;\n", res.base+idx, wid-idx);
+	    fprintf(vvp_out, "    %%pad %u, 0, %u;\n", res.base+idx, wid-idx);
 
       if (res.base >= 8)
 	    save_expression_lookaside(res.base, expr, wid);
@@ -2173,21 +2172,18 @@ void pad_expr_in_place(ivl_expr_t expr, struct vector_info res, unsigned swid)
       if (res.wid <= swid)
 	    return;
 
+      unsigned base = res.base+swid;
+      unsigned count = res.wid-swid;
       if (ivl_expr_signed(expr)) {
-	    unsigned idx;
-	    for (idx = swid ;  idx < res.wid ;  idx += 1)
-		  fprintf(vvp_out, "    %%mov %u, %u, 1;\n",
-			  res.base+idx, res.base+swid-1);
-
+            fprintf(vvp_out, "    %%pad %u, %u, %u;\n",
+                    base, base-1, count);
       } else {
-	    unsigned base = res.base+swid;
-	    unsigned count = res.wid-swid;
 	      /* The %movi is faster for larger widths, but for very
-		 small counts, the %mov is faster. */
+		 small counts, the %pad is faster. */
 	    if (count > 4)
 		  fprintf(vvp_out, "    %%movi %u, 0, %u;\n", base, count);
 	    else
-		  fprintf(vvp_out, "    %%mov %u, 0, %u;\n", base, count);
+		  fprintf(vvp_out, "    %%pad %u, 0, %u;\n", base, count);
       }
 }
 
@@ -2609,16 +2605,8 @@ static struct vector_info draw_select_unsized_literal(ivl_expr_t expr,
 	    assert(res.base);
 	    fprintf(vvp_out, "    %%mov %u, %u, %u; Pad sub-expression to match width\n",
 		    res.base, subv.base, subv.wid);
-	    if (ivl_expr_signed(sube)) {
-		  unsigned idx;
-		  for (idx = subv.wid ; idx < res.wid ; idx += 1) {
-			fprintf(vvp_out, "    %%mov %u, %u, 1;\n",
-				res.base+idx, subv.base+subv.wid-1);
-		  }
-	    } else {
-		  fprintf(vvp_out, "    %%mov %u, 0, %u\n",
-			  res.base+subv.wid, wid-subv.wid);
-	    }
+
+            pad_in_place(res, subv.wid, ivl_expr_signed(sube));
 
 	    subv = res;
       }
diff --git a/vvp/codes.h b/vvp/codes.h
index 2d4a86466..32dc25e66 100644
--- a/vvp/codes.h
+++ b/vvp/codes.h
@@ -141,6 +141,7 @@ extern bool of_NOR(vthread_t thr, vvp_code_t code);
 extern bool of_NORR(vthread_t thr, vvp_code_t code);
 extern bool of_OR(vthread_t thr, vvp_code_t code);
 extern bool of_ORR(vthread_t thr, vvp_code_t code);
+extern bool of_PAD(vthread_t thr, vvp_code_t code);
 extern bool of_POW(vthread_t thr, vvp_code_t code);
 extern bool of_POW_S(vthread_t thr, vvp_code_t code);
 extern bool of_POW_WR(vthread_t thr, vvp_code_t code);
diff --git a/vvp/compile.cc b/vvp/compile.cc
index c913deb33..e7641f47b 100644
--- a/vvp/compile.cc
+++ b/vvp/compile.cc
@@ -184,6 +184,7 @@ static const struct opcode_table_s opcode_table[] = {
       { "%nor/r",  of_NORR,   3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%or",     of_OR,     3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%or/r",   of_ORR,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
+      { "%pad",    of_PAD,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%pow",    of_POW,    3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%pow/s",  of_POW_S,  3,  {OA_BIT1,     OA_BIT2,     OA_NUMBER} },
       { "%pow/wr", of_POW_WR, 2,  {OA_BIT1,     OA_BIT2,     OA_NONE} },
diff --git a/vvp/opcodes.txt b/vvp/opcodes.txt
index 6a55fbd7f..fc3b62106 100644
--- a/vvp/opcodes.txt
+++ b/vvp/opcodes.txt
@@ -706,6 +706,14 @@ and the <dst> is a writable scalar. The <dst> gets the value of the
 or of all the bits of the src vector.
 
 
+* %pad <dst>, <src>, <wid>
+
+This instruction replicates the single bit <src> into the <wid> bits
+of register space starting at <dst>. The <src> may be a constant bit
+(0-3). The <dst> may not be 0-3, but may overlap the source bit. This
+is useful for zero or sign extending a vector.
+
+
 * %pow <bit-l>, <bit-r>, <wid>
 * %pow/s <bit-l>, <bit-r>, <wid>
 
diff --git a/vvp/vthread.cc b/vvp/vthread.cc
index af75ced8a..40d9fb854 100644
--- a/vvp/vthread.cc
+++ b/vvp/vthread.cc
@@ -3558,6 +3558,24 @@ bool of_MOV(vthread_t thr, vvp_code_t cp)
       return true;
 }
 
+/* %pad <dst>, <src>, <wid> -- replicate one bit across <wid> bits at <dst>. */
+bool of_PAD(vthread_t thr, vvp_code_t cp)
+{
+      assert(cp->bit_idx[0] >= 4);
+
+        /* Source indices 0-3 name constant bits rather than thread bits. */
+      vvp_bit4_t pad_bit;
+      if (cp->bit_idx[1] < 4)
+            pad_bit = thr_index_to_bit4[cp->bit_idx[1]];
+      else
+            pad_bit = thr->bits4.value(cp->bit_idx[1]);
+
+      thr_check_addr(thr, cp->bit_idx[0]+cp->number-1);
+      vvp_vector4_t tmp (cp->number, pad_bit);
+      thr->bits4.set_vec(cp->bit_idx[0], tmp);
+      return true;
+}
+
 /*
 *  %mov/wr <dst>, <src>
 */