From 47b5157f01e22f83ebee137334e41d0b64daf4f4 Mon Sep 17 00:00:00 2001
From: Wilson Snyder <wsnyder@wsnyder.org>
Date: Mon, 26 Oct 2009 20:12:09 -0400
Subject: [PATCH] Support division and modulus of > 64 bit vectors.

---
 Changes                       |   4 +-
 bin/verilator                 |   6 --
 include/verilated.cpp         | 111 ++++++++++++++++++++++++++
 include/verilated.h           |  83 ++++++++++++++++---
 src/V3AstNodes.h              |   8 +-
 src/V3Number.cpp              | 144 ++++++++++++++++++++++++++++++---
 src/V3Number.h                |   2 +
 test_regress/t/t_math_divw.pl |  18 +++++
 test_regress/t/t_math_divw.v  | 145 ++++++++++++++++++++++++++++++++++
 test_regress/t/t_math_vgen.v  |   9 +++
 test_verilated/Makefile_obj   |   4 +
 test_verilated/vgen.pl        |  86 +++++++++++++-------
 12 files changed, 564 insertions(+), 56 deletions(-)
 create mode 100755 test_regress/t/t_math_divw.pl
 create mode 100644 test_regress/t/t_math_divw.v

diff --git a/Changes b/Changes
index 6a1e3cb3a..036829a19 100644
--- a/Changes
+++ b/Changes
@@ -7,7 +7,9 @@ indicates the contributor was also the author of the fix; Thanks!
 
 **   Support little endian bit vectors ("reg [0:2] x;").
 
-**** Fix writing to out-of-bounds arrays writing element 0.
+**   Support division and modulus of > 64 bit vectors.  [Gary Thomas]
+
+***  Fix writing to out-of-bounds arrays writing element 0.
 
 **** Fix core dump with SystemVerilog var declarations under unnamed begins.
 
diff --git a/bin/verilator b/bin/verilator
index 2fcf56127..02e51bf8b 100755
--- a/bin/verilator
+++ b/bin/verilator
@@ -1647,12 +1647,6 @@ Bit ranges must be numbered with the MSB being numbered greater or the same
 as the LSB.  Little-bit-endian busses [0:15] are not supported as they
 aren't easily made compatible with C++.
 
-=head2 32-Bit Divide
-
-The division and modulus operators are limited to 32 bits.  This can be
-easily fixed if someone contributes the appropriate wide-integer math
-functions.
-
 =head2 Gate Primitives
 
 The 2-state gate primitives (and, buf, nand, nor, not, or, xnor, xor) are
diff --git a/include/verilated.cpp b/include/verilated.cpp
index 742896c37..8338f4ddb 100644
--- a/include/verilated.cpp
+++ b/include/verilated.cpp
@@ -136,6 +136,117 @@ WDataOutP VL_ZERO_RESET_W(int obits, WDataOutP outwp) {
     return outwp;
 }
 
+//===========================================================================
+// Slow math
+
+WDataOutP _vl_moddiv_w(int lbits, WDataOutP owp, WDataInP lwp, WDataInP rwp, bool is_modulus) {
+    // Unsigned wide divide (Knuth Algorithm D) of lwp by rwp, each VL_WORDS_I(lbits) words.
+    // Writes the quotient to owp, or the remainder when is_modulus; returns owp (all 0 if rwp==0).
+    // This isn't massively tuned, as wide division is rare; for debug see V3Number version
+    // Requires clean input
+    int words = VL_WORDS_I(lbits);
+    for (int i=0; i<words; i++) owp[i]=0;
+    // Find MSB and check for zero.
+    int umsbp1 = VL_MOSTSETBITP1_W(words,lwp); // dividend
+    int vmsbp1 = VL_MOSTSETBITP1_W(words,rwp); // divisor
+    if (VL_UNLIKELY(vmsbp1==0)  // rwp==0 so division by zero.  Return 0.
+	|| VL_UNLIKELY(umsbp1==0)) {	// 0/x so short circuit and return 0
+	return owp;
+    }
+
+    int uw = VL_WORDS_I(umsbp1);  // aka "m" in the algorithm
+    int vw = VL_WORDS_I(vmsbp1);  // aka "n" in the algorithm
+
+    if (vw == 1) {  // Single divisor word breaks rest of algorithm
+	vluint64_t k = 0;  // Running remainder carried down from the word above
+	for (int j = uw-1; j >= 0; j--) {
+	    vluint64_t unw64 = ((k<<VL_ULL(32)) + (vluint64_t)(lwp[j]));
+	    owp[j] = unw64 / (vluint64_t)(rwp[0]);
+	    k      = unw64 - (vluint64_t)(owp[j])*(vluint64_t)(rwp[0]);
+	}
+	if (is_modulus) {
+	    owp[0] = k;
+	    for (int i=1; i<words; i++) owp[i]=0;
+	}
+	return owp;
+    }
+
+    // +1 word as we may shift during normalization
+    uint32_t un[VL_MULS_MAX_WORDS+1]; // Fixed size, as MSVC++ doesn't allow [words] here
+    uint32_t vn[VL_MULS_MAX_WORDS+1]; // v normalized
+
+    // Zero to save having to zero for shifts; +1 as un[words] is read below when uw<vw==words
+    for (int i=0; i<words+1; i++) { un[i]=vn[i]=0; }
+
+    // Algorithm requires divisor MSB to be set
+    // Copy and shift to normalize divisor so MSB of vn[vw-1] is set
+    int s = 31-VL_BITBIT_I(vmsbp1-1);  // shift amount (0...31)
+    uint32_t shift_mask = s ? 0xffffffff : 0;  // otherwise >> 32 won't mask the value
+    for (int i = vw-1; i>0; i--) {
+	vn[i] = (rwp[i] << s) | (shift_mask & (rwp[i-1] >> (32-s)));
+    }
+    vn[0] = rwp[0] << s;
+
+    // Copy and shift dividend by same amount; may set new upper word
+    if (s) un[uw] = lwp[uw-1] >> (32-s);
+    else un[uw] = 0;
+    for (int i=uw-1; i>0; i--) {
+	un[i] = (lwp[i] << s) | (shift_mask & (lwp[i-1] >> (32-s)));
+    }
+    un[0] = lwp[0] << s;
+
+    // Main loop; one quotient word per iteration (never entered when uw<vw, so quotient stays 0)
+    for (int j = uw - vw; j >= 0; j--) {
+	// Estimate
+	vluint64_t unw64 = ((vluint64_t)(un[j+vw])<<VL_ULL(32) | (vluint64_t)(un[j+vw-1]));
+	vluint64_t qhat = unw64 / (vluint64_t)(vn[vw-1]);
+	vluint64_t rhat = unw64 - qhat*(vluint64_t)(vn[vw-1]);
+
+      again:
+	if (qhat >= VL_ULL(0x100000000)
+	    || ((qhat*vn[vw-2]) > ((rhat<<VL_ULL(32)) + un[j+vw-2]))) {
+	    qhat = qhat - 1;
+	    rhat = rhat + vn[vw-1];
+	    if (rhat < VL_ULL(0x100000000)) goto again;
+	}
+
+	vlsint64_t t = 0;  // Must be signed
+	vluint64_t k = 0;
+	for (int i=0; i<vw; i++) {
+	    vluint64_t p = qhat*vn[i];  // Multiply by estimate
+	    t = un[i+j] - k - (p & VL_ULL(0xFFFFFFFF));  // Subtract
+	    un[i+j] = t;
+	    k = (p >> VL_ULL(32)) - (t >> VL_ULL(32));
+	}
+	t = un[j+vw] - k;
+	un[j+vw] = t;
+	owp[j] = qhat; // Save quotient digit
+
+	if (t < 0) {
+	    // Over subtracted; correct by adding back
+	    owp[j]--;
+	    k = 0;
+	    for (int i=0; i<vw; i++) {
+		t = (vluint64_t)(un[i+j]) + (vluint64_t)(vn[i]) + k;
+		un[i+j] = t;
+		k = t >> VL_ULL(32);
+	    }
+	    un[j+vw] = un[j+vw] + k;
+	}
+    }
+
+    if (is_modulus) { // modulus
+	// Need to reverse normalization on copy to output
+	for (int i=0; i<vw; i++) {
+	    owp[i] = (un[i] >> s) | (shift_mask & (un[i+1] << (32-s)));
+	}
+	for (int i=vw; i<words; i++) owp[i] = 0;
+	return owp;
+    } else { // division
+	return owp;
+    }
+}
+
 //===========================================================================
 // Formatting
 
diff --git a/include/verilated.h b/include/verilated.h
index c794ae60c..82bfedf6f 100644
--- a/include/verilated.h
+++ b/include/verilated.h
@@ -192,6 +192,9 @@ extern QData  VL_RAND_RESET_Q(int obits);	///< Random reset a signal
 extern WDataOutP VL_RAND_RESET_W(int obits, WDataOutP outwp);	///< Random reset a signal
 extern WDataOutP VL_ZERO_RESET_W(int obits, WDataOutP outwp);	///< Zero reset a signal
 
+/// Math
+extern WDataOutP _vl_moddiv_w(int lbits, WDataOutP owp, WDataInP lwp, WDataInP rwp, bool is_modulus);
+
 /// File I/O
 extern IData VL_FGETS_IXQ(int sbits, void* strgp, QData fpq);
 
@@ -228,6 +231,7 @@ extern IData VL_SSCANF_IWX(int lbits, WDataInP lwp, const char* formatp, ...);
 #define VL_SET_WQ(owp,data)	{ owp[0]=(data); owp[1]=((data)>>VL_WORDSIZE); }
 #define VL_SET_WI(owp,data)	{ owp[0]=(data); owp[1]=0; }
 #define VL_SET_QW(lwp)		( ((QData)(lwp[0])) | ((QData)(lwp[1])<<((QData)(VL_WORDSIZE)) ))
+#define _VL_SET_QII(ld,rd)      ( ((QData)(ld)<<VL_ULL(32)) | (QData)(rd) )
 
 // Use a union to avoid cast-to-different-size warnings
 /// Return FILE* from QData
@@ -302,6 +306,11 @@ extern double sc_time_stamp();
 
 // EMIT_RULE: VL_ASSIGNCLEAN:  oclean=clean; obits==lbits;
 #define VL_ASSIGNCLEAN_W(obits,owp,lwp) VL_CLEAN_WW(obits,obits,owp,lwp)
+static inline WDataOutP _VL_CLEAN_INPLACE_W(int obits, WDataOutP owp) {
+    const int topword = VL_WORDS_I(obits) - 1;  // Only the most significant word gets masked
+    owp[topword] &= VL_MASK_I(obits);
+    return owp;
+}
 static inline WDataOutP VL_CLEAN_WW(int obits, int, WDataOutP owp, WDataInP lwp){
     int words = VL_WORDS_I(obits);
     for (int i=0; (i < (words-1)); i++) owp[i] = lwp[i];
@@ -609,7 +618,7 @@ static inline IData VL_CLOG2_Q(QData lhs) {
 static inline IData VL_CLOG2_W(int words, WDataInP lwp) {
     IData adjust = (VL_COUNTONES_W(words,lwp)==1) ? 0 : 1;
     for (int i=words-1; i>=0; i--) {
-	if (lwp[i]) {
+	if (VL_UNLIKELY(lwp[i])) {  // Shorter worst case if predict not taken
 	    for (int bit=31; bit>=0; bit--) {
 		if (VL_UNLIKELY(VL_BITISSET_I(lwp[i],bit))) {
 		    return i*VL_WORDSIZE + bit + adjust;
@@ -621,6 +630,21 @@ static inline IData VL_CLOG2_W(int words, WDataInP lwp) {
     return 0;
 }
 
+static inline IData VL_MOSTSETBITP1_W(int words, WDataInP lwp) {
+    // MSB set bit plus one; similar to FLS.  0=value is zero
+    // Scans down from the top word, then down from bit 31 of the first nonzero word
+    int wordnum = words-1;
+    while (wordnum>=0) {
+	if (VL_UNLIKELY(lwp[wordnum])) {  // Shorter worst case if predict not taken
+	    int bit = 31;
+	    while (!VL_BITISSET_I(lwp[wordnum],bit)) bit--;  // Stops; a bit must be set
+	    return wordnum*VL_WORDSIZE + bit + 1;
+	}
+	wordnum--;
+    }
+    return 0;
+}
+
 //===================================================================
 // SIMPLE LOGICAL OPERATORS
 
@@ -759,10 +783,12 @@ static inline int _VL_CMPS_W(int lbits, WDataInP lwp, WDataInP rwp) {
 // EMIT_RULE: VL_MUL:    oclean=dirty; lclean==clean; rclean==clean;
 // EMIT_RULE: VL_DIV:    oclean=dirty; lclean==clean; rclean==clean;
 // EMIT_RULE: VL_MODDIV: oclean=dirty; lclean==clean; rclean==clean;
-#define VL_DIV_I(lhs,rhs)	(((rhs)==0)?0:(lhs)/(rhs))
-#define VL_DIV_Q(lhs,rhs)	(((rhs)==0)?0:(lhs)/(rhs))
-#define VL_MODDIV_I(lhs,rhs)	(((rhs)==0)?0:(lhs)%(rhs))
-#define VL_MODDIV_Q(lhs,rhs)	(((rhs)==0)?0:(lhs)%(rhs))
+#define VL_DIV_III(lbits,lhs,rhs)	(((rhs)==0)?0:(lhs)/(rhs))
+#define VL_DIV_QQQ(lbits,lhs,rhs)	(((rhs)==0)?0:(lhs)/(rhs))
+#define VL_DIV_WWW(lbits,owp,lwp,rwp)   (_vl_moddiv_w(lbits,owp,lwp,rwp,0))
+#define VL_MODDIV_III(lbits,lhs,rhs)	(((rhs)==0)?0:(lhs)%(rhs))
+#define VL_MODDIV_QQQ(lbits,lhs,rhs)	(((rhs)==0)?0:(lhs)%(rhs))
+#define VL_MODDIV_WWW(lbits,owp,lwp,rwp) (_vl_moddiv_w(lbits,owp,lwp,rwp,1))
 
 static inline WDataOutP VL_ADD_W(int words, WDataOutP owp,WDataInP lwp,WDataInP rwp){
     QData carry = 0;
@@ -866,31 +892,70 @@ static inline WDataOutP VL_MULS_WWW(int,int lbits,int, WDataOutP owp,WDataInP lw
     return(owp);
 }
 
-static inline IData VL_DIVS_III(int,int lbits,int, IData lhs,IData rhs) {
+static inline IData VL_DIVS_III(int lbits, IData lhs,IData rhs) {
     if (rhs==0) return 0;
     vlsint32_t lhs_signed = VL_EXTENDS_II(32, lbits, lhs);
     vlsint32_t rhs_signed = VL_EXTENDS_II(32, lbits, rhs);
     return lhs_signed / rhs_signed;
 }
-static inline QData VL_DIVS_QQQ(int,int lbits,int, QData lhs,QData rhs) {
+static inline QData VL_DIVS_QQQ(int lbits, QData lhs,QData rhs) {
     if (rhs==0) return 0;
     vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs);
     vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs);
     return lhs_signed / rhs_signed;
 }
-static inline IData VL_MODDIVS_III(int,int lbits,int, IData lhs,IData rhs) {
+static inline IData VL_MODDIVS_III(int lbits, IData lhs,IData rhs) {
     if (rhs==0) return 0;
     vlsint32_t lhs_signed = VL_EXTENDS_II(32, lbits, lhs);
     vlsint32_t rhs_signed = VL_EXTENDS_II(32, lbits, rhs);
     return lhs_signed % rhs_signed;
 }
-static inline QData VL_MODDIVS_QQQ(int,int lbits,int, QData lhs,QData rhs) {
+static inline QData VL_MODDIVS_QQQ(int lbits, QData lhs,QData rhs) {
     if (rhs==0) return 0;
     vlsint64_t lhs_signed = VL_EXTENDS_QQ(64, lbits, lhs);
     vlsint64_t rhs_signed = VL_EXTENDS_QQ(64, lbits, rhs);
     return lhs_signed % rhs_signed;
 }
 
+static inline WDataOutP VL_DIVS_WWW(int lbits, WDataOutP owp,WDataInP lwp,WDataInP rwp) {
+    // Signed wide divide: unsigned divide of the magnitudes, then negate the
+    // quotient when exactly one operand was negative
+    const int words = VL_WORDS_I(lbits);
+    const bool lneg = (VL_SIGN_I(lbits,lwp[words-1]) != 0);
+    const bool rneg = (VL_SIGN_I(lbits,rwp[words-1]) != 0);
+    IData lmagstore[VL_MULS_MAX_WORDS]; // Fixed size, as MSVC++ doesn't allow [words] here
+    IData rmagstore[VL_MULS_MAX_WORDS];
+    WDataInP lmagp = lwp;
+    WDataInP rmagp = rwp;
+    if (lneg) { lmagp = _VL_CLEAN_INPLACE_W(lbits, VL_UNARYMIN_W(words, lmagstore, lwp)); }
+    if (rneg) { rmagp = _VL_CLEAN_INPLACE_W(lbits, VL_UNARYMIN_W(words, rmagstore, rwp)); }
+    // Same signs give a non-negative quotient, so divide straight into owp
+    if (lneg == rneg) return VL_DIV_WWW(lbits,owp,lmagp,rmagp);
+    IData qNoSign[VL_MULS_MAX_WORDS];
+    VL_DIV_WWW(lbits,qNoSign,lmagp,rmagp);
+    _VL_CLEAN_INPLACE_W(lbits, VL_UNARYMIN_W(words, owp, qNoSign));
+    return owp;
+}
+static inline WDataOutP VL_MODDIVS_WWW(int lbits, WDataOutP owp,WDataInP lwp,WDataInP rwp) {
+    // Signed wide modulus: unsigned modulus of the magnitudes, then the
+    // remainder is negated when the dividend was negative
+    const int words = VL_WORDS_I(lbits);
+    const bool lneg = (VL_SIGN_I(lbits,lwp[words-1]) != 0);
+    const bool rneg = (VL_SIGN_I(lbits,rwp[words-1]) != 0);
+    IData lmagstore[VL_MULS_MAX_WORDS]; // Fixed size, as MSVC++ doesn't allow [words] here
+    IData rmagstore[VL_MULS_MAX_WORDS];
+    WDataInP lmagp = lwp;
+    WDataInP rmagp = rwp;
+    if (lneg) { lmagp = _VL_CLEAN_INPLACE_W(lbits, VL_UNARYMIN_W(words, lmagstore, lwp)); }
+    if (rneg) { rmagp = _VL_CLEAN_INPLACE_W(lbits, VL_UNARYMIN_W(words, rmagstore, rwp)); }
+    // Only dividend sign matters for modulus
+    if (!lneg) return VL_MODDIV_WWW(lbits,owp,lmagp,rmagp);
+    IData remNoSign[VL_MULS_MAX_WORDS];
+    VL_MODDIV_WWW(lbits,remNoSign,lmagp,rmagp);
+    _VL_CLEAN_INPLACE_W(lbits, VL_UNARYMIN_W(words, owp, remNoSign));
+    return owp;
+}
+
 static inline IData VL_POW_III(int, int, int rbits, IData lhs, IData rhs) {
     if (lhs==0) return 0;
     IData power = lhs;
diff --git a/src/V3AstNodes.h b/src/V3AstNodes.h
index 6c8e62622..8c2b002f4 100644
--- a/src/V3AstNodes.h
+++ b/src/V3AstNodes.h
@@ -2382,7 +2382,7 @@ struct AstDiv : public AstNodeBiop {
     ASTNODE_NODE_FUNCS(Div, DIV)
     virtual void numberOperate(V3Number& out, const V3Number& lhs, const V3Number& rhs) { out.opDiv(lhs,rhs); }
     virtual string emitVerilog() { return "%k(%l %k/ %r)"; }
-    virtual string emitC() { return "VL_DIV_%lq(%lW, %P, %li, %ri)"; }
+    virtual string emitC() { return "VL_DIV_%nq%lq%rq(%lw, %P, %li, %ri)"; }
     virtual bool cleanOut() {return false;}
     virtual bool cleanLhs() {return true;} virtual bool cleanRhs() {return true;}
     virtual bool sizeMattersLhs() {return true;} virtual bool sizeMattersRhs() {return true;}
@@ -2394,7 +2394,7 @@ struct AstDivS : public AstNodeBiop {
     ASTNODE_NODE_FUNCS(DivS, DIVS)
     virtual void numberOperate(V3Number& out, const V3Number& lhs, const V3Number& rhs) { out.opDivS(lhs,rhs); }
     virtual string emitVerilog() { return "%k(%l %k/ %r)"; }
-    virtual string emitC() { return "VL_DIVS_%nq%lq%rq(%nw,%lw,%rw, %P, %li, %ri)"; }
+    virtual string emitC() { return "VL_DIVS_%nq%lq%rq(%lw, %P, %li, %ri)"; }
     virtual bool cleanOut() {return false;}
     virtual bool cleanLhs() {return true;} virtual bool cleanRhs() {return true;}
     virtual bool sizeMattersLhs() {return true;} virtual bool sizeMattersRhs() {return true;}
@@ -2407,7 +2407,7 @@ struct AstModDiv : public AstNodeBiop {
     ASTNODE_NODE_FUNCS(ModDiv, MODDIV)
     virtual void numberOperate(V3Number& out, const V3Number& lhs, const V3Number& rhs) { out.opModDiv(lhs,rhs); }
     virtual string emitVerilog() { return "%k(%l %k%% %r)"; }
-    virtual string emitC() { return "VL_MODDIV_%lq(%lW, %P, %li, %ri)"; }
+    virtual string emitC() { return "VL_MODDIV_%nq%lq%rq(%lw, %P, %li, %ri)"; }
     virtual bool cleanOut() {return false;}
     virtual bool cleanLhs() {return true;} virtual bool cleanRhs() {return true;}
     virtual bool sizeMattersLhs() {return true;} virtual bool sizeMattersRhs() {return true;}
@@ -2419,7 +2419,7 @@ struct AstModDivS : public AstNodeBiop {
     ASTNODE_NODE_FUNCS(ModDivS, MODDIVS)
     virtual void numberOperate(V3Number& out, const V3Number& lhs, const V3Number& rhs) { out.opModDivS(lhs,rhs); }
     virtual string emitVerilog() { return "%k(%l %k%% %r)"; }
-    virtual string emitC() { return "VL_MODDIVS_%nq%lq%rq(%nw,%lw,%rw, %P, %li, %ri)"; }
+    virtual string emitC() { return "VL_MODDIVS_%nq%lq%rq(%lw, %P, %li, %ri)"; }
     virtual bool cleanOut() {return false;}
     virtual bool cleanLhs() {return true;} virtual bool cleanRhs() {return true;}
     virtual bool sizeMattersLhs() {return true;} virtual bool sizeMattersRhs() {return true;}
diff --git a/src/V3Number.cpp b/src/V3Number.cpp
index b2cbedc97..3a49a4897 100644
--- a/src/V3Number.cpp
+++ b/src/V3Number.cpp
@@ -245,7 +245,7 @@ V3Number::V3Number (FileLine* fileline, const char* sourcep) {
 		case 'd': setBit(obit++,1); setBit(obit++,0); setBit(obit++,1); setBit(obit++,1); break;
 		case 'e': setBit(obit++,0); setBit(obit++,1); setBit(obit++,1); setBit(obit++,1); break;
 		case 'f': setBit(obit++,1); setBit(obit++,1); setBit(obit++,1); setBit(obit++,1); break;
-		case 'z': case '?': 
+		case 'z': case '?':
 		    setBit(obit++,'z'); setBit(obit++,'z'); setBit(obit++,'z'); setBit(obit++,'z'); break;
 		case 'x':
 		    setBit(obit++,'x'); setBit(obit++,'x'); setBit(obit++,'x'); setBit(obit++,'x'); break;
@@ -1087,37 +1087,47 @@ V3Number& V3Number::opMulS (const V3Number& lhs, const V3Number& rhs) {
     return *this;
 }
 V3Number& V3Number::opDiv (const V3Number& lhs, const V3Number& rhs) {
+    UINFO(9, "opdiv "<<lhs<<" "<<rhs<<endl);
     // i op j, max(L(lhs),L(rhs)) bit return, if any 4-state, 4-state return
     if (lhs.isFourState() || rhs.isFourState()) return setAllBitsX();
     if (rhs.isEqZero()) return setAllBitsX();
-    if (lhs.width()>64) m_fileline->v3fatalSrc("Unsupported: Large / math not implemented yet: "<<*this);
-    if (rhs.width()>64) m_fileline->v3fatalSrc("Unsupported: Large / math not implemented yet: "<<*this);
-    setQuad(lhs.toUQuad() / rhs.toUQuad());
-    return *this;
+    if (lhs.width()<=64) {
+	setQuad(lhs.toUQuad() / rhs.toUQuad());
+	return *this;
+    } else {
+	// Wide division
+	return opModDivGuts(lhs,rhs,false);
+    }
 }
 V3Number& V3Number::opDivS (const V3Number& lhs, const V3Number& rhs) {
     // Signed divide
+    //UINFO(9, ">>divs-start "<<lhs<<" "<<rhs<<endl);
     if (lhs.isFourState() || rhs.isFourState()) return setAllBitsX();
     if (rhs.isEqZero()) return setAllBitsX();
     V3Number lhsNoSign = lhs;  if (lhs.isNegative()) lhsNoSign.opUnaryMin(lhs);
     V3Number rhsNoSign = rhs;  if (rhs.isNegative()) rhsNoSign.opUnaryMin(rhs);
     V3Number qNoSign = opDiv(lhsNoSign,rhsNoSign);
+    //UINFO(9, " >divs-mid "<<lhs<<" "<<rhs<<" "<<qNoSign<<endl);
     if ((lhs.isNegative() && !rhs.isNegative())
 	|| (!lhs.isNegative() && rhs.isNegative())) {
 	opUnaryMin(qNoSign);
     } else {
 	opAssign(qNoSign);
     }
+    UINFO(9, " <divs-out "<<lhs<<" "<<rhs<<" ="<<*this<<endl);
     return *this;
 }
 V3Number& V3Number::opModDiv (const V3Number& lhs, const V3Number& rhs) {
     // i op j, max(L(lhs),L(rhs)) bit return, if any 4-state, 4-state return
     if (lhs.isFourState() || rhs.isFourState()) return setAllBitsX();
     if (rhs.isEqZero()) return setAllBitsX();
-    if (lhs.width()>64) m_fileline->v3fatalSrc("Unsupported: Large % math not implemented yet: "<<*this);
-    if (rhs.width()>64) m_fileline->v3fatalSrc("Unsupported: Large % math not implemented yet: "<<*this);
-    setQuad(lhs.toUQuad() % rhs.toUQuad());
-    return *this;
+    if (lhs.width()<=64) {
+	setQuad(lhs.toUQuad() % rhs.toUQuad());
+	return *this;
+    } else {
+	// Wide modulus
+	return opModDivGuts(lhs,rhs,true);
+    }
 }
 V3Number& V3Number::opModDivS (const V3Number& lhs, const V3Number& rhs) {
     // Signed moddiv
@@ -1133,6 +1143,122 @@ V3Number& V3Number::opModDivS (const V3Number& lhs, const V3Number& rhs) {
     }
     return *this;
 }
+V3Number& V3Number::opModDivGuts(const V3Number& lhs, const V3Number& rhs, bool is_modulus) {
+    // See Knuth Algorithm D.  Computes u/v = q.r; sets this to q, or to r when is_modulus
+    // This isn't massively tuned, as wide division is rare
+    setZero();
+    // Find MSB and check for zero.
+    int words = lhs.words();
+    int umsbp1 = lhs.mostSetBitP1(); // dividend
+    int vmsbp1 = rhs.mostSetBitP1(); // divisor
+    if (VL_UNLIKELY(vmsbp1==0)  // rwp==0 so division by zero.  Return 0.
+	|| VL_UNLIKELY(umsbp1==0)) {	// 0/x so short circuit and return 0
+	UINFO(9, "  opmoddiv-zero "<<lhs<<" "<<rhs<<" now="<<*this<<endl);
+	return *this;
+    }
+
+    int uw = VL_WORDS_I(umsbp1);  // aka "m" in the algorithm
+    int vw = VL_WORDS_I(vmsbp1);  // aka "n" in the algorithm
+
+    if (vw == 1) {  // Single divisor word breaks rest of algorithm
+	vluint64_t k = 0;  // Running remainder carried down from the word above
+	for (int j = uw-1; j >= 0; j--) {
+	    vluint64_t unw64 = ((k<<VL_ULL(32)) + (vluint64_t)(lhs.m_value[j]));
+	    m_value[j] = unw64 / (vluint64_t)(rhs.m_value[0]);
+	    k          = unw64 - (vluint64_t)(m_value[j])*(vluint64_t)(rhs.m_value[0]);
+	}
+	UINFO(9, "  opmoddiv-1w  "<<lhs<<" "<<rhs<<" q="<<*this<<" rem=0x"<<hex<<k<<dec<<endl);
+	if (is_modulus) { setZero(); m_value[0] = k; }
+	return *this;
+    }
+
+    // +1 word as we may shift during normalization
+    uint32_t un[VL_MULS_MAX_WORDS+1]; // Fixed size, as MSVC++ doesn't allow [words] here
+    uint32_t vn[VL_MULS_MAX_WORDS+1]; // v normalized
+
+    // Zero for ease of debugging and to save having to zero for shifts (was a fixed 6 words)
+    for (int i=0; i<this->words(); i++) { m_value[i]=0; }
+    for (int i=0; i<words+1; i++) { un[i]=vn[i]=0; }  // +1 as vn may get extra word
+
+    // Algorithm requires divisor MSB to be set
+    // Copy and shift to normalize divisor so MSB of vn[vw-1] is set
+    int s = 31-VL_BITBIT_I(vmsbp1-1);  // shift amount (0...31)
+    uint32_t shift_mask = s ? 0xffffffff : 0;  // otherwise >> 32 won't mask the value
+    for (int i = vw-1; i>0; i--) {
+	vn[i] = (rhs.m_value[i] << s) | (shift_mask & (rhs.m_value[i-1] >> (32-s)));
+    }
+    vn[0] = rhs.m_value[0] << s;
+
+    // Copy and shift dividend by same amount; may set new upper word
+    if (s) un[uw] = lhs.m_value[uw-1] >> (32-s);
+    else   un[uw] = 0;
+    for (int i=uw-1; i>0; i--) {
+	un[i] = (lhs.m_value[i] << s) | (shift_mask & (lhs.m_value[i-1] >> (32-s)));
+    }
+    un[0] = lhs.m_value[0] << s;
+
+    //printf("  un="); for(int i=5; i>=0; i--) printf(" %08x",un[i]); printf("\n");
+    //printf("  vn="); for(int i=5; i>=0; i--) printf(" %08x",vn[i]); printf("\n");
+    //printf("  mv="); for(int i=5; i>=0; i--) printf(" %08x",m_value[i]); printf("\n");
+
+    // Main loop; one quotient word per iteration (never entered when uw<vw, so quotient stays 0)
+    for (int j = uw - vw; j >= 0; j--) {
+	// Estimate
+	vluint64_t unw64 = ((vluint64_t)(un[j+vw])<<VL_ULL(32) | (vluint64_t)(un[j+vw-1]));
+	vluint64_t qhat = unw64 / (vluint64_t)(vn[vw-1]);
+	vluint64_t rhat = unw64 - qhat*(vluint64_t)(vn[vw-1]);
+
+      again:
+	if (qhat >= VL_ULL(0x100000000)
+	    || ((qhat*vn[vw-2]) > ((rhat<<VL_ULL(32)) + un[j+vw-2]))) {
+	    qhat = qhat - 1;
+	    rhat = rhat + vn[vw-1];
+	    if (rhat < VL_ULL(0x100000000)) goto again;
+	}
+
+	vlsint64_t t = 0;  // Must be signed
+	vluint64_t k = 0;
+	for (int i=0; i<vw; i++) {
+	    vluint64_t p = qhat*vn[i];  // Multiply by estimate
+	    t = un[i+j] - k - (p & VL_ULL(0xFFFFFFFF));  // Subtract
+	    un[i+j] = t;
+	    k = (p >> VL_ULL(32)) - (t >> VL_ULL(32));
+	}
+	t = un[j+vw] - k;
+	un[j+vw] = t;
+	this->m_value[j] = qhat; // Save quotient digit
+
+	if (t < 0) {
+	    // Over subtracted; correct by adding back
+	    this->m_value[j]--;
+	    k = 0;
+	    for (int i=0; i<vw; i++) {
+		t = (vluint64_t)(un[i+j]) + (vluint64_t)(vn[i]) + k;
+		un[i+j] = t;
+		k = t >> VL_ULL(32);
+	    }
+	    un[j+vw] = un[j+vw] + k;
+	}
+    }
+
+    //printf("  un="); for(int i=5; i>=0; i--) printf(" %08x",un[i]); printf("\n");
+    //printf("  vn="); for(int i=5; i>=0; i--) printf(" %08x",vn[i]); printf("\n");
+    //printf("  mv="); for(int i=5; i>=0; i--) printf(" %08x",m_value[i]); printf("\n");
+
+    if (is_modulus) { // modulus
+	// Need to reverse normalization on copy to output
+	for (int i=0; i<vw; i++) {
+	    m_value[i] = (un[i] >> s) | (shift_mask & (un[i+1] << (32-s)));
+	}
+	for (int i=vw; i<words; i++) m_value[i] = 0;
+	UINFO(9, "  opmoddiv-mod "<<lhs<<" "<<rhs<<" now="<<*this<<endl);
+	return *this;
+    } else { // division
+	UINFO(9, "  opmoddiv-div "<<lhs<<" "<<rhs<<" now="<<*this<<endl);
+	return *this;
+    }
+}
+
 V3Number& V3Number::opPow (const V3Number& lhs, const V3Number& rhs) {
     // L(i) bit return, if any 4-state, 4-state return
     if (lhs.isFourState() || rhs.isFourState()) return setAllBitsX();
diff --git a/src/V3Number.h b/src/V3Number.h
index 009b14e0c..225328bdb 100644
--- a/src/V3Number.h
+++ b/src/V3Number.h
@@ -104,6 +104,8 @@ private:
 
     int words() const { return ((width()+31)/32); }
 
+    V3Number& opModDivGuts(const V3Number& lhs, const V3Number& rhs, bool is_modulus);
+
 public:
     class VerilogString {};	// for creator type-overload selection
     // CONSTRUCTORS
diff --git a/test_regress/t/t_math_divw.pl b/test_regress/t/t_math_divw.pl
new file mode 100755
index 000000000..7058e622f
--- /dev/null
+++ b/test_regress/t/t_math_divw.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl
+if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; }
+# DESCRIPTION: Verilator: Verilog Test driver/expect definition
+#
+# Copyright 2003 by Wilson Snyder. This program is free software; you can
+# redistribute it and/or modify it under the terms of either the GNU
+# Lesser General Public License Version 3 or the Perl Artistic License
+# Version 2.0.
+
+compile (
+	 );
+
+execute (
+	 check_finished=>1,
+     );
+
+ok(1);
+1;
diff --git a/test_regress/t/t_math_divw.v b/test_regress/t/t_math_divw.v
new file mode 100644
index 000000000..f8c5d899b
--- /dev/null
+++ b/test_regress/t/t_math_divw.v
@@ -0,0 +1,145 @@
+// DESCRIPTION: Verilator: Verilog Test module
+//
+// This file ONLY is placed into the Public Domain, for any use,
+// without warranty, 2004 by Wilson Snyder.
+
+module t (/*AUTOARG*/
+   // Inputs
+   clk
+   );
+
+   input clk;
+
+   // verilator lint_off WIDTH
+
+   //============================================================
+
+   reg   bad;
+   initial begin
+      bad=0;
+      c96(96'h0_0000_0000_0000_0000,	96'h8_8888_8888_8888_8888,	96'h0_0000_0000_0000_0000,	96'h0);
+      c96(96'h8_8888_8888_8888_8888,	96'h0_0000_0000_0000_0000,	96'h0_0000_0000_0000_0000,	96'h0);
+      c96(96'h8_8888_8888_8888_8888,	96'h0_0000_0000_0000_0002,	96'h4_4444_4444_4444_4444,	96'h0);
+      c96(96'h8_8888_8888_8888_8888,	96'h0_2000_0000_0000_0000,	96'h0_0000_0000_0000_0044,	96'h0_0888_8888_8888_8888);
+      c96(96'h8_8888_8888_8888_8888,	96'h8_8888_8888_8888_8888,	96'h0_0000_0000_0000_0001,	96'h0);
+      c96(96'h8_8888_8888_8888_8888,	96'h8_8888_8888_8888_8889,	96'h0_0000_0000_0000_0000,	96'h8_8888_8888_8888_8888);
+      c96(96'h1_0000_0000_8eba_434a,	96'h0_0000_0000_0000_0001,	96'h1_0000_0000_8eba_434a,	96'h0);
+
+      c96(96'h0003,			96'h0002,			96'h0001,			96'h0001);
+      c96(96'h0003,			96'h0003,			96'h0001,			96'h0000);
+      c96(96'h0003,			96'h0004,			96'h0000,			96'h0003);
+      c96(96'h0000,			96'hffff,			96'h0000,			96'h0000);
+      c96(96'hffff,			96'h0001,			96'hffff,			96'h0000);
+      c96(96'hffff,			96'hffff,			96'h0001,			96'h0000);
+      c96(96'hffff,			96'h0003,			96'h5555,			96'h0000);
+      c96(96'hffff_ffff,		96'h0001,			96'hffff_ffff,			96'h0000);
+      c96(96'hffff_ffff,		96'hffff,			96'h0001_0001,			96'h0000);
+      c96(96'hfffe_ffff,		96'hffff,			96'h0000_ffff,			96'hfffe);
+      c96(96'h1234_5678,		96'h9abc,			96'h0000_1e1e,			96'h2c70);
+      c96(96'h0000_0000,		96'h0001_0000,			96'h0000,			96'h0000_0000);
+      c96(96'h0007_0000,		96'h0003_0000,			96'h0002,			96'h0001_0000);
+      c96(96'h0007_0005,		96'h0003_0000,			96'h0002,			96'h0001_0005);
+      c96(96'h0006_0000,		96'h0002_0000,			96'h0003,			96'h0000_0000);
+      c96(96'h8000_0001,		96'h4000_7000,			96'h0001,			96'h3fff_9001);
+      c96(96'hbcde_789a,		96'hbcde_789a,			96'h0001,			96'h0000_0000);
+      c96(96'hbcde_789b,		96'hbcde_789a,			96'h0001,			96'h0000_0001);
+      c96(96'hbcde_7899,		96'hbcde_789a,			96'h0000,			96'hbcde_7899);
+      c96(96'hffff_ffff,		96'hffff_ffff,			96'h0001,			96'h0000_0000);
+      c96(96'hffff_ffff,		96'h0001_0000,			96'hffff,			96'h0000_ffff);
+      c96(96'h0123_4567_89ab,		96'h0001_0000,			96'h0123_4567,			96'h0000_89ab);
+      c96(96'h8000_fffe_0000,		96'h8000_ffff,			96'h0000_ffff,			96'h7fff_ffff);
+      c96(96'h8000_0000_0003,		96'h2000_0000_0001,		96'h0003,			96'h2000_0000_0000);
+
+      c96(96'hffff_ffff_0000_0000,	96'h0001_0000_0000,		96'hffff_ffff,			96'h0000_0000_0000);
+      c96(96'hffff_ffff_0000_0000,	96'hffff_0000_0000,		96'h0001_0001,			96'h0000_0000_0000);
+      c96(96'hfffe_ffff_0000_0000,	96'hffff_0000_0000,		96'h0000_ffff,			96'hfffe_0000_0000);
+      c96(96'h1234_5678_0000_0000,	96'h9abc_0000_0000,		96'h0000_1e1e,			96'h2c70_0000_0000);
+
+      c96(96'h0000_0000_0000_0000,	96'h0001_0000_0000_0000,	96'h0000,			96'h0000_0000_0000_0000);
+      c96(96'h0007_0000_0000_0000,	96'h0003_0000_0000_0000,	96'h0002,			96'h0001_0000_0000_0000);
+      c96(96'h0007_0005_0000_0000,	96'h0003_0000_0000_0000,	96'h0002,			96'h0001_0005_0000_0000);
+      c96(96'h0006_0000_0000_0000,	96'h0002_0000_0000_0000,	96'h0003,			96'h0000_0000_0000_0000);
+      c96(96'h8000_0001_0000_0000,	96'h4000_7000_0000_0000,	96'h0001,			96'h3fff_9001_0000_0000);
+      c96(96'hbcde_789a_0000_0000,	96'hbcde_789a_0000_0000,	96'h0001,			96'h0000_0000_0000_0000);
+      c96(96'hbcde_789b_0000_0000,	96'hbcde_789a_0000_0000,	96'h0001,			96'h0000_0001_0000_0000);
+      c96(96'hbcde_7899_0000_0000,	96'hbcde_789a_0000_0000,	96'h0000,			96'hbcde_7899_0000_0000);
+      c96(96'hffff_ffff_0000_0000,	96'hffff_ffff_0000_0000,	96'h0001,			96'h0000_0000_0000_0000);
+      c96(96'hffff_ffff_0000_0000,	96'h0001_0000_0000_0000,	96'hffff,			96'h0000_ffff_0000_0000);
+      c96(96'h7fff_8000_0000_0000,	96'h8000_0000_0001,		96'h0000_fffe,			96'h7fff_ffff_0002);
+      c96(96'h8000_0000_fffe_0000,	96'h8000_0000_ffff,		96'h0000_ffff,			96'h7fff_ffff_ffff);
+      c96(96'h0008_8888_8888_8888_8888,	96'h0002_0000_0000_0000,	96'h0004_4444,			96'h0000_8888_8888_8888);
+
+      if (bad) $stop;
+      $write("*-* All Finished *-*\n");
+      $finish;
+   end
+
+   task c96;
+      input [95:0] u;
+      input [95:0] v;
+      input [95:0] expq;
+      input [95:0] expr;
+      c96u( u, v, expq, expr);
+      c96s( u, v, expq, expr);
+      c96s(-u, v,-expq,-expr);
+      c96s( u,-v,-expq, expr);
+      c96s(-u,-v, expq,-expr);
+   endtask
+
+   task c96u;
+      input [95:0] u;
+      input [95:0] v;
+      input [95:0] expq;
+      input [95:0] expr;
+      reg [95:0]   gotq;
+      reg [95:0]   gotr;
+      gotq = u/v;
+      gotr = u%v;
+      if (gotq != expq && v!=0) begin
+	 bad = 1;
+      end
+      if (gotr != expr && v!=0) begin
+	 bad = 1;
+      end
+      if (bad
+`ifdef TEST_VERBOSE
+	  || 1
+`endif
+	  ) begin
+	 $write(" %x /u %x = got %x exp %x  %% got %x exp %x", u,v,gotq,expq,gotr,expr);
+	 // Test for v=0 to prevent Xs causing grief
+	 if (gotq != expq && v!=0) $write(" BADQ");
+	 if (gotr != expr && v!=0) $write(" BADR");
+	 $write("\n");
+      end
+   endtask
+
+   task c96s;
+      input signed [95:0] u;
+      input signed [95:0] v;
+      input signed [95:0] expq;
+      input signed [95:0] expr;
+      reg signed [95:0]   gotq;
+      reg signed [95:0]   gotr;
+      gotq = u/v;
+      gotr = u%v;
+      if (gotq != expq && v!=0) begin
+	 bad = 1;
+      end
+      if (gotr != expr && v!=0) begin
+	 bad = 1;
+      end
+      if (bad
+`ifdef TEST_VERBOSE
+	  || 1
+`endif
+	  ) begin
+	 $write(" %x /s %x = got %x exp %x  %% got %x exp %x", u,v,gotq,expq,gotr,expr);
+	 // Test for v=0 to prevent Xs causing grief
+	 if (gotq != expq && v!=0) $write(" BADQ");
+	 if (gotr != expr && v!=0) $write(" BADR");
+	 $write("\n");
+      end
+   endtask
+
+endmodule
diff --git a/test_regress/t/t_math_vgen.v b/test_regress/t/t_math_vgen.v
index d5eb09c1c..fe4f43c98 100644
--- a/test_regress/t/t_math_vgen.v
+++ b/test_regress/t/t_math_vgen.v
@@ -273,6 +273,15 @@ module t (/*AUTOARG*/
 
    //============================================================
 
+   reg signed [105:  0] W0032 /*verilator public*/; //=106'h3ff0000000100000000bd597bb1
+   always @(check) begin : Block237   // 106-bit signed divide, with the divisor given as a literal and as a variable
+      W0032 = 106'sh3ff0000000100000000bd597bb1;
+      if ((106'sh1ca0000000000000000b96b8dc2 / 106'sh3ff0000000100000000bd597bb1) != 106'sh3fffffffffffffffffffffffe36) if (check) $stop;   // literal / literal
+      if ((106'sh1ca0000000000000000b96b8dc2 / W0032) != 106'sh3fffffffffffffffffffffffe36) if (check) $stop;   // literal / W0032, same expected quotient
+   end
+
+   //============================================================
+
    always @ (posedge clk) begin
       if (cyc!=0) begin
 	 cyc <= cyc + 1;
diff --git a/test_verilated/Makefile_obj b/test_verilated/Makefile_obj
index 86803d8ae..3422b7ddc 100644
--- a/test_verilated/Makefile_obj
+++ b/test_verilated/Makefile_obj
@@ -25,6 +25,10 @@ CPPFLAGS += -DWAVES=1
 CPPFLAGS += -DVL_DEBUG=1
 CPPFLAGS += $(CPPFLAGS_ADD)
 
+# Random code often does / 0.  Unfortunately VL_DIV_I(0,0) will warn
+# without this flag, even though there's a conditional to prevent the divide.
+CPPFLAGS += -Wno-div-by-zero
+
 #######################################################################
 # Linking final exe -- presumes have a sim_main.cpp
 
diff --git a/test_verilated/vgen.pl b/test_verilated/vgen.pl
index 757b37458..f82eb0472 100755
--- a/test_verilated/vgen.pl
+++ b/test_verilated/vgen.pl
@@ -49,7 +49,7 @@ our $Raise_Weight_Max = 50;
  'VBITSELP'=>	{weight=>1&&10, width=>0, signed=>0,sc=>0, terminal=>0, v=>'%i[%2+:%3]', },
  'VBITSELM'=>	{weight=>1&&10, width=>0, signed=>0,sc=>0, terminal=>0, v=>'%i[%2-:%3]', },
  # Unary
- 'VEXTEND'=>	{weight=>1&&3, width=>-2, signed=>0,sc=>0, terminal=>0, v=>'{%xw\'h0,%1}', },
+ 'VEXTEND'=>	{weight=>1&&3, width=>-2, signed=>0,sc=>0, terminal=>0, v=>'{%xd\'h0,%1}', },
  'VLOGNOT'=>	{weight=>1&&1, width=>1, signed=>0, sc=>0, terminal=>0, v=>'(! %1)', },
  'VREDAND'=>	{weight=>1&&1, width=>1, signed=>0, sc=>0, terminal=>0, v=>'(& %1)', },
  'VREDOR'=>	{weight=>1&&1, width=>1, signed=>0, sc=>0, terminal=>0, v=>'(| %1)', },
@@ -82,8 +82,8 @@ our $Raise_Weight_Max = 50;
  'VADD'=>	{weight=>1&&10, width=>0, 	    sc=>1, terminal=>0, v=>'(%1 + %2)', },
  'VSUB'=>	{weight=>1&&10, width=>0, 	    sc=>1, terminal=>0, v=>'(%1 - %2)', },
  'VMUL'=>	{weight=>1&&15,width=>0, 	    sc=>1, terminal=>0, v=>'(%1 * %2)', },  # High % as rarely applyable
- #'VDIV'=>	{weight=>2&&0, width=>-32,	    sc=>1, terminal=>0, v=>'(%1 / %2)', }, # FIX
- #'VMODDIV'=>	{weight=>2&&0, width=>-32,	    sc=>1, terminal=>0, v=>'(%1 %% %2)', }, # FIX
+ 'VDIV'=>	{weight=>1&&8, width=>0,	    sc=>1, terminal=>0, v=>'((%2)==%xw\'h0 ? %xw\'%xsh0:(%1 / %2))', },
+ 'VMODDIV'=>	{weight=>1&&8, width=>0,	    sc=>1, terminal=>0, v=>'((%2)==%xw\'h0 ? %xw\'%xsh0:(%1 %% %2))', },
  #'VPOW'=>	{weight=>2&&0,width=>-64, 	    sc=>0, terminal=>0, v=>'(%1 ** %2)', },
  'VSHIFTL'=>	{weight=>1&&8, width=>0, signed=>0, sc=>0, terminal=>0, v=>'(%1 << %2)', },
  'VSHIFTLS'=>	{weight=>1&&8, width=>0, signed=>1, sc=>0, terminal=>0, v=>'(%1 <<< %2)', },
@@ -167,8 +167,8 @@ my %ops2 =
  'VADD'=>	{pl=>'VADD   (%tr,%1v,%2v);',	rnd=>'%1r=gen_leaf(width=>%tw,signed=>%tg); %2r=gen_leaf(width=>%tw,signed=>%tg);', trunc=>1,},
  'VSUB'=>	{pl=>'VSUB   (%tr,%1v,%2v);',	rnd=>'%1r=gen_leaf(width=>%tw,signed=>%tg); %2r=gen_leaf(width=>%tw,signed=>%tg);', trunc=>1,},
  'VMUL'=>	{pl=>'VMUL   (%tr,%1v,%2v);',	rnd=>'%1r=gen_leaf(width=>%tw,signed=>%tg); %2r=gen_leaf(width=>%tw,signed=>%tg);', trunc=>1,},  # Multiply generates larger width, so need truncate for safety
- #'VDIV'=>	{pl=>'VDIV   (%tr,%1v,%2v);',	rnd=>'%1r=gen_leaf(width=>%tw,signed=>%tg); %2r=gen_leaf(width=>%tw,signed=>%tg);'},
- #'VMODDIV'=>	{pl=>'VMODDIV(%tr,%1v,%2v);',	rnd=>'%1r=gen_leaf(width=>%tw,signed=>%tg); %2r=gen_leaf(width=>%tw,signed=>%tg);'},
+ 'VDIV'=>	{pl=>'VDIV   (%tr,%1r,%2r,0);',	rnd=>'%1r=gen_leaf(width=>%tw,signed=>%tg); %2r=gen_leaf(width=>%tw,signed=>%tg);'},
+ 'VMODDIV'=>	{pl=>'VDIV   (%tr,%1r,%2r,1);',	rnd=>'%1r=gen_leaf(width=>%tw,signed=>%tg); %2r=gen_leaf(width=>%tw,signed=>%tg);'},
  #'VPOW'=>	{pl=>'VPOW   (%tr,%1r,%2r);',	rnd=>'%1r=gen_leaf(width=>min(%tw,6),signed=>%tg); %2r=gen_leaf(width=>min(%tw,8),signed=>%tg);', trunc=>1,},  # Generates larger width, so need truncate for safety
  'VSHIFTL'=>	{pl=>'VSHIFTL(%tr,%1v,%2v);',	rnd=>'%1r=gen_leaf(width=>%tw,signed=>%tg); %2r=gen_leaf(width=>log2(%tw)+1,signed=>%tg);'},
  'VSHIFTLS'=>	{pl=>'VSHIFTL(%tr,%1v,%2v);',	rnd=>'%1r=gen_leaf(width=>%tw,signed=>%tg); %2r=gen_leaf(width=>log2(%tw)+1,signed=>%tg);'},
@@ -227,6 +227,7 @@ if ($opt_seed==0) {
 }
 srand($opt_seed);
 init();
+selftest();
 gentest();
 write_output_sc("vgen.cpp") if $Opt_Sc;
 write_output_v("vgen.v") if !$Opt_Sc;
@@ -293,7 +294,10 @@ sub _rnd_op_ok {
     my $paramref = shift;
     return (($opref->{width} == 0
 	     || $opref->{width} == $paramref->{width}
+	     # Note -2 means >= 2 bits, while -31/-32/-63/-64 mean <= that many bits!
+	     || ($opref->{width}==-31 && $paramref->{width}<=31)     # -31... must be <31 bits
 	     || ($opref->{width}==-32 && $paramref->{width}<=32)     # -32... must be <32 bits
+	     || ($opref->{width}==-63 && $paramref->{width}<=63)     # -63... must be <63 bits
 	     || ($opref->{width}==-64 && $paramref->{width}<=64)     # -64... must be <64 bits
 	     || ($opref->{width}==-2  && $paramref->{width}>=2)     # -2... must be >2 bits
 	     )
@@ -719,7 +723,8 @@ sub gen_leaf {
     $treeref->{val_size} = $treeref->{val}->Size;   #Debugging
     $treeref->{val_text} = $treeref->{val}->to_Hex; #Debugging
 
-    ($treeref->{val}->Size == $treeref->{width}) or die "%Error: Size mismatch,";
+    ($treeref->{val}->Size == $treeref->{width})
+	or die "%Error: Size mismatch ",$treeref->{val}->Size,"!=",$treeref->{width},"\n",Dumper($treeref);
 
     return $treeref;
 }
@@ -733,18 +738,20 @@ sub gen_v {
     $fmt =~ s/%3/%s/g;
     $fmt =~ s/%v/%s/g;
     $fmt =~ s/%i/%s/g;
-    $fmt =~ s/%xw/%s/g;
+    $fmt =~ s/%x[wds]/%s/g;
 
     my $argl = $opref->{v};
     my @args;
-    while ($argl =~ s/(%xw|%.)//) {
+    while ($argl =~ s/(%x.|%.)//) {
 	my $arg = $1;
 	push @args, '$treeref->{op1}{text}'	if $arg =~ /%1/;
 	push @args, '$treeref->{op2}{text}'	if $arg =~ /%2/;
 	push @args, '$treeref->{op3}{text}'	if $arg =~ /%3/;
 	push @args, '$treeref->val_to_text'	if $arg =~ /%v/;
 	push @args, '$treeref->{id}'		if $arg =~ /%i/;
-	push @args, '$treeref->{width}-$treeref->{op1}{width}'		if $arg =~ /%xw/;
+	push @args, '$treeref->{signed}?"s":""'	if $arg =~ /%xs/;
+	push @args, '$treeref->{width}'		if $arg =~ /%xw/;
+	push @args, '$treeref->{width}-$treeref->{op1}{width}'		if $arg =~ /%xd/;
     }
 
     my $func = ("sub { "
@@ -848,6 +855,21 @@ sub decl_text {
 #######################################################################
 # Math Functions
 
+sub selftest {
+    # Sanity-check VDIV on 8-bit values; each row is [dividend, divisor, signed, is_mod, expected]
+    my @cases = ([0xff,0x13,0,0,0x0d], [0xff,0x13,0,1,0x08],   # unsigned: 255/19 and 255%19
+		 [0xff,0x13,1,0,0x00], [0xff,0x13,1,1,0xff],   # signed: -1/19 and -1%19
+		 [0xff,0xdb,1,1,0xff], [0x72,0xdb,1,1,0x03]);  # signed: -1%-37 and 114%-37
+    foreach my $case (@cases) {
+	my ($u, $v, $signed, $is_mod, $exp) = @$case;
+	my $o = {};
+	VDIV($o, {val=>Bit::Vector->new_Dec(8,$u), signed=>$signed},
+	     {val=>Bit::Vector->new_Dec(8,$v), signed=>$signed}, $is_mod);
+	my $got = $o->{val}->Word_Read(0);
+	($got == $exp) or die sprintf("%%Error: selftest: VDIV(0x%02x,0x%02x) signed=%d is_mod=%d got 0x%02x expected 0x%02x\n",
+				      $u, $v, $signed, $is_mod, $got, $exp);
+    }
+}
 sub val_leaf { return {width=>32, signed=>0, val=>Bit::Vector->new_Dec(32,$_[0]), text=>$_[0],}; }
 
 sub makebool { return (Bit::Vector->new_Dec(1,$_[0])); }
@@ -936,25 +958,35 @@ sub VRESIZE {
 }
 sub VADD { $_[0]{val}=my $o=newsized($_[1]); $o->add($_[1],$_[2],0); }
 sub VSUB { $_[0]{val}=my $o=newsized($_[1]); $o->subtract($_[1],$_[2],0); }
-sub VMUL { # Multiply is signed, so need an additional sign bit
-	   my $a=$_[1]->Clone; $a->Resize($_[1]->Size + 1);
-	   my $b=$_[2]->Clone; $b->Resize($_[1]->Size + 1);
-	   my $mo=Bit::Vector->new($_[1]->Size + $_[2]->Size + 1);
-	   $mo->Multiply($a,$b);
-	   my $o=newsized($_[1]); $o->Interval_Copy($mo,0,0,$_[1]->Size);
-	   $_[0]{val}=$o;
-	   }
-sub VDIV { my $a=$_[1]->Clone; my $b=$_[2]->Clone;  # Else it will take them as signed #s
-	   $a->Resize($a->Size + 1); $b->Resize($b->Size + 1);
-	   print ("//DIVpp ",$_[1]->to_Hex,' ',$_[2]->to_Hex,' ',$_[1]->Size,'.',$_[2]->Size," \n");
-	   print ("//DIVpp ",$a->to_Hex,' ',$b->to_Hex,' ',$a->Size,'.',$b->Size," \n");
-	   my $o=newsized($a); my $rem=newsized($a);
-	   if (!$_[2]->is_empty) { $o->Divide($a,$b,$rem); } # No division by zero
-	   #push @Lines, ("//DIV ",$_[1]{val}->to_Hex,' ',$_[2]->to_Hex,' ',$o->to_Hex,'.',$rem->to_Hex," \n");
-	   $_[0]{val}=$o; }
+sub VMUL {
+    # Multiply is a signed operation, so each operand is first widened by one (sign) bit
+    my ($res, $lhs, $rhs) = @_;
+    my ($x, $y) = map { my $w=$_->Clone; $w->Resize($w->Size + 1); $w } ($lhs, $rhs);
+    my $wide=Bit::Vector->new($lhs->Size + $rhs->Size + 1);
+    $wide->Multiply($x,$y);
+    my $low=newsized($lhs); $low->Interval_Copy($wide,0,0,$lhs->Size);   # keep the low bits, at the first operand's width
+    $res->{val}=$low;
+}
+sub VDIV {
+    # Args: result treeref, dividend treeref, divisor treeref, is_mod (true selects the remainder)
+    my ($res, $lhs, $rhs, $is_mod) = @_;
+    if ($rhs->{val}->is_empty) {  # Divide by zero: the result is just a newsized() vector
+	$res->{val}=newsized($lhs->{val});
+	return;
+    }
+    # Unsigned operands get one extra bit, else the divide will take them as signed numbers
+    my $num=$lhs->{val}->Clone; $num->Resize($num->Size + 1) unless $lhs->{signed};
+    my $den=$rhs->{val}->Clone; $den->Resize($den->Size + 1) unless $rhs->{signed};
+    my $quo=newsized($num); my $rem=newsized($num);
+    $quo->Divide($num,$den,$rem); # Divisor is non-zero here, checked above
+    my $picked = $is_mod ? $rem : $quo;
+    my $o=newsized($lhs->{val});
+    $o->Interval_Copy($picked,0,0,$lhs->{val}->Size);  # back down to the dividend's original width
+    $res->{val}=$o;
+}
 sub VPOW { # Power is a signed operation
-    my $a=$_[1]{val}->Clone; if (!$_[1]->{Signed}) { $a->Resize($_[1]{val}->Size + 1); }
-    my $b=$_[2]{val}->Clone; if (!$_[2]->{Signed}) { $b->Resize($_[2]{val}->Size + 1); }
+    my $a=$_[1]{val}->Clone; if (!$_[1]->{signed}) { $a->Resize($_[1]{val}->Size + 1); }
+    my $b=$_[2]{val}->Clone; if (!$_[2]->{signed}) { $b->Resize($_[2]{val}->Size + 1); }
     print "VVpow = ",$_[1]{val}->to_Hex," ** ",$_[2]{val}->to_Hex,"\n";
     my $mo=Bit::Vector->new($_[1]{val}->Size + 1);
     $mo->Power($a,$b);