diff --git a/Changes b/Changes index f44ef0621..6200b2026 100644 --- a/Changes +++ b/Changes @@ -12,6 +12,8 @@ The contributors that suggested a given feature are shown in []. Thanks! *** Better optimize large always block splitting, bug1244. [John Coiner] +*** Add new reloop optimization for repetitive assignment compression. + **** Fix internals to avoid 'using namespace std'. **** Fix Verilation performance issues, bug1316. [John Coiner] diff --git a/src/Makefile_obj.in b/src/Makefile_obj.in index 7ff7fa833..23124203a 100644 --- a/src/Makefile_obj.in +++ b/src/Makefile_obj.in @@ -217,6 +217,7 @@ RAW_OBJS = \ V3Param.o \ V3PreShell.o \ V3Premit.o \ + V3Reloop.o \ V3Scope.o \ V3Slice.o \ V3Split.o \ diff --git a/src/V3Options.cpp b/src/V3Options.cpp index ab09c7b3c..e82433884 100644 --- a/src/V3Options.cpp +++ b/src/V3Options.cpp @@ -723,6 +723,7 @@ void V3Options::parseOptsList(FileLine* fl, const string& optdir, int argc, char case 's': m_oSplit = flag; break; case 't': m_oLifePost = flag; break; case 'u': m_oSubst = flag; break; + case 'v': m_oReloop = flag; break; case 'x': m_oExpand = flag; break; case 'y': m_oAcycSimp = flag; break; case 'z': m_oLocalize = flag; break; @@ -1365,6 +1366,7 @@ void V3Options::optimize(int level) { m_oLife = flag; m_oLifePost = flag; m_oLocalize = flag; + m_oReloop = flag; m_oReorder = flag; m_oSplit = flag; m_oSubst = flag; diff --git a/src/V3Options.h b/src/V3Options.h index 6c169e672..c00ecff3c 100644 --- a/src/V3Options.h +++ b/src/V3Options.h @@ -159,6 +159,7 @@ class V3Options { bool m_oLifePost; // main switch: -Ot: delayed assignment elimination bool m_oLocalize; // main switch: -Oz: convert temps to local variables bool m_oInline; // main switch: -Oi: module inlining + bool m_oReloop; // main switch: -Ov: reform loops bool m_oReorder; // main switch: -Or: reorder assignments in blocks bool m_oSplit; // main switch: -Os: always assignment splitting bool m_oSubst; // main switch: -Ou: substitute expression temp values @@ 
-320,6 +321,7 @@ class V3Options { bool oLifePost() const { return m_oLifePost; } bool oLocalize() const { return m_oLocalize; } bool oInline() const { return m_oInline; } + bool oReloop() const { return m_oReloop; } bool oReorder() const { return m_oReorder; } bool oSplit() const { return m_oSplit; } bool oSubst() const { return m_oSubst; } diff --git a/src/V3Reloop.cpp b/src/V3Reloop.cpp new file mode 100644 index 000000000..b9acfd695 --- /dev/null +++ b/src/V3Reloop.cpp @@ -0,0 +1,266 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// DESCRIPTION: Verilator: Recreate loops to help pack caches +// +// Code available from: http://www.veripool.org/verilator +// +//************************************************************************* +// +// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can +// redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. +// +// Verilator is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +//************************************************************************* +// V3Reloop's Transformations: +// +// Each CFunc: +// Look for a series of assignments that would look better in a loop: +// +// ASSIGN(ARRAYREF(var, #), ARRAYREF(var, #)) +// ASSIGN(ARRAYREF(var, #+1), ARRAYREF(var, #+1)) +// -> +// Create __Vilp local variable +// FOR(__Vilp = low; __Vilp <= high; ++__Vilp) +// ASSIGN(ARRAYREF(var, __Vilp), ARRAYREF(var, __Vilp)) +// +// Likewise vector assign to the same constant converted to a loop. 
+// +//************************************************************************* + +#include "config_build.h" +#include "verilatedos.h" +#include +#include +#include +#include + +#include "V3Global.h" +#include "V3Reloop.h" +#include "V3Stats.h" +#include "V3Ast.h" + +#define RELOOP_MIN_ITERS 40 // Need at least this many loops to do this optimization + +//###################################################################### + +class ReloopVisitor : public AstNVisitor { +private: + // TYPES + typedef std::vector AssVec; + + // NODE STATE + // AstCFunc::user1p -> Var* for temp var, 0=not set yet + AstUser1InUse m_inuser1; + + // STATE + V3Double0 m_statReloops; // Statistic tracking + V3Double0 m_statReItems; // Statistic tracking + AstCFunc* m_cfuncp; // Current block + + AssVec m_mgAssignps; // List of assignments merging + AstNode* m_mgCfuncp; // Parent C function + AstNode* m_mgNextp; // Next node + AstNodeSel* m_mgSelLp; // Parent select, NULL = idle + AstNodeSel* m_mgSelRp; // Parent select, NULL = constant + AstNodeVarRef* m_mgVarrefLp; // Parent varref + AstNodeVarRef* m_mgVarrefRp; // Parent varref, NULL = constant + AstConst* m_mgConstRp; // Parent RHS constant, NULL = sel + uint32_t m_mgIndexLo; // Merge range + uint32_t m_mgIndexHi; // Merge range + + // METHODS + VL_DEBUG_FUNC; // Declare debug() + + AstVar* findCreateVarTemp(FileLine* fl, AstNode* nodep) { + AstVar* varp = VN_CAST(nodep->user1p(), Var); + if (!varp) { + string newvarname = string("__Vilp"); + varp = new AstVar(fl, AstVarType::STMTTEMP, + newvarname, VFlagLogicPacked(), 32); + if (!m_cfuncp) nodep->v3fatalSrc("Assignment not under a function"); + m_cfuncp->addInitsp(varp); + nodep->user1p(varp); + } + return varp; + } + void mergeEnd() { + if (!m_mgAssignps.empty()) { + uint32_t items = m_mgIndexHi - m_mgIndexLo + 1; + UINFO(9, "End merge iter="<= RELOOP_MIN_ITERS) { + UINFO(6, "Reloop merging items="<lhsp() != m_mgSelLp) bodyp->v3fatalSrc("Corrupt queue/state"); + FileLine* fl = 
bodyp->fileline(); + AstVar* itp = findCreateVarTemp(fl, m_mgCfuncp); + + AstNode* initp = new AstAssign(fl, new AstVarRef(fl, itp, true), + new AstConst(fl, m_mgIndexLo)); + AstNode* condp = new AstLte(fl, new AstVarRef(fl, itp, false), + new AstConst(fl, m_mgIndexHi)); + AstNode* incp = new AstAssign(fl, new AstVarRef(fl, itp, true), + new AstAdd(fl, new AstConst(fl, 1), + new AstVarRef(fl, itp, false))); + AstWhile* whilep = new AstWhile(fl, condp, NULL, incp); + initp->addNext(whilep); + bodyp->replaceWith(initp); + whilep->addBodysp(bodyp); + + // Replace constant index with new loop index + AstNode* lbitp = m_mgSelLp->bitp(); + lbitp->replaceWith(new AstVarRef(fl, itp, false)); + lbitp->deleteTree(); VL_DANGLING(lbitp); + if (m_mgSelRp) { // else constant and no replace + AstNode* rbitp = m_mgSelRp->bitp(); + rbitp->replaceWith(new AstVarRef(fl, itp, false)); + rbitp->deleteTree(); VL_DANGLING(rbitp); + } + if (debug()>=9) initp->dumpTree(cout, "-new: "); + if (debug()>=9) whilep->dumpTree(cout, "-new: "); + + // Remove remaining assigns + for (AssVec::iterator it=m_mgAssignps.begin(); it!=m_mgAssignps.end(); ++it) { + AstNodeAssign* assp = *it; + if (assp != bodyp) { + assp->unlinkFrBack()->deleteTree(); VL_DANGLING(assp); + } + } + } + // Setup for next merge + m_mgAssignps.clear(); + m_mgSelLp = NULL; + m_mgSelRp = NULL; + m_mgVarrefLp = NULL; + m_mgVarrefRp = NULL; + m_mgConstRp = NULL; + } + } + + // VISITORS + virtual void visit(AstCFunc* nodep) { + m_cfuncp = nodep; + iterateChildren(nodep); mergeEnd(); // Finish any merge still pending before leaving this function + m_cfuncp = NULL; + } + virtual void visit(AstNodeAssign* nodep) { + if (!m_cfuncp) return; + + // Left select WordSel or ArraySel + AstNodeSel* lselp = VN_CAST(nodep->lhsp(), NodeSel); + if (!lselp) { mergeEnd(); return; } // Not ever merged + // Of a constant index + AstConst* lbitp = VN_CAST(lselp->bitp(), Const); + if (!lbitp) { mergeEnd(); return; } + uint32_t index = lbitp->toUInt(); + // Of variable + AstNodeVarRef* lvarrefp = VN_CAST(lselp->fromp(), 
NodeVarRef); + if (!lvarrefp) { mergeEnd(); return; } + + // RHS is a constant or a select + AstConst* rconstp = VN_CAST(nodep->rhsp(), Const); + AstNodeSel* rselp = VN_CAST(nodep->rhsp(), NodeSel); + AstNodeVarRef* rvarrefp = NULL; + if (rconstp) { // Ok + } else { + if (!rselp) { mergeEnd(); return; } + AstConst* rbitp = VN_CAST(rselp->bitp(), Const); + rvarrefp = VN_CAST(rselp->fromp(), NodeVarRef); + if (!rbitp || rbitp->toUInt() != index + || !rvarrefp + || lvarrefp->varp() == rvarrefp->varp()) { + mergeEnd(); return; + } + } + + if (m_mgSelLp) { // Old merge + if (m_mgCfuncp == m_cfuncp + && m_mgNextp == nodep + && m_mgSelLp->same(lselp) + && m_mgVarrefLp->same(lvarrefp) + && (m_mgConstRp + ? (rconstp && m_mgConstRp->same(rconstp)) + : (rselp + && m_mgSelRp->same(rselp) + && m_mgVarrefRp->same(rvarrefp))) + && (index == m_mgIndexLo-1 + || index == m_mgIndexHi+1)) { + // Sequentially next to last assign; continue merge + if (index == m_mgIndexLo-1) m_mgIndexLo = index; + else if (index == m_mgIndexHi+1) m_mgIndexHi = index; + UINFO(9, "Continue merge i="<nextp(); + return; + } + else { + // This assign doesn't merge with previous assign, + // but should start a new merge + mergeEnd(); + } + } + + // Merge start + m_mgAssignps.push_back(nodep); + m_mgCfuncp = m_cfuncp; + m_mgNextp = nodep->nextp(); + m_mgSelLp = lselp; + m_mgSelRp = rselp; + m_mgVarrefLp = lvarrefp; + m_mgVarrefRp = rvarrefp; + m_mgConstRp = rconstp; + m_mgIndexLo = index; + m_mgIndexHi = index; + UINFO(9, "Start merge i="<= 6); +} diff --git a/src/V3Reloop.h b/src/V3Reloop.h new file mode 100644 index 000000000..ad9eb58ff --- /dev/null +++ b/src/V3Reloop.h @@ -0,0 +1,35 @@ +// -*- mode: C++; c-file-style: "cc-mode" -*- +//************************************************************************* +// DESCRIPTION: Verilator: Recreate loops to help pack caches +// +// Code available from: http://www.veripool.org/verilator +// 
+//************************************************************************* +// +// Copyright 2003-2018 by Wilson Snyder. This program is free software; you can +// redistribute it and/or modify it under the terms of either the GNU +// Lesser General Public License Version 3 or the Perl Artistic License +// Version 2.0. +// +// Verilator is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +//************************************************************************* + +#ifndef _V3RELOOP_H_ +#define _V3RELOOP_H_ 1 +#include "config_build.h" +#include "verilatedos.h" +#include "V3Error.h" +#include "V3Ast.h" + +//============================================================================ + +class V3Reloop { +public: + static void reloopAll(AstNetlist* nodep); +}; + +#endif // Guard diff --git a/src/Verilator.cpp b/src/Verilator.cpp index 637fee721..992ce9a1c 100644 --- a/src/Verilator.cpp +++ b/src/Verilator.cpp @@ -75,6 +75,7 @@ #include "V3ParseSym.h" #include "V3PreShell.h" #include "V3Premit.h" +#include "V3Reloop.h" #include "V3Scope.h" #include "V3Slice.h" #include "V3Split.h" @@ -486,6 +487,14 @@ void process () { V3Dead::deadifyAll(v3Global.rootp()); } + if (!v3Global.opt.lintOnly() + && !v3Global.opt.xmlOnly() + && v3Global.opt.oReloop()) { + // Reform loops to reduce code size + // Must be after all Sel/array index based optimizations + V3Reloop::reloopAll(v3Global.rootp()); + } + if (!v3Global.opt.lintOnly() && !v3Global.opt.xmlOnly()) { // Fix very deep expressions diff --git a/test_regress/t/t_EXAMPLE.v b/test_regress/t/t_EXAMPLE.v index 9a8fca418..8b70d3c1b 100644 --- a/test_regress/t/t_EXAMPLE.v +++ b/test_regress/t/t_EXAMPLE.v @@ -13,7 +13,7 @@ // please note it here, otherwise:** // // This file ONLY is placed into the Public Domain, for any use, -// without warranty, 
2017 by ____YOUR_NAME_HERE____. +// without warranty, 2018 by ____YOUR_NAME_HERE____. module t (/*AUTOARG*/ // Inputs diff --git a/test_regress/t/t_reloop_cam.pl b/test_regress/t/t_reloop_cam.pl new file mode 100755 index 000000000..53ce8eab1 --- /dev/null +++ b/test_regress/t/t_reloop_cam.pl @@ -0,0 +1,29 @@ +#!/usr/bin/perl +if (!$::Driver) { use FindBin; exec("$FindBin::Bin/bootstrap.pl", @ARGV, $0); die; } +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# Copyright 2003 by Wilson Snyder. This program is free software; you can +# redistribute it and/or modify it under the terms of either the GNU +# Lesser General Public License Version 3 or the Perl Artistic License +# Version 2.0. + +scenarios(simulator => 1); + +compile( + verilator_flags2 => ["-unroll-count 1024", + "--stats"], + ); + +execute( + check_finished => 1, + ); + +if ($Self->{vlt_all}) { + file_grep ($Self->{stats}, qr/Optimizations, Reloop iterations\s+(\d+)/i, + 768); + file_grep ($Self->{stats}, qr/Optimizations, Reloops\s+(\d+)/i, + 3); +} + +ok(1); +1; diff --git a/test_regress/t/t_reloop_cam.v b/test_regress/t/t_reloop_cam.v new file mode 100644 index 000000000..1b8fc5b76 --- /dev/null +++ b/test_regress/t/t_reloop_cam.v @@ -0,0 +1,179 @@ +// DESCRIPTION: Verilator: Verilog Test module +// +// This file ONLY is placed into the Public Domain, for any use, +// without warranty, 2018 by Wilson Snyder. + +module t (/*AUTOARG*/ + // Inputs + clk + ); + input clk; + + integer cyc=0; + reg [63:0] crc; + reg [63:0] sum; + reg rst; + + // Two phases, random so nothing optimizes away, and focused so get hits + logic inval; + wire [30:0] wdat = (cyc < 50 ? crc[30:0] : {29'h0, crc[1:0]}); + wire [30:0] cdat = (cyc < 50 ? crc[30:0] : {29'h0, crc[1:0]}); + wire wdat_val = 1'b1; + wire camen = crc[32]; + wire ren = crc[33]; + wire wen = crc[34]; + wire [7:0] rwidx = (cyc < 50 ? 
crc[63:56] : {6'h0, crc[57:56]}); + + /*AUTOWIRE*/ + // Beginning of automatic wires (for undeclared instantiated-module outputs) + logic hit_d2r; // From cam of cam.v + logic [7:0] hitidx_d1r; // From cam of cam.v + logic [255:0] hitvec_d1r; // From cam of cam.v + logic [30:0] rdat_d2r; // From cam of cam.v + logic rdat_val_d2r; // From cam of cam.v + // End of automatics + + cam cam (/*AUTOINST*/ + // Outputs + .hitvec_d1r (hitvec_d1r[255:0]), + .hitidx_d1r (hitidx_d1r[7:0]), + .hit_d2r (hit_d2r), + .rdat_d2r (rdat_d2r[30:0]), + .rdat_val_d2r (rdat_val_d2r), + // Inputs + .clk (clk), + .rst (rst), + .camen (camen), + .inval (inval), + .cdat (cdat[30:0]), + .ren (ren), + .wen (wen), + .wdat (wdat[30:0]), + .wdat_val (wdat_val), + .rwidx (rwidx[7:0])); + + // Aggregate outputs into a single result vector + wire [63:0] result = {hitvec_d1r[15:0], 15'h0, hit_d2r, rdat_val_d2r, rdat_d2r}; + + // Test loop + always @ (posedge clk) begin +`ifdef TEST_VERBOSE + $write("[%0t] cyc==%0d crc=%x result=%x\n",$time, cyc, crc, result); +`endif + cyc <= cyc + 1; + crc <= {crc[62:0], crc[63]^crc[2]^crc[0]}; + sum <= result ^ {sum[62:0],sum[63]^sum[2]^sum[0]}; + if (cyc==0) begin + // Setup + crc <= 64'h5aef0c8d_d70a4497; + sum <= '0; + rst <= 1'b1; + end + else if (cyc<10) begin + sum <= '0; + rst <= 1'b0; + end + else if (cyc==70) begin + inval <= 1'b1; + end + else if (cyc==71) begin + inval <= 1'b0; + end + else if (cyc==99) begin + $write("[%0t] cyc==%0d crc=%x sum=%x\n",$time, cyc, crc, sum); + if (crc !== 64'hc77bb9b3784ea091) $stop; +`define EXPECTED_SUM 64'h5182640870b07199 + if (sum !== `EXPECTED_SUM) $stop; + $write("*-* All Finished *-*\n"); + $finish; + end + end + +endmodule + +module cam + ( + input clk, + input rst, + + input camen, + input inval, + input [30:0] cdat, + output logic [255:0] hitvec_d1r, + output logic [7:0] hitidx_d1r, + output logic hit_d2r, + + input ren, + input wen, + input [30:0] wdat, + input wdat_val, + input [7:0] rwidx, + output logic 
[30:0] rdat_d2r, + output logic rdat_val_d2r + ); + + logic [30:0] rdat_d2r; + logic camen_d1r; + logic inval_d1r; + logic ren_d1r; + logic wen_d1r; + logic [7:0] rwidx_d1r; + logic [30:0] cdat_d1r; + logic [30:0] wdat_d1r; + logic wdat_val_d1r; + + logic [30:0] wdat; + + always_ff @(posedge clk) begin + camen_d1r <= camen; + inval_d1r <= inval; + ren_d1r <= ren; + wen_d1r <= wen; + + cdat_d1r <= cdat; + rwidx_d1r <= rwidx; + wdat_d1r <= wdat; + wdat_val_d1r <= wdat_val; + end + + typedef struct packed { + logic [30:0] data; + logic valid; + } entry_t; + entry_t [255:0] entries; + + always_ff @(posedge clk) begin + if (camen_d1r) begin + for (int i = 0; i < 256; i = i + 1) begin + hitvec_d1r[i] <= entries[i].valid & (entries[i].data == cdat_d1r); + end + end + end + always_ff @(posedge clk) begin + hit_d2r <= | hitvec_d1r; + end + + always_ff @(posedge clk) begin + if (rst) begin + for (int i = 0; i < 256; i = i + 1) begin + entries[i] <= '0; + end + end + else if (wen_d1r) begin + entries[rwidx_d1r] <= '{valid:wdat_val_d1r, data:wdat_d1r}; + end + else if (inval_d1r) begin + for (int i = 0; i < 256; i = i + 1) begin + entries[i] <= '{valid:'0, data:entries[i].data}; + end + end + end + + always_ff @(posedge clk) begin + if (ren_d1r) begin + rdat_d2r <= entries[rwidx_d1r].data; + rdat_val_d2r <= entries[rwidx_d1r].valid; + end + end + +endmodule