diff --git a/passes/opt/Makefile.inc b/passes/opt/Makefile.inc index 461412422..6d932bbca 100644 --- a/passes/opt/Makefile.inc +++ b/passes/opt/Makefile.inc @@ -23,6 +23,8 @@ OBJS += passes/opt/opt_ffinv.o OBJS += passes/opt/pmux2shiftx.o OBJS += passes/opt/muxpack.o OBJS += passes/opt/opt_balance_tree.o +OBJS += passes/opt/opt_parallel_prefix.o +OBJS += passes/opt/opt_prienc.o OBJS += passes/opt/peepopt.o GENFILES += passes/opt/peepopt_pm.h diff --git a/passes/opt/opt_parallel_prefix.cc b/passes/opt/opt_parallel_prefix.cc new file mode 100644 index 000000000..ae424c479 --- /dev/null +++ b/passes/opt/opt_parallel_prefix.cc @@ -0,0 +1,532 @@ +/* + * yosys -- Yosys Open SYnthesis Suite + * + * Copyright (C) 2026 Akash Levy + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + */ + +#include "kernel/yosys.h" +#include "kernel/sigtools.h" + +USING_YOSYS_NAMESPACE +PRIVATE_NAMESPACE_BEGIN + +enum class Topology { KOGGE_STONE, SKLANSKY, BRENT_KUNG, HAN_CARLSON }; + +static const char* topology_name(Topology t) { + switch (t) { + case Topology::KOGGE_STONE: return "Kogge-Stone"; + case Topology::SKLANSKY: return "Sklansky"; + case Topology::BRENT_KUNG: return "Brent-Kung"; + case Topology::HAN_CARLSON: return "Han-Carlson"; + } + return "?"; +} + +// One linearized cascade ready to be rebuilt as a prefix network. +struct PrefixChain { + IdString op; + bool a_signed = false; + bool b_signed = false; + // leaves[0..N-1] are the fresh operands fed into the original cascade. + // The original chain computes prefix[i] = leaves[0] op leaves[1] op ... op leaves[i]. + vector leaves; + // cells[i] is the original cell whose Y is prefix[i+1] (i in [0..N-2]). + vector cells; + // demands[i] = the original SigSpec that must equal prefix[i] after rewrite. + // i ranges over [1..N-1]; i=0 is never demanded (prefix[0] is a leaf). + dict demands; + // First-cell attributes propagated to emitted cells (for src tracking). + dict ref_attributes; +}; + +// Owns cell emission for a single chain rewrite. Tracks per-signal depth so +// the worker can log the critical-path depth of the resulting network. +struct PrefixNet { + Module* m; + IdString op; + bool a_signed; + bool b_signed; + const dict* ref_attributes; + Cell* ref_cell; + dict* cell_count; + dict depth; + int max_depth = 0; + + PrefixNet(Module* m, const PrefixChain& chain, dict* cc) + : m(m), op(chain.op), a_signed(chain.a_signed), b_signed(chain.b_signed), + ref_attributes(&chain.ref_attributes), ref_cell(chain.cells.front()), cell_count(cc) {} + + int depth_of(const SigSpec& s) { + auto it = depth.find(s); + return it == depth.end() ? 
0 : it->second; + } + + SigSpec emit(const SigSpec& a, const SigSpec& b) { + // Match opt_balance_tree's natural-width convention so wreduce/equiv_opt + // behave identically. The cell itself handles A/B extension via the + // signedness parameters; we never pad operands manually. + int out_width; + if (op == ID($add)) + out_width = std::max(GetSize(a), GetSize(b)) + 1; + else if (op == ID($mul)) + out_width = GetSize(a) + GetSize(b); + else + out_width = std::max(GetSize(a), GetSize(b)); + + Cell* cell = ref_cell; + Wire* y = m->addWire(NEW_ID2_SUFFIX("pp_y"), out_width); + Cell* c = m->addCell(NEW_ID2_SUFFIX("pp"), op); + c->attributes = *ref_attributes; + c->setPort(ID::A, a); + c->setPort(ID::B, b); + c->setPort(ID::Y, y); + c->fixup_parameters(); + c->setParam(ID::A_SIGNED, a_signed); + c->setParam(ID::B_SIGNED, b_signed); + (*cell_count)[op]++; + + SigSpec y_sig(y); + int d = std::max(depth_of(a), depth_of(b)) + 1; + depth[y_sig] = d; + if (d > max_depth) max_depth = d; + return y_sig; + } +}; + +struct OptParallelPrefixWorker { + Module* module; + SigMap sigmap; + Topology topology; + + dict sig_to_driver; + dict> sig_to_sinks; + pool output_port_bits; + + dict cell_count; + int chains_built = 0; + int max_depth = 0; + int leaves_total = 0; + + OptParallelPrefixWorker(Module* m, Topology t) : module(m), sigmap(m), topology(t) { + build_indexes(); + } + + void build_indexes() { + for (auto cell : module->cells()) { + for (auto& conn : cell->connections()) { + SigSpec s = sigmap(conn.second); + if (cell->output(conn.first)) + sig_to_driver[s] = cell; + if (cell->input(conn.first)) { + sig_to_sinks[s].insert(cell); + for (auto bit : s) + sig_to_sinks[SigSpec(bit)].insert(cell); + } + } + } + for (auto wire : module->wires()) { + if (!wire->port_output) continue; + SigSpec s = sigmap(wire); + for (auto bit : s) output_port_bits.insert(bit); + } + } + + // A cell can participate in a prefix chain if it is of the right op and its + // declared Y width is large 
enough to fit the natural result width. + // (Truncating chains can change semantics, so we refuse them - same rule as + // opt_balance_tree's is_right_type.) + bool is_chainable(Cell* c, IdString op) { + if (c->type != op) return false; + int y_width = c->getParam(ID::Y_WIDTH).as_int(); + int a_width = c->getParam(ID::A_WIDTH).as_int(); + int b_width = c->getParam(ID::B_WIDTH).as_int(); + int natural_width; + if (op == ID($add)) + natural_width = std::max(a_width, b_width); // ignore carry bit (same as opt_balance_tree) + else if (op == ID($mul)) + natural_width = a_width + b_width; + else + natural_width = std::max(a_width, b_width); + return y_width >= natural_width; + } + + // A signal is a "leaf" w.r.t. an ongoing chain iff it is NOT produced by a + // chainable cell of the same op + signedness that is free to be merged. + bool is_leaf(const SigSpec& sig, IdString op, bool a_signed, bool b_signed, const pool& claimed) { + auto it = sig_to_driver.find(sig); + if (it == sig_to_driver.end()) return true; + Cell* drv = it->second; + if (claimed.count(drv)) return true; + if (!is_chainable(drv, op)) return true; + if (drv->getParam(ID::A_SIGNED).as_bool() != a_signed) return true; + if (drv->getParam(ID::B_SIGNED).as_bool() != b_signed) return true; + return false; + } + + // Greedy forward growth: extend the chain as long as the current running + // output drives EXACTLY ONE chainable successor whose other operand is itself + // a leaf. Any additional fanout doesn't prevent growth; it just marks the + // current node as a demand point later. 
+	// Appends cells to `chain` for as long as the chain's last cell has exactly
+	// one acceptable successor.
+	//
+	// chain   - chain under construction; chain.cells must be non-empty. On
+	//           return chain.cells / chain.leaves hold the extended chain.
+	// claimed - cells already owned by a previously accepted chain; they are
+	//           never used as successors.
+	//
+	// A successor must be unclaimed, chainable for chain.op, have the chain's
+	// signedness, take the current Y on exactly one of A/B (as a whole signal),
+	// and have a leaf on its other input.
+	void extend_chain(PrefixChain& chain, const pool<Cell*>& claimed) {
+		while (true) {
+			Cell* cur = chain.cells.back();
+			SigSpec cur_Y = sigmap(cur->getPort(ID::Y));
+
+			auto sinks_it = sig_to_sinks.find(cur_Y);
+			if (sinks_it == sig_to_sinks.end()) return;
+
+			// is_chainable() lets an $add drop its carry-out (Y_WIDTH may equal
+			// max(A_WIDTH, B_WIDTH)). The rebuilt network keeps every carry, so
+			// its result only matches the original in the bits below that
+			// truncation. Feeding such a truncated sum into a wider $add would
+			// therefore change the upper bits, e.g.
+			//   s1[3:0] = a + b;  s2[7:0] = s1 + c;
+			// so a carry-dropping cell may only be followed by a cell that is
+			// no wider than itself.
+			const int cur_y_width = cur->getParam(ID::Y_WIDTH).as_int();
+			const bool cur_drops_carry = chain.op == ID($add) &&
+				cur_y_width < std::max(cur->getParam(ID::A_WIDTH).as_int(),
+				                       cur->getParam(ID::B_WIDTH).as_int()) + 1;
+
+			Cell* next = nullptr;
+			SigSpec next_leaf;
+			int chainable_count = 0;
+			for (auto s : sinks_it->second) {
+				if (s == cur) continue;
+				if (claimed.count(s)) continue;
+				if (!is_chainable(s, chain.op)) continue;
+				if (s->getParam(ID::A_SIGNED).as_bool() != chain.a_signed) continue;
+				if (s->getParam(ID::B_SIGNED).as_bool() != chain.b_signed) continue;
+
+				// Widening after a dropped carry is not associative-safe; leave
+				// this sink outside the chain (it then counts as ordinary fanout).
+				if (cur_drops_carry && s->getParam(ID::Y_WIDTH).as_int() > cur_y_width)
+					continue;
+
+				SigSpec sA = sigmap(s->getPort(ID::A));
+				SigSpec sB = sigmap(s->getPort(ID::B));
+				SigSpec other;
+				if (sA == cur_Y && sB != cur_Y) other = sB;
+				else if (sB == cur_Y && sA != cur_Y) other = sA;
+				else continue; // Y on both inputs, or partial overlap; not chain-linear.
+
+				if (!is_leaf(other, chain.op, chain.a_signed, chain.b_signed, claimed))
+					continue;
+
+				chainable_count++;
+				if (chainable_count > 1) break;
+				next = s;
+				next_leaf = other;
+			}
+
+			// Zero or several acceptable successors: the chain ends here.
+			if (chainable_count != 1) return;
+
+			chain.leaves.push_back(next_leaf);
+			chain.cells.push_back(next);
+		}
+	}
+
+	// After the chain is built, mark each cell whose output is consumed outside
+	// the chain (port output or any non-next-cell sink) as a demand point.
+	void detect_demands(PrefixChain& chain) {
+		int N = GetSize(chain.cells);
+		for (int i = 0; i < N; i++) {
+			Cell* c = chain.cells[i];
+			SigSpec Y = sigmap(c->getPort(ID::Y));
+
+			bool demanded = false;
+			for (auto bit : Y) {
+				if (output_port_bits.count(bit)) { demanded = true; break; }
+			}
+
+			if (!demanded) {
+				Cell* next_chain_cell = (i + 1 < N) ? chain.cells[i+1] : nullptr;
+				auto sinks_it = sig_to_sinks.find(Y);
+				if (sinks_it != sig_to_sinks.end()) {
+					for (auto s : sinks_it->second) {
+						if (s != next_chain_cell) { demanded = true; break; }
+					}
+				}
+				// Bit-level fanout (e.g. someone reads Y[3]): also a demand.
+ if (!demanded) { + for (auto bit : Y) { + auto bit_it = sig_to_sinks.find(SigSpec(bit)); + if (bit_it == sig_to_sinks.end()) continue; + for (auto s : bit_it->second) { + if (s != next_chain_cell) { demanded = true; break; } + } + if (demanded) break; + } + } + } + + // The chain's terminal cell is the chain's reason for existing. + if (i == N - 1) demanded = true; + + if (demanded) { + // chain.cells[i] produces prefix[i+1] (in 0-based leaf indexing). + chain.demands[i + 1] = chain.cells[i]->getPort(ID::Y); + } + } + } + + // ------------ topology builders ------------ + // Each returns prefix[0..N-1] where prefix[i] = reduce(leaves[0..i]). + // They are pure dispatchers over PrefixNet::emit and therefore work for all + // five supported ops. + + vector build_kogge_stone(PrefixNet& net, const vector& leaves) { + int N = GetSize(leaves); + vector cur = leaves; + for (int offset = 1; offset < N; offset *= 2) { + vector nxt(N); + for (int i = 0; i < N; i++) { + if (i >= offset) nxt[i] = net.emit(cur[i - offset], cur[i]); + else nxt[i] = cur[i]; + } + cur = std::move(nxt); + } + return cur; + } + + vector build_sklansky(PrefixNet& net, const vector& leaves) { + int N = GetSize(leaves); + vector cur = leaves; + for (int group = 1; group < N; group *= 2) { + vector nxt(N); + for (int i = 0; i < N; i++) { + int block = i / (2 * group); + int within = i - block * 2 * group; + if (within >= group) { + int boundary = block * 2 * group + group - 1; + nxt[i] = net.emit(cur[boundary], cur[i]); + } else { + nxt[i] = cur[i]; + } + } + cur = std::move(nxt); + } + return cur; + } + + vector build_brent_kung(PrefixNet& net, const vector& leaves) { + int N = GetSize(leaves); + vector cur = leaves; + // Upsweep: classic reduction tree, touches indices (2*stride - 1) + k*(2*stride). + for (int stride = 1; stride < N; stride *= 2) { + for (int i = 2*stride - 1; i < N; i += 2*stride) + cur[i] = net.emit(cur[i - stride], cur[i]); + } + // Downsweep: fill in the holes left by upsweep. 
+		// Touches indices (3*stride - 1) + k*(2*stride), going from coarse stride down to 1.
+		int max_stride = 1;
+		while (3 * max_stride * 2 - 1 < N) max_stride *= 2;
+		for (int stride = max_stride; stride >= 1; stride /= 2) {
+			for (int i = 3*stride - 1; i < N; i += 2*stride)
+				cur[i] = net.emit(cur[i - stride], cur[i]);
+		}
+		return cur;
+	}
+
+	// Han-Carlson prefix network over `leaves`.
+	// Returns prefix[0..N-1] with prefix[i] = reduce(leaves[0..i]); every
+	// combining cell is created through net.emit().
+	vector<SigSpec> build_han_carlson(PrefixNet& net, const vector<SigSpec>& leaves) {
+		int N = GetSize(leaves);
+		vector<SigSpec> cur = leaves;
+		// Step 1: pairwise reduce into odd indices.
+		for (int i = 1; i < N; i += 2)
+			cur[i] = net.emit(cur[i - 1], cur[i]);
+		// Step 2: Kogge-Stone on odd indices (offset doubled in original space).
+		// The odd indices in [1, N) are 1, 3, 5, ... so there are N / 2 of them.
+		const int num_odd = N / 2;
+		for (int off_odd = 1; off_odd < num_odd; off_odd *= 2) {
+			int off = 2 * off_odd;
+			// Read from `cur`, write into a copy, so every node of this level
+			// combines values from the previous level only.
+			vector<SigSpec> nxt = cur;
+			for (int i = 1; i < N; i += 2) {
+				if (i >= off) nxt[i] = net.emit(cur[i - off], cur[i]);
+			}
+			cur = std::move(nxt);
+		}
+		// Step 3: fill in even indices from their left-neighbour odd prefix.
+		for (int i = 2; i < N; i += 2)
+			cur[i] = net.emit(cur[i - 1], cur[i]);
+		return cur;
+	}
+
+	// Dispatches to the builder for the worker's configured topology.
+	vector<SigSpec> build_network(PrefixNet& net, const vector<SigSpec>& leaves) {
+		switch (topology) {
+			case Topology::KOGGE_STONE: return build_kogge_stone(net, leaves);
+			case Topology::SKLANSKY: return build_sklansky(net, leaves);
+			case Topology::BRENT_KUNG: return build_brent_kung(net, leaves);
+			case Topology::HAN_CARLSON: return build_han_carlson(net, leaves);
+		}
+		return {};
+	}
+
+	void transform_chain(const PrefixChain& chain) {
+		PrefixNet net(module, chain, &cell_count);
+		// Seed prefix[0] = leaves[0]; treat leaves as depth-0 signals.
+ for (auto& l : chain.leaves) net.depth[l] = 0; + + vector prefix = build_network(net, chain.leaves); + + log_debug(" Built %s network with %d leaves -> depth %d\n", + topology_name(topology), GetSize(chain.leaves), net.max_depth); + + // Wire each demanded prefix to the original destination signal, matching + // opt_balance_tree's width/sign-extension recipe so wreduce can clean up. + for (auto& d : chain.demands) { + int i = d.first; + SigSpec dst = d.second; + SigSpec src = prefix[i]; + int w = std::min(GetSize(dst), GetSize(src)); + module->connect(dst.extract(0, w), src.extract(0, w)); + if (GetSize(dst) > w) { + SigBit pad = (chain.a_signed || chain.b_signed) ? src[w - 1] : SigBit(State::S0); + module->connect(dst.extract(w, GetSize(dst) - w), + SigSpec(pad, GetSize(dst) - w)); + } + } + + if (net.max_depth > max_depth) max_depth = net.max_depth; + } + + void run(const vector& ops) { + pool claimed; + vector chains; + + // Snapshot cell list once: we'll be adding cells later (the network) and + // don't want to re-scan them. + vector initial_cells(module->cells().begin(), module->cells().end()); + + for (auto op : ops) { + for (auto c : initial_cells) { + if (claimed.count(c)) continue; + if (!is_chainable(c, op)) continue; + + bool a_signed = c->getParam(ID::A_SIGNED).as_bool(); + bool b_signed = c->getParam(ID::B_SIGNED).as_bool(); + SigSpec A = sigmap(c->getPort(ID::A)); + SigSpec B = sigmap(c->getPort(ID::B)); + + // A head is a cell whose BOTH operands are leaves. That gives us + // the start of a maximal linear chain. 
+ if (!is_leaf(A, op, a_signed, b_signed, claimed)) continue; + if (!is_leaf(B, op, a_signed, b_signed, claimed)) continue; + + PrefixChain chain; + chain.op = op; + chain.a_signed = a_signed; + chain.b_signed = b_signed; + chain.leaves.push_back(A); + chain.leaves.push_back(B); + chain.cells.push_back(c); + chain.ref_attributes = c->attributes; + + extend_chain(chain, claimed); + detect_demands(chain); + + // Only rewrite chains with 2+ demand points; single-output + // reductions are opt_balance_tree's job. + if (GetSize(chain.demands) < 2) continue; + + log_debug(" Candidate chain of %d leaves, %d demands (op=%s)\n", + GetSize(chain.leaves), GetSize(chain.demands), log_id(op)); + + for (auto cc : chain.cells) claimed.insert(cc); + chains.push_back(std::move(chain)); + } + } + + for (auto& chain : chains) { + transform_chain(chain); + chains_built++; + leaves_total += GetSize(chain.leaves); + } + + // Remove the original chain cells. + for (auto c : claimed) module->remove(c); + } +}; + +struct OptParallelPrefixPass : public Pass { + OptParallelPrefixPass() : Pass("opt_parallel_prefix", + "rebuild $add/$and/$or/$xor/$mul cascades as parallel-prefix networks") {} + + void help() override { + // |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---| + log("\n"); + log(" opt_parallel_prefix [options] [selection]\n"); + log("\n"); + log("This pass detects linear cascades of an associative operator (add, and,\n"); + log("or, xor, mul) where two or more intermediate results in the cascade are\n"); + log("consumed externally (port outputs or non-chain fanout) and rebuilds the\n"); + log("cascade as a parallel-prefix network. 
Intermediate prefix nodes are\n"); + log("shared across all demanded outputs so the cost is O(N log N) cells (or\n"); + log("less, depending on topology) instead of N independent balanced trees.\n"); + log("\n"); + log("Cascades with fewer than two demanded prefix points are left alone for\n"); + log("opt_balance_tree to handle.\n"); + log("\n"); + log(" -arith\n"); + log(" only convert arithmetic cells ($add, $mul).\n"); + log("\n"); + log(" -logic\n"); + log(" only convert logic cells ($and, $or, $xor).\n"); + log("\n"); + log(" -kogge-stone\n"); + log(" use the Kogge-Stone topology (default). Minimum depth log2(N),\n"); + log(" approximately N*log2(N) cells, fanout 2.\n"); + log("\n"); + log(" -sklansky\n"); + log(" use the Sklansky topology. Minimum depth log2(N), approximately\n"); + log(" (N/2)*log2(N) cells, fanout up to N/2.\n"); + log("\n"); + log(" -brent-kung\n"); + log(" use the Brent-Kung topology. Depth 2*log2(N)-2, approximately\n"); + log(" 2*N cells, fanout 2.\n"); + log("\n"); + log(" -han-carlson\n"); + log(" use the Han-Carlson topology. 
Depth log2(N)+1, hybrid between\n"); + log(" Kogge-Stone (on odd indices) and Brent-Kung's outer layers.\n"); + log("\n"); + } + + void execute(std::vector args, RTLIL::Design *design) override { + log_header(design, "Executing OPT_PARALLEL_PREFIX pass (cell cascades to prefix networks).\n"); + + vector cell_types = {ID($and), ID($or), ID($xor), ID($add), ID($mul)}; + Topology topology = Topology::KOGGE_STONE; + + size_t argidx; + for (argidx = 1; argidx < args.size(); argidx++) { + if (args[argidx] == "-arith") { cell_types = {ID($add), ID($mul)}; continue; } + if (args[argidx] == "-logic") { cell_types = {ID($and), ID($or), ID($xor)}; continue; } + if (args[argidx] == "-kogge-stone") { topology = Topology::KOGGE_STONE; continue; } + if (args[argidx] == "-sklansky") { topology = Topology::SKLANSKY; continue; } + if (args[argidx] == "-brent-kung") { topology = Topology::BRENT_KUNG; continue; } + if (args[argidx] == "-han-carlson") { topology = Topology::HAN_CARLSON; continue; } + break; + } + extra_args(args, argidx, design); + + log("Topology: %s\n", topology_name(topology)); + + dict total_cells; + int total_chains = 0; + int total_leaves = 0; + int total_max_depth = 0; + for (auto module : design->selected_modules()) { + OptParallelPrefixWorker worker(module, topology); + worker.run(cell_types); + for (auto& kv : worker.cell_count) total_cells[kv.first] += kv.second; + total_chains += worker.chains_built; + total_leaves += worker.leaves_total; + if (worker.max_depth > total_max_depth) total_max_depth = worker.max_depth; + } + + log("Rewrote %d chain(s) covering %d leaves (max network depth %d).\n", + total_chains, total_leaves, total_max_depth); + for (auto op : cell_types) + log(" Emitted %d %s cells.\n", total_cells[op], log_id(op)); + + Yosys::run_pass("clean -purge"); + } +} OptParallelPrefixPass; + +PRIVATE_NAMESPACE_END diff --git a/passes/opt/opt_prienc.cc b/passes/opt/opt_prienc.cc new file mode 100644 index 000000000..89a0a6040 --- /dev/null +++ 
b/passes/opt/opt_prienc.cc @@ -0,0 +1,604 @@ +/* + * yosys -- Yosys Open SYnthesis Suite + * + * Copyright (C) 2026 Akash Levy + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#include "kernel/yosys.h" +#include "kernel/sigtools.h" +#include "kernel/consteval.h" +#include + +USING_YOSYS_NAMESPACE +PRIVATE_NAMESPACE_BEGIN + +// Priority-encoder variants the pass recognises. +enum class PEVariant { NONE, CLZ_FULL, CLZ_SHORT, CTZ_FULL, CTZ_SHORT }; + +static const char* variant_name(PEVariant v) { + switch (v) { + case PEVariant::CLZ_FULL: return "clz_full"; + case PEVariant::CLZ_SHORT: return "clz_short"; + case PEVariant::CTZ_FULL: return "ctz_full"; + case PEVariant::CTZ_SHORT: return "ctz_short"; + default: return "none"; + } +} + +static int clog2_int(int x) { + int r = 0; + while ((1 << r) < x) r++; + return r; +} + +// Build an N-bit Const from a uint64_t pattern. Bit i set in `pattern` -> bit i +// of the result. Bits beyond 64 are zero. +static Const u64_const(uint64_t pattern, int N) { + std::vector bits(N, State::S0); + for (int i = 0; i < N && i < 64; i++) + if ((pattern >> i) & 1ULL) bits[i] = State::S1; + return Const(bits); +} + +// Return the index of the highest set bit (MSB) of `c`, or -1 if all zero. 
+static int const_msb_set(const Const& c, int N) { + auto bits = c.to_bits(); + for (int i = N - 1; i >= 0; i--) + if (i < (int)bits.size() && bits[i] == State::S1) return i; + return -1; +} + +// Return the index of the lowest set bit (LSB) of `c`, or -1 if all zero. +static int const_lsb_set(const Const& c, int N) { + auto bits = c.to_bits(); + for (int i = 0; i < N; i++) + if (i < (int)bits.size() && bits[i] == State::S1) return i; + return -1; +} + +struct OptPriEncWorker { + Module* module; + SigMap sigmap; + Cell* cell = nullptr; + + // Bit-level driver map (combinational drivers only). + dict bit_to_driver; + pool input_port_bits; + pool sequential_cells; + + // Configuration. + bool detect_clz = true; + bool detect_ctz = true; + int max_input_width = 256; + int min_input_width = 4; + + // Stats. + int regions_rewritten = 0; + int cells_added = 0; + + // Cache of full-width CLZ/CTZ networks already emitted for a given input + // wire, so that several matched output wires sharing the same input bus + // pull from a single instantiation instead of materialising duplicate + // log-depth trees. 
+ dict clz_full_cache; + dict ctz_full_cache; + + OptPriEncWorker(Module* m) : module(m), sigmap(m) { build_indexes(); } + + bool is_sequential(Cell* c) { + return c->type.in( + ID($ff), ID($dff), ID($dffe), ID($adff), ID($adffe), + ID($sdff), ID($sdffe), ID($sdffce), ID($dffsr), ID($dffsre), + ID($_DFF_P_), ID($_DFF_N_), + ID($_DFFE_PP_), ID($_DFFE_PN_), ID($_DFFE_NP_), ID($_DFFE_NN_), + ID($_DFF_PP0_), ID($_DFF_PP1_), ID($_DFF_PN0_), ID($_DFF_PN1_), + ID($_DFF_NP0_), ID($_DFF_NP1_), ID($_DFF_NN0_), ID($_DFF_NN1_), + ID($dlatch), ID($adlatch), ID($dlatchsr), + ID($mem), ID($mem_v2), ID($meminit), ID($meminit_v2), + ID($memrd), ID($memrd_v2), ID($memwr), ID($memwr_v2), + ID($fsm), + ID($assert), ID($assume), ID($cover), ID($live), ID($fair), + ID($print), ID($check), + ID($anyconst), ID($anyseq), ID($allconst), ID($allseq), + ID($initstate)); + } + + void build_indexes() { + for (auto cell : module->cells()) { + if (is_sequential(cell)) { + sequential_cells.insert(cell); + continue; + } + for (auto& conn : cell->connections()) { + if (!cell->output(conn.first)) continue; + for (auto bit : sigmap(conn.second)) + if (bit.wire) bit_to_driver[bit] = cell; + } + } + for (auto wire : module->wires()) { + if (!wire->port_input) continue; + for (auto bit : sigmap(wire)) + input_port_bits.insert(bit); + } + } + + // Compute the combinational fanin cone of `from`. Outputs the set of cells + // in the cone (cells whose output is reached by BFS) and the "leaf" bits + // (port-input bits or bits driven by sequential cells / undriven). + // Returns false if the cone touches anything we don't want to drive a PE. 
+ bool get_cone(SigSpec from, pool& cone_cells, pool& leaf_bits) { + pool visited; + std::queue worklist; + for (auto bit : sigmap(from)) { + if (!bit.wire) continue; + if (visited.insert(bit).second) worklist.push(bit); + } + while (!worklist.empty()) { + SigBit bit = worklist.front(); + worklist.pop(); + if (input_port_bits.count(bit)) { leaf_bits.insert(bit); continue; } + auto it = bit_to_driver.find(bit); + if (it == bit_to_driver.end()) { leaf_bits.insert(bit); continue; } + Cell* drv = it->second; + if (sequential_cells.count(drv)) { leaf_bits.insert(bit); continue; } + if (!cone_cells.insert(drv).second) continue; + for (auto& conn : drv->connections()) { + if (!drv->input(conn.first)) continue; + for (auto in_bit : sigmap(conn.second)) { + if (!in_bit.wire) continue; + if (visited.insert(in_bit).second) worklist.push(in_bit); + } + } + } + return true; + } + + // Collect all wires in the module whose bits are entirely within the + // (leaf_bits + cone-driven bits) frontier of S's cone. These are + // candidates for the input bus T -- either a leaf wire bottoming out the + // cone (ports / FF outputs) or an internal wire produced by a cone cell. + // Wires with a valid power-of-2-friendly width are preferred but we let + // the fingerprint be the final arbiter. 
+ vector find_candidate_Ts(Wire* S_wire, + const pool& cone_cells, + const pool& leaf_bits) { + pool cone_bits = leaf_bits; + for (Cell* c : cone_cells) { + for (auto& conn : c->connections()) { + if (!c->output(conn.first)) continue; + for (auto bit : sigmap(conn.second)) + if (bit.wire) cone_bits.insert(bit); + } + } + vector out; + for (Wire* w : module->wires()) { + if (w == S_wire) continue; + if (w->width < min_input_width || w->width > max_input_width) continue; + bool all_in = true; + for (auto bit : sigmap(SigSpec(w))) { + if (!cone_bits.count(bit)) { all_in = false; break; } + } + if (all_in) out.push_back(w); + } + // Try wider candidates first: the more bits the fingerprint constrains, + // the lower the chance of false positives, and longer chains usually + // imply a more substantial detection target. + std::sort(out.begin(), out.end(), [](Wire* a, Wire* b) { + return a->width > b->width; + }); + return out; + } + + // Build the test-vector deck for an N-bit input. + vector gen_test_vectors(int N) { + vector vs; + vs.push_back(u64_const(0, N)); + for (int k = 0; k < N; k++) { + std::vector bits(N, State::S0); + bits[k] = State::S1; + vs.push_back(Const(bits)); + } + for (int k = 1; k <= N; k++) { + std::vector bits(N, State::S0); + for (int i = 0; i < k; i++) bits[i] = State::S1; + vs.push_back(Const(bits)); + } + for (int k = 0; k < N; k++) { + std::vector bits(N, State::S1); + for (int i = 0; i < k; i++) bits[i] = State::S0; + vs.push_back(Const(bits)); + } + if (N >= 4) { + std::vector aa(N, State::S0), fivefive(N, State::S0), e8(N, State::S0); + for (int i = 0; i < N; i++) { + if (i & 1) aa[i] = State::S1; else fivefive[i] = State::S1; + } + vs.push_back(Const(aa)); + vs.push_back(Const(fivefive)); + e8[0] = State::S1; + if (N > 1) e8[N - 1] = State::S1; + vs.push_back(Const(e8)); + } + return vs; + } + + // Run all candidate test vectors through ConstEval and try to match each of + // the four PE variants against the recorded outputs. 
Returns the matched + // variant, or NONE. + PEVariant fingerprint(SigSpec T_sig, SigSpec S_sig, int N, int Wbits) { + ConstEval ce(module); + + bool clz_full_ok = detect_clz && (Wbits == clog2_int(N + 1)); + bool ctz_full_ok = detect_ctz && (Wbits == clog2_int(N + 1)); + bool clz_short_ok = detect_clz && (Wbits == clog2_int(N)); + bool ctz_short_ok = detect_ctz && (Wbits == clog2_int(N)); + + if (!clz_full_ok && !ctz_full_ok && !clz_short_ok && !ctz_short_ok) + return PEVariant::NONE; + + auto vs = gen_test_vectors(N); + for (auto& v : vs) { + ce.push(); + ce.set(T_sig, v); + SigSpec out = S_sig; + SigSpec undef; + bool ok = ce.eval(out, undef); + ce.pop(); + if (!ok || !out.is_fully_const()) return PEVariant::NONE; + int outval = out.as_const().as_int(); + + int msb_set = const_msb_set(v, N); + int lsb_set = const_lsb_set(v, N); + bool zero = (msb_set < 0); + + int e_clz = zero ? N : (N - 1 - msb_set); + int e_ctz = zero ? N : lsb_set; + + if (clz_full_ok && outval != e_clz) clz_full_ok = false; + if (ctz_full_ok && outval != e_ctz) ctz_full_ok = false; + if (clz_short_ok && !zero && outval != e_clz) clz_short_ok = false; + if (ctz_short_ok && !zero && outval != e_ctz) ctz_short_ok = false; + + if (!clz_full_ok && !ctz_full_ok && !clz_short_ok && !ctz_short_ok) + return PEVariant::NONE; + } + + // Prefer the most specific match (full > short; CLZ before CTZ tie-breaker). + if (clz_full_ok) return PEVariant::CLZ_FULL; + if (ctz_full_ok) return PEVariant::CTZ_FULL; + if (clz_short_ok) return PEVariant::CLZ_SHORT; + if (ctz_short_ok) return PEVariant::CTZ_SHORT; + return PEVariant::NONE; + } + + // Recursive CLZ on a power-of-2-width input. Returns a (log2(N)+1)-bit + // SigSpec whose MSB is 1 iff T == 0 and whose lower bits are the leading- + // zeros count for nonzero T. 
+ SigSpec emit_clz_pow2(SigSpec T, int N) { + log_assert(N >= 1 && (N & (N - 1)) == 0); + if (N == 1) { + cells_added++; + return module->Not(NEW_ID2_SUFFIX("clznot"), T); + } + int N2 = N / 2; + SigSpec hi = T.extract(N2, N2); + SigSpec lo = T.extract(0, N2); + SigSpec clz_hi = emit_clz_pow2(hi, N2); + SigSpec clz_lo = emit_clz_pow2(lo, N2); + int W1 = GetSize(clz_hi); + SigBit hi_zero = clz_hi[W1 - 1]; + SigBit lo_zero = clz_lo[W1 - 1]; + + // pad_clz_hi (W bits): {1'b0, clz_hi}. When the mux selects this arm + // (hi != 0), clz_hi's MSB is guaranteed 0, so the top two bits of the + // result are 0. + SigSpec pad_clz_hi = clz_hi; + pad_clz_hi.append(SigSpec(State::S0)); + + // pad_clz_lo (W bits): logical equivalent of N/2 + clz_lo. The MSB + // becomes lo_zero (= 1 iff x == 0); the next bit becomes ~lo_zero (= + // 1 iff lo != 0, signalling result in [N/2, N-1]); the remaining bits + // are clz_lo[W1-2:0]. + SigSpec lo_nonzero_spec = module->Not(NEW_ID2_SUFFIX("clz_lonz"), SigSpec(lo_zero)); + cells_added++; + SigBit lo_nonzero = lo_nonzero_spec[0]; + + SigSpec pad_clz_lo; + if (W1 >= 2) + pad_clz_lo.append(clz_lo.extract(0, W1 - 1)); + pad_clz_lo.append(lo_nonzero); + pad_clz_lo.append(lo_zero); + + // $mux: Y = S ? B : A. We want Y = hi_zero ? pad_clz_lo : pad_clz_hi. + cells_added++; + return module->Mux(NEW_ID2_SUFFIX("clzmux"), pad_clz_hi, pad_clz_lo, SigSpec(hi_zero)); + } + + // CLZ of arbitrary-width T, returning a (clog2(N+1))-bit result. + SigSpec emit_clz_full(SigSpec T, int N) { + int Np = 1; + while (Np < N) Np *= 2; + int pad_amount = Np - N; + SigSpec padded = T; + for (int i = 0; i < pad_amount; i++) + padded.append(SigSpec(State::S0)); + SigSpec clz_padded = emit_clz_pow2(padded, Np); // log2(Np)+1 bits + if (pad_amount == 0) + return clz_padded; + // result = clz_padded - pad_amount, truncated to W = clog2(N+1) bits. 
+		int W = clog2_int(N + 1);
+		SigSpec sub = module->Sub(NEW_ID2_SUFFIX("clzsub"), clz_padded, SigSpec(Const(pad_amount, GetSize(clz_padded))));
+		cells_added++;
+		if (GetSize(sub) >= W)
+			return sub.extract(0, W);
+		SigSpec out = sub;
+		while (GetSize(out) < W) out.append(SigSpec(State::S0));
+		return out;
+	}
+
+	// CTZ via bit-reversal of T followed by CLZ.
+	SigSpec emit_ctz_full(SigSpec T, int N) {
+		SigSpec rev;
+		for (int i = N - 1; i >= 0; i--)
+			rev.append(T[i]);
+		return emit_clz_full(rev, N);
+	}
+
+	// Returns the priority-encoder value for input wire T_wire as an
+	// out_width-bit SigSpec.
+	//
+	// v         - matched variant; selects CLZ vs CTZ and full vs short.
+	// T_wire    - input bus; also the key of the per-input network cache.
+	// N         - width of T_wire.
+	// out_width - width of the signal being replaced.
+	//
+	// The full-width network is emitted on the first request for a given
+	// (CLZ/CTZ, T_wire) pair and reused from the cache afterwards.
+	SigSpec emit_pe(PEVariant v, Wire* T_wire, int N, int out_width) {
+		bool is_clz = (v == PEVariant::CLZ_FULL || v == PEVariant::CLZ_SHORT);
+		auto& cache = is_clz ? clz_full_cache : ctz_full_cache;
+
+		SigSpec full;
+		auto it = cache.find(T_wire);
+		if (it != cache.end()) {
+			full = it->second;
+		} else {
+			SigSpec T_sig = sigmap(SigSpec(T_wire));
+			full = is_clz ? emit_clz_full(T_sig, N) : emit_ctz_full(T_sig, N);
+			cache[T_wire] = full;
+		}
+
+		if (v == PEVariant::CLZ_SHORT || v == PEVariant::CTZ_SHORT) {
+			// A short encoder carries clog2(N) bits. The top bit of the full
+			// result may only be dropped when the full result is wider than
+			// that, i.e. when N is a power of two. For other N both widths are
+			// equal and the top bit is a real count bit (N=5, T=5'b00001 has
+			// clz 4 = 3'b100), so it must be kept.
+			int short_width = clog2_int(N);
+			if (GetSize(full) > short_width)
+				full = full.extract(0, short_width);
+			// NOTE(review): fingerprint() skips the all-zero input for short
+			// variants, so the value produced here for T == 0 is not checked
+			// against the original logic - confirm that case is a don't-care.
+		}
+		// Match the user-visible output width.
+		if (GetSize(full) > out_width)
+			full = full.extract(0, out_width);
+		while (GetSize(full) < out_width)
+			full.append(SigSpec(State::S0));
+		return full;
+	}
+
+	// One accepted match, recorded first and applied later.
+	struct Rewrite {
+		Wire* S_wire;
+		Wire* T_wire;
+		int N;
+		int Wbits;
+		PEVariant variant;
+		Cell* sole_driver;
+		IdString out_port;
+	};
+
+	// One per (potential) candidate, lazily filled before fingerprinting.
+	struct Candidate {
+		Wire* S_wire;
+		pool<Cell*> cone_cells;
+		pool<SigBit> leaf_bits;
+		pool<SigBit> cone_bits;
+		Cell* sole_driver;
+		IdString out_port;
+	};
+
+	void run() {
+		vector<Wire*> wires_snapshot(module->wires().begin(), module->wires().end());
+
+		// Stage 1: build candidate set with cones, filter by driver/width.
+ vector candidates; + int max_W = clog2_int(max_input_width + 1); + for (Wire* S_wire : wires_snapshot) { + if (S_wire->port_input) continue; + int Wbits = S_wire->width; + if (Wbits < 2 || Wbits > max_W) continue; + + pool cone_cells; + pool leaf_bits; + if (!get_cone(SigSpec(S_wire), cone_cells, leaf_bits)) continue; + if (cone_cells.empty()) continue; + + SigSpec S_sig = sigmap(SigSpec(S_wire)); + pool drivers; + for (auto bit : S_sig) { + auto it = bit_to_driver.find(bit); + if (it == bit_to_driver.end()) { drivers.clear(); break; } + drivers.insert(it->second); + } + if (GetSize(drivers) != 1) continue; + Cell* sole_driver = *drivers.begin(); + IdString out_port; + SigSpec out_sig; + for (auto& conn : sole_driver->connections()) { + if (sole_driver->output(conn.first)) { + out_port = conn.first; + out_sig = sigmap(conn.second); + break; + } + } + if (out_sig != S_sig) continue; + + pool cone_bits = leaf_bits; + for (Cell* c : cone_cells) { + for (auto& conn : c->connections()) { + if (!c->output(conn.first)) continue; + for (auto bit : sigmap(conn.second)) + if (bit.wire) cone_bits.insert(bit); + } + } + candidates.push_back({S_wire, std::move(cone_cells), std::move(leaf_bits), + std::move(cone_bits), sole_driver, out_port}); + } + + // Stage 2: process candidates in order of cone size (LARGEST first). + // Verific-style lowerings often expose several wires along the same + // chain that all fingerprint as a PE on the same input bus (e.g. a + // "found ? chain_out : default" wrapper mux plus the raw chain tail + // plus a downstream mask & enc-merge). Rewriting only one of them + // leaves the chain alive feeding the others, so we rewrite each + // match independently and de-duplicate the emitted log-depth + // network through the per-input clz/ctz cache. 
+ std::sort(candidates.begin(), candidates.end(), + [](const Candidate& a, const Candidate& b) { + if (GetSize(a.cone_cells) != GetSize(b.cone_cells)) + return GetSize(a.cone_cells) > GetSize(b.cone_cells); + return GetSize(a.cone_bits) > GetSize(b.cone_bits); + }); + + vector rewrites; + pool claimed_outputs; + pool claimed_drivers; + + for (auto& cand : candidates) { + if (claimed_outputs.count(cand.S_wire)) continue; + if (claimed_drivers.count(cand.sole_driver)) continue; + + int Wbits = cand.S_wire->width; + SigSpec S_sig = sigmap(SigSpec(cand.S_wire)); + + vector Ts = find_candidate_Ts(cand.S_wire, cand.cone_cells, cand.leaf_bits); + for (Wire* T_wire : Ts) { + int N = T_wire->width; + int W_full = clog2_int(N + 1); + int W_short = clog2_int(N); + if (Wbits != W_full && Wbits != W_short) continue; + + SigSpec T_sig = sigmap(SigSpec(T_wire)); + PEVariant variant = fingerprint(T_sig, S_sig, N, Wbits); + if (variant == PEVariant::NONE) continue; + + log(" %s: %s <- %s(%s) [N=%d, W=%d]\n", + log_id(module), log_id(cand.S_wire), variant_name(variant), + log_id(T_wire), N, Wbits); + + rewrites.push_back({cand.S_wire, T_wire, N, Wbits, variant, + cand.sole_driver, cand.out_port}); + claimed_outputs.insert(cand.S_wire); + claimed_drivers.insert(cand.sole_driver); + break; + } + } + + // Apply rewrites. We collected first to avoid the index growing stale + // while we add new cells/wires. + for (auto& r : rewrites) { + cell = r.sole_driver; + SigSpec new_S = emit_pe(r.variant, r.T_wire, r.N, r.Wbits); + // Disconnect the old driver by re-pointing its Y to a fresh wire. 
+ Wire* dangling = module->addWire(NEW_ID2_SUFFIX("dangling"), r.Wbits); + r.sole_driver->setPort(r.out_port, dangling); + module->connect(SigSpec(r.S_wire), new_S); + regions_rewritten++; + } + } +}; + +struct OptPriEncPass : public Pass { + OptPriEncPass() : Pass("opt_prienc", + "detect and rewrite priority-encoder / CLZ / CTZ regions") {} + + void help() override { + // |---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---|---v---| + log("\n"); + log(" opt_prienc [options] [selection]\n"); + log("\n"); + log("This pass uses functional fingerprinting to detect combinational logic\n"); + log("regions that implement a priority encoder, count-leading-zeros (CLZ), or\n"); + log("count-trailing-zeros (CTZ) on a single contiguous input wire, regardless\n"); + log("of how the RTL was written (unrolled for-loops, casez priority lists,\n"); + log("pmux chains, etc.). Each detected region is replaced with a log-depth\n"); + log("network built from $mux/$not/$sub cells.\n"); + log("\n"); + log("Detected variants:\n"); + log("\n"); + log(" clz_full : result = N when input is 0, else N-1 - msb_set_pos.\n"); + log(" Output width = ceil(log2(N+1)).\n"); + log(" clz_short : result = N-1 - msb_set_pos for nonzero input; the\n"); + log(" output for input==0 is unconstrained. Output width =\n"); + log(" ceil(log2(N)).\n"); + log(" ctz_full : symmetric to clz_full from the LSB side.\n"); + log(" ctz_short : symmetric to clz_short from the LSB side.\n"); + log("\n"); + log(" -clz\n"); + log(" detect CLZ patterns only.\n"); + log("\n"); + log(" -ctz\n"); + log(" detect CTZ patterns only.\n"); + log("\n"); + log(" -max-width N\n"); + log(" maximum input bus width to consider (default 64).\n"); + log("\n"); + log(" -min-width N\n"); + log(" minimum input bus width to consider (default 4). 
Smaller\n"); + log(" inputs are too easy to alias and rarely worth rewriting.\n"); + log("\n"); + log("This pass is not invoked by the default 'opt' script; users opt in.\n"); + log("After rewriting, the original cone cells become unused and are removed\n"); + log("by the trailing 'clean -purge'.\n"); + log("\n"); + } + + void execute(std::vector args, RTLIL::Design *design) override { + log_header(design, "Executing OPT_PRIENC pass (priority encoder / CLZ / CTZ).\n"); + + bool only_clz = false; + bool only_ctz = false; + int max_width = 64; + int min_width = 4; + + size_t argidx; + for (argidx = 1; argidx < args.size(); argidx++) { + if (args[argidx] == "-clz") { only_clz = true; continue; } + if (args[argidx] == "-ctz") { only_ctz = true; continue; } + if (args[argidx] == "-max-width" && argidx + 1 < args.size()) { + max_width = std::stoi(args[++argidx]); continue; + } + if (args[argidx] == "-min-width" && argidx + 1 < args.size()) { + min_width = std::stoi(args[++argidx]); continue; + } + break; + } + extra_args(args, argidx, design); + + int total_regions = 0; + int total_cells_added = 0; + for (auto module : design->selected_modules()) { + OptPriEncWorker worker(module); + worker.detect_clz = !only_ctz; + worker.detect_ctz = !only_clz; + worker.max_input_width = max_width; + worker.min_input_width = min_width; + worker.run(); + total_regions += worker.regions_rewritten; + total_cells_added += worker.cells_added; + } + + log("Rewrote %d region(s); emitted %d new cell(s).\n", + total_regions, total_cells_added); + + Yosys::run_pass("clean -purge"); + } +} OptPriEncPass; + +PRIVATE_NAMESPACE_END diff --git a/tests/opt/opt_parallel_prefix.ys b/tests/opt/opt_parallel_prefix.ys new file mode 100644 index 000000000..db6291071 --- /dev/null +++ b/tests/opt/opt_parallel_prefix.ys @@ -0,0 +1,745 @@ +# Tests for opt_parallel_prefix +# +# Notation: +# N = number of leaves in the prefix chain (so N-1 cells originally) +# For full-demand chains (all intermediates as port 
outputs), expected +# network cell counts and depths per topology are: +# Kogge-Stone : cells = sum_{l=0..ceil(log2 N)-1} (N - 2^l), depth = ceil(log2 N) +# Sklansky : cells = sum_{l=0..ceil(log2 N)-1} (N - 2^l rounded up to half-block boundaries) +# depth = ceil(log2 N) +# Brent-Kung : cells ~= 2N - log2(N) - 2 (power-of-2 N), depth = 2*ceil(log2 N) - 2 +# Han-Carlson : cells ~= N/2 * log2(N) (power-of-2 N), depth = ceil(log2 N) + 1 + +# ============================================================================ +# Group A: Basic correctness for each supported op (equiv only) +# ============================================================================ + +# Test A1: 4-input AND prefix chain +log -header "A1: 4-input AND prefix chain" +log -push +design -reset +read_verilog < total network cell count and depth bound. +# The recursive halving network has 2^k - 1 muxes for an N=2^k input. The +# critical path through the muxes is k = log2(N) levels, which is the win. +log -header "B1: 16-bit CLZ structural" +log -push +design -reset +read_verilog -sv < structural bounds. +log -header "B2: 32-bit CTZ structural" +log -push +design -reset +read_verilog -sv < W=4" +log -push +design -reset +read_verilog -sv < W=3" +log -push +design -reset +read_verilog -sv < W=4" +log -push +design -reset +read_verilog -sv < W=3" +log -push +design -reset +read_verilog -sv < no rewrite" +log -push +design -reset +read_verilog -sv < no-op. +log -header "E3: cone crosses FF boundary" +log -push +design -reset +read_verilog -sv < no-op. +log -header "E4: input width 2 below min-width" +log -push +design -reset +read_verilog -sv <