diff --git a/.github/actions/setup-build-env/action.yml b/.github/actions/setup-build-env/action.yml index ff2574d31..c6ea64e2e 100644 --- a/.github/actions/setup-build-env/action.yml +++ b/.github/actions/setup-build-env/action.yml @@ -35,14 +35,14 @@ runs: if: runner.os == 'Linux' uses: awalsh128/cache-apt-pkgs-action@v1.6.0 with: - packages: gawk git make python3 bison clang flex libffi-dev libfl-dev libreadline-dev pkg-config tcl-dev zlib1g-dev libnsl-dev libdwarf-dev libelf-dev elfutils libdw-dev ccache + packages: gawk git make python3 bison clang flex libffi-dev libfl-dev libreadline-dev pkg-config tcl-dev zlib1g-dev libnsl-dev libdwarf-dev libelf-dev elfutils libdw-dev ccache version: ${{ inputs.runs-on }}-commonys - name: Linux build dependencies if: runner.os == 'Linux' && inputs.get-build-deps == 'true' uses: awalsh128/cache-apt-pkgs-action@v1.6.0 with: - packages: bison clang flex libffi-dev libfl-dev libreadline-dev pkg-config tcl-dev zlib1g-dev libgtest-dev + packages: gawk git make python3 bison clang flex libffi-dev libfl-dev libreadline-dev pkg-config tcl-dev zlib1g-dev libnsl-dev libdwarf-dev libelf-dev elfutils libdw-dev ccache libgtest-dev version: ${{ inputs.runs-on }}-buildys - name: Linux docs dependencies diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 372e3424c..331c062d5 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -116,7 +116,7 @@ jobs: uses: ./.github/actions/setup-build-env with: runs-on: ${{ matrix.os }} - get-test-deps: true + get-build-deps: true get-iverilog: true - name: Download build artifact diff --git a/passes/silimate/Makefile.inc b/passes/silimate/Makefile.inc index 60e03c5a8..4f480ba05 100644 --- a/passes/silimate/Makefile.inc +++ b/passes/silimate/Makefile.inc @@ -13,6 +13,7 @@ OBJS += passes/silimate/reg_rename.o OBJS += passes/silimate/splitfanout.o OBJS += passes/silimate/splitlarge.o OBJS += passes/silimate/splitnetlist.o +OBJS += 
passes/silimate/opt_timing_balance.o OBJS += passes/silimate/opt_expand.o GENFILES += passes/silimate/peepopt_expand.h diff --git a/passes/silimate/opt_timing_balance.cc b/passes/silimate/opt_timing_balance.cc new file mode 100644 index 000000000..f10283015 --- /dev/null +++ b/passes/silimate/opt_timing_balance.cc @@ -0,0 +1,1466 @@ +/* + * yosys -- Yosys Open SYnthesis Suite + * + * Copyright (C) 2012 Claire Xenia Wolf + * 2026 Abhinav Tondapu + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ *
+ */
+
+#include "kernel/yosys.h"
+#include "kernel/sigtools.h"
+#include "kernel/celltypes.h"
+#include "kernel/utils.h"
+// NOTE(review): the eight include targets below were stripped by extraction
+// (angle-bracket spans lost). Restored from the identifiers this file uses:
+// std::sort/std::max, std::log2, std::strcmp, std::deque, std::numeric_limits,
+// std::priority_queue, std::tie, std::vector. Confirm against original patch.
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <queue>
+#include <tuple>
+#include <vector>
+
+USING_YOSYS_NAMESPACE
+PRIVATE_NAMESPACE_BEGIN
+
+/* Invariants:
+ * - Operates on internal word cells ($add/$and/$or/$xor) pre-techmap
+ * - Connectivity and timing keys use sigmap-mapped signals
+ * - Rewiring uses original head Y bits to avoid alias drift
+ * - Disjoint clusters are rewritten per sweep, clean/rebuild happens per iteration
+*/
+
+// -----------------------------------------------------------------------------
+// Shared constants, helpers, and traits
+// -----------------------------------------------------------------------------
+
+static constexpr double kDelayDefault = 1.0;
+static constexpr double kDelayLogic = 0.5;
+static constexpr double kMinIterationDelta = 1e-3;
+static constexpr int kMaxPassIterations = 10;
+static constexpr int kTraversalStackReserve = 256;
+
+static const IdString kAttrTimingBalanceGenerated = "\\timing_balance_generated";
+
+// Wrapper so NEW_ID2_SUFFIX can be used with an arbitrary anchor cell.
+static IdString make_id(Cell *anchor, const char *suffix)
+{
+	// NEW_ID2_SUFFIX relies on a local variable named `cell`
+	Cell *cell = anchor;
+	return NEW_ID2_SUFFIX(suffix);
+}
+
+// log2(n+1) helper used by the width-dependent delay heuristics.
+// NOTE(review): restored `<double>` — extraction stripped the template arg.
+static inline double log2p1_int(int n) { return std::log2(static_cast<double>(n) + 1.0); }
+
+static int cell_y_width(const Cell *cell)
+{
+	log_assert(cell != nullptr);
+	if (cell->hasParam(ID::Y_WIDTH))
+		return std::max(1, cell->getParam(ID::Y_WIDTH).as_int());
+	if (cell->hasPort(ID::Y))
+		return std::max(1, GetSize(cell->getPort(ID::Y)));
+
+	// TimingOracle can query non-target drivers, fall back to widest output port
+	int width = 0;
+	for (const auto &[port_id, sig] : cell->connections())
+		if (cell->output(port_id))
+			width = std::max(width, GetSize(sig));
+	return std::max(1, width);
+}
+
+enum class BalanceCategory {
+	Logic,
+	Arith
+};
+
+enum class WidthRule {
+	MaxInput,
+	AddCarry
+};
+
+enum class
DelayHeuristicKind {
+	Fixed,
+	AddLike
+};
+
+enum class TraversalState : int {
+	Unseen = 0,
+	Active = 1,
+	Done = 2
+};
+
+// Per-cell balancing traits and delay heuristic policy
+struct SupportedCellSpec
+{
+	IdString type;
+	BalanceCategory category;
+	bool requires_strict_width_match = false;
+	bool requires_matching_signedness = false;
+	WidthRule width_rule = WidthRule::MaxInput;
+	DelayHeuristicKind delay_kind = DelayHeuristicKind::Fixed;
+	double fixed_delay = 0.0;
+};
+
+// Registry for balance targets and their delay/width behavior
+// Adding a new associative target should only require editing this table
+// NOTE(review): element/template types below restored from usage; extraction
+// stripped all angle-bracket spans. Confirm against the original patch.
+static const std::vector<SupportedCellSpec> &supported_cell_registry()
+{
+	static const std::vector<SupportedCellSpec> specs = {
+		{ID($and), BalanceCategory::Logic, false, false, WidthRule::MaxInput, DelayHeuristicKind::Fixed, kDelayLogic},
+		{ID($or), BalanceCategory::Logic, false, false, WidthRule::MaxInput, DelayHeuristicKind::Fixed, kDelayLogic},
+		{ID($xor), BalanceCategory::Logic, false, false, WidthRule::MaxInput, DelayHeuristicKind::Fixed, kDelayDefault},
+		{ID($add), BalanceCategory::Arith, true, true, WidthRule::AddCarry, DelayHeuristicKind::AddLike, 0.0},
+	};
+	return specs;
+}
+
+// Lazily-built lookup index over the registry, keyed by cell type.
+static const dict<IdString, const SupportedCellSpec *> &supported_cell_registry_map()
+{
+	static const dict<IdString, const SupportedCellSpec *> by_type = []() {
+		dict<IdString, const SupportedCellSpec *> m;
+		for (const auto &spec : supported_cell_registry())
+			m[spec.type] = &spec;
+		return m;
+	}();
+	return by_type;
+}
+
+// Returns the registry entry for `type`, or nullptr when not a balance target.
+static const SupportedCellSpec *get_supported_cell_spec(IdString type)
+{
+	const auto &by_type = supported_cell_registry_map();
+	auto it = by_type.find(type);
+	if (it == by_type.end())
+		return nullptr;
+	return it->second;
+}
+
+// Collects the enabled target cell types per category flags.
+static std::vector<IdString> collect_target_cell_ids(bool enable_logic, bool enable_arith)
+{
+	std::vector<IdString> ids;
+	for (const auto &spec : supported_cell_registry())
+	{
+		bool enabled_category = (spec.category == BalanceCategory::Logic) ?
enable_logic : enable_arith;
+		if (!enabled_category)
+			continue;
+		ids.push_back(spec.type);
+	}
+	return ids;
+}
+
+// Deterministic total order on SigBit: constants first (by value), then
+// wire bits by wire name and offset.
+static bool less_sigbit_key(const SigBit &a, const SigBit &b)
+{
+	bool a_const = a.wire == nullptr;
+	bool b_const = b.wire == nullptr;
+	if (a_const != b_const)
+		return a_const;
+
+	if (a_const) {
+		// NOTE(review): restored `<int>` — extraction stripped the template args.
+		int ad = static_cast<int>(a.data);
+		int bd = static_cast<int>(b.data);
+		return ad < bd;
+	}
+
+	if (a.wire->name != b.wire->name)
+		return std::strcmp(a.wire->name.c_str(), b.wire->name.c_str()) < 0;
+	return a.offset < b.offset;
+}
+
+// Lexicographic order on SigSpec, shorter specs first; built on less_sigbit_key.
+static bool less_sigspec_key(const SigSpec &a, const SigSpec &b)
+{
+	if (GetSize(a) != GetSize(b))
+		return GetSize(a) < GetSize(b);
+
+	int n = GetSize(a);
+	for (int i = 0; i < n; i++) {
+		const SigBit &ab = a[i];
+		const SigBit &bb = b[i];
+		if (ab == bb)
+			continue;
+		return less_sigbit_key(ab, bb);
+	}
+	return false;
+}
+
+// For supported ops here, result signedness is true only when both inputs are signed
+static constexpr bool yosys_binary_result_signed(bool a_signed, bool b_signed) { return a_signed && b_signed; }
+
+// NOTE(review): key/value types restored from usage (IdString keys, double
+// delays); extraction stripped the angle-bracket spans.
+static const dict<IdString, double> &fixed_delay_table()
+{
+	static const auto table = dict<IdString, double>{
+		{ID($not), 0.0},
+		{ID($pos), 0.0},
+		{ID($logic_not), 0.0},
+		{ID($and), kDelayLogic},
+		{ID($or), kDelayLogic},
+		{ID($xor), kDelayDefault},
+		{ID($xnor), kDelayDefault},
+		{ID($logic_and), kDelayLogic},
+		{ID($logic_or), kDelayLogic},
+		{ID($mux), kDelayDefault},
+	};
+	return table;
+}
+
+// True when timing traversal must stop at `cell` (registers, memories,
+// keep/blackbox cells, formal cells, or anything not in `cell_types`).
+static bool is_timing_boundary_cell(Cell *cell, const CellTypes &cell_types)
+{
+	if (cell == nullptr)
+		return true;
+
+	// Explicit user attributes
+	if (cell->get_bool_attribute(ID::keep) || cell->get_bool_attribute(ID::blackbox))
+		return true;
+
+	// Flip-flops
+	if (cell->is_builtin_ff())
+		return true;
+
+	// Latches, memories, and formal/simulation cells
+	if (cell->type.in(
+			ID($dlatch), ID($adlatch), ID($dlatchsr),
+			ID($mem), ID($mem_v2), ID($memrd), ID($memrd_v2), ID($memwr), ID($memwr_v2), ID($meminit), ID($meminit_v2),
ID($anyconst), ID($anyseq), ID($allconst), ID($allseq), ID($equiv),
+			ID($assert), ID($assume), ID($cover), ID($check), ID($print)
+			))
+		return true;
+
+	// Macro or unknown cell
+	return !cell_types.cell_known(cell->type);
+}
+
+// Unit-less delay heuristic for a cell producing `out_width` bits.
+// Registry entries take precedence; remaining types use rough per-family costs.
+static double estimate_cell_delay(const Cell *cell, int out_width)
+{
+	if (cell == nullptr)
+		return kDelayDefault;
+
+	IdString type = cell->type;
+	int width = out_width;
+
+	const auto &by_type = supported_cell_registry_map();
+	auto reg_it = by_type.find(type);
+	if (reg_it != by_type.end()) {
+		const SupportedCellSpec *spec = reg_it->second;
+		switch (spec->delay_kind)
+		{
+		case DelayHeuristicKind::Fixed:
+			return spec->fixed_delay;
+		case DelayHeuristicKind::AddLike:
+			return log2p1_int(width);
+		}
+	}
+
+	if (type == ID($pmux)) {
+		int s_width = 1;
+		if (cell->hasParam(ID::S_WIDTH))
+			s_width = cell->getParam(ID::S_WIDTH).as_int();
+		return log2p1_int(s_width);
+	}
+	if (type.in(ID($add), ID($sub), ID($neg), ID($alu)))
+		return log2p1_int(width);
+	if (type.in(ID($mul), ID($div), ID($mod)))
+		return width;
+	if (type.in(ID($shl), ID($shr), ID($sshl), ID($sshr)))
+		return log2p1_int(width);
+
+	const auto &fixed = fixed_delay_table();
+	auto it = fixed.find(type);
+	if (it != fixed.end())
+		return it->second;
+	return kDelayDefault;
+}
+
+// -----------------------------------------------------------------------------
+// Analysis: connectivity and timing oracle
+// -----------------------------------------------------------------------------
+
+struct ConnectivitySnapshot
+{
+	// One-sweep structural connectivity view
+	// NOTE(review): container element types restored from usage (SigBit keys,
+	// Cell* values); extraction stripped the angle-bracket spans — confirm
+	// against the original patch.
+	dict<SigBit, Cell *> unique_driver_by_bit;
+	SigSet<Cell *> sinks_by_bit;
+	pool<SigBit> output_port_bits;
+
+	ConnectivitySnapshot() = default;
+	ConnectivitySnapshot(Module *module, SigMap &sigmap) { build(module, sigmap); }
+
+	void build(Module *module, SigMap &sigmap)
+	{
+		unique_driver_by_bit.clear();
+		sinks_by_bit.clear();
+		output_port_bits.clear();
+
+		// Full-module view keeps fanout checks selection-safe
+		for (Cell *cell :
module->cells()) { + for (const auto &[port_id, sig] : cell->connections()) { + SigSpec mapped = sigmap(sig); + if (cell->output(port_id)) { + for (auto bit : mapped) { + if (!bit.wire) + continue; + auto [it, inserted] = unique_driver_by_bit.emplace(bit, cell); + if (!inserted && it->second != cell) + it->second = nullptr; + } + } + if (cell->input(port_id)) + sinks_by_bit.insert(mapped, cell); + } + } + // Output ports mark head boundaries. Input boundaries are handled in TimingOracle + for (auto wire : module->wires()) { + if (wire->port_output) { + for (auto bit : sigmap(wire)) + output_port_bits.insert(bit); + } + } + } + + Cell *get_unique_driver_mapped(const SigSpec &sig) const + { + // Caller passes sigmap-mapped signal slices + Cell *driver = nullptr; + for (auto bit : sig) + { + if (!bit.wire) + return nullptr; + auto it = unique_driver_by_bit.find(bit); + if (it == unique_driver_by_bit.end() || it->second == nullptr) + return nullptr; + if (driver == nullptr) + driver = it->second; + else if (driver != it->second) + return nullptr; + } + return driver; + } + + void collect_sinks_mapped(const SigSpec &mapped_sig, pool &sinks) + { + // SigSet::find() is non-const in current Yosys API + sinks.clear(); + sinks_by_bit.find(mapped_sig, sinks); + } + +}; + +struct TimingOracle +{ + // Lazy backward arrival estimator over the current connectivity snapshot + // Unknown or boundary drivers return 0.0, combinational cycles return +inf + const CellTypes &cell_types; + SigMap &sigmap; + const dict *driver_map; + dict arrival_cache; + dict visit_state; + struct StackEntry { + SigBit bit; + // false: expand dependencies, true: finalize after children + bool finalize_phase = false; + }; + bool cycle_detected = false; + + TimingOracle(const CellTypes &cell_types, SigMap &sigmap, + const dict &driver_map) : + cell_types(cell_types), sigmap(sigmap), driver_map(&driver_map) { } + + void clear_timing_cache() + { + arrival_cache.clear(); + visit_state.clear(); + 
cycle_detected = false; + } + + void rebind_driver_map(const dict &new_driver_map) + { + driver_map = &new_driver_map; + clear_timing_cache(); + } + + void cache_final_value(SigBit bit, double arrival) + { + if (!bit.wire) + return; + bit = sigmap(bit); + arrival_cache[bit] = arrival; + visit_state[bit] = TraversalState::Done; + } + + TraversalState get_visit_state(SigBit bit) const + { + if (auto it = visit_state.find(bit); it != visit_state.end()) + return it->second; + return TraversalState::Unseen; + } + + void set_visit_state(SigBit bit, TraversalState state) + { + visit_state[bit] = state; + } + + double get_arrival(const SigSpec &sig) + { + cycle_detected = false; + double t = 0.0; + for (auto bit : sigmap(sig)) + t = std::max(t, get_arrival_noguard(bit)); + return t; + } + +private: + /* + * Two-phase DFS avoids recursion, + * finalize_phase = false expands inputs, true computes and caches node arrival + * Active marks the current path, unresolved inputs during finalize are treated as cycles with +inf + */ + double get_arrival_noguard(SigBit bit) + { + SigBit start = sigmap(bit); + if (!start.wire) + return 0.0; + if (auto it = arrival_cache.find(start); it != arrival_cache.end()) + return it->second; + + // Local stack keeps traversal state scoped to one query + std::vector eval_stack; + eval_stack.reserve(kTraversalStackReserve); + eval_stack.push_back({start, false}); + + while (!eval_stack.empty()) + { + StackEntry e = std::move(eval_stack.back()); + eval_stack.pop_back(); + SigBit curr = e.bit; + if (!curr.wire) + continue; + if (arrival_cache.count(curr)) + continue; + + if (curr.wire->port_input) { + cache_final_value(curr, 0.0); + continue; + } + + Cell *driver = nullptr; + if (auto it_drv = driver_map->find(curr); it_drv != driver_map->end()) + driver = it_drv->second; + if (driver == nullptr || is_timing_boundary_cell(driver, cell_types)) { + cache_final_value(curr, 0.0); + continue; + } + + TraversalState state = get_visit_state(curr); + + if 
(!e.finalize_phase) + { + if (state == TraversalState::Done) + continue; + if (state == TraversalState::Active) { + // Node already on current path, skip duplicate expansion + continue; + } + + set_visit_state(curr, TraversalState::Active); + eval_stack.push_back({curr, true}); + for (const auto &[port_id, sig] : driver->connections()) { + if (!driver->input(port_id)) + continue; + for (auto in_bit : sigmap(sig)) { + if (!in_bit.wire || arrival_cache.count(in_bit)) + continue; + if (get_visit_state(in_bit) == TraversalState::Active) { + cycle_detected = true; + continue; + } + eval_stack.push_back({in_bit, false}); + } + } + continue; + } + + double max_input = 0.0; + for (const auto &[port_id, sig] : driver->connections()) { + if (!driver->input(port_id)) + continue; + for (auto in_bit : sigmap(sig)) { + double in_arrival = 0.0; + if (in_bit.wire) { + auto it = arrival_cache.find(in_bit); + if (it != arrival_cache.end()) + in_arrival = it->second; + else { + // Missing child arrival at finalize implies combinational cycle + cycle_detected = true; + in_arrival = std::numeric_limits::infinity(); + } + } + max_input = std::max(max_input, in_arrival); + } + } + + double cell_delay = estimate_cell_delay(driver, cell_y_width(driver)); + double t = max_input + cell_delay; + cache_final_value(curr, t); + } + + auto it = arrival_cache.find(start); + return it != arrival_cache.end() ? 
it->second : 0.0; + } +}; + +// ----------------------------------------------------------------------------- +// Rewrite planning and emission +// ----------------------------------------------------------------------------- + +static int natural_output_width(WidthRule width_rule, int a_width, int b_width) +{ + switch (width_rule) + { + case WidthRule::AddCarry: + return std::max(a_width, b_width) + 1; + case WidthRule::MaxInput: + default: + return std::max(a_width, b_width); + } +} + +static int minimum_y_width_for_reassociation(WidthRule width_rule, int a_width, int b_width) +{ + if (width_rule == WidthRule::AddCarry) + // Validation-only relaxation for modulo 2^N add reassociation + return std::max(a_width, b_width); + return natural_output_width(width_rule, a_width, b_width); +} + +struct TreeLeaf +{ + SigSpec signal; + double arrival_time = 0.0; + int width = 0; + bool is_signed = false; + int stable_id = 0; +}; + +struct MergeShape +{ + int out_width = 1; + bool a_signed = false; + bool b_signed = false; + bool out_signed = false; +}; + +struct PlannedMerge +{ + int lhs_node = -1; + int rhs_node = -1; + MergeShape shape; +}; + +// Immutable plan produced by HuffmanPlanner and consumed by TreeEmitter +struct TreePlan +{ + // Node ids are dense: + // - [0, leaves) are leaf nodes + // - [leaves, leaves+merges) are merge nodes in emission order + std::vector leaves; + std::vector merges; + int root_node = -1; + double output_arrival = 0.0; + + bool valid() const { return root_node >= 0; } + + int node_count() const { return GetSize(leaves) + GetSize(merges); } +}; + +// Computes merge order and expected arrival, does not mutate RTLIL +struct HuffmanPlanner +{ + struct PlanNode + { + int node_id = -1; + double arrival_time = 0.0; + int width = 0; + bool is_signed = false; + int stable_id = 0; + }; + + struct PlanNodeCmp + { + bool operator()(const PlanNode &a, const PlanNode &b) const + { + // Use a min-heap by inverting comparator for std::priority_queue + 
return std::tie(a.arrival_time, a.width, a.stable_id) > + std::tie(b.arrival_time, b.width, b.stable_id); + } + }; + + MergeShape compute_merge_shape(const TreeLeaf &a, const TreeLeaf &b, + const SupportedCellSpec &spec, int target_out_width, bool force_root_width) const + { + int out_width = std::max(1, target_out_width); + if (!force_root_width && spec.width_rule == WidthRule::AddCarry) + out_width = std::min(out_width, natural_output_width(spec.width_rule, a.width, b.width)); + bool a_signed = a.is_signed; + bool b_signed = b.is_signed; + bool out_signed = yosys_binary_result_signed(a_signed, b_signed); + return {out_width, a_signed, b_signed, out_signed}; + } + + double compute_merge_arrival(double a_arrival, double b_arrival, int out_width, const Cell *delay_ref_cell) const + { + return std::max(a_arrival, b_arrival) + estimate_cell_delay(delay_ref_cell, out_width); + } + + TreePlan plan(const std::vector &leaves, IdString cell_type, Cell *reference_cell) const + { + // Deterministic leaf ordering is provided by build_tree_leaves() + TreePlan plan; + if (leaves.empty()) + return plan; + plan.leaves = leaves; + if (GetSize(leaves) == 1) { + plan.root_node = 0; + plan.output_arrival = leaves.front().arrival_time; + return plan; + } + + const SupportedCellSpec *spec = get_supported_cell_spec(cell_type); + if (spec == nullptr) + return {}; + + int target_out_width = std::max(1, cell_y_width(reference_cell)); + + std::priority_queue, PlanNodeCmp> pq; + for (int i = 0; i < GetSize(leaves); i++) { + const auto &leaf = leaves[i]; + pq.push({i, leaf.arrival_time, leaf.width, leaf.is_signed, leaf.stable_id}); + } + + int next_internal_id = GetSize(leaves); + int next_stable_id = GetSize(leaves); + /* Greedy Huffman merge always pops the two best nodes first, + * stable_id makes tie breaks deterministic for equal arrival and width, + * root merge forces target width to preserve the head output contract + */ + while (GetSize(pq) > 1) + { + PlanNode a = pq.top(); pq.pop(); 
+ PlanNode b = pq.top(); pq.pop(); + + bool force_root_width = pq.empty(); + TreeLeaf a_leaf = {SigSpec(), a.arrival_time, a.width, a.is_signed, a.stable_id}; + TreeLeaf b_leaf = {SigSpec(), b.arrival_time, b.width, b.is_signed, b.stable_id}; + MergeShape shape = compute_merge_shape(a_leaf, b_leaf, *spec, target_out_width, force_root_width); + int out_width = shape.out_width; + double new_arrival = compute_merge_arrival(a.arrival_time, b.arrival_time, out_width, reference_cell); + + int node_id = next_internal_id++; + plan.merges.push_back({a.node_id, b.node_id, shape}); + pq.push({node_id, new_arrival, out_width, shape.out_signed, next_stable_id++}); + } + + log_assert(!pq.empty()); + plan.root_node = pq.top().node_id; + plan.output_arrival = pq.top().arrival_time; + return plan; + } +}; + +// TreeEmitter materializes a precomputed plan into RTLIL cells and wires +struct TreeEmitter +{ + Module *module; + dict &cell_count; + + TreeEmitter(Module *module, dict &cell_count) : + module(module), cell_count(cell_count) { } + + SigSpec apply(const TreePlan &plan, IdString cell_type, Cell *reference_cell) + { + if (!plan.valid() || plan.leaves.empty()) + return {}; + if (GetSize(plan.leaves) == 1) + return plan.leaves.front().signal; + + int total_nodes = plan.node_count(); + std::vector node_signals(total_nodes); + for (int i = 0; i < GetSize(plan.leaves); i++) + node_signals[i] = plan.leaves[i].signal; + + for (int merge_idx = 0; merge_idx < GetSize(plan.merges); merge_idx++) + { + const PlannedMerge &m = plan.merges[merge_idx]; + log_assert(m.lhs_node >= 0 && m.lhs_node < total_nodes); + log_assert(m.rhs_node >= 0 && m.rhs_node < total_nodes); + + SigSpec a_sig = node_signals[m.lhs_node]; + SigSpec b_sig = node_signals[m.rhs_node]; + log_assert(GetSize(a_sig) > 0 && GetSize(b_sig) > 0); + + IdString new_cell_name = make_id(reference_cell, "timing_balance"); + Cell *new_cell = module->addCell(new_cell_name, cell_type); + 
new_cell->set_bool_attribute(kAttrTimingBalanceGenerated); + new_cell->set_src_attribute(reference_cell->get_src_attribute()); + IdString out_wire_name = make_id(reference_cell, "timing_balance_y"); + Wire *out_wire = module->addWire(out_wire_name, m.shape.out_width); + + new_cell->setPort(ID::A, a_sig); + new_cell->setPort(ID::B, b_sig); + new_cell->setPort(ID::Y, out_wire); + if (new_cell->hasParam(ID::A_SIGNED)) + new_cell->setParam(ID::A_SIGNED, m.shape.a_signed); + if (new_cell->hasParam(ID::B_SIGNED)) + new_cell->setParam(ID::B_SIGNED, m.shape.b_signed); + if (new_cell->hasParam(ID::A_WIDTH)) + new_cell->setParam(ID::A_WIDTH, GetSize(a_sig)); + if (new_cell->hasParam(ID::B_WIDTH)) + new_cell->setParam(ID::B_WIDTH, GetSize(b_sig)); + if (new_cell->hasParam(ID::Y_WIDTH)) + new_cell->setParam(ID::Y_WIDTH, m.shape.out_width); + new_cell->fixup_parameters(); + + int node_id = GetSize(plan.leaves) + merge_idx; + node_signals[node_id] = SigSpec(out_wire); + cell_count[cell_type]++; + } + + log_assert(plan.root_node >= 0 && plan.root_node < total_nodes); + return node_signals[plan.root_node]; + } +}; + +// ----------------------------------------------------------------------------- +// Rewrite engine: cluster harvest, evaluation, and commit loop +// ----------------------------------------------------------------------------- + +// Harvested cluster plus external source multiset for one candidate head +struct ClusterHarvest +{ + // Track source multiplicity by signedness to preserve per-use semantics + dict signed_source_uses; + dict unsigned_source_uses; + pool cluster_cells; +}; + +// Worker contract: +// Finds heads for each target type, harvests and evaluates clusters, commits +// beneficial disjoint rewrites in-sweep, and rebuilds views between iterations +struct OptTimingBalanceWorker +{ + struct RewriteStats + { + int candidates = 0; + int trees = 0; + int rewrites = 0; + }; + + struct RewriteDecision + { + SigSpec head_output; + TreePlan plan; + }; + + 
struct ObjectiveScore + { + double sum_arrival = 0.0; + }; + + struct SweepContext + { + pool candidate_cells; + pool consumed_cells; + RewriteStats stats; + dict target_cache; + dict y_cache; + }; + + Design *design; + Module *module; + SigMap sigmap; + CellTypes cell_types; + std::vector target_cell_ids; + dict cell_count; + HuffmanPlanner planner; + TreeEmitter emitter; + dict warned_contract_issues; + static constexpr int warnRequiredPortsErrCode = 1; + static constexpr int warnRequiredWidthParamsErrCode = 2; + + OptTimingBalanceWorker(Design *design, Module *module, const std::vector &target_cell_ids) : + design(design), module(module), sigmap(module), cell_types(design), target_cell_ids(target_cell_ids), + planner(), emitter(module, cell_count) + { } + + // View lifecycle + void rebuild_views(ConnectivitySnapshot &graph, TimingOracle &timer) + { + sigmap = SigMap(module); + graph.build(module, sigmap); + timer.rebind_driver_map(graph.unique_driver_by_bit); + } + + // Warnings and objective gate + void warn_contract_once(IdString cell_type, int err_code) + { + int &mask = warned_contract_issues[cell_type]; + if (mask & err_code) + return; + mask |= err_code; + if (err_code == warnRequiredPortsErrCode) { + log_warning("opt_timing_balance: skipping %s cells without A/B/Y ports in module %s.\n", + log_id(cell_type), log_id(module)); + } else { + log_warning("opt_timing_balance: skipping %s cells without width parameters in module %s. 
" + "Pass expects word-level RTL cells (run before gate-level techmapping).\n", + log_id(cell_type), log_id(module)); + } + } + + bool objective_improved(const ObjectiveScore &objective_before, const ObjectiveScore &objective_after) const + { + if (!std::isfinite(objective_after.sum_arrival)) + return false; + if (!std::isfinite(objective_before.sum_arrival)) + return true; + // Sum-only gating can regress the worst single path, but may unlock deferred global gains in later iterations + return objective_after.sum_arrival < objective_before.sum_arrival - kMinIterationDelta; + } + + // Candidate and head predicates + bool is_target_cell_type(Cell *cell, IdString cell_type, bool exclude_generated) + { + if (cell == nullptr || cell->type != cell_type) + return false; + if (exclude_generated && cell->get_bool_attribute(kAttrTimingBalanceGenerated)) + return false; + const SupportedCellSpec *spec = get_supported_cell_spec(cell_type); + if (spec == nullptr) + return false; + if (!cell->hasPort(ID::A) || !cell->hasPort(ID::B) || !cell->hasPort(ID::Y)) { + warn_contract_once(cell_type, warnRequiredPortsErrCode); + return false; + } + if (!cell->hasParam(ID::Y_WIDTH) || !cell->hasParam(ID::A_WIDTH) || !cell->hasParam(ID::B_WIDTH)) { + warn_contract_once(cell_type, warnRequiredWidthParamsErrCode); + return false; + } + + int y_width = cell->getParam(ID::Y_WIDTH).as_int(); + int a_width = cell->getParam(ID::A_WIDTH).as_int(); + int b_width = cell->getParam(ID::B_WIDTH).as_int(); + if (y_width <= 0 || a_width <= 0 || b_width <= 0) + return false; + if (GetSize(cell->getPort(ID::A)) != a_width) + return false; + if (GetSize(cell->getPort(ID::B)) != b_width) + return false; + if (GetSize(cell->getPort(ID::Y)) != y_width) + return false; + + if (spec->requires_matching_signedness) { + if (!cell->hasParam(ID::A_SIGNED) || !cell->hasParam(ID::B_SIGNED)) + return false; + } + + int required_width = minimum_y_width_for_reassociation(spec->width_rule, a_width, b_width); + return 
y_width >= required_width; + } + + bool is_target_cell_type_cached(Cell *cell, IdString cell_type, + bool exclude_generated, dict &target_cache) + { + if (cell == nullptr) + return false; + auto it = target_cache.find(cell); + if (it != target_cache.end()) + return it->second; + bool is_target = is_target_cell_type(cell, cell_type, exclude_generated); + target_cache[cell] = is_target; + return is_target; + } + + const SigSpec &mapped_y(Cell *cell, dict &y_cache) + { + auto it = y_cache.find(cell); + if (it != y_cache.end()) + return it->second; + y_cache[cell] = sigmap(cell->getPort(ID::Y)); + return y_cache[cell]; + } + + // Backward cluster extraction + bool is_head_cell(Cell *cell, IdString cell_type, bool exclude_generated, + ConnectivitySnapshot &graph, dict &target_cache, dict &y_cache) + { + if (cell == nullptr) + return false; + const SigSpec &y = mapped_y(cell, y_cache); + // Output-port drivers are always heads + for (auto bit : y) + if (graph.output_port_bits.count(bit)) + return true; + + pool sinks; + graph.collect_sinks_mapped(y, sinks); + // Leaf drivers are heads + if (sinks.empty()) + return true; + + // Any non-target consumer terminates same-type chain growth + for (Cell *sink : sinks) { + if (!is_target_cell_type_cached(sink, cell_type, exclude_generated, target_cache)) + return true; + } + return false; + } + + /* + * BFS over same-type unique drivers from head_cell, + * merge only when driver Y exactly matches consumed mapped bits to avoid semantic drift, + * when merge stops, record source use count with per-port signedness + */ + bool collect_cluster(IdString cell_type, Cell *head_cell, const pool &candidate_cells, + ConnectivitySnapshot &graph, dict &target_cache, dict &y_cache, + ClusterHarvest &harvest) + { + const SupportedCellSpec *spec = get_supported_cell_spec(cell_type); + if (spec == nullptr || head_cell == nullptr) + return false; + + bool enforce_strict_width_match = spec->requires_strict_width_match; + int target_width = 0; + if 
(enforce_strict_width_match) { + // Strict width preserves truncation points + target_width = cell_y_width(head_cell); + } + + bool enforce_matching_signedness = spec->requires_matching_signedness; + bool target_add_signed = false; + if (enforce_matching_signedness) { + if (!head_cell->hasParam(ID::A_SIGNED) || !head_cell->hasParam(ID::B_SIGNED)) + return false; + bool head_a_signed = head_cell->getParam(ID::A_SIGNED).as_bool(); + bool head_b_signed = head_cell->getParam(ID::B_SIGNED).as_bool(); + if (head_a_signed != head_b_signed) + return false; + target_add_signed = head_a_signed; + } + + harvest = ClusterHarvest(); + harvest.cluster_cells.insert(head_cell); + std::deque queue = {head_cell}; + + while (!queue.empty()) + { + Cell *cell = queue.front(); + queue.pop_front(); + + for (IdString port : {ID::A, ID::B}) { + SigSpec sig = sigmap(cell->getPort(port)); + Cell *driver = graph.get_unique_driver_mapped(sig); + + bool can_merge = true; + if (driver == nullptr || driver == cell || !candidate_cells.count(driver)) + can_merge = false; + if (can_merge && !is_target_cell_type_cached(driver, cell_type, true, target_cache)) + can_merge = false; + + if (can_merge) { + const SigSpec &drv_y = mapped_y(driver, y_cache); + // Require exact Y coverage for safe reassociation + if (GetSize(drv_y) != GetSize(sig) || drv_y != sig) + can_merge = false; + } + if (can_merge && enforce_strict_width_match && + cell_y_width(driver) != target_width) + can_merge = false; + if (can_merge && enforce_matching_signedness) { + if (!driver->hasParam(ID::A_SIGNED) || !driver->hasParam(ID::B_SIGNED)) + can_merge = false; + else { + bool a_signed = driver->getParam(ID::A_SIGNED).as_bool(); + bool b_signed = driver->getParam(ID::B_SIGNED).as_bool(); + if (a_signed != b_signed || a_signed != target_add_signed) + can_merge = false; + } + } + + if (can_merge) { + if (!harvest.cluster_cells.count(driver)) { + harvest.cluster_cells.insert(driver); + queue.push_back(driver); + } + continue; + } + + 
IdString signed_param = port == ID::A ? ID::A_SIGNED : ID::B_SIGNED; + bool signed_port = cell->hasParam(signed_param) && cell->getParam(signed_param).as_bool(); + if (signed_port) + harvest.signed_source_uses[sig]++; + else + harvest.unsigned_source_uses[sig]++; + } + } + + // Single-cell cluster is a no-op + return GetSize(harvest.cluster_cells) > 1; + } + + std::vector collect_candidates(IdString cell_type, bool exclude_generated, dict &target_cache) + { + std::vector cells; + for (Cell *cell : module->selected_cells()) + if (is_target_cell_type_cached(cell, cell_type, exclude_generated, target_cache)) + cells.push_back(cell); + // Sort lexically for cross-run deterministic candidate order + std::sort(cells.begin(), cells.end(), [](Cell *a, Cell *b) { + return std::strcmp(a->name.c_str(), b->name.c_str()) < 0; + }); + return cells; + } + + // Rewrite evaluation and commit + void rewrite_one_head(IdString cell_type, Cell *head, SweepContext &sweep, + ConnectivitySnapshot &graph, TimingOracle &timer) + { + // No per-head rebuild in this sweep, defer heads that read already consumed drivers + auto source_uses_consumed_driver = [&](const dict &uses) -> bool { + // Stale snapshot guard: skip heads fed by already rewritten clusters + for (const auto &[sig, use_count] : uses) { + if (use_count <= 0) + continue; + for (auto bit : sig) { + if (!bit.wire) + continue; + auto drv_it = graph.unique_driver_by_bit.find(bit); + if (drv_it == graph.unique_driver_by_bit.end()) + continue; + Cell *driver = drv_it->second; + if (driver != nullptr && sweep.consumed_cells.count(driver)) + return true; + } + } + return false; + }; + + if (sweep.consumed_cells.count(head)) + return; + if (!is_head_cell(head, cell_type, true, graph, sweep.target_cache, sweep.y_cache)) + return; + + ClusterHarvest harvest; + if (!collect_cluster(cell_type, head, sweep.candidate_cells, graph, sweep.target_cache, sweep.y_cache, harvest)) + return; + + // Batch only disjoint clusters in one sweep + for 
(Cell *cell : harvest.cluster_cells) + if (cell != nullptr && sweep.consumed_cells.count(cell)) + return; + + // Defer heads that depend on already rewritten snapshot drivers + if (source_uses_consumed_driver(harvest.signed_source_uses) || + source_uses_consumed_driver(harvest.unsigned_source_uses)) + return; + + RewriteDecision decision; + if (!evaluate_rewrite(cell_type, head, harvest, timer, decision)) + return; + if (!commit_rewrite(cell_type, head, decision)) + return; + + for (Cell *cell : harvest.cluster_cells) + if (cell != nullptr) + sweep.consumed_cells.insert(cell); + sweep.stats.rewrites++; + + // No per-head rebuild, invalidate rewritten Y-cache entries only + for (Cell *cell : harvest.cluster_cells) + if (cell != nullptr) + sweep.y_cache.erase(cell); + sweep.y_cache.erase(head); + } + + std::vector order_heads_by_dependency(const std::vector &heads, ConnectivitySnapshot &graph, bool &saw_cycle) + { + saw_cycle = false; + if (heads.empty()) + return {}; + + /* + * Backward DFS over driver links, + * postorder emits upstream-first head order, + * cycles fall back to conservative skip in this sweep + */ + pool head_cells; + for (auto head : heads) + head_cells.insert(head); + + dict state; + std::vector postorder_heads; + struct DfsEntry { + Cell *cell; + bool postorder; + }; + std::vector stack; + stack.reserve(kTraversalStackReserve); + + for (auto root : heads) + { + if (root == nullptr) + continue; + + stack.clear(); + stack.push_back({root, false}); + while (!stack.empty()) + { + DfsEntry e = stack.back(); + stack.pop_back(); + Cell *cell = e.cell; + if (cell == nullptr || is_timing_boundary_cell(cell, cell_types)) + continue; + + TraversalState st = TraversalState::Unseen; + if (auto it = state.find(cell); it != state.end()) + st = it->second; + + if (e.postorder) { + if (st != TraversalState::Done) { + state[cell] = TraversalState::Done; + if (head_cells.count(cell)) + postorder_heads.push_back(cell); + } + continue; + } + + if (st == 
TraversalState::Done) + continue; + if (st == TraversalState::Active) { + saw_cycle = true; + continue; + } + + state[cell] = TraversalState::Active; + stack.push_back({cell, true}); + + for (const auto &[port_id, sig] : cell->connections()) { + if (!cell->input(port_id)) + continue; + for (auto bit : sigmap(sig)) { + if (!bit.wire) + continue; + auto drv_it = graph.unique_driver_by_bit.find(bit); + if (drv_it == graph.unique_driver_by_bit.end()) + continue; + Cell *driver = drv_it->second; + if (driver == nullptr || driver == cell) + continue; + stack.push_back({driver, false}); + } + } + } + } + + if (saw_cycle) + log_warning("opt_timing_balance: cycle detected in head ordering in module %s, using conservative order.\n", + log_id(module)); + + // Preserve deterministic order for disconnected heads + pool seen_heads; + std::vector ordered_heads; + ordered_heads.reserve(GetSize(heads)); + for (auto head : postorder_heads) { + if (!seen_heads.count(head)) { + seen_heads.insert(head); + ordered_heads.push_back(head); + } + } + for (auto head : heads) { + if (!seen_heads.count(head)) + ordered_heads.push_back(head); + } + return ordered_heads; + } + + bool build_tree_leaves(const ClusterHarvest &harvest, TimingOracle &timer, std::vector &leaves) + { + struct SourceUse { + SigSpec sig; + bool is_signed; + int count; + }; + + leaves.clear(); + int stable_id = 0; + + // Deterministic source-use ordering for stable tree shape + std::vector uses; + uses.reserve(GetSize(harvest.signed_source_uses) + GetSize(harvest.unsigned_source_uses)); + for (const auto &[sig, count] : harvest.signed_source_uses) + uses.push_back({sig, true, count}); + for (const auto &[sig, count] : harvest.unsigned_source_uses) + uses.push_back({sig, false, count}); + std::sort(uses.begin(), uses.end(), [](const SourceUse &a, const SourceUse &b) { + if (a.sig != b.sig) + return less_sigspec_key(a.sig, b.sig); + if (a.is_signed != b.is_signed) + return a.is_signed > b.is_signed; + return a.count < 
b.count; + }); + + for (const auto &use : uses) + { + if (use.count <= 0) + continue; + double src_arrival = timer.get_arrival(use.sig); + if (!std::isfinite(src_arrival)) + return false; + + for (int i = 0; i < use.count; i++) + leaves.push_back({use.sig, src_arrival, GetSize(use.sig), use.is_signed, stable_id++}); + } + + return !leaves.empty() && !timer.cycle_detected; + } + + bool evaluate_rewrite(IdString cell_type, Cell *head_cell, const ClusterHarvest &harvest, + TimingOracle &timer, RewriteDecision &decision) + { + decision = RewriteDecision(); + // Keep exact head output bits. Mapping here can rewire the wrong alias + decision.head_output = head_cell->getPort(ID::Y); + + std::vector leaves; + if (!build_tree_leaves(harvest, timer, leaves)) + return false; + + double old_arrival = timer.get_arrival(decision.head_output); + if (timer.cycle_detected || !std::isfinite(old_arrival)) + return false; + + decision.plan = planner.plan(leaves, cell_type, head_cell); + if (!decision.plan.valid()) + return false; + + double estimated_new_arrival = decision.plan.output_arrival; + if (!std::isfinite(estimated_new_arrival) || estimated_new_arrival >= old_arrival - kMinIterationDelta) + return false; + return true; + } + + bool commit_rewrite(IdString cell_type, Cell *head_cell, + const RewriteDecision &decision) + { + SigSpec head_output = decision.head_output; + SigSpec tree_output = emitter.apply(decision.plan, cell_type, head_cell); + if (GetSize(head_output) <= 0 || GetSize(tree_output) <= 0) + return false; + if (GetSize(head_output) != GetSize(tree_output)) + return false; + + // Detach old driver first to avoid transient multi-driver aliasing + IdString detached_name = make_id(head_cell, "timing_balance_detach"); + Wire *detached = module->addWire(detached_name, std::max(1, GetSize(head_output))); + head_cell->setPort(ID::Y, SigSpec(detached)); + if (head_cell->hasParam(ID::Y_WIDTH)) + head_cell->setParam(ID::Y_WIDTH, GetSize(head_output)); + 
head_cell->fixup_parameters(); + + module->connect(head_output, tree_output); + return true; + } + + // Objective and per-type sweep + ObjectiveScore compute_delay_objective(const std::vector &target_cell_ids, ConnectivitySnapshot &graph, TimingOracle &timer) + { + ObjectiveScore objective; + for (auto cell_type : target_cell_ids) + { + dict target_cache; + dict y_cache; + std::vector candidates = collect_candidates(cell_type, false, target_cache); + std::vector heads; + for (Cell *cell : candidates) { + if (is_head_cell(cell, cell_type, false, graph, target_cache, y_cache)) + heads.push_back(cell); + } + + for (Cell *cell : heads) { + double arrival = timer.get_arrival(cell->getPort(ID::Y)); + if (timer.cycle_detected || !std::isfinite(arrival)) + return {std::numeric_limits::infinity()}; + objective.sum_arrival += arrival; + } + } + return objective; + } + + RewriteStats process_cell_type_once(IdString cell_type, ConnectivitySnapshot &graph, TimingOracle &timer) + { + SweepContext sweep; + std::vector candidates = collect_candidates(cell_type, true, sweep.target_cache); + for (Cell *cell : candidates) + sweep.candidate_cells.insert(cell); + sweep.stats.candidates = GetSize(candidates); + + std::vector heads; + for (Cell *cell : candidates) + if (is_head_cell(cell, cell_type, true, graph, sweep.target_cache, sweep.y_cache)) + heads.push_back(cell); + sweep.stats.trees = GetSize(heads); + + bool saw_cycle = false; + std::vector ordered_heads = order_heads_by_dependency(heads, graph, saw_cycle); + if (saw_cycle) { + // Cyclic cones are rejected conservatively for this sweep + return sweep.stats; + } + + for (Cell *head : ordered_heads) + rewrite_one_head(cell_type, head, sweep, graph, timer); + return sweep.stats; + } + + // Top-level worker loop + void run() + { + if (target_cell_ids.empty()) + return; + + ConnectivitySnapshot graph(module, sigmap); + TimingOracle timer(cell_types, sigmap, graph.unique_driver_by_bit); + + ObjectiveScore objective_before = 
compute_delay_objective(target_cell_ids, graph, timer); + bool stopped_early = false; + log(" processing module %s\n", log_id(module)); + log_flush(); + + for (int iter = 0; iter < kMaxPassIterations; iter++) { + ObjectiveScore iter_before = objective_before; + ObjectiveScore iter_after = iter_before; + bool improved = false; + int generated_before = 0; + for (IdString cell_type : target_cell_ids) + generated_before += cell_count[cell_type]; + + log(" iteration %d/%d begin\n", iter + 1, kMaxPassIterations); + int total_rewrites = 0; + for (IdString cell_type : target_cell_ids) { + RewriteStats stats = process_cell_type_once(cell_type, graph, timer); + total_rewrites += stats.rewrites; + log(" %s trees=%d candidates=%d rewrites=%d\n", + log_id(cell_type), stats.trees, stats.candidates, stats.rewrites); + } + + int generated_after = 0; + for (IdString cell_type : target_cell_ids) + generated_after += cell_count[cell_type]; + int generated_delta = generated_after - generated_before; + log(" rewrote_trees=%d generated_cells=%d\n", total_rewrites, generated_delta); + + if (total_rewrites > 0) { + log(" clean -purge begin\n"); + Pass::call_on_module(design, module, "clean -purge"); + log(" clean -purge end\n"); + rebuild_views(graph, timer); + iter_after = compute_delay_objective(target_cell_ids, graph, timer); + improved = objective_improved(iter_before, iter_after); + } + + log(" before = %.3f after = %.3f, %s\n", + iter_before.sum_arrival, iter_after.sum_arrival, + improved ? 
"timing estimation improved, continuing" : "timing estimation did not improve, stopping"); + log(" iteration %d/%d end\n", iter + 1, kMaxPassIterations); + log_flush(); + + if (!improved) { + stopped_early = true; + break; + } + objective_before = iter_after; + } + + if (!stopped_early) { + log(" reached iteration cap %d stopping\n", kMaxPassIterations); + log_flush(); + } + } +}; + +// ----------------------------------------------------------------------------- +// Pass wrapper +// ----------------------------------------------------------------------------- + +struct OptTimingBalancePass : public Pass +{ + OptTimingBalancePass() : Pass("opt_timing_balance", "timing-aware balancing of associative trees") { } + + void help() override + { + log("\n"); + log(" opt_timing_balance [options] [selection]\n"); + log("\n"); + log("Iterative timing-aware balancing for cascaded associative cells.\n"); + log("Uses lazy backward arrival estimation plus DAG-ordered Huffman rebuilding.\n"); + log("\n"); + log(" -arith\n"); + log(" only convert arithmetic cells ($add).\n"); + log("\n"); + log(" -logic\n"); + log(" only convert logic cells ($and/$or/$xor).\n"); + log("\n"); + } + + void execute(std::vector args, RTLIL::Design *design) override + { + log_header(design, "Executing OPT_TIMING_BALANCE pass (iterative timing-aware tree rewrite).\n"); + + size_t argidx; + bool saw_type_flag = false; + bool enable_arith = false; + bool enable_logic = false; + for (argidx = 1; argidx < (size_t)GetSize(args); argidx++) { + if (args[argidx] == "-arith") { + saw_type_flag = true; + enable_arith = true; + continue; + } + if (args[argidx] == "-logic") { + saw_type_flag = true; + enable_logic = true; + continue; + } + // Remaining args are selection filters + break; + } + extra_args(args, argidx, design); + + if (!saw_type_flag) { + enable_arith = true; + enable_logic = true; + } + + std::vector target_cell_ids = collect_target_cell_ids(enable_logic, enable_arith); + + dict cell_count; + for 
(auto module : design->selected_modules()) { + OptTimingBalanceWorker worker(design, module, target_cell_ids); + worker.run(); + for (const auto &[type, count] : worker.cell_count) + cell_count[type] += count; + } + + for (auto cell_type : target_cell_ids) { + log(" Converted %d %s cells into timing-balanced trees.\n", cell_count[cell_type], log_id(cell_type)); + } + } +} OptTimingBalancePass; + +PRIVATE_NAMESPACE_END diff --git a/tests/silimate/opt_timing_balance.ys b/tests/silimate/opt_timing_balance.ys new file mode 100644 index 000000000..4c5b118b4 --- /dev/null +++ b/tests/silimate/opt_timing_balance.ys @@ -0,0 +1,511 @@ +# +# opt_timing_balance regression coverage +# + +# --------------------------------------------------------------------------- +# Case: XOR chain with late leaf should be rewritten +# --------------------------------------------------------------------------- +log -header "opt_timing_balance: xor late leaf rewrite" +log -push +design -reset +read_verilog <