From af79b7775c0c24240208a9b4af8922b27cee0660 Mon Sep 17 00:00:00 2001 From: Nachiket Kapre Date: Tue, 16 Jun 2026 15:06:17 -0400 Subject: [PATCH] router2: add stall detection, iteration cap, and periodic validity checks The negotiated-congestion router can spin without converging on designs where (a) the architecturally optimal wire is genuinely unavailable, (b) the cost gradient between two near-equivalent candidates is too shallow to settle, or (c) a placer choice has fenced off the legal binding space and the failure won't surface until bind_and_check_all() actually runs. Today the main loop has no upper bound on iterations and bind_and_check_all() is only ever called on full convergence, so a stuck design loops forever while logging "archfail=NA" every iteration. Three small safety hatches, all enabled by default (setting a limit to 0 disables it): 1. Hard iteration ceiling -- `cfg.max_router_iters` (default 2000). When reached, log a warning, bind what we have, and break the loop instead of spinning. 2. Stall detection -- `cfg.stall_iter_limit` (default 50). Watch (overused_wires, total_wire_overuse) across consecutive iterations. When both stay unchanged for stall_iter_limit iterations, first try resetting `curr_cong_weight` back to `cfg.init_curr_cong_weight` (once) to escape a local minimum. If the stall persists after the reset, break with a best-effort binding rather than continuing to loop. 3. Periodic validity check -- `cfg.bind_check_interval` (default 100). Even when `overused_wires > 0`, call bind_and_check_all() every `bind_check_interval` iterations so architectural impossibilities surface early instead of being silently masked by `archfail=NA`. All three are fully configurable via context settings (`router2/maxRouterIters`, `router2/stallIterLimit`, `router2/bindCheckInterval`). 
Defaults preserve existing behaviour for designs that converge promptly (well under 2000 iterations and never stall for 50 in a row), and the new break paths only fire on pathological cases that would otherwise loop forever. No new options exposed at the CLI; defaults are sized for typical designs and out-of-the-box behaviour is unchanged. --- common/route/router2.cc | 65 ++++++++++++++++++++++++++++++++++++++++- common/route/router2.h | 14 +++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/common/route/router2.cc b/common/route/router2.cc index e3e3bb8b..486d4fa0 100644 --- a/common/route/router2.cc +++ b/common/route/router2.cc @@ -1710,6 +1710,15 @@ struct Router2 log_info("Running main router loop...\n"); if (timing_driven) tmg.run(true); + + // Stall-detection / hard-limit state for the main router loop. + // Configurable via cfg.stall_iter_limit / cfg.max_router_iters / + // cfg.bind_check_interval below. + int stall_count = 0; + int prev_overused = -1; + int prev_overuse = -1; + bool did_cong_reset = false; + do { ctx->sorted_shuffle(route_queue); @@ -1772,9 +1781,16 @@ struct Router2 } } } + // Periodic bind check to detect arch failures early instead + // of hiding them under "archfail=NA" until convergence. if (overused_wires == 0 && overused_resources == 0 && tmgfail == 0) { - // Try and actually bind nextpnr Arch API wires bind_and_check_all(); + } else if (cfg.bind_check_interval > 0 && (iter % cfg.bind_check_interval) == 0) { + bind_and_check_all(); + if (arch_fail > 0) + log_warning(" iter=%d: %d arch failures detected with %d overused wires still " + "remaining\n", + iter, arch_fail, overused_wires); } for (auto cn : failed_nets) route_queue.push_back(cn); @@ -1791,6 +1807,50 @@ struct Router2 log_info(" iter=%d wires=%d overused=%d overuse=%d %sarchfail=%s\n", iter, total_wire_use, overused_wires, total_wire_overuse, resource_str.c_str(), (overused_wires > 0 || tmgfail > 0) ? 
"NA" : std::to_string(arch_fail).c_str()); + + // Stall detection: track when (overused_wires, total_overuse) + // stops decreasing. + if (overused_wires == prev_overused && total_wire_overuse == prev_overuse) { + ++stall_count; + } else { + stall_count = 0; + } + prev_overused = overused_wires; + prev_overuse = total_wire_overuse; + + // On first stall, reset curr_cong_weight to escape a local + // minimum where the negotiated-congestion gradient is too + // shallow to converge. + if (cfg.stall_iter_limit > 0 && stall_count == cfg.stall_iter_limit && !did_cong_reset) { + log_warning(" Routing stalled for %d iterations with %d overused wires -- resetting " + "congestion weight\n", + stall_count, overused_wires); + curr_cong_weight = cfg.init_curr_cong_weight; + stall_count = 0; + did_cong_reset = true; + } + + // Persistent stall after reset: bind best-effort and break, + // rather than spinning forever. + if (cfg.stall_iter_limit > 0 && stall_count >= cfg.stall_iter_limit && did_cong_reset) { + log_warning(" Routing stalled after congestion reset -- giving up after %d iterations " + "(%d overused wires, %d total overuse)\n", + iter, overused_wires, total_wire_overuse); + bind_and_check_all(); + break; + } + + // Hard iteration ceiling. Designs that genuinely cannot + // converge (e.g. architecturally impossible bindings) get a + // best-effort routing and a clear log line instead of an + // infinite loop. 
+ if (cfg.max_router_iters > 0 && iter >= cfg.max_router_iters) { + log_warning(" Hit maximum router iterations (%d) with %d overused wires remaining\n", + cfg.max_router_iters, overused_wires); + bind_and_check_all(); + break; + } + ++iter; if (curr_cong_weight < 1e9) curr_cong_weight += cfg.curr_cong_mult; @@ -1846,6 +1906,9 @@ Router2Cfg::Router2Cfg(Context *ctx) curr_cong_mult = ctx->setting("router2/currCongWeightMult", 2.0f); estimate_weight = ctx->setting("router2/estimateWeight", 1.25f); } + max_router_iters = ctx->setting("router2/maxRouterIters", 2000); + stall_iter_limit = ctx->setting("router2/stallIterLimit", 50); + bind_check_interval = ctx->setting("router2/bindCheckInterval", 100); perf_profile = ctx->setting("router2/perfProfile", false); if (ctx->settings.count(ctx->id("router2/heatmap"))) heatmap = ctx->settings.at(ctx->id("router2/heatmap")).as_string(); diff --git a/common/route/router2.h b/common/route/router2.h index 629453c6..b77d6a09 100644 --- a/common/route/router2.h +++ b/common/route/router2.h @@ -54,6 +54,20 @@ struct Router2Cfg // of choosing a less congestion/delay-optimal route float estimate_weight; + // Maximum main-loop iterations before giving up with a best-effort + // binding (0 = unlimited). Backstops PathFinder pathologies on + // genuinely-impossible designs. + int max_router_iters; + // Stall detection: break out of the main loop if (overused_wires, + // total_wire_overuse) has been unchanged for this many iterations. + // A one-shot congestion-weight reset is attempted on first stall + // before the break path is taken. + int stall_iter_limit; + // Periodically call bind_and_check_all() while still iterating, so + // architectural impossibilities surface early instead of being hidden + // by "archfail=NA". 0 = off. + int bind_check_interval; + // Print additional performance profiling information bool perf_profile = false;