diff --git a/docs/internals.rst b/docs/internals.rst index bd1c5de98..7b86beff6 100644 --- a/docs/internals.rst +++ b/docs/internals.rst @@ -937,13 +937,14 @@ macro-task's dataset fits in one core's local caches. To achieve spatial locality, we tag each variable with the set of macro-tasks that access it. Let's call this set the "footprint" of that -variable. The variables in a given module have a set of footprints. We can -order those footprints to minimize the distance between them (distance is -the number of macro-tasks that are different across any two footprints) and -then emit all variables into the struct in ordered-footprint order. +variable. The variables in a given module have a set of footprints. We group +variables with identical non-empty footprints, emit those groups in deterministic +footprint-key order, then emit variables with no footprint information last. -The footprint ordering is literally the traveling salesman problem, and we -use a TSP-approximation algorithm to get close to an optimal sort. +The first emitted variable in each footprint group is aligned to a cache-line +boundary. This avoids false sharing between different macro-task footprints +without building a complete pairwise-distance graph over all footprints, which +would use excessive memory on very large models. This is an old idea. Simulators designed at DEC in the early 1990s used similar techniques to optimize both single-thread and multithread modes. diff --git a/src/V3AstNodeOther.h b/src/V3AstNodeOther.h index 88a084d48..c34c77988 100644 --- a/src/V3AstNodeOther.h +++ b/src/V3AstNodeOther.h @@ -2008,6 +2008,7 @@ class AstVar final : public AstNode { bool m_globalConstrained : 1; // Global constraint per IEEE 1800-2023 18.5.8 bool m_isStdRandomizeArg : 1; // Argument variable created for std::randomize (__Varg*) bool m_processQueue : 1; // Process queue variable + bool m_mtaskCacheLineAlign : 1; // Start MTask affinity group on a cache line void init() { m_ansi = false; m_declTyped = false; @@ -2070,6 +2071,7 @@ class AstVar final : public AstNode { m_globalConstrained = false; m_isStdRandomizeArg = false; m_processQueue = false; + m_mtaskCacheLineAlign = false; } public: @@ -2318,6 +2320,8 @@ public: } bool isUsedParam() const { return m_usedParam; } bool isUsedLoopIdx() const { return m_usedLoopIdx; } + bool mtaskCacheLineAlign() const { return m_mtaskCacheLineAlign; } + void mtaskCacheLineAlign(bool flag) { m_mtaskCacheLineAlign = flag; } bool isSc() const VL_MT_SAFE { return m_sc; } bool isScQuad() const; bool isScBv() const VL_MT_STABLE; diff --git a/src/V3EmitCBase.cpp b/src/V3EmitCBase.cpp index 070d593b0..f1680a40a 100644 --- a/src/V3EmitCBase.cpp +++ b/src/V3EmitCBase.cpp @@ -180,6 +180,7 @@ void EmitCBaseVisitorConst::emitCFuncDecl(const AstCFunc* funcp, const AstNodeMo void EmitCBaseVisitorConst::emitVarDecl(const AstVar* nodep, bool asRef) { const AstBasicDType* const basicp = nodep->basicp(); const bool refNeedParens = VN_IS(nodep->dtypeSkipRefp(), UnpackArrayDType); + if (nodep->mtaskCacheLineAlign() && !asRef) putns(nodep, "alignas(VL_CACHE_LINE_BYTES) "); const auto emitDeclArrayBrackets = [this](const AstVar* nodep) -> void { // This isn't very robust and may need cleanup for other data types diff --git a/src/V3VariableOrder.cpp b/src/V3VariableOrder.cpp index 4b61b3f9b..0f9441ef4 100644 --- a/src/V3VariableOrder.cpp +++ b/src/V3VariableOrder.cpp @@ -27,9 +27,9 @@ #include "V3AstUserAllocator.h" #include "V3EmitCBase.h" #include "V3ExecGraph.h" -#include "V3TSP.h" #include "V3ThreadPool.h" +#include #include VL_DEFINE_DEBUG_FUNCTIONS; @@ -90,41 +90,6 @@ public: } }; -//###################################################################### -// Establish mtask variable sort order in mtasks mode - -class VarTspSorter final : public V3TSP::TspStateBase { - // MEMBERS - const MTaskIdVec& m_mTaskIds; // Mtask we're ordering - static uint32_t s_serialNext; // Unique ID to establish serial order - const uint32_t m_serial = ++s_serialNext; // Serial ordering -public: - // CONSTRUCTORS - explicit VarTspSorter(const MTaskIdVec& mTaskIds) - : m_mTaskIds{mTaskIds} { - UASSERT(mTaskIds.size() == ExecMTask::numUsedIds(), "Wrong size for MTask ID vector"); - } - ~VarTspSorter() override = default; - // METHODS - bool operator<(const TspStateBase& other) const override { - return operator<(static_cast(other)); - } - bool operator<(const VarTspSorter& other) const { return m_serial < other.m_serial; } - const MTaskIdVec& mTaskIds() const { return m_mTaskIds; } - int cost(const TspStateBase* otherp) const override VL_MT_SAFE { - return cost(static_cast(otherp)); - } - int cost(const VarTspSorter* otherp) const VL_MT_SAFE { - // Compute the number of MTasks not shared (Hamming distance) - int cost = 0; - const size_t size = ExecMTask::numUsedIds(); - for (size_t i = 0; i < size; ++i) cost += m_mTaskIds.at(i) ^ otherp->m_mTaskIds.at(i); - return cost; - } -}; - -uint32_t VarTspSorter::s_serialNext = 0; - struct VarAttributes final { uint8_t stratum; // Roughly equivalent to alignment requirement, to avoid padding bool anonOk; // Can be emitted as part of anonymous structure @@ -165,10 +130,14 @@ class VariableOrder final { }); } + static bool emptyAffinity(const MTaskIdVec& vec) { + return std::find(vec.begin(), vec.end(), true) == vec.end(); + } + // Sort by MTask-affinity first, then the same as simpleSortVars - void tspSortVars(std::vector& varps) { + void mtaskSortVars(std::vector& varps) { // Map from "MTask affinity" -> "variable list" - std::map> m2v; + std::map> m2v; const MTaskIdVec emptyVec(ExecMTask::numUsedIds(), false); for (AstVar* const varp : varps) { const auto it = m_mTaskAffinity.find(varp); @@ -176,35 +145,38 @@ class VariableOrder final { m2v[key].push_back(varp); } - // Create a TSP sort state for each unique MTaskIdSet, except for the empty set - V3TSP::StateVec states; - for (const auto& pair : m2v) { - const MTaskIdVec& vec = pair.first; - const bool empty = std::find(vec.begin(), vec.end(), true) == vec.end(); - if (!empty) states.push_back(new VarTspSorter{vec}); - } - - // Do the TSP sort - V3TSP::StateVec sortedStates; - V3TSP::tspSort(states, &sortedStates); - varps.clear(); // Helper function to sort given vector, then append to 'varps' - const auto sortAndAppend = [this, &varps](std::vector& subVarps) { + const auto sortAndAppend = [this, &varps](std::vector& subVarps, + bool alignFirst) { simpleSortVars(subVarps); - for (AstVar* const varp : subVarps) varps.push_back(varp); + bool aligned = !alignFirst; + for (AstVar* const varp : subVarps) { + if (!aligned && !varp->isStatic()) { + varp->mtaskCacheLineAlign(true); + V3Stats::addStatSum("VariableOrder, MTask aligned group starts", 1); + aligned = true; + } + varps.push_back(varp); + } }; - // Enumerate by sorted MTaskIdSet, sort within the set separately - for (const V3TSP::TspStateBase* const stateBasep : sortedStates) { - const VarTspSorter* const statep = dynamic_cast(stateBasep); - sortAndAppend(m2v[statep->mTaskIds()]); - VL_DO_DANGLING(delete statep, statep); + // Sort non-empty MTask affinity groups in the map's deterministic key order. This keeps + // memory linear in the number of affinity groups, unlike the old complete pairwise-distance + // ordering. + size_t affinityGroups = 0; + for (auto& pair : m2v) { + if (emptyAffinity(pair.first)) continue; + sortAndAppend(pair.second, true); + ++affinityGroups; } // Finally add the variables with no known MTask affinity - sortAndAppend(m2v[emptyVec]); + sortAndAppend(m2v[emptyVec], false); + + V3Stats::addStatSum("VariableOrder, MTask affinity groups", affinityGroups); + V3Stats::addStatSum("VariableOrder, no-affinity variables", m2v[emptyVec].size()); } // cppcheck-suppress constParameterPointer @@ -236,7 +208,7 @@ class VariableOrder final { if (!v3Global.opt.mtasks()) { simpleSortVars(m_varps); } else { - tspSortVars(m_varps); + mtaskSortVars(m_varps); } } } diff --git a/test_regress/t/t_variable_order_mtask.py b/test_regress/t/t_variable_order_mtask.py new file mode 100755 index 000000000..a3d9b86a7 --- /dev/null +++ b/test_regress/t/t_variable_order_mtask.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# DESCRIPTION: Verilator: Verilog Test driver/expect definition +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of either the GNU Lesser General Public License Version 3 +# or the Perl Artistic License Version 2.0. +# SPDX-FileCopyrightText: 2026 Wilson Snyder +# SPDX-License-Identifier: LGPL-3.0-only OR Artistic-2.0 + +import vltest_bootstrap + +test.scenarios('vlt_all') +test.top_filename = test.obj_dir + "/t_variable_order_mtask.v" + + +def gen(filename, nregs): + with open(filename, 'w', encoding="utf8") as fh: + fh.write("// Generated by t_variable_order_mtask.py\n") + fh.write("module t(input logic clk, input logic [7:0] i, output logic [31:0] o);\n") + for i in range(nregs): + fh.write(f" logic [31:0] r{i};\n") + for i in range(nregs): + fh.write(" always_ff @(posedge clk) begin\n") + fh.write(f" r{i} <= (r{i} + {{24'd0, i}}) ^ 32'h{i + 1:08x};\n") + fh.write(" end\n") + fh.write(" always_comb begin\n") + fh.write(" o = 32'h0") + for i in range(nregs): + fh.write(f" ^ r{i}") + fh.write(";\n") + fh.write(" end\n") + fh.write("endmodule\n") + + +gen(test.top_filename, 24) + +flags = ["--cc", "--stats", "-Wno-UNOPTTHREADS"] +if test.vltmt: + flags += ["--threads-max-mtasks 8"] + +test.compile(verilator_flags2=flags, threads=(2 if test.vltmt else 1)) + +root_h = test.obj_dir + "/" + test.vm_prefix + "___024root.h" +aligned_var_re = r'alignas\(VL_CACHE_LINE_BYTES\) (?:CData|SData|IData|QData|VlWide|VL_)' + +if test.vltmt: + test.file_grep(root_h, aligned_var_re) + test.file_grep(test.stats, r'VariableOrder, MTask affinity groups\s+([1-9]\d*)') + test.file_grep(test.stats, r'VariableOrder, MTask aligned group starts\s+([1-9]\d*)') +else: + test.file_grep_not(root_h, aligned_var_re) + test.file_grep_not(test.stats, r'VariableOrder, MTask affinity groups') + test.file_grep_not(test.stats, r'VariableOrder, MTask aligned group starts') + +test.passes()